Fixing HIP breakage caused by the recent commit that introduces Packet4h2 as the Eigen::Half packet type

author: Deven Desai <deven.desai.amd@gmail.com> 2020-03-11 23:06:56 +0000
committer: Deven Desai <deven.desai.amd@gmail.com> 2020-03-12 01:06:24 +0000
commit: 7158ed4e0e34d40cd0f358a3bf69a5c30d8d0f83 (patch)
tree: 6ee1f2ce81b3e442210564b283fdf9e953ff0306 /unsupported
parent: d53ae40f7bcfb948b85b893acf305cdebcba3ba8 (diff)
1 files changed, 31 insertions, 15 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
index 9d3305cfd..36df03d62 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -306,11 +306,17 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
 #pragma unroll
   for (int offset = warpSize/2; offset > 0; offset /= 2) {
   #if defined(EIGEN_HIPCC)
-    // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
-    union { int i; half2 h; } wka_in, wka_out;
-    wka_in.h = accum;
-    wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
-    reducer.reducePacket(wka_out.h, &accum);
+    PacketType r1;
+    half2* hr = reinterpret_cast<half2*>(&r1);
+    half2* hacc = reinterpret_cast<half2*>(&accum);
+    for (int i = 0; i < packet_width / 2; i++) {
+      // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+      union { int i; half2 h; } wka_in, wka_out;
+      wka_in.h = hacc[i];
+      wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
+      hr[i] = wka_out.h;
+    }
+    reducer.reducePacket(r1, &accum);
   #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
     PacketType r1;
     half2* hr = reinterpret_cast<half2*>(&r1);
@@ -661,16 +667,26 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
 #pragma unroll
       for (int offset = warpSize/2; offset > 0; offset /= 2) {
       #if defined(EIGEN_HIPCC)
-        // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
-        union { int i; half2 h; } wka_in, wka_out;
-
-        wka_in.h = reduced_val1;
-        wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
-        reducer.reducePacket(wka_out.h, &reduced_val1);
-
-        wka_in.h = reduced_val2;
-        wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
-        reducer.reducePacket(wka_out.h, &reduced_val2);
+        PacketType r1;
+        PacketType r2;
+        half2* hr1 = reinterpret_cast<half2*>(&r1);
+        half2* hr2 = reinterpret_cast<half2*>(&r2);
+        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+        for (int i = 0; i < packet_width / 2; i++) {
+	  // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+	  union { int i; half2 h; } wka_in1, wka_out1;
+	  wka_in1.h = rv1[i];
+	  wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
+	  hr1[i] = wka_out1.h;
+
+	  union { int i; half2 h; } wka_in2, wka_out2;
+	  wka_in2.h = rv2[i];
+	  wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
+	  hr2[i] = wka_out2.h;
+        }
+        reducer.reducePacket(r1, &reduced_val1);
+        reducer.reducePacket(r2, &reduced_val2);
       #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
         PacketType r1;
         PacketType r2;
author	Deven Desai <deven.desai.amd@gmail.com>	2020-03-11 23:06:56 +0000
committer	Deven Desai <deven.desai.amd@gmail.com>	2020-03-12 01:06:24 +0000
commit	7158ed4e0e34d40cd0f358a3bf69a5c30d8d0f83 (patch)
tree	6ee1f2ce81b3e442210564b283fdf9e953ff0306 /unsupported
parent	d53ae40f7bcfb948b85b893acf305cdebcba3ba8 (diff)