aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported
diff options
context:
space:
mode:
authorGravatar Deven Desai <deven.desai.amd@gmail.com>2020-03-11 23:06:56 +0000
committerGravatar Deven Desai <deven.desai.amd@gmail.com>2020-03-12 01:06:24 +0000
commit7158ed4e0e34d40cd0f358a3bf69a5c30d8d0f83 (patch)
tree6ee1f2ce81b3e442210564b283fdf9e953ff0306 /unsupported
parentd53ae40f7bcfb948b85b893acf305cdebcba3ba8 (diff)
Fixing HIP breakage caused by the recent commit that introduces Packet4h2 as the Eigen::Half packet type
Diffstat (limited to 'unsupported')
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h46
1 files changed, 31 insertions, 15 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
index 9d3305cfd..36df03d62 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -306,11 +306,17 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
- // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
- union { int i; half2 h; } wka_in, wka_out;
- wka_in.h = accum;
- wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
- reducer.reducePacket(wka_out.h, &accum);
+ PacketType r1;
+ half2* hr = reinterpret_cast<half2*>(&r1);
+ half2* hacc = reinterpret_cast<half2*>(&accum);
+ for (int i = 0; i < packet_width / 2; i++) {
+ // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+ union { int i; half2 h; } wka_in, wka_out;
+ wka_in.h = hacc[i];
+ wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
+ hr[i] = wka_out.h;
+ }
+ reducer.reducePacket(r1, &accum);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
PacketType r1;
half2* hr = reinterpret_cast<half2*>(&r1);
@@ -661,16 +667,26 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
- // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
- union { int i; half2 h; } wka_in, wka_out;
-
- wka_in.h = reduced_val1;
- wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
- reducer.reducePacket(wka_out.h, &reduced_val1);
-
- wka_in.h = reduced_val2;
- wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
- reducer.reducePacket(wka_out.h, &reduced_val2);
+ PacketType r1;
+ PacketType r2;
+ half2* hr1 = reinterpret_cast<half2*>(&r1);
+ half2* hr2 = reinterpret_cast<half2*>(&r2);
+ half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+ half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+ for (int i = 0; i < packet_width / 2; i++) {
+ // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+ union { int i; half2 h; } wka_in1, wka_out1;
+ wka_in1.h = rv1[i];
+ wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
+ hr1[i] = wka_out1.h;
+
+ union { int i; half2 h; } wka_in2, wka_out2;
+ wka_in2.h = rv2[i];
+ wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
+ hr2[i] = wka_out2.h;
+ }
+ reducer.reducePacket(r1, &reduced_val1);
+ reducer.reducePacket(r2, &reduced_val2);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
PacketType r1;
PacketType r2;