From 7158ed4e0e34d40cd0f358a3bf69a5c30d8d0f83 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Wed, 11 Mar 2020 23:06:56 +0000 Subject: Fixing HIP breakage caused by the recent commit that introduces Packet4h2 as the Eigen::Half packet type --- .../Eigen/CXX11/src/Tensor/TensorReductionGpu.h | 46 +++++++++++++++------- 1 file changed, 31 insertions(+), 15 deletions(-) (limited to 'unsupported') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h index 9d3305cfd..36df03d62 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h @@ -306,11 +306,17 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { #if defined(EIGEN_HIPCC) - // FIXME : remove this workaround once we have native half/half2 support for __shfl_down - union { int i; half2 h; } wka_in, wka_out; - wka_in.h = accum; - wka_out.i = __shfl_down(wka_in.i, offset, warpSize); - reducer.reducePacket(wka_out.h, &accum); + PacketType r1; + half2* hr = reinterpret_cast(&r1); + half2* hacc = reinterpret_cast(&accum); + for (int i = 0; i < packet_width / 2; i++) { + // FIXME : remove this workaround once we have native half/half2 support for __shfl_down + union { int i; half2 h; } wka_in, wka_out; + wka_in.h = hacc[i]; + wka_out.i = __shfl_down(wka_in.i, offset, warpSize); + hr[i] = wka_out.h; + } + reducer.reducePacket(r1, &accum); #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 PacketType r1; half2* hr = reinterpret_cast(&r1); @@ -661,16 +667,26 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { #if defined(EIGEN_HIPCC) - // FIXME : remove this workaround once we have native half/half2 support for __shfl_down - union { int i; half2 h; } wka_in, wka_out; - - wka_in.h = reduced_val1; - wka_out.i = __shfl_down(wka_in.i, offset, warpSize); - reducer.reducePacket(wka_out.h, &reduced_val1); - - wka_in.h = reduced_val2; - wka_out.i = __shfl_down(wka_in.i, offset, warpSize); - reducer.reducePacket(wka_out.h, &reduced_val2); + PacketType r1; + PacketType r2; + half2* hr1 = reinterpret_cast(&r1); + half2* hr2 = reinterpret_cast(&r2); + half2* rv1 = reinterpret_cast(&reduced_val1); + half2* rv2 = reinterpret_cast(&reduced_val2); + for (int i = 0; i < packet_width / 2; i++) { + // FIXME : remove this workaround once we have native half/half2 support for __shfl_down + union { int i; half2 h; } wka_in1, wka_out1; + wka_in1.h = rv1[i]; + wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize); + hr1[i] = wka_out1.h; + + union { int i; half2 h; } wka_in2, wka_out2; + wka_in2.h = rv2[i]; + wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize); + hr2[i] = wka_out2.h; + } + reducer.reducePacket(r1, &reduced_val1); + reducer.reducePacket(r2, &reduced_val2); #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 PacketType r1; PacketType r2; -- cgit v1.2.3