diff options
author | 2016-05-10 11:58:18 -0700 | |
---|---|---|
committer | 2016-05-10 11:58:18 -0700 | |
commit | 0eb69b7552efe0194c2c96ce8dee09176c49c231 (patch) | |
tree | bbad537649204d1a79ce41acf414ebb18bad7389 /unsupported/Eigen | |
parent | 0b9e3dcd06585d28ac4b59dfd518b0a49af3a359 (diff) |
Small improvement to the full reduction of fp16
Diffstat (limited to 'unsupported/Eigen')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 10 |
1 files changed, 6 insertions, 4 deletions
```diff
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index b18200166..afa1a2697 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -193,16 +193,18 @@ static __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self
   __syncthreads();
 
   if (gridDim.x == 1 && first_index == 0) {
-    reducer.reduce(__low2half(*scratch), output);
-    reducer.reduce(__high2half(*scratch), output);
+    half tmp = __low2half(*scratch);
+    reducer.reduce(__high2half(*scratch), &tmp);
+    *output = tmp;
   }
 }
 
 template <typename Op>
 __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
   eigen_assert(threadIdx.x == 1);
-  reducer.reduce(__low2half(*scratch), output);
-  reducer.reduce(__high2half(*scratch), output);
+  half tmp = __low2half(*scratch);
+  reducer.reduce(__high2half(*scratch), &tmp);
+  *output = tmp;
 }
 
 #endif
```