about | summary | refs | log | tree | commit | diff | homepage
path: root/unsupported/Eigen/CXX11/src/Tensor
diff options
context:
space:
mode:
author: Benoit Steiner <benoit.steiner.goog@gmail.com> 2016-05-10 11:58:18 -0700
committer: Benoit Steiner <benoit.steiner.goog@gmail.com> 2016-05-10 11:58:18 -0700
commit: 0eb69b7552efe0194c2c96ce8dee09176c49c231 (patch)
tree: bbad537649204d1a79ce41acf414ebb18bad7389 /unsupported/Eigen/CXX11/src/Tensor
parent: 0b9e3dcd06585d28ac4b59dfd518b0a49af3a359 (diff)
Small improvement to the full reduction of fp16
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor')
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h 10
1 files changed, 6 insertions, 4 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index b18200166..afa1a2697 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -193,16 +193,18 @@ static __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self
__syncthreads();
if (gridDim.x == 1 && first_index == 0) {
- reducer.reduce(__low2half(*scratch), output);
- reducer.reduce(__high2half(*scratch), output);
+ half tmp = __low2half(*scratch);
+ reducer.reduce(__high2half(*scratch), &tmp);
+ *output = tmp;
}
}
template <typename Op>
__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
eigen_assert(threadIdx.x == 1);
- reducer.reduce(__low2half(*scratch), output);
- reducer.reduce(__high2half(*scratch), output);
+ half tmp = __low2half(*scratch);
+ reducer.reduce(__high2half(*scratch), &tmp);
+ *output = tmp;
}
#endif