From 4670d7d5ce2517b2e9201f1cf44ae62ef2284eb5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 9 May 2016 17:09:54 -0700 Subject: Improved the performance of full reductions on GPU: Before: BM_fullReduction/10 200000 11751 8.51 MFlops/s BM_fullReduction/80 5000 523385 12.23 MFlops/s BM_fullReduction/640 50 36179326 11.32 MFlops/s BM_fullReduction/4K 1 2173517195 11.50 MFlops/s After: BM_fullReduction/10 500000 5987 16.70 MFlops/s BM_fullReduction/80 200000 10636 601.73 MFlops/s BM_fullReduction/640 50000 58428 7010.31 MFlops/s BM_fullReduction/4K 1000 2006106 12461.95 MFlops/s --- .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 170 ++++++++++++++++++--- 1 file changed, 151 insertions(+), 19 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index fd2587dd5..9186dffe4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -67,6 +67,30 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) #endif } + +template