Improved the performance of large outer reductions on CUDA

author: Benoit Steiner <benoit.steiner.goog@gmail.com> 2016-02-29 18:11:58 -0800
committer: Benoit Steiner <benoit.steiner.goog@gmail.com> 2016-02-29 18:11:58 -0800
commit: 68ac5c1738083796084fb554c5d167056bb92fc8 (patch)
tree: add2314562ac3028a939817a2e58d87463df7c3c /unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
parent: 56a3ada6701b8e8645df4e00a2ef93d45a4f970a (diff)
1 file changed, 1 insertion, 1 deletion
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index bad5c1425..444766f96 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -281,7 +281,7 @@ __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index nu
   }
 
   // Do the reduction.
-  const Index max_iter = num_preserved_coeffs * numext::maxi<Index>(1, (num_coeffs_to_reduce - NumPerThread + 1));
+  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
   for (Index i = thread_id; i < max_iter; i += num_threads) {
     const Index input_col = i % num_preserved_coeffs;
     const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
author	Benoit Steiner <benoit.steiner.goog@gmail.com>	2016-02-29 18:11:58 -0800
committer	Benoit Steiner <benoit.steiner.goog@gmail.com>	2016-02-29 18:11:58 -0800
commit	68ac5c1738083796084fb554c5d167056bb92fc8 (patch)
tree	add2314562ac3028a939817a2e58d87463df7c3c /unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
parent	56a3ada6701b8e8645df4e00a2ef93d45a4f970a (diff)