Improved the performance of full reductions.

AFTER:
BM_fullReduction/10        4541       4543     154017   21.0M items/s
BM_fullReduction/64        5191       5193     100000  752.5M items/s
BM_fullReduction/512       9588       9588      71361   25.5G items/s
BM_fullReduction/4k      244314     244281       2863   64.0G items/s
BM_fullReduction/5k      359382     359363       1946   64.8G items/s

BEFORE:
BM_fullReduction/10        9085       9087      74395   10.5M items/s
BM_fullReduction/64        9478       9478      72014  412.1M items/s
BM_fullReduction/512      14643      14646      46902   16.7G items/s
BM_fullReduction/4k      260338     260384       2678   60.0G items/s
BM_fullReduction/5k      385076     385178       1818   60.5G items/s
author: Benoit Steiner <benoit.steiner.goog@gmail.com> 2016-06-03 17:27:08 -0700
committer: Benoit Steiner <benoit.steiner.goog@gmail.com> 2016-06-03 17:27:08 -0700
commit: c2a102345f627e4cd1908dad03e6ef0cbb2170c0 (patch)
tree: 37dcab31e24ba0f2e313c1c7e842ffb26032c7f5 /unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
parent: 8d97ba6b2251aabf325ff74f24959ceaa85cf11e (diff)
1 files changed, 2 insertions, 2 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 99a09c058..6ddf824e9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -316,7 +316,7 @@ struct OuterReducer {
 
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
 template <int B, int N, typename S, typename R, typename I>
-__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
 
 
 #ifdef EIGEN_HAS_CUDA_FP16
@@ -616,7 +616,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
   template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
 #endif
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
 #ifdef EIGEN_HAS_CUDA_FP16
   template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
   template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
author	Benoit Steiner <benoit.steiner.goog@gmail.com>	2016-06-03 17:27:08 -0700
committer	Benoit Steiner <benoit.steiner.goog@gmail.com>	2016-06-03 17:27:08 -0700
commit	c2a102345f627e4cd1908dad03e6ef0cbb2170c0 (patch)
tree	37dcab31e24ba0f2e313c1c7e842ffb26032c7f5 /unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
parent	8d97ba6b2251aabf325ff74f24959ceaa85cf11e (diff)