Allow vectorized padding on GPU. This helps speed things up a little.

Before:
BM_padding/10      5000000    460         217.03 MFlops/s
BM_padding/80      5000000    460       13899.40 MFlops/s
BM_padding/640     5000000    461      888421.17 MFlops/s
BM_padding/4K      5000000    460    54316322.55 MFlops/s

After:
BM_padding/10      5000000    454         220.20 MFlops/s
BM_padding/80      5000000    455       14039.86 MFlops/s
BM_padding/640     5000000    452      904968.83 MFlops/s
BM_padding/4K      5000000    411    60750049.21 MFlops/s
author: Benoit Steiner <benoit.steiner.goog@gmail.com> 2016-05-17 09:13:27 -0700
committer: Benoit Steiner <benoit.steiner.goog@gmail.com> 2016-05-17 09:13:27 -0700
commit: 8d06c02ffd9eb43194311d0e21b8618d3a8f4937 (patch)
tree: 3536e517927555d667504b70e5b6f087697aae86 /unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
parent: a80d875916de350c1849cd97d8b2515f620911d4 (diff)
1 files changed, 7 insertions, 0 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 2a8047b7d..fb842e82f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -328,6 +328,9 @@ template <typename S, typename R, typename I>
 __global__ void ReductionInitKernelHalfFloat(R, const S, I, half2*);
 template <int B, int N, typename S, typename R, typename I>
 __global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+template <int NPT, typename S, typename R, typename I>
+__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*, half2*);
+
 #endif
 
 template <int NPT, typename S, typename R, typename I>
@@ -629,11 +632,15 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
 #ifdef EIGEN_HAS_CUDA_FP16
   template <typename S, typename R, typename I> friend void internal::ReductionInitKernelHalfFloat(R, const S, I, half2*);
   template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*, half2*);
 #endif
   template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+
   template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 #endif
 
+  template <typename S, typename O, typename D> friend struct internal::InnerReducer;
+
   // Returns the Index in the input tensor of the first value that needs to be
   // used to compute the reduction at output index "index".
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
author	Benoit Steiner <benoit.steiner.goog@gmail.com>	2016-05-17 09:13:27 -0700
committer	Benoit Steiner <benoit.steiner.goog@gmail.com>	2016-05-17 09:13:27 -0700
commit	8d06c02ffd9eb43194311d0e21b8618d3a8f4937 (patch)
tree	3536e517927555d667504b70e5b6f087697aae86 /unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
parent	a80d875916de350c1849cd97d8b2515f620911d4 (diff)