aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
diff options
context:
space:
mode:
authorGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-05-17 09:13:27 -0700
committerGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-05-17 09:13:27 -0700
commit8d06c02ffd9eb43194311d0e21b8618d3a8f4937 (patch)
tree3536e517927555d667504b70e5b6f087697aae86 /unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
parenta80d875916de350c1849cd97d8b2515f620911d4 (diff)
Allow vectorized padding on GPU. This helps speed things up a little.
Before: BM_padding/10 5000000 460 217.03 MFlops/s BM_padding/80 5000000 460 13899.40 MFlops/s BM_padding/640 5000000 461 888421.17 MFlops/s BM_padding/4K 5000000 460 54316322.55 MFlops/s After: BM_padding/10 5000000 454 220.20 MFlops/s BM_padding/80 5000000 455 14039.86 MFlops/s BM_padding/640 5000000 452 904968.83 MFlops/s BM_padding/4K 5000000 411 60750049.21 MFlops/s
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h')
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h7
1 files changed, 7 insertions, 0 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 2a8047b7d..fb842e82f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -328,6 +328,9 @@ template <typename S, typename R, typename I>
__global__ void ReductionInitKernelHalfFloat(R, const S, I, half2*);
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+template <int NPT, typename S, typename R, typename I>
+__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*, half2*);
+
#endif
template <int NPT, typename S, typename R, typename I>
@@ -629,11 +632,15 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
#ifdef EIGEN_HAS_CUDA_FP16
template <typename S, typename R, typename I> friend void internal::ReductionInitKernelHalfFloat(R, const S, I, half2*);
template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+ template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*, half2*);
#endif
template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+
template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif
+ template <typename S, typename O, typename D> friend struct internal::InnerReducer;
+
// Returns the Index in the input tensor of the first value that needs to be
// used to compute the reduction at output index "index".
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {