diff options
author | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2016-05-17 09:13:27 -0700 |
---|---|---|
committer | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2016-05-17 09:13:27 -0700 |
commit | 8d06c02ffd9eb43194311d0e21b8618d3a8f4937 (patch) | |
tree | 3536e517927555d667504b70e5b6f087697aae86 /unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | |
parent | a80d875916de350c1849cd97d8b2515f620911d4 (diff) |
Allow vectorized padding on GPU. This helps speed things up a little.
Before:
BM_padding/10 5000000 460 217.03 MFlops/s
BM_padding/80 5000000 460 13899.40 MFlops/s
BM_padding/640 5000000 461 888421.17 MFlops/s
BM_padding/4K 5000000 460 54316322.55 MFlops/s
After:
BM_padding/10 5000000 454 220.20 MFlops/s
BM_padding/80 5000000 455 14039.86 MFlops/s
BM_padding/640 5000000 452 904968.83 MFlops/s
BM_padding/4K 5000000 411 60750049.21 MFlops/s
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 7 |
1 files changed, 7 insertions, 0 deletions
```diff
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 2a8047b7d..fb842e82f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -328,6 +328,9 @@ template <typename S, typename R, typename I>
 __global__ void ReductionInitKernelHalfFloat(R, const S, I, half2*);
 template <int B, int N, typename S, typename R, typename I>
 __global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+template <int NPT, typename S, typename R, typename I>
+__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*, half2*);
+
 #endif
 
 template <int NPT, typename S, typename R, typename I>
@@ -629,11 +632,15 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
 #ifdef EIGEN_HAS_CUDA_FP16
   template <typename S, typename R, typename I> friend void internal::ReductionInitKernelHalfFloat(R, const S, I, half2*);
   template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*, half2*);
 #endif
   template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+
   template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 #endif
 
+  template <typename S, typename O, typename D> friend struct internal::InnerReducer;
+
   // Returns the Index in the input tensor of the first value that needs to be
   // used to compute the reduction at output index "index".
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
```