diff options
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 6 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 18 |
2 files changed, 18 insertions, 6 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index f7c1a5cf4..88d51e5f0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -544,8 +544,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); Op reducer(m_reducer); - internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); - return false; + return internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); } bool preserving_inner_dims = true; @@ -561,8 +560,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); Op reducer(m_reducer); - internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); - return false; + return internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); } } return true; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 2da18b147..c3b1b8b7a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -230,9 +230,14 @@ struct InnerReducer<Self, Op, GpuDevice> { assert(false && "Should only be called to reduce floats on a gpu device"); } - static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; + // It's faster to use the usual code. + if (num_coeffs_to_reduce <= 32) { + return true; + } + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 128; @@ -255,6 +260,8 @@ struct InnerReducer<Self, Op, GpuDevice> { LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>), num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; } }; @@ -301,9 +308,14 @@ struct OuterReducer<Self, Op, GpuDevice> { assert(false && "Should only be called to reduce floats on a gpu device"); } - static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; + // It's faster to use the usual code. + if (num_coeffs_to_reduce <= 32) { + return true; + } + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 16; @@ -326,6 +338,8 @@ struct OuterReducer<Self, Op, GpuDevice> { LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>), num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; } }; |