aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
diff options
context:
space:
mode:
authorGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-02-29 10:48:16 -0800
committerGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-02-29 10:48:16 -0800
commit328484204559b3ae89c6131e65bdc397a17e0275 (patch)
treea785c3d77f61a37d5b64460454508deef84650a4 /unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
parente9bea614ecb6d910948e36b11483bbb0c0f83f76 (diff)
Optimized the performance of narrow reductions on CUDA devices
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h')
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h18
1 files changed, 16 insertions, 2 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 2da18b147..c3b1b8b7a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -230,9 +230,14 @@ struct InnerReducer<Self, Op, GpuDevice> {
assert(false && "Should only be called to reduce floats on a gpu device");
}
- static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
+ // It's faster to use the usual code.
+ if (num_coeffs_to_reduce <= 32) {
+ return true;
+ }
+
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = 256;
const int num_per_thread = 128;
@@ -255,6 +260,8 @@ struct InnerReducer<Self, Op, GpuDevice> {
LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+ return false;
}
};
@@ -301,9 +308,14 @@ struct OuterReducer<Self, Op, GpuDevice> {
assert(false && "Should only be called to reduce floats on a gpu device");
}
- static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
+ // It's faster to use the usual code.
+ if (num_coeffs_to_reduce <= 32) {
+ return true;
+ }
+
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = 256;
const int num_per_thread = 16;
@@ -326,6 +338,8 @@ struct OuterReducer<Self, Op, GpuDevice> {
LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+ return false;
}
};