aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
diff options
context:
space:
mode:
authorGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2015-12-22 16:30:10 -0800
committerGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2015-12-22 16:30:10 -0800
commita1e08fb2a55bf60c81de1687f825d0c3d4e62d22 (patch)
treea3c5b650f2097653f54c9f93a1327c8f660a9579 /unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
parent9c7d96697b4e21960d679b7be8d5514a22fd80ab (diff)
Optimized the configuration of the outer reduction cuda kernel
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h')
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h5
1 files changed, 4 insertions, 1 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 20dc72e85..8e250867c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -179,7 +179,10 @@ struct OuterReducer<Self, Op, GpuDevice> {
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = 256;
const int num_per_thread = 16;
- const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
+ const int dyn_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
+ const int max_blocks = device.getNumCudaMultiProcessors() *
+ device.maxCudaThreadsPerMultiProcessor() / block_size;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);