diff options
author | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2015-12-22 16:30:10 -0800 |
---|---|---|
committer | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2015-12-22 16:30:10 -0800 |
commit | a1e08fb2a55bf60c81de1687f825d0c3d4e62d22 (patch) | |
tree | a3c5b650f2097653f54c9f93a1327c8f660a9579 /unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | |
parent | 9c7d96697b4e21960d679b7be8d5514a22fd80ab (diff) |
Optimized the configuration of the outer reduction cuda kernel
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 5 |
1 file changed, 4 insertions, 1 deletion
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 20dc72e85..8e250867c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -179,7 +179,10 @@ struct OuterReducer<Self, Op, GpuDevice> {
     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
     const int block_size = 256;
     const int num_per_thread = 16;
-    const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
+    const int dyn_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
+    const int max_blocks = device.getNumCudaMultiProcessors() *
+                           device.maxCudaThreadsPerMultiProcessor() / block_size;
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
     LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread>),
                        num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);