Made it possible to limit the number of blocks that will be used to evaluate a tensor expression on a CUDA device. This makes it possible to set aside streaming multiprocessors for other computations.

author: Benoit Steiner <benoit.steiner.goog@gmail.com> 2016-02-01 12:46:32 -0800
committer: Benoit Steiner <benoit.steiner.goog@gmail.com> 2016-02-01 12:46:32 -0800
commit: 6b5dff875e4ba2235f255b7cf0a86b7abed21df0 (patch)
tree: 9dbf00da5c4e1b0a9689422b8faf5448af750416 /unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
parent: 264f8141f86e84312f0eea9e741d2260ed839890 (diff)
1 files changed, 2 insertions, 2 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index d2ab70f2b..df15c6204 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -220,7 +220,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, false>::run(
   if (needs_assign)
   {
     const int block_size = device.maxCudaThreadsPerBlock();
-    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
+    const int max_blocks = numext::maxi<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
     const Index size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash if we're called with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
@@ -239,7 +239,7 @@ EIGEN_DEVICE_FUNC inline void TensorExecutor<Expression, GpuDevice, true>::run(c
   if (needs_assign)
   {
     const int block_size = device.maxCudaThreadsPerBlock();
-    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
+    const int max_blocks = numext::maxi<int>(device.maxBlocks(), device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size);
     const Index size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash if we're called with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
author	Benoit Steiner <benoit.steiner.goog@gmail.com>	2016-02-01 12:46:32 -0800
committer	Benoit Steiner <benoit.steiner.goog@gmail.com>	2016-02-01 12:46:32 -0800
commit	6b5dff875e4ba2235f255b7cf0a86b7abed21df0 (patch)
tree	9dbf00da5c4e1b0a9689422b8faf5448af750416 /unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
parent	264f8141f86e84312f0eea9e741d2260ed839890 (diff)