Don't create more cuda blocks than necessary

author: Benoit Steiner <benoit.steiner.goog@gmail.com> 2015-11-23 11:00:10 -0800
committer: Benoit Steiner <benoit.steiner.goog@gmail.com> 2015-11-23 11:00:10 -0800
commit: 562078780a5511f33c6bb5639c5a93e56163a443 (patch)
tree: a1ea70ea75c837cb6296e316b2825c2dedfaa6ad /unsupported
parent: df31ca3b9e038d9b83226a3ed3fe3c8a4cf16bdd (diff)
1 files changed, 6 insertions, 2 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 956672771..d93e1de1b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -219,9 +219,11 @@ inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression&
   const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
   if (needs_assign)
   {
-    const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
     const int block_size = device.maxCudaThreadsPerBlock();
+    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
     const Index size = array_prod(evaluator.dimensions());
+    // Create at least one block to ensure we won't crash if we're called with tensors of size 0.
+    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
     LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
   }
   evaluator.cleanup();
@@ -236,9 +238,11 @@ inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& e
   const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
   if (needs_assign)
   {
-    const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
     const int block_size = device.maxCudaThreadsPerBlock();
+    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
     const Index size = array_prod(evaluator.dimensions());
+    // Create at least one block to ensure we won't crash if we're called with tensors of size 0.
+    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
     LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
   }
   evaluator.cleanup();
author	Benoit Steiner <benoit.steiner.goog@gmail.com>	2015-11-23 11:00:10 -0800
committer	Benoit Steiner <benoit.steiner.goog@gmail.com>	2015-11-23 11:00:10 -0800
commit	562078780a5511f33c6bb5639c5a93e56163a443 (patch)
tree	a1ea70ea75c837cb6296e316b2825c2dedfaa6ad /unsupported
parent	df31ca3b9e038d9b83226a3ed3fe3c8a4cf16bdd (diff)