aboutsummaryrefslogtreecommitdiffhomepage
path: root/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
diff options
context:
space:
mode:
authorGravatar Vijay Vasudevan <vrv@google.com>2015-12-02 15:05:37 -0800
committerGravatar Vijay Vasudevan <vrv@google.com>2015-12-02 15:05:37 -0800
commitbb7a7a8858dc18ba733ed64e0733e27a4224ece8 (patch)
tree26dc98ddbbb220fd008de2925f482edf00a8c6bf /third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
parentbf6b536bde7d8060c489b51fedb58968b8cbfd7c (diff)
TensorFlow: upstream changes from eigen to fix build from
changes in last commit.
Diffstat (limited to 'third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')
-rw-r--r--third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h16
1 files changed, 10 insertions, 6 deletions
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 863c28ab43..b7cea143ff 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -418,11 +418,13 @@ inline void TensorExecutor<Expression, GpuDevice, false, Tileable>::run(
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
- const int num_blocks = device.getNumCudaMultiProcessors() *
- device.maxCudaThreadsPerMultiProcessor() /
- device.maxCudaThreadsPerBlock();
const int block_size = device.maxCudaThreadsPerBlock();
+ const int max_blocks = device.getNumCudaMultiProcessors() *
+ device.maxCudaThreadsPerMultiProcessor() / block_size;
const Index size = array_prod(evaluator.dimensions());
+ // Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
+ const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
+
LAUNCH_CUDA_KERNEL(
(EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>,
Index>),
@@ -438,11 +440,13 @@ inline void TensorExecutor<Expression, GpuDevice, true, Tileable>::run(
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
- const int num_blocks = device.getNumCudaMultiProcessors() *
- device.maxCudaThreadsPerMultiProcessor() /
- device.maxCudaThreadsPerBlock();
const int block_size = device.maxCudaThreadsPerBlock();
+ const int max_blocks = device.getNumCudaMultiProcessors() *
+ device.maxCudaThreadsPerMultiProcessor() / block_size;
const Index size = array_prod(evaluator.dimensions());
+ // Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
+ const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
+
LAUNCH_CUDA_KERNEL(
(EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>,
Index>),