diff options
author | Vijay Vasudevan <vrv@google.com> | 2015-12-02 15:05:37 -0800 |
---|---|---|
committer | Vijay Vasudevan <vrv@google.com> | 2015-12-02 15:05:37 -0800 |
commit | bb7a7a8858dc18ba733ed64e0733e27a4224ece8 (patch) | |
tree | 26dc98ddbbb220fd008de2925f482edf00a8c6bf /third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | |
parent | bf6b536bde7d8060c489b51fedb58968b8cbfd7c (diff) |
TensorFlow: upstream changes from eigen to fix build from
changes in last commit.
Diffstat (limited to 'third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')
-rw-r--r-- | third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 16 |
1 file changed, 10 insertions, 6 deletions
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 863c28ab43..b7cea143ff 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -418,11 +418,13 @@ inline void TensorExecutor<Expression, GpuDevice, false, Tileable>::run( TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { - const int num_blocks = device.getNumCudaMultiProcessors() * - device.maxCudaThreadsPerMultiProcessor() / - device.maxCudaThreadsPerBlock(); const int block_size = device.maxCudaThreadsPerBlock(); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / block_size; const Index size = array_prod(evaluator.dimensions()); + // Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0. + const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1); + LAUNCH_CUDA_KERNEL( (EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), @@ -438,11 +440,13 @@ inline void TensorExecutor<Expression, GpuDevice, true, Tileable>::run( TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { - const int num_blocks = device.getNumCudaMultiProcessors() * - device.maxCudaThreadsPerMultiProcessor() / - device.maxCudaThreadsPerBlock(); const int block_size = device.maxCudaThreadsPerBlock(); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / block_size; const Index size = array_prod(evaluator.dimensions()); + // Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0. 
+ const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1); + LAUNCH_CUDA_KERNEL( (EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), |