aboutsummaryrefslogtreecommitdiffhomepage
path: root/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
diff options
context:
space:
mode:
authorGravatar Vijay Vasudevan <vrv@google.com>2015-12-02 15:05:37 -0800
committerGravatar Vijay Vasudevan <vrv@google.com>2015-12-02 15:05:37 -0800
commitbb7a7a8858dc18ba733ed64e0733e27a4224ece8 (patch)
tree26dc98ddbbb220fd008de2925f482edf00a8c6bf /third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
parentbf6b536bde7d8060c489b51fedb58968b8cbfd7c (diff)
TensorFlow: upstream changes from eigen to fix build from
changes in last commit.
Diffstat (limited to 'third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')
-rw-r--r--third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h16
1 files changed, 10 insertions, 6 deletions
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 863c28ab43..b7cea143ff 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -418,11 +418,13 @@ inline void TensorExecutor<Expression, GpuDevice, false, Tileable>::run(
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
- const int num_blocks = device.getNumCudaMultiProcessors() *
- device.maxCudaThreadsPerMultiProcessor() /
- device.maxCudaThreadsPerBlock();
const int block_size = device.maxCudaThreadsPerBlock();
+ const int max_blocks = device.getNumCudaMultiProcessors() *
+ device.maxCudaThreadsPerMultiProcessor() / block_size;
const Index size = array_prod(evaluator.dimensions());
+ // Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
+ const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
+
LAUNCH_CUDA_KERNEL(
(EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>,
Index>),
@@ -438,11 +440,13 @@ inline void TensorExecutor<Expression, GpuDevice, true, Tileable>::run(
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
- const int num_blocks = device.getNumCudaMultiProcessors() *
- device.maxCudaThreadsPerMultiProcessor() /
- device.maxCudaThreadsPerBlock();
const int block_size = device.maxCudaThreadsPerBlock();
+ const int max_blocks = device.getNumCudaMultiProcessors() *
+ device.maxCudaThreadsPerMultiProcessor() / block_size;
const Index size = array_prod(evaluator.dimensions());
+ // Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
+ const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
+
LAUNCH_CUDA_KERNEL(
(EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>,
Index>),