Fix a regression when using OpenMP, and fix bug #714: the number of threads might be lower than the number of requested ones

author: Gael Guennebaud <g.gael@free.fr> 2015-02-18 15:19:23 +0100
committer: Gael Guennebaud <g.gael@free.fr> 2015-02-18 15:19:23 +0100
commit: c7bb1e8ea8dfc984788d0cb77b82a90468393c2e (patch)
tree: 7c215b4e3975cfb9d23df098bf76c0ed9bcb3b89
parent: 548b7813805d9e314f97eb6f731d711df663a46b (diff)
3 files changed, 33 insertions, 14 deletions
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 44e44b986..c38c12c31 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -217,8 +217,9 @@ struct gemm_functor
     : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
   {}
 
-  void initParallelSession() const
+  void initParallelSession(Index num_threads) const
   {
+    m_blocking.initParallel(m_lhs.rows(), m_rhs.cols(), m_lhs.cols(), num_threads);
     m_blocking.allocateA();
   }
 
@@ -276,7 +277,7 @@ class level3_blocking
 };
 
 template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
-class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true>
+class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true /* == FiniteAtCompileTime */>
   : public level3_blocking<
       typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
       typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
@@ -299,7 +300,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
 
   public:
 
-    gemm_blocking_space(Index /*rows*/, Index /*cols*/, Index /*depth*/, int /*num_threads*/, bool /*full_rows = false*/)
+    gemm_blocking_space(Index /*rows*/, Index /*cols*/, Index /*depth*/, Index /*num_threads*/, bool /*full_rows = false*/)
     {
       this->m_mc = ActualRows;
       this->m_nc = ActualCols;
@@ -307,6 +308,9 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
       this->m_blockA = m_staticA;
       this->m_blockB = m_staticB;
     }
+    
+    void initParallel(Index, Index, Index, Index)
+    {}
 
     inline void allocateA() {}
     inline void allocateB() {}
@@ -331,7 +335,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
 
   public:
 
-    gemm_blocking_space(Index rows, Index cols, Index depth, int num_threads, bool l3_blocking)
+    gemm_blocking_space(Index rows, Index cols, Index depth, Index num_threads, bool l3_blocking)
     {
       this->m_mc = Transpose ? cols : rows;
       this->m_nc = Transpose ? rows : cols;
@@ -351,6 +355,19 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
       m_sizeA = this->m_mc * this->m_kc;
       m_sizeB = this->m_kc * this->m_nc;
     }
+    
+    void initParallel(Index rows, Index cols, Index depth, Index num_threads)
+    {
+      this->m_mc = Transpose ? cols : rows;
+      this->m_nc = Transpose ? rows : cols;
+      this->m_kc = depth;
+      
+      eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0);      
+      Index m = this->m_mc;
+      computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc, num_threads);
+      m_sizeA = this->m_mc * this->m_kc;
+      m_sizeB = this->m_kc * this->m_nc;
+    }
 
     void allocateA()
     {
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 2b90abf8f..91d37a123 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -120,25 +120,28 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
     return func(0,rows, 0,cols);
 
   Eigen::initParallel();
-  func.initParallelSession();
+  func.initParallelSession(threads);
 
   if(transpose)
     std::swap(rows,cols);
-
-  Index blockCols = (cols / threads) & ~Index(0x3);
-  Index blockRows = (rows / threads);
-  blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
   
   ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);
-
+  
   #pragma omp parallel num_threads(threads)
   {
     Index i = omp_get_thread_num();
+    // Note that the actual number of threads might be lower than the number of request ones.
+    Index actual_threads = omp_get_num_threads();
+    
+    Index blockCols = (cols / actual_threads) & ~Index(0x3);
+    Index blockRows = (rows / actual_threads);
+    blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
+  
     Index r0 = i*blockRows;
-    Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows;
+    Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows;
 
     Index c0 = i*blockCols;
-    Index actualBlockCols = (i+1==threads) ? cols-c0 : blockCols;
+    Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols;
 
     info[i].lhs_start = r0;
     info[i].lhs_length = actualBlockRows;
diff --git a/test/product_large.cpp b/test/product_large.cpp
index ffb8b7bf2..84c489580 100644
--- a/test/product_large.cpp
+++ b/test/product_large.cpp
@@ -64,8 +64,7 @@ void test_product_large()
 #endif
 
   // Regression test for bug 714:
-#ifdef EIGEN_HAS_OPENMP
-  std::cout << "Testing omp_set_dynamic(1)\n";
+#if defined EIGEN_HAS_OPENMP
   omp_set_dynamic(1);
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_6( product(Matrix<float,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
author	Gael Guennebaud <g.gael@free.fr>	2015-02-18 15:19:23 +0100
committer	Gael Guennebaud <g.gael@free.fr>	2015-02-18 15:19:23 +0100
commit	c7bb1e8ea8dfc984788d0cb77b82a90468393c2e (patch)
tree	7c215b4e3975cfb9d23df098bf76c0ed9bcb3b89
parent	548b7813805d9e314f97eb6f731d711df663a46b (diff)