diff options
author | Gael Guennebaud <g.gael@free.fr> | 2015-02-18 15:19:23 +0100 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2015-02-18 15:19:23 +0100 |
commit | c7bb1e8ea8dfc984788d0cb77b82a90468393c2e (patch) | |
tree | 7c215b4e3975cfb9d23df098bf76c0ed9bcb3b89 | |
parent | 548b7813805d9e314f97eb6f731d711df663a46b (diff) |
Fix a regression when using OpenMP, and fix bug #714: the number of threads might be lower than the number of requested ones
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixMatrix.h | 25 | ||||
-rw-r--r-- | Eigen/src/Core/products/Parallelizer.h | 19 | ||||
-rw-r--r-- | test/product_large.cpp | 3 |
3 files changed, 33 insertions, 14 deletions
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 44e44b986..c38c12c31 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -217,8 +217,9 @@ struct gemm_functor : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking) {} - void initParallelSession() const + void initParallelSession(Index num_threads) const { + m_blocking.initParallel(m_lhs.rows(), m_rhs.cols(), m_lhs.cols(), num_threads); m_blocking.allocateA(); } @@ -276,7 +277,7 @@ class level3_blocking }; template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor> -class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true> +class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true /* == FiniteAtCompileTime */> : public level3_blocking< typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type, typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type> @@ -299,7 +300,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M public: - gemm_blocking_space(Index /*rows*/, Index /*cols*/, Index /*depth*/, int /*num_threads*/, bool /*full_rows = false*/) + gemm_blocking_space(Index /*rows*/, Index /*cols*/, Index /*depth*/, Index /*num_threads*/, bool /*full_rows = false*/) { this->m_mc = ActualRows; this->m_nc = ActualCols; @@ -307,6 +308,9 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M this->m_blockA = m_staticA; this->m_blockB = m_staticB; } + + void initParallel(Index, Index, Index, Index) + {} inline void allocateA() {} inline void allocateB() {} @@ -331,7 +335,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M public: - gemm_blocking_space(Index rows, Index cols, Index depth, int 
num_threads, bool l3_blocking) + gemm_blocking_space(Index rows, Index cols, Index depth, Index num_threads, bool l3_blocking) { this->m_mc = Transpose ? cols : rows; this->m_nc = Transpose ? rows : cols; @@ -351,6 +355,19 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M m_sizeA = this->m_mc * this->m_kc; m_sizeB = this->m_kc * this->m_nc; } + + void initParallel(Index rows, Index cols, Index depth, Index num_threads) + { + this->m_mc = Transpose ? cols : rows; + this->m_nc = Transpose ? rows : cols; + this->m_kc = depth; + + eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0); + Index m = this->m_mc; + computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc, num_threads); + m_sizeA = this->m_mc * this->m_kc; + m_sizeB = this->m_kc * this->m_nc; + } void allocateA() { diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 2b90abf8f..91d37a123 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -120,25 +120,28 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos return func(0,rows, 0,cols); Eigen::initParallel(); - func.initParallelSession(); + func.initParallelSession(threads); if(transpose) std::swap(rows,cols); - - Index blockCols = (cols / threads) & ~Index(0x3); - Index blockRows = (rows / threads); - blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr; ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0); - + #pragma omp parallel num_threads(threads) { Index i = omp_get_thread_num(); + // Note that the actual number of threads might be lower than the number of requested ones. 
+ Index actual_threads = omp_get_num_threads(); + + Index blockCols = (cols / actual_threads) & ~Index(0x3); + Index blockRows = (rows / actual_threads); + blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr; + Index r0 = i*blockRows; - Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows; + Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows; Index c0 = i*blockCols; - Index actualBlockCols = (i+1==threads) ? cols-c0 : blockCols; + Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols; info[i].lhs_start = r0; info[i].lhs_length = actualBlockRows; diff --git a/test/product_large.cpp b/test/product_large.cpp index ffb8b7bf2..84c489580 100644 --- a/test/product_large.cpp +++ b/test/product_large.cpp @@ -64,8 +64,7 @@ void test_product_large() #endif // Regression test for bug 714: -#ifdef EIGEN_HAS_OPENMP - std::cout << "Testing omp_set_dynamic(1)\n"; +#if defined EIGEN_HAS_OPENMP omp_set_dynamic(1); for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_6( product(Matrix<float,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); |