diff options
author | Gael Guennebaud <g.gael@free.fr> | 2015-06-26 10:32:34 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2015-06-26 10:32:34 +0200 |
commit | 53b930887d118af5204840231f08b3307addce4e (patch) | |
tree | 7dd6cd3ab5929fd0f62e6f03052de97ed61ce300 /Eigen/src/SparseCore/SparseDenseProduct.h | |
parent | 3f49cf4c90e603fa6bae6e8640e8751d67b5d7fd (diff) |
Enable OpenMP parallelization of row-major-sparse * dense products.
I observed significant speed-up of the CG solver.
Diffstat (limited to 'Eigen/src/SparseCore/SparseDenseProduct.h')
-rw-r--r-- | Eigen/src/SparseCore/SparseDenseProduct.h | 39 |
1 file changed, 32 insertions, 7 deletions
diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h index 731d40f29..b8d71d3f8 100644 --- a/Eigen/src/SparseCore/SparseDenseProduct.h +++ b/Eigen/src/SparseCore/SparseDenseProduct.h @@ -30,23 +30,48 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t typedef typename internal::remove_all<DenseRhsType>::type Rhs; typedef typename internal::remove_all<DenseResType>::type Res; typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator; + typedef typename evaluator<Lhs>::type LhsEval; static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha) { - typename evaluator<Lhs>::type lhsEval(lhs); + LhsEval lhsEval(lhs); + + Index n = lhs.outerSize(); +#ifdef EIGEN_HAS_OPENMP + Eigen::initParallel(); + Index threads = Eigen::nbThreads(); +#endif + for(Index c=0; c<rhs.cols(); ++c) { - Index n = lhs.outerSize(); - for(Index j=0; j<n; ++j) +#ifdef EIGEN_HAS_OPENMP + // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems. + // It basically represents the minimal amount of work to be done to be worth it. 
+ if(threads>1 && lhs.nonZeros() > 20000) { - typename Res::Scalar tmp(0); - for(LhsInnerIterator it(lhsEval,j); it ;++it) - tmp += it.value() * rhs.coeff(it.index(),c); - res.coeffRef(j,c) += alpha * tmp; + #pragma omp parallel for schedule(static) num_threads(threads) + for(Index i=0; i<n; ++i) + processRow(lhsEval,rhs,res,alpha,i,c); + } + else +#endif + { + for(Index i=0; i<n; ++i) + processRow(lhsEval,rhs,res,alpha,i,c); } } } + + static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha, Index i, Index col) + { + typename Res::Scalar tmp(0); + for(LhsInnerIterator it(lhsEval,i); it ;++it) + tmp += it.value() * rhs.coeff(it.index(),col); + res.coeffRef(i,col) += alpha * tmp; + } + }; +// FIXME: what is the purpose of the following specialization? Is it for the BlockedSparse format? template<typename T1, typename T2/*, int _Options, typename _StrideType*/> struct scalar_product_traits<T1, Ref<T2/*, _Options, _StrideType*/> > { |