author     Gael Guennebaud <g.gael@free.fr>   2009-07-23 14:20:45 +0200
committer  Gael Guennebaud <g.gael@free.fr>   2009-07-23 14:20:45 +0200
commit     713c92140c0033265b91ea0089bf6af5a89dff4c (patch)
tree       21b1313f16b1c591df7f2eff1d4754e19f1d3860 /Eigen/src
parent     eee14846e3a676674899e164572564315ada3114 (diff)
improve SYMV: it is now faster and ready for use
Diffstat (limited to 'Eigen/src')

 -rw-r--r--  Eigen/src/Core/SelfAdjointView.h                    67
 -rw-r--r--  Eigen/src/Core/products/SelfadjointMatrixVector.h   89

 2 files changed, 106 insertions, 50 deletions
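In short, this commit routes the selfadjoint (symmetric/hermitian) matrix-vector product through ei_blas_traits, so that scalar factors and conjugation on either operand are folded into a single SYMV-style kernel call, and it extends the kernel itself with an rhs increment and an alpha scale factor. A minimal sketch of the kind of expression this targets is below; the selfadjointView<LowerTriangular>() accessor is an assumption about the development-branch API of the time and does not appear in this diff:

    #include <Eigen/Core>
    using namespace Eigen;

    void symv_example()
    {
      int n = 100;
      MatrixXf A = MatrixXf::Random(n,n);  // only the lower triangle is read
      VectorXf x = VectorXf::Random(n);
      float s = 2.5f;

      // The factor s is pulled out by ei_blas_traits::extractScalarFactor in
      // the wrapper below and reaches ei_product_selfadjoint_vector as part of
      // the actualAlpha argument, so no scaled temporary of x is created.
      VectorXf y = A.selfadjointView<LowerTriangular>() * (s * x);
    }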
diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h
index c73a3ffce..540f4fe93 100644
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -120,7 +120,7 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
 
   /** Perform a symmetric rank 2 update of the selfadjoint matrix \c *this:
     * \f$ this = this + \alpha ( u v^* + v u^*) \f$
-    *
+    *
     * The vectors \a u and \c v \b must be column vectors, however they can be
     * an adjoint expression without any overhead. Only the meaningful triangular
     * part of the matrix is updated, the rest is left unchanged.
@@ -184,34 +184,67 @@ struct ei_triangular_assignment_selector<Derived1, Derived2, SelfAdjoint, Dynami
 * Wrapper to ei_product_selfadjoint_vector
 ***************************************************************************/
 
-template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
-struct ei_selfadjoint_product_returntype<Lhs,LhsMode,false,Rhs,RhsMode,true>
-  : public ReturnByValue<ei_selfadjoint_product_returntype<Lhs,LhsMode,false,Rhs,RhsMode,true>,
+template<typename Lhs, int LhsMode, typename Rhs>
+struct ei_selfadjoint_product_returntype<Lhs,LhsMode,false,Rhs,0,true>
+  : public ReturnByValue<ei_selfadjoint_product_returntype<Lhs,LhsMode,false,Rhs,0,true>,
                          Matrix<typename ei_traits<Rhs>::Scalar,
                                 Rhs::RowsAtCompileTime,Rhs::ColsAtCompileTime> >
 {
-  typedef typename ei_cleantype<typename Rhs::Nested>::type RhsNested;
+  typedef typename Lhs::Scalar Scalar;
+
+  typedef typename Lhs::Nested LhsNested;
+  typedef typename ei_cleantype<LhsNested>::type _LhsNested;
+  typedef ei_blas_traits<_LhsNested> LhsBlasTraits;
+  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+  typedef typename ei_cleantype<ActualLhsType>::type _ActualLhsType;
+
+  typedef typename Rhs::Nested RhsNested;
+  typedef typename ei_cleantype<RhsNested>::type _RhsNested;
+  typedef ei_blas_traits<_RhsNested> RhsBlasTraits;
+  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  typedef typename ei_cleantype<ActualRhsType>::type _ActualRhsType;
+
+  enum {
+    LhsUpLo = LhsMode&(UpperTriangularBit|LowerTriangularBit)
+  };
+
   ei_selfadjoint_product_returntype(const Lhs& lhs, const Rhs& rhs)
     : m_lhs(lhs), m_rhs(rhs)
   {}
 
+  template<typename Dest> inline void _addTo(Dest& dst) const
+  { evalTo(dst,1); }
+  template<typename Dest> inline void _subTo(Dest& dst) const
+  { evalTo(dst,-1); }
+
   template<typename Dest> void evalTo(Dest& dst) const
   {
-    dst.resize(m_rhs.rows(), m_rhs.cols());
-    ei_product_selfadjoint_vector<typename Lhs::Scalar,ei_traits<Lhs>::Flags&RowMajorBit,
-                                  LhsMode&(UpperTriangularBit|LowerTriangularBit)>
+    dst.resize(m_lhs.rows(), m_rhs.cols());
+    dst.setZero();
+    evalTo(dst,1);
+  }
+
+  template<typename Dest> void evalTo(Dest& dst, Scalar alpha) const
+  {
+    const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs);
+    const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs);
+
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
+                               * RhsBlasTraits::extractScalarFactor(m_rhs);
+
+    ei_assert((&dst.coeff(1))-(&dst.coeff(0))==1 && "not implemented yet");
+    ei_product_selfadjoint_vector<Scalar, ei_traits<_ActualLhsType>::Flags&RowMajorBit,
+                                  int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>
       (
-        m_lhs.rows(),   // size
-        m_lhs.data(),   // lhs
-        m_lhs.stride(), // lhsStride,
-        m_rhs.data(),   // rhs
-        // int rhsIncr,
-        dst.data()      // res
+        lhs.rows(),                                     // size
+        &lhs.coeff(0,0), lhs.stride(),                  // lhs info
+        &rhs.coeff(0), (&rhs.coeff(1))-(&rhs.coeff(0)), // rhs info
+        &dst.coeffRef(0),                               // result info
+        actualAlpha                                     // scale factor
       );
   }
 
-  const typename Lhs::Nested m_lhs;
-  const typename Rhs::Nested m_rhs;
+  const LhsNested m_lhs;
+  const RhsNested m_rhs;
 };
 
 /***************************************************************************
@@ -235,7 +268,7 @@ struct ei_selfadjoint_product_returntype<Lhs,LhsMode,false,Rhs,RhsMode,false>
   typedef ei_blas_traits<_LhsNested> LhsBlasTraits;
   typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
   typedef typename ei_cleantype<ActualLhsType>::type _ActualLhsType;
-
+
   typedef typename Rhs::Nested RhsNested;
   typedef typename ei_cleantype<RhsNested>::type _RhsNested;
   typedef ei_blas_traits<_RhsNested> RhsBlasTraits;
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h
index 8ac83772c..8e0dc9526 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -30,12 +30,12 @@
  * the number of load/stores of the result by a factor 2 and to reduce
  * the instruction dependency.
  */
-template<typename Scalar, int StorageOrder, int UpLo>
+template<typename Scalar, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs>
 static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector(
   int size,
-  const Scalar* lhs, int lhsStride,
-  const Scalar* rhs, //int rhsIncr,
-  Scalar* res)
+  const Scalar* lhs, int lhsStride,
+  const Scalar* _rhs, int rhsIncr,
+  Scalar* res, Scalar alpha)
 {
   typedef typename ei_packet_traits<Scalar>::type Packet;
   const int PacketSize = sizeof(Packet)/sizeof(Scalar);
@@ -46,8 +46,22 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector(
     FirstTriangular = IsRowMajor == IsLower
   };
 
-  ei_conj_if<NumTraits<Scalar>::IsComplex && IsRowMajor> conj0;
-  ei_conj_if<NumTraits<Scalar>::IsComplex && !IsRowMajor> conj1;
+  ei_conj_helper<NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor),  ConjugateRhs> cj0;
+  ei_conj_helper<NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1;
+
+  Scalar cjAlpha = ConjugateRhs ? ei_conj(alpha) : alpha;
+
+  // if the rhs is not sequentially stored in memory we copy it to a temporary buffer,
+  // this is because we need to extract packets
+  const Scalar* EIGEN_RESTRICT rhs = _rhs;
+  if (rhsIncr!=1)
+  {
+    Scalar* r = ei_aligned_stack_new(Scalar, size);
+    const Scalar* it = _rhs;
+    for (int i=0; i<size; ++i, it+=rhsIncr)
+      r[i] = *it;
+    rhs = r;
+  }
 
   for (int i=0;i<size;i++)
     res[i] = 0;
@@ -62,9 +76,9 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector(
     register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
     register const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride;
 
-    Scalar t0 = rhs[j];
+    Scalar t0 = cjAlpha * rhs[j];
     Packet ptmp0 = ei_pset1(t0);
-    Scalar t1 = rhs[j+1];
+    Scalar t1 = cjAlpha * rhs[j+1];
     Packet ptmp1 = ei_pset1(t1);
 
     Scalar t2 = 0;
@@ -78,17 +92,17 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector(
     size_t alignedStart = (starti) + ei_alignmentOffset(&res[starti], endi-starti);
     alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
 
-    res[j] += t0 * conj0(A0[j]);
+    res[j] += cj0.pmul(A0[j], t0);
     if(FirstTriangular)
     {
-      res[j+1] += t1 * conj0(A1[j+1]);
-      res[j] += t1 * conj0(A1[j]);
-      t3 += conj1(A1[j]) * rhs[j];
+      res[j+1] += cj0.pmul(A1[j+1], t1);
+      res[j] += cj0.pmul(A1[j], t1);
+      t3 += cj1.pmul(A1[j], rhs[j]);
     }
     else
     {
-      res[j+1] += t0 * conj0(A0[j+1]) + t1 * conj0(A1[j+1]);
-      t2 += conj1(A0[j+1]) * rhs[j+1];
+      res[j+1] += cj0.pmul(A0[j+1],t0) + cj0.pmul(A1[j+1],t1);
+      t2 += cj1.pmul(A0[j+1], rhs[j+1]);
     }
 
     for (size_t i=starti; i<alignedStart; ++i)
@@ -97,41 +111,50 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector(
       t2 += ei_conj(A0[i]) * rhs[i];
       t3 += ei_conj(A1[i]) * rhs[i];
     }
+    // Yes, this is an optimization for gcc 4.3 and 4.4 (=> huge speed up)
+    // gcc 4.2 does this optimization automatically.
+    const Scalar* EIGEN_RESTRICT a0It  = A0  + alignedStart;
+    const Scalar* EIGEN_RESTRICT a1It  = A1  + alignedStart;
+    const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
+    Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
     for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize)
     {
-      Packet A0i = ei_ploadu(&A0[i]);
-      Packet A1i = ei_ploadu(&A1[i]);
-      Packet Bi = ei_ploadu(&rhs[i]); // FIXME should be aligned in most cases
-      Packet Xi = ei_pload(&res[i]);
-
-      Xi = ei_padd(ei_padd(Xi, ei_pmul(ptmp0, conj0(A0i))), ei_pmul(ptmp1, conj0(A1i)));
-      ptmp2 = ei_padd(ptmp2, ei_pmul(conj1(A0i), Bi));
-      ptmp3 = ei_padd(ptmp3, ei_pmul(conj1(A1i), Bi));
-      ei_pstore(&res[i],Xi);
+      Packet A0i = ei_ploadu(a0It);  a0It  += PacketSize;
+      Packet A1i = ei_ploadu(a1It);  a1It  += PacketSize;
+      Packet Bi  = ei_ploadu(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases
+      Packet Xi  = ei_pload (resIt);
+
+      Xi    = cj0.pmadd(A0i,ptmp0, cj0.pmadd(A1i,ptmp1,Xi));
+      ptmp2 = cj1.pmadd(A0i, Bi, ptmp2);
+      ptmp3 = cj1.pmadd(A1i, Bi, ptmp3);
+      ei_pstore(resIt,Xi); resIt += PacketSize;
     }
     for (size_t i=alignedEnd; i<endi; i++)
     {
-      res[i] += t0 * conj0(A0[i]) + t1 * conj0(A1[i]);
-      t2 += conj1(A0[i]) * rhs[i];
-      t3 += conj1(A1[i]) * rhs[i];
+      res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
+      t2 += cj1.pmul(A0[i], rhs[i]);
+      t3 += cj1.pmul(A1[i], rhs[i]);
     }
 
-    res[j] += t2 + ei_predux(ptmp2);
-    res[j+1] += t3 + ei_predux(ptmp3);
+    res[j]   += alpha * (t2 + ei_predux(ptmp2));
+    res[j+1] += alpha * (t3 + ei_predux(ptmp3));
   }
   for (int j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++)
   {
     register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
 
-    Scalar t1 = rhs[j];
+    Scalar t1 = cjAlpha * rhs[j];
     Scalar t2 = 0;
-    res[j] += t1 * conj0(A0[j]);
+    res[j] += cj0.pmul(A0[j],t1);
     for (int i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++)
     {
-      res[i] += t1 * conj0(A0[i]);
-      t2 += conj1(A0[i]) * rhs[i];
+      res[i] += cj0.pmul(A0[i], t1);
+      t2 += cj1.pmul(A0[i], rhs[i]);
    }
-    res[j] += t2;
+    res[j] += alpha * t2;
   }
+
+  if(rhsIncr!=1)
+    ei_aligned_stack_delete(Scalar, const_cast<Scalar*>(rhs), size);
 }
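For reference, the updated kernel can also be exercised directly; a sketch under the new signature follows. The helper name symv_lower and the choice UpLo = LowerTriangularBit for a standalone call are assumptions (the wrapper above derives UpLo from the view's mode bits via LhsUpLo):

    // Computes y = alpha * A * x where A is column-major and selfadjoint,
    // reading only its lower triangle. Note that the kernel still clears res
    // itself (the unchanged "res[i] = 0" loop), so y is overwritten rather
    // than accumulated into. An increment incx != 1 triggers the new code
    // path that packs x into a contiguous aligned stack buffer before the
    // vectorized sweep.
    template<typename Scalar>
    void symv_lower(int n, const Scalar* A, int lda,
                    const Scalar* x, int incx,
                    Scalar* y, Scalar alpha)
    {
      ei_product_selfadjoint_vector<Scalar,
                                    0,                   // StorageOrder: column-major
                                    LowerTriangularBit,  // UpLo, as in LhsUpLo above
                                    false, false>        // ConjugateLhs, ConjugateRhs
        (n, A, lda, x, incx, y, alpha);
    }

Folding alpha into the kernel, rather than scaling the result afterwards, is what lets the wrapper merge the scalar factors extracted from both operands (actualAlpha) into a single pass over the matrix.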