author | Gael Guennebaud <g.gael@free.fr> | 2009-07-07 11:39:19 +0200
---|---|---
committer | Gael Guennebaud <g.gael@free.fr> | 2009-07-07 11:39:19 +0200
commit | 92a35c93b2b53694f77993142ce8fcad70bc5519 (patch) |
tree | 38217646f9cc4720112f09d29eed613bf38fa9df /Eigen |
parent | 544888e342bb7d75412bf0e17b66a3c6196d24eb (diff) |
* extended the cache friendly products to support C = alpha * A * B and C += alpha * A * B
* this makes it possible to optimize expressions like C -= lazy_product; we still have to catch "scalar_product_of_lazy_product"
* started to support conjugation in the cache friendly products (very useful to evaluate A * B.adjoint() without
  evaluating B.adjoint() into a temporary)
* compilation fix
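To illustrate the intent, here is a minimal usage sketch: with this commit, subtracting a plain matrix product is routed to the cache friendly kernel with alpha = -1 instead of being evaluated into a temporary. This assumes the Eigen 2.x API of that era; matrix sizes are arbitrary.

```cpp
#include <Eigen/Core>
using namespace Eigen;

int main()
{
  MatrixXf A = MatrixXf::Random(512, 512);
  MatrixXf B = MatrixXf::Random(512, 512);
  MatrixXf C = MatrixXf::Zero(512, 512);

  // Already supported: dispatched to the cache friendly product with alpha = +1.
  C += A * B;

  // New in this commit: the operator-= overload dispatches the same kernel
  // with alpha = -1, i.e. C -= A * B is computed as C += (-1) * A * B
  // without evaluating A * B into a temporary.
  C -= A * B;

  return 0;
}
```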
Diffstat (limited to 'Eigen')
-rw-r--r-- | Eigen/src/Core/MatrixBase.h | 7
-rw-r--r-- | Eigen/src/Core/Product.h | 69
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixMatrix.h | 182
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixVector.h | 16
-rw-r--r-- | Eigen/src/Core/util/ForwardDeclarations.h | 7
-rw-r--r-- | Eigen/src/Sparse/SparseMatrixBase.h | 8
6 files changed, 181 insertions, 108 deletions
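Before the diff itself, a short sketch of the semantics the alpha-extended kernel implements: res += alpha * lhs * rhs, accumulating into the destination rather than overwriting it. This is a plain reference loop (column-major, no blocking, no vectorization), not the packetized code below; the function and parameter names are illustrative only.

```cpp
// Reference semantics of the alpha-extended kernel: res += alpha * lhs * rhs.
// Simplified sketch: column-major storage, no blocking, no vectorization.
template<typename Scalar>
void gemm_reference(int rows, int cols, int depth,
                    const Scalar* lhs, int lhsStride,
                    const Scalar* rhs, int rhsStride,
                    Scalar* res, int resStride,
                    Scalar alpha)
{
  for (int j = 0; j < cols; ++j)
    for (int i = 0; i < rows; ++i)
    {
      Scalar c = Scalar(0);
      for (int k = 0; k < depth; ++k)
        c += lhs[k * lhsStride + i] * rhs[j * rhsStride + k];
      // Accumulate into the destination instead of overwriting it, so that
      // C += alpha * A * B and C -= alpha * A * B share the same code path.
      res[j * resStride + i] += alpha * c;
    }
}
```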
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 7aedbd3c7..309661f67 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -378,6 +378,9 @@ template<typename Derived> class MatrixBase template<typename Lhs,typename Rhs> Derived& operator+=(const Flagged<Product<Lhs,Rhs,CacheFriendlyProduct>, 0, EvalBeforeNestingBit | EvalBeforeAssigningBit>& other); + template<typename Lhs,typename Rhs> + Derived& operator-=(const Flagged<Product<Lhs,Rhs,CacheFriendlyProduct>, 0, EvalBeforeNestingBit | EvalBeforeAssigningBit>& other); + Derived& operator*=(const Scalar& other); Derived& operator/=(const Scalar& other); @@ -405,7 +408,7 @@ template<typename Derived> class MatrixBase { return *this = *this * other.derived(); } - + template<typename DiagonalDerived> const DiagonalProduct<Derived, DiagonalDerived, DiagonalOnTheRight> operator*(const DiagonalBase<DiagonalDerived> &diagonal) const; @@ -739,7 +742,7 @@ template<typename Derived> class MatrixBase // dense = dense * sparse template<typename Derived1, typename Derived2> Derived& lazyAssign(const SparseProduct<Derived1,Derived2,DenseTimeSparseProduct>& product); - + #ifdef EIGEN_MATRIXBASE_PLUGIN #include EIGEN_MATRIXBASE_PLUGIN #endif diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 75cfadfa1..39b211fa1 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -204,7 +204,7 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product * compute \a res += \c *this using the cache friendly product. */ template<typename DestDerived> - void _cacheFriendlyEvalAndAdd(DestDerived& res) const; + void _cacheFriendlyEvalAndAdd(DestDerived& res, Scalar alpha) const; /** \internal * \returns whether it is worth it to use the cache friendly product. 
@@ -499,13 +499,23 @@ struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMod * Cache friendly product callers and specific nested evaluation strategies ***************************************************************************/ +// Forward declarations + +template<typename Scalar, bool ConjugateLhs, bool ConjugateRhs> +void ei_cache_friendly_product( + int _rows, int _cols, int depth, + bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride, + bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride, + bool resRowMajor, Scalar* res, int resStride, + Scalar alpha); + template<typename Scalar, typename RhsType> static void ei_cache_friendly_product_colmajor_times_vector( - int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, Scalar* res); + int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, Scalar* res, Scalar alpha); template<typename Scalar, typename ResType> static void ei_cache_friendly_product_rowmajor_times_vector( - const Scalar* lhs, int lhsStride, const Scalar* rhs, int rhsSize, ResType& res); + const Scalar* lhs, int lhsStride, const Scalar* rhs, int rhsSize, ResType& res, Scalar alpha); template<typename ProductType, int LhsRows = ei_traits<ProductType>::RowsAtCompileTime, @@ -517,9 +527,9 @@ template<typename ProductType, struct ei_cache_friendly_product_selector { template<typename DestDerived> - inline static void run(DestDerived& res, const ProductType& product) + inline static void run(DestDerived& res, const ProductType& product, typename ProductType::Scalar alpha) { - product._cacheFriendlyEvalAndAdd(res); + product._cacheFriendlyEvalAndAdd(res, alpha); } }; @@ -528,11 +538,13 @@ template<typename ProductType, int LhsRows, int RhsOrder, int RhsAccess> struct ei_cache_friendly_product_selector<ProductType,LhsRows,ColMajor,NoDirectAccess,1,RhsOrder,RhsAccess> { template<typename DestDerived> - inline static void run(DestDerived& res, const ProductType& product) + inline static void run(DestDerived& res, const ProductType& product, typename ProductType::Scalar alpha) { + // FIXME is it really used ? 
+ ei_assert(alpha==typename ProductType::Scalar(1)); const int size = product.rhs().rows(); for (int k=0; k<size; ++k) - res += product.rhs().coeff(k) * product.lhs().col(k); + res += product.rhs().coeff(k) * product.lhs().col(k); } }; @@ -544,7 +556,7 @@ struct ei_cache_friendly_product_selector<ProductType,LhsRows,ColMajor,HasDirect typedef typename ProductType::Scalar Scalar; template<typename DestDerived> - inline static void run(DestDerived& res, const ProductType& product) + inline static void run(DestDerived& res, const ProductType& product, typename ProductType::Scalar alpha) { enum { EvalToRes = (ei_packet_traits<Scalar>::size==1) @@ -559,7 +571,7 @@ struct ei_cache_friendly_product_selector<ProductType,LhsRows,ColMajor,HasDirect } ei_cache_friendly_product_colmajor_times_vector(res.size(), &product.lhs().const_cast_derived().coeffRef(0,0), product.lhs().stride(), - product.rhs(), _res); + product.rhs(), _res, alpha); if (!EvalToRes) { @@ -574,8 +586,9 @@ template<typename ProductType, int LhsOrder, int LhsAccess, int RhsCols> struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCols,RowMajor,NoDirectAccess> { template<typename DestDerived> - inline static void run(DestDerived& res, const ProductType& product) + inline static void run(DestDerived& res, const ProductType& product, typename ProductType::Scalar alpha) { + ei_assert(alpha==typename ProductType::Scalar(1)); const int cols = product.lhs().cols(); for (int j=0; j<cols; ++j) res += product.lhs().coeff(j) * product.rhs().row(j); @@ -590,7 +603,7 @@ struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCo typedef typename ProductType::Scalar Scalar; template<typename DestDerived> - inline static void run(DestDerived& res, const ProductType& product) + inline static void run(DestDerived& res, const ProductType& product, typename ProductType::Scalar alpha) { enum { EvalToRes = (ei_packet_traits<Scalar>::size==1) @@ -605,7 +618,7 @@ struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCo } ei_cache_friendly_product_colmajor_times_vector(res.size(), &product.rhs().const_cast_derived().coeffRef(0,0), product.rhs().stride(), - product.lhs().transpose(), _res); + product.lhs().transpose(), _res, alpha); if (!EvalToRes) { @@ -626,7 +639,7 @@ struct ei_cache_friendly_product_selector<ProductType,LhsRows,RowMajor,HasDirect && (!(Rhs::Flags & RowMajorBit)) }; template<typename DestDerived> - inline static void run(DestDerived& res, const ProductType& product) + inline static void run(DestDerived& res, const ProductType& product, typename ProductType::Scalar alpha) { Scalar* EIGEN_RESTRICT _rhs; if (UseRhsDirectly) @@ -637,7 +650,7 @@ struct ei_cache_friendly_product_selector<ProductType,LhsRows,RowMajor,HasDirect Map<Matrix<Scalar,Rhs::SizeAtCompileTime,1> >(_rhs, product.rhs().size()) = product.rhs(); } ei_cache_friendly_product_rowmajor_times_vector(&product.lhs().const_cast_derived().coeffRef(0,0), product.lhs().stride(), - _rhs, product.rhs().size(), res); + _rhs, product.rhs().size(), res, alpha); if (!UseRhsDirectly) ei_aligned_stack_delete(Scalar, _rhs, product.rhs().size()); } @@ -654,7 +667,7 @@ struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCo && (Lhs::Flags & RowMajorBit) }; template<typename DestDerived> - inline static void run(DestDerived& res, const ProductType& product) + inline static void run(DestDerived& res, const ProductType& product, typename ProductType::Scalar alpha) { Scalar* EIGEN_RESTRICT _lhs; if 
(UseLhsDirectly) @@ -665,7 +678,7 @@ struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCo Map<Matrix<Scalar,Lhs::SizeAtCompileTime,1> >(_lhs, product.lhs().size()) = product.lhs(); } ei_cache_friendly_product_rowmajor_times_vector(&product.rhs().const_cast_derived().coeffRef(0,0), product.rhs().stride(), - _lhs, product.lhs().size(), res); + _lhs, product.lhs().size(), res, alpha); if(!UseLhsDirectly) ei_aligned_stack_delete(Scalar, _lhs, product.lhs().size()); } @@ -691,12 +704,25 @@ inline Derived& MatrixBase<Derived>::operator+=(const Flagged<Product<Lhs,Rhs,CacheFriendlyProduct>, 0, EvalBeforeNestingBit | EvalBeforeAssigningBit>& other) { if (other._expression()._useCacheFriendlyProduct()) - ei_cache_friendly_product_selector<Product<Lhs,Rhs,CacheFriendlyProduct> >::run(const_cast_derived(), other._expression()); + ei_cache_friendly_product_selector<Product<Lhs,Rhs,CacheFriendlyProduct> >::run(const_cast_derived(), other._expression(), Scalar(1)); else lazyAssign(derived() + other._expression()); return derived(); } +/** \internal */ +template<typename Derived> +template<typename Lhs,typename Rhs> +inline Derived& +MatrixBase<Derived>::operator-=(const Flagged<Product<Lhs,Rhs,CacheFriendlyProduct>, 0, EvalBeforeNestingBit | EvalBeforeAssigningBit>& other) +{ + if (other._expression()._useCacheFriendlyProduct()) + ei_cache_friendly_product_selector<Product<Lhs,Rhs,CacheFriendlyProduct> >::run(const_cast_derived(), other._expression(), Scalar(-1)); + else + lazyAssign(derived() - other._expression()); + return derived(); +} + template<typename Derived> template<typename Lhs, typename Rhs> inline Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheFriendlyProduct>& product) @@ -704,7 +730,7 @@ inline Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheFrien if (product._useCacheFriendlyProduct()) { setZero(); - ei_cache_friendly_product_selector<Product<Lhs,Rhs,CacheFriendlyProduct> >::run(const_cast_derived(), product); + ei_cache_friendly_product_selector<Product<Lhs,Rhs,CacheFriendlyProduct> >::run(const_cast_derived(), product, Scalar(1)); } else { @@ -734,7 +760,7 @@ template<typename T> struct ei_product_copy_lhs template<typename Lhs, typename Rhs, int ProductMode> template<typename DestDerived> -inline void Product<Lhs,Rhs,ProductMode>::_cacheFriendlyEvalAndAdd(DestDerived& res) const +inline void Product<Lhs,Rhs,ProductMode>::_cacheFriendlyEvalAndAdd(DestDerived& res, Scalar alpha) const { typedef typename ei_product_copy_lhs<_LhsNested>::type LhsCopy; typedef typename ei_unref<LhsCopy>::type _LhsCopy; @@ -742,11 +768,12 @@ inline void Product<Lhs,Rhs,ProductMode>::_cacheFriendlyEvalAndAdd(DestDerived& typedef typename ei_unref<RhsCopy>::type _RhsCopy; LhsCopy lhs(m_lhs); RhsCopy rhs(m_rhs); - ei_cache_friendly_product<Scalar>( + ei_cache_friendly_product<Scalar,false,false>( rows(), cols(), lhs.cols(), _LhsCopy::Flags&RowMajorBit, (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(), _RhsCopy::Flags&RowMajorBit, (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(), - Flags&RowMajorBit, (Scalar*)&(res.coeffRef(0,0)), res.stride() + Flags&RowMajorBit, (Scalar*)&(res.coeffRef(0,0)), res.stride(), + alpha ); } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 79f02c7e3..e7b8586be 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -30,20 +30,50 @@ struct 
ei_L2_block_traits { enum {width = 8 * ei_meta_sqrt<L2MemorySize/(64*sizeof(Scalar))>::ret }; }; +template<bool ConjLhs, bool ConjRhs> struct ei_conj_pmadd; + +template<> struct ei_conj_pmadd<false,false> +{ + template<typename T> + EIGEN_STRONG_INLINE T operator()(const T& x, const T& y, T& c) const { return ei_pmadd(x,y,c); } +}; + +template<> struct ei_conj_pmadd<false,true> +{ + template<typename T> std::complex<T> operator()(const std::complex<T>& x, const std::complex<T>& y, std::complex<T>& c) const + { return c + std::complex<T>(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_imag(x)*ei_real(y) - ei_real(x)*ei_imag(y)); } +}; + +template<> struct ei_conj_pmadd<true,false> +{ + template<typename T> std::complex<T> operator()(const std::complex<T>& x, const std::complex<T>& y, std::complex<T>& c) const + { return c + std::complex<T>(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } +}; + +template<> struct ei_conj_pmadd<true,true> +{ + template<typename T> std::complex<T> operator()(const std::complex<T>& x, const std::complex<T>& y, std::complex<T>& c) const + { return c + std::complex<T>(ei_real(x)*ei_real(y) - ei_imag(x)*ei_imag(y), - ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } +}; + #ifndef EIGEN_EXTERN_INSTANTIATIONS -template<typename Scalar> +template<typename Scalar, bool ConjugateLhs, bool ConjugateRhs> static void ei_cache_friendly_product( int _rows, int _cols, int depth, bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride, bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride, - bool resRowMajor, Scalar* res, int resStride) + bool resRowMajor, Scalar* res, int resStride, + Scalar alpha) { const Scalar* EIGEN_RESTRICT lhs; const Scalar* EIGEN_RESTRICT rhs; int lhsStride, rhsStride, rows, cols; bool lhsRowMajor; + ei_conj_pmadd<ConjugateLhs,ConjugateRhs> cj_pmadd; + bool hasAlpha = alpha != Scalar(1); + if (resRowMajor) { lhs = _rhs; @@ -119,16 +149,34 @@ static void ei_cache_friendly_product( const Scalar* b1 = &rhs[(j2+1)*rhsStride + k2]; const Scalar* b2 = &rhs[(j2+2)*rhsStride + k2]; const Scalar* b3 = &rhs[(j2+3)*rhsStride + k2]; - for(int k=0; k<actual_kc; k++) + if (hasAlpha) + { + std::cerr << "* by " << alpha << "\n"; + for(int k=0; k<actual_kc; k++) + { + ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k])); + ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k])); + if (nr==4) + { + ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k])); + ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k])); + } + count += nr*PacketSize; + } + } + else { - ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k])); - ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k])); - if (nr==4) + for(int k=0; k<actual_kc; k++) { - ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k])); - ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k])); + ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k])); + ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k])); + if (nr==4) + { + ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k])); + ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k])); + } + count += nr*PacketSize; } - count += nr*PacketSize; } } } @@ -205,59 +253,59 @@ static void ei_cache_friendly_product( A1 = ei_pload(&blA[1*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]); B1 = ei_pload(&blB[1*PacketSize]); - C0 = ei_pmadd(B0, A0, C0); + C0 = cj_pmadd(B0, A0, C0); if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); - C4 = ei_pmadd(B0, A1, C4); + C4 = cj_pmadd(B0, A1, C4); if(nr==4) B3 = 
ei_pload(&blB[3*PacketSize]); B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]); - C1 = ei_pmadd(B1, A0, C1); - C5 = ei_pmadd(B1, A1, C5); + C1 = cj_pmadd(B1, A0, C1); + C5 = cj_pmadd(B1, A1, C5); B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]); - if(nr==4) C2 = ei_pmadd(B2, A0, C2); - if(nr==4) C6 = ei_pmadd(B2, A1, C6); + if(nr==4) C2 = cj_pmadd(B2, A0, C2); + if(nr==4) C6 = cj_pmadd(B2, A1, C6); if(nr==4) B2 = ei_pload(&blB[6*PacketSize]); - if(nr==4) C3 = ei_pmadd(B3, A0, C3); + if(nr==4) C3 = cj_pmadd(B3, A0, C3); A0 = ei_pload(&blA[2*PacketSize]); - if(nr==4) C7 = ei_pmadd(B3, A1, C7); + if(nr==4) C7 = cj_pmadd(B3, A1, C7); A1 = ei_pload(&blA[3*PacketSize]); if(nr==4) B3 = ei_pload(&blB[7*PacketSize]); - C0 = ei_pmadd(B0, A0, C0); - C4 = ei_pmadd(B0, A1, C4); + C0 = cj_pmadd(B0, A0, C0); + C4 = cj_pmadd(B0, A1, C4); B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]); - C1 = ei_pmadd(B1, A0, C1); - C5 = ei_pmadd(B1, A1, C5); + C1 = cj_pmadd(B1, A0, C1); + C5 = cj_pmadd(B1, A1, C5); B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]); - if(nr==4) C2 = ei_pmadd(B2, A0, C2); - if(nr==4) C6 = ei_pmadd(B2, A1, C6); + if(nr==4) C2 = cj_pmadd(B2, A0, C2); + if(nr==4) C6 = cj_pmadd(B2, A1, C6); if(nr==4) B2 = ei_pload(&blB[10*PacketSize]); - if(nr==4) C3 = ei_pmadd(B3, A0, C3); + if(nr==4) C3 = cj_pmadd(B3, A0, C3); A0 = ei_pload(&blA[4*PacketSize]); - if(nr==4) C7 = ei_pmadd(B3, A1, C7); + if(nr==4) C7 = cj_pmadd(B3, A1, C7); A1 = ei_pload(&blA[5*PacketSize]); if(nr==4) B3 = ei_pload(&blB[11*PacketSize]); - C0 = ei_pmadd(B0, A0, C0); - C4 = ei_pmadd(B0, A1, C4); + C0 = cj_pmadd(B0, A0, C0); + C4 = cj_pmadd(B0, A1, C4); B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]); - C1 = ei_pmadd(B1, A0, C1); - C5 = ei_pmadd(B1, A1, C5); + C1 = cj_pmadd(B1, A0, C1); + C5 = cj_pmadd(B1, A1, C5); B1 = ei_pload(&blB[(nr==4 ? 
13 : 7)*PacketSize]); - if(nr==4) C2 = ei_pmadd(B2, A0, C2); - if(nr==4) C6 = ei_pmadd(B2, A1, C6); + if(nr==4) C2 = cj_pmadd(B2, A0, C2); + if(nr==4) C6 = cj_pmadd(B2, A1, C6); if(nr==4) B2 = ei_pload(&blB[14*PacketSize]); - if(nr==4) C3 = ei_pmadd(B3, A0, C3); + if(nr==4) C3 = cj_pmadd(B3, A0, C3); A0 = ei_pload(&blA[6*PacketSize]); - if(nr==4) C7 = ei_pmadd(B3, A1, C7); + if(nr==4) C7 = cj_pmadd(B3, A1, C7); A1 = ei_pload(&blA[7*PacketSize]); if(nr==4) B3 = ei_pload(&blB[15*PacketSize]); - C0 = ei_pmadd(B0, A0, C0); - C4 = ei_pmadd(B0, A1, C4); - C1 = ei_pmadd(B1, A0, C1); - C5 = ei_pmadd(B1, A1, C5); - if(nr==4) C2 = ei_pmadd(B2, A0, C2); - if(nr==4) C6 = ei_pmadd(B2, A1, C6); - if(nr==4) C3 = ei_pmadd(B3, A0, C3); - if(nr==4) C7 = ei_pmadd(B3, A1, C7); + C0 = cj_pmadd(B0, A0, C0); + C4 = cj_pmadd(B0, A1, C4); + C1 = cj_pmadd(B1, A0, C1); + C5 = cj_pmadd(B1, A1, C5); + if(nr==4) C2 = cj_pmadd(B2, A0, C2); + if(nr==4) C6 = cj_pmadd(B2, A1, C6); + if(nr==4) C3 = cj_pmadd(B3, A0, C3); + if(nr==4) C7 = cj_pmadd(B3, A1, C7); blB += 4*nr*PacketSize; blA += 4*mr; @@ -271,16 +319,16 @@ static void ei_cache_friendly_product( A1 = ei_pload(&blA[1*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]); B1 = ei_pload(&blB[1*PacketSize]); - C0 = ei_pmadd(B0, A0, C0); + C0 = cj_pmadd(B0, A0, C0); if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); - C4 = ei_pmadd(B0, A1, C4); + C4 = cj_pmadd(B0, A1, C4); if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); - C1 = ei_pmadd(B1, A0, C1); - C5 = ei_pmadd(B1, A1, C5); - if(nr==4) C2 = ei_pmadd(B2, A0, C2); - if(nr==4) C6 = ei_pmadd(B2, A1, C6); - if(nr==4) C3 = ei_pmadd(B3, A0, C3); - if(nr==4) C7 = ei_pmadd(B3, A1, C7); + C1 = cj_pmadd(B1, A0, C1); + C5 = cj_pmadd(B1, A1, C5); + if(nr==4) C2 = cj_pmadd(B2, A0, C2); + if(nr==4) C6 = cj_pmadd(B2, A1, C6); + if(nr==4) C3 = cj_pmadd(B3, A0, C3); + if(nr==4) C7 = cj_pmadd(B3, A1, C7); blB += nr*PacketSize; blA += mr; @@ -332,14 +380,14 @@ static void ei_cache_friendly_product( { for(int i=0; i<actual_mc; i++) { - Scalar c0 = res[(j2)*resStride + i2+i]; + Scalar c0 = Scalar(0); if (lhsRowMajor) for(int k=0; k<actual_kc; k++) c0 += lhs[(k2+k)+(i2+i)*lhsStride] * rhs[j2*rhsStride + k2 + k]; else for(int k=0; k<actual_kc; k++) c0 += lhs[(k2+k)*lhsStride + i2+i] * rhs[j2*rhsStride + k2 + k]; - res[(j2)*resStride + i2+i] = c0; + res[(j2)*resStride + i2+i] += alpha * c0; } } } @@ -435,39 +483,39 @@ static void ei_cache_friendly_product( L0 = ei_pload(&lb[1*PacketSize]); R1 = ei_pload(&lb[2*PacketSize]); L1 = ei_pload(&lb[3*PacketSize]); - T0 = ei_pmadd(R0, A0, T0); - T1 = ei_pmadd(L0, A0, T1); + T0 = cj_pmadd(R0, A0, T0); + T1 = cj_pmadd(L0, A0, T1); R0 = ei_pload(&lb[4*PacketSize]); L0 = ei_pload(&lb[5*PacketSize]); - T0 = ei_pmadd(R1, A1, T0); - T1 = ei_pmadd(L1, A1, T1); + T0 = cj_pmadd(R1, A1, T0); + T1 = cj_pmadd(L1, A1, T1); R1 = ei_pload(&lb[6*PacketSize]); L1 = ei_pload(&lb[7*PacketSize]); - T0 = ei_pmadd(R0, A2, T0); - T1 = ei_pmadd(L0, A2, T1); + T0 = cj_pmadd(R0, A2, T0); + T1 = cj_pmadd(L0, A2, T1); if(MaxBlockRows==8) { R0 = ei_pload(&lb[8*PacketSize]); L0 = ei_pload(&lb[9*PacketSize]); } - T0 = ei_pmadd(R1, A3, T0); - T1 = ei_pmadd(L1, A3, T1); + T0 = cj_pmadd(R1, A3, T0); + T1 = cj_pmadd(L1, A3, T1); if(MaxBlockRows==8) { R1 = ei_pload(&lb[10*PacketSize]); L1 = ei_pload(&lb[11*PacketSize]); - T0 = ei_pmadd(R0, A4, T0); - T1 = ei_pmadd(L0, A4, T1); + T0 = cj_pmadd(R0, A4, T0); + T1 = cj_pmadd(L0, A4, T1); R0 = ei_pload(&lb[12*PacketSize]); L0 = ei_pload(&lb[13*PacketSize]); - T0 = ei_pmadd(R1, A5, T0); - T1 = ei_pmadd(L1, 
A5, T1); + T0 = cj_pmadd(R1, A5, T0); + T1 = cj_pmadd(L1, A5, T1); R1 = ei_pload(&lb[14*PacketSize]); L1 = ei_pload(&lb[15*PacketSize]); - T0 = ei_pmadd(R0, A6, T0); - T1 = ei_pmadd(L0, A6, T1); - T0 = ei_pmadd(R1, A7, T0); - T1 = ei_pmadd(L1, A7, T1); + T0 = cj_pmadd(R0, A6, T0); + T1 = cj_pmadd(L0, A6, T1); + T0 = cj_pmadd(R1, A7, T0); + T1 = cj_pmadd(L1, A7, T1); } lb += MaxBlockRows*2*PacketSize; diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 2a04b1874..5cb0a9465 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -37,7 +37,8 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_colmajor_times_vector( int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, - Scalar* res) + Scalar* res, + Scalar alpha) { #ifdef _EIGEN_ACCUMULATE_PACKETS #error _EIGEN_ACCUMULATE_PACKETS has already been defined @@ -104,8 +105,8 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_colmajor_times_vector( int columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; for (int i=skipColumns; i<columnBound; i+=columnsAtOnce) { - Packet ptmp0 = ei_pset1(rhs[i]), ptmp1 = ei_pset1(rhs[i+offset1]), - ptmp2 = ei_pset1(rhs[i+2]), ptmp3 = ei_pset1(rhs[i+offset3]); + Packet ptmp0 = ei_pset1(alpha*rhs[i]), ptmp1 = ei_pset1(alpha*rhs[i+offset1]), + ptmp2 = ei_pset1(alpha*rhs[i+2]), ptmp3 = ei_pset1(alpha*rhs[i+offset3]); // this helps a lot generating better binary code const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, @@ -186,7 +187,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_colmajor_times_vector( { for (int i=start; i<end; ++i) { - Packet ptmp0 = ei_pset1(rhs[i]); + Packet ptmp0 = ei_pset1(alpha*rhs[i]); const Scalar* lhs0 = lhs + i*lhsStride; if (PacketSize>1) @@ -226,7 +227,8 @@ template<typename Scalar, typename ResType> static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( const Scalar* lhs, int lhsStride, const Scalar* rhs, int rhsSize, - ResType& res) + ResType& res, + Scalar alpha) { #ifdef _EIGEN_ACCUMULATE_PACKETS #error _EIGEN_ACCUMULATE_PACKETS has already been defined @@ -382,7 +384,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( Scalar b = rhs[j]; tmp0 += b*lhs0[j]; tmp1 += b*lhs1[j]; tmp2 += b*lhs2[j]; tmp3 += b*lhs3[j]; } - res[i] += tmp0; res[i+offset1] += tmp1; res[i+2] += tmp2; res[i+offset3] += tmp3; + res[i] += alpha*tmp0; res[i+offset1] += alpha*tmp1; res[i+2] += alpha*tmp2; res[i+offset3] += alpha*tmp3; } // process remaining first and last rows (at most columnsAtOnce-1) @@ -416,7 +418,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( // FIXME this loop get vectorized by the compiler ! 
for (int j=alignedSize; j<size; ++j) tmp0 += rhs[j] * lhs0[j]; - res[i] += tmp0; + res[i] += alpha*tmp0; } if (skipRows) { diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index db58e4982..3e752c84b 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -102,13 +102,6 @@ template<typename Scalar1,typename Scalar2> struct ei_scalar_multiple2_op; struct IOFormat; -template<typename Scalar> -void ei_cache_friendly_product( - int _rows, int _cols, int depth, - bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride, - bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride, - bool resRowMajor, Scalar* res, int resStride); - // Array module template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType> class Select; template<typename MatrixType, typename BinaryOp, int Direction> class PartialReduxExpr; diff --git a/Eigen/src/Sparse/SparseMatrixBase.h b/Eigen/src/Sparse/SparseMatrixBase.h index 82527f7ef..d62d23a78 100644 --- a/Eigen/src/Sparse/SparseMatrixBase.h +++ b/Eigen/src/Sparse/SparseMatrixBase.h @@ -315,7 +315,7 @@ template<typename Derived> class SparseMatrixBase operator*(const DiagonalBase<OtherDerived> &other) const; // diagonal * sparse - template<typename OtherDerived> friend + template<typename OtherDerived> friend const SparseDiagonalProduct<OtherDerived,Derived> operator*(const DiagonalBase<OtherDerived> &lhs, const SparseMatrixBase& rhs) { return SparseDiagonalProduct<OtherDerived,Derived>(lhs.derived(), rhs.derived()); } @@ -451,14 +451,14 @@ template<typename Derived> class SparseMatrixBase // Derived& setRandom(); // Derived& setIdentity(); + /** \internal use operator= */ template<typename DenseDerived> - void evalToDense(MatrixBase<DenseDerived>& dst) + void evalToDense(MatrixBase<DenseDerived>& dst) const { - dst.resize(rows(),cols()); dst.setZero(); for (int j=0; j<outerSize(); ++j) for (typename Derived::InnerIterator i(derived(),j); i; ++i) - res.coeffRef(i.row(),i.col()) = i.value(); + dst.coeffRef(i.row(),i.col()) = i.value(); } Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime> toDense() const |
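Finally, to clarify the ei_conj_pmadd functor introduced in GeneralMatrixMatrix.h: it is a multiply-accumulate in which either operand may be implicitly conjugated, selected at compile time. That is what allows A * B.adjoint() to be fed to the kernel without first evaluating B.adjoint() into a temporary. A scalar-level sketch of the idea follows; the actual functor additionally handles packets and expands the complex arithmetic by hand rather than calling std::conj in the hot loop, and the names here are illustrative only.

```cpp
#include <complex>

// Scalar sketch of the conjugating multiply-accumulate: returns c + op(x) * op(y),
// where op() is either the identity or complex conjugation, chosen at compile time.
template<bool ConjX, bool ConjY>
struct conj_madd
{
  template<typename T>
  std::complex<T> operator()(const std::complex<T>& x,
                             const std::complex<T>& y,
                             const std::complex<T>& c) const
  {
    const std::complex<T> xx = ConjX ? std::conj(x) : x;
    const std::complex<T> yy = ConjY ? std::conj(y) : y;
    return c + xx * yy;
  }
};

// Usage: accumulating a * conj(b), as needed when evaluating A * B.adjoint():
//   acc = conj_madd<false, true>()(a, b, acc);
```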