author     Gael Guennebaud <g.gael@free.fr>    2008-07-12 12:12:02 +0000
committer  Gael Guennebaud <g.gael@free.fr>    2008-07-12 12:12:02 +0000
commit     b7bd1b3446aafe2fa81b9cd3218d9fb902ba2bbc (patch)
tree       d08fe60c2a32bd9e01eefa94a27b983ed3d4d1ee /Eigen/src
parent     6f71ef8277405d268032f7c3bcaf316c7422c133 (diff)
Add a *very efficient* evaluation path for both col-major matrix * vector
and vector * row-major matrix products. Currently, it is enabled only if the matrix
has the DirectAccessBit flag and the product is "large enough".
Added the respective unit tests in test/product.cpp.
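
For illustration only (this snippet is not part of the commit): a caller that should hit the new path, since plain matrices carry the DirectAccessBit flag and a 1000x1000 product is comfortably "large enough":

    #include <Eigen/Core>
    using namespace Eigen;

    int main()
    {
      MatrixXf A(1000, 1000);        // plain col-major matrix: has DirectAccessBit
      VectorXf x(1000), r(1000);
      for (int j = 0; j < 1000; ++j)
      {
        x[j] = 1.0f;
        for (int i = 0; i < 1000; ++i)
          A(i, j) = 1.0f;
      }
      r = A * x;                     // col-major matrix * vector -> new kernel
      return 0;                      // each r[i] should be 1000
    }

The symmetric vector * row-major case is routed through the same kernel by transposing the operands, as the Product.h changes below show.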
Diffstat (limited to 'Eigen/src')
-rw-r--r--  Eigen/src/Core/Assign.h               |   1
-rw-r--r--  Eigen/src/Core/Block.h                |   1
-rw-r--r--  Eigen/src/Core/CacheFriendlyProduct.h | 165
-rw-r--r--  Eigen/src/Core/Product.h              |  97
4 files changed, 250 insertions, 14 deletions
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index 828b49725..ba53e299d 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -249,7 +249,6 @@ struct ei_assign_impl<Derived1, Derived2, InnerVectorization, NoUnrolling>
 {
   static void run(Derived1 &dst, const Derived2 &src)
   {
-    const bool rowMajor = int(Derived1::Flags)&RowMajorBit;
     const int innerSize = dst.innerSize();
     const int outerSize = dst.outerSize();
     const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 245357511..ae4e83c9e 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -155,7 +155,6 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block
       return m_matrix.const_cast_derived()
             .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
                       m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
-
     }

     inline const Scalar coeff(int index) const
diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h
index 0da84eeac..a710d44d4 100644
--- a/Eigen/src/Core/CacheFriendlyProduct.h
+++ b/Eigen/src/Core/CacheFriendlyProduct.h
@@ -25,6 +25,8 @@
 #ifndef EIGEN_CACHE_FRIENDLY_PRODUCT_H
 #define EIGEN_CACHE_FRIENDLY_PRODUCT_H

+#ifndef EIGEN_EXTERN_INSTANTIATIONS
+
 template<typename Scalar>
 static void ei_cache_friendly_product(
   int _rows, int _cols, int depth,
@@ -77,8 +79,6 @@ static void ei_cache_friendly_product(
     MaxL2BlockSize = EIGEN_TUNE_FOR_L2_CACHE_SIZE / sizeof(Scalar)
   };

-
-  //const bool rhsIsAligned = (PacketSize==1) || (((rhsStride%PacketSize) == 0) && (size_t(rhs)%16==0));
   const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0));

   const int remainingSize = depth % PacketSize;
@@ -357,4 +357,165 @@ static void ei_cache_friendly_product(
   free(block);
 }

+#endif // EIGEN_EXTERN_INSTANTIATIONS
+
+/* Optimized col-major matrix * vector product:
+ * This algorithm processes 4 columns at once, which both reduces the number
+ * of loads/stores of the result by a factor of 4 and reduces the
+ * instruction dependency. Moreover, we know that all bands have the
+ * same alignment pattern.
+ * TODO: since rhs gets evaluated only once, no need to evaluate it
+ */
+template<typename Scalar, typename RhsType>
+EIGEN_DONT_INLINE static void ei_cache_friendly_product(
+  int size,
+  const Scalar* lhs, int lhsStride,
+  const RhsType& rhs,
+  Scalar* res)
+{
+  #ifdef _EIGEN_ACCUMULATE_PACKETS
+  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
+  #endif
+
+  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2,OFFSET) \
+    ei_pstore(&res[j OFFSET], \
+      ei_padd(ei_pload(&res[j OFFSET]), \
+        ei_padd( \
+          ei_padd(ei_pmul(ptmp0,ei_pload ## A0(&lhs[j OFFSET +iN0])),ei_pmul(ptmp1,ei_pload ## A13(&lhs[j OFFSET +iN1]))), \
+          ei_padd(ei_pmul(ptmp2,ei_pload ## A2(&lhs[j OFFSET +iN2])),ei_pmul(ptmp3,ei_pload ## A13(&lhs[j OFFSET +iN3]))) )))
+
+  asm("#begin matrix_vector_product");
+  typedef typename ei_packet_traits<Scalar>::type Packet;
+  const int PacketSize = sizeof(Packet)/sizeof(Scalar);
+
+  enum { AllAligned, EvenAligned, FirstAligned, NoneAligned };
+  const int columnsAtOnce = 4;
+  const int peels = 2;
+  const int PacketAlignedMask = PacketSize-1;
+  const int PeelAlignedMask = PacketSize*peels-1;
+  const bool Vectorized = sizeof(Packet) != sizeof(Scalar);
+
+  // How many coeffs of the result do we have to skip to be aligned?
+  // Here we assume the data is at least aligned on the base scalar type, which is mandatory anyway.
+  const int alignedStart = Vectorized
+                         ? std::min<int>( (PacketSize - ((size_t(res)/sizeof(Scalar)) & PacketAlignedMask)) & PacketAlignedMask, size)
+                         : 0;
+  const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask);
+  const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : 0;
+
+  const int alignmentStep = lhsStride % PacketSize;
+  int alignmentPattern = alignmentStep==0 ? AllAligned
+                       : alignmentStep==2 ? EvenAligned
+                       : FirstAligned;
+
+  // find how many columns we have to skip to be aligned with the result (if possible)
+  int skipColumns = 0;
+  for (; skipColumns<PacketSize; ++skipColumns)
+  {
+    if (alignedStart == alignmentStep*skipColumns)
+      break;
+  }
+  if (skipColumns==PacketSize)
+    alignmentPattern = NoneAligned;
+  skipColumns = std::min(skipColumns,rhs.size());
+  if (alignmentPattern!=NoneAligned)
+    for (int i=0; i<skipColumns; i++)
+    {
+      Scalar tmp0 = rhs[i];
+      Packet ptmp0 = ei_pset1(tmp0);
+      int iN0 = i*lhsStride;
+      // process first unaligned result's coeffs
+      for (int j=0; j<alignedStart; j++)
+        res[j] += tmp0 * lhs[j+iN0];
+      // process aligned result's coeffs (we know the lhs columns are not aligned)
+      for (int j = alignedStart; j<alignedSize; j+=PacketSize)
+        ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&lhs[j+iN0])),ei_pload(&res[j])));
+      // process remaining result's coeffs
+      for (int j=alignedSize; j<size; j++)
+        res[j] += tmp0 * lhs[j+iN0];
+    }
+
+  int columnBound = (rhs.size()/columnsAtOnce)*columnsAtOnce;
+  for (int i=0; i<columnBound; i+=columnsAtOnce)
+  {
+    Scalar tmp0 = rhs[i];
+    Packet ptmp0 = ei_pset1(tmp0);
+    Scalar tmp1 = rhs[i+1];
+    Packet ptmp1 = ei_pset1(tmp1);
+    Scalar tmp2 = rhs[i+2];
+    Packet ptmp2 = ei_pset1(tmp2);
+    Scalar tmp3 = rhs[i+3];
+    Packet ptmp3 = ei_pset1(tmp3);
+    int iN0 = i*lhsStride;
+    int iN1 = (i+1)*lhsStride;
+    int iN2 = (i+2)*lhsStride;
+    int iN3 = (i+3)*lhsStride;
+
+    // process initial unaligned coeffs
+    for (int j=0; j<alignedStart; j++)
+      res[j] += tmp0 * lhs[j+iN0] + tmp1 * lhs[j+iN1] + tmp2 * lhs[j+iN2] + tmp3 * lhs[j+iN3];
+
+    if (alignedSize>0)
+    {
+      switch(alignmentPattern)
+      {
+        case AllAligned:
+          for (int j = alignedStart; j<alignedSize; j+=PacketSize)
+            _EIGEN_ACCUMULATE_PACKETS(,,,);
+          break;
+        case EvenAligned:
+          for (int j = alignedStart; j<alignedSize; j+=PacketSize)
+            _EIGEN_ACCUMULATE_PACKETS(,u,,);
+          break;
+        case FirstAligned:
+          if (peels>1)
+            for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize)
+            {
+              _EIGEN_ACCUMULATE_PACKETS(,u,u,);
+              _EIGEN_ACCUMULATE_PACKETS(,u,u,+PacketSize);
+              if (peels>2) _EIGEN_ACCUMULATE_PACKETS(,u,u,+2*PacketSize);
+              if (peels>3) _EIGEN_ACCUMULATE_PACKETS(,u,u,+3*PacketSize);
+              if (peels>4) _EIGEN_ACCUMULATE_PACKETS(,u,u,+4*PacketSize);
+              if (peels>5) _EIGEN_ACCUMULATE_PACKETS(,u,u,+5*PacketSize);
+              if (peels>6) _EIGEN_ACCUMULATE_PACKETS(,u,u,+6*PacketSize);
+              if (peels>7) _EIGEN_ACCUMULATE_PACKETS(,u,u,+7*PacketSize);
+            }
+          for (int j = peeledSize; j<alignedSize; j+=PacketSize)
+            _EIGEN_ACCUMULATE_PACKETS(,u,u,);
+          break;
+        default:
+          for (int j = peeledSize; j<alignedSize; j+=PacketSize)
+            _EIGEN_ACCUMULATE_PACKETS(u,u,u,);
+          break;
+      }
+    }

+    // process remaining coeffs
+    for (int j=alignedSize; j<size; j++)
+      res[j] += tmp0 * lhs[j+iN0] + tmp1 * lhs[j+iN1] + tmp2 * lhs[j+iN2] + tmp3 * lhs[j+iN3];
+  }
+
+  for (int i=columnBound; i<rhs.size(); i++)
+  {
+    Scalar tmp0 = rhs[i];
+    Packet ptmp0 = ei_pset1(tmp0);
+    int iN0 = i*lhsStride;
+    if (alignedSize>0)
+    {
+      bool aligned0 = (iN0 % PacketSize) == 0;
+      if (aligned0)
+        for (int j = 0; j<alignedSize; j+=PacketSize)
+          ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_pload(&lhs[j+iN0])),ei_pload(&res[j])));
+      else
+        for (int j = 0; j<alignedSize; j+=PacketSize)
+          ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&lhs[j+iN0])),ei_pload(&res[j])));
+    }
+    // process remaining scalars
+    for (int j=alignedSize; j<size; j++)
+      res[j] += tmp0 * lhs[j+iN0];
+  }
+  asm("#end matrix_vector_product");
+
+  #undef _EIGEN_ACCUMULATE_PACKETS
+}
+
 #endif // EIGEN_CACHE_FRIENDLY_PRODUCT_H
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 6089c1be5..c2f0c07a8 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -250,8 +250,8 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
       return res;
     }

-    const _LhsNested& lhs() const { return m_lhs; }
-    const _RhsNested& rhs() const { return m_rhs; }
+    inline const _LhsNested& lhs() const { return m_lhs; }
+    inline const _RhsNested& rhs() const { return m_rhs; }

   protected:
     const LhsNested m_lhs;
@@ -480,11 +480,22 @@ struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMod
  * Cache friendly product callers and specific nested evaluation strategies
 ***************************************************************************/

+template<typename Scalar, typename RhsType>
+static void ei_cache_friendly_product(
+  int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, Scalar* res);
+
+enum {
+  HasDirectAccess,
+  NoDirectAccess
+};
+
 template<typename ProductType,
          int LhsRows  = ei_traits<ProductType>::RowsAtCompileTime,
          int LhsOrder = int(ei_traits<ProductType>::LhsFlags)&RowMajorBit ? RowMajor : ColMajor,
+         int LhsHasDirectAccess = int(ei_traits<ProductType>::LhsFlags)&DirectAccessBit ? HasDirectAccess : NoDirectAccess,
          int RhsCols  = ei_traits<ProductType>::ColsAtCompileTime,
-         int RhsOrder = int(ei_traits<ProductType>::RhsFlags)&RowMajorBit ? RowMajor : ColMajor>
+         int RhsOrder = int(ei_traits<ProductType>::RhsFlags)&RowMajorBit ? RowMajor : ColMajor,
+         int RhsHasDirectAccess = int(ei_traits<ProductType>::RhsFlags)&DirectAccessBit ? HasDirectAccess : NoDirectAccess>
 struct ei_cache_friendly_product_selector
 {
   template<typename DestDerived>
@@ -495,21 +506,57 @@ struct ei_cache_friendly_product_selector
 };

 // optimized colmajor * vector path
-template<typename ProductType, int LhsRows, int RhsOrder>
-struct ei_cache_friendly_product_selector<ProductType,LhsRows,ColMajor,1,RhsOrder>
+template<typename ProductType, int LhsRows, int RhsOrder, int RhsAccess>
+struct ei_cache_friendly_product_selector<ProductType,LhsRows,ColMajor,NoDirectAccess,1,RhsOrder,RhsAccess>
 {
+  typedef typename ei_traits<ProductType>::_LhsNested Lhs;
   template<typename DestDerived>
   inline static void run(DestDerived& res, const ProductType& product)
   {
-    const int rows = product.rhs().rows();
-    for (int j=0; j<rows; ++j)
-      res += product.rhs().coeff(j) * product.lhs().col(j);
+    ei_scalar_sum_op<typename ProductType::Scalar> _sum;
+    const int size = product.rhs().rows();
+    for (int k=0; k<size; ++k)
+      if (Lhs::Flags&DirectAccessBit)
+        // TODO to properly handle this workaround let's specialize Block for types having the DirectAccessBit
+        res += product.rhs().coeff(k) * Map<DestDerived>(&product.lhs().const_cast_derived().coeffRef(0,k),product.lhs().rows());
+      else
+        res += product.rhs().coeff(k) * product.lhs().col(k);
   }
 };

+// optimized cache friendly colmajor * vector path for matrix with direct access flag
+// NOTE this path could also be enabled for expressions if we add runtime align queries
+template<typename ProductType, int LhsRows, int RhsOrder, int RhsAccess>
+struct ei_cache_friendly_product_selector<ProductType,LhsRows,ColMajor,HasDirectAccess,1,RhsOrder,RhsAccess>
+{
+  typedef typename ProductType::Scalar Scalar;
+
+  template<typename DestDerived>
+  inline static void run(DestDerived& res, const ProductType& product)
+  {
+    enum {
+      EvalToRes = (ei_packet_traits<Scalar>::size==1)
+                ||(DestDerived::Flags&ActualPacketAccessBit) && (!(DestDerived::Flags & RowMajorBit)) };
+    Scalar* __restrict__ _res;
+    if (EvalToRes)
+      _res = &res.coeffRef(0);
+    else
+    {
+      _res = (Scalar*)alloca(sizeof(Scalar)*res.size());
+      Map<Matrix<Scalar,DestDerived::RowsAtCompileTime,1> >(_res, res.size()) = res;
+    }
+    ei_cache_friendly_product(res.size(),
+      &product.lhs().const_cast_derived().coeffRef(0,0), product.lhs().stride(),
+      product.rhs(), _res);
+
+    if (!EvalToRes)
+      res = Map<Matrix<Scalar,DestDerived::RowsAtCompileTime,1> >(_res, res.size());
+  }
+};
+
 // optimized vector * rowmajor path
-template<typename ProductType, int LhsOrder, int RhsCols>
-struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,RhsCols,RowMajor>
+template<typename ProductType, int LhsOrder, int LhsAccess, int RhsCols>
+struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCols,RowMajor,NoDirectAccess>
 {
   template<typename DestDerived>
   inline static void run(DestDerived& res, const ProductType& product)
@@ -520,6 +567,36 @@ struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,RhsCols,RowMajo
   }
 };

+// optimized cache friendly vector * rowmajor path for matrix with direct access flag
+// NOTE this path could also be enabled for expressions if we add runtime align queries
+template<typename ProductType, int LhsOrder, int LhsAccess, int RhsCols>
+struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCols,RowMajor,HasDirectAccess>
+{
+  typedef typename ProductType::Scalar Scalar;
+
+  template<typename DestDerived>
+  inline static void run(DestDerived& res, const ProductType& product)
+  {
+    enum {
+      EvalToRes = (ei_packet_traits<Scalar>::size==1)
+                ||(DestDerived::Flags & ActualPacketAccessBit) && (DestDerived::Flags & RowMajorBit) };
+    Scalar* __restrict__ _res;
+    if (EvalToRes)
+      _res = &res.coeffRef(0);
+    else
+    {
+      _res = (Scalar*)alloca(sizeof(Scalar)*res.size());
+      Map<Matrix<Scalar,DestDerived::RowsAtCompileTime,1> >(_res, res.size()) = res;
+    }
+    ei_cache_friendly_product(res.size(),
+      &product.rhs().const_cast_derived().coeffRef(0,0), product.rhs().stride(),
+      product.lhs().transpose(), _res);
+
+    if (!EvalToRes)
+      res = Map<Matrix<Scalar,1,DestDerived::ColsAtCompileTime> >(_res, res.size());
+  }
+};
+
 /** \internal */
 template<typename Derived>
 template<typename Lhs,typename Rhs>
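
To make the alignment bookkeeping at the top of the new kernel concrete, here is a standalone sketch (illustrative, not part of the commit; plain C++ without Eigen internals, variable names mirroring the kernel's locals, and a packet of 4 floats, i.e. one SSE register, assumed). It shows how the kernel splits the result range [0,size) into a scalar prologue, a packet loop, and a scalar epilogue:

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    int main()
    {
      const int PacketSize = 4;                 // assumed: 4 floats per packet
      const int PacketAlignedMask = PacketSize - 1;

      float buffer[20];
      float* res = buffer + 1;                  // a deliberately misaligned result
      const int size = 18;

      // number of scalars to skip so that &res[alignedStart] is packet-aligned
      const std::size_t elem = reinterpret_cast<std::size_t>(res) / sizeof(float);
      const int alignedStart = std::min<int>(
        int((PacketSize - int(elem & PacketAlignedMask)) & PacketAlignedMask), size);
      // end of the subrange coverable by whole packets
      const int alignedSize = alignedStart + ((size - alignedStart) & ~PacketAlignedMask);

      std::printf("scalar prologue: [0,%d)\n", alignedStart);
      std::printf("packet loop:     [%d,%d)\n", alignedStart, alignedSize);
      std::printf("scalar epilogue: [%d,%d)\n", alignedSize, size);
      return 0;
    }

Because every lhs column starts exactly lhsStride scalars after the previous one, column i is misaligned relative to the result by (i*alignmentStep) mod PacketSize; this is why a single alignmentPattern (AllAligned, EvenAligned, FirstAligned, or NoneAligned) can be chosen once for all columns, and why the kernel peels off a few leading columns (skipColumns) to bring that pattern in phase with the result's alignment.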