diff options
Diffstat (limited to 'Eigen/src/Core/products/GeneralMatrixVector.h')
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixVector.h | 315 |
1 files changed, 174 insertions, 141 deletions
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 0997746ef..e0d71be7e 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -32,54 +32,71 @@ * same alignment pattern. * TODO: since rhs gets evaluated only once, no need to evaluate it */ -template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index, typename RhsType> -static EIGEN_DONT_INLINE -void ei_cache_friendly_product_colmajor_times_vector( - Index size, - const Scalar* lhs, Index lhsStride, - const RhsType& rhs, - Scalar* res, - Scalar alpha) +template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> +struct ei_general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs> { +typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + +enum { + Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable + && ei_packet_traits<LhsScalar>::size==ei_packet_traits<RhsScalar>::size, + LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1, + RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1, + ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1 +}; + +typedef typename ei_packet_traits<LhsScalar>::type _LhsPacket; +typedef typename ei_packet_traits<RhsScalar>::type _RhsPacket; +typedef typename ei_packet_traits<ResScalar>::type _ResPacket; + +typedef typename ei_meta_if<Vectorizable,_LhsPacket,LhsScalar>::ret LhsPacket; +typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket; +typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket; + +template<typename RhsType> +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsScalar* lhs, Index lhsStride, + const RhsType&/*const RhsScalar**/ rhs, Index rhsIncr, + ResScalar* res, Index resIncr, + ResScalar alpha) +{ + EIGEN_UNUSED_VARIABLE(rhsIncr); + ei_internal_assert(resIncr==1); #ifdef _EIGEN_ACCUMULATE_PACKETS #error _EIGEN_ACCUMULATE_PACKETS has already been defined #endif #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \ ei_pstore(&res[j], \ - ei_padd(ei_pload(&res[j]), \ + ei_padd(ei_pload<ResPacket>(&res[j]), \ ei_padd( \ - ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A0)(&lhs0[j]), ptmp0), \ - pcj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs1[j]), ptmp1)), \ - ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A2)(&lhs2[j]), ptmp2), \ - pcj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs3[j]), ptmp3)) ))) - - typedef typename NumTraits<Scalar>::Real RealScalar; - typedef typename ei_packet_traits<Scalar>::type Packet; - enum { - PacketSize = sizeof(Packet)/sizeof(Scalar), - Vectorizable = ei_packet_traits<Scalar>::Vectorizable - }; - - ei_conj_helper<Scalar,Scalar,ConjugateLhs,ConjugateRhs> cj; - ei_conj_helper<Packet,Packet,ConjugateLhs,ConjugateRhs> pcj; + ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \ + pcj.pmul(EIGEN_CAT(ei_ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \ + ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \ + pcj.pmul(EIGEN_CAT(ei_ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) ))) + + ei_conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; + ei_conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; if(ConjugateRhs) alpha = ei_conj(alpha); enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned }; const Index columnsAtOnce = 4; const Index peels = 2; - const Index PacketAlignedMask = PacketSize-1; - const Index PeelAlignedMask = PacketSize*peels-1; + const Index LhsPacketAlignedMask = LhsPacketSize-1; + const Index ResPacketAlignedMask = ResPacketSize-1; + const Index PeelAlignedMask = ResPacketSize*peels-1; + const Index size = rows; // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type. Index alignedStart = ei_first_aligned(res,size); - Index alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0; + Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0; const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart; - const Index alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0; + const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(PacketSize/2) ? EvenAligned + : alignmentStep==(LhsPacketSize/2) ? EvenAligned : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices @@ -88,19 +105,19 @@ void ei_cache_friendly_product_colmajor_times_vector( // find how many columns do we have to skip to be aligned with the result (if possible) Index skipColumns = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (size_t(lhs)%sizeof(Scalar)) || (size_t(res)%sizeof(Scalar)) ) + if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) ) { alignedSize = 0; alignedStart = 0; } - else if (PacketSize>1) + else if (LhsPacketSize>1) { - ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size<PacketSize); + ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize); - while (skipColumns<PacketSize && - alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%PacketSize)) + while (skipColumns<LhsPacketSize && + alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize)) ++skipColumns; - if (skipColumns==PacketSize) + if (skipColumns==LhsPacketSize) { // nothing can be aligned, no need to skip any column alignmentPattern = NoneAligned; @@ -108,14 +125,14 @@ void ei_cache_friendly_product_colmajor_times_vector( } else { - skipColumns = std::min(skipColumns,rhs.size()); + skipColumns = std::min(skipColumns,cols); // note that the skiped columns are processed later. } ei_internal_assert( (alignmentPattern==NoneAligned) - || (skipColumns + columnsAtOnce >= rhs.size()) - || PacketSize > size - || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0); + || (skipColumns + columnsAtOnce >= cols) + || LhsPacketSize > size + || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0); } else if(Vectorizable) { @@ -127,15 +144,15 @@ void ei_cache_friendly_product_colmajor_times_vector( Index offset1 = (FirstAligned && alignmentStep==1?3:1); Index offset3 = (FirstAligned && alignmentStep==1?1:3); - Index columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; + Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce) { - Packet ptmp0 = ei_pset1(alpha*rhs[i]), ptmp1 = ei_pset1(alpha*rhs[i+offset1]), - ptmp2 = ei_pset1(alpha*rhs[i+2]), ptmp3 = ei_pset1(alpha*rhs[i+offset3]); + RhsPacket ptmp0 = ei_pset1<RhsPacket>(alpha*rhs[i]), ptmp1 = ei_pset1<RhsPacket>(alpha*rhs[i+offset1]), + ptmp2 = ei_pset1<RhsPacket>(alpha*rhs[i+2]), ptmp3 = ei_pset1<RhsPacket>(alpha*rhs[i+offset3]); // this helps a lot generating better binary code - const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, + *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; if (Vectorizable) { @@ -154,51 +171,51 @@ void ei_cache_friendly_product_colmajor_times_vector( switch(alignmentPattern) { case AllAligned: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,d,d); break; case EvenAligned: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,du,d); break; case FirstAligned: if(peels>1) { - Packet A00, A01, A02, A03, A10, A11, A12, A13; + LhsPacket A00, A01, A02, A03, A10, A11, A12, A13; - A01 = ei_pload(&lhs1[alignedStart-1]); - A02 = ei_pload(&lhs2[alignedStart-2]); - A03 = ei_pload(&lhs3[alignedStart-3]); + A01 = ei_pload<LhsPacket>(&lhs1[alignedStart-1]); + A02 = ei_pload<LhsPacket>(&lhs2[alignedStart-2]); + A03 = ei_pload<LhsPacket>(&lhs3[alignedStart-3]); - for (Index j = alignedStart; j<peeledSize; j+=peels*PacketSize) + for (Index j = alignedStart; j<peeledSize; j+=peels*ResPacketSize) { - A11 = ei_pload(&lhs1[j-1+PacketSize]); ei_palign<1>(A01,A11); - A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12); - A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13); + A11 = ei_pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); ei_palign<1>(A01,A11); + A12 = ei_pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); ei_palign<2>(A02,A12); + A13 = ei_pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); ei_palign<3>(A03,A13); - A00 = ei_pload (&lhs0[j]); - A10 = ei_pload (&lhs0[j+PacketSize]); - A00 = pcj.pmadd(A00, ptmp0, ei_pload(&res[j])); - A10 = pcj.pmadd(A10, ptmp0, ei_pload(&res[j+PacketSize])); + A00 = ei_pload<LhsPacket>(&lhs0[j]); + A10 = ei_pload<LhsPacket>(&lhs0[j+LhsPacketSize]); + A00 = pcj.pmadd(A00, ptmp0, ei_pload<ResPacket>(&res[j])); + A10 = pcj.pmadd(A10, ptmp0, ei_pload<ResPacket>(&res[j+ResPacketSize])); A00 = pcj.pmadd(A01, ptmp1, A00); - A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01); + A01 = ei_pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); ei_palign<1>(A11,A01); A00 = pcj.pmadd(A02, ptmp2, A00); - A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02); + A02 = ei_pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); ei_palign<2>(A12,A02); A00 = pcj.pmadd(A03, ptmp3, A00); ei_pstore(&res[j],A00); - A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03); + A03 = ei_pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); ei_palign<3>(A13,A03); A10 = pcj.pmadd(A11, ptmp1, A10); A10 = pcj.pmadd(A12, ptmp2, A10); A10 = pcj.pmadd(A13, ptmp3, A10); - ei_pstore(&res[j+PacketSize],A10); + ei_pstore(&res[j+ResPacketSize],A10); } } - for (Index j = peeledSize; j<alignedSize; j+=PacketSize) + for (Index j = peeledSize; j<alignedSize; j+=ResPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,du,du); break; default: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) _EIGEN_ACCUMULATE_PACKETS(du,du,du); break; } @@ -216,14 +233,14 @@ void ei_cache_friendly_product_colmajor_times_vector( } // process remaining first and last columns (at most columnsAtOnce-1) - Index end = rhs.size(); + Index end = cols; Index start = columnBound; do { for (Index i=start; i<end; ++i) { - Packet ptmp0 = ei_pset1(alpha*rhs[i]); - const Scalar* lhs0 = lhs + i*lhsStride; + RhsPacket ptmp0 = ei_pset1<RhsPacket>(alpha*rhs[i]); + const LhsScalar* lhs0 = lhs + i*lhsStride; if (Vectorizable) { @@ -233,12 +250,12 @@ void ei_cache_friendly_product_colmajor_times_vector( res[j] += cj.pmul(lhs0[j], ei_pfirst(ptmp0)); // process aligned result's coeffs - if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0) - for (Index j = alignedStart;j<alignedSize;j+=PacketSize) - ei_pstore(&res[j], pcj.pmadd(ei_pload(&lhs0[j]), ptmp0, ei_pload(&res[j]))); + if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + for (Index j = alignedStart;j<alignedSize;j+=ResPacketSize) + ei_pstore(&res[j], pcj.pmadd(ei_pload<LhsPacket>(&lhs0[j]), ptmp0, ei_pload<ResPacket>(&res[j]))); else - for (Index j = alignedStart;j<alignedSize;j+=PacketSize) - ei_pstore(&res[j], pcj.pmadd(ei_ploadu(&lhs0[j]), ptmp0, ei_pload(&res[j]))); + for (Index j = alignedStart;j<alignedSize;j+=ResPacketSize) + ei_pstore(&res[j], pcj.pmadd(ei_ploadu<LhsPacket>(&lhs0[j]), ptmp0, ei_pload<ResPacket>(&res[j]))); } // process remaining scalars (or all if no explicit vectorization) @@ -256,15 +273,35 @@ void ei_cache_friendly_product_colmajor_times_vector( } while(Vectorizable); #undef _EIGEN_ACCUMULATE_PACKETS } +}; -// TODO add peeling to mask unaligned load/stores -template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index> -static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( +template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> +struct ei_general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs> +{ +typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + +enum { + Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable + && int(ei_packet_traits<LhsScalar>::size)==int(ei_packet_traits<RhsScalar>::size), + LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1, + RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1, + ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1 +}; + +typedef typename ei_packet_traits<LhsScalar>::type _LhsPacket; +typedef typename ei_packet_traits<RhsScalar>::type _RhsPacket; +typedef typename ei_packet_traits<ResScalar>::type _ResPacket; + +typedef typename ei_meta_if<Vectorizable,_LhsPacket,LhsScalar>::ret LhsPacket; +typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket; +typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket; + +EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const Scalar* lhs, Index lhsStride, - const Scalar* rhs, Index rhsIncr, - Scalar* res, Index resIncr, - Scalar alpha) + const LhsScalar* lhs, Index lhsStride, + const RhsScalar* rhs, Index rhsIncr, + ResScalar* res, Index resIncr, + ResScalar alpha) { ei_internal_assert(rhsIncr==1); #ifdef _EIGEN_ACCUMULATE_PACKETS @@ -272,39 +309,33 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( #endif #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\ - Packet b = ei_pload(&rhs[j]); \ - ptmp0 = pcj.pmadd(EIGEN_CAT(ei_ploa,A0) (&lhs0[j]), b, ptmp0); \ - ptmp1 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs1[j]), b, ptmp1); \ - ptmp2 = pcj.pmadd(EIGEN_CAT(ei_ploa,A2) (&lhs2[j]), b, ptmp2); \ - ptmp3 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs3[j]), b, ptmp3); } - - typedef typename NumTraits<Scalar>::Real RealScalar; - typedef typename ei_packet_traits<Scalar>::type Packet; - enum { - PacketSize = sizeof(Packet)/sizeof(Scalar), - Vectorizable = ei_packet_traits<Scalar>::Vectorizable - }; - - ei_conj_helper<Scalar,Scalar,ConjugateLhs,ConjugateRhs> cj; - ei_conj_helper<Packet,Packet,ConjugateLhs,ConjugateRhs> pcj; + RhsPacket b = ei_pload<RhsPacket>(&rhs[j]); \ + ptmp0 = pcj.pmadd(EIGEN_CAT(ei_ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \ + ptmp1 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \ + ptmp2 = pcj.pmadd(EIGEN_CAT(ei_ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \ + ptmp3 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); } + + ei_conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; + ei_conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 }; const Index rowsAtOnce = 4; const Index peels = 2; - const Index PacketAlignedMask = PacketSize-1; - const Index PeelAlignedMask = PacketSize*peels-1; + const Index RhsPacketAlignedMask = RhsPacketSize-1; + const Index LhsPacketAlignedMask = LhsPacketSize-1; + const Index PeelAlignedMask = RhsPacketSize*peels-1; const Index depth = cols; // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type // if that's not the case then vectorization is discarded, see below. Index alignedStart = ei_first_aligned(rhs, depth); - Index alignedSize = PacketSize>1 ? alignedStart + ((depth-alignedStart) & ~PacketAlignedMask) : 0; + Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart; - const Index alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0; + const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(PacketSize/2) ? EvenAligned + : alignmentStep==(LhsPacketSize/2) ? EvenAligned : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices @@ -313,19 +344,19 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( // find how many rows do we have to skip to be aligned with rhs (if possible) Index skipRows = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (size_t(lhs)%sizeof(Scalar)) || (size_t(rhs)%sizeof(Scalar)) ) + if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) ) { alignedSize = 0; alignedStart = 0; } - else if (PacketSize>1) + else if (LhsPacketSize>1) { - ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || depth<PacketSize); + ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize); - while (skipRows<PacketSize && - alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%PacketSize)) + while (skipRows<LhsPacketSize && + alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize)) ++skipRows; - if (skipRows==PacketSize) + if (skipRows==LhsPacketSize) { // nothing can be aligned, no need to skip any column alignmentPattern = NoneAligned; @@ -337,10 +368,10 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( // note that the skiped columns are processed later. } ei_internal_assert( alignmentPattern==NoneAligned - || PacketSize==1 + || LhsPacketSize==1 || (skipRows + rowsAtOnce >= rows) - || PacketSize > depth - || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0); + || LhsPacketSize > depth + || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0); } else if(Vectorizable) { @@ -355,23 +386,24 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; for (Index i=skipRows; i<rowBound; i+=rowsAtOnce) { - EIGEN_ALIGN16 Scalar tmp0 = Scalar(0); - Scalar tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0); + EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0); + ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0); // this helps the compiler generating good binary code - const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, + *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; if (Vectorizable) { /* explicit vectorization */ - Packet ptmp0 = ei_pset1(Scalar(0)), ptmp1 = ei_pset1(Scalar(0)), ptmp2 = ei_pset1(Scalar(0)), ptmp3 = ei_pset1(Scalar(0)); + ResPacket ptmp0 = ei_pset1<ResPacket>(ResScalar(0)), ptmp1 = ei_pset1<ResPacket>(ResScalar(0)), + ptmp2 = ei_pset1<ResPacket>(ResScalar(0)), ptmp3 = ei_pset1<ResPacket>(ResScalar(0)); // process initial unaligned coeffs // FIXME this loop get vectorized by the compiler ! for (Index j=0; j<alignedStart; ++j) { - Scalar b = rhs[j]; + RhsScalar b = rhs[j]; tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b); } @@ -381,11 +413,11 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( switch(alignmentPattern) { case AllAligned: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,d,d); break; case EvenAligned: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,du,d); break; case FirstAligned: @@ -397,38 +429,38 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( * overlaping the desired unaligned packet. This is *much* more efficient * than basic unaligned loads. */ - Packet A01, A02, A03, b, A11, A12, A13; - A01 = ei_pload(&lhs1[alignedStart-1]); - A02 = ei_pload(&lhs2[alignedStart-2]); - A03 = ei_pload(&lhs3[alignedStart-3]); + LhsPacket A01, A02, A03, A11, A12, A13; + A01 = ei_pload<LhsPacket>(&lhs1[alignedStart-1]); + A02 = ei_pload<LhsPacket>(&lhs2[alignedStart-2]); + A03 = ei_pload<LhsPacket>(&lhs3[alignedStart-3]); - for (Index j = alignedStart; j<peeledSize; j+=peels*PacketSize) + for (Index j = alignedStart; j<peeledSize; j+=peels*RhsPacketSize) { - b = ei_pload(&rhs[j]); - A11 = ei_pload(&lhs1[j-1+PacketSize]); ei_palign<1>(A01,A11); - A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12); - A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13); + RhsPacket b = ei_pload<RhsPacket>(&rhs[j]); + A11 = ei_pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); ei_palign<1>(A01,A11); + A12 = ei_pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); ei_palign<2>(A02,A12); + A13 = ei_pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); ei_palign<3>(A03,A13); - ptmp0 = pcj.pmadd(ei_pload (&lhs0[j]), b, ptmp0); + ptmp0 = pcj.pmadd(ei_pload<LhsPacket>(&lhs0[j]), b, ptmp0); ptmp1 = pcj.pmadd(A01, b, ptmp1); - A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01); + A01 = ei_pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); ei_palign<1>(A11,A01); ptmp2 = pcj.pmadd(A02, b, ptmp2); - A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02); + A02 = ei_pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); ei_palign<2>(A12,A02); ptmp3 = pcj.pmadd(A03, b, ptmp3); - A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03); + A03 = ei_pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); ei_palign<3>(A13,A03); - b = ei_pload(&rhs[j+PacketSize]); - ptmp0 = pcj.pmadd(ei_pload (&lhs0[j+PacketSize]), b, ptmp0); + b = ei_pload<RhsPacket>(&rhs[j+RhsPacketSize]); + ptmp0 = pcj.pmadd(ei_pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0); ptmp1 = pcj.pmadd(A11, b, ptmp1); ptmp2 = pcj.pmadd(A12, b, ptmp2); ptmp3 = pcj.pmadd(A13, b, ptmp3); } } - for (Index j = peeledSize; j<alignedSize; j+=PacketSize) + for (Index j = peeledSize; j<alignedSize; j+=RhsPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,du,du); break; default: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) _EIGEN_ACCUMULATE_PACKETS(du,du,du); break; } @@ -443,7 +475,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( // FIXME this loop get vectorized by the compiler ! for (Index j=alignedSize; j<depth; ++j) { - Scalar b = rhs[j]; + RhsScalar b = rhs[j]; tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b); } @@ -460,9 +492,9 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( { for (Index i=start; i<end; ++i) { - EIGEN_ALIGN16 Scalar tmp0 = Scalar(0); - Packet ptmp0 = ei_pset1(tmp0); - const Scalar* lhs0 = lhs + i*lhsStride; + EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0); + ResPacket ptmp0 = ei_pset1<ResPacket>(tmp0); + const LhsScalar* lhs0 = lhs + i*lhsStride; // process first unaligned result's coeffs // FIXME this loop get vectorized by the compiler ! for (Index j=0; j<alignedStart; ++j) @@ -471,12 +503,12 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( if (alignedSize>alignedStart) { // process aligned rhs coeffs - if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0) - for (Index j = alignedStart;j<alignedSize;j+=PacketSize) - ptmp0 = pcj.pmadd(ei_pload(&lhs0[j]), ei_pload(&rhs[j]), ptmp0); + if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize) + ptmp0 = pcj.pmadd(ei_pload<LhsPacket>(&lhs0[j]), ei_pload<RhsPacket>(&rhs[j]), ptmp0); else - for (Index j = alignedStart;j<alignedSize;j+=PacketSize) - ptmp0 = pcj.pmadd(ei_ploadu(&lhs0[j]), ei_pload(&rhs[j]), ptmp0); + for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize) + ptmp0 = pcj.pmadd(ei_ploadu<LhsPacket>(&lhs0[j]), ei_pload<RhsPacket>(&rhs[j]), ptmp0); tmp0 += ei_predux(ptmp0); } @@ -498,5 +530,6 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( #undef _EIGEN_ACCUMULATE_PACKETS } +}; #endif // EIGEN_GENERAL_MATRIX_VECTOR_H |