// This file is part of Eigen, a lightweight C++ template library
// for linear algebra. Eigen itself is part of the KDE project.
//
// Copyright (C) 2008 Gael Guennebaud
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.

#ifndef EIGEN_CACHE_FRIENDLY_PRODUCT_H
#define EIGEN_CACHE_FRIENDLY_PRODUCT_H

#ifndef EIGEN_EXTERN_INSTANTIATIONS

template<typename Scalar>
static void ei_cache_friendly_product(
  int _rows, int _cols, int depth,
  bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride,
  bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride,
  bool resRowMajor, Scalar* res, int resStride)
{
  const Scalar* __restrict__ lhs;
  const Scalar* __restrict__ rhs;
  int lhsStride, rhsStride, rows, cols;
  bool lhsRowMajor;

  // If the result is row-major, compute the transposed product instead by
  // swapping the roles of lhs and rhs, so that the kernel below always
  // writes a column-major result.
  if (resRowMajor)
  {
    lhs = _rhs;
    rhs = _lhs;
    lhsStride = _rhsStride;
    rhsStride = _lhsStride;
    cols = _rows;
    rows = _cols;
    lhsRowMajor = !_rhsRowMajor;
    ei_assert(_lhsRowMajor);
  }
  else
  {
    lhs = _lhs;
    rhs = _rhs;
    lhsStride = _lhsStride;
    rhsStride = _rhsStride;
    rows = _rows;
    cols = _cols;
    lhsRowMajor = _lhsRowMajor;
    ei_assert(!_rhsRowMajor);
  }

  typedef typename ei_packet_traits<Scalar>::type PacketType;

  enum {
    PacketSize = sizeof(PacketType)/sizeof(Scalar),
#if (defined __i386__)
    // the i386 architecture provides only 8 xmm registers,
    // so let's reduce the max number of rows processed at once.
    MaxBlockRows = 4,
    MaxBlockRows_ClampingMask = 0xFFFFFC,
#else
    MaxBlockRows = 8,
    MaxBlockRows_ClampingMask = 0xFFFFF8,
#endif
    // maximal size of the blocks fitting in the L2 cache
    MaxL2BlockSize = EIGEN_TUNE_FOR_L2_CACHE_SIZE / sizeof(Scalar)
  };

  const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0));

  const int remainingSize = depth % PacketSize;
  const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries
  const int l2BlockRows = MaxL2BlockSize > rows ? rows : MaxL2BlockSize;
  const int l2BlockCols = MaxL2BlockSize > cols ? cols : MaxL2BlockSize;
  const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize;
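  // Worked example (illustrative values only, not from the original source): with
  // Scalar==float and 16-byte SSE packets (PacketSize==4), and assuming
  // EIGEN_TUNE_FOR_L2_CACHE_SIZE were 4096, MaxL2BlockSize would be 4096/4 == 1024.
  // For a 5000 x 5000 product of depth 5000: remainingSize == 5000 % 4 == 0,
  // size == 5000, and each of l2BlockRows, l2BlockCols and l2BlockSize is
  // clamped to min(dimension, 1024) == 1024, i.e. the product is swept in
  // 1024-wide panels sized to fit the assumed L2 budget.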
  const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize;
  const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0));

  // use the heap for large working blocks, alloca for small ones
  Scalar* __restrict__ block = 0;
  const int allocBlockSize = sizeof(Scalar)*l2BlockRows*size;
  if (allocBlockSize>16000000)
    block = (Scalar*)malloc(allocBlockSize);
  else
    block = (Scalar*)alloca(allocBlockSize);
  Scalar* __restrict__ rhsCopy
    = (Scalar*)alloca(sizeof(Scalar)*l2BlockSizeAligned*l2BlockSizeAligned);

  // loop over each L2-cache-friendly block of the result
  for(int l2i=0; l2i0)
  {
    for (int k=l2k; k1 && resIsAligned)
    {
      // the result is aligned: let's do packet reduction
      ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(&dst[0])));
      if (PacketSize==2)
        ei_pstore(&(localRes[2]), ei_padd(ei_pload(&(localRes[2])), ei_preduxp(&(dst[2]))));
      if (MaxBlockRows==8)
      {
        ei_pstore(&(localRes[4]), ei_padd(ei_pload(&(localRes[4])), ei_preduxp(&(dst[4]))));
        if (PacketSize==2)
          ei_pstore(&(localRes[6]), ei_padd(ei_pload(&(localRes[6])), ei_preduxp(&(dst[6]))));
      }
    }
    else
    {
      // not aligned => per coeff packet reduction
      localRes[0] += ei_predux(dst[0]);
      localRes[1] += ei_predux(dst[1]);
      localRes[2] += ei_predux(dst[2]);
      localRes[3] += ei_predux(dst[3]);
      if (MaxBlockRows==8)
      {
        localRes[4] += ei_predux(dst[4]);
        localRes[5] += ei_predux(dst[5]);
        localRes[6] += ei_predux(dst[6]);
        localRes[7] += ei_predux(dst[7]);
      }
    }
    asm("#eigen endcore");
  }
  }

  if (l2blockRemainingRows>0)
  {
    int offsetblock = l2k * (l2blockRowEnd-l2i) + (l2blockRowEndBW-l2i)*(l2blockSizeEnd-l2k) - l2k*l2blockRemainingRows;
    const Scalar* localB = &block[offsetblock];

    asm("#eigen begin dynkernel");
    for(int l1j=l2j; l1j=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+  PacketSize])), dst[1]);
      if (l2blockRemainingRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+2*PacketSize])), dst[2]);
      if (l2blockRemainingRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+3*PacketSize])), dst[3]);
      if (MaxBlockRows==8)
      {
        if (l2blockRemainingRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+4*PacketSize])), dst[4]);
        if (l2blockRemainingRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+5*PacketSize])), dst[5]);
        if (l2blockRemainingRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+6*PacketSize])), dst[6]);
        if (l2blockRemainingRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+7*PacketSize])), dst[7]);
      }
    }

      Scalar* __restrict__ localRes = &(res[l2blockRowEndBW + l1j*resStride]);

      // process the remaining rows one at a time
      localRes[0] += ei_predux(dst[0]);
      if (l2blockRemainingRows>=2) localRes[1] += ei_predux(dst[1]);
      if (l2blockRemainingRows>=3) localRes[2] += ei_predux(dst[2]);
      if (l2blockRemainingRows>=4) localRes[3] += ei_predux(dst[3]);
      if (MaxBlockRows==8)
      {
        if (l2blockRemainingRows>=5) localRes[4] += ei_predux(dst[4]);
        if (l2blockRemainingRows>=6) localRes[5] += ei_predux(dst[5]);
        if (l2blockRemainingRows>=7) localRes[6] += ei_predux(dst[6]);
        if (l2blockRemainingRows>=8) localRes[7] += ei_predux(dst[7]);
      }
      asm("#eigen end dynkernel");
    }
  }
  }
  }
  }

  // handle the remaining part of the third dimension that does not fill a whole packet
  if (PacketSize>1 && remainingSize)
  {
    if (lhsRowMajor)
    {
      for (int j=0; j16000000) free(block);
}

#endif // EIGEN_EXTERN_INSTANTIATIONS
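// Illustration only, not part of Eigen: a plain scalar reference of the contract
// assumed for ei_cache_friendly_product above, namely that it *accumulates*
// res += lhs * rhs for the given storage orders and strides (the helper name
// below is hypothetical). It makes the role of the resRowMajor swap explicit by
// handling every storage-order combination directly.
template<typename Scalar>
static void ei_cache_friendly_product_reference_sketch(
  int rows, int cols, int depth,
  bool lhsRowMajor, const Scalar* lhs, int lhsStride,
  bool rhsRowMajor, const Scalar* rhs, int rhsStride,
  bool resRowMajor, Scalar* res, int resStride)
{
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < cols; ++j)
    {
      Scalar sum = Scalar(0);
      for (int k = 0; k < depth; ++k)
      {
        // element (i,k) of lhs and (k,j) of rhs, for either storage order
        Scalar a = lhsRowMajor ? lhs[i*lhsStride + k] : lhs[i + k*lhsStride];
        Scalar b = rhsRowMajor ? rhs[k*rhsStride + j] : rhs[k + j*rhsStride];
        sum += a * b;
      }
      if (resRowMajor) res[i*resStride + j] += sum;
      else             res[i + j*resStride] += sum;
    }
}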
/* Optimized col-major matrix * vector product:
 * This algorithm processes 4 columns at once, which allows us both to reduce
 * the number of loads/stores of the result by a factor of 4 and to reduce
 * instruction dependencies. Moreover, we know that all bands have the
 * same alignment pattern.
 * (A plain scalar sketch of this 4-columns-at-once scheme follows the function below.)
 * TODO: since rhs gets evaluated only once, no need to evaluate it
 */
template<typename Scalar, typename RhsType>
EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
  int size,
  const Scalar* lhs, int lhsStride,
  const RhsType& rhs,
  Scalar* res)
{
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif
#define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2,OFFSET) \
  ei_pstore(&res[j OFFSET], \
    ei_padd(ei_pload(&res[j OFFSET]), \
      ei_padd( \
        ei_padd(ei_pmul(ptmp0,ei_pload ## A0(&lhs0[j OFFSET])),ei_pmul(ptmp1,ei_pload ## A13(&lhs1[j OFFSET]))), \
        ei_padd(ei_pmul(ptmp2,ei_pload ## A2(&lhs2[j OFFSET])),ei_pmul(ptmp3,ei_pload ## A13(&lhs3[j OFFSET]))) )))

  asm("#begin matrix_vector_product");
  typedef typename ei_packet_traits<Scalar>::type Packet;
  const int PacketSize = sizeof(Packet)/sizeof(Scalar);

  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
  const int columnsAtOnce = 4;
  const int peels = 2;
  const int PacketAlignedMask = PacketSize-1;
  const int PeelAlignedMask = PacketSize*peels-1;

  // How many coeffs of the result do we have to skip to be aligned?
  // Here we assume data are at least aligned on the base scalar type, which is mandatory anyway.
  const int alignedStart = ei_alignmentOffset(res,size);
  const int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
  const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;

  const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
  int alignmentPattern = alignmentStep==0 ? AllAligned
                       : alignmentStep==(PacketSize/2) ? EvenAligned
                       : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size);

  // find out how many columns we have to skip to be aligned with the result (if possible)
  int skipColumns = 0;
  if (PacketSize>1)
  {
    ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size1)
  {
    /* explicit vectorization */
    // process initial unaligned coeffs
    for (int j=0; jalignedStart)
    {
      switch(alignmentPattern)
      {
        case AllAligned:
          for (int j = alignedStart; j1)
          {
            Packet A00, A01, A02, A03, A10, A11, A12, A13;
            A01 = ei_pload(&lhs1[alignedStart-1]);
            A02 = ei_pload(&lhs2[alignedStart-2]);
            A03 = ei_pload(&lhs3[alignedStart-3]);
            for (int j = alignedStart; j(A01,A11);
              A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12);
              A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13);

              A00 = ei_pload (&lhs0[j]);
              A10 = ei_pload (&lhs0[j+PacketSize]);
              A00 = ei_pmadd(ptmp0, A00, ei_pload(&res[j]));
              A10 = ei_pmadd(ptmp0, A10, ei_pload(&res[j+PacketSize]));

              A00 = ei_pmadd(ptmp1, A01, A00);
              A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01);
              A00 = ei_pmadd(ptmp2, A02, A00);
              A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02);
              A00 = ei_pmadd(ptmp3, A03, A00);
              ei_pstore(&res[j],A00);
              A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03);

              A10 = ei_pmadd(ptmp1, A11, A10);
              A10 = ei_pmadd(ptmp2, A12, A10);
              A10 = ei_pmadd(ptmp3, A13, A10);
              ei_pstore(&res[j+PacketSize],A10);
            }
          }
          for (int j = peeledSize; j1)
  {
    /* explicit vectorization */
    // process first unaligned result's coeffs
    for (int j=0; j1);
  asm("#end matrix_vector_product");
  #undef _EIGEN_ACCUMULATE_PACKETS
}
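// Illustration only, not part of Eigen: a plain scalar sketch of the
// "4 columns at once" strategy used by the routine above, under the assumption
// that the kernel accumulates res += lhs * rhs for a col-major lhs with `size`
// rows, `n` columns and leading dimension `lhsStride`, and that `rhs` is
// indexable with operator[]. The helper name and the explicit `n` parameter are
// hypothetical. Walking four columns together loads and stores each res[j] once
// per group of four columns instead of once per column, and the four
// independent multiply-adds help hide instruction latency.
template<typename Scalar, typename RhsType>
static void ei_colmajor_times_vector_scalar_sketch(
  int size, int n, const Scalar* lhs, int lhsStride, const RhsType& rhs, Scalar* res)
{
  int i = 0;
  for (; i < n - 3; i += 4) // 4 columns at once
  {
    const Scalar* c0 = lhs + (i+0)*lhsStride;
    const Scalar* c1 = lhs + (i+1)*lhsStride;
    const Scalar* c2 = lhs + (i+2)*lhsStride;
    const Scalar* c3 = lhs + (i+3)*lhsStride;
    const Scalar x0 = rhs[i+0], x1 = rhs[i+1], x2 = rhs[i+2], x3 = rhs[i+3];
    for (int j = 0; j < size; ++j)
      res[j] += x0*c0[j] + x1*c1[j] + x2*c2[j] + x3*c3[j]; // one load/store of res[j] for 4 columns
  }
  for (; i < n; ++i) // leftover columns, one at a time
  {
    const Scalar* c = lhs + i*lhsStride;
    const Scalar x = rhs[i];
    for (int j = 0; j < size; ++j)
      res[j] += x*c[j];
  }
}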
// TODO add peeling to mask unaligned load/stores
template<typename Scalar, typename ResType>
EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
  const Scalar* lhs, int lhsStride,
  const Scalar* rhs, int rhsSize,
  ResType& res)
{
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif
#define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2,OFFSET) {\
  Packet b = ei_pload(&rhs[j]); \
  ptmp0 = ei_pmadd(b, ei_pload##A0 (&lhs0[j]), ptmp0); \
  ptmp1 = ei_pmadd(b, ei_pload##A13(&lhs1[j]), ptmp1); \
  ptmp2 = ei_pmadd(b, ei_pload##A2 (&lhs2[j]), ptmp2); \
  ptmp3 = ei_pmadd(b, ei_pload##A13(&lhs3[j]), ptmp3); }

  asm("#begin matrix_vector_product");
  typedef typename ei_packet_traits<Scalar>::type Packet;
  const int PacketSize = sizeof(Packet)/sizeof(Scalar);

  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
  const int rowsAtOnce = 4;
  const int peels = 2;
  const int PacketAlignedMask = PacketSize-1;
  const int PeelAlignedMask = PacketSize*peels-1;
  const int size = rhsSize;

  // How many coeffs of the result do we have to skip to be aligned?
  // Here we assume data are at least aligned on the base scalar type, which is mandatory anyway.
  const int alignedStart = ei_alignmentOffset(rhs, size);
  const int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
  const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;

  const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
  int alignmentPattern = alignmentStep==0 ? AllAligned
                       : alignmentStep==(PacketSize/2) ? EvenAligned
                       : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size);

  // find out how many rows we have to skip to be aligned with rhs (if possible)
  int skipRows = 0;
  if (PacketSize>1)
  {
    ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size1)
  {
    /* explicit vectorization */
    Packet ptmp0 = ei_pset1(Scalar(0)), ptmp1 = ei_pset1(Scalar(0)),
           ptmp2 = ei_pset1(Scalar(0)), ptmp3 = ei_pset1(Scalar(0));

    // process initial unaligned coeffs
    // FIXME this loop gets vectorized by the compiler!
    for (int j=0; jalignedStart)
    {
      switch(alignmentPattern)
      {
        case AllAligned:
          for (int j = alignedStart; j1)
          {
            /* Here we process 4 rows with two peeled iterations to hide
             * the overhead of unaligned loads. Moreover, unaligned loads are
             * handled using special shift/move operations between the two
             * aligned packets overlapping the desired unaligned packet.
             * This is *much* more efficient than basic unaligned loads.
             */
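            // Illustration (assumed semantics, not from the original source):
            // with PacketSize==4, suppose the aligned packets A = lhs1[j-1..j+2]
            // and B = lhs1[j+3..j+6] have been loaded with ei_pload. Then
            // ei_palign<1>(A,B) is used below so that A ends up holding
            // lhs1[j..j+3], i.e. the unaligned packet is rebuilt by
            // shifting/merging two aligned loads instead of issuing an
            // unaligned load.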
            Packet A01, A02, A03, b, A11, A12, A13;
            A01 = ei_pload(&lhs1[alignedStart-1]);
            A02 = ei_pload(&lhs2[alignedStart-2]);
            A03 = ei_pload(&lhs3[alignedStart-3]);

            for (int j = alignedStart; j(A01,A11);
              A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12);
              A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13);

              ptmp0 = ei_pmadd(b, ei_pload (&lhs0[j]), ptmp0);
              ptmp1 = ei_pmadd(b, A01, ptmp1);
              A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01);
              ptmp2 = ei_pmadd(b, A02, ptmp2);
              A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02);
              ptmp3 = ei_pmadd(b, A03, ptmp3);
              A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03);

              b = ei_pload(&rhs[j+PacketSize]);
              ptmp0 = ei_pmadd(b, ei_pload (&lhs0[j+PacketSize]), ptmp0);
              ptmp1 = ei_pmadd(b, A11, ptmp1);
              ptmp2 = ei_pmadd(b, A12, ptmp2);
              ptmp3 = ei_pmadd(b, A13, ptmp3);
            }
          }
          for (int j = peeledSize; jalignedStart)
    {
      // process aligned rhs coeffs
      if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0)
        for (int j = alignedStart;j1);
  asm("#end matrix_vector_product");
  #undef _EIGEN_ACCUMULATE_PACKETS
}

#endif // EIGEN_CACHE_FRIENDLY_PRODUCT_H