// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.

#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H

/* Optimized col-major matrix * vector product:
 * This algorithm processes 4 columns at once, which both reduces the number
 * of loads/stores of the result by a factor of 4 and reduces instruction
 * dependencies. Moreover, we know that all bands have the same alignment
 * pattern. (A scalar reference sketch of the computed operation follows the
 * function below.)
 * TODO: since rhs gets evaluated only once, no need to evaluate it
 */
template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename RhsType>
static EIGEN_DONT_INLINE void ei_cache_friendly_product_colmajor_times_vector(
  int size,
  const Scalar* lhs, int lhsStride,
  const RhsType& rhs,
  Scalar* res,
  Scalar alpha)
{
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif
  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
    ei_pstore(&res[j], \
      ei_padd(ei_pload(&res[j]), \
        ei_padd( \
          ei_padd(cj.pmul(EIGEN_CAT(ei_ploa , A0)(&lhs0[j]),  ptmp0), \
                  cj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs1[j]), ptmp1)), \
          ei_padd(cj.pmul(EIGEN_CAT(ei_ploa , A2)(&lhs2[j]),  ptmp2), \
                  cj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs3[j]), ptmp3)) )))

  ei_conj_helper<ConjugateLhs,ConjugateRhs> cj;
  if(ConjugateRhs)
    alpha = ei_conj(alpha);

  typedef typename NumTraits<Scalar>::Real RealScalar;
  typedef typename ei_packet_traits<Scalar>::type Packet;
  const int PacketSize = sizeof(Packet)/sizeof(Scalar);

  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
  const int columnsAtOnce = 4;
  const int peels = 2;
  const int PacketAlignedMask = PacketSize-1;
  const int PeelAlignedMask = PacketSize*peels-1;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
  int alignedStart = ei_first_aligned(res,size);
  int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
  const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;

  const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
  int alignmentPattern = alignmentStep==0 ? AllAligned
                       : alignmentStep==(PacketSize/2) ? EvenAligned
                       : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const int lhsAlignmentOffset = ei_first_aligned(lhs,size);

  // find how many columns do we have to skip to be aligned with the result (if possible)
  int skipColumns = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (size_t(lhs)%sizeof(RealScalar)) || (size_t(res)%sizeof(RealScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (PacketSize>1)
  {
    ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size<PacketSize);

    while (skipColumns<PacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%PacketSize))
      ++skipColumns;
    if (skipColumns==PacketSize)
    {
      // nothing can be aligned, no need to skip any column
      alignmentPattern = NoneAligned;
      skipColumns = 0;
    }
    else
    {
      skipColumns = std::min(skipColumns, int(rhs.size()));
      // note that the skipped columns are processed later.
    }

    ei_internal_assert(  (alignmentPattern==NoneAligned)
                      || (skipColumns + columnsAtOnce >= rhs.size())
                      || PacketSize > size
                      || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0);
  }

  int offset1 = (FirstAligned && alignmentStep==1?3:1);
  int offset3 = (FirstAligned && alignmentStep==1?1:3);

  int columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
  for (int i=skipColumns; i<columnBound; i+=columnsAtOnce)
  {
    Packet ptmp0 = ei_pset1(alpha*rhs[i]),   ptmp1 = ei_pset1(alpha*rhs[i+offset1]),
           ptmp2 = ei_pset1(alpha*rhs[i+2]), ptmp3 = ei_pset1(alpha*rhs[i+offset3]);

    // using these aliases helps the compiler generate better binary code
    const Scalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (PacketSize>1)
    {
      /* explicit vectorization */
      // process initial unaligned coeffs
      for (int j=0; j<alignedStart; ++j)
        res[j] += cj.pmul(lhs0[j], ei_pfirst(ptmp0)) + cj.pmul(lhs1[j], ei_pfirst(ptmp1))
                + cj.pmul(lhs2[j], ei_pfirst(ptmp2)) + cj.pmul(lhs3[j], ei_pfirst(ptmp3));

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (int j = alignedStart; j<alignedSize; j+=PacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (int j = alignedStart; j<alignedSize; j+=PacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
            if(peels>1)
            {
              Packet A00, A01, A02, A03, A10, A11, A12, A13;

              A01 = ei_pload(&lhs1[alignedStart-1]);
              A02 = ei_pload(&lhs2[alignedStart-2]);
              A03 = ei_pload(&lhs3[alignedStart-3]);

              for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize)
              {
                A11 = ei_pload(&lhs1[j-1+PacketSize]);  ei_palign<1>(A01,A11);
                A12 = ei_pload(&lhs2[j-2+PacketSize]);  ei_palign<2>(A02,A12);
                A13 = ei_pload(&lhs3[j-3+PacketSize]);  ei_palign<3>(A03,A13);

                A00 = ei_pload (&lhs0[j]);
                A10 = ei_pload (&lhs0[j+PacketSize]);
                A00 = cj.pmadd(A00, ptmp0, ei_pload(&res[j]));
                A10 = cj.pmadd(A10, ptmp0, ei_pload(&res[j+PacketSize]));

                A00 = cj.pmadd(A01, ptmp1, A00);
                A01 = ei_pload(&lhs1[j-1+2*PacketSize]);  ei_palign<1>(A11,A01);
                A00 = cj.pmadd(A02, ptmp2, A00);
                A02 = ei_pload(&lhs2[j-2+2*PacketSize]);  ei_palign<2>(A12,A02);
                A00 = cj.pmadd(A03, ptmp3, A00);
                ei_pstore(&res[j],A00);
                A03 = ei_pload(&lhs3[j-3+2*PacketSize]);  ei_palign<3>(A13,A03);
                A10 = cj.pmadd(A11, ptmp1, A10);
                A10 = cj.pmadd(A12, ptmp2, A10);
                A10 = cj.pmadd(A13, ptmp3, A10);
                ei_pstore(&res[j+PacketSize],A10);
              }
            }
            for (int j = peeledSize; j<alignedSize; j+=PacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          default:
            for (int j = alignedStart; j<alignedSize; j+=PacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
      }
    } // end explicit vectorization

    /* process remaining coeffs (or all if there is no explicit vectorization) */
    for (int j=alignedSize; j<size; ++j)
      res[j] += cj.pmul(lhs0[j], ei_pfirst(ptmp0)) + cj.pmul(lhs1[j], ei_pfirst(ptmp1))
              + cj.pmul(lhs2[j], ei_pfirst(ptmp2)) + cj.pmul(lhs3[j], ei_pfirst(ptmp3));
  }

  // process remaining first and last columns (at most columnsAtOnce-1)
  int end = rhs.size();
  int start = columnBound;
  do
  {
    for (int i=start; i<end; ++i)
    {
      Packet ptmp0 = ei_pset1(alpha*rhs[i]);
      const Scalar* lhs0 = lhs + i*lhsStride;

      if (PacketSize>1)
      {
        /* explicit vectorization */
        // process first unaligned result's coeffs
        for (int j=0; j<alignedStart; ++j)
          res[j] += cj.pmul(lhs0[j], ei_pfirst(ptmp0));

        // process aligned result's coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0)
          for (int j = alignedStart; j<alignedSize; j+=PacketSize)
            ei_pstore(&res[j], cj.pmadd(ei_pload(&lhs0[j]), ptmp0, ei_pload(&res[j])));
        else
          for (int j = alignedStart; j<alignedSize; j+=PacketSize)
            ei_pstore(&res[j], cj.pmadd(ei_ploadu(&lhs0[j]), ptmp0, ei_pload(&res[j])));
      }

      // process remaining scalars (or all if no explicit vectorization)
      for (int j=alignedSize; j<size; ++j)
        res[j] += cj.pmul(lhs0[j], ei_pfirst(ptmp0));
    }
    if (skipColumns)
    {
      start = 0;
      end = skipColumns;
      skipColumns = 0;
    }
    else
      break;
  } while(PacketSize>1);
  #undef _EIGEN_ACCUMULATE_PACKETS
}
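
/* For reference, and ignoring vectorization, column peeling and the
 * alignment bookkeeping, the kernel above computes res += alpha * lhs * rhs
 * as a sequence of axpy operations, one per column of the column-major lhs.
 * A minimal scalar sketch of that computation (illustrative only, not part
 * of Eigen; the conjugation performed by ei_conj_helper is omitted):
 *
 *   for (int i=0; i<rhs.size(); ++i)           // for each column of lhs
 *   {
 *     const Scalar tmp = alpha * rhs[i];
 *     const Scalar* col = lhs + i*lhsStride;   // column i of lhs
 *     for (int j=0; j<size; ++j)               // axpy: res += tmp * column i
 *       res[j] += col[j] * tmp;
 *   }
 *
 * The actual code performs four such axpys per pass so that each load/store
 * of res is shared by four columns.
 */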

// TODO add peeling to mask unaligned load/stores
template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename ResType>
static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
  const Scalar* lhs, int lhsStride,
  const Scalar* rhs, int rhsSize,
  ResType& res,
  Scalar alpha)
{
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif

  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
    Packet b = ei_pload(&rhs[j]); \
    ptmp0 = cj.pmadd(EIGEN_CAT(ei_ploa,A0) (&lhs0[j]), b, ptmp0); \
    ptmp1 = cj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs1[j]), b, ptmp1); \
    ptmp2 = cj.pmadd(EIGEN_CAT(ei_ploa,A2) (&lhs2[j]), b, ptmp2); \
    ptmp3 = cj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs3[j]), b, ptmp3); }

  ei_conj_helper<ConjugateLhs,ConjugateRhs> cj;

  typedef typename NumTraits<Scalar>::Real RealScalar;
  typedef typename ei_packet_traits<Scalar>::type Packet;
  const int PacketSize = sizeof(Packet)/sizeof(Scalar);

  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
  const int rowsAtOnce = 4;
  const int peels = 2;
  const int PacketAlignedMask = PacketSize-1;
  const int PeelAlignedMask = PacketSize*peels-1;
  const int size = rhsSize;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type;
  // if that's not the case then vectorization is discarded, see below.
  int alignedStart = ei_first_aligned(rhs, size);
  int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
  const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;

  const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
  int alignmentPattern = alignmentStep==0 ? AllAligned
                       : alignmentStep==(PacketSize/2) ? EvenAligned
                       : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const int lhsAlignmentOffset = ei_first_aligned(lhs,size);

  // find how many rows do we have to skip to be aligned with rhs (if possible)
  int skipRows = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (size_t(lhs)%sizeof(RealScalar)) || (size_t(rhs)%sizeof(RealScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (PacketSize>1)
  {
    ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size<PacketSize);

    while (skipRows<PacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%PacketSize))
      ++skipRows;
    if (skipRows==PacketSize)
    {
      // nothing can be aligned, no need to skip any row
      alignmentPattern = NoneAligned;
      skipRows = 0;
    }
    else
    {
      skipRows = std::min(skipRows, int(res.size()));
      // note that the skipped rows are processed later.
    }

    ei_internal_assert(  (alignmentPattern==NoneAligned)
                      || (skipRows + rowsAtOnce >= res.size())
                      || PacketSize > rhsSize
                      || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0);
  }

  int offset1 = (FirstAligned && alignmentStep==1?3:1);
  int offset3 = (FirstAligned && alignmentStep==1?1:3);

  int rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (int i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
    Scalar tmp0 = Scalar(0), tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0);

    // using these aliases helps the compiler generate better binary code
    const Scalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (PacketSize>1)
    {
      /* explicit vectorization */
      Packet ptmp0 = ei_pset1(Scalar(0)), ptmp1 = ei_pset1(Scalar(0)),
             ptmp2 = ei_pset1(Scalar(0)), ptmp3 = ei_pset1(Scalar(0));

      // process initial unaligned coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (int j=0; j<alignedStart; ++j)
      {
        Scalar b = rhs[j];
        tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
        tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (int j = alignedStart; j<alignedSize; j+=PacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (int j = alignedStart; j<alignedSize; j+=PacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
            if (peels>1)
            {
              /* Here we process 4 rows with two peeled iterations to hide
               * the overhead of unaligned loads. Moreover, unaligned loads
               * are handled using special shift/move operations between the
               * two aligned packets overlapping the desired unaligned packet.
               * This is *much* more efficient than plain unaligned loads.
               * (A concrete illustration of this trick follows the function
               * below.)
               */
              Packet A01, A02, A03, b, A11, A12, A13;
              A01 = ei_pload(&lhs1[alignedStart-1]);
              A02 = ei_pload(&lhs2[alignedStart-2]);
              A03 = ei_pload(&lhs3[alignedStart-3]);

              for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize)
              {
                b = ei_pload(&rhs[j]);
                A11 = ei_pload(&lhs1[j-1+PacketSize]);  ei_palign<1>(A01,A11);
                A12 = ei_pload(&lhs2[j-2+PacketSize]);  ei_palign<2>(A02,A12);
                A13 = ei_pload(&lhs3[j-3+PacketSize]);  ei_palign<3>(A03,A13);

                ptmp0 = cj.pmadd(ei_pload (&lhs0[j]), b, ptmp0);
                ptmp1 = cj.pmadd(A01, b, ptmp1);
                A01 = ei_pload(&lhs1[j-1+2*PacketSize]);  ei_palign<1>(A11,A01);
                ptmp2 = cj.pmadd(A02, b, ptmp2);
                A02 = ei_pload(&lhs2[j-2+2*PacketSize]);  ei_palign<2>(A12,A02);
                ptmp3 = cj.pmadd(A03, b, ptmp3);
                A03 = ei_pload(&lhs3[j-3+2*PacketSize]);  ei_palign<3>(A13,A03);

                b = ei_pload(&rhs[j+PacketSize]);
                ptmp0 = cj.pmadd(ei_pload (&lhs0[j+PacketSize]), b, ptmp0);
                ptmp1 = cj.pmadd(A11, b, ptmp1);
                ptmp2 = cj.pmadd(A12, b, ptmp2);
                ptmp3 = cj.pmadd(A13, b, ptmp3);
              }
            }
            for (int j = peeledSize; j<alignedSize; j+=PacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          default:
            for (int j = alignedStart; j<alignedSize; j+=PacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
        tmp0 += ei_predux(ptmp0);
        tmp1 += ei_predux(ptmp1);
        tmp2 += ei_predux(ptmp2);
        tmp3 += ei_predux(ptmp3);
      }
    } // end explicit vectorization

    // process remaining coeffs (or all if no explicit vectorization)
    // FIXME this loop gets vectorized by the compiler!
    for (int j=alignedSize; j<size; ++j)
    {
      Scalar b = rhs[j];
      tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
      tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
    }
    res[i] += alpha*tmp0; res[i+offset1] += alpha*tmp1;
    res[i+2] += alpha*tmp2; res[i+offset3] += alpha*tmp3;
  }

  // process remaining first and last rows (at most rowsAtOnce-1)
  int end = res.size();
  int start = rowBound;
  do
  {
    for (int i=start; i<end; ++i)
    {
      Scalar tmp0 = Scalar(0);
      Packet ptmp0 = ei_pset1(tmp0);
      const Scalar* lhs0 = lhs + i*lhsStride;
      // process first unaligned coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (int j=0; j<alignedStart; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);

      if (alignedSize>alignedStart)
      {
        // process aligned rhs coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0)
          for (int j = alignedStart; j<alignedSize; j+=PacketSize)
            ptmp0 = cj.pmadd(ei_pload(&lhs0[j]), ei_pload(&rhs[j]), ptmp0);
        else
          for (int j = alignedStart; j<alignedSize; j+=PacketSize)
            ptmp0 = cj.pmadd(ei_ploadu(&lhs0[j]), ei_pload(&rhs[j]), ptmp0);
        tmp0 += ei_predux(ptmp0);
      }

      // process remaining scalars
      // FIXME this loop gets vectorized by the compiler!
      for (int j=alignedSize; j<size; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);

      res[i] += alpha*tmp0;
    }
    if (skipRows)
    {
      start = 0;
      end = skipRows;
      skipRows = 0;
    }
    else
      break;
  } while(PacketSize>1);

  #undef _EIGEN_ACCUMULATE_PACKETS
}
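
/* Illustration of the ei_palign trick used in the FirstAligned cases above:
 * a sketch assuming PacketSize==4 floats and a row lhs1 that is misaligned
 * by exactly one scalar, matching the j-1 indexing of the peeled loops.
 *
 *   A01 = ei_pload(&lhs1[j-1]);  // aligned: { l[j-1], l[j],   l[j+1], l[j+2] }
 *   A11 = ei_pload(&lhs1[j+3]);  // aligned: { l[j+3], l[j+4], l[j+5], l[j+6] }
 *   ei_palign<1>(A01, A11);      // A01 now  { l[j],   l[j+1], l[j+2], l[j+3] }
 *
 * That is, ei_palign<Offset>(first, second) overwrites `first` with the
 * packet starting Offset scalars into the concatenation [first, second],
 * emulating the unaligned load of &lhs1[j] with shift/move operations on
 * two aligned packets. In addition, the second packet of one emulated load
 * serves as the first packet of the next, so each aligned load is reused,
 * which is why this beats a plain ei_ploadu here.
 */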

#endif // EIGEN_GENERAL_MATRIX_VECTOR_H