diff options
author | Gael Guennebaud <g.gael@free.fr> | 2010-07-11 23:57:23 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2010-07-11 23:57:23 +0200 |
commit | f8678272a4244babe25cc92bbb9298ed922330a4 (patch) | |
tree | 1c8241a1e3bfe345c53d105c8939966fe7ef83bf | |
parent | 8e3c4283f52a14b64ce346dcdd9115871a481ab6 (diff) |
mixing types step 3:
- improve support of colmajor by vector and matrix - matrix
- now all configurations are well handled, but the perf are not always very good
-rw-r--r-- | Eigen/src/Core/Product.h | 36 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/Complex.h | 9 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 48 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixMatrix.h | 10 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixVector.h | 43 | ||||
-rw-r--r-- | Eigen/src/Core/util/BlasUtil.h | 14 |
6 files changed, 122 insertions, 38 deletions
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index e929b8d89..25b48517d 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -323,9 +323,10 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true> static void run(const ProductType& prod, Dest& dest, typename ProductType::Scalar alpha) { typedef typename ProductType::Index Index; - typedef typename ProductType::LhsScalar LhsScalar; - typedef typename ProductType::RhsScalar RhsScalar; - typedef typename ProductType::Scalar ResScalar; + typedef typename ProductType::LhsScalar LhsScalar; + typedef typename ProductType::RhsScalar RhsScalar; + typedef typename ProductType::Scalar ResScalar; + typedef typename ProductType::RealScalar RealScalar; typedef typename ProductType::ActualLhsType ActualLhsType; typedef typename ProductType::ActualRhsType ActualRhsType; typedef typename ProductType::LhsBlasTraits LhsBlasTraits; @@ -340,16 +341,30 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true> enum { // FIXME find a way to allow an inner stride on the result if ei_packet_traits<Scalar>::size==1 - EvalToDest = Dest::InnerStrideAtCompileTime==1 + EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1, + ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex) }; + bool alphaIsCompatible = (!ComplexByReal) || (ei_imag(actualAlpha)==RealScalar(0)); + bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; + + RhsScalar compatibleAlpha = ei_get_factor<ResScalar,RhsScalar>::run(actualAlpha); + ResScalar* actualDest; - if (EvalToDest) + if (evalToDest) + { actualDest = &dest.coeffRef(0); + } else { actualDest = ei_aligned_stack_new(ResScalar,dest.size()); - MappedDest(actualDest, dest.size()) = dest; + if(!alphaIsCompatible) + { + MappedDest(actualDest, dest.size()).setZero(); + compatibleAlpha = RhsScalar(1); + } + else + MappedDest(actualDest, dest.size()) = dest; } ei_general_matrix_vector_product @@ -358,11 +373,14 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true> &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(), actualRhs.data(), actualRhs.innerStride(), actualDest, 1, - actualAlpha); + compatibleAlpha); - if (!EvalToDest) + if (!evalToDest) { - dest = MappedDest(actualDest, dest.size()); + if(!alphaIsCompatible) + dest += actualAlpha * MappedDest(actualDest, dest.size()); + else + dest = MappedDest(actualDest, dest.size()); ei_aligned_stack_delete(ResScalar, actualDest, dest.size()); } } diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 6c72293fc..d1880294c 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -207,6 +207,15 @@ template<> struct ei_conj_helper<Packet4f, Packet2cf, false,false> { return Packet2cf(ei_pmul(x, y.v)); } }; +template<> struct ei_conj_helper<Packet2cf, Packet4f, false,false> +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const + { return ei_padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const + { return Packet2cf(ei_pmul(x.v, y)); } +}; + template<> EIGEN_STRONG_INLINE Packet2cf ei_pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for SSE3 and 4 diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 9214582ed..e1c258ff0 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -140,17 +140,26 @@ inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, st #ifdef EIGEN_HAS_FUSE_CJMADD #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); #else - #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = ei_padd(C,ResPacket(T)); + #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = ei_padd(C,T); #endif -// optimized GEneral packed Block * packed Panel product kernel +/* optimized GEneral packed Block * packed Panel product kernel + * + * Mixing type logic: C += A * B + * | A | B | comments + * |real |cplx | no vectorization yet, would require to pack A with duplication + * |cplx |real | easy vectorization + */ template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> struct ei_gebp_kernel { typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; enum { - Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable, + Vectorizable = ei_product_blocking_traits<LhsScalar,RhsScalar>::Vectorizable /*ei_packet_traits<LhsScalar>::Vectorizable + && ei_packet_traits<RhsScalar>::Vectorizable + && (ei_is_same_type<LhsScalar,RhsScalar>::ret + || (NumTraits<LhsScalar>::IsComplex && !NumTraits<RhsScalar>::IsComplex))*/, LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1, RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1, ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1 @@ -766,36 +775,51 @@ template<typename Scalar, typename Index, int mr, int StorageOrder, bool Conjuga struct ei_gemm_pack_lhs { void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, - Index stride=0, Index offset=0) + Scalar alpha = Scalar(1), Index stride=0, Index offset=0) { enum { PacketSize = ei_packet_traits<Scalar>::size }; ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; ei_const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride); + bool hasAlpha = alpha != Scalar(1); Index count = 0; Index peeled_mc = (rows/mr)*mr; for(Index i=0; i<peeled_mc; i+=mr) { if(PanelMode) count += mr * offset; - for(Index k=0; k<depth; k++) - for(Index w=0; w<mr; w++) - blockA[count++] = cj(lhs(i+w, k)); + if(hasAlpha) + for(Index k=0; k<depth; k++) + for(Index w=0; w<mr; w++) + blockA[count++] = alpha * cj(lhs(i+w, k)); + else + for(Index k=0; k<depth; k++) + for(Index w=0; w<mr; w++) + blockA[count++] = cj(lhs(i+w, k)); if(PanelMode) count += mr * (stride-offset-depth); } if(rows-peeled_mc>=PacketSize) { if(PanelMode) count += PacketSize*offset; - for(Index k=0; k<depth; k++) - for(Index w=0; w<PacketSize; w++) - blockA[count++] = cj(lhs(peeled_mc+w, k)); + if(hasAlpha) + for(Index k=0; k<depth; k++) + for(Index w=0; w<PacketSize; w++) + blockA[count++] = alpha * cj(lhs(peeled_mc+w, k)); + else + for(Index k=0; k<depth; k++) + for(Index w=0; w<PacketSize; w++) + blockA[count++] = cj(lhs(peeled_mc+w, k)); if(PanelMode) count += PacketSize * (stride-offset-depth); peeled_mc += PacketSize; } for(Index i=peeled_mc; i<rows; i++) { if(PanelMode) count += offset; - for(Index k=0; k<depth; k++) - blockA[count++] = cj(lhs(i, k)); + if(hasAlpha) + for(Index k=0; k<depth; k++) + blockA[count++] = alpha * cj(lhs(i, k)); + else + for(Index k=0; k<depth; k++) + blockA[count++] = cj(lhs(i, k)); if(PanelMode) count += (stride-offset-depth); } } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index f2f4ae506..b89cfa8a7 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -74,6 +74,7 @@ static void run(Index rows, Index cols, Index depth, ei_const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); typedef ei_product_blocking_traits<LhsScalar,RhsScalar> Blocking; + bool alphaOnLhs = NumTraits<LhsScalar>::IsComplex && !NumTraits<RhsScalar>::IsComplex; Index kc = blocking.kc(); // cache block size along the K direction Index mc = std::min(rows,blocking.mc()); // cache block size along the M direction @@ -86,7 +87,7 @@ static void run(Index rows, Index cols, Index depth, ei_gemm_pack_rhs<RhsScalar, Index, Blocking::nr, RhsStorageOrder, ConjugateRhs> pack_rhs; ei_gebp_kernel<LhsScalar, RhsScalar, Index, Blocking::mr, Blocking::nr> gebp; -// if (ConjugateRhs) +// if ((ConjugateRhs && !alphaOnLhs) || (ConjugateLhs && alphaOnLhs)) // alpha = ei_conj(alpha); // ei_gemm_pack_lhs<LhsScalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs; // ei_gemm_pack_rhs<RhsScalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs; @@ -177,6 +178,9 @@ static void run(Index rows, Index cols, Index depth, RhsScalar *blockB = blocking.blockB()==0 ? ei_aligned_stack_new(RhsScalar, sizeB) : blocking.blockB(); RhsScalar *blockW = blocking.blockW()==0 ? ei_aligned_stack_new(RhsScalar, sizeW) : blocking.blockW(); + LhsScalar lhsAlpha = alphaOnLhs ? ei_get_factor<ResScalar,LhsScalar>::run(alpha) : LhsScalar(1); + RhsScalar rhsAlpha = alphaOnLhs ? RhsScalar(1) : ei_get_factor<ResScalar,RhsScalar>::run(alpha); + // For each horizontal panel of the rhs, and corresponding panel of the lhs... // (==GEMM_VAR1) for(Index k2=0; k2<depth; k2+=kc) @@ -187,7 +191,7 @@ static void run(Index rows, Index cols, Index depth, // => Pack rhs's panel into a sequential chunk of memory (L2 caching) // Note that this panel will be read as many times as the number of blocks in the lhs's // vertical panel which is, in practice, a very low number. - pack_rhs(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols); + pack_rhs(blockB, &rhs(k2,0), rhsStride, rhsAlpha, actual_kc, cols); // For each mc x kc block of the lhs's vertical panel... @@ -199,7 +203,7 @@ static void run(Index rows, Index cols, Index depth, // We pack the lhs's block into a sequential chunk of memory (L1 caching) // Note that this block will be read a very high number of times, which is equal to the number of // micro vertical panel of the large rhs's panel (e.g., cols/4 times). - pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); + pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc, lhsAlpha); // Everything is packed, we can now call the block * panel kernel: gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, -1, -1, 0, 0, blockW); diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index d772834a2..4d2f82680 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -30,7 +30,13 @@ * the number of load/stores of the result by a factor 4 and to reduce * the instruction dependency. Moreover, we know that all bands have the * same alignment pattern. - * TODO: since rhs gets evaluated only once, no need to evaluate it + * + * Mixing type logic: C += alpha * A * B + * | A | B |alpha| comments + * |real |cplx |cplx | no vectorization + * |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization + * |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp + * |cplx |real |real | optimal case, vectorization possible via real-cplx mul */ template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> struct ei_general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs> @@ -39,7 +45,7 @@ typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResS enum { Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable - && ei_packet_traits<LhsScalar>::size==ei_packet_traits<RhsScalar>::size, + && int(ei_packet_traits<LhsScalar>::size)==int(ei_packet_traits<RhsScalar>::size), LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1, RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1, ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1 @@ -58,7 +64,7 @@ EIGEN_DONT_INLINE static void run( const LhsScalar* lhs, Index lhsStride, const RhsScalar* rhs, Index rhsIncr, ResScalar* res, Index resIncr, - ResScalar alpha) + RhsScalar alpha) { ei_internal_assert(resIncr==1); #ifdef _EIGEN_ACCUMULATE_PACKETS @@ -182,6 +188,7 @@ EIGEN_DONT_INLINE static void run( if(peels>1) { LhsPacket A00, A01, A02, A03, A10, A11, A12, A13; + ResPacket T0, T1; A01 = ei_pload<LhsPacket>(&lhs1[alignedStart-1]); A02 = ei_pload<LhsPacket>(&lhs2[alignedStart-2]); @@ -195,20 +202,20 @@ EIGEN_DONT_INLINE static void run( A00 = ei_pload<LhsPacket>(&lhs0[j]); A10 = ei_pload<LhsPacket>(&lhs0[j+LhsPacketSize]); - A00 = pcj.pmadd(A00, ptmp0, ei_pload<ResPacket>(&res[j])); - A10 = pcj.pmadd(A10, ptmp0, ei_pload<ResPacket>(&res[j+ResPacketSize])); + T0 = pcj.pmadd(A00, ptmp0, ei_pload<ResPacket>(&res[j])); + T1 = pcj.pmadd(A10, ptmp0, ei_pload<ResPacket>(&res[j+ResPacketSize])); - A00 = pcj.pmadd(A01, ptmp1, A00); + T0 = pcj.pmadd(A01, ptmp1, T0); A01 = ei_pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); ei_palign<1>(A11,A01); - A00 = pcj.pmadd(A02, ptmp2, A00); + T0 = pcj.pmadd(A02, ptmp2, T0); A02 = ei_pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); ei_palign<2>(A12,A02); - A00 = pcj.pmadd(A03, ptmp3, A00); - ei_pstore(&res[j],A00); + T0 = pcj.pmadd(A03, ptmp3, T0); + ei_pstore(&res[j],T0); A03 = ei_pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); ei_palign<3>(A13,A03); - A10 = pcj.pmadd(A11, ptmp1, A10); - A10 = pcj.pmadd(A12, ptmp2, A10); - A10 = pcj.pmadd(A13, ptmp3, A10); - ei_pstore(&res[j+ResPacketSize],A10); + T1 = pcj.pmadd(A11, ptmp1, T1); + T1 = pcj.pmadd(A12, ptmp2, T1); + T1 = pcj.pmadd(A13, ptmp3, T1); + ei_pstore(&res[j+ResPacketSize],T1); } } for (Index j = peeledSize; j<alignedSize; j+=ResPacketSize) @@ -275,6 +282,16 @@ EIGEN_DONT_INLINE static void run( } }; +/* Optimized row-major matrix * vector product: + * This algorithm processes 4 rows at onces that allows to both reduce + * the number of load/stores of the result by a factor 4 and to reduce + * the instruction dependency. Moreover, we know that all bands have the + * same alignment pattern. + * + * Mixing type logic: + * - alpha is always a complex (or converted to a complex) + * - no vectorization + */ template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> struct ei_general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs> { diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 459446409..797f6b0c4 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -115,6 +115,13 @@ template<typename RealScalar,bool Conj> struct ei_conj_helper<RealScalar, std::c { return x*ei_conj_if<Conj>()(y); } }; +template<typename From,typename To> struct ei_get_factor { + EIGEN_STRONG_INLINE static To run(const From& x) { return x; } +}; + +template<typename Scalar> struct ei_get_factor<Scalar,typename NumTraits<Scalar>::Real> { + EIGEN_STRONG_INLINE static typename NumTraits<Scalar>::Real run(const Scalar& x) { return ei_real(x); } +}; // Lightweight helper class to access matrix coefficients. // Yes, this is somehow redundant with Map<>, but this version is much much lighter, @@ -151,7 +158,11 @@ template<typename LhsScalar, typename RhsScalar> struct ei_product_blocking_traits { enum { - LhsPacketSize = ei_packet_traits<LhsScalar>::size, + Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable + && ei_packet_traits<RhsScalar>::Vectorizable + && (ei_is_same_type<LhsScalar,RhsScalar>::ret + || (NumTraits<LhsScalar>::IsComplex && !NumTraits<RhsScalar>::IsComplex)), + LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, // register block size along the N direction (must be either 2 or 4) @@ -167,6 +178,7 @@ struct ei_product_blocking_traits<std::complex<Real>, std::complex<Real> > { typedef std::complex<Real> Scalar; enum { + Vectorizable = ei_packet_traits<Scalar>::Vectorizable, PacketSize = ei_packet_traits<Scalar>::size, nr = 2, mr = 2 * PacketSize |