diff options
author | Gael Guennebaud <g.gael@free.fr> | 2009-07-22 11:54:58 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2009-07-22 11:54:58 +0200 |
commit | d6475ea390b5a4beeef64e71a247b3f72573d768 (patch) | |
tree | 20b734753444747a7a349f8657aa3593d294173d /Eigen | |
parent | d6627d540e6a8651ccd8ce4a4520b70fe5def870 (diff) |
more refactoring in the level3 products
Diffstat (limited to 'Eigen')
-rw-r--r-- | Eigen/Core | 1 | ||||
-rw-r--r-- | Eigen/src/Core/Product.h | 38 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixMatrix.h | 221 | ||||
-rw-r--r-- | Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 138 | ||||
-rw-r--r-- | Eigen/src/Core/util/BlasUtil.h | 157 |
5 files changed, 287 insertions, 268 deletions
diff --git a/Eigen/Core b/Eigen/Core index ea871dca3..d6a72765e 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -176,6 +176,7 @@ namespace Eigen { #include "src/Core/IO.h" #include "src/Core/Swap.h" #include "src/Core/CommaInitializer.h" +#include "src/Core/util/BlasUtil.h" #include "src/Core/Product.h" #include "src/Core/products/GeneralMatrixMatrix.h" #include "src/Core/products/GeneralMatrixVector.h" diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 93e318035..78cb88c33 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -577,22 +577,6 @@ struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMod // Forward declarations -template<typename Scalar, bool ConjugateLhs, bool ConjugateRhs> -static void ei_cache_friendly_product( - int _rows, int _cols, int depth, - bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride, - bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride, - bool resRowMajor, Scalar* res, int resStride, - Scalar alpha); - -template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename RhsType> -static void ei_cache_friendly_product_colmajor_times_vector( - int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, Scalar* res, Scalar alpha); - -template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename ResType> -static void ei_cache_friendly_product_rowmajor_times_vector( - const Scalar* lhs, int lhsStride, const Scalar* rhs, int rhsSize, ResType& res, Scalar alpha); - // This helper class aims to determine which optimized product to call, // and how to call it. We have to distinghish three major cases: // 1 - matrix-matrix @@ -926,16 +910,18 @@ inline void Product<Lhs,Rhs,ProductMode>::_cacheFriendlyEvalAndAdd(DestDerived& typedef typename ei_unref<RhsCopy>::type _RhsCopy; LhsCopy lhs(actualLhs); RhsCopy rhs(actualRhs); - ei_cache_friendly_product<Scalar, - ((int(Flags)&RowMajorBit) ? bool(RhsProductTraits::NeedToConjugate) : bool(LhsProductTraits::NeedToConjugate)), - ((int(Flags)&RowMajorBit) ? bool(LhsProductTraits::NeedToConjugate) : bool(RhsProductTraits::NeedToConjugate))> - ( - rows(), cols(), lhs.cols(), - _LhsCopy::Flags&RowMajorBit, (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(), - _RhsCopy::Flags&RowMajorBit, (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(), - Flags&RowMajorBit, (Scalar*)&(res.coeffRef(0,0)), res.stride(), - actualAlpha - ); + + ei_general_matrix_matrix_product< + Scalar, + (_LhsCopy::Flags&RowMajorBit)?RowMajor:ColMajor, bool(LhsProductTraits::NeedToConjugate), + (_RhsCopy::Flags&RowMajorBit)?RowMajor:ColMajor, bool(RhsProductTraits::NeedToConjugate), + (DestDerived::Flags&RowMajorBit)?RowMajor:ColMajor> + ::run( + rows(), cols(), lhs.cols(), + (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(), + (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(), + (Scalar*)&(res.coeffRef(0,0)), res.stride(), + actualAlpha); } #endif // EIGEN_PRODUCT_H diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 30fa4aa66..68949499a 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -25,145 +25,63 @@ #ifndef EIGEN_GENERAL_MATRIX_MATRIX_H #define EIGEN_GENERAL_MATRIX_MATRIX_H -template<typename Scalar, typename Packet, int PacketSize, int mr, int nr, typename Conj> -struct ei_gebp_kernel; - -template<typename Scalar, int PacketSize, int nr> -struct ei_gemm_pack_rhs; - -template<typename Scalar, int mr> -struct ei_gemm_pack_lhs; - -template <int L2MemorySize,typename Scalar> -struct ei_L2_block_traits { - enum {width = 8 * ei_meta_sqrt<L2MemorySize/(64*sizeof(Scalar))>::ret }; -}; - -template<bool ConjLhs, bool ConjRhs> struct ei_conj_helper; - -template<> struct ei_conj_helper<false,false> -{ - template<typename T> - EIGEN_STRONG_INLINE T pmadd(const T& x, const T& y, const T& c) const { return ei_pmadd(x,y,c); } - template<typename T> - EIGEN_STRONG_INLINE T pmul(const T& x, const T& y) const { return ei_pmul(x,y); } -}; - -template<> struct ei_conj_helper<false,true> -{ - template<typename T> std::complex<T> - pmadd(const std::complex<T>& x, const std::complex<T>& y, const std::complex<T>& c) const - { return c + pmul(x,y); } - - template<typename T> std::complex<T> pmul(const std::complex<T>& x, const std::complex<T>& y) const - { return std::complex<T>(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_imag(x)*ei_real(y) - ei_real(x)*ei_imag(y)); } -}; +#ifndef EIGEN_EXTERN_INSTANTIATIONS -template<> struct ei_conj_helper<true,false> +template< + typename Scalar, + int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs> +struct ei_general_matrix_matrix_product<Scalar,LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,RowMajor> { - template<typename T> std::complex<T> - pmadd(const std::complex<T>& x, const std::complex<T>& y, const std::complex<T>& c) const - { return c + pmul(x,y); } - - template<typename T> std::complex<T> pmul(const std::complex<T>& x, const std::complex<T>& y) const - { return std::complex<T>(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } + static EIGEN_STRONG_INLINE void run( + int rows, int cols, int depth, + const Scalar* lhs, int lhsStride, + const Scalar* rhs, int rhsStride, + Scalar* res, int resStride, + Scalar alpha) + { + // transpose the product such that the result is column major + ei_general_matrix_matrix_product<Scalar, + RhsStorageOrder==RowMajor ? ColMajor : RowMajor, + ConjugateRhs, + LhsStorageOrder==RowMajor ? ColMajor : RowMajor, + ConjugateLhs, + ColMajor> + ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha); + } }; -template<> struct ei_conj_helper<true,true> +template< + typename Scalar, + int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs> +struct ei_general_matrix_matrix_product<Scalar,LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor> { - template<typename T> std::complex<T> - pmadd(const std::complex<T>& x, const std::complex<T>& y, const std::complex<T>& c) const - { return c + pmul(x,y); } - - template<typename T> std::complex<T> pmul(const std::complex<T>& x, const std::complex<T>& y) const - { return std::complex<T>(ei_real(x)*ei_real(y) - ei_imag(x)*ei_imag(y), - ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } -}; - -#ifndef EIGEN_EXTERN_INSTANTIATIONS - -/** \warning you should never call this function directly, - * this is because the ConjugateLhs/ConjugateRhs have to - * be flipped is resRowMajor==true */ -template<typename Scalar, bool ConjugateLhs, bool ConjugateRhs> -static void ei_cache_friendly_product( - int _rows, int _cols, int depth, - bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride, - bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride, - bool resRowMajor, Scalar* res, int resStride, +static void run(int rows, int cols, int depth, + const Scalar* _lhs, int lhsStride, + const Scalar* _rhs, int rhsStride, + Scalar* res, int resStride, Scalar alpha) { - const Scalar* EIGEN_RESTRICT lhs; - const Scalar* EIGEN_RESTRICT rhs; - int lhsStride, rhsStride, rows, cols; - bool lhsRowMajor; + ei_const_blas_data_mapper<Scalar, LhsStorageOrder> lhs(_lhs,lhsStride); + ei_const_blas_data_mapper<Scalar, RhsStorageOrder> rhs(_rhs,rhsStride); - ei_conj_helper<ConjugateLhs,ConjugateRhs> cj; if (ConjugateRhs) alpha = ei_conj(alpha); - bool hasAlpha = alpha != Scalar(1); - - if (resRowMajor) - { -// return ei_cache_friendly_product<Scalar,ConjugateRhs,ConjugateLhs>(_cols,_rows,depth, -// !_rhsRowMajor, _rhs, _rhsStride, -// !_lhsRowMajor, _lhs, _lhsStride, -// false, res, resStride, -// alpha); - - lhs = _rhs; - rhs = _lhs; - lhsStride = _rhsStride; - rhsStride = _lhsStride; - cols = _rows; - rows = _cols; - lhsRowMajor = !_rhsRowMajor; - ei_assert(_lhsRowMajor); - } - else - { - lhs = _lhs; - rhs = _rhs; - lhsStride = _lhsStride; - rhsStride = _rhsStride; - rows = _rows; - cols = _cols; - lhsRowMajor = _lhsRowMajor; - ei_assert(!_rhsRowMajor); - } typedef typename ei_packet_traits<Scalar>::type PacketType; + typedef ei_product_blocking_traits<Scalar> Blocking; - enum { - PacketSize = sizeof(PacketType)/sizeof(Scalar), - #if (defined __i386__) - HalfRegisterCount = 4, - #else - HalfRegisterCount = 8, - #endif - - // register block size along the N direction - nr = HalfRegisterCount/2, - - // register block size along the M direction - mr = 2 * PacketSize, - - // max cache block size along the K direction - Max_kc = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width, - - // max cache block size along the M direction - Max_mc = 2*Max_kc - }; - - int kc = std::min<int>(Max_kc,depth); // cache block size along the K direction - int mc = std::min<int>(Max_mc,rows); // cache block size along the M direction + int kc = std::min<int>(Blocking::Max_kc,depth); // cache block size along the K direction + int mc = std::min<int>(Blocking::Max_mc,rows); // cache block size along the M direction Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*PacketSize); + Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); // number of columns which can be processed by packet of nr columns - int packet_cols = (cols/nr)*nr; + int packet_cols = (cols/Blocking::nr) * Blocking::nr; - // GEMM_VAR1 + // => GEMM_VAR1 for(int k2=0; k2<depth; k2+=kc) { const int actual_kc = std::min(k2+kc,depth)-k2; @@ -171,24 +89,26 @@ static void ei_cache_friendly_product( // we have selected one row panel of rhs and one column panel of lhs // pack rhs's panel into a sequential chunk of memory // and expand each coeff to a constant packet for further reuse - ei_gemm_pack_rhs<Scalar,PacketSize,nr>()(blockB, rhs, rhsStride, hasAlpha, alpha, actual_kc, packet_cols, k2, cols); + ei_gemm_pack_rhs<Scalar, Blocking::PacketSize, Blocking::nr>()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, packet_cols, cols); // => GEPP_VAR1 for(int i2=0; i2<rows; i2+=mc) { const int actual_mc = std::min(i2+mc,rows)-i2; + + ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder>()(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); - ei_gemm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, lhsRowMajor, actual_kc, actual_mc, k2, i2); - - ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >() + ei_gebp_kernel<Scalar, PacketType, Blocking::PacketSize, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >() (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols); } } ei_aligned_stack_delete(Scalar, blockA, kc*mc); - ei_aligned_stack_delete(Scalar, blockB, kc*cols*PacketSize); + ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize); } +}; + // optimized GEneral packed Block * packed Panel product kernel template<typename Scalar, typename PacketType, int PacketSize, int mr, int nr, typename Conj> struct ei_gebp_kernel @@ -356,8 +276,7 @@ struct ei_gebp_kernel if(nr==4) res[(j2+3)*resStride + i2 + i] += C3; } } - // remaining rhs/res columns (<nr) - + // process remaining rhs/res columns one at a time // => do the same but with nr==1 for(int j2=packet_cols; j2<cols; j2++) @@ -413,35 +332,22 @@ struct ei_gebp_kernel }; // pack a block of the lhs -template<typename Scalar, int mr> +template<typename Scalar, int mr, int StorageOrder> struct ei_gemm_pack_lhs { - void operator()(Scalar* blockA, const Scalar* lhs, int lhsStride, bool lhsRowMajor, int actual_kc, int actual_mc, int k2, int i2) + void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int actual_kc, int actual_mc) { + ei_const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride); int count = 0; const int peeled_mc = (actual_mc/mr)*mr; - if (lhsRowMajor) + for(int i=0; i<peeled_mc; i+=mr) + for(int k=0; k<actual_kc; k++) + for(int w=0; w<mr; w++) + blockA[count++] = lhs(i+w, k); + for(int i=peeled_mc; i<actual_mc; i++) { - for(int i=0; i<peeled_mc; i+=mr) - for(int k=0; k<actual_kc; k++) - for(int w=0; w<mr; w++) - blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride]; - for(int i=peeled_mc; i<actual_mc; i++) - { - const Scalar* llhs = &lhs[(k2) + (i2+i)*lhsStride]; - for(int k=0; k<actual_kc; k++) - blockA[count++] = llhs[k]; - } - } - else - { - for(int i=0; i<peeled_mc; i+=mr) - for(int k=0; k<actual_kc; k++) - for(int w=0; w<mr; w++) - blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w]; - for(int i=peeled_mc; i<actual_mc; i++) - for(int k=0; k<actual_kc; k++) - blockA[count++] = lhs[(k2+k)*lhsStride + i2+i]; + for(int k=0; k<actual_kc; k++) + blockA[count++] = lhs(i, k); } } }; @@ -450,15 +356,16 @@ struct ei_gemm_pack_lhs template<typename Scalar, int PacketSize, int nr> struct ei_gemm_pack_rhs { - void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, bool hasAlpha, Scalar alpha, int actual_kc, int packet_cols, int k2, int cols) + void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int actual_kc, int packet_cols, int cols) { + bool hasAlpha = alpha != Scalar(1); int count = 0; for(int j2=0; j2<packet_cols; j2+=nr) { - const Scalar* b0 = &rhs[(j2+0)*rhsStride + k2]; - const Scalar* b1 = &rhs[(j2+1)*rhsStride + k2]; - const Scalar* b2 = &rhs[(j2+2)*rhsStride + k2]; - const Scalar* b3 = &rhs[(j2+3)*rhsStride + k2]; + const Scalar* b0 = &rhs[(j2+0)*rhsStride]; + const Scalar* b1 = &rhs[(j2+1)*rhsStride]; + const Scalar* b2 = &rhs[(j2+2)*rhsStride]; + const Scalar* b3 = &rhs[(j2+3)*rhsStride]; if (hasAlpha) { for(int k=0; k<actual_kc; k++) @@ -491,7 +398,7 @@ struct ei_gemm_pack_rhs // copy the remaining columns one at a time (nr==1) for(int j2=packet_cols; j2<cols; ++j2) { - const Scalar* b0 = &rhs[(j2+0)*rhsStride + k2]; + const Scalar* b0 = &rhs[(j2+0)*rhsStride]; if (hasAlpha) { for(int k=0; k<actual_kc; k++) diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 6ab7b7211..5008d227e 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -25,61 +25,44 @@ #ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H #define EIGEN_SELFADJOINT_MATRIX_MATRIX_H -template<typename Scalar, int mr> +// pack a selfadjoint block diagonal for use with the gebp_kernel +template<typename Scalar, int mr, int StorageOrder> struct ei_symm_pack_lhs { - void operator()(Scalar* blockA, const Scalar* lhs, int lhsStride, bool lhsRowMajor, int actual_kc, int actual_mc, int k2, int i2) + void operator()(Scalar* blockA, const Scalar* _lhs, int lhsStride, int actual_kc, int actual_mc) { + ei_const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride); int count = 0; const int peeled_mc = (actual_mc/mr)*mr; - if (lhsRowMajor) + for(int i=0; i<peeled_mc; i+=mr) { - for(int i=0; i<peeled_mc; i+=mr) - for(int k=0; k<actual_kc; k++) - for(int w=0; w<mr; w++) - blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride]; - for(int i=peeled_mc; i<actual_mc; i++) + for(int k=0; k<i; k++) + for(int w=0; w<mr; w++) + blockA[count++] = lhs(i+w,k); + // symmetric copy + int h = 0; + for(int k=i; k<i+mr; k++) { - const Scalar* llhs = &lhs[(k2) + (i2+i)*lhsStride]; - for(int k=0; k<actual_kc; k++) - blockA[count++] = llhs[k]; + // transposed copy + for(int w=0; w<h; w++) + blockA[count++] = lhs(k, i+w); + for(int w=h; w<mr; w++) + blockA[count++] = lhs(i+w, k); + ++h; } + // transposed copy + for(int k=i+mr; k<actual_kc; k++) + for(int w=0; w<mr; w++) + blockA[count++] = lhs(k, i+w); } - else + // do the same with mr==1 + for(int i=peeled_mc; i<actual_mc; i++) { - for(int i=0; i<peeled_mc; i+=mr) - { - for(int k=0; k<i; k++) - for(int w=0; w<mr; w++) - blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w]; - - // symmetric copy - int h = 0; - for(int k=i; k<i+mr; k++) - { - // transposed copy - for(int w=0; w<h; w++) - blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride]; - for(int w=h; w<mr; w++) - blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w]; - ++h; - } - - // transposed copy - for(int k=i+mr; k<actual_kc; k++) - for(int w=0; w<mr; w++) - blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride]; - } - // do the same with mr==1 - for(int i=peeled_mc; i<actual_mc; i++) - { - for(int k=0; k<=i; k++) - blockA[count++] = lhs[(k2+k)*lhsStride + i2+i]; - - // transposed copy - for(int k=i+1; k<actual_kc; k++) - blockA[count++] = lhs[(k2+k) + (i2+i)*lhsStride]; - } + for(int k=0; k<=i; k++) + blockA[count++] = lhs(i, k); + // transposed copy + for(int k=i+1; k<actual_kc; k++) + blockA[count++] = lhs(k, i); } } }; @@ -90,51 +73,36 @@ struct ei_symm_pack_lhs template<typename Scalar, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> static EIGEN_DONT_INLINE void ei_product_selfadjoint_matrix( int size, - const Scalar* lhs, int lhsStride, - const Scalar* rhs, int rhsStride, bool rhsRowMajor, int cols, + const Scalar* _lhs, int lhsStride, + const Scalar* _rhs, int rhsStride, bool rhsRowMajor, int cols, Scalar* res, int resStride, Scalar alpha) { typedef typename ei_packet_traits<Scalar>::type Packet; - ei_conj_helper<ConjugateLhs,ConjugateRhs> cj; + ei_const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride); + ei_const_blas_data_mapper<Scalar, ColMajor> rhs(_rhs,rhsStride); + if (ConjugateRhs) alpha = ei_conj(alpha); - bool hasAlpha = alpha != Scalar(1); typedef typename ei_packet_traits<Scalar>::type PacketType; const bool lhsRowMajor = (StorageOrder==RowMajor); - enum { - PacketSize = sizeof(PacketType)/sizeof(Scalar), - #if (defined __i386__) - HalfRegisterCount = 4, - #else - HalfRegisterCount = 8, - #endif - - // register block size along the N direction - nr = HalfRegisterCount/2, - - // register block size along the M direction - mr = 2 * PacketSize, + typedef ei_product_blocking_traits<Scalar> Blocking; - // max cache block size along the K direction - Max_kc = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width, - - // max cache block size along the M direction - Max_mc = 2*Max_kc - }; - - int kc = std::min<int>(Max_kc,size); // cache block size along the K direction - int mc = std::min<int>(Max_mc,size); // cache block size along the M direction + int kc = std::min<int>(Blocking::Max_kc,size); // cache block size along the K direction + int mc = std::min<int>(Blocking::Max_mc,size); // cache block size along the M direction Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*PacketSize); + Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); // number of columns which can be processed by packet of nr columns - int packet_cols = (cols/nr)*nr; + int packet_cols = (cols/Blocking::nr)*Blocking::nr; + + ei_gebp_kernel<Scalar, PacketType, Blocking::PacketSize, + Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel; for(int k2=0; k2<size; k2+=kc) { @@ -143,7 +111,8 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_matrix( // we have selected one row panel of rhs and one column panel of lhs // pack rhs's panel into a sequential chunk of memory // and expand each coeff to a constant packet for further reuse - ei_gemm_pack_rhs<Scalar,PacketSize,nr>()(blockB, rhs, rhsStride, hasAlpha, alpha, actual_kc, packet_cols, k2, cols); + ei_gemm_pack_rhs<Scalar,Blocking::PacketSize,Blocking::nr>() + (blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, packet_cols, cols); // the select lhs's panel has to be split in three different parts: // 1 - the transposed panel above the diagonal block => transposed packed copy @@ -153,32 +122,31 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_matrix( { const int actual_mc = std::min(i2+mc,k2)-i2; // transposed packed copy - ei_gemm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, !lhsRowMajor, actual_kc, actual_mc, k2, i2); + ei_gemm_pack_lhs<Scalar,Blocking::mr,StorageOrder==RowMajor?ColMajor:RowMajor>() + (blockA, &lhs(k2,i2), lhsStride, actual_kc, actual_mc); - ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >() - (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols); + gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols); } // the block diagonal { const int actual_mc = std::min(k2+kc,size)-k2; // symmetric packed copy - ei_symm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, lhsRowMajor, actual_kc, actual_mc, k2, k2); - ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >() - (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, k2, cols); + ei_symm_pack_lhs<Scalar,Blocking::mr,StorageOrder>() + (blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc); + gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, k2, cols); } for(int i2=k2+kc; i2<size; i2+=mc) { const int actual_mc = std::min(i2+mc,size)-i2; - ei_gemm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, lhsRowMajor, actual_kc, actual_mc, k2, i2); - ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >() - (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols); + ei_gemm_pack_lhs<Scalar,Blocking::mr,StorageOrder>() + (blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); + gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols); } } ei_aligned_stack_delete(Scalar, blockA, kc*mc); - ei_aligned_stack_delete(Scalar, blockB, kc*cols*PacketSize); + ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize); } - #endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h new file mode 100644 index 000000000..6e4b21e6a --- /dev/null +++ b/Eigen/src/Core/util/BlasUtil.h @@ -0,0 +1,157 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_BLASUTIL_H +#define EIGEN_BLASUTIL_H + +// This file contains many lightweight helper classes used to +// implement and control fast level 2 and level 3 BLAS-like routines. + +// forward declarations +template<typename Scalar, typename Packet, int PacketSize, int mr, int nr, typename Conj> +struct ei_gebp_kernel; + +template<typename Scalar, int PacketSize, int nr> +struct ei_gemm_pack_rhs; + +template<typename Scalar, int mr, int StorageOrder> +struct ei_gemm_pack_lhs; + +template< + typename Scalar, + int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs, + int ResStorageOrder> +struct ei_general_matrix_matrix_product; + +template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename RhsType> +static void ei_cache_friendly_product_colmajor_times_vector( + int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, Scalar* res, Scalar alpha); + +template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename ResType> +static void ei_cache_friendly_product_rowmajor_times_vector( + const Scalar* lhs, int lhsStride, const Scalar* rhs, int rhsSize, ResType& res, Scalar alpha); + +// Provides scalar/packet-wise product and product with accumulation +// with optional conjugation of the arguments. +template<bool ConjLhs, bool ConjRhs> struct ei_conj_helper; + +template<> struct ei_conj_helper<false,false> +{ + template<typename T> + EIGEN_STRONG_INLINE T pmadd(const T& x, const T& y, const T& c) const { return ei_pmadd(x,y,c); } + template<typename T> + EIGEN_STRONG_INLINE T pmul(const T& x, const T& y) const { return ei_pmul(x,y); } +}; + +template<> struct ei_conj_helper<false,true> +{ + template<typename T> std::complex<T> + pmadd(const std::complex<T>& x, const std::complex<T>& y, const std::complex<T>& c) const + { return c + pmul(x,y); } + + template<typename T> std::complex<T> pmul(const std::complex<T>& x, const std::complex<T>& y) const + { return std::complex<T>(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_imag(x)*ei_real(y) - ei_real(x)*ei_imag(y)); } +}; + +template<> struct ei_conj_helper<true,false> +{ + template<typename T> std::complex<T> + pmadd(const std::complex<T>& x, const std::complex<T>& y, const std::complex<T>& c) const + { return c + pmul(x,y); } + + template<typename T> std::complex<T> pmul(const std::complex<T>& x, const std::complex<T>& y) const + { return std::complex<T>(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } +}; + +template<> struct ei_conj_helper<true,true> +{ + template<typename T> std::complex<T> + pmadd(const std::complex<T>& x, const std::complex<T>& y, const std::complex<T>& c) const + { return c + pmul(x,y); } + + template<typename T> std::complex<T> pmul(const std::complex<T>& x, const std::complex<T>& y) const + { return std::complex<T>(ei_real(x)*ei_real(y) - ei_imag(x)*ei_imag(y), - ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } +}; + +// lightweight helper class to access matrix coefficients +template<typename Scalar, int StorageOrder> +class ei_blas_data_mapper +{ + public: + ei_blas_data_mapper(Scalar* data, int stride) : m_data(data), m_stride(stride) {} + EIGEN_STRONG_INLINE Scalar& operator()(int i, int j) + { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; } + protected: + Scalar* EIGEN_RESTRICT m_data; + int m_stride; +}; + +// lightweight helper class to access matrix coefficients (const version) +template<typename Scalar, int StorageOrder> +class ei_const_blas_data_mapper +{ + public: + ei_const_blas_data_mapper(const Scalar* data, int stride) : m_data(data), m_stride(stride) {} + EIGEN_STRONG_INLINE const Scalar& operator()(int i, int j) const + { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; } + protected: + const Scalar* EIGEN_RESTRICT m_data; + int m_stride; +}; + +// +// template <int L2MemorySize,typename Scalar> +// struct ei_L2_block_traits { +// enum {width = 8 * ei_meta_sqrt<L2MemorySize/(64*sizeof(Scalar))>::ret }; +// }; + +// Defines various constant controlling level 3 blocking +template<typename Scalar> +struct ei_product_blocking_traits +{ + typedef typename ei_packet_traits<Scalar>::type PacketType; + enum { + PacketSize = sizeof(PacketType)/sizeof(Scalar), + #if (defined __i386__) + HalfRegisterCount = 4, + #else + HalfRegisterCount = 8, + #endif + + // register block size along the N direction + nr = HalfRegisterCount/2, + + // register block size along the M direction + mr = 2 * PacketSize, + + // max cache block size along the K direction + Max_kc = 8 * ei_meta_sqrt<EIGEN_TUNE_FOR_CPU_CACHE_SIZE/(64*sizeof(Scalar))>::ret, + + // max cache block size along the M direction + Max_mc = 2*Max_kc + }; +}; + +#endif // EIGEN_BLASUTIL_H |