// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.

#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

/** \internal */
inline void ei_manage_caching_sizes(Action action, std::ptrdiff_t* a=0, std::ptrdiff_t* b=0, std::ptrdiff_t* c=0, int scalar_size = 0)
{
  const int nbScalarSizes = 12;
  static std::ptrdiff_t m_maxK[nbScalarSizes];
  static std::ptrdiff_t m_maxM[nbScalarSizes];
  static std::ptrdiff_t m_maxN[nbScalarSizes];
  static std::ptrdiff_t m_l1CacheSize = 0;
  static std::ptrdiff_t m_l2CacheSize = 0;
  if(m_l1CacheSize==0)
  {
    // initialization
    m_l1CacheSize = EIGEN_TUNE_FOR_CPU_CACHE_SIZE;
    m_l2CacheSize = 32*EIGEN_TUNE_FOR_CPU_CACHE_SIZE;
    ei_manage_caching_sizes(SetAction, &m_l1CacheSize, &m_l2CacheSize);
  }

  if(action==SetAction && scalar_size==0)
  {
    // set the cpu cache size and cache all block sizes from a global cache size in byte
    ei_internal_assert(a!=0 && b!=0 && c==0);
    m_l1CacheSize = *a;
    m_l2CacheSize = *b;
    int ss = 4;
    for(int i=0; i<nbScalarSizes; ++i, ss+=4)
    {
      m_maxK[i] = 4 * std::ptrdiff_t(ei_sqrt(float(m_l1CacheSize/(64*ss))));
      m_maxM[i] = 2 * m_maxK[i];
      m_maxN[i] = ((m_l2CacheSize / (2 * m_maxK[i] * ss))/4)*4;
    }
  }
  else if(action==SetAction && scalar_size!=0)
  {
    // set the block sizes for the given scalar type (represented as its size)
    ei_internal_assert(a!=0 && b!=0 && c!=0);
    int i = std::max((scalar_size>>2)-1,0);
    if(i<nbScalarSizes)
    {
      m_maxK[i] = *a;
      m_maxM[i] = *b;
      m_maxN[i] = *c;
    }
  }
  else if(action==GetAction && scalar_size==0)
  {
    // return the cpu cache sizes
    ei_internal_assert(a!=0 && b!=0 && c==0);
    *a = m_l1CacheSize;
    *b = m_l2CacheSize;
  }
  else if(action==GetAction && scalar_size!=0)
  {
    // return the block sizes for the given scalar type (represented as its size)
    ei_internal_assert(a!=0 && b!=0 && c!=0);
    int i = std::min<int>(std::max<int>((scalar_size>>2),1),nbScalarSizes)-1;
    *a = m_maxK[i];
    *b = m_maxM[i];
    *c = m_maxN[i];
  }
  else
  {
    ei_internal_assert(false);
  }
}

/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSizes */
inline std::ptrdiff_t l1CacheSize()
{
  std::ptrdiff_t l1, l2;
  ei_manage_caching_sizes(GetAction, &l1, &l2);
  return l1;
}

/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSizes */
inline std::ptrdiff_t l2CacheSize()
{
  std::ptrdiff_t l1, l2;
  ei_manage_caching_sizes(GetAction, &l1, &l2);
  return l2;
}

/** Set the cpu L1 and L2 cache sizes (in bytes).
  * These values are used to adjust the block sizes of the algorithms that work per block.
  *
  * This function also automatically sets the blocking size parameters
  * for each scalar type using the following rules:
  * \code
  *  max_k = 4 * sqrt(l1/(64*sizeof(Scalar)));
  *  max_m = 2 * max_k;
  *  max_n = l2/(2*max_k*sizeof(Scalar));
  * \endcode
  * overwriting any custom values set using the setBlockingSizes function.
  *
  * See setBlockingSizes() for an explanation about the meaning of these parameters.
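  *
  * For instance, with a purely hypothetical CPU having a 32 KB L1 cache and a 1 MB
  * L2 cache, and for a 4 byte scalar type such as \c float, these rules would give
  * approximately:
  * \code
  *  setCpuCacheSizes(32*1024, 1024*1024);
  *  // max_k = 4 * sqrt(32768/(64*4))  ~   45
  *  // max_m = 2 * max_k               ~   90
  *  // max_n = 1048576/(2*45*4)        ~ 2912
  * \endcode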
  *
  * \sa setBlockingSizes
  */
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
{
  ei_manage_caching_sizes(SetAction, &l1, &l2);
}

/** \brief Set the blocking size parameters \a maxK, \a maxM and \a maxN for the scalar type \a Scalar.
  *
  * \param[in] maxK the size of the L1 and L2 blocks along the k dimension
  * \param[in] maxM the size of the L1 blocks along the m dimension
  * \param[in] maxN the size of the L2 blocks along the n dimension
  *
  * This function sets the blocking size parameters for matrix products and related algorithms.
  * More precisely, let A * B be an m x k by k x n matrix product. Then Eigen's product-like
  * algorithms perform L2 blocking on B with horizontal panels of size maxK x maxN,
  * and L1 blocking on A with blocks of size maxM x maxK.
  *
  * Theoretically, for best performance maxM should be close to maxK, and maxM * maxK * sizeof(Scalar)
  * should not exceed half of the L1 cache. Likewise, maxK * maxN * sizeof(Scalar) should be smaller
  * than the L2 cache.
  *
  * Note that in practice there is no distinction between scalar types of the same size.
  *
  * \sa setCpuCacheSizes
  */
template<typename Scalar>
void setBlockingSizes(std::ptrdiff_t maxK, std::ptrdiff_t maxM, std::ptrdiff_t maxN)
{
  std::ptrdiff_t k, m, n;
  typedef ei_product_blocking_traits<Scalar> Traits;
  k = ((maxK)/4)*4;
  m = ((maxM)/Traits::mr)*Traits::mr;
  n = ((maxN)/Traits::nr)*Traits::nr;
  ei_manage_caching_sizes(SetAction,&k,&m,&n,sizeof(Scalar));
}

/** \returns in \a maxK, \a maxM and \a maxN the blocking size parameters for the scalar type \a Scalar.
  *
  * See setBlockingSizes for an explanation about the meaning of these parameters.
  *
  * \sa setBlockingSizes
  */
template<typename Scalar>
void getBlockingSizes(std::ptrdiff_t& maxK, std::ptrdiff_t& maxM, std::ptrdiff_t& maxN)
{
  ei_manage_caching_sizes(GetAction,&maxK,&maxM,&maxN,sizeof(Scalar));
}

#ifdef EIGEN_HAS_FUSE_CJMADD
#define CJMADD(A,B,C,T)  C = cj.pmadd(A,B,C);
#else
#define CJMADD(A,B,C,T)  T = B; T = cj.pmul(A,T); C = ei_padd(C,T);
#endif
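// Usage sketch for the cache/blocking API above. The sizes below are purely
// illustrative values, not recommendations for any particular CPU:
//
//   setCpuCacheSizes(32*1024, 1024*1024);     // announce a 32 KB L1 / 1 MB L2 cache
//   std::ptrdiff_t k, m, n;
//   getBlockingSizes<float>(k, m, n);          // query the block sizes derived above
//   setBlockingSizes<float>(128, 128, 1024);   // or enforce custom blocking sizes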
// optimized GEneral packed Block * packed Panel product kernel
template<typename Scalar, typename Index, int mr, int nr, typename Conj>
struct ei_gebp_kernel
{
  void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB,
                  Index rows, Index depth, Index cols,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0,
                  Scalar* unpackedB = 0)
  {
    typedef typename ei_packet_traits<Scalar>::type PacketType;
    enum { PacketSize = ei_packet_traits<Scalar>::size };
    if(strideA==-1) strideA = depth;
    if(strideB==-1) strideB = depth;
    Conj cj;
    Index packet_cols = (cols/nr) * nr;
    const Index peeled_mc  = (rows/mr)*mr;
    const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0);
    const Index peeled_kc  = (depth/4)*4;
    if(unpackedB==0)
      unpackedB = const_cast<Scalar*>(blockB - strideB * nr * PacketSize);

    // loops on each micro vertical panel of rhs (depth x nr)
    for(Index j2=0; j2<packet_cols; j2+=nr)
    {
      // ...

      // loops on each micro horizontal panel of lhs (mr x depth)
      // => we select a mr x nr micro block of res which is entirely
      //    stored into mr/packet_size x nr registers.
      for(Index i=0; i<peeled_mc; i+=mr)
      {
        // ...
      }

      if(rows-peeled_mc>=PacketSize)
      {
        Index i = peeled_mc;
        const Scalar* blA = &blockA[i*strideA+offsetA*PacketSize];
        ei_prefetch(&blA[0]);

        // gets res block as register
        PacketType C0, C1, C2, C3;
        C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
        C1 = ei_ploadu(&res[(j2+1)*resStride + i]);
        if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]);
        if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]);

        // performs "inner" product
        const Scalar* blB = unpackedB;
        for(Index k=0; k<depth; k++)
        {
          // ...
        }
        // ...
      }
      // ...
    }

    // do the same but with nr==1
    for(Index j2=packet_cols; j2<cols; ++j2)
    {
      // ...

      if(rows-peeled_mc>=PacketSize)
      {
        Index i = peeled_mc;
        const Scalar* blA = &blockA[i*strideA+offsetA*PacketSize];
        ei_prefetch(&blA[0]);
        PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
        const Scalar* blB = unpackedB;
        for(Index k=0; k<depth; k++)
        {
          // ...
        }
        // ...
      }
      // ...
    }
  }
};
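// How the gebp kernel above and the packing routines below fit together, as a
// rough sketch only; the actual product driver lives elsewhere (e.g. in
// GeneralMatrixMatrix.h) and additionally deals with panel offsets, strides and
// the unpackedB buffer:
//
//   for each depth slice k2 of at most maxK rows of B:
//     pack the panel B(k2,:) with ei_gemm_pack_rhs into blockB
//     for each row slice i2 of at most maxM rows of A:
//       pack the block A(i2,k2) with ei_gemm_pack_lhs into blockA
//       call ei_gebp_kernel to accumulate blockA * blockB into res(i2,:)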