// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.

#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

/** \internal */
inline void ei_manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
{
  static std::ptrdiff_t m_l1CacheSize = 0;
  static std::ptrdiff_t m_l2CacheSize = 0;
  if(m_l1CacheSize==0)
  {
    m_l1CacheSize = ei_queryL1CacheSize();
    m_l2CacheSize = ei_queryTopLevelCacheSize();

    // fall back to conservative defaults if the runtime queries failed
    if(m_l1CacheSize<=0) m_l1CacheSize = 8 * 1024;
    if(m_l2CacheSize<=0) m_l2CacheSize = 1 * 1024 * 1024;
  }

  if(action==SetAction)
  {
    // set the cpu cache sizes (in bytes) from which all block sizes are derived
    ei_internal_assert(l1!=0 && l2!=0);
    m_l1CacheSize = *l1;
    m_l2CacheSize = *l2;
  }
  else if(action==GetAction)
  {
    ei_internal_assert(l1!=0 && l2!=0);
    *l1 = m_l1CacheSize;
    *l2 = m_l2CacheSize;
  }
  else
  {
    ei_internal_assert(false);
  }
}

/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSizes */
inline std::ptrdiff_t l1CacheSize()
{
  std::ptrdiff_t l1, l2;
  ei_manage_caching_sizes(GetAction, &l1, &l2);
  return l1;
}

/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSizes */
inline std::ptrdiff_t l2CacheSize()
{
  std::ptrdiff_t l1, l2;
  ei_manage_caching_sizes(GetAction, &l1, &l2);
  return l2;
}

/** Set the cpu L1 and L2 cache sizes (in bytes).
  * These values are used to adjust the size of the blocks
  * for the algorithms working per blocks.
  *
  * \sa computeProductBlockingSizes */
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
{
  ei_manage_caching_sizes(SetAction, &l1, &l2);
}
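// Usage sketch (illustrative only; the numeric values below are hypothetical,
// the real sizes are queried from the CPU or set by the user at runtime):
//
//   std::ptrdiff_t l1 = l1CacheSize();      // e.g. 32*1024
//   std::ptrdiff_t l2 = l2CacheSize();      // e.g. 1*1024*1024
//   setCpuCacheSizes(64*1024, 512*1024);    // override the detected values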
/** \brief Computes the blocking parameters for a m x k times k x n matrix product
  *
  * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
  * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
  * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
  *
  * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
  * this function computes the blocking size parameters along the respective dimensions
  * for matrix products and related algorithms. The blocking sizes depend on various
  * parameters:
  * - the L1 and L2 cache sizes,
  * - the register level blocking sizes defined by ei_product_blocking_traits,
  * - the number of scalars that fit into a packet (when vectorization is enabled).
  *
  * \sa setCpuCacheSizes */
template<typename LhsScalar, typename RhsScalar, int KcFactor>
void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
{
  // Explanations:
  // Let's recall that the product algorithms form kc x nc horizontal panels B' on the rhs and
  // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
  // per kc x nr vertical small panels where nr is the blocking size along the n dimension
  // at the register level. For vectorization purposes, these small vertical panels are unpacked,
  // i.e., each coefficient is replicated to fit a packet. This small vertical panel has to
  // stay in L1 cache.
  std::ptrdiff_t l1, l2;

  enum {
    kdiv = KcFactor * 2 * ei_product_blocking_traits<RhsScalar>::nr
         * ei_packet_traits<RhsScalar>::size * sizeof(RhsScalar),
    mr = ei_product_blocking_traits<LhsScalar>::mr,
    mr_mask = (0xffffffff/mr)*mr
  };

  ei_manage_caching_sizes(GetAction, &l1, &l2);
  // kc: the unpacked kc x nr panel of the rhs must fit into L1 cache
  k = std::min<std::ptrdiff_t>(k, l1/kdiv);
  // mc: the mc x kc block of the lhs must fit into L2 cache;
  // round mc down to a multiple of the register blocking size mr
  std::ptrdiff_t _m = l2/(4 * sizeof(LhsScalar) * k);
  if(_m<m) m = _m & mr_mask;
  // n is left unchanged: there is no blocking along the n dimension
}

template<typename LhsScalar, typename RhsScalar>
inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
{
  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
}

#ifdef EIGEN_HAS_FUSE_CJMADD
#define CJMADD(A,B,C,T)  C = cj.pmadd(A,B,C);
#else
#define CJMADD(A,B,C,T)  T = B; T = cj.pmul(A,T); C = ei_padd(C,T);
#endif

// optimized GEneral packed Block * packed Panel product kernel
template<typename Scalar, typename Index, int mr, int nr, typename Conj>
struct ei_gebp_kernel
{
  void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB,
                  Index rows, Index depth, Index cols,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0,
                  Scalar* unpackedB = 0)
  {
    typedef typename ei_packet_traits<Scalar>::type PacketType;
    enum { PacketSize = ei_packet_traits<Scalar>::size };
    if(strideA==-1) strideA = depth;
    if(strideB==-1) strideB = depth;
    Conj cj;
    Index packet_cols = (cols/nr) * nr;
    const Index peeled_mc = (rows/mr)*mr;
    const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0);
    const Index peeled_kc = (depth/4)*4; // depth rounded down to a multiple of 4

    // if the caller did not provide a buffer for the unpacked rhs panel,
    // assume one has been reserved right before blockB
    if(unpackedB==0)
      unpackedB = const_cast<Scalar*>(blockB - strideB * nr * PacketSize);

    // loops on each micro vertical panel of rhs (depth x nr)
    for(Index j2=0; j2<packet_cols; j2+=nr)
    {
      // unpack B: each coefficient of the current depth x nr panel is
      // replicated into a whole packet so that the kernel below performs
      // aligned packet loads only
      {
        const Scalar* blB = &blockB[j2*strideB+offsetB*nr];
        for(Index k=0; k<depth*nr; k++)
          ei_pstore(&unpackedB[k*PacketSize], ei_pset1(blB[k]));
      }

      // loops on each micro horizontal panel of lhs (mr x depth)
      // => we select a mr x nr micro block of res which is entirely
      //    stored into mr/packet_size x nr registers.
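      // Illustration with hypothetical sizes (PacketSize==4, mr==2*PacketSize==8, nr==4):
      // the 8 x 4 micro block of res is held in mr/PacketSize * nr == 8 packet
      // registers, and each iteration of the depth loop accumulates one rank-1
      // update (two lhs packets times four replicated rhs coefficients) into them:
      //
      //                  rhs:  b0   b1   b2   b3
      //    lhs packet 0:       C0   C1   C2   C3
      //    lhs packet 1:       C4   C5   C6   C7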
      for(Index i=0; i<peeled_mc; i+=mr)
      {
        const Scalar* blA = &blockA[i*strideA+offsetA*mr];
        ei_prefetch(&blA[0]);

        // gets res block as registers (this kernel assumes mr==2*PacketSize)
        PacketType C0, C1, C2, C3, C4, C5, C6, C7;
        C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
        C1 = ei_ploadu(&res[(j2+1)*resStride + i]);
        if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]);
        if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]);
        C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
        C5 = ei_ploadu(&res[(j2+1)*resStride + i + PacketSize]);
        if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i + PacketSize]);
        if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i + PacketSize]);

        // performs "inner" products along the depth dimension
        const Scalar* blB = unpackedB;
        for(Index k=0; k<depth; k++)
        {
          PacketType T0, A0, A1, B0;
          A0 = ei_pload(&blA[0*PacketSize]);
          A1 = ei_pload(&blA[1*PacketSize]);
          B0 = ei_pload(&blB[0*PacketSize]);
          CJMADD(A0,B0,C0,T0);
          CJMADD(A1,B0,C4,T0);
          B0 = ei_pload(&blB[1*PacketSize]);
          CJMADD(A0,B0,C1,T0);
          CJMADD(A1,B0,C5,T0);
          if(nr==4)
          {
            B0 = ei_pload(&blB[2*PacketSize]);
            CJMADD(A0,B0,C2,T0);
            CJMADD(A1,B0,C6,T0);
            B0 = ei_pload(&blB[3*PacketSize]);
            CJMADD(A0,B0,C3,T0);
            CJMADD(A1,B0,C7,T0);
          }
          blA += mr;
          blB += nr*PacketSize;
        }

        ei_pstoreu(&res[(j2+0)*resStride + i], C0);
        ei_pstoreu(&res[(j2+1)*resStride + i], C1);
        if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2);
        if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3);
        ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
        ei_pstoreu(&res[(j2+1)*resStride + i + PacketSize], C5);
        if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i + PacketSize], C6);
        if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i + PacketSize], C7);
      }
      if(rows-peeled_mc>=PacketSize)
      {
        Index i = peeled_mc;
        const Scalar* blA = &blockA[i*strideA+offsetA*PacketSize];
        ei_prefetch(&blA[0]);

        // gets res block as registers
        PacketType C0, C1, C2, C3;
        C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
        C1 = ei_ploadu(&res[(j2+1)*resStride + i]);
        if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]);
        if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]);

        // performs "inner" product
        const Scalar* blB = unpackedB;
        for(Index k=0; k<depth; k++)
        {
          PacketType T0, A0, B0;
          A0 = ei_pload(&blA[k*PacketSize]);
          B0 = ei_pload(&blB[0*PacketSize]);
          CJMADD(A0,B0,C0,T0);
          B0 = ei_pload(&blB[1*PacketSize]);
          CJMADD(A0,B0,C1,T0);
          if(nr==4)
          {
            B0 = ei_pload(&blB[2*PacketSize]);
            CJMADD(A0,B0,C2,T0);
            B0 = ei_pload(&blB[3*PacketSize]);
            CJMADD(A0,B0,C3,T0);
          }
          blB += nr*PacketSize;
        }

        ei_pstoreu(&res[(j2+0)*resStride + i], C0);
        ei_pstoreu(&res[(j2+1)*resStride + i], C1);
        if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2);
        if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3);
      }
      // remaining rows are processed one scalar coefficient at a time
      for(Index i=peeled_mc2; i<rows; i++)
      {
        const Scalar* blA = &blockA[i*strideA+offsetA];
        ei_prefetch(&blA[0]);

        // gets a 1 x nr res block as registers
        Scalar C0(0), C1(0), C2(0), C3(0), T0;
        const Scalar* blB = &blockB[j2*strideB+offsetB*nr];
        for(Index k=0; k<depth; k++)
        {
          Scalar A0 = blA[k];
          CJMADD(A0,blB[0],C0,T0);
          CJMADD(A0,blB[1],C1,T0);
          if(nr==4) { CJMADD(A0,blB[2],C2,T0); }
          if(nr==4) { CJMADD(A0,blB[3],C3,T0); }
          blB += nr;
        }
        res[(j2+0)*resStride + i] += C0;
        res[(j2+1)*resStride + i] += C1;
        if(nr==4) res[(j2+2)*resStride + i] += C2;
        if(nr==4) res[(j2+3)*resStride + i] += C3;
      }
    }

    // process remaining rhs/res columns one at a time
    // => do the same but with nr==1
    for(Index j2=packet_cols; j2<cols; j2++)
    {
      // unpack B
      {
        const Scalar* blB = &blockB[j2*strideB+offsetB];
        for(Index k=0; k<depth; k++)
          ei_pstore(&unpackedB[k*PacketSize], ei_pset1(blB[k]));
      }

      for(Index i=0; i<peeled_mc; i+=mr)
      {
        const Scalar* blA = &blockA[i*strideA+offsetA*mr];
        ei_prefetch(&blA[0]);

        PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
        PacketType C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);

        const Scalar* blB = unpackedB;
        for(Index k=0; k<depth; k++)
        {
          PacketType T0, A0, A1, B0;
          A0 = ei_pload(&blA[0*PacketSize]);
          A1 = ei_pload(&blA[1*PacketSize]);
          B0 = ei_pload(&blB[k*PacketSize]);
          CJMADD(A0,B0,C0,T0);
          CJMADD(A1,B0,C4,T0);
          blA += mr;
        }

        ei_pstoreu(&res[(j2+0)*resStride + i], C0);
        ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4);
      }
      if(rows-peeled_mc>=PacketSize)
      {
        Index i = peeled_mc;
        const Scalar* blA = &blockA[i*strideA+offsetA*PacketSize];
        ei_prefetch(&blA[0]);
        PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
        const Scalar* blB = unpackedB;
        for(Index k=0; k<depth; k++)
        {
          PacketType T0, A0, B0;
          A0 = ei_pload(&blA[k*PacketSize]);
          B0 = ei_pload(&blB[k*PacketSize]);
          CJMADD(A0,B0,C0,T0);
        }
        ei_pstoreu(&res[(j2+0)*resStride + i], C0);
      }
      for(Index i=peeled_mc2; i<rows; i++)
      {
        const Scalar* blA = &blockA[i*strideA+offsetA];
        const Scalar* blB = &blockB[j2*strideB+offsetB];
        Scalar C0(0), T0;
        for(Index k=0; k<depth; k++)
        {
          CJMADD(blA[k],blB[k],C0,T0);
        }
        res[(j2+0)*resStride + i] += C0;
      }
    }
  }
};

// pack a block of the lhs into contiguous, sequentially accessible memory:
// the rows of each mr x depth (resp. PacketSize x depth) horizontal panel are
// interleaved so that the product kernel can read the lhs packet per packet
template<typename Scalar, typename Index, int mr, int StorageOrder, bool Conjugate, bool PanelMode>
struct ei_gemm_pack_lhs
{
  void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows,
                  Index stride=0, Index offset=0)
  {
    enum { PacketSize = ei_packet_traits<Scalar>::size };
    ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    ei_const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
    Index count = 0;
    Index peeled_mc = (rows/mr)*mr;
    for(Index i=0; i<peeled_mc; i+=mr)
    {
      if(PanelMode) count += mr * offset;
      for(Index k=0; k<depth; k++)
        for(Index w=0; w<mr; w++)
          blockA[count++] = cj(lhs(i+w, k));
      if(PanelMode) count += mr * (stride-offset-depth);
    }
    if(rows-peeled_mc>=PacketSize)
    {
      if(PanelMode) count += PacketSize*offset;
      for(Index k=0; k<depth; k++)
        for(Index w=0; w<PacketSize; w++)
          blockA[count++] = cj(lhs(peeled_mc+w, k));
      if(PanelMode) count += PacketSize * (stride-offset-depth);
      peeled_mc += PacketSize;
    }
    // remaining rows are packed one scalar at a time
    for(Index i=peeled_mc; i<rows; i++)
    {
      if(PanelMode) count += offset;
      for(Index k=0; k<depth; k++)
        blockA[count++] = cj(lhs(i, k));
      if(PanelMode) count += stride-offset-depth;
    }
  }
};

// copy a complete panel of the rhs (depth x cols), interleaved by chunks of nr
// columns and scaled by alpha; this version is optimized for column major matrices
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
{
  typedef typename ei_packet_traits<Scalar>::type Packet;
  enum { PacketSize = ei_packet_traits<Scalar>::size };
  void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Scalar alpha, Index depth, Index cols,
                  Index stride=0, Index offset=0)
  {
    ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    bool hasAlpha = alpha != Scalar(1);
    Index packet_cols = (cols/nr) * nr;
    Index count = 0;
    for(Index j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
      if(PanelMode) count += nr * offset;
      ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      const Scalar* b1 = &rhs[(j2+1)*rhsStride];
      const Scalar* b2 = &rhs[(j2+2)*rhsStride];
      const Scalar* b3 = &rhs[(j2+3)*rhsStride];
      if(hasAlpha)
        for(Index k=0; k<depth; k++)
        {
          blockB[count+0] = alpha*cj(b0[k]);
          blockB[count+1] = alpha*cj(b1[k]);
          if(nr==4) blockB[count+2] = alpha*cj(b2[k]);
          if(nr==4) blockB[count+3] = alpha*cj(b3[k]);
          count += nr;
        }
      else
        for(Index k=0; k<depth; k++)
        {
          blockB[count+0] = cj(b0[k]);
          blockB[count+1] = cj(b1[k]);
          if(nr==4) blockB[count+2] = cj(b2[k]);
          if(nr==4) blockB[count+3] = cj(b3[k]);
          count += nr;
        }
      // skip what we have after
      if(PanelMode) count += nr * (stride-offset-depth);
    }
    // copy the remaining columns one at a time (nr==1)
    for(Index j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += offset;
      ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      for(Index k=0; k<depth; k++)
      {
        blockB[count] = hasAlpha ? alpha*cj(b0[k]) : cj(b0[k]);
        count += 1;
      }
      if(PanelMode) count += stride-offset-depth;
    }
  }
};

// this version is optimized for row major matrices
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct ei_gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
{
  enum { PacketSize = ei_packet_traits<Scalar>::size };
  void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Scalar alpha, Index depth, Index cols,
                  Index stride=0, Index offset=0)
  {
    ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    bool hasAlpha = alpha != Scalar(1);
    Index packet_cols = (cols/nr) * nr;
    Index count = 0;
    for(Index j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
      if(PanelMode) count += nr * offset;
      ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
      if(hasAlpha)
        for(Index k=0; k<depth; k++)
        {
          const Scalar* b0 = &rhs[k*rhsStride + j2];
          blockB[count+0] = alpha*cj(b0[0]);
          blockB[count+1] = alpha*cj(b0[1]);
          if(nr==4) blockB[count+2] = alpha*cj(b0[2]);
          if(nr==4) blockB[count+3] = alpha*cj(b0[3]);
          count += nr;
        }
      else
        for(Index k=0; k<depth; k++)
        {
          const Scalar* b0 = &rhs[k*rhsStride + j2];
          blockB[count+0] = cj(b0[0]);
          blockB[count+1] = cj(b0[1]);
          if(nr==4) blockB[count+2] = cj(b0[2]);
          if(nr==4) blockB[count+3] = cj(b0[3]);
          count += nr;
        }
      // skip what we have after
      if(PanelMode) count += nr * (stride-offset-depth);
    }
    // copy the remaining columns one at a time (nr==1)
    for(Index j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += offset;
      ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
      const Scalar* b0 = &rhs[j2];
      for(Index k=0; k<depth; k++)
      {
        blockB[count] = hasAlpha ? alpha*cj(b0[k*rhsStride]) : cj(b0[k*rhsStride]);
        count += 1;
      }
      if(PanelMode) count += stride-offset-depth;
    }
  }
};
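// How the pieces above fit together (an illustrative sketch only; the actual
// driver lives in GeneralMatrixMatrix.h and additionally deals with buffer
// allocation, panel offsets, conjugation flags and multithreading):
//
//   std::ptrdiff_t kc = depth, mc = rows, nc = cols;
//   computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);  // shrink kc and mc to fit the caches
//   for each depth slice of size kc:
//     ei_gemm_pack_rhs<...>()(blockB, rhsPtr, rhsStride, alpha, kc, cols);  // pack and scale B'
//     for each mc x kc block of the lhs:
//       ei_gemm_pack_lhs<...>()(blockA, lhsPtr, lhsStride, kc, mc);         // pack A'
//       ei_gebp_kernel<...>()(resPtr, resStride, blockA, blockB, mc, kc, cols);

#endif // EIGEN_GENERAL_BLOCK_PANEL_H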