From 0212eec23f4cb64e8426bf32568156df302f8fcf Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 21 Jun 2010 23:28:50 +0200 Subject: simplify and optimize block sizes computation for matrix products. They are now automatically computed from the L1 and L2 cache sizes which are themselves automatically determined at runtime. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 135 +++++++--------------- Eigen/src/Core/products/GeneralMatrixMatrix.h | 17 ++- bench/bench_gemm.cpp | 10 +- 3 files changed, 55 insertions(+), 107 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index be20be833..167a1ea8d 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -26,66 +26,31 @@ #define EIGEN_GENERAL_BLOCK_PANEL_H /** \internal */ -inline void ei_manage_caching_sizes(Action action, std::ptrdiff_t* a=0, std::ptrdiff_t* b=0, std::ptrdiff_t* c=0, int scalar_size = 0) +inline void ei_manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0) { - const int nbScalarSizes = 12; - static std::ptrdiff_t m_maxK[nbScalarSizes]; - static std::ptrdiff_t m_maxM[nbScalarSizes]; - static std::ptrdiff_t m_maxN[nbScalarSizes]; static std::ptrdiff_t m_l1CacheSize = 0; static std::ptrdiff_t m_l2CacheSize = 0; if(m_l1CacheSize==0) { - // initialization - m_l1CacheSize = EIGEN_TUNE_FOR_CPU_CACHE_SIZE; - m_l2CacheSize = 32*EIGEN_TUNE_FOR_CPU_CACHE_SIZE; - ei_manage_caching_sizes(SetAction,&m_l1CacheSize, &m_l2CacheSize); + m_l1CacheSize = ei_queryL1CacheSize(); + m_l2CacheSize = ei_queryTopLevelCacheSize(); + + if(m_l1CacheSize==0) m_l1CacheSize = 8 * 1024; + if(m_l2CacheSize==0) m_l2CacheSize = 1 * 1024 * 1024; } - if(action==SetAction && scalar_size==0) + if(action==SetAction) { // set the cpu cache size and cache all block sizes from a global cache size in byte - ei_internal_assert(a!=0 && b!=0 && c==0); - m_l1CacheSize = *a; - m_l2CacheSize = *b; - int ss = 4; - for(int i=0; i(m_l1CacheSize/(64*ss))); - m_maxM[i] = 2 * m_maxK[i]; - m_maxN[i] = ((m_l2CacheSize / (2 * m_maxK[i] * ss))/4)*4; - } - } - else if(action==SetAction && scalar_size!=0) - { - // set the block sizes for the given scalar type (represented as its size) - ei_internal_assert(a!=0 && b!=0 && c!=0); - int i = std::max((scalar_size>>2)-1,0); - if(i>2),1),nbScalarSizes)-1; - *a = m_maxK[i]; - *b = m_maxM[i]; - *c = m_maxN[i]; + ei_internal_assert(l1!=0 && l2!=0); + *l1 = m_l1CacheSize; + *l2 = m_l2CacheSize; } else { @@ -115,60 +80,44 @@ inline std::ptrdiff_t l2CacheSize() * These values are use to adjust the size of the blocks * for the algorithms working per blocks. * - * This function also automatically set the blocking size parameters - * for each scalar type using the following rules: - * \code - * max_k = 4 * sqrt(l1/(64*sizeof(Scalar))); - * max_m = 2 * k; - * max_n = l2/(2*max_k*sizeof(Scalar)); - * \endcode - * overwriting custom values set using the setBlockingSizes function. - * - * See setBlockingSizes() for an explanation about the meaning of these parameters. - * - * \sa setBlockingSizes */ + * \sa computeProductBlockingSizes */ inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2) { ei_manage_caching_sizes(SetAction, &l1, &l2); } -/** \brief Set the blocking size parameters \a maxK, \a maxM and \a maxN for the scalar type \a Scalar. - * - * \param[in] maxK the size of the L1 and L2 blocks along the k dimension - * \param[in] maxM the size of the L1 blocks along the m dimension - * \param[in] maxN the size of the L2 blocks along the n dimension - * - * This function sets the blocking size parameters for matrix products and related algorithms. - * More precisely, let A * B be a m x k by k x n matrix product. Then Eigen's product like - * algorithms perform L2 blocking on B with horizontal panels of size maxK x maxN, - * and L1 blocking on A with blocks of size maxM x maxK. +/** \brief Computes the blocking parameters for a m x k times k x n matrix product * - * Theoretically, for best performances maxM should be closed to maxK and maxM * maxK should - * note exceed half of the L1 cache. Likewise, maxK * maxM should be smaller than the L2 cache. + * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension. + * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension. + * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension. * - * Note that in practice there is no distinction between scalar types of same size. + * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar, + * this function computes the blocking size parameters along the respective dimensions + * for matrix products and related algorithms. The blocking sizes depends on various + * parameters: + * - the L1 and L2 cache sizes, + * - the register level blocking sizes defined by ei_product_blocking_traits, + * - the number of scalars that fit into a packet (when vectorization is enabled). * * \sa setCpuCacheSizes */ -template -void setBlockingSizes(std::ptrdiff_t maxK, std::ptrdiff_t maxM, std::ptrdiff_t maxN) -{ - std::ptrdiff_t k, m, n; - typedef ei_product_blocking_traits Traits; - k = ((maxK)/4)*4; - m = ((maxM)/Traits::mr)*Traits::mr; - n = ((maxN)/Traits::nr)*Traits::nr; - ei_manage_caching_sizes(SetAction,&k,&m,&n,sizeof(Scalar)); -} - -/** \returns in \a makK, \a maxM and \a maxN the blocking size parameters for the scalar type \a Scalar. - * - * See setBlockingSizes for an explanation about the meaning of these parameters. - * - * \sa setBlockingSizes */ -template -void getBlockingSizes(std::ptrdiff_t& maxK, std::ptrdiff_t& maxM, std::ptrdiff_t& maxN) +template +void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n) { - ei_manage_caching_sizes(GetAction,&maxK,&maxM,&maxN,sizeof(Scalar)); + // Explanations: + // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and + // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed + // per kc x nr vertical small panels where nr is the blocking size along the n dimension + // at the register level. For vectorization purpose, these small vertical panels are unpacked, + // i.e., each coefficient is replicated to fit a packet. This small vertical panel has to + // stay in L1 cache. + std::ptrdiff_t l1, l2; + ei_manage_caching_sizes(GetAction, &l1, &l2); + k = std::min(k, l1/(2 * ei_product_blocking_traits::nr + * ei_packet_traits::size * sizeof(RhsScalar))); + std::ptrdiff_t _m = l2/(4 * k * sizeof(LhsScalar)); + if(_m::mr) * ei_product_blocking_traits::mr; + n = n; } #ifdef EIGEN_HAS_FUSE_CJMADD diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 3513d118e..9139976c3 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -75,13 +75,10 @@ static void run(Index rows, Index cols, Index depth, typedef typename ei_packet_traits::type PacketType; typedef ei_product_blocking_traits Blocking; - Index kc; // cache block size along the K direction - Index mc; // cache block size along the M direction - Index nc; // cache block size along the N direction - getBlockingSizes(kc, mc, nc); - kc = std::min(kc,depth); - mc = std::min(mc,rows); - nc = std::min(nc,cols); + Index kc = depth; // cache block size along the K direction + Index mc = rows; // cache block size along the M direction + Index nc = cols; // cache block size along the N direction + computeProductBlockingSizes(kc, mc, nc); ei_gemm_pack_rhs pack_rhs; ei_gemm_pack_lhs pack_lhs; @@ -241,9 +238,9 @@ struct ei_gemm_functor Index sharedBlockBSize() const { - Index maxKc, maxMc, maxNc; - getBlockingSizes(maxKc, maxMc, maxNc); - return std::min(maxKc,m_rhs.rows()) * m_rhs.cols(); + Index kc = m_rhs.rows(), mc = m_lhs.rows(), nc = m_rhs.cols(); + computeProductBlockingSizes(kc, mc, nc); + return kc * nc;; } protected: diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp index ee34e6ddc..ca1e05c8d 100644 --- a/bench/bench_gemm.cpp +++ b/bench/bench_gemm.cpp @@ -66,10 +66,12 @@ void gemm(const M& a, const M& b, M& c) int main(int argc, char ** argv) { std::cout << "L1 cache size = " << ei_queryL1CacheSize()/1024 << " KB\n"; - std::cout << "L2/L3 cache size = " << ei_queryTopLevelCacheSize()/1024 << " KB\n"; + std::cout << "L2/L3 cache size = " << ei_queryTopLevelCacheSize()/1024 << " KB\n"; + + setCpuCacheSizes(ei_queryL1CacheSize()/1,ei_queryTopLevelCacheSize()/2); int rep = 1; // number of repetitions per try - int tries = 5; // number of tries, we keep the best + int tries = 2; // number of tries, we keep the best int s = 2048; int cache_size = -1; @@ -102,8 +104,8 @@ int main(int argc, char ** argv) M c(m,p); c.setOnes(); std::cout << "Matrix sizes = " << m << "x" << p << " * " << p << "x" << n << "\n"; - std::ptrdiff_t cm, cn, ck; - getBlockingSizes(ck, cm, cn); + std::ptrdiff_t cm(m), cn(n), ck(p); + computeProductBlockingSizes(ck, cm, cn); std::cout << "blocking size = " << cm << " x " << ck << "\n"; M r = c; -- cgit v1.2.3