From 842b54fe8051fb334da98652e9ea47533c646c33 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 10 Jun 2010 22:11:31 +0200 Subject: make the cache size mechanism future proof by adding level 2 parameters --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 128 ++++++++++++++-------- Eigen/src/Core/products/GeneralMatrixMatrix.h | 4 +- test/product_large.cpp | 17 +++ 3 files changed, 105 insertions(+), 44 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 12934b3b9..be20be833 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -26,53 +26,66 @@ #define EIGEN_GENERAL_BLOCK_PANEL_H /** \internal */ -inline void ei_manage_caching_sizes(Action action, std::ptrdiff_t* a=0, std::ptrdiff_t* b=0, int scalar_size = 0) +inline void ei_manage_caching_sizes(Action action, std::ptrdiff_t* a=0, std::ptrdiff_t* b=0, std::ptrdiff_t* c=0, int scalar_size = 0) { const int nbScalarSizes = 12; static std::ptrdiff_t m_maxK[nbScalarSizes]; static std::ptrdiff_t m_maxM[nbScalarSizes]; - static std::ptrdiff_t m_cpuCacheSize = 0; - if(m_cpuCacheSize==0) + static std::ptrdiff_t m_maxN[nbScalarSizes]; + static std::ptrdiff_t m_l1CacheSize = 0; + static std::ptrdiff_t m_l2CacheSize = 0; + if(m_l1CacheSize==0) { // initialization - m_cpuCacheSize = EIGEN_TUNE_FOR_CPU_CACHE_SIZE; - ei_manage_caching_sizes(SetAction,&m_cpuCacheSize); + m_l1CacheSize = EIGEN_TUNE_FOR_CPU_CACHE_SIZE; + m_l2CacheSize = 32*EIGEN_TUNE_FOR_CPU_CACHE_SIZE; + ei_manage_caching_sizes(SetAction,&m_l1CacheSize, &m_l2CacheSize); } if(action==SetAction && scalar_size==0) { // set the cpu cache size and cache all block sizes from a global cache size in byte - ei_internal_assert(a!=0 && b==0); - m_cpuCacheSize = *a; + ei_internal_assert(a!=0 && b!=0 && c==0); + m_l1CacheSize = *a; + m_l2CacheSize = *b; int ss = 4; for(int i=0; i(m_cpuCacheSize/(64*ss))); + // Round 
the block size such that it is a multiple of 64/ss. + // This is to make sure the block sizes are multiples of the register block sizes. + // And in the worst case we ensure an even number. + std::ptrdiff_t rb = 64/ss; + if(rb==0) rb = 1; + m_maxK[i] = 4 * std::ptrdiff_t(ei_sqrt(m_l1CacheSize/(64*ss))); m_maxM[i] = 2 * m_maxK[i]; + m_maxN[i] = ((m_l2CacheSize / (2 * m_maxK[i] * ss))/4)*4; } } else if(action==SetAction && scalar_size!=0) { // set the block sizes for the given scalar type (represented as its size) - ei_internal_assert(a!=0 && b!=0); + ei_internal_assert(a!=0 && b!=0 && c!=0); int i = std::max((scalar_size>>2)-1,0); if(i>2),1),nbScalarSizes)-1; *a = m_maxK[i]; *b = m_maxM[i]; + *c = m_maxN[i]; } else { @@ -80,53 +93,82 @@ inline void ei_manage_caching_sizes(Action action, std::ptrdiff_t* a=0, std::ptr } } -/** \returns the currently set cpu cache size (in bytes) used to estimate the ideal blocking size parameters. - * \sa setL1CacheSize */ +/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters. + * \sa setCpuCacheSizes */ inline std::ptrdiff_t l1CacheSize() { - std::ptrdiff_t ret; - ei_manage_caching_sizes(GetAction, &ret); - return ret; + std::ptrdiff_t l1, l2; + ei_manage_caching_sizes(GetAction, &l1, &l2); + return l1; } -/** Set the cpu cache size (in bytes) for blocking. +/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters. + * \sa setCpuCacheSizes */ +inline std::ptrdiff_t l2CacheSize() +{ + std::ptrdiff_t l1, l2; + ei_manage_caching_sizes(GetAction, &l1, &l2); + return l2; +} + +/** Set the cpu L1 and L2 cache sizes (in bytes). + * These values are used to adjust the size of the blocks + * for the algorithms working per blocks. 
+ * * This function also automatically set the blocking size parameters - * for each scalar type using the following formula: + * for each scalar type using the following rules: * \code - * max_k = 4 * sqrt(cache_size/(64*sizeof(Scalar))); + * max_k = 4 * sqrt(l1/(64*sizeof(Scalar))); * max_m = 2 * k; + * max_n = l2/(2*max_k*sizeof(Scalar)); * \endcode - * overwriting custom values set using the ei_setBlockingSizes function. - * - * \b Explanations: \n - * Let A * B be a m x k times k x n matrix product. Then Eigen's product yield - * L2 blocking on B with panels of size max_k x n, and L1 blocking on A, - * with blocks of size max_m x max_k. - * - * \sa ei_setBlockingSizes */ -inline void setL1CacheSize(std::ptrdiff_t cache_size) { ei_manage_caching_sizes(SetAction,&cache_size); } - -/** Set the blocking size parameters \a maxK and \a maxM for the scalar type \a Scalar. + * overwriting custom values set using the setBlockingSizes function. + * + * See setBlockingSizes() for an explanation about the meaning of these parameters. + * + * \sa setBlockingSizes */ +inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2) +{ + ei_manage_caching_sizes(SetAction, &l1, &l2); +} + +/** \brief Set the blocking size parameters \a maxK, \a maxM and \a maxN for the scalar type \a Scalar. + * + * \param[in] maxK the size of the L1 and L2 blocks along the k dimension + * \param[in] maxM the size of the L1 blocks along the m dimension + * \param[in] maxN the size of the L2 blocks along the n dimension + * + * This function sets the blocking size parameters for matrix products and related algorithms. + * More precisely, let A * B be a m x k by k x n matrix product. Then Eigen's product-like + * algorithms perform L2 blocking on B with horizontal panels of size maxK x maxN, + * and L1 blocking on A with blocks of size maxM x maxK. + * + * Theoretically, for best performance maxM should be close to maxK and maxM * maxK should + * not exceed half of the L1 cache. 
Likewise, maxK * maxN should be smaller than the L2 cache. + * * Note that in practice there is no distinction between scalar types of same size. - * - * See ei_setCpuCacheSize for an explanation about the meaning of maxK and maxM. - * - * \sa setL1CacheSize */ + * + * \sa setCpuCacheSizes */ template -void setBlockingSizes(std::ptrdiff_t maxK, std::ptrdiff_t maxM) +void setBlockingSizes(std::ptrdiff_t maxK, std::ptrdiff_t maxM, std::ptrdiff_t maxN) { - ei_manage_caching_sizes(SetAction,&maxK,&maxM,sizeof(Scalar)); + std::ptrdiff_t k, m, n; + typedef ei_product_blocking_traits Traits; + k = ((maxK)/4)*4; + m = ((maxM)/Traits::mr)*Traits::mr; + n = ((maxN)/Traits::nr)*Traits::nr; + ei_manage_caching_sizes(SetAction,&k,&m,&n,sizeof(Scalar)); } -/** \returns in \a makK, \a maxM the blocking size parameters for the scalar type \a Scalar. +/** \returns in \a maxK, \a maxM and \a maxN the blocking size parameters for the scalar type \a Scalar. + * + * See setBlockingSizes for an explanation about the meaning of these parameters. * - * See ei_setCpuCacheSize for an explanation about the meaning of maxK and maxM. 
- * - * \sa setL1CacheSize */ + * \sa setBlockingSizes */ template -void getBlockingSizes(std::ptrdiff_t& maxK, std::ptrdiff_t& maxM) +void getBlockingSizes(std::ptrdiff_t& maxK, std::ptrdiff_t& maxM, std::ptrdiff_t& maxN) { - ei_manage_caching_sizes(GetAction,&maxK,&maxM,sizeof(Scalar)); + ei_manage_caching_sizes(GetAction,&maxK,&maxM,&maxN,sizeof(Scalar)); } #ifdef EIGEN_HAS_FUSE_CJMADD diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 07721145a..801ed2792 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -77,9 +77,11 @@ static void run(Index rows, Index cols, Index depth, Index kc; // cache block size along the K direction Index mc; // cache block size along the M direction - getBlockingSizes(kc, mc); + Index nc; // cache block size along the N direction + getBlockingSizes(kc, mc, nc); kc = std::min(kc,depth); mc = std::min(mc,rows); + nc = std::min(nc,cols); ei_gemm_pack_rhs pack_rhs; ei_gemm_pack_lhs pack_lhs; diff --git a/test/product_large.cpp b/test/product_large.cpp index 519213236..0351d134c 100644 --- a/test/product_large.cpp +++ b/test/product_large.cpp @@ -49,5 +49,22 @@ void test_product_large() MatrixXf a = MatrixXf::Random(10,4), b = MatrixXf::Random(4,10), c = a; VERIFY_IS_APPROX((a = a * b), (c * b).eval()); } + + { + // check the functions to setup blocking sizes compile and do not segfault + // FIXME check they do what they are supposed to do !! + std::ptrdiff_t l1 = ei_random(10000,20000); + std::ptrdiff_t l2 = ei_random(1000000,2000000); + setCpuCacheSizes(l1,l2); + VERIFY(l1==l1CacheSize()); + VERIFY(l2==l2CacheSize()); + std::ptrdiff_t k1 = ei_random(10,100)*16; + std::ptrdiff_t m1 = ei_random(10,100)*16; + std::ptrdiff_t n1 = ei_random(10,100)*16; + setBlockingSizes(k1,m1,n1); + std::ptrdiff_t k, m, n; + getBlockingSizes(k,m,n); + VERIFY(k==k1 && m==m1 && n==n1); + } #endif } -- cgit v1.2.3