diff options
author | 2010-06-11 08:04:06 -0400 | |
---|---|---|
committer | 2010-06-11 08:04:06 -0400 | |
commit | d72d538747d6c46f37bbcba2f0484414f10cb059 (patch) | |
tree | f64c6eeb7c33af9763e3e382197b233329bd132c /Eigen/src/Core/products | |
parent | bdd7c6c88a0b8cb931480e04e33a17aa08022e06 (diff) | |
parent | 00267e3a471a10e842f771de474f0dca6407a693 (diff) |
merge my Dynamic -> -1 change
Diffstat (limited to 'Eigen/src/Core/products')
-rw-r--r-- | Eigen/src/Core/products/CoeffBasedProduct.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 153 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixMatrix.h | 20 | ||||
-rw-r--r-- | Eigen/src/Core/products/Parallelizer.h | 51 |
4 files changed, 209 insertions, 17 deletions
diff --git a/Eigen/src/Core/products/CoeffBasedProduct.h b/Eigen/src/Core/products/CoeffBasedProduct.h index 170641589..43477282c 100644 --- a/Eigen/src/Core/products/CoeffBasedProduct.h +++ b/Eigen/src/Core/products/CoeffBasedProduct.h @@ -54,6 +54,8 @@ struct ei_traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> > typedef typename ei_scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar; typedef typename ei_promote_storage_type<typename ei_traits<_LhsNested>::StorageKind, typename ei_traits<_RhsNested>::StorageKind>::ret StorageKind; + typedef typename ei_promote_index_type<typename ei_traits<_LhsNested>::Index, + typename ei_traits<_RhsNested>::Index>::type Index; enum { LhsCoeffReadCost = _LhsNested::CoeffReadCost, diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index d81715528..be20be833 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -25,13 +25,156 @@ #ifndef EIGEN_GENERAL_BLOCK_PANEL_H #define EIGEN_GENERAL_BLOCK_PANEL_H -#ifndef EIGEN_EXTERN_INSTANTIATIONS +/** \internal */ +inline void ei_manage_caching_sizes(Action action, std::ptrdiff_t* a=0, std::ptrdiff_t* b=0, std::ptrdiff_t* c=0, int scalar_size = 0) +{ + const int nbScalarSizes = 12; + static std::ptrdiff_t m_maxK[nbScalarSizes]; + static std::ptrdiff_t m_maxM[nbScalarSizes]; + static std::ptrdiff_t m_maxN[nbScalarSizes]; + static std::ptrdiff_t m_l1CacheSize = 0; + static std::ptrdiff_t m_l2CacheSize = 0; + if(m_l1CacheSize==0) + { + // initialization + m_l1CacheSize = EIGEN_TUNE_FOR_CPU_CACHE_SIZE; + m_l2CacheSize = 32*EIGEN_TUNE_FOR_CPU_CACHE_SIZE; + ei_manage_caching_sizes(SetAction,&m_l1CacheSize, &m_l2CacheSize); + } + + if(action==SetAction && scalar_size==0) + { + // set the cpu cache size and cache all block sizes from a global cache size in byte + ei_internal_assert(a!=0 && b!=0 && c==0); + 
m_l1CacheSize = *a; + m_l2CacheSize = *b; + int ss = 4; + for(int i=0; i<nbScalarSizes;++i,ss+=4) + { + // Round the block size such that it is a multiple of 64/ss. + // This is to make sure the block sizes are multiples of the register block sizes. + // And in the worst case we ensure an even number. + std::ptrdiff_t rb = 64/ss; + if(rb==0) rb = 1; + m_maxK[i] = 4 * std::ptrdiff_t(ei_sqrt<float>(m_l1CacheSize/(64*ss))); + m_maxM[i] = 2 * m_maxK[i]; + m_maxN[i] = ((m_l2CacheSize / (2 * m_maxK[i] * ss))/4)*4; + } + } + else if(action==SetAction && scalar_size!=0) + { + // set the block sizes for the given scalar type (represented as its size) + ei_internal_assert(a!=0 && b!=0 && c!=0); + int i = std::max((scalar_size>>2)-1,0); + if(i<nbScalarSizes) + { + m_maxK[i] = *a; + m_maxM[i] = *b; + m_maxN[i] = *c; + } + } + else if(action==GetAction && scalar_size==0) + { + ei_internal_assert(a!=0 && b!=0 && c==0); + *a = m_l1CacheSize; + *b = m_l2CacheSize; + } + else if(action==GetAction && scalar_size!=0) + { + ei_internal_assert(a!=0 && b!=0 && c!=0); + int i = std::min(std::max((scalar_size>>2),1),nbScalarSizes)-1; + *a = m_maxK[i]; + *b = m_maxM[i]; + *c = m_maxN[i]; + } + else + { + ei_internal_assert(false); + } +} + +/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters. + * \sa setCpuCacheSizes */ +inline std::ptrdiff_t l1CacheSize() +{ + std::ptrdiff_t l1, l2; + ei_manage_caching_sizes(GetAction, &l1, &l2); + return l1; +} + +/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters. + * \sa setCpuCacheSizes */ +inline std::ptrdiff_t l2CacheSize() +{ + std::ptrdiff_t l1, l2; + ei_manage_caching_sizes(GetAction, &l1, &l2); + return l2; +} + +/** Set the cpu L1 and L2 cache sizes (in bytes). + * These values are used to adjust the size of the blocks + * for the algorithms working per block.
+ * + * This function also automatically sets the blocking size parameters + * for each scalar type using the following rules: + * \code + * max_k = 4 * sqrt(l1/(64*sizeof(Scalar))); + * max_m = 2 * max_k; + * max_n = l2/(2*max_k*sizeof(Scalar)); + * \endcode + * overwriting custom values set using the setBlockingSizes function. + * + * See setBlockingSizes() for an explanation about the meaning of these parameters. + * + * \sa setBlockingSizes */ +inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2) +{ + ei_manage_caching_sizes(SetAction, &l1, &l2); +} + +/** \brief Set the blocking size parameters \a maxK, \a maxM and \a maxN for the scalar type \a Scalar. + * + * \param[in] maxK the size of the L1 and L2 blocks along the k dimension + * \param[in] maxM the size of the L1 blocks along the m dimension + * \param[in] maxN the size of the L2 blocks along the n dimension + * + * This function sets the blocking size parameters for matrix products and related algorithms. + * More precisely, let A * B be a m x k by k x n matrix product. Then Eigen's product-like + * algorithms perform L2 blocking on B with horizontal panels of size maxK x maxN, + * and L1 blocking on A with blocks of size maxM x maxK. + * + * Theoretically, for best performance maxM should be close to maxK and maxM * maxK should + * not exceed half of the L1 cache. Likewise, maxK * maxN should be smaller than the L2 cache. + * + * Note that in practice there is no distinction between scalar types of the same size.
+ * + * \sa setCpuCacheSizes */ +template<typename Scalar> +void setBlockingSizes(std::ptrdiff_t maxK, std::ptrdiff_t maxM, std::ptrdiff_t maxN) +{ + std::ptrdiff_t k, m, n; + typedef ei_product_blocking_traits<Scalar> Traits; + k = ((maxK)/4)*4; + m = ((maxM)/Traits::mr)*Traits::mr; + n = ((maxN)/Traits::nr)*Traits::nr; + ei_manage_caching_sizes(SetAction,&k,&m,&n,sizeof(Scalar)); +} + +/** \returns in \a maxK, \a maxM and \a maxN the blocking size parameters for the scalar type \a Scalar. + * + * See setBlockingSizes for an explanation about the meaning of these parameters. + * + * \sa setBlockingSizes */ +template<typename Scalar> +void getBlockingSizes(std::ptrdiff_t& maxK, std::ptrdiff_t& maxM, std::ptrdiff_t& maxN) +{ + ei_manage_caching_sizes(GetAction,&maxK,&maxM,&maxN,sizeof(Scalar)); +} #ifdef EIGEN_HAS_FUSE_CJMADD -#define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C); + #define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C); #else -#define CJMADD(A,B,C,T) T = B; T = cj.pmul(A,T); C = ei_padd(C,T); -// #define CJMADD(A,B,C,T) T = A; T = cj.pmul(T,B); C = ei_padd(C,T); + #define CJMADD(A,B,C,T) T = B; T = cj.pmul(A,T); C = ei_padd(C,T); #endif // optimized GEneral packed Block * packed Panel product kernel @@ -762,6 +905,4 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, RowMajor, PanelMode> } }; -#endif // EIGEN_EXTERN_INSTANTIATIONS - #endif // EIGEN_GENERAL_BLOCK_PANEL_H diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 991977c1f..3513d118e 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -25,8 +25,6 @@ #ifndef EIGEN_GENERAL_MATRIX_MATRIX_H #define EIGEN_GENERAL_MATRIX_MATRIX_H -#ifndef EIGEN_EXTERN_INSTANTIATIONS - /* Specialization for a row-major destination matrix => simple transposition of the product */ template< typename Scalar, typename Index, @@ -77,8 +75,13 @@ static void run(Index rows, Index cols, Index depth, typedef typename
ei_packet_traits<Scalar>::type PacketType; typedef ei_product_blocking_traits<Scalar> Blocking; - Index kc = std::min<Index>(Blocking::Max_kc,depth); // cache block size along the K direction - Index mc = std::min<Index>(Blocking::Max_mc,rows); // cache block size along the M direction + Index kc; // cache block size along the K direction + Index mc; // cache block size along the M direction + Index nc; // cache block size along the N direction + getBlockingSizes<Scalar>(kc, mc, nc); + kc = std::min<Index>(kc,depth); + mc = std::min<Index>(mc,rows); + nc = std::min<Index>(nc,cols); ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs; ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs; @@ -159,7 +162,8 @@ static void run(Index rows, Index cols, Index depth, else #endif // EIGEN_HAS_OPENMP { - (void)info; // info is not used + EIGEN_UNUSED_VARIABLE(info); + // this is the sequential version! Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; @@ -203,8 +207,6 @@ static void run(Index rows, Index cols, Index depth, }; -#endif // EIGEN_EXTERN_INSTANTIATIONS - /********************************************************************************* * Specialization of GeneralProduct<> for "large" GEMM, i.e., * implementation of the high level wrapper to ei_general_matrix_matrix_product @@ -239,7 +241,9 @@ struct ei_gemm_functor Index sharedBlockBSize() const { - return std::min<Index>(ei_product_blocking_traits<Scalar>::Max_kc,m_rhs.rows()) * m_rhs.cols(); + Index maxKc, maxMc, maxNc; + getBlockingSizes<Scalar>(maxKc, maxMc, maxNc); + return std::min<Index>(maxKc,m_rhs.rows()) * m_rhs.cols(); } protected: diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 5e4eb6f1e..588f78b4c 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -25,6 +25,50 @@ #ifndef EIGEN_PARALLELIZER_H 
#define EIGEN_PARALLELIZER_H +/** \internal */ +inline void ei_manage_multi_threading(Action action, int* v) +{ + static int m_maxThreads = -1; + + if(action==SetAction) + { + ei_internal_assert(v!=0); + m_maxThreads = *v; + } + else if(action==GetAction) + { + ei_internal_assert(v!=0); + #ifdef EIGEN_HAS_OPENMP + if(m_maxThreads>0) + *v = m_maxThreads; + else + *v = omp_get_max_threads(); + #else + *v = 1; + #endif + } + else + { + ei_internal_assert(false); + } +} + +/** \returns the max number of threads reserved for Eigen + * \sa setNbThreads */ +inline int nbThreads() +{ + int ret; + ei_manage_multi_threading(GetAction, &ret); + return ret; +} + +/** Sets the max number of threads reserved for Eigen + * \sa nbThreads */ +inline void setNbThreads(int v) +{ + ei_manage_multi_threading(SetAction, &v); +} + template<typename BlockBScalar, typename Index> struct GemmParallelInfo { GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0), blockB(0) {} @@ -57,10 +101,10 @@ void ei_parallelize_gemm(const Functor& func, Index rows, Index cols) // 2- compute the maximal number of threads from the size of the product: // FIXME this has to be fine tuned - Index max_threads = std::max(1,rows / 32); + Index max_threads = std::max<Index>(1,rows / 32); // 3 - compute the number of threads we are going to use - Index threads = std::min<Index>(omp_get_max_threads(), max_threads); + Index threads = std::min<Index>(nbThreads(), max_threads); if(threads==1) return func(0,rows, 0,cols); @@ -71,7 +115,8 @@ void ei_parallelize_gemm(const Functor& func, Index rows, Index cols) typedef typename Functor::BlockBScalar BlockBScalar; BlockBScalar* sharedBlockB = new BlockBScalar[func.sharedBlockBSize()]; - GemmParallelInfo<BlockBScalar>* info = new GemmParallelInfo<BlockBScalar>[threads]; + GemmParallelInfo<BlockBScalar,Index>* info = new + GemmParallelInfo<BlockBScalar,Index>[threads]; #pragma omp parallel for schedule(static,1) num_threads(threads) for(Index i=0; i<threads; 
++i) |