merge my Dynamic -> -1 change

author: Benoit Jacob <jacob.benoit.1@gmail.com> 2010-06-11 08:04:06 -0400
committer: Benoit Jacob <jacob.benoit.1@gmail.com> 2010-06-11 08:04:06 -0400
commit: d72d538747d6c46f37bbcba2f0484414f10cb059 (patch)
tree: f64c6eeb7c33af9763e3e382197b233329bd132c /Eigen/src/Core/products
parent: bdd7c6c88a0b8cb931480e04e33a17aa08022e06 (diff)
parent: 00267e3a471a10e842f771de474f0dca6407a693 (diff)
4 files changed, 209 insertions, 17 deletions
diff --git a/Eigen/src/Core/products/CoeffBasedProduct.h b/Eigen/src/Core/products/CoeffBasedProduct.h
index 170641589..43477282c 100644
--- a/Eigen/src/Core/products/CoeffBasedProduct.h
+++ b/Eigen/src/Core/products/CoeffBasedProduct.h
@@ -54,6 +54,8 @@ struct ei_traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> >
   typedef typename ei_scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar;
   typedef typename ei_promote_storage_type<typename ei_traits<_LhsNested>::StorageKind,
                                            typename ei_traits<_RhsNested>::StorageKind>::ret StorageKind;
+  typedef typename ei_promote_index_type<typename ei_traits<_LhsNested>::Index,
+                                         typename ei_traits<_RhsNested>::Index>::type Index;
 
   enum {
       LhsCoeffReadCost = _LhsNested::CoeffReadCost,
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index d81715528..be20be833 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -25,13 +25,156 @@
 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
 #define EIGEN_GENERAL_BLOCK_PANEL_H
 
-#ifndef EIGEN_EXTERN_INSTANTIATIONS
+/** \internal
+  * Single access point for the cache sizes and the per-scalar-size blocking sizes, all of which
+  * are kept in function-local statics. The meaning of \a a, \a b, \a c depends on the mode:
+  *  - SetAction, scalar_size==0: \a a and \a b hold the L1 and L2 cache sizes; they are stored and
+  *    every entry of the maxK/maxM/maxN tables is recomputed (\a c is asserted to be null).
+  *  - SetAction, scalar_size!=0: \a a, \a b, \a c are stored as maxK, maxM, maxN for that scalar size.
+  *  - GetAction, scalar_size==0: \a a and \a b receive the stored L1 and L2 cache sizes
+  *    (\a c is asserted to be null).
+  *  - GetAction, scalar_size!=0: \a a, \a b, \a c receive maxK, maxM, maxN for that scalar size.
+  * Any other \a action ends in ei_internal_assert(false). */
+inline void ei_manage_caching_sizes(Action action, std::ptrdiff_t* a=0, std::ptrdiff_t* b=0, std::ptrdiff_t* c=0, int scalar_size = 0)
+{
+  const int nbScalarSizes = 12; // table entry i is filled for a scalar size of 4*(i+1) bytes (see the loop below)
+  static std::ptrdiff_t m_maxK[nbScalarSizes];
+  static std::ptrdiff_t m_maxM[nbScalarSizes];
+  static std::ptrdiff_t m_maxN[nbScalarSizes];
+  static std::ptrdiff_t m_l1CacheSize = 0;
+  static std::ptrdiff_t m_l2CacheSize = 0;
+  if(m_l1CacheSize==0) // still at its initial value: nothing has been set yet
+  {
+    // first use: start from the compile-time default EIGEN_TUNE_FOR_CPU_CACHE_SIZE
+    m_l1CacheSize =   EIGEN_TUNE_FOR_CPU_CACHE_SIZE;
+    m_l2CacheSize = 32*EIGEN_TUNE_FOR_CPU_CACHE_SIZE;
+    ei_manage_caching_sizes(SetAction,&m_l1CacheSize, &m_l2CacheSize); // re-enters through the SetAction/scalar_size==0 branch to fill the tables
+  }
+
+  if(action==SetAction && scalar_size==0)
+  {
+    // set the cpu cache sizes and cache all block sizes derived from the global cache sizes in bytes
+    ei_internal_assert(a!=0 && b!=0 && c==0);
+    m_l1CacheSize = *a;
+    m_l2CacheSize = *b;
+    int ss = 4; // scalar size in bytes for table entry i; advanced by 4 on every iteration
+    for(int i=0; i<nbScalarSizes;++i,ss+=4)
+    {
+      // Round the block size such that it is a multiple of 64/ss.
+      // This is to make sure the block sizes are multiples of the register block sizes.
+      // And in the worst case we ensure an even number.
+      std::ptrdiff_t rb = 64/ss; // NOTE(review): rb is computed here but never used by the formulas below
+      if(rb==0) rb = 1;
+      m_maxK[i] = 4 * std::ptrdiff_t(ei_sqrt<float>(m_l1CacheSize/(64*ss))); // integer division first, then a float square root truncated to an integer
+      m_maxM[i] = 2 * m_maxK[i];
+      m_maxN[i] = ((m_l2CacheSize / (2 * m_maxK[i] * ss))/4)*4; // rounded down to a multiple of 4; NOTE(review): divides by m_maxK[i], which is 0 whenever the truncated square root above is 0
+    }
+  }
+  else if(action==SetAction && scalar_size!=0)
+  {
+    // set the block sizes for the given scalar type (represented as its size)
+    ei_internal_assert(a!=0 && b!=0 && c!=0);
+    int i = std::max((scalar_size>>2)-1,0); // table index (scalar_size>>2)-1, never below 0
+    if(i<nbScalarSizes) // a scalar size beyond the table is silently ignored
+    {
+      m_maxK[i] = *a;
+      m_maxM[i] = *b;
+      m_maxN[i] = *c;
+    }
+  }
+  else if(action==GetAction && scalar_size==0)
+  {
+    // report the stored cache sizes
+    ei_internal_assert(a!=0 && b!=0 && c==0);
+    *a = m_l1CacheSize;
+    *b = m_l2CacheSize;
+  }
+  else if(action==GetAction && scalar_size!=0)
+  {
+    // report the block sizes stored for the given scalar size
+    ei_internal_assert(a!=0 && b!=0 && c!=0);
+    int i = std::min(std::max((scalar_size>>2),1),nbScalarSizes)-1; // same index as in the set branch, except that an oversized scalar is clamped to the last entry instead of being ignored
+    *a = m_maxK[i];
+    *b = m_maxM[i];
+    *c = m_maxN[i];
+  }
+  else
+  {
+    ei_internal_assert(false);
+  }
+}
+
+/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
+  * \sa setCpuCacheSizes */
+inline std::ptrdiff_t l1CacheSize()
+{
+  std::ptrdiff_t l1, l2;
+  ei_manage_caching_sizes(GetAction, &l1, &l2); // scalar_size left at its default 0: fetches the cache sizes, not block sizes
+  return l1;
+}
+
+/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
+  * \sa setCpuCacheSizes */
+inline std::ptrdiff_t l2CacheSize()
+{
+  std::ptrdiff_t l1, l2;
+  ei_manage_caching_sizes(GetAction, &l1, &l2); // scalar_size left at its default 0: fetches the cache sizes, not block sizes
+  return l2;
+}
+
+/** Set the cpu L1 and L2 cache sizes (in bytes).
+  * These values are used to adjust the size of the blocks
+  * for the algorithms working per blocks.
+  *
+  * This function also automatically sets the blocking size parameters
+  * for each scalar type using the following rules:
+  * \code
+  *  max_k = 4 * sqrt(l1/(64*sizeof(Scalar)));
+  *  max_m = 2 * max_k;
+  *  max_n = l2/(2*max_k*sizeof(Scalar)); // then rounded down to a multiple of 4
+  * \endcode
+  * overwriting custom values set using the setBlockingSizes function.
+  *
+  * See setBlockingSizes() for an explanation about the meaning of these parameters.
+  *
+  * \param[in] l1 the level 1 cache size, in bytes
+  * \param[in] l2 the level 2 cache size, in bytes
+  *
+  * \sa setBlockingSizes */
+inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
+{
+  ei_manage_caching_sizes(SetAction, &l1, &l2); // scalar_size left at its default 0: stores both sizes and recomputes every block-size entry
+}
+
+/** \brief Set the blocking size parameters \a maxK, \a maxM and \a maxN for the scalar type \a Scalar.
+  *
+  * \param[in] maxK the size of the L1 and L2 blocks along the k dimension
+  * \param[in] maxM the size of the L1 blocks along the m dimension
+  * \param[in] maxN the size of the L2 blocks along the n dimension
+  *
+  * This function sets the blocking size parameters for matrix products and related algorithms.
+  * More precisely, let A * B be a m x k by k x n matrix product. Then Eigen's product like
+  * algorithms perform L2 blocking on B with horizontal panels of size maxK x maxN,
+  * and L1 blocking on A with blocks of size maxM x maxK.
+  *
+  * Theoretically, for best performances maxM should be close to maxK and maxM * maxK should
+  * not exceed half of the L1 cache. Likewise, maxK * maxM should be smaller than the L2 cache.
+  *
+  * Note that in practice there is no distinction between scalar types of same size.
+  *
+  * \sa setCpuCacheSizes */
+template<typename Scalar>
+void setBlockingSizes(std::ptrdiff_t maxK, std::ptrdiff_t maxM, std::ptrdiff_t maxN)
+{
+  std::ptrdiff_t k, m, n;
+  typedef ei_product_blocking_traits<Scalar> Traits;
+  k = ((maxK)/4)*4; // rounded down to a multiple of 4
+  m = ((maxM)/Traits::mr)*Traits::mr; // rounded down to a multiple of Traits::mr
+  n = ((maxN)/Traits::nr)*Traits::nr; // rounded down to a multiple of Traits::nr
+  ei_manage_caching_sizes(SetAction,&k,&m,&n,sizeof(Scalar)); // the scalar type is identified only by its size
+}
+
+/** \returns in \a maxK, \a maxM and \a maxN the blocking size parameters for the scalar type \a Scalar.
+  *
+  * See setBlockingSizes for an explanation about the meaning of these parameters.
+  *
+  * \sa setBlockingSizes */
+template<typename Scalar>
+void getBlockingSizes(std::ptrdiff_t& maxK, std::ptrdiff_t& maxM, std::ptrdiff_t& maxN)
+{
+  ei_manage_caching_sizes(GetAction,&maxK,&maxM,&maxN,sizeof(Scalar)); // the scalar type is identified only by its size
+}
 
 #ifdef EIGEN_HAS_FUSE_CJMADD
-#define CJMADD(A,B,C,T)  C = cj.pmadd(A,B,C);
+  #define CJMADD(A,B,C,T)  C = cj.pmadd(A,B,C);
 #else
-#define CJMADD(A,B,C,T)  T = B; T = cj.pmul(A,T); C = ei_padd(C,T);
-// #define CJMADD(A,B,C,T)  T = A; T = cj.pmul(T,B); C = ei_padd(C,T);
+  #define CJMADD(A,B,C,T)  T = B; T = cj.pmul(A,T); C = ei_padd(C,T);
 #endif
 
 // optimized GEneral packed Block * packed Panel product kernel
@@ -762,6 +905,4 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, RowMajor, PanelMode>
   }
 };
 
-#endif // EIGEN_EXTERN_INSTANTIATIONS
-
 #endif // EIGEN_GENERAL_BLOCK_PANEL_H
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 991977c1f..3513d118e 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -25,8 +25,6 @@
 #ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
 #define EIGEN_GENERAL_MATRIX_MATRIX_H
 
-#ifndef EIGEN_EXTERN_INSTANTIATIONS
-
 /* Specialization for a row-major destination matrix => simple transposition of the product */
 template<
   typename Scalar, typename Index,
@@ -77,8 +75,13 @@ static void run(Index rows, Index cols, Index depth,
   typedef typename ei_packet_traits<Scalar>::type PacketType;
   typedef ei_product_blocking_traits<Scalar> Blocking;
 
-  Index kc = std::min<Index>(Blocking::Max_kc,depth);  // cache block size along the K direction
-  Index mc = std::min<Index>(Blocking::Max_mc,rows);   // cache block size along the M direction
+  Index kc; // cache block size along the K direction
+  Index mc; // cache block size along the M direction
+  Index nc; // cache block size along the N direction
+  getBlockingSizes<Scalar>(kc, mc, nc);
+  kc = std::min<Index>(kc,depth);
+  mc = std::min<Index>(mc,rows);
+  nc = std::min<Index>(nc,cols);
 
   ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs;
   ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs;
@@ -159,7 +162,8 @@ static void run(Index rows, Index cols, Index depth,
   else
 #endif // EIGEN_HAS_OPENMP
   {
-    (void)info; // info is not used
+    EIGEN_UNUSED_VARIABLE(info);
+
     // this is the sequential version!
     Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
     std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
@@ -203,8 +207,6 @@ static void run(Index rows, Index cols, Index depth,
 
 };
 
-#endif // EIGEN_EXTERN_INSTANTIATIONS
-
 /*********************************************************************************
 *  Specialization of GeneralProduct<> for "large" GEMM, i.e.,
 *  implementation of the high level wrapper to ei_general_matrix_matrix_product
@@ -239,7 +241,9 @@ struct ei_gemm_functor
 
   Index sharedBlockBSize() const
   {
-    return std::min<Index>(ei_product_blocking_traits<Scalar>::Max_kc,m_rhs.rows()) * m_rhs.cols();
+    Index maxKc, maxMc, maxNc; // maxMc and maxNc only serve as out-arguments; they are not read below
+    getBlockingSizes<Scalar>(maxKc, maxMc, maxNc);
+    return std::min<Index>(maxKc,m_rhs.rows()) * m_rhs.cols(); // (max k-block size capped by the rhs row count) times the rhs column count
   }
 
   protected:
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 5e4eb6f1e..588f78b4c 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -25,6 +25,50 @@
 #ifndef EIGEN_PARALLELIZER_H
 #define EIGEN_PARALLELIZER_H
 
+/** \internal
+  * Single access point for the thread-count setting, which is kept in a function-local static.
+  *  - SetAction: stores \a *v.
+  *  - GetAction: writes the thread count to \a *v. When EIGEN_HAS_OPENMP is defined this is the
+  *    stored value if it is positive and omp_get_max_threads() otherwise; when it is not
+  *    defined the result is always 1.
+  * Any other \a action ends in ei_internal_assert(false). */
+inline void ei_manage_multi_threading(Action action, int* v)
+{
+  static int m_maxThreads = -1; // any value <= 0 makes GetAction fall back to omp_get_max_threads()
+  
+  if(action==SetAction)
+  {
+    ei_internal_assert(v!=0);
+    m_maxThreads = *v;
+  }
+  else if(action==GetAction)
+  {
+    ei_internal_assert(v!=0);
+    #ifdef EIGEN_HAS_OPENMP
+    if(m_maxThreads>0)
+      *v = m_maxThreads;
+    else
+      *v = omp_get_max_threads();
+    #else
+    *v = 1; // the stored value is ignored in this configuration
+    #endif
+  }
+  else
+  {
+    ei_internal_assert(false);
+  }
+}
+
+/** \returns the max number of threads reserved for Eigen, as reported by
+  * ei_manage_multi_threading(GetAction, ...)
+  * \sa setNbThreads */
+inline int nbThreads()
+{
+  int ret; // filled in by the call below
+  ei_manage_multi_threading(GetAction, &ret);
+  return ret;
+}
+
+/** Sets the max number of threads reserved for Eigen
+  * \param[in] v the value to store; it is forwarded unchanged to ei_manage_multi_threading(SetAction, ...)
+  * \sa nbThreads */
+inline void setNbThreads(int v)
+{
+  ei_manage_multi_threading(SetAction, &v);
+}
+
 template<typename BlockBScalar, typename Index> struct GemmParallelInfo
 {
   GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0), blockB(0) {}
@@ -57,10 +101,10 @@ void ei_parallelize_gemm(const Functor& func, Index rows, Index cols)
 
   // 2- compute the maximal number of threads from the size of the product:
   // FIXME this has to be fine tuned
-  Index max_threads = std::max(1,rows / 32);
+  Index max_threads = std::max<Index>(1,rows / 32);
 
   // 3 - compute the number of threads we are going to use
-  Index threads = std::min<Index>(omp_get_max_threads(), max_threads);
+  Index threads = std::min<Index>(nbThreads(), max_threads);
 
   if(threads==1)
     return func(0,rows, 0,cols);
@@ -71,7 +115,8 @@ void ei_parallelize_gemm(const Functor& func, Index rows, Index cols)
   typedef typename Functor::BlockBScalar BlockBScalar;
   BlockBScalar* sharedBlockB = new BlockBScalar[func.sharedBlockBSize()];
 
-  GemmParallelInfo<BlockBScalar>* info = new GemmParallelInfo<BlockBScalar>[threads];
+  GemmParallelInfo<BlockBScalar,Index>* info = new
+		  GemmParallelInfo<BlockBScalar,Index>[threads];
 
   #pragma omp parallel for schedule(static,1) num_threads(threads)
   for(Index i=0; i<threads; ++i)
author	Benoit Jacob <jacob.benoit.1@gmail.com>	2010-06-11 08:04:06 -0400
committer	Benoit Jacob <jacob.benoit.1@gmail.com>	2010-06-11 08:04:06 -0400
commit	d72d538747d6c46f37bbcba2f0484414f10cb059 (patch)
tree	f64c6eeb7c33af9763e3e382197b233329bd132c /Eigen/src/Core/products
parent	bdd7c6c88a0b8cb931480e04e33a17aa08022e06 (diff)
parent	00267e3a471a10e842f771de474f0dca6407a693 (diff)