7 files changed, 108 insertions, 12 deletions
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 0a7b07645..2feca365a 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -31,10 +31,10 @@
 
 #ifndef EIGEN_HAS_FUSE_CJMADD
 #define EIGEN_HAS_FUSE_CJMADD 1
-#endif 
+#endif
 
 #ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE
-#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE 8*128*128
+#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (8*256*256)
 #endif
 
 // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
@@ -153,7 +153,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float&  from) {
   return vc;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int&    from)   { 
+template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int&    from)   {
   int EIGEN_ALIGN16 ai[4];
   ai[0] = from;
   Packet4i vc = vec_ld(0, ai);
diff --git a/Eigen/src/Core/arch/Default/Settings.h b/Eigen/src/Core/arch/Default/Settings.h
index 1ab2877b6..150c4bdc7 100644
--- a/Eigen/src/Core/arch/Default/Settings.h
+++ b/Eigen/src/Core/arch/Default/Settings.h
@@ -52,7 +52,7 @@
   * Typically for a single-threaded application you would set that to 25% of the size of your CPU caches in bytes
   */
 #ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE
-#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (sizeof(float)*256*256)
+#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (sizeof(float)*512*512)
 #endif
 
 /** Defines the maximal width of the blocks used in the triangular product and solver
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 96c75101c..d4dd33322 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -32,7 +32,7 @@
 #endif
 
 #ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE
-#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE 4*96*96
+#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (4*192*192)
 #endif
 
 // FIXME NEON has 16 quad registers, but since the current register allocator
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index ca3e4eaf3..dc6c2ebf3 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -25,11 +25,100 @@
 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
 #define EIGEN_GENERAL_BLOCK_PANEL_H
 
+/** \internal Gets or sets (per \a action) the cpu cache size in bytes when \a scalar_size is 0 (through \a a), or the blocking sizes maxK/maxM for scalars of \a scalar_size bytes (through \a a and \a b). */
+inline void ei_manage_caching_sizes(Action action, std::ptrdiff_t* a=0, std::ptrdiff_t* b=0, int scalar_size = 0)
+{
+  const int nbScalarSizes = 12; // one slot per scalar size of 4, 8, ..., 48 bytes
+  static std::ptrdiff_t m_maxK[nbScalarSizes];
+  static std::ptrdiff_t m_maxM[nbScalarSizes];
+  static std::ptrdiff_t m_cpuCacheSize = 0;
+  if(m_cpuCacheSize==0)
+  {
+    // lazy initialization: m_cpuCacheSize is already non-zero when the nested call runs, so it does not recurse further
+    m_cpuCacheSize = EIGEN_TUNE_FOR_CPU_CACHE_SIZE;
+    ei_manage_caching_sizes(SetAction,&m_cpuCacheSize);
+  }
+
+  if(action==SetAction && scalar_size==0)
+  {
+    // set the cpu cache size (*a, in bytes) and recompute the block sizes of every slot from it
+    ei_internal_assert(a!=0 && b==0);
+    m_cpuCacheSize = *a;
+    // the sqrt argument is converted to double explicitly: std::sqrt on an integer is ambiguous in C++98/03
+    for(int i=0, ss=4; i<nbScalarSizes; ++i, ss+=4) // ss is the scalar size in bytes associated with slot i
+    {
+      m_maxK[i] = 4 * std::ptrdiff_t(std::sqrt(double(m_cpuCacheSize/(64*ss))));
+      m_maxM[i] = 2 * m_maxK[i];
+    }
+  }
+  else if(action==SetAction && scalar_size!=0)
+  {
+    // set the block sizes (*a is maxK, *b is maxM) for the given scalar type (represented as its size)
+    ei_internal_assert(a!=0 && b!=0);
+    int i = std::max((scalar_size>>2)-1,0); // sizes below 4 bytes share slot 0
+    if(i<nbScalarSizes) // scalars too large to have a slot: the request is silently ignored
+    {
+      m_maxK[i] = *a;
+      m_maxM[i] = *b;
+    }
+  }
+  else if(action==GetAction && scalar_size==0)
+  {
+    ei_internal_assert(a!=0 && b==0);
+    *a = m_cpuCacheSize; // report the cache size in bytes
+  }
+  else if(action==GetAction && scalar_size!=0)
+  {
+    ei_internal_assert(a!=0 && b!=0);
+    int i = std::min(std::max((scalar_size>>2),1),nbScalarSizes)-1; // clamped to the first/last slot
+    *a = m_maxK[i];
+    *b = m_maxM[i];
+  }
+  else
+  {
+    ei_internal_assert(false); // action is neither GetAction nor SetAction
+  }
+}
+
+/** \returns the currently set cpu cache size (in bytes) used to estimate the ideal blocking size parameters \sa ei_setCpuCacheSize */
+inline std::ptrdiff_t ei_cpuCacheSize() // inline: this header may be included by several translation units
+{
+  std::ptrdiff_t ret; // filled in by the GetAction call below
+  ei_manage_caching_sizes(GetAction, &ret);
+  return ret;
+}
+
+/** Sets the cpu cache size (in bytes) used for blocking.
+  * This function also automatically sets the blocking size parameters for each scalar type using the following formula:
+  * \code
+  *  max_k = 4 * sqrt(cache_size/(64*sizeof(Scalar)));
+  *  max_m = 2 * max_k;
+  * \endcode
+  * overwriting custom values set using the ei_setBlockingSizes function.
+  * \sa ei_setBlockingSizes */
+inline void ei_setCpuCacheSize(std::ptrdiff_t cache_size) { ei_manage_caching_sizes(SetAction,&cache_size); }
+
+/** Overrides the blocking sizes used for scalar types having the same size as \a Scalar:
+  * \a maxK and \a maxM replace the values previously stored for that scalar size.
+  * \sa ei_setCpuCacheSize */
+template<typename Scalar>
+void ei_setBlockingSizes(std::ptrdiff_t maxK, std::ptrdiff_t maxM)
+{
+  ei_manage_caching_sizes(SetAction, &maxK, &maxM, int(sizeof(Scalar)));
+}
+
+/** Retrieves, through \a maxK and \a maxM, the blocking size parameters currently stored for the scalar type \a Scalar.
+  * \sa ei_setBlockingSizes */
+template<typename Scalar>
+void ei_getBlockingSizes(std::ptrdiff_t& maxK, std::ptrdiff_t& maxM)
+{
+  ei_manage_caching_sizes(GetAction, &maxK, &maxM, int(sizeof(Scalar)));
+}
+
 #ifdef EIGEN_HAS_FUSE_CJMADD
-#define CJMADD(A,B,C,T)  C = cj.pmadd(A,B,C);
+  #define CJMADD(A,B,C,T)  C = cj.pmadd(A,B,C);
 #else
-#define CJMADD(A,B,C,T)  T = B; T = cj.pmul(A,T); C = ei_padd(C,T);
-// #define CJMADD(A,B,C,T)  T = A; T = cj.pmul(T,B); C = ei_padd(C,T);
+  #define CJMADD(A,B,C,T)  T = B; T = cj.pmul(A,T); C = ei_padd(C,T);
 #endif
 
 // optimized GEneral packed Block * packed Panel product kernel
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 457173382..3086616f8 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -75,8 +75,11 @@ static void run(Index rows, Index cols, Index depth,
   typedef typename ei_packet_traits<Scalar>::type PacketType;
   typedef ei_product_blocking_traits<Scalar> Blocking;
 
-  Index kc = std::min<Index>(Blocking::Max_kc,depth);  // cache block size along the K direction
-  Index mc = std::min<Index>(Blocking::Max_mc,rows);   // cache block size along the M direction
+  Index kc; // cache block size along the K direction
+  Index mc; // cache block size along the M direction
+  ei_getBlockingSizes<Scalar>(kc, mc);
+  kc = std::min<Index>(kc,depth);
+  mc = std::min<Index>(mc,rows);
 
   ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs;
   ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs;
@@ -235,7 +238,9 @@ struct ei_gemm_functor
 
   Index sharedBlockBSize() const
   {
-    return std::min<Index>(ei_product_blocking_traits<Scalar>::Max_kc,m_rhs.rows()) * m_rhs.cols();
+    std::ptrdiff_t maxKc, maxMc; // must be std::ptrdiff_t: ei_getBlockingSizes takes std::ptrdiff_t& (only maxKc is used)
+    ei_getBlockingSizes<Scalar>(maxKc,maxMc);
+    return std::min<Index>(maxKc,m_rhs.rows()) * m_rhs.cols();
   }
 
   protected:
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index 89c094d31..24d27bce2 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -139,7 +139,7 @@ struct ei_product_blocking_traits
     mr = 2 * PacketSize,
 
     // max cache block size along the K direction
-    Max_kc = 8 * ei_meta_sqrt<EIGEN_TUNE_FOR_CPU_CACHE_SIZE/(64*sizeof(Scalar))>::ret,
+    Max_kc = 4 * ei_meta_sqrt<EIGEN_TUNE_FOR_CPU_CACHE_SIZE/(64*sizeof(Scalar))>::ret,
 
     // max cache block size along the M direction
     Max_mc = 2*Max_kc
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index a586f2a5d..a36e7d05b 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -269,6 +269,8 @@ namespace Architecture
 
 enum { CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct };
 
+enum Action {GetAction, SetAction}; // selects between reading and writing in ei_manage_caching_sizes
+
 /** The type used to identify a dense storage. */
 struct Dense {};