7 files changed, 45 insertions, 26 deletions
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 9139976c3..062d75ba9 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -284,7 +284,7 @@ class GeneralProduct<Lhs, Rhs, GemmProduct>
         _ActualRhsType,
         Dest> GemmFunctor;
 
-      ei_parallelize_gemm<(Dest::MaxRowsAtCompileTime>32)>(GemmFunctor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols());
+      ei_parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols());
     }
 };
 
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 588f78b4c..750fa7b5f 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -29,7 +29,7 @@
 inline void ei_manage_multi_threading(Action action, int* v)
 {
   static int m_maxThreads = -1;
-  
+
   if(action==SetAction)
   {
     ei_internal_assert(v!=0);
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index 31726e66d..cc9333384 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -258,8 +258,10 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate
 
     typedef ei_product_blocking_traits<Scalar> Blocking;
 
-    Index kc = std::min<Index>(Blocking::Max_kc,size);  // cache block size along the K direction
-    Index mc = std::min<Index>(Blocking::Max_mc,rows);  // cache block size along the M direction
+    Index kc = size; // cache block size along the K direction
+    Index mc = rows;  // cache block size along the M direction
+    Index nc = cols;  // cache block size along the N direction
+    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
 
     Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
     std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
@@ -339,8 +341,10 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,Conjugat
 
     typedef ei_product_blocking_traits<Scalar> Blocking;
 
-    Index kc = std::min<Index>(Blocking::Max_kc,size);  // cache block size along the K direction
-    Index mc = std::min<Index>(Blocking::Max_mc,rows);  // cache block size along the M direction
+    Index kc = size; // cache block size along the K direction
+    Index mc = rows;  // cache block size along the M direction
+    Index nc = cols;  // cache block size along the N direction
+    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
 
     Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
     std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h
index befc4ff69..8ce797cff 100644
--- a/Eigen/src/Core/products/SelfadjointProduct.h
+++ b/Eigen/src/Core/products/SelfadjointProduct.h
@@ -70,14 +70,16 @@ struct ei_selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpL
 
     typedef ei_product_blocking_traits<Scalar> Blocking;
 
-    Index kc = std::min<Index>(Blocking::Max_kc,depth); // cache block size along the K direction
-    Index mc = std::min<Index>(Blocking::Max_mc,size);  // cache block size along the M direction
+    Index kc = depth; // cache block size along the K direction
+    Index mc = size;  // cache block size along the M direction
+    Index nc = size;  // cache block size along the N direction
+    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
 
     Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
     std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*size;
     Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
     Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
-    
+
     // note that the actual rhs is the transpose/adjoint of mat
     typedef ei_conj_helper<NumTraits<Scalar>::IsComplex && !AAT, NumTraits<Scalar>::IsComplex && AAT> Conj;
 
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h
index a099160c2..decf515b0 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -114,8 +114,12 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
       IsLower = (Mode&Lower) == Lower
     };
 
-    Index kc = std::min<Index>(Blocking::Max_kc/4,depth); // cache block size along the K direction
-    Index mc = std::min<Index>(Blocking::Max_mc,rows);    // cache block size along the M direction
+    Index kc = depth; // cache block size along the K direction
+    Index mc = rows;  // cache block size along the M direction
+    Index nc = cols;  // cache block size along the N direction
+    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
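+    // it is better to use smaller blocks along the diagonal. NOTE(review): kc/4 truncates to 0 when kc < 4 (the old code used min(Max_kc/4, depth)) - verify tiny sizes.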
+    kc /= 4;
 
     Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
     std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
@@ -238,8 +242,12 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false,
       IsLower = (Mode&Lower) == Lower
     };
 
-    Index kc = std::min<Index>(Blocking::Max_kc/4,depth); // cache block size along the K direction
-    Index mc = std::min<Index>(Blocking::Max_mc,rows);    // cache block size along the M direction
+    Index kc = depth; // cache block size along the K direction
+    Index mc = rows;  // cache block size along the M direction
+    Index nc = cols;  // cache block size along the N direction
+    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
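+    // it is better to use smaller blocks along the diagonal. NOTE(review): kc/4 truncates to 0 when kc < 4 (the old code used min(Max_kc/4, depth)) - verify tiny sizes.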
+    kc /= 4;
 
     Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
     std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
@@ -273,7 +281,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false,
       Index rs = IsLower ? std::min(cols,actual_k2) : cols - k2;
       // size of the triangular part
       Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;
-      
+
       Scalar* geb = blockB+ts*ts;
 
       pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, alpha, actual_kc, rs);
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index 11e08c3b5..381983459 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -63,8 +63,12 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora
       IsLower = (Mode&Lower) == Lower
     };
 
-    Index kc = std::min<Index>(Blocking::Max_kc/4,size); // cache block size along the K direction
-    Index mc = std::min<Index>(Blocking::Max_mc,size);   // cache block size along the M direction
+    Index kc = size; // cache block size along the K direction
+    Index mc = size;  // cache block size along the M direction
+    Index nc = cols;  // cache block size along the N direction
+    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
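+    // it is better to use smaller blocks along the diagonal. NOTE(review): kc/4 truncates to 0 when kc < 4 (the old code used min(Max_kc/4, size)) - verify tiny sizes.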
+    kc /= 4;
 
     Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
     std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
@@ -196,8 +200,15 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor
       IsLower = (Mode&Lower) == Lower
     };
 
-    Index kc = std::min<Index>(Blocking::Max_kc/4,size); // cache block size along the K direction
-    Index mc = std::min<Index>(Blocking::Max_mc,size);   // cache block size along the M direction
+//     Index kc = std::min<Index>(Blocking::Max_kc/4,size); // cache block size along the K direction
+//     Index mc = std::min<Index>(Blocking::Max_mc,size);   // cache block size along the M direction
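+    // TODO: check that these blocking sizes are right for the OnTheRight case (kc and mc both start from size, nc from rows)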
+    Index kc = size; // cache block size along the K direction
+    Index mc = size;  // cache block size along the M direction
+    Index nc = rows;  // cache block size along the N direction
+    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
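+    // it is better to use smaller blocks along the diagonal. NOTE(review): kc/4 truncates to 0 when kc < 4 (the old code used min(Max_kc/4, size)) - verify tiny sizes.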
+    kc /= 4;
 
     Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
     std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*size;
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index a5fa1532d..c86d70fb2 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -123,7 +123,7 @@ class ei_const_blas_data_mapper
     Index m_stride;
 };
 
-// Defines various constant controlling level 3 blocking
+// Defines various constants controlling register blocking for matrix-matrix algorithms.
 template<typename Scalar>
 struct ei_product_blocking_traits
 {
@@ -136,13 +136,7 @@ struct ei_product_blocking_traits
     nr = NumberOfRegisters/4,
 
     // register block size along the M direction (currently, this one cannot be modified)
-    mr = 2 * PacketSize,
-
-    // max cache block size along the K direction
-    Max_kc = 4 * ei_meta_sqrt<EIGEN_TUNE_FOR_CPU_CACHE_SIZE/(64*sizeof(Scalar))>::ret,
-
-    // max cache block size along the M direction
-    Max_mc = 2*Max_kc
+    mr = 2 * PacketSize
   };
 };