fix computation of blocking sizes for small triangular matrices

author: Gael Guennebaud <g.gael@free.fr> 2010-06-24 11:50:28 +0200
committer: Gael Guennebaud <g.gael@free.fr> 2010-06-24 11:50:28 +0200
commit: d44fce501bf299692d578349b92c899c3f0d79cd (patch)
tree: 7c8061b30cace79ccf503f4faa4823e0e80e2290
parent: 0068d3ccf62c3e2960e3f9eb4a31562c0f5025d7 (diff)
3 files changed, 12 insertions, 14 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index cf48ca2f4..7e42eed6e 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -101,7 +101,7 @@ inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
   * - the number of scalars that fit into a packet (when vectorization is enabled).
   *
   * \sa setCpuCacheSizes */
-template<typename LhsScalar, typename RhsScalar>
+template<typename LhsScalar, typename RhsScalar, int KcFactor>
 void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
 {
   // Explanations:
@@ -114,7 +114,7 @@ void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrd
   std::ptrdiff_t l1, l2;
 
   enum {
-    kdiv = 2 * ei_product_blocking_traits<RhsScalar>::nr
+    kdiv = KcFactor * 2 * ei_product_blocking_traits<RhsScalar>::nr
          * ei_packet_traits<RhsScalar>::size * sizeof(RhsScalar),
     mr = ei_product_blocking_traits<LhsScalar>::mr,
     mr_mask = (0xffffffff/mr)*mr
@@ -127,6 +127,12 @@ void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrd
   n = n;
 }
 
+template<typename LhsScalar, typename RhsScalar> // two-parameter convenience overload: calls that do not specify a KcFactor resolve here
+inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
+{
+  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n); // forward with KcFactor = 1, i.e. kdiv keeps its unscaled value
+}
+
 #ifdef EIGEN_HAS_FUSE_CJMADD
   #define CJMADD(A,B,C,T)  C = cj.pmadd(A,B,C);
 #else
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h
index decf515b0..979609649 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -117,9 +117,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
     Index kc = depth; // cache block size along the K direction
     Index mc = rows;  // cache block size along the M direction
     Index nc = cols;  // cache block size along the N direction
-    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
-    // it is better to use smaller blocks along the diagonal
-    kc /= 4;
+    computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
 
     Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
     std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
@@ -245,9 +243,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false,
     Index kc = depth; // cache block size along the K direction
     Index mc = rows;  // cache block size along the M direction
     Index nc = cols;  // cache block size along the N direction
-    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
-    // it is better to use smaller blocks along the diagonal
-    kc /= 4;
+    computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
 
     Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
     std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index 381983459..4723d355a 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -66,9 +66,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora
     Index kc = size; // cache block size along the K direction
     Index mc = size;  // cache block size along the M direction
     Index nc = cols;  // cache block size along the N direction
-    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
-    // it is better to use smaller blocks along the diagonal
-    kc /= 4;
+    computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
 
     Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
     std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
@@ -206,9 +204,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor
     Index kc = size; // cache block size along the K direction
     Index mc = size;  // cache block size along the M direction
     Index nc = rows;  // cache block size along the N direction
-    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
-    // it is better to use smaller blocks along the diagonal
-    kc /= 4;
+    computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
 
     Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
     std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*size;
author	Gael Guennebaud <g.gael@free.fr>	2010-06-24 11:50:28 +0200
committer	Gael Guennebaud <g.gael@free.fr>	2010-06-24 11:50:28 +0200
commit	d44fce501bf299692d578349b92c899c3f0d79cd (patch)
tree	7c8061b30cace79ccf503f4faa4823e0e80e2290
parent	0068d3ccf62c3e2960e3f9eb4a31562c0f5025d7 (diff)