slightly optimize computeProductBlockingSizes by explicitly precomputing what is known at compile time

author: Gael Guennebaud <g.gael@free.fr> 2010-06-22 11:10:38 +0200
committer: Gael Guennebaud <g.gael@free.fr> 2010-06-22 11:10:38 +0200
commit: fd9a9fa0aedea13866481309e8a5408d1d7c652b (patch)
tree: 92318e74abaed5d729cf2239ae2892087ac2abe1
parent: 3ae0eee0b853d3a8b9c61a078af55f3035ace152 (diff)
2 files changed, 18 insertions, 9 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 4e5ca77e7..cf48ca2f4 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -93,7 +93,7 @@ inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
   * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
   *
   * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
-  * this function computes the blocking size parameters along the respective dimensions 
+  * this function computes the blocking size parameters along the respective dimensions
   * for matrix products and related algorithms. The blocking sizes depends on various
   * parameters:
   * - the L1 and L2 cache sizes,
@@ -112,11 +112,18 @@ void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrd
   // i.e., each coefficient is replicated to fit a packet. This small vertical panel has to
   // stay in L1 cache.
   std::ptrdiff_t l1, l2;
+
+  enum {
+    kdiv = 2 * ei_product_blocking_traits<RhsScalar>::nr
+         * ei_packet_traits<RhsScalar>::size * sizeof(RhsScalar),
+    mr = ei_product_blocking_traits<LhsScalar>::mr,
+    mr_mask = (0xffffffff/mr)*mr
+  };
+
   ei_manage_caching_sizes(GetAction, &l1, &l2);
-  k = std::min<std::ptrdiff_t>(k, l1/(2 * ei_product_blocking_traits<RhsScalar>::nr
-                                        * ei_packet_traits<RhsScalar>::size * sizeof(RhsScalar)));
-  std::ptrdiff_t _m = l2/(4 * k * sizeof(LhsScalar));
-  if(_m<m) m = (_m/ei_product_blocking_traits<LhsScalar>::mr) * ei_product_blocking_traits<LhsScalar>::mr;
+  k = std::min<std::ptrdiff_t>(k, l1/kdiv);
+  std::ptrdiff_t _m = l2/(4 * sizeof(LhsScalar) * k);
+  if(_m<m) m = _m & mr_mask;
   n = n;
 }
 
diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp
index e5b991130..4142236e9 100644
--- a/bench/bench_gemm.cpp
+++ b/bench/bench_gemm.cpp
@@ -2,8 +2,8 @@
 // g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2  ./a.out
 // icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp  && OMP_NUM_THREADS=2  ./a.out
 
-#include <Eigen/Core>
 #include <iostream>
+#include <Eigen/Core>
 #include <bench/BenchTimer.h>
 
 using namespace std;
@@ -70,8 +70,6 @@ int main(int argc, char ** argv)
   std::cout << "L1 cache size    = " << (l1>0 ? l1/1024 : -1) << " KB\n";
   std::cout << "L2/L3 cache size = " << (l2>0 ? l2/1024 : -1) << " KB\n";
 
-  setCpuCacheSizes(ei_queryL1CacheSize()/1,ei_queryTopLevelCacheSize()/2);
-  
   int rep = 1;    // number of repetitions per try
   int tries = 2;  // number of tries, we keep the best
 
@@ -85,13 +83,17 @@ int main(int argc, char ** argv)
       s = atoi(argv[i]+1);
     else if(argv[i][0]=='c')
       cache_size = atoi(argv[i]+1);
+    else if(argv[i][0]=='t')
+      tries = atoi(argv[i]+1);
+    else if(argv[i][0]=='p')
+      rep = atoi(argv[i]+1);
     else
       need_help = true;
   }
 
   if(need_help)
   {
-    std::cout << argv[0] << " s<matrix size> c<cache size> \n";
+    std::cout << argv[0] << " s<matrix size> c<cache size> t<nb tries> p<nb repeats>\n";
     return 1;
   }
author	Gael Guennebaud <g.gael@free.fr>	2010-06-22 11:10:38 +0200
committer	Gael Guennebaud <g.gael@free.fr>	2010-06-22 11:10:38 +0200
commit	fd9a9fa0aedea13866481309e8a5408d1d7c652b (patch)
tree	92318e74abaed5d729cf2239ae2892087ac2abe1
parent	3ae0eee0b853d3a8b9c61a078af55f3035ace152 (diff)