diff options
author | Gael Guennebaud <g.gael@free.fr> | 2010-06-22 11:10:38 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2010-06-22 11:10:38 +0200 |
commit | fd9a9fa0aedea13866481309e8a5408d1d7c652b (patch) | |
tree | 92318e74abaed5d729cf2239ae2892087ac2abe1 | |
parent | 3ae0eee0b853d3a8b9c61a078af55f3035ace152 (diff) |
slightly optimize computeProductBlockingSizes by explicitly precomputing what is known at compile time
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 17 | ||||
-rw-r--r-- | bench/bench_gemm.cpp | 10 |
2 files changed, 18 insertions, 9 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 4e5ca77e7..cf48ca2f4 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -93,7 +93,7 @@ inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2) * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension. * * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar, - * this function computes the blocking size parameters along the respective dimensions + * this function computes the blocking size parameters along the respective dimensions * for matrix products and related algorithms. The blocking sizes depends on various * parameters: * - the L1 and L2 cache sizes, @@ -112,11 +112,18 @@ void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrd // i.e., each coefficient is replicated to fit a packet. This small vertical panel has to // stay in L1 cache. std::ptrdiff_t l1, l2; + + enum { + kdiv = 2 * ei_product_blocking_traits<RhsScalar>::nr + * ei_packet_traits<RhsScalar>::size * sizeof(RhsScalar), + mr = ei_product_blocking_traits<LhsScalar>::mr, + mr_mask = (0xffffffff/mr)*mr + }; + ei_manage_caching_sizes(GetAction, &l1, &l2); - k = std::min<std::ptrdiff_t>(k, l1/(2 * ei_product_blocking_traits<RhsScalar>::nr - * ei_packet_traits<RhsScalar>::size * sizeof(RhsScalar))); - std::ptrdiff_t _m = l2/(4 * k * sizeof(LhsScalar)); - if(_m<m) m = (_m/ei_product_blocking_traits<LhsScalar>::mr) * ei_product_blocking_traits<LhsScalar>::mr; + k = std::min<std::ptrdiff_t>(k, l1/kdiv); + std::ptrdiff_t _m = l2/(4 * sizeof(LhsScalar) * k); + if(_m<m) m = _m & mr_mask; n = n; } diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp index e5b991130..4142236e9 100644 --- a/bench/bench_gemm.cpp +++ b/bench/bench_gemm.cpp @@ -2,8 +2,8 @@ // g++-4.4 bench_gemm.cpp -I .. 
-O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2 ./a.out // icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out -#include <Eigen/Core> #include <iostream> +#include <Eigen/Core> #include <bench/BenchTimer.h> using namespace std; @@ -70,8 +70,6 @@ int main(int argc, char ** argv) std::cout << "L1 cache size = " << (l1>0 ? l1/1024 : -1) << " KB\n"; std::cout << "L2/L3 cache size = " << (l2>0 ? l2/1024 : -1) << " KB\n"; - setCpuCacheSizes(ei_queryL1CacheSize()/1,ei_queryTopLevelCacheSize()/2); - int rep = 1; // number of repetitions per try int tries = 2; // number of tries, we keep the best @@ -85,13 +83,17 @@ int main(int argc, char ** argv) s = atoi(argv[i]+1); else if(argv[i][0]=='c') cache_size = atoi(argv[i]+1); + else if(argv[i][0]=='t') + tries = atoi(argv[i]+1); + else if(argv[i][0]=='p') + rep = atoi(argv[i]+1); else need_help = true; } if(need_help) { - std::cout << argv[0] << " s<matrix size> c<cache size> \n"; + std::cout << argv[0] << " s<matrix size> c<cache size> t<nb tries> p<nb repeats>\n"; return 1; } |