From 3ac2b96a2f131e8162d39f0976cfb31b1a853237 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 26 Feb 2010 12:32:00 +0100 Subject: implement a smarter parallelization strategy for gemm avoiding multiple paking of the same data --- bench/bench_gemm_blas.cpp | 83 ----------------------------------------------- 1 file changed, 83 deletions(-) delete mode 100644 bench/bench_gemm_blas.cpp (limited to 'bench/bench_gemm_blas.cpp') diff --git a/bench/bench_gemm_blas.cpp b/bench/bench_gemm_blas.cpp deleted file mode 100644 index babf1ec2c..000000000 --- a/bench/bench_gemm_blas.cpp +++ /dev/null @@ -1,83 +0,0 @@ - -#include -#include - -extern "C" -{ - #include - #include -} - -using namespace std; -using namespace Eigen; - -#ifndef SCALAR -#define SCALAR float -#endif - -typedef SCALAR Scalar; -typedef Matrix M; - -static float fone = 1; -static float fzero = 0; -static double done = 1; -static double szero = 0; -static char notrans = 'N'; -static char trans = 'T'; -static char nonunit = 'N'; -static char lower = 'L'; -static char right = 'R'; -static int intone = 1; - -void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c) -{ - int M = c.rows(); - int N = c.cols(); - int K = a.cols(); - - int lda = a.rows(); - int ldb = b.rows(); - int ldc = c.rows(); - - sgemm_(&notrans,&notrans,&M,&N,&K,&fone, - const_cast<float*>(a.data()),&lda, - const_cast<float*>(b.data()),&ldb,&fone, - c.data(),&ldc); -} - -void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c) -{ - int M = c.rows(); - int N = c.cols(); - int K = a.cols(); - - int lda = a.rows(); - int ldb = b.rows(); - int ldc = c.rows(); - - dgemm_(&notrans,&notrans,&M,&N,&K,&done, - const_cast<double*>(a.data()),&lda, - const_cast<double*>(b.data()),&ldb,&done, - c.data(),&ldc); -} - -int main(int argc, char **argv) -{ - int rep = 1; - int s = 2048; - int m = s; - int n = s; - int p = s; - M a(m,n); a.setOnes(); - M b(n,p); b.setOnes(); - M c(m,p); c.setOnes(); - - BenchTimer t; - - BENCH(t, 5, rep, blas_gemm(a,b,c)); - - std::cerr << "cpu " << 
t.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n"; - std::cerr << "real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n"; - return 0; -} - -- cgit v1.2.3