about | summary | refs | log | tree | commit | diff | homepage
path: root/bench/bench_gemm.cpp
diff options
context:
space:
mode:
author: Gael Guennebaud <g.gael@free.fr> 2014-04-17 21:03:26 +0200
committer: Gael Guennebaud <g.gael@free.fr> 2014-04-17 21:03:26 +0200
commit: c354bd47f7001bd6b3c43fc4a4b5d27f764aa5c3 (patch)
tree: a414e46d391b89d78ffe5203733c5b6710df962a /bench/bench_gemm.cpp
parent: 9777a5ca60f0a82bb789f55912fd046ab7f3d15d (diff)
Make our gemm bench a little more powerful.
Diffstat (limited to 'bench/bench_gemm.cpp')
-rw-r--r-- bench/bench_gemm.cpp 100
1 files changed, 76 insertions, 24 deletions
diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp
index 1ef2e72c2..8222271fb 100644
--- a/bench/bench_gemm.cpp
+++ b/bench/bench_gemm.cpp
@@ -2,6 +2,14 @@
// g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2 ./a.out
// icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out
+// Compilation options:
+//
+// -DSCALAR=std::complex<double>
+// -DSCALARA=double or -DSCALARB=double
+// -DHAVE_BLAS
+// -DDECOUPLED
+//
+
#include <iostream>
#include <Eigen/Core>
#include <bench/BenchTimer.h>
@@ -14,10 +22,18 @@ using namespace Eigen;
#define SCALAR float
#endif
+#ifndef SCALARA
+#define SCALARA SCALAR
+#endif
+
+#ifndef SCALARB
+#define SCALARB SCALAR
+#endif
+
typedef SCALAR Scalar;
typedef NumTraits<Scalar>::Real RealScalar;
-typedef Matrix<RealScalar,Dynamic,Dynamic> A;
-typedef Matrix</*Real*/Scalar,Dynamic,Dynamic> B;
+typedef Matrix<SCALARA,Dynamic,Dynamic> A;
+typedef Matrix<SCALARB,Dynamic,Dynamic> B;
typedef Matrix<Scalar,Dynamic,Dynamic> C;
typedef Matrix<RealScalar,Dynamic,Dynamic> M;
@@ -135,32 +151,49 @@ int main(int argc, char ** argv)
int cache_size = -1;
bool need_help = false;
- for (int i=1; i<argc; ++i)
+ for (int i=1; i<argc;)
{
- if(argv[i][0]=='s')
- {
- s = atoi(argv[i]+1);
- m = n = p = s;
- }
- else if(argv[i][0]=='m')
+ if(argv[i][0]=='-')
{
- m = atoi(argv[++i]);
- n = atoi(argv[++i]);
- p = atoi(argv[++i]);
+ if(argv[i][1]=='s')
+ {
+ ++i;
+ s = atoi(argv[i++]);
+ m = n = p = s;
+ if(argv[i][0]!='-')
+ {
+ n = atoi(argv[i++]);
+ p = atoi(argv[i++]);
+ }
+ }
+ else if(argv[i][1]=='c')
+ {
+ ++i;
+ cache_size = atoi(argv[i++]);
+ }
+ else if(argv[i][1]=='t')
+ {
+ ++i;
+ tries = atoi(argv[i++]);
+ }
+ else if(argv[i][1]=='p')
+ {
+ ++i;
+ rep = atoi(argv[i++]);
+ }
}
- else if(argv[i][0]=='c')
- cache_size = atoi(argv[i]+1);
- else if(argv[i][0]=='t')
- tries = atoi(argv[i]+1);
- else if(argv[i][0]=='p')
- rep = atoi(argv[i]+1);
else
+ {
need_help = true;
+ break;
+ }
}
if(need_help)
{
- std::cout << argv[0] << " s<matrix size> c<cache size> t<nb tries> p<nb repeats>\n";
+ std::cout << argv[0] << " -s <matrix sizes> -c <cache size> -t <nb tries> -p <nb repeats>\n";
+ std::cout << " <matrix sizes> : size\n";
+ std::cout << " <matrix sizes> : rows columns depth\n";
return 1;
}
@@ -182,6 +215,7 @@ int main(int argc, char ** argv)
// check the parallel product is correct
#if defined EIGEN_HAS_OPENMP
+ Eigen::initParallel();
int procs = omp_get_max_threads();
if(procs>1)
{
@@ -198,11 +232,20 @@ int main(int argc, char ** argv)
#elif defined HAVE_BLAS
blas_gemm(a,b,r);
c.noalias() += a * b;
- if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n";
+ if(!r.isApprox(c)) {
+ std::cout << r - c << "\n";
+ std::cerr << "Warning, your product is crap!\n\n";
+ }
#else
- gemm(a,b,c);
- r.noalias() += a.cast<Scalar>() * b.cast<Scalar>();
- if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n";
+ if(1.*m*n*p<2000.*2000*2000)
+ {
+ gemm(a,b,c);
+ r.noalias() += a.cast<Scalar>() .lazyProduct( b.cast<Scalar>() );
+ if(!r.isApprox(c)) {
+ std::cout << r - c << "\n";
+ std::cerr << "Warning, your product is crap!\n\n";
+ }
+ }
#endif
#ifdef HAVE_BLAS
@@ -224,7 +267,7 @@ int main(int argc, char ** argv)
{
BenchTimer tmono;
omp_set_num_threads(1);
- Eigen::internal::setNbThreads(1);
+ Eigen::setNbThreads(1);
c = rc;
BENCH(tmono, tries, rep, gemm(a,b,c));
std::cout << "eigen mono cpu " << tmono.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmono.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmono.total(CPU_TIMER) << "s)\n";
@@ -233,6 +276,15 @@ int main(int argc, char ** argv)
}
#endif
+ if(1.*m*n*p<30*30*30)
+ {
+ BenchTimer tmt;
+ c = rc;
+ BENCH(tmt, tries, rep, c.noalias()+=a.lazyProduct(b));
+ std::cout << "lazy cpu " << tmt.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(CPU_TIMER) << "s)\n";
+ std::cout << "lazy real " << tmt.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n";
+ }
+
#ifdef DECOUPLED
if((NumTraits<A::Scalar>::IsComplex) && (NumTraits<B::Scalar>::IsComplex))
{