diff options
author | Gael Guennebaud <g.gael@free.fr> | 2019-02-15 16:52:34 +0100 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2019-02-15 16:52:34 +0100 |
commit | 902a7793f79d7638d8de05a091682dddf34530d1 (patch) | |
tree | 84246a70c22e06dc8101d473323827f1a078d184 /bench | |
parent | 83309068b42c85d919aff53eb3652fa7896ed020 (diff) |
Add possibility to bench row-major lhs and rhs
Diffstat (limited to 'bench')
-rw-r--r-- | bench/bench_gemm.cpp | 82 |
1 files changed, 58 insertions, 24 deletions
diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp index 7c6dbea61..78ca1cd13 100644 --- a/bench/bench_gemm.cpp +++ b/bench/bench_gemm.cpp @@ -11,8 +11,9 @@ // #include <iostream> -#include <Eigen/Core> #include <bench/BenchTimer.h> +#include <Eigen/Core> + using namespace std; using namespace Eigen; @@ -30,10 +31,22 @@ using namespace Eigen; #define SCALARB SCALAR #endif +#ifdef ROWMAJ_A +const int opt_A = RowMajor; +#else +const int opt_A = ColMajor; +#endif + +#ifdef ROWMAJ_B +const int opt_B = RowMajor; +#else +const int opt_B = ColMajor; +#endif + typedef SCALAR Scalar; typedef NumTraits<Scalar>::Real RealScalar; -typedef Matrix<SCALARA,Dynamic,Dynamic> A; -typedef Matrix<SCALARB,Dynamic,Dynamic> B; +typedef Matrix<SCALARA,Dynamic,Dynamic,opt_A> A; +typedef Matrix<SCALARB,Dynamic,Dynamic,opt_B> B; typedef Matrix<Scalar,Dynamic,Dynamic> C; typedef Matrix<RealScalar,Dynamic,Dynamic> M; @@ -58,45 +71,61 @@ static char lower = 'L'; static char right = 'R'; static int intone = 1; -void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c) +#ifdef ROWMAJ_A +const char transA = trans; +#else +const char transA = notrans; +#endif + +#ifdef ROWMAJ_B +const char transB = trans; +#else +const char transB = notrans; +#endif + +template<typename A,typename B> +void blas_gemm(const A& a, const B& b, MatrixXf& c) { int M = c.rows(); int N = c.cols(); int K = a.cols(); - int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows(); + int lda = a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows(); - sgemm_(¬rans,¬rans,&M,&N,&K,&fone, + sgemm_(&transA,&transB,&M,&N,&K,&fone, const_cast<float*>(a.data()),&lda, const_cast<float*>(b.data()),&ldb,&fone, c.data(),&ldc); } -EIGEN_DONT_INLINE void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c) +template<typename A,typename B> +void blas_gemm(const A& a, const B& b, MatrixXd& c) { int M = c.rows(); int N = c.cols(); int K = a.cols(); - int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows(); + int lda = a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows(); - dgemm_(¬rans,¬rans,&M,&N,&K,&done, + dgemm_(&transA,&transB,&M,&N,&K,&done, const_cast<double*>(a.data()),&lda, const_cast<double*>(b.data()),&ldb,&done, c.data(),&ldc); } -void blas_gemm(const MatrixXcf& a, const MatrixXcf& b, MatrixXcf& c) +template<typename A,typename B> +void blas_gemm(const A& a, const B& b, MatrixXcf& c) { int M = c.rows(); int N = c.cols(); int K = a.cols(); - int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows(); + int lda = a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows(); - cgemm_(¬rans,¬rans,&M,&N,&K,(float*)&cfone, + cgemm_(&transA,&transB,&M,&N,&K,(float*)&cfone, const_cast<float*>((const float*)a.data()),&lda, const_cast<float*>((const float*)b.data()),&ldb,(float*)&cfone, (float*)c.data(),&ldc); } -void blas_gemm(const MatrixXcd& a, const MatrixXcd& b, MatrixXcd& c) +template<typename A,typename B> +void blas_gemm(const A& a, const B& b, MatrixXcd& c) { int M = c.rows(); int N = c.cols(); int K = a.cols(); - int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows(); + int lda = a.outerStride(); int ldb = b.outerStride(); int ldc = c.rows(); - zgemm_(¬rans,¬rans,&M,&N,&K,(double*)&cdone, + zgemm_(&transA,&transB,&M,&N,&K,(double*)&cdone, const_cast<double*>((const double*)a.data()),&lda, const_cast<double*>((const double*)b.data()),&ldb,(double*)&cdone, (double*)c.data(),&ldc); @@ -127,6 +156,8 @@ void matlab_cplx_real(const M& ar, const M& ai, const M& b, M& cr, M& ci) ci.noalias() += ai * b; } + + template<typename A, typename B, typename C> EIGEN_DONT_INLINE void gemm(const A& a, const B& b, C& c) { @@ -180,8 +211,8 @@ int main(int argc, char ** argv) } else if(argv[i][1]=='t') { + tries = atoi(argv[++i]); ++i; - tries = atoi(argv[i++]); } else if(argv[i][1]=='p') { @@ -217,7 +248,7 @@ int main(int argc, char ** argv) std::cout << "Matrix sizes = " << m << "x" << p << " * " << p << "x" << n << "\n"; std::ptrdiff_t mc(m), nc(n), kc(p); internal::computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); - std::cout << "blocking size (mc x kc) = " << mc << " x " << kc << "\n"; + std::cout << "blocking size (mc x kc) = " << mc << " x " << kc << " x " << nc << "\n"; C r = c; @@ -241,7 +272,7 @@ int main(int argc, char ** argv) blas_gemm(a,b,r); c.noalias() += a * b; if(!r.isApprox(c)) { - std::cout << (r - c).norm() << "\n"; + std::cout << (r - c).norm()/r.norm() << "\n"; std::cerr << "Warning, your product is crap!\n\n"; } #else @@ -250,7 +281,7 @@ int main(int argc, char ** argv) gemm(a,b,c); r.noalias() += a.cast<Scalar>() .lazyProduct( b.cast<Scalar>() ); if(!r.isApprox(c)) { - std::cout << (r - c).norm() << "\n"; + std::cout << (r - c).norm()/r.norm() << "\n"; std::cerr << "Warning, your product is crap!\n\n"; } } @@ -264,6 +295,9 @@ int main(int argc, char ** argv) std::cout << "blas real " << tblas.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tblas.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tblas.total(REAL_TIMER) << "s)\n"; #endif + // warm start + if(b.norm()+a.norm()==123.554) std::cout << "\n"; + BenchTimer tmt; c = rc; BENCH(tmt, tries, rep, gemm(a,b,c)); @@ -286,11 +320,11 @@ int main(int argc, char ** argv) if(1.*m*n*p<30*30*30) { - BenchTimer tmt; - c = rc; - BENCH(tmt, tries, rep, c.noalias()+=a.lazyProduct(b)); - std::cout << "lazy cpu " << tmt.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(CPU_TIMER) << "s)\n"; - std::cout << "lazy real " << tmt.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n"; + BenchTimer tmt; + c = rc; + BENCH(tmt, tries, rep, c.noalias()+=a.lazyProduct(b)); + std::cout << "lazy cpu " << tmt.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(CPU_TIMER) << "s)\n"; + std::cout << "lazy real " << tmt.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n"; } #ifdef DECOUPLED |