bench/bench_gemm_blas.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109


#include <climits>
#include <cstdlib>

#include <Eigen/Core>
#include <bench/BenchTimer.h>

// C-linkage declarations for the external BLAS entry points used by this benchmark.
extern "C"
{
  // Fortran-style BLAS prototypes (the benchmark below calls sgemm_ and dgemm_).
  // NOTE(review): which of the two headers actually supplies sgemm_/dgemm_ is not
  // visible from this file -- confirm against the build setup.
  #include <bench/btl/libs/C_BLAS/blas.h>
  #include <cblas.h>

  // Low-level single-precision GEMM kernel operating on blockA/blockB and writing into res.
  // Not called anywhere in the code visible in this file.
  // NOTE(review): the name and signature suggest a backend-internal micro-kernel working on
  // pre-packed blocks -- confirm against the BLAS implementation that is linked in.
  void sgemm_kernel(int actual_mc, int cols, int actual_kc, float alpha,
                   float* blockA, float* blockB, float* res, int resStride);

  // Copy routines taking a strided source (rhs, rhsStride) and a destination buffer (blockB).
  // Not called anywhere in the code visible in this file.
  // NOTE(review): presumably the backend's operand packing routines (o/i and t/n variants);
  // all four are declared here with identical parameter names -- verify the real prototypes.
  void sgemm_otcopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB);
  void sgemm_oncopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB);
  void sgemm_itcopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB);
  void sgemm_incopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB);
}

// Bring the std and Eigen namespaces into file scope; the code below uses
// Matrix, MatrixXf, MatrixXd and Dynamic unqualified.
using namespace std;
using namespace Eigen;

// Scalar type of the benchmarked matrices. Defaults to float and can be
// overridden on the compiler command line (e.g. -DSCALAR=double).
#ifndef SCALAR
#define SCALAR float
#endif

typedef SCALAR Scalar;
// Dynamic-size matrix of the benchmarked scalar type, used by main().
typedef Matrix<Scalar,Dynamic,Dynamic> M;

// Argument values for the Fortran-style BLAS calls below, which receive every
// parameter by address. Only notrans, fone and done are referenced in the code
// visible in this file; the remaining constants are currently unused here.
static float fone = 1;
static float fzero = 0;
static double done = 1;
// NOTE(review): double-precision zero; the "s" prefix looks like a typo for "dzero" -- confirm
// no other code depends on this name before renaming.
static double szero = 0;
static char notrans = 'N';
static char trans = 'T';
static char nonunit = 'N';
static char lower = 'L';
static char right = 'R';
static int intone = 1;

/**
 * Single-precision product through the external BLAS routine sgemm_.
 *
 * Calls sgemm_ with no transposition on either operand and with both scalar
 * factors equal to one, i.e. the BLAS counterpart of the Eigen expression
 * `c.noalias() += a * b`.
 *
 * \param a left operand; its column count is used as the inner dimension
 * \param b right operand
 * \param c accumulator, updated in place; its shape gives the output dimensions
 */
void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c)
{
  // sgemm_ receives everything by (non-const) address, so the sizes are
  // copied into mutable int locals first.
  int outRows  = static_cast<int>(c.rows());
  int outCols  = static_cast<int>(c.cols());
  int innerDim = static_cast<int>(a.cols());

  // Leading dimensions: the row count of each matrix.
  int strideA = static_cast<int>(a.rows());
  int strideB = static_cast<int>(b.rows());
  int strideC = static_cast<int>(c.rows());

  // The BLAS prototype is not const-correct; the const_cast only adapts the
  // pointer type of the two input operands.
  float* lhsData = const_cast<float*>(a.data());
  float* rhsData = const_cast<float*>(b.data());
  float* dstData = c.data();

  sgemm_(&notrans, &notrans,
         &outRows, &outCols, &innerDim,
         &fone,
         lhsData, &strideA,
         rhsData, &strideB,
         &fone,
         dstData, &strideC);
}

/**
 * Double-precision product through the external BLAS routine dgemm_.
 *
 * Calls dgemm_ with no transposition on either operand and with both scalar
 * factors equal to one, i.e. the BLAS counterpart of the Eigen expression
 * `c.noalias() += a * b`.
 *
 * \param a left operand; its column count is used as the inner dimension
 * \param b right operand
 * \param c accumulator, updated in place; its shape gives the output dimensions
 */
void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c)
{
  // dgemm_ receives everything by (non-const) address, so the sizes are
  // copied into mutable int locals first.
  int outRows  = static_cast<int>(c.rows());
  int outCols  = static_cast<int>(c.cols());
  int innerDim = static_cast<int>(a.cols());

  // Leading dimensions: the row count of each matrix.
  int strideA = static_cast<int>(a.rows());
  int strideB = static_cast<int>(b.rows());
  int strideC = static_cast<int>(c.rows());

  // The BLAS prototype is not const-correct; the const_cast only adapts the
  // pointer type of the two input operands.
  double* lhsData = const_cast<double*>(a.data());
  double* rhsData = const_cast<double*>(b.data());
  double* dstData = c.data();

  dgemm_(&notrans, &notrans,
         &outRows, &outCols, &innerDim,
         &done,
         lhsData, &strideA,
         rhsData, &strideB,
         &done,
         dstData, &strideC);
}

/**
 * Parses \a text as a strictly positive decimal integer that fits in an int.
 *
 * \param text NUL-terminated string to parse
 * \param out  receives the parsed value; left untouched on failure
 * \return true on success, false if \a text is empty, has trailing characters,
 *         is not positive, or is too large
 */
static bool parse_positive_int(const char* text, int& out)
{
  char* end = 0;
  const long value = std::strtol(text, &end, 10);
  // ">= INT_MAX" (rather than ">") also rejects the saturated result strtol
  // returns on overflow on platforms where long and int have the same width.
  if (end == text || *end != '\0' || value <= 0 || value >= INT_MAX)
    return false;
  out = static_cast<int>(value);
  return true;
}

/**
 * Times blas_gemm() on square matrices filled with ones and prints the best
 * CPU and wall-clock time per repetition, the resulting GFLOPS and the total
 * time to std::cerr.
 *
 * Usage: <program> [size [rep]]
 *   size  dimension of the square matrices (default: 2048)
 *   rep   repetitions per timed try        (default: 1)
 *
 * \return 0 on success, 1 if the command-line arguments are invalid
 */
int main(int argc, char **argv)
{
  int rep = 1;
  int s = 2048;

  // Optional overrides; without arguments the benchmark runs with the defaults above.
  if (argc > 3
      || (argc > 1 && !parse_positive_int(argv[1], s))
      || (argc > 2 && !parse_positive_int(argv[2], rep)))
  {
    std::cerr << "usage: " << argv[0] << " [size [rep]]\n";
    return 1;
  }

  int m = s;
  int n = s;
  int p = s;

  // Number of independent (a, b, c) problem instances processed per repetition.
  const int N = 1;
  M a[N];
  M b[N];
  M c[N];

  for (int k=0; k<N; ++k)
  {
    a[k].resize(m,p); a[k].setOnes();
    b[k].resize(p,n); b[k].setOnes();
    c[k].resize(m,n); c[k].setOnes();
  }

  BenchTimer t;

  // BENCH comes from bench/BenchTimer.h.
  // NOTE(review): presumably runs 5 tries of `rep` repetitions each -- confirm in BenchTimer.h.
  BENCH(t, 5, rep,
        for(int k=0;k<N;++k)
          blas_gemm(a[k],b[k],c[k]));

//   BENCH(t, 5, rep,
//         _Pragma("omp parallel for schedule(static,1)")
//         for(int k=0;k<N;++k)
//           blas_gemm(a[k],b[k],c[k]));

  // 2*m*n*p floating-point operations per product, N products per repetition.
  // The leading double() keeps the whole product in floating point.
  const double flops = double(m)*N*n*p*rep*2;

  std::cerr << "cpu   " << t.best(CPU_TIMER)/rep  << "s  \t" << (flops/t.best(CPU_TIMER))*1e-9  <<  " GFLOPS \t(" << t.total(CPU_TIMER)  << "s)\n";
  std::cerr << "real  " << t.best(REAL_TIMER)/rep << "s  \t" << (flops/t.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
  return 0;
}