From 3ac2b96a2f131e8162d39f0976cfb31b1a853237 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Fri, 26 Feb 2010 12:32:00 +0100
Subject: implement a smarter parallelization strategy for gemm avoiding multiple packing of the same data
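
In outline, the new strategy is: keep a single shared packed copy of B; each
thread packs only its own slice of B's columns once per k-panel; a
synchronization point then makes the whole packed panel visible to every
thread, and each thread runs the block-panel kernel for its own slice of C's
rows against the full packed B. The toy program below sketches only this
sharing/synchronization pattern; every name in it is made up for the
illustration and a naive triple loop stands in for the real packed gebp
kernel, so it should not be read as the patched Eigen code itself:

    // toy_shared_packing.cpp -- build with: g++ -O2 -fopenmp toy_shared_packing.cpp
    #include <omp.h>
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Stand-in for the rhs packing routine: copies the kc x nc sub-block of the
    // column-major matrix B starting at (k0,c0) into the shared packed buffer.
    static void pack_cols(const float* B, int ldb, int k0, int kc,
                          int c0, int nc, float* packed, int kcBuf)
    {
      for(int j=0; j<nc; ++j)
        for(int k=0; k<kc; ++k)
          packed[(c0+j)*kcBuf + k] = B[(c0+j)*ldb + (k0+k)];
    }

    int main()
    {
      const int rows=256, cols=256, depth=256, kc=64;
      std::vector<float> A(rows*depth, 1.f);  // column-major, rows x depth
      std::vector<float> B(depth*cols, 1.f);  // column-major, depth x cols
      std::vector<float> C(rows*cols, 0.f);   // column-major, rows x cols
      std::vector<float> packedB(kc*cols);    // ONE packed copy of B, shared by all threads

      #pragma omp parallel
      {
        int tid = omp_get_thread_num();
        int nt  = omp_get_num_threads();
        // this thread's slice of B's columns (to pack) and of C's rows (to compute)
        int c0 = tid*(cols/nt), nc = (tid+1==nt) ? cols-c0 : cols/nt;
        int r0 = tid*(rows/nt), nr = (tid+1==nt) ? rows-r0 : rows/nt;

        for(int k=0; k<depth; k+=kc)
        {
          int actual_kc = std::min(k+kc, depth) - k;

          // each thread packs only its own column slice of B_k:
          // no panel of B is ever packed twice
          pack_cols(B.data(), depth, k, actual_kc, c0, nc, packedB.data(), kc);
          #pragma omp barrier  // the whole packed panel is now visible to everyone

          // naive stand-in for the gebp kernel: C_i += A_i,k * B'_k over ALL columns
          for(int j=0; j<cols; ++j)
            for(int i=r0; i<r0+nr; ++i)
            {
              float s = 0;
              for(int kk=0; kk<actual_kc; ++kk)
                s += A[i + (k+kk)*rows] * packedB[j*kc + kk];
              C[i + j*rows] += s;
            }
          #pragma omp barrier  // nobody may repack B' while others still read it
        }
      }

      std::printf("C(0,0) = %g (expected %d)\n", C[0], depth);
      return 0;
    }

With all-ones inputs every entry of C ends up equal to depth, which gives a
quick sanity check that the synchronization is placed correctly for any
thread count.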
---
 Eigen/src/Core/products/GeneralBlockPanelKernel.h |  10 +-
 Eigen/src/Core/products/GeneralMatrixMatrix.h     | 146 ++++++++++++++++------
 Eigen/src/Core/products/Parallelizer.h            |  64 ++++++++--
 bench/bench_gemm.cpp                              |  79 +++++++++++-
 bench/bench_gemm_blas.cpp                         |  83 ------------
 5 files changed, 245 insertions(+), 137 deletions(-)
 delete mode 100644 bench/bench_gemm_blas.cpp

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index c29e4efc2..6836a10de 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -37,7 +37,8 @@ template<typename Scalar, int mr, int nr, typename Conj>
 struct ei_gebp_kernel
 {
-  void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols, int strideA=-1, int strideB=-1, int offsetA=0, int offsetB=0)
+  void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols,
+                  int strideA=-1, int strideB=-1, int offsetA=0, int offsetB=0, Scalar* unpackedB = 0)
   {
     typedef typename ei_packet_traits<Scalar>::type PacketType;
     enum { PacketSize = ei_packet_traits<Scalar>::size };
@@ -45,11 +46,12 @@ struct ei_gebp_kernel
     if(strideB==-1) strideB = depth;
     Conj cj;
     int packet_cols = (cols/nr) * nr;
-    const int peeled_mc = (rows/mr)*mr;
-    const int peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0);
+    const int peeled_mc  = (rows/mr)*mr;
+    const int peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0);
     const int peeled_kc = (depth/4)*4;
 
-    Scalar* unpackedB = const_cast<Scalar*>(blockB - strideB * nr * PacketSize);
+    if(unpackedB==0)
+      unpackedB = const_cast<Scalar*>(blockB - strideB * nr * PacketSize);
 
     // loops on each micro vertical panel of rhs (depth x nr)
     for(int j2=0; j2<packet_cols; j2+=nr)
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ ... @@
-    ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha);
+    ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,info);
   }
 };
 
@@ -64,7 +65,8 @@ static void run(int rows, int cols, int depth,
   const Scalar* _lhs, int lhsStride,
   const Scalar* _rhs, int rhsStride,
   Scalar* res, int resStride,
-  Scalar alpha)
+  Scalar alpha,
+  GemmParallelInfo* info = 0)
 {
   ei_const_blas_data_mapper<Scalar, LhsStorageOrder> lhs(_lhs,lhsStride);
   ei_const_blas_data_mapper<Scalar, RhsStorageOrder> rhs(_rhs,rhsStride);
@@ -75,47 +77,114 @@ static void run(int rows, int cols, int depth,
   typedef typename ei_packet_traits<Scalar>::type PacketType;
   typedef ei_product_blocking_traits<Scalar> Blocking;
 
-  int kc = std::min(Blocking::Max_kc,depth);  // cache block size along the K direction
-  int mc = std::min(Blocking::Max_mc,rows);   // cache block size along the M direction
+//  int kc = std::min(Blocking::Max_kc,depth);  // cache block size along the K direction
+//  int mc = std::min(Blocking::Max_mc,rows);   // cache block size along the M direction
+  int kc = std::min(256,depth);  // cache block size along the K direction
+  int mc = std::min(512,rows);   // cache block size along the M direction
 
+  ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder> pack_rhs;
+  ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder> pack_lhs;
+  ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp;
+
+  if(info)
   {
-    const int actual_kc = std::min(k2+kc,depth)-k2;
+    // this is the parallel version!
+    int tid = omp_get_thread_num();
+
+    Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
+    std::size_t sizeW = kc*Blocking::PacketSize*Blocking::nr*8;
+    Scalar* w = ei_aligned_stack_new(Scalar, sizeW);
+    Scalar* blockB = (Scalar*)info[tid].blockB;
 
-    // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
-    // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
-    // Note that this panel will be read as many times as the number of blocks in the lhs's
-    // vertical panel which is, in practice, a very low number.
-    ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder>()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);
+    // if you have the GOTO blas library you can try our parallelization strategy
+    // using GOTO's optimized routines.
+// #define USEGOTOROUTINES
+    #ifdef USEGOTOROUTINES
+    void* u = alloca(4096+sizeW);
+    #endif
 
-    // For each mc x kc block of the lhs's vertical panel...
-    // (==GEPP_VAR1)
-    for(int i2=0; i2<rows; i2+=mc)
-    {
-      const int actual_mc = std::min(i2+mc,rows)-i2;
-
-      // We pack the lhs's block into a sequential chunk of memory (L1 caching)
-      // Note that this block will be read a very high number of times, which is equal to the number of
-      // micro vertical panel of the large rhs's panel (e.g., cols/4 times).
-      ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder>()(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
+    for(int k=0; k<depth; k+=kc)
+    {
+      const int actual_kc = std::min(k+kc,depth)-k;
+
+      // pack B_k,j to B': each thread packs only its own slice of B's columns
+      // into the shared buffer, so each panel of B is packed only once
+      #ifndef USEGOTOROUTINES
+      pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, alpha, actual_kc, info[tid].rhs_length);
+      #else
+      sgemm_oncopy(actual_kc, info[tid].rhs_length, &rhs(k,info[tid].rhs_start), rhsStride, blockB+info[tid].rhs_start*actual_kc);
+      #endif
+
+      // all threads must be done packing their slice before the products start
+      #pragma omp barrier
+
+      for(int i=0; i<rows; i+=mc)
+      {
+        const int actual_mc = std::min(i+mc,rows)-i;
+
+        // pack A_i,k to A'
+        #ifndef USEGOTOROUTINES
+        pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc);
+        #else
+        sgemm_itcopy(actual_kc, actual_mc, &lhs(i,k), lhsStride, blockA);
+        #endif
 
-      ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
-        (res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
+        // C_i += A' * B'
+        #ifndef USEGOTOROUTINES
+        gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, -1,-1,0,0, w);
+        #else
+        sgemm_kernel(actual_mc, cols, actual_kc, alpha, blockA, blockB, res+i, resStride);
+        #endif
+      }
 
-//       sgemm_kernel(actual_mc, cols, actual_kc, alpha, blockA, allocatedBlockB, res+i2, resStride);
-    }
-  }
+      // wait until everyone is done reading B' before repacking it
+      #pragma omp barrier
+    }
 
-  ei_aligned_stack_delete(Scalar, blockA, kc*mc);
-  ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
+    ei_aligned_stack_delete(Scalar, blockA, kc*mc);
+    ei_aligned_stack_delete(Scalar, w, sizeW);
+  }
+  else
+  {
+    // this is the sequential version!
+    Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
+    std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+    Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
+    Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
+
+    // For each horizontal panel of the rhs, and corresponding panel of the lhs...
+    // (==GEMM_VAR1)
+    for(int k2=0; k2<depth; k2+=kc)
+    {
+      const int actual_kc = std::min(k2+kc,depth)-k2;
+
+      // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
+      // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
+      // Note that this panel will be read as many times as the number of blocks in the lhs's
+      // vertical panel which is, in practice, a very low number.
+      pack_rhs(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);
+
+
+      // For each mc x kc block of the lhs's vertical panel...
+      // (==GEPP_VAR1)
+      for(int i2=0; i2<rows; i2+=mc)
+      {
+        const int actual_mc = std::min(i2+mc,rows)-i2;
+
+        // We pack the lhs's block into a sequential chunk of memory (L1 caching)
+        // Note that this block will be read a very high number of times, which is equal to the number of
+        // micro vertical panel of the large rhs's panel (e.g., cols/4 times).
+        pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
+
+        gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
+      }
+    }
+
+    ei_aligned_stack_delete(Scalar, blockA, kc*mc);
+    ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
+  }
 }
 
@@ ... @@
         _ActualRhsType, Dest> Functor;
 
-      ei_run_parallel_2d(Functor(lhs, rhs, dst, actualAlpha), this->cols(), this->rows());
+//      ei_run_parallel_1d(Functor(lhs, rhs, dst, actualAlpha), this->rows());
+//      ei_run_parallel_2d(Functor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols());
+      ei_run_parallel_gemm(Functor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols());
     }
   };
 
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 088e387f9..e7a940992 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -25,6 +25,13 @@
 #ifndef EIGEN_PARALLELIZER_H
 #define EIGEN_PARALLELIZER_H
 
+struct GemmParallelInfo
+{
+  int rhs_start;
+  int rhs_length;
+  float* blockB;
+};
+
 template<bool Parallelize,typename Functor>
 void ei_run_parallel_1d(const Functor& func, int size)
 {
@@ -53,18 +60,21 @@ void ei_run_parallel_2d(const Functor& func, int size1, int size2)
 #ifndef EIGEN_HAS_OPENMP
   func(0,size1, 0,size2);
 #else
-  if(!Parallelize)
+
+  int threads = omp_get_max_threads();
+  if((!Parallelize)||(threads==1))
     return func(0,size1, 0,size2);
 
   //                               0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
-  static const int divide1[17] = { 0, 1, 2, 3, 2, 5, 3, 7, 4, 3,  5, 11,  4, 13,  7,  5,  4};
-  static const int divide2[17] = { 0, 1, 1, 1, 2, 1, 2, 1, 2, 3,  2,  1,  3,  1,  2,  3,  4};
+  static const int divide1[17] = { 0, 1, 2, 3, 2, 5, 3, 7, 4, 3,  5,  1,  4,  1,  7,  5,  4};
+  static const int divide2[17] = { 0, 1, 1, 1, 2, 1, 2, 1, 2, 3,  2, 11,  3, 13,  2,  3,  4};
+
+
 
-  int threads = omp_get_num_procs();
   ei_assert(threads<=16 && "too many threads !");
 
   int blockSize1 = size1 / divide1[threads];
   int blockSize2 = size2 / divide2[threads];
-
+  Matrix<int,4,Dynamic> ranges(4,threads);
   int k = 0;
   for(int i1=0; i1<divide1[threads]; ++i1)
@@ ... @@
+template<bool Parallelize,typename Functor>
+void ei_run_parallel_gemm(const Functor& func, int rows, int cols)
+{
+#ifndef EIGEN_HAS_OPENMP
+  func(0,rows, 0,cols);
+#else
+
+  int threads = omp_get_max_threads();
+  if((!Parallelize)||(threads==1))
+    return func(0,rows, 0,cols);
+
+
+  int blockCols = (cols / threads) & ~0x3;
+  int blockRows = (rows / threads) & ~0x7;
+
+  float* sharedBlockB = new float[2048*2048*4];
+
+  GemmParallelInfo* info = new GemmParallelInfo[threads];
+
+  #pragma omp parallel for schedule(static,1)
+  for(int i=0; i<threads; ++i)
+  {
+    int r0 = i*blockRows;
+    int actualBlockRows = (i+1==threads) ? rows-r0 : blockRows;
+
+    int c0 = i*blockCols;
+    int actualBlockCols = (i+1==threads) ? cols-c0 : blockCols;
+
+    info[i].rhs_start = c0;
+    info[i].rhs_length = actualBlockCols;
+    info[i].blockB = sharedBlockB;
+
+    func(r0, actualBlockRows, 0,cols, info);
+  }
+
+  delete[] sharedBlockB;
+  delete[] info;
+#endif
+}
diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp
--- a/bench/bench_gemm.cpp
+++ b/bench/bench_gemm.cpp
@@ ... @@
 typedef Matrix<Scalar,Dynamic,Dynamic> M;
 
+#ifdef HAVE_BLAS
+
+extern "C" {
+  #include
+
+  void sgemm_kernel(int actual_mc, int cols, int actual_kc, float alpha,
+                    float* blockA, float* blockB, float* res, int resStride);
+  void sgemm_oncopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB);
+  void sgemm_itcopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB);
+}
+
+static float fone = 1;
+static float fzero = 0;
+static double done = 1;
+static double szero = 0;
+static char notrans = 'N';
+static char trans = 'T';
+static char nonunit = 'N';
+static char lower = 'L';
+static char right = 'R';
+static int intone = 1;
+
+void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c)
+{
+  int M = c.rows(); int N = c.cols(); int K = a.cols();
+  int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
+
+  sgemm_(&notrans,&notrans,&M,&N,&K,&fone,
+         const_cast<float*>(a.data()),&lda,
+         const_cast<float*>(b.data()),&ldb,&fone,
+         c.data(),&ldc);
+}
+
+void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c)
+{
+  int M = c.rows(); int N = c.cols(); int K = a.cols();
+  int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
+
+  dgemm_(&notrans,&notrans,&M,&N,&K,&done,
+         const_cast<double*>(a.data()),&lda,
+         const_cast<double*>(b.data()),&ldb,&done,
+         c.data(),&ldc);
+}
+
+#endif
+
 void gemm(const M& a, const M& b, M& c)
 {
   c.noalias() += a * b;
@@ -22,21 +68,42 @@ void gemm(const M& a, const M& b, M& c)
 int main(int argc, char ** argv)
 {
-  int rep = 1;
+  int rep = 1;    // number of repetitions per try
+  int tries = 5;  // number of tries, we keep the best
+
   int s = 2048;
   int m = s;
   int n = s;
   int p = s;
-  M a(m,n); a.setOnes();
-  M b(n,p); b.setOnes();
+  M a(m,n); a.setRandom();
+  M b(n,p); b.setRandom();
   M c(m,p); c.setOnes();
 
   BenchTimer t;
 
-  BENCH(t, 5, rep, gemm(a,b,c));
+  M r = c;
+
+  // check the parallel product is correct
+  #ifdef HAVE_BLAS
+  blas_gemm(a,b,r);
+  #else
+  int procs = omp_get_max_threads();
+  omp_set_num_threads(1);
+  r.noalias() += a * b;
+  omp_set_num_threads(procs);
+  #endif
+  c.noalias() += a * b;
+  if(!r.isApprox(c)) std::cerr << "Warning, your parallel product is crap!\n\n";
+
+  #ifdef HAVE_BLAS
+  BENCH(t, tries, rep, blas_gemm(a,b,c));
+  std::cerr << "blas  cpu   " << t.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9  << " GFLOPS \t(" << t.total(CPU_TIMER)  << "s)\n";
+  std::cerr << "blas  real  " << t.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
+  #endif
 
-  std::cerr << "cpu   " << t.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9  << " GFLOPS \t(" << t.total(CPU_TIMER)  << "s)\n";
-  std::cerr << "real  " << t.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
+  BENCH(t, tries, rep, gemm(a,b,c));
+  std::cerr << "eigen cpu   " << t.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9  << " GFLOPS \t(" << t.total(CPU_TIMER)  << "s)\n";
+  std::cerr << "eigen real  " << t.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
 
   return 0;
 }
diff --git a/bench/bench_gemm_blas.cpp b/bench/bench_gemm_blas.cpp
deleted file mode 100644
index babf1ec2c..000000000
--- a/bench/bench_gemm_blas.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-
-#include
-#include
-
-extern "C"
-{
-  #include
-  #include
-}
-
-using namespace std;
-using namespace Eigen;
-
-#ifndef SCALAR
-#define SCALAR float
-#endif
-
-typedef SCALAR Scalar;
-typedef Matrix<Scalar,Dynamic,Dynamic> M;
-
-static float fone = 1;
-static float fzero = 0;
-static double done = 1;
-static double szero = 0;
-static char notrans = 'N';
-static char trans = 'T';
-static char nonunit = 'N';
-static char lower = 'L';
-static char right = 'R';
-static int intone = 1;
-
-void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c)
-{
-  int M = c.rows();
-  int N = c.cols();
-  int K = a.cols();
-
-  int lda = a.rows();
-  int ldb = b.rows();
-  int ldc = c.rows();
-
-  sgemm_(&notrans,&notrans,&M,&N,&K,&fone,
-         const_cast<float*>(a.data()),&lda,
-         const_cast<float*>(b.data()),&ldb,&fone,
-         c.data(),&ldc);
-}
-
-void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c)
-{
-  int M = c.rows();
-  int N = c.cols();
-  int K = a.cols();
-
-  int lda = a.rows();
-  int ldb = b.rows();
-  int ldc = c.rows();
-
-  dgemm_(&notrans,&notrans,&M,&N,&K,&done,
-         const_cast<double*>(a.data()),&lda,
-         const_cast<double*>(b.data()),&ldb,&done,
-         c.data(),&ldc);
-}
-
-int main(int argc, char **argv)
-{
-  int rep = 1;
-  int s = 2048;
-  int m = s;
-  int n = s;
-  int p = s;
-  M a(m,n); a.setOnes();
-  M b(n,p); b.setOnes();
-  M c(m,p); c.setOnes();
-
-  BenchTimer t;
-
-  BENCH(t, 5, rep, blas_gemm(a,b,c));
-
-  std::cerr << "cpu   " << t.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9  << " GFLOPS \t(" << t.total(CPU_TIMER)  << "s)\n";
-  std::cerr << "real  " << t.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
-  return 0;
-}
--
cgit v1.2.3