From 3ac2b96a2f131e8162d39f0976cfb31b1a853237 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Fri, 26 Feb 2010 12:32:00 +0100
Subject: implement a smarter parallelization strategy for gemm avoiding multiple packing of the same data
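
In outline, the new strategy is: keep a single shared packed copy of B; each
thread packs only its own slice of B's columns once per k-panel; a
synchronization point then makes the whole packed panel visible to every
thread, and each thread runs the block-panel kernel for its own slice of C's
rows against the full packed B. The toy program below sketches only this
sharing/synchronization pattern; every name in it is made up for the
illustration and a naive triple loop stands in for the real packed gebp
kernel, so it should not be read as the patched Eigen code itself:

    // toy_shared_packing.cpp -- build with: g++ -O2 -fopenmp toy_shared_packing.cpp
    #include <omp.h>
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Stand-in for the rhs packing routine: copies the kc x nc sub-block of the
    // column-major matrix B starting at (k0,c0) into the shared packed buffer.
    static void pack_cols(const float* B, int ldb, int k0, int kc,
                          int c0, int nc, float* packed, int kcBuf)
    {
      for(int j=0; j<nc; ++j)
        for(int k=0; k<kc; ++k)
          packed[(c0+j)*kcBuf + k] = B[(c0+j)*ldb + (k0+k)];
    }

    int main()
    {
      const int rows=256, cols=256, depth=256, kc=64;
      std::vector<float> A(rows*depth, 1.f);  // column-major, rows x depth
      std::vector<float> B(depth*cols, 1.f);  // column-major, depth x cols
      std::vector<float> C(rows*cols, 0.f);   // column-major, rows x cols
      std::vector<float> packedB(kc*cols);    // ONE packed copy of B, shared by all threads

      #pragma omp parallel
      {
        int tid = omp_get_thread_num();
        int nt  = omp_get_num_threads();
        // this thread's slice of B's columns (to pack) and of C's rows (to compute)
        int c0 = tid*(cols/nt), nc = (tid+1==nt) ? cols-c0 : cols/nt;
        int r0 = tid*(rows/nt), nr = (tid+1==nt) ? rows-r0 : rows/nt;

        for(int k=0; k<depth; k+=kc)
        {
          int actual_kc = std::min(k+kc, depth) - k;

          // each thread packs only its own column slice of B_k:
          // no panel of B is ever packed twice
          pack_cols(B.data(), depth, k, actual_kc, c0, nc, packedB.data(), kc);
          #pragma omp barrier  // the whole packed panel is now visible to everyone

          // naive stand-in for the gebp kernel: C_i += A_i,k * B'_k over ALL columns
          for(int j=0; j<cols; ++j)
            for(int i=r0; i<r0+nr; ++i)
            {
              float s = 0;
              for(int kk=0; kk<actual_kc; ++kk)
                s += A[i + (k+kk)*rows] * packedB[j*kc + kk];
              C[i + j*rows] += s;
            }
          #pragma omp barrier  // nobody may repack B' while others still read it
        }
      }

      std::printf("C(0,0) = %g (expected %d)\n", C[0], depth);
      return 0;
    }

With all-ones inputs every entry of C ends up equal to depth, which gives a
quick sanity check that the synchronization is placed correctly for any
thread count.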
---
 Eigen/src/Core/products/GeneralBlockPanelKernel.h |  10 +-
 Eigen/src/Core/products/GeneralMatrixMatrix.h     | 146 ++++++++++++++++------
 Eigen/src/Core/products/Parallelizer.h            |  64 ++++++++--
 bench/bench_gemm.cpp                              |  79 +++++++++++-
 bench/bench_gemm_blas.cpp                         |  83 ------------
 5 files changed, 245 insertions(+), 137 deletions(-)
 delete mode 100644 bench/bench_gemm_blas.cpp

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index c29e4efc2..6836a10de 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -37,7 +37,8 @@ template<typename Scalar, int mr, int nr, typename Conj>
 struct ei_gebp_kernel
 {
-  void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols, int strideA=-1, int strideB=-1, int offsetA=0, int offsetB=0)
+  void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols,
+                  int strideA=-1, int strideB=-1, int offsetA=0, int offsetB=0, Scalar* unpackedB = 0)
   {
     typedef typename ei_packet_traits<Scalar>::type PacketType;
     enum { PacketSize = ei_packet_traits<Scalar>::size };
@@ -45,11 +46,12 @@ struct ei_gebp_kernel
     if(strideB==-1) strideB = depth;
     Conj cj;
     int packet_cols = (cols/nr) * nr;
-    const int peeled_mc = (rows/mr)*mr;
-    const int peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0);
+    const int peeled_mc  = (rows/mr)*mr;
+    const int peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0);
     const int peeled_kc = (depth/4)*4;
 
-    Scalar* unpackedB = const_cast<Scalar*>(blockB - strideB * nr * PacketSize);
+    if(unpackedB==0)
+      unpackedB = const_cast<Scalar*>(blockB - strideB * nr * PacketSize);
 
     // loops on each micro vertical panel of rhs (depth x nr)
     for(int j2=0; j2<packet_cols; j2+=nr)
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ ... @@
-    ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha);
+    ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,info);
   }
 };
 
@@ -64,7 +65,8 @@ static void run(int rows, int cols, int depth,
   const Scalar* _lhs, int lhsStride,
   const Scalar* _rhs, int rhsStride,
   Scalar* res, int resStride,
-  Scalar alpha)
+  Scalar alpha,
+  GemmParallelInfo* info = 0)
 {
   ei_const_blas_data_mapper<Scalar, LhsStorageOrder> lhs(_lhs,lhsStride);
   ei_const_blas_data_mapper<Scalar, RhsStorageOrder> rhs(_rhs,rhsStride);
@@ -75,47 +77,114 @@ static void run(int rows, int cols, int depth,
   typedef typename ei_packet_traits<Scalar>::type PacketType;
   typedef ei_product_blocking_traits<Scalar> Blocking;
 
-  int kc = std::min(Blocking::Max_kc,depth);  // cache block size along the K direction
-  int mc = std::min(Blocking::Max_mc,rows);   // cache block size along the M direction
+//  int kc = std::min(Blocking::Max_kc,depth);  // cache block size along the K direction
+//  int mc = std::min(Blocking::Max_mc,rows);   // cache block size along the M direction
+  int kc = std::min(256,depth);  // cache block size along the K direction
+  int mc = std::min(512,rows);   // cache block size along the M direction
 
+  ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder> pack_rhs;
+  ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder> pack_lhs;
+  ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp;
+
+  if(info)
   {
-    const int actual_kc = std::min(k2+kc,depth)-k2;
+    // this is the parallel version!
+    int tid = omp_get_thread_num();
+
+    Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
+    std::size_t sizeW = kc*Blocking::PacketSize*Blocking::nr*8;
+    Scalar* w = ei_aligned_stack_new(Scalar, sizeW);
+    Scalar* blockB = (Scalar*)info[tid].blockB;
 
-    // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
-    // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
-    // Note that this panel will be read as many times as the number of blocks in the lhs's
-    // vertical panel which is, in practice, a very low number.
-    ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder>()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);
+    // if you have the GOTO blas library you can try our parallelization strategy
+    // using GOTO's optimized routines.
+// #define USEGOTOROUTINES
+    #ifdef USEGOTOROUTINES
+    void* u = alloca(4096+sizeW);
+    #endif
 
-    // For each mc x kc block of the lhs's vertical panel...
-    // (==GEPP_VAR1)
-    for(int i2=0; i2<rows; i2+=mc)
-    {
-      const int actual_mc = std::min(i2+mc,rows)-i2;
-
-      // We pack the lhs's block into a sequential chunk of memory (L1 caching)
-      // Note that this block will be read a very high number of times, which is equal to the number of
-      // micro vertical panel of the large rhs's panel (e.g., cols/4 times).
-      ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder>()(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
+    for(int k=0; k<depth; k+=kc)
+    {
+      const int actual_kc = std::min(k+kc,depth)-k;
+
+      // pack B_k,j to B': each thread packs only its own slice of B's columns
+      // into the shared buffer, so each panel of B is packed only once
+      #ifndef USEGOTOROUTINES
+      pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, alpha, actual_kc, info[tid].rhs_length);
+      #else
+      sgemm_oncopy(actual_kc, info[tid].rhs_length, &rhs(k,info[tid].rhs_start), rhsStride, blockB+info[tid].rhs_start*actual_kc);
+      #endif
+
+      // all threads must be done packing their slice before the products start
+      #pragma omp barrier
+
+      for(int i=0; i<rows; i+=mc)
+      {
+        const int actual_mc = std::min(i+mc,rows)-i;
+
+        // pack A_i,k to A'
+        #ifndef USEGOTOROUTINES
+        pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc);
+        #else
+        sgemm_itcopy(actual_kc, actual_mc, &lhs(i,k), lhsStride, blockA);
+        #endif
 
-      ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
-        (res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
+        // C_i += A' * B'
+        #ifndef USEGOTOROUTINES
+        gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, -1,-1,0,0, w);
+        #else
+        sgemm_kernel(actual_mc, cols, actual_kc, alpha, blockA, blockB, res+i, resStride);
+        #endif
+      }
 
-//       sgemm_kernel(actual_mc, cols, actual_kc, alpha, blockA, allocatedBlockB, res+i2, resStride);
-    }
-  }
+      // wait until everyone is done reading B' before repacking it
+      #pragma omp barrier
+    }
 
-  ei_aligned_stack_delete(Scalar, blockA, kc*mc);
-  ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
+    ei_aligned_stack_delete(Scalar, blockA, kc*mc);
+    ei_aligned_stack_delete(Scalar, w, sizeW);
+  }
+  else
+  {
+    // this is the sequential version!
+    Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
+    std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+    Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
+    Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
+
+    // For each horizontal panel of the rhs, and corresponding panel of the lhs...
+    // (==GEMM_VAR1)
+    for(int k2=0; k2<depth; k2+=kc)
+    {
+      const int actual_kc = std::min(k2+kc,depth)-k2;
+
+      // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
+      // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
+      // Note that this panel will be read as many times as the number of blocks in the lhs's
+      // vertical panel which is, in practice, a very low number.
+      pack_rhs(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);
+
+
+      // For each mc x kc block of the lhs's vertical panel...
+      // (==GEPP_VAR1)
+      for(int i2=0; i2<rows; i2+=mc)
+      {
+        const int actual_mc = std::min(i2+mc,rows)-i2;
+
+        // We pack the lhs's block into a sequential chunk of memory (L1 caching)
+        // Note that this block will be read a very high number of times, which is equal to the number of
+        // micro vertical panel of the large rhs's panel (e.g., cols/4 times).
+        pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
+
+        gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
+      }
+    }
+
+    ei_aligned_stack_delete(Scalar, blockA, kc*mc);
+    ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
+  }
 }
 
@@ ... @@
         _ActualRhsType, Dest> Functor;
 
-      ei_run_parallel_2d(Functor(lhs, rhs, dst, actualAlpha), this->cols(), this->rows());
+//      ei_run_parallel_1d(Functor(lhs, rhs, dst, actualAlpha), this->rows());
+//      ei_run_parallel_2d(Functor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols());
+      ei_run_parallel_gemm(Functor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols());
     }
   };
 
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 088e387f9..e7a940992 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -25,6 +25,13 @@
 #ifndef EIGEN_PARALLELIZER_H
 #define EIGEN_PARALLELIZER_H
 
+struct GemmParallelInfo
+{
+  int rhs_start;
+  int rhs_length;
+  float* blockB;
+};
+
 template<bool Parallelize,typename Functor>
 void ei_run_parallel_1d(const Functor& func, int size)
 {
@@ -53,18 +60,21 @@ void ei_run_parallel_2d(const Functor& func, int size1, int size2)
 #ifndef EIGEN_HAS_OPENMP
   func(0,size1, 0,size2);
 #else
-  if(!Parallelize)
+
+  int threads = omp_get_max_threads();
+  if((!Parallelize)||(threads==1))
     return func(0,size1, 0,size2);
 
   //                               0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
-  static const int divide1[17] = { 0, 1, 2, 3, 2, 5, 3, 7, 4, 3,  5, 11,  4, 13,  7,  5,  4};
-  static const int divide2[17] = { 0, 1, 1, 1, 2, 1, 2, 1, 2, 3,  2,  1,  3,  1,  2,  3,  4};
+  static const int divide1[17] = { 0, 1, 2, 3, 2, 5, 3, 7, 4, 3,  5,  1,  4,  1,  7,  5,  4};
+  static const int divide2[17] = { 0, 1, 1, 1, 2, 1, 2, 1, 2, 3,  2, 11,  3, 13,  2,  3,  4};
+
+
 
-  int threads = omp_get_num_procs();
   ei_assert(threads<=16 && "too many threads !");
 
   int blockSize1 = size1 / divide1[threads];
   int blockSize2 = size2 / divide2[threads];
-
+  Matrix<int,4,Dynamic> ranges(4,threads);
   int k = 0;
   for(int i1=0; i1<divide1[threads]; ++i1)
@@ ... @@
+template<bool Parallelize,typename Functor>
+void ei_run_parallel_gemm(const Functor& func, int rows, int cols)
+{
+#ifndef EIGEN_HAS_OPENMP
+  func(0,rows, 0,cols);
+#else
+
+  int threads = omp_get_max_threads();
+  if((!Parallelize)||(threads==1))
+    return func(0,rows, 0,cols);
+
+
+  int blockCols = (cols / threads) & ~0x3;
+  int blockRows = (rows / threads) & ~0x7;
+
+  float* sharedBlockB = new float[2048*2048*4];
+
+  GemmParallelInfo* info = new GemmParallelInfo[threads];
+
+  #pragma omp parallel for schedule(static,1)
+  for(int i=0; i<threads; ++i)
+  {
+    int r0 = i*blockRows;
+    int actualBlockRows = (i+1==threads) ? rows-r0 : blockRows;
+
+    int c0 = i*blockCols;
+    int actualBlockCols = (i+1==threads) ? cols-c0 : blockCols;
+
+    info[i].rhs_start = c0;
+    info[i].rhs_length = actualBlockCols;
+    info[i].blockB = sharedBlockB;
+
+    func(r0, actualBlockRows, 0,cols, info);
+  }
+
+  delete[] sharedBlockB;
+  delete[] info;
+#endif
+}
diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp
--- a/bench/bench_gemm.cpp
+++ b/bench/bench_gemm.cpp
@@ ... @@
 typedef Matrix<Scalar,Dynamic,Dynamic> M;
 
+#ifdef HAVE_BLAS
+
+extern "C" {
+  #include
+
+  void sgemm_kernel(int actual_mc, int cols, int actual_kc, float alpha,
+                    float* blockA, float* blockB, float* res, int resStride);
+  void sgemm_oncopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB);
+  void sgemm_itcopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB);
+}
+
+static float fone = 1;
+static float fzero = 0;
+static double done = 1;
+static double szero = 0;
+static char notrans = 'N';
+static char trans = 'T';
+static char nonunit = 'N';
+static char lower = 'L';
+static char right = 'R';
+static int intone = 1;
+
+void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c)
+{
+  int M = c.rows(); int N = c.cols(); int K = a.cols();
+  int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
+
+  sgemm_(&notrans,&notrans,&M,&N,&K,&fone,
+         const_cast<float*>(a.data()),&lda,
+         const_cast<float*>(b.data()),&ldb,&fone,
+         c.data(),&ldc);
+}
+
+void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c)
+{
+  int M = c.rows(); int N = c.cols(); int K = a.cols();
+  int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
+
+  dgemm_(&notrans,&notrans,&M,&N,&K,&done,
+         const_cast<double*>(a.data()),&lda,
+         const_cast<double*>(b.data()),&ldb,&done,
+         c.data(),&ldc);
+}
+
+#endif
+
 void gemm(const M& a, const M& b, M& c)
 {
   c.noalias() += a * b;
@@ -22,21 +68,42 @@ void gemm(const M& a, const M& b, M& c)
 int main(int argc, char ** argv)
 {
-  int rep = 1;
+  int rep = 1;    // number of repetitions per try
+  int tries = 5;  // number of tries, we keep the best
+
   int s = 2048;
   int m = s;
   int n = s;
   int p = s;
-  M a(m,n); a.setOnes();
-  M b(n,p); b.setOnes();
+  M a(m,n); a.setRandom();
+  M b(n,p); b.setRandom();
   M c(m,p); c.setOnes();
 
   BenchTimer t;
 
-  BENCH(t, 5, rep, gemm(a,b,c));
+  M r = c;
+
+  // check the parallel product is correct
+  #ifdef HAVE_BLAS
+  blas_gemm(a,b,r);
+  #else
+  int procs = omp_get_max_threads();
+  omp_set_num_threads(1);
+  r.noalias() += a * b;
+  omp_set_num_threads(procs);
+  #endif
+  c.noalias() += a * b;
+  if(!r.isApprox(c)) std::cerr << "Warning, your parallel product is crap!\n\n";
+
+  #ifdef HAVE_BLAS
+  BENCH(t, tries, rep, blas_gemm(a,b,c));
+  std::cerr << "blas  cpu   " << t.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9  << " GFLOPS \t(" << t.total(CPU_TIMER)  << "s)\n";
+  std::cerr << "blas  real  " << t.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
+  #endif
 
-  std::cerr << "cpu   " << t.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9  << " GFLOPS \t(" << t.total(CPU_TIMER)  << "s)\n";
-  std::cerr << "real  " << t.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
+  BENCH(t, tries, rep, gemm(a,b,c));
+  std::cerr << "eigen cpu   " << t.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9  << " GFLOPS \t(" << t.total(CPU_TIMER)  << "s)\n";
+  std::cerr << "eigen real  " << t.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
 
   return 0;
 }
diff --git a/bench/bench_gemm_blas.cpp b/bench/bench_gemm_blas.cpp
deleted file mode 100644
index babf1ec2c..000000000
--- a/bench/bench_gemm_blas.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-
-#include
-#include
-
-extern "C"
-{
-  #include
-  #include
-}
-
-using namespace std;
-using namespace Eigen;
-
-#ifndef SCALAR
-#define SCALAR float
-#endif
-
-typedef SCALAR Scalar;
-typedef Matrix<Scalar,Dynamic,Dynamic> M;
-
-static float fone = 1;
-static float fzero = 0;
-static double done = 1;
-static double szero = 0;
-static char notrans = 'N';
-static char trans = 'T';
-static char nonunit = 'N';
-static char lower = 'L';
-static char right = 'R';
-static int intone = 1;
-
-void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c)
-{
-  int M = c.rows();
-  int N = c.cols();
-  int K = a.cols();
-
-  int lda = a.rows();
-  int ldb = b.rows();
-  int ldc = c.rows();
-
-  sgemm_(&notrans,&notrans,&M,&N,&K,&fone,
-         const_cast<float*>(a.data()),&lda,
-         const_cast<float*>(b.data()),&ldb,&fone,
-         c.data(),&ldc);
-}
-
-void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c)
-{
-  int M = c.rows();
-  int N = c.cols();
-  int K = a.cols();
-
-  int lda = a.rows();
-  int ldb = b.rows();
-  int ldc = c.rows();
-
-  dgemm_(&notrans,&notrans,&M,&N,&K,&done,
-         const_cast<double*>(a.data()),&lda,
-         const_cast<double*>(b.data()),&ldb,&done,
-         c.data(),&ldc);
-}
-
-int main(int argc, char **argv)
-{
-  int rep = 1;
-  int s = 2048;
-  int m = s;
-  int n = s;
-  int p = s;
-  M a(m,n); a.setOnes();
-  M b(n,p); b.setOnes();
-  M c(m,p); c.setOnes();
-
-  BenchTimer t;
-
-  BENCH(t, 5, rep, blas_gemm(a,b,c));
-
-  std::cerr << "cpu   " << t.best(CPU_TIMER)/rep  << "s  \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9  << " GFLOPS \t(" << t.total(CPU_TIMER)  << "s)\n";
-  std::cerr << "real  " << t.best(REAL_TIMER)/rep << "s  \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n";
-  return 0;
-}
--
cgit v1.2.3