diff options
author | 2010-02-26 12:32:00 +0100 | |
---|---|---|
committer | 2010-02-26 12:32:00 +0100 | |
commit | 3ac2b96a2f131e8162d39f0976cfb31b1a853237 (patch) | |
tree | 977798f989db9d3182f48807b43ba7269029d216 /Eigen/src/Core/products/GeneralMatrixMatrix.h | |
parent | a1e110332829a4bb38ca8e55608a2b048876018e (diff) |
implement a smarter parallelization strategy for gemm avoiding multiple
packing of the same data
Diffstat (limited to 'Eigen/src/Core/products/GeneralMatrixMatrix.h')
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixMatrix.h | 146 |
1 file changed, 109 insertions, 37 deletions
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 84429a0d9..ca3d1d200 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -39,7 +39,8 @@ struct ei_general_matrix_matrix_product<Scalar,LhsStorageOrder,ConjugateLhs,RhsS const Scalar* lhs, int lhsStride, const Scalar* rhs, int rhsStride, Scalar* res, int resStride, - Scalar alpha) + Scalar alpha, + GemmParallelInfo* info = 0) { // transpose the product such that the result is column major ei_general_matrix_matrix_product<Scalar, @@ -48,7 +49,7 @@ struct ei_general_matrix_matrix_product<Scalar,LhsStorageOrder,ConjugateLhs,RhsS LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs, ColMajor> - ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha); + ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,info); } }; @@ -64,7 +65,8 @@ static void run(int rows, int cols, int depth, const Scalar* _lhs, int lhsStride, const Scalar* _rhs, int rhsStride, Scalar* res, int resStride, - Scalar alpha) + Scalar alpha, + GemmParallelInfo* info = 0) { ei_const_blas_data_mapper<Scalar, LhsStorageOrder> lhs(_lhs,lhsStride); ei_const_blas_data_mapper<Scalar, RhsStorageOrder> rhs(_rhs,rhsStride); @@ -75,47 +77,114 @@ static void run(int rows, int cols, int depth, typedef typename ei_packet_traits<Scalar>::type PacketType; typedef ei_product_blocking_traits<Scalar> Blocking; - int kc = std::min<int>(Blocking::Max_kc,depth); // cache block size along the K direction - int mc = std::min<int>(Blocking::Max_mc,rows); // cache block size along the M direction +// int kc = std::min<int>(Blocking::Max_kc,depth); // cache block size along the K direction +// int mc = std::min<int>(Blocking::Max_mc,rows); // cache block size along the M direction - Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; - Scalar* 
allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); - Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; + int kc = std::min<int>(256,depth); // cache block size along the K direction + int mc = std::min<int>(512,rows); // cache block size along the M direction - // For each horizontal panel of the rhs, and corresponding panel of the lhs... - // (==GEMM_VAR1) - for(int k2=0; k2<depth; k2+=kc) + ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder> pack_rhs; + ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder> pack_lhs; + ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp; + + if(info) { - const int actual_kc = std::min(k2+kc,depth)-k2; + // this is the parallel version! + int tid = omp_get_thread_num(); + + Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); + std::size_t sizeW = kc*Blocking::PacketSize*Blocking::nr*8; + Scalar* w = ei_aligned_stack_new(Scalar, sizeW); + Scalar* blockB = (Scalar*)info[tid].blockB; - // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs. - // => Pack rhs's panel into a sequential chunk of memory (L2 caching) - // Note that this panel will be read as many times as the number of blocks in the lhs's - // vertical panel which is, in practice, a very low number. - ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder>()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols); + // if you have the GOTO blas library you can try our parallelization strategy + // using GOTO's optimized routines. +// #define USEGOTOROUTINES + #ifdef USEGOTOROUTINES + void* u = alloca(4096+sizeW); + #endif - // For each mc x kc block of the lhs's vertical panel... - // (==GEPP_VAR1) - for(int i2=0; i2<rows; i2+=mc) + // For each horizontal panel of the rhs, and corresponding panel of the lhs... 
+ // (==GEMM_VAR1) + for(int k=0; k<depth; k+=kc) { - const int actual_mc = std::min(i2+mc,rows)-i2; + #pragma omp barrier + const int actual_kc = std::min(k+kc,depth)-k; + + // pack B_k to B' in parallel fashion, + // each thread packs the B_k,j sub block to B'_j where j is the thread id + #ifndef USEGOTOROUTINES + pack_rhs(blockB+info[tid].rhs_start*kc, &rhs(k,info[tid].rhs_start), rhsStride, alpha, actual_kc, info[tid].rhs_length); + #else + sgemm_oncopy(actual_kc, info[tid].rhs_length, &rhs(k,info[tid].rhs_start), rhsStride, blockB+info[tid].rhs_start*kc); + #endif + + #pragma omp barrier - // We pack the lhs's block into a sequential chunk of memory (L1 caching) - // Note that this block will be read a very high number of times, which is equal to the number of - // micro vertical panel of the large rhs's panel (e.g., cols/4 times). - ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder>()(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); + for(int i=0; i<rows; i+=mc) + { + const int actual_mc = std::min(i+mc,rows)-i; - // Everything is packed, we can now call the block * panel kernel: - ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >() - (res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); + // pack A_i,k to A' + #ifndef USEGOTOROUTINES + pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc); + #else + sgemm_itcopy(actual_kc, actual_mc, &lhs(i,k), lhsStride, blockA); + #endif -// sgemm_kernel(actual_mc, cols, actual_kc, alpha, blockA, allocatedBlockB, res+i2, resStride); + // C_i += A' * B' + #ifndef USEGOTOROUTINES + gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, -1,-1,0,0, w); + #else + sgemm_kernel(actual_mc, cols, actual_kc, alpha, blockA, blockB, res+i, resStride); + #endif + } } + + ei_aligned_stack_delete(Scalar, blockA, kc*mc); + ei_aligned_stack_delete(Scalar, w, sizeW); } + else + { + // this is the sequential version! 
+ Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); + std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; + Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); + Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; + + // For each horizontal panel of the rhs, and corresponding panel of the lhs... + // (==GEMM_VAR1) + for(int k2=0; k2<depth; k2+=kc) + { + const int actual_kc = std::min(k2+kc,depth)-k2; - ei_aligned_stack_delete(Scalar, blockA, kc*mc); - ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB); + // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs. + // => Pack rhs's panel into a sequential chunk of memory (L2 caching) + // Note that this panel will be read as many times as the number of blocks in the lhs's + // vertical panel which is, in practice, a very low number. + pack_rhs(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols); + + + // For each mc x kc block of the lhs's vertical panel... + // (==GEPP_VAR1) + for(int i2=0; i2<rows; i2+=mc) + { + const int actual_mc = std::min(i2+mc,rows)-i2; + + // We pack the lhs's block into a sequential chunk of memory (L1 caching) + // Note that this block will be read a very high number of times, which is equal to the number of + // micro vertical panel of the large rhs's panel (e.g., cols/4 times). 
+ pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); + + // Everything is packed, we can now call the block * panel kernel: + gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); + + } + } + + ei_aligned_stack_delete(Scalar, blockA, kc*mc); + ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB); + } } }; @@ -139,15 +208,16 @@ struct ei_gemm_functor : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha) {} - void operator() (int col, int cols, int row=0, int rows=-1) const + void operator() (int row, int rows, int col=0, int cols=-1, GemmParallelInfo* info=0) const { - if(rows==-1) - rows = m_lhs.rows(); + if(cols==-1) + cols = m_rhs.cols(); Gemm::run(rows, cols, m_lhs.cols(), (const Scalar*)&(m_lhs.const_cast_derived().coeffRef(row,0)), m_lhs.stride(), (const Scalar*)&(m_rhs.const_cast_derived().coeffRef(0,col)), m_rhs.stride(), (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.stride(), - m_actualAlpha); + m_actualAlpha, + info); } protected: @@ -191,7 +261,9 @@ class GeneralProduct<Lhs, Rhs, GemmProduct> _ActualRhsType, Dest> Functor; - ei_run_parallel_2d<true>(Functor(lhs, rhs, dst, actualAlpha), this->cols(), this->rows()); +// ei_run_parallel_1d<true>(Functor(lhs, rhs, dst, actualAlpha), this->rows()); +// ei_run_parallel_2d<true>(Functor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols()); + ei_run_parallel_gemm<true>(Functor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols()); } }; |