aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/products/GeneralMatrixMatrix.h
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2010-02-26 12:32:00 +0100
committerGravatar Gael Guennebaud <g.gael@free.fr>2010-02-26 12:32:00 +0100
commit3ac2b96a2f131e8162d39f0976cfb31b1a853237 (patch)
tree977798f989db9d3182f48807b43ba7269029d216 /Eigen/src/Core/products/GeneralMatrixMatrix.h
parenta1e110332829a4bb38ca8e55608a2b048876018e (diff)
implement a smarter parallelization strategy for gemm avoiding multiple
packing of the same data
Diffstat (limited to 'Eigen/src/Core/products/GeneralMatrixMatrix.h')
-rw-r--r--Eigen/src/Core/products/GeneralMatrixMatrix.h146
1 file changed, 109 insertions(+), 37 deletions(-)
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 84429a0d9..ca3d1d200 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -39,7 +39,8 @@ struct ei_general_matrix_matrix_product<Scalar,LhsStorageOrder,ConjugateLhs,RhsS
const Scalar* lhs, int lhsStride,
const Scalar* rhs, int rhsStride,
Scalar* res, int resStride,
- Scalar alpha)
+ Scalar alpha,
+ GemmParallelInfo* info = 0)
{
// transpose the product such that the result is column major
ei_general_matrix_matrix_product<Scalar,
@@ -48,7 +49,7 @@ struct ei_general_matrix_matrix_product<Scalar,LhsStorageOrder,ConjugateLhs,RhsS
LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
ConjugateLhs,
ColMajor>
- ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha);
+ ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,info);
}
};
@@ -64,7 +65,8 @@ static void run(int rows, int cols, int depth,
const Scalar* _lhs, int lhsStride,
const Scalar* _rhs, int rhsStride,
Scalar* res, int resStride,
- Scalar alpha)
+ Scalar alpha,
+ GemmParallelInfo* info = 0)
{
ei_const_blas_data_mapper<Scalar, LhsStorageOrder> lhs(_lhs,lhsStride);
ei_const_blas_data_mapper<Scalar, RhsStorageOrder> rhs(_rhs,rhsStride);
@@ -75,47 +77,114 @@ static void run(int rows, int cols, int depth,
typedef typename ei_packet_traits<Scalar>::type PacketType;
typedef ei_product_blocking_traits<Scalar> Blocking;
- int kc = std::min<int>(Blocking::Max_kc,depth); // cache block size along the K direction
- int mc = std::min<int>(Blocking::Max_mc,rows); // cache block size along the M direction
+// int kc = std::min<int>(Blocking::Max_kc,depth); // cache block size along the K direction
+// int mc = std::min<int>(Blocking::Max_mc,rows); // cache block size along the M direction
- Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
- std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
- Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
- Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
+ int kc = std::min<int>(256,depth); // cache block size along the K direction
+ int mc = std::min<int>(512,rows); // cache block size along the M direction
- // For each horizontal panel of the rhs, and corresponding panel of the lhs...
- // (==GEMM_VAR1)
- for(int k2=0; k2<depth; k2+=kc)
+ ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder> pack_rhs;
+ ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder> pack_lhs;
+ ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp;
+
+ if(info)
{
- const int actual_kc = std::min(k2+kc,depth)-k2;
+ // this is the parallel version!
+ int tid = omp_get_thread_num();
+
+ Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
+ std::size_t sizeW = kc*Blocking::PacketSize*Blocking::nr*8;
+ Scalar* w = ei_aligned_stack_new(Scalar, sizeW);
+ Scalar* blockB = (Scalar*)info[tid].blockB;
- // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
- // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
- // Note that this panel will be read as many times as the number of blocks in the lhs's
- // vertical panel which is, in practice, a very low number.
- ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder>()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);
+ // if you have the GOTO blas library you can try our parallelization strategy
+ // using GOTO's optimized routines.
+// #define USEGOTOROUTINES
+ #ifdef USEGOTOROUTINES
+ void* u = alloca(4096+sizeW);
+ #endif
- // For each mc x kc block of the lhs's vertical panel...
- // (==GEPP_VAR1)
- for(int i2=0; i2<rows; i2+=mc)
+ // For each horizontal panel of the rhs, and corresponding panel of the lhs...
+ // (==GEMM_VAR1)
+ for(int k=0; k<depth; k+=kc)
{
- const int actual_mc = std::min(i2+mc,rows)-i2;
+ #pragma omp barrier
+ const int actual_kc = std::min(k+kc,depth)-k;
+
+ // pack B_k to B' in parallel fashion,
+ // each thread packs the B_k,j sub block to B'_j where j is the thread id
+ #ifndef USEGOTOROUTINES
+ pack_rhs(blockB+info[tid].rhs_start*kc, &rhs(k,info[tid].rhs_start), rhsStride, alpha, actual_kc, info[tid].rhs_length);
+ #else
+ sgemm_oncopy(actual_kc, info[tid].rhs_length, &rhs(k,info[tid].rhs_start), rhsStride, blockB+info[tid].rhs_start*kc);
+ #endif
+
+ #pragma omp barrier
- // We pack the lhs's block into a sequential chunk of memory (L1 caching)
- // Note that this block will be read a very high number of times, which is equal to the number of
- // micro vertical panel of the large rhs's panel (e.g., cols/4 times).
- ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder>()(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
+ for(int i=0; i<rows; i+=mc)
+ {
+ const int actual_mc = std::min(i+mc,rows)-i;
- // Everything is packed, we can now call the block * panel kernel:
- ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
- (res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
+ // pack A_i,k to A'
+ #ifndef USEGOTOROUTINES
+ pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc);
+ #else
+ sgemm_itcopy(actual_kc, actual_mc, &lhs(i,k), lhsStride, blockA);
+ #endif
-// sgemm_kernel(actual_mc, cols, actual_kc, alpha, blockA, allocatedBlockB, res+i2, resStride);
+ // C_i += A' * B'
+ #ifndef USEGOTOROUTINES
+ gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, -1,-1,0,0, w);
+ #else
+ sgemm_kernel(actual_mc, cols, actual_kc, alpha, blockA, blockB, res+i, resStride);
+ #endif
+ }
}
+
+ ei_aligned_stack_delete(Scalar, blockA, kc*mc);
+ ei_aligned_stack_delete(Scalar, w, sizeW);
}
+ else
+ {
+ // this is the sequential version!
+ Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
+ std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+ Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
+ Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
+
+ // For each horizontal panel of the rhs, and corresponding panel of the lhs...
+ // (==GEMM_VAR1)
+ for(int k2=0; k2<depth; k2+=kc)
+ {
+ const int actual_kc = std::min(k2+kc,depth)-k2;
- ei_aligned_stack_delete(Scalar, blockA, kc*mc);
- ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
+ // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
+ // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
+ // Note that this panel will be read as many times as the number of blocks in the lhs's
+ // vertical panel which is, in practice, a very low number.
+ pack_rhs(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);
+
+
+ // For each mc x kc block of the lhs's vertical panel...
+ // (==GEPP_VAR1)
+ for(int i2=0; i2<rows; i2+=mc)
+ {
+ const int actual_mc = std::min(i2+mc,rows)-i2;
+
+ // We pack the lhs's block into a sequential chunk of memory (L1 caching)
+ // Note that this block will be read a very high number of times, which is equal to the number of
+ // micro vertical panel of the large rhs's panel (e.g., cols/4 times).
+ pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
+
+ // Everything is packed, we can now call the block * panel kernel:
+ gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
+
+ }
+ }
+
+ ei_aligned_stack_delete(Scalar, blockA, kc*mc);
+ ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
+ }
}
};
@@ -139,15 +208,16 @@ struct ei_gemm_functor
: m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha)
{}
- void operator() (int col, int cols, int row=0, int rows=-1) const
+ void operator() (int row, int rows, int col=0, int cols=-1, GemmParallelInfo* info=0) const
{
- if(rows==-1)
- rows = m_lhs.rows();
+ if(cols==-1)
+ cols = m_rhs.cols();
Gemm::run(rows, cols, m_lhs.cols(),
(const Scalar*)&(m_lhs.const_cast_derived().coeffRef(row,0)), m_lhs.stride(),
(const Scalar*)&(m_rhs.const_cast_derived().coeffRef(0,col)), m_rhs.stride(),
(Scalar*)&(m_dest.coeffRef(row,col)), m_dest.stride(),
- m_actualAlpha);
+ m_actualAlpha,
+ info);
}
protected:
@@ -191,7 +261,9 @@ class GeneralProduct<Lhs, Rhs, GemmProduct>
_ActualRhsType,
Dest> Functor;
- ei_run_parallel_2d<true>(Functor(lhs, rhs, dst, actualAlpha), this->cols(), this->rows());
+// ei_run_parallel_1d<true>(Functor(lhs, rhs, dst, actualAlpha), this->rows());
+// ei_run_parallel_2d<true>(Functor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols());
+ ei_run_parallel_gemm<true>(Functor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols());
}
};