aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/products
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2012-01-21 19:34:28 +0100
committerGravatar Gael Guennebaud <g.gael@free.fr>2012-01-21 19:34:28 +0100
commit238999045c0c870834620d9d86a0254d2dd2f6d6 (patch)
tree1c0a54c666e1285b0639fb7d166ea718cc5b313f /Eigen/src/Core/products
parent0e1e0a2a585f9c16719bdb412b9f4bd8a734ca05 (diff)
optimize the packing of lhs blocks for matrix-matrix products => significant speedup for small products
Diffstat (limited to 'Eigen/src/Core/products')
-rw-r--r--Eigen/src/Core/products/GeneralBlockPanelKernel.h57
1 files changed, 49 insertions, 8 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 68fd7678e..c591cc1f9 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -1103,7 +1103,7 @@ EIGEN_ASM_COMMENT("mybegin4");
#undef CJMADD
// pack a block of the lhs
-// The travesal is as follow (mr==4):
+// The traversal is as follow (mr==4):
// 0 4 8 12 ...
// 1 5 9 13 ...
// 2 6 10 14 ...
@@ -1119,11 +1119,15 @@ EIGEN_ASM_COMMENT("mybegin4");
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs
{
- void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows,
+ EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows,
Index stride=0, Index offset=0)
{
-// enum { PacketSize = packet_traits<Scalar>::size };
+ typedef typename packet_traits<Scalar>::type Packet;
+ enum { PacketSize = packet_traits<Scalar>::size };
+
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+ eigen_assert( (StorageOrder==RowMajor) || (Pack1%PacketSize)==0 && Pack1<=4*PacketSize);
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
Index count = 0;
@@ -1131,9 +1135,44 @@ struct gemm_pack_lhs
for(Index i=0; i<peeled_mc; i+=Pack1)
{
if(PanelMode) count += Pack1 * offset;
- for(Index k=0; k<depth; k++)
- for(Index w=0; w<Pack1; w++)
- blockA[count++] = cj(lhs(i+w, k));
+
+ if(StorageOrder==ColMajor)
+ {
+ for(Index k=0; k<depth; k++)
+ {
+ Packet A, B, C, D;
+ if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
+ if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
+ if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
+ if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k));
+ if(Pack1>=1*PacketSize) { pstore(blockA+count, cj(A)); count+=PacketSize; }
+ if(Pack1>=2*PacketSize) { pstore(blockA+count, cj(B)); count+=PacketSize; }
+ if(Pack1>=3*PacketSize) { pstore(blockA+count, cj(C)); count+=PacketSize; }
+ if(Pack1>=4*PacketSize) { pstore(blockA+count, cj(D)); count+=PacketSize; }
+ }
+ }
+ else
+ {
+ for(Index k=0; k<depth; k++)
+ {
+ // TODO add a vectorized transpose here
+ Index w=0;
+ for(; w<Pack1-3; w+=4)
+ {
+ Scalar a(cj(lhs(i+w+0, k))),
+ b(cj(lhs(i+w+1, k))),
+ c(cj(lhs(i+w+2, k))),
+ d(cj(lhs(i+w+3, k)));
+ blockA[count++] = a;
+ blockA[count++] = b;
+ blockA[count++] = c;
+ blockA[count++] = d;
+ }
+ if(Pack1%4)
+ for(;w<Pack1;++w)
+ blockA[count++] = cj(lhs(i+w, k));
+ }
+ }
if(PanelMode) count += Pack1 * (stride-offset-depth);
}
if(rows-peeled_mc>=Pack2)
@@ -1167,9 +1206,10 @@ struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
{
typedef typename packet_traits<Scalar>::type Packet;
enum { PacketSize = packet_traits<Scalar>::size };
- void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
Index stride=0, Index offset=0)
{
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
Index packet_cols = (cols/nr) * nr;
@@ -1214,9 +1254,10 @@ template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode
struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
{
enum { PacketSize = packet_traits<Scalar>::size };
- void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
Index stride=0, Index offset=0)
{
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
Index packet_cols = (cols/nr) * nr;