From ed134a0ce5670984b67c1797449b8d5b7a9a55f0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 2 Mar 2009 13:15:15 +0000 Subject: performance improvement: rewrite of the matrix-matrix product following Goto's paper => x1.4 speedup with more consistent perf results --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 542 ++++++++++++++------------ 1 file changed, 290 insertions(+), 252 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 3bf4a2e16..7ec33d7f4 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -69,298 +69,335 @@ static void ei_cache_friendly_product( typedef typename ei_packet_traits::type PacketType; + + +#ifndef EIGEN_USE_ALT_PRODUCT + enum { PacketSize = sizeof(PacketType)/sizeof(Scalar), #if (defined __i386__) - // i386 architecture provides only 8 xmm registers, - // so let's reduce the max number of rows processed at once. - MaxBlockRows = 4, - MaxBlockRows_ClampingMask = 0xFFFFFC, + HalfRegisterCount = 4, #else - MaxBlockRows = 8, - MaxBlockRows_ClampingMask = 0xFFFFF8, + HalfRegisterCount = 8, #endif - // maximal size of the blocks fitted in L2 cache - MaxL2BlockSize = ei_L2_block_traits::width - }; - const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0)); - - const int remainingSize = depth % PacketSize; - const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries - const int l2BlockRows = MaxL2BlockSize > rows ? rows : MaxL2BlockSize; - const int l2BlockCols = MaxL2BlockSize > cols ? cols : MaxL2BlockSize; - const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize; - const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize; - const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0)); - Scalar* EIGEN_RESTRICT block = 0; - const int allocBlockSize = l2BlockRows*size; - block = ei_aligned_stack_new(Scalar, allocBlockSize); - Scalar* EIGEN_RESTRICT rhsCopy - = ei_aligned_stack_new(Scalar, l2BlockSizeAligned*l2BlockSizeAligned); + // register block size along the N direction + nr = HalfRegisterCount/2, + + // register block size along the M direction + mr = 2 * PacketSize, + + // max cache block size along the K direction + Max_kc = ei_L2_block_traits::width, + + // max cache block size along the M direction + Max_mc = 2*Max_kc + }; -#ifndef EIGEN_USE_NEW_PRODUCT - // loops on each L2 cache friendly blocks of the result - for(int l2i=0; l2i(Max_kc,depth); // cache block size along the K direction + int mc = std::min(Max_mc,rows); // cache block size along the M direction + + Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); + Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*PacketSize); + + // number of columns which can be processed by packet of nr columns + int packet_cols = (cols/nr)*nr; + + // GEMM_VAR1 + for(int k2=0; k20) + } + + // => GEPP_VAR1 + for(int i2=0; i21 && resIsAligned) - { - // the result is aligned: let's do packet reduction - ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(&dst[0]))); - if (PacketSize==2) - ei_pstore(&(localRes[2]), ei_padd(ei_pload(&(localRes[2])), ei_preduxp(&(dst[2])))); - if (MaxBlockRows==8) - { - ei_pstore(&(localRes[4]), ei_padd(ei_pload(&(localRes[4])), ei_preduxp(&(dst[4])))); - if (PacketSize==2) - ei_pstore(&(localRes[6]), ei_padd(ei_pload(&(localRes[6])), ei_preduxp(&(dst[6])))); - } - } - else - { - // not aligned => per coeff packet reduction - localRes[0] += ei_predux(dst[0]); - localRes[1] += ei_predux(dst[1]); - localRes[2] += ei_predux(dst[2]); - localRes[3] += ei_predux(dst[3]); - if (MaxBlockRows==8) - { - localRes[4] += ei_predux(dst[4]); - localRes[5] += ei_predux(dst[5]); - localRes[6] += ei_predux(dst[6]); - localRes[7] += ei_predux(dst[7]); - } - } + PacketType B0, B1, B2, B3, A0, A1; + + A0 = ei_pload(&blA[0*PacketSize]); + A1 = ei_pload(&blA[1*PacketSize]); + B0 = ei_pload(&blB[0*PacketSize]); + B1 = ei_pload(&blB[1*PacketSize]); + C0 = ei_pmadd(B0, A0, C0); + if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); + C4 = ei_pmadd(B0, A1, C4); + if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); + C1 = ei_pmadd(B1, A0, C1); + C5 = ei_pmadd(B1, A1, C5); + if(nr==4) C2 = ei_pmadd(B2, A0, C2); + if(nr==4) C6 = ei_pmadd(B2, A1, C6); + if(nr==4) C3 = ei_pmadd(B3, A0, C3); + if(nr==4) C7 = ei_pmadd(B3, A1, C7); + + blB += nr*PacketSize; + blA += mr; } + + ei_pstoreu(&res[(j2+0)*resStride + i2 + i], C0); + ei_pstoreu(&res[(j2+1)*resStride + i2 + i], C1); + if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i], C2); + if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i], C3); + ei_pstoreu(&res[(j2+0)*resStride + i2 + i + PacketSize], C4); + ei_pstoreu(&res[(j2+1)*resStride + i2 + i + PacketSize], C5); + if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i + PacketSize], C6); + if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i + PacketSize], C7); } - if (l2blockRemainingRows>0) + for(int i=peeled_mc; i=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+ PacketSize])), dst[1]); - if (l2blockRemainingRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+2*PacketSize])), dst[2]); - if (l2blockRemainingRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+3*PacketSize])), dst[3]); - if (MaxBlockRows==8) - { - if (l2blockRemainingRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+4*PacketSize])), dst[4]); - if (l2blockRemainingRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+5*PacketSize])), dst[5]); - if (l2blockRemainingRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+6*PacketSize])), dst[6]); - if (l2blockRemainingRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+7*PacketSize])), dst[7]); - } - } - - Scalar* EIGEN_RESTRICT localRes = &(res[l2blockRowEndBW + l1j*resStride]); - - // process the remaining rows once at a time - localRes[0] += ei_predux(dst[0]); - if (l2blockRemainingRows>=2) localRes[1] += ei_predux(dst[1]); - if (l2blockRemainingRows>=3) localRes[2] += ei_predux(dst[2]); - if (l2blockRemainingRows>=4) localRes[3] += ei_predux(dst[3]); - if (MaxBlockRows==8) - { - if (l2blockRemainingRows>=5) localRes[4] += ei_predux(dst[4]); - if (l2blockRemainingRows>=6) localRes[5] += ei_predux(dst[5]); - if (l2blockRemainingRows>=7) localRes[6] += ei_predux(dst[6]); - if (l2blockRemainingRows>=8) localRes[7] += ei_predux(dst[7]); - } - + Scalar B0, B1, B2, B3, A0; + + A0 = blA[k]; + B0 = blB[0*PacketSize]; + B1 = blB[1*PacketSize]; + C0 += B0 * A0; + if(nr==4) B2 = blB[2*PacketSize]; + if(nr==4) B3 = blB[3*PacketSize]; + C1 += B1 * A0; + if(nr==4) C2 += B2 * A0; + if(nr==4) C3 += B3 * A0; + + blB += nr*PacketSize; } + res[(j2+0)*resStride + i2 + i] += C0; + res[(j2+1)*resStride + i2 + i] += C1; + if(nr==4) res[(j2+2)*resStride + i2 + i] += C2; + if(nr==4) res[(j2+3)*resStride + i2 + i] += C3; } } - } - } - if (PacketSize>1 && remainingSize) - { - if (lhsRowMajor) - { - for (int j=0; j::width + }; + + const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0)); - for(int l2j=0; l2j rows ? rows : 512; + const int l2BlockCols = MaxL2BlockSize > cols ? cols : 128; + const int l2BlockSize = MaxL2BlockSize > size ? size : 256; + const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize; + const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0)); + + Scalar* EIGEN_RESTRICT block = new Scalar[l2BlockRows*size]; +// for(int i=0; i