diff options
author | 2009-03-02 13:15:15 +0000 | |
---|---|---|
committer | 2009-03-02 13:15:15 +0000 | |
commit | ed134a0ce5670984b67c1797449b8d5b7a9a55f0 (patch) | |
tree | 599f37220472b37da4f273f2d1c627c315f17202 /Eigen | |
parent | 8ed186b9ab9c64a12688132764c49a3a889ad23e (diff) |
performance improvement: rewrite of the matrix-matrix product following
Goto's paper => x1.4 speedup with more consistent perf results
Diffstat (limited to 'Eigen')
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixMatrix.h | 542 |
1 files changed, 290 insertions, 252 deletions
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 3bf4a2e16..7ec33d7f4 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -69,298 +69,335 @@ static void ei_cache_friendly_product( typedef typename ei_packet_traits<Scalar>::type PacketType; + + +#ifndef EIGEN_USE_ALT_PRODUCT + enum { PacketSize = sizeof(PacketType)/sizeof(Scalar), #if (defined __i386__) - // i386 architecture provides only 8 xmm registers, - // so let's reduce the max number of rows processed at once. - MaxBlockRows = 4, - MaxBlockRows_ClampingMask = 0xFFFFFC, + HalfRegisterCount = 4, #else - MaxBlockRows = 8, - MaxBlockRows_ClampingMask = 0xFFFFF8, + HalfRegisterCount = 8, #endif - // maximal size of the blocks fitted in L2 cache - MaxL2BlockSize = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width - }; - const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0)); - - const int remainingSize = depth % PacketSize; - const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries - const int l2BlockRows = MaxL2BlockSize > rows ? rows : MaxL2BlockSize; - const int l2BlockCols = MaxL2BlockSize > cols ? cols : MaxL2BlockSize; - const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize; - const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize; - const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0)); - Scalar* EIGEN_RESTRICT block = 0; - const int allocBlockSize = l2BlockRows*size; - block = ei_aligned_stack_new(Scalar, allocBlockSize); - Scalar* EIGEN_RESTRICT rhsCopy - = ei_aligned_stack_new(Scalar, l2BlockSizeAligned*l2BlockSizeAligned); + // register block size along the N direction + nr = HalfRegisterCount/2, + + // register block size along the M direction + mr = 2 * PacketSize, + + // max cache block size along the K direction + Max_kc = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width, + + // max cache block size along the M direction + Max_mc = 2*Max_kc + }; -#ifndef EIGEN_USE_NEW_PRODUCT - // loops on each L2 cache friendly blocks of the result - for(int l2i=0; l2i<rows; l2i+=l2BlockRows) + int kc = std::min<int>(Max_kc,depth); // cache block size along the K direction + int mc = std::min<int>(Max_mc,rows); // cache block size along the M direction + + Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); + Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*PacketSize); + + // number of columns which can be processed by packet of nr columns + int packet_cols = (cols/nr)*nr; + + // GEMM_VAR1 + for(int k2=0; k2<depth; k2+=kc) { - const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows); - const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask; // end of the rows aligned to bw - const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW; // number of remaining rows - //const int l2blockRowEndBWPlusOne = l2blockRowEndBW + (l2blockRemainingRows?0:MaxBlockRows); - - // build a cache friendly blocky matrix - int count = 0; - - // copy l2blocksize rows of m_lhs to blocks of ps x bw - for(int l2k=0; l2k<size; l2k+=l2BlockSize) + const int actual_kc = std::min(k2+kc,depth)-k2; + + // we have selected one row panel of rhs and one column panel of lhs + // pack rhs's panel into a sequential chunk of memory + // and expand each coeff to a constant packet for further reuse { - const int l2blockSizeEnd = std::min(l2k+l2BlockSize, size); - - for (int i = l2i; i<l2blockRowEndBW/*PlusOne*/; i+=MaxBlockRows) + int count = 0; + for(int j2=0; j2<packet_cols; j2+=nr) { - // TODO merge the "if l2blockRemainingRows" using something like: - // const int blockRows = std::min(i+MaxBlockRows, rows) - i; - - for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize) + const Scalar* b0 = &rhs[(j2+0)*rhsStride + k2]; + const Scalar* b1 = &rhs[(j2+1)*rhsStride + k2]; + const Scalar* b2 = &rhs[(j2+2)*rhsStride + k2]; + const Scalar* b3 = &rhs[(j2+3)*rhsStride + k2]; + for(int k=0; k<actual_kc; k++) { - // TODO write these loops using meta unrolling - // negligible for large matrices but useful for small ones - if (lhsRowMajor) + ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k])); + ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k])); + if (nr==4) { - for (int w=0; w<MaxBlockRows; ++w) - for (int s=0; s<PacketSize; ++s) - block[count++] = lhs[(i+w)*lhsStride + (k+s)]; - } - else - { - for (int w=0; w<MaxBlockRows; ++w) - for (int s=0; s<PacketSize; ++s) - block[count++] = lhs[(i+w) + (k+s)*lhsStride]; + ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k])); + ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k])); } + count += nr*PacketSize; } } - if (l2blockRemainingRows>0) + } + + // => GEPP_VAR1 + for(int i2=0; i2<rows; i2+=mc) + { + const int actual_mc = std::min(i2+mc,rows)-i2; + + // We have selected a mc x kc block of lhs + // Let's pack it in a clever order for further purely sequential access + int count = 0; + const int peeled_mc = (actual_mc/mr)*mr; + if (lhsRowMajor) { - for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize) + for(int i=0; i<peeled_mc; i+=mr) + for(int k=0; k<actual_kc; k++) + for(int w=0; w<mr; w++) + blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride]; + for(int i=peeled_mc; i<actual_mc; i++) { - if (lhsRowMajor) - { - for (int w=0; w<l2blockRemainingRows; ++w) - for (int s=0; s<PacketSize; ++s) - block[count++] = lhs[(l2blockRowEndBW+w)*lhsStride + (k+s)]; - } - else - { - for (int w=0; w<l2blockRemainingRows; ++w) - for (int s=0; s<PacketSize; ++s) - block[count++] = lhs[(l2blockRowEndBW+w) + (k+s)*lhsStride]; - } + const Scalar* llhs = &lhs[(k2) + (i2+i)*lhsStride]; + for(int k=0; k<actual_kc; k++) + blockA[count++] = llhs[k]; } } - } - - for(int l2j=0; l2j<cols; l2j+=l2BlockCols) - { - int l2blockColEnd = std::min(l2j+l2BlockCols, cols); - - for(int l2k=0; l2k<size; l2k+=l2BlockSize) + else { - // acumulate bw rows of lhs time a single column of rhs to a bw x 1 block of res - int l2blockSizeEnd = std::min(l2k+l2BlockSize, size); + for(int i=0; i<peeled_mc; i+=mr) + for(int k=0; k<actual_kc; k++) + for(int w=0; w<mr; w++) + blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w]; + for(int i=peeled_mc; i<actual_mc; i++) + for(int k=0; k<actual_kc; k++) + blockA[count++] = lhs[(k2+k)*lhsStride + i2+i]; + } - // if not aligned, copy the rhs block - if (needRhsCopy) - for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1) + // GEBP + // loops on each cache friendly block of the result/rhs + for(int j2=0; j2<packet_cols; j2+=nr) + { + // loops on each register blocking of lhs/res + const int peeled_mc = (actual_mc/mr)*mr; + for(int i=0; i<peeled_mc; i+=mr) + { + const Scalar* blA = &blockA[i*actual_kc]; + #ifdef EIGEN_VECTORIZE_SSE + _mm_prefetch(&blA[0], _MM_HINT_T0); + #endif + + // TODO move the res loads to the stores + + // gets res block as register + PacketType C0, C1, C2, C3, C4, C5, C6, C7; + C0 = ei_ploadu(&res[(j2+0)*resStride + i2 + i]); + C1 = ei_ploadu(&res[(j2+1)*resStride + i2 + i]); + if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i2 + i]); + if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i2 + i]); + C4 = ei_ploadu(&res[(j2+0)*resStride + i2 + i + PacketSize]); + C5 = ei_ploadu(&res[(j2+1)*resStride + i2 + i + PacketSize]); + if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i2 + i + PacketSize]); + if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i2 + i + PacketSize]); + + // performs "inner" product + // TODO let's check wether the flowing peeled loop could not be + // optimized via optimal prefetching from one loop to the other + const Scalar* blB = &blockB[j2*actual_kc*PacketSize]; + const int peeled_kc = (actual_kc/4)*4; + for(int k=0; k<peeled_kc; k+=4) { - ei_internal_assert(l2BlockSizeAligned*(l1j-l2j)+(l2blockSizeEnd-l2k) < l2BlockSizeAligned*l2BlockSizeAligned); - memcpy(rhsCopy+l2BlockSizeAligned*(l1j-l2j),&(rhs[l1j*rhsStride+l2k]),(l2blockSizeEnd-l2k)*sizeof(Scalar)); + PacketType B0, B1, B2, B3, A0, A1; + + A0 = ei_pload(&blA[0*PacketSize]); + A1 = ei_pload(&blA[1*PacketSize]); + B0 = ei_pload(&blB[0*PacketSize]); + B1 = ei_pload(&blB[1*PacketSize]); + C0 = ei_pmadd(B0, A0, C0); + if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); + C4 = ei_pmadd(B0, A1, C4); + if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); + B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]); + C1 = ei_pmadd(B1, A0, C1); + C5 = ei_pmadd(B1, A1, C5); + B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]); + if(nr==4) C2 = ei_pmadd(B2, A0, C2); + if(nr==4) C6 = ei_pmadd(B2, A1, C6); + if(nr==4) B2 = ei_pload(&blB[6*PacketSize]); + if(nr==4) C3 = ei_pmadd(B3, A0, C3); + A0 = ei_pload(&blA[2*PacketSize]); + if(nr==4) C7 = ei_pmadd(B3, A1, C7); + A1 = ei_pload(&blA[3*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[7*PacketSize]); + C0 = ei_pmadd(B0, A0, C0); + C4 = ei_pmadd(B0, A1, C4); + B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]); + C1 = ei_pmadd(B1, A0, C1); + C5 = ei_pmadd(B1, A1, C5); + B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]); + if(nr==4) C2 = ei_pmadd(B2, A0, C2); + if(nr==4) C6 = ei_pmadd(B2, A1, C6); + if(nr==4) B2 = ei_pload(&blB[10*PacketSize]); + if(nr==4) C3 = ei_pmadd(B3, A0, C3); + A0 = ei_pload(&blA[4*PacketSize]); + if(nr==4) C7 = ei_pmadd(B3, A1, C7); + A1 = ei_pload(&blA[5*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[11*PacketSize]); + + C0 = ei_pmadd(B0, A0, C0); + C4 = ei_pmadd(B0, A1, C4); + B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]); + C1 = ei_pmadd(B1, A0, C1); + C5 = ei_pmadd(B1, A1, C5); + B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]); + if(nr==4) C2 = ei_pmadd(B2, A0, C2); + if(nr==4) C6 = ei_pmadd(B2, A1, C6); + if(nr==4) B2 = ei_pload(&blB[14*PacketSize]); + if(nr==4) C3 = ei_pmadd(B3, A0, C3); + A0 = ei_pload(&blA[6*PacketSize]); + if(nr==4) C7 = ei_pmadd(B3, A1, C7); + A1 = ei_pload(&blA[7*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[15*PacketSize]); + C0 = ei_pmadd(B0, A0, C0); + C4 = ei_pmadd(B0, A1, C4); + C1 = ei_pmadd(B1, A0, C1); + C5 = ei_pmadd(B1, A1, C5); + if(nr==4) C2 = ei_pmadd(B2, A0, C2); + if(nr==4) C6 = ei_pmadd(B2, A1, C6); + if(nr==4) C3 = ei_pmadd(B3, A0, C3); + if(nr==4) C7 = ei_pmadd(B3, A1, C7); + + blB += 4*nr*PacketSize; + blA += 4*mr; } - - // for each bw x 1 result's block - for(int l1i=l2i; l1i<l2blockRowEndBW; l1i+=MaxBlockRows) - { - int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*MaxBlockRows; - const Scalar* EIGEN_RESTRICT localB = &block[offsetblock]; - - for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1) + for(int k=peeled_kc; k<actual_kc; k++) { - const Scalar* EIGEN_RESTRICT rhsColumn; - if (needRhsCopy) - rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]); - else - rhsColumn = &(rhs[l1j*rhsStride]); - - PacketType dst[MaxBlockRows]; - dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.)); - if (MaxBlockRows==8) - dst[7] = dst[6] = dst[5] = dst[4] = dst[0]; - - PacketType tmp; - - for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize) - { - tmp = ei_ploadu(&rhsColumn[k]); - PacketType A0, A1, A2, A3, A4, A5; - A0 = ei_pload(localB + k*MaxBlockRows); - A1 = ei_pload(localB + k*MaxBlockRows+1*PacketSize); - A2 = ei_pload(localB + k*MaxBlockRows+2*PacketSize); - A3 = ei_pload(localB + k*MaxBlockRows+3*PacketSize); - if (MaxBlockRows==8) A4 = ei_pload(localB + k*MaxBlockRows+4*PacketSize); - if (MaxBlockRows==8) A5 = ei_pload(localB + k*MaxBlockRows+5*PacketSize); - dst[0] = ei_pmadd(tmp, A0, dst[0]); - if (MaxBlockRows==8) A0 = ei_pload(localB + k*MaxBlockRows+6*PacketSize); - dst[1] = ei_pmadd(tmp, A1, dst[1]); - if (MaxBlockRows==8) A1 = ei_pload(localB + k*MaxBlockRows+7*PacketSize); - dst[2] = ei_pmadd(tmp, A2, dst[2]); - dst[3] = ei_pmadd(tmp, A3, dst[3]); - if (MaxBlockRows==8) - { - dst[4] = ei_pmadd(tmp, A4, dst[4]); - dst[5] = ei_pmadd(tmp, A5, dst[5]); - dst[6] = ei_pmadd(tmp, A0, dst[6]); - dst[7] = ei_pmadd(tmp, A1, dst[7]); - } - } - - Scalar* EIGEN_RESTRICT localRes = &(res[l1i + l1j*resStride]); - - if (PacketSize>1 && resIsAligned) - { - // the result is aligned: let's do packet reduction - ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(&dst[0]))); - if (PacketSize==2) - ei_pstore(&(localRes[2]), ei_padd(ei_pload(&(localRes[2])), ei_preduxp(&(dst[2])))); - if (MaxBlockRows==8) - { - ei_pstore(&(localRes[4]), ei_padd(ei_pload(&(localRes[4])), ei_preduxp(&(dst[4])))); - if (PacketSize==2) - ei_pstore(&(localRes[6]), ei_padd(ei_pload(&(localRes[6])), ei_preduxp(&(dst[6])))); - } - } - else - { - // not aligned => per coeff packet reduction - localRes[0] += ei_predux(dst[0]); - localRes[1] += ei_predux(dst[1]); - localRes[2] += ei_predux(dst[2]); - localRes[3] += ei_predux(dst[3]); - if (MaxBlockRows==8) - { - localRes[4] += ei_predux(dst[4]); - localRes[5] += ei_predux(dst[5]); - localRes[6] += ei_predux(dst[6]); - localRes[7] += ei_predux(dst[7]); - } - } + PacketType B0, B1, B2, B3, A0, A1; + + A0 = ei_pload(&blA[0*PacketSize]); + A1 = ei_pload(&blA[1*PacketSize]); + B0 = ei_pload(&blB[0*PacketSize]); + B1 = ei_pload(&blB[1*PacketSize]); + C0 = ei_pmadd(B0, A0, C0); + if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); + C4 = ei_pmadd(B0, A1, C4); + if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); + C1 = ei_pmadd(B1, A0, C1); + C5 = ei_pmadd(B1, A1, C5); + if(nr==4) C2 = ei_pmadd(B2, A0, C2); + if(nr==4) C6 = ei_pmadd(B2, A1, C6); + if(nr==4) C3 = ei_pmadd(B3, A0, C3); + if(nr==4) C7 = ei_pmadd(B3, A1, C7); + + blB += nr*PacketSize; + blA += mr; } + + ei_pstoreu(&res[(j2+0)*resStride + i2 + i], C0); + ei_pstoreu(&res[(j2+1)*resStride + i2 + i], C1); + if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i], C2); + if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i], C3); + ei_pstoreu(&res[(j2+0)*resStride + i2 + i + PacketSize], C4); + ei_pstoreu(&res[(j2+1)*resStride + i2 + i + PacketSize], C5); + if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i + PacketSize], C6); + if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i + PacketSize], C7); } - if (l2blockRemainingRows>0) + for(int i=peeled_mc; i<actual_mc; i++) { - int offsetblock = l2k * (l2blockRowEnd-l2i) + (l2blockRowEndBW-l2i)*(l2blockSizeEnd-l2k) - l2k*l2blockRemainingRows; - const Scalar* localB = &block[offsetblock]; - - for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1) + const Scalar* blA = &blockA[i*actual_kc]; + #ifdef EIGEN_VECTORIZE_SSE + _mm_prefetch(&blA[0], _MM_HINT_T0); + #endif + + // gets a 1 x nr res block as registers + Scalar C0(0), C1(0), C2(0), C3(0); + const Scalar* blB = &blockB[j2*actual_kc*PacketSize]; + for(int k=0; k<actual_kc; k++) { - const Scalar* EIGEN_RESTRICT rhsColumn; - if (needRhsCopy) - rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]); - else - rhsColumn = &(rhs[l1j*rhsStride]); - - PacketType dst[MaxBlockRows]; - dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.)); - if (MaxBlockRows==8) - dst[7] = dst[6] = dst[5] = dst[4] = dst[0]; - - // let's declare a few other temporary registers - PacketType tmp; - - for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize) - { - tmp = ei_pload(&rhsColumn[k]); - - dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows ])), dst[0]); - if (l2blockRemainingRows>=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+ PacketSize])), dst[1]); - if (l2blockRemainingRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+2*PacketSize])), dst[2]); - if (l2blockRemainingRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+3*PacketSize])), dst[3]); - if (MaxBlockRows==8) - { - if (l2blockRemainingRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+4*PacketSize])), dst[4]); - if (l2blockRemainingRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+5*PacketSize])), dst[5]); - if (l2blockRemainingRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+6*PacketSize])), dst[6]); - if (l2blockRemainingRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+7*PacketSize])), dst[7]); - } - } - - Scalar* EIGEN_RESTRICT localRes = &(res[l2blockRowEndBW + l1j*resStride]); - - // process the remaining rows once at a time - localRes[0] += ei_predux(dst[0]); - if (l2blockRemainingRows>=2) localRes[1] += ei_predux(dst[1]); - if (l2blockRemainingRows>=3) localRes[2] += ei_predux(dst[2]); - if (l2blockRemainingRows>=4) localRes[3] += ei_predux(dst[3]); - if (MaxBlockRows==8) - { - if (l2blockRemainingRows>=5) localRes[4] += ei_predux(dst[4]); - if (l2blockRemainingRows>=6) localRes[5] += ei_predux(dst[5]); - if (l2blockRemainingRows>=7) localRes[6] += ei_predux(dst[6]); - if (l2blockRemainingRows>=8) localRes[7] += ei_predux(dst[7]); - } - + Scalar B0, B1, B2, B3, A0; + + A0 = blA[k]; + B0 = blB[0*PacketSize]; + B1 = blB[1*PacketSize]; + C0 += B0 * A0; + if(nr==4) B2 = blB[2*PacketSize]; + if(nr==4) B3 = blB[3*PacketSize]; + C1 += B1 * A0; + if(nr==4) C2 += B2 * A0; + if(nr==4) C3 += B3 * A0; + + blB += nr*PacketSize; } + res[(j2+0)*resStride + i2 + i] += C0; + res[(j2+1)*resStride + i2 + i] += C1; + if(nr==4) res[(j2+2)*resStride + i2 + i] += C2; + if(nr==4) res[(j2+3)*resStride + i2 + i] += C3; } } - } - } - if (PacketSize>1 && remainingSize) - { - if (lhsRowMajor) - { - for (int j=0; j<cols; ++j) - for (int i=0; i<rows; ++i) - { - Scalar tmp = lhs[i*lhsStride+size] * rhs[j*rhsStride+size]; - // FIXME this loop get vectorized by the compiler ! - for (int k=1; k<remainingSize; ++k) - tmp += lhs[i*lhsStride+size+k] * rhs[j*rhsStride+size+k]; - res[i+j*resStride] += tmp; - } - } - else - { - for (int j=0; j<cols; ++j) - for (int i=0; i<rows; ++i) + // remaining rhs/res columns (<nr) + for(int j2=packet_cols; j2<cols; j2++) + { + for(int i=0; i<actual_mc; i++) { - Scalar tmp = lhs[i+size*lhsStride] * rhs[j*rhsStride+size]; - for (int k=1; k<remainingSize; ++k) - tmp += lhs[i+(size+k)*lhsStride] * rhs[j*rhsStride+size+k]; - res[i+j*resStride] += tmp; + Scalar c0 = res[(j2)*resStride + i2+i]; + if (lhsRowMajor) + for(int k=0; k<actual_kc; k++) + c0 += lhs[(k2+k)+(i2+i)*lhsStride] * rhs[j2*rhsStride + k2 + k]; + else + for(int k=0; k<actual_kc; k++) + c0 += lhs[(k2+k)*lhsStride + i2+i] * rhs[j2*rhsStride + k2 + k]; + res[(j2)*resStride + i2+i] = c0; } + } } } + + ei_aligned_stack_delete(Scalar, blockA, kc*mc); + ei_aligned_stack_delete(Scalar, blockB, kc*cols*PacketSize); + +#else // alternate product from cylmor -#else - // loops on each L2 cache friendly blocks of the result + enum { + PacketSize = sizeof(PacketType)/sizeof(Scalar), + #if (defined __i386__) + // i386 architecture provides only 8 xmm registers, + // so let's reduce the max number of rows processed at once. + MaxBlockRows = 4, + MaxBlockRows_ClampingMask = 0xFFFFFC, + #else + MaxBlockRows = 8, + MaxBlockRows_ClampingMask = 0xFFFFF8, + #endif + // maximal size of the blocks fitted in L2 cache + MaxL2BlockSize = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width + }; + + const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0)); - for(int l2j=0; l2j<cols; l2j+=l2BlockCols) + const int remainingSize = depth % PacketSize; + const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries + + const int l2BlockRows = MaxL2BlockSize > rows ? rows : 512; + const int l2BlockCols = MaxL2BlockSize > cols ? cols : 128; + const int l2BlockSize = MaxL2BlockSize > size ? size : 256; + const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize; + const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0)); + + Scalar* EIGEN_RESTRICT block = new Scalar[l2BlockRows*size]; +// for(int i=0; i<l2BlockRows*l2BlockSize; ++i) +// block[i] = 0; + // loops on each L2 cache friendly blocks of lhs + for(int l2k=0; l2k<depth; l2k+=l2BlockSize) { for(int l2i=0; l2i<rows; l2i+=l2BlockRows) { // We have selected a block of lhs // Packs this block into 'block' int count = 0; - for(int j=0; j<l2BlockCols; j+=MaxBlockRows) + for(int k=0; k<l2BlockSize; k+=MaxBlockRows) { for(int i=0; i<l2BlockRows; i+=2*PacketSize) for (int w=0; w<MaxBlockRows; ++w) for (int y=0; y<2*PacketSize; ++y) - block[count++] = lhs[(j+l2j+w)*rows + l2i+i+ y]; + block[count++] = lhs[(k+l2k+w)*lhsStride + l2i+i+ y]; } // loops on each L2 cache firendly block of the result/rhs - for(int l2k=0; l2k<cols; l2k+=l2BlockCols) + for(int l2j=0; l2j<cols; l2j+=l2BlockCols) { - for(int i=0; i<l2BlockRows; i+=MaxBlockRows) + for(int k=0; k<l2BlockSize; k+=MaxBlockRows) { for(int j=0; j<l2BlockCols; ++j) { @@ -370,7 +407,7 @@ static void ei_cache_friendly_product( // Here we need some vector reordering // Right now its hardcoded to packets of 4 elements - const Scalar* lrhs = &rhs[(j+l2k)*rows+(i+l2j)]; + const Scalar* lrhs = &rhs[(j+l2j)*rhsStride+(k+l2k)]; A0 = ei_pset1(lrhs[0]); A1 = ei_pset1(lrhs[1]); A2 = ei_pset1(lrhs[2]); @@ -383,16 +420,17 @@ static void ei_cache_friendly_product( A7 = ei_pset1(lrhs[7]); } - Scalar * lb = &block[l2BlockRows * i]; - for(int k=0; k<l2BlockRows; k+=2*PacketSize) + Scalar * lb = &block[l2BlockRows * k]; + for(int i=0; i<l2BlockRows; i+=2*PacketSize) { PacketType R0, R1, L0, L1, T0, T1; asm("#begin sgemm"); +// asm(".byte 0x66;"); // We perform "cross products" of vectors to avoid // reductions (horizontal ops) afterwards - T0 = ei_pload(&res[(j+l2k)*rows+l2i+k]); - T1 = ei_pload(&res[(j+l2k)*rows+l2i+k+PacketSize]); + T0 = ei_pload(&res[(j+l2j)*resStride+l2i+i]); + T1 = ei_pload(&res[(j+l2j)*resStride+l2i+i+PacketSize]); // uncomment to remove res cache miss // T0 = ei_pload(&res[k]); // T1 = ei_pload(&res[k+PacketSize]); @@ -437,11 +475,11 @@ static void ei_cache_friendly_product( } lb += MaxBlockRows*2*PacketSize; - ei_pstore(&res[(j+l2k)*rows+l2i+k], T0); - ei_pstore(&res[(j+l2k)*rows+l2i+k+PacketSize], T1); + ei_pstore(&res[(j+l2j)*resStride+l2i+i], T0); + ei_pstore(&res[(j+l2j)*resStride+l2i+i+PacketSize], T1); // uncomment to remove res cache miss -// ei_pstore(&res[k], T0); -// ei_pstore(&res[k+PacketSize], T1); +// ei_pstore(&res[0], T0); +// ei_pstore(&res[4/*k+PacketSize*/], T1); asm("#end sgemm"); } } @@ -449,10 +487,10 @@ static void ei_cache_friendly_product( } } } + delete[] block; #endif - ei_aligned_stack_delete(Scalar, block, allocBlockSize); - ei_aligned_stack_delete(Scalar, rhsCopy, l2BlockSizeAligned*l2BlockSizeAligned); + } #endif // EIGEN_EXTERN_INSTANTIATIONS |