diff options
author | Gael Guennebaud <g.gael@free.fr> | 2009-07-21 16:58:35 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2009-07-21 16:58:35 +0200 |
commit | d6627d540e6a8651ccd8ce4a4520b70fe5def870 (patch) | |
tree | 0b33c593046a5aa7c1a76c42cdd749a3938870ba /Eigen/src/Core | |
parent | afa8f2ca952e52201b36813b848aa7a68fca70e9 (diff) |
* refactoring of the matrix product into multiple small kernels
* started an efficient selfadjoint matrix * general matrix product
based on the generic kernels (=> requires very little additional code)
Diffstat (limited to 'Eigen/src/Core')
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixMatrix.h | 667 | ||||
-rw-r--r-- | Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 184 |
2 files changed, 494 insertions, 357 deletions
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index fe3e877e1..30fa4aa66 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -25,6 +25,15 @@ #ifndef EIGEN_GENERAL_MATRIX_MATRIX_H #define EIGEN_GENERAL_MATRIX_MATRIX_H +template<typename Scalar, typename Packet, int PacketSize, int mr, int nr, typename Conj> +struct ei_gebp_kernel; + +template<typename Scalar, int PacketSize, int nr> +struct ei_gemm_pack_rhs; + +template<typename Scalar, int mr> +struct ei_gemm_pack_lhs; + template <int L2MemorySize,typename Scalar> struct ei_L2_block_traits { enum {width = 8 * ei_meta_sqrt<L2MemorySize/(64*sizeof(Scalar))>::ret }; @@ -124,10 +133,6 @@ static void ei_cache_friendly_product( typedef typename ei_packet_traits<Scalar>::type PacketType; - - -#ifndef EIGEN_USE_ALT_PRODUCT - enum { PacketSize = sizeof(PacketType)/sizeof(Scalar), #if (defined __i386__) @@ -166,398 +171,346 @@ static void ei_cache_friendly_product( // we have selected one row panel of rhs and one column panel of lhs // pack rhs's panel into a sequential chunk of memory // and expand each coeff to a constant packet for further reuse - { - int count = 0; - for(int j2=0; j2<packet_cols; j2+=nr) - { - const Scalar* b0 = &rhs[(j2+0)*rhsStride + k2]; - const Scalar* b1 = &rhs[(j2+1)*rhsStride + k2]; - const Scalar* b2 = &rhs[(j2+2)*rhsStride + k2]; - const Scalar* b3 = &rhs[(j2+3)*rhsStride + k2]; - if (hasAlpha) - { - for(int k=0; k<actual_kc; k++) - { - ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k])); - ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k])); - if (nr==4) - { - ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k])); - ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k])); - } - count += nr*PacketSize; - } - } - else - { - for(int k=0; k<actual_kc; k++) - { - ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k])); - 
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k])); - if (nr==4) - { - ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k])); - ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k])); - } - count += nr*PacketSize; - } - } - } - } + ei_gemm_pack_rhs<Scalar,PacketSize,nr>()(blockB, rhs, rhsStride, hasAlpha, alpha, actual_kc, packet_cols, k2, cols); // => GEPP_VAR1 for(int i2=0; i2<rows; i2+=mc) { const int actual_mc = std::min(i2+mc,rows)-i2; - // We have selected a mc x kc block of lhs - // Let's pack it in a clever order for further purely sequential access - int count = 0; - const int peeled_mc = (actual_mc/mr)*mr; - if (lhsRowMajor) - { - for(int i=0; i<peeled_mc; i+=mr) - for(int k=0; k<actual_kc; k++) - for(int w=0; w<mr; w++) - blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride]; - for(int i=peeled_mc; i<actual_mc; i++) - { - const Scalar* llhs = &lhs[(k2) + (i2+i)*lhsStride]; - for(int k=0; k<actual_kc; k++) - blockA[count++] = llhs[k]; - } - } - else - { - for(int i=0; i<peeled_mc; i+=mr) - for(int k=0; k<actual_kc; k++) - for(int w=0; w<mr; w++) - blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w]; - for(int i=peeled_mc; i<actual_mc; i++) - for(int k=0; k<actual_kc; k++) - blockA[count++] = lhs[(k2+k)*lhsStride + i2+i]; - } + ei_gemm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, lhsRowMajor, actual_kc, actual_mc, k2, i2); + + ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >() + (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols); + } + } + + ei_aligned_stack_delete(Scalar, blockA, kc*mc); + ei_aligned_stack_delete(Scalar, blockB, kc*cols*PacketSize); +} - // GEBP - // loops on each cache friendly block of the result/rhs - for(int j2=0; j2<packet_cols; j2+=nr) +// optimized GEneral packed Block * packed Panel product kernel +template<typename Scalar, typename PacketType, int PacketSize, int mr, int nr, typename Conj> +struct ei_gebp_kernel +{ + void operator()(Scalar* res, int 
resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols, int i2, int cols) + { + Conj cj; + const int peeled_mc = (actual_mc/mr)*mr; + // loops on each cache friendly block of the result/rhs + for(int j2=0; j2<packet_cols; j2+=nr) + { + // loops on each register blocking of lhs/res + for(int i=0; i<peeled_mc; i+=mr) { - // loops on each register blocking of lhs/res - const int peeled_mc = (actual_mc/mr)*mr; - for(int i=0; i<peeled_mc; i+=mr) + const Scalar* blA = &blockA[i*actual_kc]; + #ifdef EIGEN_VECTORIZE_SSE + _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); + #endif + + // TODO move the res loads to the stores + + // gets res block as register + PacketType C0, C1, C2, C3, C4, C5, C6, C7; + C0 = ei_ploadu(&res[(j2+0)*resStride + i2 + i]); + C1 = ei_ploadu(&res[(j2+1)*resStride + i2 + i]); + if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i2 + i]); + if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i2 + i]); + C4 = ei_ploadu(&res[(j2+0)*resStride + i2 + i + PacketSize]); + C5 = ei_ploadu(&res[(j2+1)*resStride + i2 + i + PacketSize]); + if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i2 + i + PacketSize]); + if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i2 + i + PacketSize]); + + // performs "inner" product + // TODO let's check wether the flowing peeled loop could not be + // optimized via optimal prefetching from one loop to the other + const Scalar* blB = &blockB[j2*actual_kc*PacketSize]; + const int peeled_kc = (actual_kc/4)*4; + for(int k=0; k<peeled_kc; k+=4) { - const Scalar* blA = &blockA[i*actual_kc]; - #ifdef EIGEN_VECTORIZE_SSE - _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); - #endif - - // TODO move the res loads to the stores - - // gets res block as register - PacketType C0, C1, C2, C3, C4, C5, C6, C7; - C0 = ei_ploadu(&res[(j2+0)*resStride + i2 + i]); - C1 = ei_ploadu(&res[(j2+1)*resStride + i2 + i]); - if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i2 + i]); - if(nr==4) C3 = 
ei_ploadu(&res[(j2+3)*resStride + i2 + i]); - C4 = ei_ploadu(&res[(j2+0)*resStride + i2 + i + PacketSize]); - C5 = ei_ploadu(&res[(j2+1)*resStride + i2 + i + PacketSize]); - if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i2 + i + PacketSize]); - if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i2 + i + PacketSize]); - - // performs "inner" product - // TODO let's check wether the flowing peeled loop could not be - // optimized via optimal prefetching from one loop to the other - const Scalar* blB = &blockB[j2*actual_kc*PacketSize]; - const int peeled_kc = (actual_kc/4)*4; - for(int k=0; k<peeled_kc; k+=4) - { - PacketType B0, B1, B2, B3, A0, A1; - - A0 = ei_pload(&blA[0*PacketSize]); - A1 = ei_pload(&blA[1*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - B1 = ei_pload(&blB[1*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); - C4 = cj.pmadd(A1, B0, C4); - if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); - B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) B2 = ei_pload(&blB[6*PacketSize]); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - A0 = ei_pload(&blA[2*PacketSize]); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); - A1 = ei_pload(&blA[3*PacketSize]); - if(nr==4) B3 = ei_pload(&blB[7*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - C4 = cj.pmadd(A1, B0, C4); - B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - B1 = ei_pload(&blB[(nr==4 ? 
9 : 5)*PacketSize]); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) B2 = ei_pload(&blB[10*PacketSize]); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - A0 = ei_pload(&blA[4*PacketSize]); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); - A1 = ei_pload(&blA[5*PacketSize]); - if(nr==4) B3 = ei_pload(&blB[11*PacketSize]); - - C0 = cj.pmadd(A0, B0, C0); - C4 = cj.pmadd(A1, B0, C4); - B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) B2 = ei_pload(&blB[14*PacketSize]); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - A0 = ei_pload(&blA[6*PacketSize]); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); - A1 = ei_pload(&blA[7*PacketSize]); - if(nr==4) B3 = ei_pload(&blB[15*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - C4 = cj.pmadd(A1, B0, C4); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); - - blB += 4*nr*PacketSize; - blA += 4*mr; - } - // process remaining peeled loop - for(int k=peeled_kc; k<actual_kc; k++) - { - PacketType B0, B1, B2, B3, A0, A1; - - A0 = ei_pload(&blA[0*PacketSize]); - A1 = ei_pload(&blA[1*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - B1 = ei_pload(&blB[1*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); - C4 = cj.pmadd(A1, B0, C4); - if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); - - blB += nr*PacketSize; - blA += mr; - } - - ei_pstoreu(&res[(j2+0)*resStride + i2 + i], C0); - ei_pstoreu(&res[(j2+1)*resStride + i2 + i], C1); - 
if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i], C2); - if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i], C3); - ei_pstoreu(&res[(j2+0)*resStride + i2 + i + PacketSize], C4); - ei_pstoreu(&res[(j2+1)*resStride + i2 + i + PacketSize], C5); - if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i + PacketSize], C6); - if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i + PacketSize], C7); + PacketType B0, B1, B2, B3, A0, A1; + + A0 = ei_pload(&blA[0*PacketSize]); + A1 = ei_pload(&blA[1*PacketSize]); + B0 = ei_pload(&blB[0*PacketSize]); + B1 = ei_pload(&blB[1*PacketSize]); + C0 = cj.pmadd(A0, B0, C0); + if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); + C4 = cj.pmadd(A1, B0, C4); + if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); + B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]); + C1 = cj.pmadd(A0, B1, C1); + C5 = cj.pmadd(A1, B1, C5); + B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]); + if(nr==4) C2 = cj.pmadd(A0, B2, C2); + if(nr==4) C6 = cj.pmadd(A1, B2, C6); + if(nr==4) B2 = ei_pload(&blB[6*PacketSize]); + if(nr==4) C3 = cj.pmadd(A0, B3, C3); + A0 = ei_pload(&blA[2*PacketSize]); + if(nr==4) C7 = cj.pmadd(A1, B3, C7); + A1 = ei_pload(&blA[3*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[7*PacketSize]); + C0 = cj.pmadd(A0, B0, C0); + C4 = cj.pmadd(A1, B0, C4); + B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]); + C1 = cj.pmadd(A0, B1, C1); + C5 = cj.pmadd(A1, B1, C5); + B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]); + if(nr==4) C2 = cj.pmadd(A0, B2, C2); + if(nr==4) C6 = cj.pmadd(A1, B2, C6); + if(nr==4) B2 = ei_pload(&blB[10*PacketSize]); + if(nr==4) C3 = cj.pmadd(A0, B3, C3); + A0 = ei_pload(&blA[4*PacketSize]); + if(nr==4) C7 = cj.pmadd(A1, B3, C7); + A1 = ei_pload(&blA[5*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[11*PacketSize]); + + C0 = cj.pmadd(A0, B0, C0); + C4 = cj.pmadd(A1, B0, C4); + B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]); + C1 = cj.pmadd(A0, B1, C1); + C5 = cj.pmadd(A1, B1, C5); + B1 = ei_pload(&blB[(nr==4 ? 
13 : 7)*PacketSize]); + if(nr==4) C2 = cj.pmadd(A0, B2, C2); + if(nr==4) C6 = cj.pmadd(A1, B2, C6); + if(nr==4) B2 = ei_pload(&blB[14*PacketSize]); + if(nr==4) C3 = cj.pmadd(A0, B3, C3); + A0 = ei_pload(&blA[6*PacketSize]); + if(nr==4) C7 = cj.pmadd(A1, B3, C7); + A1 = ei_pload(&blA[7*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[15*PacketSize]); + C0 = cj.pmadd(A0, B0, C0); + C4 = cj.pmadd(A1, B0, C4); + C1 = cj.pmadd(A0, B1, C1); + C5 = cj.pmadd(A1, B1, C5); + if(nr==4) C2 = cj.pmadd(A0, B2, C2); + if(nr==4) C6 = cj.pmadd(A1, B2, C6); + if(nr==4) C3 = cj.pmadd(A0, B3, C3); + if(nr==4) C7 = cj.pmadd(A1, B3, C7); + + blB += 4*nr*PacketSize; + blA += 4*mr; } - for(int i=peeled_mc; i<actual_mc; i++) + // process remaining peeled loop + for(int k=peeled_kc; k<actual_kc; k++) { - const Scalar* blA = &blockA[i*actual_kc]; - #ifdef EIGEN_VECTORIZE_SSE - _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); - #endif - - // gets a 1 x nr res block as registers - Scalar C0(0), C1(0), C2(0), C3(0); - const Scalar* blB = &blockB[j2*actual_kc*PacketSize]; - for(int k=0; k<actual_kc; k++) - { - Scalar B0, B1, B2, B3, A0; - - A0 = blA[k]; - B0 = blB[0*PacketSize]; - B1 = blB[1*PacketSize]; - C0 = cj.pmadd(A0, B0, C0); - if(nr==4) B2 = blB[2*PacketSize]; - if(nr==4) B3 = blB[3*PacketSize]; - C1 = cj.pmadd(A0, B1, C1); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - - blB += nr*PacketSize; - } - res[(j2+0)*resStride + i2 + i] += C0; - res[(j2+1)*resStride + i2 + i] += C1; - if(nr==4) res[(j2+2)*resStride + i2 + i] += C2; - if(nr==4) res[(j2+3)*resStride + i2 + i] += C3; + PacketType B0, B1, B2, B3, A0, A1; + + A0 = ei_pload(&blA[0*PacketSize]); + A1 = ei_pload(&blA[1*PacketSize]); + B0 = ei_pload(&blB[0*PacketSize]); + B1 = ei_pload(&blB[1*PacketSize]); + C0 = cj.pmadd(A0, B0, C0); + if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); + C4 = cj.pmadd(A1, B0, C4); + if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); + C1 = cj.pmadd(A0, B1, C1); + C5 = cj.pmadd(A1, 
B1, C5); + if(nr==4) C2 = cj.pmadd(A0, B2, C2); + if(nr==4) C6 = cj.pmadd(A1, B2, C6); + if(nr==4) C3 = cj.pmadd(A0, B3, C3); + if(nr==4) C7 = cj.pmadd(A1, B3, C7); + + blB += nr*PacketSize; + blA += mr; } + + ei_pstoreu(&res[(j2+0)*resStride + i2 + i], C0); + ei_pstoreu(&res[(j2+1)*resStride + i2 + i], C1); + if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i], C2); + if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i], C3); + ei_pstoreu(&res[(j2+0)*resStride + i2 + i + PacketSize], C4); + ei_pstoreu(&res[(j2+1)*resStride + i2 + i + PacketSize], C5); + if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i + PacketSize], C6); + if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i + PacketSize], C7); } - // remaining rhs/res columns (<nr) - for(int j2=packet_cols; j2<cols; j2++) + for(int i=peeled_mc; i<actual_mc; i++) { - for(int i=0; i<actual_mc; i++) + const Scalar* blA = &blockA[i*actual_kc]; + #ifdef EIGEN_VECTORIZE_SSE + _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); + #endif + + // gets a 1 x nr res block as registers + Scalar C0(0), C1(0), C2(0), C3(0); + const Scalar* blB = &blockB[j2*actual_kc*PacketSize]; + for(int k=0; k<actual_kc; k++) { - Scalar c0 = Scalar(0); - if (lhsRowMajor) - for(int k=0; k<actual_kc; k++) - c0 += cj.pmul(lhs[(k2+k)+(i2+i)*lhsStride], rhs[j2*rhsStride + k2 + k]); - else - for(int k=0; k<actual_kc; k++) - c0 += cj.pmul(lhs[(k2+k)*lhsStride + i2+i], rhs[j2*rhsStride + k2 + k]); - res[(j2)*resStride + i2+i] += (ConjugateRhs ? 
ei_conj(alpha) : alpha) * c0; + Scalar B0, B1, B2, B3, A0; + + A0 = blA[k]; + B0 = blB[0*PacketSize]; + B1 = blB[1*PacketSize]; + C0 = cj.pmadd(A0, B0, C0); + if(nr==4) B2 = blB[2*PacketSize]; + if(nr==4) B3 = blB[3*PacketSize]; + C1 = cj.pmadd(A0, B1, C1); + if(nr==4) C2 = cj.pmadd(A0, B2, C2); + if(nr==4) C3 = cj.pmadd(A0, B3, C3); + + blB += nr*PacketSize; } + res[(j2+0)*resStride + i2 + i] += C0; + res[(j2+1)*resStride + i2 + i] += C1; + if(nr==4) res[(j2+2)*resStride + i2 + i] += C2; + if(nr==4) res[(j2+3)*resStride + i2 + i] += C3; } } - } + // remaining rhs/res columns (<nr) - ei_aligned_stack_delete(Scalar, blockA, kc*mc); - ei_aligned_stack_delete(Scalar, blockB, kc*cols*PacketSize); + // process remaining rhs/res columns one at a time + // => do the same but with nr==1 + for(int j2=packet_cols; j2<cols; j2++) + { + for(int i=0; i<peeled_mc; i+=mr) + { + const Scalar* blA = &blockA[i*actual_kc]; + #ifdef EIGEN_VECTORIZE_SSE + _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); + #endif + // TODO move the res loads to the stores - -#else // alternate product from cylmor + // gets res block as register + PacketType C0, C4; + C0 = ei_ploadu(&res[(j2+0)*resStride + i2 + i]); + C4 = ei_ploadu(&res[(j2+0)*resStride + i2 + i + PacketSize]); - enum { - PacketSize = sizeof(PacketType)/sizeof(Scalar), - #if (defined __i386__) - // i386 architecture provides only 8 xmm registers, - // so let's reduce the max number of rows processed at once. 
- MaxBlockRows = 4, - MaxBlockRows_ClampingMask = 0xFFFFFC, - #else - MaxBlockRows = 8, - MaxBlockRows_ClampingMask = 0xFFFFF8, - #endif - // maximal size of the blocks fitted in L2 cache - MaxL2BlockSize = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width - }; + const Scalar* blB = &blockB[j2*actual_kc*PacketSize]; + for(int k=0; k<actual_kc; k++) + { + PacketType B0, A0, A1; - const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0)); + A0 = ei_pload(&blA[0*PacketSize]); + A1 = ei_pload(&blA[1*PacketSize]); + B0 = ei_pload(&blB[0*PacketSize]); + C0 = cj.pmadd(A0, B0, C0); + C4 = cj.pmadd(A1, B0, C4); - const int remainingSize = depth % PacketSize; - const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries + blB += PacketSize; + blA += mr; + } - const int l2BlockRows = MaxL2BlockSize > rows ? rows : 512; - const int l2BlockCols = MaxL2BlockSize > cols ? cols : 128; - const int l2BlockSize = MaxL2BlockSize > size ? 
size : 256; - const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize; - const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0)); + ei_pstoreu(&res[(j2+0)*resStride + i2 + i], C0); + ei_pstoreu(&res[(j2+0)*resStride + i2 + i + PacketSize], C4); + } + for(int i=peeled_mc; i<actual_mc; i++) + { + const Scalar* blA = &blockA[i*actual_kc]; + #ifdef EIGEN_VECTORIZE_SSE + _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); + #endif + + // gets a 1 x 1 res block as registers + Scalar C0(0); + const Scalar* blB = &blockB[j2*actual_kc*PacketSize]; + for(int k=0; k<actual_kc; k++) + C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0); + res[(j2+0)*resStride + i2 + i] += C0; + } + } + } +}; - Scalar* EIGEN_RESTRICT block = new Scalar[l2BlockRows*size]; -// for(int i=0; i<l2BlockRows*l2BlockSize; ++i) -// block[i] = 0; - // loops on each L2 cache friendly blocks of lhs - for(int l2k=0; l2k<depth; l2k+=l2BlockSize) +// pack a block of the lhs +template<typename Scalar, int mr> +struct ei_gemm_pack_lhs +{ + void operator()(Scalar* blockA, const Scalar* lhs, int lhsStride, bool lhsRowMajor, int actual_kc, int actual_mc, int k2, int i2) { - for(int l2i=0; l2i<rows; l2i+=l2BlockRows) + int count = 0; + const int peeled_mc = (actual_mc/mr)*mr; + if (lhsRowMajor) { - // We have selected a block of lhs - // Packs this block into 'block' - int count = 0; - for(int k=0; k<l2BlockSize; k+=MaxBlockRows) + for(int i=0; i<peeled_mc; i+=mr) + for(int k=0; k<actual_kc; k++) + for(int w=0; w<mr; w++) + blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride]; + for(int i=peeled_mc; i<actual_mc; i++) { - for(int i=0; i<l2BlockRows; i+=2*PacketSize) - for (int w=0; w<MaxBlockRows; ++w) - for (int y=0; y<2*PacketSize; ++y) - block[count++] = lhs[(k+l2k+w)*lhsStride + l2i+i+ y]; + const Scalar* llhs = &lhs[(k2) + (i2+i)*lhsStride]; + for(int k=0; k<actual_kc; k++) + blockA[count++] = llhs[k]; } + } + else + { + for(int i=0; i<peeled_mc; 
i+=mr) + for(int k=0; k<actual_kc; k++) + for(int w=0; w<mr; w++) + blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w]; + for(int i=peeled_mc; i<actual_mc; i++) + for(int k=0; k<actual_kc; k++) + blockA[count++] = lhs[(k2+k)*lhsStride + i2+i]; + } + } +}; - // loops on each L2 cache firendly block of the result/rhs - for(int l2j=0; l2j<cols; l2j+=l2BlockCols) +// copy a complete panel of the rhs while expending each coefficient into a packet form +template<typename Scalar, int PacketSize, int nr> +struct ei_gemm_pack_rhs +{ + void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, bool hasAlpha, Scalar alpha, int actual_kc, int packet_cols, int k2, int cols) + { + int count = 0; + for(int j2=0; j2<packet_cols; j2+=nr) + { + const Scalar* b0 = &rhs[(j2+0)*rhsStride + k2]; + const Scalar* b1 = &rhs[(j2+1)*rhsStride + k2]; + const Scalar* b2 = &rhs[(j2+2)*rhsStride + k2]; + const Scalar* b3 = &rhs[(j2+3)*rhsStride + k2]; + if (hasAlpha) + { + for(int k=0; k<actual_kc; k++) + { + ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k])); + ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k])); + if (nr==4) + { + ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k])); + ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k])); + } + count += nr*PacketSize; + } + } + else { - for(int k=0; k<l2BlockSize; k+=MaxBlockRows) + for(int k=0; k<actual_kc; k++) { - for(int j=0; j<l2BlockCols; ++j) + ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k])); + ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k])); + if (nr==4) { - PacketType A0, A1, A2, A3, A4, A5, A6, A7; - - // Load the packets from rhs and reorder them - - // Here we need some vector reordering - // Right now its hardcoded to packets of 4 elements - const Scalar* lrhs = &rhs[(j+l2j)*rhsStride+(k+l2k)]; - A0 = ei_pset1(lrhs[0]); - A1 = ei_pset1(lrhs[1]); - A2 = ei_pset1(lrhs[2]); - A3 = ei_pset1(lrhs[3]); - if (MaxBlockRows==8) - { - A4 = ei_pset1(lrhs[4]); - A5 = 
ei_pset1(lrhs[5]); - A6 = ei_pset1(lrhs[6]); - A7 = ei_pset1(lrhs[7]); - } - - Scalar * lb = &block[l2BlockRows * k]; - for(int i=0; i<l2BlockRows; i+=2*PacketSize) - { - PacketType R0, R1, L0, L1, T0, T1; - - // We perform "cross products" of vectors to avoid - // reductions (horizontal ops) afterwards - T0 = ei_pload(&res[(j+l2j)*resStride+l2i+i]); - T1 = ei_pload(&res[(j+l2j)*resStride+l2i+i+PacketSize]); - - R0 = ei_pload(&lb[0*PacketSize]); - L0 = ei_pload(&lb[1*PacketSize]); - R1 = ei_pload(&lb[2*PacketSize]); - L1 = ei_pload(&lb[3*PacketSize]); - T0 = cj.pmadd(A0, R0, T0); - T1 = cj.pmadd(A0, L0, T1); - R0 = ei_pload(&lb[4*PacketSize]); - L0 = ei_pload(&lb[5*PacketSize]); - T0 = cj.pmadd(A1, R1, T0); - T1 = cj.pmadd(A1, L1, T1); - R1 = ei_pload(&lb[6*PacketSize]); - L1 = ei_pload(&lb[7*PacketSize]); - T0 = cj.pmadd(A2, R0, T0); - T1 = cj.pmadd(A2, L0, T1); - if(MaxBlockRows==8) - { - R0 = ei_pload(&lb[8*PacketSize]); - L0 = ei_pload(&lb[9*PacketSize]); - } - T0 = cj.pmadd(A3, R1, T0); - T1 = cj.pmadd(A3, L1, T1); - if(MaxBlockRows==8) - { - R1 = ei_pload(&lb[10*PacketSize]); - L1 = ei_pload(&lb[11*PacketSize]); - T0 = cj.pmadd(A4, R0, T0); - T1 = cj.pmadd(A4, L0, T1); - R0 = ei_pload(&lb[12*PacketSize]); - L0 = ei_pload(&lb[13*PacketSize]); - T0 = cj.pmadd(A5, R1, T0); - T1 = cj.pmadd(A5, L1, T1); - R1 = ei_pload(&lb[14*PacketSize]); - L1 = ei_pload(&lb[15*PacketSize]); - T0 = cj.pmadd(A6, R0, T0); - T1 = cj.pmadd(A6, L0, T1); - T0 = cj.pmadd(A7, R1, T0); - T1 = cj.pmadd(A7, L1, T1); - } - lb += MaxBlockRows*2*PacketSize; - - ei_pstore(&res[(j+l2j)*resStride+l2i+i], T0); - ei_pstore(&res[(j+l2j)*resStride+l2i+i+PacketSize], T1); - } + ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k])); + ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k])); } + count += nr*PacketSize; + } + } + } + // copy the remaining columns one at a time (nr==1) + for(int j2=packet_cols; j2<cols; ++j2) + { + const Scalar* b0 = &rhs[(j2+0)*rhsStride + k2]; + if (hasAlpha) + { + 
for(int k=0; k<actual_kc; k++) + { + ei_pstore(&blockB[count], ei_pset1(alpha*b0[k])); + count += PacketSize; + } + } + else + { + for(int k=0; k<actual_kc; k++) + { + ei_pstore(&blockB[count], ei_pset1(b0[k])); + count += PacketSize; } } } } - delete[] block; -#endif - - -} +}; #endif // EIGEN_EXTERN_INSTANTIATIONS diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h new file mode 100644 index 000000000..6ab7b7211 --- /dev/null +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -0,0 +1,184 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. 
+ +#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H +#define EIGEN_SELFADJOINT_MATRIX_MATRIX_H + +template<typename Scalar, int mr> +struct ei_symm_pack_lhs +{ + void operator()(Scalar* blockA, const Scalar* lhs, int lhsStride, bool lhsRowMajor, int actual_kc, int actual_mc, int k2, int i2) + { + int count = 0; + const int peeled_mc = (actual_mc/mr)*mr; + if (lhsRowMajor) + { + for(int i=0; i<peeled_mc; i+=mr) + for(int k=0; k<actual_kc; k++) + for(int w=0; w<mr; w++) + blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride]; + for(int i=peeled_mc; i<actual_mc; i++) + { + const Scalar* llhs = &lhs[(k2) + (i2+i)*lhsStride]; + for(int k=0; k<actual_kc; k++) + blockA[count++] = llhs[k]; + } + } + else + { + for(int i=0; i<peeled_mc; i+=mr) + { + for(int k=0; k<i; k++) + for(int w=0; w<mr; w++) + blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w]; + + // symmetric copy + int h = 0; + for(int k=i; k<i+mr; k++) + { + // transposed copy + for(int w=0; w<h; w++) + blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride]; + for(int w=h; w<mr; w++) + blockA[count++] = lhs[(k2+k)*lhsStride + i2+i+w]; + ++h; + } + + // transposed copy + for(int k=i+mr; k<actual_kc; k++) + for(int w=0; w<mr; w++) + blockA[count++] = lhs[(k2+k) + (i2+i+w)*lhsStride]; + } + // do the same with mr==1 + for(int i=peeled_mc; i<actual_mc; i++) + { + for(int k=0; k<=i; k++) + blockA[count++] = lhs[(k2+k)*lhsStride + i2+i]; + + // transposed copy + for(int k=i+1; k<actual_kc; k++) + blockA[count++] = lhs[(k2+k) + (i2+i)*lhsStride]; + } + } + } +}; + +/* Optimized selfadjoint matrix * matrix (_SYMM) product built on top of + * the general matrix matrix product. 
+ */ +template<typename Scalar, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> +static EIGEN_DONT_INLINE void ei_product_selfadjoint_matrix( + int size, + const Scalar* lhs, int lhsStride, + const Scalar* rhs, int rhsStride, bool rhsRowMajor, int cols, + Scalar* res, int resStride, + Scalar alpha) +{ + typedef typename ei_packet_traits<Scalar>::type Packet; + + ei_conj_helper<ConjugateLhs,ConjugateRhs> cj; + if (ConjugateRhs) + alpha = ei_conj(alpha); + bool hasAlpha = alpha != Scalar(1); + + typedef typename ei_packet_traits<Scalar>::type PacketType; + + const bool lhsRowMajor = (StorageOrder==RowMajor); + + enum { + PacketSize = sizeof(PacketType)/sizeof(Scalar), + #if (defined __i386__) + HalfRegisterCount = 4, + #else + HalfRegisterCount = 8, + #endif + + // register block size along the N direction + nr = HalfRegisterCount/2, + + // register block size along the M direction + mr = 2 * PacketSize, + + // max cache block size along the K direction + Max_kc = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width, + + // max cache block size along the M direction + Max_mc = 2*Max_kc + }; + + int kc = std::min<int>(Max_kc,size); // cache block size along the K direction + int mc = std::min<int>(Max_mc,size); // cache block size along the M direction + + Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); + Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*PacketSize); + + // number of columns which can be processed by packet of nr columns + int packet_cols = (cols/nr)*nr; + + for(int k2=0; k2<size; k2+=kc) + { + const int actual_kc = std::min(k2+kc,size)-k2; + + // we have selected one row panel of rhs and one column panel of lhs + // pack rhs's panel into a sequential chunk of memory + // and expand each coeff to a constant packet for further reuse + ei_gemm_pack_rhs<Scalar,PacketSize,nr>()(blockB, rhs, rhsStride, hasAlpha, alpha, actual_kc, packet_cols, k2, cols); + + // the select lhs's panel has to be split in three different 
parts: + // 1 - the transposed panel above the diagonal block => transposed packed copy + // 2 - the diagonal block => special packed copy + // 3 - the panel below the diagonal block => generic packed copy + for(int i2=0; i2<k2; i2+=mc) + { + const int actual_mc = std::min(i2+mc,k2)-i2; + // transposed packed copy + ei_gemm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, !lhsRowMajor, actual_kc, actual_mc, k2, i2); + + ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >() + (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols); + } + // the block diagonal + { + const int actual_mc = std::min(k2+kc,size)-k2; + // symmetric packed copy + ei_symm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, lhsRowMajor, actual_kc, actual_mc, k2, k2); + ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >() + (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, k2, cols); + } + + for(int i2=k2+kc; i2<size; i2+=mc) + { + const int actual_mc = std::min(i2+mc,size)-i2; + ei_gemm_pack_lhs<Scalar,mr>()(blockA, lhs, lhsStride, lhsRowMajor, actual_kc, actual_mc, k2, i2); + ei_gebp_kernel<Scalar, PacketType, PacketSize, mr, nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >() + (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols); + } + } + + ei_aligned_stack_delete(Scalar, blockA, kc*mc); + ei_aligned_stack_delete(Scalar, blockB, kc*cols*PacketSize); +} + + +#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H |