diff options
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixMatrix.h | 122 | ||||
-rw-r--r-- | Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 39 | ||||
-rw-r--r-- | Eigen/src/Core/products/SelfadjointProduct.h | 327 |
3 files changed, 108 insertions, 380 deletions
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 9166921fe..776b7c5d6 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -78,9 +78,6 @@ static void run(int rows, int cols, int depth, Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); - // number of columns which can be processed by packet of nr columns - int packet_cols = (cols/Blocking::nr) * Blocking::nr; - // => GEMM_VAR1 for(int k2=0; k2<depth; k2+=kc) { @@ -89,7 +86,7 @@ static void run(int rows, int cols, int depth, // we have selected one row panel of rhs and one column panel of lhs // pack rhs's panel into a sequential chunk of memory // and expand each coeff to a constant packet for further reuse - ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder>()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, packet_cols, cols); + ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder>()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols); // => GEPP_VAR1 for(int i2=0; i2<rows; i2+=mc) @@ -99,7 +96,7 @@ static void run(int rows, int cols, int depth, ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder>()(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >() - (res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols); + (res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); } } @@ -113,19 +110,20 @@ static void run(int rows, int cols, int depth, template<typename Scalar, int mr, int nr, typename Conj> struct ei_gebp_kernel { - void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols, int i2, int cols) + void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols) { typedef typename ei_packet_traits<Scalar>::type PacketType; enum { PacketSize = ei_packet_traits<Scalar>::size }; Conj cj; - const int peeled_mc = (actual_mc/mr)*mr; + int packet_cols = (cols/nr) * nr; + const int peeled_mc = (rows/mr)*mr; // loops on each cache friendly block of the result/rhs for(int j2=0; j2<packet_cols; j2+=nr) { // loops on each register blocking of lhs/res for(int i=0; i<peeled_mc; i+=mr) { - const Scalar* blA = &blockA[i*actual_kc]; + const Scalar* blA = &blockA[i*depth]; #ifdef EIGEN_VECTORIZE_SSE _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); #endif @@ -134,20 +132,20 @@ struct ei_gebp_kernel // gets res block as register PacketType C0, C1, C2, C3, C4, C5, C6, C7; - C0 = ei_ploadu(&res[(j2+0)*resStride + i2 + i]); - C1 = ei_ploadu(&res[(j2+1)*resStride + i2 + i]); - if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i2 + i]); - if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i2 + i]); - C4 = ei_ploadu(&res[(j2+0)*resStride + i2 + i + PacketSize]); - C5 = ei_ploadu(&res[(j2+1)*resStride + i2 + i + PacketSize]); - if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i2 + i + PacketSize]); - if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i2 + i + PacketSize]); + C0 = ei_ploadu(&res[(j2+0)*resStride + i]); + C1 = ei_ploadu(&res[(j2+1)*resStride + i]); + if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]); + if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]); + C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]); + C5 = ei_ploadu(&res[(j2+1)*resStride + i + PacketSize]); + if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i + PacketSize]); + if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i + PacketSize]); // performs "inner" product // TODO let's check wether the flowing peeled loop could not be // optimized via optimal prefetching from one loop to the other - const Scalar* blB = &blockB[j2*actual_kc*PacketSize]; - const int peeled_kc = (actual_kc/4)*4; + const Scalar* blB = &blockB[j2*depth*PacketSize]; + const int peeled_kc = (depth/4)*4; for(int k=0; k<peeled_kc; k+=4) { PacketType B0, B1, B2, B3, A0, A1; @@ -214,7 +212,7 @@ struct ei_gebp_kernel blA += 4*mr; } // process remaining peeled loop - for(int k=peeled_kc; k<actual_kc; k++) + for(int k=peeled_kc; k<depth; k++) { PacketType B0, B1, B2, B3, A0, A1; @@ -237,26 +235,26 @@ struct ei_gebp_kernel blA += mr; } - ei_pstoreu(&res[(j2+0)*resStride + i2 + i], C0); - ei_pstoreu(&res[(j2+1)*resStride + i2 + i], C1); - if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i], C2); - if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i], C3); - ei_pstoreu(&res[(j2+0)*resStride + i2 + i + PacketSize], C4); - ei_pstoreu(&res[(j2+1)*resStride + i2 + i + PacketSize], C5); - if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i2 + i + PacketSize], C6); - if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i2 + i + PacketSize], C7); + ei_pstoreu(&res[(j2+0)*resStride + i], C0); + ei_pstoreu(&res[(j2+1)*resStride + i], C1); + if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2); + if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3); + ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4); + ei_pstoreu(&res[(j2+1)*resStride + i + PacketSize], C5); + if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i + PacketSize], C6); + if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i + PacketSize], C7); } - for(int i=peeled_mc; i<actual_mc; i++) + for(int i=peeled_mc; i<rows; i++) { - const Scalar* blA = &blockA[i*actual_kc]; + const Scalar* blA = &blockA[i*depth]; #ifdef EIGEN_VECTORIZE_SSE _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); #endif // gets a 1 x nr res block as registers Scalar C0(0), C1(0), C2(0), C3(0); - const Scalar* blB = &blockB[j2*actual_kc*PacketSize]; - for(int k=0; k<actual_kc; k++) + const Scalar* blB = &blockB[j2*depth*PacketSize]; + for(int k=0; k<depth; k++) { Scalar B0, B1, B2, B3, A0; @@ -272,10 +270,10 @@ struct ei_gebp_kernel blB += nr*PacketSize; } - res[(j2+0)*resStride + i2 + i] += C0; - res[(j2+1)*resStride + i2 + i] += C1; - if(nr==4) res[(j2+2)*resStride + i2 + i] += C2; - if(nr==4) res[(j2+3)*resStride + i2 + i] += C3; + res[(j2+0)*resStride + i] += C0; + res[(j2+1)*resStride + i] += C1; + if(nr==4) res[(j2+2)*resStride + i] += C2; + if(nr==4) res[(j2+3)*resStride + i] += C3; } } @@ -285,7 +283,7 @@ struct ei_gebp_kernel { for(int i=0; i<peeled_mc; i+=mr) { - const Scalar* blA = &blockA[i*actual_kc]; + const Scalar* blA = &blockA[i*depth]; #ifdef EIGEN_VECTORIZE_SSE _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); #endif @@ -294,11 +292,11 @@ struct ei_gebp_kernel // gets res block as register PacketType C0, C4; - C0 = ei_ploadu(&res[(j2+0)*resStride + i2 + i]); - C4 = ei_ploadu(&res[(j2+0)*resStride + i2 + i + PacketSize]); + C0 = ei_ploadu(&res[(j2+0)*resStride + i]); + C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]); - const Scalar* blB = &blockB[j2*actual_kc*PacketSize]; - for(int k=0; k<actual_kc; k++) + const Scalar* blB = &blockB[j2*depth*PacketSize]; + for(int k=0; k<depth; k++) { PacketType B0, A0, A1; @@ -312,22 +310,22 @@ struct ei_gebp_kernel blA += mr; } - ei_pstoreu(&res[(j2+0)*resStride + i2 + i], C0); - ei_pstoreu(&res[(j2+0)*resStride + i2 + i + PacketSize], C4); + ei_pstoreu(&res[(j2+0)*resStride + i], C0); + ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4); } - for(int i=peeled_mc; i<actual_mc; i++) + for(int i=peeled_mc; i<rows; i++) { - const Scalar* blA = &blockA[i*actual_kc]; + const Scalar* blA = &blockA[i*depth]; #ifdef EIGEN_VECTORIZE_SSE _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); #endif // gets a 1 x 1 res block as registers Scalar C0(0); - const Scalar* blB = &blockB[j2*actual_kc*PacketSize]; - for(int k=0; k<actual_kc; k++) + const Scalar* blB = &blockB[j2*depth*PacketSize]; + for(int k=0; k<depth; k++) C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0); - res[(j2+0)*resStride + i2 + i] += C0; + res[(j2+0)*resStride + i] += C0; } } } @@ -350,19 +348,19 @@ struct ei_gebp_kernel template<typename Scalar, int mr, int StorageOrder, bool Conjugate> struct ei_gemm_pack_lhs { - void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int actual_kc, int actual_mc) + void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int depth, int rows) { ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; ei_const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride); int count = 0; - const int peeled_mc = (actual_mc/mr)*mr; + const int peeled_mc = (rows/mr)*mr; for(int i=0; i<peeled_mc; i+=mr) - for(int k=0; k<actual_kc; k++) + for(int k=0; k<depth; k++) for(int w=0; w<mr; w++) blockA[count++] = cj(lhs(i+w, k)); - for(int i=peeled_mc; i<actual_mc; i++) + for(int i=peeled_mc; i<rows; i++) { - for(int k=0; k<actual_kc; k++) + for(int k=0; k<depth; k++) blockA[count++] = cj(lhs(i, k)); } } @@ -379,9 +377,10 @@ template<typename Scalar, int nr> struct ei_gemm_pack_rhs<Scalar, nr, ColMajor> { enum { PacketSize = ei_packet_traits<Scalar>::size }; - void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int actual_kc, int packet_cols, int cols) + void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols) { bool hasAlpha = alpha != Scalar(1); + int packet_cols = (cols/nr) * nr; int count = 0; for(int j2=0; j2<packet_cols; j2+=nr) { @@ -391,7 +390,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor> const Scalar* b3 = &rhs[(j2+3)*rhsStride]; if (hasAlpha) { - for(int k=0; k<actual_kc; k++) + for(int k=0; k<depth; k++) { ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k])); ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k])); @@ -405,7 +404,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor> } else { - for(int k=0; k<actual_kc; k++) + for(int k=0; k<depth; k++) { ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k])); ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k])); @@ -424,7 +423,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor> const Scalar* b0 = &rhs[(j2+0)*rhsStride]; if (hasAlpha) { - for(int k=0; k<actual_kc; k++) + for(int k=0; k<depth; k++) { ei_pstore(&blockB[count], ei_pset1(alpha*b0[k])); count += PacketSize; @@ -432,7 +431,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor> } else { - for(int k=0; k<actual_kc; k++) + for(int k=0; k<depth; k++) { ei_pstore(&blockB[count], ei_pset1(b0[k])); count += PacketSize; @@ -447,15 +446,16 @@ template<typename Scalar, int nr> struct ei_gemm_pack_rhs<Scalar, nr, RowMajor> { enum { PacketSize = ei_packet_traits<Scalar>::size }; - void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int actual_kc, int packet_cols, int cols) + void operator()(Scalar* blockB, const Scalar* rhs, int rhsStride, Scalar alpha, int depth, int cols) { bool hasAlpha = alpha != Scalar(1); + int packet_cols = (cols/nr) * nr; int count = 0; for(int j2=0; j2<packet_cols; j2+=nr) { if (hasAlpha) { - for(int k=0; k<actual_kc; k++) + for(int k=0; k<depth; k++) { const Scalar* b0 = &rhs[k*rhsStride + j2]; ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0])); @@ -470,7 +470,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, RowMajor> } else { - for(int k=0; k<actual_kc; k++) + for(int k=0; k<depth; k++) { const Scalar* b0 = &rhs[k*rhsStride + j2]; ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0])); @@ -488,7 +488,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, RowMajor> for(int j2=packet_cols; j2<cols; ++j2) { const Scalar* b0 = &rhs[j2]; - for(int k=0; k<actual_kc; k++) + for(int k=0; k<depth; k++) { ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride])); count += PacketSize; diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 073a5ecf7..acd239d28 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -29,11 +29,11 @@ template<typename Scalar, int mr, int StorageOrder> struct ei_symm_pack_lhs { - void operator()(Scalar* blockA, const Scalar* _lhs, int lhsStride, int actual_kc, int actual_mc) + void operator()(Scalar* blockA, const Scalar* _lhs, int lhsStride, int cols, int rows) { ei_const_blas_data_mapper<Scalar,StorageOrder> lhs(_lhs,lhsStride); int count = 0; - const int peeled_mc = (actual_mc/mr)*mr; + const int peeled_mc = (rows/mr)*mr; for(int i=0; i<peeled_mc; i+=mr) { // normal copy @@ -51,17 +51,17 @@ struct ei_symm_pack_lhs ++h; } // transposed copy - for(int k=i+mr; k<actual_kc; k++) + for(int k=i+mr; k<cols; k++) for(int w=0; w<mr; w++) blockA[count++] = ei_conj(lhs(k, i+w)); // transposed } // do the same with mr==1 - for(int i=peeled_mc; i<actual_mc; i++) + for(int i=peeled_mc; i<rows; i++) { for(int k=0; k<=i; k++) blockA[count++] = lhs(i, k); // normal - for(int k=i+1; k<actual_kc; k++) + for(int k=i+1; k<cols; k++) blockA[count++] = ei_conj(lhs(k, i)); // transposed } } @@ -71,11 +71,12 @@ template<typename Scalar, int nr, int StorageOrder> struct ei_symm_pack_rhs { enum { PacketSize = ei_packet_traits<Scalar>::size }; - void operator()(Scalar* blockB, const Scalar* _rhs, int rhsStride, Scalar alpha, int actual_kc, int packet_cols, int cols, int k2) + void operator()(Scalar* blockB, const Scalar* _rhs, int rhsStride, Scalar alpha, int rows, int cols, int k2) { - int end_k = k2 + actual_kc; + int end_k = k2 + rows; int count = 0; ei_const_blas_data_mapper<Scalar,StorageOrder> rhs(_rhs,rhsStride); + int packet_cols = (cols/nr)*nr; // first part: normal case for(int j2=0; j2<k2; j2+=nr) @@ -94,7 +95,7 @@ struct ei_symm_pack_rhs } // second part: diagonal block - for(int j2=k2; j2<std::min(k2+actual_kc,packet_cols); j2+=nr) + for(int j2=k2; j2<std::min(k2+rows,packet_cols); j2+=nr) { // again we can split vertically in three different parts (transpose, symmetric, normal) // transpose @@ -137,7 +138,7 @@ struct ei_symm_pack_rhs } // third part: transposed - for(int j2=k2+actual_kc; j2<packet_cols; j2+=nr) + for(int j2=k2+rows; j2<packet_cols; j2+=nr) { for(int k=k2; k<end_k; k++) { @@ -163,7 +164,7 @@ struct ei_symm_pack_rhs count += PacketSize; } // normal - for(int k=half; k<k2+actual_kc; k++) + for(int k=half; k<k2+rows; k++) { ei_pstore(&blockB[count], ei_pset1(alpha*rhs(k,j2))); count += PacketSize; @@ -233,9 +234,6 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); - // number of columns which can be processed by packet of nr columns - int packet_cols = (cols/Blocking::nr)*Blocking::nr; - ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel; for(int k2=0; k2<size; k2+=kc) @@ -246,7 +244,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R // pack rhs's panel into a sequential chunk of memory // and expand each coeff to a constant packet for further reuse ei_gemm_pack_rhs<Scalar,Blocking::nr,RhsStorageOrder>() - (blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, packet_cols, cols); + (blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols); // the select lhs's panel has to be split in three different parts: // 1 - the transposed panel above the diagonal block => transposed packed copy @@ -259,7 +257,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder==RowMajor?ColMajor:RowMajor, true>() (blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); } // the block diagonal { @@ -268,7 +266,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R ei_symm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>() (blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, k2, cols); + gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols); } for(int i2=k2+kc; i2<size; i2+=mc) @@ -277,7 +275,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder,false>() (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); } } @@ -316,9 +314,6 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,false,ConjugateLhs, Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); - // number of columns which can be processed by packet of nr columns - int packet_cols = (cols/Blocking::nr)*Blocking::nr; - ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel; for(int k2=0; k2<size; k2+=kc) @@ -326,7 +321,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,false,ConjugateLhs, const int actual_kc = std::min(k2+kc,size)-k2; ei_symm_pack_rhs<Scalar,Blocking::nr,RhsStorageOrder>() - (blockB, _rhs, rhsStride, alpha, actual_kc, packet_cols, cols, k2); + (blockB, _rhs, rhsStride, alpha, actual_kc, cols, k2); // => GEPP for(int i2=0; i2<rows; i2+=mc) @@ -335,7 +330,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,false,ConjugateLhs, ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>() (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, packet_cols, i2, cols); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); } } diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h index facde7252..d971720d6 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -70,7 +70,7 @@ struct ei_selfadjoint_product<Scalar,MatStorageOrder, ColMajor, AAT, UpLo> typedef ei_product_blocking_traits<Scalar> Blocking; - int kc = std::min<int>(Blocking::Max_kc,depth); // cache block size along the K direction + int kc = std::min<int>(Blocking::Max_kc,depth); // cache block size along the K direction int mc = std::min<int>(Blocking::Max_mc,size); // cache block size along the M direction Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); @@ -90,7 +90,7 @@ struct ei_selfadjoint_product<Scalar,MatStorageOrder, ColMajor, AAT, UpLo> // note that the actual rhs is the transpose/adjoint of mat ei_gemm_pack_rhs<Scalar,Blocking::nr,MatStorageOrder==RowMajor ? ColMajor : RowMajor>() - (blockB, &mat(0,k2), matStride, alpha, actual_kc, packet_cols, size); + (blockB, &mat(0,k2), matStride, alpha, actual_kc, size); for(int i2=0; i2<size; i2+=mc) { @@ -104,16 +104,15 @@ struct ei_selfadjoint_product<Scalar,MatStorageOrder, ColMajor, AAT, UpLo> // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel // 3 - after the diagonal => processed with gebp or skipped if (UpLo==LowerTriangular) - gebp_kernel(res, resStride, blockA, blockB, actual_mc, actual_kc, std::min(packet_cols,i2), i2, std::min(size,i2)); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2)); ei_sybb_kernel<Scalar, Blocking::mr, Blocking::nr, Conj, UpLo>() - (res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*Blocking::PacketSize*i2, actual_mc, actual_kc, std::min(actual_mc,std::max(packet_cols-i2,0))); + (res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*Blocking::PacketSize*i2, actual_mc, actual_kc); if (UpLo==UpperTriangular) { int j2 = i2+actual_mc; - gebp_kernel(res+resStride*j2, resStride, blockA, blockB+actual_kc*Blocking::PacketSize*j2, actual_mc, actual_kc, - std::max(0,packet_cols-j2), i2, std::max(0,size-j2)); + gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*Blocking::PacketSize*j2, actual_mc, actual_kc, std::max(0,size-j2)); } } } @@ -149,323 +148,57 @@ SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo> } -// optimized SYmmetric packed Block * packed Block product kernel +// Optimized SYmmetric packed Block * packed Block product kernel. +// This kernel is built on top of the gebp kernel: +// - the current selfadjoint block (res) is processed per panel of actual_mc x BlockSize +// where BlockSize is set to the minimal value allowing gebp to be as fast as possible +// - then, as usual, each panel is split into three parts along the diagonal, +// the sub blocks above and below the diagonal are processed as usual, +// while the selfadjoint block overlapping the diagonal is evaluated into a +// small temporary buffer which is then accumulated into the result using a +// triangular traversal. template<typename Scalar, int mr, int nr, typename Conj, int UpLo> struct ei_sybb_kernel { enum { PacketSize = ei_packet_traits<Scalar>::size, - BlockSize = EIGEN_ENUM_MAX(mr,nr) + BlockSize = EIGEN_ENUM_MAX(mr,nr) }; - void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols) + void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int size, int depth) { ei_gebp_kernel<Scalar, mr, nr, Conj> gebp_kernel; Matrix<Scalar,BlockSize,BlockSize,ColMajor> buffer; - + // let's process the block per panel of actual_mc x BlockSize, // again, each is split into three parts, etc. - for (int j=0; j<actual_mc; j+=BlockSize) + for (int j=0; j<size; j+=BlockSize) { - int actualBlockSize = std::min(BlockSize,actual_mc - j); - int actualPacketCols = std::min(actualBlockSize,std::max(packet_cols-j,0)); - const Scalar* actual_b = blockB+j*actual_kc*PacketSize; - + int actualBlockSize = std::min<int>(BlockSize,size - j); + const Scalar* actual_b = blockB+j*depth*PacketSize; + if(UpLo==UpperTriangular) - gebp_kernel(res+j*resStride, resStride, blockA, actual_b, - j, actual_kc, actualPacketCols, 0, actualBlockSize); + gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize); // selfadjoint micro block - // => use the gebp kernel on a temporary buffer, - // and then perform a triangular copy { int i = j; buffer.setZero(); - gebp_kernel(buffer.data(), BlockSize, blockA+actual_kc*i, actual_b, - actualBlockSize, actual_kc, actualPacketCols, 0, actualBlockSize); + // 1 - apply the kernel on the temporary buffer + gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize); + // 2 - triangular accumulation for(int j1=0; j1<actualBlockSize; ++j1) { - Scalar* r = res + (j+j1)*resStride; + Scalar* r = res + (j+j1)*resStride + i; for(int i1=UpLo==LowerTriangular ? j1 : 0; - UpLo==LowerTriangular ? i1<actualBlockSize : i1<=j1; ++j1) - r[i1+j1*resStride] = buffer(i1,j1); + UpLo==LowerTriangular ? i1<actualBlockSize : i1<=j1; ++i1) + r[i1] += buffer(i1,j1); } } - - if(UpLo==LowerTriangular) - { - int i = j+actualBlockSize; - if(i<actual_mc) - gebp_kernel(res+j*resStride+i, resStride, blockA+actual_kc*i, actual_b, - actual_mc-i, actual_kc, actualPacketCols, 0, actualBlockSize); - } - } - } -}; -// optimized SYmmetric packed Block * packed Block product kernel -// this kernel is very similar to the gebp kernel: the only differences are -// the piece of code to avoid the writes off the diagonal -// => TODO find a way to factorize the two kernels in a single one -template<typename Scalar, int mr, int nr, typename Conj, int UpLo> -struct ei_sybb_kernel_ -{ - void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int actual_mc, int actual_kc, int packet_cols) - { - typedef typename ei_packet_traits<Scalar>::type PacketType; - enum { PacketSize = ei_packet_traits<Scalar>::size }; - Conj cj; - const int peeled_mc = (actual_mc/mr)*mr; - // loops on each cache friendly block of the result/rhs - for(int j2=0; j2<packet_cols; j2+=nr) - { - // here we selected a vertical mc x nr panel of the result that we'll - // process normally until the end of the diagonal (or from the start if upper) - // - int start_i = UpLo==LowerTriangular ? (j2/mr)*mr : 0; - int end_i = UpLo==LowerTriangular ? actual_mc : std::min(actual_mc,((j2+std::max(mr,nr))/mr)*mr); - for(int i=start_i; i<std::min(peeled_mc,end_i); i+=mr) - { - const Scalar* blA = &blockA[i*actual_kc]; - #ifdef EIGEN_VECTORIZE_SSE - _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); - #endif - - // TODO move the res loads to the stores - - // gets res block as register - PacketType C0, C1, C2, C3, C4, C5, C6, C7; - C0 = ei_ploadu(&res[(j2+0)*resStride + i]); - C1 = ei_ploadu(&res[(j2+1)*resStride + i]); - if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]); - if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]); - C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]); - C5 = ei_ploadu(&res[(j2+1)*resStride + i + PacketSize]); - if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i + PacketSize]); - if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i + PacketSize]); - - // performs "inner" product - // TODO let's check wether the flowing peeled loop could not be - // optimized via optimal prefetching from one loop to the other - const Scalar* blB = &blockB[j2*actual_kc*PacketSize]; - const int peeled_kc = (actual_kc/4)*4; - for(int k=0; k<peeled_kc; k+=4) - { - PacketType B0, B1, B2, B3, A0, A1; - - A0 = ei_pload(&blA[0*PacketSize]); - A1 = ei_pload(&blA[1*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - B1 = ei_pload(&blB[1*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); - C4 = cj.pmadd(A1, B0, C4); - if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); - B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) B2 = ei_pload(&blB[6*PacketSize]); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - A0 = ei_pload(&blA[2*PacketSize]); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); - A1 = ei_pload(&blA[3*PacketSize]); - if(nr==4) B3 = ei_pload(&blB[7*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - C4 = cj.pmadd(A1, B0, C4); - B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) B2 = ei_pload(&blB[10*PacketSize]); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - A0 = ei_pload(&blA[4*PacketSize]); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); - A1 = ei_pload(&blA[5*PacketSize]); - if(nr==4) B3 = ei_pload(&blB[11*PacketSize]); - - C0 = cj.pmadd(A0, B0, C0); - C4 = cj.pmadd(A1, B0, C4); - B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) B2 = ei_pload(&blB[14*PacketSize]); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - A0 = ei_pload(&blA[6*PacketSize]); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); - A1 = ei_pload(&blA[7*PacketSize]); - if(nr==4) B3 = ei_pload(&blB[15*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - C4 = cj.pmadd(A1, B0, C4); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); - - blB += 4*nr*PacketSize; - blA += 4*mr; - } - // process remaining peeled loop - for(int k=peeled_kc; k<actual_kc; k++) - { - PacketType B0, B1, B2, B3, A0, A1; - - A0 = ei_pload(&blA[0*PacketSize]); - A1 = ei_pload(&blA[1*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - B1 = ei_pload(&blB[1*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); - C4 = cj.pmadd(A1, B0, C4); - if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); - C1 = cj.pmadd(A0, B1, C1); - C5 = cj.pmadd(A1, B1, C5); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C6 = cj.pmadd(A1, B2, C6); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - if(nr==4) C7 = cj.pmadd(A1, B3, C7); - - blB += nr*PacketSize; - blA += mr; - } - - // let's check whether the mr x nr block overlap the diagonal, - // is so then we have to carefully discard writes off the diagonal - if(UpLo==LowerTriangular ? i>=j2+nr : i+mr<=j2) - { - ei_pstoreu(&res[(j2+0)*resStride + i], C0); - ei_pstoreu(&res[(j2+1)*resStride + i], C1); - if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2); - if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3); - ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4); - ei_pstoreu(&res[(j2+1)*resStride + i + PacketSize], C5); - if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i + PacketSize], C6); - if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i + PacketSize], C7); - } - else - { - Scalar buf[mr*nr]; - // overlap => copy to a temporary mr x nr buffer and then triangular copy - ei_pstore(&buf[0*mr], C0); - ei_pstore(&buf[1*mr], C1); - if(nr==4) ei_pstore(&buf[2*mr], C2); - if(nr==4) ei_pstore(&buf[3*mr], C3); - ei_pstore(&buf[0*mr + PacketSize], C4); - ei_pstore(&buf[1*mr + PacketSize], C5); - if(nr==4) ei_pstore(&buf[2*mr + PacketSize], C6); - if(nr==4) ei_pstore(&buf[3*mr + PacketSize], C7); - - for(int j1=0; j1<nr; ++j1) - for(int i1=0; i1<mr; ++i1) - { - if(UpLo==LowerTriangular ? i+i1 >= j2+j1 : i+i1 <= j2+j1) - res[(j2+j1)*resStride + i+i1] = buf[i1 + j1 * mr]; - } - } - } - for(int i=std::max(start_i,peeled_mc); i<std::min(end_i,actual_mc); i++) - { - const Scalar* blA = &blockA[i*actual_kc]; - #ifdef EIGEN_VECTORIZE_SSE - _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); - #endif - - // gets a 1 x nr res block as registers - Scalar C0(0), C1(0), C2(0), C3(0); - const Scalar* blB = &blockB[j2*actual_kc*PacketSize]; - for(int k=0; k<actual_kc; k++) - { - Scalar B0, B1, B2, B3, A0; - - A0 = blA[k]; - B0 = blB[0*PacketSize]; - B1 = blB[1*PacketSize]; - C0 = cj.pmadd(A0, B0, C0); - if(nr==4) B2 = blB[2*PacketSize]; - if(nr==4) B3 = blB[3*PacketSize]; - C1 = cj.pmadd(A0, B1, C1); - if(nr==4) C2 = cj.pmadd(A0, B2, C2); - if(nr==4) C3 = cj.pmadd(A0, B3, C3); - - blB += nr*PacketSize; - } - if(UpLo==LowerTriangular ? i>=j2+nr : i+mr<=j2) { - res[(j2+0)*resStride + i] += C0; - res[(j2+1)*resStride + i] += C1; - if(nr==4) res[(j2+2)*resStride + i] += C2; - if(nr==4) res[(j2+3)*resStride + i] += C3; - } - else - { - if(UpLo==LowerTriangular ? i>=j2+0 : i<=j2+0) res[(j2+0)*resStride + i] += C0; - if(UpLo==LowerTriangular ? i>=j2+1 : i<=j2+1) res[(j2+1)*resStride + i] += C1; - if(nr==4) if(UpLo==LowerTriangular ? i>=j2+2 : i<=j2+2) res[(j2+2)*resStride + i] += C2; - if(nr==4) if(UpLo==LowerTriangular ? i>=j2+3 : i<=j2+3) res[(j2+3)*resStride + i] += C3; - } - } - } - - // process remaining rhs/res columns one at a time - // => do the same but with nr==1 - for(int j2=packet_cols; j2<actual_mc; j2++) - { - int start_i = UpLo==LowerTriangular ? (j2/mr)*mr : 0; - int end_i = UpLo==LowerTriangular ? actual_mc : std::min(actual_mc,j2+1); - for(int i=start_i; i<std::min(end_i,peeled_mc); i+=mr) - { - const Scalar* blA = &blockA[i*actual_kc]; - #ifdef EIGEN_VECTORIZE_SSE - _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); - #endif - - // TODO move the res loads to the stores - - // gets res block as register - PacketType C0, C4; - C0 = ei_ploadu(&res[(j2+0)*resStride + i]); - C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]); - - const Scalar* blB = &blockB[j2*actual_kc*PacketSize]; - for(int k=0; k<actual_kc; k++) - { - PacketType B0, A0, A1; - - A0 = ei_pload(&blA[0*PacketSize]); - A1 = ei_pload(&blA[1*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - C0 = cj.pmadd(A0, B0, C0); - C4 = cj.pmadd(A1, B0, C4); - - blB += PacketSize; - blA += mr; - } - - if(UpLo==LowerTriangular ? i>=j2 : i<=j2) ei_pstoreu(&res[(j2+0)*resStride + i], C0); - if(UpLo==LowerTriangular ? i+PacketSize>=j2 : i+PacketSize<=j2) ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4); - } if(UpLo==LowerTriangular) - start_i = j2; - for(int i=std::max(start_i,peeled_mc); i<std::min(end_i,actual_mc); i++) { - const Scalar* blA = &blockA[i*actual_kc]; - #ifdef EIGEN_VECTORIZE_SSE - _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0); - #endif - - // gets a 1 x 1 res block as registers - Scalar C0(0); - const Scalar* blB = &blockB[j2*actual_kc*PacketSize]; - for(int k=0; k<actual_kc; k++) - C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0); - res[(j2+0)*resStride + i] += C0; + int i = j+actualBlockSize; + gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize); } } } |