diff options
author | Gael Guennebaud <g.gael@free.fr> | 2014-03-31 10:42:19 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2014-03-31 10:42:19 +0200 |
commit | 8d0441052e7fac530fad12016f53f5b234a68d47 (patch) | |
tree | a2054eef05d035911263f5751898dd97b9e6a1f8 /Eigen/src/Core/products | |
parent | 82c81630679b44bd1a3f7842152f12179428c9f7 (diff) |
Finally, prefetching seems to help achieve more stable performance
Diffstat (limited to 'Eigen/src/Core/products')
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 9 |
1 file changed, 8 insertions, 1 deletion
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index a9e42c8aa..d9e659c9a 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -639,7 +639,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> for(Index i=0; i<peeled_mc; i+=mr) { const LhsScalar* blA = &blockA[i*strideA+offsetA*mr]; - // prefetch(&blA[0]); + prefetch(&blA[0]); // gets res block as register AccPacket C0, C1, C2, C3, C4, C5, C6, C7; @@ -771,6 +771,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> for(Index i=peeled_mc; i<rows2; i+=2) { const LhsScalar* blA = &blockA[i*strideA+offsetA]; + prefetch(&blA[0]); const RhsScalar* blB = &blockB[j2*strideB+offsetB*8]; EIGEN_ASM_COMMENT("begin_vectorized_multiplication_of_last_rows 2x8"); @@ -822,6 +823,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> { Index i = rows-1; const LhsScalar* blA = &blockA[i*strideA+offsetA]; + prefetch(&blA[0]); const RhsScalar* blB = &blockB[j2*strideB+offsetB*8]; EIGEN_ASM_COMMENT("begin_vectorized_multiplication_of_last_rows 8"); @@ -863,6 +865,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> for(Index i=peeled_mc; i<rows; i++) { const LhsScalar* blA = &blockA[i*strideA+offsetA]; + prefetch(&blA[0]); const RhsScalar* blB = &blockB[j2*strideB+offsetB*8]; // gets a 1 x 8 res block as registers @@ -924,6 +927,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> for(Index i=0; i<peeled_mc; i+=mr) { const LhsScalar* blA = &blockA[i*strideA+offsetA*mr]; + prefetch(&blA[0]); // gets res block as register AccPacket C0, C1, C2, C3; @@ -996,6 +1000,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> for(Index i=peeled_mc; i<rows; i++) { const LhsScalar* blA = &blockA[i*strideA+offsetA]; + prefetch(&blA[0]); const RhsScalar* blB 
= &blockB[j2*strideB+offsetB*4]; // TODO vectorize in more cases @@ -1067,6 +1072,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> traits.initAcc(C0); const LhsScalar* blA = &blockA[i*strideA+offsetA*mr]; + prefetch(&blA[0]); const RhsScalar* blB = &blockB[j2*strideB+offsetB]; for(Index k=0; k<depth; k++) { @@ -1091,6 +1097,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> for(Index i=peeled_mc; i<rows; i++) { const LhsScalar* blA = &blockA[i*strideA+offsetA]; + prefetch(&blA[0]); // gets a 1 x 1 res block as registers ResScalar C0(0); const RhsScalar* blB = &blockB[j2*strideB+offsetB]; |