about | summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author Gael Guennebaud <g.gael@free.fr> 2014-03-31 10:42:19 +0200
committer Gael Guennebaud <g.gael@free.fr> 2014-03-31 10:42:19 +0200
commit 8d0441052e7fac530fad12016f53f5b234a68d47 (patch)
tree a2054eef05d035911263f5751898dd97b9e6a1f8
parent 82c81630679b44bd1a3f7842152f12179428c9f7 (diff)
Finally, prefetching seems to help getting more stable performance
-rw-r--r-- Eigen/src/Core/products/GeneralBlockPanelKernel.h 9
1 file changed, 8 insertions, 1 deletion
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index a9e42c8aa..d9e659c9a 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -639,7 +639,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
for(Index i=0; i<peeled_mc; i+=mr)
{
const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
- // prefetch(&blA[0]);
+ prefetch(&blA[0]);
// gets res block as register
AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
@@ -771,6 +771,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
for(Index i=peeled_mc; i<rows2; i+=2)
{
const LhsScalar* blA = &blockA[i*strideA+offsetA];
+ prefetch(&blA[0]);
const RhsScalar* blB = &blockB[j2*strideB+offsetB*8];
EIGEN_ASM_COMMENT("begin_vectorized_multiplication_of_last_rows 2x8");
@@ -822,6 +823,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
{
Index i = rows-1;
const LhsScalar* blA = &blockA[i*strideA+offsetA];
+ prefetch(&blA[0]);
const RhsScalar* blB = &blockB[j2*strideB+offsetB*8];
EIGEN_ASM_COMMENT("begin_vectorized_multiplication_of_last_rows 8");
@@ -863,6 +865,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
for(Index i=peeled_mc; i<rows; i++)
{
const LhsScalar* blA = &blockA[i*strideA+offsetA];
+ prefetch(&blA[0]);
const RhsScalar* blB = &blockB[j2*strideB+offsetB*8];
// gets a 1 x 8 res block as registers
@@ -924,6 +927,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
for(Index i=0; i<peeled_mc; i+=mr)
{
const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
+ prefetch(&blA[0]);
// gets res block as register
AccPacket C0, C1, C2, C3;
@@ -996,6 +1000,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
for(Index i=peeled_mc; i<rows; i++)
{
const LhsScalar* blA = &blockA[i*strideA+offsetA];
+ prefetch(&blA[0]);
const RhsScalar* blB = &blockB[j2*strideB+offsetB*4];
// TODO vectorize in more cases
@@ -1067,6 +1072,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
traits.initAcc(C0);
const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
+ prefetch(&blA[0]);
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
for(Index k=0; k<depth; k++)
{
@@ -1091,6 +1097,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
for(Index i=peeled_mc; i<rows; i++)
{
const LhsScalar* blA = &blockA[i*strideA+offsetA];
+ prefetch(&blA[0]);
// gets a 1 x 1 res block as registers
ResScalar C0(0);
const RhsScalar* blB = &blockB[j2*strideB+offsetB];