about | summary | refs | log | tree | commit | diff | homepage
path: root/Eigen/src/Core
diff options
context:
space:
mode:
author: Gael Guennebaud <g.gael@free.fr> 2010-02-22 15:18:29 +0100
committer: Gael Guennebaud <g.gael@free.fr> 2010-02-22 15:18:29 +0100
commit: 51a4b929a17ec36c45b1fb814c566098a164b7df (patch)
tree: fa58e567acc9ab48fb0be6a2f8dd9a608ebb8dc4 /Eigen/src/Core
parent: 3e6ab8f93bd794f3f81f08c98098b146791719de (diff)
Implement an even lower-level version of the GEBP kernel for MSVC (it seems to be faster with GCC as well)
Diffstat (limited to 'Eigen/src/Core')
-rw-r--r-- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 72
1 files changed, 70 insertions, 2 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index fe1987bdd..8c29d2218 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -76,6 +76,7 @@ struct ei_gebp_kernel
{
PacketType B0, B1, B2, B3, A0, A1;
+ #if 0
A0 = ei_pload(&blA[0*PacketSize]);
A1 = ei_pload(&blA[1*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]);
@@ -134,6 +135,73 @@ struct ei_gebp_kernel
if(nr==4) C3 = cj.pmadd(A0, B3, C3);
if(nr==4) C7 = cj.pmadd(A1, B3, C7);
+ #else
+
+ PacketType T0, T1;
+
+ #define MADD(A,B,C,T) { T = A; T = ei_pmul(T,B); C = ei_padd(C,T); }
+
+ A0 = ei_pload(&blA[0*PacketSize]);
+ A1 = ei_pload(&blA[1*PacketSize]);
+ B0 = ei_pload(&blB[0*PacketSize]);
+ B1 = ei_pload(&blB[1*PacketSize]);
+
+ MADD(A0,B0,C0,T0); // C0 = cj.pmadd(A0, B0, C0);
+ if(nr==4) B2 = ei_pload(&blB[2*PacketSize]);
+ MADD(A1,B0,C4,T1); // C4 = cj.pmadd(A1, B0, C4);
+ if(nr==4) B3 = ei_pload(&blB[3*PacketSize]);
+ B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
+ MADD(A0,B1,C1,T0); // C1 = cj.pmadd(A0, B1, C1);
+ MADD(A1,B1,C5,T1); // C5 = cj.pmadd(A1, B1, C5);
+ B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]);
+ if(nr==4) { MADD(A0,B2,C2,T0); }// C2 = cj.pmadd(A0, B2, C2);
+ if(nr==4) { MADD(A1,B2,C6,T1); }// C6 = cj.pmadd(A1, B2, C6);
+ if(nr==4) B2 = ei_pload(&blB[6*PacketSize]);
+ if(nr==4) { MADD(A0,B3,C3,T0); }// C3 = cj.pmadd(A0, B3, C3);
+ A0 = ei_pload(&blA[2*PacketSize]);
+ if(nr==4) { MADD(A1,B3,C7,T1); }// C7 = cj.pmadd(A1, B3, C7);
+ A1 = ei_pload(&blA[3*PacketSize]);
+ if(nr==4) B3 = ei_pload(&blB[7*PacketSize]);
+ MADD(A0,B0,C0,T0); // C0 = cj.pmadd(A0, B0, C0);
+ MADD(A1,B0,C4,T1); // C4 = cj.pmadd(A1, B0, C4);
+ B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]);
+ MADD(A0,B1,C1,T0); // C1 = cj.pmadd(A0, B1, C1);
+ MADD(A1,B1,C5,T1); // C5 = cj.pmadd(A1, B1, C5);
+ B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]);
+ if(nr==4) { MADD(A0,B2,C2,T0); }// C2 = cj.pmadd(A0, B2, C2);
+ if(nr==4) { MADD(A1,B2,C6,T1); }// C6 = cj.pmadd(A1, B2, C6);
+ if(nr==4) B2 = ei_pload(&blB[10*PacketSize]);
+ if(nr==4) { MADD(A0,B3,C3,T0); } // C3 = cj.pmadd(A0, B3, C3);
+ A0 = ei_pload(&blA[4*PacketSize]);
+ if(nr==4) { MADD(A1,B3,C7,T1); }// C7 = cj.pmadd(A1, B3, C7);
+ A1 = ei_pload(&blA[5*PacketSize]);
+ if(nr==4) B3 = ei_pload(&blB[11*PacketSize]);
+
+ MADD(A0,B0,C0,T0); // C0 = cj.pmadd(A0, B0, C0);
+ MADD(A1,B0,C4,T1); // C4 = cj.pmadd(A1, B0, C4);
+ B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
+ MADD(A0,B1,C1,T0); // C1 = cj.pmadd(A0, B1, C1);
+ MADD(A1,B1,C5,T1); // C5 = cj.pmadd(A1, B1, C5);
+ B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
+ if(nr==4) { MADD(A0,B2,C2,T0); }// C2 = cj.pmadd(A0, B2, C2);
+ if(nr==4) { MADD(A1,B2,C6,T1); }// C6 = cj.pmadd(A1, B2, C6);
+ if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
+ if(nr==4) { MADD(A0,B3,C3,T0); } // C3 = cj.pmadd(A0, B3, C3);
+ A0 = ei_pload(&blA[6*PacketSize]);
+ if(nr==4) { MADD(A1,B3,C7,T1); } // C7 = cj.pmadd(A1, B3, C7);
+ A1 = ei_pload(&blA[7*PacketSize]);
+ if(nr==4) B3 = ei_pload(&blB[15*PacketSize]);
+ MADD(A0,B0,C0,T0); // C0 = cj.pmadd(A0, B0, C0);
+ MADD(A1,B0,C4,T1); // C4 = cj.pmadd(A1, B0, C4);
+ MADD(A0,B1,C1,T0); // C1 = cj.pmadd(A0, B1, C1);
+ MADD(A1,B1,C5,T1); // C5 = cj.pmadd(A1, B1, C5);
+ if(nr==4) { MADD(A0,B2,C2,T0); }// C2 = cj.pmadd(A0, B2, C2);
+ if(nr==4) { MADD(A1,B2,C6,T1); }//C6 = cj.pmadd(A1, B2, C6);
+ if(nr==4) { MADD(A0,B3,C3,T0); }//C3 = cj.pmadd(A0, B3, C3);
+ if(nr==4) { MADD(A1,B3,C7,T1); }//C7 = cj.pmadd(A1, B3, C7);
+
+ #endif
+
blB += 4*nr*PacketSize;
blA += 4*mr;
}
@@ -334,7 +402,7 @@ struct ei_gebp_kernel
#endif
PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
-
+
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB];
for(int k=0; k<depth; k++)
{
@@ -474,7 +542,7 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor, PanelMode>
// skip what we have after
if(PanelMode) count += PacketSize * nr * (stride-offset-depth);
}
-
+
// copy the remaining columns one at a time (nr==1)
for(int j2=packet_cols; j2<cols; ++j2)
{