aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/products/GeneralBlockPanelKernel.h
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2010-07-11 15:48:30 +0200
committerGravatar Gael Guennebaud <g.gael@free.fr>2010-07-11 15:48:30 +0200
commitff96c94043d575e4d0dd477c1ed2487e33f79627 (patch)
tree5e9736916779fdacd431c2591a3ec1f77333e505 /Eigen/src/Core/products/GeneralBlockPanelKernel.h
parent4161b8be6772f2b7338458c9932d7417797966bb (diff)
mixing types in product step 2:
* pload* and pset1 are now templated on the packet type * gemv routines are now embedded into a structure with a consistent API with respect to gemm * some configurations of vector * matrix and matrix * matrix work fine, some need more work...
Diffstat (limited to 'Eigen/src/Core/products/GeneralBlockPanelKernel.h')
-rw-r--r--Eigen/src/Core/products/GeneralBlockPanelKernel.h332
1 files changed, 167 insertions, 165 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 3dae26eee..ffb4cd386 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -133,11 +133,13 @@ inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, st
computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
}
-#ifdef EIGEN_HAS_FUSE_CJMADD
+// FIXME
+// #ifdef EIGEN_HAS_FUSE_CJMADD
#define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
-#else
- #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = ei_padd(C,T);
-#endif
+// #else
+ //#define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = ei_padd(C,ResPacket(T));
+// #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T);
+// #endif
// optimized GEneral packed Block * packed Panel product kernel
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
@@ -152,13 +154,13 @@ struct ei_gebp_kernel
ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1
};
- typedef typename ei_packet_traits<LhsScalar>::type _LhsPacketType;
- typedef typename ei_packet_traits<RhsScalar>::type _RhsPacketType;
- typedef typename ei_packet_traits<ResScalar>::type _ResPacketType;
+ typedef typename ei_packet_traits<LhsScalar>::type _LhsPacket;
+ typedef typename ei_packet_traits<RhsScalar>::type _RhsPacket;
+ typedef typename ei_packet_traits<ResScalar>::type _ResPacket;
- typedef typename ei_meta_if<Vectorizable,_LhsPacketType,LhsScalar>::ret LhsPacketType;
- typedef typename ei_meta_if<Vectorizable,_RhsPacketType,RhsScalar>::ret RhsPacketType;
- typedef typename ei_meta_if<Vectorizable,_ResPacketType,ResScalar>::ret ResPacketType;
+ typedef typename ei_meta_if<Vectorizable,_LhsPacket,LhsScalar>::ret LhsPacket;
+ typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket;
+ typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket;
void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols,
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0)
@@ -166,7 +168,7 @@ struct ei_gebp_kernel
if(strideA==-1) strideA = depth;
if(strideB==-1) strideB = depth;
ei_conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
- ei_conj_helper<LhsPacketType,RhsPacketType,ConjugateLhs,ConjugateRhs> pcj;
+ ei_conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
Index packet_cols = (cols/nr) * nr;
const Index peeled_mc = (rows/mr)*mr;
const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsPacketSize ? LhsPacketSize : 0);
@@ -183,7 +185,7 @@ struct ei_gebp_kernel
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
Index n = depth*nr;
for(Index k=0; k<n; k++)
- ei_pstore(&unpackedB[k*RhsPacketSize], ei_pset1(blB[k]));
+ ei_pstore(&unpackedB[k*RhsPacketSize], ei_pset1<RhsPacket>(blB[k]));
/*Scalar* dest = unpackedB;
for(Index k=0; k<n; k+=4*PacketSize)
{
@@ -197,11 +199,11 @@ struct ei_gebp_kernel
_mm_prefetch((const char*)(&dest[S+3*G]), _MM_HINT_T0);
#endif
- PacketType C0[PacketSize], C1[PacketSize], C2[PacketSize], C3[PacketSize];
- C0[0] = ei_pload(blB+0*PacketSize);
- C1[0] = ei_pload(blB+1*PacketSize);
- C2[0] = ei_pload(blB+2*PacketSize);
- C3[0] = ei_pload(blB+3*PacketSize);
+ RhsPacket C0[PacketSize], C1[PacketSize], C2[PacketSize], C3[PacketSize];
+ C0[0] = ei_pload<RhsPacket>(blB+0*PacketSize);
+ C1[0] = ei_pload<RhsPacket>(blB+1*PacketSize);
+ C2[0] = ei_pload<RhsPacket>(blB+2*PacketSize);
+ C3[0] = ei_pload<RhsPacket>(blB+3*PacketSize);
ei_punpackp(C0);
ei_punpackp(C1);
@@ -243,15 +245,15 @@ struct ei_gebp_kernel
// TODO move the res loads to the stores
// gets res block as register
- ResPacketType C0, C1, C2, C3, C4, C5, C6, C7;
- C0 = ei_pset1(ResScalar(0));
- C1 = ei_pset1(ResScalar(0));
- if(nr==4) C2 = ei_pset1(ResScalar(0));
- if(nr==4) C3 = ei_pset1(ResScalar(0));
- C4 = ei_pset1(ResScalar(0));
- C5 = ei_pset1(ResScalar(0));
- if(nr==4) C6 = ei_pset1(ResScalar(0));
- if(nr==4) C7 = ei_pset1(ResScalar(0));
+ ResPacket C0, C1, C2, C3, C4, C5, C6, C7;
+ C0 = ei_pset1<ResPacket>(ResScalar(0));
+ C1 = ei_pset1<ResPacket>(ResScalar(0));
+ if(nr==4) C2 = ei_pset1<ResPacket>(ResScalar(0));
+ if(nr==4) C3 = ei_pset1<ResPacket>(ResScalar(0));
+ C4 = ei_pset1<ResPacket>(ResScalar(0));
+ C5 = ei_pset1<ResPacket>(ResScalar(0));
+ if(nr==4) C6 = ei_pset1<ResPacket>(ResScalar(0));
+ if(nr==4) C7 = ei_pset1<ResPacket>(ResScalar(0));
ResScalar* r0 = &res[(j2+0)*resStride + i];
ResScalar* r1 = r0 + resStride;
@@ -271,106 +273,106 @@ struct ei_gebp_kernel
{
if(nr==2)
{
- LhsPacketType A0, A1;
- RhsPacketType B0;
+ LhsPacket A0, A1;
+ RhsPacket B0;
#ifndef EIGEN_HAS_FUSE_CJMADD
- ResPacketType T0;
+ RhsPacket T0;
#endif
EIGEN_ASM_COMMENT("mybegin");
- A0 = ei_pload(&blA[0*LhsPacketSize]);
- A1 = ei_pload(&blA[1*LhsPacketSize]);
- B0 = ei_pload(&blB[0*RhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]);
+ A1 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]);
MADD(pcj,A0,B0,C0,T0);
MADD(pcj,A1,B0,C4,B0);
- B0 = ei_pload(&blB[1*RhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]);
MADD(pcj,A0,B0,C1,T0);
MADD(pcj,A1,B0,C5,B0);
- A0 = ei_pload(&blA[2*LhsPacketSize]);
- A1 = ei_pload(&blA[3*LhsPacketSize]);
- B0 = ei_pload(&blB[2*RhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[2*LhsPacketSize]);
+ A1 = ei_pload<LhsPacket>(&blA[3*LhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[2*RhsPacketSize]);
MADD(pcj,A0,B0,C0,T0);
MADD(pcj,A1,B0,C4,B0);
- B0 = ei_pload(&blB[3*RhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[3*RhsPacketSize]);
MADD(pcj,A0,B0,C1,T0);
MADD(pcj,A1,B0,C5,B0);
- A0 = ei_pload(&blA[4*LhsPacketSize]);
- A1 = ei_pload(&blA[5*LhsPacketSize]);
- B0 = ei_pload(&blB[4*RhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[4*LhsPacketSize]);
+ A1 = ei_pload<LhsPacket>(&blA[5*LhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[4*RhsPacketSize]);
MADD(pcj,A0,B0,C0,T0);
MADD(pcj,A1,B0,C4,B0);
- B0 = ei_pload(&blB[5*RhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[5*RhsPacketSize]);
MADD(pcj,A0,B0,C1,T0);
MADD(pcj,A1,B0,C5,B0);
- A0 = ei_pload(&blA[6*LhsPacketSize]);
- A1 = ei_pload(&blA[7*LhsPacketSize]);
- B0 = ei_pload(&blB[6*RhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[6*LhsPacketSize]);
+ A1 = ei_pload<LhsPacket>(&blA[7*LhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[6*RhsPacketSize]);
MADD(pcj,A0,B0,C0,T0);
MADD(pcj,A1,B0,C4,B0);
- B0 = ei_pload(&blB[7*RhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[7*RhsPacketSize]);
MADD(pcj,A0,B0,C1,T0);
MADD(pcj,A1,B0,C5,B0);
EIGEN_ASM_COMMENT("myend");
}
else
{
- LhsPacketType A0, A1;
- RhsPacketType B0, B1, B2, B3;
+ LhsPacket A0, A1;
+ RhsPacket B0, B1, B2, B3;
#ifndef EIGEN_HAS_FUSE_CJMADD
- ResPacketType T0;
+ RhsPacket T0;
#endif
- A0 = ei_pload(&blA[0*LhsPacketSize]);
- A1 = ei_pload(&blA[1*LhsPacketSize]);
- B0 = ei_pload(&blB[0*RhsPacketSize]);
- B1 = ei_pload(&blB[1*RhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]);
+ A1 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]);
+ B1 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]);
MADD(pcj,A0,B0,C0,T0);
- B2 = ei_pload(&blB[2*RhsPacketSize]);
+ B2 = ei_pload<RhsPacket>(&blB[2*RhsPacketSize]);
MADD(pcj,A1,B0,C4,B0);
- B3 = ei_pload(&blB[3*RhsPacketSize]);
- B0 = ei_pload(&blB[4*RhsPacketSize]);
+ B3 = ei_pload<RhsPacket>(&blB[3*RhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[4*RhsPacketSize]);
MADD(pcj,A0,B1,C1,T0);
MADD(pcj,A1,B1,C5,B1);
- B1 = ei_pload(&blB[5*RhsPacketSize]);
+ B1 = ei_pload<RhsPacket>(&blB[5*RhsPacketSize]);
MADD(pcj,A0,B2,C2,T0);
MADD(pcj,A1,B2,C6,B2);
- B2 = ei_pload(&blB[6*RhsPacketSize]);
+ B2 = ei_pload<RhsPacket>(&blB[6*RhsPacketSize]);
MADD(pcj,A0,B3,C3,T0);
- A0 = ei_pload(&blA[2*LhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[2*LhsPacketSize]);
MADD(pcj,A1,B3,C7,B3);
- A1 = ei_pload(&blA[3*LhsPacketSize]);
- B3 = ei_pload(&blB[7*RhsPacketSize]);
+ A1 = ei_pload<LhsPacket>(&blA[3*LhsPacketSize]);
+ B3 = ei_pload<RhsPacket>(&blB[7*RhsPacketSize]);
MADD(pcj,A0,B0,C0,T0);
MADD(pcj,A1,B0,C4,B0);
- B0 = ei_pload(&blB[8*RhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[8*RhsPacketSize]);
MADD(pcj,A0,B1,C1,T0);
MADD(pcj,A1,B1,C5,B1);
- B1 = ei_pload(&blB[9*RhsPacketSize]);
+ B1 = ei_pload<RhsPacket>(&blB[9*RhsPacketSize]);
MADD(pcj,A0,B2,C2,T0);
MADD(pcj,A1,B2,C6,B2);
- B2 = ei_pload(&blB[10*RhsPacketSize]);
+ B2 = ei_pload<RhsPacket>(&blB[10*RhsPacketSize]);
MADD(pcj,A0,B3,C3,T0);
- A0 = ei_pload(&blA[4*LhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[4*LhsPacketSize]);
MADD(pcj,A1,B3,C7,B3);
- A1 = ei_pload(&blA[5*LhsPacketSize]);
- B3 = ei_pload(&blB[11*RhsPacketSize]);
+ A1 = ei_pload<LhsPacket>(&blA[5*LhsPacketSize]);
+ B3 = ei_pload<RhsPacket>(&blB[11*RhsPacketSize]);
MADD(pcj,A0,B0,C0,T0);
MADD(pcj,A1,B0,C4,B0);
- B0 = ei_pload(&blB[12*RhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[12*RhsPacketSize]);
MADD(pcj,A0,B1,C1,T0);
MADD(pcj,A1,B1,C5,B1);
- B1 = ei_pload(&blB[13*RhsPacketSize]);
+ B1 = ei_pload<RhsPacket>(&blB[13*RhsPacketSize]);
MADD(pcj,A0,B2,C2,T0);
MADD(pcj,A1,B2,C6,B2);
- B2 = ei_pload(&blB[14*RhsPacketSize]);
+ B2 = ei_pload<RhsPacket>(&blB[14*RhsPacketSize]);
MADD(pcj,A0,B3,C3,T0);
- A0 = ei_pload(&blA[6*LhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[6*LhsPacketSize]);
MADD(pcj,A1,B3,C7,B3);
- A1 = ei_pload(&blA[7*LhsPacketSize]);
- B3 = ei_pload(&blB[15*RhsPacketSize]);
+ A1 = ei_pload<LhsPacket>(&blA[7*LhsPacketSize]);
+ B3 = ei_pload<RhsPacket>(&blB[15*RhsPacketSize]);
MADD(pcj,A0,B0,C0,T0);
MADD(pcj,A1,B0,C4,B0);
MADD(pcj,A0,B1,C1,T0);
@@ -389,38 +391,38 @@ EIGEN_ASM_COMMENT("myend");
{
if(nr==2)
{
- LhsPacketType A0, A1;
- RhsPacketType B0;
+ LhsPacket A0, A1;
+ RhsPacket B0;
#ifndef EIGEN_HAS_FUSE_CJMADD
- ResPacketType T0;
+ RhsPacket T0;
#endif
- A0 = ei_pload(&blA[0*LhsPacketSize]);
- A1 = ei_pload(&blA[1*LhsPacketSize]);
- B0 = ei_pload(&blB[0*RhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]);
+ A1 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]);
MADD(pcj,A0,B0,C0,T0);
MADD(pcj,A1,B0,C4,B0);
- B0 = ei_pload(&blB[1*RhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]);
MADD(pcj,A0,B0,C1,T0);
MADD(pcj,A1,B0,C5,B0);
}
else
{
- LhsPacketType A0, A1;
- RhsPacketType B0, B1, B2, B3;
+ LhsPacket A0, A1;
+ RhsPacket B0, B1, B2, B3;
#ifndef EIGEN_HAS_FUSE_CJMADD
- ResPacketType T0;
+ RhsPacket T0;
#endif
- A0 = ei_pload(&blA[0*LhsPacketSize]);
- A1 = ei_pload(&blA[1*LhsPacketSize]);
- B0 = ei_pload(&blB[0*RhsPacketSize]);
- B1 = ei_pload(&blB[1*RhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]);
+ A1 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]);
+ B1 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]);
MADD(pcj,A0,B0,C0,T0);
- B2 = ei_pload(&blB[2*RhsPacketSize]);
+ B2 = ei_pload<RhsPacket>(&blB[2*RhsPacketSize]);
MADD(pcj,A1,B0,C4,B0);
- B3 = ei_pload(&blB[3*RhsPacketSize]);
+ B3 = ei_pload<RhsPacket>(&blB[3*RhsPacketSize]);
MADD(pcj,A0,B1,C1,T0);
MADD(pcj,A1,B1,C5,B1);
MADD(pcj,A0,B2,C2,T0);
@@ -433,16 +435,16 @@ EIGEN_ASM_COMMENT("myend");
blA += mr;
}
- ResPacketType R0, R1, R2, R3, R4, R5, R6, R7;
+ ResPacket R0, R1, R2, R3, R4, R5, R6, R7;
- R0 = ei_ploadu(r0);
- R1 = ei_ploadu(r1);
- if(nr==4) R2 = ei_ploadu(r2);
- if(nr==4) R3 = ei_ploadu(r3);
- R4 = ei_ploadu(r0 + ResPacketSize);
- R5 = ei_ploadu(r1 + ResPacketSize);
- if(nr==4) R6 = ei_ploadu(r2 + ResPacketSize);
- if(nr==4) R7 = ei_ploadu(r3 + ResPacketSize);
+ R0 = ei_ploadu<ResPacket>(r0);
+ R1 = ei_ploadu<ResPacket>(r1);
+ if(nr==4) R2 = ei_ploadu<ResPacket>(r2);
+ if(nr==4) R3 = ei_ploadu<ResPacket>(r3);
+ R4 = ei_ploadu<ResPacket>(r0 + ResPacketSize);
+ R5 = ei_ploadu<ResPacket>(r1 + ResPacketSize);
+ if(nr==4) R6 = ei_ploadu<ResPacket>(r2 + ResPacketSize);
+ if(nr==4) R7 = ei_ploadu<ResPacket>(r3 + ResPacketSize);
C0 = ei_padd(R0, C0);
C1 = ei_padd(R1, C1);
@@ -469,11 +471,11 @@ EIGEN_ASM_COMMENT("myend");
ei_prefetch(&blA[0]);
// gets res block as register
- ResPacketType C0, C1, C2, C3;
- C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
- C1 = ei_ploadu(&res[(j2+1)*resStride + i]);
- if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]);
- if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]);
+ ResPacket C0, C1, C2, C3;
+ C0 = ei_ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
+ C1 = ei_ploadu<ResPacket>(&res[(j2+1)*resStride + i]);
+ if(nr==4) C2 = ei_ploadu<ResPacket>(&res[(j2+2)*resStride + i]);
+ if(nr==4) C3 = ei_ploadu<ResPacket>(&res[(j2+3)*resStride + i]);
// performs "inner" product
const RhsScalar* blB = unpackedB;
@@ -481,70 +483,70 @@ EIGEN_ASM_COMMENT("myend");
{
if(nr==2)
{
- LhsPacketType A0;
- RhsPacketType B0, B1;
+ LhsPacket A0;
+ RhsPacket B0, B1;
- A0 = ei_pload(&blA[0*LhsPacketSize]);
- B0 = ei_pload(&blB[0*RhsPacketSize]);
- B1 = ei_pload(&blB[1*RhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]);
+ B1 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]);
MADD(pcj,A0,B0,C0,B0);
- B0 = ei_pload(&blB[2*RhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[2*RhsPacketSize]);
MADD(pcj,A0,B1,C1,B1);
- A0 = ei_pload(&blA[1*LhsPacketSize]);
- B1 = ei_pload(&blB[3*RhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]);
+ B1 = ei_pload<RhsPacket>(&blB[3*RhsPacketSize]);
MADD(pcj,A0,B0,C0,B0);
- B0 = ei_pload(&blB[4*RhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[4*RhsPacketSize]);
MADD(pcj,A0,B1,C1,B1);
- A0 = ei_pload(&blA[2*LhsPacketSize]);
- B1 = ei_pload(&blB[5*RhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[2*LhsPacketSize]);
+ B1 = ei_pload<RhsPacket>(&blB[5*RhsPacketSize]);
MADD(pcj,A0,B0,C0,B0);
- B0 = ei_pload(&blB[6*RhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[6*RhsPacketSize]);
MADD(pcj,A0,B1,C1,B1);
- A0 = ei_pload(&blA[3*LhsPacketSize]);
- B1 = ei_pload(&blB[7*RhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[3*LhsPacketSize]);
+ B1 = ei_pload<RhsPacket>(&blB[7*RhsPacketSize]);
MADD(pcj,A0,B0,C0,B0);
MADD(pcj,A0,B1,C1,B1);
}
else
{
- LhsPacketType A0;
- RhsPacketType B0, B1, B2, B3;
+ LhsPacket A0;
+ RhsPacket B0, B1, B2, B3;
- A0 = ei_pload(&blA[0*LhsPacketSize]);
- B0 = ei_pload(&blB[0*RhsPacketSize]);
- B1 = ei_pload(&blB[1*RhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]);
+ B1 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]);
MADD(pcj,A0,B0,C0,B0);
- B2 = ei_pload(&blB[2*RhsPacketSize]);
- B3 = ei_pload(&blB[3*RhsPacketSize]);
- B0 = ei_pload(&blB[4*RhsPacketSize]);
+ B2 = ei_pload<RhsPacket>(&blB[2*RhsPacketSize]);
+ B3 = ei_pload<RhsPacket>(&blB[3*RhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[4*RhsPacketSize]);
MADD(pcj,A0,B1,C1,B1);
- B1 = ei_pload(&blB[5*RhsPacketSize]);
+ B1 = ei_pload<RhsPacket>(&blB[5*RhsPacketSize]);
MADD(pcj,A0,B2,C2,B2);
- B2 = ei_pload(&blB[6*RhsPacketSize]);
+ B2 = ei_pload<RhsPacket>(&blB[6*RhsPacketSize]);
MADD(pcj,A0,B3,C3,B3);
- A0 = ei_pload(&blA[1*LhsPacketSize]);
- B3 = ei_pload(&blB[7*RhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]);
+ B3 = ei_pload<RhsPacket>(&blB[7*RhsPacketSize]);
MADD(pcj,A0,B0,C0,B0);
- B0 = ei_pload(&blB[8*RhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[8*RhsPacketSize]);
MADD(pcj,A0,B1,C1,B1);
- B1 = ei_pload(&blB[9*RhsPacketSize]);
+ B1 = ei_pload<RhsPacket>(&blB[9*RhsPacketSize]);
MADD(pcj,A0,B2,C2,B2);
- B2 = ei_pload(&blB[10*RhsPacketSize]);
+ B2 = ei_pload<RhsPacket>(&blB[10*RhsPacketSize]);
MADD(pcj,A0,B3,C3,B3);
- A0 = ei_pload(&blA[2*LhsPacketSize]);
- B3 = ei_pload(&blB[11*RhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[2*LhsPacketSize]);
+ B3 = ei_pload<RhsPacket>(&blB[11*RhsPacketSize]);
MADD(pcj,A0,B0,C0,B0);
- B0 = ei_pload(&blB[12*RhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[12*RhsPacketSize]);
MADD(pcj,A0,B1,C1,B1);
- B1 = ei_pload(&blB[13*RhsPacketSize]);
+ B1 = ei_pload<RhsPacket>(&blB[13*RhsPacketSize]);
MADD(pcj,A0,B2,C2,B2);
- B2 = ei_pload(&blB[14*RhsPacketSize]);
+ B2 = ei_pload<RhsPacket>(&blB[14*RhsPacketSize]);
MADD(pcj,A0,B3,C3,B3);
- A0 = ei_pload(&blA[3*LhsPacketSize]);
- B3 = ei_pload(&blB[15*RhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[3*LhsPacketSize]);
+ B3 = ei_pload<RhsPacket>(&blB[15*RhsPacketSize]);
MADD(pcj,A0,B0,C0,B0);
MADD(pcj,A0,B1,C1,B1);
MADD(pcj,A0,B2,C2,B2);
@@ -559,31 +561,31 @@ EIGEN_ASM_COMMENT("myend");
{
if(nr==2)
{
- LhsPacketType A0;
- RhsPacketType B0;
+ LhsPacket A0;
+ RhsPacket B0;
#ifndef EIGEN_HAS_FUSE_CJMADD
- ResPacketType T0;
+ RhsPacket T0;
#endif
- A0 = ei_pload(&blA[0*LhsPacketSize]);
- B0 = ei_pload(&blB[0*RhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]);
MADD(pcj,A0,B0,C0,T0);
- B0 = ei_pload(&blB[1*RhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]);
MADD(pcj,A0,B0,C1,T0);
}
else
{
- LhsPacketType A0;
- RhsPacketType B0, B1, B2, B3;
+ LhsPacket A0;
+ RhsPacket B0, B1, B2, B3;
#ifndef EIGEN_HAS_FUSE_CJMADD
- ResPacketType T0, T1;
+ RhsPacket T0, T1;
#endif
- A0 = ei_pload(&blA[0*LhsPacketSize]);
- B0 = ei_pload(&blB[0*RhsPacketSize]);
- B1 = ei_pload(&blB[1*RhsPacketSize]);
- B2 = ei_pload(&blB[2*RhsPacketSize]);
- B3 = ei_pload(&blB[3*RhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]);
+ B1 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]);
+ B2 = ei_pload<RhsPacket>(&blB[2*RhsPacketSize]);
+ B3 = ei_pload<RhsPacket>(&blB[3*RhsPacketSize]);
MADD(pcj,A0,B0,C0,T0);
MADD(pcj,A0,B1,C1,T1);
@@ -662,7 +664,7 @@ EIGEN_ASM_COMMENT("myend");
{
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
for(Index k=0; k<depth; k++)
- ei_pstore(&unpackedB[k*RhsPacketSize], ei_pset1(blB[k]));
+ ei_pstore(&unpackedB[k*RhsPacketSize], ei_pset1<RhsPacket>(blB[k]));
}
for(Index i=0; i<peeled_mc; i+=mr)
@@ -673,22 +675,22 @@ EIGEN_ASM_COMMENT("myend");
// TODO move the res loads to the stores
// get res block as registers
- ResPacketType C0, C4;
- C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
- C4 = ei_ploadu(&res[(j2+0)*resStride + i + ResPacketSize]);
+ ResPacket C0, C4;
+ C0 = ei_ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
+ C4 = ei_ploadu<ResPacket>(&res[(j2+0)*resStride + i + ResPacketSize]);
const RhsScalar* blB = unpackedB;
for(Index k=0; k<depth; k++)
{
- LhsPacketType A0, A1;
- RhsPacketType B0;
+ LhsPacket A0, A1;
+ RhsPacket B0;
#ifndef EIGEN_HAS_FUSE_CJMADD
- ResPacketType T0, T1;
+ RhsPacket T0, T1;
#endif
- A0 = ei_pload(&blA[0*LhsPacketSize]);
- A1 = ei_pload(&blA[1*LhsPacketSize]);
- B0 = ei_pload(&blB[0*RhsPacketSize]);
+ A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]);
+ A1 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]);
+ B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]);
MADD(pcj,A0,B0,C0,T0);
MADD(pcj,A1,B0,C4,T1);
@@ -705,13 +707,13 @@ EIGEN_ASM_COMMENT("myend");
const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsPacketSize];
ei_prefetch(&blA[0]);
- ResPacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
+ ResPacket C0 = ei_ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
const RhsScalar* blB = unpackedB;
for(Index k=0; k<depth; k++)
{
- ResPacketType T0;
- MADD(pcj,ei_pload(blA), ei_pload(blB), C0, T0);
+ RhsPacket T0;
+ MADD(pcj,ei_pload<LhsPacket>(blA), ei_pload<RhsPacket>(blB), C0, T0);
blB += RhsPacketSize;
blA += LhsPacketSize;
}