author    Gael Guennebaud <g.gael@free.fr>  2014-03-30 21:57:05 +0200
committer Gael Guennebaud <g.gael@free.fr>  2014-03-30 21:57:05 +0200
commit    e497a27ddc0d724bc0eadde1e211617b766dd100
tree      209004d068718b1db876a88da3a9e1e177401b04
parent    ad59ade116969ca7b18409d690caf00c0b1c34c7
Optimize gebp kernel:
1 - increase the peeling level along the depth dimension (+5% for large matrices, i.e., >1000)
2 - improve pipelining when dealing with the last rows of the lhs
Diffstat (limited to 'Eigen/src/Core')
-rw-r--r--  Eigen/src/Core/products/GeneralBlockPanelKernel.h | 458
1 file changed, 273 insertions, 185 deletions
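The first change in the commit message, a larger peeling level along the depth dimension, amounts to unrolling the innermost k loop of the kernel by a fixed factor so that more independent multiply-adds are in flight per iteration, with a remainder loop for the leftover depth. A minimal standalone sketch of the pattern, using a plain scalar dot product and hypothetical names rather than the actual packet kernel:

    // Sketch only (hypothetical helper, not Eigen code): unroll the k (depth)
    // loop by a fixed peeling factor pk, then handle the remainder separately.
    double dot_peeled(const double* lhs, const double* rhs, int depth)
    {
      const int pk = 8;                        // peeling factor, as in the patch
      const int peeled_kc = depth & ~(pk - 1); // largest multiple of pk <= depth
      double acc = 0;
      int k = 0;
      for (; k < peeled_kc; k += pk)
        for (int u = 0; u < pk; ++u)           // the real kernel spells out 8 explicit steps
          acc += lhs[k + u] * rhs[k + u];
      for (; k < depth; ++k)                   // remainder of the peeled loop
        acc += lhs[k] * rhs[k];
      return acc;
    }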
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 3ed1fc5a3..a9e42c8aa 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -95,6 +95,9 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
k = std::min<SizeType>(k, l1/kdiv);
SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
if(_m<m) m = _m & mr_mask;
+
+ m = 1024;
+ k = 256;
}
template<typename LhsScalar, typename RhsScalar, typename SizeType>
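For context, the surrounding function caps k using the L1 cache size divided by a kdiv factor computed earlier (not shown in this hunk), and then caps m so that an m x k block of LhsScalar stays within roughly a quarter of the L2 cache; the two assignments added at the end of the hunk override both caps with fixed values. A rough standalone sketch of the cap computation, with assumed cache sizes and a placeholder kdiv:

    #include <cstddef>
    #include <algorithm>

    // Simplified sketch of the blocking logic visible above (hypothetical
    // constants, not Eigen's defaults). k is capped via L1, then m is capped
    // so that an m x k lhs block fits in about a quarter of L2.
    void blocking_sketch(std::ptrdiff_t& k, std::ptrdiff_t& m)
    {
      const std::ptrdiff_t l1 = 32 * 1024;    // assumed L1 data cache size
      const std::ptrdiff_t l2 = 256 * 1024;   // assumed L2 cache size
      const std::ptrdiff_t kdiv = 64;         // stand-in for the real kdiv
      const std::ptrdiff_t lhs_size = sizeof(double);

      k = std::min<std::ptrdiff_t>(k, l1 / kdiv);
      std::ptrdiff_t _m = k > 0 ? l2 / (4 * lhs_size * k) : 0;
      if (_m < m)
        m = _m;   // the real code additionally rounds m down to a multiple of mr
    }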
@@ -328,6 +331,22 @@ protected:
conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
};
+template<typename Packet>
+struct DoublePacket
+{
+ Packet first;
+ Packet second;
+};
+
+template<typename Packet>
+DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
+{
+ DoublePacket<Packet> res;
+ res.first = padd(a.first, b.first);
+ res.second = padd(a.second,b.second);
+ return res;
+}
+
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
{
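The DoublePacket template added above holds, for the complex-times-complex path, the broadcast real parts and the broadcast imaginary parts of an rhs coefficient in two separate packets; the accumulator is kept in the same split form (the new padd overload adds the two halves independently) and only recombined by acc() at the end. A scalar-level sketch of the same split-accumulation idea, ignoring conjugation and vectorization:

    #include <complex>

    // Split-accumulator sketch (hypothetical helper, not the Eigen kernel):
    // accumulate a[k]*real(b[k]) and a[k]*imag(b[k]) separately, recombine once.
    std::complex<float> split_dot(const std::complex<float>* a,
                                  const std::complex<float>* b, int n)
    {
      std::complex<float> first(0, 0);   // plays the role of DoublePacket::first
      std::complex<float> second(0, 0);  // plays the role of DoublePacket::second
      for (int k = 0; k < n; ++k)
      {
        first  += a[k] * b[k].real();
        second += a[k] * b[k].imag();
      }
      // recombine: the "second" half contributes as i * second
      return first + std::complex<float>(-second.imag(), second.real());
    }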
@@ -357,20 +376,16 @@ public:
typedef typename packet_traits<RealScalar>::type RealPacket;
typedef typename packet_traits<Scalar>::type ScalarPacket;
- struct DoublePacket
- {
- RealPacket first;
- RealPacket second;
- };
+ typedef DoublePacket<RealPacket> DoublePacketType;
typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
- typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket;
+ typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
- typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket;
+ typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
- EIGEN_STRONG_INLINE void initAcc(DoublePacket& p)
+ EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
{
p.first = pset1<RealPacket>(RealScalar(0));
p.second = pset1<RealPacket>(RealScalar(0));
@@ -383,7 +398,7 @@ public:
}
// Vectorized path
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
{
dest.first = pset1<RealPacket>(real(*b));
dest.second = pset1<RealPacket>(imag(*b));
@@ -393,7 +408,7 @@ public:
void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3);
// Vectorized path
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacket& b0, DoublePacket& b1)
+ EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
{
// FIXME not sure that's the best way to implement it!
loadRhs(b+0, b0);
@@ -419,7 +434,7 @@ public:
dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
}
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
{
c.first = padd(pmul(a,b.first), c.first);
c.second = padd(pmul(a,b.second),c.second);
@@ -432,7 +447,7 @@ public:
EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
- EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const
+ EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
{
// assemble c
ResPacket tmp;
@@ -571,6 +586,14 @@ struct gebp_kernel
typedef typename Traits::RhsPacket RhsPacket;
typedef typename Traits::ResPacket ResPacket;
typedef typename Traits::AccPacket AccPacket;
+
+ typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
+ typedef typename SwappedTraits::ResScalar SResScalar;
+ typedef typename SwappedTraits::LhsPacket SLhsPacket;
+ typedef typename SwappedTraits::RhsPacket SRhsPacket;
+ typedef typename SwappedTraits::ResPacket SResPacket;
+ typedef typename SwappedTraits::AccPacket SAccPacket;
+
enum {
Vectorizable = Traits::Vectorizable,
@@ -591,6 +614,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
Index strideA, Index strideB, Index offsetA, Index offsetB)
{
Traits traits;
+ SwappedTraits straits;
if(strideA==-1) strideA = depth;
if(strideB==-1) strideB = depth;
@@ -599,7 +623,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
// Here we assume that mr==LhsProgress
const Index peeled_mc = (rows/mr)*mr;
- const Index peeled_kc = (depth/4)*4;
+ enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
+ const Index peeled_kc = depth & ~(pk-1);
+ const Index depth2 = depth & ~1;
// loops on each micro vertical panel of rhs (depth x nr)
// First pass using depth x 8 panels
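Both rounded counts above use a bit mask instead of a division: for a power-of-two pk, depth & ~(pk-1) equals (depth/pk)*pk, and depth & ~1 rounds down to an even value. A tiny check of that equivalence (hypothetical test code, assuming a non-negative depth):

    #include <cassert>

    // Mask-based rounding matches divide-and-multiply for power-of-two factors.
    void check_rounding(int depth)
    {
      assert(depth >= 0);
      const int pk = 8;                 // must be a power of two
      assert((depth & ~(pk - 1)) == (depth / pk) * pk);
      assert((depth & ~1) == (depth / 2) * 2);
    }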
@@ -634,14 +660,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
// uncomment for register prefetching
// LhsPacket A1;
// traits.loadLhs(blA, A0);
- for(Index k=0; k<peeled_kc; k+=4)
+ for(Index k=0; k<peeled_kc; k+=pk)
{
EIGEN_ASM_COMMENT("begin gegp micro kernel 1p x 8");
RhsPacket B_0, B1, B2, B3;
- // The following version is faster on some architures
- // but sometimes leads to segfaults because it might read one packet outside the bounds
- // To test it, you also need to uncomment the initialization of A0 above and the copy of A1 to A0 below.
+ // NOTE The following version is faster on some architectures
+ // but sometimes leads to segfaults because it might read one packet outside the bounds
+ // To test it, you also need to uncomment the initialization of A0 above and the copy of A1 to A0 below.
#if 0
#define EIGEN_GEBGP_ONESTEP8(K,L,M) \
traits.loadLhs(&blA[(K+1)*LhsProgress], L); \
@@ -674,9 +700,13 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
EIGEN_GEBGP_ONESTEP8(1,A0,A1);
EIGEN_GEBGP_ONESTEP8(2,A1,A0);
EIGEN_GEBGP_ONESTEP8(3,A0,A1);
+ EIGEN_GEBGP_ONESTEP8(4,A1,A0);
+ EIGEN_GEBGP_ONESTEP8(5,A0,A1);
+ EIGEN_GEBGP_ONESTEP8(6,A1,A0);
+ EIGEN_GEBGP_ONESTEP8(7,A0,A1);
- blB += 4*8*RhsProgress;
- blA += 4*mr;
+ blB += pk*8*RhsProgress;
+ blA += pk*mr;
}
// process remaining peeled loop
for(Index k=peeled_kc; k<depth; k++)
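The unrolled sequence above alternates two lhs registers (A0 and A1), and the disabled #if 0 variant goes one step further by loading the next lhs packet while the multiply-adds on the current one are still executing, which is why it may read one packet past the end. A scalar-level sketch of that register ping-pong, software-pipelining pattern (illustrative names, not the actual macros):

    // Ping-pong sketch (hypothetical helper): keep two registers and load the
    // next value while the multiply-add on the current one executes.
    double pingpong_dot(const double* a, const double* b, int n)
    {
      double acc = 0;
      double a0 = (n > 0) ? a[0] : 0, a1;
      for (int k = 0; k + 1 < n; k += 2)
      {
        a1 = a[k + 1];                      // load the next value early ...
        acc += a0 * b[k];                   // ... while using the current one
        a0 = (k + 2 < n) ? a[k + 2] : 0;    // guarded; the packet version may over-read here
        acc += a1 * b[k + 1];
      }
      if (n & 1)                            // odd tail
        acc += a[n - 1] * b[n - 1];
      return acc;
    }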
@@ -720,97 +750,170 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
pstoreu(r0+5*resStride, R5);
pstoreu(r0+6*resStride, R6);
pstoreu(r0+7*resStride, R0);
-
}
- for(Index i=peeled_mc; i<rows; i++)
+ // Deal with remaining rows of the lhs
+ // TODO we should vectorize if <= 8, and not strictly ==
+ if(SwappedTraits::LhsProgress == 8)
{
- const LhsScalar* blA = &blockA[i*strideA+offsetA];
- prefetch(&blA[0]);
-
- // FIXME directly use blockB ???
- const RhsScalar* blB = &blockB[j2*strideB+offsetB*8];
-
- if(nr == Traits::RhsPacketSize)
- {
- EIGEN_ASM_COMMENT("begin_vectorized_multiplication_of_last_rows");
-
- typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
- typedef typename SwappedTraits::ResScalar SResScalar;
- typedef typename SwappedTraits::LhsPacket SLhsPacket;
- typedef typename SwappedTraits::RhsPacket SRhsPacket;
- typedef typename SwappedTraits::ResPacket SResPacket;
- typedef typename SwappedTraits::AccPacket SAccPacket;
- SwappedTraits straits;
-
- SAccPacket C0;
- straits.initAcc(C0);
- for(Index k=0; k<depth; k++)
- {
- SLhsPacket A0;
- straits.loadLhsUnaligned(blB, A0);
- SRhsPacket B_0;
- straits.loadRhs(&blA[k], B_0);
- SRhsPacket T0;
- straits.madd(A0,B_0,C0,T0);
- blB += nr;
- }
- SResPacket R = pgather<SResScalar, SResPacket>(&res[j2*resStride + i], resStride);
- SResPacket alphav = pset1<SResPacket>(alpha);
- straits.acc(C0, alphav, R);
- pscatter(&res[j2*resStride + i], R, resStride);
-
- EIGEN_ASM_COMMENT("end_vectorized_multiplication_of_last_rows");
- }
- else
- {
- // gets a 1 x 8 res block as registers
- ResScalar C0(0), C1(0), C2(0), C3(0), C4(0), C5(0), C6(0), C7(0);
-
- for(Index k=0; k<depth; k++)
- {
- LhsScalar A0;
- RhsScalar B_0, B_1;
-
- A0 = blA[k];
-
- B_0 = blB[0];
- B_1 = blB[1];
- MADD(cj,A0,B_0,C0, B_0);
- MADD(cj,A0,B_1,C1, B_1);
-
- B_0 = blB[2];
- B_1 = blB[3];
- MADD(cj,A0,B_0,C2, B_0);
- MADD(cj,A0,B_1,C3, B_1);
-
- B_0 = blB[4];
- B_1 = blB[5];
- MADD(cj,A0,B_0,C4, B_0);
- MADD(cj,A0,B_1,C5, B_1);
+ // Apply the same logic but with reversed operands
+ // To improve pipelining, we process 2 rows at once and accumulate even and odd products along the k dimension
+ // into two different packets.
+ typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
+ typedef typename SwappedTraits::ResScalar SResScalar;
+ typedef typename SwappedTraits::LhsPacket SLhsPacket;
+ typedef typename SwappedTraits::RhsPacket SRhsPacket;
+ typedef typename SwappedTraits::ResPacket SResPacket;
+ typedef typename SwappedTraits::AccPacket SAccPacket;
+ SwappedTraits straits;
+
+ Index rows2 = (rows & ~1);
+ for(Index i=peeled_mc; i<rows2; i+=2)
+ {
+ const LhsScalar* blA = &blockA[i*strideA+offsetA];
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*8];
+
+ EIGEN_ASM_COMMENT("begin_vectorized_multiplication_of_last_rows 2x8");
+
+ SAccPacket C0,C1, C2,C3;
+ straits.initAcc(C0); // even
+ straits.initAcc(C1); // odd
+ straits.initAcc(C2); // even
+ straits.initAcc(C3); // odd
+ for(Index k=0; k<depth2; k+=2)
+ {
+ SLhsPacket A0, A1;
+ straits.loadLhsUnaligned(blB+0, A0);
+ straits.loadLhsUnaligned(blB+8, A1);
+ SRhsPacket B_0, B_1, B_2, B_3;
+ straits.loadRhs(blA+k+0, B_0);
+ straits.loadRhs(blA+k+1, B_1);
+ straits.loadRhs(blA+strideA+k+0, B_2);
+ straits.loadRhs(blA+strideA+k+1, B_3);
+ straits.madd(A0,B_0,C0,B_0);
+ straits.madd(A1,B_1,C1,B_1);
+ straits.madd(A0,B_2,C2,B_2);
+ straits.madd(A1,B_3,C3,B_3);
+ blB += 2*nr;
+ }
+ if(depth2<depth)
+ {
+ Index k = depth-1;
+ SLhsPacket A0;
+ straits.loadLhsUnaligned(blB+0, A0);
+ SRhsPacket B_0, B_2;
+ straits.loadRhs(blA+k+0, B_0);
+ straits.loadRhs(blA+strideA+k+0, B_2);
+ straits.madd(A0,B_0,C0,B_0);
+ straits.madd(A0,B_2,C2,B_2);
+ }
+ SResPacket R = pgather<SResScalar, SResPacket>(&res[j2*resStride + i], resStride);
+ SResPacket alphav = pset1<SResPacket>(alpha);
+ straits.acc(padd(C0,C1), alphav, R);
+ pscatter(&res[j2*resStride + i], R, resStride);
+
+ R = pgather<SResScalar, SResPacket>(&res[j2*resStride + i + 1], resStride);
+ straits.acc(padd(C2,C3), alphav, R);
+ pscatter(&res[j2*resStride + i + 1], R, resStride);
+
+ EIGEN_ASM_COMMENT("end_vectorized_multiplication_of_last_rows 8");
+ }
+ if(rows2!=rows)
+ {
+ Index i = rows-1;
+ const LhsScalar* blA = &blockA[i*strideA+offsetA];
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*8];
+
+ EIGEN_ASM_COMMENT("begin_vectorized_multiplication_of_last_rows 8");
+
+ SAccPacket C0,C1;
+ straits.initAcc(C0); // even
+ straits.initAcc(C1); // odd
+
+ for(Index k=0; k<depth2; k+=2)
+ {
+ SLhsPacket A0, A1;
+ straits.loadLhsUnaligned(blB+0, A0);
+ straits.loadLhsUnaligned(blB+8, A1);
+ SRhsPacket B_0, B_1;
+ straits.loadRhs(blA+k+0, B_0);
+ straits.loadRhs(blA+k+1, B_1);
+ straits.madd(A0,B_0,C0,B_0);
+ straits.madd(A1,B_1,C1,B_1);
+ blB += 2*8;
+ }
+ if(depth!=depth2)
+ {
+ Index k = depth-1;
+ SLhsPacket A0;
+ straits.loadLhsUnaligned(blB+0, A0);
+ SRhsPacket B_0;
+ straits.loadRhs(blA+k+0, B_0);
+ straits.madd(A0,B_0,C0,B_0);
+ }
+ SResPacket R = pgather<SResScalar, SResPacket>(&res[j2*resStride + i], resStride);
+ SResPacket alphav = pset1<SResPacket>(alpha);
+ straits.acc(padd(C0,C1), alphav, R);
+ pscatter(&res[j2*resStride + i], R, resStride);
+ }
+ }
+ else
+ {
+ // Pure scalar path
+ for(Index i=peeled_mc; i<rows; i++)
+ {
+ const LhsScalar* blA = &blockA[i*strideA+offsetA];
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*8];
- B_0 = blB[6];
- B_1 = blB[7];
- MADD(cj,A0,B_0,C6, B_0);
- MADD(cj,A0,B_1,C7, B_1);
-
- blB += 8;
- }
- res[(j2+0)*resStride + i] += alpha*C0;
- res[(j2+1)*resStride + i] += alpha*C1;
- res[(j2+2)*resStride + i] += alpha*C2;
- res[(j2+3)*resStride + i] += alpha*C3;
- res[(j2+4)*resStride + i] += alpha*C4;
- res[(j2+5)*resStride + i] += alpha*C5;
- res[(j2+6)*resStride + i] += alpha*C6;
- res[(j2+7)*resStride + i] += alpha*C7;
- }
- }
+ // gets a 1 x 8 res block as registers
+ ResScalar C0(0), C1(0), C2(0), C3(0), C4(0), C5(0), C6(0), C7(0);
+
+ for(Index k=0; k<depth; k++)
+ {
+ LhsScalar A0;
+ RhsScalar B_0, B_1;
+
+ A0 = blA[k];
+
+ B_0 = blB[0];
+ B_1 = blB[1];
+ MADD(cj,A0,B_0,C0, B_0);
+ MADD(cj,A0,B_1,C1, B_1);
+
+ B_0 = blB[2];
+ B_1 = blB[3];
+ MADD(cj,A0,B_0,C2, B_0);
+ MADD(cj,A0,B_1,C3, B_1);
+
+ B_0 = blB[4];
+ B_1 = blB[5];
+ MADD(cj,A0,B_0,C4, B_0);
+ MADD(cj,A0,B_1,C5, B_1);
+
+ B_0 = blB[6];
+ B_1 = blB[7];
+ MADD(cj,A0,B_0,C6, B_0);
+ MADD(cj,A0,B_1,C7, B_1);
+
+ blB += 8;
+ }
+ res[(j2+0)*resStride + i] += alpha*C0;
+ res[(j2+1)*resStride + i] += alpha*C1;
+ res[(j2+2)*resStride + i] += alpha*C2;
+ res[(j2+3)*resStride + i] += alpha*C3;
+ res[(j2+4)*resStride + i] += alpha*C4;
+ res[(j2+5)*resStride + i] += alpha*C5;
+ res[(j2+6)*resStride + i] += alpha*C6;
+ res[(j2+7)*resStride + i] += alpha*C7;
+ }
+ }
}
}
// Second pass using depth x 4 panels
// If nr==8, then we have at most one such panel
+ // TODO: with 16 registers, we could optimize this part to leverage more pipelining,
+ // for instance, by using a 2 packet * 4 kernel. Useful when the rhs is thin
if(nr>=4)
{
for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
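The rewritten tail handling above deals with the leftover lhs rows by swapping the roles of the operands (so that the packed rhs drives the vectorization) and, to shorten the dependency chain on the accumulator, splits each row's products over even and odd k into two accumulators that are only summed at the very end via padd(C0,C1). A scalar sketch of that even/odd accumulation, without the swapped packet traits:

    // Even/odd dual-accumulator sketch (hypothetical helper, not the Eigen
    // kernel): two independent running sums hide the multiply-add latency and
    // are merged once at the end.
    double evenodd_dot(const double* a, const double* b, int depth)
    {
      const int depth2 = depth & ~1;   // round down to an even count, as in the patch
      double c0 = 0, c1 = 0;           // even / odd accumulators
      for (int k = 0; k < depth2; k += 2)
      {
        c0 += a[k]     * b[k];         // even products
        c1 += a[k + 1] * b[k + 1];     // odd products
      }
      if (depth2 < depth)              // odd depth: one last product goes to c0
        c0 += a[depth - 1] * b[depth - 1];
      return c0 + c1;                  // the kernel does this with padd(C0, C1)
    }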
@@ -821,7 +924,6 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
for(Index i=0; i<peeled_mc; i+=mr)
{
const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
- // prefetch(&blA[0]);
// gets res block as register
AccPacket C0, C1, C2, C3;
@@ -835,15 +937,11 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB*4];
LhsPacket A0;
- // uncomment for register prefetching
- // LhsPacket A1;
- // traits.loadLhs(blA, A0);
- for(Index k=0; k<peeled_kc; k+=4)
+ for(Index k=0; k<peeled_kc; k+=pk)
{
EIGEN_ASM_COMMENT("begin gegp micro kernel 1p x 4");
RhsPacket B_0, B1;
-
#define EIGEN_GEBGP_ONESTEP4(K) \
traits.loadLhs(&blA[K*LhsProgress], A0); \
traits.broadcastRhs(&blB[0+4*K*RhsProgress], B_0, B1); \
@@ -852,16 +950,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
traits.broadcastRhs(&blB[2+4*K*RhsProgress], B_0, B1); \
traits.madd(A0, B_0,C2, B_0); \
traits.madd(A0, B1, C3, B1)
-
+
EIGEN_GEBGP_ONESTEP4(0);
EIGEN_GEBGP_ONESTEP4(1);
EIGEN_GEBGP_ONESTEP4(2);
EIGEN_GEBGP_ONESTEP4(3);
+ EIGEN_GEBGP_ONESTEP4(4);
+ EIGEN_GEBGP_ONESTEP4(5);
+ EIGEN_GEBGP_ONESTEP4(6);
+ EIGEN_GEBGP_ONESTEP4(7);
- blB += 4*4*RhsProgress;
- blA += 4*mr;
+ blB += pk*4*RhsProgress;
+ blA += pk*mr;
}
- // process remaining peeled loop
+ // process the remainder of the peeled loop
for(Index k=peeled_kc; k<depth; k++)
{
RhsPacket B_0, B1;
@@ -894,98 +996,86 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
for(Index i=peeled_mc; i<rows; i++)
{
const LhsScalar* blA = &blockA[i*strideA+offsetA];
- prefetch(&blA[0]);
-
- // FIXME directly use blockB ???
const RhsScalar* blB = &blockB[j2*strideB+offsetB*4];
-
- if(nr == Traits::RhsPacketSize)
- {
- EIGEN_ASM_COMMENT("begin_vectorized_multiplication_of_last_rows");
-
- typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
- typedef typename SwappedTraits::ResScalar SResScalar;
- typedef typename SwappedTraits::LhsPacket SLhsPacket;
- typedef typename SwappedTraits::RhsPacket SRhsPacket;
- typedef typename SwappedTraits::ResPacket SResPacket;
- typedef typename SwappedTraits::AccPacket SAccPacket;
- SwappedTraits straits;
-
- SAccPacket C0;
- straits.initAcc(C0);
- for(Index k=0; k<depth; k++)
+
+ // TODO vectorize in more cases
+ if(SwappedTraits::LhsProgress==4)
+ {
+ EIGEN_ASM_COMMENT("begin_vectorized_multiplication_of_last_rows 1x4");
+
+ SAccPacket C0;
+ straits.initAcc(C0);
+ for(Index k=0; k<depth; k++)
{
- SLhsPacket A0;
- straits.loadLhsUnaligned(blB, A0);
- SRhsPacket B_0;
- straits.loadRhs(&blA[k], B_0);
- SRhsPacket T0;
- straits.madd(A0,B_0,C0,T0);
- blB += nr;
- }
- SResPacket R = pgather<SResScalar, SResPacket>(&res[j2*resStride + i], resStride);
- SResPacket alphav = pset1<SResPacket>(alpha);
- straits.acc(C0, alphav, R);
- pscatter(&res[j2*resStride + i], R, resStride);
-
- EIGEN_ASM_COMMENT("end_vectorized_multiplication_of_last_rows");
- } else {
- // gets a 1 x 4 res block as registers
- ResScalar C0(0), C1(0), C2(0), C3(0);
-
- for(Index k=0; k<depth; k++)
+ SLhsPacket A0;
+ straits.loadLhsUnaligned(blB, A0);
+ SRhsPacket B_0;
+ straits.loadRhs(&blA[k], B_0);
+ SRhsPacket T0;
+ straits.madd(A0,B_0,C0,T0);
+ blB += 4;
+ }
+ SResPacket R = pgather<SResScalar, SResPacket>(&res[j2*resStride + i], resStride);
+ SResPacket alphav = pset1<SResPacket>(alpha);
+ straits.acc(C0, alphav, R);
+ pscatter(&res[j2*resStride + i], R, resStride);
+
+ EIGEN_ASM_COMMENT("end_vectorized_multiplication_of_last_rows 1x4");
+ }
+ else
+ {
+ // Pure scalar path
+ // gets a 1 x 4 res block as registers
+ ResScalar C0(0), C1(0), C2(0), C3(0);
+
+ for(Index k=0; k<depth; k++)
{
- LhsScalar A0;
- RhsScalar B_0, B_1;
-
- A0 = blA[k];
-
- B_0 = blB[0];
- B_1 = blB[1];
- MADD(cj,A0,B_0,C0, B_0);
- MADD(cj,A0,B_1,C1, B_1);
-
- B_0 = blB[2];
- B_1 = blB[3];
- MADD(cj,A0,B_0,C2, B_0);
- MADD(cj,A0,B_1,C3, B_1);
-
- blB += 4;
- }
- res[(j2+0)*resStride + i] += alpha*C0;
- res[(j2+1)*resStride + i] += alpha*C1;
- res[(j2+2)*resStride + i] += alpha*C2;
- res[(j2+3)*resStride + i] += alpha*C3;
- }
- }
+ LhsScalar A0;
+ RhsScalar B_0, B_1;
+
+ A0 = blA[k];
+
+ B_0 = blB[0];
+ B_1 = blB[1];
+ MADD(cj,A0,B_0,C0, B_0);
+ MADD(cj,A0,B_1,C1, B_1);
+
+ B_0 = blB[2];
+ B_1 = blB[3];
+ MADD(cj,A0,B_0,C2, B_0);
+ MADD(cj,A0,B_1,C3, B_1);
+
+ blB += 4;
+ }
+ res[(j2+0)*resStride + i] += alpha*C0;
+ res[(j2+1)*resStride + i] += alpha*C1;
+ res[(j2+2)*resStride + i] += alpha*C2;
+ res[(j2+3)*resStride + i] += alpha*C3;
+ }
+ }
}
}
// process remaining rhs/res columns one at a time
- // => do the same but with nr==1
for(Index j2=packet_cols4; j2<cols; j2++)
{
+ // vectorized path
for(Index i=0; i<peeled_mc; i+=mr)
{
- const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
- prefetch(&blA[0]);
-
- // TODO move the res loads to the stores
-
// get res block as registers
AccPacket C0;
traits.initAcc(C0);
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
for(Index k=0; k<depth; k++)
{
LhsPacket A0;
RhsPacket B_0;
- RhsPacket T0;
- traits.loadLhs(&blA[0*LhsProgress], A0);
- traits.loadRhs(&blB[0*RhsProgress], B_0);
- traits.madd(A0,B_0,C0,T0);
+ traits.loadLhs(blA, A0);
+ traits.loadRhs(blB, B_0);
+ traits.madd(A0,B_0,C0,B_0);
blB += RhsProgress;
blA += LhsProgress;
@@ -997,14 +1087,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
traits.acc(C0, alphav, R0);
pstoreu(r0, R0);
}
+ // pure scalar path
for(Index i=peeled_mc; i<rows; i++)
{
const LhsScalar* blA = &blockA[i*strideA+offsetA];
- prefetch(&blA[0]);
-
// gets a 1 x 1 res block as registers
ResScalar C0(0);
- // FIXME directly use blockB ??
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
for(Index k=0; k<depth; k++)
{