path: root/Eigen/src/Core/products/GeneralMatrixVector.h
author    Gael Guennebaud <g.gael@free.fr>  2016-12-03 21:14:14 +0100
committer Gael Guennebaud <g.gael@free.fr>  2016-12-03 21:14:14 +0100
commit    6a5fe860985311cc275c4bb7000e0d261822c756 (patch)
tree      27f6c9826b284cf7dbadc0030a892ed53b6ca385 /Eigen/src/Core/products/GeneralMatrixVector.h
parent    2bfece5cd1b13c14471177d13f24b46a28638d27 (diff)
Complete rewrite of the column-major matrix * vector product to deliver higher performance on modern CPUs.
The previous code had been optimized for the Intel Core2, for which unaligned loads/stores were prohibitively expensive. This new version exhibits much higher instruction independence (better pipelining) and explicitly leverages FMA. According to my benchmarks, on Haswell this new kernel is always faster than the previous one, and sometimes even twice as fast. Even higher performance could be achieved with a better blocking-size heuristic and, perhaps, with explicit prefetching. We should also check the triangular product/solve to optimally exploit this new kernel (working on vertical panels of 4 columns is probably not optimal anymore).
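For reference, the structure of the new kernel can be illustrated with a small, self-contained sketch. This is a scalar simplification (no packets, no conjugation; the function name gemv_col_sketch and the fixed blocking constant are mine, not part of Eigen); it only shows the idea of keeping several independent accumulators per column panel so that the FMA chains do not serialize, and of traversing the matrix by vertical panels of block_cols columns.

#include <algorithm>
#include <cstddef>

// Scalar sketch of the new column-major GEMV kernel: res += alpha * A * x.
// A is column-major with leading dimension lda. Illustrative helper, not Eigen code.
void gemv_col_sketch(std::ptrdiff_t rows, std::ptrdiff_t cols,
                     const float* A, std::ptrdiff_t lda,
                     const float* x, float alpha, float* res)
{
  const std::ptrdiff_t block_cols = std::min<std::ptrdiff_t>(cols, 16); // crude blocking heuristic
  for (std::ptrdiff_t j2 = 0; j2 < cols; j2 += block_cols)
  {
    const std::ptrdiff_t jend = std::min(j2 + block_cols, cols);
    std::ptrdiff_t i = 0;
    // Main loop: 4 independent accumulators per row chunk (the real kernel uses up to
    // 8 packets, i.e. 8*PacketSize rows, with pcj.pmadd/FMA instead of scalar ops).
    for (; i + 4 <= rows; i += 4)
    {
      float c0 = 0, c1 = 0, c2 = 0, c3 = 0;
      for (std::ptrdiff_t j = j2; j < jend; ++j)
      {
        const float b0 = x[j];
        c0 += A[i + 0 + j * lda] * b0;
        c1 += A[i + 1 + j * lda] * b0;
        c2 += A[i + 2 + j * lda] * b0;
        c3 += A[i + 3 + j * lda] * b0;
      }
      // Scale by alpha only once per row chunk, when accumulating into the result.
      res[i + 0] += alpha * c0;
      res[i + 1] += alpha * c1;
      res[i + 2] += alpha * c2;
      res[i + 3] += alpha * c3;
    }
    // Remaining rows, one at a time.
    for (; i < rows; ++i)
    {
      float c0 = 0;
      for (std::ptrdiff_t j = j2; j < jend; ++j)
        c0 += A[i + j * lda] * x[j];
      res[i] += alpha * c0;
    }
  }
}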
Diffstat (limited to 'Eigen/src/Core/products/GeneralMatrixVector.h')
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixVector.h  |  339
1 file changed, 109 insertions, 230 deletions
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index 3c1a7fc40..098b7c4e1 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -1,7 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -15,10 +15,8 @@ namespace Eigen {
namespace internal {
/* Optimized col-major matrix * vector product:
- * This algorithm processes 4 columns at onces that allows to both reduce
- * the number of load/stores of the result by a factor 4 and to reduce
- * the instruction dependency. Moreover, we know that all bands have the
- * same alignment pattern.
+ * This algorithm processes the matrix in vertical panels,
+ * which are then processed horizontally in chunks of 8*PacketSize x 1 vertical segments.
*
* Mixing type logic: C += alpha * A * B
* | A | B |alpha| comments
@@ -27,33 +25,7 @@ namespace internal {
* |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
* |cplx |real |real | optimal case, vectorization possible via real-cplx mul
*
- * Accesses to the matrix coefficients follow the following logic:
- *
- * - if all columns have the same alignment then
- * - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
- * - otherwise perform unaligned loads only (-> NoneAligned case)
- * - otherwise
- * - if even columns have the same alignment then
- * // odd columns are guaranteed to have the same alignment too
- * - if even or odd columns have the same alignment as the result, then
- * // for a register size of 2 scalars, this is guarantee to be the case (e.g., SSE with double)
- * - perform half aligned and half unaligned loads (-> EvenAligned case)
- * - otherwise perform unaligned loads only (-> NoneAligned case)
- * - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
- * - one over 4 consecutive columns is guaranteed to be aligned with the result vector,
- * perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case)
- * // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
- * - otherwise,
- * // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
- * // we currently fall back to the NoneAligned case
- *
* The same reasoning applies to the transposed case.
- *
- * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
- * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
- * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow
- * compared to unaligned loads on a 4 byte boundary.
- *
*/
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
@@ -87,238 +59,145 @@ EIGEN_DONT_INLINE static void run(
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
Index rows, Index cols,
- const LhsMapper& lhs,
+ const LhsMapper& alhs,
const RhsMapper& rhs,
ResScalar* res, Index resIncr,
RhsScalar alpha)
{
EIGEN_UNUSED_VARIABLE(resIncr);
eigen_internal_assert(resIncr==1);
- #ifdef _EIGEN_ACCUMULATE_PACKETS
- #error _EIGEN_ACCUMULATE_PACKETS has already been defined
- #endif
- #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
- pstore(&res[j], \
- padd(pload<ResPacket>(&res[j]), \
- padd( \
- padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j), ptmp0), \
- pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j), ptmp1)), \
- padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j), ptmp2), \
- pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j), ptmp3)) )))
- typedef typename LhsMapper::VectorMapper LhsScalars;
+ // The following copy tells the compiler that lhs's attributes are not modified outside this function
+ // This helps GCC to generate proper code.
+ LhsMapper lhs(alhs);
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
- if(ConjugateRhs)
- alpha = numext::conj(alpha);
-
- enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
- const Index columnsAtOnce = 4;
- const Index peels = 2;
- const Index LhsPacketAlignedMask = LhsPacketSize-1;
- const Index ResPacketAlignedMask = ResPacketSize-1;
-// const Index PeelAlignedMask = ResPacketSize*peels-1;
- const Index size = rows;
-
const Index lhsStride = lhs.stride();
+ // TODO: for padded aligned inputs, we could enable aligned reads
+ enum { LhsAlignment = Unaligned };
- // How many coeffs of the result do we have to skip to be aligned.
- // Here we assume data are at least aligned on the base scalar type.
- Index alignedStart = internal::first_default_aligned(res,size);
- Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
- const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
-
- const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
- Index alignmentPattern = alignmentStep==0 ? AllAligned
- : alignmentStep==(LhsPacketSize/2) ? EvenAligned
- : FirstAligned;
+ const Index n8 = rows-8*ResPacketSize+1;
+ const Index n4 = rows-4*ResPacketSize+1;
+ const Index n3 = rows-3*ResPacketSize+1;
+ const Index n2 = rows-2*ResPacketSize+1;
+ const Index n1 = rows-1*ResPacketSize+1;
- // we cannot assume the first element is aligned because of sub-matrices
- const Index lhsAlignmentOffset = lhs.firstAligned(size);
+ // TODO: improve the following heuristic:
+ const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4);
+ ResPacket palpha = pset1<ResPacket>(alpha);
- // find how many columns do we have to skip to be aligned with the result (if possible)
- Index skipColumns = 0;
- // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
- if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) )
+ for(Index j2=0; j2<cols; j2+=block_cols)
{
- alignedSize = 0;
- alignedStart = 0;
- alignmentPattern = NoneAligned;
- }
- else if(LhsPacketSize > 4)
- {
- // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
- // Currently, it seems to be better to perform unaligned loads anyway
- alignmentPattern = NoneAligned;
- }
- else if (LhsPacketSize>1)
- {
- // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
-
- while (skipColumns<LhsPacketSize &&
- alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
- ++skipColumns;
- if (skipColumns==LhsPacketSize)
+ Index jend = numext::mini(j2+block_cols,cols);
+ Index i=0;
+ for(; i<n8; i+=ResPacketSize*8)
{
- // nothing can be aligned, no need to skip any column
- alignmentPattern = NoneAligned;
- skipColumns = 0;
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+ c1 = pset1<ResPacket>(ResScalar(0)),
+ c2 = pset1<ResPacket>(ResScalar(0)),
+ c3 = pset1<ResPacket>(ResScalar(0)),
+ c4 = pset1<ResPacket>(ResScalar(0)),
+ c5 = pset1<ResPacket>(ResScalar(0)),
+ c6 = pset1<ResPacket>(ResScalar(0)),
+ c7 = pset1<ResPacket>(ResScalar(0));
+
+ for(Index j=j2; j<jend; j+=1)
+ {
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+ c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+ c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+ c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
+ c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*4,j),b0,c4);
+ c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*5,j),b0,c5);
+ c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*6,j),b0,c6);
+ c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*7,j),b0,c7);
+ }
+ pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+ pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+ pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
+ pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
+ pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu<ResPacket>(res+i+ResPacketSize*4)));
+ pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu<ResPacket>(res+i+ResPacketSize*5)));
+ pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu<ResPacket>(res+i+ResPacketSize*6)));
+ pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu<ResPacket>(res+i+ResPacketSize*7)));
}
- else
+ if(i<n4)
{
- skipColumns = (std::min)(skipColumns,cols);
- // note that the skiped columns are processed later.
- }
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+ c1 = pset1<ResPacket>(ResScalar(0)),
+ c2 = pset1<ResPacket>(ResScalar(0)),
+ c3 = pset1<ResPacket>(ResScalar(0));
- /* eigen_internal_assert( (alignmentPattern==NoneAligned)
- || (skipColumns + columnsAtOnce >= cols)
- || LhsPacketSize > size
- || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
- }
- else if(Vectorizable)
- {
- alignedStart = 0;
- alignedSize = size;
- alignmentPattern = AllAligned;
- }
-
- const Index offset1 = (FirstAligned && alignmentStep==1)?3:1;
- const Index offset3 = (FirstAligned && alignmentStep==1)?1:3;
-
- Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
- for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
- {
- RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
- ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
- ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
- ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));
-
- // this helps a lot generating better binary code
- const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1),
- lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3);
-
- if (Vectorizable)
- {
- /* explicit vectorization */
- // process initial unaligned coeffs
- for (Index j=0; j<alignedStart; ++j)
+ for(Index j=j2; j<jend; j+=1)
{
- res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
- res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
- res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
- res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+ c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+ c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+ c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
}
+ pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+ pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+ pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
+ pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
- if (alignedSize>alignedStart)
- {
- switch(alignmentPattern)
- {
- case AllAligned:
- for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
- _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
- break;
- case EvenAligned:
- for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
- _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
- break;
- case FirstAligned:
- {
- Index j = alignedStart;
- if(peels>1)
- {
- LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
- ResPacket T0, T1;
-
- A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
- A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
- A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
-
- for (; j<peeledSize; j+=peels*ResPacketSize)
- {
- A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
- A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
- A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
-
- A00 = lhs0.template load<LhsPacket, Aligned>(j);
- A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
- T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
- T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
+ i+=ResPacketSize*4;
+ }
+ if(i<n3)
+ {
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+ c1 = pset1<ResPacket>(ResScalar(0)),
+ c2 = pset1<ResPacket>(ResScalar(0));
- T0 = pcj.pmadd(A01, ptmp1, T0);
- A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
- T0 = pcj.pmadd(A02, ptmp2, T0);
- A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
- T0 = pcj.pmadd(A03, ptmp3, T0);
- pstore(&res[j],T0);
- A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
- T1 = pcj.pmadd(A11, ptmp1, T1);
- T1 = pcj.pmadd(A12, ptmp2, T1);
- T1 = pcj.pmadd(A13, ptmp3, T1);
- pstore(&res[j+ResPacketSize],T1);
- }
- }
- for (; j<alignedSize; j+=ResPacketSize)
- _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
- break;
- }
- default:
- for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
- _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
- break;
- }
+ for(Index j=j2; j<jend; j+=1)
+ {
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+ c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+ c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
}
- } // end explicit vectorization
+ pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+ pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+ pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
- /* process remaining coeffs (or all if there is no explicit vectorization) */
- for (Index j=alignedSize; j<size; ++j)
- {
- res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
- res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
- res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
- res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
+ i+=ResPacketSize*3;
}
- }
-
- // process remaining first and last columns (at most columnsAtOnce-1)
- Index end = cols;
- Index start = columnBound;
- do
- {
- for (Index k=start; k<end; ++k)
+ if(i<n2)
{
- RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
- const LhsScalars lhs0 = lhs.getVectorMapper(0, k);
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+ c1 = pset1<ResPacket>(ResScalar(0));
- if (Vectorizable)
+ for(Index j=j2; j<jend; j+=1)
{
- /* explicit vectorization */
- // process first unaligned result's coeffs
- for (Index j=0; j<alignedStart; ++j)
- res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
- // process aligned result's coeffs
- if (lhs0.template aligned<LhsPacket>(alignedStart))
- for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
- pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
- else
- for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
- pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+ c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
}
-
- // process remaining scalars (or all if no explicit vectorization)
- for (Index i=alignedSize; i<size; ++i)
- res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
+ pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+ pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+ i+=ResPacketSize*2;
}
- if (skipColumns)
+ if(i<n1)
{
- start = 0;
- end = skipColumns;
- skipColumns = 0;
+ ResPacket c0 = pset1<ResPacket>(ResScalar(0));
+ for(Index j=j2; j<jend; j+=1)
+ {
+ RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+ c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
+ }
+ pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+ i+=ResPacketSize;
}
- else
- break;
- } while(Vectorizable);
- #undef _EIGEN_ACCUMULATE_PACKETS
+ for(;i<rows;++i)
+ {
+ ResScalar c0(0);
+ for(Index j=j2; j<jend; j+=1)
+ c0 += cj.pmul(lhs(i,j), rhs(j,0));
+ res[i] += alpha*c0;
+ }
+ }
}
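For context, this is the kernel reached by an ordinary column-major matrix * vector product evaluated directly into the destination. A minimal usage sketch (assuming an Eigen checkout containing this commit; the sizes and alpha value are arbitrary):

#include <Eigen/Dense>

int main()
{
  // Column-major (Eigen's default) matrix times vector: y += alpha * A * x.
  // noalias() lets the product be evaluated directly into y, i.e. the path
  // handled by general_matrix_vector_product<...,ColMajor,...>::run.
  Eigen::MatrixXf A = Eigen::MatrixXf::Random(512, 256);
  Eigen::VectorXf x = Eigen::VectorXf::Random(256);
  Eigen::VectorXf y = Eigen::VectorXf::Zero(512);
  float alpha = 2.0f;
  y.noalias() += alpha * A * x;
  return 0;
}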
/* Optimized row-major matrix * vector product: