diff options
Diffstat (limited to 'Eigen/src/Core/CacheFriendlyProduct.h')
-rw-r--r-- | Eigen/src/Core/CacheFriendlyProduct.h | 99 |
1 files changed, 46 insertions, 53 deletions
diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h index a710d44d4..06b3f5876 100644 --- a/Eigen/src/Core/CacheFriendlyProduct.h +++ b/Eigen/src/Core/CacheFriendlyProduct.h @@ -367,7 +367,7 @@ static void ei_cache_friendly_product( * TODO: since rhs gets evaluated only once, no need to evaluate it */ template<typename Scalar, typename RhsType> -EIGEN_DONT_INLINE static void ei_cache_friendly_product( +EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector( int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, @@ -408,54 +408,34 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product( : alignmentStep==2 ? EvenAligned : FirstAligned; - // find how many column do we have to skip to be aligned with the result (if possible) + // find how many columns do we have to skip to be aligned with the result (if possible) int skipColumns=0; - for (; skipColumns<PacketSize; ++skipColumns) - { - if (alignedStart == alignmentStep*skipColumns) - break; - } + for (; skipColumns<PacketSize && alignedStart != alignmentStep*skipColumns; ++skipColumns) + {} if (skipColumns==PacketSize) + { + // nothing can be aligned, no need to skip any column alignmentPattern = NoneAligned; - skipColumns = std::min(skipColumns,rhs.size()); - if (alignmentPattern!=NoneAligned) - for (int i=0; i<skipColumns; i++) - { - Scalar tmp0 = rhs[i]; - Packet ptmp0 = ei_pset1(tmp0); - int iN0 = i*lhsStride; - // process first unaligned result's coeffs - for (int j=0; j<alignedStart; j++) - res[j] += tmp0 * lhs[j+iN0]; - // process aligned result's coeffs (we know the lhs columns are not aligned) - for (int j = alignedStart;j<alignedSize;j+=PacketSize) - ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&lhs[j+iN0])),ei_pload(&res[j]))); - // process remaining result's coeffs - for (int j=alignedSize; j<size; j++) - res[j] += tmp0 * lhs[j+iN0]; - } + skipColumns = 0; + } + else + { + skipColumns = std::min(skipColumns,rhs.size()); + // note 
that the skipped columns are processed later. + } int columnBound = (rhs.size()/columnsAtOnce)*columnsAtOnce; - for (int i=0; i<columnBound; i+=columnsAtOnce) + for (int i=skipColumns; i<columnBound; i+=columnsAtOnce) { - Scalar tmp0 = rhs[i]; - Packet ptmp0 = ei_pset1(tmp0); - Scalar tmp1 = rhs[i+1]; - Packet ptmp1 = ei_pset1(tmp1); - Scalar tmp2 = rhs[i+2]; - Packet ptmp2 = ei_pset1(tmp2); - Scalar tmp3 = rhs[i+3]; - Packet ptmp3 = ei_pset1(tmp3); - int iN0 = i*lhsStride; - int iN1 = (i+1)*lhsStride; - int iN2 = (i+2)*lhsStride; - int iN3 = (i+3)*lhsStride; + Scalar tmp0 = rhs[i], tmp1 = rhs[i+1], tmp2 = rhs[i+2], tmp3 = rhs[i+3]; + Packet ptmp0 = ei_pset1(tmp0), ptmp1 = ei_pset1(tmp1), ptmp2 = ei_pset1(tmp2), ptmp3 = ei_pset1(tmp3); + int iN0 = i*lhsStride, iN1 = (i+1)*lhsStride, iN2 = (i+2)*lhsStride, iN3 = (i+3)*lhsStride; // process initial unaligned coeffs for (int j=0; j<alignedStart; j++) res[j] += tmp0 * lhs[j+iN0] + tmp1 * lhs[j+iN1] + tmp2 * lhs[j+iN2] + tmp3 * lhs[j+iN3]; - if (alignedSize>0) + if (alignedSize>alignedStart) { switch(alignmentPattern) { @@ -475,10 +455,6 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product( _EIGEN_ACCUMULATE_PACKETS(,u,u,+PacketSize); if (peels>2) _EIGEN_ACCUMULATE_PACKETS(,u,u,+2*PacketSize); if (peels>3) _EIGEN_ACCUMULATE_PACKETS(,u,u,+3*PacketSize); - if (peels>4) _EIGEN_ACCUMULATE_PACKETS(,u,u,+4*PacketSize); - if (peels>5) _EIGEN_ACCUMULATE_PACKETS(,u,u,+5*PacketSize); - if (peels>6) _EIGEN_ACCUMULATE_PACKETS(,u,u,+6*PacketSize); - if (peels>7) _EIGEN_ACCUMULATE_PACKETS(,u,u,+7*PacketSize); } for (int j = peeledSize; j<alignedSize; j+=PacketSize) _EIGEN_ACCUMULATE_PACKETS(,u,u,); @@ -494,25 +470,42 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product( for (int j=alignedSize; j<size; j++) res[j] += tmp0 * lhs[j+iN0] + tmp1 * lhs[j+iN1] + tmp2 * lhs[j+iN2] + tmp3 * lhs[j+iN3]; } - for (int i=columnBound; i<rhs.size(); i++) + + // process remaining first and last columns (at most columnsAtOnce-1) + int end 
= rhs.size(); + int start = columnBound; + do { - Scalar tmp0 = rhs[i]; - Packet ptmp0 = ei_pset1(tmp0); - int iN0 = i*lhsStride; - if (alignedSize>0) + for (int i=columnBound; i<end; i++) { - bool aligned0 = (iN0 % PacketSize) == 0; - if (aligned0) + Scalar tmp0 = rhs[i]; + Packet ptmp0 = ei_pset1(tmp0); + int iN0 = i*lhsStride; + // process first unaligned result's coeffs + for (int j=0; j<alignedStart; j++) + res[j] += tmp0 * lhs[j+iN0]; + + // process aligned result's coeffs + if ((iN0 % PacketSize) == 0) for (int j = 0;j<alignedSize;j+=PacketSize) ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_pload(&lhs[j+iN0])),ei_pload(&res[j]))); else for (int j = 0;j<alignedSize;j+=PacketSize) ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&lhs[j+iN0])),ei_pload(&res[j]))); + + // process remaining scalars + for (int j=alignedSize; j<size; j++) + res[j] += tmp0 * lhs[j+iN0]; } - // process remaining scalars - for (int j=alignedSize; j<size; j++) - res[j] += tmp0 * lhs[j+iN0]; - } + if (skipColumns) + { + start = 0; + end = skipColumns; + skipColumns = 0; + } + else + break; + } while(true); asm("#end matrix_vector_product"); #undef _EIGEN_ACCUMULATE_PACKETS |