aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/CacheFriendlyProduct.h
diff options
context:
space:
mode:
Diffstat (limited to 'Eigen/src/Core/CacheFriendlyProduct.h')
-rw-r--r--Eigen/src/Core/CacheFriendlyProduct.h99
1 files changed, 46 insertions, 53 deletions
diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h
index a710d44d4..06b3f5876 100644
--- a/Eigen/src/Core/CacheFriendlyProduct.h
+++ b/Eigen/src/Core/CacheFriendlyProduct.h
@@ -367,7 +367,7 @@ static void ei_cache_friendly_product(
* TODO: since rhs gets evaluated only once, no need to evaluate it
*/
template<typename Scalar, typename RhsType>
-EIGEN_DONT_INLINE static void ei_cache_friendly_product(
+EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
int size,
const Scalar* lhs, int lhsStride,
const RhsType& rhs,
@@ -408,54 +408,34 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product(
: alignmentStep==2 ? EvenAligned
: FirstAligned;
- // find how many column do we have to skip to be aligned with the result (if possible)
+ // find how many columns do we have to skip to be aligned with the result (if possible)
int skipColumns=0;
- for (; skipColumns<PacketSize; ++skipColumns)
- {
- if (alignedStart == alignmentStep*skipColumns)
- break;
- }
+ for (; skipColumns<PacketSize && alignedStart != alignmentStep*skipColumns; ++skipColumns)
+ {}
if (skipColumns==PacketSize)
+ {
+ // nothing can be aligned, no need to skip any column
alignmentPattern = NoneAligned;
- skipColumns = std::min(skipColumns,rhs.size());
- if (alignmentPattern!=NoneAligned)
- for (int i=0; i<skipColumns; i++)
- {
- Scalar tmp0 = rhs[i];
- Packet ptmp0 = ei_pset1(tmp0);
- int iN0 = i*lhsStride;
- // process first unaligned result's coeffs
- for (int j=0; j<alignedStart; j++)
- res[j] += tmp0 * lhs[j+iN0];
- // process aligned result's coeffs (we know the lhs columns are not aligned)
- for (int j = alignedStart;j<alignedSize;j+=PacketSize)
- ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&lhs[j+iN0])),ei_pload(&res[j])));
- // process remaining result's coeffs
- for (int j=alignedSize; j<size; j++)
- res[j] += tmp0 * lhs[j+iN0];
- }
+ skipColumns = 0;
+ }
+ else
+ {
+ skipColumns = std::min(skipColumns,rhs.size());
+ // note that the skiped columns are processed later.
+ }
int columnBound = (rhs.size()/columnsAtOnce)*columnsAtOnce;
- for (int i=0; i<columnBound; i+=columnsAtOnce)
+ for (int i=skipColumns; i<columnBound; i+=columnsAtOnce)
{
- Scalar tmp0 = rhs[i];
- Packet ptmp0 = ei_pset1(tmp0);
- Scalar tmp1 = rhs[i+1];
- Packet ptmp1 = ei_pset1(tmp1);
- Scalar tmp2 = rhs[i+2];
- Packet ptmp2 = ei_pset1(tmp2);
- Scalar tmp3 = rhs[i+3];
- Packet ptmp3 = ei_pset1(tmp3);
- int iN0 = i*lhsStride;
- int iN1 = (i+1)*lhsStride;
- int iN2 = (i+2)*lhsStride;
- int iN3 = (i+3)*lhsStride;
+ Scalar tmp0 = rhs[i], tmp1 = rhs[i+1], tmp2 = rhs[i+2], tmp3 = rhs[i+3];
+ Packet ptmp0 = ei_pset1(tmp0), ptmp1 = ei_pset1(tmp1), ptmp2 = ei_pset1(tmp2), ptmp3 = ei_pset1(tmp3);
+ int iN0 = i*lhsStride, iN1 = (i+1)*lhsStride, iN2 = (i+2)*lhsStride, iN3 = (i+3)*lhsStride;
// process initial unaligned coeffs
for (int j=0; j<alignedStart; j++)
res[j] += tmp0 * lhs[j+iN0] + tmp1 * lhs[j+iN1] + tmp2 * lhs[j+iN2] + tmp3 * lhs[j+iN3];
- if (alignedSize>0)
+ if (alignedSize>alignedStart)
{
switch(alignmentPattern)
{
@@ -475,10 +455,6 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product(
_EIGEN_ACCUMULATE_PACKETS(,u,u,+PacketSize);
if (peels>2) _EIGEN_ACCUMULATE_PACKETS(,u,u,+2*PacketSize);
if (peels>3) _EIGEN_ACCUMULATE_PACKETS(,u,u,+3*PacketSize);
- if (peels>4) _EIGEN_ACCUMULATE_PACKETS(,u,u,+4*PacketSize);
- if (peels>5) _EIGEN_ACCUMULATE_PACKETS(,u,u,+5*PacketSize);
- if (peels>6) _EIGEN_ACCUMULATE_PACKETS(,u,u,+6*PacketSize);
- if (peels>7) _EIGEN_ACCUMULATE_PACKETS(,u,u,+7*PacketSize);
}
for (int j = peeledSize; j<alignedSize; j+=PacketSize)
_EIGEN_ACCUMULATE_PACKETS(,u,u,);
@@ -494,25 +470,42 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product(
for (int j=alignedSize; j<size; j++)
res[j] += tmp0 * lhs[j+iN0] + tmp1 * lhs[j+iN1] + tmp2 * lhs[j+iN2] + tmp3 * lhs[j+iN3];
}
- for (int i=columnBound; i<rhs.size(); i++)
+
+ // process remaining first and last columns (at most columnsAtOnce-1)
+ int end = rhs.size();
+ int start = columnBound;
+ do
{
- Scalar tmp0 = rhs[i];
- Packet ptmp0 = ei_pset1(tmp0);
- int iN0 = i*lhsStride;
- if (alignedSize>0)
+ for (int i=columnBound; i<end; i++)
{
- bool aligned0 = (iN0 % PacketSize) == 0;
- if (aligned0)
+ Scalar tmp0 = rhs[i];
+ Packet ptmp0 = ei_pset1(tmp0);
+ int iN0 = i*lhsStride;
+ // process first unaligned result's coeffs
+ for (int j=0; j<alignedStart; j++)
+ res[j] += tmp0 * lhs[j+iN0];
+
+ // process aligned result's coeffs
+ if ((iN0 % PacketSize) == 0)
for (int j = 0;j<alignedSize;j+=PacketSize)
ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_pload(&lhs[j+iN0])),ei_pload(&res[j])));
else
for (int j = 0;j<alignedSize;j+=PacketSize)
ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&lhs[j+iN0])),ei_pload(&res[j])));
+
+ // process remaining scalars
+ for (int j=alignedSize; j<size; j++)
+ res[j] += tmp0 * lhs[j+iN0];
}
- // process remaining scalars
- for (int j=alignedSize; j<size; j++)
- res[j] += tmp0 * lhs[j+iN0];
- }
+ if (skipColumns)
+ {
+ start = 0;
+ end = skipColumns;
+ skipColumns = 0;
+ }
+ else
+ break;
+ } while(true);
asm("#end matrix_vector_product");
#undef _EIGEN_ACCUMULATE_PACKETS