author    Gael Guennebaud <g.gael@free.fr>  2008-07-12 12:12:02 +0000
committer Gael Guennebaud <g.gael@free.fr>  2008-07-12 12:12:02 +0000
commit    b7bd1b3446aafe2fa81b9cd3218d9fb902ba2bbc (patch)
tree      d08fe60c2a32bd9e01eefa94a27b983ed3d4d1ee /Eigen/src
parent    6f71ef8277405d268032f7c3bcaf316c7422c133 (diff)
Add a *very efficient* evaluation path for both col-major matrix * vector
and vector * row-major products. Currently, it is enabled only if the matrix
has the DirectAccessBit flag and the product is "large enough". Added the
respective unit tests in test/product.cpp.
Diffstat (limited to 'Eigen/src')
-rw-r--r--  Eigen/src/Core/Assign.h                |   1
-rw-r--r--  Eigen/src/Core/Block.h                 |   1
-rw-r--r--  Eigen/src/Core/CacheFriendlyProduct.h  | 165
-rw-r--r--  Eigen/src/Core/Product.h               |  97
4 files changed, 250 insertions, 14 deletions
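
For context, a minimal usage sketch of the kind of expression the new path
targets. This is not part of the patch; it is written against the current
Eigen API for readability (naming in the 2008 tree differs), and the size
used for the "large enough" condition is an assumption.

    #include <Eigen/Core>
    using namespace Eigen;

    int main()
    {
      const int n = 512;      // assumed to be well above the "large enough" threshold
      MatrixXf m(n, n);       // column-major by default; plain storage,
      VectorXf v(n);          // so both operands expose DirectAccessBit
      m.setZero();
      v.setZero();

      VectorXf res = m * v;   // eligible for the new col-major matrix * vector kernel
      // The symmetric case, (row) vector * row-major matrix, is handled by the
      // second selector specialization added to Product.h below.
      return 0;
    }
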
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index 828b49725..ba53e299d 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -249,7 +249,6 @@ struct ei_assign_impl<Derived1, Derived2, InnerVectorization, NoUnrolling>
{
static void run(Derived1 &dst, const Derived2 &src)
{
- const bool rowMajor = int(Derived1::Flags)&RowMajorBit;
const int innerSize = dst.innerSize();
const int outerSize = dst.outerSize();
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 245357511..ae4e83c9e 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -155,7 +155,6 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block
return m_matrix.const_cast_derived()
.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
-
}
inline const Scalar coeff(int index) const
diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h
index 0da84eeac..a710d44d4 100644
--- a/Eigen/src/Core/CacheFriendlyProduct.h
+++ b/Eigen/src/Core/CacheFriendlyProduct.h
@@ -25,6 +25,8 @@
#ifndef EIGEN_CACHE_FRIENDLY_PRODUCT_H
#define EIGEN_CACHE_FRIENDLY_PRODUCT_H
+#ifndef EIGEN_EXTERN_INSTANTIATIONS
+
template<typename Scalar>
static void ei_cache_friendly_product(
int _rows, int _cols, int depth,
@@ -77,8 +79,6 @@ static void ei_cache_friendly_product(
MaxL2BlockSize = EIGEN_TUNE_FOR_L2_CACHE_SIZE / sizeof(Scalar)
};
-
- //const bool rhsIsAligned = (PacketSize==1) || (((rhsStride%PacketSize) == 0) && (size_t(rhs)%16==0));
const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0));
const int remainingSize = depth % PacketSize;
@@ -357,4 +357,165 @@ static void ei_cache_friendly_product(
free(block);
}
+#endif // EIGEN_EXTERN_INSTANTIATIONS
+
+/* Optimized col-major matrix * vector product:
+ * This algorithm processes 4 columns at once, which allows us to both reduce
+ * the number of loads/stores of the result by a factor of 4 and to reduce
+ * the instruction dependency. Moreover, we know that all bands have the
+ * same alignment pattern.
+ * TODO: since rhs gets evaluated only once, no need to evaluate it
+ */
+template<typename Scalar, typename RhsType>
+EIGEN_DONT_INLINE static void ei_cache_friendly_product(
+ int size,
+ const Scalar* lhs, int lhsStride,
+ const RhsType& rhs,
+ Scalar* res)
+{
+ #ifdef _EIGEN_ACCUMULATE_PACKETS
+ #error _EIGEN_ACCUMULATE_PACKETS has already been defined
+ #endif
+
+ #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2,OFFSET) \
+ ei_pstore(&res[j OFFSET], \
+ ei_padd(ei_pload(&res[j OFFSET]), \
+ ei_padd( \
+ ei_padd(ei_pmul(ptmp0,ei_pload ## A0(&lhs[j OFFSET +iN0])),ei_pmul(ptmp1,ei_pload ## A13(&lhs[j OFFSET +iN1]))), \
+ ei_padd(ei_pmul(ptmp2,ei_pload ## A2(&lhs[j OFFSET +iN2])),ei_pmul(ptmp3,ei_pload ## A13(&lhs[j OFFSET +iN3]))) )))
+
+ asm("#begin matrix_vector_product");
+ typedef typename ei_packet_traits<Scalar>::type Packet;
+ const int PacketSize = sizeof(Packet)/sizeof(Scalar);
+
+ enum { AllAligned, EvenAligned, FirstAligned, NoneAligned };
+ const int columnsAtOnce = 4;
+ const int peels = 2;
+ const int PacketAlignedMask = PacketSize-1;
+ const int PeelAlignedMask = PacketSize*peels-1;
+ const bool Vectorized = sizeof(Packet) != sizeof(Scalar);
+
+ // How many coeffs of the result do we have to skip to be aligned.
+ // Here we assume data are at least aligned on the base scalar type, which is mandatory anyway.
+ const int alignedStart = Vectorized
+ ? std::min<int>( (PacketSize - ((size_t(res)/sizeof(Scalar)) & PacketAlignedMask)) & PacketAlignedMask, size)
+ : 0;
+ const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask);
+ const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : 0;
+
+ const int alignmentStep = lhsStride % PacketSize;
+ int alignmentPattern = alignmentStep==0 ? AllAligned
+ : alignmentStep==2 ? EvenAligned
+ : FirstAligned;
+
+ // find how many columns we have to skip to be aligned with the result (if possible)
+ int skipColumns=0;
+ for (; skipColumns<PacketSize; ++skipColumns)
+ {
+ if (alignedStart == alignmentStep*skipColumns)
+ break;
+ }
+ if (skipColumns==PacketSize)
+ alignmentPattern = NoneAligned;
+ skipColumns = std::min(skipColumns,rhs.size());
+ if (alignmentPattern!=NoneAligned)
+ for (int i=0; i<skipColumns; i++)
+ {
+ Scalar tmp0 = rhs[i];
+ Packet ptmp0 = ei_pset1(tmp0);
+ int iN0 = i*lhsStride;
+ // process first unaligned result's coeffs
+ for (int j=0; j<alignedStart; j++)
+ res[j] += tmp0 * lhs[j+iN0];
+ // process aligned result's coeffs (we know the lhs columns are not aligned)
+ for (int j = alignedStart;j<alignedSize;j+=PacketSize)
+ ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&lhs[j+iN0])),ei_pload(&res[j])));
+ // process remaining result's coeffs
+ for (int j=alignedSize; j<size; j++)
+ res[j] += tmp0 * lhs[j+iN0];
+ }
+
+ int columnBound = (rhs.size()/columnsAtOnce)*columnsAtOnce;
+ for (int i=0; i<columnBound; i+=columnsAtOnce)
+ {
+ Scalar tmp0 = rhs[i];
+ Packet ptmp0 = ei_pset1(tmp0);
+ Scalar tmp1 = rhs[i+1];
+ Packet ptmp1 = ei_pset1(tmp1);
+ Scalar tmp2 = rhs[i+2];
+ Packet ptmp2 = ei_pset1(tmp2);
+ Scalar tmp3 = rhs[i+3];
+ Packet ptmp3 = ei_pset1(tmp3);
+ int iN0 = i*lhsStride;
+ int iN1 = (i+1)*lhsStride;
+ int iN2 = (i+2)*lhsStride;
+ int iN3 = (i+3)*lhsStride;
+
+ // process initial unaligned coeffs
+ for (int j=0; j<alignedStart; j++)
+ res[j] += tmp0 * lhs[j+iN0] + tmp1 * lhs[j+iN1] + tmp2 * lhs[j+iN2] + tmp3 * lhs[j+iN3];
+
+ if (alignedSize>0)
+ {
+ switch(alignmentPattern)
+ {
+ case AllAligned:
+ for (int j = alignedStart; j<alignedSize; j+=PacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(,,,);
+ break;
+ case EvenAligned:
+ for (int j = alignedStart; j<alignedSize; j+=PacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(,u,,);
+ break;
+ case FirstAligned:
+ if (peels>1)
+ for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize)
+ {
+ _EIGEN_ACCUMULATE_PACKETS(,u,u,);
+ _EIGEN_ACCUMULATE_PACKETS(,u,u,+PacketSize);
+ if (peels>2) _EIGEN_ACCUMULATE_PACKETS(,u,u,+2*PacketSize);
+ if (peels>3) _EIGEN_ACCUMULATE_PACKETS(,u,u,+3*PacketSize);
+ if (peels>4) _EIGEN_ACCUMULATE_PACKETS(,u,u,+4*PacketSize);
+ if (peels>5) _EIGEN_ACCUMULATE_PACKETS(,u,u,+5*PacketSize);
+ if (peels>6) _EIGEN_ACCUMULATE_PACKETS(,u,u,+6*PacketSize);
+ if (peels>7) _EIGEN_ACCUMULATE_PACKETS(,u,u,+7*PacketSize);
+ }
+ for (int j = peeledSize; j<alignedSize; j+=PacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(,u,u,);
+ break;
+ default:
+ for (int j = peeledSize; j<alignedSize; j+=PacketSize)
+ _EIGEN_ACCUMULATE_PACKETS(u,u,u,);
+ break;
+ }
+ }
+
+ // process remaining coeffs
+ for (int j=alignedSize; j<size; j++)
+ res[j] += tmp0 * lhs[j+iN0] + tmp1 * lhs[j+iN1] + tmp2 * lhs[j+iN2] + tmp3 * lhs[j+iN3];
+ }
+ for (int i=columnBound; i<rhs.size(); i++)
+ {
+ Scalar tmp0 = rhs[i];
+ Packet ptmp0 = ei_pset1(tmp0);
+ int iN0 = i*lhsStride;
+ if (alignedSize>0)
+ {
+ bool aligned0 = (iN0 % PacketSize) == 0;
+ if (aligned0)
+ for (int j = 0;j<alignedSize;j+=PacketSize)
+ ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_pload(&lhs[j+iN0])),ei_pload(&res[j])));
+ else
+ for (int j = 0;j<alignedSize;j+=PacketSize)
+ ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&lhs[j+iN0])),ei_pload(&res[j])));
+ }
+ // process remaining scalars
+ for (int j=alignedSize; j<size; j++)
+ res[j] += tmp0 * lhs[j+iN0];
+ }
+ asm("#end matrix_vector_product");
+
+ #undef _EIGEN_ACCUMULATE_PACKETS
+}
+
#endif // EIGEN_CACHE_FRIENDLY_PRODUCT_H
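
For readers of the kernel above, here is a small standalone sketch of its
alignment bookkeeping; the helper name aligned_start is hypothetical and the
code is not part of the patch. It counts how many leading coefficients of res
must be handled with scalar code before packet stores become aligned, which is
what the alignedStart computation does, and the comment below restates how the
lhs columns are then classified.

    #include <cstddef>

    // Number of leading coefficients of res to process with scalar code
    // before packet stores are aligned, e.g. aligned_start<float,4>(res, size).
    template<typename Scalar, int PacketSize>
    int aligned_start(const Scalar* res, int size)
    {
      const int mask     = PacketSize - 1;
      const int misalign = int((reinterpret_cast<std::size_t>(res) / sizeof(Scalar)) & mask);
      const int skip     = (PacketSize - misalign) & mask;
      return skip < size ? skip : size;   // never skip past the end of res
    }

    // The lhs columns are then classified via alignmentStep = lhsStride % PacketSize:
    //   step == 0 -> AllAligned   : every column shares res' alignment
    //   step == 2 -> EvenAligned  : the pattern repeats every other column
    //   otherwise -> FirstAligned : only columns at one fixed offset align with res,
    //                               or NoneAligned if no such offset exists.
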
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 6089c1be5..c2f0c07a8 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -250,8 +250,8 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
return res;
}
- const _LhsNested& lhs() const { return m_lhs; }
- const _RhsNested& rhs() const { return m_rhs; }
+ inline const _LhsNested& lhs() const { return m_lhs; }
+ inline const _RhsNested& rhs() const { return m_rhs; }
protected:
const LhsNested m_lhs;
@@ -480,11 +480,22 @@ struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMod
* Cache friendly product callers and specific nested evaluation strategies
***************************************************************************/
+template<typename Scalar, typename RhsType>
+static void ei_cache_friendly_product(
+ int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, Scalar* res);
+
+enum {
+ HasDirectAccess,
+ NoDirectAccess
+};
+
template<typename ProductType,
int LhsRows = ei_traits<ProductType>::RowsAtCompileTime,
int LhsOrder = int(ei_traits<ProductType>::LhsFlags)&RowMajorBit ? RowMajor : ColMajor,
+ int LhsHasDirectAccess = int(ei_traits<ProductType>::LhsFlags)&DirectAccessBit? HasDirectAccess : NoDirectAccess,
int RhsCols = ei_traits<ProductType>::ColsAtCompileTime,
- int RhsOrder = int(ei_traits<ProductType>::RhsFlags)&RowMajorBit ? RowMajor : ColMajor>
+ int RhsOrder = int(ei_traits<ProductType>::RhsFlags)&RowMajorBit ? RowMajor : ColMajor,
+ int RhsHasDirectAccess = int(ei_traits<ProductType>::RhsFlags)&DirectAccessBit? HasDirectAccess : NoDirectAccess>
struct ei_cache_friendly_product_selector
{
template<typename DestDerived>
@@ -495,21 +506,57 @@ struct ei_cache_friendly_product_selector
};
// optimized colmajor * vector path
-template<typename ProductType, int LhsRows, int RhsOrder>
-struct ei_cache_friendly_product_selector<ProductType,LhsRows,ColMajor,1,RhsOrder>
+template<typename ProductType, int LhsRows, int RhsOrder, int RhsAccess>
+struct ei_cache_friendly_product_selector<ProductType,LhsRows,NoDirectAccess,ColMajor,1,RhsOrder,RhsAccess>
{
+ typedef typename ei_traits<ProductType>::_LhsNested Lhs;
template<typename DestDerived>
inline static void run(DestDerived& res, const ProductType& product)
{
- const int rows = product.rhs().rows();
- for (int j=0; j<rows; ++j)
- res += product.rhs().coeff(j) * product.lhs().col(j);
+ ei_scalar_sum_op<typename ProductType::Scalar> _sum;
+ const int size = product.rhs().rows();
+ for (int k=0; k<size; ++k)
+ if (Lhs::Flags&DirectAccessBit)
+ // TODO: to properly handle this workaround, let's specialize Block for types having the DirectAccessBit
+ res += product.rhs().coeff(k) * Map<DestDerived>(&product.lhs().const_cast_derived().coeffRef(0,k),product.lhs().rows());
+ else
+ res += product.rhs().coeff(k) * product.lhs().col(k);
+ }
+};
+
+// optimized cache friendly colmajor * vector path for matrix with direct access flag
+// NOTE this path could also be enabled for expressions if we add runtime alignment queries
+template<typename ProductType, int LhsRows, int RhsOrder, int RhsAccess>
+struct ei_cache_friendly_product_selector<ProductType,LhsRows,HasDirectAccess,ColMajor,1,RhsOrder,RhsAccess>
+{
+ typedef typename ProductType::Scalar Scalar;
+
+ template<typename DestDerived>
+ inline static void run(DestDerived& res, const ProductType& product)
+ {
+ enum {
+ EvalToRes = (ei_packet_traits<Scalar>::size==1)
+ ||(DestDerived::Flags&ActualPacketAccessBit) && (!(DestDerived::Flags & RowMajorBit)) };
+ Scalar* __restrict__ _res;
+ if (EvalToRes)
+ _res = &res.coeffRef(0);
+ else
+ {
+ _res = (Scalar*)alloca(sizeof(Scalar)*res.size());
+ Map<Matrix<Scalar,DestDerived::RowsAtCompileTime,1> >(_res, res.size()) = res;
+ }
+ ei_cache_friendly_product(res.size(),
+ &product.lhs().const_cast_derived().coeffRef(0,0), product.lhs().stride(),
+ product.rhs(), _res);
+
+ if (!EvalToRes)
+ res = Map<Matrix<Scalar,DestDerived::RowsAtCompileTime,1> >(_res, res.size());
}
};
// optimized vector * rowmajor path
-template<typename ProductType, int LhsOrder, int RhsCols>
-struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,RhsCols,RowMajor>
+template<typename ProductType, int LhsOrder, int LhsAccess, int RhsCols>
+struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCols,RowMajor,NoDirectAccess>
{
template<typename DestDerived>
inline static void run(DestDerived& res, const ProductType& product)
@@ -520,6 +567,36 @@ struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,RhsCols,RowMajo
}
};
+// optimized cache friendly vector * rowmajor path for matrix with direct access flag
+// NOTE this path could also be enabled for expressions if we add runtime alignment queries
+template<typename ProductType, int LhsOrder, int LhsAccess, int RhsCols>
+struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCols,RowMajor,HasDirectAccess>
+{
+ typedef typename ProductType::Scalar Scalar;
+
+ template<typename DestDerived>
+ inline static void run(DestDerived& res, const ProductType& product)
+ {
+ enum {
+ EvalToRes = (ei_packet_traits<Scalar>::size==1)
+ ||(DestDerived::Flags & ActualPacketAccessBit) && (DestDerived::Flags & RowMajorBit) };
+ Scalar* __restrict__ _res;
+ if (EvalToRes)
+ _res = &res.coeffRef(0);
+ else
+ {
+ _res = (Scalar*)alloca(sizeof(Scalar)*res.size());
+ Map<Matrix<Scalar,DestDerived::RowsAtCompileTime,1> >(_res, res.size()) = res;
+ }
+ ei_cache_friendly_product(res.size(),
+ &product.rhs().const_cast_derived().coeffRef(0,0), product.rhs().stride(),
+ product.lhs().transpose(), _res);
+
+ if (!EvalToRes)
+ res = Map<Matrix<Scalar,1,DestDerived::ColsAtCompileTime> >(_res, res.size());
+ }
+};
+
/** \internal */
template<typename Derived>
template<typename Lhs,typename Rhs>