aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2008-05-01 13:53:05 +0000
committerGravatar Gael Guennebaud <g.gael@free.fr>2008-05-01 13:53:05 +0000
commit02f1615d2a5400df349f4cacc48490310926fdee (patch)
treef34a62cf71c1698e08a6f073ea4d6ae537989450 /Eigen/src
parent6486991ac349cc23cb1e1a51cc8a1019a7fdc5c4 (diff)
Enable vectorization of product with dynamic matrices,
extended cache optimal product to work in any row/column major situations, and a few bugfixes (forgot to add the Cholesky header, vectorization of CwiseBinary)
Diffstat (limited to 'Eigen/src')
-rw-r--r--Eigen/src/Core/Assign.h3
-rw-r--r--Eigen/src/Core/CwiseBinaryOp.h2
-rw-r--r--Eigen/src/Core/Lazy.h2
-rw-r--r--Eigen/src/Core/Product.h178
4 files changed, 130 insertions, 55 deletions
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index 42eda8a4e..362e3e9e7 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -112,7 +112,8 @@ template<typename OtherDerived>
Derived& MatrixBase<Derived>
::lazyAssign(const MatrixBase<OtherDerived>& other)
{
-// std::cout << "lazyAssign = " << Derived::Flags << " " << OtherDerived::Flags << "\n";
+// std::cout << typeid(OtherDerived).name() << "\n";
+// std::cout << "lazyAssign = " << (Derived::Flags&VectorizableBit) << " " << (OtherDerived::Flags&VectorizableBit) << "\n";
ei_assignment_impl<Derived, OtherDerived>::execute(derived(),other.derived());
return derived();
}
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index 0032d22e2..ca7913efc 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -64,7 +64,7 @@ struct ei_traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
DefaultLostFlagMask
| Like1DArrayBit
| (ei_functor_traits<BinaryOp>::IsVectorizable && ((Lhs::Flags & RowMajorBit)==(Rhs::Flags & RowMajorBit))
- ? VectorizableBit : 0))),
+ ? Lhs::Flags & Rhs::Flags & VectorizableBit : 0))),
CoeffReadCost = Lhs::CoeffReadCost + Rhs::CoeffReadCost + ei_functor_traits<BinaryOp>::Cost
};
};
diff --git a/Eigen/src/Core/Lazy.h b/Eigen/src/Core/Lazy.h
index 3e25acb19..ffe777158 100644
--- a/Eigen/src/Core/Lazy.h
+++ b/Eigen/src/Core/Lazy.h
@@ -78,7 +78,7 @@ template<typename ExpressionType> class Lazy
}
protected:
- const typename ExpressionType::Nested m_expression;
+ const ExpressionType& m_expression;
};
/** \returns an expression of the lazy version of *this.
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index a60c13e45..86f37ec43 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -108,7 +108,9 @@ struct ei_packet_product_unroller<RowMajor, Index, Dynamic, Lhs, Rhs, PacketScal
*/
template<typename Lhs, typename Rhs> struct ei_product_eval_mode
{
- enum{ value = Lhs::MaxRowsAtCompileTime >= 16 && Rhs::MaxColsAtCompileTime >= 16
+ enum{ value = Lhs::MaxRowsAtCompileTime >= 16
+ && Rhs::MaxColsAtCompileTime >= 16
+ && (!( (Lhs::Flags&RowMajorBit) && (Rhs::Flags&RowMajorBit xor RowMajorBit)))
? CacheOptimalProduct : NormalProduct };
};
@@ -129,29 +131,18 @@ struct ei_traits<Product<Lhs, Rhs, EvalMode> >
ColsAtCompileTime = Rhs::ColsAtCompileTime,
MaxRowsAtCompileTime = Lhs::MaxRowsAtCompileTime,
MaxColsAtCompileTime = Rhs::MaxColsAtCompileTime,
- Flags = (( (RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic)
- ? (unsigned int)(LhsFlags | RhsFlags)
- : (unsigned int)(LhsFlags | RhsFlags) & ~LargeBit )
+ _RhsVectorizable = (RhsFlags & RowMajorBit) && (RhsFlags & VectorizableBit) && (ColsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
+ _LhsVectorizable = (!(LhsFlags & RowMajorBit)) && (LhsFlags & VectorizableBit) && (RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
+ _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 1 : 0,
+ _RowMajor = (RhsFlags & RowMajorBit)
+ && (EvalMode==(int)CacheOptimalProduct ? (int)LhsFlags & RowMajorBit : (!_LhsVectorizable)),
+ _LostBits = DefaultLostFlagMask & ~(
+ (_RowMajor ? 0 : RowMajorBit)
+ | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
+ Flags = ((unsigned int)(LhsFlags | RhsFlags) & _LostBits)
| EvalBeforeAssigningBit
- | (ei_product_eval_mode<Lhs, Rhs>::value == (int)CacheOptimalProduct ? EvalBeforeNestingBit : 0))
- & (
- DefaultLostFlagMask & (~RowMajorBit)
- | (
- (
- (!(Lhs::Flags & RowMajorBit)) && (Lhs::Flags & VectorizableBit)
- && (Lhs::RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0)
- )
- ? VectorizableBit
- : (
- (
- (Rhs::Flags & RowMajorBit) && (Rhs::Flags & VectorizableBit)
- && (Lhs::ColsAtCompileTime % ei_packet_traits<Scalar>::size == 0)
- )
- ? RowMajorBit | VectorizableBit
- : 0
- )
- )
- ),
+ | ((int)EvalMode == (int)CacheOptimalProduct ? EvalBeforeNestingBit : 0)
+ | (_Vectorizable ? VectorizableBit : 0),
CoeffReadCost
= Lhs::ColsAtCompileTime == Dynamic
? Dynamic
@@ -278,11 +269,7 @@ Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheOptimalProdu
{
product._cacheOptimalEval(*this,
#ifdef EIGEN_VECTORIZE
- typename ei_meta_if<(Flags & VectorizableBit)
- && (!(Lhs::Flags & RowMajorBit)
- && (Lhs::RowsAtCompileTime!=Dynamic)
- && (Lhs::RowsAtCompileTime%ei_packet_traits<Scalar>::size==0) ),
- ei_meta_true,ei_meta_false>::ret()
+ typename ei_meta_if<Flags & VectorizableBit, ei_meta_true, ei_meta_false>::ret()
#else
ei_meta_false()
#endif
@@ -296,21 +283,53 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_fals
{
res.setZero();
const int cols4 = m_lhs.cols() & 0xfffffffC;
+ if (Lhs::Flags&RowMajorBit)
{
- for(int k=0; k<this->cols(); ++k)
+// std::cout << "opt rhs\n";
+ int j=0;
+ for(; j<cols4; j+=4)
+ {
+ for(int k=0; k<this->rows(); ++k)
+ {
+ const Scalar tmp0 = m_lhs.coeff(k,j );
+ const Scalar tmp1 = m_lhs.coeff(k,j+1);
+ const Scalar tmp2 = m_lhs.coeff(k,j+2);
+ const Scalar tmp3 = m_lhs.coeff(k,j+3);
+ for (int i=0; i<this->cols(); ++i)
+ res.coeffRef(k,i) += tmp0 * m_rhs.coeff(j+0,i) + tmp1 * m_rhs.coeff(j+1,i)
+ + tmp2 * m_rhs.coeff(j+2,i) + tmp3 * m_rhs.coeff(j+3,i);
+ }
+ }
+ for(; j<m_lhs.cols(); ++j)
{
- int j=0;
- for(; j<cols4; j+=4)
+ for(int k=0; k<this->rows(); ++k)
+ {
+ const Scalar tmp = m_rhs.coeff(k,j);
+ for (int i=0; i<this->cols(); ++i)
+ res.coeffRef(k,i) += tmp * m_lhs.coeff(j,i);
+ }
+ }
+ }
+ else
+ {
+// std::cout << "opt lhs\n";
+ int j = 0;
+ for(; j<cols4; j+=4)
+ {
+ for(int k=0; k<this->cols(); ++k)
{
const Scalar tmp0 = m_rhs.coeff(j ,k);
const Scalar tmp1 = m_rhs.coeff(j+1,k);
const Scalar tmp2 = m_rhs.coeff(j+2,k);
const Scalar tmp3 = m_rhs.coeff(j+3,k);
for (int i=0; i<this->rows(); ++i)
- res.coeffRef(i,k) += tmp0 * m_lhs.coeff(i,j) + tmp1 * m_lhs.coeff(i,j+1)
- + tmp2 * m_lhs.coeff(i,j+2) + tmp3 * m_lhs.coeff(i,j+3);
+ res.coeffRef(i,k) += tmp0 * m_lhs.coeff(i,j+0) + tmp1 * m_lhs.coeff(i,j+1)
+ + tmp2 * m_lhs.coeff(i,j+2) + tmp3 * m_lhs.coeff(i,j+3);
}
- for(; j<m_lhs.cols(); ++j)
+ }
+ for(; j<m_lhs.cols(); ++j)
+ {
+ for(int k=0; k<this->cols(); ++k)
{
const Scalar tmp = m_rhs.coeff(j,k);
for (int i=0; i<this->rows(); ++i)
@@ -325,40 +344,95 @@ template<typename Lhs, typename Rhs, int EvalMode>
template<typename DestDerived>
void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true) const
{
+
+ if (((Lhs::Flags&RowMajorBit) && (_cols() % ei_packet_traits<Scalar>::size != 0))
+ || (_rows() % ei_packet_traits<Scalar>::size != 0))
+ {
+ return _cacheOptimalEval(res, ei_meta_false());
+ }
+
res.setZero();
const int cols4 = m_lhs.cols() & 0xfffffffC;
- for(int k=0; k<this->cols(); k++)
+ if (Lhs::Flags&RowMajorBit)
{
+// std::cout << "packet rhs\n";
int j=0;
for(; j<cols4; j+=4)
{
- const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k));
- const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k));
- const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k));
- const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k));
- for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
+ for(int k=0; k<this->rows(); k++)
{
- res.writePacketCoeff(i,k,\
- ei_padd(
- res.packetCoeff(i,k),
+ const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_lhs.coeff(k,j+0));
+ const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_lhs.coeff(k,j+1));
+ const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_lhs.coeff(k,j+2));
+ const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_lhs.coeff(k,j+3));
+ for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
+ {
+ res.writePacketCoeff(k,i,
ei_padd(
+ res.packetCoeff(k,i),
ei_padd(
- ei_pmul(tmp0, m_lhs.packetCoeff(i,j)),
- ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
+ ei_padd(
+ ei_pmul(tmp0, m_rhs.packetCoeff(j+0,i)),
+ ei_pmul(tmp1, m_rhs.packetCoeff(j+1,i))),
+ ei_padd(
+ ei_pmul(tmp2, m_rhs.packetCoeff(j+2,i)),
+ ei_pmul(tmp3, m_rhs.packetCoeff(j+3,i))
+ )
+ )
+ )
+ );
+ }
+ }
+ }
+ for(; j<m_lhs.cols(); ++j)
+ {
+ for(int k=0; k<this->rows(); k++)
+ {
+ const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_lhs.coeff(k,j));
+ for (int i=0; i<this->cols(); i+=ei_packet_traits<Scalar>::size)
+ res.writePacketCoeff(k,i, ei_pmul(tmp, m_rhs.packetCoeff(j,i)));
+ }
+ }
+ }
+ else
+ {
+// std::cout << "packet lhs\n";
+ int j=0;
+ for(; j<cols4; j+=4)
+ {
+ for(int k=0; k<this->cols(); k++)
+ {
+ const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k));
+ const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k));
+ const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k));
+ const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k));
+ for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
+ {
+ res.writePacketCoeff(i,k,
+ ei_padd(
+ res.packetCoeff(i,k),
ei_padd(
- ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)),
- ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3))
+ ei_padd(
+ ei_pmul(tmp0, m_lhs.packetCoeff(i,j)),
+ ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
+ ei_padd(
+ ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)),
+ ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3))
+ )
)
)
- )
- );
+ );
+ }
}
}
for(; j<m_lhs.cols(); ++j)
{
- const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k));
- for (int i=0; i<this->rows(); ++i)
- res.writePacketCoeff(i,k,ei_pmul(tmp, m_lhs.packetCoeff(i,j)));
+ for(int k=0; k<this->cols(); k++)
+ {
+ const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k));
+ for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size)
+ res.writePacketCoeff(i,k, ei_pmul(tmp, m_lhs.packetCoeff(i,j)));
+ }
}
}
}