diff options
author | Benoit Jacob <jacob.benoit.1@gmail.com> | 2008-04-06 18:01:03 +0000 |
---|---|---|
committer | Benoit Jacob <jacob.benoit.1@gmail.com> | 2008-04-06 18:01:03 +0000 |
commit | 371d302efbbbedf2b4818f3efae466bedab63a1e (patch) | |
tree | 433daed948c140f5e931bcfe8b7119b99be12780 | |
parent | 30ec34de362744fa9a3e82573cb23662aaafbf5a (diff) |
- merge ei_xpr_copy and ei_eval_if_needed_before_nesting into a single ei_xpr_copy template
- use CoeffReadCost to determine when to unroll loops;
for now this is done only in Product.h and OperatorEquals.h
Performance is unchanged by this commit: it is generally still not as good as it was
before the big changes.
-rw-r--r-- | Eigen/src/Core/ForwardDeclarations.h | 27 | ||||
-rw-r--r-- | Eigen/src/Core/OperatorEquals.h | 11 | ||||
-rw-r--r-- | Eigen/src/Core/Product.h | 35 | ||||
-rw-r--r-- | Eigen/src/Core/Util.h | 2 | ||||
-rw-r--r-- | bench/benchmark.cpp | 6 |
5 files changed, 49 insertions, 32 deletions
diff --git a/Eigen/src/Core/ForwardDeclarations.h b/Eigen/src/Core/ForwardDeclarations.h index 36519c7da..32be8cd68 100644 --- a/Eigen/src/Core/ForwardDeclarations.h +++ b/Eigen/src/Core/ForwardDeclarations.h @@ -83,27 +83,30 @@ template<typename T> struct ei_eval template<typename T> struct ei_unref { typedef T type; }; template<typename T> struct ei_unref<T&> { typedef T type; }; -template<typename T> struct ei_xpr_copy +template<typename T> struct ei_is_temporary { - typedef typename ei_meta_if< ei_traits<T>::Flags & EvalBeforeNestingBit, - typename ei_eval<T>::type, const T&>::ret type; + enum { ret = 0 }; }; -template<typename T> struct ei_xpr_copy<Temporary<T> > +template<typename T> struct ei_is_temporary<Temporary<T> > { - typedef Temporary<T> type; + enum { ret = 1 }; }; -template<typename T, int n=1> struct ei_eval_if_needed_before_nesting +template<typename T, int n=1> struct ei_xpr_copy { - // FIXME should we consider the additional store as well as the creation cost of the temporary ? 
- enum { eval = T::Flags & EvalBeforeNestingBit - || (n+1) * NumTraits<typename ei_traits<T>::Scalar>::ReadCost < (n-1) * T::CoeffReadCost }; - typedef typename ei_meta_if<eval, typename ei_eval<T>::type, T>::ret XprType; - typedef typename ei_meta_if<eval, typename ei_eval<T>::type, typename T::XprCopy>::ret CopyType; + typedef typename ei_meta_if< + ei_is_temporary<T>::ret, + T, + typename ei_meta_if< + ei_traits<T>::Flags & EvalBeforeNestingBit + || (n+1) * NumTraits<typename ei_traits<T>::Scalar>::ReadCost < (n-1) * T::CoeffReadCost, + typename ei_eval<T>::type, + const T& + >::ret + >::ret type; }; - template<typename T> struct ei_functor_traits { enum diff --git a/Eigen/src/Core/OperatorEquals.h b/Eigen/src/Core/OperatorEquals.h index c93a9329f..5529c8313 100644 --- a/Eigen/src/Core/OperatorEquals.h +++ b/Eigen/src/Core/OperatorEquals.h @@ -102,14 +102,15 @@ template<typename OtherDerived> Derived& MatrixBase<Derived> ::lazyAssign(const MatrixBase<OtherDerived>& other) { + const bool unroll = SizeAtCompileTime * OtherDerived::CoeffReadCost <= EIGEN_UNROLLING_LIMIT; if(IsVectorAtCompileTime && OtherDerived::IsVectorAtCompileTime) // copying a vector expression into a vector { ei_assert(size() == other.size()); - if(SizeAtCompileTime <= EIGEN_UNROLLING_LIMIT) + if(unroll) ei_vector_operator_equals_unroller <Derived, OtherDerived, - SizeAtCompileTime <= EIGEN_UNROLLING_LIMIT ? SizeAtCompileTime : Dynamic + unroll ? SizeAtCompileTime : Dynamic >::run(derived(), other.derived()); else for(int i = 0; i < size(); i++) @@ -118,11 +119,11 @@ Derived& MatrixBase<Derived> else // copying a matrix expression into a matrix { ei_assert(rows() == other.rows() && cols() == other.cols()); - if(SizeAtCompileTime <= EIGEN_UNROLLING_LIMIT) + if(unroll) { ei_matrix_operator_equals_unroller <Derived, OtherDerived, - SizeAtCompileTime <= EIGEN_UNROLLING_LIMIT ? SizeAtCompileTime : Dynamic + unroll ? 
SizeAtCompileTime : Dynamic >::run(derived(), other.derived()); } else @@ -152,7 +153,7 @@ template<typename OtherDerived> Derived& MatrixBase<Derived> ::operator=(const MatrixBase<OtherDerived>& other) { - if (OtherDerived::Flags & EvalBeforeAssigningBit) + if(OtherDerived::Flags & EvalBeforeAssigningBit) { return lazyAssign(other.derived().eval()); } diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 608de0b9f..d303cbdb7 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -84,21 +84,29 @@ template<typename Lhs, typename Rhs, int EvalMode> struct ei_traits<Product<Lhs, Rhs, EvalMode> > { typedef typename Lhs::Scalar Scalar; + typedef typename ei_xpr_copy<Lhs,Rhs::ColsAtCompileTime>::type LhsXprCopy; + typedef typename ei_xpr_copy<Rhs,Lhs::RowsAtCompileTime>::type RhsXprCopy; + typedef typename ei_unref<LhsXprCopy>::type ActualLhs; + typedef typename ei_unref<RhsXprCopy>::type ActualRhs; enum { + LhsCoeffReadCost = ActualLhs::CoeffReadCost, + RhsCoeffReadCost = ActualRhs::CoeffReadCost, + LhsFlags = ActualLhs::Flags, + RhsFlags = ActualRhs::Flags, RowsAtCompileTime = Lhs::RowsAtCompileTime, ColsAtCompileTime = Rhs::ColsAtCompileTime, MaxRowsAtCompileTime = Lhs::MaxRowsAtCompileTime, MaxColsAtCompileTime = Rhs::MaxColsAtCompileTime, Flags = ( (RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) - ? (unsigned int)(Lhs::Flags | Rhs::Flags) - : (unsigned int)(Lhs::Flags | Rhs::Flags) & ~LargeBit ) + ? (unsigned int)(LhsFlags | RhsFlags) + : (unsigned int)(LhsFlags | RhsFlags) & ~LargeBit ) | EvalBeforeAssigningBit | (ei_product_eval_mode<Lhs, Rhs>::value == (int)CacheOptimal ? EvalBeforeNestingBit : 0), CoeffReadCost = Lhs::ColsAtCompileTime == Dynamic ? 
Dynamic : Lhs::ColsAtCompileTime - * (NumTraits<Scalar>::MulCost + Lhs::CoeffReadCost + Rhs::CoeffReadCost) + * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost) + (Lhs::ColsAtCompileTime - 1) * NumTraits<Scalar>::AddCost }; }; @@ -110,10 +118,8 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm EIGEN_GENERIC_PUBLIC_INTERFACE(Product) - typedef typename ei_eval_if_needed_before_nesting<Lhs,Rhs::ColsAtCompileTime>::CopyType CopyLhs; - typedef typename ei_eval_if_needed_before_nesting<Rhs,Lhs::RowsAtCompileTime>::CopyType CopyRhs; - typedef typename ei_eval_if_needed_before_nesting<Lhs,Rhs::ColsAtCompileTime>::XprType XprLhs; - typedef typename ei_eval_if_needed_before_nesting<Rhs,Lhs::RowsAtCompileTime>::XprType XprRhs; + typedef typename ei_traits<Product>::LhsXprCopy LhsXprCopy; + typedef typename ei_traits<Product>::RhsXprCopy RhsXprCopy; Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) @@ -133,12 +139,15 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm const Scalar _coeff(int row, int col) const { Scalar res; - if(Lhs::ColsAtCompileTime <= EIGEN_UNROLLING_LIMIT) + const bool unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT; + if(unroll) + { ei_product_unroller<Lhs::ColsAtCompileTime-1, - Lhs::ColsAtCompileTime <= EIGEN_UNROLLING_LIMIT - ? Lhs::ColsAtCompileTime : Dynamic, - XprLhs, XprRhs> + unroll ? Lhs::ColsAtCompileTime : Dynamic, + typename ei_unref<LhsXprCopy>::type, + typename ei_unref<RhsXprCopy>::type> ::run(row, col, m_lhs, m_rhs, res); + } else { res = m_lhs.coeff(row, 0) * m_rhs.coeff(0, col); @@ -149,8 +158,8 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm } protected: - const CopyLhs m_lhs; - const CopyRhs m_rhs; + const LhsXprCopy m_lhs; + const RhsXprCopy m_rhs; }; /** \returns the matrix product of \c *this and \a other. 
diff --git a/Eigen/src/Core/Util.h b/Eigen/src/Core/Util.h index 10fdacb8b..ad8a15b07 100644 --- a/Eigen/src/Core/Util.h +++ b/Eigen/src/Core/Util.h @@ -31,7 +31,7 @@ /** Defines the maximal loop size to enable meta unrolling of loops */ #ifndef EIGEN_UNROLLING_LIMIT -#define EIGEN_UNROLLING_LIMIT 16 +#define EIGEN_UNROLLING_LIMIT 400 #endif #ifdef EIGEN_DEFAULT_TO_ROW_MAJOR diff --git a/bench/benchmark.cpp b/bench/benchmark.cpp index ee58607cd..4ff678d8a 100644 --- a/bench/benchmark.cpp +++ b/bench/benchmark.cpp @@ -8,6 +8,10 @@ using namespace std; USING_PART_OF_NAMESPACE_EIGEN +#ifndef REPEAT +#define REPEAT 40000000 +#endif + int main(int argc, char *argv[]) { Matrix<double,MATSIZE,MATSIZE> I; @@ -19,7 +23,7 @@ int main(int argc, char *argv[]) m(i,j) = (i+MATSIZE*j); } asm("#begin"); - for(int a = 0; a < 40000000; a++) + for(int a = 0; a < REPEAT; a++) { m = I + 0.00005 * (m + m*m); } |