diff options
author | 2008-04-25 15:46:18 +0000 | |
---|---|---|
committer | 2008-04-25 15:46:18 +0000 | |
commit | a451835bce179a999cddedc3c9dab49e421968eb (patch) | |
tree | df3fed6cf99e2bd9cf362e6e3800c284f115c31b | |
parent | 30d47b5250240d2313d1473adb6f6dd47c5d685a (diff) |
Make the explicit vectorization much more flexible:
- support dynamic sizes
- support arbitrary matrix size when the matrix can be seen as a 1D array
(except for fixed-size matrices, where the size in bytes must be a multiple of 16;
this is to allow compact storage of a vector of matrices)
Note that the explicit vectorization is still experimental and far from being completely tested.
-rw-r--r-- | Eigen/Core | 2 | ||||
-rw-r--r-- | Eigen/src/Core/Assign.h | 62 | ||||
-rw-r--r-- | Eigen/src/Core/CwiseNullaryOp.h | 95 | ||||
-rw-r--r-- | Eigen/src/Core/Lazy.h | 5 | ||||
-rw-r--r-- | Eigen/src/Core/Matrix.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/MatrixBase.h | 14 | ||||
-rw-r--r-- | Eigen/src/Core/MatrixStorage.h | 40 | ||||
-rw-r--r-- | Eigen/src/Core/Product.h | 110 | ||||
-rw-r--r-- | Eigen/src/Core/Temporary.h | 5 | ||||
-rw-r--r-- | Eigen/src/Core/util/Meta.h | 29 |
10 files changed, 264 insertions, 100 deletions
diff --git a/Eigen/Core b/Eigen/Core index 950328aaa..3007899d1 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -2,7 +2,7 @@ #define EIGEN_CORE_H #ifndef EIGEN_DONT_VECTORIZE -#ifdef __SSE2__ +#if ((defined __SSE2__) && ( (!defined __GNUC__) || (__GNUC__>=4 && __GNUC_MINOR__>=2))) #define EIGEN_VECTORIZE #define EIGEN_VECTORIZE_SSE #include <emmintrin.h> diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h index d0f126689..c9e2b6b4b 100644 --- a/Eigen/src/Core/Assign.h +++ b/Eigen/src/Core/Assign.h @@ -99,7 +99,11 @@ struct ei_matrix_assignment_packet_unroller<Derived1, Derived2, Dynamic> template <typename Derived, typename OtherDerived, bool Vectorize = (Derived::Flags & OtherDerived::Flags & VectorizableBit) - && ((Derived::Flags&RowMajorBit)==(OtherDerived::Flags&RowMajorBit))> + && ((Derived::Flags&RowMajorBit)==(OtherDerived::Flags&RowMajorBit)) + && ( (Derived::Flags & OtherDerived::Flags & Like1DArrayBit) + ||((Derived::Flags&RowMajorBit) + ? Derived::ColsAtCompileTime!=Dynamic && (Derived::ColsAtCompileTime%ei_packet_traits<typename Derived::Scalar>::size==0) + : Derived::RowsAtCompileTime!=Dynamic && (Derived::RowsAtCompileTime%ei_packet_traits<typename Derived::Scalar>::size==0)) )> struct ei_assignment_impl; template<typename Derived> @@ -107,6 +111,7 @@ template<typename OtherDerived> Derived& MatrixBase<Derived> ::lazyAssign(const MatrixBase<OtherDerived>& other) { +// std::cout << "lazyAssign = " << Derived::Flags << " " << OtherDerived::Flags << "\n"; ei_assignment_impl<Derived,OtherDerived>::execute(derived(),other.derived()); return derived(); } @@ -178,6 +183,7 @@ struct ei_assignment_impl<Derived, OtherDerived, true> ei_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); if(unroll) { +// std::cout << "vectorized unrolled\n"; ei_matrix_assignment_packet_unroller <Derived, OtherDerived, unroll && int(Derived::SizeAtCompileTime)>=ei_packet_traits<typename Derived::Scalar>::size @@ -188,15 +194,61 @@ struct 
ei_assignment_impl<Derived, OtherDerived, true> { if(OtherDerived::Flags&RowMajorBit) { - for(int i = 0; i < dst.rows(); i++) - for(int j = 0; j < dst.cols(); j+=ei_packet_traits<typename Derived::Scalar>::size) + if ( (Derived::Flags & OtherDerived::Flags & Like1DArrayBit) + && (Derived::ColsAtCompileTime==Dynamic + || Derived::ColsAtCompileTime%ei_packet_traits<typename Derived::Scalar>::size!=0)) + { +// std::cout << "vectorized linear row major\n"; + const int size = dst.rows() * dst.cols(); + const int alignedSize = (size/ei_packet_traits<typename Derived::Scalar>::size)*ei_packet_traits<typename Derived::Scalar>::size; + int index = 0; + for ( ; index<alignedSize ; index+=ei_packet_traits<typename Derived::Scalar>::size) + { + // FIXME the following is not really efficient + int i = index/dst.rows(); + int j = index%dst.rows(); dst.writePacketCoeff(i, j, src.packetCoeff(i, j)); + } + for(int i = alignedSize/dst.rows(); i < dst.rows(); i++) + for(int j = alignedSize%dst.rows(); j < dst.cols(); j++) + dst.coeffRef(i, j) = src.coeff(i, j); + } + else + { +// std::cout << "vectorized normal row major\n"; + for(int i = 0; i < dst.rows(); i++) + for(int j = 0; j < dst.cols(); j+=ei_packet_traits<typename Derived::Scalar>::size) + dst.writePacketCoeff(i, j, src.packetCoeff(i, j)); + } } else { - for(int j = 0; j < dst.cols(); j++) - for(int i = 0; i < dst.rows(); i+=ei_packet_traits<typename Derived::Scalar>::size) + if ((Derived::Flags & OtherDerived::Flags & Like1DArrayBit) + && ( Derived::RowsAtCompileTime==Dynamic + || Derived::RowsAtCompileTime%ei_packet_traits<typename Derived::Scalar>::size!=0)) + { +// std::cout << "vectorized linear col major\n"; + const int size = dst.rows() * dst.cols(); + const int alignedSize = (size/ei_packet_traits<typename Derived::Scalar>::size)*ei_packet_traits<typename Derived::Scalar>::size; + int index = 0; + for ( ; index<alignedSize ; index+=ei_packet_traits<typename Derived::Scalar>::size) + { + // FIXME the following is not 
really efficient + int i = index%dst.rows(); + int j = index/dst.rows(); dst.writePacketCoeff(i, j, src.packetCoeff(i, j)); + } + for(int j = alignedSize/dst.rows(); j < dst.cols(); j++) + for(int i = alignedSize%dst.rows(); i < dst.rows(); i++) + dst.coeffRef(i, j) = src.coeff(i, j); + } + else + { +// std::cout << "vectorized normal col major\n"; + for(int j = 0; j < dst.cols(); j++) + for(int i = 0; i < dst.rows(); i+=ei_packet_traits<typename Derived::Scalar>::size) + dst.writePacketCoeff(i, j, src.packetCoeff(i, j)); + } } } } diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index d3bce41d8..4f09bd8a9 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h @@ -31,8 +31,8 @@ * * \param NullaryOp template functor implementing the operator * - * This class represents an expression of a generic zeroary operator. - * It is the return type of the ones(), zero(), constant() and random() functions, + * This class represents an expression of a generic nullary operator. + * It is the return type of the ones(), zero(), constant(), identity() and random() functions, * and most of the time this is the only way it is used. * * However, if you want to write a function returning such an expression, you @@ -94,12 +94,18 @@ class CwiseNullaryOp : ei_no_assignment_operator, }; -/* \returns an expression of a custom coefficient-wise operator \a func of *this and \a other +/** \returns an expression of a matrix defined by a custom functor \a func * - * The template parameter \a CustomNullaryOp is the type of the functor - * of the custom operator (see class CwiseNullaryOp for an example) + * The parameters \a rows and \a cols are the number of rows and of columns of + * the returned matrix. Must be compatible with this MatrixBase type. + * + * This variant is meant to be used for dynamic-size matrix types. 
For fixed-size types, + * it is redundant to pass \a rows and \a cols as arguments, so zero() should be used + * instead. * - * \sa class CwiseNullaryOp, MatrixBase::operator+, MatrixBase::operator-, MatrixBase::cwiseProduct, MatrixBase::cwiseQuotient + * The template parameter \a CustomNullaryOp is the type of the functor. + * + * \sa class CwiseNullaryOp */ template<typename Derived> template<typename CustomNullaryOp> @@ -109,6 +115,21 @@ MatrixBase<Derived>::cwiseCreate(int rows, int cols, const CustomNullaryOp& func return CwiseNullaryOp<CustomNullaryOp, Derived>(rows, cols, func); } +/** \returns an expression of a matrix defined by a custom functor \a func + * + * The parameter \a size is the size of the returned vector. + * Must be compatible with this MatrixBase type. + * + * \only_for_vectors + * + * This variant is meant to be used for dynamic-size vector types. For fixed-size types, + * it is redundant to pass \a size as argument, so zero() should be used + * instead. + * + * The template parameter \a CustomNullaryOp is the type of the functor. + * + * \sa class CwiseNullaryOp + */ template<typename Derived> template<typename CustomNullaryOp> const CwiseNullaryOp<CustomNullaryOp, Derived> @@ -119,6 +140,15 @@ MatrixBase<Derived>::cwiseCreate(int size, const CustomNullaryOp& func) else return CwiseNullaryOp<CustomNullaryOp, Derived>(size, 1, func); } +/** \returns an expression of a matrix defined by a custom functor \a func + * + * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you + * need to use the variants taking size arguments. + * + * The template parameter \a CustomNullaryOp is the type of the functor. 
+ * + * \sa class CwiseNullaryOp + */ template<typename Derived> template<typename CustomNullaryOp> const CwiseNullaryOp<CustomNullaryOp, Derived> @@ -127,7 +157,16 @@ MatrixBase<Derived>::cwiseCreate(const CustomNullaryOp& func) return CwiseNullaryOp<CustomNullaryOp, Derived>(rows(), cols(), func); } -/* \returns an expression of the coefficient-wise \< operator of *this and \a other +/** \returns an expression of a constant matrix of value \a value + * + * The parameters \a rows and \a cols are the number of rows and of columns of + * the returned matrix. Must be compatible with this MatrixBase type. + * + * This variant is meant to be used for dynamic-size matrix types. For fixed-size types, + * it is redundant to pass \a rows and \a cols as arguments, so zero() should be used + * instead. + * + * The template parameter \a CustomNullaryOp is the type of the functor. * * \sa class CwiseNullaryOp */ @@ -138,6 +177,21 @@ MatrixBase<Derived>::constant(int rows, int cols, const Scalar& value) return cwiseCreate(rows, cols, ei_scalar_constant_op<Scalar>(value)); } +/** \returns an expression of a constant matrix of value \a value + * + * The parameter \a size is the size of the returned vector. + * Must be compatible with this MatrixBase type. + * + * \only_for_vectors + * + * This variant is meant to be used for dynamic-size vector types. For fixed-size types, + * it is redundant to pass \a size as argument, so zero() should be used + * instead. + * + * The template parameter \a CustomNullaryOp is the type of the functor. 
+ * + * \sa class CwiseNullaryOp + */ template<typename Derived> const CwiseNullaryOp<ei_scalar_constant_op<typename ei_traits<Derived>::Scalar>, Derived> MatrixBase<Derived>::constant(int size, const Scalar& value) @@ -145,6 +199,15 @@ MatrixBase<Derived>::constant(int size, const Scalar& value) return cwiseCreate(size, ei_scalar_constant_op<Scalar>(value)); } +/** \returns an expression of a constant matrix of value \a value + * + * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you + * need to use the variants taking size arguments. + * + * The template parameter \a CustomNullaryOp is the type of the functor. + * + * \sa class CwiseNullaryOp + */ template<typename Derived> const CwiseNullaryOp<ei_scalar_constant_op<typename ei_traits<Derived>::Scalar>, Derived> MatrixBase<Derived>::constant(const Scalar& value) @@ -163,6 +226,10 @@ bool MatrixBase<Derived>::isEqualToConstant return true; } +/** Sets all coefficients in this expression to \a value. + * + * \sa class CwiseNullaryOp, zero(), ones() + */ template<typename Derived> Derived& MatrixBase<Derived>::setConstant(const Scalar& value) { @@ -238,7 +305,7 @@ MatrixBase<Derived>::zero() * Example: \include MatrixBase_isZero.cpp * Output: \verbinclude MatrixBase_isZero.out * - * \sa class Zero, zero() + * \sa class CwiseNullaryOp, zero() */ template<typename Derived> bool MatrixBase<Derived>::isZero @@ -256,7 +323,7 @@ bool MatrixBase<Derived>::isZero * Example: \include MatrixBase_setZero.cpp * Output: \verbinclude MatrixBase_setZero.out * - * \sa class Zero, zero() + * \sa class CwiseNullaryOp, zero() */ template<typename Derived> Derived& MatrixBase<Derived>::setZero() @@ -333,7 +400,7 @@ MatrixBase<Derived>::ones() * Example: \include MatrixBase_isOnes.cpp * Output: \verbinclude MatrixBase_isOnes.out * - * \sa class Ones, ones() + * \sa class CwiseNullaryOp, ones() */ template<typename Derived> bool MatrixBase<Derived>::isOnes @@ -347,7 +414,7 @@ bool 
MatrixBase<Derived>::isOnes * Example: \include MatrixBase_setOnes.cpp * Output: \verbinclude MatrixBase_setOnes.out * - * \sa class Ones, ones() + * \sa class CwiseNullaryOp, ones() */ template<typename Derived> Derived& MatrixBase<Derived>::setOnes() @@ -424,7 +491,7 @@ MatrixBase<Derived>::random() * Example: \include MatrixBase_setRandom.cpp * Output: \verbinclude MatrixBase_setRandom.out * - * \sa class Random, ei_random() + * \sa class CwiseNullaryOp, ei_random() */ template<typename Derived> Derived& MatrixBase<Derived>::setRandom() @@ -479,7 +546,7 @@ MatrixBase<Derived>::identity() * Example: \include MatrixBase_isIdentity.cpp * Output: \verbinclude MatrixBase_isIdentity.out * - * \sa class Identity, identity(), identity(int,int), setIdentity() + * \sa class CwiseNullaryOp, identity(), identity(int,int), setIdentity() */ template<typename Derived> bool MatrixBase<Derived>::isIdentity @@ -509,7 +576,7 @@ bool MatrixBase<Derived>::isIdentity * Example: \include MatrixBase_setIdentity.cpp * Output: \verbinclude MatrixBase_setIdentity.out * - * \sa class Identity, identity(), identity(int,int), isIdentity() + * \sa class CwiseNullaryOp, identity(), identity(int,int), isIdentity() */ template<typename Derived> Derived& MatrixBase<Derived>::setIdentity() diff --git a/Eigen/src/Core/Lazy.h b/Eigen/src/Core/Lazy.h index 0c65cdeba..3e25acb19 100644 --- a/Eigen/src/Core/Lazy.h +++ b/Eigen/src/Core/Lazy.h @@ -72,6 +72,11 @@ template<typename ExpressionType> class Lazy return m_expression.coeff(row, col); } + PacketScalar _packetCoeff(int row, int col) const + { + return m_expression.packetCoeff(row, col); + } + protected: const typename ExpressionType::Nested m_expression; }; diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index 92f726011..dd1235aa3 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -79,7 +79,7 @@ struct ei_traits<Matrix<_Scalar, _Rows, _Cols, _SuggestedFlags, _MaxRows, _MaxCo ColsAtCompileTime = _Cols, 
MaxRowsAtCompileTime = _MaxRows, MaxColsAtCompileTime = _MaxCols, - Flags = ei_corrected_matrix_flags<_Scalar, _Rows, _Cols, _SuggestedFlags>::ret, + Flags = ei_corrected_matrix_flags<_Scalar, ei_size_at_compile_time<_MaxRows,_MaxCols>::ret, _SuggestedFlags>::ret, CoeffReadCost = NumTraits<Scalar>::ReadCost }; }; diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 3247ec4bf..b6a161bdd 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -75,11 +75,8 @@ template<typename Derived> class MatrixBase * it is set to the \a Dynamic constant. * \sa MatrixBase::rows(), MatrixBase::cols(), RowsAtCompileTime, SizeAtCompileTime */ - SizeAtCompileTime - = ei_traits<Derived>::RowsAtCompileTime == Dynamic - || ei_traits<Derived>::ColsAtCompileTime == Dynamic - ? Dynamic - : ei_traits<Derived>::RowsAtCompileTime * ei_traits<Derived>::ColsAtCompileTime, + SizeAtCompileTime = ei_size_at_compile_time<ei_traits<Derived>::RowsAtCompileTime, + ei_traits<Derived>::ColsAtCompileTime>::ret, /**< This is equal to the number of coefficients, i.e. the number of * rows times the number of columns, or to \a Dynamic if this is not * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */ @@ -106,11 +103,8 @@ template<typename Derived> class MatrixBase * \sa ColsAtCompileTime, MaxRowsAtCompileTime, MaxSizeAtCompileTime */ - MaxSizeAtCompileTime - = ei_traits<Derived>::MaxRowsAtCompileTime == Dynamic - || ei_traits<Derived>::MaxColsAtCompileTime == Dynamic - ? Dynamic - : ei_traits<Derived>::MaxRowsAtCompileTime * ei_traits<Derived>::MaxColsAtCompileTime, + MaxSizeAtCompileTime = ei_size_at_compile_time<ei_traits<Derived>::MaxRowsAtCompileTime, + ei_traits<Derived>::MaxColsAtCompileTime>::ret, /**< This value is equal to the maximum possible number of coefficients that this expression * might have. If this expression might have an arbitrarily high number of coefficients, * this value is set to \a Dynamic. 
diff --git a/Eigen/src/Core/MatrixStorage.h b/Eigen/src/Core/MatrixStorage.h index cca4414d3..c8ee7a62c 100644 --- a/Eigen/src/Core/MatrixStorage.h +++ b/Eigen/src/Core/MatrixStorage.h @@ -49,6 +49,28 @@ template <typename T, int Size> struct ei_aligned_array<T,Size,false> T array[Size]; }; +template<typename T> +T* ei_aligned_malloc(size_t size) +{ + #ifdef EIGEN_VECTORIZE + if (ei_packet_traits<T>::size>1) + return static_cast<T*>(_mm_malloc(sizeof(T)*size, 16)); + else + #endif + return new T[size]; +} + +template<typename T> +void ei_aligned_free(T* ptr) +{ + #ifdef EIGEN_VECTORIZE + if (ei_packet_traits<T>::size>1) + _mm_free(ptr); + else + #endif + delete[] ptr; +} + // purely fixed-size matrix template<typename T, int Size, int _Rows, int _Cols> class ei_matrix_storage { @@ -127,7 +149,7 @@ template<typename T> class ei_matrix_storage<T, Dynamic, Dynamic, Dynamic> int m_cols; public: ei_matrix_storage(int size, int rows, int cols) - : m_data(new T[size]), m_rows(rows), m_cols(cols) {} + : m_data(ei_aligned_malloc<T>(size)), m_rows(rows), m_cols(cols) {} ~ei_matrix_storage() { delete[] m_data; } int rows(void) const {return m_rows;} int cols(void) const {return m_cols;} @@ -135,8 +157,8 @@ template<typename T> class ei_matrix_storage<T, Dynamic, Dynamic, Dynamic> { if(size != m_rows*m_cols) { - delete[] m_data; - m_data = new T[size]; + ei_aligned_free(m_data); + m_data = ei_aligned_malloc<T>(size); } m_rows = rows; m_cols = cols; @@ -151,7 +173,7 @@ template<typename T, int _Rows> class ei_matrix_storage<T, Dynamic, _Rows, Dynam T *m_data; int m_cols; public: - ei_matrix_storage(int size, int, int cols) : m_data(new T[size]), m_cols(cols) {} + ei_matrix_storage(int size, int, int cols) : m_data(ei_aligned_malloc<T>(size)), m_cols(cols) {} ~ei_matrix_storage() { delete[] m_data; } static int rows(void) {return _Rows;} int cols(void) const {return m_cols;} @@ -159,8 +181,8 @@ template<typename T, int _Rows> class ei_matrix_storage<T, Dynamic, _Rows, Dynam { 
if(size != _Rows*m_cols) { - delete[] m_data; - m_data = new T[size]; + ei_aligned_free(m_data); + m_data = ei_aligned_malloc<T>(size); } m_cols = cols; } @@ -174,7 +196,7 @@ template<typename T, int _Cols> class ei_matrix_storage<T, Dynamic, Dynamic, _Co T *m_data; int m_rows; public: - ei_matrix_storage(int size, int rows, int) : m_data(new T[size]), m_rows(rows) {} + ei_matrix_storage(int size, int rows, int) : m_data(ei_aligned_malloc<T>(size)), m_rows(rows) {} ~ei_matrix_storage() { delete[] m_data; } int rows(void) const {return m_rows;} static int cols(void) {return _Cols;} @@ -182,8 +204,8 @@ template<typename T, int _Cols> class ei_matrix_storage<T, Dynamic, Dynamic, _Co { if(size != m_rows*_Cols) { - delete[] m_data; - m_data = new T[size]; + ei_aligned_free(m_data); + m_data = ei_aligned_malloc<T>(size); } m_rows = rows; } diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 590e03599..895e19e0e 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -135,7 +135,7 @@ struct ei_traits<Product<Lhs, Rhs, EvalMode> > | EvalBeforeAssigningBit | (ei_product_eval_mode<Lhs, Rhs>::value == (int)CacheOptimalProduct ? 
EvalBeforeNestingBit : 0)) & ( - ~(RowMajorBit | VectorizableBit) + ~(RowMajorBit | VectorizableBit | Like1DArrayBit) | ( ( !(Lhs::Flags & RowMajorBit) && (Lhs::Flags & VectorizableBit) @@ -178,7 +178,11 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm /** \internal */ template<typename DestDerived> - void _cacheOptimalEval(DestDerived& res) const; + void _cacheOptimalEval(DestDerived& res, ei_meta_false) const; + #ifdef EIGEN_VECTORIZE + template<typename DestDerived> + void _cacheOptimalEval(DestDerived& res, ei_meta_true) const; + #endif private: @@ -267,59 +271,29 @@ MatrixBase<Derived>::operator*=(const MatrixBase<OtherDerived> &other) } template<typename Derived> -template<typename Derived1, typename Derived2> -Derived& MatrixBase<Derived>::lazyAssign(const Product<Derived1,Derived2,CacheOptimalProduct>& product) +template<typename Lhs, typename Rhs> +Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheOptimalProduct>& product) { - product._cacheOptimalEval(*this); + product._cacheOptimalEval(*this, + #ifdef EIGEN_VECTORIZE + typename ei_meta_if<(Flags & VectorizableBit) + && (!(Lhs::Flags & RowMajorBit) + && (Lhs::RowsAtCompileTime!=Dynamic) + && (Lhs::RowsAtCompileTime%ei_packet_traits<Scalar>::size==0) ), + ei_meta_true,ei_meta_false>::ret() + #else + ei_meta_false + #endif + ); return derived(); } template<typename Lhs, typename Rhs, int EvalMode> template<typename DestDerived> -void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res) const +void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_false) const { res.setZero(); const int cols4 = m_lhs.cols() & 0xfffffffC; - #ifdef EIGEN_VECTORIZE - if( (Flags & VectorizableBit) && (!(Lhs::Flags & RowMajorBit)) ) - { - for(int k=0; k<this->cols(); k++) - { - int j=0; - for(; j<cols4; j+=4) - { - const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k)); - const typename ei_packet_traits<Scalar>::type 
tmp1 = ei_pset1(m_rhs.coeff(j+1,k)); - const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k)); - const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k)); - for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size) - { - res.writePacketCoeff(i,k,\ - ei_padd( - res.packetCoeff(i,k), - ei_padd( - ei_padd( - ei_pmul(tmp0, m_lhs.packetCoeff(i,j)), - ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))), - ei_padd( - ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)), - ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3)) - ) - ) - ) - ); - } - } - for(; j<m_lhs.cols(); ++j) - { - const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k)); - for (int i=0; i<this->rows(); ++i) - res.writePacketCoeff(i,k,ei_pmul(tmp, m_lhs.packetCoeff(i,j))); - } - } - } - else - #endif // EIGEN_VECTORIZE { for(int k=0; k<this->cols(); ++k) { @@ -344,4 +318,48 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res) const } } +#ifdef EIGEN_VECTORIZE +template<typename Lhs, typename Rhs, int EvalMode> +template<typename DestDerived> +void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res, ei_meta_true) const +{ + res.setZero(); + const int cols4 = m_lhs.cols() & 0xfffffffC; + for(int k=0; k<this->cols(); k++) + { + int j=0; + for(; j<cols4; j+=4) + { + const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k)); + const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k)); + const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k)); + const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k)); + for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size) + { + res.writePacketCoeff(i,k,\ + ei_padd( + res.packetCoeff(i,k), + ei_padd( + ei_padd( + ei_pmul(tmp0, m_lhs.packetCoeff(i,j)), + ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))), + ei_padd( + ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)), + ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3)) + ) + ) 
+ ) + ); + } + } + for(; j<m_lhs.cols(); ++j) + { + const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k)); + for (int i=0; i<this->rows(); ++i) + res.writePacketCoeff(i,k,ei_pmul(tmp, m_lhs.packetCoeff(i,j))); + } + } +} +#endif // EIGEN_VECTORIZE + #endif // EIGEN_PRODUCT_H diff --git a/Eigen/src/Core/Temporary.h b/Eigen/src/Core/Temporary.h index 981a0c218..9157b10e4 100644 --- a/Eigen/src/Core/Temporary.h +++ b/Eigen/src/Core/Temporary.h @@ -71,6 +71,11 @@ template<typename ExpressionType> class Temporary return m_expression.coeff(row, col); } + PacketScalar _packetCoeff(int row, int col) const + { + return m_expression.packetCoeff(row, col); + } + protected: const ExpressionType m_expression; }; diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 3c8f9ad9a..19768c1ca 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -70,6 +70,9 @@ struct ei_meta_if <false, Then, Else> { typedef Else ret; }; template<typename T, typename U> struct ei_is_same_type { enum { ret = 0 }; }; template<typename T> struct ei_is_same_type<T,T> { enum { ret = 1 }; }; +struct ei_meta_true {}; +struct ei_meta_false {}; + /** \internal * Convenient struct to get the result type of a unary or binary functor. @@ -145,19 +148,12 @@ template<typename T> struct ei_packet_traits enum {size=1}; }; -template<typename Scalar, int Rows, int Cols, unsigned int SuggestedFlags> +template<typename Scalar, int Size, unsigned int SuggestedFlags> class ei_corrected_matrix_flags { enum { is_vectorizable = ei_packet_traits<Scalar>::size > 1 - && Rows!=Dynamic - && Cols!=Dynamic - && - ( - SuggestedFlags&RowMajorBit - ? 
Cols%ei_packet_traits<Scalar>::size==0 - : Rows%ei_packet_traits<Scalar>::size==0 - ), + && (Size%ei_packet_traits<Scalar>::size==0), _flags1 = (SuggestedFlags & ~(EvalBeforeNestingBit | EvalBeforeAssigningBit)) | Like1DArrayBit }; @@ -168,19 +164,24 @@ class ei_corrected_matrix_flags }; }; +template<int _Rows, int _Cols> struct ei_size_at_compile_time +{ + enum { ret = (_Rows==Dynamic || _Cols==Dynamic) ? Dynamic : _Rows * _Cols }; +}; + template<typename T> class ei_eval { typedef typename ei_traits<T>::Scalar _Scalar; - enum { _Rows = ei_traits<T>::RowsAtCompileTime, - _Cols = ei_traits<T>::ColsAtCompileTime, + enum {_MaxRows = ei_traits<T>::MaxRowsAtCompileTime, + _MaxCols = ei_traits<T>::MaxColsAtCompileTime, _Flags = ei_traits<T>::Flags }; public: typedef Matrix<_Scalar, - _Rows, - _Cols, - ei_corrected_matrix_flags<_Scalar, _Rows, _Cols, _Flags>::ret, + ei_traits<T>::RowsAtCompileTime, + ei_traits<T>::ColsAtCompileTime, + ei_corrected_matrix_flags<_Scalar, ei_size_at_compile_time<_MaxRows,_MaxCols>::ret, _Flags>::ret, ei_traits<T>::MaxRowsAtCompileTime, ei_traits<T>::MaxColsAtCompileTime> type; }; |