diff options
Diffstat (limited to 'Eigen')
62 files changed, 2142 insertions, 610 deletions
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 6831eab3d..67e97ffb8 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -16,6 +16,15 @@ namespace Eigen { namespace internal { + template<typename _MatrixType, int _UpLo> struct traits<LDLT<_MatrixType, _UpLo> > + : traits<_MatrixType> + { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum { Flags = 0 }; + }; + template<typename MatrixType, int UpLo> struct LDLT_Traits; // PositiveSemiDef means positive semi-definite and non-zero; same for NegativeSemiDef @@ -48,20 +57,19 @@ namespace internal { * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT */ template<typename _MatrixType, int _UpLo> class LDLT + : public SolverBase<LDLT<_MatrixType, _UpLo> > { public: typedef _MatrixType MatrixType; + typedef SolverBase<LDLT> Base; + friend class SolverBase<LDLT>; + + EIGEN_GENERIC_PUBLIC_INTERFACE(LDLT) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, UpLo = _UpLo }; - typedef typename MatrixType::Scalar Scalar; - typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar; - typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 - typedef typename MatrixType::StorageIndex StorageIndex; typedef Matrix<Scalar, RowsAtCompileTime, 1, 0, MaxRowsAtCompileTime, 1> TmpMatrixType; typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType; @@ -180,6 +188,7 @@ template<typename _MatrixType, int _UpLo> class LDLT return m_sign == internal::NegativeSemiDef || m_sign == internal::ZeroSign; } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns a solution x of \f$ A x = b \f$ using the current decomposition of A. * * This function also supports in-place solves using the syntax <tt>x = decompositionObject.solve(x)</tt> . @@ -197,13 +206,8 @@ template<typename _MatrixType, int _UpLo> class LDLT */ template<typename Rhs> inline const Solve<LDLT, Rhs> - solve(const MatrixBase<Rhs>& b) const - { - eigen_assert(m_isInitialized && "LDLT is not initialized."); - eigen_assert(m_matrix.rows()==b.rows() - && "LDLT::solve(): invalid number of rows of the right hand side matrix b"); - return Solve<LDLT, Rhs>(*this, b.derived()); - } + solve(const MatrixBase<Rhs>& b) const; + #endif template<typename Derived> bool solveInPlace(MatrixBase<Derived> &bAndX) const; @@ -259,6 +263,9 @@ template<typename _MatrixType, int _UpLo> class LDLT #ifndef EIGEN_PARSED_BY_DOXYGEN template<typename RhsType, typename DstType> void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template<bool Conjugate, typename RhsType, typename DstType> + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -559,14 +566,22 @@ template<typename _MatrixType, int _UpLo> template<typename RhsType, typename DstType> void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const { - eigen_assert(rhs.rows() == rows()); + _solve_impl_transposed<true>(rhs, dst); +} + +template<typename _MatrixType,int _UpLo> +template<bool Conjugate, typename RhsType, typename DstType> +void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ // dst = P b dst = m_transpositions * rhs; // dst = L^-1 (P b) - matrixL().solveInPlace(dst); + // dst = L^-*T (P b) + matrixL().template conjugateIf<!Conjugate>().solveInPlace(dst); - // dst = D^-1 (L^-1 P b) + // dst = D^-* (L^-1 P b) + // dst = D^-1 (L^-*T P b) // more precisely, use pseudo-inverse of D (see bug 241) using std::abs; const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD()); @@ -578,7 +593,6 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons // Moreover, Lapack's xSYTRS routines use 0 for the tolerance. // Using numeric_limits::min() gives us more robustness to denormals. RealScalar tolerance = (std::numeric_limits<RealScalar>::min)(); - for (Index i = 0; i < vecD.size(); ++i) { if(abs(vecD(i)) > tolerance) @@ -587,10 +601,12 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons dst.row(i).setZero(); } - // dst = L^-T (D^-1 L^-1 P b) - matrixU().solveInPlace(dst); + // dst = L^-* (D^-* L^-1 P b) + // dst = L^-T (D^-1 L^-*T P b) + matrixL().transpose().template conjugateIf<Conjugate>().solveInPlace(dst); - // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b + // dst = P^T (L^-* D^-* L^-1 P b) = A^-1 b + // dst = P^-T (L^-T D^-1 L^-*T P b) = A^-1 b dst = m_transpositions.transpose() * dst; } #endif diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 868766365..5876966e6 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -13,6 +13,16 @@ namespace Eigen { namespace internal{ + +template<typename _MatrixType, int _UpLo> struct traits<LLT<_MatrixType, _UpLo> > + : traits<_MatrixType> +{ + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum { Flags = 0 }; +}; + template<typename MatrixType, int UpLo> struct LLT_Traits; } @@ -54,18 +64,17 @@ template<typename MatrixType, int UpLo> struct LLT_Traits; * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT */ template<typename _MatrixType, int _UpLo> class LLT + : public SolverBase<LLT<_MatrixType, _UpLo> > { public: typedef _MatrixType MatrixType; + typedef SolverBase<LLT> Base; + friend class SolverBase<LLT>; + + EIGEN_GENERIC_PUBLIC_INTERFACE(LLT) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar; - typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 - typedef typename MatrixType::StorageIndex StorageIndex; enum { PacketSize = internal::packet_traits<Scalar>::size, @@ -129,6 +138,7 @@ template<typename _MatrixType, int _UpLo> class LLT return Traits::getL(m_matrix); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A. * * Since this LLT class assumes anyway that the matrix A is invertible, the solution @@ -141,13 +151,8 @@ template<typename _MatrixType, int _UpLo> class LLT */ template<typename Rhs> inline const Solve<LLT, Rhs> - solve(const MatrixBase<Rhs>& b) const - { - eigen_assert(m_isInitialized && "LLT is not initialized."); - eigen_assert(m_matrix.rows()==b.rows() - && "LLT::solve(): invalid number of rows of the right hand side matrix b"); - return Solve<LLT, Rhs>(*this, b.derived()); - } + solve(const MatrixBase<Rhs>& b) const; + #endif template<typename Derived> void solveInPlace(const MatrixBase<Derived> &bAndX) const; @@ -205,6 +210,9 @@ template<typename _MatrixType, int _UpLo> class LLT #ifndef EIGEN_PARSED_BY_DOXYGEN template<typename RhsType, typename DstType> void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template<bool Conjugate, typename RhsType, typename DstType> + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -476,8 +484,17 @@ template<typename _MatrixType,int _UpLo> template<typename RhsType, typename DstType> void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const { - dst = rhs; - solveInPlace(dst); + _solve_impl_transposed<true>(rhs, dst); +} + +template<typename _MatrixType,int _UpLo> +template<bool Conjugate, typename RhsType, typename DstType> +void LLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + dst = rhs; + + matrixL().template conjugateIf<!Conjugate>().solveInPlace(dst); + matrixU().template conjugateIf<!Conjugate>().solveInPlace(dst); } #endif diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h index 16770fc7b..e58e68eda 100644 --- a/Eigen/src/Core/Array.h +++ b/Eigen/src/Core/Array.h @@ -178,6 +178,46 @@ class Array Base::_check_template_params(); this->template _init2<T0,T1>(val0, val1); } + + #if EIGEN_HAS_CXX11 + /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + * + * Example: \include Array_variadic_ctor_cxx11.cpp + * Output: \verbinclude Array_variadic_ctor_cxx11.out + * + * \sa Array(const std::initializer_list<std::initializer_list<Scalar>>&) + * \sa Array(Scalar), Array(Scalar,Scalar) + */ + template <typename... ArgTypes> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + : Base(a0, a1, a2, a3, args...) {} + + /** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11 + * + * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients: + * + * Example: \include Array_initializer_list_23_cxx11.cpp + * Output: \verbinclude Array_initializer_list_23_cxx11.out + * + * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered. + * + * In the case of a compile-time column 1D array, implicit transposition from a single row is allowed. + * Therefore <code> Array<int,Dynamic,1>{{1,2,3,4,5}}</code> is legal and the more verbose syntax + * <code>Array<int,Dynamic,1>{{1},{2},{3},{4},{5}}</code> can be avoided: + * + * Example: \include Array_initializer_list_vector_cxx11.cpp + * Output: \verbinclude Array_initializer_list_vector_cxx11.out + * + * In the case of fixed-sized arrays, the initializer list sizes must exactly match the array sizes, + * and implicit transposition is allowed for compile-time 1D arrays only. + * + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + */ + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Array(const std::initializer_list<std::initializer_list<Scalar> >& list) : Base(list) {} + #endif // end EIGEN_HAS_CXX11 + #else /** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */ EIGEN_DEVICE_FUNC explicit Array(const Scalar *data); @@ -189,7 +229,8 @@ class Array */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Array(Index dim); - /** constructs an initialized 1x1 Array with the given coefficient */ + /** constructs an initialized 1x1 Array with the given coefficient + * \sa const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args */ Array(const Scalar& value); /** constructs an uninitialized array with \a rows rows and \a cols columns. * @@ -197,11 +238,14 @@ class Array * it is redundant to pass these parameters, so one should use the default constructor * Array() instead. */ Array(Index rows, Index cols); - /** constructs an initialized 2D vector with given coefficients */ + /** constructs an initialized 2D vector with given coefficients + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */ Array(const Scalar& val0, const Scalar& val1); - #endif + #endif // end EIGEN_PARSED_BY_DOXYGEN - /** constructs an initialized 3D vector with given coefficients */ + /** constructs an initialized 3D vector with given coefficients + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2) { @@ -211,7 +255,9 @@ class Array m_storage.data()[1] = val1; m_storage.data()[2] = val2; } - /** constructs an initialized 4D vector with given coefficients */ + /** constructs an initialized 4D vector with given coefficients + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3) { diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h index 271033056..21cf5ea9e 100644 --- a/Eigen/src/Core/CwiseUnaryView.h +++ b/Eigen/src/Core/CwiseUnaryView.h @@ -81,7 +81,7 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in /** \returns the nested expression */ typename internal::remove_reference<MatrixTypeNested>::type& - nestedExpression() { return m_matrix.const_cast_derived(); } + nestedExpression() { return m_matrix; } protected: MatrixTypeNested m_matrix; diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index f8feefa27..65ec1f54b 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -150,8 +150,8 @@ template<typename Derived> class DenseBase * \sa SizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime */ - IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1 - || internal::traits<Derived>::MaxColsAtCompileTime == 1, + IsVectorAtCompileTime = internal::traits<Derived>::RowsAtCompileTime == 1 + || internal::traits<Derived>::ColsAtCompileTime == 1, /**< This is set to true if either the number of rows or the number of * columns is known at compile-time to be equal to 1. Indeed, in that case, * we are dealing with a column-vector (if there is only one column) or with diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 2b2ee9e2c..04a321b9f 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -214,6 +214,21 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; } template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) { return a & (~b); } +/** \internal \returns ones */ +template<typename Packet> EIGEN_DEVICE_FUNC inline Packet +ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;} + +template <typename RealScalar> +EIGEN_DEVICE_FUNC inline std::complex<RealScalar> ptrue(const std::complex<RealScalar>& /*a*/) { + RealScalar b; + b = ptrue(b); + return std::complex<RealScalar>(b, b); +} + +/** \internal \returns the bitwise not of \a a */ +template <typename Packet> EIGEN_DEVICE_FUNC inline Packet +pnot(const Packet& a) { return pxor(ptrue(a), a);} + /** \internal \returns \a a shifted by N bits to the right */ template<int N> EIGEN_DEVICE_FUNC inline int pshiftright(const int& a) { return a >> N; } @@ -250,19 +265,19 @@ pselect(const Packet& mask, const Packet& a, const Packet& b) { /** \internal \returns a <= b as a bit mask */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -pcmp_le(const Packet& a, const Packet& b); /* { return a<=b ? pnot(pxor(a,a)) : pxor(a,a); } */ +pcmp_le(const Packet& a, const Packet& b) { return a<=b ? ptrue(a) : pzero(a); } /** \internal \returns a < b as a bit mask */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -pcmp_lt(const Packet& a, const Packet& b); /* { return a<b ? pnot(pxor(a,a)) : pxor(a,a); } */ +pcmp_lt(const Packet& a, const Packet& b) { return a<b ? ptrue(a) : pzero(a); } /** \internal \returns a == b as a bit mask */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -pcmp_eq(const Packet& a, const Packet& b); /* { return a==b ? pnot(pxor(a,a)) : pxor(a,a); } */ +pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); } /** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -pcmp_lt_or_nan(const Packet& a, const Packet& b); /* { return pnot(pcmp_le(b,a)); } */ +pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); } /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet @@ -393,18 +408,39 @@ typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_trai predux_half_dowto4(const Packet& a) { return a; } -/** \internal \returns the product of the elements of \a a*/ +/** \internal \returns the product of the elements of \a a */ template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a) { return a; } -/** \internal \returns the min of the elements of \a a*/ +/** \internal \returns the min of the elements of \a a */ template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a) { return a; } -/** \internal \returns the max of the elements of \a a*/ +/** \internal \returns the max of the elements of \a a */ template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a) { return a; } +/** \internal \returns true if all coeffs of \a a means "true" + * It is supposed to be called on values returned by pcmp_*. + */ +// not needed yet +// template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a) +// { return bool(a); } + +/** \internal \returns true if any coeffs of \a a means "true" + * It is supposed to be called on values returned by pcmp_*. + */ +template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a) +{ + // Dirty but generic implementation where "true" is assumed to be non 0 and all the sames. + // It is expected that "true" is either: + // - Scalar(1) + // - bits full of ones (NaN for floats), + // - or first bit equals to 1 (1 for ints, smallest denormal for floats). + // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars. + return bool(predux(a)); +} + /** \internal \returns the reversed elements of \a a*/ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a) { return a; } diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h index 563df6e84..71377cee5 100644 --- a/Eigen/src/Core/GlobalFunctions.h +++ b/Eigen/src/Core/GlobalFunctions.h @@ -66,6 +66,11 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh) +#if EIGEN_HAS_CXX11_MATH + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asinh,scalar_asinh_op,inverse hyperbolic sine,\sa ArrayBase::asinh) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acosh,scalar_acosh_op,inverse hyperbolic cosine,\sa ArrayBase::acosh) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atanh,scalar_atanh_op,inverse hyperbolic tangent,\sa ArrayBase::atanh) +#endif EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma) diff --git a/Eigen/src/Core/IO.h b/Eigen/src/Core/IO.h index da7fd6cce..063511f24 100644 --- a/Eigen/src/Core/IO.h +++ b/Eigen/src/Core/IO.h @@ -41,6 +41,7 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& * - \b rowSuffix string printed at the end of each row * - \b matPrefix string printed at the beginning of the matrix * - \b matSuffix string printed at the end of the matrix + * - \b fill character printed to fill the empty space in aligned columns * * Example: \include IOFormat.cpp * Output: \verbinclude IOFormat.out @@ -53,9 +54,9 @@ struct IOFormat IOFormat(int _precision = StreamPrecision, int _flags = 0, const std::string& _coeffSeparator = " ", const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="", - const std::string& _matPrefix="", const std::string& _matSuffix="") + const std::string& _matPrefix="", const std::string& _matSuffix="", const char _fill=' ') : matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator), - rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags) + rowSpacer(""), coeffSeparator(_coeffSeparator), fill(_fill), precision(_precision), flags(_flags) { // TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline // don't add rowSpacer if columns are not to be aligned @@ -71,6 +72,7 @@ struct IOFormat std::string matPrefix, matSuffix; std::string rowPrefix, rowSuffix, rowSeparator, rowSpacer; std::string coeffSeparator; + char fill; int precision; int flags; }; @@ -176,18 +178,26 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& width = std::max<Index>(width, Index(sstr.str().length())); } } + std::streamsize old_width = s.width(); + char old_fill_character = s.fill(); s << fmt.matPrefix; for(Index i = 0; i < m.rows(); ++i) { if (i) s << fmt.rowSpacer; s << fmt.rowPrefix; - if(width) s.width(width); + if(width) { + s.fill(fmt.fill); + s.width(width); + } s << m.coeff(i, 0); for(Index j = 1; j < m.cols(); ++j) { s << fmt.coeffSeparator; - if (width) s.width(width); + if(width) { + s.fill(fmt.fill); + s.width(width); + } s << m.coeff(i, j); } s << fmt.rowSuffix; @@ -196,6 +206,10 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& } s << fmt.matSuffix; if(explicit_precision) s.precision(old_precision); + if(width) { + s.fill(old_fill_character); + s.width(old_width); + } return s; } diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h index 3485d8f46..377f8a5cc 100644 --- a/Eigen/src/Core/IndexedView.h +++ b/Eigen/src/Core/IndexedView.h @@ -132,7 +132,7 @@ public: /** \returns the nested expression */ typename internal::remove_reference<XprType>::type& - nestedExpression() { return m_xpr.const_cast_derived(); } + nestedExpression() { return m_xpr; } /** \returns a const reference to the object storing/generating the row indices */ const RowIndices& rowIndices() const { return m_rowIndices; } diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index 7f4a7af93..32269ed2e 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -301,6 +301,45 @@ class Matrix Base::_check_template_params(); Base::template _init2<T0,T1>(x, y); } + + #if EIGEN_HAS_CXX11 + /** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) + * + * Example: \include Matrix_variadic_ctor_cxx11.cpp + * Output: \verbinclude Matrix_variadic_ctor_cxx11.out + * + * \sa Matrix(const std::initializer_list<std::initializer_list<Scalar>>&) + */ + template <typename... ArgTypes> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + : Base(a0, a1, a2, a3, args...) {} + + /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11 + * + * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients: + * + * Example: \include Matrix_initializer_list_23_cxx11.cpp + * Output: \verbinclude Matrix_initializer_list_23_cxx11.out + * + * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered. + * + * In the case of a compile-time column vector, implicit transposition from a single row is allowed. + * Therefore <code>VectorXd{{1,2,3,4,5}}</code> is legal and the more verbose syntax + * <code>RowVectorXd{{1},{2},{3},{4},{5}}</code> can be avoided: + * + * Example: \include Matrix_initializer_list_vector_cxx11.cpp + * Output: \verbinclude Matrix_initializer_list_vector_cxx11.out + * + * In the case of fixed-sized matrices, the initializer list sizes must exactly match the matrix sizes, + * and implicit transposition is allowed for compile-time vectors only. + * + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) + */ + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list<std::initializer_list<Scalar>>& list) : Base(list) {} + #endif // end EIGEN_HAS_CXX11 + #else /** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */ EIGEN_DEVICE_FUNC @@ -319,7 +358,8 @@ class Matrix * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives). */ EIGEN_STRONG_INLINE explicit Matrix(Index dim); - /** \brief Constructs an initialized 1x1 matrix with the given coefficient */ + /** \brief Constructs an initialized 1x1 matrix with the given coefficient + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */ Matrix(const Scalar& x); /** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns. * @@ -336,11 +376,14 @@ class Matrix EIGEN_DEVICE_FUNC Matrix(Index rows, Index cols); - /** \brief Constructs an initialized 2D vector with given coefficients */ + /** \brief Constructs an initialized 2D vector with given coefficients + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */ Matrix(const Scalar& x, const Scalar& y); - #endif + #endif // end EIGEN_PARSED_BY_DOXYGEN - /** \brief Constructs an initialized 3D vector with given coefficients */ + /** \brief Constructs an initialized 3D vector with given coefficients + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) + */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z) { @@ -350,7 +393,9 @@ class Matrix m_storage.data()[1] = y; m_storage.data()[2] = z; } - /** \brief Constructs an initialized 4D vector with given coefficients */ + /** \brief Constructs an initialized 4D vector with given coefficients + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) + */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w) { diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 596cdd133..4744e5cc4 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -468,6 +468,11 @@ template<typename Derived> class MatrixBase const MatrixFunctionReturnValue<Derived> matrixFunction(StemFunction f) const; EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine) EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine) +#if EIGEN_HAS_CXX11_MATH + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, atanh, inverse hyperbolic cosine) + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, acosh, inverse hyperbolic cosine) + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, asinh, inverse hyperbolic sine) +#endif EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine) EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine) EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root) diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h index 01cf192e9..239bbba63 100644 --- a/Eigen/src/Core/NestByValue.h +++ b/Eigen/src/Core/NestByValue.h @@ -16,7 +16,11 @@ namespace Eigen { namespace internal { template<typename ExpressionType> struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType> -{}; +{ + enum { + Flags = traits<ExpressionType>::Flags & ~NestByRefBit + }; +}; } /** \class NestByValue @@ -43,55 +47,11 @@ template<typename ExpressionType> class NestByValue EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); } EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); } - EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); } - EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); } - - EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const - { - return m_expression.coeff(row, col); - } - - EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col) - { - return m_expression.const_cast_derived().coeffRef(row, col); - } - - EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const - { - return m_expression.coeff(index); - } - - EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index) - { - return m_expression.const_cast_derived().coeffRef(index); - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index row, Index col) const - { - return m_expression.template packet<LoadMode>(row, col); - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC inline void writePacket(Index row, Index col, const PacketScalar& x) - { - m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x); - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index index) const - { - return m_expression.template packet<LoadMode>(index); - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& x) - { - m_expression.const_cast_derived().template writePacket<LoadMode>(index, x); - } EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; } + EIGEN_DEVICE_FUNC const ExpressionType& nestedExpression() const { return m_expression; } + protected: const ExpressionType m_expression; }; @@ -105,6 +65,21 @@ DenseBase<Derived>::nestByValue() const return NestByValue<Derived>(derived()); } +namespace internal { + +// Evaluator of Solve -> eval into a temporary +template<typename ArgType> +struct evaluator<NestByValue<ArgType> > + : public evaluator<ArgType> +{ + typedef evaluator<ArgType> Base; + + EIGEN_DEVICE_FUNC explicit evaluator(const NestByValue<ArgType>& xpr) + : Base(xpr.nestedExpression()) + {} +}; +} + } // end namespace Eigen #endif // EIGEN_NESTBYVALUE_H diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index f551dabb0..2deaa5aab 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h @@ -526,6 +526,71 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type // EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } + #if EIGEN_HAS_CXX11 + /** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients. \cpp11 + * + * \only_for_vectors + * + * This constructor is for 1D array or vectors with more than 4 coefficients. + * There exists c++98 anologue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients. + * + * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this + * constructor must match the the fixed number of rows (resp. columns) of \c *this. + */ + template <typename... ArgTypes> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + : m_storage() + { + _check_template_params(); + EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, sizeof...(args) + 4); + m_storage.data()[0] = a0; + m_storage.data()[1] = a1; + m_storage.data()[2] = a2; + m_storage.data()[3] = a3; + int i = 4; + auto x = {(m_storage.data()[i++] = args, 0)...}; + static_cast<void>(x); + } + + /** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer + * lists \cpp11 + */ + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list<std::initializer_list<Scalar>>& list) + : m_storage() + { + _check_template_params(); + + size_t list_size = 0; + if (list.begin() != list.end()) { + list_size = list.begin()->size(); + } + + // This is to allow syntax like VectorXi {{1, 2, 3, 4}} + if (ColsAtCompileTime == 1 && list.size() == 1) { + eigen_assert(list_size == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic); + resize(list_size, ColsAtCompileTime); + std::copy(list.begin()->begin(), list.begin()->end(), m_storage.data()); + } else { + eigen_assert(list.size() == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic); + eigen_assert(list_size == static_cast<size_t>(ColsAtCompileTime) || ColsAtCompileTime == Dynamic); + resize(list.size(), list_size); + + Index row_index = 0; + for (const std::initializer_list<Scalar>& row : list) { + eigen_assert(list_size == row.size()); + Index col_index = 0; + for (const Scalar& e : row) { + coeffRef(row_index, col_index) = e; + ++col_index; + } + ++row_index; + } + } + } + #endif // end EIGEN_HAS_CXX11 + /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */ template<typename OtherDerived> EIGEN_DEVICE_FUNC diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index 720b6030c..e231a7d7d 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -397,6 +397,8 @@ public: * The template parameter \a BinaryOp is the type of the functor \a func which must be * an associative operator. Both current C++98 and C++11 functor styles are handled. * + * \warning the matrix must be not empty, otherwise an assertion is triggered. + * * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise() */ template<typename Derived> @@ -415,6 +417,7 @@ DenseBase<Derived>::redux(const Func& func) const } /** \returns the minimum of all coefficients of \c *this. + * \warning the matrix must be not empty, otherwise an assertion is triggered. * \warning the result is undefined if \c *this contains NaN. */ template<typename Derived> @@ -425,6 +428,7 @@ DenseBase<Derived>::minCoeff() const } /** \returns the maximum of all coefficients of \c *this. + * \warning the matrix must be not empty, otherwise an assertion is triggered. * \warning the result is undefined if \c *this contains NaN. */ template<typename Derived> diff --git a/Eigen/src/Core/Reshaped.h b/Eigen/src/Core/Reshaped.h index b7bd1b292..c955815e6 100644 --- a/Eigen/src/Core/Reshaped.h +++ b/Eigen/src/Core/Reshaped.h @@ -191,7 +191,7 @@ class ReshapedImpl_dense<XprType,Rows,Cols,Order,false> /** \returns the nested expression */ EIGEN_DEVICE_FUNC typename internal::remove_reference<XprType>::type& - nestedExpression() { return m_xpr.const_cast_derived(); } + nestedExpression() { return m_xpr; } protected: diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h index 8b6b3ab03..711dbcf9a 100644 --- a/Eigen/src/Core/Reverse.h +++ b/Eigen/src/Core/Reverse.h @@ -203,7 +203,7 @@ struct vectorwise_reverse_inplace_impl<Horizontal> template<typename ExpressionType, int Direction> EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType,Direction>::reverseInPlace() { - internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived()); + internal::vectorwise_reverse_inplace_impl<Direction>::run(m_matrix); } } // end namespace Eigen diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h index 2cf3fa1ef..2173799d9 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h @@ -61,6 +61,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView typedef typename internal::traits<SelfAdjointView>::Scalar Scalar; typedef typename MatrixType::StorageIndex StorageIndex; typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType; + typedef SelfAdjointView<typename internal::add_const<MatrixType>::type, UpLo> ConstSelfAdjointView; enum { Mode = internal::traits<SelfAdjointView>::Mode, @@ -197,6 +198,18 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView inline const ConjugateReturnType conjugate() const { return ConjugateReturnType(m_matrix.conjugate()); } + /** \returns an expression of the complex conjugate of \c *this if Cond==true, + * returns \c *this otherwise. + */ + template<bool Cond> + EIGEN_DEVICE_FUNC + inline typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type + conjugateIf() const + { + typedef typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type ReturnType; + return ReturnType(m_matrix.template conjugateIf<Cond>()); + } + typedef SelfAdjointView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType; /** \sa MatrixBase::adjoint() const */ EIGEN_DEVICE_FUNC diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h index 2bf940a26..ec4b4a987 100644 --- a/Eigen/src/Core/Solve.h +++ b/Eigen/src/Core/Solve.h @@ -19,7 +19,7 @@ template<typename Decomposition, typename RhsType, typename StorageKind> class S * * \brief Pseudo expression representing a solving operation * - * \tparam Decomposition the type of the matrix or decomposion object + * \tparam Decomposition the type of the matrix or decomposition object * \tparam Rhstype the type of the right-hand side * * This class represents an expression of A.solve(B) diff --git a/Eigen/src/Core/SolverBase.h b/Eigen/src/Core/SolverBase.h index 702a5485c..501461042 100644 --- a/Eigen/src/Core/SolverBase.h +++ b/Eigen/src/Core/SolverBase.h @@ -14,8 +14,35 @@ namespace Eigen { namespace internal { +template<typename Derived> +struct solve_assertion { + template<bool Transpose_, typename Rhs> + static void run(const Derived& solver, const Rhs& b) { solver.template _check_solve_assertion<Transpose_>(b); } +}; + +template<typename Derived> +struct solve_assertion<Transpose<Derived> > +{ + typedef Transpose<Derived> type; + + template<bool Transpose_, typename Rhs> + static void run(const type& transpose, const Rhs& b) + { + internal::solve_assertion<typename internal::remove_all<Derived>::type>::template run<true>(transpose.nestedExpression(), b); + } +}; +template<typename Scalar, typename Derived> +struct solve_assertion<CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived> > > +{ + typedef CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived> > type; + template<bool Transpose_, typename Rhs> + static void run(const type& adjoint, const Rhs& b) + { + internal::solve_assertion<typename internal::remove_all<Transpose<Derived> >::type>::template run<true>(adjoint.nestedExpression(), b); + } +}; } // end namespace internal /** \class SolverBase @@ -35,7 +62,7 @@ namespace internal { * * \warning Currently, any other usage of transpose() and adjoint() are not supported and will produce compilation errors. * - * \sa class PartialPivLU, class FullPivLU + * \sa class PartialPivLU, class FullPivLU, class HouseholderQR, class ColPivHouseholderQR, class FullPivHouseholderQR, class CompleteOrthogonalDecomposition, class LLT, class LDLT, class SVDBase */ template<typename Derived> class SolverBase : public EigenBase<Derived> @@ -46,6 +73,9 @@ class SolverBase : public EigenBase<Derived> typedef typename internal::traits<Derived>::Scalar Scalar; typedef Scalar CoeffReturnType; + template<typename Derived_> + friend struct internal::solve_assertion; + enum { RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime, ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime, @@ -75,7 +105,7 @@ class SolverBase : public EigenBase<Derived> inline const Solve<Derived, Rhs> solve(const MatrixBase<Rhs>& b) const { - eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b"); + internal::solve_assertion<typename internal::remove_all<Derived>::type>::template run<false>(derived(), b); return Solve<Derived, Rhs>(derived(), b.derived()); } @@ -113,6 +143,13 @@ class SolverBase : public EigenBase<Derived> } protected: + + template<bool Transpose_, typename Rhs> + void _check_solve_assertion(const Rhs& b) const { + EIGEN_ONLY_USED_FOR_DEBUG(b); + eigen_assert(derived().m_isInitialized && "Solver is not initialized."); + eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "SolverBase::solve(): invalid number of rows of the right hand side matrix b"); + } }; namespace internal { diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index d7c204579..91a9ab1b9 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -392,7 +392,8 @@ struct checkTransposeAliasing_impl<Derived, OtherDerived, false> template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src) { - internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src); + if((!Dst::IsVectorAtCompileTime) && dst.rows()>1 && dst.cols()>1) + internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src); } } // end namespace internal diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 521de6160..cf3532f06 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -198,6 +198,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView typedef typename internal::traits<TriangularView>::MatrixTypeNestedNonRef MatrixTypeNestedNonRef; typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType; + typedef TriangularView<typename internal::add_const<MatrixType>::type, _Mode> ConstTriangularView; public: @@ -243,6 +244,18 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView inline const ConjugateReturnType conjugate() const { return ConjugateReturnType(m_matrix.conjugate()); } + /** \returns an expression of the complex conjugate of \c *this if Cond==true, + * returns \c *this otherwise. + */ + template<bool Cond> + EIGEN_DEVICE_FUNC + inline typename internal::conditional<Cond,ConjugateReturnType,ConstTriangularView>::type + conjugateIf() const + { + typedef typename internal::conditional<Cond,ConjugateReturnType,ConstTriangularView>::type ReturnType; + return ReturnType(m_matrix.template conjugateIf<Cond>()); + } + typedef TriangularView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType; /** \sa MatrixBase::adjoint() const */ EIGEN_DEVICE_FUNC diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h index a88b6e736..db0b9f8c4 100644 --- a/Eigen/src/Core/VectorwiseOp.h +++ b/Eigen/src/Core/VectorwiseOp.h @@ -173,6 +173,14 @@ struct member_redux { * Example: \include MatrixBase_colwise_iterator_cxx11.cpp * Output: \verbinclude MatrixBase_colwise_iterator_cxx11.out * + * For a partial reduction on an empty input, some rules apply. + * For the sake of clarity, let's consider a vertical reduction: + * - If the number of columns is zero, then a 1x0 row-major vector expression is returned. + * - Otherwise, if the number of rows is zero, then + * - a row vector of zeros is returned for sum-like reductions (sum, squaredNorm, norm, etc.) + * - a row vector of ones is returned for a product reduction (e.g., <code>MatrixXd(n,0).colwise().prod()</code>) + * - an assert is triggered for all other reductions (minCoeff,maxCoeff,redux(bin_op)) + * * \sa DenseBase::colwise(), DenseBase::rowwise(), class PartialReduxExpr */ template<typename ExpressionType, int Direction> class VectorwiseOp @@ -294,13 +302,19 @@ template<typename ExpressionType, int Direction> class VectorwiseOp * The template parameter \a BinaryOp is the type of the functor * of the custom redux operator. Note that func must be an associative operator. * + * \warning the size along the reduction direction must be strictly positive, + * otherwise an assertion is triggered. + * * \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise() */ template<typename BinaryOp> EIGEN_DEVICE_FUNC const typename ReduxReturnType<BinaryOp>::Type redux(const BinaryOp& func = BinaryOp()) const - { return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func)); } + { + eigen_assert(redux_length()>0 && "you are using an empty matrix"); + return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func)); + } typedef typename ReturnType<internal::member_minCoeff>::Type MinCoeffReturnType; typedef typename ReturnType<internal::member_maxCoeff>::Type MaxCoeffReturnType; @@ -325,6 +339,9 @@ template<typename ExpressionType, int Direction> class VectorwiseOp /** \returns a row (or column) vector expression of the smallest coefficient * of each column (or row) of the referenced expression. * + * \warning the size along the reduction direction must be strictly positive, + * otherwise an assertion is triggered. + * * \warning the result is undefined if \c *this contains NaN. * * Example: \include PartialRedux_minCoeff.cpp @@ -333,11 +350,17 @@ template<typename ExpressionType, int Direction> class VectorwiseOp * \sa DenseBase::minCoeff() */ EIGEN_DEVICE_FUNC const MinCoeffReturnType minCoeff() const - { return MinCoeffReturnType(_expression()); } + { + eigen_assert(redux_length()>0 && "you are using an empty matrix"); + return MinCoeffReturnType(_expression()); + } /** \returns a row (or column) vector expression of the largest coefficient * of each column (or row) of the referenced expression. * + * \warning the size along the reduction direction must be strictly positive, + * otherwise an assertion is triggered. + * * \warning the result is undefined if \c *this contains NaN. * * Example: \include PartialRedux_maxCoeff.cpp @@ -346,7 +369,10 @@ template<typename ExpressionType, int Direction> class VectorwiseOp * \sa DenseBase::maxCoeff() */ EIGEN_DEVICE_FUNC const MaxCoeffReturnType maxCoeff() const - { return MaxCoeffReturnType(_expression()); } + { + eigen_assert(redux_length()>0 && "you are using an empty matrix"); + return MaxCoeffReturnType(_expression()); + } /** \returns a row (or column) vector expression of the squared norm * of each column (or row) of the referenced expression. @@ -531,7 +557,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) //eigen_assert((m_matrix.isNull()) == (other.isNull())); FIXME - return const_cast<ExpressionType&>(m_matrix = extendedTo(other.derived())); + return m_matrix = extendedTo(other.derived()); } /** Adds the vector \a other to each subvector of \c *this */ @@ -541,7 +567,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp { EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) - return const_cast<ExpressionType&>(m_matrix += extendedTo(other.derived())); + return m_matrix += extendedTo(other.derived()); } /** Substracts the vector \a other to each subvector of \c *this */ @@ -551,7 +577,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp { EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) - return const_cast<ExpressionType&>(m_matrix -= extendedTo(other.derived())); + return m_matrix -= extendedTo(other.derived()); } /** Multiples each subvector of \c *this by the vector \a other */ @@ -563,7 +589,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) m_matrix *= extendedTo(other.derived()); - return const_cast<ExpressionType&>(m_matrix); + return m_matrix; } /** Divides each subvector of \c *this by the vector \a other */ @@ -575,7 +601,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) m_matrix /= extendedTo(other.derived()); - return const_cast<ExpressionType&>(m_matrix); + return m_matrix; } /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */ @@ -690,6 +716,10 @@ template<typename ExpressionType, int Direction> class VectorwiseOp const HNormalizedReturnType hnormalized() const; protected: + Index redux_length() const + { + return Direction==Vertical ? m_matrix.rows() : m_matrix.cols(); + } ExpressionTypeNested m_matrix; }; diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h index 54c1883d9..f67d83bd1 100644 --- a/Eigen/src/Core/Visitor.h +++ b/Eigen/src/Core/Visitor.h @@ -40,6 +40,14 @@ struct visitor_impl<Visitor, Derived, 1> } }; +// This specialization enables visitors on empty matrices at compile-time +template<typename Visitor, typename Derived> +struct visitor_impl<Visitor, Derived, 0> { + EIGEN_DEVICE_FUNC + static inline void run(const Derived &/*mat*/, Visitor& /*visitor*/) + {} +}; + template<typename Visitor, typename Derived> struct visitor_impl<Visitor, Derived, Dynamic> { @@ -98,6 +106,8 @@ protected: * * \note compared to one or two \em for \em loops, visitors offer automatic * unrolling for small fixed size matrix. + * + * \note if the matrix is empty, then the visitor is left unchanged. * * \sa minCoeff(Index*,Index*), maxCoeff(Index*,Index*), DenseBase::redux() */ @@ -106,6 +116,9 @@ template<typename Visitor> EIGEN_DEVICE_FUNC void DenseBase<Derived>::visit(Visitor& visitor) const { + if(size()==0) + return; + typedef typename internal::visitor_evaluator<Derived> ThisEvaluator; ThisEvaluator thisEval(derived()); @@ -124,6 +137,8 @@ namespace internal { template <typename Derived> struct coeff_visitor { + // default initialization to avoid countless invalid maybe-uninitialized warnings by gcc + coeff_visitor() : row(-1), col(-1), res(0) {} typedef typename Derived::Scalar Scalar; Index row, col; Scalar res; @@ -196,6 +211,9 @@ struct functor_traits<max_coeff_visitor<Scalar> > { /** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const * \returns the minimum of all coefficients of *this and puts in *row and *col its location. + * + * \warning the matrix must be not empty, otherwise an assertion is triggered. + * * \warning the result is undefined if \c *this contains NaN. * * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff() @@ -206,6 +224,8 @@ EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const { + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + internal::min_coeff_visitor<Derived> minVisitor; this->visit(minVisitor); *rowId = minVisitor.row; @@ -214,6 +234,9 @@ DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const } /** \returns the minimum of all coefficients of *this and puts in *index its location. + * + * \warning the matrix must be not empty, otherwise an assertion is triggered. + * * \warning the result is undefined if \c *this contains NaN. * * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff() @@ -224,6 +247,8 @@ EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::minCoeff(IndexType* index) const { + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) internal::min_coeff_visitor<Derived> minVisitor; this->visit(minVisitor); @@ -233,6 +258,9 @@ DenseBase<Derived>::minCoeff(IndexType* index) const /** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const * \returns the maximum of all coefficients of *this and puts in *row and *col its location. + * + * \warning the matrix must be not empty, otherwise an assertion is triggered. + * * \warning the result is undefined if \c *this contains NaN. * * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff() @@ -243,6 +271,8 @@ EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const { + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + internal::max_coeff_visitor<Derived> maxVisitor; this->visit(maxVisitor); *rowPtr = maxVisitor.row; @@ -251,6 +281,9 @@ DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const } /** \returns the maximum of all coefficients of *this and puts in *index its location. + * + * \warning the matrix must be not empty, otherwise an assertion is triggered. + * * \warning the result is undefined if \c *this contains NaN. * * \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff() @@ -261,6 +294,8 @@ EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::maxCoeff(IndexType* index) const { + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) internal::max_coeff_visitor<Derived> maxVisitor; this->visit(maxVisitor); diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index e7e2a1033..5b8ff59bd 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -69,6 +69,14 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, con return Packet4cf(result); } +template <> +EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) { + __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ); + return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1))); +} + +template<> EIGEN_STRONG_INLINE Packet4cf ptrue<Packet4cf>(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); } +template<> EIGEN_STRONG_INLINE Packet4cf pnot<Packet4cf>(const Packet4cf& a) { return Packet4cf(pnot(Packet8f(a.v))); } template<> EIGEN_STRONG_INLINE Packet4cf pand <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf por <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pxor <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } @@ -276,6 +284,14 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, con return Packet2cd(_mm256_addsub_pd(even, odd)); } +template <> +EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) { + __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ); + return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5))); +} + +template<> EIGEN_STRONG_INLINE Packet2cd ptrue<Packet2cd>(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet2cd pnot<Packet2cd>(const Packet2cd& a) { return Packet2cd(pnot(Packet4d(a.v))); } template<> EIGEN_STRONG_INLINE Packet2cd pand <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd por <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pxor <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index e771c0f25..ee00f1f7d 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -228,6 +228,7 @@ template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); } template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); } template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); } +template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); } template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); } template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) { @@ -249,6 +250,37 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { ret template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); } + +template<> EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) { +#ifdef EIGEN_VECTORIZE_AVX2 + // vpcmpeqd has lower latency than the more general vcmpps + return _mm256_cmpeq_epi32(a,a); +#else + const __m256 b = _mm256_castsi256_ps(a); + return _mm256_castps_si256(_mm256_cmp_ps(b,b,_CMP_TRUE_UQ)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8f ptrue<Packet8f>(const Packet8f& a) { +#ifdef EIGEN_VECTORIZE_AVX2 + // vpcmpeqd has lower latency than the more general vcmpps + const __m256i b = _mm256_castps_si256(a); + return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b,b)); +#else + return _mm256_cmp_ps(a,a,_CMP_TRUE_UQ); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet4d ptrue<Packet4d>(const Packet4d& a) { +#ifdef EIGEN_VECTORIZE_AVX2 + // vpcmpeqq has lower latency than the more general vcmppd + const __m256i b = _mm256_castpd_si256(a); + return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b,b)); +#else + return _mm256_cmp_pd(a,a,_CMP_TRUE_UQ); +#endif +} + template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) { @@ -575,6 +607,16 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a) return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); } +// not needed yet +// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x) +// { +// return _mm256_movemask_ps(x)==0xFF; +// } + +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x) +{ + return _mm256_movemask_ps(x)!=0; +} template<int Offset> struct palign_impl<Offset,Packet8f> diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 569ee01ff..9a89dd01f 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -56,6 +56,8 @@ template<> struct unpacket_traits<Packet8cf> { typedef Packet4cf half; }; +template<> EIGEN_STRONG_INLINE Packet8cf ptrue<Packet8cf>(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); } +template<> EIGEN_STRONG_INLINE Packet8cf pnot<Packet8cf>(const Packet8cf& a) { return Packet8cf(pnot(Packet16f(a.v))); } template<> EIGEN_STRONG_INLINE Packet8cf padd<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet8cf psub<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a) @@ -67,7 +69,7 @@ template<> EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a) const __m512 mask = _mm512_castsi512_ps(_mm512_setr_epi32( 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000, 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet8cf(_mm512_xor_ps(a.v,mask)); + return Packet8cf(pxor(a.v,mask)); } template<> EIGEN_STRONG_INLINE Packet8cf pmul<Packet8cf>(const Packet8cf& a, const Packet8cf& b) @@ -76,10 +78,16 @@ template<> EIGEN_STRONG_INLINE Packet8cf pmul<Packet8cf>(const Packet8cf& a, con return Packet8cf(_mm512_fmaddsub_ps(_mm512_moveldup_ps(a.v), b.v, tmp2)); } -template<> EIGEN_STRONG_INLINE Packet8cf pand <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_and_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet8cf por <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_or_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet8cf pxor <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_xor_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet8cf pandnot<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_andnot_ps(b.v,a.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pand <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pand(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf por <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(por(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pxor <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pxor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pandnot<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pandnot(a.v,b.v)); } + +template <> +EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) { + __m512 eq = pcmp_eq<Packet16f>(a.v, b.v); + return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1))); +} template<> EIGEN_STRONG_INLINE Packet8cf pload <Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload<Packet16f>(&numext::real_ref(*from))); } template<> EIGEN_STRONG_INLINE Packet8cf ploadu<Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu<Packet16f>(&numext::real_ref(*from))); } @@ -125,20 +133,20 @@ template<> EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) { template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet8cf>(const Packet8cf& a) { - return predux(padd(Packet4cf(_mm512_extractf32x8_ps(a.v,0)), - Packet4cf(_mm512_extractf32x8_ps(a.v,1)))); + return predux(padd(Packet4cf(extract256<0>(a.v)), + Packet4cf(extract256<1>(a.v)))); } template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet8cf>(const Packet8cf& a) { - return predux_mul(pmul(Packet4cf(_mm512_extractf32x8_ps(a.v, 0)), - Packet4cf(_mm512_extractf32x8_ps(a.v, 1)))); + return predux_mul(pmul(Packet4cf(extract256<0>(a.v)), + Packet4cf(extract256<1>(a.v)))); } template <> EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4<Packet8cf>(const Packet8cf& a) { - __m256 lane0 = _mm512_extractf32x8_ps(a.v, 0); - __m256 lane1 = _mm512_extractf32x8_ps(a.v, 1); + __m256 lane0 = extract256<0>(a.v); + __m256 lane1 = extract256<1>(a.v); __m256 res = _mm256_add_ps(lane0, lane1); return Packet4cf(res); } @@ -264,10 +272,18 @@ template<> EIGEN_STRONG_INLINE Packet4cd pmul<Packet4cd>(const Packet4cd& a, con return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd)); } -template<> EIGEN_STRONG_INLINE Packet4cd pand <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_and_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet4cd por <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_or_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet4cd pxor <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_xor_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet4cd pandnot<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_andnot_pd(b.v,a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd ptrue<Packet4cd>(const Packet4cd& a) { return Packet4cd(ptrue(Packet8d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet4cd pnot<Packet4cd>(const Packet4cd& a) { return Packet4cd(pnot(Packet8d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet4cd pand <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd por <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pxor <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pandnot<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pandnot(a.v,b.v)); } + +template <> +EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) { + __m512d eq = pcmp_eq<Packet8d>(a.v, b.v); + return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55))); +} template<> EIGEN_STRONG_INLINE Packet4cd pload <Packet4cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload<Packet8d>((const double*)from)); } @@ -294,23 +310,23 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex< template<> EIGEN_DEVICE_FUNC inline Packet4cd pgather<std::complex<double>, Packet4cd>(const std::complex<double>* from, Index stride) { return Packet4cd(_mm512_insertf64x4(_mm512_castpd256_pd512( - _mm256_insertf128_pd(_mm256_castpd128_pd256(pload<Packet1cd>(from+0*stride).v), pload<Packet1cd>(from+1*stride).v,1)), - _mm256_insertf128_pd(_mm256_castpd128_pd256(pload<Packet1cd>(from+2*stride).v), pload<Packet1cd>(from+3*stride).v,1), 1)); + _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+0*stride).v), ploadu<Packet1cd>(from+1*stride).v,1)), + _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+2*stride).v), ploadu<Packet1cd>(from+3*stride).v,1), 1)); } template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet4cd>(std::complex<double>* to, const Packet4cd& from, Index stride) { __m512i fromi = _mm512_castpd_si512(from.v); double* tod = (double*)(void*)to; - _mm_store_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) ); - _mm_store_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) ); - _mm_store_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) ); - _mm_store_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) ); + _mm_storeu_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) ); + _mm_storeu_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) ); + _mm_storeu_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) ); + _mm_storeu_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) ); } template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet4cd>(const Packet4cd& a) { - __m128d low = _mm512_extractf64x2_pd(a.v, 0); + __m128d low = extract128<0>(a.v); EIGEN_ALIGN16 double res[2]; _mm_store_pd(res, low); return std::complex<double>(res[0],res[1]); @@ -392,12 +408,40 @@ template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& x EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8cf,4>& kernel) { - ptranspose(reinterpret_cast<PacketBlock<Packet8d,4>&>(kernel)); + PacketBlock<Packet8d,4> pb; + + pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v); + pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v); + pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v); + pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v); + ptranspose(pb); + kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]); + kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]); + kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]); + kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]); } EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8cf,8>& kernel) { - ptranspose(reinterpret_cast<PacketBlock<Packet8d,8>&>(kernel)); + PacketBlock<Packet8d,8> pb; + + pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v); + pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v); + pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v); + pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v); + pb.packet[4] = _mm512_castps_pd(kernel.packet[4].v); + pb.packet[5] = _mm512_castps_pd(kernel.packet[5].v); + pb.packet[6] = _mm512_castps_pd(kernel.packet[6].v); + pb.packet[7] = _mm512_castps_pd(kernel.packet[7].v); + ptranspose(pb); + kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]); + kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]); + kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]); + kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]); + kernel.packet[4].v = _mm512_castpd_ps(pb.packet[4]); + kernel.packet[5].v = _mm512_castpd_ps(pb.packet[5]); + kernel.packet[6].v = _mm512_castpd_ps(pb.packet[6]); + kernel.packet[7].v = _mm512_castpd_ps(pb.packet[7]); } EIGEN_DEVICE_FUNC inline void diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index aac707596..c2158c538 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -47,6 +47,7 @@ plog<Packet16f>(const Packet16f& _x) { // The smallest non denormalized float number. _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000); _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000); + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000); _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000); // Polynomial coefficients. @@ -116,10 +117,16 @@ plog<Packet16f>(const Packet16f& _x) { x = padd(x, y); x = padd(x, y2); - // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. + __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ); + // Filter out invalid inputs, i.e.: + // - negative arg will be NAN, + // - 0 will be -INF. + // - +INF will be +INF return _mm512_mask_blend_ps(iszero_mask, - _mm512_mask_blend_ps(invalid_mask, x, p16f_nan), - p16f_minus_inf); + _mm512_mask_blend_ps(invalid_mask, + _mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf), + p16f_nan), + p16f_minus_inf); } #endif diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 9c3121062..2199970ad 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -57,7 +57,7 @@ template<> struct packet_traits<float> : default_packet_traits HasBlend = 0, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, -#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG +#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) #ifdef EIGEN_VECTORIZE_AVX512DQ HasLog = 1, #endif @@ -77,7 +77,7 @@ template<> struct packet_traits<double> : default_packet_traits AlignedOnScalar = 1, size = 8, HasHalfPacket = 1, -#if EIGEN_GNUC_AT_LEAST(5, 3) +#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) HasSqrt = EIGEN_FAST_MATH, HasRsqrt = EIGEN_FAST_MATH, #endif @@ -262,12 +262,78 @@ EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a, return _mm512_max_pd(b, a); } +#ifdef EIGEN_VECTORIZE_AVX512DQ +template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); } +template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); } +EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); } +#else +// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512 +template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { + return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_)); +} + +// AVX512F does not define _mm512_extractf64x2_pd to extract _m128 from _m512 +template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { + return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_)); +} + +EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { + return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)), + _mm256_castps_si256(b),1)); +} +#endif + +template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) { + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); +} + +template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) { + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); +} + +template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); +} + template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) { - __m256i lo = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0)); - __m256i hi = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1)); - return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1); + __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ); + return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu); } +template <> +EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); +} + +template <> +EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) { + __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); +} + +template <> +EIGEN_STRONG_INLINE Packet16i ptrue<Packet16i>(const Packet16i& /*a*/) { + return _mm512_set1_epi32(0xffffffffu); +} + +template <> +EIGEN_STRONG_INLINE Packet16f ptrue<Packet16f>(const Packet16f& a) { + return _mm512_castsi512_ps(ptrue<Packet16i>(_mm512_castps_si512(a))); +} + +template <> +EIGEN_STRONG_INLINE Packet8d ptrue<Packet8d>(const Packet8d& a) { + return _mm512_castsi512_pd(ptrue<Packet16i>(_mm512_castpd_si512(a))); +} template <> EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a, @@ -924,6 +990,13 @@ EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) { return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1))); } +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x) +{ + Packet16i xi = _mm512_castps_si512(x); + __mmask16 tmp = _mm512_test_epi32_mask(xi,xi); + return !_mm512_kortestz(tmp,tmp); +} + template <int Offset> struct palign_impl<Offset, Packet16f> { static EIGEN_STRONG_INLINE void run(Packet16f& first, diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 5404a624e..440d058d8 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -82,14 +82,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<f template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride) { - std::complex<float> EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex<float> af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload<Packet2cf>(af); } template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride) { - std::complex<float> EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex<float> af[2]; pstore<std::complex<float> >((std::complex<float> *) af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -128,7 +128,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::co template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) { - std::complex<float> EIGEN_ALIGN16 res[2]; + EIGEN_ALIGN16 std::complex<float> res[2]; pstore((float *)&res, a.v); return res[0]; @@ -298,14 +298,14 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<dou template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride) { - std::complex<double> EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex<double> af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload<Packet1cd>(af); } template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride) { - std::complex<double> EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex<double> af[2]; pstore<std::complex<double> >(af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -345,7 +345,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::c template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) { - std::complex<double> EIGEN_ALIGN16 res[2]; + EIGEN_ALIGN16 std::complex<double> res[2]; pstore<std::complex<double> >(res, a); return res[0]; diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 2c06003ed..9535724eb 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -324,7 +324,7 @@ pbroadcast4<Packet4i>(const int *a, template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) { - float EIGEN_ALIGN16 af[4]; + EIGEN_ALIGN16 float af[4]; af[0] = from[0*stride]; af[1] = from[1*stride]; af[2] = from[2*stride]; @@ -333,7 +333,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa } template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) { - int EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 int ai[4]; ai[0] = from[0*stride]; ai[1] = from[1*stride]; ai[2] = from[2*stride]; @@ -342,7 +342,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* f } template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) { - float EIGEN_ALIGN16 af[4]; + EIGEN_ALIGN16 float af[4]; pstore<float>(af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -351,7 +351,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, co } template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) { - int EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 int ai[4]; pstore<int>((int *)ai, from); to[0*stride] = ai[0]; to[1*stride] = ai[1]; @@ -565,8 +565,8 @@ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } -template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } +template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; } +template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; } template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { @@ -720,6 +720,11 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) return pfirst(res); } +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) +{ + return vec_any_ne(x, pzero(x)); +} + template<int Offset> struct palign_impl<Offset,Packet4f> { @@ -980,14 +985,14 @@ pbroadcast4<Packet2d>(const double *a, template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload<Packet2d>(af); } template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; pstore<double>(af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -1059,7 +1064,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 9481850c6..ce3f0fc68 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -3,7 +3,7 @@ // // Copyright (C) 2007 Julien Pommier // Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com) -// Copyright (C) 2009-2018 Gael Guennebaud <gael.guennebaud@inria.fr> +// Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -54,6 +54,7 @@ Packet plog_float(const Packet _x) // The smallest non denormalized float number. const Packet cst_min_norm_pos = pset1frombits<Packet>( 0x00800000u); const Packet cst_minus_inf = pset1frombits<Packet>( 0xff800000u); + const Packet cst_pos_inf = pset1frombits<Packet>( 0x7f800000u); // Polynomial coefficients. const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f); @@ -69,9 +70,6 @@ Packet plog_float(const Packet _x) const Packet cst_cephes_log_q1 = pset1<Packet>(-2.12194440e-4f); const Packet cst_cephes_log_q2 = pset1<Packet>(0.693359375f); - Packet invalid_mask = pcmp_lt_or_nan(x, pzero(x)); - Packet iszero_mask = pcmp_eq(x,pzero(x)); - // Truncate input values to the minimum positive normal. x = pmax(x, cst_min_norm_pos); @@ -117,8 +115,15 @@ Packet plog_float(const Packet _x) x = padd(x, y); x = padd(x, y2); - // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. - return pselect(iszero_mask, cst_minus_inf, por(x, invalid_mask)); + Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x)); + Packet iszero_mask = pcmp_eq(_x,pzero(_x)); + Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf); + // Filter out invalid inputs, i.e.: + // - negative arg will be NAN + // - 0 will be -INF + // - +INF will be +INF + return pselect(iszero_mask, cst_minus_inf, + por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask)); } // Exponential function. Works by writing "x = m*log(2) + r" where @@ -248,15 +253,68 @@ Packet pexp_double(const Packet _x) return pmax(pldexp(x,fx), _x); } -/* The code is the rewriting of the cephes sinf/cosf functions. - Precision is excellent as long as x < 8192 (I did not bother to - take into account the special handling they have for greater values - -- it does not return garbage for arguments over 8192, though, but - the extra precision is missing). - - Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the - surprising but correct result. -*/ +// The following code is inspired by the following stack-overflow answer: +// https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751 +// It has been largely optimized: +// - By-pass calls to frexp. +// - Aligned loads of required 96 bits of 2/pi. This is accomplished by +// (1) balancing the mantissa and exponent to the required bits of 2/pi are +// aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi. +// - Avoid a branch in rounding and extraction of the remaining fractional part. +// Overall, I measured a speed up higher than x2 on x86-64. +inline float trig_reduce_huge (float xf, int *quadrant) +{ + using Eigen::numext::int32_t; + using Eigen::numext::uint32_t; + using Eigen::numext::int64_t; + using Eigen::numext::uint64_t; + + const double pio2_62 = 3.4061215800865545e-19; // pi/2 * 2^-62 + const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point foramt + + // 192 bits of 2/pi for Payne-Hanek reduction + // Bits are introduced by packet of 8 to enable aligned reads. + static const uint32_t two_over_pi [] = + { + 0x00000028, 0x000028be, 0x0028be60, 0x28be60db, + 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a, + 0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4, + 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770, + 0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566, + 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410, + 0x10e41000, 0xe4100000 + }; + + uint32_t xi = numext::as_uint(xf); + // Below, -118 = -126 + 8. + // -126 is to get the exponent, + // +8 is to enable alignment of 2/pi's bits on 8 bits. + // This is possible because the fractional part of x as only 24 meaningful bits. + uint32_t e = (xi >> 23) - 118; + // Extract the mantissa and shift it to align it wrt the exponent + xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7); + + uint32_t i = e >> 3; + uint32_t twoopi_1 = two_over_pi[i-1]; + uint32_t twoopi_2 = two_over_pi[i+3]; + uint32_t twoopi_3 = two_over_pi[i+7]; + + // Compute x * 2/pi in 2.62-bit fixed-point format. + uint64_t p; + p = uint64_t(xi) * twoopi_3; + p = uint64_t(xi) * twoopi_2 + (p >> 32); + p = (uint64_t(xi * twoopi_1) << 32) + p; + + // Round to nearest: add 0.5 and extract integral part. + uint64_t q = (p + zero_dot_five) >> 62; + *quadrant = int(q); + // Now it remains to compute "r = x - q*pi/2" with high accuracy, + // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as: + // r = (p-q)*pi/2, + // where the product can be be carried out with sufficient accuracy using double precision. + p -= q<<62; + return float(double(int64_t(p)) * pio2_62); +} template<bool ComputeSine,typename Packet> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS @@ -264,80 +322,116 @@ EIGEN_UNUSED Packet psincos_float(const Packet& _x) { typedef typename unpacket_traits<Packet>::integer_packet PacketI; - const Packet cst_1 = pset1<Packet>(1.0f); - const Packet cst_half = pset1<Packet>(0.5f); - - const PacketI csti_1 = pset1<PacketI>(1); - const PacketI csti_not1 = pset1<PacketI>(~1); - const PacketI csti_2 = pset1<PacketI>(2); - const PacketI csti_3 = pset1<PacketI>(3); - - const Packet cst_sign_mask = pset1frombits<Packet>(0x80000000u); - - const Packet cst_minus_cephes_DP1 = pset1<Packet>(-0.78515625f); - const Packet cst_minus_cephes_DP2 = pset1<Packet>(-2.4187564849853515625e-4f); - const Packet cst_minus_cephes_DP3 = pset1<Packet>(-3.77489497744594108e-8f); - const Packet cst_sincof_p0 = pset1<Packet>(-1.9515295891E-4f); - const Packet cst_sincof_p1 = pset1<Packet>( 8.3321608736E-3f); - const Packet cst_sincof_p2 = pset1<Packet>(-1.6666654611E-1f); - const Packet cst_coscof_p0 = pset1<Packet>( 2.443315711809948E-005f); - const Packet cst_coscof_p1 = pset1<Packet>(-1.388731625493765E-003f); - const Packet cst_coscof_p2 = pset1<Packet>( 4.166664568298827E-002f); - const Packet cst_cephes_FOPI = pset1<Packet>( 1.27323954473516f); // 4 / M_PI - Packet x = pabs(_x); + const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f); // 2/PI + const Packet cst_rounding_magic = pset1<Packet>(12582912); // 2^23 for rounding + const PacketI csti_1 = pset1<PacketI>(1); + const Packet cst_sign_mask = pset1frombits<Packet>(0x80000000u); - // Scale x by 4/Pi to find x's octant. - Packet y = pmul(x, cst_cephes_FOPI); + Packet x = pabs(_x); - // Get the octant. We'll reduce x by this number of octants or by one more than it. - PacketI y_int = pcast<Packet,PacketI>(y); - // x's from even-numbered octants will translate to octant 0: [0, +Pi/4]. - // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0]. - // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1). - PacketI y_int1 = pand(padd(y_int, csti_1), csti_not1); // could be pbitclear<0>(...) - y = pcast<PacketI,Packet>(y_int1); + // Scale x by 2/Pi to find x's octant. + Packet y = pmul(x, cst_2oPI); + + // Rounding trick: + Packet y_round = padd(y, cst_rounding_magic); + PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24) + y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi + + // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4 + // using "Extended precision modular arithmetic" + #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) + // This version requires true FMA for high accuracy + // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08): + const float huge_th = ComputeSine ? 117435.992f : 71476.0625f; + x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x); + x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x); + x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x); + #else + // Without true FMA, the previous set of coefficients maintain 1ULP accuracy + // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7. + // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs. + + // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively. + // and 2 ULP up to: + const float huge_th = ComputeSine ? 25966.f : 18838.f; + x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000 + x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000 + x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000 + x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee + + // For the record, the following set of coefficients maintain 2ULP up + // to a slightly larger range: + // const float huge_th = ComputeSine ? 51981.f : 39086.125f; + // but it slightly fails to maintain 1ULP for two values of sin below pi. + // x = pmadd(y, pset1<Packet>(-3.140625/2.), x); + // x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x); + // x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x); + // x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x); + + // For the record, with only 3 iterations it is possible to maintain + // 1 ULP up to 3PI (maybe more) and 2ULP up to 255. + // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee + #endif + + if(predux_any(pcmp_le(pset1<Packet>(huge_th),pabs(_x)))) + { + const int PacketSize = unpacket_traits<Packet>::size; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize]; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize]; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) int y_int2[PacketSize]; + pstoreu(vals, pabs(_x)); + pstoreu(x_cpy, x); + pstoreu(y_int2, y_int); + for(int k=0; k<PacketSize;++k) + { + float val = vals[k]; + if(val>=huge_th && (numext::isfinite)(val)) + x_cpy[k] = trig_reduce_huge(val,&y_int2[k]); + } + x = ploadu<Packet>(x_cpy); + y_int = ploadu<PacketI>(y_int2); + } // Compute the sign to apply to the polynomial. - // sign = third_bit(y_int1) xor signbit(_x) - Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(pshiftleft<29>(y_int1))) - : preinterpret<Packet>(pshiftleft<29>(padd(y_int1,csti_3))); + // sin: sign = second_bit(y_int) xor signbit(_x) + // cos: sign = second_bit(y_int+1) + Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(pshiftleft<30>(y_int))) + : preinterpret<Packet>(pshiftleft<30>(padd(y_int,csti_1))); sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit - // Get the polynomial selection mask from the second bit of y_int1 + // Get the polynomial selection mask from the second bit of y_int // We'll calculate both (sin and cos) polynomials and then select from the two. - Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int1, csti_2), pzero(y_int1))); - - // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4. - // The magic pass: "Extended precision modular arithmetic" - // x = ((x - y * DP1) - y * DP2) - y * DP3 - x = pmadd(y, cst_minus_cephes_DP1, x); - x = pmadd(y, cst_minus_cephes_DP2, x); - x = pmadd(y, cst_minus_cephes_DP3, x); + Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int))); Packet x2 = pmul(x,x); - // Evaluate the cos(x) polynomial. (0 <= x <= Pi/4) - Packet y1 = cst_coscof_p0; - y1 = pmadd(y1, x2, cst_coscof_p1); - y1 = pmadd(y1, x2, cst_coscof_p2); - y1 = pmul(y1, x2); - y1 = pmul(y1, x2); - y1 = psub(y1, pmul(x2, cst_half)); - y1 = padd(y1, cst_1); - - // Evaluate the sin(x) polynomial. (Pi/4 <= x <= 0) - Packet y2 = cst_sincof_p0; - y2 = pmadd(y2, x2, cst_sincof_p1); - y2 = pmadd(y2, x2, cst_sincof_p2); + // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4) + Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f); + y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f )); + y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f )); + y1 = pmadd(y1, x2, pset1<Packet>(-0.5f)); + y1 = pmadd(y1, x2, pset1<Packet>(1.f)); + + // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4) + // octave/matlab code to compute those coefficients: + // x = (0:0.0001:pi/4)'; + // A = [x.^3 x.^5 x.^7]; + // w = ((1.-(x/(pi/4)).^2).^5)*2000+1; # weights trading relative accuracy + // c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1 + // printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1)) + // + Packet y2 = pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f); + y2 = pmadd(y2, x2, pset1<Packet>( 0.0083326873655616851693794799871284340042620897293090820312500000f)); + y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f)); y2 = pmul(y2, x2); y2 = pmadd(y2, x, x); - // Select the correct result from the two polynoms. + // Select the correct result from the two polynomials. y = ComputeSine ? pselect(poly_mask,y2,y1) : pselect(poly_mask,y1,y2); - // Update the sign + // Update the sign and filter huge inputs return pxor(y, sign_bit); } diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index eaba60e26..c1b097fb9 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -100,6 +100,117 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const do return make_double2(from, from); } +namespace { + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) & __float_as_int(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a, + const double& b) { + return __longlong_as_double(__double_as_longlong(a) & + __double_as_longlong(b)); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) | __float_as_int(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a, + const double& b) { + return __longlong_as_double(__double_as_longlong(a) | + __double_as_longlong(b)); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) ^ __float_as_int(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a, + const double& b) { + return __longlong_as_double(__double_as_longlong(a) ^ + __double_as_longlong(b)); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) & ~__float_as_int(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a, + const double& b) { + return __longlong_as_double(__double_as_longlong(a) & + ~__double_as_longlong(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a, + const float& b) { + return __int_as_float(a == b ? 0xffffffffu : 0u); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a, + const double& b) { + return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull); +} + +} // namespace + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a, + const float4& b) { + return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y), + bitwise_and(a.z, b.z), bitwise_and(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a, + const double2& b) { + return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a, + const float4& b) { + return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y), + bitwise_or(a.z, b.z), bitwise_or(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a, + const double2& b) { + return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a, + const float4& b) { + return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y), + bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a, + const double2& b) { + return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a, + const float4& b) { + return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y), + bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pandnot<double2>(const double2& a, const double2& b) { + return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a, + const float4& b) { + return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z), + eq_mask(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pcmp_eq<double2>(const double2& a, const double2& b) { + return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y)); +} template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) { return make_float4(a, a+1, a+2, a+3); diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index cc5c484b6..316ac0283 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -143,6 +143,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& return result; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue<half2>(const half2& a) { + half2 result; + *(reinterpret_cast<unsigned*>(&(result))) = 0xffffffffu; +} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<half2,2>& kernel) { @@ -640,6 +644,14 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) { + Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r; +} + +template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) { + Packet16h r; r.x = Packet8i(ptrue(a.x)); return r; +} + template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { // in some cases Packet8i is a wrapper around __m256i, so we need to // cast to Packet8i to call the correct overload. @@ -655,6 +667,13 @@ template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet Packet16h r; r.x = pandnot(Packet8i(a.x),Packet8i(b.x)); return r; } +template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = pcmp_eq(af, bf); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) { // FIXME we could do that with bit manipulation Packet16f af = half2float(a); @@ -1078,6 +1097,10 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) { + Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r; +} + template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { // in some cases Packet4i is a wrapper around __m128i, so we either need to // cast to Packet4i to directly call the intrinsics as below: @@ -1093,6 +1116,13 @@ template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h Packet8h r; r.x = _mm_andnot_si128(b.x,a.x); return r; } +template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pcmp_eq(af, bf); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 5e6de1f40..d149275b5 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -146,7 +146,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::co template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) { - std::complex<float> EIGEN_ALIGN16 x[2]; + EIGEN_ALIGN16 std::complex<float> x[2]; vst1q_f32((float *)x, a.v); return x[0]; } @@ -401,7 +401,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1c template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) { - std::complex<double> EIGEN_ALIGN16 res; + EIGEN_ALIGN16 std::complex<double> res; pstore<std::complex<double> >(&res, a); return res; diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index ca4f2bf94..e8b351849 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -375,8 +375,8 @@ template<> EIGEN_STRONG_INLINE void prefetch<float> (const float* addr) { EI template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); } // FIXME only store the 2 first elements ? -template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x[4]; vst1q_f32(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int32_t x[4]; vst1q_s32(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { float32x2_t a_lo, a_hi; @@ -551,6 +551,13 @@ template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) return vget_lane_s32(max, 0); } +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) +{ + uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)), + vget_high_u32(vreinterpretq_u32_f32(x))); + return vget_lane_u32(vpmax_u32(tmp,tmp),0); +} + // this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, // see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074 #define PALIGN_NEON(Offset,Type,Command) \ @@ -704,6 +711,8 @@ template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, con return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return vreinterpretq_f64_u64(vceqq_f64(a,b)); } + template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); } template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); } diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 911fe066e..f39988eac 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -82,6 +82,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet2cf ptrue <Packet2cf>(const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); } +template<> EIGEN_STRONG_INLINE Packet2cf pnot <Packet2cf>(const Packet2cf& a) { return Packet2cf(pnot(Packet4f(a.v))); } + template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } @@ -305,6 +308,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet1cd ptrue <Packet1cd>(const Packet1cd& a) { return Packet1cd(ptrue(Packet2d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet1cd pnot <Packet1cd>(const Packet1cd& a) { return Packet1cd(pnot(Packet2d(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } @@ -439,6 +444,18 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) { kernel.packet[1].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) +{ + __m128 eq = _mm_cmpeq_ps(a.v, b.v); + return Packet2cf(pand<Packet4f>(eq, vec4f_swizzle1(eq, 1, 0, 3, 2))); +} + +template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) +{ + __m128d eq = _mm_cmpeq_pd(a.v, b.v); + return Packet1cd(pand<Packet2d>(eq, vec2d_swizzle1(eq, 1, 0))); +} + template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { __m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); return Packet2cf(_mm_castpd_ps(result)); diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 4c7dc5b64..9c3750af0 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -374,9 +374,21 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); } +template<> EIGEN_STRONG_INLINE Packet4f +ptrue<Packet4f>(const Packet4f& a) { + Packet4i b = _mm_castps_si128(a); + return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b)); +} +template<> EIGEN_STRONG_INLINE Packet2d +ptrue<Packet2d>(const Packet2d& a) { + Packet4i b = _mm_castpd_si128(a); + return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b)); +} template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } @@ -812,6 +824,17 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) #endif // EIGEN_VECTORIZE_SSE4_1 } +// not needed yet +// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x) +// { +// return _mm_movemask_ps(x) == 0xF; +// } + +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) +{ + return _mm_movemask_ps(x) != 0x0; +} + #if EIGEN_COMP_GNUC // template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) // { diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 0c2d2cfca..55994047e 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -548,6 +548,23 @@ struct functor_traits<scalar_tanh_op<Scalar> > { }; }; +#if EIGEN_HAS_CXX11_MATH +/** \internal + * \brief Template functor to compute the atanh of a scalar + * \sa class CwiseUnaryOp, ArrayBase::atanh() + */ +template <typename Scalar> +struct scalar_atanh_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op) + EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); } +}; + +template <typename Scalar> +struct functor_traits<scalar_atanh_op<Scalar> > { + enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; +}; +#endif + /** \internal * \brief Template functor to compute the sinh of a scalar * \sa class CwiseUnaryOp, ArrayBase::sinh() @@ -567,6 +584,23 @@ struct functor_traits<scalar_sinh_op<Scalar> > }; }; +#if EIGEN_HAS_CXX11_MATH +/** \internal + * \brief Template functor to compute the asinh of a scalar + * \sa class CwiseUnaryOp, ArrayBase::asinh() + */ +template <typename Scalar> +struct scalar_asinh_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op) + EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::asinh(a); } +}; + +template <typename Scalar> +struct functor_traits<scalar_asinh_op<Scalar> > { + enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; +}; +#endif + /** \internal * \brief Template functor to compute the cosh of a scalar * \sa class CwiseUnaryOp, ArrayBase::cosh() @@ -586,6 +620,23 @@ struct functor_traits<scalar_cosh_op<Scalar> > }; }; +#if EIGEN_HAS_CXX11_MATH +/** \internal + * \brief Template functor to compute the acosh of a scalar + * \sa class CwiseUnaryOp, ArrayBase::acosh() + */ +template <typename Scalar> +struct scalar_acosh_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op) + EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::acosh(a); } +}; + +template <typename Scalar> +struct functor_traits<scalar_acosh_op<Scalar> > { + enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; +}; +#endif + /** \internal * \brief Template functor to compute the inverse of a scalar * \sa class CwiseUnaryOp, Cwise::inverse() @@ -598,9 +649,13 @@ struct scalar_inverse_op { EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const { return internal::pdiv(pset1<Packet>(Scalar(1)),a); } }; -template<typename Scalar> -struct functor_traits<scalar_inverse_op<Scalar> > -{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; }; +template <typename Scalar> +struct functor_traits<scalar_inverse_op<Scalar> > { + enum { + PacketAccess = packet_traits<Scalar>::HasDiv, + Cost = scalar_div_cost<Scalar, PacketAccess>::value + }; +}; /** \internal * \brief Template functor to compute the square of a scalar diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index afbd83eda..030c7740a 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -353,6 +353,24 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ // #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); #endif +template <typename RhsPacket, typename RhsPacketx4, int registers_taken> +struct RhsPanelHelper { + private: + static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken; + public: + typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type; +}; + +template <typename Packet> +struct QuadPacket +{ + Packet B_0, B1, B2, B3; + const Packet& get(const FixedInt<0>&) const { return B_0; } + const Packet& get(const FixedInt<1>&) const { return B1; } + const Packet& get(const FixedInt<2>&) const { return B2; } + const Packet& get(const FixedInt<3>&) const { return B3; } +}; + template <int N, typename T1, typename T2, typename T3> struct packet_conditional { typedef T3 type; }; @@ -448,29 +466,35 @@ public: typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; typedef LhsPacket LhsPacket4Packing; + typedef QuadPacket<RhsPacket> RhsPacketx4; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - pbroadcast4(b, b0, b1, b2, b3); - } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// pbroadcast2(b, b0, b1); -// } - + template<typename RhsPacketType> EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { dest = pset1<RhsPacketType>(*b); } - + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + { + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad<RhsPacket>(b); @@ -488,8 +512,8 @@ public: dest = ploadu<LhsPacketType>(a); } - template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType> - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const + template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj; // It would be a lot cleaner to call pmadd all the time. Unfortunately if we @@ -504,6 +528,12 @@ public: #endif } + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { r = pmadd(c,alpha,r); @@ -555,6 +585,8 @@ public: typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; typedef LhsPacket LhsPacket4Packing; + typedef QuadPacket<RhsPacket> RhsPacketx4; + typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) @@ -567,6 +599,20 @@ public: { dest = pset1<RhsPacketType>(*b); } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { @@ -598,18 +644,8 @@ public: dest = ploadu<LhsPacketType>(a); } - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - pbroadcast4(b, b0, b1, b2, b3); - } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// pbroadcast2(b, b0, b1); -// } - - template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType> - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp) const + template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type()); } @@ -630,10 +666,16 @@ public: c += a * b; } + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } + template <typename ResPacketType, typename AccPacketType> EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { - const conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj; + conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj; r = cj.pmadd(c,alpha,r); } @@ -756,6 +798,9 @@ public: typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket; typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket; typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket; + + // this actualy holds 8 packets! + typedef QuadPacket<RhsPacket> RhsPacketx4; EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } @@ -778,39 +823,37 @@ public: dest.first = pset1<RealPacketType>(real(*b)); dest.second = pset1<RealPacketType>(imag(*b)); } - - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - loadRhs(b,dest); + loadRhs(b, dest.B_0); + loadRhs(b + 1, dest.B1); + loadRhs(b + 2, dest.B2); + loadRhs(b + 3, dest.B3); } - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const + + // Scalar path + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const { - loadQuadToDoublePacket(b,dest); + loadRhs(b, dest); } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + + // Vectorized path + template<typename RealPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); - loadRhs(b+2, b2); - loadRhs(b+3, b3); + loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} - // Vectorized path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1) + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); + loadRhs(b,dest); } - - // Scalar path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1) + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); + loadQuadToDoublePacket(b,dest); } // nothing special here @@ -825,17 +868,26 @@ public: dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a)); } - template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType> - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/) const + template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType> + EIGEN_STRONG_INLINE + typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type + madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const + template<typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const { c = cj.pmadd(a,b,c); } + + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } @@ -914,7 +966,7 @@ public: typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; typedef LhsPacket LhsPacket4Packing; - + typedef QuadPacket<RhsPacket> RhsPacketx4; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) @@ -927,18 +979,20 @@ public: { dest = pset1<RhsPacketType>(*b); } - - void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - pbroadcast4(b, b0, b1, b2, b3); + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// // FIXME not sure that's the best way to implement it! -// b0 = pload1<RhsPacket>(b+0); -// b1 = pload1<RhsPacket>(b+1); -// } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { @@ -956,8 +1010,8 @@ public: dest = ploaddup<LhsPacketType>(a); } - template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType> - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp) const + template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type()); } @@ -979,48 +1033,135 @@ public: c += a * b; } + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } + template <typename ResPacketType, typename AccPacketType> EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { - const conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj; + conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj; r = cj.pmadd(alpha,c,r); } protected: - + }; #if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON -template<> -struct gebp_traits <float, float, false, false,Architecture::NEON> - : gebp_traits<float,float,false,false,Architecture::Generic> +template<int _PacketSize> +struct gebp_traits <float, float, false, false,Architecture::NEON,_PacketSize> + : gebp_traits<float,float,false,false,Architecture::Generic,_PacketSize> { typedef float RhsPacket; - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + typedef float32x4_t RhsPacketx4; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - loadRhs(b+0, b0); - loadRhs(b+1, b1); - loadRhs(b+2, b2); - loadRhs(b+3, b3); + dest = *b; } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - dest = *b; + dest = vld1q_f32(b); } + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const + {} + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b,dest); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/) const + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const { c = vfmaq_n_f32(c, a, b); } + + template<int LaneID> + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<LaneID>&) const + { + #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f32 is implemented through a costly dup + if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : ); + #else + c = vfmaq_laneq_f32(c, a, b, LaneID); + #endif + } +}; + + +template<> +struct gebp_traits <double, double, false, false,Architecture::NEON> + : gebp_traits<double,double,false,false,Architecture::Generic> +{ + typedef double RhsPacket; + + struct RhsPacketx4 { + float64x2_t B_0, B_1; + }; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + dest.B_0 = vld1q_f64(b); + dest.B_1 = vld1q_f64(b+2); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const + {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { + c = vfmaq_n_f64(c, a, b); + } + + template<int LaneID> + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<LaneID>&) const + { + #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f64 is implemented through a costly dup + if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); + else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); + else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); + else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); + #else + if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0); + else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1); + else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0); + else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1); + #endif + } }; #endif @@ -1044,6 +1185,9 @@ struct gebp_kernel typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; + typedef typename Traits::RhsPacketx4 RhsPacketx4; + + typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15; typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits; @@ -1148,7 +1292,7 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, SRhsPacketQuarter b0; straits.loadLhsUnaligned(blB, a0); straits.loadRhs(blA, b0); - straits.madd(a0,b0,c0,b0); + straits.madd(a0,b0,c0,b0, fix<0>); blB += SwappedTraits::LhsProgress/4; blA += 1; } @@ -1165,21 +1309,25 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper> struct lhs_process_one_packet { + typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4; - EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) { EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); - traits.loadLhs(&blA[(0+1*K)*(LhsProgress)], *A0); - traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3); - traits.madd(*A0, *B_0, *C0, *B_0); - traits.madd(*A0, *B1, *C1, *B1); - traits.madd(*A0, *B2, *C2, *B2); - traits.madd(*A0, *B3, *C3, *B3); + traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0); + traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel); + traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>); + traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>); + traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>); + traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>); EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); } - EIGEN_STRONG_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha, Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB, Index prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4) + EIGEN_STRONG_INLINE void operator()( + const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha, + Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB, + int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4) { GEBPTraits traits; @@ -1221,18 +1369,19 @@ struct lhs_process_one_packet for(Index k=0; k<peeled_kc; k+=pk) { EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4"); - RhsPacket B_0, B1, B2, B3; + RhsPacketx4 rhs_panel; + RhsPacket T0; internal::prefetch(blB+(48+0)); - peeled_kc_onestep(0, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3); - peeled_kc_onestep(1, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3); - peeled_kc_onestep(2, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3); - peeled_kc_onestep(3, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3); + peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(1, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(3, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); internal::prefetch(blB+(48+16)); - peeled_kc_onestep(4, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3); - peeled_kc_onestep(5, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3); - peeled_kc_onestep(6, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3); - peeled_kc_onestep(7, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3); + peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(5, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(7, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); blB += pk*4*RhsProgress; blA += pk*LhsProgress; @@ -1243,8 +1392,9 @@ struct lhs_process_one_packet // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) { - RhsPacket B_0, B1, B2, B3; - peeled_kc_onestep(0, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3); + RhsPacketx4 rhs_panel; + RhsPacket T0; + peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); blB += 4*RhsProgress; blA += LhsProgress; } @@ -1293,9 +1443,10 @@ struct lhs_process_one_packet do { \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + /* FIXME: why unaligned???? */ \ traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ - traits.madd(A0, B_0, C0, B_0); \ + traits.madd(A0, B_0, C0, B_0, fix<0>); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \ } while(false); @@ -1372,7 +1523,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0; enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) const Index peeled_kc = depth & ~(pk-1); - const Index prefetch_res_offset = 32/sizeof(ResScalar); + const int prefetch_res_offset = 32/sizeof(ResScalar); // const Index depth2 = depth & ~1; //---------- Process 3 * LhsProgress rows at once ---------- @@ -1430,36 +1581,48 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga for(Index k=0; k<peeled_kc; k+=pk) { EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4"); - RhsPacket B_0, T0; + // 15 registers are taken (12 for acc, 2 for lhs). + RhsPanel15 rhs_panel; + RhsPacket T0; LhsPacket A2; - -#define EIGEN_GEBP_ONESTEP(K) \ - do { \ - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ + #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0)) + // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633 + // without this workaround A0, A1, and A2 are loaded in the same register, + // which is not good for pipelining + #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2)); + #else + #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND + #endif +#define EIGEN_GEBP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ - internal::prefetch(blA+(3*K+16)*LhsProgress); \ - if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \ - traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ - traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ - traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ - traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \ - traits.madd(A0, B_0, C0, T0); \ - traits.madd(A1, B_0, C4, T0); \ - traits.madd(A2, B_0, C8, B_0); \ - traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \ - traits.madd(A0, B_0, C1, T0); \ - traits.madd(A1, B_0, C5, T0); \ - traits.madd(A2, B_0, C9, B_0); \ - traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \ - traits.madd(A0, B_0, C2, T0); \ - traits.madd(A1, B_0, C6, T0); \ - traits.madd(A2, B_0, C10, B_0); \ - traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \ - traits.madd(A0, B_0, C3 , T0); \ - traits.madd(A1, B_0, C7, T0); \ - traits.madd(A2, B_0, C11, B_0); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ - } while(false) + internal::prefetch(blA + (3 * K + 16) * LhsProgress); \ + if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \ + internal::prefetch(blB + (4 * K + 16) * RhsProgress); \ + } /* Bug 953 */ \ + traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \ + traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \ + traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \ + EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \ + traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C0, T0, fix<0>); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A2, rhs_panel, C8, T0, fix<0>); \ + traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A2, rhs_panel, C9, T0, fix<1>); \ + traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A2, rhs_panel, C10, T0, fix<2>); \ + traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + traits.madd(A2, rhs_panel, C11, T0, fix<3>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ + } while (false) internal::prefetch(blB); EIGEN_GEBP_ONESTEP(0); @@ -1479,7 +1642,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) { - RhsPacket B_0, T0; + RhsPanel15 rhs_panel; + RhsPacket T0; LhsPacket A2; EIGEN_GEBP_ONESTEP(0); blB += 4*RhsProgress; @@ -1559,20 +1723,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga { EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1"); RhsPacket B_0; -#define EIGEN_GEBGP_ONESTEP(K) \ - do { \ - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \ +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ - traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ - traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ - traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ - traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ - traits.madd(A0, B_0, C0, B_0); \ - traits.madd(A1, B_0, C4, B_0); \ - traits.madd(A2, B_0, C8, B_0); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ - } while(false) - + traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \ + traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \ + traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \ + traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B_0, fix<0>); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ + traits.madd(A2, B_0, C8, B_0, fix<0>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ + } while (false) + EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); @@ -1661,7 +1825,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga for(Index k=0; k<peeled_kc; k+=pk) { EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4"); - RhsPacket B_0, B1, B2, B3, T0; + RhsPacketx4 rhs_panel; + RhsPacket T0; // NOTE: the begin/end asm comments below work around bug 935! // but they are not enough for gcc>=6 without FMA (bug 1637) @@ -1670,24 +1835,24 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga #else #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND #endif - #define EIGEN_GEBGP_ONESTEP(K) \ - do { \ - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ - traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ - traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ - traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ - traits.madd(A0, B_0, C0, T0); \ - traits.madd(A1, B_0, C4, B_0); \ - traits.madd(A0, B1, C1, T0); \ - traits.madd(A1, B1, C5, B1); \ - traits.madd(A0, B2, C2, T0); \ - traits.madd(A1, B2, C6, B2); \ - traits.madd(A0, B3, C3, T0); \ - traits.madd(A1, B3, C7, B3); \ - EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ - } while(false) - +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ + traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \ + traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \ + traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C0, T0, fix<0>); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ + } while (false) + internal::prefetch(blB+(48+0)); EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); @@ -1707,7 +1872,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) { - RhsPacket B_0, B1, B2, B3, T0; + RhsPacketx4 rhs_panel; + RhsPacket T0; EIGEN_GEBGP_ONESTEP(0); blB += 4*RhsProgress; blA += 2*Traits::LhsProgress; @@ -1778,8 +1944,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ - traits.madd(A0, B_0, C0, B1); \ - traits.madd(A1, B_0, C4, B_0); \ + traits.madd(A0, B_0, C0, B1, fix<0>); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \ } while(false) @@ -1882,15 +2048,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga straits.loadRhsQuad(blA+0*spk, B_0); straits.loadRhsQuad(blA+1*spk, B_1); - straits.madd(A0,B_0,C0,B_0); - straits.madd(A1,B_1,C1,B_1); + straits.madd(A0,B_0,C0,B_0, fix<0>); + straits.madd(A1,B_1,C1,B_1, fix<0>); straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0); straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1); straits.loadRhsQuad(blA+2*spk, B_0); straits.loadRhsQuad(blA+3*spk, B_1); - straits.madd(A0,B_0,C2,B_0); - straits.madd(A1,B_1,C3,B_1); + straits.madd(A0,B_0,C2,B_0, fix<0>); + straits.madd(A1,B_1,C3,B_1, fix<0>); blB += 4*SwappedTraits::LhsProgress; blA += 4*spk; @@ -1903,7 +2069,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga straits.loadLhsUnaligned(blB, A0); straits.loadRhsQuad(blA, B_0); - straits.madd(A0,B_0,C0,B_0); + straits.madd(A0,B_0,C0,B_0, fix<0>); blB += SwappedTraits::LhsProgress; blA += spk; @@ -1927,7 +2093,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga straits.loadLhsUnaligned(blB, a0); straits.loadRhs(blA, b0); SAccPacketHalf c0 = predux_half_dowto4(C0); - straits.madd(a0,b0,c0,b0); + straits.madd(a0,b0,c0,b0, fix<0>); straits.acc(c0, alphav, R); } else @@ -2273,7 +2439,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa } pack -= psize; - int left = rows - i; + Index left = rows - i; if (pack <= 0) { if (!gone_last && (starting_pos == i || left >= psize/2 || left >= psize/4) && diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 121476394..68765d4b2 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -29,10 +29,15 @@ * * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link * vectorized and non-vectorized code. + * + * FIXME: this code can be cleaned up once we switch to proper C++11 only. */ #if (defined EIGEN_CUDACC) #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) #define EIGEN_ALIGNOF(x) __alignof(x) +#elif EIGEN_HAS_ALIGNAS + #define EIGEN_ALIGN_TO_BOUNDARY(n) alignas(n) + #define EIGEN_ALIGNOF(x) alignof(x) #elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) #define EIGEN_ALIGNOF(x) __alignof(x) @@ -44,7 +49,7 @@ #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) #define EIGEN_ALIGNOF(x) __alignof(x) #else - #error Please tell me what is the equivalent of __attribute__((aligned(n))) and __alignof(x) for your compiler + #error Please tell me what is the equivalent of alignas(n) and alignof(x) for your compiler #endif // If the user explicitly disable vectorization, then we also disable alignment @@ -381,7 +386,7 @@ #endif #if defined(EIGEN_HAS_CUDA_FP16) - #include <host_defines.h> + #include <cuda_runtime_api.h> #include <cuda_fp16.h> #endif diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index 3ab3a5f50..5d86a51ac 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -260,6 +260,7 @@ template<typename MatrixType> class HouseholderQR; template<typename MatrixType> class ColPivHouseholderQR; template<typename MatrixType> class FullPivHouseholderQR; template<typename MatrixType> class CompleteOrthogonalDecomposition; +template<typename MatrixType> class SVDBase; template<typename MatrixType, int QRPreconditioner = ColPivHouseholderQRPreconditioner> class JacobiSVD; template<typename MatrixType> class BDCSVD; template<typename MatrixType, int UpLo = Lower> class LLT; diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index c7dba1fc4..3a8001e8f 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -129,16 +129,21 @@ #define EIGEN_COMP_MSVC_STRICT 0 #endif -/// \internal EIGEN_COMP_IBM set to 1 if the compiler is IBM XL C++ -#if defined(__IBMCPP__) || defined(__xlc__) - #define EIGEN_COMP_IBM 1 +/// \internal EIGEN_COMP_IBM set to xlc version if the compiler is IBM XL C++ +// XLC version +// 3.1 0x0301 +// 4.5 0x0405 +// 5.0 0x0500 +// 12.1 0x0C01 +#if defined(__IBMCPP__) || defined(__xlc__) || defined(__ibmxl__) + #define EIGEN_COMP_IBM __xlC__ #else #define EIGEN_COMP_IBM 0 #endif -/// \internal EIGEN_COMP_PGI set to 1 if the compiler is Portland Group Compiler +/// \internal EIGEN_COMP_PGI set to PGI version if the compiler is Portland Group Compiler #if defined(__PGI) - #define EIGEN_COMP_PGI 1 + #define EIGEN_COMP_PGI (__PGIC__*100+__PGIC_MINOR__) #else #define EIGEN_COMP_PGI 0 #endif @@ -347,9 +352,17 @@ #define EIGEN_OS_WIN_STRICT 0 #endif -/// \internal EIGEN_OS_SUN set to 1 if the OS is SUN +/// \internal EIGEN_OS_SUN set to __SUNPRO_C if the OS is SUN +// compiler solaris __SUNPRO_C +// version studio +// 5.7 10 0x570 +// 5.8 11 0x580 +// 5.9 12 0x590 +// 5.10 12.1 0x5100 +// 5.11 12.2 0x5110 +// 5.12 12.3 0x5120 #if (defined(sun) || defined(__sun)) && !(defined(__SVR4) || defined(__svr4__)) - #define EIGEN_OS_SUN 1 + #define EIGEN_OS_SUN __SUNPRO_C #else #define EIGEN_OS_SUN 0 #endif @@ -546,6 +559,22 @@ #endif #endif +#ifndef EIGEN_HAS_ALIGNAS +#if EIGEN_MAX_CPP_VER>=11 && EIGEN_HAS_CXX11 && \ + ( __has_feature(cxx_alignas) \ + || EIGEN_HAS_CXX14 \ + || (EIGEN_COMP_MSVC >= 1800) \ + || (EIGEN_GNUC_AT_LEAST(4,8)) \ + || (EIGEN_COMP_CLANG>=305) \ + || (EIGEN_COMP_ICC>=1500) \ + || (EIGEN_COMP_PGI>=1500) \ + || (EIGEN_COMP_SUNCC>=0x5130)) +#define EIGEN_HAS_ALIGNAS 1 +#else +#define EIGEN_HAS_ALIGNAS 0 +#endif +#endif + // Does the compiler support type_traits? // - full support of type traits was added only to GCC 5.1.0. // - 20150626 corresponds to the last release of 4.x libstdc++ diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 1415b3fc1..8fcb18a94 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -636,8 +636,41 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to<double>()(x,y); } #endif +/** \internal extract the bits of the float \a x */ +inline unsigned int as_uint(float x) +{ + unsigned int ret; + std::memcpy(&ret, &x, sizeof(float)); + return ret; +} + } // end namespace numext } // end namespace Eigen +// Define portable (u)int{32,64} types +#if EIGEN_HAS_CXX11 +#include <cstdint> +namespace Eigen { +namespace numext { +typedef std::uint32_t uint32_t; +typedef std::int32_t int32_t; +typedef std::uint64_t uint64_t; +typedef std::int64_t int64_t; +} +} +#else +// Without c++11, all compilers able to compile Eigen also +// provides the C99 stdint.h header file. +#include <stdint.h> +namespace Eigen { +namespace numext { +typedef ::uint32_t uint32_t; +typedef ::int32_t int32_t; +typedef ::uint64_t uint64_t; +typedef ::int64_t int64_t; +} +} +#endif + #endif // EIGEN_META_H diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index b2f95153e..67714e444 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -104,7 +104,8 @@ STORAGE_INDEX_MUST_MATCH=1, CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1, SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1, - INVALID_TEMPLATE_PARAMETER=1 + INVALID_TEMPLATE_PARAMETER=1, + GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1 }; }; diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index 75991aaed..3090351a0 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h @@ -97,6 +97,9 @@ template<int Mode> struct transform_make_affine; * - #AffineCompact: the transformation is stored as a (Dim)x(Dim+1) matrix. * - #Projective: the transformation is stored as a (Dim+1)^2 matrix * without any assumption. + * - #Isometry: same as #Affine with the additional assumption that + * the linear part represents a rotation. This assumption is exploited + * to speed up some functions such as inverse() and rotation(). * \tparam _Options has the same meaning as in class Matrix. It allows to specify DontAlign and/or RowMajor. * These Options are passed directly to the underlying matrix type. * @@ -252,11 +255,11 @@ protected: public: /** Default constructor without initialization of the meaningful coefficients. - * If Mode==Affine, then the last row is set to [0 ... 0 1] */ + * If Mode==Affine or Mode==Isometry, then the last row is set to [0 ... 0 1] */ EIGEN_DEVICE_FUNC inline Transform() { check_template_params(); - internal::transform_make_affine<(int(Mode)==Affine) ? Affine : AffineCompact>::run(m_matrix); + internal::transform_make_affine<(int(Mode)==Affine || int(Mode)==Isometry) ? Affine : AffineCompact>::run(m_matrix); } EIGEN_DEVICE_FUNC inline Transform(const Transform& other) @@ -602,7 +605,9 @@ public: template<typename Derived> EIGEN_DEVICE_FUNC inline Transform operator*(const RotationBase<Derived,Dim>& r) const; - EIGEN_DEVICE_FUNC const LinearMatrixType rotation() const; + typedef typename internal::conditional<int(Mode)==Isometry,ConstLinearPart,const LinearMatrixType>::type RotationReturnType; + EIGEN_DEVICE_FUNC RotationReturnType rotation() const; + template<typename RotationMatrixType, typename ScalingMatrixType> EIGEN_DEVICE_FUNC void computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const; @@ -1046,20 +1051,43 @@ EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim *** Special functions *** ************************/ +namespace internal { +template<int Mode> struct transform_rotation_impl { + template<typename TransformType> + EIGEN_DEVICE_FUNC static inline + const typename TransformType::LinearMatrixType run(const TransformType& t) + { + typedef typename TransformType::LinearMatrixType LinearMatrixType; + LinearMatrixType result; + t.computeRotationScaling(&result, (LinearMatrixType*)0); + return result; + } +}; +template<> struct transform_rotation_impl<Isometry> { + template<typename TransformType> + EIGEN_DEVICE_FUNC static inline + typename TransformType::ConstLinearPart run(const TransformType& t) + { + return t.linear(); + } +}; +} /** \returns the rotation part of the transformation * + * If Mode==Isometry, then this method is an alias for linear(), + * otherwise it calls computeRotationScaling() to extract the rotation + * through a SVD decomposition. * * \svd_module * * \sa computeRotationScaling(), computeScalingRotation(), class SVD */ template<typename Scalar, int Dim, int Mode, int Options> -EIGEN_DEVICE_FUNC const typename Transform<Scalar,Dim,Mode,Options>::LinearMatrixType +EIGEN_DEVICE_FUNC +typename Transform<Scalar,Dim,Mode,Options>::RotationReturnType Transform<Scalar,Dim,Mode,Options>::rotation() const { - LinearMatrixType result; - computeRotationScaling(&result, (LinearMatrixType*)0); - return result; + return internal::transform_rotation_impl<Mode>::run(*this); } diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h index e62befcb6..9318c281f 100644 --- a/Eigen/src/Householder/HouseholderSequence.h +++ b/Eigen/src/Householder/HouseholderSequence.h @@ -156,6 +156,12 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS Side > TransposeReturnType; + typedef HouseholderSequence< + typename internal::add_const<VectorsType>::type, + typename internal::add_const<CoeffsType>::type, + Side + > ConstHouseholderSequence; + /** \brief Constructor. * \param[in] v %Matrix containing the essential parts of the Householder vectors * \param[in] h Vector containing the Householder coefficients @@ -244,6 +250,18 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS .setShift(m_shift); } + /** \returns an expression of the complex conjugate of \c *this if Cond==true, + * returns \c *this otherwise. + */ + template<bool Cond> + EIGEN_DEVICE_FUNC + inline typename internal::conditional<Cond,ConjugateReturnType,ConstHouseholderSequence>::type + conjugateIf() const + { + typedef typename internal::conditional<Cond,ConjugateReturnType,ConstHouseholderSequence>::type ReturnType; + return ReturnType(m_vectors.template conjugateIf<Cond>(), m_coeffs.template conjugateIf<Cond>()); + } + /** \brief Adjoint (conjugate transpose) of the Householder sequence. */ AdjointReturnType adjoint() const { diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 344ec8926..ef93ec5eb 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -18,6 +18,7 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> > { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -62,9 +63,9 @@ template<typename _MatrixType> class FullPivLU public: typedef _MatrixType MatrixType; typedef SolverBase<FullPivLU> Base; + friend class SolverBase<FullPivLU>; EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivLU) - // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int enum { MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime @@ -218,6 +219,7 @@ template<typename _MatrixType> class FullPivLU return internal::image_retval<FullPivLU>(*this, originalMatrix); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \return a solution x to the equation Ax=b, where A is the matrix of which * *this is the LU decomposition. * @@ -237,14 +239,10 @@ template<typename _MatrixType> class FullPivLU * * \sa TriangularView::solve(), kernel(), inverse() */ - // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion. template<typename Rhs> inline const Solve<FullPivLU, Rhs> - solve(const MatrixBase<Rhs>& b) const - { - eigen_assert(m_isInitialized && "LU is not initialized."); - return Solve<FullPivLU, Rhs>(*this, b.derived()); - } + solve(const MatrixBase<Rhs>& b) const; + #endif /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is the LU decomposition. @@ -529,8 +527,8 @@ void FullPivLU<MatrixType>::computeInPlace() m_nonzero_pivots = k; for(Index i = k; i < size; ++i) { - m_rowsTranspositions.coeffRef(i) = i; - m_colsTranspositions.coeffRef(i) = i; + m_rowsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i); + m_colsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i); } break; } @@ -541,8 +539,8 @@ void FullPivLU<MatrixType>::computeInPlace() // Now that we've found the pivot, we need to apply the row/col swaps to // bring it to the location (k,k). - m_rowsTranspositions.coeffRef(k) = row_of_biggest_in_corner; - m_colsTranspositions.coeffRef(k) = col_of_biggest_in_corner; + m_rowsTranspositions.coeffRef(k) = internal::convert_index<StorageIndex>(row_of_biggest_in_corner); + m_colsTranspositions.coeffRef(k) = internal::convert_index<StorageIndex>(col_of_biggest_in_corner); if(k != row_of_biggest_in_corner) { m_lu.row(k).swap(m_lu.row(row_of_biggest_in_corner)); ++number_of_transpositions; @@ -755,7 +753,6 @@ void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const const Index rows = this->rows(), cols = this->cols(), nonzero_pivots = this->rank(); - eigen_assert(rhs.rows() == rows); const Index smalldim = (std::min)(rows, cols); if(nonzero_pivots == 0) @@ -805,7 +802,6 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType const Index rows = this->rows(), cols = this->cols(), nonzero_pivots = this->rank(); - eigen_assert(rhs.rows() == cols); const Index smalldim = (std::min)(rows, cols); if(nonzero_pivots == 0) @@ -819,29 +815,19 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType // Step 1 c = permutationQ().inverse() * rhs; - if (Conjugate) { - // Step 2 - m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) - .template triangularView<Upper>() - .adjoint() - .solveInPlace(c.topRows(nonzero_pivots)); - // Step 3 - m_lu.topLeftCorner(smalldim, smalldim) - .template triangularView<UnitLower>() - .adjoint() - .solveInPlace(c.topRows(smalldim)); - } else { - // Step 2 - m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) - .template triangularView<Upper>() - .transpose() - .solveInPlace(c.topRows(nonzero_pivots)); - // Step 3 - m_lu.topLeftCorner(smalldim, smalldim) - .template triangularView<UnitLower>() - .transpose() - .solveInPlace(c.topRows(smalldim)); - } + // Step 2 + m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) + .template triangularView<Upper>() + .transpose() + .template conjugateIf<Conjugate>() + .solveInPlace(c.topRows(nonzero_pivots)); + + // Step 3 + m_lu.topLeftCorner(smalldim, smalldim) + .template triangularView<UnitLower>() + .transpose() + .template conjugateIf<Conjugate>() + .solveInPlace(c.topRows(smalldim)); // Step 4 PermutationPType invp = permutationP().inverse().eval(); diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index bfcd2c95b..b414b5c46 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -19,6 +19,7 @@ template<typename _MatrixType> struct traits<PartialPivLU<_MatrixType> > { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; + typedef int StorageIndex; typedef traits<_MatrixType> BaseTraits; enum { Flags = BaseTraits::Flags & RowMajorBit, @@ -79,8 +80,9 @@ template<typename _MatrixType> class PartialPivLU typedef _MatrixType MatrixType; typedef SolverBase<PartialPivLU> Base; + friend class SolverBase<PartialPivLU>; + EIGEN_GENERIC_PUBLIC_INTERFACE(PartialPivLU) - // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int enum { MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime @@ -152,6 +154,7 @@ template<typename _MatrixType> class PartialPivLU return m_p; } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method returns the solution x to the equation Ax=b, where A is the matrix of which * *this is the LU decomposition. * @@ -169,14 +172,10 @@ template<typename _MatrixType> class PartialPivLU * * \sa TriangularView::solve(), inverse(), computeInverse() */ - // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion. template<typename Rhs> inline const Solve<PartialPivLU, Rhs> - solve(const MatrixBase<Rhs>& b) const - { - eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); - return Solve<PartialPivLU, Rhs>(*this, b.derived()); - } + solve(const MatrixBase<Rhs>& b) const; + #endif /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is the LU decomposition. @@ -231,8 +230,6 @@ template<typename _MatrixType> class PartialPivLU * Step 3: replace c by the solution x to Ux = c. */ - eigen_assert(rhs.rows() == m_lu.rows()); - // Step 1 dst = permutationP() * rhs; @@ -246,26 +243,21 @@ template<typename _MatrixType> class PartialPivLU template<bool Conjugate, typename RhsType, typename DstType> EIGEN_DEVICE_FUNC void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const { - /* The decomposition PA = LU can be rewritten as A = P^{-1} L U. + /* The decomposition PA = LU can be rewritten as A^T = U^T L^T P. * So we proceed as follows: - * Step 1: compute c = Pb. - * Step 2: replace c by the solution x to Lx = c. - * Step 3: replace c by the solution x to Ux = c. + * Step 1: compute c as the solution to L^T c = b + * Step 2: replace c by the solution x to U^T x = c. + * Step 3: update c = P^-1 c. */ eigen_assert(rhs.rows() == m_lu.cols()); - if (Conjugate) { - // Step 1 - dst = m_lu.template triangularView<Upper>().adjoint().solve(rhs); - // Step 2 - m_lu.template triangularView<UnitLower>().adjoint().solveInPlace(dst); - } else { - // Step 1 - dst = m_lu.template triangularView<Upper>().transpose().solve(rhs); - // Step 2 - m_lu.template triangularView<UnitLower>().transpose().solveInPlace(dst); - } + // Step 1 + dst = m_lu.template triangularView<Upper>().transpose() + .template conjugateIf<Conjugate>().solve(rhs); + // Step 2 + m_lu.template triangularView<UnitLower>().transpose() + .template conjugateIf<Conjugate>().solveInPlace(dst); // Step 3 dst = permutationP().transpose() * dst; } @@ -519,7 +511,10 @@ void PartialPivLU<MatrixType>::compute() // the row permutation is stored as int indices, so just to be sure: eigen_assert(m_lu.rows()<NumTraits<int>::highest()); - m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); + if(m_lu.cols()>0) + m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); + else + m_l1_norm = RealScalar(0); eigen_assert(m_lu.rows() == m_lu.cols() && "PartialPivLU is only for square (and moreover invertible) matrices"); const Index size = m_lu.rows(); diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index 1faa3442e..9b677e9bf 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -17,6 +17,9 @@ namespace internal { template<typename _MatrixType> struct traits<ColPivHouseholderQR<_MatrixType> > : traits<_MatrixType> { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -46,20 +49,19 @@ template<typename _MatrixType> struct traits<ColPivHouseholderQR<_MatrixType> > * \sa MatrixBase::colPivHouseholderQr() */ template<typename _MatrixType> class ColPivHouseholderQR + : public SolverBase<ColPivHouseholderQR<_MatrixType> > { public: typedef _MatrixType MatrixType; + typedef SolverBase<ColPivHouseholderQR> Base; + friend class SolverBase<ColPivHouseholderQR>; + + EIGEN_GENERIC_PUBLIC_INTERFACE(ColPivHouseholderQR) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - // FIXME should be int - typedef typename MatrixType::StorageIndex StorageIndex; typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType; typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType; typedef typename internal::plain_row_type<MatrixType, Index>::type IntRowVectorType; @@ -156,6 +158,7 @@ template<typename _MatrixType> class ColPivHouseholderQR computeInPlace(); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method finds a solution x to the equation Ax=b, where A is the matrix of which * *this is the QR decomposition, if any exists. * @@ -172,11 +175,8 @@ template<typename _MatrixType> class ColPivHouseholderQR */ template<typename Rhs> inline const Solve<ColPivHouseholderQR, Rhs> - solve(const MatrixBase<Rhs>& b) const - { - eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized."); - return Solve<ColPivHouseholderQR, Rhs>(*this, b.derived()); - } + solve(const MatrixBase<Rhs>& b) const; + #endif HouseholderSequenceType householderQ() const; HouseholderSequenceType matrixQ() const @@ -417,6 +417,9 @@ template<typename _MatrixType> class ColPivHouseholderQR #ifndef EIGEN_PARSED_BY_DOXYGEN template<typename RhsType, typename DstType> void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template<bool Conjugate, typename RhsType, typename DstType> + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -583,8 +586,6 @@ template<typename _MatrixType> template<typename RhsType, typename DstType> void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const { - eigen_assert(rhs.rows() == rows()); - const Index nonzero_pivots = nonzeroPivots(); if(nonzero_pivots == 0) @@ -604,6 +605,31 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType & for(Index i = 0; i < nonzero_pivots; ++i) dst.row(m_colsPermutation.indices().coeff(i)) = c.row(i); for(Index i = nonzero_pivots; i < cols(); ++i) dst.row(m_colsPermutation.indices().coeff(i)).setZero(); } + +template<typename _MatrixType> +template<bool Conjugate, typename RhsType, typename DstType> +void ColPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index nonzero_pivots = nonzeroPivots(); + + if(nonzero_pivots == 0) + { + dst.setZero(); + return; + } + + typename RhsType::PlainObject c(m_colsPermutation.transpose()*rhs); + + m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots) + .template triangularView<Upper>() + .transpose().template conjugateIf<Conjugate>() + .solveInPlace(c.topRows(nonzero_pivots)); + + dst.topRows(nonzero_pivots) = c.topRows(nonzero_pivots); + dst.bottomRows(rows()-nonzero_pivots).setZero(); + + dst.applyOnTheLeft(householderQ().setLength(nonzero_pivots).template conjugateIf<!Conjugate>() ); +} #endif namespace internal { diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h index 03017a375..2fc3c871a 100644 --- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -16,6 +16,9 @@ namespace internal { template <typename _MatrixType> struct traits<CompleteOrthogonalDecomposition<_MatrixType> > : traits<_MatrixType> { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -44,19 +47,21 @@ struct traits<CompleteOrthogonalDecomposition<_MatrixType> > * * \sa MatrixBase::completeOrthogonalDecomposition() */ -template <typename _MatrixType> -class CompleteOrthogonalDecomposition { +template <typename _MatrixType> class CompleteOrthogonalDecomposition + : public SolverBase<CompleteOrthogonalDecomposition<_MatrixType> > +{ public: typedef _MatrixType MatrixType; + typedef SolverBase<CompleteOrthogonalDecomposition> Base; + + template<typename Derived> + friend struct internal::solve_assertion; + + EIGEN_GENERIC_PUBLIC_INTERFACE(CompleteOrthogonalDecomposition) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - typedef typename MatrixType::StorageIndex StorageIndex; typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType; typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType; @@ -131,9 +136,9 @@ class CompleteOrthogonalDecomposition { m_temp(matrix.cols()) { computeInPlace(); - } - + } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method computes the minimum-norm solution X to a least squares * problem \f[\mathrm{minimize} \|A X - B\|, \f] where \b A is the matrix of * which \c *this is the complete orthogonal decomposition. @@ -145,11 +150,8 @@ class CompleteOrthogonalDecomposition { */ template <typename Rhs> inline const Solve<CompleteOrthogonalDecomposition, Rhs> solve( - const MatrixBase<Rhs>& b) const { - eigen_assert(m_cpqr.m_isInitialized && - "CompleteOrthogonalDecomposition is not initialized."); - return Solve<CompleteOrthogonalDecomposition, Rhs>(*this, b.derived()); - } + const MatrixBase<Rhs>& b) const; + #endif HouseholderSequenceType householderQ(void) const; HouseholderSequenceType matrixQ(void) const { return m_cpqr.householderQ(); } @@ -158,8 +160,8 @@ class CompleteOrthogonalDecomposition { */ MatrixType matrixZ() const { MatrixType Z = MatrixType::Identity(m_cpqr.cols(), m_cpqr.cols()); - applyZAdjointOnTheLeftInPlace(Z); - return Z.adjoint(); + applyZOnTheLeftInPlace<false>(Z); + return Z; } /** \returns a reference to the matrix where the complete orthogonal @@ -275,6 +277,7 @@ class CompleteOrthogonalDecomposition { */ inline const Inverse<CompleteOrthogonalDecomposition> pseudoInverse() const { + eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized."); return Inverse<CompleteOrthogonalDecomposition>(*this); } @@ -368,6 +371,9 @@ class CompleteOrthogonalDecomposition { #ifndef EIGEN_PARSED_BY_DOXYGEN template <typename RhsType, typename DstType> void _solve_impl(const RhsType& rhs, DstType& dst) const; + + template<bool Conjugate, typename RhsType, typename DstType> + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -375,8 +381,22 @@ class CompleteOrthogonalDecomposition { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } + template<bool Transpose_, typename Rhs> + void _check_solve_assertion(const Rhs& b) const { + EIGEN_ONLY_USED_FOR_DEBUG(b); + eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized."); + eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "CompleteOrthogonalDecomposition::solve(): invalid number of rows of the right hand side matrix b"); + } + void computeInPlace(); + /** Overwrites \b rhs with \f$ \mathbf{Z} * \mathbf{rhs} \f$ or + * \f$ \mathbf{\overline Z} * \mathbf{rhs} \f$ if \c Conjugate + * is set to \c true. + */ + template <bool Conjugate, typename Rhs> + void applyZOnTheLeftInPlace(Rhs& rhs) const; + /** Overwrites \b rhs with \f$ \mathbf{Z}^* * \mathbf{rhs} \f$. */ template <typename Rhs> @@ -465,13 +485,35 @@ void CompleteOrthogonalDecomposition<MatrixType>::computeInPlace() } template <typename MatrixType> +template <bool Conjugate, typename Rhs> +void CompleteOrthogonalDecomposition<MatrixType>::applyZOnTheLeftInPlace( + Rhs& rhs) const { + const Index cols = this->cols(); + const Index nrhs = rhs.cols(); + const Index rank = this->rank(); + Matrix<typename Rhs::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs)); + for (Index k = rank-1; k >= 0; --k) { + if (k != rank - 1) { + rhs.row(k).swap(rhs.row(rank - 1)); + } + rhs.middleRows(rank - 1, cols - rank + 1) + .applyHouseholderOnTheLeft( + matrixQTZ().row(k).tail(cols - rank).transpose().template conjugateIf<!Conjugate>(), zCoeffs().template conjugateIf<Conjugate>()(k), + &temp(0)); + if (k != rank - 1) { + rhs.row(k).swap(rhs.row(rank - 1)); + } + } +} + +template <typename MatrixType> template <typename Rhs> void CompleteOrthogonalDecomposition<MatrixType>::applyZAdjointOnTheLeftInPlace( Rhs& rhs) const { const Index cols = this->cols(); const Index nrhs = rhs.cols(); const Index rank = this->rank(); - Matrix<typename MatrixType::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs)); + Matrix<typename Rhs::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs)); for (Index k = 0; k < rank; ++k) { if (k != rank - 1) { rhs.row(k).swap(rhs.row(rank - 1)); @@ -491,8 +533,6 @@ template <typename _MatrixType> template <typename RhsType, typename DstType> void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( const RhsType& rhs, DstType& dst) const { - eigen_assert(rhs.rows() == this->rows()); - const Index rank = this->rank(); if (rank == 0) { dst.setZero(); @@ -520,6 +560,34 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( // Undo permutation to get x = P^{-1} * y. dst = colsPermutation() * dst; } + +template<typename _MatrixType> +template<bool Conjugate, typename RhsType, typename DstType> +void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index rank = this->rank(); + + if (rank == 0) { + dst.setZero(); + return; + } + + typename RhsType::PlainObject c(colsPermutation().transpose()*rhs); + + if (rank < cols()) { + applyZOnTheLeftInPlace<!Conjugate>(c); + } + + matrixT().topLeftCorner(rank, rank) + .template triangularView<Upper>() + .transpose().template conjugateIf<Conjugate>() + .solveInPlace(c.topRows(rank)); + + dst.topRows(rank) = c.topRows(rank); + dst.bottomRows(rows()-rank).setZero(); + + dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf<!Conjugate>() ); +} #endif namespace internal { diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h index c31e47cc4..d0664a1d8 100644 --- a/Eigen/src/QR/FullPivHouseholderQR.h +++ b/Eigen/src/QR/FullPivHouseholderQR.h @@ -18,6 +18,9 @@ namespace internal { template<typename _MatrixType> struct traits<FullPivHouseholderQR<_MatrixType> > : traits<_MatrixType> { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -55,20 +58,19 @@ struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType> > * \sa MatrixBase::fullPivHouseholderQr() */ template<typename _MatrixType> class FullPivHouseholderQR + : public SolverBase<FullPivHouseholderQR<_MatrixType> > { public: typedef _MatrixType MatrixType; + typedef SolverBase<FullPivHouseholderQR> Base; + friend class SolverBase<FullPivHouseholderQR>; + + EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivHouseholderQR) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - // FIXME should be int - typedef typename MatrixType::StorageIndex StorageIndex; typedef internal::FullPivHouseholderQRMatrixQReturnType<MatrixType> MatrixQReturnType; typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType; typedef Matrix<StorageIndex, 1, @@ -156,6 +158,7 @@ template<typename _MatrixType> class FullPivHouseholderQR computeInPlace(); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method finds a solution x to the equation Ax=b, where A is the matrix of which * \c *this is the QR decomposition. * @@ -173,11 +176,8 @@ template<typename _MatrixType> class FullPivHouseholderQR */ template<typename Rhs> inline const Solve<FullPivHouseholderQR, Rhs> - solve(const MatrixBase<Rhs>& b) const - { - eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized."); - return Solve<FullPivHouseholderQR, Rhs>(*this, b.derived()); - } + solve(const MatrixBase<Rhs>& b) const; + #endif /** \returns Expression object representing the matrix Q */ @@ -396,6 +396,9 @@ template<typename _MatrixType> class FullPivHouseholderQR #ifndef EIGEN_PARSED_BY_DOXYGEN template<typename RhsType, typename DstType> void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template<bool Conjugate, typename RhsType, typename DstType> + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -498,15 +501,15 @@ void FullPivHouseholderQR<MatrixType>::computeInPlace() m_nonzero_pivots = k; for(Index i = k; i < size; i++) { - m_rows_transpositions.coeffRef(i) = i; - m_cols_transpositions.coeffRef(i) = i; + m_rows_transpositions.coeffRef(i) = internal::convert_index<StorageIndex>(i); + m_cols_transpositions.coeffRef(i) = internal::convert_index<StorageIndex>(i); m_hCoeffs.coeffRef(i) = Scalar(0); } break; } - m_rows_transpositions.coeffRef(k) = row_of_biggest_in_corner; - m_cols_transpositions.coeffRef(k) = col_of_biggest_in_corner; + m_rows_transpositions.coeffRef(k) = internal::convert_index<StorageIndex>(row_of_biggest_in_corner); + m_cols_transpositions.coeffRef(k) = internal::convert_index<StorageIndex>(col_of_biggest_in_corner); if(k != row_of_biggest_in_corner) { m_qr.row(k).tail(cols-k).swap(m_qr.row(row_of_biggest_in_corner).tail(cols-k)); ++number_of_transpositions; @@ -540,7 +543,6 @@ template<typename _MatrixType> template<typename RhsType, typename DstType> void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const { - eigen_assert(rhs.rows() == rows()); const Index l_rank = rank(); // FIXME introduce nonzeroPivots() and use it here. and more generally, @@ -553,7 +555,7 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType typename RhsType::PlainObject c(rhs); - Matrix<Scalar,1,RhsType::ColsAtCompileTime> temp(rhs.cols()); + Matrix<typename RhsType::Scalar,1,RhsType::ColsAtCompileTime> temp(rhs.cols()); for (Index k = 0; k < l_rank; ++k) { Index remainingSize = rows()-k; @@ -570,6 +572,42 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType for(Index i = 0; i < l_rank; ++i) dst.row(m_cols_permutation.indices().coeff(i)) = c.row(i); for(Index i = l_rank; i < cols(); ++i) dst.row(m_cols_permutation.indices().coeff(i)).setZero(); } + +template<typename _MatrixType> +template<bool Conjugate, typename RhsType, typename DstType> +void FullPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index l_rank = rank(); + + if(l_rank == 0) + { + dst.setZero(); + return; + } + + typename RhsType::PlainObject c(m_cols_permutation.transpose()*rhs); + + m_qr.topLeftCorner(l_rank, l_rank) + .template triangularView<Upper>() + .transpose().template conjugateIf<Conjugate>() + .solveInPlace(c.topRows(l_rank)); + + dst.topRows(l_rank) = c.topRows(l_rank); + dst.bottomRows(rows()-l_rank).setZero(); + + Matrix<Scalar, 1, DstType::ColsAtCompileTime> temp(dst.cols()); + const Index size = (std::min)(rows(), cols()); + for (Index k = size-1; k >= 0; --k) + { + Index remainingSize = rows()-k; + + dst.bottomRightCorner(remainingSize, dst.cols()) + .applyHouseholderOnTheLeft(m_qr.col(k).tail(remainingSize-1).template conjugateIf<!Conjugate>(), + m_hCoeffs.template conjugateIf<Conjugate>().coeff(k), &temp.coeffRef(0)); + + dst.row(k).swap(dst.row(m_rows_transpositions.coeff(k))); + } +} #endif namespace internal { diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h index 33cb9c8ff..801739fbd 100644 --- a/Eigen/src/QR/HouseholderQR.h +++ b/Eigen/src/QR/HouseholderQR.h @@ -14,6 +14,18 @@ namespace Eigen { +namespace internal { +template<typename _MatrixType> struct traits<HouseholderQR<_MatrixType> > + : traits<_MatrixType> +{ + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum { Flags = 0 }; +}; + +} // end namespace internal + /** \ingroup QR_Module * * @@ -42,20 +54,19 @@ namespace Eigen { * \sa MatrixBase::householderQr() */ template<typename _MatrixType> class HouseholderQR + : public SolverBase<HouseholderQR<_MatrixType> > { public: typedef _MatrixType MatrixType; + typedef SolverBase<HouseholderQR> Base; + friend class SolverBase<HouseholderQR>; + + EIGEN_GENERIC_PUBLIC_INTERFACE(HouseholderQR) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - // FIXME should be int - typedef typename MatrixType::StorageIndex StorageIndex; typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, (MatrixType::Flags&RowMajorBit) ? RowMajor : ColMajor, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixQType; typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType; typedef typename internal::plain_row_type<MatrixType>::type RowVectorType; @@ -121,6 +132,7 @@ template<typename _MatrixType> class HouseholderQR computeInPlace(); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method finds a solution x to the equation Ax=b, where A is the matrix of which * *this is the QR decomposition, if any exists. * @@ -137,11 +149,8 @@ template<typename _MatrixType> class HouseholderQR */ template<typename Rhs> inline const Solve<HouseholderQR, Rhs> - solve(const MatrixBase<Rhs>& b) const - { - eigen_assert(m_isInitialized && "HouseholderQR is not initialized."); - return Solve<HouseholderQR, Rhs>(*this, b.derived()); - } + solve(const MatrixBase<Rhs>& b) const; + #endif /** This method returns an expression of the unitary matrix Q as a sequence of Householder transformations. * @@ -214,6 +223,9 @@ template<typename _MatrixType> class HouseholderQR #ifndef EIGEN_PARSED_BY_DOXYGEN template<typename RhsType, typename DstType> void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template<bool Conjugate, typename RhsType, typename DstType> + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -349,7 +361,6 @@ template<typename RhsType, typename DstType> void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const { const Index rank = (std::min)(rows(), cols()); - eigen_assert(rhs.rows() == rows()); typename RhsType::PlainObject c(rhs); @@ -362,6 +373,25 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c dst.topRows(rank) = c.topRows(rank); dst.bottomRows(cols()-rank).setZero(); } + +template<typename _MatrixType> +template<bool Conjugate, typename RhsType, typename DstType> +void HouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index rank = (std::min)(rows(), cols()); + + typename RhsType::PlainObject c(rhs); + + m_qr.topLeftCorner(rank, rank) + .template triangularView<Upper>() + .transpose().template conjugateIf<Conjugate>() + .solveInPlace(c.topRows(rank)); + + dst.topRows(rank) = c.topRows(rank); + dst.bottomRows(rows()-rank).setZero(); + + dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf<!Conjugate>() ); +} #endif /** Performs the QR factorization of the given matrix \a matrix. The result of diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 18d7bdc0a..e3fddacbc 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -39,6 +39,7 @@ namespace internal { template<typename _MatrixType> struct traits<BDCSVD<_MatrixType> > + : traits<_MatrixType> { typedef _MatrixType MatrixType; }; @@ -1006,7 +1007,7 @@ void BDCSVD<MatrixType>::perturbCol0 #ifdef EIGEN_BDCSVD_SANITY_CHECKS assert((std::isfinite)(tmp)); #endif - zhat(k) = col0(k) > Literal(0) ? tmp : -tmp; + zhat(k) = col0(k) > Literal(0) ? RealScalar(tmp) : RealScalar(-tmp); } } } diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index 1c7c80376..2b6891105 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -425,6 +425,7 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true> template<typename _MatrixType, int QRPreconditioner> struct traits<JacobiSVD<_MatrixType,QRPreconditioner> > + : traits<_MatrixType> { typedef _MatrixType MatrixType; }; diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h index 851ad6836..68df48921 100644 --- a/Eigen/src/SVD/SVDBase.h +++ b/Eigen/src/SVD/SVDBase.h @@ -17,6 +17,18 @@ #define EIGEN_SVDBASE_H namespace Eigen { + +namespace internal { +template<typename Derived> struct traits<SVDBase<Derived> > + : traits<Derived> +{ + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum { Flags = 0 }; +}; +} + /** \ingroup SVD_Module * * @@ -44,15 +56,18 @@ namespace Eigen { * terminate in finite (and reasonable) time. * \sa class BDCSVD, class JacobiSVD */ -template<typename Derived> -class SVDBase +template<typename Derived> class SVDBase + : public SolverBase<SVDBase<Derived> > { +public: + + template<typename Derived_> + friend struct internal::solve_assertion; -public: typedef typename internal::traits<Derived>::MatrixType MatrixType; typedef typename MatrixType::Scalar Scalar; typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar; - typedef typename MatrixType::StorageIndex StorageIndex; + typedef typename Eigen::internal::traits<SVDBase>::StorageIndex StorageIndex; typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, @@ -194,6 +209,7 @@ public: inline Index rows() const { return m_rows; } inline Index cols() const { return m_cols; } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A. * * \param b the right-hand-side of the equation to solve. @@ -205,16 +221,15 @@ public: */ template<typename Rhs> inline const Solve<Derived, Rhs> - solve(const MatrixBase<Rhs>& b) const - { - eigen_assert(m_isInitialized && "SVD is not initialized."); - eigen_assert(computeU() && computeV() && "SVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice)."); - return Solve<Derived, Rhs>(derived(), b.derived()); - } - + solve(const MatrixBase<Rhs>& b) const; + #endif + #ifndef EIGEN_PARSED_BY_DOXYGEN template<typename RhsType, typename DstType> void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template<bool Conjugate, typename RhsType, typename DstType> + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -223,6 +238,14 @@ protected: { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } + + template<bool Transpose_, typename Rhs> + void _check_solve_assertion(const Rhs& b) const { + EIGEN_ONLY_USED_FOR_DEBUG(b); + eigen_assert(m_isInitialized && "SVD is not initialized."); + eigen_assert(computeU() && computeV() && "SVDBase::solve(): Both unitaries U and V are required to be computed (thin unitaries suffice)."); + eigen_assert((Transpose_?cols():rows())==b.rows() && "SVDBase::solve(): invalid number of rows of the right hand side matrix b"); + } // return true if already allocated bool allocate(Index rows, Index cols, unsigned int computationOptions) ; @@ -263,17 +286,30 @@ template<typename Derived> template<typename RhsType, typename DstType> void SVDBase<Derived>::_solve_impl(const RhsType &rhs, DstType &dst) const { - eigen_assert(rhs.rows() == rows()); - // A = U S V^* // So A^{-1} = V S^{-1} U^* - Matrix<Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp; + Matrix<typename RhsType::Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp; Index l_rank = rank(); tmp.noalias() = m_matrixU.leftCols(l_rank).adjoint() * rhs; tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp; dst = m_matrixV.leftCols(l_rank) * tmp; } + +template<typename Derived> +template<bool Conjugate, typename RhsType, typename DstType> +void SVDBase<Derived>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + // A = U S V^* + // So A^{-*} = U S^{-1} V^* + // And A^{-T} = U_conj S^{-1} V^T + Matrix<typename RhsType::Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp; + Index l_rank = rank(); + + tmp.noalias() = m_matrixV.leftCols(l_rank).transpose().template conjugateIf<Conjugate>() * rhs; + tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp; + dst = m_matrixU.template conjugateIf<!Conjugate>().leftCols(l_rank) * tmp; +} #endif template<typename MatrixType> diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h index d89fa0dae..acd986fab 100644 --- a/Eigen/src/SparseCore/CompressedStorage.h +++ b/Eigen/src/SparseCore/CompressedStorage.h @@ -207,6 +207,22 @@ class CompressedStorage return m_values[id]; } + void moveChunk(Index from, Index to, Index chunkSize) + { + eigen_internal_assert(to+chunkSize <= m_size); + if(to>from && from+chunkSize>to) + { + // move backward + internal::smart_memmove(m_values+from, m_values+from+chunkSize, m_values+to); + internal::smart_memmove(m_indices+from, m_indices+from+chunkSize, m_indices+to); + } + else + { + internal::smart_copy(m_values+from, m_values+from+chunkSize, m_values+to); + internal::smart_copy(m_indices+from, m_indices+from+chunkSize, m_indices+to); + } + } + void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision()) { Index k = 0; diff --git a/Eigen/src/SparseCore/SparseAssign.h b/Eigen/src/SparseCore/SparseAssign.h index 71452e75e..905485c88 100644 --- a/Eigen/src/SparseCore/SparseAssign.h +++ b/Eigen/src/SparseCore/SparseAssign.h @@ -246,35 +246,22 @@ struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Sparse> { typedef typename DstXprType::StorageIndex StorageIndex; typedef typename DstXprType::Scalar Scalar; - typedef Array<StorageIndex,Dynamic,1> ArrayXI; - typedef Array<Scalar,Dynamic,1> ArrayXS; - template<int Options> - static void run(SparseMatrix<Scalar,Options,StorageIndex> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/) - { - Index dstRows = src.rows(); - Index dstCols = src.cols(); - if((dst.rows()!=dstRows) || (dst.cols()!=dstCols)) - dst.resize(dstRows, dstCols); - Index size = src.diagonal().size(); - dst.makeCompressed(); - dst.resizeNonZeros(size); - Map<ArrayXI>(dst.innerIndexPtr(), size).setLinSpaced(0,StorageIndex(size)-1); - Map<ArrayXI>(dst.outerIndexPtr(), size+1).setLinSpaced(0,StorageIndex(size)); - Map<ArrayXS>(dst.valuePtr(), size) = src.diagonal(); - } + template<int Options, typename AssignFunc> + static void run(SparseMatrix<Scalar,Options,StorageIndex> &dst, const SrcXprType &src, const AssignFunc &func) + { dst.assignDiagonal(src.diagonal(), func); } template<typename DstDerived> static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/) - { - dst.diagonal() = src.diagonal(); - } + { dst.derived().diagonal() = src.diagonal(); } - static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/) - { dst.diagonal() += src.diagonal(); } + template<typename DstDerived> + static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/) + { dst.derived().diagonal() += src.diagonal(); } - static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/) - { dst.diagonal() -= src.diagonal(); } + template<typename DstDerived> + static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/) + { dst.derived().diagonal() -= src.diagonal(); } }; } // end namespace internal diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h index e0b3c22b6..6a2c7a8ce 100644 --- a/Eigen/src/SparseCore/SparseCompressedBase.h +++ b/Eigen/src/SparseCore/SparseCompressedBase.h @@ -128,6 +128,28 @@ class SparseCompressedBase protected: /** Default constructor. Do nothing. */ SparseCompressedBase() {} + + /** \internal return the index of the coeff at (row,col) or just before if it does not exist. + * This is an analogue of std::lower_bound. + */ + internal::LowerBoundIndex lower_bound(Index row, Index col) const + { + eigen_internal_assert(row>=0 && row<this->rows() && col>=0 && col<this->cols()); + + const Index outer = Derived::IsRowMajor ? row : col; + const Index inner = Derived::IsRowMajor ? col : row; + + Index start = this->outerIndexPtr()[outer]; + Index end = this->isCompressed() ? this->outerIndexPtr()[outer+1] : this->outerIndexPtr()[outer] + this->innerNonZeroPtr()[outer]; + eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist"); + internal::LowerBoundIndex p; + p.value = std::lower_bound(this->innerIndexPtr()+start, this->innerIndexPtr()+end,inner) - this->innerIndexPtr(); + p.found = (p.value<end) && (this->innerIndexPtr()[p.value]==inner); + return p; + } + + friend struct internal::evaluator<SparseCompressedBase<Derived> >; + private: template<typename OtherDerived> explicit SparseCompressedBase(const SparseCompressedBase<OtherDerived>&); }; @@ -333,17 +355,8 @@ protected: Index find(Index row, Index col) const { - eigen_internal_assert(row>=0 && row<m_matrix->rows() && col>=0 && col<m_matrix->cols()); - - const Index outer = Derived::IsRowMajor ? row : col; - const Index inner = Derived::IsRowMajor ? col : row; - - Index start = m_matrix->outerIndexPtr()[outer]; - Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer]; - eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist"); - const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) - m_matrix->innerIndexPtr(); - - return ((p<end) && (m_matrix->innerIndexPtr()[p]==inner)) ? p : Dynamic; + internal::LowerBoundIndex p = m_matrix->lower_bound(row,col); + return p.found ? p.value : Dynamic; } const Derived *m_matrix; diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index eedae47e8..63dd1cc32 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -99,6 +99,8 @@ class SparseMatrix typedef SparseCompressedBase<SparseMatrix> Base; using Base::convert_index; friend class SparseVector<_Scalar,0,_StorageIndex>; + template<typename, typename, typename, typename, typename> + friend struct internal::Assignment; public: using Base::isCompressed; using Base::nonZeros; @@ -502,7 +504,7 @@ class SparseMatrix m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i]; } } - + /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */ void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision()) { @@ -895,6 +897,113 @@ public: m_data.index(p) = convert_index(inner); return (m_data.value(p) = Scalar(0)); } +protected: + struct IndexPosPair { + IndexPosPair(Index a_i, Index a_p) : i(a_i), p(a_p) {} + Index i; + Index p; + }; + + /** \internal assign \a diagXpr to the diagonal of \c *this + * There are different strategies: + * 1 - if *this is overwritten (Func==assign_op) or *this is empty, then we can work treat *this as a dense vector expression. + * 2 - otherwise, for each diagonal coeff, + * 2.a - if it already exists, then we update it, + * 2.b - otherwise, if *this is uncompressed and that the current inner-vector has empty room for at least 1 element, then we perform an in-place insertion. + * 2.c - otherwise, we'll have to reallocate and copy everything, so instead of doing so for each new element, it is recorded in a std::vector. + * 3 - at the end, if some entries failed to be inserted in-place, then we alloc a new buffer, copy each chunk at the right position, and insert the new elements. + * + * TODO: some piece of code could be isolated and reused for a general in-place update strategy. + * TODO: if we start to defer the insertion of some elements (i.e., case 2.c executed once), + * then it *might* be better to disable case 2.b since they will have to be copied anyway. + */ + template<typename DiagXpr, typename Func> + void assignDiagonal(const DiagXpr diagXpr, const Func& assignFunc) + { + Index n = diagXpr.size(); + + const bool overwrite = internal::is_same<Func, internal::assign_op<Scalar,Scalar> >::value; + if(overwrite) + { + if((this->rows()!=n) || (this->cols()!=n)) + this->resize(n, n); + } + + if(m_data.size()==0 || overwrite) + { + typedef Array<StorageIndex,Dynamic,1> ArrayXI; + this->makeCompressed(); + this->resizeNonZeros(n); + Eigen::Map<ArrayXI>(this->innerIndexPtr(), n).setLinSpaced(0,StorageIndex(n)-1); + Eigen::Map<ArrayXI>(this->outerIndexPtr(), n+1).setLinSpaced(0,StorageIndex(n)); + Eigen::Map<Array<Scalar,Dynamic,1> > values = this->coeffs(); + values.setZero(); + internal::call_assignment_no_alias(values, diagXpr, assignFunc); + } + else + { + bool isComp = isCompressed(); + internal::evaluator<DiagXpr> diaEval(diagXpr); + std::vector<IndexPosPair> newEntries; + + // 1 - try in-place update and record insertion failures + for(Index i = 0; i<n; ++i) + { + internal::LowerBoundIndex lb = this->lower_bound(i,i); + Index p = lb.value; + if(lb.found) + { + // the coeff already exists + assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i)); + } + else if((!isComp) && m_innerNonZeros[i] < (m_outerIndex[i+1]-m_outerIndex[i])) + { + // non compressed mode with local room for inserting one element + m_data.moveChunk(p, p+1, m_outerIndex[i]+m_innerNonZeros[i]-p); + m_innerNonZeros[i]++; + m_data.value(p) = Scalar(0); + m_data.index(p) = StorageIndex(i); + assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i)); + } + else + { + // defer insertion + newEntries.push_back(IndexPosPair(i,p)); + } + } + // 2 - insert deferred entries + Index n_entries = Index(newEntries.size()); + if(n_entries>0) + { + Storage newData(m_data.size()+n_entries); + Index prev_p = 0; + Index prev_i = 0; + for(Index k=0; k<n_entries;++k) + { + Index i = newEntries[k].i; + Index p = newEntries[k].p; + internal::smart_copy(m_data.valuePtr()+prev_p, m_data.valuePtr()+p, newData.valuePtr()+prev_p+k); + internal::smart_copy(m_data.indexPtr()+prev_p, m_data.indexPtr()+p, newData.indexPtr()+prev_p+k); + for(Index j=prev_i;j<i;++j) + m_outerIndex[j+1] += k; + if(!isComp) + m_innerNonZeros[i]++; + prev_p = p; + prev_i = i; + newData.value(p+k) = Scalar(0); + newData.index(p+k) = StorageIndex(i); + assignFunc.assignCoeff(newData.value(p+k), diaEval.coeff(i)); + } + { + internal::smart_copy(m_data.valuePtr()+prev_p, m_data.valuePtr()+m_data.size(), newData.valuePtr()+prev_p+n_entries); + internal::smart_copy(m_data.indexPtr()+prev_p, m_data.indexPtr()+m_data.size(), newData.indexPtr()+prev_p+n_entries); + for(Index j=prev_i+1;j<=m_outerSize;++j) + m_outerIndex[j] += n_entries; + } + m_data.swap(newData); + } + } + } private: static void check_template_parameters() diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h index 74df0d496..ceb936887 100644 --- a/Eigen/src/SparseCore/SparseUtil.h +++ b/Eigen/src/SparseCore/SparseUtil.h @@ -140,6 +140,14 @@ struct SparseSelfAdjointShape { static std::string debugName() { return "SparseS template<> struct glue_shapes<SparseShape,SelfAdjointShape> { typedef SparseSelfAdjointShape type; }; template<> struct glue_shapes<SparseShape,TriangularShape > { typedef SparseTriangularShape type; }; +// return type of SparseCompressedBase::lower_bound; +struct LowerBoundIndex { + LowerBoundIndex() : value(-1), found(false) {} + LowerBoundIndex(Index val, bool ok) : value(val), found(ok) {} + Index value; + bool found; +}; + } // end namespace internal /** \ingroup SparseCore_Module diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index e928db467..2f99ee0b2 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -23,6 +23,11 @@ typedef CwiseUnaryOp<internal::scalar_atan_op<Scalar>, const Derived> AtanReturn typedef CwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived> TanhReturnType; typedef CwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived> LogisticReturnType; typedef CwiseUnaryOp<internal::scalar_sinh_op<Scalar>, const Derived> SinhReturnType; +#if EIGEN_HAS_CXX11_MATH +typedef CwiseUnaryOp<internal::scalar_atanh_op<Scalar>, const Derived> AtanhReturnType; +typedef CwiseUnaryOp<internal::scalar_asinh_op<Scalar>, const Derived> AsinhReturnType; +typedef CwiseUnaryOp<internal::scalar_acosh_op<Scalar>, const Derived> AcoshReturnType; +#endif typedef CwiseUnaryOp<internal::scalar_cosh_op<Scalar>, const Derived> CoshReturnType; typedef CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> SquareReturnType; typedef CwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived> CubeReturnType; @@ -327,7 +332,7 @@ sinh() const * Example: \include Cwise_cosh.cpp * Output: \verbinclude Cwise_cosh.out * - * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cosh">Math functions</a>, tan(), sinh(), cosh() + * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cosh">Math functions</a>, tanh(), sinh(), cosh() */ EIGEN_DEVICE_FUNC inline const CoshReturnType @@ -336,6 +341,41 @@ cosh() const return CoshReturnType(derived()); } +#if EIGEN_HAS_CXX11_MATH +/** \returns an expression of the coefficient-wise inverse hyperbolic tan of *this. + * + * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_atanh">Math functions</a>, atanh(), asinh(), acosh() + */ +EIGEN_DEVICE_FUNC +inline const AtanhReturnType +atanh() const +{ + return AtanhReturnType(derived()); +} + +/** \returns an expression of the coefficient-wise inverse hyperbolic sin of *this. + * + * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_asinh">Math functions</a>, atanh(), asinh(), acosh() + */ +EIGEN_DEVICE_FUNC +inline const AsinhReturnType +asinh() const +{ + return AsinhReturnType(derived()); +} + +/** \returns an expression of the coefficient-wise inverse hyperbolic cos of *this. + * + * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_acosh">Math functions</a>, atanh(), asinh(), acosh() + */ +EIGEN_DEVICE_FUNC +inline const AcoshReturnType +acosh() const +{ + return AcoshReturnType(derived()); +} +#endif + /** \returns an expression of the coefficient-wise logistic of *this. */ EIGEN_DEVICE_FUNC diff --git a/Eigen/src/plugins/CommonCwiseUnaryOps.h b/Eigen/src/plugins/CommonCwiseUnaryOps.h index 89f4faaac..5418dc415 100644 --- a/Eigen/src/plugins/CommonCwiseUnaryOps.h +++ b/Eigen/src/plugins/CommonCwiseUnaryOps.h @@ -76,6 +76,20 @@ conjugate() const return ConjugateReturnType(derived()); } +/// \returns an expression of the complex conjugate of \c *this if Cond==true, returns derived() otherwise. +/// +EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate) +/// +/// \sa conjugate() +template<bool Cond> +EIGEN_DEVICE_FUNC +inline typename internal::conditional<Cond,ConjugateReturnType,const Derived&>::type +conjugateIf() const +{ + typedef typename internal::conditional<Cond,ConjugateReturnType,const Derived&>::type ReturnType; + return ReturnType(derived()); +} + /// \returns a read-only expression of the real part of \c *this. /// EIGEN_DOC_UNARY_ADDONS(real,real part function) |