diff options
Diffstat (limited to 'Eigen/src')
46 files changed, 1613 insertions, 1007 deletions
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index c3cc3746c..538aff956 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -13,7 +13,7 @@ #ifndef EIGEN_LDLT_H #define EIGEN_LDLT_H -namespace Eigen { +namespace Eigen { namespace internal { template<typename MatrixType, int UpLo> struct LDLT_Traits; @@ -73,11 +73,11 @@ template<typename _MatrixType, int _UpLo> class LDLT * The default constructor is useful in cases in which the user intends to * perform decompositions via LDLT::compute(const MatrixType&). */ - LDLT() - : m_matrix(), - m_transpositions(), + LDLT() + : m_matrix(), + m_transpositions(), m_sign(internal::ZeroSign), - m_isInitialized(false) + m_isInitialized(false) {} /** \brief Default Constructor with memory preallocation @@ -168,7 +168,7 @@ template<typename _MatrixType, int _UpLo> class LDLT * \note_about_checking_solutions * * More precisely, this method solves \f$ A x = b \f$ using the decomposition \f$ A = P^T L D L^* P \f$ - * by solving the systems \f$ P^T y_1 = b \f$, \f$ L y_2 = y_1 \f$, \f$ D y_3 = y_2 \f$, + * by solving the systems \f$ P^T y_1 = b \f$, \f$ L y_2 = y_1 \f$, \f$ D y_3 = y_2 \f$, * \f$ L^* y_4 = y_3 \f$ and \f$ P x = y_4 \f$ in succession. If the matrix \f$ A \f$ is singular, then * \f$ D \f$ will also be singular (all the other matrices are invertible). In that case, the * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function @@ -192,6 +192,15 @@ template<typename _MatrixType, int _UpLo> class LDLT template<typename InputType> LDLT& compute(const EigenBase<InputType>& matrix); + /** \returns an estimate of the reciprocal condition number of the matrix of + * which \c *this is the LDLT decomposition. + */ + RealScalar rcond() const + { + eigen_assert(m_isInitialized && "LDLT is not initialized."); + return internal::rcond_estimate_helper(m_l1_norm, *this); + } + template <typename Derived> LDLT& rankUpdate(const MatrixBase<Derived>& w, const RealScalar& alpha=1); @@ -207,6 +216,13 @@ template<typename _MatrixType, int _UpLo> class LDLT MatrixType reconstructedMatrix() const; + /** \returns the adjoint of \c *this, that is, a const reference to the decomposition itself as the underlying matrix is self-adjoint. + * + * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as: + * \code x = decomposition.adjoint().solve(b) \endcode + */ + const LDLT& adjoint() const { return *this; }; + inline Index rows() const { return m_matrix.rows(); } inline Index cols() const { return m_matrix.cols(); } @@ -220,7 +236,7 @@ template<typename _MatrixType, int _UpLo> class LDLT eigen_assert(m_isInitialized && "LDLT is not initialized."); return Success; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template<typename RhsType, typename DstType> EIGEN_DEVICE_FUNC @@ -228,7 +244,7 @@ template<typename _MatrixType, int _UpLo> class LDLT #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); @@ -241,6 +257,7 @@ template<typename _MatrixType, int _UpLo> class LDLT * is not stored), and the diagonal entries correspond to D. */ MatrixType m_matrix; + RealScalar m_l1_norm; TranspositionType m_transpositions; TmpMatrixType m_temporary; internal::SignMatrix m_sign; @@ -266,8 +283,8 @@ template<> struct ldlt_inplace<Lower> if (size <= 1) { transpositions.setIdentity(); - if (numext::real(mat.coeff(0,0)) > 0) sign = PositiveSemiDef; - else if (numext::real(mat.coeff(0,0)) < 0) sign = NegativeSemiDef; + if (numext::real(mat.coeff(0,0)) > static_cast<RealScalar>(0) ) sign = PositiveSemiDef; + else if (numext::real(mat.coeff(0,0)) < static_cast<RealScalar>(0)) sign = NegativeSemiDef; else sign = ZeroSign; return true; } @@ -314,7 +331,7 @@ template<> struct ldlt_inplace<Lower> if(rs>0) A21.noalias() -= A20 * temp.head(k); } - + // In some previous versions of Eigen (e.g., 3.2.1), the scaling was omitted if the pivot // was smaller than the cutoff value. However, since LDLT is not rank-revealing // we should only make sure that we do not introduce INF or NaN values. @@ -324,12 +341,12 @@ template<> struct ldlt_inplace<Lower> A21 /= realAkk; if (sign == PositiveSemiDef) { - if (realAkk < 0) sign = Indefinite; + if (realAkk < static_cast<RealScalar>(0)) sign = Indefinite; } else if (sign == NegativeSemiDef) { - if (realAkk > 0) sign = Indefinite; + if (realAkk > static_cast<RealScalar>(0)) sign = Indefinite; } else if (sign == ZeroSign) { - if (realAkk > 0) sign = PositiveSemiDef; - else if (realAkk < 0) sign = NegativeSemiDef; + if (realAkk > static_cast<RealScalar>(0)) sign = PositiveSemiDef; + else if (realAkk < static_cast<RealScalar>(0)) sign = NegativeSemiDef; } } @@ -433,12 +450,25 @@ template<typename InputType> LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>& a) { check_template_parameters(); - + eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); m_matrix = a.derived(); + // Compute matrix L1 norm = max abs column sum. + m_l1_norm = RealScalar(0); + // TODO move this code to SelfAdjointView + for (Index col = 0; col < size; ++col) { + RealScalar abs_col_sum; + if (_UpLo == Lower) + abs_col_sum = m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>(); + else + abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>(); + if (abs_col_sum > m_l1_norm) + m_l1_norm = abs_col_sum; + } + m_transpositions.resize(size); m_isInitialized = false; m_temporary.resize(size); @@ -466,7 +496,7 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Deri eigen_assert(m_matrix.rows()==size); } else - { + { m_matrix.resize(size,size); m_matrix.setZero(); m_transpositions.resize(size); @@ -505,7 +535,7 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons // diagonal element is not well justified and leads to numerical issues in some cases. // Moreover, Lapack's xSYTRS routines use 0 for the tolerance. RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest(); - + for (Index i = 0; i < vecD.size(); ++i) { if(abs(vecD(i)) > tolerance) diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 74cf5bfe1..19578b216 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -10,7 +10,7 @@ #ifndef EIGEN_LLT_H #define EIGEN_LLT_H -namespace Eigen { +namespace Eigen { namespace internal{ template<typename MatrixType, int UpLo> struct LLT_Traits; @@ -40,7 +40,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits; * * Example: \include LLT_example.cpp * Output: \verbinclude LLT_example.out - * + * * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT */ /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH) @@ -135,6 +135,16 @@ template<typename _MatrixType, int _UpLo> class LLT template<typename InputType> LLT& compute(const EigenBase<InputType>& matrix); + /** \returns an estimate of the reciprocal condition number of the matrix of + * which \c *this is the Cholesky decomposition. + */ + RealScalar rcond() const + { + eigen_assert(m_isInitialized && "LLT is not initialized."); + eigen_assert(m_info == Success && "LLT failed because matrix appears to be negative"); + return internal::rcond_estimate_helper(m_l1_norm, *this); + } + /** \returns the LLT decomposition matrix * * TODO: document the storage layout @@ -159,12 +169,19 @@ template<typename _MatrixType, int _UpLo> class LLT return m_info; } + /** \returns the adjoint of \c *this, that is, a const reference to the decomposition itself as the underlying matrix is self-adjoint. + * + * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as: + * \code x = decomposition.adjoint().solve(b) \endcode + */ + const LLT& adjoint() const { return *this; }; + inline Index rows() const { return m_matrix.rows(); } inline Index cols() const { return m_matrix.cols(); } template<typename VectorType> LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1); - + #ifndef EIGEN_PARSED_BY_DOXYGEN template<typename RhsType, typename DstType> EIGEN_DEVICE_FUNC @@ -172,17 +189,18 @@ template<typename _MatrixType, int _UpLo> class LLT #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } - + /** \internal * Used to compute and store L * The strict upper part is not used and even not initialized. */ MatrixType m_matrix; + RealScalar m_l1_norm; bool m_isInitialized; ComputationInfo m_info; }; @@ -268,7 +286,7 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower> static Index unblocked(MatrixType& mat) { using std::sqrt; - + eigen_assert(mat.rows()==mat.cols()); const Index size = mat.rows(); for(Index k = 0; k < size; ++k) @@ -328,7 +346,7 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower> return Eigen::internal::llt_rank_update_lower(mat, vec, sigma); } }; - + template<typename Scalar> struct llt_inplace<Scalar, Upper> { typedef typename NumTraits<Scalar>::Real RealScalar; @@ -387,12 +405,25 @@ template<typename InputType> LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>& a) { check_template_parameters(); - + eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); m_matrix.resize(size, size); m_matrix = a.derived(); + // Compute matrix L1 norm = max abs column sum. + m_l1_norm = RealScalar(0); + // TODO move this code to SelfAdjointView + for (Index col = 0; col < size; ++col) { + RealScalar abs_col_sum; + if (_UpLo == Lower) + abs_col_sum = m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>(); + else + abs_col_sum = m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>(); + if (abs_col_sum > m_l1_norm) + m_l1_norm = abs_col_sum; + } + m_isInitialized = true; bool ok = Traits::inplace_decomposition(m_matrix); m_info = ok ? Success : NumericalIssue; @@ -419,7 +450,7 @@ LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, c return *this; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template<typename _MatrixType,int _UpLo> template<typename RhsType, typename DstType> @@ -431,7 +462,7 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const #endif /** \internal use x = llt_object.solve(x); - * + * * This is the \em in-place version of solve(). * * \param bAndX represents both the right-hand side matrix b and result x. @@ -483,7 +514,7 @@ SelfAdjointView<MatrixType, UpLo>::llt() const return LLT<PlainObject,UpLo>(m_matrix); } #endif // __CUDACC__ - + } // end namespace Eigen #endif // EIGEN_LLT_H diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index a9a524130..b1193e421 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -29,13 +29,10 @@ struct copy_using_evaluator_traits { typedef typename DstEvaluator::XprType Dst; typedef typename Dst::Scalar DstScalar; - // TODO distinguish between linear traversal and inner-traversals - typedef typename find_best_packet<DstScalar,Dst::SizeAtCompileTime>::type PacketType; enum { DstFlags = DstEvaluator::Flags, - SrcFlags = SrcEvaluator::Flags, - RequiredAlignment = unpacket_traits<PacketType>::alignment + SrcFlags = SrcEvaluator::Flags }; public: @@ -55,10 +52,25 @@ private: : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime) : int(Dst::MaxRowsAtCompileTime), OuterStride = int(outer_stride_at_compile_time<Dst>::ret), - MaxSizeAtCompileTime = Dst::SizeAtCompileTime, - PacketSize = unpacket_traits<PacketType>::size + MaxSizeAtCompileTime = Dst::SizeAtCompileTime + }; + + // TODO distinguish between linear traversal and inner-traversals + typedef typename find_best_packet<DstScalar,Dst::SizeAtCompileTime>::type LinearPacketType; + typedef typename find_best_packet<DstScalar,InnerSize>::type InnerPacketType; + + enum { + LinearPacketSize = unpacket_traits<LinearPacketType>::size, + InnerPacketSize = unpacket_traits<InnerPacketType>::size + }; + +public: + enum { + LinearRequiredAlignment = unpacket_traits<LinearPacketType>::alignment, + InnerRequiredAlignment = unpacket_traits<InnerPacketType>::alignment }; +private: enum { DstIsRowMajor = DstFlags&RowMajorBit, SrcIsRowMajor = SrcFlags&RowMajorBit, @@ -67,16 +79,16 @@ private: && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit) && (functor_traits<AssignFunc>::PacketAccess), MayInnerVectorize = MightVectorize - && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0 - && int(OuterStride)!=Dynamic && int(OuterStride)%int(PacketSize)==0 - && int(JointAlignment)>=int(RequiredAlignment), + && int(InnerSize)!=Dynamic && int(InnerSize)%int(InnerPacketSize)==0 + && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0 + && int(JointAlignment)>=int(InnerRequiredAlignment), MayLinearize = StorageOrdersAgree && (int(DstFlags) & int(SrcFlags) & LinearAccessBit), MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess - && ((int(DstAlignment)>=int(RequiredAlignment)) || MaxSizeAtCompileTime == Dynamic), + && ((int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic), /* If the destination isn't aligned, we have to do runtime checks and we don't unroll, so it's only good for large enough sizes. */ MaySliceVectorize = MightVectorize && DstHasDirectAccess - && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*PacketSize) + && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*InnerPacketSize) /* slice vectorization can be slow, so we only want it if the slices are big, which is indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block in a fixed-size matrix */ @@ -84,7 +96,8 @@ private: public: enum { - Traversal = int(MayInnerVectorize) ? int(InnerVectorizedTraversal) + Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal) + : int(MayInnerVectorize) ? int(InnerVectorizedTraversal) : int(MayLinearVectorize) ? int(LinearVectorizedTraversal) : int(MaySliceVectorize) ? int(SliceVectorizedTraversal) : int(MayLinearize) ? int(LinearTraversal) @@ -94,9 +107,14 @@ public: || int(Traversal) == SliceVectorizedTraversal }; + typedef typename conditional<int(Traversal)==LinearVectorizedTraversal, LinearPacketType, InnerPacketType>::type PacketType; + private: enum { - UnrollingLimit = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1), + ActualPacketSize = int(Traversal)==LinearVectorizedTraversal ? LinearPacketSize + : Vectorized ? InnerPacketSize + : 1, + UnrollingLimit = EIGEN_UNROLLING_LIMIT * ActualPacketSize, MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic && int(Dst::SizeAtCompileTime) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit), MayUnrollInner = int(InnerSize) != Dynamic @@ -112,7 +130,7 @@ public: : int(NoUnrolling) ) : int(Traversal) == int(LinearVectorizedTraversal) - ? ( bool(MayUnrollCompletely) && (int(DstAlignment)>=int(RequiredAlignment)) ? int(CompleteUnrolling) + ? ( bool(MayUnrollCompletely) && (int(DstAlignment)>=int(LinearRequiredAlignment)) ? int(CompleteUnrolling) : int(NoUnrolling) ) : int(Traversal) == int(LinearTraversal) ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) @@ -131,11 +149,13 @@ public: std::cerr.unsetf(std::ios::hex); EIGEN_DEBUG_VAR(DstAlignment) EIGEN_DEBUG_VAR(SrcAlignment) - EIGEN_DEBUG_VAR(RequiredAlignment) + EIGEN_DEBUG_VAR(LinearRequiredAlignment) + EIGEN_DEBUG_VAR(InnerRequiredAlignment) EIGEN_DEBUG_VAR(JointAlignment) EIGEN_DEBUG_VAR(InnerSize) EIGEN_DEBUG_VAR(InnerMaxSize) - EIGEN_DEBUG_VAR(PacketSize) + EIGEN_DEBUG_VAR(LinearPacketSize) + EIGEN_DEBUG_VAR(InnerPacketSize) EIGEN_DEBUG_VAR(StorageOrdersAgree) EIGEN_DEBUG_VAR(MightVectorize) EIGEN_DEBUG_VAR(MayLinearize) @@ -236,12 +256,13 @@ struct copy_using_evaluator_innervec_CompleteUnrolling enum { outer = Index / DstXprType::InnerSizeAtCompileTime, inner = Index % DstXprType::InnerSizeAtCompileTime, - JointAlignment = Kernel::AssignmentTraits::JointAlignment + JointAlignment = Kernel::AssignmentTraits::JointAlignment, + DefaultAlignment = unpacket_traits<PacketType>::alignment }; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) { - kernel.template assignPacketByOuterInner<Aligned, JointAlignment, PacketType>(outer, inner); + kernel.template assignPacketByOuterInner<DefaultAlignment, JointAlignment, PacketType>(outer, inner); enum { NextIndex = Index + unpacket_traits<PacketType>::size }; copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel); } @@ -257,9 +278,12 @@ template<typename Kernel, int Index_, int Stop> struct copy_using_evaluator_innervec_InnerUnrolling { typedef typename Kernel::PacketType PacketType; + enum { + DefaultAlignment = unpacket_traits<PacketType>::alignment + }; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer) { - kernel.template assignPacketByOuterInner<Aligned, Aligned, PacketType>(outer, Index_); + kernel.template assignPacketByOuterInner<DefaultAlignment, DefaultAlignment, PacketType>(outer, Index_); enum { NextIndex = Index_ + unpacket_traits<PacketType>::size }; copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop>::run(kernel, outer); } @@ -370,7 +394,7 @@ struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling> typedef typename Kernel::Scalar Scalar; typedef typename Kernel::PacketType PacketType; enum { - requestedAlignment = Kernel::AssignmentTraits::RequiredAlignment, + requestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment, packetSize = unpacket_traits<PacketType>::size, dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment), dstAlignment = packet_traits<Scalar>::AlignedOnScalar ? int(requestedAlignment) @@ -413,6 +437,9 @@ template<typename Kernel> struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling> { typedef typename Kernel::PacketType PacketType; + enum { + DefaultAlignment = unpacket_traits<PacketType>::alignment + }; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) { const Index innerSize = kernel.innerSize(); @@ -420,7 +447,7 @@ struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling> const Index packetSize = unpacket_traits<PacketType>::size; for(Index outer = 0; outer < outerSize; ++outer) for(Index inner = 0; inner < innerSize; inner+=packetSize) - kernel.template assignPacketByOuterInner<Aligned, Aligned, PacketType>(outer, inner); + kernel.template assignPacketByOuterInner<DefaultAlignment, DefaultAlignment, PacketType>(outer, inner); } }; @@ -484,7 +511,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling> typedef typename Kernel::PacketType PacketType; enum { packetSize = unpacket_traits<PacketType>::size, - requestedAlignment = int(Kernel::AssignmentTraits::RequiredAlignment), + requestedAlignment = int(Kernel::AssignmentTraits::InnerRequiredAlignment), alignable = packet_traits<Scalar>::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar), dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment), dstAlignment = alignable ? int(requestedAlignment) @@ -788,8 +815,8 @@ template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, con template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar> struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar> { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -806,8 +833,8 @@ struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar> template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar> struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Scalar> { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/) + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); src.evalTo(dst); diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h new file mode 100644 index 000000000..68c5e918e --- /dev/null +++ b/Eigen/src/Core/ConditionEstimator.h @@ -0,0 +1,166 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Rasmus Munk Larsen (rmlarsen@google.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CONDITIONESTIMATOR_H +#define EIGEN_CONDITIONESTIMATOR_H + +namespace Eigen { + +namespace internal { + +template <typename Vector, typename RealVector, bool IsComplex> +struct rcond_compute_sign { + static inline Vector run(const Vector& v) { + const RealVector v_abs = v.cwiseAbs(); + return (v_abs.array() == static_cast<typename Vector::RealScalar>(0)) + .select(Vector::Ones(v.size()), v.cwiseQuotient(v_abs)); + } +}; + +// Partial specialization to avoid elementwise division for real vectors. +template <typename Vector> +struct rcond_compute_sign<Vector, Vector, false> { + static inline Vector run(const Vector& v) { + return (v.array() < static_cast<typename Vector::RealScalar>(0)) + .select(-Vector::Ones(v.size()), Vector::Ones(v.size())); + } +}; + +/** \brief Reciprocal condition number estimator. + * + * Computing a decomposition of a dense matrix takes O(n^3) operations, while + * this method estimates the condition number quickly and reliably in O(n^2) + * operations. + * + * \returns an estimate of the reciprocal condition number + * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given ||matrix||_1 and + * its decomposition. Supports the following decompositions: FullPivLU, + * PartialPivLU, LDLT, and LLT. + * + * \sa FullPivLU, PartialPivLU, LDLT, LLT. + */ +template <typename Decomposition> +typename Decomposition::RealScalar +rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Decomposition& dec) +{ + typedef typename Decomposition::RealScalar RealScalar; + eigen_assert(dec.rows() == dec.cols()); + if (dec.rows() == 0) return RealScalar(1); + if (matrix_norm == RealScalar(0)) return RealScalar(0); + if (dec.rows() == 1) return RealScalar(1); + const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec); + return (inverse_matrix_norm == RealScalar(0) ? RealScalar(0) + : (RealScalar(1) / inverse_matrix_norm) / matrix_norm); +} + +/** + * \returns an estimate of ||inv(matrix)||_1 given a decomposition of + * \a matrix that implements .solve() and .adjoint().solve() methods. + * + * This function implements Algorithms 4.1 and 5.1 from + * http://www.maths.manchester.ac.uk/~higham/narep/narep135.pdf + * which also forms the basis for the condition number estimators in + * LAPACK. Since at most 10 calls to the solve method of dec are + * performed, the total cost is O(dims^2), as opposed to O(dims^3) + * needed to compute the inverse matrix explicitly. + * + * The most common usage is in estimating the condition number + * ||matrix||_1 * ||inv(matrix)||_1. The first term ||matrix||_1 can be + * computed directly in O(n^2) operations. + * + * Supports the following decompositions: FullPivLU, PartialPivLU, LDLT, and + * LLT. + * + * \sa FullPivLU, PartialPivLU, LDLT, LLT. + */ +template <typename Decomposition> +typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomposition& dec) +{ + typedef typename Decomposition::MatrixType MatrixType; + typedef typename Decomposition::Scalar Scalar; + typedef typename Decomposition::RealScalar RealScalar; + typedef typename internal::plain_col_type<MatrixType>::type Vector; + typedef typename internal::plain_col_type<MatrixType, RealScalar>::type RealVector; + const bool is_complex = (NumTraits<Scalar>::IsComplex != 0); + + eigen_assert(dec.rows() == dec.cols()); + const Index n = dec.rows(); + if (n == 0) + return 0; + + Vector v = dec.solve(Vector::Ones(n) / Scalar(n)); + + // lower_bound is a lower bound on + // ||inv(matrix)||_1 = sup_v ||inv(matrix) v||_1 / ||v||_1 + // and is the objective maximized by the ("super-") gradient ascent + // algorithm below. + RealScalar lower_bound = v.template lpNorm<1>(); + if (n == 1) + return lower_bound; + + // Gradient ascent algorithm follows: We know that the optimum is achieved at + // one of the simplices v = e_i, so in each iteration we follow a + // super-gradient to move towards the optimal one. + RealScalar old_lower_bound = lower_bound; + Vector sign_vector(n); + Vector old_sign_vector; + Index v_max_abs_index = -1; + Index old_v_max_abs_index = v_max_abs_index; + for (int k = 0; k < 4; ++k) + { + sign_vector = internal::rcond_compute_sign<Vector, RealVector, is_complex>::run(v); + if (k > 0 && !is_complex && sign_vector == old_sign_vector) { + // Break if the solution stagnated. + break; + } + // v_max_abs_index = argmax |real( inv(matrix)^T * sign_vector )| + v = dec.adjoint().solve(sign_vector); + v.real().cwiseAbs().maxCoeff(&v_max_abs_index); + if (v_max_abs_index == old_v_max_abs_index) { + // Break if the solution stagnated. + break; + } + // Move to the new simplex e_j, where j = v_max_abs_index. + v = dec.solve(Vector::Unit(n, v_max_abs_index)); // v = inv(matrix) * e_j. + lower_bound = v.template lpNorm<1>(); + if (lower_bound <= old_lower_bound) { + // Break if the gradient step did not increase the lower_bound. + break; + } + if (!is_complex) { + old_sign_vector = sign_vector; + } + old_v_max_abs_index = v_max_abs_index; + old_lower_bound = lower_bound; + } + // The following calculates an independent estimate of ||matrix||_1 by + // multiplying matrix by a vector with entries of slowly increasing + // magnitude and alternating sign: + // v_i = (-1)^{i} (1 + (i / (dim-1))), i = 0,...,dim-1. + // This improvement to Hager's algorithm above is due to Higham. It was + // added to make the algorithm more robust in certain corner cases where + // large elements in the matrix might otherwise escape detection due to + // exact cancellation (especially when op and op_adjoint correspond to a + // sequence of backsubstitutions and permutations), which could cause + // Hager's algorithm to vastly underestimate ||matrix||_1. + Scalar alternating_sign(RealScalar(1)); + for (Index i = 0; i < n; ++i) { + v[i] = alternating_sign * (RealScalar(1) + (RealScalar(i) / (RealScalar(n - 1)))); + alternating_sign = -alternating_sign; + } + v = dec.solve(v); + const RealScalar alternate_lower_bound = (2 * v.template lpNorm<1>()) / (3 * RealScalar(n)); + return numext::maxi(lower_bound, alternate_lower_bound); +} + +} // namespace internal + +} // namespace Eigen + +#endif diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 388805f0d..932178f53 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -850,14 +850,14 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa template<int StoreMode, typename PacketType> EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) - { + { return m_argImpl.template writePacket<StoreMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col, x); } template<int StoreMode, typename PacketType> EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) - { + { return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0, x); diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 53f934999..f7c5f4276 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -81,6 +81,8 @@ public: * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */ // FIXME I'm not sure the current mapping is the ideal one. template<int M, int N> struct product_type_selector<M,N,1> { enum { ret = OuterProduct }; }; +template<int M> struct product_type_selector<M, 1, 1> { enum { ret = LazyCoeffBasedProductMode }; }; +template<int N> struct product_type_selector<1, N, 1> { enum { ret = LazyCoeffBasedProductMode }; }; template<int Depth> struct product_type_selector<1, 1, Depth> { enum { ret = InnerProduct }; }; template<> struct product_type_selector<1, 1, 1> { enum { ret = InnerProduct }; }; template<> struct product_type_selector<Small,1, Small> { enum { ret = CoeffBasedProductMode }; }; diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 6ff61c18a..001c2ffbf 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -62,7 +62,7 @@ struct default_packet_traits HasRsqrt = 0, HasExp = 0, HasLog = 0, - HasLog10 = 0, + HasLog10 = 0, HasPow = 0, HasSin = 0, @@ -71,9 +71,9 @@ struct default_packet_traits HasASin = 0, HasACos = 0, HasATan = 0, - HasSinh = 0, - HasCosh = 0, - HasTanh = 0, + HasSinh = 0, + HasCosh = 0, + HasTanh = 0, HasLGamma = 0, HasDiGamma = 0, HasZeta = 0, diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index fd73f543b..f31046b54 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -705,12 +705,12 @@ typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>: isfinite_impl(const T& x) { #ifdef __CUDA_ARCH__ - return (isfinite)(x); + return (::isfinite)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isfinite; return isfinite EIGEN_NOT_A_MACRO (x); #else - return x<NumTraits<T>::highest() && x>NumTraits<T>::lowest(); + return x<=NumTraits<T>::highest() && x>=NumTraits<T>::lowest(); #endif } @@ -720,7 +720,7 @@ typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>: isinf_impl(const T& x) { #ifdef __CUDA_ARCH__ - return (isinf)(x); + return (::isinf)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isinf; return isinf EIGEN_NOT_A_MACRO (x); @@ -735,7 +735,7 @@ typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>: isnan_impl(const T& x) { #ifdef __CUDA_ARCH__ - return (isnan)(x); + return (::isnan)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isnan; return isnan EIGEN_NOT_A_MACRO (x); @@ -1025,6 +1025,66 @@ double log(const double &x) { return ::log(x); } template<typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +typename NumTraits<T>::Real abs(const T &x) { + EIGEN_USING_STD_MATH(abs); + return abs(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float abs(const float &x) { return ::fabsf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double abs(const double &x) { return ::fabs(x); } +#endif + +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T exp(const T &x) { + EIGEN_USING_STD_MATH(exp); + return exp(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float exp(const float &x) { return ::expf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double exp(const double &x) { return ::exp(x); } +#endif + +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T cos(const T &x) { + EIGEN_USING_STD_MATH(cos); + return cos(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float cos(const float &x) { return ::cosf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double cos(const double &x) { return ::cos(x); } +#endif + +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T sin(const T &x) { + EIGEN_USING_STD_MATH(sin); + return sin(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float sin(const float &x) { return ::sinf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double sin(const double &x) { return ::sin(x); } +#endif + +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tan(const T &x) { EIGEN_USING_STD_MATH(tan); return tan(x); @@ -1040,39 +1100,99 @@ double tan(const double &x) { return ::tan(x); } template<typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -typename NumTraits<T>::Real abs(const T &x) { - EIGEN_USING_STD_MATH(abs); - return abs(x); +T acos(const T &x) { + EIGEN_USING_STD_MATH(acos); + return acos(x); } #ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -float abs(const float &x) { return ::fabsf(x); } +float acos(const float &x) { return ::acosf(x); } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -double abs(const double &x) { return ::fabs(x); } +double acos(const double &x) { return ::acos(x); } #endif template<typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T exp(const T &x) { - EIGEN_USING_STD_MATH(exp); - return exp(x); +T asin(const T &x) { + EIGEN_USING_STD_MATH(asin); + return asin(x); } #ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -float exp(const float &x) { return ::expf(x); } +float asin(const float &x) { return ::asinf(x); } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -double exp(const double &x) { return ::exp(x); } +double asin(const double &x) { return ::asin(x); } +#endif + +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T atan(const T &x) { + EIGEN_USING_STD_MATH(atan); + return atan(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float atan(const float &x) { return ::atanf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double atan(const double &x) { return ::atan(x); } #endif +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T cosh(const T &x) { + EIGEN_USING_STD_MATH(cosh); + return cosh(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float cosh(const float &x) { return ::coshf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double cosh(const double &x) { return ::cosh(x); } +#endif + +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T sinh(const T &x) { + EIGEN_USING_STD_MATH(sinh); + return sinh(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float sinh(const float &x) { return ::sinhf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double sinh(const double &x) { return ::sinh(x); } +#endif + +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T tanh(const T &x) { + EIGEN_USING_STD_MATH(tanh); + return tanh(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float tanh(const float &x) { return ::tanhf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double tanh(const double &x) { return ::tanh(x); } +#endif + template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T fmod(const T& a, const T& b) { - EIGEN_USING_STD_MATH(floor); + EIGEN_USING_STD_MATH(fmod); return fmod(a, b); } @@ -1128,14 +1248,12 @@ struct scalar_fuzzy_default_impl<Scalar, false, false> template<typename OtherScalar> EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec) { - EIGEN_USING_STD_MATH(abs); - return abs(x) <= abs(y) * prec; + return numext::abs(x) <= numext::abs(y) * prec; } EIGEN_DEVICE_FUNC static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec) { - EIGEN_USING_STD_MATH(abs); - return abs(x - y) <= numext::mini(abs(x), abs(y)) * prec; + return numext::abs(x - y) <= numext::mini(numext::abs(x), numext::abs(y)) * prec; } EIGEN_DEVICE_FUNC static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar& prec) diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 3ce86e8cd..d9fd888cf 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -410,8 +410,6 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, typedef Product<Lhs, Rhs, LazyProduct> XprType; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketScalar PacketScalar; - typedef typename XprType::PacketReturnType PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit product_evaluator(const XprType& xpr) @@ -437,16 +435,20 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, typedef evaluator<LhsNestedCleaned> LhsEtorType; typedef evaluator<RhsNestedCleaned> RhsEtorType; - + enum { RowsAtCompileTime = LhsNestedCleaned::RowsAtCompileTime, ColsAtCompileTime = RhsNestedCleaned::ColsAtCompileTime, InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsNestedCleaned::ColsAtCompileTime, RhsNestedCleaned::RowsAtCompileTime), MaxRowsAtCompileTime = LhsNestedCleaned::MaxRowsAtCompileTime, - MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime, - - PacketSize = packet_traits<Scalar>::size, + MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime + }; + + typedef typename find_best_packet<Scalar,RowsAtCompileTime>::type LhsVecPacketType; + typedef typename find_best_packet<Scalar,ColsAtCompileTime>::type RhsVecPacketType; + enum { + LhsCoeffReadCost = LhsEtorType::CoeffReadCost, RhsCoeffReadCost = RhsEtorType::CoeffReadCost, CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost @@ -459,19 +461,23 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, LhsFlags = LhsEtorType::Flags, RhsFlags = RhsEtorType::Flags, - LhsAlignment = LhsEtorType::Alignment, - RhsAlignment = RhsEtorType::Alignment, - LhsRowMajor = LhsFlags & RowMajorBit, RhsRowMajor = RhsFlags & RowMajorBit, + + LhsVecPacketSize = unpacket_traits<LhsVecPacketType>::size, + RhsVecPacketSize = unpacket_traits<RhsVecPacketType>::size, + + // Here, we don't care about alignment larger than the usable packet size. + LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))), + RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))), SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value, CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit) - && (ColsAtCompileTime == Dynamic || ((ColsAtCompileTime % PacketSize) == 0) ), + && (ColsAtCompileTime == Dynamic || ((ColsAtCompileTime % RhsVecPacketSize) == 0) ), CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) - && (RowsAtCompileTime == Dynamic || ((RowsAtCompileTime % PacketSize) == 0) ), + && (RowsAtCompileTime == Dynamic || ((RowsAtCompileTime % LhsVecPacketSize) == 0) ), EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1 : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0 @@ -491,10 +497,10 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, : 0, /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside - * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner - * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect - * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI. - */ + * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner + * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect + * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI. + */ CanVectorizeInner = SameType && LhsRowMajor && (!RhsRowMajor) @@ -1000,7 +1006,7 @@ struct transposition_matrix_product const Index size = tr.size(); StorageIndex j = 0; - if(!(is_same<MatrixTypeCleaned,Dest>::value && extract_data(dst) == extract_data(mat))) + if(!is_same_dense(dst,mat)) dst = mat; for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k) diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index d170cae29..98b2fd868 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -27,8 +27,9 @@ template<typename Func, typename Derived> struct redux_traits { public: + typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType; enum { - PacketSize = packet_traits<typename Derived::Scalar>::size, + PacketSize = unpacket_traits<PacketType>::size, InnerMaxSize = int(Derived::IsRowMajor) ? Derived::MaxColsAtCompileTime : Derived::MaxRowsAtCompileTime @@ -137,12 +138,12 @@ template<typename Func, typename Derived, int Start, int Length> struct redux_vec_unroller { enum { - PacketSize = packet_traits<typename Derived::Scalar>::size, + PacketSize = redux_traits<Func, Derived>::PacketSize, HalfLength = Length/2 }; typedef typename Derived::Scalar Scalar; - typedef typename packet_traits<Scalar>::type PacketScalar; + typedef typename redux_traits<Func, Derived>::PacketType PacketScalar; static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func) { @@ -156,14 +157,14 @@ template<typename Func, typename Derived, int Start> struct redux_vec_unroller<Func, Derived, Start, 1> { enum { - index = Start * packet_traits<typename Derived::Scalar>::size, + index = Start * redux_traits<Func, Derived>::PacketSize, outer = index / int(Derived::InnerSizeAtCompileTime), inner = index % int(Derived::InnerSizeAtCompileTime), alignment = Derived::Alignment }; typedef typename Derived::Scalar Scalar; - typedef typename packet_traits<Scalar>::type PacketScalar; + typedef typename redux_traits<Func, Derived>::PacketType PacketScalar; static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&) { @@ -209,13 +210,13 @@ template<typename Func, typename Derived> struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling> { typedef typename Derived::Scalar Scalar; - typedef typename packet_traits<Scalar>::type PacketScalar; + typedef typename redux_traits<Func, Derived>::PacketType PacketScalar; static Scalar run(const Derived &mat, const Func& func) { const Index size = mat.size(); - const Index packetSize = packet_traits<Scalar>::size; + const Index packetSize = redux_traits<Func, Derived>::PacketSize; const int packetAlignment = unpacket_traits<PacketScalar>::alignment; enum { alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned), @@ -268,7 +269,7 @@ template<typename Func, typename Derived, int Unrolling> struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling> { typedef typename Derived::Scalar Scalar; - typedef typename packet_traits<Scalar>::type PacketType; + typedef typename redux_traits<Func, Derived>::PacketType PacketType; EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func) { @@ -276,7 +277,7 @@ struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling> const Index innerSize = mat.innerSize(); const Index outerSize = mat.outerSize(); enum { - packetSize = packet_traits<Scalar>::size + packetSize = redux_traits<Func, Derived>::PacketSize }; const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize; Scalar res; @@ -306,9 +307,10 @@ template<typename Func, typename Derived> struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling> { typedef typename Derived::Scalar Scalar; - typedef typename packet_traits<Scalar>::type PacketScalar; + + typedef typename redux_traits<Func, Derived>::PacketType PacketScalar; enum { - PacketSize = packet_traits<Scalar>::size, + PacketSize = redux_traits<Func, Derived>::PacketSize, Size = Derived::SizeAtCompileTime, VectorizedSize = (Size / PacketSize) * PacketSize }; @@ -367,11 +369,11 @@ public: { return m_evaluator.coeff(index); } template<int LoadMode, typename PacketType> - PacketReturnType packet(Index row, Index col) const + PacketType packet(Index row, Index col) const { return m_evaluator.template packet<LoadMode,PacketType>(row, col); } template<int LoadMode, typename PacketType> - PacketReturnType packet(Index index) const + PacketType packet(Index index) const { return m_evaluator.template packet<LoadMode,PacketType>(index); } EIGEN_DEVICE_FUNC @@ -379,7 +381,7 @@ public: { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } template<int LoadMode, typename PacketType> - PacketReturnType packetByOuterInner(Index outer, Index inner) const + PacketType packetByOuterInner(Index outer, Index inner) const { return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } const XprType & nestedExpression() const { return m_xpr; } diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h index 5a2010449..a33356423 100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h @@ -213,7 +213,7 @@ template<int Side, typename TriangularType, typename Rhs> struct triangular_solv template<typename Dest> inline void evalTo(Dest& dst) const { - if(!(is_same<RhsNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_rhs))) + if(!is_same_dense(dst,m_rhs)) dst = m_rhs; m_triangularMatrix.template solveInPlace<Side>(dst); } diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 2a0a6ff15..c6a50bb1d 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -79,8 +79,8 @@ namespace cephes { */ template <typename Scalar, int N> struct polevl { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static Scalar run(const Scalar x, const Scalar coef[]) { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar x, const Scalar coef[]) { EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); return polevl<Scalar, N - 1>::run(x, coef) * x + coef[N]; @@ -89,8 +89,8 @@ struct polevl { template <typename Scalar> struct polevl<Scalar, 0> { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static Scalar run(const Scalar, const Scalar coef[]) { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar, const Scalar coef[]) { return coef[0]; } }; @@ -144,7 +144,7 @@ struct digamma_retval { template <typename Scalar> struct digamma_impl { EIGEN_DEVICE_FUNC - static Scalar run(Scalar x) { + static EIGEN_STRONG_INLINE Scalar run(Scalar x) { EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), THIS_TYPE_IS_NOT_SUPPORTED); return Scalar(0); @@ -281,20 +281,18 @@ struct digamma_impl { */ Scalar p, q, nz, s, w, y; - bool negative; + bool negative = false; const Scalar maxnum = NumTraits<Scalar>::infinity(); - const Scalar m_pi = EIGEN_PI; - - negative = 0; - nz = 0.0; + const Scalar m_pi = Scalar(EIGEN_PI); - const Scalar zero = 0.0; - const Scalar one = 1.0; - const Scalar half = 0.5; + const Scalar zero = Scalar(0); + const Scalar one = Scalar(1); + const Scalar half = Scalar(0.5); + nz = zero; if (x <= zero) { - negative = one; + negative = true; q = x; p = numext::floor(q); if (p == q) { @@ -428,33 +426,33 @@ template <typename Scalar> struct igamma_impl; // predeclare igamma_impl template <typename Scalar> struct igamma_helper { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static Scalar big() { assert(false && "big not supported for this type"); return 0.0; } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar big() { assert(false && "big not supported for this type"); return 0.0; } }; template <> struct igamma_helper<float> { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static float machep() { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float machep() { return NumTraits<float>::epsilon() / 2; // 1.0 - machep == 1.0 } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static float big() { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float big() { // use epsneg (1.0 - epsneg == 1.0) - return 1.0 / (NumTraits<float>::epsilon() / 2); + return 1.0f / (NumTraits<float>::epsilon() / 2); } }; template <> struct igamma_helper<double> { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static double machep() { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double machep() { return NumTraits<double>::epsilon() / 2; // 1.0 - machep == 1.0 } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static double big() { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double big() { return 1.0 / NumTraits<double>::epsilon(); } }; @@ -463,7 +461,7 @@ template <typename Scalar> struct igammac_impl { EIGEN_DEVICE_FUNC static Scalar run(Scalar a, Scalar x) { - /* igamc() + /* igamc() * * Incomplete gamma integral (modified for Eigen) * @@ -519,26 +517,51 @@ struct igammac_impl { */ const Scalar zero = 0; const Scalar one = 1; + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); + + if ((x < zero) || (a <= zero)) { + // domain error + return nan; + } + + if ((x < one) || (x < a)) { + /* The checks above ensure that we meet the preconditions for + * igamma_impl::Impl(), so call it, rather than igamma_impl::Run(). + * Calling Run() would also work, but in that case the compiler may not be + * able to prove that igammac_impl::Run and igamma_impl::Run are not + * mutually recursive. This leads to worse code, particularly on + * platforms like nvptx, where recursion is allowed only begrudgingly. + */ + return (one - igamma_impl<Scalar>::Impl(a, x)); + } + + return Impl(a, x); + } + + private: + /* igamma_impl calls igammac_impl::Impl. */ + friend struct igamma_impl<Scalar>; + + /* Actually computes igamc(a, x). + * + * Preconditions: + * a > 0 + * x >= 1 + * x >= a + */ + EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) { + const Scalar zero = 0; + const Scalar one = 1; const Scalar two = 2; const Scalar machep = igamma_helper<Scalar>::machep(); const Scalar maxlog = numext::log(NumTraits<Scalar>::highest()); const Scalar big = igamma_helper<Scalar>::big(); const Scalar biginv = 1 / big; - const Scalar nan = NumTraits<Scalar>::quiet_NaN(); const Scalar inf = NumTraits<Scalar>::infinity(); Scalar ans, ax, c, yc, r, t, y, z; Scalar pk, pkm1, pkm2, qk, qkm1, qkm2; - if ((x < zero) || ( a <= zero)) { - // domain error - return nan; - } - - if ((x < one) || (x < a)) { - return (one - igamma_impl<Scalar>::run(a, x)); - } - if (x == inf) return zero; // std::isinf crashes on CUDA /* Compute x**a * exp(-x) / gamma(a) */ @@ -605,7 +628,7 @@ struct igamma_retval { template <typename Scalar> struct igamma_impl { EIGEN_DEVICE_FUNC - static Scalar run(Scalar a, Scalar x) { + static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar x) { EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), THIS_TYPE_IS_NOT_SUPPORTED); return Scalar(0); @@ -618,7 +641,7 @@ template <typename Scalar> struct igamma_impl { EIGEN_DEVICE_FUNC static Scalar run(Scalar a, Scalar x) { - /* igam() + /* igam() * Incomplete gamma integral * * @@ -680,22 +703,47 @@ struct igamma_impl { */ const Scalar zero = 0; const Scalar one = 1; - const Scalar machep = igamma_helper<Scalar>::machep(); - const Scalar maxlog = numext::log(NumTraits<Scalar>::highest()); const Scalar nan = NumTraits<Scalar>::quiet_NaN(); - double ans, ax, c, r; - if (x == zero) return zero; - if ((x < zero) || ( a <= zero)) { // domain error + if ((x < zero) || (a <= zero)) { // domain error return nan; } if ((x > one) && (x > a)) { - return (one - igammac_impl<Scalar>::run(a, x)); + /* The checks above ensure that we meet the preconditions for + * igammac_impl::Impl(), so call it, rather than igammac_impl::Run(). + * Calling Run() would also work, but in that case the compiler may not be + * able to prove that igammac_impl::Run and igamma_impl::Run are not + * mutually recursive. This leads to worse code, particularly on + * platforms like nvptx, where recursion is allowed only begrudgingly. + */ + return (one - igammac_impl<Scalar>::Impl(a, x)); } + return Impl(a, x); + } + + private: + /* igammac_impl calls igamma_impl::Impl. */ + friend struct igammac_impl<Scalar>; + + /* Actually computes igam(a, x). + * + * Preconditions: + * x > 0 + * a > 0 + * !(x > 1 && x > a) + */ + EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) { + const Scalar zero = 0; + const Scalar one = 1; + const Scalar machep = igamma_helper<Scalar>::machep(); + const Scalar maxlog = numext::log(NumTraits<Scalar>::highest()); + + Scalar ans, ax, c, r; + /* Compute x**a * exp(-x) / gamma(a) */ ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a); if (ax < -maxlog) { @@ -736,7 +784,7 @@ struct zeta_retval { template <typename Scalar> struct zeta_impl { EIGEN_DEVICE_FUNC - static Scalar run(Scalar x, Scalar q) { + static EIGEN_STRONG_INLINE Scalar run(Scalar x, Scalar q) { EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), THIS_TYPE_IS_NOT_SUPPORTED); return Scalar(0); @@ -757,8 +805,8 @@ struct zeta_impl_series { template <> struct zeta_impl_series<float> { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static bool run(float& a, float& b, float& s, const float x, const float machep) { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE bool run(float& a, float& b, float& s, const float x, const float machep) { int i = 0; while(i < 9) { @@ -777,8 +825,8 @@ struct zeta_impl_series<float> { template <> struct zeta_impl_series<double> { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - static bool run(double& a, double& b, double& s, const double x, const double machep) { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE bool run(double& a, double& b, double& s, const double x, const double machep) { int i = 0; while( (i < 9) || (a <= 9.0) ) { @@ -881,13 +929,14 @@ struct zeta_impl { const Scalar maxnum = NumTraits<Scalar>::infinity(); const Scalar zero = 0.0, half = 0.5, one = 1.0; const Scalar machep = igamma_helper<Scalar>::machep(); + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); if( x == one ) return maxnum; if( x < one ) { - return zero; + return nan; } if( q <= zero ) @@ -899,7 +948,7 @@ struct zeta_impl { p = x; r = numext::floor(p); if (p != r) - return zero; + return nan; } /* Permit negative q but continue sum until n+q > +9 . @@ -954,7 +1003,7 @@ struct polygamma_retval { template <typename Scalar> struct polygamma_impl { EIGEN_DEVICE_FUNC - static Scalar run(Scalar n, Scalar x) { + static EIGEN_STRONG_INLINE Scalar run(Scalar n, Scalar x) { EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), THIS_TYPE_IS_NOT_SUPPORTED); return Scalar(0); @@ -969,9 +1018,14 @@ struct polygamma_impl { static Scalar run(Scalar n, Scalar x) { Scalar zero = 0.0, one = 1.0; Scalar nplus = n + one; + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); + // Check that n is an integer + if (numext::floor(n) != n) { + return nan; + } // Just return the digamma function for n = 1 - if (n == zero) { + else if (n == zero) { return digamma_impl<Scalar>::run(x); } // Use the same implementation as scipy diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h index 7fe39808b..d2fe1e199 100644 --- a/Eigen/src/Core/StableNorm.h +++ b/Eigen/src/Core/StableNorm.h @@ -168,11 +168,12 @@ MatrixBase<Derived>::stableNorm() const DerivedCopy copy(derived()); enum { - CanAlign = (int(Flags)&DirectAccessBit) || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME + CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit) + || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough + ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT) // ifwe cannot allocate on the stack, then let's not bother about this optimization }; typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>, - typename DerivedCopyClean - ::ConstSegmentReturnType>::type SegmentWrapper; + typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper; Index n = size(); if(n==1) diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index e6d137e40..5c5e5028e 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -532,7 +532,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat template<typename RhsType, typename DstType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _solve_impl(const RhsType &rhs, DstType &dst) const { - if(!(internal::is_same<RhsType,DstType>::value && internal::extract_data(dst) == internal::extract_data(rhs))) + if(!internal::is_same_dense(dst,rhs)) dst = rhs; this->solveInPlace(dst); } diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 0a3b301bf..a63b20318 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -46,6 +46,8 @@ // Make our own __half definition that is similar to CUDA's. struct __half { + __half() {} + explicit __half(unsigned short raw) : x(raw) {} unsigned short x; }; @@ -70,12 +72,18 @@ struct half : public __half { explicit EIGEN_DEVICE_FUNC half(bool b) : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {} + explicit EIGEN_DEVICE_FUNC half(unsigned int ui) + : __half(internal::float_to_half_rtne(static_cast<float>(ui))) {} explicit EIGEN_DEVICE_FUNC half(int i) : __half(internal::float_to_half_rtne(static_cast<float>(i))) {} + explicit EIGEN_DEVICE_FUNC half(unsigned long ul) + : __half(internal::float_to_half_rtne(static_cast<float>(ul))) {} explicit EIGEN_DEVICE_FUNC half(long l) : __half(internal::float_to_half_rtne(static_cast<float>(l))) {} explicit EIGEN_DEVICE_FUNC half(long long ll) : __half(internal::float_to_half_rtne(static_cast<float>(ll))) {} + explicit EIGEN_DEVICE_FUNC half(unsigned long long ull) + : __half(internal::float_to_half_rtne(static_cast<float>(ull))) {} explicit EIGEN_DEVICE_FUNC half(float f) : __half(internal::float_to_half_rtne(f)) {} explicit EIGEN_DEVICE_FUNC half(double d) @@ -286,7 +294,8 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) const FP32 f16max = { (127 + 16) << 23 }; const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; unsigned int sign_mask = 0x80000000u; - __half o = { 0 }; + __half o; + o.x = static_cast<unsigned short>(0x0u); unsigned int sign = f.u & sign_mask; f.u ^= sign; @@ -366,13 +375,22 @@ template<> struct is_arithmetic<half> { enum { value = true }; }; template<> struct NumTraits<Eigen::half> : GenericNumTraits<Eigen::half> { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float dummy_precision() { return 1e-3f; } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() { + return internal::raw_uint16_to_half(0x0800); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return half(1e-2f); } EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() { return internal::raw_uint16_to_half(0x7bff); } EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() { return internal::raw_uint16_to_half(0xfbff); } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() { + return internal::raw_uint16_to_half(0x7c00); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() { + return internal::raw_uint16_to_half(0x7c01); + } }; // Infinity/NaN checks. @@ -392,6 +410,7 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const Eigen::half& a) { return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a); } + template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) { Eigen::half result; result.x = a.x & 0x7FFF; @@ -406,6 +425,21 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::ha template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrt(const Eigen::half& a) { return Eigen::half(::sqrtf(float(a))); } +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half pow(const Eigen::half& a, const Eigen::half& b) { + return Eigen::half(::powf(float(a), float(b))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sin(const Eigen::half& a) { + return Eigen::half(::sinf(float(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half cos(const Eigen::half& a) { + return Eigen::half(::cosf(float(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half tan(const Eigen::half& a) { + return Eigen::half(::tanf(float(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half tanh(const Eigen::half& a) { + return Eigen::half(::tanhf(float(a))); +} template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen::half& a) { return Eigen::half(::floorf(float(a))); } @@ -413,6 +447,51 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceil(const Eigen::h return Eigen::half(::ceilf(float(a))); } +template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half mini(const Eigen::half& a, const Eigen::half& b) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hlt(b, a) ? b : a; +#else + const float f1 = static_cast<float>(a); + const float f2 = static_cast<float>(b); + return f2 < f1 ? b : a; +#endif +} +template <> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half maxi(const Eigen::half& a, const Eigen::half& b) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hlt(a, b) ? b : a; +#else + const float f1 = static_cast<float>(a); + const float f2 = static_cast<float>(b); + return f1 < f2 ? b : a; +#endif +} + +#ifdef EIGEN_HAS_C99_MATH +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) { + return Eigen::half(Eigen::numext::lgamma(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(const Eigen::half& a) { + return Eigen::half(Eigen::numext::digamma(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x, const Eigen::half& q) { + return Eigen::half(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(const Eigen::half& n, const Eigen::half& x) { + return Eigen::half(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) { + return Eigen::half(Eigen::numext::erf(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) { + return Eigen::half(Eigen::numext::erfc(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) { + return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) { + return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x))); +} +#endif } // end namespace numext } // end namespace Eigen @@ -432,6 +511,9 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) { return Eigen::half(::sqrtf(float(a))); } +static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) { + return Eigen::half(::powf(float(a), float(b))); +} static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) { return Eigen::half(::floorf(float(a))); } @@ -451,6 +533,11 @@ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isfinite)(const Eigen::half& a namespace std { +EIGEN_ALWAYS_INLINE ostream& operator << (ostream& os, const Eigen::half& v) { + os << static_cast<float>(v); + return os; +} + #if __cplusplus > 199711L template <> struct hash<Eigen::half> { diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 14f0c9415..0cebc1017 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -17,6 +17,7 @@ // we'll use on the host side (SSE, AVX, ...) #if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +// Most of the following operations require arch >= 3.0 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 namespace Eigen { @@ -33,14 +34,7 @@ template<> struct packet_traits<half> : default_packet_traits AlignedOnScalar = 1, size=2, HasHalfPacket = 0, - - HasDiv = 1, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - - HasBlend = 0, + HasDiv = 1 }; }; @@ -73,9 +67,9 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, co } template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) { #if __CUDA_ARCH__ >= 320 - return __ldg((const half2*)from); + return __ldg((const half2*)from); #else return __halves2half2(*(from+0), *(from+1)); #endif @@ -84,7 +78,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) { #if __CUDA_ARCH__ >= 320 - return __halves2half2(__ldg(from+0), __ldg(from+1)); + return __halves2half2(__ldg(from+0), __ldg(from+1)); #else return __halves2half2(*(from+0), *(from+1)); #endif @@ -120,33 +114,84 @@ ptranspose(PacketBlock<half2,2>& kernel) { kernel.packet[1] = __halves2half2(a2, b2); } -// The following operations require arch >= 5.3 -#if __CUDA_ARCH__ >= 530 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const half& a) { +#if __CUDA_ARCH__ >= 530 return __halves2half2(a, __hadd(a, __float2half(1.0f))); +#else + float f = __half2float(a) + 1.0f; + return __halves2half2(a, __float2half(f)); +#endif } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) { +#if __CUDA_ARCH__ >= 530 return __hadd2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 + b1; + float r2 = a2 + b2; + return __floats2half2_rn(r1, r2); +#endif } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) { +#if __CUDA_ARCH__ >= 530 return __hsub2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 - b1; + float r2 = a2 - b2; + return __floats2half2_rn(r1, r2); +#endif } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { +#if __CUDA_ARCH__ >= 530 return __hneg2(a); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return __floats2half2_rn(-a1, -a2); +#endif } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) { +#if __CUDA_ARCH__ >= 530 return __hmul2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 * b1; + float r2 = a2 * b2; + return __floats2half2_rn(r1, r2); +#endif } - template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) { +#if __CUDA_ARCH__ >= 530 return __hfma2(a, b, c); - } +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float c1 = __low2float(c); + float c2 = __high2float(c); + float r1 = a1 * b1 + c2; + float r2 = a2 * b2 + c2; + return __floats2half2_rn(r1, r2); +#endif +} template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) { float a1 = __low2float(a); @@ -179,25 +224,48 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& } template<> EIGEN_DEVICE_FUNC inline half predux<half2>(const half2& a) { +#if __CUDA_ARCH__ >= 530 return __hadd(__low2half(a), __high2half(a)); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return half(__float2half_rn(a1 + a2)); +#endif } template<> EIGEN_DEVICE_FUNC inline half predux_max<half2>(const half2& a) { +#if __CUDA_ARCH__ >= 530 half first = __low2half(a); half second = __high2half(a); return __hgt(first, second) ? first : second; +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return half(__float2half_rn(numext::maxi(a1, a2))); +#endif } template<> EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) { +#if __CUDA_ARCH__ >= 530 half first = __low2half(a); half second = __high2half(a); return __hlt(first, second) ? first : second; +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return half(__float2half_rn(numext::mini(a1, a2))); +#endif } template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) { +#if __CUDA_ARCH__ >= 530 return __hmul(__low2half(a), __high2half(a)); -} +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return half(__float2half_rn(a1 * a2)); #endif +} } // end namespace internal diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h index b2a9724de..3ea218133 100644 --- a/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -71,6 +71,7 @@ struct functor_traits<scalar_cast_op<half, float> > +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 template <> struct type_casting_traits<half, float> { @@ -82,22 +83,9 @@ struct type_casting_traits<half, float> { }; template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 float2 r1 = __half22float2(a); float2 r2 = __half22float2(b); return make_float4(r1.x, r1.y, r2.x, r2.y); -#else - half r1; - r1.x = a.x & 0xFFFF; - half r2; - r2.x = (a.x & 0xFFFF0000) >> 16; - half r3; - r3.x = b.x & 0xFFFF; - half r4; - r4.x = (b.x & 0xFFFF0000) >> 16; - return make_float4(static_cast<float>(r1), static_cast<float>(r2), - static_cast<float>(r3), static_cast<float>(r4)); -#endif } template <> @@ -111,20 +99,11 @@ struct type_casting_traits<float, half> { template<> EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) { // Simply discard the second half of the input -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __float22half2_rn(make_float2(a.x, a.y)); -#else - half r1 = static_cast<half>(a.x); - half r2 = static_cast<half>(a.y); - half2 r; - r.x = 0; - r.x |= r1.x; - r.x |= (static_cast<unsigned int>(r2.x) << 16); - return r; -#endif } #endif +#endif } // end namespace internal diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 63a2d9f52..3224c36bd 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -179,6 +179,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co // Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available, // then implements a slow software scalar fallback calling fmaf()! +// Filed LLVM bug: +// https://llvm.org/bugs/show_bug.cgi?id=27216 #if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM) // See bug 936. // FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4. @@ -195,6 +197,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& // -march=armv7-a, that is a very common case. // See e.g. this thread: // http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html + // Filed LLVM bug: + // https://llvm.org/bugs/show_bug.cgi?id=27219 Packet4f r = c; asm volatile( "vmla.f32 %q[r], %q[a], %q[b]" diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index e28fecfd0..5cd8ca950 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -345,6 +345,22 @@ template<> struct functor_traits<scalar_boolean_or_op> { }; /** \internal + * \brief Template functor to compute the xor of two booleans + * + * \sa class CwiseBinaryOp, ArrayBase::operator^ + */ +struct scalar_boolean_xor_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; } +}; +template<> struct functor_traits<scalar_boolean_xor_op> { + enum { + Cost = NumTraits<bool>::AddCost, + PacketAccess = false + }; +}; + +/** \internal * \brief Template functor to compute the incomplete gamma function igamma(a, x) * * \sa class CwiseBinaryOp, Cwise::igamma diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 7ba0abedc..488ebf1d2 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -234,9 +234,33 @@ template<typename Scalar> struct scalar_exp_op { template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexp(a); } }; -template<typename Scalar> -struct functor_traits<scalar_exp_op<Scalar> > -{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasExp }; }; +template <typename Scalar> +struct functor_traits<scalar_exp_op<Scalar> > { + enum { + PacketAccess = packet_traits<Scalar>::HasExp, + // The following numbers are based on the AVX implementation. +#ifdef EIGEN_VECTORIZE_FMA + // Haswell can issue 2 add/mul/madd per cycle. + Cost = + (sizeof(Scalar) == 4 + // float: 8 pmadd, 4 pmul, 2 padd/psub, 6 other + ? (8 * NumTraits<Scalar>::AddCost + 6 * NumTraits<Scalar>::MulCost) + // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div, 13 other + : (14 * NumTraits<Scalar>::AddCost + + 6 * NumTraits<Scalar>::MulCost + + NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost)) +#else + Cost = + (sizeof(Scalar) == 4 + // float: 7 pmadd, 6 pmul, 4 padd/psub, 10 other + ? (21 * NumTraits<Scalar>::AddCost + 13 * NumTraits<Scalar>::MulCost) + // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div, 13 other + : (23 * NumTraits<Scalar>::AddCost + + 12 * NumTraits<Scalar>::MulCost + + NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost)) +#endif + }; +}; /** \internal * @@ -250,9 +274,24 @@ template<typename Scalar> struct scalar_log_op { template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog(a); } }; -template<typename Scalar> -struct functor_traits<scalar_log_op<Scalar> > -{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog }; }; +template <typename Scalar> +struct functor_traits<scalar_log_op<Scalar> > { + enum { + PacketAccess = packet_traits<Scalar>::HasLog, + Cost = + (PacketAccess + // The following numbers are based on the AVX implementation. +#ifdef EIGEN_VECTORIZE_FMA + // 8 pmadd, 6 pmul, 8 padd/psub, 16 other, can issue 2 add/mul/madd per cycle. + ? (20 * NumTraits<Scalar>::AddCost + 7 * NumTraits<Scalar>::MulCost) +#else + // 8 pmadd, 6 pmul, 8 padd/psub, 20 other + ? (36 * NumTraits<Scalar>::AddCost + 14 * NumTraits<Scalar>::MulCost) +#endif + // Measured cost of std::log. + : sizeof(Scalar)==4 ? 40 : 85) + }; +}; /** \internal * @@ -280,10 +319,19 @@ template<typename Scalar> struct scalar_sqrt_op { template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); } }; -template<typename Scalar> -struct functor_traits<scalar_sqrt_op<Scalar> > -{ enum { - Cost = 5 * NumTraits<Scalar>::MulCost, +template <typename Scalar> +struct functor_traits<scalar_sqrt_op<Scalar> > { + enum { +#if EIGEN_FAST_MATH + // The following numbers are based on the AVX implementation. + Cost = (sizeof(Scalar) == 8 ? 28 + // 4 pmul, 1 pmadd, 3 other + : (3 * NumTraits<Scalar>::AddCost + + 5 * NumTraits<Scalar>::MulCost)), +#else + // The following numbers are based on min VSQRT throughput on Haswell. + Cost = (sizeof(Scalar) == 8 ? 28 : 14), +#endif PacketAccess = packet_traits<Scalar>::HasSqrt }; }; @@ -313,7 +361,7 @@ struct functor_traits<scalar_rsqrt_op<Scalar> > */ template<typename Scalar> struct scalar_cos_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op) - EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { using std::cos; return cos(a); } + EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return numext::cos(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcos(a); } }; @@ -332,7 +380,7 @@ struct functor_traits<scalar_cos_op<Scalar> > */ template<typename Scalar> struct scalar_sin_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sin; return sin(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sin(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psin(a); } }; @@ -352,7 +400,7 @@ struct functor_traits<scalar_sin_op<Scalar> > */ template<typename Scalar> struct scalar_tan_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::tan; return tan(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::tan(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptan(a); } }; @@ -371,7 +419,7 @@ struct functor_traits<scalar_tan_op<Scalar> > */ template<typename Scalar> struct scalar_acos_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::acos; return acos(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::acos(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pacos(a); } }; @@ -390,7 +438,7 @@ struct functor_traits<scalar_acos_op<Scalar> > */ template<typename Scalar> struct scalar_asin_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::asin; return asin(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::asin(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pasin(a); } }; @@ -546,7 +594,7 @@ struct functor_traits<scalar_erfc_op<Scalar> > */ template<typename Scalar> struct scalar_atan_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::atan; return atan(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::atan(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::patan(a); } }; @@ -566,7 +614,7 @@ struct functor_traits<scalar_atan_op<Scalar> > */ template<typename Scalar> struct scalar_tanh_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::tanh; return tanh(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::tanh(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptanh(a); } }; @@ -574,8 +622,24 @@ template<typename Scalar> struct functor_traits<scalar_tanh_op<Scalar> > { enum { - Cost = 5 * NumTraits<Scalar>::MulCost, - PacketAccess = packet_traits<Scalar>::HasTanh + PacketAccess = packet_traits<Scalar>::HasTanh, + Cost = + (PacketAccess + // The following numbers are based on the AVX implementation, +#ifdef EIGEN_VECTORIZE_FMA + // Haswell can issue 2 add/mul/madd per cycle. + // 9 pmadd, 2 pmul, 1 div, 2 other + ? (2 * NumTraits<Scalar>::AddCost + 6 * NumTraits<Scalar>::MulCost + + NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost) +#else + ? (11 * NumTraits<Scalar>::AddCost + + 11 * NumTraits<Scalar>::MulCost + + NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost) +#endif + // This number assumes a naive implementation of tanh + : (6 * NumTraits<Scalar>::AddCost + 3 * NumTraits<Scalar>::MulCost + + 2 * NumTraits<Scalar>::template Div<packet_traits<Scalar>::HasDiv>::Cost + + functor_traits<scalar_exp_op<Scalar> >::Cost)) }; }; @@ -585,7 +649,7 @@ struct functor_traits<scalar_tanh_op<Scalar> > */ template<typename Scalar> struct scalar_sinh_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sinh_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sinh; return sinh(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sinh(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psinh(a); } }; @@ -604,7 +668,7 @@ struct functor_traits<scalar_sinh_op<Scalar> > */ template<typename Scalar> struct scalar_cosh_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cosh_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::cosh; return cosh(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::cosh(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcosh(a); } }; @@ -816,9 +880,9 @@ struct scalar_sign_op<Scalar,true> { { typedef typename NumTraits<Scalar>::Real real_type; real_type aa = numext::abs(a); - if (aa==0) + if (aa==real_type(0)) return Scalar(0); - aa = 1./aa; + aa = real_type(1)/aa; return Scalar(real(a)*aa, imag(a)*aa ); } //TODO diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 54e118395..5b0473598 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -11,8 +11,8 @@ #define EIGEN_GENERAL_BLOCK_PANEL_H -namespace Eigen { - +namespace Eigen { + namespace internal { template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false> @@ -36,7 +36,7 @@ const std::ptrdiff_t defaultL3CacheSize = 512*1024; #endif /** \internal */ -struct CacheSizes { +struct CacheSizes { CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) { int l1CacheSize, l2CacheSize, l3CacheSize; queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize); @@ -89,7 +89,7 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff * * \sa setCpuCacheSizes */ -template<typename LhsScalar, typename RhsScalar, int KcFactor> +template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index> void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1) { typedef gebp_traits<LhsScalar,RhsScalar> Traits; @@ -107,21 +107,17 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n enum { kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), ksub = Traits::mr * Traits::nr * sizeof(ResScalar), - k_mask = -8, - + kr = 8, mr = Traits::mr, - mr_mask = -mr, - - nr = Traits::nr, - nr_mask = -nr + nr = Traits::nr }; // Increasing k gives us more time to prefetch the content of the "C" // registers. However once the latency is hidden there is no point in // increasing the value of k, so we'll cap it at 320 (value determined // experimentally). - const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320); + const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320); if (k_cache < k) { - k = k_cache & k_mask; + k = k_cache - (k_cache % kr); eigen_internal_assert(k > 0); } @@ -130,10 +126,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n if (n_cache <= n_per_thread) { // Don't exceed the capacity of the l2 cache. eigen_internal_assert(n_cache >= static_cast<Index>(nr)); - n = n_cache & nr_mask; + n = n_cache - (n_cache % nr); eigen_internal_assert(n > 0); } else { - n = (std::min<Index>)(n, (n_per_thread + nr - 1) & nr_mask); + n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr)); } if (l3 > l2) { @@ -141,10 +137,10 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads); const Index m_per_thread = numext::div_ceil(m, num_threads); if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) { - m = m_cache & mr_mask; + m = m_cache - (m_cache % mr); eigen_internal_assert(m > 0); } else { - m = (std::min<Index>)(m, (m_per_thread + mr - 1) & mr_mask); + m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr)); } } } @@ -156,29 +152,29 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n l2 = 32*1024; l3 = 512*1024; #endif - + // Early return for small problems because the computation below are time consuming for small problems. // Perhaps it would make more sense to consider k*n*m?? // Note that for very tiny problem, this function should be bypassed anyway // because we use the coefficient-based implementation for them. - if((std::max)(k,(std::max)(m,n))<48) + if((numext::maxi)(k,(numext::maxi)(m,n))<48) return; - + typedef typename Traits::ResScalar ResScalar; enum { k_peeling = 8, k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), k_sub = Traits::mr * Traits::nr * sizeof(ResScalar) }; - + // ---- 1st level of blocking on L1, yields kc ---- - + // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache. // We also include a register-level block of the result (mx x nr). // (In an ideal world only the lhs panel would stay in L1) // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of: - const Index max_kc = std::max<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1); + const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1); const Index old_k = k; if(k>max_kc) { @@ -187,12 +183,12 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // while keeping the same number of sweeps over the result. k = (k%max_kc)==0 ? max_kc : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1))); - + eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same"); } - + // ---- 2nd level of blocking on max(L2,L3), yields nc ---- - + // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is: // actual_l2 = max(l2, l3/nb_core_sharing_l3) // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it) @@ -202,7 +198,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n #else const Index actual_l2 = 1572864; // == 1.5 MB #endif - + // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2. // The second half is implicitly reserved to access the result and lhs coefficients. // When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful @@ -223,7 +219,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar)); } // WARNING Below, we assume that Traits::nr is a power of two. - Index nc = std::min<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1)); + Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1)); if(n>nc) { // We are really blocking over the columns: @@ -252,9 +248,9 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // we have both L2 and L3, and problem is small enough to be kept in L2 // Let's choose m such that lhs's block fit in 1/3 of L2 actual_lm = l2; - max_mc = (std::min<Index>)(576,max_mc); + max_mc = (numext::mini<Index>)(576,max_mc); } - Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc); + Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc); if (mc > Traits::mr) mc -= mc % Traits::mr; else if (mc==0) return; m = (m%mc)==0 ? mc @@ -263,13 +259,14 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n } } +template <typename Index> inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) { #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) { - k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); - m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); - n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); + k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); + m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); + n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); return true; } #else @@ -296,11 +293,11 @@ inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) * * \sa setCpuCacheSizes */ -template<typename LhsScalar, typename RhsScalar, int KcFactor> +template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index> void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) { if (!useSpecificBlockingSizes(k, m, n)) { - evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor>(k, m, n, num_threads); + evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads); } typedef gebp_traits<LhsScalar,RhsScalar> Traits; @@ -314,10 +311,10 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads if (n > nr) n -= n % nr; } -template<typename LhsScalar, typename RhsScalar> +template<typename LhsScalar, typename RhsScalar, typename Index> inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) { - computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads); + computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads); } #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD @@ -2224,6 +2221,16 @@ inline std::ptrdiff_t l2CacheSize() return l2; } +/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size paramete\ +rs. +* \sa setCpuCacheSize */ +inline std::ptrdiff_t l3CacheSize() +{ + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); + return l3; +} + /** Set the cpu L1 and L2 cache sizes (in bytes). * These values are use to adjust the size of the blocks * for the algorithms working per blocks. diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 831089dee..80ba89465 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -43,7 +43,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder, typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride, const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, - const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking) + const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking) { general_matrix_matrix_triangular_product<Index, RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs, diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h index 3deed068e..911df8ff3 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h @@ -25,13 +25,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * Level 3 BLAS SYRK/HERK implementation. ******************************************************************************** */ -#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H -#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H +#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H +#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H namespace Eigen { @@ -44,34 +44,35 @@ struct general_matrix_matrix_rankupdate : // try to go to BLAS specialization -#define EIGEN_MKL_RANKUPDATE_SPECIALIZE(Scalar) \ +#define EIGEN_BLAS_RANKUPDATE_SPECIALIZE(Scalar) \ template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs, int UpLo> \ struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \ Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Specialized> { \ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \ - const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha) \ + const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \ { \ if (lhs==rhs) { \ general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \ - ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \ + ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ } else { \ general_matrix_matrix_triangular_product<Index, \ Scalar, LhsStorageOrder, ConjugateLhs, \ Scalar, RhsStorageOrder, ConjugateRhs, \ ColMajor, UpLo, BuiltIn> \ - ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \ + ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ } \ } \ }; -EIGEN_MKL_RANKUPDATE_SPECIALIZE(double) -//EIGEN_MKL_RANKUPDATE_SPECIALIZE(dcomplex) -EIGEN_MKL_RANKUPDATE_SPECIALIZE(float) -//EIGEN_MKL_RANKUPDATE_SPECIALIZE(scomplex) +EIGEN_BLAS_RANKUPDATE_SPECIALIZE(double) +EIGEN_BLAS_RANKUPDATE_SPECIALIZE(float) +// TODO handle complex cases +// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(dcomplex) +// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(scomplex) // SYRK for float/double -#define EIGEN_MKL_RANKUPDATE_R(EIGTYPE, MKLTYPE, MKLFUNC) \ +#define EIGEN_BLAS_RANKUPDATE_R(EIGTYPE, BLASTYPE, BLASFUNC) \ template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \ enum { \ @@ -80,23 +81,19 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \ }; \ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \ - const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \ + const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ /* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \ \ - MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \ + BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \ char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'T':'N'; \ - MKLTYPE alpha_, beta_; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \ - MKLFUNC(&uplo, &trans, &n, &k, &alpha_, lhs, &lda, &beta_, res, &ldc); \ + EIGTYPE beta; \ + BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \ } \ }; // HERK for complex data -#define EIGEN_MKL_RANKUPDATE_C(EIGTYPE, MKLTYPE, RTYPE, MKLFUNC) \ +#define EIGEN_BLAS_RANKUPDATE_C(EIGTYPE, BLASTYPE, RTYPE, BLASFUNC) \ template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \ enum { \ @@ -105,18 +102,15 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 1 : 0 \ }; \ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \ - const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \ + const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \ \ - MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \ + BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \ char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'C':'N'; \ RTYPE alpha_, beta_; \ const EIGTYPE* a_ptr; \ \ -/* Set alpha_ & beta_ */ \ -/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); */\ -/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1));*/ \ alpha_ = alpha.real(); \ beta_ = 1.0; \ /* Copy with conjugation in some cases*/ \ @@ -127,20 +121,21 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C lda = a.outerStride(); \ a_ptr = a.data(); \ } else a_ptr=lhs; \ - MKLFUNC(&uplo, &trans, &n, &k, &alpha_, (MKLTYPE*)a_ptr, &lda, &beta_, (MKLTYPE*)res, &ldc); \ + BLASFUNC(&uplo, &trans, &n, &k, &alpha_, (BLASTYPE*)a_ptr, &lda, &beta_, (BLASTYPE*)res, &ldc); \ } \ }; -EIGEN_MKL_RANKUPDATE_R(double, double, dsyrk) -EIGEN_MKL_RANKUPDATE_R(float, float, ssyrk) +EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_) +EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_) -//EIGEN_MKL_RANKUPDATE_C(dcomplex, MKL_Complex16, double, zherk) -//EIGEN_MKL_RANKUPDATE_C(scomplex, MKL_Complex8, double, cherk) +// TODO hanlde complex cases +// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_) +// EIGEN_BLAS_RANKUPDATE_C(scomplex, float, float, cherk_) } // end namespace internal } // end namespace Eigen -#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H +#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h index b6ae729b2..7a3bdbf20 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h @@ -25,13 +25,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * General matrix-matrix product functionality based on ?GEMM. ******************************************************************************** */ -#ifndef EIGEN_GENERAL_MATRIX_MATRIX_MKL_H -#define EIGEN_GENERAL_MATRIX_MATRIX_MKL_H +#ifndef EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H +#define EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H namespace Eigen { @@ -46,7 +46,7 @@ namespace internal { // gemm specialization -#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, MKLTYPE, MKLPREFIX) \ +#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASPREFIX) \ template< \ typename Index, \ int LhsStorageOrder, bool ConjugateLhs, \ @@ -66,55 +66,50 @@ static void run(Index rows, Index cols, Index depth, \ using std::conj; \ \ char transa, transb; \ - MKL_INT m, n, k, lda, ldb, ldc; \ + BlasIndex m, n, k, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ - MKLTYPE alpha_, beta_; \ + EIGTYPE beta(1); \ MatrixX##EIGPREFIX a_tmp, b_tmp; \ - EIGTYPE myone(1);\ \ /* Set transpose options */ \ transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \ transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \ \ /* Set m, n, k */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)cols; \ - k = (MKL_INT)depth; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ + k = convert_index<BlasIndex>(depth); \ \ /* Set lda, ldb, ldc */ \ - lda = (MKL_INT)lhsStride; \ - ldb = (MKL_INT)rhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index<BlasIndex>(lhsStride); \ + ldb = convert_index<BlasIndex>(rhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ \ /* Set a, b, c */ \ if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \ a_tmp = lhs.conjugate(); \ a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ } else a = _lhs; \ \ if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \ b_tmp = rhs.conjugate(); \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ } else b = _rhs; \ \ - MKLPREFIX##gemm(&transa, &transb, &m, &n, &k, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \ + BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ }}; -GEMM_SPECIALIZATION(double, d, double, d) -GEMM_SPECIALIZATION(float, f, float, s) -GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, z) -GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, c) +GEMM_SPECIALIZATION(double, d, double, d) +GEMM_SPECIALIZATION(float, f, float, s) +GEMM_SPECIALIZATION(dcomplex, cd, double, z) +GEMM_SPECIALIZATION(scomplex, cf, float, c) } // end namespase internal } // end namespace Eigen -#endif // EIGEN_GENERAL_MATRIX_MATRIX_MKL_H +#endif // EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H diff --git a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h index 12c3d13bd..e3a5d5892 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h +++ b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h @@ -25,13 +25,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * General matrix-vector product functionality based on ?GEMV. ******************************************************************************** */ -#ifndef EIGEN_GENERAL_MATRIX_VECTOR_MKL_H -#define EIGEN_GENERAL_MATRIX_VECTOR_MKL_H +#ifndef EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H +#define EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H namespace Eigen { @@ -49,7 +49,7 @@ namespace internal { template<typename Index, typename LhsScalar, int StorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> struct general_matrix_vector_product_gemv; -#define EIGEN_MKL_GEMV_SPECIALIZE(Scalar) \ +#define EIGEN_BLAS_GEMV_SPECIALIZE(Scalar) \ template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \ struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,Specialized> { \ static void run( \ @@ -80,12 +80,12 @@ static void run( \ } \ }; \ -EIGEN_MKL_GEMV_SPECIALIZE(double) -EIGEN_MKL_GEMV_SPECIALIZE(float) -EIGEN_MKL_GEMV_SPECIALIZE(dcomplex) -EIGEN_MKL_GEMV_SPECIALIZE(scomplex) +EIGEN_BLAS_GEMV_SPECIALIZE(double) +EIGEN_BLAS_GEMV_SPECIALIZE(float) +EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex) +EIGEN_BLAS_GEMV_SPECIALIZE(scomplex) -#define EIGEN_MKL_GEMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLPREFIX) \ +#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASPREFIX) \ template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \ struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \ { \ @@ -97,16 +97,15 @@ static void run( \ const EIGTYPE* rhs, Index rhsIncr, \ EIGTYPE* res, Index resIncr, EIGTYPE alpha) \ { \ - MKL_INT m=rows, n=cols, lda=lhsStride, incx=rhsIncr, incy=resIncr; \ - MKLTYPE alpha_, beta_; \ - const EIGTYPE *x_ptr, myone(1); \ + BlasIndex m=convert_index<BlasIndex>(rows), n=convert_index<BlasIndex>(cols), \ + lda=convert_index<BlasIndex>(lhsStride), incx=convert_index<BlasIndex>(rhsIncr), incy=convert_index<BlasIndex>(resIncr); \ + const EIGTYPE beta(1); \ + const EIGTYPE *x_ptr; \ char trans=(LhsStorageOrder==ColMajor) ? 'N' : (ConjugateLhs) ? 'C' : 'T'; \ if (LhsStorageOrder==RowMajor) { \ - m=cols; \ - n=rows; \ + m = convert_index<BlasIndex>(cols); \ + n = convert_index<BlasIndex>(rows); \ }\ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ GEMVVector x_tmp; \ if (ConjugateRhs) { \ Map<const GEMVVector, 0, InnerStride<> > map_x(rhs,cols,1,InnerStride<>(incx)); \ @@ -114,17 +113,17 @@ static void run( \ x_ptr=x_tmp.data(); \ incx=1; \ } else x_ptr=rhs; \ - MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \ + BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; -EIGEN_MKL_GEMV_SPECIALIZATION(double, double, d) -EIGEN_MKL_GEMV_SPECIALIZATION(float, float, s) -EIGEN_MKL_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, z) -EIGEN_MKL_GEMV_SPECIALIZATION(scomplex, MKL_Complex8, c) +EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, d) +EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, s) +EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z) +EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, c) } // end namespase internal } // end namespace Eigen -#endif // EIGEN_GENERAL_MATRIX_VECTOR_MKL_H +#endif // EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h index dfa687fef..c3e37b1e0 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h @@ -25,13 +25,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * Self adjoint matrix * matrix product functionality based on ?SYMM/?HEMM. ******************************************************************************** */ -#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H -#define EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H +#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H +#define EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H namespace Eigen { @@ -40,7 +40,7 @@ namespace internal { /* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */ -#define EIGEN_MKL_SYMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template <typename Index, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -52,28 +52,23 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ EIGTYPE* res, Index resStride, \ - EIGTYPE alpha) \ + EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ char side='L', uplo='L'; \ - MKL_INT m, n, lda, ldb, ldc; \ + BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ - MKLTYPE alpha_, beta_; \ + EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ - EIGTYPE myone(1);\ \ /* Set transpose options */ \ /* Set m, n, k */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)cols; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ \ /* Set lda, ldb, ldc */ \ - lda = (MKL_INT)lhsStride; \ - ldb = (MKL_INT)rhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index<BlasIndex>(lhsStride); \ + ldb = convert_index<BlasIndex>(rhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ \ /* Set a, b, c */ \ if (LhsStorageOrder==RowMajor) uplo='U'; \ @@ -83,16 +78,16 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \ b_tmp = rhs.adjoint(); \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ } else b = _rhs; \ \ - MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \ + BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_MKL_HEMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template <typename Index, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -103,29 +98,24 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ EIGTYPE* res, Index resStride, \ - EIGTYPE alpha) \ + EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ char side='L', uplo='L'; \ - MKL_INT m, n, lda, ldb, ldc; \ + BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ - MKLTYPE alpha_, beta_; \ + EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> a_tmp; \ - EIGTYPE myone(1); \ \ /* Set transpose options */ \ /* Set m, n, k */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)cols; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ \ /* Set lda, ldb, ldc */ \ - lda = (MKL_INT)lhsStride; \ - ldb = (MKL_INT)rhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index<BlasIndex>(lhsStride); \ + ldb = convert_index<BlasIndex>(rhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ \ /* Set a, b, c */ \ if (((LhsStorageOrder==ColMajor) && ConjugateLhs) || ((LhsStorageOrder==RowMajor) && (!ConjugateLhs))) { \ @@ -151,23 +141,23 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh b_tmp = rhs.transpose(); \ } \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ } \ \ - MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \ + BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -EIGEN_MKL_SYMM_L(double, double, d, d) -EIGEN_MKL_SYMM_L(float, float, f, s) -EIGEN_MKL_HEMM_L(dcomplex, MKL_Complex16, cd, z) -EIGEN_MKL_HEMM_L(scomplex, MKL_Complex8, cf, c) +EIGEN_BLAS_SYMM_L(double, double, d, d) +EIGEN_BLAS_SYMM_L(float, float, f, s) +EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z) +EIGEN_BLAS_HEMM_L(scomplex, float, cf, c) /* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */ -#define EIGEN_MKL_SYMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template <typename Index, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -179,27 +169,22 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ EIGTYPE* res, Index resStride, \ - EIGTYPE alpha) \ + EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ char side='R', uplo='L'; \ - MKL_INT m, n, lda, ldb, ldc; \ + BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ - MKLTYPE alpha_, beta_; \ + EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ - EIGTYPE myone(1);\ \ /* Set m, n, k */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)cols; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ \ /* Set lda, ldb, ldc */ \ - lda = (MKL_INT)rhsStride; \ - ldb = (MKL_INT)lhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index<BlasIndex>(rhsStride); \ + ldb = convert_index<BlasIndex>(lhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ \ /* Set a, b, c */ \ if (RhsStorageOrder==RowMajor) uplo='U'; \ @@ -209,16 +194,16 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(rhsStride)); \ b_tmp = lhs.adjoint(); \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ } else b = _lhs; \ \ - MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \ + BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_MKL_HEMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template <typename Index, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -229,35 +214,30 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ EIGTYPE* res, Index resStride, \ - EIGTYPE alpha) \ + EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ char side='R', uplo='L'; \ - MKL_INT m, n, lda, ldb, ldc; \ + BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ - MKLTYPE alpha_, beta_; \ + EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> a_tmp; \ - EIGTYPE myone(1); \ \ /* Set m, n, k */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)cols; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ \ /* Set lda, ldb, ldc */ \ - lda = (MKL_INT)rhsStride; \ - ldb = (MKL_INT)lhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index<BlasIndex>(rhsStride); \ + ldb = convert_index<BlasIndex>(lhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ \ /* Set a, b, c */ \ if (((RhsStorageOrder==ColMajor) && ConjugateRhs) || ((RhsStorageOrder==RowMajor) && (!ConjugateRhs))) { \ Map<const Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder>, 0, OuterStride<> > rhs(_rhs,n,n,OuterStride<>(rhsStride)); \ a_tmp = rhs.conjugate(); \ a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ } else a = _rhs; \ if (RhsStorageOrder==RowMajor) uplo='U'; \ \ @@ -279,17 +259,17 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL ldb = b_tmp.outerStride(); \ } \ \ - MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \ + BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ } \ }; -EIGEN_MKL_SYMM_R(double, double, d, d) -EIGEN_MKL_SYMM_R(float, float, f, s) -EIGEN_MKL_HEMM_R(dcomplex, MKL_Complex16, cd, z) -EIGEN_MKL_HEMM_R(scomplex, MKL_Complex8, cf, c) +EIGEN_BLAS_SYMM_R(double, double, d, d) +EIGEN_BLAS_SYMM_R(float, float, f, s) +EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z) +EIGEN_BLAS_HEMM_R(scomplex, float, cf, c) } // end namespace internal } // end namespace Eigen -#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H +#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h index a08f385bc..38f23accf 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h @@ -25,13 +25,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * Selfadjoint matrix-vector product functionality based on ?SYMV/HEMV. ******************************************************************************** */ -#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H -#define EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H +#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H +#define EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H namespace Eigen { @@ -47,7 +47,7 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju struct selfadjoint_matrix_vector_product_symv : selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn> {}; -#define EIGEN_MKL_SYMV_SPECIALIZE(Scalar) \ +#define EIGEN_BLAS_SYMV_SPECIALIZE(Scalar) \ template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \ struct selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Specialized> { \ static void run( \ @@ -66,12 +66,12 @@ static void run( \ } \ }; \ -EIGEN_MKL_SYMV_SPECIALIZE(double) -EIGEN_MKL_SYMV_SPECIALIZE(float) -EIGEN_MKL_SYMV_SPECIALIZE(dcomplex) -EIGEN_MKL_SYMV_SPECIALIZE(scomplex) +EIGEN_BLAS_SYMV_SPECIALIZE(double) +EIGEN_BLAS_SYMV_SPECIALIZE(float) +EIGEN_BLAS_SYMV_SPECIALIZE(dcomplex) +EIGEN_BLAS_SYMV_SPECIALIZE(scomplex) -#define EIGEN_MKL_SYMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLFUNC) \ +#define EIGEN_BLAS_SYMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \ template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \ struct selfadjoint_matrix_vector_product_symv<EIGTYPE,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs> \ { \ @@ -85,29 +85,27 @@ const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \ IsRowMajor = StorageOrder==RowMajor ? 1 : 0, \ IsLower = UpLo == Lower ? 1 : 0 \ }; \ - MKL_INT n=size, lda=lhsStride, incx=1, incy=1; \ - MKLTYPE alpha_, beta_; \ - const EIGTYPE *x_ptr, myone(1); \ + BlasIndex n=convert_index<BlasIndex>(size), lda=convert_index<BlasIndex>(lhsStride), incx=1, incy=1; \ + EIGTYPE beta(1); \ + const EIGTYPE *x_ptr; \ char uplo=(IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 'L' : 'U'); \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ SYMVVector x_tmp; \ if (ConjugateRhs) { \ Map<const SYMVVector, 0 > map_x(_rhs,size,1); \ x_tmp=map_x.conjugate(); \ x_ptr=x_tmp.data(); \ } else x_ptr=_rhs; \ - MKLFUNC(&uplo, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \ + BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; -EIGEN_MKL_SYMV_SPECIALIZATION(double, double, dsymv) -EIGEN_MKL_SYMV_SPECIALIZATION(float, float, ssymv) -EIGEN_MKL_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv) -EIGEN_MKL_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv) +EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_) +EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_) +EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_) +EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_) } // end namespace internal } // end namespace Eigen -#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H +#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h index d9e7cf852..aecded6bb 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h @@ -25,13 +25,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * Triangular matrix * matrix product functionality based on ?TRMM. ******************************************************************************** */ -#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H -#define EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H +#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H +#define EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H namespace Eigen { @@ -50,7 +50,7 @@ struct product_triangular_matrix_matrix_trmm : // try to go to BLAS specialization -#define EIGEN_MKL_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \ +#define EIGEN_BLAS_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \ template <typename Index, int Mode, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -65,17 +65,17 @@ struct product_triangular_matrix_matrix<Scalar,Index, Mode, LhsIsTriangular, \ } \ }; -EIGEN_MKL_TRMM_SPECIALIZE(double, true) -EIGEN_MKL_TRMM_SPECIALIZE(double, false) -EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, true) -EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, false) -EIGEN_MKL_TRMM_SPECIALIZE(float, true) -EIGEN_MKL_TRMM_SPECIALIZE(float, false) -EIGEN_MKL_TRMM_SPECIALIZE(scomplex, true) -EIGEN_MKL_TRMM_SPECIALIZE(scomplex, false) +EIGEN_BLAS_TRMM_SPECIALIZE(double, true) +EIGEN_BLAS_TRMM_SPECIALIZE(double, false) +EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, true) +EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, false) +EIGEN_BLAS_TRMM_SPECIALIZE(float, true) +EIGEN_BLAS_TRMM_SPECIALIZE(float, false) +EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true) +EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false) // implements col-major += alpha * op(triangular) * op(general) -#define EIGEN_MKL_TRMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template <typename Index, int Mode, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -106,13 +106,14 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \ typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \ typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \ \ -/* Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \ +/* Non-square case - doesn't fit to BLAS ?TRMM. Fall to default triangular product or call BLAS ?GEMM*/ \ if (rows != depth) { \ \ - int nthr = mkl_domain_get_max_threads(EIGEN_MKL_DOMAIN_BLAS); \ + /* FIXME handle mkl_domain_get_max_threads */ \ + /*int nthr = mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS);*/ int nthr = 1;\ \ if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \ - /* Most likely no benefit to call TRMM or GEMM from MKL*/ \ + /* Most likely no benefit to call TRMM or GEMM from BLAS */ \ product_triangular_matrix_matrix<EIGTYPE,Index,Mode,true, \ LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \ _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ @@ -121,27 +122,23 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \ /* Make sense to call GEMM */ \ Map<const MatrixLhs, 0, OuterStride<> > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \ MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \ - MKL_INT aStride = aa_tmp.outerStride(); \ + BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \ gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \ general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \ rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \ \ - /*std::cout << "TRMM_L: A is not square! Go to MKL GEMM implementation! " << nthr<<" \n";*/ \ + /*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \ } \ return; \ } \ char side = 'L', transa, uplo, diag = 'N'; \ EIGTYPE *b; \ const EIGTYPE *a; \ - MKL_INT m, n, lda, ldb; \ - MKLTYPE alpha_; \ -\ -/* Set alpha_*/ \ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \ + BlasIndex m, n, lda, ldb; \ \ /* Set m, n */ \ - m = (MKL_INT)diagSize; \ - n = (MKL_INT)cols; \ + m = convert_index<BlasIndex>(diagSize); \ + n = convert_index<BlasIndex>(cols); \ \ /* Set trans */ \ transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \ @@ -152,7 +149,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \ \ if (ConjugateRhs) b_tmp = rhs.conjugate(); else b_tmp = rhs; \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ \ /* Set uplo */ \ uplo = IsLower ? 'L' : 'U'; \ @@ -168,14 +165,14 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \ else if (IsUnitDiag) \ a_tmp.diagonal().setOnes();\ a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ } else { \ a = _lhs; \ - lda = lhsStride; \ + lda = convert_index<BlasIndex>(lhsStride); \ } \ - /*std::cout << "TRMM_L: A is square! Go to MKL TRMM implementation! \n";*/ \ + /*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! \n";*/ \ /* call ?trmm*/ \ - MKLPREFIX##trmm(&side, &uplo, &transa, &diag, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \ + BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ \ /* Add op(a_triangular)*b into res*/ \ Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -183,13 +180,13 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \ } \ }; -EIGEN_MKL_TRMM_L(double, double, d, d) -EIGEN_MKL_TRMM_L(dcomplex, MKL_Complex16, cd, z) -EIGEN_MKL_TRMM_L(float, float, f, s) -EIGEN_MKL_TRMM_L(scomplex, MKL_Complex8, cf, c) +EIGEN_BLAS_TRMM_L(double, double, d, d) +EIGEN_BLAS_TRMM_L(dcomplex, double, cd, z) +EIGEN_BLAS_TRMM_L(float, float, f, s) +EIGEN_BLAS_TRMM_L(scomplex, float, cf, c) // implements col-major += alpha * op(general) * op(triangular) -#define EIGEN_MKL_TRMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template <typename Index, int Mode, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -220,13 +217,13 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \ typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \ \ -/* Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \ +/* Non-square case - doesn't fit to BLAS ?TRMM. Fall to default triangular product or call BLAS ?GEMM*/ \ if (cols != depth) { \ \ - int nthr = mkl_domain_get_max_threads(EIGEN_MKL_DOMAIN_BLAS); \ + int nthr = 1 /*mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS)*/; \ \ if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \ - /* Most likely no benefit to call TRMM or GEMM from MKL*/ \ + /* Most likely no benefit to call TRMM or GEMM from BLAS*/ \ product_triangular_matrix_matrix<EIGTYPE,Index,Mode,false, \ LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \ _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ @@ -235,27 +232,23 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ /* Make sense to call GEMM */ \ Map<const MatrixRhs, 0, OuterStride<> > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \ MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \ - MKL_INT aStride = aa_tmp.outerStride(); \ + BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \ gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \ general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \ rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \ \ - /*std::cout << "TRMM_R: A is not square! Go to MKL GEMM implementation! " << nthr<<" \n";*/ \ + /*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \ } \ return; \ } \ char side = 'R', transa, uplo, diag = 'N'; \ EIGTYPE *b; \ const EIGTYPE *a; \ - MKL_INT m, n, lda, ldb; \ - MKLTYPE alpha_; \ -\ -/* Set alpha_*/ \ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \ + BlasIndex m, n, lda, ldb; \ \ /* Set m, n */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)diagSize; \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(diagSize); \ \ /* Set trans */ \ transa = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \ @@ -266,7 +259,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ \ if (ConjugateLhs) b_tmp = lhs.conjugate(); else b_tmp = lhs; \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ \ /* Set uplo */ \ uplo = IsLower ? 'L' : 'U'; \ @@ -282,14 +275,14 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ else if (IsUnitDiag) \ a_tmp.diagonal().setOnes();\ a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ } else { \ a = _rhs; \ - lda = rhsStride; \ + lda = convert_index<BlasIndex>(rhsStride); \ } \ - /*std::cout << "TRMM_R: A is square! Go to MKL TRMM implementation! \n";*/ \ + /*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/ \ /* call ?trmm*/ \ - MKLPREFIX##trmm(&side, &uplo, &transa, &diag, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \ + BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ \ /* Add op(a_triangular)*b into res*/ \ Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -297,13 +290,13 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ } \ }; -EIGEN_MKL_TRMM_R(double, double, d, d) -EIGEN_MKL_TRMM_R(dcomplex, MKL_Complex16, cd, z) -EIGEN_MKL_TRMM_R(float, float, f, s) -EIGEN_MKL_TRMM_R(scomplex, MKL_Complex8, cf, c) +EIGEN_BLAS_TRMM_R(double, double, d, d) +EIGEN_BLAS_TRMM_R(dcomplex, double, cd, z) +EIGEN_BLAS_TRMM_R(float, float, f, s) +EIGEN_BLAS_TRMM_R(scomplex, float, cf, c) } // end namespace internal } // end namespace Eigen -#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H +#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h index 7c014b72a..f79840aa7 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector.h +++ b/Eigen/src/Core/products/TriangularMatrixVector.h @@ -27,13 +27,13 @@ struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,C HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag }; static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha); + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha); }; template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version> EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version> ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha) + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha) { static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; Index size = (std::min)(_rows,_cols); diff --git a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h index 3672b1240..07bf26ce5 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h +++ b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h @@ -25,13 +25,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * Triangular matrix-vector product functionality based on ?TRMV. ******************************************************************************** */ -#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H -#define EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H +#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H +#define EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H namespace Eigen { @@ -47,7 +47,7 @@ template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename Rh struct triangular_matrix_vector_product_trmv : triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,StorageOrder,BuiltIn> {}; -#define EIGEN_MKL_TRMV_SPECIALIZE(Scalar) \ +#define EIGEN_BLAS_TRMV_SPECIALIZE(Scalar) \ template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \ struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor,Specialized> { \ static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ @@ -65,13 +65,13 @@ struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs } \ }; -EIGEN_MKL_TRMV_SPECIALIZE(double) -EIGEN_MKL_TRMV_SPECIALIZE(float) -EIGEN_MKL_TRMV_SPECIALIZE(dcomplex) -EIGEN_MKL_TRMV_SPECIALIZE(scomplex) +EIGEN_BLAS_TRMV_SPECIALIZE(double) +EIGEN_BLAS_TRMV_SPECIALIZE(float) +EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex) +EIGEN_BLAS_TRMV_SPECIALIZE(scomplex) // implements col-major: res += alpha * op(triangular) * vector -#define EIGEN_MKL_TRMV_CM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \ enum { \ @@ -105,17 +105,15 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE, /* Square part handling */\ \ char trans, uplo, diag; \ - MKL_INT m, n, lda, incx, incy; \ + BlasIndex m, n, lda, incx, incy; \ EIGTYPE const *a; \ - MKLTYPE alpha_, beta_; \ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \ + EIGTYPE beta(1); \ \ /* Set m, n */ \ - n = (MKL_INT)size; \ - lda = lhsStride; \ + n = convert_index<BlasIndex>(size); \ + lda = convert_index<BlasIndex>(lhsStride); \ incx = 1; \ - incy = resIncr; \ + incy = convert_index<BlasIndex>(resIncr); \ \ /* Set uplo, trans and diag*/ \ trans = 'N'; \ @@ -123,39 +121,39 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE, diag = IsUnitDiag ? 'U' : 'N'; \ \ /* call ?TRMV*/ \ - MKLPREFIX##trmv(&uplo, &trans, &diag, &n, (const MKLTYPE*)_lhs, &lda, (MKLTYPE*)x, &incx); \ + BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \ \ /* Add op(a_tr)rhs into res*/ \ - MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \ -/* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \ + BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \ +/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \ if (size<(std::max)(rows,cols)) { \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ if (size<rows) { \ y = _res + size*resIncr; \ a = _lhs + size; \ - m = rows-size; \ - n = size; \ + m = convert_index<BlasIndex>(rows-size); \ + n = convert_index<BlasIndex>(size); \ } \ else { \ x += size; \ y = _res; \ a = _lhs + size*lda; \ - m = size; \ - n = cols-size; \ + m = convert_index<BlasIndex>(size); \ + n = convert_index<BlasIndex>(cols-size); \ } \ - MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &beta_, (MKLTYPE*)y, &incy); \ + BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ } \ } \ }; -EIGEN_MKL_TRMV_CM(double, double, d, d) -EIGEN_MKL_TRMV_CM(dcomplex, MKL_Complex16, cd, z) -EIGEN_MKL_TRMV_CM(float, float, f, s) -EIGEN_MKL_TRMV_CM(scomplex, MKL_Complex8, cf, c) +EIGEN_BLAS_TRMV_CM(double, double, d, d) +EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z) +EIGEN_BLAS_TRMV_CM(float, float, f, s) +EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c) // implements row-major: res += alpha * op(triangular) * vector -#define EIGEN_MKL_TRMV_RM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \ enum { \ @@ -189,17 +187,15 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE, /* Square part handling */\ \ char trans, uplo, diag; \ - MKL_INT m, n, lda, incx, incy; \ + BlasIndex m, n, lda, incx, incy; \ EIGTYPE const *a; \ - MKLTYPE alpha_, beta_; \ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \ + EIGTYPE beta(1); \ \ /* Set m, n */ \ - n = (MKL_INT)size; \ - lda = lhsStride; \ + n = convert_index<BlasIndex>(size); \ + lda = convert_index<BlasIndex>(lhsStride); \ incx = 1; \ - incy = resIncr; \ + incy = convert_index<BlasIndex>(resIncr); \ \ /* Set uplo, trans and diag*/ \ trans = ConjLhs ? 'C' : 'T'; \ @@ -207,39 +203,39 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE, diag = IsUnitDiag ? 'U' : 'N'; \ \ /* call ?TRMV*/ \ - MKLPREFIX##trmv(&uplo, &trans, &diag, &n, (const MKLTYPE*)_lhs, &lda, (MKLTYPE*)x, &incx); \ + BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \ \ /* Add op(a_tr)rhs into res*/ \ - MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \ -/* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \ + BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \ +/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \ if (size<(std::max)(rows,cols)) { \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ if (size<rows) { \ y = _res + size*resIncr; \ a = _lhs + size*lda; \ - m = rows-size; \ - n = size; \ + m = convert_index<BlasIndex>(rows-size); \ + n = convert_index<BlasIndex>(size); \ } \ else { \ x += size; \ y = _res; \ a = _lhs + size; \ - m = size; \ - n = cols-size; \ + m = convert_index<BlasIndex>(size); \ + n = convert_index<BlasIndex>(cols-size); \ } \ - MKLPREFIX##gemv(&trans, &n, &m, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &beta_, (MKLTYPE*)y, &incy); \ + BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ } \ } \ }; -EIGEN_MKL_TRMV_RM(double, double, d, d) -EIGEN_MKL_TRMV_RM(dcomplex, MKL_Complex16, cd, z) -EIGEN_MKL_TRMV_RM(float, float, f, s) -EIGEN_MKL_TRMV_RM(scomplex, MKL_Complex8, cf, c) +EIGEN_BLAS_TRMV_RM(double, double, d, d) +EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z) +EIGEN_BLAS_TRMV_RM(float, float, f, s) +EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c) } // end namespase internal } // end namespace Eigen -#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H +#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index 208593718..1bed66ed8 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -83,7 +83,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju // coherence when accessing the rhs elements std::ptrdiff_t l1, l2, l3; manage_caching_sizes(GetAction, &l1, &l2, &l3); - Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0; + Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * std::max<Index>(otherStride,size)) : 0; subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr); for(Index k2=IsLower ? 0 : size; diff --git a/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h index 6a0bb8339..88c0fb794 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h @@ -25,20 +25,20 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * Triangular matrix * matrix product functionality based on ?TRMM. ******************************************************************************** */ -#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H -#define EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H +#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H +#define EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H namespace Eigen { namespace internal { // implements LeftSide op(triangular)^-1 * general -#define EIGEN_MKL_TRSM_L(EIGTYPE, MKLTYPE, MKLPREFIX) \ +#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASPREFIX) \ template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \ { \ @@ -53,13 +53,11 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage const EIGTYPE* _tri, Index triStride, \ EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \ { \ - MKL_INT m = size, n = otherSize, lda, ldb; \ + BlasIndex m = convert_index<BlasIndex>(size), n = convert_index<BlasIndex>(otherSize), lda, ldb; \ char side = 'L', uplo, diag='N', transa; \ /* Set alpha_ */ \ - MKLTYPE alpha; \ - EIGTYPE myone(1); \ - assign_scalar_eig2mkl(alpha, myone); \ - ldb = otherStride;\ + EIGTYPE alpha(1); \ + ldb = convert_index<BlasIndex>(otherStride);\ \ const EIGTYPE *a; \ /* Set trans */ \ @@ -75,25 +73,25 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage if (conjA) { \ a_tmp = tri.conjugate(); \ a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ } else { \ a = _tri; \ - lda = triStride; \ + lda = convert_index<BlasIndex>(triStride); \ } \ if (IsUnitDiag) diag='U'; \ /* call ?trsm*/ \ - MKLPREFIX##trsm(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \ + BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \ } \ }; -EIGEN_MKL_TRSM_L(double, double, d) -EIGEN_MKL_TRSM_L(dcomplex, MKL_Complex16, z) -EIGEN_MKL_TRSM_L(float, float, s) -EIGEN_MKL_TRSM_L(scomplex, MKL_Complex8, c) +EIGEN_BLAS_TRSM_L(double, double, d) +EIGEN_BLAS_TRSM_L(dcomplex, double, z) +EIGEN_BLAS_TRSM_L(float, float, s) +EIGEN_BLAS_TRSM_L(scomplex, float, c) // implements RightSide general * op(triangular)^-1 -#define EIGEN_MKL_TRSM_R(EIGTYPE, MKLTYPE, MKLPREFIX) \ +#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASPREFIX) \ template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \ { \ @@ -108,13 +106,11 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag const EIGTYPE* _tri, Index triStride, \ EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \ { \ - MKL_INT m = otherSize, n = size, lda, ldb; \ + BlasIndex m = convert_index<BlasIndex>(otherSize), n = convert_index<BlasIndex>(size), lda, ldb; \ char side = 'R', uplo, diag='N', transa; \ /* Set alpha_ */ \ - MKLTYPE alpha; \ - EIGTYPE myone(1); \ - assign_scalar_eig2mkl(alpha, myone); \ - ldb = otherStride;\ + EIGTYPE alpha(1); \ + ldb = convert_index<BlasIndex>(otherStride);\ \ const EIGTYPE *a; \ /* Set trans */ \ @@ -130,26 +126,26 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag if (conjA) { \ a_tmp = tri.conjugate(); \ a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ } else { \ a = _tri; \ - lda = triStride; \ + lda = convert_index<BlasIndex>(triStride); \ } \ if (IsUnitDiag) diag='U'; \ /* call ?trsm*/ \ - MKLPREFIX##trsm(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \ + BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \ /*std::cout << "TRMS_L specialization!\n";*/ \ } \ }; -EIGEN_MKL_TRSM_R(double, double, d) -EIGEN_MKL_TRSM_R(dcomplex, MKL_Complex16, z) -EIGEN_MKL_TRSM_R(float, float, s) -EIGEN_MKL_TRSM_R(scomplex, MKL_Complex8, c) +EIGEN_BLAS_TRSM_R(double, double, d) +EIGEN_BLAS_TRSM_R(dcomplex, double, z) +EIGEN_BLAS_TRSM_R(float, float, s) +EIGEN_BLAS_TRSM_R(scomplex, float, c) } // end namespace internal } // end namespace Eigen -#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H +#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h index 1ef3b61db..8c9239b1d 100644 --- a/Eigen/src/Core/util/MKL_support.h +++ b/Eigen/src/Core/util/MKL_support.h @@ -49,7 +49,7 @@ #define EIGEN_USE_LAPACKE #endif -#if defined(EIGEN_USE_BLAS) || defined(EIGEN_USE_LAPACKE) || defined(EIGEN_USE_MKL_VML) +#if defined(EIGEN_USE_LAPACKE) || defined(EIGEN_USE_MKL_VML) #define EIGEN_USE_MKL #endif @@ -64,7 +64,6 @@ # ifndef EIGEN_USE_MKL /*If the MKL version is too old, undef everything*/ # undef EIGEN_USE_MKL_ALL -# undef EIGEN_USE_BLAS # undef EIGEN_USE_LAPACKE # undef EIGEN_USE_MKL_VML # undef EIGEN_USE_LAPACKE_STRICT @@ -107,52 +106,23 @@ #else #define EIGEN_MKL_DOMAIN_PARDISO MKL_PARDISO #endif +#endif namespace Eigen { typedef std::complex<double> dcomplex; typedef std::complex<float> scomplex; -namespace internal { - -template<typename MKLType, typename EigenType> -static inline void assign_scalar_eig2mkl(MKLType& mklScalar, const EigenType& eigenScalar) { - mklScalar=eigenScalar; -} - -template<typename MKLType, typename EigenType> -static inline void assign_conj_scalar_eig2mkl(MKLType& mklScalar, const EigenType& eigenScalar) { - mklScalar=eigenScalar; -} - -template <> -inline void assign_scalar_eig2mkl<MKL_Complex16,dcomplex>(MKL_Complex16& mklScalar, const dcomplex& eigenScalar) { - mklScalar.real=eigenScalar.real(); - mklScalar.imag=eigenScalar.imag(); -} - -template <> -inline void assign_scalar_eig2mkl<MKL_Complex8,scomplex>(MKL_Complex8& mklScalar, const scomplex& eigenScalar) { - mklScalar.real=eigenScalar.real(); - mklScalar.imag=eigenScalar.imag(); -} - -template <> -inline void assign_conj_scalar_eig2mkl<MKL_Complex16,dcomplex>(MKL_Complex16& mklScalar, const dcomplex& eigenScalar) { - mklScalar.real=eigenScalar.real(); - mklScalar.imag=-eigenScalar.imag(); -} - -template <> -inline void assign_conj_scalar_eig2mkl<MKL_Complex8,scomplex>(MKL_Complex8& mklScalar, const scomplex& eigenScalar) { - mklScalar.real=eigenScalar.real(); - mklScalar.imag=-eigenScalar.imag(); -} - -} // end namespace internal +#if defined(EIGEN_USE_MKL) +typedef MKL_INT BlasIndex; +#else +typedef int BlasIndex; +#endif } // end namespace Eigen +#if defined(EIGEN_USE_BLAS) +#include "../../misc/blas.h" #endif #endif // EIGEN_MKL_SUPPORT_H diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 97627d14c..69863d826 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -371,11 +371,11 @@ // Does the compiler support const expressions? #ifdef __CUDACC__ // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above -#if __cplusplus > 199711L && defined(__CUDACC_VER__) && (defined(__clang__) || __CUDACC_VER__ >= 70500) +#if __cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500) #define EIGEN_HAS_CONSTEXPR 1 #endif -#elif (defined(__cplusplus) && __cplusplus >= 201402L) || \ - EIGEN_GNUC_AT_LEAST(4,8) +#elif __has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \ + (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L)) #define EIGEN_HAS_CONSTEXPR 1 #endif @@ -572,12 +572,12 @@ namespace Eigen { //------------------------------------------------------------------------------------------ // Static and dynamic alignment control -// +// // The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES // as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively. // The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not, // a default value is automatically computed based on architecture, compiler, and OS. -// +// // This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX} // to be used to declare statically aligned buffers. //------------------------------------------------------------------------------------------ @@ -637,7 +637,7 @@ namespace Eigen { #ifndef EIGEN_MAX_STATIC_ALIGN_BYTES // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES - + // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in @@ -664,13 +664,13 @@ namespace Eigen { #else #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 #endif - + #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES #else #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 #endif - + #endif // If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_ALIGN_BYTES diff --git a/Eigen/src/Geometry/Rotation2D.h b/Eigen/src/Geometry/Rotation2D.h index 5ab0d5920..b42a7df70 100644 --- a/Eigen/src/Geometry/Rotation2D.h +++ b/Eigen/src/Geometry/Rotation2D.h @@ -82,15 +82,15 @@ public: /** \returns the rotation angle in [0,2pi] */ inline Scalar smallestPositiveAngle() const { - Scalar tmp = fmod(m_angle,Scalar(2)*EIGEN_PI); - return tmp<Scalar(0) ? tmp + Scalar(2)*EIGEN_PI : tmp; + Scalar tmp = numext::fmod(m_angle,Scalar(2*EIGEN_PI)); + return tmp<Scalar(0) ? tmp + Scalar(2*EIGEN_PI) : tmp; } /** \returns the rotation angle in [-pi,pi] */ inline Scalar smallestAngle() const { - Scalar tmp = fmod(m_angle,Scalar(2)*EIGEN_PI); - if(tmp>Scalar(EIGEN_PI)) tmp -= Scalar(2)*Scalar(EIGEN_PI); - else if(tmp<-Scalar(EIGEN_PI)) tmp += Scalar(2)*Scalar(EIGEN_PI); + Scalar tmp = numext::fmod(m_angle,Scalar(2*EIGEN_PI)); + if(tmp>Scalar(EIGEN_PI)) tmp -= Scalar(2*EIGEN_PI); + else if(tmp<-Scalar(EIGEN_PI)) tmp += Scalar(2*EIGEN_PI); return tmp; } diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h index 74cd0a472..e9f3ebf88 100644 --- a/Eigen/src/Householder/HouseholderSequence.h +++ b/Eigen/src/Householder/HouseholderSequence.h @@ -243,8 +243,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS { workspace.resize(rows()); Index vecs = m_length; - if( internal::is_same<typename internal::remove_all<VectorsType>::type,Dest>::value - && internal::extract_data(dst) == internal::extract_data(m_vectors)) + if(is_same_dense(dst,m_vectors)) { // in-place dst.diagonal().setOnes(); diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 1721213d6..64b9eb7f1 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -231,6 +231,15 @@ template<typename _MatrixType> class FullPivLU return Solve<FullPivLU, Rhs>(*this, b.derived()); } + /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is + the LU decomposition. + */ + inline RealScalar rcond() const + { + eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); + return internal::rcond_estimate_helper(m_l1_norm, *this); + } + /** \returns the determinant of the matrix of which * *this is the LU decomposition. It has only linear complexity * (that is, O(n) where n is the dimension of the square matrix) @@ -410,6 +419,7 @@ template<typename _MatrixType> class FullPivLU IntColVectorType m_rowsTranspositions; IntRowVectorType m_colsTranspositions; Index m_det_pq, m_nonzero_pivots; + RealScalar m_l1_norm; RealScalar m_maxpivot, m_prescribedThreshold; bool m_isInitialized, m_usePrescribedThreshold; }; @@ -455,11 +465,12 @@ FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const EigenBase<InputType> // the permutations are stored as int indices, so just to be sure: eigen_assert(matrix.rows()<=NumTraits<int>::highest() && matrix.cols()<=NumTraits<int>::highest()); - m_isInitialized = true; m_lu = matrix.derived(); + m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); computeInPlace(); + m_isInitialized = true; return *this; } diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index ab7797d2a..2e6d91939 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -76,7 +76,6 @@ template<typename _MatrixType> class PartialPivLU typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType; typedef typename MatrixType::PlainObject PlainObject; - /** * \brief Default Constructor. * @@ -152,6 +151,15 @@ template<typename _MatrixType> class PartialPivLU return Solve<PartialPivLU, Rhs>(*this, b.derived()); } + /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is + the LU decomposition. + */ + inline RealScalar rcond() const + { + eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); + return internal::rcond_estimate_helper(m_l1_norm, *this); + } + /** \returns the inverse of the matrix of which *this is the LU decomposition. * * \warning The matrix being decomposed here is assumed to be invertible. If you need to check for @@ -178,7 +186,7 @@ template<typename _MatrixType> class PartialPivLU * * \sa MatrixBase::determinant() */ - typename internal::traits<MatrixType>::Scalar determinant() const; + Scalar determinant() const; MatrixType reconstructedMatrix() const; @@ -247,6 +255,7 @@ template<typename _MatrixType> class PartialPivLU PermutationType m_p; TranspositionType m_rowsTranspositions; Index m_det_p; + RealScalar m_l1_norm; bool m_isInitialized; }; @@ -256,6 +265,7 @@ PartialPivLU<MatrixType>::PartialPivLU() m_p(), m_rowsTranspositions(), m_det_p(0), + m_l1_norm(0), m_isInitialized(false) { } @@ -266,6 +276,7 @@ PartialPivLU<MatrixType>::PartialPivLU(Index size) m_p(size), m_rowsTranspositions(size), m_det_p(0), + m_l1_norm(0), m_isInitialized(false) { } @@ -277,6 +288,7 @@ PartialPivLU<MatrixType>::PartialPivLU(const EigenBase<InputType>& matrix) m_p(matrix.rows()), m_rowsTranspositions(matrix.rows()), m_det_p(0), + m_l1_norm(0), m_isInitialized(false) { compute(matrix.derived()); @@ -467,6 +479,7 @@ PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const EigenBase<Inpu eigen_assert(matrix.rows()<NumTraits<int>::highest()); m_lu = matrix.derived(); + m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); eigen_assert(matrix.rows() == matrix.cols() && "PartialPivLU is only for square (and moreover invertible) matrices"); const Index size = matrix.rows(); @@ -484,7 +497,7 @@ PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const EigenBase<Inpu } template<typename MatrixType> -typename internal::traits<MatrixType>::Scalar PartialPivLU<MatrixType>::determinant() const +typename PartialPivLU<MatrixType>::Scalar PartialPivLU<MatrixType>::determinant() const { eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); return Scalar(m_det_p) * m_lu.diagonal().prod(); diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h index e71944fd7..230d0d23c 100644 --- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -397,6 +397,10 @@ CompleteOrthogonalDecomposition<MatrixType>& CompleteOrthogonalDecomposition< const Index rank = m_cpqr.rank(); const Index cols = matrix.cols(); + const Index rows = matrix.rows(); + m_zCoeffs.resize((std::min)(rows, cols)); + m_temp.resize(cols); + if (rank < cols) { // We have reduced the (permuted) matrix to the form // [R11 R12] diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 3552c87bf..799e81bd7 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -11,7 +11,7 @@ // Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr> // Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr> // Copyright (C) 2013 Jitse Niesen <jitse@maths.leeds.ac.uk> -// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr> +// Copyright (C) 2014-2016 Gael Guennebaud <gael.guennebaud@inria.fr> // // Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -21,6 +21,7 @@ #define EIGEN_BDCSVD_H // #define EIGEN_BDCSVD_DEBUG_VERBOSE // #define EIGEN_BDCSVD_SANITY_CHECKS + namespace Eigen { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE @@ -49,6 +50,18 @@ struct traits<BDCSVD<_MatrixType> > * * \tparam _MatrixType the type of the matrix of which we are computing the SVD decomposition * + * This class first reduces the input matrix to bi-diagonal form using class UpperBidiagonalization, + * and then performs a divide-and-conquer diagonalization. Small blocks are diagonalized using class JacobiSVD. + * You can control the switching size with the setSwitchSize() method, default is 16. + * For small matrice (<16), it is thus preferable to directly use JacobiSVD. For larger ones, BDCSVD is highly + * recommended and can several order of magnitude faster. + * + * \warning this algorithm is unlikely to provide accurate result when compiled with unsafe math optimizations. + * For instance, this concerns Intel's compiler (ICC), which perfroms such optimization by default unless + * you compile with the \c -fp-model \c precise option. Likewise, the \c -ffast-math option of GCC or clang will + * significantly degrade the accuracy. + * + * \sa class JacobiSVD */ template<typename _MatrixType> class BDCSVD : public SVDBase<BDCSVD<_MatrixType> > @@ -228,6 +241,8 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign #endif allocate(matrix.rows(), matrix.cols(), computationOptions); using std::abs; + + const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)(); //**** step -1 - If the problem is too small, directly falls back to JacobiSVD and return if(matrix.cols() < m_algoswap) @@ -266,7 +281,7 @@ BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsign { RealScalar a = abs(m_computed.coeff(i, i)); m_singularValues.coeffRef(i) = a * scale; - if (a == 0) + if (a<considerZero) { m_nonzeroSingularValues = i; m_singularValues.tail(m_diagSize - i - 1).setZero(); @@ -380,6 +395,7 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, using std::abs; const Index n = lastCol - firstCol + 1; const Index k = n/2; + const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)(); RealScalar alphaK; RealScalar betaK; RealScalar r0; @@ -434,7 +450,7 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1); } if (m_compV) m_naiveV(firstRowW+k, firstColW) = 1; - if (r0 == 0) + if (r0<considerZero) { c0 = 1; s0 = 0; @@ -553,6 +569,8 @@ void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, template <typename MatrixType> void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) { + const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)(); + using std::abs; ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n); m_workspace.head(n) = m_computed.block(firstCol, firstCol, n, n).diagonal(); ArrayRef diag = m_workspace.head(n); @@ -575,7 +593,7 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec while(actual_n>1 && diag(actual_n-1)==0) --actual_n; Index m = 0; // size of the deflated problem for(Index k=0;k<actual_n;++k) - if(col0(k)!=0) + if(abs(col0(k))>considerZero) m_workspaceI(m++) = k; Map<ArrayXi> perm(m_workspaceI.data(),m); @@ -600,7 +618,7 @@ void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec { Index actual_n = n; - while(actual_n>1 && col0(actual_n-1)==0) --actual_n; + while(actual_n>1 && abs(col0(actual_n-1))<considerZero) --actual_n; std::cout << "\n\n mus: " << mus.head(actual_n).transpose() << "\n\n"; std::cout << " check1 (expect0) : " << ((singVals.array()-(shifts+mus)) / singVals.array()).head(actual_n).transpose() << "\n\n"; std::cout << " check2 (>0) : " << ((singVals.array()-diag) / singVals.array()).head(actual_n).transpose() << "\n\n"; @@ -680,6 +698,7 @@ typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar res += numext::abs2(col0(j)) / ((diagShifted(j) - mu) * (diag(j) + shift + mu)); } return res; + } template <typename MatrixType> @@ -746,14 +765,14 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d RealScalar muPrev, muCur; if (shift == left) { - muPrev = (right - left) * 0.1; + muPrev = (right - left) * RealScalar(0.1); if (k == actual_n-1) muCur = right - left; - else muCur = (right - left) * 0.5; + else muCur = (right - left) * RealScalar(0.5); } else { - muPrev = -(right - left) * 0.1; - muCur = -(right - left) * 0.5; + muPrev = -(right - left) * RealScalar(0.1); + muCur = -(right - left) * RealScalar(0.5); } RealScalar fPrev = secularEq(muPrev, col0, diag, perm, diagShifted, shift); @@ -798,15 +817,15 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d RealScalar leftShifted, rightShifted; if (shift == left) { - leftShifted = RealScalar(1)/NumTraits<RealScalar>::highest(); + leftShifted = (std::numeric_limits<RealScalar>::min)(); // I don't understand why the case k==0 would be special there: // if (k == 0) rightShifted = right - left; else - rightShifted = (k==actual_n-1) ? right : ((right - left) * 0.6); // theoretically we can take 0.5, but let's be safe + rightShifted = (k==actual_n-1) ? right : ((right - left) * RealScalar(0.6)); // theoretically we can take 0.5, but let's be safe } else { - leftShifted = -(right - left) * 0.6; - rightShifted = -RealScalar(1)/NumTraits<RealScalar>::highest(); + leftShifted = -(right - left) * RealScalar(0.6); + rightShifted = -(std::numeric_limits<RealScalar>::min)(); } RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift); @@ -817,7 +836,10 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE if(!(fLeft * fRight<0)) + { + std::cout << "fLeft: " << leftShifted << " - " << diagShifted.head(10).transpose() << "\n ; " << bool(left==shift) << " " << (left-shift) << "\n"; std::cout << k << " : " << fLeft << " * " << fRight << " == " << fLeft * fRight << " ; " << left << " - " << right << " -> " << leftShifted << " " << rightShifted << " shift=" << shift << "\n"; + } #endif eigen_internal_assert(fLeft * fRight < 0); @@ -1028,8 +1050,9 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index Diagonal<MatrixXr> fulldiag(m_computed); VectorBlock<Diagonal<MatrixXr>,Dynamic> diag(fulldiag, firstCol+shift, length); + const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)(); RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff(); - RealScalar epsilon_strict = NumTraits<RealScalar>::epsilon() * maxDiag; + RealScalar epsilon_strict = numext::maxi(considerZero,NumTraits<RealScalar>::epsilon() * maxDiag); RealScalar epsilon_coarse = 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag); #ifdef EIGEN_BDCSVD_SANITY_CHECKS @@ -1082,7 +1105,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index { // Check for total deflation // If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting - bool total_deflation = (col0.tail(length-1).array()==RealScalar(0)).all(); + bool total_deflation = (col0.tail(length-1).array()<considerZero).all(); // Sort the diagonal entries, since diag(1:k-1) and diag(k:length) are already sorted, let's do a sorted merge. // First, compute the respective permutation. @@ -1093,7 +1116,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index // Move deflated diagonal entries at the end. for(Index i=1; i<length; ++i) - if(diag(i)==0) + if(abs(diag(i))<considerZero) permutation[p++] = i; Index i=1, j=k+1; @@ -1112,7 +1135,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index for(Index i=1; i<length; ++i) { Index pi = permutation[i]; - if(diag(pi)==0 || diag(0)<diag(pi)) + if(abs(diag(pi))<considerZero || diag(0)<diag(pi)) permutation[i-1] = permutation[i]; else { @@ -1163,7 +1186,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index //condition 4.4 { Index i = length-1; - while(i>0 && (diag(i)==0 || col0(i)==0)) --i; + while(i>0 && (abs(diag(i))<considerZero || abs(col0(i))<considerZero)) --i; for(; i>1;--i) if( (diag(i) - diag(i-1)) < NumTraits<RealScalar>::epsilon()*maxDiag ) { @@ -1177,7 +1200,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index #ifdef EIGEN_BDCSVD_SANITY_CHECKS for(Index j=2;j<length;++j) - assert(diag(j-1)<=diag(j) || diag(j)==0); + assert(diag(j-1)<=diag(j) || abs(diag(j))<considerZero); #endif #ifdef EIGEN_BDCSVD_SANITY_CHECKS diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index bf5ff48c3..1940c8294 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -350,7 +350,8 @@ template<typename MatrixType, int QRPreconditioner> struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, false> { typedef JacobiSVD<MatrixType, QRPreconditioner> SVD; - static void run(typename SVD::WorkMatrixType&, SVD&, Index, Index) {} + typedef typename MatrixType::RealScalar RealScalar; + static bool run(typename SVD::WorkMatrixType&, SVD&, Index, Index, RealScalar&) { return true; } }; template<typename MatrixType, int QRPreconditioner> @@ -359,19 +360,30 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true> typedef JacobiSVD<MatrixType, QRPreconditioner> SVD; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; - static void run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q) + static bool run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q, RealScalar& maxDiagEntry) { using std::sqrt; + using std::abs; Scalar z; JacobiRotation<Scalar> rot; RealScalar n = sqrt(numext::abs2(work_matrix.coeff(p,p)) + numext::abs2(work_matrix.coeff(q,p))); - + + const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)(); + const RealScalar precision = NumTraits<Scalar>::epsilon(); + if(n==0) { - z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q); - work_matrix.row(p) *= z; - if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z); - if(work_matrix.coeff(q,q)!=Scalar(0)) + // make sure first column is zero + work_matrix.coeffRef(p,p) = work_matrix.coeffRef(q,p) = Scalar(0); + + if(abs(numext::imag(work_matrix.coeff(p,q)))>considerAsZero) + { + // work_matrix.coeff(p,q) can be zero if work_matrix.coeff(q,p) is not zero but small enough to underflow when computing n + z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q); + work_matrix.row(p) *= z; + if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z); + } + if(abs(numext::imag(work_matrix.coeff(q,q)))>considerAsZero) { z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q); work_matrix.row(q) *= z; @@ -385,19 +397,25 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true> rot.s() = work_matrix.coeff(q,p) / n; work_matrix.applyOnTheLeft(p,q,rot); if(svd.computeU()) svd.m_matrixU.applyOnTheRight(p,q,rot.adjoint()); - if(work_matrix.coeff(p,q) != Scalar(0)) + if(abs(numext::imag(work_matrix.coeff(p,q)))>considerAsZero) { z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q); work_matrix.col(q) *= z; if(svd.computeV()) svd.m_matrixV.col(q) *= z; } - if(work_matrix.coeff(q,q) != Scalar(0)) + if(abs(numext::imag(work_matrix.coeff(q,q)))>considerAsZero) { z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q); work_matrix.row(q) *= z; if(svd.computeU()) svd.m_matrixU.col(q) *= conj(z); } } + + // update largest diagonal entry + maxDiagEntry = numext::maxi(maxDiagEntry,numext::maxi(abs(work_matrix.coeff(p,p)), abs(work_matrix.coeff(q,q)))); + // and check whether the 2x2 block is already diagonal + RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry); + return abs(work_matrix.coeff(p,q))>threshold || abs(work_matrix.coeff(q,p)) > threshold; } }; @@ -414,7 +432,6 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q, JacobiRotation<RealScalar> rot1; RealScalar t = m.coeff(0,0) + m.coeff(1,1); RealScalar d = m.coeff(1,0) - m.coeff(0,1); - if(d == RealScalar(0)) { rot1.s() = RealScalar(0); @@ -707,6 +724,7 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig } /*** step 2. The main Jacobi SVD iteration. ***/ + RealScalar maxDiagEntry = m_workMatrix.cwiseAbs().diagonal().maxCoeff(); bool finished = false; while(!finished) @@ -722,25 +740,27 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig // if this 2x2 sub-matrix is not diagonal already... // notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't // keep us iterating forever. Similarly, small denormal numbers are considered zero. - RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, - precision * numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p,p)), - abs(m_workMatrix.coeff(q,q)))); - // We compare both values to threshold instead of calling max to be robust to NaN (See bug 791) + RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry); if(abs(m_workMatrix.coeff(p,q))>threshold || abs(m_workMatrix.coeff(q,p)) > threshold) { finished = false; - // perform SVD decomposition of 2x2 sub-matrix corresponding to indices p,q to make it diagonal - internal::svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner>::run(m_workMatrix, *this, p, q); - JacobiRotation<RealScalar> j_left, j_right; - internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right); - - // accumulate resulting Jacobi rotations - m_workMatrix.applyOnTheLeft(p,q,j_left); - if(computeU()) m_matrixU.applyOnTheRight(p,q,j_left.transpose()); - - m_workMatrix.applyOnTheRight(p,q,j_right); - if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right); + // the complex to real operation returns true is the updated 2x2 block is not already diagonal + if(internal::svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner>::run(m_workMatrix, *this, p, q, maxDiagEntry)) + { + JacobiRotation<RealScalar> j_left, j_right; + internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right); + + // accumulate resulting Jacobi rotations + m_workMatrix.applyOnTheLeft(p,q,j_left); + if(computeU()) m_matrixU.applyOnTheRight(p,q,j_left.transpose()); + + m_workMatrix.applyOnTheRight(p,q,j_right); + if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right); + + // keep track of the largest diagonal coefficient + maxDiagEntry = numext::maxi(maxDiagEntry,numext::maxi(abs(m_workMatrix.coeff(p,p)), abs(m_workMatrix.coeff(q,q)))); + } } } } diff --git a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h index fe4a97120..9143a4c82 100644 --- a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h @@ -22,7 +22,7 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased> typedef CwiseUnaryOp<UnaryOp, ArgType> XprType; class InnerIterator; -// class ReverseInnerIterator; + class ReverseInnerIterator; enum { CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost, diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h index 0ae3017cc..7e2efd452 100644 --- a/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -986,7 +986,7 @@ void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest &m_sluStat, &info, Scalar()); StatFree(&m_sluStat); - if(&x.coeffRef(0) != x_ref.data()) + if(x.derived().data() != x_ref.data()) x = x_ref; m_info = info==0 ? Success : NumericalIssue; diff --git a/Eigen/src/misc/blas.h b/Eigen/src/misc/blas.h index 6fce99ed5..25215b15e 100644 --- a/Eigen/src/misc/blas.h +++ b/Eigen/src/misc/blas.h @@ -30,15 +30,15 @@ int BLASFUNC(cdotcw) (int *, float *, int *, float *, int *, float*); int BLASFUNC(zdotuw) (int *, double *, int *, double *, int *, double*); int BLASFUNC(zdotcw) (int *, double *, int *, double *, int *, double*); -int BLASFUNC(saxpy) (int *, float *, float *, int *, float *, int *); -int BLASFUNC(daxpy) (int *, double *, double *, int *, double *, int *); -int BLASFUNC(qaxpy) (int *, double *, double *, int *, double *, int *); -int BLASFUNC(caxpy) (int *, float *, float *, int *, float *, int *); -int BLASFUNC(zaxpy) (int *, double *, double *, int *, double *, int *); -int BLASFUNC(xaxpy) (int *, double *, double *, int *, double *, int *); -int BLASFUNC(caxpyc)(int *, float *, float *, int *, float *, int *); -int BLASFUNC(zaxpyc)(int *, double *, double *, int *, double *, int *); -int BLASFUNC(xaxpyc)(int *, double *, double *, int *, double *, int *); +int BLASFUNC(saxpy) (const int *, const float *, const float *, const int *, float *, const int *); +int BLASFUNC(daxpy) (const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(qaxpy) (const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(caxpy) (const int *, const float *, const float *, const int *, float *, const int *); +int BLASFUNC(zaxpy) (const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(xaxpy) (const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(caxpyc)(const int *, const float *, const float *, const int *, float *, const int *); +int BLASFUNC(zaxpyc)(const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(xaxpyc)(const int *, const double *, const double *, const int *, double *, const int *); int BLASFUNC(scopy) (int *, float *, int *, float *, int *); int BLASFUNC(dcopy) (int *, double *, int *, double *, int *); @@ -177,31 +177,19 @@ int BLASFUNC(xgeru)(int *, int *, double *, double *, int *, int BLASFUNC(xgerc)(int *, int *, double *, double *, int *, double *, int *, double *, int *); -int BLASFUNC(sgemv)(char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(dgemv)(char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(qgemv)(char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(cgemv)(char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(zgemv)(char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(xgemv)(char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); +int BLASFUNC(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(qgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(xgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); -int BLASFUNC(strsv) (char *, char *, char *, int *, float *, int *, - float *, int *); -int BLASFUNC(dtrsv) (char *, char *, char *, int *, double *, int *, - double *, int *); -int BLASFUNC(qtrsv) (char *, char *, char *, int *, double *, int *, - double *, int *); -int BLASFUNC(ctrsv) (char *, char *, char *, int *, float *, int *, - float *, int *); -int BLASFUNC(ztrsv) (char *, char *, char *, int *, double *, int *, - double *, int *); -int BLASFUNC(xtrsv) (char *, char *, char *, int *, double *, int *, - double *, int *); +int BLASFUNC(strsv) (const char *, const char *, const char *, const int *, const float *, const int *, float *, const int *); +int BLASFUNC(dtrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *); +int BLASFUNC(qtrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *); +int BLASFUNC(ctrsv) (const char *, const char *, const char *, const int *, const float *, const int *, float *, const int *); +int BLASFUNC(ztrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *); +int BLASFUNC(xtrsv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *); int BLASFUNC(stpsv) (char *, char *, char *, int *, float *, float *, int *); int BLASFUNC(dtpsv) (char *, char *, char *, int *, double *, double *, int *); @@ -210,18 +198,12 @@ int BLASFUNC(ctpsv) (char *, char *, char *, int *, float *, float *, int *); int BLASFUNC(ztpsv) (char *, char *, char *, int *, double *, double *, int *); int BLASFUNC(xtpsv) (char *, char *, char *, int *, double *, double *, int *); -int BLASFUNC(strmv) (char *, char *, char *, int *, float *, int *, - float *, int *); -int BLASFUNC(dtrmv) (char *, char *, char *, int *, double *, int *, - double *, int *); -int BLASFUNC(qtrmv) (char *, char *, char *, int *, double *, int *, - double *, int *); -int BLASFUNC(ctrmv) (char *, char *, char *, int *, float *, int *, - float *, int *); -int BLASFUNC(ztrmv) (char *, char *, char *, int *, double *, int *, - double *, int *); -int BLASFUNC(xtrmv) (char *, char *, char *, int *, double *, int *, - double *, int *); +int BLASFUNC(strmv) (const char *, const char *, const char *, const int *, const float *, const int *, float *, const int *); +int BLASFUNC(dtrmv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *); +int BLASFUNC(qtrmv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *); +int BLASFUNC(ctrmv) (const char *, const char *, const char *, const int *, const float *, const int *, float *, const int *); +int BLASFUNC(ztrmv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *); +int BLASFUNC(xtrmv) (const char *, const char *, const char *, const int *, const double *, const int *, double *, const int *); int BLASFUNC(stpmv) (char *, char *, char *, int *, float *, float *, int *); int BLASFUNC(dtpmv) (char *, char *, char *, int *, double *, double *, int *); @@ -244,18 +226,9 @@ int BLASFUNC(ctbsv) (char *, char *, char *, int *, int *, float *, int *, floa int BLASFUNC(ztbsv) (char *, char *, char *, int *, int *, double *, int *, double *, int *); int BLASFUNC(xtbsv) (char *, char *, char *, int *, int *, double *, int *, double *, int *); -int BLASFUNC(ssymv) (char *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(dsymv) (char *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(qsymv) (char *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(csymv) (char *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(zsymv) (char *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(xsymv) (char *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); +int BLASFUNC(ssymv) (const char *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(dsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(qsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); int BLASFUNC(sspmv) (char *, int *, float *, float *, float *, int *, float *, float *, int *); @@ -263,38 +236,17 @@ int BLASFUNC(dspmv) (char *, int *, double *, double *, double *, int *, double *, double *, int *); int BLASFUNC(qspmv) (char *, int *, double *, double *, double *, int *, double *, double *, int *); -int BLASFUNC(cspmv) (char *, int *, float *, float *, - float *, int *, float *, float *, int *); -int BLASFUNC(zspmv) (char *, int *, double *, double *, - double *, int *, double *, double *, int *); -int BLASFUNC(xspmv) (char *, int *, double *, double *, - double *, int *, double *, double *, int *); -int BLASFUNC(ssyr) (char *, int *, float *, float *, int *, - float *, int *); -int BLASFUNC(dsyr) (char *, int *, double *, double *, int *, - double *, int *); -int BLASFUNC(qsyr) (char *, int *, double *, double *, int *, - double *, int *); -int BLASFUNC(csyr) (char *, int *, float *, float *, int *, - float *, int *); -int BLASFUNC(zsyr) (char *, int *, double *, double *, int *, - double *, int *); -int BLASFUNC(xsyr) (char *, int *, double *, double *, int *, - double *, int *); +int BLASFUNC(ssyr) (const char *, const int *, const float *, const float *, const int *, float *, const int *); +int BLASFUNC(dsyr) (const char *, const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(qsyr) (const char *, const int *, const double *, const double *, const int *, double *, const int *); -int BLASFUNC(ssyr2) (char *, int *, float *, - float *, int *, float *, int *, float *, int *); -int BLASFUNC(dsyr2) (char *, int *, double *, - double *, int *, double *, int *, double *, int *); -int BLASFUNC(qsyr2) (char *, int *, double *, - double *, int *, double *, int *, double *, int *); -int BLASFUNC(csyr2) (char *, int *, float *, - float *, int *, float *, int *, float *, int *); -int BLASFUNC(zsyr2) (char *, int *, double *, - double *, int *, double *, int *, double *, int *); -int BLASFUNC(xsyr2) (char *, int *, double *, - double *, int *, double *, int *, double *, int *); +int BLASFUNC(ssyr2) (const char *, const int *, const float *, const float *, const int *, const float *, const int *, float *, const int *); +int BLASFUNC(dsyr2) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, double *, const int *); +int BLASFUNC(qsyr2) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, double *, const int *); +int BLASFUNC(csyr2) (const char *, const int *, const float *, const float *, const int *, const float *, const int *, float *, const int *); +int BLASFUNC(zsyr2) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, double *, const int *); +int BLASFUNC(xsyr2) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, double *, const int *); int BLASFUNC(sspr) (char *, int *, float *, float *, int *, float *); @@ -302,12 +254,6 @@ int BLASFUNC(dspr) (char *, int *, double *, double *, int *, double *); int BLASFUNC(qspr) (char *, int *, double *, double *, int *, double *); -int BLASFUNC(cspr) (char *, int *, float *, float *, int *, - float *); -int BLASFUNC(zspr) (char *, int *, double *, double *, int *, - double *); -int BLASFUNC(xspr) (char *, int *, double *, double *, int *, - double *); int BLASFUNC(sspr2) (char *, int *, float *, float *, int *, float *, int *, float *); @@ -347,12 +293,9 @@ int BLASFUNC(zhpr2) (char *, int *, double *, int BLASFUNC(xhpr2) (char *, int *, double *, double *, int *, double *, int *, double *); -int BLASFUNC(chemv) (char *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(zhemv) (char *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(xhemv) (char *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); +int BLASFUNC(chemv) (const char *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zhemv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(xhemv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); int BLASFUNC(chpmv) (char *, int *, float *, float *, float *, int *, float *, float *, int *); @@ -401,18 +344,12 @@ int BLASFUNC(xhbmv)(char *, int *, int *, double *, double *, int *, /* Level 3 routines */ -int BLASFUNC(sgemm)(char *, char *, int *, int *, int *, float *, - float *, int *, float *, int *, float *, float *, int *); -int BLASFUNC(dgemm)(char *, char *, int *, int *, int *, double *, - double *, int *, double *, int *, double *, double *, int *); -int BLASFUNC(qgemm)(char *, char *, int *, int *, int *, double *, - double *, int *, double *, int *, double *, double *, int *); -int BLASFUNC(cgemm)(char *, char *, int *, int *, int *, float *, - float *, int *, float *, int *, float *, float *, int *); -int BLASFUNC(zgemm)(char *, char *, int *, int *, int *, double *, - double *, int *, double *, int *, double *, double *, int *); -int BLASFUNC(xgemm)(char *, char *, int *, int *, int *, double *, - double *, int *, double *, int *, double *, double *, int *); +int BLASFUNC(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(qgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(xgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); int BLASFUNC(cgemm3m)(char *, char *, int *, int *, int *, float *, float *, int *, float *, int *, float *, float *, int *); @@ -434,84 +371,48 @@ int BLASFUNC(zge2mm)(char *, char *, char *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *); -int BLASFUNC(strsm)(char *, char *, char *, char *, int *, int *, - float *, float *, int *, float *, int *); -int BLASFUNC(dtrsm)(char *, char *, char *, char *, int *, int *, - double *, double *, int *, double *, int *); -int BLASFUNC(qtrsm)(char *, char *, char *, char *, int *, int *, - double *, double *, int *, double *, int *); -int BLASFUNC(ctrsm)(char *, char *, char *, char *, int *, int *, - float *, float *, int *, float *, int *); -int BLASFUNC(ztrsm)(char *, char *, char *, char *, int *, int *, - double *, double *, int *, double *, int *); -int BLASFUNC(xtrsm)(char *, char *, char *, char *, int *, int *, - double *, double *, int *, double *, int *); - -int BLASFUNC(strmm)(char *, char *, char *, char *, int *, int *, - float *, float *, int *, float *, int *); -int BLASFUNC(dtrmm)(char *, char *, char *, char *, int *, int *, - double *, double *, int *, double *, int *); -int BLASFUNC(qtrmm)(char *, char *, char *, char *, int *, int *, - double *, double *, int *, double *, int *); -int BLASFUNC(ctrmm)(char *, char *, char *, char *, int *, int *, - float *, float *, int *, float *, int *); -int BLASFUNC(ztrmm)(char *, char *, char *, char *, int *, int *, - double *, double *, int *, double *, int *); -int BLASFUNC(xtrmm)(char *, char *, char *, char *, int *, int *, - double *, double *, int *, double *, int *); - -int BLASFUNC(ssymm)(char *, char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(dsymm)(char *, char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(qsymm)(char *, char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(csymm)(char *, char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(zsymm)(char *, char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(xsymm)(char *, char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); - -int BLASFUNC(csymm3m)(char *, char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(zsymm3m)(char *, char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(xsymm3m)(char *, char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); - -int BLASFUNC(ssyrk)(char *, char *, int *, int *, float *, float *, int *, - float *, float *, int *); -int BLASFUNC(dsyrk)(char *, char *, int *, int *, double *, double *, int *, - double *, double *, int *); -int BLASFUNC(qsyrk)(char *, char *, int *, int *, double *, double *, int *, - double *, double *, int *); -int BLASFUNC(csyrk)(char *, char *, int *, int *, float *, float *, int *, - float *, float *, int *); -int BLASFUNC(zsyrk)(char *, char *, int *, int *, double *, double *, int *, - double *, double *, int *); -int BLASFUNC(xsyrk)(char *, char *, int *, int *, double *, double *, int *, - double *, double *, int *); - -int BLASFUNC(ssyr2k)(char *, char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(dsyr2k)(char *, char *, int *, int *, double *, double *, int *, - double*, int *, double *, double *, int *); -int BLASFUNC(qsyr2k)(char *, char *, int *, int *, double *, double *, int *, - double*, int *, double *, double *, int *); -int BLASFUNC(csyr2k)(char *, char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(zsyr2k)(char *, char *, int *, int *, double *, double *, int *, - double*, int *, double *, double *, int *); -int BLASFUNC(xsyr2k)(char *, char *, int *, int *, double *, double *, int *, - double*, int *, double *, double *, int *); - -int BLASFUNC(chemm)(char *, char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(zhemm)(char *, char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); -int BLASFUNC(xhemm)(char *, char *, int *, int *, double *, double *, int *, - double *, int *, double *, double *, int *); +int BLASFUNC(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +int BLASFUNC(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(qtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +int BLASFUNC(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(xtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); + +int BLASFUNC(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +int BLASFUNC(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(qtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +int BLASFUNC(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +int BLASFUNC(xtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); + +int BLASFUNC(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(qsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(csymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(xsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); + +int BLASFUNC(csymm3m)(char *, char *, int *, int *, float *, float *, int *, float *, int *, float *, float *, int *); +int BLASFUNC(zsymm3m)(char *, char *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *); +int BLASFUNC(xsymm3m)(char *, char *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *); + +int BLASFUNC(ssyrk)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(dsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(qsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(csyrk)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(xsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *); + +int BLASFUNC(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *); +int BLASFUNC(qsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *); +int BLASFUNC(csyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *); +int BLASFUNC(xsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *); + +int BLASFUNC(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(xhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); int BLASFUNC(chemm3m)(char *, char *, int *, int *, float *, float *, int *, float *, int *, float *, float *, int *); @@ -520,136 +421,17 @@ int BLASFUNC(zhemm3m)(char *, char *, int *, int *, double *, double *, int *, int BLASFUNC(xhemm3m)(char *, char *, int *, int *, double *, double *, int *, double *, int *, double *, double *, int *); -int BLASFUNC(cherk)(char *, char *, int *, int *, float *, float *, int *, - float *, float *, int *); -int BLASFUNC(zherk)(char *, char *, int *, int *, double *, double *, int *, - double *, double *, int *); -int BLASFUNC(xherk)(char *, char *, int *, int *, double *, double *, int *, - double *, double *, int *); - -int BLASFUNC(cher2k)(char *, char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(zher2k)(char *, char *, int *, int *, double *, double *, int *, - double*, int *, double *, double *, int *); -int BLASFUNC(xher2k)(char *, char *, int *, int *, double *, double *, int *, - double*, int *, double *, double *, int *); -int BLASFUNC(cher2m)(char *, char *, char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *); -int BLASFUNC(zher2m)(char *, char *, char *, int *, int *, double *, double *, int *, - double*, int *, double *, double *, int *); -int BLASFUNC(xher2m)(char *, char *, char *, int *, int *, double *, double *, int *, - double*, int *, double *, double *, int *); - -int BLASFUNC(sgemt)(char *, int *, int *, float *, float *, int *, - float *, int *); -int BLASFUNC(dgemt)(char *, int *, int *, double *, double *, int *, - double *, int *); -int BLASFUNC(cgemt)(char *, int *, int *, float *, float *, int *, - float *, int *); -int BLASFUNC(zgemt)(char *, int *, int *, double *, double *, int *, - double *, int *); +int BLASFUNC(cherk)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zherk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(xherk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, double *, const int *); + +int BLASFUNC(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(xher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(cher2m)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zher2m)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *); +int BLASFUNC(xher2m)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double*, const int *, const double *, double *, const int *); -int BLASFUNC(sgema)(char *, char *, int *, int *, float *, - float *, int *, float *, float *, int *, float *, int *); -int BLASFUNC(dgema)(char *, char *, int *, int *, double *, - double *, int *, double*, double *, int *, double*, int *); -int BLASFUNC(cgema)(char *, char *, int *, int *, float *, - float *, int *, float *, float *, int *, float *, int *); -int BLASFUNC(zgema)(char *, char *, int *, int *, double *, - double *, int *, double*, double *, int *, double*, int *); - -int BLASFUNC(sgems)(char *, char *, int *, int *, float *, - float *, int *, float *, float *, int *, float *, int *); -int BLASFUNC(dgems)(char *, char *, int *, int *, double *, - double *, int *, double*, double *, int *, double*, int *); -int BLASFUNC(cgems)(char *, char *, int *, int *, float *, - float *, int *, float *, float *, int *, float *, int *); -int BLASFUNC(zgems)(char *, char *, int *, int *, double *, - double *, int *, double*, double *, int *, double*, int *); - -int BLASFUNC(sgetf2)(int *, int *, float *, int *, int *, int *); -int BLASFUNC(dgetf2)(int *, int *, double *, int *, int *, int *); -int BLASFUNC(qgetf2)(int *, int *, double *, int *, int *, int *); -int BLASFUNC(cgetf2)(int *, int *, float *, int *, int *, int *); -int BLASFUNC(zgetf2)(int *, int *, double *, int *, int *, int *); -int BLASFUNC(xgetf2)(int *, int *, double *, int *, int *, int *); - -int BLASFUNC(sgetrf)(int *, int *, float *, int *, int *, int *); -int BLASFUNC(dgetrf)(int *, int *, double *, int *, int *, int *); -int BLASFUNC(qgetrf)(int *, int *, double *, int *, int *, int *); -int BLASFUNC(cgetrf)(int *, int *, float *, int *, int *, int *); -int BLASFUNC(zgetrf)(int *, int *, double *, int *, int *, int *); -int BLASFUNC(xgetrf)(int *, int *, double *, int *, int *, int *); - -int BLASFUNC(slaswp)(int *, float *, int *, int *, int *, int *, int *); -int BLASFUNC(dlaswp)(int *, double *, int *, int *, int *, int *, int *); -int BLASFUNC(qlaswp)(int *, double *, int *, int *, int *, int *, int *); -int BLASFUNC(claswp)(int *, float *, int *, int *, int *, int *, int *); -int BLASFUNC(zlaswp)(int *, double *, int *, int *, int *, int *, int *); -int BLASFUNC(xlaswp)(int *, double *, int *, int *, int *, int *, int *); - -int BLASFUNC(sgetrs)(char *, int *, int *, float *, int *, int *, float *, int *, int *); -int BLASFUNC(dgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *); -int BLASFUNC(qgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *); -int BLASFUNC(cgetrs)(char *, int *, int *, float *, int *, int *, float *, int *, int *); -int BLASFUNC(zgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *); -int BLASFUNC(xgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *); - -int BLASFUNC(sgesv)(int *, int *, float *, int *, int *, float *, int *, int *); -int BLASFUNC(dgesv)(int *, int *, double *, int *, int *, double*, int *, int *); -int BLASFUNC(qgesv)(int *, int *, double *, int *, int *, double*, int *, int *); -int BLASFUNC(cgesv)(int *, int *, float *, int *, int *, float *, int *, int *); -int BLASFUNC(zgesv)(int *, int *, double *, int *, int *, double*, int *, int *); -int BLASFUNC(xgesv)(int *, int *, double *, int *, int *, double*, int *, int *); - -int BLASFUNC(spotf2)(char *, int *, float *, int *, int *); -int BLASFUNC(dpotf2)(char *, int *, double *, int *, int *); -int BLASFUNC(qpotf2)(char *, int *, double *, int *, int *); -int BLASFUNC(cpotf2)(char *, int *, float *, int *, int *); -int BLASFUNC(zpotf2)(char *, int *, double *, int *, int *); -int BLASFUNC(xpotf2)(char *, int *, double *, int *, int *); - -int BLASFUNC(spotrf)(char *, int *, float *, int *, int *); -int BLASFUNC(dpotrf)(char *, int *, double *, int *, int *); -int BLASFUNC(qpotrf)(char *, int *, double *, int *, int *); -int BLASFUNC(cpotrf)(char *, int *, float *, int *, int *); -int BLASFUNC(zpotrf)(char *, int *, double *, int *, int *); -int BLASFUNC(xpotrf)(char *, int *, double *, int *, int *); - -int BLASFUNC(slauu2)(char *, int *, float *, int *, int *); -int BLASFUNC(dlauu2)(char *, int *, double *, int *, int *); -int BLASFUNC(qlauu2)(char *, int *, double *, int *, int *); -int BLASFUNC(clauu2)(char *, int *, float *, int *, int *); -int BLASFUNC(zlauu2)(char *, int *, double *, int *, int *); -int BLASFUNC(xlauu2)(char *, int *, double *, int *, int *); - -int BLASFUNC(slauum)(char *, int *, float *, int *, int *); -int BLASFUNC(dlauum)(char *, int *, double *, int *, int *); -int BLASFUNC(qlauum)(char *, int *, double *, int *, int *); -int BLASFUNC(clauum)(char *, int *, float *, int *, int *); -int BLASFUNC(zlauum)(char *, int *, double *, int *, int *); -int BLASFUNC(xlauum)(char *, int *, double *, int *, int *); - -int BLASFUNC(strti2)(char *, char *, int *, float *, int *, int *); -int BLASFUNC(dtrti2)(char *, char *, int *, double *, int *, int *); -int BLASFUNC(qtrti2)(char *, char *, int *, double *, int *, int *); -int BLASFUNC(ctrti2)(char *, char *, int *, float *, int *, int *); -int BLASFUNC(ztrti2)(char *, char *, int *, double *, int *, int *); -int BLASFUNC(xtrti2)(char *, char *, int *, double *, int *, int *); - -int BLASFUNC(strtri)(char *, char *, int *, float *, int *, int *); -int BLASFUNC(dtrtri)(char *, char *, int *, double *, int *, int *); -int BLASFUNC(qtrtri)(char *, char *, int *, double *, int *, int *); -int BLASFUNC(ctrtri)(char *, char *, int *, float *, int *, int *); -int BLASFUNC(ztrtri)(char *, char *, int *, double *, int *, int *); -int BLASFUNC(xtrtri)(char *, char *, int *, double *, int *, int *); - -int BLASFUNC(spotri)(char *, int *, float *, int *, int *); -int BLASFUNC(dpotri)(char *, int *, double *, int *, int *); -int BLASFUNC(qpotri)(char *, int *, double *, int *, int *); -int BLASFUNC(cpotri)(char *, int *, float *, int *, int *); -int BLASFUNC(zpotri)(char *, int *, double *, int *, int *); -int BLASFUNC(xpotri)(char *, int *, double *, int *, int *); #ifdef __cplusplus } diff --git a/Eigen/src/misc/lapack.h b/Eigen/src/misc/lapack.h new file mode 100644 index 000000000..249f3575c --- /dev/null +++ b/Eigen/src/misc/lapack.h @@ -0,0 +1,152 @@ +#ifndef LAPACK_H +#define LAPACK_H + +#include "blas.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +int BLASFUNC(csymv) (const char *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +int BLASFUNC(zsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +int BLASFUNC(xsymv) (const char *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); + + +int BLASFUNC(cspmv) (char *, int *, float *, float *, + float *, int *, float *, float *, int *); +int BLASFUNC(zspmv) (char *, int *, double *, double *, + double *, int *, double *, double *, int *); +int BLASFUNC(xspmv) (char *, int *, double *, double *, + double *, int *, double *, double *, int *); + +int BLASFUNC(csyr) (char *, int *, float *, float *, int *, + float *, int *); +int BLASFUNC(zsyr) (char *, int *, double *, double *, int *, + double *, int *); +int BLASFUNC(xsyr) (char *, int *, double *, double *, int *, + double *, int *); + +int BLASFUNC(cspr) (char *, int *, float *, float *, int *, + float *); +int BLASFUNC(zspr) (char *, int *, double *, double *, int *, + double *); +int BLASFUNC(xspr) (char *, int *, double *, double *, int *, + double *); + +int BLASFUNC(sgemt)(char *, int *, int *, float *, float *, int *, + float *, int *); +int BLASFUNC(dgemt)(char *, int *, int *, double *, double *, int *, + double *, int *); +int BLASFUNC(cgemt)(char *, int *, int *, float *, float *, int *, + float *, int *); +int BLASFUNC(zgemt)(char *, int *, int *, double *, double *, int *, + double *, int *); + +int BLASFUNC(sgema)(char *, char *, int *, int *, float *, + float *, int *, float *, float *, int *, float *, int *); +int BLASFUNC(dgema)(char *, char *, int *, int *, double *, + double *, int *, double*, double *, int *, double*, int *); +int BLASFUNC(cgema)(char *, char *, int *, int *, float *, + float *, int *, float *, float *, int *, float *, int *); +int BLASFUNC(zgema)(char *, char *, int *, int *, double *, + double *, int *, double*, double *, int *, double*, int *); + +int BLASFUNC(sgems)(char *, char *, int *, int *, float *, + float *, int *, float *, float *, int *, float *, int *); +int BLASFUNC(dgems)(char *, char *, int *, int *, double *, + double *, int *, double*, double *, int *, double*, int *); +int BLASFUNC(cgems)(char *, char *, int *, int *, float *, + float *, int *, float *, float *, int *, float *, int *); +int BLASFUNC(zgems)(char *, char *, int *, int *, double *, + double *, int *, double*, double *, int *, double*, int *); + +int BLASFUNC(sgetf2)(int *, int *, float *, int *, int *, int *); +int BLASFUNC(dgetf2)(int *, int *, double *, int *, int *, int *); +int BLASFUNC(qgetf2)(int *, int *, double *, int *, int *, int *); +int BLASFUNC(cgetf2)(int *, int *, float *, int *, int *, int *); +int BLASFUNC(zgetf2)(int *, int *, double *, int *, int *, int *); +int BLASFUNC(xgetf2)(int *, int *, double *, int *, int *, int *); + +int BLASFUNC(sgetrf)(int *, int *, float *, int *, int *, int *); +int BLASFUNC(dgetrf)(int *, int *, double *, int *, int *, int *); +int BLASFUNC(qgetrf)(int *, int *, double *, int *, int *, int *); +int BLASFUNC(cgetrf)(int *, int *, float *, int *, int *, int *); +int BLASFUNC(zgetrf)(int *, int *, double *, int *, int *, int *); +int BLASFUNC(xgetrf)(int *, int *, double *, int *, int *, int *); + +int BLASFUNC(slaswp)(int *, float *, int *, int *, int *, int *, int *); +int BLASFUNC(dlaswp)(int *, double *, int *, int *, int *, int *, int *); +int BLASFUNC(qlaswp)(int *, double *, int *, int *, int *, int *, int *); +int BLASFUNC(claswp)(int *, float *, int *, int *, int *, int *, int *); +int BLASFUNC(zlaswp)(int *, double *, int *, int *, int *, int *, int *); +int BLASFUNC(xlaswp)(int *, double *, int *, int *, int *, int *, int *); + +int BLASFUNC(sgetrs)(char *, int *, int *, float *, int *, int *, float *, int *, int *); +int BLASFUNC(dgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *); +int BLASFUNC(qgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *); +int BLASFUNC(cgetrs)(char *, int *, int *, float *, int *, int *, float *, int *, int *); +int BLASFUNC(zgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *); +int BLASFUNC(xgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *); + +int BLASFUNC(sgesv)(int *, int *, float *, int *, int *, float *, int *, int *); +int BLASFUNC(dgesv)(int *, int *, double *, int *, int *, double*, int *, int *); +int BLASFUNC(qgesv)(int *, int *, double *, int *, int *, double*, int *, int *); +int BLASFUNC(cgesv)(int *, int *, float *, int *, int *, float *, int *, int *); +int BLASFUNC(zgesv)(int *, int *, double *, int *, int *, double*, int *, int *); +int BLASFUNC(xgesv)(int *, int *, double *, int *, int *, double*, int *, int *); + +int BLASFUNC(spotf2)(char *, int *, float *, int *, int *); +int BLASFUNC(dpotf2)(char *, int *, double *, int *, int *); +int BLASFUNC(qpotf2)(char *, int *, double *, int *, int *); +int BLASFUNC(cpotf2)(char *, int *, float *, int *, int *); +int BLASFUNC(zpotf2)(char *, int *, double *, int *, int *); +int BLASFUNC(xpotf2)(char *, int *, double *, int *, int *); + +int BLASFUNC(spotrf)(char *, int *, float *, int *, int *); +int BLASFUNC(dpotrf)(char *, int *, double *, int *, int *); +int BLASFUNC(qpotrf)(char *, int *, double *, int *, int *); +int BLASFUNC(cpotrf)(char *, int *, float *, int *, int *); +int BLASFUNC(zpotrf)(char *, int *, double *, int *, int *); +int BLASFUNC(xpotrf)(char *, int *, double *, int *, int *); + +int BLASFUNC(slauu2)(char *, int *, float *, int *, int *); +int BLASFUNC(dlauu2)(char *, int *, double *, int *, int *); +int BLASFUNC(qlauu2)(char *, int *, double *, int *, int *); +int BLASFUNC(clauu2)(char *, int *, float *, int *, int *); +int BLASFUNC(zlauu2)(char *, int *, double *, int *, int *); +int BLASFUNC(xlauu2)(char *, int *, double *, int *, int *); + +int BLASFUNC(slauum)(char *, int *, float *, int *, int *); +int BLASFUNC(dlauum)(char *, int *, double *, int *, int *); +int BLASFUNC(qlauum)(char *, int *, double *, int *, int *); +int BLASFUNC(clauum)(char *, int *, float *, int *, int *); +int BLASFUNC(zlauum)(char *, int *, double *, int *, int *); +int BLASFUNC(xlauum)(char *, int *, double *, int *, int *); + +int BLASFUNC(strti2)(char *, char *, int *, float *, int *, int *); +int BLASFUNC(dtrti2)(char *, char *, int *, double *, int *, int *); +int BLASFUNC(qtrti2)(char *, char *, int *, double *, int *, int *); +int BLASFUNC(ctrti2)(char *, char *, int *, float *, int *, int *); +int BLASFUNC(ztrti2)(char *, char *, int *, double *, int *, int *); +int BLASFUNC(xtrti2)(char *, char *, int *, double *, int *, int *); + +int BLASFUNC(strtri)(char *, char *, int *, float *, int *, int *); +int BLASFUNC(dtrtri)(char *, char *, int *, double *, int *, int *); +int BLASFUNC(qtrtri)(char *, char *, int *, double *, int *, int *); +int BLASFUNC(ctrtri)(char *, char *, int *, float *, int *, int *); +int BLASFUNC(ztrtri)(char *, char *, int *, double *, int *, int *); +int BLASFUNC(xtrtri)(char *, char *, int *, double *, int *, int *); + +int BLASFUNC(spotri)(char *, int *, float *, int *, int *); +int BLASFUNC(dpotri)(char *, int *, double *, int *, int *); +int BLASFUNC(qpotri)(char *, int *, double *, int *, int *); +int BLASFUNC(cpotri)(char *, int *, float *, int *, int *); +int BLASFUNC(zpotri)(char *, int *, double *, int *, int *); +int BLASFUNC(xpotri)(char *, int *, double *, int *, int *); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/Eigen/src/plugins/ArrayCwiseBinaryOps.h index 9422c40bc..5694592d6 100644 --- a/Eigen/src/plugins/ArrayCwiseBinaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseBinaryOps.h @@ -280,3 +280,21 @@ operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const return CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>(derived(),other.derived()); } +/** \returns an expression of the coefficient-wise ^ operator of *this and \a other + * + * \warning this operator is for expression of bool only. + * + * Example: \include Cwise_boolean_xor.cpp + * Output: \verbinclude Cwise_boolean_xor.out + * + * \sa operator&&(), select() + */ +template<typename OtherDerived> +EIGEN_DEVICE_FUNC +inline const CwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived> +operator^(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const +{ + EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value), + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL); + return CwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived>(derived(),other.derived()); +} |