Diffstat (limited to 'Eigen')
-rw-r--r--  Eigen/src/Cholesky/LDLT.h | 56
-rw-r--r--  Eigen/src/Cholesky/LLT.h | 47
-rw-r--r--  Eigen/src/Core/Array.h | 56
-rw-r--r--  Eigen/src/Core/CwiseUnaryView.h | 2
-rw-r--r--  Eigen/src/Core/DenseBase.h | 4
-rw-r--r--  Eigen/src/Core/GenericPacketMath.h | 50
-rw-r--r--  Eigen/src/Core/GlobalFunctions.h | 5
-rw-r--r--  Eigen/src/Core/IO.h | 22
-rw-r--r--  Eigen/src/Core/IndexedView.h | 2
-rw-r--r--  Eigen/src/Core/Matrix.h | 55
-rw-r--r--  Eigen/src/Core/MatrixBase.h | 5
-rw-r--r--  Eigen/src/Core/NestByValue.h | 69
-rw-r--r--  Eigen/src/Core/PlainObjectBase.h | 65
-rw-r--r--  Eigen/src/Core/Redux.h | 4
-rw-r--r--  Eigen/src/Core/Reshaped.h | 2
-rw-r--r--  Eigen/src/Core/Reverse.h | 2
-rw-r--r--  Eigen/src/Core/SelfAdjointView.h | 13
-rw-r--r--  Eigen/src/Core/Solve.h | 2
-rw-r--r--  Eigen/src/Core/SolverBase.h | 41
-rw-r--r--  Eigen/src/Core/Transpose.h | 3
-rw-r--r--  Eigen/src/Core/TriangularMatrix.h | 13
-rw-r--r--  Eigen/src/Core/VectorwiseOp.h | 46
-rw-r--r--  Eigen/src/Core/Visitor.h | 35
-rw-r--r--  Eigen/src/Core/arch/AVX/Complex.h | 16
-rw-r--r--  Eigen/src/Core/arch/AVX/PacketMath.h | 42
-rw-r--r--  Eigen/src/Core/arch/AVX512/Complex.h | 92
-rw-r--r--  Eigen/src/Core/arch/AVX512/MathFunctions.h | 13
-rw-r--r--  Eigen/src/Core/arch/AVX512/PacketMath.h | 83
-rw-r--r--  Eigen/src/Core/arch/AltiVec/Complex.h | 12
-rwxr-xr-x  Eigen/src/Core/arch/AltiVec/PacketMath.h | 23
-rw-r--r--  Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h | 238
-rw-r--r--  Eigen/src/Core/arch/GPU/PacketMath.h | 111
-rw-r--r--  Eigen/src/Core/arch/GPU/PacketMathHalf.h | 30
-rw-r--r--  Eigen/src/Core/arch/NEON/Complex.h | 4
-rw-r--r--  Eigen/src/Core/arch/NEON/PacketMath.h | 13
-rw-r--r--  Eigen/src/Core/arch/SSE/Complex.h | 17
-rwxr-xr-x  Eigen/src/Core/arch/SSE/PacketMath.h | 25
-rw-r--r--  Eigen/src/Core/functors/UnaryFunctors.h | 61
-rw-r--r--  Eigen/src/Core/products/GeneralBlockPanelKernel.h | 510
-rw-r--r--  Eigen/src/Core/util/ConfigureVectorization.h | 9
-rw-r--r--  Eigen/src/Core/util/ForwardDeclarations.h | 1
-rw-r--r--  Eigen/src/Core/util/Macros.h | 43
-rwxr-xr-x  Eigen/src/Core/util/Meta.h | 33
-rw-r--r--  Eigen/src/Core/util/StaticAssert.h | 3
-rw-r--r--  Eigen/src/Geometry/Transform.h | 42
-rw-r--r--  Eigen/src/Householder/HouseholderSequence.h | 18
-rw-r--r--  Eigen/src/LU/FullPivLU.h | 58
-rw-r--r--  Eigen/src/LU/PartialPivLU.h | 45
-rw-r--r--  Eigen/src/QR/ColPivHouseholderQR.h | 52
-rw-r--r--  Eigen/src/QR/CompleteOrthogonalDecomposition.h | 106
-rw-r--r--  Eigen/src/QR/FullPivHouseholderQR.h | 72
-rw-r--r--  Eigen/src/QR/HouseholderQR.h | 54
-rw-r--r--  Eigen/src/SVD/BDCSVD.h | 3
-rw-r--r--  Eigen/src/SVD/JacobiSVD.h | 1
-rw-r--r--  Eigen/src/SVD/SVDBase.h | 64
-rw-r--r--  Eigen/src/SparseCore/CompressedStorage.h | 16
-rw-r--r--  Eigen/src/SparseCore/SparseAssign.h | 33
-rw-r--r--  Eigen/src/SparseCore/SparseCompressedBase.h | 35
-rw-r--r--  Eigen/src/SparseCore/SparseMatrix.h | 111
-rw-r--r--  Eigen/src/SparseCore/SparseUtil.h | 8
-rw-r--r--  Eigen/src/plugins/ArrayCwiseUnaryOps.h | 42
-rw-r--r--  Eigen/src/plugins/CommonCwiseUnaryOps.h | 14
62 files changed, 2142 insertions, 610 deletions
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index 6831eab3d..67e97ffb8 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -16,6 +16,15 @@
namespace Eigen {
namespace internal {
+ template<typename _MatrixType, int _UpLo> struct traits<LDLT<_MatrixType, _UpLo> >
+ : traits<_MatrixType>
+ {
+ typedef MatrixXpr XprKind;
+ typedef SolverStorage StorageKind;
+ typedef int StorageIndex;
+ enum { Flags = 0 };
+ };
+
template<typename MatrixType, int UpLo> struct LDLT_Traits;
// PositiveSemiDef means positive semi-definite and non-zero; same for NegativeSemiDef
@@ -48,20 +57,19 @@ namespace internal {
* \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
*/
template<typename _MatrixType, int _UpLo> class LDLT
+ : public SolverBase<LDLT<_MatrixType, _UpLo> >
{
public:
typedef _MatrixType MatrixType;
+ typedef SolverBase<LDLT> Base;
+ friend class SolverBase<LDLT>;
+
+ EIGEN_GENERIC_PUBLIC_INTERFACE(LDLT)
enum {
- RowsAtCompileTime = MatrixType::RowsAtCompileTime,
- ColsAtCompileTime = MatrixType::ColsAtCompileTime,
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
UpLo = _UpLo
};
- typedef typename MatrixType::Scalar Scalar;
- typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
- typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
- typedef typename MatrixType::StorageIndex StorageIndex;
typedef Matrix<Scalar, RowsAtCompileTime, 1, 0, MaxRowsAtCompileTime, 1> TmpMatrixType;
typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
@@ -180,6 +188,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
return m_sign == internal::NegativeSemiDef || m_sign == internal::ZeroSign;
}
+ #ifdef EIGEN_PARSED_BY_DOXYGEN
/** \returns a solution x of \f$ A x = b \f$ using the current decomposition of A.
*
* This function also supports in-place solves using the syntax <tt>x = decompositionObject.solve(x)</tt> .
@@ -197,13 +206,8 @@ template<typename _MatrixType, int _UpLo> class LDLT
*/
template<typename Rhs>
inline const Solve<LDLT, Rhs>
- solve(const MatrixBase<Rhs>& b) const
- {
- eigen_assert(m_isInitialized && "LDLT is not initialized.");
- eigen_assert(m_matrix.rows()==b.rows()
- && "LDLT::solve(): invalid number of rows of the right hand side matrix b");
- return Solve<LDLT, Rhs>(*this, b.derived());
- }
+ solve(const MatrixBase<Rhs>& b) const;
+ #endif
template<typename Derived>
bool solveInPlace(MatrixBase<Derived> &bAndX) const;
@@ -259,6 +263,9 @@ template<typename _MatrixType, int _UpLo> class LDLT
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+ template<bool Conjugate, typename RhsType, typename DstType>
+ void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
#endif
protected:
@@ -559,14 +566,22 @@ template<typename _MatrixType, int _UpLo>
template<typename RhsType, typename DstType>
void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
{
- eigen_assert(rhs.rows() == rows());
+ _solve_impl_transposed<true>(rhs, dst);
+}
+
+template<typename _MatrixType,int _UpLo>
+template<bool Conjugate, typename RhsType, typename DstType>
+void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
// dst = P b
dst = m_transpositions * rhs;
// dst = L^-1 (P b)
- matrixL().solveInPlace(dst);
+ // dst = L^-*T (P b)
+ matrixL().template conjugateIf<!Conjugate>().solveInPlace(dst);
- // dst = D^-1 (L^-1 P b)
+ // dst = D^-* (L^-1 P b)
+ // dst = D^-1 (L^-*T P b)
// more precisely, use pseudo-inverse of D (see bug 241)
using std::abs;
const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
@@ -578,7 +593,6 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
// Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
// Using numeric_limits::min() gives us more robustness to denormals.
RealScalar tolerance = (std::numeric_limits<RealScalar>::min)();
-
for (Index i = 0; i < vecD.size(); ++i)
{
if(abs(vecD(i)) > tolerance)
@@ -587,10 +601,12 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
dst.row(i).setZero();
}
- // dst = L^-T (D^-1 L^-1 P b)
- matrixU().solveInPlace(dst);
+ // dst = L^-* (D^-* L^-1 P b)
+ // dst = L^-T (D^-1 L^-*T P b)
+ matrixL().transpose().template conjugateIf<Conjugate>().solveInPlace(dst);
- // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b
+ // dst = P^T (L^-* D^-* L^-1 P b) = A^-1 b
+ // dst = P^-T (L^-T D^-1 L^-*T P b) = A^-1 b
dst = m_transpositions.transpose() * dst;
}
#endif
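
With LDLT deriving from SolverBase, transposed and adjoint solves become available through the generic solver API. A minimal sketch, assuming a Hermitian positive-definite A (the names A, B, b, ldlt are illustrative, not part of the patch):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      // Build a Hermitian positive-definite matrix: A = B*B^H + I.
      Eigen::MatrixXcd B = Eigen::MatrixXcd::Random(4, 4);
      Eigen::MatrixXcd A = B * B.adjoint() + Eigen::MatrixXcd::Identity(4, 4);
      Eigen::VectorXcd b = Eigen::VectorXcd::Random(4);

      Eigen::LDLT<Eigen::MatrixXcd> ldlt(A);
      Eigen::VectorXcd x1 = ldlt.transpose().solve(b); // solves A^T x = b
      Eigen::VectorXcd x2 = ldlt.adjoint().solve(b);   // solves A^* x = b
      std::cout << (A.transpose() * x1 - b).norm() << "\n"
                << (A.adjoint()   * x2 - b).norm() << "\n";
    }

Both variants route through the new _solve_impl_transposed<Conjugate>, whose Conjugate flag selects between the L^-T and L^-* triangular solves shown in the patch.
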
diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h
index 868766365..5876966e6 100644
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -13,6 +13,16 @@
namespace Eigen {
namespace internal{
+
+template<typename _MatrixType, int _UpLo> struct traits<LLT<_MatrixType, _UpLo> >
+ : traits<_MatrixType>
+{
+ typedef MatrixXpr XprKind;
+ typedef SolverStorage StorageKind;
+ typedef int StorageIndex;
+ enum { Flags = 0 };
+};
+
template<typename MatrixType, int UpLo> struct LLT_Traits;
}
@@ -54,18 +64,17 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
* \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
*/
template<typename _MatrixType, int _UpLo> class LLT
+ : public SolverBase<LLT<_MatrixType, _UpLo> >
{
public:
typedef _MatrixType MatrixType;
+ typedef SolverBase<LLT> Base;
+ friend class SolverBase<LLT>;
+
+ EIGEN_GENERIC_PUBLIC_INTERFACE(LLT)
enum {
- RowsAtCompileTime = MatrixType::RowsAtCompileTime,
- ColsAtCompileTime = MatrixType::ColsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
};
- typedef typename MatrixType::Scalar Scalar;
- typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
- typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
- typedef typename MatrixType::StorageIndex StorageIndex;
enum {
PacketSize = internal::packet_traits<Scalar>::size,
@@ -129,6 +138,7 @@ template<typename _MatrixType, int _UpLo> class LLT
return Traits::getL(m_matrix);
}
+ #ifdef EIGEN_PARSED_BY_DOXYGEN
/** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
*
* Since this LLT class assumes anyway that the matrix A is invertible, the solution
@@ -141,13 +151,8 @@ template<typename _MatrixType, int _UpLo> class LLT
*/
template<typename Rhs>
inline const Solve<LLT, Rhs>
- solve(const MatrixBase<Rhs>& b) const
- {
- eigen_assert(m_isInitialized && "LLT is not initialized.");
- eigen_assert(m_matrix.rows()==b.rows()
- && "LLT::solve(): invalid number of rows of the right hand side matrix b");
- return Solve<LLT, Rhs>(*this, b.derived());
- }
+ solve(const MatrixBase<Rhs>& b) const;
+ #endif
template<typename Derived>
void solveInPlace(const MatrixBase<Derived> &bAndX) const;
@@ -205,6 +210,9 @@ template<typename _MatrixType, int _UpLo> class LLT
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+ template<bool Conjugate, typename RhsType, typename DstType>
+ void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
#endif
protected:
@@ -476,8 +484,17 @@ template<typename _MatrixType,int _UpLo>
template<typename RhsType, typename DstType>
void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
{
- dst = rhs;
- solveInPlace(dst);
+ _solve_impl_transposed<true>(rhs, dst);
+}
+
+template<typename _MatrixType,int _UpLo>
+template<bool Conjugate, typename RhsType, typename DstType>
+void LLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+ dst = rhs;
+
+ matrixL().template conjugateIf<!Conjugate>().solveInPlace(dst);
+ matrixU().template conjugateIf<!Conjugate>().solveInPlace(dst);
}
#endif
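
LLT gets the same SolverBase plumbing. A short self-contained sketch (names illustrative); for a real symmetric A the transposed solve coincides with the plain one, which makes it easy to sanity-check:

    #include <Eigen/Dense>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(3, 3);
      A = A * A.transpose() + 3.0 * Eigen::MatrixXd::Identity(3, 3); // SPD
      Eigen::VectorXd b = Eigen::VectorXd::Random(3);
      Eigen::LLT<Eigen::MatrixXd> llt(A);
      // New: transposed solve through SolverBase.
      Eigen::VectorXd x = llt.transpose().solve(b);
      return (x - llt.solve(b)).norm() < 1e-10 ? 0 : 1;
    }
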
diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h
index 16770fc7b..e58e68eda 100644
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -178,6 +178,46 @@ class Array
Base::_check_template_params();
this->template _init2<T0,T1>(val0, val1);
}
+
+ #if EIGEN_HAS_CXX11
+ /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+ *
+ * Example: \include Array_variadic_ctor_cxx11.cpp
+ * Output: \verbinclude Array_variadic_ctor_cxx11.out
+ *
+ * \sa Array(const std::initializer_list<std::initializer_list<Scalar>>&)
+ * \sa Array(Scalar), Array(Scalar,Scalar)
+ */
+ template <typename... ArgTypes>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+ : Base(a0, a1, a2, a3, args...) {}
+
+ /** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
+ *
+ * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
+ *
+ * Example: \include Array_initializer_list_23_cxx11.cpp
+ * Output: \verbinclude Array_initializer_list_23_cxx11.out
+ *
+ * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
+ *
+ * In the case of a compile-time column 1D array, implicit transposition from a single row is allowed.
+   * Therefore <code>Array<int,Dynamic,1>{{1,2,3,4,5}}</code> is legal and the more verbose syntax
+ * <code>Array<int,Dynamic,1>{{1},{2},{3},{4},{5}}</code> can be avoided:
+ *
+ * Example: \include Array_initializer_list_vector_cxx11.cpp
+ * Output: \verbinclude Array_initializer_list_vector_cxx11.out
+ *
+ * In the case of fixed-sized arrays, the initializer list sizes must exactly match the array sizes,
+ * and implicit transposition is allowed for compile-time 1D arrays only.
+ *
+ * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+ */
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Array(const std::initializer_list<std::initializer_list<Scalar> >& list) : Base(list) {}
+ #endif // end EIGEN_HAS_CXX11
+
#else
/** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */
EIGEN_DEVICE_FUNC explicit Array(const Scalar *data);
@@ -189,7 +229,8 @@ class Array
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE explicit Array(Index dim);
- /** constructs an initialized 1x1 Array with the given coefficient */
+ /** constructs an initialized 1x1 Array with the given coefficient
+    * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */
Array(const Scalar& value);
/** constructs an uninitialized array with \a rows rows and \a cols columns.
*
@@ -197,11 +238,14 @@ class Array
* it is redundant to pass these parameters, so one should use the default constructor
* Array() instead. */
Array(Index rows, Index cols);
- /** constructs an initialized 2D vector with given coefficients */
+ /** constructs an initialized 2D vector with given coefficients
+ * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */
Array(const Scalar& val0, const Scalar& val1);
- #endif
+ #endif // end EIGEN_PARSED_BY_DOXYGEN
- /** constructs an initialized 3D vector with given coefficients */
+ /** constructs an initialized 3D vector with given coefficients
+ * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+ */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2)
{
@@ -211,7 +255,9 @@ class Array
m_storage.data()[1] = val1;
m_storage.data()[2] = val2;
}
- /** constructs an initialized 4D vector with given coefficients */
+ /** constructs an initialized 4D vector with given coefficients
+ * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+ */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3)
{
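
The new initializer-list constructors documented above can be exercised as follows; a small sketch (requires C++11, names illustrative):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      // 2x3 array built from row-grouped initializer lists.
      Eigen::Array<int, 2, 3> a{{1, 2, 3}, {4, 5, 6}};
      // Implicit transposition: a single inner list may fill a column 1D array.
      Eigen::Array<int, Eigen::Dynamic, 1> v{{1, 2, 3, 4, 5}};
      std::cout << a << "\n\n" << v.transpose() << "\n";
    }
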
diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h
index 271033056..21cf5ea9e 100644
--- a/Eigen/src/Core/CwiseUnaryView.h
+++ b/Eigen/src/Core/CwiseUnaryView.h
@@ -81,7 +81,7 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
/** \returns the nested expression */
typename internal::remove_reference<MatrixTypeNested>::type&
- nestedExpression() { return m_matrix.const_cast_derived(); }
+ nestedExpression() { return m_matrix; }
protected:
MatrixTypeNested m_matrix;
diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index f8feefa27..65ec1f54b 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -150,8 +150,8 @@ template<typename Derived> class DenseBase
* \sa SizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime
*/
- IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
- || internal::traits<Derived>::MaxColsAtCompileTime == 1,
+ IsVectorAtCompileTime = internal::traits<Derived>::RowsAtCompileTime == 1
+ || internal::traits<Derived>::ColsAtCompileTime == 1,
/**< This is set to true if either the number of rows or the number of
* columns is known at compile-time to be equal to 1. Indeed, in that case,
* we are dealing with a column-vector (if there is only one column) or with
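
The fix above bases IsVectorAtCompileTime on the actual compile-time sizes rather than the Max bounds, so a type whose column count is merely bounded by 1 is no longer misclassified as a vector. A compile-time sketch (the typedef name is illustrative; requires C++11 for static_assert):

    #include <Eigen/Dense>

    // Dynamic sizes bounded by 4x1: cols() may be 0 or 1 at runtime, so this
    // is not a compile-time vector under the corrected definition.
    typedef Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, 0, 4, 1> BoundedBy4x1;
    static_assert(!BoundedBy4x1::IsVectorAtCompileTime, "not a vector");
    static_assert(Eigen::VectorXd::IsVectorAtCompileTime, "a true column vector");

    int main() { return 0; }
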
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 2b2ee9e2c..04a321b9f 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -214,6 +214,21 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; }
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pandnot(const Packet& a, const Packet& b) { return a & (~b); }
+/** \internal \returns ones */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;}
+
+template <typename RealScalar>
+EIGEN_DEVICE_FUNC inline std::complex<RealScalar> ptrue(const std::complex<RealScalar>& /*a*/) {
+ RealScalar b;
+ b = ptrue(b);
+ return std::complex<RealScalar>(b, b);
+}
+
+/** \internal \returns the bitwise not of \a a */
+template <typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pnot(const Packet& a) { return pxor(ptrue(a), a);}
+
/** \internal \returns \a a shifted by N bits to the right */
template<int N> EIGEN_DEVICE_FUNC inline int
pshiftright(const int& a) { return a >> N; }
@@ -250,19 +265,19 @@ pselect(const Packet& mask, const Packet& a, const Packet& b) {
/** \internal \returns a <= b as a bit mask */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_le(const Packet& a, const Packet& b); /* { return a<=b ? pnot(pxor(a,a)) : pxor(a,a); } */
+pcmp_le(const Packet& a, const Packet& b) { return a<=b ? ptrue(a) : pzero(a); }
/** \internal \returns a < b as a bit mask */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_lt(const Packet& a, const Packet& b); /* { return a<b ? pnot(pxor(a,a)) : pxor(a,a); } */
+pcmp_lt(const Packet& a, const Packet& b) { return a<b ? ptrue(a) : pzero(a); }
/** \internal \returns a == b as a bit mask */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_eq(const Packet& a, const Packet& b); /* { return a==b ? pnot(pxor(a,a)) : pxor(a,a); } */
+pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); }
/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_lt_or_nan(const Packet& a, const Packet& b); /* { return pnot(pcmp_le(b,a)); } */
+pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); }
/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -393,18 +408,39 @@ typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_trai
predux_half_dowto4(const Packet& a)
{ return a; }
-/** \internal \returns the product of the elements of \a a*/
+/** \internal \returns the product of the elements of \a a */
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
{ return a; }
-/** \internal \returns the min of the elements of \a a*/
+/** \internal \returns the min of the elements of \a a */
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
{ return a; }
-/** \internal \returns the max of the elements of \a a*/
+/** \internal \returns the max of the elements of \a a */
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
{ return a; }
+/** \internal \returns true if all coeffs of \a a mean "true"
+ * It is supposed to be called on values returned by pcmp_*.
+ */
+// not needed yet
+// template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a)
+// { return bool(a); }
+
+/** \internal \returns true if any coeff of \a a means "true"
+ * It is supposed to be called on values returned by pcmp_*.
+ */
+template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a)
+{
+  // Dirty but generic implementation where "true" is assumed to be non-zero and all the same.
+ // It is expected that "true" is either:
+ // - Scalar(1)
+ // - bits full of ones (NaN for floats),
+  //  - or the first bit equal to 1 (1 for ints, the smallest denormal for floats).
+ // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars.
+ return bool(predux(a));
+}
+
/** \internal \returns the reversed elements of \a a*/
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
{ return a; }
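
These generic fallbacks also apply to "packets" of size one, i.e. plain scalars, which is the easiest way to see the mask semantics. A sketch using the internal API (Eigen::internal is not a stable interface, so this is illustrative only):

    #include <Eigen/Core>
    #include <iostream>

    int main() {
      using namespace Eigen::internal;
      double a = 1.0, b = 2.0;
      // For scalars, ptrue yields an all-ones bit pattern (a NaN for double),
      // and the comparison helpers return it as the "true" mask.
      double m1 = pcmp_le(a, b); // predicate holds -> all-ones mask
      double m2 = pcmp_lt(b, a); // predicate fails -> 0.0
      std::cout << predux_any(m1) << " " << predux_any(m2) << "\n"; // 1 0
    }
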
diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h
index 563df6e84..71377cee5 100644
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -66,6 +66,11 @@ namespace Eigen
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh)
+#if EIGEN_HAS_CXX11_MATH
+ EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asinh,scalar_asinh_op,inverse hyperbolic sine,\sa ArrayBase::asinh)
+ EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acosh,scalar_acosh_op,inverse hyperbolic cosine,\sa ArrayBase::acosh)
+ EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atanh,scalar_atanh_op,inverse hyperbolic tangent,\sa ArrayBase::atanh)
+#endif
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
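
The new inverse hyperbolic functions are exposed both as ArrayBase members and as free functions; a quick sketch (requires C++11 math support, names illustrative):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::ArrayXd x = Eigen::ArrayXd::LinSpaced(5, -0.9, 0.9);
      std::cout << Eigen::atanh(x) << "\n\n"            // free function form
                << x.asinh() << "\n\n"                  // member form
                << Eigen::acosh(x.abs() + 1.0) << "\n"; // domain [1, inf)
    }
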
diff --git a/Eigen/src/Core/IO.h b/Eigen/src/Core/IO.h
index da7fd6cce..063511f24 100644
--- a/Eigen/src/Core/IO.h
+++ b/Eigen/src/Core/IO.h
@@ -41,6 +41,7 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
* - \b rowSuffix string printed at the end of each row
* - \b matPrefix string printed at the beginning of the matrix
* - \b matSuffix string printed at the end of the matrix
+ * - \b fill character printed to fill the empty space in aligned columns
*
* Example: \include IOFormat.cpp
* Output: \verbinclude IOFormat.out
@@ -53,9 +54,9 @@ struct IOFormat
IOFormat(int _precision = StreamPrecision, int _flags = 0,
const std::string& _coeffSeparator = " ",
const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="",
- const std::string& _matPrefix="", const std::string& _matSuffix="")
+ const std::string& _matPrefix="", const std::string& _matSuffix="", const char _fill=' ')
: matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator),
- rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags)
+ rowSpacer(""), coeffSeparator(_coeffSeparator), fill(_fill), precision(_precision), flags(_flags)
{
// TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline
// don't add rowSpacer if columns are not to be aligned
@@ -71,6 +72,7 @@ struct IOFormat
std::string matPrefix, matSuffix;
std::string rowPrefix, rowSuffix, rowSeparator, rowSpacer;
std::string coeffSeparator;
+ char fill;
int precision;
int flags;
};
@@ -176,18 +178,26 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
width = std::max<Index>(width, Index(sstr.str().length()));
}
}
+ std::streamsize old_width = s.width();
+ char old_fill_character = s.fill();
s << fmt.matPrefix;
for(Index i = 0; i < m.rows(); ++i)
{
if (i)
s << fmt.rowSpacer;
s << fmt.rowPrefix;
- if(width) s.width(width);
+ if(width) {
+ s.fill(fmt.fill);
+ s.width(width);
+ }
s << m.coeff(i, 0);
for(Index j = 1; j < m.cols(); ++j)
{
s << fmt.coeffSeparator;
- if (width) s.width(width);
+ if(width) {
+ s.fill(fmt.fill);
+ s.width(width);
+ }
s << m.coeff(i, j);
}
s << fmt.rowSuffix;
@@ -196,6 +206,10 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
}
s << fmt.matSuffix;
if(explicit_precision) s.precision(old_precision);
+ if(width) {
+ s.fill(old_fill_character);
+ s.width(old_width);
+ }
return s;
}
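
The new fill parameter is the last argument of the IOFormat constructor; a sketch showing padded, aligned columns (format name illustrative):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::MatrixXd m(2, 2);
      m << 1, 200, 30, 4;
      // precision, flags, coeffSeparator, rowSeparator, rowPrefix, rowSuffix,
      // matPrefix, matSuffix, and the new fill character.
      Eigen::IOFormat padded(Eigen::StreamPrecision, 0, ", ", "\n",
                             "[", "]", "", "", '_');
      std::cout << m.format(padded) << "\n"; // prints [__1, 200] and [_30, __4]
    }
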
diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h
index 3485d8f46..377f8a5cc 100644
--- a/Eigen/src/Core/IndexedView.h
+++ b/Eigen/src/Core/IndexedView.h
@@ -132,7 +132,7 @@ public:
/** \returns the nested expression */
typename internal::remove_reference<XprType>::type&
- nestedExpression() { return m_xpr.const_cast_derived(); }
+ nestedExpression() { return m_xpr; }
/** \returns a const reference to the object storing/generating the row indices */
const RowIndices& rowIndices() const { return m_rowIndices; }
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index 7f4a7af93..32269ed2e 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -301,6 +301,45 @@ class Matrix
Base::_check_template_params();
Base::template _init2<T0,T1>(x, y);
}
+
+ #if EIGEN_HAS_CXX11
+ /** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...)
+ *
+ * Example: \include Matrix_variadic_ctor_cxx11.cpp
+ * Output: \verbinclude Matrix_variadic_ctor_cxx11.out
+ *
+ * \sa Matrix(const std::initializer_list<std::initializer_list<Scalar>>&)
+ */
+ template <typename... ArgTypes>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+ : Base(a0, a1, a2, a3, args...) {}
+
+ /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
+ *
+ * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
+ *
+ * Example: \include Matrix_initializer_list_23_cxx11.cpp
+ * Output: \verbinclude Matrix_initializer_list_23_cxx11.out
+ *
+ * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
+ *
+ * In the case of a compile-time column vector, implicit transposition from a single row is allowed.
+ * Therefore <code>VectorXd{{1,2,3,4,5}}</code> is legal and the more verbose syntax
+   * <code>VectorXd{{1},{2},{3},{4},{5}}</code> can be avoided:
+ *
+ * Example: \include Matrix_initializer_list_vector_cxx11.cpp
+ * Output: \verbinclude Matrix_initializer_list_vector_cxx11.out
+ *
+ * In the case of fixed-sized matrices, the initializer list sizes must exactly match the matrix sizes,
+ * and implicit transposition is allowed for compile-time vectors only.
+ *
+ * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...)
+ */
+ EIGEN_DEVICE_FUNC
+ explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list<std::initializer_list<Scalar>>& list) : Base(list) {}
+ #endif // end EIGEN_HAS_CXX11
+
#else
/** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */
EIGEN_DEVICE_FUNC
@@ -319,7 +358,8 @@ class Matrix
* \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
*/
EIGEN_STRONG_INLINE explicit Matrix(Index dim);
- /** \brief Constructs an initialized 1x1 matrix with the given coefficient */
+ /** \brief Constructs an initialized 1x1 matrix with the given coefficient
+ * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */
Matrix(const Scalar& x);
/** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns.
*
@@ -336,11 +376,14 @@ class Matrix
EIGEN_DEVICE_FUNC
Matrix(Index rows, Index cols);
- /** \brief Constructs an initialized 2D vector with given coefficients */
+ /** \brief Constructs an initialized 2D vector with given coefficients
+ * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */
Matrix(const Scalar& x, const Scalar& y);
- #endif
+ #endif // end EIGEN_PARSED_BY_DOXYGEN
- /** \brief Constructs an initialized 3D vector with given coefficients */
+ /** \brief Constructs an initialized 3D vector with given coefficients
+ * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...)
+ */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z)
{
@@ -350,7 +393,9 @@ class Matrix
m_storage.data()[1] = y;
m_storage.data()[2] = z;
}
- /** \brief Constructs an initialized 4D vector with given coefficients */
+ /** \brief Constructs an initialized 4D vector with given coefficients
+ * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...)
+ */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w)
{
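
The Matrix constructors mirror the Array ones; a brief sketch of the row-grouped lists and of the implicit transposition documented above (names illustrative):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::Matrix<double, 2, 3> m{{1, 2, 3}, {4, 5, 6}};
      Eigen::VectorXd v{{1, 2, 3, 4, 5}};    // one row, implicitly transposed
      Eigen::RowVectorXd r{{1, 2, 3, 4, 5}}; // a row vector, taken as-is
      std::cout << m << "\n" << v.transpose() << "\n" << r << "\n";
    }
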
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 596cdd133..4744e5cc4 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -468,6 +468,11 @@ template<typename Derived> class MatrixBase
const MatrixFunctionReturnValue<Derived> matrixFunction(StemFunction f) const;
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine)
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine)
+#if EIGEN_HAS_CXX11_MATH
+  EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, atanh, inverse hyperbolic tangent)
+ EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, acosh, inverse hyperbolic cosine)
+ EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, asinh, inverse hyperbolic sine)
+#endif
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine)
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine)
EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root)
diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h
index 01cf192e9..239bbba63 100644
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -16,7 +16,11 @@ namespace Eigen {
namespace internal {
template<typename ExpressionType>
struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType>
-{};
+{
+ enum {
+ Flags = traits<ExpressionType>::Flags & ~NestByRefBit
+ };
+};
}
/** \class NestByValue
@@ -43,55 +47,11 @@ template<typename ExpressionType> class NestByValue
EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
- EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
- EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
-
- EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
- {
- return m_expression.coeff(row, col);
- }
-
- EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
- {
- return m_expression.const_cast_derived().coeffRef(row, col);
- }
-
- EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const
- {
- return m_expression.coeff(index);
- }
-
- EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
- {
- return m_expression.const_cast_derived().coeffRef(index);
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index row, Index col) const
- {
- return m_expression.template packet<LoadMode>(row, col);
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC inline void writePacket(Index row, Index col, const PacketScalar& x)
- {
- m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index index) const
- {
- return m_expression.template packet<LoadMode>(index);
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& x)
- {
- m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
- }
EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
+ EIGEN_DEVICE_FUNC const ExpressionType& nestedExpression() const { return m_expression; }
+
protected:
const ExpressionType m_expression;
};
@@ -105,6 +65,21 @@ DenseBase<Derived>::nestByValue() const
return NestByValue<Derived>(derived());
}
+namespace internal {
+
+// Evaluator of NestByValue -> simply forward to the evaluator of the nested expression
+template<typename ArgType>
+struct evaluator<NestByValue<ArgType> >
+ : public evaluator<ArgType>
+{
+ typedef evaluator<ArgType> Base;
+
+ EIGEN_DEVICE_FUNC explicit evaluator(const NestByValue<ArgType>& xpr)
+ : Base(xpr.nestedExpression())
+ {}
+};
+}
+
} // end namespace Eigen
#endif // EIGEN_NESTBYVALUE_H
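
NestByValue is thus reduced to a transparent wrapper: it drops NestByRefBit so enclosing expressions copy it, and its evaluator forwards to the nested expression. Typical use is unchanged; a sketch (names illustrative):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::MatrixXd a = Eigen::MatrixXd::Random(3, 3);
      Eigen::MatrixXd b = Eigen::MatrixXd::Random(3, 3);
      // nestByValue() stores the (a + b) subexpression by value inside the
      // product expression, instead of holding a reference to a temporary.
      Eigen::MatrixXd c = (a + b).nestByValue() * b;
      std::cout << c << "\n";
    }
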
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index f551dabb0..2deaa5aab 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -526,6 +526,71 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
// EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
}
+ #if EIGEN_HAS_CXX11
+  /** \brief Construct a row or column vector with fixed size from an arbitrary number of coefficients. \cpp11
+ *
+ * \only_for_vectors
+ *
+    * This constructor is for 1D arrays or vectors with more than 4 coefficients.
+    * There exist C++98 analogue constructors for fixed-size arrays/vectors having 1, 2, 3, or 4 coefficients.
+ *
+ * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this
+    * constructor must match the fixed number of rows (resp. columns) of \c *this.
+ */
+ template <typename... ArgTypes>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+ : m_storage()
+ {
+ _check_template_params();
+ EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, sizeof...(args) + 4);
+ m_storage.data()[0] = a0;
+ m_storage.data()[1] = a1;
+ m_storage.data()[2] = a2;
+ m_storage.data()[3] = a3;
+ int i = 4;
+ auto x = {(m_storage.data()[i++] = args, 0)...};
+ static_cast<void>(x);
+ }
+
+ /** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer
+ * lists \cpp11
+ */
+ EIGEN_DEVICE_FUNC
+ explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list<std::initializer_list<Scalar>>& list)
+ : m_storage()
+ {
+ _check_template_params();
+
+ size_t list_size = 0;
+ if (list.begin() != list.end()) {
+ list_size = list.begin()->size();
+ }
+
+ // This is to allow syntax like VectorXi {{1, 2, 3, 4}}
+ if (ColsAtCompileTime == 1 && list.size() == 1) {
+ eigen_assert(list_size == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
+ resize(list_size, ColsAtCompileTime);
+ std::copy(list.begin()->begin(), list.begin()->end(), m_storage.data());
+ } else {
+ eigen_assert(list.size() == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
+ eigen_assert(list_size == static_cast<size_t>(ColsAtCompileTime) || ColsAtCompileTime == Dynamic);
+ resize(list.size(), list_size);
+
+ Index row_index = 0;
+ for (const std::initializer_list<Scalar>& row : list) {
+ eigen_assert(list_size == row.size());
+ Index col_index = 0;
+ for (const Scalar& e : row) {
+ coeffRef(row_index, col_index) = e;
+ ++col_index;
+ }
+ ++row_index;
+ }
+ }
+ }
+ #endif // end EIGEN_HAS_CXX11
+
/** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
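
The variadic constructor complements the 1-to-4 coefficient C++98 constructors for longer fixed-size vectors; a sketch (names illustrative):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      // Six coefficients: handled by the new C++11 variadic constructor;
      // the count must match the fixed size, or a static assertion fires.
      Eigen::Matrix<double, 6, 1> v(1.0, 2.0, 3.0, 4.0, 5.0, 6.0);
      std::cout << v.transpose() << "\n";
    }
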
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 720b6030c..e231a7d7d 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -397,6 +397,8 @@ public:
* The template parameter \a BinaryOp is the type of the functor \a func which must be
* an associative operator. Both current C++98 and C++11 functor styles are handled.
*
+ * \warning the matrix must not be empty, otherwise an assertion is triggered.
+ *
* \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise()
*/
template<typename Derived>
@@ -415,6 +417,7 @@ DenseBase<Derived>::redux(const Func& func) const
}
/** \returns the minimum of all coefficients of \c *this.
+  * \warning the matrix must not be empty, otherwise an assertion is triggered.
* \warning the result is undefined if \c *this contains NaN.
*/
template<typename Derived>
@@ -425,6 +428,7 @@ DenseBase<Derived>::minCoeff() const
}
/** \returns the maximum of all coefficients of \c *this.
+  * \warning the matrix must not be empty, otherwise an assertion is triggered.
* \warning the result is undefined if \c *this contains NaN.
*/
template<typename Derived>
diff --git a/Eigen/src/Core/Reshaped.h b/Eigen/src/Core/Reshaped.h
index b7bd1b292..c955815e6 100644
--- a/Eigen/src/Core/Reshaped.h
+++ b/Eigen/src/Core/Reshaped.h
@@ -191,7 +191,7 @@ class ReshapedImpl_dense<XprType,Rows,Cols,Order,false>
/** \returns the nested expression */
EIGEN_DEVICE_FUNC
typename internal::remove_reference<XprType>::type&
- nestedExpression() { return m_xpr.const_cast_derived(); }
+ nestedExpression() { return m_xpr; }
protected:
diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h
index 8b6b3ab03..711dbcf9a 100644
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@@ -203,7 +203,7 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
template<typename ExpressionType, int Direction>
EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
{
- internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
+ internal::vectorwise_reverse_inplace_impl<Direction>::run(m_matrix);
}
} // end namespace Eigen
diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h
index 2cf3fa1ef..2173799d9 100644
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -61,6 +61,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
typedef typename internal::traits<SelfAdjointView>::Scalar Scalar;
typedef typename MatrixType::StorageIndex StorageIndex;
typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
+ typedef SelfAdjointView<typename internal::add_const<MatrixType>::type, UpLo> ConstSelfAdjointView;
enum {
Mode = internal::traits<SelfAdjointView>::Mode,
@@ -197,6 +198,18 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
inline const ConjugateReturnType conjugate() const
{ return ConjugateReturnType(m_matrix.conjugate()); }
+ /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+ * returns \c *this otherwise.
+ */
+ template<bool Cond>
+ EIGEN_DEVICE_FUNC
+ inline typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type
+ conjugateIf() const
+ {
+ typedef typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type ReturnType;
+ return ReturnType(m_matrix.template conjugateIf<Cond>());
+ }
+
typedef SelfAdjointView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
/** \sa MatrixBase::adjoint() const */
EIGEN_DEVICE_FUNC
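
conjugateIf<Cond> is the compile-time switch the new solve paths rely on: it returns the conjugated view when Cond is true and the view itself otherwise. A hedged sketch (names illustrative):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::MatrixXcd m = Eigen::MatrixXcd::Random(3, 3);
      // conjugateIf<true>() == conjugate(); conjugateIf<false>() is a no-op.
      Eigen::MatrixXcd c = m.selfadjointView<Eigen::Lower>().conjugateIf<true>();
      Eigen::MatrixXcd s = m.selfadjointView<Eigen::Lower>().conjugateIf<false>();
      std::cout << (c - s.conjugate()).norm() << "\n"; // 0
    }
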
diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h
index 2bf940a26..ec4b4a987 100644
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@@ -19,7 +19,7 @@ template<typename Decomposition, typename RhsType, typename StorageKind> class S
*
* \brief Pseudo expression representing a solving operation
*
- * \tparam Decomposition the type of the matrix or decomposion object
+ * \tparam Decomposition the type of the matrix or decomposition object
* \tparam RhsType the type of the right-hand side
*
* This class represents an expression of A.solve(B)
diff --git a/Eigen/src/Core/SolverBase.h b/Eigen/src/Core/SolverBase.h
index 702a5485c..501461042 100644
--- a/Eigen/src/Core/SolverBase.h
+++ b/Eigen/src/Core/SolverBase.h
@@ -14,8 +14,35 @@ namespace Eigen {
namespace internal {
+template<typename Derived>
+struct solve_assertion {
+ template<bool Transpose_, typename Rhs>
+ static void run(const Derived& solver, const Rhs& b) { solver.template _check_solve_assertion<Transpose_>(b); }
+};
+
+template<typename Derived>
+struct solve_assertion<Transpose<Derived> >
+{
+ typedef Transpose<Derived> type;
+
+ template<bool Transpose_, typename Rhs>
+ static void run(const type& transpose, const Rhs& b)
+ {
+ internal::solve_assertion<typename internal::remove_all<Derived>::type>::template run<true>(transpose.nestedExpression(), b);
+ }
+};
+template<typename Scalar, typename Derived>
+struct solve_assertion<CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived> > >
+{
+ typedef CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived> > type;
+ template<bool Transpose_, typename Rhs>
+ static void run(const type& adjoint, const Rhs& b)
+ {
+ internal::solve_assertion<typename internal::remove_all<Transpose<Derived> >::type>::template run<true>(adjoint.nestedExpression(), b);
+ }
+};
} // end namespace internal
/** \class SolverBase
@@ -35,7 +62,7 @@ namespace internal {
*
* \warning Currently, any other usage of transpose() and adjoint() are not supported and will produce compilation errors.
*
- * \sa class PartialPivLU, class FullPivLU
+ * \sa class PartialPivLU, class FullPivLU, class HouseholderQR, class ColPivHouseholderQR, class FullPivHouseholderQR, class CompleteOrthogonalDecomposition, class LLT, class LDLT, class SVDBase
*/
template<typename Derived>
class SolverBase : public EigenBase<Derived>
@@ -46,6 +73,9 @@ class SolverBase : public EigenBase<Derived>
typedef typename internal::traits<Derived>::Scalar Scalar;
typedef Scalar CoeffReturnType;
+ template<typename Derived_>
+ friend struct internal::solve_assertion;
+
enum {
RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
@@ -75,7 +105,7 @@ class SolverBase : public EigenBase<Derived>
inline const Solve<Derived, Rhs>
solve(const MatrixBase<Rhs>& b) const
{
- eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
+ internal::solve_assertion<typename internal::remove_all<Derived>::type>::template run<false>(derived(), b);
return Solve<Derived, Rhs>(derived(), b.derived());
}
@@ -113,6 +143,13 @@ class SolverBase : public EigenBase<Derived>
}
protected:
+
+ template<bool Transpose_, typename Rhs>
+ void _check_solve_assertion(const Rhs& b) const {
+ EIGEN_ONLY_USED_FOR_DEBUG(b);
+ eigen_assert(derived().m_isInitialized && "Solver is not initialized.");
+ eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "SolverBase::solve(): invalid number of rows of the right hand side matrix b");
+ }
};
namespace internal {
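
The net effect of solve_assertion is that the size check now looks through Transpose and adjoint wrappers: for a transposed solve, b is validated against cols() instead of rows(). A sketch with a square LU (any SolverBase-derived decomposition behaves the same; names illustrative):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(3, 3);
      Eigen::VectorXd b = Eigen::VectorXd::Random(3);
      Eigen::PartialPivLU<Eigen::MatrixXd> lu(A);
      Eigen::VectorXd x = lu.transpose().solve(b); // checks b.rows() == A.cols()
      Eigen::VectorXd y = lu.adjoint().solve(b);   // same as transpose() for real A
      std::cout << (A.transpose() * x - b).norm() << "\n";
    }
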
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index d7c204579..91a9ab1b9 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -392,7 +392,8 @@ struct checkTransposeAliasing_impl<Derived, OtherDerived, false>
template<typename Dst, typename Src>
void check_for_aliasing(const Dst &dst, const Src &src)
{
- internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
+ if((!Dst::IsVectorAtCompileTime) && dst.rows()>1 && dst.cols()>1)
+ internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
}
} // end namespace internal
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index 521de6160..cf3532f06 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -198,6 +198,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
typedef typename internal::traits<TriangularView>::MatrixTypeNestedNonRef MatrixTypeNestedNonRef;
typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
+ typedef TriangularView<typename internal::add_const<MatrixType>::type, _Mode> ConstTriangularView;
public:
@@ -243,6 +244,18 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
inline const ConjugateReturnType conjugate() const
{ return ConjugateReturnType(m_matrix.conjugate()); }
+ /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+ * returns \c *this otherwise.
+ */
+ template<bool Cond>
+ EIGEN_DEVICE_FUNC
+ inline typename internal::conditional<Cond,ConjugateReturnType,ConstTriangularView>::type
+ conjugateIf() const
+ {
+ typedef typename internal::conditional<Cond,ConjugateReturnType,ConstTriangularView>::type ReturnType;
+ return ReturnType(m_matrix.template conjugateIf<Cond>());
+ }
+
typedef TriangularView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
/** \sa MatrixBase::adjoint() const */
EIGEN_DEVICE_FUNC
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
index a88b6e736..db0b9f8c4 100644
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -173,6 +173,14 @@ struct member_redux {
* Example: \include MatrixBase_colwise_iterator_cxx11.cpp
* Output: \verbinclude MatrixBase_colwise_iterator_cxx11.out
*
+ * For a partial reduction on an empty input, some rules apply.
+ * For the sake of clarity, let's consider a vertical reduction:
+ * - If the number of columns is zero, then a 1x0 row-major vector expression is returned.
+ * - Otherwise, if the number of rows is zero, then
+ * - a row vector of zeros is returned for sum-like reductions (sum, squaredNorm, norm, etc.)
+ * - a row vector of ones is returned for a product reduction (e.g., <code>MatrixXd(n,0).colwise().prod()</code>)
+  *   - an assert is triggered for all other reductions (minCoeff, maxCoeff, redux(bin_op))
+ *
* \sa DenseBase::colwise(), DenseBase::rowwise(), class PartialReduxExpr
*/
template<typename ExpressionType, int Direction> class VectorwiseOp
@@ -294,13 +302,19 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* The template parameter \a BinaryOp is the type of the functor
* of the custom redux operator. Note that func must be an associative operator.
*
+ * \warning the size along the reduction direction must be strictly positive,
+ * otherwise an assertion is triggered.
+ *
* \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise()
*/
template<typename BinaryOp>
EIGEN_DEVICE_FUNC
const typename ReduxReturnType<BinaryOp>::Type
redux(const BinaryOp& func = BinaryOp()) const
- { return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func)); }
+ {
+ eigen_assert(redux_length()>0 && "you are using an empty matrix");
+ return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func));
+ }
typedef typename ReturnType<internal::member_minCoeff>::Type MinCoeffReturnType;
typedef typename ReturnType<internal::member_maxCoeff>::Type MaxCoeffReturnType;
@@ -325,6 +339,9 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
/** \returns a row (or column) vector expression of the smallest coefficient
* of each column (or row) of the referenced expression.
*
+ * \warning the size along the reduction direction must be strictly positive,
+ * otherwise an assertion is triggered.
+ *
* \warning the result is undefined if \c *this contains NaN.
*
* Example: \include PartialRedux_minCoeff.cpp
@@ -333,11 +350,17 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* \sa DenseBase::minCoeff() */
EIGEN_DEVICE_FUNC
const MinCoeffReturnType minCoeff() const
- { return MinCoeffReturnType(_expression()); }
+ {
+ eigen_assert(redux_length()>0 && "you are using an empty matrix");
+ return MinCoeffReturnType(_expression());
+ }
/** \returns a row (or column) vector expression of the largest coefficient
* of each column (or row) of the referenced expression.
*
+ * \warning the size along the reduction direction must be strictly positive,
+ * otherwise an assertion is triggered.
+ *
* \warning the result is undefined if \c *this contains NaN.
*
* Example: \include PartialRedux_maxCoeff.cpp
@@ -346,7 +369,10 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* \sa DenseBase::maxCoeff() */
EIGEN_DEVICE_FUNC
const MaxCoeffReturnType maxCoeff() const
- { return MaxCoeffReturnType(_expression()); }
+ {
+ eigen_assert(redux_length()>0 && "you are using an empty matrix");
+ return MaxCoeffReturnType(_expression());
+ }
/** \returns a row (or column) vector expression of the squared norm
* of each column (or row) of the referenced expression.
@@ -531,7 +557,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
//eigen_assert((m_matrix.isNull()) == (other.isNull())); FIXME
- return const_cast<ExpressionType&>(m_matrix = extendedTo(other.derived()));
+ return m_matrix = extendedTo(other.derived());
}
/** Adds the vector \a other to each subvector of \c *this */
@@ -541,7 +567,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
- return const_cast<ExpressionType&>(m_matrix += extendedTo(other.derived()));
+ return m_matrix += extendedTo(other.derived());
}
/** Subtracts the vector \a other from each subvector of \c *this */
@@ -551,7 +577,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
- return const_cast<ExpressionType&>(m_matrix -= extendedTo(other.derived()));
+ return m_matrix -= extendedTo(other.derived());
}
/** Multiplies each subvector of \c *this by the vector \a other */
@@ -563,7 +589,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
m_matrix *= extendedTo(other.derived());
- return const_cast<ExpressionType&>(m_matrix);
+ return m_matrix;
}
/** Divides each subvector of \c *this by the vector \a other */
@@ -575,7 +601,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
m_matrix /= extendedTo(other.derived());
- return const_cast<ExpressionType&>(m_matrix);
+ return m_matrix;
}
/** Returns the expression of the sum of the vector \a other to each subvector of \c *this */
@@ -690,6 +716,10 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
const HNormalizedReturnType hnormalized() const;
protected:
+ Index redux_length() const
+ {
+ return Direction==Vertical ? m_matrix.rows() : m_matrix.cols();
+ }
ExpressionTypeNested m_matrix;
};
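
The empty-reduction rules documented above can be checked directly; a sketch (names illustrative):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::MatrixXd m(0, 4); // zero rows, four columns
      // A vertical sum over the empty direction yields zeros, prod yields ones;
      // minCoeff()/maxCoeff()/redux() would trigger the new assertion instead.
      std::cout << m.colwise().sum() << "\n"   // 0 0 0 0
                << m.colwise().prod() << "\n"; // 1 1 1 1
    }
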
diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h
index 54c1883d9..f67d83bd1 100644
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -40,6 +40,14 @@ struct visitor_impl<Visitor, Derived, 1>
}
};
+// This specialization enables visitors on empty matrices at compile-time
+template<typename Visitor, typename Derived>
+struct visitor_impl<Visitor, Derived, 0> {
+ EIGEN_DEVICE_FUNC
+ static inline void run(const Derived &/*mat*/, Visitor& /*visitor*/)
+ {}
+};
+
template<typename Visitor, typename Derived>
struct visitor_impl<Visitor, Derived, Dynamic>
{
@@ -98,6 +106,8 @@ protected:
*
* \note compared to one or two \em for \em loops, visitors offer automatic
* unrolling for small fixed size matrix.
+ *
+ * \note if the matrix is empty, then the visitor is left unchanged.
*
* \sa minCoeff(Index*,Index*), maxCoeff(Index*,Index*), DenseBase::redux()
*/
@@ -106,6 +116,9 @@ template<typename Visitor>
EIGEN_DEVICE_FUNC
void DenseBase<Derived>::visit(Visitor& visitor) const
{
+ if(size()==0)
+ return;
+
typedef typename internal::visitor_evaluator<Derived> ThisEvaluator;
ThisEvaluator thisEval(derived());
@@ -124,6 +137,8 @@ namespace internal {
template <typename Derived>
struct coeff_visitor
{
+ // default initialization to avoid countless invalid maybe-uninitialized warnings by gcc
+ coeff_visitor() : row(-1), col(-1), res(0) {}
typedef typename Derived::Scalar Scalar;
Index row, col;
Scalar res;
@@ -196,6 +211,9 @@ struct functor_traits<max_coeff_visitor<Scalar> > {
/** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
* \returns the minimum of all coefficients of *this and puts in *row and *col its location.
+ *
+  * \warning the matrix must not be empty, otherwise an assertion is triggered.
+ *
* \warning the result is undefined if \c *this contains NaN.
*
* \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
@@ -206,6 +224,8 @@ EIGEN_DEVICE_FUNC
typename internal::traits<Derived>::Scalar
DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
{
+ eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
internal::min_coeff_visitor<Derived> minVisitor;
this->visit(minVisitor);
*rowId = minVisitor.row;
@@ -214,6 +234,9 @@ DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
}
/** \returns the minimum of all coefficients of *this and puts in *index its location.
+ *
+  * \warning the matrix must not be empty, otherwise an assertion is triggered.
+ *
* \warning the result is undefined if \c *this contains NaN.
*
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff()
@@ -224,6 +247,8 @@ EIGEN_DEVICE_FUNC
typename internal::traits<Derived>::Scalar
DenseBase<Derived>::minCoeff(IndexType* index) const
{
+ eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
internal::min_coeff_visitor<Derived> minVisitor;
this->visit(minVisitor);
@@ -233,6 +258,9 @@ DenseBase<Derived>::minCoeff(IndexType* index) const
/** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
* \returns the maximum of all coefficients of *this and puts in *row and *col its location.
+ *
+  * \warning the matrix must not be empty, otherwise an assertion is triggered.
+ *
* \warning the result is undefined if \c *this contains NaN.
*
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
@@ -243,6 +271,8 @@ EIGEN_DEVICE_FUNC
typename internal::traits<Derived>::Scalar
DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
{
+ eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
internal::max_coeff_visitor<Derived> maxVisitor;
this->visit(maxVisitor);
*rowPtr = maxVisitor.row;
@@ -251,6 +281,9 @@ DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
}
/** \returns the maximum of all coefficients of *this and puts in *index its location.
+ *
+  * \warning the matrix must not be empty, otherwise an assertion is triggered.
+ *
* \warning the result is undefined if \c *this contains NaN.
*
* \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff()
@@ -261,6 +294,8 @@ EIGEN_DEVICE_FUNC
typename internal::traits<Derived>::Scalar
DenseBase<Derived>::maxCoeff(IndexType* index) const
{
+ eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
internal::max_coeff_visitor<Derived> maxVisitor;
this->visit(maxVisitor);
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index e7e2a1033..5b8ff59bd 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -69,6 +69,14 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, con
return Packet4cf(result);
}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) {
+ __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ);
+ return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf ptrue<Packet4cf>(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet4cf pnot<Packet4cf>(const Packet4cf& a) { return Packet4cf(pnot(Packet8f(a.v))); }
template<> EIGEN_STRONG_INLINE Packet4cf pand <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf por <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pxor <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
@@ -276,6 +284,14 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, con
return Packet2cd(_mm256_addsub_pd(even, odd));
}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) {
+ __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ);
+ return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd ptrue<Packet2cd>(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cd pnot<Packet2cd>(const Packet2cd& a) { return Packet2cd(pnot(Packet4d(a.v))); }
template<> EIGEN_STRONG_INLINE Packet2cd pand <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd por <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pxor <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index e771c0f25..ee00f1f7d 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -228,6 +228,7 @@ template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const
template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); }
template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); }
template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); }
+template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); }
template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); }
template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) {
@@ -249,6 +250,37 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { ret
template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
+
+template<> EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ // vpcmpeqd has lower latency than the more general vcmpps
+ return _mm256_cmpeq_epi32(a,a);
+#else
+ const __m256 b = _mm256_castsi256_ps(a);
+ return _mm256_castps_si256(_mm256_cmp_ps(b,b,_CMP_TRUE_UQ));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f ptrue<Packet8f>(const Packet8f& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ // vpcmpeqd has lower latency than the more general vcmpps
+ const __m256i b = _mm256_castps_si256(a);
+ return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b,b));
+#else
+ return _mm256_cmp_ps(a,a,_CMP_TRUE_UQ);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4d ptrue<Packet4d>(const Packet4d& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ // vpcmpeqq has lower latency than the more general vcmppd
+ const __m256i b = _mm256_castpd_si256(a);
+ return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b,b));
+#else
+ return _mm256_cmp_pd(a,a,_CMP_TRUE_UQ);
+#endif
+}
+
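
These ptrue overloads exploit that x == x is unconditionally true for integer lanes, so vpcmpeqd(a,a) materializes all-ones without a memory load; a floating-point EQ would fail on NaN lanes, hence the unordered _CMP_TRUE_UQ predicate when AVX2 integer compares are unavailable. A scalar sketch of the produced bit pattern:

#include <cstdint>
#include <cstring>
float all_ones_float() {
  uint32_t bits = 0xffffffffu;       // what cmpeq(a,a) yields per integer lane
  float f;
  std::memcpy(&f, &bits, sizeof f);  // reinterpret, as _mm256_castsi256_ps does
  return f;                          // a NaN-patterned float carrying the mask
}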
template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
@@ -575,6 +607,16 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
+// not needed yet
+// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x)
+// {
+// return _mm256_movemask_ps(x)==0xFF;
+// }
+
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x)
+{
+ return _mm256_movemask_ps(x)!=0;
+}
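
predux_any collapses a lane mask into one bool: _mm256_movemask_ps packs the sign bit of each of the 8 float lanes into the low byte of an int, so "some lane is all-ones" becomes "movemask != 0". A scalar model of that reduction:

#include <cstdint>
bool any_lane_set(const uint32_t lanes[8]) {
  int movemask = 0;
  for (int i = 0; i < 8; ++i)
    movemask |= int(lanes[i] >> 31) << i;  // gather each lane's top bit
  return movemask != 0;
}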
template<int Offset>
struct palign_impl<Offset,Packet8f>
diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h
index 569ee01ff..9a89dd01f 100644
--- a/Eigen/src/Core/arch/AVX512/Complex.h
+++ b/Eigen/src/Core/arch/AVX512/Complex.h
@@ -56,6 +56,8 @@ template<> struct unpacket_traits<Packet8cf> {
typedef Packet4cf half;
};
+template<> EIGEN_STRONG_INLINE Packet8cf ptrue<Packet8cf>(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet8cf pnot<Packet8cf>(const Packet8cf& a) { return Packet8cf(pnot(Packet16f(a.v))); }
template<> EIGEN_STRONG_INLINE Packet8cf padd<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet8cf psub<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a)
@@ -67,7 +69,7 @@ template<> EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a)
const __m512 mask = _mm512_castsi512_ps(_mm512_setr_epi32(
0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,
0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
- return Packet8cf(_mm512_xor_ps(a.v,mask));
+ return Packet8cf(pxor(a.v,mask));
}
template<> EIGEN_STRONG_INLINE Packet8cf pmul<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
@@ -76,10 +78,16 @@ template<> EIGEN_STRONG_INLINE Packet8cf pmul<Packet8cf>(const Packet8cf& a, con
return Packet8cf(_mm512_fmaddsub_ps(_mm512_moveldup_ps(a.v), b.v, tmp2));
}
-template<> EIGEN_STRONG_INLINE Packet8cf pand <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_and_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf por <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_or_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf pxor <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf pandnot<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_andnot_ps(b.v,a.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf pand <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pand(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf por <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(por(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf pxor <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pxor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf pandnot<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pandnot(a.v,b.v)); }
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) {
+ __m512 eq = pcmp_eq<Packet16f>(a.v, b.v);
+ return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1)));
+}
template<> EIGEN_STRONG_INLINE Packet8cf pload <Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload<Packet16f>(&numext::real_ref(*from))); }
template<> EIGEN_STRONG_INLINE Packet8cf ploadu<Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu<Packet16f>(&numext::real_ref(*from))); }
@@ -125,20 +133,20 @@ template<> EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) {
template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet8cf>(const Packet8cf& a)
{
- return predux(padd(Packet4cf(_mm512_extractf32x8_ps(a.v,0)),
- Packet4cf(_mm512_extractf32x8_ps(a.v,1))));
+ return predux(padd(Packet4cf(extract256<0>(a.v)),
+ Packet4cf(extract256<1>(a.v))));
}
template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet8cf>(const Packet8cf& a)
{
- return predux_mul(pmul(Packet4cf(_mm512_extractf32x8_ps(a.v, 0)),
- Packet4cf(_mm512_extractf32x8_ps(a.v, 1))));
+ return predux_mul(pmul(Packet4cf(extract256<0>(a.v)),
+ Packet4cf(extract256<1>(a.v))));
}
template <>
EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4<Packet8cf>(const Packet8cf& a) {
- __m256 lane0 = _mm512_extractf32x8_ps(a.v, 0);
- __m256 lane1 = _mm512_extractf32x8_ps(a.v, 1);
+ __m256 lane0 = extract256<0>(a.v);
+ __m256 lane1 = extract256<1>(a.v);
__m256 res = _mm256_add_ps(lane0, lane1);
return Packet4cf(res);
}
@@ -264,10 +272,18 @@ template<> EIGEN_STRONG_INLINE Packet4cd pmul<Packet4cd>(const Packet4cd& a, con
return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd));
}
-template<> EIGEN_STRONG_INLINE Packet4cd pand <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_and_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd por <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_or_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd pxor <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd pandnot<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_andnot_pd(b.v,a.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd ptrue<Packet4cd>(const Packet4cd& a) { return Packet4cd(ptrue(Packet8d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet4cd pnot<Packet4cd>(const Packet4cd& a) { return Packet4cd(pnot(Packet8d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet4cd pand <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd por <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pxor <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pandnot<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pandnot(a.v,b.v)); }
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) {
+ __m512d eq = pcmp_eq<Packet8d>(a.v, b.v);
+ return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55)));
+}
template<> EIGEN_STRONG_INLINE Packet4cd pload <Packet4cd>(const std::complex<double>* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload<Packet8d>((const double*)from)); }
@@ -294,23 +310,23 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<
template<> EIGEN_DEVICE_FUNC inline Packet4cd pgather<std::complex<double>, Packet4cd>(const std::complex<double>* from, Index stride)
{
return Packet4cd(_mm512_insertf64x4(_mm512_castpd256_pd512(
- _mm256_insertf128_pd(_mm256_castpd128_pd256(pload<Packet1cd>(from+0*stride).v), pload<Packet1cd>(from+1*stride).v,1)),
- _mm256_insertf128_pd(_mm256_castpd128_pd256(pload<Packet1cd>(from+2*stride).v), pload<Packet1cd>(from+3*stride).v,1), 1));
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+0*stride).v), ploadu<Packet1cd>(from+1*stride).v,1)),
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+2*stride).v), ploadu<Packet1cd>(from+3*stride).v,1), 1));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet4cd>(std::complex<double>* to, const Packet4cd& from, Index stride)
{
__m512i fromi = _mm512_castpd_si512(from.v);
double* tod = (double*)(void*)to;
- _mm_store_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) );
- _mm_store_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) );
- _mm_store_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) );
- _mm_store_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) );
+ _mm_storeu_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) );
+ _mm_storeu_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) );
+ _mm_storeu_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) );
+ _mm_storeu_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) );
}
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet4cd>(const Packet4cd& a)
{
- __m128d low = _mm512_extractf64x2_pd(a.v, 0);
+ __m128d low = extract128<0>(a.v);
EIGEN_ALIGN16 double res[2];
_mm_store_pd(res, low);
return std::complex<double>(res[0],res[1]);
@@ -392,12 +408,40 @@ template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& x
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8cf,4>& kernel) {
- ptranspose(reinterpret_cast<PacketBlock<Packet8d,4>&>(kernel));
+ PacketBlock<Packet8d,4> pb;
+
+ pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
+ pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
+ pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
+ pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
+ ptranspose(pb);
+ kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
+ kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
+ kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
+ kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8cf,8>& kernel) {
- ptranspose(reinterpret_cast<PacketBlock<Packet8d,8>&>(kernel));
+ PacketBlock<Packet8d,8> pb;
+
+ pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
+ pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
+ pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
+ pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
+ pb.packet[4] = _mm512_castps_pd(kernel.packet[4].v);
+ pb.packet[5] = _mm512_castps_pd(kernel.packet[5].v);
+ pb.packet[6] = _mm512_castps_pd(kernel.packet[6].v);
+ pb.packet[7] = _mm512_castps_pd(kernel.packet[7].v);
+ ptranspose(pb);
+ kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
+ kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
+ kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
+ kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
+ kernel.packet[4].v = _mm512_castpd_ps(pb.packet[4]);
+ kernel.packet[5].v = _mm512_castpd_ps(pb.packet[5]);
+ kernel.packet[6].v = _mm512_castpd_ps(pb.packet[6]);
+ kernel.packet[7].v = _mm512_castpd_ps(pb.packet[7]);
}
EIGEN_DEVICE_FUNC inline void
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index aac707596..c2158c538 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -47,6 +47,7 @@ plog<Packet16f>(const Packet16f& _x) {
// The smallest non denormalized float number.
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
+ _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
// Polynomial coefficients.
@@ -116,10 +117,16 @@ plog<Packet16f>(const Packet16f& _x) {
x = padd(x, y);
x = padd(x, y2);
- // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
+ __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ);
+ // Filter out invalid inputs, i.e.:
+ // - negative arg will be NAN,
+  //  - 0 will be -INF,
+  //  - +INF will be +INF.
return _mm512_mask_blend_ps(iszero_mask,
- _mm512_mask_blend_ps(invalid_mask, x, p16f_nan),
- p16f_minus_inf);
+ _mm512_mask_blend_ps(invalid_mask,
+ _mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf),
+ p16f_nan),
+ p16f_minus_inf);
}
#endif
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 9c3121062..2199970ad 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -57,7 +57,7 @@ template<> struct packet_traits<float> : default_packet_traits
HasBlend = 0,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
-#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG
+#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
#ifdef EIGEN_VECTORIZE_AVX512DQ
HasLog = 1,
#endif
@@ -77,7 +77,7 @@ template<> struct packet_traits<double> : default_packet_traits
AlignedOnScalar = 1,
size = 8,
HasHalfPacket = 1,
-#if EIGEN_GNUC_AT_LEAST(5, 3)
+#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
HasSqrt = EIGEN_FAST_MATH,
HasRsqrt = EIGEN_FAST_MATH,
#endif
@@ -262,12 +262,78 @@ EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
return _mm512_max_pd(b, a);
}
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); }
+template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); }
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); }
+#else
+// AVX512F does not define _mm512_extractf32x8_ps to extract __m256 from __m512
+template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
+ return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_));
+}
+
+// AVX512F does not define _mm512_extractf64x2_pd to extract __m128d from __m512d
+template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
+ return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_));
+}
+
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
+ return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)),
+ _mm256_castps_si256(b),1));
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
+ __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
+ return _mm512_castsi512_ps(
+ _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
+ __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
+ return _mm512_castsi512_ps(
+ _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
+ __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ);
+ return _mm512_castsi512_ps(
+ _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) {
- __m256i lo = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0));
- __m256i hi = _mm256_cmpeq_epi32(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1));
- return _mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1);
+  __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ);
+ return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu);
}
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
+ __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
+ return _mm512_castsi512_ps(
+ _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) {
+ __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
+ return _mm512_castsi512_pd(
+ _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
+}
+
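
AVX512 comparisons return a compact k-mask (one bit per lane), while Eigen's pcmp_* contract is a full-width vector mask with all-ones in true lanes. The _mm512_mask_set1_epi32(zero, mask, 0xffffffff) idiom performs the widening: start from zero and set exactly the lanes the k-mask selects. In scalar form:

#include <cstdint>
void widen_kmask(uint16_t kmask, uint32_t out[16]) {
  for (int i = 0; i < 16; ++i)
    out[i] = ((kmask >> i) & 1) ? 0xffffffffu : 0u;  // one mask bit -> one 32-bit lane
}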
+template <>
+EIGEN_STRONG_INLINE Packet16i ptrue<Packet16i>(const Packet16i& /*a*/) {
+ return _mm512_set1_epi32(0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f ptrue<Packet16f>(const Packet16f& a) {
+ return _mm512_castsi512_ps(ptrue<Packet16i>(_mm512_castps_si512(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d ptrue<Packet8d>(const Packet8d& a) {
+ return _mm512_castsi512_pd(ptrue<Packet16i>(_mm512_castpd_si512(a)));
+}
template <>
EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a,
@@ -924,6 +990,13 @@ EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
}
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x)
+{
+ Packet16i xi = _mm512_castps_si512(x);
+ __mmask16 tmp = _mm512_test_epi32_mask(xi,xi);
+ return !_mm512_kortestz(tmp,tmp);
+}
+
template <int Offset>
struct palign_impl<Offset, Packet16f> {
static EIGEN_STRONG_INLINE void run(Packet16f& first,
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 5404a624e..440d058d8 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -82,14 +82,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<f
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
{
- std::complex<float> EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 std::complex<float> af[2];
af[0] = from[0*stride];
af[1] = from[1*stride];
return pload<Packet2cf>(af);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
{
- std::complex<float> EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 std::complex<float> af[2];
pstore<std::complex<float> >((std::complex<float> *) af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
@@ -128,7 +128,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::co
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
- std::complex<float> EIGEN_ALIGN16 res[2];
+ EIGEN_ALIGN16 std::complex<float> res[2];
pstore((float *)&res, a.v);
return res[0];
@@ -298,14 +298,14 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<dou
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
{
- std::complex<double> EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 std::complex<double> af[2];
af[0] = from[0*stride];
af[1] = from[1*stride];
return pload<Packet1cd>(af);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
{
- std::complex<double> EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 std::complex<double> af[2];
pstore<std::complex<double> >(af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
@@ -345,7 +345,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::c
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
- std::complex<double> EIGEN_ALIGN16 res[2];
+ EIGEN_ALIGN16 std::complex<double> res[2];
pstore<std::complex<double> >(res, a);
return res[0];
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 2c06003ed..9535724eb 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -324,7 +324,7 @@ pbroadcast4<Packet4i>(const int *a,
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
- float EIGEN_ALIGN16 af[4];
+ EIGEN_ALIGN16 float af[4];
af[0] = from[0*stride];
af[1] = from[1*stride];
af[2] = from[2*stride];
@@ -333,7 +333,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
{
- int EIGEN_ALIGN16 ai[4];
+ EIGEN_ALIGN16 int ai[4];
ai[0] = from[0*stride];
ai[1] = from[1*stride];
ai[2] = from[2*stride];
@@ -342,7 +342,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* f
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
{
- float EIGEN_ALIGN16 af[4];
+ EIGEN_ALIGN16 float af[4];
pstore<float>(af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
@@ -351,7 +351,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, co
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
{
- int EIGEN_ALIGN16 ai[4];
+ EIGEN_ALIGN16 int ai[4];
pstore<int>((int *)ai, from);
to[0*stride] = ai[0];
to[1*stride] = ai[1];
@@ -565,8 +565,8 @@ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f&
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
-template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; }
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
{
@@ -720,6 +720,11 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
return pfirst(res);
}
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+ return vec_any_ne(x, pzero(x));
+}
+
template<int Offset>
struct palign_impl<Offset,Packet4f>
{
@@ -980,14 +985,14 @@ pbroadcast4<Packet2d>(const double *a,
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
- double EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 double af[2];
af[0] = from[0*stride];
af[1] = from[1*stride];
return pload<Packet2d>(af);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
{
- double EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 double af[2];
pstore<double>(af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
@@ -1059,7 +1064,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d&
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
{
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 9481850c6..ce3f0fc68 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -3,7 +3,7 @@
//
// Copyright (C) 2007 Julien Pommier
// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
-// Copyright (C) 2009-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -54,6 +54,7 @@ Packet plog_float(const Packet _x)
// The smallest non denormalized float number.
const Packet cst_min_norm_pos = pset1frombits<Packet>( 0x00800000u);
const Packet cst_minus_inf = pset1frombits<Packet>( 0xff800000u);
+ const Packet cst_pos_inf = pset1frombits<Packet>( 0x7f800000u);
// Polynomial coefficients.
const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f);
@@ -69,9 +70,6 @@ Packet plog_float(const Packet _x)
const Packet cst_cephes_log_q1 = pset1<Packet>(-2.12194440e-4f);
const Packet cst_cephes_log_q2 = pset1<Packet>(0.693359375f);
- Packet invalid_mask = pcmp_lt_or_nan(x, pzero(x));
- Packet iszero_mask = pcmp_eq(x,pzero(x));
-
// Truncate input values to the minimum positive normal.
x = pmax(x, cst_min_norm_pos);
@@ -117,8 +115,15 @@ Packet plog_float(const Packet _x)
x = padd(x, y);
x = padd(x, y2);
- // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
- return pselect(iszero_mask, cst_minus_inf, por(x, invalid_mask));
+ Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
+ Packet iszero_mask = pcmp_eq(_x,pzero(_x));
+ Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
+ // Filter out invalid inputs, i.e.:
+ // - negative arg will be NAN
+ // - 0 will be -INF
+ // - +INF will be +INF
+ return pselect(iszero_mask, cst_minus_inf,
+ por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
}
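
The reworked masking derives all three masks from the raw input _x (before it is clamped to cst_min_norm_pos) and nests the selects so the IEEE special cases come out right. A scalar reference of the intended semantics (a sketch; the packet code expresses the same via pselect/por):

#include <cmath>
#include <limits>
float log_reference(float x) {
  if (x == 0.f) return -std::numeric_limits<float>::infinity();                 // 0       -> -INF
  if (x < 0.f || std::isnan(x)) return std::numeric_limits<float>::quiet_NaN(); // <0, NaN -> NaN
  if (std::isinf(x)) return x;                                                  // +INF    -> +INF
  return std::log(x);
}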
// Exponential function. Works by writing "x = m*log(2) + r" where
@@ -248,15 +253,68 @@ Packet pexp_double(const Packet _x)
return pmax(pldexp(x,fx), _x);
}
-/* The code is the rewriting of the cephes sinf/cosf functions.
- Precision is excellent as long as x < 8192 (I did not bother to
- take into account the special handling they have for greater values
- -- it does not return garbage for arguments over 8192, though, but
- the extra precision is missing).
-
- Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
- surprising but correct result.
-*/
+// The following code is inspired by the following stack-overflow answer:
+// https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
+// It has been largely optimized:
+//  - By-passing calls to frexp.
+//  - Performing aligned loads of the required 96 bits of 2/pi. This is accomplished by
+//    (1) balancing the mantissa and exponent so that the required bits of 2/pi are
+//    aligned on 8-bit boundaries, and (2) replicating the storage of the bits of 2/pi.
+//  - Avoiding a branch in rounding and extraction of the remaining fractional part.
+// Overall, I measured a speedup higher than 2x on x86-64.
+inline float trig_reduce_huge (float xf, int *quadrant)
+{
+ using Eigen::numext::int32_t;
+ using Eigen::numext::uint32_t;
+ using Eigen::numext::int64_t;
+ using Eigen::numext::uint64_t;
+
+ const double pio2_62 = 3.4061215800865545e-19; // pi/2 * 2^-62
+  const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62 fixed-point format
+
+ // 192 bits of 2/pi for Payne-Hanek reduction
+ // Bits are introduced by packet of 8 to enable aligned reads.
+ static const uint32_t two_over_pi [] =
+ {
+ 0x00000028, 0x000028be, 0x0028be60, 0x28be60db,
+ 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a,
+ 0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4,
+ 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770,
+ 0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566,
+ 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410,
+ 0x10e41000, 0xe4100000
+ };
+
+ uint32_t xi = numext::as_uint(xf);
+ // Below, -118 = -126 + 8.
+ // -126 is to get the exponent,
+ // +8 is to enable alignment of 2/pi's bits on 8 bits.
+  // This is possible because the fractional part of x has only 24 meaningful bits.
+ uint32_t e = (xi >> 23) - 118;
+ // Extract the mantissa and shift it to align it wrt the exponent
+ xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7);
+
+ uint32_t i = e >> 3;
+ uint32_t twoopi_1 = two_over_pi[i-1];
+ uint32_t twoopi_2 = two_over_pi[i+3];
+ uint32_t twoopi_3 = two_over_pi[i+7];
+
+ // Compute x * 2/pi in 2.62-bit fixed-point format.
+ uint64_t p;
+ p = uint64_t(xi) * twoopi_3;
+ p = uint64_t(xi) * twoopi_2 + (p >> 32);
+ p = (uint64_t(xi * twoopi_1) << 32) + p;
+
+ // Round to nearest: add 0.5 and extract integral part.
+ uint64_t q = (p + zero_dot_five) >> 62;
+ *quadrant = int(q);
+  // Now it remains to compute "r = x - q*pi/2" with high accuracy.
+  // Since we have p = x/(pi/2) with high accuracy, we can more efficiently compute r as:
+  //    r = (p-q)*pi/2,
+  // where the product can be carried out with sufficient accuracy using double precision.
+ p -= q<<62;
+ return float(double(int64_t(p)) * pio2_62);
+}
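
Illustrative use of the reduction (a hypothetical wrapper, not part of the patch): trig_reduce_huge returns r with x ~= q*(pi/2) + r and |r| <= pi/4, so q mod 4 selects which small-argument polynomial and sign apply:

#include <cmath>
float sin_via_reduction(float x) {   // assumes x is huge but finite
  int q;
  float r = trig_reduce_huge(x, &q);
  switch (q & 3) {                   // standard quadrant identity for sin
    case 0:  return  std::sin(r);
    case 1:  return  std::cos(r);
    case 2:  return -std::sin(r);
    default: return -std::cos(r);
  }
}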
template<bool ComputeSine,typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
@@ -264,80 +322,116 @@ EIGEN_UNUSED
Packet psincos_float(const Packet& _x)
{
typedef typename unpacket_traits<Packet>::integer_packet PacketI;
- const Packet cst_1 = pset1<Packet>(1.0f);
- const Packet cst_half = pset1<Packet>(0.5f);
-
- const PacketI csti_1 = pset1<PacketI>(1);
- const PacketI csti_not1 = pset1<PacketI>(~1);
- const PacketI csti_2 = pset1<PacketI>(2);
- const PacketI csti_3 = pset1<PacketI>(3);
-
- const Packet cst_sign_mask = pset1frombits<Packet>(0x80000000u);
-
- const Packet cst_minus_cephes_DP1 = pset1<Packet>(-0.78515625f);
- const Packet cst_minus_cephes_DP2 = pset1<Packet>(-2.4187564849853515625e-4f);
- const Packet cst_minus_cephes_DP3 = pset1<Packet>(-3.77489497744594108e-8f);
- const Packet cst_sincof_p0 = pset1<Packet>(-1.9515295891E-4f);
- const Packet cst_sincof_p1 = pset1<Packet>( 8.3321608736E-3f);
- const Packet cst_sincof_p2 = pset1<Packet>(-1.6666654611E-1f);
- const Packet cst_coscof_p0 = pset1<Packet>( 2.443315711809948E-005f);
- const Packet cst_coscof_p1 = pset1<Packet>(-1.388731625493765E-003f);
- const Packet cst_coscof_p2 = pset1<Packet>( 4.166664568298827E-002f);
- const Packet cst_cephes_FOPI = pset1<Packet>( 1.27323954473516f); // 4 / M_PI
- Packet x = pabs(_x);
+ const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f); // 2/PI
+  const Packet cst_rounding_magic = pset1<Packet>(12582912); // = 1.5*2^23, magic constant for round-to-nearest
+ const PacketI csti_1 = pset1<PacketI>(1);
+ const Packet cst_sign_mask = pset1frombits<Packet>(0x80000000u);
- // Scale x by 4/Pi to find x's octant.
- Packet y = pmul(x, cst_cephes_FOPI);
+ Packet x = pabs(_x);
- // Get the octant. We'll reduce x by this number of octants or by one more than it.
- PacketI y_int = pcast<Packet,PacketI>(y);
- // x's from even-numbered octants will translate to octant 0: [0, +Pi/4].
- // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
- // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
- PacketI y_int1 = pand(padd(y_int, csti_1), csti_not1); // could be pbitclear<0>(...)
- y = pcast<PacketI,Packet>(y_int1);
+  // Scale x by 2/Pi to find x's quadrant.
+ Packet y = pmul(x, cst_2oPI);
+
+ // Rounding trick:
+ Packet y_round = padd(y, cst_rounding_magic);
+  PacketI y_int = preinterpret<PacketI>(y_round); // last 23 bits represent the integer (if abs(x)<2^24)
+  y = psub(y_round, cst_rounding_magic); // nearest integer to x*2/pi
+
+  // Reduce x by y quadrants to get: -Pi/4 <= x <= +Pi/4
+ // using "Extended precision modular arithmetic"
+ #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
+ // This version requires true FMA for high accuracy
+  // It provides a max error of 1 ULP up to the following threshold (with absolute error < 5.9605e-08):
+ const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
+ x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
+ x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
+ x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
+ #else
+  // Without true FMA, the previous set of coefficients maintains 1ULP accuracy
+ // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
+ // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
+
+  // The following set of coefficients maintains 1ULP up to 9.43 and 14.16 for sin and cos respectively,
+ // and 2 ULP up to:
+ const float huge_th = ComputeSine ? 25966.f : 18838.f;
+ x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
+ x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
+ x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
+ x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
+
+ // For the record, the following set of coefficients maintain 2ULP up
+ // to a slightly larger range:
+ // const float huge_th = ComputeSine ? 51981.f : 39086.125f;
+ // but it slightly fails to maintain 1ULP for two values of sin below pi.
+ // x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
+ // x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
+ // x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
+ // x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
+
+ // For the record, with only 3 iterations it is possible to maintain
+ // 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
+ // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
+ #endif
+
+ if(predux_any(pcmp_le(pset1<Packet>(huge_th),pabs(_x))))
+ {
+ const int PacketSize = unpacket_traits<Packet>::size;
+ EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
+ EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize];
+ EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) int y_int2[PacketSize];
+ pstoreu(vals, pabs(_x));
+ pstoreu(x_cpy, x);
+ pstoreu(y_int2, y_int);
+ for(int k=0; k<PacketSize;++k)
+ {
+ float val = vals[k];
+ if(val>=huge_th && (numext::isfinite)(val))
+ x_cpy[k] = trig_reduce_huge(val,&y_int2[k]);
+ }
+ x = ploadu<Packet>(x_cpy);
+ y_int = ploadu<PacketI>(y_int2);
+ }
// Compute the sign to apply to the polynomial.
- // sign = third_bit(y_int1) xor signbit(_x)
- Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(pshiftleft<29>(y_int1)))
- : preinterpret<Packet>(pshiftleft<29>(padd(y_int1,csti_3)));
+ // sin: sign = second_bit(y_int) xor signbit(_x)
+ // cos: sign = second_bit(y_int+1)
+ Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(pshiftleft<30>(y_int)))
+ : preinterpret<Packet>(pshiftleft<30>(padd(y_int,csti_1)));
sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
- // Get the polynomial selection mask from the second bit of y_int1
+  // Get the polynomial selection mask from the lowest bit of y_int
// We'll calculate both (sin and cos) polynomials and then select from the two.
- Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int1, csti_2), pzero(y_int1)));
-
- // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4.
- // The magic pass: "Extended precision modular arithmetic"
- // x = ((x - y * DP1) - y * DP2) - y * DP3
- x = pmadd(y, cst_minus_cephes_DP1, x);
- x = pmadd(y, cst_minus_cephes_DP2, x);
- x = pmadd(y, cst_minus_cephes_DP3, x);
+ Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
Packet x2 = pmul(x,x);
- // Evaluate the cos(x) polynomial. (0 <= x <= Pi/4)
- Packet y1 = cst_coscof_p0;
- y1 = pmadd(y1, x2, cst_coscof_p1);
- y1 = pmadd(y1, x2, cst_coscof_p2);
- y1 = pmul(y1, x2);
- y1 = pmul(y1, x2);
- y1 = psub(y1, pmul(x2, cst_half));
- y1 = padd(y1, cst_1);
-
- // Evaluate the sin(x) polynomial. (Pi/4 <= x <= 0)
- Packet y2 = cst_sincof_p0;
- y2 = pmadd(y2, x2, cst_sincof_p1);
- y2 = pmadd(y2, x2, cst_sincof_p2);
+ // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
+ Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f);
+ y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f ));
+ y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f ));
+ y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
+ y1 = pmadd(y1, x2, pset1<Packet>(1.f));
+
+  // Evaluate the sin(x) polynomial. (-Pi/4 <= x <= Pi/4)
+ // octave/matlab code to compute those coefficients:
+ // x = (0:0.0001:pi/4)';
+ // A = [x.^3 x.^5 x.^7];
+ // w = ((1.-(x/(pi/4)).^2).^5)*2000+1; # weights trading relative accuracy
+ // c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
+ // printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
+ //
+ Packet y2 = pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
+ y2 = pmadd(y2, x2, pset1<Packet>( 0.0083326873655616851693794799871284340042620897293090820312500000f));
+ y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
y2 = pmul(y2, x2);
y2 = pmadd(y2, x, x);
- // Select the correct result from the two polynoms.
+ // Select the correct result from the two polynomials.
y = ComputeSine ? pselect(poly_mask,y2,y1)
: pselect(poly_mask,y1,y2);
- // Update the sign
+ // Update the sign and filter huge inputs
return pxor(y, sign_bit);
}
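
The rounding trick above deserves a standalone illustration: adding 1.5*2^23 to a float y with |y| < 2^22 pins the exponent, so round-to-nearest lands the integer part of y in the low mantissa bits; subtracting the magic back yields round(y) as a float, and reinterpreting the biased sum as an int (what preinterpret<PacketI> does) exposes those bits directly. A sketch assuming the default round-to-nearest mode:

#include <cassert>
#include <cstdint>
#include <cstring>
int main() {
  const float magic = 12582912.f;   // 1.5 * 2^23
  float y = 7.3f;
  float y_round = y + magic;        // the add rounds y to the nearest integer
  float nearest = y_round - magic;  // == 7.0f
  int32_t bits;
  std::memcpy(&bits, &y_round, sizeof bits);
  assert(nearest == 7.0f);
  assert((bits & 3) == (7 & 3));    // low mantissa bits give the integer mod 4
  return 0;
}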
diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index eaba60e26..c1b097fb9 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -100,6 +100,117 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const do
return make_double2(from, from);
}
+namespace {
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
+ const float& b) {
+ return __int_as_float(__float_as_int(a) & __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a,
+ const double& b) {
+ return __longlong_as_double(__double_as_longlong(a) &
+ __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a,
+ const float& b) {
+ return __int_as_float(__float_as_int(a) | __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a,
+ const double& b) {
+ return __longlong_as_double(__double_as_longlong(a) |
+ __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a,
+ const float& b) {
+ return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a,
+ const double& b) {
+ return __longlong_as_double(__double_as_longlong(a) ^
+ __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a,
+ const float& b) {
+ return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a,
+ const double& b) {
+ return __longlong_as_double(__double_as_longlong(a) &
+ ~__double_as_longlong(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a,
+ const float& b) {
+ return __int_as_float(a == b ? 0xffffffffu : 0u);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a,
+ const double& b) {
+ return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
+}
+
+} // namespace
+
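
CUDA provides no bitwise operators on float/double, so these helpers reinterpret the value as a same-width integer, operate, and reinterpret back (the __float_as_int / __int_as_float pair). A portable host-side sketch of the same pattern, with memcpy as the sanctioned reinterpretation:

#include <cstdint>
#include <cstring>
float bitwise_and_portable(float a, float b) {
  uint32_t ia, ib;
  std::memcpy(&ia, &a, sizeof ia);
  std::memcpy(&ib, &b, sizeof ib);
  const uint32_t ir = ia & ib;      // the actual bitwise operation
  float r;
  std::memcpy(&r, &ir, sizeof r);
  return r;
}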
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a,
+ const float4& b) {
+ return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y),
+ bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a,
+ const double2& b) {
+ return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a,
+ const float4& b) {
+ return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y),
+ bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a,
+ const double2& b) {
+ return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a,
+ const float4& b) {
+ return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y),
+ bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a,
+ const double2& b) {
+ return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a,
+ const float4& b) {
+ return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y),
+ bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pandnot<double2>(const double2& a, const double2& b) {
+ return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a,
+ const float4& b) {
+ return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z),
+ eq_mask(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pcmp_eq<double2>(const double2& a, const double2& b) {
+ return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
+}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
return make_float4(a, a+1, a+2, a+3);
diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
index cc5c484b6..316ac0283 100644
--- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
@@ -143,6 +143,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2&
return result;
}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue<half2>(const half2& a) {
+ half2 result;
+ *(reinterpret_cast<unsigned*>(&(result))) = 0xffffffffu;
+  return result;
+}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<half2,2>& kernel) {
@@ -640,6 +644,14 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
#endif
}
+template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) {
+ Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
+ Packet16h r; r.x = Packet8i(ptrue(a.x)); return r;
+}
+
template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) {
// in some cases Packet8i is a wrapper around __m256i, so we need to
// cast to Packet8i to call the correct overload.
@@ -655,6 +667,13 @@ template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet
Packet16h r; r.x = pandnot(Packet8i(a.x),Packet8i(b.x)); return r;
}
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) {
+ Packet16f af = half2float(a);
+ Packet16f bf = half2float(b);
+ Packet16f rf = pcmp_eq(af, bf);
+ return float2half(rf);
+}
+
template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
// FIXME we could do that with bit manipulation
Packet16f af = half2float(a);
@@ -1078,6 +1097,10 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
#endif
}
+template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
+ Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r;
+}
+
template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) {
// in some cases Packet4i is a wrapper around __m128i, so we either need to
// cast to Packet4i to directly call the intrinsics as below:
@@ -1093,6 +1116,13 @@ template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h
Packet8h r; r.x = _mm_andnot_si128(b.x,a.x); return r;
}
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) {
+ Packet8f af = half2float(a);
+ Packet8f bf = half2float(b);
+ Packet8f rf = pcmp_eq(af, bf);
+ return float2half(rf);
+}
+
template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 5e6de1f40..d149275b5 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -146,7 +146,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::co
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
- std::complex<float> EIGEN_ALIGN16 x[2];
+ EIGEN_ALIGN16 std::complex<float> x[2];
vst1q_f32((float *)x, a.v);
return x[0];
}
@@ -401,7 +401,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1c
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
- std::complex<double> EIGEN_ALIGN16 res;
+ EIGEN_ALIGN16 std::complex<double> res;
pstore<std::complex<double> >(&res, a);
return res;
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index ca4f2bf94..e8b351849 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -375,8 +375,8 @@ template<> EIGEN_STRONG_INLINE void prefetch<float> (const float* addr) { EI
template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
// FIXME only store the 2 first elements ?
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
-template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x[4]; vst1q_f32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int32_t x[4]; vst1q_s32(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
float32x2_t a_lo, a_hi;
@@ -551,6 +551,13 @@ template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
return vget_lane_s32(max, 0);
}
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+ uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)),
+ vget_high_u32(vreinterpretq_u32_f32(x)));
+ return vget_lane_u32(vpmax_u32(tmp,tmp),0);
+}
+
// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
#define PALIGN_NEON(Offset,Type,Command) \
@@ -704,6 +711,8 @@ template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, con
return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
}
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return vreinterpretq_f64_u64(vceqq_f64(a,b)); }
+
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 911fe066e..f39988eac 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -82,6 +82,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
#endif
}
+template<> EIGEN_STRONG_INLINE Packet2cf ptrue <Packet2cf>(const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnot <Packet2cf>(const Packet2cf& a) { return Packet2cf(pnot(Packet4f(a.v))); }
+
template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
@@ -305,6 +308,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
#endif
}
+template<> EIGEN_STRONG_INLINE Packet1cd ptrue <Packet1cd>(const Packet1cd& a) { return Packet1cd(ptrue(Packet2d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnot <Packet1cd>(const Packet1cd& a) { return Packet1cd(pnot(Packet2d(a.v))); }
template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
@@ -439,6 +444,18 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
kernel.packet[1].v = tmp;
}
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b)
+{
+ __m128 eq = _mm_cmpeq_ps(a.v, b.v);
+ return Packet2cf(pand<Packet4f>(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b)
+{
+ __m128d eq = _mm_cmpeq_pd(a.v, b.v);
+ return Packet1cd(pand<Packet2d>(eq, vec2d_swizzle1(eq, 1, 0)));
+}
+
template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
__m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
return Packet2cf(_mm_castpd_ps(result));
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 4c7dc5b64..9c3750af0 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -374,9 +374,21 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const
template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); }
+template<> EIGEN_STRONG_INLINE Packet4f
+ptrue<Packet4f>(const Packet4f& a) {
+ Packet4i b = _mm_castps_si128(a);
+ return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
+}
+template<> EIGEN_STRONG_INLINE Packet2d
+ptrue<Packet2d>(const Packet2d& a) {
+ Packet4i b = _mm_castpd_si128(a);
+ return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
+}
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
@@ -812,6 +824,17 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
#endif // EIGEN_VECTORIZE_SSE4_1
}
+// not needed yet
+// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
+// {
+// return _mm_movemask_ps(x) == 0xF;
+// }
+
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+ return _mm_movemask_ps(x) != 0x0;
+}
+
#if EIGEN_COMP_GNUC
// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
// {
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index 0c2d2cfca..55994047e 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -548,6 +548,23 @@ struct functor_traits<scalar_tanh_op<Scalar> > {
};
};
+#if EIGEN_HAS_CXX11_MATH
+/** \internal
+ * \brief Template functor to compute the atanh of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::atanh()
+ */
+template <typename Scalar>
+struct scalar_atanh_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_atanh_op<Scalar> > {
+ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+#endif
+
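
Together with the ArrayCwiseUnaryOps additions elsewhere in this commit, the functor backs an element-wise API. A hedged usage sketch (assumes EIGEN_HAS_CXX11_MATH and the companion atanh() plugin method from this same patch):

#include <Eigen/Dense>
int main() {
  Eigen::ArrayXf a = Eigen::ArrayXf::LinSpaced(5, -0.9f, 0.9f);
  Eigen::ArrayXf b = a.atanh();     // applies scalar_atanh_op coefficient-wise
  return b.allFinite() ? 0 : 1;
}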
/** \internal
* \brief Template functor to compute the sinh of a scalar
* \sa class CwiseUnaryOp, ArrayBase::sinh()
@@ -567,6 +584,23 @@ struct functor_traits<scalar_sinh_op<Scalar> >
};
};
+#if EIGEN_HAS_CXX11_MATH
+/** \internal
+ * \brief Template functor to compute the asinh of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::asinh()
+ */
+template <typename Scalar>
+struct scalar_asinh_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::asinh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_asinh_op<Scalar> > {
+ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+#endif
+
/** \internal
* \brief Template functor to compute the cosh of a scalar
* \sa class CwiseUnaryOp, ArrayBase::cosh()
@@ -586,6 +620,23 @@ struct functor_traits<scalar_cosh_op<Scalar> >
};
};
+#if EIGEN_HAS_CXX11_MATH
+/** \internal
+ * \brief Template functor to compute the acosh of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::acosh()
+ */
+template <typename Scalar>
+struct scalar_acosh_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::acosh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_acosh_op<Scalar> > {
+ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+#endif
+
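All three new inverse-hyperbolic functors (atanh, asinh, acosh) follow the same recipe: a stateless struct whose operator() forwards to the corresponding numext:: scalar function, plus a functor_traits specialization reporting a cost estimate and disabling the packet (SIMD) path. A sketch of a user-side functor in the same shape, applied through the real unaryExpr() API (my_atanh_op is illustrative):

    #include <Eigen/Dense>
    #include <cmath>

    // Same shape as scalar_atanh_op: empty struct, const call operator.
    struct my_atanh_op {
      double operator()(const double& a) const { return std::atanh(a); }
    };

    int main() {
      Eigen::ArrayXd x = Eigen::ArrayXd::LinSpaced(5, -0.9, 0.9);
      Eigen::ArrayXd y = x.unaryExpr(my_atanh_op());  // lazy cwise expression
      return y.allFinite() ? 0 : 1;
    }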
/** \internal
* \brief Template functor to compute the inverse of a scalar
* \sa class CwiseUnaryOp, Cwise::inverse()
@@ -598,9 +649,13 @@ struct scalar_inverse_op {
EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
{ return internal::pdiv(pset1<Packet>(Scalar(1)),a); }
};
-template<typename Scalar>
-struct functor_traits<scalar_inverse_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
+template <typename Scalar>
+struct functor_traits<scalar_inverse_op<Scalar> > {
+ enum {
+ PacketAccess = packet_traits<Scalar>::HasDiv,
+ Cost = scalar_div_cost<Scalar, PacketAccess>::value
+ };
+};
/** \internal
* \brief Template functor to compute the square of a scalar
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index afbd83eda..030c7740a 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -353,6 +353,24 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_
// #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
#endif
+template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
+struct RhsPanelHelper {
+ private:
+ static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
+ public:
+  typedef typename conditional<(remaining_registers>=4), RhsPacketx4, RhsPacket>::type type;
+};
+
+template <typename Packet>
+struct QuadPacket
+{
+ Packet B_0, B1, B2, B3;
+ const Packet& get(const FixedInt<0>&) const { return B_0; }
+ const Packet& get(const FixedInt<1>&) const { return B1; }
+ const Packet& get(const FixedInt<2>&) const { return B2; }
+ const Packet& get(const FixedInt<3>&) const { return B3; }
+};
+
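QuadPacket's get() overloads dispatch on FixedInt<N>, so the lane index is a type rather than a runtime value: each call site picks its overload at compile time and no branch is emitted. A self-contained sketch of the same tag-dispatch trick (fixed_int and quad are stand-ins for the internal types):

    #include <cstdio>

    template <int N> struct fixed_int {};  // stand-in for internal::FixedInt<N>

    struct quad {
      float B0, B1, B2, B3;
      float get(fixed_int<0>) const { return B0; }
      float get(fixed_int<1>) const { return B1; }
      float get(fixed_int<2>) const { return B2; }
      float get(fixed_int<3>) const { return B3; }
    };

    int main() {
      quad q = {1.f, 2.f, 3.f, 4.f};
      // Overload resolution happens at compile time, one overload per lane.
      std::printf("%g %g\n", q.get(fixed_int<0>()), q.get(fixed_int<3>()));
      return 0;
    }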
template <int N, typename T1, typename T2, typename T3>
struct packet_conditional { typedef T3 type; };
@@ -448,29 +466,35 @@ public:
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
typedef LhsPacket LhsPacket4Packing;
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
typedef ResPacket AccPacket;
EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
{
p = pset1<ResPacket>(ResScalar(0));
}
-
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
- {
- pbroadcast4(b, b0, b1, b2, b3);
- }
-
-// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
-// {
-// pbroadcast2(b, b0, b1);
-// }
-
+
template<typename RhsPacketType>
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
{
dest = pset1<RhsPacketType>(*b);
}
-
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+ {
+ pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+ }
+
+ template<typename RhsPacketType>
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+ {
+ loadRhs(b, dest);
+ }
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+ {
+ }
+
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
{
dest = ploadquad<RhsPacket>(b);
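The loadRhs/updateRhs split above is the core of the panel refactor: a RhsPacketx4 is filled once per group of four columns, so the per-column updateRhs can be a no-op, while the single-packet fallback must re-broadcast each time. A compilable sketch of that shape (rhs_x4 and demo_traits are stand-ins mirroring the traits API):

    struct rhs_x4 { float b0, b1, b2, b3; };

    struct demo_traits {
      void loadRhs(const float* b, float& dest) const { dest = *b; }
      void loadRhs(const float* b, rhs_x4& dest) const {
        dest.b0 = b[0]; dest.b1 = b[1]; dest.b2 = b[2]; dest.b3 = b[3];
      }
      // Single-packet fallback: re-broadcast for every column.
      void updateRhs(const float* b, float& dest) const { loadRhs(b, dest); }
      // Quad panel: all four columns are already loaded, nothing to do.
      void updateRhs(const float*, rhs_x4&) const {}
    };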
@@ -488,8 +512,8 @@ public:
dest = ploadu<LhsPacketType>(a);
}
- template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
- EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
+ template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
{
conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
// It would be a lot cleaner to call pmadd all the time. Unfortunately if we
@@ -504,6 +528,12 @@ public:
#endif
}
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+ {
+ madd(a, b.get(lane), c, tmp, lane);
+ }
+
EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
{
r = pmadd(c,alpha,r);
@@ -555,6 +585,8 @@ public:
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
typedef LhsPacket LhsPacket4Packing;
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
+
typedef ResPacket AccPacket;
EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
@@ -567,6 +599,20 @@ public:
{
dest = pset1<RhsPacketType>(*b);
}
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+ {
+ pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+ }
+
+ template<typename RhsPacketType>
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+ {
+ loadRhs(b, dest);
+ }
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+ {}
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
{
@@ -598,18 +644,8 @@ public:
dest = ploadu<LhsPacketType>(a);
}
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
- {
- pbroadcast4(b, b0, b1, b2, b3);
- }
-
-// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
-// {
-// pbroadcast2(b, b0, b1);
-// }
-
- template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
- EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp) const
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
{
madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
}
@@ -630,10 +666,16 @@ public:
c += a * b;
}
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+ {
+ madd(a, b.get(lane), c, tmp, lane);
+ }
+
template <typename ResPacketType, typename AccPacketType>
EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
{
- const conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
+ conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
r = cj.pmadd(c,alpha,r);
}
@@ -756,6 +798,9 @@ public:
typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
+
+  // this actually holds 8 packets!
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
@@ -778,39 +823,37 @@ public:
dest.first = pset1<RealPacketType>(real(*b));
dest.second = pset1<RealPacketType>(imag(*b));
}
-
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
{
- loadRhs(b,dest);
+ loadRhs(b, dest.B_0);
+ loadRhs(b + 1, dest.B1);
+ loadRhs(b + 2, dest.B2);
+ loadRhs(b + 3, dest.B3);
}
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
+
+ // Scalar path
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const
{
- loadQuadToDoublePacket(b,dest);
+ loadRhs(b, dest);
}
-
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+
+ // Vectorized path
+ template<typename RealPacketType>
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
{
- // FIXME not sure that's the best way to implement it!
- loadRhs(b+0, b0);
- loadRhs(b+1, b1);
- loadRhs(b+2, b2);
- loadRhs(b+3, b3);
+ loadRhs(b, dest);
}
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
- // Vectorized path
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
{
- // FIXME not sure that's the best way to implement it!
- loadRhs(b+0, b0);
- loadRhs(b+1, b1);
+ loadRhs(b,dest);
}
-
- // Scalar path
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
{
- // FIXME not sure that's the best way to implement it!
- loadRhs(b+0, b0);
- loadRhs(b+1, b1);
+ loadQuadToDoublePacket(b,dest);
}
// nothing special here
@@ -825,17 +868,26 @@ public:
dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
}
- template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType>
- EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/) const
+ template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
+ EIGEN_STRONG_INLINE
+ typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type
+ madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
{
c.first = padd(pmul(a,b.first), c.first);
c.second = padd(pmul(a,b.second),c.second);
}
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
+ template<typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
{
c = cj.pmadd(a,b,c);
}
+
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+ {
+ madd(a, b.get(lane), c, tmp, lane);
+ }
EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
@@ -914,7 +966,7 @@ public:
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
typedef LhsPacket LhsPacket4Packing;
-
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
typedef ResPacket AccPacket;
EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
@@ -927,18 +979,20 @@ public:
{
dest = pset1<RhsPacketType>(*b);
}
-
- void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
{
- pbroadcast4(b, b0, b1, b2, b3);
+ pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
}
-
-// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
-// {
-// // FIXME not sure that's the best way to implement it!
-// b0 = pload1<RhsPacket>(b+0);
-// b1 = pload1<RhsPacket>(b+1);
-// }
+
+ template<typename RhsPacketType>
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+ {
+ loadRhs(b, dest);
+ }
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+ {}
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
{
@@ -956,8 +1010,8 @@ public:
dest = ploaddup<LhsPacketType>(a);
}
- template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
- EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp) const
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
{
madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
}
@@ -979,48 +1033,135 @@ public:
c += a * b;
}
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+ {
+ madd(a, b.get(lane), c, tmp, lane);
+ }
+
template <typename ResPacketType, typename AccPacketType>
EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
{
- const conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
+ conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
r = cj.pmadd(alpha,c,r);
}
protected:
-
+
};
#if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON
-template<>
-struct gebp_traits <float, float, false, false,Architecture::NEON>
- : gebp_traits<float,float,false,false,Architecture::Generic>
+template<int _PacketSize>
+struct gebp_traits <float, float, false, false,Architecture::NEON,_PacketSize>
+ : gebp_traits<float,float,false,false,Architecture::Generic,_PacketSize>
{
typedef float RhsPacket;
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+ typedef float32x4_t RhsPacketx4;
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
{
- loadRhs(b+0, b0);
- loadRhs(b+1, b1);
- loadRhs(b+2, b2);
- loadRhs(b+3, b3);
+ dest = *b;
}
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
{
- dest = *b;
+ dest = vld1q_f32(b);
}
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = *b;
+ }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {}
+
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
{
loadRhs(b,dest);
}
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/) const
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
{
c = vfmaq_n_f32(c, a, b);
}
+
+ template<int LaneID>
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<LaneID>&) const
+ {
+ #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
+ // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
+ // vfmaq_laneq_f32 is implemented through a costly dup
+ if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : );
+ else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : );
+ else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : );
+ else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : );
+ #else
+ c = vfmaq_laneq_f32(c, a, b, LaneID);
+ #endif
+ }
+};
+
+
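vfmaq_laneq_f32(c, a, b, lane) computes c += a * b[lane] with the lane as an immediate, which is exactly what the inline-asm branch above reproduces on gcc versions hit by bug 89101 (where the intrinsic is lowered through a costly dup). A guarded sketch of the intrinsic form (lane fixed to 0 for brevity):

    #if defined(__aarch64__)
    #include <arm_neon.h>

    // c += a * b[0], with the lane index as an immediate operand.
    float32x4_t fma_lane0(float32x4_t c, float32x4_t a, float32x4_t b) {
      return vfmaq_laneq_f32(c, a, b, 0);
    }
    #endif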
+template<>
+struct gebp_traits <double, double, false, false,Architecture::NEON>
+ : gebp_traits<double,double,false,false,Architecture::Generic>
+{
+ typedef double RhsPacket;
+
+ struct RhsPacketx4 {
+ float64x2_t B_0, B_1;
+ };
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = *b;
+ }
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+ {
+ dest.B_0 = vld1q_f64(b);
+ dest.B_1 = vld1q_f64(b+2);
+ }
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
+ {
+ loadRhs(b,dest);
+ }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+  {}
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+ {
+ loadRhs(b,dest);
+ }
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+ {
+ c = vfmaq_n_f64(c, a, b);
+ }
+
+ template<int LaneID>
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<LaneID>&) const
+ {
+ #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
+ // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
+ // vfmaq_laneq_f64 is implemented through a costly dup
+ if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
+ else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
+ else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
+ else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
+ #else
+ if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
+ else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
+ else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
+ else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
+ #endif
+ }
};
#endif
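The double-precision panel above packs four rhs values into two float64x2_t registers, so logical lanes 0-1 live in B_0 and lanes 2-3 in B_1; the madd overload maps LaneID onto the right register/lane pair. A small sketch of that index mapping (rhs_panel_d and get_lane are illustrative):

    #if defined(__aarch64__)
    #include <arm_neon.h>

    struct rhs_panel_d { float64x2_t B_0, B_1; };

    // Logical lane i in [0,3] -> (register, hardware lane).
    double get_lane(const rhs_panel_d& p, int i) {
      switch (i) {
        case 0:  return vgetq_lane_f64(p.B_0, 0);
        case 1:  return vgetq_lane_f64(p.B_0, 1);
        case 2:  return vgetq_lane_f64(p.B_1, 0);
        default: return vgetq_lane_f64(p.B_1, 1);
      }
    }
    #endif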
@@ -1044,6 +1185,9 @@ struct gebp_kernel
typedef typename Traits::RhsPacket RhsPacket;
typedef typename Traits::ResPacket ResPacket;
typedef typename Traits::AccPacket AccPacket;
+ typedef typename Traits::RhsPacketx4 RhsPacketx4;
+
+ typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
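RhsPanel15 asks RhsPanelHelper for a panel type under a budget of 15 already-used registers: on a 16-register target the 4-wide panel cannot fit and the helper degrades to a plain RhsPacket, while on targets with 32 registers (e.g. AArch64) the QuadPacket is used. The same selection written in std:: vocabulary (a sketch; panel_choice and the 16-register default are illustrative):

    #include <type_traits>

    template <typename Packet, typename Packetx4,
              int registers_taken, int total_registers = 16>
    struct panel_choice {
      static const int remaining = total_registers - registers_taken;
      // Use the 4-wide panel only if at least 4 registers are still free.
      typedef typename std::conditional<(remaining >= 4), Packetx4, Packet>::type type;
    };
    // With registers_taken = 15 and 16 registers total, remaining = 1 < 4,
    // so the plain Packet is chosen.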
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
@@ -1148,7 +1292,7 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr,
SRhsPacketQuarter b0;
straits.loadLhsUnaligned(blB, a0);
straits.loadRhs(blA, b0);
- straits.madd(a0,b0,c0,b0);
+ straits.madd(a0,b0,c0,b0, fix<0>);
blB += SwappedTraits::LhsProgress/4;
blA += 1;
}
@@ -1165,21 +1309,25 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr,
template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
struct lhs_process_one_packet
{
+ typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
- EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
+ EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
{
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
- traits.loadLhs(&blA[(0+1*K)*(LhsProgress)], *A0);
- traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
- traits.madd(*A0, *B_0, *C0, *B_0);
- traits.madd(*A0, *B1, *C1, *B1);
- traits.madd(*A0, *B2, *C2, *B2);
- traits.madd(*A0, *B3, *C3, *B3);
+ traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
+ traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
+ traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
+ traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
+ traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
+ traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
}
- EIGEN_STRONG_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha, Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB, Index prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
+ EIGEN_STRONG_INLINE void operator()(
+ const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
+ Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
+ int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
{
GEBPTraits traits;
@@ -1221,18 +1369,19 @@ struct lhs_process_one_packet
for(Index k=0; k<peeled_kc; k+=pk)
{
EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
- RhsPacket B_0, B1, B2, B3;
+ RhsPacketx4 rhs_panel;
+ RhsPacket T0;
internal::prefetch(blB+(48+0));
- peeled_kc_onestep(0, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3);
- peeled_kc_onestep(1, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3);
- peeled_kc_onestep(2, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3);
- peeled_kc_onestep(3, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(1, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(3, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
internal::prefetch(blB+(48+16));
- peeled_kc_onestep(4, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3);
- peeled_kc_onestep(5, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3);
- peeled_kc_onestep(6, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3);
- peeled_kc_onestep(7, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(5, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(7, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
blB += pk*4*RhsProgress;
blA += pk*LhsProgress;
@@ -1243,8 +1392,9 @@ struct lhs_process_one_packet
// process remaining peeled loop
for(Index k=peeled_kc; k<depth; k++)
{
- RhsPacket B_0, B1, B2, B3;
- peeled_kc_onestep(0, blA, blB, traits, &A0, &B_0, &B1, &B2, &B3, &C0, &C1, &C2, &C3);
+ RhsPacketx4 rhs_panel;
+ RhsPacket T0;
+ peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
blB += 4*RhsProgress;
blA += LhsProgress;
}
@@ -1293,9 +1443,10 @@ struct lhs_process_one_packet
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ /* FIXME: why unaligned???? */ \
traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
- traits.madd(A0, B_0, C0, B_0); \
+ traits.madd(A0, B_0, C0, B_0, fix<0>); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
} while(false);
@@ -1372,7 +1523,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
const Index peeled_kc = depth & ~(pk-1);
- const Index prefetch_res_offset = 32/sizeof(ResScalar);
+ const int prefetch_res_offset = 32/sizeof(ResScalar);
// const Index depth2 = depth & ~1;
//---------- Process 3 * LhsProgress rows at once ----------
@@ -1430,36 +1581,48 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
for(Index k=0; k<peeled_kc; k+=pk)
{
EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
- RhsPacket B_0, T0;
+        // 15 registers are taken (12 for acc, 3 for lhs).
+ RhsPanel15 rhs_panel;
+ RhsPacket T0;
LhsPacket A2;
-
-#define EIGEN_GEBP_ONESTEP(K) \
- do { \
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
+ #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
+ // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
+ // without this workaround A0, A1, and A2 are loaded in the same register,
+ // which is not good for pipelining
+ #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
+ #else
+ #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
+ #endif
+#define EIGEN_GEBP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
- internal::prefetch(blA+(3*K+16)*LhsProgress); \
- if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
- traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
- traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
- traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
- traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
- traits.madd(A0, B_0, C0, T0); \
- traits.madd(A1, B_0, C4, T0); \
- traits.madd(A2, B_0, C8, B_0); \
- traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
- traits.madd(A0, B_0, C1, T0); \
- traits.madd(A1, B_0, C5, T0); \
- traits.madd(A2, B_0, C9, B_0); \
- traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
- traits.madd(A0, B_0, C2, T0); \
- traits.madd(A1, B_0, C6, T0); \
- traits.madd(A2, B_0, C10, B_0); \
- traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
- traits.madd(A0, B_0, C3 , T0); \
- traits.madd(A1, B_0, C7, T0); \
- traits.madd(A2, B_0, C11, B_0); \
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
- } while(false)
+ internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
+ if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
+ internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
+ } /* Bug 953 */ \
+ traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
+ traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
+ traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
+ EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
+ traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
+ traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
+ traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
+ traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
+ traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
+ traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
+ traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
+ traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
+ traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
+ traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
+ traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
+ traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
+ } while (false)
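The EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND used above is an empty asm statement whose operand list marks A0, A1 and A2 as read-write ("+w,m": a NEON register or a memory slot); gcc must then keep each value live in its own location instead of coalescing the three loads. A generic sketch of the same barrier trick (pin_in_register is illustrative; gcc-style inline asm, AArch64 constraints):

    #if defined(__aarch64__)
    // Empty asm as an optimization barrier: no instruction is emitted, but
    // the compiler must materialize p in its own register or stack slot.
    template <typename Packet>
    inline void pin_in_register(Packet& p) {
      __asm__("" : "+w,m"(p));  // "+w" = FP/SIMD register, "m" = memory
    }
    #endif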
internal::prefetch(blB);
EIGEN_GEBP_ONESTEP(0);
@@ -1479,7 +1642,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
// process remaining peeled loop
for(Index k=peeled_kc; k<depth; k++)
{
- RhsPacket B_0, T0;
+ RhsPanel15 rhs_panel;
+ RhsPacket T0;
LhsPacket A2;
EIGEN_GEBP_ONESTEP(0);
blB += 4*RhsProgress;
@@ -1559,20 +1723,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
{
EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
RhsPacket B_0;
-#define EIGEN_GEBGP_ONESTEP(K) \
- do { \
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
+#define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
- traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
- traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
- traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
- traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
- traits.madd(A0, B_0, C0, B_0); \
- traits.madd(A1, B_0, C4, B_0); \
- traits.madd(A2, B_0, C8, B_0); \
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
- } while(false)
-
+ traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
+ traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
+ traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
+ traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
+ traits.madd(A0, B_0, C0, B_0, fix<0>); \
+ traits.madd(A1, B_0, C4, B_0, fix<0>); \
+ traits.madd(A2, B_0, C8, B_0, fix<0>); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
+ } while (false)
+
EIGEN_GEBGP_ONESTEP(0);
EIGEN_GEBGP_ONESTEP(1);
EIGEN_GEBGP_ONESTEP(2);
@@ -1661,7 +1825,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
for(Index k=0; k<peeled_kc; k+=pk)
{
EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
- RhsPacket B_0, B1, B2, B3, T0;
+ RhsPacketx4 rhs_panel;
+ RhsPacket T0;
// NOTE: the begin/end asm comments below work around bug 935!
// but they are not enough for gcc>=6 without FMA (bug 1637)
@@ -1670,24 +1835,24 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
#else
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
#endif
- #define EIGEN_GEBGP_ONESTEP(K) \
- do { \
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
- traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
- traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
- traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
- traits.madd(A0, B_0, C0, T0); \
- traits.madd(A1, B_0, C4, B_0); \
- traits.madd(A0, B1, C1, T0); \
- traits.madd(A1, B1, C5, B1); \
- traits.madd(A0, B2, C2, T0); \
- traits.madd(A1, B2, C6, B2); \
- traits.madd(A0, B3, C3, T0); \
- traits.madd(A1, B3, C7, B3); \
- EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
- } while(false)
-
+#define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
+ traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
+ traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
+ traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
+ traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
+ traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
+ traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
+ traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
+ EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
+ } while (false)
+
internal::prefetch(blB+(48+0));
EIGEN_GEBGP_ONESTEP(0);
EIGEN_GEBGP_ONESTEP(1);
@@ -1707,7 +1872,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
// process remaining peeled loop
for(Index k=peeled_kc; k<depth; k++)
{
- RhsPacket B_0, B1, B2, B3, T0;
+ RhsPacketx4 rhs_panel;
+ RhsPacket T0;
EIGEN_GEBGP_ONESTEP(0);
blB += 4*RhsProgress;
blA += 2*Traits::LhsProgress;
@@ -1778,8 +1944,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
- traits.madd(A0, B_0, C0, B1); \
- traits.madd(A1, B_0, C4, B_0); \
+ traits.madd(A0, B_0, C0, B1, fix<0>); \
+ traits.madd(A1, B_0, C4, B_0, fix<0>); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
} while(false)
@@ -1882,15 +2048,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
straits.loadRhsQuad(blA+0*spk, B_0);
straits.loadRhsQuad(blA+1*spk, B_1);
- straits.madd(A0,B_0,C0,B_0);
- straits.madd(A1,B_1,C1,B_1);
+ straits.madd(A0,B_0,C0,B_0, fix<0>);
+ straits.madd(A1,B_1,C1,B_1, fix<0>);
straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
straits.loadRhsQuad(blA+2*spk, B_0);
straits.loadRhsQuad(blA+3*spk, B_1);
- straits.madd(A0,B_0,C2,B_0);
- straits.madd(A1,B_1,C3,B_1);
+ straits.madd(A0,B_0,C2,B_0, fix<0>);
+ straits.madd(A1,B_1,C3,B_1, fix<0>);
blB += 4*SwappedTraits::LhsProgress;
blA += 4*spk;
@@ -1903,7 +2069,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
straits.loadLhsUnaligned(blB, A0);
straits.loadRhsQuad(blA, B_0);
- straits.madd(A0,B_0,C0,B_0);
+ straits.madd(A0,B_0,C0,B_0, fix<0>);
blB += SwappedTraits::LhsProgress;
blA += spk;
@@ -1927,7 +2093,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
straits.loadLhsUnaligned(blB, a0);
straits.loadRhs(blA, b0);
SAccPacketHalf c0 = predux_half_dowto4(C0);
- straits.madd(a0,b0,c0,b0);
+ straits.madd(a0,b0,c0,b0, fix<0>);
straits.acc(c0, alphav, R);
}
else
@@ -2273,7 +2439,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
}
pack -= psize;
- int left = rows - i;
+ Index left = rows - i;
if (pack <= 0) {
if (!gone_last &&
(starting_pos == i || left >= psize/2 || left >= psize/4) &&
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 121476394..68765d4b2 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -29,10 +29,15 @@
*
* If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
* vectorized and non-vectorized code.
+ *
+ * FIXME: this code can be cleaned up once we switch to proper C++11 only.
*/
#if (defined EIGEN_CUDACC)
#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
#define EIGEN_ALIGNOF(x) __alignof(x)
+#elif EIGEN_HAS_ALIGNAS
+ #define EIGEN_ALIGN_TO_BOUNDARY(n) alignas(n)
+ #define EIGEN_ALIGNOF(x) alignof(x)
#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
#define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
#define EIGEN_ALIGNOF(x) __alignof(x)
@@ -44,7 +49,7 @@
#define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
#define EIGEN_ALIGNOF(x) __alignof(x)
#else
- #error Please tell me what is the equivalent of __attribute__((aligned(n))) and __alignof(x) for your compiler
+ #error Please tell me what is the equivalent of alignas(n) and alignof(x) for your compiler
#endif
// If the user explicitly disable vectorization, then we also disable alignment
@@ -381,7 +386,7 @@
#endif
#if defined(EIGEN_HAS_CUDA_FP16)
- #include <host_defines.h>
+ #include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#endif
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 3ab3a5f50..5d86a51ac 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -260,6 +260,7 @@ template<typename MatrixType> class HouseholderQR;
template<typename MatrixType> class ColPivHouseholderQR;
template<typename MatrixType> class FullPivHouseholderQR;
template<typename MatrixType> class CompleteOrthogonalDecomposition;
+template<typename MatrixType> class SVDBase;
template<typename MatrixType, int QRPreconditioner = ColPivHouseholderQRPreconditioner> class JacobiSVD;
template<typename MatrixType> class BDCSVD;
template<typename MatrixType, int UpLo = Lower> class LLT;
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index c7dba1fc4..3a8001e8f 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -129,16 +129,21 @@
#define EIGEN_COMP_MSVC_STRICT 0
#endif
-/// \internal EIGEN_COMP_IBM set to 1 if the compiler is IBM XL C++
-#if defined(__IBMCPP__) || defined(__xlc__)
- #define EIGEN_COMP_IBM 1
+/// \internal EIGEN_COMP_IBM set to the XLC version if the compiler is IBM XL C++
+// XLC version
+// 3.1 0x0301
+// 4.5 0x0405
+// 5.0 0x0500
+// 12.1 0x0C01
+#if defined(__IBMCPP__) || defined(__xlc__) || defined(__ibmxl__)
+ #define EIGEN_COMP_IBM __xlC__
#else
#define EIGEN_COMP_IBM 0
#endif
-/// \internal EIGEN_COMP_PGI set to 1 if the compiler is Portland Group Compiler
+/// \internal EIGEN_COMP_PGI set to the PGI version if the compiler is the Portland Group Compiler
#if defined(__PGI)
- #define EIGEN_COMP_PGI 1
+ #define EIGEN_COMP_PGI (__PGIC__*100+__PGIC_MINOR__)
#else
#define EIGEN_COMP_PGI 0
#endif
@@ -347,9 +352,17 @@
#define EIGEN_OS_WIN_STRICT 0
#endif
-/// \internal EIGEN_OS_SUN set to 1 if the OS is SUN
+/// \internal EIGEN_OS_SUN set to the Sun compiler version (__SUNPRO_C) if the OS is SUN
+// compiler solaris __SUNPRO_C
+// version studio
+// 5.7 10 0x570
+// 5.8 11 0x580
+// 5.9 12 0x590
+// 5.10 12.1 0x5100
+// 5.11 12.2 0x5110
+// 5.12 12.3 0x5120
#if (defined(sun) || defined(__sun)) && !(defined(__SVR4) || defined(__svr4__))
- #define EIGEN_OS_SUN 1
+ #define EIGEN_OS_SUN __SUNPRO_C
#else
#define EIGEN_OS_SUN 0
#endif
@@ -546,6 +559,22 @@
#endif
#endif
+#ifndef EIGEN_HAS_ALIGNAS
+#if EIGEN_MAX_CPP_VER>=11 && EIGEN_HAS_CXX11 && \
+ ( __has_feature(cxx_alignas) \
+ || EIGEN_HAS_CXX14 \
+ || (EIGEN_COMP_MSVC >= 1800) \
+ || (EIGEN_GNUC_AT_LEAST(4,8)) \
+ || (EIGEN_COMP_CLANG>=305) \
+ || (EIGEN_COMP_ICC>=1500) \
+ || (EIGEN_COMP_PGI>=1500) \
+ || (EIGEN_COMP_SUNCC>=0x5130))
+#define EIGEN_HAS_ALIGNAS 1
+#else
+#define EIGEN_HAS_ALIGNAS 0
+#endif
+#endif
+
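Once EIGEN_HAS_ALIGNAS is set, the ConfigureVectorization.h change earlier in this patch routes EIGEN_ALIGN_TO_BOUNDARY(n) to standard alignas(n) instead of compiler-specific attributes. A plain C++11 sketch of what that branch expands to (packet_buffer is illustrative):

    // What EIGEN_ALIGN_TO_BOUNDARY(16) means under the alignas branch.
    struct alignas(16) packet_buffer {
      float data[4];  // 16-byte aligned, suitable for aligned SSE loads
    };
    static_assert(alignof(packet_buffer) == 16, "alignment applied");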
// Does the compiler support type_traits?
// - full support of type traits was added only to GCC 5.1.0.
// - 20150626 corresponds to the last release of 4.x libstdc++
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 1415b3fc1..8fcb18a94 100755
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -636,8 +636,41 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to<double>()(x,y); }
#endif
+/** \internal extract the bits of the float \a x */
+inline unsigned int as_uint(float x)
+{
+ unsigned int ret;
+ std::memcpy(&ret, &x, sizeof(float));
+ return ret;
+}
+
} // end namespace numext
} // end namespace Eigen
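The as_uint helper just added uses std::memcpy rather than a cast because memcpy is the portable, strict-aliasing-safe way to reinterpret an object's bytes. A standalone sketch of the same bit extraction (float_bits is illustrative):

    #include <cstdio>
    #include <cstring>

    unsigned int float_bits(float x) {
      unsigned int u;
      std::memcpy(&u, &x, sizeof(float));  // legal type punning
      return u;
    }

    int main() {
      std::printf("0x%08x\n", float_bits(1.0f));  // prints 0x3f800000
      return 0;
    }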
+// Define portable (u)int{32,64} types
+#if EIGEN_HAS_CXX11
+#include <cstdint>
+namespace Eigen {
+namespace numext {
+typedef std::uint32_t uint32_t;
+typedef std::int32_t int32_t;
+typedef std::uint64_t uint64_t;
+typedef std::int64_t int64_t;
+}
+}
+#else
+// Without C++11, all compilers able to compile Eigen also
+// provide the C99 stdint.h header file.
+#include <stdint.h>
+namespace Eigen {
+namespace numext {
+typedef ::uint32_t uint32_t;
+typedef ::int32_t int32_t;
+typedef ::uint64_t uint64_t;
+typedef ::int64_t int64_t;
+}
+}
+#endif
+
#endif // EIGEN_META_H
diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h
index b2f95153e..67714e444 100644
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h
@@ -104,7 +104,8 @@
STORAGE_INDEX_MUST_MATCH=1,
CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1,
SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1,
- INVALID_TEMPLATE_PARAMETER=1
+ INVALID_TEMPLATE_PARAMETER=1,
+ GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1
};
};
diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index 75991aaed..3090351a0 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@@ -97,6 +97,9 @@ template<int Mode> struct transform_make_affine;
* - #AffineCompact: the transformation is stored as a (Dim)x(Dim+1) matrix.
* - #Projective: the transformation is stored as a (Dim+1)^2 matrix
* without any assumption.
+ * - #Isometry: same as #Affine with the additional assumption that
+ * the linear part represents a rotation. This assumption is exploited
+ * to speed up some functions such as inverse() and rotation().
* \tparam _Options has the same meaning as in class Matrix. It allows to specify DontAlign and/or RowMajor.
* These Options are passed directly to the underlying matrix type.
*
@@ -252,11 +255,11 @@ protected:
public:
/** Default constructor without initialization of the meaningful coefficients.
- * If Mode==Affine, then the last row is set to [0 ... 0 1] */
+ * If Mode==Affine or Mode==Isometry, then the last row is set to [0 ... 0 1] */
EIGEN_DEVICE_FUNC inline Transform()
{
check_template_params();
- internal::transform_make_affine<(int(Mode)==Affine) ? Affine : AffineCompact>::run(m_matrix);
+ internal::transform_make_affine<(int(Mode)==Affine || int(Mode)==Isometry) ? Affine : AffineCompact>::run(m_matrix);
}
EIGEN_DEVICE_FUNC inline Transform(const Transform& other)
@@ -602,7 +605,9 @@ public:
template<typename Derived>
EIGEN_DEVICE_FUNC inline Transform operator*(const RotationBase<Derived,Dim>& r) const;
- EIGEN_DEVICE_FUNC const LinearMatrixType rotation() const;
+ typedef typename internal::conditional<int(Mode)==Isometry,ConstLinearPart,const LinearMatrixType>::type RotationReturnType;
+ EIGEN_DEVICE_FUNC RotationReturnType rotation() const;
+
template<typename RotationMatrixType, typename ScalingMatrixType>
EIGEN_DEVICE_FUNC
void computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const;
@@ -1046,20 +1051,43 @@ EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim
*** Special functions ***
************************/
+namespace internal {
+template<int Mode> struct transform_rotation_impl {
+ template<typename TransformType>
+ EIGEN_DEVICE_FUNC static inline
+ const typename TransformType::LinearMatrixType run(const TransformType& t)
+ {
+ typedef typename TransformType::LinearMatrixType LinearMatrixType;
+ LinearMatrixType result;
+ t.computeRotationScaling(&result, (LinearMatrixType*)0);
+ return result;
+ }
+};
+template<> struct transform_rotation_impl<Isometry> {
+ template<typename TransformType>
+ EIGEN_DEVICE_FUNC static inline
+ typename TransformType::ConstLinearPart run(const TransformType& t)
+ {
+ return t.linear();
+ }
+};
+}
/** \returns the rotation part of the transformation
*
+ * If Mode==Isometry, then this method is an alias for linear(),
+ * otherwise it calls computeRotationScaling() to extract the rotation
+ * through an SVD decomposition.
*
* \svd_module
*
* \sa computeRotationScaling(), computeScalingRotation(), class SVD
*/
template<typename Scalar, int Dim, int Mode, int Options>
-EIGEN_DEVICE_FUNC const typename Transform<Scalar,Dim,Mode,Options>::LinearMatrixType
+EIGEN_DEVICE_FUNC
+typename Transform<Scalar,Dim,Mode,Options>::RotationReturnType
Transform<Scalar,Dim,Mode,Options>::rotation() const
{
- LinearMatrixType result;
- computeRotationScaling(&result, (LinearMatrixType*)0);
- return result;
+ return internal::transform_rotation_impl<Mode>::run(*this);
}
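With the dispatch above, rotation() on an Isometry transform is just an alias for linear(), while the other modes still pay for computeRotationScaling(). Illustrative use against the public Transform API (a sketch):

    #include <Eigen/Geometry>

    int main() {
      Eigen::Transform<double, 3, Eigen::Isometry> t =
          Eigen::Transform<double, 3, Eigen::Isometry>::Identity();
      t.rotate(Eigen::AngleAxisd(0.5, Eigen::Vector3d::UnitZ()));
      Eigen::Matrix3d R = t.rotation();  // cheap: a view of t.linear(), no SVD
      return R.isUnitary() ? 0 : 1;
    }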
diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h
index e62befcb6..9318c281f 100644
--- a/Eigen/src/Householder/HouseholderSequence.h
+++ b/Eigen/src/Householder/HouseholderSequence.h
@@ -156,6 +156,12 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
Side
> TransposeReturnType;
+ typedef HouseholderSequence<
+ typename internal::add_const<VectorsType>::type,
+ typename internal::add_const<CoeffsType>::type,
+ Side
+ > ConstHouseholderSequence;
+
/** \brief Constructor.
* \param[in] v %Matrix containing the essential parts of the Householder vectors
* \param[in] h Vector containing the Householder coefficients
@@ -244,6 +250,18 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
.setShift(m_shift);
}
+ /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+   *           and \c *this otherwise.
+ */
+ template<bool Cond>
+ EIGEN_DEVICE_FUNC
+ inline typename internal::conditional<Cond,ConjugateReturnType,ConstHouseholderSequence>::type
+ conjugateIf() const
+ {
+ typedef typename internal::conditional<Cond,ConjugateReturnType,ConstHouseholderSequence>::type ReturnType;
+ return ReturnType(m_vectors.template conjugateIf<Cond>(), m_coeffs.template conjugateIf<Cond>());
+ }
+
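Since Cond is a compile-time constant, conjugateIf<false>() costs nothing: it returns the sequence itself (const-qualified) instead of a conjugated expression. A usage sketch via HouseholderQR, assuming this changeset where conjugateIf() is available on the sequence:

    #include <Eigen/QR>

    int main() {
      Eigen::MatrixXcd A = Eigen::MatrixXcd::Random(4, 4);
      Eigen::HouseholderQR<Eigen::MatrixXcd> qr(A);
      auto h = qr.householderQ();
      Eigen::MatrixXcd I = Eigen::MatrixXcd::Identity(4, 4);
      Eigen::MatrixXcd Q  = h.conjugateIf<false>() * I;  // the sequence itself
      Eigen::MatrixXcd Qc = h.conjugateIf<true>()  * I;  // conjugated sequence
      return (Qc - Q.conjugate()).norm() < 1e-12 ? 0 : 1;
    }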
/** \brief Adjoint (conjugate transpose) of the Householder sequence. */
AdjointReturnType adjoint() const
{
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index 344ec8926..ef93ec5eb 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -18,6 +18,7 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
{
typedef MatrixXpr XprKind;
typedef SolverStorage StorageKind;
+ typedef int StorageIndex;
enum { Flags = 0 };
};
@@ -62,9 +63,9 @@ template<typename _MatrixType> class FullPivLU
public:
typedef _MatrixType MatrixType;
typedef SolverBase<FullPivLU> Base;
+ friend class SolverBase<FullPivLU>;
EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivLU)
- // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int
enum {
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
@@ -218,6 +219,7 @@ template<typename _MatrixType> class FullPivLU
return internal::image_retval<FullPivLU>(*this, originalMatrix);
}
+ #ifdef EIGEN_PARSED_BY_DOXYGEN
/** \return a solution x to the equation Ax=b, where A is the matrix of which
* *this is the LU decomposition.
*
@@ -237,14 +239,10 @@ template<typename _MatrixType> class FullPivLU
*
* \sa TriangularView::solve(), kernel(), inverse()
*/
- // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion.
template<typename Rhs>
inline const Solve<FullPivLU, Rhs>
- solve(const MatrixBase<Rhs>& b) const
- {
- eigen_assert(m_isInitialized && "LU is not initialized.");
- return Solve<FullPivLU, Rhs>(*this, b.derived());
- }
+ solve(const MatrixBase<Rhs>& b) const;
+ #endif
/** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
the LU decomposition.
@@ -529,8 +527,8 @@ void FullPivLU<MatrixType>::computeInPlace()
m_nonzero_pivots = k;
for(Index i = k; i < size; ++i)
{
- m_rowsTranspositions.coeffRef(i) = i;
- m_colsTranspositions.coeffRef(i) = i;
+ m_rowsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
+ m_colsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
}
break;
}
@@ -541,8 +539,8 @@ void FullPivLU<MatrixType>::computeInPlace()
// Now that we've found the pivot, we need to apply the row/col swaps to
// bring it to the location (k,k).
- m_rowsTranspositions.coeffRef(k) = row_of_biggest_in_corner;
- m_colsTranspositions.coeffRef(k) = col_of_biggest_in_corner;
+ m_rowsTranspositions.coeffRef(k) = internal::convert_index<StorageIndex>(row_of_biggest_in_corner);
+ m_colsTranspositions.coeffRef(k) = internal::convert_index<StorageIndex>(col_of_biggest_in_corner);
if(k != row_of_biggest_in_corner) {
m_lu.row(k).swap(m_lu.row(row_of_biggest_in_corner));
++number_of_transpositions;
@@ -755,7 +753,6 @@ void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
const Index rows = this->rows(),
cols = this->cols(),
nonzero_pivots = this->rank();
- eigen_assert(rhs.rows() == rows);
const Index smalldim = (std::min)(rows, cols);
if(nonzero_pivots == 0)
@@ -805,7 +802,6 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType
const Index rows = this->rows(), cols = this->cols(),
nonzero_pivots = this->rank();
- eigen_assert(rhs.rows() == cols);
const Index smalldim = (std::min)(rows, cols);
if(nonzero_pivots == 0)
@@ -819,29 +815,19 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType
// Step 1
c = permutationQ().inverse() * rhs;
- if (Conjugate) {
- // Step 2
- m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
- .template triangularView<Upper>()
- .adjoint()
- .solveInPlace(c.topRows(nonzero_pivots));
- // Step 3
- m_lu.topLeftCorner(smalldim, smalldim)
- .template triangularView<UnitLower>()
- .adjoint()
- .solveInPlace(c.topRows(smalldim));
- } else {
- // Step 2
- m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
- .template triangularView<Upper>()
- .transpose()
- .solveInPlace(c.topRows(nonzero_pivots));
- // Step 3
- m_lu.topLeftCorner(smalldim, smalldim)
- .template triangularView<UnitLower>()
- .transpose()
- .solveInPlace(c.topRows(smalldim));
- }
+ // Step 2
+ m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
+ .template triangularView<Upper>()
+ .transpose()
+ .template conjugateIf<Conjugate>()
+ .solveInPlace(c.topRows(nonzero_pivots));
+
+ // Step 3
+ m_lu.topLeftCorner(smalldim, smalldim)
+ .template triangularView<UnitLower>()
+ .transpose()
+ .template conjugateIf<Conjugate>()
+ .solveInPlace(c.topRows(smalldim));
// Step 4
PermutationPType invp = permutationP().inverse().eval();
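The rewrite above folds the two Conjugate branches into one code path: transpose() unconditionally, then a compile-time conjugateIf<Conjugate>() that is the identity when Conjugate is false. A standalone sketch of the pattern on a dense matrix (assuming the conjugateIf() member introduced by this changeset; transpose_or_adjoint is illustrative):

    #include <Eigen/Dense>

    // M^T if Conjugate == false, M^* (adjoint) if Conjugate == true,
    // with no runtime branch.
    template <bool Conjugate>
    Eigen::MatrixXcd transpose_or_adjoint(const Eigen::MatrixXcd& M) {
      return M.transpose().template conjugateIf<Conjugate>();
    }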
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index bfcd2c95b..b414b5c46 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -19,6 +19,7 @@ template<typename _MatrixType> struct traits<PartialPivLU<_MatrixType> >
{
typedef MatrixXpr XprKind;
typedef SolverStorage StorageKind;
+ typedef int StorageIndex;
typedef traits<_MatrixType> BaseTraits;
enum {
Flags = BaseTraits::Flags & RowMajorBit,
@@ -79,8 +80,9 @@ template<typename _MatrixType> class PartialPivLU
typedef _MatrixType MatrixType;
typedef SolverBase<PartialPivLU> Base;
+ friend class SolverBase<PartialPivLU>;
+
EIGEN_GENERIC_PUBLIC_INTERFACE(PartialPivLU)
- // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int
enum {
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
@@ -152,6 +154,7 @@ template<typename _MatrixType> class PartialPivLU
return m_p;
}
+ #ifdef EIGEN_PARSED_BY_DOXYGEN
/** This method returns the solution x to the equation Ax=b, where A is the matrix of which
* *this is the LU decomposition.
*
@@ -169,14 +172,10 @@ template<typename _MatrixType> class PartialPivLU
*
* \sa TriangularView::solve(), inverse(), computeInverse()
*/
- // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion.
template<typename Rhs>
inline const Solve<PartialPivLU, Rhs>
- solve(const MatrixBase<Rhs>& b) const
- {
- eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
- return Solve<PartialPivLU, Rhs>(*this, b.derived());
- }
+ solve(const MatrixBase<Rhs>& b) const;
+ #endif
/** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
the LU decomposition.
@@ -231,8 +230,6 @@ template<typename _MatrixType> class PartialPivLU
* Step 3: replace c by the solution x to Ux = c.
*/
- eigen_assert(rhs.rows() == m_lu.rows());
-
// Step 1
dst = permutationP() * rhs;
@@ -246,26 +243,21 @@ template<typename _MatrixType> class PartialPivLU
template<bool Conjugate, typename RhsType, typename DstType>
EIGEN_DEVICE_FUNC
void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const {
- /* The decomposition PA = LU can be rewritten as A = P^{-1} L U.
+ /* The decomposition PA = LU can be rewritten as A^T = U^T L^T P.
* So we proceed as follows:
- * Step 1: compute c = Pb.
- * Step 2: replace c by the solution x to Lx = c.
- * Step 3: replace c by the solution x to Ux = c.
+   * Step 1: compute c as the solution to U^T c = b.
+   * Step 2: replace c by the solution x to L^T x = c.
+ * Step 3: update c = P^-1 c.
*/
eigen_assert(rhs.rows() == m_lu.cols());
- if (Conjugate) {
- // Step 1
- dst = m_lu.template triangularView<Upper>().adjoint().solve(rhs);
- // Step 2
- m_lu.template triangularView<UnitLower>().adjoint().solveInPlace(dst);
- } else {
- // Step 1
- dst = m_lu.template triangularView<Upper>().transpose().solve(rhs);
- // Step 2
- m_lu.template triangularView<UnitLower>().transpose().solveInPlace(dst);
- }
+ // Step 1
+ dst = m_lu.template triangularView<Upper>().transpose()
+ .template conjugateIf<Conjugate>().solve(rhs);
+ // Step 2
+ m_lu.template triangularView<UnitLower>().transpose()
+ .template conjugateIf<Conjugate>().solveInPlace(dst);
// Step 3
dst = permutationP().transpose() * dst;
}
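To see why the steps above hold: PA = LU with P^-1 = P^T gives A = P^T L U, hence A^T = U^T L^T P, so solving A^T x = b is a U^T solve, then an L^T solve, then the inverse permutation. A quick numerical check (a sketch; transpose().solve() is provided by SolverBase):

    #include <Eigen/LU>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(5, 5);
      Eigen::VectorXd b = Eigen::VectorXd::Random(5);
      Eigen::PartialPivLU<Eigen::MatrixXd> lu(A);
      Eigen::VectorXd x1 = lu.transpose().solve(b);   // _solve_impl_transposed
      Eigen::VectorXd x2 = A.transpose().lu().solve(b);
      return (x1 - x2).norm() < 1e-8 * x2.norm() ? 0 : 1;
    }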
@@ -519,7 +511,10 @@ void PartialPivLU<MatrixType>::compute()
// the row permutation is stored as int indices, so just to be sure:
eigen_assert(m_lu.rows()<NumTraits<int>::highest());
- m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
+ if(m_lu.cols()>0)
+ m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
+ else
+ m_l1_norm = RealScalar(0);
eigen_assert(m_lu.rows() == m_lu.cols() && "PartialPivLU is only for square (and moreover invertible) matrices");
const Index size = m_lu.rows();
diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h
index 1faa3442e..9b677e9bf 100644
--- a/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/Eigen/src/QR/ColPivHouseholderQR.h
@@ -17,6 +17,9 @@ namespace internal {
template<typename _MatrixType> struct traits<ColPivHouseholderQR<_MatrixType> >
: traits<_MatrixType>
{
+ typedef MatrixXpr XprKind;
+ typedef SolverStorage StorageKind;
+ typedef int StorageIndex;
enum { Flags = 0 };
};
@@ -46,20 +49,19 @@ template<typename _MatrixType> struct traits<ColPivHouseholderQR<_MatrixType> >
* \sa MatrixBase::colPivHouseholderQr()
*/
template<typename _MatrixType> class ColPivHouseholderQR
+ : public SolverBase<ColPivHouseholderQR<_MatrixType> >
{
public:
typedef _MatrixType MatrixType;
+ typedef SolverBase<ColPivHouseholderQR> Base;
+ friend class SolverBase<ColPivHouseholderQR>;
+
+ EIGEN_GENERIC_PUBLIC_INTERFACE(ColPivHouseholderQR)
enum {
- RowsAtCompileTime = MatrixType::RowsAtCompileTime,
- ColsAtCompileTime = MatrixType::ColsAtCompileTime,
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
};
- typedef typename MatrixType::Scalar Scalar;
- typedef typename MatrixType::RealScalar RealScalar;
- // FIXME should be int
- typedef typename MatrixType::StorageIndex StorageIndex;
typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
typedef typename internal::plain_row_type<MatrixType, Index>::type IntRowVectorType;
@@ -156,6 +158,7 @@ template<typename _MatrixType> class ColPivHouseholderQR
computeInPlace();
}
+ #ifdef EIGEN_PARSED_BY_DOXYGEN
/** This method finds a solution x to the equation Ax=b, where A is the matrix of which
* *this is the QR decomposition, if any exists.
*
@@ -172,11 +175,8 @@ template<typename _MatrixType> class ColPivHouseholderQR
*/
template<typename Rhs>
inline const Solve<ColPivHouseholderQR, Rhs>
- solve(const MatrixBase<Rhs>& b) const
- {
- eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
- return Solve<ColPivHouseholderQR, Rhs>(*this, b.derived());
- }
+ solve(const MatrixBase<Rhs>& b) const;
+ #endif
HouseholderSequenceType householderQ() const;
HouseholderSequenceType matrixQ() const
@@ -417,6 +417,9 @@ template<typename _MatrixType> class ColPivHouseholderQR
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+ template<bool Conjugate, typename RhsType, typename DstType>
+ void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
#endif
protected:
@@ -583,8 +586,6 @@ template<typename _MatrixType>
template<typename RhsType, typename DstType>
void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
{
- eigen_assert(rhs.rows() == rows());
-
const Index nonzero_pivots = nonzeroPivots();
if(nonzero_pivots == 0)
@@ -604,6 +605,31 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &
for(Index i = 0; i < nonzero_pivots; ++i) dst.row(m_colsPermutation.indices().coeff(i)) = c.row(i);
for(Index i = nonzero_pivots; i < cols(); ++i) dst.row(m_colsPermutation.indices().coeff(i)).setZero();
}
+
+template<typename _MatrixType>
+template<bool Conjugate, typename RhsType, typename DstType>
+void ColPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+ const Index nonzero_pivots = nonzeroPivots();
+
+ if(nonzero_pivots == 0)
+ {
+ dst.setZero();
+ return;
+ }
+
+ typename RhsType::PlainObject c(m_colsPermutation.transpose()*rhs);
+
+ m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots)
+ .template triangularView<Upper>()
+ .transpose().template conjugateIf<Conjugate>()
+ .solveInPlace(c.topRows(nonzero_pivots));
+
+ dst.topRows(nonzero_pivots) = c.topRows(nonzero_pivots);
+ dst.bottomRows(rows()-nonzero_pivots).setZero();
+
+ dst.applyOnTheLeft(householderQ().setLength(nonzero_pivots).template conjugateIf<!Conjugate>() );
+}
#endif
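The math behind this new path: ColPivHouseholderQR factors A P = Q R, so A^T = P R^T Q^T. Solving A^T x = b therefore takes c = P^T b, an R^T triangular solve restricted to the nonzero pivots, then an application of conj(Q), which is why householderQ() gets conjugateIf<!Conjugate>. A quick check through the SolverBase adjoint() API this class now inherits (a sketch):

    #include <Eigen/QR>

    int main() {
      Eigen::MatrixXcd A = Eigen::MatrixXcd::Random(6, 6);
      Eigen::VectorXcd b = Eigen::VectorXcd::Random(6);
      Eigen::ColPivHouseholderQR<Eigen::MatrixXcd> qr(A);
      Eigen::VectorXcd x = qr.adjoint().solve(b);      // solves A^* x = b
      return (A.adjoint() * x - b).norm() < 1e-8 * b.norm() ? 0 : 1;
    }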
namespace internal {
diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
index 03017a375..2fc3c871a 100644
--- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h
+++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h
@@ -16,6 +16,9 @@ namespace internal {
template <typename _MatrixType>
struct traits<CompleteOrthogonalDecomposition<_MatrixType> >
: traits<_MatrixType> {
+ typedef MatrixXpr XprKind;
+ typedef SolverStorage StorageKind;
+ typedef int StorageIndex;
enum { Flags = 0 };
};
@@ -44,19 +47,21 @@ struct traits<CompleteOrthogonalDecomposition<_MatrixType> >
*
* \sa MatrixBase::completeOrthogonalDecomposition()
*/
-template <typename _MatrixType>
-class CompleteOrthogonalDecomposition {
+template <typename _MatrixType> class CompleteOrthogonalDecomposition
+ : public SolverBase<CompleteOrthogonalDecomposition<_MatrixType> >
+{
public:
typedef _MatrixType MatrixType;
+ typedef SolverBase<CompleteOrthogonalDecomposition> Base;
+
+ template<typename Derived>
+ friend struct internal::solve_assertion;
+
+ EIGEN_GENERIC_PUBLIC_INTERFACE(CompleteOrthogonalDecomposition)
enum {
- RowsAtCompileTime = MatrixType::RowsAtCompileTime,
- ColsAtCompileTime = MatrixType::ColsAtCompileTime,
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
};
- typedef typename MatrixType::Scalar Scalar;
- typedef typename MatrixType::RealScalar RealScalar;
- typedef typename MatrixType::StorageIndex StorageIndex;
typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime>
PermutationType;
@@ -131,9 +136,9 @@ class CompleteOrthogonalDecomposition {
m_temp(matrix.cols())
{
computeInPlace();
- }
-
+ }
+ #ifdef EIGEN_PARSED_BY_DOXYGEN
/** This method computes the minimum-norm solution X to a least squares
* problem \f[\mathrm{minimize} \|A X - B\|, \f] where \b A is the matrix of
* which \c *this is the complete orthogonal decomposition.
@@ -145,11 +150,8 @@ class CompleteOrthogonalDecomposition {
*/
template <typename Rhs>
inline const Solve<CompleteOrthogonalDecomposition, Rhs> solve(
- const MatrixBase<Rhs>& b) const {
- eigen_assert(m_cpqr.m_isInitialized &&
- "CompleteOrthogonalDecomposition is not initialized.");
- return Solve<CompleteOrthogonalDecomposition, Rhs>(*this, b.derived());
- }
+ const MatrixBase<Rhs>& b) const;
+ #endif
HouseholderSequenceType householderQ(void) const;
HouseholderSequenceType matrixQ(void) const { return m_cpqr.householderQ(); }
@@ -158,8 +160,8 @@ class CompleteOrthogonalDecomposition {
*/
MatrixType matrixZ() const {
MatrixType Z = MatrixType::Identity(m_cpqr.cols(), m_cpqr.cols());
- applyZAdjointOnTheLeftInPlace(Z);
- return Z.adjoint();
+ applyZOnTheLeftInPlace<false>(Z);
+ return Z;
}
/** \returns a reference to the matrix where the complete orthogonal
@@ -275,6 +277,7 @@ class CompleteOrthogonalDecomposition {
*/
inline const Inverse<CompleteOrthogonalDecomposition> pseudoInverse() const
{
+ eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized.");
return Inverse<CompleteOrthogonalDecomposition>(*this);
}
@@ -368,6 +371,9 @@ class CompleteOrthogonalDecomposition {
#ifndef EIGEN_PARSED_BY_DOXYGEN
template <typename RhsType, typename DstType>
void _solve_impl(const RhsType& rhs, DstType& dst) const;
+
+ template<bool Conjugate, typename RhsType, typename DstType>
+ void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
#endif
protected:
@@ -375,8 +381,22 @@ class CompleteOrthogonalDecomposition {
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
}
+ template<bool Transpose_, typename Rhs>
+ void _check_solve_assertion(const Rhs& b) const {
+ EIGEN_ONLY_USED_FOR_DEBUG(b);
+ eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized.");
+ eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "CompleteOrthogonalDecomposition::solve(): invalid number of rows of the right hand side matrix b");
+ }
+
void computeInPlace();
+ /** Overwrites \b rhs with \f$ \mathbf{Z} * \mathbf{rhs} \f$ or
+ * \f$ \mathbf{\overline Z} * \mathbf{rhs} \f$ if \c Conjugate
+ * is set to \c true.
+ */
+ template <bool Conjugate, typename Rhs>
+ void applyZOnTheLeftInPlace(Rhs& rhs) const;
+
/** Overwrites \b rhs with \f$ \mathbf{Z}^* * \mathbf{rhs} \f$.
*/
template <typename Rhs>
@@ -465,13 +485,35 @@ void CompleteOrthogonalDecomposition<MatrixType>::computeInPlace()
}
template <typename MatrixType>
+template <bool Conjugate, typename Rhs>
+void CompleteOrthogonalDecomposition<MatrixType>::applyZOnTheLeftInPlace(
+ Rhs& rhs) const {
+ const Index cols = this->cols();
+ const Index nrhs = rhs.cols();
+ const Index rank = this->rank();
+ Matrix<typename Rhs::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs));
+ for (Index k = rank-1; k >= 0; --k) {
+ if (k != rank - 1) {
+ rhs.row(k).swap(rhs.row(rank - 1));
+ }
+ rhs.middleRows(rank - 1, cols - rank + 1)
+ .applyHouseholderOnTheLeft(
+ matrixQTZ().row(k).tail(cols - rank).transpose().template conjugateIf<!Conjugate>(), zCoeffs().template conjugateIf<Conjugate>()(k),
+ &temp(0));
+ if (k != rank - 1) {
+ rhs.row(k).swap(rhs.row(rank - 1));
+ }
+ }
+}
+
+template <typename MatrixType>
template <typename Rhs>
void CompleteOrthogonalDecomposition<MatrixType>::applyZAdjointOnTheLeftInPlace(
Rhs& rhs) const {
const Index cols = this->cols();
const Index nrhs = rhs.cols();
const Index rank = this->rank();
- Matrix<typename MatrixType::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs));
+ Matrix<typename Rhs::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs));
for (Index k = 0; k < rank; ++k) {
if (k != rank - 1) {
rhs.row(k).swap(rhs.row(rank - 1));
@@ -491,8 +533,6 @@ template <typename _MatrixType>
template <typename RhsType, typename DstType>
void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl(
const RhsType& rhs, DstType& dst) const {
- eigen_assert(rhs.rows() == this->rows());
-
const Index rank = this->rank();
if (rank == 0) {
dst.setZero();
@@ -520,6 +560,34 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl(
// Undo permutation to get x = P^{-1} * y.
dst = colsPermutation() * dst;
}
+
+template<typename _MatrixType>
+template<bool Conjugate, typename RhsType, typename DstType>
+void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+ const Index rank = this->rank();
+
+ if (rank == 0) {
+ dst.setZero();
+ return;
+ }
+
+ typename RhsType::PlainObject c(colsPermutation().transpose()*rhs);
+
+ if (rank < cols()) {
+ applyZOnTheLeftInPlace<!Conjugate>(c);
+ }
+
+ matrixT().topLeftCorner(rank, rank)
+ .template triangularView<Upper>()
+ .transpose().template conjugateIf<Conjugate>()
+ .solveInPlace(c.topRows(rank));
+
+ dst.topRows(rank) = c.topRows(rank);
+ dst.bottomRows(rows()-rank).setZero();
+
+ dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf<!Conjugate>() );
+}
#endif
namespace internal {
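The same pattern applies to CompleteOrthogonalDecomposition: solve() now comes from SolverBase, and the initialization check moved into _check_solve_assertion and pseudoInverse(). A hedged usage sketch (illustrative only):

    #include <Eigen/Dense>
    using namespace Eigen;

    int main() {
      MatrixXd A = MatrixXd::Random(6, 4);
      VectorXd b = VectorXd::Random(6);
      CompleteOrthogonalDecomposition<MatrixXd> cod(A);
      VectorXd x    = cod.solve(b);              // minimum-norm least-squares solution
      MatrixXd pinv = cod.pseudoInverse();       // asserts the decomposition is initialized
      VectorXd y    = cod.adjoint().solve(VectorXd::Random(4)); // exercises _solve_impl_transposed
      return 0;
    }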
diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h
index c31e47cc4..d0664a1d8 100644
--- a/Eigen/src/QR/FullPivHouseholderQR.h
+++ b/Eigen/src/QR/FullPivHouseholderQR.h
@@ -18,6 +18,9 @@ namespace internal {
template<typename _MatrixType> struct traits<FullPivHouseholderQR<_MatrixType> >
: traits<_MatrixType>
{
+ typedef MatrixXpr XprKind;
+ typedef SolverStorage StorageKind;
+ typedef int StorageIndex;
enum { Flags = 0 };
};
@@ -55,20 +58,19 @@ struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
* \sa MatrixBase::fullPivHouseholderQr()
*/
template<typename _MatrixType> class FullPivHouseholderQR
+ : public SolverBase<FullPivHouseholderQR<_MatrixType> >
{
public:
typedef _MatrixType MatrixType;
+ typedef SolverBase<FullPivHouseholderQR> Base;
+ friend class SolverBase<FullPivHouseholderQR>;
+
+ EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivHouseholderQR)
enum {
- RowsAtCompileTime = MatrixType::RowsAtCompileTime,
- ColsAtCompileTime = MatrixType::ColsAtCompileTime,
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
};
- typedef typename MatrixType::Scalar Scalar;
- typedef typename MatrixType::RealScalar RealScalar;
- // FIXME should be int
- typedef typename MatrixType::StorageIndex StorageIndex;
typedef internal::FullPivHouseholderQRMatrixQReturnType<MatrixType> MatrixQReturnType;
typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
typedef Matrix<StorageIndex, 1,
@@ -156,6 +158,7 @@ template<typename _MatrixType> class FullPivHouseholderQR
computeInPlace();
}
+ #ifdef EIGEN_PARSED_BY_DOXYGEN
/** This method finds a solution x to the equation Ax=b, where A is the matrix of which
* \c *this is the QR decomposition.
*
@@ -173,11 +176,8 @@ template<typename _MatrixType> class FullPivHouseholderQR
*/
template<typename Rhs>
inline const Solve<FullPivHouseholderQR, Rhs>
- solve(const MatrixBase<Rhs>& b) const
- {
- eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
- return Solve<FullPivHouseholderQR, Rhs>(*this, b.derived());
- }
+ solve(const MatrixBase<Rhs>& b) const;
+ #endif
/** \returns Expression object representing the matrix Q
*/
@@ -396,6 +396,9 @@ template<typename _MatrixType> class FullPivHouseholderQR
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+ template<bool Conjugate, typename RhsType, typename DstType>
+ void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
#endif
protected:
@@ -498,15 +501,15 @@ void FullPivHouseholderQR<MatrixType>::computeInPlace()
m_nonzero_pivots = k;
for(Index i = k; i < size; i++)
{
- m_rows_transpositions.coeffRef(i) = i;
- m_cols_transpositions.coeffRef(i) = i;
+ m_rows_transpositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
+ m_cols_transpositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
m_hCoeffs.coeffRef(i) = Scalar(0);
}
break;
}
- m_rows_transpositions.coeffRef(k) = row_of_biggest_in_corner;
- m_cols_transpositions.coeffRef(k) = col_of_biggest_in_corner;
+ m_rows_transpositions.coeffRef(k) = internal::convert_index<StorageIndex>(row_of_biggest_in_corner);
+ m_cols_transpositions.coeffRef(k) = internal::convert_index<StorageIndex>(col_of_biggest_in_corner);
if(k != row_of_biggest_in_corner) {
m_qr.row(k).tail(cols-k).swap(m_qr.row(row_of_biggest_in_corner).tail(cols-k));
++number_of_transpositions;
@@ -540,7 +543,6 @@ template<typename _MatrixType>
template<typename RhsType, typename DstType>
void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
{
- eigen_assert(rhs.rows() == rows());
const Index l_rank = rank();
// FIXME introduce nonzeroPivots() and use it here. and more generally,
@@ -553,7 +555,7 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType
typename RhsType::PlainObject c(rhs);
- Matrix<Scalar,1,RhsType::ColsAtCompileTime> temp(rhs.cols());
+ Matrix<typename RhsType::Scalar,1,RhsType::ColsAtCompileTime> temp(rhs.cols());
for (Index k = 0; k < l_rank; ++k)
{
Index remainingSize = rows()-k;
@@ -570,6 +572,42 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType
for(Index i = 0; i < l_rank; ++i) dst.row(m_cols_permutation.indices().coeff(i)) = c.row(i);
for(Index i = l_rank; i < cols(); ++i) dst.row(m_cols_permutation.indices().coeff(i)).setZero();
}
+
+template<typename _MatrixType>
+template<bool Conjugate, typename RhsType, typename DstType>
+void FullPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+ const Index l_rank = rank();
+
+ if(l_rank == 0)
+ {
+ dst.setZero();
+ return;
+ }
+
+ typename RhsType::PlainObject c(m_cols_permutation.transpose()*rhs);
+
+ m_qr.topLeftCorner(l_rank, l_rank)
+ .template triangularView<Upper>()
+ .transpose().template conjugateIf<Conjugate>()
+ .solveInPlace(c.topRows(l_rank));
+
+ dst.topRows(l_rank) = c.topRows(l_rank);
+ dst.bottomRows(rows()-l_rank).setZero();
+
+ Matrix<Scalar, 1, DstType::ColsAtCompileTime> temp(dst.cols());
+ const Index size = (std::min)(rows(), cols());
+ for (Index k = size-1; k >= 0; --k)
+ {
+ Index remainingSize = rows()-k;
+
+ dst.bottomRightCorner(remainingSize, dst.cols())
+ .applyHouseholderOnTheLeft(m_qr.col(k).tail(remainingSize-1).template conjugateIf<!Conjugate>(),
+ m_hCoeffs.template conjugateIf<Conjugate>().coeff(k), &temp.coeffRef(0));
+
+ dst.row(k).swap(dst.row(m_rows_transpositions.coeff(k)));
+ }
+}
#endif
namespace internal {
diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h
index 33cb9c8ff..801739fbd 100644
--- a/Eigen/src/QR/HouseholderQR.h
+++ b/Eigen/src/QR/HouseholderQR.h
@@ -14,6 +14,18 @@
namespace Eigen {
+namespace internal {
+template<typename _MatrixType> struct traits<HouseholderQR<_MatrixType> >
+ : traits<_MatrixType>
+{
+ typedef MatrixXpr XprKind;
+ typedef SolverStorage StorageKind;
+ typedef int StorageIndex;
+ enum { Flags = 0 };
+};
+
+} // end namespace internal
+
/** \ingroup QR_Module
*
*
@@ -42,20 +54,19 @@ namespace Eigen {
* \sa MatrixBase::householderQr()
*/
template<typename _MatrixType> class HouseholderQR
+ : public SolverBase<HouseholderQR<_MatrixType> >
{
public:
typedef _MatrixType MatrixType;
+ typedef SolverBase<HouseholderQR> Base;
+ friend class SolverBase<HouseholderQR>;
+
+ EIGEN_GENERIC_PUBLIC_INTERFACE(HouseholderQR)
enum {
- RowsAtCompileTime = MatrixType::RowsAtCompileTime,
- ColsAtCompileTime = MatrixType::ColsAtCompileTime,
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
};
- typedef typename MatrixType::Scalar Scalar;
- typedef typename MatrixType::RealScalar RealScalar;
- // FIXME should be int
- typedef typename MatrixType::StorageIndex StorageIndex;
typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, (MatrixType::Flags&RowMajorBit) ? RowMajor : ColMajor, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixQType;
typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
@@ -121,6 +132,7 @@ template<typename _MatrixType> class HouseholderQR
computeInPlace();
}
+ #ifdef EIGEN_PARSED_BY_DOXYGEN
/** This method finds a solution x to the equation Ax=b, where A is the matrix of which
* *this is the QR decomposition, if any exists.
*
@@ -137,11 +149,8 @@ template<typename _MatrixType> class HouseholderQR
*/
template<typename Rhs>
inline const Solve<HouseholderQR, Rhs>
- solve(const MatrixBase<Rhs>& b) const
- {
- eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
- return Solve<HouseholderQR, Rhs>(*this, b.derived());
- }
+ solve(const MatrixBase<Rhs>& b) const;
+ #endif
/** This method returns an expression of the unitary matrix Q as a sequence of Householder transformations.
*
@@ -214,6 +223,9 @@ template<typename _MatrixType> class HouseholderQR
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+ template<bool Conjugate, typename RhsType, typename DstType>
+ void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
#endif
protected:
@@ -349,7 +361,6 @@ template<typename RhsType, typename DstType>
void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
{
const Index rank = (std::min)(rows(), cols());
- eigen_assert(rhs.rows() == rows());
typename RhsType::PlainObject c(rhs);
@@ -362,6 +373,25 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c
dst.topRows(rank) = c.topRows(rank);
dst.bottomRows(cols()-rank).setZero();
}
+
+template<typename _MatrixType>
+template<bool Conjugate, typename RhsType, typename DstType>
+void HouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+ const Index rank = (std::min)(rows(), cols());
+
+ typename RhsType::PlainObject c(rhs);
+
+ m_qr.topLeftCorner(rank, rank)
+ .template triangularView<Upper>()
+ .transpose().template conjugateIf<Conjugate>()
+ .solveInPlace(c.topRows(rank));
+
+ dst.topRows(rank) = c.topRows(rank);
+ dst.bottomRows(rows()-rank).setZero();
+
+ dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf<!Conjugate>() );
+}
#endif
/** Performs the QR factorization of the given matrix \a matrix. The result of
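For reference, the algebra behind _solve_impl_transposed above: with A = QR and Q unitary,

    \f[ A^* x = b \iff R^* Q^* x = b \iff x = Q\,(R^*)^{-1} b, \qquad
        A^T x = b \iff R^T Q^T x = b \iff x = \overline{Q}\,(R^T)^{-1} b \f]

so the code first solves with the transposed (and, for the adjoint case, conjugated) triangular factor, then applies householderQ() wrapped in conjugateIf<!Conjugate>.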
diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index 18d7bdc0a..e3fddacbc 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -39,6 +39,7 @@ namespace internal {
template<typename _MatrixType>
struct traits<BDCSVD<_MatrixType> >
+ : traits<_MatrixType>
{
typedef _MatrixType MatrixType;
};
@@ -1006,7 +1007,7 @@ void BDCSVD<MatrixType>::perturbCol0
#ifdef EIGEN_BDCSVD_SANITY_CHECKS
assert((std::isfinite)(tmp));
#endif
- zhat(k) = col0(k) > Literal(0) ? tmp : -tmp;
+ zhat(k) = col0(k) > Literal(0) ? RealScalar(tmp) : RealScalar(-tmp);
}
}
}
diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h
index 1c7c80376..2b6891105 100644
--- a/Eigen/src/SVD/JacobiSVD.h
+++ b/Eigen/src/SVD/JacobiSVD.h
@@ -425,6 +425,7 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
template<typename _MatrixType, int QRPreconditioner>
struct traits<JacobiSVD<_MatrixType,QRPreconditioner> >
+ : traits<_MatrixType>
{
typedef _MatrixType MatrixType;
};
diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h
index 851ad6836..68df48921 100644
--- a/Eigen/src/SVD/SVDBase.h
+++ b/Eigen/src/SVD/SVDBase.h
@@ -17,6 +17,18 @@
#define EIGEN_SVDBASE_H
namespace Eigen {
+
+namespace internal {
+template<typename Derived> struct traits<SVDBase<Derived> >
+ : traits<Derived>
+{
+ typedef MatrixXpr XprKind;
+ typedef SolverStorage StorageKind;
+ typedef int StorageIndex;
+ enum { Flags = 0 };
+};
+}
+
/** \ingroup SVD_Module
*
*
@@ -44,15 +56,18 @@ namespace Eigen {
* terminate in finite (and reasonable) time.
* \sa class BDCSVD, class JacobiSVD
*/
-template<typename Derived>
-class SVDBase
+template<typename Derived> class SVDBase
+ : public SolverBase<SVDBase<Derived> >
{
+public:
+
+ template<typename Derived_>
+ friend struct internal::solve_assertion;
-public:
typedef typename internal::traits<Derived>::MatrixType MatrixType;
typedef typename MatrixType::Scalar Scalar;
typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
- typedef typename MatrixType::StorageIndex StorageIndex;
+ typedef typename Eigen::internal::traits<SVDBase>::StorageIndex StorageIndex;
typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
enum {
RowsAtCompileTime = MatrixType::RowsAtCompileTime,
@@ -194,6 +209,7 @@ public:
inline Index rows() const { return m_rows; }
inline Index cols() const { return m_cols; }
+ #ifdef EIGEN_PARSED_BY_DOXYGEN
/** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A.
*
* \param b the right-hand-side of the equation to solve.
@@ -205,16 +221,15 @@ public:
*/
template<typename Rhs>
inline const Solve<Derived, Rhs>
- solve(const MatrixBase<Rhs>& b) const
- {
- eigen_assert(m_isInitialized && "SVD is not initialized.");
- eigen_assert(computeU() && computeV() && "SVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice).");
- return Solve<Derived, Rhs>(derived(), b.derived());
- }
-
+ solve(const MatrixBase<Rhs>& b) const;
+ #endif
+
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+ template<bool Conjugate, typename RhsType, typename DstType>
+ void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
#endif
protected:
@@ -223,6 +238,14 @@ protected:
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
}
+
+ template<bool Transpose_, typename Rhs>
+ void _check_solve_assertion(const Rhs& b) const {
+ EIGEN_ONLY_USED_FOR_DEBUG(b);
+ eigen_assert(m_isInitialized && "SVD is not initialized.");
+ eigen_assert(computeU() && computeV() && "SVDBase::solve(): Both unitaries U and V are required to be computed (thin unitaries suffice).");
+ eigen_assert((Transpose_?cols():rows())==b.rows() && "SVDBase::solve(): invalid number of rows of the right hand side matrix b");
+ }
// return true if already allocated
bool allocate(Index rows, Index cols, unsigned int computationOptions) ;
@@ -263,17 +286,30 @@ template<typename Derived>
template<typename RhsType, typename DstType>
void SVDBase<Derived>::_solve_impl(const RhsType &rhs, DstType &dst) const
{
- eigen_assert(rhs.rows() == rows());
-
// A = U S V^*
// So A^{-1} = V S^{-1} U^*
- Matrix<Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp;
+ Matrix<typename RhsType::Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp;
Index l_rank = rank();
tmp.noalias() = m_matrixU.leftCols(l_rank).adjoint() * rhs;
tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp;
dst = m_matrixV.leftCols(l_rank) * tmp;
}
+
+template<typename Derived>
+template<bool Conjugate, typename RhsType, typename DstType>
+void SVDBase<Derived>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+ // A = U S V^*
+ // So A^{-*} = U S^{-1} V^*
+ // And A^{-T} = U_conj S^{-1} V^T
+ Matrix<typename RhsType::Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp;
+ Index l_rank = rank();
+
+ tmp.noalias() = m_matrixV.leftCols(l_rank).transpose().template conjugateIf<Conjugate>() * rhs;
+ tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp;
+ dst = m_matrixU.template conjugateIf<!Conjugate>().leftCols(l_rank) * tmp;
+}
#endif
template<typename MatrixType>
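Usage sketch for the SVD solver after the SolverBase migration (illustrative; both unitaries must be requested, as the new _check_solve_assertion enforces):

    #include <Eigen/Dense>
    using namespace Eigen;

    int main() {
      MatrixXd A = MatrixXd::Random(8, 3);
      VectorXd b = VectorXd::Random(8);
      BDCSVD<MatrixXd> svd(A, ComputeThinU | ComputeThinV);
      VectorXd x = svd.solve(b);                              // least-squares solution of A x = b
      VectorXd y = svd.adjoint().solve(VectorXd::Random(3));  // least-squares solution of A^* y = c
      return 0;
    }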
diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h
index d89fa0dae..acd986fab 100644
--- a/Eigen/src/SparseCore/CompressedStorage.h
+++ b/Eigen/src/SparseCore/CompressedStorage.h
@@ -207,6 +207,22 @@ class CompressedStorage
return m_values[id];
}
+ void moveChunk(Index from, Index to, Index chunkSize)
+ {
+ eigen_internal_assert(to+chunkSize <= m_size);
+ if(to>from && from+chunkSize>to)
+ {
+ // move backward
+ internal::smart_memmove(m_values+from, m_values+from+chunkSize, m_values+to);
+ internal::smart_memmove(m_indices+from, m_indices+from+chunkSize, m_indices+to);
+ }
+ else
+ {
+ internal::smart_copy(m_values+from, m_values+from+chunkSize, m_values+to);
+ internal::smart_copy(m_indices+from, m_indices+from+chunkSize, m_indices+to);
+ }
+ }
+
void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
{
Index k = 0;
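A note on the branch in moveChunk above: the condition to>from && from+chunkSize>to is exactly the case where a forward element-by-element copy would read slots that were already overwritten, hence the memmove-style path; in every other case the ranges are disjoint or shift left, and a plain copy is safe. A standalone sketch of the rule (hypothetical helper, not part of the patch):

    // Returns true when [from, from+chunkSize) and [to, to+chunkSize) overlap
    // in a way that would corrupt data under a plain forward copy.
    inline bool needsBackwardMove(long from, long to, long chunkSize) {
      return to > from && from + chunkSize > to;
    }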
diff --git a/Eigen/src/SparseCore/SparseAssign.h b/Eigen/src/SparseCore/SparseAssign.h
index 71452e75e..905485c88 100644
--- a/Eigen/src/SparseCore/SparseAssign.h
+++ b/Eigen/src/SparseCore/SparseAssign.h
@@ -246,35 +246,22 @@ struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Sparse>
{
typedef typename DstXprType::StorageIndex StorageIndex;
typedef typename DstXprType::Scalar Scalar;
- typedef Array<StorageIndex,Dynamic,1> ArrayXI;
- typedef Array<Scalar,Dynamic,1> ArrayXS;
- template<int Options>
- static void run(SparseMatrix<Scalar,Options,StorageIndex> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
- {
- Index dstRows = src.rows();
- Index dstCols = src.cols();
- if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
- dst.resize(dstRows, dstCols);
- Index size = src.diagonal().size();
- dst.makeCompressed();
- dst.resizeNonZeros(size);
- Map<ArrayXI>(dst.innerIndexPtr(), size).setLinSpaced(0,StorageIndex(size)-1);
- Map<ArrayXI>(dst.outerIndexPtr(), size+1).setLinSpaced(0,StorageIndex(size));
- Map<ArrayXS>(dst.valuePtr(), size) = src.diagonal();
- }
+ template<int Options, typename AssignFunc>
+ static void run(SparseMatrix<Scalar,Options,StorageIndex> &dst, const SrcXprType &src, const AssignFunc &func)
+ { dst.assignDiagonal(src.diagonal(), func); }
template<typename DstDerived>
static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
- {
- dst.diagonal() = src.diagonal();
- }
+ { dst.derived().diagonal() = src.diagonal(); }
- static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
- { dst.diagonal() += src.diagonal(); }
+ template<typename DstDerived>
+ static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+ { dst.derived().diagonal() += src.diagonal(); }
- static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
- { dst.diagonal() -= src.diagonal(); }
+ template<typename DstDerived>
+ static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+ { dst.derived().diagonal() -= src.diagonal(); }
};
} // end namespace internal
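The observable effect of the rewritten Diagonal2Sparse kernel: diagonal expressions can be assigned, added, or subtracted into a SparseMatrix, all funneled through SparseMatrix::assignDiagonal. A hedged sketch (illustrative):

    #include <Eigen/Sparse>
    #include <Eigen/Dense>
    using namespace Eigen;

    int main() {
      const int n = 100;
      VectorXd d = VectorXd::Random(n);
      SparseMatrix<double> S(n, n);
      S  = d.asDiagonal();   // overwrite: fast dense-like path (assign_op)
      S += d.asDiagonal();   // updates existing diagonal entries in place
      S -= d.asDiagonal();
      return 0;
    }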
diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h
index e0b3c22b6..6a2c7a8ce 100644
--- a/Eigen/src/SparseCore/SparseCompressedBase.h
+++ b/Eigen/src/SparseCore/SparseCompressedBase.h
@@ -128,6 +128,28 @@ class SparseCompressedBase
protected:
/** Default constructor. Do nothing. */
SparseCompressedBase() {}
+
+ /** \internal return the index of the coeff at (row,col) or just before if it does not exist.
+ * This is an analogue of std::lower_bound.
+ */
+ internal::LowerBoundIndex lower_bound(Index row, Index col) const
+ {
+ eigen_internal_assert(row>=0 && row<this->rows() && col>=0 && col<this->cols());
+
+ const Index outer = Derived::IsRowMajor ? row : col;
+ const Index inner = Derived::IsRowMajor ? col : row;
+
+ Index start = this->outerIndexPtr()[outer];
+ Index end = this->isCompressed() ? this->outerIndexPtr()[outer+1] : this->outerIndexPtr()[outer] + this->innerNonZeroPtr()[outer];
+ eigen_assert(end>=start && "you are using a non-finalized sparse matrix or the written coefficient does not exist");
+ internal::LowerBoundIndex p;
+ p.value = std::lower_bound(this->innerIndexPtr()+start, this->innerIndexPtr()+end,inner) - this->innerIndexPtr();
+ p.found = (p.value<end) && (this->innerIndexPtr()[p.value]==inner);
+ return p;
+ }
+
+ friend struct internal::evaluator<SparseCompressedBase<Derived> >;
+
private:
template<typename OtherDerived> explicit SparseCompressedBase(const SparseCompressedBase<OtherDerived>&);
};
@@ -333,17 +355,8 @@ protected:
Index find(Index row, Index col) const
{
- eigen_internal_assert(row>=0 && row<m_matrix->rows() && col>=0 && col<m_matrix->cols());
-
- const Index outer = Derived::IsRowMajor ? row : col;
- const Index inner = Derived::IsRowMajor ? col : row;
-
- Index start = m_matrix->outerIndexPtr()[outer];
- Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer];
- eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist");
- const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) - m_matrix->innerIndexPtr();
-
- return ((p<end) && (m_matrix->innerIndexPtr()[p]==inner)) ? p : Dynamic;
+ internal::LowerBoundIndex p = m_matrix->lower_bound(row,col);
+ return p.found ? p.value : Dynamic;
}
const Derived *m_matrix;
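find() is now a thin wrapper over the shared lower_bound(), whose contract mirrors std::lower_bound: value is the insertion position within the inner vector's index range, and found tells whether the coefficient exists there. Conceptually (internal API, shown only for illustration):

    internal::LowerBoundIndex p = m_matrix->lower_bound(row, col);
    Index pos = p.found ? p.value : Dynamic;   // exactly what find(row, col) returns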
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index eedae47e8..63dd1cc32 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -99,6 +99,8 @@ class SparseMatrix
typedef SparseCompressedBase<SparseMatrix> Base;
using Base::convert_index;
friend class SparseVector<_Scalar,0,_StorageIndex>;
+ template<typename, typename, typename, typename, typename>
+ friend struct internal::Assignment;
public:
using Base::isCompressed;
using Base::nonZeros;
@@ -502,7 +504,7 @@ class SparseMatrix
m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i];
}
}
-
+
/** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */
void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
{
@@ -895,6 +897,113 @@ public:
m_data.index(p) = convert_index(inner);
return (m_data.value(p) = Scalar(0));
}
+protected:
+ struct IndexPosPair {
+ IndexPosPair(Index a_i, Index a_p) : i(a_i), p(a_p) {}
+ Index i;
+ Index p;
+ };
+
+ /** \internal assign \a diagXpr to the diagonal of \c *this
+ * There are different strategies:
+ * 1 - if *this is overwritten (Func==assign_op) or *this is empty, then we can treat *this as a dense vector expression.
+ * 2 - otherwise, for each diagonal coeff,
+ * 2.a - if it already exists, then we update it,
+ * 2.b - otherwise, if *this is uncompressed and the current inner-vector has room for at least 1 element, then we perform an in-place insertion.
+ * 2.c - otherwise, we'll have to reallocate and copy everything, so instead of doing so for each new element, it is recorded in a std::vector.
+ * 3 - at the end, if some entries failed to be inserted in-place, then we allocate a new buffer, copy each chunk to its final position, and insert the new elements.
+ *
+ * TODO: some piece of code could be isolated and reused for a general in-place update strategy.
+ * TODO: if we start to defer the insertion of some elements (i.e., case 2.c executed once),
+ * then it *might* be better to disable case 2.b since they will have to be copied anyway.
+ */
+ template<typename DiagXpr, typename Func>
+ void assignDiagonal(const DiagXpr diagXpr, const Func& assignFunc)
+ {
+ Index n = diagXpr.size();
+
+ const bool overwrite = internal::is_same<Func, internal::assign_op<Scalar,Scalar> >::value;
+ if(overwrite)
+ {
+ if((this->rows()!=n) || (this->cols()!=n))
+ this->resize(n, n);
+ }
+
+ if(m_data.size()==0 || overwrite)
+ {
+ typedef Array<StorageIndex,Dynamic,1> ArrayXI;
+ this->makeCompressed();
+ this->resizeNonZeros(n);
+ Eigen::Map<ArrayXI>(this->innerIndexPtr(), n).setLinSpaced(0,StorageIndex(n)-1);
+ Eigen::Map<ArrayXI>(this->outerIndexPtr(), n+1).setLinSpaced(0,StorageIndex(n));
+ Eigen::Map<Array<Scalar,Dynamic,1> > values = this->coeffs();
+ values.setZero();
+ internal::call_assignment_no_alias(values, diagXpr, assignFunc);
+ }
+ else
+ {
+ bool isComp = isCompressed();
+ internal::evaluator<DiagXpr> diaEval(diagXpr);
+ std::vector<IndexPosPair> newEntries;
+
+ // 1 - try in-place update and record insertion failures
+ for(Index i = 0; i<n; ++i)
+ {
+ internal::LowerBoundIndex lb = this->lower_bound(i,i);
+ Index p = lb.value;
+ if(lb.found)
+ {
+ // the coeff already exists
+ assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i));
+ }
+ else if((!isComp) && m_innerNonZeros[i] < (m_outerIndex[i+1]-m_outerIndex[i]))
+ {
+ // non compressed mode with local room for inserting one element
+ m_data.moveChunk(p, p+1, m_outerIndex[i]+m_innerNonZeros[i]-p);
+ m_innerNonZeros[i]++;
+ m_data.value(p) = Scalar(0);
+ m_data.index(p) = StorageIndex(i);
+ assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i));
+ }
+ else
+ {
+ // defer insertion
+ newEntries.push_back(IndexPosPair(i,p));
+ }
+ }
+ // 2 - insert deferred entries
+ Index n_entries = Index(newEntries.size());
+ if(n_entries>0)
+ {
+ Storage newData(m_data.size()+n_entries);
+ Index prev_p = 0;
+ Index prev_i = 0;
+ for(Index k=0; k<n_entries;++k)
+ {
+ Index i = newEntries[k].i;
+ Index p = newEntries[k].p;
+ internal::smart_copy(m_data.valuePtr()+prev_p, m_data.valuePtr()+p, newData.valuePtr()+prev_p+k);
+ internal::smart_copy(m_data.indexPtr()+prev_p, m_data.indexPtr()+p, newData.indexPtr()+prev_p+k);
+ for(Index j=prev_i;j<i;++j)
+ m_outerIndex[j+1] += k;
+ if(!isComp)
+ m_innerNonZeros[i]++;
+ prev_p = p;
+ prev_i = i;
+ newData.value(p+k) = Scalar(0);
+ newData.index(p+k) = StorageIndex(i);
+ assignFunc.assignCoeff(newData.value(p+k), diaEval.coeff(i));
+ }
+ {
+ internal::smart_copy(m_data.valuePtr()+prev_p, m_data.valuePtr()+m_data.size(), newData.valuePtr()+prev_p+n_entries);
+ internal::smart_copy(m_data.indexPtr()+prev_p, m_data.indexPtr()+m_data.size(), newData.indexPtr()+prev_p+n_entries);
+ for(Index j=prev_i+1;j<=m_outerSize;++j)
+ m_outerIndex[j] += n_entries;
+ }
+ m_data.swap(newData);
+ }
+ }
+ }
private:
static void check_template_parameters()
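Strategy 2.b above can only trigger on an uncompressed matrix whose inner vectors have slack, e.g. after a generous reserve(); otherwise missing diagonal entries fall through to the deferred-reallocation path (2.c/3). A hedged sketch, reusing n, d, and S from the diagonal-assignment example above:

    S.setZero();
    S.reserve(VectorXi::Constant(n, 4));   // per-column slack keeps S uncompressed
    for (int j = 0; j < n - 1; ++j)
      S.insert(j + 1, j) = 1.0;            // sub-diagonal entries; diagonal still missing
    S += d.asDiagonal();                   // diagonal coeffs inserted in place (case 2.b)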
diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h
index 74df0d496..ceb936887 100644
--- a/Eigen/src/SparseCore/SparseUtil.h
+++ b/Eigen/src/SparseCore/SparseUtil.h
@@ -140,6 +140,14 @@ struct SparseSelfAdjointShape { static std::string debugName() { return "SparseS
template<> struct glue_shapes<SparseShape,SelfAdjointShape> { typedef SparseSelfAdjointShape type; };
template<> struct glue_shapes<SparseShape,TriangularShape > { typedef SparseTriangularShape type; };
+// return type of SparseCompressedBase::lower_bound
+struct LowerBoundIndex {
+ LowerBoundIndex() : value(-1), found(false) {}
+ LowerBoundIndex(Index val, bool ok) : value(val), found(ok) {}
+ Index value;
+ bool found;
+};
+
} // end namespace internal
/** \ingroup SparseCore_Module
diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
index e928db467..2f99ee0b2 100644
--- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h
@@ -23,6 +23,11 @@ typedef CwiseUnaryOp<internal::scalar_atan_op<Scalar>, const Derived> AtanReturn
typedef CwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived> TanhReturnType;
typedef CwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived> LogisticReturnType;
typedef CwiseUnaryOp<internal::scalar_sinh_op<Scalar>, const Derived> SinhReturnType;
+#if EIGEN_HAS_CXX11_MATH
+typedef CwiseUnaryOp<internal::scalar_atanh_op<Scalar>, const Derived> AtanhReturnType;
+typedef CwiseUnaryOp<internal::scalar_asinh_op<Scalar>, const Derived> AsinhReturnType;
+typedef CwiseUnaryOp<internal::scalar_acosh_op<Scalar>, const Derived> AcoshReturnType;
+#endif
typedef CwiseUnaryOp<internal::scalar_cosh_op<Scalar>, const Derived> CoshReturnType;
typedef CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> SquareReturnType;
typedef CwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived> CubeReturnType;
@@ -327,7 +332,7 @@ sinh() const
* Example: \include Cwise_cosh.cpp
* Output: \verbinclude Cwise_cosh.out
*
- * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cosh">Math functions</a>, tan(), sinh(), cosh()
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cosh">Math functions</a>, tanh(), sinh(), cosh()
*/
EIGEN_DEVICE_FUNC
inline const CoshReturnType
@@ -336,6 +341,41 @@ cosh() const
return CoshReturnType(derived());
}
+#if EIGEN_HAS_CXX11_MATH
+/** \returns an expression of the coefficient-wise inverse hyperbolic tangent of *this.
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_atanh">Math functions</a>, atanh(), asinh(), acosh()
+ */
+EIGEN_DEVICE_FUNC
+inline const AtanhReturnType
+atanh() const
+{
+ return AtanhReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise inverse hyperbolic sine of *this.
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_asinh">Math functions</a>, atanh(), asinh(), acosh()
+ */
+EIGEN_DEVICE_FUNC
+inline const AsinhReturnType
+asinh() const
+{
+ return AsinhReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise inverse hyperbolic cosine of *this.
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_acosh">Math functions</a>, atanh(), asinh(), acosh()
+ */
+EIGEN_DEVICE_FUNC
+inline const AcoshReturnType
+acosh() const
+{
+ return AcoshReturnType(derived());
+}
+#endif
+
/** \returns an expression of the coefficient-wise logistic of *this.
*/
EIGEN_DEVICE_FUNC
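Usage sketch for the new inverse hyperbolic members (they require EIGEN_HAS_CXX11_MATH, as guarded above; illustrative):

    #include <Eigen/Dense>
    using namespace Eigen;

    int main() {
      ArrayXd a = ArrayXd::LinSpaced(5, 0.1, 0.9);
      ArrayXd t = a.atanh();          // defined on (-1, 1)
      ArrayXd s = a.asinh();          // defined on all reals
      ArrayXd c = (a + 1.0).acosh();  // defined on [1, +inf)
      return 0;
    }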
diff --git a/Eigen/src/plugins/CommonCwiseUnaryOps.h b/Eigen/src/plugins/CommonCwiseUnaryOps.h
index 89f4faaac..5418dc415 100644
--- a/Eigen/src/plugins/CommonCwiseUnaryOps.h
+++ b/Eigen/src/plugins/CommonCwiseUnaryOps.h
@@ -76,6 +76,20 @@ conjugate() const
return ConjugateReturnType(derived());
}
+/// \returns an expression of the complex conjugate of \c *this if Cond==true, and \c derived() otherwise.
+///
+EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate)
+///
+/// \sa conjugate()
+template<bool Cond>
+EIGEN_DEVICE_FUNC
+inline typename internal::conditional<Cond,ConjugateReturnType,const Derived&>::type
+conjugateIf() const
+{
+ typedef typename internal::conditional<Cond,ConjugateReturnType,const Derived&>::type ReturnType;
+ return ReturnType(derived());
+}
+
/// \returns a read-only expression of the real part of \c *this.
///
EIGEN_DOC_UNARY_ADDONS(real,real part function)
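conjugateIf<Cond> is the compile-time switch the solver changes above rely on: when Cond is false it degrades to a plain const reference with no runtime cost. A minimal dispatch sketch (illustrative):

    #include <Eigen/Dense>
    #include <iostream>
    using namespace Eigen;

    template<bool Conjugate, typename Derived>
    void printMaybeConjugated(const MatrixBase<Derived>& m) {
      // ConjugateReturnType when Conjugate==true, const Derived& otherwise
      std::cout << m.template conjugateIf<Conjugate>() << "\n";
    }

    int main() {
      MatrixXcd M = MatrixXcd::Random(2, 2);
      printMaybeConjugated<true>(M);   // prints conj(M)
      printMaybeConjugated<false>(M);  // prints M unchanged
      return 0;
    }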