diff options
Diffstat (limited to 'Eigen/src/Core')
91 files changed, 5074 insertions, 1488 deletions
diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h index e38eda72c..7480d1e24 100644 --- a/Eigen/src/Core/Array.h +++ b/Eigen/src/Core/Array.h @@ -12,7 +12,16 @@ namespace Eigen { -/** \class Array +namespace internal { +template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols> +struct traits<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > : traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > +{ + typedef ArrayXpr XprKind; + typedef ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > XprBase; +}; +} + +/** \class Array * \ingroup Core_Module * * \brief General-purpose arrays with easy API for coefficient-wise operations @@ -26,21 +35,12 @@ namespace Eigen { * * See documentation of class Matrix for detailed information on the template parameters * storage layout. - * + * * This class can be extended with the help of the plugin mechanism described on the page * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN. 
* - * \sa \ref TutorialArrayClass, \ref TopicClassHierarchy + * \sa \blank \ref TutorialArrayClass, \ref TopicClassHierarchy */ -namespace internal { -template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols> -struct traits<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > : traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > -{ - typedef ArrayXpr XprKind; - typedef ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > XprBase; -}; -} - template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols> class Array : public PlainObjectBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h index b4c24a27a..0443e3032 100644 --- a/Eigen/src/Core/ArrayBase.h +++ b/Eigen/src/Core/ArrayBase.h @@ -103,7 +103,7 @@ template<typename Derived> class ArrayBase /** Special case of the template operator=, in order to prevent the compiler * from generating a default operator= (issue hit with g++ 4.1) */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const ArrayBase& other) { internal::call_assignment(derived(), other.derived()); @@ -112,28 +112,28 @@ template<typename Derived> class ArrayBase /** Set all the entries to \a value. 
* \sa DenseBase::setConstant(), DenseBase::fill() */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Scalar &value) { Base::setConstant(value); return derived(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const Scalar& scalar); - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const Scalar& scalar); template<typename OtherDerived> - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const ArrayBase<OtherDerived>& other); template<typename OtherDerived> - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const ArrayBase<OtherDerived>& other); template<typename OtherDerived> - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator*=(const ArrayBase<OtherDerived>& other); template<typename OtherDerived> - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator/=(const ArrayBase<OtherDerived>& other); public: diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h index 4e484f290..6013d4d85 100644 --- a/Eigen/src/Core/ArrayWrapper.h +++ b/Eigen/src/Core/ArrayWrapper.h @@ -52,7 +52,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> > const Scalar >::type ScalarWithConstIfNotLvalue; - typedef typename internal::ref_selector<ExpressionType>::type NestedExpressionType; + typedef typename internal::ref_selector<ExpressionType>::non_const_type NestedExpressionType; EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {} @@ -67,7 +67,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> > inline Index innerStride() const { return m_expression.innerStride(); } EIGEN_DEVICE_FUNC - inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); } + inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); } EIGEN_DEVICE_FUNC 
inline const Scalar* data() const { return m_expression.data(); } @@ -80,13 +80,13 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> > EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index rowId, Index colId) { - return m_expression.const_cast_derived().coeffRef(rowId, colId); + return m_expression.coeffRef(rowId, colId); } EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index rowId, Index colId) const { - return m_expression.const_cast_derived().coeffRef(rowId, colId); + return m_expression.coeffRef(rowId, colId); } EIGEN_DEVICE_FUNC @@ -98,13 +98,13 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> > EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index) { - return m_expression.const_cast_derived().coeffRef(index); + return m_expression.coeffRef(index); } EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const { - return m_expression.const_cast_derived().coeffRef(index); + return m_expression.coeffRef(index); } template<int LoadMode> @@ -116,7 +116,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> > template<int LoadMode> inline void writePacket(Index rowId, Index colId, const PacketScalar& val) { - m_expression.const_cast_derived().template writePacket<LoadMode>(rowId, colId, val); + m_expression.template writePacket<LoadMode>(rowId, colId, val); } template<int LoadMode> @@ -128,7 +128,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> > template<int LoadMode> inline void writePacket(Index index, const PacketScalar& val) { - m_expression.const_cast_derived().template writePacket<LoadMode>(index, val); + m_expression.template writePacket<LoadMode>(index, val); } template<typename Dest> @@ -145,11 +145,11 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> > /** Forwards the resizing request to the nested expression * \sa DenseBase::resize(Index) */ EIGEN_DEVICE_FUNC - void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); } + void 
resize(Index newSize) { m_expression.resize(newSize); } /** Forwards the resizing request to the nested expression * \sa DenseBase::resize(Index,Index)*/ EIGEN_DEVICE_FUNC - void resize(Index rows, Index cols) { m_expression.const_cast_derived().resize(rows,cols); } + void resize(Index rows, Index cols) { m_expression.resize(rows,cols); } protected: NestedExpressionType m_expression; @@ -195,7 +195,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> > const Scalar >::type ScalarWithConstIfNotLvalue; - typedef typename internal::ref_selector<ExpressionType>::type NestedExpressionType; + typedef typename internal::ref_selector<ExpressionType>::non_const_type NestedExpressionType; EIGEN_DEVICE_FUNC explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {} @@ -210,7 +210,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> > inline Index innerStride() const { return m_expression.innerStride(); } EIGEN_DEVICE_FUNC - inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); } + inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); } EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_expression.data(); } @@ -223,7 +223,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> > EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index rowId, Index colId) { - return m_expression.const_cast_derived().coeffRef(rowId, colId); + return m_expression.coeffRef(rowId, colId); } EIGEN_DEVICE_FUNC @@ -241,13 +241,13 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> > EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index) { - return m_expression.const_cast_derived().coeffRef(index); + return m_expression.coeffRef(index); } EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const { - return m_expression.const_cast_derived().coeffRef(index); + return m_expression.coeffRef(index); } template<int LoadMode> @@ -259,7 
+259,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> > template<int LoadMode> inline void writePacket(Index rowId, Index colId, const PacketScalar& val) { - m_expression.const_cast_derived().template writePacket<LoadMode>(rowId, colId, val); + m_expression.template writePacket<LoadMode>(rowId, colId, val); } template<int LoadMode> @@ -271,7 +271,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> > template<int LoadMode> inline void writePacket(Index index, const PacketScalar& val) { - m_expression.const_cast_derived().template writePacket<LoadMode>(index, val); + m_expression.template writePacket<LoadMode>(index, val); } EIGEN_DEVICE_FUNC @@ -284,11 +284,11 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> > /** Forwards the resizing request to the nested expression * \sa DenseBase::resize(Index) */ EIGEN_DEVICE_FUNC - void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); } + void resize(Index newSize) { m_expression.resize(newSize); } /** Forwards the resizing request to the nested expression * \sa DenseBase::resize(Index,Index)*/ EIGEN_DEVICE_FUNC - void resize(Index rows, Index cols) { m_expression.const_cast_derived().resize(rows,cols); } + void resize(Index rows, Index cols) { m_expression.resize(rows,cols); } protected: NestedExpressionType m_expression; diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index db3bef38d..3de8aa9a2 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -606,7 +606,7 @@ public: assignPacket<StoreMode,LoadMode,PacketType>(row, col); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index rowIndexByOuterInner(Index outer, Index inner) + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) { typedef typename DstEvaluatorType::ExpressionTraits Traits; return int(Traits::RowsAtCompileTime) == 1 ? 
0 @@ -615,7 +615,7 @@ public: : inner; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index colIndexByOuterInner(Index outer, Index inner) + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner) { typedef typename DstEvaluatorType::ExpressionTraits Traits; return int(Traits::ColsAtCompileTime) == 1 ? 0 @@ -637,7 +637,7 @@ protected: ***************************************************************************/ template<typename DstXprType, typename SrcXprType, typename Functor> -EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -654,7 +654,7 @@ EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const S } template<typename DstXprType, typename SrcXprType> -EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src) { call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar>()); } @@ -682,47 +682,53 @@ template< typename DstXprType, typename SrcXprType, typename Functor, struct Assignment; -// The only purpose of this call_assignment() function is to deal with noalias() / AssumeAliasing and automatic transposition. -// Indeed, I (Gael) think that this concept of AssumeAliasing was a mistake, and it makes thing quite complicated. -// So this intermediate function removes everything related to AssumeAliasing such that Assignment +// The only purpose of this call_assignment() function is to deal with noalias() / "assume-aliasing" and automatic transposition. 
+// Indeed, I (Gael) think that this concept of "assume-aliasing" was a mistake, and it makes thing quite complicated. +// So this intermediate function removes everything related to "assume-aliasing" such that Assignment // does not has to bother about these annoying details. template<typename Dst, typename Src> -EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(Dst& dst, const Src& src) { call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>()); } template<typename Dst, typename Src> -EIGEN_DEVICE_FUNC void call_assignment(const Dst& dst, const Src& src) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(const Dst& dst, const Src& src) { call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>()); } -// Deal with AssumeAliasing +// Deal with "assume-aliasing" template<typename Dst, typename Src, typename Func> -EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<evaluator_traits<Src>::AssumeAliasing==1, void*>::type = 0) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing<Src>::value, void*>::type = 0) { typename plain_matrix_type<Src>::type tmp(src); call_assignment_no_alias(dst, tmp, func); } template<typename Dst, typename Src, typename Func> -EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<evaluator_traits<Src>::AssumeAliasing==0, void*>::type = 0) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<!evaluator_assume_aliasing<Src>::value, void*>::type = 0) { call_assignment_no_alias(dst, src, func); } -// by-pass AssumeAliasing +// by-pass "assume-aliasing" // When there is no aliasing, we require that 'dst' has been properly resized template<typename Dst, template <typename> 
class StorageBase, typename Src, typename Func> -EIGEN_DEVICE_FUNC void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func) { call_assignment_no_alias(dst.expression(), src, func); } template<typename Dst, typename Src, typename Func> -EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func) { enum { NeedToTranspose = ( (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1) @@ -747,13 +753,15 @@ EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src, const Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func); } template<typename Dst, typename Src> -EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment_no_alias(Dst& dst, const Src& src) { call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar>()); } template<typename Dst, typename Src, typename Func> -EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func) { Index dstRows = src.rows(); Index dstCols = src.cols(); @@ -767,7 +775,8 @@ EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src Assignment<Dst,Src,Func>::run(dst, src, func); } template<typename Dst, typename Src> -EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src) { call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar>()); } @@ -779,7 
+788,8 @@ template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, con template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar> struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar> { - EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -796,7 +806,8 @@ struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar> template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar> struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Scalar> { - EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/) + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); src.evalTo(dst); diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h index 897187a30..897187a30 100755..100644 --- a/Eigen/src/Core/Assign_MKL.h +++ b/Eigen/src/Core/Assign_MKL.h diff --git a/Eigen/src/Core/BandMatrix.h b/Eigen/src/Core/BandMatrix.h index 87c124fdf..4978c9140 100644 --- a/Eigen/src/Core/BandMatrix.h +++ b/Eigen/src/Core/BandMatrix.h @@ -161,15 +161,15 @@ class BandMatrixBase : public EigenBase<Derived> * * \brief Represents a rectangular matrix with a banded storage * - * \param _Scalar Numeric type, i.e. 
float, double, int - * \param Rows Number of rows, or \b Dynamic - * \param Cols Number of columns, or \b Dynamic - * \param Supers Number of super diagonal - * \param Subs Number of sub diagonal - * \param _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint - * The former controls \ref TopicStorageOrders "storage order", and defaults to - * column-major. The latter controls whether the matrix represents a selfadjoint - * matrix in which case either Supers of Subs have to be null. + * \tparam _Scalar Numeric type, i.e. float, double, int + * \tparam _Rows Number of rows, or \b Dynamic + * \tparam _Cols Number of columns, or \b Dynamic + * \tparam _Supers Number of super diagonal + * \tparam _Subs Number of sub diagonal + * \tparam _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint + * The former controls \ref TopicStorageOrders "storage order", and defaults to + * column-major. The latter controls whether the matrix represents a selfadjoint + * matrix in which case either Supers of Subs have to be null. * * \sa class TridiagonalMatrix */ @@ -302,9 +302,9 @@ class BandMatrixWrapper : public BandMatrixBase<BandMatrixWrapper<_CoefficientsT * * \brief Represents a tridiagonal matrix with a compact banded storage * - * \param _Scalar Numeric type, i.e. float, double, int - * \param Size Number of rows and cols, or \b Dynamic - * \param _Options Can be 0 or \b SelfAdjoint + * \tparam Scalar Numeric type, i.e. 
float, double, int + * \tparam Size Number of rows and cols, or \b Dynamic + * \tparam Options Can be 0 or \b SelfAdjoint * * \sa class BandMatrix */ diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 3748e259b..11de45c2e 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -13,41 +13,6 @@ namespace Eigen { -/** \class Block - * \ingroup Core_Module - * - * \brief Expression of a fixed-size or dynamic-size block - * - * \param XprType the type of the expression in which we are taking a block - * \param BlockRows the number of rows of the block we are taking at compile time (optional) - * \param BlockCols the number of columns of the block we are taking at compile time (optional) - * \param InnerPanel is true, if the block maps to a set of rows of a row major matrix or - * to set of columns of a column major matrix (optional). The parameter allows to determine - * at compile time whether aligned access is possible on the block expression. - * - * This class represents an expression of either a fixed-size or dynamic-size block. It is the return - * type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and - * most of the time this is the only way it is used. - * - * However, if you want to directly maniputate block expressions, - * for instance if you want to write a function returning such an expression, you - * will need to use this class. - * - * Here is an example illustrating the dynamic case: - * \include class_Block.cpp - * Output: \verbinclude class_Block.out - * - * \note Even though this expression has dynamic size, in the case where \a XprType - * has fixed size, this expression inherits a fixed maximal size which means that evaluating - * it does not cause a dynamic memory allocation. 
- * - * Here is an example illustrating the fixed-size case: - * \include class_FixedBlock.cpp - * Output: \verbinclude class_FixedBlock.out - * - * \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock - */ - namespace internal { template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprType> @@ -101,6 +66,40 @@ template<typename XprType, int BlockRows=Dynamic, int BlockCols=Dynamic, bool In template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, typename StorageKind> class BlockImpl; +/** \class Block + * \ingroup Core_Module + * + * \brief Expression of a fixed-size or dynamic-size block + * + * \tparam XprType the type of the expression in which we are taking a block + * \tparam BlockRows the number of rows of the block we are taking at compile time (optional) + * \tparam BlockCols the number of columns of the block we are taking at compile time (optional) + * \tparam InnerPanel is true, if the block maps to a set of rows of a row major matrix or + * to set of columns of a column major matrix (optional). The parameter allows to determine + * at compile time whether aligned access is possible on the block expression. + * + * This class represents an expression of either a fixed-size or dynamic-size block. It is the return + * type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and + * most of the time this is the only way it is used. + * + * However, if you want to directly maniputate block expressions, + * for instance if you want to write a function returning such an expression, you + * will need to use this class. 
+ * + * Here is an example illustrating the dynamic case: + * \include class_Block.cpp + * Output: \verbinclude class_Block.out + * + * \note Even though this expression has dynamic size, in the case where \a XprType + * has fixed size, this expression inherits a fixed maximal size which means that evaluating + * it does not cause a dynamic memory allocation. + * + * Here is an example illustrating the fixed-size case: + * \include class_FixedBlock.cpp + * Output: \verbinclude class_FixedBlock.out + * + * \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock + */ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class Block : public BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, typename internal::traits<XprType>::StorageKind> { @@ -130,8 +129,8 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class : Impl(xpr, startRow, startCol) { EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE) - eigen_assert(startRow >= 0 && BlockRows >= 1 && startRow + BlockRows <= xpr.rows() - && startCol >= 0 && BlockCols >= 1 && startCol + BlockCols <= xpr.cols()); + eigen_assert(startRow >= 0 && BlockRows >= 0 && startRow + BlockRows <= xpr.rows() + && startCol >= 0 && BlockCols >= 0 && startCol + BlockCols <= xpr.cols()); } /** Dynamic-size constructor @@ -174,6 +173,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H : public internal::dense_xpr_base<Block<XprType, BlockRows, BlockCols, InnerPanel> >::type { typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType; + typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested; public: typedef typename internal::dense_xpr_base<BlockType>::type Base; @@ -222,15 +222,13 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H inline Scalar& coeffRef(Index rowId, Index colId) { 
EIGEN_STATIC_ASSERT_LVALUE(XprType) - return m_xpr.const_cast_derived() - .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value()); + return m_xpr.coeffRef(rowId + m_startRow.value(), colId + m_startCol.value()); } EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index rowId, Index colId) const { - return m_xpr.derived() - .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value()); + return m_xpr.derived().coeffRef(rowId + m_startRow.value(), colId + m_startCol.value()); } EIGEN_DEVICE_FUNC @@ -243,39 +241,34 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H inline Scalar& coeffRef(Index index) { EIGEN_STATIC_ASSERT_LVALUE(XprType) - return m_xpr.const_cast_derived() - .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), - m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); + return m_xpr.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), + m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); } EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const { - return m_xpr.const_cast_derived() - .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), - m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); + return m_xpr.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), + m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); } EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const { - return m_xpr - .coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), - m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); + return m_xpr.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), + m_startCol.value() + (RowsAtCompileTime == 1 ? 
index : 0)); } template<int LoadMode> inline PacketScalar packet(Index rowId, Index colId) const { - return m_xpr.template packet<Unaligned> - (rowId + m_startRow.value(), colId + m_startCol.value()); + return m_xpr.template packet<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value()); } template<int LoadMode> inline void writePacket(Index rowId, Index colId, const PacketScalar& val) { - m_xpr.const_cast_derived().template writePacket<Unaligned> - (rowId + m_startRow.value(), colId + m_startCol.value(), val); + m_xpr.template writePacket<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value(), val); } template<int LoadMode> @@ -289,7 +282,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H template<int LoadMode> inline void writePacket(Index index, const PacketScalar& val) { - m_xpr.const_cast_derived().template writePacket<Unaligned> + m_xpr.template writePacket<Unaligned> (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0), val); } @@ -302,10 +295,13 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H #endif EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const + const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + XprType& nestedExpression() { return m_xpr; } EIGEN_DEVICE_FUNC StorageIndex startRow() const @@ -321,9 +317,9 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H protected: - const typename XprType::Nested m_xpr; - const internal::variable_if_dynamic<StorageIndex, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow; - const internal::variable_if_dynamic<StorageIndex, XprType::ColsAtCompileTime == 1 ? 
0 : Dynamic> m_startCol; + XprTypeNested m_xpr; + const internal::variable_if_dynamic<StorageIndex, (XprType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow; + const internal::variable_if_dynamic<StorageIndex, (XprType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol; const internal::variable_if_dynamic<StorageIndex, RowsAtCompileTime> m_blockRows; const internal::variable_if_dynamic<StorageIndex, ColsAtCompileTime> m_blockCols; }; @@ -334,6 +330,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true> : public MapBase<Block<XprType, BlockRows, BlockCols, InnerPanel> > { typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType; + typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested; enum { XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0 }; @@ -351,7 +348,9 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true> || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()), BlockRows==1 ? 1 : xpr.rows(), BlockCols==1 ? 1 : xpr.cols()), - m_xpr(xpr) + m_xpr(xpr), + m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0), + m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? 
i : 0) { init(); } @@ -361,7 +360,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true> EIGEN_DEVICE_FUNC inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol) : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)), - m_xpr(xpr) + m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) { init(); } @@ -373,16 +372,19 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true> Index startRow, Index startCol, Index blockRows, Index blockCols) : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols), - m_xpr(xpr) + m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) { init(); } EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const + const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + XprType& nestedExpression() { return m_xpr; } /** \sa MapBase::innerStride() */ EIGEN_DEVICE_FUNC @@ -400,6 +402,18 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true> return m_outerStride; } + EIGEN_DEVICE_FUNC + StorageIndex startRow() const + { + return m_startRow.value(); + } + + EIGEN_DEVICE_FUNC + StorageIndex startCol() const + { + return m_startCol.value(); + } + #ifndef __SUNPRO_CC // FIXME sunstudio is not friendly with the above friend... // META-FIXME there is no 'friend' keyword around here. Is this obsolete? @@ -425,7 +439,9 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true> : m_xpr.innerStride(); } - typename XprType::Nested m_xpr; + XprTypeNested m_xpr; + const internal::variable_if_dynamic<StorageIndex, (XprType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow; + const internal::variable_if_dynamic<StorageIndex, (XprType::ColsAtCompileTime == 1 && BlockCols==1) ? 
0 : Dynamic> m_startCol; Index m_outerStride; }; diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h index 89bcd750c..2abc6605c 100644 --- a/Eigen/src/Core/CommaInitializer.h +++ b/Eigen/src/Core/CommaInitializer.h @@ -22,7 +22,7 @@ namespace Eigen { * the return type of MatrixBase::operator<<, and most of the time this is the only * way it is used. * - * \sa \ref MatrixBaseCommaInitRef "MatrixBase::operator<<", CommaInitializer::finished() + * \sa \blank \ref MatrixBaseCommaInitRef "MatrixBase::operator<<", CommaInitializer::finished() */ template<typename XprType> struct CommaInitializer diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 42ad452f7..388805f0d 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -63,10 +63,6 @@ struct evaluator_traits_base // by default, get evaluator kind and shape from storage typedef typename storage_kind_to_evaluator_kind<typename traits<T>::StorageKind>::Kind Kind; typedef typename storage_kind_to_shape<typename traits<T>::StorageKind>::Shape Shape; - - // 1 if assignment A = B assumes aliasing when B is of type T and thus B needs to be evaluated into a - // temporary; 0 if not. 
- static const int AssumeAliasing = 0; }; // Default evaluator traits @@ -75,6 +71,10 @@ struct evaluator_traits : public evaluator_traits_base<T> { }; +template<typename T, typename Shape = typename evaluator_traits<T>::Shape > +struct evaluator_assume_aliasing { + static const bool value = false; +}; // By default, we assume a unary expression: template<typename T> @@ -148,7 +148,8 @@ struct evaluator<PlainObjectBase<Derived> > EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { if (IsRowMajor) return m_data[row * m_outerStride.value() + col]; @@ -156,12 +157,14 @@ struct evaluator<PlainObjectBase<Derived> > return m_data[row + col * m_outerStride.value()]; } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_data[index]; } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { if (IsRowMajor) return const_cast<Scalar*>(m_data)[row * m_outerStride.value() + col]; @@ -169,12 +172,14 @@ struct evaluator<PlainObjectBase<Derived> > return const_cast<Scalar*>(m_data)[row + col * m_outerStride.value()]; } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return const_cast<Scalar*>(m_data)[index]; } template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { if (IsRowMajor) @@ -184,12 +189,14 @@ struct evaluator<PlainObjectBase<Derived> > } template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return ploadt<PacketType, LoadMode>(m_data + index); } template<int StoreMode,typename PacketType> + EIGEN_STRONG_INLINE void writePacket(Index row, Index 
col, const PacketType& x) { if (IsRowMajor) @@ -201,6 +208,7 @@ struct evaluator<PlainObjectBase<Derived> > } template<int StoreMode, typename PacketType> + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_data) + index, x); @@ -260,45 +268,53 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_argImpl.coeff(col, row); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_argImpl.coeff(index); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { return m_argImpl.coeffRef(col, row); } - EIGEN_DEVICE_FUNC typename XprType::Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename XprType::Scalar& coeffRef(Index index) { return m_argImpl.coeffRef(index); } template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return m_argImpl.template packet<LoadMode,PacketType>(col, row); } template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return m_argImpl.template packet<LoadMode,PacketType>(index); } - template<int StoreMode, typename PacketType> + template<int StoreMode, typename PacketType> + EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { m_argImpl.template writePacket<StoreMode,PacketType>(col, row, x); } - template<int StoreMode, typename PacketType> + template<int StoreMode, typename PacketType> + EIGEN_STRONG_INLINE void writePacket(Index index, const 
PacketType& x) { m_argImpl.template writePacket<StoreMode,PacketType>(index, x); @@ -338,23 +354,27 @@ struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> > typedef typename XprType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_functor(row, col); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_functor(index); } template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return m_functor.template packetOp<Index,PacketType>(row, col); } template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return m_functor.template packetOp<Index,PacketType>(index); @@ -380,7 +400,8 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased > Alignment = evaluator<ArgType>::Alignment }; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression()) { @@ -390,23 +411,27 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased > typedef typename XprType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_functor(m_argImpl.coeff(row, col)); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_functor(m_argImpl.coeff(index)); } template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return m_functor.packetOp(m_argImpl.template 
packet<LoadMode, PacketType>(row, col)); } template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return m_functor.packetOp(m_argImpl.template packet<LoadMode, PacketType>(index)); @@ -466,17 +491,20 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase typedef typename XprType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_functor(m_lhsImpl.coeff(row, col), m_rhsImpl.coeff(row, col)); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_functor(m_lhsImpl.coeff(index), m_rhsImpl.coeff(index)); } template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return m_functor.packetOp(m_lhsImpl.template packet<LoadMode,PacketType>(row, col), @@ -484,6 +512,7 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase } template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return m_functor.packetOp(m_lhsImpl.template packet<LoadMode,PacketType>(index), @@ -523,22 +552,26 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_unaryOp(m_argImpl.coeff(row, col)); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_unaryOp(m_argImpl.coeff(index)); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { return m_unaryOp(m_argImpl.coeffRef(row, col)); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return m_unaryOp(m_argImpl.coeffRef(index)); } @@ -578,47 +611,55 @@ struct mapbase_evaluator : evaluator_base<Derived> EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()]; } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_data[index * m_xpr.innerStride()]; } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()]; } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return m_data[index * m_xpr.innerStride()]; } - template<int LoadMode, typename PacketType> + template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride(); return internal::ploadt<PacketType, LoadMode>(ptr); } - template<int LoadMode, typename PacketType> + template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return internal::ploadt<PacketType, LoadMode>(m_data + index * m_xpr.innerStride()); } - template<int StoreMode, typename PacketType> + template<int StoreMode, typename PacketType> + EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { PointerType ptr = m_data + row * m_xpr.rowStride() + col * 
m_xpr.colStride(); return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x); } - template<int StoreMode, typename PacketType> + template<int StoreMode, typename PacketType> + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_xpr.innerStride(), x); @@ -767,46 +808,54 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa RowsAtCompileTime = XprType::RowsAtCompileTime }; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); } - template<int LoadMode, typename PacketType> + template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return m_argImpl.template packet<LoadMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col); } - template<int LoadMode, typename PacketType> + template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? 
index : 0); } - template<int StoreMode, typename PacketType> + template<int StoreMode, typename PacketType> + EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { return m_argImpl.template writePacket<StoreMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col, x); } - template<int StoreMode, typename PacketType> + template<int StoreMode, typename PacketType> + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index, @@ -816,8 +865,8 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa protected: evaluator<ArgType> m_argImpl; - const variable_if_dynamic<Index, ArgType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow; - const variable_if_dynamic<Index, ArgType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol; + const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow; + const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 
0 : Dynamic> m_startCol; }; // TODO: This evaluator does not actually use the child evaluator; @@ -859,7 +908,7 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> > Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ThenMatrixType>::Alignment, evaluator<ElseMatrixType>::Alignment) }; - inline EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select) + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select) : m_conditionImpl(select.conditionMatrix()), m_thenImpl(select.thenMatrix()), m_elseImpl(select.elseMatrix()) @@ -869,7 +918,8 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> > typedef typename XprType::CoeffReturnType CoeffReturnType; - inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { if (m_conditionImpl.coeff(row, col)) return m_thenImpl.coeff(row, col); @@ -877,7 +927,8 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> > return m_elseImpl.coeff(row, col); } - inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { if (m_conditionImpl.coeff(index)) return m_thenImpl.coeff(index); @@ -921,7 +972,8 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> > m_cols(replicate.nestedExpression().cols()) {} - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { // try to avoid using modulo; this is a pure optimization strategy const Index actual_row = internal::traits<XprType>::RowsAtCompileTime==1 ? 
0 @@ -934,7 +986,8 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> > return m_argImpl.coeff(actual_row, actual_col); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { // try to avoid using modulo; this is a pure optimization strategy const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1 @@ -945,6 +998,7 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> > } template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { const Index actual_row = internal::traits<XprType>::RowsAtCompileTime==1 ? 0 @@ -958,6 +1012,7 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> > } template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index index) const { const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1 @@ -994,7 +1049,7 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> > CoeffReadCost = TraversalSize==Dynamic ? 
HugeCost : TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value), - Flags = (traits<XprType>::Flags&RowMajorBit) | (evaluator<ArgType>::Flags&HereditaryBits), + Flags = (traits<XprType>::Flags&RowMajorBit) | (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit))) | LinearAccessBit, Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized }; @@ -1008,7 +1063,8 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> > typedef typename XprType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index i, Index j) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar coeff(Index i, Index j) const { if (Direction==Vertical) return m_functor(m_arg.col(j)); @@ -1016,7 +1072,8 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> > return m_functor(m_arg.row(i)); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar coeff(Index index) const { if (Direction==Vertical) return m_functor(m_arg.col(index)); @@ -1051,45 +1108,53 @@ struct evaluator_wrapper_base typedef typename ArgType::Scalar Scalar; typedef typename ArgType::CoeffReturnType CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_argImpl.coeff(row, col); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_argImpl.coeff(index); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { return m_argImpl.coeffRef(row, col); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return m_argImpl.coeffRef(index); } - 
template<int LoadMode, typename PacketType> + template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return m_argImpl.template packet<LoadMode,PacketType>(row, col); } - template<int LoadMode, typename PacketType> + template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index index) const { return m_argImpl.template packet<LoadMode,PacketType>(index); } - template<int StoreMode, typename PacketType> + template<int StoreMode, typename PacketType> + EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { m_argImpl.template writePacket<StoreMode>(row, col, x); } - template<int StoreMode, typename PacketType> + template<int StoreMode, typename PacketType> + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { m_argImpl.template writePacket<StoreMode>(index, x); @@ -1164,29 +1229,34 @@ struct unary_evaluator<Reverse<ArgType, Direction> > m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1) { } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const { return m_argImpl.coeff(ReverseRow ? m_rows.value() - row - 1 : row, ReverseCol ? m_cols.value() - col - 1 : col); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_argImpl.coeff(m_rows.value() * m_cols.value() - index - 1); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) { return m_argImpl.coeffRef(ReverseRow ? m_rows.value() - row - 1 : row, ReverseCol ? 
m_cols.value() - col - 1 : col); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return m_argImpl.coeffRef(m_rows.value() * m_cols.value() - index - 1); } template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { enum { @@ -1201,6 +1271,7 @@ struct unary_evaluator<Reverse<ArgType, Direction> > } template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE PacketType packet(Index index) const { enum { PacketSize = unpacket_traits<PacketType>::size }; @@ -1208,6 +1279,7 @@ struct unary_evaluator<Reverse<ArgType, Direction> > } template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) { // FIXME we could factorize some code with packet(i,j) @@ -1224,6 +1296,7 @@ struct unary_evaluator<Reverse<ArgType, Direction> > } template<int LoadMode, typename PacketType> + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { enum { PacketSize = unpacket_traits<PacketType>::size }; @@ -1267,22 +1340,26 @@ struct evaluator<Diagonal<ArgType, DiagIndex> > typedef typename internal::conditional<!internal::is_same<typename ArgType::StorageKind,Sparse>::value, typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType; - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index) const { return m_argImpl.coeff(row + rowOffset(), row + colOffset()); } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const { return m_argImpl.coeff(index + rowOffset(), index + colOffset()); } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index) { return m_argImpl.coeffRef(row + rowOffset(), row + colOffset()); } - 
EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) { return m_argImpl.coeffRef(index + rowOffset(), index + colOffset()); } diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h index e42c3031b..39820fd7d 100644 --- a/Eigen/src/Core/CwiseBinaryOp.h +++ b/Eigen/src/Core/CwiseBinaryOp.h @@ -13,26 +13,6 @@ namespace Eigen { -/** \class CwiseBinaryOp - * \ingroup Core_Module - * - * \brief Generic expression where a coefficient-wise binary operator is applied to two expressions - * - * \param BinaryOp template functor implementing the operator - * \param Lhs the type of the left-hand side - * \param Rhs the type of the right-hand side - * - * This class represents an expression where a coefficient-wise binary operator is applied to two expressions. - * It is the return type of binary operators, by which we mean only those binary operators where - * both the left-hand side and the right-hand side are Eigen expressions. - * For example, the return type of matrix1+matrix2 is a CwiseBinaryOp. - * - * Most of the time, this is the only way that it is used, so you typically don't have to name - * CwiseBinaryOp types explicitly. - * - * \sa MatrixBase::binaryExpr(const MatrixBase<OtherDerived> &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp - */ - namespace internal { template<typename BinaryOp, typename Lhs, typename Rhs> struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > @@ -52,8 +32,8 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > // we still want to handle the case when the result type is different. 
typedef typename result_of< BinaryOp( - typename Lhs::Scalar, - typename Rhs::Scalar + const typename Lhs::Scalar&, + const typename Rhs::Scalar& ) >::type Scalar; typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind, @@ -74,6 +54,25 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > template<typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind> class CwiseBinaryOpImpl; +/** \class CwiseBinaryOp + * \ingroup Core_Module + * + * \brief Generic expression where a coefficient-wise binary operator is applied to two expressions + * + * \tparam BinaryOp template functor implementing the operator + * \tparam LhsType the type of the left-hand side + * \tparam RhsType the type of the right-hand side + * + * This class represents an expression where a coefficient-wise binary operator is applied to two expressions. + * It is the return type of binary operators, by which we mean only those binary operators where + * both the left-hand side and the right-hand side are Eigen expressions. + * For example, the return type of matrix1+matrix2 is a CwiseBinaryOp. + * + * Most of the time, this is the only way that it is used, so you typically don't have to name + * CwiseBinaryOp types explicitly. 
+ * + * \sa MatrixBase::binaryExpr(const MatrixBase<OtherDerived> &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp + */ template<typename BinaryOp, typename LhsType, typename RhsType> class CwiseBinaryOp : public CwiseBinaryOpImpl< diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index 2bc6933d9..3c6508cd0 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h @@ -12,13 +12,23 @@ namespace Eigen { +namespace internal { +template<typename NullaryOp, typename PlainObjectType> +struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectType> +{ + enum { + Flags = traits<PlainObjectType>::Flags & RowMajorBit + }; +}; +} + /** \class CwiseNullaryOp * \ingroup Core_Module * * \brief Generic expression of a matrix where all coefficients are defined by a functor * - * \param NullaryOp template functor implementing the operator - * \param PlainObjectType the underlying plain matrix/array type + * \tparam NullaryOp template functor implementing the operator + * \tparam PlainObjectType the underlying plain matrix/array type * * This class represents an expression of a generic nullary operator. * It is the return type of the Ones(), Zero(), Constant(), Identity() and Random() methods, @@ -29,17 +39,6 @@ namespace Eigen { * * \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr() */ - -namespace internal { -template<typename NullaryOp, typename PlainObjectType> -struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectType> -{ - enum { - Flags = traits<PlainObjectType>::Flags & RowMajorBit - }; -}; -} - template<typename NullaryOp, typename PlainObjectType> class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp, PlainObjectType> >::type, internal::no_assignment_operator { @@ -224,7 +223,7 @@ DenseBase<Derived>::Constant(const Scalar& value) } /** - * \brief Sets a linearly space vector. 
+ * \brief Sets a linearly spaced vector. * * The function generates 'size' equally spaced values in the closed interval [low,high]. * This particular version of LinSpaced() uses sequential access, i.e. vector access is @@ -262,7 +261,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig } /** - * \brief Sets a linearly space vector. + * \brief Sets a linearly spaced vector. * * The function generates 'size' equally spaced values in the closed interval [low,high]. * When size is set to 1, a vector of length 1 containing 'high' is returned. @@ -328,7 +327,7 @@ EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val) setConstant(val); } -/** Sets all coefficients in this expression to \a value. +/** Sets all coefficients in this expression to value \a val. * * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes() */ @@ -338,7 +337,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val) return derived() = Constant(rows(), cols(), val); } -/** Resizes to the given \a size, and sets all coefficients in this expression to the given \a value. +/** Resizes to the given \a size, and sets all coefficients in this expression to the given value \a val. * * \only_for_vectors * @@ -355,7 +354,7 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val) return setConstant(val); } -/** Resizes to the given size, and sets all coefficients in this expression to the given \a value. +/** Resizes to the given size, and sets all coefficients in this expression to the given value \a val. * * \param rows the new number of rows * \param cols the new number of columns @@ -375,7 +374,7 @@ PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val) } /** - * \brief Sets a linearly space vector. + * \brief Sets a linearly spaced vector. 
* * The function generates 'size' equally spaced values in the closed interval [low,high]. * When size is set to 1, a vector of length 1 containing 'high' is returned. @@ -395,7 +394,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, con } /** - * \brief Sets a linearly space vector. + * \brief Sets a linearly spaced vector. * * The function fill *this with equally spaced values in the closed interval [low,high]. * When size is set to 1, a vector of length 1 containing 'high' is returned. diff --git a/Eigen/src/Core/CwiseUnaryOp.h b/Eigen/src/Core/CwiseUnaryOp.h index da1d1992d..1d2dd19f2 100644 --- a/Eigen/src/Core/CwiseUnaryOp.h +++ b/Eigen/src/Core/CwiseUnaryOp.h @@ -13,33 +13,13 @@ namespace Eigen { -/** \class CwiseUnaryOp - * \ingroup Core_Module - * - * \brief Generic expression where a coefficient-wise unary operator is applied to an expression - * - * \param UnaryOp template functor implementing the operator - * \param XprType the type of the expression to which we are applying the unary operator - * - * This class represents an expression where a unary operator is applied to an expression. - * It is the return type of all operations taking exactly 1 input expression, regardless of the - * presence of other inputs such as scalars. For example, the operator* in the expression 3*matrix - * is considered unary, because only the right-hand side is an expression, and its - * return type is a specialization of CwiseUnaryOp. - * - * Most of the time, this is the only way that it is used, so you typically don't have to name - * CwiseUnaryOp types explicitly. 
- * - * \sa MatrixBase::unaryExpr(const CustomUnaryOp &) const, class CwiseBinaryOp, class CwiseNullaryOp - */ - namespace internal { template<typename UnaryOp, typename XprType> struct traits<CwiseUnaryOp<UnaryOp, XprType> > : traits<XprType> { typedef typename result_of< - UnaryOp(typename XprType::Scalar) + UnaryOp(const typename XprType::Scalar&) >::type Scalar; typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference<XprTypeNested>::type _XprTypeNested; @@ -52,6 +32,25 @@ struct traits<CwiseUnaryOp<UnaryOp, XprType> > template<typename UnaryOp, typename XprType, typename StorageKind> class CwiseUnaryOpImpl; +/** \class CwiseUnaryOp + * \ingroup Core_Module + * + * \brief Generic expression where a coefficient-wise unary operator is applied to an expression + * + * \tparam UnaryOp template functor implementing the operator + * \tparam XprType the type of the expression to which we are applying the unary operator + * + * This class represents an expression where a unary operator is applied to an expression. + * It is the return type of all operations taking exactly 1 input expression, regardless of the + * presence of other inputs such as scalars. For example, the operator* in the expression 3*matrix + * is considered unary, because only the right-hand side is an expression, and its + * return type is a specialization of CwiseUnaryOp. + * + * Most of the time, this is the only way that it is used, so you typically don't have to name + * CwiseUnaryOp types explicitly. 
+ * + * \sa MatrixBase::unaryExpr(const CustomUnaryOp &) const, class CwiseBinaryOp, class CwiseNullaryOp + */ template<typename UnaryOp, typename XprType> class CwiseUnaryOp : public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal::traits<XprType>::StorageKind>, internal::no_assignment_operator { @@ -59,33 +58,34 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal typedef typename CwiseUnaryOpImpl<UnaryOp, XprType,typename internal::traits<XprType>::StorageKind>::Base Base; EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryOp) + typedef typename internal::ref_selector<XprType>::type XprTypeNested; typedef typename internal::remove_all<XprType>::type NestedExpression; - EIGEN_DEVICE_FUNC - explicit inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) : m_xpr(xpr), m_functor(func) {} - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index rows() const { return m_xpr.rows(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index cols() const { return m_xpr.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index rows() const { return m_xpr.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index cols() const { return m_xpr.cols(); } /** \returns the functor representing the unary operation */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryOp& functor() const { return m_functor; } /** \returns the nested expression */ - EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename XprType::Nested>::type& + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const { return m_xpr; } /** \returns the nested expression */ - EIGEN_DEVICE_FUNC - typename internal::remove_all<typename XprType::Nested>::type& - nestedExpression() { return m_xpr.const_cast_derived(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename 
internal::remove_all<XprTypeNested>::type& + nestedExpression() { return m_xpr; } protected: - typename XprType::Nested m_xpr; + XprTypeNested m_xpr; const UnaryOp m_functor; }; diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h index 72244751e..271033056 100644 --- a/Eigen/src/Core/CwiseUnaryView.h +++ b/Eigen/src/Core/CwiseUnaryView.h @@ -12,27 +12,13 @@ namespace Eigen { -/** \class CwiseUnaryView - * \ingroup Core_Module - * - * \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector - * - * \param ViewOp template functor implementing the view - * \param MatrixType the type of the matrix we are applying the unary operator - * - * This class represents a lvalue expression of a generic unary view operator of a matrix or a vector. - * It is the return type of real() and imag(), and most of the time this is the only way it is used. - * - * \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp - */ - namespace internal { template<typename ViewOp, typename MatrixType> struct traits<CwiseUnaryView<ViewOp, MatrixType> > : traits<MatrixType> { typedef typename result_of< - ViewOp(typename traits<MatrixType>::Scalar) + ViewOp(const typename traits<MatrixType>::Scalar&) >::type Scalar; typedef typename MatrixType::Nested MatrixTypeNested; typedef typename remove_all<MatrixTypeNested>::type _MatrixTypeNested; @@ -55,6 +41,19 @@ struct traits<CwiseUnaryView<ViewOp, MatrixType> > template<typename ViewOp, typename MatrixType, typename StorageKind> class CwiseUnaryViewImpl; +/** \class CwiseUnaryView + * \ingroup Core_Module + * + * \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector + * + * \tparam ViewOp template functor implementing the view + * \tparam MatrixType the type of the matrix we are applying the unary operator + * + * This class represents a lvalue expression of a generic unary view operator of a matrix or a vector. 
+ * It is the return type of real() and imag(), and most of the time this is the only way it is used. + * + * \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp + */ template<typename ViewOp, typename MatrixType> class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename internal::traits<MatrixType>::StorageKind> { @@ -62,6 +61,7 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in typedef typename CwiseUnaryViewImpl<ViewOp, MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base; EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryView) + typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested; typedef typename internal::remove_all<MatrixType>::type NestedExpression; explicit inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp()) @@ -76,15 +76,15 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in const ViewOp& functor() const { return m_functor; } /** \returns the nested expression */ - const typename internal::remove_all<typename MatrixType::Nested>::type& + const typename internal::remove_all<MatrixTypeNested>::type& nestedExpression() const { return m_matrix; } /** \returns the nested expression */ - typename internal::remove_all<typename MatrixType::Nested>::type& + typename internal::remove_reference<MatrixTypeNested>::type& nestedExpression() { return m_matrix.const_cast_derived(); } protected: - typename internal::ref_selector<MatrixType>::type m_matrix; + MatrixTypeNested m_matrix; ViewOp m_functor; }; diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index e181dafaf..5a38e5f22 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -36,7 +36,7 @@ static inline void check_DenseIndex_is_signed() { * This class can be extended with the help of the plugin mechanism described on the page * \ref TopicCustomizingEigen by defining the preprocessor symbol \c 
EIGEN_DENSEBASE_PLUGIN. * - * \sa \ref TopicClassHierarchy + * \sa \blank \ref TopicClassHierarchy */ template<typename Derived> class DenseBase #ifndef EIGEN_PARSED_BY_DOXYGEN @@ -60,7 +60,7 @@ template<typename Derived> class DenseBase * \brief The type used to store indices * \details This typedef is relevant for types that store multiple indices such as * PermutationMatrix or Transpositions, otherwise it defaults to Eigen::Index - * \sa \ref TopicPreprocessorDirectives, Eigen::Index, SparseMatrixBase. + * \sa \blank \ref TopicPreprocessorDirectives, Eigen::Index, SparseMatrixBase. */ typedef typename internal::traits<Derived>::StorageIndex StorageIndex; @@ -275,13 +275,13 @@ template<typename Derived> class DenseBase /** Copies \a other into *this. \returns a reference to *this. */ template<typename OtherDerived> - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other); /** Special case of the template operator=, in order to prevent the compiler * from generating a default operator= (issue hit with g++ 4.1) */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase& other); template<typename OtherDerived> @@ -388,10 +388,10 @@ template<typename Derived> class DenseBase inline bool hasNaN() const; inline bool allFinite() const; - EIGEN_DEVICE_FUNC - inline Derived& operator*=(const Scalar& other); - EIGEN_DEVICE_FUNC - inline Derived& operator/=(const Scalar& other); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator*=(const Scalar& other); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator/=(const Scalar& other); typedef typename internal::add_const_on_value_type<typename internal::eval<Derived>::type>::type EvalReturnType; /** \returns the matrix or vector obtained by evaluating this expression. 
diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h index 820a90e6f..423ab167d 100644 --- a/Eigen/src/Core/DenseCoeffsBase.h +++ b/Eigen/src/Core/DenseCoeffsBase.h @@ -191,19 +191,31 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType - y() const { return (*this)[1]; } + y() const + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=2, OUT_OF_RANGE_ACCESS); + return (*this)[1]; + } /** equivalent to operator[](2). */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType - z() const { return (*this)[2]; } + z() const + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS); + return (*this)[2]; + } /** equivalent to operator[](3). */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType - w() const { return (*this)[3]; } + w() const + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS); + return (*this)[3]; + } /** \internal * \returns the packet of coefficients starting at the given row and column. It is your responsibility @@ -424,19 +436,31 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& - y() { return (*this)[1]; } + y() + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=2, OUT_OF_RANGE_ACCESS); + return (*this)[1]; + } /** equivalent to operator[](2). */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& - z() { return (*this)[2]; } + z() + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS); + return (*this)[2]; + } /** equivalent to operator[](3). 
*/ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& - w() { return (*this)[3]; } + w() + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS); + return (*this)[3]; + } }; /** \brief Base class providing direct read-only coefficient access to matrices and arrays. @@ -448,7 +472,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived, * inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which defines functions to access entries read-only using * \c operator() . * - * \sa \ref TopicClassHierarchy + * \sa \blank \ref TopicClassHierarchy */ template<typename Derived> class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived, ReadOnlyAccessors> @@ -521,7 +545,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived * inherits DenseCoeffsBase<Derived, WriteAccessors> which defines functions to access entries read/write using * \c operator(). * - * \sa \ref TopicClassHierarchy + * \sa \blank \ref TopicClassHierarchy */ template<typename Derived> class DenseCoeffsBase<Derived, DirectWriteAccessors> diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h index fa3176266..bfea0584b 100644 --- a/Eigen/src/Core/Diagonal.h +++ b/Eigen/src/Core/Diagonal.h @@ -103,21 +103,21 @@ template<typename MatrixType, int _DiagIndex> class Diagonal >::type ScalarWithConstIfNotLvalue; EIGEN_DEVICE_FUNC - inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); } + inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.coeffRef(rowOffset(), colOffset())); } EIGEN_DEVICE_FUNC - inline const Scalar* data() const { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); } + inline const Scalar* data() const { return &(m_matrix.coeffRef(rowOffset(), colOffset())); } EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index) { 
EIGEN_STATIC_ASSERT_LVALUE(MatrixType) - return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset()); + return m_matrix.coeffRef(row+rowOffset(), row+colOffset()); } EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index row, Index) const { - return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset()); + return m_matrix.coeffRef(row+rowOffset(), row+colOffset()); } EIGEN_DEVICE_FUNC @@ -130,13 +130,13 @@ template<typename MatrixType, int _DiagIndex> class Diagonal inline Scalar& coeffRef(Index idx) { EIGEN_STATIC_ASSERT_LVALUE(MatrixType) - return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset()); + return m_matrix.coeffRef(idx+rowOffset(), idx+colOffset()); } EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index idx) const { - return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset()); + return m_matrix.coeffRef(idx+rowOffset(), idx+colOffset()); } EIGEN_DEVICE_FUNC @@ -159,7 +159,7 @@ template<typename MatrixType, int _DiagIndex> class Diagonal } protected: - typename MatrixType::Nested m_matrix; + typename internal::ref_selector<MatrixType>::non_const_type m_matrix; const internal::variable_if_dynamicindex<Index, DiagIndex> m_index; private: diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index 003450f1a..82d58fc0b 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -82,7 +82,7 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const * In both cases, it consists in the sum of the square of all the matrix entries. * For vectors, this is also equals to the dot product of \c *this with itself. 
* - * \sa dot(), norm() + * \sa dot(), norm(), lpNorm() */ template<typename Derived> EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const @@ -94,16 +94,18 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala * In both cases, it consists in the square root of the sum of the square of all the matrix entries. * For vectors, this is also equals to the square root of the dot product of \c *this with itself. * - * \sa dot(), squaredNorm() + * \sa lpNorm(), dot(), squaredNorm() */ template<typename Derived> inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const { - EIGEN_USING_STD_MATH(sqrt) - return sqrt(squaredNorm()); + return numext::sqrt(squaredNorm()); } -/** \returns an expression of the quotient of *this by its own norm. +/** \returns an expression of the quotient of \c *this by its own norm. + * + * \warning If the input vector is too small (i.e., this->norm()==0), + * then this function returns a copy of the input. * * \only_for_vectors * @@ -115,19 +117,75 @@ MatrixBase<Derived>::normalized() const { typedef typename internal::nested_eval<Derived,2>::type _Nested; _Nested n(derived()); - return n / n.norm(); + RealScalar z = n.squaredNorm(); + // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU + if(z>RealScalar(0)) + return n / numext::sqrt(z); + else + return n; } /** Normalizes the vector, i.e. divides it by its own norm. * * \only_for_vectors * + * \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged. 
+ * * \sa norm(), normalized() */ template<typename Derived> inline void MatrixBase<Derived>::normalize() { - *this /= norm(); + RealScalar z = squaredNorm(); + // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU + if(z>RealScalar(0)) + derived() /= numext::sqrt(z); +} + +/** \returns an expression of the quotient of \c *this by its own norm while avoiding underflow and overflow. + * + * \only_for_vectors + * + * This method is analogue to the normalized() method, but it reduces the risk of + * underflow and overflow when computing the norm. + * + * \warning If the input vector is too small (i.e., this->norm()==0), + * then this function returns a copy of the input. + * + * \sa stableNorm(), stableNormalize(), normalized() + */ +template<typename Derived> +inline const typename MatrixBase<Derived>::PlainObject +MatrixBase<Derived>::stableNormalized() const +{ + typedef typename internal::nested_eval<Derived,3>::type _Nested; + _Nested n(derived()); + RealScalar w = n.cwiseAbs().maxCoeff(); + RealScalar z = (n/w).squaredNorm(); + if(z>RealScalar(0)) + return n / (numext::sqrt(z)*w); + else + return n; +} + +/** Normalizes the vector while avoid underflow and overflow + * + * \only_for_vectors + * + * This method is analogue to the normalize() method, but it reduces the risk of + * underflow and overflow when computing the norm. + * + * \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged. 
+ * + * \sa stableNorm(), stableNormalized(), normalize() + */ +template<typename Derived> +inline void MatrixBase<Derived>::stableNormalize() +{ + RealScalar w = cwiseAbs().maxCoeff(); + RealScalar z = (derived()/w).squaredNorm(); + if(z>RealScalar(0)) + derived() /= numext::sqrt(z)*w; } //---------- implementation of other norms ---------- @@ -188,7 +246,11 @@ struct lpNorm_selector<Derived, Infinity> */ template<typename Derived> template<int p> +#ifndef EIGEN_PARSED_BY_DOXYGEN inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real +#else +MatrixBase<Derived>::RealScalar +#endif MatrixBase<Derived>::lpNorm() const { return internal::lpNorm_selector<Derived, p>::run(*this); diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h index 79dabda37..ba8e09674 100644 --- a/Eigen/src/Core/EigenBase.h +++ b/Eigen/src/Core/EigenBase.h @@ -23,7 +23,7 @@ namespace Eigen { * * Notice that this class is trivial, it is only used to disambiguate overloaded functions. * - * \sa \ref TopicClassHierarchy + * \sa \blank \ref TopicClassHierarchy */ template<typename Derived> struct EigenBase { diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index fe8204ac3..53f934999 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -76,32 +76,6 @@ public: #endif }; -// template<typename Lhs, typename Rhs> struct product_tag -// { -// private: -// -// typedef typename remove_all<Lhs>::type _Lhs; -// typedef typename remove_all<Rhs>::type _Rhs; -// enum { -// Rows = _Lhs::RowsAtCompileTime, -// Cols = _Rhs::ColsAtCompileTime, -// Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime, _Rhs::RowsAtCompileTime) -// }; -// -// enum { -// rows_select = Rows==1 ? int(Rows) : int(Large), -// cols_select = Cols==1 ? int(Cols) : int(Large), -// depth_select = Depth==1 ? 
int(Depth) : int(Large) -// }; -// typedef product_type_selector<rows_select, cols_select, depth_select> selector; -// -// public: -// enum { -// ret = selector::ret -// }; -// -// }; - /* The following allows to select the kind of product at compile time * based on the three dimensions of the product. * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */ @@ -125,8 +99,8 @@ template<> struct product_type_selector<Small,Small,Large> { enum template<> struct product_type_selector<Large,Small,Large> { enum { ret = GemmProduct }; }; template<> struct product_type_selector<Small,Large,Large> { enum { ret = GemmProduct }; }; template<> struct product_type_selector<Large,Large,Large> { enum { ret = GemmProduct }; }; -template<> struct product_type_selector<Large,Small,Small> { enum { ret = GemmProduct }; }; -template<> struct product_type_selector<Small,Large,Small> { enum { ret = GemmProduct }; }; +template<> struct product_type_selector<Large,Small,Small> { enum { ret = CoeffBasedProductMode }; }; +template<> struct product_type_selector<Small,Large,Small> { enum { ret = CoeffBasedProductMode }; }; template<> struct product_type_selector<Large,Large,Small> { enum { ret = GemmProduct }; }; } // end namespace internal @@ -239,15 +213,18 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true> ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) * RhsBlasTraits::extractScalarFactor(rhs); + // make sure Dest is a compile-time vector type (bug 1166) + typedef typename conditional<Dest::IsVectorAtCompileTime, Dest, typename Dest::ColXpr>::type ActualDest; + enum { // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1 // on, the other hand it is good for the cache to pack the vector anyways... 
- EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1, + EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1), ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex), - MightCannotUseDest = (Dest::InnerStrideAtCompileTime!=1) || ComplexByReal + MightCannotUseDest = (ActualDest::InnerStrideAtCompileTime!=1) || ComplexByReal }; - gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest; + gemv_static_vector_if<ResScalar,ActualDest::SizeAtCompileTime,ActualDest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest; const bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); const bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; @@ -340,7 +317,7 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,true> actualLhs.rows(), actualLhs.cols(), LhsMapper(actualLhs.data(), actualLhs.outerStride()), RhsMapper(actualRhsPtr, 1), - dest.data(), dest.innerStride(), + dest.data(), dest.col(0).innerStride(), //NOTE if dest is not a vector at compile-time, then dest.innerStride() might be wrong. 
(bug 1166) actualAlpha); } }; diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index d51413e98..679b22f53 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -43,7 +43,7 @@ struct default_packet_traits { enum { HasHalfPacket = 0, - + HasAdd = 1, HasSub = 1, HasMul = 1, @@ -62,7 +62,7 @@ struct default_packet_traits HasRsqrt = 0, HasExp = 0, HasLog = 0, - HasLog10 = 0, + HasLog10 = 0, HasPow = 0, HasSin = 0, @@ -71,9 +71,17 @@ struct default_packet_traits HasASin = 0, HasACos = 0, HasATan = 0, - HasSinh = 0, - HasCosh = 0, - HasTanh = 0, + HasSinh = 0, + HasCosh = 0, + HasTanh = 0, + HasLGamma = 0, + HasDiGamma = 0, + HasZeta = 0, + HasPolygamma = 0, + HasErf = 0, + HasErfc = 0, + HasIGamma = 0, + HasIGammac = 0, HasRound = 0, HasFloor = 0, @@ -130,6 +138,11 @@ pcast(const SrcPacket& a, const SrcPacket& /*b*/) { return static_cast<TgtPacket>(a); } +template <typename SrcPacket, typename TgtPacket> +EIGEN_DEVICE_FUNC inline TgtPacket +pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/) { + return static_cast<TgtPacket>(a); +} /** \internal \returns a + b (coeff-wise) */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet @@ -281,7 +294,7 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu { pstore(to, from); } /** \internal tries to do cache prefetching of \a addr */ -template<typename Scalar> inline void prefetch(const Scalar* addr) +template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) { #ifdef __CUDA_ARCH__ #if defined(__LP64__) @@ -432,6 +445,38 @@ Packet pfloor(const Packet& a) { using numext::floor; return floor(a); } template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); } +/** \internal \returns the ln(|gamma(\a a)|) (coeff-wise) */ +template<typename Packet> 
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); } + +/** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */ +template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); } + +/** \internal \returns the zeta function of two arguments (coeff-wise) */ +template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pzeta(const Packet& x, const Packet& q) { using numext::zeta; return zeta(x, q); } + +/** \internal \returns the polygamma function (coeff-wise) */ +template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet ppolygamma(const Packet& n, const Packet& x) { using numext::polygamma; return polygamma(n, x); } + +/** \internal \returns the erf(\a a) (coeff-wise) */ +template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet perf(const Packet& a) { using numext::erf; return erf(a); } + +/** \internal \returns the erfc(\a a) (coeff-wise) */ +template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); } + +/** \internal \returns the incomplete gamma function igamma(\a a, \a x) */ +template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); } + +/** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */ +template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); } + /*************************************************************************** * The following functions might not have to be overwritten for vectorized types 
***************************************************************************/ diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h index 585974809..05ba6ddb4 100644 --- a/Eigen/src/Core/GlobalFunctions.h +++ b/Eigen/src/Core/GlobalFunctions.h @@ -49,6 +49,12 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(zeta,scalar_zeta_op) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(polygamma,scalar_polygamma_op) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op) @@ -125,6 +131,36 @@ namespace Eigen ); } + /** \returns an expression of the coefficient-wise igamma(\a a, \a x) to the given arrays. + * + * This function computes the coefficient-wise incomplete gamma function. + * + */ + template<typename Derived,typename ExponentDerived> + inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived> + igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) + { + return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>( + a.derived(), + x.derived() + ); + } + + /** \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays. + * + * This function computes the coefficient-wise complementary incomplete gamma function. 
+ * + */ + template<typename Derived,typename ExponentDerived> + inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived> + igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) + { + return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>( + a.derived(), + x.derived() + ); + } + namespace internal { EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(real,scalar_real_op) diff --git a/Eigen/src/Core/IO.h b/Eigen/src/Core/IO.h index 9ae37bb5a..dfd9097cc 100644 --- a/Eigen/src/Core/IO.h +++ b/Eigen/src/Core/IO.h @@ -80,7 +80,7 @@ struct IOFormat * * \brief Pseudo expression providing matrix output with given format * - * \param ExpressionType the type of the object on which IO stream operations are performed + * \tparam ExpressionType the type of the object on which IO stream operations are performed * * This class represents an expression with stream operators controlled by a given IOFormat. * It is the return type of DenseBase::format() diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 3a8375da9..06d196702 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -13,6 +13,28 @@ namespace Eigen { +namespace internal { +template<typename PlainObjectType, int MapOptions, typename StrideType> +struct traits<Map<PlainObjectType, MapOptions, StrideType> > + : public traits<PlainObjectType> +{ + typedef traits<PlainObjectType> TraitsBase; + enum { + InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 + ? int(PlainObjectType::InnerStrideAtCompileTime) + : int(StrideType::InnerStrideAtCompileTime), + OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0 + ? 
int(PlainObjectType::OuterStrideAtCompileTime) + : int(StrideType::OuterStrideAtCompileTime), + Alignment = int(MapOptions)&int(AlignedMask), + Flags0 = TraitsBase::Flags & (~NestByRefBit), + Flags = is_lvalue<PlainObjectType>::value ? int(Flags0) : (int(Flags0) & ~LvalueBit) + }; +private: + enum { Options }; // Expressions don't have Options +}; +} + /** \class Map * \ingroup Core_Module * @@ -63,29 +85,6 @@ namespace Eigen { * * \sa PlainObjectBase::Map(), \ref TopicStorageOrders */ - -namespace internal { -template<typename PlainObjectType, int MapOptions, typename StrideType> -struct traits<Map<PlainObjectType, MapOptions, StrideType> > - : public traits<PlainObjectType> -{ - typedef traits<PlainObjectType> TraitsBase; - enum { - InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 - ? int(PlainObjectType::InnerStrideAtCompileTime) - : int(StrideType::InnerStrideAtCompileTime), - OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0 - ? int(PlainObjectType::OuterStrideAtCompileTime) - : int(StrideType::OuterStrideAtCompileTime), - Alignment = int(MapOptions)&int(AlignedMask), - Flags0 = TraitsBase::Flags & (~NestByRefBit), - Flags = is_lvalue<PlainObjectType>::value ? 
int(Flags0) : (int(Flags0) & ~LvalueBit) - }; -private: - enum { Options }; // Expressions don't have Options -}; -} - template<typename PlainObjectType, int MapOptions, typename StrideType> class Map : public MapBase<Map<PlainObjectType, MapOptions, StrideType> > { diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index ae28d4db6..12c464a5a 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -130,7 +130,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors> explicit inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime) { EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) - checkSanity(); + checkSanity<Derived>(); } EIGEN_DEVICE_FUNC @@ -142,7 +142,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors> EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) eigen_assert(vecSize >= 0); eigen_assert(dataPtr == 0 || SizeAtCompileTime == Dynamic || SizeAtCompileTime == vecSize); - checkSanity(); + checkSanity<Derived>(); } EIGEN_DEVICE_FUNC @@ -152,19 +152,30 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors> eigen_assert( (dataPtr == 0) || ( rows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows) && cols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols))); - checkSanity(); + checkSanity<Derived>(); } + #ifdef EIGEN_MAPBASE_PLUGIN + #include EIGEN_MAPBASE_PLUGIN + #endif + protected: + template<typename T> EIGEN_DEVICE_FUNC - void checkSanity() const + void checkSanity(typename internal::enable_if<(internal::traits<T>::Alignment>0),void*>::type = 0) const { #if EIGEN_MAX_ALIGN_BYTES>0 - eigen_assert(((size_t(m_data) % EIGEN_PLAIN_ENUM_MAX(1,internal::traits<Derived>::Alignment)) == 0) && "data is not aligned"); + eigen_assert(( ((size_t(m_data) % internal::traits<Derived>::Alignment) == 0) + || (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned"); 
#endif } + template<typename T> + EIGEN_DEVICE_FUNC + void checkSanity(typename internal::enable_if<internal::traits<T>::Alignment==0,void*>::type = 0) const + {} + PointerType m_data; const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_rows; const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_cols; diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 48cf565fb..8e7dd2b73 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -23,10 +23,10 @@ double abs(double x) { return (fabs(x)); } float abs(float x) { return (fabsf(x)); } long double abs(long double x) { return (fabsl(x)); } #endif - + namespace internal { -/** \internal \struct global_math_functions_filtering_base +/** \internal \class global_math_functions_filtering_base * * What it does: * Defines a typedef 'type' as follows: @@ -496,7 +496,7 @@ template<typename Scalar, bool IsInteger> struct pow_default_impl { typedef Scalar retval; - static inline Scalar run(const Scalar& x, const Scalar& y) + static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y) { EIGEN_USING_STD_MATH(pow); return pow(x, y); @@ -506,7 +506,7 @@ struct pow_default_impl template<typename Scalar> struct pow_default_impl<Scalar, true> { - static inline Scalar run(Scalar x, Scalar y) + static EIGEN_DEVICE_FUNC inline Scalar run(Scalar x, Scalar y) { Scalar res(1); eigen_assert(!NumTraits<Scalar>::IsSigned || y >= 0); @@ -704,11 +704,13 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type isfinite_impl(const T& x) { - #if EIGEN_USE_STD_FPCLASSIFY + #ifdef __CUDA_ARCH__ + return (::isfinite)(x); + #elif EIGEN_USE_STD_FPCLASSIFY using std::isfinite; return isfinite EIGEN_NOT_A_MACRO (x); #else - return x<NumTraits<T>::highest() && x>NumTraits<T>::lowest(); + return x<=NumTraits<T>::highest() && x>=NumTraits<T>::lowest(); #endif } @@ -717,7 +719,9 @@ EIGEN_DEVICE_FUNC 
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type isinf_impl(const T& x) { - #if EIGEN_USE_STD_FPCLASSIFY + #ifdef __CUDA_ARCH__ + return (::isinf)(x); + #elif EIGEN_USE_STD_FPCLASSIFY using std::isinf; return isinf EIGEN_NOT_A_MACRO (x); #else @@ -730,7 +734,9 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type isnan_impl(const T& x) { - #if EIGEN_USE_STD_FPCLASSIFY + #ifdef __CUDA_ARCH__ + return (::isnan)(x); + #elif EIGEN_USE_STD_FPCLASSIFY using std::isnan; return isnan EIGEN_NOT_A_MACRO (x); #else @@ -748,9 +754,9 @@ template<typename T> EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x) } //MSVC defines a _isnan builtin function, but for double only -EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x); } -EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x) { return _isnan(x); } -EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x) { return _isnan(x); } +EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x)!=0; } +EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x) { return _isnan(x)!=0; } +EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x) { return _isnan(x)!=0; } EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) { return isinf_msvc_helper(x); } EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x) { return isinf_msvc_helper(x); } @@ -780,9 +786,9 @@ template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const long double& x) { return #endif // The following overload are defined at the end of this file -template<typename T> bool isfinite_impl(const std::complex<T>& x); -template<typename T> bool isnan_impl(const std::complex<T>& x); -template<typename T> bool isinf_impl(const std::complex<T>& x); +template<typename T> EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex<T>& x); +template<typename T> EIGEN_DEVICE_FUNC bool isnan_impl(const 
std::complex<T>& x); +template<typename T> EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x); } // end namespace internal @@ -946,6 +952,14 @@ T (floor)(const T& x) return floor(x); } +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float floor(const float &x) { return ::floorf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double floor(const double &x) { return ::floor(x); } +#endif + template<typename T> EIGEN_DEVICE_FUNC T (ceil)(const T& x) @@ -954,8 +968,17 @@ T (ceil)(const T& x) return ceil(x); } -// Log base 2 for 32 bits positive integers. -// Conveniently returns 0 for x==0. +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float ceil(const float &x) { return ::ceilf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double ceil(const double &x) { return ::ceil(x); } +#endif + + +/** Log base 2 for 32 bits positive integers. + * Conveniently returns 0 for x==0. */ inline int log2(int x) { eigen_assert(x>=0); @@ -969,24 +992,122 @@ inline int log2(int x) return table[(v * 0x07C4ACDDU) >> 27]; } +/** \returns the square root of \a x. + * + * It is essentially equivalent to \code using std::sqrt; return sqrt(x); \endcode, + * but slightly faster for float/double and some compilers (e.g., gcc), thanks to + * specializations when SSE is enabled. + * + * It's usage is justified in performance critical functions, like norm/normalize. 
+ */ +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T sqrt(const T &x) +{ + EIGEN_USING_STD_MATH(sqrt); + return sqrt(x); +} + +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T log(const T &x) { + EIGEN_USING_STD_MATH(log); + return log(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float log(const float &x) { return ::logf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double log(const double &x) { return ::log(x); } +#endif + +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T tan(const T &x) { + EIGEN_USING_STD_MATH(tan); + return tan(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float tan(const float &x) { return ::tanf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double tan(const double &x) { return ::tan(x); } +#endif + +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +typename NumTraits<T>::Real abs(const T &x) { + EIGEN_USING_STD_MATH(abs); + return abs(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float abs(const float &x) { return ::fabsf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double abs(const double &x) { return ::fabs(x); } +#endif + +template<typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T exp(const T &x) { + EIGEN_USING_STD_MATH(exp); + return exp(x); +} + +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float exp(const float &x) { return ::expf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double exp(const double &x) { return ::exp(x); } +#endif + + +template <typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T fmod(const T& a, const T& b) { + EIGEN_USING_STD_MATH(floor); + return fmod(a, b); +} + +#ifdef __CUDACC__ +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float fmod(const float& a, const float& b) { + return ::fmodf(a, b); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double fmod(const 
double& a, const double& b) { + return ::fmod(a, b); +} +#endif + } // end namespace numext namespace internal { template<typename T> -bool isfinite_impl(const std::complex<T>& x) +EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex<T>& x) { return (numext::isfinite)(numext::real(x)) && (numext::isfinite)(numext::imag(x)); } template<typename T> -bool isnan_impl(const std::complex<T>& x) +EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x) { return (numext::isnan)(numext::real(x)) || (numext::isnan)(numext::imag(x)); } template<typename T> -bool isinf_impl(const std::complex<T>& x) +EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x) { return ((numext::isinf)(numext::real(x)) || (numext::isinf)(numext::imag(x))) && (!(numext::isnan)(x)); } @@ -1007,14 +1128,12 @@ struct scalar_fuzzy_default_impl<Scalar, false, false> template<typename OtherScalar> EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec) { - EIGEN_USING_STD_MATH(abs); - return abs(x) <= abs(y) * prec; + return numext::abs(x) <= numext::abs(y) * prec; } EIGEN_DEVICE_FUNC static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec) { - EIGEN_USING_STD_MATH(abs); - return abs(x - y) <= numext::mini(abs(x), abs(y)) * prec; + return numext::abs(x - y) <= numext::mini(numext::abs(x), numext::abs(y)) * prec; } EIGEN_DEVICE_FUNC static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar& prec) @@ -1064,21 +1183,21 @@ struct scalar_fuzzy_impl : scalar_fuzzy_default_impl<Scalar, NumTraits<Scalar>:: template<typename Scalar, typename OtherScalar> EIGEN_DEVICE_FUNC inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, - typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) + const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision()) { return scalar_fuzzy_impl<Scalar>::template 
isMuchSmallerThan<OtherScalar>(x, y, precision); } template<typename Scalar> EIGEN_DEVICE_FUNC inline bool isApprox(const Scalar& x, const Scalar& y, - typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) + const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision()) { return scalar_fuzzy_impl<Scalar>::isApprox(x, y, precision); } template<typename Scalar> EIGEN_DEVICE_FUNC inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, - typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) + const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision()) { return scalar_fuzzy_impl<Scalar>::isApproxOrLessThan(x, y, precision); } diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index ce1b70d23..bcbbbf9ae 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -13,6 +13,45 @@ namespace Eigen { +namespace internal { +template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols> +struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > +{ +private: + enum { size = internal::size_at_compile_time<_Rows,_Cols>::ret }; + typedef typename find_best_packet<_Scalar,size>::type PacketScalar; + enum { + row_major_bit = _Options&RowMajor ? RowMajorBit : 0, + is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic, + max_size = is_dynamic_size_storage ? Dynamic : _MaxRows*_MaxCols, + default_alignment = compute_default_alignment<_Scalar,max_size>::value, + actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0, + required_alignment = unpacket_traits<PacketScalar>::alignment, + packet_access_bit = packet_traits<_Scalar>::Vectorizable && (actual_alignment>=required_alignment) ? 
PacketAccessBit : 0 + }; + +public: + typedef _Scalar Scalar; + typedef Dense StorageKind; + typedef Eigen::Index StorageIndex; + typedef MatrixXpr XprKind; + enum { + RowsAtCompileTime = _Rows, + ColsAtCompileTime = _Cols, + MaxRowsAtCompileTime = _MaxRows, + MaxColsAtCompileTime = _MaxCols, + Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret, + Options = _Options, + InnerStrideAtCompileTime = 1, + OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime, + + // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase + EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit, + Alignment = actual_alignment + }; +}; +} + /** \class Matrix * \ingroup Core_Module * @@ -98,7 +137,7 @@ namespace Eigen { * </dl> * * <i><b>ABI and storage layout</b></i> - * + * * The table below summarizes the ABI of some possible Matrix instances which is fixed thorough the lifetime of Eigen 3. * <table class="manual"> * <tr><th>Matrix type</th><th>Equivalent C structure</th></tr> @@ -130,50 +169,11 @@ namespace Eigen { * </table> * Note that in this table Rows, Cols, MaxRows and MaxCols are all positive integers. A(S) is defined to the largest possible power-of-two * smaller to EIGEN_MAX_STATIC_ALIGN_BYTES. - * - * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy, - * \ref TopicStorageOrders + * + * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy, + * \ref TopicStorageOrders */ -namespace internal { -template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols> -struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > -{ -private: - enum { size = internal::size_at_compile_time<_Rows,_Cols>::ret }; - typedef typename find_best_packet<_Scalar,size>::type PacketScalar; - enum { - row_major_bit = _Options&RowMajor ? 
RowMajorBit : 0, - is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic, - max_size = is_dynamic_size_storage ? Dynamic : _MaxRows*_MaxCols, - default_alignment = compute_default_alignment<_Scalar,max_size>::value, - actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0, - required_alignment = unpacket_traits<PacketScalar>::alignment, - packet_access_bit = packet_traits<_Scalar>::Vectorizable && (actual_alignment>=required_alignment) ? PacketAccessBit : 0 - }; - -public: - typedef _Scalar Scalar; - typedef Dense StorageKind; - typedef Eigen::Index StorageIndex; - typedef MatrixXpr XprKind; - enum { - RowsAtCompileTime = _Rows, - ColsAtCompileTime = _Cols, - MaxRowsAtCompileTime = _MaxRows, - MaxColsAtCompileTime = _MaxCols, - Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret, - Options = _Options, - InnerStrideAtCompileTime = 1, - OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime, - - // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase - EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit, - Alignment = actual_alignment - }; -}; -} - template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols> class Matrix : public PlainObjectBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 9d612c852..1e66b4e1b 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -43,7 +43,7 @@ namespace Eigen { * This class can be extended with the help of the plugin mechanism described on the page * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_MATRIXBASE_PLUGIN. 
* - * \sa \ref TopicClassHierarchy + * \sa \blank \ref TopicClassHierarchy */ template<typename Derived> class MatrixBase : public DenseBase<Derived> @@ -66,7 +66,7 @@ template<typename Derived> class MatrixBase using Base::MaxSizeAtCompileTime; using Base::IsVectorAtCompileTime; using Base::Flags; - + using Base::derived; using Base::const_cast_derived; using Base::rows; @@ -135,14 +135,14 @@ template<typename Derived> class MatrixBase /** Special case of the template operator=, in order to prevent the compiler * from generating a default operator= (issue hit with g++ 4.1) */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const MatrixBase& other); // We cannot inherit here via Base::operator= since it is causing // trouble with MSVC. template <typename OtherDerived> - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other); template <typename OtherDerived> @@ -154,10 +154,10 @@ template<typename Derived> class MatrixBase Derived& operator=(const ReturnByValue<OtherDerived>& other); template<typename OtherDerived> - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const MatrixBase<OtherDerived>& other); template<typename OtherDerived> - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const MatrixBase<OtherDerived>& other); #ifdef __CUDACC__ @@ -175,7 +175,7 @@ template<typename Derived> class MatrixBase #endif template<typename OtherDerived> - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC const Product<Derived,OtherDerived,LazyProduct> lazyProduct(const MatrixBase<OtherDerived> &other) const; @@ -204,7 +204,9 @@ template<typename Derived> class MatrixBase RealScalar blueNorm() const; RealScalar hypotNorm() const; EIGEN_DEVICE_FUNC const PlainObject normalized() const; + EIGEN_DEVICE_FUNC const PlainObject stableNormalized() const; EIGEN_DEVICE_FUNC void normalize(); + EIGEN_DEVICE_FUNC void stableNormalize(); 
EIGEN_DEVICE_FUNC const AdjointReturnType adjoint() const; EIGEN_DEVICE_FUNC void adjointInPlace(); @@ -212,7 +214,7 @@ template<typename Derived> class MatrixBase typedef Diagonal<Derived> DiagonalReturnType; EIGEN_DEVICE_FUNC DiagonalReturnType diagonal(); - + typedef typename internal::add_const<Diagonal<const Derived> >::type ConstDiagonalReturnType; EIGEN_DEVICE_FUNC ConstDiagonalReturnType diagonal() const; @@ -220,14 +222,14 @@ template<typename Derived> class MatrixBase template<int Index> struct DiagonalIndexReturnType { typedef Diagonal<Derived,Index> Type; }; template<int Index> struct ConstDiagonalIndexReturnType { typedef const Diagonal<const Derived,Index> Type; }; - template<int Index> + template<int Index> EIGEN_DEVICE_FUNC typename DiagonalIndexReturnType<Index>::Type diagonal(); template<int Index> EIGEN_DEVICE_FUNC typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const; - + typedef Diagonal<Derived,DynamicIndex> DiagonalDynamicIndexReturnType; typedef typename internal::add_const<Diagonal<const Derived,DynamicIndex> >::type ConstDiagonalDynamicIndexReturnType; @@ -249,7 +251,7 @@ template<typename Derived> class MatrixBase template<unsigned int UpLo> struct SelfAdjointViewReturnType { typedef SelfAdjointView<Derived, UpLo> Type; }; template<unsigned int UpLo> struct ConstSelfAdjointViewReturnType { typedef const SelfAdjointView<const Derived, UpLo> Type; }; - template<unsigned int UpLo> + template<unsigned int UpLo> EIGEN_DEVICE_FUNC typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView(); template<unsigned int UpLo> @@ -338,7 +340,7 @@ template<typename Derived> class MatrixBase EIGEN_DEVICE_FUNC inline const Inverse<Derived> inverse() const; - + template<typename ResultType> inline void computeInverseAndDetWithCheck( ResultType& inverse, @@ -364,6 +366,7 @@ template<typename Derived> class MatrixBase inline const HouseholderQR<PlainObject> householderQr() const; inline const ColPivHouseholderQR<PlainObject> 
colPivHouseholderQr() const; inline const FullPivHouseholderQR<PlainObject> fullPivHouseholderQr() const; + inline const CompleteOrthogonalDecomposition<PlainObject> completeOrthogonalDecomposition() const; /////////// Eigenvalues module /////////// @@ -386,25 +389,29 @@ template<typename Derived> class MatrixBase #endif // EIGEN_PARSED_BY_DOXYGEN template<typename OtherDerived> EIGEN_DEVICE_FUNC +#ifndef EIGEN_PARSED_BY_DOXYGEN inline typename cross_product_return_type<OtherDerived>::type +#else + inline PlainObject +#endif cross(const MatrixBase<OtherDerived>& other) const; - + template<typename OtherDerived> EIGEN_DEVICE_FUNC inline PlainObject cross3(const MatrixBase<OtherDerived>& other) const; - + EIGEN_DEVICE_FUNC inline PlainObject unitOrthogonal(void) const; - + inline Matrix<Scalar,3,1> eulerAngles(Index a0, Index a1, Index a2) const; - + inline ScalarMultipleReturnType operator*(const UniformScaling<Scalar>& s) const; // put this as separate enum value to work around possible GCC 4.3 bug (?) enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1&&RowsAtCompileTime==1 ? ((internal::traits<Derived>::Flags&RowMajorBit)==RowMajorBit ? Horizontal : Vertical) : ColsAtCompileTime==1 ? Vertical : Horizontal }; typedef Homogeneous<Derived, HomogeneousReturnTypeDirection> HomogeneousReturnType; inline HomogeneousReturnType homogeneous() const; - + enum { SizeMinusOne = SizeAtCompileTime==Dynamic ? 
Dynamic : SizeAtCompileTime-1 }; diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h index 9aeaf8d18..13adf070e 100644 --- a/Eigen/src/Core/NestByValue.h +++ b/Eigen/src/Core/NestByValue.h @@ -13,25 +13,24 @@ namespace Eigen { +namespace internal { +template<typename ExpressionType> +struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType> +{}; +} + /** \class NestByValue * \ingroup Core_Module * * \brief Expression which must be nested by value * - * \param ExpressionType the type of the object of which we are requiring nesting-by-value + * \tparam ExpressionType the type of the object of which we are requiring nesting-by-value * * This class is the return type of MatrixBase::nestByValue() * and most of the time this is the only way it is used. * * \sa MatrixBase::nestByValue() */ - -namespace internal { -template<typename ExpressionType> -struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType> -{}; -} - template<typename ExpressionType> class NestByValue : public internal::dense_xpr_base< NestByValue<ExpressionType> >::type { diff --git a/Eigen/src/Core/NoAlias.h b/Eigen/src/Core/NoAlias.h index 0ade75255..ffb673cee 100644 --- a/Eigen/src/Core/NoAlias.h +++ b/Eigen/src/Core/NoAlias.h @@ -17,7 +17,7 @@ namespace Eigen { * * \brief Pseudo expression providing an operator = assuming no aliasing * - * \param ExpressionType the type of the object on which to do the lazy assignment + * \tparam ExpressionType the type of the object on which to do the lazy assignment * * This class represents an expression with special assignment operators * assuming no aliasing between the target expression and the source expression. diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index 1d85dec72..e065fa714 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -17,7 +17,7 @@ namespace Eigen { * * \brief Holds information about the various numeric (i.e. scalar) types allowed by Eigen. 
* - * \param T the numeric type at hand + * \tparam T the numeric type at hand * * This class stores enums, typedefs and static methods giving information about a numeric type. * @@ -60,6 +60,23 @@ template<typename T> struct GenericNumTraits MulCost = 1 }; + // Division is messy but important, because it is expensive and throughput + // varies significantly. The following numbers are based on min division + // throughput on Haswell. + template<bool Vectorized> + struct Div { + enum { +#ifdef EIGEN_VECTORIZE_AVX + AVX = true, +#else + AVX = false, +#endif + Cost = IsInteger ? (sizeof(T) == 8 ? (IsSigned ? 24 : 21) : (IsSigned ? 8 : 9)): + Vectorized ? (sizeof(T) == 8 ? (AVX ? 16 : 8) : (AVX ? 14 : 7)) : 8 + }; + }; + + typedef T Real; typedef typename internal::conditional< IsInteger, @@ -71,11 +88,7 @@ template<typename T> struct GenericNumTraits EIGEN_DEVICE_FUNC static inline Real epsilon() { - #if defined(__CUDA_ARCH__) - return internal::device::numeric_limits<T>::epsilon(); - #else - return std::numeric_limits<T>::epsilon(); - #endif + return numext::numeric_limits<T>::epsilon(); } EIGEN_DEVICE_FUNC static inline Real dummy_precision() @@ -87,20 +100,22 @@ template<typename T> struct GenericNumTraits EIGEN_DEVICE_FUNC static inline T highest() { -#if defined(__CUDA_ARCH__) - return (internal::device::numeric_limits<T>::max)(); -#else - return (std::numeric_limits<T>::max)(); -#endif + return (numext::numeric_limits<T>::max)(); } EIGEN_DEVICE_FUNC static inline T lowest() { -#if defined(__CUDA_ARCH__) - return IsInteger ? (internal::device::numeric_limits<T>::min)() : (-(internal::device::numeric_limits<T>::max)()); -#else - return IsInteger ? (std::numeric_limits<T>::min)() : (-(std::numeric_limits<T>::max)()); -#endif + return IsInteger ? 
(numext::numeric_limits<T>::min)() : (-(numext::numeric_limits<T>::max)()); + } + + EIGEN_DEVICE_FUNC + static inline T infinity() { + return numext::numeric_limits<T>::infinity(); + } + + EIGEN_DEVICE_FUNC + static inline T quiet_NaN() { + return numext::numeric_limits<T>::quiet_NaN(); } }; @@ -138,7 +153,9 @@ template<typename _Real> struct NumTraits<std::complex<_Real> > MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost }; + EIGEN_DEVICE_FUNC static inline Real epsilon() { return NumTraits<Real>::epsilon(); } + EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return NumTraits<Real>::dummy_precision(); } }; @@ -151,7 +168,7 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > typedef typename NumTraits<Scalar>::NonInteger NonIntegerScalar; typedef Array<NonIntegerScalar, Rows, Cols, Options, MaxRows, MaxCols> NonInteger; typedef ArrayType & Nested; - + enum { IsComplex = NumTraits<Scalar>::IsComplex, IsInteger = NumTraits<Scalar>::IsInteger, @@ -161,8 +178,10 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > AddCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::AddCost, MulCost = ArrayType::SizeAtCompileTime==Dynamic ? 
HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::MulCost }; - + + EIGEN_DEVICE_FUNC static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); } + EIGEN_DEVICE_FUNC static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); } }; diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h index 90e1df233..b1fb455b9 100644 --- a/Eigen/src/Core/PermutationMatrix.h +++ b/Eigen/src/Core/PermutationMatrix.h @@ -13,12 +13,18 @@ namespace Eigen { +namespace internal { + +enum PermPermProduct_t {PermPermProduct}; + +} // end namespace internal + /** \class PermutationBase * \ingroup Core_Module * * \brief Base class for permutations * - * \param Derived the derived class + * \tparam Derived the derived class * * This class is the base class for all expressions representing a permutation matrix, * internally stored as a vector of integers. @@ -36,13 +42,6 @@ namespace Eigen { * * \sa class PermutationMatrix, class PermutationWrapper */ - -namespace internal { - -enum PermPermProduct_t {PermPermProduct}; - -} // end namespace internal - template<typename Derived> class PermutationBase : public EigenBase<Derived> { @@ -192,13 +191,13 @@ class PermutationBase : public EigenBase<Derived> /** \returns the inverse permutation matrix. * - * \note \note_try_to_help_rvo + * \note \blank \note_try_to_help_rvo */ inline InverseReturnType inverse() const { return InverseReturnType(derived()); } /** \returns the tranpose permutation matrix. * - * \note \note_try_to_help_rvo + * \note \blank \note_try_to_help_rvo */ inline InverseReturnType transpose() const { return InverseReturnType(derived()); } @@ -225,7 +224,7 @@ class PermutationBase : public EigenBase<Derived> /** \returns the product permutation matrix. 
* - * \note \note_try_to_help_rvo + * \note \blank \note_try_to_help_rvo */ template<typename Other> inline PlainPermutationType operator*(const PermutationBase<Other>& other) const @@ -233,7 +232,7 @@ class PermutationBase : public EigenBase<Derived> /** \returns the product of a permutation with another inverse permutation. * - * \note \note_try_to_help_rvo + * \note \blank \note_try_to_help_rvo */ template<typename Other> inline PlainPermutationType operator*(const InverseImpl<Other,PermutationStorage>& other) const @@ -241,7 +240,7 @@ class PermutationBase : public EigenBase<Derived> /** \returns the product of an inverse permutation with another permutation. * - * \note \note_try_to_help_rvo + * \note \blank \note_try_to_help_rvo */ template<typename Other> friend inline PlainPermutationType operator*(const InverseImpl<Other, PermutationStorage>& other, const PermutationBase& perm) @@ -280,20 +279,6 @@ class PermutationBase : public EigenBase<Derived> }; -/** \class PermutationMatrix - * \ingroup Core_Module - * - * \brief Permutation matrix - * - * \param SizeAtCompileTime the number of rows/cols, or Dynamic - * \param MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it. - * \param StorageIndex the integer type of the indices - * - * This class represents a permutation matrix, internally stored as a vector of integers. 
- * - * \sa class PermutationBase, class PermutationWrapper, class DiagonalMatrix - */ - namespace internal { template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex> struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex> > @@ -306,6 +291,19 @@ struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _Storag }; } +/** \class PermutationMatrix + * \ingroup Core_Module + * + * \brief Permutation matrix + * + * \tparam SizeAtCompileTime the number of rows/cols, or Dynamic + * \tparam MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it. + * \tparam _StorageIndex the integer type of the indices + * + * This class represents a permutation matrix, internally stored as a vector of integers. + * + * \sa class PermutationBase, class PermutationWrapper, class DiagonalMatrix + */ template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex> class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex> > { @@ -482,18 +480,6 @@ class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageInd IndicesType m_indices; }; -/** \class PermutationWrapper - * \ingroup Core_Module - * - * \brief Class to view a vector of integers as a permutation matrix - * - * \param _IndicesType the type of the vector of integer (can be any compatible expression) - * - * This class allows to view any vector expression of integers as a permutation matrix. 
- * - * \sa class PermutationBase, class PermutationMatrix - */ - template<typename _IndicesType> class TranspositionsWrapper; namespace internal { template<typename _IndicesType> @@ -513,6 +499,17 @@ struct traits<PermutationWrapper<_IndicesType> > }; } +/** \class PermutationWrapper + * \ingroup Core_Module + * + * \brief Class to view a vector of integers as a permutation matrix + * + * \tparam _IndicesType the type of the vector of integer (can be any compatible expression) + * + * This class allows to view any vector expression of integers as a permutation matrix. + * + * \sa class PermutationBase, class PermutationMatrix + */ template<typename _IndicesType> class PermutationWrapper : public PermutationBase<PermutationWrapper<_IndicesType> > { diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index 1225e85b4..b7a4fcea8 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h @@ -533,7 +533,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type public: - /** \copydoc MatrixBase::operator=(const EigenBase<OtherDerived>&) + /** \copydoc DenseBase::operator=(const EigenBase<OtherDerived>&) */ template<typename OtherDerived> EIGEN_DEVICE_FUNC @@ -618,8 +618,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type //@} using Base::setConstant; - EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& value); - EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& value); + EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& val); + EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& val); using Base::setZero; EIGEN_DEVICE_FUNC Derived& setZero(Index size); diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index fdd2fed3f..8aa1de081 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -14,22 +14,6 @@ namespace Eigen { template<typename Lhs, typename Rhs, int Option, 
typename StorageKind> class ProductImpl; -/** \class Product - * \ingroup Core_Module - * - * \brief Expression of the product of two arbitrary matrices or vectors - * - * \param Lhs the type of the left-hand side expression - * \param Rhs the type of the right-hand side expression - * - * This class represents an expression of the product of two arbitrary matrices. - * - * The other template parameters are: - * \tparam Option can be DefaultProduct, AliasFreeProduct, or LazyProduct - * - */ - - namespace internal { // Determine the scalar of Product<Lhs, Rhs>. This is normally the same as Lhs::Scalar times @@ -102,7 +86,20 @@ struct traits<Product<Lhs, Rhs, Option> > } // end namespace internal - +/** \class Product + * \ingroup Core_Module + * + * \brief Expression of the product of two arbitrary matrices or vectors + * + * \tparam _Lhs the type of the left-hand side expression + * \tparam _Rhs the type of the right-hand side expression + * + * This class represents an expression of the product of two arbitrary matrices. 
+ * + * The other template parameters are: + * \tparam Option can be DefaultProduct, AliasFreeProduct, or LazyProduct + * + */ template<typename _Lhs, typename _Rhs, int Option> class Product : public ProductImpl<_Lhs,_Rhs,Option, typename internal::product_promote_storage_type<typename internal::traits<_Lhs>::StorageKind, diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 794038a2a..3ce86e8cd 100755..100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -38,10 +38,9 @@ struct evaluator<Product<Lhs, Rhs, Options> > // Catch scalar * ( A * B ) and transform it to (A*scalar) * B // TODO we should apply that rule only if that's really helpful template<typename Lhs, typename Rhs, typename Scalar> -struct evaluator_traits<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Product<Lhs, Rhs, DefaultProduct> > > - : evaluator_traits_base<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Product<Lhs, Rhs, DefaultProduct> > > +struct evaluator_assume_aliasing<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Product<Lhs, Rhs, DefaultProduct> > > { - enum { AssumeAliasing = 1 }; + static const bool value = true; }; template<typename Lhs, typename Rhs, typename Scalar> struct evaluator<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Product<Lhs, Rhs, DefaultProduct> > > @@ -81,17 +80,8 @@ template< typename Lhs, typename Rhs, struct generic_product_impl; template<typename Lhs, typename Rhs> -struct evaluator_traits<Product<Lhs, Rhs, DefaultProduct> > - : evaluator_traits_base<Product<Lhs, Rhs, DefaultProduct> > -{ - enum { AssumeAliasing = 1 }; -}; - -template<typename Lhs, typename Rhs> -struct evaluator_traits<Product<Lhs, Rhs, AliasFreeProduct> > - : evaluator_traits_base<Product<Lhs, Rhs, AliasFreeProduct> > -{ - enum { AssumeAliasing = 0 }; +struct evaluator_assume_aliasing<Product<Lhs, Rhs, DefaultProduct> > { + static const bool value = true; }; // This is the 
default evaluator implementation for products: @@ -107,7 +97,8 @@ struct product_evaluator<Product<Lhs, Rhs, Options>, ProductTag, LhsShape, RhsSh Flags = Base::Flags | EvalBeforeNestingBit }; - EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit product_evaluator(const XprType& xpr) : m_result(xpr.rows(), xpr.cols()) { ::new (static_cast<Base*>(this)) Base(m_result); @@ -189,6 +180,13 @@ struct Assignment<DstXprType, CwiseUnaryOp<internal::scalar_multiple_op<ScalarBi //---------------------------------------- // Catch "Dense ?= xpr + Product<>" expression to save one temporary // FIXME we could probably enable these rules for any product, i.e., not only Dense and DefaultProduct +// TODO enable it for "Dense ?= xpr - Product<>" as well. + +template<typename OtherXpr, typename Lhs, typename Rhs> +struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename OtherXpr::Scalar>, const OtherXpr, + const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > { + static const bool value = true; +}; template<typename DstXprType, typename OtherXpr, typename ProductType, typename Scalar, typename Func1, typename Func2> struct assignment_from_xpr_plus_product @@ -415,7 +413,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, typedef typename XprType::PacketScalar PacketScalar; typedef typename XprType::PacketReturnType PacketReturnType; - EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit product_evaluator(const XprType& xpr) : m_lhs(xpr.lhs()), m_rhs(xpr.rhs()), m_lhsImpl(m_lhs), // FIXME the creation of the evaluator objects should result in a no-op, but check that! 
diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h index 61de5ed17..6e94181f3 100644 --- a/Eigen/src/Core/Ref.h +++ b/Eigen/src/Core/Ref.h @@ -12,76 +12,6 @@ namespace Eigen { -/** \class Ref - * \ingroup Core_Module - * - * \brief A matrix or vector expression mapping an existing expression - * - * \tparam PlainObjectType the equivalent matrix type of the mapped data - * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. - * The default is \c #Unaligned. - * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1), - * but accepts a variable outer stride (leading dimension). - * This can be overridden by specifying strides. - * The type passed here must be a specialization of the Stride template, see examples below. - * - * This class provides a way to write non-template functions taking Eigen objects as parameters while limiting the number of copies. - * A Ref<> object can represent either a const expression or a l-value: - * \code - * // in-out argument: - * void foo1(Ref<VectorXf> x); - * - * // read-only const argument: - * void foo2(const Ref<const VectorXf>& x); - * \endcode - * - * In the in-out case, the input argument must satisfy the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered. - * By default, a Ref<VectorXf> can reference any dense vector expression of float having a contiguous memory layout. - * Likewise, a Ref<MatrixXf> can reference any column-major dense matrix expression of float whose column's elements are contiguously stored with - * the possibility to have a constant space in-between each column, i.e. the inner stride must be equal to 1, but the outer stride (or leading dimension) - * can be greater than the number of rows. 
- * - * In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function. - * Here are some examples: - * \code - * MatrixXf A; - * VectorXf a; - * foo1(a.head()); // OK - * foo1(A.col()); // OK - * foo1(A.row()); // Compilation error because here innerstride!=1 - * foo2(A.row()); // Compilation error because A.row() is a 1xN object while foo2 is expecting a Nx1 object - * foo2(A.row().transpose()); // The row is copied into a contiguous temporary - * foo2(2*a); // The expression is evaluated into a temporary - * foo2(A.col().segment(2,4)); // No temporary - * \endcode - * - * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameters. - * Here is an example accepting an innerstride!=1: - * \code - * // in-out argument: - * void foo3(Ref<VectorXf,0,InnerStride<> > x); - * foo3(A.row()); // OK - * \endcode - * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involve more - * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overload internally calling a - * template function, e.g.: - * \code - * // in the .h: - * void foo(const Ref<MatrixXf>& A); - * void foo(const Ref<MatrixXf,0,Stride<> >& A); - * - * // in the .cpp: - * template<typename TypeOfA> void foo_impl(const TypeOfA& A) { - * ... 
// crazy code goes here - * } - * void foo(const Ref<MatrixXf>& A) { foo_impl(A); } - * void foo(const Ref<MatrixXf,0,Stride<> >& A) { foo_impl(A); } - * \endcode - * - * - * \sa PlainObjectBase::Map(), \ref TopicStorageOrders - */ - namespace internal { template<typename _PlainObjectType, int _Options, typename _StrideType> @@ -182,7 +112,75 @@ protected: StrideBase m_stride; }; - +/** \class Ref + * \ingroup Core_Module + * + * \brief A matrix or vector expression mapping an existing expression + * + * \tparam PlainObjectType the equivalent matrix type of the mapped data + * \tparam Options specifies the pointer alignment in bytes. It can be: \c #Aligned128, \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. + * The default is \c #Unaligned. + * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1), + * but accepts a variable outer stride (leading dimension). + * This can be overridden by specifying strides. + * The type passed here must be a specialization of the Stride template, see examples below. + * + * This class provides a way to write non-template functions taking Eigen objects as parameters while limiting the number of copies. + * A Ref<> object can represent either a const expression or an l-value: + * \code + * // in-out argument: + * void foo1(Ref<VectorXf> x); + * + * // read-only const argument: + * void foo2(const Ref<const VectorXf>& x); + * \endcode + * + * In the in-out case, the input argument must satisfy the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered. + * By default, a Ref<VectorXf> can reference any dense vector expression of float having a contiguous memory layout. + * Likewise, a Ref<MatrixXf> can reference any column-major dense matrix expression of float whose column's elements are contiguously stored with + * the possibility to have a constant space in-between each column, i.e. 
the inner stride must be equal to 1, but the outer stride (or leading dimension) + * can be greater than the number of rows. + * + * In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function. + * Here are some examples: + * \code + * MatrixXf A; + * VectorXf a; + * foo1(a.head()); // OK + * foo1(A.col()); // OK + * foo1(A.row()); // Compilation error because here innerstride!=1 + * foo2(A.row()); // Compilation error because A.row() is a 1xN object while foo2 is expecting a Nx1 object + * foo2(A.row().transpose()); // The row is copied into a contiguous temporary + * foo2(2*a); // The expression is evaluated into a temporary + * foo2(A.col().segment(2,4)); // No temporary + * \endcode + * + * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameters. + * Here is an example accepting an innerstride!=1: + * \code + * // in-out argument: + * void foo3(Ref<VectorXf,0,InnerStride<> > x); + * foo3(A.row()); // OK + * \endcode + * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involve more + * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overload internally calling a + * template function, e.g.: + * \code + * // in the .h: + * void foo(const Ref<MatrixXf>& A); + * void foo(const Ref<MatrixXf,0,Stride<> >& A); + * + * // in the .cpp: + * template<typename TypeOfA> void foo_impl(const TypeOfA& A) { + * ... 
// crazy code goes here + * } + * void foo(const Ref<MatrixXf>& A) { foo_impl(A); } + * void foo(const Ref<MatrixXf,0,Stride<> >& A) { foo_impl(A); } + * \endcode + * + * + * \sa PlainObjectBase::Map(), \ref TopicStorageOrders + */ template<typename PlainObjectType, int Options, typename StrideType> class Ref : public RefBase<Ref<PlainObjectType, Options, StrideType> > { @@ -209,6 +207,7 @@ template<typename PlainObjectType, int Options, typename StrideType> class Ref EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr, typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0) #else + /** Implicit constructor from any dense expression */ template<typename Derived> inline Ref(DenseBase<Derived>& expr) #endif diff --git a/Eigen/src/Core/Replicate.h b/Eigen/src/Core/Replicate.h index bec598310..9960ef884 100644 --- a/Eigen/src/Core/Replicate.h +++ b/Eigen/src/Core/Replicate.h @@ -12,21 +12,6 @@ namespace Eigen { -/** - * \class Replicate - * \ingroup Core_Module - * - * \brief Expression of the multiple replication of a matrix or vector - * - * \param MatrixType the type of the object we are replicating - * - * This class represents an expression of the multiple replication of a matrix or vector. - * It is the return type of DenseBase::replicate() and most of the time - * this is the only way it is used. - * - * \sa DenseBase::replicate() - */ - namespace internal { template<typename MatrixType,int RowFactor,int ColFactor> struct traits<Replicate<MatrixType,RowFactor,ColFactor> > @@ -57,6 +42,22 @@ struct traits<Replicate<MatrixType,RowFactor,ColFactor> > }; } +/** + * \class Replicate + * \ingroup Core_Module + * + * \brief Expression of the multiple replication of a matrix or vector + * + * \tparam MatrixType the type of the object we are replicating + * \tparam RowFactor number of repetitions at compile time along the vertical direction, can be Dynamic. 
+ * \tparam ColFactor number of repetitions at compile time along the horizontal direction, can be Dynamic. + * + * This class represents an expression of the multiple replication of a matrix or vector. + * It is the return type of DenseBase::replicate() and most of the time + * this is the only way it is used. + * + * \sa DenseBase::replicate() + */ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate : public internal::dense_xpr_base< Replicate<MatrixType,RowFactor,ColFactor> >::type { diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h index 7feb6e01c..c44b7673b 100644 --- a/Eigen/src/Core/ReturnByValue.h +++ b/Eigen/src/Core/ReturnByValue.h @@ -13,11 +13,6 @@ namespace Eigen { -/** \class ReturnByValue - * \ingroup Core_Module - * - */ - namespace internal { template<typename Derived> @@ -48,6 +43,10 @@ struct nested_eval<ReturnByValue<Derived>, n, PlainObject> } // end namespace internal +/** \class ReturnByValue + * \ingroup Core_Module + * + */ template<typename Derived> class ReturnByValue : public internal::dense_xpr_base< ReturnByValue<Derived> >::type, internal::no_assignment_operator { diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h index d7c380c78..0640cda2a 100644 --- a/Eigen/src/Core/Reverse.h +++ b/Eigen/src/Core/Reverse.h @@ -14,20 +14,6 @@ namespace Eigen { -/** \class Reverse - * \ingroup Core_Module - * - * \brief Expression of the reverse of a vector or matrix - * - * \param MatrixType the type of the object of which we are taking the reverse - * - * This class represents an expression of the reverse of a vector. - * It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse() - * and most of the time this is the only way it is used. 
- * - * \sa MatrixBase::reverse(), VectorwiseOp::reverse() - */ - namespace internal { template<typename MatrixType, int Direction> @@ -60,6 +46,20 @@ template<typename PacketType> struct reverse_packet_cond<PacketType,false> } // end namespace internal +/** \class Reverse + * \ingroup Core_Module + * + * \brief Expression of the reverse of a vector or matrix + * + * \tparam MatrixType the type of the object of which we are taking the reverse + * \tparam Direction defines the direction of the reverse operation, can be Vertical, Horizontal, or BothDirections + * + * This class represents an expression of the reverse of a vector. + * It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse() + * and most of the time this is the only way it is used. + * + * \sa MatrixBase::reverse(), VectorwiseOp::reverse() + */ template<typename MatrixType, int Direction> class Reverse : public internal::dense_xpr_base< Reverse<MatrixType, Direction> >::type { diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h index 87e87ab3a..9fda02691 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h @@ -32,7 +32,7 @@ namespace internal { template<typename MatrixType, unsigned int UpLo> struct traits<SelfAdjointView<MatrixType, UpLo> > : traits<MatrixType> { - typedef typename ref_selector<MatrixType>::type MatrixTypeNested; + typedef typename ref_selector<MatrixType>::non_const_type MatrixTypeNested; typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned; typedef MatrixType ExpressionType; typedef typename MatrixType::PlainObject FullMatrixType; @@ -97,7 +97,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView { EIGEN_STATIC_ASSERT_LVALUE(SelfAdjointView); Base::check_coordinates_internal(row, col); - return m_matrix.const_cast_derived().coeffRef(row, col); + return m_matrix.coeffRef(row, col); } /** \internal */ @@ -107,7 +107,7 @@ template<typename _MatrixType, unsigned 
int UpLo> class SelfAdjointView EIGEN_DEVICE_FUNC const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; } EIGEN_DEVICE_FUNC - MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); } + MatrixTypeNestedCleaned& nestedExpression() { return m_matrix; } /** Efficient triangular matrix times vector/matrix product */ template<typename OtherDerived> @@ -203,8 +203,6 @@ struct evaluator_traits<SelfAdjointView<MatrixType,Mode> > { typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind; typedef SelfAdjointShape Shape; - - static const int AssumeAliasing = 0; }; template<int UpLo, int SetOpposite, typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version> diff --git a/Eigen/src/Core/SelfCwiseBinaryOp.h b/Eigen/src/Core/SelfCwiseBinaryOp.h index 38185d9d7..78fff1549 100644 --- a/Eigen/src/Core/SelfCwiseBinaryOp.h +++ b/Eigen/src/Core/SelfCwiseBinaryOp.h @@ -13,7 +13,7 @@ namespace Eigen { template<typename Derived> -inline Derived& DenseBase<Derived>::operator*=(const Scalar& other) +EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other) { typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar>()); @@ -21,7 +21,7 @@ inline Derived& DenseBase<Derived>::operator*=(const Scalar& other) } template<typename Derived> -inline Derived& ArrayBase<Derived>::operator+=(const Scalar& other) +EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other) { typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar>()); @@ -29,7 +29,7 @@ inline Derived& ArrayBase<Derived>::operator+=(const Scalar& other) } template<typename Derived> -inline Derived& ArrayBase<Derived>::operator-=(const Scalar& 
other) +EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other) { typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar>()); @@ -37,7 +37,7 @@ inline Derived& ArrayBase<Derived>::operator-=(const Scalar& other) } template<typename Derived> -inline Derived& DenseBase<Derived>::operator/=(const Scalar& other) +EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other) { typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar>()); diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h new file mode 100644 index 000000000..adb055b15 --- /dev/null +++ b/Eigen/src/Core/SpecialFunctions.h @@ -0,0 +1,1050 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPECIAL_FUNCTIONS_H +#define EIGEN_SPECIAL_FUNCTIONS_H + +namespace Eigen { +namespace internal { + +// Parts of this code are based on the Cephes Math Library. +// +// Cephes Math Library Release 2.8: June, 2000 +// Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier +// +// Permission has been kindly provided by the original author +// to incorporate the Cephes software into the Eigen codebase: +// +// From: Stephen Moshier +// To: Eugene Brevdo +// Subject: Re: Permission to wrap several cephes functions in Eigen +// +// Hello Eugene, +// +// Thank you for writing. 
+// +// If your licensing is similar to BSD, the formal way that has been +// handled is simply to add a statement to the effect that you are incorporating +// the Cephes software by permission of the author. +// +// Good luck with your project, +// Steve + +namespace cephes { + +/* polevl (modified for Eigen) + * + * Evaluate polynomial + * + * + * + * SYNOPSIS: + * + * int N; + * Scalar x, y, coef[N+1]; + * + * y = polevl<decltype(x), N>::run( x, coef); + * + * + * + * DESCRIPTION: + * + * Evaluates polynomial of degree N: + * + * y = C_0 + C_1 x + C_2 x^2 + ... + C_N x^N + * + * Coefficients are stored in reverse order: + * + * coef[0] = C_N, ..., coef[N] = C_0. + * + * The function p1evl() assumes that coef[N] = 1.0 and is + * omitted from the array. Its calling arguments are + * otherwise the same as polevl(). + * + * + * The Eigen implementation is templatized. For best speed, store + * coef as a const array (constexpr), e.g. + * + * const double coef[] = {1.0, 2.0, 3.0, ...}; + * + */ +template <typename Scalar, int N> +struct polevl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar x, const Scalar coef[]) { + EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + return polevl<Scalar, N - 1>::run(x, coef) * x + coef[N]; + } +}; + +template <typename Scalar> +struct polevl<Scalar, 0> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar, const Scalar coef[]) { + return coef[0]; + } +}; + +} // end namespace cephes + +/**************************************************************************** + * Implementation of lgamma * + ****************************************************************************/ + +template <typename Scalar> +struct lgamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <typename Scalar> 
+struct lgamma_retval { + typedef Scalar type; +}; + +#ifdef EIGEN_HAS_C99_MATH +template <> +struct lgamma_impl<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(float x) { return ::lgammaf(x); } +}; + +template <> +struct lgamma_impl<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(double x) { return ::lgamma(x); } +}; +#endif + +/**************************************************************************** + * Implementation of digamma (psi) * + ****************************************************************************/ + +template <typename Scalar> +struct digamma_retval { + typedef Scalar type; +}; + +#ifndef EIGEN_HAS_C99_MATH + +template <typename Scalar> +struct digamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +/* + * + * Polynomial evaluation helper for the Psi (digamma) function. + * + * digamma_impl_maybe_poly::run(s) evaluates the asymptotic Psi expansion for + * input Scalar s, assuming s is above 10.0. + * + * If s is above a certain threshold for the given Scalar type, zero + * is returned. Otherwise the polynomial is evaluated with enough + * coefficients for results matching Scalar machine precision. 
+ * + * + */ +template <typename Scalar> +struct digamma_impl_maybe_poly { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + + +template <> +struct digamma_impl_maybe_poly<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(const float s) { + const float A[] = { + -4.16666666666666666667E-3f, + 3.96825396825396825397E-3f, + -8.33333333333333333333E-3f, + 8.33333333333333333333E-2f + }; + + float z; + if (s < 1.0e8f) { + z = 1.0f / (s * s); + return z * cephes::polevl<float, 3>::run(z, A); + } else return 0.0f; + } +}; + +template <> +struct digamma_impl_maybe_poly<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(const double s) { + const double A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2 + }; + + double z; + if (s < 1.0e17) { + z = 1.0 / (s * s); + return z * cephes::polevl<double, 6>::run(z, A); + } + else return 0.0; + } +}; + +template <typename Scalar> +struct digamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar x) { + /* + * + * Psi (digamma) function (modified for Eigen) + * + * + * SYNOPSIS: + * + * double x, y, psi(); + * + * y = psi( x ); + * + * + * DESCRIPTION: + * + * psi(x) = d/dx ln Gamma(x) + * + * is the logarithmic derivative of the gamma function. + * For integer x, + * + * psi(n) = -EUL + sum_{k=1}^{n-1} 1/k. + * + * If x is negative, it is transformed to a positive argument by the + * reflection formula psi(1-x) = psi(x) + pi cot(pi x). + * For general positive x, the argument is made greater than 10 + * using the recurrence psi(x+1) = psi(x) + 1/x. + * Then the following asymptotic expansion is applied: + *
+ * psi(x) = log(x) - 1/(2x) - sum_{k=1}^{inf} B_{2k} / (2k x^{2k}) + * + * where the B_{2k} are Bernoulli numbers. + * + * ACCURACY (double): + * Relative error (except absolute when |psi| < 1): + * arithmetic domain # trials peak rms + * IEEE 0,30 30000 1.3e-15 1.4e-16 + * IEEE -30,0 40000 1.5e-15 2.2e-16 + * + * ACCURACY (float): + * Absolute error, relative when |psi| > 1 : + * arithmetic domain # trials peak rms + * IEEE -33,0 30000 8.2e-7 1.2e-7 + * IEEE 0,33 100000 7.3e-7 7.7e-8 + * + * ERROR MESSAGES: + * message condition value returned + * psi singularity x integer <=0 INFINITY + */ + + Scalar p, q, nz, s, w, y; + bool negative; + + const Scalar maxnum = NumTraits<Scalar>::infinity(); + const Scalar m_pi = EIGEN_PI; + + negative = 0; + nz = 0.0; + + const Scalar zero = 0.0; + const Scalar one = 1.0; + const Scalar half = 0.5; + + if (x <= zero) { + negative = one; + q = x; + p = numext::floor(q); + if (p == q) { + return maxnum; + } + /* Remove the zeros of tan(m_pi x) + * by subtracting the nearest integer from x + */ + nz = q - p; + if (nz != half) { + if (nz > half) { + p += one; + nz = q - p; + } + nz = m_pi / numext::tan(m_pi * nz); + } + else { + nz = zero; + } + x = one - x; + } + + /* use the recurrence psi(x+1) = psi(x) + 1/x. */ + s = x; + w = zero; + while (s < Scalar(10)) { + w += one / s; + s += one; + } + + y = digamma_impl_maybe_poly<Scalar>::run(s); + + y = numext::log(s) - (half / s) - y - w; + + return (negative) ? 
y - nz : y; + } +}; + +#endif // EIGEN_HAS_C99_MATH + +/**************************************************************************** + * Implementation of erf * + ****************************************************************************/ + +template <typename Scalar> +struct erf_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <typename Scalar> +struct erf_retval { + typedef Scalar type; +}; + +#ifdef EIGEN_HAS_C99_MATH +template <> +struct erf_impl<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(float x) { return ::erff(x); } +}; + +template <> +struct erf_impl<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(double x) { return ::erf(x); } +}; +#endif // EIGEN_HAS_C99_MATH + +/*************************************************************************** +* Implementation of erfc * +****************************************************************************/ + +template <typename Scalar> +struct erfc_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <typename Scalar> +struct erfc_retval { + typedef Scalar type; +}; + +#ifdef EIGEN_HAS_C99_MATH +template <> +struct erfc_impl<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(const float x) { return ::erfcf(x); } +}; + +template <> +struct erfc_impl<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(const double x) { return ::erfc(x); } +}; +#endif // EIGEN_HAS_C99_MATH + +/**************************************************************************** + * Implementation of igammac (complemented incomplete gamma integral) * + 
****************************************************************************/ + +template <typename Scalar> +struct igammac_retval { + typedef Scalar type; +}; + +#ifndef EIGEN_HAS_C99_MATH + +template <typename Scalar> +struct igammac_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template <typename Scalar> struct igamma_impl; // predeclare igamma_impl + +template <typename Scalar> +struct igamma_helper { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar big() { assert(false && "big not supported for this type"); return 0.0; } +}; + +template <> +struct igamma_helper<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float machep() { + return NumTraits<float>::epsilon() / 2; // adding machep to 1.0 still yields 1.0 + } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float big() { + // reciprocal of machep(), i.e. 1 / (epsilon / 2) + return 1.0 / (NumTraits<float>::epsilon() / 2); + } +}; + +template <> +struct igamma_helper<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double machep() { + return NumTraits<double>::epsilon() / 2; // adding machep to 1.0 still yields 1.0 + } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double big() { + return 1.0 / NumTraits<double>::epsilon(); + } +}; + +template <typename Scalar> +struct igammac_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + /* igamc() + * + * Complemented incomplete gamma integral (modified for Eigen) + * + * + * + * SYNOPSIS: + * + * double a, x, y, igamc(); + * + * y = igamc( a, x ); + * + * DESCRIPTION: + * + * The function is defined by + * + * + * igamc(a,x) = 1 - igam(a,x) + * + * = 1/Gamma(a) * integral_{x}^{inf} e^{-t} t^{a-1} dt.
+ * + * + * In this implementation a must be positive and x must be non-negative; + * NaN is returned otherwise. + * The integral is evaluated by either a power series or + * continued fraction expansion, depending on the relative + * values of a and x. + * + * ACCURACY (float): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 30000 7.8e-6 5.9e-7 + * + * + * ACCURACY (double): + * + * Tested at random a, x. + * a x Relative error: + * arithmetic domain domain # trials peak rms + * IEEE 0.5,100 0,100 200000 1.9e-14 1.7e-15 + * IEEE 0.01,0.5 0,100 200000 1.4e-13 1.6e-15 + * + */ + /* + Cephes Math Library Release 2.2: June, 1992 + Copyright 1985, 1987, 1992 by Stephen L. Moshier + Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ + const Scalar zero = 0; + const Scalar one = 1; + const Scalar two = 2; + const Scalar machep = igamma_helper<Scalar>::machep(); + const Scalar maxlog = numext::log(NumTraits<Scalar>::highest()); + const Scalar big = igamma_helper<Scalar>::big(); + const Scalar biginv = 1 / big; + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); + const Scalar inf = NumTraits<Scalar>::infinity(); + + Scalar ans, ax, c, yc, r, t, y, z; + Scalar pk, pkm1, pkm2, qk, qkm1, qkm2; + + if ((x < zero) || ( a <= zero)) { + // domain error + return nan; + } + + if ((x < one) || (x < a)) { + return (one - igamma_impl<Scalar>::run(a, x)); + } + + if (x == inf) return zero; // std::isinf crashes on CUDA + + /* Compute x**a * exp(-x) / gamma(a) */ + ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a); + if (ax < -maxlog) { // underflow + return zero; + } + ax = numext::exp(ax); + + // continued fraction + y = one - a; + z = x + y + one; + c = zero; + pkm2 = one; + qkm2 = x; + pkm1 = x + one; + qkm1 = z * x; + ans = pkm1 / qkm1; + + while (true) { + c += one; + y += one; + z += two; + yc = y * c; + pk = pkm1 * z - pkm2 * yc; + qk = qkm1 * z - qkm2 * yc; + if (qk != zero) { + r = pk / qk; + t = numext::abs((ans - r) / r); + ans = r; + } else
{ + t = one; + } + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + if (numext::abs(pk) > big) { + pkm2 *= biginv; + pkm1 *= biginv; + qkm2 *= biginv; + qkm1 *= biginv; + } + if (t <= machep) break; + } + + return (ans * ax); + } +}; + +#endif // EIGEN_HAS_C99_MATH + +/**************************************************************************** + * Implementation of igamma (incomplete gamma integral) * + ****************************************************************************/ + +template <typename Scalar> +struct igamma_retval { + typedef Scalar type; +}; + +#ifndef EIGEN_HAS_C99_MATH + +template <typename Scalar> +struct igamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template <typename Scalar> +struct igamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + /* igam() + * Incomplete gamma integral + * + * + * + * SYNOPSIS: + * + * double a, x, y, igam(); + * + * y = igam( a, x ); + * + * DESCRIPTION: + * + * The function is defined by + * + * x + * - + * 1 | | -t a-1 + * igam(a,x) = ----- | e t dt. + * - | | + * | (a) - + * 0 + * + * + * In this implementation both arguments must be positive. + * The integral is evaluated by either a power series or + * continued fraction expansion, depending on the relative + * values of a and x. + * + * ACCURACY (double): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 200000 3.6e-14 2.9e-15 + * IEEE 0,100 300000 9.9e-14 1.5e-14 + * + * + * ACCURACY (float): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 20000 7.8e-6 5.9e-7 + * + */ + /* + Cephes Math Library Release 2.2: June, 1992 + Copyright 1985, 1987, 1992 by Stephen L. 
Moshier + Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ + + + /* left tail of incomplete gamma function: + * + * inf. k + * a -x - x + * x e > ---------- + * - - + * k=0 | (a+k+1) + * + */ + const Scalar zero = 0; + const Scalar one = 1; + const Scalar machep = igamma_helper<Scalar>::machep(); + const Scalar maxlog = numext::log(NumTraits<Scalar>::highest()); + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); + + double ans, ax, c, r; + + if (x == zero) return zero; + + if ((x < zero) || ( a <= zero)) { // domain error + return nan; + } + + if ((x > one) && (x > a)) { + return (one - igammac_impl<Scalar>::run(a, x)); + } + + /* Compute x**a * exp(-x) / gamma(a) */ + ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a); + if (ax < -maxlog) { + // underflow + return zero; + } + ax = numext::exp(ax); + + /* power series */ + r = a; + c = one; + ans = one; + + while (true) { + r += one; + c *= x/r; + ans += c; + if (c/ans <= machep) break; + } + + return (ans * ax / a); + } +}; + +#endif // EIGEN_HAS_C99_MATH + +/**************************************************************************** + * Implementation of Riemann zeta function of two arguments * + ****************************************************************************/ + +template <typename Scalar> +struct zeta_retval { + typedef Scalar type; +}; + +#ifndef EIGEN_HAS_C99_MATH + +template <typename Scalar> +struct zeta_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar x, Scalar q) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template <typename Scalar> +struct zeta_impl_series { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <> +struct zeta_impl_series<float> { + EIGEN_DEVICE_FUNC + static 
EIGEN_STRONG_INLINE bool run(float& a, float& b, float& s, const float x, const float machep) { + int i = 0; + while(i < 9) + { + i += 1; + a += 1.0f; + b = numext::pow( a, -x ); + s += b; + if( numext::abs(b/s) < machep ) + return true; + } + + //Return whether we are done + return false; + } +}; + +template <> +struct zeta_impl_series<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE bool run(double& a, double& b, double& s, const double x, const double machep) { + int i = 0; + while( (i < 9) || (a <= 9.0) ) + { + i += 1; + a += 1.0; + b = numext::pow( a, -x ); + s += b; + if( numext::abs(b/s) < machep ) + return true; + } + + //Return whether we are done + return false; + } +}; + +template <typename Scalar> +struct zeta_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar x, Scalar q) { + /* zeta.c + * + * Riemann zeta function of two arguments + * + * + * + * SYNOPSIS: + * + * double x, q, y, zeta(); + * + * y = zeta( x, q ); + * + * + * + * DESCRIPTION: + * + * + * + * inf. + * - -x + * zeta(x,q) = > (k+q) + * - + * k=0 + * + * where x > 1 and q is not a negative integer or zero. + * The Euler-Maclaurin summation formula is used to obtain + * the expansion + * + * n + * - -x + * zeta(x,q) = > (k+q) + * - + * k=1 + * + * 1-x inf. B x(x+1)...(x+2j) + * (n+q) 1 - 2j + * + --------- - ------- + > -------------------- + * x-1 x - x+2j+1 + * 2(n+q) j=1 (2j)! (n+q) + * + * where the B2j are Bernoulli numbers. Note that (see zetac.c) + * zeta(x,1) = zetac(x) + 1. + * + * + * + * ACCURACY: + * + * Relative error for single precision: + * arithmetic domain # trials peak rms + * IEEE 0,25 10000 6.9e-7 1.0e-7 + * + * Large arguments may produce underflow in powf(), in which + * case the results are inaccurate. + * + * REFERENCE: + * + * Gradshteyn, I. S., and I. M. Ryzhik, Tables of Integrals, + * Series, and Products, p. 1073; Academic Press, 1980. 
+ * + */ + + int i; + Scalar p, r, a, b, k, s, t, w; + + const Scalar A[] = { + Scalar(12.0), + Scalar(-720.0), + Scalar(30240.0), + Scalar(-1209600.0), + Scalar(47900160.0), + Scalar(-1.8924375803183791606e9), /*1.307674368e12/691*/ + Scalar(7.47242496e10), + Scalar(-2.950130727918164224e12), /*1.067062284288e16/3617*/ + Scalar(1.1646782814350067249e14), /*5.109094217170944e18/43867*/ + Scalar(-4.5979787224074726105e15), /*8.028576626982912e20/174611*/ + Scalar(1.8152105401943546773e17), /*1.5511210043330985984e23/854513*/ + Scalar(-7.1661652561756670113e18) /*1.6938241367317436694528e27/236364091*/ + }; + + const Scalar maxnum = NumTraits<Scalar>::infinity(); + const Scalar zero = 0.0, half = 0.5, one = 1.0; + const Scalar machep = igamma_helper<Scalar>::machep(); + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); + + if( x == one ) + return maxnum; + + if( x < one ) + { + return nan; + } + + if( q <= zero ) + { + if(q == numext::floor(q)) + { + return maxnum; + } + p = x; + r = numext::floor(p); + if (p != r) + return nan; + } + + /* Permit negative q but continue sum until n+q > +9 . + * This case should be handled by a reflection formula. + * If q<0 and x is an integer, there is a relation to + * the polygamma function. 
+ */ + s = numext::pow( q, -x ); + a = q; + b = zero; + // Run the summation in a helper function that is specific to the floating precision + if (zeta_impl_series<Scalar>::run(a, b, s, x, machep)) { + return s; + } + + w = a; + s += b*w/(x-one); + s -= half * b; + a = one; + k = zero; + for( i=0; i<12; i++ ) + { + a *= x + k; + b /= w; + t = a*b/A[i]; + s = s + t; + t = numext::abs(t/s); + if( t < machep ) + return s; + k += one; + a *= x + k; + b /= w; + k += one; + } + return s; + } +}; + +#endif // EIGEN_HAS_C99_MATH + +/**************************************************************************** + * Implementation of polygamma function * + ****************************************************************************/ + +template <typename Scalar> +struct polygamma_retval { + typedef Scalar type; +}; + +#ifndef EIGEN_HAS_C99_MATH + +template <typename Scalar> +struct polygamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar n, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template <typename Scalar> +struct polygamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar n, Scalar x) { + Scalar zero = 0.0, one = 1.0; + Scalar nplus = n + one; + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); + + // Check that n is an integer + if (numext::floor(n) != n) { + return nan; + } + // Just return the digamma function for n = 1 + else if (n == zero) { + return digamma_impl<Scalar>::run(x); + } + // Use the same implementation as scipy + else { + Scalar factorial = numext::exp(lgamma_impl<Scalar>::run(nplus)); + return numext::pow(-one, nplus) * factorial * zeta_impl<Scalar>::run(nplus, x); + } + } +}; + +#endif // EIGEN_HAS_C99_MATH + +} // end namespace internal + +namespace numext { + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar) + lgamma(const Scalar& x) { + return 
EIGEN_MATHFUNC_IMPL(lgamma, Scalar)::run(x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(digamma, Scalar) + digamma(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(digamma, Scalar)::run(x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(zeta, Scalar) +zeta(const Scalar& x, const Scalar& q) { + return EIGEN_MATHFUNC_IMPL(zeta, Scalar)::run(x, q); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(polygamma, Scalar) +polygamma(const Scalar& n, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(polygamma, Scalar)::run(n, x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erf, Scalar) + erf(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(erf, Scalar)::run(x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar) + erfc(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar) + igamma(const Scalar& a, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(igamma, Scalar)::run(a, x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar) + igammac(const Scalar& a, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(igammac, Scalar)::run(a, x); +} + +} // end namespace numext + + +} // end namespace Eigen + +#endif // EIGEN_SPECIAL_FUNCTIONS_H diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h index 9a2f4f1eb..513742f34 100644 --- a/Eigen/src/Core/Stride.h +++ b/Eigen/src/Core/Stride.h @@ -31,8 +31,8 @@ namespace Eigen { * arguments to the constructor. * * Indeed, this class takes two template parameters: - * \param _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime. - * \param _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime. 
+ * \tparam _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime. + * \tparam _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime. * * Here is an example: * \include Map_general_stride.cpp diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index 5b66eb5e1..bc232526a 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -13,20 +13,6 @@ namespace Eigen { -/** \class Transpose - * \ingroup Core_Module - * - * \brief Expression of the transpose of a matrix - * - * \param MatrixType the type of the object of which we are taking the transpose - * - * This class represents an expression of the transpose of a matrix. - * It is the return type of MatrixBase::transpose() and MatrixBase::adjoint() - * and most of the time this is the only way it is used. - * - * \sa MatrixBase::transpose(), MatrixBase::adjoint() - */ - namespace internal { template<typename MatrixType> struct traits<Transpose<MatrixType> > : public traits<MatrixType> @@ -50,11 +36,26 @@ struct traits<Transpose<MatrixType> > : public traits<MatrixType> template<typename MatrixType, typename StorageKind> class TransposeImpl; +/** \class Transpose + * \ingroup Core_Module + * + * \brief Expression of the transpose of a matrix + * + * \tparam MatrixType the type of the object of which we are taking the transpose + * + * This class represents an expression of the transpose of a matrix. + * It is the return type of MatrixBase::transpose() and MatrixBase::adjoint() + * and most of the time this is the only way it is used. 
+ * + * \sa MatrixBase::transpose(), MatrixBase::adjoint() + */ template<typename MatrixType> class Transpose : public TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind> { public: + typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested; + typedef typename TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base; EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose) typedef typename internal::remove_all<MatrixType>::type NestedExpression; @@ -69,16 +70,16 @@ template<typename MatrixType> class Transpose /** \returns the nested expression */ EIGEN_DEVICE_FUNC - const typename internal::remove_all<typename MatrixType::Nested>::type& + const typename internal::remove_all<MatrixTypeNested>::type& nestedExpression() const { return m_matrix; } /** \returns the nested expression */ EIGEN_DEVICE_FUNC - typename internal::remove_all<typename MatrixType::Nested>::type& - nestedExpression() { return m_matrix.const_cast_derived(); } + typename internal::remove_reference<MatrixTypeNested>::type& + nestedExpression() { return m_matrix; } protected: - typename MatrixType::Nested m_matrix; + typename internal::ref_selector<MatrixType>::non_const_type m_matrix; }; namespace internal { diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h index 3b1c1815d..19c17bb4a 100644 --- a/Eigen/src/Core/Transpositions.h +++ b/Eigen/src/Core/Transpositions.h @@ -12,35 +12,6 @@ namespace Eigen { -/** \class Transpositions - * \ingroup Core_Module - * - * \brief Represents a sequence of transpositions (row/column interchange) - * - * \param SizeAtCompileTime the number of transpositions, or Dynamic - * \param MaxSizeAtCompileTime the maximum number of transpositions, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it. 
- * - * This class represents a permutation transformation as a sequence of \em n transpositions - * \f$[T_{n-1} \ldots T_{i} \ldots T_{0}]\f$. It is internally stored as a vector of integers \c indices. - * Each transposition \f$ T_{i} \f$ applied on the left of a matrix (\f$ T_{i} M\f$) interchanges - * the rows \c i and \c indices[i] of the matrix \c M. - * A transposition applied on the right (e.g., \f$ M T_{i}\f$) yields a column interchange. - * - * Compared to the class PermutationMatrix, such a sequence of transpositions is what is - * computed during a decomposition with pivoting, and it is faster when applying the permutation in-place. - * - * To apply a sequence of transpositions to a matrix, simply use the operator * as in the following example: - * \code - * Transpositions tr; - * MatrixXf mat; - * mat = tr * mat; - * \endcode - * In this example, we detect that the matrix appears on both side, and so the transpositions - * are applied in-place without any temporary or extra copy. - * - * \sa class PermutationMatrix - */ - template<typename Derived> class TranspositionsBase { @@ -154,6 +125,35 @@ struct traits<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageInde }; } +/** \class Transpositions + * \ingroup Core_Module + * + * \brief Represents a sequence of transpositions (row/column interchange) + * + * \tparam SizeAtCompileTime the number of transpositions, or Dynamic + * \tparam MaxSizeAtCompileTime the maximum number of transpositions, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it. + * + * This class represents a permutation transformation as a sequence of \em n transpositions + * \f$[T_{n-1} \ldots T_{i} \ldots T_{0}]\f$. It is internally stored as a vector of integers \c indices. + * Each transposition \f$ T_{i} \f$ applied on the left of a matrix (\f$ T_{i} M\f$) interchanges + * the rows \c i and \c indices[i] of the matrix \c M. 
+ * A transposition applied on the right (e.g., \f$ M T_{i}\f$) yields a column interchange. + * + * Compared to the class PermutationMatrix, such a sequence of transpositions is what is + * computed during a decomposition with pivoting, and it is faster when applying the permutation in-place. + * + * To apply a sequence of transpositions to a matrix, simply use the operator * as in the following example: + * \code + * Transpositions tr; + * MatrixXf mat; + * mat = tr * mat; + * \endcode + * In this example, we detect that the matrix appears on both side, and so the transpositions + * are applied in-place without any temporary or extra copy. + * + * \sa class PermutationMatrix + */ + template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex> class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> > { @@ -325,7 +325,7 @@ class TranspositionsWrapper protected: - const typename IndicesType::Nested m_indices; + typename IndicesType::Nested m_indices; }; diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 099a02ec3..e6d137e40 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -168,7 +168,7 @@ namespace internal { template<typename MatrixType, unsigned int _Mode> struct traits<TriangularView<MatrixType, _Mode> > : traits<MatrixType> { - typedef typename ref_selector<MatrixType>::type MatrixTypeNested; + typedef typename ref_selector<MatrixType>::non_const_type MatrixTypeNested; typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedNonRef; typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned; typedef typename MatrixType::PlainObject FullMatrixType; @@ -213,7 +213,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView IsVectorAtCompileTime = false }; - // FIXME This, combined with const_cast_derived in transpose() leads to a const-correctness loophole 
EIGEN_DEVICE_FUNC explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix) {} @@ -235,7 +234,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView /** \returns a reference to the nested expression */ EIGEN_DEVICE_FUNC - NestedExpression& nestedExpression() { return *const_cast<NestedExpression*>(&m_matrix); } + NestedExpression& nestedExpression() { return m_matrix; } typedef TriangularView<const MatrixConjugateReturnType,Mode> ConjugateReturnType; /** \sa MatrixBase::conjugate() const */ @@ -255,7 +254,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView inline TransposeReturnType transpose() { EIGEN_STATIC_ASSERT_LVALUE(MatrixType) - typename MatrixType::TransposeReturnType tmp(m_matrix.const_cast_derived()); + typename MatrixType::TransposeReturnType tmp(m_matrix); return TransposeReturnType(tmp); } @@ -418,7 +417,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat { EIGEN_STATIC_ASSERT_LVALUE(TriangularViewType); Base::check_coordinates_internal(row, col); - return derived().nestedExpression().const_cast_derived().coeffRef(row, col); + return derived().nestedExpression().coeffRef(row, col); } /** Assigns a triangular matrix to a triangular part of a dense matrix */ @@ -595,14 +594,7 @@ template<typename Derived> template<typename DenseDerived> void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const { - if(internal::traits<Derived>::Flags & EvalBeforeAssigningBit) - { - typename internal::plain_matrix_type<Derived>::type other_evaluated(rows(), cols()); - evalToLazy(other_evaluated); - other.derived().swap(other_evaluated); - } - else - evalToLazy(other.derived()); + evalToLazy(other.derived()); } /*************************************************************************** @@ -711,10 +703,6 @@ struct evaluator_traits<TriangularView<MatrixType,Mode> > { typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind; 
typedef typename glue_shapes<typename evaluator_traits<MatrixType>::Shape, TriangularShape>::type Shape; - - // 1 if assignment A = B assumes aliasing when B is of type T and thus B needs to be evaluated into a - // temporary; 0 if not. - static const int AssumeAliasing = 0; }; template<typename MatrixType, unsigned int Mode> @@ -788,7 +776,8 @@ public: }; template<int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType, typename Functor> -EIGEN_DEVICE_FUNC void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -812,7 +801,8 @@ EIGEN_DEVICE_FUNC void call_triangular_assignment_loop(const DstXprType& dst, co } template<int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType> -EIGEN_DEVICE_FUNC void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& src) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& src) { call_triangular_assignment_loop<Mode,SetOpposite>(dst, src, internal::assign_op<typename DstXprType::Scalar>()); } diff --git a/Eigen/src/Core/VectorBlock.h b/Eigen/src/Core/VectorBlock.h index 216c568c4..d72fbf7e9 100644 --- a/Eigen/src/Core/VectorBlock.h +++ b/Eigen/src/Core/VectorBlock.h @@ -13,13 +13,23 @@ namespace Eigen { +namespace internal { +template<typename VectorType, int Size> +struct traits<VectorBlock<VectorType, Size> > + : public traits<Block<VectorType, + traits<VectorType>::Flags & RowMajorBit ? 1 : Size, + traits<VectorType>::Flags & RowMajorBit ? 
Size : 1> > +{ +}; +} + /** \class VectorBlock * \ingroup Core_Module * * \brief Expression of a fixed-size or dynamic-size sub-vector * - * \param VectorType the type of the object in which we are taking a sub-vector - * \param Size size of the sub-vector we are taking at compile time (optional) + * \tparam VectorType the type of the object in which we are taking a sub-vector + * \tparam Size size of the sub-vector we are taking at compile time (optional) * * This class represents an expression of either a fixed-size or dynamic-size sub-vector. * It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment<int>(Index) and @@ -43,17 +53,6 @@ namespace Eigen { * * \sa class Block, DenseBase::segment(Index,Index,Index,Index), DenseBase::segment(Index,Index) */ - -namespace internal { -template<typename VectorType, int Size> -struct traits<VectorBlock<VectorType, Size> > - : public traits<Block<VectorType, - traits<VectorType>::Flags & RowMajorBit ? 1 : Size, - traits<VectorType>::Flags & RowMajorBit ? Size : 1> > -{ -}; -} - template<typename VectorType, int Size> class VectorBlock : public Block<VectorType, internal::traits<VectorType>::Flags & RowMajorBit ? 
1 : Size, diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h index dbc272dae..193891189 100644 --- a/Eigen/src/Core/VectorwiseOp.h +++ b/Eigen/src/Core/VectorwiseOp.h @@ -115,7 +115,7 @@ struct member_lpnorm { typedef ResultType result_type; template<typename Scalar, int Size> struct Cost { enum { value = (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost }; }; - EIGEN_DEVICE_FUNC explicit member_lpnorm() {} + EIGEN_DEVICE_FUNC member_lpnorm() {} template<typename XprType> EIGEN_DEVICE_FUNC inline ResultType operator()(const XprType& mat) const { return mat.template lpNorm<p>(); } @@ -124,7 +124,7 @@ struct member_lpnorm { template <typename BinaryOp, typename Scalar> struct member_redux { typedef typename result_of< - BinaryOp(Scalar,Scalar) + BinaryOp(const Scalar&,const Scalar&) >::type result_type; template<typename _Scalar, int Size> struct Cost { enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; }; @@ -141,8 +141,8 @@ struct member_redux { * * \brief Pseudo expression providing partial reduction operations * - * \param ExpressionType the type of the object on which to do partial reductions - * \param Direction indicates the direction of the redux (#Vertical or #Horizontal) + * \tparam ExpressionType the type of the object on which to do partial reductions + * \tparam Direction indicates the direction of the redux (#Vertical or #Horizontal) * * This class represents a pseudo expression with partial reduction features. 
* It is the return type of DenseBase::colwise() and DenseBase::rowwise() @@ -187,11 +187,11 @@ template<typename ExpressionType, int Direction> class VectorwiseOp protected: - /** \internal - * \returns the i-th subvector according to the \c Direction */ typedef typename internal::conditional<isVertical, typename ExpressionType::ColXpr, typename ExpressionType::RowXpr>::type SubVector; + /** \internal + * \returns the i-th subvector according to the \c Direction */ EIGEN_DEVICE_FUNC SubVector subVector(Index i) { diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h index 7aac0b6e1..d71dfc968 100644 --- a/Eigen/src/Core/Visitor.h +++ b/Eigen/src/Core/Visitor.h @@ -197,7 +197,7 @@ struct functor_traits<max_coeff_visitor<Scalar> > { /** \returns the minimum of all coefficients of *this and puts in *row and *col its location. * \warning the result is undefined if \c *this contains NaN. * - * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visitor(), DenseBase::minCoeff() + * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff() */ template<typename Derived> template<typename IndexType> @@ -215,7 +215,7 @@ DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const /** \returns the minimum of all coefficients of *this and puts in *index its location. * \warning the result is undefined if \c *this contains NaN. * - * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::minCoeff() + * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff() */ template<typename Derived> template<typename IndexType> @@ -233,7 +233,7 @@ DenseBase<Derived>::minCoeff(IndexType* index) const /** \returns the maximum of all coefficients of *this and puts in *row and *col its location. 
* \warning the result is undefined if \c *this contains NaN.
   *
-  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff()
+  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
   */
 template<typename Derived>
 template<typename IndexType>
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index c4bd6bd53..98d8e029f 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -10,11 +10,6 @@
 #ifndef EIGEN_MATH_FUNCTIONS_AVX_H
 #define EIGEN_MATH_FUNCTIONS_AVX_H
 
-// For some reason, this function didn't make it into the avxintirn.h
-// used by the compiler, so we'll just wrap it.
-#define _mm256_setr_m128(lo, hi) \
-  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
-
 /* The sin, cos, exp, and log functions of this file are loosely derived from
  * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
  */
@@ -23,6 +18,28 @@ namespace Eigen {
 
 namespace internal {
 
+inline Packet8i pshiftleft(Packet8i v, int n)  // left-shifts every 32-bit lane of v by n bits
+{
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_slli_epi32(v, n);
+#else
+  __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n);  // without AVX2: shift each 128-bit half with SSE
+  __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+inline Packet8f pshiftright(Packet8f v, int n)  // NOTE: shifts the raw bits of v right by n, then converts the integers to float
+{
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n));
+#else
+  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n);  // without AVX2: shift each 128-bit half with SSE
+  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n);
+  return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1));
+#endif
+}
+
 // Sine function
 // Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and
 // evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. 
The interpolants
@@ -54,17 +71,8 @@ psin<Packet8f>(const Packet8f& _x) {
   // Make a mask for the entries that need flipping, i.e. wherever the shift
   // is odd.
   Packet8i shift_ints = _mm256_cvtps_epi32(shift);
-  Packet8i shift_isodd =
-      _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one)));
-#ifdef EIGEN_VECTORIZE_AVX2
-  Packet8i sign_flip_mask = _mm256_slli_epi32(shift_isodd, 31);
-#else
-  __m128i lo =
-      _mm_slli_epi32(_mm256_extractf128_si256(shift_isodd, 0), 31);
-  __m128i hi =
-      _mm_slli_epi32(_mm256_extractf128_si256(shift_isodd, 1), 31);
-  Packet8i sign_flip_mask = _mm256_setr_m128(lo, hi);
-#endif
+  Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one)));
+  Packet8i sign_flip_mask = pshiftleft(shift_isodd, 31);
 
   // Create a mask for which interpolant to use, i.e. if z > 1, then the mask
   // is set to ones for that entry.
@@ -142,15 +150,7 @@ plog<Packet8f>(const Packet8f& _x) {
   // Truncate input values to the minimum positive normal.
   x = pmax(x, p8f_min_norm_pos);
 
-// Extract the shifted exponents (No bitwise shifting in regular AVX, so
-// convert to SSE and do it there).
-#ifdef EIGEN_VECTORIZE_AVX2
-  Packet8f emm0 = _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(x), 23));
-#else
-  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(x), 0), 23);
-  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(x), 1), 23);
-  Packet8f emm0 = _mm256_cvtepi32_ps(_mm256_setr_m128(lo, hi));
-#endif
+  Packet8f emm0 = pshiftright(x,23);
   Packet8f e = _mm256_sub_ps(emm0, p8f_126f);
 
   // Set the exponents to -1, i.e. x are in the range [0.5,1).
@@ -259,18 +259,61 @@ pexp<Packet8f>(const Packet8f& _x) {
 
   // Build emm0 = 2^m. 
   Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127));
-#ifdef EIGEN_VECTORIZE_AVX2
-  emm0 = _mm256_slli_epi32(emm0, 23);
-#else
-  __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(emm0, 0), 23);
-  __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(emm0, 1), 23);
-  emm0 = _mm256_setr_m128(lo, hi);
-#endif
+  emm0 = pshiftleft(emm0, 23);
 
   // Return 2^m * exp(r).
   return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x);
 }
 
+// Hyperbolic Tangent function.
+// Doesn't do anything fancy, just a 13/6-degree rational interpolant which
+// is accurate up to a couple of ulp in the range [-9, 9], outside of which the
+// fl(tanh(x)) = +/-1.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+ptanh<Packet8f>(const Packet8f& _x) {
+  // Clamp the inputs to the range [-9, 9] since anything outside
+  // this range is +/-1.0f in single-precision.
+  _EIGEN_DECLARE_CONST_Packet8f(plus_9, 9.0f);
+  _EIGEN_DECLARE_CONST_Packet8f(minus_9, -9.0f);
+  const Packet8f x = pmax(p8f_minus_9, pmin(p8f_plus_9, _x));
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_1, 4.89352455891786e-03f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_3, 6.37261928875436e-04f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_5, 1.48572235717979e-05f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_7, 5.12229709037114e-08f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_9, -8.60467152213735e-11f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_11, 2.00018790482477e-13f);
+  _EIGEN_DECLARE_CONST_Packet8f(alpha_13, -2.76076847742355e-16f);
+
+  // The monomial coefficients of the denominator polynomial (even).
+  _EIGEN_DECLARE_CONST_Packet8f(beta_0, 4.89352518554385e-03f);
+  _EIGEN_DECLARE_CONST_Packet8f(beta_2, 2.26843463243900e-03f);
+  _EIGEN_DECLARE_CONST_Packet8f(beta_4, 1.18534705686654e-04f);
+  _EIGEN_DECLARE_CONST_Packet8f(beta_6, 1.19825839466702e-06f);
+
+  // Since the polynomials are odd/even, we need x^2. 
+  const Packet8f x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial p (Horner scheme in x^2, then one multiply by x).
+  Packet8f p = pmadd(x2, p8f_alpha_13, p8f_alpha_11);
+  p = pmadd(x2, p, p8f_alpha_9);
+  p = pmadd(x2, p, p8f_alpha_7);
+  p = pmadd(x2, p, p8f_alpha_5);
+  p = pmadd(x2, p, p8f_alpha_3);
+  p = pmadd(x2, p, p8f_alpha_1);
+  p = pmul(x, p);
+
+  // Evaluate the denominator polynomial q (Horner scheme in x^2).
+  Packet8f q = pmadd(x2, p8f_beta_6, p8f_beta_4);
+  q = pmadd(x2, q, p8f_beta_2);
+  q = pmadd(x2, q, p8f_beta_0);
+
+  // Divide the numerator by the denominator.
+  return pdiv(p, q);
+}
+
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
 pexp<Packet4d>(const Packet4d& _x) {
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 7161f3867..ba2a6c1e1 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -68,6 +68,7 @@ template<> struct packet_traits<float> : default_packet_traits
     HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
     HasBlend = 1,
     HasRound = 1,
     HasFloor = 1,
diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
new file mode 100644
index 000000000..281b8e4c6
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -0,0 +1,497 @@
+// Standard 16-bit float type, mostly useful for GPUs. Defines a new
+// class Eigen::half (inheriting from CUDA's __half struct) with
+// operator overloads such that it behaves basically as an arithmetic
+// type. It will be quite slow on CPUs (so it is recommended to stay
+// in fp32 for CPUs, except for simple parameter conversions, I/O
+// to disk and the like), but fast on GPUs.
+//
+//
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+//
+// The conversion routines are Copyright (c) Fabian Giesen, 2016.
+// The original license follows:
+//
+// Copyright (c) Fabian Giesen, 2016
+// All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted.
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef EIGEN_HALF_CUDA_H
+#define EIGEN_HALF_CUDA_H
+
+#if __cplusplus > 199711L
+#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
+#else
+#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type()
+#endif
+
+
+#if !defined(EIGEN_HAS_CUDA_FP16)
+
+// Make our own __half definition that is similar to CUDA's.
+struct __half {
+  unsigned short x;
+};
+
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
+
+} // end namespace internal
+
+// Class definition. 
+struct half : public __half {
+  EIGEN_DEVICE_FUNC half() {}
+
+  EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {}
+  EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {}
+
+  explicit EIGEN_DEVICE_FUNC half(bool b)
+      : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}  // 0x3c00 is the binary16 bit pattern of 1.0
+  explicit EIGEN_DEVICE_FUNC half(int i)
+      : __half(internal::float_to_half_rtne(static_cast<float>(i))) {}
+  explicit EIGEN_DEVICE_FUNC half(long l)
+      : __half(internal::float_to_half_rtne(static_cast<float>(l))) {}
+  explicit EIGEN_DEVICE_FUNC half(long long ll)
+      : __half(internal::float_to_half_rtne(static_cast<float>(ll))) {}
+  explicit EIGEN_DEVICE_FUNC half(float f)
+      : __half(internal::float_to_half_rtne(f)) {}
+  explicit EIGEN_DEVICE_FUNC half(double d)
+      : __half(internal::float_to_half_rtne(static_cast<float>(d))) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
+    // +0.0 and -0.0 become false, everything else becomes true.
+    return (x & 0x7fff) != 0;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
+    return static_cast<signed char>(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
+    return static_cast<unsigned char>(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
+    return static_cast<short>(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
+    return static_cast<unsigned short>(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
+    return static_cast<int>(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
+    return static_cast<unsigned int>(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
+    return static_cast<long>(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
+    return static_cast<unsigned long>(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
+    return static_cast<long long>(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
+    return static_cast<unsigned long long>(internal::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
+    return internal::half_to_float(*this);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
+    return static_cast<double>(internal::half_to_float(*this));
+  }
+
+  EIGEN_DEVICE_FUNC half& operator=(const half& other) {
+    x = other.x;
+    return *this;
+  }
+};
+
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+
+// Intrinsics for native fp16 support. On current hardware these are no
+// faster than fp32 arithmetic (the half2 versions are needed for an ALU
+// speedup), but they save the conversion steps back and forth. They are
+// marked inline because this header is included from many translation units.
+
+__device__ EIGEN_STRONG_INLINE half operator + (const half& a, const half& b) {
+  return __hadd(a, b);
+}
+__device__ EIGEN_STRONG_INLINE half operator * (const half& a, const half& b) {
+  return __hmul(a, b);
+}
+__device__ EIGEN_STRONG_INLINE half operator - (const half& a, const half& b) {
+  return __hsub(a, b);
+}
+__device__ EIGEN_STRONG_INLINE half operator / (const half& a, const half& b) {
+  float num = __half2float(a);  // the division itself is carried out in fp32
+  float denom = __half2float(b);
+  return __float2half(num / denom);
+}
+__device__ EIGEN_STRONG_INLINE half operator - (const half& a) {
+  return __hneg(a);
+}
+__device__ EIGEN_STRONG_INLINE half& operator += (half& a, const half& b) {
+  a = a + b;
+  return a;
+}
+__device__ EIGEN_STRONG_INLINE half& operator *= (half& a, const half& b) {
+  a = a * b;
+  return a;
+}
+__device__ EIGEN_STRONG_INLINE half& operator -= (half& a, const half& b) {
+  a = a - b;
+  return a;
+}
+__device__ EIGEN_STRONG_INLINE half& operator /= (half& a, const half& b) {
+  a = a / b;
+  return a;
+}
+__device__ EIGEN_STRONG_INLINE bool operator == (const half& a, const half& b) {
+  return __heq(a, b);
+}
+__device__ EIGEN_STRONG_INLINE bool operator != (const half& a, const half& b) {
+  return __hne(a, b);
+}
+__device__ EIGEN_STRONG_INLINE bool operator < (const half& a, const half& b) {
+  return __hlt(a, b);
+}
+__device__ EIGEN_STRONG_INLINE bool operator <= (const half& a, const half& b) {
+  return __hle(a, b);
+}
+__device__ EIGEN_STRONG_INLINE bool operator > (const half& a, const half& b) {
+  return __hgt(a, b);
+}
+__device__ EIGEN_STRONG_INLINE bool operator >= (const half& a, const half& b) {
+  return __hge(a, b);
+}
+
+#else  // Emulate support for half floats
+
+// Definitions for CPUs and older CUDA, mostly working through conversion
+// to/from fp32.
+
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
+  return half(float(a) + float(b));
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
+  return half(float(a) * float(b));
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
+  return half(float(a) - float(b));
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
+  return half(float(a) / float(b));
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
+  half result;
+  result.x = a.x ^ 0x8000;  // negate by flipping the sign bit
+  return result;
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
+  a = half(float(a) + float(b));
+  return a;
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
+  a = half(float(a) * float(b));
+  return a;
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
+  a = half(float(a) - float(b));
+  return a;
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
+  a = half(float(a) / float(b));
+  return a;
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
+  return float(a) == float(b);
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
+  return float(a) != float(b);
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
+  return float(a) < float(b);
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
+  return float(a) <= float(b);
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
+  return float(a) > float(b);
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
+  return float(a) >= float(b);
+}
+
+#endif  // Emulate support for half floats
+
+// Division by an index. Do it in full float precision to avoid accuracy
+// issues in converting the denominator to half.
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
+  return Eigen::half(static_cast<float>(a) / static_cast<float>(b));
+}
+
+// Conversion routines, including fallbacks for the host or older CUDA.
+// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
+// these in hardware. If we need more performance on older/other CPUs, they are
+// also possible to vectorize directly.
+
+namespace internal {
+
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
+  __half h;
+  h.x = x;
+  return h;
+}
+
+union FP32 {
+  unsigned int u;
+  float f;
+};
+
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+  return __float2half(ff);
+
+#elif defined(EIGEN_HAS_FP16_C)
+  __half h;
+  h.x = _cvtss_sh(ff, 0);
+  return h;
+
+#else
+  FP32 f; f.f = ff;
+
+  const FP32 f32infty = { 255 << 23 };
+  const FP32 f16max = { (127 + 16) << 23 };
+  const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
+  unsigned int sign_mask = 0x80000000u;
+  __half o = { 0 };
+
+  unsigned int sign = f.u & sign_mask;
+  f.u ^= sign;
+
+  // NOTE all the integer compares in this function can be safely
+  // compiled into signed compares since all operands are below
+  // 0x80000000. Important if you want fast straight SSE2 code
+  // (since there's no unsigned PCMPGTD).
+
+  if (f.u >= f16max.u) {  // result is Inf or NaN (all exponent bits set)
+    o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
+  } else {  // (De)normalized number or zero
+    if (f.u < (113 << 23)) {  // resulting FP16 is subnormal or zero
+      // use a magic value to align our 10 mantissa bits at the bottom of
+      // the float. as long as FP addition is round-to-nearest-even this
+      // just works.
+      f.f += denorm_magic.f;
+
+      // and one integer subtract of the bias later, we have our final float!
+      o.x = static_cast<unsigned short>(f.u - denorm_magic.u);
+    } else {
+      unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
+
+      // update exponent, rounding bias part 1
+      f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
+      // rounding bias part 2
+      f.u += mant_odd;
+      // take the bits!
+      o.x = static_cast<unsigned short>(f.u >> 13);
+    }
+  }
+
+  o.x |= static_cast<unsigned short>(sign >> 16);
+  return o;
+#endif
+}
+
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+  return __half2float(h);
+
+#elif defined(EIGEN_HAS_FP16_C)
+  return _cvtsh_ss(h.x);
+
+#else
+  const FP32 magic = { 113 << 23 };
+  const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
+  FP32 o;
+
+  o.u = (h.x & 0x7fff) << 13;             // exponent/mantissa bits
+  unsigned int exp = shifted_exp & o.u;   // just the exponent
+  o.u += (127 - 15) << 23;                // exponent adjust
+
+  // handle exponent special cases
+  if (exp == shifted_exp) {     // Inf/NaN?
+    o.u += (128 - 16) << 23;    // extra exp adjust
+  } else if (exp == 0) {        // Zero/Denormal?
+    o.u += 1 << 23;             // extra exp adjust
+    o.f -= magic.f;             // renormalize
+  }
+
+  o.u |= (h.x & 0x8000) << 16;    // sign bit
+  return o.f;
+#endif
+}
+
+} // end namespace internal
+
+// Traits. 
+
+namespace internal {
+
+template<> struct is_arithmetic<half> { enum { value = true }; };
+
+} // end namespace internal
+
+template<> struct NumTraits<Eigen::half>
+    : GenericNumTraits<Eigen::half>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
+    return internal::raw_uint16_to_half(0x0800);  // raw binary16 bit pattern
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return half(1e-3f); }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
+    return internal::raw_uint16_to_half(0x7bff);  // largest finite value: exponent 0x1e, mantissa all ones
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
+    return internal::raw_uint16_to_half(0xfbff);  // highest() with the sign bit set
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
+    return internal::raw_uint16_to_half(0x7c00);  // exponent all ones, mantissa zero
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
+    return internal::raw_uint16_to_half(0x7e00);  // quiet bit (mantissa MSB) set; 0x7c01 would be a signaling NaN
+  }
+};
+
+// Infinity/NaN checks.
+
+namespace numext {
+
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const Eigen::half& a) {
+  return (a.x & 0x7fff) == 0x7c00;  // sign masked off: exponent all ones, mantissa zero
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hisnan(a);
+#else
+  return (a.x & 0x7fff) > 0x7c00;  // exponent all ones and a non-zero mantissa
+#endif
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const Eigen::half& a) {
+  return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a);
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) {
+  Eigen::half result;
+  result.x = a.x & 0x7FFF;  // clear the sign bit
+  return result;
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exp(const Eigen::half& a) {
+  return Eigen::half(::expf(float(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) {
+  return Eigen::half(::logf(float(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrt(const Eigen::half& a) {
+  return Eigen::half(::sqrtf(float(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half pow(const Eigen::half& a, const Eigen::half& b) {
+  return Eigen::half(::powf(float(a), float(b)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen::half& a) {
+  return Eigen::half(::floorf(float(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceil(const Eigen::half& a) {
+  return Eigen::half(::ceilf(float(a)));
+}
+
+} // end namespace numext
+
+} // end namespace Eigen
+
+// Standard mathematical functions and transcendentals.
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) {
+  Eigen::half result;
+  result.x = a.x & 0x7FFF;  // clear the sign bit
+  return result;
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
+  return Eigen::half(::expf(float(a)));
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
+  return Eigen::half(::logf(float(a)));
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) {
+  return Eigen::half(::sqrtf(float(a)));
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) {
+  return Eigen::half(::powf(float(a), float(b)));
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) {
+  return Eigen::half(::floorf(float(a)));
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) {
+  return Eigen::half(::ceilf(float(a)));
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isnan)(const Eigen::half& a) {
+  return (Eigen::numext::isnan)(a);
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isinf)(const Eigen::half& a) {
+  return (Eigen::numext::isinf)(a);
+}
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int (isfinite)(const Eigen::half& a) {
+  return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a);
+}
+
+
+namespace std {
+
+#if __cplusplus > 199711L
+template <>
+struct hash<Eigen::half> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const {
+    return static_cast<std::size_t>(a.x);  // the raw 16-bit pattern is the hash value
+  }
+};
+#endif
+
+} // end namespace std
+
+
+// Add the missing shfl_xor intrinsic
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+  return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
+}
+#endif
+
+// ldg() has an overload for __half, but we also need one for Eigen::half.
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
+static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
+  return Eigen::internal::raw_uint16_to_half(
+      __ldg(reinterpret_cast<const unsigned short*>(ptr)));
+}
+#endif
+
+
+#endif // EIGEN_HALF_CUDA_H
diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h
index 3bea88bea..317499b29 100644
--- a/Eigen/src/Core/arch/CUDA/MathFunctions.h
+++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h
@@ -66,6 +66,121 @@ double2 prsqrt<double2>(const double2& a)
   return make_double2(rsqrt(a.x), rsqrt(a.y));
 }
 
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plgamma<float4>(const float4& a)
+{
+  return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plgamma<double2>(const double2& a)
+{
+  return make_double2(lgamma(a.x), lgamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pdigamma<float4>(const float4& a)
+{
+  using numext::digamma;
+  return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pdigamma<double2>(const double2& a)
+{
+  using numext::digamma;
+  return make_double2(digamma(a.x), digamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pzeta<float4>(const float4& x, const float4& q)
+{
+  using numext::zeta;
+  return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pzeta<double2>(const double2& x, const double2& q)
+{
+  using numext::zeta;
+  return make_double2(zeta(x.x, q.x), zeta(x.y, q.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 ppolygamma<float4>(const float4& n, const float4& x)
+{
+  using numext::polygamma;
+  return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 ppolygamma<double2>(const double2& n, const double2& x)
+{
+  using numext::polygamma;
+  return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perf<float4>(const float4& a)
+{
+  return make_float4(erf(a.x), erf(a.y), erf(a.z), erf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perf<double2>(const double2& a)
+{
+  return make_double2(erf(a.x), erf(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perfc<float4>(const float4& a)
+{
+  return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perfc<double2>(const double2& a)
+{
+  return make_double2(erfc(a.x), erfc(a.y));
+}
+
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigamma<float4>(const float4& a, const float4& x)
+{
+  using numext::igamma;
+  return make_float4(
+      igamma(a.x, x.x),
+      igamma(a.y, x.y),
+      igamma(a.z, x.z),
+      igamma(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigamma<double2>(const double2& a, const double2& x)
+{
+  using numext::igamma;
+  return make_double2(igamma(a.x, x.x), igamma(a.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigammac<float4>(const float4& a, const float4& x)
+{
+  using numext::igammac;
+  return make_float4(
+      igammac(a.x, x.x),
+      igammac(a.y, x.y),
+      igammac(a.z, x.z),
+      igammac(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigammac<double2>(const double2& a, const double2& x)
+{
+  using numext::igammac;
+  return make_double2(igammac(a.x, x.x), igammac(a.y, x.y));
+}
+
 #endif
 
 } // end namespace internal
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index 0d2c2fef0..932df1092 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -21,7 +21,6 @@ namespace internal {
 template<> struct is_arithmetic<float4> { enum { value = true }; };
 template<> struct is_arithmetic<double2> { enum { value = true }; };
 
-
 template<> struct packet_traits<float> : default_packet_traits
 {
   typedef float4 type;
@@ -39,6 +38,14 @@ template<> struct packet_traits<float> : default_packet_traits
     HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasLGamma = 1,
+    HasDiGamma = 1,
+    HasZeta = 1,
+    HasPolygamma = 1,
+    HasErf = 1,
+    HasErfc = 1,
+    HasIGamma = 1,  // same spelling as packet_traits<double>; "HasIgamma" would only add an unrelated enumerator
+    HasIGammac = 1,
     HasBlend = 0,
   };
 
@@ -59,6 +66,12 @@ template<> struct packet_traits<double> : default_packet_traits
     HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
+    HasLGamma = 1,
+    HasDiGamma = 1,
+    HasErf = 1,
+    HasErfc = 1,
+    HasIGamma = 1,
+    HasIGammac = 1,
     HasBlend = 0,
   };
 
@@ -177,25 +190,39 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
   to[1] = from.y;
 }
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
   return __ldg((const float4*)from);
+#else
+  return make_float4(from[0], from[1], from[2], from[3]);
+#endif
 }
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
   return __ldg((const double2*)from);
+#else
+  return make_double2(from[0], from[1]);
+#endif
 }
 
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
   return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
+#else
+  return make_float4(from[0], from[1], from[2], from[3]);
+#endif
 }
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
   return make_double2(__ldg(from+0), __ldg(from+1));
-}
+#else
+  return make_double2(from[0], from[1]);
 #endif
+}
 
 template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
   return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
@@ -251,6 +278,35 @@ template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a)
   return a.x * a.y;
 }
 
+template<size_t offset>
+struct protate_impl<offset, float4>
+{
+  static float4 run(const float4& a) {
+    if (offset == 0) {
+      return make_float4(a.x, a.y, a.z, a.w);
+    }
+    if (offset == 1) {
+      return make_float4(a.w, a.x, a.y, a.z);
+    }
+    if (offset == 2) {
+      return make_float4(a.z, a.w, a.x, a.y);
+    }
+    return make_float4(a.y, a.z, a.w, a.x);
+  }
+};
+
+template<size_t offset>
+struct protate_impl<offset, double2>
+{
+  static double2 run(const double2& a) {
+    if (offset == 0) {
+      return make_double2(a.x, a.y);
+    }
+    return make_double2(a.y, a.x);
+  }
+};
+
+
 template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
   return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
 }
@@ -258,7 +314,6 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
   return make_double2(fabs(a.x), fabs(a.y));
 }
 
-
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<float4,4>& kernel) {
   double tmp = kernel.packet[0].y;
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
new file mode 100644
index 000000000..61d532e4d
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -0,0 +1,192 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
+#define EIGEN_PACKET_MATH_HALF_CUDA_H
+
+#if defined(EIGEN_HAS_CUDA_FP16)
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+
+// Most of the following operations require arch >= 5.3
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+
+namespace Eigen {
+namespace internal {
+
+template<> struct is_arithmetic<half2> { enum { value = true }; };
+
+template<> struct packet_traits<half> : default_packet_traits
+{
+  typedef half2 type;
+  typedef half2 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 0,
+    HasDiv = 1
+  };
+};
+
+
+template<> struct unpacket_traits<half2> { typedef half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const half& from) {
+  return __half2half2(from);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const half* from) {
+  return *reinterpret_cast<const half2*>(from);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const half* from) {
+  return __halves2half2(from[0], from[1]);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup<half2>(const half* from) {
+  return __halves2half2(from[0], from[0]);  // both halves hold from[0]
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<half>(half* to, const half2& from) {
+  *reinterpret_cast<half2*>(to) = from;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, const half2& from) {
+  to[0] = __low2half(from);
+  to[1] = __high2half(from);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
+  return __ldg((const half2*)from);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) {
+  return __halves2half2(__ldg(from+0), __ldg(from+1));
+}
+
+template<> EIGEN_DEVICE_FUNC inline half2 pgather<half, half2>(const half* from, Index stride) {
+  return __halves2half2(from[0*stride], from[1*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<half, half2>(half* to, const half2& from, Index stride) {
+  to[stride*0] = __low2half(from);
+  to[stride*1] = __high2half(from);
+}
+
+template<> EIGEN_DEVICE_FUNC inline half pfirst<half2>(const half2& a) {
+  return __low2half(a);
+}
+
+template<> EIGEN_DEVICE_FUNC inline half2 pabs<half2>(const half2& a) {
+  half2 result;
+  result.x = a.x & 0x7FFF7FFF;  // clear the sign bit of both packed halves
+  return result;
+}
+
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<half2,2>& kernel) {
+  half a1 = __low2half(kernel.packet[0]);
+  half a2 = __high2half(kernel.packet[0]);
+  half b1 = __low2half(kernel.packet[1]);
+  half b2 = __high2half(kernel.packet[1]);
+  kernel.packet[0] = __halves2half2(a1, b1);
+  kernel.packet[1] = __halves2half2(a2, b2);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const half& a) {
+  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+  return __hadd2(a, b);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+  return __hsub2(a, b);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+  return __hneg2(a);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+  return __hmul2(a, b);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+  return __hfma2(a, b, c);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+  float a1 = __low2float(a);  // the division is carried out in fp32, lane by lane
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 / b1;
+  float r2 = a2 / b2;
+  return __floats2half2_rn(r1, r2);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+  float a1 = __low2float(a);  // compare in fp32, but return the original half values
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
+  half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+  float a1 = __low2float(a);  // compare in fp32, but return the original half values
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
+  half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+
+template<> EIGEN_DEVICE_FUNC inline half predux<half2>(const half2& a) {
+  return __hadd(__low2half(a), __high2half(a));
+}
+
+template<> EIGEN_DEVICE_FUNC inline half predux_max<half2>(const half2& a) {
+  half first = __low2half(a);
+  half second = __high2half(a);
+  return __hgt(first, second) ? first : second;
+}
+
+template<> EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) {
+  half first = __low2half(a);
+  half second = __high2half(a);
+  return __hlt(first, second) ? first : second;
+}
+
+template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) {
+  return __hmul(__low2half(a), __high2half(a));
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#endif // defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+#endif // defined(EIGEN_HAS_CUDA_FP16)
+#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h
new file mode 100644
index 000000000..396b38eaf
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h
@@ -0,0 +1,112 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ +#ifndef EIGEN_TYPE_CASTING_CUDA_H +#define EIGEN_TYPE_CASTING_CUDA_H + +namespace Eigen { + +namespace internal { + +#if defined(EIGEN_HAS_CUDA_FP16) + +template<> +struct scalar_cast_op<float, half> { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const float& a) const { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + return __float2half(a); + #else + return half(a); + #endif + } +}; + +template<> +struct functor_traits<scalar_cast_op<float, half> > +{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op<int, half> { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const int& a) const { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + return __float2half(static_cast<float>(a)); + #else + return half(static_cast<float>(a)); + #endif + } +}; + +template<> +struct functor_traits<scalar_cast_op<int, half> > +{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op<half, float> { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef float result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const half& a) const { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + return __half2float(a); + #else + return static_cast<float>(a); + #endif + } +}; + +template<> +struct functor_traits<scalar_cast_op<half, float> > +{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; }; + + + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + +template <> +struct type_casting_traits<half, float> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 2, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) { + float2 r1 = __half22float2(a); + float2 r2 = __half22float2(b); + return make_float4(r1.x, 
r1.y, r2.x, r2.y); +} + +template <> +struct type_casting_traits<float, half> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 2 + }; +}; + +// Marked EIGEN_DEVICE_FUNC to match pcast<half2, float4>: both conversions live in the same device-only section. +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) { + // Simply discard the second half of the input + return __float22half2_rn(make_float2(a.x, a.y)); +} + +#endif +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_CUDA_H diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index fc4c0d03a..3224c36bd 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -177,7 +177,11 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co return pset1<Packet4i>(0); } -#ifdef __ARM_FEATURE_FMA +// Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available, +// then implements a slow software scalar fallback calling fmaf()! +// Filed LLVM bug: +// https://llvm.org/bugs/show_bug.cgi?id=27216 +#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM) // See bug 936. // FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4. // FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding. @@ -186,7 +190,27 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co // MLA: 10 GFlop/s ; FMA: 12 GFlops/s. template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); } #else -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { +#if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM + // Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu, + // at least -mcpu=cortex-a8 and -mcpu=cortex-a7. 
Since the former is the default on + // -march=armv7-a, that is a very common case. + // See e.g. this thread: + // http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html + // Filed LLVM bug: + // https://llvm.org/bugs/show_bug.cgi?id=27219 + Packet4f r = c; + asm volatile( + "vmla.f32 %q[r], %q[a], %q[b]" + : [r] "+w" (r) + : [a] "w" (a), + [b] "w" (b) + : ); + return r; +#else + return vmlaq_f32(c,a,b); +#endif +} #endif // No FMA instruction for int, so use MLA unconditionally. @@ -532,20 +556,21 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) { #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG -#if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__) // Bug 907: workaround missing declarations of the following two functions in the ADK -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vreinterpretq_u64_f64 (float64x2_t __a) +// Defining these functions as templates ensures that if these intrinsics are +// already defined in arm_neon.h, then our workaround doesn't cause a conflict +// and has lower priority in overload resolution. 
+template <typename T> +uint64x2_t vreinterpretq_u64_f64(T a) { - return (uint64x2_t) __a; + return (uint64x2_t) a; } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vreinterpretq_f64_u64 (uint64x2_t __a) +template <typename T> +float64x2_t vreinterpretq_f64_u64(T a) { - return (float64x2_t) __a; + return (float64x2_t) a; } -#endif typedef float64x2_t Packet2d; typedef float64x1_t Packet1d; diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 4f45ddfbf..fd7f4d740 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -255,7 +255,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); } -EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x) +EIGEN_STRONG_INLINE Packet2cf pcplxflip/* <Packet2cf> */(const Packet2cf& x) { return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2)); } @@ -456,7 +456,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); } -EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x) +EIGEN_STRONG_INLINE Packet1cd pcplxflip/* <Packet1cd> */(const Packet1cd& x) { return Packet1cd(preverse(Packet2d(x.v))); } diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 3b8b7303f..28f103eeb 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -516,8 +516,81 @@ Packet2d prsqrt<Packet2d>(const Packet2d& x) { return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x)); } +// Hyperbolic Tangent function. +// Doesn't do anything fancy, just a 13/6-degree rational interpolant which +// is accurate up to a couple of ulp in the range [-9, 9], outside of which the +// fl(tanh(x)) = +/-1. 
+template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +ptanh<Packet4f>(const Packet4f& _x) { + // Clamp the inputs to the range [-9, 9] since anything outside + // this range is +/-1.0f in single-precision. + _EIGEN_DECLARE_CONST_Packet4f(plus_9, 9.0f); + _EIGEN_DECLARE_CONST_Packet4f(minus_9, -9.0f); + const Packet4f x = pmax(p4f_minus_9, pmin(p4f_plus_9, _x)); + + // The monomial coefficients of the numerator polynomial (odd). + _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-03f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-04f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-05f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-08f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f); + + // The monomial coefficients of the denominator polynomial (even). + _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-03f); + _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-03f); + _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-04f); + _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-06f); + + // Since the polynomials are odd/even, we need x^2. + const Packet4f x2 = pmul(x, x); + + // Evaluate the numerator polynomial p. + Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11); + p = pmadd(x2, p, p4f_alpha_9); + p = pmadd(x2, p, p4f_alpha_7); + p = pmadd(x2, p, p4f_alpha_5); + p = pmadd(x2, p, p4f_alpha_3); + p = pmadd(x2, p, p4f_alpha_1); + p = pmul(x, p); + + // Evaluate the denominator polynomial q. + Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4); + q = pmadd(x2, q, p4f_beta_2); + q = pmadd(x2, q, p4f_beta_0); + + // Divide the numerator by the denominator. 
+ return pdiv(p, q); +} + } // end namespace internal +namespace numext { + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float sqrt(const float &x) +{ + return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x)))); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double sqrt(const double &x) +{ +#if EIGEN_COMP_GNUC_STRICT + // This works around a GCC bug generating poor code for _mm_sqrt_pd + // See https://bitbucket.org/eigen/eigen/commits/14f468dba4d350d7c19c9b93072e19f7b3df563b + return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x)))); +#else + return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x)))); +#endif +} + +} // end namespace numext + } // end namespace Eigen #endif // EIGEN_MATH_FUNCTIONS_SSE_H diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index eb517b871..451034560 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -109,6 +109,7 @@ template<> struct packet_traits<float> : default_packet_traits HasExp = 1, HasSqrt = 1, HasRsqrt = 1, + HasTanh = EIGEN_FAST_MATH, HasBlend = 1 #ifdef EIGEN_VECTORIZE_SSE4_1 @@ -314,58 +315,27 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { E return _mm_loadu_ps(from); #endif } - template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } - template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); } #else // NOTE: with the code below, MSVC's compiler crashes! 
-#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386 || (EIGEN_ARCH_x86_64 && EIGEN_GNUC_AT_LEAST(4, 8))) - // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd - #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1 -#elif EIGEN_COMP_CLANG - // bug 201: Segfaults in __mm_loadh_pd with clang 2.8 - #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1 -#else - #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0 -#endif - template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD -#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS return _mm_loadu_ps(from); -#else - __m128d res; - res = _mm_load_sd((const double*)(from)) ; - res = _mm_loadh_pd(res, (const double*)(from+2)) ; - return _mm_castpd_ps(res); -#endif } +#endif + template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD -#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS return _mm_loadu_pd(from); -#else - __m128d res; - res = _mm_load_sd(from) ; - res = _mm_loadh_pd(res,from+1); - return res; -#endif } template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD -#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); -#else - __m128d res; - res = _mm_load_sd((const double*)(from)) ; - res = _mm_loadh_pd(res, (const double*)(from+2)) ; - return _mm_castpd_si128(res); -#endif } -#endif + template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) { diff --git a/Eigen/src/Core/arch/ZVector/CMakeLists.txt b/Eigen/src/Core/arch/ZVector/CMakeLists.txt new file mode 100644 index 000000000..5eb0957eb --- /dev/null +++ b/Eigen/src/Core/arch/ZVector/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_Core_arch_ZVector_SRCS "*.h") + +INSTALL(FILES + ${Eigen_Core_arch_ZVector_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/ZVector COMPONENT Devel +) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h new 
file mode 100644 index 000000000..9a8735ac1 --- /dev/null +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -0,0 +1,201 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_COMPLEX32_ALTIVEC_H +#define EIGEN_COMPLEX32_ALTIVEC_H + +namespace Eigen { + +namespace internal { + +static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; +static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x0000000000000000, 0x8000000000000000 }; + +struct Packet1cd +{ + EIGEN_STRONG_INLINE Packet1cd() {} + EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {} + Packet2d v; +}; + +template<> struct packet_traits<std::complex<double> > : default_packet_traits +{ + typedef Packet1cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 1, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; + +template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE void pstore 
<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } + +template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) +{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); } + +// A Packet1cd holds exactly one std::complex<double>, so gather and scatter +// only ever touch element 0. The stride is irrelevant, and from[stride] / +// to[stride] must not be accessed: they may lie outside the caller's data. +// Unaligned load/store is used because the pointer carries no alignment guarantee. +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index /*stride*/) +{ + return ploadu<Packet1cd>(from); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index /*stride*/) +{ + pstoreu<std::complex<double> >(to, from); +} + +template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); } +template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); } + +template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + Packet2d a_re, a_im, v1, v2; + + // Permute and multiply the real parts of a and b + a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI); + // Get the imaginary parts of a + a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO); + // multiply a_re * b + v1 = vec_madd(a_re, b.v, p2d_ZERO); + // multiply a_im * b and get the 
conjugate result + v2 = vec_madd(a_im, b.v, p2d_ZERO); + v2 = (Packet2d) vec_sld((Packet4ui)v2, (Packet4ui)v2, 8); + v2 = (Packet2d) vec_xor((Packet2d)v2, (Packet2d) p2ul_CONJ_XOR1); + + return Packet1cd(v1 + v2); +} + +template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } + +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) +{ + return pset1<Packet1cd>(*from); +} + +template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } + +template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) +{ + std::complex<double> EIGEN_ALIGN16 res[2]; + pstore<std::complex<double> >(res, a); + + return res[0]; +} + +template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } + +template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) +{ + return pfirst(a); +} + +template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) +{ + return vecs[0]; +} + +template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) +{ + return pfirst(a); +} + +template<int Offset> +struct palign_impl<Offset,Packet1cd> +{ + static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) + { + // FIXME is it sure we never have to align a Packet1cd? 
+ // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... + } +}; + +template<> struct conj_helper<Packet1cd, Packet1cd, false,true> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper<Packet1cd, Packet1cd, true,false> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper<Packet1cd, Packet1cd, true,true> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + // TODO optimize it for AltiVec + Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b); + Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); + return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); +} + +EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x) +{ + return Packet1cd(preverse(Packet2d(x.v))); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel) +{ + Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_COMPLEX32_ALTIVEC_H diff --git 
a/Eigen/src/Core/arch/ZVector/MathFunctions.h b/Eigen/src/Core/arch/ZVector/MathFunctions.h new file mode 100644 index 000000000..6fff8524e --- /dev/null +++ b/Eigen/src/Core/arch/ZVector/MathFunctions.h @@ -0,0 +1,110 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2007 Julien Pommier +// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/* The sin, cos, exp, and log functions of this file come from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H +#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H + +namespace Eigen { + +namespace internal { + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d pexp<Packet2d>(const Packet2d& _x) +{ + Packet2d x = _x; + + _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); + _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); + _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); + + _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); + _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); + + _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); + + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); + + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); + + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); + 
_EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); + + Packet2d tmp, fx; + Packet2l emm0; + + // clamp x + x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); + /* express exp(x) as exp(g + n*log(2)) */ + fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half); + + fx = vec_floor(fx); + + tmp = pmul(fx, p2d_cephes_exp_C1); + Packet2d z = pmul(fx, p2d_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + Packet2d x2 = pmul(x,x); + + Packet2d px = p2d_cephes_exp_p0; + px = pmadd(px, x2, p2d_cephes_exp_p1); + px = pmadd(px, x2, p2d_cephes_exp_p2); + px = pmul (px, x); + + Packet2d qx = p2d_cephes_exp_q0; + qx = pmadd(qx, x2, p2d_cephes_exp_q1); + qx = pmadd(qx, x2, p2d_cephes_exp_q2); + qx = pmadd(qx, x2, p2d_cephes_exp_q3); + + x = pdiv(px,psub(qx,px)); + x = pmadd(p2d_2,x,p2d_1); + + // build 2^n + emm0 = vec_ctsl(fx, 0); + + static const Packet2l p2l_1023 = { 1023, 1023 }; + static const Packet2ul p2ul_52 = { 52, 52 }; + + emm0 = emm0 + p2l_1023; + emm0 = emm0 << reinterpret_cast<Packet2l>(p2ul_52); + + // Altivec's max & min operators just drop silent NaNs. Check NaNs in + // inputs and return them unmodified. + Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x)); + return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x), + isnumber_mask); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d psqrt<Packet2d>(const Packet2d& x) +{ + return __builtin_s390_vfsqdb(x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d prsqrt<Packet2d>(const Packet2d& x) { + // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation. 
+ return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_ALTIVEC_H diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h new file mode 100755 index 000000000..5a7226be6 --- /dev/null +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -0,0 +1,575 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_ZVECTOR_H +#define EIGEN_PACKET_MATH_ZVECTOR_H + +#include <stdint.h> + +namespace Eigen { + +namespace internal { + +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#endif + +// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 +#endif + +typedef __vector int Packet4i; +typedef __vector unsigned int Packet4ui; +typedef __vector __bool int Packet4bi; +typedef __vector short int Packet8i; +typedef __vector unsigned char Packet16uc; +typedef __vector double Packet2d; +typedef __vector unsigned long long Packet2ul; +typedef __vector long long Packet2l; + +typedef union { + int32_t i[4]; + uint32_t ui[4]; + int64_t l[2]; + uint64_t ul[2]; + double d[2]; + Packet4i v4i; + Packet4ui v4ui; + Packet2l v2l; + Packet2ul v2ul; + Packet2d v2d; +} Packet; + +// We don't want to write the same code all the time, but we need to reuse the constants +// 
and it doesn't really work to declare them global, so we define macros instead + +#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ + Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X)) + +#define _EIGEN_DECLARE_CONST_FAST_Packet2d(NAME,X) \ + Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X)) + +#define _EIGEN_DECLARE_CONST_FAST_Packet2l(NAME,X) \ + Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X)) + +#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ + Packet4i p4i_##NAME = pset1<Packet4i>(X) + +#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ + Packet2d p2d_##NAME = pset1<Packet2d>(X) + +#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \ + Packet2l p2l_##NAME = pset1<Packet2l>(X) + +// These constants are endian-agnostic +//static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} +static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1} + +static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0); +static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0); +static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1); + +static Packet2d p2d_ONE = { 1.0, 1.0 }; +static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; + +static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; +static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8)); + +static Packet16uc p16uc_PSET64_HI = { 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; +static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; + +// Mask alignment +#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 + +#define _EIGEN_ALIGNED_PTR(x) ((ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) + +// Handle endianness properly while loading constants +// Define global static constants: + +static Packet16uc p16uc_FORWARD = { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 }; +static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 }; +static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 
0,1,2,3, 4,5,6,7 }; + +static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; +static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; +/*static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; + +static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };*/ +static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 }; +/*static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; +static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/ +static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; +static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; + +//static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; + +//static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; + + +#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC + #define EIGEN_ZVECTOR_PREFETCH(ADDR) __builtin_prefetch(ADDR); +#else + #define EIGEN_ZVECTOR_PREFETCH(ADDR) asm( " pfd [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); +#endif + +template<> struct packet_traits<int> : default_packet_traits +{ + typedef Packet4i type; + 
typedef Packet4i half; + enum { + // FIXME check the Has* + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, + + // FIXME check the Has* + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasBlend = 1 + }; +}; + +template<> struct packet_traits<double> : default_packet_traits +{ + typedef Packet2d type; + typedef Packet2d half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 1, + + // FIXME check the Has* + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasNegate = 1, + HasBlend = 1 + }; +}; + +template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; + +inline std::ostream & operator <<(std::ostream & s, const Packet4i & v) +{ + Packet vt; + vt.v4i = v; + s << vt.i[0] << ", " << vt.i[1] << ", " << vt.i[2] << ", " << vt.i[3]; + return s; +} + +inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) +{ + Packet vt; + vt.v4ui = v; + s << vt.ui[0] << ", " << vt.ui[1] << ", " << vt.ui[2] << ", " << vt.ui[3]; + return s; +} + +inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) +{ + Packet vt; + vt.v2l = v; + s << vt.l[0] << ", " << vt.l[1]; + return s; +} + +inline std::ostream & operator <<(std::ostream & s, const Packet2ul & v) +{ + Packet vt; + vt.v2ul = v; + s << vt.ul[0] << ", " << vt.ul[1] ; + return s; +} + +inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) +{ + Packet vt; + vt.v2d = v; + s << vt.d[0] << ", " << vt.d[1]; + return s; +} + +template<int Offset> +struct palign_impl<Offset,Packet4i> +{ + static EIGEN_STRONG_INLINE void run(Packet4i& first, const 
Packet4i& second) + { + switch (Offset % 4) { + case 1: + first = vec_sld(first, second, 4); break; + case 2: + first = vec_sld(first, second, 8); break; + case 3: + first = vec_sld(first, second, 12); break; + } + } +}; + +template<int Offset> +struct palign_impl<Offset,Packet2d> +{ + static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) + { + if (Offset == 1) + first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(first), reinterpret_cast<Packet4i>(second), 8)); + } +}; + +template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet *vfrom; + vfrom = (Packet *) from; + return vfrom->v4i; +} + +template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet *vfrom; + vfrom = (Packet *) from; + return vfrom->v2d; +} + +template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + Packet *vto; + vto = (Packet *) to; + vto->v4i = from; +} + +template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + Packet *vto; + vto = (Packet *) to; + vto->v2d = from; +} + +template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) +{ + return vec_splats(from); +} + +template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { + return vec_splats(from); +} + +template<> EIGEN_STRONG_INLINE void +pbroadcast4<Packet4i>(const int *a, + Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) +{ + a3 = pload<Packet4i>(a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} + +template<> EIGEN_STRONG_INLINE void +pbroadcast4<Packet2d>(const double *a, + Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +{ + a1 = pload<Packet2d>(a); + a0 = 
vec_splat(a1, 0); + a1 = vec_splat(a1, 1); + a3 = pload<Packet2d>(a+2); + a2 = vec_splat(a3, 0); + a3 = vec_splat(a3, 1); +} + +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) +{ + int EIGEN_ALIGN16 ai[4]; + ai[0] = from[0*stride]; + ai[1] = from[1*stride]; + ai[2] = from[2*stride]; + ai[3] = from[3*stride]; + return pload<Packet4i>(ai); +} + +template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) +{ + double EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload<Packet2d>(af); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) +{ + int EIGEN_ALIGN16 ai[4]; + pstore<int>((int *)ai, from); + to[0*stride] = ai[0]; + to[1*stride] = ai[1]; + to[2*stride] = ai[2]; + to[3*stride] = ai[3]; +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) +{ + double EIGEN_ALIGN16 af[2]; + pstore<double>(af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} + +template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); } +template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); } + +template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); } +template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); } + +template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); } +template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); } + +template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); } +template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const 
Packet2d& a, const Packet2d& b) { return (a / b); } + +template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); } +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); } + +template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); } +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } + +template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); } + +template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } + +template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } + +template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } + +template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } + +template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const 
Packet4i& b) { return vec_xor(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } + +template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } + +template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); } +template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); } + +template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { return pload<Packet4i>(from); } +template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { return pload<Packet2d>(from); } + + +template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) +{ + Packet4i p = pload<Packet4i>(from); + return vec_perm(p, p, p16uc_DUPLICATE32_HI); +} + +template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) +{ + Packet2d p = pload<Packet2d>(from); + return vec_perm(p, p, p16uc_PSET64_HI); +} + +template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { pstore<int>(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { pstore<double>(to, from); } + +template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } + +template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return 
x[0]; } + +template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) +{ + return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); +} + +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) +{ + return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64)); +} + +template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } + +template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) +{ + Packet4i b, sum; + b = vec_sld(a, a, 8); + sum = padd<Packet4i>(a, b); + b = vec_sld(sum, sum, 4); + sum = padd<Packet4i>(sum, b); + return pfirst(sum); +} + +template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) +{ + Packet2d b, sum; + b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8)); + sum = padd<Packet2d>(a, b); + return pfirst(sum); +} + +template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) +{ + Packet4i v[4], sum[4]; + + // It's easier and faster to transpose then add as columns + // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation + // Do the transpose, first set of moves + v[0] = vec_mergeh(vecs[0], vecs[2]); + v[1] = vec_mergel(vecs[0], vecs[2]); + v[2] = vec_mergeh(vecs[1], vecs[3]); + v[3] = vec_mergel(vecs[1], vecs[3]); + // Get the resulting vectors + sum[0] = vec_mergeh(v[0], v[2]); + sum[1] = vec_mergel(v[0], v[2]); + sum[2] = vec_mergeh(v[1], v[3]); + sum[3] = vec_mergel(v[1], v[3]); + + // Now do the summation: + // Lines 0+1 + sum[0] = padd<Packet4i>(sum[0], sum[1]); + // Lines 2+3 + sum[1] = padd<Packet4i>(sum[2], sum[3]); + // Add the results + sum[0] = padd<Packet4i>(sum[0], sum[1]); + + return sum[0]; +} + +template<> EIGEN_STRONG_INLINE 
Packet2d preduxp<Packet2d>(const Packet2d* vecs) +{ + Packet2d v[2], sum; + v[0] = padd<Packet2d>(vecs[0], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[0]), reinterpret_cast<Packet4ui>(vecs[0]), 8))); + v[1] = padd<Packet2d>(vecs[1], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[1]), reinterpret_cast<Packet4ui>(vecs[1]), 8))); + + sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v[0]), reinterpret_cast<Packet4ui>(v[1]), 8)); + + return sum; +} + +// Other reduction functions: +// mul +template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) +{ + EIGEN_ALIGN16 int aux[4]; + pstore(aux, a); + return aux[0] * aux[1] * aux[2] * aux[3]; +} + +template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) +{ + return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8)))); +} + +// min +template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) +{ + Packet4i b, res; + b = pmin<Packet4i>(a, vec_sld(a, a, 8)); + res = pmin<Packet4i>(b, vec_sld(b, b, 4)); + return pfirst(res); +} + +template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) +{ + return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8)))); +} + +// max +template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) +{ + Packet4i b, res; + b = pmax<Packet4i>(a, vec_sld(a, a, 8)); + res = pmax<Packet4i>(b, vec_sld(b, b, 4)); + return pfirst(res); +} + +// max +template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) +{ + return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8)))); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet4i,4>& kernel) { + Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + Packet4i t1 
= vec_mergel(kernel.packet[0], kernel.packet[2]); + Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet2d,2>& kernel) { + Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI); + Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO); + kernel.packet[0] = t0; + kernel.packet[1] = t1; +} + +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} + +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; + Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PACKET_MATH_ZVECTOR_H diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index 4962d625c..e28fecfd0 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -238,7 +238,13 @@ template<typename Scalar> struct scalar_hypot_op { }; template<typename Scalar> struct functor_traits<scalar_hypot_op<Scalar> > { - enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess=0 }; + enum + { + Cost = 3 * NumTraits<Scalar>::AddCost + + 2 * NumTraits<Scalar>::MulCost + + 2 * 
NumTraits<Scalar>::template Div<false>::Cost, + PacketAccess = false + }; }; /** \internal @@ -297,9 +303,10 @@ template<typename LhsScalar,typename RhsScalar> struct scalar_quotient_op { }; template<typename LhsScalar,typename RhsScalar> struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > { + typedef typename scalar_quotient_op<LhsScalar,RhsScalar>::result_type result_type; enum { - Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost), // rough estimate! - PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable + PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable, + Cost = NumTraits<result_type>::template Div<PacketAccess>::Cost }; }; @@ -337,6 +344,55 @@ template<> struct functor_traits<scalar_boolean_or_op> { }; }; +/** \internal + * \brief Template functor to compute the incomplete gamma function igamma(a, x) + * + * \sa class CwiseBinaryOp, Cwise::igamma + */ +template<typename Scalar> struct scalar_igamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const { + using numext::igamma; return igamma(a, x); + } + template<typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const { + return internal::pigamma(a, x); + } +}; +template<typename Scalar> +struct functor_traits<scalar_igamma_op<Scalar> > { + enum { + // Guesstimate + Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasIGamma + }; +}; + + +/** \internal + * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x) + * + * \sa class CwiseBinaryOp, Cwise::igammac + */ +template<typename Scalar> struct scalar_igammac_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_igammac_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const { + using
numext::igammac; return igammac(a, x); + } + template<typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const + { + return internal::pigammac(a, x); + } +}; +template<typename Scalar> +struct functor_traits<scalar_igammac_op<Scalar> > { + enum { + // Guesstimate + Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasIGammac + }; +}; //---------- binary functors bound to a constant, thus appearing as a unary functor ---------- @@ -515,6 +571,10 @@ struct scalar_inverse_mult_op { { return internal::pdiv(pset1<Packet>(m_other),a); } Scalar m_other; }; +template<typename Scalar> +struct functor_traits<scalar_inverse_mult_op<Scalar> > +{ enum { PacketAccess = packet_traits<Scalar>::HasDiv, Cost = NumTraits<Scalar>::template Div<PacketAccess>::Cost }; }; + } // end namespace internal diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h index cd9fbf267..c5836d048 100644 --- a/Eigen/src/Core/functors/NullaryFunctors.h +++ b/Eigen/src/Core/functors/NullaryFunctors.h @@ -37,7 +37,7 @@ template<typename Scalar> struct functor_traits<scalar_identity_op<Scalar> > { enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; }; -template <typename Scalar, typename Packet, bool RandomAccess> struct linspaced_op_impl; +template <typename Scalar, typename Packet, bool RandomAccess, bool IsInteger> struct linspaced_op_impl; // linear access for packet ops: // 1) initialization @@ -48,12 +48,12 @@ template <typename Scalar, typename Packet, bool RandomAccess> struct linspaced_ // TODO: Perhaps it's better to initialize lazily (so not in the constructor but in packetOp) // in order to avoid the padd() in operator() ? 
template <typename Scalar, typename Packet> -struct linspaced_op_impl<Scalar,Packet,false> +struct linspaced_op_impl<Scalar,Packet,/*RandomAccess*/false,/*IsInteger*/false> { - linspaced_op_impl(const Scalar& low, const Scalar& step) : - m_low(low), m_step(step), - m_packetStep(pset1<Packet>(unpacket_traits<Packet>::size*step)), - m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(step),plset<Packet>(-unpacket_traits<Packet>::size)))) {} + linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : + m_low(low), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)), + m_packetStep(pset1<Packet>(unpacket_traits<Packet>::size*m_step)), + m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(m_step),plset<Packet>(-unpacket_traits<Packet>::size)))) {} template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const @@ -75,11 +75,11 @@ struct linspaced_op_impl<Scalar,Packet,false> // 1) each step // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) ) template <typename Scalar, typename Packet> -struct linspaced_op_impl<Scalar,Packet,true> +struct linspaced_op_impl<Scalar,Packet,/*RandomAccess*/true,/*IsInteger*/false> { - linspaced_op_impl(const Scalar& low, const Scalar& step) : - m_low(low), m_step(step), - m_lowPacket(pset1<Packet>(m_low)), m_stepPacket(pset1<Packet>(m_step)), m_interPacket(plset<Packet>(0)) {} + linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : + m_low(low), m_step(num_steps==1 ? 
Scalar() : (high-low)/Scalar(num_steps-1)), + m_lowPacket(pset1<Packet>(m_low)), m_stepPacket(pset1<Packet>(m_step)), m_interPacket(plset<Packet>(0)) {} template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; } @@ -95,6 +95,31 @@ struct linspaced_op_impl<Scalar,Packet,true> const Packet m_interPacket; }; +template <typename Scalar, typename Packet> +struct linspaced_op_impl<Scalar,Packet,/*RandomAccess*/true,/*IsInteger*/true> +{ + linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : + m_low(low), m_length(high-low), m_divisor(num_steps==1?1:num_steps-1), m_interPacket(plset<Packet>(0)) + {} + + template<typename Index> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar operator() (Index i) const { + return m_low + (m_length*Scalar(i))/m_divisor; + } + + template<typename Index> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Packet packetOp(Index i) const { + return internal::padd(pset1<Packet>(m_low), pdiv(pmul(pset1<Packet>(m_length), padd(pset1<Packet>(Scalar(i)),m_interPacket)), + pset1<Packet>(m_divisor))); } + + const Scalar m_low; + const Scalar m_length; + const Index m_divisor; + const Packet m_interPacket; +}; + // ----- Linspace functor ---------------------------------------------------------------- // Forward declaration (we default to random access which does not really give @@ -102,10 +127,20 @@ struct linspaced_op_impl<Scalar,Packet,true> // nested expressions). 
template <typename Scalar, typename PacketType, bool RandomAccess = true> struct linspaced_op; template <typename Scalar, typename PacketType, bool RandomAccess> struct functor_traits< linspaced_op<Scalar,PacketType,RandomAccess> > -{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::HasSetLinear, IsRepeatable = true }; }; +{ + enum + { + Cost = 1, + PacketAccess = packet_traits<Scalar>::HasSetLinear + && ((!NumTraits<Scalar>::IsInteger) || packet_traits<Scalar>::HasDiv), + IsRepeatable = true + }; +}; template <typename Scalar, typename PacketType, bool RandomAccess> struct linspaced_op { - linspaced_op(const Scalar& low, const Scalar& high, Index num_steps) : impl((num_steps==1 ? high : low), (num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1))) {} + linspaced_op(const Scalar& low, const Scalar& high, Index num_steps) + : impl((num_steps==1 ? high : low),high,num_steps) + {} template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return impl(i); } @@ -134,7 +169,9 @@ template <typename Scalar, typename PacketType, bool RandomAccess> struct linspa // This proxy object handles the actual required temporaries, the different // implementations (random vs. sequential access) as well as the // correct piping to size 2/4 packet operations. - const linspaced_op_impl<Scalar,PacketType,RandomAccess> impl; + // As long as we don't have a Bresenham-like implementation for linear-access and integer types, + // we have to by-pass RandomAccess for integer types. See bug 698. + const linspaced_op_impl<Scalar,PacketType,(NumTraits<Scalar>::IsInteger?true:RandomAccess),NumTraits<Scalar>::IsInteger> impl; }; // all functors allow linear access, except scalar_identity_op. 
So we fix here a quick meta diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index e630acc38..7ba0abedc 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -41,7 +41,7 @@ struct functor_traits<scalar_opposite_op<Scalar> > template<typename Scalar> struct scalar_abs_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_abs_op) typedef typename NumTraits<Scalar>::Real result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using std::abs; return abs(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs(a); } template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pabs(a); } @@ -73,7 +73,7 @@ template<typename Scalar, typename=void> struct abs_knowing_score EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score) typedef typename NumTraits<Scalar>::Real result_type; template<typename Score> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a, const Score&) const { using std::abs; return abs(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a, const Score&) const { return numext::abs(a); } }; template<typename Scalar> struct abs_knowing_score<Scalar, typename scalar_score_coeff_op<Scalar>::Score_is_abs> { @@ -230,7 +230,7 @@ struct functor_traits<scalar_imag_ref_op<Scalar> > */ template<typename Scalar> struct scalar_exp_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_exp_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::exp; return exp(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::exp(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexp(a); } }; @@ -246,7 +246,7 @@ struct 
functor_traits<scalar_exp_op<Scalar> > */ template<typename Scalar> struct scalar_log_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::log; return log(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::log(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog(a); } }; @@ -276,7 +276,7 @@ struct functor_traits<scalar_log10_op<Scalar> > */ template<typename Scalar> struct scalar_sqrt_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sqrt; return sqrt(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sqrt(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); } }; @@ -294,7 +294,7 @@ struct functor_traits<scalar_sqrt_op<Scalar> > */ template<typename Scalar> struct scalar_rsqrt_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_rsqrt_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sqrt; return Scalar(1)/sqrt(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(1)/numext::sqrt(a); } template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::prsqrt(a); } }; @@ -403,6 +403,143 @@ struct functor_traits<scalar_asin_op<Scalar> > }; }; + +/** \internal + * \brief Template functor to compute the natural log of the absolute + * value of Gamma of a scalar + * \sa class CwiseUnaryOp, Cwise::lgamma() + */ +template<typename Scalar> struct scalar_lgamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::lgamma; return lgamma(a); + } + typedef typename packet_traits<Scalar>::type 
Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); } +}; +template<typename Scalar> +struct functor_traits<scalar_lgamma_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasLGamma + }; +}; + +/** \internal + * \brief Template functor to compute psi, the derivative of lgamma of a scalar. + * \sa class CwiseUnaryOp, Cwise::digamma() + */ +template<typename Scalar> struct scalar_digamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::digamma; return digamma(a); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); } +}; +template<typename Scalar> +struct functor_traits<scalar_digamma_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasDiGamma + }; +}; + +/** \internal + * \brief Template functor to compute the Riemann Zeta function of two arguments. + * \sa class CwiseBinaryOp, Cwise::zeta() + */ +template<typename Scalar> struct scalar_zeta_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x, const Scalar& q) const { + using numext::zeta; return zeta(x, q); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); } +}; +template<typename Scalar> +struct functor_traits<scalar_zeta_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasZeta + }; +}; + +/** \internal + * \brief Template functor to compute the polygamma function. 
+ * \sa class CwiseUnaryOp, Cwise::polygamma() + */ +template<typename Scalar> struct scalar_polygamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& n, const Scalar& x) const { + using numext::polygamma; return polygamma(n, x); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); } +}; +template<typename Scalar> +struct functor_traits<scalar_polygamma_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasPolygamma + }; +}; + +/** \internal + * \brief Template functor to compute the Gauss error function of a + * scalar + * \sa class CwiseUnaryOp, Cwise::erf() + */ +template<typename Scalar> struct scalar_erf_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::erf; return erf(a); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perf(a); } +}; +template<typename Scalar> +struct functor_traits<scalar_erf_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasErf + }; +}; + +/** \internal + * \brief Template functor to compute the Complementary Error Function + * of a scalar + * \sa class CwiseUnaryOp, Cwise::erfc() + */ +template<typename Scalar> struct scalar_erfc_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::erfc; return erfc(a); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perfc(a); } +}; 
+template<typename Scalar> +struct functor_traits<scalar_erfc_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasErfc + }; +}; + + /** \internal * \brief Template functor to compute the atan of a scalar * \sa class CwiseUnaryOp, ArrayBase::atan() @@ -422,6 +559,7 @@ struct functor_traits<scalar_atan_op<Scalar> > }; }; + /** \internal * \brief Template functor to compute the tanh of a scalar * \sa class CwiseUnaryOp, ArrayBase::tanh() @@ -572,7 +710,7 @@ struct functor_traits<scalar_floor_op<Scalar> > template<typename Scalar> struct scalar_ceil_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_ceil_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::ceil(a); } - typedef typename packet_traits<Scalar>::type Packet; + template <typename Packet> EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pceil(a); } }; template<typename Scalar> @@ -660,10 +798,10 @@ struct functor_traits<scalar_boolean_not_op<Scalar> > { * \sa class CwiseUnaryOp, Cwise::sign() */ template<typename Scalar,bool iscpx=(NumTraits<Scalar>::IsComplex!=0) > struct scalar_sign_op; -template<typename Scalar> +template<typename Scalar> struct scalar_sign_op<Scalar,false> { EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar( (a>Scalar(0)) - (a<Scalar(0)) ); } @@ -671,17 +809,16 @@ struct scalar_sign_op<Scalar,false> { //template <typename Packet> //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); } }; -template<typename Scalar> +template<typename Scalar> struct scalar_sign_op<Scalar,true> { EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const + EIGEN_DEVICE_FUNC inline 
const Scalar operator() (const Scalar& a) const { - using std::abs; typedef typename NumTraits<Scalar>::Real real_type; - real_type aa = abs(a); + real_type aa = numext::abs(a); if (aa==0) - return Scalar(0); - aa = 1./aa; + return Scalar(0); + aa = 1./aa; return Scalar(real(a)*aa, imag(a)*aa ); } //TODO diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 665339c58..4c1a63d40 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -178,7 +178,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // We also include a register-level block of the result (mx x nr). // (In an ideal world only the lhs panel would stay in L1) // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of: - const Index max_kc = ((l1-k_sub)/k_div) & (~(k_peeling-1)); + const Index max_kc = std::max<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1); const Index old_k = k; if(k>max_kc) { @@ -252,7 +252,7 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // we have both L2 and L3, and problem is small enough to be kept in L2 // Let's choose m such that lhs's block fit in 1/3 of L2 actual_lm = l2; - max_mc = 576; + max_mc = (std::min<Index>)(576,max_mc); } Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc); if (mc > Traits::mr) mc -= mc % Traits::mr; diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index d830dfb96..a39c7808c 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -145,12 +145,9 @@ static void run(Index rows, Index cols, Index depth, // Release all the sub blocks A'_i of A' for the current thread, // i.e., we simply decrement the number of users by 1 - #pragma omp critical - { for(Index i=0; 
i<threads; ++i) #pragma omp atomic info[i].users -= 1; - } } } else @@ -355,9 +352,8 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M } else // no l3 blocking { - Index m = this->m_mc; Index n = this->m_nc; - computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, n, num_threads); + computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n, num_threads); } m_sizeA = this->m_mc * this->m_kc; diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index a36eb2fe0..831089dee 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -42,13 +42,14 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder, { typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha) + const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, + const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking) { general_matrix_matrix_triangular_product<Index, RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs, LhsScalar, LhsStorageOrder==RowMajor ? 
ColMajor : RowMajor, ConjugateLhs, ColMajor, UpLo==Lower?Upper:Lower> - ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha); + ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking); } }; @@ -58,7 +59,8 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder, { typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha) + const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, + const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking) { typedef gebp_traits<LhsScalar,RhsScalar> Traits; @@ -69,16 +71,18 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder, RhsMapper rhs(_rhs,rhsStride); ResMapper res(_res, resStride); - Index kc = depth; // cache block size along the K direction - Index mc = size; // cache block size along the M direction - Index nc = size; // cache block size along the N direction - computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc, 1); + Index kc = blocking.kc(); + Index mc = (std::min)(size,blocking.mc()); + // !!! 
mc must be a multiple of nr: if(mc > Traits::nr) mc = (mc/Traits::nr)*Traits::nr; - ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0); + std::size_t sizeA = kc*mc; + std::size_t sizeB = kc*size; + + ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs; @@ -136,7 +140,7 @@ struct tribb_kernel typedef typename Traits::ResScalar ResScalar; enum { - BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) + BlockSize = meta_least_common_multiple<EIGEN_PLAIN_ENUM_MAX(mr,nr),EIGEN_PLAIN_ENUM_MIN(mr,nr)>::ret }; void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { @@ -256,13 +260,27 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false> typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived()); + enum { + IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0, + LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0, + RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0 + }; + + Index size = mat.cols(); + Index depth = actualLhs.cols(); + + typedef internal::gemm_blocking_space<IsRowMajor ? 
RowMajor : ColMajor,typename Lhs::Scalar,typename Rhs::Scalar, + MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, _ActualRhs::MaxColsAtCompileTime> BlockingType; + + BlockingType blocking(size, size, depth, 1, false); + internal::general_matrix_matrix_triangular_product<Index, - typename Lhs::Scalar, _ActualLhs::Flags&RowMajorBit ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, - typename Rhs::Scalar, _ActualRhs::Flags&RowMajorBit ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, - MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo> - ::run(mat.cols(), actualLhs.cols(), + typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, + typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, + IsRowMajor ? RowMajor : ColMajor, UpLo> + ::run(size, depth, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(), - mat.data(), mat.outerStride(), actualAlpha); + mat.data(), mat.outerStride(), actualAlpha, blocking); } }; diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h index 3deed068e..911df8ff3 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h @@ -25,13 +25,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * Level 3 BLAS SYRK/HERK implementation. 
******************************************************************************** */ -#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H -#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H +#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H +#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H namespace Eigen { @@ -44,34 +44,35 @@ struct general_matrix_matrix_rankupdate : // try to go to BLAS specialization -#define EIGEN_MKL_RANKUPDATE_SPECIALIZE(Scalar) \ +#define EIGEN_BLAS_RANKUPDATE_SPECIALIZE(Scalar) \ template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs, int UpLo> \ struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \ Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Specialized> { \ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \ - const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha) \ + const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \ { \ if (lhs==rhs) { \ general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \ - ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \ + ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ } else { \ general_matrix_matrix_triangular_product<Index, \ Scalar, LhsStorageOrder, ConjugateLhs, \ Scalar, RhsStorageOrder, ConjugateRhs, \ ColMajor, UpLo, BuiltIn> \ - ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \ + ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ } \ } \ }; -EIGEN_MKL_RANKUPDATE_SPECIALIZE(double) -//EIGEN_MKL_RANKUPDATE_SPECIALIZE(dcomplex) -EIGEN_MKL_RANKUPDATE_SPECIALIZE(float) -//EIGEN_MKL_RANKUPDATE_SPECIALIZE(scomplex) +EIGEN_BLAS_RANKUPDATE_SPECIALIZE(double) +EIGEN_BLAS_RANKUPDATE_SPECIALIZE(float) +// TODO handle complex cases +// 
EIGEN_BLAS_RANKUPDATE_SPECIALIZE(dcomplex) +// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(scomplex) // SYRK for float/double -#define EIGEN_MKL_RANKUPDATE_R(EIGTYPE, MKLTYPE, MKLFUNC) \ +#define EIGEN_BLAS_RANKUPDATE_R(EIGTYPE, BLASTYPE, BLASFUNC) \ template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \ enum { \ @@ -80,23 +81,19 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \ }; \ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \ - const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \ + const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ /* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \ \ - MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \ + BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \ char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 
'T':'N'; \ - MKLTYPE alpha_, beta_; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \ - MKLFUNC(&uplo, &trans, &n, &k, &alpha_, lhs, &lda, &beta_, res, &ldc); \ + EIGTYPE beta; \ + BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \ } \ }; // HERK for complex data -#define EIGEN_MKL_RANKUPDATE_C(EIGTYPE, MKLTYPE, RTYPE, MKLFUNC) \ +#define EIGEN_BLAS_RANKUPDATE_C(EIGTYPE, BLASTYPE, RTYPE, BLASFUNC) \ template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \ enum { \ @@ -105,18 +102,15 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 1 : 0 \ }; \ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \ - const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \ + const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \ \ - MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \ + BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \ char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 
'C':'N'; \ RTYPE alpha_, beta_; \ const EIGTYPE* a_ptr; \ \ -/* Set alpha_ & beta_ */ \ -/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); */\ -/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1));*/ \ alpha_ = alpha.real(); \ beta_ = 1.0; \ /* Copy with conjugation in some cases*/ \ @@ -127,20 +121,21 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C lda = a.outerStride(); \ a_ptr = a.data(); \ } else a_ptr=lhs; \ - MKLFUNC(&uplo, &trans, &n, &k, &alpha_, (MKLTYPE*)a_ptr, &lda, &beta_, (MKLTYPE*)res, &ldc); \ + BLASFUNC(&uplo, &trans, &n, &k, &alpha_, (BLASTYPE*)a_ptr, &lda, &beta_, (BLASTYPE*)res, &ldc); \ } \ }; -EIGEN_MKL_RANKUPDATE_R(double, double, dsyrk) -EIGEN_MKL_RANKUPDATE_R(float, float, ssyrk) +EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_) +EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_) -//EIGEN_MKL_RANKUPDATE_C(dcomplex, MKL_Complex16, double, zherk) -//EIGEN_MKL_RANKUPDATE_C(scomplex, MKL_Complex8, double, cherk) +// TODO handle complex cases +// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_) +// EIGEN_BLAS_RANKUPDATE_C(scomplex, float, float, cherk_) } // end namespace internal } // end namespace Eigen -#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H +#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h index b6ae729b2..7a3bdbf20 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h @@ -25,13 +25,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * General matrix-matrix product functionality based on ?GEMM. 
******************************************************************************** */ -#ifndef EIGEN_GENERAL_MATRIX_MATRIX_MKL_H -#define EIGEN_GENERAL_MATRIX_MATRIX_MKL_H +#ifndef EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H +#define EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H namespace Eigen { @@ -46,7 +46,7 @@ namespace internal { // gemm specialization -#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, MKLTYPE, MKLPREFIX) \ +#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASPREFIX) \ template< \ typename Index, \ int LhsStorageOrder, bool ConjugateLhs, \ @@ -66,55 +66,50 @@ static void run(Index rows, Index cols, Index depth, \ using std::conj; \ \ char transa, transb; \ - MKL_INT m, n, k, lda, ldb, ldc; \ + BlasIndex m, n, k, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ - MKLTYPE alpha_, beta_; \ + EIGTYPE beta(1); \ MatrixX##EIGPREFIX a_tmp, b_tmp; \ - EIGTYPE myone(1);\ \ /* Set transpose options */ \ transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \ transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 
'C' : 'T') : 'N'; \ \ /* Set m, n, k */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)cols; \ - k = (MKL_INT)depth; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ + k = convert_index<BlasIndex>(depth); \ \ /* Set lda, ldb, ldc */ \ - lda = (MKL_INT)lhsStride; \ - ldb = (MKL_INT)rhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index<BlasIndex>(lhsStride); \ + ldb = convert_index<BlasIndex>(rhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ \ /* Set a, b, c */ \ if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \ a_tmp = lhs.conjugate(); \ a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ } else a = _lhs; \ \ if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \ b_tmp = rhs.conjugate(); \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ } else b = _rhs; \ \ - MKLPREFIX##gemm(&transa, &transb, &m, &n, &k, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \ + BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ }}; -GEMM_SPECIALIZATION(double, d, double, d) -GEMM_SPECIALIZATION(float, f, float, s) -GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, z) -GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, c) +GEMM_SPECIALIZATION(double, d, double, d) +GEMM_SPECIALIZATION(float, f, float, s) +GEMM_SPECIALIZATION(dcomplex, cd, double, z) +GEMM_SPECIALIZATION(scomplex, cf, float, c) } // end namespace internal } // end namespace Eigen -#endif // 
EIGEN_GENERAL_MATRIX_MATRIX_MKL_H +#endif // EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H diff --git a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h index 12c3d13bd..e3a5d5892 100755..100644 --- a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h +++ b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h @@ -25,13 +25,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * General matrix-vector product functionality based on ?GEMV. ******************************************************************************** */ -#ifndef EIGEN_GENERAL_MATRIX_VECTOR_MKL_H -#define EIGEN_GENERAL_MATRIX_VECTOR_MKL_H +#ifndef EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H +#define EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H namespace Eigen { @@ -49,7 +49,7 @@ namespace internal { template<typename Index, typename LhsScalar, int StorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> struct general_matrix_vector_product_gemv; -#define EIGEN_MKL_GEMV_SPECIALIZE(Scalar) \ +#define EIGEN_BLAS_GEMV_SPECIALIZE(Scalar) \ template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \ struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,Specialized> { \ static void run( \ @@ -80,12 +80,12 @@ static void run( \ } \ }; \ -EIGEN_MKL_GEMV_SPECIALIZE(double) -EIGEN_MKL_GEMV_SPECIALIZE(float) -EIGEN_MKL_GEMV_SPECIALIZE(dcomplex) -EIGEN_MKL_GEMV_SPECIALIZE(scomplex) +EIGEN_BLAS_GEMV_SPECIALIZE(double) +EIGEN_BLAS_GEMV_SPECIALIZE(float) +EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex) +EIGEN_BLAS_GEMV_SPECIALIZE(scomplex) -#define EIGEN_MKL_GEMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLPREFIX) \ +#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASPREFIX) \ 
template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \ struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \ { \ @@ -97,16 +97,15 @@ static void run( \ const EIGTYPE* rhs, Index rhsIncr, \ EIGTYPE* res, Index resIncr, EIGTYPE alpha) \ { \ - MKL_INT m=rows, n=cols, lda=lhsStride, incx=rhsIncr, incy=resIncr; \ - MKLTYPE alpha_, beta_; \ - const EIGTYPE *x_ptr, myone(1); \ + BlasIndex m=convert_index<BlasIndex>(rows), n=convert_index<BlasIndex>(cols), \ + lda=convert_index<BlasIndex>(lhsStride), incx=convert_index<BlasIndex>(rhsIncr), incy=convert_index<BlasIndex>(resIncr); \ + const EIGTYPE beta(1); \ + const EIGTYPE *x_ptr; \ char trans=(LhsStorageOrder==ColMajor) ? 'N' : (ConjugateLhs) ? 'C' : 'T'; \ if (LhsStorageOrder==RowMajor) { \ - m=cols; \ - n=rows; \ + m = convert_index<BlasIndex>(cols); \ + n = convert_index<BlasIndex>(rows); \ }\ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ GEMVVector x_tmp; \ if (ConjugateRhs) { \ Map<const GEMVVector, 0, InnerStride<> > map_x(rhs,cols,1,InnerStride<>(incx)); \ @@ -114,17 +113,17 @@ static void run( \ x_ptr=x_tmp.data(); \ incx=1; \ } else x_ptr=rhs; \ - MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \ + BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; -EIGEN_MKL_GEMV_SPECIALIZATION(double, double, d) -EIGEN_MKL_GEMV_SPECIALIZATION(float, float, s) -EIGEN_MKL_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, z) -EIGEN_MKL_GEMV_SPECIALIZATION(scomplex, MKL_Complex8, c) +EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, d) +EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, s) +EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z) +EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, c) } // end namespace internal } 
// end namespace Eigen -#endif // EIGEN_GENERAL_MATRIX_VECTOR_MKL_H +#endif // EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index f84f54982..da6f82abc 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -291,7 +291,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, - const Scalar& alpha) + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { product_selfadjoint_matrix<Scalar, Index, EIGEN_LOGICAL_XOR(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor, @@ -299,7 +299,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor, LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs), ColMajor> - ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha); + ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking); } }; @@ -314,7 +314,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, - const Scalar& alpha); + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking); }; template <typename Scalar, typename Index, @@ -325,7 +325,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* _res, Index resStride, - const Scalar& alpha) + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { Index size = rows; @@ -340,17 +340,14 @@ EIGEN_DONT_INLINE void 
product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t RhsMapper rhs(_rhs,rhsStride); ResMapper res(_res, resStride); - Index kc = size; // cache block size along the K direction - Index mc = rows; // cache block size along the M direction - Index nc = cols; // cache block size along the N direction - computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1); - // kc must smaller than mc + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + // kc must be smaller than mc kc = (std::min)(kc,mc); - + std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; - ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); - Scalar* blockB = allocatedBlockB; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; @@ -410,7 +407,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLh const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, - const Scalar& alpha); + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking); }; template <typename Scalar, typename Index, @@ -421,7 +418,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* _res, Index resStride, - const Scalar& alpha) + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { Index size = cols; @@ -432,14 +429,12 @@ EIGEN_DONT_INLINE void 
product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f LhsMapper lhs(_lhs,lhsStride); ResMapper res(_res,resStride); - Index kc = size; // cache block size along the K direction - Index mc = rows; // cache block size along the M direction - Index nc = cols; // cache block size along the N direction - computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1); + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; - ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); - Scalar* blockB = allocatedBlockB; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; @@ -498,6 +493,11 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false> Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) * RhsBlasTraits::extractScalarFactor(a_rhs); + typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, + Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,1> BlockingType; + + BlockingType blocking(lhs.rows(), rhs.cols(), lhs.cols(), 1, false); + internal::product_selfadjoint_matrix<Scalar, Index, EIGEN_LOGICAL_XOR(LhsIsUpper,internal::traits<Lhs>::Flags &RowMajorBit) ? 
RowMajor : ColMajor, LhsIsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)), @@ -509,7 +509,7 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false> &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info &dst.coeffRef(0,0), dst.outerStride(), // result info - actualAlpha // alpha + actualAlpha, blocking // alpha ); } }; diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h index dfa687fef..c3e37b1e0 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h @@ -25,13 +25,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * Self adjoint matrix * matrix product functionality based on ?SYMM/?HEMM. 
******************************************************************************** */ -#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H -#define EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H +#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H +#define EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H namespace Eigen { @@ -40,7 +40,7 @@ namespace internal { /* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */ -#define EIGEN_MKL_SYMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template <typename Index, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -52,28 +52,23 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ EIGTYPE* res, Index resStride, \ - EIGTYPE alpha) \ + EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ char side='L', uplo='L'; \ - MKL_INT m, n, lda, ldb, ldc; \ + BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ - MKLTYPE alpha_, beta_; \ + EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ - EIGTYPE myone(1);\ \ /* Set transpose options */ \ /* Set m, n, k */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)cols; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ \ /* Set lda, ldb, ldc */ \ - lda = (MKL_INT)lhsStride; \ - ldb = (MKL_INT)rhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index<BlasIndex>(lhsStride); \ + ldb = convert_index<BlasIndex>(rhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ \ /* Set a, b, c */ \ if (LhsStorageOrder==RowMajor) uplo='U'; \ @@ -83,16 +78,16 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \ b_tmp = rhs.adjoint(); 
\ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ } else b = _rhs; \ \ - MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \ + BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_MKL_HEMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template <typename Index, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -103,29 +98,24 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ EIGTYPE* res, Index resStride, \ - EIGTYPE alpha) \ + EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ char side='L', uplo='L'; \ - MKL_INT m, n, lda, ldb, ldc; \ + BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ - MKLTYPE alpha_, beta_; \ + EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> a_tmp; \ - EIGTYPE myone(1); \ \ /* Set transpose options */ \ /* Set m, n, k */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)cols; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ \ /* Set lda, ldb, ldc */ \ - lda = (MKL_INT)lhsStride; \ - ldb = (MKL_INT)rhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index<BlasIndex>(lhsStride); \ + ldb = convert_index<BlasIndex>(rhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ \ /* Set a, b, c */ \ if (((LhsStorageOrder==ColMajor) && ConjugateLhs) || ((LhsStorageOrder==RowMajor) && (!ConjugateLhs))) { \ @@ -151,23 +141,23 @@ struct 
product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh b_tmp = rhs.transpose(); \ } \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ } \ \ - MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \ + BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -EIGEN_MKL_SYMM_L(double, double, d, d) -EIGEN_MKL_SYMM_L(float, float, f, s) -EIGEN_MKL_HEMM_L(dcomplex, MKL_Complex16, cd, z) -EIGEN_MKL_HEMM_L(scomplex, MKL_Complex8, cf, c) +EIGEN_BLAS_SYMM_L(double, double, d, d) +EIGEN_BLAS_SYMM_L(float, float, f, s) +EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z) +EIGEN_BLAS_HEMM_L(scomplex, float, cf, c) /* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */ -#define EIGEN_MKL_SYMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template <typename Index, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -179,27 +169,22 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ EIGTYPE* res, Index resStride, \ - EIGTYPE alpha) \ + EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ char side='R', uplo='L'; \ - MKL_INT m, n, lda, ldb, ldc; \ + BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ - MKLTYPE alpha_, beta_; \ + EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ - EIGTYPE myone(1);\ \ /* Set m, n, k */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)cols; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ \ /* Set lda, ldb, ldc 
*/ \ - lda = (MKL_INT)rhsStride; \ - ldb = (MKL_INT)lhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index<BlasIndex>(rhsStride); \ + ldb = convert_index<BlasIndex>(lhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ \ /* Set a, b, c */ \ if (RhsStorageOrder==RowMajor) uplo='U'; \ @@ -209,16 +194,16 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(rhsStride)); \ b_tmp = lhs.adjoint(); \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ } else b = _lhs; \ \ - MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \ + BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_MKL_HEMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template <typename Index, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -229,35 +214,30 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ EIGTYPE* res, Index resStride, \ - EIGTYPE alpha) \ + EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ char side='R', uplo='L'; \ - MKL_INT m, n, lda, ldb, ldc; \ + BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ - MKLTYPE alpha_, beta_; \ + EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> a_tmp; \ - EIGTYPE myone(1); \ \ /* Set m, n, k */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)cols; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ + m = 
convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ \ /* Set lda, ldb, ldc */ \ - lda = (MKL_INT)rhsStride; \ - ldb = (MKL_INT)lhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index<BlasIndex>(rhsStride); \ + ldb = convert_index<BlasIndex>(lhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ \ /* Set a, b, c */ \ if (((RhsStorageOrder==ColMajor) && ConjugateRhs) || ((RhsStorageOrder==RowMajor) && (!ConjugateRhs))) { \ Map<const Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder>, 0, OuterStride<> > rhs(_rhs,n,n,OuterStride<>(rhsStride)); \ a_tmp = rhs.conjugate(); \ a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ } else a = _rhs; \ if (RhsStorageOrder==RowMajor) uplo='U'; \ \ @@ -279,17 +259,17 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL ldb = b_tmp.outerStride(); \ } \ \ - MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \ + BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ } \ }; -EIGEN_MKL_SYMM_R(double, double, d, d) -EIGEN_MKL_SYMM_R(float, float, f, s) -EIGEN_MKL_HEMM_R(dcomplex, MKL_Complex16, cd, z) -EIGEN_MKL_HEMM_R(scomplex, MKL_Complex8, cf, c) +EIGEN_BLAS_SYMM_R(double, double, d, d) +EIGEN_BLAS_SYMM_R(float, float, f, s) +EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z) +EIGEN_BLAS_HEMM_R(scomplex, float, cf, c) } // end namespace internal } // end namespace Eigen -#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H +#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h index 86684b66d..38f23accf 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h +++ 
b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h @@ -25,13 +25,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * Selfadjoint matrix-vector product functionality based on ?SYMV/HEMV. ******************************************************************************** */ -#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H -#define EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H +#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H +#define EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H namespace Eigen { @@ -47,31 +47,31 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju struct selfadjoint_matrix_vector_product_symv : selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn> {}; -#define EIGEN_MKL_SYMV_SPECIALIZE(Scalar) \ +#define EIGEN_BLAS_SYMV_SPECIALIZE(Scalar) \ template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \ struct selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Specialized> { \ static void run( \ Index size, const Scalar* lhs, Index lhsStride, \ - const Scalar* _rhs, Index rhsIncr, Scalar* res, Scalar alpha) { \ + const Scalar* _rhs, Scalar* res, Scalar alpha) { \ enum {\ IsColMajor = StorageOrder==ColMajor \ }; \ if (IsColMajor == ConjugateLhs) {\ selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn>::run( \ - size, lhs, lhsStride, _rhs, rhsIncr, res, alpha); \ + size, lhs, lhsStride, _rhs, res, alpha); \ } else {\ selfadjoint_matrix_vector_product_symv<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs>::run( \ - size, lhs, lhsStride, _rhs, rhsIncr, res, alpha); \ + size, lhs, lhsStride, _rhs, res, alpha); \ }\ } \ }; \ -EIGEN_MKL_SYMV_SPECIALIZE(double) -EIGEN_MKL_SYMV_SPECIALIZE(float) 
-EIGEN_MKL_SYMV_SPECIALIZE(dcomplex) -EIGEN_MKL_SYMV_SPECIALIZE(scomplex) +EIGEN_BLAS_SYMV_SPECIALIZE(double) +EIGEN_BLAS_SYMV_SPECIALIZE(float) +EIGEN_BLAS_SYMV_SPECIALIZE(dcomplex) +EIGEN_BLAS_SYMV_SPECIALIZE(scomplex) -#define EIGEN_MKL_SYMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLFUNC) \ +#define EIGEN_BLAS_SYMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \ template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \ struct selfadjoint_matrix_vector_product_symv<EIGTYPE,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs> \ { \ @@ -79,36 +79,33 @@ typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> SYMVVector;\ \ static void run( \ Index size, const EIGTYPE* lhs, Index lhsStride, \ -const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* res, EIGTYPE alpha) \ +const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \ { \ enum {\ IsRowMajor = StorageOrder==RowMajor ? 1 : 0, \ IsLower = UpLo == Lower ? 1 : 0 \ }; \ - MKL_INT n=size, lda=lhsStride, incx=rhsIncr, incy=1; \ - MKLTYPE alpha_, beta_; \ - const EIGTYPE *x_ptr, myone(1); \ + BlasIndex n=convert_index<BlasIndex>(size), lda=convert_index<BlasIndex>(lhsStride), incx=1, incy=1; \ + EIGTYPE beta(1); \ + const EIGTYPE *x_ptr; \ char uplo=(IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 
'L' : 'U'); \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ SYMVVector x_tmp; \ if (ConjugateRhs) { \ - Map<const SYMVVector, 0, InnerStride<> > map_x(_rhs,size,1,InnerStride<>(incx)); \ + Map<const SYMVVector, 0 > map_x(_rhs,size,1); \ x_tmp=map_x.conjugate(); \ x_ptr=x_tmp.data(); \ - incx=1; \ } else x_ptr=_rhs; \ - MKLFUNC(&uplo, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \ + BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; -EIGEN_MKL_SYMV_SPECIALIZATION(double, double, dsymv) -EIGEN_MKL_SYMV_SPECIALIZATION(float, float, ssymv) -EIGEN_MKL_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv) -EIGEN_MKL_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv) +EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_) +EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_) +EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_) +EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_) } // end namespace internal } // end namespace Eigen -#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H +#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h index 2af00058d..f038d686f 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -92,15 +92,27 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false> Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived()); - enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 }; + enum { + IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0, + OtherIsRowMajor = _ActualOtherType::Flags&RowMajorBit ? 
1 : 0 + }; + + Index size = mat.cols(); + Index depth = actualOther.cols(); + + typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,Scalar,Scalar, + MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, _ActualOtherType::MaxColsAtCompileTime> BlockingType; + + BlockingType blocking(size, size, depth, 1, false); + internal::general_matrix_matrix_triangular_product<Index, - Scalar, _ActualOtherType::Flags&RowMajorBit ? RowMajor : ColMajor, OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex, - Scalar, _ActualOtherType::Flags&RowMajorBit ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex, - MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo> - ::run(mat.cols(), actualOther.cols(), + Scalar, OtherIsRowMajor ? RowMajor : ColMajor, OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex, + Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex, + IsRowMajor ? RowMajor : ColMajor, UpLo> + ::run(size, depth, &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(), - mat.data(), mat.outerStride(), actualAlpha); + mat.data(), mat.outerStride(), actualAlpha, blocking); } }; diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 39ab87df8..8a2f7cd78 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -126,6 +126,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + // The small panel size must not be larger than blocking size. 
+ // Usually this should never be the case because SmallPanelWidth^2 is very small + // compared to L2 cache size, but let's be safe: + Index panelWidth = (std::min)(Index(SmallPanelWidth),(std::min)(kc,mc)); std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; @@ -169,9 +173,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, if(IsLower || actual_k2<rows) { // for each small vertical panels of lhs - for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth) + for (Index k1=0; k1<actual_kc; k1+=panelWidth) { - Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth); + Index actualPanelWidth = std::min<Index>(actual_kc-k1, panelWidth); Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1; Index startBlock = actual_k2+k1; Index blockBOffset = k1; diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h index d9e7cf852..aecded6bb 100755..100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h @@ -25,13 +25,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * Triangular matrix * matrix product functionality based on ?TRMM. 
******************************************************************************** */ -#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H -#define EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H +#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H +#define EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H namespace Eigen { @@ -50,7 +50,7 @@ struct product_triangular_matrix_matrix_trmm : // try to go to BLAS specialization -#define EIGEN_MKL_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \ +#define EIGEN_BLAS_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \ template <typename Index, int Mode, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -65,17 +65,17 @@ struct product_triangular_matrix_matrix<Scalar,Index, Mode, LhsIsTriangular, \ } \ }; -EIGEN_MKL_TRMM_SPECIALIZE(double, true) -EIGEN_MKL_TRMM_SPECIALIZE(double, false) -EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, true) -EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, false) -EIGEN_MKL_TRMM_SPECIALIZE(float, true) -EIGEN_MKL_TRMM_SPECIALIZE(float, false) -EIGEN_MKL_TRMM_SPECIALIZE(scomplex, true) -EIGEN_MKL_TRMM_SPECIALIZE(scomplex, false) +EIGEN_BLAS_TRMM_SPECIALIZE(double, true) +EIGEN_BLAS_TRMM_SPECIALIZE(double, false) +EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, true) +EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, false) +EIGEN_BLAS_TRMM_SPECIALIZE(float, true) +EIGEN_BLAS_TRMM_SPECIALIZE(float, false) +EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true) +EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false) // implements col-major += alpha * op(triangular) * op(general) -#define EIGEN_MKL_TRMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template <typename Index, int Mode, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -106,13 +106,14 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \ typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \ typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \ \ -/* 
Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \ +/* Non-square case - doesn't fit to BLAS ?TRMM. Fall to default triangular product or call BLAS ?GEMM*/ \ if (rows != depth) { \ \ - int nthr = mkl_domain_get_max_threads(EIGEN_MKL_DOMAIN_BLAS); \ + /* FIXME handle mkl_domain_get_max_threads */ \ + /*int nthr = mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS);*/ int nthr = 1;\ \ if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \ - /* Most likely no benefit to call TRMM or GEMM from MKL*/ \ + /* Most likely no benefit to call TRMM or GEMM from BLAS */ \ product_triangular_matrix_matrix<EIGTYPE,Index,Mode,true, \ LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \ _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ @@ -121,27 +122,23 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \ /* Make sense to call GEMM */ \ Map<const MatrixLhs, 0, OuterStride<> > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \ MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \ - MKL_INT aStride = aa_tmp.outerStride(); \ + BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \ gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \ general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \ rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \ \ - /*std::cout << "TRMM_L: A is not square! Go to MKL GEMM implementation! " << nthr<<" \n";*/ \ + /*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! 
" << nthr<<" \n";*/ \ } \ return; \ } \ char side = 'L', transa, uplo, diag = 'N'; \ EIGTYPE *b; \ const EIGTYPE *a; \ - MKL_INT m, n, lda, ldb; \ - MKLTYPE alpha_; \ -\ -/* Set alpha_*/ \ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \ + BlasIndex m, n, lda, ldb; \ \ /* Set m, n */ \ - m = (MKL_INT)diagSize; \ - n = (MKL_INT)cols; \ + m = convert_index<BlasIndex>(diagSize); \ + n = convert_index<BlasIndex>(cols); \ \ /* Set trans */ \ transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \ @@ -152,7 +149,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \ \ if (ConjugateRhs) b_tmp = rhs.conjugate(); else b_tmp = rhs; \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ \ /* Set uplo */ \ uplo = IsLower ? 'L' : 'U'; \ @@ -168,14 +165,14 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \ else if (IsUnitDiag) \ a_tmp.diagonal().setOnes();\ a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ } else { \ a = _lhs; \ - lda = lhsStride; \ + lda = convert_index<BlasIndex>(lhsStride); \ } \ - /*std::cout << "TRMM_L: A is square! Go to MKL TRMM implementation! \n";*/ \ + /*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! 
\n";*/ \ /* call ?trmm*/ \ - MKLPREFIX##trmm(&side, &uplo, &transa, &diag, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \ + BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ \ /* Add op(a_triangular)*b into res*/ \ Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -183,13 +180,13 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \ } \ }; -EIGEN_MKL_TRMM_L(double, double, d, d) -EIGEN_MKL_TRMM_L(dcomplex, MKL_Complex16, cd, z) -EIGEN_MKL_TRMM_L(float, float, f, s) -EIGEN_MKL_TRMM_L(scomplex, MKL_Complex8, cf, c) +EIGEN_BLAS_TRMM_L(double, double, d, d) +EIGEN_BLAS_TRMM_L(dcomplex, double, cd, z) +EIGEN_BLAS_TRMM_L(float, float, f, s) +EIGEN_BLAS_TRMM_L(scomplex, float, cf, c) // implements col-major += alpha * op(general) * op(triangular) -#define EIGEN_MKL_TRMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template <typename Index, int Mode, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -220,13 +217,13 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \ typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \ \ -/* Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \ +/* Non-square case - doesn't fit to BLAS ?TRMM. 
Fall to default triangular product or call BLAS ?GEMM*/ \ if (cols != depth) { \ \ - int nthr = mkl_domain_get_max_threads(EIGEN_MKL_DOMAIN_BLAS); \ + int nthr = 1 /*mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS)*/; \ \ if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \ - /* Most likely no benefit to call TRMM or GEMM from MKL*/ \ + /* Most likely no benefit to call TRMM or GEMM from BLAS*/ \ product_triangular_matrix_matrix<EIGTYPE,Index,Mode,false, \ LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \ _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ @@ -235,27 +232,23 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ /* Make sense to call GEMM */ \ Map<const MatrixRhs, 0, OuterStride<> > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \ MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \ - MKL_INT aStride = aa_tmp.outerStride(); \ + BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \ gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \ general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \ rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \ \ - /*std::cout << "TRMM_R: A is not square! Go to MKL GEMM implementation! " << nthr<<" \n";*/ \ + /*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! 
" << nthr<<" \n";*/ \ } \ return; \ } \ char side = 'R', transa, uplo, diag = 'N'; \ EIGTYPE *b; \ const EIGTYPE *a; \ - MKL_INT m, n, lda, ldb; \ - MKLTYPE alpha_; \ -\ -/* Set alpha_*/ \ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \ + BlasIndex m, n, lda, ldb; \ \ /* Set m, n */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)diagSize; \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(diagSize); \ \ /* Set trans */ \ transa = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \ @@ -266,7 +259,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ \ if (ConjugateLhs) b_tmp = lhs.conjugate(); else b_tmp = lhs; \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ \ /* Set uplo */ \ uplo = IsLower ? 'L' : 'U'; \ @@ -282,14 +275,14 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ else if (IsUnitDiag) \ a_tmp.diagonal().setOnes();\ a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ } else { \ a = _rhs; \ - lda = rhsStride; \ + lda = convert_index<BlasIndex>(rhsStride); \ } \ - /*std::cout << "TRMM_R: A is square! Go to MKL TRMM implementation! \n";*/ \ + /*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! 
\n";*/ \ /* call ?trmm*/ \ - MKLPREFIX##trmm(&side, &uplo, &transa, &diag, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \ + BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ \ /* Add op(a_triangular)*b into res*/ \ Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -297,13 +290,13 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ } \ }; -EIGEN_MKL_TRMM_R(double, double, d, d) -EIGEN_MKL_TRMM_R(dcomplex, MKL_Complex16, cd, z) -EIGEN_MKL_TRMM_R(float, float, f, s) -EIGEN_MKL_TRMM_R(scomplex, MKL_Complex8, cf, c) +EIGEN_BLAS_TRMM_R(double, double, d, d) +EIGEN_BLAS_TRMM_R(dcomplex, double, cd, z) +EIGEN_BLAS_TRMM_R(float, float, f, s) +EIGEN_BLAS_TRMM_R(scomplex, float, cf, c) } // end namespace internal } // end namespace Eigen -#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H +#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H diff --git a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h index 3672b1240..07bf26ce5 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h +++ b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h @@ -25,13 +25,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * Triangular matrix-vector product functionality based on ?TRMV. 
******************************************************************************** */ -#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H -#define EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H +#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H +#define EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H namespace Eigen { @@ -47,7 +47,7 @@ template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename Rh struct triangular_matrix_vector_product_trmv : triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,StorageOrder,BuiltIn> {}; -#define EIGEN_MKL_TRMV_SPECIALIZE(Scalar) \ +#define EIGEN_BLAS_TRMV_SPECIALIZE(Scalar) \ template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \ struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor,Specialized> { \ static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ @@ -65,13 +65,13 @@ struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs } \ }; -EIGEN_MKL_TRMV_SPECIALIZE(double) -EIGEN_MKL_TRMV_SPECIALIZE(float) -EIGEN_MKL_TRMV_SPECIALIZE(dcomplex) -EIGEN_MKL_TRMV_SPECIALIZE(scomplex) +EIGEN_BLAS_TRMV_SPECIALIZE(double) +EIGEN_BLAS_TRMV_SPECIALIZE(float) +EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex) +EIGEN_BLAS_TRMV_SPECIALIZE(scomplex) // implements col-major: res += alpha * op(triangular) * vector -#define EIGEN_MKL_TRMV_CM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \ enum { \ @@ -105,17 +105,15 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE, /* Square part handling */\ \ char trans, uplo, diag; \ - MKL_INT m, n, lda, incx, incy; \ + BlasIndex m, n, lda, incx, incy; \ EIGTYPE const *a; \ - MKLTYPE alpha_, beta_; \ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); 
\ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \ + EIGTYPE beta(1); \ \ /* Set m, n */ \ - n = (MKL_INT)size; \ - lda = lhsStride; \ + n = convert_index<BlasIndex>(size); \ + lda = convert_index<BlasIndex>(lhsStride); \ incx = 1; \ - incy = resIncr; \ + incy = convert_index<BlasIndex>(resIncr); \ \ /* Set uplo, trans and diag*/ \ trans = 'N'; \ @@ -123,39 +121,39 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE, diag = IsUnitDiag ? 'U' : 'N'; \ \ /* call ?TRMV*/ \ - MKLPREFIX##trmv(&uplo, &trans, &diag, &n, (const MKLTYPE*)_lhs, &lda, (MKLTYPE*)x, &incx); \ + BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \ \ /* Add op(a_tr)rhs into res*/ \ - MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \ -/* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \ + BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \ +/* Non-square case - doesn't fit to BLAS ?TRMV. 
Fall to default triangular product*/ \ if (size<(std::max)(rows,cols)) { \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ if (size<rows) { \ y = _res + size*resIncr; \ a = _lhs + size; \ - m = rows-size; \ - n = size; \ + m = convert_index<BlasIndex>(rows-size); \ + n = convert_index<BlasIndex>(size); \ } \ else { \ x += size; \ y = _res; \ a = _lhs + size*lda; \ - m = size; \ - n = cols-size; \ + m = convert_index<BlasIndex>(size); \ + n = convert_index<BlasIndex>(cols-size); \ } \ - MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &beta_, (MKLTYPE*)y, &incy); \ + BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ } \ } \ }; -EIGEN_MKL_TRMV_CM(double, double, d, d) -EIGEN_MKL_TRMV_CM(dcomplex, MKL_Complex16, cd, z) -EIGEN_MKL_TRMV_CM(float, float, f, s) -EIGEN_MKL_TRMV_CM(scomplex, MKL_Complex8, cf, c) +EIGEN_BLAS_TRMV_CM(double, double, d, d) +EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z) +EIGEN_BLAS_TRMV_CM(float, float, f, s) +EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c) // implements row-major: res += alpha * op(triangular) * vector -#define EIGEN_MKL_TRMV_RM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \ enum { \ @@ -189,17 +187,15 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE, /* Square part handling */\ \ char trans, uplo, diag; \ - MKL_INT m, n, lda, incx, incy; \ + BlasIndex m, n, lda, incx, incy; \ EIGTYPE const *a; \ - MKLTYPE alpha_, beta_; \ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \ + EIGTYPE beta(1); \ \ /* Set m, n */ \ - n = 
(MKL_INT)size; \ - lda = lhsStride; \ + n = convert_index<BlasIndex>(size); \ + lda = convert_index<BlasIndex>(lhsStride); \ incx = 1; \ - incy = resIncr; \ + incy = convert_index<BlasIndex>(resIncr); \ \ /* Set uplo, trans and diag*/ \ trans = ConjLhs ? 'C' : 'T'; \ @@ -207,39 +203,39 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE, diag = IsUnitDiag ? 'U' : 'N'; \ \ /* call ?TRMV*/ \ - MKLPREFIX##trmv(&uplo, &trans, &diag, &n, (const MKLTYPE*)_lhs, &lda, (MKLTYPE*)x, &incx); \ + BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \ \ /* Add op(a_tr)rhs into res*/ \ - MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \ -/* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \ + BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \ +/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \ if (size<(std::max)(rows,cols)) { \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ if (size<rows) { \ y = _res + size*resIncr; \ a = _lhs + size*lda; \ - m = rows-size; \ - n = size; \ + m = convert_index<BlasIndex>(rows-size); \ + n = convert_index<BlasIndex>(size); \ } \ else { \ x += size; \ y = _res; \ a = _lhs + size; \ - m = size; \ - n = cols-size; \ + m = convert_index<BlasIndex>(size); \ + n = convert_index<BlasIndex>(cols-size); \ } \ - MKLPREFIX##gemv(&trans, &n, &m, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &beta_, (MKLTYPE*)y, &incy); \ + BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ } \ } \ }; -EIGEN_MKL_TRMV_RM(double, double, d, d) -EIGEN_MKL_TRMV_RM(dcomplex, MKL_Complex16, cd, z) -EIGEN_MKL_TRMV_RM(float, float, f, s) -EIGEN_MKL_TRMV_RM(scomplex, MKL_Complex8, cf, c) 
+EIGEN_BLAS_TRMV_RM(double, double, d, d) +EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z) +EIGEN_BLAS_TRMV_RM(float, float, f, s) +EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c) } // end namespase internal } // end namespace Eigen -#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H +#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H diff --git a/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h index 6a0bb8339..88c0fb794 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h @@ -25,20 +25,20 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * Triangular matrix * matrix product functionality based on ?TRMM. ******************************************************************************** */ -#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H -#define EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H +#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H +#define EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H namespace Eigen { namespace internal { // implements LeftSide op(triangular)^-1 * general -#define EIGEN_MKL_TRSM_L(EIGTYPE, MKLTYPE, MKLPREFIX) \ +#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASPREFIX) \ template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \ { \ @@ -53,13 +53,11 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage const EIGTYPE* _tri, Index triStride, \ EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \ { \ - MKL_INT m = size, n = otherSize, lda, ldb; \ + BlasIndex m = convert_index<BlasIndex>(size), n = convert_index<BlasIndex>(otherSize), lda, ldb; \ char side = 'L', uplo, diag='N', transa; \ /* Set 
alpha_ */ \ - MKLTYPE alpha; \ - EIGTYPE myone(1); \ - assign_scalar_eig2mkl(alpha, myone); \ - ldb = otherStride;\ + EIGTYPE alpha(1); \ + ldb = convert_index<BlasIndex>(otherStride);\ \ const EIGTYPE *a; \ /* Set trans */ \ @@ -75,25 +73,25 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage if (conjA) { \ a_tmp = tri.conjugate(); \ a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ } else { \ a = _tri; \ - lda = triStride; \ + lda = convert_index<BlasIndex>(triStride); \ } \ if (IsUnitDiag) diag='U'; \ /* call ?trsm*/ \ - MKLPREFIX##trsm(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \ + BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \ } \ }; -EIGEN_MKL_TRSM_L(double, double, d) -EIGEN_MKL_TRSM_L(dcomplex, MKL_Complex16, z) -EIGEN_MKL_TRSM_L(float, float, s) -EIGEN_MKL_TRSM_L(scomplex, MKL_Complex8, c) +EIGEN_BLAS_TRSM_L(double, double, d) +EIGEN_BLAS_TRSM_L(dcomplex, double, z) +EIGEN_BLAS_TRSM_L(float, float, s) +EIGEN_BLAS_TRSM_L(scomplex, float, c) // implements RightSide general * op(triangular)^-1 -#define EIGEN_MKL_TRSM_R(EIGTYPE, MKLTYPE, MKLPREFIX) \ +#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASPREFIX) \ template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \ { \ @@ -108,13 +106,11 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag const EIGTYPE* _tri, Index triStride, \ EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \ { \ - MKL_INT m = otherSize, n = size, lda, ldb; \ + BlasIndex m = convert_index<BlasIndex>(otherSize), n = convert_index<BlasIndex>(size), lda, ldb; \ char side = 'R', uplo, diag='N', transa; \ /* Set alpha_ */ \ - MKLTYPE 
alpha; \ - EIGTYPE myone(1); \ - assign_scalar_eig2mkl(alpha, myone); \ - ldb = otherStride;\ + EIGTYPE alpha(1); \ + ldb = convert_index<BlasIndex>(otherStride);\ \ const EIGTYPE *a; \ /* Set trans */ \ @@ -130,26 +126,26 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag if (conjA) { \ a_tmp = tri.conjugate(); \ a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ } else { \ a = _tri; \ - lda = triStride; \ + lda = convert_index<BlasIndex>(triStride); \ } \ if (IsUnitDiag) diag='U'; \ /* call ?trsm*/ \ - MKLPREFIX##trsm(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \ + BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \ /*std::cout << "TRMS_L specialization!\n";*/ \ } \ }; -EIGEN_MKL_TRSM_R(double, double, d) -EIGEN_MKL_TRSM_R(dcomplex, MKL_Complex16, z) -EIGEN_MKL_TRSM_R(float, float, s) -EIGEN_MKL_TRSM_R(scomplex, MKL_Complex8, c) +EIGEN_BLAS_TRSM_R(double, double, d) +EIGEN_BLAS_TRSM_R(dcomplex, double, z) +EIGEN_BLAS_TRSM_R(float, float, s) +EIGEN_BLAS_TRSM_R(scomplex, float, c) } // end namespace internal } // end namespace Eigen -#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H +#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index d00fa9707..498db3a70 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -123,18 +123,18 @@ template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::R template<typename Scalar, typename Index> class BlasVectorMapper { public: - EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {} - EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar 
operator()(Index i) const { return m_data[i]; } template <typename Packet, int AlignmentType> - EIGEN_ALWAYS_INLINE Packet load(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet load(Index i) const { return ploadt<Packet, AlignmentType>(m_data + i); } template <typename Packet> - bool aligned(Index i) const { + EIGEN_DEVICE_FUNC bool aligned(Index i) const { return (size_t(m_data+i)%sizeof(Packet))==0; } @@ -148,25 +148,25 @@ class BlasLinearMapper { typedef typename packet_traits<Scalar>::type Packet; typedef typename packet_traits<Scalar>::half HalfPacket; - EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} - EIGEN_ALWAYS_INLINE void prefetch(int i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { internal::prefetch(&operator()(i)); } - EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { return m_data[i]; } - EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { return ploadt<Packet, AlignmentType>(m_data + i); } - EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { return ploadt<HalfPacket, AlignmentType>(m_data + i); } - EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const { pstoret<Scalar, Packet, AlignmentType>(m_data + i, p); } @@ -184,18 +184,18 @@ class blas_data_mapper { typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper; typedef BlasVectorMapper<Scalar, Index> VectorMapper; - EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* 
data, Index stride) : m_data(data), m_stride(stride) {} - EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType> getSubMapper(Index i, Index j) const { return blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>(&operator()(i, j), m_stride); } - EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { return LinearMapper(&operator()(i, j)); } - EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { return VectorMapper(&operator()(i, j)); } @@ -205,28 +205,28 @@ class blas_data_mapper { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; } - EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { return ploadt<Packet, AlignmentType>(&operator()(i, j)); } - EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { return ploadt<HalfPacket, AlignmentType>(&operator()(i, j)); } template<typename SubPacket> - EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride); } template<typename SubPacket> - EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const { return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride); } - const Index stride() const { return m_stride; } - const Scalar* data() const { return 
m_data; } + EIGEN_DEVICE_FUNC const Index stride() const { return m_stride; } + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } - Index firstAligned(Index size) const { + EIGEN_DEVICE_FUNC Index firstAligned(Index size) const { if (size_t(m_data)%sizeof(Scalar)) { return -1; } diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index a364f48d1..5f71ba3df 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -56,8 +56,8 @@ const int HugeCost = 10000; * for a matrix, this means that the storage order is row-major. * If this bit is not set, the storage order is column-major. * For an expression, this determines the storage order of - * the matrix created by evaluation of that expression. - * \sa \ref TopicStorageOrders */ + * the matrix created by evaluation of that expression. + * \sa \blank \ref TopicStorageOrders */ const unsigned int RowMajorBit = 0x1; /** \ingroup flags @@ -67,6 +67,7 @@ const unsigned int EvalBeforeNestingBit = 0x2; /** \ingroup flags * \deprecated * means the expression should be evaluated before any assignment */ +EIGEN_DEPRECATED const unsigned int EvalBeforeAssigningBit = 0x4; // FIXME deprecated /** \ingroup flags @@ -158,7 +159,7 @@ const unsigned int DirectAccessBit = 0x40; * expression.packet<Aligned>(0); * \endcode */ -const unsigned int AlignedBit = 0x80; +EIGEN_DEPRECATED const unsigned int AlignedBit = 0x80; const unsigned int NestByRefBit = 0x100; @@ -168,7 +169,7 @@ const unsigned int NestByRefBit = 0x100; * can be either row-major or column-major. * The precise choice will be decided at evaluation time or when * combined with other expressions. 
- * \sa \ref RowMajorBit, \ref TopicStorageOrders */ + * \sa \blank \ref RowMajorBit, \ref TopicStorageOrders */ const unsigned int NoPreferredStorageOrderBit = 0x200; /** \ingroup flags @@ -187,8 +188,7 @@ const unsigned int CompressedAccessBit = 0x400; // list of flags that are inherited by default const unsigned int HereditaryBits = RowMajorBit - | EvalBeforeNestingBit - | EvalBeforeAssigningBit; + | EvalBeforeNestingBit; /** \defgroup enums Enumerations * \ingroup Core_Module @@ -224,7 +224,7 @@ enum { /** \ingroup enums * Enum for indicating whether a buffer is aligned or not. */ -enum { +enum { Unaligned=0, /**< Data pointer has no specific alignment. */ Aligned8=8, /**< Data pointer is aligned on a 8 bytes boundary. */ Aligned16=16, /**< Data pointer is aligned on a 16 bytes boundary. */ diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index 46c141ad5..cb27acff7 100644..100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -15,20 +15,25 @@ // 4522 - 'class' : multiple assignment operators specified // 4700 - uninitialized local variable 'xyz' used // 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow + // 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning) #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #pragma warning( push ) #endif - #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 ) + #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 4800) + #elif defined __INTEL_COMPILER // 2196 - routine is both "inline" and "noinline" ("noinline" assumed) // ICC 12 generates this warning even without any inline keyword, when defining class methods 'inline' i.e. inside of class body // typedef that may be a reference type. 
// 279 - controlling expression is constant // ICC 12 generates this warning on assert(constant_expression_depending_on_template_params) and frankly this is a legitimate use case. + // 1684 - conversion from pointer to same-sized integral type (potential portability problem) + // 2259 - non-pointer conversion from "Eigen::Index={ptrdiff_t={long}}" to "int" may lose significant bits #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #pragma warning push #endif - #pragma warning disable 2196 279 + #pragma warning disable 2196 279 1684 2259 + #elif defined __clang__ // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant // this is really a stupid warning as it warns on compile-time expressions involving enums @@ -38,4 +43,16 @@ #pragma clang diagnostic ignored "-Wconstant-logical-operand" #endif +#if defined __NVCC__ + // Disable the "statement is unreachable" message + #pragma diag_suppress code_is_unreachable + // Disable the "dynamic initialization in unreachable code" message + #pragma diag_suppress initialization_not_reachable + // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are 4 of them) + #pragma diag_suppress 2651 + #pragma diag_suppress 2653 + #pragma diag_suppress 2668 + #pragma diag_suppress 2670 +#endif + #endif // not EIGEN_WARNINGS_DISABLED diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index 483af876f..a102e5457 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -94,12 +94,8 @@ template<typename BinaryOp, typename Lhs, typename Rhs> class CwiseBinaryOp; template<typename Decomposition, typename Rhstype> class Solve; template<typename XprType> class Inverse; -namespace internal { - template<typename Lhs, typename Rhs> struct product_tag; -} - template<typename Lhs, typename Rhs, int Option = DefaultProduct> class 
Product; - + template<typename Derived> class DiagonalBase; template<typename _DiagonalVectorType> class DiagonalWrapper; template<typename _Scalar, int SizeAtCompileTime, int MaxSizeAtCompileTime=SizeAtCompileTime> class DiagonalMatrix; @@ -210,6 +206,8 @@ template<typename Scalar> struct scalar_add_op; template<typename Scalar> struct scalar_constant_op; template<typename Scalar> struct scalar_identity_op; template<typename Scalar,bool iscpx> struct scalar_sign_op; +template<typename Scalar> struct scalar_igamma_op; +template<typename Scalar> struct scalar_igammac_op; template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_product_op; template<typename LhsScalar,typename RhsScalar> struct scalar_multiple2_op; @@ -252,6 +250,7 @@ template<typename MatrixType> struct inverse_impl; template<typename MatrixType> class HouseholderQR; template<typename MatrixType> class ColPivHouseholderQR; template<typename MatrixType> class FullPivHouseholderQR; +template<typename MatrixType> class CompleteOrthogonalDecomposition; template<typename MatrixType, int QRPreconditioner = ColPivHouseholderQRPreconditioner> class JacobiSVD; template<typename MatrixType> class BDCSVD; template<typename MatrixType, int UpLo = Lower> class LLT; diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h index 1ef3b61db..8c9239b1d 100644 --- a/Eigen/src/Core/util/MKL_support.h +++ b/Eigen/src/Core/util/MKL_support.h @@ -49,7 +49,7 @@ #define EIGEN_USE_LAPACKE #endif -#if defined(EIGEN_USE_BLAS) || defined(EIGEN_USE_LAPACKE) || defined(EIGEN_USE_MKL_VML) +#if defined(EIGEN_USE_LAPACKE) || defined(EIGEN_USE_MKL_VML) #define EIGEN_USE_MKL #endif @@ -64,7 +64,6 @@ # ifndef EIGEN_USE_MKL /*If the MKL version is too old, undef everything*/ # undef EIGEN_USE_MKL_ALL -# undef EIGEN_USE_BLAS # undef EIGEN_USE_LAPACKE # undef EIGEN_USE_MKL_VML # undef EIGEN_USE_LAPACKE_STRICT @@ -107,52 +106,23 @@ #else #define EIGEN_MKL_DOMAIN_PARDISO MKL_PARDISO #endif +#endif 
namespace Eigen { typedef std::complex<double> dcomplex; typedef std::complex<float> scomplex; -namespace internal { - -template<typename MKLType, typename EigenType> -static inline void assign_scalar_eig2mkl(MKLType& mklScalar, const EigenType& eigenScalar) { - mklScalar=eigenScalar; -} - -template<typename MKLType, typename EigenType> -static inline void assign_conj_scalar_eig2mkl(MKLType& mklScalar, const EigenType& eigenScalar) { - mklScalar=eigenScalar; -} - -template <> -inline void assign_scalar_eig2mkl<MKL_Complex16,dcomplex>(MKL_Complex16& mklScalar, const dcomplex& eigenScalar) { - mklScalar.real=eigenScalar.real(); - mklScalar.imag=eigenScalar.imag(); -} - -template <> -inline void assign_scalar_eig2mkl<MKL_Complex8,scomplex>(MKL_Complex8& mklScalar, const scomplex& eigenScalar) { - mklScalar.real=eigenScalar.real(); - mklScalar.imag=eigenScalar.imag(); -} - -template <> -inline void assign_conj_scalar_eig2mkl<MKL_Complex16,dcomplex>(MKL_Complex16& mklScalar, const dcomplex& eigenScalar) { - mklScalar.real=eigenScalar.real(); - mklScalar.imag=-eigenScalar.imag(); -} - -template <> -inline void assign_conj_scalar_eig2mkl<MKL_Complex8,scomplex>(MKL_Complex8& mklScalar, const scomplex& eigenScalar) { - mklScalar.real=eigenScalar.real(); - mklScalar.imag=-eigenScalar.imag(); -} - -} // end namespace internal +#if defined(EIGEN_USE_MKL) +typedef MKL_INT BlasIndex; +#else +typedef int BlasIndex; +#endif } // end namespace Eigen +#if defined(EIGEN_USE_BLAS) +#include "../../misc/blas.h" #endif #endif // EIGEN_MKL_SUPPORT_H diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 34f87ca40..a0cbd2247 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -13,7 +13,7 @@ #define EIGEN_WORLD_VERSION 3 #define EIGEN_MAJOR_VERSION 2 -#define EIGEN_MINOR_VERSION 91 +#define EIGEN_MINOR_VERSION 92 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ (EIGEN_MAJOR_VERSION>y || 
(EIGEN_MAJOR_VERSION>=y && \ @@ -99,9 +99,16 @@ #define EIGEN_COMP_ARM 0 #endif +/// \internal EIGEN_COMP_ARM set to 1 if the compiler is ARM Compiler +#if defined(__EMSCRIPTEN__) + #define EIGEN_COMP_EMSCRIPTEN 1 +#else + #define EIGEN_COMP_EMSCRIPTEN 0 +#endif + /// \internal EIGEN_GNUC_STRICT set to 1 if the compiler is really GCC and not a compatible compiler (e.g., ICC, clang, mingw, etc.) -#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM ) +#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN) #define EIGEN_COMP_GNUC_STRICT 1 #else #define EIGEN_COMP_GNUC_STRICT 0 @@ -336,25 +343,35 @@ // Do we support r-value references? #if (__has_feature(cxx_rvalue_references) || \ (defined(__cplusplus) && __cplusplus >= 201103L) || \ - defined(__GXX_EXPERIMENTAL_CXX0X__) || \ (EIGEN_COMP_MSVC >= 1600)) #define EIGEN_HAVE_RVALUE_REFERENCES #endif +// Does the compiler support C99? +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \ + || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \ + || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) +#define EIGEN_HAS_C99_MATH 1 +#endif + // Does the compiler support result_of? #if (__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L)) #define EIGEN_HAS_STD_RESULT_OF 1 #endif // Does the compiler support variadic templates? -#if __cplusplus > 199711L +#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900 +// Disable the use of variadic templates when compiling with nvcc on ARM devices: +// this prevents nvcc from crashing when compiling Eigen on Tegra X1 +#if !defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 #define EIGEN_HAS_VARIADIC_TEMPLATES 1 #endif +#endif // Does the compiler support const expressions? 
#ifdef __CUDACC__ -// Const expressions are supported provided that c++11 is enabled and we're using nvcc 7.5 or above -#if defined(__CUDACC_VER__) && __CUDACC_VER__ >= 70500 && __cplusplus > 199711L +// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above +#if __cplusplus > 199711L && defined(__CUDACC_VER__) && (defined(__clang__) || __CUDACC_VER__ >= 70500) #define EIGEN_HAS_CONSTEXPR 1 #endif #elif (defined(__cplusplus) && __cplusplus >= 201402L) || \ diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index f64a2c409..5f8bf15b2 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -59,28 +59,6 @@ #endif -#ifndef EIGEN_HAS_POSIX_MEMALIGN - // See bug 554 (http://eigen.tuxfamily.org/bz/show_bug.cgi?id=554) - // It seems to be unsafe to check _POSIX_ADVISORY_INFO without including unistd.h first. - // Currently, let's include it only on unix systems: - #if EIGEN_OS_UNIX && !(EIGEN_OS_SUN || EIGEN_OS_SOLARIS) - #include <unistd.h> - #if (EIGEN_OS_QNX || (defined _GNU_SOURCE) || EIGEN_COMP_PGI || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) && (defined _POSIX_ADVISORY_INFO) && (_POSIX_ADVISORY_INFO > 0) - #define EIGEN_HAS_POSIX_MEMALIGN 1 - #endif - #endif - - #ifndef EIGEN_HAS_POSIX_MEMALIGN - #define EIGEN_HAS_POSIX_MEMALIGN 0 - #endif -#endif - -#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_AVX || defined EIGEN_VECTORIZE_AVX512 - #define EIGEN_HAS_MM_MALLOC 1 -#else - #define EIGEN_HAS_MM_MALLOC 0 -#endif - namespace Eigen { namespace internal { @@ -122,7 +100,7 @@ inline void handmade_aligned_free(void *ptr) /** \internal * \brief Reallocates aligned memory. - * Since we know that our handmade version is based on std::realloc + * Since we know that our handmade version is based on std::malloc * we can use std::realloc to implement efficient reallocation. 
*/ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = 0) @@ -142,47 +120,6 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = } /***************************************************************************** -*** Implementation of generic aligned realloc (when no realloc can be used)*** -*****************************************************************************/ - -EIGEN_DEVICE_FUNC void* aligned_malloc(std::size_t size); -EIGEN_DEVICE_FUNC void aligned_free(void *ptr); - -/** \internal - * \brief Reallocates aligned memory. - * Allows reallocation with aligned ptr types. This implementation will - * always create a new memory chunk and copy the old data. - */ -inline void* generic_aligned_realloc(void* ptr, size_t size, size_t old_size) -{ - if (ptr==0) - return aligned_malloc(size); - - if (size==0) - { - aligned_free(ptr); - return 0; - } - - void* newptr = aligned_malloc(size); - if (newptr == 0) - { - #ifdef EIGEN_HAS_ERRNO - errno = ENOMEM; // according to the standard - #endif - return 0; - } - - if (ptr != 0) - { - std::memcpy(newptr, ptr, (std::min)(size,old_size)); - aligned_free(ptr); - } - - return newptr; -} - -/***************************************************************************** *** Implementation of portable aligned versions of malloc/free/realloc *** *****************************************************************************/ @@ -218,16 +155,11 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size) check_that_malloc_is_allowed(); void *result; - #if EIGEN_DEFAULT_ALIGN_BYTES==0 - result = std::malloc(size); - #elif EIGEN_MALLOC_ALREADY_ALIGNED + #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED result = std::malloc(size); - #elif EIGEN_HAS_POSIX_MEMALIGN - if(posix_memalign(&result, EIGEN_DEFAULT_ALIGN_BYTES, size)) result = 0; - #elif EIGEN_HAS_MM_MALLOC - result = _mm_malloc(size, EIGEN_DEFAULT_ALIGN_BYTES); - #elif EIGEN_OS_WIN_STRICT - result = 
_aligned_malloc(size, EIGEN_DEFAULT_ALIGN_BYTES); + #if EIGEN_DEFAULT_ALIGN_BYTES==16 + eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade alignd memory allocator."); + #endif #else result = handmade_aligned_malloc(size); #endif @@ -241,48 +173,25 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size) /** \internal Frees memory allocated with aligned_malloc. */ EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) { - #if EIGEN_DEFAULT_ALIGN_BYTES==0 - std::free(ptr); - #elif EIGEN_MALLOC_ALREADY_ALIGNED + #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED std::free(ptr); - #elif EIGEN_HAS_POSIX_MEMALIGN - free(ptr); - #elif EIGEN_HAS_MM_MALLOC - _mm_free(ptr); - #elif EIGEN_OS_WIN_STRICT - _aligned_free(ptr); #else handmade_aligned_free(ptr); #endif } /** -* \internal -* \brief Reallocates an aligned block of memory. -* \throws std::bad_alloc on allocation failure -**/ + * \internal + * \brief Reallocates an aligned block of memory. + * \throws std::bad_alloc on allocation failure + */ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size) { EIGEN_UNUSED_VARIABLE(old_size); void *result; -#if EIGEN_DEFAULT_ALIGN_BYTES==0 +#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED result = std::realloc(ptr,new_size); -#elif EIGEN_MALLOC_ALREADY_ALIGNED - result = std::realloc(ptr,new_size); -#elif EIGEN_HAS_POSIX_MEMALIGN - result = generic_aligned_realloc(ptr,new_size,old_size); -#elif EIGEN_HAS_MM_MALLOC - // The defined(_mm_free) is just here to verify that this MSVC version - // implements _mm_malloc/_mm_free based on the corresponding _aligned_ - // functions. This may not always be the case and we just try to be safe. 
- #if EIGEN_OS_WIN_STRICT && defined(_mm_free) - result = _aligned_realloc(ptr,new_size,EIGEN_DEFAULT_ALIGN_BYTES); - #else - result = generic_aligned_realloc(ptr,new_size,old_size); - #endif -#elif EIGEN_OS_WIN_STRICT - result = _aligned_realloc(ptr,new_size,EIGEN_DEFAULT_ALIGN_BYTES); #else result = handmade_aligned_realloc(ptr,new_size,old_size); #endif @@ -524,11 +433,11 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_align * \sa first_default_aligned() */ template<int Alignment, typename Scalar, typename Index> -inline Index first_aligned(const Scalar* array, Index size) +EIGEN_DEVICE_FUNC inline Index first_aligned(const Scalar* array, Index size) { - static const Index ScalarSize = sizeof(Scalar); - static const Index AlignmentSize = Alignment / ScalarSize; - static const Index AlignmentMask = AlignmentSize-1; + const Index ScalarSize = sizeof(Scalar); + const Index AlignmentSize = Alignment / ScalarSize; + const Index AlignmentMask = AlignmentSize-1; if(AlignmentSize<=1) { @@ -544,14 +453,15 @@ inline Index first_aligned(const Scalar* array, Index size) } else { - return std::min<Index>( (AlignmentSize - (Index((std::size_t(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask, size); + Index first = (AlignmentSize - (Index((std::size_t(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask; + return (first < size) ? first : size; } } /** \internal Returns the index of the first element of the array that is well aligned with respect the largest packet requirement. 
* \sa first_aligned(Scalar*,Index) and first_default_aligned(DenseBase<Derived>) */ template<typename Scalar, typename Index> -inline Index first_default_aligned(const Scalar* array, Index size) +EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index size) { typedef typename packet_traits<Scalar>::type DefaultPacketType; return first_aligned<unpacket_traits<DefaultPacketType>::alignment>(array, size); @@ -576,7 +486,12 @@ template<typename T> EIGEN_DEVICE_FUNC void smart_copy(const T* start, const T* template<typename T> struct smart_copy_helper<T,true> { EIGEN_DEVICE_FUNC static inline void run(const T* start, const T* end, T* target) - { memcpy(target, start, std::ptrdiff_t(end)-std::ptrdiff_t(start)); } + { + std::ptrdiff_t size = std::ptrdiff_t(end)-std::ptrdiff_t(start); + if(size==0) return; + eigen_internal_assert(start!=0 && end!=0 && target!=0); + memcpy(target, start, size); + } }; template<typename T> struct smart_copy_helper<T,false> { @@ -594,7 +509,12 @@ template<typename T> void smart_memmove(const T* start, const T* end, T* target) template<typename T> struct smart_memmove_helper<T,true> { static inline void run(const T* start, const T* end, T* target) - { std::memmove(target, start, std::ptrdiff_t(end)-std::ptrdiff_t(start)); } + { + std::ptrdiff_t size = std::ptrdiff_t(end)-std::ptrdiff_t(start); + if(size==0) return; + eigen_internal_assert(start!=0 && end!=0 && target!=0); + std::memmove(target, start, size); + } }; template<typename T> struct smart_memmove_helper<T,false> { @@ -784,7 +704,7 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b) * std::map< int, Vector3f > my_map_vec3; * \endcode * -* \sa \ref TopicStlContainers. +* \sa \blank \ref TopicStlContainers. 
*/ template<class T> class aligned_allocator : public std::allocator<T> diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 3dee2bd7c..24e8a6d8a 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -147,6 +147,8 @@ template<typename T> struct numeric_limits static T epsilon() { return 0; } static T (max)() { assert(false && "Highest not supported for this type"); } static T (min)() { assert(false && "Lowest not supported for this type"); } + static T infinity() { assert(false && "Infinity not supported for this type"); } + static T quiet_NaN() { assert(false && "quiet_NaN not supported for this type"); } }; template<> struct numeric_limits<float> { @@ -156,6 +158,10 @@ template<> struct numeric_limits<float> static float (max)() { return CUDART_MAX_NORMAL_F; } EIGEN_DEVICE_FUNC static float (min)() { return FLT_MIN; } + EIGEN_DEVICE_FUNC + static float infinity() { return CUDART_INF_F; } + EIGEN_DEVICE_FUNC + static float quiet_NaN() { return CUDART_NAN_F; } }; template<> struct numeric_limits<double> { @@ -165,6 +171,10 @@ template<> struct numeric_limits<double> static double (max)() { return DBL_MAX; } EIGEN_DEVICE_FUNC static double (min)() { return DBL_MIN; } + EIGEN_DEVICE_FUNC + static double infinity() { return CUDART_INF; } + EIGEN_DEVICE_FUNC + static double quiet_NaN() { return CUDART_NAN; } }; template<> struct numeric_limits<int> { @@ -257,7 +267,7 @@ struct has_std_result_type {int a[2];}; struct has_tr1_result {int a[3];}; template<typename Func, typename ArgType, int SizeOf=sizeof(has_none)> -struct unary_result_of_select {typedef ArgType type;}; +struct unary_result_of_select {typedef typename internal::remove_all<ArgType>::type type;}; template<typename Func, typename ArgType> struct unary_result_of_select<Func, ArgType, sizeof(has_std_result_type)> {typedef typename Func::result_type type;}; @@ -279,7 +289,7 @@ struct result_of<Func(ArgType)> { }; template<typename Func, typename ArgType0, typename 
ArgType1, int SizeOf=sizeof(has_none)> -struct binary_result_of_select {typedef ArgType0 type;}; +struct binary_result_of_select {typedef typename internal::remove_all<ArgType0>::type type;}; template<typename Func, typename ArgType0, typename ArgType1> struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_std_result_type)> @@ -326,6 +336,22 @@ class meta_sqrt template<int Y, int InfX, int SupX> class meta_sqrt<Y, InfX, SupX, true> { public: enum { ret = (SupX*SupX <= Y) ? SupX : InfX }; }; + +/** \internal Computes the least common multiple of two positive integer A and B + * at compile-time. It implements a naive algorithm testing all multiples of A. + * It thus works better if A>=B. + */ +template<int A, int B, int K=1, bool Done = ((A*K)%B)==0> +struct meta_least_common_multiple +{ + enum { ret = meta_least_common_multiple<A,B,K+1>::ret }; +}; +template<int A, int B, int K> +struct meta_least_common_multiple<A,B,K,true> +{ + enum { ret = A*K }; +}; + /** \internal determines whether the product of two numeric types is allowed and what the return type is */ template<typename T, typename U> struct scalar_product_traits { @@ -375,6 +401,12 @@ template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } #endif +#if defined(__CUDA_ARCH__) +using internal::device::numeric_limits; +#else +using std::numeric_limits; +#endif + // Integer division with rounding up. 
// T is assumed to be an integer type with a>=0, and b>0 template<typename T> diff --git a/Eigen/src/Core/util/ReenableStupidWarnings.h b/Eigen/src/Core/util/ReenableStupidWarnings.h index 5ddfbd4aa..a23fab198 100644 --- a/Eigen/src/Core/util/ReenableStupidWarnings.h +++ b/Eigen/src/Core/util/ReenableStupidWarnings.h @@ -9,6 +9,16 @@ #elif defined __clang__ #pragma clang diagnostic pop #endif + + #if defined __NVCC__ +// Don't reenable the diagnostic messages, as it turns out these messages need +// to be disabled at the point of the template instantiation (i.e. the user code) +// otherwise they'll be triggered by nvcc. +// #pragma diag_default code_is_unreachable +// #pragma diag_default initialization_not_reachable +// #pragma diag_default 2651 +// #pragma diag_default 2653 + #endif #endif #endif // EIGEN_WARNINGS_DISABLED diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index 108181419..afae2e51e 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -26,7 +26,7 @@ #ifndef EIGEN_NO_STATIC_ASSERT - #if defined(__GXX_EXPERIMENTAL_CXX0X__) || (EIGEN_COMP_MSVC >= 1600) + #if __has_feature(cxx_static_assert) || (defined(__cplusplus) && __cplusplus >= 201103L) || (EIGEN_COMP_MSVC >= 1600) // if native static_assert is enabled, let's use it #define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG); @@ -50,6 +50,7 @@ THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE, THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE, THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE, + OUT_OF_RANGE_ACCESS, YOU_MADE_A_PROGRAMMING_MISTAKE, EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT, EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE, @@ -96,7 +97,8 @@ STORAGE_LAYOUT_DOES_NOT_MATCH, EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE, THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS, - 
MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY + MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY, + THIS_TYPE_IS_NOT_SUPPORTED }; }; diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index f9e2959cc..a001c473a 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -29,7 +29,7 @@ typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex; /** * \brief The Index type as used for the API. * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE. - * \sa \ref TopicPreprocessorDirectives, StorageIndex. + * \sa \blank \ref TopicPreprocessorDirectives, StorageIndex. */ typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE Index; @@ -390,9 +390,9 @@ struct transfer_constness * a*d. Evaluating can be beneficial for example if every coefficient access in the resulting expression causes * many coefficient accesses in the nested expressions -- as is the case with matrix product for example. * - * \param T the type of the expression being nested. - * \param n the number of coefficient accesses in the nested expression for each coefficient access in the bigger expression. - * \param PlainObject the type of the temporary if needed. + * \tparam T the type of the expression being nested. + * \tparam n the number of coefficient accesses in the nested expression for each coefficient access in the bigger expression. + * \tparam PlainObject the type of the temporary if needed. 
*/ template<typename T, int n, typename PlainObject = typename plain_object_eval<T>::type> struct nested_eval { @@ -466,17 +466,17 @@ struct special_scalar_op_base : public BaseType template<typename Derived,typename Scalar,typename OtherScalar, typename BaseType> struct special_scalar_op_base<Derived,Scalar,OtherScalar,BaseType,true> : public BaseType { - const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived> + const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, const Derived> operator*(const OtherScalar& scalar) const { #ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN #endif - return CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived> + return CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, const Derived> (*static_cast<const Derived*>(this), scalar_multiple2_op<Scalar,OtherScalar>(scalar)); } - inline friend const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived> + inline friend const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, const Derived> operator*(const OtherScalar& scalar, const Derived& matrix) { #ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN @@ -485,13 +485,13 @@ struct special_scalar_op_base<Derived,Scalar,OtherScalar,BaseType,true> : publi return static_cast<const special_scalar_op_base&>(matrix).operator*(scalar); } - const CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, Derived> + const CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, const Derived> operator/(const OtherScalar& scalar) const { #ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN #endif - return CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, Derived> + return CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, const Derived> (*static_cast<const Derived*>(this), scalar_quotient2_op<Scalar,OtherScalar>(scalar)); } }; @@ -526,22 +526,21 @@ template <typename A> struct promote_storage_type<const A, A> * the functor. 
* The default rules are as follows: * \code - * A op A -> A - * A op dense -> dense - * dense op B -> dense - * A * dense -> A - * dense * B -> B + * A op A -> A + * A op dense -> dense + * dense op B -> dense + * sparse op dense -> sparse + * dense op sparse -> sparse * \endcode */ template <typename A, typename B, typename Functor> struct cwise_promote_storage_type; -template <typename A, typename Functor> struct cwise_promote_storage_type<A,A,Functor> { typedef A ret; }; -template <typename Functor> struct cwise_promote_storage_type<Dense,Dense,Functor> { typedef Dense ret; }; -template <typename ScalarA, typename ScalarB> struct cwise_promote_storage_type<Dense,Dense,scalar_product_op<ScalarA,ScalarB> > { typedef Dense ret; }; -template <typename A, typename Functor> struct cwise_promote_storage_type<A,Dense,Functor> { typedef Dense ret; }; -template <typename B, typename Functor> struct cwise_promote_storage_type<Dense,B,Functor> { typedef Dense ret; }; -template <typename A, typename ScalarA, typename ScalarB> struct cwise_promote_storage_type<A,Dense,scalar_product_op<ScalarA,ScalarB> > { typedef A ret; }; -template <typename B, typename ScalarA, typename ScalarB> struct cwise_promote_storage_type<Dense,B,scalar_product_op<ScalarA,ScalarB> > { typedef B ret; }; +template <typename A, typename Functor> struct cwise_promote_storage_type<A,A,Functor> { typedef A ret; }; +template <typename Functor> struct cwise_promote_storage_type<Dense,Dense,Functor> { typedef Dense ret; }; +template <typename A, typename Functor> struct cwise_promote_storage_type<A,Dense,Functor> { typedef Dense ret; }; +template <typename B, typename Functor> struct cwise_promote_storage_type<Dense,B,Functor> { typedef Dense ret; }; +template <typename Functor> struct cwise_promote_storage_type<Sparse,Dense,Functor> { typedef Sparse ret; }; +template <typename Functor> struct cwise_promote_storage_type<Dense,Sparse,Functor> { typedef Sparse ret; }; /** \internal Specify the "storage kind" 
of multiplying an expression of kind A with kind B. * The template parameter ProductTag permits to specialize the resulting storage kind wrt to @@ -575,7 +574,7 @@ template <int ProductTag> struct product_promote_storage_type<Dense, template <int ProductTag> struct product_promote_storage_type<PermutationStorage, Dense, ProductTag> { typedef Dense ret; }; /** \internal gives the plain matrix or array type to store a row/column/diagonal of a matrix type. - * \param Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType. + * \tparam Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType. */ template<typename ExpressionType, typename Scalar = typename ExpressionType::Scalar> struct plain_row_type |