author    Deven Desai <deven.desai.amd@gmail.com>  2019-03-19 16:52:38 -0400
committer Deven Desai <deven.desai.amd@gmail.com>  2019-03-19 16:52:38 -0400
commit    2dbea5510fe5cb64dbfdef9042c04a3a92b87f76 (patch)
tree      c187e7ec5e90a191e19466ff6084dd8f053dba7e /Eigen/src/Core
parent    e7e6809e6b38a5928efc0b5ca9520258e4d1fb3a (diff)
parent    5c93b38c5fca514a08084e32feb8a8fb27bf3665 (diff)
Merged eigen/eigen into default
Diffstat (limited to 'Eigen/src/Core')
-rw-r--r--  Eigen/src/Core/Array.h | 102
-rw-r--r--  Eigen/src/Core/AssignEvaluator.h | 3
-rwxr-xr-x  Eigen/src/Core/Assign_MKL.h | 20
-rw-r--r--  Eigen/src/Core/Block.h | 60
-rw-r--r--  Eigen/src/Core/ConditionEstimator.h | 2
-rw-r--r--  Eigen/src/Core/CoreEvaluators.h | 91
-rw-r--r--  Eigen/src/Core/CwiseBinaryOp.h | 24
-rw-r--r--  Eigen/src/Core/CwiseNullaryOp.h | 10
-rw-r--r--  Eigen/src/Core/CwiseUnaryView.h | 2
-rw-r--r--  Eigen/src/Core/DenseBase.h | 16
-rw-r--r--  Eigen/src/Core/DiagonalMatrix.h | 24
-rw-r--r--  Eigen/src/Core/GeneralProduct.h | 11
-rw-r--r--  Eigen/src/Core/GenericPacketMath.h | 138
-rw-r--r--  Eigen/src/Core/GlobalFunctions.h | 5
-rw-r--r--  Eigen/src/Core/IO.h | 22
-rw-r--r--  Eigen/src/Core/IndexedView.h | 2
-rw-r--r--  Eigen/src/Core/Matrix.h | 135
-rw-r--r--  Eigen/src/Core/MatrixBase.h | 5
-rw-r--r--  Eigen/src/Core/NestByValue.h | 69
-rw-r--r--  Eigen/src/Core/PlainObjectBase.h | 83
-rw-r--r--  Eigen/src/Core/Product.h | 15
-rw-r--r--  Eigen/src/Core/ProductEvaluators.h | 73
-rw-r--r--  Eigen/src/Core/Redux.h | 10
-rw-r--r--  Eigen/src/Core/Ref.h | 5
-rw-r--r--  Eigen/src/Core/Reshaped.h | 2
-rw-r--r--  Eigen/src/Core/Reverse.h | 10
-rw-r--r--  Eigen/src/Core/SelfAdjointView.h | 13
-rw-r--r--  Eigen/src/Core/Solve.h | 2
-rw-r--r--  Eigen/src/Core/SolverBase.h | 41
-rw-r--r--  Eigen/src/Core/Swap.h | 9
-rw-r--r--  Eigen/src/Core/Transpose.h | 43
-rw-r--r--  Eigen/src/Core/TriangularMatrix.h | 13
-rw-r--r--  Eigen/src/Core/VectorBlock.h | 8
-rw-r--r--  Eigen/src/Core/VectorwiseOp.h | 46
-rw-r--r--  Eigen/src/Core/Visitor.h | 35
-rw-r--r--  Eigen/src/Core/arch/AVX/Complex.h | 28
-rw-r--r--  Eigen/src/Core/arch/AVX/MathFunctions.h | 316
-rw-r--r--  Eigen/src/Core/arch/AVX/PacketMath.h | 221
-rw-r--r--  Eigen/src/Core/arch/AVX/TypeCasting.h | 10
-rw-r--r--  Eigen/src/Core/arch/AVX512/Complex.h | 488
-rw-r--r--  Eigen/src/Core/arch/AVX512/MathFunctions.h | 26
-rw-r--r--  Eigen/src/Core/arch/AVX512/PacketMath.h | 309
-rw-r--r--  Eigen/src/Core/arch/AltiVec/Complex.h | 16
-rw-r--r--  Eigen/src/Core/arch/AltiVec/MathFunctions.h | 267
-rwxr-xr-x  Eigen/src/Core/arch/AltiVec/PacketMath.h | 167
-rw-r--r--  Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h | 471
-rw-r--r--  Eigen/src/Core/arch/Default/Settings.h | 2
-rw-r--r--  Eigen/src/Core/arch/GPU/PacketMath.h | 124
-rw-r--r--  Eigen/src/Core/arch/GPU/PacketMathHalf.h | 217
-rw-r--r--  Eigen/src/Core/arch/MSA/Complex.h | 4
-rw-r--r--  Eigen/src/Core/arch/MSA/MathFunctions.h | 4
-rw-r--r--  Eigen/src/Core/arch/MSA/PacketMath.h | 6
-rw-r--r--  Eigen/src/Core/arch/NEON/Complex.h | 32
-rw-r--r--  Eigen/src/Core/arch/NEON/MathFunctions.h | 170
-rw-r--r--  Eigen/src/Core/arch/NEON/PacketMath.h | 69
-rw-r--r--  Eigen/src/Core/arch/NEON/TypeCasting.h | 8
-rw-r--r--  Eigen/src/Core/arch/SSE/Complex.h | 25
-rw-r--r--  Eigen/src/Core/arch/SSE/MathFunctions.h | 417
-rwxr-xr-x  Eigen/src/Core/arch/SSE/PacketMath.h | 181
-rw-r--r--  Eigen/src/Core/arch/SSE/TypeCasting.h | 7
-rw-r--r--  Eigen/src/Core/arch/SYCL/InteropHeaders.h | 2
-rw-r--r--  Eigen/src/Core/arch/ZVector/Complex.h | 4
-rwxr-xr-x  Eigen/src/Core/arch/ZVector/PacketMath.h | 6
-rw-r--r--  Eigen/src/Core/functors/AssignmentFunctors.h | 11
-rw-r--r--  Eigen/src/Core/functors/NullaryFunctors.h | 39
-rw-r--r--  Eigen/src/Core/functors/UnaryFunctors.h | 76
-rw-r--r--  Eigen/src/Core/products/GeneralBlockPanelKernel.h | 1371
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrix.h | 14
-rw-r--r--  Eigen/src/Core/products/Parallelizer.h | 3
-rw-r--r--  Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 23
-rwxr-xr-x  Eigen/src/Core/util/BlasUtil.h | 12
-rw-r--r--  Eigen/src/Core/util/ConfigureVectorization.h | 39
-rw-r--r--  Eigen/src/Core/util/ForwardDeclarations.h | 7
-rw-r--r--  Eigen/src/Core/util/IndexedViewHelper.h | 4
-rw-r--r--  Eigen/src/Core/util/Macros.h | 98
-rw-r--r--  Eigen/src/Core/util/Memory.h | 15
-rwxr-xr-x  Eigen/src/Core/util/Meta.h | 33
-rw-r--r--  Eigen/src/Core/util/StaticAssert.h | 3
-rw-r--r--  Eigen/src/Core/util/XprHelper.h | 3
79 files changed, 4360 insertions(+), 2137 deletions(-)
diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h
index e10020d4f..ee12d96fc 100644
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@@ -153,8 +153,6 @@ class Array
: Base(std::move(other))
{
Base::_check_template_params();
- if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
- Base::_set_noalias(other);
}
EIGEN_DEVICE_FUNC
Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
@@ -180,6 +178,46 @@ class Array
Base::_check_template_params();
this->template _init2<T0,T1>(val0, val1);
}
+
+ #if EIGEN_HAS_CXX11
+ /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+ *
+ * Example: \include Array_variadic_ctor_cxx11.cpp
+ * Output: \verbinclude Array_variadic_ctor_cxx11.out
+ *
+ * \sa Array(const std::initializer_list<std::initializer_list<Scalar>>&)
+ * \sa Array(Scalar), Array(Scalar,Scalar)
+ */
+ template <typename... ArgTypes>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+ : Base(a0, a1, a2, a3, args...) {}
+
+ /** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
+ *
+ * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
+ *
+ * Example: \include Array_initializer_list_23_cxx11.cpp
+ * Output: \verbinclude Array_initializer_list_23_cxx11.out
+ *
+ * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
+ *
+ * In the case of a compile-time column 1D array, implicit transposition from a single row is allowed.
+ * Therefore <code> Array<int,Dynamic,1>{{1,2,3,4,5}}</code> is legal and the more verbose syntax
+ * <code>Array<int,Dynamic,1>{{1},{2},{3},{4},{5}}</code> can be avoided:
+ *
+ * Example: \include Array_initializer_list_vector_cxx11.cpp
+ * Output: \verbinclude Array_initializer_list_vector_cxx11.out
+ *
+ * In the case of fixed-sized arrays, the initializer list sizes must exactly match the array sizes,
+ * and implicit transposition is allowed for compile-time 1D arrays only.
+ *
+ * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+ */
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Array(const std::initializer_list<std::initializer_list<Scalar> >& list) : Base(list) {}
+ #endif // end EIGEN_HAS_CXX11
+
#else
/** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */
EIGEN_DEVICE_FUNC explicit Array(const Scalar *data);
@@ -191,7 +229,8 @@ class Array
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE explicit Array(Index dim);
- /** constructs an initialized 1x1 Array with the given coefficient */
+ /** constructs an initialized 1x1 Array with the given coefficient
+    * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */
Array(const Scalar& value);
/** constructs an uninitialized array with \a rows rows and \a cols columns.
*
@@ -199,11 +238,14 @@ class Array
* it is redundant to pass these parameters, so one should use the default constructor
* Array() instead. */
Array(Index rows, Index cols);
- /** constructs an initialized 2D vector with given coefficients */
+ /** constructs an initialized 2D vector with given coefficients
+ * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */
Array(const Scalar& val0, const Scalar& val1);
- #endif
+ #endif // end EIGEN_PARSED_BY_DOXYGEN
- /** constructs an initialized 3D vector with given coefficients */
+ /** constructs an initialized 3D vector with given coefficients
+ * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+ */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2)
{
@@ -213,7 +255,9 @@ class Array
m_storage.data()[1] = val1;
m_storage.data()[2] = val2;
}
- /** constructs an initialized 4D vector with given coefficients */
+ /** constructs an initialized 4D vector with given coefficients
+ * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+ */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3)
{
@@ -260,7 +304,7 @@ class Array
/** \defgroup arraytypedefs Global array typedefs
* \ingroup Core_Module
*
- * Eigen defines several typedef shortcuts for most common 1D and 2D array types.
+ * %Eigen defines several typedef shortcuts for most common 1D and 2D array types.
*
* The general patterns are the following:
*
@@ -273,6 +317,12 @@ class Array
* There are also \c ArraySizeType which are self-explanatory. For example, \c Array4cf is
* a fixed-size 1D array of 4 complex floats.
*
+ * With \cpp11, template aliases are also defined for common sizes.
+ * They follow the same pattern as above except that the scalar type suffix is replaced by a
+ * template parameter, i.e.:
+ * - `ArrayRowsCols<Type>` where `Rows` and `Cols` can be \c 2,\c 3,\c 4, or \c X for fixed or dynamic size.
+ * - `ArraySize<Type>` where `Size` can be \c 2,\c 3,\c 4 or \c X for fixed or dynamic size 1D arrays.
+ *
* \sa class Array
*/
@@ -305,9 +355,43 @@ EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex<double>, cd)
#undef EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES
#undef EIGEN_MAKE_ARRAY_TYPEDEFS
+#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS
+
+#if EIGEN_HAS_CXX11
+
+#define EIGEN_MAKE_ARRAY_TYPEDEFS(Size, SizeSuffix) \
+/** \ingroup arraytypedefs */ \
+/** \brief \cpp11 */ \
+template <typename Type> \
+using Array##SizeSuffix##SizeSuffix = Array<Type, Size, Size>; \
+/** \ingroup arraytypedefs */ \
+/** \brief \cpp11 */ \
+template <typename Type> \
+using Array##SizeSuffix = Array<Type, Size, 1>;
+
+#define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Size) \
+/** \ingroup arraytypedefs */ \
+/** \brief \cpp11 */ \
+template <typename Type> \
+using Array##Size##X = Array<Type, Size, Dynamic>; \
+/** \ingroup arraytypedefs */ \
+/** \brief \cpp11 */ \
+template <typename Type> \
+using Array##X##Size = Array<Type, Dynamic, Size>;
+
+EIGEN_MAKE_ARRAY_TYPEDEFS(2, 2)
+EIGEN_MAKE_ARRAY_TYPEDEFS(3, 3)
+EIGEN_MAKE_ARRAY_TYPEDEFS(4, 4)
+EIGEN_MAKE_ARRAY_TYPEDEFS(Dynamic, X)
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(2)
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(3)
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(4)
-#undef EIGEN_MAKE_ARRAY_TYPEDEFS_LARGE
+#undef EIGEN_MAKE_ARRAY_TYPEDEFS
+#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS
+#endif // EIGEN_HAS_CXX11
+
#define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \
using Eigen::Matrix##SizeSuffix##TypeSuffix; \
using Eigen::Vector##SizeSuffix##TypeSuffix; \
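
A minimal usage sketch of the C++11 constructors and template aliases documented in the Array.h hunks above (variable names are illustrative, not from the patch):

#include <Eigen/Core>
using namespace Eigen;

// 2x3 array built from nested initializer lists, one inner list per row.
ArrayXX<int> a{{1, 2, 3},
               {4, 5, 6}};

// Implicit transposition from a single row, allowed for compile-time 1D arrays only.
Array<int, Dynamic, 1> v{{1, 2, 3, 4, 5}};

// Variadic constructor: at least 4 coefficients, and the count must match the fixed size.
Array<float, 6, 1> w(1.f, 2.f, 3.f, 4.f, 5.f, 6.f);

// New template aliases: Array33<T> is Array<T,3,3>, ArrayXX<T> is Array<T,Dynamic,Dynamic>.
Array33<double> m;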
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index 79575e1b4..229e25854 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -611,7 +611,8 @@ public:
typedef typename AssignmentTraits::PacketType PacketType;
- EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
: m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
{
#ifdef EIGEN_DEBUG_ASSIGN
diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h
index 6866095bf..c6140d185 100755
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h
@@ -68,16 +68,16 @@ class vml_assign_traits
#define EIGEN_PP_EXPAND(ARG) ARG
#if !defined (EIGEN_FAST_MATH) || (EIGEN_FAST_MATH != 1)
-#define EIGEN_VMLMODE_EXPAND_LA , VML_HA
+#define EIGEN_VMLMODE_EXPAND_xLA , VML_HA
#else
-#define EIGEN_VMLMODE_EXPAND_LA , VML_LA
+#define EIGEN_VMLMODE_EXPAND_xLA , VML_LA
#endif
-#define EIGEN_VMLMODE_EXPAND__
+#define EIGEN_VMLMODE_EXPAND_x_
-#define EIGEN_VMLMODE_PREFIX_LA vm
-#define EIGEN_VMLMODE_PREFIX__ v
-#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_,VMLMODE)
+#define EIGEN_VMLMODE_PREFIX_xLA vm
+#define EIGEN_VMLMODE_PREFIX_x_ v
+#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_x,VMLMODE)
#define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE) \
template< typename DstXprType, typename SrcXprNested> \
@@ -89,7 +89,7 @@ class vml_assign_traits
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) { \
VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \
- (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) ); \
+ (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) ); \
} else { \
const Index outerSize = dst.outerSize(); \
for(Index outer = 0; outer < outerSize; ++outer) { \
@@ -97,7 +97,7 @@ class vml_assign_traits
&(src.nestedExpression().coeffRef(0, outer)); \
EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer)); \
VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, \
- (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE)); \
+ (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE)); \
} \
} \
} \
@@ -152,7 +152,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _)
if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) \
{ \
VMLOP( dst.size(), (const VMLTYPE*)src.lhs().data(), exponent, \
- (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) ); \
+ (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) ); \
} else { \
const Index outerSize = dst.outerSize(); \
for(Index outer = 0; outer < outerSize; ++outer) { \
@@ -160,7 +160,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _)
&(src.lhs().coeffRef(0, outer)); \
EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer)); \
VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, exponent, \
- (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE)); \
+ (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE)); \
} \
} \
} \
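
The x infix introduced above changes the pasted identifiers from e.g. EIGEN_VMLMODE_EXPAND__ to EIGEN_VMLMODE_EXPAND_x_, presumably to avoid identifiers containing a double underscore (reserved in C++) when VMLMODE is the bare underscore token. A standalone sketch of the dispatch mechanism (macro names here are invented for illustration):

#define MY_CAT2(a, b) a##b
#define MY_CAT(a, b)  MY_CAT2(a, b)
#define MY_PREFIX_xLA vm   // VML call taking an accuracy-mode argument
#define MY_PREFIX_x_  v    // plain VML call; the mode is spelled "_"
#define MY_PREFIX(VMLMODE) MY_CAT(MY_PREFIX_x, VMLMODE)
// MY_PREFIX(LA) expands to vm, MY_PREFIX(_) expands to v.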
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 11de45c2e..6e938ea58 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -114,8 +114,8 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
/** Column or Row constructor
*/
- EIGEN_DEVICE_FUNC
- inline Block(XprType& xpr, Index i) : Impl(xpr,i)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Block(XprType& xpr, Index i) : Impl(xpr,i)
{
eigen_assert( (i>=0) && (
((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && i<xpr.rows())
@@ -124,8 +124,8 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
/** Fixed-size constructor
*/
- EIGEN_DEVICE_FUNC
- inline Block(XprType& xpr, Index startRow, Index startCol)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Block(XprType& xpr, Index startRow, Index startCol)
: Impl(xpr, startRow, startCol)
{
EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
@@ -135,8 +135,8 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
/** Dynamic-size constructor
*/
- EIGEN_DEVICE_FUNC
- inline Block(XprType& xpr,
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Block(XprType& xpr,
Index startRow, Index startCol,
Index blockRows, Index blockCols)
: Impl(xpr, startRow, startCol, blockRows, blockCols)
@@ -159,10 +159,10 @@ class BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, Dense>
public:
typedef Impl Base;
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
- EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
- EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {}
EIGEN_DEVICE_FUNC
- inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+ EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
: Impl(xpr, startRow, startCol, blockRows, blockCols) {}
};
@@ -294,22 +294,22 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
EIGEN_DEVICE_FUNC inline Index outerStride() const;
#endif
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
{
return m_xpr;
}
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
XprType& nestedExpression() { return m_xpr; }
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
StorageIndex startRow() const
{
return m_startRow.value();
}
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
StorageIndex startCol() const
{
return m_startCol.value();
@@ -342,8 +342,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
/** Column or Row constructor
*/
- EIGEN_DEVICE_FUNC
- inline BlockImpl_dense(XprType& xpr, Index i)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ BlockImpl_dense(XprType& xpr, Index i)
: Base(xpr.data() + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor))
|| ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()),
BlockRows==1 ? 1 : xpr.rows(),
@@ -357,8 +357,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
/** Fixed-size constructor
*/
- EIGEN_DEVICE_FUNC
- inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
: Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
{
@@ -367,8 +367,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
/** Dynamic-size constructor
*/
- EIGEN_DEVICE_FUNC
- inline BlockImpl_dense(XprType& xpr,
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ BlockImpl_dense(XprType& xpr,
Index startRow, Index startCol,
Index blockRows, Index blockCols)
: Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols),
@@ -377,18 +377,18 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
init();
}
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
{
return m_xpr;
}
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
XprType& nestedExpression() { return m_xpr; }
/** \sa MapBase::innerStride() */
- EIGEN_DEVICE_FUNC
- inline Index innerStride() const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index innerStride() const
{
return internal::traits<BlockType>::HasSameStorageOrderAsXprType
? m_xpr.innerStride()
@@ -396,19 +396,19 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
}
/** \sa MapBase::outerStride() */
- EIGEN_DEVICE_FUNC
- inline Index outerStride() const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index outerStride() const
{
return m_outerStride;
}
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
StorageIndex startRow() const
{
return m_startRow.value();
}
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
StorageIndex startCol() const
{
return m_startCol.value();
@@ -422,8 +422,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** \internal used by allowAligned() */
- EIGEN_DEVICE_FUNC
- inline BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
: Base(data, blockRows, blockCols), m_xpr(xpr)
{
init();
@@ -431,7 +431,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
#endif
protected:
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void init()
{
m_outerStride = internal::traits<BlockType>::HasSameStorageOrderAsXprType
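
For reference, the three Block constructors annotated above are the ones reached through the usual block API; a sketch (function and variable names are illustrative):

#include <Eigen/Core>

void block_examples(Eigen::MatrixXd& m) {
  auto r = m.row(2);            // "Column or Row constructor": Block(xpr, i)
  auto f = m.block<2, 2>(0, 0); // "Fixed-size constructor": Block(xpr, startRow, startCol)
  auto d = m.block(1, 1, 3, 4); // "Dynamic-size constructor": Block(xpr, startRow, startCol, blockRows, blockCols)
  (void)r; (void)f; (void)d;
}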
diff --git a/Eigen/src/Core/ConditionEstimator.h b/Eigen/src/Core/ConditionEstimator.h
index aa7efdc76..51a2e5f1b 100644
--- a/Eigen/src/Core/ConditionEstimator.h
+++ b/Eigen/src/Core/ConditionEstimator.h
@@ -160,7 +160,7 @@ rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Deco
{
typedef typename Decomposition::RealScalar RealScalar;
eigen_assert(dec.rows() == dec.cols());
- if (dec.rows() == 0) return RealScalar(1);
+ if (dec.rows() == 0) return NumTraits<RealScalar>::infinity();
if (matrix_norm == RealScalar(0)) return RealScalar(0);
if (dec.rows() == 1) return RealScalar(1);
const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec);
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index d5da5cdec..670fa77b5 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -90,7 +90,8 @@ template<typename T>
struct evaluator : public unary_evaluator<T>
{
typedef unary_evaluator<T> Base;
- EIGEN_DEVICE_FUNC explicit evaluator(const T& xpr) : Base(xpr) {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit evaluator(const T& xpr) : Base(xpr) {}
};
@@ -99,7 +100,7 @@ template<typename T>
struct evaluator<const T>
: evaluator<T>
{
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
explicit evaluator(const T& xpr) : evaluator<T>(xpr) {}
};
@@ -134,21 +135,25 @@ private:
// this helper permits to completely eliminate m_outerStride if it is known at compiletime.
template<typename Scalar,int OuterStride> class plainobjectbase_evaluator_data {
public:
- EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr)
{
#ifndef EIGEN_INTERNAL_DEBUGGING
EIGEN_UNUSED_VARIABLE(outerStride);
#endif
eigen_internal_assert(outerStride==OuterStride);
}
- EIGEN_DEVICE_FUNC Index outerStride() const { return OuterStride; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index outerStride() const { return OuterStride; }
const Scalar *data;
};
template<typename Scalar> class plainobjectbase_evaluator_data<Scalar,Dynamic> {
public:
- EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {}
- EIGEN_DEVICE_FUNC Index outerStride() const { return m_outerStride; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index outerStride() const { return m_outerStride; }
const Scalar *data;
protected:
Index m_outerStride;
@@ -179,13 +184,15 @@ struct evaluator<PlainObjectBase<Derived> >
: RowsAtCompileTime
};
- EIGEN_DEVICE_FUNC evaluator()
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ evaluator()
: m_d(0,OuterStrideAtCompileTime)
{
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
}
- EIGEN_DEVICE_FUNC explicit evaluator(const PlainObjectType& m)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit evaluator(const PlainObjectType& m)
: m_d(m.data(),IsVectorAtCompileTime ? 0 : m.outerStride())
{
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
@@ -268,9 +275,11 @@ struct evaluator<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
{
typedef Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
- EIGEN_DEVICE_FUNC evaluator() {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ evaluator() {}
- EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit evaluator(const XprType& m)
: evaluator<PlainObjectBase<XprType> >(m)
{ }
};
@@ -281,9 +290,11 @@ struct evaluator<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
{
typedef Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
- EIGEN_DEVICE_FUNC evaluator() {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ evaluator() {}
- EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit evaluator(const XprType& m)
: evaluator<PlainObjectBase<XprType> >(m)
{ }
};
@@ -302,7 +313,8 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased>
Alignment = evaluator<ArgType>::Alignment
};
- EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -712,7 +724,8 @@ struct evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
typedef binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > Base;
- EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit evaluator(const XprType& xpr) : Base(xpr) {}
};
template<typename BinaryOp, typename Lhs, typename Rhs>
@@ -740,7 +753,8 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<Lhs>::Alignment,evaluator<Rhs>::Alignment)
};
- EIGEN_DEVICE_FUNC explicit binary_evaluator(const XprType& xpr) : m_d(xpr)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit binary_evaluator(const XprType& xpr) : m_d(xpr)
{
EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
@@ -877,7 +891,8 @@ struct mapbase_evaluator : evaluator_base<Derived>
CoeffReadCost = NumTraits<Scalar>::ReadCost
};
- EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit mapbase_evaluator(const XprType& map)
: m_data(const_cast<PointerType>(map.data())),
m_innerStride(map.innerStride()),
m_outerStride(map.outerStride())
@@ -941,10 +956,10 @@ struct mapbase_evaluator : evaluator_base<Derived>
internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
}
protected:
- EIGEN_DEVICE_FUNC
- inline Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); }
- EIGEN_DEVICE_FUNC
- inline Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); }
PointerType m_data;
const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
@@ -997,7 +1012,8 @@ struct evaluator<Ref<PlainObjectType, RefOptions, StrideType> >
Alignment = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Alignment
};
- EIGEN_DEVICE_FUNC explicit evaluator(const XprType& ref)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit evaluator(const XprType& ref)
: mapbase_evaluator<XprType, PlainObjectType>(ref)
{ }
};
@@ -1052,7 +1068,8 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0)
};
typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
- EIGEN_DEVICE_FUNC explicit evaluator(const XprType& block) : block_evaluator_type(block)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit evaluator(const XprType& block) : block_evaluator_type(block)
{
EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
}
@@ -1065,7 +1082,8 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /*HasDirectAcc
{
typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
- EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit block_evaluator(const XprType& block)
: unary_evaluator<XprType>(block)
{}
};
@@ -1076,7 +1094,8 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
{
typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
- EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit unary_evaluator(const XprType& block)
: m_argImpl(block.nestedExpression()),
m_startRow(block.startRow()),
m_startCol(block.startCol()),
@@ -1176,7 +1195,8 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAc
typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
typedef typename XprType::Scalar Scalar;
- EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit block_evaluator(const XprType& block)
: mapbase_evaluator<XprType, typename XprType::PlainObject>(block)
{
// TODO: for the 3.3 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime
@@ -1204,7 +1224,8 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ThenMatrixType>::Alignment, evaluator<ElseMatrixType>::Alignment)
};
- EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit evaluator(const XprType& select)
: m_conditionImpl(select.conditionMatrix()),
m_thenImpl(select.thenMatrix()),
m_elseImpl(select.elseMatrix())
@@ -1261,7 +1282,8 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
Alignment = evaluator<ArgTypeNestedCleaned>::Alignment
};
- EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& replicate)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit unary_evaluator(const XprType& replicate)
: m_arg(replicate.nestedExpression()),
m_argImpl(m_arg),
m_rows(replicate.nestedExpression().rows()),
@@ -1341,7 +1363,8 @@ struct evaluator_wrapper_base
Alignment = evaluator<ArgType>::Alignment
};
- EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
typedef typename ArgType::Scalar Scalar;
typedef typename ArgType::CoeffReturnType CoeffReturnType;
@@ -1408,7 +1431,8 @@ struct unary_evaluator<MatrixWrapper<TArgType> >
{
typedef MatrixWrapper<TArgType> XprType;
- EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit unary_evaluator(const XprType& wrapper)
: evaluator_wrapper_base<MatrixWrapper<TArgType> >(wrapper.nestedExpression())
{ }
};
@@ -1419,7 +1443,8 @@ struct unary_evaluator<ArrayWrapper<TArgType> >
{
typedef ArrayWrapper<TArgType> XprType;
- EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit unary_evaluator(const XprType& wrapper)
: evaluator_wrapper_base<ArrayWrapper<TArgType> >(wrapper.nestedExpression())
{ }
};
@@ -1461,7 +1486,8 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
Alignment = 0 // FIXME in some rare cases, Alignment could be preserved, like a Vector4f.
};
- EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& reverse)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit unary_evaluator(const XprType& reverse)
: m_argImpl(reverse.nestedExpression()),
m_rows(ReverseRow ? reverse.nestedExpression().rows() : 1),
m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1)
@@ -1568,7 +1594,8 @@ struct evaluator<Diagonal<ArgType, DiagIndex> >
Alignment = 0
};
- EIGEN_DEVICE_FUNC explicit evaluator(const XprType& diagonal)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit evaluator(const XprType& diagonal)
: m_argImpl(diagonal.nestedExpression()),
m_index(diagonal.index())
{ }
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index bf2632d9e..8b8de8382 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -100,8 +100,14 @@ class CwiseBinaryOp :
typedef typename internal::remove_reference<LhsNested>::type _LhsNested;
typedef typename internal::remove_reference<RhsNested>::type _RhsNested;
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
+#if EIGEN_COMP_MSVC && EIGEN_HAS_CXX11
+ //Required for Visual Studio or the Copy constructor will probably not get inlined!
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CwiseBinaryOp(const CwiseBinaryOp<BinaryOp,LhsType,RhsType>&) = default;
+#endif
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
: m_lhs(aLhs), m_rhs(aRhs), m_functor(func)
{
EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename Rhs::Scalar);
@@ -110,16 +116,16 @@ class CwiseBinaryOp :
eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
}
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Index rows() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index rows() const {
// return the fixed size type if available to enable compile time optimizations
if (internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic)
return m_rhs.rows();
else
return m_lhs.rows();
}
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Index cols() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index cols() const {
// return the fixed size type if available to enable compile time optimizations
if (internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic)
return m_rhs.cols();
@@ -128,13 +134,13 @@ class CwiseBinaryOp :
}
/** \returns the left hand side nested expression */
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const _LhsNested& lhs() const { return m_lhs; }
/** \returns the right hand side nested expression */
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const _RhsNested& rhs() const { return m_rhs; }
/** \returns the functor representing the binary operation */
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const BinaryOp& functor() const { return m_functor; }
protected:
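
The rows()/cols() logic above prefers the operand whose size is fixed at compile time; a sketch of the effect (assuming the dynamic operand is 3x3 at runtime):

#include <Eigen/Core>

void cwise_binary_example(const Eigen::Matrix3f& fixed, const Eigen::MatrixXf& dyn) {
  // The sum is a CwiseBinaryOp expression; its rows()/cols() come from the
  // fixed-size operand, so they reduce to compile-time constants.
  auto sum = fixed + dyn;  // dyn must be 3x3 at runtime
  eigen_assert(sum.rows() == 3 && sum.cols() == 3);
}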
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index d149abe93..ef708197b 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -239,7 +239,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomA
DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
- return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar>(low,high,size));
+ return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar>(low,high,size));
}
/** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(const Scalar&,const Scalar&)
@@ -252,7 +252,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
- return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar>(low,high,Derived::SizeAtCompileTime));
+ return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar>(low,high,Derived::SizeAtCompileTime));
}
/**
@@ -283,7 +283,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomA
DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
- return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar>(low,high,size));
+ return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar>(low,high,size));
}
/**
@@ -296,7 +296,7 @@ DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
- return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar>(low,high,Derived::SizeAtCompileTime));
+ return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar>(low,high,Derived::SizeAtCompileTime));
}
/** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
@@ -398,7 +398,7 @@ template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
- return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar>(low,high,newSize));
+ return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar>(low,high,newSize));
}
/**
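
The hunks above only drop the PacketScalar template argument from internal::linspaced_op; user-facing LinSpaced behavior is unchanged, e.g.:

#include <Eigen/Core>
#include <iostream>

int main() {
  // Five evenly spaced values from 0 to 1 inclusive: 0 0.25 0.5 0.75 1
  Eigen::VectorXf v = Eigen::VectorXf::LinSpaced(5, 0.f, 1.f);
  std::cout << v.transpose() << std::endl;
}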
diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h
index 271033056..21cf5ea9e 100644
--- a/Eigen/src/Core/CwiseUnaryView.h
+++ b/Eigen/src/Core/CwiseUnaryView.h
@@ -81,7 +81,7 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
/** \returns the nested expression */
typename internal::remove_reference<MatrixTypeNested>::type&
- nestedExpression() { return m_matrix.const_cast_derived(); }
+ nestedExpression() { return m_matrix; }
protected:
MatrixTypeNested m_matrix;
diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index 2a0927317..2289fe41f 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -40,7 +40,7 @@ static inline void check_DenseIndex_is_signed() {
*/
template<typename Derived> class DenseBase
#ifndef EIGEN_PARSED_BY_DOXYGEN
- : public DenseCoeffsBase<Derived>
+ : public DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value>
#else
: public DenseCoeffsBase<Derived,DirectWriteAccessors>
#endif // not EIGEN_PARSED_BY_DOXYGEN
@@ -71,7 +71,7 @@ template<typename Derived> class DenseBase
typedef Scalar value_type;
typedef typename NumTraits<Scalar>::Real RealScalar;
- typedef DenseCoeffsBase<Derived> Base;
+ typedef DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value> Base;
using Base::derived;
using Base::const_cast_derived;
@@ -150,8 +150,8 @@ template<typename Derived> class DenseBase
* \sa SizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime
*/
- IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
- || internal::traits<Derived>::MaxColsAtCompileTime == 1,
+ IsVectorAtCompileTime = internal::traits<Derived>::RowsAtCompileTime == 1
+ || internal::traits<Derived>::ColsAtCompileTime == 1,
/**< This is set to true if either the number of rows or the number of
* columns is known at compile-time to be equal to 1. Indeed, in that case,
* we are dealing with a column-vector (if there is only one column) or with
@@ -266,9 +266,9 @@ template<typename Derived> class DenseBase
/** \internal Represents a matrix with all coefficients equal to one another*/
typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
/** \internal \deprecated Represents a vector with linearly spaced coefficients that allows sequential access only. */
- typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar>,PlainObject> SequentialLinSpacedReturnType;
+ typedef CwiseNullaryOp<internal::linspaced_op<Scalar>,PlainObject> SequentialLinSpacedReturnType;
/** \internal Represents a vector with linearly spaced coefficients that allows random access. */
- typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar>,PlainObject> RandomAccessLinSpacedReturnType;
+ typedef CwiseNullaryOp<internal::linspaced_op<Scalar>,PlainObject> RandomAccessLinSpacedReturnType;
/** \internal the return type of MatrixBase::eigenvalues() */
typedef Matrix<typename NumTraits<typename internal::traits<Derived>::Scalar>::Real, internal::traits<Derived>::ColsAtCompileTime, 1> EigenvaluesReturnType;
@@ -415,7 +415,7 @@ template<typename Derived> class DenseBase
*
*/
template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void swap(const DenseBase<OtherDerived>& other)
{
EIGEN_STATIC_ASSERT(!OtherDerived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
@@ -427,7 +427,7 @@ template<typename Derived> class DenseBase
*
*/
template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void swap(PlainObjectBase<OtherDerived>& other)
{
eigen_assert(rows()==other.rows() && cols()==other.cols());
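
The IsVectorAtCompileTime change above keys the trait on the compile-time sizes rather than the max sizes; a small sketch of what the trait reports:

#include <Eigen/Core>

static_assert(Eigen::Vector3f::IsVectorAtCompileTime, "a 3x1 matrix is a compile-time vector");
static_assert(!Eigen::MatrixXf::IsVectorAtCompileTime, "a dynamic-size matrix is not");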
diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h
index afab2f1b6..542685c65 100644
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@@ -178,6 +178,30 @@ class DiagonalMatrix
EIGEN_DEVICE_FUNC
inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {}
+ #if EIGEN_HAS_CXX11
+ /** \brief Construct a diagonal matrix with fixed size from an arbitrary number of coefficients. \cpp11
+ *
+    * There exist C++98 analogous constructors for fixed-size diagonal matrices having 2 or 3 coefficients.
+ *
+ * \warning To construct a diagonal matrix of fixed size, the number of values passed to this
+ * constructor must match the fixed dimension of \c *this.
+ *
+ * \sa DiagonalMatrix(const Scalar&, const Scalar&)
+ * \sa DiagonalMatrix(const Scalar&, const Scalar&, const Scalar&)
+ */
+ template <typename... ArgTypes>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ DiagonalMatrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const ArgTypes&... args)
+ : m_diagonal(a0, a1, a2, args...) {}
+
+  /** \brief Constructs a DiagonalMatrix and initializes it from the elements given in an initializer list of
+    * initializer lists. \cpp11
+ */
+ EIGEN_DEVICE_FUNC
+ explicit EIGEN_STRONG_INLINE DiagonalMatrix(const std::initializer_list<std::initializer_list<Scalar>>& list)
+ : m_diagonal(list) {}
+ #endif // EIGEN_HAS_CXX11
+
/** Copy constructor. */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
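
A sketch of the two new C++11 DiagonalMatrix constructors documented above:

#include <Eigen/Core>
using namespace Eigen;

// Variadic constructor: the number of coefficients must match the fixed dimension.
DiagonalMatrix<double, 4> d1(1.0, 2.0, 3.0, 4.0);

// Explicit initializer-list constructor; the nested lists initialize the
// stored diagonal vector.
DiagonalMatrix<double, 3> d2{{1.0}, {2.0}, {3.0}};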
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index 43f3b84c8..bf7ef54b5 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -239,7 +239,7 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
// On the other hand, it is good for the cache to pack the vector anyway...
EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1),
ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
- MightCannotUseDest = (!EvalToDestAtCompileTime) || ComplexByReal
+ MightCannotUseDest = ((!EvalToDestAtCompileTime) || ComplexByReal) && (ActualDest::MaxSizeAtCompileTime!=0)
};
typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
@@ -326,7 +326,7 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,true>
enum {
// FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
// On the other hand, it is good for the cache to pack the vector anyway...
- DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
+ DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 || ActualRhsTypeCleaned::MaxSizeAtCompileTime==0
};
gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
@@ -396,8 +396,8 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
*/
template<typename Derived>
template<typename OtherDerived>
-EIGEN_DEVICE_FUNC
-inline const Product<Derived, OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const Product<Derived, OtherDerived>
MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
{
// A note regarding the function declaration: In MSVC, this function will sometimes
@@ -439,8 +439,9 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
*/
template<typename Derived>
template<typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Product<Derived,OtherDerived,LazyProduct>
-EIGEN_DEVICE_FUNC MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
+MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
{
enum {
ProductIsValid = Derived::ColsAtCompileTime==Dynamic
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index da1350f1b..04a321b9f 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -56,6 +56,7 @@ struct default_packet_traits
HasConj = 1,
HasSetLinear = 1,
HasBlend = 0,
+ HasReduxp = 1,
HasDiv = 0,
HasSqrt = 0,
@@ -151,15 +152,18 @@ pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const
return static_cast<TgtPacket>(a);
}
+/** \internal \returns reinterpret_cast<Target>(a) */
+template <typename Target, typename Packet>
+EIGEN_DEVICE_FUNC inline Target
+preinterpret(const Packet& a); /* { return reinterpret_cast<const Target&>(a); } */
+
/** \internal \returns a + b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-padd(const Packet& a,
- const Packet& b) { return a+b; }
+padd(const Packet& a, const Packet& b) { return a+b; }
/** \internal \returns a - b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-psub(const Packet& a,
- const Packet& b) { return a-b; }
+psub(const Packet& a, const Packet& b) { return a-b; }
/** \internal \returns -a (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -172,23 +176,19 @@ pconj(const Packet& a) { return numext::conj(a); }
/** \internal \returns a * b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pmul(const Packet& a,
- const Packet& b) { return a*b; }
+pmul(const Packet& a, const Packet& b) { return a*b; }
/** \internal \returns a / b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pdiv(const Packet& a,
- const Packet& b) { return a/b; }
+pdiv(const Packet& a, const Packet& b) { return a/b; }
/** \internal \returns the min of \a a and \a b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pmin(const Packet& a,
- const Packet& b) { return numext::mini(a, b); }
+pmin(const Packet& a, const Packet& b) { return numext::mini(a, b); }
/** \internal \returns the max of \a a and \a b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pmax(const Packet& a,
- const Packet& b) { return numext::maxi(a, b); }
+pmax(const Packet& a, const Packet& b) { return numext::maxi(a, b); }
/** \internal \returns the absolute value of \a a */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -212,7 +212,72 @@ pxor(const Packet& a, const Packet& b) { return a ^ b; }
/** \internal \returns the bitwise andnot of \a a and \a b */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pandnot(const Packet& a, const Packet& b) { return a & (!b); }
+pandnot(const Packet& a, const Packet& b) { return a & (~b); }
+
+/** \internal \returns ones */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;}
+
+template <typename RealScalar>
+EIGEN_DEVICE_FUNC inline std::complex<RealScalar> ptrue(const std::complex<RealScalar>& /*a*/) {
+ RealScalar b;
+ b = ptrue(b);
+ return std::complex<RealScalar>(b, b);
+}
+
+/** \internal \returns the bitwise not of \a a */
+template <typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pnot(const Packet& a) { return pxor(ptrue(a), a);}
+
+/** \internal \returns \a a shifted by N bits to the right */
+template<int N> EIGEN_DEVICE_FUNC inline int
+pshiftright(const int& a) { return a >> N; }
+template<int N> EIGEN_DEVICE_FUNC inline long int
+pshiftright(const long int& a) { return a >> N; }
+
+/** \internal \returns \a a shifted by N bits to the left */
+template<int N> EIGEN_DEVICE_FUNC inline int
+pshiftleft(const int& a) { return a << N; }
+template<int N> EIGEN_DEVICE_FUNC inline long int
+pshiftleft(const long int& a) { return a << N; }
+
+/** \internal \returns the significand and exponent of the underlying floating point numbers
+ * See https://en.cppreference.com/w/cpp/numeric/math/frexp
+ */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pfrexp(const Packet &a, Packet &exponent) { return std::frexp(a,&exponent); }
+
+/** \internal \returns a * 2^exponent
+ * See https://en.cppreference.com/w/cpp/numeric/math/ldexp
+ */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pldexp(const Packet &a, const Packet &exponent) { return std::ldexp(a,exponent); }
+
+/** \internal \returns zeros */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pzero(const Packet& a) { return pxor(a,a); }
+
+/** \internal \returns bits of \a a or \a b according to the input bit mask \a mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pselect(const Packet& mask, const Packet& a, const Packet& b) {
+ return por(pand(a,mask),pandnot(b,mask));
+}
+
+/** \internal \returns a <= b as a bit mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pcmp_le(const Packet& a, const Packet& b) { return a<=b ? ptrue(a) : pzero(a); }
+
+/** \internal \returns a < b as a bit mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pcmp_lt(const Packet& a, const Packet& b) { return a<b ? ptrue(a) : pzero(a); }
+
+/** \internal \returns a == b as a bit mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); }
+
+/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); }
/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -226,6 +291,10 @@ ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
+/** \internal \returns a packet with constant coefficients set from bits */
+template<typename Packet,typename BitsType> EIGEN_DEVICE_FUNC inline Packet
+pset1frombits(BitsType a);
+
/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(*a); }
@@ -339,18 +408,39 @@ typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_trai
predux_half_dowto4(const Packet& a)
{ return a; }
-/** \internal \returns the product of the elements of \a a*/
+/** \internal \returns the product of the elements of \a a */
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
{ return a; }
-/** \internal \returns the min of the elements of \a a*/
+/** \internal \returns the min of the elements of \a a */
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
{ return a; }
-/** \internal \returns the max of the elements of \a a*/
+/** \internal \returns the max of the elements of \a a */
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
{ return a; }
+/** \internal \returns true if all coeffs of \a a mean "true"
+ * It is supposed to be called on values returned by pcmp_*.
+ */
+// not needed yet
+// template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a)
+// { return bool(a); }
+
+/** \internal \returns true if any coeff of \a a means "true"
+ * It is supposed to be called on values returned by pcmp_*.
+ */
+template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a)
+{
+  // Dirty but generic implementation where "true" is assumed to be non-zero and all the same.
+  // It is expected that "true" is either:
+  //  - Scalar(1)
+  //  - bits full of ones (NaN for floats),
+  //  - or the first bit equal to 1 (1 for ints, smallest denormal for floats).
+ // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars.
+ return bool(predux(a));
+}
+
/** \internal \returns the reversed elements of \a a*/
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
{ return a; }
@@ -597,6 +687,22 @@ pinsertlast(const Packet& a, typename unpacket_traits<Packet>::type b)
return pblend(mask, pset1<Packet>(b), a);
}
+/***************************************************************************
+ * Some generic implementations to be used by implementors
+***************************************************************************/
+
+/** Default implementation of pfrexp for float.
+ * It is expected to be called by implementers of template<> pfrexp.
+ */
+template<typename Packet> EIGEN_STRONG_INLINE Packet
+pfrexp_float(const Packet& a, Packet& exponent);
+
+/** Default implementation of pldexp for float.
+ * It is expected to be called by implementers of template<> pldexp.
+ */
+template<typename Packet> EIGEN_STRONG_INLINE Packet
+pldexp_float(Packet a, Packet exponent);
+
} // end namespace internal
} // end namespace Eigen
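
The generic pselect added above composes por/pand/pandnot into a bitwise blend; the same idea written out for a plain 32-bit integer, assuming the mask is all-ones or all-zeros as produced by the pcmp_* functions:

#include <cstdint>
#include <cassert>

std::uint32_t select_bits(std::uint32_t mask, std::uint32_t a, std::uint32_t b) {
  // por(pand(a,mask), pandnot(b,mask)): bits of a where the mask is set,
  // bits of b where it is clear.
  return (a & mask) | (b & ~mask);
}

int main() {
  assert(select_bits(0xFFFFFFFFu, 1u, 2u) == 1u);  // "true" mask picks a
  assert(select_bits(0x00000000u, 1u, 2u) == 2u);  // "false" mask picks b
}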
diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h
index 563df6e84..71377cee5 100644
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -66,6 +66,11 @@ namespace Eigen
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh)
+#if EIGEN_HAS_CXX11_MATH
+ EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asinh,scalar_asinh_op,inverse hyperbolic sine,\sa ArrayBase::asinh)
+ EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acosh,scalar_acosh_op,inverse hyperbolic cosine,\sa ArrayBase::acosh)
+ EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atanh,scalar_atanh_op,inverse hyperbolic tangent,\sa ArrayBase::atanh)
+#endif
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
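
Usage of the three inverse hyperbolic array functions added above (available only with C++11 math support, per the surrounding #if):

#include <Eigen/Core>

void inv_hyperbolic_example(const Eigen::ArrayXd& x) {
  Eigen::ArrayXd a = Eigen::asinh(x);  // or x.asinh()
  Eigen::ArrayXd b = Eigen::acosh(x);  // defined for x >= 1
  Eigen::ArrayXd c = Eigen::atanh(x);  // defined for |x| < 1
  (void)a; (void)b; (void)c;
}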
diff --git a/Eigen/src/Core/IO.h b/Eigen/src/Core/IO.h
index da7fd6cce..063511f24 100644
--- a/Eigen/src/Core/IO.h
+++ b/Eigen/src/Core/IO.h
@@ -41,6 +41,7 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
* - \b rowSuffix string printed at the end of each row
* - \b matPrefix string printed at the beginning of the matrix
* - \b matSuffix string printed at the end of the matrix
+ * - \b fill character printed to fill the empty space in aligned columns
*
* Example: \include IOFormat.cpp
* Output: \verbinclude IOFormat.out
@@ -53,9 +54,9 @@ struct IOFormat
IOFormat(int _precision = StreamPrecision, int _flags = 0,
const std::string& _coeffSeparator = " ",
const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="",
- const std::string& _matPrefix="", const std::string& _matSuffix="")
+ const std::string& _matPrefix="", const std::string& _matSuffix="", const char _fill=' ')
: matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator),
- rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags)
+ rowSpacer(""), coeffSeparator(_coeffSeparator), fill(_fill), precision(_precision), flags(_flags)
{
// TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline
// don't add rowSpacer if columns are not to be aligned
@@ -71,6 +72,7 @@ struct IOFormat
std::string matPrefix, matSuffix;
std::string rowPrefix, rowSuffix, rowSeparator, rowSpacer;
std::string coeffSeparator;
+ char fill;
int precision;
int flags;
};
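A usage sketch of the new trailing _fill argument (the other formatting values are arbitrary, chosen only to make the padding visible):

    #include <Eigen/Dense>
    #include <iostream>
    int main() {
      // precision, flags, coeffSeparator, rowSeparator, rowPrefix, rowSuffix, matPrefix, matSuffix, fill
      Eigen::IOFormat padded(4, 0, ", ", "\n", "[", "]", "", "", '*');
      Eigen::MatrixXd m(2, 2);
      m << 1, 22.5, 333, 4;
      std::cout << m.format(padded) << "\n";  // aligned columns are padded with '*' instead of spaces
    }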
@@ -176,18 +178,26 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
width = std::max<Index>(width, Index(sstr.str().length()));
}
}
+ std::streamsize old_width = s.width();
+ char old_fill_character = s.fill();
s << fmt.matPrefix;
for(Index i = 0; i < m.rows(); ++i)
{
if (i)
s << fmt.rowSpacer;
s << fmt.rowPrefix;
- if(width) s.width(width);
+ if(width) {
+ s.fill(fmt.fill);
+ s.width(width);
+ }
s << m.coeff(i, 0);
for(Index j = 1; j < m.cols(); ++j)
{
s << fmt.coeffSeparator;
- if (width) s.width(width);
+ if(width) {
+ s.fill(fmt.fill);
+ s.width(width);
+ }
s << m.coeff(i, j);
}
s << fmt.rowSuffix;
@@ -196,6 +206,10 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
}
s << fmt.matSuffix;
if(explicit_precision) s.precision(old_precision);
+ if(width) {
+ s.fill(old_fill_character);
+ s.width(old_width);
+ }
return s;
}
diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h
index 3485d8f46..377f8a5cc 100644
--- a/Eigen/src/Core/IndexedView.h
+++ b/Eigen/src/Core/IndexedView.h
@@ -132,7 +132,7 @@ public:
/** \returns the nested expression */
typename internal::remove_reference<XprType>::type&
- nestedExpression() { return m_xpr.const_cast_derived(); }
+ nestedExpression() { return m_xpr; }
/** \returns a const reference to the object storing/generating the row indices */
const RowIndices& rowIndices() const { return m_rowIndices; }
diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index 90c336d8c..4b714328c 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -255,29 +255,27 @@ class Matrix
*
* \sa resize(Index,Index)
*/
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Matrix() : Base()
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Matrix() : Base()
{
Base::_check_template_params();
EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
}
// FIXME is it still needed
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
explicit Matrix(internal::constructor_without_unaligned_array_assert)
: Base(internal::constructor_without_unaligned_array_assert())
{ Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
#if EIGEN_HAS_RVALUE_REFERENCES
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Matrix(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
: Base(std::move(other))
{
Base::_check_template_params();
- if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
- Base::_set_noalias(other);
}
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
{
other.swap(*this);
@@ -289,20 +287,59 @@ class Matrix
// This constructor is for both 1x1 matrices and dynamic vectors
template<typename T>
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE explicit Matrix(const T& x)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit Matrix(const T& x)
{
Base::_check_template_params();
Base::template _init1<T>(x);
}
template<typename T0, typename T1>
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Matrix(const T0& x, const T1& y)
{
Base::_check_template_params();
Base::template _init2<T0,T1>(x, y);
}
+
+ #if EIGEN_HAS_CXX11
+ /** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...)
+ *
+ * Example: \include Matrix_variadic_ctor_cxx11.cpp
+ * Output: \verbinclude Matrix_variadic_ctor_cxx11.out
+ *
+ * \sa Matrix(const std::initializer_list<std::initializer_list<Scalar>>&)
+ */
+ template <typename... ArgTypes>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+ : Base(a0, a1, a2, a3, args...) {}
+
+ /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
+ *
+ * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
+ *
+ * Example: \include Matrix_initializer_list_23_cxx11.cpp
+ * Output: \verbinclude Matrix_initializer_list_23_cxx11.out
+ *
+ * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
+ *
+ * In the case of a compile-time column vector, implicit transposition from a single row is allowed.
+   * Therefore <code>VectorXd{{1,2,3,4,5}}</code> is legal and the more verbose syntax
+   * <code>VectorXd{{1},{2},{3},{4},{5}}</code> can be avoided:
+ *
+ * Example: \include Matrix_initializer_list_vector_cxx11.cpp
+ * Output: \verbinclude Matrix_initializer_list_vector_cxx11.out
+ *
+ * In the case of fixed-sized matrices, the initializer list sizes must exactly match the matrix sizes,
+ * and implicit transposition is allowed for compile-time vectors only.
+ *
+ * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...)
+ */
+ EIGEN_DEVICE_FUNC
+ explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list<std::initializer_list<Scalar>>& list) : Base(list) {}
+ #endif // end EIGEN_HAS_CXX11
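A short usage sketch of the two constructors added above:

    Eigen::Matrix<double, 2, 3> m {
      {1.0, 2.0, 3.0},   // first row
      {4.0, 5.0, 6.0}    // second row
    };
    Eigen::VectorXd v {{1.0, 2.0, 3.0, 4.0, 5.0}};          // one inner list: implicitly transposed to a 5x1 column vector
    Eigen::Matrix<float, 5, 1> w(1.f, 2.f, 3.f, 4.f, 5.f);  // variadic constructor for fixed-size vectors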
+
#else
/** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */
EIGEN_DEVICE_FUNC
@@ -321,7 +358,8 @@ class Matrix
* \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
*/
EIGEN_STRONG_INLINE explicit Matrix(Index dim);
- /** \brief Constructs an initialized 1x1 matrix with the given coefficient */
+ /** \brief Constructs an initialized 1x1 matrix with the given coefficient
+ * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */
Matrix(const Scalar& x);
/** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns.
*
@@ -338,11 +376,14 @@ class Matrix
EIGEN_DEVICE_FUNC
Matrix(Index rows, Index cols);
- /** \brief Constructs an initialized 2D vector with given coefficients */
+ /** \brief Constructs an initialized 2D vector with given coefficients
+ * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */
Matrix(const Scalar& x, const Scalar& y);
- #endif
+ #endif // end EIGEN_PARSED_BY_DOXYGEN
- /** \brief Constructs an initialized 3D vector with given coefficients */
+ /** \brief Constructs an initialized 3D vector with given coefficients
+ * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...)
+ */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z)
{
@@ -352,7 +393,9 @@ class Matrix
m_storage.data()[1] = y;
m_storage.data()[2] = z;
}
- /** \brief Constructs an initialized 4D vector with given coefficients */
+ /** \brief Constructs an initialized 4D vector with given coefficients
+ * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...)
+ */
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w)
{
@@ -407,7 +450,7 @@ class Matrix
*
* \ingroup Core_Module
*
- * Eigen defines several typedef shortcuts for most common matrix and vector types.
+ * %Eigen defines several typedef shortcuts for the most common matrix and vector types.
*
* The general patterns are the following:
*
@@ -419,6 +462,15 @@ class Matrix
*
* There are also \c VectorSizeType and \c RowVectorSizeType which are self-explanatory. For example, \c Vector4cf is
* a fixed-size vector of 4 complex floats.
+ *
+ * With \cpp11, template aliases are also defined for common sizes.
+ * They follow the same pattern as above except that the scalar type suffix is replaced by a
+ * template parameter, i.e.:
+ * - `MatrixSize<Type>` where `Size` can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size.
+ * - `MatrixXSize<Type>` and `MatrixSizeX<Type>` where `Size` can be \c 2,\c 3,\c 4 for hybrid dynamic/fixed matrices.
+ * - `VectorSize<Type>` and `RowVectorSize<Type>` for column and row vectors.
+ *
+ * With \cpp11, you can also use fully generic column and row vector types: `Vector<Type,Size>` and `RowVector<Type,Size>`.
*
* \sa class Matrix
*/
@@ -456,6 +508,55 @@ EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex<double>, cd)
#undef EIGEN_MAKE_TYPEDEFS
#undef EIGEN_MAKE_FIXED_TYPEDEFS
+#if EIGEN_HAS_CXX11
+
+#define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix) \
+/** \ingroup matrixtypedefs */ \
+/** \brief \cpp11 */ \
+template <typename Type> \
+using Matrix##SizeSuffix = Matrix<Type, Size, Size>; \
+/** \ingroup matrixtypedefs */ \
+/** \brief \cpp11 */ \
+template <typename Type> \
+using Vector##SizeSuffix = Matrix<Type, Size, 1>; \
+/** \ingroup matrixtypedefs */ \
+/** \brief \cpp11 */ \
+template <typename Type> \
+using RowVector##SizeSuffix = Matrix<Type, 1, Size>;
+
+#define EIGEN_MAKE_FIXED_TYPEDEFS(Size) \
+/** \ingroup matrixtypedefs */ \
+/** \brief \cpp11 */ \
+template <typename Type> \
+using Matrix##Size##X = Matrix<Type, Size, Dynamic>; \
+/** \ingroup matrixtypedefs */ \
+/** \brief \cpp11 */ \
+template <typename Type> \
+using Matrix##X##Size = Matrix<Type, Dynamic, Size>;
+
+EIGEN_MAKE_TYPEDEFS(2, 2)
+EIGEN_MAKE_TYPEDEFS(3, 3)
+EIGEN_MAKE_TYPEDEFS(4, 4)
+EIGEN_MAKE_TYPEDEFS(Dynamic, X)
+EIGEN_MAKE_FIXED_TYPEDEFS(2)
+EIGEN_MAKE_FIXED_TYPEDEFS(3)
+EIGEN_MAKE_FIXED_TYPEDEFS(4)
+
+/** \ingroup matrixtypedefs
+ * \brief \cpp11 */
+template <typename Type, int Size>
+using Vector = Matrix<Type, Size, 1>;
+
+/** \ingroup matrixtypedefs
+ * \brief \cpp11 */
+template <typename Type, int Size>
+using RowVector = Matrix<Type, 1, Size>;
+
+#undef EIGEN_MAKE_TYPEDEFS
+#undef EIGEN_MAKE_FIXED_TYPEDEFS
+
+#endif // EIGEN_HAS_CXX11
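Under C++11 the aliases defined above can be used as follows (sketch):

    Eigen::Matrix4<float>  a;        // same as Eigen::Matrix4f
    Eigen::MatrixX<double> b(3, 3);  // same as Eigen::MatrixXd
    Eigen::Matrix3X<int>   c(3, 7);  // 3 rows, dynamic number of columns
    Eigen::Vector<int, 5>  d;        // fixed-size 5x1 column vector
    Eigen::RowVector<float, Eigen::Dynamic> e(10);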
+
} // end namespace Eigen
#endif // EIGEN_MATRIX_H
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 596cdd133..4744e5cc4 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -468,6 +468,11 @@ template<typename Derived> class MatrixBase
const MatrixFunctionReturnValue<Derived> matrixFunction(StemFunction f) const;
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine)
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine)
+#if EIGEN_HAS_CXX11_MATH
+ EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, atanh, inverse hyperbolic tangent)
+ EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, acosh, inverse hyperbolic cosine)
+ EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, asinh, inverse hyperbolic sine)
+#endif
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine)
EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine)
EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root)
diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h
index 01cf192e9..239bbba63 100644
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -16,7 +16,11 @@ namespace Eigen {
namespace internal {
template<typename ExpressionType>
struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType>
-{};
+{
+ enum {
+ Flags = traits<ExpressionType>::Flags & ~NestByRefBit
+ };
+};
}
/** \class NestByValue
@@ -43,55 +47,11 @@ template<typename ExpressionType> class NestByValue
EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
- EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
- EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
-
- EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
- {
- return m_expression.coeff(row, col);
- }
-
- EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
- {
- return m_expression.const_cast_derived().coeffRef(row, col);
- }
-
- EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const
- {
- return m_expression.coeff(index);
- }
-
- EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
- {
- return m_expression.const_cast_derived().coeffRef(index);
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index row, Index col) const
- {
- return m_expression.template packet<LoadMode>(row, col);
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC inline void writePacket(Index row, Index col, const PacketScalar& x)
- {
- m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index index) const
- {
- return m_expression.template packet<LoadMode>(index);
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& x)
- {
- m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
- }
EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
+ EIGEN_DEVICE_FUNC const ExpressionType& nestedExpression() const { return m_expression; }
+
protected:
const ExpressionType m_expression;
};
@@ -105,6 +65,21 @@ DenseBase<Derived>::nestByValue() const
return NestByValue<Derived>(derived());
}
+namespace internal {
+
+// Evaluator of NestByValue -> evaluate the nested expression directly, no temporary
+template<typename ArgType>
+struct evaluator<NestByValue<ArgType> >
+ : public evaluator<ArgType>
+{
+ typedef evaluator<ArgType> Base;
+
+ EIGEN_DEVICE_FUNC explicit evaluator(const NestByValue<ArgType>& xpr)
+ : Base(xpr.nestedExpression())
+ {}
+};
+}
+
} // end namespace Eigen
#endif // EIGEN_NESTBYVALUE_H
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index f551dabb0..6de78fd2f 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -526,6 +526,71 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
// EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
}
+ #if EIGEN_HAS_CXX11
+ /** \brief Construct a row or column vector of fixed size from an arbitrary number of coefficients. \cpp11
+ *
+ * \only_for_vectors
+ *
+ * This constructor is for 1D arrays or vectors with more than 4 coefficients.
+ * There exist analogous C++98 constructors for fixed-size arrays/vectors having 1, 2, 3, or 4 coefficients.
+ *
+ * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this
+ * constructor must match the fixed number of rows (resp. columns) of \c *this.
+ */
+ template <typename... ArgTypes>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+ : m_storage()
+ {
+ _check_template_params();
+ EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, sizeof...(args) + 4);
+ m_storage.data()[0] = a0;
+ m_storage.data()[1] = a1;
+ m_storage.data()[2] = a2;
+ m_storage.data()[3] = a3;
+ int i = 4;
+ auto x = {(m_storage.data()[i++] = args, 0)...};
+ static_cast<void>(x);
+ }
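The braced init-list above is the usual C++11 idiom for expanding a parameter pack with side effects. A standalone sketch of the same trick (hypothetical helper, not part of Eigen):

    #include <cstddef>
    template <typename... Args>
    void fill_from_pack(double* dst, Args... args) {
      std::size_t i = 0;
      // the leading 0 keeps the array non-empty even for an empty pack;
      // each subsequent element evaluates one assignment, left to right
      int expander[] = {0, (dst[i++] = static_cast<double>(args), 0)...};
      static_cast<void>(expander);  // silence unused-variable warnings
    }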
+
+ /** \brief Constructs a Matrix or Array and initializes it with the elements given in an initializer list of
+ * initializer lists. \cpp11
+ */
+ EIGEN_DEVICE_FUNC
+ explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list<std::initializer_list<Scalar>>& list)
+ : m_storage()
+ {
+ _check_template_params();
+
+ size_t list_size = 0;
+ if (list.begin() != list.end()) {
+ list_size = list.begin()->size();
+ }
+
+ // This is to allow syntax like VectorXi {{1, 2, 3, 4}}
+ if (ColsAtCompileTime == 1 && list.size() == 1) {
+ eigen_assert(list_size == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
+ resize(list_size, ColsAtCompileTime);
+ std::copy(list.begin()->begin(), list.begin()->end(), m_storage.data());
+ } else {
+ eigen_assert(list.size() == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
+ eigen_assert(list_size == static_cast<size_t>(ColsAtCompileTime) || ColsAtCompileTime == Dynamic);
+ resize(list.size(), list_size);
+
+ Index row_index = 0;
+ for (const std::initializer_list<Scalar>& row : list) {
+ eigen_assert(list_size == row.size());
+ Index col_index = 0;
+ for (const Scalar& e : row) {
+ coeffRef(row_index, col_index) = e;
+ ++col_index;
+ }
+ ++row_index;
+ }
+ }
+ }
+ #endif // end EIGEN_HAS_CXX11
+
/** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
@@ -737,8 +802,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
{
- EIGEN_STATIC_ASSERT(bool(NumTraits<T0>::IsInteger) &&
- bool(NumTraits<T1>::IsInteger),
+ const bool t0_is_integer_alike = internal::is_valid_index_type<T0>::value;
+ const bool t1_is_integer_alike = internal::is_valid_index_type<T1>::value;
+ EIGEN_STATIC_ASSERT(t0_is_integer_alike &&
+ t1_is_integer_alike,
FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
resize(rows,cols);
}
@@ -773,9 +840,9 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
&& ((!internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0)
{
// NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument.
- const bool is_integer = NumTraits<T>::IsInteger;
- EIGEN_UNUSED_VARIABLE(is_integer);
- EIGEN_STATIC_ASSERT(is_integer,
+ const bool is_integer_alike = internal::is_valid_index_type<T>::value;
+ EIGEN_UNUSED_VARIABLE(is_integer_alike);
+ EIGEN_STATIC_ASSERT(is_integer_alike,
FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
resize(size);
}
@@ -882,7 +949,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* of same type it is enough to swap the data pointers.
*/
template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void swap(DenseBase<OtherDerived> & other)
{
enum { SwapPointers = internal::is_same<Derived, OtherDerived>::value && Base::SizeAtCompileTime==Dynamic };
@@ -893,7 +960,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* \brief const version forwarded to DenseBase::swap
*/
template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void swap(DenseBase<OtherDerived> const & other)
{ Base::swap(other.derived()); }
@@ -1027,7 +1094,7 @@ template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
struct matrix_swap_impl
{
EIGEN_DEVICE_FUNC
- static inline void run(MatrixTypeA& a, MatrixTypeB& b)
+ static EIGEN_STRONG_INLINE void run(MatrixTypeA& a, MatrixTypeB& b)
{
a.base().swap(b);
}
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 70790dbd4..13d5662df 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -90,18 +90,23 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
- EIGEN_DEVICE_FUNC Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
{
eigen_assert(lhs.cols() == rhs.rows()
&& "invalid matrix product"
&& "if you wanted a coeff-wise or a dot product use the respective explicit functions");
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index rows() const { return m_lhs.rows(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index cols() const { return m_rhs.cols(); }
- EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
- EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const LhsNestedCleaned& lhs() const { return m_lhs; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const RhsNestedCleaned& rhs() const { return m_rhs; }
protected:
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 246bca3e5..d53dc30a3 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -411,35 +411,58 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
}
- // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
- // dst {,+,-}= s * (A.lazyProduct(B))
- // This is a huge benefit for heap-allocated matrix types as it save one costly allocation.
- // For them, this strategy is also faster than simply by-passing the heap allocation through
- // stack allocation.
- // For fixed sizes matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower,
- // and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only,
- // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
- template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
+ // This is a special evaluation path called from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
+ // This variant tries to extract scalar multiples from both the LHS and RHS and factor them out. For instance:
+ // dst {,+,-}= (s1*A)*(B*s2)
+ // will be rewritten as:
+ // dst {,+,-}= (s1*s2) * (A.lazyProduct(B))
+ // There are at least four benefits of doing so:
+ // 1 - huge performance gain for heap-allocated matrix types as it saves costly allocations.
+ // 2 - it is faster than simply bypassing the heap allocation through stack allocation.
+ // 3 - it makes this fallback consistent with the heavy GEMM routine.
+ // 4 - it fully bypasses huge stack-allocation attempts when multiplying large fixed-size matrices.
+ //     (see https://stackoverflow.com/questions/54738495)
+ // For small fixed-size matrices, however, the gains are less obvious: it is sometimes x2 faster, but sometimes x3 slower,
+ // and the behavior also depends a lot on the compiler... This is why this rewriting strategy is currently
+ // enabled only when falling back from the main GEMM.
+ template<typename Dst, typename Func>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
- const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
+ void eval_dynamic(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func &func)
{
- call_restricted_packet_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
+ enum {
+ HasScalarFactor = blas_traits<Lhs>::HasScalarFactor || blas_traits<Rhs>::HasScalarFactor,
+ ConjLhs = blas_traits<Lhs>::NeedToConjugate,
+ ConjRhs = blas_traits<Rhs>::NeedToConjugate
+ };
+ // FIXME: in c++11 this should be auto, and extractScalarFactor should also return auto
+ // this is important for real*complex_mat
+ Scalar actualAlpha = blas_traits<Lhs>::extractScalarFactor(lhs)
+ * blas_traits<Rhs>::extractScalarFactor(rhs);
+ eval_dynamic_impl(dst,
+ blas_traits<Lhs>::extract(lhs).template conjugateIf<ConjLhs>(),
+ blas_traits<Rhs>::extract(rhs).template conjugateIf<ConjRhs>(),
+ func,
+ actualAlpha,
+ typename conditional<HasScalarFactor,true_type,false_type>::type());
}
- // Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above
- // overload more specialized.
- template<typename Dst, typename LhsT, typename Func>
+protected:
+
+ template<typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
+ void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s /* == 1 */, false_type)
{
+ EIGEN_UNUSED_VARIABLE(s);
+ eigen_internal_assert(s==Scalar(1));
call_restricted_packet_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
}
-
-
-// template<typename Dst>
-// static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
-// { dst.noalias() += alpha * lhs.lazyProduct(rhs); }
+
+ template<typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s, true_type)
+ {
+ call_restricted_packet_assignment_no_alias(dst, s * lhs.lazyProduct(rhs), func);
+ }
};
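In practice, when this fallback is taken, an expression such as the one below no longer materializes s1*A or B*s2 into temporaries before the lazy product runs (illustrative sketch):

    Eigen::MatrixXd A(8, 8), B(8, 8), C(8, 8);
    A.setRandom(); B.setRandom(); C.setZero();
    double s1 = 2.0, s2 = 0.5;
    // the scalar factors are extracted and applied once, as (s1*s2) * A.lazyProduct(B)
    C.noalias() += (s1 * A) * (B * s2);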
// This specialization enforces the use of a coefficient-based evaluation strategy
@@ -582,7 +605,8 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
* which is why we don't set the LinearAccessBit.
* TODO: this seems possible when the result is a vector
*/
- EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const CoeffReturnType coeff(Index index) const
{
const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0;
@@ -590,6 +614,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
}
template<int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const PacketType packet(Index row, Index col) const
{
PacketType res;
@@ -601,6 +626,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
}
template<int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const PacketType packet(Index index) const
{
const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
@@ -629,7 +655,8 @@ struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, LazyCoeffBasedProduc
enum {
Flags = Base::Flags | EvalBeforeNestingBit
};
- EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit product_evaluator(const XprType& xpr)
: Base(BaseProduct(xpr.lhs(),xpr.rhs()))
{}
};
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 720b6030c..2eef5abc5 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -359,7 +359,8 @@ class redux_evaluator : public internal::evaluator<_XprType>
typedef internal::evaluator<_XprType> Base;
public:
typedef _XprType XprType;
- EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -375,11 +376,12 @@ public:
InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
};
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
{ return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
template<int LoadMode, typename PacketType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
PacketType packetByOuterInner(Index outer, Index inner) const
{ return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
@@ -397,6 +399,8 @@ public:
* The template parameter \a BinaryOp is the type of the functor \a func which must be
* an associative operator. Both current C++98 and C++11 functor styles are handled.
*
+ * \warning the matrix must not be empty, otherwise an assertion is triggered.
+ *
* \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise()
*/
template<typename Derived>
@@ -415,6 +419,7 @@ DenseBase<Derived>::redux(const Func& func) const
}
/** \returns the minimum of all coefficients of \c *this.
+ * \warning the matrix must not be empty, otherwise an assertion is triggered.
* \warning the result is undefined if \c *this contains NaN.
*/
template<typename Derived>
@@ -425,6 +430,7 @@ DenseBase<Derived>::minCoeff() const
}
/** \returns the maximum of all coefficients of \c *this.
+ * \warning the matrix must not be empty, otherwise an assertion is triggered.
* \warning the result is undefined if \c *this contains NaN.
*/
template<typename Derived>
diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h
index ac9502bc4..172c8ffb6 100644
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -28,12 +28,13 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
template<typename Derived> struct match {
enum {
+ IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime,
HasDirectAccess = internal::has_direct_access<Derived>::ret,
- StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
+ StorageOrderMatch = IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
|| int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
|| (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
- OuterStrideMatch = Derived::IsVectorAtCompileTime
+ OuterStrideMatch = IsVectorAtCompileTime
|| int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
// NOTE, this indirection of evaluator<Derived>::Alignment is needed
// to workaround a very strange bug in MSVC related to the instantiation
diff --git a/Eigen/src/Core/Reshaped.h b/Eigen/src/Core/Reshaped.h
index b7bd1b292..c955815e6 100644
--- a/Eigen/src/Core/Reshaped.h
+++ b/Eigen/src/Core/Reshaped.h
@@ -191,7 +191,7 @@ class ReshapedImpl_dense<XprType,Rows,Cols,Order,false>
/** \returns the nested expression */
EIGEN_DEVICE_FUNC
typename internal::remove_reference<XprType>::type&
- nestedExpression() { return m_xpr.const_cast_derived(); }
+ nestedExpression() { return m_xpr; }
protected:
diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h
index 8b6b3ab03..853093923 100644
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@@ -171,8 +171,10 @@ struct vectorwise_reverse_inplace_impl<Vertical>
template<typename ExpressionType>
static void run(ExpressionType &xpr)
{
+ const int HalfAtCompileTime = ExpressionType::RowsAtCompileTime==Dynamic?Dynamic:ExpressionType::RowsAtCompileTime/2;
Index half = xpr.rows()/2;
- xpr.topRows(half).swap(xpr.bottomRows(half).colwise().reverse());
+ xpr.topRows(fix<HalfAtCompileTime>(half))
+ .swap(xpr.bottomRows(fix<HalfAtCompileTime>(half)).colwise().reverse());
}
};
@@ -182,8 +184,10 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
template<typename ExpressionType>
static void run(ExpressionType &xpr)
{
+ const int HalfAtCompileTime = ExpressionType::ColsAtCompileTime==Dynamic?Dynamic:ExpressionType::ColsAtCompileTime/2;
Index half = xpr.cols()/2;
- xpr.leftCols(half).swap(xpr.rightCols(half).rowwise().reverse());
+ xpr.leftCols(fix<HalfAtCompileTime>(half))
+ .swap(xpr.rightCols(fix<HalfAtCompileTime>(half)).rowwise().reverse());
}
};
@@ -203,7 +207,7 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
template<typename ExpressionType, int Direction>
EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
{
- internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
+ internal::vectorwise_reverse_inplace_impl<Direction>::run(m_matrix);
}
} // end namespace Eigen
diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h
index 2cf3fa1ef..2173799d9 100644
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -61,6 +61,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
typedef typename internal::traits<SelfAdjointView>::Scalar Scalar;
typedef typename MatrixType::StorageIndex StorageIndex;
typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
+ typedef SelfAdjointView<typename internal::add_const<MatrixType>::type, UpLo> ConstSelfAdjointView;
enum {
Mode = internal::traits<SelfAdjointView>::Mode,
@@ -197,6 +198,18 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
inline const ConjugateReturnType conjugate() const
{ return ConjugateReturnType(m_matrix.conjugate()); }
+ /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+ * returns \c *this otherwise.
+ */
+ template<bool Cond>
+ EIGEN_DEVICE_FUNC
+ inline typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type
+ conjugateIf() const
+ {
+ typedef typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type ReturnType;
+ return ReturnType(m_matrix.template conjugateIf<Cond>());
+ }
+
typedef SelfAdjointView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
/** \sa MatrixBase::adjoint() const */
EIGEN_DEVICE_FUNC
diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h
index 2bf940a26..ec4b4a987 100644
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@@ -19,7 +19,7 @@ template<typename Decomposition, typename RhsType, typename StorageKind> class S
*
* \brief Pseudo expression representing a solving operation
*
- * \tparam Decomposition the type of the matrix or decomposion object
+ * \tparam Decomposition the type of the matrix or decomposition object
* \tparam RhsType the type of the right-hand side
*
* This class represents an expression of A.solve(B)
diff --git a/Eigen/src/Core/SolverBase.h b/Eigen/src/Core/SolverBase.h
index 702a5485c..501461042 100644
--- a/Eigen/src/Core/SolverBase.h
+++ b/Eigen/src/Core/SolverBase.h
@@ -14,8 +14,35 @@ namespace Eigen {
namespace internal {
+template<typename Derived>
+struct solve_assertion {
+ template<bool Transpose_, typename Rhs>
+ static void run(const Derived& solver, const Rhs& b) { solver.template _check_solve_assertion<Transpose_>(b); }
+};
+
+template<typename Derived>
+struct solve_assertion<Transpose<Derived> >
+{
+ typedef Transpose<Derived> type;
+
+ template<bool Transpose_, typename Rhs>
+ static void run(const type& transpose, const Rhs& b)
+ {
+ internal::solve_assertion<typename internal::remove_all<Derived>::type>::template run<true>(transpose.nestedExpression(), b);
+ }
+};
+template<typename Scalar, typename Derived>
+struct solve_assertion<CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived> > >
+{
+ typedef CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived> > type;
+ template<bool Transpose_, typename Rhs>
+ static void run(const type& adjoint, const Rhs& b)
+ {
+ internal::solve_assertion<typename internal::remove_all<Transpose<Derived> >::type>::template run<true>(adjoint.nestedExpression(), b);
+ }
+};
} // end namespace internal
/** \class SolverBase
@@ -35,7 +62,7 @@ namespace internal {
*
* \warning Currently, any other usage of transpose() and adjoint() are not supported and will produce compilation errors.
*
- * \sa class PartialPivLU, class FullPivLU
+ * \sa class PartialPivLU, class FullPivLU, class HouseholderQR, class ColPivHouseholderQR, class FullPivHouseholderQR, class CompleteOrthogonalDecomposition, class LLT, class LDLT, class SVDBase
*/
template<typename Derived>
class SolverBase : public EigenBase<Derived>
@@ -46,6 +73,9 @@ class SolverBase : public EigenBase<Derived>
typedef typename internal::traits<Derived>::Scalar Scalar;
typedef Scalar CoeffReturnType;
+ template<typename Derived_>
+ friend struct internal::solve_assertion;
+
enum {
RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
@@ -75,7 +105,7 @@ class SolverBase : public EigenBase<Derived>
inline const Solve<Derived, Rhs>
solve(const MatrixBase<Rhs>& b) const
{
- eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
+ internal::solve_assertion<typename internal::remove_all<Derived>::type>::template run<false>(derived(), b);
return Solve<Derived, Rhs>(derived(), b.derived());
}
@@ -113,6 +143,13 @@ class SolverBase : public EigenBase<Derived>
}
protected:
+
+ template<bool Transpose_, typename Rhs>
+ void _check_solve_assertion(const Rhs& b) const {
+ EIGEN_ONLY_USED_FOR_DEBUG(b);
+ eigen_assert(derived().m_isInitialized && "Solver is not initialized.");
+ eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "SolverBase::solve(): invalid number of rows of the right hand side matrix b");
+ }
};
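With this machinery, solving against a transposed or adjoint solver checks the dimensions of the correct side. A usage sketch with a decomposition deriving from SolverBase (see the \sa list above):

    Eigen::MatrixXd A = Eigen::MatrixXd::Random(3, 3);
    Eigen::VectorXd b = Eigen::VectorXd::Random(3);
    Eigen::PartialPivLU<Eigen::MatrixXd> lu(A);
    Eigen::VectorXd x = lu.solve(b);              // solves A x = b;   asserts b.rows() == A.rows()
    Eigen::VectorXd y = lu.transpose().solve(b);  // solves A^T y = b; asserts b.rows() == A.cols()
    Eigen::VectorXd z = lu.adjoint().solve(b);    // solves A^* z = b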
namespace internal {
diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h
index d70200918..180a4e5ad 100644
--- a/Eigen/src/Core/Swap.h
+++ b/Eigen/src/Core/Swap.h
@@ -30,12 +30,13 @@ public:
typedef typename Base::DstXprType DstXprType;
typedef swap_assign_op<Scalar> Functor;
- EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
: Base(dst, src, func, dstExpr)
{}
template<int StoreMode, int LoadMode, typename PacketType>
- void assignPacket(Index row, Index col)
+ EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
{
PacketType tmp = m_src.template packet<LoadMode,PacketType>(row,col);
const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(row,col, m_dst.template packet<StoreMode,PacketType>(row,col));
@@ -43,7 +44,7 @@ public:
}
template<int StoreMode, int LoadMode, typename PacketType>
- void assignPacket(Index index)
+ EIGEN_STRONG_INLINE void assignPacket(Index index)
{
PacketType tmp = m_src.template packet<LoadMode,PacketType>(index);
const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(index, m_dst.template packet<StoreMode,PacketType>(index));
@@ -52,7 +53,7 @@ public:
// TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael)
template<int StoreMode, int LoadMode, typename PacketType>
- void assignPacketByOuterInner(Index outer, Index inner)
+ EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
{
Index row = Base::rowIndexByOuterInner(outer, inner);
Index col = Base::colIndexByOuterInner(outer, inner);
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index d7c204579..c513f7f7c 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -61,25 +61,27 @@ template<typename MatrixType> class Transpose
typedef typename internal::remove_all<MatrixType>::type NestedExpression;
EIGEN_DEVICE_FUNC
- explicit inline Transpose(MatrixType& matrix) : m_matrix(matrix) {}
+ explicit EIGEN_STRONG_INLINE Transpose(MatrixType& matrix) : m_matrix(matrix) {}
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
- EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.cols(); }
- EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.rows(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index rows() const { return m_matrix.cols(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index cols() const { return m_matrix.rows(); }
/** \returns the nested expression */
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const typename internal::remove_all<MatrixTypeNested>::type&
nestedExpression() const { return m_matrix; }
/** \returns the nested expression */
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
typename internal::remove_reference<MatrixTypeNested>::type&
nestedExpression() { return m_matrix; }
/** \internal */
- EIGEN_DEVICE_FUNC
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void resize(Index nrows, Index ncols) {
m_matrix.resize(ncols,nrows);
}
@@ -123,8 +125,10 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
EIGEN_DENSE_PUBLIC_INTERFACE(Transpose<MatrixType>)
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl)
- EIGEN_DEVICE_FUNC inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
- EIGEN_DEVICE_FUNC inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index innerStride() const { return derived().nestedExpression().innerStride(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index outerStride() const { return derived().nestedExpression().outerStride(); }
typedef typename internal::conditional<
internal::is_lvalue<MatrixType>::value,
@@ -132,18 +136,20 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
const Scalar
>::type ScalarWithConstIfNotLvalue;
- EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
- EIGEN_DEVICE_FUNC inline const Scalar* data() const { return derived().nestedExpression().data(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Scalar* data() const { return derived().nestedExpression().data(); }
// FIXME: shall we keep the const version of coeffRef?
- EIGEN_DEVICE_FUNC
- inline const Scalar& coeffRef(Index rowId, Index colId) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Scalar& coeffRef(Index rowId, Index colId) const
{
return derived().nestedExpression().coeffRef(colId, rowId);
}
- EIGEN_DEVICE_FUNC
- inline const Scalar& coeffRef(Index index) const
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Scalar& coeffRef(Index index) const
{
return derived().nestedExpression().coeffRef(index);
}
@@ -169,7 +175,8 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
*
* \sa transposeInPlace(), adjoint() */
template<typename Derived>
-EIGEN_DEVICE_FUNC inline Transpose<Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Transpose<Derived>
DenseBase<Derived>::transpose()
{
return TransposeReturnType(derived());
@@ -181,7 +188,8 @@ DenseBase<Derived>::transpose()
*
* \sa transposeInPlace(), adjoint() */
template<typename Derived>
-EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ConstTransposeReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+typename DenseBase<Derived>::ConstTransposeReturnType
DenseBase<Derived>::transpose() const
{
return ConstTransposeReturnType(derived());
@@ -392,7 +400,8 @@ struct checkTransposeAliasing_impl<Derived, OtherDerived, false>
template<typename Dst, typename Src>
void check_for_aliasing(const Dst &dst, const Src &src)
{
- internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
+ if((!Dst::IsVectorAtCompileTime) && dst.rows()>1 && dst.cols()>1)
+ internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
}
} // end namespace internal
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index 521de6160..cf3532f06 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -198,6 +198,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
typedef typename internal::traits<TriangularView>::MatrixTypeNestedNonRef MatrixTypeNestedNonRef;
typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
+ typedef TriangularView<typename internal::add_const<MatrixType>::type, _Mode> ConstTriangularView;
public:
@@ -243,6 +244,18 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
inline const ConjugateReturnType conjugate() const
{ return ConjugateReturnType(m_matrix.conjugate()); }
+ /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+ * returns \c *this otherwise.
+ */
+ template<bool Cond>
+ EIGEN_DEVICE_FUNC
+ inline typename internal::conditional<Cond,ConjugateReturnType,ConstTriangularView>::type
+ conjugateIf() const
+ {
+ typedef typename internal::conditional<Cond,ConjugateReturnType,ConstTriangularView>::type ReturnType;
+ return ReturnType(m_matrix.template conjugateIf<Cond>());
+ }
+
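conjugateIf is meant for generic code that must conjugate, or not, depending on a compile-time flag, without writing two code paths. A sketch, assuming the matching MatrixBase::conjugateIf introduced in the same series:

    template<bool Conj>
    Eigen::MatrixXcd mirror(const Eigen::MatrixXcd& m) {
      // returns a conjugate view when Conj==true, and m unchanged otherwise,
      // selected at compile time with no runtime branch
      return m.conjugateIf<Conj>();
    }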
typedef TriangularView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
/** \sa MatrixBase::adjoint() const */
EIGEN_DEVICE_FUNC
diff --git a/Eigen/src/Core/VectorBlock.h b/Eigen/src/Core/VectorBlock.h
index 0ede5d58e..71c5b95ee 100644
--- a/Eigen/src/Core/VectorBlock.h
+++ b/Eigen/src/Core/VectorBlock.h
@@ -71,8 +71,8 @@ template<typename VectorType, int Size> class VectorBlock
/** Dynamic-size constructor
*/
- EIGEN_DEVICE_FUNC
- inline VectorBlock(VectorType& vector, Index start, Index size)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ VectorBlock(VectorType& vector, Index start, Index size)
: Base(vector,
IsColVector ? start : 0, IsColVector ? 0 : start,
IsColVector ? size : 1, IsColVector ? 1 : size)
@@ -82,8 +82,8 @@ template<typename VectorType, int Size> class VectorBlock
/** Fixed-size constructor
*/
- EIGEN_DEVICE_FUNC
- inline VectorBlock(VectorType& vector, Index start)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ VectorBlock(VectorType& vector, Index start)
: Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock);
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
index a88b6e736..db0b9f8c4 100644
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -173,6 +173,14 @@ struct member_redux {
* Example: \include MatrixBase_colwise_iterator_cxx11.cpp
* Output: \verbinclude MatrixBase_colwise_iterator_cxx11.out
*
+ * For a partial reduction on an empty input, the following rules apply
+ * (a short sketch follows this documentation block). For the sake of clarity, let's consider a vertical reduction:
+ * - If the number of columns is zero, then a 1x0 row-major vector expression is returned.
+ * - Otherwise, if the number of rows is zero, then
+ *     - a row vector of zeros is returned for sum-like reductions (sum, squaredNorm, norm, etc.)
+ *     - a row vector of ones is returned for a product reduction (e.g., <code>MatrixXd(0,n).colwise().prod()</code>)
+ *
* \sa DenseBase::colwise(), DenseBase::rowwise(), class PartialReduxExpr
*/
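A short sketch of these rules:

    Eigen::MatrixXd a(0, 4);                     // zero rows, four columns
    Eigen::RowVectorXd s = a.colwise().sum();    // [0 0 0 0]
    Eigen::RowVectorXd p = a.colwise().prod();   // [1 1 1 1]
    // a.colwise().minCoeff() would trigger an assertion (empty reduction direction)
    Eigen::MatrixXd b(4, 0);                     // zero columns
    auto e = b.colwise().sum();                  // 1x0 expression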
template<typename ExpressionType, int Direction> class VectorwiseOp
@@ -294,13 +302,19 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* The template parameter \a BinaryOp is the type of the functor
* of the custom redux operator. Note that func must be an associative operator.
*
+ * \warning the size along the reduction direction must be strictly positive,
+ * otherwise an assertion is triggered.
+ *
* \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise()
*/
template<typename BinaryOp>
EIGEN_DEVICE_FUNC
const typename ReduxReturnType<BinaryOp>::Type
redux(const BinaryOp& func = BinaryOp()) const
- { return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func)); }
+ {
+ eigen_assert(redux_length()>0 && "you are using an empty matrix");
+ return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func));
+ }
typedef typename ReturnType<internal::member_minCoeff>::Type MinCoeffReturnType;
typedef typename ReturnType<internal::member_maxCoeff>::Type MaxCoeffReturnType;
@@ -325,6 +339,9 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
/** \returns a row (or column) vector expression of the smallest coefficient
* of each column (or row) of the referenced expression.
*
+ * \warning the size along the reduction direction must be strictly positive,
+ * otherwise an assertion is triggered.
+ *
* \warning the result is undefined if \c *this contains NaN.
*
* Example: \include PartialRedux_minCoeff.cpp
@@ -333,11 +350,17 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* \sa DenseBase::minCoeff() */
EIGEN_DEVICE_FUNC
const MinCoeffReturnType minCoeff() const
- { return MinCoeffReturnType(_expression()); }
+ {
+ eigen_assert(redux_length()>0 && "you are using an empty matrix");
+ return MinCoeffReturnType(_expression());
+ }
/** \returns a row (or column) vector expression of the largest coefficient
* of each column (or row) of the referenced expression.
*
+ * \warning the size along the reduction direction must be strictly positive,
+ * otherwise an assertion is triggered.
+ *
* \warning the result is undefined if \c *this contains NaN.
*
* Example: \include PartialRedux_maxCoeff.cpp
@@ -346,7 +369,10 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* \sa DenseBase::maxCoeff() */
EIGEN_DEVICE_FUNC
const MaxCoeffReturnType maxCoeff() const
- { return MaxCoeffReturnType(_expression()); }
+ {
+ eigen_assert(redux_length()>0 && "you are using an empty matrix");
+ return MaxCoeffReturnType(_expression());
+ }
/** \returns a row (or column) vector expression of the squared norm
* of each column (or row) of the referenced expression.
@@ -531,7 +557,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
//eigen_assert((m_matrix.isNull()) == (other.isNull())); FIXME
- return const_cast<ExpressionType&>(m_matrix = extendedTo(other.derived()));
+ return m_matrix = extendedTo(other.derived());
}
/** Adds the vector \a other to each subvector of \c *this */
@@ -541,7 +567,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
- return const_cast<ExpressionType&>(m_matrix += extendedTo(other.derived()));
+ return m_matrix += extendedTo(other.derived());
}
  /** Subtracts the vector \a other from each subvector of \c *this */
@@ -551,7 +577,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
- return const_cast<ExpressionType&>(m_matrix -= extendedTo(other.derived()));
+ return m_matrix -= extendedTo(other.derived());
}
  /** Multiplies each subvector of \c *this by the vector \a other */
@@ -563,7 +589,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
m_matrix *= extendedTo(other.derived());
- return const_cast<ExpressionType&>(m_matrix);
+ return m_matrix;
}
/** Divides each subvector of \c *this by the vector \a other */
@@ -575,7 +601,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
m_matrix /= extendedTo(other.derived());
- return const_cast<ExpressionType&>(m_matrix);
+ return m_matrix;
}
/** Returns the expression of the sum of the vector \a other to each subvector of \c *this */
@@ -690,6 +716,10 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
const HNormalizedReturnType hnormalized() const;
protected:
+ Index redux_length() const
+ {
+ return Direction==Vertical ? m_matrix.rows() : m_matrix.cols();
+ }
ExpressionTypeNested m_matrix;
};
diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h
index 54c1883d9..f67d83bd1 100644
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -40,6 +40,14 @@ struct visitor_impl<Visitor, Derived, 1>
}
};
+// This specialization enables visitors on empty matrices at compile-time
+template<typename Visitor, typename Derived>
+struct visitor_impl<Visitor, Derived, 0> {
+ EIGEN_DEVICE_FUNC
+ static inline void run(const Derived &/*mat*/, Visitor& /*visitor*/)
+ {}
+};
+
template<typename Visitor, typename Derived>
struct visitor_impl<Visitor, Derived, Dynamic>
{
@@ -98,6 +106,8 @@ protected:
*
* \note compared to one or two \em for \em loops, visitors offer automatic
* unrolling for small fixed size matrix.
+ *
+ * \note if the matrix is empty, then the visitor is left unchanged.
*
* \sa minCoeff(Index*,Index*), maxCoeff(Index*,Index*), DenseBase::redux()
*/
@@ -106,6 +116,9 @@ template<typename Visitor>
EIGEN_DEVICE_FUNC
void DenseBase<Derived>::visit(Visitor& visitor) const
{
+ if(size()==0)
+ return;
+
typedef typename internal::visitor_evaluator<Derived> ThisEvaluator;
ThisEvaluator thisEval(derived());
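For context, a minimal custom visitor compatible with DenseBase::visit (sketch); with the new guard it is simply left untouched when the matrix is empty:

    struct CountPositive {
      int count = 0;
      // called for the first coefficient
      void init(double value, Eigen::Index, Eigen::Index) { count = (value > 0) ? 1 : 0; }
      // called for every other coefficient
      void operator()(double value, Eigen::Index, Eigen::Index) { if (value > 0) ++count; }
    };

    Eigen::MatrixXd m(0, 0);
    CountPositive v;
    m.visit(v);  // empty matrix: visit() returns immediately, v.count stays 0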
@@ -124,6 +137,8 @@ namespace internal {
template <typename Derived>
struct coeff_visitor
{
+ // default initialization to avoid countless invalid maybe-uninitialized warnings by gcc
+ coeff_visitor() : row(-1), col(-1), res(0) {}
typedef typename Derived::Scalar Scalar;
Index row, col;
Scalar res;
@@ -196,6 +211,9 @@ struct functor_traits<max_coeff_visitor<Scalar> > {
/** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
* \returns the minimum of all coefficients of *this and puts in *row and *col its location.
+ *
+ * \warning the matrix must not be empty, otherwise an assertion is triggered.
+ *
* \warning the result is undefined if \c *this contains NaN.
*
* \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
@@ -206,6 +224,8 @@ EIGEN_DEVICE_FUNC
typename internal::traits<Derived>::Scalar
DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
{
+ eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
internal::min_coeff_visitor<Derived> minVisitor;
this->visit(minVisitor);
*rowId = minVisitor.row;
@@ -214,6 +234,9 @@ DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
}
/** \returns the minimum of all coefficients of *this and puts in *index its location.
+ *
+ * \warning the matrix must not be empty, otherwise an assertion is triggered.
+ *
* \warning the result is undefined if \c *this contains NaN.
*
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff()
@@ -224,6 +247,8 @@ EIGEN_DEVICE_FUNC
typename internal::traits<Derived>::Scalar
DenseBase<Derived>::minCoeff(IndexType* index) const
{
+ eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
internal::min_coeff_visitor<Derived> minVisitor;
this->visit(minVisitor);
@@ -233,6 +258,9 @@ DenseBase<Derived>::minCoeff(IndexType* index) const
/** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
* \returns the maximum of all coefficients of *this and puts in *row and *col its location.
+ *
+ * \warning the matrix must not be empty, otherwise an assertion is triggered.
+ *
* \warning the result is undefined if \c *this contains NaN.
*
* \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
@@ -243,6 +271,8 @@ EIGEN_DEVICE_FUNC
typename internal::traits<Derived>::Scalar
DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
{
+ eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
internal::max_coeff_visitor<Derived> maxVisitor;
this->visit(maxVisitor);
*rowPtr = maxVisitor.row;
@@ -251,6 +281,9 @@ DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
}
/** \returns the maximum of all coefficients of *this and puts in *index its location.
+ *
+ * \warning the matrix must not be empty, otherwise an assertion is triggered.
+ *
* \warning the result is undefined if \c *this contains NaN.
*
* \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
@@ -261,6 +294,8 @@ EIGEN_DEVICE_FUNC
typename internal::traits<Derived>::Scalar
DenseBase<Derived>::maxCoeff(IndexType* index) const
{
+ eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
internal::max_coeff_visitor<Derived> maxVisitor;
this->visit(maxVisitor);
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index 7fa61969d..5b8ff59bd 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -22,6 +22,7 @@ struct Packet4cf
__m256 v;
};
+#ifndef EIGEN_VECTORIZE_AVX512
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
typedef Packet4cf type;
@@ -44,8 +45,9 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
HasSetLinear = 0
};
};
+#endif
-template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32, vectorizable=true}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
@@ -67,10 +69,18 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, con
return Packet4cf(result);
}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) {
+ __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ);
+ return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf ptrue<Packet4cf>(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet4cf pnot<Packet4cf>(const Packet4cf& a) { return Packet4cf(pnot(Packet8f(a.v))); }
template<> EIGEN_STRONG_INLINE Packet4cf pand <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf por <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pxor <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(b.v,a.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
@@ -228,6 +238,7 @@ struct Packet2cd
__m256d v;
};
+#ifndef EIGEN_VECTORIZE_AVX512
template<> struct packet_traits<std::complex<double> > : default_packet_traits
{
typedef Packet2cd type;
@@ -250,8 +261,9 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
HasSetLinear = 0
};
};
+#endif
-template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32, vectorizable=true}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
@@ -272,10 +284,18 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, con
return Packet2cd(_mm256_addsub_pd(even, odd));
}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) {
+ __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ);
+ return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd ptrue<Packet2cd>(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cd pnot<Packet2cd>(const Packet2cd& a) { return Packet2cd(pnot(Packet4d(a.v))); }
template<> EIGEN_STRONG_INLINE Packet2cd pand <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd por <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pxor <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(b.v,a.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
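Two behavioral points in this file are easy to miss: pandnot's operands are flipped because the _mm256_andnot_* intrinsics compute ~first & second while Eigen's pandnot(a,b) is specified as a & ~b, and the new pcmp_eq ANDs the lane-wise mask with its pair-swapped permutation so a complex element compares equal only when both its real and imaginary parts do. A scalar model of both (hypothetical helper names, for illustration only):

#include <cassert>
#include <cstdint>

// Corrected convention: pandnot(a,b) == a & ~b, even though the underlying
// _mm256_andnot_* intrinsic computes ~first & second, hence the swap.
static uint32_t pandnot_model(uint32_t a, uint32_t b) { return a & ~b; }

// One complex element of pcmp_eq: the result is all-ones only if both the
// real and imaginary comparisons are true, which the packet code gets by
// AND-ing the mask with its pair-swapped copy (_mm256_permute_ps(eq, 0xb1)).
static uint32_t cplx_eq_model(bool re_eq, bool im_eq) {
  uint32_t re = re_eq ? 0xFFFFFFFFu : 0u;
  uint32_t im = im_eq ? 0xFFFFFFFFu : 0u;
  return re & im;  // both lanes of the pair must agree
}

int main() {
  assert(pandnot_model(0xF0F0F0F0u, 0xFF00FF00u) == 0x00F000F0u);
  assert(cplx_eq_model(true, false) == 0u);  // real matches, imaginary doesn't
  return 0;
}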
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index 6af67ce2d..9f375ed98 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -10,7 +10,7 @@
#ifndef EIGEN_MATH_FUNCTIONS_AVX_H
#define EIGEN_MATH_FUNCTIONS_AVX_H
-/* The sin, cos, exp, and log functions of this file are loosely derived from
+/* The sin and cos functions of this file are loosely derived from
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
*/
@@ -18,187 +18,22 @@ namespace Eigen {
namespace internal {
-inline Packet8i pshiftleft(Packet8i v, int n)
-{
-#ifdef EIGEN_VECTORIZE_AVX2
- return _mm256_slli_epi32(v, n);
-#else
- __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n);
- __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
-#endif
-}
-
-inline Packet8f pshiftright(Packet8f v, int n)
-{
-#ifdef EIGEN_VECTORIZE_AVX2
- return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n));
-#else
- __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n);
- __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n);
- return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1));
-#endif
-}
-
-// Sine function
-// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and
-// evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants
-// are (anti-)symmetric and thus have only odd/even coefficients
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
psin<Packet8f>(const Packet8f& _x) {
- Packet8f x = _x;
-
- // Some useful values.
- _EIGEN_DECLARE_CONST_Packet8i(one, 1);
- _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
- _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f);
- _EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f);
- _EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f);
- _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f);
- _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f);
- _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f);
- _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f);
-
- // Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period.
- Packet8f z = pmul(x, p8f_one_over_pi);
- Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four));
- x = pmadd(shift, p8f_neg_pi_first, x);
- x = pmadd(shift, p8f_neg_pi_second, x);
- x = pmadd(shift, p8f_neg_pi_third, x);
- z = pmul(x, p8f_four_over_pi);
-
- // Make a mask for the entries that need flipping, i.e. wherever the shift
- // is odd.
- Packet8i shift_ints = _mm256_cvtps_epi32(shift);
- Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one)));
- Packet8i sign_flip_mask = pshiftleft(shift_isodd, 31);
-
- // Create a mask for which interpolant to use, i.e. if z > 1, then the mask
- // is set to ones for that entry.
- Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ);
-
- // Evaluate the polynomial for the interval [1,3] in z.
- _EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f);
- _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f);
- _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f);
- _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f);
- Packet8f z_minus_two = psub(z, p8f_two);
- Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two);
- Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4);
- right = pmadd(right, z_minus_two2, p8f_coeff_right_2);
- right = pmadd(right, z_minus_two2, p8f_coeff_right_0);
-
- // Evaluate the polynomial for the interval [-1,1] in z.
- _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f);
- _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f);
- _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f);
- _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f);
- Packet8f z2 = pmul(z, z);
- Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5);
- left = pmadd(left, z2, p8f_coeff_left_3);
- left = pmadd(left, z2, p8f_coeff_left_1);
- left = pmul(left, z);
-
- // Assemble the results, i.e. select the left and right polynomials.
- left = _mm256_andnot_ps(ival_mask, left);
- right = _mm256_and_ps(ival_mask, right);
- Packet8f res = _mm256_or_ps(left, right);
+ return psin_float(_x);
+}
- // Flip the sign on the odd intervals and return the result.
- res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask));
- return res;
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+pcos<Packet8f>(const Packet8f& _x) {
+ return pcos_float(_x);
}
-// Natural logarithm
-// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
-// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
-// be easily approximated by a polynomial centered on m=1 for stability.
-// TODO(gonnet): Further reduce the interval allowing for lower-degree
-// polynomial interpolants -> ... -> profit!
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
plog<Packet8f>(const Packet8f& _x) {
- Packet8f x = _x;
- _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
- _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f);
-
- _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
- // The smallest non denormalized float number.
- _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000);
- _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000);
-
- // Polynomial coefficients.
- _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f);
-
- Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ); // not greater equal is true if x is NaN
- Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
-
- // Truncate input values to the minimum positive normal.
- x = pmax(x, p8f_min_norm_pos);
-
- Packet8f emm0 = pshiftright(x,23);
- Packet8f e = _mm256_sub_ps(emm0, p8f_126f);
-
- // Set the exponents to -1, i.e. x are in the range [0.5,1).
- x = _mm256_and_ps(x, p8f_inv_mant_mask);
- x = _mm256_or_ps(x, p8f_half);
-
- // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
- // and shift by -1. The values are then centered around 0, which improves
- // the stability of the polynomial evaluation.
- // if( x < SQRTHF ) {
- // e -= 1;
- // x = x + x - 1.0;
- // } else { x = x - 1.0; }
- Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ);
- Packet8f tmp = _mm256_and_ps(x, mask);
- x = psub(x, p8f_1);
- e = psub(e, _mm256_and_ps(p8f_1, mask));
- x = padd(x, tmp);
-
- Packet8f x2 = pmul(x, x);
- Packet8f x3 = pmul(x2, x);
-
- // Evaluate the polynomial approximant of degree 8 in three parts, probably
- // to improve instruction-level parallelism.
- Packet8f y, y1, y2;
- y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1);
- y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4);
- y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7);
- y = pmadd(y, x, p8f_cephes_log_p2);
- y1 = pmadd(y1, x, p8f_cephes_log_p5);
- y2 = pmadd(y2, x, p8f_cephes_log_p8);
- y = pmadd(y, x3, y1);
- y = pmadd(y, x3, y2);
- y = pmul(y, x3);
-
- // Add the logarithm of the exponent back to the result of the interpolation.
- y1 = pmul(e, p8f_cephes_log_q1);
- tmp = pmul(x2, p8f_half);
- y = padd(y, y1);
- x = psub(x, tmp);
- y2 = pmul(e, p8f_cephes_log_q2);
- x = padd(x, y);
- x = padd(x, y2);
-
- // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
- return _mm256_or_ps(
- _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)),
- _mm256_and_ps(iszero_mask, p8f_minus_inf));
+ return plog_float(_x);
}
// Exponential function. Works by writing "x = m*log(2) + r" where
@@ -207,62 +42,7 @@ plog<Packet8f>(const Packet8f& _x) {
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
pexp<Packet8f>(const Packet8f& _x) {
- _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
- _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f);
-
- _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f);
- _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f);
-
- _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f);
-
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f);
-
- // Clamp x.
- Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo);
-
- // Express exp(x) as exp(m*ln(2) + r), start by extracting
- // m = floor(x/ln(2) + 0.5).
- Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half));
-
-// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
-// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
-// truncation errors. Note that we don't use the "pmadd" function here to
-// ensure that a precision-preserving FMA instruction is used.
-#ifdef EIGEN_VECTORIZE_FMA
- _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f);
- Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x);
-#else
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f);
- _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f);
- Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1));
- r = psub(r, pmul(m, p8f_cephes_exp_C2));
-#endif
-
- Packet8f r2 = pmul(r, r);
-
- // TODO(gonnet): Split into odd/even polynomials and try to exploit
- // instruction-level parallelism.
- Packet8f y = p8f_cephes_exp_p0;
- y = pmadd(y, r, p8f_cephes_exp_p1);
- y = pmadd(y, r, p8f_cephes_exp_p2);
- y = pmadd(y, r, p8f_cephes_exp_p3);
- y = pmadd(y, r, p8f_cephes_exp_p4);
- y = pmadd(y, r, p8f_cephes_exp_p5);
- y = pmadd(y, r2, r);
- y = padd(y, p8f_1);
-
- // Build emm0 = 2^m.
- Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127));
- emm0 = pshiftleft(emm0, 23);
-
- // Return 2^m * exp(r).
- return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x);
+ return pexp_float(_x);
}
// Hyperbolic Tangent function.
@@ -274,82 +54,8 @@ ptanh<Packet8f>(const Packet8f& x) {
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
-pexp<Packet4d>(const Packet4d& _x) {
- Packet4d x = _x;
-
- _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
- _EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
- _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
-
- _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
- _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);
-
- _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);
-
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);
-
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);
-
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
- _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
- _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
-
- Packet4d tmp, fx;
-
- // clamp x
- x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
- // Express exp(x) as exp(g + n*log(2)).
- fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);
-
- // Get the integer modulus of log(2), i.e. the "n" described above.
- fx = _mm256_floor_pd(fx);
-
- // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
- // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
- // digits right.
- tmp = pmul(fx, p4d_cephes_exp_C1);
- Packet4d z = pmul(fx, p4d_cephes_exp_C2);
- x = psub(x, tmp);
- x = psub(x, z);
-
- Packet4d x2 = pmul(x, x);
-
- // Evaluate the numerator polynomial of the rational interpolant.
- Packet4d px = p4d_cephes_exp_p0;
- px = pmadd(px, x2, p4d_cephes_exp_p1);
- px = pmadd(px, x2, p4d_cephes_exp_p2);
- px = pmul(px, x);
-
- // Evaluate the denominator polynomial of the rational interpolant.
- Packet4d qx = p4d_cephes_exp_q0;
- qx = pmadd(qx, x2, p4d_cephes_exp_q1);
- qx = pmadd(qx, x2, p4d_cephes_exp_q2);
- qx = pmadd(qx, x2, p4d_cephes_exp_q3);
-
- // I don't really get this bit, copied from the SSE2 routines, so...
- // TODO(gonnet): Figure out what is going on here, perhaps find a better
- // rational interpolant?
- x = _mm256_div_pd(px, psub(qx, px));
- x = pmadd(p4d_2, x, p4d_1);
-
- // Build e=2^n by constructing the exponents in a 128-bit vector and
- // shifting them to where they belong in double-precision values.
- __m128i emm0 = _mm256_cvtpd_epi32(fx);
- emm0 = _mm_add_epi32(emm0, p4i_1023);
- emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
- __m128i lo = _mm_slli_epi64(emm0, 52);
- __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
- __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
- e = _mm256_insertf128_si256(e, hi, 1);
-
- // Construct the result 2^n * exp(g) = e * x. The max is used to catch
- // non-finite values in the input.
- return pmax(pmul(x, _mm256_castsi256_pd(e)), _x);
+pexp<Packet4d>(const Packet4d& x) {
+ return pexp_double(x);
}
// Functions for sqrt.
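The net effect of this file's changes: the hand-rolled AVX bodies for psin, plog and pexp (and the newly enabled pcos) collapse into one-line forwards to shared kernels, psin_float, plog_float, pexp_float and pexp_double, presumably the generic implementations added in Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h per the diffstat. A minimal sketch of the pattern with stand-in names (not the actual Eigen headers):

#include <iostream>

// Hypothetical stand-in for Eigen's packet primitives so the sketch is
// self-contained; in Eigen this is the pmadd overload set.
template <typename Packet>
Packet pmadd_(Packet a, Packet b, Packet c) { return a * b + c; }

// One shared kernel written only against those primitives; each ISA's
// pexp/plog/psin specialization can then be a one-line forward to it.
template <typename Packet>
Packet poly2_kernel(Packet x, Packet c0, Packet c1, Packet c2) {
  return pmadd_(pmadd_(c2, x, c1), x, c0);  // Horner form of c0 + c1*x + c2*x^2
}

int main() {
  // Packet = float here; with Packet8f the same source instantiates to AVX.
  std::cout << poly2_kernel(2.0f, 1.0f, 3.0f, 0.5f) << "\n";  // prints 9
  return 0;
}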
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 774e64981..f88e36024 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -18,11 +18,11 @@ namespace internal {
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
-#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#if !defined(EIGEN_VECTORIZE_AVX512) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
@@ -63,7 +63,7 @@ template<> struct packet_traits<float> : default_packet_traits
HasDiv = 1,
HasSin = EIGEN_FAST_MATH,
- HasCos = 0,
+ HasCos = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasSqrt = 1,
@@ -113,14 +113,29 @@ template<> struct packet_traits<int> : default_packet_traits
};
*/
-template<> struct unpacket_traits<Packet8f> { typedef float type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; };
-template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; };
-template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; };
+template<> struct unpacket_traits<Packet8f> {
+ typedef float type;
+ typedef Packet4f half;
+ typedef Packet8i integer_packet;
+ enum {size=8, alignment=Aligned32, vectorizable=true};
+};
+template<> struct unpacket_traits<Packet4d> {
+ typedef double type;
+ typedef Packet2d half;
+ enum {size=4, alignment=Aligned32, vectorizable=true};
+};
+template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false}; };
template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) { return _mm256_set1_epi32(from); }
+template<> EIGEN_STRONG_INLINE Packet8f pset1frombits<Packet8f>(unsigned int from) { return _mm256_castsi256_ps(pset1<Packet8i>(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) { return _mm256_setzero_ps(); }
+template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) { return _mm256_setzero_pd(); }
+template<> EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) { return _mm256_setzero_si256(); }
+
template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) { return _mm256_broadcast_ss(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
@@ -129,6 +144,15 @@ template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { retur
template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_add_epi32(a,b);
+#else
+ __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+ __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
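padd<Packet8i> shows the fallback pattern used throughout this file: plain AVX has no 256-bit integer ALU ops, so without EIGEN_VECTORIZE_AVX2 the packet is split into two 128-bit halves, each half is processed with the SSE intrinsic, and the result is reassembled. A portable model of that split/merge (illustration only):

#include <array>
#include <cassert>

// Models of the hardware types: Half plays __m128i (4 x int32) and Full
// plays __m256i (8 x int32).
using Half = std::array<int, 4>;
using Full = std::array<int, 8>;

static Half add4(Half a, Half b) {       // stands in for _mm_add_epi32
  for (int i = 0; i < 4; ++i) a[i] += b[i];
  return a;
}

static Full padd8_model(const Full& a, const Full& b) {
  // _mm256_extractf128_si256(x, 0/1): split into low and high halves.
  Half lo_a{a[0], a[1], a[2], a[3]}, lo_b{b[0], b[1], b[2], b[3]};
  Half hi_a{a[4], a[5], a[6], a[7]}, hi_b{b[4], b[5], b[6], b[7]};
  Half lo = add4(lo_a, lo_b), hi = add4(hi_a, hi_b);
  // _mm256_insertf128_si256: reassemble the 256-bit result.
  return {lo[0], lo[1], lo[2], lo[3], hi[0], hi[1], hi[2], hi[3]};
}

int main() {
  Full a{1, 2, 3, 4, 5, 6, 7, 8}, b{10, 10, 10, 10, 10, 10, 10, 10};
  assert(padd8_model(a, b)[7] == 18);
  return 0;
}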
@@ -157,13 +181,14 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co
return pset1<Packet8i>(0);
}
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
-#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
- // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
- // and gcc stupidly generates a vfmadd132ps instruction,
- // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate
- // the result of the product.
+#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
+ // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
+ // and even register spilling with clang>=6.0 (bug 1637).
+ // Gcc stupidly generates a vfmadd132ps instruction.
+ // So let's enforce it to generate a vfmadd231ps instruction since the most common use
+ // case is to accumulate the result of the product.
Packet8f res = c;
__asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
return res;
@@ -172,7 +197,7 @@ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f&
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
-#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
+#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
// see above
Packet4d res = c;
__asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
@@ -184,21 +209,69 @@ template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d&
#endif
template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+ // There appears to be a bug in GCC, by which the optimizer may flip
+ // the argument order in calls to _mm256_min_ps/_mm256_max_ps, so we have to
+ // resort to inline ASM here. This is supposed to be fixed in GCC 6.3,
+ // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+ Packet8f res;
+ asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ return res;
+#else
// Arguments are swapped to match NaN propagation behavior of std::min.
return _mm256_min_ps(b,a);
+#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+ // See pmin above
+ Packet4d res;
+ asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ return res;
+#else
// Arguments are swapped to match NaN propagation behavior of std::min.
return _mm256_min_pd(b,a);
+#endif
}
template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+ // See pmin above
+ Packet8f res;
+ asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ return res;
+#else
// Arguments are swapped to match NaN propagation behavior of std::max.
return _mm256_max_ps(b,a);
+#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+ // See pmin above
+ Packet4d res;
+ asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ return res;
+#else
// Arguments are swapped to match NaN propagation behavior of std::max.
return _mm256_max_pd(b,a);
+#endif
}
+
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); }
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); }
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); }
+template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); }
+template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); }
+
+template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_cmpeq_epi32(a,b);
+#else
+ __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+ __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
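The swapped operands in pmin/pmax deserve a closer look: x86 MINPS/MAXPS return their second source operand whenever either input is NaN, so lowering pmin(a,b) to _mm256_min_ps(b,a) reproduces std::min's behavior of returning a when the comparison fails. A scalar model (illustration only):

#include <cassert>
#include <cmath>

// MINPS semantics: (src1 < src2) ? src1 : src2; on NaN the comparison is
// false, so the *second* source operand falls through.
static float minps_model(float src1, float src2) {
  return (src1 < src2) ? src1 : src2;
}
// Eigen's pmin(a,b) lowers to _mm256_min_ps(b,a): note the swap.
static float pmin_model(float a, float b) { return minps_model(b, a); }

int main() {
  float nan = std::nanf("");
  assert(pmin_model(1.0f, 2.0f) == 1.0f);
  assert(pmin_model(1.0f, nan) == 1.0f);      // b NaN: a is returned, like std::min
  assert(std::isnan(pmin_model(nan, 2.0f)));  // a NaN: NaN propagates, like std::min
  return 0;
}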
@@ -208,17 +281,101 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { ret
template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
+
+template<> EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ // vpcmpeqd has lower latency than the more general vcmpps
+ return _mm256_cmpeq_epi32(a,a);
+#else
+ const __m256 b = _mm256_castsi256_ps(a);
+ return _mm256_castps_si256(_mm256_cmp_ps(b,b,_CMP_TRUE_UQ));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f ptrue<Packet8f>(const Packet8f& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ // vpcmpeqd has lower latency than the more general vcmpps
+ const __m256i b = _mm256_castps_si256(a);
+ return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b,b));
+#else
+ return _mm256_cmp_ps(a,a,_CMP_TRUE_UQ);
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4d ptrue<Packet4d>(const Packet4d& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ // vpcmpeqq has lower latency than the more general vcmppd
+ const __m256i b = _mm256_castpd_si256(a);
+ return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b,b));
+#else
+ return _mm256_cmp_pd(a,a,_CMP_TRUE_UQ);
+#endif
+}
+
template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_and_si256(a,b);
+#else
+ return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+#endif
+}
template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_or_si256(a,b);
+#else
+ return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+#endif
+}
template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_xor_si256(a,b);
+#else
+ return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); }
+template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); }
+template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_andnot_si256(b,a);
+#else
+ return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a)));
+#endif
+}
-template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& mask, const Packet8f& a, const Packet8f& b)
+{ return _mm256_blendv_ps(b,a,mask); }
+template<> EIGEN_STRONG_INLINE Packet4d pselect<Packet4d>(const Packet4d& mask, const Packet4d& a, const Packet4d& b)
+{ return _mm256_blendv_pd(b,a,mask); }
+
+template<int N> EIGEN_STRONG_INLINE Packet8i pshiftright(Packet8i a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_srli_epi32(a, N);
+#else
+ __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(a, 0), N);
+ __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(a, 1), N);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template<int N> EIGEN_STRONG_INLINE Packet8i pshiftleft(Packet8i a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+ return _mm256_slli_epi32(a, N);
+#else
+ __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), N);
+ __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), N);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
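Note that the shift helpers moved the shift count from a runtime argument into a template parameter, so the count is a compile-time immediate, matching the immediate-operand forms of _mm256_slli_epi32/_mm_slli_epi32 used in both branches. A scalar model of the new interface (illustration only):

#include <cassert>
#include <cstdint>

// The shift count N is a template parameter rather than a runtime value,
// so every instantiation compiles down to an immediate-count shift.
template <int N>
uint32_t pshiftleft_model(uint32_t a) {
  static_assert(N >= 0 && N < 32, "shift count must be an immediate in [0,32)");
  return a << N;
}

int main() {
  assert(pshiftleft_model<23>(1u) == 0x00800000u);  // e.g. building float exponents
  return 0;
}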
@@ -363,6 +520,28 @@ template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
return _mm256_and_pd(a,mask);
}
+template<> EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Packet8f& exponent) {
+ return pfrexp_float(a,exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, const Packet8f& exponent) {
+ return pldexp_float(a,exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
+ // Build e=2^n by constructing the exponents in a 128-bit vector and
+ // shifting them to where they belong in double-precision values.
+ Packet4i cst_1023 = pset1<Packet4i>(1023);
+ __m128i emm0 = _mm256_cvtpd_epi32(exponent);
+ emm0 = _mm_add_epi32(emm0, cst_1023);
+ emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
+ __m128i lo = _mm_slli_epi64(emm0, 52);
+ __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
+ __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
+ e = _mm256_insertf128_si256(e, hi, 1);
+ return pmul(a,_mm256_castsi256_pd(e));
+}
+
// preduxp should be ok
// FIXME: why is this ok? why isn't the simple implementation working as expected?
template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
@@ -459,6 +638,16 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
+// not needed yet
+// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x)
+// {
+// return _mm256_movemask_ps(x)==0xFF;
+// }
+
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x)
+{
+ return _mm256_movemask_ps(x)!=0;
+}
template<int Offset>
struct palign_impl<Offset,Packet8f>
diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h
index 83bfdc604..7d2e1e67f 100644
--- a/Eigen/src/Core/arch/AVX/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -37,13 +37,21 @@ struct type_casting_traits<int, float> {
template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
- return _mm256_cvtps_epi32(a);
+ return _mm256_cvttps_epi32(a);
}
template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
return _mm256_cvtepi32_ps(a);
}
+template<> EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i,Packet8f>(const Packet8f& a) {
+ return _mm256_castps_si256(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f,Packet8i>(const Packet8i& a) {
+ return _mm256_castsi256_ps(a);
+}
+
} // end namespace internal
} // end namespace Eigen
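The one-character change from _mm256_cvtps_epi32 to _mm256_cvttps_epi32 is semantic: the former rounds to nearest (under the default MXCSR mode) while the latter truncates toward zero, which is what a C++ float-to-int conversion does, so the vectorized cast now agrees with scalar code. A scalar illustration:

#include <cassert>

int main() {
  float x = 1.7f;
  assert(static_cast<int>(x) == 1);   // truncation, i.e. cvttps behavior
  // The previous cvtps path (round-to-nearest) would have produced 2 here.
  float y = -1.7f;
  assert(static_cast<int>(y) == -1);  // truncation toward zero
  return 0;
}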
diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h
new file mode 100644
index 000000000..9a89dd01f
--- /dev/null
+++ b/Eigen/src/Core/arch/AVX512/Complex.h
@@ -0,0 +1,488 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_AVX512_H
+#define EIGEN_COMPLEX_AVX512_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet8cf
+{
+ EIGEN_STRONG_INLINE Packet8cf() {}
+ EIGEN_STRONG_INLINE explicit Packet8cf(const __m512& a) : v(a) {}
+ __m512 v;
+};
+
+template<> struct packet_traits<std::complex<float> > : default_packet_traits
+{
+ typedef Packet8cf type;
+ typedef Packet4cf half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 8,
+ HasHalfPacket = 1,
+
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasNegate = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasSetLinear = 0,
+ HasReduxp = 0
+ };
+};
+
+template<> struct unpacket_traits<Packet8cf> {
+ typedef std::complex<float> type;
+ enum {
+ size = 8,
+ alignment=unpacket_traits<Packet16f>::alignment,
+ vectorizable=true
+ };
+ typedef Packet4cf half;
+};
+
+template<> EIGEN_STRONG_INLINE Packet8cf ptrue<Packet8cf>(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet8cf pnot<Packet8cf>(const Packet8cf& a) { return Packet8cf(pnot(Packet16f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet8cf padd<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf psub<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a)
+{
+ return Packet8cf(pnegate(a.v));
+}
+template<> EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a)
+{
+ const __m512 mask = _mm512_castsi512_ps(_mm512_setr_epi32(
+ 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,
+ 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
+ return Packet8cf(pxor(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf pmul<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
+{
+ __m512 tmp2 = _mm512_mul_ps(_mm512_movehdup_ps(a.v), _mm512_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
+ return Packet8cf(_mm512_fmaddsub_ps(_mm512_moveldup_ps(a.v), b.v, tmp2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf pand <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pand(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf por <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(por(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf pxor <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pxor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet8cf pandnot<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pandnot(a.v,b.v)); }
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) {
+ __m512 eq = pcmp_eq<Packet16f>(a.v, b.v);
+ return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf pload <Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload<Packet16f>(&numext::real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet8cf ploadu<Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu<Packet16f>(&numext::real_ref(*from))); }
+
+
+template<> EIGEN_STRONG_INLINE Packet8cf pset1<Packet8cf>(const std::complex<float>& from)
+{
+ return Packet8cf(_mm512_castpd_ps(pload1<Packet8d>((const double*)(const void*)&from)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf ploaddup<Packet8cf>(const std::complex<float>* from)
+{
+ return Packet8cf( _mm512_castpd_ps( ploaddup<Packet8d>((const double*)(const void*)from )) );
+}
+template<> EIGEN_STRONG_INLINE Packet8cf ploadquad<Packet8cf>(const std::complex<float>* from)
+{
+ return Packet8cf( _mm512_castpd_ps( ploadquad<Packet8d>((const double*)(const void*)from )) );
+}
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet8cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet8cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet8cf pgather<std::complex<float>, Packet8cf>(const std::complex<float>* from, Index stride)
+{
+ return Packet8cf(_mm512_castpd_ps(pgather<double,Packet8d>((const double*)(const void*)from, stride)));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet8cf>(std::complex<float>* to, const Packet8cf& from, Index stride)
+{
+ pscatter((double*)(void*)to, _mm512_castps_pd(from.v), stride);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet8cf>(const Packet8cf& a)
+{
+ return pfirst(Packet2cf(_mm512_castps512_ps128(a.v)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) {
+ return Packet8cf(_mm512_castsi512_ps(
+ _mm512_permutexvar_epi64( _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7),
+ _mm512_castps_si512(a.v))));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet8cf>(const Packet8cf& a)
+{
+ return predux(padd(Packet4cf(extract256<0>(a.v)),
+ Packet4cf(extract256<1>(a.v))));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet8cf>(const Packet8cf& a)
+{
+ return predux_mul(pmul(Packet4cf(extract256<0>(a.v)),
+ Packet4cf(extract256<1>(a.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4<Packet8cf>(const Packet8cf& a) {
+ __m256 lane0 = extract256<0>(a.v);
+ __m256 lane1 = extract256<1>(a.v);
+ __m256 res = _mm256_add_ps(lane0, lane1);
+ return Packet4cf(res);
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet8cf>
+{
+ static EIGEN_STRONG_INLINE void run(Packet8cf& first, const Packet8cf& second)
+ {
+ if (Offset==0) return;
+ palign_impl<Offset*2,Packet16f>::run(first.v, second.v);
+ }
+};
+
+template<> struct conj_helper<Packet8cf, Packet8cf, false,true>
+{
+ EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const
+ {
+ return internal::pmul(a, pconj(b));
+ }
+};
+
+template<> struct conj_helper<Packet8cf, Packet8cf, true,false>
+{
+ EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const
+ {
+ return internal::pmul(pconj(a), b);
+ }
+};
+
+template<> struct conj_helper<Packet8cf, Packet8cf, true,true>
+{
+ EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const
+ {
+ return pconj(internal::pmul(a, b));
+ }
+};
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f)
+
+template<> EIGEN_STRONG_INLINE Packet8cf pdiv<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
+{
+ Packet8cf num = pmul(a, pconj(b));
+ __m512 tmp = _mm512_mul_ps(b.v, b.v);
+ __m512 tmp2 = _mm512_shuffle_ps(tmp,tmp,0xB1);
+ __m512 denom = _mm512_add_ps(tmp, tmp2);
+ return Packet8cf(_mm512_div_ps(num.v, denom));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf pcplxflip<Packet8cf>(const Packet8cf& x)
+{
+ return Packet8cf(_mm512_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
+}
+
+//---------- double ----------
+struct Packet4cd
+{
+ EIGEN_STRONG_INLINE Packet4cd() {}
+ EIGEN_STRONG_INLINE explicit Packet4cd(const __m512d& a) : v(a) {}
+ __m512d v;
+};
+
+template<> struct packet_traits<std::complex<double> > : default_packet_traits
+{
+ typedef Packet4cd type;
+ typedef Packet2cd half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 0,
+ size = 4,
+ HasHalfPacket = 1,
+
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasNegate = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasSetLinear = 0,
+ HasReduxp = 0
+ };
+};
+
+template<> struct unpacket_traits<Packet4cd> {
+ typedef std::complex<double> type;
+ enum {
+ size = 4,
+ alignment = unpacket_traits<Packet8d>::alignment,
+ vectorizable=true
+ };
+ typedef Packet2cd half;
+};
+
+template<> EIGEN_STRONG_INLINE Packet4cd padd<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_add_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd psub<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_sub_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pnegate(const Packet4cd& a) { return Packet4cd(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a)
+{
+ const __m512d mask = _mm512_castsi512_pd(
+ _mm512_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0,
+ 0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
+ return Packet4cd(pxor(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd pmul<Packet4cd>(const Packet4cd& a, const Packet4cd& b)
+{
+ __m512d tmp1 = _mm512_shuffle_pd(a.v,a.v,0x0);
+ __m512d tmp2 = _mm512_shuffle_pd(a.v,a.v,0xFF);
+ __m512d tmp3 = _mm512_shuffle_pd(b.v,b.v,0x55);
+ __m512d odd = _mm512_mul_pd(tmp2, tmp3);
+ return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd ptrue<Packet4cd>(const Packet4cd& a) { return Packet4cd(ptrue(Packet8d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet4cd pnot<Packet4cd>(const Packet4cd& a) { return Packet4cd(pnot(Packet8d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet4cd pand <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd por <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pxor <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cd pandnot<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pandnot(a.v,b.v)); }
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) {
+ __m512d eq = pcmp_eq<Packet8d>(a.v, b.v);
+ return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd pload <Packet4cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload<Packet8d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet4cd ploadu<Packet4cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cd(ploadu<Packet8d>((const double*)from)); }
+
+template<> EIGEN_STRONG_INLINE Packet4cd pset1<Packet4cd>(const std::complex<double>& from)
+{
+ #ifdef EIGEN_VECTORIZE_AVX512DQ
+ return Packet4cd(_mm512_broadcast_f64x2(pset1<Packet1cd>(from).v));
+ #else
+ return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1<Packet1cd>(from).v))));
+ #endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd ploaddup<Packet4cd>(const std::complex<double>* from) {
+ return Packet4cd(_mm512_insertf64x4(
+ _mm512_castpd256_pd512(ploaddup<Packet2cd>(from).v), ploaddup<Packet2cd>(from+1).v, 1));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet4cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet4cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet4cd pgather<std::complex<double>, Packet4cd>(const std::complex<double>* from, Index stride)
+{
+ return Packet4cd(_mm512_insertf64x4(_mm512_castpd256_pd512(
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+0*stride).v), ploadu<Packet1cd>(from+1*stride).v,1)),
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+2*stride).v), ploadu<Packet1cd>(from+3*stride).v,1), 1));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet4cd>(std::complex<double>* to, const Packet4cd& from, Index stride)
+{
+ __m512i fromi = _mm512_castpd_si512(from.v);
+ double* tod = (double*)(void*)to;
+ _mm_storeu_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) );
+ _mm_storeu_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) );
+ _mm_storeu_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) );
+ _mm_storeu_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) );
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet4cd>(const Packet4cd& a)
+{
+ __m128d low = extract128<0>(a.v);
+ EIGEN_ALIGN16 double res[2];
+ _mm_store_pd(res, low);
+ return std::complex<double>(res[0],res[1]);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) {
+ return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, EIGEN_SSE_SHUFFLE_MASK(3,2,1,0)));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet4cd>(const Packet4cd& a)
+{
+ return predux(padd(Packet2cd(_mm512_extractf64x4_pd(a.v,0)),
+ Packet2cd(_mm512_extractf64x4_pd(a.v,1))));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet4cd>(const Packet4cd& a)
+{
+ return predux_mul(pmul(Packet2cd(_mm512_extractf64x4_pd(a.v,0)),
+ Packet2cd(_mm512_extractf64x4_pd(a.v,1))));
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet4cd>
+{
+ static EIGEN_STRONG_INLINE void run(Packet4cd& first, const Packet4cd& second)
+ {
+ if (Offset==0) return;
+ palign_impl<Offset*2,Packet8d>::run(first.v, second.v);
+ }
+};
+
+template<> struct conj_helper<Packet4cd, Packet4cd, false,true>
+{
+ EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
+ {
+ return internal::pmul(a, pconj(b));
+ }
+};
+
+template<> struct conj_helper<Packet4cd, Packet4cd, true,false>
+{
+ EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
+ {
+ return internal::pmul(pconj(a), b);
+ }
+};
+
+template<> struct conj_helper<Packet4cd, Packet4cd, true,true>
+{
+ EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
+ { return padd(pmul(x,y),c); }
+
+ EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
+ {
+ return pconj(internal::pmul(a, b));
+ }
+};
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd,Packet8d)
+
+template<> EIGEN_STRONG_INLINE Packet4cd pdiv<Packet4cd>(const Packet4cd& a, const Packet4cd& b)
+{
+ Packet4cd num = pmul(a, pconj(b));
+ __m512d tmp = _mm512_mul_pd(b.v, b.v);
+ __m512d denom = padd(_mm512_permute_pd(tmp,0x55), tmp);
+ return Packet4cd(_mm512_div_pd(num.v, denom));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& x)
+{
+ return Packet4cd(_mm512_permute_pd(x.v,0x55));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8cf,4>& kernel) {
+ PacketBlock<Packet8d,4> pb;
+
+ pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
+ pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
+ pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
+ pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
+ ptranspose(pb);
+ kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
+ kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
+ kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
+ kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8cf,8>& kernel) {
+ PacketBlock<Packet8d,8> pb;
+
+ pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
+ pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
+ pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
+ pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
+ pb.packet[4] = _mm512_castps_pd(kernel.packet[4].v);
+ pb.packet[5] = _mm512_castps_pd(kernel.packet[5].v);
+ pb.packet[6] = _mm512_castps_pd(kernel.packet[6].v);
+ pb.packet[7] = _mm512_castps_pd(kernel.packet[7].v);
+ ptranspose(pb);
+ kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
+ kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
+ kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
+ kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
+ kernel.packet[4].v = _mm512_castpd_ps(pb.packet[4]);
+ kernel.packet[5].v = _mm512_castpd_ps(pb.packet[5]);
+ kernel.packet[6].v = _mm512_castpd_ps(pb.packet[6]);
+ kernel.packet[7].v = _mm512_castpd_ps(pb.packet[7]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4cd,4>& kernel) {
+ __m512d T0 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, EIGEN_SSE_SHUFFLE_MASK(0,1,0,1)); // [a0 a1 b0 b1]
+ __m512d T1 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, EIGEN_SSE_SHUFFLE_MASK(2,3,2,3)); // [a2 a3 b2 b3]
+ __m512d T2 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, EIGEN_SSE_SHUFFLE_MASK(0,1,0,1)); // [c0 c1 d0 d1]
+ __m512d T3 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, EIGEN_SSE_SHUFFLE_MASK(2,3,2,3)); // [c2 c3 d2 d3]
+
+ kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, EIGEN_SSE_SHUFFLE_MASK(1,3,1,3))); // [a3 b3 c3 d3]
+ kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, EIGEN_SSE_SHUFFLE_MASK(0,2,0,2))); // [a2 b2 c2 d2]
+ kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, EIGEN_SSE_SHUFFLE_MASK(1,3,1,3))); // [a1 b1 c1 d1]
+ kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, EIGEN_SSE_SHUFFLE_MASK(0,2,0,2))); // [a0 b0 c0 d0]
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf pinsertfirst(const Packet8cf& a, std::complex<float> b)
+{
+ Packet2cf tmp = Packet2cf(_mm512_extractf32x4_ps(a.v,0));
+ tmp = pinsertfirst(tmp, b);
+ return Packet8cf( _mm512_insertf32x4(a.v, tmp.v, 0) );
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd pinsertfirst(const Packet4cd& a, std::complex<double> b)
+{
+ return Packet4cd(_mm512_castsi512_pd( _mm512_inserti32x4(_mm512_castpd_si512(a.v), _mm_castpd_si128(pset1<Packet1cd>(b).v), 0) ));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8cf pinsertlast(const Packet8cf& a, std::complex<float> b)
+{
+ Packet2cf tmp = Packet2cf(_mm512_extractf32x4_ps(a.v,3) );
+ tmp = pinsertlast(tmp, b);
+ return Packet8cf( _mm512_insertf32x4(a.v, tmp.v, 3) );
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cd pinsertlast(const Packet4cd& a, std::complex<double> b)
+{
+ return Packet4cd(_mm512_castsi512_pd( _mm512_inserti32x4(_mm512_castpd_si512(a.v), _mm_castpd_si128(pset1<Packet1cd>(b).v), 3) ));
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_AVX512_H
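Within the new file, the complex multiply is worth decoding: moveldup/movehdup duplicate the real and imaginary lanes of a, the permute swaps each (re,im) pair of b, and _mm512_fmaddsub_ps subtracts on even lanes and adds on odd ones, yielding (ar*br - ai*bi, ar*bi + ai*br) in two multiplies plus one fused op. A scalar model of one element (illustration only):

#include <cassert>
#include <complex>

// One complex element of the Packet8cf multiply: tmp2 holds
// (ai*bi, ai*br), i.e. movehdup(a) * permute(b), and the fmaddsub step
// computes ar*b -/+ tmp2 (minus on the even/real lane, plus on the odd one).
static std::complex<float> pmul_model(std::complex<float> a,
                                      std::complex<float> b) {
  float tmp2_re = a.imag() * b.imag();
  float tmp2_im = a.imag() * b.real();
  float re = a.real() * b.real() - tmp2_re;
  float im = a.real() * b.imag() + tmp2_im;
  return std::complex<float>(re, im);
}

int main() {
  std::complex<float> a(1.f, 2.f), b(3.f, 4.f);
  assert(pmul_model(a, b) == a * b);  // (1+2i)(3+4i) = -5+10i
  return 0;
}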
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index 93c5ec43f..c2158c538 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -47,6 +47,7 @@ plog<Packet16f>(const Packet16f& _x) {
// The smallest non denormalized float number.
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
+ _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
// Polynomial coefficients.
@@ -116,10 +117,16 @@ plog<Packet16f>(const Packet16f& _x) {
x = padd(x, y);
x = padd(x, y2);
- // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
+ __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ);
+ // Filter out invalid inputs, i.e.:
+ // - negative arg will be NAN,
+ // - 0 will be -INF,
+ // - +INF will be +INF.
return _mm512_mask_blend_ps(iszero_mask,
- _mm512_mask_blend_ps(invalid_mask, x, p16f_nan),
- p16f_minus_inf);
+ _mm512_mask_blend_ps(invalid_mask,
+ _mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf),
+ p16f_nan),
+ p16f_minus_inf);
}
#endif
@@ -373,6 +380,19 @@ EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
#endif
#endif
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+psin<Packet16f>(const Packet16f& _x) {
+ return psin_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
+pcos<Packet16f>(const Packet16f& _x) {
+ return pcos_float(_x);
+}
+
} // end namespace internal
} // end namespace Eigen
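The plog fix-up above is about one special value: the mantissa-extraction path would map +INF to a finite garbage value, so the extra pos_inf_mask blend restores log(+inf) = +inf alongside the existing log(0) = -inf and log(negative) = NaN cases. The intended scalar semantics, for reference:

#include <cassert>
#include <cmath>
#include <limits>

int main() {
  float inf = std::numeric_limits<float>::infinity();
  assert(std::isinf(std::log(inf)) && std::log(inf) > 0);  // +INF -> +INF
  assert(std::log(0.0f) == -inf);                          // 0 -> -INF
  assert(std::isnan(std::log(-1.0f)));                     // negative -> NaN
  return 0;
}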
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 86cefba92..60b723b08 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -19,10 +19,10 @@ namespace internal {
#endif
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
@@ -55,7 +55,9 @@ template<> struct packet_traits<float> : default_packet_traits
size = 16,
HasHalfPacket = 1,
HasBlend = 0,
-#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG
+ HasSin = EIGEN_FAST_MATH,
+ HasCos = EIGEN_FAST_MATH,
+#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
#ifdef EIGEN_VECTORIZE_AVX512DQ
HasLog = 1,
#endif
@@ -75,7 +77,7 @@ template<> struct packet_traits<double> : default_packet_traits
AlignedOnScalar = 1,
size = 8,
HasHalfPacket = 1,
-#if EIGEN_GNUC_AT_LEAST(5, 3)
+#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
HasSqrt = EIGEN_FAST_MATH,
HasRsqrt = EIGEN_FAST_MATH,
#endif
@@ -99,19 +101,20 @@ template <>
struct unpacket_traits<Packet16f> {
typedef float type;
typedef Packet8f half;
- enum { size = 16, alignment=Aligned64 };
+ typedef Packet16i integer_packet;
+ enum { size = 16, alignment=Aligned64, vectorizable=true };
};
template <>
struct unpacket_traits<Packet8d> {
typedef double type;
typedef Packet4d half;
- enum { size = 8, alignment=Aligned64 };
+ enum { size = 8, alignment=Aligned64, vectorizable=true };
};
template <>
struct unpacket_traits<Packet16i> {
typedef int type;
typedef Packet8i half;
- enum { size = 16, alignment=Aligned64 };
+ enum { size = 16, alignment=Aligned64, vectorizable=false };
};
template <>
@@ -128,12 +131,17 @@ EIGEN_STRONG_INLINE Packet16i pset1<Packet16i>(const int& from) {
}
template <>
+EIGEN_STRONG_INLINE Packet16f pset1frombits<Packet16f>(unsigned int from) {
+ return _mm512_castsi512_ps(_mm512_set1_epi32(from));
+}
+
+template <>
EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
return _mm512_broadcastss_ps(_mm_load_ps1(from));
}
template <>
EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
- return _mm512_broadcastsd_pd(_mm_load_pd1(from));
+ return _mm512_set1_pd(*from);
}
template <>
@@ -159,6 +167,11 @@ EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
const Packet8d& b) {
return _mm512_add_pd(a, b);
}
+template <>
+EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a,
+ const Packet16i& b) {
+ return _mm512_add_epi32(a, b);
+}
template <>
EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a,
@@ -170,6 +183,11 @@ EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a,
const Packet8d& b) {
return _mm512_sub_pd(a, b);
}
+template <>
+EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a,
+ const Packet16i& b) {
+ return _mm512_sub_epi32(a, b);
+}
template <>
EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
@@ -203,6 +221,11 @@ EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a,
const Packet8d& b) {
return _mm512_mul_pd(a, b);
}
+template <>
+EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a,
+ const Packet16i& b) {
+ return _mm512_mullo_epi32(a, b);
+}
template <>
EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
@@ -215,7 +238,7 @@ EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
return _mm512_div_pd(a, b);
}
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
template <>
EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
const Packet16f& c) {
@@ -254,30 +277,92 @@ EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
return _mm512_max_pd(b, a);
}
-template <>
-EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
- const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
- return _mm512_and_ps(a, b);
+template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); }
+template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); }
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); }
#else
- Packet16f res = _mm512_undefined_ps();
- Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
- Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
- res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0);
+// AVX512F does not define _mm512_extractf32x8_ps to extract __m256 from __m512
+template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
+ return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_));
+}
- Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
- Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
- res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1);
+// AVX512F does not define _mm512_extractf64x2_pd to extract __m128 from __m512
+template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
+ return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_));
+}
- Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
- Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
- res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2);
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
+ return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)),
+ _mm256_castps_si256(b),1));
+}
+#endif
- Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
- Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
- res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3);
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
+ __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
+ return _mm512_castsi512_ps(
+ _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
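+// Note: unlike SSE/AVX, the AVX512 compare intrinsics return an __mmask16
+// rather than a full vector; _mm512_mask_set1_epi32 expands the mask back
+// into a vector with all bits set in the selected lanes, which is the
+// boolean convention the rest of the packet API expects.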
- return res;
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
+ __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
+ return _mm512_castsi512_ps(
+ _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
+ __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ);
+ return _mm512_castsi512_ps(
+ _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) {
+ __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ);
+ return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
+ __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
+ return _mm512_castsi512_ps(
+ _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) {
+ __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
+ return _mm512_castsi512_pd(
+ _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i ptrue<Packet16i>(const Packet16i& /*a*/) {
+ return _mm512_set1_epi32(0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f ptrue<Packet16f>(const Packet16f& a) {
+ return _mm512_castsi512_ps(ptrue<Packet16i>(_mm512_castps_si512(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d ptrue<Packet8d>(const Packet8d& a) {
+ return _mm512_castsi512_pd(ptrue<Packet16i>(_mm512_castpd_si512(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a,
+ const Packet16i& b) {
+ return _mm512_and_si512(a,b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
+ const Packet16f& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+ return _mm512_and_ps(a, b);
+#else
+ return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b)));
#endif
}
template <>
@@ -298,30 +383,18 @@ EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a,
return res;
#endif
}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i por<Packet16i>(const Packet16i& a, const Packet16i& b) {
+ return _mm512_or_si512(a, b);
+}
+
template <>
-EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a,
- const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_or_ps(a, b);
#else
- Packet16f res = _mm512_undefined_ps();
- Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
- Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
- res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0);
-
- Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
- Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
- res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1);
-
- Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
- Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
- res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2);
-
- Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
- Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
- res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3);
-
- return res;
+ return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b)));
#endif
}
@@ -331,109 +404,59 @@ EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a,
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_or_pd(a, b);
#else
- Packet8d res = _mm512_undefined_pd();
- Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
- Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
- res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0);
-
- Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
- Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
- res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1);
-
- return res;
+ return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
#endif
}
template <>
-EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a,
- const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16i pxor<Packet16i>(const Packet16i& a, const Packet16i& b) {
+ return _mm512_xor_si512(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_xor_ps(a, b);
#else
- Packet16f res = _mm512_undefined_ps();
- Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
- Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
- res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0);
-
- Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
- Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
- res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1);
-
- Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
- Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
- res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2);
-
- Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
- Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
- res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3);
-
- return res;
+ return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b)));
#endif
}
+
template <>
-EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a,
- const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a, const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_xor_pd(a, b);
#else
- Packet8d res = _mm512_undefined_pd();
- Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
- Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
- res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0);
-
- Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
- Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
- res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1);
-
- return res;
+ return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
#endif
}
template <>
-EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a,
- const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16i pandnot<Packet16i>(const Packet16i& a, const Packet16i& b) {
+ return _mm512_andnot_si512(b, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
- return _mm512_andnot_ps(a, b);
+ return _mm512_andnot_ps(b, a);
#else
- Packet16f res = _mm512_undefined_ps();
- Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
- Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
- res = _mm512_insertf32x4(res, _mm_andnot_ps(lane0_a, lane0_b), 0);
-
- Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
- Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
- res = _mm512_insertf32x4(res, _mm_andnot_ps(lane1_a, lane1_b), 1);
-
- Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
- Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
- res = _mm512_insertf32x4(res, _mm_andnot_ps(lane2_a, lane2_b), 2);
-
- Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
- Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
- res = _mm512_insertf32x4(res, _mm_andnot_ps(lane3_a, lane3_b), 3);
-
- return res;
+ return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b)));
#endif
}
template <>
-EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,
- const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
- return _mm512_andnot_pd(a, b);
+ return _mm512_andnot_pd(b, a);
#else
- Packet8d res = _mm512_undefined_pd();
- Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
- Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
- res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0);
-
- Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
- Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
- res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1);
-
- return res;
+ return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
#endif
}
+template<int N> EIGEN_STRONG_INLINE Packet16i pshiftleft(Packet16i a) {
+ return _mm512_slli_epi32(a, N);
+}
+
template <>
EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from);
@@ -475,6 +498,7 @@ EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
}
#ifdef EIGEN_VECTORIZE_AVX512DQ
+// FIXME: this does not look optimal, better load a Packet4d and shuffle...
// Loads 4 doubles from memory and returns the packet
// {a0, a0, a1, a1, a2, a2, a3, a3}
template <>
@@ -502,21 +526,17 @@ EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
// {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
template <>
EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
- Packet16f tmp = _mm512_undefined_ps();
- tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0);
- tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1);
- tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2);
- tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3);
- return tmp;
+ Packet16f tmp = _mm512_castps128_ps512(ploadu<Packet4f>(from));
+ const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
+ return _mm512_permutexvar_ps(scatter_mask, tmp);
}
+
// Loads 2 doubles from memory and returns the packet
// {a0, a0, a0, a0, a1, a1, a1, a1}
template <>
EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
- __m128d tmp0 = _mm_load_pd1(from);
- __m256d lane0 = _mm256_broadcastsd_pd(tmp0);
- __m128d tmp1 = _mm_load_pd1(from + 1);
- __m256d lane1 = _mm256_broadcastsd_pd(tmp1);
+ __m256d lane0 = _mm256_set1_pd(*from);
+ __m256d lane1 = _mm256_set1_pd(*(from+1));
__m512d tmp = _mm512_undefined_pd();
tmp = _mm512_insertf64x4(tmp, lane0, 0);
return _mm512_insertf64x4(tmp, lane1, 1);
@@ -981,6 +1001,13 @@ EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
}
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x)
+{
+ Packet16i xi = _mm512_castps_si512(x);
+ __mmask16 tmp = _mm512_test_epi32_mask(xi,xi);
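+ // _mm512_kortestz returns 1 iff the OR of its two mask arguments is zero,
+ // so the negation is true as soon as any lane of x has a non-zero bit pattern.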
+ return !_mm512_kortestz(tmp,tmp);
+}
+
template <int Offset>
struct palign_impl<Offset, Packet16f> {
static EIGEN_STRONG_INLINE void run(Packet16f& first,
@@ -1322,6 +1349,22 @@ template<> EIGEN_STRONG_INLINE Packet8d pinsertlast(const Packet8d& a, double b)
return _mm512_mask_broadcastsd_pd(a, (1<<7), _mm_load_sd(&b));
}
+template<> EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
+ return _mm512_cvttps_epi32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
+ return _mm512_cvtepi32_ps(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16i preinterpret<Packet16i,Packet16f>(const Packet16f& a) {
+ return _mm512_castps_si512(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f,Packet16i>(const Packet16i& a) {
+ return _mm512_castsi512_ps(a);
+}
+
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 3e665730c..440d058d8 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -60,7 +60,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
@@ -82,14 +82,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<f
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
{
- std::complex<float> EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 std::complex<float> af[2];
af[0] = from[0*stride];
af[1] = from[1*stride];
return pload<Packet2cf>(af);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
{
- std::complex<float> EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 std::complex<float> af[2];
pstore<std::complex<float> >((std::complex<float> *) af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
@@ -128,7 +128,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::co
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
- std::complex<float> EIGEN_ALIGN16 res[2];
+ EIGEN_ALIGN16 std::complex<float> res[2];
pstore((float *)&res, a.v);
return res[0];
@@ -286,7 +286,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
@@ -298,14 +298,14 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<dou
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
{
- std::complex<double> EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 std::complex<double> af[2];
af[0] = from[0*stride];
af[1] = from[1*stride];
return pload<Packet1cd>(af);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
{
- std::complex<double> EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 std::complex<double> af[2];
pstore<std::complex<double> >(af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
@@ -345,7 +345,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::c
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
- std::complex<double> EIGEN_ALIGN16 res[2];
+ EIGEN_ALIGN16 std::complex<double> res[2];
pstore<std::complex<double> >(res, a);
return res[0];
diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
index c5e4bede7..81097e668 100644
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -9,191 +9,37 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-/* The sin, cos, exp, and log functions of this file come from
- * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
- */
-
#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H
#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H
+#include "../Default/GenericPacketMathFunctions.h"
+
namespace Eigen {
namespace internal {
-static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-static _EIGEN_DECLARE_CONST_Packet4i(23, 23);
-
-static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
-/* the smallest non denormalized float number */
-static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
-static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f
-static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff);
-
-/* natural logarithm computed for 4 simultaneous float
- return NaN for x <= 0
-*/
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
-
-static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
-static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
-
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
-
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
-
-#ifdef __VSX__
-static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
-static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
-static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-
-static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437);
-static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
-
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
-static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
-
-#ifdef __POWER8_VECTOR__
-static Packet2l p2l_1023 = { 1023, 1023 };
-static Packet2ul p2ul_52 = { 52, 52 };
-#endif
-
-#endif
-
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f plog<Packet4f>(const Packet4f& _x)
{
- Packet4f x = _x;
-
- Packet4i emm0;
-
- /* isvalid_mask is 0 if x < 0 or x is NaN. */
- Packet4ui isvalid_mask = reinterpret_cast<Packet4ui>(vec_cmpge(x, p4f_ZERO));
- Packet4ui iszero_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(x, p4f_ZERO));
-
- x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */
- emm0 = vec_sr(reinterpret_cast<Packet4i>(x),
- reinterpret_cast<Packet4ui>(p4i_23));
-
- /* keep only the fractional part */
- x = pand(x, p4f_inv_mant_mask);
- x = por(x, p4f_half);
-
- emm0 = psub(emm0, p4i_0x7f);
- Packet4f e = padd(vec_ctf(emm0, 0), p4f_1);
-
- /* part2:
- if( x < SQRTHF ) {
- e -= 1;
- x = x + x - 1.0;
- } else { x = x - 1.0; }
- */
- Packet4f mask = reinterpret_cast<Packet4f>(vec_cmplt(x, p4f_cephes_SQRTHF));
- Packet4f tmp = pand(x, mask);
- x = psub(x, p4f_1);
- e = psub(e, pand(p4f_1, mask));
- x = padd(x, tmp);
-
- Packet4f x2 = pmul(x,x);
- Packet4f x3 = pmul(x2,x);
-
- Packet4f y, y1, y2;
- y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
- y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
- y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
- y = pmadd(y , x, p4f_cephes_log_p2);
- y1 = pmadd(y1, x, p4f_cephes_log_p5);
- y2 = pmadd(y2, x, p4f_cephes_log_p8);
- y = pmadd(y, x3, y1);
- y = pmadd(y, x3, y2);
- y = pmul(y, x3);
-
- y1 = pmul(e, p4f_cephes_log_q1);
- tmp = pmul(x2, p4f_half);
- y = padd(y, y1);
- x = psub(x, tmp);
- y2 = pmul(e, p4f_cephes_log_q2);
- x = padd(x, y);
- x = padd(x, y2);
- // negative arg will be NAN, 0 will be -INF
- x = vec_sel(x, p4f_minus_inf, iszero_mask);
- x = vec_sel(p4f_minus_nan, x, isvalid_mask);
- return x;
+ return plog_float(_x);
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f pexp<Packet4f>(const Packet4f& _x)
{
- Packet4f x = _x;
-
- Packet4f tmp, fx;
- Packet4i emm0;
-
- // clamp x
- x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
-
- // express exp(x) as exp(g + n*log(2))
- fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
-
- fx = pfloor(fx);
-
- tmp = pmul(fx, p4f_cephes_exp_C1);
- Packet4f z = pmul(fx, p4f_cephes_exp_C2);
- x = psub(x, tmp);
- x = psub(x, z);
-
- z = pmul(x,x);
-
- Packet4f y = p4f_cephes_exp_p0;
- y = pmadd(y, x, p4f_cephes_exp_p1);
- y = pmadd(y, x, p4f_cephes_exp_p2);
- y = pmadd(y, x, p4f_cephes_exp_p3);
- y = pmadd(y, x, p4f_cephes_exp_p4);
- y = pmadd(y, x, p4f_cephes_exp_p5);
- y = pmadd(y, z, x);
- y = padd(y, p4f_1);
+ return pexp_float(_x);
+}
- // build 2^n
- emm0 = vec_cts(fx, 0);
- emm0 = vec_add(emm0, p4i_0x7f);
- emm0 = vec_sl(emm0, reinterpret_cast<Packet4ui>(p4i_23));
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psin<Packet4f>(const Packet4f& _x)
+{
+ return psin_float(_x);
+}
- // Altivec's max & min operators just drop silent NaNs. Check NaNs in
- // inputs and return them unmodified.
- Packet4ui isnumber_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(_x, _x));
- return vec_sel(_x, pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x),
- isnumber_mask);
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pcos<Packet4f>(const Packet4f& _x)
+{
+ return pcos_float(_x);
}
#ifndef EIGEN_COMP_CLANG
@@ -225,93 +71,10 @@ Packet2d psqrt<Packet2d>(const Packet2d& x)
return vec_sqrt(x);
}
-// VSX support varies between different compilers and even different
-// versions of the same compiler. For gcc version >= 4.9.3, we can use
-// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use
-// a slow version that works with older compilers.
-// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
-// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
-static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
-#if EIGEN_GNUC_AT_LEAST(5, 4) || \
- (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
- return vec_cts(x, 0); // TODO: check clang version.
-#else
- double tmp[2];
- memcpy(tmp, &x, sizeof(tmp));
- Packet2l l = { static_cast<long long>(tmp[0]),
- static_cast<long long>(tmp[1]) };
- return l;
-#endif
-}
-
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d pexp<Packet2d>(const Packet2d& _x)
{
- Packet2d x = _x;
-
- Packet2d tmp, fx;
- Packet2l emm0;
-
- // clamp x
- x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
-
- /* express exp(x) as exp(g + n*log(2)) */
- fx = pmadd(x, p2d_cephes_LOG2EF, p2d_half);
-
- fx = pfloor(fx);
-
- tmp = pmul(fx, p2d_cephes_exp_C1);
- Packet2d z = pmul(fx, p2d_cephes_exp_C2);
- x = psub(x, tmp);
- x = psub(x, z);
-
- Packet2d x2 = pmul(x,x);
-
- Packet2d px = p2d_cephes_exp_p0;
- px = pmadd(px, x2, p2d_cephes_exp_p1);
- px = pmadd(px, x2, p2d_cephes_exp_p2);
- px = pmul (px, x);
-
- Packet2d qx = p2d_cephes_exp_q0;
- qx = pmadd(qx, x2, p2d_cephes_exp_q1);
- qx = pmadd(qx, x2, p2d_cephes_exp_q2);
- qx = pmadd(qx, x2, p2d_cephes_exp_q3);
-
- x = pdiv(px,psub(qx,px));
- x = pmadd(p2d_2,x,p2d_1);
-
- // build 2^n
- emm0 = ConvertToPacket2l(fx);
-
-#ifdef __POWER8_VECTOR__
- emm0 = vec_add(emm0, p2l_1023);
- emm0 = vec_sl(emm0, p2ul_52);
-#else
- // Code is a bit complex for POWER7. There is actually a
- // vec_xxsldi intrinsic but it is not supported by some gcc versions.
- // So we shift (52-32) bits and do a word swap with zeros.
- _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
- _EIGEN_DECLARE_CONST_Packet4i(20, 20); // 52 - 32
-
- Packet4i emm04i = reinterpret_cast<Packet4i>(emm0);
- emm04i = vec_add(emm04i, p4i_1023);
- emm04i = vec_sl(emm04i, reinterpret_cast<Packet4ui>(p4i_20));
- static const Packet16uc perm = {
- 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
- 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
-#ifdef _BIG_ENDIAN
- emm0 = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, emm04i, perm));
-#else
- emm0 = reinterpret_cast<Packet2l>(vec_perm(emm04i, p4i_ZERO, perm));
-#endif
-
-#endif
-
- // Altivec's max & min operators just drop silent NaNs. Check NaNs in
- // inputs and return them unmodified.
- Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x));
- return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x),
- isnumber_mask);
+ return pexp_double(_x);
}
#endif
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 7f4e90f75..9535724eb 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -146,9 +146,9 @@ template<> struct packet_traits<float> : default_packet_traits
HasMin = 1,
HasMax = 1,
HasAbs = 1,
- HasSin = 0,
- HasCos = 0,
- HasLog = 0,
+ HasSin = EIGEN_FAST_MATH,
+ HasCos = EIGEN_FAST_MATH,
+ HasLog = 1,
HasExp = 1,
#ifdef __VSX__
HasSqrt = 1,
@@ -187,8 +187,19 @@ template<> struct packet_traits<int> : default_packet_traits
};
-template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f>
+{
+ typedef float type;
+ typedef Packet4f half;
+ typedef Packet4i integer_packet;
+ enum {size=4, alignment=Aligned16, vectorizable=true};
+};
+template<> struct unpacket_traits<Packet4i>
+{
+ typedef int type;
+ typedef Packet4i half;
+ enum {size=4, alignment=Aligned16, vectorizable=false};
+};
inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
{
@@ -285,6 +296,11 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
Packet4i v = {from, from, from, from};
return v;
}
+
+template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
+ return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
+}
+
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
@@ -308,7 +324,7 @@ pbroadcast4<Packet4i>(const int *a,
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
- float EIGEN_ALIGN16 af[4];
+ EIGEN_ALIGN16 float af[4];
af[0] = from[0*stride];
af[1] = from[1*stride];
af[2] = from[2*stride];
@@ -317,7 +333,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
{
- int EIGEN_ALIGN16 ai[4];
+ EIGEN_ALIGN16 int ai[4];
ai[0] = from[0*stride];
ai[1] = from[1*stride];
ai[2] = from[2*stride];
@@ -326,7 +342,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* f
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
{
- float EIGEN_ALIGN16 af[4];
+ EIGEN_ALIGN16 float af[4];
pstore<float>(af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
@@ -335,7 +351,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, co
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
{
- int EIGEN_ALIGN16 ai[4];
+ EIGEN_ALIGN16 int ai[4];
pstore<int>((int *)ai, from);
to[0*stride] = ai[0];
to[1*stride] = ai[1];
@@ -414,6 +430,15 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const
}
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
+ Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b));
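+ // "a < b or unordered" is the negation of "a >= b"; vec_nor(c,c) computes
+ // the bitwise NOT of c.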
+ return vec_nor(c,c);
+}
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); }
+
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
@@ -426,6 +451,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+ return vec_sel(b, a, mask);
+}
+
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); }
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); }
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
@@ -536,8 +565,8 @@ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f&
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
-template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; }
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
{
@@ -550,6 +579,19 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
+template<int N> EIGEN_STRONG_INLINE Packet4i pshiftright(Packet4i a)
+{ return vec_sr(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
+template<int N> EIGEN_STRONG_INLINE Packet4i pshiftleft(Packet4i a)
+{ return vec_sl(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+ return pfrexp_float(a,exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+ return pldexp_float(a,exponent);
+}
+
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
Packet4f b, sum;
@@ -678,6 +720,11 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
return pfirst(res);
}
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+ return vec_any_ne(x, pzero(x));
+}
+
template<int Offset>
struct palign_impl<Offset,Packet4f>
{
@@ -771,6 +818,43 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons
}
+template <>
+struct type_casting_traits<float, int> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+template <>
+struct type_casting_traits<int, float> {
+ enum {
+ VectorizedCast = 1,
+ SrcCoeffRatio = 1,
+ TgtCoeffRatio = 1
+ };
+};
+
+
+template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+ return vec_cts(a,0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+ return vec_ctf(a,0);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
+ return reinterpret_cast<Packet4i>(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
+ return reinterpret_cast<Packet4f>(a);
+}
+
+
+
//---------- double ----------
#ifdef __VSX__
typedef __vector double Packet2d;
@@ -837,7 +921,7 @@ template<> struct packet_traits<double> : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; };
inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
{
@@ -901,14 +985,14 @@ pbroadcast4<Packet2d>(const double *a,
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
- double EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 double af[2];
af[0] = from[0*stride];
af[1] = from[1*stride];
return pload<Packet2d>(af);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
{
- double EIGEN_ALIGN16 af[2];
+ EIGEN_ALIGN16 double af[2];
pstore<double>(af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
@@ -980,7 +1064,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d&
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
{
@@ -988,6 +1072,59 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
}
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
+// VSX support varies between different compilers and even different
+// versions of the same compiler. For gcc version >= 4.9.3, we can use
+// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use
+// a slow version that works with older compilers.
+// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
+// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
+static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
+#if EIGEN_GNUC_AT_LEAST(5, 4) || \
+ (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
+ return vec_cts(x, 0); // TODO: check clang version.
+#else
+ double tmp[2];
+ memcpy(tmp, &x, sizeof(tmp));
+ Packet2l l = { static_cast<long long>(tmp[0]),
+ static_cast<long long>(tmp[1]) };
+ return l;
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+
+ // build 2^n
+ Packet2l emm0 = ConvertToPacket2l(exponent);
+
+#ifdef __POWER8_VECTOR__
+ const Packet2l p2l_1023 = { 1023, 1023 };
+ const Packet2ul p2ul_52 = { 52, 52 };
+ emm0 = vec_add(emm0, p2l_1023);
+ emm0 = vec_sl(emm0, p2ul_52);
+#else
+ // Code is a bit complex for POWER7. There is actually a
+ // vec_xxsldi intrinsic but it is not supported by some gcc versions.
+ // So we shift (52-32) bits and do a word swap with zeros.
+ const Packet4i p4i_1023 = pset1<Packet4i>(1023);
+ const Packet4i p4i_20 = pset1<Packet4i>(20); // 52 - 32
+
+ Packet4i emm04i = reinterpret_cast<Packet4i>(emm0);
+ emm04i = vec_add(emm04i, p4i_1023);
+ emm04i = vec_sl(emm04i, reinterpret_cast<Packet4ui>(p4i_20));
+ static const Packet16uc perm = {
+ 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
+ 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
+#ifdef _BIG_ENDIAN
+ emm0 = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, emm04i, perm));
+#else
+ emm0 = reinterpret_cast<Packet2l>(vec_perm(emm04i, p4i_ZERO, perm));
+#endif
+
+#endif
+
+ return pmul(a, reinterpret_cast<Packet2d>(emm0));
+}
+
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
Packet2d b, sum;
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
new file mode 100644
index 000000000..452b4c806
--- /dev/null
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -0,0 +1,471 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+// Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The exp and log functions of this file initially come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+namespace Eigen {
+namespace internal {
+
+template<typename Packet> EIGEN_STRONG_INLINE Packet
+pfrexp_float(const Packet& a, Packet& exponent) {
+ typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+ const Packet cst_126f = pset1<Packet>(126.0f);
+ const Packet cst_half = pset1<Packet>(0.5f);
+ const Packet cst_inv_mant_mask = pset1frombits<Packet>(~0x7f800000u);
+ exponent = psub(pcast<PacketI,Packet>(pshiftright<23>(preinterpret<PacketI>(a))), cst_126f);
+ return por(pand(a, cst_inv_mant_mask), cst_half);
+}
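+// For instance (an illustrative trace, not part of the patch): for a = 8.0f,
+// whose bits are 0x41000000, the shifted exponent field is 130, so the returned
+// exponent is 130 - 126 = 4 and the returned mantissa is 0.5f, i.e. 8 = 0.5 * 2^4.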
+
+template<typename Packet> EIGEN_STRONG_INLINE Packet
+pldexp_float(Packet a, Packet exponent)
+{
+ typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+ const Packet cst_127 = pset1<Packet>(127.f);
+ // return a * 2^exponent
+ PacketI ei = pcast<Packet,PacketI>(padd(exponent, cst_127));
+ return pmul(a, preinterpret<Packet>(pshiftleft<23>(ei)));
+}
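+// E.g. (illustrative): for exponent = 3.0f, ei = 130 and 130 << 23 is the bit
+// pattern of 8.0f, so the multiply yields a * 2^3 without ever converting a
+// itself to an integer.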
+
+// Natural logarithm
+// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C = log(2)
+// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
+// be easily approximated by a polynomial centered on m=1 for stability.
+// TODO(gonnet): Further reduce the interval allowing for lower-degree
+// polynomial interpolants -> ... -> profit!
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog_float(const Packet _x)
+{
+ Packet x = _x;
+
+ const Packet cst_1 = pset1<Packet>(1.0f);
+ const Packet cst_half = pset1<Packet>(0.5f);
+ // The smallest non denormalized float number.
+ const Packet cst_min_norm_pos = pset1frombits<Packet>( 0x00800000u);
+ const Packet cst_minus_inf = pset1frombits<Packet>( 0xff800000u);
+ const Packet cst_pos_inf = pset1frombits<Packet>( 0x7f800000u);
+
+ // Polynomial coefficients.
+ const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f);
+ const Packet cst_cephes_log_p0 = pset1<Packet>(7.0376836292E-2f);
+ const Packet cst_cephes_log_p1 = pset1<Packet>(-1.1514610310E-1f);
+ const Packet cst_cephes_log_p2 = pset1<Packet>(1.1676998740E-1f);
+ const Packet cst_cephes_log_p3 = pset1<Packet>(-1.2420140846E-1f);
+ const Packet cst_cephes_log_p4 = pset1<Packet>(+1.4249322787E-1f);
+ const Packet cst_cephes_log_p5 = pset1<Packet>(-1.6668057665E-1f);
+ const Packet cst_cephes_log_p6 = pset1<Packet>(+2.0000714765E-1f);
+ const Packet cst_cephes_log_p7 = pset1<Packet>(-2.4999993993E-1f);
+ const Packet cst_cephes_log_p8 = pset1<Packet>(+3.3333331174E-1f);
+ const Packet cst_cephes_log_q1 = pset1<Packet>(-2.12194440e-4f);
+ const Packet cst_cephes_log_q2 = pset1<Packet>(0.693359375f);
+
+ // Truncate input values to the minimum positive normal.
+ x = pmax(x, cst_min_norm_pos);
+
+ Packet e;
+ // extract the significand in the range [0.5,1) and the exponent
+ x = pfrexp(x,e);
+
+ // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
+ // and shift by -1. The values are then centered around 0, which improves
+ // the stability of the polynomial evaluation.
+ // if( x < SQRTHF ) {
+ // e -= 1;
+ // x = x + x - 1.0;
+ // } else { x = x - 1.0; }
+ Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
+ Packet tmp = pand(x, mask);
+ x = psub(x, cst_1);
+ e = psub(e, pand(cst_1, mask));
+ x = padd(x, tmp);
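+ // Illustrative example: x = 10 gives m = 0.625, e = 4 from pfrexp; since
+ // 0.625 < sqrt(1/2), the branchless remapping above yields x = 0.25, e = 3,
+ // so that log(10) = 3*log(2) + log(1 + 0.25) ~= 2.0794 + 0.2231.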
+
+ Packet x2 = pmul(x, x);
+ Packet x3 = pmul(x2, x);
+
+ // Evaluate the polynomial approximant of degree 8 in three parts, probably
+ // to improve instruction-level parallelism.
+ Packet y, y1, y2;
+ y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
+ y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
+ y2 = pmadd(cst_cephes_log_p6, x, cst_cephes_log_p7);
+ y = pmadd(y, x, cst_cephes_log_p2);
+ y1 = pmadd(y1, x, cst_cephes_log_p5);
+ y2 = pmadd(y2, x, cst_cephes_log_p8);
+ y = pmadd(y, x3, y1);
+ y = pmadd(y, x3, y2);
+ y = pmul(y, x3);
+
+ // Add the logarithm of the exponent back to the result of the interpolation.
+ y1 = pmul(e, cst_cephes_log_q1);
+ tmp = pmul(x2, cst_half);
+ y = padd(y, y1);
+ x = psub(x, tmp);
+ y2 = pmul(e, cst_cephes_log_q2);
+ x = padd(x, y);
+ x = padd(x, y2);
+
+ Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
+ Packet iszero_mask = pcmp_eq(_x,pzero(_x));
+ Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
+ // Filter out invalid inputs, i.e.:
+ // - negative arg will be NAN,
+ // - 0 will be -INF,
+ // - +INF will be +INF.
+ return pselect(iszero_mask, cst_minus_inf,
+ por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
+}
+
+// Exponential function. Works by writing "x = m*log(2) + r" where
+// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
+// "exp(x) = 2^m*exp(r)", where r is in the range [-log(2)/2, log(2)/2].
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet pexp_float(const Packet _x)
+{
+ const Packet cst_1 = pset1<Packet>(1.0f);
+ const Packet cst_half = pset1<Packet>(0.5f);
+ const Packet cst_exp_hi = pset1<Packet>( 88.3762626647950f);
+ const Packet cst_exp_lo = pset1<Packet>(-88.3762626647949f);
+
+ const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
+ const Packet cst_cephes_exp_p0 = pset1<Packet>(1.9875691500E-4f);
+ const Packet cst_cephes_exp_p1 = pset1<Packet>(1.3981999507E-3f);
+ const Packet cst_cephes_exp_p2 = pset1<Packet>(8.3334519073E-3f);
+ const Packet cst_cephes_exp_p3 = pset1<Packet>(4.1665795894E-2f);
+ const Packet cst_cephes_exp_p4 = pset1<Packet>(1.6666665459E-1f);
+ const Packet cst_cephes_exp_p5 = pset1<Packet>(5.0000001201E-1f);
+
+ // Clamp x.
+ Packet x = pmax(pmin(_x, cst_exp_hi), cst_exp_lo);
+
+ // Express exp(x) as exp(m*ln(2) + r), start by extracting
+ // m = floor(x/ln(2) + 0.5).
+ Packet m = pfloor(pmadd(x, cst_cephes_LOG2EF, cst_half));
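+ // E.g. (illustrative): for x = 1, m = floor(1.4427 + 0.5) = 1 and
+ // r = 1 - log(2) ~= 0.30685, so exp(1) is reconstructed as 2^1 * exp(0.30685).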
+
+ // Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
+ // subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
+ // truncation errors.
+ Packet r;
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+ const Packet cst_nln2 = pset1<Packet>(-0.6931471805599453f);
+ r = pmadd(m, cst_nln2, x);
+#else
+ const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693359375f);
+ const Packet cst_cephes_exp_C2 = pset1<Packet>(-2.12194440e-4f);
+ r = psub(x, pmul(m, cst_cephes_exp_C1));
+ r = psub(r, pmul(m, cst_cephes_exp_C2));
+#endif
+
+ Packet r2 = pmul(r, r);
+
+ // TODO(gonnet): Split into odd/even polynomials and try to exploit
+ // instruction-level parallelism.
+ Packet y = cst_cephes_exp_p0;
+ y = pmadd(y, r, cst_cephes_exp_p1);
+ y = pmadd(y, r, cst_cephes_exp_p2);
+ y = pmadd(y, r, cst_cephes_exp_p3);
+ y = pmadd(y, r, cst_cephes_exp_p4);
+ y = pmadd(y, r, cst_cephes_exp_p5);
+ y = pmadd(y, r2, r);
+ y = padd(y, cst_1);
+
+ // Return 2^m * exp(r).
+ return pmax(pldexp(y,m), _x);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet pexp_double(const Packet _x)
+{
+ Packet x = _x;
+
+ const Packet cst_1 = pset1<Packet>(1.0);
+ const Packet cst_2 = pset1<Packet>(2.0);
+ const Packet cst_half = pset1<Packet>(0.5);
+
+ const Packet cst_exp_hi = pset1<Packet>(709.437);
+ const Packet cst_exp_lo = pset1<Packet>(-709.436139303);
+
+ const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
+ const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
+ const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
+ const Packet cst_cephes_exp_p2 = pset1<Packet>(9.99999999999999999910e-1);
+ const Packet cst_cephes_exp_q0 = pset1<Packet>(3.00198505138664455042e-6);
+ const Packet cst_cephes_exp_q1 = pset1<Packet>(2.52448340349684104192e-3);
+ const Packet cst_cephes_exp_q2 = pset1<Packet>(2.27265548208155028766e-1);
+ const Packet cst_cephes_exp_q3 = pset1<Packet>(2.00000000000000000009e0);
+ const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693145751953125);
+ const Packet cst_cephes_exp_C2 = pset1<Packet>(1.42860682030941723212e-6);
+
+ Packet tmp, fx;
+
+ // clamp x
+ x = pmax(pmin(x, cst_exp_hi), cst_exp_lo);
+ // Express exp(x) as exp(g + n*log(2)).
+ fx = pmadd(cst_cephes_LOG2EF, x, cst_half);
+
+ // Get the integer part of x/log(2), i.e. the "n" described above.
+ fx = pfloor(fx);
+
+ // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
+ // n*log(2) in two steps, i.e. n*C1 + n*C2 with C1+C2 = log(2), to get the
+ // last digits right.
+ tmp = pmul(fx, cst_cephes_exp_C1);
+ Packet z = pmul(fx, cst_cephes_exp_C2);
+ x = psub(x, tmp);
+ x = psub(x, z);
+
+ Packet x2 = pmul(x, x);
+
+ // Evaluate the numerator polynomial of the rational interpolant.
+ Packet px = cst_cephes_exp_p0;
+ px = pmadd(px, x2, cst_cephes_exp_p1);
+ px = pmadd(px, x2, cst_cephes_exp_p2);
+ px = pmul(px, x);
+
+ // Evaluate the denominator polynomial of the rational interpolant.
+ Packet qx = cst_cephes_exp_q0;
+ qx = pmadd(qx, x2, cst_cephes_exp_q1);
+ qx = pmadd(qx, x2, cst_cephes_exp_q2);
+ qx = pmadd(qx, x2, cst_cephes_exp_q3);
+
+ // I don't really get this bit, copied from the SSE2 routines, so...
+ // TODO(gonnet): Figure out what is going on here, perhaps find a better
+ // rational interpolant?
+ x = pdiv(px, psub(qx, px));
+ x = pmadd(cst_2, x, cst_1);
+
+ // Construct the result 2^n * exp(g) via pldexp. The max is used to catch
+ // non-finite values in the input.
+ return pmax(pldexp(x,fx), _x);
+}
+
+// The following code is inspired by the following stack-overflow answer:
+// https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
+// It has been largely optimized:
+// - By-pass calls to frexp.
+// - Aligned loads of the required 96 bits of 2/pi. This is accomplished by
+//   (1) balancing the mantissa and exponent so that the required bits of 2/pi
+//   are aligned on 8-bit boundaries, and (2) replicating the storage of the
+//   bits of 2/pi.
+// - Avoid a branch in rounding and extraction of the remaining fractional part.
+// Overall, I measured a speedup of more than 2x on x86-64.
+inline float trig_reduce_huge (float xf, int *quadrant)
+{
+ using Eigen::numext::int32_t;
+ using Eigen::numext::uint32_t;
+ using Eigen::numext::int64_t;
+ using Eigen::numext::uint64_t;
+
+ const double pio2_62 = 3.4061215800865545e-19; // pi/2 * 2^-62
+ const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point format
+
+ // 192 bits of 2/pi for Payne-Hanek reduction
+ // Bits are introduced by packet of 8 to enable aligned reads.
+ static const uint32_t two_over_pi [] =
+ {
+ 0x00000028, 0x000028be, 0x0028be60, 0x28be60db,
+ 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a,
+ 0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4,
+ 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770,
+ 0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566,
+ 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410,
+ 0x10e41000, 0xe4100000
+ };
+
+ uint32_t xi = numext::as_uint(xf);
+ // Below, -118 = -126 + 8.
+ // -126 is to get the exponent,
+ // +8 is to enable alignment of 2/pi's bits on 8 bits.
+ // This is possible because the significand of x has only 24 meaningful bits.
+ uint32_t e = (xi >> 23) - 118;
+ // Extract the mantissa and shift it to align it wrt the exponent
+ xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7);
+
+ uint32_t i = e >> 3;
+ uint32_t twoopi_1 = two_over_pi[i-1];
+ uint32_t twoopi_2 = two_over_pi[i+3];
+ uint32_t twoopi_3 = two_over_pi[i+7];
+
+ // Compute x * 2/pi in 2.62-bit fixed-point format.
+ uint64_t p;
+ p = uint64_t(xi) * twoopi_3;
+ p = uint64_t(xi) * twoopi_2 + (p >> 32);
+ p = (uint64_t(xi * twoopi_1) << 32) + p;
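+ // The three partial products above multiply the 32-bit mantissa by 96
+ // consecutive bits of 2/pi, keeping only the 64-bit window around the binary
+ // point of x*2/pi that is needed to recover the quadrant and the remainder.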
+
+ // Round to nearest: add 0.5 and extract integral part.
+ uint64_t q = (p + zero_dot_five) >> 62;
+ *quadrant = int(q);
+ // Now it remains to compute "r = x - q*pi/2" with high accuracy. Since we
+ // have p = x/(pi/2) with high accuracy, we can more efficiently compute r as:
+ //   r = (p-q)*pi/2,
+ // where the product can be carried out with sufficient accuracy using double
+ // precision.
+ p -= q<<62;
+ return float(double(int64_t(p)) * pio2_62);
+}
+
+template<bool ComputeSine,typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+#if EIGEN_GNUC_AT_LEAST(4,4)
+__attribute__((optimize("-fno-unsafe-math-optimizations")))
+#endif
+Packet psincos_float(const Packet& _x)
+{
+// Work around aggressive -ffast-math optimizations.
+// See bug 1674.
+#if EIGEN_COMP_CLANG && defined(EIGEN_VECTORIZE_SSE)
+#define EIGEN_SINCOS_DONT_OPT(X) __asm__ ("" : "+x" (X));
+#else
+#define EIGEN_SINCOS_DONT_OPT(X)
+#endif
+
+ typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+
+ const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f); // 2/PI
+ const Packet cst_rounding_magic = pset1<Packet>(12582912); // 1.5*2^23 for the rounding trick below
+ const PacketI csti_1 = pset1<PacketI>(1);
+ const Packet cst_sign_mask = pset1frombits<Packet>(0x80000000u);
+
+ Packet x = pabs(_x);
+
+ // Scale x by 2/Pi to find x's quadrant.
+ Packet y = pmul(x, cst_2oPI);
+
+ // Rounding trick:
+ Packet y_round = padd(y, cst_rounding_magic);
+ EIGEN_SINCOS_DONT_OPT(y_round)
+  PacketI y_int = preinterpret<PacketI>(y_round); // last 23 bits represent the integer (if abs(x)<2^24)
+  y = psub(y_round, cst_rounding_magic);          // nearest integer to x*2/pi
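+  // (Adding the magic constant pushes the integer part of y into the low
+  // mantissa bits, which is why the preinterpret above reads it back directly.)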
+
+  // Reduce x by y quadrants to get: -Pi/4 <= x <= +Pi/4
+ // using "Extended precision modular arithmetic"
+ #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
+ // This version requires true FMA for high accuracy
+  // It provides a max error of 1ULP up to the following thresholds (with absolute_error < 5.9605e-08):
+ const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
+ x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
+ x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
+ x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
+ #else
+ // Without true FMA, the previous set of coefficients maintain 1ULP accuracy
+ // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
+ // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
+
+  // The following set of coefficients maintains 1ULP up to 9.43 and 14.16 for sin and cos respectively,
+  // and 2ULP up to the following thresholds:
+ const float huge_th = ComputeSine ? 25966.f : 18838.f;
+ x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
+ EIGEN_SINCOS_DONT_OPT(x)
+ x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
+ EIGEN_SINCOS_DONT_OPT(x)
+ x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
+ x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
+
+ // For the record, the following set of coefficients maintain 2ULP up
+ // to a slightly larger range:
+ // const float huge_th = ComputeSine ? 51981.f : 39086.125f;
+ // but it slightly fails to maintain 1ULP for two values of sin below pi.
+ // x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
+ // x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
+ // x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
+ // x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
+
+ // For the record, with only 3 iterations it is possible to maintain
+ // 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
+ // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
+ #endif
+
+ if(predux_any(pcmp_le(pset1<Packet>(huge_th),pabs(_x))))
+ {
+ const int PacketSize = unpacket_traits<Packet>::size;
+ EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
+ EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize];
+ EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) int y_int2[PacketSize];
+ pstoreu(vals, pabs(_x));
+ pstoreu(x_cpy, x);
+ pstoreu(y_int2, y_int);
+ for(int k=0; k<PacketSize;++k)
+ {
+ float val = vals[k];
+ if(val>=huge_th && (numext::isfinite)(val))
+ x_cpy[k] = trig_reduce_huge(val,&y_int2[k]);
+ }
+ x = ploadu<Packet>(x_cpy);
+ y_int = ploadu<PacketI>(y_int2);
+ }
+
+ // Compute the sign to apply to the polynomial.
+ // sin: sign = second_bit(y_int) xor signbit(_x)
+ // cos: sign = second_bit(y_int+1)
+ Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(pshiftleft<30>(y_int)))
+ : preinterpret<Packet>(pshiftleft<30>(padd(y_int,csti_1)));
+  sign_bit = pand(sign_bit, cst_sign_mask); // clear all but the leftmost bit
+
+  // Get the polynomial selection mask from the lowest bit of y_int
+ // We'll calculate both (sin and cos) polynomials and then select from the two.
+ Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
+
+ Packet x2 = pmul(x,x);
+
+ // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
+ Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f);
+ y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f ));
+ y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f ));
+ y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
+ y1 = pmadd(y1, x2, pset1<Packet>(1.f));
+
+  // Evaluate the sin(x) polynomial. (-Pi/4 <= x <= Pi/4)
+ // octave/matlab code to compute those coefficients:
+ // x = (0:0.0001:pi/4)';
+ // A = [x.^3 x.^5 x.^7];
+ // w = ((1.-(x/(pi/4)).^2).^5)*2000+1; # weights trading relative accuracy
+ // c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
+ // printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
+ //
+ Packet y2 = pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
+ y2 = pmadd(y2, x2, pset1<Packet>( 0.0083326873655616851693794799871284340042620897293090820312500000f));
+ y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
+ y2 = pmul(y2, x2);
+ y2 = pmadd(y2, x, x);
+
+ // Select the correct result from the two polynomials.
+ y = ComputeSine ? pselect(poly_mask,y2,y1)
+ : pselect(poly_mask,y1,y2);
+
+  // Update the sign
+ return pxor(y, sign_bit);
+
+#undef EIGEN_SINCOS_DONT_OPT
+}
+
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet psin_float(const Packet& x)
+{
+ return psincos_float<true>(x);
+}
+
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet pcos_float(const Packet& x)
+{
+ return psincos_float<false>(x);
+}
+
+} // end namespace internal
+} // end namespace Eigen
diff --git a/Eigen/src/Core/arch/Default/Settings.h b/Eigen/src/Core/arch/Default/Settings.h
index 097373c84..a5c3ada4c 100644
--- a/Eigen/src/Core/arch/Default/Settings.h
+++ b/Eigen/src/Core/arch/Default/Settings.h
@@ -21,7 +21,7 @@
* it does not correspond to the number of iterations or the number of instructions
*/
#ifndef EIGEN_UNROLLING_LIMIT
-#define EIGEN_UNROLLING_LIMIT 100
+#define EIGEN_UNROLLING_LIMIT 110
#endif
/** Defines the threshold between a "small" and a "large" matrix.
diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index ddf37b9c1..cd4615a45 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -53,6 +53,7 @@ template<> struct packet_traits<float> : default_packet_traits
HasBetaInc = 1,
HasBlend = 0,
+ HasFloor = 1,
};
};
@@ -86,12 +87,13 @@ template<> struct packet_traits<double> : default_packet_traits
HasBetaInc = 1,
HasBlend = 0,
+ HasFloor = 1,
};
};
-template<> struct unpacket_traits<float4> { typedef float type; enum {size=4, alignment=Aligned16}; typedef float4 half; };
-template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; };
+template<> struct unpacket_traits<float4> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef float4 half; };
+template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef double2 half; };
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
return make_float4(from, from, from, from);
@@ -100,6 +102,117 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const do
return make_double2(from, from);
}
+namespace {
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
+ const float& b) {
+ return __int_as_float(__float_as_int(a) & __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a,
+ const double& b) {
+ return __longlong_as_double(__double_as_longlong(a) &
+ __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a,
+ const float& b) {
+ return __int_as_float(__float_as_int(a) | __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a,
+ const double& b) {
+ return __longlong_as_double(__double_as_longlong(a) |
+ __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a,
+ const float& b) {
+ return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a,
+ const double& b) {
+ return __longlong_as_double(__double_as_longlong(a) ^
+ __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a,
+ const float& b) {
+ return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a,
+ const double& b) {
+ return __longlong_as_double(__double_as_longlong(a) &
+ ~__double_as_longlong(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a,
+ const float& b) {
+ return __int_as_float(a == b ? 0xffffffffu : 0u);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a,
+ const double& b) {
+ return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
+}
+
+} // namespace
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a,
+ const float4& b) {
+ return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y),
+ bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a,
+ const double2& b) {
+ return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a,
+ const float4& b) {
+ return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y),
+ bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a,
+ const double2& b) {
+ return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a,
+ const float4& b) {
+ return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y),
+ bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a,
+ const double2& b) {
+ return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a,
+ const float4& b) {
+ return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y),
+ bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pandnot<double2>(const double2& a, const double2& b) {
+ return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a,
+ const float4& b) {
+ return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z),
+ eq_mask(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pcmp_eq<double2>(const double2& a, const double2& b) {
+ return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
+}
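+
+// These masks compose with the bitwise helpers above; for instance,
+// pand(pcmp_eq(a, b), c) keeps the lanes of c where a == b and zeroes the
+// others, mirroring how SSE/AVX comparison masks are consumed.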
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
return make_float4(a, a+1, a+2, a+3);
@@ -297,6 +410,13 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
return make_double2(fabs(a.x), fabs(a.y));
}
+template<> EIGEN_DEVICE_FUNC inline float4 pfloor<float4>(const float4& a) {
+ return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
+ return make_double2(floor(a.x), floor(a.y));
+}
+
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<float4,4>& kernel) {
float tmp = kernel.packet[0].y;
diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
index 8787adcde..869fa7ec6 100644
--- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
@@ -30,6 +30,7 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
size=2,
HasHalfPacket = 0,
HasAdd = 1,
+ HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasSqrt = 1,
@@ -41,7 +42,7 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
};
};
-template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
+template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef half2 half; };
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
@@ -137,12 +138,22 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
- half2 result;
- unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
- *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
- return result;
+ half a1 = __low2half(a);
+ half a2 = __high2half(a);
+ half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
+ half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
+ return __halves2half2(result1, result2);
}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue<half2>(const half2& a) {
+ half true_half = half_impl::raw_uint16_to_half(0xffffu);
+ return pset1<half2>(true_half);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero<half2>(const half2& a) {
+ half false_half = half_impl::raw_uint16_to_half(0x0000u);
+ return pset1<half2>(false_half);
+}
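+
+// ptrue/pzero produce the all-ones/all-zeros bit patterns used as boolean
+// masks by the bitwise half2 operations below.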
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<half2,2>& kernel) {
@@ -171,6 +182,68 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen:
#endif
}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq<half2>(const half2& a,
+ const half2& b) {
+ half true_half = half_impl::raw_uint16_to_half(0xffffu);
+ half false_half = half_impl::raw_uint16_to_half(0x0000u);
+ half a1 = __low2half(a);
+ half a2 = __high2half(a);
+ half b1 = __low2half(b);
+ half b2 = __high2half(b);
+ half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
+ half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
+ return __halves2half2(eq1, eq2);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand<half2>(const half2& a,
+ const half2& b) {
+ half a1 = __low2half(a);
+ half a2 = __high2half(a);
+ half b1 = __low2half(b);
+ half b2 = __high2half(b);
+ half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
+ half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
+ return __halves2half2(result1, result2);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por<half2>(const half2& a,
+ const half2& b) {
+ half a1 = __low2half(a);
+ half a2 = __high2half(a);
+ half b1 = __low2half(b);
+ half b2 = __high2half(b);
+ half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
+ half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
+ return __halves2half2(result1, result2);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor<half2>(const half2& a,
+ const half2& b) {
+ half a1 = __low2half(a);
+ half a2 = __high2half(a);
+ half b1 = __low2half(b);
+ half b2 = __high2half(b);
+ half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
+ half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
+ return __halves2half2(result1, result2);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot<half2>(const half2& a,
+ const half2& b) {
+ half a1 = __low2half(a);
+ half a2 = __high2half(a);
+ half b1 = __low2half(b);
+ half b2 = __high2half(b);
+ half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
+ half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
+ return __halves2half2(result1, result2);
+}
+
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
@@ -500,6 +573,7 @@ struct packet_traits<half> : default_packet_traits {
HasAdd = 1,
HasSub = 1,
HasMul = 1,
+ HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
@@ -507,7 +581,6 @@ struct packet_traits<half> : default_packet_traits {
HasMax = 0,
HasConj = 0,
HasSetLinear = 0,
- HasDiv = 0,
HasSqrt = 0,
HasRsqrt = 0,
HasExp = 0,
@@ -517,7 +590,7 @@ struct packet_traits<half> : default_packet_traits {
};
-template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; };
+template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32, vectorizable=true}; typedef Packet16h half; };
template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
Packet16h result;
@@ -640,6 +713,36 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
#endif
}
+template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) {
+ Packet16h r; r.x = _mm256_xor_si256(a.x, pcmp_eq(a.x, a.x)); return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
+ Packet16h r; r.x = Packet8i(ptrue(a.x)); return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) {
+ // in some cases Packet8i is a wrapper around __m256i, so we need to
+ // cast to Packet8i to call the correct overload.
+ Packet16h r; r.x = por(Packet8i(a.x),Packet8i(b.x)); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) {
+ Packet16h r; r.x = pxor(Packet8i(a.x),Packet8i(b.x)); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) {
+ Packet16h r; r.x = pand(Packet8i(a.x),Packet8i(b.x)); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) {
+ Packet16h r; r.x = pandnot(Packet8i(a.x),Packet8i(b.x)); return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) {
+ Packet16f af = half2float(a);
+ Packet16f bf = half2float(b);
+ Packet16f rf = pcmp_eq(af, bf);
+ return float2half(rf);
+}
+
template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
// FIXME we could do that with bit manipulation
Packet16f af = half2float(a);
@@ -668,6 +771,13 @@ template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, con
return float2half(rf);
}
+template<> EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
+ Packet16f af = half2float(a);
+ Packet16f bf = half2float(b);
+ Packet16f rf = pdiv(af, bf);
+ return float2half(rf);
+}
+
template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
Packet16f from_float = half2float(from);
return half(predux(from_float));
@@ -952,6 +1062,7 @@ struct packet_traits<Eigen::half> : default_packet_traits {
HasAdd = 1,
HasSub = 1,
HasMul = 1,
+ HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
@@ -959,7 +1070,6 @@ struct packet_traits<Eigen::half> : default_packet_traits {
HasMax = 0,
HasConj = 0,
HasSetLinear = 0,
- HasDiv = 0,
HasSqrt = 0,
HasRsqrt = 0,
HasExp = 0,
@@ -969,7 +1079,7 @@ struct packet_traits<Eigen::half> : default_packet_traits {
};
-template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; };
+template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true}; typedef Packet8h half; };
template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
Packet8h result;
@@ -1063,6 +1173,32 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
#endif
}
+template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
+ Packet8h r; r.x = _mm_cmpeq_epi32(a.x, a.x); return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) {
+  // in some cases Packet4i is a wrapper around __m128i, so here we call the
+  // integer intrinsics directly on the underlying registers:
+ Packet8h r; r.x = _mm_or_si128(a.x,b.x); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) {
+ Packet8h r; r.x = _mm_xor_si128(a.x,b.x); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) {
+ Packet8h r; r.x = _mm_and_si128(a.x,b.x); return r;
+}
+template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) {
+ Packet8h r; r.x = _mm_andnot_si128(b.x,a.x); return r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) {
+ Packet8f af = half2float(a);
+ Packet8f bf = half2float(b);
+ Packet8f rf = pcmp_eq(af, bf);
+ return float2half(rf);
+}
+
template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
@@ -1093,6 +1229,13 @@ template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const
return float2half(rf);
}
+template<> EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
+ Packet8f af = half2float(a);
+ Packet8f bf = half2float(b);
+ Packet8f rf = pdiv(af, bf);
+ return float2half(rf);
+}
+
template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
{
Packet8h result;
@@ -1279,9 +1422,10 @@ struct packet_traits<Eigen::half> : default_packet_traits {
AlignedOnScalar = 1,
size = 4,
HasHalfPacket = 0,
- HasAdd = 0,
- HasSub = 0,
- HasMul = 0,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
HasNegate = 0,
HasAbs = 0,
HasAbs2 = 0,
@@ -1289,7 +1433,6 @@ struct packet_traits<Eigen::half> : default_packet_traits {
HasMax = 0,
HasConj = 0,
HasSetLinear = 0,
- HasDiv = 0,
HasSqrt = 0,
HasRsqrt = 0,
HasExp = 0,
@@ -1299,7 +1442,7 @@ struct packet_traits<Eigen::half> : default_packet_traits {
};
-template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; };
+template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4h half; };
template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
Packet4h result;
@@ -1336,6 +1479,29 @@ template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const
return result;
}
+template<> EIGEN_STRONG_INLINE Packet4h psub<Packet4h>(const Packet4h& a, const Packet4h& b) {
+ __int64_t a64 = _mm_cvtm64_si64(a.x);
+ __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+ Eigen::half h[4];
+
+ Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+ Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+ h[0] = ha - hb;
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+ h[1] = ha - hb;
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+ h[2] = ha - hb;
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+ h[3] = ha - hb;
+ Packet4h result;
+ result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+ return result;
+}
+
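+// With only MMX available, these Packet4h operations are scalarized: each
+// 16-bit lane is unpacked to an Eigen::half, computed in scalar, and repacked.
+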
template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
__int64_t a64 = _mm_cvtm64_si64(a.x);
__int64_t b64 = _mm_cvtm64_si64(b.x);
@@ -1359,6 +1525,29 @@ template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const
return result;
}
+template<> EIGEN_STRONG_INLINE Packet4h pdiv<Packet4h>(const Packet4h& a, const Packet4h& b) {
+ __int64_t a64 = _mm_cvtm64_si64(a.x);
+ __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+ Eigen::half h[4];
+
+ Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+ Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+ h[0] = ha / hb;
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+ h[1] = ha / hb;
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+ h[2] = ha / hb;
+ ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+ hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+ h[3] = ha / hb;
+ Packet4h result;
+ result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+ return result;
+}
+
template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
Packet4h result;
result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h
index 9a45cf51e..fa64d3564 100644
--- a/Eigen/src/Core/arch/MSA/Complex.h
+++ b/Eigen/src/Core/arch/MSA/Complex.h
@@ -127,7 +127,7 @@ struct packet_traits<std::complex<float> > : default_packet_traits {
template <>
struct unpacket_traits<Packet2cf> {
typedef std::complex<float> type;
- enum { size = 2, alignment = Aligned16 };
+ enum { size = 2, alignment = Aligned16, vectorizable=true };
typedef Packet2cf half;
};
@@ -500,7 +500,7 @@ struct packet_traits<std::complex<double> > : default_packet_traits {
template <>
struct unpacket_traits<Packet1cd> {
typedef std::complex<double> type;
- enum { size = 1, alignment = Aligned16 };
+ enum { size = 1, alignment = Aligned16, vectorizable=true };
typedef Packet1cd half;
};
diff --git a/Eigen/src/Core/arch/MSA/MathFunctions.h b/Eigen/src/Core/arch/MSA/MathFunctions.h
index 98e23e36f..f5181b90e 100644
--- a/Eigen/src/Core/arch/MSA/MathFunctions.h
+++ b/Eigen/src/Core/arch/MSA/MathFunctions.h
@@ -261,7 +261,7 @@ Packet4f psincos_inner_msa_float(const Packet4f& _x) {
// x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
// Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1);
- Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0);
+ Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0); // bclri = bit-clear
y = __builtin_msa_ffint_s_w(y_int2);
// Compute the sign to apply to the polynomial.
@@ -305,7 +305,7 @@ Packet4f psincos_inner_msa_float(const Packet4f& _x) {
// Update the sign.
sign_mask = pxor(sign_mask, (Packet4i)y);
- y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0);
+ y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0); // binsli = bit-insert-left
return y;
}
diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h
index 094c874ee..a97156a84 100644
--- a/Eigen/src/Core/arch/MSA/PacketMath.h
+++ b/Eigen/src/Core/arch/MSA/PacketMath.h
@@ -117,14 +117,14 @@ struct packet_traits<int32_t> : default_packet_traits {
template <>
struct unpacket_traits<Packet4f> {
typedef float type;
- enum { size = 4, alignment = Aligned16 };
+ enum { size = 4, alignment = Aligned16, vectorizable=true };
typedef Packet4f half;
};
template <>
struct unpacket_traits<Packet4i> {
typedef int32_t type;
- enum { size = 4, alignment = Aligned16 };
+ enum { size = 4, alignment = Aligned16, vectorizable=true };
typedef Packet4i half;
};
@@ -925,7 +925,7 @@ struct packet_traits<double> : default_packet_traits {
template <>
struct unpacket_traits<Packet2d> {
typedef double type;
- enum { size = 2, alignment = Aligned16 };
+ enum { size = 2, alignment = Aligned16, vectorizable=true };
typedef Packet2d half;
};
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 306a309be..f6c5c211c 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -62,7 +62,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
@@ -101,6 +101,18 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
return Packet2cf(vaddq_f32(v1, v2));
}
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b)
+{
+ // Compare real and imaginary parts of a and b to get the mask vector:
+ // [re(a[0])==re(b[0]), im(a[0])==im(b[0]), re(a[1])==re(b[1]), im(a[1])==im(b[1])]
+ Packet4f eq = pcmp_eq<Packet4f>(a.v, b.v);
+  // Swap real/imag elements in the mask to get:
+ // [im(a[0])==im(b[0]), re(a[0])==re(b[0]), im(a[1])==im(b[1]), re(a[1])==re(b[1])]
+ Packet4f eq_swapped = vrev64q_f32(eq);
+ // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped
+ return Packet2cf(pand<Packet4f>(eq, eq_swapped));
+}
+
template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
@@ -146,7 +158,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::co
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
- std::complex<float> EIGEN_ALIGN16 x[2];
+ EIGEN_ALIGN16 std::complex<float> x[2];
vst1q_f32((float *)x, a.v);
return x[0];
}
@@ -328,7 +340,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
@@ -361,6 +373,18 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
return Packet1cd(vaddq_f64(v1, v2));
}
+template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b)
+{
+ // Compare real and imaginary parts of a and b to get the mask vector:
+ // [re(a)==re(b), im(a)==im(b)]
+ Packet2d eq = pcmp_eq<Packet2d>(a.v, b.v);
+  // Swap real/imag elements in the mask to get:
+  // [im(a)==im(b), re(a)==re(b)]
+  Packet2d eq_swapped = vreinterpretq_f64_u32(vrev64q_u32(vreinterpretq_u32_f64(eq)));
+  // Return re(a)==re(b) && im(a)==im(b) by computing the bitwise AND of eq and eq_swapped
+ return Packet1cd(pand<Packet2d>(eq, eq_swapped));
+}
+
template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
@@ -401,7 +425,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1c
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
- std::complex<double> EIGEN_ALIGN16 res;
+ EIGEN_ALIGN16 std::complex<double> res;
pstore<std::complex<double> >(&res, a);
return res;
diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h
index c48c61023..2e7d0e944 100644
--- a/Eigen/src/Core/arch/NEON/MathFunctions.h
+++ b/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -5,175 +5,37 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-/* The sin, cos, exp, and log functions of this file come from
- * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
- */
-
#ifndef EIGEN_MATH_FUNCTIONS_NEON_H
#define EIGEN_MATH_FUNCTIONS_NEON_H
+#include "../Default/GenericPacketMathFunctions.h"
+
namespace Eigen {
namespace internal {
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pexp<Packet4f>(const Packet4f& _x)
+Packet4f pexp<Packet4f>(const Packet4f& x)
{
- Packet4f x = _x;
- Packet4f tmp, fx;
-
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
- _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
- _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
-
- x = vminq_f32(x, p4f_exp_hi);
- x = vmaxq_f32(x, p4f_exp_lo);
-
- /* express exp(x) as exp(g + n*log(2)) */
- fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF);
-
- /* perform a floorf */
- tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
-
- /* if greater, substract 1 */
- Packet4ui mask = vcgtq_f32(tmp, fx);
- mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1));
-
- fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
-
- tmp = vmulq_f32(fx, p4f_cephes_exp_C1);
- Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2);
- x = vsubq_f32(x, tmp);
- x = vsubq_f32(x, z);
-
- Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x);
- z = vmulq_f32(x, x);
- y = vaddq_f32(y, p4f_cephes_exp_p1);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_exp_p2);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_exp_p3);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_exp_p4);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_exp_p5);
-
- y = vmulq_f32(y, z);
- y = vaddq_f32(y, x);
- y = vaddq_f32(y, p4f_1);
-
- /* build 2^n */
- int32x4_t mm;
- mm = vcvtq_s32_f32(fx);
- mm = vaddq_s32(mm, p4i_0x7f);
- mm = vshlq_n_s32(mm, 23);
- Packet4f pow2n = vreinterpretq_f32_s32(mm);
-
- y = vmulq_f32(y, pow2n);
- return y;
+ return pexp_float(x);
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f plog<Packet4f>(const Packet4f& _x)
+Packet4f plog<Packet4f>(const Packet4f& x)
{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-
- _EIGEN_DECLARE_CONST_Packet4i(inv_mant_mask, ~0x7f800000);
-
- /* natural logarithm computed for 4 simultaneous float
- return NaN for x <= 0
- */
- _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
-
- x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
- Packet4ui invalid_mask = vcleq_f32(x, vdupq_n_f32(0));
-
- Packet4i ux = vreinterpretq_s32_f32(x);
-
- Packet4i emm0 = vshrq_n_s32(ux, 23);
-
- /* keep only the fractional part */
- ux = vandq_s32(ux, p4i_inv_mant_mask);
- ux = vorrq_s32(ux, vreinterpretq_s32_f32(p4f_half));
- x = vreinterpretq_f32_s32(ux);
-
- emm0 = vsubq_s32(emm0, p4i_0x7f);
- Packet4f e = vcvtq_f32_s32(emm0);
-
- e = vaddq_f32(e, p4f_1);
-
- /* part2:
- if( x < SQRTHF ) {
- e -= 1;
- x = x + x - 1.0;
- } else { x = x - 1.0; }
- */
- Packet4ui mask = vcltq_f32(x, p4f_cephes_SQRTHF);
- Packet4f tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
- x = vsubq_f32(x, p4f_1);
- e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(p4f_1), mask)));
- x = vaddq_f32(x, tmp);
-
- Packet4f z = vmulq_f32(x,x);
-
- Packet4f y = p4f_cephes_log_p0;
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_log_p1);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_log_p2);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_log_p3);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_log_p4);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_log_p5);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_log_p6);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_log_p7);
- y = vmulq_f32(y, x);
- y = vaddq_f32(y, p4f_cephes_log_p8);
- y = vmulq_f32(y, x);
-
- y = vmulq_f32(y, z);
-
- tmp = vmulq_f32(e, p4f_cephes_log_q1);
- y = vaddq_f32(y, tmp);
-
+ return plog_float(x);
+}
- tmp = vmulq_f32(z, p4f_half);
- y = vsubq_f32(y, tmp);
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psin<Packet4f>(const Packet4f& x)
+{
+ return psin_float(x);
+}
- tmp = vmulq_f32(e, p4f_cephes_log_q2);
- x = vaddq_f32(x, y);
- x = vaddq_f32(x, tmp);
- x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
- return x;
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pcos<Packet4f>(const Packet4f& x)
+{
+ return pcos_float(x);
}
} // end namespace internal
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 010739380..e8b351849 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -108,10 +108,11 @@ template<> struct packet_traits<float> : default_packet_traits
size = 4,
HasHalfPacket=0, // Packet2f intrinsics not implemented yet
- HasDiv = 1,
+ HasDiv = 1,
+ HasFloor = 1,
// FIXME check the Has*
- HasSin = 0,
- HasCos = 0,
+ HasSin = EIGEN_FAST_MATH,
+ HasCos = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasSqrt = 0
@@ -139,12 +140,25 @@ EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q
EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
#endif
-template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet4i> { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f>
+{
+ typedef float type;
+ typedef Packet4f half;
+ typedef Packet4i integer_packet;
+ enum {size=4, alignment=Aligned16, vectorizable=true};
+};
+template<> struct unpacket_traits<Packet4i>
+{
+ typedef int32_t type;
+ typedef Packet4i half;
+ enum {size=4, alignment=Aligned16, vectorizable=true};
+};
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) { return vreinterpretq_f32_u32(vdupq_n_u32(from)); }
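+// pset1frombits builds a float packet from a raw bit pattern without value
+// conversion, e.g. pset1frombits<Packet4f>(0x80000000u) yields the sign-bit
+// mask consumed by psincos_float.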
+
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
{
const float f[] = {0, 1, 2, 3};
@@ -249,6 +263,25 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vcleq_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vcltq_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vceqq_f32(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return vreinterpretq_s32_u32(vceqq_s32(a,b)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+ const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+  /* truncate toward zero */
+  Packet4f tmp = vcvtq_f32_s32(vcvtq_s32_f32(a));
+
+  /* if the truncated value is greater, subtract 1 */
+ Packet4ui mask = vcgtq_f32(tmp, a);
+ mask = vandq_u32(mask, vreinterpretq_u32_f32(cst_1));
+ return vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
+}
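+// For example, pfloor(-1.25f): truncation gives -1.0f, which is greater than
+// -1.25f, so the mask fires and 1.0f is subtracted, yielding the expected -2.0f.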
+
// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
{
@@ -274,6 +307,9 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, con
}
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }
+template<int N> EIGEN_STRONG_INLINE Packet4i pshiftright(Packet4i a) { return vshrq_n_s32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4i pshiftleft(Packet4i a) { return vshlq_n_s32(a,N); }
+
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
@@ -339,8 +375,8 @@ template<> EIGEN_STRONG_INLINE void prefetch<float> (const float* addr) { EI
template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
// FIXME only store the 2 first elements ?
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
-template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x[4]; vst1q_f32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int32_t x[4]; vst1q_s32(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
float32x2_t a_lo, a_hi;
@@ -364,6 +400,14 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+ return pfrexp_float(a,exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+ return pldexp_float(a,exponent);
+}
+
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
float32x2_t a_lo, a_hi, sum;
@@ -507,6 +551,13 @@ template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
return vget_lane_s32(max, 0);
}
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+ uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)),
+ vget_high_u32(vreinterpretq_u32_f32(x)));
+ return vget_lane_u32(vpmax_u32(tmp,tmp),0);
+}
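+
+// predux_any treats its input as a bitmask (as produced by the pcmp_*
+// functions above) and returns true if any lane has any bit set.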
+
// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
#define PALIGN_NEON(Offset,Type,Command) \
@@ -606,7 +657,7 @@ template<> struct packet_traits<double> : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; };
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return vdupq_n_f64(from); }
@@ -660,6 +711,8 @@ template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, con
return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
}
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return vreinterpretq_f64_u64(vceqq_f64(a,b)); }
+
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h
index 95d1fd0e4..20dbe1332 100644
--- a/Eigen/src/Core/arch/NEON/TypeCasting.h
+++ b/Eigen/src/Core/arch/NEON/TypeCasting.h
@@ -41,6 +41,14 @@ template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i
return vcvtq_f32_s32(a);
}
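+
+// Unlike pcast above, which converts values (via vcvtq), preinterpret is a
+// pure bit-level reinterpretation of the register contents.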
+template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
+ return vreinterpretq_s32_f32(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
+ return vreinterpretq_f32_s32(a);
+}
+
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index d075043ce..f39988eac 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -50,7 +50,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
};
#endif
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
@@ -82,10 +82,13 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
#endif
}
+template<> EIGEN_STRONG_INLINE Packet2cf ptrue <Packet2cf>(const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnot <Packet2cf>(const Packet2cf& a) { return Packet2cf(pnot(Packet4f(a.v))); }
+
template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); }
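+// Note: _mm_andnot_ps(x,y) computes (~x) & y, so the operands are swapped
+// below to implement pandnot(a,b) = a & ~b.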
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(b.v,a.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from))); }
template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from))); }
@@ -280,7 +283,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
};
#endif
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
@@ -305,10 +308,12 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
#endif
}
+template<> EIGEN_STRONG_INLINE Packet1cd ptrue <Packet1cd>(const Packet1cd& a) { return Packet1cd(ptrue(Packet2d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnot <Packet1cd>(const Packet1cd& a) { return Packet1cd(pnot(Packet2d(a.v))); }
template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(b.v,a.v)); }
// FIXME force unaligned load, this is a temporary fix
template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from)
@@ -439,6 +444,18 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
kernel.packet[1].v = tmp;
}
+template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b)
+{
+ __m128 eq = _mm_cmpeq_ps(a.v, b.v);
+ return Packet2cf(pand<Packet4f>(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b)
+{
+ __m128d eq = _mm_cmpeq_pd(a.v, b.v);
+ return Packet1cd(pand<Packet2d>(eq, vec2d_swizzle1(eq, 1, 0)));
+}
+
template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
__m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
return Packet2cf(_mm_castpd_ps(result));
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 4af2c6cae..0d491ab88 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -8,13 +8,15 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-/* The sin, cos, exp, and log functions of this file come from
+/* The sin and cos functions of this file come from
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
*/
#ifndef EIGEN_MATH_FUNCTIONS_SSE_H
#define EIGEN_MATH_FUNCTIONS_SSE_H
+#include "../Default/GenericPacketMathFunctions.h"
+
namespace Eigen {
namespace internal {
@@ -22,424 +24,31 @@ namespace internal {
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f plog<Packet4f>(const Packet4f& _x)
{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
- /* the smallest non denormalized float number */
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000);//-1.f/0.f);
-
- /* natural logarithm computed for 4 simultaneous float
- return NaN for x <= 0
- */
- _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
-
-
- Packet4i emm0;
-
- Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); // not greater equal is true if x is NaN
- Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
-
- x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */
- emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
-
- /* keep only the fractional part */
- x = _mm_and_ps(x, p4f_inv_mant_mask);
- x = _mm_or_ps(x, p4f_half);
-
- emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
- Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1);
-
- /* part2:
- if( x < SQRTHF ) {
- e -= 1;
- x = x + x - 1.0;
- } else { x = x - 1.0; }
- */
- Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
- Packet4f tmp = pand(x, mask);
- x = psub(x, p4f_1);
- e = psub(e, pand(p4f_1, mask));
- x = padd(x, tmp);
-
- Packet4f x2 = pmul(x,x);
- Packet4f x3 = pmul(x2,x);
-
- Packet4f y, y1, y2;
- y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
- y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
- y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
- y = pmadd(y , x, p4f_cephes_log_p2);
- y1 = pmadd(y1, x, p4f_cephes_log_p5);
- y2 = pmadd(y2, x, p4f_cephes_log_p8);
- y = pmadd(y, x3, y1);
- y = pmadd(y, x3, y2);
- y = pmul(y, x3);
-
- y1 = pmul(e, p4f_cephes_log_q1);
- tmp = pmul(x2, p4f_half);
- y = padd(y, y1);
- x = psub(x, tmp);
- y2 = pmul(e, p4f_cephes_log_q2);
- x = padd(x, y);
- x = padd(x, y2);
- // negative arg will be NAN, 0 will be -INF
- return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)),
- _mm_and_ps(iszero_mask, p4f_minus_inf));
+ return plog_float(_x);
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f pexp<Packet4f>(const Packet4f& _x)
{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
- _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-
-
- _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
- _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
-
- _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
-
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
-
- Packet4f tmp, fx;
- Packet4i emm0;
-
- // clamp x
- x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
-
- /* express exp(x) as exp(g + n*log(2)) */
- fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
-
-#ifdef EIGEN_VECTORIZE_SSE4_1
- fx = _mm_floor_ps(fx);
-#else
- emm0 = _mm_cvttps_epi32(fx);
- tmp = _mm_cvtepi32_ps(emm0);
- /* if greater, substract 1 */
- Packet4f mask = _mm_cmpgt_ps(tmp, fx);
- mask = _mm_and_ps(mask, p4f_1);
- fx = psub(tmp, mask);
-#endif
-
- tmp = pmul(fx, p4f_cephes_exp_C1);
- Packet4f z = pmul(fx, p4f_cephes_exp_C2);
- x = psub(x, tmp);
- x = psub(x, z);
-
- z = pmul(x,x);
-
- Packet4f y = p4f_cephes_exp_p0;
- y = pmadd(y, x, p4f_cephes_exp_p1);
- y = pmadd(y, x, p4f_cephes_exp_p2);
- y = pmadd(y, x, p4f_cephes_exp_p3);
- y = pmadd(y, x, p4f_cephes_exp_p4);
- y = pmadd(y, x, p4f_cephes_exp_p5);
- y = pmadd(y, z, x);
- y = padd(y, p4f_1);
-
- // build 2^n
- emm0 = _mm_cvttps_epi32(fx);
- emm0 = _mm_add_epi32(emm0, p4i_0x7f);
- emm0 = _mm_slli_epi32(emm0, 23);
- return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x);
+ return pexp_float(_x);
}
+
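
Same deduplication for pexp: the removed float kernel is the classic Cephes expf, now provided generically by pexp_float. A scalar sketch built from the constants in the removed lines above (the SIMD version additionally returns pmax(result, x), which also covers the clamp boundary):

```cpp
#include <cmath>
#include <cstdint>
#include <cstring>

float cephes_expf_sketch(float x) {
  x = std::fmin(std::fmax(x, -88.3762626647949f), 88.3762626647950f);  // clamp
  float n = std::floor(x * 1.44269504088896341f + 0.5f);  // round(x * log2(e))
  // extended-precision reduction: ln(2) split as C1 + C2
  float r = x - n * 0.693359375f - n * (-2.12194440e-4f);
  float r2 = r * r;
  float p =   1.9875691500e-4f;           // cephes_exp_p0 .. p5
  p = p * r + 1.3981999507e-3f;
  p = p * r + 8.3334519073e-3f;
  p = p * r + 4.1665795894e-2f;
  p = p * r + 1.6666665459e-1f;
  p = p * r + 5.0000001201e-1f;
  float y = p * r2 + r + 1.0f;            // exp(r) ~ 1 + r + r^2 * P(r)
  uint32_t bits = uint32_t(int32_t(n) + 0x7f) << 23;  // 2^n via the exponent field
  float two_n; std::memcpy(&two_n, &bits, sizeof two_n);
  return y * two_n;
}
```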
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet2d pexp<Packet2d>(const Packet2d& _x)
+Packet2d pexp<Packet2d>(const Packet2d& x)
{
- Packet2d x = _x;
-
- _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
- _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
- _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-
- _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437);
- _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
- _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
- static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
-
- Packet2d tmp, fx;
- Packet4i emm0;
-
- // clamp x
- x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
- /* express exp(x) as exp(g + n*log(2)) */
- fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
-
-#ifdef EIGEN_VECTORIZE_SSE4_1
- fx = _mm_floor_pd(fx);
-#else
- emm0 = _mm_cvttpd_epi32(fx);
- tmp = _mm_cvtepi32_pd(emm0);
- /* if greater, substract 1 */
- Packet2d mask = _mm_cmpgt_pd(tmp, fx);
- mask = _mm_and_pd(mask, p2d_1);
- fx = psub(tmp, mask);
-#endif
-
- tmp = pmul(fx, p2d_cephes_exp_C1);
- Packet2d z = pmul(fx, p2d_cephes_exp_C2);
- x = psub(x, tmp);
- x = psub(x, z);
-
- Packet2d x2 = pmul(x,x);
-
- Packet2d px = p2d_cephes_exp_p0;
- px = pmadd(px, x2, p2d_cephes_exp_p1);
- px = pmadd(px, x2, p2d_cephes_exp_p2);
- px = pmul (px, x);
-
- Packet2d qx = p2d_cephes_exp_q0;
- qx = pmadd(qx, x2, p2d_cephes_exp_q1);
- qx = pmadd(qx, x2, p2d_cephes_exp_q2);
- qx = pmadd(qx, x2, p2d_cephes_exp_q3);
-
- x = pdiv(px,psub(qx,px));
- x = pmadd(p2d_2,x,p2d_1);
-
- // build 2^n
- emm0 = _mm_cvttpd_epi32(fx);
- emm0 = _mm_add_epi32(emm0, p4i_1023_0);
- emm0 = _mm_slli_epi32(emm0, 20);
- emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
- return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
+ return pexp_double(x);
}
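
The double-precision kernel (now pexp_double) reduces the argument the same way but approximates exp(r) with a Pade-style rational form instead of a plain polynomial. A scalar sketch of just that core, with the coefficients from the removed lines, valid for |r| <= ln(2)/2:

```cpp
double pade_exp_core_sketch(double r) {
  double r2 = r * r;
  double P = ((1.26177193074810590878e-4 * r2      // cephes_exp_p0 .. p2
             + 3.02994407707441961300e-2) * r2
             + 9.99999999999999999910e-1) * r;     // P(r) = r * p(r^2), odd
  double Q = (((3.00198505138664455042e-6 * r2     // cephes_exp_q0 .. q3
             + 2.52448340349684104192e-3) * r2
             + 2.27265548208155028766e-1) * r2
             + 2.00000000000000000009e0);          // Q(r) = q(r^2), even
  return 1.0 + 2.0 * P / (Q - P);                  // ~ exp(r)
}
```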
-/* evaluation of 4 sines at once, using SSE2 intrinsics.
-
- The code is the exact rewriting of the cephes sinf function.
- Precision is excellent as long as x < 8192 (I did not bother to
- take into account the special handling they have for greater values
- -- it does not return garbage for arguments over 8192, though, but
- the extra precision is missing).
-
- Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
- surprising but correct result.
-*/
-
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f psin<Packet4f>(const Packet4f& _x)
{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-
- _EIGEN_DECLARE_CONST_Packet4i(1, 1);
- _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
- _EIGEN_DECLARE_CONST_Packet4i(2, 2);
- _EIGEN_DECLARE_CONST_Packet4i(4, 4);
-
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
-
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
-
- Packet4f xmm1, xmm2, xmm3, sign_bit, y;
-
- Packet4i emm0, emm2;
- sign_bit = x;
- /* take the absolute value */
- x = pabs(x);
-
- /* take the modulo */
-
- /* extract the sign bit (upper one) */
- sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask);
-
- /* scale by 4/Pi */
- y = pmul(x, p4f_cephes_FOPI);
-
- /* store the integer part of y in mm0 */
- emm2 = _mm_cvttps_epi32(y);
- /* j=(j+1) & (~1) (see the cephes sources) */
- emm2 = _mm_add_epi32(emm2, p4i_1);
- emm2 = _mm_and_si128(emm2, p4i_not1);
- y = _mm_cvtepi32_ps(emm2);
- /* get the swap sign flag */
- emm0 = _mm_and_si128(emm2, p4i_4);
- emm0 = _mm_slli_epi32(emm0, 29);
- /* get the polynom selection mask
- there is one polynom for 0 <= x <= Pi/4
- and another one for Pi/4<x<=Pi/2
-
- Both branches will be computed.
- */
- emm2 = _mm_and_si128(emm2, p4i_2);
- emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-
- Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
- Packet4f poly_mask = _mm_castsi128_ps(emm2);
- sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
-
- /* The magic pass: "Extended precision modular arithmetic"
- x = ((x - y * DP1) - y * DP2) - y * DP3; */
- xmm1 = pmul(y, p4f_minus_cephes_DP1);
- xmm2 = pmul(y, p4f_minus_cephes_DP2);
- xmm3 = pmul(y, p4f_minus_cephes_DP3);
- x = padd(x, xmm1);
- x = padd(x, xmm2);
- x = padd(x, xmm3);
-
- /* Evaluate the first polynom (0 <= x <= Pi/4) */
- y = p4f_coscof_p0;
- Packet4f z = _mm_mul_ps(x,x);
-
- y = pmadd(y, z, p4f_coscof_p1);
- y = pmadd(y, z, p4f_coscof_p2);
- y = pmul(y, z);
- y = pmul(y, z);
- Packet4f tmp = pmul(z, p4f_half);
- y = psub(y, tmp);
- y = padd(y, p4f_1);
-
- /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-
- Packet4f y2 = p4f_sincof_p0;
- y2 = pmadd(y2, z, p4f_sincof_p1);
- y2 = pmadd(y2, z, p4f_sincof_p2);
- y2 = pmul(y2, z);
- y2 = pmul(y2, x);
- y2 = padd(y2, x);
-
- /* select the correct result from the two polynoms */
- y2 = _mm_and_ps(poly_mask, y2);
- y = _mm_andnot_ps(poly_mask, y);
- y = _mm_or_ps(y,y2);
- /* update the sign */
- return _mm_xor_ps(y, sign_bit);
+ return psin_float(_x);
}
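
psin (and pcos just below, which differs only in its octant bookkeeping: it subtracts 2 from j and derives the sign with an andnot) now forwards to the shared psin_float/pcos_float. A scalar sketch of the removed octant-reduction scheme, accurate for |x| up to a few thousand:

```cpp
#include <cmath>
#include <cstdint>

float cephes_sinf_sketch(float x) {
  float sign = (x < 0.0f) ? -1.0f : 1.0f;
  x = std::fabs(x);
  int32_t j = int32_t(x * 1.27323954473516f);    // x * 4/pi
  j = (j + 1) & ~1;                              // round j up to even
  float y = float(j);
  if (j & 4) sign = -sign;                       // octants 4..7 flip the sign
  // "Extended precision modular arithmetic": pi/4 split into DP1+DP2+DP3
  x = ((x - y * 0.78515625f)
         - y * 2.4187564849853515625e-4f)
         - y * 3.77489497744594108e-8f;
  float z = x * x, r;
  if (j & 2) {                                   // cosine polynomial branch
    r = ((2.443315711809948e-5f * z
        - 1.388731625493765e-3f) * z
        + 4.166664568298827e-2f) * z * z
        - 0.5f * z + 1.0f;
  } else {                                       // sine polynomial branch
    r = ((-1.9515295891e-4f * z
        + 8.3321608736e-3f) * z
        - 1.6666654611e-1f) * z * x + x;
  }
  return sign * r;
}
```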
-/* almost the same as psin */
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f pcos<Packet4f>(const Packet4f& _x)
{
- Packet4f x = _x;
- _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
- _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-
- _EIGEN_DECLARE_CONST_Packet4i(1, 1);
- _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
- _EIGEN_DECLARE_CONST_Packet4i(2, 2);
- _EIGEN_DECLARE_CONST_Packet4i(4, 4);
-
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
- _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f);
- _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
- _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f);
- _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
-
- Packet4f xmm1, xmm2, xmm3, y;
- Packet4i emm0, emm2;
-
- x = pabs(x);
-
- /* scale by 4/Pi */
- y = pmul(x, p4f_cephes_FOPI);
-
- /* get the integer part of y */
- emm2 = _mm_cvttps_epi32(y);
- /* j=(j+1) & (~1) (see the cephes sources) */
- emm2 = _mm_add_epi32(emm2, p4i_1);
- emm2 = _mm_and_si128(emm2, p4i_not1);
- y = _mm_cvtepi32_ps(emm2);
-
- emm2 = _mm_sub_epi32(emm2, p4i_2);
-
- /* get the swap sign flag */
- emm0 = _mm_andnot_si128(emm2, p4i_4);
- emm0 = _mm_slli_epi32(emm0, 29);
- /* get the polynom selection mask */
- emm2 = _mm_and_si128(emm2, p4i_2);
- emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-
- Packet4f sign_bit = _mm_castsi128_ps(emm0);
- Packet4f poly_mask = _mm_castsi128_ps(emm2);
-
- /* The magic pass: "Extended precision modular arithmetic"
- x = ((x - y * DP1) - y * DP2) - y * DP3; */
- xmm1 = pmul(y, p4f_minus_cephes_DP1);
- xmm2 = pmul(y, p4f_minus_cephes_DP2);
- xmm3 = pmul(y, p4f_minus_cephes_DP3);
- x = padd(x, xmm1);
- x = padd(x, xmm2);
- x = padd(x, xmm3);
-
- /* Evaluate the first polynom (0 <= x <= Pi/4) */
- y = p4f_coscof_p0;
- Packet4f z = pmul(x,x);
-
- y = pmadd(y,z,p4f_coscof_p1);
- y = pmadd(y,z,p4f_coscof_p2);
- y = pmul(y, z);
- y = pmul(y, z);
- Packet4f tmp = _mm_mul_ps(z, p4f_half);
- y = psub(y, tmp);
- y = padd(y, p4f_1);
-
- /* Evaluate the second polynom (Pi/4 <= x <= 0) */
- Packet4f y2 = p4f_sincof_p0;
- y2 = pmadd(y2, z, p4f_sincof_p1);
- y2 = pmadd(y2, z, p4f_sincof_p2);
- y2 = pmul(y2, z);
- y2 = pmadd(y2, x, x);
-
- /* select the correct result from the two polynoms */
- y2 = _mm_and_ps(poly_mask, y2);
- y = _mm_andnot_ps(poly_mask, y);
- y = _mm_or_ps(y,y2);
-
- /* update the sign */
- return _mm_xor_ps(y, sign_bit);
+ return pcos_float(_x);
}
#if EIGEN_FAST_MATH
@@ -482,11 +91,11 @@ Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000);
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000);
+ _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u);
+ _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000u);
_EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
_EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
- _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000);
+ _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000u);
Packet4f neg_half = pmul(_x, p4f_minus_half);
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 99d55d5e9..9c3750af0 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -18,11 +18,13 @@ namespace internal {
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
-#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
+// 32 bits => 8 registers
+// 64 bits => 16 registers
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
#endif
@@ -61,20 +63,22 @@ template<> struct is_arithmetic<__m128> { enum { value = true }; };
template<> struct is_arithmetic<__m128i> { enum { value = true }; };
template<> struct is_arithmetic<__m128d> { enum { value = true }; };
+#define EIGEN_SSE_SHUFFLE_MASK(p,q,r,s) ((s)<<6|(r)<<4|(q)<<2|(p))
+
#define vec4f_swizzle1(v,p,q,r,s) \
- (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))
+ (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s))))
#define vec4i_swizzle1(v,p,q,r,s) \
- (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))
+ (_mm_shuffle_epi32( v, EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))
#define vec2d_swizzle1(v,p,q) \
- (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
+ (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), EIGEN_SSE_SHUFFLE_MASK(2*p,2*p+1,2*q,2*q+1))))
#define vec4f_swizzle2(a,b,p,q,r,s) \
- (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
+ (_mm_shuffle_ps( (a), (b), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))
#define vec4i_swizzle2(a,b,p,q,r,s) \
- (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
+ (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))))
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
const Packet4f p4f_##NAME = pset1<Packet4f>(X)
@@ -83,7 +87,7 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; };
const Packet2d p2d_##NAME = pset1<Packet2d>(X)
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
- const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))
+ const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
@@ -110,12 +114,12 @@ template<> struct packet_traits<float> : default_packet_traits
HasSqrt = 1,
HasRsqrt = 1,
HasTanh = EIGEN_FAST_MATH,
- HasBlend = 1
+ HasBlend = 1,
+ HasFloor = 1
#ifdef EIGEN_VECTORIZE_SSE4_1
,
HasRound = 1,
- HasFloor = 1,
HasCeil = 1
#endif
};
@@ -158,9 +162,22 @@ template<> struct packet_traits<int> : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
-template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f> {
+ typedef float type;
+ typedef Packet4f half;
+ typedef Packet4i integer_packet;
+ enum {size=4, alignment=Aligned16, vectorizable=true};
+};
+template<> struct unpacket_traits<Packet2d> {
+ typedef double type;
+ typedef Packet2d half;
+ enum {size=2, alignment=Aligned16, vectorizable=true};
+};
+template<> struct unpacket_traits<Packet4i> {
+ typedef int type;
+ typedef Packet4i half;
+ enum {size=4, alignment=Aligned16, vectorizable=false};
+};
#ifndef EIGEN_VECTORIZE_AVX
template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
@@ -180,6 +197,12 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { re
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
#endif
+template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) { return _mm_castsi128_ps(pset1<Packet4i>(from)); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); }
+template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); }
+template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); }
+
// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
// However, using intrinsics for pset1 makes gcc generate crappy code in some cases (see bug 203)
// Using inline assembly is also not an option because then gcc fails to properly reorder the instructions.
@@ -245,19 +268,24 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const
// for some weird reasons, it has to be overloaded for packets of integers
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
#endif
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
-#if EIGEN_COMP_GNUC
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_min_ps, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+ #ifdef EIGEN_VECTORIZE_AVX
+ Packet4f res;
+ asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ #else
Packet4f res = b;
asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+ #endif
return res;
#else
// Arguments are reversed to match NaN propagation behavior of std::min.
@@ -265,13 +293,18 @@ template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
-#if EIGEN_COMP_GNUC
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_min_pd, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+ #ifdef EIGEN_VECTORIZE_AVX
+ Packet2d res;
+ asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ #else
Packet2d res = b;
asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+ #endif
return res;
#else
// Arguments are reversed to match NaN propagation behavior of std::min.
@@ -290,13 +323,18 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const
}
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
-#if EIGEN_COMP_GNUC
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_max_ps, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+ #ifdef EIGEN_VECTORIZE_AVX
+ Packet4f res;
+ asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ #else
Packet4f res = b;
asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+ #endif
return res;
#else
// Arguments are reversed to match NaN propagation behavior of std::max.
@@ -304,13 +342,18 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
-#if EIGEN_COMP_GNUC
+#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_max_pd, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+ #ifdef EIGEN_VECTORIZE_AVX
+ Packet2d res;
+ asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ #else
Packet2d res = b;
asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+ #endif
return res;
#else
// Arguments are reversed to match NaN propagation behavior of std::max.
@@ -328,16 +371,24 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const
#endif
}
-#ifdef EIGEN_VECTORIZE_SSE4_1
-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
-template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
-#endif
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); }
+template<> EIGEN_STRONG_INLINE Packet4f
+ptrue<Packet4f>(const Packet4f& a) {
+ Packet4i b = _mm_castps_si128(a);
+ return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
+}
+template<> EIGEN_STRONG_INLINE Packet2d
+ptrue<Packet2d>(const Packet2d& a) {
+ Packet4i b = _mm_castpd_si128(a);
+ return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
+}
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
@@ -351,9 +402,47 @@ template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
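
Note the semantic fix hidden in these three lines: SSE's andnot intrinsics compute ~first & second, so the old mapping made pandnot(a,b) return ~a & b instead of the documented a & ~b. A minimal SSE2 check of the corrected argument order:

```cpp
#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  __m128i a = _mm_set1_epi32(int32_t(0xF0F0F0F0u));
  __m128i b = _mm_set1_epi32(int32_t(0xFF00FF00u));
  __m128i r = _mm_andnot_si128(b, a);   // swapped operands: computes a & ~b
  int32_t out[4];
  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), r);
  std::printf("%08x\n", uint32_t(out[0]));  // prints 00f000f0
  return 0;
}
```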
+
+template<int N> EIGEN_STRONG_INLINE Packet4i pshiftright(Packet4i a) { return _mm_srli_epi32(a,N); }
+template<int N> EIGEN_STRONG_INLINE Packet4i pshiftleft(Packet4i a) { return _mm_slli_epi32(a,N); }
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
+#else
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+ const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+ Packet4i emm0 = _mm_cvttps_epi32(a);
+ Packet4f tmp = _mm_cvtepi32_ps(emm0);
+  /* if greater, subtract 1 */
+ Packet4f mask = _mm_cmpgt_ps(tmp, a);
+ mask = pand(mask, cst_1);
+ return psub(tmp, mask);
+}
+
+// WARNING: this pfloor implementation makes sense for small inputs only;
+// it is currently only used by pexp and not exposed through HasFloor.
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
+{
+ const Packet2d cst_1 = pset1<Packet2d>(1.0);
+ Packet4i emm0 = _mm_cvttpd_epi32(a);
+ Packet2d tmp = _mm_cvtepi32_pd(emm0);
+  /* if greater, subtract 1 */
+ Packet2d mask = _mm_cmpgt_pd(tmp, a);
+ mask = pand(mask, cst_1);
+ return psub(tmp, mask);
+}
+#endif
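
A scalar model of the truncate-and-correct trick used by the fallback above. Like the SIMD version it is only valid while the input fits in an int32, which is why the Packet2d variant stays private to pexp instead of advertising HasFloor:

```cpp
#include <cstdint>

float floor_via_trunc(float a) {       // assumes |a| < 2^31
  float t = float(int32_t(a));         // _mm_cvttps_epi32 + _mm_cvtepi32_ps
  return (t > a) ? t - 1.0f : t;       // mask = cmpgt(t, a); psub(t, mask & 1.0f)
}
```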
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
@@ -517,6 +606,23 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
#endif
}
+template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+ return pfrexp_float(a,exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+ return pldexp_float(a,exponent);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+ const Packet4i cst_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
+ Packet4i emm0 = _mm_cvttpd_epi32(exponent);
+ emm0 = padd(emm0, cst_1023_0);
+ emm0 = _mm_slli_epi32(emm0, 20);
+ emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
+ return pmul(a, Packet2d(_mm_castsi128_pd(emm0)));
+}
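
A scalar model of the bit trick in pldexp<Packet2d>: 2^e is built by writing the biased exponent e+1023 into bits 62..52 of a double; the epi32 shift-by-20 plus dword shuffle in the SIMD code achieves the same placement across both lanes. The sketch assumes e stays within the normal exponent range:

```cpp
#include <cstdint>
#include <cstring>

double ldexp_via_bits(double a, int e) {               // assumes -1022 <= e <= 1023
  uint64_t bits = uint64_t(uint32_t(e + 1023)) << 52;  // biased exponent field
  double two_e; std::memcpy(&two_e, &bits, sizeof two_e);
  return a * two_e;
}
```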
+
// with AVX, the default implementations based on pload1 are faster
#ifndef __AVX__
template<> EIGEN_STRONG_INLINE void
@@ -718,6 +824,17 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
#endif // EIGEN_VECTORIZE_SSE4_1
}
+// not needed yet
+// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
+// {
+// return _mm_movemask_ps(x) == 0xF;
+// }
+
+template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+{
+ return _mm_movemask_ps(x) != 0x0;
+}
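
A hedged sketch of what the new predux_any enables (together with the ptrue/pcmp_eq/pandnot primitives above): cheap early exits over SIMD predicates. has_nan is a hypothetical helper, not part of this patch, and n is assumed to be a multiple of 4:

```cpp
#include <Eigen/Core>

bool has_nan(const float* data, int n) {
  using namespace Eigen::internal;
  for (int i = 0; i < n; i += 4) {
    Packet4f v  = ploadu<Packet4f>(data + i);
    Packet4f eq = pcmp_eq(v, v);               // all-ones except in NaN lanes
    if (predux_any(pandnot(ptrue(v), eq)))     // any lane where v != v?
      return true;
  }
  return false;
}
```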
+
#if EIGEN_COMP_GNUC
// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
// {
@@ -921,7 +1038,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b)
}
// Scalar path for pmadd with FMA to ensure consistency with vectorized path.
-#ifdef __FMA__
+#ifdef EIGEN_VECTORIZE_FMA
template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
return ::fmaf(a,b,c);
}
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
index c6ca8c716..f607366f0 100644
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -69,6 +69,13 @@ template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f
return _mm_cvtps_pd(a);
}
+template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
+ return _mm_castps_si128(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
+ return _mm_castsi128_ps(a);
+}
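
preinterpret is a bit-level cast between packets of the same width, unlike pcast which converts values. The scalar analogue of the distinction:

```cpp
#include <cstdint>
#include <cstring>

int32_t reinterpret_bits(float x) {  // ~ preinterpret<Packet4i,Packet4f>
  int32_t i; std::memcpy(&i, &x, sizeof i);
  return i;                          // 1.0f -> 0x3f800000
}
int32_t convert_value(float x) {     // ~ pcast<Packet4f,Packet4i>
  return int32_t(x);                 // 1.0f -> 1
}
```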
} // end namespace internal
diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
index c1da40d14..294cb101a 100644
--- a/Eigen/src/Core/arch/SYCL/InteropHeaders.h
+++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
@@ -88,7 +88,7 @@ SYCL_ARITHMETIC(cl::sycl::cl_double2)
#define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)\
template<> struct unpacket_traits<packet_type> {\
typedef unpacket_type type;\
- enum {size=lengths, alignment=Aligned16};\
+ enum {size=lengths, alignment=Aligned16, vectorizable=true};\
typedef packet_type half;\
};
SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)
diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h
index 95aba428f..167c3ee4c 100644
--- a/Eigen/src/Core/arch/ZVector/Complex.h
+++ b/Eigen/src/Core/arch/ZVector/Complex.h
@@ -91,8 +91,8 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; };
/* Forward declaration */
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel);
diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h
index 0b37f4992..c8e90f1a8 100755
--- a/Eigen/src/Core/arch/ZVector/PacketMath.h
+++ b/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -239,9 +239,9 @@ template<> struct packet_traits<double> : default_packet_traits
};
};
-template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
-template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; };
/* Forward declaration */
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);
diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h
index 9765cc763..bf64ef4ed 100644
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -157,7 +157,16 @@ template<typename Scalar>
struct functor_traits<swap_assign_op<Scalar> > {
enum {
Cost = 3 * NumTraits<Scalar>::ReadCost,
- PacketAccess = packet_traits<Scalar>::Vectorizable
+ PacketAccess =
+ #if defined(EIGEN_VECTORIZE_AVX) && EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<800 || defined(__apple_build_version__))
+      // This is a partial workaround for a bug in clang that generates bad
+      // code when mixing 256/512-bit loads with 128-bit moves.
+ // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1684
+ // https://bugs.llvm.org/show_bug.cgi?id=40815
+ 0
+ #else
+ packet_traits<Scalar>::Vectorizable
+ #endif
};
};
diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h
index b03be0269..16b645f91 100644
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -37,26 +37,27 @@ template<typename Scalar>
struct functor_traits<scalar_identity_op<Scalar> >
{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
-template <typename Scalar, typename Packet, bool IsInteger> struct linspaced_op_impl;
+template <typename Scalar, bool IsInteger> struct linspaced_op_impl;
-template <typename Scalar, typename Packet>
-struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
+template <typename Scalar>
+struct linspaced_op_impl<Scalar,/*IsInteger*/false>
{
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+
linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
- m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
+ m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/RealScalar(num_steps-1)),
m_flip(numext::abs(high)<numext::abs(low))
{}
template<typename IndexType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const {
- typedef typename NumTraits<Scalar>::Real RealScalar;
if(m_flip)
return (i==0)? m_low : (m_high - RealScalar(m_size1-i)*m_step);
else
return (i==m_size1)? m_high : (m_low + RealScalar(i)*m_step);
}
- template<typename IndexType>
+ template<typename Packet, typename IndexType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const
{
// Principle:
@@ -86,8 +87,8 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
const bool m_flip;
};
-template <typename Scalar, typename Packet>
-struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/true>
+template <typename Scalar>
+struct linspaced_op_impl<Scalar,/*IsInteger*/true>
{
linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
m_low(low),
@@ -115,8 +116,8 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/true>
// Forward declaration (we default to random access which does not really give
// us a speed gain when using packet access, but it allows using the functor in
// nested expressions).
-template <typename Scalar, typename PacketType> struct linspaced_op;
-template <typename Scalar, typename PacketType> struct functor_traits< linspaced_op<Scalar,PacketType> >
+template <typename Scalar> struct linspaced_op;
+template <typename Scalar> struct functor_traits< linspaced_op<Scalar> >
{
enum
{
@@ -126,7 +127,7 @@ template <typename Scalar, typename PacketType> struct functor_traits< linspaced
IsRepeatable = true
};
};
-template <typename Scalar, typename PacketType> struct linspaced_op
+template <typename Scalar> struct linspaced_op
{
linspaced_op(const Scalar& low, const Scalar& high, Index num_steps)
: impl((num_steps==1 ? high : low),high,num_steps)
@@ -136,11 +137,11 @@ template <typename Scalar, typename PacketType> struct linspaced_op
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { return impl(i); }
template<typename Packet,typename IndexType>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.packetOp(i); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.template packetOp<Packet>(i); }
// This proxy object handles the actual required temporaries and the different
// implementations (integer vs. floating point).
- const linspaced_op_impl<Scalar,PacketType,NumTraits<Scalar>::IsInteger> impl;
+ const linspaced_op_impl<Scalar,NumTraits<Scalar>::IsInteger> impl;
};
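
A scalar model of the non-integer linspaced evaluation above: steps are applied relative to the endpoint with the smaller magnitude (m_flip) so that rounding error stays small and both bounds are hit exactly. linspaced_at is a hypothetical stand-alone helper:

```cpp
#include <cmath>

double linspaced_at(double low, double high, int num_steps, int i) {
  int    size1 = (num_steps == 1) ? 1 : num_steps - 1;
  double step  = (num_steps == 1) ? 0.0 : (high - low) / double(size1);
  bool   flip  = std::fabs(high) < std::fabs(low);
  if (flip) return (i == 0)     ? low  : high - double(size1 - i) * step;
  else      return (i == size1) ? high : low  + double(i) * step;
}
```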
// Linear access is automatically determined from the operator() prototypes available for the given functor.
@@ -166,12 +167,12 @@ struct has_unary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value =
template<typename Scalar,typename IndexType>
struct has_binary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value = 1}; };
-template<typename Scalar, typename PacketType,typename IndexType>
-struct has_nullary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 0}; };
-template<typename Scalar, typename PacketType,typename IndexType>
-struct has_unary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 1}; };
-template<typename Scalar, typename PacketType,typename IndexType>
-struct has_binary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_nullary_operator<linspaced_op<Scalar>,IndexType> { enum { value = 0}; };
+template<typename Scalar,typename IndexType>
+struct has_unary_operator<linspaced_op<Scalar>,IndexType> { enum { value = 1}; };
+template<typename Scalar,typename IndexType>
+struct has_binary_operator<linspaced_op<Scalar>,IndexType> { enum { value = 0}; };
template<typename Scalar,typename IndexType>
struct has_nullary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 1}; };
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index 0c2d2cfca..03f167ac9 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -117,7 +117,15 @@ template<typename Scalar>
struct functor_traits<scalar_conjugate_op<Scalar> >
{
enum {
- Cost = NumTraits<Scalar>::IsComplex ? NumTraits<Scalar>::AddCost : 0,
+ Cost = 0,
+      // Yes, the cost is zero even for complexes because in most cases where
+      // the cost is used, conjugation turns out to be a no-op. Some examples:
+      //   cost(a*conj(b)) == cost(a*b)
+      //   cost(a+conj(b)) == cost(a+b)
+      //   etc.
+      // If we don't set it to zero, then:
+      //   A.conjugate().lazyProduct(B.conjugate())
+      // would eagerly evaluate ("bake") its operands. We definitely don't want that!
PacketAccess = packet_traits<Scalar>::HasConj
};
};
@@ -548,6 +556,23 @@ struct functor_traits<scalar_tanh_op<Scalar> > {
};
};
+#if EIGEN_HAS_CXX11_MATH
+/** \internal
+ * \brief Template functor to compute the atanh of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::atanh()
+ */
+template <typename Scalar>
+struct scalar_atanh_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_atanh_op<Scalar> > {
+ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+#endif
+
/** \internal
* \brief Template functor to compute the sinh of a scalar
* \sa class CwiseUnaryOp, ArrayBase::sinh()
@@ -567,6 +592,23 @@ struct functor_traits<scalar_sinh_op<Scalar> >
};
};
+#if EIGEN_HAS_CXX11_MATH
+/** \internal
+ * \brief Template functor to compute the asinh of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::asinh()
+ */
+template <typename Scalar>
+struct scalar_asinh_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::asinh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_asinh_op<Scalar> > {
+ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+#endif
+
/** \internal
* \brief Template functor to compute the cosh of a scalar
* \sa class CwiseUnaryOp, ArrayBase::cosh()
@@ -586,6 +628,23 @@ struct functor_traits<scalar_cosh_op<Scalar> >
};
};
+#if EIGEN_HAS_CXX11_MATH
+/** \internal
+ * \brief Template functor to compute the acosh of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::acosh()
+ */
+template <typename Scalar>
+struct scalar_acosh_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op)
+ EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::acosh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_acosh_op<Scalar> > {
+ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+#endif
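
These three functors back the ArrayBase::atanh/asinh/acosh entry points referenced in their doc comments. A minimal usage sketch, assuming those Array methods are wired up elsewhere in this changeset (GlobalFunctions.h is touched in the diffstat) and that the compiler provides C++11 math:

```cpp
#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::ArrayXf a = Eigen::ArrayXf::LinSpaced(3, 0.1f, 0.5f);
  std::cout << a.atanh() << "\n"           // elementwise numext::atanh
            << a.asinh() << "\n"
            << (a + 1.0f).acosh() << "\n"; // acosh needs arguments >= 1
  return 0;
}
```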
+
/** \internal
* \brief Template functor to compute the inverse of a scalar
* \sa class CwiseUnaryOp, Cwise::inverse()
@@ -598,9 +657,13 @@ struct scalar_inverse_op {
EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
{ return internal::pdiv(pset1<Packet>(Scalar(1)),a); }
};
-template<typename Scalar>
-struct functor_traits<scalar_inverse_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
+template <typename Scalar>
+struct functor_traits<scalar_inverse_op<Scalar> > {
+ enum {
+ PacketAccess = packet_traits<Scalar>::HasDiv,
+ Cost = scalar_div_cost<Scalar, PacketAccess>::value
+ };
+};
/** \internal
* \brief Template functor to compute the square of a scalar
@@ -864,8 +927,9 @@ template <>
struct scalar_logistic_op<float> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()(const float& x) const {
- const float one = 1.0f;
- return one / (one + numext::exp(-x));
+ if (x < -18.0f) return 0.0f;
+ else if (x > 18.0f) return 1.0f;
+ else return 1.0f / (1.0f + numext::exp(-x));
}
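
The ±18 cutoff is safe in float: exp(-18) ≈ 1.5e-8 is below half an ulp of 1.0f, so 1/(1+exp(-18)) already rounds to exactly 1.0f, and the clamp dodges the useless (or overflowing) exp evaluation for large |x|. A quick check:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  float s = 1.0f / (1.0f + std::exp(-18.0f));
  std::printf("%.9g %d\n", s, s == 1.0f);  // prints "1 1": already saturated
  return 0;
}
```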
template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index e7cab4720..fdd0ec0e9 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -15,7 +15,13 @@ namespace Eigen {
namespace internal {
-template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
+enum PacketSizeType {
+ PacketFull = 0,
+ PacketHalf,
+ PacketQuarter
+};
+
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=PacketFull>
class gebp_traits;
@@ -101,6 +107,16 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
// at the register level. This small horizontal panel has to stay within L1 cache.
std::ptrdiff_t l1, l2, l3;
manage_caching_sizes(GetAction, &l1, &l2, &l3);
+ #ifdef EIGEN_VECTORIZE_AVX512
+ // We need to find a rationale for that, but without this adjustment,
+  // We still need to find a proper rationale for this, but without the
+  // adjustment, AVX512 performance is pretty bad, roughly 20% slower.
+  // One reason is that with an increasing packet size, the blocking size k
+  // has to become quite small if we want one lhs panel to fit within L1.
+  // For instance, with the 3pX4 kernel and double, the combined size of the
+  // lhs+rhs panels is k*(3*64 + 4*8) bytes; with l1=32kB and k%8==0, we get k=144.
+  // This is quite small for a good reuse of the accumulation registers.
+ #endif
if (num_threads > 1) {
typedef typename Traits::ResScalar ResScalar;
@@ -337,6 +353,61 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_
// #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
#endif
+template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
+struct RhsPanelHelper {
+ private:
+ static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
+ public:
+  // parentheses keep the ">=" from being parsed as the end of the template-argument list
+  typedef typename conditional<(remaining_registers>=4), RhsPacketx4, RhsPacket>::type type;
+};
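
RhsPanelHelper selects the wide four-packet panel only when enough architectural registers remain. A hedged sketch of the rule, assuming the x86-64 default of 16 registers from EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS:

```cpp
typedef RhsPanelHelper<Packet4f, QuadPacket<Packet4f>, 10>::type WidePanel;
// 16 - 10 >= 4 registers left, so WidePanel is QuadPacket<Packet4f>
typedef RhsPanelHelper<Packet4f, QuadPacket<Packet4f>, 13>::type NarrowPanel;
// 16 - 13 < 4 registers left, so NarrowPanel falls back to plain Packet4f
```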
+
+template <typename Packet>
+struct QuadPacket
+{
+ Packet B_0, B1, B2, B3;
+ const Packet& get(const FixedInt<0>&) const { return B_0; }
+ const Packet& get(const FixedInt<1>&) const { return B1; }
+ const Packet& get(const FixedInt<2>&) const { return B2; }
+ const Packet& get(const FixedInt<3>&) const { return B3; }
+};
+
+template <int N, typename T1, typename T2, typename T3>
+struct packet_conditional { typedef T3 type; };
+
+template <typename T1, typename T2, typename T3>
+struct packet_conditional<PacketFull, T1, T2, T3> { typedef T1 type; };
+
+template <typename T1, typename T2, typename T3>
+struct packet_conditional<PacketHalf, T1, T2, T3> { typedef T2 type; };
+
+#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
+ typedef typename packet_conditional<packet_size, \
+ typename packet_traits<name ## Scalar>::type, \
+ typename packet_traits<name ## Scalar>::half, \
+ typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
+ prefix ## name ## Packet
+
+#define PACKET_DECL_COND(name, packet_size) \
+ typedef typename packet_conditional<packet_size, \
+ typename packet_traits<name ## Scalar>::type, \
+ typename packet_traits<name ## Scalar>::half, \
+ typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
+ name ## Packet
+
+#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \
+ typedef typename packet_conditional<packet_size, \
+ typename packet_traits<Scalar>::type, \
+ typename packet_traits<Scalar>::half, \
+ typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
+ prefix ## ScalarPacket
+
+#define PACKET_DECL_COND_SCALAR(packet_size) \
+ typedef typename packet_conditional<packet_size, \
+ typename packet_traits<Scalar>::type, \
+ typename packet_traits<Scalar>::half, \
+ typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
+ ScalarPacket
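
A hedged illustration of what these macros select, for LhsScalar = float on an AVX build, given this changeset's SSE unpacket_traits (where Packet4f::half is still Packet4f):

```cpp
typedef packet_traits<float>::type _LhsPacketFull;  // PacketFull    -> Packet8f
typedef packet_traits<float>::half _LhsPacketHalf;  // PacketHalf    -> Packet4f
typedef unpacket_traits<packet_traits<float>::half>::half
        _LhsPacketQuarter;                          // PacketQuarter -> Packet4f (no narrower packet yet)
```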
+
/* Vectorization logic
* real*real: unpack rhs to constant packets, ...
*
@@ -347,7 +418,7 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_
* cplx*real : unpack rhs to constant packets, ...
* real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
*/
-template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
class gebp_traits
{
public:
@@ -355,13 +426,17 @@ public:
typedef _RhsScalar RhsScalar;
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+ PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+
enum {
ConjLhs = _ConjLhs,
ConjRhs = _ConjRhs,
- Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
- LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
- RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
- ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+ Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
+ LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+ RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+ ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
@@ -370,10 +445,12 @@ public:
// register block size along the M direction (currently, this one cannot be modified)
default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
-#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
- // we assume 16 registers
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \
+ && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
+ // we assume 16 registers or more
// See bug 992: if the scalar type is not vectorizable but EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
// then using 3*LhsPacketSize triggers non-implemented paths in syrk.
+    // Bug 1515: MSVC prior to v19.14 leads to register spilling.
mr = Vectorizable ? 3*LhsPacketSize : default_mr,
#else
mr = default_mr,
@@ -383,38 +460,41 @@ public:
RhsProgress = 1
};
- typedef typename packet_traits<LhsScalar>::type _LhsPacket;
- typedef typename packet_traits<RhsScalar>::type _RhsPacket;
- typedef typename packet_traits<ResScalar>::type _ResPacket;
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
typedef LhsPacket LhsPacket4Packing;
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
typedef ResPacket AccPacket;
EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
{
p = pset1<ResPacket>(ResScalar(0));
}
-
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
- {
- pbroadcast4(b, b0, b1, b2, b3);
- }
-
-// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
-// {
-// pbroadcast2(b, b0, b1);
-// }
-
+
template<typename RhsPacketType>
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
{
dest = pset1<RhsPacketType>(*b);
}
-
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+ {
+ pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+ }
+
+ template<typename RhsPacketType>
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+ {
+ loadRhs(b, dest);
+ }
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+ {
+ }
+
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
{
dest = ploadquad<RhsPacket>(b);
@@ -432,8 +512,8 @@ public:
dest = ploadu<LhsPacketType>(a);
}
- template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
- EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
+ template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
{
conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
// It would be a lot cleaner to call pmadd all the time. Unfortunately if we
@@ -448,6 +528,12 @@ public:
#endif
}
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+ {
+ madd(a, b.get(lane), c, tmp, lane);
+ }
+
EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
{
r = pmadd(c,alpha,r);
@@ -461,21 +547,25 @@ public:
};
-template<typename RealScalar, bool _ConjLhs>
-class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
+template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize>
+class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
{
public:
typedef std::complex<RealScalar> LhsScalar;
typedef RealScalar RhsScalar;
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+ PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+
enum {
ConjLhs = _ConjLhs,
ConjRhs = false,
- Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
- LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
- RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
- ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+ Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
+ LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+ RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+ ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
nr = 4,
@@ -490,15 +580,13 @@ public:
RhsProgress = 1
};
- typedef typename packet_traits<LhsScalar>::type _LhsPacket;
- typedef typename packet_traits<RhsScalar>::type _RhsPacket;
- typedef typename packet_traits<ResScalar>::type _ResPacket;
-
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
typedef LhsPacket LhsPacket4Packing;
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
+
typedef ResPacket AccPacket;
EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
@@ -506,42 +594,64 @@ public:
p = pset1<ResPacket>(ResScalar(0));
}
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+ template<typename RhsPacketType>
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
{
- dest = pset1<RhsPacket>(*b);
+ dest = pset1<RhsPacketType>(*b);
+ }
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+ {
+ pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+ }
+
+ template<typename RhsPacketType>
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+ {
+ loadRhs(b, dest);
}
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+ {}
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
{
- dest = pset1<RhsPacket>(*b);
+ loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type());
}
- EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+ EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
{
- dest = pload<LhsPacket>(a);
+ // FIXME we can do better!
+ // what we want here is a ploadheight
+ RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
+ dest = ploadquad<RhsPacket>(tmp);
}
- EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+ EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
{
- dest = ploadu<LhsPacket>(a);
+ eigen_internal_assert(RhsPacketSize<=8);
+ dest = pset1<RhsPacket>(*b);
}
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+ EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
{
- pbroadcast4(b, b0, b1, b2, b3);
+ dest = pload<LhsPacket>(a);
}
-
-// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
-// {
-// pbroadcast2(b, b0, b1);
-// }
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
+ template<typename LhsPacketType>
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
+ {
+ dest = ploadu<LhsPacketType>(a);
+ }
+
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
{
madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
}
- EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
{
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
EIGEN_UNUSED_VARIABLE(tmp);
@@ -556,13 +666,20 @@ public:
c += a * b;
}
- EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+ {
+ madd(a, b.get(lane), c, tmp, lane);
+ }
+
+ template <typename ResPacketType, typename AccPacketType>
+ EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
{
+ conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
r = cj.pmadd(c,alpha,r);
}
protected:
- conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
};
template<typename Packet>
@@ -581,13 +698,57 @@ DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Pack
return res;
}
+// note that for DoublePacket<RealPacket> the "4" in "downto4"
+// corresponds to the number of complex numbers, so it means "8"
+// in terms of real coefficients.
+
template<typename Packet>
-const DoublePacket<Packet>& predux_half_dowto4(const DoublePacket<Packet> &a)
+const DoublePacket<Packet>&
+predux_half_dowto4(const DoublePacket<Packet> &a,
+ typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
{
return a;
}
-template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
+template<typename Packet>
+DoublePacket<typename unpacket_traits<Packet>::half>
+predux_half_dowto4(const DoublePacket<Packet> &a,
+ typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
+{
+ // yes, that's pretty hackish :(
+ DoublePacket<typename unpacket_traits<Packet>::half> res;
+ typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
+ typedef typename packet_traits<Cplx>::type CplxPacket;
+ res.first = predux_half_dowto4(CplxPacket(a.first)).v;
+ res.second = predux_half_dowto4(CplxPacket(a.second)).v;
+ return res;
+}
+
+// same here, "quad" actually means "8" in terms of real coefficients
+template<typename Scalar, typename RealPacket>
+void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
+ typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0)
+{
+ dest.first = pset1<RealPacket>(real(*b));
+ dest.second = pset1<RealPacket>(imag(*b));
+}
+
+template<typename Scalar, typename RealPacket>
+void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
+ typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0)
+{
+ // yes, that's pretty hackish too :(
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ RealScalar r[4] = {real(b[0]), real(b[0]), real(b[1]), real(b[1])};
+ RealScalar i[4] = {imag(b[0]), imag(b[0]), imag(b[1]), imag(b[1])};
+ dest.first = ploadquad<RealPacket>(r);
+ dest.second = ploadquad<RealPacket>(i);
+}
+
+
+template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
+ typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
+};
// template<typename Packet>
// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
// {
@@ -597,8 +758,8 @@ template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typede
// return res;
// }
-template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
+template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
+class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize >
{
public:
typedef std::complex<RealScalar> Scalar;
@@ -606,15 +767,21 @@ public:
typedef std::complex<RealScalar> RhsScalar;
typedef std::complex<RealScalar> ResScalar;
+ PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+ PACKET_DECL_COND(Real, _PacketSize);
+ PACKET_DECL_COND_SCALAR(_PacketSize);
+
enum {
ConjLhs = _ConjLhs,
ConjRhs = _ConjRhs,
- Vectorizable = packet_traits<RealScalar>::Vectorizable
- && packet_traits<Scalar>::Vectorizable,
- RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
- ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
- LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
- RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+ Vectorizable = unpacket_traits<RealPacket>::vectorizable
+ && unpacket_traits<ScalarPacket>::vectorizable,
+ ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
+ LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+ RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
+ RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
// FIXME: should depend on NumberOfRegisters
nr = 4,
@@ -624,15 +791,16 @@ public:
RhsProgress = 1
};
- typedef typename packet_traits<RealScalar>::type RealPacket;
- typedef typename packet_traits<Scalar>::type ScalarPacket;
- typedef DoublePacket<RealPacket> DoublePacketType;
+ typedef DoublePacket<RealPacket> DoublePacketType;
typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing;
typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
+
+ // this actually holds 8 packets, as each of the 4 RhsPackets is a DoublePacket!
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
@@ -643,51 +811,49 @@ public:
}
// Scalar path
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
{
- dest = pset1<ResPacket>(*b);
+ dest = pset1<ScalarPacket>(*b);
}
// Vectorized path
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
+ template<typename RealPacketType>
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
{
- dest.first = pset1<RealPacket>(real(*b));
- dest.second = pset1<RealPacket>(imag(*b));
+ dest.first = pset1<RealPacketType>(real(*b));
+ dest.second = pset1<RealPacketType>(imag(*b));
}
-
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
{
- loadRhs(b,dest);
+ loadRhs(b, dest.B_0);
+ loadRhs(b + 1, dest.B1);
+ loadRhs(b + 2, dest.B2);
+ loadRhs(b + 3, dest.B3);
}
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
+
+ // Scalar path
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const
{
- eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
- loadRhs(b,dest);
+ loadRhs(b, dest);
}
-
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+
+ // Vectorized path
+ template<typename RealPacketType>
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
{
- // FIXME not sure that's the best way to implement it!
- loadRhs(b+0, b0);
- loadRhs(b+1, b1);
- loadRhs(b+2, b2);
- loadRhs(b+3, b3);
+ loadRhs(b, dest);
}
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
- // Vectorized path
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
{
- // FIXME not sure that's the best way to implement it!
- loadRhs(b+0, b0);
- loadRhs(b+1, b1);
+ loadRhs(b,dest);
}
-
- // Scalar path
- EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
{
- // FIXME not sure that's the best way to implement it!
- loadRhs(b+0, b0);
- loadRhs(b+1, b1);
+ loadQuadToDoublePacket(b,dest);
}
// nothing special here
@@ -696,47 +862,59 @@ public:
dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
}
- EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+ template<typename LhsPacketType>
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
{
- dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
+ dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
}
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
+ template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
+ EIGEN_STRONG_INLINE
+ typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type
+ madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
{
c.first = padd(pmul(a,b.first), c.first);
c.second = padd(pmul(a,b.second),c.second);
}
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
+ template<typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
{
c = cj.pmadd(a,b,c);
}
+
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+ {
+ madd(a, b.get(lane), c, tmp, lane);
+ }
EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
- EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
+ template<typename RealPacketType, typename ResPacketType>
+ EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const
{
// assemble c
- ResPacket tmp;
+ ResPacketType tmp;
if((!ConjLhs)&&(!ConjRhs))
{
- tmp = pcplxflip(pconj(ResPacket(c.second)));
- tmp = padd(ResPacket(c.first),tmp);
+ tmp = pcplxflip(pconj(ResPacketType(c.second)));
+ tmp = padd(ResPacketType(c.first),tmp);
}
else if((!ConjLhs)&&(ConjRhs))
{
- tmp = pconj(pcplxflip(ResPacket(c.second)));
- tmp = padd(ResPacket(c.first),tmp);
+ tmp = pconj(pcplxflip(ResPacketType(c.second)));
+ tmp = padd(ResPacketType(c.first),tmp);
}
else if((ConjLhs)&&(!ConjRhs))
{
- tmp = pcplxflip(ResPacket(c.second));
- tmp = padd(pconj(ResPacket(c.first)),tmp);
+ tmp = pcplxflip(ResPacketType(c.second));
+ tmp = padd(pconj(ResPacketType(c.first)),tmp);
}
else if((ConjLhs)&&(ConjRhs))
{
- tmp = pcplxflip(ResPacket(c.second));
- tmp = psub(pconj(ResPacket(c.first)),tmp);
+ tmp = pcplxflip(ResPacketType(c.second));
+ tmp = psub(pconj(ResPacketType(c.first)),tmp);
}
r = pmadd(tmp,alpha,r);
@@ -746,8 +924,8 @@ protected:
conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
};
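The four branches of acc() above reassemble a complex product from the two half-products held in a DoublePacket. A scalar sanity check of the no-conjugation identity (plain C++, mirroring pconj/pcplxflip on a single coefficient pair):

#include <cassert>
#include <cmath>
#include <complex>

// Scalar check of the no-conjugation branch of acc() above. With the lhs
// stored as an interleaved (re, im) pair, the kernel accumulates
//   first  = a * real(b)  ->  (re_a*rb, im_a*rb)
//   second = a * imag(b)  ->  (re_a*ib, im_a*ib)
// and reassembles a*b as first + pcplxflip(pconj(second)).
int main() {
  std::complex<double> a(1.5, -2.0), b(0.5, 3.0);
  double first_re  = a.real() * b.real(), first_im  = a.imag() * b.real();
  double second_re = a.real() * b.imag(), second_im = a.imag() * b.imag();
  // pconj negates the second slot; pcplxflip swaps the two slots.
  double res_re = first_re + (-second_im);
  double res_im = first_im + second_re;
  std::complex<double> ref = a * b;
  assert(std::abs(res_re - ref.real()) < 1e-12);
  assert(std::abs(res_im - ref.imag()) < 1e-12);
  return 0;
}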
-template<typename RealScalar, bool _ConjRhs>
-class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
+template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize>
+class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize >
{
public:
typedef std::complex<RealScalar> Scalar;
@@ -755,14 +933,25 @@ public:
typedef Scalar RhsScalar;
typedef Scalar ResScalar;
+ PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+ PACKET_DECL_COND_PREFIX(_, Real, _PacketSize);
+ PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize);
+
+#undef PACKET_DECL_COND_SCALAR_PREFIX
+#undef PACKET_DECL_COND_PREFIX
+#undef PACKET_DECL_COND_SCALAR
+#undef PACKET_DECL_COND
+
enum {
ConjLhs = false,
ConjRhs = _ConjRhs,
- Vectorizable = packet_traits<RealScalar>::Vectorizable
- && packet_traits<Scalar>::Vectorizable,
- LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
- RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
- ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+ Vectorizable = unpacket_traits<_RealPacket>::vectorizable
+ && unpacket_traits<_ScalarPacket>::vectorizable,
+ LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+ RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+ ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
// FIXME: should depend on NumberOfRegisters
@@ -773,15 +962,11 @@ public:
RhsProgress = 1
};
- typedef typename packet_traits<LhsScalar>::type _LhsPacket;
- typedef typename packet_traits<RhsScalar>::type _RhsPacket;
- typedef typename packet_traits<ResScalar>::type _ResPacket;
-
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
typedef LhsPacket LhsPacket4Packing;
-
+ typedef QuadPacket<RhsPacket> RhsPacketx4;
typedef ResPacket AccPacket;
EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
@@ -789,22 +974,25 @@ public:
p = pset1<ResPacket>(ResScalar(0));
}
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+ template<typename RhsPacketType>
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
{
- dest = pset1<RhsPacket>(*b);
+ dest = pset1<RhsPacketType>(*b);
}
-
- void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
{
- pbroadcast4(b, b0, b1, b2, b3);
+ pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
}
-
-// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
-// {
-// // FIXME not sure that's the best way to implement it!
-// b0 = pload1<RhsPacket>(b+0);
-// b1 = pload1<RhsPacket>(b+1);
-// }
+
+ template<typename RhsPacketType>
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
+ {
+ loadRhs(b, dest);
+ }
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+ {}
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
{
@@ -813,21 +1001,23 @@ public:
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
{
- eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
- loadRhs(b,dest);
+ dest = ploadquad<RhsPacket>(b);
}
- EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+ template<typename LhsPacketType>
+ EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
{
- dest = ploaddup<LhsPacket>(a);
+ dest = ploaddup<LhsPacketType>(a);
}
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
{
madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
}
- EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
+ template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+ EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
{
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
EIGEN_UNUSED_VARIABLE(tmp);
@@ -843,16 +1033,166 @@ public:
c += a * b;
}
- EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+ template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+ {
+ madd(a, b.get(lane), c, tmp, lane);
+ }
+
+ template <typename ResPacketType, typename AccPacketType>
+ EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
{
+ conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
r = cj.pmadd(alpha,c,r);
}
protected:
- conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
+
+};
+
+
+#if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON
+
+template<>
+struct gebp_traits <float, float, false, false,Architecture::NEON,PacketFull>
+ : gebp_traits<float,float,false,false,Architecture::Generic,PacketFull>
+{
+ typedef float RhsPacket;
+
+ typedef float32x4_t RhsPacketx4;
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = *b;
+ }
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+ {
+ dest = vld1q_f32(b);
+ }
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = *b;
+ }
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const
+ {}
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+ {
+ loadRhs(b,dest);
+ }
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+ {
+ c = vfmaq_n_f32(c, a, b);
+ }
+
+ // NOTE: Template parameter inference failed when compiled with Android NDK:
+ // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+ { madd_helper<0>(a, b, c); }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
+ { madd_helper<1>(a, b, c); }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
+ { madd_helper<2>(a, b, c); }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
+ { madd_helper<3>(a, b, c); }
+
+ private:
+ template<int LaneID>
+ EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
+ {
+ #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
+ // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
+ // vfmaq_laneq_f32 is implemented through a costly dup
+ if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : );
+ else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : );
+ else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : );
+ else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : );
+ #else
+ c = vfmaq_laneq_f32(c, a, b, LaneID);
+ #endif
+ }
+};
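The float NEON specialization above trades four broadcast loads for one vld1q_f32 plus lane-indexed FMAs. A minimal standalone sketch of that pattern (AArch64 only; micro_kernel_1x4 is a hypothetical helper, not an Eigen function):

#include <arm_neon.h>

// AArch64-only sketch: one 4-wide rhs load feeds four lane-indexed FMAs,
// avoiding the dup instructions a broadcast-per-column scheme would need.
void micro_kernel_1x4(const float* a, const float* b, float32x4_t c[4]) {
  float32x4_t A0 = vld1q_f32(a);  // 4 lhs coefficients (one packet)
  float32x4_t B  = vld1q_f32(b);  // 4 rhs coefficients, one per column
  c[0] = vfmaq_laneq_f32(c[0], A0, B, 0);
  c[1] = vfmaq_laneq_f32(c[1], A0, B, 1);
  c[2] = vfmaq_laneq_f32(c[2], A0, B, 2);
  c[3] = vfmaq_laneq_f32(c[3], A0, B, 3);
}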
+
+
+template<>
+struct gebp_traits <double, double, false, false,Architecture::NEON>
+ : gebp_traits<double,double,false,false,Architecture::Generic>
+{
+ typedef double RhsPacket;
+
+ struct RhsPacketx4 {
+ float64x2_t B_0, B_1;
+ };
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = *b;
+ }
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+ {
+ dest.B_0 = vld1q_f64(b);
+ dest.B_1 = vld1q_f64(b+2);
+ }
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
+ {
+ loadRhs(b,dest);
+ }
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const
+ {}
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+ {
+ loadRhs(b,dest);
+ }
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+ {
+ c = vfmaq_n_f64(c, a, b);
+ }
+
+ // NOTE: Template parameter inference failed when compiled with Android NDK:
+ // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+ { madd_helper<0>(a, b, c); }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
+ { madd_helper<1>(a, b, c); }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
+ { madd_helper<2>(a, b, c); }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
+ { madd_helper<3>(a, b, c); }
+
+ private:
+ template <int LaneID>
+ EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
+ {
+ #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
+ // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
+ // vfmaq_laneq_f64 is implemented through a costly dup
+ if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
+ else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
+ else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
+ else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
+ #else
+ if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
+ else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
+ else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
+ else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
+ #endif
+ }
};
-/* optimized GEneral packed Block * packed Panel product kernel
+#endif
+
+/* optimized General packed Block * packed Panel product kernel
*
* Mixing type logic: C += A * B
* | A | B | comments
@@ -862,26 +1202,47 @@ protected:
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
- typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,PacketHalf> HalfTraits;
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,PacketQuarter> QuarterTraits;
+
typedef typename Traits::ResScalar ResScalar;
typedef typename Traits::LhsPacket LhsPacket;
typedef typename Traits::RhsPacket RhsPacket;
typedef typename Traits::ResPacket ResPacket;
typedef typename Traits::AccPacket AccPacket;
+ typedef typename Traits::RhsPacketx4 RhsPacketx4;
+
+ typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
+
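RhsPanelHelper (defined outside this hunk) picks RhsPacketx4 only when enough vector registers remain after the 15 consumed by accumulators and lhs values. A sketch of that selection under assumed register counts (rhs_panel_helper, NUM_REGS and the threshold are illustrative assumptions, not Eigen's exact definition):

#include <type_traits>

// Sketch of the register-budget choice behind RhsPanel15: with NUM_REGS
// architectural vector registers and `taken` already reserved, fall back
// to a single RhsPacket when fewer than 4 registers remain for the panel.
template<typename RhsPacket, typename RhsPacketx4, int taken, int NUM_REGS = 16>
struct rhs_panel_helper {
  static const int remaining = NUM_REGS - taken;
  typedef typename std::conditional<(remaining >= 4), RhsPacketx4, RhsPacket>::type type;
};

struct P {}; struct P4 {};
static_assert(std::is_same<rhs_panel_helper<P, P4, 15>::type, P>::value,
              "15 of 16 registers taken -> plain RhsPacket");
static_assert(std::is_same<rhs_panel_helper<P, P4, 11>::type, P4>::value,
              "5 registers left -> full x4 panel");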
+ typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
- typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
typedef typename SwappedTraits::ResScalar SResScalar;
typedef typename SwappedTraits::LhsPacket SLhsPacket;
typedef typename SwappedTraits::RhsPacket SRhsPacket;
typedef typename SwappedTraits::ResPacket SResPacket;
typedef typename SwappedTraits::AccPacket SAccPacket;
+ typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+ typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+ typedef typename HalfTraits::ResPacket ResPacketHalf;
+ typedef typename HalfTraits::AccPacket AccPacketHalf;
+
+ typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+ typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+ typedef typename QuarterTraits::ResPacket ResPacketQuarter;
+ typedef typename QuarterTraits::AccPacket AccPacketQuarter;
+
typedef typename DataMapper::LinearMapper LinearMapper;
enum {
Vectorizable = Traits::Vectorizable,
LhsProgress = Traits::LhsProgress,
+ LhsProgressHalf = HalfTraits::LhsProgress,
+ LhsProgressQuarter = QuarterTraits::LhsProgress,
RhsProgress = Traits::RhsProgress,
+ RhsProgressHalf = HalfTraits::RhsProgress,
+ RhsProgressQuarter = QuarterTraits::RhsProgress,
ResPacketSize = Traits::ResPacketSize
};
@@ -892,11 +1253,11 @@ struct gebp_kernel
};
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs,
- int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs>::LhsProgress>
+int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
struct last_row_process_16_packets
{
- typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
- typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
+ typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
typedef typename Traits::ResScalar ResScalar;
typedef typename SwappedTraits::LhsPacket SLhsPacket;
@@ -924,8 +1285,8 @@ struct last_row_process_16_packets
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
- typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
- typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
+ typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
+ typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
typedef typename Traits::ResScalar ResScalar;
typedef typename SwappedTraits::LhsPacket SLhsPacket;
@@ -957,7 +1318,7 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr,
SRhsPacketQuarter b0;
straits.loadLhsUnaligned(blB, a0);
straits.loadRhs(blA, b0);
- straits.madd(a0,b0,c0,b0);
+ straits.madd(a0,b0,c0,b0, fix<0>);
blB += SwappedTraits::LhsProgress/4;
blA += 1;
}
@@ -971,6 +1332,219 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr,
}
};
+template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
+struct lhs_process_one_packet
+{
+ typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
+
+ EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
+ {
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
+ traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
+ traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
+ traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
+ traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
+ traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
+ traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
+ #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
+ __asm__ ("" : "+x,m" (*A0));
+ #endif
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
+ }
+
+ EIGEN_STRONG_INLINE void operator()(
+ const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
+ Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
+ int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
+ {
+ GEBPTraits traits;
+
+ // loops on each largest micro horizontal panel of lhs
+ // (LhsProgress x depth)
+ for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
+ {
+ // loops on each largest micro vertical panel of rhs (depth * nr)
+ for(Index j2=0; j2<packet_cols4; j2+=nr)
+ {
+ // We select a LhsProgress x nr micro block of res
+ // which is entirely stored into 1 x nr registers.
+
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
+ prefetch(&blA[0]);
+
+ // gets res block as register
+ AccPacket C0, C1, C2, C3;
+ traits.initAcc(C0);
+ traits.initAcc(C1);
+ traits.initAcc(C2);
+ traits.initAcc(C3);
+ // To improve instruction pipelining, let's double the accumulation registers:
+ // even k will accumulate in C*, while odd k will accumulate in D*.
+ // This trick is crucial to get good performance with FMA; otherwise it is
+ // actually faster to perform separate MUL+ADD because of the naturally
+ // better instruction-level parallelism.
+ AccPacket D0, D1, D2, D3;
+ traits.initAcc(D0);
+ traits.initAcc(D1);
+ traits.initAcc(D2);
+ traits.initAcc(D3);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+ LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+ LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+ LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+ r0.prefetch(prefetch_res_offset);
+ r1.prefetch(prefetch_res_offset);
+ r2.prefetch(prefetch_res_offset);
+ r3.prefetch(prefetch_res_offset);
+
+ // performs "inner" products
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+ prefetch(&blB[0]);
+ LhsPacket A0, A1;
+
+ for(Index k=0; k<peeled_kc; k+=pk)
+ {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
+ RhsPacketx4 rhs_panel;
+ RhsPacket T0;
+
+ internal::prefetch(blB+(48+0));
+ peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+ peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+ internal::prefetch(blB+(48+16));
+ peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+ peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+
+ blB += pk*4*RhsProgress;
+ blA += pk*LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
+ }
+ C0 = padd(C0,D0);
+ C1 = padd(C1,D1);
+ C2 = padd(C2,D2);
+ C3 = padd(C3,D3);
+
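The C*/D* comment above describes a classic latency-hiding trick. Stripped of packets, it is the familiar dual-accumulator loop (standalone C++ sketch):

#include <cstddef>

// Dual-accumulator form of the C*/D* trick above: even steps feed c, odd
// steps feed d, and the two chains merge once at the end. Same sum as a
// single accumulator, but the FMA dependency chain is halved.
double dot(const double* x, const double* y, std::size_t n) {
  double c = 0.0, d = 0.0;
  std::size_t k = 0;
  for (; k + 1 < n; k += 2) {
    c += x[k]     * y[k];
    d += x[k + 1] * y[k + 1];
  }
  for (; k < n; ++k) c += x[k] * y[k];  // leftover element, if any
  return c + d;                         // cf. C0..C3 = padd(C*, D*)
}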
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPacketx4 rhs_panel;
+ RhsPacket T0;
+ peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+ blB += 4*RhsProgress;
+ blA += LhsProgress;
+ }
+
+ ResPacket R0, R1;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = r0.template loadPacket<ResPacket>(0);
+ R1 = r1.template loadPacket<ResPacket>(0);
+ traits.acc(C0, alphav, R0);
+ traits.acc(C1, alphav, R1);
+ r0.storePacket(0, R0);
+ r1.storePacket(0, R1);
+
+ R0 = r2.template loadPacket<ResPacket>(0);
+ R1 = r3.template loadPacket<ResPacket>(0);
+ traits.acc(C2, alphav, R0);
+ traits.acc(C3, alphav, R1);
+ r2.storePacket(0, R0);
+ r3.storePacket(0, R1);
+ }
+
+ // Deal with remaining columns of the rhs
+ for(Index j2=packet_cols4; j2<cols; j2++)
+ {
+ // One column at a time
+ const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
+ prefetch(&blA[0]);
+
+ // gets res block as register
+ AccPacket C0;
+ traits.initAcc(C0);
+
+ LinearMapper r0 = res.getLinearMapper(i, j2);
+
+ // performs "inner" products
+ const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+ LhsPacket A0;
+
+ for(Index k= 0; k<peeled_kc; k+=pk)
+ {
+ EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
+ RhsPacket B_0;
+
+#define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+ /* FIXME: why unaligned???? */ \
+ traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
+ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
+ traits.madd(A0, B_0, C0, B_0, fix<0>); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
+ } while(false);
+
+ EIGEN_GEBGP_ONESTEP(0);
+ EIGEN_GEBGP_ONESTEP(1);
+ EIGEN_GEBGP_ONESTEP(2);
+ EIGEN_GEBGP_ONESTEP(3);
+ EIGEN_GEBGP_ONESTEP(4);
+ EIGEN_GEBGP_ONESTEP(5);
+ EIGEN_GEBGP_ONESTEP(6);
+ EIGEN_GEBGP_ONESTEP(7);
+
+ blB += pk*RhsProgress;
+ blA += pk*LhsProgress;
+
+ EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
+ }
+
+ // process remaining peeled loop
+ for(Index k=peeled_kc; k<depth; k++)
+ {
+ RhsPacket B_0;
+ EIGEN_GEBGP_ONESTEP(0);
+ blB += RhsProgress;
+ blA += LhsProgress;
+ }
+#undef EIGEN_GEBGP_ONESTEP
+ ResPacket R0;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+ R0 = r0.template loadPacket<ResPacket>(0);
+ traits.acc(C0, alphav, R0);
+ r0.storePacket(0, R0);
+ }
+ }
+ }
+};
+
+template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
+struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
+{
+
+EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
+ {
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
+ traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
+ traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
+ traits.madd(*A0, *B_0, *C0, *B_0);
+ traits.madd(*A0, *B1, *C1, *B1);
+ traits.madd(*A0, *B2, *C2, *B2);
+ traits.madd(*A0, *B3, *C3, *B3);
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
+ }
+};
+
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
@@ -987,10 +1561,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
- const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
+ const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
+ const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
+ const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
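Each boundary above is cumulative, so every stage only peels what the previous ones left behind. A worked example with assumed progress sizes (LhsProgress 8, half 4, quarter 2, rows 23; all values illustrative):

#include <iostream>

// Worked example of the cumulative peeling boundaries, assuming mr is
// large enough that every stage is enabled.
int main() {
  const long rows = 23, LP = 8, LPh = 4, LPq = 2;
  long mc3 = (rows / (3 * LP)) * (3 * LP);                // 0  (23 < 24)
  long mc2 = mc3 + ((rows - mc3) / (2 * LP)) * (2 * LP);  // 16
  long mc1 = mc2 + ((rows - mc2) / LP) * LP;              // 16 (7 < 8)
  long mch = mc1 + ((rows - mc1) / LPh) * LPh;            // 20
  long mcq = mch + ((rows - mch) / LPq) * LPq;            // 22
  std::cout << mc3 << ' ' << mc2 << ' ' << mc1 << ' '
            << mch << ' ' << mcq << '\n';
  // Row 22 is the only one left for the scalar tail loop.
}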
enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
const Index peeled_kc = depth & ~(pk-1);
- const Index prefetch_res_offset = 32/sizeof(ResScalar);
+ const int prefetch_res_offset = 32/sizeof(ResScalar);
// const Index depth2 = depth & ~1;
//---------- Process 3 * LhsProgress rows at once ----------
@@ -1048,36 +1624,48 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
for(Index k=0; k<peeled_kc; k+=pk)
{
EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
- RhsPacket B_0, T0;
+ // 15 registers are taken (12 for acc, 2 for lhs).
+ RhsPanel15 rhs_panel;
+ RhsPacket T0;
LhsPacket A2;
-
-#define EIGEN_GEBP_ONESTEP(K) \
- do { \
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
+ #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
+ // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
+ // without this workaround A0, A1, and A2 are loaded in the same register,
+ // which is not good for pipelining
+ #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
+ #else
+ #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
+ #endif
+#define EIGEN_GEBP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
- internal::prefetch(blA+(3*K+16)*LhsProgress); \
- if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
- traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
- traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
- traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
- traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
- traits.madd(A0, B_0, C0, T0); \
- traits.madd(A1, B_0, C4, T0); \
- traits.madd(A2, B_0, C8, B_0); \
- traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
- traits.madd(A0, B_0, C1, T0); \
- traits.madd(A1, B_0, C5, T0); \
- traits.madd(A2, B_0, C9, B_0); \
- traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
- traits.madd(A0, B_0, C2, T0); \
- traits.madd(A1, B_0, C6, T0); \
- traits.madd(A2, B_0, C10, B_0); \
- traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
- traits.madd(A0, B_0, C3 , T0); \
- traits.madd(A1, B_0, C7, T0); \
- traits.madd(A2, B_0, C11, B_0); \
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
- } while(false)
+ internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
+ if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
+ internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
+ } /* Bug 953 */ \
+ traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
+ traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
+ traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
+ EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
+ traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
+ traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
+ traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
+ traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
+ traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
+ traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
+ traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
+ traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
+ traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
+ traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
+ traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
+ traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
+ } while (false)
internal::prefetch(blB);
EIGEN_GEBP_ONESTEP(0);
@@ -1097,7 +1685,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
// process remaining peeled loop
for(Index k=peeled_kc; k<depth; k++)
{
- RhsPacket B_0, T0;
+ RhsPanel15 rhs_panel;
+ RhsPacket T0;
LhsPacket A2;
EIGEN_GEBP_ONESTEP(0);
blB += 4*RhsProgress;
@@ -1177,20 +1766,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
{
EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
RhsPacket B_0;
-#define EIGEN_GEBGP_ONESTEP(K) \
- do { \
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
+#define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
- traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
- traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
- traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
- traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
- traits.madd(A0, B_0, C0, B_0); \
- traits.madd(A1, B_0, C4, B_0); \
- traits.madd(A2, B_0, C8, B_0); \
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
- } while(false)
-
+ traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
+ traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
+ traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
+ traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
+ traits.madd(A0, B_0, C0, B_0, fix<0>); \
+ traits.madd(A1, B_0, C4, B_0, fix<0>); \
+ traits.madd(A2, B_0, C8, B_0, fix<0>); \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
+ } while (false)
+
EIGEN_GEBGP_ONESTEP(0);
EIGEN_GEBGP_ONESTEP(1);
EIGEN_GEBGP_ONESTEP(2);
@@ -1279,26 +1868,34 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
for(Index k=0; k<peeled_kc; k+=pk)
{
EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
- RhsPacket B_0, B1, B2, B3, T0;
+ RhsPacketx4 rhs_panel;
+ RhsPacket T0;
+
+ // NOTE: the begin/end asm comments below work around bug 935!
+ // but they are not enough for gcc>=6 without FMA (bug 1637)
+ #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
+ #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
+ #else
+ #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
+ #endif
+#define EIGEN_GEBGP_ONESTEP(K) \
+ do { \
+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
+ traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
+ traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
+ traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
+ traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
+ traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
+ traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
+ traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
+ traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
+ traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
+ traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
+ traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
+ EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
+ } while (false)
- #define EIGEN_GEBGP_ONESTEP(K) \
- do { \
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
- EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
- traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
- traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
- traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
- traits.madd(A0, B_0, C0, T0); \
- traits.madd(A1, B_0, C4, B_0); \
- traits.madd(A0, B1, C1, T0); \
- traits.madd(A1, B1, C5, B1); \
- traits.madd(A0, B2, C2, T0); \
- traits.madd(A1, B2, C6, B2); \
- traits.madd(A0, B3, C3, T0); \
- traits.madd(A1, B3, C7, B3); \
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
- } while(false)
-
internal::prefetch(blB+(48+0));
EIGEN_GEBGP_ONESTEP(0);
EIGEN_GEBGP_ONESTEP(1);
@@ -1318,7 +1915,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
// process remaining peeled loop
for(Index k=peeled_kc; k<depth; k++)
{
- RhsPacket B_0, B1, B2, B3, T0;
+ RhsPacketx4 rhs_panel;
+ RhsPacket T0;
EIGEN_GEBGP_ONESTEP(0);
blB += 4*RhsProgress;
blA += 2*Traits::LhsProgress;
@@ -1389,8 +1987,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
- traits.madd(A0, B_0, C0, B1); \
- traits.madd(A1, B_0, C4, B_0); \
+ traits.madd(A0, B_0, C0, B1, fix<0>); \
+ traits.madd(A1, B_0, C4, B_0, fix<0>); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
} while(false)
@@ -1434,174 +2032,29 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
//---------- Process 1 * LhsProgress rows at once ----------
if(mr>=1*Traits::LhsProgress)
{
- // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
- for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
- {
- // loops on each largest micro vertical panel of rhs (depth * nr)
- for(Index j2=0; j2<packet_cols4; j2+=nr)
- {
- // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
- // stored into 1 x nr registers.
-
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
- prefetch(&blA[0]);
-
- // gets res block as register
- AccPacket C0, C1, C2, C3;
- traits.initAcc(C0);
- traits.initAcc(C1);
- traits.initAcc(C2);
- traits.initAcc(C3);
-
- LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
- LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
- LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
- LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
-
- r0.prefetch(prefetch_res_offset);
- r1.prefetch(prefetch_res_offset);
- r2.prefetch(prefetch_res_offset);
- r3.prefetch(prefetch_res_offset);
-
- // performs "inner" products
- const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
- prefetch(&blB[0]);
- LhsPacket A0;
-
- for(Index k=0; k<peeled_kc; k+=pk)
- {
- EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
- RhsPacket B_0, B1, B2, B3;
-
-#define EIGEN_GEBGP_ONESTEP(K) \
- do { \
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
- EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
- traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
- traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
- traits.madd(A0, B_0, C0, B_0); \
- traits.madd(A0, B1, C1, B1); \
- traits.madd(A0, B2, C2, B2); \
- traits.madd(A0, B3, C3, B3); \
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
- } while(false)
-
- internal::prefetch(blB+(48+0));
- EIGEN_GEBGP_ONESTEP(0);
- EIGEN_GEBGP_ONESTEP(1);
- EIGEN_GEBGP_ONESTEP(2);
- EIGEN_GEBGP_ONESTEP(3);
- internal::prefetch(blB+(48+16));
- EIGEN_GEBGP_ONESTEP(4);
- EIGEN_GEBGP_ONESTEP(5);
- EIGEN_GEBGP_ONESTEP(6);
- EIGEN_GEBGP_ONESTEP(7);
-
- blB += pk*4*RhsProgress;
- blA += pk*1*LhsProgress;
-
- EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
- }
- // process remaining peeled loop
- for(Index k=peeled_kc; k<depth; k++)
- {
- RhsPacket B_0, B1, B2, B3;
- EIGEN_GEBGP_ONESTEP(0);
- blB += 4*RhsProgress;
- blA += 1*LhsProgress;
- }
-#undef EIGEN_GEBGP_ONESTEP
-
- ResPacket R0, R1;
- ResPacket alphav = pset1<ResPacket>(alpha);
-
- R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
- R1 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
- traits.acc(C0, alphav, R0);
- traits.acc(C1, alphav, R1);
- r0.storePacket(0 * Traits::ResPacketSize, R0);
- r1.storePacket(0 * Traits::ResPacketSize, R1);
-
- R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
- R1 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
- traits.acc(C2, alphav, R0);
- traits.acc(C3, alphav, R1);
- r2.storePacket(0 * Traits::ResPacketSize, R0);
- r3.storePacket(0 * Traits::ResPacketSize, R1);
- }
-
- // Deal with remaining columns of the rhs
- for(Index j2=packet_cols4; j2<cols; j2++)
- {
- // One column at a time
- const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
- prefetch(&blA[0]);
-
- // gets res block as register
- AccPacket C0;
- traits.initAcc(C0);
-
- LinearMapper r0 = res.getLinearMapper(i, j2);
-
- // performs "inner" products
- const RhsScalar* blB = &blockB[j2*strideB+offsetB];
- LhsPacket A0;
-
- for(Index k=0; k<peeled_kc; k+=pk)
- {
- EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
- RhsPacket B_0;
-
-#define EIGEN_GEBGP_ONESTEP(K) \
- do { \
- EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
- EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
- traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
- traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
- traits.madd(A0, B_0, C0, B_0); \
- EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
- } while(false);
-
- EIGEN_GEBGP_ONESTEP(0);
- EIGEN_GEBGP_ONESTEP(1);
- EIGEN_GEBGP_ONESTEP(2);
- EIGEN_GEBGP_ONESTEP(3);
- EIGEN_GEBGP_ONESTEP(4);
- EIGEN_GEBGP_ONESTEP(5);
- EIGEN_GEBGP_ONESTEP(6);
- EIGEN_GEBGP_ONESTEP(7);
-
- blB += pk*RhsProgress;
- blA += pk*1*Traits::LhsProgress;
-
- EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
- }
-
- // process remaining peeled loop
- for(Index k=peeled_kc; k<depth; k++)
- {
- RhsPacket B_0;
- EIGEN_GEBGP_ONESTEP(0);
- blB += RhsProgress;
- blA += 1*Traits::LhsProgress;
- }
-#undef EIGEN_GEBGP_ONESTEP
- ResPacket R0;
- ResPacket alphav = pset1<ResPacket>(alpha);
- R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
- traits.acc(C0, alphav, R0);
- r0.storePacket(0 * Traits::ResPacketSize, R0);
- }
- }
+ lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
+ p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
+ }
+ //---------- Process LhsProgressHalf rows at once ----------
+ if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
+ {
+ lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
+ p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
+ }
+ //---------- Process LhsProgressQuarter rows at once ----------
+ if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
+ {
+ lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
+ p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
}
//---------- Process remaining rows, 1 at once ----------
- if(peeled_mc1<rows)
+ if(peeled_mc_quarter<rows)
{
// loop on each panel of the rhs
for(Index j2=0; j2<packet_cols4; j2+=nr)
{
// loop on each row of the lhs (1*LhsProgress x depth)
- for(Index i=peeled_mc1; i<rows; i+=1)
+ for(Index i=peeled_mc_quarter; i<rows; i+=1)
{
const LhsScalar* blA = &blockA[i*strideA+offsetA];
prefetch(&blA[0]);
@@ -1614,7 +2067,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
if ((SwappedTraits::LhsProgress % 4) == 0 &&
(SwappedTraits::LhsProgress<=16) &&
- (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
+ (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
(SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
{
SAccPacket C0, C1, C2, C3;
@@ -1638,15 +2091,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
straits.loadRhsQuad(blA+0*spk, B_0);
straits.loadRhsQuad(blA+1*spk, B_1);
- straits.madd(A0,B_0,C0,B_0);
- straits.madd(A1,B_1,C1,B_1);
+ straits.madd(A0,B_0,C0,B_0, fix<0>);
+ straits.madd(A1,B_1,C1,B_1, fix<0>);
straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
straits.loadRhsQuad(blA+2*spk, B_0);
straits.loadRhsQuad(blA+3*spk, B_1);
- straits.madd(A0,B_0,C2,B_0);
- straits.madd(A1,B_1,C3,B_1);
+ straits.madd(A0,B_0,C2,B_0, fix<0>);
+ straits.madd(A1,B_1,C3,B_1, fix<0>);
blB += 4*SwappedTraits::LhsProgress;
blA += 4*spk;
@@ -1659,7 +2112,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
straits.loadLhsUnaligned(blB, A0);
straits.loadRhsQuad(blA, B_0);
- straits.madd(A0,B_0,C0,B_0);
+ straits.madd(A0,B_0,C0,B_0, fix<0>);
blB += SwappedTraits::LhsProgress;
blA += spk;
@@ -1669,7 +2122,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
// Special case where we have to first reduce the accumulation register C0
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
- typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
+ typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
@@ -1683,7 +2136,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
straits.loadLhsUnaligned(blB, a0);
straits.loadRhs(blA, b0);
SAccPacketHalf c0 = predux_half_dowto4(C0);
- straits.madd(a0,b0,c0,b0);
+ straits.madd(a0,b0,c0,b0, fix<0>);
straits.acc(c0, alphav, R);
}
else
@@ -1699,7 +2152,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
// template form, so that LhsProgress < 16 paths don't
// fail to compile
last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
- p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
+ p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
}
else
{
@@ -1744,7 +2197,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
for(Index j2=packet_cols4; j2<cols; j2++)
{
// loop on each row of the lhs (1*LhsProgress x depth)
- for(Index i=peeled_mc1; i<rows; i+=1)
+ for(Index i=peeled_mc_quarter; i<rows; i+=1)
{
const LhsScalar* blA = &blockA[i*strideA+offsetA];
prefetch(&blA[0]);
@@ -1791,7 +2244,13 @@ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pa
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
- enum { PacketSize = unpacket_traits<Packet>::size };
+ typedef typename unpacket_traits<Packet>::half HalfPacket;
+ typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
+ enum { PacketSize = unpacket_traits<Packet>::size,
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+ HasHalf = (int)HalfPacketSize < (int)PacketSize,
+ HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
EIGEN_UNUSED_VARIABLE(stride);
@@ -1803,9 +2262,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
- const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
- const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
- : Pack2>1 ? (rows/Pack2)*Pack2 : 0;
+ const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
+ const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
+ const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
+ const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
+ const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
+ : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
Index i=0;
@@ -1864,20 +2326,60 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
}
}
- // Pack scalars
+ // Pack half packets
+ if(HasHalf && Pack1>=HalfPacketSize)
+ {
+ for(; i<peeled_mc_half; i+=HalfPacketSize)
+ {
+ if(PanelMode) count += (HalfPacketSize) * offset;
+
+ for(Index k=0; k<depth; k++)
+ {
+ HalfPacket A;
+ A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
+ pstoreu(blockA+count, cj.pconj(A));
+ count+=HalfPacketSize;
+ }
+ if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
+ }
+ }
+ // Pack quarter packets
+ if(HasQuarter && Pack1>=QuarterPacketSize)
+ {
+ for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
+ {
+ if(PanelMode) count += (QuarterPacketSize) * offset;
+
+ for(Index k=0; k<depth; k++)
+ {
+ QuarterPacket A;
+ A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
+ pstoreu(blockA+count, cj.pconj(A));
+ count+=QuarterPacketSize;
+ }
+ if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
+ }
+ }
+ // Pack2 may be *smaller* than PacketSize -- that happens for
+ // products like real * complex, where we have to go half the
+ // progress on the lhs in order to duplicate those operands to
+ // address both real & imaginary parts on the rhs. This portion will
+ // pack those half-progress rows until their count matches the number
+ // expected by the last peeling loop at this point (for the rhs).
if(Pack2<PacketSize && Pack2>1)
{
- for(; i<peeled_mc0; i+=Pack2)
+ for(; i<peeled_mc0; i+=last_lhs_progress)
{
- if(PanelMode) count += Pack2 * offset;
+ if(PanelMode) count += last_lhs_progress * offset;
for(Index k=0; k<depth; k++)
- for(Index w=0; w<Pack2; w++)
+ for(Index w=0; w<last_lhs_progress; w++)
blockA[count++] = cj(lhs(i+w, k));
- if(PanelMode) count += Pack2 * (stride-offset-depth);
+ if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
}
}
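Stripped of the PanelMode/offset bookkeeping, the scalar packing loops in this function all follow one shape: copy a column-major block into contiguous panels of width w, then handle the leftover rows one at a time. A self-contained model (pack_panels and the indexing lambda are illustrative, not Eigen API):

#include <cstddef>
#include <vector>

// Model of the scalar packing loops above: copy a column-major
// (rows x depth) block into contiguous panels of width w, plus a
// leftover pass, so the kernel can stream lhs coefficients linearly.
std::vector<double> pack_panels(const std::vector<double>& lhs,
                                std::size_t rows, std::size_t depth,
                                std::size_t w) {
  std::vector<double> blockA;
  blockA.reserve(rows * depth);
  for (std::size_t i = 0; i + w <= rows; i += w)   // full panels
    for (std::size_t k = 0; k < depth; ++k)
      for (std::size_t j = 0; j < w; ++j)
        blockA.push_back(lhs[(i + j) + k * rows]); // lhs(i+j, k)
  std::size_t tail = (rows / w) * w;
  for (std::size_t i = tail; i < rows; ++i)        // leftover rows
    for (std::size_t k = 0; k < depth; ++k)
      blockA.push_back(lhs[i + k * rows]);
  return blockA;
}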
+ // Pack scalars
for(; i<rows; i++)
{
if(PanelMode) count += offset;
@@ -1898,7 +2400,13 @@ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pa
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
- enum { PacketSize = unpacket_traits<Packet>::size };
+ typedef typename unpacket_traits<Packet>::half HalfPacket;
+ typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
+ enum { PacketSize = unpacket_traits<Packet>::size,
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+ HasHalf = (int)HalfPacketSize < (int)PacketSize,
+ HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
EIGEN_UNUSED_VARIABLE(stride);
@@ -1906,37 +2414,51 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
Index count = 0;
+ bool gone_half = false, gone_quarter = false, gone_last = false;
-// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
-// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
-// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
-
- int pack = Pack1;
Index i = 0;
+ int pack = Pack1;
+ int psize = PacketSize;
while(pack>0)
{
Index remaining_rows = rows-i;
- Index peeled_mc = i+(remaining_rows/pack)*pack;
+ Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
+ Index starting_pos = i;
for(; i<peeled_mc; i+=pack)
{
if(PanelMode) count += pack * offset;
- const Index peeled_k = (depth/PacketSize)*PacketSize;
Index k=0;
- if(pack>=PacketSize)
+ if(pack>=psize && psize >= QuarterPacketSize)
{
- for(; k<peeled_k; k+=PacketSize)
+ const Index peeled_k = (depth/psize)*psize;
+ for(; k<peeled_k; k+=psize)
{
- for (Index m = 0; m < pack; m += PacketSize)
+ for (Index m = 0; m < pack; m += psize)
{
- PacketBlock<Packet> kernel;
- for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
- ptranspose(kernel);
- for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
+ if (psize == PacketSize) {
+ PacketBlock<Packet> kernel;
+ for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
+ ptranspose(kernel);
+ for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
+ } else if (HasHalf && psize == HalfPacketSize) {
+ gone_half = true;
+ PacketBlock<HalfPacket> kernel_half;
+ for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
+ ptranspose(kernel_half);
+ for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
+ } else if (HasQuarter && psize == QuarterPacketSize) {
+ gone_quarter = true;
+ PacketBlock<QuarterPacket> kernel_quarter;
+ for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
+ ptranspose(kernel_quarter);
+ for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
+ }
}
- count += PacketSize*pack;
+ count += psize*pack;
}
}
+
for(; k<depth; k++)
{
Index w=0;
@@ -1959,9 +2481,28 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
if(PanelMode) count += pack * (stride-offset-depth);
}
- pack -= PacketSize;
- if(pack<Pack2 && (pack+PacketSize)!=Pack2)
- pack = Pack2;
+ pack -= psize;
+ Index left = rows - i;
+ if (pack <= 0) {
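+      // The current packet width is exhausted; if enough rows remain and a
+      // narrower width (half, then quarter) exists and was not used yet,
+      // retry the peeling loop with that width.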
+ if (!gone_last &&
+ (starting_pos == i || left >= psize/2 || left >= psize/4) &&
+ ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
+ (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
+ psize /= 2;
+ pack = psize;
+ continue;
+ }
+      // Pack2 may be *smaller* than PacketSize -- that happens for
+      // products like real * complex, where we have to advance the lhs at
+      // half the usual progress in order to duplicate those operands to
+      // address both the real and imaginary parts on the rhs. This portion
+      // packs those half-width pieces until they match the count expected
+      // by the last peeling loop (for the rhs).
+ if (Pack2 < PacketSize && !gone_last) {
+ gone_last = true;
+ psize = pack = left & ~1;
+ }
+ }
}
for(; i<rows; i++)
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index f49abcad5..90c9c4647 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -469,6 +469,20 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0)
return;
+    // Fall back to GEMV if either the lhs or the rhs is a runtime vector
+    // (i.e. the destination has a single column or a single row)
+ if (dst.cols() == 1)
+ {
+ typename Dest::ColXpr dst_vec(dst.col(0));
+ return internal::generic_product_impl<Lhs,typename Rhs::ConstColXpr,DenseShape,DenseShape,GemvProduct>
+ ::scaleAndAddTo(dst_vec, a_lhs, a_rhs.col(0), alpha);
+ }
+ else if (dst.rows() == 1)
+ {
+ typename Dest::RowXpr dst_vec(dst.row(0));
+ return internal::generic_product_impl<typename Lhs::ConstRowXpr,Rhs,DenseShape,DenseShape,GemvProduct>
+ ::scaleAndAddTo(dst_vec, a_lhs.row(0), a_rhs, alpha);
+ }
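+    // e.g. (hypothetical caller code) with dynamic-size operands where the
+    // rhs happens to have a single column at runtime, dst.noalias() = A * B
+    // is now routed to the GEMV kernel instead of the full GEMM kernel.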
+
typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 92e9b0d9f..e01e798f1 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -21,7 +21,8 @@ namespace internal {
/** \internal */
inline void manage_multi_threading(Action action, int* v)
{
- static EIGEN_UNUSED int m_maxThreads = -1;
+ static int m_maxThreads = -1;
+ EIGEN_UNUSED_VARIABLE(m_maxThreads);
if(action==SetAction)
{
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index c84c71609..673073601 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -45,14 +45,23 @@ struct symm_pack_lhs
}
void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
{
- enum { PacketSize = packet_traits<Scalar>::size };
+ typedef typename unpacket_traits<typename packet_traits<Scalar>::type>::half HalfPacket;
+ typedef typename unpacket_traits<typename unpacket_traits<typename packet_traits<Scalar>::type>::half>::half QuarterPacket;
+ enum { PacketSize = packet_traits<Scalar>::size,
+ HalfPacketSize = unpacket_traits<HalfPacket>::size,
+ QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+ HasHalf = (int)HalfPacketSize < (int)PacketSize,
+ HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
+
const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
Index count = 0;
//Index peeled_mc3 = (rows/Pack1)*Pack1;
const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
- const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+ const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
+ const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
+ const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0;
if(Pack1>=3*PacketSize)
for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
@@ -66,8 +75,16 @@ struct symm_pack_lhs
for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
pack<1*PacketSize>(blockA, lhs, cols, i, count);
+ if(HasHalf && Pack1>=HalfPacketSize)
+ for(Index i=peeled_mc1; i<peeled_mc_half; i+=HalfPacketSize)
+ pack<HalfPacketSize>(blockA, lhs, cols, i, count);
+
+ if(HasQuarter && Pack1>=QuarterPacketSize)
+ for(Index i=peeled_mc_half; i<peeled_mc_quarter; i+=QuarterPacketSize)
+ pack<QuarterPacketSize>(blockA, lhs, cols, i, count);
+
// do the same with mr==1
- for(Index i=peeled_mc1; i<rows; i++)
+ for(Index i=peeled_mc_quarter; i<rows; i++)
{
for(Index k=0; k<i; k++)
blockA[count++] = lhs(i, k); // normal
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index a32630ed7..e6689c656 100755
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -274,7 +274,8 @@ template<typename XprType> struct blas_traits
HasUsableDirectAccess = ( (int(XprType::Flags)&DirectAccessBit)
&& ( bool(XprType::IsVectorAtCompileTime)
|| int(inner_stride_at_compile_time<XprType>::ret) == 1)
- ) ? 1 : 0
+ ) ? 1 : 0,
+ HasScalarFactor = false
};
typedef typename conditional<bool(HasUsableDirectAccess),
ExtractType,
@@ -306,6 +307,9 @@ template<typename Scalar, typename NestedXpr, typename Plain>
struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> >
: blas_traits<NestedXpr>
{
+ enum {
+ HasScalarFactor = true
+ };
typedef blas_traits<NestedXpr> Base;
typedef CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> XprType;
typedef typename Base::ExtractType ExtractType;
@@ -317,6 +321,9 @@ template<typename Scalar, typename NestedXpr, typename Plain>
struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > >
: blas_traits<NestedXpr>
{
+ enum {
+ HasScalarFactor = true
+ };
typedef blas_traits<NestedXpr> Base;
typedef CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > XprType;
typedef typename Base::ExtractType ExtractType;
@@ -335,6 +342,9 @@ template<typename Scalar, typename NestedXpr>
struct blas_traits<CwiseUnaryOp<scalar_opposite_op<Scalar>, NestedXpr> >
: blas_traits<NestedXpr>
{
+ enum {
+ HasScalarFactor = true
+ };
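+  // With these three specializations, expressions carrying a scalar factor,
+  // e.g. s*x, x*s and -x, report HasScalarFactor==true, while the generic
+  // blas_traits above defaults it to false.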
typedef blas_traits<NestedXpr> Base;
typedef CwiseUnaryOp<scalar_opposite_op<Scalar>, NestedXpr> XprType;
typedef typename Base::ExtractType ExtractType;
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 263604597..b00d8b038 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -10,13 +10,6 @@
#ifndef EIGEN_CONFIGURE_VECTORIZATION_H
#define EIGEN_CONFIGURE_VECTORIZATION_H
-// FIXME: not sure why this is needed, perhaps it is not needed anymore.
-#ifdef __NVCC__
- #ifndef EIGEN_DONT_VECTORIZE
- #define EIGEN_DONT_VECTORIZE
- #endif
-#endif
-
//------------------------------------------------------------------------------------------
// Static and dynamic alignment control
//
@@ -36,10 +29,15 @@
*
* If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
* vectorized and non-vectorized code.
+ *
+ * FIXME: this code can be cleaned up once we switch to proper C++11 only.
*/
#if (defined EIGEN_CUDACC)
#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
#define EIGEN_ALIGNOF(x) __alignof(x)
+#elif EIGEN_HAS_ALIGNAS
+ #define EIGEN_ALIGN_TO_BOUNDARY(n) alignas(n)
+ #define EIGEN_ALIGNOF(x) alignof(x)
#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
#define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
#define EIGEN_ALIGNOF(x) __alignof(x)
@@ -51,12 +49,18 @@
#define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
#define EIGEN_ALIGNOF(x) __alignof(x)
#else
- #error Please tell me what is the equivalent of __attribute__((aligned(n))) and __alignof(x) for your compiler
+ #error Please tell me what is the equivalent of alignas(n) and alignof(x) for your compiler
#endif
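+
+// Usage sketch (illustrative only): request 32-byte alignment for a type:
+//   struct EIGEN_ALIGN_TO_BOUNDARY(32) AlignedBuffer { float data[8]; };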
// If the user explicitly disable vectorization, then we also disable alignment
#if defined(EIGEN_DONT_VECTORIZE)
- #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
+ #if defined(EIGEN_GPUCC)
+ // GPU code is always vectorized and requires memory alignment for
+ // statically allocated buffers.
+ #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
+ #else
+ #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
+ #endif
#elif defined(__AVX512F__)
// 64 bytes static alignment is preferred only if really required
#define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
@@ -183,8 +187,6 @@
//----------------------------------------------------------------------
-
-
// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
#if EIGEN_MAX_ALIGN_BYTES==0
@@ -211,7 +213,7 @@
#endif
-#ifndef EIGEN_DONT_VECTORIZE
+#if !(defined(EIGEN_DONT_VECTORIZE) || defined(EIGEN_GPUCC))
#if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
@@ -253,10 +255,19 @@
#define EIGEN_VECTORIZE_SSE4_1
#define EIGEN_VECTORIZE_SSE4_2
#endif
- #ifdef __FMA__
+ #if defined(__FMA__) || (EIGEN_COMP_MSVC && defined(__AVX2__))
+    // MSVC does not expose a dedicated switch for FMA.
+ // For MSVC, AVX2 => FMA
#define EIGEN_VECTORIZE_FMA
#endif
#if defined(__AVX512F__)
+ #ifndef EIGEN_VECTORIZE_FMA
+ #if EIGEN_COMP_GNUC
+ #error Please add -mfma to your compiler flags: compiling with -mavx512f alone without SSE/AVX FMA is not supported (bug 1638).
+ #else
+ #error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638).
+ #endif
+ #endif
#define EIGEN_VECTORIZE_AVX512
#define EIGEN_VECTORIZE_AVX2
#define EIGEN_VECTORIZE_AVX
@@ -375,7 +386,7 @@
#endif
#if defined(EIGEN_HAS_CUDA_FP16)
- #include <host_defines.h>
+ #include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#endif
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 3ab3a5f50..050d15e96 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -47,11 +47,7 @@ template<typename T> struct NumTraits;
template<typename Derived> struct EigenBase;
template<typename Derived> class DenseBase;
template<typename Derived> class PlainObjectBase;
-
-
-template<typename Derived,
- int Level = internal::accessors_level<Derived>::value >
-class DenseCoeffsBase;
+template<typename Derived, int Level> class DenseCoeffsBase;
template<typename _Scalar, int _Rows, int _Cols,
int _Options = AutoAlign |
@@ -260,6 +256,7 @@ template<typename MatrixType> class HouseholderQR;
template<typename MatrixType> class ColPivHouseholderQR;
template<typename MatrixType> class FullPivHouseholderQR;
template<typename MatrixType> class CompleteOrthogonalDecomposition;
+template<typename MatrixType> class SVDBase;
template<typename MatrixType, int QRPreconditioner = ColPivHouseholderQRPreconditioner> class JacobiSVD;
template<typename MatrixType> class BDCSVD;
template<typename MatrixType, int UpLo = Lower> class LLT;
diff --git a/Eigen/src/Core/util/IndexedViewHelper.h b/Eigen/src/Core/util/IndexedViewHelper.h
index 40e16fdb4..1cda85060 100644
--- a/Eigen/src/Core/util/IndexedViewHelper.h
+++ b/Eigen/src/Core/util/IndexedViewHelper.h
@@ -23,7 +23,7 @@ struct symbolic_last_tag {};
* Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically reference the last element/row/columns
* of the underlying vector or matrix once passed to DenseBase::operator()(const RowIndices&, const ColIndices&).
*
- * This symbolic placeholder support standard arithmetic operation.
+ * This symbolic placeholder supports standard arithmetic operations.
*
* A typical usage example would be:
* \code
@@ -44,7 +44,7 @@ static const symbolic::SymbolExpr<internal::symbolic_last_tag> last; // PLEASE u
* reference the last+1 element/row/columns of the underlying vector or matrix once
* passed to DenseBase::operator()(const RowIndices&, const ColIndices&).
*
- * This symbolic placeholder support standard arithmetic operation.
+ * This symbolic placeholder supports standard arithmetic operations.
* It is essentially an alias to last+fix<1>.
*
* \sa last
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index a7c6f50c3..ce3633388 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -129,16 +129,21 @@
#define EIGEN_COMP_MSVC_STRICT 0
#endif
-/// \internal EIGEN_COMP_IBM set to 1 if the compiler is IBM XL C++
-#if defined(__IBMCPP__) || defined(__xlc__)
- #define EIGEN_COMP_IBM 1
+/// \internal EIGEN_COMP_IBM set to the xlC version if the compiler is IBM XL C++
+// XLC version
+// 3.1 0x0301
+// 4.5 0x0405
+// 5.0 0x0500
+// 12.1 0x0C01
+#if defined(__IBMCPP__) || defined(__xlc__) || defined(__ibmxl__)
+ #define EIGEN_COMP_IBM __xlC__
#else
#define EIGEN_COMP_IBM 0
#endif
-/// \internal EIGEN_COMP_PGI set to 1 if the compiler is Portland Group Compiler
+/// \internal EIGEN_COMP_PGI set to the PGI version if the compiler is Portland Group Compiler
#if defined(__PGI)
- #define EIGEN_COMP_PGI 1
+ #define EIGEN_COMP_PGI (__PGIC__*100+__PGIC_MINOR__)
#else
#define EIGEN_COMP_PGI 0
#endif
@@ -347,9 +352,17 @@
#define EIGEN_OS_WIN_STRICT 0
#endif
-/// \internal EIGEN_OS_SUN set to 1 if the OS is SUN
+/// \internal EIGEN_OS_SUN set to __SUNPRO_C if the OS is SUN
+// compiler solaris __SUNPRO_C
+// version studio
+// 5.7 10 0x570
+// 5.8 11 0x580
+// 5.9 12 0x590
+// 5.10 12.1 0x5100
+// 5.11 12.2 0x5110
+// 5.12 12.3 0x5120
#if (defined(sun) || defined(__sun)) && !(defined(__SVR4) || defined(__svr4__))
- #define EIGEN_OS_SUN 1
+ #define EIGEN_OS_SUN __SUNPRO_C
#else
#define EIGEN_OS_SUN 0
#endif
@@ -495,13 +508,33 @@
#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 0
#endif
-#if EIGEN_MAX_CPP_VER>=11 && (defined(__cplusplus) && (__cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900)
+
+// The macro EIGEN_COMP_CXXVER defines the c++ version expected by the compiler.
+// For instance, if compiling with gcc and -std=c++17, then EIGEN_COMP_CXXVER
+// is defined to 17.
+#if (defined(__cplusplus) && (__cplusplus > 201402L) || EIGEN_COMP_MSVC >= 1914)
+#define EIGEN_COMP_CXXVER 17
+#elif (defined(__cplusplus) && (__cplusplus > 201103L) || EIGEN_COMP_MSVC >= 1910)
+#define EIGEN_COMP_CXXVER 14
+#elif (defined(__cplusplus) && (__cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900)
+#define EIGEN_COMP_CXXVER 11
+#else
+#define EIGEN_COMP_CXXVER 03
+#endif
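+
+// For example, a c++14 build (EIGEN_COMP_CXXVER==14) can be detected with:
+//   #if EIGEN_COMP_CXXVER >= 14
+//   // c++14-only code path
+//   #endif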
+
+
+// The macros EIGEN_HAS_CXX?? define a rough estimate of the available c++ features,
+// but in practice we should not rely on them; rather, we should rely on the
+// availability of individual features as defined later.
+// This is why there is no EIGEN_HAS_CXX17.
+// FIXME: get rid of EIGEN_HAS_CXX14 and maybe even EIGEN_HAS_CXX11.
+#if EIGEN_MAX_CPP_VER>=11 && EIGEN_COMP_CXXVER>=11
#define EIGEN_HAS_CXX11 1
#else
#define EIGEN_HAS_CXX11 0
#endif
-#if EIGEN_MAX_CPP_VER>=14 && (defined(__cplusplus) && (__cplusplus > 201103L) || EIGEN_COMP_MSVC >= 1910)
+#if EIGEN_MAX_CPP_VER>=14 && EIGEN_COMP_CXXVER>=14
#define EIGEN_HAS_CXX14 1
#else
#define EIGEN_HAS_CXX14 0
@@ -546,6 +579,22 @@
#endif
#endif
+#ifndef EIGEN_HAS_ALIGNAS
+#if EIGEN_MAX_CPP_VER>=11 && EIGEN_HAS_CXX11 && \
+ ( __has_feature(cxx_alignas) \
+ || EIGEN_HAS_CXX14 \
+ || (EIGEN_COMP_MSVC >= 1800) \
+ || (EIGEN_GNUC_AT_LEAST(4,8)) \
+ || (EIGEN_COMP_CLANG>=305) \
+ || (EIGEN_COMP_ICC>=1500) \
+ || (EIGEN_COMP_PGI>=1500) \
+ || (EIGEN_COMP_SUNCC>=0x5130))
+#define EIGEN_HAS_ALIGNAS 1
+#else
+#define EIGEN_HAS_ALIGNAS 0
+#endif
+#endif
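+
+// When EIGEN_HAS_ALIGNAS is 1, EIGEN_ALIGN_TO_BOUNDARY(n) in
+// ConfigureVectorization.h expands to the standard alignas(n) instead of a
+// compiler-specific attribute.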
+
// Does the compiler support type_traits?
// - full support of type traits was added only to GCC 5.1.0.
// - 20150626 corresponds to the last release of 4.x libstdc++
@@ -649,6 +698,23 @@
#endif
#endif
+// NOTE: the required Apple clang version is very conservative,
+// and it could be that Xcode 9 works just fine.
+// NOTE: the MSVC version is based on https://en.cppreference.com/w/cpp/compiler_support
+// and not tested.
+#ifndef EIGEN_HAS_CXX17_OVERALIGN
+#if EIGEN_MAX_CPP_VER>=17 && EIGEN_COMP_CXXVER>=17 && ( \
+ (EIGEN_COMP_MSVC >= 1912) \
+ || (EIGEN_GNUC_AT_LEAST(7,0)) \
+ || ((!defined(__apple_build_version__)) && (EIGEN_COMP_CLANG>=500)) \
+ || (( defined(__apple_build_version__)) && (__apple_build_version__>=10000000)) \
+ )
+#define EIGEN_HAS_CXX17_OVERALIGN 1
+#else
+#define EIGEN_HAS_CXX17_OVERALIGN 0
+#endif
+#endif
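+
+// With c++17 over-aligned allocation available, a plain new of an
+// over-aligned Eigen type already returns suitably aligned storage, which is
+// what allows Memory.h to turn the aligned-new macros into no-ops.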
+
#if defined(EIGEN_CUDACC) && EIGEN_HAS_CONSTEXPR
// While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules
#if defined(__NVCC__)
@@ -742,10 +808,6 @@
// All functions callable from CUDA/HIP code must be qualified with __device__
#ifdef EIGEN_GPUCC
- #ifndef EIGEN_DONT_VECTORIZE
- #define EIGEN_DONT_VECTORIZE
- #endif
-
#define EIGEN_DEVICE_FUNC __host__ __device__
#else
#define EIGEN_DEVICE_FUNC
@@ -841,7 +903,7 @@
// Suppresses 'unused variable' warnings.
namespace Eigen {
namespace internal {
- template<typename T> EIGEN_DEVICE_FUNC void ignore_unused_variable(const T&) {}
+ template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ignore_unused_variable(const T&) {}
}
}
#define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var);
@@ -1035,7 +1097,7 @@ namespace Eigen {
#endif
#define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) \
- template <typename T> EIGEN_DEVICE_FUNC inline \
+ template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,Scalar,T)>::type,OPNAME))\
(METHOD)(const T& scalar) const { \
typedef typename internal::promote_scalar_arg<Scalar,T,EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,Scalar,T)>::type PromotedT; \
@@ -1044,7 +1106,7 @@ namespace Eigen {
}
#define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \
- template <typename T> EIGEN_DEVICE_FUNC inline friend \
+ template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend \
EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,T,Scalar)>::type,Derived,OPNAME)) \
(METHOD)(const T& scalar, const StorageBaseType& matrix) { \
typedef typename internal::promote_scalar_arg<Scalar,T,EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,T,Scalar)>::type PromotedT; \
@@ -1093,9 +1155,9 @@ namespace Eigen {
# define EIGEN_NOEXCEPT
# define EIGEN_NOEXCEPT_IF(x)
# define EIGEN_NO_THROW throw()
-# if EIGEN_COMP_MSVC
+# if EIGEN_COMP_MSVC || EIGEN_COMP_CXXVER>=17
// MSVC does not support exception specifications (warning C4290),
- // and they are deprecated in c++11 anyway.
+ // and they are deprecated in c++11 anyway. This is even an error in c++17.
# define EIGEN_EXCEPTION_SPEC(X) throw()
# else
# define EIGEN_EXCEPTION_SPEC(X) throw(X)
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 87b538658..1b12544d2 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -360,7 +360,7 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned
template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, std::size_t size)
{
destruct_elements_of_array<T>(ptr, size);
- aligned_free(ptr);
+ Eigen::internal::aligned_free(ptr);
}
/** \internal Deletes objects constructed with conditional_aligned_new
@@ -768,6 +768,17 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
*** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF] ***
*****************************************************************************/
+#if EIGEN_HAS_CXX17_OVERALIGN
+
+// C++17 -> no need to bother with alignment anymore :)
+
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign)
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size)
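+
+// Hypothetical user code is unchanged by this: a class declaring
+//   struct Body { Eigen::Vector4f v; EIGEN_MAKE_ALIGNED_OPERATOR_NEW };
+// still compiles; under c++17 the macro expands to nothing and the plain
+// operator new already honors the required alignment.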
+
+#else
+
#if EIGEN_MAX_ALIGN_BYTES!=0
#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \
@@ -810,6 +821,8 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
((EIGEN_MAX_ALIGN_BYTES>=32) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/2)==0)) || \
((EIGEN_MAX_ALIGN_BYTES>=64) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/4)==0)) )))
+#endif
+
/****************************************************************************/
/** \class aligned_allocator
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 1415b3fc1..8fcb18a94 100755
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -636,8 +636,41 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to<double>()(x,y); }
#endif
+/** \internal extract the bits of the float \a x */
+inline unsigned int as_uint(float x)
+{
+ unsigned int ret;
+ std::memcpy(&ret, &x, sizeof(float));
+ return ret;
+}
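+// e.g. as_uint(1.0f) == 0x3f800000u, the IEEE-754 bit pattern of 1.0f.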
+
} // end namespace numext
} // end namespace Eigen
+// Define portable (u)int{32,64} types
+#if EIGEN_HAS_CXX11
+#include <cstdint>
+namespace Eigen {
+namespace numext {
+typedef std::uint32_t uint32_t;
+typedef std::int32_t int32_t;
+typedef std::uint64_t uint64_t;
+typedef std::int64_t int64_t;
+}
+}
+#else
+// Without c++11, all compilers able to compile Eigen also
+// provide the C99 stdint.h header file.
+#include <stdint.h>
+namespace Eigen {
+namespace numext {
+typedef ::uint32_t uint32_t;
+typedef ::int32_t int32_t;
+typedef ::uint64_t uint64_t;
+typedef ::int64_t int64_t;
+}
+}
+#endif
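+
+// Either way, user code can portably spell fixed-width integers, e.g.:
+//   Eigen::numext::uint32_t bits = Eigen::numext::as_uint(1.0f);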
+
#endif // EIGEN_META_H
diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h
index b2f95153e..67714e444 100644
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h
@@ -104,7 +104,8 @@
STORAGE_INDEX_MUST_MATCH=1,
CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1,
SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1,
- INVALID_TEMPLATE_PARAMETER=1
+ INVALID_TEMPLATE_PARAMETER=1,
+ GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1
};
};
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 836ff4711..91c2e42e4 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -184,7 +184,8 @@ template<typename T> struct unpacket_traits
enum
{
size = 1,
- alignment = 1
+ alignment = 1,
+ vectorizable = false
};
};
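+// The generic fallback above treats any non-packet type as a scalar: e.g.
+// unpacket_traits<double>::vectorizable is false, while the SIMD packet
+// specializations (Packet4f, Packet2d, ...) are expected to set it to true.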