author    | Gael Guennebaud <g.gael@free.fr> | 2014-06-20 15:55:44 +0200
committer | Gael Guennebaud <g.gael@free.fr> | 2014-06-20 15:55:44 +0200
commit    | b29b81a1f46ad3b7340c9bbb8d1e23685e5ca756 (patch)
tree      | ec31545094cba7c9d72c9132963fa3fecd448726 /Eigen/src
parent    | 47585c8ab238f6a49b8097e221fa4b30763ef942 (diff)
parent    | 963d338922e9ef1addcd29c1b43e9b66243207c0 (diff)
merge with default branch
Diffstat (limited to 'Eigen/src')
56 files changed, 3314 insertions, 1078 deletions
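The bulk of this merge extends the generic packet API in Eigen/src/Core/GenericPacketMath.h (pload1, ploadquad, pbroadcast2/pbroadcast4, pgather/pscatter, predux4, patan, PacketBlock/ptranspose) and adds AVX implementations under Eigen/src/Core/arch/AVX/. For orientation, here is a minimal sketch of how the new strided gather/scatter entry points compose with the existing reductions; it is illustrative only (not part of the patch) and assumes an AVX-enabled build of this revision, where packet_traits<float>::type is the new Packet8f.

```cpp
// Illustrative sketch only: exercising the generic packet primitives extended
// by this merge. Assumes an AVX build of this revision (Packet8f, 8 floats per
// packet); with SSE only, the same code runs with 4-float packets.
#include <Eigen/Core>
#include <iostream>

int main()
{
  using namespace Eigen::internal;

  typedef packet_traits<float>::type Packet;   // Packet8f under AVX
  const int N = unpacket_traits<Packet>::size; // 8 under AVX

  // A strided source: the new pgather reads N scalars spaced by 'stride',
  // and pscatter writes a packet back out with a stride of its own.
  float in[8 * 4];
  for (int i = 0; i < 8 * 4; ++i) in[i] = float(i);

  Packet p = pgather<float, Packet>(in, 4);    // gathers in[0], in[4], ..., in[4*(N-1)]

  float out[8];
  pscatter<float, Packet>(out, p, 1);          // stride 1, i.e. a contiguous store

  std::cout << "packet size: " << N << "\n";
  std::cout << "sum of gathered values: " << predux(p) << "\n"; // 0+4+...+28 = 112 when N==8
  return 0;
}
```

The remaining new primitives are documented in the hunks below; ploadquad, for instance, is noted as currently being used only in the matrix-product kernels, and PacketBlock/ptranspose provide an in-register transpose of a small block of packets.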
diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index d66110821..11dd5624c 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -43,7 +43,7 @@ namespace internal { * Remember that Cholesky decompositions are not rank-revealing. Also, do not use a Cholesky * decomposition to determine whether a system of equations has a solution. * - * \sa MatrixBase::ldlt(), class LLT + * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT */ template<typename _MatrixType, int _UpLo> class LDLT { @@ -179,7 +179,7 @@ template<typename _MatrixType, int _UpLo> class LDLT * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function * computes the least-square solution of \f$ A x = b \f$ is \f$ A \f$ is singular. * - * \sa MatrixBase::ldlt() + * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt() */ #ifdef EIGEN_TEST_EVALUATORS template<typename Rhs> @@ -610,6 +610,7 @@ MatrixType LDLT<MatrixType,_UpLo>::reconstructedMatrix() const #ifndef __CUDACC__ /** \cholesky_module * \returns the Cholesky decomposition with full pivoting without square root of \c *this + * \sa MatrixBase::ldlt() */ template<typename MatrixType, unsigned int UpLo> inline const LDLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo> @@ -620,6 +621,7 @@ SelfAdjointView<MatrixType, UpLo>::ldlt() const /** \cholesky_module * \returns the Cholesky decomposition with full pivoting without square root of \c *this + * \sa SelfAdjointView::ldlt() */ template<typename Derived> inline const LDLT<typename MatrixBase<Derived>::PlainObject> diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index d9a8ef1fb..083b820f5 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -41,7 +41,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits; * Example: \include LLT_example.cpp * Output: \verbinclude LLT_example.out * - * \sa MatrixBase::llt(), class LDLT + * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT */ /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH) * Note that during the decomposition, only the upper triangular part of A is considered. 
Therefore, @@ -115,7 +115,7 @@ template<typename _MatrixType, int _UpLo> class LLT * Example: \include LLT_solve.cpp * Output: \verbinclude LLT_solve.out * - * \sa solveInPlace(), MatrixBase::llt() + * \sa solveInPlace(), MatrixBase::llt(), SelfAdjointView::llt() */ #ifdef EIGEN_TEST_EVALUATORS template<typename Rhs> @@ -498,6 +498,7 @@ MatrixType LLT<MatrixType,_UpLo>::reconstructedMatrix() const #ifndef __CUDACC__ /** \cholesky_module * \returns the LLT decomposition of \c *this + * \sa SelfAdjointView::llt() */ template<typename Derived> inline const LLT<typename MatrixBase<Derived>::PlainObject> @@ -508,6 +509,7 @@ MatrixBase<Derived>::llt() const /** \cholesky_module * \returns the LLT decomposition of \c *this + * \sa SelfAdjointView::llt() */ template<typename MatrixType, unsigned int UpLo> inline const LLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo> diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h index 0e10f125f..abbba84c3 100644 --- a/Eigen/src/Core/Assign.h +++ b/Eigen/src/Core/Assign.h @@ -105,6 +105,8 @@ public: EIGEN_DEBUG_VAR(DstIsAligned) EIGEN_DEBUG_VAR(SrcIsAligned) EIGEN_DEBUG_VAR(JointAlignment) + EIGEN_DEBUG_VAR(Derived::SizeAtCompileTime) + EIGEN_DEBUG_VAR(OtherDerived::CoeffReadCost) EIGEN_DEBUG_VAR(InnerSize) EIGEN_DEBUG_VAR(InnerMaxSize) EIGEN_DEBUG_VAR(PacketSize) diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h index 7772951b9..97134ffd7 100644 --- a/Eigen/src/Core/Assign_MKL.h +++ b/Eigen/src/Core/Assign_MKL.h @@ -202,6 +202,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(asin, Asin) EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(cos, Cos) EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(acos, Acos) EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(tan, Tan) +EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(atan, Atan) //EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs, Abs) EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(exp, Exp) EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(log, Ln) diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index ef6c143d3..70e457028 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -88,8 +88,8 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0) && (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0, - MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0, - FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0, + MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) ? AlignedBit : 0, + FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (traits<XprType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0, FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0, FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0, Flags0 = traits<XprType>::Flags & ( (HereditaryBits & ~RowMajorBit) | diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index b9dd75ade..59f515495 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -40,7 +40,7 @@ void check_static_allocation_size() */ template <typename T, int Size, int MatrixOrArrayOptions, int Alignment = (MatrixOrArrayOptions&DontAlign) ? 0 - : (((Size*sizeof(T))%16)==0) ? 16 + : (((Size*sizeof(T))%EIGEN_ALIGN_BYTES)==0) ? 
EIGEN_ALIGN_BYTES : 0 > struct plain_array { @@ -68,27 +68,27 @@ struct plain_array template<typename PtrType> EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; } #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \ - eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & sizemask) == 0 \ + eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \ && "this assertion is explained here: " \ "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \ " **** READ THIS WEB PAGE !!! ****"); #else #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \ - eigen_assert((reinterpret_cast<size_t>(array) & sizemask) == 0 \ + eigen_assert((reinterpret_cast<size_t>(array) & (sizemask)) == 0 \ && "this assertion is explained here: " \ "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \ " **** READ THIS WEB PAGE !!! ****"); #endif template <typename T, int Size, int MatrixOrArrayOptions> -struct plain_array<T, Size, MatrixOrArrayOptions, 16> +struct plain_array<T, Size, MatrixOrArrayOptions, EIGEN_ALIGN_BYTES> { - EIGEN_USER_ALIGN16 T array[Size]; + EIGEN_USER_ALIGN_DEFAULT T array[Size]; EIGEN_DEVICE_FUNC plain_array() { - EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(0xf); + EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(EIGEN_ALIGN_BYTES-1); check_static_allocation_size<T,Size>(); } @@ -102,7 +102,7 @@ struct plain_array<T, Size, MatrixOrArrayOptions, 16> template <typename T, int MatrixOrArrayOptions, int Alignment> struct plain_array<T, 0, MatrixOrArrayOptions, Alignment> { - EIGEN_USER_ALIGN16 T array[1]; + EIGEN_USER_ALIGN_DEFAULT T array[1]; EIGEN_DEVICE_FUNC plain_array() {} EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {} }; diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index f28d2e46c..734815986 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -69,8 +69,7 @@ template<typename Lhs, typename Rhs> struct product_type MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::MaxColsAtCompileTime, _Rhs::MaxRowsAtCompileTime), Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime, - _Rhs::RowsAtCompileTime), - LargeThreshold = EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD + _Rhs::RowsAtCompileTime) }; // the splitting into different lines of code here, introducing the _select enums and the typedef below, @@ -424,7 +423,7 @@ struct gemv_static_vector_if<Scalar,Size,MaxSize,true> internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data; EIGEN_STRONG_INLINE Scalar* data() { return ForceAlignment - ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(15))) + 16) + ? 
reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES) : m_data.array; } #endif diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index b0469fa1e..4523d2263 100644..100755 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -42,6 +42,8 @@ namespace internal { struct default_packet_traits { enum { + HasHalfPacket = 0, + HasAdd = 1, HasSub = 1, HasMul = 1, @@ -71,10 +73,12 @@ struct default_packet_traits template<typename T> struct packet_traits : default_packet_traits { typedef T type; + typedef T half; enum { Vectorizable = 0, size = 1, - AlignedOnScalar = 0 + AlignedOnScalar = 0, + HasHalfPacket = 0 }; enum { HasAdd = 0, @@ -157,17 +161,65 @@ pload(const typename unpacket_traits<Packet>::type* from) { return *from; } template<typename Packet> EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; } +/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ +template<typename Packet> EIGEN_DEVICE_FUNC inline Packet +pset1(const typename unpacket_traits<Packet>::type& a) { return a; } + +/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */ +template<typename Packet> EIGEN_DEVICE_FUNC inline Packet +pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(*a); } + /** \internal \returns a packet with elements of \a *from duplicated. - * For instance, for a packet of 8 elements, 4 scalar will be read from \a *from and - * duplicated to form: {from[0],from[0],from[1],from[1],,from[2],from[2],,from[3],from[3]} + * For instance, for a packet of 8 elements, 4 scalars will be read from \a *from and + * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]} * Currently, this function is only used for scalar * complex products. - */ + */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; } -/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ +/** \internal \returns a packet with elements of \a *from quadrupled. + * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and + * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]} + * Currently, this function is only used in matrix products. 
+ * For packet-size smaller or equal to 4, this function is equivalent to pload1 + */ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet -pset1(const typename unpacket_traits<Packet>::type& a) { return a; } +ploadquad(const typename unpacket_traits<Packet>::type* from) +{ return pload1<Packet>(from); } + +/** \internal equivalent to + * \code + * a0 = pload1(a+0); + * a1 = pload1(a+1); + * a2 = pload1(a+2); + * a3 = pload1(a+3); + * \endcode + * \sa pset1, pload1, ploaddup, pbroadcast2 + */ +template<typename Packet> EIGEN_DEVICE_FUNC +inline void pbroadcast4(const typename unpacket_traits<Packet>::type *a, + Packet& a0, Packet& a1, Packet& a2, Packet& a3) +{ + a0 = pload1<Packet>(a+0); + a1 = pload1<Packet>(a+1); + a2 = pload1<Packet>(a+2); + a3 = pload1<Packet>(a+3); +} + +/** \internal equivalent to + * \code + * a0 = pload1(a+0); + * a1 = pload1(a+1); + * \endcode + * \sa pset1, pload1, ploaddup, pbroadcast4 + */ +template<typename Packet> EIGEN_DEVICE_FUNC +inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a, + Packet& a0, Packet& a1) +{ + a0 = pload1<Packet>(a+0); + a1 = pload1<Packet>(a+1); +} /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */ template<typename Scalar> inline typename packet_traits<Scalar>::type @@ -179,7 +231,13 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore( /** \internal copy the packet \a from to \a *to, (un-aligned store) */ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from) -{ (*to) = from; } +{ (*to) = from; } + + template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, int /*stride*/) + { return ploadu<Packet>(from); } + + template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, int /*stride*/) + { pstore(to, from); } /** \internal tries to do cache prefetching of \a addr */ template<typename Scalar> inline void prefetch(const Scalar* addr) @@ -201,6 +259,15 @@ preduxp(const Packet* vecs) { return vecs[0]; } template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a) { return a; } +/** \internal \returns the sum of the elements of \a a by block of 4 elements. + * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7} + * For packet-size smaller or equal to 4, this boils down to a noop. 
+ */ +template<typename Packet> EIGEN_DEVICE_FUNC inline +typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type +predux4(const Packet& a) +{ return a; } + /** \internal \returns the product of the elements of \a a*/ template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a) { return a; } @@ -251,6 +318,10 @@ Packet pasin(const Packet& a) { using std::asin; return asin(a); } template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos(const Packet& a) { using std::acos; return acos(a); } +/** \internal \returns the atan of \a a (coeff-wise) */ +template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet patan(const Packet& a) { using std::atan; return atan(a); } + /** \internal \returns the exp of \a a (coeff-wise) */ template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp(const Packet& a) { using std::exp; return exp(a); } @@ -348,9 +419,22 @@ template<> inline std::complex<double> pmul(const std::complex<double>& a, const #endif + +/*************************************************************************** + * PacketBlock, that is a collection of N packets where the number of words + * in the packet is a multiple of N. +***************************************************************************/ +template <typename Packet,int N=unpacket_traits<Packet>::size> struct PacketBlock { + Packet packet[N]; +}; + +template<typename Packet> EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet,1>& /*kernel*/) { + // Nothing to do in the scalar case, i.e. a 1x1 matrix. +} + } // end namespace internal } // end namespace Eigen #endif // EIGEN_GENERIC_PACKET_MATH_H - diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h index 2acf97723..2067a2a6e 100644 --- a/Eigen/src/Core/GlobalFunctions.h +++ b/Eigen/src/Core/GlobalFunctions.h @@ -45,6 +45,7 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op) diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 23bbb46bf..6b2d61909 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -89,7 +89,7 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> > && ( bool(IsDynamicSize) || HasNoOuterStride || ( OuterStrideAtCompileTime!=Dynamic - && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%16)==0 ) ), + && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%EIGEN_ALIGN_BYTES)==0 ) ), Flags0 = TraitsBase::Flags & (~NestByRefBit), Flags1 = IsAligned ? 
(int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit), Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime)) diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index de1424b09..927a59c50 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -166,11 +166,10 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors> EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits<Derived>::Flags&PacketAccessBit, internal::inner_stride_at_compile_time<Derived>::ret==1), PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1); - eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % 16) == 0) && "data is not aligned"); + eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned"); #else - eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::IsAligned, (size_t(m_data) % 16) == 0) && "data is not aligned"); + eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::IsAligned, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned"); #endif - } PointerType m_data; @@ -255,6 +254,8 @@ template<typename Derived> class MapBase<Derived, WriteAccessors> using Base::Base::operator=; }; +#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS + } // end namespace Eigen #endif // EIGEN_MAPBASE_H diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 63fb92b75..20fc2be74 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -669,6 +669,15 @@ bool (isfinite)(const T& x) return x<NumTraits<T>::highest() && x>NumTraits<T>::lowest(); } +template<typename T> +EIGEN_DEVICE_FUNC +bool (isfinite)(const std::complex<T>& x) +{ + using std::real; + using std::imag; + return isfinite(real(x)) && isfinite(imag(x)); +} + } // end namespace numext namespace internal { diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index 1e2270d34..136cd165e 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -325,10 +325,16 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling> }; static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) { - Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func)); - if (VectorizedSize != Size) - res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func)); - return res; + eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); + if (VectorizedSize > 0) { + Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func)); + if (VectorizedSize != Size) + res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func)); + return res; + } + else { + return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func); + } } }; diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h index 665b7497a..fe493641d 100644 --- a/Eigen/src/Core/Ref.h +++ b/Eigen/src/Core/Ref.h @@ -97,7 +97,7 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> > template<typename Derived> struct match { enum { HasDirectAccess = internal::has_direct_access<Derived>::ret, - StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), + StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || 
((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic) || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime) || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1), @@ -168,8 +168,12 @@ protected: } else ::new (static_cast<Base*>(this)) Base(expr.data(), expr.rows(), expr.cols()); - ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(), - StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride()); + + if(Expression::IsVectorAtCompileTime && (!PlainObjectType::IsVectorAtCompileTime) && ((Expression::Flags&RowMajorBit)!=(PlainObjectType::Flags&RowMajorBit))) + ::new (&m_stride) StrideBase(expr.innerStride(), StrideType::InnerStrideAtCompileTime==0?0:1); + else + ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(), + StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride()); } StrideBase m_stride; diff --git a/Eigen/src/Core/arch/AVX/CMakeLists.txt b/Eigen/src/Core/arch/AVX/CMakeLists.txt new file mode 100644 index 000000000..bdb71ab99 --- /dev/null +++ b/Eigen/src/Core/arch/AVX/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_Core_arch_AVX_SRCS "*.h") + +INSTALL(FILES + ${Eigen_Core_arch_AVX_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AVX COMPONENT Devel +) diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h new file mode 100644 index 000000000..9ced85132 --- /dev/null +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -0,0 +1,463 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_COMPLEX_AVX_H +#define EIGEN_COMPLEX_AVX_H + +namespace Eigen { + +namespace internal { + +//---------- float ---------- +struct Packet4cf +{ + EIGEN_STRONG_INLINE Packet4cf() {} + EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {} + __m256 v; +}; + +template<> struct packet_traits<std::complex<float> > : default_packet_traits +{ + typedef Packet4cf type; + typedef Packet2cf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4}; typedef Packet2cf half; }; + +template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a) +{ + return Packet4cf(pnegate(a.v)); +} +template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a) +{ + const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet4cf(_mm256_xor_ps(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b) +{ + __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v); + __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1))); + __m256 result = _mm256_addsub_ps(tmp1, tmp2); + return Packet4cf(result); +} + +template<> EIGEN_STRONG_INLINE Packet4cf pand <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf por <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pxor <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); } +template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); } + + +template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from) +{ + return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from))); +} + +template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from) +{ + // FIXME The following might be optimized using _mm256_movedup_pd + Packet2cf a = ploaddup<Packet2cf>(from); + Packet2cf b = ploaddup<Packet2cf>(from+1); + return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1)); +} + +template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> 
>(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, int stride) +{ + return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]), + std::imag(from[2*stride]), std::real(from[2*stride]), + std::imag(from[1*stride]), std::real(from[1*stride]), + std::imag(from[0*stride]), std::real(from[0*stride]))); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, int stride) +{ + __m128 low = _mm256_extractf128_ps(from.v, 0); + to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)), + _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1))); + to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)), + _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3))); + + __m128 high = _mm256_extractf128_ps(from.v, 1); + to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)), + _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1))); + to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)), + _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3))); + +} + +template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a) +{ + return pfirst(Packet2cf(_mm256_castps256_ps128(a.v))); +} + +template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) { + __m128 low = _mm256_extractf128_ps(a.v, 0); + __m128 high = _mm256_extractf128_ps(a.v, 1); + __m128d lowd = _mm_castps_pd(low); + __m128d highd = _mm_castps_pd(high); + low = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1)); + high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1)); + __m256 result = _mm256_setzero_ps(); + result = _mm256_insertf128_ps(result, low, 1); + result = _mm256_insertf128_ps(result, high, 0); + return Packet4cf(result); +} + +template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a) +{ + return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)), + Packet2cf(_mm256_extractf128_ps(a.v,1)))); +} + +template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs) +{ + Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0)); + Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0)); + t0 = _mm256_hadd_ps(t0,t1); + Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0)); + Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0)); + t2 = _mm256_hadd_ps(t2,t3); + + t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4)); + t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4)); + + return Packet4cf(_mm256_add_ps(t1,t3)); +} + +template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a) +{ + return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)), + Packet2cf(_mm256_extractf128_ps(a.v, 1)))); +} + +template<int Offset> +struct palign_impl<Offset,Packet4cf> +{ + static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second) + { + if (Offset==0) return; + palign_impl<Offset*2,Packet8f>::run(first.v, second.v); + } +}; + +template<> struct conj_helper<Packet4cf, Packet4cf, false,true> +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const + { 
+ return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper<Packet4cf, Packet4cf, true,false> +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper<Packet4cf, Packet4cf, true,true> +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +template<> struct conj_helper<Packet8f, Packet4cf, false,false> +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const + { return Packet4cf(Eigen::internal::pmul(x, y.v)); } +}; + +template<> struct conj_helper<Packet4cf, Packet8f, false,false> +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const + { return Packet4cf(Eigen::internal::pmul(x.v, y)); } +}; + +template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b) +{ + Packet4cf num = pmul(a, pconj(b)); + __m256 tmp = _mm256_mul_ps(b.v, b.v); + __m256 tmp2 = _mm256_shuffle_ps(tmp,tmp,0xB1); + __m256 denom = _mm256_add_ps(tmp, tmp2); + return Packet4cf(_mm256_div_ps(num.v, denom)); +} + +template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x) +{ + return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1))); +} + +//---------- double ---------- +struct Packet2cd +{ + EIGEN_STRONG_INLINE Packet2cd() {} + EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {} + __m256d v; +}; + +template<> struct packet_traits<std::complex<double> > : default_packet_traits +{ + typedef Packet2cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 2, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2}; typedef Packet1cd half; }; + +template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a) +{ + const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0)); + return Packet2cd(_mm256_xor_pd(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b) +{ + __m256d tmp1 = _mm256_shuffle_pd(a.v,a.v,0x0); + __m256d even = _mm256_mul_pd(tmp1, b.v); + __m256d tmp2 = _mm256_shuffle_pd(a.v,a.v,0xF); + __m256d tmp3 = _mm256_shuffle_pd(b.v,b.v,0x5); + __m256d odd = _mm256_mul_pd(tmp2, 
tmp3); + return Packet2cd(_mm256_addsub_pd(even, odd)); +} + +template<> EIGEN_STRONG_INLINE Packet2cd pand <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd por <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pxor <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from)); } + +template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from) +{ + // in case casting to a __m128d* is really not safe, then we can still fallback to this version: (much slower though) +// return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from)); + return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from)); +} + +template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); } + +template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, int stride) +{ + return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]), + std::imag(from[0*stride]), std::real(from[0*stride]))); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, int stride) +{ + __m128d low = _mm256_extractf128_pd(from.v, 0); + to[stride*0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1))); + __m128d high = _mm256_extractf128_pd(from.v, 1); + to[stride*1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1))); +} + +template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a) +{ + __m128d low = _mm256_extractf128_pd(a.v, 0); + EIGEN_ALIGN16 double res[2]; + _mm_store_pd(res, low); + return std::complex<double>(res[0],res[1]); +} + +template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) { + __m256d result = _mm256_permute2f128_pd(a.v, a.v, 1); + return Packet2cd(result); +} + +template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a) +{ + return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)), + Packet1cd(_mm256_extractf128_pd(a.v,1)))); +} + +template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs) +{ + Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4)); + Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4)); + + return Packet2cd(_mm256_add_pd(t0,t1)); +} + +template<> 
EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a) +{ + return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)), + Packet1cd(_mm256_extractf128_pd(a.v,1)))); +} + +template<int Offset> +struct palign_impl<Offset,Packet2cd> +{ + static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second) + { + if (Offset==0) return; + palign_impl<Offset*2,Packet4d>::run(first.v, second.v); + } +}; + +template<> struct conj_helper<Packet2cd, Packet2cd, false,true> +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper<Packet2cd, Packet2cd, true,false> +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper<Packet2cd, Packet2cd, true,true> +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +template<> struct conj_helper<Packet4d, Packet2cd, false,false> +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const + { return Packet2cd(Eigen::internal::pmul(x, y.v)); } +}; + +template<> struct conj_helper<Packet2cd, Packet4d, false,false> +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const + { return Packet2cd(Eigen::internal::pmul(x.v, y)); } +}; + +template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b) +{ + Packet2cd num = pmul(a, pconj(b)); + __m256d tmp = _mm256_mul_pd(b.v, b.v); + __m256d denom = _mm256_hadd_pd(tmp, tmp); + return Packet2cd(_mm256_div_pd(num.v, denom)); +} + +template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x) +{ + return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5)); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet4cf,4>& kernel) { + __m256d P0 = _mm256_castps_pd(kernel.packet[0].v); + __m256d P1 = _mm256_castps_pd(kernel.packet[1].v); + __m256d P2 = _mm256_castps_pd(kernel.packet[2].v); + __m256d P3 = _mm256_castps_pd(kernel.packet[3].v); + + __m256d T0 = _mm256_shuffle_pd(P0, P1, 15); + __m256d T1 = _mm256_shuffle_pd(P0, P1, 0); + __m256d T2 = _mm256_shuffle_pd(P2, P3, 15); + __m256d T3 = _mm256_shuffle_pd(P2, P3, 0); + + kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32)); + kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49)); + kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32)); + kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49)); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet2cd,2>& kernel) { + __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4)); + 
kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4)); + kernel.packet[0].v = tmp; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_COMPLEX_AVX_H diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h new file mode 100644 index 000000000..8b8307d75 --- /dev/null +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -0,0 +1,564 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_AVX_H +#define EIGEN_PACKET_MATH_AVX_H + +namespace Eigen { + +namespace internal { + +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif + +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) +#endif + +#ifdef EIGEN_VECTORIZE_FMA +#ifndef EIGEN_HAS_FUSED_MADD +#define EIGEN_HAS_FUSED_MADD 1 +#endif +#endif + +typedef __m256 Packet8f; +typedef __m256i Packet8i; +typedef __m256d Packet4d; + +template<> struct is_arithmetic<__m256> { enum { value = true }; }; +template<> struct is_arithmetic<__m256i> { enum { value = true }; }; +template<> struct is_arithmetic<__m256d> { enum { value = true }; }; + +#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \ + const Packet8f p8f_##NAME = pset1<Packet8f>(X) + +#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \ + const Packet4d p4d_##NAME = pset1<Packet4d>(X) + + +template<> struct packet_traits<float> : default_packet_traits +{ + typedef Packet8f type; + typedef Packet4f half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=8, + HasHalfPacket = 1, + + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 0, + HasSqrt = 0 + }; + }; +template<> struct packet_traits<double> : default_packet_traits +{ + typedef Packet4d type; + typedef Packet2d half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 1, + + HasDiv = 1, + HasExp = 0 + }; +}; + +/* Proper support for integers is only provided by AVX2. In the meantime, we'll + use SSE instructions and packets to deal with integers. 
+template<> struct packet_traits<int> : default_packet_traits +{ + typedef Packet8i type; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=8 + }; +}; +*/ + +template<> struct unpacket_traits<Packet8f> { typedef float type; typedef Packet4f half; enum {size=8}; }; +template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4}; }; +template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8}; }; + +template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); } +template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); } +template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) { return _mm256_set1_epi32(from); } + +template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) { return _mm256_broadcast_ss(from); } +template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); } + +template<> EIGEN_STRONG_INLINE Packet8f plset<float>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); } +template<> EIGEN_STRONG_INLINE Packet4d plset<double>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); } + +template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) +{ + return _mm256_sub_ps(_mm256_set1_ps(0.0),a); +} +template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a) +{ + return _mm256_sub_pd(_mm256_set1_pd(0.0),a); +} + +template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); } + + +template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/) +{ eigen_assert(false && "packet integer division are not supported by AVX"); + return pset1<Packet8i>(0); +} + +#ifdef EIGEN_VECTORIZE_FMA +template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { +#if defined(__clang__) || defined(__GNUC__) + // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, + // and gcc stupidly generates a vfmadd132ps instruction, + // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate + // the result of the product. 
+ Packet8f res = c; + asm("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + return _mm256_fmadd_ps(a,b,c); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { +#if defined(__clang__) || defined(__GNUC__) + // see above + Packet4d res = c; + asm("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + return _mm256_fmadd_pd(a,b,c); +#endif +} +#endif + +template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); } +template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); } +template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); } + +template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); } +template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); } +template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); } + +// Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3} +template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from) +{ + // TODO try to find a way to avoid the need of a temporary register +// Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from)); +// tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1); +// return _mm256_unpacklo_ps(tmp,tmp); + + // _mm256_insertf128_ps is very slow on Haswell, thus: + Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from); + // mimic an "inplace" permutation of the lower 
128bits using a blend + tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15); + // then we can perform a consistent permutation on the global register to get everything in shape: + return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2)); +} +// Loads 2 doubles from memory a returns the packet {a0, a0 a1, a1} +template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from) +{ + Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from); + return _mm256_permute_pd(tmp, 3<<2); +} + +// Loads 2 floats from memory a returns the packet {a0, a0 a0, a0, a1, a1, a1, a1} +template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from) +{ + Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from)); + return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1); +} + +template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); } +template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); } +template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } + +template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } + +// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available +// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4); +template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, int stride) +{ + return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride], + from[3*stride], from[2*stride], from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, int stride) +{ + return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, int stride) +{ + __m128 low = _mm256_extractf128_ps(from, 0); + to[stride*0] = _mm_cvtss_f32(low); + to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)); + to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)); + to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)); + + __m128 high = _mm256_extractf128_ps(from, 1); + to[stride*4] = _mm_cvtss_f32(high); + to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)); + to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)); + to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, int stride) +{ + __m128d low = _mm256_extractf128_pd(from, 0); + to[stride*0] = _mm_cvtsd_f64(low); + to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)); + __m128d high = _mm256_extractf128_pd(from, 1); + to[stride*2] = _mm_cvtsd_f64(high); + to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 
1)); +} + +template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a) +{ + Packet8f pa = pset1<Packet8f>(a); + pstore(to, pa); +} +template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a) +{ + Packet4d pa = pset1<Packet4d>(a); + pstore(to, pa); +} +template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a) +{ + Packet8i pa = pset1<Packet8i>(a); + pstore(to, pa); +} + +template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } + +template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) { + return _mm_cvtss_f32(_mm256_castps256_ps128(a)); +} +template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) { + return _mm_cvtsd_f64(_mm256_castpd256_pd128(a)); +} +template<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) { + return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); +} + + +template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) +{ + __m256 tmp = _mm256_shuffle_ps(a,a,0x1b); + return _mm256_permute2f128_ps(tmp, tmp, 1); +} +template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a) +{ + __m256d tmp = _mm256_shuffle_pd(a,a,5); + return _mm256_permute2f128_pd(tmp, tmp, 1); + + __m256d swap_halves = _mm256_permute2f128_pd(a,a,1); + return _mm256_permute_pd(swap_halves,5); +} + +// pabs should be ok +template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) +{ + const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); + return _mm256_and_ps(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) +{ + const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); + return _mm256_and_pd(a,mask); +} + +// preduxp should be ok +// FIXME: why is this ok? why isn't the simply implementation working as expected? 
+template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs) +{ + __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]); + __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]); + __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]); + __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]); + + __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1); + __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2); + __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3); + __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4); + + __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); + __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); + __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); + __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); + + __m256 sum1 = _mm256_add_ps(perm1, hsum5); + __m256 sum2 = _mm256_add_ps(perm2, hsum6); + __m256 sum3 = _mm256_add_ps(perm3, hsum7); + __m256 sum4 = _mm256_add_ps(perm4, hsum8); + + __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); + __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); + + __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0); + return final; +} +template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs) +{ + Packet4d tmp0, tmp1; + + tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]); + tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); + + tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]); + tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); + + return _mm256_blend_pd(tmp0, tmp1, 0xC); +} + +template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) +{ + Packet8f tmp0 = _mm256_hadd_ps(a,_mm256_permute2f128_ps(a,a,1)); + tmp0 = _mm256_hadd_ps(tmp0,tmp0); + return pfirst(_mm256_hadd_ps(tmp0, tmp0)); +} +template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a) +{ + Packet4d tmp0 = _mm256_hadd_pd(a,_mm256_permute2f128_pd(a,a,1)); + return pfirst(_mm256_hadd_pd(tmp0,tmp0)); +} + +template<> EIGEN_STRONG_INLINE Packet4f predux4<Packet8f>(const Packet8f& a) +{ + return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)); +} + +template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a) +{ + Packet8f tmp; + tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1)); + tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); + return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); +} +template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a) +{ + Packet4d tmp; + tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1)); + return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1))); +} + +template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a) +{ + Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1)); + tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); + return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); +} +template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a) +{ + Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1)); + return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); +} + +template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a) +{ + Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1)); + tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); + return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); +} + +template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a) +{ + Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1)); + 
return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); +} + + +template<int Offset> +struct palign_impl<Offset,Packet8f> +{ + static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second) + { + if (Offset==1) + { + first = _mm256_blend_ps(first, second, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); + first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0x88); + } + else if (Offset==2) + { + first = _mm256_blend_ps(first, second, 3); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); + first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xcc); + } + else if (Offset==3) + { + first = _mm256_blend_ps(first, second, 7); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); + first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xee); + } + else if (Offset==4) + { + first = _mm256_blend_ps(first, second, 15); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0)); + first = _mm256_permute_ps(_mm256_permute2f128_ps (tmp, tmp, 1), _MM_SHUFFLE(3,2,1,0)); + } + else if (Offset==5) + { + first = _mm256_blend_ps(first, second, 31); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0x88); + } + else if (Offset==6) + { + first = _mm256_blend_ps(first, second, 63); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0xcc); + } + else if (Offset==7) + { + first = _mm256_blend_ps(first, second, 127); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0xee); + } + } +}; + +template<int Offset> +struct palign_impl<Offset,Packet4d> +{ + static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second) + { + if (Offset==1) + { + first = _mm256_blend_pd(first, second, 1); + __m256d tmp = _mm256_permute_pd(first, 5); + first = _mm256_permute2f128_pd(tmp, tmp, 1); + first = _mm256_blend_pd(tmp, first, 0xA); + } + else if (Offset==2) + { + first = _mm256_blend_pd(first, second, 3); + first = _mm256_permute2f128_pd(first, first, 1); + } + else if (Offset==3) + { + first = _mm256_blend_pd(first, second, 7); + __m256d tmp = _mm256_permute_pd(first, 5); + first = _mm256_permute2f128_pd(tmp, tmp, 1); + first = _mm256_blend_pd(tmp, first, 5); + } + } +}; + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet8f,8>& kernel) { + __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); + __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); + __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]); + __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]); + __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]); + __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]); + __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]); + __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]); + __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0)); + __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2)); + __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0)); + __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2)); + __m256 S4 = 
_mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0)); + __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2)); + __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0)); + __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2)); + kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20); + kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20); + kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20); + kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20); + kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31); + kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31); + kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31); + kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet8f,4>& kernel) { + __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); + __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); + __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]); + __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]); + + __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0)); + __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2)); + __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0)); + __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2)); + + kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20); + kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20); + kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31); + kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet4d,4>& kernel) { + __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15); + __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0); + __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15); + __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0); + + kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32); + kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49); + kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32); + kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PACKET_MATH_AVX_H diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 68d9a2bff..5409ddedd 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -21,6 +21,8 @@ static Packet16uc p16uc_COMPLEX_REV = vec_sld(p16uc_REVERSE, p16uc_REVERSE, 8); static Packet16uc p16uc_COMPLEX_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; static Packet16uc p16uc_PSET_HI = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 1));//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; static Packet16uc p16uc_PSET_LO = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 2), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 3));//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 }; +static Packet16uc p16uc_COMPLEX_TRANSPOSE_0 = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; +static Packet16uc p16uc_COMPLEX_TRANSPOSE_1 = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; //---------- float ---------- struct Packet2cf @@ -33,6 +35,7 @@ struct Packet2cf template<> struct packet_traits<std::complex<float> > : default_packet_traits { typedef Packet2cf type; + typedef Packet2cf half; enum { Vectorizable = 1, 
AlignedOnScalar = 1, @@ -51,7 +54,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits }; }; -template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; }; +template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) { @@ -65,6 +68,22 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo return res; } +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride) +{ + std::complex<float> EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return Packet2cf(vec_ld(0, (const float*)af)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride) +{ + std::complex<float> EIGEN_ALIGN16 af[2]; + vec_st(from.v, 0, (float*)af); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} + + template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); } @@ -210,6 +229,13 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX_REV)); } +EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel) +{ + Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_0); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_1); + kernel.packet[0].v = tmp; +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 45d1954f7..0e9adf450 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -18,6 +18,10 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 #endif +#ifndef EIGEN_HAS_FUSED_MADD +#define EIGEN_HAS_FUSED_MADD 1 +#endif + #ifndef EIGEN_HAS_FUSE_CJMADD #define EIGEN_HAS_FUSE_CJMADD 1 #endif @@ -73,6 +77,7 @@ static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui) template<> struct packet_traits<float> : default_packet_traits { typedef Packet4f type; + typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, @@ -89,6 +94,7 @@ template<> struct packet_traits<float> : default_packet_traits template<> struct packet_traits<int> : default_packet_traits { typedef Packet4i type; + typedef Packet4i half; enum { // FIXME check the Has* Vectorizable = 1, @@ -97,8 +103,8 @@ template<> struct packet_traits<int> : default_packet_traits }; }; -template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; }; -template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; }; +template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; }; +template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; }; /* inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) { @@ -144,6 +150,7 @@ inline std::ostream & 
operator <<(std::ostream & s, const Packetbi & v) return s; } */ + template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html float EIGEN_ALIGN16 af[4]; @@ -161,6 +168,65 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return vc; } + +template<> EIGEN_STRONG_INLINE void +pbroadcast4<Packet4f>(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = vec_ld(0,a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} +template<> EIGEN_STRONG_INLINE void +pbroadcast4<Packet4i>(const int *a, + Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) +{ + a3 = vec_ld(0,a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} + +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride) +{ + float EIGEN_ALIGN16 af[4]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + af[2] = from[2*stride]; + af[3] = from[3*stride]; + return vec_ld(0, af); +} +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride) +{ + int EIGEN_ALIGN16 ai[4]; + ai[0] = from[0*stride]; + ai[1] = from[1*stride]; + ai[2] = from[2*stride]; + ai[3] = from[3*stride]; + return vec_ld(0, ai); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride) +{ + float EIGEN_ALIGN16 af[4]; + vec_st(from, 0, af); + to[0*stride] = af[0]; + to[1*stride] = af[1]; + to[2*stride] = af[2]; + to[3*stride] = af[3]; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride) +{ + int EIGEN_ALIGN16 ai[4]; + vec_st(from, 0, ai); + to[0*stride] = ai[0]; + to[1*stride] = ai[1]; + to[2*stride] = ai[2]; + to[3*stride] = ai[3]; +} + template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); } @@ -494,6 +560,32 @@ struct palign_impl<Offset,Packet4i> } }; +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet4f,4>& kernel) { + Packet4f t0, t1, t2, t3; + t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet4i,4>& kernel) { + Packet4i t0, t1, t2, t3; + t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/CMakeLists.txt b/Eigen/src/Core/arch/CMakeLists.txt index 8456dec15..0db8c558d 100644 --- a/Eigen/src/Core/arch/CMakeLists.txt +++ b/Eigen/src/Core/arch/CMakeLists.txt @@ -1,4 +1,5 @@ ADD_SUBDIRECTORY(SSE) ADD_SUBDIRECTORY(AltiVec) 
ADD_SUBDIRECTORY(NEON) +ADD_SUBDIRECTORY(AVX) ADD_SUBDIRECTORY(Default) diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 8d9255eef..259f2e7b8 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -28,6 +28,7 @@ struct Packet2cf template<> struct packet_traits<std::complex<float> > : default_packet_traits { typedef Packet2cf type; + typedef Packet2cf half; enum { Vectorizable = 1, AlignedOnScalar = 1, @@ -46,7 +47,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits }; }; -template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; }; +template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) { @@ -110,6 +111,22 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex< template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride) +{ + Packet4f res; + res = vsetq_lane_f32(std::real(from[0*stride]), res, 0); + res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1); + res = vsetq_lane_f32(std::real(from[1*stride]), res, 2); + res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3); + return Packet2cf(res); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride) +{ + to[stride*0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1)); + to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3)); +} + template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((float *)addr); } template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) @@ -246,6 +263,14 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); } +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet2cf,2>& kernel) { + float32x4_t tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v)); + kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v)); + kernel.packet[1].v = tmp; +} + + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 05e891df2..e5eb06f36 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -66,6 +66,7 @@ typedef uint32x4_t Packet4ui; template<> struct packet_traits<float> : default_packet_traits { typedef Packet4f type; + typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, @@ -83,6 +84,7 @@ template<> struct packet_traits<float> : default_packet_traits template<> struct packet_traits<int> : default_packet_traits { typedef Packet4i type; + typedef Packet4i half; enum { Vectorizable = 1, AlignedOnScalar = 1, @@ -95,12 +97,13 @@ template<> 
struct packet_traits<int> : default_packet_traits // workaround gcc 4.2, 4.3 and 4.4 compilatin issue EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); } EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); } +EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); } EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); } EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } #endif -template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; }; -template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; }; +template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; }; +template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; }; template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); } template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); } @@ -219,6 +222,40 @@ template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& f template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride) +{ + Packet4f res; + res = vsetq_lane_f32(from[0*stride], res, 0); + res = vsetq_lane_f32(from[1*stride], res, 1); + res = vsetq_lane_f32(from[2*stride], res, 2); + res = vsetq_lane_f32(from[3*stride], res, 3); + return res; +} +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride) +{ + Packet4i res; + res = vsetq_lane_s32(from[0*stride], res, 0); + res = vsetq_lane_s32(from[1*stride], res, 1); + res = vsetq_lane_s32(from[2*stride], res, 2); + res = vsetq_lane_s32(from[3*stride], res, 3); + return res; +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride) +{ + to[stride*0] = vgetq_lane_f32(from, 0); + to[stride*1] = vgetq_lane_f32(from, 1); + to[stride*2] = vgetq_lane_f32(from, 2); + to[stride*3] = vgetq_lane_f32(from, 3); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride) +{ + to[stride*0] = vgetq_lane_s32(from, 0); + to[stride*1] = vgetq_lane_s32(from, 1); + to[stride*2] = vgetq_lane_s32(from, 2); + to[stride*3] = vgetq_lane_s32(from, 3); +} + template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ARM_PREFETCH(addr); } @@ -385,6 +422,7 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) a_lo = vget_low_s32(a); a_hi = vget_high_s32(a); max = vpmax_s32(a_lo, a_hi); + max = vpmax_s32(max, max); return vget_lane_s32(max, 0); } @@ -410,9 +448,30 @@ PALIGN_NEON(0,Packet4i,vextq_s32) PALIGN_NEON(1,Packet4i,vextq_s32) PALIGN_NEON(2,Packet4i,vextq_s32) PALIGN_NEON(3,Packet4i,vextq_s32) - + #undef PALIGN_NEON +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet4f,4>& kernel) { + float32x4x2_t tmp1 = 
vzipq_f32(kernel.packet[0], kernel.packet[1]); + float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]); + + kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0])); + kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0])); + kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1])); + kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1])); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet4i,4>& kernel) { + int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]); + int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]); + kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0])); + kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0])); + kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1])); + kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1])); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 91bba5e38..758183c18 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -22,13 +22,18 @@ struct Packet2cf __m128 v; }; +// Use the packet_traits defined in AVX/PacketMath.h instead if we're going +// to leverage AVX instructions. +#ifndef EIGEN_VECTORIZE_AVX template<> struct packet_traits<std::complex<float> > : default_packet_traits { typedef Packet2cf type; + typedef Packet2cf half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = 2, + HasHalfPacket = 0, HasAdd = 1, HasSub = 1, @@ -42,8 +47,9 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits HasSetLinear = 0 }; }; +#endif -template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; }; +template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } @@ -104,8 +110,23 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); } -template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } +template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); } +template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); } + + +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride) +{ + return 
Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]), + std::imag(from[0*stride]), std::real(from[0*stride]))); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride) +{ + to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)), + _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1))); + to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)), + _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3))); +} template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } @@ -124,7 +145,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Pack #endif } -template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(_mm_castps_pd(a.v)))); } +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); } template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) { @@ -214,7 +235,7 @@ template<> struct conj_helper<Packet4f, Packet2cf, false,false> { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const - { return Packet2cf(Eigen::internal::pmul(x, y.v)); } + { return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); } }; template<> struct conj_helper<Packet2cf, Packet4f, false,false> @@ -223,7 +244,7 @@ template<> struct conj_helper<Packet2cf, Packet4f, false,false> { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const - { return Packet2cf(Eigen::internal::pmul(x.v, y)); } + { return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); } }; template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) @@ -248,13 +269,18 @@ struct Packet1cd __m128d v; }; +// Use the packet_traits defined in AVX/PacketMath.h instead if we're going +// to leverage AVX instructions. 
+#ifndef EIGEN_VECTORIZE_AVX template<> struct packet_traits<std::complex<double> > : default_packet_traits { typedef Packet1cd type; + typedef Packet1cd half; enum { Vectorizable = 1, AlignedOnScalar = 0, size = 1, + HasHalfPacket = 0, HasAdd = 1, HasSub = 1, @@ -268,12 +294,13 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits HasSetLinear = 0 }; }; +#endif -template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; }; +template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); @@ -311,8 +338,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<dou template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); } // FIXME force unaligned store, this is a temporary fix -template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); } +template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); } template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } @@ -410,7 +437,7 @@ template<> struct conj_helper<Packet2d, Packet1cd, false,false> { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const - { return Packet1cd(Eigen::internal::pmul(x, y.v)); } + { return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); } }; template<> struct conj_helper<Packet1cd, Packet2d, false,false> @@ -419,7 +446,7 @@ template<> struct conj_helper<Packet1cd, Packet2d, false,false> { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const - { return Packet1cd(Eigen::internal::pmul(x.v, y)); } + { return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); } }; template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) @@ -432,7 +459,17 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x) { - return Packet1cd(preverse(x.v)); + return Packet1cd(preverse(Packet2d(x.v))); +} + +EIGEN_DEVICE_FUNC inline void 
+ptranspose(PacketBlock<Packet2cf,2>& kernel) { + __m128d w1 = _mm_castps_pd(kernel.packet[0].v); + __m128d w2 = _mm_castps_pd(kernel.packet[1].v); + + __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2)); + kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2)); + kernel.packet[1].v = tmp; } } // end namespace internal diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 09f74c651..8f78b3a6c 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -63,7 +63,7 @@ Packet4f plog<Packet4f>(const Packet4f& _x) x = _mm_or_ps(x, p4f_half); emm0 = _mm_sub_epi32(emm0, p4i_0x7f); - Packet4f e = padd(_mm_cvtepi32_ps(emm0), p4f_1); + Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1); /* part2: if( x < SQRTHF ) { @@ -72,9 +72,9 @@ Packet4f plog<Packet4f>(const Packet4f& _x) } else { x = x - 1.0; } */ Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF); - Packet4f tmp = _mm_and_ps(x, mask); + Packet4f tmp = pand(x, mask); x = psub(x, p4f_1); - e = psub(e, _mm_and_ps(p4f_1, mask)); + e = psub(e, pand(p4f_1, mask)); x = padd(x, tmp); Packet4f x2 = pmul(x,x); @@ -167,7 +167,7 @@ Packet4f pexp<Packet4f>(const Packet4f& _x) emm0 = _mm_cvttps_epi32(fx); emm0 = _mm_add_epi32(emm0, p4i_0x7f); emm0 = _mm_slli_epi32(emm0, 23); - return pmul(y, _mm_castsi128_ps(emm0)); + return pmul(y, Packet4f(_mm_castsi128_ps(emm0))); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp<Packet2d>(const Packet2d& _x) @@ -241,7 +241,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x) emm0 = _mm_add_epi32(emm0, p4i_1023_0); emm0 = _mm_slli_epi32(emm0, 20); emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3)); - return pmul(x, _mm_castsi128_pd(emm0)); + return pmul(x, Packet2d(_mm_castsi128_pd(emm0))); } /* evaluation of 4 sines at onces, using SSE2 intrinsics. diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index f5a3dab52..6912f3bc3 100644..100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -22,9 +22,41 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif +#ifdef EIGEN_VECTORIZE_FMA +#ifndef EIGEN_HAS_FUSED_MADD +#define EIGEN_HAS_FUSED_MADD 1 +#endif +#endif + +#if defined EIGEN_VECTORIZE_AVX && defined __GNUC__ && !(defined __clang__ || defined __INTEL_COMPILER) +// With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot +// have overloads for both types without linking error. +// One solution is to increase ABI version using -fabi-version=4 (or greater). +// To workaround this inconvenince, we rather wrap 128bit types into the following helper +// structure: +// TODO disable this wrapper if abi-versio>=4, but to detect that without asking the user to define a macro? 
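// Illustration (not part of the patch): the wrapper introduced just below converts implicitly to and
// from the raw intrinsic type, so existing intrinsic-based code keeps compiling unchanged, while
// Packet4f, Packet2d and Packet4i become distinct C++ types, which is what avoids the overload and
// linking clash described above. A hypothetical usage sketch, only meaningful once the wrapper is in effect:
static inline __m128 packet_wrapper_demo(float x)
{
  Packet4f p = _mm_set1_ps(x);   // construct the wrapper from a raw __m128
  p = _mm_add_ps(p, p);          // converts to __m128 for the intrinsic, assigns back via operator=
  return p;                      // converts back to __m128 on return
}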
+template<typename T> +struct eigen_packet_wrapper +{ + EIGEN_ALWAYS_INLINE operator T&() { return m_val; } + EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; } + EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) { + m_val = v; + return *this; + } + + T m_val; +}; +typedef eigen_packet_wrapper<__m128> Packet4f; +typedef eigen_packet_wrapper<__m128i> Packet4i; +typedef eigen_packet_wrapper<__m128d> Packet2d; +#else typedef __m128 Packet4f; typedef __m128i Packet4i; typedef __m128d Packet2d; +#endif template<> struct is_arithmetic<__m128> { enum { value = true }; }; template<> struct is_arithmetic<__m128i> { enum { value = true }; }; @@ -58,13 +90,18 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; }; const Packet4i p4i_##NAME = pset1<Packet4i>(X) +// Use the packet_traits defined in AVX/PacketMath.h instead if we're going +// to leverage AVX instructions. +#ifndef EIGEN_VECTORIZE_AVX template<> struct packet_traits<float> : default_packet_traits { typedef Packet4f type; + typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, size=4, + HasHalfPacket = 0, HasDiv = 1, HasSin = EIGEN_FAST_MATH, @@ -77,19 +114,23 @@ template<> struct packet_traits<float> : default_packet_traits template<> struct packet_traits<double> : default_packet_traits { typedef Packet2d type; + typedef Packet2d half; enum { Vectorizable = 1, AlignedOnScalar = 1, size=2, + HasHalfPacket = 0, HasDiv = 1, HasExp = 1, HasSqrt = 1 }; }; +#endif template<> struct packet_traits<int> : default_packet_traits { typedef Packet4i type; + typedef Packet4i half; enum { // FIXME check the Has* Vectorizable = 1, @@ -98,9 +139,9 @@ template<> struct packet_traits<int> : default_packet_traits }; }; -template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; }; -template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; }; -template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; }; +template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; }; +template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; }; +template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; }; #if defined(_MSC_VER) && (_MSC_VER==1500) // Workaround MSVC 9 internal compiler error. @@ -110,13 +151,26 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { re template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); } template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set_epi32(from,from,from,from); } #else -template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set1_ps(from); } +template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); } template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); } #endif +// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction. 
+// However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203) +// Using inline assembly is also not an option because then gcc fails to reorder properly the instructions. +// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply. +// Also note that with AVX, we want it to generate a vbroadcastss. +#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__) +template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) { + return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0); +} +#endif + +#ifndef EIGEN_VECTORIZE_AVX template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); } +#endif template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); } @@ -139,7 +193,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) } template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { - return psub(_mm_setr_epi32(0,0,0,0), a); + return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a); } template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } @@ -173,6 +227,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } +#ifdef EIGEN_VECTORIZE_FMA +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); } +#endif template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); } @@ -218,7 +276,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, con template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } -template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); } +template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); } #if defined(_MSC_VER) template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { @@ -236,7 +294,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { E #endif } template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } - template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return 
_mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); } + template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); } #else // Fast unaligned loads. Note that here we cannot directly use intrinsics: this would // require pointer casting to incompatible pointer types and leads to invalid code @@ -245,14 +303,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { E // TODO: do the same for MSVC (ICC is compatible) // NOTE: with the code below, MSVC's compiler crashes! -#if defined(__GNUC__) && defined(__i386__) +#if defined(__GNUC__) && (defined(__i386__) || (defined(__x86_64) && EIGEN_GNUC_AT_LEAST(4, 8))) // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1 + #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 1 #elif defined(__clang__) // bug 201: Segfaults in __mm_loadh_pd with clang 2.8 #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1 + #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0 #else #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0 + #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0 #endif template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) @@ -283,7 +344,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD #if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS - return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); + return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); #else __m128d res; res = _mm_load_sd((const double*)(from)) ; @@ -302,38 +363,77 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) { Packet4i tmp; - tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from)); + tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from)); return vec4i_swizzle1(tmp, 0, 0, 1, 1); } template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } -template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); } +template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE +#if EIGEN_AVOID_CUSTOM_UNALIGNED_STORES + _mm_storeu_pd(to, from); +#else _mm_storel_pd((to), from); _mm_storeh_pd((to+1), from); +#endif +} +template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castps_pd(from))); } +template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castsi128_pd(from))); } + +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride) +{ + return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, int stride) +{ + return 
_mm_set_pd(from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride) +{ + return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); + } + +template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride) +{ + to[stride*0] = _mm_cvtss_f32(from); + to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1)); + to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2)); + to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, int stride) +{ + to[stride*0] = _mm_cvtsd_f64(from); + to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride) +{ + to[stride*0] = _mm_cvtsi128_si32(from); + to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)); + to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)); + to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)); } -template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castps_pd(from)); } -template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castsi128_pd(from)); } // some compilers might be tempted to perform multiple moves instead of using a vector path. template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a) { Packet4f pa = _mm_set_ss(a); - pstore(to, vec4f_swizzle1(pa,0,0,0,0)); + pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0))); } // some compilers might be tempted to perform multiple moves instead of using a vector path. 
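// For reference (not part of the patch): pstore1 is semantically pstore(to, pset1<Packet>(a)), i.e.
// broadcast one scalar and perform a single aligned full-width store, as in this generic sketch
// (name hypothetical):
template<typename Packet, typename Scalar>
void pstore1_reference(Scalar* to, const Scalar& a)
{
  pstore(to, pset1<Packet>(a));  // broadcast, then one aligned vector store
}
// The SSE specializations here instead build the broadcast from _mm_set_ss/_mm_set_sd plus a swizzle,
// precisely to keep everything on the vector path, as the comment above explains.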
template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a) { Packet2d pa = _mm_set_sd(a); - pstore(to, vec2d_swizzle1(pa,0,0)); + pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); } +#ifndef EIGEN_VECTORIZE_AVX template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +#endif #if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER) // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010 @@ -380,6 +480,38 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) #endif } +// with AVX, the default implementations based on pload1 are faster +#ifndef __AVX__ +template<> EIGEN_STRONG_INLINE void +pbroadcast4<Packet4f>(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload<Packet4f>(a); + a0 = vec4f_swizzle1(a3, 0,0,0,0); + a1 = vec4f_swizzle1(a3, 1,1,1,1); + a2 = vec4f_swizzle1(a3, 2,2,2,2); + a3 = vec4f_swizzle1(a3, 3,3,3,3); +} +template<> EIGEN_STRONG_INLINE void +pbroadcast4<Packet2d>(const double *a, + Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +{ +#ifdef EIGEN_VECTORIZE_SSE3 + a0 = _mm_loaddup_pd(a+0); + a1 = _mm_loaddup_pd(a+1); + a2 = _mm_loaddup_pd(a+2); + a3 = _mm_loaddup_pd(a+3); +#else + a1 = pload<Packet2d>(a); + a0 = vec2d_swizzle1(a1, 0,0); + a1 = vec2d_swizzle1(a1, 1,1); + a3 = pload<Packet2d>(a+2); + a2 = vec2d_swizzle1(a3, 0,0); + a3 = vec2d_swizzle1(a3, 1,1); +#endif +} +#endif + EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) { vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55)); @@ -407,10 +539,10 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) { Packet4f tmp0 = _mm_hadd_ps(a,a); - return pfirst(_mm_hadd_ps(tmp0, tmp0)); + return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0)); } -template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); } +template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst<Packet2d>(_mm_hadd_pd(a, a)); } // SSSE3 version: // EIGEN_STRONG_INLINE float predux(const Packet4i& a) @@ -453,7 +585,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) { Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a)); - return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1)); + return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)); } template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) @@ -476,11 +608,11 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) { Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a)); - return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); + return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); } template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { - return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a))); + return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a))); } template<> EIGEN_STRONG_INLINE 
int predux_mul<Packet4i>(const Packet4i& a) { @@ -496,17 +628,17 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) { Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a)); - return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); + return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); } template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { - return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a))); + return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a))); } template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) { #ifdef EIGEN_VECTORIZE_SSE4_1 Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2))); - return pfirst(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); + return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); #else // after some experiments, it is seems this is the fastest way to implement it // for GCC (eg., it does not like using std::min after the pstore !!) @@ -522,17 +654,17 @@ template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) { Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a)); - return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); + return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); } template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { - return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a))); + return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a))); } template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) { #ifdef EIGEN_VECTORIZE_SSE4_1 Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2))); - return pfirst(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); + return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); #else // after some experiments, it is seems this is the fastest way to implement it // for GCC (eg., it does not like using std::min after the pstore !!) 
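// For reference (not part of the patch): predux, predux_mul, predux_min and predux_max are plain
// horizontal reductions over the lanes of a single packet. A scalar sketch of the min case follows
// (name hypothetical, needs <algorithm>); it is exactly the store-then-std::min pattern the comment
// above warns is slow on GCC, shown only to document the semantics:
template<typename Packet>
typename unpacket_traits<Packet>::type predux_min_reference(const Packet& a)
{
  typedef typename unpacket_traits<Packet>::type Scalar;
  enum { N = unpacket_traits<Packet>::size };
  Scalar buf[N];
  pstoreu(buf, a);                                         // spill the lanes
  Scalar res = buf[0];
  for(int j=1; j<N; ++j) res = (std::min)(res, buf[j]);    // reduce with min; +, * and max are analogous
  return res;
}
// The explicit pfirst<Packet4f>/pfirst<Packet2d>/pfirst<Packet4i> arguments added in the hunks above
// are presumably needed because, once the packets become wrapper types, template argument deduction
// from a raw __m128/__m128d/__m128i intrinsic result can no longer select the intended overload.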
@@ -652,6 +784,31 @@ struct palign_impl<Offset,Packet2d> }; #endif +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet4f,4>& kernel) { + _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet2d,2>& kernel) { + __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]); + kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]); + kernel.packet[1] = tmp; +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet4i,4>& kernel) { + __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]); + __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]); + __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]); + __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]); + + kernel.packet[0] = _mm_unpacklo_epi64(T0, T1); + kernel.packet[1] = _mm_unpackhi_epi64(T0, T1); + kernel.packet[2] = _mm_unpacklo_epi64(T2, T3); + kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index a0fcea3f9..ec42e6850 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -320,6 +320,26 @@ struct functor_traits<scalar_asin_op<Scalar> > }; }; + +/** \internal + * \brief Template functor to compute the atan of a scalar + * \sa class CwiseUnaryOp, ArrayBase::atan() + */ +template<typename Scalar> struct scalar_atan_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op) + inline const Scalar operator() (const Scalar& a) const { using std::atan; return atan(a); } + typedef typename packet_traits<Scalar>::type Packet; + inline Packet packetOp(const Packet& a) const { return internal::patan(a); } +}; +template<typename Scalar> +struct functor_traits<scalar_atan_op<Scalar> > +{ + enum { + Cost = 5 * NumTraits<Scalar>::MulCost, + PacketAccess = packet_traits<Scalar>::HasATan + }; +}; + /** \internal * \brief Template functor to compute the inverse of a scalar * \sa class CwiseUnaryOp, Cwise::inverse() diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 686ff84f1..7da52c2e8 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -10,6 +10,7 @@ #ifndef EIGEN_GENERAL_BLOCK_PANEL_H #define EIGEN_GENERAL_BLOCK_PANEL_H + namespace Eigen { namespace internal { @@ -91,9 +92,26 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) }; manage_caching_sizes(GetAction, &l1, &l2); - k = std::min<SizeType>(k, l1/kdiv); - SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; - if(_m<m) m = _m & mr_mask; + +// k = std::min<SizeType>(k, l1/kdiv); +// SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; +// if(_m<m) m = _m & mr_mask; + + // In unit tests we do not want to use extra large matrices, + // so we reduce the block size to check the blocking strategy is not flawed +#ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS +// k = std::min<SizeType>(k,240); +// n = std::min<SizeType>(n,3840/sizeof(RhsScalar)); +// m = std::min<SizeType>(m,3840/sizeof(RhsScalar)); + + k = std::min<SizeType>(k,sizeof(LhsScalar)<=4 ? 
360 : 240); + n = std::min<SizeType>(n,3840/sizeof(RhsScalar)); + m = std::min<SizeType>(m,3840/sizeof(RhsScalar)); +#else + k = std::min<SizeType>(k,24); + n = std::min<SizeType>(n,384/sizeof(RhsScalar)); + m = std::min<SizeType>(m,384/sizeof(RhsScalar)); +#endif } template<typename LhsScalar, typename RhsScalar, typename SizeType> @@ -160,16 +178,19 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - // register block size along the N direction (must be either 2 or 4) - nr = NumberOfRegisters/4, + // register block size along the N direction must be 1 or 4 + nr = 4, // register block size along the M direction (currently, this one cannot be modified) - mr = 2 * LhsPacketSize, +#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) + // we assume 16 registers + mr = 3*LhsPacketSize, +#else + mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, +#endif - WorkSpaceFactor = nr * RhsPacketSize, - LhsProgress = LhsPacketSize, - RhsProgress = RhsPacketSize + RhsProgress = 1 }; typedef typename packet_traits<LhsScalar>::type _LhsPacket; @@ -186,32 +207,65 @@ public: { p = pset1<ResPacket>(ResScalar(0)); } - - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) +// { +// pbroadcast2(b, b0, b1); +// } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - for(DenseIndex k=0; k<n; k++) - pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]); + dest = pset1<RhsPacketType>(*b); + } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + dest = ploadquad<RhsPacket>(b); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + template<typename LhsPacketType> + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const { - dest = pload<RhsPacket>(b); + dest = pload<LhsPacketType>(a); } - EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + template<typename LhsPacketType> + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = pload<LhsPacket>(a); + dest = ploadu<LhsPacketType>(a); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const + template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const { + // It would be a lot cleaner to call pmadd all the time. Unfortunately if we + // let gcc allocate the register in which to store the result of the pmul + // (in the case where there is no FMA) gcc fails to figure out how to avoid + // spilling register. 
+#ifdef EIGEN_HAS_FUSED_MADD + EIGEN_UNUSED_VARIABLE(tmp); + c = pmadd(a,b,c); +#else tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp); +#endif } EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { r = pmadd(c,alpha,r); } + + template<typename ResPacketHalf> + EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const + { + r = pmadd(c,alpha,r); + } protected: // conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj; @@ -235,12 +289,16 @@ public: ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - nr = NumberOfRegisters/4, - mr = 2 * LhsPacketSize, - WorkSpaceFactor = nr*RhsPacketSize, + nr = 4, +#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) + // we assume 16 registers + mr = 3*LhsPacketSize, +#else + mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, +#endif LhsProgress = LhsPacketSize, - RhsProgress = RhsPacketSize + RhsProgress = 1 }; typedef typename packet_traits<LhsScalar>::type _LhsPacket; @@ -258,15 +316,14 @@ public: p = pset1<ResPacket>(ResScalar(0)); } - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - for(DenseIndex k=0; k<n; k++) - pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]); + dest = pset1<RhsPacket>(*b); } - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - dest = pload<RhsPacket>(b); + dest = pset1<RhsPacket>(*b); } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const @@ -274,6 +331,21 @@ public: dest = pload<LhsPacket>(a); } + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploadu<LhsPacket>(a); + } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) +// { +// pbroadcast2(b, b0, b1); +// } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const { madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type()); @@ -281,7 +353,12 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { +#ifdef EIGEN_HAS_FUSED_MADD + EIGEN_UNUSED_VARIABLE(tmp); + c.v = pmadd(a.v,b,c.v); +#else tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp); +#endif } EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const @@ -298,6 +375,38 @@ protected: conj_helper<ResPacket,ResPacket,ConjLhs,false> cj; }; +template<typename Packet> +struct DoublePacket +{ + Packet first; + Packet second; +}; + +template<typename Packet> +DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b) +{ + DoublePacket<Packet> res; + res.first = padd(a.first, b.first); + res.second = padd(a.second,b.second); + return res; +} + +template<typename Packet> +const DoublePacket<Packet>& predux4(const DoublePacket<Packet> &a) +{ + return a; +} + +template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; }; +// 
template<typename Packet> +// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b) +// { +// DoublePacket<Packet> res; +// res.first = padd(a.first, b.first); +// res.second = padd(a.second,b.second); +// return res; +// } + template<typename RealScalar, bool _ConjLhs, bool _ConjRhs> class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs > { @@ -314,60 +423,80 @@ public: && packet_traits<Scalar>::Vectorizable, RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1, ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, - - nr = 2, - mr = 2 * ResPacketSize, - WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, + LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, + RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, + + // FIXME: should depend on NumberOfRegisters + nr = 4, + mr = ResPacketSize, LhsProgress = ResPacketSize, - RhsProgress = Vectorizable ? 2*ResPacketSize : 1 + RhsProgress = 1 }; typedef typename packet_traits<RealScalar>::type RealPacket; typedef typename packet_traits<Scalar>::type ScalarPacket; - struct DoublePacket - { - RealPacket first; - RealPacket second; - }; + typedef DoublePacket<RealPacket> DoublePacketType; typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket; - typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket; + typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket; typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket; - typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket; + typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket; EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } - EIGEN_STRONG_INLINE void initAcc(DoublePacket& p) + EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p) { p.first = pset1<RealPacket>(RealScalar(0)); p.second = pset1<RealPacket>(RealScalar(0)); } - /* Unpack the rhs coeff such that each complex coefficient is spread into - * two packects containing respectively the real and imaginary coefficient - * duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y] - */ - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b) + // Scalar path + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { - for(DenseIndex k=0; k<n; k++) - { - if(Vectorizable) - { - pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0], real(rhs[k])); - pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k])); - } - else - b[k] = rhs[k]; - } + dest = pset1<ResPacket>(*b); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; } - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const + // Vectorized path + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const + { + dest.first = pset1<RealPacket>(real(*b)); + dest.second = pset1<RealPacket>(imag(*b)); + } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const + { + loadRhs(b,dest); + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const + { + eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4); + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + // FIXME not sure 
that's the best way to implement it! + loadRhs(b+0, b0); + loadRhs(b+1, b1); + loadRhs(b+2, b2); + loadRhs(b+3, b3); + } + + // Vectorized path + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1) + { + // FIXME not sure that's the best way to implement it! + loadRhs(b+0, b0); + loadRhs(b+1, b1); + } + + // Scalar path + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1) { - dest.first = pload<RealPacket>((const RealScalar*)b); - dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize)); + // FIXME not sure that's the best way to implement it! + loadRhs(b+0, b0); + loadRhs(b+1, b1); } // nothing special here @@ -376,7 +505,12 @@ public: dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a)); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a)); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); @@ -389,7 +523,7 @@ public: EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } - EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const + EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const { // assemble c ResPacket tmp; @@ -440,12 +574,12 @@ public: ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + // FIXME: should depend on NumberOfRegisters nr = 4, - mr = 2*ResPacketSize, - WorkSpaceFactor = nr*RhsPacketSize, + mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize, LhsProgress = ResPacketSize, - RhsProgress = ResPacketSize + RhsProgress = 1 }; typedef typename packet_traits<LhsScalar>::type _LhsPacket; @@ -463,21 +597,38 @@ public: p = pset1<ResPacket>(ResScalar(0)); } - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - for(DenseIndex k=0; k<n; k++) - pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]); + dest = pset1<RhsPacket>(*b); } - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + + void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) { - dest = pload<RhsPacket>(b); + pbroadcast4(b, b0, b1, b2, b3); } + +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) +// { +// // FIXME not sure that's the best way to implement it! 
+// b0 = pload1<RhsPacket>(b+0); +// b1 = pload1<RhsPacket>(b+1); +// } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = ploaddup<LhsPacket>(a); } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4); + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploaddup<LhsPacket>(a); + } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const { @@ -486,7 +637,13 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { +#ifdef EIGEN_HAS_FUSED_MADD + EIGEN_UNUSED_VARIABLE(tmp); + c.v = pmadd(a,b.v,c.v); +#else tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp); +#endif + } EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const @@ -519,6 +676,14 @@ struct gebp_kernel typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; + + typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits; + typedef typename SwappedTraits::ResScalar SResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + enum { Vectorizable = Traits::Vectorizable, @@ -529,570 +694,708 @@ struct gebp_kernel EIGEN_DONT_INLINE void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, - Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0); + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); }; template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> EIGEN_DONT_INLINE void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, - Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB) + Index strideA, Index strideB, Index offsetA, Index offsetB) { Traits traits; + SwappedTraits straits; if(strideA==-1) strideA = depth; if(strideB==-1) strideB = depth; conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; -// conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; - Index packet_cols = (cols/nr) * nr; - const Index peeled_mc = (rows/mr)*mr; - // FIXME: - const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0); - const Index peeled_kc = (depth/4)*4; - - if(unpackedB==0) - unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress); - - // loops on each micro vertical panel of rhs (depth x nr) - for(Index j2=0; j2<packet_cols; j2+=nr) + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; + const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0; + const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0; + const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? 
(rows/(1*LhsProgress))*(1*LhsProgress) : 0; + enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) + const Index peeled_kc = depth & ~(pk-1); + const Index prefetch_res_offset = 32/sizeof(ResScalar); +// const Index depth2 = depth & ~1; + + //---------- Process 3 * LhsProgress rows at once ---------- + // This corresponds to 3*LhsProgress x nr register blocks. + // Usually, make sense only with FMA + if(mr>=3*Traits::LhsProgress) { - traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB); - - // loops on each largest micro horizontal panel of lhs (mr x depth) - // => we select a mr x nr micro block of res which is entirely - // stored into mr/packet_size x nr registers. - for(Index i=0; i<peeled_mc; i+=mr) + // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth) + for(Index i=0; i<peeled_mc3; i+=3*Traits::LhsProgress) { - const LhsScalar* blA = &blockA[i*strideA+offsetA*mr]; - prefetch(&blA[0]); - - // gets res block as register - AccPacket C0, C1, C2, C3, C4, C5, C6, C7; - traits.initAcc(C0); - traits.initAcc(C1); - if(nr==4) traits.initAcc(C2); - if(nr==4) traits.initAcc(C3); - traits.initAcc(C4); - traits.initAcc(C5); - if(nr==4) traits.initAcc(C6); - if(nr==4) traits.initAcc(C7); - - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = r0 + resStride; - ResScalar* r2 = r1 + resStride; - ResScalar* r3 = r2 + resStride; - - prefetch(r0+16); - prefetch(r1+16); - prefetch(r2+16); - prefetch(r3+16); - - // performs "inner" product - // TODO let's check wether the folowing peeled loop could not be - // optimized via optimal prefetching from one loop to the other - const RhsScalar* blB = unpackedB; - for(Index k=0; k<peeled_kc; k+=4) + // loops on each largest micro vertical panel of rhs (depth * nr) + for(Index j2=0; j2<packet_cols4; j2+=nr) { - if(nr==2) + // We select a 3*Traits::LhsProgress x nr micro block of res which is entirely + // stored into 3 x nr registers. 
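To make the new three-tier peeling concrete: assuming single-precision SSE packets (LhsProgress = 4) and the FMA branch above (mr = 3*LhsProgress = 12), which are illustrative assumptions only, a panel of 33 rows is carved up by the peeled_mc3 / peeled_mc2 / peeled_mc1 formulas just introduced as follows:

    // peeled_mc3 = (33/12)*12         = 24  -> rows [ 0,24): 3-packet kernel (i += 12)
    // peeled_mc2 = 24 + ((33-24)/8)*8 = 32  -> rows [24,32): 2-packet kernel (i += 8)
    // peeled_mc1 = (33/4)*4           = 32  -> rows [32,32): 1-packet kernel (empty in this example)
    // remainder                             -> row   32    : scalar / swapped-traits path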
+ + const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0, C1, C2, C3, + C4, C5, C6, C7, + C8, C9, C10, C11; + traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3); + traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); + traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11); + + ResScalar* r0 = &res[(j2+0)*resStride + i]; + ResScalar* r1 = &res[(j2+1)*resStride + i]; + ResScalar* r2 = &res[(j2+2)*resStride + i]; + ResScalar* r3 = &res[(j2+3)*resStride + i]; + + internal::prefetch(r0); + internal::prefetch(r1); + internal::prefetch(r2); + internal::prefetch(r3); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; + prefetch(&blB[0]); + LhsPacket A0, A1; + + for(Index k=0; k<peeled_kc; k+=pk) { - LhsPacket A0, A1; - RhsPacket B_0; - RhsPacket T0; - -EIGEN_ASM_COMMENT("mybegin2"); - traits.loadLhs(&blA[0*LhsProgress], A0); - traits.loadLhs(&blA[1*LhsProgress], A1); - traits.loadRhs(&blB[0*RhsProgress], B_0); - traits.madd(A0,B_0,C0,T0); - traits.madd(A1,B_0,C4,B_0); - traits.loadRhs(&blB[1*RhsProgress], B_0); - traits.madd(A0,B_0,C1,T0); - traits.madd(A1,B_0,C5,B_0); - - traits.loadLhs(&blA[2*LhsProgress], A0); - traits.loadLhs(&blA[3*LhsProgress], A1); - traits.loadRhs(&blB[2*RhsProgress], B_0); - traits.madd(A0,B_0,C0,T0); - traits.madd(A1,B_0,C4,B_0); - traits.loadRhs(&blB[3*RhsProgress], B_0); - traits.madd(A0,B_0,C1,T0); - traits.madd(A1,B_0,C5,B_0); - - traits.loadLhs(&blA[4*LhsProgress], A0); - traits.loadLhs(&blA[5*LhsProgress], A1); - traits.loadRhs(&blB[4*RhsProgress], B_0); - traits.madd(A0,B_0,C0,T0); - traits.madd(A1,B_0,C4,B_0); - traits.loadRhs(&blB[5*RhsProgress], B_0); - traits.madd(A0,B_0,C1,T0); - traits.madd(A1,B_0,C5,B_0); - - traits.loadLhs(&blA[6*LhsProgress], A0); - traits.loadLhs(&blA[7*LhsProgress], A1); - traits.loadRhs(&blB[6*RhsProgress], B_0); - traits.madd(A0,B_0,C0,T0); - traits.madd(A1,B_0,C4,B_0); - traits.loadRhs(&blB[7*RhsProgress], B_0); - traits.madd(A0,B_0,C1,T0); - traits.madd(A1,B_0,C5,B_0); -EIGEN_ASM_COMMENT("myend"); + EIGEN_ASM_COMMENT("begin gegp micro kernel 3p x 4"); + RhsPacket B_0, T0; + LhsPacket A2; + +#define EIGEN_GEBGP_ONESTEP(K) \ + internal::prefetch(blA+(3*K+16)*LhsProgress); \ + traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ + traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ + traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ + traits.loadRhs(&blB[(0+4*K)*RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, T0); \ + traits.madd(A1, B_0, C4, T0); \ + traits.madd(A2, B_0, C8, B_0); \ + traits.loadRhs(&blB[1+4*K*RhsProgress], B_0); \ + traits.madd(A0, B_0, C1, T0); \ + traits.madd(A1, B_0, C5, T0); \ + traits.madd(A2, B_0, C9, B_0); \ + traits.loadRhs(&blB[2+4*K*RhsProgress], B_0); \ + traits.madd(A0, B_0, C2, T0); \ + traits.madd(A1, B_0, C6, T0); \ + traits.madd(A2, B_0, C10, B_0); \ + traits.loadRhs(&blB[3+4*K*RhsProgress], B_0); \ + traits.madd(A0, B_0, C3 , T0); \ + traits.madd(A1, B_0, C7, T0); \ + traits.madd(A2, B_0, C11, B_0) + + internal::prefetch(blB+(48+0)); + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + internal::prefetch(blB+(48+16)); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*4*RhsProgress; + blA += pk*3*Traits::LhsProgress; } - else + // process remaining peeled loop + for(Index 
k=peeled_kc; k<depth; k++) { -EIGEN_ASM_COMMENT("mybegin4"); - LhsPacket A0, A1; - RhsPacket B_0, B1, B2, B3; - RhsPacket T0; - - traits.loadLhs(&blA[0*LhsProgress], A0); - traits.loadLhs(&blA[1*LhsProgress], A1); - traits.loadRhs(&blB[0*RhsProgress], B_0); - traits.loadRhs(&blB[1*RhsProgress], B1); - - traits.madd(A0,B_0,C0,T0); - traits.loadRhs(&blB[2*RhsProgress], B2); - traits.madd(A1,B_0,C4,B_0); - traits.loadRhs(&blB[3*RhsProgress], B3); - traits.loadRhs(&blB[4*RhsProgress], B_0); - traits.madd(A0,B1,C1,T0); - traits.madd(A1,B1,C5,B1); - traits.loadRhs(&blB[5*RhsProgress], B1); - traits.madd(A0,B2,C2,T0); - traits.madd(A1,B2,C6,B2); - traits.loadRhs(&blB[6*RhsProgress], B2); - traits.madd(A0,B3,C3,T0); - traits.loadLhs(&blA[2*LhsProgress], A0); - traits.madd(A1,B3,C7,B3); - traits.loadLhs(&blA[3*LhsProgress], A1); - traits.loadRhs(&blB[7*RhsProgress], B3); - traits.madd(A0,B_0,C0,T0); - traits.madd(A1,B_0,C4,B_0); - traits.loadRhs(&blB[8*RhsProgress], B_0); - traits.madd(A0,B1,C1,T0); - traits.madd(A1,B1,C5,B1); - traits.loadRhs(&blB[9*RhsProgress], B1); - traits.madd(A0,B2,C2,T0); - traits.madd(A1,B2,C6,B2); - traits.loadRhs(&blB[10*RhsProgress], B2); - traits.madd(A0,B3,C3,T0); - traits.loadLhs(&blA[4*LhsProgress], A0); - traits.madd(A1,B3,C7,B3); - traits.loadLhs(&blA[5*LhsProgress], A1); - traits.loadRhs(&blB[11*RhsProgress], B3); - - traits.madd(A0,B_0,C0,T0); - traits.madd(A1,B_0,C4,B_0); - traits.loadRhs(&blB[12*RhsProgress], B_0); - traits.madd(A0,B1,C1,T0); - traits.madd(A1,B1,C5,B1); - traits.loadRhs(&blB[13*RhsProgress], B1); - traits.madd(A0,B2,C2,T0); - traits.madd(A1,B2,C6,B2); - traits.loadRhs(&blB[14*RhsProgress], B2); - traits.madd(A0,B3,C3,T0); - traits.loadLhs(&blA[6*LhsProgress], A0); - traits.madd(A1,B3,C7,B3); - traits.loadLhs(&blA[7*LhsProgress], A1); - traits.loadRhs(&blB[15*RhsProgress], B3); - traits.madd(A0,B_0,C0,T0); - traits.madd(A1,B_0,C4,B_0); - traits.madd(A0,B1,C1,T0); - traits.madd(A1,B1,C5,B1); - traits.madd(A0,B2,C2,T0); - traits.madd(A1,B2,C6,B2); - traits.madd(A0,B3,C3,T0); - traits.madd(A1,B3,C7,B3); + RhsPacket B_0, T0; + LhsPacket A2; + EIGEN_GEBGP_ONESTEP(0); + blB += 4*RhsProgress; + blA += 3*Traits::LhsProgress; } - - blB += 4*nr*RhsProgress; - blA += 4*mr; + #undef EIGEN_GEBGP_ONESTEP + + ResPacket R0, R1, R2; + ResPacket alphav = pset1<ResPacket>(alpha); + + R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize); + R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize); + R2 = ploadu<ResPacket>(r0+2*Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + traits.acc(C8, alphav, R2); + pstoreu(r0+0*Traits::ResPacketSize, R0); + pstoreu(r0+1*Traits::ResPacketSize, R1); + pstoreu(r0+2*Traits::ResPacketSize, R2); + + R0 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize); + R1 = ploadu<ResPacket>(r1+1*Traits::ResPacketSize); + R2 = ploadu<ResPacket>(r1+2*Traits::ResPacketSize); + traits.acc(C1, alphav, R0); + traits.acc(C5, alphav, R1); + traits.acc(C9, alphav, R2); + pstoreu(r1+0*Traits::ResPacketSize, R0); + pstoreu(r1+1*Traits::ResPacketSize, R1); + pstoreu(r1+2*Traits::ResPacketSize, R2); + + R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize); + R1 = ploadu<ResPacket>(r2+1*Traits::ResPacketSize); + R2 = ploadu<ResPacket>(r2+2*Traits::ResPacketSize); + traits.acc(C2, alphav, R0); + traits.acc(C6, alphav, R1); + traits.acc(C10, alphav, R2); + pstoreu(r2+0*Traits::ResPacketSize, R0); + pstoreu(r2+1*Traits::ResPacketSize, R1); + pstoreu(r2+2*Traits::ResPacketSize, R2); + + R0 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize); 
+ R1 = ploadu<ResPacket>(r3+1*Traits::ResPacketSize); + R2 = ploadu<ResPacket>(r3+2*Traits::ResPacketSize); + traits.acc(C3, alphav, R0); + traits.acc(C7, alphav, R1); + traits.acc(C11, alphav, R2); + pstoreu(r3+0*Traits::ResPacketSize, R0); + pstoreu(r3+1*Traits::ResPacketSize, R1); + pstoreu(r3+2*Traits::ResPacketSize, R2); } - // process remaining peeled loop - for(Index k=peeled_kc; k<depth; k++) + + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2<cols; j2++) { - if(nr==2) + // One column at a time + const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0, C4, C8; + traits.initAcc(C0); + traits.initAcc(C4); + traits.initAcc(C8); + + ResScalar* r0 = &res[(j2+0)*resStride + i]; + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB]; + LhsPacket A0, A1, A2; + + for(Index k=0; k<peeled_kc; k+=pk) { - LhsPacket A0, A1; + EIGEN_ASM_COMMENT("begin gegp micro kernel 3p x 1"); RhsPacket B_0; - RhsPacket T0; - - traits.loadLhs(&blA[0*LhsProgress], A0); - traits.loadLhs(&blA[1*LhsProgress], A1); - traits.loadRhs(&blB[0*RhsProgress], B_0); - traits.madd(A0,B_0,C0,T0); - traits.madd(A1,B_0,C4,B_0); - traits.loadRhs(&blB[1*RhsProgress], B_0); - traits.madd(A0,B_0,C1,T0); - traits.madd(A1,B_0,C5,B_0); +#define EIGEN_GEBGP_ONESTEP(K) \ + traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ + traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ + traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ + traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B_0); \ + traits.madd(A1, B_0, C4, B_0); \ + traits.madd(A2, B_0, C8, B_0) + + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*RhsProgress; + blA += pk*3*Traits::LhsProgress; } - else + + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) { - LhsPacket A0, A1; - RhsPacket B_0, B1, B2, B3; - RhsPacket T0; - - traits.loadLhs(&blA[0*LhsProgress], A0); - traits.loadLhs(&blA[1*LhsProgress], A1); - traits.loadRhs(&blB[0*RhsProgress], B_0); - traits.loadRhs(&blB[1*RhsProgress], B1); - - traits.madd(A0,B_0,C0,T0); - traits.loadRhs(&blB[2*RhsProgress], B2); - traits.madd(A1,B_0,C4,B_0); - traits.loadRhs(&blB[3*RhsProgress], B3); - traits.madd(A0,B1,C1,T0); - traits.madd(A1,B1,C5,B1); - traits.madd(A0,B2,C2,T0); - traits.madd(A1,B2,C6,B2); - traits.madd(A0,B3,C3,T0); - traits.madd(A1,B3,C7,B3); + RhsPacket B_0; + EIGEN_GEBGP_ONESTEP(0); + blB += RhsProgress; + blA += 3*Traits::LhsProgress; } - - blB += nr*RhsProgress; - blA += mr; - } - - if(nr==4) - { - ResPacket R0, R1, R2, R3, R4, R5, R6; +#undef EIGEN_GEBGP_ONESTEP + ResPacket R0, R1, R2; ResPacket alphav = pset1<ResPacket>(alpha); - R0 = ploadu<ResPacket>(r0); - R1 = ploadu<ResPacket>(r1); - R2 = ploadu<ResPacket>(r2); - R3 = ploadu<ResPacket>(r3); - R4 = ploadu<ResPacket>(r0 + ResPacketSize); - R5 = ploadu<ResPacket>(r1 + ResPacketSize); - R6 = ploadu<ResPacket>(r2 + ResPacketSize); + R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize); + R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize); + R2 = ploadu<ResPacket>(r0+2*Traits::ResPacketSize); traits.acc(C0, alphav, R0); - pstoreu(r0, R0); - R0 = ploadu<ResPacket>(r3 + ResPacketSize); - - traits.acc(C1, alphav, R1); - traits.acc(C2, alphav, R2); - traits.acc(C3, alphav, R3); - traits.acc(C4, alphav, R4); - 
traits.acc(C5, alphav, R5); - traits.acc(C6, alphav, R6); - traits.acc(C7, alphav, R0); - - pstoreu(r1, R1); - pstoreu(r2, R2); - pstoreu(r3, R3); - pstoreu(r0 + ResPacketSize, R4); - pstoreu(r1 + ResPacketSize, R5); - pstoreu(r2 + ResPacketSize, R6); - pstoreu(r3 + ResPacketSize, R0); + traits.acc(C4, alphav, R1); + traits.acc(C8 , alphav, R2); + pstoreu(r0+0*Traits::ResPacketSize, R0); + pstoreu(r0+1*Traits::ResPacketSize, R1); + pstoreu(r0+2*Traits::ResPacketSize, R2); } - else - { - ResPacket R0, R1, R4; - ResPacket alphav = pset1<ResPacket>(alpha); - - R0 = ploadu<ResPacket>(r0); - R1 = ploadu<ResPacket>(r1); - R4 = ploadu<ResPacket>(r0 + ResPacketSize); - traits.acc(C0, alphav, R0); - pstoreu(r0, R0); - R0 = ploadu<ResPacket>(r1 + ResPacketSize); - traits.acc(C1, alphav, R1); - traits.acc(C4, alphav, R4); - traits.acc(C5, alphav, R0); - pstoreu(r1, R1); - pstoreu(r0 + ResPacketSize, R4); - pstoreu(r1 + ResPacketSize, R0); - } - } + } - if(rows-peeled_mc>=LhsProgress) + //---------- Process 2 * LhsProgress rows at once ---------- + if(mr>=2*Traits::LhsProgress) + { + // loops on each largest micro horizontal panel of lhs (2*LhsProgress x depth) + for(Index i=peeled_mc3; i<peeled_mc2; i+=2*LhsProgress) { - Index i = peeled_mc; - const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; - prefetch(&blA[0]); - - // gets res block as register - AccPacket C0, C1, C2, C3; - traits.initAcc(C0); - traits.initAcc(C1); - if(nr==4) traits.initAcc(C2); - if(nr==4) traits.initAcc(C3); - - // performs "inner" product - const RhsScalar* blB = unpackedB; - for(Index k=0; k<peeled_kc; k+=4) + // loops on each largest micro vertical panel of rhs (depth * nr) + for(Index j2=0; j2<packet_cols4; j2+=nr) { - if(nr==2) - { - LhsPacket A0; - RhsPacket B_0, B1; - - traits.loadLhs(&blA[0*LhsProgress], A0); - traits.loadRhs(&blB[0*RhsProgress], B_0); - traits.loadRhs(&blB[1*RhsProgress], B1); - traits.madd(A0,B_0,C0,B_0); - traits.loadRhs(&blB[2*RhsProgress], B_0); - traits.madd(A0,B1,C1,B1); - traits.loadLhs(&blA[1*LhsProgress], A0); - traits.loadRhs(&blB[3*RhsProgress], B1); - traits.madd(A0,B_0,C0,B_0); - traits.loadRhs(&blB[4*RhsProgress], B_0); - traits.madd(A0,B1,C1,B1); - traits.loadLhs(&blA[2*LhsProgress], A0); - traits.loadRhs(&blB[5*RhsProgress], B1); - traits.madd(A0,B_0,C0,B_0); - traits.loadRhs(&blB[6*RhsProgress], B_0); - traits.madd(A0,B1,C1,B1); - traits.loadLhs(&blA[3*LhsProgress], A0); - traits.loadRhs(&blB[7*RhsProgress], B1); - traits.madd(A0,B_0,C0,B_0); - traits.madd(A0,B1,C1,B1); - } - else - { - LhsPacket A0; - RhsPacket B_0, B1, B2, B3; - - traits.loadLhs(&blA[0*LhsProgress], A0); - traits.loadRhs(&blB[0*RhsProgress], B_0); - traits.loadRhs(&blB[1*RhsProgress], B1); - - traits.madd(A0,B_0,C0,B_0); - traits.loadRhs(&blB[2*RhsProgress], B2); - traits.loadRhs(&blB[3*RhsProgress], B3); - traits.loadRhs(&blB[4*RhsProgress], B_0); - traits.madd(A0,B1,C1,B1); - traits.loadRhs(&blB[5*RhsProgress], B1); - traits.madd(A0,B2,C2,B2); - traits.loadRhs(&blB[6*RhsProgress], B2); - traits.madd(A0,B3,C3,B3); - traits.loadLhs(&blA[1*LhsProgress], A0); - traits.loadRhs(&blB[7*RhsProgress], B3); - traits.madd(A0,B_0,C0,B_0); - traits.loadRhs(&blB[8*RhsProgress], B_0); - traits.madd(A0,B1,C1,B1); - traits.loadRhs(&blB[9*RhsProgress], B1); - traits.madd(A0,B2,C2,B2); - traits.loadRhs(&blB[10*RhsProgress], B2); - traits.madd(A0,B3,C3,B3); - traits.loadLhs(&blA[2*LhsProgress], A0); - traits.loadRhs(&blB[11*RhsProgress], B3); - - traits.madd(A0,B_0,C0,B_0); - traits.loadRhs(&blB[12*RhsProgress], B_0); - 
traits.madd(A0,B1,C1,B1); - traits.loadRhs(&blB[13*RhsProgress], B1); - traits.madd(A0,B2,C2,B2); - traits.loadRhs(&blB[14*RhsProgress], B2); - traits.madd(A0,B3,C3,B3); - - traits.loadLhs(&blA[3*LhsProgress], A0); - traits.loadRhs(&blB[15*RhsProgress], B3); - traits.madd(A0,B_0,C0,B_0); - traits.madd(A0,B1,C1,B1); - traits.madd(A0,B2,C2,B2); - traits.madd(A0,B3,C3,B3); - } + // We select a 2*Traits::LhsProgress x nr micro block of res which is entirely + // stored into 2 x nr registers. + + const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0, C1, C2, C3, + C4, C5, C6, C7; + traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3); + traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); + + ResScalar* r0 = &res[(j2+0)*resStride + i]; + ResScalar* r1 = &res[(j2+1)*resStride + i]; + ResScalar* r2 = &res[(j2+2)*resStride + i]; + ResScalar* r3 = &res[(j2+3)*resStride + i]; + + internal::prefetch(r0+prefetch_res_offset); + internal::prefetch(r1+prefetch_res_offset); + internal::prefetch(r2+prefetch_res_offset); + internal::prefetch(r3+prefetch_res_offset); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; + prefetch(&blB[0]); + LhsPacket A0, A1; - blB += nr*4*RhsProgress; - blA += 4*LhsProgress; - } - // process remaining peeled loop - for(Index k=peeled_kc; k<depth; k++) - { - if(nr==2) + for(Index k=0; k<peeled_kc; k+=pk) { - LhsPacket A0; - RhsPacket B_0, B1; - - traits.loadLhs(&blA[0*LhsProgress], A0); - traits.loadRhs(&blB[0*RhsProgress], B_0); - traits.loadRhs(&blB[1*RhsProgress], B1); - traits.madd(A0,B_0,C0,B_0); - traits.madd(A0,B1,C1,B1); + EIGEN_ASM_COMMENT("begin gegp micro kernel 2pX4"); + RhsPacket B_0, B1, B2, B3, T0; + + #define EIGEN_GEBGP_ONESTEP(K) \ + traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ + traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ + traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ + traits.madd(A0, B_0, C0, T0); \ + traits.madd(A1, B_0, C4, B_0); \ + traits.madd(A0, B1, C1, T0); \ + traits.madd(A1, B1, C5, B1); \ + traits.madd(A0, B2, C2, T0); \ + traits.madd(A1, B2, C6, B2); \ + traits.madd(A0, B3, C3, T0); \ + traits.madd(A1, B3, C7, B3) + + internal::prefetch(blB+(48+0)); + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + internal::prefetch(blB+(48+16)); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*4*RhsProgress; + blA += pk*(2*Traits::LhsProgress); } - else + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) { - LhsPacket A0; - RhsPacket B_0, B1, B2, B3; - - traits.loadLhs(&blA[0*LhsProgress], A0); - traits.loadRhs(&blB[0*RhsProgress], B_0); - traits.loadRhs(&blB[1*RhsProgress], B1); - traits.loadRhs(&blB[2*RhsProgress], B2); - traits.loadRhs(&blB[3*RhsProgress], B3); - - traits.madd(A0,B_0,C0,B_0); - traits.madd(A0,B1,C1,B1); - traits.madd(A0,B2,C2,B2); - traits.madd(A0,B3,C3,B3); + RhsPacket B_0, B1, B2, B3, T0; + EIGEN_GEBGP_ONESTEP(0); + blB += 4*RhsProgress; + blA += 2*Traits::LhsProgress; } - - blB += nr*RhsProgress; - blA += LhsProgress; +#undef EIGEN_GEBGP_ONESTEP + + ResPacket R0, R1, R2, R3; + ResPacket alphav = pset1<ResPacket>(alpha); + + R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize); + R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize); + R2 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize); + 
R3 = ploadu<ResPacket>(r1+1*Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + traits.acc(C1, alphav, R2); + traits.acc(C5, alphav, R3); + pstoreu(r0+0*Traits::ResPacketSize, R0); + pstoreu(r0+1*Traits::ResPacketSize, R1); + pstoreu(r1+0*Traits::ResPacketSize, R2); + pstoreu(r1+1*Traits::ResPacketSize, R3); + + R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize); + R1 = ploadu<ResPacket>(r2+1*Traits::ResPacketSize); + R2 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize); + R3 = ploadu<ResPacket>(r3+1*Traits::ResPacketSize); + traits.acc(C2, alphav, R0); + traits.acc(C6, alphav, R1); + traits.acc(C3, alphav, R2); + traits.acc(C7, alphav, R3); + pstoreu(r2+0*Traits::ResPacketSize, R0); + pstoreu(r2+1*Traits::ResPacketSize, R1); + pstoreu(r3+0*Traits::ResPacketSize, R2); + pstoreu(r3+1*Traits::ResPacketSize, R3); } + + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2<cols; j2++) + { + // One column at a time + const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)]; + prefetch(&blA[0]); - ResPacket R0, R1, R2, R3; - ResPacket alphav = pset1<ResPacket>(alpha); - - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = r0 + resStride; - ResScalar* r2 = r1 + resStride; - ResScalar* r3 = r2 + resStride; + // gets res block as register + AccPacket C0, C4; + traits.initAcc(C0); + traits.initAcc(C4); - R0 = ploadu<ResPacket>(r0); - R1 = ploadu<ResPacket>(r1); - if(nr==4) R2 = ploadu<ResPacket>(r2); - if(nr==4) R3 = ploadu<ResPacket>(r3); + ResScalar* r0 = &res[(j2+0)*resStride + i]; + internal::prefetch(r0+prefetch_res_offset); - traits.acc(C0, alphav, R0); - traits.acc(C1, alphav, R1); - if(nr==4) traits.acc(C2, alphav, R2); - if(nr==4) traits.acc(C3, alphav, R3); + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB]; + LhsPacket A0, A1; - pstoreu(r0, R0); - pstoreu(r1, R1); - if(nr==4) pstoreu(r2, R2); - if(nr==4) pstoreu(r3, R3); - } - for(Index i=peeled_mc2; i<rows; i++) - { - const LhsScalar* blA = &blockA[i*strideA+offsetA]; - prefetch(&blA[0]); - - // gets a 1 x nr res block as registers - ResScalar C0(0), C1(0), C2(0), C3(0); - // TODO directly use blockB ??? 
- const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; - for(Index k=0; k<depth; k++) - { - if(nr==2) + for(Index k=0; k<peeled_kc; k+=pk) { - LhsScalar A0; - RhsScalar B_0, B1; - - A0 = blA[k]; - B_0 = blB[0]; - B1 = blB[1]; - MADD(cj,A0,B_0,C0,B_0); - MADD(cj,A0,B1,C1,B1); + EIGEN_ASM_COMMENT("begin gegp micro kernel 2p x 1"); + RhsPacket B_0, B1; + +#define EIGEN_GEBGP_ONESTEP(K) \ + traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ + traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ + traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B1); \ + traits.madd(A1, B_0, C4, B_0) + + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*RhsProgress; + blA += pk*2*Traits::LhsProgress; } - else + + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) { - LhsScalar A0; - RhsScalar B_0, B1, B2, B3; - - A0 = blA[k]; - B_0 = blB[0]; - B1 = blB[1]; - B2 = blB[2]; - B3 = blB[3]; - - MADD(cj,A0,B_0,C0,B_0); - MADD(cj,A0,B1,C1,B1); - MADD(cj,A0,B2,C2,B2); - MADD(cj,A0,B3,C3,B3); + RhsPacket B_0, B1; + EIGEN_GEBGP_ONESTEP(0); + blB += RhsProgress; + blA += 2*Traits::LhsProgress; } +#undef EIGEN_GEBGP_ONESTEP + ResPacket R0, R1; + ResPacket alphav = pset1<ResPacket>(alpha); - blB += nr; + R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize); + R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + pstoreu(r0+0*Traits::ResPacketSize, R0); + pstoreu(r0+1*Traits::ResPacketSize, R1); } - res[(j2+0)*resStride + i] += alpha*C0; - res[(j2+1)*resStride + i] += alpha*C1; - if(nr==4) res[(j2+2)*resStride + i] += alpha*C2; - if(nr==4) res[(j2+3)*resStride + i] += alpha*C3; } } - // process remaining rhs/res columns one at a time - // => do the same but with nr==1 - for(Index j2=packet_cols; j2<cols; j2++) + //---------- Process 1 * LhsProgress rows at once ---------- + if(mr>=1*Traits::LhsProgress) { - // unpack B - traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB); - - for(Index i=0; i<peeled_mc; i+=mr) + // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth) + for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress) { - const LhsScalar* blA = &blockA[i*strideA+offsetA*mr]; - prefetch(&blA[0]); - - // TODO move the res loads to the stores - - // get res block as registers - AccPacket C0, C4; - traits.initAcc(C0); - traits.initAcc(C4); - - const RhsScalar* blB = unpackedB; - for(Index k=0; k<depth; k++) + // loops on each largest micro vertical panel of rhs (depth * nr) + for(Index j2=0; j2<packet_cols4; j2+=nr) { - LhsPacket A0, A1; - RhsPacket B_0; - RhsPacket T0; - - traits.loadLhs(&blA[0*LhsProgress], A0); - traits.loadLhs(&blA[1*LhsProgress], A1); - traits.loadRhs(&blB[0*RhsProgress], B_0); - traits.madd(A0,B_0,C0,T0); - traits.madd(A1,B_0,C4,B_0); + // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely + // stored into 1 x nr registers. 
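Once packets and peeling are stripped away, all of these register-blocked paths compute the same thing. A plain scalar sketch of one micro block follows; the names w, C, blA and blB are illustrative, and the packed layouts assumed are the ones produced by gemm_pack_lhs / gemm_pack_rhs further down:

    // A' stores w consecutive lhs rows per k; B' stores nr rhs coefficients per k.
    // C starts at zero (initAcc) and lives entirely in registers.
    for (Index k = 0; k < depth; ++k)
      for (Index n = 0; n < nr; ++n)
        for (Index m = 0; m < w; ++m)
          C[m][n] += blA[k*w + m] * blB[k*nr + n];      // the madd step
    // finally:  res(i + m, j2 + n) += alpha * C[m][n]  // the acc + pstoreu step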
+ + const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0, C1, C2, C3; + traits.initAcc(C0); + traits.initAcc(C1); + traits.initAcc(C2); + traits.initAcc(C3); + + ResScalar* r0 = &res[(j2+0)*resStride + i]; + ResScalar* r1 = &res[(j2+1)*resStride + i]; + ResScalar* r2 = &res[(j2+2)*resStride + i]; + ResScalar* r3 = &res[(j2+3)*resStride + i]; + + internal::prefetch(r0+prefetch_res_offset); + internal::prefetch(r1+prefetch_res_offset); + internal::prefetch(r2+prefetch_res_offset); + internal::prefetch(r3+prefetch_res_offset); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; + prefetch(&blB[0]); + LhsPacket A0; - blB += RhsProgress; - blA += 2*LhsProgress; + for(Index k=0; k<peeled_kc; k+=pk) + { + EIGEN_ASM_COMMENT("begin gegp micro kernel 1pX4"); + RhsPacket B_0, B1, B2, B3; + +#define EIGEN_GEBGP_ONESTEP(K) \ + traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \ + traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ + traits.madd(A0, B_0, C0, B_0); \ + traits.madd(A0, B1, C1, B1); \ + traits.madd(A0, B2, C2, B2); \ + traits.madd(A0, B3, C3, B3); + + internal::prefetch(blB+(48+0)); + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + internal::prefetch(blB+(48+16)); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*4*RhsProgress; + blA += pk*1*LhsProgress; + } + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) + { + RhsPacket B_0, B1, B2, B3; + EIGEN_GEBGP_ONESTEP(0); + blB += 4*RhsProgress; + blA += 1*LhsProgress; + } +#undef EIGEN_GEBGP_ONESTEP + + ResPacket R0, R1; + ResPacket alphav = pset1<ResPacket>(alpha); + + R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize); + R1 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + pstoreu(r0+0*Traits::ResPacketSize, R0); + pstoreu(r1+0*Traits::ResPacketSize, R1); + + R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize); + R1 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize); + traits.acc(C2, alphav, R0); + traits.acc(C3, alphav, R1); + pstoreu(r2+0*Traits::ResPacketSize, R0); + pstoreu(r3+0*Traits::ResPacketSize, R1); } - ResPacket R0, R4; - ResPacket alphav = pset1<ResPacket>(alpha); + + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2<cols; j2++) + { + // One column at a time + const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)]; + prefetch(&blA[0]); - ResScalar* r0 = &res[(j2+0)*resStride + i]; + // gets res block as register + AccPacket C0; + traits.initAcc(C0); - R0 = ploadu<ResPacket>(r0); - R4 = ploadu<ResPacket>(r0+ResPacketSize); + ResScalar* r0 = &res[(j2+0)*resStride + i]; + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB]; + LhsPacket A0; - traits.acc(C0, alphav, R0); - traits.acc(C4, alphav, R4); + for(Index k=0; k<peeled_kc; k+=pk) + { + EIGEN_ASM_COMMENT("begin gegp micro kernel 2p x 1"); + RhsPacket B_0; + +#define EIGEN_GEBGP_ONESTEP(K) \ + traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \ + traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B_0); \ + + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*RhsProgress; + blA += 
pk*1*Traits::LhsProgress; + } - pstoreu(r0, R0); - pstoreu(r0+ResPacketSize, R4); + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) + { + RhsPacket B_0; + EIGEN_GEBGP_ONESTEP(0); + blB += RhsProgress; + blA += 1*Traits::LhsProgress; + } +#undef EIGEN_GEBGP_ONESTEP + ResPacket R0; + ResPacket alphav = pset1<ResPacket>(alpha); + R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + pstoreu(r0+0*Traits::ResPacketSize, R0); + } } - if(rows-peeled_mc>=LhsProgress) + } + //---------- Process remaining rows, 1 at once ---------- + if(peeled_mc1<rows) + { + // loop on each panel of the rhs + for(Index j2=0; j2<packet_cols4; j2+=nr) { - Index i = peeled_mc; - const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; - prefetch(&blA[0]); - - AccPacket C0; - traits.initAcc(C0); - - const RhsScalar* blB = unpackedB; - for(Index k=0; k<depth; k++) + // loop on each row of the lhs (1*LhsProgress x depth) + for(Index i=peeled_mc1; i<rows; i+=1) { - LhsPacket A0; - RhsPacket B_0; - traits.loadLhs(blA, A0); - traits.loadRhs(blB, B_0); - traits.madd(A0, B_0, C0, B_0); - blB += RhsProgress; - blA += LhsProgress; + const LhsScalar* blA = &blockA[i*strideA+offsetA]; + prefetch(&blA[0]); + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; + + if( (SwappedTraits::LhsProgress % 4)==0 ) + { + // NOTE The following piece of code wont work for 512 bit registers + SAccPacket C0, C1, C2, C3; + straits.initAcc(C0); + straits.initAcc(C1); + straits.initAcc(C2); + straits.initAcc(C3); + + const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4); + const Index endk = (depth/spk)*spk; + const Index endk4 = (depth/(spk*4))*(spk*4); + + Index k=0; + for(; k<endk4; k+=4*spk) + { + SLhsPacket A0,A1; + SRhsPacket B_0,B_1; + + straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0); + straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1); + + straits.loadRhsQuad(blA+0*spk, B_0); + straits.loadRhsQuad(blA+1*spk, B_1); + straits.madd(A0,B_0,C0,B_0); + straits.madd(A1,B_1,C1,B_1); + + straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0); + straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1); + straits.loadRhsQuad(blA+2*spk, B_0); + straits.loadRhsQuad(blA+3*spk, B_1); + straits.madd(A0,B_0,C2,B_0); + straits.madd(A1,B_1,C3,B_1); + + blB += 4*SwappedTraits::LhsProgress; + blA += 4*spk; + } + C0 = padd(padd(C0,C1),padd(C2,C3)); + for(; k<endk; k+=spk) + { + SLhsPacket A0; + SRhsPacket B_0; + + straits.loadLhsUnaligned(blB, A0); + straits.loadRhsQuad(blA, B_0); + straits.madd(A0,B_0,C0,B_0); + + blB += SwappedTraits::LhsProgress; + blA += spk; + } + if(SwappedTraits::LhsProgress==8) + { + // Special case where we have to first reduce the accumulation register C0 + typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf; + typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf; + typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf; + typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf; + + SResPacketHalf R = pgather<SResScalar, SResPacketHalf>(&res[j2*resStride + i], resStride); + SResPacketHalf alphav = pset1<SResPacketHalf>(alpha); + + if(depth-endk>0) + { + // We have to handle the last row of the rhs which 
corresponds to a half-packet + SLhsPacketHalf a0; + SRhsPacketHalf b0; + straits.loadLhsUnaligned(blB, a0); + straits.loadRhs(blA, b0); + SAccPacketHalf c0 = predux4(C0); + straits.madd(a0,b0,c0,b0); + straits.acc(c0, alphav, R); + } + else + { + straits.acc(predux4(C0), alphav, R); + } + pscatter(&res[j2*resStride + i], R, resStride); + } + else + { + SResPacket R = pgather<SResScalar, SResPacket>(&res[j2*resStride + i], resStride); + SResPacket alphav = pset1<SResPacket>(alpha); + straits.acc(C0, alphav, R); + pscatter(&res[j2*resStride + i], R, resStride); + } + } + else // scalar path + { + // get a 1 x 4 res block as registers + ResScalar C0(0), C1(0), C2(0), C3(0); + + for(Index k=0; k<depth; k++) + { + LhsScalar A0; + RhsScalar B_0, B_1; + + A0 = blA[k]; + + B_0 = blB[0]; + B_1 = blB[1]; + MADD(cj,A0,B_0,C0, B_0); + MADD(cj,A0,B_1,C1, B_1); + + B_0 = blB[2]; + B_1 = blB[3]; + MADD(cj,A0,B_0,C2, B_0); + MADD(cj,A0,B_1,C3, B_1); + + blB += 4; + } + res[(j2+0)*resStride + i] += alpha*C0; + res[(j2+1)*resStride + i] += alpha*C1; + res[(j2+2)*resStride + i] += alpha*C2; + res[(j2+3)*resStride + i] += alpha*C3; + } } - - ResPacket alphav = pset1<ResPacket>(alpha); - ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]); - traits.acc(C0, alphav, R0); - pstoreu(&res[(j2+0)*resStride + i], R0); } - for(Index i=peeled_mc2; i<rows; i++) + // remaining columns + for(Index j2=packet_cols4; j2<cols; j2++) { - const LhsScalar* blA = &blockA[i*strideA+offsetA]; - prefetch(&blA[0]); - - // gets a 1 x 1 res block as registers - ResScalar C0(0); - // FIXME directly use blockB ?? - const RhsScalar* blB = &blockB[j2*strideB+offsetB]; - for(Index k=0; k<depth; k++) + // loop on each row of the lhs (1*LhsProgress x depth) + for(Index i=peeled_mc1; i<rows; i+=1) { - LhsScalar A0 = blA[k]; - RhsScalar B_0 = blB[k]; - MADD(cj, A0, B_0, C0, B_0); + const LhsScalar* blA = &blockA[i*strideA+offsetA]; + prefetch(&blA[0]); + // gets a 1 x 1 res block as registers + ResScalar C0(0); + const RhsScalar* blB = &blockB[j2*strideB+offsetB]; + for(Index k=0; k<depth; k++) + { + LhsScalar A0 = blA[k]; + RhsScalar B_0 = blB[k]; + MADD(cj, A0, B_0, C0, B_0); + } + res[(j2+0)*resStride + i] += alpha*C0; } - res[(j2+0)*resStride + i] += alpha*C0; } } } @@ -1114,14 +1417,14 @@ EIGEN_ASM_COMMENT("mybegin4"); // // 32 33 34 35 ... // 36 36 38 39 ... 
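Setting aside vectorization, alignment and the PanelMode stride/offset bookkeeping, the column-major packing below amounts to copying the lhs strip by strip, with the rows of a strip interleaved along k, which is the ordering sketched in the comment above. A scalar model, where strip_width is a hypothetical helper standing in for the successive strip sizes:

    Index count = 0;
    Index i = 0;
    while (i < rows)
    {
      const Index w = strip_width(i);   // 3*PacketSize, then 2*PacketSize, PacketSize, Pack2, 1
      for (Index k = 0; k < depth; ++k)
        for (Index m = 0; m < w; ++m)
          blockA[count++] = cj(lhs(i + m, k));
      i += w;
    }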
-template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode> -struct gemm_pack_lhs +template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode> +struct gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conjugate, PanelMode> { EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); }; -template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode> -EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder, Conjugate, PanelMode> +template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conjugate, PanelMode> ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) { typedef typename packet_traits<Scalar>::type Packet; @@ -1131,64 +1434,176 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder, EIGEN_UNUSED_VARIABLE(stride); EIGEN_UNUSED_VARIABLE(offset); eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); - eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) ); + eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) ); conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; - const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride); + const_blas_data_mapper<Scalar, Index, ColMajor> lhs(_lhs,lhsStride); Index count = 0; - Index peeled_mc = (rows/Pack1)*Pack1; - for(Index i=0; i<peeled_mc; i+=Pack1) + + const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; + const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 + : Pack2>1 ? 
(rows/Pack2)*Pack2 : 0; + + Index i=0; + + // Pack 3 packets + if(Pack1>=3*PacketSize) { - if(PanelMode) count += Pack1 * offset; - - if(StorageOrder==ColMajor) + for(; i<peeled_mc3; i+=3*PacketSize) + { + if(PanelMode) count += (3*PacketSize) * offset; + + for(Index k=0; k<depth; k++) + { + Packet A, B, C; + A = ploadu<Packet>(&lhs(i+0*PacketSize, k)); + B = ploadu<Packet>(&lhs(i+1*PacketSize, k)); + C = ploadu<Packet>(&lhs(i+2*PacketSize, k)); + pstore(blockA+count, cj.pconj(A)); count+=PacketSize; + pstore(blockA+count, cj.pconj(B)); count+=PacketSize; + pstore(blockA+count, cj.pconj(C)); count+=PacketSize; + } + if(PanelMode) count += (3*PacketSize) * (stride-offset-depth); + } + } + // Pack 2 packets + if(Pack1>=2*PacketSize) + { + for(; i<peeled_mc2; i+=2*PacketSize) { + if(PanelMode) count += (2*PacketSize) * offset; + for(Index k=0; k<depth; k++) { - Packet A, B, C, D; - if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k)); - if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k)); - if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k)); - if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k)); - if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } - if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } - if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } - if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; } + Packet A, B; + A = ploadu<Packet>(&lhs(i+0*PacketSize, k)); + B = ploadu<Packet>(&lhs(i+1*PacketSize, k)); + pstore(blockA+count, cj.pconj(A)); count+=PacketSize; + pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } + if(PanelMode) count += (2*PacketSize) * (stride-offset-depth); } - else + } + // Pack 1 packets + if(Pack1>=1*PacketSize) + { + for(; i<peeled_mc1; i+=1*PacketSize) { + if(PanelMode) count += (1*PacketSize) * offset; + for(Index k=0; k<depth; k++) { - // TODO add a vectorized transpose here + Packet A; + A = ploadu<Packet>(&lhs(i+0*PacketSize, k)); + pstore(blockA+count, cj.pconj(A)); + count+=PacketSize; + } + if(PanelMode) count += (1*PacketSize) * (stride-offset-depth); + } + } + // Pack scalars + if(Pack2<PacketSize && Pack2>1) + { + for(; i<peeled_mc0; i+=Pack2) + { + if(PanelMode) count += Pack2 * offset; + + for(Index k=0; k<depth; k++) + for(Index w=0; w<Pack2; w++) + blockA[count++] = cj(lhs(i+w, k)); + + if(PanelMode) count += Pack2 * (stride-offset-depth); + } + } + for(; i<rows; i++) + { + if(PanelMode) count += offset; + for(Index k=0; k<depth; k++) + blockA[count++] = cj(lhs(i, k)); + if(PanelMode) count += (stride-offset-depth); + } +} + +template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode> +struct gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conjugate, PanelMode> +{ + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conjugate, PanelMode> + ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) +{ + typedef typename packet_traits<Scalar>::type Packet; + enum { PacketSize = packet_traits<Scalar>::size }; + + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); + 
EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(offset); + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; + const_blas_data_mapper<Scalar, Index, RowMajor> lhs(_lhs,lhsStride); + Index count = 0; + +// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; +// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; +// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + + int pack = Pack1; + Index i = 0; + while(pack>0) + { + Index remaining_rows = rows-i; + Index peeled_mc = i+(remaining_rows/pack)*pack; + for(; i<peeled_mc; i+=pack) + { + if(PanelMode) count += pack * offset; + + const Index peeled_k = (depth/PacketSize)*PacketSize; + Index k=0; + if(pack>=PacketSize) + { + for(; k<peeled_k; k+=PacketSize) + { + for (Index m = 0; m < pack; m += PacketSize) + { + PacketBlock<Packet> kernel; + for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu<Packet>(&lhs(i+p+m, k)); + ptranspose(kernel); + for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + } + count += PacketSize*pack; + } + } + for(; k<depth; k++) + { Index w=0; - for(; w<Pack1-3; w+=4) + for(; w<pack-3; w+=4) { Scalar a(cj(lhs(i+w+0, k))), - b(cj(lhs(i+w+1, k))), - c(cj(lhs(i+w+2, k))), - d(cj(lhs(i+w+3, k))); + b(cj(lhs(i+w+1, k))), + c(cj(lhs(i+w+2, k))), + d(cj(lhs(i+w+3, k))); blockA[count++] = a; blockA[count++] = b; blockA[count++] = c; blockA[count++] = d; } - if(Pack1%4) - for(;w<Pack1;++w) + if(pack%4) + for(;w<pack;++w) blockA[count++] = cj(lhs(i+w, k)); } + + if(PanelMode) count += pack * (stride-offset-depth); } - if(PanelMode) count += Pack1 * (stride-offset-depth); - } - if(rows-peeled_mc>=Pack2) - { - if(PanelMode) count += Pack2*offset; - for(Index k=0; k<depth; k++) - for(Index w=0; w<Pack2; w++) - blockA[count++] = cj(lhs(peeled_mc+w, k)); - if(PanelMode) count += Pack2 * (stride-offset-depth); - peeled_mc += Pack2; + + pack -= PacketSize; + if(pack<Pack2 && (pack+PacketSize)!=Pack2) + pack = Pack2; } - for(Index i=peeled_mc; i<rows; i++) + + for(; i<rows; i++) { if(PanelMode) count += offset; for(Index k=0; k<depth; k++) @@ -1221,30 +1636,99 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan EIGEN_UNUSED_VARIABLE(offset); eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; - Index packet_cols = (cols/nr) * nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; Index count = 0; - for(Index j2=0; j2<packet_cols; j2+=nr) + const Index peeled_k = (depth/PacketSize)*PacketSize; +// if(nr>=8) +// { +// for(Index j2=0; j2<packet_cols8; j2+=8) +// { +// // skip what we have before +// if(PanelMode) count += 8 * offset; +// const Scalar* b0 = &rhs[(j2+0)*rhsStride]; +// const Scalar* b1 = &rhs[(j2+1)*rhsStride]; +// const Scalar* b2 = &rhs[(j2+2)*rhsStride]; +// const Scalar* b3 = &rhs[(j2+3)*rhsStride]; +// const Scalar* b4 = &rhs[(j2+4)*rhsStride]; +// const Scalar* b5 = &rhs[(j2+5)*rhsStride]; +// const Scalar* b6 = &rhs[(j2+6)*rhsStride]; +// const Scalar* b7 = &rhs[(j2+7)*rhsStride]; +// Index k=0; +// if(PacketSize==8) // TODO enbale vectorized transposition for PacketSize==4 +// { +// for(; k<peeled_k; k+=PacketSize) { +// PacketBlock<Packet> kernel; +// for (int p = 0; p < PacketSize; ++p) { +// kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]); +// } +// ptranspose(kernel); +// for (int p = 0; p < PacketSize; ++p) { +// pstoreu(blockB+count, cj.pconj(kernel.packet[p])); +// count+=PacketSize; +// } +// } +// } +// for(; k<depth; k++) +// { +// blockB[count+0] = cj(b0[k]); +// blockB[count+1] = cj(b1[k]); +// blockB[count+2] = cj(b2[k]); +// blockB[count+3] = cj(b3[k]); +// blockB[count+4] = cj(b4[k]); +// blockB[count+5] = cj(b5[k]); +// blockB[count+6] = cj(b6[k]); +// blockB[count+7] = cj(b7[k]); +// count += 8; +// } +// // skip what we have after +// if(PanelMode) count += 8 * (stride-offset-depth); +// } +// } + + if(nr>=4) { - // skip what we have before - if(PanelMode) count += nr * offset; - const Scalar* b0 = &rhs[(j2+0)*rhsStride]; - const Scalar* b1 = &rhs[(j2+1)*rhsStride]; - const Scalar* b2 = &rhs[(j2+2)*rhsStride]; - const Scalar* b3 = &rhs[(j2+3)*rhsStride]; - for(Index k=0; k<depth; k++) + for(Index j2=packet_cols8; j2<packet_cols4; j2+=4) { - blockB[count+0] = cj(b0[k]); - blockB[count+1] = cj(b1[k]); - if(nr==4) blockB[count+2] = cj(b2[k]); - if(nr==4) blockB[count+3] = cj(b3[k]); - count += nr; + // skip what we have before + if(PanelMode) count += 4 * offset; + const Scalar* b0 = &rhs[(j2+0)*rhsStride]; + const Scalar* b1 = &rhs[(j2+1)*rhsStride]; + const Scalar* b2 = &rhs[(j2+2)*rhsStride]; + const Scalar* b3 = &rhs[(j2+3)*rhsStride]; + + Index k=0; + if((PacketSize%4)==0) // TODO enbale vectorized transposition for PacketSize==2 ?? 
+ { + for(; k<peeled_k; k+=PacketSize) { + PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel; + kernel.packet[0] = ploadu<Packet>(&b0[k]); + kernel.packet[1] = ploadu<Packet>(&b1[k]); + kernel.packet[2] = ploadu<Packet>(&b2[k]); + kernel.packet[3] = ploadu<Packet>(&b3[k]); + ptranspose(kernel); + pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); + pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1])); + pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2])); + pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3])); + count+=4*PacketSize; + } + } + for(; k<depth; k++) + { + blockB[count+0] = cj(b0[k]); + blockB[count+1] = cj(b1[k]); + blockB[count+2] = cj(b2[k]); + blockB[count+3] = cj(b3[k]); + count += 4; + } + // skip what we have after + if(PanelMode) count += 4 * (stride-offset-depth); } - // skip what we have after - if(PanelMode) count += nr * (stride-offset-depth); } // copy the remaining columns one at a time (nr==1) - for(Index j2=packet_cols; j2<cols; ++j2) + for(Index j2=packet_cols4; j2<cols; ++j2) { if(PanelMode) count += offset; const Scalar* b0 = &rhs[(j2+0)*rhsStride]; @@ -1275,32 +1759,70 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan EIGEN_UNUSED_VARIABLE(offset); eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; - Index packet_cols = (cols/nr) * nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; Index count = 0; - for(Index j2=0; j2<packet_cols; j2+=nr) + +// if(nr>=8) +// { +// for(Index j2=0; j2<packet_cols8; j2+=8) +// { +// // skip what we have before +// if(PanelMode) count += 8 * offset; +// for(Index k=0; k<depth; k++) +// { +// if (PacketSize==8) { +// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]); +// pstoreu(blockB+count, cj.pconj(A)); +// } else if (PacketSize==4) { +// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]); +// Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]); +// pstoreu(blockB+count, cj.pconj(A)); +// pstoreu(blockB+count+PacketSize, cj.pconj(B)); +// } else { +// const Scalar* b0 = &rhs[k*rhsStride + j2]; +// blockB[count+0] = cj(b0[0]); +// blockB[count+1] = cj(b0[1]); +// blockB[count+2] = cj(b0[2]); +// blockB[count+3] = cj(b0[3]); +// blockB[count+4] = cj(b0[4]); +// blockB[count+5] = cj(b0[5]); +// blockB[count+6] = cj(b0[6]); +// blockB[count+7] = cj(b0[7]); +// } +// count += 8; +// } +// // skip what we have after +// if(PanelMode) count += 8 * (stride-offset-depth); +// } +// } + if(nr>=4) { - // skip what we have before - if(PanelMode) count += nr * offset; - for(Index k=0; k<depth; k++) + for(Index j2=packet_cols8; j2<packet_cols4; j2+=4) { - if (nr == PacketSize) { - Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]); - pstoreu(blockB+count, cj.pconj(A)); - count += PacketSize; - } else { - const Scalar* b0 = &rhs[k*rhsStride + j2]; - blockB[count+0] = cj(b0[0]); - blockB[count+1] = cj(b0[1]); - if(nr==4) blockB[count+2] = cj(b0[2]); - if(nr==4) blockB[count+3] = cj(b0[3]); - count += nr; + // skip what we have before + if(PanelMode) count += 4 * offset; + for(Index k=0; k<depth; k++) + { + if (PacketSize==4) { + Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]); + pstoreu(blockB+count, cj.pconj(A)); + count += PacketSize; + } else { + const Scalar* b0 = &rhs[k*rhsStride + j2]; + blockB[count+0] = cj(b0[0]); + blockB[count+1] = cj(b0[1]); + blockB[count+2] = cj(b0[2]); + 
blockB[count+3] = cj(b0[3]); + count += 4; + } } + // skip what we have after + if(PanelMode) count += 4 * (stride-offset-depth); } - // skip what we have after - if(PanelMode) count += nr * (stride-offset-depth); } // copy the remaining columns one at a time (nr==1) - for(Index j2=packet_cols; j2<cols; ++j2) + for(Index j2=packet_cols4; j2<cols; ++j2) { if(PanelMode) count += offset; const Scalar* b0 = &rhs[j2]; diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index b0a09216d..5fd9a98ac 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -23,6 +23,8 @@ template< typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs> struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor> { + typedef gebp_traits<RhsScalar,LhsScalar> Traits; + typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run( Index rows, Index cols, Index depth, @@ -51,6 +53,8 @@ template< struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor> { +typedef gebp_traits<LhsScalar,RhsScalar> Traits; + typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; static void run(Index rows, Index cols, Index depth, const LhsScalar* _lhs, Index lhsStride, @@ -63,11 +67,9 @@ static void run(Index rows, Index cols, Index depth, const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - typedef gebp_traits<LhsScalar,RhsScalar> Traits; - Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction - //Index nc = blocking.nc(); // cache block size along the N direction + Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs; @@ -80,68 +82,68 @@ static void run(Index rows, Index cols, Index depth, Index tid = omp_get_thread_num(); Index threads = omp_get_num_threads(); - std::size_t sizeA = kc*mc; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; - ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0); - ei_declare_aligned_stack_constructed_variable(RhsScalar, w, sizeW, 0); + LhsScalar* blockA = blocking.blockA(); + eigen_internal_assert(blockA!=0); - RhsScalar* blockB = blocking.blockB(); - eigen_internal_assert(blockB!=0); - + std::size_t sizeB = kc*nc; + ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0); + // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs... for(Index k=0; k<depth; k+=kc) { const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A' // In order to reduce the chance that a thread has to wait for the other, - // let's start by packing A'. - pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc); + // let's start by packing B'. + pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc); - // Pack B_k to B' in a parallel fashion: - // each thread packs the sub block B_k,j to B'_j where j is the thread id. 
+ // Pack A_k to A' in a parallel fashion: + // each thread packs the sub block A_k,i to A'_i where i is the thread id. - // However, before copying to B'_j, we have to make sure that no other thread is still using it, + // However, before copying to A'_i, we have to make sure that no other thread is still using it, // i.e., we test that info[tid].users equals 0. // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. while(info[tid].users!=0) {} info[tid].users += threads; + + pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length); - pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length); - - // Notify the other threads that the part B'_j is ready to go. + // Notify the other threads that the part A'_i is ready to go. info[tid].sync = k; - - // Computes C_i += A' * B' per B'_j + + // Computes C_i += A' * B' per A'_i for(Index shift=0; shift<threads; ++shift) { - Index j = (tid+shift)%threads; + Index i = (tid+shift)%threads; - // At this point we have to make sure that B'_j has been updated by the thread j, + // At this point we have to make sure that A'_i has been updated by the thread i, // we use testAndSetOrdered to mimic a volatile access. // However, no need to wait for the B' part which has been updated by the current thread! if(shift>0) - while(info[j].sync!=k) {} - - gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w); + while(info[i].sync!=k) {} + gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha); } - // Then keep going as usual with the remaining A' - for(Index i=mc; i<rows; i+=mc) + // Then keep going as usual with the remaining B' + for(Index j=nc; j<cols; j+=nc) { - const Index actual_mc = (std::min)(i+mc,rows)-i; + const Index actual_nc = (std::min)(j+nc,cols)-j; - // pack A_i,k to A' - pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc); + // pack B_k,j to B' + pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc); - // C_i += A' * B' - gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w); + // C_j += A' * B' + gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha); } - // Release all the sub blocks B'_j of B' for the current thread, + // Release all the sub blocks A'_i of A' for the current thread, // i.e., we simply decrement the number of users by 1 - for(Index j=0; j<threads; ++j) + #pragma omp critical + { + for(Index i=0; i<threads; ++i) #pragma omp atomic - --(info[j].users); + --(info[i].users); + } } } else @@ -151,38 +153,34 @@ static void run(Index rows, Index cols, Index depth, // this is the sequential version! std::size_t sizeA = kc*mc; - std::size_t sizeB = kc*cols; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = kc*nc; ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); - ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW()); // For each horizontal panel of the rhs, and corresponding panel of the lhs... 
- // (==GEMM_VAR1) for(Index k2=0; k2<depth; k2+=kc) { const Index actual_kc = (std::min)(k2+kc,depth)-k2; // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs. - // => Pack rhs's panel into a sequential chunk of memory (L2 caching) - // Note that this panel will be read as many times as the number of blocks in the lhs's - // vertical panel which is, in practice, a very low number. - pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); - - // For each mc x kc block of the lhs's vertical panel... - // (==GEPP_VAR1) - for(Index i2=0; i2<rows; i2+=mc) + // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching) + // Note that this panel will be read as many times as the number of blocks in the rhs's + // horizontal panel which is, in practice, a very low number. + pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows); + + // For each kc x nc block of the rhs's horizontal panel... + for(Index j2=0; j2<cols; j2+=nc) { - const Index actual_mc = (std::min)(i2+mc,rows)-i2; + const Index actual_nc = (std::min)(j2+nc,cols)-j2; - // We pack the lhs's block into a sequential chunk of memory (L1 caching) + // We pack the rhs's block into a sequential chunk of memory (L2 caching) // Note that this block will be read a very high number of times, which is equal to the number of - // micro vertical panel of the large rhs's panel (e.g., cols/4 times). - pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); + // micro horizontal panel of the large rhs's panel (e.g., rows/12 times). + pack_rhs(blockB, &rhs(k2,j2), rhsStride, actual_kc, actual_nc); - // Everything is packed, we can now call the block * panel kernel: - gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); + // Everything is packed, we can now call the panel * block kernel: + gebp(res+j2*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha); } } } @@ -203,14 +201,13 @@ struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> > template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType> struct gemm_functor { - gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, - BlockingType& blocking) + gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking) : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking) {} void initParallelSession() const { - m_blocking.allocateB(); + m_blocking.allocateA(); } void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const @@ -224,6 +221,8 @@ struct gemm_functor (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(), m_actualAlpha, m_blocking, info); } + + typedef typename Gemm::Traits Traits; protected: const Lhs& m_lhs; @@ -245,7 +244,6 @@ class level3_blocking protected: LhsScalar* m_blockA; RhsScalar* m_blockB; - RhsScalar* m_blockW; DenseIndex m_mc; DenseIndex m_nc; @@ -254,7 +252,7 @@ class level3_blocking public: level3_blocking() - : m_blockA(0), m_blockB(0), m_blockW(0), m_mc(0), m_nc(0), m_kc(0) + : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0) {} inline DenseIndex mc() const { return m_mc; } @@ -263,7 +261,6 @@ class level3_blocking inline LhsScalar* blockA() { return m_blockA; } inline RhsScalar* blockB() { return m_blockB; } - inline RhsScalar* blockW() { return m_blockW; } }; template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, 
int MaxCols, int MaxDepth, int KcFactor> @@ -282,29 +279,25 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M typedef gebp_traits<LhsScalar,RhsScalar> Traits; enum { SizeA = ActualRows * MaxDepth, - SizeB = ActualCols * MaxDepth, - SizeW = MaxDepth * Traits::WorkSpaceFactor + SizeB = ActualCols * MaxDepth }; - EIGEN_ALIGN16 LhsScalar m_staticA[SizeA]; - EIGEN_ALIGN16 RhsScalar m_staticB[SizeB]; - EIGEN_ALIGN16 RhsScalar m_staticW[SizeW]; + EIGEN_ALIGN_DEFAULT LhsScalar m_staticA[SizeA]; + EIGEN_ALIGN_DEFAULT RhsScalar m_staticB[SizeB]; public: - gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/) + gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, bool /*full_rows*/ = false) { this->m_mc = ActualRows; this->m_nc = ActualCols; this->m_kc = MaxDepth; this->m_blockA = m_staticA; this->m_blockB = m_staticB; - this->m_blockW = m_staticW; } inline void allocateA() {} inline void allocateB() {} - inline void allocateW() {} inline void allocateAll() {} }; @@ -323,20 +316,28 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M DenseIndex m_sizeA; DenseIndex m_sizeB; - DenseIndex m_sizeW; public: - gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth) + gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, bool full_rows = false) { this->m_mc = Transpose ? cols : rows; this->m_nc = Transpose ? rows : cols; this->m_kc = depth; - computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc); + if(full_rows) + { + DenseIndex m = this->m_mc; + computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc); + } + else // full columns + { + DenseIndex n = this->m_nc; + computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n); + } + m_sizeA = this->m_mc * this->m_kc; m_sizeB = this->m_kc * this->m_nc; - m_sizeW = this->m_kc*Traits::WorkSpaceFactor; } void allocateA() @@ -351,24 +352,16 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M this->m_blockB = aligned_new<RhsScalar>(m_sizeB); } - void allocateW() - { - if(this->m_blockW==0) - this->m_blockW = aligned_new<RhsScalar>(m_sizeW); - } - void allocateAll() { allocateA(); allocateB(); - allocateW(); } ~gemm_blocking_space() { aligned_delete(this->m_blockA, m_sizeA); aligned_delete(this->m_blockB, m_sizeB); - aligned_delete(this->m_blockW, m_sizeW); } }; @@ -394,7 +387,37 @@ class GeneralProduct<Lhs, Rhs, GemmProduct> typedef internal::scalar_product_op<LhsScalar,RhsScalar> BinOp; EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar); } + + template<typename Dest> + inline void evalTo(Dest& dst) const + { + if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0) + dst.noalias() = m_lhs .lazyProduct( m_rhs ); + else + { + dst.setZero(); + scaleAndAddTo(dst,Scalar(1)); + } + } + + template<typename Dest> + inline void addTo(Dest& dst) const + { + if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0) + dst.noalias() += m_lhs .lazyProduct( m_rhs ); + else + scaleAndAddTo(dst,Scalar(1)); + } + template<typename Dest> + inline void subTo(Dest& dst) const + { + if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0) + dst.noalias() -= m_lhs .lazyProduct( m_rhs ); + else + scaleAndAddTo(dst,Scalar(-1)); + } + template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const { eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); @@ 
-417,7 +440,7 @@ class GeneralProduct<Lhs, Rhs, GemmProduct> (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor; - BlockingType blocking(dst.rows(), dst.cols(), lhs.cols()); + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true); internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit); } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 46be46f85..6070664d5 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -73,11 +73,8 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder, if(mc > Traits::nr) mc = (mc/Traits::nr)*Traits::nr; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; - std::size_t sizeB = sizeW + kc*size; ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0); - RhsScalar* blockB = allocatedBlockB + sizeW; + ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0); gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs; @@ -103,15 +100,15 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder, // 3 - after the diagonal => processed with gebp or skipped if (UpLo==Lower) gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha, - -1, -1, 0, 0, allocatedBlockB); + -1, -1, 0, 0); - sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB); + sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); if (UpLo==Upper) { Index j2 = i2+actual_mc; gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha, - -1, -1, 0, 0, allocatedBlockB); + -1, -1, 0, 0); } } } @@ -136,7 +133,7 @@ struct tribb_kernel enum { BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) }; - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha, RhsScalar* workspace) + void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel; Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer; @@ -150,7 +147,7 @@ struct tribb_kernel if(UpLo==Upper) gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, - -1, -1, 0, 0, workspace); + -1, -1, 0, 0); // selfadjoint micro block { @@ -158,7 +155,7 @@ struct tribb_kernel buffer.setZero(); // 1 - apply the kernel on the temporary buffer gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, - -1, -1, 0, 0, workspace); + -1, -1, 0, 0); // 2 - triangular accumulation for(Index j1=0; j1<actualBlockSize; ++j1) { @@ -173,7 +170,7 @@ struct tribb_kernel { Index i = j+actualBlockSize; gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha, - -1, 
-1, 0, 0, workspace); + -1, -1, 0, 0); } } } @@ -278,6 +275,8 @@ template<typename MatrixType, unsigned int UpLo> template<typename ProductDerived, typename _Lhs, typename _Rhs> TriangularView<MatrixType,UpLo>& TriangularView<MatrixType,UpLo>::assignProduct(const ProductBase<ProductDerived, _Lhs,_Rhs>& prod, const Scalar& alpha) { + eigen_assert(m_matrix.rows() == prod.rows() && m_matrix.cols() == prod.cols()); + general_product_to_triangular_selector<MatrixType, ProductDerived, UpLo, (_Lhs::ColsAtCompileTime==1) || (_Rhs::RowsAtCompileTime==1)>::run(m_matrix.const_cast_derived(), prod.derived(), alpha); return *this; diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index a73ce5ff0..340c51394 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -141,6 +141,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co alignedSize = 0; alignedStart = 0; } + else if(LhsPacketSize > 4) + { + // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4. + // Currently, it seems to be better to perform unaligned loads anyway + alignmentPattern = NoneAligned; + } else if (LhsPacketSize>1) { eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize); @@ -405,6 +411,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co alignedSize = 0; alignedStart = 0; } + else if(LhsPacketSize > 4) + { + // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4. + alignmentPattern = NoneAligned; + } else if (LhsPacketSize>1) { eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize); @@ -442,7 +453,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; for (Index i=skipRows; i<rowBound; i+=rowsAtOnce) { - EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0); + EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0); ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0); // this helps the compiler generating good binary code @@ -551,7 +562,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co { for (Index i=start; i<end; ++i) { - EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0); + EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0); ResPacket ptmp0 = pset1<ResPacket>(tmp0); const LhsScalar* lhs0 = lhs + i*lhsStride; // process first unaligned result's coeffs diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 5c3e9b7ac..4079063eb 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -73,13 +73,13 @@ namespace internal { template<typename Index> struct GemmParallelInfo { - GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {} + GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {} int volatile sync; int volatile users; - Index rhs_start; - Index rhs_length; + Index lhs_start; + Index lhs_length; }; template<bool Condition, typename Functor, typename Index> @@ -107,7 +107,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos if((!Condition) || (omp_get_num_threads()>1)) return func(0,rows, 0,cols); - Index size = transpose ? cols : rows; + Index size = transpose ? 
rows : cols; // 2- compute the maximal number of threads from the size of the product: // FIXME this has to be fine tuned @@ -126,26 +126,25 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos std::swap(rows,cols); Index blockCols = (cols / threads) & ~Index(0x3); - Index blockRows = (rows / threads) & ~Index(0x7); + Index blockRows = (rows / threads); + blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr; GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads]; - #pragma omp parallel for schedule(static,1) num_threads(threads) - for(Index i=0; i<threads; ++i) + #pragma omp parallel num_threads(threads) { + Index i = omp_get_thread_num(); Index r0 = i*blockRows; Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows; Index c0 = i*blockCols; Index actualBlockCols = (i+1==threads) ? cols-c0 : blockCols; - info[i].rhs_start = c0; - info[i].rhs_length = actualBlockCols; + info[i].lhs_start = r0; + info[i].lhs_length = actualBlockRows; - if(transpose) - func(0, cols, r0, actualBlockRows, info); - else - func(r0, actualBlockRows, 0,cols, info); + if(transpose) func(c0, actualBlockCols, 0, rows, info); + else func(0, rows, c0, actualBlockCols, info); } delete[] info; diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index afa8af43c..0ab3f3a56 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -15,7 +15,7 @@ namespace Eigen { namespace internal { // pack a selfadjoint block diagonal for use with the gebp_kernel -template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder> +template<typename Scalar, typename Index, int Pack1, int Pack2_dummy, int StorageOrder> struct symm_pack_lhs { template<int BlockRows> inline @@ -45,25 +45,32 @@ struct symm_pack_lhs } void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) { + enum { PacketSize = packet_traits<Scalar>::size }; const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride); Index count = 0; - Index peeled_mc = (rows/Pack1)*Pack1; - for(Index i=0; i<peeled_mc; i+=Pack1) - { - pack<Pack1>(blockA, lhs, cols, i, count); - } - - if(rows-peeled_mc>=Pack2) - { - pack<Pack2>(blockA, lhs, cols, peeled_mc, count); - peeled_mc += Pack2; - } + //Index peeled_mc3 = (rows/Pack1)*Pack1; + + const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; + const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? 
(rows/(1*PacketSize))*(1*PacketSize) : 0; + + if(Pack1>=3*PacketSize) + for(Index i=0; i<peeled_mc3; i+=3*PacketSize) + pack<3*PacketSize>(blockA, lhs, cols, i, count); + + if(Pack1>=2*PacketSize) + for(Index i=peeled_mc3; i<peeled_mc2; i+=2*PacketSize) + pack<2*PacketSize>(blockA, lhs, cols, i, count); + + if(Pack1>=1*PacketSize) + for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize) + pack<1*PacketSize>(blockA, lhs, cols, i, count); // do the same with mr==1 - for(Index i=peeled_mc; i<rows; i++) + for(Index i=peeled_mc1; i<rows; i++) { for(Index k=0; k<i; k++) - blockA[count++] = lhs(i, k); // normal + blockA[count++] = lhs(i, k); // normal blockA[count++] = numext::real(lhs(i, i)); // real (diagonal) @@ -82,7 +89,8 @@ struct symm_pack_rhs Index end_k = k2 + rows; Index count = 0; const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride); - Index packet_cols = (cols/nr)*nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; // first part: normal case for(Index j2=0; j2<k2; j2+=nr) @@ -91,79 +99,151 @@ struct symm_pack_rhs { blockB[count+0] = rhs(k,j2+0); blockB[count+1] = rhs(k,j2+1); - if (nr==4) + if (nr>=4) { blockB[count+2] = rhs(k,j2+2); blockB[count+3] = rhs(k,j2+3); } + if (nr>=8) + { + blockB[count+4] = rhs(k,j2+4); + blockB[count+5] = rhs(k,j2+5); + blockB[count+6] = rhs(k,j2+6); + blockB[count+7] = rhs(k,j2+7); + } count += nr; } } // second part: diagonal block - for(Index j2=k2; j2<(std::min)(k2+rows,packet_cols); j2+=nr) + Index end8 = nr>=8 ? (std::min)(k2+rows,packet_cols8) : k2; + if(nr>=8) { - // again we can split vertically in three different parts (transpose, symmetric, normal) - // transpose - for(Index k=k2; k<j2; k++) + for(Index j2=k2; j2<end8; j2+=8) { - blockB[count+0] = numext::conj(rhs(j2+0,k)); - blockB[count+1] = numext::conj(rhs(j2+1,k)); - if (nr==4) + // again we can split vertically in three different parts (transpose, symmetric, normal) + // transpose + for(Index k=k2; k<j2; k++) { + blockB[count+0] = numext::conj(rhs(j2+0,k)); + blockB[count+1] = numext::conj(rhs(j2+1,k)); blockB[count+2] = numext::conj(rhs(j2+2,k)); blockB[count+3] = numext::conj(rhs(j2+3,k)); + blockB[count+4] = numext::conj(rhs(j2+4,k)); + blockB[count+5] = numext::conj(rhs(j2+5,k)); + blockB[count+6] = numext::conj(rhs(j2+6,k)); + blockB[count+7] = numext::conj(rhs(j2+7,k)); + count += 8; } - count += nr; - } - // symmetric - Index h = 0; - for(Index k=j2; k<j2+nr; k++) - { - // normal - for (Index w=0 ; w<h; ++w) - blockB[count+w] = rhs(k,j2+w); + // symmetric + Index h = 0; + for(Index k=j2; k<j2+8; k++) + { + // normal + for (Index w=0 ; w<h; ++w) + blockB[count+w] = rhs(k,j2+w); - blockB[count+h] = numext::real(rhs(k,k)); + blockB[count+h] = numext::real(rhs(k,k)); - // transpose - for (Index w=h+1 ; w<nr; ++w) - blockB[count+w] = numext::conj(rhs(j2+w,k)); - count += nr; - ++h; + // transpose + for (Index w=h+1 ; w<8; ++w) + blockB[count+w] = numext::conj(rhs(j2+w,k)); + count += 8; + ++h; + } + // normal + for(Index k=j2+8; k<end_k; k++) + { + blockB[count+0] = rhs(k,j2+0); + blockB[count+1] = rhs(k,j2+1); + blockB[count+2] = rhs(k,j2+2); + blockB[count+3] = rhs(k,j2+3); + blockB[count+4] = rhs(k,j2+4); + blockB[count+5] = rhs(k,j2+5); + blockB[count+6] = rhs(k,j2+6); + blockB[count+7] = rhs(k,j2+7); + count += 8; + } } - // normal - for(Index k=j2+nr; k<end_k; k++) + } + if(nr>=4) + { + for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4) { - blockB[count+0] = rhs(k,j2+0); - blockB[count+1] = rhs(k,j2+1); 
- if (nr==4) + // again we can split vertically in three different parts (transpose, symmetric, normal) + // transpose + for(Index k=k2; k<j2; k++) { + blockB[count+0] = numext::conj(rhs(j2+0,k)); + blockB[count+1] = numext::conj(rhs(j2+1,k)); + blockB[count+2] = numext::conj(rhs(j2+2,k)); + blockB[count+3] = numext::conj(rhs(j2+3,k)); + count += 4; + } + // symmetric + Index h = 0; + for(Index k=j2; k<j2+4; k++) + { + // normal + for (Index w=0 ; w<h; ++w) + blockB[count+w] = rhs(k,j2+w); + + blockB[count+h] = numext::real(rhs(k,k)); + + // transpose + for (Index w=h+1 ; w<4; ++w) + blockB[count+w] = numext::conj(rhs(j2+w,k)); + count += 4; + ++h; + } + // normal + for(Index k=j2+4; k<end_k; k++) + { + blockB[count+0] = rhs(k,j2+0); + blockB[count+1] = rhs(k,j2+1); blockB[count+2] = rhs(k,j2+2); blockB[count+3] = rhs(k,j2+3); + count += 4; } - count += nr; } } // third part: transposed - for(Index j2=k2+rows; j2<packet_cols; j2+=nr) + if(nr>=8) { - for(Index k=k2; k<end_k; k++) + for(Index j2=k2+rows; j2<packet_cols8; j2+=8) { - blockB[count+0] = numext::conj(rhs(j2+0,k)); - blockB[count+1] = numext::conj(rhs(j2+1,k)); - if (nr==4) + for(Index k=k2; k<end_k; k++) { + blockB[count+0] = numext::conj(rhs(j2+0,k)); + blockB[count+1] = numext::conj(rhs(j2+1,k)); blockB[count+2] = numext::conj(rhs(j2+2,k)); blockB[count+3] = numext::conj(rhs(j2+3,k)); + blockB[count+4] = numext::conj(rhs(j2+4,k)); + blockB[count+5] = numext::conj(rhs(j2+5,k)); + blockB[count+6] = numext::conj(rhs(j2+6,k)); + blockB[count+7] = numext::conj(rhs(j2+7,k)); + count += 8; + } + } + } + if(nr>=4) + { + for(Index j2=(std::max)(packet_cols8,k2+rows); j2<packet_cols4; j2+=4) + { + for(Index k=k2; k<end_k; k++) + { + blockB[count+0] = numext::conj(rhs(j2+0,k)); + blockB[count+1] = numext::conj(rhs(j2+1,k)); + blockB[count+2] = numext::conj(rhs(j2+2,k)); + blockB[count+3] = numext::conj(rhs(j2+3,k)); + count += 4; } - count += nr; } } // copy the remaining columns one at a time (=> the same with nr==1) - for(Index j2=packet_cols; j2<cols; ++j2) + for(Index j2=packet_cols4; j2<cols; ++j2) { // transpose Index half = (std::min)(end_k,j2); @@ -261,11 +341,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t // kc must smaller than mc kc = (std::min)(kc,mc); - std::size_t sizeW = kc*Traits::WorkSpaceFactor; - std::size_t sizeB = sizeW + kc*cols; + std::size_t sizeB = kc*cols; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); - Scalar* blockB = allocatedBlockB + sizeW; + Scalar* blockB = allocatedBlockB; gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; @@ -348,11 +427,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f Index mc = rows; // cache block size along the M direction Index nc = cols; // cache block size along the N direction computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); - std::size_t sizeW = kc*Traits::WorkSpaceFactor; - std::size_t sizeB = sizeW + kc*cols; + std::size_t sizeB = kc*cols; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); - Scalar* blockB = allocatedBlockB + sizeW; + Scalar* blockB = allocatedBlockB; gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, 
ConjugateRhs> gebp_kernel; gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; @@ -423,11 +501,11 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)), internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor> ::run( - lhs.rows(), rhs.cols(), // sizes - &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info - &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info - &dst.coeffRef(0,0), dst.outerStride(), // result info - actualAlpha // alpha + lhs.rows(), rhs.cols(), // sizes + &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info + &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info + &dst.coeffRef(0,0), dst.outerStride(), // result info + actualAlpha // alpha ); } }; diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 3f0618410..d93277cb8 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -125,11 +125,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); - ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW()); Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer; triangularBuffer.setZero(); @@ -187,7 +185,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth); gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha, - actualPanelWidth, actual_kc, 0, blockBOffset, blockW); + actualPanelWidth, actual_kc, 0, blockBOffset); // GEBP with remaining micro panel if (lengthTarget>0) @@ -197,7 +195,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget); gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha, - actualPanelWidth, actual_kc, 0, blockBOffset, blockW); + actualPanelWidth, actual_kc, 0, blockBOffset); } } } @@ -211,7 +209,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>() (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0); } } } @@ -265,12 +263,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction std::size_t sizeA = kc*mc; - std::size_t sizeB = kc*cols; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = kc*cols+EIGEN_ALIGN_BYTES/sizeof(Scalar); ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); - 
ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW()); Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer; triangularBuffer.setZero(); @@ -304,6 +300,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc; Scalar* geb = blockB+ts*ts; + geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar)); pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs); @@ -357,14 +354,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, actual_mc, panelLength, actualPanelWidth, alpha, actual_kc, actual_kc, // strides - blockOffset, blockOffset,// offsets - blockW); // workspace + blockOffset, blockOffset);// offsets } } gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride, blockA, geb, actual_mc, actual_kc, rs, alpha, - -1, -1, 0, 0, blockW); + -1, -1, 0, 0); } } } diff --git a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h b/Eigen/src/Core/products/TriangularMatrixVector_MKL.h index 09f110da7..3672b1240 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h +++ b/Eigen/src/Core/products/TriangularMatrixVector_MKL.h @@ -129,7 +129,6 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE, MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \ /* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \ if (size<(std::max)(rows,cols)) { \ - typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ if (size<rows) { \ @@ -214,7 +213,6 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE, MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \ /* Non-square case - doesn't fit to MKL ?TRMV. 
Fall to default triangular product*/ \ if (size<(std::max)(rows,cols)) { \ - typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ if (size<rows) { \ diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index f103eae72..1f7afd187 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -66,11 +66,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); - ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW()); conj_if<Conjugate> conj; gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel; @@ -158,7 +156,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget); gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), - actualPanelWidth, actual_kc, 0, blockBOffset, blockW); + actualPanelWidth, actual_kc, 0, blockBOffset); } } } @@ -174,7 +172,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju { pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc); - gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0, blockW); + gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0); } } } @@ -215,11 +213,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj std::size_t sizeA = kc*mc; std::size_t sizeB = kc*size; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); - ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW()); conj_if<Conjugate> conj; gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel; @@ -285,8 +281,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj actual_mc, panelLength, actualPanelWidth, Scalar(-1), actual_kc, actual_kc, // strides - panelOffset, panelOffset, // offsets - blockW); // workspace + panelOffset, panelOffset); // offsets } // unblocked triangular solve @@ -317,7 +312,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj if (rs>0) gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, actual_mc, actual_kc, rs, Scalar(-1), - -1, -1, 0, 0, blockW); + -1, -1, 0, 0); } } } diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h index 1e6e355d6..8acca9c8c 100644 --- a/Eigen/src/Core/util/MKL_support.h +++ b/Eigen/src/Core/util/MKL_support.h @@ -54,8 +54,25 @@ #endif #if defined EIGEN_USE_MKL +# include <mkl.h> +/*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/ +# ifndef INTEL_MKL_VERSION +# undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on 
older versions */ +# elif INTEL_MKL_VERSION < 100305 /* the intel-mkl-103-release-notes say this was when the lapacke.h interface was added*/ +# undef EIGEN_USE_MKL +# endif +# ifndef EIGEN_USE_MKL + /*If the MKL version is too old, undef everything*/ +# undef EIGEN_USE_MKL_ALL +# undef EIGEN_USE_BLAS +# undef EIGEN_USE_LAPACKE +# undef EIGEN_USE_MKL_VML +# undef EIGEN_USE_LAPACKE_STRICT +# undef EIGEN_USE_LAPACKE +# endif +#endif -#include <mkl.h> +#if defined EIGEN_USE_MKL #include <mkl_lapacke.h> #define EIGEN_MKL_VML_THRESHOLD 128 diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 6aa716698..f84d20397 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -66,13 +66,24 @@ #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 #endif +// Defined the boundary (in bytes) on which the data needs to be aligned. Note +// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be +// aligned at all regardless of the value of this #define. +#define EIGEN_ALIGN_BYTES 16 + #ifdef EIGEN_DONT_ALIGN #ifndef EIGEN_DONT_ALIGN_STATICALLY #define EIGEN_DONT_ALIGN_STATICALLY #endif #define EIGEN_ALIGN 0 -#else +#elif !defined(EIGEN_DONT_VECTORIZE) + #if defined(__AVX__) + #undef EIGEN_ALIGN_BYTES + #define EIGEN_ALIGN_BYTES 32 + #endif #define EIGEN_ALIGN 1 +#else + #define EIGEN_ALIGN 0 #endif // EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable @@ -286,13 +297,19 @@ namespace Eigen { #endif #define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16) +#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32) +#define EIGEN_ALIGN_DEFAULT EIGEN_ALIGN_TO_BOUNDARY(EIGEN_ALIGN_BYTES) #if EIGEN_ALIGN_STATICALLY #define EIGEN_USER_ALIGN_TO_BOUNDARY(n) EIGEN_ALIGN_TO_BOUNDARY(n) #define EIGEN_USER_ALIGN16 EIGEN_ALIGN16 +#define EIGEN_USER_ALIGN32 EIGEN_ALIGN32 +#define EIGEN_USER_ALIGN_DEFAULT EIGEN_ALIGN_DEFAULT #else #define EIGEN_USER_ALIGN_TO_BOUNDARY(n) #define EIGEN_USER_ALIGN16 +#define EIGEN_USER_ALIGN32 +#define EIGEN_USER_ALIGN_DEFAULT #endif #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD @@ -326,13 +343,13 @@ namespace Eigen { #elif defined(__clang__) // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; \ - EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \ template <typename OtherDerived> \ - EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other) { Base::operator=(other.derived()); return *this; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other) { Base::operator=(other.derived()); return *this; } #else #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; \ - EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \ { \ Base::operator=(other); \ return *this; \ diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 286e963d2..390b60c74 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -32,7 +32,7 @@ // page 114, "[The] LP64 model [...] 
is used by all 64-bit UNIX ports" so it's indeed // quite safe, at least within the context of glibc, to equate 64-bit with LP64. #if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \ - && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) + && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && (EIGEN_ALIGN_BYTES == 16) #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1 #else #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0 @@ -42,14 +42,14 @@ // See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup // FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures // See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup -#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) +#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) && (EIGEN_ALIGN_BYTES == 16) #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1 #else #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0 #endif -#if defined(__APPLE__) \ - || defined(_WIN64) \ +#if (defined(__APPLE__) && (EIGEN_ALIGN_BYTES == 16)) \ + || (defined(_WIN64) && (EIGEN_ALIGN_BYTES == 16)) \ || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \ || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED #define EIGEN_MALLOC_ALREADY_ALIGNED 1 @@ -73,7 +73,7 @@ #define EIGEN_HAS_POSIX_MEMALIGN 0 #endif -#ifdef EIGEN_VECTORIZE_SSE +#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_AVX #define EIGEN_HAS_MM_MALLOC 1 #else #define EIGEN_HAS_MM_MALLOC 0 @@ -89,7 +89,7 @@ inline void throw_std_bad_alloc() #ifdef EIGEN_EXCEPTIONS throw std::bad_alloc(); #else - std::size_t huge = -1; + std::size_t huge = static_cast<std::size_t>(-1); new int[huge]; #endif } @@ -105,9 +105,9 @@ inline void throw_std_bad_alloc() */ inline void* handmade_aligned_malloc(std::size_t size) { - void *original = std::malloc(size+16); + void *original = std::malloc(size+EIGEN_ALIGN_BYTES); if (original == 0) return 0; - void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16); + void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES); *(reinterpret_cast<void**>(aligned) - 1) = original; return aligned; } @@ -128,9 +128,9 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = if (ptr == 0) return handmade_aligned_malloc(size); void *original = *(reinterpret_cast<void**>(ptr) - 1); std::ptrdiff_t previous_offset = static_cast<char *>(ptr)-static_cast<char *>(original); - original = std::realloc(original,size+16); + original = std::realloc(original,size+EIGEN_ALIGN_BYTES); if (original == 0) return 0; - void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16); + void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES); void *previous_aligned = static_cast<char *>(original)+previous_offset; if(aligned!=previous_aligned) std::memmove(aligned, previous_aligned, size); @@ -208,7 +208,7 @@ inline void check_that_malloc_is_allowed() {} #endif -/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment. +/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements. * On allocation error, the returned pointer is null, and std::bad_alloc is thrown. 
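 * (Editorial illustration, not part of the original patch.) A minimal usage sketch of this
 * internal helper, assuming the caller pairs the allocation with internal::aligned_free():
 * \code
 * float* buf = static_cast<float*>(Eigen::internal::aligned_malloc(64 * sizeof(float)));
 * // ... use buf as a scratch buffer aligned to EIGEN_ALIGN_BYTES ...
 * Eigen::internal::aligned_free(buf);
 * \endcode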
*/ inline void* aligned_malloc(size_t size) @@ -221,11 +221,11 @@ inline void* aligned_malloc(size_t size) #elif EIGEN_MALLOC_ALREADY_ALIGNED result = std::malloc(size); #elif EIGEN_HAS_POSIX_MEMALIGN - if(posix_memalign(&result, 16, size)) result = 0; + if(posix_memalign(&result, EIGEN_ALIGN_BYTES, size)) result = 0; #elif EIGEN_HAS_MM_MALLOC - result = _mm_malloc(size, 16); + result = _mm_malloc(size, EIGEN_ALIGN_BYTES); #elif defined(_MSC_VER) && (!defined(_WIN32_WCE)) - result = _aligned_malloc(size, 16); + result = _aligned_malloc(size, EIGEN_ALIGN_BYTES); #else result = handmade_aligned_malloc(size); #endif @@ -275,12 +275,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size) // implements _mm_malloc/_mm_free based on the corresponding _aligned_ // functions. This may not always be the case and we just try to be safe. #if defined(_MSC_VER) && (!defined(_WIN32_WCE)) && defined(_mm_free) - result = _aligned_realloc(ptr,new_size,16); + result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES); #else result = generic_aligned_realloc(ptr,new_size,old_size); #endif #elif defined(_MSC_VER) && (!defined(_WIN32_WCE)) - result = _aligned_realloc(ptr,new_size,16); + result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES); #else result = handmade_aligned_realloc(ptr,new_size,old_size); #endif @@ -552,7 +552,7 @@ template<typename T> struct smart_memmove_helper<T,false> { // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA // to the appropriate stack allocation function #ifndef EIGEN_ALLOCA - #if (defined __linux__) + #if (defined __linux__) || (defined __APPLE__) #define EIGEN_ALLOCA alloca #elif defined(_MSC_VER) #define EIGEN_ALLOCA _alloca @@ -607,9 +607,9 @@ template<typename T> class aligned_stack_memory_handler * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token. */ #ifdef EIGEN_ALLOCA - - #if defined(__arm__) || defined(_WIN32) - #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+16)) & ~(size_t(15))) + 16) + // The native alloca() that comes with llvm aligns buffer on 16 bytes even when AVX is enabled. +#if defined(__arm__) || defined(_WIN32) || EIGEN_ALIGN_BYTES > 16 + #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+EIGEN_ALIGN_BYTES)) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES) #else #define EIGEN_ALIGNED_ALLOCA EIGEN_ALLOCA #endif @@ -679,7 +679,7 @@ template<typename T> class aligned_stack_memory_handler #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true) #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%16==0))) + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_ALIGN_BYTES==0))) /****************************************************************************/ @@ -700,98 +700,42 @@ template<typename T> class aligned_stack_memory_handler * \sa \ref TopicStlContainers. 
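 * (Editorial illustration, not part of the original patch.) The typical use of this allocator,
 * as described in \ref TopicStlContainers, is with STL containers of fixed-size vectorizable
 * Eigen types, e.g.:
 * \code
 * std::vector<Eigen::Vector4f, Eigen::aligned_allocator<Eigen::Vector4f> > vec(10);
 * vec.push_back(Eigen::Vector4f::Zero());
 * \endcode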
*/ template<class T> -class aligned_allocator +class aligned_allocator : public std::allocator<T> { public: - typedef size_t size_type; - typedef std::ptrdiff_t difference_type; - typedef T* pointer; - typedef const T* const_pointer; - typedef T& reference; - typedef const T& const_reference; - typedef T value_type; - - template<class U> - struct rebind - { - typedef aligned_allocator<U> other; - }; - - pointer address( reference value ) const - { - return &value; - } - - const_pointer address( const_reference value ) const - { - return &value; - } - - aligned_allocator() - { - } - - aligned_allocator( const aligned_allocator& ) - { - } - - template<class U> - aligned_allocator( const aligned_allocator<U>& ) - { - } - - ~aligned_allocator() - { - } - - size_type max_size() const - { - return (std::numeric_limits<size_type>::max)(); - } - - pointer allocate( size_type num, const void* hint = 0 ) - { - EIGEN_UNUSED_VARIABLE(hint); - internal::check_size_for_overflow<T>(num); - return static_cast<pointer>( internal::aligned_malloc( num * sizeof(T) ) ); - } - - void construct( pointer p, const T& value ) - { - ::new( p ) T( value ); - } + typedef size_t size_type; + typedef std::ptrdiff_t difference_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef T value_type; + + template<class U> + struct rebind + { + typedef aligned_allocator<U> other; + }; -#if (__cplusplus >= 201103L) - template <typename U, typename... Args> - void construct( U* u, Args&&... args) - { - ::new( static_cast<void*>(u) ) U( std::forward<Args>( args )... ); - } -#endif + aligned_allocator() : std::allocator<T>() {} - void destroy( pointer p ) - { - p->~T(); - } + aligned_allocator(const aligned_allocator& other) : std::allocator<T>(other) {} -#if (__cplusplus >= 201103L) - template <typename U> - void destroy( U* u ) - { - u->~U(); - } -#endif + template<class U> + aligned_allocator(const aligned_allocator<U>& other) : std::allocator<T>(other) {} - void deallocate( pointer p, size_type /*num*/ ) - { - internal::aligned_free( p ); - } + ~aligned_allocator() {} - bool operator!=(const aligned_allocator<T>& ) const - { return false; } + pointer allocate(size_type num, const void* /*hint*/ = 0) + { + internal::check_size_for_overflow<T>(num); + return static_cast<pointer>( internal::aligned_malloc(num * sizeof(T)) ); + } - bool operator==(const aligned_allocator<T>& ) const - { return true; } + void deallocate(pointer p, size_type /*num*/) + { + internal::aligned_free(p); + } }; //---------- Cache sizes ---------- @@ -823,9 +767,9 @@ namespace internal { #ifdef EIGEN_CPUID -inline bool cpuid_is_vendor(int abcd[4], const char* vendor) +inline bool cpuid_is_vendor(int abcd[4], const int vendor[3]) { - return abcd[1]==(reinterpret_cast<const int*>(vendor))[0] && abcd[3]==(reinterpret_cast<const int*>(vendor))[1] && abcd[2]==(reinterpret_cast<const int*>(vendor))[2]; + return abcd[1]==vendor[0] && abcd[3]==vendor[1] && abcd[2]==vendor[2]; } inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3) @@ -967,13 +911,16 @@ inline void queryCacheSizes(int& l1, int& l2, int& l3) { #ifdef EIGEN_CPUID int abcd[4]; + const int GenuineIntel[] = {0x756e6547, 0x49656e69, 0x6c65746e}; + const int AuthenticAMD[] = {0x68747541, 0x69746e65, 0x444d4163}; + const int AMDisbetter_[] = {0x69444d41, 0x74656273, 0x21726574}; // "AMDisbetter!" 
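// Editorial note (not part of the original patch): each vendor id above is just the
// 12-character CPUID vendor string packed into the three 32-bit words that CPUID
// leaf 0 returns in EBX, EDX and ECX. On a little-endian target,
//   int w[3]; std::memcpy(w, "GenuineIntel", 12);  // w == {0x756e6547, 0x49656e69, 0x6c65746e}
// reproduces GenuineIntel[] exactly ("Genu", "ineI", "ntel"). Using plain int constants
// presumably avoids the reinterpret_cast of a string literal that the old
// const char* overload of cpuid_is_vendor relied on.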
// identify the CPU vendor EIGEN_CPUID(abcd,0x0,0); int max_std_funcs = abcd[1]; - if(cpuid_is_vendor(abcd,"GenuineIntel")) + if(cpuid_is_vendor(abcd,GenuineIntel)) queryCacheSizes_intel(l1,l2,l3,max_std_funcs); - else if(cpuid_is_vendor(abcd,"AuthenticAMD") || cpuid_is_vendor(abcd,"AMDisbetter!")) + else if(cpuid_is_vendor(abcd,AuthenticAMD) || cpuid_is_vendor(abcd,AMDisbetter_)) queryCacheSizes_amd(l1,l2,l3); else // by default let's use Intel's API diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 5407645c9..14adb6795 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -101,6 +101,7 @@ template<typename T> struct packet_traits; template<typename T> struct unpacket_traits { typedef T type; + typedef T half; enum {size=1}; }; @@ -137,7 +138,7 @@ class compute_matrix_flags ((Options&DontAlign)==0) && ( #if EIGEN_ALIGN_STATICALLY - ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % 16) == 0)) + ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) #else 0 #endif diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h index bf20e03ef..d2563d470 100644 --- a/Eigen/src/Eigenvalues/EigenSolver.h +++ b/Eigen/src/Eigenvalues/EigenSolver.h @@ -275,10 +275,11 @@ template<typename _MatrixType> class EigenSolver */ EigenSolver& compute(const MatrixType& matrix, bool computeEigenvectors = true); + /** \returns NumericalIssue if the input contains INF or NaN values or overflow occurred. Returns Success otherwise. */ ComputationInfo info() const { eigen_assert(m_isInitialized && "EigenSolver is not initialized."); - return m_realSchur.info(); + return m_info; } /** \brief Sets the maximum number of iterations allowed. */ @@ -302,6 +303,7 @@ template<typename _MatrixType> class EigenSolver EigenvalueType m_eivalues; bool m_isInitialized; bool m_eigenvectorsOk; + ComputationInfo m_info; RealSchur<MatrixType> m_realSchur; MatrixType m_matT; @@ -366,12 +368,16 @@ EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvect { using std::sqrt; using std::abs; + using std::max; + using numext::isfinite; eigen_assert(matrix.cols() == matrix.rows()); // Reduce to real Schur form.
m_realSchur.compute(matrix, computeEigenvectors); + + m_info = m_realSchur.info(); - if (m_realSchur.info() == Success) + if (m_info == Success) { m_matT = m_realSchur.matrixT(); if (computeEigenvectors) @@ -385,14 +391,40 @@ EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvect if (i == matrix.cols() - 1 || m_matT.coeff(i+1, i) == Scalar(0)) { m_eivalues.coeffRef(i) = m_matT.coeff(i, i); + if(!isfinite(m_eivalues.coeffRef(i))) + { + m_isInitialized = true; + m_eigenvectorsOk = false; + m_info = NumericalIssue; + return *this; + } ++i; } else { Scalar p = Scalar(0.5) * (m_matT.coeff(i, i) - m_matT.coeff(i+1, i+1)); - Scalar z = sqrt(abs(p * p + m_matT.coeff(i+1, i) * m_matT.coeff(i, i+1))); + Scalar z; + // Compute z = sqrt(abs(p * p + m_matT.coeff(i+1, i) * m_matT.coeff(i, i+1))); + // without overflow + { + Scalar t0 = m_matT.coeff(i+1, i); + Scalar t1 = m_matT.coeff(i, i+1); + Scalar maxval = (max)(abs(p),(max)(abs(t0),abs(t1))); + t0 /= maxval; + t1 /= maxval; + Scalar p0 = p/maxval; + z = maxval * sqrt(abs(p0 * p0 + t0 * t1)); + } + m_eivalues.coeffRef(i) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, z); m_eivalues.coeffRef(i+1) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, -z); + if(!(isfinite(m_eivalues.coeffRef(i)) && isfinite(m_eivalues.coeffRef(i+1)))) + { + m_isInitialized = true; + m_eigenvectorsOk = false; + m_info = NumericalIssue; + return *this; + } i += 2; } } @@ -581,7 +613,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors() } else { - eigen_assert(0 && "Internal bug in EigenSolver"); // this should not happen + eigen_assert(0 && "Internal bug in EigenSolver (INF or NaN has not been detected)"); // this should not happen } } diff --git a/Eigen/src/Geometry/AngleAxis.h b/Eigen/src/Geometry/AngleAxis.h index f424e6d7d..636712c2b 100644 --- a/Eigen/src/Geometry/AngleAxis.h +++ b/Eigen/src/Geometry/AngleAxis.h @@ -77,7 +77,9 @@ public: * represents an invalid rotation. */ template<typename Derived> inline AngleAxis(const Scalar& angle, const MatrixBase<Derived>& axis) : m_axis(axis), m_angle(angle) {} - /** Constructs and initialize the angle-axis rotation from a quaternion \a q. */ + /** Constructs and initialize the angle-axis rotation from a quaternion \a q. + * This function implicitly normalizes the quaternion \a q. + */ template<typename QuatDerived> inline explicit AngleAxis(const QuaternionBase<QuatDerived>& q) { *this = q; } /** Constructs and initialize the angle-axis rotation from a 3x3 rotation matrix. */ template<typename Derived> @@ -149,29 +151,27 @@ typedef AngleAxis<float> AngleAxisf; typedef AngleAxis<double> AngleAxisd; /** Set \c *this from a \b unit quaternion. - * The axis is normalized. + * The resulting axis is normalized. * - * \warning As any other method dealing with quaternion, if the input quaternion - * is not normalized then the result is undefined. + * This function implicitly normalizes the quaternion \a q. 
*/ template<typename Scalar> template<typename QuatDerived> AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q) { - using std::acos; - EIGEN_USING_STD_MATH(min); - EIGEN_USING_STD_MATH(max); - using std::sqrt; - Scalar n2 = q.vec().squaredNorm(); - if (n2 < NumTraits<Scalar>::dummy_precision()*NumTraits<Scalar>::dummy_precision()) + using std::atan2; + Scalar n = q.vec().norm(); + if(n<NumTraits<Scalar>::epsilon()) + n = q.vec().stableNorm(); + if (n > Scalar(0)) { - m_angle = 0; - m_axis << 1, 0, 0; + m_angle = Scalar(2)*atan2(n, q.w()); + m_axis = q.vec() / n; } else { - m_angle = Scalar(2)*acos((min)((max)(Scalar(-1),q.w()),Scalar(1))); - m_axis = q.vec() / sqrt(n2); + m_angle = 0; + m_axis << 1, 0, 0; } return *this; } diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h index a712692e5..11e5398d4 100644 --- a/Eigen/src/Geometry/Quaternion.h +++ b/Eigen/src/Geometry/Quaternion.h @@ -587,7 +587,7 @@ inline Derived& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Deri // which yields a singular value problem if (c < Scalar(-1)+NumTraits<Scalar>::dummy_precision()) { - c = max<Scalar>(c,-1); + c = (max)(c,Scalar(-1)); Matrix<Scalar,2,3> m; m << v0.transpose(), v1.transpose(); JacobiSVD<Matrix<Scalar,2,3> > svd(m, ComputeFullV); Vector3 axis = svd.matrixV().col(2); diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index f40644011..cb93acf6b 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h @@ -194,9 +194,9 @@ public: /** type of the matrix used to represent the linear part of the transformation */ typedef Matrix<Scalar,Dim,Dim,Options> LinearMatrixType; /** type of read/write reference to the linear part of the transformation */ - typedef Block<MatrixType,Dim,Dim,int(Mode)==(AffineCompact)> LinearPart; + typedef Block<MatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (Options&RowMajor)==0> LinearPart; /** type of read reference to the linear part of the transformation */ - typedef const Block<ConstMatrixType,Dim,Dim,int(Mode)==(AffineCompact)> ConstLinearPart; + typedef const Block<ConstMatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (Options&RowMajor)==0> ConstLinearPart; /** type of read/write reference to the affine part of the transformation */ typedef typename internal::conditional<int(Mode)==int(AffineCompact), MatrixType&, diff --git a/Eigen/src/Geometry/Umeyama.h b/Eigen/src/Geometry/Umeyama.h index 345b47e0c..5e20662f8 100644 --- a/Eigen/src/Geometry/Umeyama.h +++ b/Eigen/src/Geometry/Umeyama.h @@ -113,7 +113,7 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo const Index n = src.cols(); // number of measurements // required for demeaning ... - const RealScalar one_over_n = 1 / static_cast<RealScalar>(n); + const RealScalar one_over_n = RealScalar(1) / static_cast<RealScalar>(n); // computation of mean const VectorType src_mean = src.rowwise().sum() * one_over_n; @@ -136,16 +136,16 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo // Eq. (39) VectorType S = VectorType::Ones(m); - if (sigma.determinant()<0) S(m-1) = -1; + if (sigma.determinant()<Scalar(0)) S(m-1) = Scalar(-1); // Eq. 
(40) and (43) const VectorType& d = svd.singularValues(); Index rank = 0; for (Index i=0; i<m; ++i) if (!internal::isMuchSmallerThan(d.coeff(i),d.coeff(0))) ++rank; if (rank == m-1) { - if ( svd.matrixU().determinant() * svd.matrixV().determinant() > 0 ) { + if ( svd.matrixU().determinant() * svd.matrixV().determinant() > Scalar(0) ) { Rt.block(0,0,m,m).noalias() = svd.matrixU()*svd.matrixV().transpose(); } else { - const Scalar s = S(m-1); S(m-1) = -1; + const Scalar s = S(m-1); S(m-1) = Scalar(-1); Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose(); S(m-1) = s; } @@ -156,7 +156,7 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo if (with_scaling) { // Eq. (42) - const Scalar c = 1/src_var * svd.singularValues().dot(S); + const Scalar c = Scalar(1)/src_var * svd.singularValues().dot(S); // Eq. (41) Rt.col(m).head(m) = dst_mean; diff --git a/Eigen/src/Householder/BlockHouseholder.h b/Eigen/src/Householder/BlockHouseholder.h index 1991c6527..60dbea5f5 100644 --- a/Eigen/src/Householder/BlockHouseholder.h +++ b/Eigen/src/Householder/BlockHouseholder.h @@ -48,7 +48,7 @@ void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vec typedef typename MatrixType::Index Index; enum { TFactorSize = MatrixType::ColsAtCompileTime }; Index nbVecs = vectors.cols(); - Matrix<typename MatrixType::Scalar, TFactorSize, TFactorSize> T(nbVecs,nbVecs); + Matrix<typename MatrixType::Scalar, TFactorSize, TFactorSize, ColMajor> T(nbVecs,nbVecs); make_block_householder_triangular_factor(T, vectors, hCoeffs); const TriangularView<const VectorsType, UnitLower>& V(vectors); diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h index 73ca9bfde..1f3c060d0 100644 --- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h @@ -65,10 +65,10 @@ class DiagonalPreconditioner { typename MatType::InnerIterator it(mat,j); while(it && it.index()!=j) ++it; - if(it && it.index()==j) + if(it && it.index()==j && it.value()!=Scalar(0)) m_invdiag(j) = Scalar(1)/it.value(); else - m_invdiag(j) = 0; + m_invdiag(j) = Scalar(1); } m_isInitialized = true; return *this; diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 78cd9b90f..daf99e305 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -29,10 +29,11 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> > * * \param MatrixType the type of the matrix of which we are computing the LU decomposition * - * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A - * is decomposed as A = PLUQ where L is unit-lower-triangular, U is upper-triangular, and P and Q - * are permutation matrices. This is a rank-revealing LU decomposition. The eigenvalues (diagonal - * coefficients) of U are sorted in such a way that any zeros are at the end. + * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A is + * decomposed as \f$ A = P^{-1} L U Q^{-1} \f$ where L is unit-lower-triangular, U is + * upper-triangular, and P and Q are permutation matrices. This is a rank-revealing LU + * decomposition. The eigenvalues (diagonal coefficients) of U are sorted in such a way that any + * zeros are at the end. 
* * This decomposition provides the generic approach to solving systems of linear equations, computing * the rank, invertibility, inverse, kernel, and determinant. @@ -546,8 +547,8 @@ typename internal::traits<MatrixType>::Scalar FullPivLU<MatrixType>::determinant } /** \returns the matrix represented by the decomposition, - * i.e., it returns the product: P^{-1} L U Q^{-1}. - * This function is provided for debug purpose. */ + * i.e., it returns the product: \f$ P^{-1} L U Q^{-1} \f$. + * This function is provided for debug purposes. */ template<typename MatrixType> MatrixType FullPivLU<MatrixType>::reconstructedMatrix() const { diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h index b4da6531a..4e0609784 100644 --- a/Eigen/src/OrderingMethods/Ordering.h +++ b/Eigen/src/OrderingMethods/Ordering.h @@ -109,7 +109,7 @@ class NaturalOrdering * \class COLAMDOrdering * * Functor computing the \em column \em approximate \em minimum \em degree ordering - * The matrix should be in column-major format + * The matrix should be in column-major and \b compressed format (see SparseMatrix::makeCompressed()). */ template<typename Index> class COLAMDOrdering @@ -118,10 +118,14 @@ class COLAMDOrdering typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType; typedef Matrix<Index, Dynamic, 1> IndexVector; - /** Compute the permutation vector form a sparse matrix */ + /** Compute the permutation vector \a perm from the sparse matrix \a mat + * \warning The input sparse matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()). + */ template <typename MatrixType> void operator() (const MatrixType& mat, PermutationType& perm) { + eigen_assert(mat.isCompressed() && "COLAMDOrdering requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to COLAMDOrdering"); + Index m = mat.rows(); Index n = mat.cols(); Index nnz = mat.nonZeros(); diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index d78583ecc..a0c0c9172 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -415,6 +415,7 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q, JacobiRotation<RealScalar> *j_right) { using std::sqrt; + using std::abs; Matrix<RealScalar,2,2> m; m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)), numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q)); @@ -428,9 +429,11 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q, } else { - RealScalar u = d / t; - rot1.c() = RealScalar(1) / sqrt(RealScalar(1) + numext::abs2(u)); - rot1.s() = rot1.c() * u; + RealScalar t2d2 = numext::hypot(t,d); + rot1.c() = abs(t)/t2d2; + rot1.s() = d/t2d2; + if(t<RealScalar(0)) + rot1.s() = -rot1.s(); } m.applyOnTheLeft(0,1,rot1); j_right->makeJacobi(m,0,1); diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 6f615e250..5b95cc33f 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -338,7 +338,10 @@ const Block<const Derived,Dynamic,Dynamic,true> SparseMatrixBase<Derived>::inner namespace internal {
template< typename XprType, int BlockRows, int BlockCols, bool InnerPanel,
- bool OuterVector = (BlockCols==1 && XprType::IsRowMajor) || (BlockRows==1 && !XprType::IsRowMajor)>
+ bool OuterVector = (BlockCols==1 && XprType::IsRowMajor)
+ | // FIXME: use | instead of || to work around the spurious GCC 4.4.0 warning "suggest parentheses around &&".
+ // Revert to || once this workaround is no longer needed.
+ (BlockRows==1 && !XprType::IsRowMajor)>
class GenericSparseBlockInnerIteratorImpl;
}
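For readers puzzled by the FIXME in the hunk above: both operands of that default template argument are compile-time boolean constants, so the bitwise | yields exactly the same value as ||; the only practical difference is that GCC 4.4.0 does not emit its "suggest parentheses around &&" warning for the bitwise form. A minimal self-contained illustration of the equivalence (my own sketch, not code from the patch):

#include <cassert>

// Two metafunctions that differ only in the operator combining the two flags;
// with constant true/false operands they always produce the same value.
template<bool A, bool B> struct or_bitwise { enum { value = A | B  }; };
template<bool A, bool B> struct or_logical { enum { value = A || B }; };

int main()
{
  assert(int(or_bitwise<true, false>::value) == int(or_logical<true, false>::value));
  assert(int(or_bitwise<false,false>::value) == int(or_logical<false,false>::value));
  assert(int(or_bitwise<true, true >::value) == int(or_logical<true, true >::value));
  return 0;
}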
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index a25bd83eb..aa8bfc0f5 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -1205,7 +1205,7 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& Sparse m_data.value(p) = m_data.value(p-1); --p; } - eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exist, you must call coeffRef to this end"); + eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exists, you must call coeffRef to this end"); m_innerNonZeros[outer]++; diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h index 267c48bc3..5fb5bc203 100644 --- a/Eigen/src/SparseQR/SparseQR.h +++ b/Eigen/src/SparseQR/SparseQR.h @@ -58,6 +58,7 @@ namespace internal { * \tparam _OrderingType The fill-reducing ordering method. See the \link OrderingMethods_Module * OrderingMethods \endlink module for the list of built-in and external ordering methods. * + * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()). * */ template<typename _MatrixType, typename _OrderingType> @@ -77,10 +78,23 @@ class SparseQR SparseQR () : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false) { } + /** Construct a QR factorization of the matrix \a mat. + * + * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()). + * + * \sa compute() + */ SparseQR(const MatrixType& mat) : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false) { compute(mat); } + + /** Computes the QR factorization of the sparse matrix \a mat. + * + * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()). + * + * \sa analyzePattern(), factorize() + */ void compute(const MatrixType& mat) { analyzePattern(mat); @@ -256,6 +270,8 @@ class SparseQR /** \brief Preprocessing step of a QR factorization * + * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()). + * * In this step, the fill-reducing permutation is computed and applied to the columns of A * and the column elimination tree is computed as well. Only the sparsity pattern of \a mat is exploited. * @@ -264,6 +280,7 @@ class SparseQR template <typename MatrixType, typename OrderingType> void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat) { + eigen_assert(mat.isCompressed() && "SparseQR requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to SparseQR"); // Compute the column fill reducing ordering OrderingType ord; ord(mat, m_perm_c); diff --git a/Eigen/src/StlSupport/StdDeque.h b/Eigen/src/StlSupport/StdDeque.h index 4ee8e5c10..aaf66330b 100644 --- a/Eigen/src/StlSupport/StdDeque.h +++ b/Eigen/src/StlSupport/StdDeque.h @@ -11,7 +11,7 @@ #ifndef EIGEN_STDDEQUE_H #define EIGEN_STDDEQUE_H -#include "Eigen/src/StlSupport/details.h" +#include "details.h" // Define the explicit instantiation (e.g. 
necessary for the Intel compiler) #if defined(__INTEL_COMPILER) || defined(__GNUC__) diff --git a/Eigen/src/StlSupport/StdList.h b/Eigen/src/StlSupport/StdList.h index 627381ece..3c742430c 100644 --- a/Eigen/src/StlSupport/StdList.h +++ b/Eigen/src/StlSupport/StdList.h @@ -10,7 +10,7 @@ #ifndef EIGEN_STDLIST_H #define EIGEN_STDLIST_H -#include "Eigen/src/StlSupport/details.h" +#include "details.h" // Define the explicit instantiation (e.g. necessary for the Intel compiler) #if defined(__INTEL_COMPILER) || defined(__GNUC__) diff --git a/Eigen/src/StlSupport/StdVector.h b/Eigen/src/StlSupport/StdVector.h index 40a9abefa..611664a2e 100644 --- a/Eigen/src/StlSupport/StdVector.h +++ b/Eigen/src/StlSupport/StdVector.h @@ -11,7 +11,7 @@ #ifndef EIGEN_STDVECTOR_H #define EIGEN_STDVECTOR_H -#include "Eigen/src/StlSupport/details.h" +#include "details.h" /** * This section contains a convenience MACRO which allows an easy specialization of diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index aea3375ed..ce462e951 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -141,6 +141,18 @@ tan() const return derived(); } +/** \returns an expression of the coefficient-wise arc tan of *this. + * + * Example: \include Cwise_atan.cpp + * Output: \verbinclude Cwise_atan.out + * + * \sa cos(), sin(), tan() + */ +inline const CwiseUnaryOp<internal::scalar_atan_op<Scalar>, Derived> +atan() const +{ + return derived(); +} /** \returns an expression of the coefficient-wise power of *this to the given exponent. * |
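The last hunk above adds a coefficient-wise atan() to the array API, alongside the existing sin(), cos() and tan(). The Cwise_atan.cpp / Cwise_atan.out files referenced in the doxygen comment are the official example snippets; the short program below is only an illustrative stand-in showing the intended usage once this patch is applied.

#include <Eigen/Core>
#include <iostream>

int main()
{
  // atan() is applied to every coefficient of the array expression.
  Eigen::ArrayXd x = Eigen::ArrayXd::LinSpaced(5, -1.0, 1.0);
  std::cout << x.atan() << std::endl;
  return 0;
}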