diff options
Diffstat (limited to 'Eigen/src')
35 files changed, 1792 insertions, 1236 deletions
diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h index 1a54b2335..941e39938 100644 --- a/Eigen/src/Core/ArrayBase.h +++ b/Eigen/src/Core/ArrayBase.h @@ -178,7 +178,7 @@ template<typename OtherDerived> EIGEN_STRONG_INLINE Derived & ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other) { - SelfCwiseBinaryOp<ei_scalar_difference_op<Scalar>, Derived> tmp(derived()); + SelfCwiseBinaryOp<ei_scalar_difference_op<Scalar>, Derived, OtherDerived> tmp(derived()); tmp = other; return derived(); } @@ -192,7 +192,7 @@ template<typename OtherDerived> EIGEN_STRONG_INLINE Derived & ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other) { - SelfCwiseBinaryOp<ei_scalar_sum_op<Scalar>, Derived> tmp(derived()); + SelfCwiseBinaryOp<ei_scalar_sum_op<Scalar>, Derived, OtherDerived> tmp(derived()); tmp = other.derived(); return derived(); } @@ -206,7 +206,7 @@ template<typename OtherDerived> EIGEN_STRONG_INLINE Derived & ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other) { - SelfCwiseBinaryOp<ei_scalar_product_op<Scalar>, Derived> tmp(derived()); + SelfCwiseBinaryOp<ei_scalar_product_op<Scalar>, Derived, OtherDerived> tmp(derived()); tmp = other.derived(); return derived(); } @@ -220,7 +220,7 @@ template<typename OtherDerived> EIGEN_STRONG_INLINE Derived & ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other) { - SelfCwiseBinaryOp<ei_scalar_quotient_op<Scalar>, Derived> tmp(derived()); + SelfCwiseBinaryOp<ei_scalar_quotient_op<Scalar>, Derived, OtherDerived> tmp(derived()); tmp = other.derived(); return derived(); } diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h index 2a7ca4786..335a888f6 100644 --- a/Eigen/src/Core/Assign.h +++ b/Eigen/src/Core/Assign.h @@ -256,6 +256,12 @@ struct ei_assign_impl; *** Default traversal *** ************************/ +template<typename Derived1, typename Derived2, int Unrolling> +struct ei_assign_impl<Derived1, Derived2, InvalidTraversal, Unrolling> +{ + inline 
static void run(Derived1 &, const Derived2 &) { } +}; + template<typename Derived1, typename Derived2> struct ei_assign_impl<Derived1, Derived2, DefaultTraversal, NoUnrolling> { @@ -397,7 +403,12 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorizedTraversal, NoUnrolling EIGEN_STRONG_INLINE static void run(Derived1 &dst, const Derived2 &src) { const Index size = dst.size(); - const Index packetSize = ei_packet_traits<typename Derived1::Scalar>::size; + typedef ei_packet_traits<typename Derived1::Scalar> PacketTraits; + enum { + packetSize = PacketTraits::size, + dstAlignment = PacketTraits::AlignedOnScalar ? Aligned : int(ei_assign_traits<Derived1,Derived2>::DstIsAligned) , + srcAlignment = ei_assign_traits<Derived1,Derived2>::JointAlignment + }; const Index alignedStart = ei_assign_traits<Derived1,Derived2>::DstIsAligned ? 0 : ei_first_aligned(&dst.coeffRef(0), size); const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize; @@ -406,7 +417,7 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorizedTraversal, NoUnrolling for(Index index = alignedStart; index < alignedEnd; index += packetSize) { - dst.template copyPacket<Derived2, Aligned, ei_assign_traits<Derived1,Derived2>::JointAlignment>(index, src); + dst.template copyPacket<Derived2, dstAlignment, srcAlignment>(index, src); } ei_unaligned_assign_impl<>::run(src,dst,alignedEnd,size); @@ -438,12 +449,18 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorizedTraversal, NoUnrolling> typedef typename Derived1::Index Index; inline static void run(Derived1 &dst, const Derived2 &src) { - const Index packetSize = ei_packet_traits<typename Derived1::Scalar>::size; + typedef ei_packet_traits<typename Derived1::Scalar> PacketTraits; + enum { + packetSize = PacketTraits::size, + alignable = PacketTraits::AlignedOnScalar, + dstAlignment = alignable ? 
Aligned : int(ei_assign_traits<Derived1,Derived2>::DstIsAligned) , + srcAlignment = ei_assign_traits<Derived1,Derived2>::JointAlignment + }; const Index packetAlignedMask = packetSize - 1; const Index innerSize = dst.innerSize(); const Index outerSize = dst.outerSize(); - const Index alignedStep = (packetSize - dst.outerStride() % packetSize) & packetAlignedMask; - Index alignedStart = ei_assign_traits<Derived1,Derived2>::DstIsAligned ? 0 + const Index alignedStep = alignable ? (packetSize - dst.outerStride() % packetSize) & packetAlignedMask : 0; + Index alignedStart = ((!alignable) || ei_assign_traits<Derived1,Derived2>::DstIsAligned) ? 0 : ei_first_aligned(&dst.coeffRef(0,0), innerSize); for(Index outer = 0; outer < outerSize; ++outer) @@ -475,14 +492,21 @@ template<typename OtherDerived> EIGEN_STRONG_INLINE Derived& DenseBase<Derived> ::lazyAssign(const DenseBase<OtherDerived>& other) { + enum{ + SameType = ei_is_same_type<typename Derived::Scalar,typename OtherDerived::Scalar>::ret + }; + EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived,OtherDerived) - EIGEN_STATIC_ASSERT((ei_is_same_type<typename Derived::Scalar, typename OtherDerived::Scalar>::ret), - YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + EIGEN_STATIC_ASSERT(SameType,YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + + + #ifdef EIGEN_DEBUG_ASSIGN ei_assign_traits<Derived, OtherDerived>::debug(); #endif ei_assert(rows() == other.rows() && cols() == other.cols()); - ei_assign_impl<Derived, OtherDerived>::run(derived(),other.derived()); + ei_assign_impl<Derived, OtherDerived, int(SameType) ? 
int(ei_assign_traits<Derived, OtherDerived>::Traversal) + : int(InvalidTraversal)>::run(derived(),other.derived()); #ifndef EIGEN_NO_DEBUG checkTransposeAliasing(other.derived()); #endif diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h index 7f4587b53..171140c27 100644 --- a/Eigen/src/Core/CwiseBinaryOp.h +++ b/Eigen/src/Core/CwiseBinaryOp.h @@ -80,13 +80,14 @@ struct ei_traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > RhsCoeffReadCost = _RhsNested::CoeffReadCost, LhsFlags = _LhsNested::Flags, RhsFlags = _RhsNested::Flags, + SameType = ei_is_same_type<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::ret, StorageOrdersAgree = (int(Lhs::Flags)&RowMajorBit)==(int(Rhs::Flags)&RowMajorBit), Flags0 = (int(LhsFlags) | int(RhsFlags)) & ( HereditaryBits | (int(LhsFlags) & int(RhsFlags) & ( AlignedBit | (StorageOrdersAgree ? LinearAccessBit : 0) - | (ei_functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree ? PacketAccessBit : 0) + | (ei_functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0) ) ) ), @@ -95,6 +96,19 @@ struct ei_traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > }; }; +// we require Lhs and Rhs to have the same scalar type. Currently there is no example of a binary functor +// that would take two operands of different types. If there were such an example, then this check should be +// moved to the BinaryOp functors, on a per-case basis. This would however require a change in the BinaryOp functors, as +// currently they take only one typename Scalar template parameter. +// It is tempting to always allow mixing different types but remember that this is often impossible in the vectorized paths. +// So allowing mixing different types gives very unexpected errors when enabling vectorization, when the user tries to +// add together a float matrix and a double matrix. 
+#define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \ + EIGEN_STATIC_ASSERT((ei_functor_allows_mixing_real_and_complex<BINOP>::ret \ + ? int(ei_is_same_type<typename NumTraits<LHS>::Real, typename NumTraits<RHS>::Real>::ret) \ + : int(ei_is_same_type<LHS, RHS>::ret)), \ + YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + template<typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind> class CwiseBinaryOpImpl; @@ -121,17 +135,7 @@ class CwiseBinaryOp : ei_no_assignment_operator, EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& lhs, const Rhs& rhs, const BinaryOp& func = BinaryOp()) : m_lhs(lhs), m_rhs(rhs), m_functor(func) { - // we require Lhs and Rhs to have the same scalar type. Currently there is no example of a binary functor - // that would take two operands of different types. If there were such an example, then this check should be - // moved to the BinaryOp functors, on a per-case basis. This would however require a change in the BinaryOp functors, as - // currently they take only one typename Scalar template parameter. - // It is tempting to always allow mixing different types but remember that this is often impossible in the vectorized paths. - // So allowing mixing different types gives very unexpected errors when enabling vectorization, when the user tries to - // add together a float matrix and a double matrix. - EIGEN_STATIC_ASSERT((ei_functor_allows_mixing_real_and_complex<BinaryOp>::ret - ? 
int(ei_is_same_type<typename Lhs::RealScalar, typename Rhs::RealScalar>::ret) - : int(ei_is_same_type<typename Lhs::Scalar, typename Rhs::Scalar>::ret)), - YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename Rhs::Scalar); // require the sizes to match EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs, Rhs) ei_assert(lhs.rows() == rhs.rows() && lhs.cols() == rhs.cols()); @@ -211,7 +215,7 @@ template<typename OtherDerived> EIGEN_STRONG_INLINE Derived & MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other) { - SelfCwiseBinaryOp<ei_scalar_difference_op<Scalar>, Derived> tmp(derived()); + SelfCwiseBinaryOp<ei_scalar_difference_op<Scalar>, Derived, OtherDerived> tmp(derived()); tmp = other; return derived(); } @@ -225,7 +229,7 @@ template<typename OtherDerived> EIGEN_STRONG_INLINE Derived & MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other) { - SelfCwiseBinaryOp<ei_scalar_sum_op<Scalar>, Derived> tmp(derived()); + SelfCwiseBinaryOp<ei_scalar_sum_op<Scalar>, Derived, OtherDerived> tmp(derived()); tmp = other.derived(); return derived(); } diff --git a/Eigen/src/Core/DenseStorageBase.h b/Eigen/src/Core/DenseStorageBase.h index 9b4a645e6..e818d40a1 100644 --- a/Eigen/src/Core/DenseStorageBase.h +++ b/Eigen/src/Core/DenseStorageBase.h @@ -108,7 +108,7 @@ class DenseStorageBase : public ei_dense_xpr_base<Derived>::type template<int LoadMode> EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const { - return ei_ploadt<Scalar, LoadMode> + return ei_ploadt<PacketScalar, LoadMode> (m_storage.data() + (Flags & RowMajorBit ? 
col + row * m_storage.cols() : row + col * m_storage.rows())); @@ -117,7 +117,7 @@ class DenseStorageBase : public ei_dense_xpr_base<Derived>::type template<int LoadMode> EIGEN_STRONG_INLINE PacketScalar packet(Index index) const { - return ei_ploadt<Scalar, LoadMode>(m_storage.data() + index); + return ei_ploadt<PacketScalar, LoadMode>(m_storage.data() + index); } template<int StoreMode> diff --git a/Eigen/src/Core/DiagonalProduct.h b/Eigen/src/Core/DiagonalProduct.h index 610d13dc8..14c49e828 100644 --- a/Eigen/src/Core/DiagonalProduct.h +++ b/Eigen/src/Core/DiagonalProduct.h @@ -91,7 +91,7 @@ class DiagonalProduct : ei_no_assignment_operator, EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, ei_meta_true) const { return ei_pmul(m_matrix.template packet<LoadMode>(row, col), - ei_pset1(m_diagonal.diagonal().coeff(id))); + ei_pset1<PacketScalar>(m_diagonal.diagonal().coeff(id))); } template<int LoadMode> diff --git a/Eigen/src/Core/Functors.h b/Eigen/src/Core/Functors.h index c3a3a92c4..41ae4af42 100644 --- a/Eigen/src/Core/Functors.h +++ b/Eigen/src/Core/Functors.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. 
// -// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr> +// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr> // // Eigen is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public @@ -35,11 +35,11 @@ template<typename Scalar> struct ei_scalar_sum_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_sum_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a + b; } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return ei_padd(a,b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const { return ei_predux(a); } }; template<typename Scalar> @@ -55,21 +55,25 @@ struct ei_functor_traits<ei_scalar_sum_op<Scalar> > { * * \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux() */ -template<typename Scalar> struct ei_scalar_product_op { +template<typename LhsScalar,typename RhsScalar> struct ei_scalar_product_op { + enum { + Vectorizable = ei_is_same_type<LhsScalar,RhsScalar>::ret && ei_packet_traits<LhsScalar>::HasMul && ei_packet_traits<RhsScalar>::HasMul + }; + typedef typename ei_scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type; EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_product_op) - EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a * b; } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const + EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; } + template<typename Packet> + 
EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return ei_pmul(a,b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const { return ei_predux_mul(a); } }; -template<typename Scalar> -struct ei_functor_traits<ei_scalar_product_op<Scalar> > { +template<typename LhsScalar,typename RhsScalar> +struct ei_functor_traits<ei_scalar_product_op<LhsScalar,RhsScalar> > { enum { - Cost = NumTraits<Scalar>::MulCost, - PacketAccess = ei_packet_traits<Scalar>::HasMul + Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost)/2, // rough estimate! + PacketAccess = ei_scalar_product_op<LhsScalar,RhsScalar>::Vectorizable }; }; @@ -83,9 +87,9 @@ template<typename Scalar> struct ei_scalar_conj_product_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_conj_product_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return ei_conj_helper<Scalar,Scalar,Conj,false>().pmul(a,b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const - { return ei_conj_helper<PacketScalar,PacketScalar,Conj,false>().pmul(a,b); } + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return ei_conj_helper<Packet,Packet,Conj,false>().pmul(a,b); } }; template<typename Scalar> struct ei_functor_traits<ei_scalar_conj_product_op<Scalar> > { @@ -103,11 +107,11 @@ struct ei_functor_traits<ei_scalar_conj_product_op<Scalar> > { template<typename Scalar> struct ei_scalar_min_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_min_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return std::min(a, b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const 
PacketScalar& b) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return ei_pmin(a,b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const { return ei_predux_min(a); } }; template<typename Scalar> @@ -126,11 +130,11 @@ struct ei_functor_traits<ei_scalar_min_op<Scalar> > { template<typename Scalar> struct ei_scalar_max_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_max_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return std::max(a, b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return ei_pmax(a,b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const { return ei_predux_max(a); } }; template<typename Scalar> @@ -172,8 +176,8 @@ struct ei_functor_traits<ei_scalar_hypot_op<Scalar> > { template<typename Scalar> struct ei_scalar_difference_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_difference_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a - b; } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return ei_psub(a,b); } }; template<typename Scalar> @@ -192,8 +196,8 @@ struct ei_functor_traits<ei_scalar_difference_op<Scalar> > { template<typename Scalar> struct ei_scalar_quotient_op { 
EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_quotient_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a / b; } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return ei_pdiv(a,b); } }; template<typename Scalar> @@ -214,8 +218,8 @@ struct ei_functor_traits<ei_scalar_quotient_op<Scalar> > { template<typename Scalar> struct ei_scalar_opposite_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_opposite_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return -a; } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return ei_pnegate(a); } }; template<typename Scalar> @@ -234,8 +238,8 @@ template<typename Scalar> struct ei_scalar_abs_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_abs_op) typedef typename NumTraits<Scalar>::Real result_type; EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return ei_abs(a); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return ei_pabs(a); } }; template<typename Scalar> @@ -256,8 +260,8 @@ template<typename Scalar> struct ei_scalar_abs2_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_abs2_op) typedef typename NumTraits<Scalar>::Real result_type; EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return ei_abs2(a); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return 
ei_pmul(a,a); } }; template<typename Scalar> @@ -272,8 +276,8 @@ struct ei_functor_traits<ei_scalar_abs2_op<Scalar> > template<typename Scalar> struct ei_scalar_conjugate_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_conjugate_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return ei_conj(a); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const { return ei_pconj(a); } + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return ei_pconj(a); } }; template<typename Scalar> struct ei_functor_traits<ei_scalar_conjugate_op<Scalar> > @@ -397,22 +401,22 @@ struct ei_functor_traits<ei_scalar_log_op<Scalar> > * \sa class CwiseUnaryOp, MatrixBase::operator*, MatrixBase::operator/ */ /* NOTE why doing the ei_pset1() in packetOp *is* an optimization ? - * indeed it seems better to declare m_other as a PacketScalar and do the ei_pset1() once + * indeed it seems better to declare m_other as a Packet and do the ei_pset1() once * in the constructor. 
However, in practice: - * - GCC does not like m_other as a PacketScalar and generate a load every time it needs it + * - GCC does not like m_other as a Packet and generate a load every time it needs it * - on the other hand GCC is able to moves the ei_pset1() away the loop :) * - simpler code ;) * (ICC and gcc 4.4 seems to perform well in both cases, the issue is visible with y = a*x + b*y) */ template<typename Scalar> struct ei_scalar_multiple_op { - typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; // FIXME default copy constructors seems bugged with std::complex<> EIGEN_STRONG_INLINE ei_scalar_multiple_op(const ei_scalar_multiple_op& other) : m_other(other.m_other) { } EIGEN_STRONG_INLINE ei_scalar_multiple_op(const Scalar& other) : m_other(other) { } EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; } - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const - { return ei_pmul(a, ei_pset1(m_other)); } + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + { return ei_pmul(a, ei_pset1<Packet>(m_other)); } typename ei_makeconst<typename NumTraits<Scalar>::Nested>::type m_other; }; template<typename Scalar> @@ -433,13 +437,13 @@ struct ei_functor_traits<ei_scalar_multiple2_op<Scalar1,Scalar2> > template<typename Scalar, bool IsInteger> struct ei_scalar_quotient1_impl { - typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; // FIXME default copy constructors seems bugged with std::complex<> EIGEN_STRONG_INLINE ei_scalar_quotient1_impl(const ei_scalar_quotient1_impl& other) : m_other(other.m_other) { } EIGEN_STRONG_INLINE ei_scalar_quotient1_impl(const Scalar& other) : m_other(static_cast<Scalar>(1) / other) {} EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; } - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& 
a) const - { return ei_pmul(a, ei_pset1(m_other)); } + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + { return ei_pmul(a, ei_pset1<Packet>(m_other)); } const Scalar m_other; }; template<typename Scalar> @@ -480,13 +484,13 @@ struct ei_functor_traits<ei_scalar_quotient1_op<Scalar> > template<typename Scalar> struct ei_scalar_constant_op { - typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; EIGEN_STRONG_INLINE ei_scalar_constant_op(const ei_scalar_constant_op& other) : m_other(other.m_other) { } EIGEN_STRONG_INLINE ei_scalar_constant_op(const Scalar& other) : m_other(other) { } template<typename Index> EIGEN_STRONG_INLINE const Scalar operator() (Index, Index = 0) const { return m_other; } template<typename Index> - EIGEN_STRONG_INLINE const PacketScalar packetOp(Index, Index = 0) const { return ei_pset1(m_other); } + EIGEN_STRONG_INLINE const Packet packetOp(Index, Index = 0) const { return ei_pset1<Packet>(m_other); } const Scalar m_other; }; template<typename Scalar> @@ -513,22 +517,22 @@ template <typename Scalar, bool RandomAccess> struct ei_linspaced_op_impl; template <typename Scalar> struct ei_linspaced_op_impl<Scalar,false> { - typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; ei_linspaced_op_impl(Scalar low, Scalar step) : m_low(low), m_step(step), - m_packetStep(ei_pset1(ei_packet_traits<Scalar>::size*step)), - m_base(ei_padd(ei_pset1(low),ei_pmul(ei_pset1(step),ei_plset<Scalar>(-ei_packet_traits<Scalar>::size)))) {} + m_packetStep(ei_pset1<Packet>(ei_packet_traits<Scalar>::size*step)), + m_base(ei_padd(ei_pset1<Packet>(low),ei_pmul(ei_pset1<Packet>(step),ei_plset<Scalar>(-ei_packet_traits<Scalar>::size)))) {} template<typename Index> EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; } template<typename Index> - EIGEN_STRONG_INLINE const PacketScalar 
packetOp(Index) const { return m_base = ei_padd(m_base,m_packetStep); } + EIGEN_STRONG_INLINE const Packet packetOp(Index) const { return m_base = ei_padd(m_base,m_packetStep); } const Scalar m_low; const Scalar m_step; - const PacketScalar m_packetStep; - mutable PacketScalar m_base; + const Packet m_packetStep; + mutable Packet m_base; }; // random access for packet ops: @@ -537,23 +541,23 @@ struct ei_linspaced_op_impl<Scalar,false> template <typename Scalar> struct ei_linspaced_op_impl<Scalar,true> { - typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; ei_linspaced_op_impl(Scalar low, Scalar step) : m_low(low), m_step(step), - m_lowPacket(ei_pset1(m_low)), m_stepPacket(ei_pset1(m_step)), m_interPacket(ei_plset<Scalar>(0)) {} + m_lowPacket(ei_pset1<Packet>(m_low)), m_stepPacket(ei_pset1<Packet>(m_step)), m_interPacket(ei_plset<Scalar>(0)) {} template<typename Index> EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; } template<typename Index> - EIGEN_STRONG_INLINE const PacketScalar packetOp(Index i) const - { return ei_padd(m_lowPacket, ei_pmul(m_stepPacket, ei_padd(ei_pset1<Scalar>(i),m_interPacket))); } + EIGEN_STRONG_INLINE const Packet packetOp(Index i) const + { return ei_padd(m_lowPacket, ei_pmul(m_stepPacket, ei_padd(ei_pset1<Packet>(i),m_interPacket))); } const Scalar m_low; const Scalar m_step; - const PacketScalar m_lowPacket; - const PacketScalar m_stepPacket; - const PacketScalar m_interPacket; + const Packet m_lowPacket; + const Packet m_stepPacket; + const Packet m_interPacket; }; // ----- Linspace functor ---------------------------------------------------------------- @@ -566,12 +570,12 @@ template <typename Scalar, bool RandomAccess> struct ei_functor_traits< ei_linsp { enum { Cost = 1, PacketAccess = ei_packet_traits<Scalar>::HasSetLinear, IsRepeatable = true }; }; template <typename Scalar, bool RandomAccess> struct ei_linspaced_op { - 
typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; ei_linspaced_op(Scalar low, Scalar high, int num_steps) : impl(low, (high-low)/(num_steps-1)) {} template<typename Index> EIGEN_STRONG_INLINE const Scalar operator() (Index i, Index = 0) const { return impl(i); } template<typename Index> - EIGEN_STRONG_INLINE const PacketScalar packetOp(Index i, Index = 0) const { return impl.packetOp(i); } + EIGEN_STRONG_INLINE const Packet packetOp(Index i, Index = 0) const { return impl.packetOp(i); } // This proxy object handles the actual required temporaries, the different // implementations (random vs. sequential access) as well as the // correct piping to size 2/4 packet operations. @@ -581,13 +585,15 @@ template <typename Scalar, bool RandomAccess> struct ei_linspaced_op // all functors allow linear access, except ei_scalar_identity_op. So we fix here a quick meta // to indicate whether a functor allows linear access, just always answering 'yes' except for // ei_scalar_identity_op. +// FIXME move this to ei_functor_traits adding a ei_functor_default template<typename Functor> struct ei_functor_has_linear_access { enum { ret = 1 }; }; template<typename Scalar> struct ei_functor_has_linear_access<ei_scalar_identity_op<Scalar> > { enum { ret = 0 }; }; // in CwiseBinaryOp, we require the Lhs and Rhs to have the same scalar type, except for multiplication // where we only require them to have the same _real_ scalar type so one may multiply, say, float by complex<float>. 
+// FIXME move this to ei_functor_traits adding a ei_functor_default template<typename Functor> struct ei_functor_allows_mixing_real_and_complex { enum { ret = 0 }; }; -template<typename Scalar> struct ei_functor_allows_mixing_real_and_complex<ei_scalar_product_op<Scalar> > { enum { ret = 1 }; }; +template<typename LhsScalar,typename RhsScalar> struct ei_functor_allows_mixing_real_and_complex<ei_scalar_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; }; /** \internal @@ -597,13 +603,13 @@ template<typename Scalar> struct ei_functor_allows_mixing_real_and_complex<ei_sc /* If you wonder why doing the ei_pset1() in packetOp() is an optimization check ei_scalar_multiple_op */ template<typename Scalar> struct ei_scalar_add_op { - typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; // FIXME default copy constructors seems bugged with std::complex<> inline ei_scalar_add_op(const ei_scalar_add_op& other) : m_other(other.m_other) { } inline ei_scalar_add_op(const Scalar& other) : m_other(other) { } inline Scalar operator() (const Scalar& a) const { return a + m_other; } - inline const PacketScalar packetOp(const PacketScalar& a) const - { return ei_padd(a, ei_pset1(m_other)); } + inline const Packet packetOp(const Packet& a) const + { return ei_padd(a, ei_pset1<Packet>(m_other)); } const Scalar m_other; }; template<typename Scalar> @@ -690,9 +696,9 @@ template<typename Scalar> struct ei_scalar_inverse_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_inverse_op) inline Scalar operator() (const Scalar& a) const { return Scalar(1)/a; } - template<typename PacketScalar> - inline const PacketScalar packetOp(const PacketScalar& a) const - { return ei_pdiv(ei_pset1(Scalar(1)),a); } + template<typename Packet> + inline const Packet packetOp(const Packet& a) const + { return ei_pdiv(ei_pset1<Packet>(Scalar(1)),a); } }; template<typename Scalar> struct ei_functor_traits<ei_scalar_inverse_op<Scalar> > @@ -706,8 +712,8 @@ 
template<typename Scalar> struct ei_scalar_square_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_square_op) inline Scalar operator() (const Scalar& a) const { return a*a; } - template<typename PacketScalar> - inline const PacketScalar packetOp(const PacketScalar& a) const + template<typename Packet> + inline const Packet packetOp(const Packet& a) const { return ei_pmul(a,a); } }; template<typename Scalar> @@ -722,8 +728,8 @@ template<typename Scalar> struct ei_scalar_cube_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_cube_op) inline Scalar operator() (const Scalar& a) const { return a*a*a; } - template<typename PacketScalar> - inline const PacketScalar packetOp(const PacketScalar& a) const + template<typename Packet> + inline const Packet packetOp(const Packet& a) const { return ei_pmul(a,ei_pmul(a,a)); } }; template<typename Scalar> diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 77b1d748e..8ace18174 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -84,7 +84,8 @@ template<typename T> struct ei_packet_traits : ei_default_packet_traits typedef T type; enum { Vectorizable = 0, - size = 1 + size = 1, + AlignedOnScalar = 0 }; enum { HasAdd = 0, @@ -159,16 +160,20 @@ template<typename Packet> inline Packet ei_pandnot(const Packet& a, const Packet& b) { return a & (!b); } /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ -template<typename Scalar> inline typename ei_packet_traits<Scalar>::type -ei_pload(const Scalar* from) { return *from; } +template<typename Packet> inline Packet +ei_pload(const typename ei_unpacket_traits<Packet>::type* from) { return *from; } /** \internal \returns a packet version of \a *from, (un-aligned load) */ -template<typename Scalar> inline typename ei_packet_traits<Scalar>::type -ei_ploadu(const Scalar* from) { return *from; } +template<typename Packet> inline Packet +ei_ploadu(const typename ei_unpacket_traits<Packet>::type* from) { 
return *from; } + +/** \internal \returns a packet with elements of \a *from duplicated, e.g.: (from[0],from[0],from[1],from[1]) */ +template<typename Packet> inline Packet +ei_ploaddup(const typename ei_unpacket_traits<Packet>::type* from) { return *from; } /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ -template<typename Scalar> inline typename ei_packet_traits<Scalar>::type -ei_pset1(const Scalar& a) { return a; } +template<typename Packet> inline Packet +ei_pset1(const typename ei_unpacket_traits<Packet>::type& a) { return a; } /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */ template<typename Scalar> inline typename ei_packet_traits<Scalar>::type @@ -255,13 +260,13 @@ ei_pmadd(const Packet& a, /** \internal \returns a packet version of \a *from. * \If LoadMode equals Aligned, \a from must be 16 bytes aligned */ -template<typename Scalar, int LoadMode> -inline typename ei_packet_traits<Scalar>::type ei_ploadt(const Scalar* from) +template<typename Packet, int LoadMode> +inline Packet ei_ploadt(const typename ei_unpacket_traits<Packet>::type* from) { if(LoadMode == Aligned) - return ei_pload(from); + return ei_pload<Packet>(from); else - return ei_ploadu(from); + return ei_ploadu<Packet>(from); } /** \internal copy the packet \a from to \a *to. 
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index 6af7bb793..740f2b247 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -124,14 +124,14 @@ template<typename Derived> class MapBase template<int LoadMode> inline PacketScalar packet(Index row, Index col) const { - return ei_ploadt<Scalar, LoadMode> + return ei_ploadt<PacketScalar, LoadMode> (m_data + (col * colStride() + row * rowStride())); } template<int LoadMode> inline PacketScalar packet(Index index) const { - return ei_ploadt<Scalar, LoadMode>(m_data + index * innerStride()); + return ei_ploadt<PacketScalar, LoadMode>(m_data + index * innerStride()); } template<int StoreMode> diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 139132c6b..25b48517d 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -282,10 +282,13 @@ class GeneralProduct<Lhs, Rhs, GemvProduct> public: EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct) + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) { - EIGEN_STATIC_ASSERT((ei_is_same_type<typename Lhs::Scalar, typename Rhs::Scalar>::ret), - YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) +// EIGEN_STATIC_ASSERT((ei_is_same_type<typename Lhs::Scalar, typename Rhs::Scalar>::ret), +// YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) } enum { Side = Lhs::IsVectorAtCompileTime ? 
OnTheLeft : OnTheRight }; @@ -319,43 +322,66 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true> template<typename ProductType, typename Dest> static void run(const ProductType& prod, Dest& dest, typename ProductType::Scalar alpha) { - typedef typename ProductType::Scalar Scalar; + typedef typename ProductType::Index Index; + typedef typename ProductType::LhsScalar LhsScalar; + typedef typename ProductType::RhsScalar RhsScalar; + typedef typename ProductType::Scalar ResScalar; + typedef typename ProductType::RealScalar RealScalar; typedef typename ProductType::ActualLhsType ActualLhsType; typedef typename ProductType::ActualRhsType ActualRhsType; typedef typename ProductType::LhsBlasTraits LhsBlasTraits; typedef typename ProductType::RhsBlasTraits RhsBlasTraits; - typedef Map<Matrix<Scalar,Dynamic,1>, Aligned> MappedDest; + typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest; ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs()); ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs()); - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs()) - * RhsBlasTraits::extractScalarFactor(prod.rhs()); + ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs()) + * RhsBlasTraits::extractScalarFactor(prod.rhs()); enum { // FIXME find a way to allow an inner stride on the result if ei_packet_traits<Scalar>::size==1 - EvalToDest = Dest::InnerStrideAtCompileTime==1 + EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1, + ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex) }; - Scalar* EIGEN_RESTRICT actualDest; - if (EvalToDest) + bool alphaIsCompatible = (!ComplexByReal) || (ei_imag(actualAlpha)==RealScalar(0)); + bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; + + RhsScalar compatibleAlpha = ei_get_factor<ResScalar,RhsScalar>::run(actualAlpha); + + ResScalar* actualDest; + if (evalToDest) + { actualDest = &dest.coeffRef(0); + } else { - 
actualDest = ei_aligned_stack_new(Scalar,dest.size()); - MappedDest(actualDest, dest.size()) = dest; + actualDest = ei_aligned_stack_new(ResScalar,dest.size()); + if(!alphaIsCompatible) + { + MappedDest(actualDest, dest.size()).setZero(); + compatibleAlpha = RhsScalar(1); + } + else + MappedDest(actualDest, dest.size()) = dest; } - ei_cache_friendly_product_colmajor_times_vector - <LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate>( - dest.size(), - &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(), - actualRhs, actualDest, actualAlpha); + ei_general_matrix_vector_product + <Index,LhsScalar,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run( + actualLhs.rows(), actualLhs.cols(), + &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(), + actualRhs.data(), actualRhs.innerStride(), + actualDest, 1, + compatibleAlpha); - if (!EvalToDest) + if (!evalToDest) { - dest = MappedDest(actualDest, dest.size()); - ei_aligned_stack_delete(Scalar, actualDest, dest.size()); + if(!alphaIsCompatible) + dest += actualAlpha * MappedDest(actualDest, dest.size()); + else + dest = MappedDest(actualDest, dest.size()); + ei_aligned_stack_delete(ResScalar, actualDest, dest.size()); } } }; @@ -365,7 +391,9 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,true> template<typename ProductType, typename Dest> static void run(const ProductType& prod, Dest& dest, typename ProductType::Scalar alpha) { - typedef typename ProductType::Scalar Scalar; + typedef typename ProductType::LhsScalar LhsScalar; + typedef typename ProductType::RhsScalar RhsScalar; + typedef typename ProductType::Scalar ResScalar; typedef typename ProductType::Index Index; typedef typename ProductType::ActualLhsType ActualLhsType; typedef typename ProductType::ActualRhsType ActualRhsType; @@ -376,34 +404,34 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,true> ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs()); 
ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs()); - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs()) - * RhsBlasTraits::extractScalarFactor(prod.rhs()); + ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs()) + * RhsBlasTraits::extractScalarFactor(prod.rhs()); enum { // FIXME I think here we really have to check for ei_packet_traits<Scalar>::size==1 // because in this case it is fine to have an inner stride - DirectlyUseRhs = ((ei_packet_traits<Scalar>::size==1) || (_ActualRhsType::Flags&ActualPacketAccessBit)) + DirectlyUseRhs = ((ei_packet_traits<RhsScalar>::size==1) || (_ActualRhsType::Flags&ActualPacketAccessBit)) && (!(_ActualRhsType::Flags & RowMajorBit)) }; - Scalar* EIGEN_RESTRICT rhs_data; + RhsScalar* rhs_data; if (DirectlyUseRhs) - rhs_data = reinterpret_cast<Scalar* EIGEN_RESTRICT>(&actualRhs.const_cast_derived().coeffRef(0)); + rhs_data = &actualRhs.const_cast_derived().coeffRef(0); else { - rhs_data = ei_aligned_stack_new(Scalar, actualRhs.size()); - Map<typename _ActualRhsType::PlainObject>(reinterpret_cast<Scalar*>(rhs_data), actualRhs.size()) = actualRhs; + rhs_data = ei_aligned_stack_new(RhsScalar, actualRhs.size()); + Map<typename _ActualRhsType::PlainObject>(rhs_data, actualRhs.size()) = actualRhs; } - ei_cache_friendly_product_rowmajor_times_vector - <LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate, Scalar, Index>( + ei_general_matrix_vector_product + <Index,LhsScalar,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run( actualLhs.rows(), actualLhs.cols(), &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(), rhs_data, 1, &dest.coeffRef(0,0), dest.innerStride(), actualAlpha); - if (!DirectlyUseRhs) ei_aligned_stack_delete(Scalar, rhs_data, prod.rhs().size()); + if (!DirectlyUseRhs) ei_aligned_stack_delete(RhsScalar, rhs_data, prod.rhs().size()); } }; diff --git a/Eigen/src/Core/SelfCwiseBinaryOp.h 
b/Eigen/src/Core/SelfCwiseBinaryOp.h index acdbb8658..de8c4e740 100644 --- a/Eigen/src/Core/SelfCwiseBinaryOp.h +++ b/Eigen/src/Core/SelfCwiseBinaryOp.h @@ -39,14 +39,19 @@ * * \sa class SwapWrapper for a similar trick. */ -template<typename BinaryOp, typename MatrixType> -struct ei_traits<SelfCwiseBinaryOp<BinaryOp,MatrixType> > : ei_traits<MatrixType> +template<typename BinaryOp, typename Lhs, typename Rhs> +struct ei_traits<SelfCwiseBinaryOp<BinaryOp,Lhs,Rhs> > + : ei_traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> > { - + enum { + Flags = ei_traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >::Flags | (Lhs::Flags&DirectAccessBit), + OuterStrideAtCompileTime = Lhs::OuterStrideAtCompileTime, + InnerStrideAtCompileTime = Lhs::InnerStrideAtCompileTime + }; }; -template<typename BinaryOp, typename MatrixType> class SelfCwiseBinaryOp - : public ei_dense_xpr_base< SelfCwiseBinaryOp<BinaryOp, MatrixType> >::type +template<typename BinaryOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp + : public ei_dense_xpr_base< SelfCwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type { public: @@ -57,7 +62,7 @@ template<typename BinaryOp, typename MatrixType> class SelfCwiseBinaryOp using Base::operator=; - inline SelfCwiseBinaryOp(MatrixType& xpr, const BinaryOp& func = BinaryOp()) : m_matrix(xpr), m_functor(func) {} + inline SelfCwiseBinaryOp(Lhs& xpr, const BinaryOp& func = BinaryOp()) : m_matrix(xpr), m_functor(func) {} inline Index rows() const { return m_matrix.rows(); } inline Index cols() const { return m_matrix.cols(); } @@ -122,12 +127,8 @@ template<typename BinaryOp, typename MatrixType> class SelfCwiseBinaryOp template<typename RhsDerived> EIGEN_STRONG_INLINE SelfCwiseBinaryOp& lazyAssign(const DenseBase<RhsDerived>& rhs) { - EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(MatrixType,RhsDerived) - - EIGEN_STATIC_ASSERT((ei_functor_allows_mixing_real_and_complex<BinaryOp>::ret - ? 
int(ei_is_same_type<typename MatrixType::RealScalar, typename RhsDerived::RealScalar>::ret) - : int(ei_is_same_type<typename MatrixType::Scalar, typename RhsDerived::Scalar>::ret)), - YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs,RhsDerived) + EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename RhsDerived::Scalar); #ifdef EIGEN_DEBUG_ASSIGN ei_assign_traits<SelfCwiseBinaryOp, RhsDerived>::debug(); @@ -141,7 +142,7 @@ template<typename BinaryOp, typename MatrixType> class SelfCwiseBinaryOp } protected: - MatrixType& m_matrix; + Lhs& m_matrix; const BinaryOp& m_functor; private: @@ -151,8 +152,8 @@ template<typename BinaryOp, typename MatrixType> class SelfCwiseBinaryOp template<typename Derived> inline Derived& DenseBase<Derived>::operator*=(const Scalar& other) { - SelfCwiseBinaryOp<ei_scalar_product_op<Scalar>, Derived> tmp(derived()); typedef typename Derived::PlainObject PlainObject; + SelfCwiseBinaryOp<ei_scalar_product_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived()); tmp = PlainObject::Constant(rows(),cols(),other); return derived(); } @@ -160,10 +161,11 @@ inline Derived& DenseBase<Derived>::operator*=(const Scalar& other) template<typename Derived> inline Derived& DenseBase<Derived>::operator/=(const Scalar& other) { - SelfCwiseBinaryOp<typename ei_meta_if<NumTraits<Scalar>::IsInteger, + typedef typename ei_meta_if<NumTraits<Scalar>::IsInteger, ei_scalar_quotient_op<Scalar>, - ei_scalar_product_op<Scalar> >::ret, Derived> tmp(derived()); + ei_scalar_product_op<Scalar> >::ret BinOp; typedef typename Derived::PlainObject PlainObject; + SelfCwiseBinaryOp<BinOp, Derived, typename PlainObject::ConstantReturnType> tmp(derived()); tmp = PlainObject::Constant(rows(),cols(), NumTraits<Scalar>::IsInteger ? 
other : Scalar(1)/other); return derived(); } diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h index 90ce2a802..960da31f3 100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h @@ -53,7 +53,8 @@ struct ei_triangular_solver_selector; template<typename Lhs, typename Rhs, int Mode> struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,RowMajor,1> { - typedef typename Rhs::Scalar Scalar; + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; typedef ei_blas_traits<Lhs> LhsProductTraits; typedef typename LhsProductTraits::ExtractType ActualLhsType; typedef typename Lhs::Index Index; @@ -81,12 +82,12 @@ struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,RowMajor Index startRow = IsLower ? pi : pi-actualPanelWidth; Index startCol = IsLower ? 0 : pi; - ei_cache_friendly_product_rowmajor_times_vector<LhsProductTraits::NeedToConjugate,false,Scalar,Index>( + ei_general_matrix_vector_product<Index,LhsScalar,RowMajor,LhsProductTraits::NeedToConjugate,RhsScalar,false>::run( actualPanelWidth, r, &(actualLhs.const_cast_derived().coeffRef(startRow,startCol)), actualLhs.outerStride(), &(other.coeffRef(startCol)), other.innerStride(), &other.coeffRef(startRow), other.innerStride(), - Scalar(-1)); + RhsScalar(-1)); } for(Index k=0; k<actualPanelWidth; ++k) @@ -107,13 +108,12 @@ struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,RowMajor template<typename Lhs, typename Rhs, int Mode> struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,ColMajor,1> { - typedef typename Rhs::Scalar Scalar; - typedef typename ei_packet_traits<Scalar>::type Packet; + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; typedef ei_blas_traits<Lhs> LhsProductTraits; typedef typename LhsProductTraits::ExtractType ActualLhsType; typedef typename Lhs::Index Index; enum { - PacketSize = 
ei_packet_traits<Scalar>::size, IsLower = ((Mode&Lower)==Lower) }; @@ -148,12 +148,11 @@ struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,ColMajor // let's directly call the low level product function because: // 1 - it is faster to compile // 2 - it is slighlty faster at runtime - ei_cache_friendly_product_colmajor_times_vector<LhsProductTraits::NeedToConjugate,false>( - r, - &(actualLhs.const_cast_derived().coeffRef(endBlock,startBlock)), actualLhs.outerStride(), - other.segment(startBlock, actualPanelWidth), - &(other.coeffRef(endBlock, 0)), - Scalar(-1)); + ei_general_matrix_vector_product<Index,LhsScalar,ColMajor,LhsProductTraits::NeedToConjugate,RhsScalar,false>::run( + r, actualPanelWidth, + &(actualLhs.const_cast_derived().coeffRef(endBlock,startBlock)), actualLhs.outerStride(), + &other.coeff(startBlock), other.innerStride(), + &(other.coeffRef(endBlock, 0)), other.innerStride(), RhsScalar(-1)); } } } diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index 3a84b84ff..de8ae0a5b 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -303,11 +303,11 @@ inline void MatrixBase<Derived>::adjointInPlace() // The following is to detect aliasing problems in most common cases. 
-template<typename BinOp,typename NestedXpr> -struct ei_blas_traits<SelfCwiseBinaryOp<BinOp,NestedXpr> > +template<typename BinOp,typename NestedXpr,typename Rhs> +struct ei_blas_traits<SelfCwiseBinaryOp<BinOp,NestedXpr,Rhs> > : ei_blas_traits<NestedXpr> { - typedef SelfCwiseBinaryOp<BinOp,NestedXpr> XprType; + typedef SelfCwiseBinaryOp<BinOp,NestedXpr,Rhs> XprType; static inline const XprType extract(const XprType& x) { return x; } }; diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 2dba95a2f..ecada02f4 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -63,7 +63,7 @@ template<> struct ei_packet_traits<std::complex<float> > : ei_default_packet_tr template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; }; -template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>& from) +template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<Packet2cf>(const std::complex<float>& from) { Packet2cf res; /* On AltiVec we cannot load 64-bit registers, so wa have to take care of alignment */ diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index ad694150a..8205beae5 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -59,13 +59,13 @@ typedef __vector unsigned char Packet16uc; Packet4i ei_p4i_##NAME = vec_splat_s32(X) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ - Packet4f ei_p4f_##NAME = ei_pset1<float>(X) + Packet4f ei_p4f_##NAME = ei_pset1<Packet4f>(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ Packet4f ei_p4f_##NAME = vreinterpretq_f32_u32(ei_pset1<int>(X)) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ - Packet4i ei_p4i_##NAME = ei_pset1<int>(X) + Packet4i ei_p4i_##NAME = ei_pset1<Packet4i>(X) #define DST_CHAN 1 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) @@ 
-89,6 +89,7 @@ template<> struct ei_packet_traits<float> : ei_default_packet_traits typedef Packet4f type; enum { Vectorizable = 1, + AlignedOnScalar = 1, size=4, // FIXME check the Has* @@ -105,6 +106,7 @@ template<> struct ei_packet_traits<int> : ei_default_packet_traits enum { // FIXME check the Has* Vectorizable = 1, + AlignedOnScalar = 1, size=4 }; }; @@ -156,7 +158,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packetbi & v) return s; } */ -template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) { // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html float EIGEN_ALIGN16 af[4]; af[0] = from; @@ -165,7 +167,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { return vc; } -template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { +template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<Packet4i>(const int& from) { int EIGEN_ALIGN16 ai[4]; ai[0] = from; Packet4i vc = vec_ld(0, ai); @@ -173,8 +175,8 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { return vc; } -template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return vec_add(ei_pset1(a), ei_p4f_COUNTDOWN); } -template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return vec_add(ei_pset1(a), ei_p4i_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return vec_add(ei_pset1<Packet4f>(a), ei_p4f_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return vec_add(ei_pset1<Packet4i>(a), ei_p4i_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); } template<> EIGEN_STRONG_INLINE Packet4i ei_padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); } @@ -239,7 +241,7 @@ template<> EIGEN_STRONG_INLINE Packet4f 
ei_pdiv<Packet4f>(const Packet4f& a, con template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) { ei_assert(false && "packet integer division are not supported by AltiVec"); - return ei_pset1<int>(0); + return ei_pset1<Packet4i>(0); } // for some weird raisons, it has to be overloaded for packet of integers @@ -265,10 +267,10 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pxor<Packet4i>(const Packet4i& a, con template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } -template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } +template<> EIGEN_STRONG_INLINE Packet4f ei_pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } -template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) +template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html @@ -280,7 +282,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data } -template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) +template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 
bf68a2bbb..9678040e7 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -58,7 +58,7 @@ template<> struct ei_packet_traits<std::complex<float> > : ei_default_packet_tr template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; }; -template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>& from) +template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<Packet2cf>(const std::complex<float>& from) { float32x2_t r64; r64 = vld1_f32((float *)&from); @@ -141,6 +141,11 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_preverse(const Packet2cf& a) return Packet2cf(a_r128); } +EIGEN_STRONG_INLINE Packet2cf ei_pcplxflip/*<Packet2cf>*/(const Packet2cf& x) +{ + return Packet2cf(vrev64q_f32(x.v)); +} + template<> EIGEN_STRONG_INLINE std::complex<float> ei_predux<Packet2cf>(const Packet2cf& a) { float32x2_t a1, a2; diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 66a6a30ee..8220ed07c 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -45,13 +45,13 @@ typedef float32x4_t Packet4f; typedef int32x4_t Packet4i; #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ - const Packet4f ei_p4f_##NAME = ei_pset1<float>(X) + const Packet4f ei_p4f_##NAME = ei_pset1<Packet4f>(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ const Packet4f ei_p4f_##NAME = vreinterpretq_f32_u32(ei_pset1<int>(X)) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ - const Packet4i ei_p4i_##NAME = ei_pset1<int>(X) + const Packet4i ei_p4i_##NAME = ei_pset1<Packet4i>(X) #ifndef __pld #define __pld(x) asm volatile ( " pld [%[addr]]\n" :: [addr] "r" (x) : "cc" ); @@ -62,6 +62,7 @@ template<> struct ei_packet_traits<float> : ei_default_packet_traits typedef Packet4f type; enum { Vectorizable = 1, + AlignedOnScalar = 1, size = 4, HasDiv = 1, @@ -78,6 +79,7 @@ template<> struct ei_packet_traits<int> :
ei_default_packet_traits typedef Packet4i type; enum { Vectorizable = 1, + AlignedOnScalar = 1, size=4 // FIXME check the Has* }; @@ -86,18 +88,18 @@ template<> struct ei_packet_traits<int> : ei_default_packet_traits template<> struct ei_unpacket_traits<Packet4f> { typedef float type; enum {size=4}; }; template<> struct ei_unpacket_traits<Packet4i> { typedef int type; enum {size=4}; }; -template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { return vdupq_n_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { return vdupq_n_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); } template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { Packet4f countdown = { 3, 2, 1, 0 }; - return vaddq_f32(ei_pset1(a), countdown); + return vaddq_f32(ei_pset1<Packet4f>(a), countdown); } template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { Packet4i countdown = { 3, 2, 1, 0 }; - return vaddq_s32(ei_pset1(a), countdown); + return vaddq_s32(ei_pset1<Packet4i>(a), countdown); } template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); } @@ -135,7 +137,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, con } template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) { ei_assert(false && "packet integer division are not supported by NEON"); - return ei_pset1<int>(0); + return ei_pset1<Packet4i>(0); } // for some weird raisons, it has to be overloaded for packet of integers @@ -178,6 +180,21 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIG template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } 
template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f ei_ploaddup<Packet4f>(const float* from) +{ + float32x2_t lo, hi; + lo = vdup_n_f32(*from); + hi = vdup_n_f32(*(from+1)); + return vcombine_f32(lo, hi); +} +template<> EIGEN_STRONG_INLINE Packet4i ei_ploaddup<Packet4i>(const int* from) +{ + int32x2_t lo, hi; + lo = vdup_n_s32(*from); + hi = vdup_n_s32(*(from+1)); + return vcombine_s32(lo, hi); +} + template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } @@ -193,25 +210,21 @@ template<> EIGEN_STRONG_INLINE int ei_pfirst<Packet4i>(const Packet4i& a) { i template<> EIGEN_STRONG_INLINE Packet4f ei_preverse(const Packet4f& a) { float32x2_t a_lo, a_hi; - Packet4f a_r64, a_r128; + Packet4f a_r64; a_r64 = vrev64q_f32(a); a_lo = vget_low_f32(a_r64); a_hi = vget_high_f32(a_r64); - a_r128 = vcombine_f32(a_hi, a_lo); - - return a_r128; + return vcombine_f32(a_hi, a_lo); } template<> EIGEN_STRONG_INLINE Packet4i ei_preverse(const Packet4i& a) { int32x2_t a_lo, a_hi; - Packet4i a_r64, a_r128; + Packet4i a_r64; a_r64 = vrev64q_s32(a); a_lo = vget_low_s32(a_r64); a_hi = vget_high_s32(a_r64); - a_r128 = vcombine_s32(a_hi, a_lo); - - return a_r128; + return vcombine_s32(a_hi, a_lo); } template<> EIGEN_STRONG_INLINE Packet4f ei_pabs(const Packet4f& a) { return vabsq_f32(a); } template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a) { return vabsq_s32(a); } diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 6c91386c6..585630563 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -38,6 +38,7 @@ template<> struct ei_packet_traits<std::complex<float> > : ei_default_packet_tr typedef
Packet2cf type; enum { Vectorizable = 1, + AlignedOnScalar = 1, size = 2, HasAdd = 1, @@ -55,13 +56,6 @@ template<> struct ei_packet_traits<std::complex<float> > : ei_default_packet_tr template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; }; -template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>& from) -{ - Packet2cf res; - res.v = _mm_loadl_pi(res.v, (const __m64*)&from); - return Packet2cf(_mm_movelh_ps(res.v,res.v)); -} - template<> EIGEN_STRONG_INLINE Packet2cf ei_padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf ei_psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf ei_pnegate(const Packet2cf& a) @@ -79,9 +73,12 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_pmul<Packet2cf>(const Packet2cf& a, { // TODO optimize it for SSE3 and 4 #ifdef EIGEN_VECTORIZE_SSE3 - return Packet2cf(_mm_addsub_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), - _mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3), + return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v), + _mm_mul_ps(_mm_movehdup_ps(a.v), ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)))); +// return Packet2cf(_mm_addsub_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), +// _mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3), +// ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)))); #else const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000)); return Packet2cf(_mm_add_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), @@ -95,14 +92,21 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_por <Packet2cf>(const Packet2cf& template<> EIGEN_STRONG_INLINE Packet2cf ei_pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf ei_pandnot<Packet2cf>(const 
Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf ei_pload <std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(_mm_load_ps((const float*)from)); } -template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu<std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ei_ploadu((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(ei_pload<Packet4f>(&ei_real_ref(*from))); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ei_ploadu<Packet4f>(&ei_real_ref(*from))); } -template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps((float*)to, from.v); } -template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstore(&ei_real_ref(*to), from.v); } +template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu(&ei_real_ref(*to), from.v); } template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<Packet2cf>(const std::complex<float>& from) +{ + Packet2cf res; + res.v = _mm_loadl_pi(res.v, (const __m64*)&from); + return Packet2cf(_mm_movelh_ps(res.v,res.v)); +} + template<> EIGEN_STRONG_INLINE std::complex<float> 
ei_pfirst<Packet2cf>(const Packet2cf& a) { std::complex<float> res; @@ -194,6 +198,24 @@ template<> struct ei_conj_helper<Packet2cf, Packet2cf, true,true> } }; +template<> struct ei_conj_helper<Packet4f, Packet2cf, false,false> +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const + { return ei_padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const + { return Packet2cf(ei_pmul(x, y.v)); } +}; + +template<> struct ei_conj_helper<Packet2cf, Packet4f, false,false> +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const + { return ei_padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const + { return Packet2cf(ei_pmul(x.v, y)); } +}; + template<> EIGEN_STRONG_INLINE Packet2cf ei_pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for SSE3 and 4 @@ -202,6 +224,12 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_pdiv<Packet2cf>(const Packet2cf& a, return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); } +EIGEN_STRONG_INLINE Packet2cf ei_pcplxflip/*<Packet2cf>*/(const Packet2cf& x) +{ + return Packet2cf(ei_vec4f_swizzle1(x.v, 1, 0, 3, 2)); +} + + //---------- double ---------- struct Packet1cd { @@ -215,6 +243,7 @@ template<> struct ei_packet_traits<std::complex<double> > : ei_default_packet_t typedef Packet1cd type; enum { Vectorizable = 1, + AlignedOnScalar = 0, size = 1, HasAdd = 1, @@ -261,23 +290,25 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_por <Packet1cd>(const Packet1cd& template<> EIGEN_STRONG_INLINE Packet1cd ei_pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd ei_pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); } -// FIXME force 
unaligned load, this is a temporary fix -template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1<std::complex<double> >(const std::complex<double>& from) -{ /* here we really have to use unaligned loads :( */ return ei_ploadu(&from); } +// FIXME force unaligned load, this is a temporary fix +template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <Packet1cd>(const std::complex<double>* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_pload<Packet2d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<Packet1cd>(const std::complex<double>* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu<Packet2d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1<Packet1cd>(const std::complex<double>& from) +{ /* here we really have to use unaligned loads :( */ return ei_ploadu<Packet1cd>(&from); } // FIXME force unaligned store, this is a temporary fix -template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstoreu((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstore((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } 
template<> EIGEN_STRONG_INLINE std::complex<double> ei_pfirst<Packet1cd>(const Packet1cd& a) { - EIGEN_ALIGN16 std::complex<double> res; - ei_pstore(&res, a); - return res; + EIGEN_ALIGN16 double res[2]; + _mm_store_pd(res, a.v); + return *(std::complex<double>*)res; } template<> EIGEN_STRONG_INLINE Packet1cd ei_preverse(const Packet1cd& a) { return a; } @@ -361,6 +392,24 @@ template<> struct ei_conj_helper<Packet1cd, Packet1cd, true,true> } }; +template<> struct ei_conj_helper<Packet2d, Packet1cd, false,false> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const + { return ei_padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const + { return Packet1cd(ei_pmul(x, y.v)); } +}; + +template<> struct ei_conj_helper<Packet1cd, Packet2d, false,false> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const + { return ei_padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const + { return Packet1cd(ei_pmul(x.v, y)); } +}; + template<> EIGEN_STRONG_INLINE Packet1cd ei_pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for SSE3 and 4 @@ -369,4 +418,9 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_pdiv<Packet1cd>(const Packet1cd& a, return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); } +EIGEN_STRONG_INLINE Packet1cd ei_pcplxflip/*<Packet1cd>*/(const Packet1cd& x) +{ + return Packet1cd(ei_preverse(x.v)); +} + #endif // EIGEN_COMPLEX_SSE_H diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 742bfa92f..e4ca82985 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -378,14 +378,14 @@ Packet4f ei_pcos<Packet4f>(const Packet4f& _x) template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f 
ei_psqrt<Packet4f>(const Packet4f& _x) { - Packet4f half = ei_pmul(_x, ei_pset1(.5f)); - - /* select only the inverse sqrt of non-zero inputs */ - Packet4f non_zero_mask = _mm_cmpgt_ps(_x, ei_pset1(std::numeric_limits<float>::epsilon())); - Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x)); - - x = ei_pmul(x, ei_psub(ei_pset1(1.5f), ei_pmul(half, ei_pmul(x,x)))); - return ei_pmul(_x,x); + Packet4f half = ei_pmul(_x, ei_pset1<Packet4f>(.5f)); + + /* select only the inverse sqrt of non-zero inputs */ + Packet4f non_zero_mask = _mm_cmpgt_ps(_x, ei_pset1<Packet4f>(std::numeric_limits<float>::epsilon())); + Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x)); + + x = ei_pmul(x, ei_psub(ei_pset1<Packet4f>(1.5f), ei_pmul(half, ei_pmul(x,x)))); + return ei_pmul(_x,x); } #endif // EIGEN_MATH_FUNCTIONS_SSE_H diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index dc28bdb56..8c441810c 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -45,7 +45,7 @@ template<> struct ei_is_arithmetic<__m128d> { enum { ret = true }; }; #define ei_vec2d_swizzle1(v,p,q) \ (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2))))) - + #define ei_vec4f_swizzle2(a,b,p,q,r,s) \ (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p)))) @@ -53,13 +53,13 @@ template<> struct ei_is_arithmetic<__m128d> { enum { ret = true }; }; (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p)))))) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ - const Packet4f ei_p4f_##NAME = ei_pset1<float>(X) + const Packet4f ei_p4f_##NAME = ei_pset1<Packet4f>(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f ei_p4f_##NAME = _mm_castsi128_ps(ei_pset1<int>(X)) + const Packet4f ei_p4f_##NAME = _mm_castsi128_ps(ei_pset1<Packet4i>(X)) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ - const Packet4i ei_p4i_##NAME = 
ei_pset1<int>(X) + const Packet4i ei_p4i_##NAME = ei_pset1<Packet4i>(X) template<> struct ei_packet_traits<float> : ei_default_packet_traits @@ -67,6 +67,7 @@ template<> struct ei_packet_traits<float> : ei_default_packet_traits typedef Packet4f type; enum { Vectorizable = 1, + AlignedOnScalar = 1, size=4, HasDiv = 1, @@ -82,6 +83,7 @@ template<> struct ei_packet_traits<double> : ei_default_packet_traits typedef Packet2d type; enum { Vectorizable = 1, + AlignedOnScalar = 1, size=2, HasDiv = 1 @@ -93,6 +95,7 @@ template<> struct ei_packet_traits<int> : ei_default_packet_traits enum { // FIXME check the Has* Vectorizable = 1, + AlignedOnScalar = 1, size=4 }; }; @@ -104,27 +107,24 @@ template<> struct ei_unpacket_traits<Packet4i> { typedef int type; enum {size #ifdef __GNUC__ // Sometimes GCC implements _mm_set1_p* using multiple moves, // that is inefficient :( (e.g., see ei_gemm_pack_rhs) -template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) { Packet4f res = _mm_set_ss(from); return ei_vec4f_swizzle1(res,0,0,0,0); } -template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double& from) { -#ifdef EIGEN_VECTORIZE_SSE3 - return _mm_loaddup_pd(&from); -#else +template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<Packet2d>(const double& from) { + // NOTE the SSE3 intrinsic _mm_loaddup_pd is never faster but sometimes much slower Packet2d res = _mm_set_sd(from); return ei_vec2d_swizzle1(res, 0, 0); -#endif } #else -template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { return _mm_set1_ps(from); } -template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double& from) { return _mm_set1_pd(from); } +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) { return _mm_set1_ps(from); } +template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); } #endif -template<> 
EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { return _mm_set1_epi32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); } -template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return _mm_add_ps(ei_pset1(a), _mm_set_ps(3,2,1,0)); } -template<> EIGEN_STRONG_INLINE Packet2d ei_plset<double>(const double& a) { return _mm_add_pd(ei_pset1(a),_mm_set_pd(1,0)); } -template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return _mm_add_epi32(ei_pset1(a),_mm_set_epi32(3,2,1,0)); } +template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return _mm_add_ps(ei_pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); } +template<> EIGEN_STRONG_INLINE Packet2d ei_plset<double>(const double& a) { return _mm_add_pd(ei_pset1<Packet2d>(a),_mm_set_pd(1,0)); } +template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return _mm_add_epi32(ei_pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d ei_padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); } @@ -171,7 +171,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, con template<> EIGEN_STRONG_INLINE Packet2d ei_pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) { ei_assert(false && "packet integer division are not supported by SSE"); - return ei_pset1<int>(0); + return ei_pset1<Packet4i>(0); } // for some weird raisons, it has to be overloaded for packet of integers @@ -211,14 +211,14 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a, template<> EIGEN_STRONG_INLINE Packet2d ei_pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { 
return _mm_andnot_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } -template<> EIGEN_STRONG_INLINE Packet2d ei_pload<double>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } -template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); } +template<> EIGEN_STRONG_INLINE Packet4f ei_pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } +template<> EIGEN_STRONG_INLINE Packet2d ei_pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); } #if defined(_MSC_VER) - template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); } - template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<double>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } - template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); } + template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); } + template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } + template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); } #else // Fast unaligned loads. 
Note that here we cannot directly use intrinsics: this would // require pointer casting to incompatible pointer types and leads to invalid code @@ -226,7 +226,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_D // a correct instruction dependency. // TODO: do the same for MSVC (ICC is compatible) // NOTE: with the code below, MSVC's compiler crashes! -template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) +template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD __m128d res; @@ -234,7 +234,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) res = _mm_loadh_pd(res, (const double*)(from+2)) ; return _mm_castpd_ps(res); } -template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from) +template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD __m128d res; @@ -242,7 +242,7 @@ template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from) res = _mm_loadh_pd(res,from+1); return res; } -template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) +template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD __m128d res; @@ -252,6 +252,19 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) } #endif +template<> EIGEN_STRONG_INLINE Packet4f ei_ploaddup<Packet4f>(const float* from) +{ + return ei_vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd((const double*)from)), 0, 0, 1, 1); +} +template<> EIGEN_STRONG_INLINE Packet2d ei_ploaddup<Packet2d>(const double* from) +{ return ei_pset1<Packet2d>(from[0]); } +template<> EIGEN_STRONG_INLINE Packet4i ei_ploaddup<Packet4i>(const int* from) +{ + Packet4i tmp; + tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from)); + return ei_vec4i_swizzle1(tmp, 0, 0, 1, 1); +} + template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE 
_mm_store_ps(to, from); } template<> EIGEN_STRONG_INLINE void ei_pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); } diff --git a/Eigen/src/Core/products/CoeffBasedProduct.h b/Eigen/src/Core/products/CoeffBasedProduct.h index 1474bc1bb..c79a34de0 100644 --- a/Eigen/src/Core/products/CoeffBasedProduct.h +++ b/Eigen/src/Core/products/CoeffBasedProduct.h @@ -42,7 +42,7 @@ template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar> struct ei_product_coeff_impl; -template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> +template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode> struct ei_product_packet_impl; template<typename LhsNested, typename RhsNested, int NestingFlags> @@ -73,6 +73,8 @@ struct ei_traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> > LhsRowMajor = LhsFlags & RowMajorBit, RhsRowMajor = RhsFlags & RowMajorBit, + SameType = ei_is_same_type<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::ret, + CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime == Dynamic || ( (ColsAtCompileTime % ei_packet_traits<Scalar>::size) == 0 @@ -94,7 +96,8 @@ struct ei_traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> > Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit) | (EvalToRowMajor ? RowMajorBit : 0) | NestingFlags - | (CanVectorizeLhs || CanVectorizeRhs ? PacketAccessBit : 0), + // TODO enable vectorization for mixed types + | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0), CoeffReadCost = InnerSize == Dynamic ? 
Dynamic : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost) @@ -105,7 +108,8 @@ struct ei_traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> > * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI. */ - CanVectorizeInner = LhsRowMajor + CanVectorizeInner = SameType + && LhsRowMajor && (!RhsRowMajor) && (LhsFlags & RhsFlags & ActualPacketAccessBit) && (LhsFlags & RhsFlags & AlignedBit) @@ -275,20 +279,20 @@ struct ei_product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar> *** Scalar path with inner vectorization *** *******************************************/ -template<int UnrollingIndex, typename Lhs, typename Rhs, typename PacketScalar> +template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet> struct ei_product_coeff_vectorized_unroller { typedef typename Lhs::Index Index; enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size }; EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) { - ei_product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres); + ei_product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres); pres = ei_padd(pres, ei_pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) )); } }; -template<typename Lhs, typename Rhs, typename PacketScalar> -struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, PacketScalar> +template<typename Lhs, typename Rhs, typename Packet> +struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet> { typedef typename Lhs::Index Index; EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) 
@@ -300,13 +304,13 @@ struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, PacketScalar> template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar> struct ei_product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar> { - typedef typename Lhs::PacketScalar PacketScalar; + typedef typename Lhs::PacketScalar Packet; typedef typename Lhs::Index Index; enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size }; EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) { - PacketScalar pres; - ei_product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres); + Packet pres; + ei_product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres); ei_product_coeff_impl<DefaultTraversal,UnrollingIndex,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, res); res = ei_predux(pres); } @@ -368,71 +372,71 @@ struct ei_product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetSca *** Packet path *** *******************/ -template<int UnrollingIndex, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, PacketScalar, LoadMode> +template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct ei_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode> { typedef typename Lhs::Index Index; - EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) { - ei_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res); - res = ei_pmadd(ei_pset1(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res); + 
ei_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res); + res = ei_pmadd(ei_pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res); } }; -template<int UnrollingIndex, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, PacketScalar, LoadMode> +template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct ei_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode> { typedef typename Lhs::Index Index; - EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) { - ei_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res); - res = ei_pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), ei_pset1(rhs.coeff(UnrollingIndex, col)), res); + ei_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res); + res = ei_pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), ei_pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res); } }; -template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<RowMajor, 0, Lhs, Rhs, PacketScalar, LoadMode> +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct ei_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode> { typedef typename Lhs::Index Index; - EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) { - res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); + res = 
ei_pmul(ei_pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); } }; -template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<ColMajor, 0, Lhs, Rhs, PacketScalar, LoadMode> +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct ei_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode> { typedef typename Lhs::Index Index; - EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) { - res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col))); + res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1<Packet>(rhs.coeff(0, col))); } }; -template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode> +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct ei_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> { typedef typename Lhs::Index Index; - EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res) + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res) { ei_assert(lhs.cols()>0 && "you are using a non initialized matrix"); - res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); + res = ei_pmul(ei_pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); for(Index i = 1; i < lhs.cols(); ++i) - res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res); + res = ei_pmadd(ei_pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res); } }; -template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, 
PacketScalar, LoadMode> +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> { typedef typename Lhs::Index Index; - EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res) + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res) { ei_assert(lhs.cols()>0 && "you are using a non initialized matrix"); - res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col))); + res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1<Packet>(rhs.coeff(0, col))); for(Index i = 1; i < lhs.cols(); ++i) - res = ei_pmadd(lhs.template packet<LoadMode>(row, i), ei_pset1(rhs.coeff(i, col)), res); + res = ei_pmadd(lhs.template packet<LoadMode>(row, i), ei_pset1<Packet>(rhs.coeff(i, col)), res); } }; diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index c8eeeff4d..7e2d496fe 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -25,6 +25,9 @@ #ifndef EIGEN_GENERAL_BLOCK_PANEL_H #define EIGEN_GENERAL_BLOCK_PANEL_H +template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false> +class ei_gebp_traits; + /** \internal */ inline void ei_manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0) { @@ -97,7 +100,7 @@ inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2) * for matrix products and related algorithms. The blocking sizes depends on various * parameters: * - the L1 and L2 cache sizes, - * - the register level blocking sizes defined by ei_product_blocking_traits, + * - the register level blocking sizes defined by ei_gebp_traits, * - the number of scalars that fit into a packet (when vectorization is enabled). 
* * \sa setCpuCacheSizes */ @@ -109,14 +112,15 @@ void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrd // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed // per kc x nr vertical small panels where nr is the blocking size along the n dimension // at the register level. For vectorization purpose, these small vertical panels are unpacked, - // i.e., each coefficient is replicated to fit a packet. This small vertical panel has to + // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to // stay in L1 cache. std::ptrdiff_t l1, l2; + typedef ei_gebp_traits<LhsScalar,RhsScalar> Traits; enum { - kdiv = KcFactor * 2 * ei_product_blocking_traits<RhsScalar>::nr - * ei_packet_traits<RhsScalar>::size * sizeof(RhsScalar), - mr = ei_product_blocking_traits<LhsScalar>::mr, + kdiv = KcFactor * 2 * Traits::nr + * Traits::RhsProgress * sizeof(RhsScalar), + mr = ei_gebp_traits<LhsScalar,RhsScalar>::mr, mr_mask = (0xffffffff/mr)*mr }; @@ -136,112 +140,475 @@ inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, st #ifdef EIGEN_HAS_FUSE_CJMADD #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); #else - #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = ei_padd(C,T); + + // FIXME (a bit overkill maybe ?) 
+ + template<typename CJ, typename A, typename B, typename C, typename T> struct ei_gebp_madd_selector { + EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/) + { + c = cj.pmadd(a,b,c); + } + }; + + template<typename CJ, typename T> struct ei_gebp_madd_selector<CJ,T,T,T,T> { + EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, T& a, T& b, T& c, T& t) + { + t = b; t = cj.pmul(a,t); c = ei_padd(c,t); + } + }; + + template<typename CJ, typename A, typename B, typename C, typename T> + EIGEN_STRONG_INLINE void ei_gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t) + { + ei_gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t); + } + + #define MADD(CJ,A,B,C,T) ei_gebp_madd(CJ,A,B,C,T); +// #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = ei_padd(C,T); #endif -// optimized GEneral packed Block * packed Panel product kernel -template<typename Scalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +/* Vectorization logic + * real*real: unpack rhs to constant packets, ... + * + * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i), + * storing each res packet into two packets (2x2), + * at the end combine them: swap the second and addsub them + * cf*cf : same but with 2x4 blocks + * cplx*real : unpack rhs to constant packets, ... + * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual + */ +template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs> +class ei_gebp_traits +{ +public: + typedef _LhsScalar LhsScalar; + typedef _RhsScalar RhsScalar; + typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + + enum { + ConjLhs = _ConjLhs, + ConjRhs = _ConjRhs, + Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable, + LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1, + RhsPacketSize = Vectorizable ? 
ei_packet_traits<RhsScalar>::size : 1, + ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1, + + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + + // register block size along the N direction (must be either 2 or 4) + nr = NumberOfRegisters/4, + + // register block size along the M direction (currently, this one cannot be modified) + mr = 2 * LhsPacketSize, + + WorkSpaceFactor = nr * RhsPacketSize, + + LhsProgress = LhsPacketSize, + RhsProgress = RhsPacketSize + }; + + typedef typename ei_packet_traits<LhsScalar>::type _LhsPacket; + typedef typename ei_packet_traits<RhsScalar>::type _RhsPacket; + typedef typename ei_packet_traits<ResScalar>::type _ResPacket; + + typedef typename ei_meta_if<Vectorizable,_LhsPacket,LhsScalar>::ret LhsPacket; + typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket; + typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket; + + typedef ResPacket AccPacket; + + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) + { + p = ei_pset1<ResPacket>(ResScalar(0)); + } + + EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) + { + for(DenseIndex k=0; k<n; k++) + ei_pstore(&b[k*RhsPacketSize], ei_pset1<RhsPacket>(rhs[k])); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = ei_pload<RhsPacket>(b); + } + + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + { + dest = ei_pload<LhsPacket>(a); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const + { + tmp = b; tmp = ei_pmul(a,tmp); c = ei_padd(c,tmp); + } + + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + { + r = ei_pmadd(c,alpha,r); + } + +protected: +// ei_conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj; +// ei_conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj; +}; + +template<typename RealScalar, bool 
_ConjLhs> +class ei_gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false> +{ +public: + typedef std::complex<RealScalar> LhsScalar; + typedef RealScalar RhsScalar; + typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + + enum { + ConjLhs = _ConjLhs, + ConjRhs = false, + Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable, + LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1, + RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1, + ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1, + + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + nr = NumberOfRegisters/4, + mr = 2 * LhsPacketSize, + WorkSpaceFactor = nr*RhsPacketSize, + + LhsProgress = LhsPacketSize, + RhsProgress = RhsPacketSize + }; + + typedef typename ei_packet_traits<LhsScalar>::type _LhsPacket; + typedef typename ei_packet_traits<RhsScalar>::type _RhsPacket; + typedef typename ei_packet_traits<ResScalar>::type _ResPacket; + + typedef typename ei_meta_if<Vectorizable,_LhsPacket,LhsScalar>::ret LhsPacket; + typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket; + typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket; + + typedef ResPacket AccPacket; + + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) + { + p = ei_pset1<ResPacket>(ResScalar(0)); + } + + EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) + { + for(DenseIndex k=0; k<n; k++) + ei_pstore(&b[k*RhsPacketSize], ei_pset1<RhsPacket>(rhs[k])); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = ei_pload<RhsPacket>(b); + } + + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + { + dest = ei_pload<LhsPacket>(a); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + { + 
madd_impl(a, b, c, tmp, typename ei_meta_if<Vectorizable,ei_meta_true,ei_meta_false>::ret()); + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const ei_meta_true&) const + { + tmp = b; tmp = ei_pmul(a.v,tmp); c.v = ei_padd(c.v,tmp); + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const ei_meta_false&) const + { + c += a * b; + } + + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + { + r = cj.pmadd(c,alpha,r); + } + +protected: + ei_conj_helper<ResPacket,ResPacket,ConjLhs,false> cj; +}; + +template<typename RealScalar, bool _ConjLhs, bool _ConjRhs> +class ei_gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs > +{ +public: + typedef std::complex<RealScalar> Scalar; + typedef std::complex<RealScalar> LhsScalar; + typedef std::complex<RealScalar> RhsScalar; + typedef std::complex<RealScalar> ResScalar; + + enum { + ConjLhs = _ConjLhs, + ConjRhs = _ConjRhs, + Vectorizable = ei_packet_traits<RealScalar>::Vectorizable + && ei_packet_traits<Scalar>::Vectorizable, + RealPacketSize = Vectorizable ? ei_packet_traits<RealScalar>::size : 1, + ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1, + + nr = 2, + mr = 2 * ResPacketSize, + WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, + + LhsProgress = ResPacketSize, + RhsProgress = Vectorizable ? 
2*ResPacketSize : 1 + }; + + typedef typename ei_packet_traits<RealScalar>::type RealPacket; + typedef typename ei_packet_traits<Scalar>::type ScalarPacket; + struct DoublePacket + { + RealPacket first; + RealPacket second; + }; + + typedef typename ei_meta_if<Vectorizable,RealPacket, Scalar>::ret LhsPacket; + typedef typename ei_meta_if<Vectorizable,DoublePacket,Scalar>::ret RhsPacket; + typedef typename ei_meta_if<Vectorizable,ScalarPacket,Scalar>::ret ResPacket; + typedef typename ei_meta_if<Vectorizable,DoublePacket,Scalar>::ret AccPacket; + + EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } + + EIGEN_STRONG_INLINE void initAcc(DoublePacket& p) + { + p.first = ei_pset1<RealPacket>(RealScalar(0)); + p.second = ei_pset1<RealPacket>(RealScalar(0)); + } + + /* Unpack the rhs coeff such that each complex coefficient is spread into + * two packects containing respectively the real and imaginary coefficient + * duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y] + */ + EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b) + { + for(DenseIndex k=0; k<n; k++) + { + if(Vectorizable) + { + ei_pstore((RealScalar*)&b[k*ResPacketSize*2+0], ei_pset1<RealPacket>(ei_real(rhs[k]))); + ei_pstore((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], ei_pset1<RealPacket>(ei_imag(rhs[k]))); + } + else + b[k] = rhs[k]; + } + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const + { + dest.first = ei_pload<RealPacket>((const RealScalar*)b); + dest.second = ei_pload<RealPacket>((const RealScalar*)(b+ResPacketSize)); + } + + // nothing special here + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + { + dest = ei_pload<LhsPacket>((const typename ei_unpacket_traits<LhsPacket>::type*)(a)); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, 
RhsPacket& /*tmp*/) const + { + c.first = ei_padd(ei_pmul(a,b.first), c.first); + c.second = ei_padd(ei_pmul(a,b.second),c.second); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const + { + c = cj.pmadd(a,b,c); + } + + EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } + + EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const + { + // assemble c + ResPacket tmp; + if((!ConjLhs)&&(!ConjRhs)) + { + tmp = ei_pcplxflip(ei_pconj(ResPacket(c.second))); + tmp = ei_padd(ResPacket(c.first),tmp); + } + else if((!ConjLhs)&&(ConjRhs)) + { + tmp = ei_pconj(ei_pcplxflip(ResPacket(c.second))); + tmp = ei_padd(ResPacket(c.first),tmp); + } + else if((ConjLhs)&&(!ConjRhs)) + { + tmp = ei_pcplxflip(ResPacket(c.second)); + tmp = ei_padd(ei_pconj(ResPacket(c.first)),tmp); + } + else if((ConjLhs)&&(ConjRhs)) + { + tmp = ei_pcplxflip(ResPacket(c.second)); + tmp = ei_psub(ei_pconj(ResPacket(c.first)),tmp); + } + + r = ei_pmadd(tmp,alpha,r); + } + +protected: + ei_conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj; +}; + +template<typename RealScalar, bool _ConjRhs> +class ei_gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs > +{ +public: + typedef std::complex<RealScalar> Scalar; + typedef RealScalar LhsScalar; + typedef Scalar RhsScalar; + typedef Scalar ResScalar; + + enum { + ConjLhs = false, + ConjRhs = _ConjRhs, + Vectorizable = ei_packet_traits<RealScalar>::Vectorizable + && ei_packet_traits<Scalar>::Vectorizable, + LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1, + RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1, + ResPacketSize = Vectorizable ? 
ei_packet_traits<ResScalar>::size : 1, + + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + nr = 4, + mr = 2*ResPacketSize, + WorkSpaceFactor = nr*RhsPacketSize, + + LhsProgress = ResPacketSize, + RhsProgress = ResPacketSize + }; + + typedef typename ei_packet_traits<LhsScalar>::type _LhsPacket; + typedef typename ei_packet_traits<RhsScalar>::type _RhsPacket; + typedef typename ei_packet_traits<ResScalar>::type _ResPacket; + + typedef typename ei_meta_if<Vectorizable,_LhsPacket,LhsScalar>::ret LhsPacket; + typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket; + typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket; + + typedef ResPacket AccPacket; + + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) + { + p = ei_pset1<ResPacket>(ResScalar(0)); + } + + EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) + { + for(DenseIndex k=0; k<n; k++) + ei_pstore(&b[k*RhsPacketSize], ei_pset1<RhsPacket>(rhs[k])); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = ei_pload<RhsPacket>(b); + } + + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + { + dest = ei_ploaddup<LhsPacket>(a); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + { + madd_impl(a, b, c, tmp, typename ei_meta_if<Vectorizable,ei_meta_true,ei_meta_false>::ret()); + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const ei_meta_true&) const + { + tmp = b; tmp.v = ei_pmul(a,tmp.v); c = ei_padd(c,tmp); + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const ei_meta_false&) const + { + c += a * b; + } + + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + { + r = cj.pmadd(alpha,c,r); + } + +protected: + 
ei_conj_helper<ResPacket,ResPacket,false,ConjRhs> cj; +}; + +/* optimized GEneral packed Block * packed Panel product kernel + * + * Mixing type logic: C += A * B + * | A | B | comments + * |real |cplx | no vectorization yet, would require to pack A with duplication + * |cplx |real | easy vectorization + */ +template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> struct ei_gebp_kernel { - void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, - Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, Scalar* unpackedB = 0) + typedef ei_gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits; + typedef typename Traits::ResScalar ResScalar; + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; + typedef typename Traits::AccPacket AccPacket; + + enum { + Vectorizable = Traits::Vectorizable, + LhsProgress = Traits::LhsProgress, + RhsProgress = Traits::RhsProgress, + ResPacketSize = Traits::ResPacketSize + }; + + EIGEN_FLATTEN_ATTRIB + void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0) { - typedef typename ei_packet_traits<Scalar>::type PacketType; - enum { PacketSize = ei_packet_traits<Scalar>::size }; + Traits traits; + if(strideA==-1) strideA = depth; if(strideB==-1) strideB = depth; - ei_conj_helper<Scalar,Scalar,ConjugateLhs,ConjugateRhs> cj; - ei_conj_helper<PacketType,PacketType,ConjugateLhs,ConjugateRhs> pcj; + ei_conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; +// ei_conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; Index packet_cols = (cols/nr) * nr; const Index peeled_mc = (rows/mr)*mr; - 
const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0); + // FIXME: + const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0); const Index peeled_kc = (depth/4)*4; if(unpackedB==0) - unpackedB = const_cast<Scalar*>(blockB - strideB * nr * PacketSize); + unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress); // loops on each micro vertical panel of rhs (depth x nr) for(Index j2=0; j2<packet_cols; j2+=nr) { - // unpack B - { - const Scalar* blB = &blockB[j2*strideB+offsetB*nr]; - Index n = depth*nr; - for(Index k=0; k<n; k++) - ei_pstore(&unpackedB[k*PacketSize], ei_pset1(blB[k])); - /*Scalar* dest = unpackedB; - for(Index k=0; k<n; k+=4*PacketSize) - { - #ifdef EIGEN_VECTORIZE_SSE - const Index S = 128; - const Index G = 16; - _mm_prefetch((const char*)(&blB[S/2+0]), _MM_HINT_T0); - _mm_prefetch((const char*)(&dest[S+0*G]), _MM_HINT_T0); - _mm_prefetch((const char*)(&dest[S+1*G]), _MM_HINT_T0); - _mm_prefetch((const char*)(&dest[S+2*G]), _MM_HINT_T0); - _mm_prefetch((const char*)(&dest[S+3*G]), _MM_HINT_T0); - #endif - - PacketType C0[PacketSize], C1[PacketSize], C2[PacketSize], C3[PacketSize]; - C0[0] = ei_pload(blB+0*PacketSize); - C1[0] = ei_pload(blB+1*PacketSize); - C2[0] = ei_pload(blB+2*PacketSize); - C3[0] = ei_pload(blB+3*PacketSize); - - ei_punpackp(C0); - ei_punpackp(C1); - ei_punpackp(C2); - ei_punpackp(C3); - - ei_pstore(dest+ 0*PacketSize, C0[0]); - ei_pstore(dest+ 1*PacketSize, C0[1]); - ei_pstore(dest+ 2*PacketSize, C0[2]); - ei_pstore(dest+ 3*PacketSize, C0[3]); - - ei_pstore(dest+ 4*PacketSize, C1[0]); - ei_pstore(dest+ 5*PacketSize, C1[1]); - ei_pstore(dest+ 6*PacketSize, C1[2]); - ei_pstore(dest+ 7*PacketSize, C1[3]); - - ei_pstore(dest+ 8*PacketSize, C2[0]); - ei_pstore(dest+ 9*PacketSize, C2[1]); - ei_pstore(dest+10*PacketSize, C2[2]); - ei_pstore(dest+11*PacketSize, C2[3]); - - ei_pstore(dest+12*PacketSize, C3[0]); - ei_pstore(dest+13*PacketSize, C3[1]); - 
ei_pstore(dest+14*PacketSize, C3[2]); - ei_pstore(dest+15*PacketSize, C3[3]); - - blB += 4*PacketSize; - dest += 16*PacketSize; - }*/ - } - // loops on each micro horizontal panel of lhs (mr x depth) + traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB); + + // loops on each largest micro horizontal panel of lhs (mr x depth) // => we select a mr x nr micro block of res which is entirely // stored into mr/packet_size x nr registers. for(Index i=0; i<peeled_mc; i+=mr) { - const Scalar* blA = &blockA[i*strideA+offsetA*mr]; + const LhsScalar* blA = &blockA[i*strideA+offsetA*mr]; ei_prefetch(&blA[0]); - // TODO move the res loads to the stores - // gets res block as register - PacketType C0, C1, C2, C3, C4, C5, C6, C7; - C0 = ei_pset1(Scalar(0)); - C1 = ei_pset1(Scalar(0)); - if(nr==4) C2 = ei_pset1(Scalar(0)); - if(nr==4) C3 = ei_pset1(Scalar(0)); - C4 = ei_pset1(Scalar(0)); - C5 = ei_pset1(Scalar(0)); - if(nr==4) C6 = ei_pset1(Scalar(0)); - if(nr==4) C7 = ei_pset1(Scalar(0)); - - Scalar* r0 = &res[(j2+0)*resStride + i]; - Scalar* r1 = r0 + resStride; - Scalar* r2 = r1 + resStride; - Scalar* r3 = r2 + resStride; + AccPacket C0, C1, C2, C3, C4, C5, C6, C7; + traits.initAcc(C0); + traits.initAcc(C1); + if(nr==4) traits.initAcc(C2); + if(nr==4) traits.initAcc(C3); + traits.initAcc(C4); + traits.initAcc(C5); + if(nr==4) traits.initAcc(C6); + if(nr==4) traits.initAcc(C7); + + ResScalar* r0 = &res[(j2+0)*resStride + i]; + ResScalar* r1 = r0 + resStride; + ResScalar* r2 = r1 + resStride; + ResScalar* r3 = r2 + resStride; ei_prefetch(r0+16); ei_prefetch(r1+16); @@ -251,122 +618,121 @@ struct ei_gebp_kernel // performs "inner" product // TODO let's check wether the folowing peeled loop could not be // optimized via optimal prefetching from one loop to the other - const Scalar* blB = unpackedB; + const RhsScalar* blB = unpackedB; for(Index k=0; k<peeled_kc; k+=4) { if(nr==2) { - PacketType B0, A0, A1; - #ifndef EIGEN_HAS_FUSE_CJMADD - PacketType T0; - #endif 
-EIGEN_ASM_COMMENT("mybegin"); - A0 = ei_pload(&blA[0*PacketSize]); - A1 = ei_pload(&blA[1*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - MADD(pcj,A0,B0,C0,T0); - MADD(pcj,A1,B0,C4,B0); - B0 = ei_pload(&blB[1*PacketSize]); - MADD(pcj,A0,B0,C1,T0); - MADD(pcj,A1,B0,C5,B0); - - A0 = ei_pload(&blA[2*PacketSize]); - A1 = ei_pload(&blA[3*PacketSize]); - B0 = ei_pload(&blB[2*PacketSize]); - MADD(pcj,A0,B0,C0,T0); - MADD(pcj,A1,B0,C4,B0); - B0 = ei_pload(&blB[3*PacketSize]); - MADD(pcj,A0,B0,C1,T0); - MADD(pcj,A1,B0,C5,B0); - - A0 = ei_pload(&blA[4*PacketSize]); - A1 = ei_pload(&blA[5*PacketSize]); - B0 = ei_pload(&blB[4*PacketSize]); - MADD(pcj,A0,B0,C0,T0); - MADD(pcj,A1,B0,C4,B0); - B0 = ei_pload(&blB[5*PacketSize]); - MADD(pcj,A0,B0,C1,T0); - MADD(pcj,A1,B0,C5,B0); - - A0 = ei_pload(&blA[6*PacketSize]); - A1 = ei_pload(&blA[7*PacketSize]); - B0 = ei_pload(&blB[6*PacketSize]); - MADD(pcj,A0,B0,C0,T0); - MADD(pcj,A1,B0,C4,B0); - B0 = ei_pload(&blB[7*PacketSize]); - MADD(pcj,A0,B0,C1,T0); - MADD(pcj,A1,B0,C5,B0); + LhsPacket A0, A1; + RhsPacket B0; + RhsPacket T0; + +EIGEN_ASM_COMMENT("mybegin2"); + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadLhs(&blA[1*LhsProgress], A1); + traits.loadRhs(&blB[0*RhsProgress], B0); + traits.madd(A0,B0,C0,T0); + traits.madd(A1,B0,C4,B0); + traits.loadRhs(&blB[1*RhsProgress], B0); + traits.madd(A0,B0,C1,T0); + traits.madd(A1,B0,C5,B0); + + traits.loadLhs(&blA[2*LhsProgress], A0); + traits.loadLhs(&blA[3*LhsProgress], A1); + traits.loadRhs(&blB[2*RhsProgress], B0); + traits.madd(A0,B0,C0,T0); + traits.madd(A1,B0,C4,B0); + traits.loadRhs(&blB[3*RhsProgress], B0); + traits.madd(A0,B0,C1,T0); + traits.madd(A1,B0,C5,B0); + + traits.loadLhs(&blA[4*LhsProgress], A0); + traits.loadLhs(&blA[5*LhsProgress], A1); + traits.loadRhs(&blB[4*RhsProgress], B0); + traits.madd(A0,B0,C0,T0); + traits.madd(A1,B0,C4,B0); + traits.loadRhs(&blB[5*RhsProgress], B0); + traits.madd(A0,B0,C1,T0); + traits.madd(A1,B0,C5,B0); + + 
traits.loadLhs(&blA[6*LhsProgress], A0); + traits.loadLhs(&blA[7*LhsProgress], A1); + traits.loadRhs(&blB[6*RhsProgress], B0); + traits.madd(A0,B0,C0,T0); + traits.madd(A1,B0,C4,B0); + traits.loadRhs(&blB[7*RhsProgress], B0); + traits.madd(A0,B0,C1,T0); + traits.madd(A1,B0,C5,B0); EIGEN_ASM_COMMENT("myend"); } else { - PacketType B0, B1, B2, B3, A0, A1; - #ifndef EIGEN_HAS_FUSE_CJMADD - PacketType T0; - #endif -EIGEN_ASM_COMMENT("mybegin"); - A0 = ei_pload(&blA[0*PacketSize]); - A1 = ei_pload(&blA[1*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - B1 = ei_pload(&blB[1*PacketSize]); - - MADD(pcj,A0,B0,C0,T0); - B2 = ei_pload(&blB[2*PacketSize]); - MADD(pcj,A1,B0,C4,B0); - B3 = ei_pload(&blB[3*PacketSize]); - B0 = ei_pload(&blB[4*PacketSize]); - MADD(pcj,A0,B1,C1,T0); - MADD(pcj,A1,B1,C5,B1); - B1 = ei_pload(&blB[5*PacketSize]); - MADD(pcj,A0,B2,C2,T0); - MADD(pcj,A1,B2,C6,B2); - B2 = ei_pload(&blB[6*PacketSize]); - MADD(pcj,A0,B3,C3,T0); - A0 = ei_pload(&blA[2*PacketSize]); - MADD(pcj,A1,B3,C7,B3); - A1 = ei_pload(&blA[3*PacketSize]); - B3 = ei_pload(&blB[7*PacketSize]); - MADD(pcj,A0,B0,C0,T0); - MADD(pcj,A1,B0,C4,B0); - B0 = ei_pload(&blB[8*PacketSize]); - MADD(pcj,A0,B1,C1,T0); - MADD(pcj,A1,B1,C5,B1); - B1 = ei_pload(&blB[9*PacketSize]); - MADD(pcj,A0,B2,C2,T0); - MADD(pcj,A1,B2,C6,B2); - B2 = ei_pload(&blB[10*PacketSize]); - MADD(pcj,A0,B3,C3,T0); - A0 = ei_pload(&blA[4*PacketSize]); - MADD(pcj,A1,B3,C7,B3); - A1 = ei_pload(&blA[5*PacketSize]); - B3 = ei_pload(&blB[11*PacketSize]); - - MADD(pcj,A0,B0,C0,T0); - MADD(pcj,A1,B0,C4,B0); - B0 = ei_pload(&blB[12*PacketSize]); - MADD(pcj,A0,B1,C1,T0); - MADD(pcj,A1,B1,C5,B1); - B1 = ei_pload(&blB[13*PacketSize]); - MADD(pcj,A0,B2,C2,T0); - MADD(pcj,A1,B2,C6,B2); - B2 = ei_pload(&blB[14*PacketSize]); - MADD(pcj,A0,B3,C3,T0); - A0 = ei_pload(&blA[6*PacketSize]); - MADD(pcj,A1,B3,C7,B3); - A1 = ei_pload(&blA[7*PacketSize]); - B3 = ei_pload(&blB[15*PacketSize]); - MADD(pcj,A0,B0,C0,T0); - MADD(pcj,A1,B0,C4,B0); - 
MADD(pcj,A0,B1,C1,T0); - MADD(pcj,A1,B1,C5,B1); - MADD(pcj,A0,B2,C2,T0); - MADD(pcj,A1,B2,C6,B2); - MADD(pcj,A0,B3,C3,T0); - MADD(pcj,A1,B3,C7,B3); -EIGEN_ASM_COMMENT("myend"); +EIGEN_ASM_COMMENT("mybegin4"); + LhsPacket A0, A1; + RhsPacket B0, B1, B2, B3; + RhsPacket T0; + + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadLhs(&blA[1*LhsProgress], A1); + traits.loadRhs(&blB[0*RhsProgress], B0); + traits.loadRhs(&blB[1*RhsProgress], B1); + + traits.madd(A0,B0,C0,T0); + traits.loadRhs(&blB[2*RhsProgress], B2); + traits.madd(A1,B0,C4,B0); + traits.loadRhs(&blB[3*RhsProgress], B3); + traits.loadRhs(&blB[4*RhsProgress], B0); + traits.madd(A0,B1,C1,T0); + traits.madd(A1,B1,C5,B1); + traits.loadRhs(&blB[5*RhsProgress], B1); + traits.madd(A0,B2,C2,T0); + traits.madd(A1,B2,C6,B2); + traits.loadRhs(&blB[6*RhsProgress], B2); + traits.madd(A0,B3,C3,T0); + traits.loadLhs(&blA[2*LhsProgress], A0); + traits.madd(A1,B3,C7,B3); + traits.loadLhs(&blA[3*LhsProgress], A1); + traits.loadRhs(&blB[7*RhsProgress], B3); + traits.madd(A0,B0,C0,T0); + traits.madd(A1,B0,C4,B0); + traits.loadRhs(&blB[8*RhsProgress], B0); + traits.madd(A0,B1,C1,T0); + traits.madd(A1,B1,C5,B1); + traits.loadRhs(&blB[9*RhsProgress], B1); + traits.madd(A0,B2,C2,T0); + traits.madd(A1,B2,C6,B2); + traits.loadRhs(&blB[10*RhsProgress], B2); + traits.madd(A0,B3,C3,T0); + traits.loadLhs(&blA[4*LhsProgress], A0); + traits.madd(A1,B3,C7,B3); + traits.loadLhs(&blA[5*LhsProgress], A1); + traits.loadRhs(&blB[11*RhsProgress], B3); + + traits.madd(A0,B0,C0,T0); + traits.madd(A1,B0,C4,B0); + traits.loadRhs(&blB[12*RhsProgress], B0); + traits.madd(A0,B1,C1,T0); + traits.madd(A1,B1,C5,B1); + traits.loadRhs(&blB[13*RhsProgress], B1); + traits.madd(A0,B2,C2,T0); + traits.madd(A1,B2,C6,B2); + traits.loadRhs(&blB[14*RhsProgress], B2); + traits.madd(A0,B3,C3,T0); + traits.loadLhs(&blA[6*LhsProgress], A0); + traits.madd(A1,B3,C7,B3); + traits.loadLhs(&blA[7*LhsProgress], A1); + traits.loadRhs(&blB[15*RhsProgress], B3); + 
traits.madd(A0,B0,C0,T0); + traits.madd(A1,B0,C4,B0); + traits.madd(A0,B1,C1,T0); + traits.madd(A1,B1,C5,B1); + traits.madd(A0,B2,C2,T0); + traits.madd(A1,B2,C6,B2); + traits.madd(A0,B3,C3,T0); + traits.madd(A1,B3,C7,B3); } - blB += 4*nr*PacketSize; + blB += 4*nr*RhsProgress; blA += 4*mr; } // process remaining peeled loop @@ -374,343 +740,370 @@ EIGEN_ASM_COMMENT("myend"); { if(nr==2) { - PacketType B0, A0, A1; - #ifndef EIGEN_HAS_FUSE_CJMADD - PacketType T0; - #endif - - A0 = ei_pload(&blA[0*PacketSize]); - A1 = ei_pload(&blA[1*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - MADD(pcj,A0,B0,C0,T0); - MADD(pcj,A1,B0,C4,B0); - B0 = ei_pload(&blB[1*PacketSize]); - MADD(pcj,A0,B0,C1,T0); - MADD(pcj,A1,B0,C5,B0); + LhsPacket A0, A1; + RhsPacket B0; + RhsPacket T0; + + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadLhs(&blA[1*LhsProgress], A1); + traits.loadRhs(&blB[0*RhsProgress], B0); + traits.madd(A0,B0,C0,T0); + traits.madd(A1,B0,C4,B0); + traits.loadRhs(&blB[1*RhsProgress], B0); + traits.madd(A0,B0,C1,T0); + traits.madd(A1,B0,C5,B0); } else { - PacketType B0, B1, B2, B3, A0, A1; - #ifndef EIGEN_HAS_FUSE_CJMADD - PacketType T0; - #endif - - A0 = ei_pload(&blA[0*PacketSize]); - A1 = ei_pload(&blA[1*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - B1 = ei_pload(&blB[1*PacketSize]); - - MADD(pcj,A0,B0,C0,T0); - B2 = ei_pload(&blB[2*PacketSize]); - MADD(pcj,A1,B0,C4,B0); - B3 = ei_pload(&blB[3*PacketSize]); - MADD(pcj,A0,B1,C1,T0); - MADD(pcj,A1,B1,C5,B1); - MADD(pcj,A0,B2,C2,T0); - MADD(pcj,A1,B2,C6,B2); - MADD(pcj,A0,B3,C3,T0); - MADD(pcj,A1,B3,C7,B3); + LhsPacket A0, A1; + RhsPacket B0, B1, B2, B3; + RhsPacket T0; + + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadLhs(&blA[1*LhsProgress], A1); + traits.loadRhs(&blB[0*RhsProgress], B0); + traits.loadRhs(&blB[1*RhsProgress], B1); + + traits.madd(A0,B0,C0,T0); + traits.loadRhs(&blB[2*RhsProgress], B2); + traits.madd(A1,B0,C4,B0); + traits.loadRhs(&blB[3*RhsProgress], B3); + 
traits.madd(A0,B1,C1,T0); + traits.madd(A1,B1,C5,B1); + traits.madd(A0,B2,C2,T0); + traits.madd(A1,B2,C6,B2); + traits.madd(A0,B3,C3,T0); + traits.madd(A1,B3,C7,B3); } - blB += nr*PacketSize; + blB += nr*RhsProgress; blA += mr; } - PacketType R0, R1, R2, R3, R4, R5, R6, R7; - - R0 = ei_ploadu(r0); - R1 = ei_ploadu(r1); - if(nr==4) R2 = ei_ploadu(r2); - if(nr==4) R3 = ei_ploadu(r3); - R4 = ei_ploadu(r0 + PacketSize); - R5 = ei_ploadu(r1 + PacketSize); - if(nr==4) R6 = ei_ploadu(r2 + PacketSize); - if(nr==4) R7 = ei_ploadu(r3 + PacketSize); - - C0 = ei_padd(R0, C0); - C1 = ei_padd(R1, C1); - if(nr==4) C2 = ei_padd(R2, C2); - if(nr==4) C3 = ei_padd(R3, C3); - C4 = ei_padd(R4, C4); - C5 = ei_padd(R5, C5); - if(nr==4) C6 = ei_padd(R6, C6); - if(nr==4) C7 = ei_padd(R7, C7); - - ei_pstoreu(r0, C0); - ei_pstoreu(r1, C1); - if(nr==4) ei_pstoreu(r2, C2); - if(nr==4) ei_pstoreu(r3, C3); - ei_pstoreu(r0 + PacketSize, C4); - ei_pstoreu(r1 + PacketSize, C5); - if(nr==4) ei_pstoreu(r2 + PacketSize, C6); - if(nr==4) ei_pstoreu(r3 + PacketSize, C7); + ResPacket R0, R1, R2, R3, R4, R5, R6, R7; + ResPacket alphav = ei_pset1<ResPacket>(alpha); + + R0 = ei_ploadu<ResPacket>(r0); + R1 = ei_ploadu<ResPacket>(r1); + if(nr==4) R2 = ei_ploadu<ResPacket>(r2); + if(nr==4) R3 = ei_ploadu<ResPacket>(r3); + R4 = ei_ploadu<ResPacket>(r0 + ResPacketSize); + R5 = ei_ploadu<ResPacket>(r1 + ResPacketSize); + if(nr==4) R6 = ei_ploadu<ResPacket>(r2 + ResPacketSize); + if(nr==4) R7 = ei_ploadu<ResPacket>(r3 + ResPacketSize); + + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + if(nr==4) traits.acc(C2, alphav, R2); + if(nr==4) traits.acc(C3, alphav, R3); + traits.acc(C4, alphav, R4); + traits.acc(C5, alphav, R5); + if(nr==4) traits.acc(C6, alphav, R6); + if(nr==4) traits.acc(C7, alphav, R7); + + ei_pstoreu(r0, R0); + ei_pstoreu(r1, R1); + if(nr==4) ei_pstoreu(r2, R2); + if(nr==4) ei_pstoreu(r3, R3); + ei_pstoreu(r0 + ResPacketSize, R4); + ei_pstoreu(r1 + ResPacketSize, R5); + if(nr==4) 
ei_pstoreu(r2 + ResPacketSize, R6); + if(nr==4) ei_pstoreu(r3 + ResPacketSize, R7); } - if(rows-peeled_mc>=PacketSize) + + if(rows-peeled_mc>=LhsProgress) { Index i = peeled_mc; - const Scalar* blA = &blockA[i*strideA+offsetA*PacketSize]; + const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; ei_prefetch(&blA[0]); // gets res block as register - PacketType C0, C1, C2, C3; - C0 = ei_ploadu(&res[(j2+0)*resStride + i]); - C1 = ei_ploadu(&res[(j2+1)*resStride + i]); - if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]); - if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]); + AccPacket C0, C1, C2, C3; + traits.initAcc(C0); + traits.initAcc(C1); + if(nr==4) traits.initAcc(C2); + if(nr==4) traits.initAcc(C3); // performs "inner" product - const Scalar* blB = unpackedB; + const RhsScalar* blB = unpackedB; for(Index k=0; k<peeled_kc; k+=4) { if(nr==2) { - PacketType B0, B1, A0; - - A0 = ei_pload(&blA[0*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - B1 = ei_pload(&blB[1*PacketSize]); - MADD(pcj,A0,B0,C0,B0); - B0 = ei_pload(&blB[2*PacketSize]); - MADD(pcj,A0,B1,C1,B1); - A0 = ei_pload(&blA[1*PacketSize]); - B1 = ei_pload(&blB[3*PacketSize]); - MADD(pcj,A0,B0,C0,B0); - B0 = ei_pload(&blB[4*PacketSize]); - MADD(pcj,A0,B1,C1,B1); - A0 = ei_pload(&blA[2*PacketSize]); - B1 = ei_pload(&blB[5*PacketSize]); - MADD(pcj,A0,B0,C0,B0); - B0 = ei_pload(&blB[6*PacketSize]); - MADD(pcj,A0,B1,C1,B1); - A0 = ei_pload(&blA[3*PacketSize]); - B1 = ei_pload(&blB[7*PacketSize]); - MADD(pcj,A0,B0,C0,B0); - MADD(pcj,A0,B1,C1,B1); + LhsPacket A0; + RhsPacket B0, B1; + + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadRhs(&blB[0*RhsProgress], B0); + traits.loadRhs(&blB[1*RhsProgress], B1); + traits.madd(A0,B0,C0,B0); + traits.loadRhs(&blB[2*RhsProgress], B0); + traits.madd(A0,B1,C1,B1); + traits.loadLhs(&blA[1*LhsProgress], A0); + traits.loadRhs(&blB[3*RhsProgress], B1); + traits.madd(A0,B0,C0,B0); + traits.loadRhs(&blB[4*RhsProgress], B0); + traits.madd(A0,B1,C1,B1); + 
traits.loadLhs(&blA[2*LhsProgress], A0); + traits.loadRhs(&blB[5*RhsProgress], B1); + traits.madd(A0,B0,C0,B0); + traits.loadRhs(&blB[6*RhsProgress], B0); + traits.madd(A0,B1,C1,B1); + traits.loadLhs(&blA[3*LhsProgress], A0); + traits.loadRhs(&blB[7*RhsProgress], B1); + traits.madd(A0,B0,C0,B0); + traits.madd(A0,B1,C1,B1); } else { - PacketType B0, B1, B2, B3, A0; - - A0 = ei_pload(&blA[0*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - B1 = ei_pload(&blB[1*PacketSize]); - - MADD(pcj,A0,B0,C0,B0); - B2 = ei_pload(&blB[2*PacketSize]); - B3 = ei_pload(&blB[3*PacketSize]); - B0 = ei_pload(&blB[4*PacketSize]); - MADD(pcj,A0,B1,C1,B1); - B1 = ei_pload(&blB[5*PacketSize]); - MADD(pcj,A0,B2,C2,B2); - B2 = ei_pload(&blB[6*PacketSize]); - MADD(pcj,A0,B3,C3,B3); - A0 = ei_pload(&blA[1*PacketSize]); - B3 = ei_pload(&blB[7*PacketSize]); - MADD(pcj,A0,B0,C0,B0); - B0 = ei_pload(&blB[8*PacketSize]); - MADD(pcj,A0,B1,C1,B1); - B1 = ei_pload(&blB[9*PacketSize]); - MADD(pcj,A0,B2,C2,B2); - B2 = ei_pload(&blB[10*PacketSize]); - MADD(pcj,A0,B3,C3,B3); - A0 = ei_pload(&blA[2*PacketSize]); - B3 = ei_pload(&blB[11*PacketSize]); - - MADD(pcj,A0,B0,C0,B0); - B0 = ei_pload(&blB[12*PacketSize]); - MADD(pcj,A0,B1,C1,B1); - B1 = ei_pload(&blB[13*PacketSize]); - MADD(pcj,A0,B2,C2,B2); - B2 = ei_pload(&blB[14*PacketSize]); - MADD(pcj,A0,B3,C3,B3); - - A0 = ei_pload(&blA[3*PacketSize]); - B3 = ei_pload(&blB[15*PacketSize]); - MADD(pcj,A0,B0,C0,B0); - MADD(pcj,A0,B1,C1,B1); - MADD(pcj,A0,B2,C2,B2); - MADD(pcj,A0,B3,C3,B3); + LhsPacket A0; + RhsPacket B0, B1, B2, B3; + + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadRhs(&blB[0*RhsProgress], B0); + traits.loadRhs(&blB[1*RhsProgress], B1); + + traits.madd(A0,B0,C0,B0); + traits.loadRhs(&blB[2*RhsProgress], B2); + traits.loadRhs(&blB[3*RhsProgress], B3); + traits.loadRhs(&blB[4*RhsProgress], B0); + traits.madd(A0,B1,C1,B1); + traits.loadRhs(&blB[5*RhsProgress], B1); + traits.madd(A0,B2,C2,B2); + traits.loadRhs(&blB[6*RhsProgress], B2); + 
traits.madd(A0,B3,C3,B3); + traits.loadLhs(&blA[1*LhsProgress], A0); + traits.loadRhs(&blB[7*RhsProgress], B3); + traits.madd(A0,B0,C0,B0); + traits.loadRhs(&blB[8*RhsProgress], B0); + traits.madd(A0,B1,C1,B1); + traits.loadRhs(&blB[9*RhsProgress], B1); + traits.madd(A0,B2,C2,B2); + traits.loadRhs(&blB[10*RhsProgress], B2); + traits.madd(A0,B3,C3,B3); + traits.loadLhs(&blA[2*LhsProgress], A0); + traits.loadRhs(&blB[11*RhsProgress], B3); + + traits.madd(A0,B0,C0,B0); + traits.loadRhs(&blB[12*RhsProgress], B0); + traits.madd(A0,B1,C1,B1); + traits.loadRhs(&blB[13*RhsProgress], B1); + traits.madd(A0,B2,C2,B2); + traits.loadRhs(&blB[14*RhsProgress], B2); + traits.madd(A0,B3,C3,B3); + + traits.loadLhs(&blA[3*LhsProgress], A0); + traits.loadRhs(&blB[15*RhsProgress], B3); + traits.madd(A0,B0,C0,B0); + traits.madd(A0,B1,C1,B1); + traits.madd(A0,B2,C2,B2); + traits.madd(A0,B3,C3,B3); } - blB += 4*nr*PacketSize; - blA += 4*PacketSize; + blB += nr*4*RhsProgress; + blA += 4*LhsProgress; } // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) { if(nr==2) { - PacketType B0, A0; - #ifndef EIGEN_HAS_FUSE_CJMADD - PacketType T0; - #endif - - A0 = ei_pload(&blA[0*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - MADD(pcj,A0,B0,C0,T0); - B0 = ei_pload(&blB[1*PacketSize]); - MADD(pcj,A0,B0,C1,T0); + LhsPacket A0; + RhsPacket B0, B1; + + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadRhs(&blB[0*RhsProgress], B0); + traits.loadRhs(&blB[1*RhsProgress], B1); + traits.madd(A0,B0,C0,B0); + traits.madd(A0,B1,C1,B1); } else { - PacketType B0, B1, B2, B3, A0; - #ifndef EIGEN_HAS_FUSE_CJMADD - PacketType T0, T1; - #endif - - A0 = ei_pload(&blA[0*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - B1 = ei_pload(&blB[1*PacketSize]); - B2 = ei_pload(&blB[2*PacketSize]); - B3 = ei_pload(&blB[3*PacketSize]); - - MADD(pcj,A0,B0,C0,T0); - MADD(pcj,A0,B1,C1,T1); - MADD(pcj,A0,B2,C2,T0); - MADD(pcj,A0,B3,C3,T1); + LhsPacket A0; + RhsPacket B0, B1, B2, B3; + + 
traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadRhs(&blB[0*RhsProgress], B0); + traits.loadRhs(&blB[1*RhsProgress], B1); + traits.loadRhs(&blB[2*RhsProgress], B2); + traits.loadRhs(&blB[3*RhsProgress], B3); + + traits.madd(A0,B0,C0,B0); + traits.madd(A0,B1,C1,B1); + traits.madd(A0,B2,C2,B2); + traits.madd(A0,B3,C3,B3); } - blB += nr*PacketSize; - blA += PacketSize; + blB += nr*RhsProgress; + blA += LhsProgress; } - ei_pstoreu(&res[(j2+0)*resStride + i], C0); - ei_pstoreu(&res[(j2+1)*resStride + i], C1); - if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2); - if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3); + ResPacket R0, R1, R2, R3; + ResPacket alphav = ei_pset1<ResPacket>(alpha); + + ResScalar* r0 = &res[(j2+0)*resStride + i]; + ResScalar* r1 = r0 + resStride; + ResScalar* r2 = r1 + resStride; + ResScalar* r3 = r2 + resStride; + + R0 = ei_ploadu<ResPacket>(r0); + R1 = ei_ploadu<ResPacket>(r1); + if(nr==4) R2 = ei_ploadu<ResPacket>(r2); + if(nr==4) R3 = ei_ploadu<ResPacket>(r3); + + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + if(nr==4) traits.acc(C2, alphav, R2); + if(nr==4) traits.acc(C3, alphav, R3); + + ei_pstoreu(r0, R0); + ei_pstoreu(r1, R1); + if(nr==4) ei_pstoreu(r2, R2); + if(nr==4) ei_pstoreu(r3, R3); } for(Index i=peeled_mc2; i<rows; i++) { - const Scalar* blA = &blockA[i*strideA+offsetA]; + const LhsScalar* blA = &blockA[i*strideA+offsetA]; ei_prefetch(&blA[0]); // gets a 1 x nr res block as registers - Scalar C0(0), C1(0), C2(0), C3(0); + ResScalar C0(0), C1(0), C2(0), C3(0); // TODO directly use blockB ??? 
- const Scalar* blB = unpackedB;//&blockB[j2*strideB+offsetB*nr]; + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; for(Index k=0; k<depth; k++) { if(nr==2) { - Scalar B0, A0; - #ifndef EIGEN_HAS_FUSE_CJMADD - Scalar T0; - #endif + LhsScalar A0; + RhsScalar B0, B1; A0 = blA[k]; - B0 = blB[0*PacketSize]; - MADD(cj,A0,B0,C0,T0); - B0 = blB[1*PacketSize]; - MADD(cj,A0,B0,C1,T0); + B0 = blB[0]; + B1 = blB[1]; + MADD(cj,A0,B0,C0,B0); + MADD(cj,A0,B1,C1,B1); } else { - Scalar B0, B1, B2, B3, A0; - #ifndef EIGEN_HAS_FUSE_CJMADD - Scalar T0, T1; - #endif + LhsScalar A0; + RhsScalar B0, B1, B2, B3; A0 = blA[k]; - B0 = blB[0*PacketSize]; - B1 = blB[1*PacketSize]; - B2 = blB[2*PacketSize]; - B3 = blB[3*PacketSize]; - - MADD(cj,A0,B0,C0,T0); - MADD(cj,A0,B1,C1,T1); - MADD(cj,A0,B2,C2,T0); - MADD(cj,A0,B3,C3,T1); + B0 = blB[0]; + B1 = blB[1]; + B2 = blB[2]; + B3 = blB[3]; + + MADD(cj,A0,B0,C0,B0); + MADD(cj,A0,B1,C1,B1); + MADD(cj,A0,B2,C2,B2); + MADD(cj,A0,B3,C3,B3); } - blB += nr*PacketSize; + blB += nr; } - res[(j2+0)*resStride + i] += C0; - res[(j2+1)*resStride + i] += C1; - if(nr==4) res[(j2+2)*resStride + i] += C2; - if(nr==4) res[(j2+3)*resStride + i] += C3; + res[(j2+0)*resStride + i] += alpha*C0; + res[(j2+1)*resStride + i] += alpha*C1; + if(nr==4) res[(j2+2)*resStride + i] += alpha*C2; + if(nr==4) res[(j2+3)*resStride + i] += alpha*C3; } } - // process remaining rhs/res columns one at a time // => do the same but with nr==1 for(Index j2=packet_cols; j2<cols; j2++) { // unpack B { - const Scalar* blB = &blockB[j2*strideB+offsetB]; - for(Index k=0; k<depth; k++) - ei_pstore(&unpackedB[k*PacketSize], ei_pset1(blB[k])); + traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB); +// const RhsScalar* blB = &blockB[j2*strideB+offsetB]; +// for(Index k=0; k<depth; k++) +// ei_pstore(&unpackedB[k*RhsPacketSize], ei_pset1<RhsPacket>(blB[k])); } for(Index i=0; i<peeled_mc; i+=mr) { - const Scalar* blA = &blockA[i*strideA+offsetA*mr]; + const LhsScalar* blA = 
&blockA[i*strideA+offsetA*mr]; ei_prefetch(&blA[0]); // TODO move the res loads to the stores // get res block as registers - PacketType C0, C4; - C0 = ei_ploadu(&res[(j2+0)*resStride + i]); - C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]); + AccPacket C0, C4; + traits.initAcc(C0); + traits.initAcc(C4); - const Scalar* blB = unpackedB; + const RhsScalar* blB = unpackedB; for(Index k=0; k<depth; k++) { - PacketType B0, A0, A1; - #ifndef EIGEN_HAS_FUSE_CJMADD - PacketType T0, T1; - #endif - - A0 = ei_pload(&blA[0*PacketSize]); - A1 = ei_pload(&blA[1*PacketSize]); - B0 = ei_pload(&blB[0*PacketSize]); - MADD(pcj,A0,B0,C0,T0); - MADD(pcj,A1,B0,C4,T1); - - blB += PacketSize; - blA += mr; + LhsPacket A0, A1; + RhsPacket B0; + RhsPacket T0; + + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadLhs(&blA[1*LhsProgress], A1); + traits.loadRhs(&blB[0*RhsProgress], B0); + traits.madd(A0,B0,C0,T0); + traits.madd(A1,B0,C4,B0); + + blB += RhsProgress; + blA += 2*LhsProgress; } + ResPacket R0, R4; + ResPacket alphav = ei_pset1<ResPacket>(alpha); + + ResScalar* r0 = &res[(j2+0)*resStride + i]; + + R0 = ei_ploadu<ResPacket>(r0); + R4 = ei_ploadu<ResPacket>(r0+ResPacketSize); + + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R4); - ei_pstoreu(&res[(j2+0)*resStride + i], C0); - ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4); + ei_pstoreu(r0, R0); + ei_pstoreu(r0+ResPacketSize, R4); } - if(rows-peeled_mc>=PacketSize) + if(rows-peeled_mc>=LhsProgress) { Index i = peeled_mc; - const Scalar* blA = &blockA[i*strideA+offsetA*PacketSize]; + const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; ei_prefetch(&blA[0]); - PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]); + AccPacket C0; + traits.initAcc(C0); - const Scalar* blB = unpackedB; + const RhsScalar* blB = unpackedB; for(Index k=0; k<depth; k++) { - PacketType T0; - MADD(pcj,ei_pload(blA), ei_pload(blB), C0, T0); - blB += PacketSize; - blA += PacketSize; + LhsPacket A0; + RhsPacket B0; + 
traits.loadLhs(blA, A0); + traits.loadRhs(blB, B0); + traits.madd(A0, B0, C0, B0); + blB += RhsProgress; + blA += LhsProgress; } - ei_pstoreu(&res[(j2+0)*resStride + i], C0); + ResPacket alphav = ei_pset1<ResPacket>(alpha); + ResPacket R0 = ei_ploadu<ResPacket>(&res[(j2+0)*resStride + i]); + traits.acc(C0, alphav, R0); + ei_pstoreu(&res[(j2+0)*resStride + i], R0); } for(Index i=peeled_mc2; i<rows; i++) { - const Scalar* blA = &blockA[i*strideA+offsetA]; + const LhsScalar* blA = &blockA[i*strideA+offsetA]; ei_prefetch(&blA[0]); // gets a 1 x 1 res block as registers - Scalar C0(0); + ResScalar C0(0); // FIXME directly use blockB ?? - const Scalar* blB = unpackedB; + const RhsScalar* blB = &blockB[j2*strideB+offsetB]; for(Index k=0; k<depth; k++) { - #ifndef EIGEN_HAS_FUSE_CJMADD - Scalar T0; - #endif - MADD(cj,blA[k], blB[k*PacketSize], C0, T0); + LhsScalar A0 = blA[k]; + RhsScalar B0 = blB[k]; + MADD(cj, A0, B0, C0, B0); } - res[(j2+0)*resStride + i] += C0; + res[(j2+0)*resStride + i] += alpha*C0; } } } @@ -732,34 +1125,34 @@ EIGEN_ASM_COMMENT("myend"); // // 32 33 34 35 ... // 36 36 38 39 ... 
-template<typename Scalar, typename Index, int mr, int StorageOrder, bool Conjugate, bool PanelMode> +template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode> struct ei_gemm_pack_lhs { void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0) { - enum { PacketSize = ei_packet_traits<Scalar>::size }; +// enum { PacketSize = ei_packet_traits<Scalar>::size }; ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; ei_const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride); Index count = 0; - Index peeled_mc = (rows/mr)*mr; - for(Index i=0; i<peeled_mc; i+=mr) + Index peeled_mc = (rows/Pack1)*Pack1; + for(Index i=0; i<peeled_mc; i+=Pack1) { - if(PanelMode) count += mr * offset; + if(PanelMode) count += Pack1 * offset; for(Index k=0; k<depth; k++) - for(Index w=0; w<mr; w++) + for(Index w=0; w<Pack1; w++) blockA[count++] = cj(lhs(i+w, k)); - if(PanelMode) count += mr * (stride-offset-depth); + if(PanelMode) count += Pack1 * (stride-offset-depth); } - if(rows-peeled_mc>=PacketSize) + if(rows-peeled_mc>=Pack2) { - if(PanelMode) count += PacketSize*offset; + if(PanelMode) count += Pack2*offset; for(Index k=0; k<depth; k++) - for(Index w=0; w<PacketSize; w++) + for(Index w=0; w<Pack2; w++) blockA[count++] = cj(lhs(peeled_mc+w, k)); - if(PanelMode) count += PacketSize * (stride-offset-depth); - peeled_mc += PacketSize; + if(PanelMode) count += Pack2 * (stride-offset-depth); + peeled_mc += Pack2; } for(Index i=peeled_mc; i<rows; i++) { @@ -783,12 +1176,11 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode> { typedef typename ei_packet_traits<Scalar>::type Packet; enum { PacketSize = ei_packet_traits<Scalar>::size }; - void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Scalar 
alpha, Index depth, Index cols, + void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0) { ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; - bool hasAlpha = alpha != Scalar(1); Index packet_cols = (cols/nr) * nr; Index count = 0; for(Index j2=0; j2<packet_cols; j2+=nr) @@ -799,24 +1191,14 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode> const Scalar* b1 = &rhs[(j2+1)*rhsStride]; const Scalar* b2 = &rhs[(j2+2)*rhsStride]; const Scalar* b3 = &rhs[(j2+3)*rhsStride]; - if (hasAlpha) - for(Index k=0; k<depth; k++) - { - blockB[count+0] = alpha*cj(b0[k]); - blockB[count+1] = alpha*cj(b1[k]); - if(nr==4) blockB[count+2] = alpha*cj(b2[k]); - if(nr==4) blockB[count+3] = alpha*cj(b3[k]); - count += nr; - } - else - for(Index k=0; k<depth; k++) - { - blockB[count+0] = cj(b0[k]); - blockB[count+1] = cj(b1[k]); - if(nr==4) blockB[count+2] = cj(b2[k]); - if(nr==4) blockB[count+3] = cj(b3[k]); - count += nr; - } + for(Index k=0; k<depth; k++) + { + blockB[count+0] = cj(b0[k]); + blockB[count+1] = cj(b1[k]); + if(nr==4) blockB[count+2] = cj(b2[k]); + if(nr==4) blockB[count+3] = cj(b3[k]); + count += nr; + } // skip what we have after if(PanelMode) count += nr * (stride-offset-depth); } @@ -826,18 +1208,11 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode> { if(PanelMode) count += offset; const Scalar* b0 = &rhs[(j2+0)*rhsStride]; - if (hasAlpha) - for(Index k=0; k<depth; k++) - { - blockB[count] = alpha*cj(b0[k]); - count += 1; - } - else - for(Index k=0; k<depth; k++) - { - blockB[count] = cj(b0[k]); - count += 1; - } + for(Index k=0; k<depth; k++) + { + blockB[count] = cj(b0[k]); + count += 1; + } if(PanelMode) count += (stride-offset-depth); } } @@ -848,41 +1223,25 @@ template<typename Scalar, typename Index, int nr, bool Conjugate, bool 
PanelMode struct ei_gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode> { enum { PacketSize = ei_packet_traits<Scalar>::size }; - void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Scalar alpha, Index depth, Index cols, + void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0) { ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; - bool hasAlpha = alpha != Scalar(1); Index packet_cols = (cols/nr) * nr; Index count = 0; for(Index j2=0; j2<packet_cols; j2+=nr) { // skip what we have before if(PanelMode) count += nr * offset; - if (hasAlpha) - { - for(Index k=0; k<depth; k++) - { - const Scalar* b0 = &rhs[k*rhsStride + j2]; - blockB[count+0] = alpha*cj(b0[0]); - blockB[count+1] = alpha*cj(b0[1]); - if(nr==4) blockB[count+2] = alpha*cj(b0[2]); - if(nr==4) blockB[count+3] = alpha*cj(b0[3]); - count += nr; - } - } - else + for(Index k=0; k<depth; k++) { - for(Index k=0; k<depth; k++) - { - const Scalar* b0 = &rhs[k*rhsStride + j2]; - blockB[count+0] = cj(b0[0]); - blockB[count+1] = cj(b0[1]); - if(nr==4) blockB[count+2] = cj(b0[2]); - if(nr==4) blockB[count+3] = cj(b0[3]); - count += nr; - } + const Scalar* b0 = &rhs[k*rhsStride + j2]; + blockB[count+0] = cj(b0[0]); + blockB[count+1] = cj(b0[1]); + if(nr==4) blockB[count+2] = cj(b0[2]); + if(nr==4) blockB[count+3] = cj(b0[3]); + count += nr; } // skip what we have after if(PanelMode) count += nr * (stride-offset-depth); @@ -894,7 +1253,7 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode> const Scalar* b0 = &rhs[j2]; for(Index k=0; k<depth; k++) { - blockB[count] = alpha*cj(b0[k*rhsStride]); + blockB[count] = cj(b0[k*rhsStride]); count += 1; } if(PanelMode) count += stride-offset-depth; diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h 
index 2ae78c1e7..1cdfb84d1 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -29,26 +29,25 @@ template<typename _LhsScalar, typename _RhsScalar> class ei_level3_blocking; /* Specialization for a row-major destination matrix => simple transposition of the product */ template< - typename Scalar, typename Index, - int LhsStorageOrder, bool ConjugateLhs, - int RhsStorageOrder, bool ConjugateRhs> -struct ei_general_matrix_matrix_product<Scalar,Index,LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,RowMajor> + typename Index, + typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs> +struct ei_general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor> { + typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run( Index rows, Index cols, Index depth, - const Scalar* lhs, Index lhsStride, - const Scalar* rhs, Index rhsStride, - Scalar* res, Index resStride, - Scalar alpha, - ei_level3_blocking<Scalar,Scalar>& blocking, + const LhsScalar* lhs, Index lhsStride, + const RhsScalar* rhs, Index rhsStride, + ResScalar* res, Index resStride, + ResScalar alpha, + ei_level3_blocking<RhsScalar,LhsScalar>& blocking, GemmParallelInfo<Index>* info = 0) { // transpose the product such that the result is column major - ei_general_matrix_matrix_product<Scalar, Index, - RhsStorageOrder==RowMajor ? ColMajor : RowMajor, - ConjugateRhs, - LhsStorageOrder==RowMajor ? ColMajor : RowMajor, - ConjugateLhs, + ei_general_matrix_matrix_product<Index, + RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs, + LhsScalar, LhsStorageOrder==RowMajor ? 
ColMajor : RowMajor, ConjugateLhs, ColMajor> ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info); } @@ -57,41 +56,32 @@ struct ei_general_matrix_matrix_product<Scalar,Index,LhsStorageOrder,ConjugateLh /* Specialization for a col-major destination matrix * => Blocking algorithm following Goto's paper */ template< - typename Scalar, typename Index, - int LhsStorageOrder, bool ConjugateLhs, - int RhsStorageOrder, bool ConjugateRhs> -struct ei_general_matrix_matrix_product<Scalar,Index,LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor> + typename Index, + typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs> +struct ei_general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor> { +typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; static void run(Index rows, Index cols, Index depth, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, - Scalar alpha, - ei_level3_blocking<Scalar,Scalar>& blocking, + const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsStride, + ResScalar* res, Index resStride, + ResScalar alpha, + ei_level3_blocking<LhsScalar,RhsScalar>& blocking, GemmParallelInfo<Index>* info = 0) { - ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); + ei_const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); + ei_const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - typedef typename ei_packet_traits<Scalar>::type PacketType; - typedef ei_product_blocking_traits<Scalar> Blocking; + typedef ei_gebp_traits<LhsScalar,RhsScalar> Traits; Index kc = blocking.kc(); // cache block size along the K direction Index mc = 
std::min(rows,blocking.mc()); // cache block size along the M direction //Index nc = blocking.nc(); // cache block size along the N direction - // FIXME starting from SSE3, normal complex product cannot be optimized as well as - // conjugate product, therefore it is better to conjugate during the copies. - // With SSE2, this is the other way round. - ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder, ConjugateLhs> pack_lhs; - ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder, ConjugateRhs> pack_rhs; - ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr> gebp; - -// if (ConjugateRhs) -// alpha = ei_conj(alpha); -// ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs; -// ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs; -// ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp; + ei_gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + ei_gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs; + ei_gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; #ifdef EIGEN_HAS_OPENMP if(info) @@ -100,10 +90,10 @@ static void run(Index rows, Index cols, Index depth, Index tid = omp_get_thread_num(); Index threads = omp_get_num_threads(); - Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - std::size_t sizeW = kc*Blocking::PacketSize*Blocking::nr*8; - Scalar* w = ei_aligned_stack_new(Scalar, sizeW); - Scalar* blockB = blocking.blockB(); + LhsScalar* blockA = ei_aligned_stack_new(LhsScalar, kc*mc); + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + RhsScalar* w = ei_aligned_stack_new(RhsScalar, sizeW); + RhsScalar* blockB = blocking.blockB(); ei_internal_assert(blockB!=0); // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs... 
@@ -124,7 +114,7 @@ static void run(Index rows, Index cols, Index depth, while(info[tid].users!=0) {} info[tid].users += threads; - pack_rhs(blockB+info[tid].rhs_start*kc, &rhs(k,info[tid].rhs_start), rhsStride, alpha, actual_kc, info[tid].rhs_length); + pack_rhs(blockB+info[tid].rhs_start*kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length); // Notify the other threads that the part B'_j is ready to go. info[tid].sync = k; @@ -140,7 +130,7 @@ static void run(Index rows, Index cols, Index depth, if(shift>0) while(info[j].sync!=k) {} - gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*kc, mc, actual_kc, info[j].rhs_length, -1,-1,0,0, w); + gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w); } // Then keep going as usual with the remaining A' @@ -152,7 +142,7 @@ static void run(Index rows, Index cols, Index depth, pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc); // C_i += A' * B' - gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, -1,-1,0,0, w); + gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w); } // Release all the sub blocks B'_j of B' for the current thread, @@ -162,8 +152,8 @@ static void run(Index rows, Index cols, Index depth, --(info[j].users); } - ei_aligned_stack_delete(Scalar, blockA, kc*mc); - ei_aligned_stack_delete(Scalar, w, sizeW); + ei_aligned_stack_delete(LhsScalar, blockA, kc*mc); + ei_aligned_stack_delete(RhsScalar, w, sizeW); } else #endif // EIGEN_HAS_OPENMP @@ -173,10 +163,10 @@ static void run(Index rows, Index cols, Index depth, // this is the sequential version! std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; - std::size_t sizeW = kc*Blocking::PacketSize*Blocking::nr; - Scalar *blockA = blocking.blockA()==0 ? ei_aligned_stack_new(Scalar, sizeA) : blocking.blockA(); - Scalar *blockB = blocking.blockB()==0 ? 
ei_aligned_stack_new(Scalar, sizeB) : blocking.blockB(); - Scalar *blockW = blocking.blockW()==0 ? ei_aligned_stack_new(Scalar, sizeW) : blocking.blockW(); + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + LhsScalar *blockA = blocking.blockA()==0 ? ei_aligned_stack_new(LhsScalar, sizeA) : blocking.blockA(); + RhsScalar *blockB = blocking.blockB()==0 ? ei_aligned_stack_new(RhsScalar, sizeB) : blocking.blockB(); + RhsScalar *blockW = blocking.blockW()==0 ? ei_aligned_stack_new(RhsScalar, sizeW) : blocking.blockW(); // For each horizontal panel of the rhs, and corresponding panel of the lhs... // (==GEMM_VAR1) @@ -188,7 +178,7 @@ static void run(Index rows, Index cols, Index depth, // => Pack rhs's panel into a sequential chunk of memory (L2 caching) // Note that this panel will be read as many times as the number of blocks in the lhs's // vertical panel which is, in practice, a very low number. - pack_rhs(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols); + pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); // For each mc x kc block of the lhs's vertical panel... 
@@ -203,14 +193,14 @@ static void run(Index rows, Index cols, Index depth, pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); // Everything is packed, we can now call the block * panel kernel: - gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, -1, -1, 0, 0, blockW); + gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); } } - if(blocking.blockA()==0) ei_aligned_stack_delete(Scalar, blockA, kc*mc); - if(blocking.blockB()==0) ei_aligned_stack_delete(Scalar, blockB, sizeB); - if(blocking.blockW()==0) ei_aligned_stack_delete(Scalar, blockW, sizeW); + if(blocking.blockA()==0) ei_aligned_stack_delete(LhsScalar, blockA, kc*mc); + if(blocking.blockB()==0) ei_aligned_stack_delete(RhsScalar, blockB, sizeB); + if(blocking.blockW()==0) ei_aligned_stack_delete(RhsScalar, blockW, sizeW); } } @@ -245,8 +235,8 @@ struct ei_gemm_functor cols = m_rhs.cols(); Gemm::run(rows, cols, m_lhs.cols(), - (const Scalar*)&(m_lhs.const_cast_derived().coeffRef(row,0)), m_lhs.outerStride(), - (const Scalar*)&(m_rhs.const_cast_derived().coeffRef(0,col)), m_rhs.outerStride(), + /*(const Scalar*)*/&(m_lhs.const_cast_derived().coeffRef(row,0)), m_lhs.outerStride(), + /*(const Scalar*)*/&(m_rhs.const_cast_derived().coeffRef(0,col)), m_rhs.outerStride(), (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(), m_actualAlpha, m_blocking, info); } @@ -305,11 +295,11 @@ class ei_gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols }; typedef typename ei_meta_if<Transpose,_RhsScalar,_LhsScalar>::ret LhsScalar; typedef typename ei_meta_if<Transpose,_LhsScalar,_RhsScalar>::ret RhsScalar; - typedef ei_product_blocking_traits<RhsScalar> Blocking; + typedef ei_gebp_traits<LhsScalar,RhsScalar> Traits; enum { SizeA = ActualRows * MaxDepth, SizeB = ActualCols * MaxDepth, - SizeW = MaxDepth * Blocking::nr * ei_packet_traits<RhsScalar>::size + SizeW = MaxDepth * Traits::WorkSpaceFactor }; EIGEN_ALIGN16 LhsScalar 
m_staticA[SizeA]; @@ -345,7 +335,7 @@ class ei_gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols }; typedef typename ei_meta_if<Transpose,_RhsScalar,_LhsScalar>::ret LhsScalar; typedef typename ei_meta_if<Transpose,_LhsScalar,_RhsScalar>::ret RhsScalar; - typedef ei_product_blocking_traits<RhsScalar> Blocking; + typedef ei_gebp_traits<LhsScalar,RhsScalar> Traits; DenseIndex m_sizeA; DenseIndex m_sizeB; @@ -362,7 +352,7 @@ class ei_gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols computeProductBlockingSizes<LhsScalar,RhsScalar>(this->m_kc, this->m_mc, this->m_nc); m_sizeA = this->m_mc * this->m_kc; m_sizeB = this->m_kc * this->m_nc; - m_sizeW = this->m_kc*ei_packet_traits<RhsScalar>::size*Blocking::nr; + m_sizeW = this->m_kc*Traits::WorkSpaceFactor; } void allocateA() @@ -407,11 +397,15 @@ class GeneralProduct<Lhs, Rhs, GemmProduct> }; public: EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct) + + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + typedef Scalar ResScalar; GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) { - EIGEN_STATIC_ASSERT((ei_is_same_type<typename Lhs::Scalar, typename Rhs::Scalar>::ret), - YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + typedef ei_scalar_product_op<LhsScalar,RhsScalar> BinOp; + EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar); } template<typename Dest> void scaleAndAddTo(Dest& dst, Scalar alpha) const @@ -424,15 +418,15 @@ class GeneralProduct<Lhs, Rhs, GemmProduct> Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) * RhsBlasTraits::extractScalarFactor(m_rhs); - typedef ei_gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, + typedef ei_gemm_blocking_space<(Dest::Flags&RowMajorBit) ? 
RowMajor : ColMajor,LhsScalar,RhsScalar, Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType; typedef ei_gemm_functor< Scalar, Index, ei_general_matrix_matrix_product< - Scalar, Index, - (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), - (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), + Index, + LhsScalar, (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), + RhsScalar, (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor; diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 9713b4849..44986dc16 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -30,56 +30,77 @@ * the number of load/stores of the result by a factor 4 and to reduce * the instruction dependency. Moreover, we know that all bands have the * same alignment pattern. 
- * TODO: since rhs gets evaluated only once, no need to evaluate it + * + * Mixing type logic: C += alpha * A * B + * | A | B |alpha| comments + * |real |cplx |cplx | no vectorization + * |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization + * |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp + * |cplx |real |real | optimal case, vectorization possible via real-cplx mul */ -template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index, typename RhsType> -static EIGEN_DONT_INLINE -void ei_cache_friendly_product_colmajor_times_vector( - Index size, - const Scalar* lhs, Index lhsStride, - const RhsType& rhs, - Scalar* res, - Scalar alpha) +template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> +struct ei_general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs> { +typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + +enum { + Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable + && int(ei_packet_traits<LhsScalar>::size)==int(ei_packet_traits<RhsScalar>::size), + LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1, + RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1, + ResPacketSize = Vectorizable ? 
ei_packet_traits<ResScalar>::size : 1 +}; + +typedef typename ei_packet_traits<LhsScalar>::type _LhsPacket; +typedef typename ei_packet_traits<RhsScalar>::type _RhsPacket; +typedef typename ei_packet_traits<ResScalar>::type _ResPacket; + +typedef typename ei_meta_if<Vectorizable,_LhsPacket,LhsScalar>::ret LhsPacket; +typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket; +typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket; + +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsScalar* lhs, Index lhsStride, + const RhsScalar* rhs, Index rhsIncr, + ResScalar* res, Index resIncr, + RhsScalar alpha) +{ + ei_internal_assert(resIncr==1); #ifdef _EIGEN_ACCUMULATE_PACKETS #error _EIGEN_ACCUMULATE_PACKETS has already been defined #endif #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \ ei_pstore(&res[j], \ - ei_padd(ei_pload(&res[j]), \ + ei_padd(ei_pload<ResPacket>(&res[j]), \ ei_padd( \ - ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A0)(&lhs0[j]), ptmp0), \ - pcj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs1[j]), ptmp1)), \ - ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A2)(&lhs2[j]), ptmp2), \ - pcj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs3[j]), ptmp3)) ))) - - typedef typename NumTraits<Scalar>::Real RealScalar; - typedef typename ei_packet_traits<Scalar>::type Packet; - enum { - PacketSize = sizeof(Packet)/sizeof(Scalar), - Vectorizable = ei_packet_traits<Scalar>::Vectorizable - }; - - ei_conj_helper<Scalar,Scalar,ConjugateLhs,ConjugateRhs> cj; - ei_conj_helper<Packet,Packet,ConjugateLhs,ConjugateRhs> pcj; + ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \ + pcj.pmul(EIGEN_CAT(ei_ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \ + ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \ + pcj.pmul(EIGEN_CAT(ei_ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) ))) + + ei_conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; + ei_conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; 
if(ConjugateRhs) alpha = ei_conj(alpha); enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned }; const Index columnsAtOnce = 4; const Index peels = 2; - const Index PacketAlignedMask = PacketSize-1; - const Index PeelAlignedMask = PacketSize*peels-1; - + const Index LhsPacketAlignedMask = LhsPacketSize-1; + const Index ResPacketAlignedMask = ResPacketSize-1; + const Index PeelAlignedMask = ResPacketSize*peels-1; + const Index size = rows; + // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type. Index alignedStart = ei_first_aligned(res,size); - Index alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0; + Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0; const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart; - const Index alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0; + const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(PacketSize/2) ? EvenAligned + : alignmentStep==(LhsPacketSize/2) ? EvenAligned : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices @@ -88,19 +109,19 @@ void ei_cache_friendly_product_colmajor_times_vector( // find how many columns do we have to skip to be aligned with the result (if possible) Index skipColumns = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. 
for floats) - if( (size_t(lhs)%sizeof(Scalar)) || (size_t(res)%sizeof(Scalar)) ) + if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) ) { alignedSize = 0; alignedStart = 0; } - else if (PacketSize>1) + else if (LhsPacketSize>1) { - ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size<PacketSize); + ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize); - while (skipColumns<PacketSize && - alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%PacketSize)) + while (skipColumns<LhsPacketSize && + alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize)) ++skipColumns; - if (skipColumns==PacketSize) + if (skipColumns==LhsPacketSize) { // nothing can be aligned, no need to skip any column alignmentPattern = NoneAligned; @@ -108,14 +129,14 @@ void ei_cache_friendly_product_colmajor_times_vector( } else { - skipColumns = std::min(skipColumns,rhs.size()); + skipColumns = std::min(skipColumns,cols); // note that the skiped columns are processed later. 
} ei_internal_assert( (alignmentPattern==NoneAligned) - || (skipColumns + columnsAtOnce >= rhs.size()) - || PacketSize > size - || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0); + || (skipColumns + columnsAtOnce >= cols) + || LhsPacketSize > size + || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0); } else if(Vectorizable) { @@ -127,15 +148,17 @@ void ei_cache_friendly_product_colmajor_times_vector( Index offset1 = (FirstAligned && alignmentStep==1?3:1); Index offset3 = (FirstAligned && alignmentStep==1?1:3); - Index columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; + Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce) { - Packet ptmp0 = ei_pset1(alpha*rhs[i]), ptmp1 = ei_pset1(alpha*rhs[i+offset1]), - ptmp2 = ei_pset1(alpha*rhs[i+2]), ptmp3 = ei_pset1(alpha*rhs[i+offset3]); + RhsPacket ptmp0 = ei_pset1<RhsPacket>(alpha*rhs[i*rhsIncr]), + ptmp1 = ei_pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]), + ptmp2 = ei_pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]), + ptmp3 = ei_pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]); // this helps a lot generating better binary code - const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, + *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; if (Vectorizable) { @@ -154,51 +177,52 @@ void ei_cache_friendly_product_colmajor_times_vector( switch(alignmentPattern) { case AllAligned: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,d,d); break; case EvenAligned: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; 
j+=ResPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,du,d); break; case FirstAligned: if(peels>1) { - Packet A00, A01, A02, A03, A10, A11, A12, A13; + LhsPacket A00, A01, A02, A03, A10, A11, A12, A13; + ResPacket T0, T1; - A01 = ei_pload(&lhs1[alignedStart-1]); - A02 = ei_pload(&lhs2[alignedStart-2]); - A03 = ei_pload(&lhs3[alignedStart-3]); + A01 = ei_pload<LhsPacket>(&lhs1[alignedStart-1]); + A02 = ei_pload<LhsPacket>(&lhs2[alignedStart-2]); + A03 = ei_pload<LhsPacket>(&lhs3[alignedStart-3]); - for (Index j = alignedStart; j<peeledSize; j+=peels*PacketSize) + for (Index j = alignedStart; j<peeledSize; j+=peels*ResPacketSize) { - A11 = ei_pload(&lhs1[j-1+PacketSize]); ei_palign<1>(A01,A11); - A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12); - A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13); - - A00 = ei_pload (&lhs0[j]); - A10 = ei_pload (&lhs0[j+PacketSize]); - A00 = pcj.pmadd(A00, ptmp0, ei_pload(&res[j])); - A10 = pcj.pmadd(A10, ptmp0, ei_pload(&res[j+PacketSize])); - - A00 = pcj.pmadd(A01, ptmp1, A00); - A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01); - A00 = pcj.pmadd(A02, ptmp2, A00); - A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02); - A00 = pcj.pmadd(A03, ptmp3, A00); - ei_pstore(&res[j],A00); - A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03); - A10 = pcj.pmadd(A11, ptmp1, A10); - A10 = pcj.pmadd(A12, ptmp2, A10); - A10 = pcj.pmadd(A13, ptmp3, A10); - ei_pstore(&res[j+PacketSize],A10); + A11 = ei_pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); ei_palign<1>(A01,A11); + A12 = ei_pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); ei_palign<2>(A02,A12); + A13 = ei_pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); ei_palign<3>(A03,A13); + + A00 = ei_pload<LhsPacket>(&lhs0[j]); + A10 = ei_pload<LhsPacket>(&lhs0[j+LhsPacketSize]); + T0 = pcj.pmadd(A00, ptmp0, ei_pload<ResPacket>(&res[j])); + T1 = pcj.pmadd(A10, ptmp0, ei_pload<ResPacket>(&res[j+ResPacketSize])); + + T0 = pcj.pmadd(A01, ptmp1, T0); + A01 = 
ei_pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); ei_palign<1>(A11,A01); + T0 = pcj.pmadd(A02, ptmp2, T0); + A02 = ei_pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); ei_palign<2>(A12,A02); + T0 = pcj.pmadd(A03, ptmp3, T0); + ei_pstore(&res[j],T0); + A03 = ei_pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); ei_palign<3>(A13,A03); + T1 = pcj.pmadd(A11, ptmp1, T1); + T1 = pcj.pmadd(A12, ptmp2, T1); + T1 = pcj.pmadd(A13, ptmp3, T1); + ei_pstore(&res[j+ResPacketSize],T1); } } - for (Index j = peeledSize; j<alignedSize; j+=PacketSize) + for (Index j = peeledSize; j<alignedSize; j+=ResPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,du,du); break; default: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) _EIGEN_ACCUMULATE_PACKETS(du,du,du); break; } @@ -216,14 +240,14 @@ void ei_cache_friendly_product_colmajor_times_vector( } // process remaining first and last columns (at most columnsAtOnce-1) - Index end = rhs.size(); + Index end = cols; Index start = columnBound; do { - for (Index i=start; i<end; ++i) + for (Index k=start; k<end; ++k) { - Packet ptmp0 = ei_pset1(alpha*rhs[i]); - const Scalar* lhs0 = lhs + i*lhsStride; + RhsPacket ptmp0 = ei_pset1<RhsPacket>(alpha*rhs[k*rhsIncr]); + const LhsScalar* lhs0 = lhs + k*lhsStride; if (Vectorizable) { @@ -231,19 +255,18 @@ void ei_cache_friendly_product_colmajor_times_vector( // process first unaligned result's coeffs for (Index j=0; j<alignedStart; ++j) res[j] += cj.pmul(lhs0[j], ei_pfirst(ptmp0)); - // process aligned result's coeffs - if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0) - for (Index j = alignedStart;j<alignedSize;j+=PacketSize) - ei_pstore(&res[j], pcj.pmadd(ei_pload(&lhs0[j]), ptmp0, ei_pload(&res[j]))); + if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize) + ei_pstore(&res[i], pcj.pmadd(ei_ploadu<LhsPacket>(&lhs0[i]), ptmp0, ei_pload<ResPacket>(&res[i]))); else - for (Index j = 
alignedStart;j<alignedSize;j+=PacketSize) - ei_pstore(&res[j], pcj.pmadd(ei_ploadu(&lhs0[j]), ptmp0, ei_pload(&res[j]))); + for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize) + ei_pstore(&res[i], pcj.pmadd(ei_ploadu<LhsPacket>(&lhs0[i]), ptmp0, ei_pload<ResPacket>(&res[i]))); } // process remaining scalars (or all if no explicit vectorization) - for (Index j=alignedSize; j<size; ++j) - res[j] += cj.pmul(lhs0[j], ei_pfirst(ptmp0)); + for (Index i=alignedSize; i<size; ++i) + res[i] += cj.pmul(lhs0[i], ei_pfirst(ptmp0)); } if (skipColumns) { @@ -256,15 +279,45 @@ void ei_cache_friendly_product_colmajor_times_vector( } while(Vectorizable); #undef _EIGEN_ACCUMULATE_PACKETS } +}; -// TODO add peeling to mask unaligned load/stores -template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index> -static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( +/* Optimized row-major matrix * vector product: + * This algorithm processes 4 rows at onces that allows to both reduce + * the number of load/stores of the result by a factor 4 and to reduce + * the instruction dependency. Moreover, we know that all bands have the + * same alignment pattern. + * + * Mixing type logic: + * - alpha is always a complex (or converted to a complex) + * - no vectorization + */ +template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> +struct ei_general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs> +{ +typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + +enum { + Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable + && int(ei_packet_traits<LhsScalar>::size)==int(ei_packet_traits<RhsScalar>::size), + LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1, + RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1, + ResPacketSize = Vectorizable ? 
ei_packet_traits<ResScalar>::size : 1 +}; + +typedef typename ei_packet_traits<LhsScalar>::type _LhsPacket; +typedef typename ei_packet_traits<RhsScalar>::type _RhsPacket; +typedef typename ei_packet_traits<ResScalar>::type _ResPacket; + +typedef typename ei_meta_if<Vectorizable,_LhsPacket,LhsScalar>::ret LhsPacket; +typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket; +typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket; + +EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const Scalar* lhs, Index lhsStride, - const Scalar* rhs, Index rhsIncr, - Scalar* res, Index resIncr, - Scalar alpha) + const LhsScalar* lhs, Index lhsStride, + const RhsScalar* rhs, Index rhsIncr, + ResScalar* res, Index resIncr, + ResScalar alpha) { EIGEN_UNUSED_VARIABLE(rhsIncr); ei_internal_assert(rhsIncr==1); @@ -273,39 +326,33 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( #endif #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\ - Packet b = ei_pload(&rhs[j]); \ - ptmp0 = pcj.pmadd(EIGEN_CAT(ei_ploa,A0) (&lhs0[j]), b, ptmp0); \ - ptmp1 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs1[j]), b, ptmp1); \ - ptmp2 = pcj.pmadd(EIGEN_CAT(ei_ploa,A2) (&lhs2[j]), b, ptmp2); \ - ptmp3 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs3[j]), b, ptmp3); } - - typedef typename NumTraits<Scalar>::Real RealScalar; - typedef typename ei_packet_traits<Scalar>::type Packet; - enum { - PacketSize = sizeof(Packet)/sizeof(Scalar), - Vectorizable = ei_packet_traits<Scalar>::Vectorizable - }; - - ei_conj_helper<Scalar,Scalar,ConjugateLhs,ConjugateRhs> cj; - ei_conj_helper<Packet,Packet,ConjugateLhs,ConjugateRhs> pcj; + RhsPacket b = ei_pload<RhsPacket>(&rhs[j]); \ + ptmp0 = pcj.pmadd(EIGEN_CAT(ei_ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \ + ptmp1 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \ + ptmp2 = pcj.pmadd(EIGEN_CAT(ei_ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \ + ptmp3 = 
pcj.pmadd(EIGEN_CAT(ei_ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); } + + ei_conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; + ei_conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 }; const Index rowsAtOnce = 4; const Index peels = 2; - const Index PacketAlignedMask = PacketSize-1; - const Index PeelAlignedMask = PacketSize*peels-1; + const Index RhsPacketAlignedMask = RhsPacketSize-1; + const Index LhsPacketAlignedMask = LhsPacketSize-1; + const Index PeelAlignedMask = RhsPacketSize*peels-1; const Index depth = cols; // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type // if that's not the case then vectorization is discarded, see below. Index alignedStart = ei_first_aligned(rhs, depth); - Index alignedSize = PacketSize>1 ? alignedStart + ((depth-alignedStart) & ~PacketAlignedMask) : 0; + Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart; - const Index alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0; + const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(PacketSize/2) ? EvenAligned + : alignmentStep==(LhsPacketSize/2) ? EvenAligned : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices @@ -314,19 +361,19 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( // find how many rows do we have to skip to be aligned with rhs (if possible) Index skipRows = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. 
for floats) - if( (size_t(lhs)%sizeof(Scalar)) || (size_t(rhs)%sizeof(Scalar)) ) + if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) ) { alignedSize = 0; alignedStart = 0; } - else if (PacketSize>1) + else if (LhsPacketSize>1) { - ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || depth<PacketSize); + ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize); - while (skipRows<PacketSize && - alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%PacketSize)) + while (skipRows<LhsPacketSize && + alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize)) ++skipRows; - if (skipRows==PacketSize) + if (skipRows==LhsPacketSize) { // nothing can be aligned, no need to skip any column alignmentPattern = NoneAligned; @@ -338,10 +385,10 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( // note that the skiped columns are processed later. 
} ei_internal_assert( alignmentPattern==NoneAligned - || PacketSize==1 + || LhsPacketSize==1 || (skipRows + rowsAtOnce >= rows) - || PacketSize > depth - || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0); + || LhsPacketSize > depth + || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0); } else if(Vectorizable) { @@ -356,23 +403,24 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; for (Index i=skipRows; i<rowBound; i+=rowsAtOnce) { - EIGEN_ALIGN16 Scalar tmp0 = Scalar(0); - Scalar tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0); + EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0); + ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0); // this helps the compiler generating good binary code - const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, + *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; if (Vectorizable) { /* explicit vectorization */ - Packet ptmp0 = ei_pset1(Scalar(0)), ptmp1 = ei_pset1(Scalar(0)), ptmp2 = ei_pset1(Scalar(0)), ptmp3 = ei_pset1(Scalar(0)); + ResPacket ptmp0 = ei_pset1<ResPacket>(ResScalar(0)), ptmp1 = ei_pset1<ResPacket>(ResScalar(0)), + ptmp2 = ei_pset1<ResPacket>(ResScalar(0)), ptmp3 = ei_pset1<ResPacket>(ResScalar(0)); // process initial unaligned coeffs // FIXME this loop get vectorized by the compiler ! 
for (Index j=0; j<alignedStart; ++j) { - Scalar b = rhs[j]; + RhsScalar b = rhs[j]; tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b); } @@ -382,11 +430,11 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( switch(alignmentPattern) { case AllAligned: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,d,d); break; case EvenAligned: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,du,d); break; case FirstAligned: @@ -398,38 +446,38 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( * overlaping the desired unaligned packet. This is *much* more efficient * than basic unaligned loads. */ - Packet A01, A02, A03, b, A11, A12, A13; - A01 = ei_pload(&lhs1[alignedStart-1]); - A02 = ei_pload(&lhs2[alignedStart-2]); - A03 = ei_pload(&lhs3[alignedStart-3]); + LhsPacket A01, A02, A03, A11, A12, A13; + A01 = ei_pload<LhsPacket>(&lhs1[alignedStart-1]); + A02 = ei_pload<LhsPacket>(&lhs2[alignedStart-2]); + A03 = ei_pload<LhsPacket>(&lhs3[alignedStart-3]); - for (Index j = alignedStart; j<peeledSize; j+=peels*PacketSize) + for (Index j = alignedStart; j<peeledSize; j+=peels*RhsPacketSize) { - b = ei_pload(&rhs[j]); - A11 = ei_pload(&lhs1[j-1+PacketSize]); ei_palign<1>(A01,A11); - A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12); - A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13); + RhsPacket b = ei_pload<RhsPacket>(&rhs[j]); + A11 = ei_pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); ei_palign<1>(A01,A11); + A12 = ei_pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); ei_palign<2>(A02,A12); + A13 = ei_pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); ei_palign<3>(A03,A13); - ptmp0 = pcj.pmadd(ei_pload (&lhs0[j]), b, ptmp0); + ptmp0 = 
pcj.pmadd(ei_pload<LhsPacket>(&lhs0[j]), b, ptmp0); ptmp1 = pcj.pmadd(A01, b, ptmp1); - A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01); + A01 = ei_pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); ei_palign<1>(A11,A01); ptmp2 = pcj.pmadd(A02, b, ptmp2); - A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02); + A02 = ei_pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); ei_palign<2>(A12,A02); ptmp3 = pcj.pmadd(A03, b, ptmp3); - A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03); + A03 = ei_pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); ei_palign<3>(A13,A03); - b = ei_pload(&rhs[j+PacketSize]); - ptmp0 = pcj.pmadd(ei_pload (&lhs0[j+PacketSize]), b, ptmp0); + b = ei_pload<RhsPacket>(&rhs[j+RhsPacketSize]); + ptmp0 = pcj.pmadd(ei_pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0); ptmp1 = pcj.pmadd(A11, b, ptmp1); ptmp2 = pcj.pmadd(A12, b, ptmp2); ptmp3 = pcj.pmadd(A13, b, ptmp3); } } - for (Index j = peeledSize; j<alignedSize; j+=PacketSize) + for (Index j = peeledSize; j<alignedSize; j+=RhsPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,du,du); break; default: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) _EIGEN_ACCUMULATE_PACKETS(du,du,du); break; } @@ -444,7 +492,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( // FIXME this loop get vectorized by the compiler ! 
for (Index j=alignedSize; j<depth; ++j) { - Scalar b = rhs[j]; + RhsScalar b = rhs[j]; tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b); } @@ -461,9 +509,9 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( { for (Index i=start; i<end; ++i) { - EIGEN_ALIGN16 Scalar tmp0 = Scalar(0); - Packet ptmp0 = ei_pset1(tmp0); - const Scalar* lhs0 = lhs + i*lhsStride; + EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0); + ResPacket ptmp0 = ei_pset1<ResPacket>(tmp0); + const LhsScalar* lhs0 = lhs + i*lhsStride; // process first unaligned result's coeffs // FIXME this loop get vectorized by the compiler ! for (Index j=0; j<alignedStart; ++j) @@ -472,12 +520,12 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( if (alignedSize>alignedStart) { // process aligned rhs coeffs - if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0) - for (Index j = alignedStart;j<alignedSize;j+=PacketSize) - ptmp0 = pcj.pmadd(ei_pload(&lhs0[j]), ei_pload(&rhs[j]), ptmp0); + if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize) + ptmp0 = pcj.pmadd(ei_pload<LhsPacket>(&lhs0[j]), ei_pload<RhsPacket>(&rhs[j]), ptmp0); else - for (Index j = alignedStart;j<alignedSize;j+=PacketSize) - ptmp0 = pcj.pmadd(ei_ploadu(&lhs0[j]), ei_pload(&rhs[j]), ptmp0); + for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize) + ptmp0 = pcj.pmadd(ei_ploadu<LhsPacket>(&lhs0[j]), ei_pload<RhsPacket>(&rhs[j]), ptmp0); tmp0 += ei_predux(ptmp0); } @@ -499,5 +547,6 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( #undef _EIGEN_ACCUMULATE_PACKETS } +}; #endif // EIGEN_GENERAL_MATRIX_VECTOR_H diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 4a95ad5e1..ede8b77bf 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ 
b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -26,10 +26,9 @@ #define EIGEN_SELFADJOINT_MATRIX_MATRIX_H // pack a selfadjoint block diagonal for use with the gebp_kernel -template<typename Scalar, typename Index, int mr, int StorageOrder> +template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder> struct ei_symm_pack_lhs { - enum { PacketSize = ei_packet_traits<Scalar>::size }; template<int BlockRows> inline void pack(Scalar* blockA, const ei_const_blas_data_mapper<Scalar,Index,StorageOrder>& lhs, Index cols, Index i, Index& count) { @@ -59,16 +58,16 @@ struct ei_symm_pack_lhs { ei_const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride); Index count = 0; - Index peeled_mc = (rows/mr)*mr; - for(Index i=0; i<peeled_mc; i+=mr) + Index peeled_mc = (rows/Pack1)*Pack1; + for(Index i=0; i<peeled_mc; i+=Pack1) { - pack<mr>(blockA, lhs, cols, i, count); + pack<Pack1>(blockA, lhs, cols, i, count); } - if(rows-peeled_mc>=PacketSize) + if(rows-peeled_mc>=Pack2) { - pack<PacketSize>(blockA, lhs, cols, peeled_mc, count); - peeled_mc += PacketSize; + pack<Pack2>(blockA, lhs, cols, peeled_mc, count); + peeled_mc += Pack2; } // do the same with mr==1 @@ -89,7 +88,7 @@ template<typename Scalar, typename Index, int nr, int StorageOrder> struct ei_symm_pack_rhs { enum { PacketSize = ei_packet_traits<Scalar>::size }; - void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Scalar alpha, Index rows, Index cols, Index k2) + void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { Index end_k = k2 + rows; Index count = 0; @@ -101,12 +100,12 @@ struct ei_symm_pack_rhs { for(Index k=k2; k<end_k; k++) { - blockB[count+0] = alpha*rhs(k,j2+0); - blockB[count+1] = alpha*rhs(k,j2+1); + blockB[count+0] = rhs(k,j2+0); + blockB[count+1] = rhs(k,j2+1); if (nr==4) { - blockB[count+2] = alpha*rhs(k,j2+2); - blockB[count+3] = alpha*rhs(k,j2+3); + blockB[count+2] = rhs(k,j2+2); + blockB[count+3] = 
rhs(k,j2+3); } count += nr; } @@ -119,12 +118,12 @@ struct ei_symm_pack_rhs // transpose for(Index k=k2; k<j2; k++) { - blockB[count+0] = alpha*ei_conj(rhs(j2+0,k)); - blockB[count+1] = alpha*ei_conj(rhs(j2+1,k)); + blockB[count+0] = ei_conj(rhs(j2+0,k)); + blockB[count+1] = ei_conj(rhs(j2+1,k)); if (nr==4) { - blockB[count+2] = alpha*ei_conj(rhs(j2+2,k)); - blockB[count+3] = alpha*ei_conj(rhs(j2+3,k)); + blockB[count+2] = ei_conj(rhs(j2+2,k)); + blockB[count+3] = ei_conj(rhs(j2+3,k)); } count += nr; } @@ -134,25 +133,25 @@ struct ei_symm_pack_rhs { // normal for (Index w=0 ; w<h; ++w) - blockB[count+w] = alpha*rhs(k,j2+w); + blockB[count+w] = rhs(k,j2+w); - blockB[count+h] = alpha*ei_real(rhs(k,k)); + blockB[count+h] = ei_real(rhs(k,k)); // transpose for (Index w=h+1 ; w<nr; ++w) - blockB[count+w] = alpha*ei_conj(rhs(j2+w,k)); + blockB[count+w] = ei_conj(rhs(j2+w,k)); count += nr; ++h; } // normal for(Index k=j2+nr; k<end_k; k++) { - blockB[count+0] = alpha*rhs(k,j2+0); - blockB[count+1] = alpha*rhs(k,j2+1); + blockB[count+0] = rhs(k,j2+0); + blockB[count+1] = rhs(k,j2+1); if (nr==4) { - blockB[count+2] = alpha*rhs(k,j2+2); - blockB[count+3] = alpha*rhs(k,j2+3); + blockB[count+2] = rhs(k,j2+2); + blockB[count+3] = rhs(k,j2+3); } count += nr; } @@ -163,12 +162,12 @@ struct ei_symm_pack_rhs { for(Index k=k2; k<end_k; k++) { - blockB[count+0] = alpha*ei_conj(rhs(j2+0,k)); - blockB[count+1] = alpha*ei_conj(rhs(j2+1,k)); + blockB[count+0] = ei_conj(rhs(j2+0,k)); + blockB[count+1] = ei_conj(rhs(j2+1,k)); if (nr==4) { - blockB[count+2] = alpha*ei_conj(rhs(j2+2,k)); - blockB[count+3] = alpha*ei_conj(rhs(j2+3,k)); + blockB[count+2] = ei_conj(rhs(j2+2,k)); + blockB[count+3] = ei_conj(rhs(j2+3,k)); } count += nr; } @@ -181,13 +180,13 @@ struct ei_symm_pack_rhs Index half = std::min(end_k,j2); for(Index k=k2; k<half; k++) { - blockB[count] = alpha*ei_conj(rhs(j2,k)); + blockB[count] = ei_conj(rhs(j2,k)); count += 1; } if(half==j2 && half<k2+rows) { - blockB[count] = 
alpha*ei_real(rhs(j2,j2)); + blockB[count] = ei_real(rhs(j2,j2)); count += 1; } else @@ -196,7 +195,7 @@ struct ei_symm_pack_rhs // normal for(Index k=half+1; k<k2+rows; k++) { - blockB[count] = alpha*rhs(k,j2); + blockB[count] = rhs(k,j2); count += 1; } } @@ -253,12 +252,9 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - if (ConjugateRhs) - alpha = ei_conj(alpha); - - typedef ei_product_blocking_traits<Scalar> Blocking; + typedef ei_gebp_traits<Scalar,Scalar> Traits; - Index kc = size; // cache block size along the K direction + Index kc = size; // cache block size along the K direction Index mc = rows; // cache block size along the M direction Index nc = cols; // cache block size along the N direction computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); @@ -266,14 +262,15 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate kc = std::min(kc,mc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); - Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; + Scalar* blockB = allocatedBlockB + sizeW; - ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; - ei_symm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs; - ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs; - ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed; + ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + ei_symm_pack_lhs<Scalar, Index, 
Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; + ei_gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed; for(Index k2=0; k2<size; k2+=kc) { @@ -282,7 +279,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate // we have selected one row panel of rhs and one column panel of lhs // pack rhs's panel into a sequential chunk of memory // and expand each coeff to a constant packet for further reuse - pack_rhs(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols); + pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); // the select lhs's panel has to be split in three different parts: // 1 - the transposed panel above the diagonal block => transposed packed copy @@ -294,7 +291,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate // transposed packed copy pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); } // the block diagonal { @@ -302,16 +299,16 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate // symmetric packed copy pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols); + gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); } for(Index i2=k2+kc; i2<size; i2+=mc) { const Index actual_mc = std::min(i2+mc,size)-i2; - ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder,false>() + ei_gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>() (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, 
cols); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); } } @@ -338,10 +335,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,Conjugat ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - if (ConjugateRhs) - alpha = ei_conj(alpha); - - typedef ei_product_blocking_traits<Scalar> Blocking; + typedef ei_gebp_traits<Scalar,Scalar> Traits; Index kc = size; // cache block size along the K direction Index mc = rows; // cache block size along the M direction @@ -349,19 +343,20 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,Conjugat computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); - Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; + Scalar* blockB = allocatedBlockB + sizeW; - ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; - ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs; - ei_symm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs; + ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + ei_gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + ei_symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; for(Index k2=0; k2<size; k2+=kc) { const Index actual_kc = std::min(k2+kc,size)-k2; - pack_rhs(blockB, _rhs, rhsStride, alpha, actual_kc, cols, k2); + pack_rhs(blockB, _rhs, rhsStride, actual_kc, cols, k2); // => GEPP for(Index i2=0; i2<rows; i2+=mc) @@ -369,7 +364,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,Conjugat const Index actual_mc = 
std::min(i2+mc,rows)-i2; pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); } } diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index 4514c7692..8a10075f0 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -77,14 +77,14 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector( register const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride; Scalar t0 = cjAlpha * rhs[j]; - Packet ptmp0 = ei_pset1(t0); + Packet ptmp0 = ei_pset1<Packet>(t0); Scalar t1 = cjAlpha * rhs[j+1]; - Packet ptmp1 = ei_pset1(t1); + Packet ptmp1 = ei_pset1<Packet>(t1); Scalar t2 = 0; - Packet ptmp2 = ei_pset1(t2); + Packet ptmp2 = ei_pset1<Packet>(t2); Scalar t3 = 0; - Packet ptmp3 = ei_pset1(t3); + Packet ptmp3 = ei_pset1<Packet>(t3); size_t starti = FirstTriangular ? 0 : j+2; size_t endi = FirstTriangular ? 
j : size; @@ -119,10 +119,10 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector( Scalar* EIGEN_RESTRICT resIt = res + alignedStart; for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize) { - Packet A0i = ei_ploadu(a0It); a0It += PacketSize; - Packet A1i = ei_ploadu(a1It); a1It += PacketSize; - Packet Bi = ei_ploadu(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases - Packet Xi = ei_pload (resIt); + Packet A0i = ei_ploadu<Packet>(a0It); a0It += PacketSize; + Packet A1i = ei_ploadu<Packet>(a1It); a1It += PacketSize; + Packet Bi = ei_ploadu<Packet>(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases + Packet Xi = ei_pload <Packet>(resIt); Xi = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi)); ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2); diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h index 40c0c9aac..8f431c2e4 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -65,23 +65,24 @@ struct ei_selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpL { ei_const_blas_data_mapper<Scalar, Index, MatStorageOrder> mat(_mat,matStride); - if(AAT) - alpha = ei_conj(alpha); +// if(AAT) +// alpha = ei_conj(alpha); - typedef ei_product_blocking_traits<Scalar> Blocking; + typedef ei_gebp_traits<Scalar,Scalar> Traits; Index kc = depth; // cache block size along the K direction Index mc = size; // cache block size along the M direction Index nc = size; // cache block size along the N direction computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); // !!! 
mc must be a multiple of nr: - if(mc>Blocking::nr) - mc = (mc/Blocking::nr)*Blocking::nr; + if(mc>Traits::nr) + mc = (mc/Traits::nr)*Traits::nr; Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*size; + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*size; Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); - Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; + Scalar* blockB = allocatedBlockB + sizeW; // note that the actual rhs is the transpose/adjoint of mat enum { @@ -89,17 +90,17 @@ struct ei_selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpL ConjRhs = NumTraits<Scalar>::IsComplex && AAT }; - ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjLhs, ConjRhs> gebp_kernel; - ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,MatStorageOrder==RowMajor ? ColMajor : RowMajor> pack_rhs; - ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,MatStorageOrder, false> pack_lhs; - ei_sybb_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjLhs, ConjRhs, UpLo> sybb; + ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjLhs, ConjRhs> gebp_kernel; + ei_gemm_pack_rhs<Scalar, Index, Traits::nr,MatStorageOrder==RowMajor ? 
ColMajor : RowMajor> pack_rhs; + ei_gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, MatStorageOrder, false> pack_lhs; + ei_sybb_kernel<Scalar, Index, Traits::mr, Traits::nr, ConjLhs, ConjRhs, UpLo> sybb; for(Index k2=0; k2<depth; k2+=kc) { const Index actual_kc = std::min(k2+kc,depth)-k2; // note that the actual rhs is the transpose/adjoint of mat - pack_rhs(blockB, &mat(0,k2), matStride, alpha, actual_kc, size); + pack_rhs(blockB, &mat(0,k2), matStride, actual_kc, size); for(Index i2=0; i2<size; i2+=mc) { @@ -112,15 +113,15 @@ struct ei_selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpL // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel // 3 - after the diagonal => processed with gebp or skipped if (UpLo==Lower) - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2), + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2), alpha, -1, -1, 0, 0, allocatedBlockB); - sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, allocatedBlockB); + sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB); if (UpLo==Upper) { Index j2 = i2+actual_mc; - gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, std::max(Index(0),size-j2), + gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, std::max(Index(0), size-j2), alpha, -1, -1, 0, 0, allocatedBlockB); } } @@ -173,9 +174,9 @@ struct ei_sybb_kernel PacketSize = ei_packet_traits<Scalar>::size, BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) }; - void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB, Index size, Index depth, Scalar* workspace) + void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB, Index size, Index depth, Scalar alpha, Scalar* workspace) { - 
ei_gebp_kernel<Scalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel; + ei_gebp_kernel<Scalar, Scalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel; Matrix<Scalar,BlockSize,BlockSize,ColMajor> buffer; // let's process the block per panel of actual_mc x BlockSize, @@ -186,14 +187,15 @@ struct ei_sybb_kernel const Scalar* actual_b = blockB+j*depth; if(UpLo==Upper) - gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, -1, -1, 0, 0, workspace); + gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, + -1, -1, 0, 0, workspace); // selfadjoint micro block { Index i = j; buffer.setZero(); // 1 - apply the kernel on the temporary buffer - gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, + gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, -1, -1, 0, 0, workspace); // 2 - triangular accumulation for(Index j1=0; j1<actualBlockSize; ++j1) @@ -208,7 +210,7 @@ struct ei_sybb_kernel if(UpLo==Lower) { Index i = j+actualBlockSize; - gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, + gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha, -1, -1, 0, 0, workspace); } } diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 98305f993..cef5eeba1 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -105,12 +105,9 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true, ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - if (ConjugateRhs) - alpha = ei_conj(alpha); - - typedef ei_product_blocking_traits<Scalar> Blocking; + typedef 
ei_gebp_traits<Scalar,Scalar> Traits; enum { - SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr), + SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), IsLower = (Mode&Lower) == Lower, SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1 }; @@ -121,10 +118,10 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true, computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); -// Scalar* allocatedBlockB = new Scalar[sizeB]; - Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; + Scalar* blockB = allocatedBlockB + sizeW; Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer; triangularBuffer.setZero(); @@ -133,9 +130,9 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true, else triangularBuffer.diagonal().setOnes(); - ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; - ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs; - ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs; + ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + ei_gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; for(Index k2=IsLower ? depth : 0; IsLower ? 
k2>0 : k2<depth; @@ -151,7 +148,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true, k2 = k2+actual_kc-kc; } - pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, alpha, actual_kc, cols); + pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols); // the selected lhs's panel has to be split in three different parts: // 1 - the part which is above the diagonal block => skip it @@ -180,7 +177,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true, } pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth); - gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, + gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha, actualPanelWidth, actual_kc, 0, blockBOffset); // GEBP with remaining micro panel @@ -190,7 +187,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true, pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget); - gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, + gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha, actualPanelWidth, actual_kc, 0, blockBOffset); } } @@ -202,10 +199,10 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true, for(Index i2=start; i2<end; i2+=mc) { const Index actual_mc = std::min(i2+mc,end)-i2; - ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder,false>() + ei_gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>() (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); } } } @@ -235,12 +232,9 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false, 
ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - if (ConjugateRhs) - alpha = ei_conj(alpha); - - typedef ei_product_blocking_traits<Scalar> Blocking; + typedef ei_gebp_traits<Scalar,Scalar> Traits; enum { - SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr), + SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), IsLower = (Mode&Lower) == Lower, SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1 }; @@ -251,9 +245,10 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false, computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar,sizeB); - Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; + Scalar* blockB = allocatedBlockB + sizeW; Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer; triangularBuffer.setZero(); @@ -262,10 +257,10 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false, else triangularBuffer.diagonal().setOnes(); - ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; - ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs; - ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs; - ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder,false,true> pack_rhs_panel; + ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + ei_gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; + ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> 
pack_rhs_panel; for(Index k2=IsLower ? 0 : depth; IsLower ? k2<depth : k2>0; @@ -288,7 +283,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false, Scalar* geb = blockB+ts*ts; - pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, alpha, actual_kc, rs); + pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs); // pack the triangular part of the rhs padding the unrolled blocks with zeros if(ts>0) @@ -301,7 +296,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2; // general part pack_rhs_panel(blockB+j2*actual_kc, - &rhs(actual_k2+panelOffset, actual_j2), rhsStride, alpha, + &rhs(actual_k2+panelOffset, actual_j2), rhsStride, panelLength, actualPanelWidth, actual_kc, panelOffset); @@ -315,7 +310,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false, } pack_rhs_panel(blockB+j2*actual_kc, - triangularBuffer.data(), triangularBuffer.outerStride(), alpha, + triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth, actual_kc, j2); } @@ -338,6 +333,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false, gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride, blockA, blockB+j2*actual_kc, actual_mc, panelLength, actualPanelWidth, + alpha, actual_kc, actual_kc, // strides blockOffset, blockOffset,// offsets allocatedBlockB); // workspace @@ -345,6 +341,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false, } gebp_kernel(res+i2+(IsLower ? 
0 : k2)*resStride, resStride, blockA, geb, actual_mc, actual_kc, rs, + alpha, -1, -1, 0, 0, allocatedBlockB); } } diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h index de4f0b7bb..67c131ab2 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector.h +++ b/Eigen/src/Core/products/TriangularMatrixVector.h @@ -76,12 +76,11 @@ struct ei_product_triangular_vector_selector<true,Lhs,Rhs,Result,Mode,ConjLhs,Co if (r>0) { Index s = IsLower ? pi+actualPanelWidth : 0; - ei_cache_friendly_product_colmajor_times_vector<ConjLhs,ConjRhs>( - r, + ei_general_matrix_vector_product<Index,Scalar,ColMajor,ConjLhs,Scalar,ConjRhs>::run( + r, actualPanelWidth, &(lhs.const_cast_derived().coeffRef(s,pi)), lhs.outerStride(), - rhs.segment(pi, actualPanelWidth), - &(res.coeffRef(s)), - alpha); + &rhs.coeff(pi), rhs.innerStride(), + &res.coeffRef(s), res.innerStride(), alpha); } } } @@ -119,7 +118,7 @@ struct ei_product_triangular_vector_selector<true,Lhs,Rhs,Result,Mode,ConjLhs,Co if (r>0) { Index s = IsLower ? 
0 : pi + actualPanelWidth; - ei_cache_friendly_product_rowmajor_times_vector<ConjLhs,ConjRhs,Scalar,Index>( + ei_general_matrix_vector_product<Index,Scalar,RowMajor,ConjLhs,Scalar,ConjRhs>::run( actualPanelWidth, r, &(lhs.const_cast_derived().coeffRef(pi,s)), lhs.outerStride(), &(rhs.const_cast_derived().coeffRef(s)), 1, diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index 0fce7159e..7163a800a 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -57,9 +57,9 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora ei_const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride); ei_blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride); - typedef ei_product_blocking_traits<Scalar> Blocking; + typedef ei_gebp_traits<Scalar,Scalar> Traits; enum { - SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr), + SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), IsLower = (Mode&Lower) == Lower }; @@ -69,14 +69,15 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); - Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; + Scalar* blockB = allocatedBlockB + sizeW; ei_conj_if<Conjugate> conj; - ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, Conjugate, false> gebp_kernel; - ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,TriStorageOrder> pack_lhs; - ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, ColMajor, false, true> pack_rhs; + ei_gebp_kernel<Scalar, Scalar, Index, Traits::mr, 
Traits::nr, Conjugate, false> gebp_kernel; + ei_gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs; + ei_gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs; for(Index k2=IsLower ? 0 : size; IsLower ? k2<size : k2>0; @@ -140,7 +141,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora Index blockBOffset = IsLower ? k1 : lengthTarget; // update the respective rows of B from other - pack_rhs(blockB, _other+startBlock, otherStride, -1, actualPanelWidth, cols, actual_kc, blockBOffset); + pack_rhs(blockB, _other+startBlock, otherStride, actualPanelWidth, cols, actual_kc, blockBOffset); // GEBP if (lengthTarget>0) @@ -149,7 +150,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget); - gebp_kernel(_other+startTarget, otherStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, + gebp_kernel(_other+startTarget, otherStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, Scalar(-1), actualPanelWidth, actual_kc, 0, blockBOffset); } } @@ -166,7 +167,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora { pack_lhs(blockA, &tri(i2, IsLower ? 
k2 : k2-kc), triStride, actual_kc, actual_mc); - gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols); + gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1)); } } } @@ -191,15 +192,15 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor ei_const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride); ei_blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride); - typedef ei_product_blocking_traits<Scalar> Blocking; + typedef ei_gebp_traits<Scalar,Scalar> Traits; enum { RhsStorageOrder = TriStorageOrder, - SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr), + SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), IsLower = (Mode&Lower) == Lower }; -// Index kc = std::min<Index>(Blocking::Max_kc/4,size); // cache block size along the K direction -// Index mc = std::min<Index>(Blocking::Max_mc,size); // cache block size along the M direction +// Index kc = std::min<Index>(Traits::Max_kc/4,size); // cache block size along the K direction +// Index mc = std::min<Index>(Traits::Max_mc,size); // cache block size along the M direction // check that !!!! 
Index kc = size; // cache block size along the K direction Index mc = size; // cache block size along the M direction @@ -207,15 +208,16 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*size; + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*size; Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); - Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; + Scalar* blockB = allocatedBlockB + sizeW; ei_conj_if<Conjugate> conj; - ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, false, Conjugate> gebp_kernel; - ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs; - ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder,false,true> pack_rhs_panel; - ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, ColMajor, false, true> pack_lhs_panel; + ei_gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel; + ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; + ei_gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel; + ei_gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel; for(Index k2=IsLower ? size : 0; IsLower ? k2>0 : k2<size; @@ -228,7 +230,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor Index rs = IsLower ? 
actual_k2 : size - actual_k2 - actual_kc; Scalar* geb = blockB+actual_kc*actual_kc; - if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, -1, actual_kc, rs); + if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs); // triangular packing (we only pack the panels off the diagonal, // neglecting the blocks overlapping the diagonal @@ -242,7 +244,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor if (panelLength>0) pack_rhs_panel(blockB+j2*actual_kc, - &rhs(actual_k2+panelOffset, actual_j2), triStride, -1, + &rhs(actual_k2+panelOffset, actual_j2), triStride, panelLength, actualPanelWidth, actual_kc, panelOffset); } @@ -273,6 +275,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor gebp_kernel(&lhs(i2,absolute_j2), otherStride, blockA, blockB+j2*actual_kc, actual_mc, panelLength, actualPanelWidth, + Scalar(-1), actual_kc, actual_kc, // strides panelOffset, panelOffset, // offsets allocatedBlockB); // workspace @@ -305,7 +308,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor if (rs>0) gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, - actual_mc, actual_kc, rs, + actual_mc, actual_kc, rs, Scalar(-1), -1, -1, 0, 0, allocatedBlockB); } } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index e53311100..972814dc9 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -29,30 +29,37 @@ // implement and control fast level 2 and level 3 BLAS-like routines. 
// forward declarations -template<typename Scalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false> +template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false> struct ei_gebp_kernel; template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false> struct ei_gemm_pack_rhs; -template<typename Scalar, typename Index, int mr, int StorageOrder, bool Conjugate = false, bool PanelMode = false> +template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false> struct ei_gemm_pack_lhs; template< - typename Scalar, typename Index, - int LhsStorageOrder, bool ConjugateLhs, - int RhsStorageOrder, bool ConjugateRhs, + typename Index, + typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int ResStorageOrder> struct ei_general_matrix_matrix_product; -template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index, typename RhsType> -static void ei_cache_friendly_product_colmajor_times_vector( - Index size, const Scalar* lhs, Index lhsStride, const RhsType& rhs, Scalar* res, Scalar alpha); +template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> +struct ei_general_matrix_vector_product; -template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index> -static void ei_cache_friendly_product_rowmajor_times_vector( - Index rows, Index Cols, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsIncr, - Scalar* res, Index resIncr, Scalar alpha); + +template<bool Conjugate> struct ei_conj_if; + +template<> struct ei_conj_if<true> { + template<typename T> + inline T operator()(const T& x) { return ei_conj(x); } +}; + +template<> struct ei_conj_if<false> { + template<typename T> + inline const T& 
operator()(const T& x) { return x; } +}; template<typename Scalar> struct ei_conj_helper<Scalar,Scalar,false,false> { @@ -90,16 +97,30 @@ template<typename RealScalar> struct ei_conj_helper<std::complex<RealScalar>, st { return Scalar(ei_real(x)*ei_real(y) - ei_imag(x)*ei_imag(y), - ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } }; -template<bool Conjugate> struct ei_conj_if; +template<typename RealScalar,bool Conj> struct ei_conj_helper<std::complex<RealScalar>, RealScalar, Conj,false> +{ + typedef std::complex<RealScalar> Scalar; + EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const + { return ei_padd(c, pmul(x,y)); } + EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const RealScalar& y) const + { return ei_conj_if<Conj>()(x)*y; } +}; -template<> struct ei_conj_if<true> { - template<typename T> - inline T operator()(const T& x) { return ei_conj(x); } +template<typename RealScalar,bool Conj> struct ei_conj_helper<RealScalar, std::complex<RealScalar>, false,Conj> +{ + typedef std::complex<RealScalar> Scalar; + EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const + { return ei_padd(c, pmul(x,y)); } + EIGEN_STRONG_INLINE Scalar pmul(const RealScalar& x, const Scalar& y) const + { return x*ei_conj_if<Conj>()(y); } }; -template<> struct ei_conj_if<false> { - template<typename T> - inline const T& operator()(const T& x) { return x; } +template<typename From,typename To> struct ei_get_factor { + EIGEN_STRONG_INLINE static To run(const From& x) { return x; } +}; + +template<typename Scalar> struct ei_get_factor<Scalar,typename NumTraits<Scalar>::Real> { + EIGEN_STRONG_INLINE static typename NumTraits<Scalar>::Real run(const Scalar& x) { return ei_real(x); } }; // Lightweight helper class to access matrix coefficients. @@ -130,34 +151,6 @@ class ei_const_blas_data_mapper Index m_stride; }; -// Defines various constant controlling register blocking for matrix-matrix algorithms. 
-template<typename Scalar> -struct ei_product_blocking_traits -{ - typedef typename ei_packet_traits<Scalar>::type PacketType; - enum { - PacketSize = sizeof(PacketType)/sizeof(Scalar), - NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - - // register block size along the N direction (must be either 2 or 4) - nr = NumberOfRegisters/4, - - // register block size along the M direction (currently, this one cannot be modified) - mr = 2 * PacketSize - }; -}; - -template<typename Real> -struct ei_product_blocking_traits<std::complex<Real> > -{ - typedef std::complex<Real> Scalar; - typedef typename ei_packet_traits<Scalar>::type PacketType; - enum { - PacketSize = sizeof(PacketType)/sizeof(Scalar), - nr = 2, - mr = 2 * PacketSize - }; -}; /* Helper class to analyze the factors of a Product expression. * In particular it allows to pop out operator-, scalar multiples, diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 2c100c809..60fdcf2e2 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -184,7 +184,9 @@ enum { LinearVectorizedTraversal, /** \internal Generic vectorization path using one vectorized loop per row/column with some * scalar loops to handle the unaligned boundaries */ - SliceVectorizedTraversal + SliceVectorizedTraversal, + /** \internal Special case to properly handle incompatible scalar types or other defecting cases*/ + InvalidTraversal }; enum { diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index c3ff9d7c4..7f626c62b 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -70,7 +70,7 @@ template<typename NullaryOp, typename MatrixType> class CwiseNullaryOp; template<typename UnaryOp, typename MatrixType> class CwiseUnaryOp; template<typename ViewOp, typename MatrixType> class CwiseUnaryView; template<typename BinaryOp, typename Lhs, typename Rhs> class CwiseBinaryOp; 
-template<typename BinOp, typename MatrixType> class SelfCwiseBinaryOp; +template<typename BinOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp; template<typename Derived, typename Lhs, typename Rhs> class ProductBase; template<typename Lhs, typename Rhs, int Mode> class GeneralProduct; template<typename Lhs, typename Rhs, int NestingFlags> class CoeffBasedProduct; @@ -111,11 +111,10 @@ struct ProductReturnType; // Provides scalar/packet-wise product and product with accumulation // with optional conjugation of the arguments. -template<typename LhsScalar, typename RhsScalar, bool ConjLhs, bool ConjRhs> struct ei_conj_helper; +template<typename LhsScalar, typename RhsScalar, bool ConjLhs=false, bool ConjRhs=false> struct ei_conj_helper; template<typename Scalar> struct ei_scalar_sum_op; template<typename Scalar> struct ei_scalar_difference_op; -template<typename Scalar> struct ei_scalar_product_op; template<typename Scalar> struct ei_scalar_conj_product_op; template<typename Scalar> struct ei_scalar_quotient_op; template<typename Scalar> struct ei_scalar_opposite_op; @@ -143,7 +142,8 @@ template<typename Scalar> struct ei_scalar_add_op; template<typename Scalar> struct ei_scalar_constant_op; template<typename Scalar> struct ei_scalar_identity_op; -template<typename Scalar1,typename Scalar2> struct ei_scalar_multiple2_op; +template<typename LhsScalar,typename RhsScalar=LhsScalar> struct ei_scalar_product_op; +template<typename LhsScalar,typename RhsScalar> struct ei_scalar_multiple2_op; struct IOFormat; diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index f52e8b57c..7600cc3e7 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -147,6 +147,12 @@ #define EIGEN_ALWAYS_INLINE_ATTRIB #endif +#if EIGEN_GNUC_AT_LEAST(4,0) +#define EIGEN_FLATTEN_ATTRIB __attribute__((flatten)) +#else +#define EIGEN_FLATTEN_ATTRIB +#endif + // EIGEN_FORCE_INLINE means "inline as much as possible" #if (defined _MSC_VER) || (defined 
__intel_compiler) #define EIGEN_STRONG_INLINE __forceinline @@ -353,10 +359,8 @@ #define EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS) \ CwiseBinaryOp< \ ei_scalar_product_op< \ - typename ei_scalar_product_traits< \ typename ei_traits<LHS>::Scalar, \ typename ei_traits<RHS>::Scalar \ - >::ReturnType \ >, \ LHS, \ RHS \ diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index b01ceafb2..3d28680b6 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -205,10 +205,10 @@ template<typename T> struct ei_scalar_product_traits<std::complex<T>, T> }; // FIXME quick workaround around current limitation of ei_result_of -template<typename Scalar, typename ArgType0, typename ArgType1> -struct ei_result_of<ei_scalar_product_op<Scalar>(ArgType0,ArgType1)> { -typedef typename ei_scalar_product_traits<typename ei_cleantype<ArgType0>::type, typename ei_cleantype<ArgType1>::type>::ReturnType type; -}; +// template<typename Scalar, typename ArgType0, typename ArgType1> +// struct ei_result_of<ei_scalar_product_op<Scalar>(ArgType0,ArgType1)> { +// typedef typename ei_scalar_product_traits<typename ei_cleantype<ArgType0>::type, typename ei_cleantype<ArgType1>::type>::ReturnType type; +// }; template<typename T> struct ei_is_diagonal { enum { ret = false }; }; diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index 94bb5569e..f62d0d8a4 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -322,8 +322,8 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY& Index alignedStart = ei_first_aligned(y, size); Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize; - const Packet pc = ei_pset1(Scalar(j.c())); - const Packet ps = ei_pset1(Scalar(j.s())); + const Packet pc = ei_pset1<Packet>(j.c()); + const Packet ps = ei_pset1<Packet>(j.s()); ei_conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex,false> pcj; for(Index i=0; i<alignedStart; ++i) @@ -341,8 +341,8 @@ void 
/*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY& { for(Index i=alignedStart; i<alignedEnd; i+=PacketSize) { - Packet xi = ei_pload(px); - Packet yi = ei_pload(py); + Packet xi = ei_pload<Packet>(px); + Packet yi = ei_pload<Packet>(py); ei_pstore(px, ei_padd(ei_pmul(pc,xi),pcj.pmul(ps,yi))); ei_pstore(py, ei_psub(ei_pmul(pc,yi),ei_pmul(ps,xi))); px += PacketSize; @@ -354,10 +354,10 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY& Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize); for(Index i=alignedStart; i<peelingEnd; i+=Peeling*PacketSize) { - Packet xi = ei_ploadu(px); - Packet xi1 = ei_ploadu(px+PacketSize); - Packet yi = ei_pload (py); - Packet yi1 = ei_pload (py+PacketSize); + Packet xi = ei_ploadu<Packet>(px); + Packet xi1 = ei_ploadu<Packet>(px+PacketSize); + Packet yi = ei_pload <Packet>(py); + Packet yi1 = ei_pload <Packet>(py+PacketSize); ei_pstoreu(px, ei_padd(ei_pmul(pc,xi),pcj.pmul(ps,yi))); ei_pstoreu(px+PacketSize, ei_padd(ei_pmul(pc,xi1),pcj.pmul(ps,yi1))); ei_pstore (py, ei_psub(ei_pmul(pc,yi),ei_pmul(ps,xi))); @@ -367,8 +367,8 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY& } if(alignedEnd!=peelingEnd) { - Packet xi = ei_ploadu(x+peelingEnd); - Packet yi = ei_pload (y+peelingEnd); + Packet xi = ei_ploadu<Packet>(x+peelingEnd); + Packet yi = ei_pload <Packet>(y+peelingEnd); ei_pstoreu(x+peelingEnd, ei_padd(ei_pmul(pc,xi),pcj.pmul(ps,yi))); ei_pstore (y+peelingEnd, ei_psub(ei_pmul(pc,yi),ei_pmul(ps,xi))); } |