diff options
author | Gael Guennebaud <g.gael@free.fr> | 2010-07-11 15:48:30 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2010-07-11 15:48:30 +0200 |
commit | ff96c94043d575e4d0dd477c1ed2487e33f79627 (patch) | |
tree | 5e9736916779fdacd431c2591a3ec1f77333e505 /Eigen | |
parent | 4161b8be6772f2b7338458c9932d7417797966bb (diff) |
mixing types in product step 2:
* pload* and pset1 are now templated on the packet type
* gemv routines are now embedded into a structure with
a consistent API with respect to gemm
* some configurations of vector * matrix and matrix * matrix work fine,
some need more work...
Diffstat (limited to 'Eigen')
-rw-r--r-- | Eigen/src/Core/DenseStorageBase.h | 4 | ||||
-rw-r--r-- | Eigen/src/Core/DiagonalProduct.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/Functors.h | 132 | ||||
-rw-r--r-- | Eigen/src/Core/GenericPacketMath.h | 20 | ||||
-rw-r--r-- | Eigen/src/Core/MapBase.h | 4 | ||||
-rw-r--r-- | Eigen/src/Core/Product.h | 63 | ||||
-rw-r--r-- | Eigen/src/Core/SolveTriangular.h | 13 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AltiVec/Complex.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AltiVec/PacketMath.h | 22 | ||||
-rw-r--r-- | Eigen/src/Core/arch/NEON/Complex.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/arch/NEON/PacketMath.h | 14 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/Complex.h | 25 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/MathFunctions.h | 16 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/PacketMath.h | 42 | ||||
-rw-r--r-- | Eigen/src/Core/products/CoeffBasedProduct.h | 72 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 332 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixVector.h | 315 | ||||
-rw-r--r-- | Eigen/src/Core/products/SelfadjointMatrixVector.h | 16 | ||||
-rw-r--r-- | Eigen/src/Core/products/TriangularMatrixVector.h | 11 | ||||
-rw-r--r-- | Eigen/src/Core/util/BlasUtil.h | 48 |
20 files changed, 603 insertions, 552 deletions
diff --git a/Eigen/src/Core/DenseStorageBase.h b/Eigen/src/Core/DenseStorageBase.h index 6886e3f97..16e0a86f1 100644 --- a/Eigen/src/Core/DenseStorageBase.h +++ b/Eigen/src/Core/DenseStorageBase.h @@ -108,7 +108,7 @@ class DenseStorageBase : public ei_dense_xpr_base<Derived>::type template<int LoadMode> EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const { - return ei_ploadt<Scalar, LoadMode> + return ei_ploadt<PacketScalar, LoadMode> (m_storage.data() + (Flags & RowMajorBit ? col + row * m_storage.cols() : row + col * m_storage.rows())); @@ -117,7 +117,7 @@ class DenseStorageBase : public ei_dense_xpr_base<Derived>::type template<int LoadMode> EIGEN_STRONG_INLINE PacketScalar packet(Index index) const { - return ei_ploadt<Scalar, LoadMode>(m_storage.data() + index); + return ei_ploadt<PacketScalar, LoadMode>(m_storage.data() + index); } template<int StoreMode> diff --git a/Eigen/src/Core/DiagonalProduct.h b/Eigen/src/Core/DiagonalProduct.h index 610d13dc8..14c49e828 100644 --- a/Eigen/src/Core/DiagonalProduct.h +++ b/Eigen/src/Core/DiagonalProduct.h @@ -91,7 +91,7 @@ class DiagonalProduct : ei_no_assignment_operator, EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, ei_meta_true) const { return ei_pmul(m_matrix.template packet<LoadMode>(row, col), - ei_pset1(m_diagonal.diagonal().coeff(id))); + ei_pset1<PacketScalar>(m_diagonal.diagonal().coeff(id))); } template<int LoadMode> diff --git a/Eigen/src/Core/Functors.h b/Eigen/src/Core/Functors.h index 9084905aa..3f059d233 100644 --- a/Eigen/src/Core/Functors.h +++ b/Eigen/src/Core/Functors.h @@ -35,11 +35,11 @@ template<typename Scalar> struct ei_scalar_sum_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_sum_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a + b; } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const + template<typename Packet> + 
EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return ei_padd(a,b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const { return ei_predux(a); } }; template<typename Scalar> @@ -58,11 +58,11 @@ struct ei_functor_traits<ei_scalar_sum_op<Scalar> > { template<typename Scalar> struct ei_scalar_product_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_product_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a * b; } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return ei_pmul(a,b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const { return ei_predux_mul(a); } }; template<typename Scalar> @@ -83,9 +83,9 @@ template<typename Scalar> struct ei_scalar_conj_product_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_conj_product_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return ei_conj_helper<Scalar,Scalar,Conj,false>().pmul(a,b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const - { return ei_conj_helper<PacketScalar,PacketScalar,Conj,false>().pmul(a,b); } + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return ei_conj_helper<Packet,Packet,Conj,false>().pmul(a,b); } }; template<typename Scalar> struct ei_functor_traits<ei_scalar_conj_product_op<Scalar> > { @@ -103,11 +103,11 @@ struct 
ei_functor_traits<ei_scalar_conj_product_op<Scalar> > { template<typename Scalar> struct ei_scalar_min_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_min_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return std::min(a, b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return ei_pmin(a,b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const { return ei_predux_min(a); } }; template<typename Scalar> @@ -126,11 +126,11 @@ struct ei_functor_traits<ei_scalar_min_op<Scalar> > { template<typename Scalar> struct ei_scalar_max_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_max_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return std::max(a, b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return ei_pmax(a,b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const { return ei_predux_max(a); } }; template<typename Scalar> @@ -172,8 +172,8 @@ struct ei_functor_traits<ei_scalar_hypot_op<Scalar> > { template<typename Scalar> struct ei_scalar_difference_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_difference_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a - b; } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, 
const PacketScalar& b) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return ei_psub(a,b); } }; template<typename Scalar> @@ -192,8 +192,8 @@ struct ei_functor_traits<ei_scalar_difference_op<Scalar> > { template<typename Scalar> struct ei_scalar_quotient_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_quotient_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a / b; } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return ei_pdiv(a,b); } }; template<typename Scalar> @@ -214,8 +214,8 @@ struct ei_functor_traits<ei_scalar_quotient_op<Scalar> > { template<typename Scalar> struct ei_scalar_opposite_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_opposite_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return -a; } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return ei_pnegate(a); } }; template<typename Scalar> @@ -234,8 +234,8 @@ template<typename Scalar> struct ei_scalar_abs_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_abs_op) typedef typename NumTraits<Scalar>::Real result_type; EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return ei_abs(a); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return ei_pabs(a); } }; template<typename Scalar> @@ -256,8 +256,8 @@ template<typename Scalar> struct ei_scalar_abs2_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_abs2_op) typedef typename NumTraits<Scalar>::Real 
result_type; EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return ei_abs2(a); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return ei_pmul(a,a); } }; template<typename Scalar> @@ -272,8 +272,8 @@ struct ei_functor_traits<ei_scalar_abs2_op<Scalar> > template<typename Scalar> struct ei_scalar_conjugate_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_conjugate_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return ei_conj(a); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const { return ei_pconj(a); } + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return ei_pconj(a); } }; template<typename Scalar> struct ei_functor_traits<ei_scalar_conjugate_op<Scalar> > @@ -397,22 +397,22 @@ struct ei_functor_traits<ei_scalar_log_op<Scalar> > * \sa class CwiseUnaryOp, MatrixBase::operator*, MatrixBase::operator/ */ /* NOTE why doing the ei_pset1() in packetOp *is* an optimization ? - * indeed it seems better to declare m_other as a PacketScalar and do the ei_pset1() once + * indeed it seems better to declare m_other as a Packet and do the ei_pset1() once * in the constructor. 
However, in practice: - * - GCC does not like m_other as a PacketScalar and generate a load every time it needs it + * - GCC does not like m_other as a Packet and generate a load every time it needs it * - on the other hand GCC is able to moves the ei_pset1() away the loop :) * - simpler code ;) * (ICC and gcc 4.4 seems to perform well in both cases, the issue is visible with y = a*x + b*y) */ template<typename Scalar> struct ei_scalar_multiple_op { - typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; // FIXME default copy constructors seems bugged with std::complex<> EIGEN_STRONG_INLINE ei_scalar_multiple_op(const ei_scalar_multiple_op& other) : m_other(other.m_other) { } EIGEN_STRONG_INLINE ei_scalar_multiple_op(const Scalar& other) : m_other(other) { } EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; } - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const - { return ei_pmul(a, ei_pset1(m_other)); } + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + { return ei_pmul(a, ei_pset1<Packet>(m_other)); } typename ei_makeconst<typename NumTraits<Scalar>::Nested>::type m_other; }; template<typename Scalar> @@ -433,13 +433,13 @@ struct ei_functor_traits<ei_scalar_multiple2_op<Scalar1,Scalar2> > template<typename Scalar, bool IsInteger> struct ei_scalar_quotient1_impl { - typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; // FIXME default copy constructors seems bugged with std::complex<> EIGEN_STRONG_INLINE ei_scalar_quotient1_impl(const ei_scalar_quotient1_impl& other) : m_other(other.m_other) { } EIGEN_STRONG_INLINE ei_scalar_quotient1_impl(const Scalar& other) : m_other(static_cast<Scalar>(1) / other) {} EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; } - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& 
a) const - { return ei_pmul(a, ei_pset1(m_other)); } + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + { return ei_pmul(a, ei_pset1<Packet>(m_other)); } const Scalar m_other; }; template<typename Scalar> @@ -480,13 +480,13 @@ struct ei_functor_traits<ei_scalar_quotient1_op<Scalar> > template<typename Scalar> struct ei_scalar_constant_op { - typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; EIGEN_STRONG_INLINE ei_scalar_constant_op(const ei_scalar_constant_op& other) : m_other(other.m_other) { } EIGEN_STRONG_INLINE ei_scalar_constant_op(const Scalar& other) : m_other(other) { } template<typename Index> EIGEN_STRONG_INLINE const Scalar operator() (Index, Index = 0) const { return m_other; } template<typename Index> - EIGEN_STRONG_INLINE const PacketScalar packetOp(Index, Index = 0) const { return ei_pset1(m_other); } + EIGEN_STRONG_INLINE const Packet packetOp(Index, Index = 0) const { return ei_pset1<Packet>(m_other); } const Scalar m_other; }; template<typename Scalar> @@ -513,22 +513,22 @@ template <typename Scalar, bool RandomAccess> struct ei_linspaced_op_impl; template <typename Scalar> struct ei_linspaced_op_impl<Scalar,false> { - typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; ei_linspaced_op_impl(Scalar low, Scalar step) : m_low(low), m_step(step), - m_packetStep(ei_pset1(ei_packet_traits<Scalar>::size*step)), - m_base(ei_padd(ei_pset1(low),ei_pmul(ei_pset1(step),ei_plset<Scalar>(-ei_packet_traits<Scalar>::size)))) {} + m_packetStep(ei_pset1<Packet>(ei_packet_traits<Scalar>::size*step)), + m_base(ei_padd(ei_pset1<Packet>(low),ei_pmul(ei_pset1<Packet>(step),ei_plset<Scalar>(-ei_packet_traits<Scalar>::size)))) {} template<typename Index> EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; } template<typename Index> - EIGEN_STRONG_INLINE const PacketScalar 
packetOp(Index) const { return m_base = ei_padd(m_base,m_packetStep); } + EIGEN_STRONG_INLINE const Packet packetOp(Index) const { return m_base = ei_padd(m_base,m_packetStep); } const Scalar m_low; const Scalar m_step; - const PacketScalar m_packetStep; - mutable PacketScalar m_base; + const Packet m_packetStep; + mutable Packet m_base; }; // random access for packet ops: @@ -537,23 +537,23 @@ struct ei_linspaced_op_impl<Scalar,false> template <typename Scalar> struct ei_linspaced_op_impl<Scalar,true> { - typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; ei_linspaced_op_impl(Scalar low, Scalar step) : m_low(low), m_step(step), - m_lowPacket(ei_pset1(m_low)), m_stepPacket(ei_pset1(m_step)), m_interPacket(ei_plset<Scalar>(0)) {} + m_lowPacket(ei_pset1<Packet>(m_low)), m_stepPacket(ei_pset1<Packet>(m_step)), m_interPacket(ei_plset<Scalar>(0)) {} template<typename Index> EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; } template<typename Index> - EIGEN_STRONG_INLINE const PacketScalar packetOp(Index i) const - { return ei_padd(m_lowPacket, ei_pmul(m_stepPacket, ei_padd(ei_pset1<Scalar>(i),m_interPacket))); } + EIGEN_STRONG_INLINE const Packet packetOp(Index i) const + { return ei_padd(m_lowPacket, ei_pmul(m_stepPacket, ei_padd(ei_pset1<Packet>(i),m_interPacket))); } const Scalar m_low; const Scalar m_step; - const PacketScalar m_lowPacket; - const PacketScalar m_stepPacket; - const PacketScalar m_interPacket; + const Packet m_lowPacket; + const Packet m_stepPacket; + const Packet m_interPacket; }; // ----- Linspace functor ---------------------------------------------------------------- @@ -566,12 +566,12 @@ template <typename Scalar, bool RandomAccess> struct ei_functor_traits< ei_linsp { enum { Cost = 1, PacketAccess = ei_packet_traits<Scalar>::HasSetLinear, IsRepeatable = true }; }; template <typename Scalar, bool RandomAccess> struct ei_linspaced_op { - 
typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; ei_linspaced_op(Scalar low, Scalar high, int num_steps) : impl(low, (high-low)/(num_steps-1)) {} template<typename Index> EIGEN_STRONG_INLINE const Scalar operator() (Index i, Index = 0) const { return impl(i); } template<typename Index> - EIGEN_STRONG_INLINE const PacketScalar packetOp(Index i, Index = 0) const { return impl.packetOp(i); } + EIGEN_STRONG_INLINE const Packet packetOp(Index i, Index = 0) const { return impl.packetOp(i); } // This proxy object handles the actual required temporaries, the different // implementations (random vs. sequential access) as well as the piping // correct piping to size 2/4 packet operations. @@ -597,13 +597,13 @@ template<typename Scalar> struct ei_functor_allows_mixing_real_and_complex<ei_sc /* If you wonder why doing the ei_pset1() in packetOp() is an optimization check ei_scalar_multiple_op */ template<typename Scalar> struct ei_scalar_add_op { - typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; // FIXME default copy constructors seems bugged with std::complex<> inline ei_scalar_add_op(const ei_scalar_add_op& other) : m_other(other.m_other) { } inline ei_scalar_add_op(const Scalar& other) : m_other(other) { } inline Scalar operator() (const Scalar& a) const { return a + m_other; } - inline const PacketScalar packetOp(const PacketScalar& a) const - { return ei_padd(a, ei_pset1(m_other)); } + inline const Packet packetOp(const Packet& a) const + { return ei_padd(a, ei_pset1<Packet>(m_other)); } const Scalar m_other; }; template<typename Scalar> @@ -690,9 +690,9 @@ template<typename Scalar> struct ei_scalar_inverse_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_inverse_op) inline Scalar operator() (const Scalar& a) const { return Scalar(1)/a; } - template<typename PacketScalar> - inline const PacketScalar packetOp(const PacketScalar& a) const - { 
return ei_pdiv(ei_pset1(Scalar(1)),a); } + template<typename Packet> + inline const Packet packetOp(const Packet& a) const + { return ei_pdiv(ei_pset1<Packet>(Scalar(1)),a); } }; template<typename Scalar> struct ei_functor_traits<ei_scalar_inverse_op<Scalar> > @@ -706,8 +706,8 @@ template<typename Scalar> struct ei_scalar_square_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_square_op) inline Scalar operator() (const Scalar& a) const { return a*a; } - template<typename PacketScalar> - inline const PacketScalar packetOp(const PacketScalar& a) const + template<typename Packet> + inline const Packet packetOp(const Packet& a) const { return ei_pmul(a,a); } }; template<typename Scalar> @@ -722,8 +722,8 @@ template<typename Scalar> struct ei_scalar_cube_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_cube_op) inline Scalar operator() (const Scalar& a) const { return a*a*a; } - template<typename PacketScalar> - inline const PacketScalar packetOp(const PacketScalar& a) const + template<typename Packet> + inline const Packet packetOp(const Packet& a) const { return ei_pmul(a,ei_pmul(a,a)); } }; template<typename Scalar> diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index d290074e5..914f16e73 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -160,16 +160,16 @@ template<typename Packet> inline Packet ei_pandnot(const Packet& a, const Packet& b) { return a & (!b); } /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ -template<typename Scalar> inline typename ei_packet_traits<Scalar>::type -ei_pload(const Scalar* from) { return *from; } +template<typename Packet> inline Packet +ei_pload(const typename ei_unpacket_traits<Packet>::type* from) { return *from; } /** \internal \returns a packet version of \a *from, (un-aligned load) */ -template<typename Scalar> inline typename ei_packet_traits<Scalar>::type -ei_ploadu(const Scalar* from) { return *from; } +template<typename Packet> 
inline Packet +ei_ploadu(const typename ei_unpacket_traits<Packet>::type* from) { return *from; } /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ -template<typename Scalar> inline typename ei_packet_traits<Scalar>::type -ei_pset1(const Scalar& a) { return a; } +template<typename Packet> inline Packet +ei_pset1(const typename ei_unpacket_traits<Packet>::type& a) { return a; } /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */ template<typename Scalar> inline typename ei_packet_traits<Scalar>::type @@ -256,13 +256,13 @@ ei_pmadd(const Packet& a, /** \internal \returns a packet version of \a *from. * \If LoadMode equals Aligned, \a from must be 16 bytes aligned */ -template<typename Scalar, int LoadMode> -inline typename ei_packet_traits<Scalar>::type ei_ploadt(const Scalar* from) +template<typename Packet, int LoadMode> +inline Packet ei_ploadt(const typename ei_unpacket_traits<Packet>::type* from) { if(LoadMode == Aligned) - return ei_pload(from); + return ei_pload<Packet>(from); else - return ei_ploadu(from); + return ei_ploadu<Packet>(from); } /** \internal copy the packet \a from to \a *to. 
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index 6af7bb793..740f2b247 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -124,14 +124,14 @@ template<typename Derived> class MapBase template<int LoadMode> inline PacketScalar packet(Index row, Index col) const { - return ei_ploadt<Scalar, LoadMode> + return ei_ploadt<PacketScalar, LoadMode> (m_data + (col * colStride() + row * rowStride())); } template<int LoadMode> inline PacketScalar packet(Index index) const { - return ei_ploadt<Scalar, LoadMode>(m_data + index * innerStride()); + return ei_ploadt<PacketScalar, LoadMode>(m_data + index * innerStride()); } template<int StoreMode> diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 139132c6b..edd79bd9a 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -282,10 +282,13 @@ class GeneralProduct<Lhs, Rhs, GemvProduct> public: EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct) + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) { - EIGEN_STATIC_ASSERT((ei_is_same_type<typename Lhs::Scalar, typename Rhs::Scalar>::ret), - YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) +// EIGEN_STATIC_ASSERT((ei_is_same_type<typename Lhs::Scalar, typename Rhs::Scalar>::ret), +// YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) } enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight }; @@ -295,7 +298,8 @@ class GeneralProduct<Lhs, Rhs, GemvProduct> { ei_assert(m_lhs.rows() == dst.rows() && m_rhs.cols() == dst.cols()); ei_gemv_selector<Side,(int(MatrixType::Flags)&RowMajorBit) ? 
RowMajor : ColMajor, - bool(ei_blas_traits<MatrixType>::HasUsableDirectAccess)>::run(*this, dst, alpha); + bool(ei_blas_traits<MatrixType>::HasUsableDirectAccess) + /*&& ei_is_same_type<typename Lhs::Scalar, typename Rhs::Scalar>::ret*/>::run(*this, dst, alpha); } }; @@ -319,43 +323,48 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true> template<typename ProductType, typename Dest> static void run(const ProductType& prod, Dest& dest, typename ProductType::Scalar alpha) { - typedef typename ProductType::Scalar Scalar; + typedef typename ProductType::Index Index; + typedef typename ProductType::LhsScalar LhsScalar; + typedef typename ProductType::RhsScalar RhsScalar; + typedef typename ProductType::Scalar ResScalar; typedef typename ProductType::ActualLhsType ActualLhsType; typedef typename ProductType::ActualRhsType ActualRhsType; typedef typename ProductType::LhsBlasTraits LhsBlasTraits; typedef typename ProductType::RhsBlasTraits RhsBlasTraits; - typedef Map<Matrix<Scalar,Dynamic,1>, Aligned> MappedDest; + typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest; ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs()); ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs()); - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs()) - * RhsBlasTraits::extractScalarFactor(prod.rhs()); + ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs()) + * RhsBlasTraits::extractScalarFactor(prod.rhs()); enum { // FIXME find a way to allow an inner stride on the result if ei_packet_traits<Scalar>::size==1 EvalToDest = Dest::InnerStrideAtCompileTime==1 }; - Scalar* EIGEN_RESTRICT actualDest; + ResScalar* actualDest; if (EvalToDest) actualDest = &dest.coeffRef(0); else { - actualDest = ei_aligned_stack_new(Scalar,dest.size()); + actualDest = ei_aligned_stack_new(ResScalar,dest.size()); MappedDest(actualDest, dest.size()) = dest; } - ei_cache_friendly_product_colmajor_times_vector - 
<LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate>( - dest.size(), - &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(), - actualRhs, actualDest, actualAlpha); + ei_general_matrix_vector_product + <Index,LhsScalar,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run( + actualLhs.rows(), actualLhs.cols(), + &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(), + actualRhs, actualRhs.innerStride(), + actualDest, 1, + actualAlpha); if (!EvalToDest) { dest = MappedDest(actualDest, dest.size()); - ei_aligned_stack_delete(Scalar, actualDest, dest.size()); + ei_aligned_stack_delete(ResScalar, actualDest, dest.size()); } } }; @@ -365,7 +374,9 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,true> template<typename ProductType, typename Dest> static void run(const ProductType& prod, Dest& dest, typename ProductType::Scalar alpha) { - typedef typename ProductType::Scalar Scalar; + typedef typename ProductType::LhsScalar LhsScalar; + typedef typename ProductType::RhsScalar RhsScalar; + typedef typename ProductType::Scalar ResScalar; typedef typename ProductType::Index Index; typedef typename ProductType::ActualLhsType ActualLhsType; typedef typename ProductType::ActualRhsType ActualRhsType; @@ -376,34 +387,34 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,true> ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs()); ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs()); - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs()) - * RhsBlasTraits::extractScalarFactor(prod.rhs()); + ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs()) + * RhsBlasTraits::extractScalarFactor(prod.rhs()); enum { // FIXME I think here we really have to check for ei_packet_traits<Scalar>::size==1 // because in this case it is fine to have an inner stride - DirectlyUseRhs = ((ei_packet_traits<Scalar>::size==1) || 
(_ActualRhsType::Flags&ActualPacketAccessBit)) + DirectlyUseRhs = ((ei_packet_traits<RhsScalar>::size==1) || (_ActualRhsType::Flags&ActualPacketAccessBit)) && (!(_ActualRhsType::Flags & RowMajorBit)) }; - Scalar* EIGEN_RESTRICT rhs_data; + RhsScalar* rhs_data; if (DirectlyUseRhs) - rhs_data = reinterpret_cast<Scalar* EIGEN_RESTRICT>(&actualRhs.const_cast_derived().coeffRef(0)); + rhs_data = &actualRhs.const_cast_derived().coeffRef(0); else { - rhs_data = ei_aligned_stack_new(Scalar, actualRhs.size()); - Map<typename _ActualRhsType::PlainObject>(reinterpret_cast<Scalar*>(rhs_data), actualRhs.size()) = actualRhs; + rhs_data = ei_aligned_stack_new(RhsScalar, actualRhs.size()); + Map<typename _ActualRhsType::PlainObject>(rhs_data, actualRhs.size()) = actualRhs; } - ei_cache_friendly_product_rowmajor_times_vector - <LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate, Scalar, Index>( + ei_general_matrix_vector_product + <Index,LhsScalar,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run( actualLhs.rows(), actualLhs.cols(), &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(), rhs_data, 1, &dest.coeffRef(0,0), dest.innerStride(), actualAlpha); - if (!DirectlyUseRhs) ei_aligned_stack_delete(Scalar, rhs_data, prod.rhs().size()); + if (!DirectlyUseRhs) ei_aligned_stack_delete(RhsScalar, rhs_data, prod.rhs().size()); } }; diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h index 90ce2a802..f9e24a193 100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h @@ -81,7 +81,7 @@ struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,RowMajor Index startRow = IsLower ? pi : pi-actualPanelWidth; Index startCol = IsLower ? 
0 : pi; - ei_cache_friendly_product_rowmajor_times_vector<LhsProductTraits::NeedToConjugate,false,Scalar,Index>( + ei_general_matrix_vector_product<Index,Scalar,RowMajor,LhsProductTraits::NeedToConjugate,Scalar,false>::run( actualPanelWidth, r, &(actualLhs.const_cast_derived().coeffRef(startRow,startCol)), actualLhs.outerStride(), &(other.coeffRef(startCol)), other.innerStride(), @@ -148,12 +148,11 @@ struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,ColMajor // let's directly call the low level product function because: // 1 - it is faster to compile // 2 - it is slighlty faster at runtime - ei_cache_friendly_product_colmajor_times_vector<LhsProductTraits::NeedToConjugate,false>( - r, - &(actualLhs.const_cast_derived().coeffRef(endBlock,startBlock)), actualLhs.outerStride(), - other.segment(startBlock, actualPanelWidth), - &(other.coeffRef(endBlock, 0)), - Scalar(-1)); + ei_general_matrix_vector_product<Index,Scalar,ColMajor,LhsProductTraits::NeedToConjugate,Scalar,false>::run( + r, actualPanelWidth, + &(actualLhs.const_cast_derived().coeffRef(endBlock,startBlock)), actualLhs.outerStride(), + other.segment(startBlock, actualPanelWidth), other.innerStride(), + &(other.coeffRef(endBlock, 0)), other.innerStride(), Scalar(-1)); } } } diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 2dba95a2f..ecada02f4 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -63,7 +63,7 @@ template<> struct ei_packet_traits<std::complex<float> > : ei_default_packet_tr template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; }; -template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>& from) +template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<Packet2cf>(const std::complex<float>& from) { Packet2cf res; /* On AltiVec we cannot load 64-bit registers, so wa have to take care of alignment */ diff 
--git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index a3ceed8e8..8205beae5 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -59,13 +59,13 @@ typedef __vector unsigned char Packet16uc; Packet4i ei_p4i_##NAME = vec_splat_s32(X) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ - Packet4f ei_p4f_##NAME = ei_pset1<float>(X) + Packet4f ei_p4f_##NAME = ei_pset1<Packet4f>(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ Packet4f ei_p4f_##NAME = vreinterpretq_f32_u32(ei_pset1<int>(X)) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ - Packet4i ei_p4i_##NAME = ei_pset1<int>(X) + Packet4i ei_p4i_##NAME = ei_pset1<Packet4i>(X) #define DST_CHAN 1 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) @@ -158,7 +158,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packetbi & v) return s; } */ -template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) { // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html float EIGEN_ALIGN16 af[4]; af[0] = from; @@ -167,7 +167,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { return vc; } -template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { +template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<Packet4i>(const int& from) { int EIGEN_ALIGN16 ai[4]; ai[0] = from; Packet4i vc = vec_ld(0, ai); @@ -175,8 +175,8 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { return vc; } -template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return vec_add(ei_pset1(a), ei_p4f_COUNTDOWN); } -template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return vec_add(ei_pset1(a), ei_p4i_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return 
vec_add(ei_pset1<Packet4f>(a), ei_p4f_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return vec_add(ei_pset1<Packet4i>(a), ei_p4i_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); } template<> EIGEN_STRONG_INLINE Packet4i ei_padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); } @@ -241,7 +241,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, con template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) { ei_assert(false && "packet integer division are not supported by AltiVec"); - return ei_pset1<int>(0); + return ei_pset1<Packet4i>(0); } // for some weird raisons, it has to be overloaded for packet of integers @@ -267,10 +267,10 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pxor<Packet4i>(const Packet4i& a, con template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } -template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } +template<> EIGEN_STRONG_INLINE Packet4f ei_pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } -template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) +template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html @@ -282,7 
+282,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data } -template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) +template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index bf68a2bbb..6d9e8da85 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -58,7 +58,7 @@ template<> struct ei_packet_traits<std::complex<float> > : ei_default_packet_tr template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; }; -template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>& from) +template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<Packet2cf>(const std::complex<float>& from) { float32x2_t r64; r64 = vld1_f32((float *)&from); diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 5b0d6ab12..b899fece1 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -45,13 +45,13 @@ typedef float32x4_t Packet4f; typedef int32x4_t Packet4i; #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ - const Packet4f ei_p4f_##NAME = ei_pset1<float>(X) + const Packet4f ei_p4f_##NAME = ei_pset1<Packet4f>(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ const Packet4f ei_p4f_##NAME = vreinterpretq_f32_u32(ei_pset1<int>(X)) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ - const Packet4i ei_p4i_##NAME = ei_pset1<int>(X) + const Packet4i ei_p4i_##NAME = ei_pset1<Packet4i>(X) #ifndef __pld #define __pld(x) asm volatile ( " pld [%[addr]]\n" :: [addr] "r" (x) : "cc" ); @@ -88,18 +88,18 @@ template<> struct ei_packet_traits<int> : ei_default_packet_traits template<> struct ei_unpacket_traits<Packet4f> { 
typedef float type; enum {size=4}; }; template<> struct ei_unpacket_traits<Packet4i> { typedef int type; enum {size=4}; }; -template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { return vdupq_n_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { return vdupq_n_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); } template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { Packet4f countdown = { 3, 2, 1, 0 }; - return vaddq_f32(ei_pset1(a), countdown); + return vaddq_f32(ei_pset1<Packet4f>(a), countdown); } template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { Packet4i countdown = { 3, 2, 1, 0 }; - return vaddq_s32(ei_pset1(a), countdown); + return vaddq_s32(ei_pset1<Packet4i>(a), countdown); } template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); } @@ -137,7 +137,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, con } template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) { ei_assert(false && "packet integer division are not supported by NEON"); - return ei_pset1<int>(0); + return ei_pset1<Packet4i>(0); } // for some weird raisons, it has to be overloaded for packet of integers diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 9d32ede0e..6c72293fc 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -89,15 +89,15 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_por <Packet2cf>(const Packet2cf& template<> EIGEN_STRONG_INLINE Packet2cf ei_pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf 
ei_pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf ei_pload <std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(ei_pload(&ei_real_ref(*from))); } -template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu<std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ei_ploadu(&ei_real_ref(*from))); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(ei_pload<Packet4f>(&ei_real_ref(*from))); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ei_ploadu<Packet4f>(&ei_real_ref(*from))); } template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstore(&ei_real_ref(*to), from.v); } template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu(&ei_real_ref(*to), from.v); } template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>& from) +template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<Packet2cf>(const std::complex<float>& from) { Packet2cf res; res.v = _mm_loadl_pi(res.v, (const __m64*)&from); @@ -276,10 +276,12 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_pxor <Packet1cd>(const Packet1cd& template<> EIGEN_STRONG_INLINE Packet1cd ei_pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); } // FIXME force unaligned load, this is a temporary fix -template<> EIGEN_STRONG_INLINE Packet1cd 
ei_pload <std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1<std::complex<double> >(const std::complex<double>& from) -{ /* here we really have to use unaligned loads :( */ return ei_ploadu(&from); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <Packet1cd>(const std::complex<double>* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_ploadu<Packet2d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<Packet1cd>(const std::complex<double>* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu<Packet2d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1<Packet1cd>(const std::complex<double>& from) +{ /* here we really have to use unaligned loads :( */ return ei_ploadu<Packet1cd>(&from); } // FIXME force unaligned store, this is a temporary fix template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstoreu((double*)to, from.v); } @@ -387,6 +389,15 @@ template<> struct ei_conj_helper<Packet2d, Packet1cd, false,false> { return Packet1cd(ei_pmul(x, y.v)); } }; +template<> struct ei_conj_helper<Packet1cd, Packet2d, false,false> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const + { return ei_padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const + { return Packet1cd(ei_pmul(x.v, y)); } +}; + template<> EIGEN_STRONG_INLINE Packet1cd ei_pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for SSE3 and 4 diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h 
b/Eigen/src/Core/arch/SSE/MathFunctions.h index 742bfa92f..e4ca82985 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -378,14 +378,14 @@ Packet4f ei_pcos<Packet4f>(const Packet4f& _x) template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f ei_psqrt<Packet4f>(const Packet4f& _x) { - Packet4f half = ei_pmul(_x, ei_pset1(.5f)); - - /* select only the inverse sqrt of non-zero inputs */ - Packet4f non_zero_mask = _mm_cmpgt_ps(_x, ei_pset1(std::numeric_limits<float>::epsilon())); - Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x)); - - x = ei_pmul(x, ei_psub(ei_pset1(1.5f), ei_pmul(half, ei_pmul(x,x)))); - return ei_pmul(_x,x); + Packet4f half = ei_pmul(_x, ei_pset1<Packet4f>(.5f)); + + /* select only the inverse sqrt of non-zero inputs */ + Packet4f non_zero_mask = _mm_cmpgt_ps(_x, ei_pset1<Packet4f>(std::numeric_limits<float>::epsilon())); + Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x)); + + x = ei_pmul(x, ei_psub(ei_pset1<Packet4f>(1.5f), ei_pmul(half, ei_pmul(x,x)))); + return ei_pmul(_x,x); } #endif // EIGEN_MATH_FUNCTIONS_SSE_H diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 91af346ed..53a9bcf56 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -53,13 +53,13 @@ template<> struct ei_is_arithmetic<__m128d> { enum { ret = true }; }; (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p)))))) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ - const Packet4f ei_p4f_##NAME = ei_pset1<float>(X) + const Packet4f ei_p4f_##NAME = ei_pset1<Packet4f>(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f ei_p4f_##NAME = _mm_castsi128_ps(ei_pset1<int>(X)) + const Packet4f ei_p4f_##NAME = _mm_castsi128_ps(ei_pset1<Packet4i>(X)) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ - const Packet4i ei_p4i_##NAME = 
ei_pset1<int>(X) + const Packet4i ei_p4i_##NAME = ei_pset1<Packet4i>(X) template<> struct ei_packet_traits<float> : ei_default_packet_traits @@ -107,11 +107,11 @@ template<> struct ei_unpacket_traits<Packet4i> { typedef int type; enum {size #ifdef __GNUC__ // Sometimes GCC implements _mm_set1_p* using multiple moves, // that is inefficient :( (e.g., see ei_gemm_pack_rhs) -template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) { Packet4f res = _mm_set_ss(from); return ei_vec4f_swizzle1(res,0,0,0,0); } -template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double& from) { +template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<Packet2d>(const double& from) { #ifdef EIGEN_VECTORIZE_SSE3 return _mm_loaddup_pd(&from); #else @@ -120,14 +120,14 @@ template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double& from) { #endif } #else -template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { return _mm_set1_ps(from); } -template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double& from) { return _mm_set1_pd(from); } +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) { return _mm_set1_ps(from); } +template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); } #endif -template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { return _mm_set1_epi32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); } -template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return _mm_add_ps(ei_pset1(a), _mm_set_ps(3,2,1,0)); } -template<> EIGEN_STRONG_INLINE Packet2d ei_plset<double>(const double& a) { return _mm_add_pd(ei_pset1(a),_mm_set_pd(1,0)); } -template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return _mm_add_epi32(ei_pset1(a),_mm_set_epi32(3,2,1,0)); } 
+template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return _mm_add_ps(ei_pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); } +template<> EIGEN_STRONG_INLINE Packet2d ei_plset<double>(const double& a) { return _mm_add_pd(ei_pset1<Packet2d>(a),_mm_set_pd(1,0)); } +template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return _mm_add_epi32(ei_pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d ei_padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); } @@ -174,7 +174,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, con template<> EIGEN_STRONG_INLINE Packet2d ei_pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) { ei_assert(false && "packet integer division are not supported by SSE"); - return ei_pset1<int>(0); + return ei_pset1<Packet4i>(0); } // for some weird raisons, it has to be overloaded for packet of integers @@ -214,14 +214,14 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a, template<> EIGEN_STRONG_INLINE Packet2d ei_pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } -template<> EIGEN_STRONG_INLINE Packet2d ei_pload<double>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } -template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); } 
+template<> EIGEN_STRONG_INLINE Packet4f ei_pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } +template<> EIGEN_STRONG_INLINE Packet2d ei_pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); } #if defined(_MSC_VER) - template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); } - template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<double>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } - template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); } + template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); } + template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } + template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); } #else // Fast unaligned loads. Note that here we cannot directly use intrinsics: this would // require pointer casting to incompatible pointer types and leads to invalid code @@ -229,7 +229,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_D // a correct instruction dependency. // TODO: do the same for MSVC (ICC is compatible) // NOTE: with the code below, MSVC's compiler crashes! 
-template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) +template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD __m128d res; @@ -237,7 +237,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) res = _mm_loadh_pd(res, (const double*)(from+2)) ; return _mm_castpd_ps(res); } -template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from) +template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD __m128d res; @@ -245,7 +245,7 @@ template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from) res = _mm_loadh_pd(res,from+1); return res; } -template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) +template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD __m128d res; diff --git a/Eigen/src/Core/products/CoeffBasedProduct.h b/Eigen/src/Core/products/CoeffBasedProduct.h index 1474bc1bb..ceaeda284 100644 --- a/Eigen/src/Core/products/CoeffBasedProduct.h +++ b/Eigen/src/Core/products/CoeffBasedProduct.h @@ -42,7 +42,7 @@ template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar> struct ei_product_coeff_impl; -template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> +template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode> struct ei_product_packet_impl; template<typename LhsNested, typename RhsNested, int NestingFlags> @@ -275,20 +275,20 @@ struct ei_product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar> *** Scalar path with inner vectorization *** *******************************************/ -template<int UnrollingIndex, typename Lhs, typename Rhs, typename PacketScalar> +template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet> struct ei_product_coeff_vectorized_unroller { typedef typename Lhs::Index Index; 
enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size }; EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) { - ei_product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres); + ei_product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres); pres = ei_padd(pres, ei_pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) )); } }; -template<typename Lhs, typename Rhs, typename PacketScalar> -struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, PacketScalar> +template<typename Lhs, typename Rhs, typename Packet> +struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet> { typedef typename Lhs::Index Index; EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) @@ -300,13 +300,13 @@ struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, PacketScalar> template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar> struct ei_product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar> { - typedef typename Lhs::PacketScalar PacketScalar; + typedef typename Lhs::PacketScalar Packet; typedef typename Lhs::Index Index; enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size }; EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) { - PacketScalar pres; - ei_product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres); + Packet pres; + ei_product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres); ei_product_coeff_impl<DefaultTraversal,UnrollingIndex,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, res); res = ei_predux(pres); } @@ -368,71 +368,71 @@ struct 
ei_product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetSca *** Packet path *** *******************/ -template<int UnrollingIndex, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, PacketScalar, LoadMode> +template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct ei_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode> { typedef typename Lhs::Index Index; - EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) { - ei_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res); - res = ei_pmadd(ei_pset1(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res); + ei_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res); + res = ei_pmadd(ei_pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res); } }; -template<int UnrollingIndex, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, PacketScalar, LoadMode> +template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct ei_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode> { typedef typename Lhs::Index Index; - EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) { - ei_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res); - res = ei_pmadd(lhs.template packet<LoadMode>(row, 
UnrollingIndex), ei_pset1(rhs.coeff(UnrollingIndex, col)), res); + ei_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res); + res = ei_pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), ei_pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res); } }; -template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<RowMajor, 0, Lhs, Rhs, PacketScalar, LoadMode> +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct ei_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode> { typedef typename Lhs::Index Index; - EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) { - res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); + res = ei_pmul(ei_pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); } }; -template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<ColMajor, 0, Lhs, Rhs, PacketScalar, LoadMode> +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct ei_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode> { typedef typename Lhs::Index Index; - EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) { - res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col))); + res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1<Packet>(rhs.coeff(0, col))); } }; -template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode> +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> 
+struct ei_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> { typedef typename Lhs::Index Index; - EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res) + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res) { ei_assert(lhs.cols()>0 && "you are using a non initialized matrix"); - res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); + res = ei_pmul(ei_pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); for(Index i = 1; i < lhs.cols(); ++i) - res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res); + res = ei_pmadd(ei_pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res); } }; -template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode> +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> { typedef typename Lhs::Index Index; - EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res) + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res) { ei_assert(lhs.cols()>0 && "you are using a non initialized matrix"); - res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col))); + res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1<Packet>(rhs.coeff(0, col))); for(Index i = 1; i < lhs.cols(); ++i) - res = ei_pmadd(lhs.template packet<LoadMode>(row, i), ei_pset1(rhs.coeff(i, col)), res); + res = ei_pmadd(lhs.template packet<LoadMode>(row, i), ei_pset1<Packet>(rhs.coeff(i, col)), res); } }; diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 3dae26eee..ffb4cd386 
100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -133,11 +133,13 @@ inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, st computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n); } -#ifdef EIGEN_HAS_FUSE_CJMADD +// FIXME +// #ifdef EIGEN_HAS_FUSE_CJMADD #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); -#else - #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = ei_padd(C,T); -#endif +// #else + //#define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = ei_padd(C,ResPacket(T)); +// #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); +// #endif // optimized GEneral packed Block * packed Panel product kernel template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> @@ -152,13 +154,13 @@ struct ei_gebp_kernel ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1 }; - typedef typename ei_packet_traits<LhsScalar>::type _LhsPacketType; - typedef typename ei_packet_traits<RhsScalar>::type _RhsPacketType; - typedef typename ei_packet_traits<ResScalar>::type _ResPacketType; + typedef typename ei_packet_traits<LhsScalar>::type _LhsPacket; + typedef typename ei_packet_traits<RhsScalar>::type _RhsPacket; + typedef typename ei_packet_traits<ResScalar>::type _ResPacket; - typedef typename ei_meta_if<Vectorizable,_LhsPacketType,LhsScalar>::ret LhsPacketType; - typedef typename ei_meta_if<Vectorizable,_RhsPacketType,RhsScalar>::ret RhsPacketType; - typedef typename ei_meta_if<Vectorizable,_ResPacketType,ResScalar>::ret ResPacketType; + typedef typename ei_meta_if<Vectorizable,_LhsPacket,LhsScalar>::ret LhsPacket; + typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket; + typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket; void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, Index 
strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0) @@ -166,7 +168,7 @@ struct ei_gebp_kernel if(strideA==-1) strideA = depth; if(strideB==-1) strideB = depth; ei_conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; - ei_conj_helper<LhsPacketType,RhsPacketType,ConjugateLhs,ConjugateRhs> pcj; + ei_conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; Index packet_cols = (cols/nr) * nr; const Index peeled_mc = (rows/mr)*mr; const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsPacketSize ? LhsPacketSize : 0); @@ -183,7 +185,7 @@ struct ei_gebp_kernel const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; Index n = depth*nr; for(Index k=0; k<n; k++) - ei_pstore(&unpackedB[k*RhsPacketSize], ei_pset1(blB[k])); + ei_pstore(&unpackedB[k*RhsPacketSize], ei_pset1<RhsPacket>(blB[k])); /*Scalar* dest = unpackedB; for(Index k=0; k<n; k+=4*PacketSize) { @@ -197,11 +199,11 @@ struct ei_gebp_kernel _mm_prefetch((const char*)(&dest[S+3*G]), _MM_HINT_T0); #endif - PacketType C0[PacketSize], C1[PacketSize], C2[PacketSize], C3[PacketSize]; - C0[0] = ei_pload(blB+0*PacketSize); - C1[0] = ei_pload(blB+1*PacketSize); - C2[0] = ei_pload(blB+2*PacketSize); - C3[0] = ei_pload(blB+3*PacketSize); + RhsPacket C0[PacketSize], C1[PacketSize], C2[PacketSize], C3[PacketSize]; + C0[0] = ei_pload<RhsPacket>(blB+0*PacketSize); + C1[0] = ei_pload<RhsPacket>(blB+1*PacketSize); + C2[0] = ei_pload<RhsPacket>(blB+2*PacketSize); + C3[0] = ei_pload<RhsPacket>(blB+3*PacketSize); ei_punpackp(C0); ei_punpackp(C1); @@ -243,15 +245,15 @@ struct ei_gebp_kernel // TODO move the res loads to the stores // gets res block as register - ResPacketType C0, C1, C2, C3, C4, C5, C6, C7; - C0 = ei_pset1(ResScalar(0)); - C1 = ei_pset1(ResScalar(0)); - if(nr==4) C2 = ei_pset1(ResScalar(0)); - if(nr==4) C3 = ei_pset1(ResScalar(0)); - C4 = ei_pset1(ResScalar(0)); - C5 = ei_pset1(ResScalar(0)); - if(nr==4) C6 = ei_pset1(ResScalar(0)); - if(nr==4) C7 = 
ei_pset1(ResScalar(0)); + ResPacket C0, C1, C2, C3, C4, C5, C6, C7; + C0 = ei_pset1<ResPacket>(ResScalar(0)); + C1 = ei_pset1<ResPacket>(ResScalar(0)); + if(nr==4) C2 = ei_pset1<ResPacket>(ResScalar(0)); + if(nr==4) C3 = ei_pset1<ResPacket>(ResScalar(0)); + C4 = ei_pset1<ResPacket>(ResScalar(0)); + C5 = ei_pset1<ResPacket>(ResScalar(0)); + if(nr==4) C6 = ei_pset1<ResPacket>(ResScalar(0)); + if(nr==4) C7 = ei_pset1<ResPacket>(ResScalar(0)); ResScalar* r0 = &res[(j2+0)*resStride + i]; ResScalar* r1 = r0 + resStride; @@ -271,106 +273,106 @@ struct ei_gebp_kernel { if(nr==2) { - LhsPacketType A0, A1; - RhsPacketType B0; + LhsPacket A0, A1; + RhsPacket B0; #ifndef EIGEN_HAS_FUSE_CJMADD - ResPacketType T0; + RhsPacket T0; #endif EIGEN_ASM_COMMENT("mybegin"); - A0 = ei_pload(&blA[0*LhsPacketSize]); - A1 = ei_pload(&blA[1*LhsPacketSize]); - B0 = ei_pload(&blB[0*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A1,B0,C4,B0); - B0 = ei_pload(&blB[1*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]); MADD(pcj,A0,B0,C1,T0); MADD(pcj,A1,B0,C5,B0); - A0 = ei_pload(&blA[2*LhsPacketSize]); - A1 = ei_pload(&blA[3*LhsPacketSize]); - B0 = ei_pload(&blB[2*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[2*LhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[3*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[2*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A1,B0,C4,B0); - B0 = ei_pload(&blB[3*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[3*RhsPacketSize]); MADD(pcj,A0,B0,C1,T0); MADD(pcj,A1,B0,C5,B0); - A0 = ei_pload(&blA[4*LhsPacketSize]); - A1 = ei_pload(&blA[5*LhsPacketSize]); - B0 = ei_pload(&blB[4*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[4*LhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[5*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[4*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A1,B0,C4,B0); - B0 
= ei_pload(&blB[5*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[5*RhsPacketSize]); MADD(pcj,A0,B0,C1,T0); MADD(pcj,A1,B0,C5,B0); - A0 = ei_pload(&blA[6*LhsPacketSize]); - A1 = ei_pload(&blA[7*LhsPacketSize]); - B0 = ei_pload(&blB[6*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[6*LhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[7*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[6*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A1,B0,C4,B0); - B0 = ei_pload(&blB[7*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[7*RhsPacketSize]); MADD(pcj,A0,B0,C1,T0); MADD(pcj,A1,B0,C5,B0); EIGEN_ASM_COMMENT("myend"); } else { - LhsPacketType A0, A1; - RhsPacketType B0, B1, B2, B3; + LhsPacket A0, A1; + RhsPacket B0, B1, B2, B3; #ifndef EIGEN_HAS_FUSE_CJMADD - ResPacketType T0; + RhsPacket T0; #endif - A0 = ei_pload(&blA[0*LhsPacketSize]); - A1 = ei_pload(&blA[1*LhsPacketSize]); - B0 = ei_pload(&blB[0*RhsPacketSize]); - B1 = ei_pload(&blB[1*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); - B2 = ei_pload(&blB[2*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[2*RhsPacketSize]); MADD(pcj,A1,B0,C4,B0); - B3 = ei_pload(&blB[3*RhsPacketSize]); - B0 = ei_pload(&blB[4*RhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[3*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[4*RhsPacketSize]); MADD(pcj,A0,B1,C1,T0); MADD(pcj,A1,B1,C5,B1); - B1 = ei_pload(&blB[5*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[5*RhsPacketSize]); MADD(pcj,A0,B2,C2,T0); MADD(pcj,A1,B2,C6,B2); - B2 = ei_pload(&blB[6*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[6*RhsPacketSize]); MADD(pcj,A0,B3,C3,T0); - A0 = ei_pload(&blA[2*LhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[2*LhsPacketSize]); MADD(pcj,A1,B3,C7,B3); - A1 = ei_pload(&blA[3*LhsPacketSize]); - B3 = ei_pload(&blB[7*RhsPacketSize]); + A1 = 
ei_pload<LhsPacket>(&blA[3*LhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[7*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A1,B0,C4,B0); - B0 = ei_pload(&blB[8*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[8*RhsPacketSize]); MADD(pcj,A0,B1,C1,T0); MADD(pcj,A1,B1,C5,B1); - B1 = ei_pload(&blB[9*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[9*RhsPacketSize]); MADD(pcj,A0,B2,C2,T0); MADD(pcj,A1,B2,C6,B2); - B2 = ei_pload(&blB[10*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[10*RhsPacketSize]); MADD(pcj,A0,B3,C3,T0); - A0 = ei_pload(&blA[4*LhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[4*LhsPacketSize]); MADD(pcj,A1,B3,C7,B3); - A1 = ei_pload(&blA[5*LhsPacketSize]); - B3 = ei_pload(&blB[11*RhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[5*LhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[11*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A1,B0,C4,B0); - B0 = ei_pload(&blB[12*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[12*RhsPacketSize]); MADD(pcj,A0,B1,C1,T0); MADD(pcj,A1,B1,C5,B1); - B1 = ei_pload(&blB[13*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[13*RhsPacketSize]); MADD(pcj,A0,B2,C2,T0); MADD(pcj,A1,B2,C6,B2); - B2 = ei_pload(&blB[14*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[14*RhsPacketSize]); MADD(pcj,A0,B3,C3,T0); - A0 = ei_pload(&blA[6*LhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[6*LhsPacketSize]); MADD(pcj,A1,B3,C7,B3); - A1 = ei_pload(&blA[7*LhsPacketSize]); - B3 = ei_pload(&blB[15*RhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[7*LhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[15*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A1,B0,C4,B0); MADD(pcj,A0,B1,C1,T0); @@ -389,38 +391,38 @@ EIGEN_ASM_COMMENT("myend"); { if(nr==2) { - LhsPacketType A0, A1; - RhsPacketType B0; + LhsPacket A0, A1; + RhsPacket B0; #ifndef EIGEN_HAS_FUSE_CJMADD - ResPacketType T0; + RhsPacket T0; #endif - A0 = ei_pload(&blA[0*LhsPacketSize]); - A1 = ei_pload(&blA[1*LhsPacketSize]); - B0 = ei_pload(&blB[0*RhsPacketSize]); + A0 = 
ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A1,B0,C4,B0); - B0 = ei_pload(&blB[1*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]); MADD(pcj,A0,B0,C1,T0); MADD(pcj,A1,B0,C5,B0); } else { - LhsPacketType A0, A1; - RhsPacketType B0, B1, B2, B3; + LhsPacket A0, A1; + RhsPacket B0, B1, B2, B3; #ifndef EIGEN_HAS_FUSE_CJMADD - ResPacketType T0; + RhsPacket T0; #endif - A0 = ei_pload(&blA[0*LhsPacketSize]); - A1 = ei_pload(&blA[1*LhsPacketSize]); - B0 = ei_pload(&blB[0*RhsPacketSize]); - B1 = ei_pload(&blB[1*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); - B2 = ei_pload(&blB[2*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[2*RhsPacketSize]); MADD(pcj,A1,B0,C4,B0); - B3 = ei_pload(&blB[3*RhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[3*RhsPacketSize]); MADD(pcj,A0,B1,C1,T0); MADD(pcj,A1,B1,C5,B1); MADD(pcj,A0,B2,C2,T0); @@ -433,16 +435,16 @@ EIGEN_ASM_COMMENT("myend"); blA += mr; } - ResPacketType R0, R1, R2, R3, R4, R5, R6, R7; + ResPacket R0, R1, R2, R3, R4, R5, R6, R7; - R0 = ei_ploadu(r0); - R1 = ei_ploadu(r1); - if(nr==4) R2 = ei_ploadu(r2); - if(nr==4) R3 = ei_ploadu(r3); - R4 = ei_ploadu(r0 + ResPacketSize); - R5 = ei_ploadu(r1 + ResPacketSize); - if(nr==4) R6 = ei_ploadu(r2 + ResPacketSize); - if(nr==4) R7 = ei_ploadu(r3 + ResPacketSize); + R0 = ei_ploadu<ResPacket>(r0); + R1 = ei_ploadu<ResPacket>(r1); + if(nr==4) R2 = ei_ploadu<ResPacket>(r2); + if(nr==4) R3 = ei_ploadu<ResPacket>(r3); + R4 = ei_ploadu<ResPacket>(r0 + ResPacketSize); + R5 = ei_ploadu<ResPacket>(r1 + ResPacketSize); + if(nr==4) R6 = ei_ploadu<ResPacket>(r2 + ResPacketSize); + if(nr==4) R7 = ei_ploadu<ResPacket>(r3 + ResPacketSize); C0 = ei_padd(R0, 
C0); C1 = ei_padd(R1, C1); @@ -469,11 +471,11 @@ EIGEN_ASM_COMMENT("myend"); ei_prefetch(&blA[0]); // gets res block as register - ResPacketType C0, C1, C2, C3; - C0 = ei_ploadu(&res[(j2+0)*resStride + i]); - C1 = ei_ploadu(&res[(j2+1)*resStride + i]); - if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]); - if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]); + ResPacket C0, C1, C2, C3; + C0 = ei_ploadu<ResPacket>(&res[(j2+0)*resStride + i]); + C1 = ei_ploadu<ResPacket>(&res[(j2+1)*resStride + i]); + if(nr==4) C2 = ei_ploadu<ResPacket>(&res[(j2+2)*resStride + i]); + if(nr==4) C3 = ei_ploadu<ResPacket>(&res[(j2+3)*resStride + i]); // performs "inner" product const RhsScalar* blB = unpackedB; @@ -481,70 +483,70 @@ EIGEN_ASM_COMMENT("myend"); { if(nr==2) { - LhsPacketType A0; - RhsPacketType B0, B1; + LhsPacket A0; + RhsPacket B0, B1; - A0 = ei_pload(&blA[0*LhsPacketSize]); - B0 = ei_pload(&blB[0*RhsPacketSize]); - B1 = ei_pload(&blB[1*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]); MADD(pcj,A0,B0,C0,B0); - B0 = ei_pload(&blB[2*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[2*RhsPacketSize]); MADD(pcj,A0,B1,C1,B1); - A0 = ei_pload(&blA[1*LhsPacketSize]); - B1 = ei_pload(&blB[3*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[3*RhsPacketSize]); MADD(pcj,A0,B0,C0,B0); - B0 = ei_pload(&blB[4*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[4*RhsPacketSize]); MADD(pcj,A0,B1,C1,B1); - A0 = ei_pload(&blA[2*LhsPacketSize]); - B1 = ei_pload(&blB[5*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[2*LhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[5*RhsPacketSize]); MADD(pcj,A0,B0,C0,B0); - B0 = ei_pload(&blB[6*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[6*RhsPacketSize]); MADD(pcj,A0,B1,C1,B1); - A0 = ei_pload(&blA[3*LhsPacketSize]); - B1 = ei_pload(&blB[7*RhsPacketSize]); + A0 = 
ei_pload<LhsPacket>(&blA[3*LhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[7*RhsPacketSize]); MADD(pcj,A0,B0,C0,B0); MADD(pcj,A0,B1,C1,B1); } else { - LhsPacketType A0; - RhsPacketType B0, B1, B2, B3; + LhsPacket A0; + RhsPacket B0, B1, B2, B3; - A0 = ei_pload(&blA[0*LhsPacketSize]); - B0 = ei_pload(&blB[0*RhsPacketSize]); - B1 = ei_pload(&blB[1*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]); MADD(pcj,A0,B0,C0,B0); - B2 = ei_pload(&blB[2*RhsPacketSize]); - B3 = ei_pload(&blB[3*RhsPacketSize]); - B0 = ei_pload(&blB[4*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[2*RhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[3*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[4*RhsPacketSize]); MADD(pcj,A0,B1,C1,B1); - B1 = ei_pload(&blB[5*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[5*RhsPacketSize]); MADD(pcj,A0,B2,C2,B2); - B2 = ei_pload(&blB[6*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[6*RhsPacketSize]); MADD(pcj,A0,B3,C3,B3); - A0 = ei_pload(&blA[1*LhsPacketSize]); - B3 = ei_pload(&blB[7*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[7*RhsPacketSize]); MADD(pcj,A0,B0,C0,B0); - B0 = ei_pload(&blB[8*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[8*RhsPacketSize]); MADD(pcj,A0,B1,C1,B1); - B1 = ei_pload(&blB[9*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[9*RhsPacketSize]); MADD(pcj,A0,B2,C2,B2); - B2 = ei_pload(&blB[10*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[10*RhsPacketSize]); MADD(pcj,A0,B3,C3,B3); - A0 = ei_pload(&blA[2*LhsPacketSize]); - B3 = ei_pload(&blB[11*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[2*LhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[11*RhsPacketSize]); MADD(pcj,A0,B0,C0,B0); - B0 = ei_pload(&blB[12*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[12*RhsPacketSize]); MADD(pcj,A0,B1,C1,B1); - B1 = ei_pload(&blB[13*RhsPacketSize]); + B1 = 
ei_pload<RhsPacket>(&blB[13*RhsPacketSize]); MADD(pcj,A0,B2,C2,B2); - B2 = ei_pload(&blB[14*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[14*RhsPacketSize]); MADD(pcj,A0,B3,C3,B3); - A0 = ei_pload(&blA[3*LhsPacketSize]); - B3 = ei_pload(&blB[15*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[3*LhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[15*RhsPacketSize]); MADD(pcj,A0,B0,C0,B0); MADD(pcj,A0,B1,C1,B1); MADD(pcj,A0,B2,C2,B2); @@ -559,31 +561,31 @@ EIGEN_ASM_COMMENT("myend"); { if(nr==2) { - LhsPacketType A0; - RhsPacketType B0; + LhsPacket A0; + RhsPacket B0; #ifndef EIGEN_HAS_FUSE_CJMADD - ResPacketType T0; + RhsPacket T0; #endif - A0 = ei_pload(&blA[0*LhsPacketSize]); - B0 = ei_pload(&blB[0*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); - B0 = ei_pload(&blB[1*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]); MADD(pcj,A0,B0,C1,T0); } else { - LhsPacketType A0; - RhsPacketType B0, B1, B2, B3; + LhsPacket A0; + RhsPacket B0, B1, B2, B3; #ifndef EIGEN_HAS_FUSE_CJMADD - ResPacketType T0, T1; + RhsPacket T0, T1; #endif - A0 = ei_pload(&blA[0*LhsPacketSize]); - B0 = ei_pload(&blB[0*RhsPacketSize]); - B1 = ei_pload(&blB[1*RhsPacketSize]); - B2 = ei_pload(&blB[2*RhsPacketSize]); - B3 = ei_pload(&blB[3*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[2*RhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[3*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A0,B1,C1,T1); @@ -662,7 +664,7 @@ EIGEN_ASM_COMMENT("myend"); { const RhsScalar* blB = &blockB[j2*strideB+offsetB]; for(Index k=0; k<depth; k++) - ei_pstore(&unpackedB[k*RhsPacketSize], ei_pset1(blB[k])); + ei_pstore(&unpackedB[k*RhsPacketSize], ei_pset1<RhsPacket>(blB[k])); } for(Index i=0; i<peeled_mc; i+=mr) @@ -673,22 +675,22 @@ 
EIGEN_ASM_COMMENT("myend"); // TODO move the res loads to the stores // get res block as registers - ResPacketType C0, C4; - C0 = ei_ploadu(&res[(j2+0)*resStride + i]); - C4 = ei_ploadu(&res[(j2+0)*resStride + i + ResPacketSize]); + ResPacket C0, C4; + C0 = ei_ploadu<ResPacket>(&res[(j2+0)*resStride + i]); + C4 = ei_ploadu<ResPacket>(&res[(j2+0)*resStride + i + ResPacketSize]); const RhsScalar* blB = unpackedB; for(Index k=0; k<depth; k++) { - LhsPacketType A0, A1; - RhsPacketType B0; + LhsPacket A0, A1; + RhsPacket B0; #ifndef EIGEN_HAS_FUSE_CJMADD - ResPacketType T0, T1; + RhsPacket T0, T1; #endif - A0 = ei_pload(&blA[0*LhsPacketSize]); - A1 = ei_pload(&blA[1*LhsPacketSize]); - B0 = ei_pload(&blB[0*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A1,B0,C4,T1); @@ -705,13 +707,13 @@ EIGEN_ASM_COMMENT("myend"); const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsPacketSize]; ei_prefetch(&blA[0]); - ResPacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]); + ResPacket C0 = ei_ploadu<ResPacket>(&res[(j2+0)*resStride + i]); const RhsScalar* blB = unpackedB; for(Index k=0; k<depth; k++) { - ResPacketType T0; - MADD(pcj,ei_pload(blA), ei_pload(blB), C0, T0); + RhsPacket T0; + MADD(pcj,ei_pload<LhsPacket>(blA), ei_pload<RhsPacket>(blB), C0, T0); blB += RhsPacketSize; blA += LhsPacketSize; } diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 0997746ef..e0d71be7e 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -32,54 +32,71 @@ * same alignment pattern. 
* TODO: since rhs gets evaluated only once, no need to evaluate it */ -template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index, typename RhsType> -static EIGEN_DONT_INLINE -void ei_cache_friendly_product_colmajor_times_vector( - Index size, - const Scalar* lhs, Index lhsStride, - const RhsType& rhs, - Scalar* res, - Scalar alpha) +template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> +struct ei_general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs> { +typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + +enum { + Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable + && ei_packet_traits<LhsScalar>::size==ei_packet_traits<RhsScalar>::size, + LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1, + RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1, + ResPacketSize = Vectorizable ? 
ei_packet_traits<ResScalar>::size : 1 +}; + +typedef typename ei_packet_traits<LhsScalar>::type _LhsPacket; +typedef typename ei_packet_traits<RhsScalar>::type _RhsPacket; +typedef typename ei_packet_traits<ResScalar>::type _ResPacket; + +typedef typename ei_meta_if<Vectorizable,_LhsPacket,LhsScalar>::ret LhsPacket; +typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket; +typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket; + +template<typename RhsType> +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsScalar* lhs, Index lhsStride, + const RhsType&/*const RhsScalar**/ rhs, Index rhsIncr, + ResScalar* res, Index resIncr, + ResScalar alpha) +{ + EIGEN_UNUSED_VARIABLE(rhsIncr); + ei_internal_assert(resIncr==1); #ifdef _EIGEN_ACCUMULATE_PACKETS #error _EIGEN_ACCUMULATE_PACKETS has already been defined #endif #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \ ei_pstore(&res[j], \ - ei_padd(ei_pload(&res[j]), \ + ei_padd(ei_pload<ResPacket>(&res[j]), \ ei_padd( \ - ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A0)(&lhs0[j]), ptmp0), \ - pcj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs1[j]), ptmp1)), \ - ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A2)(&lhs2[j]), ptmp2), \ - pcj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs3[j]), ptmp3)) ))) - - typedef typename NumTraits<Scalar>::Real RealScalar; - typedef typename ei_packet_traits<Scalar>::type Packet; - enum { - PacketSize = sizeof(Packet)/sizeof(Scalar), - Vectorizable = ei_packet_traits<Scalar>::Vectorizable - }; - - ei_conj_helper<Scalar,Scalar,ConjugateLhs,ConjugateRhs> cj; - ei_conj_helper<Packet,Packet,ConjugateLhs,ConjugateRhs> pcj; + ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \ + pcj.pmul(EIGEN_CAT(ei_ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \ + ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \ + pcj.pmul(EIGEN_CAT(ei_ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) ))) + + ei_conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> 
cj; + ei_conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; if(ConjugateRhs) alpha = ei_conj(alpha); enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned }; const Index columnsAtOnce = 4; const Index peels = 2; - const Index PacketAlignedMask = PacketSize-1; - const Index PeelAlignedMask = PacketSize*peels-1; + const Index LhsPacketAlignedMask = LhsPacketSize-1; + const Index ResPacketAlignedMask = ResPacketSize-1; + const Index PeelAlignedMask = ResPacketSize*peels-1; + const Index size = rows; // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type. Index alignedStart = ei_first_aligned(res,size); - Index alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0; + Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0; const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart; - const Index alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0; + const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(PacketSize/2) ? EvenAligned + : alignmentStep==(LhsPacketSize/2) ? EvenAligned : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices @@ -88,19 +105,19 @@ void ei_cache_friendly_product_colmajor_times_vector( // find how many columns do we have to skip to be aligned with the result (if possible) Index skipColumns = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. 
for floats) - if( (size_t(lhs)%sizeof(Scalar)) || (size_t(res)%sizeof(Scalar)) ) + if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) ) { alignedSize = 0; alignedStart = 0; } - else if (PacketSize>1) + else if (LhsPacketSize>1) { - ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size<PacketSize); + ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize); - while (skipColumns<PacketSize && - alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%PacketSize)) + while (skipColumns<LhsPacketSize && + alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize)) ++skipColumns; - if (skipColumns==PacketSize) + if (skipColumns==LhsPacketSize) { // nothing can be aligned, no need to skip any column alignmentPattern = NoneAligned; @@ -108,14 +125,14 @@ void ei_cache_friendly_product_colmajor_times_vector( } else { - skipColumns = std::min(skipColumns,rhs.size()); + skipColumns = std::min(skipColumns,cols); // note that the skiped columns are processed later. 
} ei_internal_assert( (alignmentPattern==NoneAligned) - || (skipColumns + columnsAtOnce >= rhs.size()) - || PacketSize > size - || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0); + || (skipColumns + columnsAtOnce >= cols) + || LhsPacketSize > size + || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0); } else if(Vectorizable) { @@ -127,15 +144,15 @@ void ei_cache_friendly_product_colmajor_times_vector( Index offset1 = (FirstAligned && alignmentStep==1?3:1); Index offset3 = (FirstAligned && alignmentStep==1?1:3); - Index columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; + Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce) { - Packet ptmp0 = ei_pset1(alpha*rhs[i]), ptmp1 = ei_pset1(alpha*rhs[i+offset1]), - ptmp2 = ei_pset1(alpha*rhs[i+2]), ptmp3 = ei_pset1(alpha*rhs[i+offset3]); + RhsPacket ptmp0 = ei_pset1<RhsPacket>(alpha*rhs[i]), ptmp1 = ei_pset1<RhsPacket>(alpha*rhs[i+offset1]), + ptmp2 = ei_pset1<RhsPacket>(alpha*rhs[i+2]), ptmp3 = ei_pset1<RhsPacket>(alpha*rhs[i+offset3]); // this helps a lot generating better binary code - const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, + *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; if (Vectorizable) { @@ -154,51 +171,51 @@ void ei_cache_friendly_product_colmajor_times_vector( switch(alignmentPattern) { case AllAligned: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,d,d); break; case EvenAligned: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,du,d); 
break; case FirstAligned: if(peels>1) { - Packet A00, A01, A02, A03, A10, A11, A12, A13; + LhsPacket A00, A01, A02, A03, A10, A11, A12, A13; - A01 = ei_pload(&lhs1[alignedStart-1]); - A02 = ei_pload(&lhs2[alignedStart-2]); - A03 = ei_pload(&lhs3[alignedStart-3]); + A01 = ei_pload<LhsPacket>(&lhs1[alignedStart-1]); + A02 = ei_pload<LhsPacket>(&lhs2[alignedStart-2]); + A03 = ei_pload<LhsPacket>(&lhs3[alignedStart-3]); - for (Index j = alignedStart; j<peeledSize; j+=peels*PacketSize) + for (Index j = alignedStart; j<peeledSize; j+=peels*ResPacketSize) { - A11 = ei_pload(&lhs1[j-1+PacketSize]); ei_palign<1>(A01,A11); - A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12); - A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13); + A11 = ei_pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); ei_palign<1>(A01,A11); + A12 = ei_pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); ei_palign<2>(A02,A12); + A13 = ei_pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); ei_palign<3>(A03,A13); - A00 = ei_pload (&lhs0[j]); - A10 = ei_pload (&lhs0[j+PacketSize]); - A00 = pcj.pmadd(A00, ptmp0, ei_pload(&res[j])); - A10 = pcj.pmadd(A10, ptmp0, ei_pload(&res[j+PacketSize])); + A00 = ei_pload<LhsPacket>(&lhs0[j]); + A10 = ei_pload<LhsPacket>(&lhs0[j+LhsPacketSize]); + A00 = pcj.pmadd(A00, ptmp0, ei_pload<ResPacket>(&res[j])); + A10 = pcj.pmadd(A10, ptmp0, ei_pload<ResPacket>(&res[j+ResPacketSize])); A00 = pcj.pmadd(A01, ptmp1, A00); - A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01); + A01 = ei_pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); ei_palign<1>(A11,A01); A00 = pcj.pmadd(A02, ptmp2, A00); - A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02); + A02 = ei_pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); ei_palign<2>(A12,A02); A00 = pcj.pmadd(A03, ptmp3, A00); ei_pstore(&res[j],A00); - A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03); + A03 = ei_pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); ei_palign<3>(A13,A03); A10 = pcj.pmadd(A11, ptmp1, A10); A10 = 
pcj.pmadd(A12, ptmp2, A10); A10 = pcj.pmadd(A13, ptmp3, A10); - ei_pstore(&res[j+PacketSize],A10); + ei_pstore(&res[j+ResPacketSize],A10); } } - for (Index j = peeledSize; j<alignedSize; j+=PacketSize) + for (Index j = peeledSize; j<alignedSize; j+=ResPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,du,du); break; default: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) _EIGEN_ACCUMULATE_PACKETS(du,du,du); break; } @@ -216,14 +233,14 @@ void ei_cache_friendly_product_colmajor_times_vector( } // process remaining first and last columns (at most columnsAtOnce-1) - Index end = rhs.size(); + Index end = cols; Index start = columnBound; do { for (Index i=start; i<end; ++i) { - Packet ptmp0 = ei_pset1(alpha*rhs[i]); - const Scalar* lhs0 = lhs + i*lhsStride; + RhsPacket ptmp0 = ei_pset1<RhsPacket>(alpha*rhs[i]); + const LhsScalar* lhs0 = lhs + i*lhsStride; if (Vectorizable) { @@ -233,12 +250,12 @@ void ei_cache_friendly_product_colmajor_times_vector( res[j] += cj.pmul(lhs0[j], ei_pfirst(ptmp0)); // process aligned result's coeffs - if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0) - for (Index j = alignedStart;j<alignedSize;j+=PacketSize) - ei_pstore(&res[j], pcj.pmadd(ei_pload(&lhs0[j]), ptmp0, ei_pload(&res[j]))); + if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + for (Index j = alignedStart;j<alignedSize;j+=ResPacketSize) + ei_pstore(&res[j], pcj.pmadd(ei_pload<LhsPacket>(&lhs0[j]), ptmp0, ei_pload<ResPacket>(&res[j]))); else - for (Index j = alignedStart;j<alignedSize;j+=PacketSize) - ei_pstore(&res[j], pcj.pmadd(ei_ploadu(&lhs0[j]), ptmp0, ei_pload(&res[j]))); + for (Index j = alignedStart;j<alignedSize;j+=ResPacketSize) + ei_pstore(&res[j], pcj.pmadd(ei_ploadu<LhsPacket>(&lhs0[j]), ptmp0, ei_pload<ResPacket>(&res[j]))); } // process remaining scalars (or all if no explicit vectorization) @@ -256,15 +273,35 @@ void ei_cache_friendly_product_colmajor_times_vector( } while(Vectorizable); #undef 
_EIGEN_ACCUMULATE_PACKETS } +}; -// TODO add peeling to mask unaligned load/stores -template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index> -static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( +template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> +struct ei_general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs> +{ +typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + +enum { + Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable + && int(ei_packet_traits<LhsScalar>::size)==int(ei_packet_traits<RhsScalar>::size), + LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1, + RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1, + ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1 +}; + +typedef typename ei_packet_traits<LhsScalar>::type _LhsPacket; +typedef typename ei_packet_traits<RhsScalar>::type _RhsPacket; +typedef typename ei_packet_traits<ResScalar>::type _ResPacket; + +typedef typename ei_meta_if<Vectorizable,_LhsPacket,LhsScalar>::ret LhsPacket; +typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket; +typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket; + +EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const Scalar* lhs, Index lhsStride, - const Scalar* rhs, Index rhsIncr, - Scalar* res, Index resIncr, - Scalar alpha) + const LhsScalar* lhs, Index lhsStride, + const RhsScalar* rhs, Index rhsIncr, + ResScalar* res, Index resIncr, + ResScalar alpha) { ei_internal_assert(rhsIncr==1); #ifdef _EIGEN_ACCUMULATE_PACKETS @@ -272,39 +309,33 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( #endif #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\ - Packet b = ei_pload(&rhs[j]); \ - ptmp0 = 
pcj.pmadd(EIGEN_CAT(ei_ploa,A0) (&lhs0[j]), b, ptmp0); \ - ptmp1 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs1[j]), b, ptmp1); \ - ptmp2 = pcj.pmadd(EIGEN_CAT(ei_ploa,A2) (&lhs2[j]), b, ptmp2); \ - ptmp3 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs3[j]), b, ptmp3); } - - typedef typename NumTraits<Scalar>::Real RealScalar; - typedef typename ei_packet_traits<Scalar>::type Packet; - enum { - PacketSize = sizeof(Packet)/sizeof(Scalar), - Vectorizable = ei_packet_traits<Scalar>::Vectorizable - }; - - ei_conj_helper<Scalar,Scalar,ConjugateLhs,ConjugateRhs> cj; - ei_conj_helper<Packet,Packet,ConjugateLhs,ConjugateRhs> pcj; + RhsPacket b = ei_pload<RhsPacket>(&rhs[j]); \ + ptmp0 = pcj.pmadd(EIGEN_CAT(ei_ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \ + ptmp1 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \ + ptmp2 = pcj.pmadd(EIGEN_CAT(ei_ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \ + ptmp3 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); } + + ei_conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; + ei_conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 }; const Index rowsAtOnce = 4; const Index peels = 2; - const Index PacketAlignedMask = PacketSize-1; - const Index PeelAlignedMask = PacketSize*peels-1; + const Index RhsPacketAlignedMask = RhsPacketSize-1; + const Index LhsPacketAlignedMask = LhsPacketSize-1; + const Index PeelAlignedMask = RhsPacketSize*peels-1; const Index depth = cols; // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type // if that's not the case then vectorization is discarded, see below. Index alignedStart = ei_first_aligned(rhs, depth); - Index alignedSize = PacketSize>1 ? alignedStart + ((depth-alignedStart) & ~PacketAlignedMask) : 0; + Index alignedSize = RhsPacketSize>1 ? 
alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart; - const Index alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0; + const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(PacketSize/2) ? EvenAligned + : alignmentStep==(LhsPacketSize/2) ? EvenAligned : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices @@ -313,19 +344,19 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( // find how many rows do we have to skip to be aligned with rhs (if possible) Index skipRows = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (size_t(lhs)%sizeof(Scalar)) || (size_t(rhs)%sizeof(Scalar)) ) + if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) ) { alignedSize = 0; alignedStart = 0; } - else if (PacketSize>1) + else if (LhsPacketSize>1) { - ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || depth<PacketSize); + ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize); - while (skipRows<PacketSize && - alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%PacketSize)) + while (skipRows<LhsPacketSize && + alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize)) ++skipRows; - if (skipRows==PacketSize) + if (skipRows==LhsPacketSize) { // nothing can be aligned, no need to skip any column alignmentPattern = NoneAligned; @@ -337,10 +368,10 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( // note that the skiped columns are processed later. 
} ei_internal_assert( alignmentPattern==NoneAligned - || PacketSize==1 + || LhsPacketSize==1 || (skipRows + rowsAtOnce >= rows) - || PacketSize > depth - || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0); + || LhsPacketSize > depth + || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0); } else if(Vectorizable) { @@ -355,23 +386,24 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; for (Index i=skipRows; i<rowBound; i+=rowsAtOnce) { - EIGEN_ALIGN16 Scalar tmp0 = Scalar(0); - Scalar tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0); + EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0); + ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0); // this helps the compiler generating good binary code - const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, + *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; if (Vectorizable) { /* explicit vectorization */ - Packet ptmp0 = ei_pset1(Scalar(0)), ptmp1 = ei_pset1(Scalar(0)), ptmp2 = ei_pset1(Scalar(0)), ptmp3 = ei_pset1(Scalar(0)); + ResPacket ptmp0 = ei_pset1<ResPacket>(ResScalar(0)), ptmp1 = ei_pset1<ResPacket>(ResScalar(0)), + ptmp2 = ei_pset1<ResPacket>(ResScalar(0)), ptmp3 = ei_pset1<ResPacket>(ResScalar(0)); // process initial unaligned coeffs // FIXME this loop get vectorized by the compiler ! 
for (Index j=0; j<alignedStart; ++j) { - Scalar b = rhs[j]; + RhsScalar b = rhs[j]; tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b); } @@ -381,11 +413,11 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( switch(alignmentPattern) { case AllAligned: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,d,d); break; case EvenAligned: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,du,d); break; case FirstAligned: @@ -397,38 +429,38 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( * overlaping the desired unaligned packet. This is *much* more efficient * than basic unaligned loads. */ - Packet A01, A02, A03, b, A11, A12, A13; - A01 = ei_pload(&lhs1[alignedStart-1]); - A02 = ei_pload(&lhs2[alignedStart-2]); - A03 = ei_pload(&lhs3[alignedStart-3]); + LhsPacket A01, A02, A03, A11, A12, A13; + A01 = ei_pload<LhsPacket>(&lhs1[alignedStart-1]); + A02 = ei_pload<LhsPacket>(&lhs2[alignedStart-2]); + A03 = ei_pload<LhsPacket>(&lhs3[alignedStart-3]); - for (Index j = alignedStart; j<peeledSize; j+=peels*PacketSize) + for (Index j = alignedStart; j<peeledSize; j+=peels*RhsPacketSize) { - b = ei_pload(&rhs[j]); - A11 = ei_pload(&lhs1[j-1+PacketSize]); ei_palign<1>(A01,A11); - A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12); - A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13); + RhsPacket b = ei_pload<RhsPacket>(&rhs[j]); + A11 = ei_pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); ei_palign<1>(A01,A11); + A12 = ei_pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); ei_palign<2>(A02,A12); + A13 = ei_pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); ei_palign<3>(A03,A13); - ptmp0 = pcj.pmadd(ei_pload (&lhs0[j]), b, ptmp0); + ptmp0 = 
pcj.pmadd(ei_pload<LhsPacket>(&lhs0[j]), b, ptmp0); ptmp1 = pcj.pmadd(A01, b, ptmp1); - A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01); + A01 = ei_pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); ei_palign<1>(A11,A01); ptmp2 = pcj.pmadd(A02, b, ptmp2); - A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02); + A02 = ei_pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); ei_palign<2>(A12,A02); ptmp3 = pcj.pmadd(A03, b, ptmp3); - A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03); + A03 = ei_pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); ei_palign<3>(A13,A03); - b = ei_pload(&rhs[j+PacketSize]); - ptmp0 = pcj.pmadd(ei_pload (&lhs0[j+PacketSize]), b, ptmp0); + b = ei_pload<RhsPacket>(&rhs[j+RhsPacketSize]); + ptmp0 = pcj.pmadd(ei_pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0); ptmp1 = pcj.pmadd(A11, b, ptmp1); ptmp2 = pcj.pmadd(A12, b, ptmp2); ptmp3 = pcj.pmadd(A13, b, ptmp3); } } - for (Index j = peeledSize; j<alignedSize; j+=PacketSize) + for (Index j = peeledSize; j<alignedSize; j+=RhsPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,du,du); break; default: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) _EIGEN_ACCUMULATE_PACKETS(du,du,du); break; } @@ -443,7 +475,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( // FIXME this loop get vectorized by the compiler ! 
for (Index j=alignedSize; j<depth; ++j) { - Scalar b = rhs[j]; + RhsScalar b = rhs[j]; tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b); } @@ -460,9 +492,9 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( { for (Index i=start; i<end; ++i) { - EIGEN_ALIGN16 Scalar tmp0 = Scalar(0); - Packet ptmp0 = ei_pset1(tmp0); - const Scalar* lhs0 = lhs + i*lhsStride; + EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0); + ResPacket ptmp0 = ei_pset1<ResPacket>(tmp0); + const LhsScalar* lhs0 = lhs + i*lhsStride; // process first unaligned result's coeffs // FIXME this loop get vectorized by the compiler ! for (Index j=0; j<alignedStart; ++j) @@ -471,12 +503,12 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( if (alignedSize>alignedStart) { // process aligned rhs coeffs - if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0) - for (Index j = alignedStart;j<alignedSize;j+=PacketSize) - ptmp0 = pcj.pmadd(ei_pload(&lhs0[j]), ei_pload(&rhs[j]), ptmp0); + if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize) + ptmp0 = pcj.pmadd(ei_pload<LhsPacket>(&lhs0[j]), ei_pload<RhsPacket>(&rhs[j]), ptmp0); else - for (Index j = alignedStart;j<alignedSize;j+=PacketSize) - ptmp0 = pcj.pmadd(ei_ploadu(&lhs0[j]), ei_pload(&rhs[j]), ptmp0); + for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize) + ptmp0 = pcj.pmadd(ei_ploadu<LhsPacket>(&lhs0[j]), ei_pload<RhsPacket>(&rhs[j]), ptmp0); tmp0 += ei_predux(ptmp0); } @@ -498,5 +530,6 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( #undef _EIGEN_ACCUMULATE_PACKETS } +}; #endif // EIGEN_GENERAL_MATRIX_VECTOR_H diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index 4514c7692..8a10075f0 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ 
b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -77,14 +77,14 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector( register const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride; Scalar t0 = cjAlpha * rhs[j]; - Packet ptmp0 = ei_pset1(t0); + Packet ptmp0 = ei_pset1<Packet>(t0); Scalar t1 = cjAlpha * rhs[j+1]; - Packet ptmp1 = ei_pset1(t1); + Packet ptmp1 = ei_pset1<Packet>(t1); Scalar t2 = 0; - Packet ptmp2 = ei_pset1(t2); + Packet ptmp2 = ei_pset1<Packet>(t2); Scalar t3 = 0; - Packet ptmp3 = ei_pset1(t3); + Packet ptmp3 = ei_pset1<Packet>(t3); size_t starti = FirstTriangular ? 0 : j+2; size_t endi = FirstTriangular ? j : size; @@ -119,10 +119,10 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector( Scalar* EIGEN_RESTRICT resIt = res + alignedStart; for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize) { - Packet A0i = ei_ploadu(a0It); a0It += PacketSize; - Packet A1i = ei_ploadu(a1It); a1It += PacketSize; - Packet Bi = ei_ploadu(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases - Packet Xi = ei_pload (resIt); + Packet A0i = ei_ploadu<Packet>(a0It); a0It += PacketSize; + Packet A1i = ei_ploadu<Packet>(a1It); a1It += PacketSize; + Packet Bi = ei_ploadu<Packet>(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases + Packet Xi = ei_pload <Packet>(resIt); Xi = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi)); ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2); diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h index de4f0b7bb..16b02a425 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector.h +++ b/Eigen/src/Core/products/TriangularMatrixVector.h @@ -76,12 +76,11 @@ struct ei_product_triangular_vector_selector<true,Lhs,Rhs,Result,Mode,ConjLhs,Co if (r>0) { Index s = IsLower ? 
pi+actualPanelWidth : 0; - ei_cache_friendly_product_colmajor_times_vector<ConjLhs,ConjRhs>( - r, + ei_general_matrix_vector_product<Index,Scalar,ColMajor,ConjLhs,Scalar,ConjRhs>::run( + r, actualPanelWidth, &(lhs.const_cast_derived().coeffRef(s,pi)), lhs.outerStride(), - rhs.segment(pi, actualPanelWidth), - &(res.coeffRef(s)), - alpha); + rhs.segment(pi, actualPanelWidth), rhs.innerStride(), + &res.coeffRef(s), res.innerStride(), alpha); } } } @@ -119,7 +118,7 @@ struct ei_product_triangular_vector_selector<true,Lhs,Rhs,Result,Mode,ConjLhs,Co if (r>0) { Index s = IsLower ? 0 : pi + actualPanelWidth; - ei_cache_friendly_product_rowmajor_times_vector<ConjLhs,ConjRhs,Scalar,Index>( + ei_general_matrix_vector_product<Index,Scalar,RowMajor,ConjLhs,Scalar,ConjRhs>::run( actualPanelWidth, r, &(lhs.const_cast_derived().coeffRef(pi,s)), lhs.outerStride(), &(rhs.const_cast_derived().coeffRef(s)), 1, diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 62a6207c6..459446409 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -45,14 +45,21 @@ template< int ResStorageOrder> struct ei_general_matrix_matrix_product; -template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index, typename RhsType> -static void ei_cache_friendly_product_colmajor_times_vector( - Index size, const Scalar* lhs, Index lhsStride, const RhsType& rhs, Scalar* res, Scalar alpha); +template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> +struct ei_general_matrix_vector_product; -template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index> -static void ei_cache_friendly_product_rowmajor_times_vector( - Index rows, Index Cols, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsIncr, - Scalar* res, Index resIncr, Scalar alpha); + +template<bool Conjugate> struct ei_conj_if; + +template<> struct ei_conj_if<true> { + template<typename 
T> + inline T operator()(const T& x) { return ei_conj(x); } +}; + +template<> struct ei_conj_if<false> { + template<typename T> + inline const T& operator()(const T& x) { return x; } +}; template<typename Scalar> struct ei_conj_helper<Scalar,Scalar,false,false> { @@ -90,35 +97,24 @@ template<typename RealScalar> struct ei_conj_helper<std::complex<RealScalar>, st { return Scalar(ei_real(x)*ei_real(y) - ei_imag(x)*ei_imag(y), - ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } }; -template<typename RealScalar> struct ei_conj_helper<std::complex<RealScalar>, RealScalar, false,false> +template<typename RealScalar,bool Conj> struct ei_conj_helper<std::complex<RealScalar>, RealScalar, Conj,false> { typedef std::complex<RealScalar> Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const { return ei_padd(c, ei_pmul(x,y)); } - + EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const + { return ei_padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const RealScalar& y) const - { return ei_pmul(x,y); } + { return ei_conj_if<Conj>()(x)*y; } }; -template<typename RealScalar> struct ei_conj_helper<RealScalar, std::complex<RealScalar>, false,false> +template<typename RealScalar,bool Conj> struct ei_conj_helper<RealScalar, std::complex<RealScalar>, false,Conj> { typedef std::complex<RealScalar> Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const { return ei_padd(c, pmul(x,y)); } - + EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const + { return ei_padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Scalar pmul(const RealScalar& x, const Scalar& y) const - { return x * y; } -}; - -template<bool Conjugate> struct ei_conj_if; - -template<> struct ei_conj_if<true> { - template<typename T> - inline T operator()(const T& x) { return ei_conj(x); } + { return x*ei_conj_if<Conj>()(y); } }; -template<> 
struct ei_conj_if<false> { - template<typename T> - inline const T& operator()(const T& x) { return x; } -}; // Lightweight helper class to access matrix coefficients. // Yes, this is somehow redundant with Map<>, but this version is much much lighter, |