diff options
Diffstat (limited to 'Eigen')
24 files changed, 813 insertions, 288 deletions
diff --git a/Eigen/Core b/Eigen/Core index 5e3e0960a..3135d7530 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -221,6 +221,7 @@ using std::size_t; #if defined EIGEN_VECTORIZE_SSE #include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/MathFunctions.h" + #include "src/Core/arch/SSE/Complex.h" #elif defined EIGEN_VECTORIZE_ALTIVEC #include "src/Core/arch/AltiVec/PacketMath.h" #elif defined EIGEN_VECTORIZE_NEON diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h index 512e93883..8c2eacd96 100644 --- a/Eigen/src/Core/Diagonal.h +++ b/Eigen/src/Core/Diagonal.h @@ -126,6 +126,9 @@ template<typename MatrixType, int DiagIndex> class Diagonal EIGEN_STRONG_INLINE Index absDiagIndex() const { return m_index.value()>0 ? m_index.value() : -m_index.value(); } EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value()>0 ? 0 : -m_index.value(); } EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value()>0 ? m_index.value() : 0; } + // triger a compile time error is someone try to call packet + template<int LoadMode> typename MatrixType::PacketReturnType packet(Index) const; + template<int LoadMode> typename MatrixType::PacketReturnType packet(Index,Index) const; }; /** \returns an expression of the main diagonal of the matrix \c *this diff --git a/Eigen/src/Core/DiagonalProduct.h b/Eigen/src/Core/DiagonalProduct.h index 7caf3858f..610d13dc8 100644 --- a/Eigen/src/Core/DiagonalProduct.h +++ b/Eigen/src/Core/DiagonalProduct.h @@ -36,8 +36,16 @@ struct ei_traits<DiagonalProduct<MatrixType, DiagonalType, ProductOrder> > ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - Flags = (HereditaryBits & (unsigned int)(MatrixType::Flags)) - | (PacketAccessBit & (unsigned int)(MatrixType::Flags) & (unsigned int)(DiagonalType::DiagonalVectorType::Flags)), + + _StorageOrder = MatrixType::Flags & RowMajorBit ? RowMajor : ColMajor, + _PacketOnDiag = !((int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheLeft) + ||(int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheRight)), + _SameTypes = ei_is_same_type<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ret, + // FIXME currently we need same types, but in the future the next rule should be the one + //_Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagonalType::Flags)&PacketAccessBit))), + _Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && _SameTypes && ((!_PacketOnDiag) || (bool(int(DiagonalType::Flags)&PacketAccessBit))), + + Flags = (HereditaryBits & (unsigned int)(MatrixType::Flags)) | (_Vectorizable ? PacketAccessBit : 0), CoeffReadCost = NumTraits<Scalar>::MulCost + MatrixType::CoeffReadCost + DiagonalType::DiagonalVectorType::CoeffReadCost }; }; @@ -69,26 +77,34 @@ class DiagonalProduct : ei_no_assignment_operator, EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const { enum { - StorageOrder = Flags & RowMajorBit ? RowMajor : ColMajor, - InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime, - DiagonalVectorPacketLoadMode = (LoadMode == Aligned && ((InnerSize%16) == 0)) ? Aligned : Unaligned + StorageOrder = Flags & RowMajorBit ? RowMajor : ColMajor }; const Index indexInDiagonalVector = ProductOrder == OnTheLeft ? row : col; - if((int(StorageOrder) == RowMajor && int(ProductOrder) == OnTheLeft) - ||(int(StorageOrder) == ColMajor && int(ProductOrder) == OnTheRight)) - { - return ei_pmul(m_matrix.template packet<LoadMode>(row, col), - ei_pset1(m_diagonal.diagonal().coeff(indexInDiagonalVector))); - } - else - { - return ei_pmul(m_matrix.template packet<LoadMode>(row, col), - m_diagonal.diagonal().template packet<DiagonalVectorPacketLoadMode>(indexInDiagonalVector)); - } + return packet_impl<LoadMode>(row,col,indexInDiagonalVector,typename ei_meta_if< + ((int(StorageOrder) == RowMajor && int(ProductOrder) == OnTheLeft) + ||(int(StorageOrder) == ColMajor && int(ProductOrder) == OnTheRight)), ei_meta_true, ei_meta_false>::ret()); } protected: + template<int LoadMode> + EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, ei_meta_true) const + { + return ei_pmul(m_matrix.template packet<LoadMode>(row, col), + ei_pset1(m_diagonal.diagonal().coeff(id))); + } + + template<int LoadMode> + EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, ei_meta_false) const + { + enum { + InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime, + DiagonalVectorPacketLoadMode = (LoadMode == Aligned && ((InnerSize%16) == 0)) ? Aligned : Unaligned + }; + return ei_pmul(m_matrix.template packet<LoadMode>(row, col), + m_diagonal.diagonal().template packet<DiagonalVectorPacketLoadMode>(id)); + } + const typename MatrixType::Nested m_matrix; const typename DiagonalType::Nested m_diagonal; }; diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index 8eaa62185..9fc2fb60e 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -41,7 +41,7 @@ struct ei_dot_nocheck { static inline typename ei_traits<T>::Scalar run(const MatrixBase<T>& a, const MatrixBase<U>& b) { - return a.conjugate().cwiseProduct(b).sum(); + return a.template binaryExpr<ei_scalar_conj_product_op<typename ei_traits<T>::Scalar> >(b).sum(); } }; @@ -50,7 +50,7 @@ struct ei_dot_nocheck<T, U, true> { static inline typename ei_traits<T>::Scalar run(const MatrixBase<T>& a, const MatrixBase<U>& b) { - return a.adjoint().cwiseProduct(b).sum(); + return a.transpose().template binaryExpr<ei_scalar_conj_product_op<typename ei_traits<T>::Scalar> >(b).sum(); } }; diff --git a/Eigen/src/Core/Functors.h b/Eigen/src/Core/Functors.h index 78d1e5628..9084905aa 100644 --- a/Eigen/src/Core/Functors.h +++ b/Eigen/src/Core/Functors.h @@ -46,7 +46,7 @@ template<typename Scalar> struct ei_functor_traits<ei_scalar_sum_op<Scalar> > { enum { Cost = NumTraits<Scalar>::AddCost, - PacketAccess = ei_packet_traits<Scalar>::size>1 + PacketAccess = ei_packet_traits<Scalar>::HasAdd }; }; @@ -69,7 +69,29 @@ template<typename Scalar> struct ei_functor_traits<ei_scalar_product_op<Scalar> > { enum { Cost = NumTraits<Scalar>::MulCost, - PacketAccess = ei_packet_traits<Scalar>::size>1 + PacketAccess = ei_packet_traits<Scalar>::HasMul + }; +}; + +/** \internal + * \brief Template functor to compute the conjugate product of two scalars + * + * This is a short cut for ei_conj(x) * y which is needed for optimization purpose + */ +template<typename Scalar> struct ei_scalar_conj_product_op { + enum { Conj = NumTraits<Scalar>::IsComplex }; + EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_conj_product_op) + EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const + { return ei_conj_helper<Scalar,Scalar,Conj,false>().pmul(a,b); } + template<typename PacketScalar> + EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const + { return ei_conj_helper<PacketScalar,PacketScalar,Conj,false>().pmul(a,b); } +}; +template<typename Scalar> +struct ei_functor_traits<ei_scalar_conj_product_op<Scalar> > { + enum { + Cost = NumTraits<Scalar>::MulCost, + PacketAccess = ei_packet_traits<Scalar>::HasMul }; }; @@ -92,7 +114,7 @@ template<typename Scalar> struct ei_functor_traits<ei_scalar_min_op<Scalar> > { enum { Cost = NumTraits<Scalar>::AddCost, - PacketAccess = ei_packet_traits<Scalar>::size>1 + PacketAccess = ei_packet_traits<Scalar>::HasMin }; }; @@ -115,7 +137,7 @@ template<typename Scalar> struct ei_functor_traits<ei_scalar_max_op<Scalar> > { enum { Cost = NumTraits<Scalar>::AddCost, - PacketAccess = ei_packet_traits<Scalar>::size>1 + PacketAccess = ei_packet_traits<Scalar>::HasMax }; }; @@ -158,7 +180,7 @@ template<typename Scalar> struct ei_functor_traits<ei_scalar_difference_op<Scalar> > { enum { Cost = NumTraits<Scalar>::AddCost, - PacketAccess = ei_packet_traits<Scalar>::size>1 + PacketAccess = ei_packet_traits<Scalar>::HasSub }; }; @@ -178,10 +200,7 @@ template<typename Scalar> struct ei_functor_traits<ei_scalar_quotient_op<Scalar> > { enum { Cost = 2 * NumTraits<Scalar>::MulCost, - PacketAccess = ei_packet_traits<Scalar>::size>1 - #if (defined EIGEN_VECTORIZE) - && !NumTraits<Scalar>::IsInteger - #endif + PacketAccess = ei_packet_traits<Scalar>::HasDiv }; }; @@ -203,7 +222,7 @@ template<typename Scalar> struct ei_functor_traits<ei_scalar_opposite_op<Scalar> > { enum { Cost = NumTraits<Scalar>::AddCost, - PacketAccess = int(ei_packet_traits<Scalar>::size)>1 }; + PacketAccess = ei_packet_traits<Scalar>::HasNegate }; }; /** \internal @@ -224,7 +243,7 @@ struct ei_functor_traits<ei_scalar_abs_op<Scalar> > { enum { Cost = NumTraits<Scalar>::AddCost, - PacketAccess = int(ei_packet_traits<Scalar>::size)>1 + PacketAccess = ei_packet_traits<Scalar>::HasAbs }; }; @@ -243,7 +262,7 @@ template<typename Scalar> struct ei_scalar_abs2_op { }; template<typename Scalar> struct ei_functor_traits<ei_scalar_abs2_op<Scalar> > -{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = int(ei_packet_traits<Scalar>::size)>1 }; }; +{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::HasAbs2 }; }; /** \internal * \brief Template functor to compute the conjugate of a complex value @@ -254,14 +273,14 @@ template<typename Scalar> struct ei_scalar_conjugate_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_conjugate_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return ei_conj(a); } template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const { return a; } + EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const { return ei_pconj(a); } }; template<typename Scalar> struct ei_functor_traits<ei_scalar_conjugate_op<Scalar> > { enum { Cost = NumTraits<Scalar>::IsComplex ? NumTraits<Scalar>::AddCost : 0, - PacketAccess = int(ei_packet_traits<Scalar>::size)>1 + PacketAccess = ei_packet_traits<Scalar>::HasConj }; }; @@ -398,7 +417,7 @@ struct ei_scalar_multiple_op { }; template<typename Scalar> struct ei_functor_traits<ei_scalar_multiple_op<Scalar> > -{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::size>1 }; }; +{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::HasMul }; }; template<typename Scalar1, typename Scalar2> struct ei_scalar_multiple2_op { @@ -425,7 +444,7 @@ struct ei_scalar_quotient1_impl { }; template<typename Scalar> struct ei_functor_traits<ei_scalar_quotient1_impl<Scalar,false> > -{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::size>1 }; }; +{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::HasMul }; }; template<typename Scalar> struct ei_scalar_quotient1_impl<Scalar,true> { @@ -472,7 +491,8 @@ struct ei_scalar_constant_op { }; template<typename Scalar> struct ei_functor_traits<ei_scalar_constant_op<Scalar> > -{ enum { Cost = 1, PacketAccess = ei_packet_traits<Scalar>::size>1, IsRepeatable = true }; }; +// FIXME replace this packet test by a safe one +{ enum { Cost = 1, PacketAccess = ei_packet_traits<Scalar>::Vectorizable, IsRepeatable = true }; }; template<typename Scalar> struct ei_scalar_identity_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_identity_op) @@ -543,7 +563,7 @@ struct ei_linspaced_op_impl<Scalar,true> // nested expressions). template <typename Scalar, bool RandomAccess = true> struct ei_linspaced_op; template <typename Scalar, bool RandomAccess> struct ei_functor_traits< ei_linspaced_op<Scalar,RandomAccess> > -{ enum { Cost = 1, PacketAccess = ei_packet_traits<Scalar>::size>1, IsRepeatable = true }; }; +{ enum { Cost = 1, PacketAccess = ei_packet_traits<Scalar>::HasSetLinear, IsRepeatable = true }; }; template <typename Scalar, bool RandomAccess> struct ei_linspaced_op { typedef typename ei_packet_traits<Scalar>::type PacketScalar; @@ -588,7 +608,7 @@ struct ei_scalar_add_op { }; template<typename Scalar> struct ei_functor_traits<ei_scalar_add_op<Scalar> > -{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = ei_packet_traits<Scalar>::size>1 }; }; +{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = ei_packet_traits<Scalar>::HasAdd }; }; /** \internal * \brief Template functor to compute the square root of a scalar @@ -676,7 +696,7 @@ struct ei_scalar_inverse_op { }; template<typename Scalar> struct ei_functor_traits<ei_scalar_inverse_op<Scalar> > -{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = int(ei_packet_traits<Scalar>::size)>1 }; }; +{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::HasDiv }; }; /** \internal * \brief Template functor to compute the square of a scalar @@ -692,7 +712,7 @@ struct ei_scalar_square_op { }; template<typename Scalar> struct ei_functor_traits<ei_scalar_square_op<Scalar> > -{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = int(ei_packet_traits<Scalar>::size)>1 }; }; +{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::HasMul }; }; /** \internal * \brief Template functor to compute the cube of a scalar @@ -708,7 +728,7 @@ struct ei_scalar_cube_op { }; template<typename Scalar> struct ei_functor_traits<ei_scalar_cube_op<Scalar> > -{ enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = int(ei_packet_traits<Scalar>::size)>1 }; }; +{ enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::HasMul }; }; // default functor traits for STL functors: diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 643e12e34..77b1d748e 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -58,8 +58,11 @@ struct ei_default_packet_traits HasMul = 1, HasNegate = 1, HasAbs = 1, + HasAbs2 = 1, HasMin = 1, HasMax = 1, + HasConj = 1, + HasSetLinear = 1, HasDiv = 0, HasSqrt = 0, @@ -79,15 +82,21 @@ struct ei_default_packet_traits template<typename T> struct ei_packet_traits : ei_default_packet_traits { typedef T type; - enum {size=1}; + enum { + Vectorizable = 0, + size = 1 + }; enum { HasAdd = 0, HasSub = 0, HasMul = 0, HasNegate = 0, HasAbs = 0, + HasAbs2 = 0, HasMin = 0, - HasMax = 0 + HasMax = 0, + HasConj = 0, + HasSetLinear = 0 }; }; @@ -105,6 +114,10 @@ ei_psub(const Packet& a, template<typename Packet> inline Packet ei_pnegate(const Packet& a) { return -a; } +/** \internal \returns conj(a) (coeff-wise) */ +template<typename Packet> inline Packet +ei_pconj(const Packet& a) { return ei_conj(a); } + /** \internal \returns a * b (coeff-wise) */ template<typename Packet> inline Packet ei_pmul(const Packet& a, diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 66435e0e3..ca30c4dce 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -378,6 +378,8 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,true> * RhsBlasTraits::extractScalarFactor(prod.rhs()); enum { + // FIXME I think here we really have to check for ei_packet_traits<Scalar>::size==1 + // because in this case it is fine to have an inner stride DirectlyUseRhs = ((ei_packet_traits<Scalar>::size==1) || (_ActualRhsType::Flags&ActualPacketAccessBit)) && (!(_ActualRhsType::Flags & RowMajorBit)) }; diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index e3454b554..9e6a375ea 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -85,8 +85,12 @@ static Packet4f ei_p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)ei_p4i_MINUS1, (Pack template<> struct ei_packet_traits<float> : ei_default_packet_traits { - typedef Packet4f type; enum {size=4}; + typedef Packet4f type; enum { + Vectorizable = 1, + size=4, + + // FIXME check the Has* HasSin = 0, HasCos = 0, HasLog = 0, @@ -95,7 +99,14 @@ template<> struct ei_packet_traits<float> : ei_default_packet_traits }; }; template<> struct ei_packet_traits<int> : ei_default_packet_traits -{ typedef Packet4i type; enum {size=4}; }; +{ + typedef Packet4i type; + enum { + // FIXME check the Has* + Vectorizable = 1, + size=4 + }; +}; template<> struct ei_unpacket_traits<Packet4f> { typedef float type; enum {size=4}; }; template<> struct ei_unpacket_traits<Packet4i> { typedef int type; enum {size=4}; }; diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 2f0efbcc9..aaa27b56d 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -59,8 +59,12 @@ typedef int32x4_t Packet4i; template<> struct ei_packet_traits<float> : ei_default_packet_traits { - typedef Packet4f type; enum {size=4}; + typedef Packet4f type; enum { + Vectorizable = 1, + size = 4, + + // FIXME check the Has* HasSin = 0, HasCos = 0, HasLog = 0, @@ -69,7 +73,14 @@ template<> struct ei_packet_traits<float> : ei_default_packet_traits }; }; template<> struct ei_packet_traits<int> : ei_default_packet_traits -{ typedef Packet4i type; enum {size=4}; }; +{ + typedef Packet4i type; + enum { + Vectorizable = 1, + size=4 + // FIXME check the Has* + }; +}; template<> struct ei_unpacket_traits<Packet4f> { typedef float type; enum {size=4}; }; template<> struct ei_unpacket_traits<Packet4i> { typedef int type; enum {size=4}; }; diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h new file mode 100644 index 000000000..4ecfc2f43 --- /dev/null +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -0,0 +1,370 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_COMPLEX_SSE_H +#define EIGEN_COMPLEX_SSE_H + +//---------- float ---------- +struct Packet2cf +{ + EIGEN_STRONG_INLINE Packet2cf() {} + EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {} + __m128 v; +}; + +template<> struct ei_packet_traits<std::complex<float> > : ei_default_packet_traits +{ + typedef Packet2cf type; + enum { + Vectorizable = 1, + size = 2, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; }; + +template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>& from) +{ + Packet2cf res; + res.v = _mm_loadl_pi(res.v, (const __m64*)&from); + return Packet2cf(_mm_movelh_ps(res.v,res.v)); +} + +template<> EIGEN_STRONG_INLINE Packet2cf ei_padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_pnegate(const Packet2cf& a) +{ + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000)); + return Packet2cf(_mm_xor_ps(a.v,mask)); +} +template<> EIGEN_STRONG_INLINE Packet2cf ei_pconj(const Packet2cf& a) +{ + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet2cf(_mm_xor_ps(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet2cf ei_pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) +{ + // TODO optimize it for SSE3 and 4 + #ifdef EIGEN_VECTORIZE_SSE3 + return Packet2cf(_mm_addsub_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), + _mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3), + ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)))); + #else + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000)); + return Packet2cf(_mm_add_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), + _mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3), + ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask))); + #endif +} + +template<> EIGEN_STRONG_INLINE Packet2cf ei_pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet2cf ei_pload <std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(_mm_load_ps((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu<std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ei_ploadu((const float*)from)); } + +template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((float*)to, from.v); } + +template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } + +template<> EIGEN_STRONG_INLINE std::complex<float> ei_pfirst<Packet2cf>(const Packet2cf& a) +{ + std::complex<float> res; + _mm_storel_pi((__m64*)&res, a.v); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet2cf ei_preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(ei_preverse(_mm_castps_pd(a.v)))); } + +template<> EIGEN_STRONG_INLINE std::complex<float> ei_predux<Packet2cf>(const Packet2cf& a) +{ + return ei_pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v)))); +} + +template<> EIGEN_STRONG_INLINE Packet2cf ei_preduxp<Packet2cf>(const Packet2cf* vecs) +{ + return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v))); +} + +template<> EIGEN_STRONG_INLINE std::complex<float> ei_predux_mul<Packet2cf>(const Packet2cf& a) +{ + return ei_pfirst(ei_pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v)))); +} + +template<int Offset> +struct ei_palign_impl<Offset,Packet2cf> +{ + EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) + { + if (Offset==1) + { + first.v = _mm_movehl_ps(first.v, first.v); + first.v = _mm_movelh_ps(first.v, second.v); + } + } +}; + +template<> struct ei_conj_helper<Packet2cf, Packet2cf, false,true> +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return ei_padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + #ifdef EIGEN_VECTORIZE_SSE3 + return ei_pmul(a, ei_pconj(b)); + #else + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), + _mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3), + ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)))); + #endif + } +}; + +template<> struct ei_conj_helper<Packet2cf, Packet2cf, true,false> +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return ei_padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + #ifdef EIGEN_VECTORIZE_SSE3 + return ei_pmul(ei_pconj(a), b); + #else + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet2cf(_mm_add_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), + _mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3), + ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask))); + #endif + } +}; + +template<> struct ei_conj_helper<Packet2cf, Packet2cf, true,true> +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return ei_padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + #ifdef EIGEN_VECTORIZE_SSE3 + return ei_pconj(ei_pmul(a, b)); + #else + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), + _mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3), + ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)))); + #endif + } +}; + +template<> EIGEN_STRONG_INLINE Packet2cf ei_pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) +{ + // TODO optimize it for SSE3 and 4 + Packet2cf res = ei_conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b); + __m128 s = _mm_mul_ps(b.v,b.v); + return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); +} + +//---------- double ---------- +struct Packet1cd +{ + EIGEN_STRONG_INLINE Packet1cd() {} + EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {} + __m128d v; +}; + +template<> struct ei_packet_traits<std::complex<double> > : ei_default_packet_traits +{ + typedef Packet1cd type; + enum { + Vectorizable = 1, + size = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct ei_unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; }; + +template<> EIGEN_STRONG_INLINE Packet1cd ei_padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_pnegate(const Packet1cd& a) { return Packet1cd(ei_pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_pconj(const Packet1cd& a) +{ + const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); + return Packet1cd(_mm_xor_pd(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet1cd ei_pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + // TODO optimize it for SSE3 and 4 + #ifdef EIGEN_VECTORIZE_SSE3 + return Packet1cd(_mm_addsub_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), + _mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1), + ei_vec2d_swizzle1(b.v, 1, 0)))); + #else + const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); + return Packet1cd(_mm_add_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), + _mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1), + ei_vec2d_swizzle1(b.v, 1, 0)), mask))); + #endif +} + +template<> EIGEN_STRONG_INLINE Packet1cd ei_pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(_mm_load_pd((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1<std::complex<double> >(const std::complex<double>& from) +{ /* here we really have to use unaligned loads :( */ return ei_ploadu(&from); } + +template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, from.v); } + +template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } + +template<> EIGEN_STRONG_INLINE std::complex<double> ei_pfirst<Packet1cd>(const Packet1cd& a) +{ + EIGEN_ALIGN16 std::complex<double> res; + _mm_store_pd((double*)&res, a.v); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet1cd ei_preverse(const Packet1cd& a) { return a; } + +template<> EIGEN_STRONG_INLINE std::complex<double> ei_predux<Packet1cd>(const Packet1cd& a) +{ + return ei_pfirst(a); +} + +template<> EIGEN_STRONG_INLINE Packet1cd ei_preduxp<Packet1cd>(const Packet1cd* vecs) +{ + return vecs[0]; +} + +template<> EIGEN_STRONG_INLINE std::complex<double> ei_predux_mul<Packet1cd>(const Packet1cd& a) +{ + return ei_pfirst(a); +} + +template<int Offset> +struct ei_palign_impl<Offset,Packet1cd> +{ + EIGEN_STRONG_INLINE static void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) + { + // FIXME is it sure we never have to align a Packet1cd? + // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... + } +}; + +template<> struct ei_conj_helper<Packet1cd, Packet1cd, false,true> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return ei_padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + #ifdef EIGEN_VECTORIZE_SSE3 + return ei_pmul(a, ei_pconj(b)); + #else + const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); + return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), mask), + _mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1), + ei_vec2d_swizzle1(b.v, 1, 0)))); + #endif + } +}; + +template<> struct ei_conj_helper<Packet1cd, Packet1cd, true,false> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return ei_padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + #ifdef EIGEN_VECTORIZE_SSE3 + return ei_pmul(ei_pconj(a), b); + #else + const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); + return Packet1cd(_mm_add_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), + _mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1), + ei_vec2d_swizzle1(b.v, 1, 0)), mask))); + #endif + } +}; + +template<> struct ei_conj_helper<Packet1cd, Packet1cd, true,true> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return ei_padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + #ifdef EIGEN_VECTORIZE_SSE3 + return ei_pconj(ei_pmul(a, b)); + #else + const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); + return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), mask), + _mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1), + ei_vec2d_swizzle1(b.v, 1, 0)))); + #endif + } +}; + +template<> EIGEN_STRONG_INLINE Packet1cd ei_pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + // TODO optimize it for SSE3 and 4 + Packet1cd res = ei_conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b); + __m128d s = _mm_mul_pd(b.v,b.v); + return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); +} + +#endif // EIGEN_COMPLEX_SSE_H diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 29375bdae..9382fbde5 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -43,6 +43,9 @@ template<> struct ei_is_arithmetic<__m128d> { enum { ret = true }; }; #define ei_vec4i_swizzle1(v,p,q,r,s) \ (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p)))) +#define ei_vec2d_swizzle1(v,p,q) \ + (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2))))) + #define ei_vec4f_swizzle2(a,b,p,q,r,s) \ (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p)))) @@ -58,10 +61,15 @@ template<> struct ei_is_arithmetic<__m128d> { enum { ret = true }; }; #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i ei_p4i_##NAME = ei_pset1<int>(X) + template<> struct ei_packet_traits<float> : ei_default_packet_traits { - typedef Packet4f type; enum {size=4}; + typedef Packet4f type; enum { + Vectorizable = 1, + size=4, + + HasDiv = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, HasLog = 1, @@ -70,9 +78,24 @@ template<> struct ei_packet_traits<float> : ei_default_packet_traits }; }; template<> struct ei_packet_traits<double> : ei_default_packet_traits -{ typedef Packet2d type; enum {size=2}; }; +{ + typedef Packet2d type; + enum { + Vectorizable = 1, + size=2, + + HasDiv = 1 + }; +}; template<> struct ei_packet_traits<int> : ei_default_packet_traits -{ typedef Packet4i type; enum {size=4}; }; +{ + typedef Packet4i type; + enum { + // FIXME check the Has* + Vectorizable = 1, + size=4 + }; +}; template<> struct ei_unpacket_traits<Packet4f> { typedef float type; enum {size=4}; }; template<> struct ei_unpacket_traits<Packet2d> { typedef double type; enum {size=2}; }; @@ -83,11 +106,11 @@ template<> struct ei_unpacket_traits<Packet4i> { typedef int type; enum {size // that is inefficient :( (e.g., see ei_gemm_pack_rhs) template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { Packet4f res = _mm_set_ss(from); - return _mm_shuffle_ps(res,res,0); + return ei_vec4f_swizzle1(res,0,0,0,0); } template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double& from) { Packet2d res = _mm_set_sd(from); - return _mm_unpacklo_pd(res,res); + return ei_vec2d_swizzle1(res, 0, 0); } #else template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { return _mm_set1_ps(from); } diff --git a/Eigen/src/Core/products/CoeffBasedProduct.h b/Eigen/src/Core/products/CoeffBasedProduct.h index a17ce901b..1474bc1bb 100644 --- a/Eigen/src/Core/products/CoeffBasedProduct.h +++ b/Eigen/src/Core/products/CoeffBasedProduct.h @@ -318,7 +318,7 @@ struct ei_product_coeff_vectorized_dyn_selector typedef typename Lhs::Index Index; EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) { - res = lhs.row(row).cwiseProduct(rhs.col(col)).sum(); + res = lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum(); } }; @@ -330,7 +330,7 @@ struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols> typedef typename Lhs::Index Index; EIGEN_STRONG_INLINE static void run(Index /*row*/, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) { - res = lhs.cwiseProduct(rhs.col(col)).sum(); + res = lhs.transpose().cwiseProduct(rhs.col(col)).sum(); } }; @@ -340,7 +340,7 @@ struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1> typedef typename Lhs::Index Index; EIGEN_STRONG_INLINE static void run(Index row, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) { - res = lhs.row(row).cwiseProduct(rhs).sum(); + res = lhs.row(row).transpose().cwiseProduct(rhs).sum(); } }; @@ -350,7 +350,7 @@ struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1> typedef typename Lhs::Index Index; EIGEN_STRONG_INLINE static void run(Index /*row*/, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) { - res = lhs.cwiseProduct(rhs).sum(); + res = lhs.transpose().cwiseProduct(rhs).sum(); } }; diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 2c42ad5b6..cf133f68f 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -134,13 +134,13 @@ inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, st } #ifdef EIGEN_HAS_FUSE_CJMADD - #define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C); + #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); #else - #define CJMADD(A,B,C,T) T = B; T = cj.pmul(A,T); C = ei_padd(C,T); + #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = ei_padd(C,T); #endif // optimized GEneral packed Block * packed Panel product kernel -template<typename Scalar, typename Index, int mr, int nr, typename Conj> +template<typename Scalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> struct ei_gebp_kernel { void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, @@ -150,7 +150,8 @@ struct ei_gebp_kernel enum { PacketSize = ei_packet_traits<Scalar>::size }; if(strideA==-1) strideA = depth; if(strideB==-1) strideB = depth; - Conj cj; + ei_conj_helper<Scalar,Scalar,ConjugateLhs,ConjugateRhs> cj; + ei_conj_helper<PacketType,PacketType,ConjugateLhs,ConjugateRhs> pcj; Index packet_cols = (cols/nr) * nr; const Index peeled_mc = (rows/mr)*mr; const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0); @@ -259,42 +260,43 @@ struct ei_gebp_kernel #ifndef EIGEN_HAS_FUSE_CJMADD PacketType T0; #endif - +EIGEN_ASM_COMMENT("mybegin"); A0 = ei_pload(&blA[0*PacketSize]); A1 = ei_pload(&blA[1*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A1,B0,C4,B0); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A1,B0,C4,B0); B0 = ei_pload(&blB[1*PacketSize]); - CJMADD(A0,B0,C1,T0); - CJMADD(A1,B0,C5,B0); + MADD(pcj,A0,B0,C1,T0); + MADD(pcj,A1,B0,C5,B0); A0 = ei_pload(&blA[2*PacketSize]); A1 = ei_pload(&blA[3*PacketSize]); B0 = ei_pload(&blB[2*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A1,B0,C4,B0); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A1,B0,C4,B0); B0 = ei_pload(&blB[3*PacketSize]); - CJMADD(A0,B0,C1,T0); - CJMADD(A1,B0,C5,B0); + MADD(pcj,A0,B0,C1,T0); + MADD(pcj,A1,B0,C5,B0); A0 = ei_pload(&blA[4*PacketSize]); A1 = ei_pload(&blA[5*PacketSize]); B0 = ei_pload(&blB[4*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A1,B0,C4,B0); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A1,B0,C4,B0); B0 = ei_pload(&blB[5*PacketSize]); - CJMADD(A0,B0,C1,T0); - CJMADD(A1,B0,C5,B0); + MADD(pcj,A0,B0,C1,T0); + MADD(pcj,A1,B0,C5,B0); A0 = ei_pload(&blA[6*PacketSize]); A1 = ei_pload(&blA[7*PacketSize]); B0 = ei_pload(&blB[6*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A1,B0,C4,B0); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A1,B0,C4,B0); B0 = ei_pload(&blB[7*PacketSize]); - CJMADD(A0,B0,C1,T0); - CJMADD(A1,B0,C5,B0); + MADD(pcj,A0,B0,C1,T0); + MADD(pcj,A1,B0,C5,B0); +EIGEN_ASM_COMMENT("myend"); } else { @@ -302,65 +304,66 @@ struct ei_gebp_kernel #ifndef EIGEN_HAS_FUSE_CJMADD PacketType T0; #endif - +EIGEN_ASM_COMMENT("mybegin"); A0 = ei_pload(&blA[0*PacketSize]); A1 = ei_pload(&blA[1*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]); B1 = ei_pload(&blB[1*PacketSize]); - CJMADD(A0,B0,C0,T0); + MADD(pcj,A0,B0,C0,T0); B2 = ei_pload(&blB[2*PacketSize]); - CJMADD(A1,B0,C4,B0); + MADD(pcj,A1,B0,C4,B0); B3 = ei_pload(&blB[3*PacketSize]); B0 = ei_pload(&blB[4*PacketSize]); - CJMADD(A0,B1,C1,T0); - CJMADD(A1,B1,C5,B1); + MADD(pcj,A0,B1,C1,T0); + MADD(pcj,A1,B1,C5,B1); B1 = ei_pload(&blB[5*PacketSize]); - CJMADD(A0,B2,C2,T0); - CJMADD(A1,B2,C6,B2); + MADD(pcj,A0,B2,C2,T0); + MADD(pcj,A1,B2,C6,B2); B2 = ei_pload(&blB[6*PacketSize]); - CJMADD(A0,B3,C3,T0); + MADD(pcj,A0,B3,C3,T0); A0 = ei_pload(&blA[2*PacketSize]); - CJMADD(A1,B3,C7,B3); + MADD(pcj,A1,B3,C7,B3); A1 = ei_pload(&blA[3*PacketSize]); B3 = ei_pload(&blB[7*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A1,B0,C4,B0); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A1,B0,C4,B0); B0 = ei_pload(&blB[8*PacketSize]); - CJMADD(A0,B1,C1,T0); - CJMADD(A1,B1,C5,B1); + MADD(pcj,A0,B1,C1,T0); + MADD(pcj,A1,B1,C5,B1); B1 = ei_pload(&blB[9*PacketSize]); - CJMADD(A0,B2,C2,T0); - CJMADD(A1,B2,C6,B2); + MADD(pcj,A0,B2,C2,T0); + MADD(pcj,A1,B2,C6,B2); B2 = ei_pload(&blB[10*PacketSize]); - CJMADD(A0,B3,C3,T0); + MADD(pcj,A0,B3,C3,T0); A0 = ei_pload(&blA[4*PacketSize]); - CJMADD(A1,B3,C7,B3); + MADD(pcj,A1,B3,C7,B3); A1 = ei_pload(&blA[5*PacketSize]); B3 = ei_pload(&blB[11*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A1,B0,C4,B0); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A1,B0,C4,B0); B0 = ei_pload(&blB[12*PacketSize]); - CJMADD(A0,B1,C1,T0); - CJMADD(A1,B1,C5,B1); + MADD(pcj,A0,B1,C1,T0); + MADD(pcj,A1,B1,C5,B1); B1 = ei_pload(&blB[13*PacketSize]); - CJMADD(A0,B2,C2,T0); - CJMADD(A1,B2,C6,B2); + MADD(pcj,A0,B2,C2,T0); + MADD(pcj,A1,B2,C6,B2); B2 = ei_pload(&blB[14*PacketSize]); - CJMADD(A0,B3,C3,T0); + MADD(pcj,A0,B3,C3,T0); A0 = ei_pload(&blA[6*PacketSize]); - CJMADD(A1,B3,C7,B3); + MADD(pcj,A1,B3,C7,B3); A1 = ei_pload(&blA[7*PacketSize]); B3 = ei_pload(&blB[15*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A1,B0,C4,B0); - CJMADD(A0,B1,C1,T0); - CJMADD(A1,B1,C5,B1); - CJMADD(A0,B2,C2,T0); - CJMADD(A1,B2,C6,B2); - CJMADD(A0,B3,C3,T0); - CJMADD(A1,B3,C7,B3); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A1,B0,C4,B0); + MADD(pcj,A0,B1,C1,T0); + MADD(pcj,A1,B1,C5,B1); + MADD(pcj,A0,B2,C2,T0); + MADD(pcj,A1,B2,C6,B2); + MADD(pcj,A0,B3,C3,T0); + MADD(pcj,A1,B3,C7,B3); +EIGEN_ASM_COMMENT("myend"); } blB += 4*nr*PacketSize; @@ -379,11 +382,11 @@ struct ei_gebp_kernel A0 = ei_pload(&blA[0*PacketSize]); A1 = ei_pload(&blA[1*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A1,B0,C4,B0); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A1,B0,C4,B0); B0 = ei_pload(&blB[1*PacketSize]); - CJMADD(A0,B0,C1,T0); - CJMADD(A1,B0,C5,B0); + MADD(pcj,A0,B0,C1,T0); + MADD(pcj,A1,B0,C5,B0); } else { @@ -397,16 +400,16 @@ struct ei_gebp_kernel B0 = ei_pload(&blB[0*PacketSize]); B1 = ei_pload(&blB[1*PacketSize]); - CJMADD(A0,B0,C0,T0); + MADD(pcj,A0,B0,C0,T0); B2 = ei_pload(&blB[2*PacketSize]); - CJMADD(A1,B0,C4,B0); + MADD(pcj,A1,B0,C4,B0); B3 = ei_pload(&blB[3*PacketSize]); - CJMADD(A0,B1,C1,T0); - CJMADD(A1,B1,C5,B1); - CJMADD(A0,B2,C2,T0); - CJMADD(A1,B2,C6,B2); - CJMADD(A0,B3,C3,T0); - CJMADD(A1,B3,C7,B3); + MADD(pcj,A0,B1,C1,T0); + MADD(pcj,A1,B1,C5,B1); + MADD(pcj,A0,B2,C2,T0); + MADD(pcj,A1,B2,C6,B2); + MADD(pcj,A0,B3,C3,T0); + MADD(pcj,A1,B3,C7,B3); } blB += nr*PacketSize; @@ -466,23 +469,23 @@ struct ei_gebp_kernel A0 = ei_pload(&blA[0*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]); B1 = ei_pload(&blB[1*PacketSize]); - CJMADD(A0,B0,C0,B0); + MADD(pcj,A0,B0,C0,B0); B0 = ei_pload(&blB[2*PacketSize]); - CJMADD(A0,B1,C1,B1); + MADD(pcj,A0,B1,C1,B1); A0 = ei_pload(&blA[1*PacketSize]); B1 = ei_pload(&blB[3*PacketSize]); - CJMADD(A0,B0,C0,B0); + MADD(pcj,A0,B0,C0,B0); B0 = ei_pload(&blB[4*PacketSize]); - CJMADD(A0,B1,C1,B1); + MADD(pcj,A0,B1,C1,B1); A0 = ei_pload(&blA[2*PacketSize]); B1 = ei_pload(&blB[5*PacketSize]); - CJMADD(A0,B0,C0,B0); + MADD(pcj,A0,B0,C0,B0); B0 = ei_pload(&blB[6*PacketSize]); - CJMADD(A0,B1,C1,B1); + MADD(pcj,A0,B1,C1,B1); A0 = ei_pload(&blA[3*PacketSize]); B1 = ei_pload(&blB[7*PacketSize]); - CJMADD(A0,B0,C0,B0); - CJMADD(A0,B1,C1,B1); + MADD(pcj,A0,B0,C0,B0); + MADD(pcj,A0,B1,C1,B1); } else { @@ -492,41 +495,41 @@ struct ei_gebp_kernel B0 = ei_pload(&blB[0*PacketSize]); B1 = ei_pload(&blB[1*PacketSize]); - CJMADD(A0,B0,C0,B0); + MADD(pcj,A0,B0,C0,B0); B2 = ei_pload(&blB[2*PacketSize]); B3 = ei_pload(&blB[3*PacketSize]); B0 = ei_pload(&blB[4*PacketSize]); - CJMADD(A0,B1,C1,B1); + MADD(pcj,A0,B1,C1,B1); B1 = ei_pload(&blB[5*PacketSize]); - CJMADD(A0,B2,C2,B2); + MADD(pcj,A0,B2,C2,B2); B2 = ei_pload(&blB[6*PacketSize]); - CJMADD(A0,B3,C3,B3); + MADD(pcj,A0,B3,C3,B3); A0 = ei_pload(&blA[1*PacketSize]); B3 = ei_pload(&blB[7*PacketSize]); - CJMADD(A0,B0,C0,B0); + MADD(pcj,A0,B0,C0,B0); B0 = ei_pload(&blB[8*PacketSize]); - CJMADD(A0,B1,C1,B1); + MADD(pcj,A0,B1,C1,B1); B1 = ei_pload(&blB[9*PacketSize]); - CJMADD(A0,B2,C2,B2); + MADD(pcj,A0,B2,C2,B2); B2 = ei_pload(&blB[10*PacketSize]); - CJMADD(A0,B3,C3,B3); + MADD(pcj,A0,B3,C3,B3); A0 = ei_pload(&blA[2*PacketSize]); B3 = ei_pload(&blB[11*PacketSize]); - CJMADD(A0,B0,C0,B0); + MADD(pcj,A0,B0,C0,B0); B0 = ei_pload(&blB[12*PacketSize]); - CJMADD(A0,B1,C1,B1); + MADD(pcj,A0,B1,C1,B1); B1 = ei_pload(&blB[13*PacketSize]); - CJMADD(A0,B2,C2,B2); + MADD(pcj,A0,B2,C2,B2); B2 = ei_pload(&blB[14*PacketSize]); - CJMADD(A0,B3,C3,B3); + MADD(pcj,A0,B3,C3,B3); A0 = ei_pload(&blA[3*PacketSize]); B3 = ei_pload(&blB[15*PacketSize]); - CJMADD(A0,B0,C0,B0); - CJMADD(A0,B1,C1,B1); - CJMADD(A0,B2,C2,B2); - CJMADD(A0,B3,C3,B3); + MADD(pcj,A0,B0,C0,B0); + MADD(pcj,A0,B1,C1,B1); + MADD(pcj,A0,B2,C2,B2); + MADD(pcj,A0,B3,C3,B3); } blB += 4*nr*PacketSize; @@ -544,9 +547,9 @@ struct ei_gebp_kernel A0 = ei_pload(&blA[0*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]); - CJMADD(A0,B0,C0,T0); + MADD(pcj,A0,B0,C0,T0); B0 = ei_pload(&blB[1*PacketSize]); - CJMADD(A0,B0,C1,T0); + MADD(pcj,A0,B0,C1,T0); } else { @@ -561,10 +564,10 @@ struct ei_gebp_kernel B2 = ei_pload(&blB[2*PacketSize]); B3 = ei_pload(&blB[3*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A0,B1,C1,T1); - CJMADD(A0,B2,C2,T0); - CJMADD(A0,B3,C3,T1); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A0,B1,C1,T1); + MADD(pcj,A0,B2,C2,T0); + MADD(pcj,A0,B3,C3,T1); } blB += nr*PacketSize; @@ -596,9 +599,9 @@ struct ei_gebp_kernel A0 = blA[k]; B0 = blB[0*PacketSize]; - CJMADD(A0,B0,C0,T0); + MADD(cj,A0,B0,C0,T0); B0 = blB[1*PacketSize]; - CJMADD(A0,B0,C1,T0); + MADD(cj,A0,B0,C1,T0); } else { @@ -613,10 +616,10 @@ struct ei_gebp_kernel B2 = blB[2*PacketSize]; B3 = blB[3*PacketSize]; - CJMADD(A0,B0,C0,T0); - CJMADD(A0,B1,C1,T1); - CJMADD(A0,B2,C2,T0); - CJMADD(A0,B3,C3,T1); + MADD(cj,A0,B0,C0,T0); + MADD(cj,A0,B1,C1,T1); + MADD(cj,A0,B2,C2,T0); + MADD(cj,A0,B3,C3,T1); } blB += nr*PacketSize; @@ -662,8 +665,8 @@ struct ei_gebp_kernel A0 = ei_pload(&blA[0*PacketSize]); A1 = ei_pload(&blA[1*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A1,B0,C4,T1); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A1,B0,C4,T1); blB += PacketSize; blA += mr; @@ -683,7 +686,8 @@ struct ei_gebp_kernel const Scalar* blB = unpackedB; for(Index k=0; k<depth; k++) { - C0 = cj.pmadd(ei_pload(blA), ei_pload(blB), C0); + PacketType T0; + MADD(pcj,ei_pload(blA), ei_pload(blB), C0, T0); blB += PacketSize; blA += PacketSize; } @@ -700,7 +704,12 @@ struct ei_gebp_kernel // FIXME directly use blockB ?? const Scalar* blB = unpackedB; for(Index k=0; k<depth; k++) - C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0); + { + #ifndef EIGEN_HAS_FUSE_CJMADD + Scalar T0; + #endif + MADD(cj,blA[k], blB[k*PacketSize], C0, T0); + } res[(j2+0)*resStride + i] += C0; } } @@ -769,8 +778,8 @@ struct ei_gemm_pack_lhs // 4 5 6 7 16 17 18 19 25 28 // 8 9 10 11 20 21 22 23 26 29 // . . . . . . . . . . -template<typename Scalar, typename Index, int nr, bool PanelMode> -struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, PanelMode> +template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> +struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode> { typedef typename ei_packet_traits<Scalar>::type Packet; enum { PacketSize = ei_packet_traits<Scalar>::size }; @@ -778,6 +787,7 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, PanelMode> Index stride=0, Index offset=0) { ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; bool hasAlpha = alpha != Scalar(1); Index packet_cols = (cols/nr) * nr; Index count = 0; @@ -792,19 +802,19 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, PanelMode> if (hasAlpha) for(Index k=0; k<depth; k++) { - blockB[count+0] = alpha*b0[k]; - blockB[count+1] = alpha*b1[k]; - if(nr==4) blockB[count+2] = alpha*b2[k]; - if(nr==4) blockB[count+3] = alpha*b3[k]; + blockB[count+0] = alpha*cj(b0[k]); + blockB[count+1] = alpha*cj(b1[k]); + if(nr==4) blockB[count+2] = alpha*cj(b2[k]); + if(nr==4) blockB[count+3] = alpha*cj(b3[k]); count += nr; } else for(Index k=0; k<depth; k++) { - blockB[count+0] = b0[k]; - blockB[count+1] = b1[k]; - if(nr==4) blockB[count+2] = b2[k]; - if(nr==4) blockB[count+3] = b3[k]; + blockB[count+0] = cj(b0[k]); + blockB[count+1] = cj(b1[k]); + if(nr==4) blockB[count+2] = cj(b2[k]); + if(nr==4) blockB[count+3] = cj(b3[k]); count += nr; } // skip what we have after @@ -819,13 +829,13 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, PanelMode> if (hasAlpha) for(Index k=0; k<depth; k++) { - blockB[count] = alpha*b0[k]; + blockB[count] = alpha*cj(b0[k]); count += 1; } else for(Index k=0; k<depth; k++) { - blockB[count] = b0[k]; + blockB[count] = cj(b0[k]); count += 1; } if(PanelMode) count += (stride-offset-depth); @@ -834,14 +844,15 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, PanelMode> }; // this version is optimized for row major matrices -template<typename Scalar, typename Index, int nr, bool PanelMode> -struct ei_gemm_pack_rhs<Scalar, Index, nr, RowMajor, PanelMode> +template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> +struct ei_gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode> { enum { PacketSize = ei_packet_traits<Scalar>::size }; void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Scalar alpha, Index depth, Index cols, Index stride=0, Index offset=0) { ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; bool hasAlpha = alpha != Scalar(1); Index packet_cols = (cols/nr) * nr; Index count = 0; @@ -854,10 +865,10 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, RowMajor, PanelMode> for(Index k=0; k<depth; k++) { const Scalar* b0 = &rhs[k*rhsStride + j2]; - blockB[count+0] = alpha*b0[0]; - blockB[count+1] = alpha*b0[1]; - if(nr==4) blockB[count+2] = alpha*b0[2]; - if(nr==4) blockB[count+3] = alpha*b0[3]; + blockB[count+0] = alpha*cj(b0[0]); + blockB[count+1] = alpha*cj(b0[1]); + if(nr==4) blockB[count+2] = alpha*cj(b0[2]); + if(nr==4) blockB[count+3] = alpha*cj(b0[3]); count += nr; } } @@ -866,10 +877,10 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, RowMajor, PanelMode> for(Index k=0; k<depth; k++) { const Scalar* b0 = &rhs[k*rhsStride + j2]; - blockB[count+0] = b0[0]; - blockB[count+1] = b0[1]; - if(nr==4) blockB[count+2] = b0[2]; - if(nr==4) blockB[count+3] = b0[3]; + blockB[count+0] = cj(b0[0]); + blockB[count+1] = cj(b0[1]); + if(nr==4) blockB[count+2] = cj(b0[2]); + if(nr==4) blockB[count+3] = cj(b0[3]); count += nr; } } @@ -883,7 +894,7 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, RowMajor, PanelMode> const Scalar* b0 = &rhs[j2]; for(Index k=0; k<depth; k++) { - blockB[count] = alpha*b0[k*rhsStride]; + blockB[count] = alpha*cj(b0[k*rhsStride]); count += 1; } if(PanelMode) count += stride-offset-depth; diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 39b283a3f..2ae78c1e7 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -73,9 +73,6 @@ static void run(Index rows, Index cols, Index depth, ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - if (ConjugateRhs) - alpha = ei_conj(alpha); - typedef typename ei_packet_traits<Scalar>::type PacketType; typedef ei_product_blocking_traits<Scalar> Blocking; @@ -83,9 +80,18 @@ static void run(Index rows, Index cols, Index depth, Index mc = std::min(rows,blocking.mc()); // cache block size along the M direction //Index nc = blocking.nc(); // cache block size along the N direction - ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs; - ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs; - ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp; + // FIXME starting from SSE3, normal complex product cannot be optimized as well as + // conjugate product, therefore it is better to conjugate during the copies. + // With SSE2, this is the other way round. + ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder, ConjugateLhs> pack_lhs; + ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder, ConjugateRhs> pack_rhs; + ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr> gebp; + +// if (ConjugateRhs) +// alpha = ei_conj(alpha); +// ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs; +// ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs; +// ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp; #ifdef EIGEN_HAS_OPENMP if(info) @@ -237,7 +243,7 @@ struct ei_gemm_functor { if(cols==-1) cols = m_rhs.cols(); - + Gemm::run(rows, cols, m_lhs.cols(), (const Scalar*)&(m_lhs.const_cast_derived().coeffRef(row,0)), m_lhs.outerStride(), (const Scalar*)&(m_rhs.const_cast_derived().coeffRef(0,col)), m_rhs.outerStride(), diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 5d8da247c..e671a657e 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -48,18 +48,22 @@ void ei_cache_friendly_product_colmajor_times_vector( ei_pstore(&res[j], \ ei_padd(ei_pload(&res[j]), \ ei_padd( \ - ei_padd(cj.pmul(EIGEN_CAT(ei_ploa , A0)(&lhs0[j]), ptmp0), \ - cj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs1[j]), ptmp1)), \ - ei_padd(cj.pmul(EIGEN_CAT(ei_ploa , A2)(&lhs2[j]), ptmp2), \ - cj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs3[j]), ptmp3)) ))) - - ei_conj_helper<ConjugateLhs,ConjugateRhs> cj; - if(ConjugateRhs) - alpha = ei_conj(alpha); + ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A0)(&lhs0[j]), ptmp0), \ + pcj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs1[j]), ptmp1)), \ + ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A2)(&lhs2[j]), ptmp2), \ + pcj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs3[j]), ptmp3)) ))) typedef typename NumTraits<Scalar>::Real RealScalar; typedef typename ei_packet_traits<Scalar>::type Packet; - const Index PacketSize = sizeof(Packet)/sizeof(Scalar); + enum { + PacketSize = sizeof(Packet)/sizeof(Scalar), + Vectorizable = ei_packet_traits<Scalar>::Vectorizable + }; + + ei_conj_helper<Scalar,Scalar,ConjugateLhs,ConjugateRhs> cj; + ei_conj_helper<Packet,Packet,ConjugateLhs,ConjugateRhs> pcj; + if(ConjugateRhs) + alpha = ei_conj(alpha); enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned }; const Index columnsAtOnce = 4; @@ -84,7 +88,7 @@ void ei_cache_friendly_product_colmajor_times_vector( // find how many columns do we have to skip to be aligned with the result (if possible) Index skipColumns = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (size_t(lhs)%sizeof(RealScalar)) || (size_t(res)%sizeof(RealScalar)) ) + if( (size_t(lhs)%sizeof(Scalar)) || (size_t(res)%sizeof(Scalar)) ) { alignedSize = 0; alignedStart = 0; @@ -113,6 +117,12 @@ void ei_cache_friendly_product_colmajor_times_vector( || PacketSize > size || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0); } + else if(Vectorizable) + { + alignedStart = 0; + alignedSize = size; + alignmentPattern = AllAligned; + } Index offset1 = (FirstAligned && alignmentStep==1?3:1); Index offset3 = (FirstAligned && alignmentStep==1?1:3); @@ -127,7 +137,7 @@ void ei_cache_friendly_product_colmajor_times_vector( const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; - if (PacketSize>1) + if (Vectorizable) { /* explicit vectorization */ // process initial unaligned coeffs @@ -168,19 +178,19 @@ void ei_cache_friendly_product_colmajor_times_vector( A00 = ei_pload (&lhs0[j]); A10 = ei_pload (&lhs0[j+PacketSize]); - A00 = cj.pmadd(A00, ptmp0, ei_pload(&res[j])); - A10 = cj.pmadd(A10, ptmp0, ei_pload(&res[j+PacketSize])); + A00 = pcj.pmadd(A00, ptmp0, ei_pload(&res[j])); + A10 = pcj.pmadd(A10, ptmp0, ei_pload(&res[j+PacketSize])); - A00 = cj.pmadd(A01, ptmp1, A00); + A00 = pcj.pmadd(A01, ptmp1, A00); A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01); - A00 = cj.pmadd(A02, ptmp2, A00); + A00 = pcj.pmadd(A02, ptmp2, A00); A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02); - A00 = cj.pmadd(A03, ptmp3, A00); + A00 = pcj.pmadd(A03, ptmp3, A00); ei_pstore(&res[j],A00); A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03); - A10 = cj.pmadd(A11, ptmp1, A10); - A10 = cj.pmadd(A12, ptmp2, A10); - A10 = cj.pmadd(A13, ptmp3, A10); + A10 = pcj.pmadd(A11, ptmp1, A10); + A10 = pcj.pmadd(A12, ptmp2, A10); + A10 = pcj.pmadd(A13, ptmp3, A10); ei_pstore(&res[j+PacketSize],A10); } } @@ -215,7 +225,7 @@ void ei_cache_friendly_product_colmajor_times_vector( Packet ptmp0 = ei_pset1(alpha*rhs[i]); const Scalar* lhs0 = lhs + i*lhsStride; - if (PacketSize>1) + if (Vectorizable) { /* explicit vectorization */ // process first unaligned result's coeffs @@ -225,10 +235,10 @@ void ei_cache_friendly_product_colmajor_times_vector( // process aligned result's coeffs if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0) for (Index j = alignedStart;j<alignedSize;j+=PacketSize) - ei_pstore(&res[j], cj.pmadd(ei_pload(&lhs0[j]), ptmp0, ei_pload(&res[j]))); + ei_pstore(&res[j], pcj.pmadd(ei_pload(&lhs0[j]), ptmp0, ei_pload(&res[j]))); else for (Index j = alignedStart;j<alignedSize;j+=PacketSize) - ei_pstore(&res[j], cj.pmadd(ei_ploadu(&lhs0[j]), ptmp0, ei_pload(&res[j]))); + ei_pstore(&res[j], pcj.pmadd(ei_ploadu(&lhs0[j]), ptmp0, ei_pload(&res[j]))); } // process remaining scalars (or all if no explicit vectorization) @@ -243,7 +253,7 @@ void ei_cache_friendly_product_colmajor_times_vector( } else break; - } while(PacketSize>1); + } while(Vectorizable); #undef _EIGEN_ACCUMULATE_PACKETS } @@ -261,16 +271,20 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\ Packet b = ei_pload(&rhs[j]); \ - ptmp0 = cj.pmadd(EIGEN_CAT(ei_ploa,A0) (&lhs0[j]), b, ptmp0); \ - ptmp1 = cj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs1[j]), b, ptmp1); \ - ptmp2 = cj.pmadd(EIGEN_CAT(ei_ploa,A2) (&lhs2[j]), b, ptmp2); \ - ptmp3 = cj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs3[j]), b, ptmp3); } - - ei_conj_helper<ConjugateLhs,ConjugateRhs> cj; + ptmp0 = pcj.pmadd(EIGEN_CAT(ei_ploa,A0) (&lhs0[j]), b, ptmp0); \ + ptmp1 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs1[j]), b, ptmp1); \ + ptmp2 = pcj.pmadd(EIGEN_CAT(ei_ploa,A2) (&lhs2[j]), b, ptmp2); \ + ptmp3 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs3[j]), b, ptmp3); } typedef typename NumTraits<Scalar>::Real RealScalar; typedef typename ei_packet_traits<Scalar>::type Packet; - const Index PacketSize = sizeof(Packet)/sizeof(Scalar); + enum { + PacketSize = sizeof(Packet)/sizeof(Scalar), + Vectorizable = ei_packet_traits<Scalar>::Vectorizable + }; + + ei_conj_helper<Scalar,Scalar,ConjugateLhs,ConjugateRhs> cj; + ei_conj_helper<Packet,Packet,ConjugateLhs,ConjugateRhs> pcj; enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 }; const Index rowsAtOnce = 4; @@ -297,7 +311,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( // find how many rows do we have to skip to be aligned with rhs (if possible) Index skipRows = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (size_t(lhs)%sizeof(RealScalar)) || (size_t(rhs)%sizeof(RealScalar)) ) + if( (size_t(lhs)%sizeof(Scalar)) || (size_t(rhs)%sizeof(Scalar)) ) { alignedSize = 0; alignedStart = 0; @@ -326,6 +340,12 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( || PacketSize > rhsSize || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0); } + else if(Vectorizable) + { + alignedStart = 0; + alignedSize = size; + alignmentPattern = AllAligned; + } Index offset1 = (FirstAligned && alignmentStep==1?3:1); Index offset3 = (FirstAligned && alignmentStep==1?1:3); @@ -333,13 +353,14 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( Index rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; for (Index i=skipRows; i<rowBound; i+=rowsAtOnce) { - Scalar tmp0 = Scalar(0), tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0); + EIGEN_ALIGN16 Scalar tmp0 = Scalar(0); + Scalar tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0); // this helps the compiler generating good binary code const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; - if (PacketSize>1) + if (Vectorizable) { /* explicit vectorization */ Packet ptmp0 = ei_pset1(Scalar(0)), ptmp1 = ei_pset1(Scalar(0)), ptmp2 = ei_pset1(Scalar(0)), ptmp3 = ei_pset1(Scalar(0)); @@ -386,19 +407,19 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12); A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13); - ptmp0 = cj.pmadd(ei_pload (&lhs0[j]), b, ptmp0); - ptmp1 = cj.pmadd(A01, b, ptmp1); + ptmp0 = pcj.pmadd(ei_pload (&lhs0[j]), b, ptmp0); + ptmp1 = pcj.pmadd(A01, b, ptmp1); A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01); - ptmp2 = cj.pmadd(A02, b, ptmp2); + ptmp2 = pcj.pmadd(A02, b, ptmp2); A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02); - ptmp3 = cj.pmadd(A03, b, ptmp3); + ptmp3 = pcj.pmadd(A03, b, ptmp3); A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03); b = ei_pload(&rhs[j+PacketSize]); - ptmp0 = cj.pmadd(ei_pload (&lhs0[j+PacketSize]), b, ptmp0); - ptmp1 = cj.pmadd(A11, b, ptmp1); - ptmp2 = cj.pmadd(A12, b, ptmp2); - ptmp3 = cj.pmadd(A13, b, ptmp3); + ptmp0 = pcj.pmadd(ei_pload (&lhs0[j+PacketSize]), b, ptmp0); + ptmp1 = pcj.pmadd(A11, b, ptmp1); + ptmp2 = pcj.pmadd(A12, b, ptmp2); + ptmp3 = pcj.pmadd(A13, b, ptmp3); } } for (Index j = peeledSize; j<alignedSize; j+=PacketSize) @@ -434,7 +455,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( { for (Index i=start; i<end; ++i) { - Scalar tmp0 = Scalar(0); + EIGEN_ALIGN16 Scalar tmp0 = Scalar(0); Packet ptmp0 = ei_pset1(tmp0); const Scalar* lhs0 = lhs + i*lhsStride; // process first unaligned result's coeffs @@ -447,10 +468,10 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( // process aligned rhs coeffs if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0) for (Index j = alignedStart;j<alignedSize;j+=PacketSize) - ptmp0 = cj.pmadd(ei_pload(&lhs0[j]), ei_pload(&rhs[j]), ptmp0); + ptmp0 = pcj.pmadd(ei_pload(&lhs0[j]), ei_pload(&rhs[j]), ptmp0); else for (Index j = alignedStart;j<alignedSize;j+=PacketSize) - ptmp0 = cj.pmadd(ei_ploadu(&lhs0[j]), ei_pload(&rhs[j]), ptmp0); + ptmp0 = pcj.pmadd(ei_ploadu(&lhs0[j]), ei_pload(&rhs[j]), ptmp0); tmp0 += ei_predux(ptmp0); } @@ -468,7 +489,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( } else break; - } while(PacketSize>1); + } while(Vectorizable); #undef _EIGEN_ACCUMULATE_PACKETS } diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 24b711ced..d8fa1bd9c 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -270,7 +270,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; - ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel; + ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; ei_symm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs; ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs; ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed; @@ -353,7 +353,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,Conjugat Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; - ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel; + ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs; ei_symm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs; diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index d6933adb6..4514c7692 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -46,8 +46,11 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector( FirstTriangular = IsRowMajor == IsLower }; - ei_conj_helper<NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0; - ei_conj_helper<NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1; + ei_conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0; + ei_conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1; + + ei_conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> pcj0; + ei_conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1; Scalar cjAlpha = ConjugateRhs ? ei_conj(alpha) : alpha; @@ -121,9 +124,9 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector( Packet Bi = ei_ploadu(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases Packet Xi = ei_pload (resIt); - Xi = cj0.pmadd(A0i,ptmp0, cj0.pmadd(A1i,ptmp1,Xi)); - ptmp2 = cj1.pmadd(A0i, Bi, ptmp2); - ptmp3 = cj1.pmadd(A1i, Bi, ptmp3); + Xi = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi)); + ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2); + ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3); ei_pstore(resIt,Xi); resIt += PacketSize; } for (size_t i=alignedEnd; i<endi; i++) diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h index eaf634de3..40c0c9aac 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -32,7 +32,7 @@ **********************************************************************/ // forward declarations (defined at the end of this file) -template<typename Scalar, typename Index, int mr, int nr, typename Conj, int UpLo> +template<typename Scalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo> struct ei_sybb_kernel; /* Optimized selfadjoint product (_SYRK) */ @@ -84,12 +84,15 @@ struct ei_selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpL Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; // note that the actual rhs is the transpose/adjoint of mat - typedef ei_conj_helper<NumTraits<Scalar>::IsComplex && !AAT, NumTraits<Scalar>::IsComplex && AAT> Conj; + enum { + ConjLhs = NumTraits<Scalar>::IsComplex && !AAT, + ConjRhs = NumTraits<Scalar>::IsComplex && AAT + }; - ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, Conj> gebp_kernel; + ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjLhs, ConjRhs> gebp_kernel; ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,MatStorageOrder==RowMajor ? ColMajor : RowMajor> pack_rhs; ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,MatStorageOrder, false> pack_lhs; - ei_sybb_kernel<Scalar, Index, Blocking::mr, Blocking::nr, Conj, UpLo> sybb; + ei_sybb_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjLhs, ConjRhs, UpLo> sybb; for(Index k2=0; k2<depth; k2+=kc) { @@ -163,7 +166,7 @@ SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo> // while the selfadjoint block overlapping the diagonal is evaluated into a // small temporary buffer which is then accumulated into the result using a // triangular traversal. -template<typename Scalar, typename Index, int mr, int nr, typename Conj, int UpLo> +template<typename Scalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo> struct ei_sybb_kernel { enum { @@ -172,7 +175,7 @@ struct ei_sybb_kernel }; void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB, Index size, Index depth, Scalar* workspace) { - ei_gebp_kernel<Scalar, Index, mr, nr, Conj> gebp_kernel; + ei_gebp_kernel<Scalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel; Matrix<Scalar,BlockSize,BlockSize,ColMajor> buffer; // let's process the block per panel of actual_mc x BlockSize, diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index fd0a7c2b2..be9362958 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -129,7 +129,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true, triangularBuffer.setZero(); triangularBuffer.diagonal().setOnes(); - ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel; + ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs; ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs; @@ -254,10 +254,10 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false, triangularBuffer.setZero(); triangularBuffer.diagonal().setOnes(); - ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel; + ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs; ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs; - ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder,true> pack_rhs_panel; + ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder,false,true> pack_rhs_panel; for(Index k2=IsLower ? 0 : depth; IsLower ? k2<depth : k2>0; diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index 7038627fb..0fce7159e 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -74,9 +74,9 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; ei_conj_if<Conjugate> conj; - ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ei_conj_helper<Conjugate,false> > gebp_kernel; + ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, Conjugate, false> gebp_kernel; ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,TriStorageOrder> pack_lhs; - ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, ColMajor, true> pack_rhs; + ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, ColMajor, false, true> pack_rhs; for(Index k2=IsLower ? 0 : size; IsLower ? k2<size : k2>0; @@ -212,9 +212,9 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; ei_conj_if<Conjugate> conj; - ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ei_conj_helper<false,Conjugate> > gebp_kernel; + ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, false, Conjugate> gebp_kernel; ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs; - ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder,true> pack_rhs_panel; + ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder,false,true> pack_rhs_panel; ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, ColMajor, false, true> pack_lhs_panel; for(Index k2=IsLower ? size : 0; diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 8bcd8c95f..38c86511c 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -29,10 +29,10 @@ // implement and control fast level 2 and level 3 BLAS-like routines. // forward declarations -template<typename Scalar, typename Index, int mr, int nr, typename Conj> +template<typename Scalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false> struct ei_gebp_kernel; -template<typename Scalar, typename Index, int nr, int StorageOrder, bool PanelMode=false> +template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false> struct ei_gemm_pack_rhs; template<typename Scalar, typename Index, int mr, int StorageOrder, bool Conjugate = false, bool PanelMode = false> @@ -53,46 +53,40 @@ template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index, static void ei_cache_friendly_product_rowmajor_times_vector( const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsSize, ResType& res, Scalar alpha); -// Provides scalar/packet-wise product and product with accumulation -// with optional conjugation of the arguments. -template<bool ConjLhs, bool ConjRhs> struct ei_conj_helper; - -template<> struct ei_conj_helper<false,false> +template<typename Scalar> struct ei_conj_helper<Scalar,Scalar,false,false> { - template<typename T> - EIGEN_STRONG_INLINE T pmadd(const T& x, const T& y, const T& c) const { return ei_pmadd(x,y,c); } - template<typename T> - EIGEN_STRONG_INLINE T pmul(const T& x, const T& y) const { return ei_pmul(x,y); } + EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return ei_pmadd(x,y,c); } + EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const { return ei_pmul(x,y); } }; -template<> struct ei_conj_helper<false,true> +template<typename RealScalar> struct ei_conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, false,true> { - template<typename T> std::complex<T> - pmadd(const std::complex<T>& x, const std::complex<T>& y, const std::complex<T>& c) const + typedef std::complex<RealScalar> Scalar; + EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return c + pmul(x,y); } - template<typename T> std::complex<T> pmul(const std::complex<T>& x, const std::complex<T>& y) const - { return std::complex<T>(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_imag(x)*ei_real(y) - ei_real(x)*ei_imag(y)); } + EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const + { return Scalar(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_imag(x)*ei_real(y) - ei_real(x)*ei_imag(y)); } }; -template<> struct ei_conj_helper<true,false> +template<typename RealScalar> struct ei_conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, true,false> { - template<typename T> std::complex<T> - pmadd(const std::complex<T>& x, const std::complex<T>& y, const std::complex<T>& c) const + typedef std::complex<RealScalar> Scalar; + EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return c + pmul(x,y); } - template<typename T> std::complex<T> pmul(const std::complex<T>& x, const std::complex<T>& y) const - { return std::complex<T>(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } + EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const + { return Scalar(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } }; -template<> struct ei_conj_helper<true,true> +template<typename RealScalar> struct ei_conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, true,true> { - template<typename T> std::complex<T> - pmadd(const std::complex<T>& x, const std::complex<T>& y, const std::complex<T>& c) const + typedef std::complex<RealScalar> Scalar; + EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return c + pmul(x,y); } - template<typename T> std::complex<T> pmul(const std::complex<T>& x, const std::complex<T>& y) const - { return std::complex<T>(ei_real(x)*ei_real(y) - ei_imag(x)*ei_imag(y), - ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } + EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const + { return Scalar(ei_real(x)*ei_real(y) - ei_imag(x)*ei_imag(y), - ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } }; // Lightweight helper class to access matrix coefficients. @@ -140,6 +134,18 @@ struct ei_product_blocking_traits }; }; +template<typename Real> +struct ei_product_blocking_traits<std::complex<Real> > +{ + typedef std::complex<Real> Scalar; + typedef typename ei_packet_traits<Scalar>::type PacketType; + enum { + PacketSize = sizeof(PacketType)/sizeof(Scalar), + nr = 2, + mr = 2 * PacketSize + }; +}; + /* Helper class to analyze the factors of a Product expression. * In particular it allows to pop out operator-, scalar multiples, * and conjugate */ diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index ef923d867..310ffa4b3 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -106,9 +106,14 @@ template<typename Lhs, typename Rhs, int ProductType = ei_product_type<Lhs,Rhs>::value> struct ProductReturnType; +// Provides scalar/packet-wise product and product with accumulation +// with optional conjugation of the arguments. +template<typename LhsScalar, typename RhsScalar, bool ConjLhs, bool ConjRhs> struct ei_conj_helper; + template<typename Scalar> struct ei_scalar_sum_op; template<typename Scalar> struct ei_scalar_difference_op; template<typename Scalar> struct ei_scalar_product_op; +template<typename Scalar> struct ei_scalar_conj_product_op; template<typename Scalar> struct ei_scalar_quotient_op; template<typename Scalar> struct ei_scalar_opposite_op; template<typename Scalar> struct ei_scalar_conjugate_op; diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index f33b576ea..c93398092 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -151,7 +151,7 @@ class ei_compute_matrix_flags ) ) ? AlignedBit : 0, - packet_access_bit = ei_packet_traits<Scalar>::size > 1 && aligned_bit ? PacketAccessBit : 0 + packet_access_bit = ei_packet_traits<Scalar>::Vectorizable && aligned_bit ? PacketAccessBit : 0 }; public: diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index 49a0d8f5d..94bb5569e 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -324,7 +324,7 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY& const Packet pc = ei_pset1(Scalar(j.c())); const Packet ps = ei_pset1(Scalar(j.s())); - ei_conj_helper<NumTraits<Scalar>::IsComplex,false> cj; + ei_conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex,false> pcj; for(Index i=0; i<alignedStart; ++i) { @@ -343,7 +343,7 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY& { Packet xi = ei_pload(px); Packet yi = ei_pload(py); - ei_pstore(px, ei_padd(ei_pmul(pc,xi),cj.pmul(ps,yi))); + ei_pstore(px, ei_padd(ei_pmul(pc,xi),pcj.pmul(ps,yi))); ei_pstore(py, ei_psub(ei_pmul(pc,yi),ei_pmul(ps,xi))); px += PacketSize; py += PacketSize; @@ -358,8 +358,8 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY& Packet xi1 = ei_ploadu(px+PacketSize); Packet yi = ei_pload (py); Packet yi1 = ei_pload (py+PacketSize); - ei_pstoreu(px, ei_padd(ei_pmul(pc,xi),cj.pmul(ps,yi))); - ei_pstoreu(px+PacketSize, ei_padd(ei_pmul(pc,xi1),cj.pmul(ps,yi1))); + ei_pstoreu(px, ei_padd(ei_pmul(pc,xi),pcj.pmul(ps,yi))); + ei_pstoreu(px+PacketSize, ei_padd(ei_pmul(pc,xi1),pcj.pmul(ps,yi1))); ei_pstore (py, ei_psub(ei_pmul(pc,yi),ei_pmul(ps,xi))); ei_pstore (py+PacketSize, ei_psub(ei_pmul(pc,yi1),ei_pmul(ps,xi1))); px += Peeling*PacketSize; @@ -369,7 +369,7 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY& { Packet xi = ei_ploadu(x+peelingEnd); Packet yi = ei_pload (y+peelingEnd); - ei_pstoreu(x+peelingEnd, ei_padd(ei_pmul(pc,xi),cj.pmul(ps,yi))); + ei_pstoreu(x+peelingEnd, ei_padd(ei_pmul(pc,xi),pcj.pmul(ps,yi))); ei_pstore (y+peelingEnd, ei_psub(ei_pmul(pc,yi),ei_pmul(ps,xi))); } } |