diff options
author | 2019-03-19 16:52:38 -0400 | |
---|---|---|
committer | 2019-03-19 16:52:38 -0400 | |
commit | 2dbea5510fe5cb64dbfdef9042c04a3a92b87f76 (patch) | |
tree | c187e7ec5e90a191e19466ff6084dd8f053dba7e /Eigen/src/Core/products | |
parent | e7e6809e6b38a5928efc0b5ca9520258e4d1fb3a (diff) | |
parent | 5c93b38c5fca514a08084e32feb8a8fb27bf3665 (diff) |
Merged eigen/eigen into default
Diffstat (limited to 'Eigen/src/Core/products')
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 1371 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixMatrix.h | 14 | ||||
-rw-r--r-- | Eigen/src/Core/products/Parallelizer.h | 3 | ||||
-rw-r--r-- | Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 23 |
4 files changed, 992 insertions, 419 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index e7cab4720..fdd0ec0e9 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -15,7 +15,13 @@ namespace Eigen { namespace internal { -template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false> +enum PacketSizeType { + PacketFull = 0, + PacketHalf, + PacketQuarter +}; + +template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=PacketFull> class gebp_traits; @@ -101,6 +107,16 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // at the register level. This small horizontal panel has to stay within L1 cache. std::ptrdiff_t l1, l2, l3; manage_caching_sizes(GetAction, &l1, &l2, &l3); + #ifdef EIGEN_VECTORIZE_AVX512 + // We need to find a rationale for that, but without this adjustment, + // performance with AVX512 is pretty bad, like -20% slower. + // One reason is that with increasing packet-size, the blocking size k + // has to become pretty small if we want that 1 lhs panel fit within L1. + // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are: + // k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144. + // This is quite small for a good reuse of the accumulation registers. + l1 *= 4; + #endif if (num_threads > 1) { typedef typename Traits::ResScalar ResScalar; @@ -337,6 +353,61 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ // #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); #endif +template <typename RhsPacket, typename RhsPacketx4, int registers_taken> +struct RhsPanelHelper { + private: + static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken; + public: + typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type; +}; + +template <typename Packet> +struct QuadPacket +{ + Packet B_0, B1, B2, B3; + const Packet& get(const FixedInt<0>&) const { return B_0; } + const Packet& get(const FixedInt<1>&) const { return B1; } + const Packet& get(const FixedInt<2>&) const { return B2; } + const Packet& get(const FixedInt<3>&) const { return B3; } +}; + +template <int N, typename T1, typename T2, typename T3> +struct packet_conditional { typedef T3 type; }; + +template <typename T1, typename T2, typename T3> +struct packet_conditional<PacketFull, T1, T2, T3> { typedef T1 type; }; + +template <typename T1, typename T2, typename T3> +struct packet_conditional<PacketHalf, T1, T2, T3> { typedef T2 type; }; + +#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \ + typedef typename packet_conditional<packet_size, \ + typename packet_traits<name ## Scalar>::type, \ + typename packet_traits<name ## Scalar>::half, \ + typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \ + prefix ## name ## Packet + +#define PACKET_DECL_COND(name, packet_size) \ + typedef typename packet_conditional<packet_size, \ + typename packet_traits<name ## Scalar>::type, \ + typename packet_traits<name ## Scalar>::half, \ + typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \ + name ## Packet + +#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \ + typedef typename packet_conditional<packet_size, \ + typename packet_traits<Scalar>::type, \ + typename packet_traits<Scalar>::half, \ + typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \ + prefix ## ScalarPacket + +#define PACKET_DECL_COND_SCALAR(packet_size) \ + typedef typename packet_conditional<packet_size, \ + typename packet_traits<Scalar>::type, \ + typename packet_traits<Scalar>::half, \ + typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \ + ScalarPacket + /* Vectorization logic * real*real: unpack rhs to constant packets, ... * @@ -347,7 +418,7 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ * cplx*real : unpack rhs to constant packets, ... * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual */ -template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs> +template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize> class gebp_traits { public: @@ -355,13 +426,17 @@ public: typedef _RhsScalar RhsScalar; typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, - Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, - RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, - ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, @@ -370,10 +445,12 @@ public: // register block size along the M direction (currently, this one cannot be modified) default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, -#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) - // we assume 16 registers +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \ + && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914)) + // we assume 16 registers or more // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined, // then using 3*LhsPacketSize triggers non-implemented paths in syrk. + // Bug 1515: MSVC prior to v19.14 yields to register spilling. mr = Vectorizable ? 3*LhsPacketSize : default_mr, #else mr = default_mr, @@ -383,38 +460,41 @@ public: RhsProgress = 1 }; - typedef typename packet_traits<LhsScalar>::type _LhsPacket; - typedef typename packet_traits<RhsScalar>::type _RhsPacket; - typedef typename packet_traits<ResScalar>::type _ResPacket; typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; typedef LhsPacket LhsPacket4Packing; + typedef QuadPacket<RhsPacket> RhsPacketx4; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - pbroadcast4(b, b0, b1, b2, b3); - } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// pbroadcast2(b, b0, b1); -// } - + template<typename RhsPacketType> EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { dest = pset1<RhsPacketType>(*b); } - + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + { + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad<RhsPacket>(b); @@ -432,8 +512,8 @@ public: dest = ploadu<LhsPacketType>(a); } - template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType> - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const + template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj; // It would be a lot cleaner to call pmadd all the time. Unfortunately if we @@ -448,6 +528,12 @@ public: #endif } + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { r = pmadd(c,alpha,r); @@ -461,21 +547,25 @@ public: }; -template<typename RealScalar, bool _ConjLhs> -class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false> +template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize> +class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize> { public: typedef std::complex<RealScalar> LhsScalar; typedef RealScalar RhsScalar; typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = false, - Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, - RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, - ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = 4, @@ -490,15 +580,13 @@ public: RhsProgress = 1 }; - typedef typename packet_traits<LhsScalar>::type _LhsPacket; - typedef typename packet_traits<RhsScalar>::type _RhsPacket; - typedef typename packet_traits<ResScalar>::type _ResPacket; - typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; typedef LhsPacket LhsPacket4Packing; + typedef QuadPacket<RhsPacket> RhsPacketx4; + typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) @@ -506,42 +594,64 @@ public: p = pset1<ResPacket>(ResScalar(0)); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - dest = pset1<RhsPacket>(*b); + dest = pset1<RhsPacketType>(*b); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - dest = pset1<RhsPacket>(*b); + loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type()); } - EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const { - dest = pload<LhsPacket>(a); + // FIXME we can do better! + // what we want here is a ploadheight + RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]}; + dest = ploadquad<RhsPacket>(tmp); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const { - dest = ploadu<LhsPacket>(a); + eigen_internal_assert(RhsPacketSize<=8); + dest = pset1<RhsPacket>(*b); } - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { - pbroadcast4(b, b0, b1, b2, b3); + dest = pload<LhsPacket>(a); } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// pbroadcast2(b, b0, b1); -// } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + template<typename LhsPacketType> + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const + { + dest = ploadu<LhsPacketType>(a); + } + + template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type()); } - EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType> + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const { #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); @@ -556,13 +666,20 @@ public: c += a * b; } - EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } + + template <typename ResPacketType, typename AccPacketType> + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { + conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj; r = cj.pmadd(c,alpha,r); } protected: - conj_helper<ResPacket,ResPacket,ConjLhs,false> cj; }; template<typename Packet> @@ -581,13 +698,57 @@ DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Pack return res; } +// note that for DoublePacket<RealPacket> the "4" in "downto4" +// corresponds to the number of complexes, so it means "8" +// it terms of real coefficients. + template<typename Packet> -const DoublePacket<Packet>& predux_half_dowto4(const DoublePacket<Packet> &a) +const DoublePacket<Packet>& +predux_half_dowto4(const DoublePacket<Packet> &a, + typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0) { return a; } -template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; }; +template<typename Packet> +DoublePacket<typename unpacket_traits<Packet>::half> +predux_half_dowto4(const DoublePacket<Packet> &a, + typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0) +{ + // yes, that's pretty hackish :( + DoublePacket<typename unpacket_traits<Packet>::half> res; + typedef std::complex<typename unpacket_traits<Packet>::type> Cplx; + typedef typename packet_traits<Cplx>::type CplxPacket; + res.first = predux_half_dowto4(CplxPacket(a.first)).v; + res.second = predux_half_dowto4(CplxPacket(a.second)).v; + return res; +} + +// same here, "quad" actually means "8" in terms of real coefficients +template<typename Scalar, typename RealPacket> +void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest, + typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0) +{ + dest.first = pset1<RealPacket>(real(*b)); + dest.second = pset1<RealPacket>(imag(*b)); +} + +template<typename Scalar, typename RealPacket> +void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest, + typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0) +{ + // yes, that's pretty hackish too :( + typedef typename NumTraits<Scalar>::Real RealScalar; + RealScalar r[4] = {real(b[0]), real(b[0]), real(b[1]), real(b[1])}; + RealScalar i[4] = {imag(b[0]), imag(b[0]), imag(b[1]), imag(b[1])}; + dest.first = ploadquad<RealPacket>(r); + dest.second = ploadquad<RealPacket>(i); +} + + +template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { + typedef DoublePacket<typename unpacket_traits<Packet>::half> half; +}; // template<typename Packet> // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b) // { @@ -597,8 +758,8 @@ template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typede // return res; // } -template<typename RealScalar, bool _ConjLhs, bool _ConjRhs> -class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs > +template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize> +class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize > { public: typedef std::complex<RealScalar> Scalar; @@ -606,15 +767,21 @@ public: typedef std::complex<RealScalar> RhsScalar; typedef std::complex<RealScalar> ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND(Real, _PacketSize); + PACKET_DECL_COND_SCALAR(_PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, - Vectorizable = packet_traits<RealScalar>::Vectorizable - && packet_traits<Scalar>::Vectorizable, - RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1, - ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, - LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, - RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, + Vectorizable = unpacket_traits<RealPacket>::vectorizable + && unpacket_traits<ScalarPacket>::vectorizable, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1, + RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1, // FIXME: should depend on NumberOfRegisters nr = 4, @@ -624,15 +791,16 @@ public: RhsProgress = 1 }; - typedef typename packet_traits<RealScalar>::type RealPacket; - typedef typename packet_traits<Scalar>::type ScalarPacket; - typedef DoublePacket<RealPacket> DoublePacketType; + typedef DoublePacket<RealPacket> DoublePacketType; typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing; typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket; typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket; typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket; typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket; + + // this actualy holds 8 packets! + typedef QuadPacket<RhsPacket> RhsPacketx4; EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } @@ -643,51 +811,49 @@ public: } // Scalar path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const { - dest = pset1<ResPacket>(*b); + dest = pset1<ScalarPacket>(*b); } // Vectorized path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const + template<typename RealPacketType> + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const { - dest.first = pset1<RealPacket>(real(*b)); - dest.second = pset1<RealPacket>(imag(*b)); + dest.first = pset1<RealPacketType>(real(*b)); + dest.second = pset1<RealPacketType>(imag(*b)); } - - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - loadRhs(b,dest); + loadRhs(b, dest.B_0); + loadRhs(b + 1, dest.B1); + loadRhs(b + 2, dest.B2); + loadRhs(b + 3, dest.B3); } - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const + + // Scalar path + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const { - eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4); - loadRhs(b,dest); + loadRhs(b, dest); } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + + // Vectorized path + template<typename RealPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); - loadRhs(b+2, b2); - loadRhs(b+3, b3); + loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} - // Vectorized path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1) + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); + loadRhs(b,dest); } - - // Scalar path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1) + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); + loadQuadToDoublePacket(b,dest); } // nothing special here @@ -696,47 +862,59 @@ public: dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a)); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template<typename LhsPacketType> + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a)); + dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a)); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const + template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType> + EIGEN_STRONG_INLINE + typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type + madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const + template<typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const { c = cj.pmadd(a,b,c); } + + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } - EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const + template<typename RealPacketType, typename ResPacketType> + EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const { // assemble c - ResPacket tmp; + ResPacketType tmp; if((!ConjLhs)&&(!ConjRhs)) { - tmp = pcplxflip(pconj(ResPacket(c.second))); - tmp = padd(ResPacket(c.first),tmp); + tmp = pcplxflip(pconj(ResPacketType(c.second))); + tmp = padd(ResPacketType(c.first),tmp); } else if((!ConjLhs)&&(ConjRhs)) { - tmp = pconj(pcplxflip(ResPacket(c.second))); - tmp = padd(ResPacket(c.first),tmp); + tmp = pconj(pcplxflip(ResPacketType(c.second))); + tmp = padd(ResPacketType(c.first),tmp); } else if((ConjLhs)&&(!ConjRhs)) { - tmp = pcplxflip(ResPacket(c.second)); - tmp = padd(pconj(ResPacket(c.first)),tmp); + tmp = pcplxflip(ResPacketType(c.second)); + tmp = padd(pconj(ResPacketType(c.first)),tmp); } else if((ConjLhs)&&(ConjRhs)) { - tmp = pcplxflip(ResPacket(c.second)); - tmp = psub(pconj(ResPacket(c.first)),tmp); + tmp = pcplxflip(ResPacketType(c.second)); + tmp = psub(pconj(ResPacketType(c.first)),tmp); } r = pmadd(tmp,alpha,r); @@ -746,8 +924,8 @@ protected: conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj; }; -template<typename RealScalar, bool _ConjRhs> -class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs > +template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize> +class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize > { public: typedef std::complex<RealScalar> Scalar; @@ -755,14 +933,25 @@ public: typedef Scalar RhsScalar; typedef Scalar ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Real, _PacketSize); + PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize); + +#undef PACKET_DECL_COND_SCALAR_PREFIX +#undef PACKET_DECL_COND_PREFIX +#undef PACKET_DECL_COND_SCALAR +#undef PACKET_DECL_COND + enum { ConjLhs = false, ConjRhs = _ConjRhs, - Vectorizable = packet_traits<RealScalar>::Vectorizable - && packet_traits<Scalar>::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, - RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, - ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, + Vectorizable = unpacket_traits<_RealPacket>::vectorizable + && unpacket_traits<_ScalarPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, // FIXME: should depend on NumberOfRegisters @@ -773,15 +962,11 @@ public: RhsProgress = 1 }; - typedef typename packet_traits<LhsScalar>::type _LhsPacket; - typedef typename packet_traits<RhsScalar>::type _RhsPacket; - typedef typename packet_traits<ResScalar>::type _ResPacket; - typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; typedef LhsPacket LhsPacket4Packing; - + typedef QuadPacket<RhsPacket> RhsPacketx4; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) @@ -789,22 +974,25 @@ public: p = pset1<ResPacket>(ResScalar(0)); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - dest = pset1<RhsPacket>(*b); + dest = pset1<RhsPacketType>(*b); } - - void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - pbroadcast4(b, b0, b1, b2, b3); + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// // FIXME not sure that's the best way to implement it! -// b0 = pload1<RhsPacket>(b+0); -// b1 = pload1<RhsPacket>(b+1); -// } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { @@ -813,21 +1001,23 @@ public: EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4); - loadRhs(b,dest); + dest = ploadquad<RhsPacket>(b); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template<typename LhsPacketType> + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploaddup<LhsPacket>(a); + dest = ploaddup<LhsPacketType>(a); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type()); } - EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType> + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const { #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); @@ -843,16 +1033,166 @@ public: c += a * b; } - EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } + + template <typename ResPacketType, typename AccPacketType> + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { + conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj; r = cj.pmadd(alpha,c,r); } protected: - conj_helper<ResPacket,ResPacket,false,ConjRhs> cj; + +}; + + +#if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON + +template<> +struct gebp_traits <float, float, false, false,Architecture::NEON,PacketFull> + : gebp_traits<float,float,false,false,Architecture::Generic,PacketFull> +{ + typedef float RhsPacket; + + typedef float32x4_t RhsPacketx4; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + dest = vld1q_f32(b); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const + {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { + c = vfmaq_n_f32(c, a, b); + } + + // NOTE: Template parameter inference failed when compiled with Android NDK: + // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>". + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { madd_helper<0>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const + { madd_helper<1>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const + { madd_helper<2>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const + { madd_helper<3>(a, b, c); } + + private: + template<int LaneID> + EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const + { + #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f32 is implemented through a costly dup + if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : ); + #else + c = vfmaq_laneq_f32(c, a, b, LaneID); + #endif + } +}; + + +template<> +struct gebp_traits <double, double, false, false,Architecture::NEON> + : gebp_traits<double,double,false,false,Architecture::Generic> +{ + typedef double RhsPacket; + + struct RhsPacketx4 { + float64x2_t B_0, B_1; + }; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + dest.B_0 = vld1q_f64(b); + dest.B_1 = vld1q_f64(b+2); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const + {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { + c = vfmaq_n_f64(c, a, b); + } + + // NOTE: Template parameter inference failed when compiled with Android NDK: + // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>". + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { madd_helper<0>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const + { madd_helper<1>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const + { madd_helper<2>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const + { madd_helper<3>(a, b, c); } + + private: + template <int LaneID> + EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const + { + #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f64 is implemented through a costly dup + if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); + else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); + else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); + else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); + #else + if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0); + else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1); + else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0); + else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1); + #endif + } }; -/* optimized GEneral packed Block * packed Panel product kernel +#endif + +/* optimized General packed Block * packed Panel product kernel * * Mixing type logic: C += A * B * | A | B | comments @@ -862,26 +1202,47 @@ protected: template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> struct gebp_kernel { - typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits; + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits; + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,PacketHalf> HalfTraits; + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,PacketQuarter> QuarterTraits; + typedef typename Traits::ResScalar ResScalar; typedef typename Traits::LhsPacket LhsPacket; typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; + typedef typename Traits::RhsPacketx4 RhsPacketx4; + + typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15; + + typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits; - typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits; typedef typename SwappedTraits::ResScalar SResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; typedef typename SwappedTraits::RhsPacket SRhsPacket; typedef typename SwappedTraits::ResPacket SResPacket; typedef typename SwappedTraits::AccPacket SAccPacket; + typedef typename HalfTraits::LhsPacket LhsPacketHalf; + typedef typename HalfTraits::RhsPacket RhsPacketHalf; + typedef typename HalfTraits::ResPacket ResPacketHalf; + typedef typename HalfTraits::AccPacket AccPacketHalf; + + typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; + typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; + typedef typename QuarterTraits::ResPacket ResPacketQuarter; + typedef typename QuarterTraits::AccPacket AccPacketQuarter; + typedef typename DataMapper::LinearMapper LinearMapper; enum { Vectorizable = Traits::Vectorizable, LhsProgress = Traits::LhsProgress, + LhsProgressHalf = HalfTraits::LhsProgress, + LhsProgressQuarter = QuarterTraits::LhsProgress, RhsProgress = Traits::RhsProgress, + RhsProgressHalf = HalfTraits::RhsProgress, + RhsProgressQuarter = QuarterTraits::RhsProgress, ResPacketSize = Traits::ResPacketSize }; @@ -892,11 +1253,11 @@ struct gebp_kernel }; template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs, - int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs>::LhsProgress> +int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress> struct last_row_process_16_packets { - typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits; - typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits; + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits; + typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits; typedef typename Traits::ResScalar ResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; @@ -924,8 +1285,8 @@ struct last_row_process_16_packets template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> { - typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits; - typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits; + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits; + typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits; typedef typename Traits::ResScalar ResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; @@ -957,7 +1318,7 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, SRhsPacketQuarter b0; straits.loadLhsUnaligned(blB, a0); straits.loadRhs(blA, b0); - straits.madd(a0,b0,c0,b0); + straits.madd(a0,b0,c0,b0, fix<0>); blB += SwappedTraits::LhsProgress/4; blA += 1; } @@ -971,6 +1332,219 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, } }; +template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper> +struct lhs_process_one_packet +{ + typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4; + + EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + { + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0); + traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel); + traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>); + traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>); + traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>); + traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>); + #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) + __asm__ ("" : "+x,m" (*A0)); + #endif + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + } + + EIGEN_STRONG_INLINE void operator()( + const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha, + Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB, + int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4) + { + GEBPTraits traits; + + // loops on each largest micro horizontal panel of lhs + // (LhsProgress x depth) + for(Index i=peelStart; i<peelEnd; i+=LhsProgress) + { + // loops on each largest micro vertical panel of rhs (depth * nr) + for(Index j2=0; j2<packet_cols4; j2+=nr) + { + // We select a LhsProgress x nr micro block of res + // which is entirely stored into 1 x nr registers. + + const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0, C1, C2, C3; + traits.initAcc(C0); + traits.initAcc(C1); + traits.initAcc(C2); + traits.initAcc(C3); + // To improve instruction pipelining, let's double the accumulation registers: + // even k will accumulate in C*, while odd k will accumulate in D*. + // This trick is crutial to get good performance with FMA, otherwise it is + // actually faster to perform separated MUL+ADD because of a naturally + // better instruction-level parallelism. + AccPacket D0, D1, D2, D3; + traits.initAcc(D0); + traits.initAcc(D1); + traits.initAcc(D2); + traits.initAcc(D3); + + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(prefetch_res_offset); + r1.prefetch(prefetch_res_offset); + r2.prefetch(prefetch_res_offset); + r3.prefetch(prefetch_res_offset); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; + prefetch(&blB[0]); + LhsPacket A0, A1; + + for(Index k=0; k<peeled_kc; k+=pk) + { + EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4"); + RhsPacketx4 rhs_panel; + RhsPacket T0; + + internal::prefetch(blB+(48+0)); + peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3); + peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3); + internal::prefetch(blB+(48+16)); + peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3); + peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3); + + blB += pk*4*RhsProgress; + blA += pk*LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4"); + } + C0 = padd(C0,D0); + C1 = padd(C1,D1); + C2 = padd(C2,D2); + C3 = padd(C3,D3); + + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) + { + RhsPacketx4 rhs_panel; + RhsPacket T0; + peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + blB += 4*RhsProgress; + blA += LhsProgress; + } + + ResPacket R0, R1; + ResPacket alphav = pset1<ResPacket>(alpha); + + R0 = r0.template loadPacket<ResPacket>(0); + R1 = r1.template loadPacket<ResPacket>(0); + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + r0.storePacket(0, R0); + r1.storePacket(0, R1); + + R0 = r2.template loadPacket<ResPacket>(0); + R1 = r3.template loadPacket<ResPacket>(0); + traits.acc(C2, alphav, R0); + traits.acc(C3, alphav, R1); + r2.storePacket(0, R0); + r3.storePacket(0, R1); + } + + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2<cols; j2++) + { + // One column at a time + const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0; + traits.initAcc(C0); + + LinearMapper r0 = res.getLinearMapper(i, j2); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB]; + LhsPacket A0; + + for(Index k= 0; k<peeled_kc; k+=pk) + { + EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1"); + RhsPacket B_0; + +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \ + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + /* FIXME: why unaligned???? */ \ + traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \ + traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B_0, fix<0>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \ + } while(false); + + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*RhsProgress; + blA += pk*LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1"); + } + + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) + { + RhsPacket B_0; + EIGEN_GEBGP_ONESTEP(0); + blB += RhsProgress; + blA += LhsProgress; + } +#undef EIGEN_GEBGP_ONESTEP + ResPacket R0; + ResPacket alphav = pset1<ResPacket>(alpha); + R0 = r0.template loadPacket<ResPacket>(0); + traits.acc(C0, alphav, R0); + r0.storePacket(0, R0); + } + } + } +}; + +template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper> +struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper> +{ + +EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + { + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0); + traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3); + traits.madd(*A0, *B_0, *C0, *B_0); + traits.madd(*A0, *B1, *C1, *B1); + traits.madd(*A0, *B2, *C2, *B2); + traits.madd(*A0, *B3, *C3, *B3); + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + } +}; + template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> EIGEN_DONT_INLINE void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs> @@ -987,10 +1561,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0; const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0; - const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0; + const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0; + const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0; + const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0; enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) const Index peeled_kc = depth & ~(pk-1); - const Index prefetch_res_offset = 32/sizeof(ResScalar); + const int prefetch_res_offset = 32/sizeof(ResScalar); // const Index depth2 = depth & ~1; //---------- Process 3 * LhsProgress rows at once ---------- @@ -1048,36 +1624,48 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga for(Index k=0; k<peeled_kc; k+=pk) { EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4"); - RhsPacket B_0, T0; + // 15 registers are taken (12 for acc, 2 for lhs). + RhsPanel15 rhs_panel; + RhsPacket T0; LhsPacket A2; - -#define EIGEN_GEBP_ONESTEP(K) \ - do { \ - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ + #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0)) + // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633 + // without this workaround A0, A1, and A2 are loaded in the same register, + // which is not good for pipelining + #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2)); + #else + #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND + #endif +#define EIGEN_GEBP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ - internal::prefetch(blA+(3*K+16)*LhsProgress); \ - if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \ - traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ - traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ - traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ - traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \ - traits.madd(A0, B_0, C0, T0); \ - traits.madd(A1, B_0, C4, T0); \ - traits.madd(A2, B_0, C8, B_0); \ - traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \ - traits.madd(A0, B_0, C1, T0); \ - traits.madd(A1, B_0, C5, T0); \ - traits.madd(A2, B_0, C9, B_0); \ - traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \ - traits.madd(A0, B_0, C2, T0); \ - traits.madd(A1, B_0, C6, T0); \ - traits.madd(A2, B_0, C10, B_0); \ - traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \ - traits.madd(A0, B_0, C3 , T0); \ - traits.madd(A1, B_0, C7, T0); \ - traits.madd(A2, B_0, C11, B_0); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ - } while(false) + internal::prefetch(blA + (3 * K + 16) * LhsProgress); \ + if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \ + internal::prefetch(blB + (4 * K + 16) * RhsProgress); \ + } /* Bug 953 */ \ + traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \ + traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \ + traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \ + EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \ + traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C0, T0, fix<0>); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A2, rhs_panel, C8, T0, fix<0>); \ + traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A2, rhs_panel, C9, T0, fix<1>); \ + traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A2, rhs_panel, C10, T0, fix<2>); \ + traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + traits.madd(A2, rhs_panel, C11, T0, fix<3>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ + } while (false) internal::prefetch(blB); EIGEN_GEBP_ONESTEP(0); @@ -1097,7 +1685,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) { - RhsPacket B_0, T0; + RhsPanel15 rhs_panel; + RhsPacket T0; LhsPacket A2; EIGEN_GEBP_ONESTEP(0); blB += 4*RhsProgress; @@ -1177,20 +1766,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga { EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1"); RhsPacket B_0; -#define EIGEN_GEBGP_ONESTEP(K) \ - do { \ - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \ +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ - traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ - traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ - traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ - traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ - traits.madd(A0, B_0, C0, B_0); \ - traits.madd(A1, B_0, C4, B_0); \ - traits.madd(A2, B_0, C8, B_0); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ - } while(false) - + traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \ + traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \ + traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \ + traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B_0, fix<0>); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ + traits.madd(A2, B_0, C8, B_0, fix<0>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ + } while (false) + EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); @@ -1279,26 +1868,34 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga for(Index k=0; k<peeled_kc; k+=pk) { EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4"); - RhsPacket B_0, B1, B2, B3, T0; + RhsPacketx4 rhs_panel; + RhsPacket T0; + + // NOTE: the begin/end asm comments below work around bug 935! + // but they are not enough for gcc>=6 without FMA (bug 1637) + #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1)); + #else + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND + #endif +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ + traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \ + traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \ + traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C0, T0, fix<0>); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ + } while (false) - #define EIGEN_GEBGP_ONESTEP(K) \ - do { \ - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ - EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ - traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ - traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ - traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ - traits.madd(A0, B_0, C0, T0); \ - traits.madd(A1, B_0, C4, B_0); \ - traits.madd(A0, B1, C1, T0); \ - traits.madd(A1, B1, C5, B1); \ - traits.madd(A0, B2, C2, T0); \ - traits.madd(A1, B2, C6, B2); \ - traits.madd(A0, B3, C3, T0); \ - traits.madd(A1, B3, C7, B3); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ - } while(false) - internal::prefetch(blB+(48+0)); EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); @@ -1318,7 +1915,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) { - RhsPacket B_0, B1, B2, B3, T0; + RhsPacketx4 rhs_panel; + RhsPacket T0; EIGEN_GEBGP_ONESTEP(0); blB += 4*RhsProgress; blA += 2*Traits::LhsProgress; @@ -1389,8 +1987,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ - traits.madd(A0, B_0, C0, B1); \ - traits.madd(A1, B_0, C4, B_0); \ + traits.madd(A0, B_0, C0, B1, fix<0>); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \ } while(false) @@ -1434,174 +2032,29 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga //---------- Process 1 * LhsProgress rows at once ---------- if(mr>=1*Traits::LhsProgress) { - // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth) - for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress) - { - // loops on each largest micro vertical panel of rhs (depth * nr) - for(Index j2=0; j2<packet_cols4; j2+=nr) - { - // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely - // stored into 1 x nr registers. - - const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)]; - prefetch(&blA[0]); - - // gets res block as register - AccPacket C0, C1, C2, C3; - traits.initAcc(C0); - traits.initAcc(C1); - traits.initAcc(C2); - traits.initAcc(C3); - - LinearMapper r0 = res.getLinearMapper(i, j2 + 0); - LinearMapper r1 = res.getLinearMapper(i, j2 + 1); - LinearMapper r2 = res.getLinearMapper(i, j2 + 2); - LinearMapper r3 = res.getLinearMapper(i, j2 + 3); - - r0.prefetch(prefetch_res_offset); - r1.prefetch(prefetch_res_offset); - r2.prefetch(prefetch_res_offset); - r3.prefetch(prefetch_res_offset); - - // performs "inner" products - const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; - prefetch(&blB[0]); - LhsPacket A0; - - for(Index k=0; k<peeled_kc; k+=pk) - { - EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4"); - RhsPacket B_0, B1, B2, B3; - -#define EIGEN_GEBGP_ONESTEP(K) \ - do { \ - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \ - EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ - traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \ - traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ - traits.madd(A0, B_0, C0, B_0); \ - traits.madd(A0, B1, C1, B1); \ - traits.madd(A0, B2, C2, B2); \ - traits.madd(A0, B3, C3, B3); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \ - } while(false) - - internal::prefetch(blB+(48+0)); - EIGEN_GEBGP_ONESTEP(0); - EIGEN_GEBGP_ONESTEP(1); - EIGEN_GEBGP_ONESTEP(2); - EIGEN_GEBGP_ONESTEP(3); - internal::prefetch(blB+(48+16)); - EIGEN_GEBGP_ONESTEP(4); - EIGEN_GEBGP_ONESTEP(5); - EIGEN_GEBGP_ONESTEP(6); - EIGEN_GEBGP_ONESTEP(7); - - blB += pk*4*RhsProgress; - blA += pk*1*LhsProgress; - - EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4"); - } - // process remaining peeled loop - for(Index k=peeled_kc; k<depth; k++) - { - RhsPacket B_0, B1, B2, B3; - EIGEN_GEBGP_ONESTEP(0); - blB += 4*RhsProgress; - blA += 1*LhsProgress; - } -#undef EIGEN_GEBGP_ONESTEP - - ResPacket R0, R1; - ResPacket alphav = pset1<ResPacket>(alpha); - - R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); - R1 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); - traits.acc(C0, alphav, R0); - traits.acc(C1, alphav, R1); - r0.storePacket(0 * Traits::ResPacketSize, R0); - r1.storePacket(0 * Traits::ResPacketSize, R1); - - R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); - R1 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); - traits.acc(C2, alphav, R0); - traits.acc(C3, alphav, R1); - r2.storePacket(0 * Traits::ResPacketSize, R0); - r3.storePacket(0 * Traits::ResPacketSize, R1); - } - - // Deal with remaining columns of the rhs - for(Index j2=packet_cols4; j2<cols; j2++) - { - // One column at a time - const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)]; - prefetch(&blA[0]); - - // gets res block as register - AccPacket C0; - traits.initAcc(C0); - - LinearMapper r0 = res.getLinearMapper(i, j2); - - // performs "inner" products - const RhsScalar* blB = &blockB[j2*strideB+offsetB]; - LhsPacket A0; - - for(Index k=0; k<peeled_kc; k+=pk) - { - EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1"); - RhsPacket B_0; - -#define EIGEN_GEBGP_ONESTEP(K) \ - do { \ - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \ - EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ - traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \ - traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ - traits.madd(A0, B_0, C0, B_0); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \ - } while(false); - - EIGEN_GEBGP_ONESTEP(0); - EIGEN_GEBGP_ONESTEP(1); - EIGEN_GEBGP_ONESTEP(2); - EIGEN_GEBGP_ONESTEP(3); - EIGEN_GEBGP_ONESTEP(4); - EIGEN_GEBGP_ONESTEP(5); - EIGEN_GEBGP_ONESTEP(6); - EIGEN_GEBGP_ONESTEP(7); - - blB += pk*RhsProgress; - blA += pk*1*Traits::LhsProgress; - - EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1"); - } - - // process remaining peeled loop - for(Index k=peeled_kc; k<depth; k++) - { - RhsPacket B_0; - EIGEN_GEBGP_ONESTEP(0); - blB += RhsProgress; - blA += 1*Traits::LhsProgress; - } -#undef EIGEN_GEBGP_ONESTEP - ResPacket R0; - ResPacket alphav = pset1<ResPacket>(alpha); - R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); - traits.acc(C0, alphav, R0); - r0.storePacket(0 * Traits::ResPacketSize, R0); - } - } + lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p; + p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + } + //---------- Process LhsProgressHalf rows at once ---------- + if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf) + { + lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p; + p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + } + //---------- Process LhsProgressQuarter rows at once ---------- + if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter) + { + lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p; + p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); } //---------- Process remaining rows, 1 at once ---------- - if(peeled_mc1<rows) + if(peeled_mc_quarter<rows) { // loop on each panel of the rhs for(Index j2=0; j2<packet_cols4; j2+=nr) { // loop on each row of the lhs (1*LhsProgress x depth) - for(Index i=peeled_mc1; i<rows; i+=1) + for(Index i=peeled_mc_quarter; i<rows; i+=1) { const LhsScalar* blA = &blockA[i*strideA+offsetA]; prefetch(&blA[0]); @@ -1614,7 +2067,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size; if ((SwappedTraits::LhsProgress % 4) == 0 && (SwappedTraits::LhsProgress<=16) && - (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) && + (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) && (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr)) { SAccPacket C0, C1, C2, C3; @@ -1638,15 +2091,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga straits.loadRhsQuad(blA+0*spk, B_0); straits.loadRhsQuad(blA+1*spk, B_1); - straits.madd(A0,B_0,C0,B_0); - straits.madd(A1,B_1,C1,B_1); + straits.madd(A0,B_0,C0,B_0, fix<0>); + straits.madd(A1,B_1,C1,B_1, fix<0>); straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0); straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1); straits.loadRhsQuad(blA+2*spk, B_0); straits.loadRhsQuad(blA+3*spk, B_1); - straits.madd(A0,B_0,C2,B_0); - straits.madd(A1,B_1,C3,B_1); + straits.madd(A0,B_0,C2,B_0, fix<0>); + straits.madd(A1,B_1,C3,B_1, fix<0>); blB += 4*SwappedTraits::LhsProgress; blA += 4*spk; @@ -1659,7 +2112,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga straits.loadLhsUnaligned(blB, A0); straits.loadRhsQuad(blA, B_0); - straits.madd(A0,B_0,C0,B_0); + straits.madd(A0,B_0,C0,B_0, fix<0>); blB += SwappedTraits::LhsProgress; blA += spk; @@ -1669,7 +2122,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga // Special case where we have to first reduce the accumulation register C0 typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf; typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf; - typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf; + typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf; typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf; SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2); @@ -1683,7 +2136,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga straits.loadLhsUnaligned(blB, a0); straits.loadRhs(blA, b0); SAccPacketHalf c0 = predux_half_dowto4(C0); - straits.madd(a0,b0,c0,b0); + straits.madd(a0,b0,c0,b0, fix<0>); straits.acc(c0, alphav, R); } else @@ -1699,7 +2152,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga // template form, so that LhsProgress < 16 paths don't // fail to compile last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p; - p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0); + p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0); } else { @@ -1744,7 +2197,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga for(Index j2=packet_cols4; j2<cols; j2++) { // loop on each row of the lhs (1*LhsProgress x depth) - for(Index i=peeled_mc1; i<rows; i+=1) + for(Index i=peeled_mc_quarter; i<rows; i+=1) { const LhsScalar* blA = &blockA[i*strideA+offsetA]; prefetch(&blA[0]); @@ -1791,7 +2244,13 @@ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pa EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - enum { PacketSize = unpacket_traits<Packet>::size }; + typedef typename unpacket_traits<Packet>::half HalfPacket; + typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket; + enum { PacketSize = unpacket_traits<Packet>::size, + HalfPacketSize = unpacket_traits<HalfPacket>::size, + QuarterPacketSize = unpacket_traits<QuarterPacket>::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -1803,9 +2262,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; - const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 - : Pack2>1 ? (rows/Pack2)*Pack2 : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; + const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0; + const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0; + const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter + : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0; Index i=0; @@ -1864,20 +2326,60 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa if(PanelMode) count += (1*PacketSize) * (stride-offset-depth); } } - // Pack scalars + // Pack half packets + if(HasHalf && Pack1>=HalfPacketSize) + { + for(; i<peeled_mc_half; i+=HalfPacketSize) + { + if(PanelMode) count += (HalfPacketSize) * offset; + + for(Index k=0; k<depth; k++) + { + HalfPacket A; + A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k); + pstoreu(blockA+count, cj.pconj(A)); + count+=HalfPacketSize; + } + if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth); + } + } + // Pack quarter packets + if(HasQuarter && Pack1>=QuarterPacketSize) + { + for(; i<peeled_mc_quarter; i+=QuarterPacketSize) + { + if(PanelMode) count += (QuarterPacketSize) * offset; + + for(Index k=0; k<depth; k++) + { + QuarterPacket A; + A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k); + pstoreu(blockA+count, cj.pconj(A)); + count+=QuarterPacketSize; + } + if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth); + } + } + // Pack2 may be *smaller* than PacketSize—that happens for + // products like real * complex, where we have to go half the + // progress on the lhs in order to duplicate those operands to + // address both real & imaginary parts on the rhs. This portion will + // pack those half ones until they match the number expected on the + // last peeling loop at this point (for the rhs). if(Pack2<PacketSize && Pack2>1) { - for(; i<peeled_mc0; i+=Pack2) + for(; i<peeled_mc0; i+=last_lhs_progress) { - if(PanelMode) count += Pack2 * offset; + if(PanelMode) count += last_lhs_progress * offset; for(Index k=0; k<depth; k++) - for(Index w=0; w<Pack2; w++) + for(Index w=0; w<last_lhs_progress; w++) blockA[count++] = cj(lhs(i+w, k)); - if(PanelMode) count += Pack2 * (stride-offset-depth); + if(PanelMode) count += last_lhs_progress * (stride-offset-depth); } } + // Pack scalars for(; i<rows; i++) { if(PanelMode) count += offset; @@ -1898,7 +2400,13 @@ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pa EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - enum { PacketSize = unpacket_traits<Packet>::size }; + typedef typename unpacket_traits<Packet>::half HalfPacket; + typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket; + enum { PacketSize = unpacket_traits<Packet>::size, + HalfPacketSize = unpacket_traits<HalfPacket>::size, + QuarterPacketSize = unpacket_traits<QuarterPacket>::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -1906,37 +2414,51 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; Index count = 0; + bool gone_half = false, gone_quarter = false, gone_last = false; -// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; -// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; -// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - - int pack = Pack1; Index i = 0; + int pack = Pack1; + int psize = PacketSize; while(pack>0) { Index remaining_rows = rows-i; - Index peeled_mc = i+(remaining_rows/pack)*pack; + Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack; + Index starting_pos = i; for(; i<peeled_mc; i+=pack) { if(PanelMode) count += pack * offset; - const Index peeled_k = (depth/PacketSize)*PacketSize; Index k=0; - if(pack>=PacketSize) + if(pack>=psize && psize >= QuarterPacketSize) { - for(; k<peeled_k; k+=PacketSize) + const Index peeled_k = (depth/psize)*psize; + for(; k<peeled_k; k+=psize) { - for (Index m = 0; m < pack; m += PacketSize) + for (Index m = 0; m < pack; m += psize) { - PacketBlock<Packet> kernel; - for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k); - ptranspose(kernel); - for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + if (psize == PacketSize) { + PacketBlock<Packet> kernel; + for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k); + ptranspose(kernel); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + } else if (HasHalf && psize == HalfPacketSize) { + gone_half = true; + PacketBlock<HalfPacket> kernel_half; + for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k); + ptranspose(kernel_half); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); + } else if (HasQuarter && psize == QuarterPacketSize) { + gone_quarter = true; + PacketBlock<QuarterPacket> kernel_quarter; + for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k); + ptranspose(kernel_quarter); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); + } } - count += PacketSize*pack; + count += psize*pack; } } + for(; k<depth; k++) { Index w=0; @@ -1959,9 +2481,28 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa if(PanelMode) count += pack * (stride-offset-depth); } - pack -= PacketSize; - if(pack<Pack2 && (pack+PacketSize)!=Pack2) - pack = Pack2; + pack -= psize; + Index left = rows - i; + if (pack <= 0) { + if (!gone_last && + (starting_pos == i || left >= psize/2 || left >= psize/4) && + ((psize/2 == HalfPacketSize && HasHalf && !gone_half) || + (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) { + psize /= 2; + pack = psize; + continue; + } + // Pack2 may be *smaller* than PacketSize—that happens for + // products like real * complex, where we have to go half the + // progress on the lhs in order to duplicate those operands to + // address both real & imaginary parts on the rhs. This portion will + // pack those half ones until they match the number expected on the + // last peeling loop at this point (for the rhs). + if (Pack2 < PacketSize && !gone_last) { + gone_last = true; + psize = pack = left & ~1; + } + } } for(; i<rows; i++) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index f49abcad5..90c9c4647 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -469,6 +469,20 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0) return; + // Fallback to GEMV if either the lhs or rhs is a runtime vector + if (dst.cols() == 1) + { + typename Dest::ColXpr dst_vec(dst.col(0)); + return internal::generic_product_impl<Lhs,typename Rhs::ConstColXpr,DenseShape,DenseShape,GemvProduct> + ::scaleAndAddTo(dst_vec, a_lhs, a_rhs.col(0), alpha); + } + else if (dst.rows() == 1) + { + typename Dest::RowXpr dst_vec(dst.row(0)); + return internal::generic_product_impl<typename Lhs::ConstRowXpr,Rhs,DenseShape,DenseShape,GemvProduct> + ::scaleAndAddTo(dst_vec, a_lhs.row(0), a_rhs, alpha); + } + typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs); typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs); diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 92e9b0d9f..e01e798f1 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -21,7 +21,8 @@ namespace internal { /** \internal */ inline void manage_multi_threading(Action action, int* v) { - static EIGEN_UNUSED int m_maxThreads = -1; + static int m_maxThreads = -1; + EIGEN_UNUSED_VARIABLE(m_maxThreads); if(action==SetAction) { diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index c84c71609..673073601 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -45,14 +45,23 @@ struct symm_pack_lhs } void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) { - enum { PacketSize = packet_traits<Scalar>::size }; + typedef typename unpacket_traits<typename packet_traits<Scalar>::type>::half HalfPacket; + typedef typename unpacket_traits<typename unpacket_traits<typename packet_traits<Scalar>::type>::half>::half QuarterPacket; + enum { PacketSize = packet_traits<Scalar>::size, + HalfPacketSize = unpacket_traits<HalfPacket>::size, + QuarterPacketSize = unpacket_traits<QuarterPacket>::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; + const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride); Index count = 0; //Index peeled_mc3 = (rows/Pack1)*Pack1; const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; - const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; + const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0; if(Pack1>=3*PacketSize) for(Index i=0; i<peeled_mc3; i+=3*PacketSize) @@ -66,8 +75,16 @@ struct symm_pack_lhs for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize) pack<1*PacketSize>(blockA, lhs, cols, i, count); + if(HasHalf && Pack1>=HalfPacketSize) + for(Index i=peeled_mc1; i<peeled_mc_half; i+=HalfPacketSize) + pack<HalfPacketSize>(blockA, lhs, cols, i, count); + + if(HasQuarter && Pack1>=QuarterPacketSize) + for(Index i=peeled_mc_half; i<peeled_mc_quarter; i+=QuarterPacketSize) + pack<QuarterPacketSize>(blockA, lhs, cols, i, count); + // do the same with mr==1 - for(Index i=peeled_mc1; i<rows; i++) + for(Index i=peeled_mc_quarter; i<rows; i++) { for(Index k=0; k<i; k++) blockA[count++] = lhs(i, k); // normal |