From c53eececb0415834b961cb61cd466907261b4b2f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 6 Dec 2018 15:58:06 +0100 Subject: Implement AVX512 vectorization of std::complex --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 167 +++++++++++++++------- 1 file changed, 119 insertions(+), 48 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 9ca865bd1..9475a6ecc 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -506,13 +506,28 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - dest = pset1(*b); + dest = pset1(*b); } EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { + loadRhsQuad_impl(b,dest, typename conditional::type()); + } + + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const + { + // FIXME we can do better! + // what we want here is a ploadheight + RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]}; + dest = ploadquad(tmp); + } + + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const + { + eigen_internal_assert(RhsPacketSize<=8); dest = pset1(*b); } @@ -521,9 +536,10 @@ public: dest = pload(a); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploadu(a); + dest = ploadu(a); } EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) @@ -536,12 +552,14 @@ public: // pbroadcast2(b, b0, b1); // } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp) const { madd_impl(a, b, c, tmp, typename conditional::type()); } - EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + template + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const { #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); @@ -556,13 +574,14 @@ public: c += a * b; } - EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + template + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { + const conj_helper cj; r = cj.pmadd(c,alpha,r); } protected: - conj_helper cj; }; template @@ -581,13 +600,57 @@ DoublePacket padd(const DoublePacket &a, const DoublePacket the "4" in "downto4" +// corresponds to the number of complexes, so it means "8" +// it terms of real coefficients. + template -const DoublePacket& predux_half_dowto4(const DoublePacket &a) +const DoublePacket& +predux_half_dowto4(const DoublePacket &a, + typename enable_if::size<=8>::type* = 0) { return a; } -template struct unpacket_traits > { typedef DoublePacket half; }; +template +DoublePacket::half> +predux_half_dowto4(const DoublePacket &a, + typename enable_if::size==16>::type* = 0) +{ + // yes, that's pretty hackish :( + DoublePacket::half> res; + typedef std::complex::type> Cplx; + typedef typename packet_traits::type CplxPacket; + res.first = predux_half_dowto4(CplxPacket(a.first)).v; + res.second = predux_half_dowto4(CplxPacket(a.second)).v; + return res; +} + +// same here, "quad" actually means "8" in terms of real coefficients +template +void loadQuadToDoublePacket(const Scalar* b, DoublePacket& dest, + typename enable_if::size<=8>::type* = 0) +{ + dest.first = pset1(real(*b)); + dest.second = pset1(imag(*b)); +} + +template +void loadQuadToDoublePacket(const Scalar* b, DoublePacket& dest, + typename enable_if::size==16>::type* = 0) +{ + // yes, that's pretty hackish too :( + typedef typename NumTraits::Real RealScalar; + RealScalar r[4] = {real(b[0]), real(b[0]), real(b[1]), real(b[1])}; + RealScalar i[4] = {imag(b[0]), imag(b[0]), imag(b[1]), imag(b[1])}; + dest.first = ploadquad(r); + dest.second = ploadquad(i); +} + + +template struct unpacket_traits > { + typedef DoublePacket::half> half; +}; // template // DoublePacket pmadd(const DoublePacket &a, const DoublePacket &b) // { @@ -611,10 +674,10 @@ public: ConjRhs = _ConjRhs, Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, - RealPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? packet_traits::size : 1, + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? packet_traits::size : 1, + RealPacketSize = Vectorizable ? packet_traits::size : 1, // FIXME: should depend on NumberOfRegisters nr = 4, @@ -626,7 +689,7 @@ public: typedef typename packet_traits::type RealPacket; typedef typename packet_traits::type ScalarPacket; - typedef DoublePacket DoublePacketType; + typedef DoublePacket DoublePacketType; typedef typename conditional::type LhsPacket4Packing; typedef typename conditional::type LhsPacket; @@ -643,16 +706,17 @@ public: } // Scalar path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const { - dest = pset1(*b); + dest = pset1(*b); } // Vectorized path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const { - dest.first = pset1(real(*b)); - dest.second = pset1(imag(*b)); + dest.first = pset1(real(*b)); + dest.second = pset1(imag(*b)); } EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const @@ -661,8 +725,7 @@ public: } EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const { - eigen_internal_assert(unpacket_traits::size<=4); - loadRhs(b,dest); + loadQuadToDoublePacket(b,dest); } EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) @@ -696,12 +759,14 @@ public: dest = pload((const typename unpacket_traits::type*)(a)); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploadu((const typename unpacket_traits::type*)(a)); + dest = ploadu((const typename unpacket_traits::type*)(a)); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket& c, TmpType& /*tmp*/) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); @@ -714,29 +779,30 @@ public: EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } - EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const + template + EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacketType& alpha, ResPacketType& r) const { // assemble c - ResPacket tmp; + ResPacketType tmp; if((!ConjLhs)&&(!ConjRhs)) { - tmp = pcplxflip(pconj(ResPacket(c.second))); - tmp = padd(ResPacket(c.first),tmp); + tmp = pcplxflip(pconj(ResPacketType(c.second))); + tmp = padd(ResPacketType(c.first),tmp); } else if((!ConjLhs)&&(ConjRhs)) { - tmp = pconj(pcplxflip(ResPacket(c.second))); - tmp = padd(ResPacket(c.first),tmp); + tmp = pconj(pcplxflip(ResPacketType(c.second))); + tmp = padd(ResPacketType(c.first),tmp); } else if((ConjLhs)&&(!ConjRhs)) { - tmp = pcplxflip(ResPacket(c.second)); - tmp = padd(pconj(ResPacket(c.first)),tmp); + tmp = pcplxflip(ResPacketType(c.second)); + tmp = padd(pconj(ResPacketType(c.first)),tmp); } else if((ConjLhs)&&(ConjRhs)) { - tmp = pcplxflip(ResPacket(c.second)); - tmp = psub(pconj(ResPacket(c.first)),tmp); + tmp = pcplxflip(ResPacketType(c.second)); + tmp = psub(pconj(ResPacketType(c.first)),tmp); } r = pmadd(tmp,alpha,r); @@ -789,9 +855,10 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - dest = pset1(*b); + dest = pset1(*b); } void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) @@ -813,21 +880,23 @@ public: EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - eigen_internal_assert(unpacket_traits::size<=4); - loadRhs(b,dest); + dest = ploadquad(b); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploaddup(a); + dest = ploaddup(a); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp) const { madd_impl(a, b, c, tmp, typename conditional::type()); } - EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + template + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const { #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); @@ -843,13 +912,15 @@ public: c += a * b; } - EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + template + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { + const conj_helper cj; r = cj.pmadd(alpha,c,r); } protected: - conj_helper cj; + }; @@ -1649,7 +1720,7 @@ void gebp_kernel::half>::half>::size; if ((SwappedTraits::LhsProgress % 4) == 0 && (SwappedTraits::LhsProgress<=16) && - (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) && + (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) && (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr)) { SAccPacket C0, C1, C2, C3; @@ -1704,7 +1775,7 @@ void gebp_kernel=8,typename unpacket_traits::half,SResPacket>::type SResPacketHalf; typedef typename conditional=8,typename unpacket_traits::half,SLhsPacket>::type SLhsPacketHalf; - typedef typename conditional=8,typename unpacket_traits::half,SRhsPacket>::type SRhsPacketHalf; + typedef typename conditional=8,typename unpacket_traits::half,SRhsPacket>::type SRhsPacketHalf; typedef typename conditional=8,typename unpacket_traits::half,SAccPacket>::type SAccPacketHalf; SResPacketHalf R = res.template gatherPacket(i, j2); @@ -1734,7 +1805,7 @@ void gebp_kernel p; - p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0); + p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0); } else { -- cgit v1.2.3