From a4760548793811ee1accf8de05ff791a43d54be5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 23 Nov 2018 10:25:19 +0100 Subject: bug #1624: improve matrix-matrix product on ARM 64, 20% speedup --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 53 +++++++++++++++++++---- 1 file changed, 44 insertions(+), 9 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index e7cab4720..b8b83c320 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -15,7 +15,7 @@ namespace Eigen { namespace internal { -template +template class gebp_traits; @@ -347,7 +347,7 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ * cplx*real : unpack rhs to constant packets, ... * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual */ -template +template class gebp_traits { public: @@ -461,8 +461,8 @@ public: }; -template -class gebp_traits, RealScalar, _ConjLhs, false> +template +class gebp_traits, RealScalar, _ConjLhs, false, Arch> { public: typedef std::complex LhsScalar; @@ -597,8 +597,8 @@ template struct unpacket_traits > { typede // return res; // } -template -class gebp_traits, std::complex, _ConjLhs, _ConjRhs > +template +class gebp_traits, std::complex, _ConjLhs, _ConjRhs,Arch> { public: typedef std::complex Scalar; @@ -746,8 +746,8 @@ protected: conj_helper cj; }; -template -class gebp_traits, false, _ConjRhs > +template +class gebp_traits, false, _ConjRhs, Arch> { public: typedef std::complex Scalar; @@ -852,7 +852,42 @@ protected: conj_helper cj; }; -/* optimized GEneral packed Block * packed Panel product kernel + +#if EIGEN_ARCH_ARM64 + +template<> +struct gebp_traits + : gebp_traits +{ + typedef float32x2_t RhsPacket; + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + loadRhs(b+0, b0); + loadRhs(b+1, b1); + loadRhs(b+2, b2); + loadRhs(b+3, b3); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = vld1_f32(b); + } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/) const + { + c = vfmaq_lane_f32(c, a, b, 0); + } +}; + +#endif + +/* optimized General packed Block * packed Panel product kernel * * Mixing type logic: C += A * B * | A | B | comments -- cgit v1.2.3 From 7b1cb8a4407bdf243a581133847977fd812b66b7 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 27 Nov 2018 11:11:02 -0500 Subject: fix the build on 64-bit ARM when NEON is disabled --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index b8b83c320..5619a4588 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -853,7 +853,7 @@ protected: }; -#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON template<> struct gebp_traits -- cgit v1.2.3 From a4159dba080f5621f19f814440553ba734c8e712 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 27 Nov 2018 16:53:14 -0500 Subject: do not read buffers out of bounds -- load only the 4 bytes we know exist here. Could also have done a vld1_lane_f32 but doing so here, without the overhead of initializing the unused lane, would have triggered used-of-uninitialized-value errors in tools such as ASan. Note that this code is sub-optimal before or after this change: we should be reading either 2 or 4 float32 values per load-instruction (2 for ARM in-order cores with an affinity for 8-byte loads; 4 for ARM out-of-order cores able to dual-issue 16-byte load instructions with arithmetic instructions). Before or after this patch, we are only loading 4 bytes of useful data here (even if before this patch, we were technically loading 8, only to use only the 4 first). --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 5619a4588..9ca865bd1 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -859,7 +859,7 @@ template<> struct gebp_traits : gebp_traits { - typedef float32x2_t RhsPacket; + typedef float RhsPacket; EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) { @@ -871,7 +871,7 @@ struct gebp_traits EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = vld1_f32(b); + dest = *b; } EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const @@ -881,7 +881,7 @@ struct gebp_traits EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/) const { - c = vfmaq_lane_f32(c, a, b, 0); + c = vfmaq_n_f32(c, a, b); } }; -- cgit v1.2.3 From c53eececb0415834b961cb61cd466907261b4b2f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 6 Dec 2018 15:58:06 +0100 Subject: Implement AVX512 vectorization of std::complex --- Eigen/Core | 8 +- Eigen/src/Core/GenericPacketMath.h | 1 + Eigen/src/Core/arch/AVX/Complex.h | 4 + Eigen/src/Core/arch/AVX512/Complex.h | 442 ++++++++++++++++++++++ Eigen/src/Core/arch/AVX512/PacketMath.h | 1 + Eigen/src/Core/arch/SSE/PacketMath.h | 12 +- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 167 +++++--- test/packetmath.cpp | 17 +- 8 files changed, 587 insertions(+), 65 deletions(-) create mode 100644 Eigen/src/Core/arch/AVX512/Complex.h (limited to 'Eigen/src/Core/products') diff --git a/Eigen/Core b/Eigen/Core index 0e23247d3..759b1bb80 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -154,14 +154,12 @@ using std::ptrdiff_t; #if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/TypeCasting.h" - // FIXME: this needs to be fixed (compilation issue if included) - // there is no reason to disable SSE/AVX vectorization - // of complex<> while enabling AVX512. - // #include "src/Core/arch/SSE/Complex.h" + #include "src/Core/arch/SSE/Complex.h" #include "src/Core/arch/AVX/PacketMath.h" #include "src/Core/arch/AVX/TypeCasting.h" - // #include "src/Core/arch/AVX/Complex.h" + #include "src/Core/arch/AVX/Complex.h" #include "src/Core/arch/AVX512/PacketMath.h" + #include "src/Core/arch/AVX512/Complex.h" #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/AVX/MathFunctions.h" #include "src/Core/arch/AVX512/MathFunctions.h" diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 9c2a437bf..2b2ee9e2c 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -56,6 +56,7 @@ struct default_packet_traits HasConj = 1, HasSetLinear = 1, HasBlend = 0, + HasReduxp = 1, HasDiv = 0, HasSqrt = 0, diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 7fa61969d..2bb40fc79 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -22,6 +22,7 @@ struct Packet4cf __m256 v; }; +#ifndef EIGEN_VECTORIZE_AVX512 template<> struct packet_traits > : default_packet_traits { typedef Packet4cf type; @@ -44,6 +45,7 @@ template<> struct packet_traits > : default_packet_traits HasSetLinear = 0 }; }; +#endif template<> struct unpacket_traits { typedef std::complex type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; }; @@ -228,6 +230,7 @@ struct Packet2cd __m256d v; }; +#ifndef EIGEN_VECTORIZE_AVX512 template<> struct packet_traits > : default_packet_traits { typedef Packet2cd type; @@ -250,6 +253,7 @@ template<> struct packet_traits > : default_packet_traits HasSetLinear = 0 }; }; +#endif template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; }; diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h new file mode 100644 index 000000000..2820abb49 --- /dev/null +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -0,0 +1,442 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_COMPLEX_AVX512_H +#define EIGEN_COMPLEX_AVX512_H + +namespace Eigen { + +namespace internal { + +//---------- float ---------- +struct Packet8cf +{ + EIGEN_STRONG_INLINE Packet8cf() {} + EIGEN_STRONG_INLINE explicit Packet8cf(const __m512& a) : v(a) {} + __m512 v; +}; + +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet8cf type; + typedef Packet4cf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0, + HasReduxp = 0 + }; +}; + +template<> struct unpacket_traits { + typedef std::complex type; + enum { + size = 8, + alignment=unpacket_traits::alignment + }; + typedef Packet4cf half; +}; + +template<> EIGEN_STRONG_INLINE Packet8cf padd(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf psub(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a) +{ + return Packet8cf(pnegate(a.v)); +} +template<> EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a) +{ + const __m512 mask = _mm512_castsi512_ps(_mm512_setr_epi32( + 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000, + 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet8cf(_mm512_xor_ps(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) +{ + __m512 tmp2 = _mm512_mul_ps(_mm512_movehdup_ps(a.v), _mm512_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1))); + return Packet8cf(_mm512_fmaddsub_ps(_mm512_moveldup_ps(a.v), b.v, tmp2)); +} + +template<> EIGEN_STRONG_INLINE Packet8cf pand (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_and_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf por (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_or_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pxor (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_xor_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pandnot(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_andnot_ps(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet8cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload(&numext::real_ref(*from))); } +template<> EIGEN_STRONG_INLINE Packet8cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu(&numext::real_ref(*from))); } + + +template<> EIGEN_STRONG_INLINE Packet8cf pset1(const std::complex& from) +{ + return Packet8cf(_mm512_castpd_ps(pload1((const double*)(const void*)&from))); +} + +template<> EIGEN_STRONG_INLINE Packet8cf ploaddup(const std::complex* from) +{ + return Packet8cf( _mm512_castpd_ps( ploaddup((const double*)(const void*)from )) ); +} +template<> EIGEN_STRONG_INLINE Packet8cf ploadquad(const std::complex* from) +{ + return Packet8cf( _mm512_castpd_ps( ploadquad((const double*)(const void*)from )) ); +} + +template<> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet8cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const Packet8cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet8cf pgather, Packet8cf>(const std::complex* from, Index stride) +{ + return Packet8cf(_mm512_castpd_ps(pgather((const double*)(const void*)from, stride))); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet8cf>(std::complex* to, const Packet8cf& from, Index stride) +{ + pscatter((double*)(void*)to, _mm512_castps_pd(from.v), stride); +} + +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet8cf& a) +{ + return pfirst(Packet2cf(_mm512_castps512_ps128(a.v))); +} + +template<> EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) { + return Packet8cf(_mm512_castsi512_ps( + _mm512_permutexvar_epi64( _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), + _mm512_castps_si512(a.v)))); +} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet8cf& a) +{ + return predux(padd(Packet4cf(_mm512_extractf32x8_ps(a.v,0)), + Packet4cf(_mm512_extractf32x8_ps(a.v,1)))); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet8cf& a) +{ + return predux_mul(pmul(Packet4cf(_mm512_extractf32x8_ps(a.v, 0)), + Packet4cf(_mm512_extractf32x8_ps(a.v, 1)))); +} + +template <> +EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4(const Packet8cf& a) { + __m256 lane0 = _mm512_extractf32x8_ps(a.v, 0); + __m256 lane1 = _mm512_extractf32x8_ps(a.v, 1); + __m256 res = _mm256_add_ps(lane0, lane1); + return Packet4cf(res); +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet8cf& first, const Packet8cf& second) + { + if (Offset==0) return; + palign_impl::run(first.v, second.v); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f) + +template<> EIGEN_STRONG_INLINE Packet8cf pdiv(const Packet8cf& a, const Packet8cf& b) +{ + Packet8cf num = pmul(a, pconj(b)); + __m512 tmp = _mm512_mul_ps(b.v, b.v); + __m512 tmp2 = _mm512_shuffle_ps(tmp,tmp,0xB1); + __m512 denom = _mm512_add_ps(tmp, tmp2); + return Packet8cf(_mm512_div_ps(num.v, denom)); +} + +template<> EIGEN_STRONG_INLINE Packet8cf pcplxflip(const Packet8cf& x) +{ + return Packet8cf(_mm512_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1))); +} + +//---------- double ---------- +struct Packet4cd +{ + EIGEN_STRONG_INLINE Packet4cd() {} + EIGEN_STRONG_INLINE explicit Packet4cd(const __m512d& a) : v(a) {} + __m512d v; +}; + +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet4cd type; + typedef Packet2cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 4, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0, + HasReduxp = 0 + }; +}; + +template<> struct unpacket_traits { + typedef std::complex type; + enum { + size = 4, + alignment = unpacket_traits::alignment + }; + typedef Packet2cd half; +}; + +template<> EIGEN_STRONG_INLINE Packet4cd padd(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_add_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd psub(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_sub_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pnegate(const Packet4cd& a) { return Packet4cd(pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a) +{ + const __m512d mask = _mm512_castsi512_pd( + _mm512_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0, + 0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0)); + return Packet4cd(_mm512_xor_pd(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) +{ + __m512d tmp1 = _mm512_shuffle_pd(a.v,a.v,0x0); + __m512d tmp2 = _mm512_shuffle_pd(a.v,a.v,0xFF); + __m512d tmp3 = _mm512_shuffle_pd(b.v,b.v,0x55); + __m512d odd = _mm512_mul_pd(tmp2, tmp3); + return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd)); +} + +template<> EIGEN_STRONG_INLINE Packet4cd pand (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_and_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd por (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_or_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pxor (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_xor_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pandnot(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_andnot_pd(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet4cd pload (const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet4cd ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cd(ploadu((const double*)from)); } + +template<> EIGEN_STRONG_INLINE Packet4cd pset1(const std::complex& from) +{ + #ifdef EIGEN_VECTORIZE_AVX512DQ + return Packet4cd(_mm512_broadcast_f64x2(pset1(from).v)); + #else + return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1(from).v)))); + #endif +} + +template<> EIGEN_STRONG_INLINE Packet4cd ploaddup(const std::complex* from) { + return Packet4cd(_mm512_insertf64x4( + _mm512_castpd256_pd512(ploaddup(from).v), ploaddup(from+1).v, 1)); +} + +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet4cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet4cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet4cd pgather, Packet4cd>(const std::complex* from, Index stride) +{ + return Packet4cd(_mm512_insertf64x4(_mm512_castpd256_pd512( + _mm256_insertf128_pd(_mm256_castpd128_pd256(pload(from+0*stride).v), pload(from+1*stride).v,1)), + _mm256_insertf128_pd(_mm256_castpd128_pd256(pload(from+2*stride).v), pload(from+3*stride).v,1), 1)); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet4cd>(std::complex* to, const Packet4cd& from, Index stride) +{ + __m512i fromi = _mm512_castpd_si512(from.v); + double* tod = (double*)(void*)to; + _mm_store_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) ); + _mm_store_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) ); + _mm_store_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) ); + _mm_store_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) ); +} + +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet4cd& a) +{ + __m128d low = _mm512_extractf64x2_pd(a.v, 0); + EIGEN_ALIGN16 double res[2]; + _mm_store_pd(res, low); + return std::complex(res[0],res[1]); +} + +template<> EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) { + return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, EIGEN_SSE_SHUFFLE_MASK(3,2,1,0))); +} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet4cd& a) +{ + return predux(padd(Packet2cd(_mm512_extractf64x4_pd(a.v,0)), + Packet2cd(_mm512_extractf64x4_pd(a.v,1)))); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet4cd& a) +{ + return predux_mul(pmul(Packet2cd(_mm512_extractf64x4_pd(a.v,0)), + Packet2cd(_mm512_extractf64x4_pd(a.v,1)))); +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4cd& first, const Packet4cd& second) + { + if (Offset==0) return; + palign_impl::run(first.v, second.v); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd,Packet8d) + +template<> EIGEN_STRONG_INLINE Packet4cd pdiv(const Packet4cd& a, const Packet4cd& b) +{ + Packet4cd num = pmul(a, pconj(b)); + __m512d tmp = _mm512_mul_pd(b.v, b.v); + __m512d denom = padd(_mm512_permute_pd(tmp,0x55), tmp); + return Packet4cd(_mm512_div_pd(num.v, denom)); +} + +template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip(const Packet4cd& x) +{ + return Packet4cd(_mm512_permute_pd(x.v,0x55)); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + ptranspose(reinterpret_cast&>(kernel)); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + ptranspose(reinterpret_cast&>(kernel)); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m512d T0 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, EIGEN_SSE_SHUFFLE_MASK(0,1,0,1)); // [a0 a1 b0 b1] + __m512d T1 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, EIGEN_SSE_SHUFFLE_MASK(2,3,2,3)); // [a2 a3 b2 b3] + __m512d T2 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, EIGEN_SSE_SHUFFLE_MASK(0,1,0,1)); // [c0 c1 d0 d1] + __m512d T3 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, EIGEN_SSE_SHUFFLE_MASK(2,3,2,3)); // [c2 c3 d2 d3] + + kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, EIGEN_SSE_SHUFFLE_MASK(1,3,1,3))); // [a3 b3 c3 d3] + kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, EIGEN_SSE_SHUFFLE_MASK(0,2,0,2))); // [a2 b2 c2 d2] + kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, EIGEN_SSE_SHUFFLE_MASK(1,3,1,3))); // [a1 b1 c1 d1] + kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, EIGEN_SSE_SHUFFLE_MASK(0,2,0,2))); // [a0 b0 c0 d0] +} + +template<> EIGEN_STRONG_INLINE Packet8cf pinsertfirst(const Packet8cf& a, std::complex b) +{ + Packet2cf tmp = Packet2cf(_mm512_extractf32x4_ps(a.v,0)); + tmp = pinsertfirst(tmp, b); + return Packet8cf( _mm512_insertf32x4(a.v, tmp.v, 0) ); +} + +template<> EIGEN_STRONG_INLINE Packet4cd pinsertfirst(const Packet4cd& a, std::complex b) +{ + return Packet4cd(_mm512_castsi512_pd( _mm512_inserti32x4(_mm512_castpd_si512(a.v), _mm_castpd_si128(pset1(b).v), 0) )); +} + +template<> EIGEN_STRONG_INLINE Packet8cf pinsertlast(const Packet8cf& a, std::complex b) +{ + Packet2cf tmp = Packet2cf(_mm512_extractf32x4_ps(a.v,3) ); + tmp = pinsertlast(tmp, b); + return Packet8cf( _mm512_insertf32x4(a.v, tmp.v, 3) ); +} + +template<> EIGEN_STRONG_INLINE Packet4cd pinsertlast(const Packet4cd& a, std::complex b) +{ + return Packet4cd(_mm512_castsi512_pd( _mm512_inserti32x4(_mm512_castpd_si512(a.v), _mm_castpd_si128(pset1(b).v), 3) )); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_COMPLEX_AVX512_H diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 1d38fb758..ce453f019 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -417,6 +417,7 @@ EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) { } #ifdef EIGEN_VECTORIZE_AVX512DQ +// FIXME: this does not look optimal, better load a Packet4d and shuffle... // Loads 4 doubles from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, // a3} template <> diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 0e8e0d2b3..b073fc8d4 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -61,20 +61,22 @@ template<> struct is_arithmetic<__m128> { enum { value = true }; }; template<> struct is_arithmetic<__m128i> { enum { value = true }; }; template<> struct is_arithmetic<__m128d> { enum { value = true }; }; +#define EIGEN_SSE_SHUFFLE_MASK(p,q,r,s) ((s)<<6|(r)<<4|(q)<<2|(p)) + #define vec4f_swizzle1(v,p,q,r,s) \ - (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p))))) + (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))) #define vec4i_swizzle1(v,p,q,r,s) \ - (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p)))) + (_mm_shuffle_epi32( v, EIGEN_SSE_SHUFFLE_MASK(p,q,r,s))) #define vec2d_swizzle1(v,p,q) \ - (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2))))) + (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), EIGEN_SSE_SHUFFLE_MASK(2*p,2*p+1,2*q,2*q+1)))) #define vec4f_swizzle2(a,b,p,q,r,s) \ - (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p)))) + (_mm_shuffle_ps( (a), (b), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s))) #define vec4i_swizzle2(a,b,p,q,r,s) \ - (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p)))))) + (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s))))) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ const Packet4f p4f_##NAME = pset1(X) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 9ca865bd1..9475a6ecc 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -506,13 +506,28 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - dest = pset1(*b); + dest = pset1(*b); } EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { + loadRhsQuad_impl(b,dest, typename conditional::type()); + } + + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const + { + // FIXME we can do better! + // what we want here is a ploadheight + RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]}; + dest = ploadquad(tmp); + } + + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const + { + eigen_internal_assert(RhsPacketSize<=8); dest = pset1(*b); } @@ -521,9 +536,10 @@ public: dest = pload(a); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploadu(a); + dest = ploadu(a); } EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) @@ -536,12 +552,14 @@ public: // pbroadcast2(b, b0, b1); // } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp) const { madd_impl(a, b, c, tmp, typename conditional::type()); } - EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + template + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const { #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); @@ -556,13 +574,14 @@ public: c += a * b; } - EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + template + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { + const conj_helper cj; r = cj.pmadd(c,alpha,r); } protected: - conj_helper cj; }; template @@ -581,13 +600,57 @@ DoublePacket padd(const DoublePacket &a, const DoublePacket the "4" in "downto4" +// corresponds to the number of complexes, so it means "8" +// it terms of real coefficients. + template -const DoublePacket& predux_half_dowto4(const DoublePacket &a) +const DoublePacket& +predux_half_dowto4(const DoublePacket &a, + typename enable_if::size<=8>::type* = 0) { return a; } -template struct unpacket_traits > { typedef DoublePacket half; }; +template +DoublePacket::half> +predux_half_dowto4(const DoublePacket &a, + typename enable_if::size==16>::type* = 0) +{ + // yes, that's pretty hackish :( + DoublePacket::half> res; + typedef std::complex::type> Cplx; + typedef typename packet_traits::type CplxPacket; + res.first = predux_half_dowto4(CplxPacket(a.first)).v; + res.second = predux_half_dowto4(CplxPacket(a.second)).v; + return res; +} + +// same here, "quad" actually means "8" in terms of real coefficients +template +void loadQuadToDoublePacket(const Scalar* b, DoublePacket& dest, + typename enable_if::size<=8>::type* = 0) +{ + dest.first = pset1(real(*b)); + dest.second = pset1(imag(*b)); +} + +template +void loadQuadToDoublePacket(const Scalar* b, DoublePacket& dest, + typename enable_if::size==16>::type* = 0) +{ + // yes, that's pretty hackish too :( + typedef typename NumTraits::Real RealScalar; + RealScalar r[4] = {real(b[0]), real(b[0]), real(b[1]), real(b[1])}; + RealScalar i[4] = {imag(b[0]), imag(b[0]), imag(b[1]), imag(b[1])}; + dest.first = ploadquad(r); + dest.second = ploadquad(i); +} + + +template struct unpacket_traits > { + typedef DoublePacket::half> half; +}; // template // DoublePacket pmadd(const DoublePacket &a, const DoublePacket &b) // { @@ -611,10 +674,10 @@ public: ConjRhs = _ConjRhs, Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, - RealPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? packet_traits::size : 1, + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? packet_traits::size : 1, + RealPacketSize = Vectorizable ? packet_traits::size : 1, // FIXME: should depend on NumberOfRegisters nr = 4, @@ -626,7 +689,7 @@ public: typedef typename packet_traits::type RealPacket; typedef typename packet_traits::type ScalarPacket; - typedef DoublePacket DoublePacketType; + typedef DoublePacket DoublePacketType; typedef typename conditional::type LhsPacket4Packing; typedef typename conditional::type LhsPacket; @@ -643,16 +706,17 @@ public: } // Scalar path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const { - dest = pset1(*b); + dest = pset1(*b); } // Vectorized path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const { - dest.first = pset1(real(*b)); - dest.second = pset1(imag(*b)); + dest.first = pset1(real(*b)); + dest.second = pset1(imag(*b)); } EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const @@ -661,8 +725,7 @@ public: } EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const { - eigen_internal_assert(unpacket_traits::size<=4); - loadRhs(b,dest); + loadQuadToDoublePacket(b,dest); } EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) @@ -696,12 +759,14 @@ public: dest = pload((const typename unpacket_traits::type*)(a)); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploadu((const typename unpacket_traits::type*)(a)); + dest = ploadu((const typename unpacket_traits::type*)(a)); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket& c, TmpType& /*tmp*/) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); @@ -714,29 +779,30 @@ public: EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } - EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const + template + EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacketType& alpha, ResPacketType& r) const { // assemble c - ResPacket tmp; + ResPacketType tmp; if((!ConjLhs)&&(!ConjRhs)) { - tmp = pcplxflip(pconj(ResPacket(c.second))); - tmp = padd(ResPacket(c.first),tmp); + tmp = pcplxflip(pconj(ResPacketType(c.second))); + tmp = padd(ResPacketType(c.first),tmp); } else if((!ConjLhs)&&(ConjRhs)) { - tmp = pconj(pcplxflip(ResPacket(c.second))); - tmp = padd(ResPacket(c.first),tmp); + tmp = pconj(pcplxflip(ResPacketType(c.second))); + tmp = padd(ResPacketType(c.first),tmp); } else if((ConjLhs)&&(!ConjRhs)) { - tmp = pcplxflip(ResPacket(c.second)); - tmp = padd(pconj(ResPacket(c.first)),tmp); + tmp = pcplxflip(ResPacketType(c.second)); + tmp = padd(pconj(ResPacketType(c.first)),tmp); } else if((ConjLhs)&&(ConjRhs)) { - tmp = pcplxflip(ResPacket(c.second)); - tmp = psub(pconj(ResPacket(c.first)),tmp); + tmp = pcplxflip(ResPacketType(c.second)); + tmp = psub(pconj(ResPacketType(c.first)),tmp); } r = pmadd(tmp,alpha,r); @@ -789,9 +855,10 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - dest = pset1(*b); + dest = pset1(*b); } void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) @@ -813,21 +880,23 @@ public: EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - eigen_internal_assert(unpacket_traits::size<=4); - loadRhs(b,dest); + dest = ploadquad(b); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploaddup(a); + dest = ploaddup(a); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp) const { madd_impl(a, b, c, tmp, typename conditional::type()); } - EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + template + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const { #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); @@ -843,13 +912,15 @@ public: c += a * b; } - EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + template + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { + const conj_helper cj; r = cj.pmadd(alpha,c,r); } protected: - conj_helper cj; + }; @@ -1649,7 +1720,7 @@ void gebp_kernel::half>::half>::size; if ((SwappedTraits::LhsProgress % 4) == 0 && (SwappedTraits::LhsProgress<=16) && - (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) && + (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) && (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr)) { SAccPacket C0, C1, C2, C3; @@ -1704,7 +1775,7 @@ void gebp_kernel=8,typename unpacket_traits::half,SResPacket>::type SResPacketHalf; typedef typename conditional=8,typename unpacket_traits::half,SLhsPacket>::type SLhsPacketHalf; - typedef typename conditional=8,typename unpacket_traits::half,SRhsPacket>::type SRhsPacketHalf; + typedef typename conditional=8,typename unpacket_traits::half,SRhsPacket>::type SRhsPacketHalf; typedef typename conditional=8,typename unpacket_traits::half,SAccPacket>::type SAccPacketHalf; SResPacketHalf R = res.template gatherPacket(i, j2); @@ -1734,7 +1805,7 @@ void gebp_kernel p; - p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0); + p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0); } else { diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 144083f1b..60c9dbc36 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -314,15 +314,18 @@ template void packetmath() ref[0] *= data1[i]; VERIFY(internal::isApprox(ref[0], internal::predux_mul(internal::pload(data1))) && "internal::predux_mul"); - for (int j=0; j(data1+j*PacketSize); + for (int j=0; j(data1+j*PacketSize); + } + internal::pstore(data2, internal::preduxp(packets)); + VERIFY(areApproxAbs(ref, data2, PacketSize, refvalue) && "internal::preduxp"); } - internal::pstore(data2, internal::preduxp(packets)); - VERIFY(areApproxAbs(ref, data2, PacketSize, refvalue) && "internal::preduxp"); for (int i=0; i Date: Fri, 7 Dec 2018 09:15:46 +0100 Subject: bug #1636: fix gemm performance issue with gcc>=6 and no FMA --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 9475a6ecc..88ca9cc97 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1387,10 +1387,18 @@ void gebp_kernel=6 without FMA (bug 1637) + #if EIGEN_GNUC_AT_LEAST(6,0) + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND asm("" : [a0] "+x" (A0), [a1] "+x" (A1) ); + #else + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND + #endif + #define EIGEN_GEBGP_ONESTEP(K) \ do { \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ - EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \ traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ -- cgit v1.2.3 From 956678a4eff96979ab2139e9ce14f98bb820aa9d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 7 Dec 2018 18:03:36 +0100 Subject: bug #1515: disable gebp's 3pX4 micro kernel for MSVC<=19.14 because of register spilling. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 88ca9cc97..bcb332cb9 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -370,10 +370,12 @@ public: // register block size along the M direction (currently, this one cannot be modified) default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, -#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \ + && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914)) // we assume 16 registers // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined, // then using 3*LhsPacketSize triggers non-implemented paths in syrk. + // Bug 1515: MSVC prior to v19.14 yields to register spilling. mr = Vectorizable ? 3*LhsPacketSize : default_mr, #else mr = default_mr, -- cgit v1.2.3 From 426bce7529f148bbec3fd386ddf6d6c4880de347 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 8 Dec 2018 09:44:21 +0100 Subject: fix EIGEN_GEBP_2PX4_SPILLING_WORKAROUND for non vectorized type, and non x86/64 target --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index bcb332cb9..1197fec94 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1389,11 +1389,10 @@ void gebp_kernel=6 without FMA (bug 1637) #if EIGEN_GNUC_AT_LEAST(6,0) - #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND asm("" : [a0] "+x" (A0), [a1] "+x" (A1) ); + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+rm" (A0),[a1] "+rm" (A1)); #else #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND #endif -- cgit v1.2.3 From bff90bf270b330612e4d7e4fdba96b3671826208 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 8 Dec 2018 18:58:28 +0100 Subject: workaround "may be used uninitialized" warning --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 1197fec94..61521e2bb 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1399,7 +1399,6 @@ void gebp_kernel Date: Mon, 10 Dec 2018 23:22:44 +0100 Subject: enable spilling workaround on architectures with SSE/AVX --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 61521e2bb..b1e98b6f9 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1391,7 +1391,7 @@ void gebp_kernel=6 without FMA (bug 1637) - #if EIGEN_GNUC_AT_LEAST(6,0) + #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+rm" (A0),[a1] "+rm" (A1)); #else #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND -- cgit v1.2.3 From 7166496f7011e63ff90cbb8b1b41642aaa7dbcc3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 11 Dec 2018 13:24:42 +0100 Subject: bug #1643: fix compilation issue with gcc and no optimizaion --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index b1e98b6f9..3aaa68c4c 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1392,7 +1392,7 @@ void gebp_kernel=6 without FMA (bug 1637) #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) - #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+rm" (A0),[a1] "+rm" (A1)); + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1)); #else #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND #endif -- cgit v1.2.3 From f159cf3d750a7930a29abf172d9436550cc8369f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 11 Dec 2018 15:36:27 +0100 Subject: Artificially increase l1-blocking size for AVX512. +10% speedup with current kernels. With a 6pX4 kernel (not committed yet), this provides a +20% speedup. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 3aaa68c4c..968cec78b 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -101,6 +101,16 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // at the register level. This small horizontal panel has to stay within L1 cache. std::ptrdiff_t l1, l2, l3; manage_caching_sizes(GetAction, &l1, &l2, &l3); + #ifdef EIGEN_VECTORIZE_AVX512 + // We need to find a rationale for that, but without this adjustment, + // performance with AVX512 is pretty bad, like -20% slower. + // One reason is that with increasing packet-size, the blocking size k + // has to become pretty small if we want that 1 lhs panel fit within L1. + // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are: + // k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144. + // This is quite small for a good reuse of the accumulation registers. + l1 *= 4; + #endif if (num_threads > 1) { typedef typename Traits::ResScalar ResScalar; @@ -372,7 +382,7 @@ public: default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \ && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914)) - // we assume 16 registers + // we assume 16 registers or more // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined, // then using 3*LhsPacketSize triggers non-implemented paths in syrk. // Bug 1515: MSVC prior to v19.14 yields to register spilling. -- cgit v1.2.3 From 1024a70e82c0301d9f699fd344613e9cd417ab95 Mon Sep 17 00:00:00 2001 From: Gustavo Lima Chaves Date: Fri, 21 Dec 2018 11:03:18 -0800 Subject: gebp: Add new ½ and ¼ packet rows per (peeling) round on the lhs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The patch works by altering the gebp lhs packing routines to also consider ½ and ¼ packet lenght rows when packing, besides the original whole package and row-by-row attempts. Finally, gebp itself will try to fit a fraction of a packet at a time if: i) ½ and/or ¼ packets are available for the current context (e.g. AVX2 and SSE-sized SIMD register for x86) ii) The matrix's height is favorable to it (it may be it's too small in that dimension to take full advantage of the current/maximum packet width or it may be the case that last rows may take advantage of smaller packets before gebp goes row-by-row) This helps mitigate huge slowdowns one had on AVX512 builds when compared to AVX2 ones, for some dimensions. Gains top at an extra 1x in throughput. This patch is a complement to changeset 4ad359237aeb519dbd4b55eba43057b37988838c . Since packing is changed, Eigen users which would go for very low-level API usage, like TensorFlow, will have to be adapted to work fine with the changes. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 683 ++++++++++++++-------- Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 23 +- 2 files changed, 465 insertions(+), 241 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 968cec78b..afbd83eda 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -15,7 +15,13 @@ namespace Eigen { namespace internal { -template +enum PacketSizeType { + PacketFull = 0, + PacketHalf, + PacketQuarter +}; + +template class gebp_traits; @@ -347,6 +353,43 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ // #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); #endif +template +struct packet_conditional { typedef T3 type; }; + +template +struct packet_conditional { typedef T1 type; }; + +template +struct packet_conditional { typedef T2 type; }; + +#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \ + typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + prefix ## name ## Packet + +#define PACKET_DECL_COND(name, packet_size) \ + typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + name ## Packet + +#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \ + typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + prefix ## ScalarPacket + +#define PACKET_DECL_COND_SCALAR(packet_size) \ + typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + ScalarPacket + /* Vectorization logic * real*real: unpack rhs to constant packets, ... * @@ -357,7 +400,7 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ * cplx*real : unpack rhs to constant packets, ... * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual */ -template +template class gebp_traits { public: @@ -365,13 +408,17 @@ public: typedef _RhsScalar RhsScalar; typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, - Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1, + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, @@ -395,9 +442,6 @@ public: RhsProgress = 1 }; - typedef typename packet_traits::type _LhsPacket; - typedef typename packet_traits::type _RhsPacket; - typedef typename packet_traits::type _ResPacket; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; @@ -473,21 +517,25 @@ public: }; -template -class gebp_traits, RealScalar, _ConjLhs, false, Arch> +template +class gebp_traits, RealScalar, _ConjLhs, false, Arch, _PacketSize> { public: typedef std::complex LhsScalar; typedef RealScalar RhsScalar; typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = false, - Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1, + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = 4, @@ -502,10 +550,6 @@ public: RhsProgress = 1 }; - typedef typename packet_traits::type _LhsPacket; - typedef typename packet_traits::type _RhsPacket; - typedef typename packet_traits::type _ResPacket; - typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; @@ -672,8 +716,8 @@ template struct unpacket_traits > { // return res; // } -template -class gebp_traits, std::complex, _ConjLhs, _ConjRhs,Arch> +template +class gebp_traits, std::complex, _ConjLhs, _ConjRhs, Arch, _PacketSize > { public: typedef std::complex Scalar; @@ -681,15 +725,21 @@ public: typedef std::complex RhsScalar; typedef std::complex ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND(Real, _PacketSize); + PACKET_DECL_COND_SCALAR(_PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, - Vectorizable = packet_traits::Vectorizable - && packet_traits::Vectorizable, - ResPacketSize = Vectorizable ? packet_traits::size : 1, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - RealPacketSize = Vectorizable ? packet_traits::size : 1, + Vectorizable = unpacket_traits::vectorizable + && unpacket_traits::vectorizable, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RealPacketSize = Vectorizable ? unpacket_traits::size : 1, // FIXME: should depend on NumberOfRegisters nr = 4, @@ -699,8 +749,6 @@ public: RhsProgress = 1 }; - typedef typename packet_traits::type RealPacket; - typedef typename packet_traits::type ScalarPacket; typedef DoublePacket DoublePacketType; typedef typename conditional::type LhsPacket4Packing; @@ -824,8 +872,8 @@ protected: conj_helper cj; }; -template -class gebp_traits, false, _ConjRhs, Arch> +template +class gebp_traits, false, _ConjRhs, Arch, _PacketSize > { public: typedef std::complex Scalar; @@ -833,14 +881,25 @@ public: typedef Scalar RhsScalar; typedef Scalar ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Real, _PacketSize); + PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize); + +#undef PACKET_DECL_COND_SCALAR_PREFIX +#undef PACKET_DECL_COND_PREFIX +#undef PACKET_DECL_COND_SCALAR +#undef PACKET_DECL_COND + enum { ConjLhs = false, ConjRhs = _ConjRhs, - Vectorizable = packet_traits::Vectorizable - && packet_traits::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1, + Vectorizable = unpacket_traits<_RealPacket>::vectorizable + && unpacket_traits<_ScalarPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, // FIXME: should depend on NumberOfRegisters @@ -851,10 +910,6 @@ public: RhsProgress = 1 }; - typedef typename packet_traits::type _LhsPacket; - typedef typename packet_traits::type _RhsPacket; - typedef typename packet_traits::type _ResPacket; - typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; @@ -980,26 +1035,44 @@ struct gebp_traits template struct gebp_kernel { - typedef gebp_traits Traits; + typedef gebp_traits Traits; + typedef gebp_traits HalfTraits; + typedef gebp_traits QuarterTraits; + typedef typename Traits::ResScalar ResScalar; typedef typename Traits::LhsPacket LhsPacket; typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; - typedef gebp_traits SwappedTraits; + typedef gebp_traits SwappedTraits; + typedef typename SwappedTraits::ResScalar SResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; typedef typename SwappedTraits::RhsPacket SRhsPacket; typedef typename SwappedTraits::ResPacket SResPacket; typedef typename SwappedTraits::AccPacket SAccPacket; + typedef typename HalfTraits::LhsPacket LhsPacketHalf; + typedef typename HalfTraits::RhsPacket RhsPacketHalf; + typedef typename HalfTraits::ResPacket ResPacketHalf; + typedef typename HalfTraits::AccPacket AccPacketHalf; + + typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; + typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; + typedef typename QuarterTraits::ResPacket ResPacketQuarter; + typedef typename QuarterTraits::AccPacket AccPacketQuarter; + typedef typename DataMapper::LinearMapper LinearMapper; enum { Vectorizable = Traits::Vectorizable, LhsProgress = Traits::LhsProgress, + LhsProgressHalf = HalfTraits::LhsProgress, + LhsProgressQuarter = QuarterTraits::LhsProgress, RhsProgress = Traits::RhsProgress, + RhsProgressHalf = HalfTraits::RhsProgress, + RhsProgressQuarter = QuarterTraits::RhsProgress, ResPacketSize = Traits::ResPacketSize }; @@ -1010,11 +1083,11 @@ struct gebp_kernel }; template::LhsProgress> +int SwappedLhsProgress = gebp_traits::LhsProgress> struct last_row_process_16_packets { - typedef gebp_traits Traits; - typedef gebp_traits SwappedTraits; + typedef gebp_traits Traits; + typedef gebp_traits SwappedTraits; typedef typename Traits::ResScalar ResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; @@ -1042,8 +1115,8 @@ struct last_row_process_16_packets template struct last_row_process_16_packets { - typedef gebp_traits Traits; - typedef gebp_traits SwappedTraits; + typedef gebp_traits Traits; + typedef gebp_traits SwappedTraits; typedef typename Traits::ResScalar ResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; @@ -1089,6 +1162,195 @@ struct last_row_process_16_packets +struct lhs_process_one_packet +{ + + EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + { + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhs(&blA[(0+1*K)*(LhsProgress)], *A0); + traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3); + traits.madd(*A0, *B_0, *C0, *B_0); + traits.madd(*A0, *B1, *C1, *B1); + traits.madd(*A0, *B2, *C2, *B2); + traits.madd(*A0, *B3, *C3, *B3); + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + } + + EIGEN_STRONG_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha, Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB, Index prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4) + { + GEBPTraits traits; + + // loops on each largest micro horizontal panel of lhs + // (LhsProgress x depth) + for(Index i=peelStart; i(alpha); + + R0 = r0.template loadPacket(0); + R1 = r1.template loadPacket(0); + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + r0.storePacket(0, R0); + r1.storePacket(0, R1); + + R0 = r2.template loadPacket(0); + R1 = r3.template loadPacket(0); + traits.acc(C2, alphav, R0); + traits.acc(C3, alphav, R1); + r2.storePacket(0, R0); + r3.storePacket(0, R1); + } + + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2(alpha); + R0 = r0.template loadPacket(0); + traits.acc(C0, alphav, R0); + r0.storePacket(0, R0); + } + } + } +}; + +template +struct lhs_process_fraction_of_packet : lhs_process_one_packet +{ + +EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + { + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0); + traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3); + traits.madd(*A0, *B_0, *C0, *B_0); + traits.madd(*A0, *B1, *C1, *B1); + traits.madd(*A0, *B2, *C2, *B2); + traits.madd(*A0, *B3, *C3, *B3); + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + } +}; + template EIGEN_DONT_INLINE void gebp_kernel @@ -1105,7 +1367,9 @@ void gebp_kernel=4 ? (cols/4) * 4 : 0; const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0; const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0; - const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0; + const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0; + const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0; + const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0; enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) const Index peeled_kc = depth & ~(pk-1); const Index prefetch_res_offset = 32/sizeof(ResScalar); @@ -1559,174 +1823,29 @@ void gebp_kernel=1*Traits::LhsProgress) { - // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth) - for(Index i=peeled_mc2; i(alpha); - - R0 = r0.template loadPacket(0 * Traits::ResPacketSize); - R1 = r1.template loadPacket(0 * Traits::ResPacketSize); - traits.acc(C0, alphav, R0); - traits.acc(C1, alphav, R1); - r0.storePacket(0 * Traits::ResPacketSize, R0); - r1.storePacket(0 * Traits::ResPacketSize, R1); - - R0 = r2.template loadPacket(0 * Traits::ResPacketSize); - R1 = r3.template loadPacket(0 * Traits::ResPacketSize); - traits.acc(C2, alphav, R0); - traits.acc(C3, alphav, R1); - r2.storePacket(0 * Traits::ResPacketSize, R0); - r3.storePacket(0 * Traits::ResPacketSize, R1); - } - - // Deal with remaining columns of the rhs - for(Index j2=packet_cols4; j2(alpha); - R0 = r0.template loadPacket(0 * Traits::ResPacketSize); - traits.acc(C0, alphav, R0); - r0.storePacket(0 * Traits::ResPacketSize, R0); - } - } + lhs_process_one_packet p; + p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + } + //---------- Process LhsProgressHalf rows at once ---------- + if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf) + { + lhs_process_fraction_of_packet p; + p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + } + //---------- Process LhsProgressQuarter rows at once ---------- + if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter) + { + lhs_process_fraction_of_packet p; + p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); } //---------- Process remaining rows, 1 at once ---------- - if(peeled_mc1 ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - enum { PacketSize = unpacket_traits::size }; + typedef typename unpacket_traits::half HalfPacket; + typedef typename unpacket_traits::half>::half QuarterPacket; + enum { PacketSize = unpacket_traits::size, + HalfPacketSize = unpacket_traits::size, + QuarterPacketSize = unpacket_traits::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -1928,9 +2053,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; - const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 - : Pack2>1 ? (rows/Pack2)*Pack2 : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; + const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0; + const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0; + const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter + : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0; Index i=0; @@ -1989,20 +2117,60 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=HalfPacketSize) + { + for(; i(i+0*(HalfPacketSize), k); + pstoreu(blockA+count, cj.pconj(A)); + count+=HalfPacketSize; + } + if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth); + } + } + // Pack quarter packets + if(HasQuarter && Pack1>=QuarterPacketSize) + { + for(; i(i+0*(QuarterPacketSize), k); + pstoreu(blockA+count, cj.pconj(A)); + count+=QuarterPacketSize; + } + if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth); + } + } + // Pack2 may be *smaller* than PacketSize—that happens for + // products like real * complex, where we have to go half the + // progress on the lhs in order to duplicate those operands to + // address both real & imaginary parts on the rhs. This portion will + // pack those half ones until they match the number expected on the + // last peeling loop at this point (for the rhs). if(Pack21) { - for(; i ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - enum { PacketSize = unpacket_traits::size }; + typedef typename unpacket_traits::half HalfPacket; + typedef typename unpacket_traits::half>::half QuarterPacket; + enum { PacketSize = unpacket_traits::size, + HalfPacketSize = unpacket_traits::size, + QuarterPacketSize = unpacket_traits::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -2031,37 +2205,51 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; Index count = 0; + bool gone_half = false, gone_quarter = false, gone_last = false; -// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; -// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; -// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - - int pack = Pack1; Index i = 0; + int pack = Pack1; + int psize = PacketSize; while(pack>0) { Index remaining_rows = rows-i; - Index peeled_mc = i+(remaining_rows/pack)*pack; + Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack; + Index starting_pos = i; for(; i=PacketSize) + if(pack>=psize && psize >= QuarterPacketSize) { - for(; k kernel; - for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); - ptranspose(kernel); - for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + if (psize == PacketSize) { + PacketBlock kernel; + for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); + ptranspose(kernel); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + } else if (HasHalf && psize == HalfPacketSize) { + gone_half = true; + PacketBlock kernel_half; + for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket(i+p+m, k); + ptranspose(kernel_half); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); + } else if (HasQuarter && psize == QuarterPacketSize) { + gone_quarter = true; + PacketBlock kernel_quarter; + for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket(i+p+m, k); + ptranspose(kernel_quarter); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); + } } - count += PacketSize*pack; + count += psize*pack; } } + for(; k= psize/2 || left >= psize/4) && + ((psize/2 == HalfPacketSize && HasHalf && !gone_half) || + (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) { + psize /= 2; + pack = psize; + continue; + } + // Pack2 may be *smaller* than PacketSize—that happens for + // products like real * complex, where we have to go half the + // progress on the lhs in order to duplicate those operands to + // address both real & imaginary parts on the rhs. This portion will + // pack those half ones until they match the number expected on the + // last peeling loop at this point (for the rhs). + if (Pack2 < PacketSize && !gone_last) { + gone_last = true; + psize = pack = left & ~1; + } + } } for(; i::size }; + typedef typename unpacket_traits::type>::half HalfPacket; + typedef typename unpacket_traits::type>::half>::half QuarterPacket; + enum { PacketSize = packet_traits::size, + HalfPacketSize = unpacket_traits::size, + QuarterPacketSize = unpacket_traits::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; + const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; //Index peeled_mc3 = (rows/Pack1)*Pack1; const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; - const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; + const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0; if(Pack1>=3*PacketSize) for(Index i=0; i(blockA, lhs, cols, i, count); + if(HasHalf && Pack1>=HalfPacketSize) + for(Index i=peeled_mc1; i(blockA, lhs, cols, i, count); + + if(HasQuarter && Pack1>=QuarterPacketSize) + for(Index i=peeled_mc_half; i(blockA, lhs, cols, i, count); + // do the same with mr==1 - for(Index i=peeled_mc1; i Date: Sat, 22 Dec 2018 13:09:07 +0100 Subject: Make code compile again for older compilers. See https://stackoverflow.com/questions/7411515/ --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 968cec78b..d6dd9dc17 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -589,7 +589,7 @@ public: template EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { - const conj_helper cj; + conj_helper cj; r = cj.pmadd(c,alpha,r); } @@ -927,7 +927,7 @@ public: template EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { - const conj_helper cj; + conj_helper cj; r = cj.pmadd(alpha,c,r); } -- cgit v1.2.3 From dbfcceabf50db9c1dc6d82863aa9670a1b53c0a4 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Wed, 16 Jan 2019 12:51:36 +0800 Subject: Bug: 1633: refactor gebp kernel and optimize for neon --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 435 ++++++++++++++-------- 1 file changed, 274 insertions(+), 161 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index d6dd9dc17..bfc7d1979 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -347,6 +347,14 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ // #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); #endif +template +struct RhsPanelHelper { + private: + typedef typename conditional<(registers_taken < 15), RhsPacket, RhsPacketx4>::type inter_type; + public: + typedef typename conditional<(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS < 32), RhsPacket, inter_type>::type type; +}; + /* Vectorization logic * real*real: unpack rhs to constant packets, ... * @@ -404,29 +412,42 @@ public: typedef typename conditional::type ResPacket; typedef LhsPacket LhsPacket4Packing; + typedef struct { + RhsPacket B_0, B1, B2, B3; + const RhsPacket& get(const FixedInt<0>&) const { return B_0; } + const RhsPacket& get(const FixedInt<1>&) const { return B1; } + const RhsPacket& get(const FixedInt<2>&) const { return B2; } + const RhsPacket& get(const FixedInt<3>&) const { return B3; } + } RhsPacketx4; + typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1(ResScalar(0)); } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - pbroadcast4(b, b0, b1, b2, b3); - } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// pbroadcast2(b, b0, b1); -// } - + template EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { dest = pset1(*b); } - + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + { + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad(b); @@ -444,8 +465,8 @@ public: dest = ploadu(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const FixedInt&) const { conj_helper cj; // It would be a lot cleaner to call pmadd all the time. Unfortunately if we @@ -460,6 +481,13 @@ public: #endif } + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacketx4&, const FixedInt& lane) const + { + RhsPacket tmp; + madd(a, b.get(lane), c, tmp, lane); + } + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { r = pmadd(c,alpha,r); @@ -511,6 +539,14 @@ public: typedef typename conditional::type ResPacket; typedef LhsPacket LhsPacket4Packing; + typedef struct { + RhsPacket B_0, B1, B2, B3; + const RhsPacket& get(const FixedInt<0>&) const { return B_0; } + const RhsPacket& get(const FixedInt<1>&) const { return B1; } + const RhsPacket& get(const FixedInt<2>&) const { return B2; } + const RhsPacket& get(const FixedInt<3>&) const { return B3; } + } RhsPacketx4; + typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) @@ -523,6 +559,20 @@ public: { dest = pset1(*b); } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { @@ -554,18 +604,8 @@ public: dest = ploadu(a); } - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - pbroadcast4(b, b0, b1, b2, b3); - } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// pbroadcast2(b, b0, b1); -// } - - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, AccPacketType& c, RhsPacket& tmp, const FixedInt&) const { madd_impl(a, b, c, tmp, typename conditional::type()); } @@ -586,6 +626,13 @@ public: c += a * b; } + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacketx4&, const FixedInt& lane) const + { + RhsPacket tmp; + madd(a, b.get(lane), c, tmp, lane); + } + template EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { @@ -708,6 +755,14 @@ public: typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; typedef typename conditional::type AccPacket; + + typedef struct { + RhsPacket B_0, B1, B2, B3; + const RhsPacket& get(const FixedInt<0>&) const { return B_0; } + const RhsPacket& get(const FixedInt<1>&) const { return B1; } + const RhsPacket& get(const FixedInt<2>&) const { return B2; } + const RhsPacket& get(const FixedInt<3>&) const { return B3; } + } RhsPacketx4; EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } @@ -730,39 +785,39 @@ public: dest.first = pset1(real(*b)); dest.second = pset1(imag(*b)); } - - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - loadRhs(b,dest); + loadRhs(b, dest.B_0); + loadRhs(b + 1, dest.B1); + loadRhs(b + 2, dest.B2); + loadRhs(b + 3, dest.B3); } - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const + + // Scalar path + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const { - loadQuadToDoublePacket(b,dest); + loadRhs(b, dest); } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + + // Vectorized path + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket& dest) const { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); - loadRhs(b+2, b2); - loadRhs(b+3, b3); + loadRhs(b, dest); } - - // Vectorized path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1) + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); } - // Scalar path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1) + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const + { + loadRhs(b,dest); + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); + loadQuadToDoublePacket(b,dest); } // nothing special here @@ -777,17 +832,25 @@ public: dest = ploadu((const typename unpacket_traits::type*)(a)); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket& c, TmpType& /*tmp*/) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, DoublePacket& c, TmpType& /*tmp*/, const FixedInt&) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const FixedInt&) const { c = cj.pmadd(a,b,c); } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacketx4&, const FixedInt& lane) const + { + RhsPacket tmp; + madd(a, b.get(lane), c, tmp, lane); + } EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } @@ -860,6 +923,14 @@ public: typedef typename conditional::type ResPacket; typedef LhsPacket LhsPacket4Packing; + typedef struct { + RhsPacket B_0, B1, B2, B3; + const RhsPacket& get(const FixedInt<0>&) const { return B_0; } + const RhsPacket& get(const FixedInt<1>&) const { return B1; } + const RhsPacket& get(const FixedInt<2>&) const { return B2; } + const RhsPacket& get(const FixedInt<3>&) const { return B3; } + } RhsPacketx4; + typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) @@ -872,18 +943,20 @@ public: { dest = pset1(*b); } - - void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - pbroadcast4(b, b0, b1, b2, b3); + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// // FIXME not sure that's the best way to implement it! -// b0 = pload1(b+0); -// b1 = pload1(b+1); -// } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { @@ -901,8 +974,8 @@ public: dest = ploaddup(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, AccPacketType& c, RhsPacket& tmp, const FixedInt&) const { madd_impl(a, b, c, tmp, typename conditional::type()); } @@ -924,6 +997,13 @@ public: c += a * b; } + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacketx4&, const FixedInt& lane) const + { + RhsPacket tmp; + madd(a, b.get(lane), c, tmp, lane); + } + template EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { @@ -932,7 +1012,7 @@ public: } protected: - + }; @@ -944,27 +1024,54 @@ struct gebp_traits { typedef float RhsPacket; - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + typedef float32x4_t RhsPacketx4; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - loadRhs(b+0, b0); - loadRhs(b+1, b1); - loadRhs(b+2, b2); - loadRhs(b+3, b3); + dest = *b; } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + dest = vld1q_f32(b); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; } + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const + {} + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b,dest); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/) const + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { + c += a * b; + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacketx4& /*tmp*/, const FixedInt<0>&) const + { + c = vfmaq_lane_f32(c, a, vget_low_f32(b), 0); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacketx4& /*tmp*/, const FixedInt<1>&) const + { + c = vfmaq_lane_f32(c, a, vget_low_f32(b), 1); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacketx4& /*tmp*/, const FixedInt<2>&) const + { + c = vfmaq_lane_f32(c, a, vget_high_f32(b), 0); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacketx4& /*tmp*/, const FixedInt<3>&) const { - c = vfmaq_n_f32(c, a, b); + c = vfmaq_lane_f32(c, a, vget_high_f32(b), 1); } }; @@ -986,6 +1093,9 @@ struct gebp_kernel typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; + typedef typename Traits::RhsPacketx4 RhsPacketx4; + + typedef typename RhsPanelHelper::type RhsPanel15; typedef gebp_traits SwappedTraits; typedef typename SwappedTraits::ResScalar SResScalar; @@ -1075,7 +1185,7 @@ struct last_row_process_16_packets); blB += SwappedTraits::LhsProgress/4; blA += 1; } @@ -1166,36 +1276,39 @@ void gebp_kernel); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A2, rhs_panel, C8, T0, fix<0>); \ + traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A2, rhs_panel, C9, T0, fix<1>); \ + traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A2, rhs_panel, C10, T0, fix<2>); \ + traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + traits.madd(A2, rhs_panel, C11, T0, fix<3>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ + } while (false) internal::prefetch(blB); EIGEN_GEBP_ONESTEP(0); @@ -1215,7 +1328,7 @@ void gebp_kernel); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ + traits.madd(A2, B_0, C8, B_0, fix<0>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ + } while (false) + EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); @@ -1397,7 +1510,7 @@ void gebp_kernel=6 without FMA (bug 1637) @@ -1406,24 +1519,24 @@ void gebp_kernel); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ + } while (false) + internal::prefetch(blB+(48+0)); EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); @@ -1443,7 +1556,7 @@ void gebp_kernel); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \ } while(false) @@ -1596,19 +1709,19 @@ void gebp_kernel); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \ } while(false) internal::prefetch(blB+(48+0)); @@ -1630,7 +1743,7 @@ void gebp_kernel); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \ } while(false); EIGEN_GEBGP_ONESTEP(0); @@ -1763,15 +1876,15 @@ void gebp_kernel); + straits.madd(A1,B_1,C1,B_1, fix<0>); straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0); straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1); straits.loadRhsQuad(blA+2*spk, B_0); straits.loadRhsQuad(blA+3*spk, B_1); - straits.madd(A0,B_0,C2,B_0); - straits.madd(A1,B_1,C3,B_1); + straits.madd(A0,B_0,C2,B_0, fix<0>); + straits.madd(A1,B_1,C3,B_1, fix<0>); blB += 4*SwappedTraits::LhsProgress; blA += 4*spk; @@ -1784,7 +1897,7 @@ void gebp_kernel); blB += SwappedTraits::LhsProgress; blA += spk; @@ -1808,7 +1921,7 @@ void gebp_kernel); straits.acc(c0, alphav, R); } else -- cgit v1.2.3 From 0b466b69336497628de8216ff797369b009a2946 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 16 Jan 2019 13:50:13 +0100 Subject: bug #1633: use proper type for madd temporaries, factorize RhsPacketx4. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 113 +++++++++------------- 1 file changed, 48 insertions(+), 65 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index bfc7d1979..04b7bfa7e 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -355,6 +355,16 @@ struct RhsPanelHelper { typedef typename conditional<(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS < 32), RhsPacket, inter_type>::type type; }; +template +struct QuadPacket +{ + Packet B_0, B1, B2, B3; + const Packet& get(const FixedInt<0>&) const { return B_0; } + const Packet& get(const FixedInt<1>&) const { return B1; } + const Packet& get(const FixedInt<2>&) const { return B2; } + const Packet& get(const FixedInt<3>&) const { return B3; } +}; + /* Vectorization logic * real*real: unpack rhs to constant packets, ... * @@ -412,14 +422,7 @@ public: typedef typename conditional::type ResPacket; typedef LhsPacket LhsPacket4Packing; - typedef struct { - RhsPacket B_0, B1, B2, B3; - const RhsPacket& get(const FixedInt<0>&) const { return B_0; } - const RhsPacket& get(const FixedInt<1>&) const { return B1; } - const RhsPacket& get(const FixedInt<2>&) const { return B2; } - const RhsPacket& get(const FixedInt<3>&) const { return B3; } - } RhsPacketx4; - + typedef QuadPacket RhsPacketx4; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) @@ -465,8 +468,8 @@ public: dest = ploadu(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const FixedInt&) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { conj_helper cj; // It would be a lot cleaner to call pmadd all the time. Unfortunately if we @@ -481,10 +484,9 @@ public: #endif } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacketx4&, const FixedInt& lane) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const { - RhsPacket tmp; madd(a, b.get(lane), c, tmp, lane); } @@ -539,13 +541,7 @@ public: typedef typename conditional::type ResPacket; typedef LhsPacket LhsPacket4Packing; - typedef struct { - RhsPacket B_0, B1, B2, B3; - const RhsPacket& get(const FixedInt<0>&) const { return B_0; } - const RhsPacket& get(const FixedInt<1>&) const { return B1; } - const RhsPacket& get(const FixedInt<2>&) const { return B2; } - const RhsPacket& get(const FixedInt<3>&) const { return B3; } - } RhsPacketx4; + typedef QuadPacket RhsPacketx4; typedef ResPacket AccPacket; @@ -604,8 +600,8 @@ public: dest = ploadu(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, AccPacketType& c, RhsPacket& tmp, const FixedInt&) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional::type()); } @@ -626,10 +622,9 @@ public: c += a * b; } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacketx4&, const FixedInt& lane) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const { - RhsPacket tmp; madd(a, b.get(lane), c, tmp, lane); } @@ -756,13 +751,8 @@ public: typedef typename conditional::type ResPacket; typedef typename conditional::type AccPacket; - typedef struct { - RhsPacket B_0, B1, B2, B3; - const RhsPacket& get(const FixedInt<0>&) const { return B_0; } - const RhsPacket& get(const FixedInt<1>&) const { return B1; } - const RhsPacket& get(const FixedInt<2>&) const { return B2; } - const RhsPacket& get(const FixedInt<3>&) const { return B3; } - } RhsPacketx4; + // this actualy holds 8 packets! + typedef QuadPacket RhsPacketx4; EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } @@ -807,9 +797,7 @@ public: loadRhs(b, dest); } - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const - { - } + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { @@ -832,23 +820,22 @@ public: dest = ploadu((const typename unpacket_traits::type*)(a)); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, DoublePacket& c, TmpType& /*tmp*/, const FixedInt&) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, DoublePacket& c, TmpType& /*tmp*/, const LaneIdType&) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const FixedInt&) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const { c = cj.pmadd(a,b,c); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacketx4&, const FixedInt& lane) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const { - RhsPacket tmp; madd(a, b.get(lane), c, tmp, lane); } @@ -922,15 +909,7 @@ public: typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; typedef LhsPacket LhsPacket4Packing; - - typedef struct { - RhsPacket B_0, B1, B2, B3; - const RhsPacket& get(const FixedInt<0>&) const { return B_0; } - const RhsPacket& get(const FixedInt<1>&) const { return B1; } - const RhsPacket& get(const FixedInt<2>&) const { return B2; } - const RhsPacket& get(const FixedInt<3>&) const { return B3; } - } RhsPacketx4; - + typedef QuadPacket RhsPacketx4; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) @@ -974,8 +953,8 @@ public: dest = ploaddup(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, AccPacketType& c, RhsPacket& tmp, const FixedInt&) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional::type()); } @@ -997,10 +976,9 @@ public: c += a * b; } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacketx4&, const FixedInt& lane) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const { - RhsPacket tmp; madd(a, b.get(lane), c, tmp, lane); } @@ -1054,22 +1032,22 @@ struct gebp_traits c += a * b; } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacketx4& /*tmp*/, const FixedInt<0>&) const + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const { c = vfmaq_lane_f32(c, a, vget_low_f32(b), 0); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacketx4& /*tmp*/, const FixedInt<1>&) const + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const { c = vfmaq_lane_f32(c, a, vget_low_f32(b), 1); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacketx4& /*tmp*/, const FixedInt<2>&) const + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const { c = vfmaq_lane_f32(c, a, vget_high_f32(b), 0); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacketx4& /*tmp*/, const FixedInt<3>&) const + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const { c = vfmaq_lane_f32(c, a, vget_high_f32(b), 1); } @@ -1277,7 +1255,8 @@ void gebp_kernel=6 without FMA (bug 1637) @@ -1556,7 +1536,8 @@ void gebp_kernel Date: Wed, 16 Jan 2019 21:22:20 +0100 Subject: bug #1661: fix regression in GEBP and AVX512 --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 04b7bfa7e..8b4b1dbd3 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1307,7 +1307,8 @@ void gebp_kernel Date: Wed, 16 Jan 2019 21:47:42 +0100 Subject: GEBP: cleanup logic to choose between a 4 packets of 1 packet --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 8b4b1dbd3..fdc050e05 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -350,9 +350,9 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ template struct RhsPanelHelper { private: - typedef typename conditional<(registers_taken < 15), RhsPacket, RhsPacketx4>::type inter_type; + static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken; public: - typedef typename conditional<(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS < 32), RhsPacket, inter_type>::type type; + typedef typename conditional=4, RhsPacketx4, RhsPacket>::type type; }; template -- cgit v1.2.3 From 0f028f61cb4d7731512fff861d8228945c5d965c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 16 Jan 2019 22:26:38 +0100 Subject: GEBP: fix swapped kernel mode with AVX512 and complex scalars --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index fdc050e05..c5d77763a 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -600,8 +600,8 @@ public: dest = ploadu(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType&) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional::type()); } @@ -820,8 +820,10 @@ public: dest = ploadu((const typename unpacket_traits::type*)(a)); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, DoublePacket& c, TmpType& /*tmp*/, const LaneIdType&) const + template + EIGEN_STRONG_INLINE + typename enable_if::value>::type + madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket& c, TmpType& /*tmp*/, const LaneIdType&) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); @@ -953,8 +955,8 @@ public: dest = ploaddup(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacket& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType&) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional::type()); } -- cgit v1.2.3 From 71429883ee41689fd657cdca824459f38ae53423 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 25 Jan 2019 17:00:21 -0800 Subject: Fix compilation error in NEON GEBP specializaition of madd. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index c5d77763a..cc6f3f029 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1031,7 +1031,7 @@ struct gebp_traits EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const { - c += a * b; + c = vfmaq_n_f32(c, a, b); } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const -- cgit v1.2.3 From be5b0f664ab1481e74d72e01d4f9172cf927b221 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Jan 2019 11:48:25 +0100 Subject: ARM64 & GEBP: Make use of vfmaq_laneq_f32 and workaround GCC's issue in generating good ASM --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 45 ++++++++++++----------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index cc6f3f029..dea8c94eb 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1008,17 +1008,17 @@ struct gebp_traits EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = *b; + dest = *b; } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - dest = vld1q_f32(b); + dest = vld1q_f32(b); } EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = *b; + dest = *b; } EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const @@ -1034,24 +1034,19 @@ struct gebp_traits c = vfmaq_n_f32(c, a, b); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt&) const { - c = vfmaq_lane_f32(c, a, vget_low_f32(b), 0); - } - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const - { - c = vfmaq_lane_f32(c, a, vget_low_f32(b), 1); - } - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const - { - c = vfmaq_lane_f32(c, a, vget_high_f32(b), 0); - } - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const - { - c = vfmaq_lane_f32(c, a, vget_high_f32(b), 1); + #if EIGEN_COMP_GNUC_STRICT + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f32 is implemented through a costly dup + if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : ); + #else + c = vfmaq_laneq_f32(c, a, b, LaneID); + #endif } }; @@ -1260,7 +1255,14 @@ void gebp_kernel); \ traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ -- cgit v1.2.3 From 3775926bbae69c23584dd9e6acdbe20ee6ac7050 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Jan 2019 11:49:06 +0100 Subject: ARM64 & GEBP: add specialization for double +30% speed up --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 59 +++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index dea8c94eb..c8c3d69cc 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1050,6 +1050,65 @@ struct gebp_traits } }; + +template<> +struct gebp_traits + : gebp_traits +{ + typedef double RhsPacket; + + struct RhsPacketx4 { + float64x2_t B_0, B_1; + }; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + dest.B_0 = vld1q_f64(b); + dest.B_1 = vld1q_f64(b+2); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const + {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { + c = vfmaq_n_f64(c, a, b); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt&) const + { + #if EIGEN_COMP_GNUC_STRICT + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f64 is implemented through a costly dup + if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); + else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); + else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); + else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); + #else + if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0); + else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1); + else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0); + else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1); + #endif + } +}; + #endif /* optimized General packed Block * packed Panel product kernel -- cgit v1.2.3 From df12fae8b8c0e10f671ffdded241be7a71684ffb Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Jan 2019 11:52:28 +0100 Subject: According to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101, the previous GCC issue is fixed in GCC trunk (will be gcc 9). --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index c8c3d69cc..86767d0e9 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1037,7 +1037,7 @@ struct gebp_traits template EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt&) const { - #if EIGEN_COMP_GNUC_STRICT + #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 // vfmaq_laneq_f32 is implemented through a costly dup if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : ); @@ -1093,7 +1093,7 @@ struct gebp_traits template EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt&) const { - #if EIGEN_COMP_GNUC_STRICT + #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 // vfmaq_laneq_f64 is implemented through a costly dup if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); @@ -1314,7 +1314,7 @@ void gebp_kernel Date: Wed, 30 Jan 2019 16:48:20 +0100 Subject: Fix compilation with ARM64. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 030c7740a..f07746fdb 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1053,9 +1053,9 @@ protected: #if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON -template -struct gebp_traits - : gebp_traits +template<> +struct gebp_traits + : gebp_traits { typedef float RhsPacket; -- cgit v1.2.3 From 7ef879f6bfa465a80109216e6d0b18266ef97321 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Jan 2019 23:45:12 +0100 Subject: GEBP: improves pipelining in the 1pX4 path with FMA. Prior to this change, a product with a LHS having 8 rows was faster with AVX-only than with AVX+FMA. With AVX+FMA I measured a speed up of about x1.25 in such cases. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 45 ++++++++++++++++------- 1 file changed, 31 insertions(+), 14 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index f07746fdb..e55c2ade8 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1313,15 +1313,18 @@ struct lhs_process_one_packet EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) { - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); - EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); - traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0); - traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel); - traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>); - traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>); - traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>); - traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>); - EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0); + traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel); + traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>); + traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>); + traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>); + traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>); + #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) + __asm__ ("" : "+x,m" (*A0)); + #endif + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); } EIGEN_STRONG_INLINE void operator()( @@ -1350,6 +1353,16 @@ struct lhs_process_one_packet traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3); + // To improve instruction pipelining, let's double the accumulation registers: + // even k will accumulate in C*, while odd k will accumulate in D*. + // This trick is crutial to get good performance with FMA, otherwise it is + // actually faster to perform separated MUL+ADD because of a naturally + // better instruction-level parallelism. + AccPacket D0, D1, D2, D3; + traits.initAcc(D0); + traits.initAcc(D1); + traits.initAcc(D2); + traits.initAcc(D3); LinearMapper r0 = res.getLinearMapper(i, j2 + 0); LinearMapper r1 = res.getLinearMapper(i, j2 + 1); @@ -1364,7 +1377,7 @@ struct lhs_process_one_packet // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; prefetch(&blB[0]); - LhsPacket A0; + LhsPacket A0, A1; for(Index k=0; k Date: Thu, 31 Jan 2019 14:24:08 -0800 Subject: Speed up Eigen matrix*vector and vector*matrix multiplication. This change speeds up Eigen matrix * vector and vector * matrix multiplication for dynamic matrices when it is known at runtime that one of the factors is a vector. The benchmarks below test c.noalias()= n_by_n_matrix * n_by_1_matrix; c.noalias()= 1_by_n_matrix * n_by_n_matrix; respectively. Benchmark measurements: SSE: Run on *** (72 X 2992 MHz CPUs); 2019-01-28T17:51:44.452697457-08:00 CPU: Intel Skylake Xeon with HyperThreading (36 cores) dL1:32KB dL2:1024KB dL3:24MB Benchmark Base (ns) New (ns) Improvement ------------------------------------------------------------------ BM_MatVec/64 1096 312 +71.5% BM_MatVec/128 4581 1464 +68.0% BM_MatVec/256 18534 5710 +69.2% BM_MatVec/512 118083 24162 +79.5% BM_MatVec/1k 704106 173346 +75.4% BM_MatVec/2k 3080828 742728 +75.9% BM_MatVec/4k 25421512 4530117 +82.2% BM_VecMat/32 352 130 +63.1% BM_VecMat/64 1213 425 +65.0% BM_VecMat/128 4640 1564 +66.3% BM_VecMat/256 17902 5884 +67.1% BM_VecMat/512 70466 24000 +65.9% BM_VecMat/1k 340150 161263 +52.6% BM_VecMat/2k 1420590 645576 +54.6% BM_VecMat/4k 8083859 4364327 +46.0% AVX2: Run on *** (72 X 2993 MHz CPUs); 2019-01-28T17:45:11.508545307-08:00 CPU: Intel Skylake Xeon with HyperThreading (36 cores) dL1:32KB dL2:1024KB dL3:24MB Benchmark Base (ns) New (ns) Improvement ------------------------------------------------------------------ BM_MatVec/64 619 120 +80.6% BM_MatVec/128 9693 752 +92.2% BM_MatVec/256 38356 2773 +92.8% BM_MatVec/512 69006 12803 +81.4% BM_MatVec/1k 443810 160378 +63.9% BM_MatVec/2k 2633553 646594 +75.4% BM_MatVec/4k 16211095 4327148 +73.3% BM_VecMat/64 925 227 +75.5% BM_VecMat/128 3438 830 +75.9% BM_VecMat/256 13427 2936 +78.1% BM_VecMat/512 53944 12473 +76.9% BM_VecMat/1k 302264 157076 +48.0% BM_VecMat/2k 1396811 675778 +51.6% BM_VecMat/4k 8962246 4459010 +50.2% AVX512: Run on *** (72 X 2993 MHz CPUs); 2019-01-28T17:35:17.239329863-08:00 CPU: Intel Skylake Xeon with HyperThreading (36 cores) dL1:32KB dL2:1024KB dL3:24MB Benchmark Base (ns) New (ns) Improvement ------------------------------------------------------------------ BM_MatVec/64 401 111 +72.3% BM_MatVec/128 1846 513 +72.2% BM_MatVec/256 36739 1927 +94.8% BM_MatVec/512 54490 9227 +83.1% BM_MatVec/1k 487374 161457 +66.9% BM_MatVec/2k 2016270 643824 +68.1% BM_MatVec/4k 13204300 4077412 +69.1% BM_VecMat/32 324 106 +67.3% BM_VecMat/64 1034 246 +76.2% BM_VecMat/128 3576 802 +77.6% BM_VecMat/256 13411 2561 +80.9% BM_VecMat/512 58686 10037 +82.9% BM_VecMat/1k 320862 163750 +49.0% BM_VecMat/2k 1406719 651397 +53.7% BM_VecMat/4k 7785179 4124677 +47.0% Currently watchingStop watching --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 158 +++++++++++++++++++++----- 1 file changed, 129 insertions(+), 29 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index f49abcad5..4bcccd326 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -404,13 +404,13 @@ class gemm_blocking_space -struct generic_product_impl - : generic_product_impl_base > -{ +template 1 || Dest::RowsAtCompileTime > 1), + bool MultipleColsAtCompileTime = + (Rhs::ColsAtCompileTime > 1 || Dest::ColsAtCompileTime > 1)> +struct gemm_selector { typedef typename Product::Scalar Scalar; - typedef typename Lhs::Scalar LhsScalar; - typedef typename Rhs::Scalar RhsScalar; typedef internal::blas_traits LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; @@ -420,10 +420,130 @@ struct generic_product_impl typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; typedef typename internal::remove_all::type ActualRhsTypeCleaned; + static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) + { + if (a_rhs.cols() != 1 && a_lhs.rows() != 1) { + gemm_selector::run(dst, a_lhs, a_rhs, alpha); + } else if (a_rhs.cols() == 1) { + // matrix * vector. + internal::gemv_dense_selector::HasUsableDirectAccess) + >::run(a_lhs, a_rhs.col(0), dst, alpha); + } else { + // vector * matrix. + internal::gemv_dense_selector::HasUsableDirectAccess) + >::run(a_lhs.row(0), a_rhs, dst, alpha); + } + } +}; + +template +struct gemm_selector { + typedef typename Product::Scalar Scalar; + + typedef internal::blas_traits LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; + typedef typename internal::remove_all::type ActualLhsTypeCleaned; + + static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) + { + if (a_rhs.cols() != 1 && a_lhs.rows() != 1) { + gemm_selector::run(dst, a_lhs, a_rhs, alpha); + } else { + // matrix * vector. + internal::gemv_dense_selector::HasUsableDirectAccess) + >::run(a_lhs, a_rhs.col(0), dst, alpha); + } + } +}; + +template +struct gemm_selector { + typedef typename Product::Scalar Scalar; + + typedef internal::blas_traits RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; + typedef typename internal::remove_all::type ActualRhsTypeCleaned; + + static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) + { + if (a_rhs.cols() != 1 && a_lhs.rows() != 1) { + gemm_selector::run(dst, a_lhs, a_rhs, alpha); + } else { + // vector * matrix. + internal::gemv_dense_selector::HasUsableDirectAccess) + >::run(a_lhs.row(0), a_rhs, dst, alpha); + } + } +}; + +template +struct gemm_selector { + typedef typename Product::Scalar Scalar; + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + + typedef internal::blas_traits LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; + typedef + typename internal::remove_all::type ActualLhsTypeCleaned; + + typedef internal::blas_traits RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; + typedef + typename internal::remove_all::type ActualRhsTypeCleaned; + enum { - MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime) + MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED( + Lhs::MaxColsAtCompileTime, Rhs::MaxRowsAtCompileTime) }; + static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, + const Scalar& alpha) { + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) * + RhsBlasTraits::extractScalarFactor(a_rhs); + typename internal::add_const_on_value_type::type lhs = + LhsBlasTraits::extract(a_lhs); + typename internal::add_const_on_value_type::type rhs = + RhsBlasTraits::extract(a_rhs); + typedef internal::gemm_blocking_space< + (Dest::Flags & RowMajorBit) ? RowMajor : ColMajor, LhsScalar, RhsScalar, + Dest::MaxRowsAtCompileTime, Dest::MaxColsAtCompileTime, + MaxDepthAtCompileTime> + BlockingType; + + typedef internal::gemm_functor< + Scalar, Index, + internal::general_matrix_matrix_product< + Index, LhsScalar, + (ActualLhsTypeCleaned::Flags & RowMajorBit) ? RowMajor : ColMajor, + bool(LhsBlasTraits::NeedToConjugate), RhsScalar, + (ActualRhsTypeCleaned::Flags & RowMajorBit) ? RowMajor : ColMajor, + bool(RhsBlasTraits::NeedToConjugate), + (Dest::Flags & RowMajorBit) ? RowMajor : ColMajor>, + ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> + GemmFunctor; + + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); + internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime > 32 || + Dest::MaxRowsAtCompileTime == Dynamic)>( + GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), + a_rhs.cols(), a_lhs.cols(), Dest::Flags & RowMajorBit); + } +}; + +template +struct generic_product_impl + : generic_product_impl_base > +{ + typedef typename Product::Scalar Scalar; typedef generic_product_impl lazyproduct; template @@ -450,7 +570,7 @@ struct generic_product_impl if((rhs.rows()+dst.rows()+dst.cols())0) lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op()); else - scaleAndAddTo(dst,lhs, rhs, Scalar(1)); + scaleAndAddTo(dst, lhs, rhs, Scalar(1)); } template @@ -469,27 +589,7 @@ struct generic_product_impl if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0) return; - typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); - typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); - - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) - * RhsBlasTraits::extractScalarFactor(a_rhs); - - typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar, - Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType; - - typedef internal::gemm_functor< - Scalar, Index, - internal::general_matrix_matrix_product< - Index, - LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), - RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), - (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, - ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor; - - BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); - internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)> - (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit); + gemm_selector::run(dst, a_lhs, a_rhs, alpha); } }; -- cgit v1.2.3 From b55b5c7280a0481f01fe5ec764d55c443a8b6496 Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Fri, 1 Feb 2019 15:23:53 -0800 Subject: Speed up row-major matrix-vector product on ARM The row-major matrix-vector multiplication code uses a threshold to check if processing 8 rows at a time would thrash the cache. This change introduces two modifications to this logic. 1. A smaller threshold for ARM and ARM64 devices. The value of this threshold was determined empirically using a Pixel2 phone, by benchmarking a large number of matrix-vector products in the range [1..4096]x[1..4096] and measuring performance separately on small and little cores with frequency pinning. On big (out-of-order) cores, this change has little to no impact. But on the small (in-order) cores, the matrix-vector products are up to 700% faster. Especially on large matrices. The motivation for this change was some internal code at Google which was using hand-written NEON for implementing similar functionality, processing the matrix one row at a time, which exhibited substantially better performance than Eigen. With the current change, Eigen handily beats that code. 2. Make the logic for choosing number of simultaneous rows apply unifiormly to 8, 4 and 2 rows instead of just 8 rows. Since the default threshold for non-ARM devices is essentially unchanged (32000 -> 32 * 1024), this change has no impact on non-ARM performance. This was verified by running the same set of benchmarks on a Xeon desktop. --- Eigen/src/Core/products/GeneralMatrixVector.h | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 767feb99d..e7dc25478 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -255,11 +255,20 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product cj; conj_helper pcj; - // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large, - // processing 8 rows at once might be counter productive wrt cache. - const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7; - const Index n4 = rows-3; - const Index n2 = rows-1; + // TODO: fine tune the following heuristic. The rationale is that if the + // matrix is very large, processing multiple rows at once might be counter + // productive wrt cache. +#if EIGEN_ARCH_ARM_OR_ARM64 + // This threshold was empirically determined using a Pixel2. + // The little cores are a lot more sensitive to this number + // than the big cores. + const Index cache_threshold = 1024; +#else + const Index cache_threshold = 1024 * 256; +#endif + + const Index row_bytes = lhs.stride() * sizeof(LhsScalar); + const Index n8 = (8 * row_bytes > cache_threshold) ? 0 : (rows - 7); // TODO: for padded aligned inputs, we could enable aligned reads enum { LhsAlignment = Unaligned }; @@ -320,6 +329,9 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product cache_threshold) ? 0 : (rows - 3); for(; i(ResScalar(0)), @@ -355,6 +367,9 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product cache_threshold) ? 0 : (rows - 1); for(; i(ResScalar(0)), -- cgit v1.2.3 From 690b2c45b1101e9661305e6a728ffe2279974fc6 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 4 Feb 2019 10:29:15 -0800 Subject: Fix GeneralBlockPanelKernel Android compilation --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 33 ++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index e55c2ade8..a70a06e57 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1089,8 +1089,21 @@ struct gebp_traits c = vfmaq_n_f32(c, a, b); } + // NOTE: Template parameter inference failed when compiled with Android NDK: + // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>". + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { madd_helper<0>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const + { madd_helper<1>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const + { madd_helper<2>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const + { madd_helper<3>(a, b, c); } + + private: template - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt&) const + EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const { #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 @@ -1145,11 +1158,25 @@ struct gebp_traits c = vfmaq_n_f64(c, a, b); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt&) const + // NOTE: Template parameter inference failed when compiled with Android NDK: + // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>". + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { madd_helper<0>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const + { madd_helper<1>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const + { madd_helper<2>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const + { madd_helper<3>(a, b, c); } + + private: + template + EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const { #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 // vfmaq_laneq_f64 is implemented through a costly dup if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); -- cgit v1.2.3 From 6d0f6265a95757208d089b3c9705281b68fd66b4 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 4 Feb 2019 10:30:25 -0800 Subject: Remove duplicated comment line --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 1 - 1 file changed, 1 deletion(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index a70a06e57..fdd0ec0e9 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1176,7 +1176,6 @@ struct gebp_traits { #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 - // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 // vfmaq_laneq_f64 is implemented through a costly dup if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); -- cgit v1.2.3 From fa2fcb4895a4ae12cb28003e646c736d013e68e8 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 7 Feb 2019 16:07:08 +0100 Subject: Backed out changeset 4c0fa6ce0f81ce67dd6723528ddf72f66ae92ba2 --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 158 +++++--------------------- 1 file changed, 29 insertions(+), 129 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 4bcccd326..f49abcad5 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -404,146 +404,26 @@ class gemm_blocking_space 1 || Dest::RowsAtCompileTime > 1), - bool MultipleColsAtCompileTime = - (Rhs::ColsAtCompileTime > 1 || Dest::ColsAtCompileTime > 1)> -struct gemm_selector { - typedef typename Product::Scalar Scalar; - - typedef internal::blas_traits LhsBlasTraits; - typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; - typedef typename internal::remove_all::type ActualLhsTypeCleaned; - - typedef internal::blas_traits RhsBlasTraits; - typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; - typedef typename internal::remove_all::type ActualRhsTypeCleaned; - - static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) - { - if (a_rhs.cols() != 1 && a_lhs.rows() != 1) { - gemm_selector::run(dst, a_lhs, a_rhs, alpha); - } else if (a_rhs.cols() == 1) { - // matrix * vector. - internal::gemv_dense_selector::HasUsableDirectAccess) - >::run(a_lhs, a_rhs.col(0), dst, alpha); - } else { - // vector * matrix. - internal::gemv_dense_selector::HasUsableDirectAccess) - >::run(a_lhs.row(0), a_rhs, dst, alpha); - } - } -}; - -template -struct gemm_selector { - typedef typename Product::Scalar Scalar; - - typedef internal::blas_traits LhsBlasTraits; - typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; - typedef typename internal::remove_all::type ActualLhsTypeCleaned; - - static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) - { - if (a_rhs.cols() != 1 && a_lhs.rows() != 1) { - gemm_selector::run(dst, a_lhs, a_rhs, alpha); - } else { - // matrix * vector. - internal::gemv_dense_selector::HasUsableDirectAccess) - >::run(a_lhs, a_rhs.col(0), dst, alpha); - } - } -}; - -template -struct gemm_selector { +template +struct generic_product_impl + : generic_product_impl_base > +{ typedef typename Product::Scalar Scalar; - - typedef internal::blas_traits RhsBlasTraits; - typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; - typedef typename internal::remove_all::type ActualRhsTypeCleaned; - - static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) - { - if (a_rhs.cols() != 1 && a_lhs.rows() != 1) { - gemm_selector::run(dst, a_lhs, a_rhs, alpha); - } else { - // vector * matrix. - internal::gemv_dense_selector::HasUsableDirectAccess) - >::run(a_lhs.row(0), a_rhs, dst, alpha); - } - } -}; - -template -struct gemm_selector { - typedef typename Product::Scalar Scalar; typedef typename Lhs::Scalar LhsScalar; typedef typename Rhs::Scalar RhsScalar; typedef internal::blas_traits LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; - typedef - typename internal::remove_all::type ActualLhsTypeCleaned; + typedef typename internal::remove_all::type ActualLhsTypeCleaned; typedef internal::blas_traits RhsBlasTraits; typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; - typedef - typename internal::remove_all::type ActualRhsTypeCleaned; + typedef typename internal::remove_all::type ActualRhsTypeCleaned; enum { - MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED( - Lhs::MaxColsAtCompileTime, Rhs::MaxRowsAtCompileTime) + MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime) }; - static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, - const Scalar& alpha) { - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) * - RhsBlasTraits::extractScalarFactor(a_rhs); - typename internal::add_const_on_value_type::type lhs = - LhsBlasTraits::extract(a_lhs); - typename internal::add_const_on_value_type::type rhs = - RhsBlasTraits::extract(a_rhs); - typedef internal::gemm_blocking_space< - (Dest::Flags & RowMajorBit) ? RowMajor : ColMajor, LhsScalar, RhsScalar, - Dest::MaxRowsAtCompileTime, Dest::MaxColsAtCompileTime, - MaxDepthAtCompileTime> - BlockingType; - - typedef internal::gemm_functor< - Scalar, Index, - internal::general_matrix_matrix_product< - Index, LhsScalar, - (ActualLhsTypeCleaned::Flags & RowMajorBit) ? RowMajor : ColMajor, - bool(LhsBlasTraits::NeedToConjugate), RhsScalar, - (ActualRhsTypeCleaned::Flags & RowMajorBit) ? RowMajor : ColMajor, - bool(RhsBlasTraits::NeedToConjugate), - (Dest::Flags & RowMajorBit) ? RowMajor : ColMajor>, - ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> - GemmFunctor; - - BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); - internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime > 32 || - Dest::MaxRowsAtCompileTime == Dynamic)>( - GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), - a_rhs.cols(), a_lhs.cols(), Dest::Flags & RowMajorBit); - } -}; - -template -struct generic_product_impl - : generic_product_impl_base > -{ - typedef typename Product::Scalar Scalar; typedef generic_product_impl lazyproduct; template @@ -570,7 +450,7 @@ struct generic_product_impl if((rhs.rows()+dst.rows()+dst.cols())0) lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op()); else - scaleAndAddTo(dst, lhs, rhs, Scalar(1)); + scaleAndAddTo(dst,lhs, rhs, Scalar(1)); } template @@ -589,7 +469,27 @@ struct generic_product_impl if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0) return; - gemm_selector::run(dst, a_lhs, a_rhs, alpha); + typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); + typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) + * RhsBlasTraits::extractScalarFactor(a_rhs); + + typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar, + Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType; + + typedef internal::gemm_functor< + Scalar, Index, + internal::general_matrix_matrix_product< + Index, + LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), + RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), + (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, + ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor; + + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); + internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)> + (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit); } }; -- cgit v1.2.3 From 013cc3a6b39c5962a3261a063d2a4ab4810cb757 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 7 Feb 2019 16:24:09 +0100 Subject: Make GEMM fallback to GEMV for runtime vectors. This is a more general and simpler version of changeset 4c0fa6ce0f81ce67dd6723528ddf72f66ae92ba2 --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index f49abcad5..90c9c4647 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -469,6 +469,20 @@ struct generic_product_impl if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0) return; + // Fallback to GEMV if either the lhs or rhs is a runtime vector + if (dst.cols() == 1) + { + typename Dest::ColXpr dst_vec(dst.col(0)); + return internal::generic_product_impl + ::scaleAndAddTo(dst_vec, a_lhs, a_rhs.col(0), alpha); + } + else if (dst.rows() == 1) + { + typename Dest::RowXpr dst_vec(dst.row(0)); + return internal::generic_product_impl + ::scaleAndAddTo(dst_vec, a_lhs.row(0), a_rhs, alpha); + } + typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); -- cgit v1.2.3 From 65e23ca7e93b6836cb197adcb1e832ae94203b35 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 14 Feb 2019 13:46:13 -0800 Subject: Revert https://bitbucket.org/eigen/eigen/commits/b55b5c7280a0481f01fe5ec764d55c443a8b6496 . --- Eigen/src/Core/products/GeneralMatrixVector.h | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index e7dc25478..767feb99d 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -255,20 +255,11 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product cj; conj_helper pcj; - // TODO: fine tune the following heuristic. The rationale is that if the - // matrix is very large, processing multiple rows at once might be counter - // productive wrt cache. -#if EIGEN_ARCH_ARM_OR_ARM64 - // This threshold was empirically determined using a Pixel2. - // The little cores are a lot more sensitive to this number - // than the big cores. - const Index cache_threshold = 1024; -#else - const Index cache_threshold = 1024 * 256; -#endif - - const Index row_bytes = lhs.stride() * sizeof(LhsScalar); - const Index n8 = (8 * row_bytes > cache_threshold) ? 0 : (rows - 7); + // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large, + // processing 8 rows at once might be counter productive wrt cache. + const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7; + const Index n4 = rows-3; + const Index n2 = rows-1; // TODO: for padded aligned inputs, we could enable aligned reads enum { LhsAlignment = Unaligned }; @@ -329,9 +320,6 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product cache_threshold) ? 0 : (rows - 3); for(; i(ResScalar(0)), @@ -367,9 +355,6 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product cache_threshold) ? 0 : (rows - 1); for(; i(ResScalar(0)), -- cgit v1.2.3 From bfbf7da0478afe75e19a953f0925bbd492bcd427 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 5 Mar 2019 23:46:24 +0100 Subject: bug #1689 fix used-but-marked-unused warning --- Eigen/src/Core/products/Parallelizer.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 92e9b0d9f..e01e798f1 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -21,7 +21,8 @@ namespace internal { /** \internal */ inline void manage_multi_threading(Action action, int* v) { - static EIGEN_UNUSED int m_maxThreads = -1; + static int m_maxThreads = -1; + EIGEN_UNUSED_VARIABLE(m_maxThreads); if(action==SetAction) { -- cgit v1.2.3