From f85038b7f3e9a0bd7d2bfbed96cc966863aeea57 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Wed, 3 Feb 2021 08:18:28 -0800 Subject: Fix excessive GEBP register spilling for 32-bit NEON. Clang does a poor job of optimizing the GEBP microkernel on 32-bit ARM, leading to excessive 16-byte register spills, slowing down basic f32 matrix multiplication by approx 50%. By specializing `gebp_traits`, we can eliminate the register spills. Volatile inline ASM both acts as a barrier to prevent reordering and enforces strict register use. In a simple f32 matrix multiply example, this modification reduces 16-byte spills from 109 instances to zero, leading to a 1.5x speed increase (search for `16-byte Spill` in the assembly in https://godbolt.org/z/chsPbE). This is a replacement of !379. See there for further discussion. Also moved `gebp_traits` specializations for NEON to `Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h` to be alongside other NEON-specific code. Fixes #2138. --- Eigen/Core | 4 +- Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h | 183 +++++++++++++++++++++ Eigen/src/Core/products/GeneralBlockPanelKernel.h | 142 ---------------- 3 files changed, 186 insertions(+), 143 deletions(-) create mode 100644 Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h diff --git a/Eigen/Core b/Eigen/Core index 4d9a3309c..1a60dcba4 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -340,7 +340,9 @@ using std::ptrdiff_t; #include "src/Core/ConditionEstimator.h" #if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) -#include "src/Core/arch/AltiVec/MatrixProduct.h" + #include "src/Core/arch/AltiVec/MatrixProduct.h" +#elif defined EIGEN_VECTORIZE_NEON + #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h" #endif #include "src/Core/BooleanRedux.h" diff --git a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h new file mode 100644 index 000000000..3481f337e --- /dev/null +++ b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h @@ -0,0 +1,183 @@ +namespace Eigen { +namespace internal { + +#if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG + +// Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm. +// Here we specialize gebp_traits to eliminate these register spills. +// See #2138. +template<> +struct gebp_traits + : gebp_traits +{ + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + { + // This volatile inline ASM both acts as a barrier to prevent reordering, + // as well as enforces strict register use. + asm volatile( + "vmla.f32 %q[r], %q[c], %q[alpha]" + : [r] "+w" (r) + : [c] "w" (c), + [alpha] "w" (alpha) + : ); + } + + template + EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b, + Packet4f& c, Packet4f& tmp, + const LaneIdType&) const { + acc(a, b, c); + } + + template + EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket& b, + Packet4f& c, Packet4f& tmp, + const LaneIdType& lane) const { + madd(a, b.get(lane), c, tmp, lane); + } +}; + +#endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG + +#if EIGEN_ARCH_ARM64 + +template<> +struct gebp_traits + : gebp_traits +{ + typedef float RhsPacket; + typedef float32x4_t RhsPacketx4; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + dest = vld1q_f32(b); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { + c = vfmaq_n_f32(c, a, b); + } + + // NOTE: Template parameter inference failed when compiled with Android NDK: + // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>". + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { madd_helper<0>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const + { madd_helper<1>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const + { madd_helper<2>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const + { madd_helper<3>(a, b, c); } + + private: + template + EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const + { + #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f32 is implemented through a costly dup + if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : ); + #else + c = vfmaq_laneq_f32(c, a, b, LaneID); + #endif + } +}; + + +template<> +struct gebp_traits + : gebp_traits +{ + typedef double RhsPacket; + + struct RhsPacketx4 { + float64x2_t B_0, B_1; + }; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + dest.B_0 = vld1q_f64(b); + dest.B_1 = vld1q_f64(b+2); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { + c = vfmaq_n_f64(c, a, b); + } + + // NOTE: Template parameter inference failed when compiled with Android NDK: + // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>". + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { madd_helper<0>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const + { madd_helper<1>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const + { madd_helper<2>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const + { madd_helper<3>(a, b, c); } + + private: + template + EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const + { + #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f64 is implemented through a costly dup + if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); + else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); + else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); + else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); + #else + if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0); + else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1); + else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0); + else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1); + #endif + } +}; + +#endif // EIGEN_ARCH_ARM64 + +} // namespace internal +} // namespace Eigen diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 216d1ad52..79367f197 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1076,148 +1076,6 @@ protected: }; - -#if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON - -template<> -struct gebp_traits - : gebp_traits -{ - typedef float RhsPacket; - - typedef float32x4_t RhsPacketx4; - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const - { - dest = *b; - } - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const - { - dest = vld1q_f32(b); - } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const - { - dest = *b; - } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const - {} - - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const - { - loadRhs(b,dest); - } - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const - { - c = vfmaq_n_f32(c, a, b); - } - - // NOTE: Template parameter inference failed when compiled with Android NDK: - // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>". - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const - { madd_helper<0>(a, b, c); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const - { madd_helper<1>(a, b, c); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const - { madd_helper<2>(a, b, c); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const - { madd_helper<3>(a, b, c); } - - private: - template - EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const - { - #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) - // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 - // vfmaq_laneq_f32 is implemented through a costly dup - if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : ); - else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : ); - else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : ); - else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : ); - #else - c = vfmaq_laneq_f32(c, a, b, LaneID); - #endif - } -}; - - -template<> -struct gebp_traits - : gebp_traits -{ - typedef double RhsPacket; - - struct RhsPacketx4 { - float64x2_t B_0, B_1; - }; - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const - { - dest = *b; - } - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const - { - dest.B_0 = vld1q_f64(b); - dest.B_1 = vld1q_f64(b+2); - } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const - { - loadRhs(b,dest); - } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const - {} - - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const - { - loadRhs(b,dest); - } - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const - { - c = vfmaq_n_f64(c, a, b); - } - - // NOTE: Template parameter inference failed when compiled with Android NDK: - // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>". - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const - { madd_helper<0>(a, b, c); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const - { madd_helper<1>(a, b, c); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const - { madd_helper<2>(a, b, c); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const - { madd_helper<3>(a, b, c); } - - private: - template - EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const - { - #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) - // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 - // vfmaq_laneq_f64 is implemented through a costly dup - if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); - else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); - else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); - else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); - #else - if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0); - else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1); - else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0); - else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1); - #endif - } -}; - -#endif - /* optimized General packed Block * packed Panel product kernel * * Mixing type logic: C += A * B -- cgit v1.2.3