From f85038b7f3e9a0bd7d2bfbed96cc966863aeea57 Mon Sep 17 00:00:00 2001
From: Antonio Sanchez <cantonios@google.com>
Date: Wed, 3 Feb 2021 08:18:28 -0800
Subject: Fix excessive GEBP register spilling for 32-bit NEON.

Clang does a poor job of optimizing the GEBP microkernel on 32-bit ARM,
leading to excessive 16-byte register spills, slowing down basic f32
matrix multiplication by approx 50%.

By specializing `gebp_traits`, we can eliminate the register spills.
Volatile inline ASM both acts as a barrier to prevent reordering and
enforces strict register use. In a simple f32 matrix multiply example,
this modification reduces 16-byte spills from 109 instances to zero,
leading to a 1.5x speed increase (search for `16-byte Spill` in the
assembly in https://godbolt.org/z/chsPbE).

This is a replacement of !379.  See there for further discussion.

Also moved `gebp_traits` specializations for NEON to
`Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h` to be alongside
other NEON-specific code.

Fixes #2138.
---
 Eigen/src/Core/products/GeneralBlockPanelKernel.h | 142 ----------------------
 1 file changed, 142 deletions(-)

(limited to 'Eigen/src/Core/products')
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 216d1ad52..79367f197 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -1076,148 +1076,6 @@ protected:
 
 };
 
-
-#if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON
-
-template<>
-struct gebp_traits <float, float, false, false,Architecture::NEON,GEBPPacketFull>
- : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
-{
-  typedef float RhsPacket;
-
-  typedef float32x4_t RhsPacketx4;
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    dest = *b;
-  }
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
-  {
-    dest = vld1q_f32(b);
-  }
-
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    dest = *b;
-  }
-
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
-  {}
-
-  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
-  {
-    loadRhs(b,dest);
-  }
-
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-  {
-    c = vfmaq_n_f32(c, a, b);
-  }
-
-  // NOTE: Template parameter inference failed when compiled with Android NDK:
-  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
-
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-  { madd_helper<0>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
-  { madd_helper<1>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
-  { madd_helper<2>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
-  { madd_helper<3>(a, b, c); }
-
- private:
-  template<int LaneID>
-  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
-  {
-    #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
-    // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
-    // vfmaq_laneq_f32 is implemented through a costly dup
-         if(LaneID==0)  asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    else if(LaneID==1)  asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    else if(LaneID==2)  asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    else if(LaneID==3)  asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    #else
-    c = vfmaq_laneq_f32(c, a, b, LaneID);
-    #endif
-  }
-};
-
-
-template<>
-struct gebp_traits <double, double, false, false,Architecture::NEON>
- : gebp_traits<double,double,false,false,Architecture::Generic>
-{
-  typedef double RhsPacket;
-
-  struct RhsPacketx4 {
-    float64x2_t B_0, B_1;
-  };
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    dest = *b;
-  }
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
-  {
-    dest.B_0 = vld1q_f64(b);
-    dest.B_1 = vld1q_f64(b+2);
-  }
-
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    loadRhs(b,dest);
-  }
-
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
-  {}
-
-  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
-  {
-    loadRhs(b,dest);
-  }
-
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-  {
-    c = vfmaq_n_f64(c, a, b);
-  }
-
-  // NOTE: Template parameter inference failed when compiled with Android NDK:
-  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
-
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-  { madd_helper<0>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
-  { madd_helper<1>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
-  { madd_helper<2>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
-  { madd_helper<3>(a, b, c); }
-
- private:
-  template <int LaneID>
-  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
-  {
-    #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
-    // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
-    // vfmaq_laneq_f64 is implemented through a costly dup
-         if(LaneID==0)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
-    else if(LaneID==1)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
-    else if(LaneID==2)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
-    else if(LaneID==3)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
-    #else
-         if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
-    else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
-    else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
-    else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
-    #endif
-  }
-};
-
-#endif
-
 /* optimized General packed Block * packed Panel product kernel
  *
  * Mixing type logic: C += A * B
-- 
cgit v1.2.3