author    Antonio Sanchez <cantonios@google.com>  2021-02-03 08:18:28 -0800
committer Antonio Sanchez <cantonios@google.com>  2021-02-03 09:01:48 -0800
commit    f85038b7f3e9a0bd7d2bfbed96cc966863aeea57 (patch)
tree      a890999030a9b7b22f0091ba5185b1a58d06d550 /Eigen/src/Core/arch/NEON
parent    56c8b14d875ae42a52d0da52916fac1e29305ca7 (diff)
Fix excessive GEBP register spilling for 32-bit NEON.
Clang does a poor job of optimizing the GEBP microkernel on 32-bit ARM, leading to excessive 16-byte register spills that slow down basic f32 matrix multiplication by approximately 50%. By specializing `gebp_traits`, we can eliminate the register spills. Volatile inline ASM both acts as a barrier to prevent reordering and enforces strict register use.

In a simple f32 matrix multiply example, this modification reduces 16-byte spills from 109 instances to zero, leading to a 1.5x speed increase (search for `16-byte Spill` in the assembly at https://godbolt.org/z/chsPbE).

This is a replacement of !379; see there for further discussion. The `gebp_traits` specializations for NEON were also moved to `Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h` to sit alongside the other NEON-specific code.

Fixes #2138.
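For reference, a minimal sketch of the kind of f32 matrix multiply used to observe the spills (a hypothetical benchmark, not part of this commit; any sufficiently large dynamic-size product exercises the GEBP microkernel):

#include <Eigen/Dense>

// Multiplying two dynamic-size float matrices dispatches to gebp_kernel,
// whose inner loop is shaped by the gebp_traits specializations in this patch.
Eigen::MatrixXf multiply(const Eigen::MatrixXf& a, const Eigen::MatrixXf& b) {
  return a * b;
}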
Diffstat (limited to 'Eigen/src/Core/arch/NEON')
-rw-r--r--  Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h  |  183
1 file changed, 183 insertions(+), 0 deletions(-)
diff --git a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
new file mode 100644
index 000000000..3481f337e
--- /dev/null
+++ b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
@@ -0,0 +1,183 @@
+namespace Eigen {
+namespace internal {
+
+#if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
+
+// Clang seems to excessively spill registers in the GEBP kernel on 32-bit ARM.
+// Here we specialize gebp_traits to eliminate these register spills.
+// See #2138.
+template<>
+struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
+ : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
+{
+ EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+ {
+ // This volatile inline ASM both acts as a barrier to prevent reordering
+ // and enforces strict register use.
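+ // Operand constraints (standard GCC/Clang inline-asm semantics): "+w" marks
+ // r as a read-write NEON register, "w" marks c and alpha as register inputs,
+ // and the %q modifier prints the 128-bit q-register name.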
+ asm volatile(
+ "vmla.f32 %q[r], %q[c], %q[alpha]"
+ : [r] "+w" (r)
+ : [c] "w" (c),
+ [alpha] "w" (alpha)
+ : );
+ }
+
+ template <typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
+ Packet4f& c, Packet4f& tmp,
+ const LaneIdType&) const {
+ acc(a, b, c);
+ }
+
+ template <typename LaneIdType>
+ EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b,
+ Packet4f& c, Packet4f& tmp,
+ const LaneIdType& lane) const {
+ madd(a, b.get(lane), c, tmp, lane);
+ }
+};
+
+#endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
+
+#if EIGEN_ARCH_ARM64
+
+template<>
+struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
+ : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
+{
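+ // The rhs is consumed either one coefficient at a time (RhsPacket) or as
+ // four coefficients loaded into a single vector (RhsPacketx4).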
+ typedef float RhsPacket;
+ typedef float32x4_t RhsPacketx4;
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = *b;
+ }
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+ {
+ dest = vld1q_f32(b);
+ }
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = *b;
+ }
+
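+ // No-op: the x4 packet is loaded in full by loadRhs and reused lane by lane.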
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+ {}
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+ {
+ loadRhs(b,dest);
+ }
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+ {
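+ // Fused multiply-accumulate c += a * b, broadcasting the scalar b to all
+ // four lanes.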
+ c = vfmaq_n_f32(c, a, b);
+ }
+
+ // NOTE: Template parameter inference failed when compiled with Android NDK:
+ // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+ { madd_helper<0>(a, b, c); }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
+ { madd_helper<1>(a, b, c); }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
+ { madd_helper<2>(a, b, c); }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
+ { madd_helper<3>(a, b, c); }
+
+ private:
+ template<int LaneID>
+ EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
+ {
+ #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
+ // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
+ // vfmaq_laneq_f32 is implemented through a costly dup
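+ // Passing the whole vector b in a register and encoding the lane directly
+ // in the fmla mnemonic avoids that dup.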
+ if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : );
+ else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : );
+ else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : );
+ else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : );
+ #else
+ c = vfmaq_laneq_f32(c, a, b, LaneID);
+ #endif
+ }
+};
+
+
+template<>
+struct gebp_traits <double,double,false,false,Architecture::NEON>
+ : gebp_traits<double,double,false,false,Architecture::Generic>
+{
+ typedef double RhsPacket;
+
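+ // Four rhs doubles held as two 2-lane vectors; madd_helper selects the lane
+ // from B_0 or B_1 below.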
+ struct RhsPacketx4 {
+ float64x2_t B_0, B_1;
+ };
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+ {
+ dest = *b;
+ }
+
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+ {
+ dest.B_0 = vld1q_f64(b);
+ dest.B_1 = vld1q_f64(b+2);
+ }
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
+ {
+ loadRhs(b,dest);
+ }
+
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
+ {}
+
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+ {
+ loadRhs(b,dest);
+ }
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+ {
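+ // Fused multiply-accumulate c += a * b, broadcasting the scalar b to both
+ // double lanes.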
+ c = vfmaq_n_f64(c, a, b);
+ }
+
+ // NOTE: Template parameter inference failed when compiled with Android NDK:
+ // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
+
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
+ { madd_helper<0>(a, b, c); }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
+ { madd_helper<1>(a, b, c); }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
+ { madd_helper<2>(a, b, c); }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
+ { madd_helper<3>(a, b, c); }
+
+ private:
+ template <int LaneID>
+ EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
+ {
+ #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
+ // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
+ // vfmaq_laneq_f64 is implemented through a costly dup
+ if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
+ else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
+ else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
+ else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
+ #else
+ if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
+ else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
+ else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
+ else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
+ #endif
+ }
+};
+
+#endif // EIGEN_ARCH_ARM64
+
+} // namespace internal
+} // namespace Eigen