From be5b0f664ab1481e74d72e01d4f9172cf927b221 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 30 Jan 2019 11:48:25 +0100 Subject: ARM64 & GEBP: Make use of vfmaq_laneq_f32 and workaround GCC's issue in generating good ASM --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 45 ++++++++++++----------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'Eigen/src/Core/products') diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index cc6f3f029..dea8c94eb 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1008,17 +1008,17 @@ struct gebp_traits EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = *b; + dest = *b; } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - dest = vld1q_f32(b); + dest = vld1q_f32(b); } EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = *b; + dest = *b; } EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const @@ -1034,24 +1034,19 @@ struct gebp_traits c = vfmaq_n_f32(c, a, b); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt&) const { - c = vfmaq_lane_f32(c, a, vget_low_f32(b), 0); - } - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const - { - c = vfmaq_lane_f32(c, a, vget_low_f32(b), 1); - } - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const - { - c = vfmaq_lane_f32(c, a, vget_high_f32(b), 0); - } - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const - { - c = vfmaq_lane_f32(c, a, vget_high_f32(b), 1); + #if EIGEN_COMP_GNUC_STRICT + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f32 is implemented through a costly dup + if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : ); + #else + c = vfmaq_laneq_f32(c, a, b, LaneID); + #endif } }; @@ -1260,7 +1255,14 @@ void gebp_kernel); \ traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ -- cgit v1.2.3