namespace Eigen { namespace internal { #if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG // Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm. // Here we specialize gebp_traits to eliminate these register spills. // See #2138. template<> struct gebp_traits : gebp_traits { EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { // This volatile inline ASM both acts as a barrier to prevent reordering, // as well as enforces strict register use. asm volatile( "vmla.f32 %q[r], %q[c], %q[alpha]" : [r] "+w" (r) : [c] "w" (c), [alpha] "w" (alpha) : ); } template EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b, Packet4f& c, Packet4f& tmp, const LaneIdType&) const { acc(a, b, c); } template EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket& b, Packet4f& c, Packet4f& tmp, const LaneIdType& lane) const { madd(a, b.get(lane), c, tmp, lane); } }; #endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG #if EIGEN_ARCH_ARM64 template<> struct gebp_traits : gebp_traits { typedef float RhsPacket; typedef float32x4_t RhsPacketx4; EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { dest = vld1q_f32(b); } EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; } EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b,dest); } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const { c = vfmaq_n_f32(c, a, b); } // NOTE: Template parameter inference failed when compiled with Android NDK: // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>". EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const { madd_helper<0>(a, b, c); } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const { madd_helper<1>(a, b, c); } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const { madd_helper<2>(a, b, c); } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const { madd_helper<3>(a, b, c); } private: template EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const { #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 // vfmaq_laneq_f32 is implemented through a costly dup if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : ); else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : ); else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : ); else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : ); #else c = vfmaq_laneq_f32(c, a, b, LaneID); #endif } }; template<> struct gebp_traits : gebp_traits { typedef double RhsPacket; struct RhsPacketx4 { float64x2_t B_0, B_1; }; EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { dest.B_0 = vld1q_f64(b); dest.B_1 = vld1q_f64(b+2); } EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b,dest); } EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b,dest); } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const { c = vfmaq_n_f64(c, a, b); } // NOTE: Template parameter inference failed when compiled with Android NDK: // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>". EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const { madd_helper<0>(a, b, c); } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const { madd_helper<1>(a, b, c); } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const { madd_helper<2>(a, b, c); } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const { madd_helper<3>(a, b, c); } private: template EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const { #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 // vfmaq_laneq_f64 is implemented through a costly dup if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); #else if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0); else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1); else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0); else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1); #endif } }; #endif // EIGEN_ARCH_ARM64 } // namespace internal } // namespace Eigen