namespace Eigen {
namespace internal {
  
#if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG

// Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
// Here we specialize gebp_traits to eliminate these register spills.
// See #2138.
template<>
struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
 : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
{
  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  { 
    // This volatile inline ASM both acts as a barrier to prevent reordering,
    // as well as enforces strict register use.
    asm volatile(
      "vmla.f32 %q[r], %q[c], %q[alpha]"
      : [r] "+w" (r)
      : [c] "w" (c),
        [alpha] "w" (alpha)
      : );
  }

  template <typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
                                Packet4f& c, Packet4f& tmp,
                                const LaneIdType&) const {
    acc(a, b, c);
  }
  
  template <typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b,
                                Packet4f& c, Packet4f& tmp,
                                const LaneIdType& lane) const {
    madd(a, b.get(lane), c, tmp, lane);
  }
};

#endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG

#if EIGEN_ARCH_ARM64

template<>
struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
 : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
{
  typedef float RhsPacket;
  typedef float32x4_t RhsPacketx4;

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = *b;
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
  {
    dest = vld1q_f32(b);
  }

  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = *b;
  }

  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
  {}

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    loadRhs(b,dest);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
  {
    c = vfmaq_n_f32(c, a, b);
  }

  // NOTE: Template parameter inference failed when compiled with Android NDK:
  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
  { madd_helper<0>(a, b, c); }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
  { madd_helper<1>(a, b, c); }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
  { madd_helper<2>(a, b, c); }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
  { madd_helper<3>(a, b, c); }

 private:
  template<int LaneID>
  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
  {
    #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
    // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
    // vfmaq_laneq_f32 is implemented through a costly dup
         if(LaneID==0)  asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) :  );
    else if(LaneID==1)  asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) :  );
    else if(LaneID==2)  asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) :  );
    else if(LaneID==3)  asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) :  );
    #else
    c = vfmaq_laneq_f32(c, a, b, LaneID);
    #endif
  }
};


template<>
struct gebp_traits <double,double,false,false,Architecture::NEON>
 : gebp_traits<double,double,false,false,Architecture::Generic>
{
  typedef double RhsPacket;

  struct RhsPacketx4 {
    float64x2_t B_0, B_1;
  };

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = *b;
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
  {
    dest.B_0 = vld1q_f64(b);
    dest.B_1 = vld1q_f64(b+2);
  }

  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    loadRhs(b,dest);
  }

  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
  {}

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    loadRhs(b,dest);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
  {
    c = vfmaq_n_f64(c, a, b);
  }

  // NOTE: Template parameter inference failed when compiled with Android NDK:
  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
  { madd_helper<0>(a, b, c); }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
  { madd_helper<1>(a, b, c); }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
  { madd_helper<2>(a, b, c); }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
  { madd_helper<3>(a, b, c); }

 private:
  template <int LaneID>
  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
  {
    #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
    // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
    // vfmaq_laneq_f64 is implemented through a costly dup
         if(LaneID==0)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
    else if(LaneID==1)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
    else if(LaneID==2)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
    else if(LaneID==3)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
    #else
         if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
    else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
    else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
    else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
    #endif
  }
};

#endif // EIGEN_ARCH_ARM64

}  // namespace internal
}  // namespace Eigen