From da5a7afed056596b089a4241b62a7e17f2c43119 Mon Sep 17 00:00:00 2001 From: Joel Holdsworth Date: Fri, 10 Jan 2020 18:31:15 +0000 Subject: Improvements to the tidiness and completeness of the NEON implementation --- Eigen/src/Core/arch/NEON/Complex.h | 308 +++++++-------- Eigen/src/Core/arch/NEON/MathFunctions.h | 41 +- Eigen/src/Core/arch/NEON/PacketMath.h | 617 +++++++++++++++---------------- Eigen/src/Core/arch/NEON/TypeCasting.h | 48 +-- 4 files changed, 485 insertions(+), 529 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 5a1ed386b..c1d0c6502 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -15,7 +15,8 @@ namespace Eigen { namespace internal { -inline uint32x4_t p4ui_CONJ_XOR() { +inline uint32x4_t p4ui_CONJ_XOR() +{ // See bug 1325, clang fails to call vld1q_u64. #if EIGEN_COMP_CLANG uint32x4_t ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; @@ -26,7 +27,8 @@ inline uint32x4_t p4ui_CONJ_XOR() { #endif } -inline uint32x2_t p2ui_CONJ_XOR() { +inline uint32x2_t p2ui_CONJ_XOR() +{ static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000 }; return vld1_u32( conj_XOR_DATA ); } @@ -36,48 +38,64 @@ struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} - Packet4f v; + Packet4f v; }; -template<> struct packet_traits > : default_packet_traits +template<> struct packet_traits > : default_packet_traits { typedef Packet2cf type; typedef Packet2cf half; - enum { + enum + { Vectorizable = 1, AlignedOnScalar = 1, size = 2, HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, HasSetLinear = 0 }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; }; - -template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +template<> struct unpacket_traits { - float32x2_t r64; - r64 = vld1_f32((const float *)&from); + typedef std::complex type; + typedef Packet2cf half; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +{ + const float32x2_t r64 = vld1_f32(reinterpret_cast(&from)); return Packet2cf(vcombine_f32(r64, r64)); } -template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(padd(a.v, b.v)); } + +template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(psub(a.v, b.v)); } + template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); } + template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { - Packet4ui b = vreinterpretq_u32_f32(a.v); + const Packet4ui b = vreinterpretq_u32_f32(a.v); return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR()))); } @@ -93,7 +111,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con v1 = vmulq_f32(v1, b.v); // Multiply the imag a with b v2 = vmulq_f32(v2, b.v); - // Conjugate v2 + // Conjugate v2 v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR())); // Swap real/imag elements in v2. v2 = vrev64q_f32(v2); @@ -113,96 +131,78 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packe return Packet2cf(pand(eq, eq_swapped)); } -template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) -{ - return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) -{ - return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) -{ - return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); -} +template<> EIGEN_STRONG_INLINE Packet2cf pand(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } + +template<> EIGEN_STRONG_INLINE Packet2cf por(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } + +template<> EIGEN_STRONG_INLINE Packet2cf pxor(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } + template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) -{ - return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); -} +{ return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } + +template<> EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload(reinterpret_cast(from))); } -template<> EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } -template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu(reinterpret_cast(from))); } -template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } +template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) +{ return pset1(*from); } -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex *to, const Packet2cf& from) +{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast(to), from.v); } -template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex *to, const Packet2cf& from) +{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>( + const std::complex* from, Index stride) { - Packet4f res = pset1(0.f); - res = vsetq_lane_f32(std::real(from[0*stride]), res, 0); + Packet4f res = vdupq_n_f32(std::real(from[0*stride])); res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1); res = vsetq_lane_f32(std::real(from[1*stride]), res, 2); res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3); return Packet2cf(res); } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>( + std::complex* to, const Packet2cf& from, Index stride) { to[stride*0] = std::complex(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1)); to[stride*1] = std::complex(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3)); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((const float *)addr); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex *addr) +{ EIGEN_ARM_PREFETCH(reinterpret_cast(addr)); } -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { EIGEN_ALIGN16 std::complex x[2]; - vst1q_f32((float *)x, a.v); + vst1q_f32(reinterpret_cast(x), a.v); return x[0]; } template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) -{ - float32x2_t a_lo, a_hi; - Packet4f a_r128; - - a_lo = vget_low_f32(a.v); - a_hi = vget_high_f32(a.v); - a_r128 = vcombine_f32(a_hi, a_lo); - - return Packet2cf(a_r128); -} +{ return Packet2cf(vcombine_f32(vget_high_f32(a.v), vget_low_f32(a.v))); } template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& a) -{ - return Packet2cf(vrev64q_f32(a.v)); -} +{ return Packet2cf(vrev64q_f32(a.v)); } template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) { - float32x2_t a1, a2; std::complex s; - - a1 = vget_low_f32(a.v); - a2 = vget_high_f32(a.v); - a2 = vadd_f32(a1, a2); - vst1_f32((float *)&s, a2); - + vst1_f32(reinterpret_cast(&s), vadd_f32(vget_low_f32(a.v), vget_high_f32(a.v))); return s; } template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) { - Packet4f sum1, sum2, sum; - - // Add the first two 64-bit float32x2_t of vecs[0] - sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v)); - sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v)); - sum = vaddq_f32(sum1, sum2); - - return Packet2cf(sum); + const Packet4f sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v)); + const Packet4f sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v)); + return Packet2cf(vaddq_f32(sum1, sum2)); } template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) @@ -220,14 +220,14 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P v1 = vmul_f32(v1, a2); // Multiply the imag a with b v2 = vmul_f32(v2, a2); - // Conjugate v2 + // Conjugate v2 v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR())); // Swap real/imag elements in v2. v2 = vrev64_f32(v2); // Add v1, v2 prod = vadd_f32(v1, v2); - vst1_f32((float *)&s, prod); + vst1_f32(reinterpret_cast(&s), prod); return s; } @@ -237,44 +237,36 @@ struct palign_impl { EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) { - if (Offset==1) - { + if (Offset == 1) first.v = vextq_f32(first.v, second.v, 2); - } } }; -template<> struct conj_helper +template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const { return padd(pmul(x,y),c); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(a, pconj(b)); - } + { return internal::pmul(a, pconj(b)); } }; -template<> struct conj_helper +template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const { return padd(pmul(x,y),c); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(pconj(a), b); - } + { return internal::pmul(pconj(a), b); } }; -template<> struct conj_helper +template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const { return padd(pmul(x,y),c); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return pconj(internal::pmul(a, b)); - } + { return pconj(internal::pmul(a,b)); } }; EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) @@ -282,18 +274,18 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for NEON - Packet2cf res = conj_helper().pmul(a,b); + Packet2cf res = conj_helper().pmul(a,b); Packet4f s, rev_s; // this computes the norm s = vmulq_f32(b.v, b.v); rev_s = vrev64q_f32(s); - return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); + return Packet2cf(pdiv(res.v, vaddq_f32(s, rev_s))); } -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) +{ Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v)); kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v)); kernel.packet[1].v = tmp; @@ -321,7 +313,8 @@ template<> struct packet_traits > : default_packet_traits { typedef Packet1cd type; typedef Packet1cd half; - enum { + enum + { Vectorizable = 1, AlignedOnScalar = 0, size = 1, @@ -340,24 +333,49 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; }; +template<> struct unpacket_traits +{ + typedef std::complex type; + enum + { + size=1, + alignment=Aligned16, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; + typedef Packet1cd half; +}; -template<> EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload(reinterpret_cast(from))); } -template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) -{ /* here we really have to use unaligned loads :( */ return ploadu(&from); } +template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu(reinterpret_cast(from))); } + +template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) +{ + /* here we really have to use unaligned loads :( */ + return ploadu(&from); +} -template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(padd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(psub(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); } +template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) +{ return Packet1cd(padd(a.v, b.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) +{ return Packet1cd(psub(a.v, b.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) +{ return Packet1cd(pnegate(a.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) +{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); } template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { Packet2d v1, v2; - // Get the real values of a + // Get the real values of a v1 = vdupq_lane_f64(vget_low_f64(a.v), 0); // Get the imag values of a v2 = vdupq_lane_f64(vget_high_f64(a.v), 0); @@ -365,7 +383,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con v1 = vmulq_f64(v1, b.v); // Multiply the imag a with b v2 = vmulq_f64(v2, b.v); - // Conjugate v2 + // Conjugate v2 v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR)); // Swap real/imag elements in v2. v2 = preverse(v2); @@ -385,31 +403,32 @@ template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packe return Packet1cd(pand(eq, eq_swapped)); } -template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) -{ - return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) -{ - return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) -{ - return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); -} +template<> EIGEN_STRONG_INLINE Packet1cd pand(const Packet1cd& a, const Packet1cd& b) +{ return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } + +template<> EIGEN_STRONG_INLINE Packet1cd por(const Packet1cd& a, const Packet1cd& b) +{ return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } + +template<> EIGEN_STRONG_INLINE Packet1cd pxor(const Packet1cd& a, const Packet1cd& b) +{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } + template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) -{ - return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); -} +{ return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } -template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) +{ return pset1(*from); } -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex *to, const Packet1cd& from) +{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast(to), from.v); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((const double *)addr); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex *to, const Packet1cd& from) +{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), from.v); } -template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride) +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex *addr) +{ EIGEN_ARM_PREFETCH(reinterpret_cast(addr)); } + +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>( + const std::complex* from, Index stride) { Packet2d res = pset1(0.0); res = vsetq_lane_f64(std::real(from[0*stride]), res, 0); @@ -417,17 +436,14 @@ template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Pack return Packet1cd(res); } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride) -{ - to[stride*0] = std::complex(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); -} - +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>( + std::complex* to, const Packet1cd& from, Index stride) +{ to[stride*0] = std::complex(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); } -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { EIGEN_ALIGN16 std::complex res; pstore >(&res, a); - return res; } @@ -455,9 +471,7 @@ template<> struct conj_helper { return padd(pmul(x,y),c); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(a, pconj(b)); - } + { return internal::pmul(a, pconj(b)); } }; template<> struct conj_helper @@ -466,9 +480,7 @@ template<> struct conj_helper { return padd(pmul(x,y),c); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(pconj(a), b); - } + { return internal::pmul(pconj(a), b); } }; template<> struct conj_helper @@ -477,9 +489,7 @@ template<> struct conj_helper { return padd(pmul(x,y),c); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return pconj(internal::pmul(a, b)); - } + { return pconj(internal::pmul(a,b)); } }; EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) @@ -495,9 +505,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, con } EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) -{ - return Packet1cd(preverse(Packet2d(x.v))); -} +{ return Packet1cd(preverse(Packet2d(x.v))); } EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h index 8bf572788..510f2ea1f 100644 --- a/Eigen/src/Core/arch/NEON/MathFunctions.h +++ b/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -12,36 +12,21 @@ namespace Eigen { namespace internal { -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pexp(const Packet4f& x) -{ - return pexp_float(x); -} - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f plog(const Packet4f& x) -{ - return plog_float(x); -} - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f psin(const Packet4f& x) -{ - return psin_float(x); -} - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pcos(const Packet4f& x) -{ - return pcos_float(x); -} +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp(const Packet4f& x) +{ return pexp_float(x); } + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f plog(const Packet4f& x) +{ return plog_float(x); } + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psin(const Packet4f& x) +{ return psin_float(x); } + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pcos(const Packet4f& x) +{ return pcos_float(x); } // Hyperbolic Tangent function. -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f -ptanh(const Packet4f& x) { - return internal::generic_fast_tanh_float(x); -} +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f ptanh(const Packet4f& x) +{ return internal::generic_fast_tanh_float(x); } } // end namespace internal diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index dcb1fca09..be1c3b98b 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -32,7 +32,7 @@ namespace internal { #if EIGEN_ARCH_ARM64 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #else -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 #endif #endif @@ -50,7 +50,8 @@ struct eigen_packet_wrapper operator const T&() const { return m_val; } eigen_packet_wrapper() {} eigen_packet_wrapper(const T &v) : m_val(v) {} - eigen_packet_wrapper& operator=(const T &v) { + eigen_packet_wrapper& operator=(const T &v) + { m_val = v; return *this; } @@ -59,16 +60,16 @@ struct eigen_packet_wrapper }; typedef eigen_packet_wrapper Packet2f; typedef eigen_packet_wrapper Packet4f; -typedef eigen_packet_wrapper Packet4i; -typedef eigen_packet_wrapper Packet2i; +typedef eigen_packet_wrapper Packet2i; +typedef eigen_packet_wrapper Packet4i; typedef eigen_packet_wrapper Packet4ui; #else typedef float32x2_t Packet2f; typedef float32x4_t Packet4f; -typedef int32x4_t Packet4i; typedef int32x2_t Packet2i; +typedef int32x4_t Packet4i; typedef uint32x4_t Packet4ui; #endif // EIGEN_COMP_MSVC @@ -99,79 +100,124 @@ typedef uint32x4_t Packet4ui; #endif template <> -struct packet_traits : default_packet_traits { +struct packet_traits : default_packet_traits +{ typedef Packet4f type; typedef Packet4f half; // Packet2f intrinsics not implemented yet - enum { + enum + { Vectorizable = 1, AlignedOnScalar = 1, size = 4, HasHalfPacket = 0, // Packet2f intrinsics not implemented yet - HasDiv = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 1, + + HasDiv = 1, HasFloor = 1, - // FIXME check the Has* - HasSin = EIGEN_FAST_MATH, - HasCos = EIGEN_FAST_MATH, - HasLog = 1, - HasExp = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, HasSqrt = 0, HasTanh = EIGEN_FAST_MATH, - HasErf = EIGEN_FAST_MATH + HasErf = EIGEN_FAST_MATH }; }; + template <> -struct packet_traits : default_packet_traits { +struct packet_traits : default_packet_traits +{ typedef Packet4i type; typedef Packet4i half; // Packet2i intrinsics not implemented yet - enum { + enum + { Vectorizable = 1, AlignedOnScalar = 1, - size=4, - HasHalfPacket=0 // Packet2i intrinsics not implemented yet - // FIXME check the Has* + size = 4, + HasHalfPacket = 0, // Packet2i intrinsics not implemented yet + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 1 }; }; -#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM +#if EIGEN_GNUC_AT_MOST(4, 4) && !EIGEN_COMP_LLVM // workaround gcc 4.2, 4.3 and 4.4 compilatin issue EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); } -EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); } -EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); } -EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); } -EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } +EIGEN_STRONG_INLINE float32x2_t vld1_f32(const float* x) { return ::vld1_f32 ((const float32_t*)x); } +EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32(const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); } +EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); } +EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } #endif template<> struct unpacket_traits { - typedef float type; - typedef Packet4f half; - typedef Packet4i integer_packet; - enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; + typedef float type; + typedef Packet4f half; + typedef Packet4i integer_packet; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; }; template<> struct unpacket_traits { - typedef int32_t type; - typedef Packet4i half; - enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; + typedef int32_t type; + typedef Packet4i half; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; }; -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { return vdupq_n_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { return vdupq_n_s32(from); } -template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) { return vreinterpretq_f32_u32(vdupq_n_u32(from)); } +template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) +{ return vreinterpretq_f32_u32(vdupq_n_u32(from)); } template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { - const float f[] = {0, 1, 2, 3}; - Packet4f countdown = vld1q_f32(f); - return vaddq_f32(pset1(a), countdown); + const float c[] = {0.0f,1.0f,2.0f,3.0f}; + return vaddq_f32(pset1(a), vld1q_f32(c)); } template<> EIGEN_STRONG_INLINE Packet4i plset(const int32_t& a) { - const int32_t i[] = {0, 1, 2, 3}; - Packet4i countdown = vld1q_s32(i); - return vaddq_s32(pset1(a), countdown); + const int32_t c[] = {0,1,2,3}; + return vaddq_s32(pset1(a), vld1q_s32(c)); } template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); } @@ -216,7 +262,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const } template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, const Packet4i& /*b*/) -{ eigen_assert(false && "packet integer division are not supported by NEON"); +{ + eigen_assert(false && "packet integer division are not supported by NEON"); return pset1(0); } @@ -231,9 +278,11 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, co // MLA is not fused i.e. does 2 roundings. // In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4): // MLA: 10 GFlop/s ; FMA: 12 GFlops/s. -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) +{ return vfmaq_f32(c,a,b); } #else -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) +{ #if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM // Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu, // at least -mcpu=cortex-a8 and -mcpu=cortex-a7. Since the former is the default on @@ -257,7 +306,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& #endif // No FMA instruction for int, so use MLA unconditionally. -template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) +{ return vmlaq_s32(c,a,b); } template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); } template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); } @@ -265,12 +315,23 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); } template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vcleq_f32(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vcltq_f32(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vceqq_f32(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vcleq_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) +{ return vreinterpretq_s32_u32(vcleq_s32(a,b)); } + +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vcltq_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) +{ return vreinterpretq_s32_u32(vcltq_s32(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return vreinterpretq_s32_u32(vceqq_s32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vceqq_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) +{ return vreinterpretq_s32_u32(vceqq_s32(a,b)); } + +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); } template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { @@ -286,278 +347,191 @@ template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) -{ - return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); -} +{ return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } + template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); } template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) -{ - return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); -} +{ return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) -{ - return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); -} +{ return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) -{ - return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); -} -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); } +{ return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) +{ return vbicq_s32(a,b); } template EIGEN_STRONG_INLINE Packet4i pshiftright(Packet4i a) { return vshrq_n_s32(a,N); } template EIGEN_STRONG_INLINE Packet4i pshiftleft(Packet4i a) { return vshlq_n_s32(a,N); } -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) -{ - float32x2_t lo, hi; - lo = vld1_dup_f32(from); - hi = vld1_dup_f32(from+1); - return vcombine_f32(lo, hi); -} +{ return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from+1)); } template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int32_t* from) -{ - int32x2_t lo, hi; - lo = vld1_dup_s32(from); - hi = vld1_dup_s32(from+1); - return vcombine_s32(lo, hi); -} +{ return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from+1)); } + +template<> EIGEN_STRONG_INLINE Packet4f ploadquad(const float* from) { return vld1q_dup_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ploadquad(const int32_t* from) { return vld1q_dup_s32(from); } -template<> EIGEN_STRONG_INLINE void pstore (float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to,from); } -template<> EIGEN_STRONG_INLINE void pstoreu (float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to,from); } template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { - Packet4f res = pset1(0.f); - res = vsetq_lane_f32(from[0*stride], res, 0); - res = vsetq_lane_f32(from[1*stride], res, 1); - res = vsetq_lane_f32(from[2*stride], res, 2); - res = vsetq_lane_f32(from[3*stride], res, 3); + Packet4f res = vld1q_dup_f32(from); + res = vld1q_lane_f32(from + 1*stride, res, 1); + res = vld1q_lane_f32(from + 2*stride, res, 2); + res = vld1q_lane_f32(from + 3*stride, res, 3); return res; } template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int32_t* from, Index stride) { - Packet4i res = pset1(0); - res = vsetq_lane_s32(from[0*stride], res, 0); - res = vsetq_lane_s32(from[1*stride], res, 1); - res = vsetq_lane_s32(from[2*stride], res, 2); - res = vsetq_lane_s32(from[3*stride], res, 3); + Packet4i res = vld1q_dup_s32(from); + res = vld1q_lane_s32(from + 1*stride, res, 1); + res = vld1q_lane_s32(from + 2*stride, res, 2); + res = vld1q_lane_s32(from + 3*stride, res, 3); return res; } template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { - to[stride*0] = vgetq_lane_f32(from, 0); - to[stride*1] = vgetq_lane_f32(from, 1); - to[stride*2] = vgetq_lane_f32(from, 2); - to[stride*3] = vgetq_lane_f32(from, 3); + vst1q_lane_f32(to + stride*0, from, 0); + vst1q_lane_f32(to + stride*1, from, 1); + vst1q_lane_f32(to + stride*2, from, 2); + vst1q_lane_f32(to + stride*3, from, 3); } template<> EIGEN_DEVICE_FUNC inline void pscatter(int32_t* to, const Packet4i& from, Index stride) { - to[stride*0] = vgetq_lane_s32(from, 0); - to[stride*1] = vgetq_lane_s32(from, 1); - to[stride*2] = vgetq_lane_s32(from, 2); - to[stride*3] = vgetq_lane_s32(from, 3); + vst1q_lane_s32(to + stride*0, from, 0); + vst1q_lane_s32(to + stride*1, from, 1); + vst1q_lane_s32(to + stride*2, from, 2); + vst1q_lane_s32(to + stride*3, from, 3); } -template<> EIGEN_STRONG_INLINE void prefetch (const float* addr) { EIGEN_ARM_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); } -// FIXME only store the 2 first elements ? -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { EIGEN_ALIGN16 float x[4]; vst1q_f32(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { EIGEN_ALIGN16 int32_t x[4]; vst1q_s32(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { return vgetq_lane_f32(a,0); } +template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { return vgetq_lane_s32(a,0); } -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { - float32x2_t a_lo, a_hi; - Packet4f a_r64; - - a_r64 = vrev64q_f32(a); - a_lo = vget_low_f32(a_r64); - a_hi = vget_high_f32(a_r64); - return vcombine_f32(a_hi, a_lo); +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) +{ + const float32x4_t a_r64 = vrev64q_f32(a); + return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) +{ + const int32x4_t a_r64 = vrev64q_s32(a); + return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64)); } -template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { - int32x2_t a_lo, a_hi; - Packet4i a_r64; - - a_r64 = vrev64q_s32(a); - a_lo = vget_low_s32(a_r64); - a_hi = vget_high_s32(a_r64); - return vcombine_s32(a_hi, a_lo); +template<> EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) +{ + const uint32x4_t a_r64 = vrev64q_u32(a); + return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64)); } template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); } template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); } -template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) { - return pfrexp_float(a,exponent); -} +template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) +{ return pfrexp_float(a,exponent); } -template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { - return pldexp_float(a,exponent); -} +template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) +{ return pldexp_float(a,exponent); } template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { - float32x2_t a_lo, a_hi, sum; - - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); - sum = vpadd_f32(a_lo, a_hi); - sum = vpadd_f32(sum, sum); - return vget_lane_f32(sum, 0); + const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a)); + return vget_lane_f32(vpadd_f32(sum, sum), 0); } - -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) +template<> EIGEN_STRONG_INLINE int32_t predux(const Packet4i& a) { - float32x4x2_t vtrn1, vtrn2, res1, res2; - Packet4f sum1, sum2, sum; - - // NEON zip performs interleaving of the supplied vectors. - // We perform two interleaves in a row to acquire the transposed vector - vtrn1 = vzipq_f32(vecs[0], vecs[2]); - vtrn2 = vzipq_f32(vecs[1], vecs[3]); - res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]); - res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]); - - // Do the addition of the resulting vectors - sum1 = vaddq_f32(res1.val[0], res1.val[1]); - sum2 = vaddq_f32(res2.val[0], res2.val[1]); - sum = vaddq_f32(sum1, sum2); - - return sum; + const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a)); + return vget_lane_s32(vpadd_s32(sum, sum), 0); } -template<> EIGEN_STRONG_INLINE int32_t predux(const Packet4i& a) +template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) { - int32x2_t a_lo, a_hi, sum; - - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); - sum = vpadd_s32(a_lo, a_hi); - sum = vpadd_s32(sum, sum); - return vget_lane_s32(sum, 0); + const float32x4x2_t vtrn1 = vzipq_f32(vecs[0], vecs[2]); + const float32x4x2_t vtrn2 = vzipq_f32(vecs[1], vecs[3]); + const float32x4x2_t res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]); + const float32x4x2_t res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]); + return vaddq_f32(vaddq_f32(res1.val[0], res1.val[1]), vaddq_f32(res2.val[0], res2.val[1])); } - template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) { - int32x4x2_t vtrn1, vtrn2, res1, res2; - Packet4i sum1, sum2, sum; - - // NEON zip performs interleaving of the supplied vectors. - // We perform two interleaves in a row to acquire the transposed vector - vtrn1 = vzipq_s32(vecs[0], vecs[2]); - vtrn2 = vzipq_s32(vecs[1], vecs[3]); - res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]); - res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]); - - // Do the addition of the resulting vectors - sum1 = vaddq_s32(res1.val[0], res1.val[1]); - sum2 = vaddq_s32(res2.val[0], res2.val[1]); - sum = vaddq_s32(sum1, sum2); - - return sum; + const int32x4x2_t vtrn1 = vzipq_s32(vecs[0], vecs[2]); + const int32x4x2_t vtrn2 = vzipq_s32(vecs[1], vecs[3]); + const int32x4x2_t res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]); + const int32x4x2_t res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]); + return vaddq_s32(vaddq_s32(res1.val[0], res1.val[1]), vaddq_s32(res2.val[0], res2.val[1])); } // Other reduction functions: // mul template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { - float32x2_t a_lo, a_hi, prod; - - // Get a_lo = |a1|a2| and a_hi = |a3|a4| - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); // Get the product of a_lo * a_hi -> |a1*a3|a2*a4| - prod = vmul_f32(a_lo, a_hi); + const float32x2_t prod = vmul_f32(vget_low_f32(a), vget_high_f32(a)); // Multiply prod with its swapped value |a2*a4|a1*a3| - prod = vmul_f32(prod, vrev64_f32(prod)); - - return vget_lane_f32(prod, 0); + return vget_lane_f32(vmul_f32(prod, vrev64_f32(prod)), 0); } template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet4i& a) { - int32x2_t a_lo, a_hi, prod; - - // Get a_lo = |a1|a2| and a_hi = |a3|a4| - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); // Get the product of a_lo * a_hi -> |a1*a3|a2*a4| - prod = vmul_s32(a_lo, a_hi); + const int32x2_t prod = vmul_s32(vget_low_s32(a), vget_high_s32(a)); // Multiply prod with its swapped value |a2*a4|a1*a3| - prod = vmul_s32(prod, vrev64_s32(prod)); - - return vget_lane_s32(prod, 0); + return vget_lane_s32(vmul_s32(prod, vrev64_s32(prod)), 0); } // min template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) { - float32x2_t a_lo, a_hi, min; - - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); - min = vpmin_f32(a_lo, a_hi); - min = vpmin_f32(min, min); - - return vget_lane_f32(min, 0); + const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a)); + return vget_lane_f32(vpmin_f32(min, min), 0); } - template<> EIGEN_STRONG_INLINE int32_t predux_min(const Packet4i& a) { - int32x2_t a_lo, a_hi, min; - - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); - min = vpmin_s32(a_lo, a_hi); - min = vpmin_s32(min, min); - - return vget_lane_s32(min, 0); + const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a)); + return vget_lane_s32(vpmin_s32(min, min), 0); } // max template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) { - float32x2_t a_lo, a_hi, max; - - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); - max = vpmax_f32(a_lo, a_hi); - max = vpmax_f32(max, max); - - return vget_lane_f32(max, 0); + const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a)); + return vget_lane_f32(vpmax_f32(max, max), 0); } - template<> EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) { - int32x2_t a_lo, a_hi, max; - - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); - max = vpmax_s32(a_lo, a_hi); - max = vpmax_s32(max, max); - - return vget_lane_s32(max, 0); + const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a)); + return vget_lane_s32(vpmax_s32(max, max), 0); } template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) { uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)), vget_high_u32(vreinterpretq_u32_f32(x))); - return vget_lane_u32(vpmax_u32(tmp,tmp),0); + return vget_lane_u32(vpmax_u32(tmp, tmp), 0); } // this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, @@ -573,21 +547,22 @@ struct palign_impl\ }\ };\ -PALIGN_NEON(0,Packet4f,vextq_f32) -PALIGN_NEON(1,Packet4f,vextq_f32) -PALIGN_NEON(2,Packet4f,vextq_f32) -PALIGN_NEON(3,Packet4f,vextq_f32) -PALIGN_NEON(0,Packet4i,vextq_s32) -PALIGN_NEON(1,Packet4i,vextq_s32) -PALIGN_NEON(2,Packet4i,vextq_s32) -PALIGN_NEON(3,Packet4i,vextq_s32) +PALIGN_NEON(0, Packet4f, vextq_f32) +PALIGN_NEON(1, Packet4f, vextq_f32) +PALIGN_NEON(2, Packet4f, vextq_f32) +PALIGN_NEON(3, Packet4f, vextq_f32) + +PALIGN_NEON(0, Packet4i, vextq_s32) +PALIGN_NEON(1, Packet4i, vextq_s32) +PALIGN_NEON(2, Packet4i, vextq_s32) +PALIGN_NEON(3, Packet4i, vextq_s32) #undef PALIGN_NEON -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]); - float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]); +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) +{ + const float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]); + const float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]); kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0])); kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0])); @@ -595,10 +570,11 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1])); } -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]); - int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]); +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) +{ + const int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]); + const int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]); + kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0])); kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0])); kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1])); @@ -624,17 +600,9 @@ ptranspose(PacketBlock& kernel) { // Defining these functions as templates ensures that if these intrinsics are // already defined in arm_neon.h, then our workaround doesn't cause a conflict // and has lower priority in overload resolution. -template -uint64x2_t vreinterpretq_u64_f64(T a) -{ - return (uint64x2_t) a; -} +template uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; } -template -float64x2_t vreinterpretq_f64_u64(T a) -{ - return (float64x2_t) a; -} +template float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; } typedef float64x2_t Packet2d; typedef float64x1_t Packet1d; @@ -643,12 +611,13 @@ template<> struct packet_traits : default_packet_traits { typedef Packet2d type; typedef Packet2d half; - enum { + enum + { Vectorizable = 1, AlignedOnScalar = 1, size = 2, - HasHalfPacket=0, - + HasHalfPacket = 0, + HasDiv = 1, // FIXME check the Has* HasSin = 0, @@ -659,16 +628,28 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; }; +template<> struct unpacket_traits +{ + typedef double type; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; + typedef Packet2d half; +}; template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return vdupq_n_f64(from); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { - const double countdown_raw[] = {0.0,1.0}; - const Packet2d countdown = vld1q_f64(countdown_raw); - return vaddq_f64(pset1(a), countdown); + const double c[] = {0.0,1.0}; + return vaddq_f64(pset1(a), vld1q_f64(c)); } + template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); } template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); } @@ -683,9 +664,11 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const #ifdef __ARM_FEATURE_FMA // See bug 936. See above comment about FMA for float. -template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) +{ return vfmaq_f64(c,a,b); } #else -template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) +{ return vmlaq_f64(c,a,b); } #endif template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); } @@ -694,94 +677,93 @@ template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) -{ - return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); -} +{ return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) -{ - return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); -} +{ return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) -{ - return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); -} +{ return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) -{ - return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); -} +{ return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } -template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return vreinterpretq_f64_u64(vcleq_f64(a,b)); } -template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return vreinterpretq_f64_u64(vcltq_f64(a,b)); } -template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return vreinterpretq_f64_u64(vceqq_f64(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(vcleq_f64(a,b)); } -template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(vcltq_f64(a,b)); } -template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(vceqq_f64(a,b)); } -template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) -{ - return vld1q_dup_f64(from); -} -template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); } +template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); } + +template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); } -template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); } +template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { return vld1q_dup_f64(from); } +template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to,from); } + +template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to,from); } template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { Packet2d res = pset1(0.0); - res = vsetq_lane_f64(from[0*stride], res, 0); - res = vsetq_lane_f64(from[1*stride], res, 1); + res = vld1q_lane_f64(from + 0*stride, res, 0); + res = vld1q_lane_f64(from + 1*stride, res, 1); return res; } + template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { - to[stride*0] = vgetq_lane_f64(from, 0); - to[stride*1] = vgetq_lane_f64(from, 1); + vst1q_lane_f64(to + stride*0, from, 0); + vst1q_lane_f64(to + stride*1, from, 1); } + template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ARM_PREFETCH(addr); } // FIXME only store the 2 first elements ? -template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return vgetq_lane_f64(a, 0); } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return vgetq_lane_f64(a,0); } -template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); } +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) +{ return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); } template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } #if EIGEN_COMP_CLANG && defined(__apple_build_version__) // workaround ICE, see bug 907 -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; } +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) +{ return (vget_low_f64(a) + vget_high_f64(a))[0]; } #else -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) +{ return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } #endif template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) { - float64x2_t trn1, trn2; - - // NEON zip performs interleaving of the supplied vectors. - // We perform two interleaves in a row to acquire the transposed vector - trn1 = vzip1q_f64(vecs[0], vecs[1]); - trn2 = vzip2q_f64(vecs[0], vecs[1]); - - // Do the addition of the resulting vectors - return vaddq_f64(trn1, trn2); + return vaddq_f64(vzip1q_f64(vecs[0], vecs[1]), vzip2q_f64(vecs[0], vecs[1])); } // Other reduction functions: // mul #if EIGEN_COMP_CLANG && defined(__apple_build_version__) -template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; } +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) +{ return (vget_low_f64(a) * vget_high_f64(a))[0]; } #else -template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) +{ return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } #endif // min -template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); } +template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) +{ return vgetq_lane_f64(vpminq_f64(a,a), 0); } // max -template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); } +template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) +{ return vgetq_lane_f64(vpmaxq_f64(a,a), 0); } // this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, // see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074 @@ -796,19 +778,20 @@ struct palign_impl\ }\ };\ -PALIGN_NEON(0,Packet2d,vextq_f64) -PALIGN_NEON(1,Packet2d,vextq_f64) +PALIGN_NEON(0, Packet2d, vextq_f64) +PALIGN_NEON(1, Packet2d, vextq_f64) #undef PALIGN_NEON EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]); - float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]); +ptranspose(PacketBlock& kernel) +{ + const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]); + const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]); - kernel.packet[0] = trn1; - kernel.packet[1] = trn2; + kernel.packet[0] = tmp1; + kernel.packet[1] = tmp2; } -#endif // EIGEN_ARCH_ARM64 +#endif // EIGEN_ARCH_ARM64 } // end namespace internal diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h index 20dbe1332..5036ee92b 100644 --- a/Eigen/src/Core/arch/NEON/TypeCasting.h +++ b/Eigen/src/Core/arch/NEON/TypeCasting.h @@ -10,44 +10,24 @@ #ifndef EIGEN_TYPE_CASTING_NEON_H #define EIGEN_TYPE_CASTING_NEON_H +#include + namespace Eigen { namespace internal { -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - - -template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { - return vcvtq_s32_f32(a); -} - -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { - return vcvtq_f32_s32(a); -} - -template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { - return vreinterpretq_s32_f32(a); -} - -template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { - return vreinterpretq_f32_s32(a); -} +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; +template<> struct type_casting_traits +{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { return vcvtq_f32_s32(a); } +template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { return vcvtq_s32_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) +{ return vreinterpretq_f32_s32(a); } +template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) +{ return vreinterpretq_s32_f32(a); } } // end namespace internal -- cgit v1.2.3