diff options
author | Guoqiang QI <425418567@qq.com> | 2021-01-13 22:54:03 +0000 |
---|---|---|
committer | Rasmus Munk Larsen <rmlarsen@google.com> | 2021-01-13 22:54:03 +0000 |
commit | 38ae5353ab6f1050aed64821ac56a1561096cdce (patch) | |
tree | bc2cbea514d3843d300695559b96abeac4ceefc3 /Eigen/src/Core/arch | |
parent | 352f1422d3ceea19a04cab297c6339e0870e1c6c (diff) |
1)provide a better generic paddsub op implementation
2)make paddsub op support the Packet2cf/Packet4f/Packet2f in NEON
3)make paddsub op support the Packet2cf/Packet4f in SSE
Diffstat (limited to 'Eigen/src/Core/arch')
-rw-r--r-- | Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h | 6 | ||||
-rw-r--r-- | Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h | 4 | ||||
-rw-r--r-- | Eigen/src/Core/arch/NEON/Complex.h | 7 | ||||
-rw-r--r-- | Eigen/src/Core/arch/NEON/PacketMath.h | 49 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/Complex.h | 7 | ||||
-rwxr-xr-x | Eigen/src/Core/arch/SSE/PacketMath.h | 22 |
6 files changed, 84 insertions, 11 deletions
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 9253d8cab..f40093455 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -19,6 +19,12 @@ namespace Eigen { namespace internal { +template<typename Packet, int N> EIGEN_DEVICE_FUNC inline Packet +pset(const typename unpacket_traits<Packet>::type (&a)[N] /* a */) { + EIGEN_STATIC_ASSERT(unpacket_traits<Packet>::size == N, THE_ARRAY_SIZE_SHOULD_EQUAL_WITH_PACKET_SIZE); + return pload<Packet>(a); +} + template<typename Packet> EIGEN_STRONG_INLINE Packet pfrexp_float(const Packet& a, Packet& exponent) { typedef typename unpacket_traits<Packet>::integer_packet PacketI; diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h index 491f1c927..a623f54cb 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h @@ -17,6 +17,10 @@ namespace internal { // implemented in GenericPacketMathFunctions.h // This is needed to workaround a circular dependency. +/** \internal \returns a packet with constant coefficients \a a, e.g.: (a[N-1],...,a[0]) */ +template<typename Packet, int N> EIGEN_DEVICE_FUNC inline Packet +pset(const typename unpacket_traits<Packet>::type (&a)[N] /* a */); + template<typename Packet> EIGEN_STRONG_INLINE Packet pfrexp_float(const Packet& a, Packet& exponent); diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index a889ab1d2..1aa361bc0 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -124,6 +124,13 @@ template<> EIGEN_STRONG_INLINE Packet1cf psub<Packet1cf>(const Packet1cf& a, con template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b); +template<> EIGEN_STRONG_INLINE Packet2cf paddsub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) +{ + Packet4f mask = {-0.0f, -0.0f, 0.0f, 0.0f}; + return Packet2cf(padd(a.v, pxor(mask, b.v))); +} + template<> EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) { return Packet1cf(pnegate<Packet2f>(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate<Packet4f>(a.v)); } diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 14f3dbd0f..1f34faae0 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -84,12 +84,18 @@ typedef uint64x2_t Packet2ul; #endif // EIGEN_COMP_MSVC +EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){ + const float* a = reinterpret_cast<const float*>(&m); + Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))}; + return res; +} + // fuctionally equivalent to _mm_shuffle_ps in SSE when interleave // == false (i.e. shuffle<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask)), // interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h -// to enable a shared implementation for fast inversion of matrices of size 4. -template<bool interleave> -EIGEN_STRONG_INLINE Packet4f shuffle(const Packet4f &m, const Packet4f &n, int mask) +// to enable a shared implementation for fast inversion of matrices of size 4. +template<bool interleave> +EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask) { const float* a = reinterpret_cast<const float*>(&m); const float* b = reinterpret_cast<const float*>(&n); @@ -97,8 +103,8 @@ EIGEN_STRONG_INLINE Packet4f shuffle(const Packet4f &m, const Packet4f &n, int m return res; } -template<> -EIGEN_STRONG_INLINE Packet4f shuffle<true>(const Packet4f &m, const Packet4f &n, int mask) +template<> +EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f &m, const Packet4f &n, int mask) { const float* a = reinterpret_cast<const float*>(&m); const float* b = reinterpret_cast<const float*>(&n); @@ -108,25 +114,29 @@ EIGEN_STRONG_INLINE Packet4f shuffle<true>(const Packet4f &m, const Packet4f &n, EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {return ((s)<<6|(r)<<4|(q)<<2|(p));} +EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) +{ + return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s)); +} EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) -{ - return shuffle<false>(a,b,eigen_neon_shuffle_mask(p, q, r, s)); +{ + return shuffle2<false>(a,b,eigen_neon_shuffle_mask(p, q, r, s)); } EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) { - return shuffle<false>(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1)); + return shuffle2<false>(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1)); } EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) { - return shuffle<false>(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3)); + return shuffle2<false>(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3)); } EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) { - return shuffle<true>(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1)); + return shuffle2<true>(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1)); } EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) { - return shuffle<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3)); + return shuffle2<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3)); } #define vec4f_duplane(a, p) \ vdupq_lane_f32(vget_low_f32(a), p) @@ -851,6 +861,17 @@ template<> EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, con template<> EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) { return vsubq_s64(a,b); } template<> EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vsubq_u64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b); +template<> EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f & b) { + Packet2f mask = {-0.0f, 0.0f}; + return padd(a, pxor(mask, b)); +} +template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b); +template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) { + Packet4f mask = {-0.0f, 0.0f, -0.0f, 0.0f}; + return padd(a, pxor(mask, b)); +} + template<> EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) { return vneg_f32(a); } template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); } template<> EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a) @@ -3717,6 +3738,12 @@ template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& ); +template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b){ + const Packet2d mask = {-0.0,0.0}; + return padd(a, pxor(mask, b)); +} + template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); } template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 58cdb5dbe..f6f1b8c9f 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -66,6 +66,13 @@ template<> struct unpacket_traits<Packet2cf> { template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b); +template<> EIGEN_STRONG_INLINE Packet2cf paddsub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) +{ + const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x0,0x0)); + return Packet2cf(padd(a.v, pxor(mask, b.v))); +} + template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000)); diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 4e733c77e..4c5b664e6 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -301,6 +301,28 @@ template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); } template<> EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b); +template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) +{ +#ifdef EIGEN_VECTORIZE_SSE3 + return _mm_addsub_ps(a,b); +#else + const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x0,0x80000000,0x0)); + return padd(a, pxor(mask, b)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& ); +template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) +{ +#ifdef EIGEN_VECTORIZE_SSE3 + return _mm_addsub_pd(a,b); +#else + const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x0)); + return padd(a, pxor(mask, b)); +#endif +} + template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000)); |