From 38ae5353ab6f1050aed64821ac56a1561096cdce Mon Sep 17 00:00:00 2001 From: Guoqiang QI <425418567@qq.com> Date: Wed, 13 Jan 2021 22:54:03 +0000 Subject: 1)provide a better generic paddsub op implementation 2)make paddsub op support the Packet2cf/Packet4f/Packet2f in NEON 3)make paddsub op support the Packet2cf/Packet4f in SSE --- Eigen/Geometry | 4 +- Eigen/src/Core/GenericPacketMath.h | 6 + .../Core/arch/Default/GenericPacketMathFunctions.h | 6 + .../arch/Default/GenericPacketMathFunctionsFwd.h | 4 + Eigen/src/Core/arch/NEON/Complex.h | 7 + Eigen/src/Core/arch/NEON/PacketMath.h | 49 ++++-- Eigen/src/Core/arch/SSE/Complex.h | 7 + Eigen/src/Core/arch/SSE/PacketMath.h | 22 +++ Eigen/src/Core/util/StaticAssert.h | 3 +- Eigen/src/Geometry/arch/Geometry_SIMD.h | 165 ++++++++++++++++++++ Eigen/src/Geometry/arch/Geometry_SSE.h | 170 --------------------- 11 files changed, 259 insertions(+), 184 deletions(-) create mode 100644 Eigen/src/Geometry/arch/Geometry_SIMD.h delete mode 100644 Eigen/src/Geometry/arch/Geometry_SSE.h diff --git a/Eigen/Geometry b/Eigen/Geometry index 16b4bd6e1..e93fa726e 100644 --- a/Eigen/Geometry +++ b/Eigen/Geometry @@ -50,8 +50,8 @@ #include "src/Geometry/Umeyama.h" // Use the SSE optimized version whenever possible. -#if defined EIGEN_VECTORIZE_SSE -#include "src/Geometry/arch/Geometry_SSE.h" +#if (defined EIGEN_VECTORIZE_SSE) || (defined EIGEN_VECTORIZE_NEON) +#include "src/Geometry/arch/Geometry_SIMD.h" #endif #include "src/Core/util/ReenableStupidWarnings.h" diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 671ed3c89..ec7d20e73 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -407,6 +407,12 @@ pabs(const unsigned long& a) { return a; } template<> EIGEN_DEVICE_FUNC inline unsigned long long pabs(const unsigned long long& a) { return a; } +/** \internal \returns the addsub value of \a a,b */ +template EIGEN_DEVICE_FUNC inline Packet +paddsub(const Packet& a, const Packet& b) { + return pselect(peven_mask(a), padd(a, b), psub(a, b)); + } + /** \internal \returns the phase angle of \a a */ template EIGEN_DEVICE_FUNC inline Packet parg(const Packet& a) { using numext::arg; return arg(a); } diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 9253d8cab..f40093455 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -19,6 +19,12 @@ namespace Eigen { namespace internal { +template EIGEN_DEVICE_FUNC inline Packet +pset(const typename unpacket_traits::type (&a)[N] /* a */) { + EIGEN_STATIC_ASSERT(unpacket_traits::size == N, THE_ARRAY_SIZE_SHOULD_EQUAL_WITH_PACKET_SIZE); + return pload(a); +} + template EIGEN_STRONG_INLINE Packet pfrexp_float(const Packet& a, Packet& exponent) { typedef typename unpacket_traits::integer_packet PacketI; diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h index 491f1c927..a623f54cb 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h @@ -17,6 +17,10 @@ namespace internal { // implemented in GenericPacketMathFunctions.h // This is needed to workaround a circular dependency. +/** \internal \returns a packet with constant coefficients \a a, e.g.: (a[N-1],...,a[0]) */ +template EIGEN_DEVICE_FUNC inline Packet +pset(const typename unpacket_traits::type (&a)[N] /* a */); + template EIGEN_STRONG_INLINE Packet pfrexp_float(const Packet& a, Packet& exponent); diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index a889ab1d2..1aa361bc0 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -124,6 +124,13 @@ template<> EIGEN_STRONG_INLINE Packet1cf psub(const Packet1cf& a, con template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pxor(const Packet2cf& a, const Packet2cf& b); +template<> EIGEN_STRONG_INLINE Packet2cf paddsub(const Packet2cf& a, const Packet2cf& b) +{ + Packet4f mask = {-0.0f, -0.0f, 0.0f, 0.0f}; + return Packet2cf(padd(a.v, pxor(mask, b.v))); +} + template<> EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) { return Packet1cf(pnegate(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); } diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 14f3dbd0f..1f34faae0 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -84,12 +84,18 @@ typedef uint64x2_t Packet2ul; #endif // EIGEN_COMP_MSVC +EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){ + const float* a = reinterpret_cast(&m); + Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))}; + return res; +} + // fuctionally equivalent to _mm_shuffle_ps in SSE when interleave // == false (i.e. shuffle(m, n, mask) equals _mm_shuffle_ps(m, n, mask)), // interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h -// to enable a shared implementation for fast inversion of matrices of size 4. -template -EIGEN_STRONG_INLINE Packet4f shuffle(const Packet4f &m, const Packet4f &n, int mask) +// to enable a shared implementation for fast inversion of matrices of size 4. +template +EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask) { const float* a = reinterpret_cast(&m); const float* b = reinterpret_cast(&n); @@ -97,8 +103,8 @@ EIGEN_STRONG_INLINE Packet4f shuffle(const Packet4f &m, const Packet4f &n, int m return res; } -template<> -EIGEN_STRONG_INLINE Packet4f shuffle(const Packet4f &m, const Packet4f &n, int mask) +template<> +EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask) { const float* a = reinterpret_cast(&m); const float* b = reinterpret_cast(&n); @@ -108,25 +114,29 @@ EIGEN_STRONG_INLINE Packet4f shuffle(const Packet4f &m, const Packet4f &n, EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {return ((s)<<6|(r)<<4|(q)<<2|(p));} +EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) +{ + return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s)); +} EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) -{ - return shuffle(a,b,eigen_neon_shuffle_mask(p, q, r, s)); +{ + return shuffle2(a,b,eigen_neon_shuffle_mask(p, q, r, s)); } EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) { - return shuffle(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1)); + return shuffle2(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1)); } EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) { - return shuffle(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3)); + return shuffle2(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3)); } EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) { - return shuffle(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1)); + return shuffle2(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1)); } EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) { - return shuffle(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3)); + return shuffle2(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3)); } #define vec4f_duplane(a, p) \ vdupq_lane_f32(vget_low_f32(a), p) @@ -851,6 +861,17 @@ template<> EIGEN_STRONG_INLINE Packet4ui psub(const Packet4ui& a, con template<> EIGEN_STRONG_INLINE Packet2l psub(const Packet2l& a, const Packet2l& b) { return vsubq_s64(a,b); } template<> EIGEN_STRONG_INLINE Packet2ul psub(const Packet2ul& a, const Packet2ul& b) { return vsubq_u64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2f pxor(const Packet2f& a, const Packet2f& b); +template<> EIGEN_STRONG_INLINE Packet2f paddsub(const Packet2f& a, const Packet2f & b) { + Packet2f mask = {-0.0f, 0.0f}; + return padd(a, pxor(mask, b)); +} +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b); +template<> EIGEN_STRONG_INLINE Packet4f paddsub(const Packet4f& a, const Packet4f& b) { + Packet4f mask = {-0.0f, 0.0f, -0.0f, 0.0f}; + return padd(a, pxor(mask, b)); +} + template<> EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) { return vneg_f32(a); } template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); } template<> EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a) @@ -3717,6 +3738,12 @@ template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& , const Packet2d& ); +template<> EIGEN_STRONG_INLINE Packet2d paddsub(const Packet2d& a, const Packet2d& b){ + const Packet2d mask = {-0.0,0.0}; + return padd(a, pxor(mask, b)); +} + template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); } template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 58cdb5dbe..f6f1b8c9f 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -66,6 +66,13 @@ template<> struct unpacket_traits { template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pxor(const Packet2cf& a, const Packet2cf& b); +template<> EIGEN_STRONG_INLINE Packet2cf paddsub(const Packet2cf& a, const Packet2cf& b) +{ + const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x0,0x0)); + return Packet2cf(padd(a.v, pxor(mask, b.v))); +} + template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000)); diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 4e733c77e..4c5b664e6 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -301,6 +301,28 @@ template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); } template<> EIGEN_STRONG_INLINE Packet16b psub(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b); +template<> EIGEN_STRONG_INLINE Packet4f paddsub(const Packet4f& a, const Packet4f& b) +{ +#ifdef EIGEN_VECTORIZE_SSE3 + return _mm_addsub_ps(a,b); +#else + const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x0,0x80000000,0x0)); + return padd(a, pxor(mask, b)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& , const Packet2d& ); +template<> EIGEN_STRONG_INLINE Packet2d paddsub(const Packet2d& a, const Packet2d& b) +{ +#ifdef EIGEN_VECTORIZE_SSE3 + return _mm_addsub_pd(a,b); +#else + const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x0)); + return padd(a, pxor(mask, b)); +#endif +} + template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000)); diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index 95107ff36..2ef19a4dd 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -105,7 +105,8 @@ CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1, SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1, INVALID_TEMPLATE_PARAMETER=1, - GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1 + GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1, + THE_ARRAY_SIZE_SHOULD_EQUAL_WITH_PACKET_SIZE=1 }; }; diff --git a/Eigen/src/Geometry/arch/Geometry_SIMD.h b/Eigen/src/Geometry/arch/Geometry_SIMD.h new file mode 100644 index 000000000..9c15bfb98 --- /dev/null +++ b/Eigen/src/Geometry/arch/Geometry_SIMD.h @@ -0,0 +1,165 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Rohit Garg +// Copyright (C) 2009-2010 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_GEOMETRY_SIMD_H +#define EIGEN_GEOMETRY_SIMD_H + +namespace Eigen { + +namespace internal { + +template +struct quat_product +{ + enum { + AAlignment = traits::Alignment, + BAlignment = traits::Alignment, + ResAlignment = traits >::Alignment + }; + static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) + { + evaluator ae(_a.coeffs()); + evaluator be(_b.coeffs()); + Quaternion res; + float arr[4] = {0.f, 0.f, 0.f, -0.f}; + const Packet4f mask = pset(arr); + Packet4f a = ae.template packet(0); + Packet4f b = be.template packet(0); + Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); + Packet4f s2 = pmul(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1)); + pstoret( + &res.x(), + padd(psub(pmul(a,vec4f_swizzle1(b,3,3,3,3)), + pmul(vec4f_swizzle1(a,2,0,1,0), + vec4f_swizzle1(b,1,2,0,0))), + pxor(mask,padd(s1,s2)))); + + return res; + } +}; + +template +struct quat_conj +{ + enum { + ResAlignment = traits >::Alignment + }; + static inline Quaternion run(const QuaternionBase& q) + { + evaluator qe(q.coeffs()); + Quaternion res; + float arr[4] = {-0.f,-0.f,-0.f,0.f}; + const Packet4f mask = pset(arr); + pstoret(&res.x(), pxor(mask, qe.template packet::Alignment,Packet4f>(0))); + return res; + } +}; + + +template +struct cross3_impl +{ + enum { + ResAlignment = traits::type>::Alignment + }; + static inline typename plain_matrix_type::type + run(const VectorLhs& lhs, const VectorRhs& rhs) + { + evaluator lhs_eval(lhs); + evaluator rhs_eval(rhs); + Packet4f a = lhs_eval.template packet::Alignment,Packet4f>(0); + Packet4f b = rhs_eval.template packet::Alignment,Packet4f>(0); + Packet4f mul1 = pmul(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3)); + Packet4f mul2 = pmul(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3)); + typename plain_matrix_type::type res; + pstoret(&res.x(),psub(mul1,mul2)); + return res; + } +}; + + + +#if (defined EIGEN_VECTORIZE_SSE) || (EIGEN_ARCH_ARM64) + +template +struct quat_product +{ + enum { + BAlignment = traits::Alignment, + ResAlignment = traits >::Alignment + }; + + static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) + { + Quaternion res; + + evaluator ae(_a.coeffs()); + evaluator be(_b.coeffs()); + + const double* a = _a.coeffs().data(); + Packet2d b_xy = be.template packet(0); + Packet2d b_zw = be.template packet(2); + Packet2d a_xx = pset1(a[0]); + Packet2d a_yy = pset1(a[1]); + Packet2d a_zz = pset1(a[2]); + Packet2d a_ww = pset1(a[3]); + + // two temporaries: + Packet2d t1, t2; + + /* + * t1 = ww*xy + yy*zw + * t2 = zz*xy - xx*zw + * res.xy = t1 +/- swap(t2) + */ + t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw)); + t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw)); + pstoret(&res.x(), paddsub(t1, preverse(t2))); + + /* + * t1 = ww*zw - yy*xy + * t2 = zz*zw + xx*xy + * res.zw = t1 -/+ swap(t2) = swap( swap(t1) +/- t2) + */ + t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy)); + t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy)); + pstoret(&res.z(), preverse(paddsub(preverse(t1), t2))); + + return res; +} +}; + +template +struct quat_conj +{ + enum { + ResAlignment = traits >::Alignment + }; + static inline Quaternion run(const QuaternionBase& q) + { + evaluator qe(q.coeffs()); + Quaternion res; + double arr1[2] = {-0.0, -0.0}; + double arr2[2] = {-0.0, 0.0}; + const Packet2d mask0 = pset(arr1); + const Packet2d mask2 = pset(arr2); + pstoret(&res.x(), pxor(mask0, qe.template packet::Alignment,Packet2d>(0))); + pstoret(&res.z(), pxor(mask2, qe.template packet::Alignment,Packet2d>(2))); + return res; + } +}; + +#endif // end EIGEN_VECTORIZE_SSE_OR_EIGEN_ARCH_ARM64 + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_GEOMETRY_SIMD_H diff --git a/Eigen/src/Geometry/arch/Geometry_SSE.h b/Eigen/src/Geometry/arch/Geometry_SSE.h deleted file mode 100644 index 108cc9f8e..000000000 --- a/Eigen/src/Geometry/arch/Geometry_SSE.h +++ /dev/null @@ -1,170 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2009 Rohit Garg -// Copyright (C) 2009-2010 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_GEOMETRY_SSE_H -#define EIGEN_GEOMETRY_SSE_H - -namespace Eigen { - -namespace internal { - -template -struct quat_product -{ - enum { - AAlignment = traits::Alignment, - BAlignment = traits::Alignment, - ResAlignment = traits >::Alignment - }; - static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) - { - evaluator ae(_a.coeffs()); - evaluator be(_b.coeffs()); - Quaternion res; - const Packet4f mask = _mm_setr_ps(0.f,0.f,0.f,-0.f); - Packet4f a = ae.template packet(0); - Packet4f b = be.template packet(0); - Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); - Packet4f s2 = pmul(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1)); - pstoret( - &res.x(), - padd(psub(pmul(a,vec4f_swizzle1(b,3,3,3,3)), - pmul(vec4f_swizzle1(a,2,0,1,0), - vec4f_swizzle1(b,1,2,0,0))), - pxor(mask,padd(s1,s2)))); - - return res; - } -}; - -template -struct quat_conj -{ - enum { - ResAlignment = traits >::Alignment - }; - static inline Quaternion run(const QuaternionBase& q) - { - evaluator qe(q.coeffs()); - Quaternion res; - const Packet4f mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f); - pstoret(&res.x(), pxor(mask, qe.template packet::Alignment,Packet4f>(0))); - return res; - } -}; - - -template -struct cross3_impl -{ - enum { - ResAlignment = traits::type>::Alignment - }; - static inline typename plain_matrix_type::type - run(const VectorLhs& lhs, const VectorRhs& rhs) - { - evaluator lhs_eval(lhs); - evaluator rhs_eval(rhs); - Packet4f a = lhs_eval.template packet::Alignment,Packet4f>(0); - Packet4f b = rhs_eval.template packet::Alignment,Packet4f>(0); - Packet4f mul1 = pmul(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3)); - Packet4f mul2 = pmul(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3)); - typename plain_matrix_type::type res; - pstoret(&res.x(),psub(mul1,mul2)); - return res; - } -}; - - - - -template -struct quat_product -{ - enum { - BAlignment = traits::Alignment, - ResAlignment = traits >::Alignment - }; - - static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) - { - const Packet2d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); - - Quaternion res; - - evaluator ae(_a.coeffs()); - evaluator be(_b.coeffs()); - - const double* a = _a.coeffs().data(); - Packet2d b_xy = be.template packet(0); - Packet2d b_zw = be.template packet(2); - Packet2d a_xx = pset1(a[0]); - Packet2d a_yy = pset1(a[1]); - Packet2d a_zz = pset1(a[2]); - Packet2d a_ww = pset1(a[3]); - - // two temporaries: - Packet2d t1, t2; - - /* - * t1 = ww*xy + yy*zw - * t2 = zz*xy - xx*zw - * res.xy = t1 +/- swap(t2) - */ - t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw)); - t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw)); -#ifdef EIGEN_VECTORIZE_SSE3 - EIGEN_UNUSED_VARIABLE(mask) - pstoret(&res.x(), _mm_addsub_pd(t1, preverse(t2))); -#else - pstoret(&res.x(), padd(t1, pxor(mask,preverse(t2)))); -#endif - - /* - * t1 = ww*zw - yy*xy - * t2 = zz*zw + xx*xy - * res.zw = t1 -/+ swap(t2) = swap( swap(t1) +/- t2) - */ - t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy)); - t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy)); -#ifdef EIGEN_VECTORIZE_SSE3 - EIGEN_UNUSED_VARIABLE(mask) - pstoret(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2))); -#else - pstoret(&res.z(), psub(t1, pxor(mask,preverse(t2)))); -#endif - - return res; -} -}; - -template -struct quat_conj -{ - enum { - ResAlignment = traits >::Alignment - }; - static inline Quaternion run(const QuaternionBase& q) - { - evaluator qe(q.coeffs()); - Quaternion res; - const Packet2d mask0 = _mm_setr_pd(-0.,-0.); - const Packet2d mask2 = _mm_setr_pd(-0.,0.); - pstoret(&res.x(), pxor(mask0, qe.template packet::Alignment,Packet2d>(0))); - pstoret(&res.z(), pxor(mask2, qe.template packet::Alignment,Packet2d>(2))); - return res; - } -}; - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_GEOMETRY_SSE_H -- cgit v1.2.3