From 394f564055f3723ead6dd45fdb9013ea77f8f6ad Mon Sep 17 00:00:00 2001
From: Guoqiang QI <425418567@qq.com>
Date: Tue, 17 Nov 2020 12:27:01 +0000
Subject: Unify Inverse_SSE.h and Inverse_NEON.h into a single generic
 implementation using PacketMath.

---
 Eigen/LU                               |   8 +-
 Eigen/src/Core/arch/NEON/PacketMath.h  |  73 +++++++
 Eigen/src/Core/arch/NEON/TypeCasting.h |   8 +
 Eigen/src/Core/arch/SSE/PacketMath.h   |  49 ++++-
 Eigen/src/Core/arch/SSE/TypeCasting.h  |   8 +
 Eigen/src/LU/arch/InverseSize4.h       | 348 ++++++++++++++++++
 Eigen/src/LU/arch/Inverse_NEON.h       | 372 ---------------------------------
 Eigen/src/LU/arch/Inverse_SSE.h        | 338 ------------------------------
 8 files changed, 482 insertions(+), 722 deletions(-)
 create mode 100644 Eigen/src/LU/arch/InverseSize4.h
 delete mode 100644 Eigen/src/LU/arch/Inverse_NEON.h
 delete mode 100644 Eigen/src/LU/arch/Inverse_SSE.h

diff --git a/Eigen/LU b/Eigen/LU
index ca72f1357..2a6b77180 100644
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -40,12 +40,8 @@
 
 // Use the SSE optimized version whenever possible. At the moment the
 // SSE version doesn't compile when AVX is enabled
-#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
-  #include "src/LU/arch/Inverse_SSE.h"
-#endif
-
-#if defined EIGEN_VECTORIZE_NEON
-  #include "src/LU/arch/Inverse_NEON.h"
+#if (defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX) || defined EIGEN_VECTORIZE_NEON
+  #include "src/LU/arch/InverseSize4.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index b927a165a..a51fc88c6 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -84,6 +84,53 @@ typedef uint64x2_t Packet2ul;
 
 #endif // EIGEN_COMP_MSVC
 
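The NEON additions that follow emulate SSE's immediate-controlled shuffles by decoding the 8-bit mask at run time, two bits per output lane. As a minimal standalone sketch (illustrative code, not part of the patch), this is how such a mask packs and decodes:

// How an _mm_shuffle_ps-style mask packs four 2-bit lane indices, low bits first.
#include <cstdio>

int main() {
  const int mask = (1 << 6) | (0 << 4) | (3 << 2) | 2;  // lanes 2,3,0,1 == 0x4E
  std::printf("%d %d %d %d\n",
              mask & 3, (mask >> 2) & 3, (mask >> 4) & 3, (mask >> 6) & 3);
  // prints: 2 3 0 1
  return 0;
}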
+// functionally equivalent to _mm_shuffle_ps in SSE when interleave
+// == false (i.e. shuffle<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask)),
+// interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
+// to enable a shared implementation for fast inversion of matrices of size 4.
+template<bool interleave>
+EIGEN_STRONG_INLINE Packet4f shuffle(const Packet4f &m, const Packet4f &n, int mask)
+{
+  const float* a = reinterpret_cast<const float*>(&m);
+  const float* b = reinterpret_cast<const float*>(&n);
+  Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
+  return res;
+}
+
+template<>
+EIGEN_STRONG_INLINE Packet4f shuffle<true>(const Packet4f &m, const Packet4f &n, int mask)
+{
+  const float* a = reinterpret_cast<const float*>(&m);
+  const float* b = reinterpret_cast<const float*>(&n);
+  Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
+  return res;
+}
+
+EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {return ((s)<<6|(r)<<4|(q)<<2|(p));}
+
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s)
+{
+  return shuffle<false>(a,b,eigen_neon_shuffle_mask(p, q, r, s));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle<false>(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle<false>(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle<true>(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
+{
+  return shuffle<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3));
+}
+#define vec4f_duplane(a, p) \
+  vdupq_lane_f32(vget_low_f32(a), p)
+
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
   const Packet4f p4f_##NAME = pset1<Packet4f>(X)
 
@@ -3525,6 +3572,32 @@ template <typename T> float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t)a; }
 
 typedef float64x2_t Packet2d;
 typedef float64x1_t Packet1d;
 
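The two-lane double-precision counterpart comes next. As a hedged scalar model (function and variable names here are illustrative only, not from the patch), its mask convention uses one bit per output lane, matching _mm_shuffle_pd:

#include <cstdio>

// Scalar model of the Packet2d shuffle below: one mask bit selects each lane.
void shuffle2(const double m[2], const double n[2], int mask, double out[2]) {
  out[0] = m[mask & 1];
  out[1] = n[(mask >> 1) & 1];
}

int main() {
  const double m[2] = {1.0, 2.0}, n[2] = {3.0, 4.0};
  double r[2];
  shuffle2(m, n, 3, r);                // mask 3 -> m[1], n[1], i.e. unpackhi
  std::printf("%g %g\n", r[0], r[1]);  // prints: 2 4
  return 0;
}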
+// functionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m, n, mask))
+// Currently used in LU/arch/InverseSize4.h to enable a shared implementation
+// for fast inversion of matrices of size 4.
+EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask)
+{
+  const double* a = reinterpret_cast<const double*>(&m);
+  const double* b = reinterpret_cast<const double*>(&n);
+  Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))};
+  return res;
+}
+
+EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask)
+{
+  return shuffle(a, b, mask);
+}
+EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a,const Packet2d& b)
+{
+  return shuffle(a, b, 0);
+}
+EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a,const Packet2d& b)
+{
+  return shuffle(a, b, 3);
+}
+#define vec2d_duplane(a, p) \
+  vdupq_laneq_f64(a, p)
+
 template<> struct packet_traits<double> : default_packet_traits
 {
   typedef Packet2d type;
diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h
index 80be213d2..54f97336e 100644
--- a/Eigen/src/Core/arch/NEON/TypeCasting.h
+++ b/Eigen/src/Core/arch/NEON/TypeCasting.h
@@ -1401,6 +1401,14 @@ template <> EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2d>(const Packet2d& a) {
   return vreinterpretq_u64_f64(a);
 }
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a) {
+  return vreinterpretq_f64_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
+  return vreinterpretq_s32_f64(a);
+}
 
 #endif // EIGEN_ARCH_ARM64
 
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index c39c0a06e..4db701491 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -52,22 +52,59 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; };
 template<> struct is_arithmetic<Packet4i>  { enum { value = true }; };
 template<> struct is_arithmetic<Packet16b> { enum { value = true }; };
 
-#define EIGEN_SSE_SHUFFLE_MASK(p,q,r,s) ((s)<<6|(r)<<4|(q)<<2|(p))
+template<int p, int q, int r, int s>
+struct shuffle_mask{
+ enum { mask = (s)<<6|(r)<<4|(q)<<2|(p) };
+};
 
+// TODO: change the implementation of all swizzle* ops from macro to template.
 #define vec4f_swizzle1(v,p,q,r,s) \
-  (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s))))
+  Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), (shuffle_mask<p,q,r,s>::mask))))
 
 #define vec4i_swizzle1(v,p,q,r,s) \
-  (_mm_shuffle_epi32( v, EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))
+  Packet4i(_mm_shuffle_epi32( v, (shuffle_mask<p,q,r,s>::mask)))
 
 #define vec2d_swizzle1(v,p,q) \
-  (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), EIGEN_SSE_SHUFFLE_MASK(2*p,2*p+1,2*q,2*q+1))))
+  Packet2d(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (shuffle_mask<2*p,2*p+1,2*q,2*q+1>::mask))))
 
 #define vec4f_swizzle2(a,b,p,q,r,s) \
-  (_mm_shuffle_ps( (a), (b), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))
+  Packet4f(_mm_shuffle_ps( (a), (b), (shuffle_mask<p,q,r,s>::mask)))
 
 #define vec4i_swizzle2(a,b,p,q,r,s) \
-  (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))))
+  Packet4i(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p,q,r,s>::mask)))))
+
+EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_movelh_ps(a,b));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_movehl_ps(a,b));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_unpacklo_ps(a,b));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
+{
+  return Packet4f(_mm_unpackhi_ps(a,b));
+}
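The shuffle_mask rewrite above moves the lane packing from the old EIGEN_SSE_SHUFFLE_MASK macro into a template, so the immediate stays a compile-time constant. A small self-contained sketch of the same packing (illustration only):

#include <cstdio>

// Same 2-bit packing as EIGEN_SSE_SHUFFLE_MASK(p,q,r,s), as a compile-time enum.
template <int p, int q, int r, int s>
struct shuffle_mask {
  enum { mask = (s) << 6 | (r) << 4 | (q) << 2 | (p) };
};

int main() {
  static_assert(shuffle_mask<0, 1, 2, 3>::mask == 0xE4, "identity shuffle");
  std::printf("0x%X\n", (unsigned)shuffle_mask<2, 3, 0, 1>::mask);  // prints: 0x4E
  return 0;
}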
+#define vec4f_duplane(a,p) \
+  vec4f_swizzle2(a,a,p,p,p,p)
+
+#define vec2d_swizzle2(a,b,mask) \
+  Packet2d(_mm_shuffle_pd(a,b,mask))
+
+EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b)
+{
+  return Packet2d(_mm_unpacklo_pd(a,b));
+}
+EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b)
+{
+  return Packet2d(_mm_unpackhi_pd(a,b));
+}
+#define vec2d_duplane(a,p) \
+  vec2d_swizzle2(a,a,(p<<1)|p)
 
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
   const Packet4f p4f_##NAME = pset1<Packet4f>(X)
 
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
index 3e6cd90e5..d2a0037e0 100644
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -77,6 +77,14 @@ template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a) {
   return _mm_castsi128_ps(a);
 }
 
+template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a) {
+  return _mm_castsi128_pd(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
+  return _mm_castpd_si128(a);
+}
+
 // Disable the following code since it's broken on too many platforms / compilers.
 //#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
 #if 0
diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h
new file mode 100644
index 000000000..a4c9ca207
--- /dev/null
+++ b/Eigen/src/LU/arch/InverseSize4.h
@@ -0,0 +1,348 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2001 Intel Corporation
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+// The algorithm below is a reimplementation of former \src\LU\Inverse_SSE.h using PacketMath.
+// inv(M) = M#/|M|, where inv(M), M# and |M| denote the inverse of M,
+// adjugate of M and determinant of M respectively. M# is computed block-wise
+// using specific formulae. For proof, see:
+// https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html
+// Variable names are adopted from \src\LU\Inverse_SSE.h.
+//
+// The SSE code for the 4x4 float and double matrix inverse in former (deprecated) \src\LU\Inverse_SSE.h
+// comes from the following Intel's library:
+// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
+//
+// Here is the respective copyright and license statement:
+//
+// Copyright (c) 2001 Intel Corporation.
+//
+// Permission is granted to use, copy, distribute and prepare derivative works
+// of this library for any purpose and without fee, provided, that the above
+// copyright notice and this statement appear in all copies.
+// Intel makes no representations about the suitability of this software for
+// any purpose, and specifically disclaims all warranties.
+// See LEGAL.TXT for all the legal information.
+//
+// TODO: Unify implementations of different data types (i.e. float and double).
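Restated from the comments above and the lxjk.github.io reference, the block formulas that both implementations below compute are (with A, B, C, D the 2x2 blocks of M and X# written here as X^{#}, the adjugate of X):

\[
M = \begin{pmatrix} A & B \\ C & D \end{pmatrix}, \qquad
\det M = |A|\,|D| + |B|\,|C| - \operatorname{tr}\!\left(A^{\#}B\,D^{\#}C\right),
\]
\[
iA = |D|\,A - B\,D^{\#}C, \quad
iB = |B|\,C - D\,(A^{\#}B)^{\#}, \quad
iC = |C|\,B - A\,(D^{\#}C)^{\#}, \quad
iD = |A|\,D - C\,(A^{\#}B),
\]

after which the inverse is assembled from iA, iB, iC, iD scaled by 1/det M; the final swizzles in the code apply the remaining adjugate-style transposition of each block.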
+#ifndef EIGEN_INVERSE_SIZE_4_H
+#define EIGEN_INVERSE_SIZE_4_H
+
+namespace Eigen
+{
+namespace internal
+{
+template <typename MatrixType, typename ResultType>
+struct compute_inverse_size4<Architecture::Target, float, MatrixType, ResultType>
+{
+  enum
+  {
+    MatrixAlignment = traits<MatrixType>::Alignment,
+    ResultAlignment = traits<ResultType>::Alignment,
+    StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
+  };
+  typedef typename conditional<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject>::type ActualMatrixType;
+
+  static void run(const MatrixType &mat, ResultType &result)
+  {
+    ActualMatrixType matrix(mat);
+
+    Packet4f _L1 = matrix.template packet<MatrixAlignment>(0);
+    Packet4f _L2 = matrix.template packet<MatrixAlignment>(4);
+    Packet4f _L3 = matrix.template packet<MatrixAlignment>(8);
+    Packet4f _L4 = matrix.template packet<MatrixAlignment>(12);
+
+    // Four 2x2 sub-matrices of the input matrix
+    // input = [[A, B],
+    //          [C, D]]
+    Packet4f A, B, C, D;
+
+    if (!StorageOrdersMatch)
+    {
+      A = vec4f_unpacklo(_L1, _L2);
+      B = vec4f_unpacklo(_L3, _L4);
+      C = vec4f_unpackhi(_L1, _L2);
+      D = vec4f_unpackhi(_L3, _L4);
+    }
+    else
+    {
+      A = vec4f_movelh(_L1, _L2);
+      B = vec4f_movehl(_L2, _L1);
+      C = vec4f_movelh(_L3, _L4);
+      D = vec4f_movehl(_L4, _L3);
+    }
+
+    Packet4f AB, DC;
+
+    // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
+    AB = pmul(vec4f_swizzle2(A, A, 3, 3, 0, 0), B);
+    AB = psub(AB, pmul(vec4f_swizzle2(A, A, 1, 1, 2, 2), vec4f_swizzle2(B, B, 2, 3, 0, 1)));
+
+    // DC = D#*C
+    DC = pmul(vec4f_swizzle2(D, D, 3, 3, 0, 0), C);
+    DC = psub(DC, pmul(vec4f_swizzle2(D, D, 1, 1, 2, 2), vec4f_swizzle2(C, C, 2, 3, 0, 1)));
+
+    // determinants of the sub-matrices
+    Packet4f dA, dB, dC, dD;
+
+    dA = pmul(vec4f_swizzle2(A, A, 3, 3, 1, 1), A);
+    dA = psub(dA, vec4f_movehl(dA, dA));
+
+    dB = pmul(vec4f_swizzle2(B, B, 3, 3, 1, 1), B);
+    dB = psub(dB, vec4f_movehl(dB, dB));
+
+    dC = pmul(vec4f_swizzle2(C, C, 3, 3, 1, 1), C);
+    dC = psub(dC, vec4f_movehl(dC, dC));
+
+    dD = pmul(vec4f_swizzle2(D, D, 3, 3, 1, 1), D);
+    dD = psub(dD, vec4f_movehl(dD, dD));
+
+    Packet4f d, d1, d2;
+
+    d = pmul(vec4f_swizzle2(DC, DC, 0, 2, 1, 3), AB);
+    d = padd(d, vec4f_movehl(d, d));
+    d = padd(d, vec4f_swizzle2(d, d, 1, 0, 0, 0));
+    d1 = pmul(dA, dD);
+    d2 = pmul(dB, dC);
+
+    // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
+    Packet4f det = vec4f_duplane(psub(padd(d1, d2), d), 0);
+
+    // reciprocal of the determinant of the input matrix, rd = 1/det
+    Packet4f rd = pdiv(pset1<Packet4f>(1.0f), det);
+
+    // Four sub-matrices of the inverse
+    Packet4f iA, iB, iC, iD;
+
+    // iD = D*|A| - C*A#*B
+    iD = pmul(vec4f_swizzle2(C, C, 0, 0, 2, 2), vec4f_movelh(AB, AB));
+    iD = padd(iD, pmul(vec4f_swizzle2(C, C, 1, 1, 3, 3), vec4f_movehl(AB, AB)));
+    iD = psub(pmul(D, vec4f_duplane(dA, 0)), iD);
+
+    // iA = A*|D| - B*D#*C
+    iA = pmul(vec4f_swizzle2(B, B, 0, 0, 2, 2), vec4f_movelh(DC, DC));
+    iA = padd(iA, pmul(vec4f_swizzle2(B, B, 1, 1, 3, 3), vec4f_movehl(DC, DC)));
+    iA = psub(pmul(A, vec4f_duplane(dD, 0)), iA);
+
+    // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
+    iB = pmul(D, vec4f_swizzle2(AB, AB, 3, 0, 3, 0));
+    iB = psub(iB, pmul(vec4f_swizzle2(D, D, 1, 0, 3, 2), vec4f_swizzle2(AB, AB, 2, 1, 2, 1)));
+    iB = psub(pmul(C, vec4f_duplane(dB, 0)), iB);
+
+    // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
+    iC = pmul(A, vec4f_swizzle2(DC, DC, 3, 0, 3, 0));
+    iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1)));
+    iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC);
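The next step applies a (+, -, -, +) sign pattern to the replicated reciprocal determinant by XORing the IEEE-754 sign bit into lanes 1 and 2, which is what the bits[]/pxor sequence below does. A scalar model of the trick (illustrative code, not part of the patch):

#include <cstdio>
#include <cstring>
#include <cstdint>

// XORing the sign bit (0x80000000) negates a float without any arithmetic.
float flip_sign(float x) {
  std::uint32_t u;
  std::memcpy(&u, &x, sizeof u);
  u ^= 0x80000000u;
  std::memcpy(&x, &u, sizeof x);
  return x;
}

int main() {
  std::printf("%g %g\n", flip_sign(0.25f), flip_sign(-2.0f));  // prints: -0.25 2
  return 0;
}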
+    const int bits[4] = {0, -2147483648, -2147483648, 0};
+    const Packet4f p4f_sign_PNNP = preinterpret<Packet4f, Packet4i>(pgather<int, Packet4i>(bits, static_cast<Eigen::Index>(1)));
+    rd = pxor(rd, p4f_sign_PNNP);
+    iA = pmul(iA, rd);
+    iB = pmul(iB, rd);
+    iC = pmul(iC, rd);
+    iD = pmul(iD, rd);
+
+    Index res_stride = result.outerStride();
+    float *res = result.data();
+
+    pstoret<float, Packet4f, ResultAlignment>(res + 0, vec4f_swizzle2(iA, iB, 3, 1, 3, 1));
+    pstoret<float, Packet4f, ResultAlignment>(res + res_stride, vec4f_swizzle2(iA, iB, 2, 0, 2, 0));
+    pstoret<float, Packet4f, ResultAlignment>(res + 2 * res_stride, vec4f_swizzle2(iC, iD, 3, 1, 3, 1));
+    pstoret<float, Packet4f, ResultAlignment>(res + 3 * res_stride, vec4f_swizzle2(iC, iD, 2, 0, 2, 0));
+  }
+};
+
+#if !(defined EIGEN_VECTORIZE_NEON && !(EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG))
+// same algorithm as above, except that each operand is split into
+// halves for two registers to hold.
+template <typename MatrixType, typename ResultType>
+struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultType>
+{
+  enum
+  {
+    MatrixAlignment = traits<MatrixType>::Alignment,
+    ResultAlignment = traits<ResultType>::Alignment,
+    StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
+  };
+  typedef typename conditional<(MatrixType::Flags & LinearAccessBit),
+                               MatrixType const &,
+                               typename MatrixType::PlainObject>::type
+      ActualMatrixType;
+
+  static void run(const MatrixType &mat, ResultType &result)
+  {
+    ActualMatrixType matrix(mat);
+
+    // Four 2x2 sub-matrices of the input matrix, each is further divided into upper and lower
+    // row e.g. A1, upper row of A, A2, lower row of A
+    // input = [[A, B],  =  [[[A1, [B1,
+    //          [C, D]]       A2], B2]],
+    //                       [[C1, [D1,
+    //                        C2], D2]]]
+
+    Packet2d A1, A2, B1, B2, C1, C2, D1, D2;
+
+    if (StorageOrdersMatch)
+    {
+      A1 = matrix.template packet<MatrixAlignment>(0);
+      B1 = matrix.template packet<MatrixAlignment>(2);
+      A2 = matrix.template packet<MatrixAlignment>(4);
+      B2 = matrix.template packet<MatrixAlignment>(6);
+      C1 = matrix.template packet<MatrixAlignment>(8);
+      D1 = matrix.template packet<MatrixAlignment>(10);
+      C2 = matrix.template packet<MatrixAlignment>(12);
+      D2 = matrix.template packet<MatrixAlignment>(14);
+    }
+    else
+    {
+      Packet2d temp;
+      A1 = matrix.template packet<MatrixAlignment>(0);
+      C1 = matrix.template packet<MatrixAlignment>(2);
+      A2 = matrix.template packet<MatrixAlignment>(4);
+      C2 = matrix.template packet<MatrixAlignment>(6);
+
+      temp = A1;
+      A1 = vec2d_unpacklo(A1, A2);
+      A2 = vec2d_unpackhi(temp, A2);
+
+      temp = C1;
+      C1 = vec2d_unpacklo(C1, C2);
+      C2 = vec2d_unpackhi(temp, C2);
+
+      B1 = matrix.template packet<MatrixAlignment>(8);
+      D1 = matrix.template packet<MatrixAlignment>(10);
+      B2 = matrix.template packet<MatrixAlignment>(12);
+      D2 = matrix.template packet<MatrixAlignment>(14);
+
+      temp = B1;
+      B1 = vec2d_unpacklo(B1, B2);
+      B2 = vec2d_unpackhi(temp, B2);
+
+      temp = D1;
+      D1 = vec2d_unpacklo(D1, D2);
+      D2 = vec2d_unpackhi(temp, D2);
+    }
+
+    // determinants of the sub-matrices
+    Packet2d dA, dB, dC, dD;
+
+    dA = vec2d_swizzle2(A2, A2, 1);
+    dA = pmul(A1, dA);
+    dA = psub(dA, vec2d_duplane(dA, 1));
+
+    dB = vec2d_swizzle2(B2, B2, 1);
+    dB = pmul(B1, dB);
+    dB = psub(dB, vec2d_duplane(dB, 1));
+
+    dC = vec2d_swizzle2(C2, C2, 1);
+    dC = pmul(C1, dC);
+    dC = psub(dC, vec2d_duplane(dC, 1));
+
+    dD = vec2d_swizzle2(D2, D2, 1);
+    dD = pmul(D1, dD);
+    dD = psub(dD, vec2d_duplane(dD, 1));
+
+    Packet2d DC1, DC2, AB1, AB2;
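Each |X| above is a 2x2 determinant computed entirely in-register: the upper row is multiplied by the swapped lower row, and the high lane is subtracted from the low one. A scalar model (illustrative names, not from the patch):

#include <cstdio>

// det(X) for X = [r1; r2]: multiply r1 by the swapped r2, subtract high lane.
double det2x2(const double r1[2], const double r2[2]) {
  const double swapped[2] = {r2[1], r2[0]};                    // vec2d_swizzle2(A2, A2, 1)
  const double prod[2] = {r1[0] * swapped[0], r1[1] * swapped[1]};
  return prod[0] - prod[1];                                    // psub(..., vec2d_duplane(..., 1)), lane 0
}

int main() {
  const double r1[2] = {1.0, 2.0}, r2[2] = {3.0, 4.0};
  std::printf("%g\n", det2x2(r1, r2));  // prints: -2  (1*4 - 2*3)
  return 0;
}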
+    // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
+    AB1 = pmul(B1, vec2d_duplane(A2, 1));
+    AB2 = pmul(B2, vec2d_duplane(A1, 0));
+    AB1 = psub(AB1, pmul(B2, vec2d_duplane(A1, 1)));
+    AB2 = psub(AB2, pmul(B1, vec2d_duplane(A2, 0)));
+
+    // DC = D#*C
+    DC1 = pmul(C1, vec2d_duplane(D2, 1));
+    DC2 = pmul(C2, vec2d_duplane(D1, 0));
+    DC1 = psub(DC1, pmul(C2, vec2d_duplane(D1, 1)));
+    DC2 = psub(DC2, pmul(C1, vec2d_duplane(D2, 0)));
+
+    Packet2d d1, d2;
+
+    // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
+    Packet2d det;
+
+    // reciprocal of the determinant of the input matrix, rd = 1/det
+    Packet2d rd;
+
+    d1 = pmul(AB1, vec2d_swizzle2(DC1, DC2, 0));
+    d2 = pmul(AB2, vec2d_swizzle2(DC1, DC2, 3));
+    rd = padd(d1, d2);
+    rd = padd(rd, vec2d_duplane(rd, 1));
+
+    d1 = pmul(dA, dD);
+    d2 = pmul(dB, dC);
+
+    det = padd(d1, d2);
+    det = psub(det, rd);
+    det = vec2d_duplane(det, 0);
+    rd = pdiv(pset1<Packet2d>(1.0), det);
+
+    // rows of four sub-matrices of the inverse
+    Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2;
+
+    // iD = D*|A| - C*A#*B
+    iD1 = pmul(AB1, vec2d_duplane(C1, 0));
+    iD2 = pmul(AB1, vec2d_duplane(C2, 0));
+    iD1 = padd(iD1, pmul(AB2, vec2d_duplane(C1, 1)));
+    iD2 = padd(iD2, pmul(AB2, vec2d_duplane(C2, 1)));
+    dA = vec2d_duplane(dA, 0);
+    iD1 = psub(pmul(D1, dA), iD1);
+    iD2 = psub(pmul(D2, dA), iD2);
+
+    // iA = A*|D| - B*D#*C
+    iA1 = pmul(DC1, vec2d_duplane(B1, 0));
+    iA2 = pmul(DC1, vec2d_duplane(B2, 0));
+    iA1 = padd(iA1, pmul(DC2, vec2d_duplane(B1, 1)));
+    iA2 = padd(iA2, pmul(DC2, vec2d_duplane(B2, 1)));
+    dD = vec2d_duplane(dD, 0);
+    iA1 = psub(pmul(A1, dD), iA1);
+    iA2 = psub(pmul(A2, dD), iA2);
+
+    // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
+    iB1 = pmul(D1, vec2d_swizzle2(AB2, AB1, 1));
+    iB2 = pmul(D2, vec2d_swizzle2(AB2, AB1, 1));
+    iB1 = psub(iB1, pmul(vec2d_swizzle2(D1, D1, 1), vec2d_swizzle2(AB2, AB1, 2)));
+    iB2 = psub(iB2, pmul(vec2d_swizzle2(D2, D2, 1), vec2d_swizzle2(AB2, AB1, 2)));
+    dB = vec2d_duplane(dB, 0);
+    iB1 = psub(pmul(C1, dB), iB1);
+    iB2 = psub(pmul(C2, dB), iB2);
+
+    // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
+    iC1 = pmul(A1, vec2d_swizzle2(DC2, DC1, 1));
+    iC2 = pmul(A2, vec2d_swizzle2(DC2, DC1, 1));
+    iC1 = psub(iC1, pmul(vec2d_swizzle2(A1, A1, 1), vec2d_swizzle2(DC2, DC1, 2)));
+    iC2 = psub(iC2, pmul(vec2d_swizzle2(A2, A2, 1), vec2d_swizzle2(DC2, DC1, 2)));
+    dC = vec2d_duplane(dC, 0);
+    iC1 = psub(pmul(B1, dC), iC1);
+    iC2 = psub(pmul(B2, dC), iC2);
+
+    const int bits1[4] = {0, -2147483648, 0, 0};
+    const int bits2[4] = {0, 0, 0, -2147483648};
+    const Packet2d _Sign_NP = preinterpret<Packet2d, Packet4i>(pgather<int, Packet4i>(bits1, static_cast<Eigen::Index>(1)));
+    const Packet2d _Sign_PN = preinterpret<Packet2d, Packet4i>(pgather<int, Packet4i>(bits2, static_cast<Eigen::Index>(1)));
+    d1 = pxor(rd, _Sign_PN);
+    d2 = pxor(rd, _Sign_NP);
+
+    Index res_stride = result.outerStride();
+    double *res = result.data();
+    pstoret<double, Packet2d, ResultAlignment>(res + 0, pmul(vec2d_swizzle2(iA2, iA1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + res_stride, pmul(vec2d_swizzle2(iA2, iA1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res + 2, pmul(vec2d_swizzle2(iB2, iB1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + res_stride + 2, pmul(vec2d_swizzle2(iB2, iB1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 0), d2));
+  }
+};
+#endif
+} // namespace internal
+} // namespace Eigen
+#endif
\ No newline at end of file
diff --git a/Eigen/src/LU/arch/Inverse_NEON.h b/Eigen/src/LU/arch/Inverse_NEON.h
deleted file mode 100644
index 
ed64b1b3a..000000000 --- a/Eigen/src/LU/arch/Inverse_NEON.h +++ /dev/null @@ -1,372 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -// -// The algorithm below is a re-implementation of \src\LU\Inverse_SSE.h using NEON -// intrinsics. inv(M) = M#/|M|, where inv(M), M# and |M| denote the inverse of M, -// adjugate of M and determinant of M respectively. M# is computed block-wise -// using specific formulae. For proof, see: -// https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html -// Variable names are adopted from \src\LU\Inverse_SSE.h. - -// TODO: Unify implementations of different data types (i.e. float and double) and -// different sets of instrinsics (i.e. SSE and NEON) -#ifndef EIGEN_INVERSE_NEON_H -#define EIGEN_INVERSE_NEON_H - -namespace Eigen -{ -namespace internal -{ -template -struct compute_inverse_size4 -{ - enum - { - MatrixAlignment = traits::Alignment, - ResultAlignment = traits::Alignment, - StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit) - }; - typedef typename conditional<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject>::type ActualMatrixType; - - // fuctionally equivalent to _mm_shuffle_ps in SSE when interleave - // == false (i.e. shuffle(m, n, mask, false) equals _mm_shuffle_ps(m, n, mask)), - // interleave m and n when interleave == true - static Packet4f shuffle(const Packet4f &m, const Packet4f &n, int mask, bool interleave = false) - { - const float *a = reinterpret_cast(&m); - const float *b = reinterpret_cast(&n); - if (!interleave) - { - Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))}; - return res; - } - else - { - Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))}; - return res; - } - } - - static void run(const MatrixType &mat, ResultType &result) - { - ActualMatrixType matrix(mat); - - Packet4f _L1 = matrix.template packet(0); - Packet4f _L2 = matrix.template packet(4); - Packet4f _L3 = matrix.template packet(8); - Packet4f _L4 = matrix.template packet(12); - - // Four 2x2 sub-matrices of the input matrix - // input = [[A, B], - // [C, D]] - Packet4f A, B, C, D; - - if (!StorageOrdersMatch) - { - A = shuffle(_L1, _L2, 0x50, true); - B = shuffle(_L3, _L4, 0x50, true); - C = shuffle(_L1, _L2, 0xFA, true); - D = shuffle(_L3, _L4, 0xFA, true); - } - else - { - A = shuffle(_L1, _L2, 0x44); - B = shuffle(_L1, _L2, 0xEE); - C = shuffle(_L3, _L4, 0x44); - D = shuffle(_L3, _L4, 0xEE); - } - - Packet4f AB, DC, temp; - - // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product. 
- AB = shuffle(A, A, 0x0F); - AB = pmul(AB, B); - - temp = shuffle(A, A, 0xA5); - temp = pmul(temp, shuffle(B, B, 0x4E)); - AB = psub(AB, temp); - - // DC = D#*C - DC = shuffle(D, D, 0x0F); - DC = pmul(DC, C); - temp = shuffle(D, D, 0xA5); - temp = pmul(temp, shuffle(C, C, 0x4E)); - DC = psub(DC, temp); - - // determinants of the sub-matrices - Packet4f dA, dB, dC, dD; - - dA = pmul(shuffle(A, A, 0x5F), A); - dA = psub(dA, shuffle(dA, dA, 0xEE)); - - dB = pmul(shuffle(B, B, 0x5F), B); - dB = psub(dB, shuffle(dB, dB, 0xEE)); - - dC = pmul(shuffle(C, C, 0x5F), C); - dC = psub(dC, shuffle(dC, dC, 0xEE)); - - dD = pmul(shuffle(D, D, 0x5F), D); - dD = psub(dD, shuffle(dD, dD, 0xEE)); - - Packet4f d, d1, d2; - Packet2f sum; - temp = shuffle(DC, DC, 0xD8); - d = pmul(temp, AB); - sum = vpadd_f32(vadd_f32(vget_low_f32(d), vget_high_f32(d)), vadd_f32(vget_low_f32(d), vget_high_f32(d))); - d = vdupq_lane_f32(sum, 0); - d1 = pmul(dA, dD); - d2 = pmul(dB, dC); - - // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C) - Packet4f det = psub(padd(d1, d2), d); - - // reciprocal of the determinant of the input matrix, rd = 1/det - Packet4f rd = pdiv(vdupq_n_f32(float32_t(1.0)), det); - - // Four sub-matrices of the inverse - Packet4f iA, iB, iC, iD; - - // iD = D*|A| - C*A#*B - temp = shuffle(C, C, 0xA0); - temp = pmul(temp, shuffle(AB, AB, 0x44)); - iD = shuffle(C, C, 0xF5); - iD = pmul(iD, shuffle(AB, AB, 0xEE)); - iD = padd(iD, temp); - iD = psub(vmulq_lane_f32(D, vget_low_f32(dA), 0), iD); - - // iA = A*|D| - B*D#*C - temp = shuffle(B, B, 0xA0); - temp = pmul(temp, shuffle(DC, DC, 0x44)); - iA = shuffle(B, B, 0xF5); - iA = pmul(iA, shuffle(DC, DC, 0xEE)); - iA = padd(iA, temp); - iA = psub(vmulq_lane_f32(A, vget_low_f32(dD), 0), iA); - - // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A - iB = pmul(D, shuffle(AB, AB, 0x33)); - iB = psub(iB, pmul(shuffle(D, D, 0xB1), shuffle(AB, AB, 0x66))); - iB = psub(vmulq_lane_f32(C, vget_low_f32(dB), 0), iB); - - // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D - iC = pmul(A, shuffle(DC, DC, 0x33)); - iC = psub(iC, pmul(shuffle(A, A, 0xB1), shuffle(DC, DC, 0x66))); - iC = psub(vmulq_lane_f32(B, vget_low_f32(dC), 0), iC); - - const Packet4f coeff = {1.0, -1.0, -1.0, 1.0}; - rd = pmul(vdupq_lane_f32(vget_low_f32(rd), 0), coeff); - iA = pmul(iA, rd); - iB = pmul(iB, rd); - iC = pmul(iC, rd); - iD = pmul(iD, rd); - - Index res_stride = result.outerStride(); - float *res = result.data(); - - pstoret(res + 0, shuffle(iA, iB, 0x77)); - pstoret(res + res_stride, shuffle(iA, iB, 0x22)); - pstoret(res + 2 * res_stride, shuffle(iC, iD, 0x77)); - pstoret(res + 3 * res_stride, shuffle(iC, iD, 0x22)); - } -}; - -#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG - -// same algorithm as above, except that each operand is split into -// halves for two registers to hold. -template -struct compute_inverse_size4 -{ - enum - { - MatrixAlignment = traits::Alignment, - ResultAlignment = traits::Alignment, - StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit) - }; - typedef typename conditional<(MatrixType::Flags & LinearAccessBit), - MatrixType const &, - typename MatrixType::PlainObject>::type - ActualMatrixType; - - // fuctionally equivalent to _mm_shuffle_pd in SSE (i.e. 
shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask)) - static Packet2d shuffle(const Packet2d &m, const Packet2d &n, int mask) - { - const double *a = reinterpret_cast(&m); - const double *b = reinterpret_cast(&n); - Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))}; - return res; - } - - static void run(const MatrixType &mat, ResultType &result) - { - ActualMatrixType matrix(mat); - - // Four 2x2 sub-matrices of the input matrix, each is further divided into upper and lower - // row e.g. A1, upper row of A, A2, lower row of A - // input = [[A, B], = [[[A1, [B1, - // [C, D]] A2], B2]], - // [[C1, [D1, - // C2], D2]]] - - Packet2d A1, A2, B1, B2, C1, C2, D1, D2; - - if (StorageOrdersMatch) - { - A1 = matrix.template packet(0); - B1 = matrix.template packet(2); - A2 = matrix.template packet(4); - B2 = matrix.template packet(6); - C1 = matrix.template packet(8); - D1 = matrix.template packet(10); - C2 = matrix.template packet(12); - D2 = matrix.template packet(14); - } - else - { - Packet2d temp; - A1 = matrix.template packet(0); - C1 = matrix.template packet(2); - A2 = matrix.template packet(4); - C2 = matrix.template packet(6); - - temp = A1; - A1 = shuffle(A1, A2, 0); - A2 = shuffle(temp, A2, 3); - - temp = C1; - C1 = shuffle(C1, C2, 0); - C2 = shuffle(temp, C2, 3); - - B1 = matrix.template packet(8); - D1 = matrix.template packet(10); - B2 = matrix.template packet(12); - D2 = matrix.template packet(14); - - temp = B1; - B1 = shuffle(B1, B2, 0); - B2 = shuffle(temp, B2, 3); - - temp = D1; - D1 = shuffle(D1, D2, 0); - D2 = shuffle(temp, D2, 3); - } - - // determinants of the sub-matrices - Packet2d dA, dB, dC, dD; - - dA = shuffle(A2, A2, 1); - dA = pmul(A1, dA); - dA = psub(dA, vdupq_laneq_f64(dA, 1)); - - dB = shuffle(B2, B2, 1); - dB = pmul(B1, dB); - dB = psub(dB, vdupq_laneq_f64(dB, 1)); - - dC = shuffle(C2, C2, 1); - dC = pmul(C1, dC); - dC = psub(dC, vdupq_laneq_f64(dC, 1)); - - dD = shuffle(D2, D2, 1); - dD = pmul(D1, dD); - dD = psub(dD, vdupq_laneq_f64(dD, 1)); - - Packet2d DC1, DC2, AB1, AB2; - - // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product. 
- AB1 = pmul(B1, vdupq_laneq_f64(A2, 1)); - AB2 = pmul(B2, vdupq_laneq_f64(A1, 0)); - AB1 = psub(AB1, pmul(B2, vdupq_laneq_f64(A1, 1))); - AB2 = psub(AB2, pmul(B1, vdupq_laneq_f64(A2, 0))); - - // DC = D#*C - DC1 = pmul(C1, vdupq_laneq_f64(D2, 1)); - DC2 = pmul(C2, vdupq_laneq_f64(D1, 0)); - DC1 = psub(DC1, pmul(C2, vdupq_laneq_f64(D1, 1))); - DC2 = psub(DC2, pmul(C1, vdupq_laneq_f64(D2, 0))); - - Packet2d d1, d2; - - // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C) - Packet2d det; - - // reciprocal of the determinant of the input matrix, rd = 1/det - Packet2d rd; - - d1 = pmul(AB1, shuffle(DC1, DC2, 0)); - d2 = pmul(AB2, shuffle(DC1, DC2, 3)); - rd = padd(d1, d2); - rd = padd(rd, vdupq_laneq_f64(rd, 1)); - - d1 = pmul(dA, dD); - d2 = pmul(dB, dC); - - det = padd(d1, d2); - det = psub(det, rd); - det = vdupq_laneq_f64(det, 0); - rd = pdiv(vdupq_n_f64(float64_t(1.0)), det); - - // rows of four sub-matrices of the inverse - Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2; - - // iD = D*|A| - C*A#*B - iD1 = pmul(AB1, vdupq_laneq_f64(C1, 0)); - iD2 = pmul(AB1, vdupq_laneq_f64(C2, 0)); - iD1 = padd(iD1, pmul(AB2, vdupq_laneq_f64(C1, 1))); - iD2 = padd(iD2, pmul(AB2, vdupq_laneq_f64(C2, 1))); - dA = vdupq_laneq_f64(dA, 0); - iD1 = psub(pmul(D1, dA), iD1); - iD2 = psub(pmul(D2, dA), iD2); - - // iA = A*|D| - B*D#*C - iA1 = pmul(DC1, vdupq_laneq_f64(B1, 0)); - iA2 = pmul(DC1, vdupq_laneq_f64(B2, 0)); - iA1 = padd(iA1, pmul(DC2, vdupq_laneq_f64(B1, 1))); - iA2 = padd(iA2, pmul(DC2, vdupq_laneq_f64(B2, 1))); - dD = vdupq_laneq_f64(dD, 0); - iA1 = psub(pmul(A1, dD), iA1); - iA2 = psub(pmul(A2, dD), iA2); - - // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A - iB1 = pmul(D1, shuffle(AB2, AB1, 1)); - iB2 = pmul(D2, shuffle(AB2, AB1, 1)); - iB1 = psub(iB1, pmul(shuffle(D1, D1, 1), shuffle(AB2, AB1, 2))); - iB2 = psub(iB2, pmul(shuffle(D2, D2, 1), shuffle(AB2, AB1, 2))); - dB = vdupq_laneq_f64(dB, 0); - iB1 = psub(pmul(C1, dB), iB1); - iB2 = psub(pmul(C2, dB), iB2); - - // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D - iC1 = pmul(A1, shuffle(DC2, DC1, 1)); - iC2 = pmul(A2, shuffle(DC2, DC1, 1)); - iC1 = psub(iC1, pmul(shuffle(A1, A1, 1), shuffle(DC2, DC1, 2))); - iC2 = psub(iC2, pmul(shuffle(A2, A2, 1), shuffle(DC2, DC1, 2))); - dC = vdupq_laneq_f64(dC, 0); - iC1 = psub(pmul(B1, dC), iC1); - iC2 = psub(pmul(B2, dC), iC2); - - const Packet2d PN = {1.0, -1.0}; - const Packet2d NP = {-1.0, 1.0}; - d1 = pmul(PN, rd); - d2 = pmul(NP, rd); - - Index res_stride = result.outerStride(); - double *res = result.data(); - pstoret(res + 0, pmul(shuffle(iA2, iA1, 3), d1)); - pstoret(res + res_stride, pmul(shuffle(iA2, iA1, 0), d2)); - pstoret(res + 2, pmul(shuffle(iB2, iB1, 3), d1)); - pstoret(res + res_stride + 2, pmul(shuffle(iB2, iB1, 0), d2)); - pstoret(res + 2 * res_stride, pmul(shuffle(iC2, iC1, 3), d1)); - pstoret(res + 3 * res_stride, pmul(shuffle(iC2, iC1, 0), d2)); - pstoret(res + 2 * res_stride + 2, pmul(shuffle(iD2, iD1, 3), d1)); - pstoret(res + 3 * res_stride + 2, pmul(shuffle(iD2, iD1, 0), d2)); - } -}; - -#endif // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG - -} // namespace internal -} // namespace Eigen -#endif diff --git a/Eigen/src/LU/arch/Inverse_SSE.h b/Eigen/src/LU/arch/Inverse_SSE.h deleted file mode 100644 index 4dce2ef20..000000000 --- a/Eigen/src/LU/arch/Inverse_SSE.h +++ /dev/null @@ -1,338 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. 
-// -// Copyright (C) 2001 Intel Corporation -// Copyright (C) 2010 Gael Guennebaud -// Copyright (C) 2009 Benoit Jacob -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -// The SSE code for the 4x4 float and double matrix inverse in this file -// comes from the following Intel's library: -// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/ -// -// Here is the respective copyright and license statement: -// -// Copyright (c) 2001 Intel Corporation. -// -// Permition is granted to use, copy, distribute and prepare derivative works -// of this library for any purpose and without fee, provided, that the above -// copyright notice and this statement appear in all copies. -// Intel makes no representations about the suitability of this software for -// any purpose, and specifically disclaims all warranties. -// See LEGAL.TXT for all the legal information. - -#ifndef EIGEN_INVERSE_SSE_H -#define EIGEN_INVERSE_SSE_H - -namespace Eigen { - -namespace internal { - -template -struct compute_inverse_size4 -{ - enum { - MatrixAlignment = traits::Alignment, - ResultAlignment = traits::Alignment, - StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit) - }; - typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType; - - static void run(const MatrixType& mat, ResultType& result) - { - ActualMatrixType matrix(mat); - const Packet4f p4f_sign_PNNP = _mm_castsi128_ps(_mm_set_epi32(0x00000000, 0x80000000, 0x80000000, 0x00000000)); - - // Load the full matrix into registers - __m128 _L1 = matrix.template packet( 0); - __m128 _L2 = matrix.template packet( 4); - __m128 _L3 = matrix.template packet( 8); - __m128 _L4 = matrix.template packet(12); - - // The inverse is calculated using "Divide and Conquer" technique. The - // original matrix is divide into four 2x2 sub-matrices. Since each - // register holds four matrix element, the smaller matrices are - // represented as a registers. Hence we get a better locality of the - // calculations. 
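The removed comment above describes the divide-and-conquer blocking; the code right after it extracts the 2x2 blocks with _mm_movelh_ps/_mm_movehl_ps. A scalar model of that extraction (illustrative code, not part of the file):

#include <cstdio>

// movelh keeps the low halves of two rows, movehl the high halves, so two
// adjacent 4-wide rows split into the left and right 2x2 blocks.
void movelh(const float a[4], const float b[4], float out[4]) {
  out[0] = a[0]; out[1] = a[1]; out[2] = b[0]; out[3] = b[1];
}
void movehl(const float a[4], const float b[4], float out[4]) {
  out[0] = b[2]; out[1] = b[3]; out[2] = a[2]; out[3] = a[3];  // _mm_movehl_ps(a, b)
}

int main() {
  const float r1[4] = {1, 2, 3, 4}, r2[4] = {5, 6, 7, 8};
  float A[4], B[4];
  movelh(r1, r2, A);   // matches _mm_movelh_ps(_L1, _L2): A = {1, 2, 5, 6}
  movehl(r2, r1, B);   // matches _mm_movehl_ps(_L2, _L1): B = {3, 4, 7, 8}
  std::printf("A: %g %g %g %g  B: %g %g %g %g\n",
              A[0], A[1], A[2], A[3], B[0], B[1], B[2], B[3]);
  return 0;
}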
- - __m128 A, B, C, D; // the four sub-matrices - if(!StorageOrdersMatch) - { - A = _mm_unpacklo_ps(_L1, _L2); - B = _mm_unpacklo_ps(_L3, _L4); - C = _mm_unpackhi_ps(_L1, _L2); - D = _mm_unpackhi_ps(_L3, _L4); - } - else - { - A = _mm_movelh_ps(_L1, _L2); - B = _mm_movehl_ps(_L2, _L1); - C = _mm_movelh_ps(_L3, _L4); - D = _mm_movehl_ps(_L4, _L3); - } - - __m128 iA, iB, iC, iD, // partial inverse of the sub-matrices - DC, AB; - __m128 dA, dB, dC, dD; // determinant of the sub-matrices - __m128 det, d, d1, d2; - __m128 rd; // reciprocal of the determinant - - // AB = A# * B - AB = _mm_mul_ps(_mm_shuffle_ps(A,A,0x0F), B); - AB = _mm_sub_ps(AB,_mm_mul_ps(_mm_shuffle_ps(A,A,0xA5), _mm_shuffle_ps(B,B,0x4E))); - // DC = D# * C - DC = _mm_mul_ps(_mm_shuffle_ps(D,D,0x0F), C); - DC = _mm_sub_ps(DC,_mm_mul_ps(_mm_shuffle_ps(D,D,0xA5), _mm_shuffle_ps(C,C,0x4E))); - - // dA = |A| - dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F),A); - dA = _mm_sub_ss(dA, _mm_movehl_ps(dA,dA)); - // dB = |B| - dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F),B); - dB = _mm_sub_ss(dB, _mm_movehl_ps(dB,dB)); - - // dC = |C| - dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F),C); - dC = _mm_sub_ss(dC, _mm_movehl_ps(dC,dC)); - // dD = |D| - dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F),D); - dD = _mm_sub_ss(dD, _mm_movehl_ps(dD,dD)); - - // d = trace(AB*DC) = trace(A#*B*D#*C) - d = _mm_mul_ps(_mm_shuffle_ps(DC,DC,0xD8),AB); - - // iD = C*A#*B - iD = _mm_mul_ps(_mm_shuffle_ps(C,C,0xA0), _mm_movelh_ps(AB,AB)); - iD = _mm_add_ps(iD,_mm_mul_ps(_mm_shuffle_ps(C,C,0xF5), _mm_movehl_ps(AB,AB))); - // iA = B*D#*C - iA = _mm_mul_ps(_mm_shuffle_ps(B,B,0xA0), _mm_movelh_ps(DC,DC)); - iA = _mm_add_ps(iA,_mm_mul_ps(_mm_shuffle_ps(B,B,0xF5), _mm_movehl_ps(DC,DC))); - - // d = trace(AB*DC) = trace(A#*B*D#*C) [continue] - d = _mm_add_ps(d, _mm_movehl_ps(d, d)); - d = _mm_add_ss(d, _mm_shuffle_ps(d, d, 1)); - d1 = _mm_mul_ss(dA,dD); - d2 = _mm_mul_ss(dB,dC); - - // iD = D*|A| - C*A#*B - iD = _mm_sub_ps(_mm_mul_ps(D,_mm_shuffle_ps(dA,dA,0)), iD); - - // iA = A*|D| - B*D#*C; - iA = _mm_sub_ps(_mm_mul_ps(A,_mm_shuffle_ps(dD,dD,0)), iA); - - // det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C) - det = _mm_sub_ss(_mm_add_ss(d1,d2),d); - rd = _mm_div_ss(_mm_set_ss(1.0f), det); - -// #ifdef ZERO_SINGULAR -// rd = _mm_and_ps(_mm_cmpneq_ss(det,_mm_setzero_ps()), rd); -// #endif - - // iB = D * (A#B)# = D*B#*A - iB = _mm_mul_ps(D, _mm_shuffle_ps(AB,AB,0x33)); - iB = _mm_sub_ps(iB, _mm_mul_ps(_mm_shuffle_ps(D,D,0xB1), _mm_shuffle_ps(AB,AB,0x66))); - // iC = A * (D#C)# = A*C#*D - iC = _mm_mul_ps(A, _mm_shuffle_ps(DC,DC,0x33)); - iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66))); - - rd = _mm_shuffle_ps(rd,rd,0); - rd = _mm_xor_ps(rd, p4f_sign_PNNP); - - // iB = C*|B| - D*B#*A - iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB); - - // iC = B*|C| - A*C#*D; - iC = _mm_sub_ps(_mm_mul_ps(B,_mm_shuffle_ps(dC,dC,0)), iC); - - // iX = iX / det - iA = _mm_mul_ps(rd,iA); - iB = _mm_mul_ps(rd,iB); - iC = _mm_mul_ps(rd,iC); - iD = _mm_mul_ps(rd,iD); - - Index res_stride = result.outerStride(); - float* res = result.data(); - pstoret(res+0, _mm_shuffle_ps(iA,iB,0x77)); - pstoret(res+res_stride, _mm_shuffle_ps(iA,iB,0x22)); - pstoret(res+2*res_stride, _mm_shuffle_ps(iC,iD,0x77)); - pstoret(res+3*res_stride, _mm_shuffle_ps(iC,iD,0x22)); - } - -}; - -template -struct compute_inverse_size4 -{ - enum { - MatrixAlignment = traits::Alignment, - ResultAlignment = traits::Alignment, - StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == 
(ResultType::Flags&RowMajorBit) - }; - typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType; - - static void run(const MatrixType& mat, ResultType& result) - { - ActualMatrixType matrix(mat); - const __m128d _Sign_NP = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); - const __m128d _Sign_PN = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - - // The inverse is calculated using "Divide and Conquer" technique. The - // original matrix is divide into four 2x2 sub-matrices. Since each - // register of the matrix holds two elements, the smaller matrices are - // consisted of two registers. Hence we get a better locality of the - // calculations. - - // the four sub-matrices - __m128d A1, A2, B1, B2, C1, C2, D1, D2; - - if(StorageOrdersMatch) - { - A1 = matrix.template packet( 0); B1 = matrix.template packet( 2); - A2 = matrix.template packet( 4); B2 = matrix.template packet( 6); - C1 = matrix.template packet( 8); D1 = matrix.template packet(10); - C2 = matrix.template packet(12); D2 = matrix.template packet(14); - } - else - { - __m128d tmp; - A1 = matrix.template packet( 0); C1 = matrix.template packet( 2); - A2 = matrix.template packet( 4); C2 = matrix.template packet( 6); - tmp = A1; - A1 = _mm_unpacklo_pd(A1,A2); - A2 = _mm_unpackhi_pd(tmp,A2); - tmp = C1; - C1 = _mm_unpacklo_pd(C1,C2); - C2 = _mm_unpackhi_pd(tmp,C2); - - B1 = matrix.template packet( 8); D1 = matrix.template packet(10); - B2 = matrix.template packet(12); D2 = matrix.template packet(14); - tmp = B1; - B1 = _mm_unpacklo_pd(B1,B2); - B2 = _mm_unpackhi_pd(tmp,B2); - tmp = D1; - D1 = _mm_unpacklo_pd(D1,D2); - D2 = _mm_unpackhi_pd(tmp,D2); - } - - __m128d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2, // partial invese of the sub-matrices - DC1, DC2, AB1, AB2; - __m128d dA, dB, dC, dD; // determinant of the sub-matrices - __m128d det, d1, d2, rd; - - // dA = |A| - dA = _mm_shuffle_pd(A2, A2, 1); - dA = _mm_mul_pd(A1, dA); - dA = _mm_sub_sd(dA, _mm_shuffle_pd(dA,dA,3)); - // dB = |B| - dB = _mm_shuffle_pd(B2, B2, 1); - dB = _mm_mul_pd(B1, dB); - dB = _mm_sub_sd(dB, _mm_shuffle_pd(dB,dB,3)); - - // AB = A# * B - AB1 = _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,3)); - AB2 = _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,0)); - AB1 = _mm_sub_pd(AB1, _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,3))); - AB2 = _mm_sub_pd(AB2, _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,0))); - - // dC = |C| - dC = _mm_shuffle_pd(C2, C2, 1); - dC = _mm_mul_pd(C1, dC); - dC = _mm_sub_sd(dC, _mm_shuffle_pd(dC,dC,3)); - // dD = |D| - dD = _mm_shuffle_pd(D2, D2, 1); - dD = _mm_mul_pd(D1, dD); - dD = _mm_sub_sd(dD, _mm_shuffle_pd(dD,dD,3)); - - // DC = D# * C - DC1 = _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,3)); - DC2 = _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,0)); - DC1 = _mm_sub_pd(DC1, _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,3))); - DC2 = _mm_sub_pd(DC2, _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,0))); - - // rd = trace(AB*DC) = trace(A#*B*D#*C) - d1 = _mm_mul_pd(AB1, _mm_shuffle_pd(DC1, DC2, 0)); - d2 = _mm_mul_pd(AB2, _mm_shuffle_pd(DC1, DC2, 3)); - rd = _mm_add_pd(d1, d2); - rd = _mm_add_sd(rd, _mm_shuffle_pd(rd, rd,3)); - - // iD = C*A#*B - iD1 = _mm_mul_pd(AB1, _mm_shuffle_pd(C1,C1,0)); - iD2 = _mm_mul_pd(AB1, _mm_shuffle_pd(C2,C2,0)); - iD1 = _mm_add_pd(iD1, _mm_mul_pd(AB2, _mm_shuffle_pd(C1,C1,3))); - iD2 = _mm_add_pd(iD2, _mm_mul_pd(AB2, _mm_shuffle_pd(C2,C2,3))); - - // iA = B*D#*C - iA1 = _mm_mul_pd(DC1, _mm_shuffle_pd(B1,B1,0)); - iA2 = _mm_mul_pd(DC1, _mm_shuffle_pd(B2,B2,0)); - iA1 = _mm_add_pd(iA1, 
_mm_mul_pd(DC2, _mm_shuffle_pd(B1,B1,3)));
-    iA2 = _mm_add_pd(iA2, _mm_mul_pd(DC2, _mm_shuffle_pd(B2,B2,3)));
-
-    // iD = D*|A| - C*A#*B
-    dA = _mm_shuffle_pd(dA,dA,0);
-    iD1 = _mm_sub_pd(_mm_mul_pd(D1, dA), iD1);
-    iD2 = _mm_sub_pd(_mm_mul_pd(D2, dA), iD2);
-
-    // iA = A*|D| - B*D#*C;
-    dD = _mm_shuffle_pd(dD,dD,0);
-    iA1 = _mm_sub_pd(_mm_mul_pd(A1, dD), iA1);
-    iA2 = _mm_sub_pd(_mm_mul_pd(A2, dD), iA2);
-
-    d1 = _mm_mul_sd(dA, dD);
-    d2 = _mm_mul_sd(dB, dC);
-
-    // iB = D * (A#B)# = D*B#*A
-    iB1 = _mm_mul_pd(D1, _mm_shuffle_pd(AB2,AB1,1));
-    iB2 = _mm_mul_pd(D2, _mm_shuffle_pd(AB2,AB1,1));
-    iB1 = _mm_sub_pd(iB1, _mm_mul_pd(_mm_shuffle_pd(D1,D1,1), _mm_shuffle_pd(AB2,AB1,2)));
-    iB2 = _mm_sub_pd(iB2, _mm_mul_pd(_mm_shuffle_pd(D2,D2,1), _mm_shuffle_pd(AB2,AB1,2)));
-
-    // det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C)
-    det = _mm_add_sd(d1, d2);
-    det = _mm_sub_sd(det, rd);
-
-    // iC = A * (D#C)# = A*C#*D
-    iC1 = _mm_mul_pd(A1, _mm_shuffle_pd(DC2,DC1,1));
-    iC2 = _mm_mul_pd(A2, _mm_shuffle_pd(DC2,DC1,1));
-    iC1 = _mm_sub_pd(iC1, _mm_mul_pd(_mm_shuffle_pd(A1,A1,1), _mm_shuffle_pd(DC2,DC1,2)));
-    iC2 = _mm_sub_pd(iC2, _mm_mul_pd(_mm_shuffle_pd(A2,A2,1), _mm_shuffle_pd(DC2,DC1,2)));
-
-    rd = _mm_div_sd(_mm_set_sd(1.0), det);
-//  #ifdef ZERO_SINGULAR
-//  rd = _mm_and_pd(_mm_cmpneq_sd(det,_mm_setzero_pd()), rd);
-//  #endif
-    rd = _mm_shuffle_pd(rd,rd,0);
-
-    // iB = C*|B| - D*B#*A
-    dB = _mm_shuffle_pd(dB,dB,0);
-    iB1 = _mm_sub_pd(_mm_mul_pd(C1, dB), iB1);
-    iB2 = _mm_sub_pd(_mm_mul_pd(C2, dB), iB2);
-
-    d1 = _mm_xor_pd(rd, _Sign_PN);
-    d2 = _mm_xor_pd(rd, _Sign_NP);
-
-    // iC = B*|C| - A*C#*D;
-    dC = _mm_shuffle_pd(dC,dC,0);
-    iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1);
-    iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2);
-
-    Index res_stride = result.outerStride();
-    double* res = result.data();
-    pstoret<double, Packet2d, ResultAlignment>(res+0,              _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1));
-    pstoret<double, Packet2d, ResultAlignment>(res+res_stride,     _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2));
-    pstoret<double, Packet2d, ResultAlignment>(res+2,              _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1));
-    pstoret<double, Packet2d, ResultAlignment>(res+res_stride+2,   _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2));
-    pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride,   _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1));
-    pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride,   _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2));
-    pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride+2, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1));
-    pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride+2, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2));
-  }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_INVERSE_SSE_H