diff options
author | Gael Guennebaud <g.gael@free.fr> | 2010-08-24 13:00:59 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2010-08-24 13:00:59 +0200 |
commit | a47bbf664cfde3447e1232cd3f947920eff936ab (patch) | |
tree | 1a3d222205ab43cb31aabcccc9a2063bf6bcc4df /Eigen/src/LU/arch | |
parent | 548ecc2fe5310fcf1e363a440a83b6bf4c18b0e2 (diff) |
fix 4x4 SSE inversion when storage orders don't match
Diffstat (limited to 'Eigen/src/LU/arch')
-rw-r--r-- | Eigen/src/LU/arch/Inverse_SSE.h | 62 |
1 files changed, 51 insertions, 11 deletions
diff --git a/Eigen/src/LU/arch/Inverse_SSE.h b/Eigen/src/LU/arch/Inverse_SSE.h index 7421b7012..6d497d326 100644 --- a/Eigen/src/LU/arch/Inverse_SSE.h +++ b/Eigen/src/LU/arch/Inverse_SSE.h @@ -46,8 +46,9 @@ template<typename MatrixType, typename ResultType> struct ei_compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType> { enum { - MatrixAlignment = bool(MatrixType::Flags&AlignedBit), - ResultAlignment = bool(ResultType::Flags&AlignedBit) + MatrixAlignment = bool(MatrixType::Flags&AlignedBit), + ResultAlignment = bool(ResultType::Flags&AlignedBit), + StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit) }; static void run(const MatrixType& matrix, ResultType& result) @@ -66,10 +67,21 @@ struct ei_compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType // represented as a registers. Hence we get a better locality of the // calculations. - __m128 A = _mm_movelh_ps(_L1, _L2), // the four sub-matrices - B = _mm_movehl_ps(_L2, _L1), - C = _mm_movelh_ps(_L3, _L4), - D = _mm_movehl_ps(_L4, _L3); + __m128 A, B, C, D; // the four sub-matrices + if(!StorageOrdersMatch) + { + A = _mm_unpacklo_ps(_L1, _L2); + B = _mm_unpacklo_ps(_L3, _L4); + C = _mm_unpackhi_ps(_L1, _L2); + D = _mm_unpackhi_ps(_L3, _L4); + } + else + { + A = _mm_movelh_ps(_L1, _L2); + B = _mm_movehl_ps(_L2, _L1); + C = _mm_movelh_ps(_L3, _L4); + D = _mm_movehl_ps(_L4, _L3); + } __m128 iA, iB, iC, iD, // partial inverse of the sub-matrices DC, AB; @@ -163,7 +175,8 @@ struct ei_compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultTyp { enum { MatrixAlignment = bool(MatrixType::Flags&AlignedBit), - ResultAlignment = bool(ResultType::Flags&AlignedBit) + ResultAlignment = bool(ResultType::Flags&AlignedBit), + StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit) }; static void run(const MatrixType& matrix, ResultType& result) { @@ -177,10 +190,37 @@ struct ei_compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultTyp // calculations. // the four sub-matrices - __m128d A1(matrix.template packet<MatrixAlignment>( 0)), B1(matrix.template packet<MatrixAlignment>( 2)), - A2(matrix.template packet<MatrixAlignment>( 4)), B2(matrix.template packet<MatrixAlignment>( 6)), - C1(matrix.template packet<MatrixAlignment>( 8)), D1(matrix.template packet<MatrixAlignment>(10)), - C2(matrix.template packet<MatrixAlignment>(12)), D2(matrix.template packet<MatrixAlignment>(14)); + __m128d A1, A2, B1, B2, C1, C2, D1, D2; + + if(StorageOrdersMatch) + { + A1 = matrix.template packet<MatrixAlignment>( 0); B1 = matrix.template packet<MatrixAlignment>( 2); + A2 = matrix.template packet<MatrixAlignment>( 4); B2 = matrix.template packet<MatrixAlignment>( 6); + C1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10); + C2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14); + } + else + { + __m128d tmp; + A1 = matrix.template packet<MatrixAlignment>( 0); C1 = matrix.template packet<MatrixAlignment>( 2); + A2 = matrix.template packet<MatrixAlignment>( 4); C2 = matrix.template packet<MatrixAlignment>( 6); + tmp = A1; + A1 = _mm_unpacklo_pd(A1,A2); + A2 = _mm_unpackhi_pd(tmp,A2); + tmp = C1; + C1 = _mm_unpacklo_pd(C1,C2); + C2 = _mm_unpackhi_pd(tmp,C2); + + B1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10); + B2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14); + tmp = B1; + B1 = _mm_unpacklo_pd(B1,B2); + B2 = _mm_unpackhi_pd(tmp,B2); + tmp = D1; + D1 = _mm_unpacklo_pd(D1,D2); + D2 = _mm_unpackhi_pd(tmp,D2); + } + __m128d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2, // partial invese of the sub-matrices DC1, DC2, AB1, AB2; __m128d dA, dB, dC, dD; // determinant of the sub-matrices |