From d02eccf5840b356e741ea0e3166db2c7623b88c4 Mon Sep 17 00:00:00 2001
From: Benoit Jacob
Date: Mon, 14 Dec 2009 22:47:14 -0500
Subject: add SSE path for Matrix4f inverse, taken from Intel except that we do
 a kosher division instead of RCPPS-followed-by-Newton-Raphson. The rationale
 for that is that elsewhere in Eigen we don't allow ourselves this
 approximation (which throws away 2 bits of mantissa), so there's no reason we
 should allow it here.

---
 COPYING.LGPL                     |   2 +-
 Eigen/LU                         |   1 +
 Eigen/src/LU/Inverse.h           |  18 +++--
 Eigen/src/LU/arch/CMakeLists.txt |   6 ++
 Eigen/src/LU/arch/Inverse_SSE.h  | 152 +++++++++++++++++++++++++++++++++++++++
 test/prec_inverse_4x4.cpp        |   7 +-
 6 files changed, 175 insertions(+), 11 deletions(-)
 create mode 100644 Eigen/src/LU/arch/CMakeLists.txt
 create mode 100644 Eigen/src/LU/arch/Inverse_SSE.h

diff --git a/COPYING.LGPL b/COPYING.LGPL
index fc8a5de7e..0e4fa8aaf 100644
--- a/COPYING.LGPL
+++ b/COPYING.LGPL
@@ -1,4 +1,4 @@
-                  GNU LESSER GENERAL PUBLIC LICENSE
+                   GNU LESSER GENERAL PUBLIC LICENSE
                        Version 3, 29 June 2007
 
  Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
diff --git a/Eigen/LU b/Eigen/LU
index c17b9852e..ce11279b6 100644
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -25,6 +25,7 @@ namespace Eigen {
 #include "src/LU/PartialPivLU.h"
 #include "src/LU/Determinant.h"
 #include "src/LU/Inverse.h"
+#include "src/LU/arch/Inverse_SSE.h"
 
 } // namespace Eigen
 
diff --git a/Eigen/src/LU/Inverse.h b/Eigen/src/LU/Inverse.h
index 8afbfda96..91d02164c 100644
--- a/Eigen/src/LU/Inverse.h
+++ b/Eigen/src/LU/Inverse.h
@@ -182,10 +182,10 @@ struct ei_compute_inverse_and_det_with_check<MatrixType, ResultType, 3>
 *** Size 4 implementation ***
 ****************************/
 
-template<typename MatrixType, typename ResultType>
-struct ei_compute_inverse<MatrixType, ResultType, 4>
+template<int Arch, typename Scalar, typename MatrixType, typename ResultType>
+struct ei_compute_inverse_size4
 {
-  static inline void run(const MatrixType& matrix, ResultType& result)
+  static void run(const MatrixType& matrix, ResultType& result)
   {
     result.coeffRef(0,0) = matrix.minor(0,0).determinant();
     result.coeffRef(1,0) = -matrix.minor(0,1).determinant();
@@ -207,6 +207,13 @@ struct ei_compute_inverse<MatrixType, ResultType, 4>
   }
 };
 
+template<typename MatrixType, typename ResultType>
+struct ei_compute_inverse<MatrixType, ResultType, 4>
+  : ei_compute_inverse_size4<Architecture::Target, typename MatrixType::Scalar,
+                             MatrixType, ResultType>
+{
+};
+
 template<typename MatrixType, typename ResultType>
 struct ei_compute_inverse_and_det_with_check<MatrixType, ResultType, 4>
 {
@@ -238,7 +245,8 @@ template<typename MatrixType>
 struct ei_inverse_impl : public ReturnByValue<ei_inverse_impl<MatrixType> >
 {
   // for 2x2, it's worth giving a chance to avoid evaluating.
-  // for larger sizes, evaluating has negligible cost and limits code size.
+  // for larger sizes, evaluating has negligible cost, limits code size,
+  // and allows for vectorized paths.
   typedef typename ei_meta_if<
     MatrixType::RowsAtCompileTime == 2,
     typename ei_nested<MatrixType,2>::type,
@@ -264,7 +272,7 @@ struct ei_inverse_impl : public ReturnByValue<ei_inverse_impl<MatrixType> >
  *
  * \returns the matrix inverse of this matrix.
  *
- * For small fixed sizes up to 4x4, this method uses ad-hoc methods (cofactors up to 3x3, Euler's trick for 4x4).
+ * For small fixed sizes up to 4x4, this method uses cofactors.
  * In the general case, this method uses class PartialPivLU.
  *
  * \note This matrix must be invertible, otherwise the result is undefined. If you need an
diff --git a/Eigen/src/LU/arch/CMakeLists.txt b/Eigen/src/LU/arch/CMakeLists.txt
new file mode 100644
index 000000000..f6b7ed9ec
--- /dev/null
+++ b/Eigen/src/LU/arch/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_LU_arch_SRCS "*.h")
+
+INSTALL(FILES
+  ${Eigen_LU_arch_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/LU/arch COMPONENT Devel
+  )
diff --git a/Eigen/src/LU/arch/Inverse_SSE.h b/Eigen/src/LU/arch/Inverse_SSE.h
new file mode 100644
index 000000000..371861aa5
--- /dev/null
+++ b/Eigen/src/LU/arch/Inverse_SSE.h
@@ -0,0 +1,152 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 1999 Intel Corporation
+// Copyright (C) 2009 Benoit Jacob
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+// The SSE code for the 4x4 float matrix inverse in this file comes from the file
+//   ftp://download.intel.com/design/PentiumIII/sml/24504301.pdf
+// See page ii of that document for legal stuff. Not being lawyers, we just assume
+// here that if Intel makes this document publicly available, with source code
+// and detailed explanations, it's because they want their CPUs to be fed with
+// good code, and therefore they presumably don't mind us using it in Eigen.
+
+#ifndef EIGEN_INVERSE_SSE_H
+#define EIGEN_INVERSE_SSE_H
+
+template<typename MatrixType, typename ResultType>
+struct ei_compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
+{
+  static void run(const MatrixType& matrix, ResultType& result)
+  {
+    // Variables (Streaming SIMD Extensions registers) which will contain cofactors and, later, the
+    // lines of the inverted matrix.
+    __m128 minor0, minor1, minor2, minor3;
+
+    // Variables which will contain the lines of the reference matrix and, later (after the transposition),
+    // the columns of the original matrix.
+    __m128 row0, row1, row2, row3;
+
+    // Temporary variables and the variable that will contain the matrix determinant.
+    __m128 det, tmp1;
+
+    // Matrix transposition
+    const float *src = matrix.data();
+    tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
+    row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
+    row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
+    row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
+    tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
+    row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
+    row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
+    row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
+
+
+    // Cofactors calculation. Because in the process of cofactor computation some pairs in three-
+    // element products are repeated, it is not reasonable to load these pairs anew every time. The
+    // values in the registers with these pairs are formed using the shuffle instruction. Cofactors are
+    // calculated row by row (4 elements are placed in 1 SP FP SIMD floating point register).
+
+    tmp1 = _mm_mul_ps(row2, row3);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor0 = _mm_mul_ps(row1, tmp1);
+    minor1 = _mm_mul_ps(row0, tmp1);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
+    minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
+    minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
+    // -----------------------------------------------
+    tmp1 = _mm_mul_ps(row1, row2);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
+    minor3 = _mm_mul_ps(row0, tmp1);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
+    minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
+    minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
+    // -----------------------------------------------
+    tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    row2 = _mm_shuffle_ps(row2, row2, 0x4E);
+    minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
+    minor2 = _mm_mul_ps(row0, tmp1);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
+    minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
+    minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
+    // -----------------------------------------------
+    tmp1 = _mm_mul_ps(row0, row1);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
+    minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
+    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
+    // -----------------------------------------------
+    tmp1 = _mm_mul_ps(row0, row3);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
+    minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
+    minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
+    // -----------------------------------------------
+    tmp1 = _mm_mul_ps(row0, row2);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
+    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
+    minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
+
+    // Evaluation of determinant and its reciprocal value. In the original Intel document,
+    // 1/det was evaluated using a fast rcpps command with subsequent approximation using
+    // the Newton-Raphson algorithm. Here, we go for an IEEE-compliant division instead,
+    // so as not to compromise precision at all.
+    det = _mm_mul_ps(row0, minor0);
+    det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
+    det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
+    // tmp1= _mm_rcp_ss(det);
+    // det= _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
+    det = _mm_div_ps(ei_pset1(1.0f), det); // <--- yay, one original line not copied from Intel
+    det = _mm_shuffle_ps(det, det, 0x00);
+    // warning, Intel's variable naming is very confusing: now 'det' is 1/det !
+
+    // Multiplication of cofactors by 1/det. Storing the inverse matrix to the address held in pointer dst.
+    minor0 = _mm_mul_ps(det, minor0);
+    float *dst = result.data();
+    _mm_storel_pi((__m64*)(dst), minor0);
+    _mm_storeh_pi((__m64*)(dst+2), minor0);
+    minor1 = _mm_mul_ps(det, minor1);
+    _mm_storel_pi((__m64*)(dst+4), minor1);
+    _mm_storeh_pi((__m64*)(dst+6), minor1);
+    minor2 = _mm_mul_ps(det, minor2);
+    _mm_storel_pi((__m64*)(dst+ 8), minor2);
+    _mm_storeh_pi((__m64*)(dst+10), minor2);
+    minor3 = _mm_mul_ps(det, minor3);
+    _mm_storel_pi((__m64*)(dst+12), minor3);
+    _mm_storeh_pi((__m64*)(dst+14), minor3);
+  }
+};
+
+#endif // EIGEN_INVERSE_SSE_H
+
\ No newline at end of file
diff --git a/test/prec_inverse_4x4.cpp b/test/prec_inverse_4x4.cpp
index 83b1a8a71..8b7dbd8e7 100644
--- a/test/prec_inverse_4x4.cpp
+++ b/test/prec_inverse_4x4.cpp
@@ -37,12 +37,9 @@ template<typename MatrixType> void inverse_permutation_4x4()
     MatrixType m = PermutationMatrix<4>(indices);
     MatrixType inv = m.inverse();
     double error = double( (m*inv-MatrixType::Identity()).norm() / epsilon<Scalar>() );
-    error_max = std::max(error_max, error);
+    VERIFY(error == 0.0);
     std::next_permutation(indices.data(),indices.data()+4);
   }
-  std::cerr << "inverse_permutation_4x4, Scalar = " << type_name<Scalar>() << std::endl;
-  EIGEN_DEBUG_VAR(error_max);
-  VERIFY(error_max < 1. );
 }
 
 template<typename MatrixType> void inverse_general_4x4(int repeat)
@@ -68,7 +65,7 @@ template<typename MatrixType> void inverse_general_4x4(int repeat)
   EIGEN_DEBUG_VAR(error_avg);
   EIGEN_DEBUG_VAR(error_max);
   VERIFY(error_avg < (NumTraits<Scalar>::IsComplex ? 8.0 : 1.0));
-  VERIFY(error_max < (NumTraits<Scalar>::IsComplex ? 64.0 : 16.0));
+  VERIFY(error_max < (NumTraits<Scalar>::IsComplex ? 64.0 : 20.0));
 }
 
 void test_prec_inverse_4x4()
-- 
cgit v1.2.3
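
[Editorial aside, not part of the patch] The trade-off the commit message describes, an exact division versus RCPPS followed by one Newton-Raphson refinement step, can be tried in isolation with the small standalone sketch below. The input value d is an arbitrary example standing in for the determinant; the snippet only assumes the SSE1 intrinsics from <xmmintrin.h> and can be built with, for instance, g++ -msse on x86.

// Standalone illustration: compare Intel's rcpss + one Newton-Raphson step
// against the exact IEEE division that the patch uses instead.
#include <cstdio>
#include <xmmintrin.h>

int main()
{
  const float d = 3.0f;                 // arbitrary stand-in for the determinant
  __m128 det = _mm_set_ss(d);

  // Intel's original approach: ~12-bit reciprocal estimate, refined by one
  // Newton-Raphson iteration: x1 = 2*x0 - d*x0*x0.
  __m128 x0     = _mm_rcp_ss(det);
  __m128 approx = _mm_sub_ss(_mm_add_ss(x0, x0),
                             _mm_mul_ss(det, _mm_mul_ss(x0, x0)));

  // The approach taken in the patch: a full-precision division.
  __m128 exact  = _mm_div_ss(_mm_set_ss(1.0f), det);

  std::printf("approx 1/d = %.9g\n", _mm_cvtss_f32(approx));
  std::printf("exact  1/d = %.9g\n", _mm_cvtss_f32(exact));
  return 0;
}

Printing both values makes visible the couple of low-order mantissa bits that the estimate-plus-refinement path can lose, which is exactly the approximation the commit message declines to make.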