From d02eccf5840b356e741ea0e3166db2c7623b88c4 Mon Sep 17 00:00:00 2001
From: Benoit Jacob
Date: Mon, 14 Dec 2009 22:47:14 -0500
Subject: add SSE path for Matrix4f inverse, taken from Intel except that we do
 a kosher division instead of RCPPS-followed-by-Newton-Raphson. The rationale
 for that is that elsewhere in Eigen we don't allow ourselves this
 approximation (which throws away 2 bits of mantissa), so there's no reason we
 should allow it here.

---
 COPYING.LGPL                     |   2 +-
 Eigen/LU                         |   1 +
 Eigen/src/LU/Inverse.h           |  18 +++--
 Eigen/src/LU/arch/CMakeLists.txt |   6 ++
 Eigen/src/LU/arch/Inverse_SSE.h  | 152 +++++++++++++++++++++++++++++++++++++++
 test/prec_inverse_4x4.cpp        |   7 +-
 6 files changed, 175 insertions(+), 11 deletions(-)
 create mode 100644 Eigen/src/LU/arch/CMakeLists.txt
 create mode 100644 Eigen/src/LU/arch/Inverse_SSE.h

diff --git a/COPYING.LGPL b/COPYING.LGPL
index fc8a5de7e..0e4fa8aaf 100644
--- a/COPYING.LGPL
+++ b/COPYING.LGPL
@@ -1,4 +1,4 @@
-                  GNU LESSER GENERAL PUBLIC LICENSE
+                   GNU LESSER GENERAL PUBLIC LICENSE
                        Version 3, 29 June 2007
 
  Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
diff --git a/Eigen/LU b/Eigen/LU
index c17b9852e..ce11279b6 100644
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -25,6 +25,7 @@ namespace Eigen {
 #include "src/LU/PartialPivLU.h"
 #include "src/LU/Determinant.h"
 #include "src/LU/Inverse.h"
+#include "src/LU/arch/Inverse_SSE.h"
 
 } // namespace Eigen
 
diff --git a/Eigen/src/LU/Inverse.h b/Eigen/src/LU/Inverse.h
index 8afbfda96..91d02164c 100644
--- a/Eigen/src/LU/Inverse.h
+++ b/Eigen/src/LU/Inverse.h
@@ -182,10 +182,10 @@ struct ei_compute_inverse_and_det_with_check<MatrixType, ResultType, 3>
 *** Size 4 implementation ***
 ****************************/
 
-template<typename MatrixType, typename ResultType>
-struct ei_compute_inverse<MatrixType, ResultType, 4>
+template<int Arch, typename Scalar, typename MatrixType, typename ResultType>
+struct ei_compute_inverse_size4
 {
-  static inline void run(const MatrixType& matrix, ResultType& result)
+  static void run(const MatrixType& matrix, ResultType& result)
   {
     result.coeffRef(0,0) = matrix.minor(0,0).determinant();
     result.coeffRef(1,0) = -matrix.minor(0,1).determinant();
@@ -207,6 +207,13 @@ struct ei_compute_inverse<MatrixType, ResultType, 4>
   }
 };
 
+template<typename MatrixType, typename ResultType>
+struct ei_compute_inverse<MatrixType, ResultType, 4>
+  : ei_compute_inverse_size4<Architecture::Target, typename MatrixType::Scalar,
+                             MatrixType, ResultType>
+{
+};
+
 template<typename MatrixType, typename ResultType>
 struct ei_compute_inverse_and_det_with_check<MatrixType, ResultType, 4>
 {
@@ -238,7 +245,8 @@ template<typename MatrixType>
 struct ei_inverse_impl : public ReturnByValue<ei_inverse_impl<MatrixType> >
 {
   // for 2x2, it's worth giving a chance to avoid evaluating.
-  // for larger sizes, evaluating has negligible cost and limits code size.
+  // for larger sizes, evaluating has negligible cost, limits code size,
+  // and allows for vectorized paths.
   typedef typename ei_meta_if<
     MatrixType::RowsAtCompileTime == 2,
     typename ei_nested<MatrixType,2>::type,
@@ -264,7 +272,7 @@ struct ei_inverse_impl : public ReturnByValue<ei_inverse_impl<MatrixType> >
  *
  * \returns the matrix inverse of this matrix.
  *
- * For small fixed sizes up to 4x4, this method uses ad-hoc methods (cofactors up to 3x3, Euler's trick for 4x4).
+ * For small fixed sizes up to 4x4, this method uses cofactors.
  * In the general case, this method uses class PartialPivLU.
  *
  * \note This matrix must be invertible, otherwise the result is undefined. If you need an
diff --git a/Eigen/src/LU/arch/CMakeLists.txt b/Eigen/src/LU/arch/CMakeLists.txt
new file mode 100644
index 000000000..f6b7ed9ec
--- /dev/null
+++ b/Eigen/src/LU/arch/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_LU_arch_SRCS "*.h")
+
+INSTALL(FILES
+  ${Eigen_LU_arch_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/LU/arch COMPONENT Devel
+  )
diff --git a/Eigen/src/LU/arch/Inverse_SSE.h b/Eigen/src/LU/arch/Inverse_SSE.h
new file mode 100644
index 000000000..371861aa5
--- /dev/null
+++ b/Eigen/src/LU/arch/Inverse_SSE.h
@@ -0,0 +1,152 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 1999 Intel Corporation
+// Copyright (C) 2009 Benoit Jacob
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+// The SSE code for the 4x4 float matrix inverse in this file comes from the file
+//   ftp://download.intel.com/design/PentiumIII/sml/24504301.pdf
+// See page ii of that document for legal stuff. Not being lawyers, we just assume
+// here that if Intel makes this document publicly available, with source code
+// and detailed explanations, it's because they want their CPUs to be fed with
+// good code, and therefore they presumably don't mind us using it in Eigen.
+
+#ifndef EIGEN_INVERSE_SSE_H
+#define EIGEN_INVERSE_SSE_H
+
+template<typename MatrixType, typename ResultType>
+struct ei_compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
+{
+  static void run(const MatrixType& matrix, ResultType& result)
+  {
+    // Variables (Streaming SIMD Extensions registers) which will contain cofactors and, later, the
+    // lines of the inverted matrix.
+    __m128 minor0, minor1, minor2, minor3;
+
+    // Variables which will contain the lines of the reference matrix and, later (after the transposition),
+    // the columns of the original matrix.
+    __m128 row0, row1, row2, row3;
+
+    // Temporary variables and the variable that will contain the matrix determinant.
+    __m128 det, tmp1;
+
+    // Matrix transposition
+    const float *src = matrix.data();
+    tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
+    row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
+    row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
+    row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
+    tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
+    row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
+    row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
+    row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
+
+
+    // Cofactors calculation. Because in the process of cofactor computation some pairs in three-
+    // element products are repeated, it is not reasonable to load these pairs anew every time. The
+    // values in the registers with these pairs are formed using the shuffle instruction. Cofactors are
+    // calculated row by row (4 elements are placed in 1 SP FP SIMD floating point register).
+
+    tmp1 = _mm_mul_ps(row2, row3);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor0 = _mm_mul_ps(row1, tmp1);
+    minor1 = _mm_mul_ps(row0, tmp1);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
+    minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
+    minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
+    // -----------------------------------------------
+    tmp1 = _mm_mul_ps(row1, row2);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
+    minor3 = _mm_mul_ps(row0, tmp1);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
+    minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
+    minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
+    // -----------------------------------------------
+    tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    row2 = _mm_shuffle_ps(row2, row2, 0x4E);
+    minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
+    minor2 = _mm_mul_ps(row0, tmp1);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
+    minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
+    minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
+    // -----------------------------------------------
+    tmp1 = _mm_mul_ps(row0, row1);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
+    minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
+    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
+    // -----------------------------------------------
+    tmp1 = _mm_mul_ps(row0, row3);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
+    minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
+    minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
+    // -----------------------------------------------
+    tmp1 = _mm_mul_ps(row0, row2);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
+    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
+    minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
+
+    // Evaluation of determinant and its reciprocal value. In the original Intel document,
+    // 1/det was evaluated using a fast rcpps command with subsequent approximation using
+    // the Newton-Raphson algorithm. Here, we go for an IEEE-compliant division instead,
+    // so as not to compromise precision at all.
+    det = _mm_mul_ps(row0, minor0);
+    det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
+    det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
+    // tmp1= _mm_rcp_ss(det);
+    // det= _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
+    det = _mm_div_ps(ei_pset1(1.0f), det); // <--- yay, one original line not copied from Intel
+    det = _mm_shuffle_ps(det, det, 0x00);
+    // warning, Intel's variable naming is very confusing: now 'det' is 1/det !
+
+    // Multiplication of cofactors by 1/det. Storing the inverse matrix to the address held in pointer dst.
+    minor0 = _mm_mul_ps(det, minor0);
+    float *dst = result.data();
+    _mm_storel_pi((__m64*)(dst), minor0);
+    _mm_storeh_pi((__m64*)(dst+2), minor0);
+    minor1 = _mm_mul_ps(det, minor1);
+    _mm_storel_pi((__m64*)(dst+4), minor1);
+    _mm_storeh_pi((__m64*)(dst+6), minor1);
+    minor2 = _mm_mul_ps(det, minor2);
+    _mm_storel_pi((__m64*)(dst+ 8), minor2);
+    _mm_storeh_pi((__m64*)(dst+10), minor2);
+    minor3 = _mm_mul_ps(det, minor3);
+    _mm_storel_pi((__m64*)(dst+12), minor3);
+    _mm_storeh_pi((__m64*)(dst+14), minor3);
+  }
+};
+
+#endif // EIGEN_INVERSE_SSE_H
+
\ No newline at end of file
diff --git a/test/prec_inverse_4x4.cpp b/test/prec_inverse_4x4.cpp
index 83b1a8a71..8b7dbd8e7 100644
--- a/test/prec_inverse_4x4.cpp
+++ b/test/prec_inverse_4x4.cpp
@@ -37,12 +37,9 @@ template<typename MatrixType> void inverse_permutation_4x4()
     MatrixType m = PermutationMatrix<4>(indices);
     MatrixType inv = m.inverse();
     double error = double( (m*inv-MatrixType::Identity()).norm() / epsilon<Scalar>() );
-    error_max = std::max(error_max, error);
+    VERIFY(error == 0.0);
     std::next_permutation(indices.data(),indices.data()+4);
   }
-  std::cerr << "inverse_permutation_4x4, Scalar = " << type_name<Scalar>() << std::endl;
-  EIGEN_DEBUG_VAR(error_max);
-  VERIFY(error_max < 1. );
 }
 
 template<typename MatrixType> void inverse_general_4x4(int repeat)
@@ -68,7 +65,7 @@ template<typename MatrixType> void inverse_general_4x4(int repeat)
   EIGEN_DEBUG_VAR(error_avg);
   EIGEN_DEBUG_VAR(error_max);
   VERIFY(error_avg < (NumTraits<Scalar>::IsComplex ? 8.0 : 1.0));
-  VERIFY(error_max < (NumTraits<Scalar>::IsComplex ? 64.0 : 16.0));
+  VERIFY(error_max < (NumTraits<Scalar>::IsComplex ? 64.0 : 20.0));
 }
 
 void test_prec_inverse_4x4()
-- 
cgit v1.2.3
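
[Editorial aside, not part of the patch] The trade-off the commit message describes, an exact division versus RCPPS followed by one Newton-Raphson refinement step, can be tried in isolation with the small standalone sketch below. The input value d is an arbitrary example standing in for the determinant; the snippet only assumes the SSE1 intrinsics from <xmmintrin.h> and can be built with, for instance, g++ -msse on x86.

// Standalone illustration: compare Intel's rcpss + one Newton-Raphson step
// against the exact IEEE division that the patch uses instead.
#include <cstdio>
#include <xmmintrin.h>

int main()
{
  const float d = 3.0f;                 // arbitrary stand-in for the determinant
  __m128 det = _mm_set_ss(d);

  // Intel's original approach: ~12-bit reciprocal estimate, refined by one
  // Newton-Raphson iteration: x1 = 2*x0 - d*x0*x0.
  __m128 x0     = _mm_rcp_ss(det);
  __m128 approx = _mm_sub_ss(_mm_add_ss(x0, x0),
                             _mm_mul_ss(det, _mm_mul_ss(x0, x0)));

  // The approach taken in the patch: a full-precision division.
  __m128 exact  = _mm_div_ss(_mm_set_ss(1.0f), det);

  std::printf("approx 1/d = %.9g\n", _mm_cvtss_f32(approx));
  std::printf("exact  1/d = %.9g\n", _mm_cvtss_f32(exact));
  return 0;
}

Printing both values makes visible the couple of low-order mantissa bits that the estimate-plus-refinement path can lose, which is exactly the approximation the commit message declines to make.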