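Fix 4x4 inverse with non-linear destination (i.e. a destination that cannot be addressed by a single linear index, such as a 4x4 block of a larger matrix whose outer stride is not 4)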

author: Gael Guennebaud <g.gael@free.fr> 2016-08-30 23:16:38 +0200
committer: Gael Guennebaud <g.gael@free.fr> 2016-08-30 23:16:38 +0200
commit: 8c48d425309821c17e2543c1e0106f308731f8be (patch)
tree: 2a48ae289d9cb287abf02020b09e2a79a416f196
parent: e7fbbc2748dac4addbc429e053ead795968d86c6 (diff)
2 files changed, 31 insertions, 12 deletions
diff --git a/Eigen/src/LU/arch/Inverse_SSE.h b/Eigen/src/LU/arch/Inverse_SSE.h
index e1470c664..ebb64a62b 100644
--- a/Eigen/src/LU/arch/Inverse_SSE.h
+++ b/Eigen/src/LU/arch/Inverse_SSE.h
@@ -153,10 +153,12 @@ struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
     iC = _mm_mul_ps(rd,iC);
     iD = _mm_mul_ps(rd,iD);
 
-    result.template writePacket<ResultAlignment>( 0, _mm_shuffle_ps(iA,iB,0x77));
-    result.template writePacket<ResultAlignment>( 4, _mm_shuffle_ps(iA,iB,0x22));
-    result.template writePacket<ResultAlignment>( 8, _mm_shuffle_ps(iC,iD,0x77));
-    result.template writePacket<ResultAlignment>(12, _mm_shuffle_ps(iC,iD,0x22));
+    Index res_stride = result.outerStride();
+    float* res = result.data();
+    pstoret<float, Packet4f, ResultAlignment>(res+0,            _mm_shuffle_ps(iA,iB,0x77));
+    pstoret<float, Packet4f, ResultAlignment>(res+res_stride,   _mm_shuffle_ps(iA,iB,0x22));
+    pstoret<float, Packet4f, ResultAlignment>(res+2*res_stride, _mm_shuffle_ps(iC,iD,0x77));
+    pstoret<float, Packet4f, ResultAlignment>(res+3*res_stride, _mm_shuffle_ps(iC,iD,0x22));
   }
 
 };
@@ -316,14 +318,16 @@ struct compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultType>
     iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1);
     iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2);
 
-    result.template writePacket<ResultAlignment>( 0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1));     // iA# / det
-    result.template writePacket<ResultAlignment>( 4, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2));
-    result.template writePacket<ResultAlignment>( 2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1));     // iB# / det
-    result.template writePacket<ResultAlignment>( 6, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2));
-    result.template writePacket<ResultAlignment>( 8, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1));     // iC# / det
-    result.template writePacket<ResultAlignment>(12, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2));
-    result.template writePacket<ResultAlignment>(10, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1));     // iD# / det
-    result.template writePacket<ResultAlignment>(14, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2));
+    Index res_stride = result.outerStride();
+    double* res = result.data();
+    pstoret<double, Packet2d, ResultAlignment>(res+0,             _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res+res_stride,    _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res+2,             _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res+res_stride+2,  _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride,  _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride,  _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2));
   }
 };
 
diff --git a/test/prec_inverse_4x4.cpp b/test/prec_inverse_4x4.cpp
index c4ef2d4bd..eb6ad18c9 100644
--- a/test/prec_inverse_4x4.cpp
+++ b/test/prec_inverse_4x4.cpp
@@ -53,14 +53,29 @@ template<typename MatrixType> void inverse_general_4x4(int repeat)
    // FIXME that 1.25 used to be 1.2 until we tested gcc 4.1 on 30 June 2010 and got 1.21.
   VERIFY(error_avg < (NumTraits<Scalar>::IsComplex ? 8.0 : 1.25));
   VERIFY(error_max < (NumTraits<Scalar>::IsComplex ? 64.0 : 20.0));
+
+  {
+    int s = 5;//internal::random<int>(4,10);
+    int i = 0;//internal::random<int>(0,s-4);
+    int j = 0;//internal::random<int>(0,s-4);
+    Matrix<Scalar,5,5> mat(s,s);
+    mat.setRandom();
+    MatrixType submat = mat.template block<4,4>(i,j);
+    MatrixType mat_inv = mat.template block<4,4>(i,j).inverse();
+    VERIFY_IS_APPROX(mat_inv, submat.inverse());
+    mat.template block<4,4>(i,j) = submat.inverse();
+    VERIFY_IS_APPROX(mat_inv, (mat.template block<4,4>(i,j)));
+  }
 }
 
 void test_prec_inverse_4x4()
 {
   CALL_SUBTEST_1((inverse_permutation_4x4<Matrix4f>()));
   CALL_SUBTEST_1(( inverse_general_4x4<Matrix4f>(200000 * g_repeat) ));
+  CALL_SUBTEST_1(( inverse_general_4x4<Matrix<float,4,4,RowMajor> >(200000 * g_repeat) ));
 
   CALL_SUBTEST_2((inverse_permutation_4x4<Matrix<double,4,4,RowMajor> >()));
+  CALL_SUBTEST_2(( inverse_general_4x4<Matrix<double,4,4,ColMajor> >(200000 * g_repeat) ));
   CALL_SUBTEST_2(( inverse_general_4x4<Matrix<double,4,4,RowMajor> >(200000 * g_repeat) ));
 
   CALL_SUBTEST_3((inverse_permutation_4x4<Matrix4cf>()));
author	Gael Guennebaud <g.gael@free.fr>	2016-08-30 23:16:38 +0200
committer	Gael Guennebaud <g.gael@free.fr>	2016-08-30 23:16:38 +0200
commit	8c48d425309821c17e2543c1e0106f308731f8be (patch)
tree	2a48ae289d9cb287abf02020b09e2a79a416f196
parent	e7fbbc2748dac4addbc429e053ead795968d86c6 (diff)