From 8c48d425309821c17e2543c1e0106f308731f8be Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 30 Aug 2016 23:16:38 +0200 Subject: Fix 4x4 inverse with non-linear destination --- Eigen/src/LU/arch/Inverse_SSE.h | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'Eigen/src/LU') diff --git a/Eigen/src/LU/arch/Inverse_SSE.h b/Eigen/src/LU/arch/Inverse_SSE.h index e1470c664..ebb64a62b 100644 --- a/Eigen/src/LU/arch/Inverse_SSE.h +++ b/Eigen/src/LU/arch/Inverse_SSE.h @@ -153,10 +153,12 @@ struct compute_inverse_size4 iC = _mm_mul_ps(rd,iC); iD = _mm_mul_ps(rd,iD); - result.template writePacket( 0, _mm_shuffle_ps(iA,iB,0x77)); - result.template writePacket( 4, _mm_shuffle_ps(iA,iB,0x22)); - result.template writePacket( 8, _mm_shuffle_ps(iC,iD,0x77)); - result.template writePacket(12, _mm_shuffle_ps(iC,iD,0x22)); + Index res_stride = result.outerStride(); + float* res = result.data(); + pstoret(res+0, _mm_shuffle_ps(iA,iB,0x77)); + pstoret(res+res_stride, _mm_shuffle_ps(iA,iB,0x22)); + pstoret(res+2*res_stride, _mm_shuffle_ps(iC,iD,0x77)); + pstoret(res+3*res_stride, _mm_shuffle_ps(iC,iD,0x22)); } }; @@ -316,14 +318,16 @@ struct compute_inverse_size4 iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1); iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2); - result.template writePacket( 0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1)); // iA# / det - result.template writePacket( 4, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2)); - result.template writePacket( 2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1)); // iB# / det - result.template writePacket( 6, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2)); - result.template writePacket( 8, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1)); // iC# / det - result.template writePacket(12, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2)); - result.template writePacket(10, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1)); // iD# / det - result.template writePacket(14, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2)); + Index res_stride = result.outerStride(); + double* res = result.data(); + pstoret(res+0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1)); + pstoret(res+res_stride, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2)); + pstoret(res+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1)); + pstoret(res+res_stride+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2)); + pstoret(res+2*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1)); + pstoret(res+3*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2)); + pstoret(res+2*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1)); + pstoret(res+3*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2)); } }; -- cgit v1.2.3