From 85a76a16ea835fcfa7d4c185a338ae2aef9a272a Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 22 Apr 2021 15:21:01 +0000 Subject: Make vectorized compute_inverse_size4 compile with AVX. --- Eigen/LU | 4 +--- Eigen/src/LU/arch/InverseSize4.h | 45 +++++++++++++++++++++------------------- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/Eigen/LU b/Eigen/LU index 0fb184bcb..1236ceb04 100644 --- a/Eigen/LU +++ b/Eigen/LU @@ -38,9 +38,7 @@ #include "src/LU/Determinant.h" #include "src/LU/InverseImpl.h" -// Use the SSE optimized version whenever possible. At the moment the -// SSE version doesn't compile when AVX is enabled -#if (defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX) || defined EIGEN_VECTORIZE_NEON +#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_NEON #include "src/LU/arch/InverseSize4.h" #endif diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h index 5a8d0c114..ee5548aed 100644 --- a/Eigen/src/LU/arch/InverseSize4.h +++ b/Eigen/src/LU/arch/InverseSize4.h @@ -54,10 +54,12 @@ struct compute_inverse_size4(0); - Packet4f _L2 = matrix.template packet(4); - Packet4f _L3 = matrix.template packet(8); - Packet4f _L4 = matrix.template packet(12); + const float* data = matrix.data(); + const Index stride = matrix.innerStride(); + Packet4f _L1 = ploadt(data); + Packet4f _L2 = ploadt(data + stride*4); + Packet4f _L3 = ploadt(data + stride*8); + Packet4f _L4 = ploadt(data + stride*12); // Four 2x2 sub-matrices of the input matrix // input = [[A, B], @@ -189,25 +191,26 @@ struct compute_inverse_size4(0); - B1 = matrix.template packet(2); - A2 = matrix.template packet(4); - B2 = matrix.template packet(6); - C1 = matrix.template packet(8); - D1 = matrix.template packet(10); - C2 = matrix.template packet(12); - D2 = matrix.template packet(14); + A1 = ploadt(data + stride*0); + B1 = ploadt(data + stride*2); + A2 = ploadt(data + stride*4); + B2 = ploadt(data + stride*6); + C1 = ploadt(data + stride*8); + D1 = ploadt(data + stride*10); + C2 = ploadt(data + stride*12); + D2 = ploadt(data + stride*14); } else { Packet2d temp; - A1 = matrix.template packet(0); - C1 = matrix.template packet(2); - A2 = matrix.template packet(4); - C2 = matrix.template packet(6); - + A1 = ploadt(data + stride*0); + C1 = ploadt(data + stride*2); + A2 = ploadt(data + stride*4); + C2 = ploadt(data + stride*6); temp = A1; A1 = vec2d_unpacklo(A1, A2); A2 = vec2d_unpackhi(temp, A2); @@ -216,10 +219,10 @@ struct compute_inverse_size4(8); - D1 = matrix.template packet(10); - B2 = matrix.template packet(12); - D2 = matrix.template packet(14); + B1 = ploadt(data + stride*8); + D1 = ploadt(data + stride*10); + B2 = ploadt(data + stride*12); + D2 = ploadt(data + stride*14); temp = B1; B1 = vec2d_unpacklo(B1, B2); -- cgit v1.2.3