diff options
| field | value | date |
|---|---|---|
| author | Rasmus Munk Larsen <rmlarsen@google.com> | 2021-04-22 15:21:01 +0000 |
| committer | Rasmus Munk Larsen <rmlarsen@google.com> | 2021-04-22 15:21:01 +0000 |
| commit | 85a76a16ea835fcfa7d4c185a338ae2aef9a272a (patch) | |
| tree | 32e9f3b58c87ef63677d681e8b60b733b843ca06 /Eigen | |
| parent | d72c794ccd21637ba56dec0dd8bd0cffef7bc47e (diff) | |
Make vectorized compute_inverse_size4 compile with AVX.
Diffstat (limited to 'Eigen')
-rw-r--r-- | Eigen/LU | 4 | ||||
-rw-r--r-- | Eigen/src/LU/arch/InverseSize4.h | 45 |
2 files changed, 25 insertions, 24 deletions
```diff
@@ -38,9 +38,7 @@
 #include "src/LU/Determinant.h"
 #include "src/LU/InverseImpl.h"
 
-// Use the SSE optimized version whenever possible. At the moment the
-// SSE version doesn't compile when AVX is enabled
-#if (defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX) || defined EIGEN_VECTORIZE_NEON
+#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_NEON
   #include "src/LU/arch/InverseSize4.h"
 #endif
 
diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h
index 5a8d0c114..ee5548aed 100644
--- a/Eigen/src/LU/arch/InverseSize4.h
+++ b/Eigen/src/LU/arch/InverseSize4.h
@@ -54,10 +54,12 @@ struct compute_inverse_size4<Architecture::Target, float, MatrixType, ResultType
   {
     ActualMatrixType matrix(mat);
 
-    Packet4f _L1 = matrix.template packet<MatrixAlignment>(0);
-    Packet4f _L2 = matrix.template packet<MatrixAlignment>(4);
-    Packet4f _L3 = matrix.template packet<MatrixAlignment>(8);
-    Packet4f _L4 = matrix.template packet<MatrixAlignment>(12);
+    const float* data = matrix.data();
+    const Index stride = matrix.innerStride();
+    Packet4f _L1 = ploadt<Packet4f,MatrixAlignment>(data);
+    Packet4f _L2 = ploadt<Packet4f,MatrixAlignment>(data + stride*4);
+    Packet4f _L3 = ploadt<Packet4f,MatrixAlignment>(data + stride*8);
+    Packet4f _L4 = ploadt<Packet4f,MatrixAlignment>(data + stride*12);
 
     // Four 2x2 sub-matrices of the input matrix
     // input = [[A, B],
@@ -189,25 +191,26 @@ struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultTyp
 
     Packet2d A1, A2, B1, B2, C1, C2, D1, D2;
 
+    const double* data = matrix.data();
+    const Index stride = matrix.innerStride();
     if (StorageOrdersMatch)
     {
-      A1 = matrix.template packet<MatrixAlignment>(0);
-      B1 = matrix.template packet<MatrixAlignment>(2);
-      A2 = matrix.template packet<MatrixAlignment>(4);
-      B2 = matrix.template packet<MatrixAlignment>(6);
-      C1 = matrix.template packet<MatrixAlignment>(8);
-      D1 = matrix.template packet<MatrixAlignment>(10);
-      C2 = matrix.template packet<MatrixAlignment>(12);
-      D2 = matrix.template packet<MatrixAlignment>(14);
+      A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);
+      B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);
+      A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);
+      B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);
+      C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);
+      D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);
+      C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);
+      D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);
     }
     else
     {
       Packet2d temp;
-      A1 = matrix.template packet<MatrixAlignment>(0);
-      C1 = matrix.template packet<MatrixAlignment>(2);
-      A2 = matrix.template packet<MatrixAlignment>(4);
-      C2 = matrix.template packet<MatrixAlignment>(6);
-
+      A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);
+      C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);
+      A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);
+      C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);
       temp = A1;
       A1 = vec2d_unpacklo(A1, A2);
       A2 = vec2d_unpackhi(temp, A2);
@@ -216,10 +219,10 @@ struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultTyp
       C1 = vec2d_unpacklo(C1, C2);
       C2 = vec2d_unpackhi(temp, C2);
 
-      B1 = matrix.template packet<MatrixAlignment>(8);
-      D1 = matrix.template packet<MatrixAlignment>(10);
-      B2 = matrix.template packet<MatrixAlignment>(12);
-      D2 = matrix.template packet<MatrixAlignment>(14);
+      B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);
+      D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);
+      B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);
+      D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);
 
       temp = B1;
       B1 = vec2d_unpacklo(B1, B2);
```