diff options
author | Gael Guennebaud <g.gael@free.fr> | 2019-02-21 17:18:28 +0100 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2019-02-21 17:18:28 +0100 |
commit | cca6c207f42e8706ee581bd67b091e55327cbaca (patch) | |
tree | a6a5ef4f80093cf7d8d54bd569b5b0a52103c955 | |
parent | 1c09ee8541501c37eae05cebae36b417f5f1650a (diff) |
AVX512: implement faster ploadquad<Packet16f> thus speeding up GEMM
-rw-r--r-- | Eigen/src/Core/arch/AVX512/PacketMath.h | 10 |
1 files changed, 4 insertions, 6 deletions
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 3842f576b..c111fd7f0 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -526,13 +526,11 @@ EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
 // {a0, a0 a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
 template <>
 EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
-  Packet16f tmp = _mm512_undefined_ps();
-  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0);
-  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1);
-  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2);
-  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3);
-  return tmp;
+  Packet16f tmp = _mm512_castps128_ps512(pload<Packet4f>(from));
+  const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
+  return _mm512_permutexvar_ps(scatter_mask, tmp);
 }
+
 // Loads 2 doubles from memory a returns the packet
 // {a0, a0 a0, a0, a1, a1, a1, a1}
 template <>