aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch/AVX/PacketMath.h
diff options
context:
space:
mode:
Diffstat (limited to 'Eigen/src/Core/arch/AVX/PacketMath.h')
-rw-r--r--Eigen/src/Core/arch/AVX/PacketMath.h20
1 files changed, 12 insertions, 8 deletions
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 8d2e88061..a8b94e191 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -183,18 +183,22 @@ template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGE
template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
{
// TODO try to find a way to avoid the need of a temporary register
- Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from));
- tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
- return _mm256_unpacklo_ps(tmp,tmp);
+// Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from));
+// tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
+// return _mm256_unpacklo_ps(tmp,tmp);
+
+ // _mm256_insertf128_ps is very slow on Haswell, thus:
+ Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
+ // mimic an "inplace" permutation of the lower 128bits using a blend
+ tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
+ // then we can perform a consistent permutation on the global register to get everything in shape:
+ return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
}
// Loads 2 doubles from memory a returns the packet {a0, a0 a1, a1}
template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
{
- // TODO try to find a way to avoid the need of a temporary register
- Packet2d tmp0 = _mm_loadu_pd(from);
- Packet2d tmp1 = _mm_permute_pd(tmp0,3);
- tmp0 = _mm_permute_pd(tmp0,0);
- return _mm256_insertf128_pd(_mm256_castpd128_pd256(tmp0), tmp1, 1);
+ Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
+ return _mm256_permute_pd(tmp, 3<<2);
}
// Loads 2 floats from memory a returns the packet {a0, a0 a0, a0, a1, a1, a1, a1}