diff options
author | Gael Guennebaud <g.gael@free.fr> | 2014-04-17 20:51:04 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2014-04-17 20:51:04 +0200 |
commit | 9746396d1b8d039d3d0d6537ad477135e5e9d3f5 (patch) | |
tree | fe473674d1ad00fd0513e218d5faf2c45781be32 /Eigen | |
parent | 1dd015fea64048219aa4c2d616fb56e0c37bad47 (diff) |
Optimize AVX pset1 for complexes and ploaddup
Diffstat (limited to 'Eigen')
-rw-r--r-- | Eigen/src/Core/arch/AVX/Complex.h | 14 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AVX/PacketMath.h | 20 |
2 files changed, 16 insertions, 18 deletions
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 8f95a7be7..d0646e77d 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -78,11 +78,7 @@ template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<fl template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from) { - const float r = std::real(from); - const float i = std::imag(from); - // Beware, _mm256_set_ps expects the scalar values in reverse order (i.e. 7 to 0) - const __m256 result = _mm256_set_ps(i, r, i, r, i, r, i, r); - return Packet4cf(result); + return Packet4cf(_mm256_castps_pd(_mm256_broadcast_sd((const double*)(const void*)&from))); } template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from) @@ -304,11 +300,9 @@ template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<do template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from) { - const double r = std::real(from); - const double i = std::imag(from); - // Beware, _mm256_set_pd expects the scalar values in reverse order (i.e. 3 to 0) - const __m256d result = _mm256_set_pd(i, r, i, r); - return Packet2cd(result); + // in case casting to a __m128d* is really not safe, then we can still fallback to this version: (much slower though) +// return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from)); + return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from)); } template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 8d2e88061..a8b94e191 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -183,18 +183,22 @@ template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGE template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from) { // TODO try to find a way to avoid the need of a temporary register - Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from)); - tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1); - return _mm256_unpacklo_ps(tmp,tmp); +// Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from)); +// tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1); +// return _mm256_unpacklo_ps(tmp,tmp); + + // _mm256_insertf128_ps is very slow on Haswell, thus: + Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from); + // mimic an "inplace" permutation of the lower 128bits using a blend + tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15); + // then we can perform a consistent permutation on the global register to get everything in shape: + return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2)); } // Loads 2 doubles from memory a returns the packet {a0, a0 a1, a1} template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from) { - // TODO try to find a way to avoid the need of a temporary register - Packet2d tmp0 = _mm_loadu_pd(from); - Packet2d tmp1 = _mm_permute_pd(tmp0,3); - tmp0 = _mm_permute_pd(tmp0,0); - return _mm256_insertf128_pd(_mm256_castpd128_pd256(tmp0), tmp1, 1); + Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from); + return _mm256_permute_pd(tmp, 3<<2); } // Loads 2 floats from memory a returns the packet {a0, a0 a0, a0, a1, a1, a1, a1} |