From cf1a7bfbe146e232bd091a0ff2f186ea8803564e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 26 Mar 2014 12:03:31 -0700 Subject: Used AVX instructions to vectorize the complex version of the pfirst and ploaddup packet primitives. Silenced a few compilation warnings. --- Eigen/src/Core/arch/AVX/Complex.h | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'Eigen/src/Core/arch/AVX/Complex.h') diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 17c32d79c..a7a1b7fda 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -78,19 +78,21 @@ template<> EIGEN_STRONG_INLINE Packet4cf pset1(const std::complex EIGEN_STRONG_INLINE Packet4cf ploaddup(const std::complex* from) { - __m256 result; - for (int i = 0; i < 2; ++i) { - result[4*i] = std::real(from[i]); - result[4*i+1] = std::imag(from[i]); - result[4*i+2] = std::real(from[i]); - result[4*i+3] = std::imag(from[i]); - } + // This should be optimized. + __m128 complex1 = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)from); + complex1 = _mm_movelh_ps(complex1, complex1); + __m128 complex2 = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from+1)); + complex2 = _mm_movelh_ps(complex2, complex2); + __m256 result = _mm256_setzero_ps(); + result = _mm256_insertf128_ps(result, complex1, 0); + result = _mm256_insertf128_ps(result, complex2, 1); return Packet4cf(result); } @@ -102,7 +104,10 @@ template<> EIGEN_STRONG_INLINE void prefetch >(const std::co template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet4cf& a) { - return std::complex(a.v[0], a.v[1]); + __m128 low = _mm256_extractf128_ps(a.v, 0); + std::complex res; + _mm_storel_pi((__m64*)&res, low); + return res; } template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) { @@ -112,7 +117,7 @@ template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) { __m128d highd = _mm_castps_pd(high); low = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1)); high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1)); - __m256 result; + __m256 result = _mm256_setzero_ps(); result = _mm256_insertf128_ps(result, low, 1); result = _mm256_insertf128_ps(result, high, 0); return Packet4cf(result); @@ -300,6 +305,7 @@ template<> EIGEN_STRONG_INLINE Packet2cd pset1(const std::complex EIGEN_STRONG_INLINE void prefetch >(const std::c template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cd& a) { - return std::complex(a.v[0],a.v[1]); + __m128d low = _mm256_extractf128_pd(a.v, 0); + EIGEN_ALIGN16 double res[2]; + _mm_store_pd(res, low); + return std::complex(res[0],res[1]); } template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) { -- cgit v1.2.3