diff options
author | Gael Guennebaud <g.gael@free.fr> | 2014-03-30 22:43:47 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2014-03-30 22:43:47 +0200 |
commit | 1c0728043a1fe154107c66cc26c33cab9780efa5 (patch) | |
tree | b4c3d3a6a62935e3b9c7c95c7bfab7dd6cafe217 /Eigen/src/Core/arch/AVX | |
parent | e497a27ddc0d724bc0eadde1e211617b766dd100 (diff) |
Workaround alignment warnings
Diffstat (limited to 'Eigen/src/Core/arch/AVX')
-rw-r--r-- | Eigen/src/Core/arch/AVX/Complex.h | 32 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AVX/PacketMath.h | 7 |
2 files changed, 17 insertions, 22 deletions
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index 7c1947a4f..cb16180c5 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -87,15 +87,10 @@ template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<flo
 template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
 {
- // This should be optimized.
- __m128 complex1 = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)from);
- complex1 = _mm_movelh_ps(complex1, complex1);
- __m128 complex2 = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from+1));
- complex2 = _mm_movelh_ps(complex2, complex2);
- __m256 result = _mm256_setzero_ps();
- result = _mm256_insertf128_ps(result, complex1, 0);
- result = _mm256_insertf128_ps(result, complex2, 1);
- return Packet4cf(result);
+ // FIXME The following might be optimized using _mm256_movedup_pd
+ Packet2cf a = ploaddup<Packet2cf>(from);
+ Packet2cf b = ploaddup<Packet2cf>(from+1);
+ return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
 }
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
@@ -104,33 +99,30 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<f
 template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, int stride)
 {
 return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]),
- std::imag(from[2*stride]), std::real(from[2*stride]),
- std::imag(from[1*stride]), std::real(from[1*stride]),
- std::imag(from[0*stride]), std::real(from[0*stride])));
+ std::imag(from[2*stride]), std::real(from[2*stride]),
+ std::imag(from[1*stride]), std::real(from[1*stride]),
+ std::imag(from[0*stride]), std::real(from[0*stride])));
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, int stride)
 {
 __m128 low = _mm256_extractf128_ps(from.v, 0);
 to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)),
- _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
+ _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
 to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)),
- _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
+ _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
 __m128 high = _mm256_extractf128_ps(from.v, 1);
 to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)),
- _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
+ _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
 to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)),
- _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
+ _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
 }
 template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a)
 {
- __m128 low = _mm256_extractf128_ps(a.v, 0);
- std::complex<float> res;
- _mm_storel_pi((__m64*)&res, low);
- return res;
+ return pfirst(Packet2cf(_mm256_castps256_ps128(a.v)));
 }
 template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 132c1abe3..38f52ecc8 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -173,6 +173,7 @@ template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGE
 // Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3}
 template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
 {
+ // FIXME we should only load the first 128bits
 Packet8f tmp = ploadu<Packet8f>(from);
 Packet8f tmp1 = _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
 Packet8f tmp2 = _mm256_permute_ps(tmp, _MM_SHUFFLE(1,1,0,0));
@@ -181,6 +182,7 @@ template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
 // Loads 2 doubles from memory a returns the packet {a0, a0 a1, a1}
 template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
 {
+ // FIXME we should only load the first 128bits
 Packet4d tmp = ploadu<Packet4d>(from);
 Packet4d tmp1 = _mm256_permute_pd(tmp,0);
 Packet4d tmp2 = _mm256_permute_pd(tmp,3);
@@ -195,11 +197,12 @@ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f&
 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
-// TODO: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
+// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
+// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
 template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, int stride)
 {
 return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
- from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+ from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
 }
 template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, int stride)
 { |