diff options
author | 2010-07-19 08:50:59 +0200 | |
---|---|---|
committer | 2010-07-19 08:50:59 +0200 | |
commit | cd0e5dca9bba943869ab7c98d370fcfc8456997a (patch) | |
tree | faf3a7b29a10109e8d772b72a9f16816a7f2bdc3 /Eigen/src/Core/arch | |
parent | 45362f4eaea04eadedca3201352733604a145346 (diff) |
wip: extend the gebp kernel to optimize complex and mixed products
Diffstat (limited to 'Eigen/src/Core/arch')
-rw-r--r-- | Eigen/src/Core/arch/SSE/Complex.h | 32 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/PacketMath.h | 19 |
2 files changed, 43 insertions, 8 deletions
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 8184159c7..baa25907c 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -73,9 +73,12 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_pmul<Packet2cf>(const Packet2cf& a, { // TODO optimize it for SSE3 and 4 #ifdef EIGEN_VECTORIZE_SSE3 - return Packet2cf(_mm_addsub_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), - _mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3), + return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v), + _mm_mul_ps(_mm_movehdup_ps(a.v), ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)))); +// return Packet2cf(_mm_addsub_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), +// _mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3), +// ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)))); #else const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000)); return Packet2cf(_mm_add_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), @@ -224,6 +227,12 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_pdiv<Packet2cf>(const Packet2cf& a, return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); } +EIGEN_STRONG_INLINE Packet2cf ei_pcplxflip/*<Packet2cf>*/(const Packet2cf& x) +{ + return Packet2cf(ei_vec4f_swizzle1(x.v, 1, 0, 3, 2)); +} + + //---------- double ---------- struct Packet1cd { @@ -268,9 +277,13 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_pmul<Packet1cd>(const Packet1cd& a, { // TODO optimize it for SSE3 and 4 #ifdef EIGEN_VECTORIZE_SSE3 - return Packet1cd(_mm_addsub_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), - _mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1), - ei_vec2d_swizzle1(b.v, 1, 0)))); +// return Packet1cd(_mm_addsub_pd(_mm_mul_pd(a.v, b.v), +// _mm_mul_pd(a.v, b.v/*ei_vec2d_swizzle1(b.v, 1, 0)*/))); + return Packet1cd(_mm_add_pd(_mm_mul_pd(a.v, b.v), + _mm_mul_pd(a.v, ei_vec2d_swizzle1(b.v, 1, 0)))); +// return Packet1cd(_mm_addsub_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), +// _mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1), +// ei_vec2d_swizzle1(b.v, 1, 0)))); #else const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); return Packet1cd(_mm_add_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), @@ -286,14 +299,14 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_pandnot<Packet1cd>(const Packet1cd& // FIXME force unaligned load, this is a temporary fix template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <Packet1cd>(const std::complex<double>* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_ploadu<Packet2d>((const double*)from)); } +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_pload<Packet2d>((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu<Packet2d>((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1<Packet1cd>(const std::complex<double>& from) { /* here we really have to use unaligned loads :( */ return ei_ploadu<Packet1cd>(&from); } // FIXME force unaligned store, this is a temporary fix -template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstoreu((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstore((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } @@ -415,4 +428,9 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_pdiv<Packet1cd>(const Packet1cd& a, return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); } +EIGEN_STRONG_INLINE Packet1cd ei_pcplxflip/*<Packet1cd>*/(const Packet1cd& x) +{ + return Packet1cd(ei_preverse(x.v)); +} + #endif // EIGEN_COMPLEX_SSE_H diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 53a9bcf56..87ebb783a 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -45,7 +45,9 @@ template<> struct ei_is_arithmetic<__m128d> { enum { ret = true }; }; #define ei_vec2d_swizzle1(v,p,q) \ (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2))))) - +// #define ei_vec2d_swizzle1(v,p,q) \ + (_mm_shuffle_pd(v,v, (q)<<1|(p) )) + #define ei_vec4f_swizzle2(a,b,p,q,r,s) \ (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p)))) @@ -255,6 +257,21 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int* from) } #endif +template<> EIGEN_STRONG_INLINE Packet4f ei_ploaddup<Packet4f>(const float* from) +{ + Packet4f tmp; + tmp = _mm_loadl_pi(tmp,(__m64*)from); + return ei_vec4f_swizzle1(tmp, 0, 0, 1, 1); +} +template<> EIGEN_STRONG_INLINE Packet2d ei_ploaddup<Packet2d>(const double* from) +{ return ei_pset1<Packet2d>(from[0]); } +template<> EIGEN_STRONG_INLINE Packet4i ei_ploaddup<Packet4i>(const int* from) +{ + Packet4i tmp; + tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from)); + return ei_vec4i_swizzle1(tmp, 0, 0, 1, 1); +} + template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } template<> EIGEN_STRONG_INLINE void ei_pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); } |