diff options
author | Gael Guennebaud <g.gael@free.fr> | 2009-06-24 10:48:36 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2009-06-24 10:48:36 +0200 |
commit | a44f7cf440b009019e6d957195aa1f438ff82b81 (patch) | |
tree | a21f57711eedd0fa961207ef6346b08617fd0bd0 | |
parent | aa17b5b5143306546f5f3e15a8fe0c9d39b0285a (diff) |
re-enable the fast unaligned loads for gcc and icc using inline assembly
(this avoids incompatible pointer casts and makes the dependency on the loaded data explicit)
-rw-r--r-- | Eigen/src/Core/arch/SSE/PacketMath.h | 31 |
1 files changed, 30 insertions, 1 deletions
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 7af63bbc9..660ca61c6 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -44,7 +44,7 @@ typedef __m128d Packet2d;
 
 #define ei_vec4i_swizzle2(a,b,p,q,r,s) \
   (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
-  
+
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
   const Packet4f ei_p4f_##NAME = ei_pset1<float>(X)
 
@@ -163,9 +163,38 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) {
 template<> EIGEN_STRONG_INLINE Packet2d ei_pload<double>(const double* from) { return _mm_load_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
 
+#if (!defined __GNUC__) && (!defined __ICC)
 template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { return _mm_loadu_ps(from); }
 template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<double>(const double* from) { return _mm_loadu_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
+#else
+// Fast unaligned loads for GCC/ICC, written as inline asm rather than intrinsics: the
+// intrinsics would need casts between incompatible pointer types, which leads to invalid
+// code because of the strict aliasing rule. The extra "dummy" memory operands name the
+// other elements each instruction reads, which enforces a correct instruction dependency.
+// TODO: do the same for MSVC (ICC is compatible)
+template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from)
+{
+  __m128 res;
+  asm("movsd  %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from),     [dummy] "m" (*(from+1)) );
+  asm("movhps %[from2], %[r]" : [r] "+x" (res) : [from2] "m" (*(from+2)), [dummy] "m" (*(from+3)) );
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from)
+{
+  __m128d res;
+  asm("movsd  %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from) );
+  asm("movhpd %[from1], %[r]" : [r] "+x" (res) : [from1] "m" (*(from+1)) );
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from)
+{
+  __m128i res;
+  asm("movsd  %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from),     [dummy] "m" (*(from+1)) );
+  asm("movhps %[from2], %[r]" : [r] "+x" (res) : [from2] "m" (*(from+2)), [dummy] "m" (*(from+3)) );
+  return res;
+}
+#endif
 
 template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const Packet4f& from) { _mm_store_ps(to, from); }
 template<> EIGEN_STRONG_INLINE void ei_pstore<double>(double* to, const Packet2d& from) { _mm_store_pd(to, from); }