From 3288e9e168bf89737aded54c4d98470a865d6fd3 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Tue, 3 Mar 2009 14:01:30 +0000
Subject: add much faster versions of unaligned stores (and slightly faster unaligned loads)

---
 Eigen/src/Core/arch/SSE/PacketMath.h | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index c2fbcd7cc..a3b4b67fd 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -97,23 +97,25 @@ template<> EIGEN_STRONG_INLINE __m128 ei_pload(const float* from) { re
 template<> EIGEN_STRONG_INLINE __m128d ei_pload(const double* from) { return _mm_load_pd(from); }
 template<> EIGEN_STRONG_INLINE __m128i ei_pload(const int* from) { return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
 
-template<> EIGEN_STRONG_INLINE __m128 ei_ploadu(const float* from) { return _mm_loadu_ps(from); }
-// template<> EIGEN_STRONG_INLINE __m128 ei_ploadu(const float* from) {
-//   if (size_t(from)&0xF)
-//     return _mm_loadu_ps(from);
-//   else
-//     return _mm_loadu_ps(from);
-// }
-template<> EIGEN_STRONG_INLINE __m128d ei_ploadu(const double* from) { return _mm_loadu_pd(from); }
-template<> EIGEN_STRONG_INLINE __m128i ei_ploadu(const int* from) { return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); }
+template<> EIGEN_STRONG_INLINE __m128 ei_ploadu(const float* from) {
+  __m128 r;
+  r = _mm_castpd_ps(_mm_load_sd((double*)(from)));
+  r = _mm_loadh_pi(r, (const __m64*)(from+2));
+  return r;
+}
+template<> EIGEN_STRONG_INLINE __m128d ei_ploadu(const double* from) { return _mm_castps_pd(ei_ploadu((const float*)(from))); }
+template<> EIGEN_STRONG_INLINE __m128i ei_ploadu(const int* from) { return _mm_castpd_si128(ei_ploadu((const double*)(from))); }
 
-template<> EIGEN_STRONG_INLINE void ei_pstore(float* to, const __m128& from) { _mm_store_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void ei_pstore(float* to, const __m128& from) { _mm_store_ps(to, from); }
 template<> EIGEN_STRONG_INLINE void ei_pstore(double* to, const __m128d& from) { _mm_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void ei_pstore(int* to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void ei_pstore(int* to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
 
-template<> EIGEN_STRONG_INLINE void ei_pstoreu(float* to, const __m128& from) { _mm_storeu_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void ei_pstoreu(double* to, const __m128d& from) { _mm_storeu_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void ei_pstoreu(int* to, const __m128i& from) { _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void ei_pstoreu(double* to, const __m128d& from) {
+  _mm_storel_pd((to), from);
+  _mm_storeh_pd((to+1), from);
+}
+template<> EIGEN_STRONG_INLINE void ei_pstoreu(float* to, const __m128& from) { ei_pstoreu((double*)to, _mm_castps_pd(from)); }
+template<> EIGEN_STRONG_INLINE void ei_pstoreu(int* to, const __m128i& from) { ei_pstoreu((double*)to, _mm_castsi128_pd(from)); }
 
 #ifdef _MSC_VER
 // this fix internal compilation error
-- 
cgit v1.2.3