From 50105c3ed6a339faee730b22345241907a43fd6d Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 16 Dec 2008 03:48:49 +0000 Subject: Hopefully fix compilation of SSE Packetmath with MSVC. The reason why we didn't realize until now that it didn't compile at all with MSVC is that before today with MSVC the SSE2 detection didn't work. --- Eigen/src/Core/arch/SSE/PacketMath.h | 92 ++++++++++++++++++------------------ 1 file changed, 46 insertions(+), 46 deletions(-) (limited to 'Eigen') diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 8b4348fea..c6740f414 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -37,17 +37,17 @@ template<> struct ei_unpacket_traits<__m128> { typedef float type; enum {size= template<> struct ei_unpacket_traits<__m128d> { typedef double type; enum {size=2}; }; template<> struct ei_unpacket_traits<__m128i> { typedef int type; enum {size=4}; }; -template<> inline __m128 ei_padd(const __m128& a, const __m128& b) { return _mm_add_ps(a,b); } -template<> inline __m128d ei_padd(const __m128d& a, const __m128d& b) { return _mm_add_pd(a,b); } -template<> inline __m128i ei_padd(const __m128i& a, const __m128i& b) { return _mm_add_epi32(a,b); } +template<> inline __m128 ei_padd<__m128>(const __m128& a, const __m128& b) { return _mm_add_ps(a,b); } +template<> inline __m128d ei_padd<__m128d>(const __m128d& a, const __m128d& b) { return _mm_add_pd(a,b); } +template<> inline __m128i ei_padd<__m128i>(const __m128i& a, const __m128i& b) { return _mm_add_epi32(a,b); } -template<> inline __m128 ei_psub(const __m128& a, const __m128& b) { return _mm_sub_ps(a,b); } -template<> inline __m128d ei_psub(const __m128d& a, const __m128d& b) { return _mm_sub_pd(a,b); } -template<> inline __m128i ei_psub(const __m128i& a, const __m128i& b) { return _mm_sub_epi32(a,b); } +template<> inline __m128 ei_psub<__m128>(const __m128& a, const __m128& b) { return _mm_sub_ps(a,b); } +template<> inline __m128d ei_psub<__m128d>(const __m128d& a, const __m128d& b) { return _mm_sub_pd(a,b); } +template<> inline __m128i ei_psub<__m128i>(const __m128i& a, const __m128i& b) { return _mm_sub_epi32(a,b); } -template<> inline __m128 ei_pmul(const __m128& a, const __m128& b) { return _mm_mul_ps(a,b); } -template<> inline __m128d ei_pmul(const __m128d& a, const __m128d& b) { return _mm_mul_pd(a,b); } -template<> inline __m128i ei_pmul(const __m128i& a, const __m128i& b) +template<> inline __m128 ei_pmul<__m128>(const __m128& a, const __m128& b) { return _mm_mul_ps(a,b); } +template<> inline __m128d ei_pmul<__m128d>(const __m128d& a, const __m128d& b) { return _mm_mul_pd(a,b); } +template<> inline __m128i ei_pmul<__m128i>(const __m128i& a, const __m128i& b) { return _mm_or_si128( _mm_and_si128( @@ -59,9 +59,9 @@ template<> inline __m128i ei_pmul(const __m128i& a, const __m128i& b) _mm_setr_epi32(0xffffffff,0,0xffffffff,0)), 4)); } -template<> inline __m128 ei_pdiv(const __m128& a, const __m128& b) { return _mm_div_ps(a,b); } -template<> inline __m128d ei_pdiv(const __m128d& a, const __m128d& b) { return _mm_div_pd(a,b); } -template<> inline __m128i ei_pdiv(const __m128i& /*a*/, const __m128i& /*b*/) +template<> inline __m128 ei_pdiv<__m128>(const __m128& a, const __m128& b) { return _mm_div_ps(a,b); } +template<> inline __m128d ei_pdiv<__m128d>(const __m128d& a, const __m128d& b) { return _mm_div_pd(a,b); } +template<> inline __m128i ei_pdiv<__m128i>(const __m128i& /*a*/, const __m128i& /*b*/) { ei_assert(false && "packet integer division are not supported by SSE"); __m128i dummy; return dummy; @@ -70,61 +70,61 @@ template<> inline __m128i ei_pdiv(const __m128i& /*a*/, const __m128i& /*b*/) // for some weird raisons, it has to be overloaded for packet integer template<> inline __m128i ei_pmadd(const __m128i& a, const __m128i& b, const __m128i& c) { return ei_padd(ei_pmul(a,b), c); } -template<> inline __m128 ei_pmin(const __m128& a, const __m128& b) { return _mm_min_ps(a,b); } -template<> inline __m128d ei_pmin(const __m128d& a, const __m128d& b) { return _mm_min_pd(a,b); } +template<> inline __m128 ei_pmin<__m128>(const __m128& a, const __m128& b) { return _mm_min_ps(a,b); } +template<> inline __m128d ei_pmin<__m128d>(const __m128d& a, const __m128d& b) { return _mm_min_pd(a,b); } // FIXME this vectorized min operator is likely to be slower than the standard one -template<> inline __m128i ei_pmin(const __m128i& a, const __m128i& b) +template<> inline __m128i ei_pmin<__m128i>(const __m128i& a, const __m128i& b) { __m128i mask = _mm_cmplt_epi32(a,b); return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b)); } -template<> inline __m128 ei_pmax(const __m128& a, const __m128& b) { return _mm_max_ps(a,b); } -template<> inline __m128d ei_pmax(const __m128d& a, const __m128d& b) { return _mm_max_pd(a,b); } +template<> inline __m128 ei_pmax<__m128>(const __m128& a, const __m128& b) { return _mm_max_ps(a,b); } +template<> inline __m128d ei_pmax<__m128d>(const __m128d& a, const __m128d& b) { return _mm_max_pd(a,b); } // FIXME this vectorized max operator is likely to be slower than the standard one -template<> inline __m128i ei_pmax(const __m128i& a, const __m128i& b) +template<> inline __m128i ei_pmax<__m128i>(const __m128i& a, const __m128i& b) { __m128i mask = _mm_cmpgt_epi32(a,b); return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b)); } -template<> inline __m128 ei_pload(const float* from) { return _mm_load_ps(from); } -template<> inline __m128d ei_pload(const double* from) { return _mm_load_pd(from); } -template<> inline __m128i ei_pload(const int* from) { return _mm_load_si128(reinterpret_cast(from)); } +template<> inline __m128 ei_pload(const float* from) { return _mm_load_ps(from); } +template<> inline __m128d ei_pload(const double* from) { return _mm_load_pd(from); } +template<> inline __m128i ei_pload(const int* from) { return _mm_load_si128(reinterpret_cast(from)); } -template<> inline __m128 ei_ploadu(const float* from) { return _mm_loadu_ps(from); } +template<> inline __m128 ei_ploadu(const float* from) { return _mm_loadu_ps(from); } // template<> inline __m128 ei_ploadu(const float* from) { // if (size_t(from)&0xF) // return _mm_loadu_ps(from); // else // return _mm_loadu_ps(from); // } -template<> inline __m128d ei_ploadu(const double* from) { return _mm_loadu_pd(from); } -template<> inline __m128i ei_ploadu(const int* from) { return _mm_loadu_si128(reinterpret_cast(from)); } +template<> inline __m128d ei_ploadu(const double* from) { return _mm_loadu_pd(from); } +template<> inline __m128i ei_ploadu(const int* from) { return _mm_loadu_si128(reinterpret_cast(from)); } -template<> inline __m128 ei_pset1(const float& from) { return _mm_set1_ps(from); } -template<> inline __m128d ei_pset1(const double& from) { return _mm_set1_pd(from); } -template<> inline __m128i ei_pset1(const int& from) { return _mm_set1_epi32(from); } +template<> inline __m128 ei_pset1(const float& from) { return _mm_set1_ps(from); } +template<> inline __m128d ei_pset1(const double& from) { return _mm_set1_pd(from); } +template<> inline __m128i ei_pset1(const int& from) { return _mm_set1_epi32(from); } -template<> inline void ei_pstore(float* to, const __m128& from) { _mm_store_ps(to, from); } -template<> inline void ei_pstore(double* to, const __m128d& from) { _mm_store_pd(to, from); } -template<> inline void ei_pstore(int* to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } +template<> inline void ei_pstore(float* to, const __m128& from) { _mm_store_ps(to, from); } +template<> inline void ei_pstore(double* to, const __m128d& from) { _mm_store_pd(to, from); } +template<> inline void ei_pstore(int* to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } -template<> inline void ei_pstoreu(float* to, const __m128& from) { _mm_storeu_ps(to, from); } -template<> inline void ei_pstoreu(double* to, const __m128d& from) { _mm_storeu_pd(to, from); } -template<> inline void ei_pstoreu(int* to, const __m128i& from) { _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); } +template<> inline void ei_pstoreu(float* to, const __m128& from) { _mm_storeu_ps(to, from); } +template<> inline void ei_pstoreu(double* to, const __m128d& from) { _mm_storeu_pd(to, from); } +template<> inline void ei_pstoreu(int* to, const __m128i& from) { _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); } -template<> inline float ei_pfirst(const __m128& a) { return _mm_cvtss_f32(a); } -template<> inline double ei_pfirst(const __m128d& a) { return _mm_cvtsd_f64(a); } -template<> inline int ei_pfirst(const __m128i& a) { return _mm_cvtsi128_si32(a); } +template<> inline float ei_pfirst<__m128>(const __m128& a) { return _mm_cvtss_f32(a); } +template<> inline double ei_pfirst<__m128d>(const __m128d& a) { return _mm_cvtsd_f64(a); } +template<> inline int ei_pfirst<__m128i>(const __m128i& a) { return _mm_cvtsi128_si32(a); } #ifdef __SSE3__ // TODO implement SSE2 versions as well as integer versions -inline __m128 ei_preduxp(const __m128* vecs) +template<> inline __m128 ei_preduxp<__m128>(const __m128* vecs) { return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3])); } -inline __m128d ei_preduxp(const __m128d* vecs) +template<> inline __m128d ei_preduxp<__m128d>(const __m128d* vecs) { return _mm_hadd_pd(vecs[0], vecs[1]); } @@ -134,13 +134,13 @@ inline __m128d ei_preduxp(const __m128d* vecs) // return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3])); // } -inline float ei_predux(const __m128& a) +template<> inline float ei_predux<__m128>(const __m128& a) { __m128 tmp0 = _mm_hadd_ps(a,a); return ei_pfirst(_mm_hadd_ps(tmp0, tmp0)); } -inline double ei_predux(const __m128d& a) { return ei_pfirst(_mm_hadd_pd(a, a)); } +template<> inline double ei_predux<__m128d>(const __m128d& a) { return ei_pfirst(_mm_hadd_pd(a, a)); } // SSSE3 version: // inline float ei_predux(const __m128i& a) @@ -150,17 +150,17 @@ inline double ei_predux(const __m128d& a) { return ei_pfirst(_mm_hadd_pd(a, a)); // } #else // SSE2 versions -inline float ei_predux(const __m128& a) +template<> inline float ei_predux<__m128>(const __m128& a) { __m128 tmp = _mm_add_ps(a, _mm_movehl_ps(a,a)); return ei_pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); } -inline double ei_predux(const __m128d& a) +template<> inline double ei_predux<__m128d>(const __m128d& a) { return ei_pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a))); } -inline __m128 ei_preduxp(const __m128* vecs) +template<> inline __m128 ei_preduxp<__m128>(const __m128* vecs) { __m128 tmp0, tmp1, tmp2; tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]); @@ -174,19 +174,19 @@ inline __m128 ei_preduxp(const __m128* vecs) return _mm_add_ps(tmp0, tmp2); } -inline __m128d ei_preduxp(const __m128d* vecs) +template<> inline __m128d ei_preduxp<__m128d>(const __m128d* vecs) { return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1])); } #endif // SSE3 -inline int ei_predux(const __m128i& a) +template<> inline int ei_predux<__m128i>(const __m128i& a) { __m128i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a)); return ei_pfirst(tmp) + ei_pfirst(_mm_shuffle_epi32(tmp, 1)); } -inline __m128i ei_preduxp(const __m128i* vecs) +template<> inline __m128i ei_preduxp<__m128i>(const __m128i* vecs) { __m128i tmp0, tmp1, tmp2; tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]); -- cgit v1.2.3