diff options
author | Gael Guennebaud <g.gael@free.fr> | 2010-07-11 15:48:30 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2010-07-11 15:48:30 +0200 |
commit | ff96c94043d575e4d0dd477c1ed2487e33f79627 (patch) | |
tree | 5e9736916779fdacd431c2591a3ec1f77333e505 /Eigen/src/Core/arch | |
parent | 4161b8be6772f2b7338458c9932d7417797966bb (diff) |
mixing types in product step 2:
* pload* and pset1 are now templated on the packet type
* gemv routines are now embeded into a structure with
a consistent API with respect to gemm
* some configurations of vector * matrix and matrix * matrix works fine,
some need more work...
Diffstat (limited to 'Eigen/src/Core/arch')
-rw-r--r-- | Eigen/src/Core/arch/AltiVec/Complex.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AltiVec/PacketMath.h | 22 | ||||
-rw-r--r-- | Eigen/src/Core/arch/NEON/Complex.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/arch/NEON/PacketMath.h | 14 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/Complex.h | 25 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/MathFunctions.h | 16 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/PacketMath.h | 42 |
7 files changed, 67 insertions, 56 deletions
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 2dba95a2f..ecada02f4 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -63,7 +63,7 @@ template<> struct ei_packet_traits<std::complex<float> > : ei_default_packet_tr template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; }; -template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>& from) +template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<Packet2cf>(const std::complex<float>& from) { Packet2cf res; /* On AltiVec we cannot load 64-bit registers, so wa have to take care of alignment */ diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index a3ceed8e8..8205beae5 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -59,13 +59,13 @@ typedef __vector unsigned char Packet16uc; Packet4i ei_p4i_##NAME = vec_splat_s32(X) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ - Packet4f ei_p4f_##NAME = ei_pset1<float>(X) + Packet4f ei_p4f_##NAME = ei_pset1<Packet4f>(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ Packet4f ei_p4f_##NAME = vreinterpretq_f32_u32(ei_pset1<int>(X)) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ - Packet4i ei_p4i_##NAME = ei_pset1<int>(X) + Packet4i ei_p4i_##NAME = ei_pset1<Packet4i>(X) #define DST_CHAN 1 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) @@ -158,7 +158,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packetbi & v) return s; } */ -template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) { // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html float EIGEN_ALIGN16 af[4]; af[0] = from; @@ -167,7 +167,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { return vc; } -template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { +template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<Packet4i>(const int& from) { int EIGEN_ALIGN16 ai[4]; ai[0] = from; Packet4i vc = vec_ld(0, ai); @@ -175,8 +175,8 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { return vc; } -template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return vec_add(ei_pset1(a), ei_p4f_COUNTDOWN); } -template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return vec_add(ei_pset1(a), ei_p4i_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return vec_add(ei_pset1<Packet4f>(a), ei_p4f_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return vec_add(ei_pset1<Packet4i>(a), ei_p4i_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); } template<> EIGEN_STRONG_INLINE Packet4i ei_padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); } @@ -241,7 +241,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, con template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) { ei_assert(false && "packet integer division are not supported by AltiVec"); - return ei_pset1<int>(0); + return ei_pset1<Packet4i>(0); } // for some weird raisons, it has to be overloaded for packet of integers @@ -267,10 +267,10 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pxor<Packet4i>(const Packet4i& a, con template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } -template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } +template<> EIGEN_STRONG_INLINE Packet4f ei_pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } -template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) +template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html @@ -282,7 +282,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data } -template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) +template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index bf68a2bbb..6d9e8da85 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -58,7 +58,7 @@ template<> struct ei_packet_traits<std::complex<float> > : ei_default_packet_tr template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; }; -template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>& from) +template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<Packet2cf>(const std::complex<float>& from) { float32x2_t r64; r64 = vld1_f32((float *)&from); diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 5b0d6ab12..b899fece1 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -45,13 +45,13 @@ typedef float32x4_t Packet4f; typedef int32x4_t Packet4i; #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ - const Packet4f ei_p4f_##NAME = ei_pset1<float>(X) + const Packet4f ei_p4f_##NAME = ei_pset1<Packet4f>(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ const Packet4f ei_p4f_##NAME = vreinterpretq_f32_u32(ei_pset1<int>(X)) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ - const Packet4i ei_p4i_##NAME = ei_pset1<int>(X) + const Packet4i ei_p4i_##NAME = ei_pset1<Packet4i>(X) #ifndef __pld #define __pld(x) asm volatile ( " pld [%[addr]]\n" :: [addr] "r" (x) : "cc" ); @@ -88,18 +88,18 @@ template<> struct ei_packet_traits<int> : ei_default_packet_traits template<> struct ei_unpacket_traits<Packet4f> { typedef float type; enum {size=4}; }; template<> struct ei_unpacket_traits<Packet4i> { typedef int type; enum {size=4}; }; -template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { return vdupq_n_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { return vdupq_n_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); } template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { Packet4f countdown = { 3, 2, 1, 0 }; - return vaddq_f32(ei_pset1(a), countdown); + return vaddq_f32(ei_pset1<Packet4f>(a), countdown); } template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { Packet4i countdown = { 3, 2, 1, 0 }; - return vaddq_s32(ei_pset1(a), countdown); + return vaddq_s32(ei_pset1<Packet4i>(a), countdown); } template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); } @@ -137,7 +137,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, con } template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) { ei_assert(false && "packet integer division are not supported by NEON"); - return ei_pset1<int>(0); + return ei_pset1<Packet4i>(0); } // for some weird raisons, it has to be overloaded for packet of integers diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 9d32ede0e..6c72293fc 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -89,15 +89,15 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_por <Packet2cf>(const Packet2cf& template<> EIGEN_STRONG_INLINE Packet2cf ei_pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf ei_pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf ei_pload <std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(ei_pload(&ei_real_ref(*from))); } -template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu<std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ei_ploadu(&ei_real_ref(*from))); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(ei_pload<Packet4f>(&ei_real_ref(*from))); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ei_ploadu<Packet4f>(&ei_real_ref(*from))); } template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstore(&ei_real_ref(*to), from.v); } template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu(&ei_real_ref(*to), from.v); } template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>& from) +template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<Packet2cf>(const std::complex<float>& from) { Packet2cf res; res.v = _mm_loadl_pi(res.v, (const __m64*)&from); @@ -276,10 +276,12 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_pxor <Packet1cd>(const Packet1cd& template<> EIGEN_STRONG_INLINE Packet1cd ei_pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); } // FIXME force unaligned load, this is a temporary fix -template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1<std::complex<double> >(const std::complex<double>& from) -{ /* here we really have to use unaligned loads :( */ return ei_ploadu(&from); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <Packet1cd>(const std::complex<double>* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_ploadu<Packet2d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<Packet1cd>(const std::complex<double>* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu<Packet2d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1<Packet1cd>(const std::complex<double>& from) +{ /* here we really have to use unaligned loads :( */ return ei_ploadu<Packet1cd>(&from); } // FIXME force unaligned store, this is a temporary fix template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstoreu((double*)to, from.v); } @@ -387,6 +389,15 @@ template<> struct ei_conj_helper<Packet2d, Packet1cd, false,false> { return Packet1cd(ei_pmul(x, y.v)); } }; +template<> struct ei_conj_helper<Packet1cd, Packet2d, false,false> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const + { return ei_padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const + { return Packet1cd(ei_pmul(x.v, y)); } +}; + template<> EIGEN_STRONG_INLINE Packet1cd ei_pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for SSE3 and 4 diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 742bfa92f..e4ca82985 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -378,14 +378,14 @@ Packet4f ei_pcos<Packet4f>(const Packet4f& _x) template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f ei_psqrt<Packet4f>(const Packet4f& _x) { - Packet4f half = ei_pmul(_x, ei_pset1(.5f)); - - /* select only the inverse sqrt of non-zero inputs */ - Packet4f non_zero_mask = _mm_cmpgt_ps(_x, ei_pset1(std::numeric_limits<float>::epsilon())); - Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x)); - - x = ei_pmul(x, ei_psub(ei_pset1(1.5f), ei_pmul(half, ei_pmul(x,x)))); - return ei_pmul(_x,x); + Packet4f half = ei_pmul(_x, ei_pset1<Packet4f>(.5f)); + + /* select only the inverse sqrt of non-zero inputs */ + Packet4f non_zero_mask = _mm_cmpgt_ps(_x, ei_pset1<Packet4f>(std::numeric_limits<float>::epsilon())); + Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x)); + + x = ei_pmul(x, ei_psub(ei_pset1<Packet4f>(1.5f), ei_pmul(half, ei_pmul(x,x)))); + return ei_pmul(_x,x); } #endif // EIGEN_MATH_FUNCTIONS_SSE_H diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 91af346ed..53a9bcf56 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -53,13 +53,13 @@ template<> struct ei_is_arithmetic<__m128d> { enum { ret = true }; }; (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p)))))) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ - const Packet4f ei_p4f_##NAME = ei_pset1<float>(X) + const Packet4f ei_p4f_##NAME = ei_pset1<Packet4f>(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f ei_p4f_##NAME = _mm_castsi128_ps(ei_pset1<int>(X)) + const Packet4f ei_p4f_##NAME = _mm_castsi128_ps(ei_pset1<Packet4i>(X)) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ - const Packet4i ei_p4i_##NAME = ei_pset1<int>(X) + const Packet4i ei_p4i_##NAME = ei_pset1<Packet4i>(X) template<> struct ei_packet_traits<float> : ei_default_packet_traits @@ -107,11 +107,11 @@ template<> struct ei_unpacket_traits<Packet4i> { typedef int type; enum {size #ifdef __GNUC__ // Sometimes GCC implements _mm_set1_p* using multiple moves, // that is inefficient :( (e.g., see ei_gemm_pack_rhs) -template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) { Packet4f res = _mm_set_ss(from); return ei_vec4f_swizzle1(res,0,0,0,0); } -template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double& from) { +template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<Packet2d>(const double& from) { #ifdef EIGEN_VECTORIZE_SSE3 return _mm_loaddup_pd(&from); #else @@ -120,14 +120,14 @@ template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double& from) { #endif } #else -template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { return _mm_set1_ps(from); } -template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double& from) { return _mm_set1_pd(from); } +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) { return _mm_set1_ps(from); } +template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); } #endif -template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { return _mm_set1_epi32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); } -template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return _mm_add_ps(ei_pset1(a), _mm_set_ps(3,2,1,0)); } -template<> EIGEN_STRONG_INLINE Packet2d ei_plset<double>(const double& a) { return _mm_add_pd(ei_pset1(a),_mm_set_pd(1,0)); } -template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return _mm_add_epi32(ei_pset1(a),_mm_set_epi32(3,2,1,0)); } +template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return _mm_add_ps(ei_pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); } +template<> EIGEN_STRONG_INLINE Packet2d ei_plset<double>(const double& a) { return _mm_add_pd(ei_pset1<Packet2d>(a),_mm_set_pd(1,0)); } +template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return _mm_add_epi32(ei_pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d ei_padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); } @@ -174,7 +174,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, con template<> EIGEN_STRONG_INLINE Packet2d ei_pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) { ei_assert(false && "packet integer division are not supported by SSE"); - return ei_pset1<int>(0); + return ei_pset1<Packet4i>(0); } // for some weird raisons, it has to be overloaded for packet of integers @@ -214,14 +214,14 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a, template<> EIGEN_STRONG_INLINE Packet2d ei_pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } -template<> EIGEN_STRONG_INLINE Packet2d ei_pload<double>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } -template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); } +template<> EIGEN_STRONG_INLINE Packet4f ei_pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } +template<> EIGEN_STRONG_INLINE Packet2d ei_pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); } #if defined(_MSC_VER) - template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); } - template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<double>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } - template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); } + template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); } + template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } + template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); } #else // Fast unaligned loads. Note that here we cannot directly use intrinsics: this would // require pointer casting to incompatible pointer types and leads to invalid code @@ -229,7 +229,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_D // a correct instruction dependency. // TODO: do the same for MSVC (ICC is compatible) // NOTE: with the code below, MSVC's compiler crashes! -template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) +template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD __m128d res; @@ -237,7 +237,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) res = _mm_loadh_pd(res, (const double*)(from+2)) ; return _mm_castpd_ps(res); } -template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from) +template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD __m128d res; @@ -245,7 +245,7 @@ template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from) res = _mm_loadh_pd(res,from+1); return res; } -template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) +template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD __m128d res; |