mixing types in product step 2:

* pload* and pset1 are now templated on the packet type
* gemv routines are now embedded into a structure with a consistent API with respect to gemm
* some configurations of vector * matrix and matrix * matrix work fine, some need more work...
author: Gael Guennebaud <g.gael@free.fr> 2010-07-11 15:48:30 +0200
committer: Gael Guennebaud <g.gael@free.fr> 2010-07-11 15:48:30 +0200
commit: ff96c94043d575e4d0dd477c1ed2487e33f79627 (patch)
tree: 5e9736916779fdacd431c2591a3ec1f77333e505 /Eigen/src/Core/arch
parent: 4161b8be6772f2b7338458c9932d7417797966bb (diff)
7 files changed, 67 insertions, 56 deletions
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 2dba95a2f..ecada02f4 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -63,7 +63,7 @@ template<> struct ei_packet_traits<std::complex<float> >  : ei_default_packet_tr
 
 template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
 
-template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>&  from)
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<Packet2cf>(const std::complex<float>&  from)
 {
   Packet2cf res;
   /* On AltiVec we cannot load 64-bit registers, so wa have to take care of alignment */
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index a3ceed8e8..8205beae5 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -59,13 +59,13 @@ typedef __vector unsigned char  Packet16uc;
   Packet4i ei_p4i_##NAME = vec_splat_s32(X)
 
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
-  Packet4f ei_p4f_##NAME = ei_pset1<float>(X)
+  Packet4f ei_p4f_##NAME = ei_pset1<Packet4f>(X)
 
 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
   Packet4f ei_p4f_##NAME = vreinterpretq_f32_u32(ei_pset1<int>(X))
 
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
-  Packet4i ei_p4i_##NAME = ei_pset1<int>(X)
+  Packet4i ei_p4i_##NAME = ei_pset1<Packet4i>(X)
 
 #define DST_CHAN 1
 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
@@ -158,7 +158,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
   return s;
 }
 */
-template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float&  from) {
+template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float&  from) {
   // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
   float EIGEN_ALIGN16 af[4];
   af[0] = from;
@@ -167,7 +167,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float&  from) {
   return vc;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int&    from)   {
+template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<Packet4i>(const int&    from)   {
   int EIGEN_ALIGN16 ai[4];
   ai[0] = from;
   Packet4i vc = vec_ld(0, ai);
@@ -175,8 +175,8 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int&    from)   {
   return vc;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return vec_add(ei_pset1(a), ei_p4f_COUNTDOWN); }
-template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a)     { return vec_add(ei_pset1(a), ei_p4i_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return vec_add(ei_pset1<Packet4f>(a), ei_p4f_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a)     { return vec_add(ei_pset1<Packet4i>(a), ei_p4i_COUNTDOWN); }
 
 template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i ei_padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }
@@ -241,7 +241,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, con
 
 template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
 { ei_assert(false && "packet integer division are not supported by AltiVec");
-  return ei_pset1<int>(0);
+  return ei_pset1<Packet4i>(0);
 }
 
 // for some weird raisons, it has to be overloaded for packet of integers
@@ -267,10 +267,10 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pxor<Packet4i>(const Packet4i& a, con
 template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
 template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
 
-template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
-template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
 
-template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from)
+template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu<Packet4f>(const float* from)
 {
   EIGEN_DEBUG_ALIGNED_LOAD
   // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
@@ -282,7 +282,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from)
   return (Packet4f) vec_perm(MSQ, LSQ, mask);           // align the data
 
 }
-template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from)
+template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int* from)
 {
   EIGEN_DEBUG_ALIGNED_LOAD
   // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index bf68a2bbb..6d9e8da85 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -58,7 +58,7 @@ template<> struct ei_packet_traits<std::complex<float> >  : ei_default_packet_tr
 
 template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
 
-template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>&  from)
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<Packet2cf>(const std::complex<float>&  from)
 {
   float32x2_t r64;
   r64 = vld1_f32((float *)&from);
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 5b0d6ab12..b899fece1 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -45,13 +45,13 @@ typedef float32x4_t Packet4f;
 typedef int32x4_t   Packet4i;
 
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
-  const Packet4f ei_p4f_##NAME = ei_pset1<float>(X)
+  const Packet4f ei_p4f_##NAME = ei_pset1<Packet4f>(X)
 
 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
   const Packet4f ei_p4f_##NAME = vreinterpretq_f32_u32(ei_pset1<int>(X))
 
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
-  const Packet4i ei_p4i_##NAME = ei_pset1<int>(X)
+  const Packet4i ei_p4i_##NAME = ei_pset1<Packet4i>(X)
 
 #ifndef __pld
 #define __pld(x) asm volatile ( "   pld [%[addr]]\n" :: [addr] "r" (x) : "cc" );
@@ -88,18 +88,18 @@ template<> struct ei_packet_traits<int>    : ei_default_packet_traits
 template<> struct ei_unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
 template<> struct ei_unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
 
-template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float&  from) { return vdupq_n_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int&    from)   { return vdupq_n_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float&  from) { return vdupq_n_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<Packet4i>(const int&    from)   { return vdupq_n_s32(from); }
 
 template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a)
 {
   Packet4f countdown = { 3, 2, 1, 0 };
-  return vaddq_f32(ei_pset1(a), countdown);
+  return vaddq_f32(ei_pset1<Packet4f>(a), countdown);
 }
 template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a)
 {
   Packet4i countdown = { 3, 2, 1, 0 };
-  return vaddq_s32(ei_pset1(a), countdown);
+  return vaddq_s32(ei_pset1<Packet4i>(a), countdown);
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
@@ -137,7 +137,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, con
 }
 template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
 { ei_assert(false && "packet integer division are not supported by NEON");
-  return ei_pset1<int>(0);
+  return ei_pset1<Packet4i>(0);
 }
 
 // for some weird raisons, it has to be overloaded for packet of integers
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 9d32ede0e..6c72293fc 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -89,15 +89,15 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_por    <Packet2cf>(const Packet2cf&
 template<> EIGEN_STRONG_INLINE Packet2cf ei_pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf ei_pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); }
 
-template<> EIGEN_STRONG_INLINE Packet2cf ei_pload <std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(ei_pload(&ei_real_ref(*from))); }
-template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu<std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ei_ploadu(&ei_real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(ei_pload<Packet4f>(&ei_real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ei_ploadu<Packet4f>(&ei_real_ref(*from))); }
 
 template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstore(&ei_real_ref(*to), from.v); }
 template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu(&ei_real_ref(*to), from.v); }
 
 template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<float> >(const std::complex<float> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 
-template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>&  from)
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<Packet2cf>(const std::complex<float>&  from)
 {
   Packet2cf res;
   res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
@@ -276,10 +276,12 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_pxor   <Packet1cd>(const Packet1cd&
 template<> EIGEN_STRONG_INLINE Packet1cd ei_pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); }
 
 // FIXME force unaligned load, this is a temporary fix 
-template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1<std::complex<double> >(const std::complex<double>&  from)
-{ /* here we really have to use unaligned loads :( */ return ei_ploadu(&from); }
+template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <Packet1cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<Packet1cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1<Packet1cd>(const std::complex<double>&  from)
+{ /* here we really have to use unaligned loads :( */ return ei_ploadu<Packet1cd>(&from); }
 
 // FIXME force unaligned store, this is a temporary fix
 template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstoreu((double*)to, from.v); }
@@ -387,6 +389,15 @@ template<> struct ei_conj_helper<Packet2d, Packet1cd, false,false>
   { return Packet1cd(ei_pmul(x, y.v)); }
 };
 
+template<> struct ei_conj_helper<Packet1cd, Packet2d, false,false>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
+  { return ei_padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
+  { return Packet1cd(ei_pmul(x.v, y)); }
+};
+
 template<> EIGEN_STRONG_INLINE Packet1cd ei_pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
   // TODO optimize it for SSE3 and 4
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 742bfa92f..e4ca82985 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -378,14 +378,14 @@ Packet4f ei_pcos<Packet4f>(const Packet4f& _x)
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f ei_psqrt<Packet4f>(const Packet4f& _x)
 {
-	Packet4f half = ei_pmul(_x, ei_pset1(.5f));
-	
-	/* select only the inverse sqrt of non-zero inputs */
-	Packet4f non_zero_mask = _mm_cmpgt_ps(_x, ei_pset1(std::numeric_limits<float>::epsilon()));
-	Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x));
-
-	x = ei_pmul(x, ei_psub(ei_pset1(1.5f), ei_pmul(half, ei_pmul(x,x))));
-	return ei_pmul(_x,x);
+  Packet4f half = ei_pmul(_x, ei_pset1<Packet4f>(.5f));
+
+  /* select only the inverse sqrt of non-zero inputs */
+  Packet4f non_zero_mask = _mm_cmpgt_ps(_x, ei_pset1<Packet4f>(std::numeric_limits<float>::epsilon()));
+  Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x));
+
+  x = ei_pmul(x, ei_psub(ei_pset1<Packet4f>(1.5f), ei_pmul(half, ei_pmul(x,x))));
+  return ei_pmul(_x,x);
 }
 
 #endif // EIGEN_MATH_FUNCTIONS_SSE_H
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 91af346ed..53a9bcf56 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -53,13 +53,13 @@ template<> struct ei_is_arithmetic<__m128d> { enum { ret = true }; };
   (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
 
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
-  const Packet4f ei_p4f_##NAME = ei_pset1<float>(X)
+  const Packet4f ei_p4f_##NAME = ei_pset1<Packet4f>(X)
 
 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
-  const Packet4f ei_p4f_##NAME = _mm_castsi128_ps(ei_pset1<int>(X))
+  const Packet4f ei_p4f_##NAME = _mm_castsi128_ps(ei_pset1<Packet4i>(X))
 
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
-  const Packet4i ei_p4i_##NAME = ei_pset1<int>(X)
+  const Packet4i ei_p4i_##NAME = ei_pset1<Packet4i>(X)
 
 
 template<> struct ei_packet_traits<float>  : ei_default_packet_traits
@@ -107,11 +107,11 @@ template<> struct ei_unpacket_traits<Packet4i> { typedef int    type; enum {size
 #ifdef __GNUC__
 // Sometimes GCC implements _mm_set1_p* using multiple moves,
 // that is inefficient :( (e.g., see ei_gemm_pack_rhs)
-template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float&  from) {
+template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float&  from) {
   Packet4f res = _mm_set_ss(from);
   return ei_vec4f_swizzle1(res,0,0,0,0);
 }
-template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double&  from) {
+template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<Packet2d>(const double&  from) {
 #ifdef EIGEN_VECTORIZE_SSE3
   return _mm_loaddup_pd(&from);
 #else
@@ -120,14 +120,14 @@ template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double&  from) {
 #endif
 }
 #else
-template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float&  from) { return _mm_set1_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double& from) { return _mm_set1_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float&  from) { return _mm_set1_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
 #endif
-template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int&    from) { return _mm_set1_epi32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<Packet4i>(const int&    from) { return _mm_set1_epi32(from); }
 
-template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return _mm_add_ps(ei_pset1(a), _mm_set_ps(3,2,1,0)); }
-template<> EIGEN_STRONG_INLINE Packet2d ei_plset<double>(const double& a) { return _mm_add_pd(ei_pset1(a),_mm_set_pd(1,0)); }
-template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return _mm_add_epi32(ei_pset1(a),_mm_set_epi32(3,2,1,0)); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return _mm_add_ps(ei_pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_plset<double>(const double& a) { return _mm_add_pd(ei_pset1<Packet2d>(a),_mm_set_pd(1,0)); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return _mm_add_epi32(ei_pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
 
 template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d ei_padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
@@ -174,7 +174,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, con
 template<> EIGEN_STRONG_INLINE Packet2d ei_pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
 { ei_assert(false && "packet integer division are not supported by SSE");
-  return ei_pset1<int>(0);
+  return ei_pset1<Packet4i>(0);
 }
 
 // for some weird raisons, it has to be overloaded for packet of integers
@@ -214,14 +214,14 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a,
 template<> EIGEN_STRONG_INLINE Packet2d ei_pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float*    from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet2d ei_pload<double>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
 
 #if defined(_MSC_VER)
-  template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float*   from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); }
-  template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<double>(const double*  from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
-  template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
+  template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu<Packet4f>(const float*  from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); }
+  template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
+  template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int*    from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
 #else
 // Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
 // require pointer casting to incompatible pointer types and leads to invalid code
@@ -229,7 +229,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_D
 // a correct instruction dependency.
 // TODO: do the same for MSVC (ICC is compatible)
 // NOTE: with the code below, MSVC's compiler crashes!
-template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from)
+template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu<Packet4f>(const float* from)
 {
   EIGEN_DEBUG_UNALIGNED_LOAD
   __m128d res;
@@ -237,7 +237,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from)
   res =  _mm_loadh_pd(res, (const double*)(from+2)) ;
   return _mm_castpd_ps(res);
 }
-template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from)
+template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<Packet2d>(const double* from)
 {
   EIGEN_DEBUG_UNALIGNED_LOAD
   __m128d res;
@@ -245,7 +245,7 @@ template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from)
   res = _mm_loadh_pd(res,from+1);
   return res;
 }
-template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from)
+template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int* from)
 {
   EIGEN_DEBUG_UNALIGNED_LOAD
   __m128d res;
author	Gael Guennebaud <g.gael@free.fr>	2010-07-11 15:48:30 +0200
committer	Gael Guennebaud <g.gael@free.fr>	2010-07-11 15:48:30 +0200
commit	ff96c94043d575e4d0dd477c1ed2487e33f79627 (patch)
tree	5e9736916779fdacd431c2591a3ec1f77333e505 /Eigen/src/Core/arch
parent	4161b8be6772f2b7338458c9932d7417797966bb (diff)