aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2009-03-25 12:26:13 +0000
committerGravatar Gael Guennebaud <g.gael@free.fr>2009-03-25 12:26:13 +0000
commit17860e578cab13d5aacf8b5e6e373e59403352ba (patch)
tree285e8ef963751a1a96ea127457ff51cf44a6cf16 /Eigen/src
parent64fbd93cd904790e831aa5404698c5aa30f54be4 (diff)
add SSE2 versions of sin, cos, log, exp using code from Julien
Pommier. They are for float only, and they return exactly the same result as the standard versions in about 90% of the cases. Otherwise the max error is below 1e-7. However, for very large values (>1e3) the accuracy of sin and cos slightly decrease. They are about 3 or 4 times faster than 4 calls to their respective standard versions. So, is it ok to enable them by default in their respective functors ?
Diffstat (limited to 'Eigen/src')
-rw-r--r--Eigen/src/Core/GenericPacketMath.h63
-rw-r--r--Eigen/src/Core/arch/AltiVec/PacketMath.h6
-rw-r--r--Eigen/src/Core/arch/SSE/PacketMath.h267
-rw-r--r--Eigen/src/Core/arch/SSE/TranscendentalFunctions.h362
4 files changed, 582 insertions, 116 deletions
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index d50899f10..21b4fb159 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -34,6 +34,37 @@
* of generic vectorized code.
*/
+struct ei_default_packet_traits
+{
+ enum {
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasMin = 1,
+ HasMax = 1,
+
+ HasDiv = 0,
+ HasExp = 0,
+ HasLog = 0,
+ HasPow = 0,
+
+ HasSin = 0,
+ HasCos = 0,
+ HasTan = 0,
+ HasASin = 0,
+ HasACos = 0,
+ HasATan = 0
+ };
+};
+
+template<typename T> struct ei_packet_traits : ei_default_packet_traits
+{
+ typedef T type;
+ enum {size=1};
+};
+
/** \internal \returns a + b (coeff-wise) */
template<typename Packet> inline Packet
ei_padd(const Packet& a,
@@ -72,6 +103,22 @@ ei_pmax(const Packet& a,
template<typename Packet> inline Packet
ei_pabs(const Packet& a) { return ei_abs(a); }
+/** \internal \returns the bitwise and of \a a and \a b */
+template<typename Packet> inline Packet
+ei_pand(const Packet& a, const Packet& b) { return a & b; }
+
+/** \internal \returns the bitwise or of \a a and \a b */
+template<typename Packet> inline Packet
+ei_por(const Packet& a, const Packet& b) { return a | b; }
+
+/** \internal \returns the bitwise xor of \a a and \a b */
+template<typename Packet> inline Packet
+ei_pxor(const Packet& a, const Packet& b) { return a ^ b; }
+
+/** \internal \returns the bitwise andnot of \a a and \a b */
+template<typename Packet> inline Packet
+ei_pandnot(const Packet& a, const Packet& b) { return a & (~b); }
+
/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
template<typename Scalar> inline typename ei_packet_traits<Scalar>::type
ei_pload(const Scalar* from) { return *from; }
@@ -120,6 +167,22 @@ template<typename Packet> inline typename ei_unpacket_traits<Packet>::type ei_pr
template<typename Packet> inline Packet ei_preverse(const Packet& a)
{ return a; }
+/**************************
+* Transcendental functions
+***************************/
+
+/** \internal \returns the sin of \a a (coeff-wise) */
+template<typename Packet> inline Packet ei_psin(const Packet& a) { return ei_sin(a); }
+
+/** \internal \returns the cos of \a a (coeff-wise) */
+template<typename Packet> inline Packet ei_pcos(const Packet& a) { return ei_cos(a); }
+
+/** \internal \returns the exp of \a a (coeff-wise) */
+template<typename Packet> inline Packet ei_pexp(const Packet& a) { return ei_exp(a); }
+
+/** \internal \returns the log of \a a (coeff-wise) */
+template<typename Packet> inline Packet ei_plog(const Packet& a) { return ei_log(a); }
+
/***************************************************************************
* The following functions might not have to be overwritten for vectorized types
***************************************************************************/
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index d183f048f..e87dd1962 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -45,8 +45,10 @@ typedef __vector __bool int v4bi;
#define USE_CONST_v1i_ const v4ui v1i_ = vec_splat_u32(-1)
#define USE_CONST_v0f_ USE_CONST_v1i_; const v4f v0f_ = (v4f) vec_sl(v1i_, v1i_)
-template<> struct ei_packet_traits<float> { typedef v4f type; enum {size=4}; };
-template<> struct ei_packet_traits<int> { typedef v4i type; enum {size=4}; };
+template<> struct ei_packet_traits<float> : ei_default_packet_traits
+{ typedef v4f type; enum {size=4}; };
+template<> struct ei_packet_traits<int> : ei_default_packet_traits
+{ typedef v4i type; enum {size=4}; };
template<> struct ei_unpacket_traits<v4f> { typedef float type; enum {size=4}; };
template<> struct ei_unpacket_traits<v4i> { typedef int type; enum {size=4}; };
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index f8237b524..d49479c16 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -1,7 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra. Eigen itself is part of the KDE project.
//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
@@ -29,6 +29,10 @@
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
#endif
+typedef __m128 Packet4f;
+typedef __m128i Packet4i;
+typedef __m128d Packet2d;
+
#define ei_vec4f_swizzle1(v,p,q,r,s) \
(_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))
@@ -40,47 +44,67 @@
#define ei_vec4i_swizzle2(a,b,p,q,r,s) \
(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
+
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
+ const Packet4f ei_p4f_##NAME = ei_pset1<float>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+ const Packet4f ei_p4f_##NAME = _mm_castsi128_ps(ei_pset1<int>(X))
+
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
+ const Packet4i ei_p4i_##NAME = ei_pset1<int>(X)
-template<> struct ei_packet_traits<float> { typedef __m128 type; enum {size=4}; };
-template<> struct ei_packet_traits<double> { typedef __m128d type; enum {size=2}; };
-template<> struct ei_packet_traits<int> { typedef __m128i type; enum {size=4}; };
+template<> struct ei_packet_traits<float> : ei_default_packet_traits
+{
+ typedef Packet4f type; enum {size=4};
+ enum {
+ HasSin = 1,
+ HasCos = 1,
+ HasLog = 1,
+ HasExp = 1
+ };
+};
+template<> struct ei_packet_traits<double> : ei_default_packet_traits
+{ typedef Packet2d type; enum {size=2}; };
+template<> struct ei_packet_traits<int> : ei_default_packet_traits
+{ typedef Packet4i type; enum {size=4}; };
-template<> struct ei_unpacket_traits<__m128> { typedef float type; enum {size=4}; };
-template<> struct ei_unpacket_traits<__m128d> { typedef double type; enum {size=2}; };
-template<> struct ei_unpacket_traits<__m128i> { typedef int type; enum {size=4}; };
+template<> struct ei_unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
+template<> struct ei_unpacket_traits<Packet2d> { typedef double type; enum {size=2}; };
+template<> struct ei_unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
-template<> EIGEN_STRONG_INLINE __m128 ei_pset1<float>(const float& from) { return _mm_set1_ps(from); }
-template<> EIGEN_STRONG_INLINE __m128d ei_pset1<double>(const double& from) { return _mm_set1_pd(from); }
-template<> EIGEN_STRONG_INLINE __m128i ei_pset1<int>(const int& from) { return _mm_set1_epi32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { return _mm_set1_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double& from) { return _mm_set1_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { return _mm_set1_epi32(from); }
-template<> EIGEN_STRONG_INLINE __m128 ei_padd<__m128>(const __m128& a, const __m128& b) { return _mm_add_ps(a,b); }
-template<> EIGEN_STRONG_INLINE __m128d ei_padd<__m128d>(const __m128d& a, const __m128d& b) { return _mm_add_pd(a,b); }
-template<> EIGEN_STRONG_INLINE __m128i ei_padd<__m128i>(const __m128i& a, const __m128i& b) { return _mm_add_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
-template<> EIGEN_STRONG_INLINE __m128 ei_psub<__m128>(const __m128& a, const __m128& b) { return _mm_sub_ps(a,b); }
-template<> EIGEN_STRONG_INLINE __m128d ei_psub<__m128d>(const __m128d& a, const __m128d& b) { return _mm_sub_pd(a,b); }
-template<> EIGEN_STRONG_INLINE __m128i ei_psub<__m128i>(const __m128i& a, const __m128i& b) { return _mm_sub_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
-template<> EIGEN_STRONG_INLINE __m128 ei_pnegate(const __m128& a)
+template<> EIGEN_STRONG_INLINE Packet4f ei_pnegate(const Packet4f& a)
{
- const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
+ const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
return _mm_xor_ps(a,mask);
}
-template<> EIGEN_STRONG_INLINE __m128d ei_pnegate(const __m128d& a)
+template<> EIGEN_STRONG_INLINE Packet2d ei_pnegate(const Packet2d& a)
{
- const __m128d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
+ const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
return _mm_xor_pd(a,mask);
}
-template<> EIGEN_STRONG_INLINE __m128i ei_pnegate(const __m128i& a)
+template<> EIGEN_STRONG_INLINE Packet4i ei_pnegate(const Packet4i& a)
{
return ei_psub(_mm_setr_epi32(0,0,0,0), a);
}
-template<> EIGEN_STRONG_INLINE __m128 ei_pmul<__m128>(const __m128& a, const __m128& b) { return _mm_mul_ps(a,b); }
-template<> EIGEN_STRONG_INLINE __m128d ei_pmul<__m128d>(const __m128d& a, const __m128d& b) { return _mm_mul_pd(a,b); }
-template<> EIGEN_STRONG_INLINE __m128i ei_pmul<__m128i>(const __m128i& a, const __m128i& b)
+template<> EIGEN_STRONG_INLINE Packet4f ei_pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
{
- // this version is very slightly faster than 4 scalar products
+ // this version is slightly faster than 4 scalar products
return ei_vec4i_swizzle1(
ei_vec4i_swizzle2(
_mm_mul_epu32(a,b),
@@ -90,94 +114,109 @@ template<> EIGEN_STRONG_INLINE __m128i ei_pmul<__m128i>(const __m128i& a, const
0,2,1,3);
}
-template<> EIGEN_STRONG_INLINE __m128 ei_pdiv<__m128>(const __m128& a, const __m128& b) { return _mm_div_ps(a,b); }
-template<> EIGEN_STRONG_INLINE __m128d ei_pdiv<__m128d>(const __m128d& a, const __m128d& b) { return _mm_div_pd(a,b); }
-template<> EIGEN_STRONG_INLINE __m128i ei_pdiv<__m128i>(const __m128i& /*a*/, const __m128i& /*b*/)
+template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
{ ei_assert(false && "packet integer division are not supported by SSE");
- __m128i dummy = ei_pset1<int>(0);
- return dummy;
+ return ei_pset1<int>(0);
}
-// for some weird raisons, it has to be overloaded for packet integer
-template<> EIGEN_STRONG_INLINE __m128i ei_pmadd(const __m128i& a, const __m128i& b, const __m128i& c) { return ei_padd(ei_pmul(a,b), c); }
+// for some weird reasons, it has to be overloaded for packets of integers
+template<> EIGEN_STRONG_INLINE Packet4i ei_pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return ei_padd(ei_pmul(a,b), c); }
-template<> EIGEN_STRONG_INLINE __m128 ei_pmin<__m128>(const __m128& a, const __m128& b) { return _mm_min_ps(a,b); }
-template<> EIGEN_STRONG_INLINE __m128d ei_pmin<__m128d>(const __m128d& a, const __m128d& b) { return _mm_min_pd(a,b); }
-template<> EIGEN_STRONG_INLINE __m128i ei_pmin<__m128i>(const __m128i& a, const __m128i& b)
+template<> EIGEN_STRONG_INLINE Packet4f ei_pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
{
// after some bench, this version *is* faster than a scalar implementation
- __m128i mask = _mm_cmplt_epi32(a,b);
+ Packet4i mask = _mm_cmplt_epi32(a,b);
return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
}
-template<> EIGEN_STRONG_INLINE __m128 ei_pmax<__m128>(const __m128& a, const __m128& b) { return _mm_max_ps(a,b); }
-template<> EIGEN_STRONG_INLINE __m128d ei_pmax<__m128d>(const __m128d& a, const __m128d& b) { return _mm_max_pd(a,b); }
-template<> EIGEN_STRONG_INLINE __m128i ei_pmax<__m128i>(const __m128i& a, const __m128i& b)
+template<> EIGEN_STRONG_INLINE Packet4f ei_pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
{
// after some bench, this version *is* faster than a scalar implementation
- __m128i mask = _mm_cmpgt_epi32(a,b);
+ Packet4i mask = _mm_cmpgt_epi32(a,b);
return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
}
-template<> EIGEN_STRONG_INLINE __m128 ei_pload<float>(const float* from) { return _mm_load_ps(from); }
-template<> EIGEN_STRONG_INLINE __m128d ei_pload<double>(const double* from) { return _mm_load_pd(from); }
-template<> EIGEN_STRONG_INLINE __m128i ei_pload<int>(const int* from) { return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { return _mm_load_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pload<double>(const double* from) { return _mm_load_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
-template<> EIGEN_STRONG_INLINE __m128 ei_ploadu(const float* from) {
- __m128 r;
+template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) {
+ Packet4f r;
r = _mm_castpd_ps(_mm_load_sd((double*)(from)));
r = _mm_loadh_pi(r, (const __m64*)(from+2));
return r;
}
-template<> EIGEN_STRONG_INLINE __m128d ei_ploadu<double>(const double* from) { return _mm_castps_pd(ei_ploadu((const float*)(from))); }
-template<> EIGEN_STRONG_INLINE __m128i ei_ploadu<int>(const int* from) { return _mm_castpd_si128(ei_ploadu((const double*)(from))); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<double>(const double* from) { return _mm_castps_pd(ei_ploadu((const float*)(from))); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { return _mm_castpd_si128(ei_ploadu((const double*)(from))); }
-template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const __m128& from) { _mm_store_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void ei_pstore<double>(double* to, const __m128d& from) { _mm_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int* to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const Packet4f& from) { _mm_store_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void ei_pstore<double>(double* to, const Packet2d& from) { _mm_store_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int* to, const Packet4i& from) { _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
-template<> EIGEN_STRONG_INLINE void ei_pstoreu<double>(double* to, const __m128d& from) {
+template<> EIGEN_STRONG_INLINE void ei_pstoreu<double>(double* to, const Packet2d& from) {
_mm_storel_pd((to), from);
_mm_storeh_pd((to+1), from);
}
-template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float* to, const __m128& from) { ei_pstoreu((double*)to, _mm_castps_pd(from)); }
-template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int* to, const __m128i& from) { ei_pstoreu((double*)to, _mm_castsi128_pd(from)); }
+template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float* to, const Packet4f& from) { ei_pstoreu((double*)to, _mm_castps_pd(from)); }
+template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int* to, const Packet4i& from) { ei_pstoreu((double*)to, _mm_castsi128_pd(from)); }
#ifdef _MSC_VER
// this fix internal compilation error
-template<> EIGEN_STRONG_INLINE float ei_pfirst<__m128>(const __m128& a) { float x = _mm_cvtss_f32(a); return x; }
-template<> EIGEN_STRONG_INLINE double ei_pfirst<__m128d>(const __m128d& a) { double x = _mm_cvtsd_f64(a); return x; }
-template<> EIGEN_STRONG_INLINE int ei_pfirst<__m128i>(const __m128i& a) { int x = _mm_cvtsi128_si32(a); return x; }
+template<> EIGEN_STRONG_INLINE float ei_pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
+template<> EIGEN_STRONG_INLINE double ei_pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
+template<> EIGEN_STRONG_INLINE int ei_pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
#else
-template<> EIGEN_STRONG_INLINE float ei_pfirst<__m128>(const __m128& a) { return _mm_cvtss_f32(a); }
-template<> EIGEN_STRONG_INLINE double ei_pfirst<__m128d>(const __m128d& a) { return _mm_cvtsd_f64(a); }
-template<> EIGEN_STRONG_INLINE int ei_pfirst<__m128i>(const __m128i& a) { return _mm_cvtsi128_si32(a); }
+template<> EIGEN_STRONG_INLINE float ei_pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
+template<> EIGEN_STRONG_INLINE double ei_pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
+template<> EIGEN_STRONG_INLINE int ei_pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
#endif
-template<> EIGEN_STRONG_INLINE __m128 ei_preverse(const __m128& a)
+template<> EIGEN_STRONG_INLINE Packet4f ei_preverse(const Packet4f& a)
{ return _mm_shuffle_ps(a,a,0x1B); }
-template<> EIGEN_STRONG_INLINE __m128d ei_preverse(const __m128d& a)
+template<> EIGEN_STRONG_INLINE Packet2d ei_preverse(const Packet2d& a)
{ return _mm_shuffle_pd(a,a,0x1); }
-template<> EIGEN_STRONG_INLINE __m128i ei_preverse(const __m128i& a)
+template<> EIGEN_STRONG_INLINE Packet4i ei_preverse(const Packet4i& a)
{ return _mm_shuffle_epi32(a,0x1B); }
-template<> EIGEN_STRONG_INLINE __m128 ei_pabs(const __m128& a)
+template<> EIGEN_STRONG_INLINE Packet4f ei_pabs(const Packet4f& a)
{
- const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
+ const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
return _mm_and_ps(a,mask);
}
-template<> EIGEN_STRONG_INLINE __m128d ei_pabs(const __m128d& a)
+template<> EIGEN_STRONG_INLINE Packet2d ei_pabs(const Packet2d& a)
{
- const __m128d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
+ const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
return _mm_and_pd(a,mask);
}
-template<> EIGEN_STRONG_INLINE __m128i ei_pabs(const __m128i& a)
+template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a)
{
#ifdef __SSSE3__
return _mm_abs_epi32(a);
#else
- __m128i aux = _mm_srai_epi32(a,31);
+ Packet4i aux = _mm_srai_epi32(a,31);
return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
#endif
}
@@ -185,49 +224,49 @@ template<> EIGEN_STRONG_INLINE __m128i ei_pabs(const __m128i& a)
#ifdef __SSE3__
// TODO implement SSE2 versions as well as integer versions
-template<> EIGEN_STRONG_INLINE __m128 ei_preduxp<__m128>(const __m128* vecs)
+template<> EIGEN_STRONG_INLINE Packet4f ei_preduxp<Packet4f>(const Packet4f* vecs)
{
return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
}
-template<> EIGEN_STRONG_INLINE __m128d ei_preduxp<__m128d>(const __m128d* vecs)
+template<> EIGEN_STRONG_INLINE Packet2d ei_preduxp<Packet2d>(const Packet2d* vecs)
{
return _mm_hadd_pd(vecs[0], vecs[1]);
}
// SSSE3 version:
-// EIGEN_STRONG_INLINE __m128i ei_preduxp(const __m128i* vecs)
+// EIGEN_STRONG_INLINE Packet4i ei_preduxp(const Packet4i* vecs)
// {
// return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
// }
-template<> EIGEN_STRONG_INLINE float ei_predux<__m128>(const __m128& a)
+template<> EIGEN_STRONG_INLINE float ei_predux<Packet4f>(const Packet4f& a)
{
- __m128 tmp0 = _mm_hadd_ps(a,a);
+ Packet4f tmp0 = _mm_hadd_ps(a,a);
return ei_pfirst(_mm_hadd_ps(tmp0, tmp0));
}
-template<> EIGEN_STRONG_INLINE double ei_predux<__m128d>(const __m128d& a) { return ei_pfirst(_mm_hadd_pd(a, a)); }
+template<> EIGEN_STRONG_INLINE double ei_predux<Packet2d>(const Packet2d& a) { return ei_pfirst(_mm_hadd_pd(a, a)); }
// SSSE3 version:
-// EIGEN_STRONG_INLINE float ei_predux(const __m128i& a)
+// EIGEN_STRONG_INLINE float ei_predux(const Packet4i& a)
// {
-// __m128i tmp0 = _mm_hadd_epi32(a,a);
+// Packet4i tmp0 = _mm_hadd_epi32(a,a);
// return ei_pfirst(_mm_hadd_epi32(tmp0, tmp0));
// }
#else
// SSE2 versions
-template<> EIGEN_STRONG_INLINE float ei_predux<__m128>(const __m128& a)
+template<> EIGEN_STRONG_INLINE float ei_predux<Packet4f>(const Packet4f& a)
{
- __m128 tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
+ Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
return ei_pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
-template<> EIGEN_STRONG_INLINE double ei_predux<__m128d>(const __m128d& a)
+template<> EIGEN_STRONG_INLINE double ei_predux<Packet2d>(const Packet2d& a)
{
return ei_pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
}
-template<> EIGEN_STRONG_INLINE __m128 ei_preduxp<__m128>(const __m128* vecs)
+template<> EIGEN_STRONG_INLINE Packet4f ei_preduxp<Packet4f>(const Packet4f* vecs)
{
- __m128 tmp0, tmp1, tmp2;
+ Packet4f tmp0, tmp1, tmp2;
tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
@@ -239,21 +278,21 @@ template<> EIGEN_STRONG_INLINE __m128 ei_preduxp<__m128>(const __m128* vecs)
return _mm_add_ps(tmp0, tmp2);
}
-template<> EIGEN_STRONG_INLINE __m128d ei_preduxp<__m128d>(const __m128d* vecs)
+template<> EIGEN_STRONG_INLINE Packet2d ei_preduxp<Packet2d>(const Packet2d* vecs)
{
return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
}
#endif // SSE3
-template<> EIGEN_STRONG_INLINE int ei_predux<__m128i>(const __m128i& a)
+template<> EIGEN_STRONG_INLINE int ei_predux<Packet4i>(const Packet4i& a)
{
- __m128i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
+ Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
return ei_pfirst(tmp) + ei_pfirst(_mm_shuffle_epi32(tmp, 1));
}
-template<> EIGEN_STRONG_INLINE __m128i ei_preduxp<__m128i>(const __m128i* vecs)
+template<> EIGEN_STRONG_INLINE Packet4i ei_preduxp<Packet4i>(const Packet4i* vecs)
{
- __m128i tmp0, tmp1, tmp2;
+ Packet4i tmp0, tmp1, tmp2;
tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
@@ -268,16 +307,16 @@ template<> EIGEN_STRONG_INLINE __m128i ei_preduxp<__m128i>(const __m128i* vecs)
// Other reduction functions:
// mul
-template<> EIGEN_STRONG_INLINE float ei_predux_mul<__m128>(const __m128& a)
+template<> EIGEN_STRONG_INLINE float ei_predux_mul<Packet4f>(const Packet4f& a)
{
- __m128 tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
+ Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
return ei_pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
-template<> EIGEN_STRONG_INLINE double ei_predux_mul<__m128d>(const __m128d& a)
+template<> EIGEN_STRONG_INLINE double ei_predux_mul<Packet2d>(const Packet2d& a)
{
return ei_pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
}
-template<> EIGEN_STRONG_INLINE int ei_predux_mul<__m128i>(const __m128i& a)
+template<> EIGEN_STRONG_INLINE int ei_predux_mul<Packet4i>(const Packet4i& a)
{
// after some experiments, it is seems this is the fastest way to implement it
// for GCC (eg., reusing ei_pmul is very slow !)
@@ -288,16 +327,16 @@ template<> EIGEN_STRONG_INLINE int ei_predux_mul<__m128i>(const __m128i& a)
}
// min
-template<> EIGEN_STRONG_INLINE float ei_predux_min<__m128>(const __m128& a)
+template<> EIGEN_STRONG_INLINE float ei_predux_min<Packet4f>(const Packet4f& a)
{
- __m128 tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
+ Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
return ei_pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
-template<> EIGEN_STRONG_INLINE double ei_predux_min<__m128d>(const __m128d& a)
+template<> EIGEN_STRONG_INLINE double ei_predux_min<Packet2d>(const Packet2d& a)
{
return ei_pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
}
-template<> EIGEN_STRONG_INLINE int ei_predux_min<__m128i>(const __m128i& a)
+template<> EIGEN_STRONG_INLINE int ei_predux_min<Packet4i>(const Packet4i& a)
{
// after some experiments, it is seems this is the fastest way to implement it
// for GCC (eg., it does not like using std::min after the ei_pstore !!)
@@ -309,16 +348,16 @@ template<> EIGEN_STRONG_INLINE int ei_predux_min<__m128i>(const __m128i& a)
}
// max
-template<> EIGEN_STRONG_INLINE float ei_predux_max<__m128>(const __m128& a)
+template<> EIGEN_STRONG_INLINE float ei_predux_max<Packet4f>(const Packet4f& a)
{
- __m128 tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
+ Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
return ei_pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
-template<> EIGEN_STRONG_INLINE double ei_predux_max<__m128d>(const __m128d& a)
+template<> EIGEN_STRONG_INLINE double ei_predux_max<Packet2d>(const Packet2d& a)
{
return ei_pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
}
-template<> EIGEN_STRONG_INLINE int ei_predux_max<__m128i>(const __m128i& a)
+template<> EIGEN_STRONG_INLINE int ei_predux_max<Packet4i>(const Packet4i& a)
{
// after some experiments, it is seems this is the fastest way to implement it
// for GCC (eg., it does not like using std::min after the ei_pstore !!)
@@ -330,15 +369,15 @@ template<> EIGEN_STRONG_INLINE int ei_predux_max<__m128i>(const __m128i& a)
}
#if (defined __GNUC__)
-// template <> EIGEN_STRONG_INLINE __m128 ei_pmadd(const __m128& a, const __m128& b, const __m128& c)
+// template <> EIGEN_STRONG_INLINE Packet4f ei_pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
// {
-// __m128 res = b;
+// Packet4f res = b;
// asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
// return res;
// }
-// EIGEN_STRONG_INLINE __m128i _mm_alignr_epi8(const __m128i& a, const __m128i& b, const int i)
+// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i& a, const Packet4i& b, const int i)
// {
-// __m128i res = a;
+// Packet4i res = a;
// asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
// return res;
// }
@@ -347,9 +386,9 @@ template<> EIGEN_STRONG_INLINE int ei_predux_max<__m128i>(const __m128i& a)
#ifdef __SSSE3__
// SSSE3 versions
template<int Offset>
-struct ei_palign_impl<Offset,__m128>
+struct ei_palign_impl<Offset,Packet4f>
{
- EIGEN_STRONG_INLINE static void run(__m128& first, const __m128& second)
+ EIGEN_STRONG_INLINE static void run(Packet4f& first, const Packet4f& second)
{
if (Offset!=0)
first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
@@ -357,9 +396,9 @@ struct ei_palign_impl<Offset,__m128>
};
template<int Offset>
-struct ei_palign_impl<Offset,__m128i>
+struct ei_palign_impl<Offset,Packet4i>
{
- EIGEN_STRONG_INLINE static void run(__m128i& first, const __m128i& second)
+ EIGEN_STRONG_INLINE static void run(Packet4i& first, const Packet4i& second)
{
if (Offset!=0)
first = _mm_alignr_epi8(second,first, Offset*4);
@@ -367,9 +406,9 @@ struct ei_palign_impl<Offset,__m128i>
};
template<int Offset>
-struct ei_palign_impl<Offset,__m128d>
+struct ei_palign_impl<Offset,Packet2d>
{
- EIGEN_STRONG_INLINE static void run(__m128d& first, const __m128d& second)
+ EIGEN_STRONG_INLINE static void run(Packet2d& first, const Packet2d& second)
{
if (Offset==1)
first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
@@ -378,9 +417,9 @@ struct ei_palign_impl<Offset,__m128d>
#else
// SSE2 versions
template<int Offset>
-struct ei_palign_impl<Offset,__m128>
+struct ei_palign_impl<Offset,Packet4f>
{
- EIGEN_STRONG_INLINE static void run(__m128& first, const __m128& second)
+ EIGEN_STRONG_INLINE static void run(Packet4f& first, const Packet4f& second)
{
if (Offset==1)
{
@@ -401,9 +440,9 @@ struct ei_palign_impl<Offset,__m128>
};
template<int Offset>
-struct ei_palign_impl<Offset,__m128i>
+struct ei_palign_impl<Offset,Packet4i>
{
- EIGEN_STRONG_INLINE static void run(__m128i& first, const __m128i& second)
+ EIGEN_STRONG_INLINE static void run(Packet4i& first, const Packet4i& second)
{
if (Offset==1)
{
@@ -424,9 +463,9 @@ struct ei_palign_impl<Offset,__m128i>
};
template<int Offset>
-struct ei_palign_impl<Offset,__m128d>
+struct ei_palign_impl<Offset,Packet2d>
{
- EIGEN_STRONG_INLINE static void run(__m128d& first, const __m128d& second)
+ EIGEN_STRONG_INLINE static void run(Packet2d& first, const Packet2d& second)
{
if (Offset==1)
{
diff --git a/Eigen/src/Core/arch/SSE/TranscendentalFunctions.h b/Eigen/src/Core/arch/SSE/TranscendentalFunctions.h
new file mode 100644
index 000000000..3b6712524
--- /dev/null
+++ b/Eigen/src/Core/arch/SSE/TranscendentalFunctions.h
@@ -0,0 +1,362 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+/* The functions of this file come from Julien Pommier's sse math library.
+ * which is itself inspired by Intel Approximate Math library, and based on the
+ * corresponding algorithms of the cephes math library.
+ */
+
+/* Copyright (C) 2007 Julien Pommier
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ (this is the zlib license)
+*/
+
+#ifndef EIGEN_TRANSCENDENTAL_FUNCTIONS_SSE_H
+#define EIGEN_TRANSCENDENTAL_FUNCTIONS_SSE_H
+
+_EIGEN_DECLARE_CONST_Packet4f(1 , 1.0);
+_EIGEN_DECLARE_CONST_Packet4f(half, 0.5);
+/* the smallest non denormalized float number */
+_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
+// _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(mant_mask, 0x7f800000);
+_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
+
+_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
+// _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_sign_mask, ~0x80000000);
+
+_EIGEN_DECLARE_CONST_Packet4i(1, 1);
+_EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
+_EIGEN_DECLARE_CONST_Packet4i(2, 2);
+_EIGEN_DECLARE_CONST_Packet4i(4, 4);
+_EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+
+/* natural logarithm computed for 4 simultaneous float
+ return NaN for x <= 0
+*/
+_EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524);
+_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2);
+_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1);
+_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1);
+_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1);
+_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1);
+_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1);
+_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1);
+_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1);
+_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1);
+_EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4);
+_EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375);
+
+template<> EIGEN_DONT_INLINE Packet4f ei_plog(const Packet4f& _x)
+{
+ Packet4f x = _x;
+ Packet4i emm0;
+
+ Packet4f invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
+
+ x = ei_pmax(x, ei_p4f_min_norm_pos); /* cut off denormalized stuff */
+ emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
+
+ /* keep only the fractional part */
+ x = _mm_and_ps(x, ei_p4f_inv_mant_mask);
+ x = _mm_or_ps(x, ei_p4f_half);
+
+ emm0 = _mm_sub_epi32(emm0, ei_p4i_0x7f);
+ Packet4f e = ei_padd(_mm_cvtepi32_ps(emm0), ei_p4f_1);
+
+ /* part2:
+ if( x < SQRTHF ) {
+ e -= 1;
+ x = x + x - 1.0;
+ } else { x = x - 1.0; }
+ */
+ Packet4f mask = _mm_cmplt_ps(x, ei_p4f_cephes_SQRTHF);
+ Packet4f tmp = _mm_and_ps(x, mask);
+ x = ei_psub(x, ei_p4f_1);
+ e = ei_psub(e, _mm_and_ps(ei_p4f_1, mask));
+ x = ei_padd(x, tmp);
+
+ Packet4f z = ei_pmul(x,x);
+
+ Packet4f y = ei_p4f_cephes_log_p0;
+ y = ei_pmadd(y, x, ei_p4f_cephes_log_p1);
+ y = ei_pmadd(y, x, ei_p4f_cephes_log_p2);
+ y = ei_pmadd(y, x, ei_p4f_cephes_log_p3);
+ y = ei_pmadd(y, x, ei_p4f_cephes_log_p4);
+ y = ei_pmadd(y, x, ei_p4f_cephes_log_p5);
+ y = ei_pmadd(y, x, ei_p4f_cephes_log_p6);
+ y = ei_pmadd(y, x, ei_p4f_cephes_log_p7);
+ y = ei_pmadd(y, x, ei_p4f_cephes_log_p8);
+ y = ei_pmul(y, x);
+ y = ei_pmul(y, z);
+
+ y = ei_pmadd(e, ei_p4f_cephes_log_q1, y);
+ y = ei_psub(y, ei_pmul(z, ei_p4f_half));
+
+ tmp = ei_pmul(e, ei_p4f_cephes_log_q2);
+ x = ei_padd(x, y);
+ x = ei_padd(x, tmp);
+ return _mm_or_ps(x, invalid_mask); // negative arg will be NAN
+}
+
+_EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647949f);
+_EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+
+_EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341);
+_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375);
+_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4);
+
+_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4);
+_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3);
+_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3);
+_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2);
+_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1);
+_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1);
+
+template<> EIGEN_DONT_INLINE Packet4f ei_pexp(const Packet4f& _x)
+{
+ Packet4f x = _x;
+ Packet4f tmp = _mm_setzero_ps(), fx;
+ Packet4i emm0;
+
+ // clamp x
+ x = ei_pmax(ei_pmin(x, ei_p4f_exp_hi), ei_p4f_exp_lo);
+
+ /* express exp(x) as exp(g + n*log(2)) */
+ fx = ei_pmadd(x, ei_p4f_cephes_LOG2EF, ei_p4f_half);
+
+ /* how to perform a floorf with SSE: just below */
+ emm0 = _mm_cvttps_epi32(fx);
+ tmp = _mm_cvtepi32_ps(emm0);
+ /* if greater, substract 1 */
+ Packet4f mask = _mm_cmpgt_ps(tmp, fx);
+ mask = _mm_and_ps(mask, ei_p4f_1);
+ fx = ei_psub(tmp, mask);
+
+ tmp = ei_pmul(fx, ei_p4f_cephes_exp_C1);
+ Packet4f z = ei_pmul(fx, ei_p4f_cephes_exp_C2);
+ x = ei_psub(x, tmp);
+ x = ei_psub(x, z);
+
+ z = ei_pmul(x,x);
+
+ Packet4f y = ei_p4f_cephes_exp_p0;
+ y = ei_pmadd(y, x, ei_p4f_cephes_exp_p1);
+ y = ei_pmadd(y, x, ei_p4f_cephes_exp_p2);
+ y = ei_pmadd(y, x, ei_p4f_cephes_exp_p3);
+ y = ei_pmadd(y, x, ei_p4f_cephes_exp_p4);
+ y = ei_pmadd(y, x, ei_p4f_cephes_exp_p5);
+ y = ei_pmadd(y, z, x);
+ y = ei_padd(y, ei_p4f_1);
+
+ /* build 2^n */
+ emm0 = _mm_cvttps_epi32(fx);
+ emm0 = _mm_add_epi32(emm0, ei_p4i_0x7f);
+ emm0 = _mm_slli_epi32(emm0, 23);
+ return ei_pmul(y, _mm_castsi128_ps(emm0));
+}
+
+/* evaluation of 4 sines at onces, using SSE2 intrinsics.
+
+ The code is the exact rewriting of the cephes sinf function.
+ Precision is excellent as long as x < 8192 (I did not bother to
+ take into account the special handling they have for greater values
+ -- it does not return garbage for arguments over 8192, though, but
+ the extra precision is missing).
+
+ Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+ surprising but correct result.
+*/
+
+_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625);
+_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4);
+_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8);
+_EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4);
+_EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3);
+_EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1);
+_EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005);
+_EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003);
+_EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002);
+_EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516); // 4 / M_PI
+
+template<> EIGEN_DONT_INLINE Packet4f ei_psin(const Packet4f& _x)
+{
+ Packet4f x = _x;
+ Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
+
+ Packet4i emm0, emm2;
+ sign_bit = x;
+ /* take the absolute value */
+ x = ei_pabs(x);
+ /* extract the sign bit (upper one) */
+ sign_bit = _mm_and_ps(sign_bit, ei_p4f_sign_mask);
+
+ /* scale by 4/Pi */
+ y = ei_pmul(x, ei_p4f_cephes_FOPI);
+
+ /* store the integer part of y in mm0 */
+ emm2 = _mm_cvttps_epi32(y);
+ /* j=(j+1) & (~1) (see the cephes sources) */
+ emm2 = _mm_add_epi32(emm2, ei_p4i_1);
+ emm2 = _mm_and_si128(emm2, ei_p4i_not1);
+ y = _mm_cvtepi32_ps(emm2);
+ /* get the swap sign flag */
+ emm0 = _mm_and_si128(emm2, ei_p4i_4);
+ emm0 = _mm_slli_epi32(emm0, 29);
+ /* get the polynom selection mask
+ there is one polynom for 0 <= x <= Pi/4
+ and another one for Pi/4<x<=Pi/2
+
+ Both branches will be computed.
+ */
+ emm2 = _mm_and_si128(emm2, ei_p4i_2);
+ emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+
+ Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
+ Packet4f poly_mask = _mm_castsi128_ps(emm2);
+ sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+
+ /* The magic pass: "Extended precision modular arithmetic"
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
+ xmm1 = ei_pmul(y, ei_p4f_minus_cephes_DP1);
+ xmm2 = ei_pmul(y, ei_p4f_minus_cephes_DP2);
+ xmm3 = ei_pmul(y, ei_p4f_minus_cephes_DP3);
+ x = ei_padd(x, xmm1);
+ x = ei_padd(x, xmm2);
+ x = ei_padd(x, xmm3);
+
+ /* Evaluate the first polynom (0 <= x <= Pi/4) */
+ y = ei_p4f_coscof_p0;
+ Packet4f z = _mm_mul_ps(x,x);
+
+ y = ei_pmadd(y, z, ei_p4f_coscof_p1);
+ y = ei_pmadd(y, z, ei_p4f_coscof_p2);
+ y = ei_pmul(y, z);
+ y = ei_pmul(y, z);
+ Packet4f tmp = ei_pmul(z, ei_p4f_half);
+ y = ei_psub(y, tmp);
+ y = ei_padd(y, ei_p4f_1);
+
+ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
+
+ Packet4f y2 = ei_p4f_sincof_p0;
+ y2 = ei_pmadd(y2, z, ei_p4f_sincof_p1);
+ y2 = ei_pmadd(y2, z, ei_p4f_sincof_p2);
+ y2 = ei_pmul(y2, z);
+ y2 = ei_pmul(y2, x);
+ y2 = ei_padd(y2, x);
+
+ /* select the correct result from the two polynoms */
+ y2 = _mm_and_ps(poly_mask, y2);
+ y = _mm_andnot_ps(poly_mask, y);
+ y = _mm_or_ps(y,y2);
+ /* update the sign */
+ return _mm_xor_ps(y, sign_bit);
+}
+
+/* almost the same as ei_psin */
+template<> EIGEN_DONT_INLINE Packet4f ei_pcos(const Packet4f& _x)
+{
+ Packet4f x = _x;
+ Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+ Packet4i emm0, emm2;
+
+ x = ei_pabs(x);
+
+ /* scale by 4/Pi */
+ y = ei_pmul(x, ei_p4f_cephes_FOPI);
+
+ /* get the integer part of y */
+ emm2 = _mm_cvttps_epi32(y);
+ /* j=(j+1) & (~1) (see the cephes sources) */
+ emm2 = _mm_add_epi32(emm2, ei_p4i_1);
+ emm2 = _mm_and_si128(emm2, ei_p4i_not1);
+ y = _mm_cvtepi32_ps(emm2);
+
+ emm2 = _mm_sub_epi32(emm2, ei_p4i_2);
+
+ /* get the swap sign flag */
+ emm0 = _mm_andnot_si128(emm2, ei_p4i_4);
+ emm0 = _mm_slli_epi32(emm0, 29);
+ /* get the polynom selection mask */
+ emm2 = _mm_and_si128(emm2, ei_p4i_2);
+ emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+
+ Packet4f sign_bit = _mm_castsi128_ps(emm0);
+ Packet4f poly_mask = _mm_castsi128_ps(emm2);
+
+ /* The magic pass: "Extended precision modular arithmetic"
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
+ xmm1 = ei_pmul(y, ei_p4f_minus_cephes_DP1);
+ xmm2 = ei_pmul(y, ei_p4f_minus_cephes_DP2);
+ xmm3 = ei_pmul(y, ei_p4f_minus_cephes_DP3);
+ x = ei_padd(x, xmm1);
+ x = ei_padd(x, xmm2);
+ x = ei_padd(x, xmm3);
+
+ /* Evaluate the first polynom (0 <= x <= Pi/4) */
+ y = ei_p4f_coscof_p0;
+ Packet4f z = ei_pmul(x,x);
+
+ y = ei_pmadd(y,z,ei_p4f_coscof_p1);
+ y = ei_pmadd(y,z,ei_p4f_coscof_p2);
+ y = ei_pmul(y, z);
+ y = ei_pmul(y, z);
+ Packet4f tmp = _mm_mul_ps(z, ei_p4f_half);
+ y = ei_psub(y, tmp);
+ y = ei_padd(y, ei_p4f_1);
+
+ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
+ Packet4f y2 = ei_p4f_sincof_p0;
+ y2 = ei_pmadd(y2, z, ei_p4f_sincof_p1);
+ y2 = ei_pmadd(y2, z, ei_p4f_sincof_p2);
+ y2 = ei_pmul(y2, z);
+ y2 = ei_pmadd(y2, x, x);
+
+ /* select the correct result from the two polynoms */
+ y2 = _mm_and_ps(poly_mask, y2);
+ y = _mm_andnot_ps(poly_mask, y);
+ y = _mm_or_ps(y,y2);
+
+ /* update the sign */
+ return _mm_xor_ps(y, sign_bit);
+}
+
+#endif // EIGEN_TRANSCENDENTAL_FUNCTIONS_SSE_H