Unify SSE/AVX psin functions.

It is based on the SSE version which is much more accurate, though very slightly slower. This changeset also includes the following required changes: - add packet-float to packet-int type traits - add packet float<->int reinterpret casts - add faster pselect for AVX based on blendv
author: Gael Guennebaud <g.gael@free.fr> 2018-11-27 22:41:51 +0100
committer: Gael Guennebaud <g.gael@free.fr> 2018-11-27 22:41:51 +0100
commit: fa7fd61edad765608beb629a2c6f656535188db6 (patch)
tree: 90e7a48a96e0f8663dc9c0bc1cc518082b664dcb /Eigen/src/Core/arch/SSE
parent: 08edbc8cfebbd4064ca625072b128408b9bbe812 (diff)
3 files changed, 29 insertions, 98 deletions
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index e2046be47..9e699244e 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -54,101 +54,7 @@ Packet2d pexp<Packet2d>(const Packet2d& x)
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f psin<Packet4f>(const Packet4f& _x)
 {
-  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-
-  _EIGEN_DECLARE_CONST_Packet4i(1, 1);
-  _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
-  _EIGEN_DECLARE_CONST_Packet4i(2, 2);
-  _EIGEN_DECLARE_CONST_Packet4i(4, 4);
-
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000u);
-
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p1,  8.3321608736E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p0,  2.443315711809948E-005f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
-
-  Packet4f xmm1, xmm2, xmm3, sign_bit, y;
-
-  Packet4i emm0, emm2;
-  sign_bit = x;
-  /* take the absolute value */
-  x = pabs(x);
-
-  /* take the modulo */
-
-  /* extract the sign bit (upper one) */
-  sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask);
-
-  /* scale by 4/Pi */
-  y = pmul(x, p4f_cephes_FOPI);
-
-  /* store the integer part of y in mm0 */
-  emm2 = _mm_cvttps_epi32(y);
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  emm2 = _mm_add_epi32(emm2, p4i_1);
-  emm2 = _mm_and_si128(emm2, p4i_not1);
-  y = _mm_cvtepi32_ps(emm2);
-  /* get the swap sign flag */
-  emm0 = _mm_and_si128(emm2, p4i_4);
-  emm0 = _mm_slli_epi32(emm0, 29);
-  /* get the polynom selection mask
-     there is one polynom for 0 <= x <= Pi/4
-     and another one for Pi/4<x<=Pi/2
-
-     Both branches will be computed.
-  */
-  emm2 = _mm_and_si128(emm2, p4i_2);
-  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-
-  Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
-  Packet4f poly_mask = _mm_castsi128_ps(emm2);
-  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = pmul(y, p4f_minus_cephes_DP1);
-  xmm2 = pmul(y, p4f_minus_cephes_DP2);
-  xmm3 = pmul(y, p4f_minus_cephes_DP3);
-  x = padd(x, xmm1);
-  x = padd(x, xmm2);
-  x = padd(x, xmm3);
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = p4f_coscof_p0;
-  Packet4f z = _mm_mul_ps(x,x);
-
-  y = pmadd(y, z, p4f_coscof_p1);
-  y = pmadd(y, z, p4f_coscof_p2);
-  y = pmul(y, z);
-  y = pmul(y, z);
-  Packet4f tmp = pmul(z, p4f_half);
-  y = psub(y, tmp);
-  y = padd(y, p4f_1);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-
-  Packet4f y2 = p4f_sincof_p0;
-  y2 = pmadd(y2, z, p4f_sincof_p1);
-  y2 = pmadd(y2, z, p4f_sincof_p2);
-  y2 = pmul(y2, z);
-  y2 = pmul(y2, x);
-  y2 = padd(y2, x);
-
-  /* select the correct result from the two polynoms */
-  y2 = _mm_and_ps(poly_mask, y2);
-  y = _mm_andnot_ps(poly_mask, y);
-  y = _mm_or_ps(y,y2);
-  /* update the sign */
-  return _mm_xor_ps(y, sign_bit);
+  return psin_float(_x);
 }
 
 /* almost the same as psin */
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index cb0f9f2c5..a508ce73e 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -158,9 +158,22 @@ template<> struct packet_traits<int>    : default_packet_traits
   };
 };
 
-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f> {
+  typedef float     type;
+  typedef Packet4f  half;
+  typedef Packet4i  integer_packet;
+  enum {size=4, alignment=Aligned16};
+};
+template<> struct unpacket_traits<Packet2d> {
+  typedef double    type;
+  typedef Packet2d  half;
+  enum {size=2, alignment=Aligned16};
+};
+template<> struct unpacket_traits<Packet4i> {
+  typedef int       type;
+  typedef Packet4i  half;
+  enum {size=4, alignment=Aligned16};
+};
 
 #ifndef EIGEN_VECTORIZE_AVX
 template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
@@ -184,6 +197,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int fro
 
 template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); }
 template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); }
+template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); }
 
 // GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
 // However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203)
@@ -338,6 +352,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4
 template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
 
+template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
+
 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
@@ -354,6 +370,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, con
 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
 
+template<> EIGEN_STRONG_INLINE Packet4i pshiftleft<Packet4i>(const Packet4i& a, int n) { return _mm_slli_epi32(a,n); }
+
 #ifdef EIGEN_VECTORIZE_SSE4_1
 template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
 template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); }
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
index c6ca8c716..f607366f0 100644
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -69,6 +69,13 @@ template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f
   return _mm_cvtps_pd(a);
 }
 
+template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
+  return _mm_castps_si128(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
+  return _mm_castsi128_ps(a);
+}
 
 } // end namespace internal
author	Gael Guennebaud <g.gael@free.fr>	2018-11-27 22:41:51 +0100
committer	Gael Guennebaud <g.gael@free.fr>	2018-11-27 22:41:51 +0100
commit	fa7fd61edad765608beb629a2c6f656535188db6 (patch)
tree	90e7a48a96e0f8663dc9c0bc1cc518082b664dcb /Eigen/src/Core/arch/SSE
parent	08edbc8cfebbd4064ca625072b128408b9bbe812 (diff)