Unify SSE/AVX psin functions.

It is based on the SSE version which is much more accurate, though very slightly slower.
This changeset also includes the following required changes:
- add packet-float to packet-int type traits
- add packet float<->int reinterpret casts
- add faster pselect for AVX based on blendv
author: Gael Guennebaud <g.gael@free.fr> 2018-11-27 22:41:51 +0100
committer: Gael Guennebaud <g.gael@free.fr> 2018-11-27 22:41:51 +0100
commit: fa7fd61edad765608beb629a2c6f656535188db6 (patch)
tree: 90e7a48a96e0f8663dc9c0bc1cc518082b664dcb /Eigen/src/Core/arch/SSE/MathFunctions.h
parent: 08edbc8cfebbd4064ca625072b128408b9bbe812 (diff)
1 files changed, 1 insertions, 95 deletions
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index e2046be47..9e699244e 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -54,101 +54,7 @@ Packet2d pexp<Packet2d>(const Packet2d& x)
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f psin<Packet4f>(const Packet4f& _x)
 {
-  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-
-  _EIGEN_DECLARE_CONST_Packet4i(1, 1);
-  _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
-  _EIGEN_DECLARE_CONST_Packet4i(2, 2);
-  _EIGEN_DECLARE_CONST_Packet4i(4, 4);
-
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000u);
-
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p1,  8.3321608736E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p0,  2.443315711809948E-005f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
-
-  Packet4f xmm1, xmm2, xmm3, sign_bit, y;
-
-  Packet4i emm0, emm2;
-  sign_bit = x;
-  /* take the absolute value */
-  x = pabs(x);
-
-  /* take the modulo */
-
-  /* extract the sign bit (upper one) */
-  sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask);
-
-  /* scale by 4/Pi */
-  y = pmul(x, p4f_cephes_FOPI);
-
-  /* store the integer part of y in mm0 */
-  emm2 = _mm_cvttps_epi32(y);
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  emm2 = _mm_add_epi32(emm2, p4i_1);
-  emm2 = _mm_and_si128(emm2, p4i_not1);
-  y = _mm_cvtepi32_ps(emm2);
-  /* get the swap sign flag */
-  emm0 = _mm_and_si128(emm2, p4i_4);
-  emm0 = _mm_slli_epi32(emm0, 29);
-  /* get the polynom selection mask
-     there is one polynom for 0 <= x <= Pi/4
-     and another one for Pi/4<x<=Pi/2
-
-     Both branches will be computed.
-  */
-  emm2 = _mm_and_si128(emm2, p4i_2);
-  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-
-  Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
-  Packet4f poly_mask = _mm_castsi128_ps(emm2);
-  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = pmul(y, p4f_minus_cephes_DP1);
-  xmm2 = pmul(y, p4f_minus_cephes_DP2);
-  xmm3 = pmul(y, p4f_minus_cephes_DP3);
-  x = padd(x, xmm1);
-  x = padd(x, xmm2);
-  x = padd(x, xmm3);
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = p4f_coscof_p0;
-  Packet4f z = _mm_mul_ps(x,x);
-
-  y = pmadd(y, z, p4f_coscof_p1);
-  y = pmadd(y, z, p4f_coscof_p2);
-  y = pmul(y, z);
-  y = pmul(y, z);
-  Packet4f tmp = pmul(z, p4f_half);
-  y = psub(y, tmp);
-  y = padd(y, p4f_1);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-
-  Packet4f y2 = p4f_sincof_p0;
-  y2 = pmadd(y2, z, p4f_sincof_p1);
-  y2 = pmadd(y2, z, p4f_sincof_p2);
-  y2 = pmul(y2, z);
-  y2 = pmul(y2, x);
-  y2 = padd(y2, x);
-
-  /* select the correct result from the two polynoms */
-  y2 = _mm_and_ps(poly_mask, y2);
-  y = _mm_andnot_ps(poly_mask, y);
-  y = _mm_or_ps(y,y2);
-  /* update the sign */
-  return _mm_xor_ps(y, sign_bit);
+  return psin_float(_x);
 }
 
 /* almost the same as psin */
author	Gael Guennebaud <g.gael@free.fr>	2018-11-27 22:41:51 +0100
committer	Gael Guennebaud <g.gael@free.fr>	2018-11-27 22:41:51 +0100
commit	fa7fd61edad765608beb629a2c6f656535188db6 (patch)
tree	90e7a48a96e0f8663dc9c0bc1cc518082b664dcb /Eigen/src/Core/arch/SSE/MathFunctions.h
parent	08edbc8cfebbd4064ca625072b128408b9bbe812 (diff)