From 3012e755e92d3b3f01f8e7753b5e71cbeaaa40df Mon Sep 17 00:00:00 2001
From: Guoqiang QI <425418567@qq.com>
Date: Tue, 15 Sep 2020 17:10:35 +0000
Subject: Add plog ops support packet2d for NEON

---
 .../Core/arch/Default/GenericPacketMathFunctions.h | 118 +++++++++++++++++++++
 1 file changed, 118 insertions(+)

(limited to 'Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h')
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index e4a0c0919..a0bfada93 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -29,6 +29,16 @@ pfrexp_float(const Packet& a, Packet& exponent) {
   return por(pand(a, cst_inv_mant_mask), cst_half);
 }
 
+template<typename Packet> EIGEN_STRONG_INLINE Packet
+pfrexp_double(const Packet& a, Packet& exponent) {  // frexp-style split: returns the fraction, stores the exponent in `exponent`
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+  const Packet cst_1022d = pset1<Packet>(1022.0);  // offset removed from the raw (biased) exponent field
+  const Packet cst_half = pset1<Packet>(0.5);
+  const Packet cst_inv_mant_mask  = pset1frombits<Packet>(~0x7ff0000000000000u);  // sign bit + 52 fraction bits
+  exponent = psub(pcast<PacketI,Packet>(plogical_shift_right<52>(preinterpret<PacketI>(a))), cst_1022d);  // NOTE(review): bit 63 (sign) survives the shift, so presumably only valid for a >= 0 -- confirm
+  return por(pand(a, cst_inv_mant_mask), cst_half);  // swap a's exponent field for the one of 0.5
+}
+
 template<typename Packet> EIGEN_STRONG_INLINE Packet
 pldexp_float(Packet a, Packet exponent)
 {
@@ -139,6 +149,114 @@ Packet plog_float(const Packet _x)
                               por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
 }
 
+
+/* Returns the base e (2.718...) logarithm of x.
+ * The argument is separated into its exponent and fractional
+ * parts.  If the exponent is between -1 and +1, the logarithm
+ * of the fraction is approximated by
+ *
+ *     log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x).
+ *
+ * Otherwise, setting  z = 2(x-1)/(x+1),
+ *                     log(x) = z + z**3 P(z)/Q(z).
+ * (The code below only evaluates the first, log(1+x), form.)
+ * For more detail see: http://www.netlib.org/cephes/
+ */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+EIGEN_UNUSED
+Packet plog_double(const Packet _x)
+{
+  Packet x = _x;
+
+  const Packet cst_1              = pset1<Packet>(1.0);
+  const Packet cst_half           = pset1<Packet>(0.5);
+  // Bit patterns of the smallest positive normalized double, -inf and +inf.
+  const Packet cst_min_norm_pos   = pset1frombits<Packet>( 0x0010000000000000u);
+  const Packet cst_minus_inf      = pset1frombits<Packet>( 0xfff0000000000000u);
+  const Packet cst_pos_inf        = pset1frombits<Packet>( 0x7ff0000000000000u);
+
+  // Polynomial coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
+  //                             1/sqrt(2) <= x < sqrt(2)
+  const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
+  const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
+  const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
+  const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
+  const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
+  const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
+  const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
+
+  const Packet cst_cephes_log_r0 = pset1<Packet>(1.0);  // leading coefficient of Q (Q is monic)
+  const Packet cst_cephes_log_r1 = pset1<Packet>(1.12873587189167450590E1);
+  const Packet cst_cephes_log_r2 = pset1<Packet>(4.52279145837532221105E1);
+  const Packet cst_cephes_log_r3 = pset1<Packet>(8.29875266912776603211E1);
+  const Packet cst_cephes_log_r4 = pset1<Packet>(7.11544750618563894466E1);
+  const Packet cst_cephes_log_r5 = pset1<Packet>(2.31251620126765340583E1);
+
+  const Packet cst_cephes_log_q1 = pset1<Packet>(-2.121944400546905827679e-4);  // q1 + q2 == ln(2): small correction part
+  const Packet cst_cephes_log_q2 = pset1<Packet>(0.693359375);  // q1 + q2 == ln(2): coarse part
+
+  // Clamp inputs from below to the smallest positive normal; the final select restores the special cases from _x.
+  x = pmax(x, cst_min_norm_pos);
+
+  Packet e;
+  // Extract the significand in the range [0.5,1) and the exponent.
+  x = pfrexp(x,e);
+  
+  // Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
+  // and shift by -1. The values are then centered around 0, which improves
+  // the stability of the polynomial evaluation.
+  //   if( x < SQRTHF ) {
+  //     e -= 1;
+  //     x = x + x - 1.0;
+  //   } else { x = x - 1.0; }
+  Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
+  Packet tmp = pand(x, mask);  // x in the lanes selected by mask (per the pseudo-code above)
+  x = psub(x, cst_1);
+  e = psub(e, pand(cst_1, mask));  // e -= 1 in the selected lanes
+  x = padd(x, tmp);  // x + x - 1 in the selected lanes, x - 1 elsewhere
+
+  Packet x2 = pmul(x, x);
+  Packet x3 = pmul(x2, x);
+
+  // Evaluate the rational approximant; each polynomial is split into two independent chains, probably to improve instruction-level parallelism.
+  // In Cephes notation: y = x * ( z * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) ), with z = x*x.
+  Packet y, y1, y2,y_;
+  y  = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
+  y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
+  y  = pmadd(y, x, cst_cephes_log_p2);
+  y1 = pmadd(y1, x, cst_cephes_log_p5);
+  y_ = pmadd(y, x3, y1);  // numerator P(x)
+
+  y  = pmadd(cst_cephes_log_r0, x, cst_cephes_log_r1);
+  y1 = pmadd(cst_cephes_log_r3, x, cst_cephes_log_r4);
+  y  = pmadd(y, x, cst_cephes_log_r2);
+  y1 = pmadd(y1, x, cst_cephes_log_r5);
+  y  = pmadd(y, x3, y1);  // denominator Q(x)
+
+  y_ = pmul(y_, x3);
+  y  = pdiv(y_, y);  // x**3 * P(x) / Q(x)
+
+  // Assemble x - x**2/2 + x**3 P(x)/Q(x) + e*(q1 + q2), i.e. add e*ln(2) back to the result of the approximation.
+  y1  = pmul(e, cst_cephes_log_q1);
+  tmp = pmul(x2, cst_half);
+  y   = padd(y, y1);
+  x   = psub(x, tmp);
+  y2  = pmul(e, cst_cephes_log_q2);
+  x   = padd(x, y);
+  x   = padd(x, y2);
+
+  Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
+  Packet iszero_mask  = pcmp_eq(_x,pzero(_x));
+  Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
+  // Filter out invalid inputs, i.e.:
+  //  - negative (or NaN) arg will be NaN
+  //  - 0 will be -INF
+  //  - +INF will be +INF
+  return pselect(iszero_mask, cst_minus_inf,
+                              por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
+}
+
 /** \internal \returns log(1 + x) computed using W. Kahan's formula.
     See: http://www.plunk.org/~hatch/rightway.php
  */
-- 
cgit v1.2.3