From 85428a344025316db654c65e2628d4ceea1eec47 Mon Sep 17 00:00:00 2001
From: Guoqiang QI <425418567@qq.com>
Date: Tue, 8 Sep 2020 09:04:03 +0000
Subject: Add Neon psqrt<Packet2d> and pexp<Packet2d>

---
 .../Core/arch/Default/GenericPacketMathFunctions.h | 10 ++++++
 Eigen/src/Core/arch/NEON/MathFunctions.h           |  9 ++++++
 Eigen/src/Core/arch/NEON/PacketMath.h              | 36 ++++++++++++++++++++--
 3 files changed, 53 insertions(+), 2 deletions(-)

(limited to 'Eigen/src/Core/arch')
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 4d9b3b44c..e4a0c0919 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -39,6 +39,16 @@ pldexp_float(Packet a, Packet exponent)
   return pmul(a, preinterpret<Packet>(plogical_shift_left<23>(ei)));
 }
 
+template<typename Packet> EIGEN_STRONG_INLINE Packet
+pldexp_double(Packet a, Packet exponent)
+{
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+  const Packet cst_1023 = pset1<Packet>(1023.0);
+  // return a * 2^exponent
+  PacketI ei = pcast<Packet,PacketI>(padd(exponent, cst_1023));
+  return pmul(a, preinterpret<Packet>(plogical_shift_left<52>(ei)));
+}
+
 // Natural logarithm
 // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
 // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h
index ab549a683..8bea0ac3c 100644
--- a/Eigen/src/Core/arch/NEON/MathFunctions.h
+++ b/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -44,6 +44,15 @@ BF16_PACKET_FUNCTION(Packet4f, Packet4bf, plog)
 BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pexp)
 BF16_PACKET_FUNCTION(Packet4f, Packet4bf, ptanh)
 
+
+//---------- double ----------
+
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp<Packet2d>(const Packet2d& x)
+{ return pexp_double(x); }
+
+#endif
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index fbd4faa97..2d0f91e2f 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -3580,8 +3580,8 @@ template<> struct packet_traits<double>  : default_packet_traits
     HasSin  = 0,
     HasCos  = 0,
     HasLog  = 0,
-    HasExp  = 0,
-    HasSqrt = 0,
+    HasExp  = 1,
+    HasSqrt = 1,
     HasTanh = 0,
     HasErf  = 0
   };
@@ -3750,6 +3750,38 @@ ptranspose(PacketBlock<Packet2d, 2>& kernel)
 template<> EIGEN_DEVICE_FUNC inline Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b)
 { return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); }
 
+template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent)
+{ return pldexp_double(a, exponent); }
+
+#if EIGEN_FAST_MATH
+
+// Functions for sqrt support packet2d.
+// The EIGEN_FAST_MATH version uses the vrsqrte_f64 approximation and one step
+// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
+// exact solution. It does not handle +inf, or denormalized numbers correctly.
+// The main advantage of this approach is not just speed, but also the fact that
+// it can be inlined and pipelined with other computations, further reducing its
+// effective latency. This is similar to Quake3's fast inverse square root.
+// For more details see: http://www.beyond3d.com/content/articles/8/
+template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){
+  Packet2d half = vmulq_n_f64(_x, 0.5);
+  Packet2ul denormal_mask = vandq_u64(vcgeq_f64(_x, vdupq_n_f64(0.0)),
+                                      vcltq_f64(_x, pset1<Packet2d>((std::numeric_limits<double>::min)())));
+  // Compute approximate reciprocal sqrt.
+  Packet2d x = vrsqrteq_f64(_x);
+  // Do a single step of Newton's iteration. 
+  //the number 1.5f was set reference to Quake3's fast inverse square root
+  x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
+  // Do one more Newton's iteration to get more accurate result.
+  x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
+  // Flush results for denormals to zero.
+  return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(pmul(_x, x)), denormal_mask));
+}
+
+#else 
+template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrt_f64(_x); }
+#endif
+
 #endif // EIGEN_ARCH_ARM64
 
 } // end namespace internal
-- 
cgit v1.2.3