From 85428a344025316db654c65e2628d4ceea1eec47 Mon Sep 17 00:00:00 2001 From: Guoqiang QI <425418567@qq.com> Date: Tue, 8 Sep 2020 09:04:03 +0000 Subject: Add Neon psqrt and pexp --- .../Core/arch/Default/GenericPacketMathFunctions.h | 10 ++++++ Eigen/src/Core/arch/NEON/MathFunctions.h | 9 ++++++ Eigen/src/Core/arch/NEON/PacketMath.h | 36 ++++++++++++++++++++-- 3 files changed, 53 insertions(+), 2 deletions(-) (limited to 'Eigen/src/Core/arch') diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 4d9b3b44c..e4a0c0919 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -39,6 +39,16 @@ pldexp_float(Packet a, Packet exponent) return pmul(a, preinterpret(plogical_shift_left<23>(ei))); } +template EIGEN_STRONG_INLINE Packet +pldexp_double(Packet a, Packet exponent) +{ + typedef typename unpacket_traits::integer_packet PacketI; + const Packet cst_1023 = pset1(1023.0); + // return a * 2^exponent + PacketI ei = pcast(padd(exponent, cst_1023)); + return pmul(a, preinterpret(plogical_shift_left<52>(ei))); +} + // Natural logarithm // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h index ab549a683..8bea0ac3c 100644 --- a/Eigen/src/Core/arch/NEON/MathFunctions.h +++ b/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -44,6 +44,15 @@ BF16_PACKET_FUNCTION(Packet4f, Packet4bf, plog) BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pexp) BF16_PACKET_FUNCTION(Packet4f, Packet4bf, ptanh) + +//---------- double ---------- + +#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp(const Packet2d& x) +{ return pexp_double(x); } + +#endif + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index fbd4faa97..2d0f91e2f 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -3580,8 +3580,8 @@ template<> struct packet_traits : default_packet_traits HasSin = 0, HasCos = 0, HasLog = 0, - HasExp = 0, - HasSqrt = 0, + HasExp = 1, + HasSqrt = 1, HasTanh = 0, HasErf = 0 }; @@ -3750,6 +3750,38 @@ ptranspose(PacketBlock& kernel) template<> EIGEN_DEVICE_FUNC inline Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b) { return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) +{ return pldexp_double(a, exponent); } + +#if EIGEN_FAST_MATH + +// Functions for sqrt support packet2d. +// The EIGEN_FAST_MATH version uses the vrsqrte_f64 approximation and one step +// of Newton's method, at a cost of 1-2 bits of precision as opposed to the +// exact solution. It does not handle +inf, or denormalized numbers correctly. +// The main advantage of this approach is not just speed, but also the fact that +// it can be inlined and pipelined with other computations, further reducing its +// effective latency. This is similar to Quake3's fast inverse square root. +// For more details see: http://www.beyond3d.com/content/articles/8/ +template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ + Packet2d half = vmulq_n_f64(_x, 0.5); + Packet2ul denormal_mask = vandq_u64(vcgeq_f64(_x, vdupq_n_f64(0.0)), + vcltq_f64(_x, pset1((std::numeric_limits::min)()))); + // Compute approximate reciprocal sqrt. + Packet2d x = vrsqrteq_f64(_x); + // Do a single step of Newton's iteration. + //the number 1.5f was set reference to Quake3's fast inverse square root + x = vmulq_f64(x, psub(pset1(1.5), pmul(half, pmul(x, x)))); + // Do one more Newton's iteration to get more accurate result. + x = vmulq_f64(x, psub(pset1(1.5), pmul(half, pmul(x, x)))); + // Flush results for denormals to zero. + return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(pmul(_x, x)), denormal_mask)); +} + +#else +template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrt_f64(_x); } +#endif + #endif // EIGEN_ARCH_ARM64 } // end namespace internal -- cgit v1.2.3