Add an additional step of Newton-Raphson for `psqrt<double>` on Arm, which otherwise has an error of ~1000 ulps.

author: Rasmus Munk Larsen <rmlarsen@google.com> 2020-12-15 04:06:41 +0000
committer: Rasmus Munk Larsen <rmlarsen@google.com> 2020-12-15 04:06:41 +0000
commit: 6cee8d347e8a7e8e1a689a3b7de5fe413f3e1103 (patch)
tree: 751096df7a820ba4c42e8d65cfc7e005dffbdde4 /Eigen/src/Core/arch/NEON/PacketMath.h
parent: cf0b5b0344a3bfcf410e95bf22289015a2daf34b (diff)
1 files changed, 2 insertions, 1 deletions
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 90ffee767..5883eca38 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -3896,7 +3896,8 @@ template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){
   // Do a single step of Newton's iteration.
   //the number 1.5f was set reference to Quake3's fast inverse square root
   x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
-  // Do one more Newton's iteration to get more accurate result.
+  // Do two more Newton iterations to get a result accurate to 1 ulp.
+  x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
   x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
   // Flush results for denormals to zero.
   return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(pmul(_x, x)), denormal_mask));
author	Rasmus Munk Larsen <rmlarsen@google.com>	2020-12-15 04:06:41 +0000
committer	Rasmus Munk Larsen <rmlarsen@google.com>	2020-12-15 04:06:41 +0000
commit	6cee8d347e8a7e8e1a689a3b7de5fe413f3e1103 (patch)
tree	751096df7a820ba4c42e8d65cfc7e005dffbdde4 /Eigen/src/Core/arch/NEON/PacketMath.h
parent	cf0b5b0344a3bfcf410e95bf22289015a2daf34b (diff)