Fix rint for SSE/NEON.

It seems that *sometimes*, with aggressive optimizations, the `psub(padd(a, b), b)` trick used to force rounding is compiled away. Here we replace it with inline assembly to prevent this (I tried `volatile`, but that leads to additional loads from memory). Also fixed an edge case for large inputs `a`, where adding `b` bumps the value up a power of two and ends up rounding away more than just the fractional part. If the magnitude of the input is `2^digits` or more, we now just return the input. This edge case was missed by the test because the test was comparing approximate equality, which was still satisfied. Adding a strict equality option catches it.
author: Antonio Sanchez <cantonios@google.com> 2021-03-03 09:41:46 -0800
committer: Antonio Sanchez <cantonios@google.com> 2021-03-03 09:41:46 -0800
commit: e72dfeb8b9fa5662831b5d0bb9d132521f9173dd (patch)
tree: bdc7a7bd7f8fa13a65bd71897282bcc4e3bc87d5 /Eigen/src/Core/arch/SSE
parent: 199c5f2b47eb1f8e5a2d20e60f07e97cd95a6ba6 (diff)
1 files changed, 25 insertions, 10 deletions
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index b9821ad80..652ad1d34 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -646,20 +646,35 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { re
 #else
 template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
   // Adds and subtracts signum(a) * 2^23 to force rounding.
-  const Packet4f offset = 
-    pselect(pcmp_lt(a, pzero(a)), 
-      pset1<Packet4f>(-static_cast<float>(1<<23)),
-      pset1<Packet4f>(+static_cast<float>(1<<23)));
-  return psub(padd(a, offset), offset);
+  const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
+  const Packet4f abs_a = pabs(a);
+  // Inline asm to prevent the compiler from optimizing away the addition and
+  // subtraction of limit that force the rounding.  Intended to be equivalent to:
+  // Packet4f r = psub(padd(abs_a, limit), limit);
+  Packet4f r = abs_a;
+  __asm__ ("addps %[limit], %[r]\n\t"
+           "subps %[limit], %[r]" : [r] "+x" (r) : [limit] "x" (limit));
+  // Where |a| < limit, use r (negated where a < 0); otherwise simply return a.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
 }
 
 template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) {
   // Adds and subtracts signum(a) * 2^52 to force rounding.
-  const Packet2d offset = 
-    pselect(pcmp_lt(a, pzero(a)), 
-      pset1<Packet2d>(-static_cast<double>(1ull<<52)),
-      pset1<Packet2d>(+static_cast<double>(1ull<<52)));
-  return psub(padd(a, offset), offset);
+  const Packet2d limit = pset1<Packet2d>(static_cast<double>(1ull<<52));
+  const Packet2d abs_a = pabs(a);
+  // Inline asm to prevent the compiler from optimizing away the addition and
+  // subtraction of limit that force the rounding.  Intended to be equivalent to:
+  // Packet2d r = psub(padd(abs_a, limit), limit);
+  Packet2d r = abs_a;
+  asm("addpd %[limit], %[r] \n\t"
+      "subpd %[limit], %[r]" : [r] "+x" (r) : [limit] "x" (limit));
+
+  // Where |a| < limit, use r (negated where a < 0); otherwise simply return a.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
author	Antonio Sanchez <cantonios@google.com>	2021-03-03 09:41:46 -0800
committer	Antonio Sanchez <cantonios@google.com>	2021-03-03 09:41:46 -0800
commit	e72dfeb8b9fa5662831b5d0bb9d132521f9173dd (patch)
tree	bdc7a7bd7f8fa13a65bd71897282bcc4e3bc87d5 /Eigen/src/Core/arch/SSE
parent	199c5f2b47eb1f8e5a2d20e60f07e97cd95a6ba6 (diff)