Fix rint SSE/NEON again, using optimization barrier.

This is a new version of !423, which failed for MSVC. It defines `EIGEN_OPTIMIZATION_BARRIER(X)`, which uses inline assembly to prevent operations involving `X` from crossing that barrier. It should work on most GNUC-compatible compilers (MSVC doesn't seem to need this). The barrier is adapted from the one used in `psincos_float` and has been tested on more platforms (see #1674, https://godbolt.org/z/73ezTG). `rint` now uses the barrier to prevent the add/subtract rounding trick from being optimized away. This also fixes an edge case for large inputs that get bumped up a power of two and end up rounding away more than just the fractional part: if the input's magnitude is at or above `2^digits`, the input is simply returned. This edge case was missed by the test because the test compared for approximate equality, which was still satisfied. Adding a strict equality option catches it.
author: Antonio Sanchez <cantonios@google.com> 2021-03-03 19:22:15 -0800
committer: Antonio Sanchez <cantonios@google.com> 2021-03-05 08:54:12 -0800
commit: 82d61af3a490154ad1c0ae2fe00c561095854897 (patch)
tree: 9137169da76e43ef4908ab87dc5990d801c48eda /Eigen/src/Core/arch/NEON/PacketMath.h
parent: 5f0b4a4010af4cbf6161a0d1a03a747addc44a5d (diff)
1 files changed, 20 insertions, 10 deletions
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index ec6ea90c5..7d69de6dc 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -3207,20 +3207,30 @@ template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
 
 template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
   // Adds and subtracts signum(a) * 2^23 to force rounding.
-  const Packet4f offset = 
-    pselect(pcmp_lt(a, pzero(a)), 
-      pset1<Packet4f>(-static_cast<float>(1<<23)),
-      pset1<Packet4f>(+static_cast<float>(1<<23)));
-  return psub(padd(a, offset), offset);
+  const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
+  const Packet4f abs_a = pabs(a);
+  Packet4f r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
 }
 
 template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) {
   // Adds and subtracts signum(a) * 2^23 to force rounding.
-  const Packet2f offset = 
-    pselect(pcmp_lt(a, pzero(a)), 
-      pset1<Packet2f>(-static_cast<float>(1<<23)),
-      pset1<Packet2f>(+static_cast<float>(1<<23)));
-  return psub(padd(a, offset), offset);
+  const Packet2f limit = pset1<Packet2f>(static_cast<float>(1<<23));
+  const Packet2f abs_a = pabs(a);
+  Packet2f r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
author	Antonio Sanchez <cantonios@google.com>	2021-03-03 19:22:15 -0800
committer	Antonio Sanchez <cantonios@google.com>	2021-03-05 08:54:12 -0800
commit	82d61af3a490154ad1c0ae2fe00c561095854897 (patch)
tree	9137169da76e43ef4908ab87dc5990d801c48eda /Eigen/src/Core/arch/NEON/PacketMath.h
parent	5f0b4a4010af4cbf6161a0d1a03a747addc44a5d (diff)