diff options
author | Antonio Sanchez <cantonios@google.com> | 2021-03-03 19:22:15 -0800 |
---|---|---|
committer | Antonio Sanchez <cantonios@google.com> | 2021-03-05 08:54:12 -0800 |
commit | 82d61af3a490154ad1c0ae2fe00c561095854897 (patch) | |
tree | 9137169da76e43ef4908ab87dc5990d801c48eda /Eigen/src/Core/arch/NEON/PacketMath.h | |
parent | 5f0b4a4010af4cbf6161a0d1a03a747addc44a5d (diff) |
Fix rint SSE/NEON again, using optimization barrier.
This is a new version of !423, which failed for MSVC.
Defined `EIGEN_OPTIMIZATION_BARRIER(X)` that uses inline assembly to
prevent operations involving `X` from crossing that barrier. It should
work on most `GNUC`-compatible compilers (MSVC doesn't seem to need
this). This is a modified version adapted from what was used in
`psincos_float` and tested on more platforms
(see #1674, https://godbolt.org/z/73ezTG).
Modified `rint` to use the barrier to prevent the add/subtract rounding
trick from being optimized away.
Also fixed an edge case for large inputs that get bumped up a power of two
and end up rounding away more than just the fractional part. If we are
over `2^digits` then just return the input. This edge case was missed in
the test since the test was comparing approximate equality, which was still
satisfied. Adding a strict equality option catches it.
Diffstat (limited to 'Eigen/src/Core/arch/NEON/PacketMath.h')
-rw-r--r-- | Eigen/src/Core/arch/NEON/PacketMath.h | 30 |
1 files changed, 20 insertions, 10 deletions
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index ec6ea90c5..7d69de6dc 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -3207,20 +3207,30 @@ template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { // Adds and subtracts signum(a) * 2^23 to force rounding. - const Packet4f offset = - pselect(pcmp_lt(a, pzero(a)), - pset1<Packet4f>(-static_cast<float>(1<<23)), - pset1<Packet4f>(+static_cast<float>(1<<23))); - return psub(padd(a, offset), offset); + const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23)); + const Packet4f abs_a = pabs(a); + Packet4f r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; } template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) { // Adds and subtracts signum(a) * 2^23 to force rounding. - const Packet2f offset = - pselect(pcmp_lt(a, pzero(a)), - pset1<Packet2f>(-static_cast<float>(1<<23)), - pset1<Packet2f>(+static_cast<float>(1<<23))); - return psub(padd(a, offset), offset); + const Packet2f limit = pset1<Packet2f>(static_cast<float>(1<<23)); + const Packet2f abs_a = pabs(a); + Packet2f r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; } template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) |