From 82d61af3a490154ad1c0ae2fe00c561095854897 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Wed, 3 Mar 2021 19:22:15 -0800 Subject: Fix rint SSE/NEON again, using optimization barrier. This is a new version of !423, which failed for MSVC. Defined `EIGEN_OPTIMIZATION_BARRIER(X)` that uses inline assembly to prevent operations involving `X` from crossing that barrier. Should work on most `GNUC` compatible compilers (MSVC doesn't seem to need this). This is a modified version adapted from what was used in `psincos_float` and tested on more platforms (see #1674, https://godbolt.org/z/73ezTG). Modified `rint` to use the barrier to prevent the add/subtract rounding trick from being optimized away. Also fixed an edge case for large inputs that get bumped up a power of two and ends up rounding away more than just the fractional part. If we are over `2^digits` then just return the input. This edge case was missed in the test since the test was comparing approximate equality, which was still satisfied. Adding a strict equality option catches it. --- Eigen/src/Core/arch/SSE/PacketMath.h | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'Eigen/src/Core/arch/SSE') diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index b9821ad80..d7b8bc8ac 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -646,20 +646,30 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { re #else template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { // Adds and subtracts signum(a) * 2^23 to force rounding. - const Packet4f offset = - pselect(pcmp_lt(a, pzero(a)), - pset1(-static_cast(1<<23)), - pset1(+static_cast(1<<23))); - return psub(padd(a, offset), offset); + const Packet4f limit = pset1(static_cast(1<<23)); + const Packet4f abs_a = pabs(a); + Packet4f r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; } template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) { // Adds and subtracts signum(a) * 2^52 to force rounding. - const Packet2d offset = - pselect(pcmp_lt(a, pzero(a)), - pset1(-static_cast(1ull<<52)), - pset1(+static_cast(1ull<<52))); - return psub(padd(a, offset), offset); + const Packet2d limit = pset1(static_cast(1ull<<52)); + const Packet2d abs_a = pabs(a); + Packet2d r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; } template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) -- cgit v1.2.3