From e72dfeb8b9fa5662831b5d0bb9d132521f9173dd Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Wed, 3 Mar 2021 09:41:46 -0800 Subject: Fix rint for SSE/NEON. It seems *sometimes* with aggressive optimizations the combination `psub(padd(a, b), b)` trick to force rounding is compiled away. Here we replace with inline assembly to prevent this (I tried `volatile`, but that leads to additional loads from memory). Also fixed an edge case for large inputs `a` where adding `b` bumps the value up a power of two and ends up rounding away more than just the fractional part. If we are over `2^digits` then just return the input. This edge case was missed in the test since the test was comparing approximate equality, which was still satisfied. Adding a strict equality option catches it. --- Eigen/src/Core/arch/NEON/PacketMath.h | 34 ++++++++++++++++++++++++---------- Eigen/src/Core/arch/SSE/PacketMath.h | 35 +++++++++++++++++++++++++---------- test/packetmath.cpp | 18 +++++++++--------- test/packetmath_test_shared.h | 25 +++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 29 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index ec6ea90c5..51cebaf2b 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -3207,20 +3207,34 @@ template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { // Adds and subtracts signum(a) * 2^23 to force rounding. - const Packet4f offset = - pselect(pcmp_lt(a, pzero(a)), - pset1(-static_cast(1<<23)), - pset1(+static_cast(1<<23))); - return psub(padd(a, offset), offset); + const Packet4f limit = pset1(static_cast(1<<23)); + const Packet4f abs_a = pabs(a); + // Inline asm to prevent the compiler from optimizing away the + // addition and subtraction. + // Packet4f r = psub(padd(abs_a, limit), limit); + Packet4f r = abs_a; + __asm__ ("vadd.f32 %[r], %[r], %[limit]\n\t" + "vsub.f32 %[r], %[r], %[limit]" : [r] "+x" (r) : [limit] "x" (limit)); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; } template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) { // Adds and subtracts signum(a) * 2^23 to force rounding. - const Packet2f offset = - pselect(pcmp_lt(a, pzero(a)), - pset1(-static_cast(1<<23)), - pset1(+static_cast(1<<23))); - return psub(padd(a, offset), offset); + const Packet2f limit = pset1(static_cast(1<<23)); + const Packet2f abs_a = pabs(a); + // Inline asm to prevent the compiler from optimizing away the + // addition and subtraction. + // Packet4f r = psub(padd(abs_a, limit), limit); + Packet2f r = abs_a; + __asm__ ("vadd.f32 %[r], %[r], %[limit]\n\t" + "vsub.f32 %[r], %[r], %[limit]" : [r] "+x" (r) : [limit] "x" (limit)); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; } template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index b9821ad80..652ad1d34 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -646,20 +646,35 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { re #else template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { // Adds and subtracts signum(a) * 2^23 to force rounding. - const Packet4f offset = - pselect(pcmp_lt(a, pzero(a)), - pset1(-static_cast(1<<23)), - pset1(+static_cast(1<<23))); - return psub(padd(a, offset), offset); + const Packet4f limit = pset1(static_cast(1<<23)); + const Packet4f abs_a = pabs(a); + // Inline asm to prevent the compiler from optimizing away the + // addition and subtraction. + // Packet4f r = psub(padd(abs_a, limit), limit); + Packet4f r = abs_a; + __asm__ ("addps %[limit], %[r]\n\t" + "subps %[limit], %[r]" : [r] "+x" (r) : [limit] "x" (limit)); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; } template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) { // Adds and subtracts signum(a) * 2^52 to force rounding. - const Packet2d offset = - pselect(pcmp_lt(a, pzero(a)), - pset1(-static_cast(1ull<<52)), - pset1(+static_cast(1ull<<52))); - return psub(padd(a, offset), offset); + const Packet2d limit = pset1(static_cast(1ull<<52)); + const Packet2d abs_a = pabs(a); + // Inline asm to prevent the compiler from optimizing away the + // addition and subtraction. + // Packet2d r = psub(padd(abs_a, limit), limit); + Packet2d r = abs_a; + asm("addpd %[limit], %[r] \n\t" + "subpd %[limit], %[r]" : [r] "+x" (r) : [limit] "x" (limit)); + + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; } template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 76ac47554..e69120a25 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -543,10 +543,10 @@ void packetmath_real() { CHECK_CWISE1_IF(PacketTraits::HasCos, std::cos, internal::pcos); CHECK_CWISE1_IF(PacketTraits::HasTan, std::tan, internal::ptan); - CHECK_CWISE1_IF(PacketTraits::HasRound, numext::round, internal::pround); - CHECK_CWISE1_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil); - CHECK_CWISE1_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor); - CHECK_CWISE1_IF(PacketTraits::HasRint, numext::rint, internal::print); + CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::round, internal::pround); + CHECK_CWISE1_EXACT_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil); + CHECK_CWISE1_EXACT_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor); + CHECK_CWISE1_EXACT_IF(PacketTraits::HasRint, numext::rint, internal::print); // Rounding edge cases. if (PacketTraits::HasRound || PacketTraits::HasCeil || PacketTraits::HasFloor || PacketTraits::HasRint) { @@ -583,10 +583,10 @@ void packetmath_real() { for (size_t k=0; k::min_exponent-10); + data1[PacketSize] = Scalar(std::numeric_limits::min_exponent-55); CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp); // overflow to inf data1[PacketSize] = Scalar(std::numeric_limits::max_exponent+10); diff --git a/test/packetmath_test_shared.h b/test/packetmath_test_shared.h index 027715a89..8fbe0d2f7 100644 --- a/test/packetmath_test_shared.h +++ b/test/packetmath_test_shared.h @@ -108,6 +108,23 @@ template bool areApprox(const Scalar* a, const Scalar* b, int s return true; } +template bool areEqual(const Scalar* a, const Scalar* b, int size) +{ + for (int i=0; i >(a,size) << "]" << " != vec: [" << Map >(b,size) << "]\n"; + return false; + } + } + return true; +} + #define CHECK_CWISE1(REFOP, POP) { \ for (int i=0; i VERIFY(test::areApprox(ref, data2, PacketSize) && #POP); \ } +#define CHECK_CWISE1_EXACT_IF(COND, REFOP, POP) if(COND) { \ + test::packet_helper h; \ + for (int i=0; i h; \ for (int i=0; i