diff options
Diffstat (limited to 'Eigen/src/Core')
-rw-r--r-- | Eigen/src/Core/arch/NEON/PacketMath.h | 34 | ||||
-rwxr-xr-x | Eigen/src/Core/arch/SSE/PacketMath.h | 35 |
2 files changed, 49 insertions, 20 deletions
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index ec6ea90c5..51cebaf2b 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -3207,20 +3207,34 @@ template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { // Adds and subtracts signum(a) * 2^23 to force rounding. - const Packet4f offset = - pselect(pcmp_lt(a, pzero(a)), - pset1<Packet4f>(-static_cast<float>(1<<23)), - pset1<Packet4f>(+static_cast<float>(1<<23))); - return psub(padd(a, offset), offset); + const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23)); + const Packet4f abs_a = pabs(a); + // Inline asm to prevent the compiler from optimizing away the + // addition and subtraction. + // Packet4f r = psub(padd(abs_a, limit), limit); + Packet4f r = abs_a; + __asm__ ("vadd.f32 %[r], %[r], %[limit]\n\t" + "vsub.f32 %[r], %[r], %[limit]" : [r] "+x" (r) : [limit] "x" (limit)); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; } template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) { // Adds and subtracts signum(a) * 2^23 to force rounding. - const Packet2f offset = - pselect(pcmp_lt(a, pzero(a)), - pset1<Packet2f>(-static_cast<float>(1<<23)), - pset1<Packet2f>(+static_cast<float>(1<<23))); - return psub(padd(a, offset), offset); + const Packet2f limit = pset1<Packet2f>(static_cast<float>(1<<23)); + const Packet2f abs_a = pabs(a); + // Inline asm to prevent the compiler from optimizing away the + // addition and subtraction. + // Packet2f r = psub(padd(abs_a, limit), limit); + Packet2f r = abs_a; + __asm__ ("vadd.f32 %[r], %[r], %[limit]\n\t" + "vsub.f32 %[r], %[r], %[limit]" : [r] "+x" (r) : [limit] "x" (limit)); + // If greater than limit, simply return a. Otherwise, account for sign. 
+ r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; } template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index b9821ad80..652ad1d34 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -646,20 +646,35 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { re #else template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { // Adds and subtracts signum(a) * 2^23 to force rounding. - const Packet4f offset = - pselect(pcmp_lt(a, pzero(a)), - pset1<Packet4f>(-static_cast<float>(1<<23)), - pset1<Packet4f>(+static_cast<float>(1<<23))); - return psub(padd(a, offset), offset); + const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23)); + const Packet4f abs_a = pabs(a); + // Inline asm to prevent the compiler from optimizing away the + // addition and subtraction. + // Packet4f r = psub(padd(abs_a, limit), limit); + Packet4f r = abs_a; + __asm__ ("addps %[limit], %[r]\n\t" + "subps %[limit], %[r]" : [r] "+x" (r) : [limit] "x" (limit)); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; } template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) { // Adds and subtracts signum(a) * 2^52 to force rounding. - const Packet2d offset = - pselect(pcmp_lt(a, pzero(a)), - pset1<Packet2d>(-static_cast<double>(1ull<<52)), - pset1<Packet2d>(+static_cast<double>(1ull<<52))); - return psub(padd(a, offset), offset); + const Packet2d limit = pset1<Packet2d>(static_cast<double>(1ull<<52)); + const Packet2d abs_a = pabs(a); + // Inline asm to prevent the compiler from optimizing away the + // addition and subtraction. 
+ // Packet2d r = psub(padd(abs_a, limit), limit); + Packet2d r = abs_a; + asm("addpd %[limit], %[r] \n\t" + "subpd %[limit], %[r]" : [r] "+x" (r) : [limit] "x" (limit)); + + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; } template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) |