From 06e99aaf409eff4693c4256e59bb58313052818d Mon Sep 17 00:00:00 2001 From: Ilya Tokar Date: Thu, 12 Dec 2019 14:04:56 -0500 Subject: Bug 1785: fix pround on x86 to use the same rounding mode as std::round. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This also adds pset1frombits helper to Packet[24]d. Makes round ~45% slower for SSE: 1.65µs ± 1% before vs 2.45µs ± 2% after, still an order of magnitude faster than scalar version: 33.8µs ± 2%. --- Eigen/src/Core/arch/AVX/PacketMath.h | 16 +++++++++++++--- Eigen/src/Core/arch/SSE/PacketMath.h | 16 ++++++++++++++-- test/packetmath.cpp | 8 ++++++++ 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index ed8acfc61..00c3ae5d0 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -311,9 +311,6 @@ template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8 #endif } -template<> EIGEN_STRONG_INLINE Packet8f pround(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); } -template<> EIGEN_STRONG_INLINE Packet4d pround(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); } - template<> EIGEN_STRONG_INLINE Packet8f pceil(const Packet8f& a) { return _mm256_ceil_ps(a); } template<> EIGEN_STRONG_INLINE Packet4d pceil(const Packet4d& a) { return _mm256_ceil_pd(a); } @@ -391,6 +388,19 @@ template<> EIGEN_STRONG_INLINE Packet8i pandnot(const Packet8i& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet8f pround(const Packet8f& a) +{ + const Packet8f mask = pset1frombits(0x80000000u); + const Packet8f prev0dot5 = pset1frombits(0x3EFFFFFFu); + return _mm256_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} +template<> EIGEN_STRONG_INLINE Packet4d pround(const Packet4d& a) +{ + const Packet4d mask = _mm256_castsi256_pd(_mm256_set_epi64x(0x8000000000000000ull, 
0x8000000000000000ull, 0x8000000000000000ull, 0x8000000000000000ull)); + const Packet4d prev0dot5 = _mm256_castsi256_pd(_mm256_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull)); + return _mm256_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} + template<> EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, const Packet8f& a, const Packet8f& b) { return _mm256_blendv_ps(b,a,mask); } template<> EIGEN_STRONG_INLINE Packet4d pselect(const Packet4d& mask, const Packet4d& a, const Packet4d& b) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 4a772d078..c9aa391e3 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -421,8 +421,20 @@ template EIGEN_STRONG_INLINE Packet4i pshiftright(Packet4i a) { return _m template EIGEN_STRONG_INLINE Packet4i pshiftleft(Packet4i a) { return _mm_slli_epi32(a,N); } #ifdef EIGEN_VECTORIZE_SSE4_1 -template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return _mm_round_ps(a, 0); } -template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return _mm_round_pd(a, 0); } +template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) +{ + // Unfortunately _mm_round_ps doesn't have a rounding mode to implement numext::round. 
+ const Packet4f mask = pset1frombits(0x80000000u); + const Packet4f prev0dot5 = pset1frombits(0x3EFFFFFFu); + return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} + +template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) +{ + const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull)); + const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull)); + return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { return _mm_ceil_ps(a); } template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return _mm_ceil_pd(a); } diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 00f5f042b..ed0ec7efe 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -518,6 +518,14 @@ template void packetmath_real() CHECK_CWISE1_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil); CHECK_CWISE1_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor); + // See bug 1785. + for (int i=0; i(-1,1); -- cgit v1.2.3