diff options
author | Ilya Tokar <tokarip@google.com> | 2019-12-12 14:04:56 -0500 |
---|---|---|
committer | Ilya Tokar <tokarip@google.com> | 2019-12-12 17:38:53 -0500 |
commit | 06e99aaf409eff4693c4256e59bb58313052818d (patch) | |
tree | 1d8c03970355f441cd3710ba49c6448db20a0108 /Eigen/src/Core/arch/SSE/PacketMath.h | |
parent | 73a8d572f5d2e7020b71026d48bfdf99decf8d5b (diff) |
Bug 1785: fix pround on x86 to use the same rounding mode as std::round.
This also adds pset1frombits helper to Packet[24]d.
Makes round ~45% slower for SSE: 1.65µs ± 1% before vs 2.45µs ± 2% after,
stil an order of magnitude faster than scalar version: 33.8µs ± 2%.
Diffstat (limited to 'Eigen/src/Core/arch/SSE/PacketMath.h')
-rwxr-xr-x | Eigen/src/Core/arch/SSE/PacketMath.h | 16 |
1 files changed, 14 insertions, 2 deletions
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 4a772d078..c9aa391e3 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -421,8 +421,20 @@ template<int N> EIGEN_STRONG_INLINE Packet4i pshiftright(Packet4i a) { return _m template<int N> EIGEN_STRONG_INLINE Packet4i pshiftleft(Packet4i a) { return _mm_slli_epi32(a,N); } #ifdef EIGEN_VECTORIZE_SSE4_1 -template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); } -template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); } +template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) +{ + // Unfortunatly _mm_round_ps doesn't have a rounding mode to implement numext::round. + const Packet4f mask = pset1frombits<Packet4f>(0x80000000u); + const Packet4f prev0dot5 = pset1frombits<Packet4f>(0x3EFFFFFFu); + return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} + +template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) +{ + const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull)); + const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull)); + return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); } template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); } |