aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch/SSE/PacketMath.h
diff options
context:
space:
mode:
authorGravatar Ilya Tokar <tokarip@google.com>2019-12-12 14:04:56 -0500
committerGravatar Ilya Tokar <tokarip@google.com>2019-12-12 17:38:53 -0500
commit06e99aaf409eff4693c4256e59bb58313052818d (patch)
tree1d8c03970355f441cd3710ba49c6448db20a0108 /Eigen/src/Core/arch/SSE/PacketMath.h
parent73a8d572f5d2e7020b71026d48bfdf99decf8d5b (diff)
Bug 1785: fix pround on x86 to use the same rounding mode as std::round.
This also adds pset1frombits helper to Packet[24]d. Makes round ~45% slower for SSE: 1.65µs ± 1% before vs 2.45µs ± 2% after, stil an order of magnitude faster than scalar version: 33.8µs ± 2%.
Diffstat (limited to 'Eigen/src/Core/arch/SSE/PacketMath.h')
-rwxr-xr-xEigen/src/Core/arch/SSE/PacketMath.h16
1 files changed, 14 insertions, 2 deletions
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 4a772d078..c9aa391e3 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -421,8 +421,20 @@ template<int N> EIGEN_STRONG_INLINE Packet4i pshiftright(Packet4i a) { return _m
template<int N> EIGEN_STRONG_INLINE Packet4i pshiftleft(Packet4i a) { return _mm_slli_epi32(a,N); }
#ifdef EIGEN_VECTORIZE_SSE4_1
-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
-template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); }
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
+{
+ // Unfortunatly _mm_round_ps doesn't have a rounding mode to implement numext::round.
+ const Packet4f mask = pset1frombits<Packet4f>(0x80000000u);
+ const Packet4f prev0dot5 = pset1frombits<Packet4f>(0x3EFFFFFFu);
+ return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
+{
+ const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull));
+ const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
+ return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }