diff options
author | Antonio Sanchez <cantonios@google.com> | 2021-02-25 14:29:49 -0800 |
---|---|---|
committer | Antonio Sánchez <cantonios@google.com> | 2021-02-27 22:42:07 +0000 |
commit | 1e0c7d4f4933b12a325dbaa2c79ce946bb13f7d6 (patch) | |
tree | 30d28ba3618296434df793ad25d06f68e6c98d65 /Eigen/src/Core/arch/SSE | |
parent | 976ae0ca6f381a855daddcba73de72737be2e8a7 (diff) |
Add print for SSE/NEON, use NEON rounding intrinsics if available.
In SSE, by adding/subtracting 2^MantissaBits, we force rounding according to the
current rounding mode.
For NEON, we use the provided intrinsics for rint/floor/ceil if
available (armv8).
Related to #1969.
Diffstat (limited to 'Eigen/src/Core/arch/SSE')
-rwxr-xr-x | Eigen/src/Core/arch/SSE/PacketMath.h | 61 |
1 files changed, 29 insertions, 32 deletions
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 78fd99e64..b9821ad80 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -148,12 +148,11 @@ struct packet_traits<float> : default_packet_traits { HasErf = EIGEN_FAST_MATH, HasBlend = 1, HasCeil = 1, - HasFloor = 1 + HasFloor = 1, + HasRint = 1, #ifdef EIGEN_VECTORIZE_SSE4_1 - , - HasRint = 1, - HasRound = 1 + HasRound = 1, #endif }; }; @@ -175,12 +174,10 @@ struct packet_traits<double> : default_packet_traits { HasRsqrt = 1, HasBlend = 1, HasFloor = 1, - HasCeil = 1 - -#ifdef EIGEN_VECTORIZE_SSE4_1 - , + HasCeil = 1, + HasRint = 1, + #ifdef EIGEN_VECTORIZE_SSE4_1 HasRound = 1, - HasRint = 1 #endif }; }; @@ -647,23 +644,17 @@ template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { ret template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); } #else -template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) -{ - const Packet4f cst_1 = pset1<Packet4f>(1.0f); - Packet4i emm0 = _mm_cvttps_epi32(a); - Packet4f tmp = _mm_cvtepi32_ps(emm0); - // If greater, subtract one. - Packet4f mask = _mm_cmpgt_ps(tmp, a); - mask = pand(mask, cst_1); - tmp = psub(tmp, mask); - // Handle saturation cases. - const Packet4f cst_max = pset1<Packet4f>(static_cast<float>(NumTraits<int32_t>::highest())); - return pselect(pcmp_lt(pabs(a), cst_max), tmp, a); +template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { + // Adds and subtracts signum(a) * 2^23 to force rounding. + const Packet4f offset = + pselect(pcmp_lt(a, pzero(a)), + pset1<Packet4f>(-static_cast<float>(1<<23)), + pset1<Packet4f>(+static_cast<float>(1<<23))); + return psub(padd(a, offset), offset); } -// Rounds to nearest integer. -EIGEN_STRONG_INLINE Packet2d pround_to_nearest(const Packet2d& a) { - // Adds and subtracts signum(a) * 2^52 to force rounding to within precision. +template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) { + // Adds and subtracts signum(a) * 2^52 to force rounding. const Packet2d offset = pselect(pcmp_lt(a, pzero(a)), pset1<Packet2d>(-static_cast<double>(1ull<<52)), @@ -671,10 +662,20 @@ EIGEN_STRONG_INLINE Packet2d pround_to_nearest(const Packet2d& a) { return psub(padd(a, offset), offset); } +template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) +{ + const Packet4f cst_1 = pset1<Packet4f>(1.0f); + Packet4f tmp = print<Packet4f>(a); + // If greater, subtract one. + Packet4f mask = _mm_cmpgt_ps(tmp, a); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} + template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { const Packet2d cst_1 = pset1<Packet2d>(1.0); - Packet2d tmp = pround_to_nearest(a); + Packet2d tmp = print<Packet2d>(a); // If greater, subtract one. Packet2d mask = _mm_cmpgt_pd(tmp, a); mask = pand(mask, cst_1); @@ -684,21 +685,17 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { const Packet4f cst_1 = pset1<Packet4f>(1.0f); - Packet4i emm0 = _mm_cvttps_epi32(a); - Packet4f tmp = _mm_cvtepi32_ps(emm0); + Packet4f tmp = print<Packet4f>(a); // If smaller, add one. Packet4f mask = _mm_cmplt_ps(tmp, a); mask = pand(mask, cst_1); - tmp = padd(tmp, mask); - // Handle saturation cases. - const Packet4f cst_max = pset1<Packet4f>(static_cast<float>(NumTraits<int32_t>::highest())); - return pselect(pcmp_lt(pabs(a), cst_max), tmp, a); + return padd(tmp, mask); } template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { const Packet2d cst_1 = pset1<Packet2d>(1.0); - Packet2d tmp = pround_to_nearest(a); + Packet2d tmp = print<Packet2d>(a); // If smaller, add one. Packet2d mask = _mm_cmplt_pd(tmp, a); mask = pand(mask, cst_1); |