aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch/SSE
diff options
context:
space:
mode:
authorGravatar Antonio Sanchez <cantonios@google.com>2021-02-25 14:29:49 -0800
committerGravatar Antonio Sánchez <cantonios@google.com>2021-02-27 22:42:07 +0000
commit1e0c7d4f4933b12a325dbaa2c79ce946bb13f7d6 (patch)
tree30d28ba3618296434df793ad25d06f68e6c98d65 /Eigen/src/Core/arch/SSE
parent976ae0ca6f381a855daddcba73de72737be2e8a7 (diff)
Add print for SSE/NEON, use NEON rounding intrinsics if available.
In SSE, by adding/subtracting 2^MantissaBits, we force rounding according to the current rounding mode. For NEON, we use the provided intrinsics for rint/floor/ceil if available (armv8). Related to #1969.
Diffstat (limited to 'Eigen/src/Core/arch/SSE')
-rwxr-xr-xEigen/src/Core/arch/SSE/PacketMath.h61
1 files changed, 29 insertions, 32 deletions
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 78fd99e64..b9821ad80 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -148,12 +148,11 @@ struct packet_traits<float> : default_packet_traits {
HasErf = EIGEN_FAST_MATH,
HasBlend = 1,
HasCeil = 1,
- HasFloor = 1
+ HasFloor = 1,
+ HasRint = 1,
#ifdef EIGEN_VECTORIZE_SSE4_1
- ,
- HasRint = 1,
- HasRound = 1
+ HasRound = 1,
#endif
};
};
@@ -175,12 +174,10 @@ struct packet_traits<double> : default_packet_traits {
HasRsqrt = 1,
HasBlend = 1,
HasFloor = 1,
- HasCeil = 1
-
-#ifdef EIGEN_VECTORIZE_SSE4_1
- ,
+ HasCeil = 1,
+ HasRint = 1,
+ #ifdef EIGEN_VECTORIZE_SSE4_1
HasRound = 1,
- HasRint = 1
#endif
};
};
@@ -647,23 +644,17 @@ template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { ret
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
#else
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
-{
- const Packet4f cst_1 = pset1<Packet4f>(1.0f);
- Packet4i emm0 = _mm_cvttps_epi32(a);
- Packet4f tmp = _mm_cvtepi32_ps(emm0);
- // If greater, subtract one.
- Packet4f mask = _mm_cmpgt_ps(tmp, a);
- mask = pand(mask, cst_1);
- tmp = psub(tmp, mask);
- // Handle saturation cases.
- const Packet4f cst_max = pset1<Packet4f>(static_cast<float>(NumTraits<int32_t>::highest()));
- return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
+template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
+ // Adds and subtracts signum(a) * 2^23 to force rounding.
+ const Packet4f offset =
+ pselect(pcmp_lt(a, pzero(a)),
+ pset1<Packet4f>(-static_cast<float>(1<<23)),
+ pset1<Packet4f>(+static_cast<float>(1<<23)));
+ return psub(padd(a, offset), offset);
}
-// Rounds to nearest integer.
-EIGEN_STRONG_INLINE Packet2d pround_to_nearest(const Packet2d& a) {
- // Adds and subtracts signum(a) * 2^52 to force rounding to within precision.
+template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) {
+ // Adds and subtracts signum(a) * 2^52 to force rounding.
const Packet2d offset =
pselect(pcmp_lt(a, pzero(a)),
pset1<Packet2d>(-static_cast<double>(1ull<<52)),
@@ -671,10 +662,20 @@ EIGEN_STRONG_INLINE Packet2d pround_to_nearest(const Packet2d& a) {
return psub(padd(a, offset), offset);
}
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+ const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+ Packet4f tmp = print<Packet4f>(a);
+ // If greater, subtract one.
+ Packet4f mask = _mm_cmpgt_ps(tmp, a);
+ mask = pand(mask, cst_1);
+ return psub(tmp, mask);
+}
+
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
{
const Packet2d cst_1 = pset1<Packet2d>(1.0);
- Packet2d tmp = pround_to_nearest(a);
+ Packet2d tmp = print<Packet2d>(a);
// If greater, subtract one.
Packet2d mask = _mm_cmpgt_pd(tmp, a);
mask = pand(mask, cst_1);
@@ -684,21 +685,17 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
{
const Packet4f cst_1 = pset1<Packet4f>(1.0f);
- Packet4i emm0 = _mm_cvttps_epi32(a);
- Packet4f tmp = _mm_cvtepi32_ps(emm0);
+ Packet4f tmp = print<Packet4f>(a);
// If smaller, add one.
Packet4f mask = _mm_cmplt_ps(tmp, a);
mask = pand(mask, cst_1);
- tmp = padd(tmp, mask);
- // Handle saturation cases.
- const Packet4f cst_max = pset1<Packet4f>(static_cast<float>(NumTraits<int32_t>::highest()));
- return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
+ return padd(tmp, mask);
}
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
{
const Packet2d cst_1 = pset1<Packet2d>(1.0);
- Packet2d tmp = pround_to_nearest(a);
+ Packet2d tmp = print<Packet2d>(a);
// If smaller, add one.
Packet2d mask = _mm_cmplt_pd(tmp, a);
mask = pand(mask, cst_1);