Add `print` (packet rint) for SSE/NEON, use NEON rounding intrinsics if available.

In SSE, `print` adds and then subtracts signum(a) * 2^MantissaBits (2^23 for float, 2^52 for double), which forces rounding to an integer according to the current rounding mode. For NEON, we use the provided intrinsics for rint/floor/ceil if available (armv8). Related to #1969.
author: Antonio Sanchez <cantonios@google.com> 2021-02-25 14:29:49 -0800
committer: Antonio Sánchez <cantonios@google.com> 2021-02-27 22:42:07 +0000
commit: 1e0c7d4f4933b12a325dbaa2c79ce946bb13f7d6 (patch)
tree: 30d28ba3618296434df793ad25d06f68e6c98d65 /Eigen/src/Core/arch/SSE
parent: 976ae0ca6f381a855daddcba73de72737be2e8a7 (diff)
1 file changed, 29 insertions, 32 deletions
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 78fd99e64..b9821ad80 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -148,12 +148,11 @@ struct packet_traits<float> : default_packet_traits {
     HasErf = EIGEN_FAST_MATH,
     HasBlend = 1,
     HasCeil = 1,
-    HasFloor = 1
+    HasFloor = 1,
+    HasRint = 1,
 
 #ifdef EIGEN_VECTORIZE_SSE4_1
-    ,
-    HasRint = 1,
-    HasRound = 1
+    HasRound = 1,
 #endif
   };
 };
@@ -175,12 +174,10 @@ struct packet_traits<double> : default_packet_traits {
     HasRsqrt = 1,
     HasBlend = 1,
     HasFloor = 1,
-    HasCeil = 1
-
-#ifdef EIGEN_VECTORIZE_SSE4_1
-    ,
+    HasCeil = 1,
+    HasRint = 1,
+  #ifdef EIGEN_VECTORIZE_SSE4_1
     HasRound = 1,
-    HasRint = 1
 #endif
   };
 };
@@ -647,23 +644,17 @@ template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { ret
 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
 #else
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
-{
-  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
-  Packet4i emm0 = _mm_cvttps_epi32(a);
-  Packet4f tmp  = _mm_cvtepi32_ps(emm0);
-  // If greater, subtract one.
-  Packet4f mask = _mm_cmpgt_ps(tmp, a);
-  mask = pand(mask, cst_1);
-  tmp = psub(tmp, mask);
-  // Handle saturation cases.
-  const Packet4f cst_max = pset1<Packet4f>(static_cast<float>(NumTraits<int32_t>::highest()));
-  return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
+template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
+  // Adds and subtracts signum(a) * 2^23 to force rounding.
+  const Packet4f offset = 
+    pselect(pcmp_lt(a, pzero(a)), 
+      pset1<Packet4f>(-static_cast<float>(1<<23)),
+      pset1<Packet4f>(+static_cast<float>(1<<23)));
+  return psub(padd(a, offset), offset);
 }
 
-// Rounds to nearest integer.
-EIGEN_STRONG_INLINE Packet2d pround_to_nearest(const Packet2d& a) {
-  // Adds and subtracts signum(a) * 2^52 to force rounding to within precision.
+template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) {
+  // Adds and subtracts signum(a) * 2^52 to force rounding.
   const Packet2d offset = 
     pselect(pcmp_lt(a, pzero(a)), 
       pset1<Packet2d>(-static_cast<double>(1ull<<52)),
@@ -671,10 +662,20 @@ EIGEN_STRONG_INLINE Packet2d pround_to_nearest(const Packet2d& a) {
   return psub(padd(a, offset), offset);
 }
 
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+  Packet4f tmp  = print<Packet4f>(a);
+  // If greater, subtract one.
+  Packet4f mask = _mm_cmpgt_ps(tmp, a);
+  mask = pand(mask, cst_1);
+  return psub(tmp, mask);
+}
+
 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
 {
   const Packet2d cst_1 = pset1<Packet2d>(1.0);
-  Packet2d tmp  = pround_to_nearest(a);
+  Packet2d tmp  = print<Packet2d>(a);
   // If greater, subtract one.
   Packet2d mask = _mm_cmpgt_pd(tmp, a);
   mask = pand(mask, cst_1);
@@ -684,21 +685,17 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
 {
   const Packet4f cst_1 = pset1<Packet4f>(1.0f);
-  Packet4i emm0 = _mm_cvttps_epi32(a);
-  Packet4f tmp  = _mm_cvtepi32_ps(emm0);
+  Packet4f tmp  = print<Packet4f>(a);
   // If smaller, add one.
   Packet4f mask = _mm_cmplt_ps(tmp, a);
   mask = pand(mask, cst_1);
-  tmp = padd(tmp, mask);
-  // Handle saturation cases.
-  const Packet4f cst_max = pset1<Packet4f>(static_cast<float>(NumTraits<int32_t>::highest()));
-  return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
+  return padd(tmp, mask);
 }
 
 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
 {
   const Packet2d cst_1 = pset1<Packet2d>(1.0);
-  Packet2d tmp  = pround_to_nearest(a);
+  Packet2d tmp  = print<Packet2d>(a);
   // If smaller, add one.
   Packet2d mask = _mm_cmplt_pd(tmp, a);
   mask = pand(mask, cst_1);
author	Antonio Sanchez <cantonios@google.com>	2021-02-25 14:29:49 -0800
committer	Antonio Sánchez <cantonios@google.com>	2021-02-27 22:42:07 +0000
commit	1e0c7d4f4933b12a325dbaa2c79ce946bb13f7d6 (patch)
tree	30d28ba3618296434df793ad25d06f68e6c98d65 /Eigen/src/Core/arch/SSE
parent	976ae0ca6f381a855daddcba73de72737be2e8a7 (diff)