From cdd8fdc32e730d5a65796a791ff13a92815c59b9 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Mon, 18 Jan 2021 13:25:16 +0000
Subject: Vectorize `pow(x, y)`. This closes
 https://gitlab.com/libeigen/eigen/-/issues/2085, which also contains a
 description of the algorithm.

I ran some testing (comparing to `std::pow(double(x), double(y)))` for `x` in the set of all (positive) floats in the interval `[std::sqrt(std::numeric_limits<float>::min()), std::sqrt(std::numeric_limits<float>::max())]`, and `y` in `{2, sqrt(2), -sqrt(2)}` I get the following error statistics:

```
max_rel_error = 8.34405e-07
rms_rel_error = 2.76654e-07
```

If I widen the range to all normal float I see lower accuracy for arguments where the result is subnormal, e.g. for `y = sqrt(2)`:

```
max_rel_error = 0.666667
rms = 6.8727e-05
count = 1335165689
argmax = 2.56049e-32, 2.10195e-45 != 1.4013e-45
```

which seems reasonable, since these results are subnormals with only couple of significant bits left.
---
 Eigen/src/Core/arch/SSE/PacketMath.h | 38 +++++++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 7 deletions(-)

(limited to 'Eigen/src/Core/arch/SSE')
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 4c5b664e6..05d9f8edd 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -26,7 +26,7 @@ namespace internal {
 
 #ifdef EIGEN_VECTORIZE_FMA
 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #endif
 #endif
 
@@ -147,13 +147,13 @@ struct packet_traits<float> : default_packet_traits {
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
     HasBlend = 1,
+    HasCeil = 1,
     HasFloor = 1
 
 #ifdef EIGEN_VECTORIZE_SSE4_1
     ,
     HasRint = 1,
-    HasRound = 1,
-    HasCeil = 1
+    HasRound = 1
 #endif
   };
 };
@@ -173,14 +173,14 @@ struct packet_traits<double> : default_packet_traits {
     HasExp  = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-    HasBlend = 1
+    HasBlend = 1,
+    HasFloor = 1,
+    HasCeil = 1
 
 #ifdef EIGEN_VECTORIZE_SSE4_1
     ,
     HasRound = 1,
-    HasRint = 1,
-    HasFloor = 1,
-    HasCeil = 1
+    HasRint = 1
 #endif
   };
 };
@@ -650,6 +650,30 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
   mask = pand(mask, cst_1);
   return psub(tmp, mask);
 }
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
+{
+  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
+  Packet4i emm0 = _mm_cvttps_epi32(a);
+  Packet4f tmp  = _mm_cvtepi32_ps(emm0);
+  /* if greater, substract 1 */
+  Packet4f mask = _mm_cmplt_ps(tmp, a);
+  mask = pand(mask, cst_1);
+  return padd(tmp, mask);
+}
+
+// WARNING: this pfloor implementation makes sense for small inputs only,
+// It is currently only used by pexp and not exposed through HasFloor.
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
+{
+  const Packet2d cst_1 = pset1<Packet2d>(1.0);
+  Packet4i emm0 = _mm_cvttpd_epi32(a);
+  Packet2d tmp  = _mm_cvtepi32_pd(emm0);
+  /* if greater, substract 1 */
+  Packet2d mask = _mm_cmplt_pd(tmp, a);
+  mask = pand(mask, cst_1);
+  return padd(tmp, mask);
+}
 #endif
 
 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
-- 
cgit v1.2.3