From a5660744801116ad2f9ab4e9e389f194ba307a35 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Mon, 16 Dec 2019 21:33:42 +0000
Subject: Improve accuracy of fast approximate tanh and the logistic functions
 in Eigen, such that they preserve relative accuracy to within a few ULPs
 where their function values tend to zero (around x=0 for tanh, and for large
 negative x for the logistic function).

This change re-instates the fast rational approximation of the logistic function for float32 in Eigen (removed in https://gitlab.com/libeigen/eigen/commit/66f07efeaed39d6a67005343d7e0caf7d9eeacdb), but uses the more accurate approximation 1/(1+exp(-1)) ~= exp(x) below -9. The exponential is only calculated on the vectorized path if at least one element in the SIMD input vector is less than -9.

This change also contains a few improvements to speed up the original float specialization of logistic:
  - Introduce EIGEN_PREDICT_{FALSE,TRUE} for __builtin_predict and use it to predict that the logistic-only path is most likely (~2-3% speedup for the common case).
  - Carefully set the upper clipping point to the smallest x where the approximation evaluates to exactly 1. This saves the explicit clamping of the output (~7% speedup).

The increased accuracy for tanh comes at a cost of 10-20% depending on instruction set.

The benchmarks below repeated calls

   u = v.logistic()  (u = v.tanh(), respectively)

where u and v are of type Eigen::ArrayXf, have length 8k, and v contains random numbers in [-1,1].

Benchmark numbers for logistic:

Before:
Benchmark                  Time(ns)        CPU(ns)     Iterations
-----------------------------------------------------------------
SSE
BM_eigen_logistic_float        4467           4468         155835  model_time: 4827
AVX
BM_eigen_logistic_float        2347           2347         299135  model_time: 2926
AVX+FMA
BM_eigen_logistic_float        1467           1467         476143  model_time: 2926
AVX512
BM_eigen_logistic_float         805            805         858696  model_time: 1463

After:
Benchmark                  Time(ns)        CPU(ns)     Iterations
-----------------------------------------------------------------
SSE
BM_eigen_logistic_float        2589           2590         270264  model_time: 4827
AVX
BM_eigen_logistic_float        1428           1428         489265  model_time: 2926
AVX+FMA
BM_eigen_logistic_float        1059           1059         662255  model_time: 2926
AVX512
BM_eigen_logistic_float         673            673        1000000  model_time: 1463

Benchmark numbers for tanh:

Before:
Benchmark                  Time(ns)        CPU(ns)     Iterations
-----------------------------------------------------------------
SSE
BM_eigen_tanh_float        2391           2391         292624  model_time: 4242
AVX
BM_eigen_tanh_float        1256           1256         554662  model_time: 2633
AVX+FMA
BM_eigen_tanh_float         823            823         866267  model_time: 1609
AVX512
BM_eigen_tanh_float         443            443        1578999  model_time: 805

After:
Benchmark                  Time(ns)        CPU(ns)     Iterations
-----------------------------------------------------------------
SSE
BM_eigen_tanh_float        2588           2588         273531  model_time: 4242
AVX
BM_eigen_tanh_float        1536           1536         452321  model_time: 2633
AVX+FMA
BM_eigen_tanh_float        1007           1007         694681  model_time: 1609
AVX512
BM_eigen_tanh_float         471            471        1472178  model_time: 805
---
 Eigen/src/Core/MathFunctionsImpl.h | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

(limited to 'Eigen/src/Core/MathFunctionsImpl.h')

diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 9ace5f32d..7af58fadb 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -17,10 +17,11 @@ namespace internal {
 
 /** \internal \returns the hyperbolic tan of \a a (coeff-wise)
     Doesn't do anything fancy, just a 13/6-degree rational interpolant which
-    is accurate up to a couple of ulps in the (approximate) range [-8, 8], 
-    outside of which tanh(x) = +/-1 in single precision. This is done by
-    Clamp the inputs to the range [-c, c]. The value c is chosen as the smallest
-    value where the approximation evaluates to exactly 1.
+    is accurate up to a couple of ulps in the (approximate) range [-8, 8],
+    outside of which tanh(x) = +/-1 in single precision. The input is clamped
+    to the range [-c, c]. The value c is chosen as the smallest value where
+    the approximation evaluates to exactly 1. In the reange [-0.0004, 0.0004]
+    the approxmation tanh(x) ~= x is used for better accuracy as x tends to zero.
 
     This implementation works on both scalars and packets.
 */
@@ -29,13 +30,15 @@ T generic_fast_tanh_float(const T& a_x)
 {
   // Clamp the inputs to the range [-c, c]
 #ifdef EIGEN_VECTORIZE_FMA
-  const T plus_clamp = pset1<T>(7.99881172180175781);
-  const T minus_clamp = pset1<T>(-7.99881172180175781);
+  const T plus_clamp = pset1<T>(7.99881172180175781f);
+  const T minus_clamp = pset1<T>(-7.99881172180175781f);
 #else
-  const T plus_clamp = pset1<T>(7.90531110763549805);
-  const T minus_clamp = pset1<T>(-7.90531110763549805);
+  const T plus_clamp = pset1<T>(7.90531110763549805f);
+  const T minus_clamp = pset1<T>(-7.90531110763549805f);
 #endif
+  const T tiny = pset1<T>(0.0004f);
   const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
+  const T tiny_mask = pcmp_lt(pabs(a_x), tiny);
   // The monomial coefficients of the numerator polynomial (odd).
   const T alpha_1 = pset1<T>(4.89352455891786e-03f);
   const T alpha_3 = pset1<T>(6.37261928875436e-04f);
@@ -63,13 +66,13 @@ T generic_fast_tanh_float(const T& a_x)
   p = pmadd(x2, p, alpha_1);
   p = pmul(x, p);
 
-  // Evaluate the denominator polynomial p.
+  // Evaluate the denominator polynomial q.
   T q = pmadd(x2, beta_6, beta_4);
   q = pmadd(x2, q, beta_2);
   q = pmadd(x2, q, beta_0);
 
   // Divide the numerator by the denominator.
-  return pdiv(p, q);
+  return pselect(tiny_mask, x, pdiv(p, q));
 }
 
 template<typename RealScalar>
-- 
cgit v1.2.3