path: root/Eigen/src/Core/MathFunctionsImpl.h
author    Rasmus Munk Larsen <rmlarsen@google.com>  2019-12-16 21:33:42 +0000
committer Rasmus Munk Larsen <rmlarsen@google.com>  2019-12-16 21:33:42 +0000
commit    a5660744801116ad2f9ab4e9e389f194ba307a35 (patch)
tree      a7150bf1b016baab78b82ab8d2195b5e91916126 /Eigen/src/Core/MathFunctionsImpl.h
parent    8e5da71466591cc24352782b08dc78ddb94f0717 (diff)
Improve the accuracy of the fast approximate tanh and logistic functions in Eigen, so that they preserve relative accuracy to within a few ULPs where their function values tend to zero (around x = 0 for tanh, and for large negative x for the logistic function).
This change re-instates the fast rational approximation of the logistic function for float32 in Eigen (removed in https://gitlab.com/libeigen/eigen/commit/66f07efeaed39d6a67005343d7e0caf7d9eeacdb), but uses the more accurate approximation 1/(1+exp(-x)) ~= exp(x) below x = -9. The exponential is only calculated on the vectorized path if at least one element in the SIMD input vector is less than -9 (a scalar sketch of this strategy follows the benchmark numbers below).

This change also contains a few improvements to speed up the original float specialization of logistic:
- Introduce EIGEN_PREDICT_{FALSE,TRUE} for __builtin_expect and use it to predict that the logistic-only path is most likely (~2-3% speedup for the common case).
- Carefully set the upper clipping point to the smallest x where the approximation evaluates to exactly 1. This saves the explicit clamping of the output (~7% speedup).

The increased accuracy for tanh comes at a cost of 10-20%, depending on the instruction set.

The benchmarks below repeatedly call u = v.logistic() (u = v.tanh(), respectively), where u and v are of type Eigen::ArrayXf, have length 8k, and v contains random numbers in [-1, 1].

Benchmark numbers for logistic:

Before:
Benchmark                           Time(ns)   CPU(ns)   Iterations
--------------------------------------------------------------------
SSE      BM_eigen_logistic_float        4467      4468       155835   model_time: 4827
AVX      BM_eigen_logistic_float        2347      2347       299135   model_time: 2926
AVX+FMA  BM_eigen_logistic_float        1467      1467       476143   model_time: 2926
AVX512   BM_eigen_logistic_float         805       805       858696   model_time: 1463

After:
Benchmark                           Time(ns)   CPU(ns)   Iterations
--------------------------------------------------------------------
SSE      BM_eigen_logistic_float        2589      2590       270264   model_time: 4827
AVX      BM_eigen_logistic_float        1428      1428       489265   model_time: 2926
AVX+FMA  BM_eigen_logistic_float        1059      1059       662255   model_time: 2926
AVX512   BM_eigen_logistic_float         673       673      1000000   model_time: 1463

Benchmark numbers for tanh:

Before:
Benchmark                           Time(ns)   CPU(ns)   Iterations
--------------------------------------------------------------------
SSE      BM_eigen_tanh_float            2391      2391       292624   model_time: 4242
AVX      BM_eigen_tanh_float            1256      1256       554662   model_time: 2633
AVX+FMA  BM_eigen_tanh_float             823       823       866267   model_time: 1609
AVX512   BM_eigen_tanh_float             443       443      1578999   model_time: 805

After:
Benchmark                           Time(ns)   CPU(ns)   Iterations
--------------------------------------------------------------------
SSE      BM_eigen_tanh_float            2588      2588       273531   model_time: 4242
AVX      BM_eigen_tanh_float            1536      1536       452321   model_time: 2633
AVX+FMA  BM_eigen_tanh_float            1007      1007       694681   model_time: 1609
AVX512   BM_eigen_tanh_float             471       471      1472178   model_time: 805
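To make the strategy above concrete, here is a minimal scalar sketch, written for illustration only: the function name logistic_sketch is invented, and the exact formula on the main path merely stands in for the fitted rational polynomial p(x)/q(x) that the patch actually evaluates with fused multiply-adds.

#include <cmath>

// Illustrative scalar sketch of the logistic strategy in this change.
// Not Eigen's packet implementation: the name is a placeholder and the
// main path stands in for the tuned rational approximation p(x)/q(x).
inline float logistic_sketch(float x) {
  // Below roughly x = -9, 1/(1 + exp(-x)) ~= exp(x) to within a few ULPs,
  // and computing exp(x) directly preserves relative accuracy for the tiny
  // function values in this region.
  if (x < -9.0f) return std::exp(x);
  // The packet code instead clamps the input at the smallest x where the
  // rational approximation already evaluates to exactly 1, so no explicit
  // clamp of the output is needed.
  return 1.0f / (1.0f + std::exp(-x));
}

On the vectorized path the patch only evaluates the packet exponential when at least one lane of the input is below -9, with EIGEN_PREDICT_FALSE marking that branch as unlikely.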
Diffstat (limited to 'Eigen/src/Core/MathFunctionsImpl.h')
-rw-r--r--  Eigen/src/Core/MathFunctionsImpl.h  23
1 file changed, 13 insertions, 10 deletions
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 9ace5f32d..7af58fadb 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -17,10 +17,11 @@ namespace internal {
/** \internal \returns the hyperbolic tan of \a a (coeff-wise)
Doesn't do anything fancy, just a 13/6-degree rational interpolant which
- is accurate up to a couple of ulps in the (approximate) range [-8, 8],
- outside of which tanh(x) = +/-1 in single precision. This is done by
- Clamp the inputs to the range [-c, c]. The value c is chosen as the smallest
- value where the approximation evaluates to exactly 1.
+ is accurate up to a couple of ulps in the (approximate) range [-8, 8],
+ outside of which tanh(x) = +/-1 in single precision. The input is clamped
+ to the range [-c, c]. The value c is chosen as the smallest value where
+ the approximation evaluates to exactly 1. In the range [-0.0004, 0.0004]
+ the approximation tanh(x) ~= x is used for better accuracy as x tends to zero.
This implementation works on both scalars and packets.
*/
@@ -29,13 +30,15 @@ T generic_fast_tanh_float(const T& a_x)
{
// Clamp the inputs to the range [-c, c]
#ifdef EIGEN_VECTORIZE_FMA
- const T plus_clamp = pset1<T>(7.99881172180175781);
- const T minus_clamp = pset1<T>(-7.99881172180175781);
+ const T plus_clamp = pset1<T>(7.99881172180175781f);
+ const T minus_clamp = pset1<T>(-7.99881172180175781f);
#else
- const T plus_clamp = pset1<T>(7.90531110763549805);
- const T minus_clamp = pset1<T>(-7.90531110763549805);
+ const T plus_clamp = pset1<T>(7.90531110763549805f);
+ const T minus_clamp = pset1<T>(-7.90531110763549805f);
#endif
+ const T tiny = pset1<T>(0.0004f);
const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
+ const T tiny_mask = pcmp_lt(pabs(a_x), tiny);
// The monomial coefficients of the numerator polynomial (odd).
const T alpha_1 = pset1<T>(4.89352455891786e-03f);
const T alpha_3 = pset1<T>(6.37261928875436e-04f);
@@ -63,13 +66,13 @@ T generic_fast_tanh_float(const T& a_x)
p = pmadd(x2, p, alpha_1);
p = pmul(x, p);
- // Evaluate the denominator polynomial p.
+ // Evaluate the denominator polynomial q.
T q = pmadd(x2, beta_6, beta_4);
q = pmadd(x2, q, beta_2);
q = pmadd(x2, q, beta_0);
// Divide the numerator by the denominator.
- return pdiv(p, q);
+ return pselect(tiny_mask, x, pdiv(p, q));
}
template<typename RealScalar>
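For readers unfamiliar with the packet primitives used above (pabs, pcmp_lt, pselect), the small-input handling added by this hunk corresponds to the following scalar logic. This is an illustrative sketch only, not Eigen code: tanh_sketch is an invented name, std::tanh stands in for the 13/6-degree rational approximant, and the clamp constant is the non-FMA value from the patch.

#include <algorithm>
#include <cmath>

// Scalar illustration of the tiny-|x| path added in this patch.
// Not Eigen's implementation; std::tanh stands in for p(x)/q(x).
inline float tanh_sketch(float x) {
  // For |x| < 0.0004, tanh(x) ~= x to within float precision, so returning
  // x directly preserves relative accuracy as x tends to zero.
  if (std::abs(x) < 0.0004f) return x;
  // Clamp to the smallest magnitude where the approximation evaluates to
  // exactly +/-1 (non-FMA constant from the patch).
  x = std::min(std::max(x, -7.90531110763549805f), 7.90531110763549805f);
  return std::tanh(x);  // stand-in for the rational approximation
}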