diff options
author | Rasmus Munk Larsen <rmlarsen@google.com> | 2019-11-15 17:09:46 -0800 |
---|---|---|
committer | Rasmus Munk Larsen <rmlarsen@google.com> | 2019-11-15 17:09:46 -0800 |
commit | f1e83073082f2733eec6235f2fdf251217a54ade (patch) | |
tree | a20a4945bf0083ffe1a4d4a617a7a2c4740ba00a /Eigen/src/Core/arch/SSE | |
parent | 2cb2915f908418c897773e0342f152768c13a0d8 (diff) |
1. Fix a bug in psqrt and make it return 0 for +inf arguments.
2. Simplify handling of special cases by taking advantage of the fact that the
builtin vrsqrt approximation handles negative, zero and +inf arguments correctly.
This speeds up the SSE and AVX implementations by ~20%.
3. Make the Newton-Raphson formula used for rsqrt more numerically robust:
Before: y = y * (1.5 - x/2 * y^2)
After: y = y * (1.5 - y * (x/2) * y)
Forming y^2 can overflow for very large or very small (denormalized) values of x, while x*y ~= 1. For AVX512, this makes it possible to compute accurate results for denormal inputs down to ~1e-42 in single precision.
4. Add a faster double precision implementation for Knights Landing using the vrsqrt28 instruction and a single Newton-Raphson iteration.
Benchmark results: https://bitbucket.org/snippets/rmlarsen/5LBq9o
Diffstat (limited to 'Eigen/src/Core/arch/SSE')
-rw-r--r-- | Eigen/src/Core/arch/SSE/MathFunctions.h | 40 |
1 files changed, 22 insertions, 18 deletions
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index b21bb93bf..85255ad23 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -98,30 +98,34 @@ Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt<Packet4f>(const Packet4f& _x) { - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000u); _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f); _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u); _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000u); Packet4f neg_half = pmul(_x, p4f_minus_half); - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min); - Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x)); - - // Fill in NaNs and Infs for the negative/zero entries. - Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps()); - Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask); - Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan), - _mm_and_ps(zero_mask, p4f_inf)); - - // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five)); - - // Insert NaNs and Infs in all the right places. - return _mm_or_ps(x, infs_and_nans); + // Identity infinite, zero, negative and denormal arguments. + Packet4f lt_min_mask = _mm_cmplt_ps(_x, p4f_flt_min); + Packet4f inf_mask = _mm_cmpeq_ps(_x, p4f_inf); + Packet4f not_normal_finite_mask = _mm_or_ps(lt_min_mask, inf_mask); + + // Compute an approximate result using the rsqrt intrinsic. + Packet4f y_approx = _mm_rsqrt_ps(_x); + + // Do a single step of Newton-Raphson iteration to improve the approximation. + // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). + // It is essential to evaluate the inner term like this because forming + // y_n^2 may over- or underflow. + Packet4f y_newton = pmul( + y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p4f_one_point_five)); + + // Select the result of the Newton-Raphson step for positive normal arguments. + // For other arguments, choose the output of the intrinsic. This will + // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if + // x is zero or a positive denormalized float (equivalent to flushing positive + // denormalized inputs to zero). + return pselect<Packet4f>(not_normal_finite_mask, y_approx, y_newton); } #else |