aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch/AVX512
diff options
context:
space:
mode:
authorGravatar Rasmus Munk Larsen <rmlarsen@google.com>2019-11-15 17:09:46 -0800
committerGravatar Rasmus Munk Larsen <rmlarsen@google.com>2019-11-15 17:09:46 -0800
commitf1e83073082f2733eec6235f2fdf251217a54ade (patch)
treea20a4945bf0083ffe1a4d4a617a7a2c4740ba00a /Eigen/src/Core/arch/AVX512
parent2cb2915f908418c897773e0342f152768c13a0d8 (diff)
1. Fix a bug in psqrt and make it return 0 for +inf arguments.
2. Simplify handling of special cases by taking advantage of the fact that the builtin vrsqrt approximation handles negative, zero and +inf arguments correctly. This speeds up the SSE and AVX implementations by ~20%. 3. Make the Newton-Raphson formula used for rsqrt more numerically robust: Before: y = y * (1.5 - x/2 * y^2) After: y = y * (1.5 - y * (x/2) * y) Forming y^2 can overflow for very large or very small (denormalized) values of x, while x*y ~= 1. For AVX512, this makes it possible to compute accurate results for denormal inputs down to ~1e-42 in single precision. 4. Add a faster double precision implementation for Knights Landing using the vrsqrt28 instruction and a single Newton-Raphson iteration. Benchmark results: https://bitbucket.org/snippets/rmlarsen/5LBq9o
Diffstat (limited to 'Eigen/src/Core/arch/AVX512')
-rw-r--r--Eigen/src/Core/arch/AVX512/MathFunctions.h115
1 files changed, 71 insertions, 44 deletions
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index 273c22c18..0346c1d95 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -253,6 +253,7 @@ pexp<Packet8d>(const Packet8d& _x) {
return pmax(pmul(x, e), _x);
}*/
+
// Functions for sqrt.
// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
@@ -308,74 +309,100 @@ EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) {
}
#endif
-// Functions for rsqrt.
-// Almost identical to the sqrt routine, just leave out the last multiplication
-// and fill in NaN/Inf where needed. Note that this function only exists as an
-// iterative version for doubles since there is no instruction for diretly
-// computing the reciprocal square root in AVX-512.
-#ifdef EIGEN_FAST_MATH
+// prsqrt for float.
+#if defined(EIGEN_VECTORIZE_AVX512ER)
+
+template <>
+EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
+ return _mm512_rsqrt28_ps(x);
+}
+
+#elif EIGEN_FAST_MATH
+
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
prsqrt<Packet16f>(const Packet16f& _x) {
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000);
- _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
_EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
_EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
- _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
Packet16f neg_half = pmul(_x, p16f_minus_half);
- // select only the inverse sqrt of positive normal inputs (denormals are
- // flushed to zero and cause infs as well).
- __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
- Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
-
- // Fill in NaNs and Infs for the negative/zero entries.
- __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
- Packet16f infs_and_nans = _mm512_mask_blend_ps(
- neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
+ // Identity infinite, negative and denormal arguments.
+ __mmask16 inf_mask = _mm512_cmp_ps_mask(_x, p16f_inf, _CMP_EQ_OQ);
+ __mmask16 not_pos_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LE_OQ);
+ __mmask16 not_finite_pos_mask = not_pos_mask | inf_mask;
+
+ // Compute an approximate result using the rsqrt intrinsic, forcing +inf
+ // for denormals for consistency with AVX and SSE implementations.
+ Packet16f y_approx = _mm512_rsqrt14_ps(_x);
+
+ // Do a single step of Newton-Raphson iteration to improve the approximation.
+ // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
+ // It is essential to evaluate the inner term like this because forming
+ // y_n^2 may over- or underflow.
+ Packet16f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p16f_one_point_five));
+
+ // Select the result of the Newton-Raphson step for positive finite arguments.
+ // For other arguments, choose the output of the intrinsic. This will
+ // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
+ return _mm512_mask_blend_ps(not_finite_pos_mask, y_newton, y_approx);
+ }
- // Do a single step of Newton's iteration.
- x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
+#else
- // Insert NaNs and Infs in all the right places.
- return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
+template <>
+EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
+ _EIGEN_DECLARE_CONST_Packet16f(one, 1.0f);
+ return _mm512_div_ps(p16f_one, _mm512_sqrt_ps(x));
}
+#endif
+
+// prsqrt for double.
+#if EIGEN_FAST_MATH
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
prsqrt<Packet8d>(const Packet8d& _x) {
- _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
- _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff1000000000000LL);
_EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
_EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
- _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
+ _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
Packet8d neg_half = pmul(_x, p8d_minus_half);
- // select only the inverse sqrt of positive normal inputs (denormals are
- // flushed to zero and cause infs as well).
- __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
- Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
-
- // Fill in NaNs and Infs for the negative/zero entries.
- __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
- Packet8d infs_and_nans = _mm512_mask_blend_pd(
- neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
-
- // Do a first step of Newton's iteration.
- x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
-
- // Do a second step of Newton's iteration.
- x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+ // Identity infinite, negative and denormal arguments.
+ __mmask8 inf_mask = _mm512_cmp_pd_mask(_x, p8d_inf, _CMP_EQ_OQ);
+ __mmask8 not_pos_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LE_OQ);
+ __mmask8 not_finite_pos_mask = not_pos_mask | inf_mask;
- // Insert NaNs and Infs in all the right places.
- return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
+ // Compute an approximate result using the rsqrt intrinsic, forcing +inf
+ // for denormals for consistency with AVX and SSE implementations.
+#if defined(EIGEN_VECTORIZE_AVX512ER)
+ Packet8d y_approx = _mm512_rsqrt28_pd(_x);
+#else
+ Packet8d y_approx = _mm512_rsqrt14_pd(_x);
+#endif
+ // Do one or two steps of Newton-Raphson's to improve the approximation, depending on the
+ // starting accuracy (either 2^-14 or 2^-28, depending on whether AVX512ER is available).
+ // The Newton-Raphson algorithm has quadratic convergence and roughly doubles the number
+ // of correct digits for each step.
+ // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
+ // It is essential to evaluate the inner term like this because forming
+ // y_n^2 may over- or underflow.
+ Packet8d y_newton = pmul(y_approx, pmadd(neg_half, pmul(y_approx, y_approx), p8d_one_point_five));
+#if !defined(EIGEN_VECTORIZE_AVX512ER)
+ y_newton = pmul(y_newton, pmadd(y_newton, pmul(neg_half, y_newton), p8d_one_point_five));
+#endif
+ // Select the result of the Newton-Raphson step for positive finite arguments.
+ // For other arguments, choose the output of the intrinsic. This will
+ // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
+ return _mm512_mask_blend_pd(not_finite_pos_mask, y_newton, y_approx);
}
-#elif defined(EIGEN_VECTORIZE_AVX512ER)
+#else
template <>
-EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
- return _mm512_rsqrt28_ps(x);
+EIGEN_STRONG_INLINE Packet8d prsqrt<Packet8d>(const Packet8d& x) {
+ _EIGEN_DECLARE_CONST_Packet8d(one, 1.0f);
+ return _mm512_div_pd(p8d_one, _mm512_sqrt_pd(x));
}
#endif