diff options
author | Mark D Ryan <mark.d.ryan@intel.com> | 2018-06-25 05:05:02 -0700 |
---|---|---|
committer | Mark D Ryan <mark.d.ryan@intel.com> | 2018-06-25 05:05:02 -0700 |
commit | e79c5149bf049994b86ab19aac24b0dde2bcc387 (patch) | |
tree | 8ed580404924ba7ecf9d358682804fbf0b71ba5c /Eigen/src/Core/arch/AVX512/MathFunctions.h | |
parent | 2ebcb911b27174c5402e4c7af3d2738fd042a5e2 (diff) |
Fix AVX512 implementations of psqrt
This commit fixes the AVX512 implementations of psqrt in the same
way that 3ed67cb0bb4af65fbf243df598604a8c7630bf7d
fixed the AVX2 version of this function. The
AVX512 versions of psqrt incorrectly return -0.0 for negative
values, instead of NaN. Fixing the issues requires adding
some additional instructions that slow down the algorithms. A
similar test to the one used in 3ed67cb0bb4af65fbf243df598604a8c7630bf7d
shows that the
corrected Packet16f code runs at 73% of the speed of the existing code,
while the corrected Packed8d function runs at 68% of the original.
Diffstat (limited to 'Eigen/src/Core/arch/AVX512/MathFunctions.h')
-rw-r--r-- | Eigen/src/Core/arch/AVX512/MathFunctions.h | 47 |
1 files changed, 19 insertions, 28 deletions
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index 9c1717f76..81a3b4f62 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -258,48 +258,39 @@ pexp<Packet8d>(const Packet8d& _x) { template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f psqrt<Packet16f>(const Packet16f& _x) { - _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f); - _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000); - - Packet16f neg_half = pmul(_x, p16f_minus_half); + Packet16f half = pmul(_x, pset1<Packet16f>(.5f)); + __mmask16 denormal_mask = _mm512_kand( + _mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()), + _CMP_LT_OQ), + _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ)); - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ); - Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_setzero_ps(), _mm512_rsqrt14_ps(_x)); + Packet16f x = _mm512_rsqrt14_ps(_x); // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); + x = pmul(x, psub(pset1<Packet16f>(1.5f), pmul(half, pmul(x,x)))); - // Multiply the original _x by it's reciprocal square root to extract the - // square root. - return pmul(_x, x); + // Flush results for denormals to zero. + return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps()); } template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d psqrt<Packet8d>(const Packet8d& _x) { - _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5); - _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5); - _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL); - - Packet8d neg_half = pmul(_x, p8d_minus_half); + Packet8d half = pmul(_x, pset1<Packet8d>(.5f)); + __mmask16 denormal_mask = _mm512_kand( + _mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()), + _CMP_LT_OQ), + _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ)); - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ); - Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_setzero_pd(), _mm512_rsqrt14_pd(_x)); + Packet8d x = _mm512_rsqrt14_pd(_x); - // Do a first step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + // Do a single step of Newton's iteration. + x = pmul(x, psub(pset1<Packet8d>(1.5f), pmul(half, pmul(x,x)))); // Do a second step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + x = pmul(x, psub(pset1<Packet8d>(1.5f), pmul(half, pmul(x,x)))); - // Multiply the original _x by it's reciprocal square root to extract the - // square root. - return pmul(_x, x); + return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd()); } #else template <> |