Fix AVX512 implementations of psqrt

This commit fixes the AVX512 implementations of psqrt in the same way that 3ed67cb0bb4af65fbf243df598604a8c7630bf7d fixed the AVX2 version of this function. The AVX512 versions of psqrt incorrectly return -0.0 for negative values, instead of NaN. Fixing the issues requires adding some additional instructions that slow down the algorithms. A similar test to the one used in 3ed67cb0bb4af65fbf243df598604a8c7630bf7d shows that the corrected Packet16f code runs at 73% of the speed of the existing code, while the corrected Packed8d function runs at 68% of the original.
author: Mark D Ryan <mark.d.ryan@intel.com> 2018-06-25 05:05:02 -0700
committer: Mark D Ryan <mark.d.ryan@intel.com> 2018-06-25 05:05:02 -0700
commit: e79c5149bf049994b86ab19aac24b0dde2bcc387 (patch)
tree: 8ed580404924ba7ecf9d358682804fbf0b71ba5c /Eigen/src/Core/arch/AVX512/MathFunctions.h
parent: 2ebcb911b27174c5402e4c7af3d2738fd042a5e2 (diff)
1 files changed, 19 insertions, 28 deletions
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index 9c1717f76..81a3b4f62 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -258,48 +258,39 @@ pexp<Packet8d>(const Packet8d& _x) {
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
 psqrt<Packet16f>(const Packet16f& _x) {
-  _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
-  _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
-
-  Packet16f neg_half = pmul(_x, p16f_minus_half);
+  Packet16f half = pmul(_x, pset1<Packet16f>(.5f));
+  __mmask16 denormal_mask = _mm512_kand(
+      _mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
+                        _CMP_LT_OQ),
+      _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
 
-  // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs as well).
-  __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
-  Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_setzero_ps(), _mm512_rsqrt14_ps(_x));
+  Packet16f x = _mm512_rsqrt14_ps(_x);
 
   // Do a single step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
+  x = pmul(x, psub(pset1<Packet16f>(1.5f), pmul(half, pmul(x,x))));
 
-  // Multiply the original _x by it's reciprocal square root to extract the
-  // square root.
-  return pmul(_x, x);
+  // Flush results for denormals to zero.
+  return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
 }
 
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
 psqrt<Packet8d>(const Packet8d& _x) {
-  _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
-  _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
-  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
-
-  Packet8d neg_half = pmul(_x, p8d_minus_half);
+  Packet8d half = pmul(_x, pset1<Packet8d>(.5f));
+  __mmask16 denormal_mask = _mm512_kand(
+      _mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
+                        _CMP_LT_OQ),
+      _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
 
-  // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs as well).
-  __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
-  Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_setzero_pd(), _mm512_rsqrt14_pd(_x));
+  Packet8d x = _mm512_rsqrt14_pd(_x);
 
-  // Do a first step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+  // Do a single step of Newton's iteration.
+  x = pmul(x, psub(pset1<Packet8d>(1.5f), pmul(half, pmul(x,x))));
 
   // Do a second step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+  x = pmul(x, psub(pset1<Packet8d>(1.5f), pmul(half, pmul(x,x))));
 
-  // Multiply the original _x by it's reciprocal square root to extract the
-  // square root.
-  return pmul(_x, x);
+  return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
 }
 #else
 template <>
author	Mark D Ryan <mark.d.ryan@intel.com>	2018-06-25 05:05:02 -0700
committer	Mark D Ryan <mark.d.ryan@intel.com>	2018-06-25 05:05:02 -0700
commit	e79c5149bf049994b86ab19aac24b0dde2bcc387 (patch)
tree	8ed580404924ba7ecf9d358682804fbf0b71ba5c /Eigen/src/Core/arch/AVX512/MathFunctions.h
parent	2ebcb911b27174c5402e4c7af3d2738fd042a5e2 (diff)