* Add iterative psqrt<double> for AVX and SSE when FMA is available. This provides a ~10% speedup.

* Write iterative sqrt explicitly in terms of pmadd. This gives up to 7% speedup for psqrt<float> with AVX & SSE with FMA. * Remove iterative psqrt<double> for NEON, because the initial rsqrt apprimation is not accurate enough for convergence in 2 Newton-Raphson steps and with 3 steps, just calling the builtin sqrt insn is faster. The following benchmarks were compiled with clang "-O2 -fast-math -mfma" and with and without -mavx. AVX+FMA (float) name old cpu/op new cpu/op delta BM_eigen_sqrt_float/1 1.08ns ± 0% 1.09ns ± 1% ~ BM_eigen_sqrt_float/8 2.07ns ± 0% 2.08ns ± 1% ~ BM_eigen_sqrt_float/64 12.4ns ± 0% 12.4ns ± 1% ~ BM_eigen_sqrt_float/512 95.7ns ± 0% 95.5ns ± 0% ~ BM_eigen_sqrt_float/4k 776ns ± 0% 763ns ± 0% -1.67% BM_eigen_sqrt_float/32k 6.57µs ± 1% 6.13µs ± 0% -6.69% BM_eigen_sqrt_float/256k 83.7µs ± 3% 83.3µs ± 2% ~ BM_eigen_sqrt_float/1M 335µs ± 2% 332µs ± 2% ~ SSE+FMA (float) name old cpu/op new cpu/op delta BM_eigen_sqrt_float/1 1.08ns ± 0% 1.09ns ± 0% ~ BM_eigen_sqrt_float/8 2.07ns ± 0% 2.06ns ± 0% ~ BM_eigen_sqrt_float/64 12.4ns ± 0% 12.4ns ± 1% ~ BM_eigen_sqrt_float/512 95.7ns ± 0% 96.3ns ± 4% ~ BM_eigen_sqrt_float/4k 774ns ± 0% 763ns ± 0% -1.50% BM_eigen_sqrt_float/32k 6.58µs ± 2% 6.11µs ± 0% -7.06% BM_eigen_sqrt_float/256k 82.7µs ± 1% 82.6µs ± 1% ~ BM_eigen_sqrt_float/1M 330µs ± 1% 329µs ± 2% ~ SSE+FMA (double) BM_eigen_sqrt_double/1 1.63ns ± 0% 1.63ns ± 0% ~ BM_eigen_sqrt_double/8 6.51ns ± 0% 6.08ns ± 0% -6.68% BM_eigen_sqrt_double/64 52.1ns ± 0% 46.5ns ± 1% -10.65% BM_eigen_sqrt_double/512 417ns ± 0% 374ns ± 1% -10.29% BM_eigen_sqrt_double/4k 3.33µs ± 0% 2.97µs ± 1% -11.00% BM_eigen_sqrt_double/32k 26.7µs ± 0% 23.7µs ± 0% -11.07% BM_eigen_sqrt_double/256k 213µs ± 0% 206µs ± 1% -3.31% BM_eigen_sqrt_double/1M 862µs ± 0% 870µs ± 2% +0.96% AVX+FMA (double) name old cpu/op new cpu/op delta BM_eigen_sqrt_double/1 1.63ns ± 0% 1.63ns ± 0% ~ BM_eigen_sqrt_double/8 6.51ns ± 0% 6.06ns ± 0% -6.95% BM_eigen_sqrt_double/64 52.1ns ± 0% 46.5ns ± 1% -10.80% BM_eigen_sqrt_double/512 417ns ± 0% 373ns ± 1% -10.59% BM_eigen_sqrt_double/4k 3.33µs ± 0% 2.97µs ± 1% -10.79% BM_eigen_sqrt_double/32k 26.7µs ± 0% 23.8µs ± 0% -10.94% BM_eigen_sqrt_double/256k 214µs ± 0% 208µs ± 2% -2.76% BM_eigen_sqrt_double/1M 866µs ± 3% 923µs ± 7% ~
author: Rasmus Munk Larsen <rmlarsen@google.com> 2020-12-16 18:16:11 +0000
committer: Rasmus Munk Larsen <rmlarsen@google.com> 2020-12-16 18:16:11 +0000
commit: 05754100fecf00e13b2a5799e31570a980e4dd72 (patch)
tree: 682f1c03d41ac384279ac90d7ee1f847cb804330 /Eigen/src/Core/arch/AVX
parent: 3bee9422d6578e551689a941ccd5faeb83e61489 (diff)
1 files changed, 13 insertions, 10 deletions
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index 5fe2cff32..c0d362fa7 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -97,33 +97,36 @@ pexp<Packet4d>(const Packet4d& _x) {
 // For detail see here: http://www.beyond3d.com/content/articles/8/
 #if EIGEN_FAST_MATH
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-psqrt<Packet8f>(const Packet8f& _x) {
-  Packet8f half = pmul(_x, pset1<Packet8f>(.5f));
-  Packet8f denormal_mask = _mm256_and_ps(
-      _mm256_cmp_ps(_x, pset1<Packet8f>((std::numeric_limits<float>::min)()),
-                    _CMP_LT_OQ),
-      _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_GE_OQ));
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f psqrt<Packet8f>(const Packet8f& _x) {
+  Packet8f minus_half_x = pmul(_x, pset1<Packet8f>(-0.5f));
+  Packet8f denormal_mask = pandnot(
+      pcmp_lt(_x, pset1<Packet8f>((std::numeric_limits<float>::min)())),
+      pcmp_lt(_x, pzero(_x)));
 
   // Compute approximate reciprocal sqrt.
   Packet8f x = _mm256_rsqrt_ps(_x);
   // Do a single step of Newton's iteration.
-  x = pmul(x, psub(pset1<Packet8f>(1.5f), pmul(half, pmul(x,x))));
+  x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1<Packet8f>(1.5f)));
   // Flush results for denormals to zero.
-  return _mm256_andnot_ps(denormal_mask, pmul(_x,x));
+  return pandnot(pmul(_x,x), denormal_mask);
 }
+
 #else
+
 template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet8f psqrt<Packet8f>(const Packet8f& _x) {
   return _mm256_sqrt_ps(_x);
 }
+
 #endif
+
 template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4d psqrt<Packet4d>(const Packet4d& _x) {
   return _mm256_sqrt_pd(_x);
 }
-#if EIGEN_FAST_MATH
 
+#if EIGEN_FAST_MATH
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
   _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
author	Rasmus Munk Larsen <rmlarsen@google.com>	2020-12-16 18:16:11 +0000
committer	Rasmus Munk Larsen <rmlarsen@google.com>	2020-12-16 18:16:11 +0000
commit	05754100fecf00e13b2a5799e31570a980e4dd72 (patch)
tree	682f1c03d41ac384279ac90d7ee1f847cb804330 /Eigen/src/Core/arch/AVX
parent	3bee9422d6578e551689a941ccd5faeb83e61489 (diff)