Add sk_float_rsqrt with SSE + NEON fast paths.

Current numbers:

N4:
  running bench [640 480] math_fastIsqrt NONRENDERING: cmsecs = 3.12
  running bench [640 480] math_slowIsqrt NONRENDERING: cmsecs = 4.82
  running bench [640 480] math_sk_float_rsqrt NONRENDERING: cmsecs = 1.99

Desktop:
  running bench [640 480] math_fastIsqrt NONRENDERING: cmsecs = 0.89
  running bench [640 480] math_slowIsqrt NONRENDERING: cmsecs = 0.94
  running bench [640 480] math_sk_float_rsqrt NONRENDERING: cmsecs = 0.09

Haven't found any other benches where this is a significant effect yet.

BUG=
R=reed@google.com
Author: mtklein@google.com

Review URL: https://codereview.chromium.org/60083014

git-svn-id: http://skia.googlecode.com/svn/trunk@12203 2bbb7eff-a529-9590-31e7-b0007b416f81
author: commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> 2013-11-08 20:14:16 +0000
committer: commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> 2013-11-08 20:14:16 +0000
commit: 11e5b972a984c7b4e09ba4dfeacc7bd805107c5a (patch)
tree: 8b819ae42f35273010e987ce02363e381e6d2466 /include/core/SkFloatingPoint.h
parent: 8f457e3230f1a4ce737f512ffbb5c919b8d02407 (diff)
1 files changed, 41 insertions, 0 deletions
diff --git a/include/core/SkFloatingPoint.h b/include/core/SkFloatingPoint.h
index 44a3eef98d..7dfa9d8680 100644
--- a/include/core/SkFloatingPoint.h
+++ b/include/core/SkFloatingPoint.h
@@ -96,4 +96,45 @@ extern const uint32_t gIEEENegativeInfinity;
 #define SK_FloatNaN                 (*SkTCast<const float*>(&gIEEENotANumber))
 #define SK_FloatInfinity            (*SkTCast<const float*>(&gIEEEInfinity))
 #define SK_FloatNegativeInfinity    (*SkTCast<const float*>(&gIEEENegativeInfinity))
+
+#if defined(__SSE__)
+#include <xmmintrin.h>
+#elif defined(__ARM_NEON__)
+#include <arm_neon.h>
+#endif
+
+// Fast, approximate inverse square root: returns an estimate of 1/sqrt(x).
+// Compare to name-brand "1.0f / sk_float_sqrt(x)".  Should be around 10x faster on SSE, 2x on NEON.
+static inline float sk_float_rsqrt(const float x) {
+// We want all of this inlined, so the SIMD path is picked at compile time and we just take the hit
+// when we can't tell at compile time.  This is too fast to productively hide behind a function pointer.
+//
+// The NEON and fallback (non-SIMD) paths refine their estimate with one step of Newton's method.  No
+// refinement is faster, but very inaccurate.  Two steps is more accurate, but slower than 1/sqrt.
+#if defined(__SSE__)
+    float result;
+    _mm_store_ss(&result, _mm_rsqrt_ss(_mm_set_ss(x)));  // Raw estimate; no refinement step here.
+    return result;
+#elif defined(__ARM_NEON__)
+    // Get initial estimate.
+    const float32x2_t xx = vdup_n_f32(x);  // Both lanes hold x, so everything below is done 2x.
+    float32x2_t estimate = vrsqrte_f32(xx);
+
+    // One step of Newton's method to refine.
+    const float32x2_t estimate_sq = vmul_f32(estimate, estimate);
+    estimate = vmul_f32(estimate, vrsqrts_f32(xx, estimate_sq));
+    return vget_lane_f32(estimate, 0);  // Lane 1 would work too; both lanes hold the same answer.
+#else
+    // Get initial estimate by manipulating the bits of x as an integer.
+    int i = *SkTCast<int*>(&x);  // NOTE(review): pun depends on SkTCast (not shown here) -- confirm.
+    i = 0x5f3759df - (i>>1);
+    float estimate = *SkTCast<float*>(&i);
+
+    // One step of Newton's method to refine: estimate *= 1.5 - 0.5*x*estimate^2.
+    const float estimate_sq = estimate*estimate;
+    estimate *= (1.5f-0.5f*x*estimate_sq);
+    return estimate;
+#endif
+}
+
 #endif
author	commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>	2013-11-08 20:14:16 +0000
committer	commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>	2013-11-08 20:14:16 +0000
commit	11e5b972a984c7b4e09ba4dfeacc7bd805107c5a (patch)
tree	8b819ae42f35273010e987ce02363e381e6d2466 /include/core/SkFloatingPoint.h
parent	8f457e3230f1a4ce737f512ffbb5c919b8d02407 (diff)