Add sk_float_rsqrt with SSE + NEON fast paths.

Current numbers:

N4:
running bench [640 480] math_fastIsqrt NONRENDERING: cmsecs = 3.12
running bench [640 480] math_slowIsqrt NONRENDERING: cmsecs = 4.82
running bench [640 480] math_sk_float_rsqrt NONRENDERING: cmsecs = 1.99

Desktop:
running bench [640 480] math_fastIsqrt NONRENDERING: cmsecs = 0.89
running bench [640 480] math_slowIsqrt NONRENDERING: cmsecs = 0.94
running bench [640 480] math_sk_float_rsqrt NONRENDERING: cmsecs = 0.09

Haven't found any other benches where this is a significant effect yet.

BUG=
R=reed@google.com
Author: mtklein@google.com

Review URL: https://codereview.chromium.org/60083014

git-svn-id: http://skia.googlecode.com/svn/trunk@12203 2bbb7eff-a529-9590-31e7-b0007b416f81
author: commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> 2013-11-08 20:14:16 +0000
committer: commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> 2013-11-08 20:14:16 +0000
commit: 11e5b972a984c7b4e09ba4dfeacc7bd805107c5a (patch)
tree: 8b819ae42f35273010e987ce02363e381e6d2466 /include/core
parent: 8f457e3230f1a4ce737f512ffbb5c919b8d02407 (diff)
2 files changed, 53 insertions, 14 deletions
diff --git a/include/core/SkFloatingPoint.h b/include/core/SkFloatingPoint.h
index 44a3eef98d..7dfa9d8680 100644
--- a/include/core/SkFloatingPoint.h
+++ b/include/core/SkFloatingPoint.h
@@ -96,4 +96,45 @@ extern const uint32_t gIEEENegativeInfinity;
 #define SK_FloatNaN                 (*SkTCast<const float*>(&gIEEENotANumber))
 #define SK_FloatInfinity            (*SkTCast<const float*>(&gIEEEInfinity))
 #define SK_FloatNegativeInfinity    (*SkTCast<const float*>(&gIEEENegativeInfinity))
+
+#if defined(__SSE__)
+#include <xmmintrin.h>
+#elif defined(__ARM_NEON__)
+#include <arm_neon.h>
+#endif
+
+// Fast, approximate inverse square root: returns an estimate of 1/sqrt(x).
+// Compare to name-brand "1.0f / sk_float_sqrt(x)".  Should be around 10x faster on SSE, 2x on NEON.
+static inline float sk_float_rsqrt(const float x) {
+// We want all this inlined, so we'll inline SIMD and just take the hit when we don't know we've got
+// it at compile time.  This is going to be too fast to productively hide behind a function pointer.
+//
+// We do one step of Newton's method to refine the estimates in the NEON and null paths (the SSE
+// path returns its estimate as-is).  No refinement is faster, but very inaccurate.  Two steps is
+// more accurate, but slower than 1/sqrt.
+#if defined(__SSE__)
+    float result;
+    _mm_store_ss(&result, _mm_rsqrt_ss(_mm_set_ss(x)));  // Scalar estimate, no Newton step here.
+    return result;
+#elif defined(__ARM_NEON__)
+    // Get initial estimate.
+    const float32x2_t xx = vdup_n_f32(x);  // x is copied into both lanes, so all work is done 2x.
+    float32x2_t estimate = vrsqrte_f32(xx);
+
+    // One step of Newton's method to refine.
+    const float32x2_t estimate_sq = vmul_f32(estimate, estimate);
+    estimate = vmul_f32(estimate, vrsqrts_f32(xx, estimate_sq));
+    return vget_lane_f32(estimate, 0);  // Lane 1 would work too; both lanes hold the same answer.
+#else
+    // Get initial estimate.
+    int i = *SkTCast<int*>(&x);  // View the bits of x through an int pointer.
+    i = 0x5f3759df - (i>>1);  // Integer trick on those bits yields a rough first guess.
+    float estimate = *SkTCast<float*>(&i);  // View the adjusted bits as a float again.
+
+    // One step of Newton's method to refine.
+    const float estimate_sq = estimate*estimate;
+    estimate *= (1.5f-0.5f*x*estimate_sq);  // estimate = estimate * (1.5 - 0.5 * x * estimate^2)
+    return estimate;
+#endif
+}
+
 #endif
diff --git a/include/core/SkPoint.h b/include/core/SkPoint.h
index b94f730ec2..caf26507ff 100644
--- a/include/core/SkPoint.h
+++ b/include/core/SkPoint.h
@@ -216,13 +216,10 @@ struct SK_API SkPoint {
      *  Return true if the computed length of the vector is >= the internal
      *  tolerance (used to avoid dividing by tiny values).
      */
-    static bool CanNormalize(SkScalar dx, SkScalar dy)
-#ifdef SK_SCALAR_IS_FLOAT
-    // Simple enough (and performance critical sometimes) so we inline it.
-    { return (dx*dx + dy*dy) > (SK_ScalarNearlyZero * SK_ScalarNearlyZero); }
-#else
-    ;
-#endif
+    static bool CanNormalize(SkScalar dx, SkScalar dy) {
+        // Simple enough (and performance critical sometimes) so we inline it.
+        return (dx*dx + dy*dy) > (SK_ScalarNearlyZero * SK_ScalarNearlyZero);
+    }
 
     bool canNormalize() const {
         return CanNormalize(fX, fY);
@@ -252,6 +249,14 @@ struct SK_API SkPoint {
     */
     bool setLength(SkScalar x, SkScalar y, SkScalar length);
 
+    /** Same as setLength, but favoring speed over accuracy.
+    */
+    bool setLengthFast(SkScalar length);
+
+    /** Same as setLength, but favoring speed over accuracy.
+    */
+    bool setLengthFast(SkScalar x, SkScalar y, SkScalar length);
+
     /** Scale the point's coordinates by scale, writing the answer into dst.
         It is legal for dst == this.
     */
@@ -316,7 +321,6 @@ struct SK_API SkPoint {
      *  Returns true if both X and Y are finite (not infinity or NaN)
      */
     bool isFinite() const {
-#ifdef SK_SCALAR_IS_FLOAT
         SkScalar accum = 0;
         accum *= fX;
         accum *= fY;
@@ -327,12 +331,6 @@ struct SK_API SkPoint {
         // value==value will be true iff value is not NaN
         // TODO: is it faster to say !accum or accum==accum?
         return accum == accum;
-#else
-        // use bit-or for speed, since we don't care about short-circuting the
-        // tests, and we expect the common case will be that we need to check all.
-        int isNaN = (SK_FixedNaN == fX) | (SK_FixedNaN == fX));
-        return !isNaN;
-#endif
     }
 
     /**
author	commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>	2013-11-08 20:14:16 +0000
committer	commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>	2013-11-08 20:14:16 +0000
commit	11e5b972a984c7b4e09ba4dfeacc7bd805107c5a (patch)
tree	8b819ae42f35273010e987ce02363e381e6d2466 /include/core
parent	8f457e3230f1a4ce737f512ffbb5c919b8d02407 (diff)