aboutsummaryrefslogtreecommitdiffhomepage
path: root/include/core/SkFloatingPoint.h
diff options
context:
space:
mode:
authorGravatar commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>2013-11-08 20:14:16 +0000
committerGravatar commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>2013-11-08 20:14:16 +0000
commit11e5b972a984c7b4e09ba4dfeacc7bd805107c5a (patch)
tree8b819ae42f35273010e987ce02363e381e6d2466 /include/core/SkFloatingPoint.h
parent8f457e3230f1a4ce737f512ffbb5c919b8d02407 (diff)
Add sk_float_rsqrt with SSE + NEON fast paths.
Current numbers: N4: running bench [640 480] math_fastIsqrt NONRENDERING: cmsecs = 3.12 running bench [640 480] math_slowIsqrt NONRENDERING: cmsecs = 4.82 running bench [640 480] math_sk_float_rsqrt NONRENDERING: cmsecs = 1.99 Desktop: running bench [640 480] math_fastIsqrt NONRENDERING: cmsecs = 0.89 running bench [640 480] math_slowIsqrt NONRENDERING: cmsecs = 0.94 running bench [640 480] math_sk_float_rsqrt NONRENDERING: cmsecs = 0.09 Haven't found any other benches where this is a significant effect yet. BUG= R=reed@google.com Author: mtklein@google.com Review URL: https://codereview.chromium.org/60083014 git-svn-id: http://skia.googlecode.com/svn/trunk@12203 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'include/core/SkFloatingPoint.h')
-rw-r--r--include/core/SkFloatingPoint.h41
1 files changed, 41 insertions, 0 deletions
diff --git a/include/core/SkFloatingPoint.h b/include/core/SkFloatingPoint.h
index 44a3eef98d..7dfa9d8680 100644
--- a/include/core/SkFloatingPoint.h
+++ b/include/core/SkFloatingPoint.h
@@ -96,4 +96,45 @@ extern const uint32_t gIEEENegativeInfinity;
#define SK_FloatNaN (*SkTCast<const float*>(&gIEEENotANumber))
#define SK_FloatInfinity (*SkTCast<const float*>(&gIEEEInfinity))
#define SK_FloatNegativeInfinity (*SkTCast<const float*>(&gIEEENegativeInfinity))
+
+#if defined(__SSE__)
+#include <xmmintrin.h>
+#elif defined(__ARM_NEON__)
+#include <arm_neon.h>
+#endif
+
+// Fast, approximate inverse square root.
+// Compare to name-brand "1.0f / sk_float_sqrt(x)". Should be around 10x faster on SSE, 2x on NEON.
static inline float sk_float_rsqrt(const float x) {
// Returns an approximation of 1.0f / sqrt(x).
//
// We want all this inlined, so we'll inline SIMD and just take the hit when we don't know we've got
// it at compile time. This is going to be too fast to productively hide behind a function pointer.
//
// We do one step of Newton's method to refine the estimates in the NEON and portable paths; the
// SSE path returns the hardware estimate as-is. No refinement is faster, but very inaccurate.
// Two steps is more accurate, but slower than 1/sqrt.
#if defined(__SSE__)
    // Scalar reciprocal-square-root estimate on the low lane; store it back out to a float.
    float result;
    _mm_store_ss(&result, _mm_rsqrt_ss(_mm_set_ss(x)));
    return result;
#elif defined(__ARM_NEON__)
    // Get initial estimate.
    const float32x2_t xx = vdup_n_f32(x);  // Clever readers will note we're doing everything 2x.
    float32x2_t estimate = vrsqrte_f32(xx);

    // One step of Newton's method to refine.
    const float32x2_t estimate_sq = vmul_f32(estimate, estimate);
    estimate = vmul_f32(estimate, vrsqrts_f32(xx, estimate_sq));
    return vget_lane_f32(estimate, 0);  // 1 will work fine too; the answer's in both places.
#else
    // Reinterpret the float's bits through a union instead of dereferencing a float through an
    // int pointer. Union punning is defined in C and supported by the major C++ compilers.
    union {
        float    f;
        int32_t  s;
        uint32_t u;
    } pun;
    pun.f = x;

    // Get initial estimate with the classic magic-constant bit trick. The shift stays signed so
    // the bit pattern matches what the arithmetic-shift version produced, but the subtraction is
    // done in unsigned arithmetic: in signed int it overflows (undefined behaviour) for -0.0f
    // and small-magnitude negative inputs. Negative inputs still yield a meaningless value.
    pun.u = 0x5f3759dfu - (uint32_t)(pun.s >> 1);
    float estimate = pun.f;

    // One step of Newton's method to refine.
    const float estimate_sq = estimate*estimate;
    estimate *= (1.5f-0.5f*x*estimate_sq);
    return estimate;
#endif
}
+
#endif