diff options
author | commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2013-11-08 20:14:16 +0000 |
---|---|---|
committer | commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2013-11-08 20:14:16 +0000 |
commit | 11e5b972a984c7b4e09ba4dfeacc7bd805107c5a (patch) | |
tree | 8b819ae42f35273010e987ce02363e381e6d2466 /include/core/SkFloatingPoint.h | |
parent | 8f457e3230f1a4ce737f512ffbb5c919b8d02407 (diff) |
Add sk_float_rsqrt with SSE + NEON fast paths.
Current numbers:
N4:
running bench [640 480] math_fastIsqrt NONRENDERING: cmsecs = 3.12
running bench [640 480] math_slowIsqrt NONRENDERING: cmsecs = 4.82
running bench [640 480] math_sk_float_rsqrt NONRENDERING: cmsecs = 1.99
Desktop:
running bench [640 480] math_fastIsqrt NONRENDERING: cmsecs = 0.89
running bench [640 480] math_slowIsqrt NONRENDERING: cmsecs = 0.94
running bench [640 480] math_sk_float_rsqrt NONRENDERING: cmsecs = 0.09
Haven't found any other benches where this is a significant effect yet.
BUG=
R=reed@google.com
Author: mtklein@google.com
Review URL: https://codereview.chromium.org/60083014
git-svn-id: http://skia.googlecode.com/svn/trunk@12203 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'include/core/SkFloatingPoint.h')
-rw-r--r-- | include/core/SkFloatingPoint.h | 41 |
1 files changed, 41 insertions, 0 deletions
// diff --git a/include/core/SkFloatingPoint.h b/include/core/SkFloatingPoint.h
// index 44a3eef98d..7dfa9d8680 100644
// --- a/include/core/SkFloatingPoint.h
// +++ b/include/core/SkFloatingPoint.h
// @@ -96,4 +96,45 @@ extern const uint32_t gIEEENegativeInfinity;
//
// Post-patch text of the hunk (reviewed version) follows.

#define SK_FloatNaN                 (*SkTCast<const float*>(&gIEEENotANumber))
#define SK_FloatInfinity            (*SkTCast<const float*>(&gIEEEInfinity))
#define SK_FloatNegativeInfinity    (*SkTCast<const float*>(&gIEEENegativeInfinity))

// Pick the SIMD header that matches the path sk_float_rsqrt() will take below.
// __ARM_NEON is the ACLE spelling; __ARM_NEON__ is the legacy one. Accept both so
// toolchains that define only one of them still get the NEON path.
#if defined(__SSE__)
#include <xmmintrin.h>
#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#else
#include <string.h>  // memcpy, used for the portable bit-cast in the fallback path.
#endif

// Fast, approximate inverse square root: returns roughly 1/sqrt(x).
// Compare to name-brand "1.0f / sk_float_sqrt(x)".  Should be around 10x faster on SSE, 2x on NEON.
//
// x is expected to be positive and finite.  The portable fallback does not produce a
// meaningful value for x <= 0; the SIMD paths return whatever the hardware estimate
// instruction yields for such inputs.
//
// The result is an approximation, and its accuracy differs between the three paths, so
// callers must not rely on bit-exact results across platforms.
static inline float sk_float_rsqrt(const float x) {
// We want all this inlined, so we'll inline SIMD and just take the hit when we don't know we've got
// it at compile time.  This is going to be too fast to productively hide behind a function pointer.
//
// The SSE path uses the hardware estimate as-is.  The NEON and portable paths do one step of
// Newton's method to refine their estimates: no refinement is faster, but very inaccurate; two
// steps is more accurate, but slower than 1/sqrt.
#if defined(__SSE__)
    // Hardware estimate on the low lane only; no refinement step.
    float result;
    _mm_store_ss(&result, _mm_rsqrt_ss(_mm_set_ss(x)));
    return result;
#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
    // Get initial estimate.
    const float32x2_t xx = vdup_n_f32(x);  // Clever readers will note we're doing everything 2x.
    float32x2_t estimate = vrsqrte_f32(xx);

    // One step of Newton's method to refine: vrsqrts_f32(a, b) supplies the (3 - a*b) / 2 factor.
    const float32x2_t estimate_sq = vmul_f32(estimate, estimate);
    estimate = vmul_f32(estimate, vrsqrts_f32(xx, estimate_sq));
    return vget_lane_f32(estimate, 0);  // 1 will work fine too; the answer's in both places.
#else
    // Get initial estimate with the classic exponent-halving bit trick.  This assumes float is a
    // 32-bit IEEE-754 value.  The bits are moved with memcpy rather than through a casted pointer
    // so the reinterpretation is well defined, and the arithmetic is unsigned so inputs with the
    // sign bit set cannot trigger signed overflow.
    uint32_t bits;
    memcpy(&bits, &x, sizeof(bits));
    bits = 0x5f3759dfu - (bits >> 1);
    float estimate;
    memcpy(&estimate, &bits, sizeof(estimate));

    // One step of Newton's method to refine.
    const float estimate_sq = estimate*estimate;
    estimate *= (1.5f-0.5f*x*estimate_sq);
    return estimate;
#endif
}

// #endif  (diff context only: closes the header's include guard, which opens above this hunk)