diff options
author | commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2013-11-08 20:14:16 +0000 |
---|---|---|
committer | commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2013-11-08 20:14:16 +0000 |
commit | 11e5b972a984c7b4e09ba4dfeacc7bd805107c5a (patch) | |
tree | 8b819ae42f35273010e987ce02363e381e6d2466 /include/core | |
parent | 8f457e3230f1a4ce737f512ffbb5c919b8d02407 (diff) |
Add sk_float_rsqrt with SSE + NEON fast paths.
Current numbers:
N4:
running bench [640 480] math_fastIsqrt NONRENDERING: cmsecs = 3.12
running bench [640 480] math_slowIsqrt NONRENDERING: cmsecs = 4.82
running bench [640 480] math_sk_float_rsqrt NONRENDERING: cmsecs = 1.99
Desktop:
running bench [640 480] math_fastIsqrt NONRENDERING: cmsecs = 0.89
running bench [640 480] math_slowIsqrt NONRENDERING: cmsecs = 0.94
running bench [640 480] math_sk_float_rsqrt NONRENDERING: cmsecs = 0.09
Haven't found any other benches where this has a significant effect yet.
BUG=
R=reed@google.com
Author: mtklein@google.com
Review URL: https://codereview.chromium.org/60083014
git-svn-id: http://skia.googlecode.com/svn/trunk@12203 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'include/core')
-rw-r--r-- | include/core/SkFloatingPoint.h | 41 | ||||
-rw-r--r-- | include/core/SkPoint.h | 26 |
2 files changed, 53 insertions, 14 deletions
diff --git a/include/core/SkFloatingPoint.h b/include/core/SkFloatingPoint.h index 44a3eef98d..7dfa9d8680 100644 --- a/include/core/SkFloatingPoint.h +++ b/include/core/SkFloatingPoint.h @@ -96,4 +96,45 @@ extern const uint32_t gIEEENegativeInfinity; #define SK_FloatNaN (*SkTCast<const float*>(&gIEEENotANumber)) #define SK_FloatInfinity (*SkTCast<const float*>(&gIEEEInfinity)) #define SK_FloatNegativeInfinity (*SkTCast<const float*>(&gIEEENegativeInfinity)) + +#if defined(__SSE__) +#include <xmmintrin.h> +#elif defined(__ARM_NEON__) +#include <arm_neon.h> +#endif + +// Fast, approximate inverse square root. +// Compare to name-brand "1.0f / sk_float_sqrt(x)". Should be around 10x faster on SSE, 2x on NEON. +static inline float sk_float_rsqrt(const float x) { +// We want all this inlined, so we'll inline SIMD and just take the hit when we don't know we've got +// it at compile time. This is going to be too fast to productively hide behind a function pointer. +// +// We do one step of Newton's method to refine the estimates in the NEON and null paths. No +// refinement is faster, but very inaccurate. Two steps is more accurate, but slower than 1/sqrt. +#if defined(__SSE__) + float result; + _mm_store_ss(&result, _mm_rsqrt_ss(_mm_set_ss(x))); + return result; +#elif defined(__ARM_NEON__) + // Get initial estimate. + const float32x2_t xx = vdup_n_f32(x); // Clever readers will note we're doing everything 2x. + float32x2_t estimate = vrsqrte_f32(xx); + + // One step of Newton's method to refine. + const float32x2_t estimate_sq = vmul_f32(estimate, estimate); + estimate = vmul_f32(estimate, vrsqrts_f32(xx, estimate_sq)); + return vget_lane_f32(estimate, 0); // 1 will work fine too; the answer's in both places. +#else + // Get initial estimate. + int i = *SkTCast<int*>(&x); + i = 0x5f3759df - (i>>1); + float estimate = *SkTCast<float*>(&i); + + // One step of Newton's method to refine. 
+ const float estimate_sq = estimate*estimate; + estimate *= (1.5f-0.5f*x*estimate_sq); + return estimate; +#endif +} + #endif diff --git a/include/core/SkPoint.h b/include/core/SkPoint.h index b94f730ec2..caf26507ff 100644 --- a/include/core/SkPoint.h +++ b/include/core/SkPoint.h @@ -216,13 +216,10 @@ struct SK_API SkPoint { * Return true if the computed length of the vector is >= the internal * tolerance (used to avoid dividing by tiny values). */ - static bool CanNormalize(SkScalar dx, SkScalar dy) -#ifdef SK_SCALAR_IS_FLOAT - // Simple enough (and performance critical sometimes) so we inline it. - { return (dx*dx + dy*dy) > (SK_ScalarNearlyZero * SK_ScalarNearlyZero); } -#else - ; -#endif + static bool CanNormalize(SkScalar dx, SkScalar dy) { + // Simple enough (and performance critical sometimes) so we inline it. + return (dx*dx + dy*dy) > (SK_ScalarNearlyZero * SK_ScalarNearlyZero); + } bool canNormalize() const { return CanNormalize(fX, fY); @@ -252,6 +249,14 @@ struct SK_API SkPoint { */ bool setLength(SkScalar x, SkScalar y, SkScalar length); + /** Same as setLength, but favoring speed over accuracy. + */ + bool setLengthFast(SkScalar length); + + /** Same as setLength, but favoring speed over accuracy. + */ + bool setLengthFast(SkScalar x, SkScalar y, SkScalar length); + /** Scale the point's coordinates by scale, writing the answer into dst. It is legal for dst == this. */ @@ -316,7 +321,6 @@ struct SK_API SkPoint { * Returns true if both X and Y are finite (not infinity or NaN) */ bool isFinite() const { -#ifdef SK_SCALAR_IS_FLOAT SkScalar accum = 0; accum *= fX; accum *= fY; @@ -327,12 +331,6 @@ struct SK_API SkPoint { // value==value will be true iff value is not NaN // TODO: is it faster to say !accum or accum==accum? return accum == accum; -#else - // use bit-or for speed, since we don't care about short-circuting the - // tests, and we expect the common case will be that we need to check all. 
- int isNaN = (SK_FixedNaN == fX) | (SK_FixedNaN == fX)); - return !isNaN; -#endif } /** |