diff options
-rw-r--r-- | bench/MathBench.cpp | 17 | ||||
-rw-r--r-- | include/core/SkFloatingPoint.h | 41 | ||||
-rw-r--r-- | include/core/SkPoint.h | 26 | ||||
-rw-r--r-- | src/core/SkPoint.cpp | 294 | ||||
-rw-r--r-- | tests/PointTest.cpp | 24 |
5 files changed, 108 insertions, 294 deletions
diff --git a/bench/MathBench.cpp b/bench/MathBench.cpp index abe04e13d7..6327c3c580 100644 --- a/bench/MathBench.cpp +++ b/bench/MathBench.cpp @@ -92,6 +92,22 @@ private: typedef MathBench INHERITED; }; +class SkRSqrtMathBench : public MathBench { +public: + SkRSqrtMathBench() : INHERITED("sk_float_rsqrt") {} +protected: + virtual void performTest(float* SK_RESTRICT dst, + const float* SK_RESTRICT src, + int count) { + for (int i = 0; i < count; ++i) { + dst[i] = sk_float_rsqrt(src[i]); + } + } +private: + typedef MathBench INHERITED; +}; + + class SlowISqrtMathBench : public MathBench { public: SlowISqrtMathBench() : INHERITED("slowIsqrt") {} @@ -550,6 +566,7 @@ DEF_BENCH(return new DivModBench<int64_t>("int64_t")) /////////////////////////////////////////////////////////////////////////////// DEF_BENCH( return new NoOpMathBench(); ) +DEF_BENCH( return new SkRSqrtMathBench(); ) DEF_BENCH( return new SlowISqrtMathBench(); ) DEF_BENCH( return new FastISqrtMathBench(); ) DEF_BENCH( return new QMul64Bench(); ) diff --git a/include/core/SkFloatingPoint.h b/include/core/SkFloatingPoint.h index 44a3eef98d..7dfa9d8680 100644 --- a/include/core/SkFloatingPoint.h +++ b/include/core/SkFloatingPoint.h @@ -96,4 +96,45 @@ extern const uint32_t gIEEENegativeInfinity; #define SK_FloatNaN (*SkTCast<const float*>(&gIEEENotANumber)) #define SK_FloatInfinity (*SkTCast<const float*>(&gIEEEInfinity)) #define SK_FloatNegativeInfinity (*SkTCast<const float*>(&gIEEENegativeInfinity)) + +#if defined(__SSE__) +#include <xmmintrin.h> +#elif defined(__ARM_NEON__) +#include <arm_neon.h> +#endif + +// Fast, approximate inverse square root. +// Compare to name-brand "1.0f / sk_float_sqrt(x)". Should be around 10x faster on SSE, 2x on NEON. +static inline float sk_float_rsqrt(const float x) { +// We want all this inlined, so we'll inline SIMD and just take the hit when we don't know we've got +// it at compile time. This is going to be too fast to productively hide behind a function pointer. +// +// We do one step of Newton's method to refine the estimates in the NEON and null paths. No +// refinement is faster, but very innacurate. Two steps is more accurate, but slower than 1/sqrt. +#if defined(__SSE__) + float result; + _mm_store_ss(&result, _mm_rsqrt_ss(_mm_set_ss(x))); + return result; +#elif defined(__ARM_NEON__) + // Get initial estimate. + const float32x2_t xx = vdup_n_f32(x); // Clever readers will note we're doing everything 2x. + float32x2_t estimate = vrsqrte_f32(xx); + + // One step of Newton's method to refine. + const float32x2_t estimate_sq = vmul_f32(estimate, estimate); + estimate = vmul_f32(estimate, vrsqrts_f32(xx, estimate_sq)); + return vget_lane_f32(estimate, 0); // 1 will work fine too; the answer's in both places. +#else + // Get initial estimate. + int i = *SkTCast<int*>(&x); + i = 0x5f3759df - (i>>1); + float estimate = *SkTCast<float*>(&i); + + // One step of Newton's method to refine. + const float estimate_sq = estimate*estimate; + estimate *= (1.5f-0.5f*x*estimate_sq); + return estimate; +#endif +} + #endif diff --git a/include/core/SkPoint.h b/include/core/SkPoint.h index b94f730ec2..caf26507ff 100644 --- a/include/core/SkPoint.h +++ b/include/core/SkPoint.h @@ -216,13 +216,10 @@ struct SK_API SkPoint { * Return true if the computed length of the vector is >= the internal * tolerance (used to avoid dividing by tiny values). */ - static bool CanNormalize(SkScalar dx, SkScalar dy) -#ifdef SK_SCALAR_IS_FLOAT - // Simple enough (and performance critical sometimes) so we inline it. - { return (dx*dx + dy*dy) > (SK_ScalarNearlyZero * SK_ScalarNearlyZero); } -#else - ; -#endif + static bool CanNormalize(SkScalar dx, SkScalar dy) { + // Simple enough (and performance critical sometimes) so we inline it. + return (dx*dx + dy*dy) > (SK_ScalarNearlyZero * SK_ScalarNearlyZero); + } bool canNormalize() const { return CanNormalize(fX, fY); @@ -252,6 +249,14 @@ struct SK_API SkPoint { */ bool setLength(SkScalar x, SkScalar y, SkScalar length); + /** Same as setLength, but favoring speed over accuracy. + */ + bool setLengthFast(SkScalar length); + + /** Same as setLength, but favoring speed over accuracy. + */ + bool setLengthFast(SkScalar x, SkScalar y, SkScalar length); + /** Scale the point's coordinates by scale, writing the answer into dst. It is legal for dst == this. */ @@ -316,7 +321,6 @@ struct SK_API SkPoint { * Returns true if both X and Y are finite (not infinity or NaN) */ bool isFinite() const { -#ifdef SK_SCALAR_IS_FLOAT SkScalar accum = 0; accum *= fX; accum *= fY; @@ -327,12 +331,6 @@ struct SK_API SkPoint { // value==value will be true iff value is not NaN // TODO: is it faster to say !accum or accum==accum? return accum == accum; -#else - // use bit-or for speed, since we don't care about short-circuting the - // tests, and we expect the common case will be that we need to check all. - int isNaN = (SK_FixedNaN == fX) | (SK_FixedNaN == fX)); - return !isNaN; -#endif } /** diff --git a/src/core/SkPoint.cpp b/src/core/SkPoint.cpp index bf3affaaf5..719ee54b22 100644 --- a/src/core/SkPoint.cpp +++ b/src/core/SkPoint.cpp @@ -87,8 +87,6 @@ bool SkPoint::setLength(SkScalar length) { return this->setLength(fX, fY, length); } -#ifdef SK_SCALAR_IS_FLOAT - // Returns the square of the Euclidian distance to (dx,dy). static inline float getLengthSquared(float dx, float dy) { return dx * dx + dy * dy; @@ -177,290 +175,32 @@ bool SkPoint::setLength(float x, float y, float length) { return true; } -#else - -#include "Sk64.h" - -// Returns the square of the Euclidian distance to (dx,dy) in *result. -static inline void getLengthSquared(SkScalar dx, SkScalar dy, Sk64 *result) { - Sk64 dySqr; - - result->setMul(dx, dx); - dySqr.setMul(dy, dy); - result->add(dySqr); -} - -// Calculates the square of the Euclidian distance to (dx,dy) and stores it in -// *lengthSquared. Returns true if the distance is judged to be "nearly zero". -// -// This logic is encapsulated in a helper method to make it explicit that we -// always perform this check in the same manner, to avoid inconsistencies -// (see http://code.google.com/p/skia/issues/detail?id=560 ). -static inline bool isLengthNearlyZero(SkScalar dx, SkScalar dy, - Sk64 *lengthSquared) { - Sk64 tolSqr; - getLengthSquared(dx, dy, lengthSquared); - - // we want nearlyzero^2, but to compute it fast we want to just do a - // 32bit multiply, so we require that it not exceed 31bits. That is true - // if nearlyzero is <= 0xB504, which should be trivial, since usually - // nearlyzero is a very small fixed-point value. - SkASSERT(SK_ScalarNearlyZero <= 0xB504); - - tolSqr.set(0, SK_ScalarNearlyZero * SK_ScalarNearlyZero); - return *lengthSquared <= tolSqr; -} - -SkScalar SkPoint::Normalize(SkPoint* pt) { - Sk64 mag2; - if (!isLengthNearlyZero(pt->fX, pt->fY, &mag2)) { - SkScalar mag = mag2.getSqrt(); - SkScalar scale = SkScalarInvert(mag); - pt->fX = SkScalarMul(pt->fX, scale); - pt->fY = SkScalarMul(pt->fY, scale); - return mag; - } - return 0; -} - -bool SkPoint::CanNormalize(SkScalar dx, SkScalar dy) { - Sk64 mag2_unused; - return !isLengthNearlyZero(dx, dy, &mag2_unused); -} - -SkScalar SkPoint::Length(SkScalar dx, SkScalar dy) { - Sk64 tmp; - getLengthSquared(dx, dy, &tmp); - return tmp.getSqrt(); -} - -#ifdef SK_DEBUGx -static SkFixed fixlen(SkFixed x, SkFixed y) { - float fx = (float)x; - float fy = (float)y; - - return (int)floorf(sqrtf(fx*fx + fy*fy) + 0.5f); -} -#endif - -static inline uint32_t squarefixed(unsigned x) { - x >>= 16; - return x*x; -} - -#if 1 // Newton iter for setLength - -static inline unsigned invsqrt_iter(unsigned V, unsigned U) { - unsigned x = V * U >> 14; - x = x * U >> 14; - x = (3 << 14) - x; - x = (U >> 1) * x >> 14; - return x; -} - -static const uint16_t gInvSqrt14GuessTable[] = { - 0x4000, 0x3c57, 0x393e, 0x3695, 0x3441, 0x3235, 0x3061, - 0x2ebd, 0x2d41, 0x2be7, 0x2aaa, 0x2987, 0x287a, 0x2780, - 0x2698, 0x25be, 0x24f3, 0x2434, 0x2380, 0x22d6, 0x2235, - 0x219d, 0x210c, 0x2083, 0x2000, 0x1f82, 0x1f0b, 0x1e99, - 0x1e2b, 0x1dc2, 0x1d5d, 0x1cfc, 0x1c9f, 0x1c45, 0x1bee, - 0x1b9b, 0x1b4a, 0x1afc, 0x1ab0, 0x1a67, 0x1a20, 0x19dc, - 0x1999, 0x1959, 0x191a, 0x18dd, 0x18a2, 0x1868, 0x1830, - 0x17fa, 0x17c4, 0x1791, 0x175e, 0x172d, 0x16fd, 0x16ce -}; - -#define BUILD_INVSQRT_TABLEx -#ifdef BUILD_INVSQRT_TABLE -static void build_invsqrt14_guess_table() { - for (int i = 8; i <= 63; i++) { - unsigned x = SkToU16((1 << 28) / SkSqrt32(i << 25)); - printf("0x%x, ", x); - } - printf("\n"); -} -#endif - -static unsigned fast_invsqrt(uint32_t x) { -#ifdef BUILD_INVSQRT_TABLE - unsigned top2 = x >> 25; - SkASSERT(top2 >= 8 && top2 <= 63); - - static bool gOnce; - if (!gOnce) { - build_invsqrt14_guess_table(); - gOnce = true; - } -#endif - - unsigned V = x >> 14; // make V .14 - - unsigned top = x >> 25; - SkASSERT(top >= 8 && top <= 63); - SkASSERT(top - 8 < SK_ARRAY_COUNT(gInvSqrt14GuessTable)); - unsigned U = gInvSqrt14GuessTable[top - 8]; - - U = invsqrt_iter(V, U); - return invsqrt_iter(V, U); +bool SkPoint::setLengthFast(float length) { + return this->setLengthFast(fX, fY, length); } -/* We "normalize" x,y to be .14 values (so we can square them and stay 32bits. - Then we Newton-iterate this in .14 space to compute the invser-sqrt, and - scale by it at the end. The .14 space means we can execute our iterations - and stay in 32bits as well, making the multiplies much cheaper than calling - SkFixedMul. -*/ -bool SkPoint::setLength(SkFixed ox, SkFixed oy, SkFixed length) { - if (ox == 0) { - if (oy == 0) { - return false; - } - this->set(0, SkApplySign(length, SkExtractSign(oy))); - return true; - } - if (oy == 0) { - this->set(SkApplySign(length, SkExtractSign(ox)), 0); - return true; +bool SkPoint::setLengthFast(float x, float y, float length) { + float mag2; + if (isLengthNearlyZero(x, y, &mag2)) { + return false; } - unsigned x = SkAbs32(ox); - unsigned y = SkAbs32(oy); - int zeros = SkCLZ(x | y); - - // make x,y 1.14 values so our fast sqr won't overflow - if (zeros > 17) { - x <<= zeros - 17; - y <<= zeros - 17; + float scale; + if (SkScalarIsFinite(mag2)) { + scale = length * sk_float_rsqrt(mag2); // <--- this is the difference } else { - x >>= 17 - zeros; - y >>= 17 - zeros; - } - SkASSERT((x | y) <= 0x7FFF); - - unsigned invrt = fast_invsqrt(x*x + y*y); - - x = x * invrt >> 12; - y = y * invrt >> 12; - - if (length != SK_Fixed1) { - x = SkFixedMul(x, length); - y = SkFixedMul(y, length); - } - this->set(SkApplySign(x, SkExtractSign(ox)), - SkApplySign(y, SkExtractSign(oy))); - return true; -} -#else -/* - Normalize x,y, and then scale them by length. - - The obvious way to do this would be the following: - S64 tmp1, tmp2; - tmp1.setMul(x,x); - tmp2.setMul(y,y); - tmp1.add(tmp2); - len = tmp1.getSqrt(); - x' = SkFixedDiv(x, len); - y' = SkFixedDiv(y, len); - This is fine, but slower than what we do below. - - The present technique does not compute the starting length, but - rather fiddles with x,y iteratively, all the while checking its - magnitude^2 (avoiding a sqrt). - - We normalize by first shifting x,y so that at least one of them - has bit 31 set (after taking the abs of them). - Then we loop, refining x,y by squaring them and comparing - against a very large 1.0 (1 << 28), and then adding or subtracting - a delta (which itself is reduced by half each time through the loop). - For speed we want the squaring to be with a simple integer mul. To keep - that from overflowing we shift our coordinates down until we are dealing - with at most 15 bits (2^15-1)^2 * 2 says withing 32 bits) - When our square is close to 1.0, we shift x,y down into fixed range. -*/ -bool SkPoint::setLength(SkFixed ox, SkFixed oy, SkFixed length) { - if (ox == 0) { - if (oy == 0) - return false; - this->set(0, SkApplySign(length, SkExtractSign(oy))); - return true; - } - if (oy == 0) { - this->set(SkApplySign(length, SkExtractSign(ox)), 0); - return true; - } - - SkFixed x = SkAbs32(ox); - SkFixed y = SkAbs32(oy); - - // shift x,y so that the greater of them is 15bits (1.14 fixed point) - { - int shift = SkCLZ(x | y); - // make them .30 - x <<= shift - 1; - y <<= shift - 1; - } - - SkFixed dx = x; - SkFixed dy = y; - - for (int i = 0; i < 17; i++) { - dx >>= 1; - dy >>= 1; - - U32 len2 = squarefixed(x) + squarefixed(y); - if (len2 >> 28) { - x -= dx; - y -= dy; - } else { - x += dx; - y += dy; - } - } - x >>= 14; - y >>= 14; - -#ifdef SK_DEBUGx // measure how far we are from unit-length - { - static int gMaxError; - static int gMaxDiff; - - SkFixed len = fixlen(x, y); - int err = len - SK_Fixed1; - err = SkAbs32(err); - - if (err > gMaxError) { - gMaxError = err; - SkDebugf("gMaxError %d\n", err); - } - - float fx = SkAbs32(ox)/65536.0f; - float fy = SkAbs32(oy)/65536.0f; - float mag = sqrtf(fx*fx + fy*fy); - fx /= mag; - fy /= mag; - SkFixed xx = (int)floorf(fx * 65536 + 0.5f); - SkFixed yy = (int)floorf(fy * 65536 + 0.5f); - err = SkMax32(SkAbs32(xx-x), SkAbs32(yy-y)); - if (err > gMaxDiff) { - gMaxDiff = err; - SkDebugf("gMaxDiff %d\n", err); - } - } -#endif - - x = SkApplySign(x, SkExtractSign(ox)); - y = SkApplySign(y, SkExtractSign(oy)); - if (length != SK_Fixed1) { - x = SkFixedMul(x, length); - y = SkFixedMul(y, length); + // our mag2 step overflowed to infinity, so use doubles instead. + // much slower, but needed when x or y are very large, other wise we + // divide by inf. and return (0,0) vector. + double xx = x; + double yy = y; + scale = (float)(length / sqrt(xx * xx + yy * yy)); } - - this->set(x, y); + fX = x * scale; + fY = y * scale; return true; } -#endif -#endif /////////////////////////////////////////////////////////////////////////////// diff --git a/tests/PointTest.cpp b/tests/PointTest.cpp index 1255a8c65d..9f91c47c1c 100644 --- a/tests/PointTest.cpp +++ b/tests/PointTest.cpp @@ -117,7 +117,8 @@ static void test_underflow(skiatest::Reporter* reporter) { REPORTER_ASSERT(reporter, pt == copy); // pt is unchanged } -static void PointTest(skiatest::Reporter* reporter) { +#include "TestClassDef.h" +DEF_TEST(Point, reporter) { test_casts(reporter); static const struct { @@ -137,5 +138,22 @@ static void PointTest(skiatest::Reporter* reporter) { test_overflow(reporter); } -#include "TestClassDef.h" -DEFINE_TESTCLASS("Point", PointTestClass, PointTest) +DEF_TEST(Point_setLengthFast, reporter) { + // Scale a (1,1) point to a bunch of different lengths, + // making sure the slow and fast paths are within 0.1%. + const float tests[] = { 1.0f, 0.0f, 1.0e-37f, 3.4e38f, 42.0f, 0.00012f }; + + const SkPoint kOne = {1.0f, 1.0f}; + for (unsigned i = 0; i < SK_ARRAY_COUNT(tests); i++) { + SkPoint slow = kOne, fast = kOne; + + slow.setLength(tests[i]); + fast.setLengthFast(tests[i]); + + if (slow.length() < FLT_MIN && fast.length() < FLT_MIN) continue; + + SkScalar ratio = slow.length() / fast.length(); + REPORTER_ASSERT(reporter, ratio > 0.999f); + REPORTER_ASSERT(reporter, ratio < 1.001f); + } +} |