diff options
-rw-r--r-- | bench/PMFloatBench.cpp | 84 | ||||
-rw-r--r-- | src/core/SkPMFloat.h | 29 | ||||
-rw-r--r-- | src/effects/SkColorMatrixFilter.cpp | 36 | ||||
-rw-r--r-- | src/opts/SkPMFloat_SSE2.h | 84 | ||||
-rw-r--r-- | src/opts/SkPMFloat_SSSE3.h | 87 | ||||
-rw-r--r-- | src/opts/SkPMFloat_neon.h | 58 | ||||
-rw-r--r-- | src/opts/SkPMFloat_none.h | 55 | ||||
-rw-r--r-- | src/opts/SkPMFloat_sse.h | 36 | ||||
-rw-r--r-- | tests/PMFloatTest.cpp | 48 |
9 files changed, 100 insertions, 417 deletions
diff --git a/bench/PMFloatBench.cpp b/bench/PMFloatBench.cpp index 37542e3525..540fdb7815 100644 --- a/bench/PMFloatBench.cpp +++ b/bench/PMFloatBench.cpp @@ -20,20 +20,10 @@ static uint32_t lcg_rand(uint32_t* seed) { } // I'm having better luck getting these to constant-propagate away as template parameters. -template <bool kClamp, bool kWide> -struct PMFloatGetSetBench : public Benchmark { - PMFloatGetSetBench() {} +struct PMFloatRoundtripBench : public Benchmark { + PMFloatRoundtripBench() {} - const char* onGetName() override { - switch (kClamp << 1 | kWide) { - case 0: return "SkPMFloat_get_1x"; - case 1: return "SkPMFloat_get_4x"; - case 2: return "SkPMFloat_clamp_1x"; - case 3: return "SkPMFloat_clamp_4x"; - } - SkFAIL("unreachable"); - return "oh bother"; - } + const char* onGetName() override { return "SkPMFloat_roundtrip"; } bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; } void onDraw(const int loops, SkCanvas* canvas) override { @@ -41,61 +31,23 @@ struct PMFloatGetSetBench : public Benchmark { uint32_t junk = 0; uint32_t seed = 0; for (int i = 0; i < loops; i++) { - SkPMColor colors[4]; + SkPMColor color; #ifdef SK_DEBUG - for (int i = 0; i < 4; i++) { - // Our SkASSERTs will remind us that it's technically required that we premultiply. - colors[i] = SkPreMultiplyColor(lcg_rand(&seed)); - } + // Our SkASSERTs will remind us that it's technically required that we premultiply. + color = SkPreMultiplyColor(lcg_rand(&seed)); #else // But it's a lot faster not to, and this code won't really mind the non-PM colors. - (void)lcg_rand(&seed); - colors[0] = seed + 0; - colors[1] = seed + 1; - colors[2] = seed + 2; - colors[3] = seed + 3; + color = lcg_rand(&seed); #endif - SkPMFloat fa,fb,fc,fd; - if (kWide) { - SkPMFloat::From4PMColors(colors, &fa, &fb, &fc, &fd); - } else { - fa = SkPMFloat::FromPMColor(colors[0]); - fb = SkPMFloat::FromPMColor(colors[1]); - fc = SkPMFloat::FromPMColor(colors[2]); - fd = SkPMFloat::FromPMColor(colors[3]); - } - - SkPMColor back[4]; - switch (kClamp << 1 | kWide) { - case 0: { - back[0] = fa.round(); - back[1] = fb.round(); - back[2] = fc.round(); - back[3] = fd.round(); - } break; - case 1: SkPMFloat::RoundTo4PMColors(fa, fb, fc, fd, back); break; - case 2: { - back[0] = fa.roundClamp(); - back[1] = fb.roundClamp(); - back[2] = fc.roundClamp(); - back[3] = fd.roundClamp(); - } break; - case 3: SkPMFloat::RoundClampTo4PMColors(fa, fb, fc, fd, back); break; - } - for (int i = 0; i < 4; i++) { - junk ^= back[i]; - } + auto f = SkPMFloat::FromPMColor(color); + SkPMColor back = f.round(); + junk ^= back; } blackhole ^= junk; } }; - -// Extra () help DEF_BENCH not get confused by the comma inside the <>. -DEF_BENCH(return (new PMFloatGetSetBench< true, true>);) -DEF_BENCH(return (new PMFloatGetSetBench<false, true>);) -DEF_BENCH(return (new PMFloatGetSetBench< true, false>);) -DEF_BENCH(return (new PMFloatGetSetBench<false, false>);) +DEF_BENCH(return new PMFloatRoundtripBench;) struct PMFloatGradientBench : public Benchmark { const char* onGetName() override { return "PMFloat_gradient"; } @@ -103,8 +55,8 @@ struct PMFloatGradientBench : public Benchmark { SkPMColor fDevice[100]; void onDraw(const int loops, SkCanvas*) override { - Sk4f c0 = SkPMFloat::FromARGB(255, 255, 0, 0), - c1 = SkPMFloat::FromARGB(255, 0, 0, 255), + Sk4f c0 = SkPMFloat::FromARGB(1, 1, 0, 0), + c1 = SkPMFloat::FromARGB(1, 0, 0, 1), dc = c1 - c0, fx(0.1f), dx(0.002f), @@ -112,15 +64,15 @@ struct PMFloatGradientBench : public Benchmark { dcdx4(dcdx+dcdx+dcdx+dcdx); for (int n = 0; n < loops; n++) { - Sk4f a = c0 + dc*fx + Sk4f(0.5f), // The +0.5f lets us call trunc() instead of get(). + Sk4f a = c0 + dc*fx, b = a + dcdx, c = b + dcdx, d = c + dcdx; for (size_t i = 0; i < SK_ARRAY_COUNT(fDevice); i += 4) { - fDevice[i+0] = SkPMFloat(a).trunc(); - fDevice[i+1] = SkPMFloat(b).trunc(); - fDevice[i+2] = SkPMFloat(c).trunc(); - fDevice[i+3] = SkPMFloat(d).trunc(); + fDevice[i+0] = SkPMFloat(a).round(); + fDevice[i+1] = SkPMFloat(b).round(); + fDevice[i+2] = SkPMFloat(c).round(); + fDevice[i+3] = SkPMFloat(d).round(); a = a + dcdx4; b = b + dcdx4; c = c + dcdx4; diff --git a/src/core/SkPMFloat.h b/src/core/SkPMFloat.h index ce7b75e705..f1d302458d 100644 --- a/src/core/SkPMFloat.h +++ b/src/core/SkPMFloat.h @@ -21,15 +21,12 @@ namespace { // A pre-multiplied color storing each component in the same order as SkPMColor, -// but as a float in the range [0, 255]. +// but as a float in the range [0, 1]. class SkPMFloat : public Sk4f { public: static SkPMFloat FromPMColor(SkPMColor c) { return SkPMFloat(c); } static SkPMFloat FromARGB(float a, float r, float g, float b) { return SkPMFloat(a,r,g,b); } - // May be more efficient than one at a time. No special alignment assumed for SkPMColors. - static void From4PMColors(const SkPMColor[4], SkPMFloat*, SkPMFloat*, SkPMFloat*, SkPMFloat*); - // Uninitialized. SkPMFloat() {} explicit SkPMFloat(SkPMColor); @@ -47,24 +44,10 @@ public: float g() const { return this->kth<SK_G32_SHIFT / 8>(); } float b() const { return this->kth<SK_B32_SHIFT / 8>(); } - // N.B. All methods returning an SkPMColor call SkPMColorAssert on that result before returning. - - // round() and roundClamp() round component values to the nearest integer. - SkPMColor round() const; // Assumes all values in [0, 255]. Some implementations may clamp. - SkPMColor roundClamp() const; // Will clamp all values to [0, 255]. - - // Like round(), but truncates instead of rounding. - // The domain of this function is (-1.0f, 256.0f). Values in (-1.0f, 0.0f] trunc to a zero. - SkPMColor trunc() const; - - // 4-at-a-time versions of round() and roundClamp(). Like From4PMColors(), no alignment assumed. - static void RoundTo4PMColors( - const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, SkPMColor[4]); - static void RoundClampTo4PMColors( - const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, SkPMColor[4]); + SkPMColor round() const; // Rounds from [0.0f, 1.0f] to [0, 255], clamping if out of range. bool isValid() const { - return this->a() >= 0 && this->a() <= 255 + return this->a() >= 0 && this->a() <= 1 && this->r() >= 0 && this->r() <= this->a() && this->g() >= 0 && this->g() <= this->a() && this->b() >= 0 && this->b() <= this->a(); @@ -80,10 +63,8 @@ private: // Platform implementations of SkPMFloat assume Sk4f uses SSE or NEON. _none is generic. #include "../opts/SkPMFloat_none.h" #else - #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 - #include "../opts/SkPMFloat_SSSE3.h" - #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 - #include "../opts/SkPMFloat_SSE2.h" + #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 + #include "../opts/SkPMFloat_sse.h" #elif defined(SK_ARM_HAS_NEON) #include "../opts/SkPMFloat_neon.h" #else diff --git a/src/effects/SkColorMatrixFilter.cpp b/src/effects/SkColorMatrixFilter.cpp index 894a7b089d..78d4797e44 100644 --- a/src/effects/SkColorMatrixFilter.cpp +++ b/src/effects/SkColorMatrixFilter.cpp @@ -239,25 +239,9 @@ uint32_t SkColorMatrixFilter::getFlags() const { return this->INHERITED::getFlags() | fFlags; } -/** - * Need inv255 = 1 / 255 as a constant, so when we premul a SkPMFloat, we can do this - * - * new_red = old_red * alpha * inv255 - * - * instead of (much slower) - * - * new_red = old_red * alpha / 255 - * - * However, 1.0f/255 comes to (in hex) 0x3B808081, which is slightly bigger than the "actual" - * value of 0x3B808080(repeat 80)... This slightly too-big value can cause us to compute - * new_red > alpha, which is a problem (for valid premul). To fix this, we use a - * hand-computed value of 0x3B808080, 1 ULP smaller. This keeps our colors valid. - */ -static const float gInv255 = 0.0039215683f; // (1.0f / 255) - ULP == SkBits2Float(0x3B808080) - static Sk4f premul(const Sk4f& x) { - float scale = SkPMFloat(x).a() * gInv255; - Sk4f pm = x * Sk4f(scale, scale, scale, 1); + float scale = SkPMFloat(x).a(); + Sk4f pm = x * SkPMFloat(1, scale, scale, scale); #ifdef SK_DEBUG SkPMFloat pmf(pm); @@ -268,12 +252,12 @@ static Sk4f premul(const Sk4f& x) { } static Sk4f unpremul(const SkPMFloat& pm) { - float scale = 255 / pm.a(); // candidate for fast/approx invert? - return pm * Sk4f(scale, scale, scale, 1); + float scale = 1 / pm.a(); // candidate for fast/approx invert? + return pm * SkPMFloat(1, scale, scale, scale); } -static Sk4f clamp_0_255(const Sk4f& value) { - return Sk4f::Max(Sk4f::Min(value, Sk4f(255)), Sk4f(0)); +static Sk4f clamp_0_1(const Sk4f& value) { + return Sk4f::Max(Sk4f::Min(value, Sk4f(1)), Sk4f(0)); } void SkColorMatrixFilter::filterSpan(const SkPMColor src[], int count, SkPMColor dst[]) const { @@ -292,14 +276,16 @@ void SkColorMatrixFilter::filterSpan(const SkPMColor src[], int count, SkPMColor #endif if (use_floats) { + // c0-c3 are already in [0,1]. const Sk4f c0 = Sk4f::Load(fTranspose + 0); const Sk4f c1 = Sk4f::Load(fTranspose + 4); const Sk4f c2 = Sk4f::Load(fTranspose + 8); const Sk4f c3 = Sk4f::Load(fTranspose + 12); - const Sk4f c4 = Sk4f::Load(fTranspose + 16); // translates + // c4 (the translate vector) is in [0, 255]. Bring it back to [0,1]. + const Sk4f c4 = Sk4f::Load(fTranspose + 16)*Sk4f(1.0f/255); // todo: we could cache this in the constructor... - SkPMColor matrix_translate_pmcolor = SkPMFloat(premul(clamp_0_255(c4))).roundClamp(); + SkPMColor matrix_translate_pmcolor = SkPMFloat(premul(clamp_0_1(c4))).round(); for (int i = 0; i < count; i++) { const SkPMColor src_c = src[i]; @@ -323,7 +309,7 @@ void SkColorMatrixFilter::filterSpan(const SkPMColor src[], int count, SkPMColor Sk4f dst4 = c0 * r4 + c1 * g4 + c2 * b4 + c3 * a4 + c4; // clamp, re-premul, and write - dst[i] = SkPMFloat(premul(clamp_0_255(dst4))).round(); + dst[i] = SkPMFloat(premul(clamp_0_1(dst4))).round(); } } else { const State& state = fState; diff --git a/src/opts/SkPMFloat_SSE2.h b/src/opts/SkPMFloat_SSE2.h deleted file mode 100644 index c7e791ff62..0000000000 --- a/src/opts/SkPMFloat_SSE2.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright 2015 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - -namespace { // See SkPMFloat.h - -// For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit components in 16 bits -// (fix8_16), then widen those to 8-bit-in-32-bits (fix8_32), and finally convert those to floats. - -// round() and roundClamp() do the opposite, working from floats to 8-bit-in-32-bit, -// to 8-bit-in-16-bit, back down to 8-bit components. -// _mm_packus_epi16() gives us clamping for free while narrowing. - -inline SkPMFloat::SkPMFloat(SkPMColor c) { - SkPMColorAssert(c); - __m128i fix8 = _mm_set_epi32(0,0,0,c), - fix8_16 = _mm_unpacklo_epi8 (fix8, _mm_setzero_si128()), - fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128()); - fVec = _mm_cvtepi32_ps(fix8_32); - SkASSERT(this->isValid()); -} - -inline SkPMColor SkPMFloat::round() const { - return this->roundClamp(); // Haven't beaten this yet. -} - -inline SkPMColor SkPMFloat::roundClamp() const { - // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up). - __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fVec)), - fix8_16 = _mm_packus_epi16(fix8_32, fix8_32), - fix8 = _mm_packus_epi16(fix8_16, fix8_16); - SkPMColor c = _mm_cvtsi128_si32(fix8); - SkPMColorAssert(c); - return c; -} - -inline SkPMColor SkPMFloat::trunc() const { - // Basically, same as roundClamp(), but no rounding. - __m128i fix8_32 = _mm_cvttps_epi32(fVec), - fix8_16 = _mm_packus_epi16(fix8_32, fix8_32), - fix8 = _mm_packus_epi16(fix8_16, fix8_16); - SkPMColor c = _mm_cvtsi128_si32(fix8); - SkPMColorAssert(c); - return c; -} - -inline void SkPMFloat::From4PMColors(const SkPMColor colors[4], - SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) { - // Haven't beaten this yet. - *a = FromPMColor(colors[0]); - *b = FromPMColor(colors[1]); - *c = FromPMColor(colors[2]); - *d = FromPMColor(colors[3]); -} - -inline void SkPMFloat::RoundTo4PMColors( - const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d, - SkPMColor colors[4]) { - // Haven't beaten this yet. - RoundClampTo4PMColors(a,b,c,d, colors); -} - -inline void SkPMFloat::RoundClampTo4PMColors( - const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d, - SkPMColor colors[4]) { - // Same as _SSSE3.h's. We use 3 _mm_packus_epi16() where the naive loop uses 8. - // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up). - __m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), a.fVec)), - c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), b.fVec)), - c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), c.fVec)), - c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), d.fVec)); - __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1), - _mm_packus_epi16(c2, c3)); - _mm_storeu_si128((__m128i*)colors, c3210); - SkPMColorAssert(colors[0]); - SkPMColorAssert(colors[1]); - SkPMColorAssert(colors[2]); - SkPMColorAssert(colors[3]); -} - -} // namespace diff --git a/src/opts/SkPMFloat_SSSE3.h b/src/opts/SkPMFloat_SSSE3.h deleted file mode 100644 index 67116ec2dd..0000000000 --- a/src/opts/SkPMFloat_SSSE3.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright 2015 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - -namespace { // See SkPMFloat.h - -// For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit components in 32 bits -// (fix8_32), then convert those to floats. - -// round() does the opposite, working from floats to 8-bit-in-32-bits, then back to packed 8 bit. - -// roundClamp() is the same as _SSE2: floats to 8-in-32, to 8-in-16, to packed 8 bit, with -// _mm_packus_epi16() both clamping and narrowing. - -inline SkPMFloat::SkPMFloat(SkPMColor c) { - SkPMColorAssert(c); - const int _ = 255; // _ means to zero that byte. - __m128i fix8 = _mm_set_epi32(0,0,0,c), - fix8_32 = _mm_shuffle_epi8(fix8, _mm_set_epi8(_,_,_,3, _,_,_,2, _,_,_,1, _,_,_,0)); - fVec = _mm_cvtepi32_ps(fix8_32); - SkASSERT(this->isValid()); -} - -inline SkPMColor SkPMFloat::trunc() const { - const int _ = 255; // _ means to zero that byte. - __m128i fix8_32 = _mm_cvttps_epi32(fVec), - fix8 = _mm_shuffle_epi8(fix8_32, _mm_set_epi8(_,_,_,_, _,_,_,_, _,_,_,_, 12,8,4,0)); - SkPMColor c = _mm_cvtsi128_si32(fix8); - SkPMColorAssert(c); - return c; -} - -inline SkPMColor SkPMFloat::round() const { - return SkPMFloat(Sk4f(0.5f) + *this).trunc(); -} - -inline SkPMColor SkPMFloat::roundClamp() const { - // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up). - __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fVec)), - fix8_16 = _mm_packus_epi16(fix8_32, fix8_32), - fix8 = _mm_packus_epi16(fix8_16, fix8_16); - SkPMColor c = _mm_cvtsi128_si32(fix8); - SkPMColorAssert(c); - return c; -} - -inline void SkPMFloat::From4PMColors(const SkPMColor colors[4], - SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) { - // Haven't beaten this yet. - *a = FromPMColor(colors[0]); - *b = FromPMColor(colors[1]); - *c = FromPMColor(colors[2]); - *d = FromPMColor(colors[3]); -} - -inline void SkPMFloat::RoundTo4PMColors( - const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d, - SkPMColor colors[4]) { - // Haven't beaten this yet. Still faster than RoundClampTo4PMColors? - colors[0] = a.round(); - colors[1] = b.round(); - colors[2] = c.round(); - colors[3] = d.round(); -} - -inline void SkPMFloat::RoundClampTo4PMColors( - const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d, - SkPMColor colors[4]) { - // Same as _SSE2.h's. We use 3 _mm_packus_epi16() where the naive loop uses 8. - // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up). - __m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), a.fVec)), - c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), b.fVec)), - c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), c.fVec)), - c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), d.fVec)); - __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1), - _mm_packus_epi16(c2, c3)); - _mm_storeu_si128((__m128i*)colors, c3210); - SkPMColorAssert(colors[0]); - SkPMColorAssert(colors[1]); - SkPMColorAssert(colors[2]); - SkPMColorAssert(colors[3]); -} - -} // namespace diff --git a/src/opts/SkPMFloat_neon.h b/src/opts/SkPMFloat_neon.h index cabb29a2a8..57f613751d 100644 --- a/src/opts/SkPMFloat_neon.h +++ b/src/opts/SkPMFloat_neon.h @@ -7,70 +7,24 @@ namespace { // See SkPMFloat.h -// For SkPMFloat(SkPMFColor), we widen our 8 bit components (fix8) to 8-bit components in 16 bits -// (fix8_16), then widen those to 8-bit-in-32-bits (fix8_32), and finally convert those to floats. - -// round() and roundClamp() do the opposite, working from floats to 8-bit-in-32-bit, -// to 8-bit-in-16-bit, back down to 8-bit components. -// roundClamp() uses vqmovn to clamp while narrowing instead of just narrowing with vmovn. - inline SkPMFloat::SkPMFloat(SkPMColor c) { SkPMColorAssert(c); uint8x8_t fix8 = (uint8x8_t)vdup_n_u32(c); uint16x8_t fix8_16 = vmovl_u8(fix8); uint32x4_t fix8_32 = vmovl_u16(vget_low_u16(fix8_16)); - fVec = vcvtq_f32_u32(fix8_32); + fVec = vcvtq_n_f32_u32(fix8_32, 8); SkASSERT(this->isValid()); } -inline SkPMColor SkPMFloat::trunc() const { - uint32x4_t fix8_32 = vcvtq_u32_f32(fVec); // vcvtq_u32_f32 truncates - uint16x4_t fix8_16 = vmovn_u32(fix8_32); - uint8x8_t fix8 = vmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0))); - SkPMColor c = vget_lane_u32((uint32x2_t)fix8, 0); - SkPMColorAssert(c); - return c; -} - inline SkPMColor SkPMFloat::round() const { - return SkPMFloat(Sk4f(0.5f) + *this).trunc(); -} - -inline SkPMColor SkPMFloat::roundClamp() const { - float32x4_t add_half = vaddq_f32(fVec, vdupq_n_f32(0.5f)); - uint32x4_t fix8_32 = vcvtq_u32_f32(add_half); // vcvtq_u32_f32 truncates, so round manually - uint16x4_t fix8_16 = vqmovn_u32(fix8_32); - uint8x8_t fix8 = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0))); + // vcvtq_n_u32_f32 truncates, so we round manually by adding a half before converting. + float32x4_t rounded = vaddq_f32(fVec, vdupq_n_f32(0.5f/255)); + uint32x4_t fix8_32 = vcvtq_n_u32_f32(rounded, 8); + uint16x4_t fix8_16 = vqmovn_u32(fix8_32); + uint8x8_t fix8 = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0))); SkPMColor c = vget_lane_u32((uint32x2_t)fix8, 0); SkPMColorAssert(c); return c; } -// TODO: we should be able to beat these loops on all three methods. -inline void SkPMFloat::From4PMColors(const SkPMColor colors[4], - SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) { - *a = FromPMColor(colors[0]); - *b = FromPMColor(colors[1]); - *c = FromPMColor(colors[2]); - *d = FromPMColor(colors[3]); -} - -inline void SkPMFloat::RoundTo4PMColors( - const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d, - SkPMColor colors[4]) { - colors[0] = a.round(); - colors[1] = b.round(); - colors[2] = c.round(); - colors[3] = d.round(); -} - -inline void SkPMFloat::RoundClampTo4PMColors( - const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d, - SkPMColor colors[4]) { - colors[0] = a.roundClamp(); - colors[1] = b.roundClamp(); - colors[2] = c.roundClamp(); - colors[3] = d.roundClamp(); -} - } // namespace diff --git a/src/opts/SkPMFloat_none.h b/src/opts/SkPMFloat_none.h index 9bb584ed86..ba773564d6 100644 --- a/src/opts/SkPMFloat_none.h +++ b/src/opts/SkPMFloat_none.h @@ -8,61 +8,26 @@ namespace { // See SkPMFloat.h inline SkPMFloat::SkPMFloat(SkPMColor c) { - *this = SkPMFloat::FromARGB(SkGetPackedA32(c), - SkGetPackedR32(c), - SkGetPackedG32(c), - SkGetPackedB32(c)); + float inv255 = 1.0f/255; + *this = SkPMFloat::FromARGB(SkGetPackedA32(c) * inv255, + SkGetPackedR32(c) * inv255, + SkGetPackedG32(c) * inv255, + SkGetPackedB32(c) * inv255); SkASSERT(this->isValid()); } -inline SkPMColor SkPMFloat::trunc() const { - return SkPackARGB32(this->a(), this->r(), this->g(), this->b()); -} - inline SkPMColor SkPMFloat::round() const { - SkPMColor c = SkPackARGB32(this->a()+0.5f, this->r()+0.5f, this->g()+0.5f, this->b()+0.5f); - SkPMColorAssert(c); - return c; -} - -inline SkPMColor SkPMFloat::roundClamp() const { float a = this->a(), r = this->r(), g = this->g(), b = this->b(); - a = a < 0 ? 0 : (a > 255 ? 255 : a); - r = r < 0 ? 0 : (r > 255 ? 255 : r); - g = g < 0 ? 0 : (g > 255 ? 255 : g); - b = b < 0 ? 0 : (b > 255 ? 255 : b); - SkPMColor c = SkPackARGB32(a+0.5f, r+0.5f, g+0.5f, b+0.5f); + a = a < 0 ? 0 : (a > 1 ? 1 : a); + r = r < 0 ? 0 : (r > 1 ? 1 : r); + g = g < 0 ? 0 : (g > 1 ? 1 : g); + b = b < 0 ? 0 : (b > 1 ? 1 : b); + SkPMColor c = SkPackARGB32(255*a+0.5f, 255*r+0.5f, 255*g+0.5f, 255*b+0.5f); SkPMColorAssert(c); return c; } -inline void SkPMFloat::From4PMColors(const SkPMColor colors[4], - SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) { - *a = FromPMColor(colors[0]); - *b = FromPMColor(colors[1]); - *c = FromPMColor(colors[2]); - *d = FromPMColor(colors[3]); -} - -inline void SkPMFloat::RoundTo4PMColors( - const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d, - SkPMColor colors[4]) { - colors[0] = a.round(); - colors[1] = b.round(); - colors[2] = c.round(); - colors[3] = d.round(); -} - -inline void SkPMFloat::RoundClampTo4PMColors( - const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d, - SkPMColor colors[4]) { - colors[0] = a.roundClamp(); - colors[1] = b.roundClamp(); - colors[2] = c.roundClamp(); - colors[3] = d.roundClamp(); -} - } // namespace diff --git a/src/opts/SkPMFloat_sse.h b/src/opts/SkPMFloat_sse.h new file mode 100644 index 0000000000..802b17ba0c --- /dev/null +++ b/src/opts/SkPMFloat_sse.h @@ -0,0 +1,36 @@ +/* + * Copyright 2015 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +namespace { // See SkPMFloat.h + +inline SkPMFloat::SkPMFloat(SkPMColor c) { + SkPMColorAssert(c); +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 + const int _ = 255; // Zero these bytes. + __m128i fix8 = _mm_cvtsi32_si128((int)c), + fix8_32 = _mm_shuffle_epi8(fix8, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_)); +#else + __m128i fix8 = _mm_cvtsi32_si128((int)c), + fix8_16 = _mm_unpacklo_epi8 (fix8, _mm_setzero_si128()), + fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128()); +#endif + fVec = _mm_mul_ps(_mm_cvtepi32_ps(fix8_32), _mm_set1_ps(1.0f / 255)); + SkASSERT(this->isValid()); +} + +inline SkPMColor SkPMFloat::round() const { + // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up). + __m128 scaled = _mm_mul_ps(_mm_set1_ps(255), fVec); + __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), scaled)), + fix8_16 = _mm_packus_epi16(fix8_32, fix8_32), + fix8 = _mm_packus_epi16(fix8_16, fix8_16); + SkPMColor c = _mm_cvtsi128_si32(fix8); + SkPMColorAssert(c); + return c; +} + +} // namespace diff --git a/tests/PMFloatTest.cpp b/tests/PMFloatTest.cpp index 1c53081571..b7b39411d1 100644 --- a/tests/PMFloatTest.cpp +++ b/tests/PMFloatTest.cpp @@ -12,47 +12,27 @@ DEF_TEST(SkPMFloat, r) { // Test SkPMColor <-> SkPMFloat SkPMColor c = SkPreMultiplyColor(0xFFCC9933); SkPMFloat pmf(c); - REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, pmf.a())); - REPORTER_ASSERT(r, SkScalarNearlyEqual(204.0f, pmf.r())); - REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, pmf.g())); - REPORTER_ASSERT(r, SkScalarNearlyEqual( 51.0f, pmf.b())); + REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, 255*pmf.a())); + REPORTER_ASSERT(r, SkScalarNearlyEqual(204.0f, 255*pmf.r())); + REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, 255*pmf.g())); + REPORTER_ASSERT(r, SkScalarNearlyEqual( 51.0f, 255*pmf.b())); REPORTER_ASSERT(r, c == pmf.round()); // Test rounding. - pmf = SkPMFloat(254.5f, 203.5f, 153.1f, 50.8f); + pmf = SkPMFloat(254.5f/255, 203.5f/255, 153.1f/255, 50.8f/255); REPORTER_ASSERT(r, c == pmf.round()); - pmf = SkPMFloat(255.9f, 204.01f, 153.0f, -0.9f); - REPORTER_ASSERT(r, SkPreMultiplyColor(0xFFCC9900) == pmf.trunc()); - - // Test clamping. - SkPMFloat clamped(SkPMFloat(510.0f, 153.0f, 1.0f, -0.2f).roundClamp()); - REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, clamped.a())); - REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, clamped.r())); - REPORTER_ASSERT(r, SkScalarNearlyEqual( 1.0f, clamped.g())); - REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.0f, clamped.b())); + SkPMFloat clamped(SkPMFloat(510.0f/255, 153.0f/255, 1.0f/255, -0.2f/255).round()); + REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, 255*clamped.a())); + REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, 255*clamped.r())); + REPORTER_ASSERT(r, SkScalarNearlyEqual( 1.0f, 255*clamped.g())); + REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.0f, 255*clamped.b())); // Test SkPMFloat <-> Sk4f conversion. Sk4f fs = clamped; SkPMFloat scaled = fs * Sk4f(0.25f); - REPORTER_ASSERT(r, SkScalarNearlyEqual(63.75f, scaled.a())); - REPORTER_ASSERT(r, SkScalarNearlyEqual(38.25f, scaled.r())); - REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.25f, scaled.g())); - REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.00f, scaled.b())); - - // Test 4-at-a-time conversions. - SkPMColor colors[4] = { 0xFF000000, 0xFFFF0000, 0xFF00FF00, 0xFF0000FF }; - SkPMFloat floats[4]; - SkPMFloat::From4PMColors(colors, floats+0, floats+1, floats+2, floats+3); - - SkPMColor back[4]; - SkPMFloat::RoundTo4PMColors(floats[0], floats[1], floats[2], floats[3], back); - for (int i = 0; i < 4; i++) { - REPORTER_ASSERT(r, back[i] == colors[i]); - } - - SkPMFloat::RoundClampTo4PMColors(floats[0], floats[1], floats[2], floats[3], back); - for (int i = 0; i < 4; i++) { - REPORTER_ASSERT(r, back[i] == colors[i]); - } + REPORTER_ASSERT(r, SkScalarNearlyEqual(63.75f, 255*scaled.a())); + REPORTER_ASSERT(r, SkScalarNearlyEqual(38.25f, 255*scaled.r())); + REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.25f, 255*scaled.g())); + REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.00f, 255*scaled.b())); } |