aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--bench/PMFloatBench.cpp84
-rw-r--r--src/core/SkPMFloat.h29
-rw-r--r--src/effects/SkColorMatrixFilter.cpp36
-rw-r--r--src/opts/SkPMFloat_SSE2.h84
-rw-r--r--src/opts/SkPMFloat_SSSE3.h87
-rw-r--r--src/opts/SkPMFloat_neon.h58
-rw-r--r--src/opts/SkPMFloat_none.h55
-rw-r--r--src/opts/SkPMFloat_sse.h36
-rw-r--r--tests/PMFloatTest.cpp48
9 files changed, 100 insertions, 417 deletions
diff --git a/bench/PMFloatBench.cpp b/bench/PMFloatBench.cpp
index 37542e3525..540fdb7815 100644
--- a/bench/PMFloatBench.cpp
+++ b/bench/PMFloatBench.cpp
@@ -20,20 +20,10 @@ static uint32_t lcg_rand(uint32_t* seed) {
}
// I'm having better luck getting these to constant-propagate away as template parameters.
-template <bool kClamp, bool kWide>
-struct PMFloatGetSetBench : public Benchmark {
- PMFloatGetSetBench() {}
+struct PMFloatRoundtripBench : public Benchmark {
+ PMFloatRoundtripBench() {}
- const char* onGetName() override {
- switch (kClamp << 1 | kWide) {
- case 0: return "SkPMFloat_get_1x";
- case 1: return "SkPMFloat_get_4x";
- case 2: return "SkPMFloat_clamp_1x";
- case 3: return "SkPMFloat_clamp_4x";
- }
- SkFAIL("unreachable");
- return "oh bother";
- }
+ const char* onGetName() override { return "SkPMFloat_roundtrip"; }
bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
void onDraw(const int loops, SkCanvas* canvas) override {
@@ -41,61 +31,23 @@ struct PMFloatGetSetBench : public Benchmark {
uint32_t junk = 0;
uint32_t seed = 0;
for (int i = 0; i < loops; i++) {
- SkPMColor colors[4];
+ SkPMColor color;
#ifdef SK_DEBUG
- for (int i = 0; i < 4; i++) {
- // Our SkASSERTs will remind us that it's technically required that we premultiply.
- colors[i] = SkPreMultiplyColor(lcg_rand(&seed));
- }
+ // Our SkASSERTs will remind us that it's technically required that we premultiply.
+ color = SkPreMultiplyColor(lcg_rand(&seed));
#else
// But it's a lot faster not to, and this code won't really mind the non-PM colors.
- (void)lcg_rand(&seed);
- colors[0] = seed + 0;
- colors[1] = seed + 1;
- colors[2] = seed + 2;
- colors[3] = seed + 3;
+ color = lcg_rand(&seed);
#endif
- SkPMFloat fa,fb,fc,fd;
- if (kWide) {
- SkPMFloat::From4PMColors(colors, &fa, &fb, &fc, &fd);
- } else {
- fa = SkPMFloat::FromPMColor(colors[0]);
- fb = SkPMFloat::FromPMColor(colors[1]);
- fc = SkPMFloat::FromPMColor(colors[2]);
- fd = SkPMFloat::FromPMColor(colors[3]);
- }
-
- SkPMColor back[4];
- switch (kClamp << 1 | kWide) {
- case 0: {
- back[0] = fa.round();
- back[1] = fb.round();
- back[2] = fc.round();
- back[3] = fd.round();
- } break;
- case 1: SkPMFloat::RoundTo4PMColors(fa, fb, fc, fd, back); break;
- case 2: {
- back[0] = fa.roundClamp();
- back[1] = fb.roundClamp();
- back[2] = fc.roundClamp();
- back[3] = fd.roundClamp();
- } break;
- case 3: SkPMFloat::RoundClampTo4PMColors(fa, fb, fc, fd, back); break;
- }
- for (int i = 0; i < 4; i++) {
- junk ^= back[i];
- }
+ auto f = SkPMFloat::FromPMColor(color);
+ SkPMColor back = f.round();
+ junk ^= back;
}
blackhole ^= junk;
}
};
-
-// Extra () help DEF_BENCH not get confused by the comma inside the <>.
-DEF_BENCH(return (new PMFloatGetSetBench< true, true>);)
-DEF_BENCH(return (new PMFloatGetSetBench<false, true>);)
-DEF_BENCH(return (new PMFloatGetSetBench< true, false>);)
-DEF_BENCH(return (new PMFloatGetSetBench<false, false>);)
+DEF_BENCH(return new PMFloatRoundtripBench;)
struct PMFloatGradientBench : public Benchmark {
const char* onGetName() override { return "PMFloat_gradient"; }
@@ -103,8 +55,8 @@ struct PMFloatGradientBench : public Benchmark {
SkPMColor fDevice[100];
void onDraw(const int loops, SkCanvas*) override {
- Sk4f c0 = SkPMFloat::FromARGB(255, 255, 0, 0),
- c1 = SkPMFloat::FromARGB(255, 0, 0, 255),
+ Sk4f c0 = SkPMFloat::FromARGB(1, 1, 0, 0),
+ c1 = SkPMFloat::FromARGB(1, 0, 0, 1),
dc = c1 - c0,
fx(0.1f),
dx(0.002f),
@@ -112,15 +64,15 @@ struct PMFloatGradientBench : public Benchmark {
dcdx4(dcdx+dcdx+dcdx+dcdx);
for (int n = 0; n < loops; n++) {
- Sk4f a = c0 + dc*fx + Sk4f(0.5f), // The +0.5f lets us call trunc() instead of get().
+ Sk4f a = c0 + dc*fx,
b = a + dcdx,
c = b + dcdx,
d = c + dcdx;
for (size_t i = 0; i < SK_ARRAY_COUNT(fDevice); i += 4) {
- fDevice[i+0] = SkPMFloat(a).trunc();
- fDevice[i+1] = SkPMFloat(b).trunc();
- fDevice[i+2] = SkPMFloat(c).trunc();
- fDevice[i+3] = SkPMFloat(d).trunc();
+ fDevice[i+0] = SkPMFloat(a).round();
+ fDevice[i+1] = SkPMFloat(b).round();
+ fDevice[i+2] = SkPMFloat(c).round();
+ fDevice[i+3] = SkPMFloat(d).round();
a = a + dcdx4;
b = b + dcdx4;
c = c + dcdx4;
diff --git a/src/core/SkPMFloat.h b/src/core/SkPMFloat.h
index ce7b75e705..f1d302458d 100644
--- a/src/core/SkPMFloat.h
+++ b/src/core/SkPMFloat.h
@@ -21,15 +21,12 @@
namespace {
// A pre-multiplied color storing each component in the same order as SkPMColor,
-// but as a float in the range [0, 255].
+// but as a float in the range [0, 1].
class SkPMFloat : public Sk4f {
public:
static SkPMFloat FromPMColor(SkPMColor c) { return SkPMFloat(c); }
static SkPMFloat FromARGB(float a, float r, float g, float b) { return SkPMFloat(a,r,g,b); }
- // May be more efficient than one at a time. No special alignment assumed for SkPMColors.
- static void From4PMColors(const SkPMColor[4], SkPMFloat*, SkPMFloat*, SkPMFloat*, SkPMFloat*);
-
// Uninitialized.
SkPMFloat() {}
explicit SkPMFloat(SkPMColor);
@@ -47,24 +44,10 @@ public:
float g() const { return this->kth<SK_G32_SHIFT / 8>(); }
float b() const { return this->kth<SK_B32_SHIFT / 8>(); }
- // N.B. All methods returning an SkPMColor call SkPMColorAssert on that result before returning.
-
- // round() and roundClamp() round component values to the nearest integer.
- SkPMColor round() const; // Assumes all values in [0, 255]. Some implementations may clamp.
- SkPMColor roundClamp() const; // Will clamp all values to [0, 255].
-
- // Like round(), but truncates instead of rounding.
- // The domain of this function is (-1.0f, 256.0f). Values in (-1.0f, 0.0f] trunc to a zero.
- SkPMColor trunc() const;
-
- // 4-at-a-time versions of round() and roundClamp(). Like From4PMColors(), no alignment assumed.
- static void RoundTo4PMColors(
- const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, SkPMColor[4]);
- static void RoundClampTo4PMColors(
- const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, const SkPMFloat&, SkPMColor[4]);
+ SkPMColor round() const; // Rounds from [0.0f, 1.0f] to [0, 255], clamping if out of range.
bool isValid() const {
- return this->a() >= 0 && this->a() <= 255
+ return this->a() >= 0 && this->a() <= 1
&& this->r() >= 0 && this->r() <= this->a()
&& this->g() >= 0 && this->g() <= this->a()
&& this->b() >= 0 && this->b() <= this->a();
@@ -80,10 +63,8 @@ private:
// Platform implementations of SkPMFloat assume Sk4f uses SSE or NEON. _none is generic.
#include "../opts/SkPMFloat_none.h"
#else
- #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
- #include "../opts/SkPMFloat_SSSE3.h"
- #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
- #include "../opts/SkPMFloat_SSE2.h"
+ #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+ #include "../opts/SkPMFloat_sse.h"
#elif defined(SK_ARM_HAS_NEON)
#include "../opts/SkPMFloat_neon.h"
#else
diff --git a/src/effects/SkColorMatrixFilter.cpp b/src/effects/SkColorMatrixFilter.cpp
index 894a7b089d..78d4797e44 100644
--- a/src/effects/SkColorMatrixFilter.cpp
+++ b/src/effects/SkColorMatrixFilter.cpp
@@ -239,25 +239,9 @@ uint32_t SkColorMatrixFilter::getFlags() const {
return this->INHERITED::getFlags() | fFlags;
}
-/**
- * Need inv255 = 1 / 255 as a constant, so when we premul a SkPMFloat, we can do this
- *
- * new_red = old_red * alpha * inv255
- *
- * instead of (much slower)
- *
- * new_red = old_red * alpha / 255
- *
- * However, 1.0f/255 comes to (in hex) 0x3B808081, which is slightly bigger than the "actual"
- * value of 0x3B808080(repeat 80)... This slightly too-big value can cause us to compute
- * new_red > alpha, which is a problem (for valid premul). To fix this, we use a
- * hand-computed value of 0x3B808080, 1 ULP smaller. This keeps our colors valid.
- */
-static const float gInv255 = 0.0039215683f; // (1.0f / 255) - ULP == SkBits2Float(0x3B808080)
-
static Sk4f premul(const Sk4f& x) {
- float scale = SkPMFloat(x).a() * gInv255;
- Sk4f pm = x * Sk4f(scale, scale, scale, 1);
+ float scale = SkPMFloat(x).a();
+ Sk4f pm = x * SkPMFloat(1, scale, scale, scale);
#ifdef SK_DEBUG
SkPMFloat pmf(pm);
@@ -268,12 +252,12 @@ static Sk4f premul(const Sk4f& x) {
}
static Sk4f unpremul(const SkPMFloat& pm) {  // Divides the color lanes of pm by its alpha; the alpha lane is left as is.
- float scale = 255 / pm.a(); // candidate for fast/approx invert?
- return pm * Sk4f(scale, scale, scale, 1);
+ float scale = 1 / pm.a(); // candidate for fast/approx invert? NOTE(review): nothing here guards pm.a() == 0 - presumably the caller does; confirm.
+ return pm * SkPMFloat(1, scale, scale, scale);  // arguments are (a, r, g, b): alpha times 1, colors times 1/alpha
}
-static Sk4f clamp_0_255(const Sk4f& value) {
- return Sk4f::Max(Sk4f::Min(value, Sk4f(255)), Sk4f(0));
+static Sk4f clamp_0_1(const Sk4f& value) {  // Clamps each lane of value to the range [0, 1].
+ return Sk4f::Max(Sk4f::Min(value, Sk4f(1)), Sk4f(0));  // Min applies the upper bound first, then Max the lower bound
}
void SkColorMatrixFilter::filterSpan(const SkPMColor src[], int count, SkPMColor dst[]) const {
@@ -292,14 +276,16 @@ void SkColorMatrixFilter::filterSpan(const SkPMColor src[], int count, SkPMColor
#endif
if (use_floats) {
+ // c0-c3 are already in [0,1].
const Sk4f c0 = Sk4f::Load(fTranspose + 0);
const Sk4f c1 = Sk4f::Load(fTranspose + 4);
const Sk4f c2 = Sk4f::Load(fTranspose + 8);
const Sk4f c3 = Sk4f::Load(fTranspose + 12);
- const Sk4f c4 = Sk4f::Load(fTranspose + 16); // translates
+ // c4 (the translate vector) is in [0, 255]. Bring it back to [0,1].
+ const Sk4f c4 = Sk4f::Load(fTranspose + 16)*Sk4f(1.0f/255);
// todo: we could cache this in the constructor...
- SkPMColor matrix_translate_pmcolor = SkPMFloat(premul(clamp_0_255(c4))).roundClamp();
+ SkPMColor matrix_translate_pmcolor = SkPMFloat(premul(clamp_0_1(c4))).round();
for (int i = 0; i < count; i++) {
const SkPMColor src_c = src[i];
@@ -323,7 +309,7 @@ void SkColorMatrixFilter::filterSpan(const SkPMColor src[], int count, SkPMColor
Sk4f dst4 = c0 * r4 + c1 * g4 + c2 * b4 + c3 * a4 + c4;
// clamp, re-premul, and write
- dst[i] = SkPMFloat(premul(clamp_0_255(dst4))).round();
+ dst[i] = SkPMFloat(premul(clamp_0_1(dst4))).round();
}
} else {
const State& state = fState;
diff --git a/src/opts/SkPMFloat_SSE2.h b/src/opts/SkPMFloat_SSE2.h
deleted file mode 100644
index c7e791ff62..0000000000
--- a/src/opts/SkPMFloat_SSE2.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-namespace { // See SkPMFloat.h
-
-// For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit components in 16 bits
-// (fix8_16), then widen those to 8-bit-in-32-bits (fix8_32), and finally convert those to floats.
-
-// round() and roundClamp() do the opposite, working from floats to 8-bit-in-32-bit,
-// to 8-bit-in-16-bit, back down to 8-bit components.
-// _mm_packus_epi16() gives us clamping for free while narrowing.
-
-inline SkPMFloat::SkPMFloat(SkPMColor c) {
- SkPMColorAssert(c);
- __m128i fix8 = _mm_set_epi32(0,0,0,c),
- fix8_16 = _mm_unpacklo_epi8 (fix8, _mm_setzero_si128()),
- fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128());
- fVec = _mm_cvtepi32_ps(fix8_32);
- SkASSERT(this->isValid());
-}
-
-inline SkPMColor SkPMFloat::round() const {
- return this->roundClamp(); // Haven't beaten this yet.
-}
-
-inline SkPMColor SkPMFloat::roundClamp() const {
- // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
- __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fVec)),
- fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
- fix8 = _mm_packus_epi16(fix8_16, fix8_16);
- SkPMColor c = _mm_cvtsi128_si32(fix8);
- SkPMColorAssert(c);
- return c;
-}
-
-inline SkPMColor SkPMFloat::trunc() const {
- // Basically, same as roundClamp(), but no rounding.
- __m128i fix8_32 = _mm_cvttps_epi32(fVec),
- fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
- fix8 = _mm_packus_epi16(fix8_16, fix8_16);
- SkPMColor c = _mm_cvtsi128_si32(fix8);
- SkPMColorAssert(c);
- return c;
-}
-
-inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
- SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
- // Haven't beaten this yet.
- *a = FromPMColor(colors[0]);
- *b = FromPMColor(colors[1]);
- *c = FromPMColor(colors[2]);
- *d = FromPMColor(colors[3]);
-}
-
-inline void SkPMFloat::RoundTo4PMColors(
- const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
- SkPMColor colors[4]) {
- // Haven't beaten this yet.
- RoundClampTo4PMColors(a,b,c,d, colors);
-}
-
-inline void SkPMFloat::RoundClampTo4PMColors(
- const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
- SkPMColor colors[4]) {
- // Same as _SSSE3.h's. We use 3 _mm_packus_epi16() where the naive loop uses 8.
- // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
- __m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), a.fVec)),
- c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), b.fVec)),
- c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), c.fVec)),
- c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), d.fVec));
- __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
- _mm_packus_epi16(c2, c3));
- _mm_storeu_si128((__m128i*)colors, c3210);
- SkPMColorAssert(colors[0]);
- SkPMColorAssert(colors[1]);
- SkPMColorAssert(colors[2]);
- SkPMColorAssert(colors[3]);
-}
-
-} // namespace
diff --git a/src/opts/SkPMFloat_SSSE3.h b/src/opts/SkPMFloat_SSSE3.h
deleted file mode 100644
index 67116ec2dd..0000000000
--- a/src/opts/SkPMFloat_SSSE3.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-namespace { // See SkPMFloat.h
-
-// For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit components in 32 bits
-// (fix8_32), then convert those to floats.
-
-// round() does the opposite, working from floats to 8-bit-in-32-bits, then back to packed 8 bit.
-
-// roundClamp() is the same as _SSE2: floats to 8-in-32, to 8-in-16, to packed 8 bit, with
-// _mm_packus_epi16() both clamping and narrowing.
-
-inline SkPMFloat::SkPMFloat(SkPMColor c) {
- SkPMColorAssert(c);
- const int _ = 255; // _ means to zero that byte.
- __m128i fix8 = _mm_set_epi32(0,0,0,c),
- fix8_32 = _mm_shuffle_epi8(fix8, _mm_set_epi8(_,_,_,3, _,_,_,2, _,_,_,1, _,_,_,0));
- fVec = _mm_cvtepi32_ps(fix8_32);
- SkASSERT(this->isValid());
-}
-
-inline SkPMColor SkPMFloat::trunc() const {
- const int _ = 255; // _ means to zero that byte.
- __m128i fix8_32 = _mm_cvttps_epi32(fVec),
- fix8 = _mm_shuffle_epi8(fix8_32, _mm_set_epi8(_,_,_,_, _,_,_,_, _,_,_,_, 12,8,4,0));
- SkPMColor c = _mm_cvtsi128_si32(fix8);
- SkPMColorAssert(c);
- return c;
-}
-
-inline SkPMColor SkPMFloat::round() const {
- return SkPMFloat(Sk4f(0.5f) + *this).trunc();
-}
-
-inline SkPMColor SkPMFloat::roundClamp() const {
- // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
- __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fVec)),
- fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
- fix8 = _mm_packus_epi16(fix8_16, fix8_16);
- SkPMColor c = _mm_cvtsi128_si32(fix8);
- SkPMColorAssert(c);
- return c;
-}
-
-inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
- SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
- // Haven't beaten this yet.
- *a = FromPMColor(colors[0]);
- *b = FromPMColor(colors[1]);
- *c = FromPMColor(colors[2]);
- *d = FromPMColor(colors[3]);
-}
-
-inline void SkPMFloat::RoundTo4PMColors(
- const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
- SkPMColor colors[4]) {
- // Haven't beaten this yet. Still faster than RoundClampTo4PMColors?
- colors[0] = a.round();
- colors[1] = b.round();
- colors[2] = c.round();
- colors[3] = d.round();
-}
-
-inline void SkPMFloat::RoundClampTo4PMColors(
- const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
- SkPMColor colors[4]) {
- // Same as _SSE2.h's. We use 3 _mm_packus_epi16() where the naive loop uses 8.
- // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
- __m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), a.fVec)),
- c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), b.fVec)),
- c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), c.fVec)),
- c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), d.fVec));
- __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
- _mm_packus_epi16(c2, c3));
- _mm_storeu_si128((__m128i*)colors, c3210);
- SkPMColorAssert(colors[0]);
- SkPMColorAssert(colors[1]);
- SkPMColorAssert(colors[2]);
- SkPMColorAssert(colors[3]);
-}
-
-} // namespace
diff --git a/src/opts/SkPMFloat_neon.h b/src/opts/SkPMFloat_neon.h
index cabb29a2a8..57f613751d 100644
--- a/src/opts/SkPMFloat_neon.h
+++ b/src/opts/SkPMFloat_neon.h
@@ -7,70 +7,24 @@
namespace { // See SkPMFloat.h
-// For SkPMFloat(SkPMFColor), we widen our 8 bit components (fix8) to 8-bit components in 16 bits
-// (fix8_16), then widen those to 8-bit-in-32-bits (fix8_32), and finally convert those to floats.
-
-// round() and roundClamp() do the opposite, working from floats to 8-bit-in-32-bit,
-// to 8-bit-in-16-bit, back down to 8-bit components.
-// roundClamp() uses vqmovn to clamp while narrowing instead of just narrowing with vmovn.
-
inline SkPMFloat::SkPMFloat(SkPMColor c) {  // Widens the four bytes of c to 32-bit lanes, then converts them to floats.
SkPMColorAssert(c);
uint8x8_t fix8 = (uint8x8_t)vdup_n_u32(c);  // c duplicated into both 32-bit halves, viewed as 8 bytes
uint16x8_t fix8_16 = vmovl_u8(fix8);  // zero-extend 8 -> 16 bits
uint32x4_t fix8_32 = vmovl_u16(vget_low_u16(fix8_16));  // keep the low four lanes, zero-extend 16 -> 32 bits
- fVec = vcvtq_f32_u32(fix8_32);
+ fVec = vcvtq_n_f32_u32(fix8_32, 8);  // fixed-point convert with 8 fractional bits. NOTE(review): that divides by 256, while the SSE and portable versions multiply by 1.0f/255 - confirm this is intended.
SkASSERT(this->isValid());
}
-inline SkPMColor SkPMFloat::trunc() const {
- uint32x4_t fix8_32 = vcvtq_u32_f32(fVec); // vcvtq_u32_f32 truncates
- uint16x4_t fix8_16 = vmovn_u32(fix8_32);
- uint8x8_t fix8 = vmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
- SkPMColor c = vget_lane_u32((uint32x2_t)fix8, 0);
- SkPMColorAssert(c);
- return c;
-}
-
inline SkPMColor SkPMFloat::round() const {  // Converts the four float lanes back into one packed SkPMColor.
- return SkPMFloat(Sk4f(0.5f) + *this).trunc();
-}
-
-inline SkPMColor SkPMFloat::roundClamp() const {
- float32x4_t add_half = vaddq_f32(fVec, vdupq_n_f32(0.5f));
- uint32x4_t fix8_32 = vcvtq_u32_f32(add_half); // vcvtq_u32_f32 truncates, so round manually
- uint16x4_t fix8_16 = vqmovn_u32(fix8_32);
- uint8x8_t fix8 = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
+ // vcvtq_n_u32_f32 truncates, so we round manually: add half of one 1/255 step, then convert.
+ float32x4_t rounded = vaddq_f32(fVec, vdupq_n_f32(0.5f/255));
+ uint32x4_t fix8_32 = vcvtq_n_u32_f32(rounded, 8);  // float -> fixed point with 8 fractional bits. NOTE(review): scales by 256, not 255 - confirm together with the constructor.
+ uint16x4_t fix8_16 = vqmovn_u32(fix8_32);  // saturating narrow 32 -> 16 bits
+ uint8x8_t fix8 = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));  // pad to 8 lanes with zeros, saturating narrow 16 -> 8 bits
SkPMColor c = vget_lane_u32((uint32x2_t)fix8, 0);
SkPMColorAssert(c);
return c;
}
-// TODO: we should be able to beat these loops on all three methods.
-inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
- SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
- *a = FromPMColor(colors[0]);
- *b = FromPMColor(colors[1]);
- *c = FromPMColor(colors[2]);
- *d = FromPMColor(colors[3]);
-}
-
-inline void SkPMFloat::RoundTo4PMColors(
- const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
- SkPMColor colors[4]) {
- colors[0] = a.round();
- colors[1] = b.round();
- colors[2] = c.round();
- colors[3] = d.round();
-}
-
-inline void SkPMFloat::RoundClampTo4PMColors(
- const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
- SkPMColor colors[4]) {
- colors[0] = a.roundClamp();
- colors[1] = b.roundClamp();
- colors[2] = c.roundClamp();
- colors[3] = d.roundClamp();
-}
-
} // namespace
diff --git a/src/opts/SkPMFloat_none.h b/src/opts/SkPMFloat_none.h
index 9bb584ed86..ba773564d6 100644
--- a/src/opts/SkPMFloat_none.h
+++ b/src/opts/SkPMFloat_none.h
@@ -8,61 +8,26 @@
namespace { // See SkPMFloat.h
inline SkPMFloat::SkPMFloat(SkPMColor c) {  // Unpacks c one component at a time and scales each one by 1/255.
- *this = SkPMFloat::FromARGB(SkGetPackedA32(c),
- SkGetPackedR32(c),
- SkGetPackedG32(c),
- SkGetPackedB32(c));
+ float inv255 = 1.0f/255;  // multiply by the reciprocal instead of dividing each component
+ *this = SkPMFloat::FromARGB(SkGetPackedA32(c) * inv255,
+ SkGetPackedR32(c) * inv255,
+ SkGetPackedG32(c) * inv255,
+ SkGetPackedB32(c) * inv255);
SkASSERT(this->isValid());
}
-inline SkPMColor SkPMFloat::trunc() const {
- return SkPackARGB32(this->a(), this->r(), this->g(), this->b());
-}
-
inline SkPMColor SkPMFloat::round() const {  // Clamps each component to [0, 1], then packs it as 255*x + 0.5.
- SkPMColor c = SkPackARGB32(this->a()+0.5f, this->r()+0.5f, this->g()+0.5f, this->b()+0.5f);
- SkPMColorAssert(c);
- return c;
-}
-
-inline SkPMColor SkPMFloat::roundClamp() const {
float a = this->a(),
r = this->r(),
g = this->g(),
b = this->b();
- a = a < 0 ? 0 : (a > 255 ? 255 : a);
- r = r < 0 ? 0 : (r > 255 ? 255 : r);
- g = g < 0 ? 0 : (g > 255 ? 255 : g);
- b = b < 0 ? 0 : (b > 255 ? 255 : b);
- SkPMColor c = SkPackARGB32(a+0.5f, r+0.5f, g+0.5f, b+0.5f);
+ a = a < 0 ? 0 : (a > 1 ? 1 : a);  // clamp each component to [0, 1] before scaling
+ r = r < 0 ? 0 : (r > 1 ? 1 : r);
+ g = g < 0 ? 0 : (g > 1 ? 1 : g);
+ b = b < 0 ? 0 : (b > 1 ? 1 : b);
+ SkPMColor c = SkPackARGB32(255*a+0.5f, 255*r+0.5f, 255*g+0.5f, 255*b+0.5f);  // scale to [0, 255]; the +0.5f presumably makes a truncating float->int conversion round - assumes SkPackARGB32 takes integers, TODO confirm
SkPMColorAssert(c);
return c;
}
-inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
- SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
- *a = FromPMColor(colors[0]);
- *b = FromPMColor(colors[1]);
- *c = FromPMColor(colors[2]);
- *d = FromPMColor(colors[3]);
-}
-
-inline void SkPMFloat::RoundTo4PMColors(
- const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
- SkPMColor colors[4]) {
- colors[0] = a.round();
- colors[1] = b.round();
- colors[2] = c.round();
- colors[3] = d.round();
-}
-
-inline void SkPMFloat::RoundClampTo4PMColors(
- const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
- SkPMColor colors[4]) {
- colors[0] = a.roundClamp();
- colors[1] = b.roundClamp();
- colors[2] = c.roundClamp();
- colors[3] = d.roundClamp();
-}
-
} // namespace
diff --git a/src/opts/SkPMFloat_sse.h b/src/opts/SkPMFloat_sse.h
new file mode 100644
index 0000000000..802b17ba0c
--- /dev/null
+++ b/src/opts/SkPMFloat_sse.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+namespace { // See SkPMFloat.h
+
+inline SkPMFloat::SkPMFloat(SkPMColor c) {  // Unpacks the four bytes of c into four floats, each scaled by 1/255.
+ SkPMColorAssert(c);
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
+ const int _ = 255; // Shuffle index with its high bit set: _mm_shuffle_epi8 writes zero to that byte.
+ __m128i fix8 = _mm_cvtsi32_si128((int)c),  // c in the low 32 bits, remaining bits zero
+ fix8_32 = _mm_shuffle_epi8(fix8, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_));  // byte k of c -> low byte of 32-bit lane k
+#else
+ __m128i fix8 = _mm_cvtsi32_si128((int)c),  // c in the low 32 bits, remaining bits zero
+ fix8_16 = _mm_unpacklo_epi8 (fix8, _mm_setzero_si128()),  // interleave with zero bytes: widen 8 -> 16 bits
+ fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128());  // interleave with zero words: widen 16 -> 32 bits
+#endif
+ fVec = _mm_mul_ps(_mm_cvtepi32_ps(fix8_32), _mm_set1_ps(1.0f / 255));  // int32 -> float, then multiply by 1/255
+ SkASSERT(this->isValid());
+}
+
+inline SkPMColor SkPMFloat::round() const {  // Packs the four floats into one SkPMColor: scale by 255, add 0.5, truncate, narrow.
+ // We add 0.5 and truncate instead of using _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
+ __m128 scaled = _mm_mul_ps(_mm_set1_ps(255), fVec);
+ __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), scaled)),  // truncating float -> int32
+ fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),  // narrow with unsigned saturation. NOTE(review): the intrinsic reads signed 16-bit lanes, so clamping of very large inputs should be confirmed.
+ fix8 = _mm_packus_epi16(fix8_16, fix8_16);  // second saturating narrow, down to bytes
+ SkPMColor c = _mm_cvtsi128_si32(fix8);  // take the low 32 bits as the packed color
+ SkPMColorAssert(c);
+ return c;
+}
+
+} // namespace
diff --git a/tests/PMFloatTest.cpp b/tests/PMFloatTest.cpp
index 1c53081571..b7b39411d1 100644
--- a/tests/PMFloatTest.cpp
+++ b/tests/PMFloatTest.cpp
@@ -12,47 +12,27 @@ DEF_TEST(SkPMFloat, r) {
// Test SkPMColor <-> SkPMFloat
SkPMColor c = SkPreMultiplyColor(0xFFCC9933);
SkPMFloat pmf(c);
- REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, pmf.a()));
- REPORTER_ASSERT(r, SkScalarNearlyEqual(204.0f, pmf.r()));
- REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, pmf.g()));
- REPORTER_ASSERT(r, SkScalarNearlyEqual( 51.0f, pmf.b()));
+ REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, 255*pmf.a()));
+ REPORTER_ASSERT(r, SkScalarNearlyEqual(204.0f, 255*pmf.r()));
+ REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, 255*pmf.g()));
+ REPORTER_ASSERT(r, SkScalarNearlyEqual( 51.0f, 255*pmf.b()));
REPORTER_ASSERT(r, c == pmf.round());
// Test rounding.
- pmf = SkPMFloat(254.5f, 203.5f, 153.1f, 50.8f);
+ pmf = SkPMFloat(254.5f/255, 203.5f/255, 153.1f/255, 50.8f/255);
REPORTER_ASSERT(r, c == pmf.round());
- pmf = SkPMFloat(255.9f, 204.01f, 153.0f, -0.9f);
- REPORTER_ASSERT(r, SkPreMultiplyColor(0xFFCC9900) == pmf.trunc());
-
- // Test clamping.
- SkPMFloat clamped(SkPMFloat(510.0f, 153.0f, 1.0f, -0.2f).roundClamp());
- REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, clamped.a()));
- REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, clamped.r()));
- REPORTER_ASSERT(r, SkScalarNearlyEqual( 1.0f, clamped.g()));
- REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.0f, clamped.b()));
+ SkPMFloat clamped(SkPMFloat(510.0f/255, 153.0f/255, 1.0f/255, -0.2f/255).round());
+ REPORTER_ASSERT(r, SkScalarNearlyEqual(255.0f, 255*clamped.a()));
+ REPORTER_ASSERT(r, SkScalarNearlyEqual(153.0f, 255*clamped.r()));
+ REPORTER_ASSERT(r, SkScalarNearlyEqual( 1.0f, 255*clamped.g()));
+ REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.0f, 255*clamped.b()));
// Test SkPMFloat <-> Sk4f conversion.
Sk4f fs = clamped;
SkPMFloat scaled = fs * Sk4f(0.25f);
- REPORTER_ASSERT(r, SkScalarNearlyEqual(63.75f, scaled.a()));
- REPORTER_ASSERT(r, SkScalarNearlyEqual(38.25f, scaled.r()));
- REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.25f, scaled.g()));
- REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.00f, scaled.b()));
-
- // Test 4-at-a-time conversions.
- SkPMColor colors[4] = { 0xFF000000, 0xFFFF0000, 0xFF00FF00, 0xFF0000FF };
- SkPMFloat floats[4];
- SkPMFloat::From4PMColors(colors, floats+0, floats+1, floats+2, floats+3);
-
- SkPMColor back[4];
- SkPMFloat::RoundTo4PMColors(floats[0], floats[1], floats[2], floats[3], back);
- for (int i = 0; i < 4; i++) {
- REPORTER_ASSERT(r, back[i] == colors[i]);
- }
-
- SkPMFloat::RoundClampTo4PMColors(floats[0], floats[1], floats[2], floats[3], back);
- for (int i = 0; i < 4; i++) {
- REPORTER_ASSERT(r, back[i] == colors[i]);
- }
+ REPORTER_ASSERT(r, SkScalarNearlyEqual(63.75f, 255*scaled.a()));
+ REPORTER_ASSERT(r, SkScalarNearlyEqual(38.25f, 255*scaled.r()));
+ REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.25f, 255*scaled.g()));
+ REPORTER_ASSERT(r, SkScalarNearlyEqual( 0.00f, 255*scaled.b()));
}