Convert SkPMFloat to [0,1] range and prune its API.

Now that Sk4px exists, there's a lot less sense in eking out every cycle of speed from SkPMFloat: if we need to go _really_ fast, we should use Sk4px. SkPMFloat's going to be used for things that are already slow: large-range intermediates, divides, sqrts, etc. A [0,1] range is easier to work with, and can even be faster if we eliminate enough *255 and *1/255 steps. This is particularly true on ARM, where NEON can do the *255 and /255 steps for us while converting float<->int. We have lots of experimental SkPMFloat <-> SkPMColor APIs that I'm now removing. Of the existing APIs, roundClamp() is the sanest, so I've kept only that, now called round(). The 4-at-a-time APIs never panned out, so they're gone. There will be small diffs on: colormatrix, coloremoji, colorfilterimagefilter, fadefilter, imagefilters_xfermodes, imagefilterscropexpand, imagefiltersgraph, tileimagefilter. BUG=skia: Review URL: https://codereview.chromium.org/1201343004
author: mtklein <mtklein@chromium.org> 2015-06-25 08:56:28 -0700
committer: Commit bot <commit-bot@chromium.org> 2015-06-25 08:56:28 -0700
commit: e9a3e3c17a313942042d6cfb9f4f0361a900d9e7 (patch)
tree: 913d6d293e2578f223ec82f2bbac69fbf43b711b /src/opts
parent: 538bacb4bb3ceac7786108cd68b04ed58b1c29c7 (diff)
5 files changed, 52 insertions, 268 deletions
diff --git a/src/opts/SkPMFloat_SSE2.h b/src/opts/SkPMFloat_SSE2.h
deleted file mode 100644
index c7e791ff62..0000000000
--- a/src/opts/SkPMFloat_SSE2.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-namespace {  // See SkPMFloat.h
-
-// For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit components in 16 bits
-// (fix8_16), then widen those to 8-bit-in-32-bits (fix8_32), and finally convert those to floats.
-
-// round() and roundClamp() do the opposite, working from floats to 8-bit-in-32-bit,
-// to 8-bit-in-16-bit, back down to 8-bit components.
-// _mm_packus_epi16() gives us clamping for free while narrowing.
-
-inline SkPMFloat::SkPMFloat(SkPMColor c) {
-    SkPMColorAssert(c);
-    __m128i fix8    = _mm_set_epi32(0,0,0,c),
-            fix8_16 = _mm_unpacklo_epi8 (fix8,    _mm_setzero_si128()),
-            fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128());
-    fVec = _mm_cvtepi32_ps(fix8_32);
-    SkASSERT(this->isValid());
-}
-
-inline SkPMColor SkPMFloat::round() const {
-    return this->roundClamp();  // Haven't beaten this yet.
-}
-
-inline SkPMColor SkPMFloat::roundClamp() const {
-    // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
-    __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fVec)),
-            fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
-            fix8    = _mm_packus_epi16(fix8_16, fix8_16);
-    SkPMColor c = _mm_cvtsi128_si32(fix8);
-    SkPMColorAssert(c);
-    return c;
-}
-
-inline SkPMColor SkPMFloat::trunc() const {
-    // Basically, same as roundClamp(), but no rounding.
-    __m128i fix8_32 = _mm_cvttps_epi32(fVec),
-            fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
-            fix8    = _mm_packus_epi16(fix8_16, fix8_16);
-    SkPMColor c = _mm_cvtsi128_si32(fix8);
-    SkPMColorAssert(c);
-    return c;
-}
-
-inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
-                                     SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
-    // Haven't beaten this yet.
-    *a = FromPMColor(colors[0]);
-    *b = FromPMColor(colors[1]);
-    *c = FromPMColor(colors[2]);
-    *d = FromPMColor(colors[3]);
-}
-
-inline void SkPMFloat::RoundTo4PMColors(
-        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
-        SkPMColor colors[4]) {
-    // Haven't beaten this yet.
-    RoundClampTo4PMColors(a,b,c,d, colors);
-}
-
-inline void SkPMFloat::RoundClampTo4PMColors(
-        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
-        SkPMColor colors[4]) {
-    // Same as _SSSE3.h's.  We use 3 _mm_packus_epi16() where the naive loop uses 8.
-    // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
-    __m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), a.fVec)),
-            c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), b.fVec)),
-            c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), c.fVec)),
-            c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), d.fVec));
-    __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
-                                     _mm_packus_epi16(c2, c3));
-    _mm_storeu_si128((__m128i*)colors, c3210);
-    SkPMColorAssert(colors[0]);
-    SkPMColorAssert(colors[1]);
-    SkPMColorAssert(colors[2]);
-    SkPMColorAssert(colors[3]);
-}
-
-}  // namespace
diff --git a/src/opts/SkPMFloat_SSSE3.h b/src/opts/SkPMFloat_SSSE3.h
deleted file mode 100644
index 67116ec2dd..0000000000
--- a/src/opts/SkPMFloat_SSSE3.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-namespace {  // See SkPMFloat.h
-
-// For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit components in 32 bits
-// (fix8_32), then convert those to floats.
-
-// round() does the opposite, working from floats to 8-bit-in-32-bits, then back to packed 8 bit.
-
-// roundClamp() is the same as _SSE2: floats to 8-in-32, to 8-in-16, to packed 8 bit, with
-// _mm_packus_epi16() both clamping and narrowing.
-
-inline SkPMFloat::SkPMFloat(SkPMColor c) {
-    SkPMColorAssert(c);
-    const int _ = 255;  // _ means to zero that byte.
-    __m128i fix8    = _mm_set_epi32(0,0,0,c),
-            fix8_32 = _mm_shuffle_epi8(fix8, _mm_set_epi8(_,_,_,3, _,_,_,2, _,_,_,1, _,_,_,0));
-    fVec = _mm_cvtepi32_ps(fix8_32);
-    SkASSERT(this->isValid());
-}
-
-inline SkPMColor SkPMFloat::trunc() const {
-    const int _ = 255;  // _ means to zero that byte.
-    __m128i fix8_32 = _mm_cvttps_epi32(fVec),
-            fix8    = _mm_shuffle_epi8(fix8_32, _mm_set_epi8(_,_,_,_, _,_,_,_, _,_,_,_, 12,8,4,0));
-    SkPMColor c = _mm_cvtsi128_si32(fix8);
-    SkPMColorAssert(c);
-    return c;
-}
-
-inline SkPMColor SkPMFloat::round() const {
-    return SkPMFloat(Sk4f(0.5f) + *this).trunc();
-}
-
-inline SkPMColor SkPMFloat::roundClamp() const {
-    // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
-    __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fVec)),
-            fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
-            fix8    = _mm_packus_epi16(fix8_16, fix8_16);
-    SkPMColor c = _mm_cvtsi128_si32(fix8);
-    SkPMColorAssert(c);
-    return c;
-}
-
-inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
-                                     SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
-    // Haven't beaten this yet.
-    *a = FromPMColor(colors[0]);
-    *b = FromPMColor(colors[1]);
-    *c = FromPMColor(colors[2]);
-    *d = FromPMColor(colors[3]);
-}
-
-inline void SkPMFloat::RoundTo4PMColors(
-        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
-        SkPMColor colors[4]) {
-    // Haven't beaten this yet.  Still faster than RoundClampTo4PMColors?
-    colors[0] = a.round();
-    colors[1] = b.round();
-    colors[2] = c.round();
-    colors[3] = d.round();
-}
-
-inline void SkPMFloat::RoundClampTo4PMColors(
-        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
-        SkPMColor colors[4]) {
-    // Same as _SSE2.h's.  We use 3 _mm_packus_epi16() where the naive loop uses 8.
-    // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
-    __m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), a.fVec)),
-            c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), b.fVec)),
-            c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), c.fVec)),
-            c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), d.fVec));
-    __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
-                                     _mm_packus_epi16(c2, c3));
-    _mm_storeu_si128((__m128i*)colors, c3210);
-    SkPMColorAssert(colors[0]);
-    SkPMColorAssert(colors[1]);
-    SkPMColorAssert(colors[2]);
-    SkPMColorAssert(colors[3]);
-}
-
-}  // namespace
diff --git a/src/opts/SkPMFloat_neon.h b/src/opts/SkPMFloat_neon.h
index cabb29a2a8..57f613751d 100644
--- a/src/opts/SkPMFloat_neon.h
+++ b/src/opts/SkPMFloat_neon.h
@@ -7,70 +7,24 @@
 
 namespace { // See SkPMFloat.h
 
-// For SkPMFloat(SkPMFColor), we widen our 8 bit components (fix8) to 8-bit components in 16 bits
-// (fix8_16), then widen those to 8-bit-in-32-bits (fix8_32), and finally convert those to floats.
-
-// round() and roundClamp() do the opposite, working from floats to 8-bit-in-32-bit,
-// to 8-bit-in-16-bit, back down to 8-bit components.
-// roundClamp() uses vqmovn to clamp while narrowing instead of just narrowing with vmovn.
-
 inline SkPMFloat::SkPMFloat(SkPMColor c) {
     SkPMColorAssert(c);
     uint8x8_t   fix8    = (uint8x8_t)vdup_n_u32(c);
     uint16x8_t  fix8_16 = vmovl_u8(fix8);
     uint32x4_t  fix8_32 = vmovl_u16(vget_low_u16(fix8_16));
-    fVec = vcvtq_f32_u32(fix8_32);
+    fVec = vcvtq_n_f32_u32(fix8_32, 8);
     SkASSERT(this->isValid());
 }
 
-inline SkPMColor SkPMFloat::trunc() const {
-    uint32x4_t  fix8_32  = vcvtq_u32_f32(fVec);  // vcvtq_u32_f32 truncates
-    uint16x4_t  fix8_16  = vmovn_u32(fix8_32);
-    uint8x8_t   fix8     = vmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
-    SkPMColor c = vget_lane_u32((uint32x2_t)fix8, 0);
-    SkPMColorAssert(c);
-    return c;
-}
-
 inline SkPMColor SkPMFloat::round() const {
-    return SkPMFloat(Sk4f(0.5f) + *this).trunc();
-}
-
-inline SkPMColor SkPMFloat::roundClamp() const {
-    float32x4_t add_half = vaddq_f32(fVec, vdupq_n_f32(0.5f));
-    uint32x4_t  fix8_32  = vcvtq_u32_f32(add_half);  // vcvtq_u32_f32 truncates, so round manually
-    uint16x4_t  fix8_16  = vqmovn_u32(fix8_32);
-    uint8x8_t   fix8     = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
+    // vcvtq_n_u32_f32 truncates, so we round manually by adding a half before converting.
+    float32x4_t rounded = vaddq_f32(fVec, vdupq_n_f32(0.5f/255));
+    uint32x4_t  fix8_32 = vcvtq_n_u32_f32(rounded, 8);
+    uint16x4_t  fix8_16 = vqmovn_u32(fix8_32);
+    uint8x8_t   fix8    = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
     SkPMColor c = vget_lane_u32((uint32x2_t)fix8, 0);
     SkPMColorAssert(c);
     return c;
 }
 
-// TODO: we should be able to beat these loops on all three methods.
-inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
-                                     SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
-    *a = FromPMColor(colors[0]);
-    *b = FromPMColor(colors[1]);
-    *c = FromPMColor(colors[2]);
-    *d = FromPMColor(colors[3]);
-}
-
-inline void SkPMFloat::RoundTo4PMColors(
-        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
-        SkPMColor colors[4]) {
-    colors[0] = a.round();
-    colors[1] = b.round();
-    colors[2] = c.round();
-    colors[3] = d.round();
-}
-
-inline void SkPMFloat::RoundClampTo4PMColors(
-        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
-        SkPMColor colors[4]) {
-    colors[0] = a.roundClamp();
-    colors[1] = b.roundClamp();
-    colors[2] = c.roundClamp();
-    colors[3] = d.roundClamp();
-}
-
 }  // namespace
diff --git a/src/opts/SkPMFloat_none.h b/src/opts/SkPMFloat_none.h
index 9bb584ed86..ba773564d6 100644
--- a/src/opts/SkPMFloat_none.h
+++ b/src/opts/SkPMFloat_none.h
@@ -8,61 +8,26 @@
 namespace {  // See SkPMFloat.h
 
 inline SkPMFloat::SkPMFloat(SkPMColor c) {
-    *this = SkPMFloat::FromARGB(SkGetPackedA32(c),
-                                SkGetPackedR32(c),
-                                SkGetPackedG32(c),
-                                SkGetPackedB32(c));
+    float inv255 = 1.0f/255;
+    *this = SkPMFloat::FromARGB(SkGetPackedA32(c) * inv255,
+                                SkGetPackedR32(c) * inv255,
+                                SkGetPackedG32(c) * inv255,
+                                SkGetPackedB32(c) * inv255);
     SkASSERT(this->isValid());
 }
 
-inline SkPMColor SkPMFloat::trunc() const {
-    return SkPackARGB32(this->a(), this->r(), this->g(), this->b());
-}
-
 inline SkPMColor SkPMFloat::round() const {
-    SkPMColor c = SkPackARGB32(this->a()+0.5f, this->r()+0.5f, this->g()+0.5f, this->b()+0.5f);
-    SkPMColorAssert(c);
-    return c;
-}
-
-inline SkPMColor SkPMFloat::roundClamp() const {
     float a = this->a(),
           r = this->r(),
           g = this->g(),
           b = this->b();
-    a = a < 0 ? 0 : (a > 255 ? 255 : a);
-    r = r < 0 ? 0 : (r > 255 ? 255 : r);
-    g = g < 0 ? 0 : (g > 255 ? 255 : g);
-    b = b < 0 ? 0 : (b > 255 ? 255 : b);
-    SkPMColor c = SkPackARGB32(a+0.5f, r+0.5f, g+0.5f, b+0.5f);
+    a = a < 0 ? 0 : (a > 1 ? 1 : a);
+    r = r < 0 ? 0 : (r > 1 ? 1 : r);
+    g = g < 0 ? 0 : (g > 1 ? 1 : g);
+    b = b < 0 ? 0 : (b > 1 ? 1 : b);
+    SkPMColor c = SkPackARGB32(255*a+0.5f, 255*r+0.5f, 255*g+0.5f, 255*b+0.5f);
     SkPMColorAssert(c);
     return c;
 }
 
-inline void SkPMFloat::From4PMColors(const SkPMColor colors[4],
-                                     SkPMFloat* a, SkPMFloat* b, SkPMFloat* c, SkPMFloat* d) {
-    *a = FromPMColor(colors[0]);
-    *b = FromPMColor(colors[1]);
-    *c = FromPMColor(colors[2]);
-    *d = FromPMColor(colors[3]);
-}
-
-inline void SkPMFloat::RoundTo4PMColors(
-        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
-        SkPMColor colors[4]) {
-    colors[0] = a.round();
-    colors[1] = b.round();
-    colors[2] = c.round();
-    colors[3] = d.round();
-}
-
-inline void SkPMFloat::RoundClampTo4PMColors(
-        const SkPMFloat& a, const SkPMFloat& b, const SkPMFloat&c, const SkPMFloat& d,
-        SkPMColor colors[4]) {
-    colors[0] = a.roundClamp();
-    colors[1] = b.roundClamp();
-    colors[2] = c.roundClamp();
-    colors[3] = d.roundClamp();
-}
-
 }  // namespace
diff --git a/src/opts/SkPMFloat_sse.h b/src/opts/SkPMFloat_sse.h
new file mode 100644
index 0000000000..802b17ba0c
--- /dev/null
+++ b/src/opts/SkPMFloat_sse.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+namespace {  // See SkPMFloat.h
+
+inline SkPMFloat::SkPMFloat(SkPMColor c) {
+    SkPMColorAssert(c);
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
+    const int _ = 255;  // Zero these bytes.
+    __m128i fix8    = _mm_cvtsi32_si128((int)c),
+            fix8_32 = _mm_shuffle_epi8(fix8, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_));
+#else
+    __m128i fix8    = _mm_cvtsi32_si128((int)c),
+            fix8_16 = _mm_unpacklo_epi8 (fix8,    _mm_setzero_si128()),
+            fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128());
+#endif
+    fVec = _mm_mul_ps(_mm_cvtepi32_ps(fix8_32), _mm_set1_ps(1.0f / 255));
+    SkASSERT(this->isValid());
+}
+
+inline SkPMColor SkPMFloat::round() const {
+    // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
+    __m128 scaled = _mm_mul_ps(_mm_set1_ps(255), fVec);
+    __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), scaled)),
+            fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
+            fix8    = _mm_packus_epi16(fix8_16, fix8_16);
+    SkPMColor c = _mm_cvtsi128_si32(fix8);
+    SkPMColorAssert(c);
+    return c;
+}
+
+}  // namespace
author	mtklein <mtklein@chromium.org>	2015-06-25 08:56:28 -0700
committer	Commit bot <commit-bot@chromium.org>	2015-06-25 08:56:28 -0700
commit	e9a3e3c17a313942042d6cfb9f4f0361a900d9e7 (patch)
tree	913d6d293e2578f223ec82f2bbac69fbf43b711b /src/opts
parent	538bacb4bb3ceac7786108cd68b04ed58b1c29c7 (diff)