Move float<->byte conversions into Sk4f.

This lets us avoid conversions to [0.0, 1.0] space and rounding that aren't necessary for SkColorCubeFilter_opts.h.

Dropping rounding on the way back to bytes means we'll see a bunch of off-by-1 diffs.

Rough perf effect:
    SSSE3: 110 -> 93  (~15%)
    NEON:  465 -> 375 (~20%)

This is the beginning of the end for SkPMFloat as an entity distinct from Sk4f. I've kept it for now so I can convert sites one by one and think about how things that really want to keep PM color order will work.

BUG=skia:4117

Review URL: https://codereview.chromium.org/1319413003
author: mtklein <mtklein@chromium.org> 2015-08-31 14:39:59 -0700
committer: Commit bot <commit-bot@chromium.org> 2015-08-31 14:39:59 -0700
commit: aba1dc8c6aa5cbc4f38ddfce757832359f200b54 (patch)
tree: 9c9ae11de774aa7d6658b291ef1b187a31916c19 /src/opts
parent: b2885d59bccb1c5045e52135c5dd7a85a57694fa (diff)
6 files changed, 48 insertions, 178 deletions
diff --git a/src/opts/SkColorCubeFilter_opts.h b/src/opts/SkColorCubeFilter_opts.h
index 41d2525b5a..afd6149cff 100644
--- a/src/opts/SkColorCubeFilter_opts.h
+++ b/src/opts/SkColorCubeFilter_opts.h
@@ -6,7 +6,7 @@
 #define SkColorCubeFilter_opts_DEFINED
 
 #include "SkColor.h"
-#include "SkPMFloat.h"
+#include "SkNx.h"
 #include "SkUnPreMultiply.h"
 
 namespace SK_OPTS_NS {
@@ -18,7 +18,6 @@ void color_cube_filter_span(const SkPMColor src[],
                             const SkScalar* colorToFactors[2],
                             int dim,
                             const SkColor* colorCube) {
-    uint8_t* ptr_dst = reinterpret_cast<uint8_t*>(dst);
     uint8_t r, g, b, a;
 
     for (int i = 0; i < count; ++i) {
@@ -51,8 +50,7 @@ void color_cube_filter_span(const SkPMColor src[],
         const int i10 = (colorToIndex[1][g] + colorToIndex[0][b] * dim) * dim;
         const int i11 = (colorToIndex[1][g] + colorToIndex[1][b] * dim) * dim;
 
-        SkPMFloat color(0,0,0,0);
-
+        Sk4f color(0,0,0,0);
         for (int x = 0; x < 2; ++x) {
             const int ix = colorToIndex[x][r];
 
@@ -61,22 +59,23 @@ void color_cube_filter_span(const SkPMColor src[],
             const SkColor lutColor10 = colorCube[ix + i10];
             const SkColor lutColor11 = colorCube[ix + i11];
 
-            Sk4f  sum = SkPMFloat::FromOpaqueColor(lutColor00) * g0b0;
-            sum = sum + SkPMFloat::FromOpaqueColor(lutColor01) * g0b1;
-            sum = sum + SkPMFloat::FromOpaqueColor(lutColor10) * g1b0;
-            sum = sum + SkPMFloat::FromOpaqueColor(lutColor11) * g1b1;
-
+            Sk4f  sum = Sk4f::FromBytes((const uint8_t*)&lutColor00) * g0b0;
+            sum = sum + Sk4f::FromBytes((const uint8_t*)&lutColor01) * g0b1;
+            sum = sum + Sk4f::FromBytes((const uint8_t*)&lutColor10) * g1b0;
+            sum = sum + Sk4f::FromBytes((const uint8_t*)&lutColor11) * g1b1;
             color = color + sum * Sk4f((float)colorToFactors[x][r]);
         }
-
         if (a != 255) {
-            color = color * Sk4f(a * 1.0f/255);
+            color = color * Sk4f(a * (1.0f/255));
         }
 
-        dst[i] = color.round();
-
-        ptr_dst[SK_A32_SHIFT / 8] = a;
-        ptr_dst += 4;
+        // color is BGRA (SkColor order), dst is SkPMColor order, so may need to swap R+B.
+    #if defined(SK_PMCOLOR_IS_RGBA)
+        color = Sk4f(color.kth<2>(), color.kth<1>(), color.kth<0>(), color.kth<3>());
+    #endif
+        uint8_t* dstBytes = (uint8_t*)(dst+i);
+        color.toBytes(dstBytes);
+        dstBytes[SK_A32_SHIFT/8] = a;
     }
 }
 
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index 660b92c6b0..cf149862c5 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -223,9 +223,22 @@ public:
     SkNf() {}
     explicit SkNf(float val)           : fVec(vdupq_n_f32(val)) {}
     static SkNf Load(const float vals[4]) { return vld1q_f32(vals); }
+    static SkNf FromBytes(const uint8_t vals[4]) {
+        uint8x8_t   fix8    = (uint8x8_t)vld1_dup_u32((const uint32_t*)vals);
+        uint16x8_t  fix8_16 = vmovl_u8(fix8);
+        uint32x4_t  fix8_32 = vmovl_u16(vget_low_u16(fix8_16));
+        return SkNf(vcvtq_f32_u32(fix8_32));
+    }
+
     SkNf(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; }
 
     void store(float vals[4]) const { vst1q_f32(vals, fVec); }
+    void toBytes(uint8_t bytes[4]) const {
+        uint32x4_t  fix8_32 = vcvtq_u32_f32(fVec);
+        uint16x4_t  fix8_16 = vqmovn_u32(fix8_32);
+        uint8x8_t   fix8    = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
+        vst1_lane_u32((uint32_t*)bytes, (uint32x2_t)fix8, 0);
+    }
 
     SkNi<4, int> castTrunc() const { return vcvtq_s32_f32(fVec); }
 
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index e165f58737..093cd4c14c 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -155,9 +155,30 @@ public:
     SkNf() {}
     explicit SkNf(float val)           : fVec( _mm_set1_ps(val) ) {}
     static SkNf Load(const float vals[4]) { return _mm_loadu_ps(vals); }
+
+    static SkNf FromBytes(const uint8_t bytes[4]) {
+        __m128i fix8 = _mm_cvtsi32_si128(*(const int*)bytes);
+    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
+        const char _ = ~0;  // Zero these bytes.
+        __m128i fix8_32 = _mm_shuffle_epi8(fix8, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_));
+    #else
+        __m128i fix8_16 = _mm_unpacklo_epi8 (fix8,    _mm_setzero_si128()),
+                fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128());
+    #endif
+        return SkNf(_mm_cvtepi32_ps(fix8_32));
+        // TODO: use _mm_cvtepu8_epi32 w/SSE4.1?
+    }
+
     SkNf(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {}
 
     void store(float vals[4]) const { _mm_storeu_ps(vals, fVec); }
+    void toBytes(uint8_t bytes[4]) const {
+        __m128i fix8_32 = _mm_cvttps_epi32(fVec),
+                fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
+                fix8    = _mm_packus_epi16(fix8_16, fix8_16);
+        *(int*)bytes = _mm_cvtsi128_si32(fix8);
+        // TODO: use _mm_shuffle_epi8 w/SSSE3?
+    }
 
     SkNi<4, int> castTrunc() const { return _mm_cvttps_epi32(fVec); }
 
diff --git a/src/opts/SkPMFloat_neon.h b/src/opts/SkPMFloat_neon.h
deleted file mode 100644
index 4e099f91ec..0000000000
--- a/src/opts/SkPMFloat_neon.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-namespace { // See SkPMFloat.h
-
-static_assert(SK_A32_SHIFT == 24, "This file assumes little-endian.");
-
-inline SkPMFloat::SkPMFloat(SkPMColor c) {
-    SkPMColorAssert(c);
-    uint8x8_t   fix8    = (uint8x8_t)vdup_n_u32(c);
-    uint16x8_t  fix8_16 = vmovl_u8(fix8);
-    uint32x4_t  fix8_32 = vmovl_u16(vget_low_u16(fix8_16));
-    fVec = vmulq_f32(vcvtq_f32_u32(fix8_32), vdupq_n_f32(1.0f/255));
-    SkASSERT(this->isValid());
-}
-
-inline SkPMColor SkPMFloat::round() const {
-    // vcvt_u32_f32 truncates, so we round manually by adding a half before converting.
-    float32x4_t rounded = vmlaq_f32(vdupq_n_f32(0.5f), fVec, vdupq_n_f32(255));
-    uint32x4_t  fix8_32 = vcvtq_u32_f32(rounded);
-    uint16x4_t  fix8_16 = vqmovn_u32(fix8_32);
-    uint8x8_t   fix8    = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
-    SkPMColor c = vget_lane_u32((uint32x2_t)fix8, 0);
-    SkPMColorAssert(c);
-    return c;
-}
-
-inline Sk4f SkPMFloat::alphas() const {
-    return vdupq_lane_f32(vget_high_f32(fVec), 1);  // Duplicate high lane of high half i.e. lane 3.
-}
-
-inline SkPMFloat SkPMFloat::FromOpaqueColor(SkColor c) {
-    SkASSERT(SkColorGetA(c) == 0xFF);
-    uint8x8_t fix8 = (uint8x8_t)vdup_n_u32(c);
-#if defined(SK_PMCOLOR_IS_RGBA)
-    fix8 = vtbl1_u8(fix8, vcreate_u8(0x0300010203000102ULL));  // 03 00 01 02, 2x, i.e. swap R&B.
-#endif
-    uint16x8_t fix8_16 = vmovl_u8(fix8);
-    uint32x4_t fix8_32 = vmovl_u16(vget_low_u16(fix8_16));
-
-    SkPMFloat pmf = Sk4f(vmulq_f32(vcvtq_f32_u32(fix8_32), vdupq_n_f32(1.0f/255)));
-    SkASSERT(pmf.isValid());
-    return pmf;
-}
-
-}  // namespace
diff --git a/src/opts/SkPMFloat_none.h b/src/opts/SkPMFloat_none.h
deleted file mode 100644
index 17c76d90f4..0000000000
--- a/src/opts/SkPMFloat_none.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-namespace {  // See SkPMFloat.h
-
-inline SkPMFloat::SkPMFloat(SkPMColor c) {
-    float inv255 = 1.0f/255;
-    *this = SkPMFloat::FromARGB(SkGetPackedA32(c) * inv255,
-                                SkGetPackedR32(c) * inv255,
-                                SkGetPackedG32(c) * inv255,
-                                SkGetPackedB32(c) * inv255);
-    SkASSERT(this->isValid());
-}
-
-inline SkPMColor SkPMFloat::round() const {
-    float a = this->a(),
-          r = this->r(),
-          g = this->g(),
-          b = this->b();
-    a = a < 0 ? 0 : (a > 1 ? 1 : a);
-    r = r < 0 ? 0 : (r > 1 ? 1 : r);
-    g = g < 0 ? 0 : (g > 1 ? 1 : g);
-    b = b < 0 ? 0 : (b > 1 ? 1 : b);
-    SkPMColor c = SkPackARGB32(255*a+0.5f, 255*r+0.5f, 255*g+0.5f, 255*b+0.5f);
-    SkPMColorAssert(c);
-    return c;
-}
-
-inline Sk4f SkPMFloat::alphas() const {
-    return Sk4f(this->a());
-}
-
-inline SkPMFloat SkPMFloat::FromOpaqueColor(SkColor c) {
-  SkASSERT(SkColorGetA(c) == 0xFF);
-  float inv255 = 1.0f / 255;
-  SkPMFloat pmf = SkPMFloat::FromARGB(1.0f,
-                                      SkColorGetR(c) * inv255,
-                                      SkColorGetG(c) * inv255,
-                                      SkColorGetB(c) * inv255);
-  SkASSERT(pmf.isValid());
-  return pmf;
-}
-
-}  // namespace
diff --git a/src/opts/SkPMFloat_sse.h b/src/opts/SkPMFloat_sse.h
deleted file mode 100644
index 6a4d5b6b5c..0000000000
--- a/src/opts/SkPMFloat_sse.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-namespace {  // See SkPMFloat.h
-
-inline SkPMFloat::SkPMFloat(SkPMColor c) {
-    SkPMColorAssert(c);
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
-    const char _ = ~0;  // Zero these bytes.
-    __m128i fix8    = _mm_cvtsi32_si128((int)c),
-            fix8_32 = _mm_shuffle_epi8(fix8, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_));
-#else
-    __m128i fix8    = _mm_cvtsi32_si128((int)c),
-            fix8_16 = _mm_unpacklo_epi8 (fix8,    _mm_setzero_si128()),
-            fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128());
-#endif
-    fVec = _mm_mul_ps(_mm_cvtepi32_ps(fix8_32), _mm_set1_ps(1.0f / 255));
-    SkASSERT(this->isValid());
-}
-
-inline SkPMColor SkPMFloat::round() const {
-    // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
-    __m128 scaled = _mm_mul_ps(_mm_set1_ps(255), fVec);
-    __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), scaled)),
-            fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
-            fix8    = _mm_packus_epi16(fix8_16, fix8_16);
-    SkPMColor c = _mm_cvtsi128_si32(fix8);
-    SkPMColorAssert(c);
-    return c;
-}
-
-inline Sk4f SkPMFloat::alphas() const {
-    static_assert(SK_A32_SHIFT == 24, "");
-    return _mm_shuffle_ps(fVec, fVec, 0xff);  // Read as 11 11 11 11, copying lane 3 to all lanes.
-}
-
-inline SkPMFloat SkPMFloat::FromOpaqueColor(SkColor c) {
-    SkASSERT(SkColorGetA(c) == 0xFF);
-    __m128i fix8 = _mm_cvtsi32_si128((int)c);
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
-    const char _ = ~0;  // Zero these bytes.
-    __m128i fix8_32 = _mm_shuffle_epi8(fix8,
-    #if defined(SK_PMCOLOR_IS_BGRA)
-            _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_)
-    #else
-            _mm_setr_epi8(2,_,_,_, 1,_,_,_, 0,_,_,_, 3,_,_,_)
-    #endif
-    );
-#else
-    __m128i fix8_16 = _mm_unpacklo_epi8 (fix8   , _mm_setzero_si128()),
-            fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128());
-    #if defined(SK_PMCOLOR_IS_RGBA)
-        fix8_32 = _mm_shuffle_epi32(fix8_32, 0xC6);  // C6 == 11 00 01 10, i.e swap lanes 0 and 2.
-    #endif
-#endif
-    SkPMFloat pmf = Sk4f(_mm_mul_ps(_mm_cvtepi32_ps(fix8_32), _mm_set1_ps(1.0f/255)));
-    SkASSERT(pmf.isValid());
-    return pmf;
-}
-
-}  // namespace
author	mtklein <mtklein@chromium.org>	2015-08-31 14:39:59 -0700
committer	Commit bot <commit-bot@chromium.org>	2015-08-31 14:39:59 -0700
commit	aba1dc8c6aa5cbc4f38ddfce757832359f200b54 (patch)
tree	9c9ae11de774aa7d6658b291ef1b187a31916c19 /src/opts
parent	b2885d59bccb1c5045e52135c5dd7a85a57694fa (diff)