3 files changed, 50 insertions, 135 deletions
diff --git a/src/opts/SkColorXform_opts.h b/src/opts/SkColorXform_opts.h
index da0c9010f2..5af5c2aed7 100644
--- a/src/opts/SkColorXform_opts.h
+++ b/src/opts/SkColorXform_opts.h
@@ -8,6 +8,7 @@
 #ifndef SkColorXform_opts_DEFINED
 #define SkColorXform_opts_DEFINED
 
+#include "SkNx.h"
 #include "SkColorPriv.h"
 
 namespace SK_OPTS_NS {
@@ -146,103 +147,66 @@ extern const float linear_from_2dot2[256] = {
         0.974300202388861000f, 0.982826255053791000f, 0.991392843592940000f, 1.000000000000000000f,
 };
 
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+static Sk4f linear_to_2dot2(const Sk4f& x) {
+    // x^(29/64) is a very good approximation of the true value, x^(1/2.2).
+    auto x2  = x.rsqrt(),                            // x^(-1/2)
+         x32 = x2.rsqrt().rsqrt().rsqrt().rsqrt(),   // x^(-1/32)
+         x64 = x32.rsqrt();                          // x^(+1/64)
 
-// x^(29/64) is a very good approximation of the true value, x^(1/2.2).
-static __m128 linear_to_2dot2(__m128 x) {
-    // x^(-1/2)
-    __m128 x2  = _mm_rsqrt_ps(x);
-
-    // x^(-1/32)
-    __m128 x32 = _mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(x2))));
-
-    // x^(+1/64)
-    __m128 x64 = _mm_rsqrt_ps(x32);
-
-    // x^(+29/64) = x^(+1/2) * x^(-1/32) * x^(-1/64)
-    // Note that we also scale to the 0-255 range.
-    // These terms can be combined more minimally with 3 muls and 1 reciprocal.  However, this
-    // is faster, because it allows us to start the muls in parallel with the rsqrts.
-    __m128 scale = _mm_set1_ps(255.0f);
-    return _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(scale, _mm_rcp_ps(x2)), x32), _mm_rcp_ps(x64));
+    // 29 = 32 - 2 - 1
+    return 255.0f * x2.invert() * x32 * x64.invert();
 }
 
-static __m128 clamp_0_to_255(__m128 x) {
+static Sk4f clamp_0_to_255(const Sk4f& x) {
     // The order of the arguments is important here.  We want to make sure that NaN
     // clamps to zero.  Note that max(NaN, 0) = 0, while max(0, NaN) = NaN.
-    return _mm_min_ps(_mm_max_ps(x, _mm_setzero_ps()), _mm_set1_ps(255.0f));
+    return Sk4f::Min(Sk4f::Max(x, 0.0f), 255.0f);
 }
 
 template <const float (&linear_from_curve)[256]>
 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,
                              const float matrix[16]) {
     // Load transformation matrix.
-    __m128 rXgXbX = _mm_loadu_ps(&matrix[0]);
-    __m128 rYgYbY = _mm_loadu_ps(&matrix[4]);
-    __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]);
+    auto rXgXbX = Sk4f::Load(matrix + 0),
+         rYgYbY = Sk4f::Load(matrix + 4),
+         rZgZbZ = Sk4f::Load(matrix + 8);
 
     while (len >= 4) {
         // Convert to linear.  The look-up table has perfect accuracy.
-        __m128 reds   = _mm_setr_ps(linear_from_curve[(src[0] >>  0) & 0xFF],
-                                    linear_from_curve[(src[1] >>  0) & 0xFF],
-                                    linear_from_curve[(src[2] >>  0) & 0xFF],
-                                    linear_from_curve[(src[3] >>  0) & 0xFF]);
-        __m128 greens = _mm_setr_ps(linear_from_curve[(src[0] >>  8) & 0xFF],
-                                    linear_from_curve[(src[1] >>  8) & 0xFF],
-                                    linear_from_curve[(src[2] >>  8) & 0xFF],
-                                    linear_from_curve[(src[3] >>  8) & 0xFF]);
-        __m128 blues  = _mm_setr_ps(linear_from_curve[(src[0] >> 16) & 0xFF],
-                                    linear_from_curve[(src[1] >> 16) & 0xFF],
-                                    linear_from_curve[(src[2] >> 16) & 0xFF],
-                                    linear_from_curve[(src[3] >> 16) & 0xFF]);
+        auto reds   = Sk4f{linear_from_curve[(src[0] >>  0) & 0xFF],
+                           linear_from_curve[(src[1] >>  0) & 0xFF],
+                           linear_from_curve[(src[2] >>  0) & 0xFF],
+                           linear_from_curve[(src[3] >>  0) & 0xFF]};
+        auto greens = Sk4f{linear_from_curve[(src[0] >>  8) & 0xFF],
+                           linear_from_curve[(src[1] >>  8) & 0xFF],
+                           linear_from_curve[(src[2] >>  8) & 0xFF],
+                           linear_from_curve[(src[3] >>  8) & 0xFF]};
+        auto blues  = Sk4f{linear_from_curve[(src[0] >> 16) & 0xFF],
+                           linear_from_curve[(src[1] >> 16) & 0xFF],
+                           linear_from_curve[(src[2] >> 16) & 0xFF],
+                           linear_from_curve[(src[3] >> 16) & 0xFF]};
 
         // Apply the transformation matrix to dst gamut.
-        // Splat rX, rY, and rZ each across a register.
-        __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00);
-        __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00);
-        __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00);
-
-        // dstReds = rX * reds + rY * greens + rZ * blues
-        __m128 dstReds =                     _mm_mul_ps(reds,   rX);
-               dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY));
-               dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues,  rZ));
-
-        // Splat gX, gY, and gZ each across a register.
-        __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55);
-        __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55);
-        __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55);
-
-        // dstGreens = gX * reds + gY * greens + gZ * blues
-        __m128 dstGreens =                       _mm_mul_ps(reds,   gX);
-               dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY));
-               dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues,  gZ));
-
-        // Splat bX, bY, and bZ each across a register.
-        __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA);
-        __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA);
-        __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA);
-
-        // dstBlues = bX * reds + bY * greens + bZ * blues
-        __m128 dstBlues =                      _mm_mul_ps(reds,   bX);
-               dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY));
-               dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues,  bZ));
+        auto dstReds   = rXgXbX[0]*reds + rYgYbY[0]*greens + rZgZbZ[0]*blues,
+             dstGreens = rXgXbX[1]*reds + rYgYbY[1]*greens + rZgZbZ[1]*blues,
+             dstBlues  = rXgXbX[2]*reds + rYgYbY[2]*greens + rZgZbZ[2]*blues;
 
         // Convert to dst gamma.
         dstReds   = linear_to_2dot2(dstReds);
         dstGreens = linear_to_2dot2(dstGreens);
         dstBlues  = linear_to_2dot2(dstBlues);
 
-        // Clamp floats.
+        // Clamp floats to byte range.
         dstReds   = clamp_0_to_255(dstReds);
         dstGreens = clamp_0_to_255(dstGreens);
         dstBlues  = clamp_0_to_255(dstBlues);
 
         // Convert to bytes and store to memory.
-        __m128i rgba = _mm_set1_epi32(0xFF000000);
-        rgba = _mm_or_si128(rgba,                _mm_cvtps_epi32(dstReds)       );
-        rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens),  8));
-        rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues),  16));
-        _mm_storeu_si128((__m128i*) dst, rgba);
+        auto rgba = (Sk4i{(int)0xFF000000}          )
+                  | (SkNx_cast<int>(dstReds)        )
+                  | (SkNx_cast<int>(dstGreens) <<  8)
+                  | (SkNx_cast<int>(dstBlues)  << 16);
+        rgba.store(dst);
 
         dst += 4;
         src += 4;
@@ -250,27 +214,25 @@ static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,
     }
 
     while (len > 0) {
-        // Splat the red, green, and blue components.
-        __m128 r = _mm_set1_ps(linear_from_curve[(src[0] >>  0) & 0xFF]),
-               g = _mm_set1_ps(linear_from_curve[(src[0] >>  8) & 0xFF]),
-               b = _mm_set1_ps(linear_from_curve[(src[0] >> 16) & 0xFF]);
+        // Splat r,g,b across a register each.
+        auto r = Sk4f{linear_from_curve[(*src >>  0) & 0xFF]},
+             g = Sk4f{linear_from_curve[(*src >>  8) & 0xFF]},
+             b = Sk4f{linear_from_curve[(*src >> 16) & 0xFF]};
 
-        // Apply the transformation matrix to dst gamut.
-        __m128 dstPixel =                      _mm_mul_ps(r, rXgXbX);
-               dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(g, rYgYbY));
-               dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(b, rZgZbZ));
+        // Apply transformation matrix to dst gamut.
+        auto dstPixel = rXgXbX*r + rYgYbY*g + rZgZbZ*b;
 
         // Convert to dst gamma.
         dstPixel = linear_to_2dot2(dstPixel);
 
-        // Clamp floats to 0-255 range.
+        // Clamp floats to byte range.
         dstPixel = clamp_0_to_255(dstPixel);
 
         // Convert to bytes and store to memory.
-        __m128i dstInts = _mm_cvtps_epi32(dstPixel);
-        __m128i dstBytes = _mm_packus_epi16(_mm_packus_epi16(dstInts, dstInts), dstInts);
-        dstBytes = _mm_or_si128(_mm_set1_epi32(0xFF000000), dstBytes);
-        _mm_store_ss((float*) dst, _mm_castsi128_ps(dstBytes));
+        uint32_t rgba;
+        SkNx_cast<uint8_t>(dstPixel).store(&rgba);
+        rgba |= 0xFF000000;
+        *dst = rgba;
 
         dst += 1;
         src += 1;
@@ -278,57 +240,6 @@ static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,
     }
 }
 
-#else
-
-static uint8_t clamp_float_to_byte(float v) {
-    // The ordering of the logic is a little strange here in order
-    // to make sure we convert NaNs to 0.
-    if (v >= 254.5f) {
-        return 255;
-    } else if (v >= 0.5f) {
-        return (uint8_t) (v + 0.5f);
-    } else {
-        return 0;
-    }
-}
-
-template <const float (&linear_from_curve)[256]>
-static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,
-                             const float matrix[16]) {
-    while (len-- > 0) {
-        // Convert to linear.
-        float srcFloats[3];
-        srcFloats[0] = linear_from_curve[(*src >>  0) & 0xFF];
-        srcFloats[1] = linear_from_curve[(*src >>  8) & 0xFF];
-        srcFloats[2] = linear_from_curve[(*src >> 16) & 0xFF];
-
-        // Convert to dst gamut.
-        float dstFloats[3];
-        dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +
-                       srcFloats[2] * matrix[8];
-        dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +
-                       srcFloats[2] * matrix[9];
-        dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +
-                       srcFloats[2] * matrix[10];
-
-        // Convert to dst gamma.
-        // Note: pow is really, really slow.  We will suffer when SSE2 is not supported.
-        dstFloats[0] = powf(dstFloats[0], (1/2.2f)) * 255.0f;
-        dstFloats[1] = powf(dstFloats[1], (1/2.2f)) * 255.0f;
-        dstFloats[2] = powf(dstFloats[2], (1/2.2f)) * 255.0f;
-
-        *dst = (0xFF                              << 24) |
-               (clamp_float_to_byte(dstFloats[2]) << 16) |
-               (clamp_float_to_byte(dstFloats[1]) <<  8) |
-               (clamp_float_to_byte(dstFloats[0]) <<  0);
-
-        dst++;
-        src++;
-    }
-}
-
-#endif
-
 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, int len,
                                            const float matrix[16]) {
     color_xform_RGB1<linear_from_srgb>(dst, src, len, matrix);
@@ -339,6 +250,6 @@ static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src,
     color_xform_RGB1<linear_from_2dot2>(dst, src, len, matrix);
 }
 
-}
+}  // namespace SK_OPTS_NS
 
 #endif // SkColorXform_opts_DEFINED
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index 063b99f89a..bb81cded18 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -386,6 +386,8 @@ public:
     SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); }
     SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); }
 
+    SkNx operator | (const SkNx& o) const { return vorrq_s32(fVec, o.fVec); }
+
     SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); }
     SkNx operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); }
 
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index 0b22a5a3c6..bdc6d77ce7 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -152,6 +152,8 @@ public:
                                   _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)));
     }
 
+    SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
+
     SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
     SkNx operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); }