-rw-r--r--  src/splicer/SkSplicer.cpp               |   1
-rw-r--r--  src/splicer/SkSplicer_generated_lowp.h  |  64
-rw-r--r--  src/splicer/SkSplicer_stages.cpp        |   4
-rw-r--r--  src/splicer/SkSplicer_stages_lowp.cpp   | 127
4 files changed, 148 insertions(+), 48 deletions(-)
diff --git a/src/splicer/SkSplicer.cpp b/src/splicer/SkSplicer.cpp
index b289871576..f55eb34986 100644
--- a/src/splicer/SkSplicer.cpp
+++ b/src/splicer/SkSplicer.cpp
@@ -256,6 +256,7 @@ namespace {
CASE(move_src_dst);
CASE(move_dst_src);
CASE(premul);
+ CASE(scale_u8);
CASE(load_8888);
CASE(store_8888);
#undef CASE
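
The CASE() table above is what ties a stage's name to its pre-assembled kSplice_*_lowp byte array so the splicer can stitch stage bodies together. As a rough mental model only, the fragment below sketches the splice-by-memcpy idea; the function name, buffer handling, and POSIX mmap/mprotect calls are illustrative assumptions, not SkSplicer's actual implementation.

// Hypothetical sketch: copy one pre-assembled stage body into executable
// memory.  A real splicer concatenates many stage bodies plus a prologue
// and epilogue before running anything.
#include <sys/mman.h>
#include <cstddef>
#include <cstring>

static void* copy_to_executable(const void* code, size_t bytes) {
    void* buf = mmap(nullptr, bytes, PROT_READ | PROT_WRITE,
                     MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
    if (buf == MAP_FAILED) {
        return nullptr;
    }
    memcpy(buf, code, bytes);                     // splice the bytes in
    mprotect(buf, bytes, PROT_READ | PROT_EXEC);  // then make them runnable
    return buf;
}

For a single stage this would be called as copy_to_executable(kSplice_scale_u8_lowp, sizeof(kSplice_scale_u8_lowp)); on ARM the instruction cache would also need flushing before jumping to the result.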
diff --git a/src/splicer/SkSplicer_generated_lowp.h b/src/splicer/SkSplicer_generated_lowp.h
index 8e2796e7fe..3ea4962485 100644
--- a/src/splicer/SkSplicer_generated_lowp.h
+++ b/src/splicer/SkSplicer_generated_lowp.h
@@ -134,6 +134,28 @@ static const unsigned int kSplice_premul_lowp[] = {
0x4e60ba02, // abs v2.8h, v16.8h
0x6f111622, // usra v2.8h, v17.8h, #15
};
+static const unsigned int kSplice_scale_u8_lowp[] = {
+ 0xf9400048, // ldr x8, [x2]
+ 0xfc606910, // ldr d16, [x8,x0]
+ 0x2f0fa610, // ushll v16.8h, v16.8b, #7
+ 0x6f183610, // ursra v16.8h, v16.8h, #8
+ 0x6e70b411, // sqrdmulh v17.8h, v0.8h, v16.8h
+ 0x6e70b433, // sqrdmulh v19.8h, v1.8h, v16.8h
+ 0x6e70b455, // sqrdmulh v21.8h, v2.8h, v16.8h
+ 0x6e70b477, // sqrdmulh v23.8h, v3.8h, v16.8h
+ 0x4e201e12, // and v18.16b, v16.16b, v0.16b
+ 0x4e211e14, // and v20.16b, v16.16b, v1.16b
+ 0x4e221e16, // and v22.16b, v16.16b, v2.16b
+ 0x4e231e10, // and v16.16b, v16.16b, v3.16b
+ 0x4e60ba20, // abs v0.8h, v17.8h
+ 0x4e60ba61, // abs v1.8h, v19.8h
+ 0x4e60baa2, // abs v2.8h, v21.8h
+ 0x4e60bae3, // abs v3.8h, v23.8h
+ 0x6f111640, // usra v0.8h, v18.8h, #15
+ 0x6f111681, // usra v1.8h, v20.8h, #15
+ 0x6f1116c2, // usra v2.8h, v22.8h, #15
+ 0x6f111603, // usra v3.8h, v16.8h, #15
+};
static const unsigned int kSplice_load_8888_lowp[] = {
0xf9400048, // ldr x8, [x2]
0x8b000908, // add x8, x8, x0, lsl #2
@@ -280,6 +302,29 @@ static const unsigned int kSplice_premul_lowp[] = {
0xf3911134, // vsra.u16 d1, d20, #15
0xf3912130, // vsra.u16 d2, d16, #15
};
+static const unsigned int kSplice_scale_u8_lowp[] = {
+ 0xe592c000, // ldr ip, [r2]
+ 0xe08cc000, // add ip, ip, r0
+ 0xf4ec0c8f, // vld1.32 {d16[]}, [ip]
+ 0xf3cf0a30, // vshll.u8 q8, d16, #7
+ 0xf3d80370, // vrsra.u16 q8, q8, #8
+ 0xf3502b20, // vqrdmulh.s16 d18, d0, d16
+ 0xf3513b20, // vqrdmulh.s16 d19, d1, d16
+ 0xf3524b20, // vqrdmulh.s16 d20, d2, d16
+ 0xf3535b20, // vqrdmulh.s16 d21, d3, d16
+ 0xf2406190, // vand d22, d16, d0
+ 0xf3b50322, // vabs.s16 d0, d18
+ 0xf2407191, // vand d23, d16, d1
+ 0xf2402192, // vand d18, d16, d2
+ 0xf2400193, // vand d16, d16, d3
+ 0xf3b51323, // vabs.s16 d1, d19
+ 0xf3b52324, // vabs.s16 d2, d20
+ 0xf3b53325, // vabs.s16 d3, d21
+ 0xf3910136, // vsra.u16 d0, d22, #15
+ 0xf3911137, // vsra.u16 d1, d23, #15
+ 0xf3912132, // vsra.u16 d2, d18, #15
+ 0xf3913130, // vsra.u16 d3, d16, #15
+};
static const unsigned int kSplice_load_8888_lowp[] = {
0xe592c000, // ldr ip, [r2]
0xe08cc100, // add ip, ip, r0, lsl #2
@@ -410,6 +455,25 @@ static const unsigned char kSplice_premul_lowp[] = {
0xc4,0xe2,0x6d,0x0b,0xd3, // vpmulhrsw %ymm3,%ymm2,%ymm2
0xc4,0xe2,0x7d,0x1d,0xd2, // vpabsw %ymm2,%ymm2
};
+static const unsigned char kSplice_scale_u8_lowp[] = {
+ 0x48,0x8b,0x02, // mov (%rdx),%rax
+ 0xc4,0x62,0x7d,0x30,0x04,0x38, // vpmovzxbw (%rax,%rdi,1),%ymm8
+ 0xc4,0xc1,0x35,0x71,0xf0,0x07, // vpsllw $0x7,%ymm8,%ymm9
+ 0xc4,0xc1,0x2d,0x71,0xd0,0x01, // vpsrlw $0x1,%ymm8,%ymm10
+ 0xc4,0x41,0x35,0xdd,0xca, // vpaddusw %ymm10,%ymm9,%ymm9
+ 0xc4,0x62,0x7d,0x79,0x11, // vpbroadcastw (%rcx),%ymm10
+ 0xc4,0x41,0x3d,0xdd,0xc2, // vpaddusw %ymm10,%ymm8,%ymm8
+ 0xc4,0xc1,0x3d,0x71,0xd0,0x08, // vpsrlw $0x8,%ymm8,%ymm8
+ 0xc4,0x41,0x35,0xdd,0xc0, // vpaddusw %ymm8,%ymm9,%ymm8
+ 0xc4,0xc2,0x7d,0x0b,0xc0, // vpmulhrsw %ymm8,%ymm0,%ymm0
+ 0xc4,0xe2,0x7d,0x1d,0xc0, // vpabsw %ymm0,%ymm0
+ 0xc4,0xc2,0x75,0x0b,0xc8, // vpmulhrsw %ymm8,%ymm1,%ymm1
+ 0xc4,0xe2,0x7d,0x1d,0xc9, // vpabsw %ymm1,%ymm1
+ 0xc4,0xc2,0x6d,0x0b,0xd0, // vpmulhrsw %ymm8,%ymm2,%ymm2
+ 0xc4,0xe2,0x7d,0x1d,0xd2, // vpabsw %ymm2,%ymm2
+ 0xc4,0xc2,0x65,0x0b,0xd8, // vpmulhrsw %ymm8,%ymm3,%ymm3
+ 0xc4,0xe2,0x7d,0x1d,0xdb, // vpabsw %ymm3,%ymm3
+};
static const unsigned char kSplice_load_8888_lowp[] = {
0x48,0x8b,0x02, // mov (%rdx),%rax
0xc5,0xfa,0x6f,0x04,0xb8, // vmovdqu (%rax,%rdi,4),%xmm0
diff --git a/src/splicer/SkSplicer_stages.cpp b/src/splicer/SkSplicer_stages.cpp
index c61a267fc8..4112779eb5 100644
--- a/src/splicer/SkSplicer_stages.cpp
+++ b/src/splicer/SkSplicer_stages.cpp
@@ -12,6 +12,9 @@
#error This file is not like the rest of Skia. It must be compiled with clang.
#endif
+// It's tricky to relocate code referencing ordinary constants, so we read them from this struct.
+using K = const SkSplicer_constants;
+
#if defined(__aarch64__)
#include <arm_neon.h>
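
The new comment explains why every stage takes a K* at all: once the assembled bytes are copied somewhere else, any PC-relative load of a literal constant would point at the wrong address, so constants are read through a pointer passed in a register instead. The toy below illustrates that contrast; it is not Skia code, and the vector typedef and struct name are made up for the example.

// Illustration only: a literal vector constant tends to compile into a
// load from a constant pool addressed relative to the code, which breaks
// once the bytes are memcpy'd elsewhere.  A constant reached through a
// pointer argument stays valid wherever the code lands.
#include <cstdint>

using U16x16 = uint16_t __attribute__((ext_vector_type(16)));
struct Constants { uint16_t _0x0001; };           // stand-in for the real K

U16x16 add_one_relocatable(U16x16 v, const Constants* k) {
    return v + k->_0x0001;   // broadcast from memory the caller points at
}

U16x16 add_one_fragile(U16x16 v) {
    return v + 1;            // likely becomes a PC-relative constant-pool load
}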
@@ -95,7 +98,6 @@ static T unaligned_load(const P* p) {
#endif
// Stages all fit a common interface that allows SkSplicer to splice them together.
-using K = const SkSplicer_constants;
using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F);
// Stage's arguments act as the working set of registers within the final spliced function.
diff --git a/src/splicer/SkSplicer_stages_lowp.cpp b/src/splicer/SkSplicer_stages_lowp.cpp
index ef3ab4032c..38a2632d5a 100644
--- a/src/splicer/SkSplicer_stages_lowp.cpp
+++ b/src/splicer/SkSplicer_stages_lowp.cpp
@@ -15,9 +15,14 @@
#error This file is not like the rest of Skia. It must be compiled with clang.
#endif
+// We use a set of constants suitable for SkFixed15 math.
+using K = const SkSplicer_constants_lowp;
+
#if defined(__aarch64__)
#include <arm_neon.h>
+ using U8 = uint8_t __attribute__((ext_vector_type(8)));
+
// In this file, F is a vector of SkFixed15.
// See SkFixed15.h for notes on its various operations.
struct F {
@@ -43,12 +48,24 @@
static F min(F a, F b) { return vminq_u16(a,b); }
static F max(F a, F b) { return vmaxq_u16(a,b); }
+ static F from_u8(U8 u8, K*) {
+ // u8 * (32768/255) == u8 * 128.50196... == u8*128 + u8/2 + (u8+1)>>8
+ //
+ // Here we do (u8*128 <rounding +> u8/2), which is correct for 0 and 255,
+ // and never off by more than 1 anywhere. It's just 2 instructions in NEON:
+ auto u16 = vshll_n_u8(u8, 7); // u16 = u8*128
+ u16 = vrsraq_n_u16(u16, u16, 8); // u16 += u16/256, with rounding
+ return u16;
+ };
+
#elif defined(__ARM_NEON__)
#if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
#error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
#endif
#include <arm_neon.h>
+ using U8 = uint8_t __attribute__((ext_vector_type(8))); // But, only low 4 lanes active.
+
struct F {
using V = uint16_t __attribute__((ext_vector_type(4)));
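
The comment in from_u8() above compresses a small derivation: u8 * (32768/255) == u8 * 128.50196... ≈ u8*128 + u8/2 + ((u8+1)>>8), and the NEON version approximates that with just vshll and vrsra. The scalar check below (a standalone sketch, not part of the commit) confirms the claim: the two-instruction form is exact at 0 and 255 and never off by more than 1 in between.

// Scalar models of the two formulas in from_u8(), checked over every input.
#include <cassert>
#include <cstdlib>

int main() {
    for (int u8 = 0; u8 < 256; u8++) {
        // Canonical SkFixed15 conversion: u8*128 + u8/2 + (u8+1)>>8.
        int canonical = u8*128 + u8/2 + ((u8+1) >> 8);

        // NEON form: u16 = u8*128 (vshll #7), then u16 += u16/256
        // with rounding (vrsra #8).
        int u16  = u8 << 7;
        int neon = u16 + ((u16 + 128) >> 8);

        assert(abs(neon - canonical) <= 1);                       // off by at most 1
        if (u8 == 0 || u8 == 255) { assert(neon == canonical); }  // exact at the ends
    }
    assert((255 << 7) + (((255 << 7) + 128) >> 8) == 32768);      // 255 maps to SkFixed15 1.0
    return 0;
}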
@@ -72,12 +89,20 @@
static F min(F a, F b) { return vmin_u16(a,b); }
static F max(F a, F b) { return vmax_u16(a,b); }
+ static F from_u8(U8 u8, K*) {
+ auto u16 = vshll_n_u8(u8, 7); // Identical to aarch64...
+ u16 = vrsraq_n_u16(u16, u16, 8); //
+ return vget_low_u16(u16); // ...but only the low 4 lanes are active.
+ }
+
#else
#if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
#error On x86, compile with -mavx2 -mfma -mf16c.
#endif
#include <immintrin.h>
+ using U8 = uint8_t __attribute__((ext_vector_type(16)));
+
struct F {
using V = uint16_t __attribute__((ext_vector_type(16)));
@@ -97,20 +122,31 @@
};
static F min(F a, F b) { return _mm256_min_epu16(a,b); }
static F max(F a, F b) { return _mm256_max_epu16(a,b); }
+
+ static F from_u8(U8 u8, K* k) {
+ // Nothing too interesting here. We follow the stock SkFixed15 formula.
+ F u16 = _mm256_cvtepu8_epi16(u8);
+ return (u16 << 7) + (u16 >> 1) + ((u16+k->_0x0001)>>8);
+ }
#endif
// No platform actually supports FMA for SkFixed15.
// This fma() method just makes it easier to port stages to lowp.
static F fma(F f, F m, F a) { return f*m+a; }
+template <typename T, typename P>
+static T unaligned_load(const P* p) {
+ T v;
+ memcpy(&v, p, sizeof(v));
+ return v;
+}
+
#if defined(__ARM_NEON__)
#define C extern "C" __attribute__((pcs("aapcs-vfp")))
#else
#define C extern "C"
#endif
-// We use a set of constants suitable for SkFixed15 math.
-using K = const SkSplicer_constants_lowp;
using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F);
// The armv7 aapcs-vfp calling convention makes us pass F::V instead of F if we want them in
@@ -198,32 +234,34 @@ STAGE(premul) {
b = b * a;
}
+STAGE(scale_u8) {
+ auto ptr = *(const uint8_t**)ctx + x;
+
+#if defined(__ARM_NEON__)
+ // On armv7, U8 can fit 8 bytes, but we only want to load 4.
+ U8 scales = vdup_n_u32(unaligned_load<uint32_t>(ptr));
+#else
+ U8 scales = unaligned_load<U8>(ptr);
+#endif
+
+ auto c = from_u8(scales, k);
+ r = r * c;
+ g = g * c;
+ b = b * c;
+ a = a * c;
+}
+
STAGE(load_8888) {
auto ptr = *(const uint32_t**)ctx + x;
#if defined(__aarch64__)
- auto to_fixed15 = [](uint8x8_t u8) {
- // u8 * (32768/255) == u8 * 128.50196... == u8*128 + u8/2 + (u8+1)>>8 ( see SkFixed15.h)
- //
- // Here we do (u8*128 <rounding +> u8/2), which is the same as our canonical math for 0
- // and 255, and never off by more than 1 in between. Thanks to NEON, it's 2 instructions!
- auto u16 = vshll_n_u8(u8, 7); // u16 = u8*128
- return vrsraq_n_u16(u16, u16, 8); // u16 + u16/256, with rounding
- };
-
uint8x8x4_t rgba = vld4_u8((const uint8_t*)ptr);
- r = to_fixed15(rgba.val[0]);
- g = to_fixed15(rgba.val[1]);
- b = to_fixed15(rgba.val[2]);
- a = to_fixed15(rgba.val[3]);
+ r = from_u8(rgba.val[0], k);
+ g = from_u8(rgba.val[1], k);
+ b = from_u8(rgba.val[2], k);
+ a = from_u8(rgba.val[3], k);
#elif defined(__ARM_NEON__)
- auto to_fixed15 = [](uint8x8_t u8) {
- // Same as aarch64, but only keeping the bottom 4 lanes.
- auto u16 = vshll_n_u8(u8, 7);
- return vget_low_u16(vrsraq_n_u16(u16, u16, 8));
- };
-
// I can't get quite the code generation I want using vld4_lane_u8(),
// so we're going to drop into assembly to do the loads. :/
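
The new STAGE(scale_u8) above loads one coverage byte per pixel, converts it to SkFixed15 with from_u8(), and multiplies all four channels by it. A rough scalar model of one pixel follows; the names are illustrative, and the multiply assumes a plain round-to-nearest 15-bit fixed-point product, whereas the vector code reaches an equivalent result through vqrdmulh / vpmulhrsw plus a fix-up.

// Rough scalar model of scale_u8 for a single pixel (illustrative only).
#include <cstdint>

using Fixed15 = uint16_t;   // 0 .. 32768 represents 0.0 .. 1.0

static Fixed15 fixed15_from_u8(uint8_t u8) {
    uint16_t u16 = uint16_t(u8) << 7;           // u8 * 128
    return u16 + uint16_t((u16 + 128) >> 8);    // += u16/256, rounded
}

static Fixed15 fixed15_mul(Fixed15 a, Fixed15 b) {
    // Assumed rounding: round to nearest; the SIMD path uses
    // vqrdmulh/vpmulhrsw plus a fix-up rather than a 32-bit product.
    return Fixed15((uint32_t(a) * b + (1 << 14)) >> 15);
}

struct Pixel { Fixed15 r, g, b, a; };

static Pixel scale_u8(Pixel p, uint8_t coverage) {
    Fixed15 c = fixed15_from_u8(coverage);
    return { fixed15_mul(p.r, c), fixed15_mul(p.g, c),
             fixed15_mul(p.b, c), fixed15_mul(p.a, c) };
}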
@@ -233,17 +271,12 @@ STAGE(load_8888) {
"vld4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
"vld4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
: "+r"(ptr), "=w"(R), "=w"(G), "=w"(B), "=w"(A));
- r = to_fixed15(R);
- g = to_fixed15(G);
- b = to_fixed15(B);
- a = to_fixed15(A);
+ r = from_u8(R, k);
+ g = from_u8(G, k);
+ b = from_u8(B, k);
+ a = from_u8(A, k);
#else
- auto to_fixed15 = [k](__m128i u8) {
- F u16 = _mm256_cvtepu8_epi16(u8);
- return (u16 << 7) + (u16 >> 1) + ((u16+k->_0x0001)>>8);
- };
-
// TODO: shorter, more confusing, faster with 256-bit loads and shuffles
// Load 16 interleaved pixels.
@@ -268,10 +301,10 @@ STAGE(load_8888) {
rg_89ABCDEF = _mm_unpacklo_epi8(_8ACE, _9BDF), // r89ABCDEF g89ABCDEF
ba_89ABCDEF = _mm_unpackhi_epi8(_8ACE, _9BDF); // b89ABCDEF a89ABCDEF
- r = to_fixed15(_mm_unpacklo_epi64(rg_01234567, rg_89ABCDEF));
- g = to_fixed15(_mm_unpackhi_epi64(rg_01234567, rg_89ABCDEF));
- b = to_fixed15(_mm_unpacklo_epi64(ba_01234567, ba_89ABCDEF));
- a = to_fixed15(_mm_unpackhi_epi64(ba_01234567, ba_89ABCDEF));
+ r = from_u8(_mm_unpacklo_epi64(rg_01234567, rg_89ABCDEF), k);
+ g = from_u8(_mm_unpackhi_epi64(rg_01234567, rg_89ABCDEF), k);
+ b = from_u8(_mm_unpacklo_epi64(ba_01234567, ba_89ABCDEF), k);
+ a = from_u8(_mm_unpackhi_epi64(ba_01234567, ba_89ABCDEF), k);
#endif
}
@@ -279,7 +312,7 @@ STAGE(store_8888) {
auto ptr = *(uint32_t**)ctx + x;
#if defined(__aarch64__)
- auto from_fixed15 = [](F v) {
+ auto to_u8 = [](F v) {
// The canonical math for this from SkFixed15.h is (v - (v>>8)) >> 7.
// But what's really most important is that all bytes round trip.
@@ -288,14 +321,14 @@ STAGE(store_8888) {
};
uint8x8x4_t rgba = {{
- from_fixed15(r),
- from_fixed15(g),
- from_fixed15(b),
- from_fixed15(a),
+ to_u8(r),
+ to_u8(g),
+ to_u8(b),
+ to_u8(a),
}};
vst4_u8((uint8_t*)ptr, rgba);
#elif defined(__ARM_NEON__)
- auto from_fixed15 = [](F v) {
+ auto to_u8 = [](F v) {
// Same as aarch64, but first we need to pad our vectors from 8 to 16 bytes.
F whatever;
return vqshrn_n_u16(vcombine_u8(v, whatever), 7);
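
The to_u8() comments above lean on two facts: the canonical SkFixed15 math is (v - (v>>8)) >> 7, and the only hard requirement is that every byte survives a from_u8()/to_u8() round trip. The scalar check below (a standalone sketch, not part of the commit) verifies that both to_u8() formulations round-trip every byte against both u8-to-SkFixed15 conversions used in this file.

// Round-trip check for the to_u8() math quoted above.
#include <cassert>
#include <initializer_list>

int main() {
    for (int u8 = 0; u8 < 256; u8++) {
        int canonical = u8*128 + u8/2 + ((u8+1) >> 8);         // stock from_u8 formula
        int neon      = (u8 << 7) + (((u8 << 7) + 128) >> 8);  // vshll + vrsra form

        for (int v : {canonical, neon}) {
            // Canonical math from SkFixed15.h: (v - (v>>8)) >> 7.
            assert(((v - (v >> 8)) >> 7) == u8);

            // x86 approach above: (v saturated+ v) >> 8.
            int doubled = (v + v > 65535) ? 65535 : v + v;
            assert((doubled >> 8) == u8);
        }
    }
    return 0;
}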
@@ -307,22 +340,22 @@ STAGE(store_8888) {
"vst4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
"vst4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
: "+r"(ptr)
- : "w"(from_fixed15(r)), "w"(from_fixed15(g)), "w"(from_fixed15(b)), "w"(from_fixed15(a))
+ : "w"(to_u8(r)), "w"(to_u8(g)), "w"(to_u8(b)), "w"(to_u8(a))
: "memory");
#else
- auto from_fixed15 = [](F v) {
- // See the note in aarch64's from_fixed15(). The same roundtrip goal applies here.
+ auto to_u8 = [](F v) {
+ // See the note in aarch64's to_u8(). The same roundtrip goal applies here.
// Here we take a different approach: (v saturated+ v) >> 8.
v = (v+v) >> 8;
return _mm_packus_epi16(_mm256_extracti128_si256(v, 0),
_mm256_extracti128_si256(v, 1));
};
- auto R = from_fixed15(r),
- G = from_fixed15(g),
- B = from_fixed15(b),
- A = from_fixed15(a);
+ auto R = to_u8(r),
+ G = to_u8(g),
+ B = to_u8(b),
+ A = to_u8(a);
auto rg_01234567 = _mm_unpacklo_epi8(R,G), // rg0 rg1 rg2 ... rg7
rg_89ABCDEF = _mm_unpackhi_epi8(R,G), // rg8 rg9 rgA ... rgF