-rw-r--r--  src/splicer/SkSplicer.cpp               |   1
-rw-r--r--  src/splicer/SkSplicer_generated_lowp.h  |  64
-rw-r--r--  src/splicer/SkSplicer_stages.cpp        |   4
-rw-r--r--  src/splicer/SkSplicer_stages_lowp.cpp   | 127
4 files changed, 148 insertions(+), 48 deletions(-)
diff --git a/src/splicer/SkSplicer.cpp b/src/splicer/SkSplicer.cpp
index b289871576..f55eb34986 100644
--- a/src/splicer/SkSplicer.cpp
+++ b/src/splicer/SkSplicer.cpp
@@ -256,6 +256,7 @@ namespace {
CASE(move_src_dst);
CASE(move_dst_src);
CASE(premul);
+ CASE(scale_u8);
CASE(load_8888);
CASE(store_8888);
#undef CASE
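
The CASE() table above is what ties a stage's name to its pre-assembled kSplice_*_lowp byte array so the splicer can stitch stage bodies together. As a rough mental model only, the fragment below sketches the splice-by-memcpy idea; the function name, buffer handling, and POSIX mmap/mprotect calls are illustrative assumptions, not SkSplicer's actual implementation.

// Hypothetical sketch: copy one pre-assembled stage body into executable
// memory.  A real splicer concatenates many stage bodies plus a prologue
// and epilogue before running anything.
#include <sys/mman.h>
#include <cstddef>
#include <cstring>

static void* copy_to_executable(const void* code, size_t bytes) {
    void* buf = mmap(nullptr, bytes, PROT_READ | PROT_WRITE,
                     MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
    if (buf == MAP_FAILED) {
        return nullptr;
    }
    memcpy(buf, code, bytes);                     // splice the bytes in
    mprotect(buf, bytes, PROT_READ | PROT_EXEC);  // then make them runnable
    return buf;
}

For a single stage this would be called as copy_to_executable(kSplice_scale_u8_lowp, sizeof(kSplice_scale_u8_lowp)); on ARM the instruction cache would also need flushing before jumping to the result.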
diff --git a/src/splicer/SkSplicer_generated_lowp.h b/src/splicer/SkSplicer_generated_lowp.h
index 8e2796e7fe..3ea4962485 100644
--- a/src/splicer/SkSplicer_generated_lowp.h
+++ b/src/splicer/SkSplicer_generated_lowp.h
@@ -134,6 +134,28 @@ static const unsigned int kSplice_premul_lowp[] = {
0x4e60ba02, // abs v2.8h, v16.8h
0x6f111622, // usra v2.8h, v17.8h, #15
};
+static const unsigned int kSplice_scale_u8_lowp[] = {
+ 0xf9400048, // ldr x8, [x2]
+ 0xfc606910, // ldr d16, [x8,x0]
+ 0x2f0fa610, // ushll v16.8h, v16.8b, #7
+ 0x6f183610, // ursra v16.8h, v16.8h, #8
+ 0x6e70b411, // sqrdmulh v17.8h, v0.8h, v16.8h
+ 0x6e70b433, // sqrdmulh v19.8h, v1.8h, v16.8h
+ 0x6e70b455, // sqrdmulh v21.8h, v2.8h, v16.8h
+ 0x6e70b477, // sqrdmulh v23.8h, v3.8h, v16.8h
+ 0x4e201e12, // and v18.16b, v16.16b, v0.16b
+ 0x4e211e14, // and v20.16b, v16.16b, v1.16b
+ 0x4e221e16, // and v22.16b, v16.16b, v2.16b
+ 0x4e231e10, // and v16.16b, v16.16b, v3.16b
+ 0x4e60ba20, // abs v0.8h, v17.8h
+ 0x4e60ba61, // abs v1.8h, v19.8h
+ 0x4e60baa2, // abs v2.8h, v21.8h
+ 0x4e60bae3, // abs v3.8h, v23.8h
+ 0x6f111640, // usra v0.8h, v18.8h, #15
+ 0x6f111681, // usra v1.8h, v20.8h, #15
+ 0x6f1116c2, // usra v2.8h, v22.8h, #15
+ 0x6f111603, // usra v3.8h, v16.8h, #15
+};
static const unsigned int kSplice_load_8888_lowp[] = {
0xf9400048, // ldr x8, [x2]
0x8b000908, // add x8, x8, x0, lsl #2
@@ -280,6 +302,29 @@ static const unsigned int kSplice_premul_lowp[] = {
0xf3911134, // vsra.u16 d1, d20, #15
0xf3912130, // vsra.u16 d2, d16, #15
};
+static const unsigned int kSplice_scale_u8_lowp[] = {
+ 0xe592c000, // ldr ip, [r2]
+ 0xe08cc000, // add ip, ip, r0
+ 0xf4ec0c8f, // vld1.32 {d16[]}, [ip]
+ 0xf3cf0a30, // vshll.u8 q8, d16, #7
+ 0xf3d80370, // vrsra.u16 q8, q8, #8
+ 0xf3502b20, // vqrdmulh.s16 d18, d0, d16
+ 0xf3513b20, // vqrdmulh.s16 d19, d1, d16
+ 0xf3524b20, // vqrdmulh.s16 d20, d2, d16
+ 0xf3535b20, // vqrdmulh.s16 d21, d3, d16
+ 0xf2406190, // vand d22, d16, d0
+ 0xf3b50322, // vabs.s16 d0, d18
+ 0xf2407191, // vand d23, d16, d1
+ 0xf2402192, // vand d18, d16, d2
+ 0xf2400193, // vand d16, d16, d3
+ 0xf3b51323, // vabs.s16 d1, d19
+ 0xf3b52324, // vabs.s16 d2, d20
+ 0xf3b53325, // vabs.s16 d3, d21
+ 0xf3910136, // vsra.u16 d0, d22, #15
+ 0xf3911137, // vsra.u16 d1, d23, #15
+ 0xf3912132, // vsra.u16 d2, d18, #15
+ 0xf3913130, // vsra.u16 d3, d16, #15
+};
static const unsigned int kSplice_load_8888_lowp[] = {
0xe592c000, // ldr ip, [r2]
0xe08cc100, // add ip, ip, r0, lsl #2
@@ -410,6 +455,25 @@ static const unsigned char kSplice_premul_lowp[] = {
0xc4,0xe2,0x6d,0x0b,0xd3, // vpmulhrsw %ymm3,%ymm2,%ymm2
0xc4,0xe2,0x7d,0x1d,0xd2, // vpabsw %ymm2,%ymm2
};
+static const unsigned char kSplice_scale_u8_lowp[] = {
+ 0x48,0x8b,0x02, // mov (%rdx),%rax
+ 0xc4,0x62,0x7d,0x30,0x04,0x38, // vpmovzxbw (%rax,%rdi,1),%ymm8
+ 0xc4,0xc1,0x35,0x71,0xf0,0x07, // vpsllw $0x7,%ymm8,%ymm9
+ 0xc4,0xc1,0x2d,0x71,0xd0,0x01, // vpsrlw $0x1,%ymm8,%ymm10
+ 0xc4,0x41,0x35,0xdd,0xca, // vpaddusw %ymm10,%ymm9,%ymm9
+ 0xc4,0x62,0x7d,0x79,0x11, // vpbroadcastw (%rcx),%ymm10
+ 0xc4,0x41,0x3d,0xdd,0xc2, // vpaddusw %ymm10,%ymm8,%ymm8
+ 0xc4,0xc1,0x3d,0x71,0xd0,0x08, // vpsrlw $0x8,%ymm8,%ymm8
+ 0xc4,0x41,0x35,0xdd,0xc0, // vpaddusw %ymm8,%ymm9,%ymm8
+ 0xc4,0xc2,0x7d,0x0b,0xc0, // vpmulhrsw %ymm8,%ymm0,%ymm0
+ 0xc4,0xe2,0x7d,0x1d,0xc0, // vpabsw %ymm0,%ymm0
+ 0xc4,0xc2,0x75,0x0b,0xc8, // vpmulhrsw %ymm8,%ymm1,%ymm1
+ 0xc4,0xe2,0x7d,0x1d,0xc9, // vpabsw %ymm1,%ymm1
+ 0xc4,0xc2,0x6d,0x0b,0xd0, // vpmulhrsw %ymm8,%ymm2,%ymm2
+ 0xc4,0xe2,0x7d,0x1d,0xd2, // vpabsw %ymm2,%ymm2
+ 0xc4,0xc2,0x65,0x0b,0xd8, // vpmulhrsw %ymm8,%ymm3,%ymm3
+ 0xc4,0xe2,0x7d,0x1d,0xdb, // vpabsw %ymm3,%ymm3
+};
static const unsigned char kSplice_load_8888_lowp[] = {
0x48,0x8b,0x02, // mov (%rdx),%rax
0xc5,0xfa,0x6f,0x04,0xb8, // vmovdqu (%rax,%rdi,4),%xmm0
diff --git a/src/splicer/SkSplicer_stages.cpp b/src/splicer/SkSplicer_stages.cpp
index c61a267fc8..4112779eb5 100644
--- a/src/splicer/SkSplicer_stages.cpp
+++ b/src/splicer/SkSplicer_stages.cpp
@@ -12,6 +12,9 @@
#error This file is not like the rest of Skia. It must be compiled with clang.
#endif
+// It's tricky to relocate code referencing ordinary constants, so we read them from this struct.
+using K = const SkSplicer_constants;
+
#if defined(__aarch64__)
#include <arm_neon.h>
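
The new comment explains why every stage takes a K* at all: once the assembled bytes are copied somewhere else, any PC-relative load of a literal constant would point at the wrong address, so constants are read through a pointer passed in a register instead. The toy below illustrates that contrast; it is not Skia code, and the vector typedef and struct name are made up for the example.

// Illustration only: a literal vector constant tends to compile into a
// load from a constant pool addressed relative to the code, which breaks
// once the bytes are memcpy'd elsewhere.  A constant reached through a
// pointer argument stays valid wherever the code lands.
#include <cstdint>

using U16x16 = uint16_t __attribute__((ext_vector_type(16)));
struct Constants { uint16_t _0x0001; };           // stand-in for the real K

U16x16 add_one_relocatable(U16x16 v, const Constants* k) {
    return v + k->_0x0001;   // broadcast from memory the caller points at
}

U16x16 add_one_fragile(U16x16 v) {
    return v + 1;            // likely becomes a PC-relative constant-pool load
}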
@@ -95,7 +98,6 @@ static T unaligned_load(const P* p) {
#endif
// Stages all fit a common interface that allows SkSplicer to splice them together.
-using K = const SkSplicer_constants;
using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F);
// Stage's arguments act as the working set of registers within the final spliced function.
diff --git a/src/splicer/SkSplicer_stages_lowp.cpp b/src/splicer/SkSplicer_stages_lowp.cpp
index ef3ab4032c..38a2632d5a 100644
--- a/src/splicer/SkSplicer_stages_lowp.cpp
+++ b/src/splicer/SkSplicer_stages_lowp.cpp
@@ -15,9 +15,14 @@
#error This file is not like the rest of Skia. It must be compiled with clang.
#endif
+// We use a set of constants suitable for SkFixed15 math.
+using K = const SkSplicer_constants_lowp;
+
#if defined(__aarch64__)
#include <arm_neon.h>
+ using U8 = uint8_t __attribute__((ext_vector_type(8)));
+
// In this file, F is a vector of SkFixed15.
// See SkFixed15.h for notes on its various operations.
struct F {
@@ -43,12 +48,24 @@
static F min(F a, F b) { return vminq_u16(a,b); }
static F max(F a, F b) { return vmaxq_u16(a,b); }
+ static F from_u8(U8 u8, K*) {
+ // u8 * (32768/255) == u8 * 128.50196... == u8*128 + u8/2 + (u8+1)>>8
+ //
+ // Here we do (u8*128 <rounding +> u8/2), which is correct for 0 and 255,
+ // and never off by more than 1 anywhere. It's just 2 instructions in NEON:
+ auto u16 = vshll_n_u8(u8, 7); // u16 = u8*128
+ u16 = vrsraq_n_u16(u16, u16, 8); // u16 += u16/256, with rounding
+ return u16;
+ };
+
#elif defined(__ARM_NEON__)
#if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
#error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
#endif
#include <arm_neon.h>
+ using U8 = uint8_t __attribute__((ext_vector_type(8))); // But, only low 4 lanes active.
+
struct F {
using V = uint16_t __attribute__((ext_vector_type(4)));
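
The comment in from_u8() above compresses a small derivation: u8 * (32768/255) == u8 * 128.50196... ≈ u8*128 + u8/2 + ((u8+1)>>8), and the NEON version approximates that with just vshll and vrsra. The scalar check below (a standalone sketch, not part of the commit) confirms the claim: the two-instruction form is exact at 0 and 255 and never off by more than 1 in between.

// Scalar models of the two formulas in from_u8(), checked over every input.
#include <cassert>
#include <cstdlib>

int main() {
    for (int u8 = 0; u8 < 256; u8++) {
        // Canonical SkFixed15 conversion: u8*128 + u8/2 + (u8+1)>>8.
        int canonical = u8*128 + u8/2 + ((u8+1) >> 8);

        // NEON form: u16 = u8*128 (vshll #7), then u16 += u16/256
        // with rounding (vrsra #8).
        int u16  = u8 << 7;
        int neon = u16 + ((u16 + 128) >> 8);

        assert(abs(neon - canonical) <= 1);                       // off by at most 1
        if (u8 == 0 || u8 == 255) { assert(neon == canonical); }  // exact at the ends
    }
    assert((255 << 7) + (((255 << 7) + 128) >> 8) == 32768);      // 255 maps to SkFixed15 1.0
    return 0;
}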
@@ -72,12 +89,20 @@
static F min(F a, F b) { return vmin_u16(a,b); }
static F max(F a, F b) { return vmax_u16(a,b); }
+ static F from_u8(U8 u8, K*) {
+ auto u16 = vshll_n_u8(u8, 7); // Identical to aarch64...
+ u16 = vrsraq_n_u16(u16, u16, 8); //
+ return vget_low_u16(u16); // ...but only the low 4 lanes are active.
+ }
+
#else
#if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
#error On x86, compile with -mavx2 -mfma -mf16c.
#endif
#include <immintrin.h>
+ using U8 = uint8_t __attribute__((ext_vector_type(16)));
+
struct F {
using V = uint16_t __attribute__((ext_vector_type(16)));
@@ -97,20 +122,31 @@
};
static F min(F a, F b) { return _mm256_min_epu16(a,b); }
static F max(F a, F b) { return _mm256_max_epu16(a,b); }
+
+ static F from_u8(U8 u8, K* k) {
+ // Nothing too interesting here. We follow the stock SkFixed15 formula.
+ F u16 = _mm256_cvtepu8_epi16(u8);
+ return (u16 << 7) + (u16 >> 1) + ((u16+k->_0x0001)>>8);
+ }
#endif
// No platform actually supports FMA for SkFixed15.
// This fma() method just makes it easier to port stages to lowp.
static F fma(F f, F m, F a) { return f*m+a; }
+template <typename T, typename P>
+static T unaligned_load(const P* p) {
+ T v;
+ memcpy(&v, p, sizeof(v));
+ return v;
+}
+
#if defined(__ARM_NEON__)
#define C extern "C" __attribute__((pcs("aapcs-vfp")))
#else
#define C extern "C"
#endif
-// We use a set of constants suitable for SkFixed15 math.
-using K = const SkSplicer_constants_lowp;
using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F);
// The armv7 aapcs-vfp calling convention makes us pass F::V instead of F if we want them in
@@ -198,32 +234,34 @@ STAGE(premul) {
b = b * a;
}
+STAGE(scale_u8) {
+ auto ptr = *(const uint8_t**)ctx + x;
+
+#if defined(__ARM_NEON__)
+ // On armv7, U8 can fit 8 bytes, but we only want to load 4.
+ U8 scales = vdup_n_u32(unaligned_load<uint32_t>(ptr));
+#else
+ U8 scales = unaligned_load<U8>(ptr);
+#endif
+
+ auto c = from_u8(scales, k);
+ r = r * c;
+ g = g * c;
+ b = b * c;
+ a = a * c;
+}
+
STAGE(load_8888) {
auto ptr = *(const uint32_t**)ctx + x;
#if defined(__aarch64__)
- auto to_fixed15 = [](uint8x8_t u8) {
- // u8 * (32768/255) == u8 * 128.50196... == u8*128 + u8/2 + (u8+1)>>8 ( see SkFixed15.h)
- //
- // Here we do (u8*128 <rounding +> u8/2), which is the same as our canonical math for 0
- // and 255, and never off by more than 1 in between. Thanks to NEON, it's 2 instructions!
- auto u16 = vshll_n_u8(u8, 7); // u16 = u8*128
- return vrsraq_n_u16(u16, u16, 8); // u16 + u16/256, with rounding
- };
-
uint8x8x4_t rgba = vld4_u8((const uint8_t*)ptr);
- r = to_fixed15(rgba.val[0]);
- g = to_fixed15(rgba.val[1]);
- b = to_fixed15(rgba.val[2]);
- a = to_fixed15(rgba.val[3]);
+ r = from_u8(rgba.val[0], k);
+ g = from_u8(rgba.val[1], k);
+ b = from_u8(rgba.val[2], k);
+ a = from_u8(rgba.val[3], k);
#elif defined(__ARM_NEON__)
- auto to_fixed15 = [](uint8x8_t u8) {
- // Same as aarch64, but only keeping the bottom 4 lanes.
- auto u16 = vshll_n_u8(u8, 7);
- return vget_low_u16(vrsraq_n_u16(u16, u16, 8));
- };
-
// I can't get quite the code generation I want using vld4_lane_u8(),
// so we're going to drop into assembly to do the loads. :/
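
The new STAGE(scale_u8) above loads one coverage byte per pixel, converts it to SkFixed15 with from_u8(), and multiplies all four channels by it. A rough scalar model of one pixel follows; the names are illustrative, and the multiply assumes a plain round-to-nearest 15-bit fixed-point product, whereas the vector code reaches an equivalent result through vqrdmulh / vpmulhrsw plus a fix-up.

// Rough scalar model of scale_u8 for a single pixel (illustrative only).
#include <cstdint>

using Fixed15 = uint16_t;   // 0 .. 32768 represents 0.0 .. 1.0

static Fixed15 fixed15_from_u8(uint8_t u8) {
    uint16_t u16 = uint16_t(u8) << 7;           // u8 * 128
    return u16 + uint16_t((u16 + 128) >> 8);    // += u16/256, rounded
}

static Fixed15 fixed15_mul(Fixed15 a, Fixed15 b) {
    // Assumed rounding: round to nearest; the SIMD path uses
    // vqrdmulh/vpmulhrsw plus a fix-up rather than a 32-bit product.
    return Fixed15((uint32_t(a) * b + (1 << 14)) >> 15);
}

struct Pixel { Fixed15 r, g, b, a; };

static Pixel scale_u8(Pixel p, uint8_t coverage) {
    Fixed15 c = fixed15_from_u8(coverage);
    return { fixed15_mul(p.r, c), fixed15_mul(p.g, c),
             fixed15_mul(p.b, c), fixed15_mul(p.a, c) };
}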
@@ -233,17 +271,12 @@ STAGE(load_8888) {
"vld4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
"vld4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
: "+r"(ptr), "=w"(R), "=w"(G), "=w"(B), "=w"(A));
- r = to_fixed15(R);
- g = to_fixed15(G);
- b = to_fixed15(B);
- a = to_fixed15(A);
+ r = from_u8(R, k);
+ g = from_u8(G, k);
+ b = from_u8(B, k);
+ a = from_u8(A, k);
#else
- auto to_fixed15 = [k](__m128i u8) {
- F u16 = _mm256_cvtepu8_epi16(u8);
- return (u16 << 7) + (u16 >> 1) + ((u16+k->_0x0001)>>8);
- };
-
// TODO: shorter, more confusing, faster with 256-bit loads and shuffles
// Load 16 interleaved pixels.
@@ -268,10 +301,10 @@ STAGE(load_8888) {
rg_89ABCDEF = _mm_unpacklo_epi8(_8ACE, _9BDF), // r89ABCDEF g89ABCDEF
ba_89ABCDEF = _mm_unpackhi_epi8(_8ACE, _9BDF); // b89ABCDEF a89ABCDEF
- r = to_fixed15(_mm_unpacklo_epi64(rg_01234567, rg_89ABCDEF));
- g = to_fixed15(_mm_unpackhi_epi64(rg_01234567, rg_89ABCDEF));
- b = to_fixed15(_mm_unpacklo_epi64(ba_01234567, ba_89ABCDEF));
- a = to_fixed15(_mm_unpackhi_epi64(ba_01234567, ba_89ABCDEF));
+ r = from_u8(_mm_unpacklo_epi64(rg_01234567, rg_89ABCDEF), k);
+ g = from_u8(_mm_unpackhi_epi64(rg_01234567, rg_89ABCDEF), k);
+ b = from_u8(_mm_unpacklo_epi64(ba_01234567, ba_89ABCDEF), k);
+ a = from_u8(_mm_unpackhi_epi64(ba_01234567, ba_89ABCDEF), k);
#endif
}
@@ -279,7 +312,7 @@ STAGE(store_8888) {
auto ptr = *(uint32_t**)ctx + x;
#if defined(__aarch64__)
- auto from_fixed15 = [](F v) {
+ auto to_u8 = [](F v) {
// The canonical math for this from SkFixed15.h is (v - (v>>8)) >> 7.
// But what's really most important is that all bytes round trip.
@@ -288,14 +321,14 @@ STAGE(store_8888) {
};
uint8x8x4_t rgba = {{
- from_fixed15(r),
- from_fixed15(g),
- from_fixed15(b),
- from_fixed15(a),
+ to_u8(r),
+ to_u8(g),
+ to_u8(b),
+ to_u8(a),
}};
vst4_u8((uint8_t*)ptr, rgba);
#elif defined(__ARM_NEON__)
- auto from_fixed15 = [](F v) {
+ auto to_u8 = [](F v) {
// Same as aarch64, but first we need to pad our vectors from 8 to 16 bytes.
F whatever;
return vqshrn_n_u16(vcombine_u8(v, whatever), 7);
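
The to_u8() comments above lean on two facts: the canonical SkFixed15 math is (v - (v>>8)) >> 7, and the only hard requirement is that every byte survives a from_u8()/to_u8() round trip. The scalar check below (a standalone sketch, not part of the commit) verifies that both to_u8() formulations round-trip every byte against both u8-to-SkFixed15 conversions used in this file.

// Round-trip check for the to_u8() math quoted above.
#include <cassert>
#include <initializer_list>

int main() {
    for (int u8 = 0; u8 < 256; u8++) {
        int canonical = u8*128 + u8/2 + ((u8+1) >> 8);         // stock from_u8 formula
        int neon      = (u8 << 7) + (((u8 << 7) + 128) >> 8);  // vshll + vrsra form

        for (int v : {canonical, neon}) {
            // Canonical math from SkFixed15.h: (v - (v>>8)) >> 7.
            assert(((v - (v >> 8)) >> 7) == u8);

            // x86 approach above: (v saturated+ v) >> 8.
            int doubled = (v + v > 65535) ? 65535 : v + v;
            assert((doubled >> 8) == u8);
        }
    }
    return 0;
}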
@@ -307,22 +340,22 @@ STAGE(store_8888) {
"vst4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
"vst4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
: "+r"(ptr)
- : "w"(from_fixed15(r)), "w"(from_fixed15(g)), "w"(from_fixed15(b)), "w"(from_fixed15(a))
+ : "w"(to_u8(r)), "w"(to_u8(g)), "w"(to_u8(b)), "w"(to_u8(a))
: "memory");
#else
- auto from_fixed15 = [](F v) {
- // See the note in aarch64's from_fixed15(). The same roundtrip goal applies here.
+ auto to_u8 = [](F v) {
+ // See the note in aarch64's to_u8(). The same roundtrip goal applies here.
// Here we take a different approach: (v saturated+ v) >> 8.
v = (v+v) >> 8;
return _mm_packus_epi16(_mm256_extracti128_si256(v, 0),
_mm256_extracti128_si256(v, 1));
};
- auto R = from_fixed15(r),
- G = from_fixed15(g),
- B = from_fixed15(b),
- A = from_fixed15(a);
+ auto R = to_u8(r),
+ G = to_u8(g),
+ B = to_u8(b),
+ A = to_u8(a);
auto rg_01234567 = _mm_unpacklo_epi8(R,G), // rg0 rg1 rg2 ... rg7
rg_89ABCDEF = _mm_unpackhi_epi8(R,G), // rg8 rg9 rgA ... rgF