about | summary | refs | log | tree | commit | diff | homepage
diff options: context, space, mode
author    Mike Klein <mtklein@chromium.org>  2017-02-22 13:52:40 -0500
committer Skia Commit-Bot <skia-commit-bot@chromium.org>  2017-02-22 19:36:27 +0000
commit    2b767361de00fd85cb32dce62c4a95d30b7eaabf (patch)
tree      2c8f808a0c0f5d8db23585461647a45eeed23b24
parent    bc9956de31da06529b540918832f2435f884ac26 (diff)
SkJumper: implement lerp_u8
Going to start filling these in, in biggest-bang-for-the-buck order.
lerp_u8 (i.e. text drawing) is number 1 right now.

Change-Id: If58eaf8ddbb93a6b954c3700fa1a476dca94a809
Reviewed-on: https://skia-review.googlesource.com/8856
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
-rw-r--r--  src/jumper/SkJumper.cpp              |   1
-rw-r--r--  src/jumper/SkJumper_generated.S      | 136
-rw-r--r--  src/jumper/SkJumper_generated_win.S  |  70
-rw-r--r--  src/jumper/SkJumper_stages.cpp       |  15
4 files changed, 222 insertions(+), 0 deletions(-)
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 60b34791c3..3c7aca6057 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -45,6 +45,7 @@ static K kConstants = {
M(from_srgb) \
M(to_srgb) \
M(scale_u8) \
+ M(lerp_u8) \
M(load_tables) \
M(load_8888) \
M(store_8888) \
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index cdd81d1416..af76618632 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -356,6 +356,38 @@ _sk_scale_u8_aarch64:
.long 0x6e23de03 // fmul v3.4s, v16.4s, v3.4s
.long 0xd61f0060 // br x3
+.globl _sk_lerp_u8_aarch64
+_sk_lerp_u8_aarch64:
+ .long 0xa8c10c28 // ldp x8, x3, [x1],#16
+ .long 0xbd400c51 // ldr s17, [x2,#12]
+ .long 0x4ea4d412 // fsub v18.4s, v0.4s, v4.4s
+ .long 0xf9400108 // ldr x8, [x8]
+ .long 0x8b000108 // add x8, x8, x0
+ .long 0x39400109 // ldrb w9, [x8]
+ .long 0x3940050a // ldrb w10, [x8,#1]
+ .long 0x3940090b // ldrb w11, [x8,#2]
+ .long 0x39400d08 // ldrb w8, [x8,#3]
+ .long 0x4e021d30 // mov v16.h[0], w9
+ .long 0x4e061d50 // mov v16.h[1], w10
+ .long 0x4e0a1d70 // mov v16.h[2], w11
+ .long 0x4e0e1d10 // mov v16.h[3], w8
+ .long 0x2f07b7f0 // bic v16.4h, #0xff, lsl #8
+ .long 0x2f10a600 // uxtl v0.4s, v16.4h
+ .long 0x6e21d800 // ucvtf v0.4s, v0.4s
+ .long 0x4f919010 // fmul v16.4s, v0.4s, v17.s[0]
+ .long 0x4ea41c80 // mov v0.16b, v4.16b
+ .long 0x4ea5d431 // fsub v17.4s, v1.4s, v5.4s
+ .long 0x4ea51ca1 // mov v1.16b, v5.16b
+ .long 0x4e32ce00 // fmla v0.4s, v16.4s, v18.4s
+ .long 0x4ea6d452 // fsub v18.4s, v2.4s, v6.4s
+ .long 0x4e31ce01 // fmla v1.4s, v16.4s, v17.4s
+ .long 0x4ea61cc2 // mov v2.16b, v6.16b
+ .long 0x4ea7d471 // fsub v17.4s, v3.4s, v7.4s
+ .long 0x4ea71ce3 // mov v3.16b, v7.16b
+ .long 0x4e32ce02 // fmla v2.4s, v16.4s, v18.4s
+ .long 0x4e31ce03 // fmla v3.4s, v16.4s, v17.4s
+ .long 0xd61f0060 // br x3
+
.globl _sk_load_tables_aarch64
_sk_load_tables_aarch64:
.long 0xa8c10c28 // ldp x8, x3, [x1],#16
@@ -930,6 +962,40 @@ _sk_scale_u8_vfp4:
.long 0xecbd8b02 // vpop {d8}
.long 0xe12fff1c // bx ip
+.globl _sk_lerp_u8_vfp4
+_sk_lerp_u8_vfp4:
+ .long 0xed2d8b02 // vpush {d8}
+ .long 0xe24dd008 // sub sp, sp, #8
+ .long 0xe5913000 // ldr r3, [r1]
+ .long 0xf2612d05 // vsub.f32 d18, d1, d5
+ .long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xf2623d06 // vsub.f32 d19, d2, d6
+ .long 0xf2634d07 // vsub.f32 d20, d3, d7
+ .long 0xe2811008 // add r1, r1, #8
+ .long 0xe5933000 // ldr r3, [r3]
+ .long 0xf2251115 // vorr d1, d5, d5
+ .long 0xf2262116 // vorr d2, d6, d6
+ .long 0xe0833000 // add r3, r3, r0
+ .long 0xf2273117 // vorr d3, d7, d7
+ .long 0xe1d330b0 // ldrh r3, [r3]
+ .long 0xe1cd30b4 // strh r3, [sp, #4]
+ .long 0xe28d3004 // add r3, sp, #4
+ .long 0xed928a03 // vldr s16, [r2, #12]
+ .long 0xf4e3041f // vld1.16 {d16[0]}, [r3 :16]
+ .long 0xf3c80a30 // vmovl.u8 q8, d16
+ .long 0xf3d00a30 // vmovl.u16 q8, d16
+ .long 0xf3fb06a0 // vcvt.f32.u32 d16, d16
+ .long 0xf2601d04 // vsub.f32 d17, d0, d4
+ .long 0xf2240114 // vorr d0, d4, d4
+ .long 0xf2e009c8 // vmul.f32 d16, d16, d8[0]
+ .long 0xf2010cb0 // vfma.f32 d0, d17, d16
+ .long 0xf2021cb0 // vfma.f32 d1, d18, d16
+ .long 0xf2032cb0 // vfma.f32 d2, d19, d16
+ .long 0xf2043cb0 // vfma.f32 d3, d20, d16
+ .long 0xe28dd008 // add sp, sp, #8
+ .long 0xecbd8b02 // vpop {d8}
+ .long 0xe12fff1c // bx ip
+
.globl _sk_load_tables_vfp4
_sk_load_tables_vfp4:
.long 0xe92d48f0 // push {r4, r5, r6, r7, fp, lr}
@@ -1494,6 +1560,25 @@ _sk_scale_u8_hsw:
.byte 0x48,0xad // lods %ds:(%rsi),%rax
.byte 0xff,0xe0 // jmpq *%rax
+.globl _sk_lerp_u8_hsw
+_sk_lerp_u8_hsw:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x48,0x8b,0x00 // mov (%rax),%rax
+ .byte 0xc4,0x62,0x7d,0x31,0x04,0x38 // vpmovzxbd (%rax,%rdi,1),%ymm8
+ .byte 0xc4,0x41,0x7c,0x5b,0xc0 // vcvtdq2ps %ymm8,%ymm8
+ .byte 0xc4,0x62,0x7d,0x18,0x4a,0x0c // vbroadcastss 0xc(%rdx),%ymm9
+ .byte 0xc4,0x41,0x3c,0x59,0xc1 // vmulps %ymm9,%ymm8,%ymm8
+ .byte 0xc5,0xfc,0x5c,0xc4 // vsubps %ymm4,%ymm0,%ymm0
+ .byte 0xc4,0xe2,0x3d,0xa8,0xc4 // vfmadd213ps %ymm4,%ymm8,%ymm0
+ .byte 0xc5,0xf4,0x5c,0xcd // vsubps %ymm5,%ymm1,%ymm1
+ .byte 0xc4,0xe2,0x3d,0xa8,0xcd // vfmadd213ps %ymm5,%ymm8,%ymm1
+ .byte 0xc5,0xec,0x5c,0xd6 // vsubps %ymm6,%ymm2,%ymm2
+ .byte 0xc4,0xe2,0x3d,0xa8,0xd6 // vfmadd213ps %ymm6,%ymm8,%ymm2
+ .byte 0xc5,0xe4,0x5c,0xdf // vsubps %ymm7,%ymm3,%ymm3
+ .byte 0xc4,0xe2,0x3d,0xa8,0xdf // vfmadd213ps %ymm7,%ymm8,%ymm3
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
.globl _sk_load_tables_hsw
_sk_load_tables_hsw:
.byte 0x48,0xad // lods %ds:(%rsi),%rax
@@ -2093,6 +2178,30 @@ _sk_scale_u8_sse41:
.byte 0x48,0xad // lods %ds:(%rsi),%rax
.byte 0xff,0xe0 // jmpq *%rax
+.globl _sk_lerp_u8_sse41
+_sk_lerp_u8_sse41:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x48,0x8b,0x00 // mov (%rax),%rax
+ .byte 0x66,0x44,0x0f,0x38,0x31,0x04,0x38 // pmovzxbd (%rax,%rdi,1),%xmm8
+ .byte 0x45,0x0f,0x5b,0xc0 // cvtdq2ps %xmm8,%xmm8
+ .byte 0xf3,0x44,0x0f,0x10,0x4a,0x0c // movss 0xc(%rdx),%xmm9
+ .byte 0x45,0x0f,0xc6,0xc9,0x00 // shufps $0x0,%xmm9,%xmm9
+ .byte 0x45,0x0f,0x59,0xc8 // mulps %xmm8,%xmm9
+ .byte 0x0f,0x5c,0xc4 // subps %xmm4,%xmm0
+ .byte 0x41,0x0f,0x59,0xc1 // mulps %xmm9,%xmm0
+ .byte 0x0f,0x58,0xc4 // addps %xmm4,%xmm0
+ .byte 0x0f,0x5c,0xcd // subps %xmm5,%xmm1
+ .byte 0x41,0x0f,0x59,0xc9 // mulps %xmm9,%xmm1
+ .byte 0x0f,0x58,0xcd // addps %xmm5,%xmm1
+ .byte 0x0f,0x5c,0xd6 // subps %xmm6,%xmm2
+ .byte 0x41,0x0f,0x59,0xd1 // mulps %xmm9,%xmm2
+ .byte 0x0f,0x58,0xd6 // addps %xmm6,%xmm2
+ .byte 0x0f,0x5c,0xdf // subps %xmm7,%xmm3
+ .byte 0x41,0x0f,0x59,0xd9 // mulps %xmm9,%xmm3
+ .byte 0x0f,0x58,0xdf // addps %xmm7,%xmm3
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
.globl _sk_load_tables_sse41
_sk_load_tables_sse41:
.byte 0x48,0xad // lods %ds:(%rsi),%rax
@@ -2795,6 +2904,33 @@ _sk_scale_u8_sse2:
.byte 0x48,0xad // lods %ds:(%rsi),%rax
.byte 0xff,0xe0 // jmpq *%rax
+.globl _sk_lerp_u8_sse2
+_sk_lerp_u8_sse2:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x48,0x8b,0x00 // mov (%rax),%rax
+ .byte 0x66,0x44,0x0f,0x6e,0x04,0x38 // movd (%rax,%rdi,1),%xmm8
+ .byte 0x66,0x45,0x0f,0xef,0xc9 // pxor %xmm9,%xmm9
+ .byte 0x66,0x45,0x0f,0x60,0xc1 // punpcklbw %xmm9,%xmm8
+ .byte 0x66,0x45,0x0f,0x61,0xc1 // punpcklwd %xmm9,%xmm8
+ .byte 0x45,0x0f,0x5b,0xc0 // cvtdq2ps %xmm8,%xmm8
+ .byte 0xf3,0x44,0x0f,0x10,0x4a,0x0c // movss 0xc(%rdx),%xmm9
+ .byte 0x45,0x0f,0xc6,0xc9,0x00 // shufps $0x0,%xmm9,%xmm9
+ .byte 0x45,0x0f,0x59,0xc8 // mulps %xmm8,%xmm9
+ .byte 0x0f,0x5c,0xc4 // subps %xmm4,%xmm0
+ .byte 0x41,0x0f,0x59,0xc1 // mulps %xmm9,%xmm0
+ .byte 0x0f,0x58,0xc4 // addps %xmm4,%xmm0
+ .byte 0x0f,0x5c,0xcd // subps %xmm5,%xmm1
+ .byte 0x41,0x0f,0x59,0xc9 // mulps %xmm9,%xmm1
+ .byte 0x0f,0x58,0xcd // addps %xmm5,%xmm1
+ .byte 0x0f,0x5c,0xd6 // subps %xmm6,%xmm2
+ .byte 0x41,0x0f,0x59,0xd1 // mulps %xmm9,%xmm2
+ .byte 0x0f,0x58,0xd6 // addps %xmm6,%xmm2
+ .byte 0x0f,0x5c,0xdf // subps %xmm7,%xmm3
+ .byte 0x41,0x0f,0x59,0xd9 // mulps %xmm9,%xmm3
+ .byte 0x0f,0x58,0xdf // addps %xmm7,%xmm3
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
.globl _sk_load_tables_sse2
_sk_load_tables_sse2:
.byte 0x48,0xad // lods %ds:(%rsi),%rax
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index 27b8d584aa..d681d24c04 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -336,6 +336,25 @@ _sk_scale_u8_hsw LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
+PUBLIC _sk_lerp_u8_hsw
+_sk_lerp_u8_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 72,139,0 ; mov (%rax),%rax
+ DB 196,98,125,49,4,56 ; vpmovzxbd (%rax,%rdi,1),%ymm8
+ DB 196,65,124,91,192 ; vcvtdq2ps %ymm8,%ymm8
+ DB 196,98,125,24,74,12 ; vbroadcastss 0xc(%rdx),%ymm9
+ DB 196,65,60,89,193 ; vmulps %ymm9,%ymm8,%ymm8
+ DB 197,252,92,196 ; vsubps %ymm4,%ymm0,%ymm0
+ DB 196,226,61,168,196 ; vfmadd213ps %ymm4,%ymm8,%ymm0
+ DB 197,244,92,205 ; vsubps %ymm5,%ymm1,%ymm1
+ DB 196,226,61,168,205 ; vfmadd213ps %ymm5,%ymm8,%ymm1
+ DB 197,236,92,214 ; vsubps %ymm6,%ymm2,%ymm2
+ DB 196,226,61,168,214 ; vfmadd213ps %ymm6,%ymm8,%ymm2
+ DB 197,228,92,223 ; vsubps %ymm7,%ymm3,%ymm3
+ DB 196,226,61,168,223 ; vfmadd213ps %ymm7,%ymm8,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
PUBLIC _sk_load_tables_hsw
_sk_load_tables_hsw LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
@@ -962,6 +981,30 @@ _sk_scale_u8_sse41 LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
+PUBLIC _sk_lerp_u8_sse41
+_sk_lerp_u8_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 72,139,0 ; mov (%rax),%rax
+ DB 102,68,15,56,49,4,56 ; pmovzxbd (%rax,%rdi,1),%xmm8
+ DB 69,15,91,192 ; cvtdq2ps %xmm8,%xmm8
+ DB 243,68,15,16,74,12 ; movss 0xc(%rdx),%xmm9
+ DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 15,92,196 ; subps %xmm4,%xmm0
+ DB 65,15,89,193 ; mulps %xmm9,%xmm0
+ DB 15,88,196 ; addps %xmm4,%xmm0
+ DB 15,92,205 ; subps %xmm5,%xmm1
+ DB 65,15,89,201 ; mulps %xmm9,%xmm1
+ DB 15,88,205 ; addps %xmm5,%xmm1
+ DB 15,92,214 ; subps %xmm6,%xmm2
+ DB 65,15,89,209 ; mulps %xmm9,%xmm2
+ DB 15,88,214 ; addps %xmm6,%xmm2
+ DB 15,92,223 ; subps %xmm7,%xmm3
+ DB 65,15,89,217 ; mulps %xmm9,%xmm3
+ DB 15,88,223 ; addps %xmm7,%xmm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
PUBLIC _sk_load_tables_sse41
_sk_load_tables_sse41 LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
@@ -1691,6 +1734,33 @@ _sk_scale_u8_sse2 LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
+PUBLIC _sk_lerp_u8_sse2
+_sk_lerp_u8_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 72,139,0 ; mov (%rax),%rax
+ DB 102,68,15,110,4,56 ; movd (%rax,%rdi,1),%xmm8
+ DB 102,69,15,239,201 ; pxor %xmm9,%xmm9
+ DB 102,69,15,96,193 ; punpcklbw %xmm9,%xmm8
+ DB 102,69,15,97,193 ; punpcklwd %xmm9,%xmm8
+ DB 69,15,91,192 ; cvtdq2ps %xmm8,%xmm8
+ DB 243,68,15,16,74,12 ; movss 0xc(%rdx),%xmm9
+ DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 15,92,196 ; subps %xmm4,%xmm0
+ DB 65,15,89,193 ; mulps %xmm9,%xmm0
+ DB 15,88,196 ; addps %xmm4,%xmm0
+ DB 15,92,205 ; subps %xmm5,%xmm1
+ DB 65,15,89,201 ; mulps %xmm9,%xmm1
+ DB 15,88,205 ; addps %xmm5,%xmm1
+ DB 15,92,214 ; subps %xmm6,%xmm2
+ DB 65,15,89,209 ; mulps %xmm9,%xmm2
+ DB 15,88,214 ; addps %xmm6,%xmm2
+ DB 15,92,223 ; subps %xmm7,%xmm3
+ DB 65,15,89,217 ; mulps %xmm9,%xmm3
+ DB 15,88,223 ; addps %xmm7,%xmm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
PUBLIC _sk_load_tables_sse2
_sk_load_tables_sse2 LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index af4fecec28..0a5d702551 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -136,6 +136,10 @@ using K = const SkJumper_constants;
#endif
#endif
+static F lerp(F from, F to, F t) {
+ return mad(to-from, t, from);
+}
+
// We need to be a careful with casts.
// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
// These named casts and bit_cast() are always what they seem to be.
@@ -389,6 +393,17 @@ STAGE(scale_u8) {
b = b * c;
a = a * c;
}
+STAGE(lerp_u8) {
+ auto ptr = *(const uint8_t**)ctx + x;
+
+ auto scales = unaligned_load<U8>(ptr);
+ auto c = cast(expand(scales)) * k->_1_255;
+
+ r = lerp(dr, r, c);
+ g = lerp(dg, g, c);
+ b = lerp(db, b, c);
+ a = lerp(da, a, c);
+}
STAGE(load_tables) {
struct Ctx {