From 2b767361de00fd85cb32dce62c4a95d30b7eaabf Mon Sep 17 00:00:00 2001 From: Mike Klein Date: Wed, 22 Feb 2017 13:52:40 -0500 Subject: SkJumper: implement lerp_u8 Going to start filling these in in biggest-bang-for-the-buck order. lerp_u8 (i.e. text drawing) is number 1 right now. Change-Id: If58eaf8ddbb93a6b954c3700fa1a476dca94a809 Reviewed-on: https://skia-review.googlesource.com/8856 Reviewed-by: Herb Derby Commit-Queue: Mike Klein --- src/jumper/SkJumper.cpp | 1 + src/jumper/SkJumper_generated.S | 136 ++++++++++++++++++++++++++++++++++++ src/jumper/SkJumper_generated_win.S | 70 +++++++++++++++++++ src/jumper/SkJumper_stages.cpp | 15 ++++ 4 files changed, 222 insertions(+) diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp index 60b34791c3..3c7aca6057 100644 --- a/src/jumper/SkJumper.cpp +++ b/src/jumper/SkJumper.cpp @@ -45,6 +45,7 @@ static K kConstants = { M(from_srgb) \ M(to_srgb) \ M(scale_u8) \ + M(lerp_u8) \ M(load_tables) \ M(load_8888) \ M(store_8888) \ diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S index cdd81d1416..af76618632 100644 --- a/src/jumper/SkJumper_generated.S +++ b/src/jumper/SkJumper_generated.S @@ -356,6 +356,38 @@ _sk_scale_u8_aarch64: .long 0x6e23de03 // fmul v3.4s, v16.4s, v3.4s .long 0xd61f0060 // br x3 +.globl _sk_lerp_u8_aarch64 +_sk_lerp_u8_aarch64: + .long 0xa8c10c28 // ldp x8, x3, [x1],#16 + .long 0xbd400c51 // ldr s17, [x2,#12] + .long 0x4ea4d412 // fsub v18.4s, v0.4s, v4.4s + .long 0xf9400108 // ldr x8, [x8] + .long 0x8b000108 // add x8, x8, x0 + .long 0x39400109 // ldrb w9, [x8] + .long 0x3940050a // ldrb w10, [x8,#1] + .long 0x3940090b // ldrb w11, [x8,#2] + .long 0x39400d08 // ldrb w8, [x8,#3] + .long 0x4e021d30 // mov v16.h[0], w9 + .long 0x4e061d50 // mov v16.h[1], w10 + .long 0x4e0a1d70 // mov v16.h[2], w11 + .long 0x4e0e1d10 // mov v16.h[3], w8 + .long 0x2f07b7f0 // bic v16.4h, #0xff, lsl #8 + .long 0x2f10a600 // uxtl v0.4s, v16.4h + .long 0x6e21d800 // ucvtf v0.4s, v0.4s + .long 0x4f919010 // fmul v16.4s, v0.4s, v17.s[0] + .long 0x4ea41c80 // mov v0.16b, v4.16b + .long 0x4ea5d431 // fsub v17.4s, v1.4s, v5.4s + .long 0x4ea51ca1 // mov v1.16b, v5.16b + .long 0x4e32ce00 // fmla v0.4s, v16.4s, v18.4s + .long 0x4ea6d452 // fsub v18.4s, v2.4s, v6.4s + .long 0x4e31ce01 // fmla v1.4s, v16.4s, v17.4s + .long 0x4ea61cc2 // mov v2.16b, v6.16b + .long 0x4ea7d471 // fsub v17.4s, v3.4s, v7.4s + .long 0x4ea71ce3 // mov v3.16b, v7.16b + .long 0x4e32ce02 // fmla v2.4s, v16.4s, v18.4s + .long 0x4e31ce03 // fmla v3.4s, v16.4s, v17.4s + .long 0xd61f0060 // br x3 + .globl _sk_load_tables_aarch64 _sk_load_tables_aarch64: .long 0xa8c10c28 // ldp x8, x3, [x1],#16 @@ -930,6 +962,40 @@ _sk_scale_u8_vfp4: .long 0xecbd8b02 // vpop {d8} .long 0xe12fff1c // bx ip +.globl _sk_lerp_u8_vfp4 +_sk_lerp_u8_vfp4: + .long 0xed2d8b02 // vpush {d8} + .long 0xe24dd008 // sub sp, sp, #8 + .long 0xe5913000 // ldr r3, [r1] + .long 0xf2612d05 // vsub.f32 d18, d1, d5 + .long 0xe591c004 // ldr ip, [r1, #4] + .long 0xf2623d06 // vsub.f32 d19, d2, d6 + .long 0xf2634d07 // vsub.f32 d20, d3, d7 + .long 0xe2811008 // add r1, r1, #8 + .long 0xe5933000 // ldr r3, [r3] + .long 0xf2251115 // vorr d1, d5, d5 + .long 0xf2262116 // vorr d2, d6, d6 + .long 0xe0833000 // add r3, r3, r0 + .long 0xf2273117 // vorr d3, d7, d7 + .long 0xe1d330b0 // ldrh r3, [r3] + .long 0xe1cd30b4 // strh r3, [sp, #4] + .long 0xe28d3004 // add r3, sp, #4 + .long 0xed928a03 // vldr s16, [r2, #12] + .long 0xf4e3041f // vld1.16 {d16[0]}, [r3 :16] + .long 0xf3c80a30 // vmovl.u8 q8, d16 
+ .long 0xf3d00a30 // vmovl.u16 q8, d16 + .long 0xf3fb06a0 // vcvt.f32.u32 d16, d16 + .long 0xf2601d04 // vsub.f32 d17, d0, d4 + .long 0xf2240114 // vorr d0, d4, d4 + .long 0xf2e009c8 // vmul.f32 d16, d16, d8[0] + .long 0xf2010cb0 // vfma.f32 d0, d17, d16 + .long 0xf2021cb0 // vfma.f32 d1, d18, d16 + .long 0xf2032cb0 // vfma.f32 d2, d19, d16 + .long 0xf2043cb0 // vfma.f32 d3, d20, d16 + .long 0xe28dd008 // add sp, sp, #8 + .long 0xecbd8b02 // vpop {d8} + .long 0xe12fff1c // bx ip + .globl _sk_load_tables_vfp4 _sk_load_tables_vfp4: .long 0xe92d48f0 // push {r4, r5, r6, r7, fp, lr} @@ -1494,6 +1560,25 @@ _sk_scale_u8_hsw: .byte 0x48,0xad // lods %ds:(%rsi),%rax .byte 0xff,0xe0 // jmpq *%rax +.globl _sk_lerp_u8_hsw +_sk_lerp_u8_hsw: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0x8b,0x00 // mov (%rax),%rax + .byte 0xc4,0x62,0x7d,0x31,0x04,0x38 // vpmovzxbd (%rax,%rdi,1),%ymm8 + .byte 0xc4,0x41,0x7c,0x5b,0xc0 // vcvtdq2ps %ymm8,%ymm8 + .byte 0xc4,0x62,0x7d,0x18,0x4a,0x0c // vbroadcastss 0xc(%rdx),%ymm9 + .byte 0xc4,0x41,0x3c,0x59,0xc1 // vmulps %ymm9,%ymm8,%ymm8 + .byte 0xc5,0xfc,0x5c,0xc4 // vsubps %ymm4,%ymm0,%ymm0 + .byte 0xc4,0xe2,0x3d,0xa8,0xc4 // vfmadd213ps %ymm4,%ymm8,%ymm0 + .byte 0xc5,0xf4,0x5c,0xcd // vsubps %ymm5,%ymm1,%ymm1 + .byte 0xc4,0xe2,0x3d,0xa8,0xcd // vfmadd213ps %ymm5,%ymm8,%ymm1 + .byte 0xc5,0xec,0x5c,0xd6 // vsubps %ymm6,%ymm2,%ymm2 + .byte 0xc4,0xe2,0x3d,0xa8,0xd6 // vfmadd213ps %ymm6,%ymm8,%ymm2 + .byte 0xc5,0xe4,0x5c,0xdf // vsubps %ymm7,%ymm3,%ymm3 + .byte 0xc4,0xe2,0x3d,0xa8,0xdf // vfmadd213ps %ymm7,%ymm8,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + .globl _sk_load_tables_hsw _sk_load_tables_hsw: .byte 0x48,0xad // lods %ds:(%rsi),%rax @@ -2093,6 +2178,30 @@ _sk_scale_u8_sse41: .byte 0x48,0xad // lods %ds:(%rsi),%rax .byte 0xff,0xe0 // jmpq *%rax +.globl _sk_lerp_u8_sse41 +_sk_lerp_u8_sse41: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0x8b,0x00 // mov (%rax),%rax + .byte 0x66,0x44,0x0f,0x38,0x31,0x04,0x38 // pmovzxbd (%rax,%rdi,1),%xmm8 + .byte 0x45,0x0f,0x5b,0xc0 // cvtdq2ps %xmm8,%xmm8 + .byte 0xf3,0x44,0x0f,0x10,0x4a,0x0c // movss 0xc(%rdx),%xmm9 + .byte 0x45,0x0f,0xc6,0xc9,0x00 // shufps $0x0,%xmm9,%xmm9 + .byte 0x45,0x0f,0x59,0xc8 // mulps %xmm8,%xmm9 + .byte 0x0f,0x5c,0xc4 // subps %xmm4,%xmm0 + .byte 0x41,0x0f,0x59,0xc1 // mulps %xmm9,%xmm0 + .byte 0x0f,0x58,0xc4 // addps %xmm4,%xmm0 + .byte 0x0f,0x5c,0xcd // subps %xmm5,%xmm1 + .byte 0x41,0x0f,0x59,0xc9 // mulps %xmm9,%xmm1 + .byte 0x0f,0x58,0xcd // addps %xmm5,%xmm1 + .byte 0x0f,0x5c,0xd6 // subps %xmm6,%xmm2 + .byte 0x41,0x0f,0x59,0xd1 // mulps %xmm9,%xmm2 + .byte 0x0f,0x58,0xd6 // addps %xmm6,%xmm2 + .byte 0x0f,0x5c,0xdf // subps %xmm7,%xmm3 + .byte 0x41,0x0f,0x59,0xd9 // mulps %xmm9,%xmm3 + .byte 0x0f,0x58,0xdf // addps %xmm7,%xmm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + .globl _sk_load_tables_sse41 _sk_load_tables_sse41: .byte 0x48,0xad // lods %ds:(%rsi),%rax @@ -2795,6 +2904,33 @@ _sk_scale_u8_sse2: .byte 0x48,0xad // lods %ds:(%rsi),%rax .byte 0xff,0xe0 // jmpq *%rax +.globl _sk_lerp_u8_sse2 +_sk_lerp_u8_sse2: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0x8b,0x00 // mov (%rax),%rax + .byte 0x66,0x44,0x0f,0x6e,0x04,0x38 // movd (%rax,%rdi,1),%xmm8 + .byte 0x66,0x45,0x0f,0xef,0xc9 // pxor %xmm9,%xmm9 + .byte 0x66,0x45,0x0f,0x60,0xc1 // punpcklbw %xmm9,%xmm8 + .byte 0x66,0x45,0x0f,0x61,0xc1 // punpcklwd %xmm9,%xmm8 + .byte 0x45,0x0f,0x5b,0xc0 // cvtdq2ps %xmm8,%xmm8 + .byte 
0xf3,0x44,0x0f,0x10,0x4a,0x0c // movss 0xc(%rdx),%xmm9 + .byte 0x45,0x0f,0xc6,0xc9,0x00 // shufps $0x0,%xmm9,%xmm9 + .byte 0x45,0x0f,0x59,0xc8 // mulps %xmm8,%xmm9 + .byte 0x0f,0x5c,0xc4 // subps %xmm4,%xmm0 + .byte 0x41,0x0f,0x59,0xc1 // mulps %xmm9,%xmm0 + .byte 0x0f,0x58,0xc4 // addps %xmm4,%xmm0 + .byte 0x0f,0x5c,0xcd // subps %xmm5,%xmm1 + .byte 0x41,0x0f,0x59,0xc9 // mulps %xmm9,%xmm1 + .byte 0x0f,0x58,0xcd // addps %xmm5,%xmm1 + .byte 0x0f,0x5c,0xd6 // subps %xmm6,%xmm2 + .byte 0x41,0x0f,0x59,0xd1 // mulps %xmm9,%xmm2 + .byte 0x0f,0x58,0xd6 // addps %xmm6,%xmm2 + .byte 0x0f,0x5c,0xdf // subps %xmm7,%xmm3 + .byte 0x41,0x0f,0x59,0xd9 // mulps %xmm9,%xmm3 + .byte 0x0f,0x58,0xdf // addps %xmm7,%xmm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + .globl _sk_load_tables_sse2 _sk_load_tables_sse2: .byte 0x48,0xad // lods %ds:(%rsi),%rax diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S index 27b8d584aa..d681d24c04 100644 --- a/src/jumper/SkJumper_generated_win.S +++ b/src/jumper/SkJumper_generated_win.S @@ -336,6 +336,25 @@ _sk_scale_u8_hsw LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax +PUBLIC _sk_lerp_u8_hsw +_sk_lerp_u8_hsw LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 72,139,0 ; mov (%rax),%rax + DB 196,98,125,49,4,56 ; vpmovzxbd (%rax,%rdi,1),%ymm8 + DB 196,65,124,91,192 ; vcvtdq2ps %ymm8,%ymm8 + DB 196,98,125,24,74,12 ; vbroadcastss 0xc(%rdx),%ymm9 + DB 196,65,60,89,193 ; vmulps %ymm9,%ymm8,%ymm8 + DB 197,252,92,196 ; vsubps %ymm4,%ymm0,%ymm0 + DB 196,226,61,168,196 ; vfmadd213ps %ymm4,%ymm8,%ymm0 + DB 197,244,92,205 ; vsubps %ymm5,%ymm1,%ymm1 + DB 196,226,61,168,205 ; vfmadd213ps %ymm5,%ymm8,%ymm1 + DB 197,236,92,214 ; vsubps %ymm6,%ymm2,%ymm2 + DB 196,226,61,168,214 ; vfmadd213ps %ymm6,%ymm8,%ymm2 + DB 197,228,92,223 ; vsubps %ymm7,%ymm3,%ymm3 + DB 196,226,61,168,223 ; vfmadd213ps %ymm7,%ymm8,%ymm3 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + PUBLIC _sk_load_tables_hsw _sk_load_tables_hsw LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax @@ -962,6 +981,30 @@ _sk_scale_u8_sse41 LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax +PUBLIC _sk_lerp_u8_sse41 +_sk_lerp_u8_sse41 LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 72,139,0 ; mov (%rax),%rax + DB 102,68,15,56,49,4,56 ; pmovzxbd (%rax,%rdi,1),%xmm8 + DB 69,15,91,192 ; cvtdq2ps %xmm8,%xmm8 + DB 243,68,15,16,74,12 ; movss 0xc(%rdx),%xmm9 + DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9 + DB 69,15,89,200 ; mulps %xmm8,%xmm9 + DB 15,92,196 ; subps %xmm4,%xmm0 + DB 65,15,89,193 ; mulps %xmm9,%xmm0 + DB 15,88,196 ; addps %xmm4,%xmm0 + DB 15,92,205 ; subps %xmm5,%xmm1 + DB 65,15,89,201 ; mulps %xmm9,%xmm1 + DB 15,88,205 ; addps %xmm5,%xmm1 + DB 15,92,214 ; subps %xmm6,%xmm2 + DB 65,15,89,209 ; mulps %xmm9,%xmm2 + DB 15,88,214 ; addps %xmm6,%xmm2 + DB 15,92,223 ; subps %xmm7,%xmm3 + DB 65,15,89,217 ; mulps %xmm9,%xmm3 + DB 15,88,223 ; addps %xmm7,%xmm3 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + PUBLIC _sk_load_tables_sse41 _sk_load_tables_sse41 LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax @@ -1691,6 +1734,33 @@ _sk_scale_u8_sse2 LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax +PUBLIC _sk_lerp_u8_sse2 +_sk_lerp_u8_sse2 LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 72,139,0 ; mov (%rax),%rax + DB 102,68,15,110,4,56 ; movd (%rax,%rdi,1),%xmm8 + DB 102,69,15,239,201 ; pxor %xmm9,%xmm9 + DB 102,69,15,96,193 ; punpcklbw %xmm9,%xmm8 + DB 102,69,15,97,193 ; punpcklwd %xmm9,%xmm8 + 
DB 69,15,91,192 ; cvtdq2ps %xmm8,%xmm8 + DB 243,68,15,16,74,12 ; movss 0xc(%rdx),%xmm9 + DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9 + DB 69,15,89,200 ; mulps %xmm8,%xmm9 + DB 15,92,196 ; subps %xmm4,%xmm0 + DB 65,15,89,193 ; mulps %xmm9,%xmm0 + DB 15,88,196 ; addps %xmm4,%xmm0 + DB 15,92,205 ; subps %xmm5,%xmm1 + DB 65,15,89,201 ; mulps %xmm9,%xmm1 + DB 15,88,205 ; addps %xmm5,%xmm1 + DB 15,92,214 ; subps %xmm6,%xmm2 + DB 65,15,89,209 ; mulps %xmm9,%xmm2 + DB 15,88,214 ; addps %xmm6,%xmm2 + DB 15,92,223 ; subps %xmm7,%xmm3 + DB 65,15,89,217 ; mulps %xmm9,%xmm3 + DB 15,88,223 ; addps %xmm7,%xmm3 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + PUBLIC _sk_load_tables_sse2 _sk_load_tables_sse2 LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp index af4fecec28..0a5d702551 100644 --- a/src/jumper/SkJumper_stages.cpp +++ b/src/jumper/SkJumper_stages.cpp @@ -136,6 +136,10 @@ using K = const SkJumper_constants; #endif #endif +static F lerp(F from, F to, F t) { + return mad(to-from, t, from); +} + // We need to be a careful with casts. // (F)x means cast x to float in the portable path, but bit_cast x to float in the others. // These named casts and bit_cast() are always what they seem to be. @@ -389,6 +393,17 @@ STAGE(scale_u8) { b = b * c; a = a * c; } +STAGE(lerp_u8) { + auto ptr = *(const uint8_t**)ctx + x; + + auto scales = unaligned_load(ptr); + auto c = cast(expand(scales)) * k->_1_255; + + r = lerp(dr, r, c); + g = lerp(dg, g, c); + b = lerp(db, b, c); + a = lerp(da, a, c); +} STAGE(load_tables) { struct Ctx { -- cgit v1.2.3
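
Reviewer note (not part of the patch): the new lerp_u8 stage loads one 8-bit coverage byte per pixel, scales it by the 1/255 constant (k->_1_255, the 0xc(%rdx) / [x2,#12] load in the listings above), and linearly interpolates each channel from the destination color toward the source color, exactly the lerp()-via-mad() helper added to SkJumper_stages.cpp. The standalone C++ sketch below restates that math on plain scalar floats so the generated NEON/AVX2/SSE listings are easier to follow; the helper names (lerp_u8_scalar) and the sample pixel values are illustrative only and do not appear in SkJumper.

    // Scalar restatement of the lerp_u8 math; the real stage runs the same
    // arithmetic across SIMD lanes (fmla / vfmadd213ps / mulps+addps above).
    #include <cstdint>
    #include <cstdio>

    static float mad(float f, float m, float a) { return f*m + a; }
    static float lerp(float from, float to, float t) { return mad(to - from, t, from); }

    // Blend one RGBA pixel (src) toward the destination (dst), channel by
    // channel, by an 8-bit coverage value -- the text-drawing case the
    // commit message mentions.
    static void lerp_u8_scalar(uint8_t coverage,
                               const float src[4], const float dst[4], float out[4]) {
        float c = coverage * (1/255.0f);        // corresponds to k->_1_255 in the stage
        for (int i = 0; i < 4; i++) {
            out[i] = lerp(dst[i], src[i], c);   // r = lerp(dr, r, c), etc.
        }
    }

    int main() {
        float src[4] = {1, 0, 0, 1},            // source pixel
              dst[4] = {0, 0, 1, 1},            // destination pixel
              out[4];
        lerp_u8_scalar(128, src, dst, out);     // roughly 50% coverage
        printf("%.3f %.3f %.3f %.3f\n", out[0], out[1], out[2], out[3]);
    }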