author     Mike Klein <mtklein@chromium.org>  2017-02-24 09:05:14 -0500
committer  Mike Klein <mtklein@chromium.org>  2017-02-24 14:37:14 +0000
commit     420e38f586ed21a51c9d216c422b4c4d5ab2dc97 (patch)
tree       66304c9651058dcf3989c966a39c0ae07fefa2b9 /src/jumper
parent     1a9e8f31b43098e69cf5b49fef362387a6cc6da7 (diff)
SkJumper: a8
Change-Id: I123caaee0bb8e3967c0a1f2acf1d80bcf0f41758
Reviewed-on: https://skia-review.googlesource.com/8944
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/jumper')
-rw-r--r--  src/jumper/SkJumper.cpp              |   2
-rw-r--r--  src/jumper/SkJumper_generated.S      | 207
-rw-r--r--  src/jumper/SkJumper_generated_win.S  | 121
-rw-r--r--  src/jumper/SkJumper_stages.cpp       |  32
4 files changed, 361 insertions, 1 deletion
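
In scalar terms the two new stages are simple: load_a8 reads one alpha byte per pixel, zeroes r, g, and b, and scales the byte to a float in [0,1]; store_a8 rounds a*255 back down to a byte and ignores the color channels. Here is a minimal scalar sketch of that contract (the Pixel struct and the literal scale factors are illustrative stand-ins; the real stages are the vectorized STAGE(load_a8)/STAGE(store_a8) bodies in SkJumper_stages.cpp at the bottom of the diff, which read their scale factors out of kConstants as k->_1_255 and k->_255):

    #include <cstddef>
    #include <cstdint>

    // Hypothetical scalar model of the two new stages; the real code is
    // vectorized per instruction set.
    struct Pixel { float r, g, b, a; };

    static Pixel load_a8(const uint8_t* ptr, size_t x) {
        // a8 is coverage-only: the byte is alpha; color channels come back zero.
        return { 0.0f, 0.0f, 0.0f, ptr[x] * (1 / 255.0f) };
    }

    static void store_a8(uint8_t* ptr, size_t x, const Pixel& px) {
        // Round alpha back to a byte; the color channels are simply dropped.
        ptr[x] = (uint8_t)(px.a * 255.0f + 0.5f);
    }
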
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 90db9a94a8..d38090d180 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -70,6 +70,8 @@ static K kConstants = {
     M(lerp_u8)     \
     M(lerp_565)    \
     M(load_tables) \
+    M(load_a8)     \
+    M(store_a8)    \
     M(load_565)    \
     M(store_565)   \
     M(load_8888)   \
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index 8b4106c050..973534a65b 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -526,6 +526,50 @@ _sk_load_tables_aarch64:
   .long 0x6e1c0602  // mov v2.s[3], v16.s[0]
   .long 0xd61f0060  // br x3

+.globl _sk_load_a8_aarch64
+_sk_load_a8_aarch64:
+  .long 0xa8c10c28  // ldp x8, x3, [x1],#16
+  .long 0xbd400c43  // ldr s3, [x2,#12]
+  .long 0x6f00e400  // movi v0.2d, #0x0
+  .long 0x6f00e401  // movi v1.2d, #0x0
+  .long 0xf9400108  // ldr x8, [x8]
+  .long 0x6f00e402  // movi v2.2d, #0x0
+  .long 0x8b000108  // add x8, x8, x0
+  .long 0x39400109  // ldrb w9, [x8]
+  .long 0x3940050a  // ldrb w10, [x8,#1]
+  .long 0x3940090b  // ldrb w11, [x8,#2]
+  .long 0x39400d08  // ldrb w8, [x8,#3]
+  .long 0x4e021d30  // mov v16.h[0], w9
+  .long 0x4e061d50  // mov v16.h[1], w10
+  .long 0x4e0a1d70  // mov v16.h[2], w11
+  .long 0x4e0e1d10  // mov v16.h[3], w8
+  .long 0x2f07b7f0  // bic v16.4h, #0xff, lsl #8
+  .long 0x2f10a610  // uxtl v16.4s, v16.4h
+  .long 0x6e21da10  // ucvtf v16.4s, v16.4s
+  .long 0x4f839203  // fmul v3.4s, v16.4s, v3.s[0]
+  .long 0xd61f0060  // br x3
+
+.globl _sk_store_a8_aarch64
+_sk_store_a8_aarch64:
+  .long 0xf9400028  // ldr x8, [x1]
+  .long 0xbd400850  // ldr s16, [x2,#8]
+  .long 0xf9400108  // ldr x8, [x8]
+  .long 0x4f909070  // fmul v16.4s, v3.4s, v16.s[0]
+  .long 0x6e21aa10  // fcvtnu v16.4s, v16.4s
+  .long 0x0e612a10  // xtn v16.4h, v16.4s
+  .long 0x0e0e3e09  // umov w9, v16.h[3]
+  .long 0x8b000108  // add x8, x8, x0
+  .long 0x39000d09  // strb w9, [x8,#3]
+  .long 0x0e0a3e09  // umov w9, v16.h[2]
+  .long 0x39000909  // strb w9, [x8,#2]
+  .long 0x0e063e09  // umov w9, v16.h[1]
+  .long 0x39000509  // strb w9, [x8,#1]
+  .long 0x0e023e09  // umov w9, v16.h[0]
+  .long 0x39000109  // strb w9, [x8]
+  .long 0xf9400423  // ldr x3, [x1,#8]
+  .long 0x91004021  // add x1, x1, #0x10
+  .long 0xd61f0060  // br x3
+
 .globl _sk_load_565_aarch64
 _sk_load_565_aarch64:
   .long 0xa8c10c28  // ldp x8, x3, [x1],#16
@@ -1273,6 +1317,48 @@ _sk_load_tables_vfp4:
   .long 0xe8bd48f0  // pop {r4, r5, r6, r7, fp, lr}
   .long 0xe12fff1c  // bx ip

+.globl _sk_load_a8_vfp4
+_sk_load_a8_vfp4:
+  .long 0xe24dd004  // sub sp, sp, #4
+  .long 0xe5913000  // ldr r3, [r1]
+  .long 0xf2801010  // vmov.i32 d1, #0
+  .long 0xe591c004  // ldr ip, [r1, #4]
+  .long 0xe2811008  // add r1, r1, #8
+  .long 0xf2802010  // vmov.i32 d2, #0
+  .long 0xe5933000  // ldr r3, [r3]
+  .long 0xe0833000  // add r3, r3, r0
+  .long 0xe1d330b0  // ldrh r3, [r3]
+  .long 0xe1cd30b0  // strh r3, [sp]
+  .long 0xe1a0300d  // mov r3, sp
+  .long 0xf4e3041f  // vld1.16 {d16[0]}, [r3 :16]
+  .long 0xed920a03  // vldr s0, [r2, #12]
+  .long 0xf3c80a30  // vmovl.u8 q8, d16
+  .long 0xf3d00a30  // vmovl.u16 q8, d16
+  .long 0xf3fb06a0  // vcvt.f32.u32 d16, d16
+  .long 0xf2a039c0  // vmul.f32 d3, d16, d0[0]
+  .long 0xf2800010  // vmov.i32 d0, #0
+  .long 0xe28dd004  // add sp, sp, #4
+  .long 0xe12fff1c  // bx ip
+
+.globl _sk_store_a8_vfp4
+_sk_store_a8_vfp4:
+  .long 0xe92d4800  // push {fp, lr}
+  .long 0xe2823008  // add r3, r2, #8
+  .long 0xf2c3061f  // vmov.i32 d16, #1056964608
+  .long 0xf4e31c9f  // vld1.32 {d17[]}, [r3 :32]
+  .long 0xe5913000  // ldr r3, [r1]
+  .long 0xf2430c31  // vfma.f32 d16, d3, d17
+  .long 0xe5933000  // ldr r3, [r3]
+  .long 0xf3fb07a0  // vcvt.u32.f32 d16, d16
+  .long 0xee10eb90  // vmov.32 lr, d16[0]
+  .long 0xee30cb90  // vmov.32 ip, d16[1]
+  .long 0xe7e3e000  // strb lr, [r3, r0]!
+  .long 0xe5c3c001  // strb ip, [r3, #1]
+  .long 0xe5913004  // ldr r3, [r1, #4]
+  .long 0xe2811008  // add r1, r1, #8
+  .long 0xe8bd4800  // pop {fp, lr}
+  .long 0xe12fff13  // bx r3
+
 .globl _sk_load_565_vfp4
 _sk_load_565_vfp4:
   .long 0xe24dd004  // sub sp, sp, #4
@@ -1976,6 +2062,34 @@ _sk_load_tables_hsw:
   .byte 72,173  // lods %ds:(%rsi),%rax
   .byte 255,224  // jmpq *%rax

+.globl _sk_load_a8_hsw
+_sk_load_a8_hsw:
+  .byte 72,173  // lods %ds:(%rsi),%rax
+  .byte 72,139,0  // mov (%rax),%rax
+  .byte 196,226,125,49,4,56  // vpmovzxbd (%rax,%rdi,1),%ymm0
+  .byte 197,252,91,192  // vcvtdq2ps %ymm0,%ymm0
+  .byte 196,226,125,24,74,12  // vbroadcastss 0xc(%rdx),%ymm1
+  .byte 197,252,89,217  // vmulps %ymm1,%ymm0,%ymm3
+  .byte 72,173  // lods %ds:(%rsi),%rax
+  .byte 197,252,87,192  // vxorps %ymm0,%ymm0,%ymm0
+  .byte 197,244,87,201  // vxorps %ymm1,%ymm1,%ymm1
+  .byte 197,236,87,210  // vxorps %ymm2,%ymm2,%ymm2
+  .byte 255,224  // jmpq *%rax
+
+.globl _sk_store_a8_hsw
+_sk_store_a8_hsw:
+  .byte 72,173  // lods %ds:(%rsi),%rax
+  .byte 72,139,0  // mov (%rax),%rax
+  .byte 196,98,125,24,66,8  // vbroadcastss 0x8(%rdx),%ymm8
+  .byte 197,60,89,195  // vmulps %ymm3,%ymm8,%ymm8
+  .byte 196,65,125,91,192  // vcvtps2dq %ymm8,%ymm8
+  .byte 196,67,125,25,193,1  // vextractf128 $0x1,%ymm8,%xmm9
+  .byte 196,66,57,43,193  // vpackusdw %xmm9,%xmm8,%xmm8
+  .byte 196,65,57,103,192  // vpackuswb %xmm8,%xmm8,%xmm8
+  .byte 197,121,214,4,56  // vmovq %xmm8,(%rax,%rdi,1)
+  .byte 72,173  // lods %ds:(%rsi),%rax
+  .byte 255,224  // jmpq *%rax
+
 .globl _sk_load_565_hsw
 _sk_load_565_hsw:
   .byte 72,173  // lods %ds:(%rsi),%rax
@@ -2760,6 +2874,36 @@ _sk_load_tables_avx:
   .byte 65,95  // pop %r15
   .byte 255,224  // jmpq *%rax

+.globl _sk_load_a8_avx
+_sk_load_a8_avx:
+  .byte 72,173  // lods %ds:(%rsi),%rax
+  .byte 72,139,0  // mov (%rax),%rax
+  .byte 196,226,121,49,68,56,4  // vpmovzxbd 0x4(%rax,%rdi,1),%xmm0
+  .byte 196,226,121,49,12,56  // vpmovzxbd (%rax,%rdi,1),%xmm1
+  .byte 196,227,117,24,192,1  // vinsertf128 $0x1,%xmm0,%ymm1,%ymm0
+  .byte 197,252,91,192  // vcvtdq2ps %ymm0,%ymm0
+  .byte 196,226,125,24,74,12  // vbroadcastss 0xc(%rdx),%ymm1
+  .byte 197,252,89,217  // vmulps %ymm1,%ymm0,%ymm3
+  .byte 72,173  // lods %ds:(%rsi),%rax
+  .byte 197,252,87,192  // vxorps %ymm0,%ymm0,%ymm0
+  .byte 197,244,87,201  // vxorps %ymm1,%ymm1,%ymm1
+  .byte 197,236,87,210  // vxorps %ymm2,%ymm2,%ymm2
+  .byte 255,224  // jmpq *%rax
+
+.globl _sk_store_a8_avx
+_sk_store_a8_avx:
+  .byte 72,173  // lods %ds:(%rsi),%rax
+  .byte 72,139,0  // mov (%rax),%rax
+  .byte 196,98,125,24,66,8  // vbroadcastss 0x8(%rdx),%ymm8
+  .byte 197,60,89,195  // vmulps %ymm3,%ymm8,%ymm8
+  .byte 196,65,125,91,192  // vcvtps2dq %ymm8,%ymm8
+  .byte 196,67,125,25,193,1  // vextractf128 $0x1,%ymm8,%xmm9
+  .byte 196,66,57,43,193  // vpackusdw %xmm9,%xmm8,%xmm8
+  .byte 196,65,57,103,192  // vpackuswb %xmm8,%xmm8,%xmm8
+  .byte 197,121,214,4,56  // vmovq %xmm8,(%rax,%rdi,1)
+  .byte 72,173  // lods %ds:(%rsi),%rax
+  .byte 255,224  // jmpq *%rax
+
 .globl _sk_load_565_avx
 _sk_load_565_avx:
   .byte 72,173  // lods %ds:(%rsi),%rax
@@ -3655,6 +3799,35 @@ _sk_load_tables_sse41:
   .byte 72,173  // lods %ds:(%rsi),%rax
   .byte 255,224  // jmpq *%rax

+.globl _sk_load_a8_sse41
+_sk_load_a8_sse41:
+  .byte 72,173  // lods %ds:(%rsi),%rax
+  .byte 72,139,0  // mov (%rax),%rax
+  .byte 102,15,56,49,4,56  // pmovzxbd (%rax,%rdi,1),%xmm0
+  .byte 15,91,192  // cvtdq2ps %xmm0,%xmm0
+  .byte 243,15,16,90,12  // movss 0xc(%rdx),%xmm3
+  .byte 15,198,219,0  // shufps $0x0,%xmm3,%xmm3
+  .byte 15,89,216  // mulps %xmm0,%xmm3
+  .byte 72,173  // lods %ds:(%rsi),%rax
+  .byte 15,87,192  // xorps %xmm0,%xmm0
+  .byte 15,87,201  // xorps %xmm1,%xmm1
+  .byte 15,87,210  // xorps %xmm2,%xmm2
+  .byte 255,224  // jmpq *%rax
+
+.globl _sk_store_a8_sse41
+_sk_store_a8_sse41:
+  .byte 72,173  // lods %ds:(%rsi),%rax
+  .byte 72,139,0  // mov (%rax),%rax
+  .byte 243,68,15,16,66,8  // movss 0x8(%rdx),%xmm8
+  .byte 69,15,198,192,0  // shufps $0x0,%xmm8,%xmm8
+  .byte 68,15,89,195  // mulps %xmm3,%xmm8
+  .byte 102,69,15,91,192  // cvtps2dq %xmm8,%xmm8
+  .byte 102,69,15,56,43,192  // packusdw %xmm8,%xmm8
+  .byte 102,69,15,103,192  // packuswb %xmm8,%xmm8
+  .byte 102,68,15,126,4,56  // movd %xmm8,(%rax,%rdi,1)
+  .byte 72,173  // lods %ds:(%rsi),%rax
+  .byte 255,224  // jmpq *%rax
+
 .globl _sk_load_565_sse41
 _sk_load_565_sse41:
   .byte 72,173  // lods %ds:(%rsi),%rax
@@ -4542,6 +4715,40 @@ _sk_load_tables_sse2:
   .byte 72,173  // lods %ds:(%rsi),%rax
   .byte 255,224  // jmpq *%rax

+.globl _sk_load_a8_sse2
+_sk_load_a8_sse2:
+  .byte 72,173  // lods %ds:(%rsi),%rax
+  .byte 72,139,0  // mov (%rax),%rax
+  .byte 102,15,110,4,56  // movd (%rax,%rdi,1),%xmm0
+  .byte 102,15,239,201  // pxor %xmm1,%xmm1
+  .byte 102,15,96,193  // punpcklbw %xmm1,%xmm0
+  .byte 102,15,97,193  // punpcklwd %xmm1,%xmm0
+  .byte 15,91,192  // cvtdq2ps %xmm0,%xmm0
+  .byte 243,15,16,90,12  // movss 0xc(%rdx),%xmm3
+  .byte 15,198,219,0  // shufps $0x0,%xmm3,%xmm3
+  .byte 15,89,216  // mulps %xmm0,%xmm3
+  .byte 72,173  // lods %ds:(%rsi),%rax
+  .byte 15,87,192  // xorps %xmm0,%xmm0
+  .byte 102,15,239,201  // pxor %xmm1,%xmm1
+  .byte 15,87,210  // xorps %xmm2,%xmm2
+  .byte 255,224  // jmpq *%rax
+
+.globl _sk_store_a8_sse2
+_sk_store_a8_sse2:
+  .byte 72,173  // lods %ds:(%rsi),%rax
+  .byte 72,139,0  // mov (%rax),%rax
+  .byte 243,68,15,16,66,8  // movss 0x8(%rdx),%xmm8
+  .byte 69,15,198,192,0  // shufps $0x0,%xmm8,%xmm8
+  .byte 68,15,89,195  // mulps %xmm3,%xmm8
+  .byte 102,69,15,91,192  // cvtps2dq %xmm8,%xmm8
+  .byte 102,65,15,114,240,16  // pslld $0x10,%xmm8
+  .byte 102,65,15,114,224,16  // psrad $0x10,%xmm8
+  .byte 102,69,15,107,192  // packssdw %xmm8,%xmm8
+  .byte 102,69,15,103,192  // packuswb %xmm8,%xmm8
+  .byte 102,68,15,126,4,56  // movd %xmm8,(%rax,%rdi,1)
+  .byte 72,173  // lods %ds:(%rsi),%rax
+  .byte 255,224  // jmpq *%rax
+
 .globl _sk_load_565_sse2
 _sk_load_565_sse2:
   .byte 72,173  // lods %ds:(%rsi),%rax
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index 8d80694771..e22e4cdfe3 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -457,6 +457,34 @@ _sk_load_tables_hsw LABEL PROC
   DB  72,173  ; lods %ds:(%rsi),%rax
   DB  255,224  ; jmpq *%rax

+PUBLIC _sk_load_a8_hsw
+_sk_load_a8_hsw LABEL PROC
+  DB  72,173  ; lods %ds:(%rsi),%rax
+  DB  72,139,0  ; mov (%rax),%rax
+  DB  196,226,125,49,4,56  ; vpmovzxbd (%rax,%rdi,1),%ymm0
+  DB  197,252,91,192  ; vcvtdq2ps %ymm0,%ymm0
+  DB  196,226,125,24,74,12  ; vbroadcastss 0xc(%rdx),%ymm1
+  DB  197,252,89,217  ; vmulps %ymm1,%ymm0,%ymm3
+  DB  72,173  ; lods %ds:(%rsi),%rax
+  DB  197,252,87,192  ; vxorps %ymm0,%ymm0,%ymm0
+  DB  197,244,87,201  ; vxorps %ymm1,%ymm1,%ymm1
+  DB  197,236,87,210  ; vxorps %ymm2,%ymm2,%ymm2
+  DB  255,224  ; jmpq *%rax
+
+PUBLIC _sk_store_a8_hsw
+_sk_store_a8_hsw LABEL PROC
+  DB  72,173  ; lods %ds:(%rsi),%rax
+  DB  72,139,0  ; mov (%rax),%rax
+  DB  196,98,125,24,66,8  ; vbroadcastss 0x8(%rdx),%ymm8
+  DB  197,60,89,195  ; vmulps %ymm3,%ymm8,%ymm8
+  DB  196,65,125,91,192  ; vcvtps2dq %ymm8,%ymm8
+  DB  196,67,125,25,193,1  ; vextractf128 $0x1,%ymm8,%xmm9
+  DB  196,66,57,43,193  ; vpackusdw %xmm9,%xmm8,%xmm8
+  DB  196,65,57,103,192  ; vpackuswb %xmm8,%xmm8,%xmm8
+  DB  197,121,214,4,56  ; vmovq %xmm8,(%rax,%rdi,1)
+  DB  72,173  ; lods %ds:(%rsi),%rax
+  DB  255,224  ; jmpq *%rax
+
 PUBLIC _sk_load_565_hsw
 _sk_load_565_hsw LABEL PROC
   DB  72,173  ; lods %ds:(%rsi),%rax
@@ -1268,6 +1296,36 @@ _sk_load_tables_avx LABEL PROC
   DB  65,95  ; pop %r15
   DB  255,224  ; jmpq *%rax

+PUBLIC _sk_load_a8_avx
+_sk_load_a8_avx LABEL PROC
+  DB  72,173  ; lods %ds:(%rsi),%rax
+  DB  72,139,0  ; mov (%rax),%rax
+  DB  196,226,121,49,68,56,4  ; vpmovzxbd 0x4(%rax,%rdi,1),%xmm0
+  DB  196,226,121,49,12,56  ; vpmovzxbd (%rax,%rdi,1),%xmm1
+  DB  196,227,117,24,192,1  ; vinsertf128 $0x1,%xmm0,%ymm1,%ymm0
+  DB  197,252,91,192  ; vcvtdq2ps %ymm0,%ymm0
+  DB  196,226,125,24,74,12  ; vbroadcastss 0xc(%rdx),%ymm1
+  DB  197,252,89,217  ; vmulps %ymm1,%ymm0,%ymm3
+  DB  72,173  ; lods %ds:(%rsi),%rax
+  DB  197,252,87,192  ; vxorps %ymm0,%ymm0,%ymm0
+  DB  197,244,87,201  ; vxorps %ymm1,%ymm1,%ymm1
+  DB  197,236,87,210  ; vxorps %ymm2,%ymm2,%ymm2
+  DB  255,224  ; jmpq *%rax
+
+PUBLIC _sk_store_a8_avx
+_sk_store_a8_avx LABEL PROC
+  DB  72,173  ; lods %ds:(%rsi),%rax
+  DB  72,139,0  ; mov (%rax),%rax
+  DB  196,98,125,24,66,8  ; vbroadcastss 0x8(%rdx),%ymm8
+  DB  197,60,89,195  ; vmulps %ymm3,%ymm8,%ymm8
+  DB  196,65,125,91,192  ; vcvtps2dq %ymm8,%ymm8
+  DB  196,67,125,25,193,1  ; vextractf128 $0x1,%ymm8,%xmm9
+  DB  196,66,57,43,193  ; vpackusdw %xmm9,%xmm8,%xmm8
+  DB  196,65,57,103,192  ; vpackuswb %xmm8,%xmm8,%xmm8
+  DB  197,121,214,4,56  ; vmovq %xmm8,(%rax,%rdi,1)
+  DB  72,173  ; lods %ds:(%rsi),%rax
+  DB  255,224  ; jmpq *%rax
+
 PUBLIC _sk_load_565_avx
 _sk_load_565_avx LABEL PROC
   DB  72,173  ; lods %ds:(%rsi),%rax
@@ -2190,6 +2248,35 @@ _sk_load_tables_sse41 LABEL PROC
   DB  72,173  ; lods %ds:(%rsi),%rax
   DB  255,224  ; jmpq *%rax

+PUBLIC _sk_load_a8_sse41
+_sk_load_a8_sse41 LABEL PROC
+  DB  72,173  ; lods %ds:(%rsi),%rax
+  DB  72,139,0  ; mov (%rax),%rax
+  DB  102,15,56,49,4,56  ; pmovzxbd (%rax,%rdi,1),%xmm0
+  DB  15,91,192  ; cvtdq2ps %xmm0,%xmm0
+  DB  243,15,16,90,12  ; movss 0xc(%rdx),%xmm3
+  DB  15,198,219,0  ; shufps $0x0,%xmm3,%xmm3
+  DB  15,89,216  ; mulps %xmm0,%xmm3
+  DB  72,173  ; lods %ds:(%rsi),%rax
+  DB  15,87,192  ; xorps %xmm0,%xmm0
+  DB  15,87,201  ; xorps %xmm1,%xmm1
+  DB  15,87,210  ; xorps %xmm2,%xmm2
+  DB  255,224  ; jmpq *%rax
+
+PUBLIC _sk_store_a8_sse41
+_sk_store_a8_sse41 LABEL PROC
+  DB  72,173  ; lods %ds:(%rsi),%rax
+  DB  72,139,0  ; mov (%rax),%rax
+  DB  243,68,15,16,66,8  ; movss 0x8(%rdx),%xmm8
+  DB  69,15,198,192,0  ; shufps $0x0,%xmm8,%xmm8
+  DB  68,15,89,195  ; mulps %xmm3,%xmm8
+  DB  102,69,15,91,192  ; cvtps2dq %xmm8,%xmm8
+  DB  102,69,15,56,43,192  ; packusdw %xmm8,%xmm8
+  DB  102,69,15,103,192  ; packuswb %xmm8,%xmm8
+  DB  102,68,15,126,4,56  ; movd %xmm8,(%rax,%rdi,1)
+  DB  72,173  ; lods %ds:(%rsi),%rax
+  DB  255,224  ; jmpq *%rax
+
 PUBLIC _sk_load_565_sse41
 _sk_load_565_sse41 LABEL PROC
   DB  72,173  ; lods %ds:(%rsi),%rax
@@ -3104,6 +3191,40 @@ _sk_load_tables_sse2 LABEL PROC
   DB  72,173  ; lods %ds:(%rsi),%rax
   DB  255,224  ; jmpq *%rax

+PUBLIC _sk_load_a8_sse2
+_sk_load_a8_sse2 LABEL PROC
+  DB  72,173  ; lods %ds:(%rsi),%rax
+  DB  72,139,0  ; mov (%rax),%rax
+  DB  102,15,110,4,56  ; movd (%rax,%rdi,1),%xmm0
+  DB  102,15,239,201  ; pxor %xmm1,%xmm1
+  DB  102,15,96,193  ; punpcklbw %xmm1,%xmm0
+  DB  102,15,97,193  ; punpcklwd %xmm1,%xmm0
+  DB  15,91,192  ; cvtdq2ps %xmm0,%xmm0
+  DB  243,15,16,90,12  ; movss 0xc(%rdx),%xmm3
+  DB  15,198,219,0  ; shufps $0x0,%xmm3,%xmm3
+  DB  15,89,216  ; mulps %xmm0,%xmm3
+  DB  72,173  ; lods %ds:(%rsi),%rax
+  DB  15,87,192  ; xorps %xmm0,%xmm0
+  DB  102,15,239,201  ; pxor %xmm1,%xmm1
+  DB  15,87,210  ; xorps %xmm2,%xmm2
+  DB  255,224  ; jmpq *%rax
+
+PUBLIC _sk_store_a8_sse2
+_sk_store_a8_sse2 LABEL PROC
+  DB  72,173  ; lods %ds:(%rsi),%rax
+  DB  72,139,0  ; mov (%rax),%rax
+  DB  243,68,15,16,66,8  ; movss 0x8(%rdx),%xmm8
+  DB  69,15,198,192,0  ; shufps $0x0,%xmm8,%xmm8
+  DB  68,15,89,195  ; mulps %xmm3,%xmm8
+  DB  102,69,15,91,192  ; cvtps2dq %xmm8,%xmm8
+  DB  102,65,15,114,240,16  ; pslld $0x10,%xmm8
+  DB  102,65,15,114,224,16  ; psrad $0x10,%xmm8
+  DB  102,69,15,107,192  ; packssdw %xmm8,%xmm8
+  DB  102,69,15,103,192  ; packuswb %xmm8,%xmm8
+  DB  102,68,15,126,4,56  ; movd %xmm8,(%rax,%rdi,1)
+  DB  72,173  ; lods %ds:(%rsi),%rax
+  DB  255,224  ; jmpq *%rax
+
 PUBLIC _sk_load_565_sse2
 _sk_load_565_sse2 LABEL PROC
   DB  72,173  ; lods %ds:(%rsi),%rax
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 580432c60d..899323b0db 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -42,6 +42,7 @@ static Dst bit_cast(const Src& src) {
     static F   rsqrt(F v)          { return 1.0f / sqrtf(v); }
     static U32 round(F v, F scale) { return (uint32_t)lrintf(v*scale); }
     static U16 pack(U32 v)         { return (U16)v; }
+    static U8  pack(U16 v)         { return (U8)v; }

     static F if_then_else(I32 c, F t, F e) { return c ? t : e; }

@@ -67,6 +68,7 @@ static Dst bit_cast(const Src& src) {
     static F rsqrt(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
     static U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
     static U16 pack(U32 v) { return __builtin_convertvector(v, U16); }
+    static U8  pack(U16 v) { return __builtin_convertvector(v, U8); }

     static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }

@@ -94,6 +96,7 @@ static Dst bit_cast(const Src& src) {
     static F rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
     static U32 round(F v, F scale) { return vcvt_u32_f32(mad(v,scale,0.5f)); }
     static U16 pack(U32 v) { return __builtin_convertvector(v, U16); }
+    static U8  pack(U16 v) { return __builtin_convertvector(v, U8); }

     static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }

@@ -123,6 +126,10 @@ static Dst bit_cast(const Src& src) {
         hi = _mm256_extractf128_si256(v, 1);
         return _mm_packus_epi32(lo, hi);
     }
+    static U8 pack(U16 v) {
+        __m128i r = _mm_packus_epi16(v,v);
+        return unaligned_load<U8>(&r);
+    }

     static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }

@@ -151,6 +158,10 @@ static Dst bit_cast(const Src& src) {
         hi = _mm256_extractf128_si256(v, 1);
         return _mm_packus_epi32(lo, hi);
     }
+    static U8 pack(U16 v) {
+        __m128i r = _mm_packus_epi16(v,v);
+        return unaligned_load<U8>(&r);
+    }

     static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }

@@ -187,6 +198,12 @@ static Dst bit_cast(const Src& src) {
 #endif
         return unaligned_load<U16>(&p);  // We have two copies.  Return (the lower) one.
     }
+    static U8 pack(U16 v) {
+        __m128i r;
+        memcpy(&r, &v, sizeof(v));
+        r = _mm_packus_epi16(r,r);
+        return unaligned_load<U8>(&r);
+    }

     static F if_then_else(I32 c, F t, F e) {
     #if defined(__SSE4_1__)
@@ -213,7 +230,7 @@ static Dst bit_cast(const Src& src) {
     static U32 expand(U16 v) { return __builtin_convertvector( v, U32); }
     static U32 expand(U8  v) { return __builtin_convertvector( v, U32); }
 #else
-    static F   cast  (U32 v) { return (F)v; }
+    static F   cast  (U32 v) { return      (F)v; }
     static U32 expand(U16 v) { return     (U32)v; }
     static U32 expand(U8  v) { return     (U32)v; }
 #endif
@@ -525,6 +542,19 @@ STAGE(load_tables) {
     a = cast( (px >> 24)) * k->_1_255;
 }

+STAGE(load_a8) {
+    auto ptr = *(const uint8_t**)ctx + x;
+
+    r = g = b = 0.0f;
+    a = cast(expand(unaligned_load<U8>(ptr))) * k->_1_255;
+}
+STAGE(store_a8) {
+    auto ptr = *(uint8_t**)ctx + x;
+
+    U8 packed = pack(pack(round(a, k->_255)));
+    memcpy(ptr, &packed, sizeof(packed));
+}
+
 STAGE(load_565) {
     auto ptr = *(const uint16_t**)ctx + x;
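
The store path narrows in two steps: round() produces 32-bit lanes, the existing pack() narrows U32 to U16, and the new U8 overload narrows U16 to U8 (packuswb/vpackuswb on x86, xtn followed by byte stores on aarch64). A standalone sketch of that narrowing for one 4-wide SSE vector, with a hypothetical name (store4_a8 is not part of SkJumper, and _mm_packs_epi32 stands in for the per-ISA U32-to-U16 pack, which is safe here because the values fit in 0..255):

    #include <immintrin.h>
    #include <stdint.h>
    #include <string.h>

    // Hypothetical standalone version of the SSE store_a8 narrowing:
    // four floats in [0,1] -> four alpha bytes, mirroring round() followed
    // by pack(pack(...)) in SkJumper_stages.cpp.
    static void store4_a8(uint8_t out[4], __m128 a) {
        // round(a, 255.0f): scale, then convert with round-to-nearest (cvtps2dq).
        __m128i v32 = _mm_cvtps_epi32(_mm_mul_ps(a, _mm_set1_ps(255.0f)));
        // Pack U32 -> U16; saturation never kicks in for values in 0..255.
        __m128i v16 = _mm_packs_epi32(v32, v32);
        // Pack U16 -> U8 with unsigned saturation, the same packuswb the stages emit.
        __m128i v8  = _mm_packus_epi16(v16, v16);
        memcpy(out, &v8, 4);  // the movd in the generated code: low four bytes
    }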