diff options
author | Mike Klein <mtklein@chromium.org> | 2017-02-23 08:04:49 -0500 |
---|---|---|
committer | Mike Klein <mtklein@chromium.org> | 2017-02-23 13:37:39 +0000 |
commit | ca0cfb4a7a52ae894ca005475ad9de5ac1329900 (patch) | |
tree | 3f7defe919b4120bb4cef3496c207291e6d1e955 /src/jumper/SkJumper_generated.S | |
parent | a6e431b2c1baa564d2619bdc2a51a3b5bfa7e276 (diff) |
Add AVX to the SkJumper mix.
AVX is a nice little halfway point between SSE4.1 and HSW (Haswell), in terms
of instructions available, performance, and availability.
Intel chips have had AVX since ~2011, compared to ~2013 for HSW and
~2007 for SSE4.1. Like HSW it's got 8-wide 256-bit float vectors,
but integer (and double) operations are essentially still only 128-bit.
It also doesn't have F16 conversion or FMA instructions.
It doesn't look like this is going to be a burden to maintain, and it only
adds a few KB of code size. In exchange, we now run 8x wide on 45% to
70% of x86 machines, depending on the OS.
In my brief testing, the speedups eerily resemble an exact geometric progression:
SSE4.1: 1x speed (baseline)
AVX: ~sqrt(2)x speed
HSW: ~2x speed
This adds all the basic plumbing for AVX but leaves it disabled.
I'll flip it on once I've implemented the f16 TODOs.
Change-Id: I1c378dabb8a06386646371bf78ade9e9432b006f
Reviewed-on: https://skia-review.googlesource.com/8898
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/jumper/SkJumper_generated.S')
-rw-r--r-- | src/jumper/SkJumper_generated.S | 668 |
1 files changed, 668 insertions, 0 deletions
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S index 5d7ec003a2..25bfc1bcd8 100644 --- a/src/jumper/SkJumper_generated.S +++ b/src/jumper/SkJumper_generated.S @@ -1854,6 +1854,674 @@ _sk_linear_gradient_2stops_hsw: .byte 0xc5,0x7c,0x29,0xc0 // vmovaps %ymm8,%ymm0 .byte 0xff,0xe0 // jmpq *%rax +.globl _sk_start_pipeline_avx +_sk_start_pipeline_avx: + .byte 0x41,0x57 // push %r15 + .byte 0x41,0x56 // push %r14 + .byte 0x41,0x55 // push %r13 + .byte 0x41,0x54 // push %r12 + .byte 0x53 // push %rbx + .byte 0x49,0x89,0xcf // mov %rcx,%r15 + .byte 0x49,0x89,0xd6 // mov %rdx,%r14 + .byte 0x48,0x89,0xfb // mov %rdi,%rbx + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x49,0x89,0xc4 // mov %rax,%r12 + .byte 0x49,0x89,0xf5 // mov %rsi,%r13 + .byte 0x48,0x8d,0x43,0x08 // lea 0x8(%rbx),%rax + .byte 0x4c,0x39,0xf8 // cmp %r15,%rax + .byte 0x76,0x05 // jbe 28 <_sk_start_pipeline_avx+0x28> + .byte 0x48,0x89,0xd8 // mov %rbx,%rax + .byte 0xeb,0x3c // jmp 64 <_sk_start_pipeline_avx+0x64> + .byte 0xc5,0xfc,0x57,0xc0 // vxorps %ymm0,%ymm0,%ymm0 + .byte 0xc5,0xf4,0x57,0xc9 // vxorps %ymm1,%ymm1,%ymm1 + .byte 0xc5,0xec,0x57,0xd2 // vxorps %ymm2,%ymm2,%ymm2 + .byte 0xc5,0xe4,0x57,0xdb // vxorps %ymm3,%ymm3,%ymm3 + .byte 0xc5,0xdc,0x57,0xe4 // vxorps %ymm4,%ymm4,%ymm4 + .byte 0xc5,0xd4,0x57,0xed // vxorps %ymm5,%ymm5,%ymm5 + .byte 0xc5,0xcc,0x57,0xf6 // vxorps %ymm6,%ymm6,%ymm6 + .byte 0xc5,0xc4,0x57,0xff // vxorps %ymm7,%ymm7,%ymm7 + .byte 0x48,0x89,0xdf // mov %rbx,%rdi + .byte 0x4c,0x89,0xee // mov %r13,%rsi + .byte 0x4c,0x89,0xf2 // mov %r14,%rdx + .byte 0x41,0xff,0xd4 // callq *%r12 + .byte 0x48,0x8d,0x43,0x08 // lea 0x8(%rbx),%rax + .byte 0x48,0x83,0xc3,0x10 // add $0x10,%rbx + .byte 0x4c,0x39,0xfb // cmp %r15,%rbx + .byte 0x48,0x89,0xc3 // mov %rax,%rbx + .byte 0x76,0xc4 // jbe 28 <_sk_start_pipeline_avx+0x28> + .byte 0x5b // pop %rbx + .byte 0x41,0x5c // pop %r12 + .byte 0x41,0x5d // pop %r13 + .byte 0x41,0x5e // pop %r14 + .byte 0x41,0x5f // pop 
%r15 + .byte 0xc5,0xf8,0x77 // vzeroupper + .byte 0xc3 // retq + +.globl _sk_just_return_avx +_sk_just_return_avx: + .byte 0xc3 // retq + +.globl _sk_seed_shader_avx +_sk_seed_shader_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0xf9,0x6e,0xc7 // vmovd %edi,%xmm0 + .byte 0xc4,0xe3,0x79,0x04,0xc0,0x00 // vpermilps $0x0,%xmm0,%xmm0 + .byte 0xc4,0xe3,0x7d,0x18,0xc0,0x01 // vinsertf128 $0x1,%xmm0,%ymm0,%ymm0 + .byte 0xc5,0xfc,0x5b,0xc0 // vcvtdq2ps %ymm0,%ymm0 + .byte 0xc4,0xe2,0x7d,0x18,0x4a,0x04 // vbroadcastss 0x4(%rdx),%ymm1 + .byte 0xc5,0xfc,0x58,0xc1 // vaddps %ymm1,%ymm0,%ymm0 + .byte 0xc5,0xfc,0x58,0x42,0x14 // vaddps 0x14(%rdx),%ymm0,%ymm0 + .byte 0xc5,0xf9,0x6e,0x10 // vmovd (%rax),%xmm2 + .byte 0xc4,0xe3,0x79,0x04,0xd2,0x00 // vpermilps $0x0,%xmm2,%xmm2 + .byte 0xc4,0xe3,0x6d,0x18,0xd2,0x01 // vinsertf128 $0x1,%xmm2,%ymm2,%ymm2 + .byte 0xc5,0xfc,0x5b,0xd2 // vcvtdq2ps %ymm2,%ymm2 + .byte 0xc5,0xec,0x58,0xc9 // vaddps %ymm1,%ymm2,%ymm1 + .byte 0xc4,0xe2,0x7d,0x18,0x12 // vbroadcastss (%rdx),%ymm2 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0xe4,0x57,0xdb // vxorps %ymm3,%ymm3,%ymm3 + .byte 0xc5,0xdc,0x57,0xe4 // vxorps %ymm4,%ymm4,%ymm4 + .byte 0xc5,0xd4,0x57,0xed // vxorps %ymm5,%ymm5,%ymm5 + .byte 0xc5,0xcc,0x57,0xf6 // vxorps %ymm6,%ymm6,%ymm6 + .byte 0xc5,0xc4,0x57,0xff // vxorps %ymm7,%ymm7,%ymm7 + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_constant_color_avx +_sk_constant_color_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0xe2,0x7d,0x18,0x00 // vbroadcastss (%rax),%ymm0 + .byte 0xc4,0xe2,0x7d,0x18,0x48,0x04 // vbroadcastss 0x4(%rax),%ymm1 + .byte 0xc4,0xe2,0x7d,0x18,0x50,0x08 // vbroadcastss 0x8(%rax),%ymm2 + .byte 0xc4,0xe2,0x7d,0x18,0x58,0x0c // vbroadcastss 0xc(%rax),%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_clear_avx +_sk_clear_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0xfc,0x57,0xc0 // vxorps 
%ymm0,%ymm0,%ymm0 + .byte 0xc5,0xf4,0x57,0xc9 // vxorps %ymm1,%ymm1,%ymm1 + .byte 0xc5,0xec,0x57,0xd2 // vxorps %ymm2,%ymm2,%ymm2 + .byte 0xc5,0xe4,0x57,0xdb // vxorps %ymm3,%ymm3,%ymm3 + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_plus__avx +_sk_plus__avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0xfc,0x58,0xc4 // vaddps %ymm4,%ymm0,%ymm0 + .byte 0xc5,0xf4,0x58,0xcd // vaddps %ymm5,%ymm1,%ymm1 + .byte 0xc5,0xec,0x58,0xd6 // vaddps %ymm6,%ymm2,%ymm2 + .byte 0xc5,0xe4,0x58,0xdf // vaddps %ymm7,%ymm3,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_srcover_avx +_sk_srcover_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x62,0x7d,0x18,0x02 // vbroadcastss (%rdx),%ymm8 + .byte 0xc5,0x3c,0x5c,0xc3 // vsubps %ymm3,%ymm8,%ymm8 + .byte 0xc5,0x3c,0x59,0xcc // vmulps %ymm4,%ymm8,%ymm9 + .byte 0xc5,0xb4,0x58,0xc0 // vaddps %ymm0,%ymm9,%ymm0 + .byte 0xc5,0x3c,0x59,0xcd // vmulps %ymm5,%ymm8,%ymm9 + .byte 0xc5,0xb4,0x58,0xc9 // vaddps %ymm1,%ymm9,%ymm1 + .byte 0xc5,0x3c,0x59,0xce // vmulps %ymm6,%ymm8,%ymm9 + .byte 0xc5,0xb4,0x58,0xd2 // vaddps %ymm2,%ymm9,%ymm2 + .byte 0xc5,0x3c,0x59,0xc7 // vmulps %ymm7,%ymm8,%ymm8 + .byte 0xc5,0xbc,0x58,0xdb // vaddps %ymm3,%ymm8,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_dstover_avx +_sk_dstover_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x62,0x7d,0x18,0x02 // vbroadcastss (%rdx),%ymm8 + .byte 0xc5,0x3c,0x5c,0xc7 // vsubps %ymm7,%ymm8,%ymm8 + .byte 0xc5,0xbc,0x59,0xc0 // vmulps %ymm0,%ymm8,%ymm0 + .byte 0xc5,0xfc,0x58,0xc4 // vaddps %ymm4,%ymm0,%ymm0 + .byte 0xc5,0xbc,0x59,0xc9 // vmulps %ymm1,%ymm8,%ymm1 + .byte 0xc5,0xf4,0x58,0xcd // vaddps %ymm5,%ymm1,%ymm1 + .byte 0xc5,0xbc,0x59,0xd2 // vmulps %ymm2,%ymm8,%ymm2 + .byte 0xc5,0xec,0x58,0xd6 // vaddps %ymm6,%ymm2,%ymm2 + .byte 0xc5,0xbc,0x59,0xdb // vmulps %ymm3,%ymm8,%ymm3 + .byte 0xc5,0xe4,0x58,0xdf // vaddps %ymm7,%ymm3,%ymm3 + .byte 0x48,0xad // 
lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_clamp_0_avx +_sk_clamp_0_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x41,0x3c,0x57,0xc0 // vxorps %ymm8,%ymm8,%ymm8 + .byte 0xc4,0xc1,0x7c,0x5f,0xc0 // vmaxps %ymm8,%ymm0,%ymm0 + .byte 0xc4,0xc1,0x74,0x5f,0xc8 // vmaxps %ymm8,%ymm1,%ymm1 + .byte 0xc4,0xc1,0x6c,0x5f,0xd0 // vmaxps %ymm8,%ymm2,%ymm2 + .byte 0xc4,0xc1,0x64,0x5f,0xd8 // vmaxps %ymm8,%ymm3,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_clamp_1_avx +_sk_clamp_1_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x62,0x7d,0x18,0x02 // vbroadcastss (%rdx),%ymm8 + .byte 0xc4,0xc1,0x7c,0x5d,0xc0 // vminps %ymm8,%ymm0,%ymm0 + .byte 0xc4,0xc1,0x74,0x5d,0xc8 // vminps %ymm8,%ymm1,%ymm1 + .byte 0xc4,0xc1,0x6c,0x5d,0xd0 // vminps %ymm8,%ymm2,%ymm2 + .byte 0xc4,0xc1,0x64,0x5d,0xd8 // vminps %ymm8,%ymm3,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_clamp_a_avx +_sk_clamp_a_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x62,0x7d,0x18,0x02 // vbroadcastss (%rdx),%ymm8 + .byte 0xc4,0xc1,0x64,0x5d,0xd8 // vminps %ymm8,%ymm3,%ymm3 + .byte 0xc5,0xfc,0x5d,0xc3 // vminps %ymm3,%ymm0,%ymm0 + .byte 0xc5,0xf4,0x5d,0xcb // vminps %ymm3,%ymm1,%ymm1 + .byte 0xc5,0xec,0x5d,0xd3 // vminps %ymm3,%ymm2,%ymm2 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_set_rgb_avx +_sk_set_rgb_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0xe2,0x7d,0x18,0x00 // vbroadcastss (%rax),%ymm0 + .byte 0xc4,0xe2,0x7d,0x18,0x48,0x04 // vbroadcastss 0x4(%rax),%ymm1 + .byte 0xc4,0xe2,0x7d,0x18,0x50,0x08 // vbroadcastss 0x8(%rax),%ymm2 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_swap_rb_avx +_sk_swap_rb_avx: + .byte 0xc5,0x7c,0x28,0xc0 // vmovaps %ymm0,%ymm8 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 
0xc5,0xfc,0x28,0xc2 // vmovaps %ymm2,%ymm0 + .byte 0xc5,0x7c,0x29,0xc2 // vmovaps %ymm8,%ymm2 + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_swap_avx +_sk_swap_avx: + .byte 0xc5,0x7c,0x28,0xc3 // vmovaps %ymm3,%ymm8 + .byte 0xc5,0x7c,0x28,0xca // vmovaps %ymm2,%ymm9 + .byte 0xc5,0x7c,0x28,0xd1 // vmovaps %ymm1,%ymm10 + .byte 0xc5,0x7c,0x28,0xd8 // vmovaps %ymm0,%ymm11 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0xfc,0x28,0xc4 // vmovaps %ymm4,%ymm0 + .byte 0xc5,0xfc,0x28,0xcd // vmovaps %ymm5,%ymm1 + .byte 0xc5,0xfc,0x28,0xd6 // vmovaps %ymm6,%ymm2 + .byte 0xc5,0xfc,0x28,0xdf // vmovaps %ymm7,%ymm3 + .byte 0xc5,0x7c,0x29,0xdc // vmovaps %ymm11,%ymm4 + .byte 0xc5,0x7c,0x29,0xd5 // vmovaps %ymm10,%ymm5 + .byte 0xc5,0x7c,0x29,0xce // vmovaps %ymm9,%ymm6 + .byte 0xc5,0x7c,0x29,0xc7 // vmovaps %ymm8,%ymm7 + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_move_src_dst_avx +_sk_move_src_dst_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0xfc,0x28,0xe0 // vmovaps %ymm0,%ymm4 + .byte 0xc5,0xfc,0x28,0xe9 // vmovaps %ymm1,%ymm5 + .byte 0xc5,0xfc,0x28,0xf2 // vmovaps %ymm2,%ymm6 + .byte 0xc5,0xfc,0x28,0xfb // vmovaps %ymm3,%ymm7 + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_move_dst_src_avx +_sk_move_dst_src_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0xfc,0x28,0xc4 // vmovaps %ymm4,%ymm0 + .byte 0xc5,0xfc,0x28,0xcd // vmovaps %ymm5,%ymm1 + .byte 0xc5,0xfc,0x28,0xd6 // vmovaps %ymm6,%ymm2 + .byte 0xc5,0xfc,0x28,0xdf // vmovaps %ymm7,%ymm3 + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_premul_avx +_sk_premul_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0xfc,0x59,0xc3 // vmulps %ymm3,%ymm0,%ymm0 + .byte 0xc5,0xf4,0x59,0xcb // vmulps %ymm3,%ymm1,%ymm1 + .byte 0xc5,0xec,0x59,0xd3 // vmulps %ymm3,%ymm2,%ymm2 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_unpremul_avx 
+_sk_unpremul_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x41,0x3c,0x57,0xc0 // vxorps %ymm8,%ymm8,%ymm8 + .byte 0xc4,0x41,0x64,0xc2,0xc8,0x00 // vcmpeqps %ymm8,%ymm3,%ymm9 + .byte 0xc4,0x62,0x7d,0x18,0x12 // vbroadcastss (%rdx),%ymm10 + .byte 0xc5,0x2c,0x5e,0xd3 // vdivps %ymm3,%ymm10,%ymm10 + .byte 0xc4,0x43,0x2d,0x4a,0xc0,0x90 // vblendvps %ymm9,%ymm8,%ymm10,%ymm8 + .byte 0xc5,0xbc,0x59,0xc0 // vmulps %ymm0,%ymm8,%ymm0 + .byte 0xc5,0xbc,0x59,0xc9 // vmulps %ymm1,%ymm8,%ymm1 + .byte 0xc5,0xbc,0x59,0xd2 // vmulps %ymm2,%ymm8,%ymm2 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_from_srgb_avx +_sk_from_srgb_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x62,0x7d,0x18,0x42,0x40 // vbroadcastss 0x40(%rdx),%ymm8 + .byte 0xc5,0x3c,0x59,0xc8 // vmulps %ymm0,%ymm8,%ymm9 + .byte 0xc5,0x7c,0x59,0xd0 // vmulps %ymm0,%ymm0,%ymm10 + .byte 0xc4,0x62,0x7d,0x18,0x5a,0x3c // vbroadcastss 0x3c(%rdx),%ymm11 + .byte 0xc4,0x62,0x7d,0x18,0x62,0x38 // vbroadcastss 0x38(%rdx),%ymm12 + .byte 0xc5,0x24,0x59,0xe8 // vmulps %ymm0,%ymm11,%ymm13 + .byte 0xc4,0x41,0x14,0x58,0xec // vaddps %ymm12,%ymm13,%ymm13 + .byte 0xc4,0x62,0x7d,0x18,0x72,0x34 // vbroadcastss 0x34(%rdx),%ymm14 + .byte 0xc4,0x41,0x2c,0x59,0xd5 // vmulps %ymm13,%ymm10,%ymm10 + .byte 0xc4,0x41,0x0c,0x58,0xd2 // vaddps %ymm10,%ymm14,%ymm10 + .byte 0xc4,0x62,0x7d,0x18,0x6a,0x44 // vbroadcastss 0x44(%rdx),%ymm13 + .byte 0xc4,0xc1,0x7c,0xc2,0xc5,0x01 // vcmpltps %ymm13,%ymm0,%ymm0 + .byte 0xc4,0xc3,0x2d,0x4a,0xc1,0x00 // vblendvps %ymm0,%ymm9,%ymm10,%ymm0 + .byte 0xc5,0x3c,0x59,0xc9 // vmulps %ymm1,%ymm8,%ymm9 + .byte 0xc5,0x74,0x59,0xd1 // vmulps %ymm1,%ymm1,%ymm10 + .byte 0xc5,0x24,0x59,0xf9 // vmulps %ymm1,%ymm11,%ymm15 + .byte 0xc4,0x41,0x04,0x58,0xfc // vaddps %ymm12,%ymm15,%ymm15 + .byte 0xc4,0x41,0x2c,0x59,0xd7 // vmulps %ymm15,%ymm10,%ymm10 + .byte 0xc4,0x41,0x0c,0x58,0xd2 // vaddps %ymm10,%ymm14,%ymm10 + .byte 0xc4,0xc1,0x74,0xc2,0xcd,0x01 // vcmpltps 
%ymm13,%ymm1,%ymm1 + .byte 0xc4,0xc3,0x2d,0x4a,0xc9,0x10 // vblendvps %ymm1,%ymm9,%ymm10,%ymm1 + .byte 0xc5,0x3c,0x59,0xc2 // vmulps %ymm2,%ymm8,%ymm8 + .byte 0xc5,0x6c,0x59,0xca // vmulps %ymm2,%ymm2,%ymm9 + .byte 0xc5,0x24,0x59,0xd2 // vmulps %ymm2,%ymm11,%ymm10 + .byte 0xc4,0x41,0x2c,0x58,0xd4 // vaddps %ymm12,%ymm10,%ymm10 + .byte 0xc4,0x41,0x34,0x59,0xca // vmulps %ymm10,%ymm9,%ymm9 + .byte 0xc4,0x41,0x0c,0x58,0xc9 // vaddps %ymm9,%ymm14,%ymm9 + .byte 0xc4,0xc1,0x6c,0xc2,0xd5,0x01 // vcmpltps %ymm13,%ymm2,%ymm2 + .byte 0xc4,0xc3,0x35,0x4a,0xd0,0x20 // vblendvps %ymm2,%ymm8,%ymm9,%ymm2 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_to_srgb_avx +_sk_to_srgb_avx: + .byte 0xc5,0x7c,0x52,0xc0 // vrsqrtps %ymm0,%ymm8 + .byte 0xc4,0x41,0x7c,0x53,0xc8 // vrcpps %ymm8,%ymm9 + .byte 0xc4,0x41,0x7c,0x52,0xd0 // vrsqrtps %ymm8,%ymm10 + .byte 0xc4,0x62,0x7d,0x18,0x42,0x48 // vbroadcastss 0x48(%rdx),%ymm8 + .byte 0xc5,0x3c,0x59,0xd8 // vmulps %ymm0,%ymm8,%ymm11 + .byte 0xc4,0x62,0x7d,0x18,0x22 // vbroadcastss (%rdx),%ymm12 + .byte 0xc4,0x62,0x7d,0x18,0x6a,0x4c // vbroadcastss 0x4c(%rdx),%ymm13 + .byte 0xc4,0x62,0x7d,0x18,0x72,0x50 // vbroadcastss 0x50(%rdx),%ymm14 + .byte 0xc4,0x62,0x7d,0x18,0x7a,0x54 // vbroadcastss 0x54(%rdx),%ymm15 + .byte 0xc4,0x41,0x34,0x59,0xce // vmulps %ymm14,%ymm9,%ymm9 + .byte 0xc4,0x41,0x34,0x58,0xcf // vaddps %ymm15,%ymm9,%ymm9 + .byte 0xc4,0x41,0x2c,0x59,0xd5 // vmulps %ymm13,%ymm10,%ymm10 + .byte 0xc4,0x41,0x2c,0x58,0xc9 // vaddps %ymm9,%ymm10,%ymm9 + .byte 0xc4,0x41,0x1c,0x5d,0xc9 // vminps %ymm9,%ymm12,%ymm9 + .byte 0xc4,0x62,0x7d,0x18,0x52,0x58 // vbroadcastss 0x58(%rdx),%ymm10 + .byte 0xc4,0xc1,0x7c,0xc2,0xc2,0x01 // vcmpltps %ymm10,%ymm0,%ymm0 + .byte 0xc4,0xc3,0x35,0x4a,0xc3,0x00 // vblendvps %ymm0,%ymm11,%ymm9,%ymm0 + .byte 0xc5,0x7c,0x52,0xc9 // vrsqrtps %ymm1,%ymm9 + .byte 0xc4,0x41,0x7c,0x53,0xd9 // vrcpps %ymm9,%ymm11 + .byte 0xc4,0x41,0x7c,0x52,0xc9 // vrsqrtps %ymm9,%ymm9 + .byte 
0xc4,0x41,0x0c,0x59,0xdb // vmulps %ymm11,%ymm14,%ymm11 + .byte 0xc4,0x41,0x04,0x58,0xdb // vaddps %ymm11,%ymm15,%ymm11 + .byte 0xc4,0x41,0x14,0x59,0xc9 // vmulps %ymm9,%ymm13,%ymm9 + .byte 0xc4,0x41,0x34,0x58,0xcb // vaddps %ymm11,%ymm9,%ymm9 + .byte 0xc5,0x3c,0x59,0xd9 // vmulps %ymm1,%ymm8,%ymm11 + .byte 0xc4,0x41,0x1c,0x5d,0xc9 // vminps %ymm9,%ymm12,%ymm9 + .byte 0xc4,0xc1,0x74,0xc2,0xca,0x01 // vcmpltps %ymm10,%ymm1,%ymm1 + .byte 0xc4,0xc3,0x35,0x4a,0xcb,0x10 // vblendvps %ymm1,%ymm11,%ymm9,%ymm1 + .byte 0xc5,0x7c,0x52,0xca // vrsqrtps %ymm2,%ymm9 + .byte 0xc4,0x41,0x7c,0x53,0xd9 // vrcpps %ymm9,%ymm11 + .byte 0xc4,0x41,0x0c,0x59,0xdb // vmulps %ymm11,%ymm14,%ymm11 + .byte 0xc4,0x41,0x04,0x58,0xdb // vaddps %ymm11,%ymm15,%ymm11 + .byte 0xc4,0x41,0x7c,0x52,0xc9 // vrsqrtps %ymm9,%ymm9 + .byte 0xc4,0x41,0x14,0x59,0xc9 // vmulps %ymm9,%ymm13,%ymm9 + .byte 0xc4,0x41,0x34,0x58,0xcb // vaddps %ymm11,%ymm9,%ymm9 + .byte 0xc4,0x41,0x1c,0x5d,0xc9 // vminps %ymm9,%ymm12,%ymm9 + .byte 0xc5,0x3c,0x59,0xc2 // vmulps %ymm2,%ymm8,%ymm8 + .byte 0xc4,0xc1,0x6c,0xc2,0xd2,0x01 // vcmpltps %ymm10,%ymm2,%ymm2 + .byte 0xc4,0xc3,0x35,0x4a,0xd0,0x20 // vblendvps %ymm2,%ymm8,%ymm9,%ymm2 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_scale_u8_avx +_sk_scale_u8_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0x8b,0x00 // mov (%rax),%rax + .byte 0xc4,0x62,0x79,0x31,0x44,0x38,0x04 // vpmovzxbd 0x4(%rax,%rdi,1),%xmm8 + .byte 0xc4,0x62,0x79,0x31,0x0c,0x38 // vpmovzxbd (%rax,%rdi,1),%xmm9 + .byte 0xc4,0x43,0x35,0x18,0xc0,0x01 // vinsertf128 $0x1,%xmm8,%ymm9,%ymm8 + .byte 0xc4,0x41,0x7c,0x5b,0xc0 // vcvtdq2ps %ymm8,%ymm8 + .byte 0xc4,0x62,0x7d,0x18,0x4a,0x0c // vbroadcastss 0xc(%rdx),%ymm9 + .byte 0xc4,0x41,0x3c,0x59,0xc1 // vmulps %ymm9,%ymm8,%ymm8 + .byte 0xc5,0xbc,0x59,0xc0 // vmulps %ymm0,%ymm8,%ymm0 + .byte 0xc5,0xbc,0x59,0xc9 // vmulps %ymm1,%ymm8,%ymm1 + .byte 0xc5,0xbc,0x59,0xd2 // vmulps 
%ymm2,%ymm8,%ymm2 + .byte 0xc5,0xbc,0x59,0xdb // vmulps %ymm3,%ymm8,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_lerp_u8_avx +_sk_lerp_u8_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0x8b,0x00 // mov (%rax),%rax + .byte 0xc4,0x62,0x79,0x31,0x44,0x38,0x04 // vpmovzxbd 0x4(%rax,%rdi,1),%xmm8 + .byte 0xc4,0x62,0x79,0x31,0x0c,0x38 // vpmovzxbd (%rax,%rdi,1),%xmm9 + .byte 0xc4,0x43,0x35,0x18,0xc0,0x01 // vinsertf128 $0x1,%xmm8,%ymm9,%ymm8 + .byte 0xc4,0x41,0x7c,0x5b,0xc0 // vcvtdq2ps %ymm8,%ymm8 + .byte 0xc4,0x62,0x7d,0x18,0x4a,0x0c // vbroadcastss 0xc(%rdx),%ymm9 + .byte 0xc4,0x41,0x3c,0x59,0xc1 // vmulps %ymm9,%ymm8,%ymm8 + .byte 0xc5,0xfc,0x5c,0xc4 // vsubps %ymm4,%ymm0,%ymm0 + .byte 0xc4,0xc1,0x7c,0x59,0xc0 // vmulps %ymm8,%ymm0,%ymm0 + .byte 0xc5,0xfc,0x58,0xc4 // vaddps %ymm4,%ymm0,%ymm0 + .byte 0xc5,0xf4,0x5c,0xcd // vsubps %ymm5,%ymm1,%ymm1 + .byte 0xc4,0xc1,0x74,0x59,0xc8 // vmulps %ymm8,%ymm1,%ymm1 + .byte 0xc5,0xf4,0x58,0xcd // vaddps %ymm5,%ymm1,%ymm1 + .byte 0xc5,0xec,0x5c,0xd6 // vsubps %ymm6,%ymm2,%ymm2 + .byte 0xc4,0xc1,0x6c,0x59,0xd0 // vmulps %ymm8,%ymm2,%ymm2 + .byte 0xc5,0xec,0x58,0xd6 // vaddps %ymm6,%ymm2,%ymm2 + .byte 0xc5,0xe4,0x5c,0xdf // vsubps %ymm7,%ymm3,%ymm3 + .byte 0xc4,0xc1,0x64,0x59,0xd8 // vmulps %ymm8,%ymm3,%ymm3 + .byte 0xc5,0xe4,0x58,0xdf // vaddps %ymm7,%ymm3,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_load_tables_avx +_sk_load_tables_avx: + .byte 0x41,0x57 // push %r15 + .byte 0x41,0x56 // push %r14 + .byte 0x41,0x54 // push %r12 + .byte 0x53 // push %rbx + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x4c,0x8b,0x00 // mov (%rax),%r8 + .byte 0x48,0x8b,0x48,0x08 // mov 0x8(%rax),%rcx + .byte 0xc4,0x41,0x7c,0x10,0x14,0xb8 // vmovups (%r8,%rdi,4),%ymm10 + .byte 0xc5,0xf9,0x6e,0x42,0x10 // vmovd 0x10(%rdx),%xmm0 + .byte 0xc4,0xe3,0x79,0x04,0xc0,0x00 // vpermilps $0x0,%xmm0,%xmm0 + .byte 0xc4,0x63,0x7d,0x18,0xc8,0x01 // 
vinsertf128 $0x1,%xmm0,%ymm0,%ymm9 + .byte 0xc4,0xc1,0x34,0x54,0xc2 // vandps %ymm10,%ymm9,%ymm0 + .byte 0xc4,0xc1,0xf9,0x7e,0xc0 // vmovq %xmm0,%r8 + .byte 0x45,0x89,0xc1 // mov %r8d,%r9d + .byte 0xc4,0xc3,0xf9,0x16,0xc2,0x01 // vpextrq $0x1,%xmm0,%r10 + .byte 0x45,0x89,0xd3 // mov %r10d,%r11d + .byte 0x49,0xc1,0xea,0x20 // shr $0x20,%r10 + .byte 0x49,0xc1,0xe8,0x20 // shr $0x20,%r8 + .byte 0xc4,0xe3,0x7d,0x19,0xc0,0x01 // vextractf128 $0x1,%ymm0,%xmm0 + .byte 0xc4,0xc1,0xf9,0x7e,0xc7 // vmovq %xmm0,%r15 + .byte 0x45,0x89,0xfe // mov %r15d,%r14d + .byte 0xc4,0xe3,0xf9,0x16,0xc3,0x01 // vpextrq $0x1,%xmm0,%rbx + .byte 0x41,0x89,0xdc // mov %ebx,%r12d + .byte 0x48,0xc1,0xeb,0x20 // shr $0x20,%rbx + .byte 0x49,0xc1,0xef,0x20 // shr $0x20,%r15 + .byte 0xc4,0xa1,0x7a,0x10,0x04,0xb1 // vmovss (%rcx,%r14,4),%xmm0 + .byte 0xc4,0xa3,0x79,0x21,0x04,0xb9,0x10 // vinsertps $0x10,(%rcx,%r15,4),%xmm0,%xmm0 + .byte 0xc4,0xa3,0x79,0x21,0x04,0xa1,0x20 // vinsertps $0x20,(%rcx,%r12,4),%xmm0,%xmm0 + .byte 0xc4,0xe3,0x79,0x21,0x04,0x99,0x30 // vinsertps $0x30,(%rcx,%rbx,4),%xmm0,%xmm0 + .byte 0xc4,0xa1,0x7a,0x10,0x0c,0x89 // vmovss (%rcx,%r9,4),%xmm1 + .byte 0xc4,0xa3,0x71,0x21,0x0c,0x81,0x10 // vinsertps $0x10,(%rcx,%r8,4),%xmm1,%xmm1 + .byte 0xc4,0xa3,0x71,0x21,0x0c,0x99,0x20 // vinsertps $0x20,(%rcx,%r11,4),%xmm1,%xmm1 + .byte 0xc4,0xa3,0x71,0x21,0x0c,0x91,0x30 // vinsertps $0x30,(%rcx,%r10,4),%xmm1,%xmm1 + .byte 0xc4,0xe3,0x75,0x18,0xc0,0x01 // vinsertf128 $0x1,%xmm0,%ymm1,%ymm0 + .byte 0x4c,0x8b,0x78,0x10 // mov 0x10(%rax),%r15 + .byte 0xc4,0xc1,0x71,0x72,0xd2,0x08 // vpsrld $0x8,%xmm10,%xmm1 + .byte 0xc4,0x43,0x7d,0x19,0xd0,0x01 // vextractf128 $0x1,%ymm10,%xmm8 + .byte 0xc4,0xc1,0x69,0x72,0xd0,0x08 // vpsrld $0x8,%xmm8,%xmm2 + .byte 0xc4,0xe3,0x75,0x18,0xca,0x01 // vinsertf128 $0x1,%xmm2,%ymm1,%ymm1 + .byte 0xc5,0xb4,0x54,0xc9 // vandps %ymm1,%ymm9,%ymm1 + .byte 0xc4,0xc1,0xf9,0x7e,0xc8 // vmovq %xmm1,%r8 + .byte 0x45,0x89,0xc2 // mov %r8d,%r10d + .byte 
0xc4,0xc3,0xf9,0x16,0xc9,0x01 // vpextrq $0x1,%xmm1,%r9 + .byte 0x45,0x89,0xcb // mov %r9d,%r11d + .byte 0x49,0xc1,0xe9,0x20 // shr $0x20,%r9 + .byte 0x49,0xc1,0xe8,0x20 // shr $0x20,%r8 + .byte 0xc4,0xe3,0x7d,0x19,0xc9,0x01 // vextractf128 $0x1,%ymm1,%xmm1 + .byte 0xc4,0xe1,0xf9,0x7e,0xcb // vmovq %xmm1,%rbx + .byte 0x41,0x89,0xde // mov %ebx,%r14d + .byte 0xc4,0xe3,0xf9,0x16,0xc9,0x01 // vpextrq $0x1,%xmm1,%rcx + .byte 0x41,0x89,0xcc // mov %ecx,%r12d + .byte 0x48,0xc1,0xe9,0x20 // shr $0x20,%rcx + .byte 0x48,0xc1,0xeb,0x20 // shr $0x20,%rbx + .byte 0xc4,0x81,0x7a,0x10,0x0c,0xb7 // vmovss (%r15,%r14,4),%xmm1 + .byte 0xc4,0xc3,0x71,0x21,0x0c,0x9f,0x10 // vinsertps $0x10,(%r15,%rbx,4),%xmm1,%xmm1 + .byte 0xc4,0x81,0x7a,0x10,0x14,0xa7 // vmovss (%r15,%r12,4),%xmm2 + .byte 0xc4,0xe3,0x71,0x21,0xca,0x20 // vinsertps $0x20,%xmm2,%xmm1,%xmm1 + .byte 0xc4,0xc1,0x7a,0x10,0x14,0x8f // vmovss (%r15,%rcx,4),%xmm2 + .byte 0xc4,0xe3,0x71,0x21,0xca,0x30 // vinsertps $0x30,%xmm2,%xmm1,%xmm1 + .byte 0xc4,0x81,0x7a,0x10,0x14,0x97 // vmovss (%r15,%r10,4),%xmm2 + .byte 0xc4,0x83,0x69,0x21,0x14,0x87,0x10 // vinsertps $0x10,(%r15,%r8,4),%xmm2,%xmm2 + .byte 0xc4,0x81,0x7a,0x10,0x1c,0x9f // vmovss (%r15,%r11,4),%xmm3 + .byte 0xc4,0xe3,0x69,0x21,0xd3,0x20 // vinsertps $0x20,%xmm3,%xmm2,%xmm2 + .byte 0xc4,0x81,0x7a,0x10,0x1c,0x8f // vmovss (%r15,%r9,4),%xmm3 + .byte 0xc4,0xe3,0x69,0x21,0xd3,0x30 // vinsertps $0x30,%xmm3,%xmm2,%xmm2 + .byte 0xc4,0xe3,0x6d,0x18,0xc9,0x01 // vinsertf128 $0x1,%xmm1,%ymm2,%ymm1 + .byte 0x48,0x8b,0x40,0x18 // mov 0x18(%rax),%rax + .byte 0xc4,0xc1,0x69,0x72,0xd2,0x10 // vpsrld $0x10,%xmm10,%xmm2 + .byte 0xc4,0xc1,0x61,0x72,0xd0,0x10 // vpsrld $0x10,%xmm8,%xmm3 + .byte 0xc4,0xe3,0x6d,0x18,0xd3,0x01 // vinsertf128 $0x1,%xmm3,%ymm2,%ymm2 + .byte 0xc5,0xb4,0x54,0xd2 // vandps %ymm2,%ymm9,%ymm2 + .byte 0xc4,0xc1,0xf9,0x7e,0xd0 // vmovq %xmm2,%r8 + .byte 0x45,0x89,0xc1 // mov %r8d,%r9d + .byte 0xc4,0xc3,0xf9,0x16,0xd6,0x01 // vpextrq $0x1,%xmm2,%r14 + .byte 
0x45,0x89,0xf2 // mov %r14d,%r10d + .byte 0x49,0xc1,0xee,0x20 // shr $0x20,%r14 + .byte 0x49,0xc1,0xe8,0x20 // shr $0x20,%r8 + .byte 0xc4,0xe3,0x7d,0x19,0xd2,0x01 // vextractf128 $0x1,%ymm2,%xmm2 + .byte 0xc4,0xe1,0xf9,0x7e,0xd3 // vmovq %xmm2,%rbx + .byte 0x41,0x89,0xdb // mov %ebx,%r11d + .byte 0xc4,0xe3,0xf9,0x16,0xd1,0x01 // vpextrq $0x1,%xmm2,%rcx + .byte 0x41,0x89,0xcf // mov %ecx,%r15d + .byte 0x48,0xc1,0xe9,0x20 // shr $0x20,%rcx + .byte 0x48,0xc1,0xeb,0x20 // shr $0x20,%rbx + .byte 0xc4,0xa1,0x7a,0x10,0x14,0x98 // vmovss (%rax,%r11,4),%xmm2 + .byte 0xc4,0xe3,0x69,0x21,0x14,0x98,0x10 // vinsertps $0x10,(%rax,%rbx,4),%xmm2,%xmm2 + .byte 0xc4,0xa1,0x7a,0x10,0x1c,0xb8 // vmovss (%rax,%r15,4),%xmm3 + .byte 0xc4,0xe3,0x69,0x21,0xd3,0x20 // vinsertps $0x20,%xmm3,%xmm2,%xmm2 + .byte 0xc5,0xfa,0x10,0x1c,0x88 // vmovss (%rax,%rcx,4),%xmm3 + .byte 0xc4,0x63,0x69,0x21,0xcb,0x30 // vinsertps $0x30,%xmm3,%xmm2,%xmm9 + .byte 0xc4,0xa1,0x7a,0x10,0x1c,0x88 // vmovss (%rax,%r9,4),%xmm3 + .byte 0xc4,0xa3,0x61,0x21,0x1c,0x80,0x10 // vinsertps $0x10,(%rax,%r8,4),%xmm3,%xmm3 + .byte 0xc4,0xa1,0x7a,0x10,0x14,0x90 // vmovss (%rax,%r10,4),%xmm2 + .byte 0xc4,0xe3,0x61,0x21,0xd2,0x20 // vinsertps $0x20,%xmm2,%xmm3,%xmm2 + .byte 0xc4,0xa1,0x7a,0x10,0x1c,0xb0 // vmovss (%rax,%r14,4),%xmm3 + .byte 0xc4,0xe3,0x69,0x21,0xd3,0x30 // vinsertps $0x30,%xmm3,%xmm2,%xmm2 + .byte 0xc4,0xc3,0x6d,0x18,0xd1,0x01 // vinsertf128 $0x1,%xmm9,%ymm2,%ymm2 + .byte 0xc4,0xc1,0x31,0x72,0xd2,0x18 // vpsrld $0x18,%xmm10,%xmm9 + .byte 0xc4,0xc1,0x61,0x72,0xd0,0x18 // vpsrld $0x18,%xmm8,%xmm3 + .byte 0xc4,0xe3,0x35,0x18,0xdb,0x01 // vinsertf128 $0x1,%xmm3,%ymm9,%ymm3 + .byte 0xc5,0xfc,0x5b,0xdb // vcvtdq2ps %ymm3,%ymm3 + .byte 0xc4,0x62,0x7d,0x18,0x42,0x0c // vbroadcastss 0xc(%rdx),%ymm8 + .byte 0xc4,0xc1,0x64,0x59,0xd8 // vmulps %ymm8,%ymm3,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x5b // pop %rbx + .byte 0x41,0x5c // pop %r12 + .byte 0x41,0x5e // pop %r14 + .byte 0x41,0x5f // pop %r15 + .byte 
0xff,0xe0 // jmpq *%rax + +.globl _sk_load_8888_avx +_sk_load_8888_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0x8b,0x00 // mov (%rax),%rax + .byte 0xc5,0xfc,0x10,0x1c,0xb8 // vmovups (%rax,%rdi,4),%ymm3 + .byte 0xc5,0xf9,0x6e,0x42,0x10 // vmovd 0x10(%rdx),%xmm0 + .byte 0xc4,0xe3,0x79,0x04,0xc0,0x00 // vpermilps $0x0,%xmm0,%xmm0 + .byte 0xc4,0x63,0x7d,0x18,0xd8,0x01 // vinsertf128 $0x1,%xmm0,%ymm0,%ymm11 + .byte 0xc5,0xa4,0x54,0xc3 // vandps %ymm3,%ymm11,%ymm0 + .byte 0xc5,0xfc,0x5b,0xc0 // vcvtdq2ps %ymm0,%ymm0 + .byte 0xc4,0x62,0x7d,0x18,0x42,0x0c // vbroadcastss 0xc(%rdx),%ymm8 + .byte 0xc5,0xbc,0x59,0xc0 // vmulps %ymm0,%ymm8,%ymm0 + .byte 0xc5,0xa9,0x72,0xd3,0x08 // vpsrld $0x8,%xmm3,%xmm10 + .byte 0xc4,0xc3,0x7d,0x19,0xd9,0x01 // vextractf128 $0x1,%ymm3,%xmm9 + .byte 0xc4,0xc1,0x71,0x72,0xd1,0x08 // vpsrld $0x8,%xmm9,%xmm1 + .byte 0xc4,0xe3,0x2d,0x18,0xc9,0x01 // vinsertf128 $0x1,%xmm1,%ymm10,%ymm1 + .byte 0xc5,0xa4,0x54,0xc9 // vandps %ymm1,%ymm11,%ymm1 + .byte 0xc5,0xfc,0x5b,0xc9 // vcvtdq2ps %ymm1,%ymm1 + .byte 0xc5,0xbc,0x59,0xc9 // vmulps %ymm1,%ymm8,%ymm1 + .byte 0xc5,0xa9,0x72,0xd3,0x10 // vpsrld $0x10,%xmm3,%xmm10 + .byte 0xc4,0xc1,0x69,0x72,0xd1,0x10 // vpsrld $0x10,%xmm9,%xmm2 + .byte 0xc4,0xe3,0x2d,0x18,0xd2,0x01 // vinsertf128 $0x1,%xmm2,%ymm10,%ymm2 + .byte 0xc5,0xa4,0x54,0xd2 // vandps %ymm2,%ymm11,%ymm2 + .byte 0xc5,0xfc,0x5b,0xd2 // vcvtdq2ps %ymm2,%ymm2 + .byte 0xc5,0xbc,0x59,0xd2 // vmulps %ymm2,%ymm8,%ymm2 + .byte 0xc5,0xa9,0x72,0xd3,0x18 // vpsrld $0x18,%xmm3,%xmm10 + .byte 0xc4,0xc1,0x61,0x72,0xd1,0x18 // vpsrld $0x18,%xmm9,%xmm3 + .byte 0xc4,0xe3,0x2d,0x18,0xdb,0x01 // vinsertf128 $0x1,%xmm3,%ymm10,%ymm3 + .byte 0xc5,0xfc,0x5b,0xdb // vcvtdq2ps %ymm3,%ymm3 + .byte 0xc4,0xc1,0x64,0x59,0xd8 // vmulps %ymm8,%ymm3,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_store_8888_avx +_sk_store_8888_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0x8b,0x00 // mov (%rax),%rax + 
.byte 0xc4,0x62,0x7d,0x18,0x42,0x08 // vbroadcastss 0x8(%rdx),%ymm8 + .byte 0xc5,0x3c,0x59,0xc8 // vmulps %ymm0,%ymm8,%ymm9 + .byte 0xc4,0x41,0x7d,0x5b,0xc9 // vcvtps2dq %ymm9,%ymm9 + .byte 0xc5,0x3c,0x59,0xd1 // vmulps %ymm1,%ymm8,%ymm10 + .byte 0xc4,0x41,0x7d,0x5b,0xd2 // vcvtps2dq %ymm10,%ymm10 + .byte 0xc4,0xc1,0x21,0x72,0xf2,0x08 // vpslld $0x8,%xmm10,%xmm11 + .byte 0xc4,0x43,0x7d,0x19,0xd2,0x01 // vextractf128 $0x1,%ymm10,%xmm10 + .byte 0xc4,0xc1,0x29,0x72,0xf2,0x08 // vpslld $0x8,%xmm10,%xmm10 + .byte 0xc4,0x43,0x25,0x18,0xd2,0x01 // vinsertf128 $0x1,%xmm10,%ymm11,%ymm10 + .byte 0xc4,0x41,0x2d,0x56,0xc9 // vorpd %ymm9,%ymm10,%ymm9 + .byte 0xc5,0x3c,0x59,0xd2 // vmulps %ymm2,%ymm8,%ymm10 + .byte 0xc4,0x41,0x7d,0x5b,0xd2 // vcvtps2dq %ymm10,%ymm10 + .byte 0xc4,0xc1,0x21,0x72,0xf2,0x10 // vpslld $0x10,%xmm10,%xmm11 + .byte 0xc4,0x43,0x7d,0x19,0xd2,0x01 // vextractf128 $0x1,%ymm10,%xmm10 + .byte 0xc4,0xc1,0x29,0x72,0xf2,0x10 // vpslld $0x10,%xmm10,%xmm10 + .byte 0xc4,0x43,0x25,0x18,0xd2,0x01 // vinsertf128 $0x1,%xmm10,%ymm11,%ymm10 + .byte 0xc4,0x41,0x35,0x56,0xca // vorpd %ymm10,%ymm9,%ymm9 + .byte 0xc5,0x3c,0x59,0xc3 // vmulps %ymm3,%ymm8,%ymm8 + .byte 0xc4,0x41,0x7d,0x5b,0xc0 // vcvtps2dq %ymm8,%ymm8 + .byte 0xc4,0xc1,0x29,0x72,0xf0,0x18 // vpslld $0x18,%xmm8,%xmm10 + .byte 0xc4,0x43,0x7d,0x19,0xc0,0x01 // vextractf128 $0x1,%ymm8,%xmm8 + .byte 0xc4,0xc1,0x39,0x72,0xf0,0x18 // vpslld $0x18,%xmm8,%xmm8 + .byte 0xc4,0x43,0x2d,0x18,0xc0,0x01 // vinsertf128 $0x1,%xmm8,%ymm10,%ymm8 + .byte 0xc4,0x41,0x35,0x56,0xc0 // vorpd %ymm8,%ymm9,%ymm8 + .byte 0xc5,0x7d,0x11,0x04,0xb8 // vmovupd %ymm8,(%rax,%rdi,4) + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_load_f16_avx +_sk_load_f16_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_store_f16_avx +_sk_store_f16_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0xad // lods %ds:(%rsi),%rax 
+ .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_clamp_x_avx +_sk_clamp_x_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x62,0x7d,0x18,0x00 // vbroadcastss (%rax),%ymm8 + .byte 0xc4,0x43,0x7d,0x19,0xc1,0x01 // vextractf128 $0x1,%ymm8,%xmm9 + .byte 0xc4,0x41,0x29,0x76,0xd2 // vpcmpeqd %xmm10,%xmm10,%xmm10 + .byte 0xc4,0x41,0x31,0xfe,0xca // vpaddd %xmm10,%xmm9,%xmm9 + .byte 0xc4,0x41,0x39,0xfe,0xc2 // vpaddd %xmm10,%xmm8,%xmm8 + .byte 0xc4,0x43,0x3d,0x18,0xc1,0x01 // vinsertf128 $0x1,%xmm9,%ymm8,%ymm8 + .byte 0xc4,0xc1,0x7c,0x5d,0xc0 // vminps %ymm8,%ymm0,%ymm0 + .byte 0xc4,0x41,0x3c,0x57,0xc0 // vxorps %ymm8,%ymm8,%ymm8 + .byte 0xc5,0xbc,0x5f,0xc0 // vmaxps %ymm0,%ymm8,%ymm0 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_clamp_y_avx +_sk_clamp_y_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x62,0x7d,0x18,0x00 // vbroadcastss (%rax),%ymm8 + .byte 0xc4,0x43,0x7d,0x19,0xc1,0x01 // vextractf128 $0x1,%ymm8,%xmm9 + .byte 0xc4,0x41,0x29,0x76,0xd2 // vpcmpeqd %xmm10,%xmm10,%xmm10 + .byte 0xc4,0x41,0x31,0xfe,0xca // vpaddd %xmm10,%xmm9,%xmm9 + .byte 0xc4,0x41,0x39,0xfe,0xc2 // vpaddd %xmm10,%xmm8,%xmm8 + .byte 0xc4,0x43,0x3d,0x18,0xc1,0x01 // vinsertf128 $0x1,%xmm9,%ymm8,%ymm8 + .byte 0xc4,0xc1,0x74,0x5d,0xc8 // vminps %ymm8,%ymm1,%ymm1 + .byte 0xc4,0x41,0x3c,0x57,0xc0 // vxorps %ymm8,%ymm8,%ymm8 + .byte 0xc5,0xbc,0x5f,0xc9 // vmaxps %ymm1,%ymm8,%ymm1 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_matrix_2x3_avx +_sk_matrix_2x3_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x62,0x7d,0x18,0x00 // vbroadcastss (%rax),%ymm8 + .byte 0xc4,0x62,0x7d,0x18,0x48,0x08 // vbroadcastss 0x8(%rax),%ymm9 + .byte 0xc4,0x62,0x7d,0x18,0x50,0x10 // vbroadcastss 0x10(%rax),%ymm10 + .byte 0xc5,0x34,0x59,0xc9 // vmulps %ymm1,%ymm9,%ymm9 + .byte 0xc4,0x41,0x34,0x58,0xca // vaddps %ymm10,%ymm9,%ymm9 + .byte 0xc5,0x3c,0x59,0xc0 // vmulps %ymm0,%ymm8,%ymm8 + .byte 
0xc4,0x41,0x3c,0x58,0xc1 // vaddps %ymm9,%ymm8,%ymm8 + .byte 0xc4,0x62,0x7d,0x18,0x48,0x04 // vbroadcastss 0x4(%rax),%ymm9 + .byte 0xc4,0x62,0x7d,0x18,0x50,0x0c // vbroadcastss 0xc(%rax),%ymm10 + .byte 0xc4,0x62,0x7d,0x18,0x58,0x14 // vbroadcastss 0x14(%rax),%ymm11 + .byte 0xc5,0xac,0x59,0xc9 // vmulps %ymm1,%ymm10,%ymm1 + .byte 0xc4,0xc1,0x74,0x58,0xcb // vaddps %ymm11,%ymm1,%ymm1 + .byte 0xc5,0xb4,0x59,0xc0 // vmulps %ymm0,%ymm9,%ymm0 + .byte 0xc5,0xfc,0x58,0xc9 // vaddps %ymm1,%ymm0,%ymm1 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0x7c,0x29,0xc0 // vmovaps %ymm8,%ymm0 + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_matrix_3x4_avx +_sk_matrix_3x4_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x62,0x7d,0x18,0x00 // vbroadcastss (%rax),%ymm8 + .byte 0xc4,0x62,0x7d,0x18,0x48,0x0c // vbroadcastss 0xc(%rax),%ymm9 + .byte 0xc4,0x62,0x7d,0x18,0x50,0x18 // vbroadcastss 0x18(%rax),%ymm10 + .byte 0xc4,0x62,0x7d,0x18,0x58,0x24 // vbroadcastss 0x24(%rax),%ymm11 + .byte 0xc5,0x2c,0x59,0xd2 // vmulps %ymm2,%ymm10,%ymm10 + .byte 0xc4,0x41,0x2c,0x58,0xd3 // vaddps %ymm11,%ymm10,%ymm10 + .byte 0xc5,0x34,0x59,0xc9 // vmulps %ymm1,%ymm9,%ymm9 + .byte 0xc4,0x41,0x34,0x58,0xca // vaddps %ymm10,%ymm9,%ymm9 + .byte 0xc5,0x3c,0x59,0xc0 // vmulps %ymm0,%ymm8,%ymm8 + .byte 0xc4,0x41,0x3c,0x58,0xc1 // vaddps %ymm9,%ymm8,%ymm8 + .byte 0xc4,0x62,0x7d,0x18,0x48,0x04 // vbroadcastss 0x4(%rax),%ymm9 + .byte 0xc4,0x62,0x7d,0x18,0x50,0x10 // vbroadcastss 0x10(%rax),%ymm10 + .byte 0xc4,0x62,0x7d,0x18,0x58,0x1c // vbroadcastss 0x1c(%rax),%ymm11 + .byte 0xc4,0x62,0x7d,0x18,0x60,0x28 // vbroadcastss 0x28(%rax),%ymm12 + .byte 0xc5,0x24,0x59,0xda // vmulps %ymm2,%ymm11,%ymm11 + .byte 0xc4,0x41,0x24,0x58,0xdc // vaddps %ymm12,%ymm11,%ymm11 + .byte 0xc5,0x2c,0x59,0xd1 // vmulps %ymm1,%ymm10,%ymm10 + .byte 0xc4,0x41,0x2c,0x58,0xd3 // vaddps %ymm11,%ymm10,%ymm10 + .byte 0xc5,0x34,0x59,0xc8 // vmulps %ymm0,%ymm9,%ymm9 + .byte 0xc4,0x41,0x34,0x58,0xca // vaddps %ymm10,%ymm9,%ymm9 + 
.byte 0xc4,0x62,0x7d,0x18,0x50,0x08 // vbroadcastss 0x8(%rax),%ymm10 + .byte 0xc4,0x62,0x7d,0x18,0x58,0x14 // vbroadcastss 0x14(%rax),%ymm11 + .byte 0xc4,0x62,0x7d,0x18,0x60,0x20 // vbroadcastss 0x20(%rax),%ymm12 + .byte 0xc4,0x62,0x7d,0x18,0x68,0x2c // vbroadcastss 0x2c(%rax),%ymm13 + .byte 0xc5,0x9c,0x59,0xd2 // vmulps %ymm2,%ymm12,%ymm2 + .byte 0xc4,0xc1,0x6c,0x58,0xd5 // vaddps %ymm13,%ymm2,%ymm2 + .byte 0xc5,0xa4,0x59,0xc9 // vmulps %ymm1,%ymm11,%ymm1 + .byte 0xc5,0xf4,0x58,0xca // vaddps %ymm2,%ymm1,%ymm1 + .byte 0xc5,0xac,0x59,0xc0 // vmulps %ymm0,%ymm10,%ymm0 + .byte 0xc5,0xfc,0x58,0xd1 // vaddps %ymm1,%ymm0,%ymm2 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0x7c,0x29,0xc0 // vmovaps %ymm8,%ymm0 + .byte 0xc5,0x7c,0x29,0xc9 // vmovaps %ymm9,%ymm1 + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_linear_gradient_2stops_avx +_sk_linear_gradient_2stops_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0xe2,0x7d,0x18,0x48,0x10 // vbroadcastss 0x10(%rax),%ymm1 + .byte 0xc4,0xe2,0x7d,0x18,0x10 // vbroadcastss (%rax),%ymm2 + .byte 0xc5,0xf4,0x59,0xc8 // vmulps %ymm0,%ymm1,%ymm1 + .byte 0xc5,0x6c,0x58,0xc1 // vaddps %ymm1,%ymm2,%ymm8 + .byte 0xc4,0xe2,0x7d,0x18,0x48,0x14 // vbroadcastss 0x14(%rax),%ymm1 + .byte 0xc4,0xe2,0x7d,0x18,0x50,0x04 // vbroadcastss 0x4(%rax),%ymm2 + .byte 0xc5,0xf4,0x59,0xc8 // vmulps %ymm0,%ymm1,%ymm1 + .byte 0xc5,0xec,0x58,0xc9 // vaddps %ymm1,%ymm2,%ymm1 + .byte 0xc4,0xe2,0x7d,0x18,0x50,0x18 // vbroadcastss 0x18(%rax),%ymm2 + .byte 0xc4,0xe2,0x7d,0x18,0x58,0x08 // vbroadcastss 0x8(%rax),%ymm3 + .byte 0xc5,0xec,0x59,0xd0 // vmulps %ymm0,%ymm2,%ymm2 + .byte 0xc5,0xe4,0x58,0xd2 // vaddps %ymm2,%ymm3,%ymm2 + .byte 0xc4,0xe2,0x7d,0x18,0x58,0x1c // vbroadcastss 0x1c(%rax),%ymm3 + .byte 0xc4,0x62,0x7d,0x18,0x48,0x0c // vbroadcastss 0xc(%rax),%ymm9 + .byte 0xc5,0xe4,0x59,0xc0 // vmulps %ymm0,%ymm3,%ymm0 + .byte 0xc5,0xb4,0x58,0xd8 // vaddps %ymm0,%ymm9,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0x7c,0x29,0xc0 // 
vmovaps %ymm8,%ymm0 + .byte 0xff,0xe0 // jmpq *%rax + .globl _sk_start_pipeline_sse41 _sk_start_pipeline_sse41: .byte 0x41,0x57 // push %r15 |