aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/jumper
diff options
context:
space:
mode:
Diffstat (limited to 'src/jumper')
-rw-r--r--src/jumper/SkJumper.cpp22
-rw-r--r--src/jumper/SkJumper_generated.S668
-rw-r--r--src/jumper/SkJumper_generated_win.S695
-rw-r--r--src/jumper/SkJumper_stages.cpp29
-rwxr-xr-xsrc/jumper/build_stages.py10
5 files changed, 1424 insertions, 0 deletions
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 18a5f0275f..b5271a6a58 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -106,16 +106,21 @@ extern "C" {
#elif defined(__x86_64__) || defined(_M_X64)
size_t ASM(start_pipeline,hsw )(size_t, void**, K*, size_t);
+ size_t ASM(start_pipeline,avx )(size_t, void**, K*, size_t);
size_t ASM(start_pipeline,sse41)(size_t, void**, K*, size_t);
size_t ASM(start_pipeline,sse2 )(size_t, void**, K*, size_t);
StageFn ASM(just_return,hsw),
+ ASM(just_return,avx),
ASM(just_return,sse41),
ASM(just_return,sse2);
#define M(st) StageFn ASM(st,hsw);
STAGES(M)
#undef M
+ #define M(st) StageFn ASM(st,avx);
+ STAGES(M)
+ #undef M
#define M(st) StageFn ASM(st,sse41);
STAGES(M)
#undef M
@@ -170,6 +175,18 @@ extern "C" {
#undef M
}
}
+ static StageFn* lookup_avx(SkRasterPipeline::StockStage st) {
+ switch (st) {
+ default:
+ #ifdef WHATS_NEXT
+ gMissing[st]++;
+ #endif
+ return nullptr;
+ #define M(st) case SkRasterPipeline::st: return ASM(st,avx);
+ STAGES(M)
+ #undef M
+ }
+ }
static StageFn* lookup_sse41(SkRasterPipeline::StockStage st) {
switch (st) {
default:
@@ -259,6 +276,11 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
return false;
}
}
+ if (0 && SkCpu::Supports(SkCpu::AVX)) {
+ if (!build_and_run(8, lookup_avx, ASM(just_return,avx), ASM(start_pipeline,avx))) {
+ return false;
+ }
+ }
if (1 && SkCpu::Supports(SkCpu::SSE41)) {
if (!build_and_run(4, lookup_sse41, ASM(just_return,sse41), ASM(start_pipeline,sse41))) {
return false;
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index 5d7ec003a2..25bfc1bcd8 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -1854,6 +1854,674 @@ _sk_linear_gradient_2stops_hsw:
.byte 0xc5,0x7c,0x29,0xc0 // vmovaps %ymm8,%ymm0
.byte 0xff,0xe0 // jmpq *%rax
+.globl _sk_start_pipeline_avx
+_sk_start_pipeline_avx:
+ .byte 0x41,0x57 // push %r15
+ .byte 0x41,0x56 // push %r14
+ .byte 0x41,0x55 // push %r13
+ .byte 0x41,0x54 // push %r12
+ .byte 0x53 // push %rbx
+ .byte 0x49,0x89,0xcf // mov %rcx,%r15
+ .byte 0x49,0x89,0xd6 // mov %rdx,%r14
+ .byte 0x48,0x89,0xfb // mov %rdi,%rbx
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x49,0x89,0xc4 // mov %rax,%r12
+ .byte 0x49,0x89,0xf5 // mov %rsi,%r13
+ .byte 0x48,0x8d,0x43,0x08 // lea 0x8(%rbx),%rax
+ .byte 0x4c,0x39,0xf8 // cmp %r15,%rax
+ .byte 0x76,0x05 // jbe 28 <_sk_start_pipeline_avx+0x28>
+ .byte 0x48,0x89,0xd8 // mov %rbx,%rax
+ .byte 0xeb,0x3c // jmp 64 <_sk_start_pipeline_avx+0x64>
+ .byte 0xc5,0xfc,0x57,0xc0 // vxorps %ymm0,%ymm0,%ymm0
+ .byte 0xc5,0xf4,0x57,0xc9 // vxorps %ymm1,%ymm1,%ymm1
+ .byte 0xc5,0xec,0x57,0xd2 // vxorps %ymm2,%ymm2,%ymm2
+ .byte 0xc5,0xe4,0x57,0xdb // vxorps %ymm3,%ymm3,%ymm3
+ .byte 0xc5,0xdc,0x57,0xe4 // vxorps %ymm4,%ymm4,%ymm4
+ .byte 0xc5,0xd4,0x57,0xed // vxorps %ymm5,%ymm5,%ymm5
+ .byte 0xc5,0xcc,0x57,0xf6 // vxorps %ymm6,%ymm6,%ymm6
+ .byte 0xc5,0xc4,0x57,0xff // vxorps %ymm7,%ymm7,%ymm7
+ .byte 0x48,0x89,0xdf // mov %rbx,%rdi
+ .byte 0x4c,0x89,0xee // mov %r13,%rsi
+ .byte 0x4c,0x89,0xf2 // mov %r14,%rdx
+ .byte 0x41,0xff,0xd4 // callq *%r12
+ .byte 0x48,0x8d,0x43,0x08 // lea 0x8(%rbx),%rax
+ .byte 0x48,0x83,0xc3,0x10 // add $0x10,%rbx
+ .byte 0x4c,0x39,0xfb // cmp %r15,%rbx
+ .byte 0x48,0x89,0xc3 // mov %rax,%rbx
+ .byte 0x76,0xc4 // jbe 28 <_sk_start_pipeline_avx+0x28>
+ .byte 0x5b // pop %rbx
+ .byte 0x41,0x5c // pop %r12
+ .byte 0x41,0x5d // pop %r13
+ .byte 0x41,0x5e // pop %r14
+ .byte 0x41,0x5f // pop %r15
+ .byte 0xc5,0xf8,0x77 // vzeroupper
+ .byte 0xc3 // retq
+
+.globl _sk_just_return_avx
+_sk_just_return_avx:
+ .byte 0xc3 // retq
+
+.globl _sk_seed_shader_avx
+_sk_seed_shader_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc5,0xf9,0x6e,0xc7 // vmovd %edi,%xmm0
+ .byte 0xc4,0xe3,0x79,0x04,0xc0,0x00 // vpermilps $0x0,%xmm0,%xmm0
+ .byte 0xc4,0xe3,0x7d,0x18,0xc0,0x01 // vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
+ .byte 0xc5,0xfc,0x5b,0xc0 // vcvtdq2ps %ymm0,%ymm0
+ .byte 0xc4,0xe2,0x7d,0x18,0x4a,0x04 // vbroadcastss 0x4(%rdx),%ymm1
+ .byte 0xc5,0xfc,0x58,0xc1 // vaddps %ymm1,%ymm0,%ymm0
+ .byte 0xc5,0xfc,0x58,0x42,0x14 // vaddps 0x14(%rdx),%ymm0,%ymm0
+ .byte 0xc5,0xf9,0x6e,0x10 // vmovd (%rax),%xmm2
+ .byte 0xc4,0xe3,0x79,0x04,0xd2,0x00 // vpermilps $0x0,%xmm2,%xmm2
+ .byte 0xc4,0xe3,0x6d,0x18,0xd2,0x01 // vinsertf128 $0x1,%xmm2,%ymm2,%ymm2
+ .byte 0xc5,0xfc,0x5b,0xd2 // vcvtdq2ps %ymm2,%ymm2
+ .byte 0xc5,0xec,0x58,0xc9 // vaddps %ymm1,%ymm2,%ymm1
+ .byte 0xc4,0xe2,0x7d,0x18,0x12 // vbroadcastss (%rdx),%ymm2
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc5,0xe4,0x57,0xdb // vxorps %ymm3,%ymm3,%ymm3
+ .byte 0xc5,0xdc,0x57,0xe4 // vxorps %ymm4,%ymm4,%ymm4
+ .byte 0xc5,0xd4,0x57,0xed // vxorps %ymm5,%ymm5,%ymm5
+ .byte 0xc5,0xcc,0x57,0xf6 // vxorps %ymm6,%ymm6,%ymm6
+ .byte 0xc5,0xc4,0x57,0xff // vxorps %ymm7,%ymm7,%ymm7
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_constant_color_avx
+_sk_constant_color_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc4,0xe2,0x7d,0x18,0x00 // vbroadcastss (%rax),%ymm0
+ .byte 0xc4,0xe2,0x7d,0x18,0x48,0x04 // vbroadcastss 0x4(%rax),%ymm1
+ .byte 0xc4,0xe2,0x7d,0x18,0x50,0x08 // vbroadcastss 0x8(%rax),%ymm2
+ .byte 0xc4,0xe2,0x7d,0x18,0x58,0x0c // vbroadcastss 0xc(%rax),%ymm3
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_clear_avx
+_sk_clear_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc5,0xfc,0x57,0xc0 // vxorps %ymm0,%ymm0,%ymm0
+ .byte 0xc5,0xf4,0x57,0xc9 // vxorps %ymm1,%ymm1,%ymm1
+ .byte 0xc5,0xec,0x57,0xd2 // vxorps %ymm2,%ymm2,%ymm2
+ .byte 0xc5,0xe4,0x57,0xdb // vxorps %ymm3,%ymm3,%ymm3
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_plus__avx
+_sk_plus__avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc5,0xfc,0x58,0xc4 // vaddps %ymm4,%ymm0,%ymm0
+ .byte 0xc5,0xf4,0x58,0xcd // vaddps %ymm5,%ymm1,%ymm1
+ .byte 0xc5,0xec,0x58,0xd6 // vaddps %ymm6,%ymm2,%ymm2
+ .byte 0xc5,0xe4,0x58,0xdf // vaddps %ymm7,%ymm3,%ymm3
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_srcover_avx
+_sk_srcover_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc4,0x62,0x7d,0x18,0x02 // vbroadcastss (%rdx),%ymm8
+ .byte 0xc5,0x3c,0x5c,0xc3 // vsubps %ymm3,%ymm8,%ymm8
+ .byte 0xc5,0x3c,0x59,0xcc // vmulps %ymm4,%ymm8,%ymm9
+ .byte 0xc5,0xb4,0x58,0xc0 // vaddps %ymm0,%ymm9,%ymm0
+ .byte 0xc5,0x3c,0x59,0xcd // vmulps %ymm5,%ymm8,%ymm9
+ .byte 0xc5,0xb4,0x58,0xc9 // vaddps %ymm1,%ymm9,%ymm1
+ .byte 0xc5,0x3c,0x59,0xce // vmulps %ymm6,%ymm8,%ymm9
+ .byte 0xc5,0xb4,0x58,0xd2 // vaddps %ymm2,%ymm9,%ymm2
+ .byte 0xc5,0x3c,0x59,0xc7 // vmulps %ymm7,%ymm8,%ymm8
+ .byte 0xc5,0xbc,0x58,0xdb // vaddps %ymm3,%ymm8,%ymm3
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_dstover_avx
+_sk_dstover_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc4,0x62,0x7d,0x18,0x02 // vbroadcastss (%rdx),%ymm8
+ .byte 0xc5,0x3c,0x5c,0xc7 // vsubps %ymm7,%ymm8,%ymm8
+ .byte 0xc5,0xbc,0x59,0xc0 // vmulps %ymm0,%ymm8,%ymm0
+ .byte 0xc5,0xfc,0x58,0xc4 // vaddps %ymm4,%ymm0,%ymm0
+ .byte 0xc5,0xbc,0x59,0xc9 // vmulps %ymm1,%ymm8,%ymm1
+ .byte 0xc5,0xf4,0x58,0xcd // vaddps %ymm5,%ymm1,%ymm1
+ .byte 0xc5,0xbc,0x59,0xd2 // vmulps %ymm2,%ymm8,%ymm2
+ .byte 0xc5,0xec,0x58,0xd6 // vaddps %ymm6,%ymm2,%ymm2
+ .byte 0xc5,0xbc,0x59,0xdb // vmulps %ymm3,%ymm8,%ymm3
+ .byte 0xc5,0xe4,0x58,0xdf // vaddps %ymm7,%ymm3,%ymm3
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_clamp_0_avx
+_sk_clamp_0_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc4,0x41,0x3c,0x57,0xc0 // vxorps %ymm8,%ymm8,%ymm8
+ .byte 0xc4,0xc1,0x7c,0x5f,0xc0 // vmaxps %ymm8,%ymm0,%ymm0
+ .byte 0xc4,0xc1,0x74,0x5f,0xc8 // vmaxps %ymm8,%ymm1,%ymm1
+ .byte 0xc4,0xc1,0x6c,0x5f,0xd0 // vmaxps %ymm8,%ymm2,%ymm2
+ .byte 0xc4,0xc1,0x64,0x5f,0xd8 // vmaxps %ymm8,%ymm3,%ymm3
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_clamp_1_avx
+_sk_clamp_1_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc4,0x62,0x7d,0x18,0x02 // vbroadcastss (%rdx),%ymm8
+ .byte 0xc4,0xc1,0x7c,0x5d,0xc0 // vminps %ymm8,%ymm0,%ymm0
+ .byte 0xc4,0xc1,0x74,0x5d,0xc8 // vminps %ymm8,%ymm1,%ymm1
+ .byte 0xc4,0xc1,0x6c,0x5d,0xd0 // vminps %ymm8,%ymm2,%ymm2
+ .byte 0xc4,0xc1,0x64,0x5d,0xd8 // vminps %ymm8,%ymm3,%ymm3
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_clamp_a_avx
+_sk_clamp_a_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc4,0x62,0x7d,0x18,0x02 // vbroadcastss (%rdx),%ymm8
+ .byte 0xc4,0xc1,0x64,0x5d,0xd8 // vminps %ymm8,%ymm3,%ymm3
+ .byte 0xc5,0xfc,0x5d,0xc3 // vminps %ymm3,%ymm0,%ymm0
+ .byte 0xc5,0xf4,0x5d,0xcb // vminps %ymm3,%ymm1,%ymm1
+ .byte 0xc5,0xec,0x5d,0xd3 // vminps %ymm3,%ymm2,%ymm2
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_set_rgb_avx
+_sk_set_rgb_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc4,0xe2,0x7d,0x18,0x00 // vbroadcastss (%rax),%ymm0
+ .byte 0xc4,0xe2,0x7d,0x18,0x48,0x04 // vbroadcastss 0x4(%rax),%ymm1
+ .byte 0xc4,0xe2,0x7d,0x18,0x50,0x08 // vbroadcastss 0x8(%rax),%ymm2
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_swap_rb_avx
+_sk_swap_rb_avx:
+ .byte 0xc5,0x7c,0x28,0xc0 // vmovaps %ymm0,%ymm8
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc5,0xfc,0x28,0xc2 // vmovaps %ymm2,%ymm0
+ .byte 0xc5,0x7c,0x29,0xc2 // vmovaps %ymm8,%ymm2
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_swap_avx
+_sk_swap_avx:
+ .byte 0xc5,0x7c,0x28,0xc3 // vmovaps %ymm3,%ymm8
+ .byte 0xc5,0x7c,0x28,0xca // vmovaps %ymm2,%ymm9
+ .byte 0xc5,0x7c,0x28,0xd1 // vmovaps %ymm1,%ymm10
+ .byte 0xc5,0x7c,0x28,0xd8 // vmovaps %ymm0,%ymm11
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc5,0xfc,0x28,0xc4 // vmovaps %ymm4,%ymm0
+ .byte 0xc5,0xfc,0x28,0xcd // vmovaps %ymm5,%ymm1
+ .byte 0xc5,0xfc,0x28,0xd6 // vmovaps %ymm6,%ymm2
+ .byte 0xc5,0xfc,0x28,0xdf // vmovaps %ymm7,%ymm3
+ .byte 0xc5,0x7c,0x29,0xdc // vmovaps %ymm11,%ymm4
+ .byte 0xc5,0x7c,0x29,0xd5 // vmovaps %ymm10,%ymm5
+ .byte 0xc5,0x7c,0x29,0xce // vmovaps %ymm9,%ymm6
+ .byte 0xc5,0x7c,0x29,0xc7 // vmovaps %ymm8,%ymm7
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_move_src_dst_avx
+_sk_move_src_dst_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc5,0xfc,0x28,0xe0 // vmovaps %ymm0,%ymm4
+ .byte 0xc5,0xfc,0x28,0xe9 // vmovaps %ymm1,%ymm5
+ .byte 0xc5,0xfc,0x28,0xf2 // vmovaps %ymm2,%ymm6
+ .byte 0xc5,0xfc,0x28,0xfb // vmovaps %ymm3,%ymm7
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_move_dst_src_avx
+_sk_move_dst_src_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc5,0xfc,0x28,0xc4 // vmovaps %ymm4,%ymm0
+ .byte 0xc5,0xfc,0x28,0xcd // vmovaps %ymm5,%ymm1
+ .byte 0xc5,0xfc,0x28,0xd6 // vmovaps %ymm6,%ymm2
+ .byte 0xc5,0xfc,0x28,0xdf // vmovaps %ymm7,%ymm3
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_premul_avx
+_sk_premul_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc5,0xfc,0x59,0xc3 // vmulps %ymm3,%ymm0,%ymm0
+ .byte 0xc5,0xf4,0x59,0xcb // vmulps %ymm3,%ymm1,%ymm1
+ .byte 0xc5,0xec,0x59,0xd3 // vmulps %ymm3,%ymm2,%ymm2
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_unpremul_avx
+_sk_unpremul_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc4,0x41,0x3c,0x57,0xc0 // vxorps %ymm8,%ymm8,%ymm8
+ .byte 0xc4,0x41,0x64,0xc2,0xc8,0x00 // vcmpeqps %ymm8,%ymm3,%ymm9
+ .byte 0xc4,0x62,0x7d,0x18,0x12 // vbroadcastss (%rdx),%ymm10
+ .byte 0xc5,0x2c,0x5e,0xd3 // vdivps %ymm3,%ymm10,%ymm10
+ .byte 0xc4,0x43,0x2d,0x4a,0xc0,0x90 // vblendvps %ymm9,%ymm8,%ymm10,%ymm8
+ .byte 0xc5,0xbc,0x59,0xc0 // vmulps %ymm0,%ymm8,%ymm0
+ .byte 0xc5,0xbc,0x59,0xc9 // vmulps %ymm1,%ymm8,%ymm1
+ .byte 0xc5,0xbc,0x59,0xd2 // vmulps %ymm2,%ymm8,%ymm2
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_from_srgb_avx
+_sk_from_srgb_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc4,0x62,0x7d,0x18,0x42,0x40 // vbroadcastss 0x40(%rdx),%ymm8
+ .byte 0xc5,0x3c,0x59,0xc8 // vmulps %ymm0,%ymm8,%ymm9
+ .byte 0xc5,0x7c,0x59,0xd0 // vmulps %ymm0,%ymm0,%ymm10
+ .byte 0xc4,0x62,0x7d,0x18,0x5a,0x3c // vbroadcastss 0x3c(%rdx),%ymm11
+ .byte 0xc4,0x62,0x7d,0x18,0x62,0x38 // vbroadcastss 0x38(%rdx),%ymm12
+ .byte 0xc5,0x24,0x59,0xe8 // vmulps %ymm0,%ymm11,%ymm13
+ .byte 0xc4,0x41,0x14,0x58,0xec // vaddps %ymm12,%ymm13,%ymm13
+ .byte 0xc4,0x62,0x7d,0x18,0x72,0x34 // vbroadcastss 0x34(%rdx),%ymm14
+ .byte 0xc4,0x41,0x2c,0x59,0xd5 // vmulps %ymm13,%ymm10,%ymm10
+ .byte 0xc4,0x41,0x0c,0x58,0xd2 // vaddps %ymm10,%ymm14,%ymm10
+ .byte 0xc4,0x62,0x7d,0x18,0x6a,0x44 // vbroadcastss 0x44(%rdx),%ymm13
+ .byte 0xc4,0xc1,0x7c,0xc2,0xc5,0x01 // vcmpltps %ymm13,%ymm0,%ymm0
+ .byte 0xc4,0xc3,0x2d,0x4a,0xc1,0x00 // vblendvps %ymm0,%ymm9,%ymm10,%ymm0
+ .byte 0xc5,0x3c,0x59,0xc9 // vmulps %ymm1,%ymm8,%ymm9
+ .byte 0xc5,0x74,0x59,0xd1 // vmulps %ymm1,%ymm1,%ymm10
+ .byte 0xc5,0x24,0x59,0xf9 // vmulps %ymm1,%ymm11,%ymm15
+ .byte 0xc4,0x41,0x04,0x58,0xfc // vaddps %ymm12,%ymm15,%ymm15
+ .byte 0xc4,0x41,0x2c,0x59,0xd7 // vmulps %ymm15,%ymm10,%ymm10
+ .byte 0xc4,0x41,0x0c,0x58,0xd2 // vaddps %ymm10,%ymm14,%ymm10
+ .byte 0xc4,0xc1,0x74,0xc2,0xcd,0x01 // vcmpltps %ymm13,%ymm1,%ymm1
+ .byte 0xc4,0xc3,0x2d,0x4a,0xc9,0x10 // vblendvps %ymm1,%ymm9,%ymm10,%ymm1
+ .byte 0xc5,0x3c,0x59,0xc2 // vmulps %ymm2,%ymm8,%ymm8
+ .byte 0xc5,0x6c,0x59,0xca // vmulps %ymm2,%ymm2,%ymm9
+ .byte 0xc5,0x24,0x59,0xd2 // vmulps %ymm2,%ymm11,%ymm10
+ .byte 0xc4,0x41,0x2c,0x58,0xd4 // vaddps %ymm12,%ymm10,%ymm10
+ .byte 0xc4,0x41,0x34,0x59,0xca // vmulps %ymm10,%ymm9,%ymm9
+ .byte 0xc4,0x41,0x0c,0x58,0xc9 // vaddps %ymm9,%ymm14,%ymm9
+ .byte 0xc4,0xc1,0x6c,0xc2,0xd5,0x01 // vcmpltps %ymm13,%ymm2,%ymm2
+ .byte 0xc4,0xc3,0x35,0x4a,0xd0,0x20 // vblendvps %ymm2,%ymm8,%ymm9,%ymm2
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_to_srgb_avx
+_sk_to_srgb_avx:
+ .byte 0xc5,0x7c,0x52,0xc0 // vrsqrtps %ymm0,%ymm8
+ .byte 0xc4,0x41,0x7c,0x53,0xc8 // vrcpps %ymm8,%ymm9
+ .byte 0xc4,0x41,0x7c,0x52,0xd0 // vrsqrtps %ymm8,%ymm10
+ .byte 0xc4,0x62,0x7d,0x18,0x42,0x48 // vbroadcastss 0x48(%rdx),%ymm8
+ .byte 0xc5,0x3c,0x59,0xd8 // vmulps %ymm0,%ymm8,%ymm11
+ .byte 0xc4,0x62,0x7d,0x18,0x22 // vbroadcastss (%rdx),%ymm12
+ .byte 0xc4,0x62,0x7d,0x18,0x6a,0x4c // vbroadcastss 0x4c(%rdx),%ymm13
+ .byte 0xc4,0x62,0x7d,0x18,0x72,0x50 // vbroadcastss 0x50(%rdx),%ymm14
+ .byte 0xc4,0x62,0x7d,0x18,0x7a,0x54 // vbroadcastss 0x54(%rdx),%ymm15
+ .byte 0xc4,0x41,0x34,0x59,0xce // vmulps %ymm14,%ymm9,%ymm9
+ .byte 0xc4,0x41,0x34,0x58,0xcf // vaddps %ymm15,%ymm9,%ymm9
+ .byte 0xc4,0x41,0x2c,0x59,0xd5 // vmulps %ymm13,%ymm10,%ymm10
+ .byte 0xc4,0x41,0x2c,0x58,0xc9 // vaddps %ymm9,%ymm10,%ymm9
+ .byte 0xc4,0x41,0x1c,0x5d,0xc9 // vminps %ymm9,%ymm12,%ymm9
+ .byte 0xc4,0x62,0x7d,0x18,0x52,0x58 // vbroadcastss 0x58(%rdx),%ymm10
+ .byte 0xc4,0xc1,0x7c,0xc2,0xc2,0x01 // vcmpltps %ymm10,%ymm0,%ymm0
+ .byte 0xc4,0xc3,0x35,0x4a,0xc3,0x00 // vblendvps %ymm0,%ymm11,%ymm9,%ymm0
+ .byte 0xc5,0x7c,0x52,0xc9 // vrsqrtps %ymm1,%ymm9
+ .byte 0xc4,0x41,0x7c,0x53,0xd9 // vrcpps %ymm9,%ymm11
+ .byte 0xc4,0x41,0x7c,0x52,0xc9 // vrsqrtps %ymm9,%ymm9
+ .byte 0xc4,0x41,0x0c,0x59,0xdb // vmulps %ymm11,%ymm14,%ymm11
+ .byte 0xc4,0x41,0x04,0x58,0xdb // vaddps %ymm11,%ymm15,%ymm11
+ .byte 0xc4,0x41,0x14,0x59,0xc9 // vmulps %ymm9,%ymm13,%ymm9
+ .byte 0xc4,0x41,0x34,0x58,0xcb // vaddps %ymm11,%ymm9,%ymm9
+ .byte 0xc5,0x3c,0x59,0xd9 // vmulps %ymm1,%ymm8,%ymm11
+ .byte 0xc4,0x41,0x1c,0x5d,0xc9 // vminps %ymm9,%ymm12,%ymm9
+ .byte 0xc4,0xc1,0x74,0xc2,0xca,0x01 // vcmpltps %ymm10,%ymm1,%ymm1
+ .byte 0xc4,0xc3,0x35,0x4a,0xcb,0x10 // vblendvps %ymm1,%ymm11,%ymm9,%ymm1
+ .byte 0xc5,0x7c,0x52,0xca // vrsqrtps %ymm2,%ymm9
+ .byte 0xc4,0x41,0x7c,0x53,0xd9 // vrcpps %ymm9,%ymm11
+ .byte 0xc4,0x41,0x0c,0x59,0xdb // vmulps %ymm11,%ymm14,%ymm11
+ .byte 0xc4,0x41,0x04,0x58,0xdb // vaddps %ymm11,%ymm15,%ymm11
+ .byte 0xc4,0x41,0x7c,0x52,0xc9 // vrsqrtps %ymm9,%ymm9
+ .byte 0xc4,0x41,0x14,0x59,0xc9 // vmulps %ymm9,%ymm13,%ymm9
+ .byte 0xc4,0x41,0x34,0x58,0xcb // vaddps %ymm11,%ymm9,%ymm9
+ .byte 0xc4,0x41,0x1c,0x5d,0xc9 // vminps %ymm9,%ymm12,%ymm9
+ .byte 0xc5,0x3c,0x59,0xc2 // vmulps %ymm2,%ymm8,%ymm8
+ .byte 0xc4,0xc1,0x6c,0xc2,0xd2,0x01 // vcmpltps %ymm10,%ymm2,%ymm2
+ .byte 0xc4,0xc3,0x35,0x4a,0xd0,0x20 // vblendvps %ymm2,%ymm8,%ymm9,%ymm2
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_scale_u8_avx
+_sk_scale_u8_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x48,0x8b,0x00 // mov (%rax),%rax
+ .byte 0xc4,0x62,0x79,0x31,0x44,0x38,0x04 // vpmovzxbd 0x4(%rax,%rdi,1),%xmm8
+ .byte 0xc4,0x62,0x79,0x31,0x0c,0x38 // vpmovzxbd (%rax,%rdi,1),%xmm9
+ .byte 0xc4,0x43,0x35,0x18,0xc0,0x01 // vinsertf128 $0x1,%xmm8,%ymm9,%ymm8
+ .byte 0xc4,0x41,0x7c,0x5b,0xc0 // vcvtdq2ps %ymm8,%ymm8
+ .byte 0xc4,0x62,0x7d,0x18,0x4a,0x0c // vbroadcastss 0xc(%rdx),%ymm9
+ .byte 0xc4,0x41,0x3c,0x59,0xc1 // vmulps %ymm9,%ymm8,%ymm8
+ .byte 0xc5,0xbc,0x59,0xc0 // vmulps %ymm0,%ymm8,%ymm0
+ .byte 0xc5,0xbc,0x59,0xc9 // vmulps %ymm1,%ymm8,%ymm1
+ .byte 0xc5,0xbc,0x59,0xd2 // vmulps %ymm2,%ymm8,%ymm2
+ .byte 0xc5,0xbc,0x59,0xdb // vmulps %ymm3,%ymm8,%ymm3
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_lerp_u8_avx
+_sk_lerp_u8_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x48,0x8b,0x00 // mov (%rax),%rax
+ .byte 0xc4,0x62,0x79,0x31,0x44,0x38,0x04 // vpmovzxbd 0x4(%rax,%rdi,1),%xmm8
+ .byte 0xc4,0x62,0x79,0x31,0x0c,0x38 // vpmovzxbd (%rax,%rdi,1),%xmm9
+ .byte 0xc4,0x43,0x35,0x18,0xc0,0x01 // vinsertf128 $0x1,%xmm8,%ymm9,%ymm8
+ .byte 0xc4,0x41,0x7c,0x5b,0xc0 // vcvtdq2ps %ymm8,%ymm8
+ .byte 0xc4,0x62,0x7d,0x18,0x4a,0x0c // vbroadcastss 0xc(%rdx),%ymm9
+ .byte 0xc4,0x41,0x3c,0x59,0xc1 // vmulps %ymm9,%ymm8,%ymm8
+ .byte 0xc5,0xfc,0x5c,0xc4 // vsubps %ymm4,%ymm0,%ymm0
+ .byte 0xc4,0xc1,0x7c,0x59,0xc0 // vmulps %ymm8,%ymm0,%ymm0
+ .byte 0xc5,0xfc,0x58,0xc4 // vaddps %ymm4,%ymm0,%ymm0
+ .byte 0xc5,0xf4,0x5c,0xcd // vsubps %ymm5,%ymm1,%ymm1
+ .byte 0xc4,0xc1,0x74,0x59,0xc8 // vmulps %ymm8,%ymm1,%ymm1
+ .byte 0xc5,0xf4,0x58,0xcd // vaddps %ymm5,%ymm1,%ymm1
+ .byte 0xc5,0xec,0x5c,0xd6 // vsubps %ymm6,%ymm2,%ymm2
+ .byte 0xc4,0xc1,0x6c,0x59,0xd0 // vmulps %ymm8,%ymm2,%ymm2
+ .byte 0xc5,0xec,0x58,0xd6 // vaddps %ymm6,%ymm2,%ymm2
+ .byte 0xc5,0xe4,0x5c,0xdf // vsubps %ymm7,%ymm3,%ymm3
+ .byte 0xc4,0xc1,0x64,0x59,0xd8 // vmulps %ymm8,%ymm3,%ymm3
+ .byte 0xc5,0xe4,0x58,0xdf // vaddps %ymm7,%ymm3,%ymm3
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_load_tables_avx
+_sk_load_tables_avx:
+ .byte 0x41,0x57 // push %r15
+ .byte 0x41,0x56 // push %r14
+ .byte 0x41,0x54 // push %r12
+ .byte 0x53 // push %rbx
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x4c,0x8b,0x00 // mov (%rax),%r8
+ .byte 0x48,0x8b,0x48,0x08 // mov 0x8(%rax),%rcx
+ .byte 0xc4,0x41,0x7c,0x10,0x14,0xb8 // vmovups (%r8,%rdi,4),%ymm10
+ .byte 0xc5,0xf9,0x6e,0x42,0x10 // vmovd 0x10(%rdx),%xmm0
+ .byte 0xc4,0xe3,0x79,0x04,0xc0,0x00 // vpermilps $0x0,%xmm0,%xmm0
+ .byte 0xc4,0x63,0x7d,0x18,0xc8,0x01 // vinsertf128 $0x1,%xmm0,%ymm0,%ymm9
+ .byte 0xc4,0xc1,0x34,0x54,0xc2 // vandps %ymm10,%ymm9,%ymm0
+ .byte 0xc4,0xc1,0xf9,0x7e,0xc0 // vmovq %xmm0,%r8
+ .byte 0x45,0x89,0xc1 // mov %r8d,%r9d
+ .byte 0xc4,0xc3,0xf9,0x16,0xc2,0x01 // vpextrq $0x1,%xmm0,%r10
+ .byte 0x45,0x89,0xd3 // mov %r10d,%r11d
+ .byte 0x49,0xc1,0xea,0x20 // shr $0x20,%r10
+ .byte 0x49,0xc1,0xe8,0x20 // shr $0x20,%r8
+ .byte 0xc4,0xe3,0x7d,0x19,0xc0,0x01 // vextractf128 $0x1,%ymm0,%xmm0
+ .byte 0xc4,0xc1,0xf9,0x7e,0xc7 // vmovq %xmm0,%r15
+ .byte 0x45,0x89,0xfe // mov %r15d,%r14d
+ .byte 0xc4,0xe3,0xf9,0x16,0xc3,0x01 // vpextrq $0x1,%xmm0,%rbx
+ .byte 0x41,0x89,0xdc // mov %ebx,%r12d
+ .byte 0x48,0xc1,0xeb,0x20 // shr $0x20,%rbx
+ .byte 0x49,0xc1,0xef,0x20 // shr $0x20,%r15
+ .byte 0xc4,0xa1,0x7a,0x10,0x04,0xb1 // vmovss (%rcx,%r14,4),%xmm0
+ .byte 0xc4,0xa3,0x79,0x21,0x04,0xb9,0x10 // vinsertps $0x10,(%rcx,%r15,4),%xmm0,%xmm0
+ .byte 0xc4,0xa3,0x79,0x21,0x04,0xa1,0x20 // vinsertps $0x20,(%rcx,%r12,4),%xmm0,%xmm0
+ .byte 0xc4,0xe3,0x79,0x21,0x04,0x99,0x30 // vinsertps $0x30,(%rcx,%rbx,4),%xmm0,%xmm0
+ .byte 0xc4,0xa1,0x7a,0x10,0x0c,0x89 // vmovss (%rcx,%r9,4),%xmm1
+ .byte 0xc4,0xa3,0x71,0x21,0x0c,0x81,0x10 // vinsertps $0x10,(%rcx,%r8,4),%xmm1,%xmm1
+ .byte 0xc4,0xa3,0x71,0x21,0x0c,0x99,0x20 // vinsertps $0x20,(%rcx,%r11,4),%xmm1,%xmm1
+ .byte 0xc4,0xa3,0x71,0x21,0x0c,0x91,0x30 // vinsertps $0x30,(%rcx,%r10,4),%xmm1,%xmm1
+ .byte 0xc4,0xe3,0x75,0x18,0xc0,0x01 // vinsertf128 $0x1,%xmm0,%ymm1,%ymm0
+ .byte 0x4c,0x8b,0x78,0x10 // mov 0x10(%rax),%r15
+ .byte 0xc4,0xc1,0x71,0x72,0xd2,0x08 // vpsrld $0x8,%xmm10,%xmm1
+ .byte 0xc4,0x43,0x7d,0x19,0xd0,0x01 // vextractf128 $0x1,%ymm10,%xmm8
+ .byte 0xc4,0xc1,0x69,0x72,0xd0,0x08 // vpsrld $0x8,%xmm8,%xmm2
+ .byte 0xc4,0xe3,0x75,0x18,0xca,0x01 // vinsertf128 $0x1,%xmm2,%ymm1,%ymm1
+ .byte 0xc5,0xb4,0x54,0xc9 // vandps %ymm1,%ymm9,%ymm1
+ .byte 0xc4,0xc1,0xf9,0x7e,0xc8 // vmovq %xmm1,%r8
+ .byte 0x45,0x89,0xc2 // mov %r8d,%r10d
+ .byte 0xc4,0xc3,0xf9,0x16,0xc9,0x01 // vpextrq $0x1,%xmm1,%r9
+ .byte 0x45,0x89,0xcb // mov %r9d,%r11d
+ .byte 0x49,0xc1,0xe9,0x20 // shr $0x20,%r9
+ .byte 0x49,0xc1,0xe8,0x20 // shr $0x20,%r8
+ .byte 0xc4,0xe3,0x7d,0x19,0xc9,0x01 // vextractf128 $0x1,%ymm1,%xmm1
+ .byte 0xc4,0xe1,0xf9,0x7e,0xcb // vmovq %xmm1,%rbx
+ .byte 0x41,0x89,0xde // mov %ebx,%r14d
+ .byte 0xc4,0xe3,0xf9,0x16,0xc9,0x01 // vpextrq $0x1,%xmm1,%rcx
+ .byte 0x41,0x89,0xcc // mov %ecx,%r12d
+ .byte 0x48,0xc1,0xe9,0x20 // shr $0x20,%rcx
+ .byte 0x48,0xc1,0xeb,0x20 // shr $0x20,%rbx
+ .byte 0xc4,0x81,0x7a,0x10,0x0c,0xb7 // vmovss (%r15,%r14,4),%xmm1
+ .byte 0xc4,0xc3,0x71,0x21,0x0c,0x9f,0x10 // vinsertps $0x10,(%r15,%rbx,4),%xmm1,%xmm1
+ .byte 0xc4,0x81,0x7a,0x10,0x14,0xa7 // vmovss (%r15,%r12,4),%xmm2
+ .byte 0xc4,0xe3,0x71,0x21,0xca,0x20 // vinsertps $0x20,%xmm2,%xmm1,%xmm1
+ .byte 0xc4,0xc1,0x7a,0x10,0x14,0x8f // vmovss (%r15,%rcx,4),%xmm2
+ .byte 0xc4,0xe3,0x71,0x21,0xca,0x30 // vinsertps $0x30,%xmm2,%xmm1,%xmm1
+ .byte 0xc4,0x81,0x7a,0x10,0x14,0x97 // vmovss (%r15,%r10,4),%xmm2
+ .byte 0xc4,0x83,0x69,0x21,0x14,0x87,0x10 // vinsertps $0x10,(%r15,%r8,4),%xmm2,%xmm2
+ .byte 0xc4,0x81,0x7a,0x10,0x1c,0x9f // vmovss (%r15,%r11,4),%xmm3
+ .byte 0xc4,0xe3,0x69,0x21,0xd3,0x20 // vinsertps $0x20,%xmm3,%xmm2,%xmm2
+ .byte 0xc4,0x81,0x7a,0x10,0x1c,0x8f // vmovss (%r15,%r9,4),%xmm3
+ .byte 0xc4,0xe3,0x69,0x21,0xd3,0x30 // vinsertps $0x30,%xmm3,%xmm2,%xmm2
+ .byte 0xc4,0xe3,0x6d,0x18,0xc9,0x01 // vinsertf128 $0x1,%xmm1,%ymm2,%ymm1
+ .byte 0x48,0x8b,0x40,0x18 // mov 0x18(%rax),%rax
+ .byte 0xc4,0xc1,0x69,0x72,0xd2,0x10 // vpsrld $0x10,%xmm10,%xmm2
+ .byte 0xc4,0xc1,0x61,0x72,0xd0,0x10 // vpsrld $0x10,%xmm8,%xmm3
+ .byte 0xc4,0xe3,0x6d,0x18,0xd3,0x01 // vinsertf128 $0x1,%xmm3,%ymm2,%ymm2
+ .byte 0xc5,0xb4,0x54,0xd2 // vandps %ymm2,%ymm9,%ymm2
+ .byte 0xc4,0xc1,0xf9,0x7e,0xd0 // vmovq %xmm2,%r8
+ .byte 0x45,0x89,0xc1 // mov %r8d,%r9d
+ .byte 0xc4,0xc3,0xf9,0x16,0xd6,0x01 // vpextrq $0x1,%xmm2,%r14
+ .byte 0x45,0x89,0xf2 // mov %r14d,%r10d
+ .byte 0x49,0xc1,0xee,0x20 // shr $0x20,%r14
+ .byte 0x49,0xc1,0xe8,0x20 // shr $0x20,%r8
+ .byte 0xc4,0xe3,0x7d,0x19,0xd2,0x01 // vextractf128 $0x1,%ymm2,%xmm2
+ .byte 0xc4,0xe1,0xf9,0x7e,0xd3 // vmovq %xmm2,%rbx
+ .byte 0x41,0x89,0xdb // mov %ebx,%r11d
+ .byte 0xc4,0xe3,0xf9,0x16,0xd1,0x01 // vpextrq $0x1,%xmm2,%rcx
+ .byte 0x41,0x89,0xcf // mov %ecx,%r15d
+ .byte 0x48,0xc1,0xe9,0x20 // shr $0x20,%rcx
+ .byte 0x48,0xc1,0xeb,0x20 // shr $0x20,%rbx
+ .byte 0xc4,0xa1,0x7a,0x10,0x14,0x98 // vmovss (%rax,%r11,4),%xmm2
+ .byte 0xc4,0xe3,0x69,0x21,0x14,0x98,0x10 // vinsertps $0x10,(%rax,%rbx,4),%xmm2,%xmm2
+ .byte 0xc4,0xa1,0x7a,0x10,0x1c,0xb8 // vmovss (%rax,%r15,4),%xmm3
+ .byte 0xc4,0xe3,0x69,0x21,0xd3,0x20 // vinsertps $0x20,%xmm3,%xmm2,%xmm2
+ .byte 0xc5,0xfa,0x10,0x1c,0x88 // vmovss (%rax,%rcx,4),%xmm3
+ .byte 0xc4,0x63,0x69,0x21,0xcb,0x30 // vinsertps $0x30,%xmm3,%xmm2,%xmm9
+ .byte 0xc4,0xa1,0x7a,0x10,0x1c,0x88 // vmovss (%rax,%r9,4),%xmm3
+ .byte 0xc4,0xa3,0x61,0x21,0x1c,0x80,0x10 // vinsertps $0x10,(%rax,%r8,4),%xmm3,%xmm3
+ .byte 0xc4,0xa1,0x7a,0x10,0x14,0x90 // vmovss (%rax,%r10,4),%xmm2
+ .byte 0xc4,0xe3,0x61,0x21,0xd2,0x20 // vinsertps $0x20,%xmm2,%xmm3,%xmm2
+ .byte 0xc4,0xa1,0x7a,0x10,0x1c,0xb0 // vmovss (%rax,%r14,4),%xmm3
+ .byte 0xc4,0xe3,0x69,0x21,0xd3,0x30 // vinsertps $0x30,%xmm3,%xmm2,%xmm2
+ .byte 0xc4,0xc3,0x6d,0x18,0xd1,0x01 // vinsertf128 $0x1,%xmm9,%ymm2,%ymm2
+ .byte 0xc4,0xc1,0x31,0x72,0xd2,0x18 // vpsrld $0x18,%xmm10,%xmm9
+ .byte 0xc4,0xc1,0x61,0x72,0xd0,0x18 // vpsrld $0x18,%xmm8,%xmm3
+ .byte 0xc4,0xe3,0x35,0x18,0xdb,0x01 // vinsertf128 $0x1,%xmm3,%ymm9,%ymm3
+ .byte 0xc5,0xfc,0x5b,0xdb // vcvtdq2ps %ymm3,%ymm3
+ .byte 0xc4,0x62,0x7d,0x18,0x42,0x0c // vbroadcastss 0xc(%rdx),%ymm8
+ .byte 0xc4,0xc1,0x64,0x59,0xd8 // vmulps %ymm8,%ymm3,%ymm3
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x5b // pop %rbx
+ .byte 0x41,0x5c // pop %r12
+ .byte 0x41,0x5e // pop %r14
+ .byte 0x41,0x5f // pop %r15
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_load_8888_avx
+_sk_load_8888_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x48,0x8b,0x00 // mov (%rax),%rax
+ .byte 0xc5,0xfc,0x10,0x1c,0xb8 // vmovups (%rax,%rdi,4),%ymm3
+ .byte 0xc5,0xf9,0x6e,0x42,0x10 // vmovd 0x10(%rdx),%xmm0
+ .byte 0xc4,0xe3,0x79,0x04,0xc0,0x00 // vpermilps $0x0,%xmm0,%xmm0
+ .byte 0xc4,0x63,0x7d,0x18,0xd8,0x01 // vinsertf128 $0x1,%xmm0,%ymm0,%ymm11
+ .byte 0xc5,0xa4,0x54,0xc3 // vandps %ymm3,%ymm11,%ymm0
+ .byte 0xc5,0xfc,0x5b,0xc0 // vcvtdq2ps %ymm0,%ymm0
+ .byte 0xc4,0x62,0x7d,0x18,0x42,0x0c // vbroadcastss 0xc(%rdx),%ymm8
+ .byte 0xc5,0xbc,0x59,0xc0 // vmulps %ymm0,%ymm8,%ymm0
+ .byte 0xc5,0xa9,0x72,0xd3,0x08 // vpsrld $0x8,%xmm3,%xmm10
+ .byte 0xc4,0xc3,0x7d,0x19,0xd9,0x01 // vextractf128 $0x1,%ymm3,%xmm9
+ .byte 0xc4,0xc1,0x71,0x72,0xd1,0x08 // vpsrld $0x8,%xmm9,%xmm1
+ .byte 0xc4,0xe3,0x2d,0x18,0xc9,0x01 // vinsertf128 $0x1,%xmm1,%ymm10,%ymm1
+ .byte 0xc5,0xa4,0x54,0xc9 // vandps %ymm1,%ymm11,%ymm1
+ .byte 0xc5,0xfc,0x5b,0xc9 // vcvtdq2ps %ymm1,%ymm1
+ .byte 0xc5,0xbc,0x59,0xc9 // vmulps %ymm1,%ymm8,%ymm1
+ .byte 0xc5,0xa9,0x72,0xd3,0x10 // vpsrld $0x10,%xmm3,%xmm10
+ .byte 0xc4,0xc1,0x69,0x72,0xd1,0x10 // vpsrld $0x10,%xmm9,%xmm2
+ .byte 0xc4,0xe3,0x2d,0x18,0xd2,0x01 // vinsertf128 $0x1,%xmm2,%ymm10,%ymm2
+ .byte 0xc5,0xa4,0x54,0xd2 // vandps %ymm2,%ymm11,%ymm2
+ .byte 0xc5,0xfc,0x5b,0xd2 // vcvtdq2ps %ymm2,%ymm2
+ .byte 0xc5,0xbc,0x59,0xd2 // vmulps %ymm2,%ymm8,%ymm2
+ .byte 0xc5,0xa9,0x72,0xd3,0x18 // vpsrld $0x18,%xmm3,%xmm10
+ .byte 0xc4,0xc1,0x61,0x72,0xd1,0x18 // vpsrld $0x18,%xmm9,%xmm3
+ .byte 0xc4,0xe3,0x2d,0x18,0xdb,0x01 // vinsertf128 $0x1,%xmm3,%ymm10,%ymm3
+ .byte 0xc5,0xfc,0x5b,0xdb // vcvtdq2ps %ymm3,%ymm3
+ .byte 0xc4,0xc1,0x64,0x59,0xd8 // vmulps %ymm8,%ymm3,%ymm3
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_store_8888_avx
+_sk_store_8888_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x48,0x8b,0x00 // mov (%rax),%rax
+ .byte 0xc4,0x62,0x7d,0x18,0x42,0x08 // vbroadcastss 0x8(%rdx),%ymm8
+ .byte 0xc5,0x3c,0x59,0xc8 // vmulps %ymm0,%ymm8,%ymm9
+ .byte 0xc4,0x41,0x7d,0x5b,0xc9 // vcvtps2dq %ymm9,%ymm9
+ .byte 0xc5,0x3c,0x59,0xd1 // vmulps %ymm1,%ymm8,%ymm10
+ .byte 0xc4,0x41,0x7d,0x5b,0xd2 // vcvtps2dq %ymm10,%ymm10
+ .byte 0xc4,0xc1,0x21,0x72,0xf2,0x08 // vpslld $0x8,%xmm10,%xmm11
+ .byte 0xc4,0x43,0x7d,0x19,0xd2,0x01 // vextractf128 $0x1,%ymm10,%xmm10
+ .byte 0xc4,0xc1,0x29,0x72,0xf2,0x08 // vpslld $0x8,%xmm10,%xmm10
+ .byte 0xc4,0x43,0x25,0x18,0xd2,0x01 // vinsertf128 $0x1,%xmm10,%ymm11,%ymm10
+ .byte 0xc4,0x41,0x2d,0x56,0xc9 // vorpd %ymm9,%ymm10,%ymm9
+ .byte 0xc5,0x3c,0x59,0xd2 // vmulps %ymm2,%ymm8,%ymm10
+ .byte 0xc4,0x41,0x7d,0x5b,0xd2 // vcvtps2dq %ymm10,%ymm10
+ .byte 0xc4,0xc1,0x21,0x72,0xf2,0x10 // vpslld $0x10,%xmm10,%xmm11
+ .byte 0xc4,0x43,0x7d,0x19,0xd2,0x01 // vextractf128 $0x1,%ymm10,%xmm10
+ .byte 0xc4,0xc1,0x29,0x72,0xf2,0x10 // vpslld $0x10,%xmm10,%xmm10
+ .byte 0xc4,0x43,0x25,0x18,0xd2,0x01 // vinsertf128 $0x1,%xmm10,%ymm11,%ymm10
+ .byte 0xc4,0x41,0x35,0x56,0xca // vorpd %ymm10,%ymm9,%ymm9
+ .byte 0xc5,0x3c,0x59,0xc3 // vmulps %ymm3,%ymm8,%ymm8
+ .byte 0xc4,0x41,0x7d,0x5b,0xc0 // vcvtps2dq %ymm8,%ymm8
+ .byte 0xc4,0xc1,0x29,0x72,0xf0,0x18 // vpslld $0x18,%xmm8,%xmm10
+ .byte 0xc4,0x43,0x7d,0x19,0xc0,0x01 // vextractf128 $0x1,%ymm8,%xmm8
+ .byte 0xc4,0xc1,0x39,0x72,0xf0,0x18 // vpslld $0x18,%xmm8,%xmm8
+ .byte 0xc4,0x43,0x2d,0x18,0xc0,0x01 // vinsertf128 $0x1,%xmm8,%ymm10,%ymm8
+ .byte 0xc4,0x41,0x35,0x56,0xc0 // vorpd %ymm8,%ymm9,%ymm8
+ .byte 0xc5,0x7d,0x11,0x04,0xb8 // vmovupd %ymm8,(%rax,%rdi,4)
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_load_f16_avx
+_sk_load_f16_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_store_f16_avx
+_sk_store_f16_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_clamp_x_avx
+_sk_clamp_x_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc4,0x62,0x7d,0x18,0x00 // vbroadcastss (%rax),%ymm8
+ .byte 0xc4,0x43,0x7d,0x19,0xc1,0x01 // vextractf128 $0x1,%ymm8,%xmm9
+ .byte 0xc4,0x41,0x29,0x76,0xd2 // vpcmpeqd %xmm10,%xmm10,%xmm10
+ .byte 0xc4,0x41,0x31,0xfe,0xca // vpaddd %xmm10,%xmm9,%xmm9
+ .byte 0xc4,0x41,0x39,0xfe,0xc2 // vpaddd %xmm10,%xmm8,%xmm8
+ .byte 0xc4,0x43,0x3d,0x18,0xc1,0x01 // vinsertf128 $0x1,%xmm9,%ymm8,%ymm8
+ .byte 0xc4,0xc1,0x7c,0x5d,0xc0 // vminps %ymm8,%ymm0,%ymm0
+ .byte 0xc4,0x41,0x3c,0x57,0xc0 // vxorps %ymm8,%ymm8,%ymm8
+ .byte 0xc5,0xbc,0x5f,0xc0 // vmaxps %ymm0,%ymm8,%ymm0
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_clamp_y_avx
+_sk_clamp_y_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc4,0x62,0x7d,0x18,0x00 // vbroadcastss (%rax),%ymm8
+ .byte 0xc4,0x43,0x7d,0x19,0xc1,0x01 // vextractf128 $0x1,%ymm8,%xmm9
+ .byte 0xc4,0x41,0x29,0x76,0xd2 // vpcmpeqd %xmm10,%xmm10,%xmm10
+ .byte 0xc4,0x41,0x31,0xfe,0xca // vpaddd %xmm10,%xmm9,%xmm9
+ .byte 0xc4,0x41,0x39,0xfe,0xc2 // vpaddd %xmm10,%xmm8,%xmm8
+ .byte 0xc4,0x43,0x3d,0x18,0xc1,0x01 // vinsertf128 $0x1,%xmm9,%ymm8,%ymm8
+ .byte 0xc4,0xc1,0x74,0x5d,0xc8 // vminps %ymm8,%ymm1,%ymm1
+ .byte 0xc4,0x41,0x3c,0x57,0xc0 // vxorps %ymm8,%ymm8,%ymm8
+ .byte 0xc5,0xbc,0x5f,0xc9 // vmaxps %ymm1,%ymm8,%ymm1
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_matrix_2x3_avx
+_sk_matrix_2x3_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc4,0x62,0x7d,0x18,0x00 // vbroadcastss (%rax),%ymm8
+ .byte 0xc4,0x62,0x7d,0x18,0x48,0x08 // vbroadcastss 0x8(%rax),%ymm9
+ .byte 0xc4,0x62,0x7d,0x18,0x50,0x10 // vbroadcastss 0x10(%rax),%ymm10
+ .byte 0xc5,0x34,0x59,0xc9 // vmulps %ymm1,%ymm9,%ymm9
+ .byte 0xc4,0x41,0x34,0x58,0xca // vaddps %ymm10,%ymm9,%ymm9
+ .byte 0xc5,0x3c,0x59,0xc0 // vmulps %ymm0,%ymm8,%ymm8
+ .byte 0xc4,0x41,0x3c,0x58,0xc1 // vaddps %ymm9,%ymm8,%ymm8
+ .byte 0xc4,0x62,0x7d,0x18,0x48,0x04 // vbroadcastss 0x4(%rax),%ymm9
+ .byte 0xc4,0x62,0x7d,0x18,0x50,0x0c // vbroadcastss 0xc(%rax),%ymm10
+ .byte 0xc4,0x62,0x7d,0x18,0x58,0x14 // vbroadcastss 0x14(%rax),%ymm11
+ .byte 0xc5,0xac,0x59,0xc9 // vmulps %ymm1,%ymm10,%ymm1
+ .byte 0xc4,0xc1,0x74,0x58,0xcb // vaddps %ymm11,%ymm1,%ymm1
+ .byte 0xc5,0xb4,0x59,0xc0 // vmulps %ymm0,%ymm9,%ymm0
+ .byte 0xc5,0xfc,0x58,0xc9 // vaddps %ymm1,%ymm0,%ymm1
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc5,0x7c,0x29,0xc0 // vmovaps %ymm8,%ymm0
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_matrix_3x4_avx
+_sk_matrix_3x4_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc4,0x62,0x7d,0x18,0x00 // vbroadcastss (%rax),%ymm8
+ .byte 0xc4,0x62,0x7d,0x18,0x48,0x0c // vbroadcastss 0xc(%rax),%ymm9
+ .byte 0xc4,0x62,0x7d,0x18,0x50,0x18 // vbroadcastss 0x18(%rax),%ymm10
+ .byte 0xc4,0x62,0x7d,0x18,0x58,0x24 // vbroadcastss 0x24(%rax),%ymm11
+ .byte 0xc5,0x2c,0x59,0xd2 // vmulps %ymm2,%ymm10,%ymm10
+ .byte 0xc4,0x41,0x2c,0x58,0xd3 // vaddps %ymm11,%ymm10,%ymm10
+ .byte 0xc5,0x34,0x59,0xc9 // vmulps %ymm1,%ymm9,%ymm9
+ .byte 0xc4,0x41,0x34,0x58,0xca // vaddps %ymm10,%ymm9,%ymm9
+ .byte 0xc5,0x3c,0x59,0xc0 // vmulps %ymm0,%ymm8,%ymm8
+ .byte 0xc4,0x41,0x3c,0x58,0xc1 // vaddps %ymm9,%ymm8,%ymm8
+ .byte 0xc4,0x62,0x7d,0x18,0x48,0x04 // vbroadcastss 0x4(%rax),%ymm9
+ .byte 0xc4,0x62,0x7d,0x18,0x50,0x10 // vbroadcastss 0x10(%rax),%ymm10
+ .byte 0xc4,0x62,0x7d,0x18,0x58,0x1c // vbroadcastss 0x1c(%rax),%ymm11
+ .byte 0xc4,0x62,0x7d,0x18,0x60,0x28 // vbroadcastss 0x28(%rax),%ymm12
+ .byte 0xc5,0x24,0x59,0xda // vmulps %ymm2,%ymm11,%ymm11
+ .byte 0xc4,0x41,0x24,0x58,0xdc // vaddps %ymm12,%ymm11,%ymm11
+ .byte 0xc5,0x2c,0x59,0xd1 // vmulps %ymm1,%ymm10,%ymm10
+ .byte 0xc4,0x41,0x2c,0x58,0xd3 // vaddps %ymm11,%ymm10,%ymm10
+ .byte 0xc5,0x34,0x59,0xc8 // vmulps %ymm0,%ymm9,%ymm9
+ .byte 0xc4,0x41,0x34,0x58,0xca // vaddps %ymm10,%ymm9,%ymm9
+ .byte 0xc4,0x62,0x7d,0x18,0x50,0x08 // vbroadcastss 0x8(%rax),%ymm10
+ .byte 0xc4,0x62,0x7d,0x18,0x58,0x14 // vbroadcastss 0x14(%rax),%ymm11
+ .byte 0xc4,0x62,0x7d,0x18,0x60,0x20 // vbroadcastss 0x20(%rax),%ymm12
+ .byte 0xc4,0x62,0x7d,0x18,0x68,0x2c // vbroadcastss 0x2c(%rax),%ymm13
+ .byte 0xc5,0x9c,0x59,0xd2 // vmulps %ymm2,%ymm12,%ymm2
+ .byte 0xc4,0xc1,0x6c,0x58,0xd5 // vaddps %ymm13,%ymm2,%ymm2
+ .byte 0xc5,0xa4,0x59,0xc9 // vmulps %ymm1,%ymm11,%ymm1
+ .byte 0xc5,0xf4,0x58,0xca // vaddps %ymm2,%ymm1,%ymm1
+ .byte 0xc5,0xac,0x59,0xc0 // vmulps %ymm0,%ymm10,%ymm0
+ .byte 0xc5,0xfc,0x58,0xd1 // vaddps %ymm1,%ymm0,%ymm2
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc5,0x7c,0x29,0xc0 // vmovaps %ymm8,%ymm0
+ .byte 0xc5,0x7c,0x29,0xc9 // vmovaps %ymm9,%ymm1
+ .byte 0xff,0xe0 // jmpq *%rax
+
+.globl _sk_linear_gradient_2stops_avx
+_sk_linear_gradient_2stops_avx:
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc4,0xe2,0x7d,0x18,0x48,0x10 // vbroadcastss 0x10(%rax),%ymm1
+ .byte 0xc4,0xe2,0x7d,0x18,0x10 // vbroadcastss (%rax),%ymm2
+ .byte 0xc5,0xf4,0x59,0xc8 // vmulps %ymm0,%ymm1,%ymm1
+ .byte 0xc5,0x6c,0x58,0xc1 // vaddps %ymm1,%ymm2,%ymm8
+ .byte 0xc4,0xe2,0x7d,0x18,0x48,0x14 // vbroadcastss 0x14(%rax),%ymm1
+ .byte 0xc4,0xe2,0x7d,0x18,0x50,0x04 // vbroadcastss 0x4(%rax),%ymm2
+ .byte 0xc5,0xf4,0x59,0xc8 // vmulps %ymm0,%ymm1,%ymm1
+ .byte 0xc5,0xec,0x58,0xc9 // vaddps %ymm1,%ymm2,%ymm1
+ .byte 0xc4,0xe2,0x7d,0x18,0x50,0x18 // vbroadcastss 0x18(%rax),%ymm2
+ .byte 0xc4,0xe2,0x7d,0x18,0x58,0x08 // vbroadcastss 0x8(%rax),%ymm3
+ .byte 0xc5,0xec,0x59,0xd0 // vmulps %ymm0,%ymm2,%ymm2
+ .byte 0xc5,0xe4,0x58,0xd2 // vaddps %ymm2,%ymm3,%ymm2
+ .byte 0xc4,0xe2,0x7d,0x18,0x58,0x1c // vbroadcastss 0x1c(%rax),%ymm3
+ .byte 0xc4,0x62,0x7d,0x18,0x48,0x0c // vbroadcastss 0xc(%rax),%ymm9
+ .byte 0xc5,0xe4,0x59,0xc0 // vmulps %ymm0,%ymm3,%ymm0
+ .byte 0xc5,0xb4,0x58,0xd8 // vaddps %ymm0,%ymm9,%ymm3
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0xc5,0x7c,0x29,0xc0 // vmovaps %ymm8,%ymm0
+ .byte 0xff,0xe0 // jmpq *%rax
+
.globl _sk_start_pipeline_sse41
_sk_start_pipeline_sse41:
.byte 0x41,0x57 // push %r15
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index d2078b6b4f..1409d03c6f 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -589,6 +589,701 @@ _sk_linear_gradient_2stops_hsw LABEL PROC
DB 197,124,41,192 ; vmovaps %ymm8,%ymm0
DB 255,224 ; jmpq *%rax
+PUBLIC _sk_start_pipeline_avx
+_sk_start_pipeline_avx LABEL PROC
+ DB 65,87 ; push %r15
+ DB 65,86 ; push %r14
+ DB 65,85 ; push %r13
+ DB 65,84 ; push %r12
+ DB 86 ; push %rsi
+ DB 87 ; push %rdi
+ DB 83 ; push %rbx
+ DB 72,129,236,160,0,0,0 ; sub $0xa0,%rsp
+ DB 197,120,41,188,36,144,0,0,0 ; vmovaps %xmm15,0x90(%rsp)
+ DB 197,120,41,180,36,128,0,0,0 ; vmovaps %xmm14,0x80(%rsp)
+ DB 197,120,41,108,36,112 ; vmovaps %xmm13,0x70(%rsp)
+ DB 197,120,41,100,36,96 ; vmovaps %xmm12,0x60(%rsp)
+ DB 197,120,41,92,36,80 ; vmovaps %xmm11,0x50(%rsp)
+ DB 197,120,41,84,36,64 ; vmovaps %xmm10,0x40(%rsp)
+ DB 197,120,41,76,36,48 ; vmovaps %xmm9,0x30(%rsp)
+ DB 197,120,41,68,36,32 ; vmovaps %xmm8,0x20(%rsp)
+ DB 197,248,41,124,36,16 ; vmovaps %xmm7,0x10(%rsp)
+ DB 197,248,41,52,36 ; vmovaps %xmm6,(%rsp)
+ DB 77,137,207 ; mov %r9,%r15
+ DB 77,137,198 ; mov %r8,%r14
+ DB 72,137,203 ; mov %rcx,%rbx
+ DB 72,137,214 ; mov %rdx,%rsi
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 73,137,196 ; mov %rax,%r12
+ DB 73,137,245 ; mov %rsi,%r13
+ DB 72,141,67,8 ; lea 0x8(%rbx),%rax
+ DB 76,57,248 ; cmp %r15,%rax
+ DB 118,5 ; jbe 75 <_sk_start_pipeline_avx+0x75>
+ DB 72,137,216 ; mov %rbx,%rax
+ DB 235,60 ; jmp b1 <_sk_start_pipeline_avx+0xb1>
+ DB 197,252,87,192 ; vxorps %ymm0,%ymm0,%ymm0
+ DB 197,244,87,201 ; vxorps %ymm1,%ymm1,%ymm1
+ DB 197,236,87,210 ; vxorps %ymm2,%ymm2,%ymm2
+ DB 197,228,87,219 ; vxorps %ymm3,%ymm3,%ymm3
+ DB 197,220,87,228 ; vxorps %ymm4,%ymm4,%ymm4
+ DB 197,212,87,237 ; vxorps %ymm5,%ymm5,%ymm5
+ DB 197,204,87,246 ; vxorps %ymm6,%ymm6,%ymm6
+ DB 197,196,87,255 ; vxorps %ymm7,%ymm7,%ymm7
+ DB 72,137,223 ; mov %rbx,%rdi
+ DB 76,137,238 ; mov %r13,%rsi
+ DB 76,137,242 ; mov %r14,%rdx
+ DB 65,255,212 ; callq *%r12
+ DB 72,141,67,8 ; lea 0x8(%rbx),%rax
+ DB 72,131,195,16 ; add $0x10,%rbx
+ DB 76,57,251 ; cmp %r15,%rbx
+ DB 72,137,195 ; mov %rax,%rbx
+ DB 118,196 ; jbe 75 <_sk_start_pipeline_avx+0x75>
+ DB 197,248,40,52,36 ; vmovaps (%rsp),%xmm6
+ DB 197,248,40,124,36,16 ; vmovaps 0x10(%rsp),%xmm7
+ DB 197,120,40,68,36,32 ; vmovaps 0x20(%rsp),%xmm8
+ DB 197,120,40,76,36,48 ; vmovaps 0x30(%rsp),%xmm9
+ DB 197,120,40,84,36,64 ; vmovaps 0x40(%rsp),%xmm10
+ DB 197,120,40,92,36,80 ; vmovaps 0x50(%rsp),%xmm11
+ DB 197,120,40,100,36,96 ; vmovaps 0x60(%rsp),%xmm12
+ DB 197,120,40,108,36,112 ; vmovaps 0x70(%rsp),%xmm13
+ DB 197,120,40,180,36,128,0,0,0 ; vmovaps 0x80(%rsp),%xmm14
+ DB 197,120,40,188,36,144,0,0,0 ; vmovaps 0x90(%rsp),%xmm15
+ DB 72,129,196,160,0,0,0 ; add $0xa0,%rsp
+ DB 91 ; pop %rbx
+ DB 95 ; pop %rdi
+ DB 94 ; pop %rsi
+ DB 65,92 ; pop %r12
+ DB 65,93 ; pop %r13
+ DB 65,94 ; pop %r14
+ DB 65,95 ; pop %r15
+ DB 197,248,119 ; vzeroupper
+ DB 195 ; retq
+
+PUBLIC _sk_just_return_avx
+_sk_just_return_avx LABEL PROC
+ DB 195 ; retq
+
+PUBLIC _sk_seed_shader_avx
+_sk_seed_shader_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,249,110,199 ; vmovd %edi,%xmm0
+ DB 196,227,121,4,192,0 ; vpermilps $0x0,%xmm0,%xmm0
+ DB 196,227,125,24,192,1 ; vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
+ DB 197,252,91,192 ; vcvtdq2ps %ymm0,%ymm0
+ DB 196,226,125,24,74,4 ; vbroadcastss 0x4(%rdx),%ymm1
+ DB 197,252,88,193 ; vaddps %ymm1,%ymm0,%ymm0
+ DB 197,252,88,66,20 ; vaddps 0x14(%rdx),%ymm0,%ymm0
+ DB 197,249,110,16 ; vmovd (%rax),%xmm2
+ DB 196,227,121,4,210,0 ; vpermilps $0x0,%xmm2,%xmm2
+ DB 196,227,109,24,210,1 ; vinsertf128 $0x1,%xmm2,%ymm2,%ymm2
+ DB 197,252,91,210 ; vcvtdq2ps %ymm2,%ymm2
+ DB 197,236,88,201 ; vaddps %ymm1,%ymm2,%ymm1
+ DB 196,226,125,24,18 ; vbroadcastss (%rdx),%ymm2
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,228,87,219 ; vxorps %ymm3,%ymm3,%ymm3
+ DB 197,220,87,228 ; vxorps %ymm4,%ymm4,%ymm4
+ DB 197,212,87,237 ; vxorps %ymm5,%ymm5,%ymm5
+ DB 197,204,87,246 ; vxorps %ymm6,%ymm6,%ymm6
+ DB 197,196,87,255 ; vxorps %ymm7,%ymm7,%ymm7
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_constant_color_avx
+_sk_constant_color_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,226,125,24,0 ; vbroadcastss (%rax),%ymm0
+ DB 196,226,125,24,72,4 ; vbroadcastss 0x4(%rax),%ymm1
+ DB 196,226,125,24,80,8 ; vbroadcastss 0x8(%rax),%ymm2
+ DB 196,226,125,24,88,12 ; vbroadcastss 0xc(%rax),%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_clear_avx
+_sk_clear_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,252,87,192 ; vxorps %ymm0,%ymm0,%ymm0
+ DB 197,244,87,201 ; vxorps %ymm1,%ymm1,%ymm1
+ DB 197,236,87,210 ; vxorps %ymm2,%ymm2,%ymm2
+ DB 197,228,87,219 ; vxorps %ymm3,%ymm3,%ymm3
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_plus__avx
+_sk_plus__avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,252,88,196 ; vaddps %ymm4,%ymm0,%ymm0
+ DB 197,244,88,205 ; vaddps %ymm5,%ymm1,%ymm1
+ DB 197,236,88,214 ; vaddps %ymm6,%ymm2,%ymm2
+ DB 197,228,88,223 ; vaddps %ymm7,%ymm3,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_srcover_avx
+_sk_srcover_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,98,125,24,2 ; vbroadcastss (%rdx),%ymm8
+ DB 197,60,92,195 ; vsubps %ymm3,%ymm8,%ymm8
+ DB 197,60,89,204 ; vmulps %ymm4,%ymm8,%ymm9
+ DB 197,180,88,192 ; vaddps %ymm0,%ymm9,%ymm0
+ DB 197,60,89,205 ; vmulps %ymm5,%ymm8,%ymm9
+ DB 197,180,88,201 ; vaddps %ymm1,%ymm9,%ymm1
+ DB 197,60,89,206 ; vmulps %ymm6,%ymm8,%ymm9
+ DB 197,180,88,210 ; vaddps %ymm2,%ymm9,%ymm2
+ DB 197,60,89,199 ; vmulps %ymm7,%ymm8,%ymm8
+ DB 197,188,88,219 ; vaddps %ymm3,%ymm8,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_dstover_avx
+_sk_dstover_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,98,125,24,2 ; vbroadcastss (%rdx),%ymm8
+ DB 197,60,92,199 ; vsubps %ymm7,%ymm8,%ymm8
+ DB 197,188,89,192 ; vmulps %ymm0,%ymm8,%ymm0
+ DB 197,252,88,196 ; vaddps %ymm4,%ymm0,%ymm0
+ DB 197,188,89,201 ; vmulps %ymm1,%ymm8,%ymm1
+ DB 197,244,88,205 ; vaddps %ymm5,%ymm1,%ymm1
+ DB 197,188,89,210 ; vmulps %ymm2,%ymm8,%ymm2
+ DB 197,236,88,214 ; vaddps %ymm6,%ymm2,%ymm2
+ DB 197,188,89,219 ; vmulps %ymm3,%ymm8,%ymm3
+ DB 197,228,88,223 ; vaddps %ymm7,%ymm3,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_clamp_0_avx
+_sk_clamp_0_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8
+ DB 196,193,124,95,192 ; vmaxps %ymm8,%ymm0,%ymm0
+ DB 196,193,116,95,200 ; vmaxps %ymm8,%ymm1,%ymm1
+ DB 196,193,108,95,208 ; vmaxps %ymm8,%ymm2,%ymm2
+ DB 196,193,100,95,216 ; vmaxps %ymm8,%ymm3,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_clamp_1_avx
+_sk_clamp_1_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,98,125,24,2 ; vbroadcastss (%rdx),%ymm8
+ DB 196,193,124,93,192 ; vminps %ymm8,%ymm0,%ymm0
+ DB 196,193,116,93,200 ; vminps %ymm8,%ymm1,%ymm1
+ DB 196,193,108,93,208 ; vminps %ymm8,%ymm2,%ymm2
+ DB 196,193,100,93,216 ; vminps %ymm8,%ymm3,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_clamp_a_avx
+_sk_clamp_a_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,98,125,24,2 ; vbroadcastss (%rdx),%ymm8
+ DB 196,193,100,93,216 ; vminps %ymm8,%ymm3,%ymm3
+ DB 197,252,93,195 ; vminps %ymm3,%ymm0,%ymm0
+ DB 197,244,93,203 ; vminps %ymm3,%ymm1,%ymm1
+ DB 197,236,93,211 ; vminps %ymm3,%ymm2,%ymm2
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_set_rgb_avx
+_sk_set_rgb_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,226,125,24,0 ; vbroadcastss (%rax),%ymm0
+ DB 196,226,125,24,72,4 ; vbroadcastss 0x4(%rax),%ymm1
+ DB 196,226,125,24,80,8 ; vbroadcastss 0x8(%rax),%ymm2
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_swap_rb_avx
+_sk_swap_rb_avx LABEL PROC
+ DB 197,124,40,192 ; vmovaps %ymm0,%ymm8
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,252,40,194 ; vmovaps %ymm2,%ymm0
+ DB 197,124,41,194 ; vmovaps %ymm8,%ymm2
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_swap_avx
+_sk_swap_avx LABEL PROC
+ DB 197,124,40,195 ; vmovaps %ymm3,%ymm8
+ DB 197,124,40,202 ; vmovaps %ymm2,%ymm9
+ DB 197,124,40,209 ; vmovaps %ymm1,%ymm10
+ DB 197,124,40,216 ; vmovaps %ymm0,%ymm11
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,252,40,196 ; vmovaps %ymm4,%ymm0
+ DB 197,252,40,205 ; vmovaps %ymm5,%ymm1
+ DB 197,252,40,214 ; vmovaps %ymm6,%ymm2
+ DB 197,252,40,223 ; vmovaps %ymm7,%ymm3
+ DB 197,124,41,220 ; vmovaps %ymm11,%ymm4
+ DB 197,124,41,213 ; vmovaps %ymm10,%ymm5
+ DB 197,124,41,206 ; vmovaps %ymm9,%ymm6
+ DB 197,124,41,199 ; vmovaps %ymm8,%ymm7
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_move_src_dst_avx
+_sk_move_src_dst_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,252,40,224 ; vmovaps %ymm0,%ymm4
+ DB 197,252,40,233 ; vmovaps %ymm1,%ymm5
+ DB 197,252,40,242 ; vmovaps %ymm2,%ymm6
+ DB 197,252,40,251 ; vmovaps %ymm3,%ymm7
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_move_dst_src_avx
+_sk_move_dst_src_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,252,40,196 ; vmovaps %ymm4,%ymm0
+ DB 197,252,40,205 ; vmovaps %ymm5,%ymm1
+ DB 197,252,40,214 ; vmovaps %ymm6,%ymm2
+ DB 197,252,40,223 ; vmovaps %ymm7,%ymm3
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_premul_avx
+_sk_premul_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,252,89,195 ; vmulps %ymm3,%ymm0,%ymm0
+ DB 197,244,89,203 ; vmulps %ymm3,%ymm1,%ymm1
+ DB 197,236,89,211 ; vmulps %ymm3,%ymm2,%ymm2
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_unpremul_avx
+_sk_unpremul_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8
+ DB 196,65,100,194,200,0 ; vcmpeqps %ymm8,%ymm3,%ymm9
+ DB 196,98,125,24,18 ; vbroadcastss (%rdx),%ymm10
+ DB 197,44,94,211 ; vdivps %ymm3,%ymm10,%ymm10
+ DB 196,67,45,74,192,144 ; vblendvps %ymm9,%ymm8,%ymm10,%ymm8
+ DB 197,188,89,192 ; vmulps %ymm0,%ymm8,%ymm0
+ DB 197,188,89,201 ; vmulps %ymm1,%ymm8,%ymm1
+ DB 197,188,89,210 ; vmulps %ymm2,%ymm8,%ymm2
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_from_srgb_avx
+_sk_from_srgb_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,98,125,24,66,64 ; vbroadcastss 0x40(%rdx),%ymm8
+ DB 197,60,89,200 ; vmulps %ymm0,%ymm8,%ymm9
+ DB 197,124,89,208 ; vmulps %ymm0,%ymm0,%ymm10
+ DB 196,98,125,24,90,60 ; vbroadcastss 0x3c(%rdx),%ymm11
+ DB 196,98,125,24,98,56 ; vbroadcastss 0x38(%rdx),%ymm12
+ DB 197,36,89,232 ; vmulps %ymm0,%ymm11,%ymm13
+ DB 196,65,20,88,236 ; vaddps %ymm12,%ymm13,%ymm13
+ DB 196,98,125,24,114,52 ; vbroadcastss 0x34(%rdx),%ymm14
+ DB 196,65,44,89,213 ; vmulps %ymm13,%ymm10,%ymm10
+ DB 196,65,12,88,210 ; vaddps %ymm10,%ymm14,%ymm10
+ DB 196,98,125,24,106,68 ; vbroadcastss 0x44(%rdx),%ymm13
+ DB 196,193,124,194,197,1 ; vcmpltps %ymm13,%ymm0,%ymm0
+ DB 196,195,45,74,193,0 ; vblendvps %ymm0,%ymm9,%ymm10,%ymm0
+ DB 197,60,89,201 ; vmulps %ymm1,%ymm8,%ymm9
+ DB 197,116,89,209 ; vmulps %ymm1,%ymm1,%ymm10
+ DB 197,36,89,249 ; vmulps %ymm1,%ymm11,%ymm15
+ DB 196,65,4,88,252 ; vaddps %ymm12,%ymm15,%ymm15
+ DB 196,65,44,89,215 ; vmulps %ymm15,%ymm10,%ymm10
+ DB 196,65,12,88,210 ; vaddps %ymm10,%ymm14,%ymm10
+ DB 196,193,116,194,205,1 ; vcmpltps %ymm13,%ymm1,%ymm1
+ DB 196,195,45,74,201,16 ; vblendvps %ymm1,%ymm9,%ymm10,%ymm1
+ DB 197,60,89,194 ; vmulps %ymm2,%ymm8,%ymm8
+ DB 197,108,89,202 ; vmulps %ymm2,%ymm2,%ymm9
+ DB 197,36,89,210 ; vmulps %ymm2,%ymm11,%ymm10
+ DB 196,65,44,88,212 ; vaddps %ymm12,%ymm10,%ymm10
+ DB 196,65,52,89,202 ; vmulps %ymm10,%ymm9,%ymm9
+ DB 196,65,12,88,201 ; vaddps %ymm9,%ymm14,%ymm9
+ DB 196,193,108,194,213,1 ; vcmpltps %ymm13,%ymm2,%ymm2
+ DB 196,195,53,74,208,32 ; vblendvps %ymm2,%ymm8,%ymm9,%ymm2
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_to_srgb_avx
+_sk_to_srgb_avx LABEL PROC
+ DB 197,124,82,192 ; vrsqrtps %ymm0,%ymm8
+ DB 196,65,124,83,200 ; vrcpps %ymm8,%ymm9
+ DB 196,65,124,82,208 ; vrsqrtps %ymm8,%ymm10
+ DB 196,98,125,24,66,72 ; vbroadcastss 0x48(%rdx),%ymm8
+ DB 197,60,89,216 ; vmulps %ymm0,%ymm8,%ymm11
+ DB 196,98,125,24,34 ; vbroadcastss (%rdx),%ymm12
+ DB 196,98,125,24,106,76 ; vbroadcastss 0x4c(%rdx),%ymm13
+ DB 196,98,125,24,114,80 ; vbroadcastss 0x50(%rdx),%ymm14
+ DB 196,98,125,24,122,84 ; vbroadcastss 0x54(%rdx),%ymm15
+ DB 196,65,52,89,206 ; vmulps %ymm14,%ymm9,%ymm9
+ DB 196,65,52,88,207 ; vaddps %ymm15,%ymm9,%ymm9
+ DB 196,65,44,89,213 ; vmulps %ymm13,%ymm10,%ymm10
+ DB 196,65,44,88,201 ; vaddps %ymm9,%ymm10,%ymm9
+ DB 196,65,28,93,201 ; vminps %ymm9,%ymm12,%ymm9
+ DB 196,98,125,24,82,88 ; vbroadcastss 0x58(%rdx),%ymm10
+ DB 196,193,124,194,194,1 ; vcmpltps %ymm10,%ymm0,%ymm0
+ DB 196,195,53,74,195,0 ; vblendvps %ymm0,%ymm11,%ymm9,%ymm0
+ DB 197,124,82,201 ; vrsqrtps %ymm1,%ymm9
+ DB 196,65,124,83,217 ; vrcpps %ymm9,%ymm11
+ DB 196,65,124,82,201 ; vrsqrtps %ymm9,%ymm9
+ DB 196,65,12,89,219 ; vmulps %ymm11,%ymm14,%ymm11
+ DB 196,65,4,88,219 ; vaddps %ymm11,%ymm15,%ymm11
+ DB 196,65,20,89,201 ; vmulps %ymm9,%ymm13,%ymm9
+ DB 196,65,52,88,203 ; vaddps %ymm11,%ymm9,%ymm9
+ DB 197,60,89,217 ; vmulps %ymm1,%ymm8,%ymm11
+ DB 196,65,28,93,201 ; vminps %ymm9,%ymm12,%ymm9
+ DB 196,193,116,194,202,1 ; vcmpltps %ymm10,%ymm1,%ymm1
+ DB 196,195,53,74,203,16 ; vblendvps %ymm1,%ymm11,%ymm9,%ymm1
+ DB 197,124,82,202 ; vrsqrtps %ymm2,%ymm9
+ DB 196,65,124,83,217 ; vrcpps %ymm9,%ymm11
+ DB 196,65,12,89,219 ; vmulps %ymm11,%ymm14,%ymm11
+ DB 196,65,4,88,219 ; vaddps %ymm11,%ymm15,%ymm11
+ DB 196,65,124,82,201 ; vrsqrtps %ymm9,%ymm9
+ DB 196,65,20,89,201 ; vmulps %ymm9,%ymm13,%ymm9
+ DB 196,65,52,88,203 ; vaddps %ymm11,%ymm9,%ymm9
+ DB 196,65,28,93,201 ; vminps %ymm9,%ymm12,%ymm9
+ DB 197,60,89,194 ; vmulps %ymm2,%ymm8,%ymm8
+ DB 196,193,108,194,210,1 ; vcmpltps %ymm10,%ymm2,%ymm2
+ DB 196,195,53,74,208,32 ; vblendvps %ymm2,%ymm8,%ymm9,%ymm2
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_scale_u8_avx
+_sk_scale_u8_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 72,139,0 ; mov (%rax),%rax
+ DB 196,98,121,49,68,56,4 ; vpmovzxbd 0x4(%rax,%rdi,1),%xmm8
+ DB 196,98,121,49,12,56 ; vpmovzxbd (%rax,%rdi,1),%xmm9
+ DB 196,67,53,24,192,1 ; vinsertf128 $0x1,%xmm8,%ymm9,%ymm8
+ DB 196,65,124,91,192 ; vcvtdq2ps %ymm8,%ymm8
+ DB 196,98,125,24,74,12 ; vbroadcastss 0xc(%rdx),%ymm9
+ DB 196,65,60,89,193 ; vmulps %ymm9,%ymm8,%ymm8
+ DB 197,188,89,192 ; vmulps %ymm0,%ymm8,%ymm0
+ DB 197,188,89,201 ; vmulps %ymm1,%ymm8,%ymm1
+ DB 197,188,89,210 ; vmulps %ymm2,%ymm8,%ymm2
+ DB 197,188,89,219 ; vmulps %ymm3,%ymm8,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_lerp_u8_avx
+_sk_lerp_u8_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 72,139,0 ; mov (%rax),%rax
+ DB 196,98,121,49,68,56,4 ; vpmovzxbd 0x4(%rax,%rdi,1),%xmm8
+ DB 196,98,121,49,12,56 ; vpmovzxbd (%rax,%rdi,1),%xmm9
+ DB 196,67,53,24,192,1 ; vinsertf128 $0x1,%xmm8,%ymm9,%ymm8
+ DB 196,65,124,91,192 ; vcvtdq2ps %ymm8,%ymm8
+ DB 196,98,125,24,74,12 ; vbroadcastss 0xc(%rdx),%ymm9
+ DB 196,65,60,89,193 ; vmulps %ymm9,%ymm8,%ymm8
+ DB 197,252,92,196 ; vsubps %ymm4,%ymm0,%ymm0
+ DB 196,193,124,89,192 ; vmulps %ymm8,%ymm0,%ymm0
+ DB 197,252,88,196 ; vaddps %ymm4,%ymm0,%ymm0
+ DB 197,244,92,205 ; vsubps %ymm5,%ymm1,%ymm1
+ DB 196,193,116,89,200 ; vmulps %ymm8,%ymm1,%ymm1
+ DB 197,244,88,205 ; vaddps %ymm5,%ymm1,%ymm1
+ DB 197,236,92,214 ; vsubps %ymm6,%ymm2,%ymm2
+ DB 196,193,108,89,208 ; vmulps %ymm8,%ymm2,%ymm2
+ DB 197,236,88,214 ; vaddps %ymm6,%ymm2,%ymm2
+ DB 197,228,92,223 ; vsubps %ymm7,%ymm3,%ymm3
+ DB 196,193,100,89,216 ; vmulps %ymm8,%ymm3,%ymm3
+ DB 197,228,88,223 ; vaddps %ymm7,%ymm3,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_load_tables_avx
+_sk_load_tables_avx LABEL PROC
+ DB 65,87 ; push %r15
+ DB 65,86 ; push %r14
+ DB 65,84 ; push %r12
+ DB 83 ; push %rbx
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 76,139,0 ; mov (%rax),%r8
+ DB 72,139,72,8 ; mov 0x8(%rax),%rcx
+ DB 196,65,124,16,20,184 ; vmovups (%r8,%rdi,4),%ymm10
+ DB 197,249,110,66,16 ; vmovd 0x10(%rdx),%xmm0
+ DB 196,227,121,4,192,0 ; vpermilps $0x0,%xmm0,%xmm0
+ DB 196,99,125,24,200,1 ; vinsertf128 $0x1,%xmm0,%ymm0,%ymm9
+ DB 196,193,52,84,194 ; vandps %ymm10,%ymm9,%ymm0
+ DB 196,193,249,126,192 ; vmovq %xmm0,%r8
+ DB 69,137,193 ; mov %r8d,%r9d
+ DB 196,195,249,22,194,1 ; vpextrq $0x1,%xmm0,%r10
+ DB 69,137,211 ; mov %r10d,%r11d
+ DB 73,193,234,32 ; shr $0x20,%r10
+ DB 73,193,232,32 ; shr $0x20,%r8
+ DB 196,227,125,25,192,1 ; vextractf128 $0x1,%ymm0,%xmm0
+ DB 196,193,249,126,199 ; vmovq %xmm0,%r15
+ DB 69,137,254 ; mov %r15d,%r14d
+ DB 196,227,249,22,195,1 ; vpextrq $0x1,%xmm0,%rbx
+ DB 65,137,220 ; mov %ebx,%r12d
+ DB 72,193,235,32 ; shr $0x20,%rbx
+ DB 73,193,239,32 ; shr $0x20,%r15
+ DB 196,161,122,16,4,177 ; vmovss (%rcx,%r14,4),%xmm0
+ DB 196,163,121,33,4,185,16 ; vinsertps $0x10,(%rcx,%r15,4),%xmm0,%xmm0
+ DB 196,163,121,33,4,161,32 ; vinsertps $0x20,(%rcx,%r12,4),%xmm0,%xmm0
+ DB 196,227,121,33,4,153,48 ; vinsertps $0x30,(%rcx,%rbx,4),%xmm0,%xmm0
+ DB 196,161,122,16,12,137 ; vmovss (%rcx,%r9,4),%xmm1
+ DB 196,163,113,33,12,129,16 ; vinsertps $0x10,(%rcx,%r8,4),%xmm1,%xmm1
+ DB 196,163,113,33,12,153,32 ; vinsertps $0x20,(%rcx,%r11,4),%xmm1,%xmm1
+ DB 196,163,113,33,12,145,48 ; vinsertps $0x30,(%rcx,%r10,4),%xmm1,%xmm1
+ DB 196,227,117,24,192,1 ; vinsertf128 $0x1,%xmm0,%ymm1,%ymm0
+ DB 76,139,120,16 ; mov 0x10(%rax),%r15
+ DB 196,193,113,114,210,8 ; vpsrld $0x8,%xmm10,%xmm1
+ DB 196,67,125,25,208,1 ; vextractf128 $0x1,%ymm10,%xmm8
+ DB 196,193,105,114,208,8 ; vpsrld $0x8,%xmm8,%xmm2
+ DB 196,227,117,24,202,1 ; vinsertf128 $0x1,%xmm2,%ymm1,%ymm1
+ DB 197,180,84,201 ; vandps %ymm1,%ymm9,%ymm1
+ DB 196,193,249,126,200 ; vmovq %xmm1,%r8
+ DB 69,137,194 ; mov %r8d,%r10d
+ DB 196,195,249,22,201,1 ; vpextrq $0x1,%xmm1,%r9
+ DB 69,137,203 ; mov %r9d,%r11d
+ DB 73,193,233,32 ; shr $0x20,%r9
+ DB 73,193,232,32 ; shr $0x20,%r8
+ DB 196,227,125,25,201,1 ; vextractf128 $0x1,%ymm1,%xmm1
+ DB 196,225,249,126,203 ; vmovq %xmm1,%rbx
+ DB 65,137,222 ; mov %ebx,%r14d
+ DB 196,227,249,22,201,1 ; vpextrq $0x1,%xmm1,%rcx
+ DB 65,137,204 ; mov %ecx,%r12d
+ DB 72,193,233,32 ; shr $0x20,%rcx
+ DB 72,193,235,32 ; shr $0x20,%rbx
+ DB 196,129,122,16,12,183 ; vmovss (%r15,%r14,4),%xmm1
+ DB 196,195,113,33,12,159,16 ; vinsertps $0x10,(%r15,%rbx,4),%xmm1,%xmm1
+ DB 196,129,122,16,20,167 ; vmovss (%r15,%r12,4),%xmm2
+ DB 196,227,113,33,202,32 ; vinsertps $0x20,%xmm2,%xmm1,%xmm1
+ DB 196,193,122,16,20,143 ; vmovss (%r15,%rcx,4),%xmm2
+ DB 196,227,113,33,202,48 ; vinsertps $0x30,%xmm2,%xmm1,%xmm1
+ DB 196,129,122,16,20,151 ; vmovss (%r15,%r10,4),%xmm2
+ DB 196,131,105,33,20,135,16 ; vinsertps $0x10,(%r15,%r8,4),%xmm2,%xmm2
+ DB 196,129,122,16,28,159 ; vmovss (%r15,%r11,4),%xmm3
+ DB 196,227,105,33,211,32 ; vinsertps $0x20,%xmm3,%xmm2,%xmm2
+ DB 196,129,122,16,28,143 ; vmovss (%r15,%r9,4),%xmm3
+ DB 196,227,105,33,211,48 ; vinsertps $0x30,%xmm3,%xmm2,%xmm2
+ DB 196,227,109,24,201,1 ; vinsertf128 $0x1,%xmm1,%ymm2,%ymm1
+ DB 72,139,64,24 ; mov 0x18(%rax),%rax
+ DB 196,193,105,114,210,16 ; vpsrld $0x10,%xmm10,%xmm2
+ DB 196,193,97,114,208,16 ; vpsrld $0x10,%xmm8,%xmm3
+ DB 196,227,109,24,211,1 ; vinsertf128 $0x1,%xmm3,%ymm2,%ymm2
+ DB 197,180,84,210 ; vandps %ymm2,%ymm9,%ymm2
+ DB 196,193,249,126,208 ; vmovq %xmm2,%r8
+ DB 69,137,193 ; mov %r8d,%r9d
+ DB 196,195,249,22,214,1 ; vpextrq $0x1,%xmm2,%r14
+ DB 69,137,242 ; mov %r14d,%r10d
+ DB 73,193,238,32 ; shr $0x20,%r14
+ DB 73,193,232,32 ; shr $0x20,%r8
+ DB 196,227,125,25,210,1 ; vextractf128 $0x1,%ymm2,%xmm2
+ DB 196,225,249,126,211 ; vmovq %xmm2,%rbx
+ DB 65,137,219 ; mov %ebx,%r11d
+ DB 196,227,249,22,209,1 ; vpextrq $0x1,%xmm2,%rcx
+ DB 65,137,207 ; mov %ecx,%r15d
+ DB 72,193,233,32 ; shr $0x20,%rcx
+ DB 72,193,235,32 ; shr $0x20,%rbx
+ DB 196,161,122,16,20,152 ; vmovss (%rax,%r11,4),%xmm2
+ DB 196,227,105,33,20,152,16 ; vinsertps $0x10,(%rax,%rbx,4),%xmm2,%xmm2
+ DB 196,161,122,16,28,184 ; vmovss (%rax,%r15,4),%xmm3
+ DB 196,227,105,33,211,32 ; vinsertps $0x20,%xmm3,%xmm2,%xmm2
+ DB 197,250,16,28,136 ; vmovss (%rax,%rcx,4),%xmm3
+ DB 196,99,105,33,203,48 ; vinsertps $0x30,%xmm3,%xmm2,%xmm9
+ DB 196,161,122,16,28,136 ; vmovss (%rax,%r9,4),%xmm3
+ DB 196,163,97,33,28,128,16 ; vinsertps $0x10,(%rax,%r8,4),%xmm3,%xmm3
+ DB 196,161,122,16,20,144 ; vmovss (%rax,%r10,4),%xmm2
+ DB 196,227,97,33,210,32 ; vinsertps $0x20,%xmm2,%xmm3,%xmm2
+ DB 196,161,122,16,28,176 ; vmovss (%rax,%r14,4),%xmm3
+ DB 196,227,105,33,211,48 ; vinsertps $0x30,%xmm3,%xmm2,%xmm2
+ DB 196,195,109,24,209,1 ; vinsertf128 $0x1,%xmm9,%ymm2,%ymm2
+ DB 196,193,49,114,210,24 ; vpsrld $0x18,%xmm10,%xmm9
+ DB 196,193,97,114,208,24 ; vpsrld $0x18,%xmm8,%xmm3
+ DB 196,227,53,24,219,1 ; vinsertf128 $0x1,%xmm3,%ymm9,%ymm3
+ DB 197,252,91,219 ; vcvtdq2ps %ymm3,%ymm3
+ DB 196,98,125,24,66,12 ; vbroadcastss 0xc(%rdx),%ymm8
+ DB 196,193,100,89,216 ; vmulps %ymm8,%ymm3,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 91 ; pop %rbx
+ DB 65,92 ; pop %r12
+ DB 65,94 ; pop %r14
+ DB 65,95 ; pop %r15
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_load_8888_avx
+_sk_load_8888_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 72,139,0 ; mov (%rax),%rax
+ DB 197,252,16,28,184 ; vmovups (%rax,%rdi,4),%ymm3
+ DB 197,249,110,66,16 ; vmovd 0x10(%rdx),%xmm0
+ DB 196,227,121,4,192,0 ; vpermilps $0x0,%xmm0,%xmm0
+ DB 196,99,125,24,216,1 ; vinsertf128 $0x1,%xmm0,%ymm0,%ymm11
+ DB 197,164,84,195 ; vandps %ymm3,%ymm11,%ymm0
+ DB 197,252,91,192 ; vcvtdq2ps %ymm0,%ymm0
+ DB 196,98,125,24,66,12 ; vbroadcastss 0xc(%rdx),%ymm8
+ DB 197,188,89,192 ; vmulps %ymm0,%ymm8,%ymm0
+ DB 197,169,114,211,8 ; vpsrld $0x8,%xmm3,%xmm10
+ DB 196,195,125,25,217,1 ; vextractf128 $0x1,%ymm3,%xmm9
+ DB 196,193,113,114,209,8 ; vpsrld $0x8,%xmm9,%xmm1
+ DB 196,227,45,24,201,1 ; vinsertf128 $0x1,%xmm1,%ymm10,%ymm1
+ DB 197,164,84,201 ; vandps %ymm1,%ymm11,%ymm1
+ DB 197,252,91,201 ; vcvtdq2ps %ymm1,%ymm1
+ DB 197,188,89,201 ; vmulps %ymm1,%ymm8,%ymm1
+ DB 197,169,114,211,16 ; vpsrld $0x10,%xmm3,%xmm10
+ DB 196,193,105,114,209,16 ; vpsrld $0x10,%xmm9,%xmm2
+ DB 196,227,45,24,210,1 ; vinsertf128 $0x1,%xmm2,%ymm10,%ymm2
+ DB 197,164,84,210 ; vandps %ymm2,%ymm11,%ymm2
+ DB 197,252,91,210 ; vcvtdq2ps %ymm2,%ymm2
+ DB 197,188,89,210 ; vmulps %ymm2,%ymm8,%ymm2
+ DB 197,169,114,211,24 ; vpsrld $0x18,%xmm3,%xmm10
+ DB 196,193,97,114,209,24 ; vpsrld $0x18,%xmm9,%xmm3
+ DB 196,227,45,24,219,1 ; vinsertf128 $0x1,%xmm3,%ymm10,%ymm3
+ DB 197,252,91,219 ; vcvtdq2ps %ymm3,%ymm3
+ DB 196,193,100,89,216 ; vmulps %ymm8,%ymm3,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_store_8888_avx
+_sk_store_8888_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 72,139,0 ; mov (%rax),%rax
+ DB 196,98,125,24,66,8 ; vbroadcastss 0x8(%rdx),%ymm8
+ DB 197,60,89,200 ; vmulps %ymm0,%ymm8,%ymm9
+ DB 196,65,125,91,201 ; vcvtps2dq %ymm9,%ymm9
+ DB 197,60,89,209 ; vmulps %ymm1,%ymm8,%ymm10
+ DB 196,65,125,91,210 ; vcvtps2dq %ymm10,%ymm10
+ DB 196,193,33,114,242,8 ; vpslld $0x8,%xmm10,%xmm11
+ DB 196,67,125,25,210,1 ; vextractf128 $0x1,%ymm10,%xmm10
+ DB 196,193,41,114,242,8 ; vpslld $0x8,%xmm10,%xmm10
+ DB 196,67,37,24,210,1 ; vinsertf128 $0x1,%xmm10,%ymm11,%ymm10
+ DB 196,65,45,86,201 ; vorpd %ymm9,%ymm10,%ymm9
+ DB 197,60,89,210 ; vmulps %ymm2,%ymm8,%ymm10
+ DB 196,65,125,91,210 ; vcvtps2dq %ymm10,%ymm10
+ DB 196,193,33,114,242,16 ; vpslld $0x10,%xmm10,%xmm11
+ DB 196,67,125,25,210,1 ; vextractf128 $0x1,%ymm10,%xmm10
+ DB 196,193,41,114,242,16 ; vpslld $0x10,%xmm10,%xmm10
+ DB 196,67,37,24,210,1 ; vinsertf128 $0x1,%xmm10,%ymm11,%ymm10
+ DB 196,65,53,86,202 ; vorpd %ymm10,%ymm9,%ymm9
+ DB 197,60,89,195 ; vmulps %ymm3,%ymm8,%ymm8
+ DB 196,65,125,91,192 ; vcvtps2dq %ymm8,%ymm8
+ DB 196,193,41,114,240,24 ; vpslld $0x18,%xmm8,%xmm10
+ DB 196,67,125,25,192,1 ; vextractf128 $0x1,%ymm8,%xmm8
+ DB 196,193,57,114,240,24 ; vpslld $0x18,%xmm8,%xmm8
+ DB 196,67,45,24,192,1 ; vinsertf128 $0x1,%xmm8,%ymm10,%ymm8
+ DB 196,65,53,86,192 ; vorpd %ymm8,%ymm9,%ymm8
+ DB 197,125,17,4,184 ; vmovupd %ymm8,(%rax,%rdi,4)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_load_f16_avx
+_sk_load_f16_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_store_f16_avx
+_sk_store_f16_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_clamp_x_avx
+_sk_clamp_x_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8
+ DB 196,67,125,25,193,1 ; vextractf128 $0x1,%ymm8,%xmm9
+ DB 196,65,41,118,210 ; vpcmpeqd %xmm10,%xmm10,%xmm10
+ DB 196,65,49,254,202 ; vpaddd %xmm10,%xmm9,%xmm9
+ DB 196,65,57,254,194 ; vpaddd %xmm10,%xmm8,%xmm8
+ DB 196,67,61,24,193,1 ; vinsertf128 $0x1,%xmm9,%ymm8,%ymm8
+ DB 196,193,124,93,192 ; vminps %ymm8,%ymm0,%ymm0
+ DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8
+ DB 197,188,95,192 ; vmaxps %ymm0,%ymm8,%ymm0
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_clamp_y_avx
+_sk_clamp_y_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8
+ DB 196,67,125,25,193,1 ; vextractf128 $0x1,%ymm8,%xmm9
+ DB 196,65,41,118,210 ; vpcmpeqd %xmm10,%xmm10,%xmm10
+ DB 196,65,49,254,202 ; vpaddd %xmm10,%xmm9,%xmm9
+ DB 196,65,57,254,194 ; vpaddd %xmm10,%xmm8,%xmm8
+ DB 196,67,61,24,193,1 ; vinsertf128 $0x1,%xmm9,%ymm8,%ymm8
+ DB 196,193,116,93,200 ; vminps %ymm8,%ymm1,%ymm1
+ DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8
+ DB 197,188,95,201 ; vmaxps %ymm1,%ymm8,%ymm1
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_matrix_2x3_avx
+_sk_matrix_2x3_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8
+ DB 196,98,125,24,72,8 ; vbroadcastss 0x8(%rax),%ymm9
+ DB 196,98,125,24,80,16 ; vbroadcastss 0x10(%rax),%ymm10
+ DB 197,52,89,201 ; vmulps %ymm1,%ymm9,%ymm9
+ DB 196,65,52,88,202 ; vaddps %ymm10,%ymm9,%ymm9
+ DB 197,60,89,192 ; vmulps %ymm0,%ymm8,%ymm8
+ DB 196,65,60,88,193 ; vaddps %ymm9,%ymm8,%ymm8
+ DB 196,98,125,24,72,4 ; vbroadcastss 0x4(%rax),%ymm9
+ DB 196,98,125,24,80,12 ; vbroadcastss 0xc(%rax),%ymm10
+ DB 196,98,125,24,88,20 ; vbroadcastss 0x14(%rax),%ymm11
+ DB 197,172,89,201 ; vmulps %ymm1,%ymm10,%ymm1
+ DB 196,193,116,88,203 ; vaddps %ymm11,%ymm1,%ymm1
+ DB 197,180,89,192 ; vmulps %ymm0,%ymm9,%ymm0
+ DB 197,252,88,201 ; vaddps %ymm1,%ymm0,%ymm1
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,124,41,192 ; vmovaps %ymm8,%ymm0
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_matrix_3x4_avx
+_sk_matrix_3x4_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8
+ DB 196,98,125,24,72,12 ; vbroadcastss 0xc(%rax),%ymm9
+ DB 196,98,125,24,80,24 ; vbroadcastss 0x18(%rax),%ymm10
+ DB 196,98,125,24,88,36 ; vbroadcastss 0x24(%rax),%ymm11
+ DB 197,44,89,210 ; vmulps %ymm2,%ymm10,%ymm10
+ DB 196,65,44,88,211 ; vaddps %ymm11,%ymm10,%ymm10
+ DB 197,52,89,201 ; vmulps %ymm1,%ymm9,%ymm9
+ DB 196,65,52,88,202 ; vaddps %ymm10,%ymm9,%ymm9
+ DB 197,60,89,192 ; vmulps %ymm0,%ymm8,%ymm8
+ DB 196,65,60,88,193 ; vaddps %ymm9,%ymm8,%ymm8
+ DB 196,98,125,24,72,4 ; vbroadcastss 0x4(%rax),%ymm9
+ DB 196,98,125,24,80,16 ; vbroadcastss 0x10(%rax),%ymm10
+ DB 196,98,125,24,88,28 ; vbroadcastss 0x1c(%rax),%ymm11
+ DB 196,98,125,24,96,40 ; vbroadcastss 0x28(%rax),%ymm12
+ DB 197,36,89,218 ; vmulps %ymm2,%ymm11,%ymm11
+ DB 196,65,36,88,220 ; vaddps %ymm12,%ymm11,%ymm11
+ DB 197,44,89,209 ; vmulps %ymm1,%ymm10,%ymm10
+ DB 196,65,44,88,211 ; vaddps %ymm11,%ymm10,%ymm10
+ DB 197,52,89,200 ; vmulps %ymm0,%ymm9,%ymm9
+ DB 196,65,52,88,202 ; vaddps %ymm10,%ymm9,%ymm9
+ DB 196,98,125,24,80,8 ; vbroadcastss 0x8(%rax),%ymm10
+ DB 196,98,125,24,88,20 ; vbroadcastss 0x14(%rax),%ymm11
+ DB 196,98,125,24,96,32 ; vbroadcastss 0x20(%rax),%ymm12
+ DB 196,98,125,24,104,44 ; vbroadcastss 0x2c(%rax),%ymm13
+ DB 197,156,89,210 ; vmulps %ymm2,%ymm12,%ymm2
+ DB 196,193,108,88,213 ; vaddps %ymm13,%ymm2,%ymm2
+ DB 197,164,89,201 ; vmulps %ymm1,%ymm11,%ymm1
+ DB 197,244,88,202 ; vaddps %ymm2,%ymm1,%ymm1
+ DB 197,172,89,192 ; vmulps %ymm0,%ymm10,%ymm0
+ DB 197,252,88,209 ; vaddps %ymm1,%ymm0,%ymm2
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,124,41,192 ; vmovaps %ymm8,%ymm0
+ DB 197,124,41,201 ; vmovaps %ymm9,%ymm1
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_linear_gradient_2stops_avx
+_sk_linear_gradient_2stops_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,226,125,24,72,16 ; vbroadcastss 0x10(%rax),%ymm1
+ DB 196,226,125,24,16 ; vbroadcastss (%rax),%ymm2
+ DB 197,244,89,200 ; vmulps %ymm0,%ymm1,%ymm1
+ DB 197,108,88,193 ; vaddps %ymm1,%ymm2,%ymm8
+ DB 196,226,125,24,72,20 ; vbroadcastss 0x14(%rax),%ymm1
+ DB 196,226,125,24,80,4 ; vbroadcastss 0x4(%rax),%ymm2
+ DB 197,244,89,200 ; vmulps %ymm0,%ymm1,%ymm1
+ DB 197,236,88,201 ; vaddps %ymm1,%ymm2,%ymm1
+ DB 196,226,125,24,80,24 ; vbroadcastss 0x18(%rax),%ymm2
+ DB 196,226,125,24,88,8 ; vbroadcastss 0x8(%rax),%ymm3
+ DB 197,236,89,208 ; vmulps %ymm0,%ymm2,%ymm2
+ DB 197,228,88,210 ; vaddps %ymm2,%ymm3,%ymm2
+ DB 196,226,125,24,88,28 ; vbroadcastss 0x1c(%rax),%ymm3
+ DB 196,98,125,24,72,12 ; vbroadcastss 0xc(%rax),%ymm9
+ DB 197,228,89,192 ; vmulps %ymm0,%ymm3,%ymm0
+ DB 197,180,88,216 ; vaddps %ymm0,%ymm9,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,124,41,192 ; vmovaps %ymm8,%ymm0
+ DB 255,224 ; jmpq *%rax
+
PUBLIC _sk_start_pipeline_sse41
_sk_start_pipeline_sse41 LABEL PROC
DB 65,87 ; push %r15
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index a691f2bc62..21e3c3590b 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -104,6 +104,30 @@ using K = const SkJumper_constants;
#define WRAP(name) sk_##name##_hsw
+#elif defined(__AVX__)
+ #include <immintrin.h>
+
+ using F = float __attribute__((ext_vector_type(8)));
+ using I32 = int32_t __attribute__((ext_vector_type(8)));
+ using U32 = uint32_t __attribute__((ext_vector_type(8)));
+ using U8 = uint8_t __attribute__((ext_vector_type(8)));
+
+ static F mad(F f, F m, F a) { return f*m+a; }
+ static F min(F a, F b) { return _mm256_min_ps(a,b); }
+ static F max(F a, F b) { return _mm256_max_ps(a,b); }
+ static F rcp (F v) { return _mm256_rcp_ps (v); }
+ static F rsqrt(F v) { return _mm256_rsqrt_ps(v); }
+ static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
+
+ static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
+
+ static F gather(const float* p, U32 ix) {
+ return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
+ p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
+ }
+
+ #define WRAP(name) sk_##name##_avx
+
#elif defined(__SSE2__)
#include <immintrin.h>
@@ -499,6 +523,9 @@ STAGE(load_f16) {
g = _mm256_cvtph_ps(_mm_unpackhi_epi64(rg0123, rg4567));
b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
+#elif defined(__AVX__)
+ // TODO
+
#elif defined(__SSE2__)
auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
_23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
@@ -568,6 +595,8 @@ STAGE(store_f16) {
_mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
_mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
_mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
+#elif defined(__AVX__)
+ // TODO
#elif defined(__SSE2__)
auto float_to_half = [&](F f) {
return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000))) // Fix up the exponent,
diff --git a/src/jumper/build_stages.py b/src/jumper/build_stages.py
index b6ab3c0c48..945f77606f 100755
--- a/src/jumper/build_stages.py
+++ b/src/jumper/build_stages.py
@@ -33,6 +33,14 @@ subprocess.check_call(['clang++'] + cflags + sse41 + ['-DWIN'] +
['-c', 'src/jumper/SkJumper_stages.cpp'] +
['-o', 'win_sse41.o'])
+avx = '-mno-red-zone -mavx'.split()
+subprocess.check_call(['clang++'] + cflags + avx +
+ ['-c', 'src/jumper/SkJumper_stages.cpp'] +
+ ['-o', 'avx.o'])
+subprocess.check_call(['clang++'] + cflags + avx + ['-DWIN'] +
+ ['-c', 'src/jumper/SkJumper_stages.cpp'] +
+ ['-o', 'win_avx.o'])
+
hsw = '-mno-red-zone -mavx2 -mfma -mf16c'.split()
subprocess.check_call(['clang++'] + cflags + hsw +
['-c', 'src/jumper/SkJumper_stages.cpp'] +
@@ -125,6 +133,7 @@ parse_object_file('vfp4.o', '.long', target='elf32-littlearm')
print '#elif defined(__x86_64__)'
parse_object_file('hsw.o', '.byte')
+parse_object_file('avx.o', '.byte')
parse_object_file('sse41.o', '.byte')
parse_object_file('sse2.o', '.byte')
print '#endif'
@@ -141,6 +150,7 @@ print '''; Copyright 2017 Google Inc.
'''
print '_text SEGMENT'
parse_object_file('win_hsw.o', 'DB')
+parse_object_file('win_avx.o', 'DB')
parse_object_file('win_sse41.o', 'DB')
parse_object_file('win_sse2.o', 'DB')
print 'END'