diff options
author | Mike Klein <mtklein@chromium.org> | 2017-02-23 08:04:49 -0500 |
---|---|---|
committer | Mike Klein <mtklein@chromium.org> | 2017-02-23 13:37:39 +0000 |
commit | ca0cfb4a7a52ae894ca005475ad9de5ac1329900 (patch) | |
tree | 3f7defe919b4120bb4cef3496c207291e6d1e955 /src | |
parent | a6e431b2c1baa564d2619bdc2a51a3b5bfa7e276 (diff) |
Add AVX to the SkJumper mix.
AVX is a nice little halfway point between SSE4.1 and HSW, in terms
of instructions available, performance, and availability.
Intel chips have had AVX since ~2011, compared to ~2013 for HSW and
~2007 for SSE4.1. Like HSW it's got 8-wide 256-bit float vectors,
but integer (and double) operations are essentially still only 128-bit.
It also doesn't have F16 conversion or FMA instructions.
It doesn't look like this is going to be a burden to maintain, and only
adds a few KB of code size. In exchange, we now run 8x wide on 45% to
70% of x86 machines, depending on the OS.
In my brief testing, speed eerily resembles exact geometric progression:
SSE4.1: 1x speed (baseline)
AVX: ~sqrt(2)x speed
HSW: ~2x speed
This adds all the basic plumbing for AVX but leaves it disabled.
I'll flip it on once I've implemented the f16 TODOs.
Change-Id: I1c378dabb8a06386646371bf78ade9e9432b006f
Reviewed-on: https://skia-review.googlesource.com/8898
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src')
-rw-r--r-- | src/jumper/SkJumper.cpp | 22 | ||||
-rw-r--r-- | src/jumper/SkJumper_generated.S | 668 | ||||
-rw-r--r-- | src/jumper/SkJumper_generated_win.S | 695 | ||||
-rw-r--r-- | src/jumper/SkJumper_stages.cpp | 29 | ||||
-rwxr-xr-x | src/jumper/build_stages.py | 10 |
5 files changed, 1424 insertions, 0 deletions
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp index 18a5f0275f..b5271a6a58 100644 --- a/src/jumper/SkJumper.cpp +++ b/src/jumper/SkJumper.cpp @@ -106,16 +106,21 @@ extern "C" { #elif defined(__x86_64__) || defined(_M_X64) size_t ASM(start_pipeline,hsw )(size_t, void**, K*, size_t); + size_t ASM(start_pipeline,avx )(size_t, void**, K*, size_t); size_t ASM(start_pipeline,sse41)(size_t, void**, K*, size_t); size_t ASM(start_pipeline,sse2 )(size_t, void**, K*, size_t); StageFn ASM(just_return,hsw), + ASM(just_return,avx), ASM(just_return,sse41), ASM(just_return,sse2); #define M(st) StageFn ASM(st,hsw); STAGES(M) #undef M + #define M(st) StageFn ASM(st,avx); + STAGES(M) + #undef M #define M(st) StageFn ASM(st,sse41); STAGES(M) #undef M @@ -170,6 +175,18 @@ extern "C" { #undef M } } + static StageFn* lookup_avx(SkRasterPipeline::StockStage st) { + switch (st) { + default: + #ifdef WHATS_NEXT + gMissing[st]++; + #endif + return nullptr; + #define M(st) case SkRasterPipeline::st: return ASM(st,avx); + STAGES(M) + #undef M + } + } static StageFn* lookup_sse41(SkRasterPipeline::StockStage st) { switch (st) { default: @@ -259,6 +276,11 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const { return false; } } + if (0 && SkCpu::Supports(SkCpu::AVX)) { + if (!build_and_run(8, lookup_avx, ASM(just_return,avx), ASM(start_pipeline,avx))) { + return false; + } + } if (1 && SkCpu::Supports(SkCpu::SSE41)) { if (!build_and_run(4, lookup_sse41, ASM(just_return,sse41), ASM(start_pipeline,sse41))) { return false; diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S index 5d7ec003a2..25bfc1bcd8 100644 --- a/src/jumper/SkJumper_generated.S +++ b/src/jumper/SkJumper_generated.S @@ -1854,6 +1854,674 @@ _sk_linear_gradient_2stops_hsw: .byte 0xc5,0x7c,0x29,0xc0 // vmovaps %ymm8,%ymm0 .byte 0xff,0xe0 // jmpq *%rax +.globl _sk_start_pipeline_avx +_sk_start_pipeline_avx: + .byte 0x41,0x57 // push %r15 + .byte 0x41,0x56 // push %r14 + .byte 0x41,0x55 // push %r13 + .byte 0x41,0x54 // push %r12 + .byte 0x53 // push %rbx + .byte 0x49,0x89,0xcf // mov %rcx,%r15 + .byte 0x49,0x89,0xd6 // mov %rdx,%r14 + .byte 0x48,0x89,0xfb // mov %rdi,%rbx + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x49,0x89,0xc4 // mov %rax,%r12 + .byte 0x49,0x89,0xf5 // mov %rsi,%r13 + .byte 0x48,0x8d,0x43,0x08 // lea 0x8(%rbx),%rax + .byte 0x4c,0x39,0xf8 // cmp %r15,%rax + .byte 0x76,0x05 // jbe 28 <_sk_start_pipeline_avx+0x28> + .byte 0x48,0x89,0xd8 // mov %rbx,%rax + .byte 0xeb,0x3c // jmp 64 <_sk_start_pipeline_avx+0x64> + .byte 0xc5,0xfc,0x57,0xc0 // vxorps %ymm0,%ymm0,%ymm0 + .byte 0xc5,0xf4,0x57,0xc9 // vxorps %ymm1,%ymm1,%ymm1 + .byte 0xc5,0xec,0x57,0xd2 // vxorps %ymm2,%ymm2,%ymm2 + .byte 0xc5,0xe4,0x57,0xdb // vxorps %ymm3,%ymm3,%ymm3 + .byte 0xc5,0xdc,0x57,0xe4 // vxorps %ymm4,%ymm4,%ymm4 + .byte 0xc5,0xd4,0x57,0xed // vxorps %ymm5,%ymm5,%ymm5 + .byte 0xc5,0xcc,0x57,0xf6 // vxorps %ymm6,%ymm6,%ymm6 + .byte 0xc5,0xc4,0x57,0xff // vxorps %ymm7,%ymm7,%ymm7 + .byte 0x48,0x89,0xdf // mov %rbx,%rdi + .byte 0x4c,0x89,0xee // mov %r13,%rsi + .byte 0x4c,0x89,0xf2 // mov %r14,%rdx + .byte 0x41,0xff,0xd4 // callq *%r12 + .byte 0x48,0x8d,0x43,0x08 // lea 0x8(%rbx),%rax + .byte 0x48,0x83,0xc3,0x10 // add $0x10,%rbx + .byte 0x4c,0x39,0xfb // cmp %r15,%rbx + .byte 0x48,0x89,0xc3 // mov %rax,%rbx + .byte 0x76,0xc4 // jbe 28 <_sk_start_pipeline_avx+0x28> + .byte 0x5b // pop %rbx + .byte 0x41,0x5c // pop %r12 + .byte 0x41,0x5d // pop %r13 + .byte 0x41,0x5e // pop %r14 + .byte 0x41,0x5f // pop %r15 + .byte 0xc5,0xf8,0x77 // vzeroupper + .byte 0xc3 // retq + +.globl _sk_just_return_avx +_sk_just_return_avx: + .byte 0xc3 // retq + +.globl _sk_seed_shader_avx +_sk_seed_shader_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0xf9,0x6e,0xc7 // vmovd %edi,%xmm0 + .byte 0xc4,0xe3,0x79,0x04,0xc0,0x00 // vpermilps $0x0,%xmm0,%xmm0 + .byte 0xc4,0xe3,0x7d,0x18,0xc0,0x01 // vinsertf128 $0x1,%xmm0,%ymm0,%ymm0 + .byte 0xc5,0xfc,0x5b,0xc0 // vcvtdq2ps %ymm0,%ymm0 + .byte 0xc4,0xe2,0x7d,0x18,0x4a,0x04 // vbroadcastss 0x4(%rdx),%ymm1 + .byte 0xc5,0xfc,0x58,0xc1 // vaddps %ymm1,%ymm0,%ymm0 + .byte 0xc5,0xfc,0x58,0x42,0x14 // vaddps 0x14(%rdx),%ymm0,%ymm0 + .byte 0xc5,0xf9,0x6e,0x10 // vmovd (%rax),%xmm2 + .byte 0xc4,0xe3,0x79,0x04,0xd2,0x00 // vpermilps $0x0,%xmm2,%xmm2 + .byte 0xc4,0xe3,0x6d,0x18,0xd2,0x01 // vinsertf128 $0x1,%xmm2,%ymm2,%ymm2 + .byte 0xc5,0xfc,0x5b,0xd2 // vcvtdq2ps %ymm2,%ymm2 + .byte 0xc5,0xec,0x58,0xc9 // vaddps %ymm1,%ymm2,%ymm1 + .byte 0xc4,0xe2,0x7d,0x18,0x12 // vbroadcastss (%rdx),%ymm2 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0xe4,0x57,0xdb // vxorps %ymm3,%ymm3,%ymm3 + .byte 0xc5,0xdc,0x57,0xe4 // vxorps %ymm4,%ymm4,%ymm4 + .byte 0xc5,0xd4,0x57,0xed // vxorps %ymm5,%ymm5,%ymm5 + .byte 0xc5,0xcc,0x57,0xf6 // vxorps %ymm6,%ymm6,%ymm6 + .byte 0xc5,0xc4,0x57,0xff // vxorps %ymm7,%ymm7,%ymm7 + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_constant_color_avx +_sk_constant_color_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0xe2,0x7d,0x18,0x00 // vbroadcastss (%rax),%ymm0 + .byte 0xc4,0xe2,0x7d,0x18,0x48,0x04 // vbroadcastss 0x4(%rax),%ymm1 + .byte 0xc4,0xe2,0x7d,0x18,0x50,0x08 // vbroadcastss 0x8(%rax),%ymm2 + .byte 0xc4,0xe2,0x7d,0x18,0x58,0x0c // vbroadcastss 0xc(%rax),%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_clear_avx +_sk_clear_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0xfc,0x57,0xc0 // vxorps %ymm0,%ymm0,%ymm0 + .byte 0xc5,0xf4,0x57,0xc9 // vxorps %ymm1,%ymm1,%ymm1 + .byte 0xc5,0xec,0x57,0xd2 // vxorps %ymm2,%ymm2,%ymm2 + .byte 0xc5,0xe4,0x57,0xdb // vxorps %ymm3,%ymm3,%ymm3 + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_plus__avx +_sk_plus__avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0xfc,0x58,0xc4 // vaddps %ymm4,%ymm0,%ymm0 + .byte 0xc5,0xf4,0x58,0xcd // vaddps %ymm5,%ymm1,%ymm1 + .byte 0xc5,0xec,0x58,0xd6 // vaddps %ymm6,%ymm2,%ymm2 + .byte 0xc5,0xe4,0x58,0xdf // vaddps %ymm7,%ymm3,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_srcover_avx +_sk_srcover_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x62,0x7d,0x18,0x02 // vbroadcastss (%rdx),%ymm8 + .byte 0xc5,0x3c,0x5c,0xc3 // vsubps %ymm3,%ymm8,%ymm8 + .byte 0xc5,0x3c,0x59,0xcc // vmulps %ymm4,%ymm8,%ymm9 + .byte 0xc5,0xb4,0x58,0xc0 // vaddps %ymm0,%ymm9,%ymm0 + .byte 0xc5,0x3c,0x59,0xcd // vmulps %ymm5,%ymm8,%ymm9 + .byte 0xc5,0xb4,0x58,0xc9 // vaddps %ymm1,%ymm9,%ymm1 + .byte 0xc5,0x3c,0x59,0xce // vmulps %ymm6,%ymm8,%ymm9 + .byte 0xc5,0xb4,0x58,0xd2 // vaddps %ymm2,%ymm9,%ymm2 + .byte 0xc5,0x3c,0x59,0xc7 // vmulps %ymm7,%ymm8,%ymm8 + .byte 0xc5,0xbc,0x58,0xdb // vaddps %ymm3,%ymm8,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_dstover_avx +_sk_dstover_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x62,0x7d,0x18,0x02 // vbroadcastss (%rdx),%ymm8 + .byte 0xc5,0x3c,0x5c,0xc7 // vsubps %ymm7,%ymm8,%ymm8 + .byte 0xc5,0xbc,0x59,0xc0 // vmulps %ymm0,%ymm8,%ymm0 + .byte 0xc5,0xfc,0x58,0xc4 // vaddps %ymm4,%ymm0,%ymm0 + .byte 0xc5,0xbc,0x59,0xc9 // vmulps %ymm1,%ymm8,%ymm1 + .byte 0xc5,0xf4,0x58,0xcd // vaddps %ymm5,%ymm1,%ymm1 + .byte 0xc5,0xbc,0x59,0xd2 // vmulps %ymm2,%ymm8,%ymm2 + .byte 0xc5,0xec,0x58,0xd6 // vaddps %ymm6,%ymm2,%ymm2 + .byte 0xc5,0xbc,0x59,0xdb // vmulps %ymm3,%ymm8,%ymm3 + .byte 0xc5,0xe4,0x58,0xdf // vaddps %ymm7,%ymm3,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_clamp_0_avx +_sk_clamp_0_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x41,0x3c,0x57,0xc0 // vxorps %ymm8,%ymm8,%ymm8 + .byte 0xc4,0xc1,0x7c,0x5f,0xc0 // vmaxps %ymm8,%ymm0,%ymm0 + .byte 0xc4,0xc1,0x74,0x5f,0xc8 // vmaxps %ymm8,%ymm1,%ymm1 + .byte 0xc4,0xc1,0x6c,0x5f,0xd0 // vmaxps %ymm8,%ymm2,%ymm2 + .byte 0xc4,0xc1,0x64,0x5f,0xd8 // vmaxps %ymm8,%ymm3,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_clamp_1_avx +_sk_clamp_1_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x62,0x7d,0x18,0x02 // vbroadcastss (%rdx),%ymm8 + .byte 0xc4,0xc1,0x7c,0x5d,0xc0 // vminps %ymm8,%ymm0,%ymm0 + .byte 0xc4,0xc1,0x74,0x5d,0xc8 // vminps %ymm8,%ymm1,%ymm1 + .byte 0xc4,0xc1,0x6c,0x5d,0xd0 // vminps %ymm8,%ymm2,%ymm2 + .byte 0xc4,0xc1,0x64,0x5d,0xd8 // vminps %ymm8,%ymm3,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_clamp_a_avx +_sk_clamp_a_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x62,0x7d,0x18,0x02 // vbroadcastss (%rdx),%ymm8 + .byte 0xc4,0xc1,0x64,0x5d,0xd8 // vminps %ymm8,%ymm3,%ymm3 + .byte 0xc5,0xfc,0x5d,0xc3 // vminps %ymm3,%ymm0,%ymm0 + .byte 0xc5,0xf4,0x5d,0xcb // vminps %ymm3,%ymm1,%ymm1 + .byte 0xc5,0xec,0x5d,0xd3 // vminps %ymm3,%ymm2,%ymm2 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_set_rgb_avx +_sk_set_rgb_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0xe2,0x7d,0x18,0x00 // vbroadcastss (%rax),%ymm0 + .byte 0xc4,0xe2,0x7d,0x18,0x48,0x04 // vbroadcastss 0x4(%rax),%ymm1 + .byte 0xc4,0xe2,0x7d,0x18,0x50,0x08 // vbroadcastss 0x8(%rax),%ymm2 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_swap_rb_avx +_sk_swap_rb_avx: + .byte 0xc5,0x7c,0x28,0xc0 // vmovaps %ymm0,%ymm8 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0xfc,0x28,0xc2 // vmovaps %ymm2,%ymm0 + .byte 0xc5,0x7c,0x29,0xc2 // vmovaps %ymm8,%ymm2 + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_swap_avx +_sk_swap_avx: + .byte 0xc5,0x7c,0x28,0xc3 // vmovaps %ymm3,%ymm8 + .byte 0xc5,0x7c,0x28,0xca // vmovaps %ymm2,%ymm9 + .byte 0xc5,0x7c,0x28,0xd1 // vmovaps %ymm1,%ymm10 + .byte 0xc5,0x7c,0x28,0xd8 // vmovaps %ymm0,%ymm11 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0xfc,0x28,0xc4 // vmovaps %ymm4,%ymm0 + .byte 0xc5,0xfc,0x28,0xcd // vmovaps %ymm5,%ymm1 + .byte 0xc5,0xfc,0x28,0xd6 // vmovaps %ymm6,%ymm2 + .byte 0xc5,0xfc,0x28,0xdf // vmovaps %ymm7,%ymm3 + .byte 0xc5,0x7c,0x29,0xdc // vmovaps %ymm11,%ymm4 + .byte 0xc5,0x7c,0x29,0xd5 // vmovaps %ymm10,%ymm5 + .byte 0xc5,0x7c,0x29,0xce // vmovaps %ymm9,%ymm6 + .byte 0xc5,0x7c,0x29,0xc7 // vmovaps %ymm8,%ymm7 + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_move_src_dst_avx +_sk_move_src_dst_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0xfc,0x28,0xe0 // vmovaps %ymm0,%ymm4 + .byte 0xc5,0xfc,0x28,0xe9 // vmovaps %ymm1,%ymm5 + .byte 0xc5,0xfc,0x28,0xf2 // vmovaps %ymm2,%ymm6 + .byte 0xc5,0xfc,0x28,0xfb // vmovaps %ymm3,%ymm7 + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_move_dst_src_avx +_sk_move_dst_src_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0xfc,0x28,0xc4 // vmovaps %ymm4,%ymm0 + .byte 0xc5,0xfc,0x28,0xcd // vmovaps %ymm5,%ymm1 + .byte 0xc5,0xfc,0x28,0xd6 // vmovaps %ymm6,%ymm2 + .byte 0xc5,0xfc,0x28,0xdf // vmovaps %ymm7,%ymm3 + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_premul_avx +_sk_premul_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0xfc,0x59,0xc3 // vmulps %ymm3,%ymm0,%ymm0 + .byte 0xc5,0xf4,0x59,0xcb // vmulps %ymm3,%ymm1,%ymm1 + .byte 0xc5,0xec,0x59,0xd3 // vmulps %ymm3,%ymm2,%ymm2 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_unpremul_avx +_sk_unpremul_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x41,0x3c,0x57,0xc0 // vxorps %ymm8,%ymm8,%ymm8 + .byte 0xc4,0x41,0x64,0xc2,0xc8,0x00 // vcmpeqps %ymm8,%ymm3,%ymm9 + .byte 0xc4,0x62,0x7d,0x18,0x12 // vbroadcastss (%rdx),%ymm10 + .byte 0xc5,0x2c,0x5e,0xd3 // vdivps %ymm3,%ymm10,%ymm10 + .byte 0xc4,0x43,0x2d,0x4a,0xc0,0x90 // vblendvps %ymm9,%ymm8,%ymm10,%ymm8 + .byte 0xc5,0xbc,0x59,0xc0 // vmulps %ymm0,%ymm8,%ymm0 + .byte 0xc5,0xbc,0x59,0xc9 // vmulps %ymm1,%ymm8,%ymm1 + .byte 0xc5,0xbc,0x59,0xd2 // vmulps %ymm2,%ymm8,%ymm2 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_from_srgb_avx +_sk_from_srgb_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x62,0x7d,0x18,0x42,0x40 // vbroadcastss 0x40(%rdx),%ymm8 + .byte 0xc5,0x3c,0x59,0xc8 // vmulps %ymm0,%ymm8,%ymm9 + .byte 0xc5,0x7c,0x59,0xd0 // vmulps %ymm0,%ymm0,%ymm10 + .byte 0xc4,0x62,0x7d,0x18,0x5a,0x3c // vbroadcastss 0x3c(%rdx),%ymm11 + .byte 0xc4,0x62,0x7d,0x18,0x62,0x38 // vbroadcastss 0x38(%rdx),%ymm12 + .byte 0xc5,0x24,0x59,0xe8 // vmulps %ymm0,%ymm11,%ymm13 + .byte 0xc4,0x41,0x14,0x58,0xec // vaddps %ymm12,%ymm13,%ymm13 + .byte 0xc4,0x62,0x7d,0x18,0x72,0x34 // vbroadcastss 0x34(%rdx),%ymm14 + .byte 0xc4,0x41,0x2c,0x59,0xd5 // vmulps %ymm13,%ymm10,%ymm10 + .byte 0xc4,0x41,0x0c,0x58,0xd2 // vaddps %ymm10,%ymm14,%ymm10 + .byte 0xc4,0x62,0x7d,0x18,0x6a,0x44 // vbroadcastss 0x44(%rdx),%ymm13 + .byte 0xc4,0xc1,0x7c,0xc2,0xc5,0x01 // vcmpltps %ymm13,%ymm0,%ymm0 + .byte 0xc4,0xc3,0x2d,0x4a,0xc1,0x00 // vblendvps %ymm0,%ymm9,%ymm10,%ymm0 + .byte 0xc5,0x3c,0x59,0xc9 // vmulps %ymm1,%ymm8,%ymm9 + .byte 0xc5,0x74,0x59,0xd1 // vmulps %ymm1,%ymm1,%ymm10 + .byte 0xc5,0x24,0x59,0xf9 // vmulps %ymm1,%ymm11,%ymm15 + .byte 0xc4,0x41,0x04,0x58,0xfc // vaddps %ymm12,%ymm15,%ymm15 + .byte 0xc4,0x41,0x2c,0x59,0xd7 // vmulps %ymm15,%ymm10,%ymm10 + .byte 0xc4,0x41,0x0c,0x58,0xd2 // vaddps %ymm10,%ymm14,%ymm10 + .byte 0xc4,0xc1,0x74,0xc2,0xcd,0x01 // vcmpltps %ymm13,%ymm1,%ymm1 + .byte 0xc4,0xc3,0x2d,0x4a,0xc9,0x10 // vblendvps %ymm1,%ymm9,%ymm10,%ymm1 + .byte 0xc5,0x3c,0x59,0xc2 // vmulps %ymm2,%ymm8,%ymm8 + .byte 0xc5,0x6c,0x59,0xca // vmulps %ymm2,%ymm2,%ymm9 + .byte 0xc5,0x24,0x59,0xd2 // vmulps %ymm2,%ymm11,%ymm10 + .byte 0xc4,0x41,0x2c,0x58,0xd4 // vaddps %ymm12,%ymm10,%ymm10 + .byte 0xc4,0x41,0x34,0x59,0xca // vmulps %ymm10,%ymm9,%ymm9 + .byte 0xc4,0x41,0x0c,0x58,0xc9 // vaddps %ymm9,%ymm14,%ymm9 + .byte 0xc4,0xc1,0x6c,0xc2,0xd5,0x01 // vcmpltps %ymm13,%ymm2,%ymm2 + .byte 0xc4,0xc3,0x35,0x4a,0xd0,0x20 // vblendvps %ymm2,%ymm8,%ymm9,%ymm2 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_to_srgb_avx +_sk_to_srgb_avx: + .byte 0xc5,0x7c,0x52,0xc0 // vrsqrtps %ymm0,%ymm8 + .byte 0xc4,0x41,0x7c,0x53,0xc8 // vrcpps %ymm8,%ymm9 + .byte 0xc4,0x41,0x7c,0x52,0xd0 // vrsqrtps %ymm8,%ymm10 + .byte 0xc4,0x62,0x7d,0x18,0x42,0x48 // vbroadcastss 0x48(%rdx),%ymm8 + .byte 0xc5,0x3c,0x59,0xd8 // vmulps %ymm0,%ymm8,%ymm11 + .byte 0xc4,0x62,0x7d,0x18,0x22 // vbroadcastss (%rdx),%ymm12 + .byte 0xc4,0x62,0x7d,0x18,0x6a,0x4c // vbroadcastss 0x4c(%rdx),%ymm13 + .byte 0xc4,0x62,0x7d,0x18,0x72,0x50 // vbroadcastss 0x50(%rdx),%ymm14 + .byte 0xc4,0x62,0x7d,0x18,0x7a,0x54 // vbroadcastss 0x54(%rdx),%ymm15 + .byte 0xc4,0x41,0x34,0x59,0xce // vmulps %ymm14,%ymm9,%ymm9 + .byte 0xc4,0x41,0x34,0x58,0xcf // vaddps %ymm15,%ymm9,%ymm9 + .byte 0xc4,0x41,0x2c,0x59,0xd5 // vmulps %ymm13,%ymm10,%ymm10 + .byte 0xc4,0x41,0x2c,0x58,0xc9 // vaddps %ymm9,%ymm10,%ymm9 + .byte 0xc4,0x41,0x1c,0x5d,0xc9 // vminps %ymm9,%ymm12,%ymm9 + .byte 0xc4,0x62,0x7d,0x18,0x52,0x58 // vbroadcastss 0x58(%rdx),%ymm10 + .byte 0xc4,0xc1,0x7c,0xc2,0xc2,0x01 // vcmpltps %ymm10,%ymm0,%ymm0 + .byte 0xc4,0xc3,0x35,0x4a,0xc3,0x00 // vblendvps %ymm0,%ymm11,%ymm9,%ymm0 + .byte 0xc5,0x7c,0x52,0xc9 // vrsqrtps %ymm1,%ymm9 + .byte 0xc4,0x41,0x7c,0x53,0xd9 // vrcpps %ymm9,%ymm11 + .byte 0xc4,0x41,0x7c,0x52,0xc9 // vrsqrtps %ymm9,%ymm9 + .byte 0xc4,0x41,0x0c,0x59,0xdb // vmulps %ymm11,%ymm14,%ymm11 + .byte 0xc4,0x41,0x04,0x58,0xdb // vaddps %ymm11,%ymm15,%ymm11 + .byte 0xc4,0x41,0x14,0x59,0xc9 // vmulps %ymm9,%ymm13,%ymm9 + .byte 0xc4,0x41,0x34,0x58,0xcb // vaddps %ymm11,%ymm9,%ymm9 + .byte 0xc5,0x3c,0x59,0xd9 // vmulps %ymm1,%ymm8,%ymm11 + .byte 0xc4,0x41,0x1c,0x5d,0xc9 // vminps %ymm9,%ymm12,%ymm9 + .byte 0xc4,0xc1,0x74,0xc2,0xca,0x01 // vcmpltps %ymm10,%ymm1,%ymm1 + .byte 0xc4,0xc3,0x35,0x4a,0xcb,0x10 // vblendvps %ymm1,%ymm11,%ymm9,%ymm1 + .byte 0xc5,0x7c,0x52,0xca // vrsqrtps %ymm2,%ymm9 + .byte 0xc4,0x41,0x7c,0x53,0xd9 // vrcpps %ymm9,%ymm11 + .byte 0xc4,0x41,0x0c,0x59,0xdb // vmulps %ymm11,%ymm14,%ymm11 + .byte 0xc4,0x41,0x04,0x58,0xdb // vaddps %ymm11,%ymm15,%ymm11 + .byte 0xc4,0x41,0x7c,0x52,0xc9 // vrsqrtps %ymm9,%ymm9 + .byte 0xc4,0x41,0x14,0x59,0xc9 // vmulps %ymm9,%ymm13,%ymm9 + .byte 0xc4,0x41,0x34,0x58,0xcb // vaddps %ymm11,%ymm9,%ymm9 + .byte 0xc4,0x41,0x1c,0x5d,0xc9 // vminps %ymm9,%ymm12,%ymm9 + .byte 0xc5,0x3c,0x59,0xc2 // vmulps %ymm2,%ymm8,%ymm8 + .byte 0xc4,0xc1,0x6c,0xc2,0xd2,0x01 // vcmpltps %ymm10,%ymm2,%ymm2 + .byte 0xc4,0xc3,0x35,0x4a,0xd0,0x20 // vblendvps %ymm2,%ymm8,%ymm9,%ymm2 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_scale_u8_avx +_sk_scale_u8_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0x8b,0x00 // mov (%rax),%rax + .byte 0xc4,0x62,0x79,0x31,0x44,0x38,0x04 // vpmovzxbd 0x4(%rax,%rdi,1),%xmm8 + .byte 0xc4,0x62,0x79,0x31,0x0c,0x38 // vpmovzxbd (%rax,%rdi,1),%xmm9 + .byte 0xc4,0x43,0x35,0x18,0xc0,0x01 // vinsertf128 $0x1,%xmm8,%ymm9,%ymm8 + .byte 0xc4,0x41,0x7c,0x5b,0xc0 // vcvtdq2ps %ymm8,%ymm8 + .byte 0xc4,0x62,0x7d,0x18,0x4a,0x0c // vbroadcastss 0xc(%rdx),%ymm9 + .byte 0xc4,0x41,0x3c,0x59,0xc1 // vmulps %ymm9,%ymm8,%ymm8 + .byte 0xc5,0xbc,0x59,0xc0 // vmulps %ymm0,%ymm8,%ymm0 + .byte 0xc5,0xbc,0x59,0xc9 // vmulps %ymm1,%ymm8,%ymm1 + .byte 0xc5,0xbc,0x59,0xd2 // vmulps %ymm2,%ymm8,%ymm2 + .byte 0xc5,0xbc,0x59,0xdb // vmulps %ymm3,%ymm8,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_lerp_u8_avx +_sk_lerp_u8_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0x8b,0x00 // mov (%rax),%rax + .byte 0xc4,0x62,0x79,0x31,0x44,0x38,0x04 // vpmovzxbd 0x4(%rax,%rdi,1),%xmm8 + .byte 0xc4,0x62,0x79,0x31,0x0c,0x38 // vpmovzxbd (%rax,%rdi,1),%xmm9 + .byte 0xc4,0x43,0x35,0x18,0xc0,0x01 // vinsertf128 $0x1,%xmm8,%ymm9,%ymm8 + .byte 0xc4,0x41,0x7c,0x5b,0xc0 // vcvtdq2ps %ymm8,%ymm8 + .byte 0xc4,0x62,0x7d,0x18,0x4a,0x0c // vbroadcastss 0xc(%rdx),%ymm9 + .byte 0xc4,0x41,0x3c,0x59,0xc1 // vmulps %ymm9,%ymm8,%ymm8 + .byte 0xc5,0xfc,0x5c,0xc4 // vsubps %ymm4,%ymm0,%ymm0 + .byte 0xc4,0xc1,0x7c,0x59,0xc0 // vmulps %ymm8,%ymm0,%ymm0 + .byte 0xc5,0xfc,0x58,0xc4 // vaddps %ymm4,%ymm0,%ymm0 + .byte 0xc5,0xf4,0x5c,0xcd // vsubps %ymm5,%ymm1,%ymm1 + .byte 0xc4,0xc1,0x74,0x59,0xc8 // vmulps %ymm8,%ymm1,%ymm1 + .byte 0xc5,0xf4,0x58,0xcd // vaddps %ymm5,%ymm1,%ymm1 + .byte 0xc5,0xec,0x5c,0xd6 // vsubps %ymm6,%ymm2,%ymm2 + .byte 0xc4,0xc1,0x6c,0x59,0xd0 // vmulps %ymm8,%ymm2,%ymm2 + .byte 0xc5,0xec,0x58,0xd6 // vaddps %ymm6,%ymm2,%ymm2 + .byte 0xc5,0xe4,0x5c,0xdf // vsubps %ymm7,%ymm3,%ymm3 + .byte 0xc4,0xc1,0x64,0x59,0xd8 // vmulps %ymm8,%ymm3,%ymm3 + .byte 0xc5,0xe4,0x58,0xdf // vaddps %ymm7,%ymm3,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_load_tables_avx +_sk_load_tables_avx: + .byte 0x41,0x57 // push %r15 + .byte 0x41,0x56 // push %r14 + .byte 0x41,0x54 // push %r12 + .byte 0x53 // push %rbx + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x4c,0x8b,0x00 // mov (%rax),%r8 + .byte 0x48,0x8b,0x48,0x08 // mov 0x8(%rax),%rcx + .byte 0xc4,0x41,0x7c,0x10,0x14,0xb8 // vmovups (%r8,%rdi,4),%ymm10 + .byte 0xc5,0xf9,0x6e,0x42,0x10 // vmovd 0x10(%rdx),%xmm0 + .byte 0xc4,0xe3,0x79,0x04,0xc0,0x00 // vpermilps $0x0,%xmm0,%xmm0 + .byte 0xc4,0x63,0x7d,0x18,0xc8,0x01 // vinsertf128 $0x1,%xmm0,%ymm0,%ymm9 + .byte 0xc4,0xc1,0x34,0x54,0xc2 // vandps %ymm10,%ymm9,%ymm0 + .byte 0xc4,0xc1,0xf9,0x7e,0xc0 // vmovq %xmm0,%r8 + .byte 0x45,0x89,0xc1 // mov %r8d,%r9d + .byte 0xc4,0xc3,0xf9,0x16,0xc2,0x01 // vpextrq $0x1,%xmm0,%r10 + .byte 0x45,0x89,0xd3 // mov %r10d,%r11d + .byte 0x49,0xc1,0xea,0x20 // shr $0x20,%r10 + .byte 0x49,0xc1,0xe8,0x20 // shr $0x20,%r8 + .byte 0xc4,0xe3,0x7d,0x19,0xc0,0x01 // vextractf128 $0x1,%ymm0,%xmm0 + .byte 0xc4,0xc1,0xf9,0x7e,0xc7 // vmovq %xmm0,%r15 + .byte 0x45,0x89,0xfe // mov %r15d,%r14d + .byte 0xc4,0xe3,0xf9,0x16,0xc3,0x01 // vpextrq $0x1,%xmm0,%rbx + .byte 0x41,0x89,0xdc // mov %ebx,%r12d + .byte 0x48,0xc1,0xeb,0x20 // shr $0x20,%rbx + .byte 0x49,0xc1,0xef,0x20 // shr $0x20,%r15 + .byte 0xc4,0xa1,0x7a,0x10,0x04,0xb1 // vmovss (%rcx,%r14,4),%xmm0 + .byte 0xc4,0xa3,0x79,0x21,0x04,0xb9,0x10 // vinsertps $0x10,(%rcx,%r15,4),%xmm0,%xmm0 + .byte 0xc4,0xa3,0x79,0x21,0x04,0xa1,0x20 // vinsertps $0x20,(%rcx,%r12,4),%xmm0,%xmm0 + .byte 0xc4,0xe3,0x79,0x21,0x04,0x99,0x30 // vinsertps $0x30,(%rcx,%rbx,4),%xmm0,%xmm0 + .byte 0xc4,0xa1,0x7a,0x10,0x0c,0x89 // vmovss (%rcx,%r9,4),%xmm1 + .byte 0xc4,0xa3,0x71,0x21,0x0c,0x81,0x10 // vinsertps $0x10,(%rcx,%r8,4),%xmm1,%xmm1 + .byte 0xc4,0xa3,0x71,0x21,0x0c,0x99,0x20 // vinsertps $0x20,(%rcx,%r11,4),%xmm1,%xmm1 + .byte 0xc4,0xa3,0x71,0x21,0x0c,0x91,0x30 // vinsertps $0x30,(%rcx,%r10,4),%xmm1,%xmm1 + .byte 0xc4,0xe3,0x75,0x18,0xc0,0x01 // vinsertf128 $0x1,%xmm0,%ymm1,%ymm0 + .byte 0x4c,0x8b,0x78,0x10 // mov 0x10(%rax),%r15 + .byte 0xc4,0xc1,0x71,0x72,0xd2,0x08 // vpsrld $0x8,%xmm10,%xmm1 + .byte 0xc4,0x43,0x7d,0x19,0xd0,0x01 // vextractf128 $0x1,%ymm10,%xmm8 + .byte 0xc4,0xc1,0x69,0x72,0xd0,0x08 // vpsrld $0x8,%xmm8,%xmm2 + .byte 0xc4,0xe3,0x75,0x18,0xca,0x01 // vinsertf128 $0x1,%xmm2,%ymm1,%ymm1 + .byte 0xc5,0xb4,0x54,0xc9 // vandps %ymm1,%ymm9,%ymm1 + .byte 0xc4,0xc1,0xf9,0x7e,0xc8 // vmovq %xmm1,%r8 + .byte 0x45,0x89,0xc2 // mov %r8d,%r10d + .byte 0xc4,0xc3,0xf9,0x16,0xc9,0x01 // vpextrq $0x1,%xmm1,%r9 + .byte 0x45,0x89,0xcb // mov %r9d,%r11d + .byte 0x49,0xc1,0xe9,0x20 // shr $0x20,%r9 + .byte 0x49,0xc1,0xe8,0x20 // shr $0x20,%r8 + .byte 0xc4,0xe3,0x7d,0x19,0xc9,0x01 // vextractf128 $0x1,%ymm1,%xmm1 + .byte 0xc4,0xe1,0xf9,0x7e,0xcb // vmovq %xmm1,%rbx + .byte 0x41,0x89,0xde // mov %ebx,%r14d + .byte 0xc4,0xe3,0xf9,0x16,0xc9,0x01 // vpextrq $0x1,%xmm1,%rcx + .byte 0x41,0x89,0xcc // mov %ecx,%r12d + .byte 0x48,0xc1,0xe9,0x20 // shr $0x20,%rcx + .byte 0x48,0xc1,0xeb,0x20 // shr $0x20,%rbx + .byte 0xc4,0x81,0x7a,0x10,0x0c,0xb7 // vmovss (%r15,%r14,4),%xmm1 + .byte 0xc4,0xc3,0x71,0x21,0x0c,0x9f,0x10 // vinsertps $0x10,(%r15,%rbx,4),%xmm1,%xmm1 + .byte 0xc4,0x81,0x7a,0x10,0x14,0xa7 // vmovss (%r15,%r12,4),%xmm2 + .byte 0xc4,0xe3,0x71,0x21,0xca,0x20 // vinsertps $0x20,%xmm2,%xmm1,%xmm1 + .byte 0xc4,0xc1,0x7a,0x10,0x14,0x8f // vmovss (%r15,%rcx,4),%xmm2 + .byte 0xc4,0xe3,0x71,0x21,0xca,0x30 // vinsertps $0x30,%xmm2,%xmm1,%xmm1 + .byte 0xc4,0x81,0x7a,0x10,0x14,0x97 // vmovss (%r15,%r10,4),%xmm2 + .byte 0xc4,0x83,0x69,0x21,0x14,0x87,0x10 // vinsertps $0x10,(%r15,%r8,4),%xmm2,%xmm2 + .byte 0xc4,0x81,0x7a,0x10,0x1c,0x9f // vmovss (%r15,%r11,4),%xmm3 + .byte 0xc4,0xe3,0x69,0x21,0xd3,0x20 // vinsertps $0x20,%xmm3,%xmm2,%xmm2 + .byte 0xc4,0x81,0x7a,0x10,0x1c,0x8f // vmovss (%r15,%r9,4),%xmm3 + .byte 0xc4,0xe3,0x69,0x21,0xd3,0x30 // vinsertps $0x30,%xmm3,%xmm2,%xmm2 + .byte 0xc4,0xe3,0x6d,0x18,0xc9,0x01 // vinsertf128 $0x1,%xmm1,%ymm2,%ymm1 + .byte 0x48,0x8b,0x40,0x18 // mov 0x18(%rax),%rax + .byte 0xc4,0xc1,0x69,0x72,0xd2,0x10 // vpsrld $0x10,%xmm10,%xmm2 + .byte 0xc4,0xc1,0x61,0x72,0xd0,0x10 // vpsrld $0x10,%xmm8,%xmm3 + .byte 0xc4,0xe3,0x6d,0x18,0xd3,0x01 // vinsertf128 $0x1,%xmm3,%ymm2,%ymm2 + .byte 0xc5,0xb4,0x54,0xd2 // vandps %ymm2,%ymm9,%ymm2 + .byte 0xc4,0xc1,0xf9,0x7e,0xd0 // vmovq %xmm2,%r8 + .byte 0x45,0x89,0xc1 // mov %r8d,%r9d + .byte 0xc4,0xc3,0xf9,0x16,0xd6,0x01 // vpextrq $0x1,%xmm2,%r14 + .byte 0x45,0x89,0xf2 // mov %r14d,%r10d + .byte 0x49,0xc1,0xee,0x20 // shr $0x20,%r14 + .byte 0x49,0xc1,0xe8,0x20 // shr $0x20,%r8 + .byte 0xc4,0xe3,0x7d,0x19,0xd2,0x01 // vextractf128 $0x1,%ymm2,%xmm2 + .byte 0xc4,0xe1,0xf9,0x7e,0xd3 // vmovq %xmm2,%rbx + .byte 0x41,0x89,0xdb // mov %ebx,%r11d + .byte 0xc4,0xe3,0xf9,0x16,0xd1,0x01 // vpextrq $0x1,%xmm2,%rcx + .byte 0x41,0x89,0xcf // mov %ecx,%r15d + .byte 0x48,0xc1,0xe9,0x20 // shr $0x20,%rcx + .byte 0x48,0xc1,0xeb,0x20 // shr $0x20,%rbx + .byte 0xc4,0xa1,0x7a,0x10,0x14,0x98 // vmovss (%rax,%r11,4),%xmm2 + .byte 0xc4,0xe3,0x69,0x21,0x14,0x98,0x10 // vinsertps $0x10,(%rax,%rbx,4),%xmm2,%xmm2 + .byte 0xc4,0xa1,0x7a,0x10,0x1c,0xb8 // vmovss (%rax,%r15,4),%xmm3 + .byte 0xc4,0xe3,0x69,0x21,0xd3,0x20 // vinsertps $0x20,%xmm3,%xmm2,%xmm2 + .byte 0xc5,0xfa,0x10,0x1c,0x88 // vmovss (%rax,%rcx,4),%xmm3 + .byte 0xc4,0x63,0x69,0x21,0xcb,0x30 // vinsertps $0x30,%xmm3,%xmm2,%xmm9 + .byte 0xc4,0xa1,0x7a,0x10,0x1c,0x88 // vmovss (%rax,%r9,4),%xmm3 + .byte 0xc4,0xa3,0x61,0x21,0x1c,0x80,0x10 // vinsertps $0x10,(%rax,%r8,4),%xmm3,%xmm3 + .byte 0xc4,0xa1,0x7a,0x10,0x14,0x90 // vmovss (%rax,%r10,4),%xmm2 + .byte 0xc4,0xe3,0x61,0x21,0xd2,0x20 // vinsertps $0x20,%xmm2,%xmm3,%xmm2 + .byte 0xc4,0xa1,0x7a,0x10,0x1c,0xb0 // vmovss (%rax,%r14,4),%xmm3 + .byte 0xc4,0xe3,0x69,0x21,0xd3,0x30 // vinsertps $0x30,%xmm3,%xmm2,%xmm2 + .byte 0xc4,0xc3,0x6d,0x18,0xd1,0x01 // vinsertf128 $0x1,%xmm9,%ymm2,%ymm2 + .byte 0xc4,0xc1,0x31,0x72,0xd2,0x18 // vpsrld $0x18,%xmm10,%xmm9 + .byte 0xc4,0xc1,0x61,0x72,0xd0,0x18 // vpsrld $0x18,%xmm8,%xmm3 + .byte 0xc4,0xe3,0x35,0x18,0xdb,0x01 // vinsertf128 $0x1,%xmm3,%ymm9,%ymm3 + .byte 0xc5,0xfc,0x5b,0xdb // vcvtdq2ps %ymm3,%ymm3 + .byte 0xc4,0x62,0x7d,0x18,0x42,0x0c // vbroadcastss 0xc(%rdx),%ymm8 + .byte 0xc4,0xc1,0x64,0x59,0xd8 // vmulps %ymm8,%ymm3,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x5b // pop %rbx + .byte 0x41,0x5c // pop %r12 + .byte 0x41,0x5e // pop %r14 + .byte 0x41,0x5f // pop %r15 + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_load_8888_avx +_sk_load_8888_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0x8b,0x00 // mov (%rax),%rax + .byte 0xc5,0xfc,0x10,0x1c,0xb8 // vmovups (%rax,%rdi,4),%ymm3 + .byte 0xc5,0xf9,0x6e,0x42,0x10 // vmovd 0x10(%rdx),%xmm0 + .byte 0xc4,0xe3,0x79,0x04,0xc0,0x00 // vpermilps $0x0,%xmm0,%xmm0 + .byte 0xc4,0x63,0x7d,0x18,0xd8,0x01 // vinsertf128 $0x1,%xmm0,%ymm0,%ymm11 + .byte 0xc5,0xa4,0x54,0xc3 // vandps %ymm3,%ymm11,%ymm0 + .byte 0xc5,0xfc,0x5b,0xc0 // vcvtdq2ps %ymm0,%ymm0 + .byte 0xc4,0x62,0x7d,0x18,0x42,0x0c // vbroadcastss 0xc(%rdx),%ymm8 + .byte 0xc5,0xbc,0x59,0xc0 // vmulps %ymm0,%ymm8,%ymm0 + .byte 0xc5,0xa9,0x72,0xd3,0x08 // vpsrld $0x8,%xmm3,%xmm10 + .byte 0xc4,0xc3,0x7d,0x19,0xd9,0x01 // vextractf128 $0x1,%ymm3,%xmm9 + .byte 0xc4,0xc1,0x71,0x72,0xd1,0x08 // vpsrld $0x8,%xmm9,%xmm1 + .byte 0xc4,0xe3,0x2d,0x18,0xc9,0x01 // vinsertf128 $0x1,%xmm1,%ymm10,%ymm1 + .byte 0xc5,0xa4,0x54,0xc9 // vandps %ymm1,%ymm11,%ymm1 + .byte 0xc5,0xfc,0x5b,0xc9 // vcvtdq2ps %ymm1,%ymm1 + .byte 0xc5,0xbc,0x59,0xc9 // vmulps %ymm1,%ymm8,%ymm1 + .byte 0xc5,0xa9,0x72,0xd3,0x10 // vpsrld $0x10,%xmm3,%xmm10 + .byte 0xc4,0xc1,0x69,0x72,0xd1,0x10 // vpsrld $0x10,%xmm9,%xmm2 + .byte 0xc4,0xe3,0x2d,0x18,0xd2,0x01 // vinsertf128 $0x1,%xmm2,%ymm10,%ymm2 + .byte 0xc5,0xa4,0x54,0xd2 // vandps %ymm2,%ymm11,%ymm2 + .byte 0xc5,0xfc,0x5b,0xd2 // vcvtdq2ps %ymm2,%ymm2 + .byte 0xc5,0xbc,0x59,0xd2 // vmulps %ymm2,%ymm8,%ymm2 + .byte 0xc5,0xa9,0x72,0xd3,0x18 // vpsrld $0x18,%xmm3,%xmm10 + .byte 0xc4,0xc1,0x61,0x72,0xd1,0x18 // vpsrld $0x18,%xmm9,%xmm3 + .byte 0xc4,0xe3,0x2d,0x18,0xdb,0x01 // vinsertf128 $0x1,%xmm3,%ymm10,%ymm3 + .byte 0xc5,0xfc,0x5b,0xdb // vcvtdq2ps %ymm3,%ymm3 + .byte 0xc4,0xc1,0x64,0x59,0xd8 // vmulps %ymm8,%ymm3,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_store_8888_avx +_sk_store_8888_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0x8b,0x00 // mov (%rax),%rax + .byte 0xc4,0x62,0x7d,0x18,0x42,0x08 // vbroadcastss 0x8(%rdx),%ymm8 + .byte 0xc5,0x3c,0x59,0xc8 // vmulps %ymm0,%ymm8,%ymm9 + .byte 0xc4,0x41,0x7d,0x5b,0xc9 // vcvtps2dq %ymm9,%ymm9 + .byte 0xc5,0x3c,0x59,0xd1 // vmulps %ymm1,%ymm8,%ymm10 + .byte 0xc4,0x41,0x7d,0x5b,0xd2 // vcvtps2dq %ymm10,%ymm10 + .byte 0xc4,0xc1,0x21,0x72,0xf2,0x08 // vpslld $0x8,%xmm10,%xmm11 + .byte 0xc4,0x43,0x7d,0x19,0xd2,0x01 // vextractf128 $0x1,%ymm10,%xmm10 + .byte 0xc4,0xc1,0x29,0x72,0xf2,0x08 // vpslld $0x8,%xmm10,%xmm10 + .byte 0xc4,0x43,0x25,0x18,0xd2,0x01 // vinsertf128 $0x1,%xmm10,%ymm11,%ymm10 + .byte 0xc4,0x41,0x2d,0x56,0xc9 // vorpd %ymm9,%ymm10,%ymm9 + .byte 0xc5,0x3c,0x59,0xd2 // vmulps %ymm2,%ymm8,%ymm10 + .byte 0xc4,0x41,0x7d,0x5b,0xd2 // vcvtps2dq %ymm10,%ymm10 + .byte 0xc4,0xc1,0x21,0x72,0xf2,0x10 // vpslld $0x10,%xmm10,%xmm11 + .byte 0xc4,0x43,0x7d,0x19,0xd2,0x01 // vextractf128 $0x1,%ymm10,%xmm10 + .byte 0xc4,0xc1,0x29,0x72,0xf2,0x10 // vpslld $0x10,%xmm10,%xmm10 + .byte 0xc4,0x43,0x25,0x18,0xd2,0x01 // vinsertf128 $0x1,%xmm10,%ymm11,%ymm10 + .byte 0xc4,0x41,0x35,0x56,0xca // vorpd %ymm10,%ymm9,%ymm9 + .byte 0xc5,0x3c,0x59,0xc3 // vmulps %ymm3,%ymm8,%ymm8 + .byte 0xc4,0x41,0x7d,0x5b,0xc0 // vcvtps2dq %ymm8,%ymm8 + .byte 0xc4,0xc1,0x29,0x72,0xf0,0x18 // vpslld $0x18,%xmm8,%xmm10 + .byte 0xc4,0x43,0x7d,0x19,0xc0,0x01 // vextractf128 $0x1,%ymm8,%xmm8 + .byte 0xc4,0xc1,0x39,0x72,0xf0,0x18 // vpslld $0x18,%xmm8,%xmm8 + .byte 0xc4,0x43,0x2d,0x18,0xc0,0x01 // vinsertf128 $0x1,%xmm8,%ymm10,%ymm8 + .byte 0xc4,0x41,0x35,0x56,0xc0 // vorpd %ymm8,%ymm9,%ymm8 + .byte 0xc5,0x7d,0x11,0x04,0xb8 // vmovupd %ymm8,(%rax,%rdi,4) + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_load_f16_avx +_sk_load_f16_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_store_f16_avx +_sk_store_f16_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_clamp_x_avx +_sk_clamp_x_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x62,0x7d,0x18,0x00 // vbroadcastss (%rax),%ymm8 + .byte 0xc4,0x43,0x7d,0x19,0xc1,0x01 // vextractf128 $0x1,%ymm8,%xmm9 + .byte 0xc4,0x41,0x29,0x76,0xd2 // vpcmpeqd %xmm10,%xmm10,%xmm10 + .byte 0xc4,0x41,0x31,0xfe,0xca // vpaddd %xmm10,%xmm9,%xmm9 + .byte 0xc4,0x41,0x39,0xfe,0xc2 // vpaddd %xmm10,%xmm8,%xmm8 + .byte 0xc4,0x43,0x3d,0x18,0xc1,0x01 // vinsertf128 $0x1,%xmm9,%ymm8,%ymm8 + .byte 0xc4,0xc1,0x7c,0x5d,0xc0 // vminps %ymm8,%ymm0,%ymm0 + .byte 0xc4,0x41,0x3c,0x57,0xc0 // vxorps %ymm8,%ymm8,%ymm8 + .byte 0xc5,0xbc,0x5f,0xc0 // vmaxps %ymm0,%ymm8,%ymm0 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_clamp_y_avx +_sk_clamp_y_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x62,0x7d,0x18,0x00 // vbroadcastss (%rax),%ymm8 + .byte 0xc4,0x43,0x7d,0x19,0xc1,0x01 // vextractf128 $0x1,%ymm8,%xmm9 + .byte 0xc4,0x41,0x29,0x76,0xd2 // vpcmpeqd %xmm10,%xmm10,%xmm10 + .byte 0xc4,0x41,0x31,0xfe,0xca // vpaddd %xmm10,%xmm9,%xmm9 + .byte 0xc4,0x41,0x39,0xfe,0xc2 // vpaddd %xmm10,%xmm8,%xmm8 + .byte 0xc4,0x43,0x3d,0x18,0xc1,0x01 // vinsertf128 $0x1,%xmm9,%ymm8,%ymm8 + .byte 0xc4,0xc1,0x74,0x5d,0xc8 // vminps %ymm8,%ymm1,%ymm1 + .byte 0xc4,0x41,0x3c,0x57,0xc0 // vxorps %ymm8,%ymm8,%ymm8 + .byte 0xc5,0xbc,0x5f,0xc9 // vmaxps %ymm1,%ymm8,%ymm1 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_matrix_2x3_avx +_sk_matrix_2x3_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x62,0x7d,0x18,0x00 // vbroadcastss (%rax),%ymm8 + .byte 0xc4,0x62,0x7d,0x18,0x48,0x08 // vbroadcastss 0x8(%rax),%ymm9 + .byte 0xc4,0x62,0x7d,0x18,0x50,0x10 // vbroadcastss 0x10(%rax),%ymm10 + .byte 0xc5,0x34,0x59,0xc9 // vmulps %ymm1,%ymm9,%ymm9 + .byte 0xc4,0x41,0x34,0x58,0xca // vaddps %ymm10,%ymm9,%ymm9 + .byte 0xc5,0x3c,0x59,0xc0 // vmulps %ymm0,%ymm8,%ymm8 + .byte 0xc4,0x41,0x3c,0x58,0xc1 // vaddps %ymm9,%ymm8,%ymm8 + .byte 0xc4,0x62,0x7d,0x18,0x48,0x04 // vbroadcastss 0x4(%rax),%ymm9 + .byte 0xc4,0x62,0x7d,0x18,0x50,0x0c // vbroadcastss 0xc(%rax),%ymm10 + .byte 0xc4,0x62,0x7d,0x18,0x58,0x14 // vbroadcastss 0x14(%rax),%ymm11 + .byte 0xc5,0xac,0x59,0xc9 // vmulps %ymm1,%ymm10,%ymm1 + .byte 0xc4,0xc1,0x74,0x58,0xcb // vaddps %ymm11,%ymm1,%ymm1 + .byte 0xc5,0xb4,0x59,0xc0 // vmulps %ymm0,%ymm9,%ymm0 + .byte 0xc5,0xfc,0x58,0xc9 // vaddps %ymm1,%ymm0,%ymm1 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0x7c,0x29,0xc0 // vmovaps %ymm8,%ymm0 + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_matrix_3x4_avx +_sk_matrix_3x4_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0x62,0x7d,0x18,0x00 // vbroadcastss (%rax),%ymm8 + .byte 0xc4,0x62,0x7d,0x18,0x48,0x0c // vbroadcastss 0xc(%rax),%ymm9 + .byte 0xc4,0x62,0x7d,0x18,0x50,0x18 // vbroadcastss 0x18(%rax),%ymm10 + .byte 0xc4,0x62,0x7d,0x18,0x58,0x24 // vbroadcastss 0x24(%rax),%ymm11 + .byte 0xc5,0x2c,0x59,0xd2 // vmulps %ymm2,%ymm10,%ymm10 + .byte 0xc4,0x41,0x2c,0x58,0xd3 // vaddps %ymm11,%ymm10,%ymm10 + .byte 0xc5,0x34,0x59,0xc9 // vmulps %ymm1,%ymm9,%ymm9 + .byte 0xc4,0x41,0x34,0x58,0xca // vaddps %ymm10,%ymm9,%ymm9 + .byte 0xc5,0x3c,0x59,0xc0 // vmulps %ymm0,%ymm8,%ymm8 + .byte 0xc4,0x41,0x3c,0x58,0xc1 // vaddps %ymm9,%ymm8,%ymm8 + .byte 0xc4,0x62,0x7d,0x18,0x48,0x04 // vbroadcastss 0x4(%rax),%ymm9 + .byte 0xc4,0x62,0x7d,0x18,0x50,0x10 // vbroadcastss 0x10(%rax),%ymm10 + .byte 0xc4,0x62,0x7d,0x18,0x58,0x1c // vbroadcastss 0x1c(%rax),%ymm11 + .byte 0xc4,0x62,0x7d,0x18,0x60,0x28 // vbroadcastss 0x28(%rax),%ymm12 + .byte 0xc5,0x24,0x59,0xda // vmulps %ymm2,%ymm11,%ymm11 + .byte 0xc4,0x41,0x24,0x58,0xdc // vaddps %ymm12,%ymm11,%ymm11 + .byte 0xc5,0x2c,0x59,0xd1 // vmulps %ymm1,%ymm10,%ymm10 + .byte 0xc4,0x41,0x2c,0x58,0xd3 // vaddps %ymm11,%ymm10,%ymm10 + .byte 0xc5,0x34,0x59,0xc8 // vmulps %ymm0,%ymm9,%ymm9 + .byte 0xc4,0x41,0x34,0x58,0xca // vaddps %ymm10,%ymm9,%ymm9 + .byte 0xc4,0x62,0x7d,0x18,0x50,0x08 // vbroadcastss 0x8(%rax),%ymm10 + .byte 0xc4,0x62,0x7d,0x18,0x58,0x14 // vbroadcastss 0x14(%rax),%ymm11 + .byte 0xc4,0x62,0x7d,0x18,0x60,0x20 // vbroadcastss 0x20(%rax),%ymm12 + .byte 0xc4,0x62,0x7d,0x18,0x68,0x2c // vbroadcastss 0x2c(%rax),%ymm13 + .byte 0xc5,0x9c,0x59,0xd2 // vmulps %ymm2,%ymm12,%ymm2 + .byte 0xc4,0xc1,0x6c,0x58,0xd5 // vaddps %ymm13,%ymm2,%ymm2 + .byte 0xc5,0xa4,0x59,0xc9 // vmulps %ymm1,%ymm11,%ymm1 + .byte 0xc5,0xf4,0x58,0xca // vaddps %ymm2,%ymm1,%ymm1 + .byte 0xc5,0xac,0x59,0xc0 // vmulps %ymm0,%ymm10,%ymm0 + .byte 0xc5,0xfc,0x58,0xd1 // vaddps %ymm1,%ymm0,%ymm2 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0x7c,0x29,0xc0 // vmovaps %ymm8,%ymm0 + .byte 0xc5,0x7c,0x29,0xc9 // vmovaps %ymm9,%ymm1 + .byte 0xff,0xe0 // jmpq *%rax + +.globl _sk_linear_gradient_2stops_avx +_sk_linear_gradient_2stops_avx: + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc4,0xe2,0x7d,0x18,0x48,0x10 // vbroadcastss 0x10(%rax),%ymm1 + .byte 0xc4,0xe2,0x7d,0x18,0x10 // vbroadcastss (%rax),%ymm2 + .byte 0xc5,0xf4,0x59,0xc8 // vmulps %ymm0,%ymm1,%ymm1 + .byte 0xc5,0x6c,0x58,0xc1 // vaddps %ymm1,%ymm2,%ymm8 + .byte 0xc4,0xe2,0x7d,0x18,0x48,0x14 // vbroadcastss 0x14(%rax),%ymm1 + .byte 0xc4,0xe2,0x7d,0x18,0x50,0x04 // vbroadcastss 0x4(%rax),%ymm2 + .byte 0xc5,0xf4,0x59,0xc8 // vmulps %ymm0,%ymm1,%ymm1 + .byte 0xc5,0xec,0x58,0xc9 // vaddps %ymm1,%ymm2,%ymm1 + .byte 0xc4,0xe2,0x7d,0x18,0x50,0x18 // vbroadcastss 0x18(%rax),%ymm2 + .byte 0xc4,0xe2,0x7d,0x18,0x58,0x08 // vbroadcastss 0x8(%rax),%ymm3 + .byte 0xc5,0xec,0x59,0xd0 // vmulps %ymm0,%ymm2,%ymm2 + .byte 0xc5,0xe4,0x58,0xd2 // vaddps %ymm2,%ymm3,%ymm2 + .byte 0xc4,0xe2,0x7d,0x18,0x58,0x1c // vbroadcastss 0x1c(%rax),%ymm3 + .byte 0xc4,0x62,0x7d,0x18,0x48,0x0c // vbroadcastss 0xc(%rax),%ymm9 + .byte 0xc5,0xe4,0x59,0xc0 // vmulps %ymm0,%ymm3,%ymm0 + .byte 0xc5,0xb4,0x58,0xd8 // vaddps %ymm0,%ymm9,%ymm3 + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0xc5,0x7c,0x29,0xc0 // vmovaps %ymm8,%ymm0 + .byte 0xff,0xe0 // jmpq *%rax + .globl _sk_start_pipeline_sse41 _sk_start_pipeline_sse41: .byte 0x41,0x57 // push %r15 diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S index d2078b6b4f..1409d03c6f 100644 --- a/src/jumper/SkJumper_generated_win.S +++ b/src/jumper/SkJumper_generated_win.S @@ -589,6 +589,701 @@ _sk_linear_gradient_2stops_hsw LABEL PROC DB 197,124,41,192 ; vmovaps %ymm8,%ymm0 DB 255,224 ; jmpq *%rax +PUBLIC _sk_start_pipeline_avx +_sk_start_pipeline_avx LABEL PROC + DB 65,87 ; push %r15 + DB 65,86 ; push %r14 + DB 65,85 ; push %r13 + DB 65,84 ; push %r12 + DB 86 ; push %rsi + DB 87 ; push %rdi + DB 83 ; push %rbx + DB 72,129,236,160,0,0,0 ; sub $0xa0,%rsp + DB 197,120,41,188,36,144,0,0,0 ; vmovaps %xmm15,0x90(%rsp) + DB 197,120,41,180,36,128,0,0,0 ; vmovaps %xmm14,0x80(%rsp) + DB 197,120,41,108,36,112 ; vmovaps %xmm13,0x70(%rsp) + DB 197,120,41,100,36,96 ; vmovaps %xmm12,0x60(%rsp) + DB 197,120,41,92,36,80 ; vmovaps %xmm11,0x50(%rsp) + DB 197,120,41,84,36,64 ; vmovaps %xmm10,0x40(%rsp) + DB 197,120,41,76,36,48 ; vmovaps %xmm9,0x30(%rsp) + DB 197,120,41,68,36,32 ; vmovaps %xmm8,0x20(%rsp) + DB 197,248,41,124,36,16 ; vmovaps %xmm7,0x10(%rsp) + DB 197,248,41,52,36 ; vmovaps %xmm6,(%rsp) + DB 77,137,207 ; mov %r9,%r15 + DB 77,137,198 ; mov %r8,%r14 + DB 72,137,203 ; mov %rcx,%rbx + DB 72,137,214 ; mov %rdx,%rsi + DB 72,173 ; lods %ds:(%rsi),%rax + DB 73,137,196 ; mov %rax,%r12 + DB 73,137,245 ; mov %rsi,%r13 + DB 72,141,67,8 ; lea 0x8(%rbx),%rax + DB 76,57,248 ; cmp %r15,%rax + DB 118,5 ; jbe 75 <_sk_start_pipeline_avx+0x75> + DB 72,137,216 ; mov %rbx,%rax + DB 235,60 ; jmp b1 <_sk_start_pipeline_avx+0xb1> + DB 197,252,87,192 ; vxorps %ymm0,%ymm0,%ymm0 + DB 197,244,87,201 ; vxorps %ymm1,%ymm1,%ymm1 + DB 197,236,87,210 ; vxorps %ymm2,%ymm2,%ymm2 + DB 197,228,87,219 ; vxorps %ymm3,%ymm3,%ymm3 + DB 197,220,87,228 ; vxorps %ymm4,%ymm4,%ymm4 + DB 197,212,87,237 ; vxorps %ymm5,%ymm5,%ymm5 + DB 197,204,87,246 ; vxorps %ymm6,%ymm6,%ymm6 + DB 197,196,87,255 ; vxorps %ymm7,%ymm7,%ymm7 + DB 72,137,223 ; mov %rbx,%rdi + DB 76,137,238 ; mov %r13,%rsi + DB 76,137,242 ; mov %r14,%rdx + DB 65,255,212 ; callq *%r12 + DB 72,141,67,8 ; lea 0x8(%rbx),%rax + DB 72,131,195,16 ; add $0x10,%rbx + DB 76,57,251 ; cmp %r15,%rbx + DB 72,137,195 ; mov %rax,%rbx + DB 118,196 ; jbe 75 <_sk_start_pipeline_avx+0x75> + DB 197,248,40,52,36 ; vmovaps (%rsp),%xmm6 + DB 197,248,40,124,36,16 ; vmovaps 0x10(%rsp),%xmm7 + DB 197,120,40,68,36,32 ; vmovaps 0x20(%rsp),%xmm8 + DB 197,120,40,76,36,48 ; vmovaps 0x30(%rsp),%xmm9 + DB 197,120,40,84,36,64 ; vmovaps 0x40(%rsp),%xmm10 + DB 197,120,40,92,36,80 ; vmovaps 0x50(%rsp),%xmm11 + DB 197,120,40,100,36,96 ; vmovaps 0x60(%rsp),%xmm12 + DB 197,120,40,108,36,112 ; vmovaps 0x70(%rsp),%xmm13 + DB 197,120,40,180,36,128,0,0,0 ; vmovaps 0x80(%rsp),%xmm14 + DB 197,120,40,188,36,144,0,0,0 ; vmovaps 0x90(%rsp),%xmm15 + DB 72,129,196,160,0,0,0 ; add $0xa0,%rsp + DB 91 ; pop %rbx + DB 95 ; pop %rdi + DB 94 ; pop %rsi + DB 65,92 ; pop %r12 + DB 65,93 ; pop %r13 + DB 65,94 ; pop %r14 + DB 65,95 ; pop %r15 + DB 197,248,119 ; vzeroupper + DB 195 ; retq + +PUBLIC _sk_just_return_avx +_sk_just_return_avx LABEL PROC + DB 195 ; retq + +PUBLIC _sk_seed_shader_avx +_sk_seed_shader_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 197,249,110,199 ; vmovd %edi,%xmm0 + DB 196,227,121,4,192,0 ; vpermilps $0x0,%xmm0,%xmm0 + DB 196,227,125,24,192,1 ; vinsertf128 $0x1,%xmm0,%ymm0,%ymm0 + DB 197,252,91,192 ; vcvtdq2ps %ymm0,%ymm0 + DB 196,226,125,24,74,4 ; vbroadcastss 0x4(%rdx),%ymm1 + DB 197,252,88,193 ; vaddps %ymm1,%ymm0,%ymm0 + DB 197,252,88,66,20 ; vaddps 0x14(%rdx),%ymm0,%ymm0 + DB 197,249,110,16 ; vmovd (%rax),%xmm2 + DB 196,227,121,4,210,0 ; vpermilps $0x0,%xmm2,%xmm2 + DB 196,227,109,24,210,1 ; vinsertf128 $0x1,%xmm2,%ymm2,%ymm2 + DB 197,252,91,210 ; vcvtdq2ps %ymm2,%ymm2 + DB 197,236,88,201 ; vaddps %ymm1,%ymm2,%ymm1 + DB 196,226,125,24,18 ; vbroadcastss (%rdx),%ymm2 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 197,228,87,219 ; vxorps %ymm3,%ymm3,%ymm3 + DB 197,220,87,228 ; vxorps %ymm4,%ymm4,%ymm4 + DB 197,212,87,237 ; vxorps %ymm5,%ymm5,%ymm5 + DB 197,204,87,246 ; vxorps %ymm6,%ymm6,%ymm6 + DB 197,196,87,255 ; vxorps %ymm7,%ymm7,%ymm7 + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_constant_color_avx +_sk_constant_color_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 196,226,125,24,0 ; vbroadcastss (%rax),%ymm0 + DB 196,226,125,24,72,4 ; vbroadcastss 0x4(%rax),%ymm1 + DB 196,226,125,24,80,8 ; vbroadcastss 0x8(%rax),%ymm2 + DB 196,226,125,24,88,12 ; vbroadcastss 0xc(%rax),%ymm3 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_clear_avx +_sk_clear_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 72,173 ; lods %ds:(%rsi),%rax + DB 197,252,87,192 ; vxorps %ymm0,%ymm0,%ymm0 + DB 197,244,87,201 ; vxorps %ymm1,%ymm1,%ymm1 + DB 197,236,87,210 ; vxorps %ymm2,%ymm2,%ymm2 + DB 197,228,87,219 ; vxorps %ymm3,%ymm3,%ymm3 + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_plus__avx +_sk_plus__avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 197,252,88,196 ; vaddps %ymm4,%ymm0,%ymm0 + DB 197,244,88,205 ; vaddps %ymm5,%ymm1,%ymm1 + DB 197,236,88,214 ; vaddps %ymm6,%ymm2,%ymm2 + DB 197,228,88,223 ; vaddps %ymm7,%ymm3,%ymm3 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_srcover_avx +_sk_srcover_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 196,98,125,24,2 ; vbroadcastss (%rdx),%ymm8 + DB 197,60,92,195 ; vsubps %ymm3,%ymm8,%ymm8 + DB 197,60,89,204 ; vmulps %ymm4,%ymm8,%ymm9 + DB 197,180,88,192 ; vaddps %ymm0,%ymm9,%ymm0 + DB 197,60,89,205 ; vmulps %ymm5,%ymm8,%ymm9 + DB 197,180,88,201 ; vaddps %ymm1,%ymm9,%ymm1 + DB 197,60,89,206 ; vmulps %ymm6,%ymm8,%ymm9 + DB 197,180,88,210 ; vaddps %ymm2,%ymm9,%ymm2 + DB 197,60,89,199 ; vmulps %ymm7,%ymm8,%ymm8 + DB 197,188,88,219 ; vaddps %ymm3,%ymm8,%ymm3 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_dstover_avx +_sk_dstover_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 196,98,125,24,2 ; vbroadcastss (%rdx),%ymm8 + DB 197,60,92,199 ; vsubps %ymm7,%ymm8,%ymm8 + DB 197,188,89,192 ; vmulps %ymm0,%ymm8,%ymm0 + DB 197,252,88,196 ; vaddps %ymm4,%ymm0,%ymm0 + DB 197,188,89,201 ; vmulps %ymm1,%ymm8,%ymm1 + DB 197,244,88,205 ; vaddps %ymm5,%ymm1,%ymm1 + DB 197,188,89,210 ; vmulps %ymm2,%ymm8,%ymm2 + DB 197,236,88,214 ; vaddps %ymm6,%ymm2,%ymm2 + DB 197,188,89,219 ; vmulps %ymm3,%ymm8,%ymm3 + DB 197,228,88,223 ; vaddps %ymm7,%ymm3,%ymm3 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_clamp_0_avx +_sk_clamp_0_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8 + DB 196,193,124,95,192 ; vmaxps %ymm8,%ymm0,%ymm0 + DB 196,193,116,95,200 ; vmaxps %ymm8,%ymm1,%ymm1 + DB 196,193,108,95,208 ; vmaxps %ymm8,%ymm2,%ymm2 + DB 196,193,100,95,216 ; vmaxps %ymm8,%ymm3,%ymm3 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_clamp_1_avx +_sk_clamp_1_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 196,98,125,24,2 ; vbroadcastss (%rdx),%ymm8 + DB 196,193,124,93,192 ; vminps %ymm8,%ymm0,%ymm0 + DB 196,193,116,93,200 ; vminps %ymm8,%ymm1,%ymm1 + DB 196,193,108,93,208 ; vminps %ymm8,%ymm2,%ymm2 + DB 196,193,100,93,216 ; vminps %ymm8,%ymm3,%ymm3 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_clamp_a_avx +_sk_clamp_a_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 196,98,125,24,2 ; vbroadcastss (%rdx),%ymm8 + DB 196,193,100,93,216 ; vminps %ymm8,%ymm3,%ymm3 + DB 197,252,93,195 ; vminps %ymm3,%ymm0,%ymm0 + DB 197,244,93,203 ; vminps %ymm3,%ymm1,%ymm1 + DB 197,236,93,211 ; vminps %ymm3,%ymm2,%ymm2 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_set_rgb_avx +_sk_set_rgb_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 196,226,125,24,0 ; vbroadcastss (%rax),%ymm0 + DB 196,226,125,24,72,4 ; vbroadcastss 0x4(%rax),%ymm1 + DB 196,226,125,24,80,8 ; vbroadcastss 0x8(%rax),%ymm2 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_swap_rb_avx +_sk_swap_rb_avx LABEL PROC + DB 197,124,40,192 ; vmovaps %ymm0,%ymm8 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 72,173 ; lods %ds:(%rsi),%rax + DB 197,252,40,194 ; vmovaps %ymm2,%ymm0 + DB 197,124,41,194 ; vmovaps %ymm8,%ymm2 + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_swap_avx +_sk_swap_avx LABEL PROC + DB 197,124,40,195 ; vmovaps %ymm3,%ymm8 + DB 197,124,40,202 ; vmovaps %ymm2,%ymm9 + DB 197,124,40,209 ; vmovaps %ymm1,%ymm10 + DB 197,124,40,216 ; vmovaps %ymm0,%ymm11 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 72,173 ; lods %ds:(%rsi),%rax + DB 197,252,40,196 ; vmovaps %ymm4,%ymm0 + DB 197,252,40,205 ; vmovaps %ymm5,%ymm1 + DB 197,252,40,214 ; vmovaps %ymm6,%ymm2 + DB 197,252,40,223 ; vmovaps %ymm7,%ymm3 + DB 197,124,41,220 ; vmovaps %ymm11,%ymm4 + DB 197,124,41,213 ; vmovaps %ymm10,%ymm5 + DB 197,124,41,206 ; vmovaps %ymm9,%ymm6 + DB 197,124,41,199 ; vmovaps %ymm8,%ymm7 + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_move_src_dst_avx +_sk_move_src_dst_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 72,173 ; lods %ds:(%rsi),%rax + DB 197,252,40,224 ; vmovaps %ymm0,%ymm4 + DB 197,252,40,233 ; vmovaps %ymm1,%ymm5 + DB 197,252,40,242 ; vmovaps %ymm2,%ymm6 + DB 197,252,40,251 ; vmovaps %ymm3,%ymm7 + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_move_dst_src_avx +_sk_move_dst_src_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 72,173 ; lods %ds:(%rsi),%rax + DB 197,252,40,196 ; vmovaps %ymm4,%ymm0 + DB 197,252,40,205 ; vmovaps %ymm5,%ymm1 + DB 197,252,40,214 ; vmovaps %ymm6,%ymm2 + DB 197,252,40,223 ; vmovaps %ymm7,%ymm3 + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_premul_avx +_sk_premul_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 197,252,89,195 ; vmulps %ymm3,%ymm0,%ymm0 + DB 197,244,89,203 ; vmulps %ymm3,%ymm1,%ymm1 + DB 197,236,89,211 ; vmulps %ymm3,%ymm2,%ymm2 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_unpremul_avx +_sk_unpremul_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8 + DB 196,65,100,194,200,0 ; vcmpeqps %ymm8,%ymm3,%ymm9 + DB 196,98,125,24,18 ; vbroadcastss (%rdx),%ymm10 + DB 197,44,94,211 ; vdivps %ymm3,%ymm10,%ymm10 + DB 196,67,45,74,192,144 ; vblendvps %ymm9,%ymm8,%ymm10,%ymm8 + DB 197,188,89,192 ; vmulps %ymm0,%ymm8,%ymm0 + DB 197,188,89,201 ; vmulps %ymm1,%ymm8,%ymm1 + DB 197,188,89,210 ; vmulps %ymm2,%ymm8,%ymm2 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_from_srgb_avx +_sk_from_srgb_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 196,98,125,24,66,64 ; vbroadcastss 0x40(%rdx),%ymm8 + DB 197,60,89,200 ; vmulps %ymm0,%ymm8,%ymm9 + DB 197,124,89,208 ; vmulps %ymm0,%ymm0,%ymm10 + DB 196,98,125,24,90,60 ; vbroadcastss 0x3c(%rdx),%ymm11 + DB 196,98,125,24,98,56 ; vbroadcastss 0x38(%rdx),%ymm12 + DB 197,36,89,232 ; vmulps %ymm0,%ymm11,%ymm13 + DB 196,65,20,88,236 ; vaddps %ymm12,%ymm13,%ymm13 + DB 196,98,125,24,114,52 ; vbroadcastss 0x34(%rdx),%ymm14 + DB 196,65,44,89,213 ; vmulps %ymm13,%ymm10,%ymm10 + DB 196,65,12,88,210 ; vaddps %ymm10,%ymm14,%ymm10 + DB 196,98,125,24,106,68 ; vbroadcastss 0x44(%rdx),%ymm13 + DB 196,193,124,194,197,1 ; vcmpltps %ymm13,%ymm0,%ymm0 + DB 196,195,45,74,193,0 ; vblendvps %ymm0,%ymm9,%ymm10,%ymm0 + DB 197,60,89,201 ; vmulps %ymm1,%ymm8,%ymm9 + DB 197,116,89,209 ; vmulps %ymm1,%ymm1,%ymm10 + DB 197,36,89,249 ; vmulps %ymm1,%ymm11,%ymm15 + DB 196,65,4,88,252 ; vaddps %ymm12,%ymm15,%ymm15 + DB 196,65,44,89,215 ; vmulps %ymm15,%ymm10,%ymm10 + DB 196,65,12,88,210 ; vaddps %ymm10,%ymm14,%ymm10 + DB 196,193,116,194,205,1 ; vcmpltps %ymm13,%ymm1,%ymm1 + DB 196,195,45,74,201,16 ; vblendvps %ymm1,%ymm9,%ymm10,%ymm1 + DB 197,60,89,194 ; vmulps %ymm2,%ymm8,%ymm8 + DB 197,108,89,202 ; vmulps %ymm2,%ymm2,%ymm9 + DB 197,36,89,210 ; vmulps %ymm2,%ymm11,%ymm10 + DB 196,65,44,88,212 ; vaddps %ymm12,%ymm10,%ymm10 + DB 196,65,52,89,202 ; vmulps %ymm10,%ymm9,%ymm9 + DB 196,65,12,88,201 ; vaddps %ymm9,%ymm14,%ymm9 + DB 196,193,108,194,213,1 ; vcmpltps %ymm13,%ymm2,%ymm2 + DB 196,195,53,74,208,32 ; vblendvps %ymm2,%ymm8,%ymm9,%ymm2 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_to_srgb_avx +_sk_to_srgb_avx LABEL PROC + DB 197,124,82,192 ; vrsqrtps %ymm0,%ymm8 + DB 196,65,124,83,200 ; vrcpps %ymm8,%ymm9 + DB 196,65,124,82,208 ; vrsqrtps %ymm8,%ymm10 + DB 196,98,125,24,66,72 ; vbroadcastss 0x48(%rdx),%ymm8 + DB 197,60,89,216 ; vmulps %ymm0,%ymm8,%ymm11 + DB 196,98,125,24,34 ; vbroadcastss (%rdx),%ymm12 + DB 196,98,125,24,106,76 ; vbroadcastss 0x4c(%rdx),%ymm13 + DB 196,98,125,24,114,80 ; vbroadcastss 0x50(%rdx),%ymm14 + DB 196,98,125,24,122,84 ; vbroadcastss 0x54(%rdx),%ymm15 + DB 196,65,52,89,206 ; vmulps %ymm14,%ymm9,%ymm9 + DB 196,65,52,88,207 ; vaddps %ymm15,%ymm9,%ymm9 + DB 196,65,44,89,213 ; vmulps %ymm13,%ymm10,%ymm10 + DB 196,65,44,88,201 ; vaddps %ymm9,%ymm10,%ymm9 + DB 196,65,28,93,201 ; vminps %ymm9,%ymm12,%ymm9 + DB 196,98,125,24,82,88 ; vbroadcastss 0x58(%rdx),%ymm10 + DB 196,193,124,194,194,1 ; vcmpltps %ymm10,%ymm0,%ymm0 + DB 196,195,53,74,195,0 ; vblendvps %ymm0,%ymm11,%ymm9,%ymm0 + DB 197,124,82,201 ; vrsqrtps %ymm1,%ymm9 + DB 196,65,124,83,217 ; vrcpps %ymm9,%ymm11 + DB 196,65,124,82,201 ; vrsqrtps %ymm9,%ymm9 + DB 196,65,12,89,219 ; vmulps %ymm11,%ymm14,%ymm11 + DB 196,65,4,88,219 ; vaddps %ymm11,%ymm15,%ymm11 + DB 196,65,20,89,201 ; vmulps %ymm9,%ymm13,%ymm9 + DB 196,65,52,88,203 ; vaddps %ymm11,%ymm9,%ymm9 + DB 197,60,89,217 ; vmulps %ymm1,%ymm8,%ymm11 + DB 196,65,28,93,201 ; vminps %ymm9,%ymm12,%ymm9 + DB 196,193,116,194,202,1 ; vcmpltps %ymm10,%ymm1,%ymm1 + DB 196,195,53,74,203,16 ; vblendvps %ymm1,%ymm11,%ymm9,%ymm1 + DB 197,124,82,202 ; vrsqrtps %ymm2,%ymm9 + DB 196,65,124,83,217 ; vrcpps %ymm9,%ymm11 + DB 196,65,12,89,219 ; vmulps %ymm11,%ymm14,%ymm11 + DB 196,65,4,88,219 ; vaddps %ymm11,%ymm15,%ymm11 + DB 196,65,124,82,201 ; vrsqrtps %ymm9,%ymm9 + DB 196,65,20,89,201 ; vmulps %ymm9,%ymm13,%ymm9 + DB 196,65,52,88,203 ; vaddps %ymm11,%ymm9,%ymm9 + DB 196,65,28,93,201 ; vminps %ymm9,%ymm12,%ymm9 + DB 197,60,89,194 ; vmulps %ymm2,%ymm8,%ymm8 + DB 196,193,108,194,210,1 ; vcmpltps %ymm10,%ymm2,%ymm2 + DB 196,195,53,74,208,32 ; vblendvps %ymm2,%ymm8,%ymm9,%ymm2 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_scale_u8_avx +_sk_scale_u8_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 72,139,0 ; mov (%rax),%rax + DB 196,98,121,49,68,56,4 ; vpmovzxbd 0x4(%rax,%rdi,1),%xmm8 + DB 196,98,121,49,12,56 ; vpmovzxbd (%rax,%rdi,1),%xmm9 + DB 196,67,53,24,192,1 ; vinsertf128 $0x1,%xmm8,%ymm9,%ymm8 + DB 196,65,124,91,192 ; vcvtdq2ps %ymm8,%ymm8 + DB 196,98,125,24,74,12 ; vbroadcastss 0xc(%rdx),%ymm9 + DB 196,65,60,89,193 ; vmulps %ymm9,%ymm8,%ymm8 + DB 197,188,89,192 ; vmulps %ymm0,%ymm8,%ymm0 + DB 197,188,89,201 ; vmulps %ymm1,%ymm8,%ymm1 + DB 197,188,89,210 ; vmulps %ymm2,%ymm8,%ymm2 + DB 197,188,89,219 ; vmulps %ymm3,%ymm8,%ymm3 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_lerp_u8_avx +_sk_lerp_u8_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 72,139,0 ; mov (%rax),%rax + DB 196,98,121,49,68,56,4 ; vpmovzxbd 0x4(%rax,%rdi,1),%xmm8 + DB 196,98,121,49,12,56 ; vpmovzxbd (%rax,%rdi,1),%xmm9 + DB 196,67,53,24,192,1 ; vinsertf128 $0x1,%xmm8,%ymm9,%ymm8 + DB 196,65,124,91,192 ; vcvtdq2ps %ymm8,%ymm8 + DB 196,98,125,24,74,12 ; vbroadcastss 0xc(%rdx),%ymm9 + DB 196,65,60,89,193 ; vmulps %ymm9,%ymm8,%ymm8 + DB 197,252,92,196 ; vsubps %ymm4,%ymm0,%ymm0 + DB 196,193,124,89,192 ; vmulps %ymm8,%ymm0,%ymm0 + DB 197,252,88,196 ; vaddps %ymm4,%ymm0,%ymm0 + DB 197,244,92,205 ; vsubps %ymm5,%ymm1,%ymm1 + DB 196,193,116,89,200 ; vmulps %ymm8,%ymm1,%ymm1 + DB 197,244,88,205 ; vaddps %ymm5,%ymm1,%ymm1 + DB 197,236,92,214 ; vsubps %ymm6,%ymm2,%ymm2 + DB 196,193,108,89,208 ; vmulps %ymm8,%ymm2,%ymm2 + DB 197,236,88,214 ; vaddps %ymm6,%ymm2,%ymm2 + DB 197,228,92,223 ; vsubps %ymm7,%ymm3,%ymm3 + DB 196,193,100,89,216 ; vmulps %ymm8,%ymm3,%ymm3 + DB 197,228,88,223 ; vaddps %ymm7,%ymm3,%ymm3 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_load_tables_avx +_sk_load_tables_avx LABEL PROC + DB 65,87 ; push %r15 + DB 65,86 ; push %r14 + DB 65,84 ; push %r12 + DB 83 ; push %rbx + DB 72,173 ; lods %ds:(%rsi),%rax + DB 76,139,0 ; mov (%rax),%r8 + DB 72,139,72,8 ; mov 0x8(%rax),%rcx + DB 196,65,124,16,20,184 ; vmovups (%r8,%rdi,4),%ymm10 + DB 197,249,110,66,16 ; vmovd 0x10(%rdx),%xmm0 + DB 196,227,121,4,192,0 ; vpermilps $0x0,%xmm0,%xmm0 + DB 196,99,125,24,200,1 ; vinsertf128 $0x1,%xmm0,%ymm0,%ymm9 + DB 196,193,52,84,194 ; vandps %ymm10,%ymm9,%ymm0 + DB 196,193,249,126,192 ; vmovq %xmm0,%r8 + DB 69,137,193 ; mov %r8d,%r9d + DB 196,195,249,22,194,1 ; vpextrq $0x1,%xmm0,%r10 + DB 69,137,211 ; mov %r10d,%r11d + DB 73,193,234,32 ; shr $0x20,%r10 + DB 73,193,232,32 ; shr $0x20,%r8 + DB 196,227,125,25,192,1 ; vextractf128 $0x1,%ymm0,%xmm0 + DB 196,193,249,126,199 ; vmovq %xmm0,%r15 + DB 69,137,254 ; mov %r15d,%r14d + DB 196,227,249,22,195,1 ; vpextrq $0x1,%xmm0,%rbx + DB 65,137,220 ; mov %ebx,%r12d + DB 72,193,235,32 ; shr $0x20,%rbx + DB 73,193,239,32 ; shr $0x20,%r15 + DB 196,161,122,16,4,177 ; vmovss (%rcx,%r14,4),%xmm0 + DB 196,163,121,33,4,185,16 ; vinsertps $0x10,(%rcx,%r15,4),%xmm0,%xmm0 + DB 196,163,121,33,4,161,32 ; vinsertps $0x20,(%rcx,%r12,4),%xmm0,%xmm0 + DB 196,227,121,33,4,153,48 ; vinsertps $0x30,(%rcx,%rbx,4),%xmm0,%xmm0 + DB 196,161,122,16,12,137 ; vmovss (%rcx,%r9,4),%xmm1 + DB 196,163,113,33,12,129,16 ; vinsertps $0x10,(%rcx,%r8,4),%xmm1,%xmm1 + DB 196,163,113,33,12,153,32 ; vinsertps $0x20,(%rcx,%r11,4),%xmm1,%xmm1 + DB 196,163,113,33,12,145,48 ; vinsertps $0x30,(%rcx,%r10,4),%xmm1,%xmm1 + DB 196,227,117,24,192,1 ; vinsertf128 $0x1,%xmm0,%ymm1,%ymm0 + DB 76,139,120,16 ; mov 0x10(%rax),%r15 + DB 196,193,113,114,210,8 ; vpsrld $0x8,%xmm10,%xmm1 + DB 196,67,125,25,208,1 ; vextractf128 $0x1,%ymm10,%xmm8 + DB 196,193,105,114,208,8 ; vpsrld $0x8,%xmm8,%xmm2 + DB 196,227,117,24,202,1 ; vinsertf128 $0x1,%xmm2,%ymm1,%ymm1 + DB 197,180,84,201 ; vandps %ymm1,%ymm9,%ymm1 + DB 196,193,249,126,200 ; vmovq %xmm1,%r8 + DB 69,137,194 ; mov %r8d,%r10d + DB 196,195,249,22,201,1 ; vpextrq $0x1,%xmm1,%r9 + DB 69,137,203 ; mov %r9d,%r11d + DB 73,193,233,32 ; shr $0x20,%r9 + DB 73,193,232,32 ; shr $0x20,%r8 + DB 196,227,125,25,201,1 ; vextractf128 $0x1,%ymm1,%xmm1 + DB 196,225,249,126,203 ; vmovq %xmm1,%rbx + DB 65,137,222 ; mov %ebx,%r14d + DB 196,227,249,22,201,1 ; vpextrq $0x1,%xmm1,%rcx + DB 65,137,204 ; mov %ecx,%r12d + DB 72,193,233,32 ; shr $0x20,%rcx + DB 72,193,235,32 ; shr $0x20,%rbx + DB 196,129,122,16,12,183 ; vmovss (%r15,%r14,4),%xmm1 + DB 196,195,113,33,12,159,16 ; vinsertps $0x10,(%r15,%rbx,4),%xmm1,%xmm1 + DB 196,129,122,16,20,167 ; vmovss (%r15,%r12,4),%xmm2 + DB 196,227,113,33,202,32 ; vinsertps $0x20,%xmm2,%xmm1,%xmm1 + DB 196,193,122,16,20,143 ; vmovss (%r15,%rcx,4),%xmm2 + DB 196,227,113,33,202,48 ; vinsertps $0x30,%xmm2,%xmm1,%xmm1 + DB 196,129,122,16,20,151 ; vmovss (%r15,%r10,4),%xmm2 + DB 196,131,105,33,20,135,16 ; vinsertps $0x10,(%r15,%r8,4),%xmm2,%xmm2 + DB 196,129,122,16,28,159 ; vmovss (%r15,%r11,4),%xmm3 + DB 196,227,105,33,211,32 ; vinsertps $0x20,%xmm3,%xmm2,%xmm2 + DB 196,129,122,16,28,143 ; vmovss (%r15,%r9,4),%xmm3 + DB 196,227,105,33,211,48 ; vinsertps $0x30,%xmm3,%xmm2,%xmm2 + DB 196,227,109,24,201,1 ; vinsertf128 $0x1,%xmm1,%ymm2,%ymm1 + DB 72,139,64,24 ; mov 0x18(%rax),%rax + DB 196,193,105,114,210,16 ; vpsrld $0x10,%xmm10,%xmm2 + DB 196,193,97,114,208,16 ; vpsrld $0x10,%xmm8,%xmm3 + DB 196,227,109,24,211,1 ; vinsertf128 $0x1,%xmm3,%ymm2,%ymm2 + DB 197,180,84,210 ; vandps %ymm2,%ymm9,%ymm2 + DB 196,193,249,126,208 ; vmovq %xmm2,%r8 + DB 69,137,193 ; mov %r8d,%r9d + DB 196,195,249,22,214,1 ; vpextrq $0x1,%xmm2,%r14 + DB 69,137,242 ; mov %r14d,%r10d + DB 73,193,238,32 ; shr $0x20,%r14 + DB 73,193,232,32 ; shr $0x20,%r8 + DB 196,227,125,25,210,1 ; vextractf128 $0x1,%ymm2,%xmm2 + DB 196,225,249,126,211 ; vmovq %xmm2,%rbx + DB 65,137,219 ; mov %ebx,%r11d + DB 196,227,249,22,209,1 ; vpextrq $0x1,%xmm2,%rcx + DB 65,137,207 ; mov %ecx,%r15d + DB 72,193,233,32 ; shr $0x20,%rcx + DB 72,193,235,32 ; shr $0x20,%rbx + DB 196,161,122,16,20,152 ; vmovss (%rax,%r11,4),%xmm2 + DB 196,227,105,33,20,152,16 ; vinsertps $0x10,(%rax,%rbx,4),%xmm2,%xmm2 + DB 196,161,122,16,28,184 ; vmovss (%rax,%r15,4),%xmm3 + DB 196,227,105,33,211,32 ; vinsertps $0x20,%xmm3,%xmm2,%xmm2 + DB 197,250,16,28,136 ; vmovss (%rax,%rcx,4),%xmm3 + DB 196,99,105,33,203,48 ; vinsertps $0x30,%xmm3,%xmm2,%xmm9 + DB 196,161,122,16,28,136 ; vmovss (%rax,%r9,4),%xmm3 + DB 196,163,97,33,28,128,16 ; vinsertps $0x10,(%rax,%r8,4),%xmm3,%xmm3 + DB 196,161,122,16,20,144 ; vmovss (%rax,%r10,4),%xmm2 + DB 196,227,97,33,210,32 ; vinsertps $0x20,%xmm2,%xmm3,%xmm2 + DB 196,161,122,16,28,176 ; vmovss (%rax,%r14,4),%xmm3 + DB 196,227,105,33,211,48 ; vinsertps $0x30,%xmm3,%xmm2,%xmm2 + DB 196,195,109,24,209,1 ; vinsertf128 $0x1,%xmm9,%ymm2,%ymm2 + DB 196,193,49,114,210,24 ; vpsrld $0x18,%xmm10,%xmm9 + DB 196,193,97,114,208,24 ; vpsrld $0x18,%xmm8,%xmm3 + DB 196,227,53,24,219,1 ; vinsertf128 $0x1,%xmm3,%ymm9,%ymm3 + DB 197,252,91,219 ; vcvtdq2ps %ymm3,%ymm3 + DB 196,98,125,24,66,12 ; vbroadcastss 0xc(%rdx),%ymm8 + DB 196,193,100,89,216 ; vmulps %ymm8,%ymm3,%ymm3 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 91 ; pop %rbx + DB 65,92 ; pop %r12 + DB 65,94 ; pop %r14 + DB 65,95 ; pop %r15 + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_load_8888_avx +_sk_load_8888_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 72,139,0 ; mov (%rax),%rax + DB 197,252,16,28,184 ; vmovups (%rax,%rdi,4),%ymm3 + DB 197,249,110,66,16 ; vmovd 0x10(%rdx),%xmm0 + DB 196,227,121,4,192,0 ; vpermilps $0x0,%xmm0,%xmm0 + DB 196,99,125,24,216,1 ; vinsertf128 $0x1,%xmm0,%ymm0,%ymm11 + DB 197,164,84,195 ; vandps %ymm3,%ymm11,%ymm0 + DB 197,252,91,192 ; vcvtdq2ps %ymm0,%ymm0 + DB 196,98,125,24,66,12 ; vbroadcastss 0xc(%rdx),%ymm8 + DB 197,188,89,192 ; vmulps %ymm0,%ymm8,%ymm0 + DB 197,169,114,211,8 ; vpsrld $0x8,%xmm3,%xmm10 + DB 196,195,125,25,217,1 ; vextractf128 $0x1,%ymm3,%xmm9 + DB 196,193,113,114,209,8 ; vpsrld $0x8,%xmm9,%xmm1 + DB 196,227,45,24,201,1 ; vinsertf128 $0x1,%xmm1,%ymm10,%ymm1 + DB 197,164,84,201 ; vandps %ymm1,%ymm11,%ymm1 + DB 197,252,91,201 ; vcvtdq2ps %ymm1,%ymm1 + DB 197,188,89,201 ; vmulps %ymm1,%ymm8,%ymm1 + DB 197,169,114,211,16 ; vpsrld $0x10,%xmm3,%xmm10 + DB 196,193,105,114,209,16 ; vpsrld $0x10,%xmm9,%xmm2 + DB 196,227,45,24,210,1 ; vinsertf128 $0x1,%xmm2,%ymm10,%ymm2 + DB 197,164,84,210 ; vandps %ymm2,%ymm11,%ymm2 + DB 197,252,91,210 ; vcvtdq2ps %ymm2,%ymm2 + DB 197,188,89,210 ; vmulps %ymm2,%ymm8,%ymm2 + DB 197,169,114,211,24 ; vpsrld $0x18,%xmm3,%xmm10 + DB 196,193,97,114,209,24 ; vpsrld $0x18,%xmm9,%xmm3 + DB 196,227,45,24,219,1 ; vinsertf128 $0x1,%xmm3,%ymm10,%ymm3 + DB 197,252,91,219 ; vcvtdq2ps %ymm3,%ymm3 + DB 196,193,100,89,216 ; vmulps %ymm8,%ymm3,%ymm3 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_store_8888_avx +_sk_store_8888_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 72,139,0 ; mov (%rax),%rax + DB 196,98,125,24,66,8 ; vbroadcastss 0x8(%rdx),%ymm8 + DB 197,60,89,200 ; vmulps %ymm0,%ymm8,%ymm9 + DB 196,65,125,91,201 ; vcvtps2dq %ymm9,%ymm9 + DB 197,60,89,209 ; vmulps %ymm1,%ymm8,%ymm10 + DB 196,65,125,91,210 ; vcvtps2dq %ymm10,%ymm10 + DB 196,193,33,114,242,8 ; vpslld $0x8,%xmm10,%xmm11 + DB 196,67,125,25,210,1 ; vextractf128 $0x1,%ymm10,%xmm10 + DB 196,193,41,114,242,8 ; vpslld $0x8,%xmm10,%xmm10 + DB 196,67,37,24,210,1 ; vinsertf128 $0x1,%xmm10,%ymm11,%ymm10 + DB 196,65,45,86,201 ; vorpd %ymm9,%ymm10,%ymm9 + DB 197,60,89,210 ; vmulps %ymm2,%ymm8,%ymm10 + DB 196,65,125,91,210 ; vcvtps2dq %ymm10,%ymm10 + DB 196,193,33,114,242,16 ; vpslld $0x10,%xmm10,%xmm11 + DB 196,67,125,25,210,1 ; vextractf128 $0x1,%ymm10,%xmm10 + DB 196,193,41,114,242,16 ; vpslld $0x10,%xmm10,%xmm10 + DB 196,67,37,24,210,1 ; vinsertf128 $0x1,%xmm10,%ymm11,%ymm10 + DB 196,65,53,86,202 ; vorpd %ymm10,%ymm9,%ymm9 + DB 197,60,89,195 ; vmulps %ymm3,%ymm8,%ymm8 + DB 196,65,125,91,192 ; vcvtps2dq %ymm8,%ymm8 + DB 196,193,41,114,240,24 ; vpslld $0x18,%xmm8,%xmm10 + DB 196,67,125,25,192,1 ; vextractf128 $0x1,%ymm8,%xmm8 + DB 196,193,57,114,240,24 ; vpslld $0x18,%xmm8,%xmm8 + DB 196,67,45,24,192,1 ; vinsertf128 $0x1,%xmm8,%ymm10,%ymm8 + DB 196,65,53,86,192 ; vorpd %ymm8,%ymm9,%ymm8 + DB 197,125,17,4,184 ; vmovupd %ymm8,(%rax,%rdi,4) + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_load_f16_avx +_sk_load_f16_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_store_f16_avx +_sk_store_f16_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_clamp_x_avx +_sk_clamp_x_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8 + DB 196,67,125,25,193,1 ; vextractf128 $0x1,%ymm8,%xmm9 + DB 196,65,41,118,210 ; vpcmpeqd %xmm10,%xmm10,%xmm10 + DB 196,65,49,254,202 ; vpaddd %xmm10,%xmm9,%xmm9 + DB 196,65,57,254,194 ; vpaddd %xmm10,%xmm8,%xmm8 + DB 196,67,61,24,193,1 ; vinsertf128 $0x1,%xmm9,%ymm8,%ymm8 + DB 196,193,124,93,192 ; vminps %ymm8,%ymm0,%ymm0 + DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8 + DB 197,188,95,192 ; vmaxps %ymm0,%ymm8,%ymm0 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_clamp_y_avx +_sk_clamp_y_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8 + DB 196,67,125,25,193,1 ; vextractf128 $0x1,%ymm8,%xmm9 + DB 196,65,41,118,210 ; vpcmpeqd %xmm10,%xmm10,%xmm10 + DB 196,65,49,254,202 ; vpaddd %xmm10,%xmm9,%xmm9 + DB 196,65,57,254,194 ; vpaddd %xmm10,%xmm8,%xmm8 + DB 196,67,61,24,193,1 ; vinsertf128 $0x1,%xmm9,%ymm8,%ymm8 + DB 196,193,116,93,200 ; vminps %ymm8,%ymm1,%ymm1 + DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8 + DB 197,188,95,201 ; vmaxps %ymm1,%ymm8,%ymm1 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_matrix_2x3_avx +_sk_matrix_2x3_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8 + DB 196,98,125,24,72,8 ; vbroadcastss 0x8(%rax),%ymm9 + DB 196,98,125,24,80,16 ; vbroadcastss 0x10(%rax),%ymm10 + DB 197,52,89,201 ; vmulps %ymm1,%ymm9,%ymm9 + DB 196,65,52,88,202 ; vaddps %ymm10,%ymm9,%ymm9 + DB 197,60,89,192 ; vmulps %ymm0,%ymm8,%ymm8 + DB 196,65,60,88,193 ; vaddps %ymm9,%ymm8,%ymm8 + DB 196,98,125,24,72,4 ; vbroadcastss 0x4(%rax),%ymm9 + DB 196,98,125,24,80,12 ; vbroadcastss 0xc(%rax),%ymm10 + DB 196,98,125,24,88,20 ; vbroadcastss 0x14(%rax),%ymm11 + DB 197,172,89,201 ; vmulps %ymm1,%ymm10,%ymm1 + DB 196,193,116,88,203 ; vaddps %ymm11,%ymm1,%ymm1 + DB 197,180,89,192 ; vmulps %ymm0,%ymm9,%ymm0 + DB 197,252,88,201 ; vaddps %ymm1,%ymm0,%ymm1 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 197,124,41,192 ; vmovaps %ymm8,%ymm0 + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_matrix_3x4_avx +_sk_matrix_3x4_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8 + DB 196,98,125,24,72,12 ; vbroadcastss 0xc(%rax),%ymm9 + DB 196,98,125,24,80,24 ; vbroadcastss 0x18(%rax),%ymm10 + DB 196,98,125,24,88,36 ; vbroadcastss 0x24(%rax),%ymm11 + DB 197,44,89,210 ; vmulps %ymm2,%ymm10,%ymm10 + DB 196,65,44,88,211 ; vaddps %ymm11,%ymm10,%ymm10 + DB 197,52,89,201 ; vmulps %ymm1,%ymm9,%ymm9 + DB 196,65,52,88,202 ; vaddps %ymm10,%ymm9,%ymm9 + DB 197,60,89,192 ; vmulps %ymm0,%ymm8,%ymm8 + DB 196,65,60,88,193 ; vaddps %ymm9,%ymm8,%ymm8 + DB 196,98,125,24,72,4 ; vbroadcastss 0x4(%rax),%ymm9 + DB 196,98,125,24,80,16 ; vbroadcastss 0x10(%rax),%ymm10 + DB 196,98,125,24,88,28 ; vbroadcastss 0x1c(%rax),%ymm11 + DB 196,98,125,24,96,40 ; vbroadcastss 0x28(%rax),%ymm12 + DB 197,36,89,218 ; vmulps %ymm2,%ymm11,%ymm11 + DB 196,65,36,88,220 ; vaddps %ymm12,%ymm11,%ymm11 + DB 197,44,89,209 ; vmulps %ymm1,%ymm10,%ymm10 + DB 196,65,44,88,211 ; vaddps %ymm11,%ymm10,%ymm10 + DB 197,52,89,200 ; vmulps %ymm0,%ymm9,%ymm9 + DB 196,65,52,88,202 ; vaddps %ymm10,%ymm9,%ymm9 + DB 196,98,125,24,80,8 ; vbroadcastss 0x8(%rax),%ymm10 + DB 196,98,125,24,88,20 ; vbroadcastss 0x14(%rax),%ymm11 + DB 196,98,125,24,96,32 ; vbroadcastss 0x20(%rax),%ymm12 + DB 196,98,125,24,104,44 ; vbroadcastss 0x2c(%rax),%ymm13 + DB 197,156,89,210 ; vmulps %ymm2,%ymm12,%ymm2 + DB 196,193,108,88,213 ; vaddps %ymm13,%ymm2,%ymm2 + DB 197,164,89,201 ; vmulps %ymm1,%ymm11,%ymm1 + DB 197,244,88,202 ; vaddps %ymm2,%ymm1,%ymm1 + DB 197,172,89,192 ; vmulps %ymm0,%ymm10,%ymm0 + DB 197,252,88,209 ; vaddps %ymm1,%ymm0,%ymm2 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 197,124,41,192 ; vmovaps %ymm8,%ymm0 + DB 197,124,41,201 ; vmovaps %ymm9,%ymm1 + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_linear_gradient_2stops_avx +_sk_linear_gradient_2stops_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 196,226,125,24,72,16 ; vbroadcastss 0x10(%rax),%ymm1 + DB 196,226,125,24,16 ; vbroadcastss (%rax),%ymm2 + DB 197,244,89,200 ; vmulps %ymm0,%ymm1,%ymm1 + DB 197,108,88,193 ; vaddps %ymm1,%ymm2,%ymm8 + DB 196,226,125,24,72,20 ; vbroadcastss 0x14(%rax),%ymm1 + DB 196,226,125,24,80,4 ; vbroadcastss 0x4(%rax),%ymm2 + DB 197,244,89,200 ; vmulps %ymm0,%ymm1,%ymm1 + DB 197,236,88,201 ; vaddps %ymm1,%ymm2,%ymm1 + DB 196,226,125,24,80,24 ; vbroadcastss 0x18(%rax),%ymm2 + DB 196,226,125,24,88,8 ; vbroadcastss 0x8(%rax),%ymm3 + DB 197,236,89,208 ; vmulps %ymm0,%ymm2,%ymm2 + DB 197,228,88,210 ; vaddps %ymm2,%ymm3,%ymm2 + DB 196,226,125,24,88,28 ; vbroadcastss 0x1c(%rax),%ymm3 + DB 196,98,125,24,72,12 ; vbroadcastss 0xc(%rax),%ymm9 + DB 197,228,89,192 ; vmulps %ymm0,%ymm3,%ymm0 + DB 197,180,88,216 ; vaddps %ymm0,%ymm9,%ymm3 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 197,124,41,192 ; vmovaps %ymm8,%ymm0 + DB 255,224 ; jmpq *%rax + PUBLIC _sk_start_pipeline_sse41 _sk_start_pipeline_sse41 LABEL PROC DB 65,87 ; push %r15 diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp index a691f2bc62..21e3c3590b 100644 --- a/src/jumper/SkJumper_stages.cpp +++ b/src/jumper/SkJumper_stages.cpp @@ -104,6 +104,30 @@ using K = const SkJumper_constants; #define WRAP(name) sk_##name##_hsw +#elif defined(__AVX__) + #include <immintrin.h> + + using F = float __attribute__((ext_vector_type(8))); + using I32 = int32_t __attribute__((ext_vector_type(8))); + using U32 = uint32_t __attribute__((ext_vector_type(8))); + using U8 = uint8_t __attribute__((ext_vector_type(8))); + + static F mad(F f, F m, F a) { return f*m+a; } + static F min(F a, F b) { return _mm256_min_ps(a,b); } + static F max(F a, F b) { return _mm256_max_ps(a,b); } + static F rcp (F v) { return _mm256_rcp_ps (v); } + static F rsqrt(F v) { return _mm256_rsqrt_ps(v); } + static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); } + + static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); } + + static F gather(const float* p, U32 ix) { + return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]], + p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], }; + } + + #define WRAP(name) sk_##name##_avx + #elif defined(__SSE2__) #include <immintrin.h> @@ -499,6 +523,9 @@ STAGE(load_f16) { g = _mm256_cvtph_ps(_mm_unpackhi_epi64(rg0123, rg4567)); b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567)); a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567)); +#elif defined(__AVX__) + // TODO + #elif defined(__SSE2__) auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0), _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); @@ -568,6 +595,8 @@ STAGE(store_f16) { _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123)); _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567)); _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567)); +#elif defined(__AVX__) + // TODO #elif defined(__SSE2__) auto float_to_half = [&](F f) { return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000))) // Fix up the exponent, diff --git a/src/jumper/build_stages.py b/src/jumper/build_stages.py index b6ab3c0c48..945f77606f 100755 --- a/src/jumper/build_stages.py +++ b/src/jumper/build_stages.py @@ -33,6 +33,14 @@ subprocess.check_call(['clang++'] + cflags + sse41 + ['-DWIN'] + ['-c', 'src/jumper/SkJumper_stages.cpp'] + ['-o', 'win_sse41.o']) +avx = '-mno-red-zone -mavx'.split() +subprocess.check_call(['clang++'] + cflags + avx + + ['-c', 'src/jumper/SkJumper_stages.cpp'] + + ['-o', 'avx.o']) +subprocess.check_call(['clang++'] + cflags + avx + ['-DWIN'] + + ['-c', 'src/jumper/SkJumper_stages.cpp'] + + ['-o', 'win_avx.o']) + hsw = '-mno-red-zone -mavx2 -mfma -mf16c'.split() subprocess.check_call(['clang++'] + cflags + hsw + ['-c', 'src/jumper/SkJumper_stages.cpp'] + @@ -125,6 +133,7 @@ parse_object_file('vfp4.o', '.long', target='elf32-littlearm') print '#elif defined(__x86_64__)' parse_object_file('hsw.o', '.byte') +parse_object_file('avx.o', '.byte') parse_object_file('sse41.o', '.byte') parse_object_file('sse2.o', '.byte') print '#endif' @@ -141,6 +150,7 @@ print '''; Copyright 2017 Google Inc. ''' print '_text SEGMENT' parse_object_file('win_hsw.o', 'DB') +parse_object_file('win_avx.o', 'DB') parse_object_file('win_sse41.o', 'DB') parse_object_file('win_sse2.o', 'DB') print 'END' |