author    Mike Klein <mtklein@chromium.org> 2017-08-10 14:28:52 -0400
committer Skia Commit-Bot <skia-commit-bot@chromium.org> 2017-08-11 16:52:06 +0000
commit    a1d1703b749b55dc1a07da6aa28447c3cd060e57 (patch)
tree      571e7d03a96b63542363665989a03109ca7d27ed /src
parent    3c2865374597525a25f02160d914dc8fdc3bf415 (diff)
double pump 8-bit stages
This basically unrolls all loops, handling twice as many pixels in a stride. We now pass around 4 native registers instead of just 2.

I've temporarily disabled AVX2 mask loads and stores. It shouldn't be hard to turn them back on, but I'd want to test on AVX2 hardware first.

Change-Id: I0907070f086a0650167456c149a479c1d96b8a2d
Reviewed-on: https://skia-review.googlesource.com/33361
Reviewed-by: Florin Malita <fmalita@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
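The commit message is the only prose description of the change, so here is a minimal, hypothetical C++ sketch (not Skia's actual SkJumper code; the types and function names are made up for illustration) of what "double pumping" a stage means: each stage now carries two vectors per stride instead of one, so one trip through the pipeline covers twice as many pixels, and tails shorter than a full stride fall back to piecewise loads/stores.

    // Hypothetical sketch of a double-pumped 8-bit stage.
    #include <cstdint>

    // Eight packed 8888 pixels, roughly the width of one ymm register.
    struct V { uint32_t px[8]; };

    // Before: a stage processed one vector (8 pixels) per stride.
    static V invert_single(V v) {
        for (auto& p : v.px) { p = ~p; }
        return v;
    }

    // After: the same stage is "double pumped" -- it carries two vectors
    // (the equivalent of passing 4 native registers instead of 2 once a
    // dst color is in flight), handling 16 pixels per stride.
    static void invert_double(V& lo, V& hi) {
        for (auto& p : lo.px) { p = ~p; }
        for (auto& p : hi.px) { p = ~p; }
    }

    int main() {
        V lo{}, hi{};
        invert_double(lo, hi);   // one stride now covers 16 pixels
        (void)invert_single(lo); // old single-vector form, for comparison
        return 0;
    }

In the generated assembly below this shows up as every hsw_8bit stage operating on %ymm0/%ymm1 (and %ymm2/%ymm3 for dst) instead of a single register, strides advancing by 0x20 pixels instead of 0x10, and the AVX2 vpmaskmovd tail paths being replaced for now by jump tables of partial loads and stores.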
Diffstat (limited to 'src')
-rw-r--r--  src/jumper/SkJumper_generated.S      | 7918
-rw-r--r--  src/jumper/SkJumper_generated_win.S  | 7929
-rw-r--r--  src/jumper/SkJumper_stages_8bit.cpp  |  153
3 files changed, 11193 insertions(+), 4807 deletions(-)
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index f3ea32388a..0cc69f8f1f 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -56080,7 +56080,7 @@ _sk_start_pipeline_hsw_8bit:
.byte 73,57,207 // cmp %rcx,%r15
.byte 115,102 // jae 95 <_sk_start_pipeline_hsw_8bit+0x95>
.byte 72,139,69,208 // mov -0x30(%rbp),%rax
- .byte 72,141,64,8 // lea 0x8(%rax),%rax
+ .byte 72,141,64,16 // lea 0x10(%rax),%rax
.byte 72,137,69,176 // mov %rax,-0x50(%rbp)
.byte 76,141,101,184 // lea -0x48(%rbp),%r12
.byte 72,139,69,208 // mov -0x30(%rbp),%rax
@@ -56093,9 +56093,9 @@ _sk_start_pipeline_hsw_8bit:
.byte 76,137,246 // mov %r14,%rsi
.byte 65,255,213 // callq *%r13
.byte 72,139,77,184 // mov -0x48(%rbp),%rcx
- .byte 72,141,65,8 // lea 0x8(%rcx),%rax
+ .byte 72,141,65,16 // lea 0x10(%rcx),%rax
.byte 72,137,69,184 // mov %rax,-0x48(%rbp)
- .byte 72,131,193,16 // add $0x10,%rcx
+ .byte 72,131,193,32 // add $0x20,%rcx
.byte 72,57,217 // cmp %rbx,%rcx
.byte 118,226 // jbe 59 <_sk_start_pipeline_hsw_8bit+0x59>
.byte 72,137,217 // mov %rbx,%rcx
@@ -56131,6 +56131,7 @@ _sk_uniform_color_hsw_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 196,226,125,24,64,16 // vbroadcastss 0x10(%rax),%ymm0
.byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 197,252,40,200 // vmovaps %ymm0,%ymm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_set_rgb_hsw_8bit
@@ -56138,21 +56139,24 @@ HIDDEN _sk_set_rgb_hsw_8bit
FUNCTION(_sk_set_rgb_hsw_8bit)
_sk_set_rgb_hsw_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 197,250,16,21,54,18,0,0 // vmovss 0x1236(%rip),%xmm2 # 12f4 <_sk_xor__hsw_8bit+0xbd>
- .byte 197,234,89,24 // vmulss (%rax),%xmm2,%xmm3
- .byte 196,225,250,44,203 // vcvttss2si %xmm3,%rcx
- .byte 197,234,89,88,4 // vmulss 0x4(%rax),%xmm2,%xmm3
- .byte 196,225,250,44,211 // vcvttss2si %xmm3,%rdx
+ .byte 197,250,16,37,62,39,0,0 // vmovss 0x273e(%rip),%xmm4 # 2800 <_sk_xor__hsw_8bit+0x175>
+ .byte 197,218,89,40 // vmulss (%rax),%xmm4,%xmm5
+ .byte 196,225,250,44,205 // vcvttss2si %xmm5,%rcx
+ .byte 197,218,89,104,4 // vmulss 0x4(%rax),%xmm4,%xmm5
+ .byte 196,225,250,44,213 // vcvttss2si %xmm5,%rdx
.byte 193,226,8 // shl $0x8,%edx
.byte 9,202 // or %ecx,%edx
- .byte 197,234,89,80,8 // vmulss 0x8(%rax),%xmm2,%xmm2
- .byte 196,225,250,44,194 // vcvttss2si %xmm2,%rax
+ .byte 197,218,89,96,8 // vmulss 0x8(%rax),%xmm4,%xmm4
+ .byte 196,225,250,44,196 // vcvttss2si %xmm4,%rax
.byte 193,224,16 // shl $0x10,%eax
.byte 9,208 // or %edx,%eax
- .byte 197,249,110,208 // vmovd %eax,%xmm2
- .byte 196,226,125,88,210 // vpbroadcastd %xmm2,%ymm2
- .byte 197,253,219,5,42,18,0,0 // vpand 0x122a(%rip),%ymm0,%ymm0 # 1320 <_sk_xor__hsw_8bit+0xe9>
- .byte 197,237,235,192 // vpor %ymm0,%ymm2,%ymm0
+ .byte 197,249,110,224 // vmovd %eax,%xmm4
+ .byte 196,226,125,88,228 // vpbroadcastd %xmm4,%ymm4
+ .byte 197,253,111,45,38,39,0,0 // vmovdqa 0x2726(%rip),%ymm5 # 2820 <_sk_xor__hsw_8bit+0x195>
+ .byte 197,245,219,205 // vpand %ymm5,%ymm1,%ymm1
+ .byte 197,253,219,197 // vpand %ymm5,%ymm0,%ymm0
+ .byte 197,221,235,192 // vpor %ymm0,%ymm4,%ymm0
+ .byte 197,221,235,201 // vpor %ymm1,%ymm4,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -56160,23 +56164,42 @@ HIDDEN _sk_premul_hsw_8bit
.globl _sk_premul_hsw_8bit
FUNCTION(_sk_premul_hsw_8bit)
_sk_premul_hsw_8bit:
- .byte 196,226,125,0,21,57,18,0,0 // vpshufb 0x1239(%rip),%ymm0,%ymm2 # 1340 <_sk_xor__hsw_8bit+0x109>
- .byte 197,237,235,21,81,18,0,0 // vpor 0x1251(%rip),%ymm2,%ymm2 # 1360 <_sk_xor__hsw_8bit+0x129>
- .byte 196,226,125,48,216 // vpmovzxbw %xmm0,%ymm3
+ .byte 197,253,111,37,42,39,0,0 // vmovdqa 0x272a(%rip),%ymm4 # 2840 <_sk_xor__hsw_8bit+0x1b5>
+ .byte 196,226,125,0,236 // vpshufb %ymm4,%ymm0,%ymm5
+ .byte 196,226,117,0,228 // vpshufb %ymm4,%ymm1,%ymm4
+ .byte 197,253,111,53,56,39,0,0 // vmovdqa 0x2738(%rip),%ymm6 # 2860 <_sk_xor__hsw_8bit+0x1d5>
+ .byte 197,221,235,230 // vpor %ymm6,%ymm4,%ymm4
+ .byte 197,213,235,238 // vpor %ymm6,%ymm5,%ymm5
+ .byte 196,226,125,48,240 // vpmovzxbw %xmm0,%ymm6
.byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0
.byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0
- .byte 196,226,125,48,226 // vpmovzxbw %xmm2,%ymm4
- .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2
- .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2
- .byte 197,237,213,208 // vpmullw %ymm0,%ymm2,%ymm2
- .byte 197,221,213,227 // vpmullw %ymm3,%ymm4,%ymm4
- .byte 197,221,253,219 // vpaddw %ymm3,%ymm4,%ymm3
- .byte 197,237,253,192 // vpaddw %ymm0,%ymm2,%ymm0
+ .byte 196,226,125,48,249 // vpmovzxbw %xmm1,%ymm7
+ .byte 196,227,125,57,201,1 // vextracti128 $0x1,%ymm1,%xmm1
+ .byte 196,226,125,48,201 // vpmovzxbw %xmm1,%ymm1
+ .byte 196,98,125,48,197 // vpmovzxbw %xmm5,%ymm8
+ .byte 196,227,125,57,237,1 // vextracti128 $0x1,%ymm5,%xmm5
+ .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5
+ .byte 196,98,125,48,204 // vpmovzxbw %xmm4,%ymm9
+ .byte 196,227,125,57,228,1 // vextracti128 $0x1,%ymm4,%xmm4
+ .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
+ .byte 197,221,213,225 // vpmullw %ymm1,%ymm4,%ymm4
+ .byte 197,53,213,207 // vpmullw %ymm7,%ymm9,%ymm9
+ .byte 197,213,213,232 // vpmullw %ymm0,%ymm5,%ymm5
+ .byte 197,61,213,198 // vpmullw %ymm6,%ymm8,%ymm8
+ .byte 197,189,253,246 // vpaddw %ymm6,%ymm8,%ymm6
+ .byte 197,213,253,192 // vpaddw %ymm0,%ymm5,%ymm0
+ .byte 197,181,253,239 // vpaddw %ymm7,%ymm9,%ymm5
+ .byte 197,221,253,201 // vpaddw %ymm1,%ymm4,%ymm1
+ .byte 197,245,113,209,8 // vpsrlw $0x8,%ymm1,%ymm1
+ .byte 197,221,113,213,8 // vpsrlw $0x8,%ymm5,%ymm4
.byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0
- .byte 197,237,113,211,8 // vpsrlw $0x8,%ymm3,%ymm2
- .byte 196,227,109,56,216,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm3
- .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- .byte 197,229,103,192 // vpackuswb %ymm0,%ymm3,%ymm0
+ .byte 197,213,113,214,8 // vpsrlw $0x8,%ymm6,%ymm5
+ .byte 196,227,85,56,240,1 // vinserti128 $0x1,%xmm0,%ymm5,%ymm6
+ .byte 196,227,85,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm5,%ymm0
+ .byte 197,205,103,192 // vpackuswb %ymm0,%ymm6,%ymm0
+ .byte 196,227,93,56,233,1 // vinserti128 $0x1,%xmm1,%ymm4,%ymm5
+ .byte 196,227,93,70,201,49 // vperm2i128 $0x31,%ymm1,%ymm4,%ymm1
+ .byte 197,213,103,201 // vpackuswb %ymm1,%ymm5,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -56184,7 +56207,9 @@ HIDDEN _sk_swap_rb_hsw_8bit
.globl _sk_swap_rb_hsw_8bit
FUNCTION(_sk_swap_rb_hsw_8bit)
_sk_swap_rb_hsw_8bit:
- .byte 196,226,125,0,5,26,18,0,0 // vpshufb 0x121a(%rip),%ymm0,%ymm0 # 1380 <_sk_xor__hsw_8bit+0x149>
+ .byte 197,253,111,37,176,38,0,0 // vmovdqa 0x26b0(%rip),%ymm4 # 2880 <_sk_xor__hsw_8bit+0x1f5>
+ .byte 196,226,125,0,196 // vpshufb %ymm4,%ymm0,%ymm0
+ .byte 196,226,117,0,204 // vpshufb %ymm4,%ymm1,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -56192,8 +56217,9 @@ HIDDEN _sk_invert_hsw_8bit
.globl _sk_invert_hsw_8bit
FUNCTION(_sk_invert_hsw_8bit)
_sk_invert_hsw_8bit:
- .byte 197,237,118,210 // vpcmpeqd %ymm2,%ymm2,%ymm2
- .byte 197,253,239,194 // vpxor %ymm2,%ymm0,%ymm0
+ .byte 197,221,118,228 // vpcmpeqd %ymm4,%ymm4,%ymm4
+ .byte 197,253,239,196 // vpxor %ymm4,%ymm0,%ymm0
+ .byte 197,245,239,204 // vpxor %ymm4,%ymm1,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -56201,172 +56227,644 @@ HIDDEN _sk_load_8888_hsw_8bit
.globl _sk_load_8888_hsw_8bit
FUNCTION(_sk_load_8888_hsw_8bit)
_sk_load_8888_hsw_8bit:
- .byte 76,99,15 // movslq (%rdi),%r9
- .byte 76,139,71,16 // mov 0x10(%rdi),%r8
+ .byte 76,99,7 // movslq (%rdi),%r8
+ .byte 76,139,79,16 // mov 0x10(%rdi),%r9
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 72,99,80,8 // movslq 0x8(%rax),%rdx
- .byte 72,99,79,8 // movslq 0x8(%rdi),%rcx
- .byte 72,15,175,202 // imul %rdx,%rcx
- .byte 72,193,225,2 // shl $0x2,%rcx
- .byte 72,3,8 // add (%rax),%rcx
- .byte 74,141,4,137 // lea (%rcx,%r9,4),%rax
- .byte 77,133,192 // test %r8,%r8
- .byte 117,8 // jne 1a3 <_sk_load_8888_hsw_8bit+0x2d>
- .byte 197,254,111,0 // vmovdqu (%rax),%ymm0
+ .byte 72,99,72,8 // movslq 0x8(%rax),%rcx
+ .byte 72,99,87,8 // movslq 0x8(%rdi),%rdx
+ .byte 72,15,175,209 // imul %rcx,%rdx
+ .byte 72,193,226,2 // shl $0x2,%rdx
+ .byte 72,3,16 // add (%rax),%rdx
+ .byte 77,133,201 // test %r9,%r9
+ .byte 117,17 // jne 220 <_sk_load_8888_hsw_8bit+0x32>
+ .byte 196,161,126,111,76,130,32 // vmovdqu 0x20(%rdx,%r8,4),%ymm1
+ .byte 196,161,126,111,4,130 // vmovdqu (%rdx,%r8,4),%ymm0
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 185,8,0,0,0 // mov $0x8,%ecx
- .byte 68,41,193 // sub %r8d,%ecx
- .byte 192,225,3 // shl $0x3,%cl
- .byte 72,199,194,255,255,255,255 // mov $0xffffffffffffffff,%rdx
- .byte 72,211,234 // shr %cl,%rdx
- .byte 196,225,249,110,194 // vmovq %rdx,%xmm0
- .byte 196,226,125,33,192 // vpmovsxbd %xmm0,%ymm0
- .byte 196,226,125,140,0 // vpmaskmovd (%rax),%ymm0,%ymm0
- .byte 235,214 // jmp 19f <_sk_load_8888_hsw_8bit+0x29>
+ .byte 65,128,225,15 // and $0xf,%r9b
+ .byte 197,245,239,201 // vpxor %ymm1,%ymm1,%ymm1
+ .byte 197,253,239,192 // vpxor %ymm0,%ymm0,%ymm0
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 119,231 // ja 21c <_sk_load_8888_hsw_8bit+0x2e>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,24,1,0,0 // lea 0x118(%rip),%rcx # 358 <_sk_load_8888_hsw_8bit+0x16a>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 196,161,121,110,4,130 // vmovd (%rdx,%r8,4),%xmm0
+ .byte 235,203 // jmp 21c <_sk_load_8888_hsw_8bit+0x2e>
+ .byte 196,161,121,110,68,130,8 // vmovd 0x8(%rdx,%r8,4),%xmm0
+ .byte 196,226,121,89,192 // vpbroadcastq %xmm0,%xmm0
+ .byte 197,245,239,201 // vpxor %ymm1,%ymm1,%ymm1
+ .byte 196,227,117,2,192,4 // vpblendd $0x4,%ymm0,%ymm1,%ymm0
+ .byte 196,162,121,53,36,130 // vpmovzxdq (%rdx,%r8,4),%xmm4
+ .byte 197,249,112,228,232 // vpshufd $0xe8,%xmm4,%xmm4
+ .byte 196,227,125,2,196,3 // vpblendd $0x3,%ymm4,%ymm0,%ymm0
+ .byte 235,162 // jmp 21c <_sk_load_8888_hsw_8bit+0x2e>
+ .byte 196,161,121,110,68,130,24 // vmovd 0x18(%rdx,%r8,4),%xmm0
+ .byte 196,226,125,89,192 // vpbroadcastq %xmm0,%ymm0
+ .byte 197,245,239,201 // vpxor %ymm1,%ymm1,%ymm1
+ .byte 196,227,117,2,192,64 // vpblendd $0x40,%ymm0,%ymm1,%ymm0
+ .byte 196,227,125,57,196,1 // vextracti128 $0x1,%ymm0,%xmm4
+ .byte 196,163,89,34,100,130,20,1 // vpinsrd $0x1,0x14(%rdx,%r8,4),%xmm4,%xmm4
+ .byte 196,227,125,56,196,1 // vinserti128 $0x1,%xmm4,%ymm0,%ymm0
+ .byte 196,227,125,57,196,1 // vextracti128 $0x1,%ymm0,%xmm4
+ .byte 196,163,89,34,100,130,16,0 // vpinsrd $0x0,0x10(%rdx,%r8,4),%xmm4,%xmm4
+ .byte 196,227,125,56,196,1 // vinserti128 $0x1,%xmm4,%ymm0,%ymm0
+ .byte 196,161,122,111,36,130 // vmovdqu (%rdx,%r8,4),%xmm4
+ .byte 196,227,93,2,192,240 // vpblendd $0xf0,%ymm0,%ymm4,%ymm0
+ .byte 233,83,255,255,255 // jmpq 21c <_sk_load_8888_hsw_8bit+0x2e>
+ .byte 196,161,121,110,68,130,40 // vmovd 0x28(%rdx,%r8,4),%xmm0
+ .byte 196,226,121,89,192 // vpbroadcastq %xmm0,%xmm0
+ .byte 197,245,239,201 // vpxor %ymm1,%ymm1,%ymm1
+ .byte 196,227,117,2,200,4 // vpblendd $0x4,%ymm0,%ymm1,%ymm1
+ .byte 196,163,113,34,68,130,36,1 // vpinsrd $0x1,0x24(%rdx,%r8,4),%xmm1,%xmm0
+ .byte 196,227,117,2,200,15 // vpblendd $0xf,%ymm0,%ymm1,%ymm1
+ .byte 196,161,121,110,68,130,32 // vmovd 0x20(%rdx,%r8,4),%xmm0
+ .byte 196,227,117,2,200,1 // vpblendd $0x1,%ymm0,%ymm1,%ymm1
+ .byte 233,23,255,255,255 // jmpq 216 <_sk_load_8888_hsw_8bit+0x28>
+ .byte 196,161,121,110,68,130,56 // vmovd 0x38(%rdx,%r8,4),%xmm0
+ .byte 196,226,125,89,192 // vpbroadcastq %xmm0,%ymm0
+ .byte 197,245,239,201 // vpxor %ymm1,%ymm1,%ymm1
+ .byte 196,227,117,2,200,64 // vpblendd $0x40,%ymm0,%ymm1,%ymm1
+ .byte 196,227,125,57,200,1 // vextracti128 $0x1,%ymm1,%xmm0
+ .byte 196,163,121,34,68,130,52,1 // vpinsrd $0x1,0x34(%rdx,%r8,4),%xmm0,%xmm0
+ .byte 196,227,117,56,200,1 // vinserti128 $0x1,%xmm0,%ymm1,%ymm1
+ .byte 196,227,125,57,200,1 // vextracti128 $0x1,%ymm1,%xmm0
+ .byte 196,163,121,34,68,130,48,0 // vpinsrd $0x0,0x30(%rdx,%r8,4),%xmm0,%xmm0
+ .byte 196,227,117,56,200,1 // vinserti128 $0x1,%xmm0,%ymm1,%ymm1
+ .byte 196,161,126,111,4,130 // vmovdqu (%rdx,%r8,4),%ymm0
+ .byte 196,161,122,111,100,130,32 // vmovdqu 0x20(%rdx,%r8,4),%xmm4
+ .byte 196,227,93,2,201,240 // vpblendd $0xf0,%ymm1,%ymm4,%ymm1
+ .byte 233,199,254,255,255 // jmpq 21c <_sk_load_8888_hsw_8bit+0x2e>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 241 // icebp
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,15 // decl (%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 249 // stc
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,96,255 // jmpq *-0x1(%rax)
+ .byte 255 // (bad)
+ .byte 255,76,255,255 // decl -0x1(%rdi,%rdi,8)
+ .byte 255 // (bad)
+ .byte 56,255 // cmp %bh,%bh
+ .byte 255 // (bad)
+ .byte 255,34 // jmpq *(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 190,254,255,255,149 // mov $0x95fffffe,%esi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,135,255,255,255,113 // incl 0x71ffffff(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,229 // jmpq *%rbp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,209 // callq *%rcx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 189,255,255,255,167 // mov $0xa7ffffff,%ebp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_load_8888_dst_hsw_8bit
.globl _sk_load_8888_dst_hsw_8bit
FUNCTION(_sk_load_8888_dst_hsw_8bit)
_sk_load_8888_dst_hsw_8bit:
- .byte 76,99,15 // movslq (%rdi),%r9
- .byte 76,139,71,16 // mov 0x10(%rdi),%r8
+ .byte 76,99,7 // movslq (%rdi),%r8
+ .byte 76,139,79,16 // mov 0x10(%rdi),%r9
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 72,99,80,8 // movslq 0x8(%rax),%rdx
- .byte 72,99,79,8 // movslq 0x8(%rdi),%rcx
- .byte 72,15,175,202 // imul %rdx,%rcx
- .byte 72,193,225,2 // shl $0x2,%rcx
- .byte 72,3,8 // add (%rax),%rcx
- .byte 74,141,4,137 // lea (%rcx,%r9,4),%rax
- .byte 77,133,192 // test %r8,%r8
- .byte 117,8 // jne 1f6 <_sk_load_8888_dst_hsw_8bit+0x2d>
- .byte 197,254,111,8 // vmovdqu (%rax),%ymm1
+ .byte 72,99,72,8 // movslq 0x8(%rax),%rcx
+ .byte 72,99,87,8 // movslq 0x8(%rdi),%rdx
+ .byte 72,15,175,209 // imul %rcx,%rdx
+ .byte 72,193,226,2 // shl $0x2,%rdx
+ .byte 72,3,16 // add (%rax),%rdx
+ .byte 77,133,201 // test %r9,%r9
+ .byte 117,17 // jne 3c6 <_sk_load_8888_dst_hsw_8bit+0x32>
+ .byte 196,161,126,111,92,130,32 // vmovdqu 0x20(%rdx,%r8,4),%ymm3
+ .byte 196,161,126,111,20,130 // vmovdqu (%rdx,%r8,4),%ymm2
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 185,8,0,0,0 // mov $0x8,%ecx
- .byte 68,41,193 // sub %r8d,%ecx
- .byte 192,225,3 // shl $0x3,%cl
- .byte 72,199,194,255,255,255,255 // mov $0xffffffffffffffff,%rdx
- .byte 72,211,234 // shr %cl,%rdx
- .byte 196,225,249,110,202 // vmovq %rdx,%xmm1
- .byte 196,226,125,33,201 // vpmovsxbd %xmm1,%ymm1
- .byte 196,226,117,140,8 // vpmaskmovd (%rax),%ymm1,%ymm1
- .byte 235,214 // jmp 1f2 <_sk_load_8888_dst_hsw_8bit+0x29>
+ .byte 65,128,225,15 // and $0xf,%r9b
+ .byte 197,229,239,219 // vpxor %ymm3,%ymm3,%ymm3
+ .byte 197,237,239,210 // vpxor %ymm2,%ymm2,%ymm2
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 119,231 // ja 3c2 <_sk_load_8888_dst_hsw_8bit+0x2e>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,22,1,0,0 // lea 0x116(%rip),%rcx # 4fc <_sk_load_8888_dst_hsw_8bit+0x168>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 196,161,121,110,20,130 // vmovd (%rdx,%r8,4),%xmm2
+ .byte 235,203 // jmp 3c2 <_sk_load_8888_dst_hsw_8bit+0x2e>
+ .byte 196,161,121,110,84,130,8 // vmovd 0x8(%rdx,%r8,4),%xmm2
+ .byte 196,226,121,89,210 // vpbroadcastq %xmm2,%xmm2
+ .byte 197,229,239,219 // vpxor %ymm3,%ymm3,%ymm3
+ .byte 196,227,101,2,210,4 // vpblendd $0x4,%ymm2,%ymm3,%ymm2
+ .byte 196,162,121,53,36,130 // vpmovzxdq (%rdx,%r8,4),%xmm4
+ .byte 197,249,112,228,232 // vpshufd $0xe8,%xmm4,%xmm4
+ .byte 196,227,109,2,212,3 // vpblendd $0x3,%ymm4,%ymm2,%ymm2
+ .byte 235,162 // jmp 3c2 <_sk_load_8888_dst_hsw_8bit+0x2e>
+ .byte 196,161,121,110,84,130,24 // vmovd 0x18(%rdx,%r8,4),%xmm2
+ .byte 196,226,125,89,210 // vpbroadcastq %xmm2,%ymm2
+ .byte 197,229,239,219 // vpxor %ymm3,%ymm3,%ymm3
+ .byte 196,227,101,2,210,64 // vpblendd $0x40,%ymm2,%ymm3,%ymm2
+ .byte 196,227,125,57,212,1 // vextracti128 $0x1,%ymm2,%xmm4
+ .byte 196,163,89,34,100,130,20,1 // vpinsrd $0x1,0x14(%rdx,%r8,4),%xmm4,%xmm4
+ .byte 196,227,109,56,212,1 // vinserti128 $0x1,%xmm4,%ymm2,%ymm2
+ .byte 196,227,125,57,212,1 // vextracti128 $0x1,%ymm2,%xmm4
+ .byte 196,163,89,34,100,130,16,0 // vpinsrd $0x0,0x10(%rdx,%r8,4),%xmm4,%xmm4
+ .byte 196,227,109,56,212,1 // vinserti128 $0x1,%xmm4,%ymm2,%ymm2
+ .byte 196,161,122,111,36,130 // vmovdqu (%rdx,%r8,4),%xmm4
+ .byte 196,227,93,2,210,240 // vpblendd $0xf0,%ymm2,%ymm4,%ymm2
+ .byte 233,83,255,255,255 // jmpq 3c2 <_sk_load_8888_dst_hsw_8bit+0x2e>
+ .byte 196,161,121,110,84,130,40 // vmovd 0x28(%rdx,%r8,4),%xmm2
+ .byte 196,226,121,89,210 // vpbroadcastq %xmm2,%xmm2
+ .byte 197,229,239,219 // vpxor %ymm3,%ymm3,%ymm3
+ .byte 196,227,101,2,218,4 // vpblendd $0x4,%ymm2,%ymm3,%ymm3
+ .byte 196,163,97,34,84,130,36,1 // vpinsrd $0x1,0x24(%rdx,%r8,4),%xmm3,%xmm2
+ .byte 196,227,101,2,218,15 // vpblendd $0xf,%ymm2,%ymm3,%ymm3
+ .byte 196,161,121,110,84,130,32 // vmovd 0x20(%rdx,%r8,4),%xmm2
+ .byte 196,227,101,2,218,1 // vpblendd $0x1,%ymm2,%ymm3,%ymm3
+ .byte 233,23,255,255,255 // jmpq 3bc <_sk_load_8888_dst_hsw_8bit+0x28>
+ .byte 196,161,121,110,84,130,56 // vmovd 0x38(%rdx,%r8,4),%xmm2
+ .byte 196,226,125,89,210 // vpbroadcastq %xmm2,%ymm2
+ .byte 197,229,239,219 // vpxor %ymm3,%ymm3,%ymm3
+ .byte 196,227,101,2,218,64 // vpblendd $0x40,%ymm2,%ymm3,%ymm3
+ .byte 196,227,125,57,218,1 // vextracti128 $0x1,%ymm3,%xmm2
+ .byte 196,163,105,34,84,130,52,1 // vpinsrd $0x1,0x34(%rdx,%r8,4),%xmm2,%xmm2
+ .byte 196,227,101,56,218,1 // vinserti128 $0x1,%xmm2,%ymm3,%ymm3
+ .byte 196,227,125,57,218,1 // vextracti128 $0x1,%ymm3,%xmm2
+ .byte 196,163,105,34,84,130,48,0 // vpinsrd $0x0,0x30(%rdx,%r8,4),%xmm2,%xmm2
+ .byte 196,227,101,56,218,1 // vinserti128 $0x1,%xmm2,%ymm3,%ymm3
+ .byte 196,161,126,111,20,130 // vmovdqu (%rdx,%r8,4),%ymm2
+ .byte 196,161,122,111,100,130,32 // vmovdqu 0x20(%rdx,%r8,4),%xmm4
+ .byte 196,227,93,2,219,240 // vpblendd $0xf0,%ymm3,%ymm4,%ymm3
+ .byte 233,199,254,255,255 // jmpq 3c2 <_sk_load_8888_dst_hsw_8bit+0x2e>
+ .byte 144 // nop
+ .byte 243,254 // repz (bad)
+ .byte 255 // (bad)
+ .byte 255,17 // callq *(%rcx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 251 // sti
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,98,255 // jmpq *-0x1(%rdx)
+ .byte 255 // (bad)
+ .byte 255,78,255 // decl -0x1(%rsi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 58,255 // cmp %bh,%bh
+ .byte 255 // (bad)
+ .byte 255,36,255 // jmpq *(%rdi,%rdi,8)
+ .byte 255 // (bad)
+ .byte 255,192 // inc %eax
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,151,255,255,255,137 // callq *-0x76000001(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,115,255 // pushq -0x1(%rbx)
+ .byte 255 // (bad)
+ .byte 255,231 // jmpq *%rdi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,211 // callq *%rbx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 191,255,255,255,169 // mov $0xa9ffffff,%edi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_store_8888_hsw_8bit
.globl _sk_store_8888_hsw_8bit
FUNCTION(_sk_store_8888_hsw_8bit)
_sk_store_8888_hsw_8bit:
- .byte 76,99,15 // movslq (%rdi),%r9
- .byte 76,139,71,16 // mov 0x10(%rdi),%r8
+ .byte 76,99,7 // movslq (%rdi),%r8
+ .byte 76,139,79,16 // mov 0x10(%rdi),%r9
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 72,99,80,8 // movslq 0x8(%rax),%rdx
- .byte 72,99,79,8 // movslq 0x8(%rdi),%rcx
- .byte 72,15,175,202 // imul %rdx,%rcx
- .byte 72,193,225,2 // shl $0x2,%rcx
- .byte 72,3,8 // add (%rax),%rcx
- .byte 74,141,4,137 // lea (%rcx,%r9,4),%rax
- .byte 77,133,192 // test %r8,%r8
- .byte 117,8 // jne 249 <_sk_store_8888_hsw_8bit+0x2d>
- .byte 197,254,127,0 // vmovdqu %ymm0,(%rax)
+ .byte 72,99,72,8 // movslq 0x8(%rax),%rcx
+ .byte 72,99,87,8 // movslq 0x8(%rdi),%rdx
+ .byte 72,15,175,209 // imul %rcx,%rdx
+ .byte 72,193,226,2 // shl $0x2,%rdx
+ .byte 72,3,16 // add (%rax),%rdx
+ .byte 77,133,201 // test %r9,%r9
+ .byte 117,17 // jne 56a <_sk_store_8888_hsw_8bit+0x32>
+ .byte 196,161,126,127,4,130 // vmovdqu %ymm0,(%rdx,%r8,4)
+ .byte 196,161,126,127,76,130,32 // vmovdqu %ymm1,0x20(%rdx,%r8,4)
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 185,8,0,0,0 // mov $0x8,%ecx
- .byte 68,41,193 // sub %r8d,%ecx
- .byte 192,225,3 // shl $0x3,%cl
- .byte 72,199,194,255,255,255,255 // mov $0xffffffffffffffff,%rdx
- .byte 72,211,234 // shr %cl,%rdx
- .byte 196,225,249,110,210 // vmovq %rdx,%xmm2
- .byte 196,226,125,33,210 // vpmovsxbd %xmm2,%ymm2
- .byte 196,226,109,142,0 // vpmaskmovd %ymm0,%ymm2,(%rax)
- .byte 235,214 // jmp 245 <_sk_store_8888_hsw_8bit+0x29>
+ .byte 65,128,225,15 // and $0xf,%r9b
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 119,239 // ja 566 <_sk_store_8888_hsw_8bit+0x2e>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,178,0,0,0 // lea 0xb2(%rip),%rcx # 634 <_sk_store_8888_hsw_8bit+0xfc>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 196,161,121,126,4,130 // vmovd %xmm0,(%rdx,%r8,4)
+ .byte 235,211 // jmp 566 <_sk_store_8888_hsw_8bit+0x2e>
+ .byte 196,163,121,22,68,130,8,2 // vpextrd $0x2,%xmm0,0x8(%rdx,%r8,4)
+ .byte 196,161,121,214,4,130 // vmovq %xmm0,(%rdx,%r8,4)
+ .byte 235,195 // jmp 566 <_sk_store_8888_hsw_8bit+0x2e>
+ .byte 196,227,125,57,196,1 // vextracti128 $0x1,%ymm0,%xmm4
+ .byte 196,163,121,22,100,130,24,2 // vpextrd $0x2,%xmm4,0x18(%rdx,%r8,4)
+ .byte 196,227,125,57,196,1 // vextracti128 $0x1,%ymm0,%xmm4
+ .byte 196,163,121,22,100,130,20,1 // vpextrd $0x1,%xmm4,0x14(%rdx,%r8,4)
+ .byte 196,227,125,57,196,1 // vextracti128 $0x1,%ymm0,%xmm4
+ .byte 196,161,121,126,100,130,16 // vmovd %xmm4,0x10(%rdx,%r8,4)
+ .byte 196,161,122,127,4,130 // vmovdqu %xmm0,(%rdx,%r8,4)
+ .byte 235,146 // jmp 566 <_sk_store_8888_hsw_8bit+0x2e>
+ .byte 196,163,121,22,76,130,40,2 // vpextrd $0x2,%xmm1,0x28(%rdx,%r8,4)
+ .byte 196,163,121,22,76,130,36,1 // vpextrd $0x1,%xmm1,0x24(%rdx,%r8,4)
+ .byte 196,161,121,126,76,130,32 // vmovd %xmm1,0x20(%rdx,%r8,4)
+ .byte 196,161,126,127,4,130 // vmovdqu %ymm0,(%rdx,%r8,4)
+ .byte 233,112,255,255,255 // jmpq 566 <_sk_store_8888_hsw_8bit+0x2e>
+ .byte 196,227,125,57,204,1 // vextracti128 $0x1,%ymm1,%xmm4
+ .byte 196,163,121,22,100,130,56,2 // vpextrd $0x2,%xmm4,0x38(%rdx,%r8,4)
+ .byte 196,227,125,57,204,1 // vextracti128 $0x1,%ymm1,%xmm4
+ .byte 196,163,121,22,100,130,52,1 // vpextrd $0x1,%xmm4,0x34(%rdx,%r8,4)
+ .byte 196,227,125,57,204,1 // vextracti128 $0x1,%ymm1,%xmm4
+ .byte 196,161,121,126,100,130,48 // vmovd %xmm4,0x30(%rdx,%r8,4)
+ .byte 196,161,126,127,4,130 // vmovdqu %ymm0,(%rdx,%r8,4)
+ .byte 196,161,122,127,76,130,32 // vmovdqu %xmm1,0x20(%rdx,%r8,4)
+ .byte 233,53,255,255,255 // jmpq 566 <_sk_store_8888_hsw_8bit+0x2e>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 87 // push %rdi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,103,255 // jmpq *-0x1(%rdi)
+ .byte 255 // (bad)
+ .byte 255,95,255 // lcall *-0x1(%rdi)
+ .byte 255 // (bad)
+ .byte 255,152,255,255,255,139 // lcall *-0x74000001(%rax)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 125,255 // jge 649 <_sk_store_8888_hsw_8bit+0x111>
+ .byte 255 // (bad)
+ .byte 255,111,255 // ljmp *-0x1(%rdi)
+ .byte 255 // (bad)
+ .byte 255,183,255,255,255,176 // pushq -0x4f000001(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,168,255,255,255,160 // ljmp *-0x5f000001(%rax)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 235,255 // jmp 661 <_sk_store_8888_hsw_8bit+0x129>
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 222,255 // fdivrp %st,%st(7)
+ .byte 255 // (bad)
+ .byte 255,208 // callq *%rax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,194 // inc %edx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_load_bgra_hsw_8bit
.globl _sk_load_bgra_hsw_8bit
FUNCTION(_sk_load_bgra_hsw_8bit)
_sk_load_bgra_hsw_8bit:
- .byte 76,99,15 // movslq (%rdi),%r9
- .byte 76,139,71,16 // mov 0x10(%rdi),%r8
+ .byte 76,99,7 // movslq (%rdi),%r8
+ .byte 76,139,79,16 // mov 0x10(%rdi),%r9
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 72,99,80,8 // movslq 0x8(%rax),%rdx
- .byte 72,99,79,8 // movslq 0x8(%rdi),%rcx
- .byte 72,15,175,202 // imul %rdx,%rcx
- .byte 72,193,225,2 // shl $0x2,%rcx
- .byte 72,3,8 // add (%rax),%rcx
- .byte 74,141,4,137 // lea (%rcx,%r9,4),%rax
- .byte 77,133,192 // test %r8,%r8
- .byte 117,17 // jne 2a5 <_sk_load_bgra_hsw_8bit+0x36>
- .byte 197,254,111,0 // vmovdqu (%rax),%ymm0
- .byte 196,226,125,0,5,255,16,0,0 // vpshufb 0x10ff(%rip),%ymm0,%ymm0 # 13a0 <_sk_xor__hsw_8bit+0x169>
+ .byte 72,99,72,8 // movslq 0x8(%rax),%rcx
+ .byte 72,99,87,8 // movslq 0x8(%rdi),%rdx
+ .byte 72,15,175,209 // imul %rcx,%rdx
+ .byte 72,193,226,2 // shl $0x2,%rdx
+ .byte 72,3,16 // add (%rax),%rdx
+ .byte 77,133,201 // test %r9,%r9
+ .byte 117,35 // jne 6b4 <_sk_load_bgra_hsw_8bit+0x44>
+ .byte 196,161,126,111,76,130,32 // vmovdqu 0x20(%rdx,%r8,4),%ymm1
+ .byte 196,161,126,111,4,130 // vmovdqu (%rdx,%r8,4),%ymm0
+ .byte 197,253,111,37,250,33,0,0 // vmovdqa 0x21fa(%rip),%ymm4 # 28a0 <_sk_xor__hsw_8bit+0x215>
+ .byte 196,226,125,0,196 // vpshufb %ymm4,%ymm0,%ymm0
+ .byte 196,226,117,0,204 // vpshufb %ymm4,%ymm1,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 185,8,0,0,0 // mov $0x8,%ecx
- .byte 68,41,193 // sub %r8d,%ecx
- .byte 192,225,3 // shl $0x3,%cl
- .byte 72,199,194,255,255,255,255 // mov $0xffffffffffffffff,%rdx
- .byte 72,211,234 // shr %cl,%rdx
- .byte 196,225,249,110,194 // vmovq %rdx,%xmm0
- .byte 196,226,125,33,192 // vpmovsxbd %xmm0,%ymm0
- .byte 196,226,125,140,0 // vpmaskmovd (%rax),%ymm0,%ymm0
- .byte 235,205 // jmp 298 <_sk_load_bgra_hsw_8bit+0x29>
+ .byte 65,128,225,15 // and $0xf,%r9b
+ .byte 197,245,239,201 // vpxor %ymm1,%ymm1,%ymm1
+ .byte 197,253,239,192 // vpxor %ymm0,%ymm0,%ymm0
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 119,213 // ja 69e <_sk_load_bgra_hsw_8bit+0x2e>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,24,1,0,0 // lea 0x118(%rip),%rcx # 7ec <_sk_load_bgra_hsw_8bit+0x17c>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 196,161,121,110,4,130 // vmovd (%rdx,%r8,4),%xmm0
+ .byte 235,185 // jmp 69e <_sk_load_bgra_hsw_8bit+0x2e>
+ .byte 196,161,121,110,68,130,8 // vmovd 0x8(%rdx,%r8,4),%xmm0
+ .byte 196,226,121,89,192 // vpbroadcastq %xmm0,%xmm0
+ .byte 197,245,239,201 // vpxor %ymm1,%ymm1,%ymm1
+ .byte 196,227,117,2,192,4 // vpblendd $0x4,%ymm0,%ymm1,%ymm0
+ .byte 196,162,121,53,36,130 // vpmovzxdq (%rdx,%r8,4),%xmm4
+ .byte 197,249,112,228,232 // vpshufd $0xe8,%xmm4,%xmm4
+ .byte 196,227,125,2,196,3 // vpblendd $0x3,%ymm4,%ymm0,%ymm0
+ .byte 235,144 // jmp 69e <_sk_load_bgra_hsw_8bit+0x2e>
+ .byte 196,161,121,110,68,130,24 // vmovd 0x18(%rdx,%r8,4),%xmm0
+ .byte 196,226,125,89,192 // vpbroadcastq %xmm0,%ymm0
+ .byte 197,245,239,201 // vpxor %ymm1,%ymm1,%ymm1
+ .byte 196,227,117,2,192,64 // vpblendd $0x40,%ymm0,%ymm1,%ymm0
+ .byte 196,227,125,57,196,1 // vextracti128 $0x1,%ymm0,%xmm4
+ .byte 196,163,89,34,100,130,20,1 // vpinsrd $0x1,0x14(%rdx,%r8,4),%xmm4,%xmm4
+ .byte 196,227,125,56,196,1 // vinserti128 $0x1,%xmm4,%ymm0,%ymm0
+ .byte 196,227,125,57,196,1 // vextracti128 $0x1,%ymm0,%xmm4
+ .byte 196,163,89,34,100,130,16,0 // vpinsrd $0x0,0x10(%rdx,%r8,4),%xmm4,%xmm4
+ .byte 196,227,125,56,196,1 // vinserti128 $0x1,%xmm4,%ymm0,%ymm0
+ .byte 196,161,122,111,36,130 // vmovdqu (%rdx,%r8,4),%xmm4
+ .byte 196,227,93,2,192,240 // vpblendd $0xf0,%ymm0,%ymm4,%ymm0
+ .byte 233,65,255,255,255 // jmpq 69e <_sk_load_bgra_hsw_8bit+0x2e>
+ .byte 196,161,121,110,68,130,40 // vmovd 0x28(%rdx,%r8,4),%xmm0
+ .byte 196,226,121,89,192 // vpbroadcastq %xmm0,%xmm0
+ .byte 197,245,239,201 // vpxor %ymm1,%ymm1,%ymm1
+ .byte 196,227,117,2,200,4 // vpblendd $0x4,%ymm0,%ymm1,%ymm1
+ .byte 196,163,113,34,68,130,36,1 // vpinsrd $0x1,0x24(%rdx,%r8,4),%xmm1,%xmm0
+ .byte 196,227,117,2,200,15 // vpblendd $0xf,%ymm0,%ymm1,%ymm1
+ .byte 196,161,121,110,68,130,32 // vmovd 0x20(%rdx,%r8,4),%xmm0
+ .byte 196,227,117,2,200,1 // vpblendd $0x1,%ymm0,%ymm1,%ymm1
+ .byte 233,5,255,255,255 // jmpq 698 <_sk_load_bgra_hsw_8bit+0x28>
+ .byte 196,161,121,110,68,130,56 // vmovd 0x38(%rdx,%r8,4),%xmm0
+ .byte 196,226,125,89,192 // vpbroadcastq %xmm0,%ymm0
+ .byte 197,245,239,201 // vpxor %ymm1,%ymm1,%ymm1
+ .byte 196,227,117,2,200,64 // vpblendd $0x40,%ymm0,%ymm1,%ymm1
+ .byte 196,227,125,57,200,1 // vextracti128 $0x1,%ymm1,%xmm0
+ .byte 196,163,121,34,68,130,52,1 // vpinsrd $0x1,0x34(%rdx,%r8,4),%xmm0,%xmm0
+ .byte 196,227,117,56,200,1 // vinserti128 $0x1,%xmm0,%ymm1,%ymm1
+ .byte 196,227,125,57,200,1 // vextracti128 $0x1,%ymm1,%xmm0
+ .byte 196,163,121,34,68,130,48,0 // vpinsrd $0x0,0x30(%rdx,%r8,4),%xmm0,%xmm0
+ .byte 196,227,117,56,200,1 // vinserti128 $0x1,%xmm0,%ymm1,%ymm1
+ .byte 196,161,126,111,4,130 // vmovdqu (%rdx,%r8,4),%ymm0
+ .byte 196,161,122,111,100,130,32 // vmovdqu 0x20(%rdx,%r8,4),%xmm4
+ .byte 196,227,93,2,201,240 // vpblendd $0xf0,%ymm1,%ymm4,%ymm1
+ .byte 233,181,254,255,255 // jmpq 69e <_sk_load_bgra_hsw_8bit+0x2e>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 241 // icebp
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,15 // decl (%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 249 // stc
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,96,255 // jmpq *-0x1(%rax)
+ .byte 255 // (bad)
+ .byte 255,76,255,255 // decl -0x1(%rdi,%rdi,8)
+ .byte 255 // (bad)
+ .byte 56,255 // cmp %bh,%bh
+ .byte 255 // (bad)
+ .byte 255,34 // jmpq *(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,172,254,255,255,149,255 // ljmp *-0x6a0001(%rsi,%rdi,8)
+ .byte 255 // (bad)
+ .byte 255,135,255,255,255,113 // incl 0x71ffffff(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,229 // jmpq *%rbp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,209 // callq *%rcx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 189,255,255,255,167 // mov $0xa7ffffff,%ebp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_load_bgra_dst_hsw_8bit
.globl _sk_load_bgra_dst_hsw_8bit
FUNCTION(_sk_load_bgra_dst_hsw_8bit)
_sk_load_bgra_dst_hsw_8bit:
- .byte 76,99,15 // movslq (%rdi),%r9
- .byte 76,139,71,16 // mov 0x10(%rdi),%r8
+ .byte 76,99,7 // movslq (%rdi),%r8
+ .byte 76,139,79,16 // mov 0x10(%rdi),%r9
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 72,99,80,8 // movslq 0x8(%rax),%rdx
- .byte 72,99,79,8 // movslq 0x8(%rdi),%rcx
- .byte 72,15,175,202 // imul %rdx,%rcx
- .byte 72,193,225,2 // shl $0x2,%rcx
- .byte 72,3,8 // add (%rax),%rcx
- .byte 74,141,4,137 // lea (%rcx,%r9,4),%rax
- .byte 77,133,192 // test %r8,%r8
- .byte 117,17 // jne 301 <_sk_load_bgra_dst_hsw_8bit+0x36>
- .byte 197,254,111,8 // vmovdqu (%rax),%ymm1
- .byte 196,226,117,0,13,195,16,0,0 // vpshufb 0x10c3(%rip),%ymm1,%ymm1 # 13c0 <_sk_xor__hsw_8bit+0x189>
+ .byte 72,99,72,8 // movslq 0x8(%rax),%rcx
+ .byte 72,99,87,8 // movslq 0x8(%rdi),%rdx
+ .byte 72,15,175,209 // imul %rcx,%rdx
+ .byte 72,193,226,2 // shl $0x2,%rdx
+ .byte 72,3,16 // add (%rax),%rdx
+ .byte 77,133,201 // test %r9,%r9
+ .byte 117,35 // jne 86c <_sk_load_bgra_dst_hsw_8bit+0x44>
+ .byte 196,161,126,111,92,130,32 // vmovdqu 0x20(%rdx,%r8,4),%ymm3
+ .byte 196,161,126,111,20,130 // vmovdqu (%rdx,%r8,4),%ymm2
+ .byte 197,253,111,37,98,32,0,0 // vmovdqa 0x2062(%rip),%ymm4 # 28c0 <_sk_xor__hsw_8bit+0x235>
+ .byte 196,226,109,0,212 // vpshufb %ymm4,%ymm2,%ymm2
+ .byte 196,226,101,0,220 // vpshufb %ymm4,%ymm3,%ymm3
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 185,8,0,0,0 // mov $0x8,%ecx
- .byte 68,41,193 // sub %r8d,%ecx
- .byte 192,225,3 // shl $0x3,%cl
- .byte 72,199,194,255,255,255,255 // mov $0xffffffffffffffff,%rdx
- .byte 72,211,234 // shr %cl,%rdx
- .byte 196,225,249,110,202 // vmovq %rdx,%xmm1
- .byte 196,226,125,33,201 // vpmovsxbd %xmm1,%ymm1
- .byte 196,226,117,140,8 // vpmaskmovd (%rax),%ymm1,%ymm1
- .byte 235,205 // jmp 2f4 <_sk_load_bgra_dst_hsw_8bit+0x29>
+ .byte 65,128,225,15 // and $0xf,%r9b
+ .byte 197,229,239,219 // vpxor %ymm3,%ymm3,%ymm3
+ .byte 197,237,239,210 // vpxor %ymm2,%ymm2,%ymm2
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 119,213 // ja 856 <_sk_load_bgra_dst_hsw_8bit+0x2e>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,24,1,0,0 // lea 0x118(%rip),%rcx # 9a4 <_sk_load_bgra_dst_hsw_8bit+0x17c>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 196,161,121,110,20,130 // vmovd (%rdx,%r8,4),%xmm2
+ .byte 235,185 // jmp 856 <_sk_load_bgra_dst_hsw_8bit+0x2e>
+ .byte 196,161,121,110,84,130,8 // vmovd 0x8(%rdx,%r8,4),%xmm2
+ .byte 196,226,121,89,210 // vpbroadcastq %xmm2,%xmm2
+ .byte 197,229,239,219 // vpxor %ymm3,%ymm3,%ymm3
+ .byte 196,227,101,2,210,4 // vpblendd $0x4,%ymm2,%ymm3,%ymm2
+ .byte 196,162,121,53,36,130 // vpmovzxdq (%rdx,%r8,4),%xmm4
+ .byte 197,249,112,228,232 // vpshufd $0xe8,%xmm4,%xmm4
+ .byte 196,227,109,2,212,3 // vpblendd $0x3,%ymm4,%ymm2,%ymm2
+ .byte 235,144 // jmp 856 <_sk_load_bgra_dst_hsw_8bit+0x2e>
+ .byte 196,161,121,110,84,130,24 // vmovd 0x18(%rdx,%r8,4),%xmm2
+ .byte 196,226,125,89,210 // vpbroadcastq %xmm2,%ymm2
+ .byte 197,229,239,219 // vpxor %ymm3,%ymm3,%ymm3
+ .byte 196,227,101,2,210,64 // vpblendd $0x40,%ymm2,%ymm3,%ymm2
+ .byte 196,227,125,57,212,1 // vextracti128 $0x1,%ymm2,%xmm4
+ .byte 196,163,89,34,100,130,20,1 // vpinsrd $0x1,0x14(%rdx,%r8,4),%xmm4,%xmm4
+ .byte 196,227,109,56,212,1 // vinserti128 $0x1,%xmm4,%ymm2,%ymm2
+ .byte 196,227,125,57,212,1 // vextracti128 $0x1,%ymm2,%xmm4
+ .byte 196,163,89,34,100,130,16,0 // vpinsrd $0x0,0x10(%rdx,%r8,4),%xmm4,%xmm4
+ .byte 196,227,109,56,212,1 // vinserti128 $0x1,%xmm4,%ymm2,%ymm2
+ .byte 196,161,122,111,36,130 // vmovdqu (%rdx,%r8,4),%xmm4
+ .byte 196,227,93,2,210,240 // vpblendd $0xf0,%ymm2,%ymm4,%ymm2
+ .byte 233,65,255,255,255 // jmpq 856 <_sk_load_bgra_dst_hsw_8bit+0x2e>
+ .byte 196,161,121,110,84,130,40 // vmovd 0x28(%rdx,%r8,4),%xmm2
+ .byte 196,226,121,89,210 // vpbroadcastq %xmm2,%xmm2
+ .byte 197,229,239,219 // vpxor %ymm3,%ymm3,%ymm3
+ .byte 196,227,101,2,218,4 // vpblendd $0x4,%ymm2,%ymm3,%ymm3
+ .byte 196,163,97,34,84,130,36,1 // vpinsrd $0x1,0x24(%rdx,%r8,4),%xmm3,%xmm2
+ .byte 196,227,101,2,218,15 // vpblendd $0xf,%ymm2,%ymm3,%ymm3
+ .byte 196,161,121,110,84,130,32 // vmovd 0x20(%rdx,%r8,4),%xmm2
+ .byte 196,227,101,2,218,1 // vpblendd $0x1,%ymm2,%ymm3,%ymm3
+ .byte 233,5,255,255,255 // jmpq 850 <_sk_load_bgra_dst_hsw_8bit+0x28>
+ .byte 196,161,121,110,84,130,56 // vmovd 0x38(%rdx,%r8,4),%xmm2
+ .byte 196,226,125,89,210 // vpbroadcastq %xmm2,%ymm2
+ .byte 197,229,239,219 // vpxor %ymm3,%ymm3,%ymm3
+ .byte 196,227,101,2,218,64 // vpblendd $0x40,%ymm2,%ymm3,%ymm3
+ .byte 196,227,125,57,218,1 // vextracti128 $0x1,%ymm3,%xmm2
+ .byte 196,163,105,34,84,130,52,1 // vpinsrd $0x1,0x34(%rdx,%r8,4),%xmm2,%xmm2
+ .byte 196,227,101,56,218,1 // vinserti128 $0x1,%xmm2,%ymm3,%ymm3
+ .byte 196,227,125,57,218,1 // vextracti128 $0x1,%ymm3,%xmm2
+ .byte 196,163,105,34,84,130,48,0 // vpinsrd $0x0,0x30(%rdx,%r8,4),%xmm2,%xmm2
+ .byte 196,227,101,56,218,1 // vinserti128 $0x1,%xmm2,%ymm3,%ymm3
+ .byte 196,161,126,111,20,130 // vmovdqu (%rdx,%r8,4),%ymm2
+ .byte 196,161,122,111,100,130,32 // vmovdqu 0x20(%rdx,%r8,4),%xmm4
+ .byte 196,227,93,2,219,240 // vpblendd $0xf0,%ymm3,%ymm4,%ymm3
+ .byte 233,181,254,255,255 // jmpq 856 <_sk_load_bgra_dst_hsw_8bit+0x2e>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 241 // icebp
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,15 // decl (%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 249 // stc
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,96,255 // jmpq *-0x1(%rax)
+ .byte 255 // (bad)
+ .byte 255,76,255,255 // decl -0x1(%rdi,%rdi,8)
+ .byte 255 // (bad)
+ .byte 56,255 // cmp %bh,%bh
+ .byte 255 // (bad)
+ .byte 255,34 // jmpq *(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,172,254,255,255,149,255 // ljmp *-0x6a0001(%rsi,%rdi,8)
+ .byte 255 // (bad)
+ .byte 255,135,255,255,255,113 // incl 0x71ffffff(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,229 // jmpq *%rbp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,209 // callq *%rcx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 189,255,255,255,167 // mov $0xa7ffffff,%ebp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_store_bgra_hsw_8bit
.globl _sk_store_bgra_hsw_8bit
FUNCTION(_sk_store_bgra_hsw_8bit)
_sk_store_bgra_hsw_8bit:
- .byte 76,99,15 // movslq (%rdi),%r9
- .byte 76,139,71,16 // mov 0x10(%rdi),%r8
+ .byte 76,99,7 // movslq (%rdi),%r8
+ .byte 76,139,79,16 // mov 0x10(%rdi),%r9
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 72,99,80,8 // movslq 0x8(%rax),%rdx
- .byte 72,99,79,8 // movslq 0x8(%rdi),%rcx
- .byte 72,15,175,202 // imul %rdx,%rcx
- .byte 72,193,225,2 // shl $0x2,%rcx
- .byte 72,3,8 // add (%rax),%rcx
- .byte 74,141,4,137 // lea (%rcx,%r9,4),%rax
- .byte 196,226,125,0,21,144,16,0,0 // vpshufb 0x1090(%rip),%ymm0,%ymm2 # 13e0 <_sk_xor__hsw_8bit+0x1a9>
- .byte 77,133,192 // test %r8,%r8
- .byte 117,8 // jne 35d <_sk_store_bgra_hsw_8bit+0x36>
- .byte 197,254,127,16 // vmovdqu %ymm2,(%rax)
+ .byte 72,99,72,8 // movslq 0x8(%rax),%rcx
+ .byte 72,99,87,8 // movslq 0x8(%rdi),%rdx
+ .byte 72,15,175,209 // imul %rcx,%rdx
+ .byte 72,193,226,2 // shl $0x2,%rdx
+ .byte 72,3,16 // add (%rax),%rdx
+ .byte 197,253,111,37,220,30,0,0 // vmovdqa 0x1edc(%rip),%ymm4 # 28e0 <_sk_xor__hsw_8bit+0x255>
+ .byte 196,226,117,0,236 // vpshufb %ymm4,%ymm1,%ymm5
+ .byte 196,226,125,0,228 // vpshufb %ymm4,%ymm0,%ymm4
+ .byte 77,133,201 // test %r9,%r9
+ .byte 117,17 // jne a24 <_sk_store_bgra_hsw_8bit+0x44>
+ .byte 196,161,126,127,36,130 // vmovdqu %ymm4,(%rdx,%r8,4)
+ .byte 196,161,126,127,108,130,32 // vmovdqu %ymm5,0x20(%rdx,%r8,4)
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 185,8,0,0,0 // mov $0x8,%ecx
- .byte 68,41,193 // sub %r8d,%ecx
- .byte 192,225,3 // shl $0x3,%cl
- .byte 72,199,194,255,255,255,255 // mov $0xffffffffffffffff,%rdx
- .byte 72,211,234 // shr %cl,%rdx
- .byte 196,225,249,110,218 // vmovq %rdx,%xmm3
- .byte 196,226,125,33,219 // vpmovsxbd %xmm3,%ymm3
- .byte 196,226,101,142,16 // vpmaskmovd %ymm2,%ymm3,(%rax)
- .byte 235,214 // jmp 359 <_sk_store_bgra_hsw_8bit+0x32>
+ .byte 65,128,225,15 // and $0xf,%r9b
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 119,239 // ja a20 <_sk_store_bgra_hsw_8bit+0x40>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,176,0,0,0 // lea 0xb0(%rip),%rcx # aec <_sk_store_bgra_hsw_8bit+0x10c>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 196,161,121,126,36,130 // vmovd %xmm4,(%rdx,%r8,4)
+ .byte 235,211 // jmp a20 <_sk_store_bgra_hsw_8bit+0x40>
+ .byte 196,163,121,22,100,130,8,2 // vpextrd $0x2,%xmm4,0x8(%rdx,%r8,4)
+ .byte 196,161,121,214,36,130 // vmovq %xmm4,(%rdx,%r8,4)
+ .byte 235,195 // jmp a20 <_sk_store_bgra_hsw_8bit+0x40>
+ .byte 196,227,125,57,229,1 // vextracti128 $0x1,%ymm4,%xmm5
+ .byte 196,163,121,22,108,130,24,2 // vpextrd $0x2,%xmm5,0x18(%rdx,%r8,4)
+ .byte 196,227,125,57,229,1 // vextracti128 $0x1,%ymm4,%xmm5
+ .byte 196,163,121,22,108,130,20,1 // vpextrd $0x1,%xmm5,0x14(%rdx,%r8,4)
+ .byte 196,227,125,57,229,1 // vextracti128 $0x1,%ymm4,%xmm5
+ .byte 196,161,121,126,108,130,16 // vmovd %xmm5,0x10(%rdx,%r8,4)
+ .byte 196,161,122,127,36,130 // vmovdqu %xmm4,(%rdx,%r8,4)
+ .byte 235,146 // jmp a20 <_sk_store_bgra_hsw_8bit+0x40>
+ .byte 196,163,121,22,108,130,40,2 // vpextrd $0x2,%xmm5,0x28(%rdx,%r8,4)
+ .byte 196,163,121,22,108,130,36,1 // vpextrd $0x1,%xmm5,0x24(%rdx,%r8,4)
+ .byte 196,161,121,126,108,130,32 // vmovd %xmm5,0x20(%rdx,%r8,4)
+ .byte 196,161,126,127,36,130 // vmovdqu %ymm4,(%rdx,%r8,4)
+ .byte 233,112,255,255,255 // jmpq a20 <_sk_store_bgra_hsw_8bit+0x40>
+ .byte 196,227,125,57,238,1 // vextracti128 $0x1,%ymm5,%xmm6
+ .byte 196,163,121,22,116,130,56,2 // vpextrd $0x2,%xmm6,0x38(%rdx,%r8,4)
+ .byte 196,227,125,57,238,1 // vextracti128 $0x1,%ymm5,%xmm6
+ .byte 196,163,121,22,116,130,52,1 // vpextrd $0x1,%xmm6,0x34(%rdx,%r8,4)
+ .byte 196,227,125,57,238,1 // vextracti128 $0x1,%ymm5,%xmm6
+ .byte 196,161,121,126,116,130,48 // vmovd %xmm6,0x30(%rdx,%r8,4)
+ .byte 196,161,126,127,36,130 // vmovdqu %ymm4,(%rdx,%r8,4)
+ .byte 196,161,122,127,108,130,32 // vmovdqu %xmm5,0x20(%rdx,%r8,4)
+ .byte 233,53,255,255,255 // jmpq a20 <_sk_store_bgra_hsw_8bit+0x40>
+ .byte 144 // nop
+ .byte 89 // pop %rcx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,105,255 // ljmp *-0x1(%rcx)
+ .byte 255 // (bad)
+ .byte 255,97,255 // jmpq *-0x1(%rcx)
+ .byte 255 // (bad)
+ .byte 255,154,255,255,255,141 // lcall *-0x72000001(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 127,255 // jg b01 <_sk_store_bgra_hsw_8bit+0x121>
+ .byte 255 // (bad)
+ .byte 255,113,255 // pushq -0x1(%rcx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 185,255,255,255,178 // mov $0xb2ffffff,%ecx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,170,255,255,255,162 // ljmp *-0x5d000001(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 237 // in (%dx),%eax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,224 // jmpq *%rax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,210 // callq *%rdx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,196 // inc %esp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_load_a8_hsw_8bit
.globl _sk_load_a8_hsw_8bit
@@ -56380,62 +56878,86 @@ _sk_load_a8_hsw_8bit:
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,28 // jne 3bc <_sk_load_a8_hsw_8bit+0x39>
- .byte 196,162,121,48,4,2 // vpmovzxbw (%rdx,%r8,1),%xmm0
- .byte 197,249,219,5,82,18,0,0 // vpand 0x1252(%rip),%xmm0,%xmm0 # 1600 <_sk_xor__hsw_8bit+0x3c9>
- .byte 196,226,125,51,192 // vpmovzxwd %xmm0,%ymm0
+ .byte 117,35 // jne b68 <_sk_load_a8_hsw_8bit+0x40>
+ .byte 196,161,122,111,4,2 // vmovdqu (%rdx,%r8,1),%xmm0
+ .byte 197,249,112,200,78 // vpshufd $0x4e,%xmm0,%xmm1
+ .byte 196,226,125,49,201 // vpmovzxbd %xmm1,%ymm1
+ .byte 196,226,125,49,192 // vpmovzxbd %xmm0,%ymm0
.byte 197,253,114,240,24 // vpslld $0x18,%ymm0,%ymm0
+ .byte 197,245,114,241,24 // vpslld $0x18,%ymm1,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 65,128,225,15 // and $0xf,%r9b
.byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
.byte 65,254,201 // dec %r9b
- .byte 65,128,249,6 // cmp $0x6,%r9b
- .byte 119,217 // ja 3a6 <_sk_load_a8_hsw_8bit+0x23>
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 119,210 // ja b4b <_sk_load_a8_hsw_8bit+0x23>
.byte 65,15,182,193 // movzbl %r9b,%eax
- .byte 72,141,13,116,0,0,0 // lea 0x74(%rip),%rcx # 44c <_sk_load_a8_hsw_8bit+0xc9>
+ .byte 72,141,13,192,0,0,0 // lea 0xc0(%rip),%rcx # c44 <_sk_load_a8_hsw_8bit+0x11c>
.byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
.byte 72,1,200 // add %rcx,%rax
.byte 255,224 // jmpq *%rax
.byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
.byte 197,249,110,192 // vmovd %eax,%xmm0
- .byte 235,186 // jmp 3a6 <_sk_load_a8_hsw_8bit+0x23>
- .byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
+ .byte 235,179 // jmp b4b <_sk_load_a8_hsw_8bit+0x23>
.byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
- .byte 197,249,196,192,2 // vpinsrw $0x2,%eax,%xmm0,%xmm0
+ .byte 196,163,121,32,68,2,2,2 // vpinsrb $0x2,0x2(%rdx,%r8,1),%xmm0,%xmm0
.byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
- .byte 197,249,110,208 // vmovd %eax,%xmm2
- .byte 196,226,121,48,210 // vpmovzxbw %xmm2,%xmm2
- .byte 196,227,121,2,194,1 // vpblendd $0x1,%xmm2,%xmm0,%xmm0
- .byte 235,149 // jmp 3a6 <_sk_load_a8_hsw_8bit+0x23>
- .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax
+ .byte 197,249,110,200 // vmovd %eax,%xmm1
+ .byte 196,227,121,14,193,1 // vpblendw $0x1,%xmm1,%xmm0,%xmm0
+ .byte 235,150 // jmp b4b <_sk_load_a8_hsw_8bit+0x23>
.byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
- .byte 197,249,196,192,6 // vpinsrw $0x6,%eax,%xmm0,%xmm0
- .byte 66,15,182,68,2,5 // movzbl 0x5(%rdx,%r8,1),%eax
- .byte 197,249,196,192,5 // vpinsrw $0x5,%eax,%xmm0,%xmm0
- .byte 66,15,182,68,2,4 // movzbl 0x4(%rdx,%r8,1),%eax
- .byte 197,249,196,192,4 // vpinsrw $0x4,%eax,%xmm0,%xmm0
- .byte 196,161,121,110,20,2 // vmovd (%rdx,%r8,1),%xmm2
- .byte 196,226,121,48,210 // vpmovzxbw %xmm2,%xmm2
- .byte 196,227,105,2,192,12 // vpblendd $0xc,%xmm0,%xmm2,%xmm0
- .byte 233,90,255,255,255 // jmpq 3a6 <_sk_load_a8_hsw_8bit+0x23>
- .byte 149 // xchg %eax,%ebp
+ .byte 196,163,121,32,68,2,6,6 // vpinsrb $0x6,0x6(%rdx,%r8,1),%xmm0,%xmm0
+ .byte 196,163,121,32,68,2,5,5 // vpinsrb $0x5,0x5(%rdx,%r8,1),%xmm0,%xmm0
+ .byte 196,163,121,32,68,2,4,4 // vpinsrb $0x4,0x4(%rdx,%r8,1),%xmm0,%xmm0
+ .byte 196,161,121,110,12,2 // vmovd (%rdx,%r8,1),%xmm1
+ .byte 196,227,121,2,193,1 // vpblendd $0x1,%xmm1,%xmm0,%xmm0
+ .byte 233,105,255,255,255 // jmpq b4b <_sk_load_a8_hsw_8bit+0x23>
+ .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
+ .byte 196,163,121,32,68,2,10,10 // vpinsrb $0xa,0xa(%rdx,%r8,1),%xmm0,%xmm0
+ .byte 196,163,121,32,68,2,9,9 // vpinsrb $0x9,0x9(%rdx,%r8,1),%xmm0,%xmm0
+ .byte 196,163,121,32,68,2,8,8 // vpinsrb $0x8,0x8(%rdx,%r8,1),%xmm0,%xmm0
+ .byte 196,161,122,126,12,2 // vmovq (%rdx,%r8,1),%xmm1
+ .byte 196,227,113,2,192,12 // vpblendd $0xc,%xmm0,%xmm1,%xmm0
+ .byte 233,60,255,255,255 // jmpq b4b <_sk_load_a8_hsw_8bit+0x23>
+ .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
+ .byte 196,163,121,32,68,2,14,14 // vpinsrb $0xe,0xe(%rdx,%r8,1),%xmm0,%xmm0
+ .byte 196,163,121,32,68,2,13,13 // vpinsrb $0xd,0xd(%rdx,%r8,1),%xmm0,%xmm0
+ .byte 196,163,121,32,68,2,12,12 // vpinsrb $0xc,0xc(%rdx,%r8,1),%xmm0,%xmm0
+ .byte 196,161,122,126,12,2 // vmovq (%rdx,%r8,1),%xmm1
+ .byte 196,163,113,34,76,2,8,2 // vpinsrd $0x2,0x8(%rdx,%r8,1),%xmm1,%xmm1
+ .byte 196,227,113,2,192,8 // vpblendd $0x8,%xmm0,%xmm1,%xmm0
+ .byte 233,7,255,255,255 // jmpq b4b <_sk_load_a8_hsw_8bit+0x23>
+ .byte 73,255 // rex.WB (bad)
.byte 255 // (bad)
+ .byte 255,96,255 // jmpq *-0x1(%rax)
.byte 255 // (bad)
- .byte 255,175,255,255,255,160 // ljmp *-0x5f000001(%rdi)
+ .byte 255,84,255,255 // callq *-0x1(%rdi,%rdi,8)
+ .byte 255,141,255,255,255,133 // decl -0x7a000001(%rbp)
.byte 255 // (bad)
.byte 255 // (bad)
.byte 255 // (bad)
- .byte 234 // (bad)
+ .byte 125,255 // jge c59 <_sk_load_a8_hsw_8bit+0x131>
+ .byte 255 // (bad)
+ .byte 255,113,255 // pushq -0x1(%rcx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 186,255,255,255,178 // mov $0xb2ffffff,%edx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,170,255,255,255,158 // ljmp *-0x61000001(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,231 // jmpq *%rdi
.byte 255 // (bad)
.byte 255 // (bad)
.byte 255 // (bad)
.byte 223,255 // (bad)
.byte 255 // (bad)
- .byte 255,212 // callq *%rsp
+ .byte 255,215 // callq *%rdi
.byte 255 // (bad)
.byte 255 // (bad)
- .byte 255,197 // inc %ebp
+ .byte 255,203 // dec %ebx
.byte 255 // (bad)
.byte 255 // (bad)
.byte 255 // .byte 0xff
@@ -56452,61 +56974,86 @@ _sk_load_a8_dst_hsw_8bit:
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,28 // jne 4a1 <_sk_load_a8_dst_hsw_8bit+0x39>
- .byte 196,162,121,48,12,2 // vpmovzxbw (%rdx,%r8,1),%xmm1
- .byte 197,241,219,13,125,17,0,0 // vpand 0x117d(%rip),%xmm1,%xmm1 # 1610 <_sk_xor__hsw_8bit+0x3d9>
- .byte 196,226,125,51,201 // vpmovzxwd %xmm1,%ymm1
- .byte 197,245,114,241,24 // vpslld $0x18,%ymm1,%ymm1
+ .byte 117,35 // jne cc0 <_sk_load_a8_dst_hsw_8bit+0x40>
+ .byte 196,161,122,111,20,2 // vmovdqu (%rdx,%r8,1),%xmm2
+ .byte 197,249,112,218,78 // vpshufd $0x4e,%xmm2,%xmm3
+ .byte 196,226,125,49,219 // vpmovzxbd %xmm3,%ymm3
+ .byte 196,226,125,49,210 // vpmovzxbd %xmm2,%ymm2
+ .byte 197,237,114,242,24 // vpslld $0x18,%ymm2,%ymm2
+ .byte 197,229,114,243,24 // vpslld $0x18,%ymm3,%ymm3
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,7 // and $0x7,%r9b
- .byte 197,241,239,201 // vpxor %xmm1,%xmm1,%xmm1
+ .byte 65,128,225,15 // and $0xf,%r9b
+ .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2
.byte 65,254,201 // dec %r9b
- .byte 65,128,249,6 // cmp $0x6,%r9b
- .byte 119,217 // ja 48b <_sk_load_a8_dst_hsw_8bit+0x23>
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 119,210 // ja ca3 <_sk_load_a8_dst_hsw_8bit+0x23>
.byte 65,15,182,193 // movzbl %r9b,%eax
- .byte 72,141,13,119,0,0,0 // lea 0x77(%rip),%rcx # 534 <_sk_load_a8_dst_hsw_8bit+0xcc>
+ .byte 72,141,13,192,0,0,0 // lea 0xc0(%rip),%rcx # d9c <_sk_load_a8_dst_hsw_8bit+0x11c>
.byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
.byte 72,1,200 // add %rcx,%rax
.byte 255,224 // jmpq *%rax
.byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
- .byte 197,249,110,200 // vmovd %eax,%xmm1
- .byte 235,186 // jmp 48b <_sk_load_a8_dst_hsw_8bit+0x23>
- .byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
- .byte 197,241,239,201 // vpxor %xmm1,%xmm1,%xmm1
- .byte 197,241,196,200,2 // vpinsrw $0x2,%eax,%xmm1,%xmm1
- .byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
.byte 197,249,110,208 // vmovd %eax,%xmm2
- .byte 196,226,121,48,210 // vpmovzxbw %xmm2,%xmm2
- .byte 196,227,113,2,202,1 // vpblendd $0x1,%xmm2,%xmm1,%xmm1
- .byte 235,149 // jmp 48b <_sk_load_a8_dst_hsw_8bit+0x23>
- .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax
- .byte 197,241,239,201 // vpxor %xmm1,%xmm1,%xmm1
- .byte 197,241,196,200,6 // vpinsrw $0x6,%eax,%xmm1,%xmm1
- .byte 66,15,182,68,2,5 // movzbl 0x5(%rdx,%r8,1),%eax
- .byte 197,241,196,200,5 // vpinsrw $0x5,%eax,%xmm1,%xmm1
- .byte 66,15,182,68,2,4 // movzbl 0x4(%rdx,%r8,1),%eax
- .byte 197,241,196,200,4 // vpinsrw $0x4,%eax,%xmm1,%xmm1
- .byte 196,161,121,110,20,2 // vmovd (%rdx,%r8,1),%xmm2
- .byte 196,226,121,48,210 // vpmovzxbw %xmm2,%xmm2
- .byte 196,227,105,2,201,12 // vpblendd $0xc,%xmm1,%xmm2,%xmm1
- .byte 233,90,255,255,255 // jmpq 48b <_sk_load_a8_dst_hsw_8bit+0x23>
- .byte 15,31,0 // nopl (%rax)
- .byte 146 // xchg %eax,%edx
+ .byte 235,179 // jmp ca3 <_sk_load_a8_dst_hsw_8bit+0x23>
+ .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,2,2 // vpinsrb $0x2,0x2(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
+ .byte 197,249,110,216 // vmovd %eax,%xmm3
+ .byte 196,227,105,14,211,1 // vpblendw $0x1,%xmm3,%xmm2,%xmm2
+ .byte 235,150 // jmp ca3 <_sk_load_a8_dst_hsw_8bit+0x23>
+ .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,6,6 // vpinsrb $0x6,0x6(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,5,5 // vpinsrb $0x5,0x5(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,4,4 // vpinsrb $0x4,0x4(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 196,161,121,110,28,2 // vmovd (%rdx,%r8,1),%xmm3
+ .byte 196,227,105,2,211,1 // vpblendd $0x1,%xmm3,%xmm2,%xmm2
+ .byte 233,105,255,255,255 // jmpq ca3 <_sk_load_a8_dst_hsw_8bit+0x23>
+ .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,10,10 // vpinsrb $0xa,0xa(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,9,9 // vpinsrb $0x9,0x9(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,8,8 // vpinsrb $0x8,0x8(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 196,161,122,126,28,2 // vmovq (%rdx,%r8,1),%xmm3
+ .byte 196,227,97,2,210,12 // vpblendd $0xc,%xmm2,%xmm3,%xmm2
+ .byte 233,60,255,255,255 // jmpq ca3 <_sk_load_a8_dst_hsw_8bit+0x23>
+ .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,14,14 // vpinsrb $0xe,0xe(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,13,13 // vpinsrb $0xd,0xd(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,12,12 // vpinsrb $0xc,0xc(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 196,161,122,126,28,2 // vmovq (%rdx,%r8,1),%xmm3
+ .byte 196,163,97,34,92,2,8,2 // vpinsrd $0x2,0x8(%rdx,%r8,1),%xmm3,%xmm3
+ .byte 196,227,97,2,210,8 // vpblendd $0x8,%xmm2,%xmm3,%xmm2
+ .byte 233,7,255,255,255 // jmpq ca3 <_sk_load_a8_dst_hsw_8bit+0x23>
+ .byte 73,255 // rex.WB (bad)
.byte 255 // (bad)
+ .byte 255,96,255 // jmpq *-0x1(%rax)
+ .byte 255 // (bad)
+ .byte 255,84,255,255 // callq *-0x1(%rdi,%rdi,8)
+ .byte 255,141,255,255,255,133 // decl -0x7a000001(%rbp)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 125,255 // jge db1 <_sk_load_a8_dst_hsw_8bit+0x131>
+ .byte 255 // (bad)
+ .byte 255,113,255 // pushq -0x1(%rcx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 186,255,255,255,178 // mov $0xb2ffffff,%edx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,170,255,255,255,158 // ljmp *-0x61000001(%rdx)
.byte 255 // (bad)
- .byte 255,172,255,255,255,157,255 // ljmp *-0x620001(%rdi,%rdi,8)
.byte 255 // (bad)
.byte 255,231 // jmpq *%rdi
.byte 255 // (bad)
.byte 255 // (bad)
.byte 255 // (bad)
- .byte 220,255 // fdivr %st,%st(7)
+ .byte 223,255 // (bad)
.byte 255 // (bad)
- .byte 255,209 // callq *%rcx
+ .byte 255,215 // callq *%rdi
.byte 255 // (bad)
.byte 255 // (bad)
- .byte 255,194 // inc %edx
+ .byte 255,203 // dec %ebx
.byte 255 // (bad)
.byte 255 // (bad)
.byte 255 // .byte 0xff
@@ -56522,51 +57069,80 @@ _sk_store_a8_hsw_8bit:
.byte 72,99,87,8 // movslq 0x8(%rdi),%rdx
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
- .byte 196,226,125,0,21,143,14,0,0 // vpshufb 0xe8f(%rip),%ymm0,%ymm2 # 1400 <_sk_xor__hsw_8bit+0x1c9>
- .byte 196,227,253,0,210,232 // vpermq $0xe8,%ymm2,%ymm2
+ .byte 197,253,111,37,8,27,0,0 // vmovdqa 0x1b08(%rip),%ymm4 # 2900 <_sk_xor__hsw_8bit+0x275>
+ .byte 196,226,117,0,236 // vpshufb %ymm4,%ymm1,%ymm5
+ .byte 196,227,253,0,237,232 // vpermq $0xe8,%ymm5,%ymm5
+ .byte 197,249,111,53,245,28,0,0 // vmovdqa 0x1cf5(%rip),%xmm6 # 2b00 <_sk_xor__hsw_8bit+0x475>
+ .byte 196,226,81,0,238 // vpshufb %xmm6,%xmm5,%xmm5
+ .byte 196,226,125,0,228 // vpshufb %ymm4,%ymm0,%ymm4
+ .byte 196,227,253,0,228,232 // vpermq $0xe8,%ymm4,%ymm4
+ .byte 196,226,89,0,230 // vpshufb %xmm6,%xmm4,%xmm4
+ .byte 197,217,108,229 // vpunpcklqdq %xmm5,%xmm4,%xmm4
.byte 77,133,201 // test %r9,%r9
- .byte 117,19 // jne 58f <_sk_store_a8_hsw_8bit+0x3f>
- .byte 196,226,105,0,21,187,16,0,0 // vpshufb 0x10bb(%rip),%xmm2,%xmm2 # 1640 <_sk_xor__hsw_8bit+0x409>
- .byte 196,161,121,214,20,2 // vmovq %xmm2,(%rdx,%r8,1)
+ .byte 117,10 // jne e33 <_sk_store_a8_hsw_8bit+0x5b>
+ .byte 196,161,122,127,36,2 // vmovdqu %xmm4,(%rdx,%r8,1)
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 65,128,225,15 // and $0xf,%r9b
.byte 65,254,201 // dec %r9b
- .byte 65,128,249,6 // cmp $0x6,%r9b
- .byte 119,239 // ja 58b <_sk_store_a8_hsw_8bit+0x3b>
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 119,239 // ja e2f <_sk_store_a8_hsw_8bit+0x57>
.byte 65,15,182,193 // movzbl %r9b,%eax
- .byte 72,141,13,85,0,0,0 // lea 0x55(%rip),%rcx # 5fc <_sk_store_a8_hsw_8bit+0xac>
+ .byte 72,141,13,137,0,0,0 // lea 0x89(%rip),%rcx # ed4 <_sk_store_a8_hsw_8bit+0xfc>
.byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
.byte 72,1,200 // add %rcx,%rax
.byte 255,224 // jmpq *%rax
- .byte 196,163,121,20,20,2,0 // vpextrb $0x0,%xmm2,(%rdx,%r8,1)
- .byte 235,210 // jmp 58b <_sk_store_a8_hsw_8bit+0x3b>
- .byte 196,163,121,20,84,2,2,4 // vpextrb $0x4,%xmm2,0x2(%rdx,%r8,1)
- .byte 196,226,105,0,21,86,16,0,0 // vpshufb 0x1056(%rip),%xmm2,%xmm2 # 1620 <_sk_xor__hsw_8bit+0x3e9>
- .byte 196,163,121,21,20,2,0 // vpextrw $0x0,%xmm2,(%rdx,%r8,1)
- .byte 235,184 // jmp 58b <_sk_store_a8_hsw_8bit+0x3b>
- .byte 196,163,121,20,84,2,6,12 // vpextrb $0xc,%xmm2,0x6(%rdx,%r8,1)
- .byte 196,163,121,20,84,2,5,10 // vpextrb $0xa,%xmm2,0x5(%rdx,%r8,1)
- .byte 196,163,121,20,84,2,4,8 // vpextrb $0x8,%xmm2,0x4(%rdx,%r8,1)
- .byte 196,226,105,0,21,60,16,0,0 // vpshufb 0x103c(%rip),%xmm2,%xmm2 # 1630 <_sk_xor__hsw_8bit+0x3f9>
- .byte 196,161,121,126,20,2 // vmovd %xmm2,(%rdx,%r8,1)
- .byte 235,143 // jmp 58b <_sk_store_a8_hsw_8bit+0x3b>
- .byte 180,255 // mov $0xff,%ah
+ .byte 196,163,121,20,36,2,0 // vpextrb $0x0,%xmm4,(%rdx,%r8,1)
+ .byte 235,210 // jmp e2f <_sk_store_a8_hsw_8bit+0x57>
+ .byte 196,163,121,20,100,2,2,2 // vpextrb $0x2,%xmm4,0x2(%rdx,%r8,1)
+ .byte 196,163,121,21,36,2,0 // vpextrw $0x0,%xmm4,(%rdx,%r8,1)
+ .byte 235,193 // jmp e2f <_sk_store_a8_hsw_8bit+0x57>
+ .byte 196,163,121,20,100,2,6,6 // vpextrb $0x6,%xmm4,0x6(%rdx,%r8,1)
+ .byte 196,163,121,20,100,2,5,5 // vpextrb $0x5,%xmm4,0x5(%rdx,%r8,1)
+ .byte 196,163,121,20,100,2,4,4 // vpextrb $0x4,%xmm4,0x4(%rdx,%r8,1)
+ .byte 196,161,121,126,36,2 // vmovd %xmm4,(%rdx,%r8,1)
+ .byte 235,161 // jmp e2f <_sk_store_a8_hsw_8bit+0x57>
+ .byte 196,163,121,20,100,2,10,10 // vpextrb $0xa,%xmm4,0xa(%rdx,%r8,1)
+ .byte 196,163,121,20,100,2,9,9 // vpextrb $0x9,%xmm4,0x9(%rdx,%r8,1)
+ .byte 196,163,121,20,100,2,8,8 // vpextrb $0x8,%xmm4,0x8(%rdx,%r8,1)
+ .byte 235,32 // jmp ec8 <_sk_store_a8_hsw_8bit+0xf0>
+ .byte 196,163,121,20,100,2,14,14 // vpextrb $0xe,%xmm4,0xe(%rdx,%r8,1)
+ .byte 196,163,121,20,100,2,13,13 // vpextrb $0xd,%xmm4,0xd(%rdx,%r8,1)
+ .byte 196,163,121,20,100,2,12,12 // vpextrb $0xc,%xmm4,0xc(%rdx,%r8,1)
+ .byte 196,163,121,22,100,2,8,2 // vpextrd $0x2,%xmm4,0x8(%rdx,%r8,1)
+ .byte 196,161,121,214,36,2 // vmovq %xmm4,(%rdx,%r8,1)
+ .byte 233,92,255,255,255 // jmpq e2f <_sk_store_a8_hsw_8bit+0x57>
+ .byte 144 // nop
+ .byte 128,255,255 // cmp $0xff,%bh
+ .byte 255,145,255,255,255,137 // callq *-0x76000001(%rcx)
.byte 255 // (bad)
- .byte 255,197 // inc %ebp
.byte 255 // (bad)
+ .byte 255,178,255,255,255,170 // pushq -0x55000001(%rdx)
.byte 255 // (bad)
.byte 255 // (bad)
- .byte 189,255,255,255,239 // mov $0xefffffff,%ebp
+ .byte 255,162,255,255,255,154 // jmpq *-0x65000001(%rdx)
.byte 255 // (bad)
.byte 255 // (bad)
- .byte 255,231 // jmpq *%rdi
+ .byte 255,244 // push %rsp
.byte 255 // (bad)
.byte 255 // (bad)
+ .byte 255,202 // dec %edx
.byte 255 // (bad)
- .byte 223,255 // (bad)
.byte 255 // (bad)
- .byte 255,215 // callq *%rdi
+ .byte 255,194 // inc %edx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 186,255,255,255,236 // mov $0xecffffff,%edx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,228 // jmpq *%rsp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 220,255 // fdivr %st,%st(7)
+ .byte 255 // (bad)
+ .byte 255,212 // callq *%rsp
.byte 255 // (bad)
.byte 255 // (bad)
.byte 255 // .byte 0xff
@@ -56583,63 +57159,91 @@ _sk_load_g8_hsw_8bit:
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,50 // jne 667 <_sk_load_g8_hsw_8bit+0x4f>
- .byte 196,162,121,48,4,2 // vpmovzxbw (%rdx,%r8,1),%xmm0
- .byte 197,249,219,5,13,16,0,0 // vpand 0x100d(%rip),%xmm0,%xmm0 # 1650 <_sk_xor__hsw_8bit+0x419>
- .byte 196,226,125,51,192 // vpmovzxwd %xmm0,%ymm0
- .byte 196,226,125,88,21,167,12,0,0 // vpbroadcastd 0xca7(%rip),%ymm2 # 12f8 <_sk_xor__hsw_8bit+0xc1>
- .byte 196,226,125,64,194 // vpmulld %ymm2,%ymm0,%ymm0
- .byte 196,226,125,88,21,157,12,0,0 // vpbroadcastd 0xc9d(%rip),%ymm2 # 12fc <_sk_xor__hsw_8bit+0xc5>
- .byte 197,253,235,194 // vpor %ymm2,%ymm0,%ymm0
+ .byte 117,61 // jne f6a <_sk_load_g8_hsw_8bit+0x5a>
+ .byte 196,161,122,111,4,2 // vmovdqu (%rdx,%r8,1),%xmm0
+ .byte 196,226,125,49,200 // vpmovzxbd %xmm0,%ymm1
+ .byte 197,249,112,192,78 // vpshufd $0x4e,%xmm0,%xmm0
+ .byte 196,226,125,49,192 // vpmovzxbd %xmm0,%ymm0
+ .byte 196,226,125,88,37,185,24,0,0 // vpbroadcastd 0x18b9(%rip),%ymm4 # 2804 <_sk_xor__hsw_8bit+0x179>
+ .byte 196,226,125,64,236 // vpmulld %ymm4,%ymm0,%ymm5
+ .byte 196,226,117,64,196 // vpmulld %ymm4,%ymm1,%ymm0
+ .byte 196,226,125,88,13,170,24,0,0 // vpbroadcastd 0x18aa(%rip),%ymm1 # 2808 <_sk_xor__hsw_8bit+0x17d>
+ .byte 197,253,235,193 // vpor %ymm1,%ymm0,%ymm0
+ .byte 197,213,235,201 // vpor %ymm1,%ymm5,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 65,128,225,15 // and $0xf,%r9b
.byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
.byte 65,254,201 // dec %r9b
- .byte 65,128,249,6 // cmp $0x6,%r9b
- .byte 119,195 // ja 63b <_sk_load_g8_hsw_8bit+0x23>
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 119,184 // ja f33 <_sk_load_g8_hsw_8bit+0x23>
.byte 65,15,182,193 // movzbl %r9b,%eax
- .byte 72,141,13,121,0,0,0 // lea 0x79(%rip),%rcx # 6fc <_sk_load_g8_hsw_8bit+0xe4>
+ .byte 72,141,13,198,0,0,0 // lea 0xc6(%rip),%rcx # 104c <_sk_load_g8_hsw_8bit+0x13c>
.byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
.byte 72,1,200 // add %rcx,%rax
.byte 255,224 // jmpq *%rax
.byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
.byte 197,249,110,192 // vmovd %eax,%xmm0
- .byte 235,164 // jmp 63b <_sk_load_g8_hsw_8bit+0x23>
- .byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
+ .byte 235,153 // jmp f33 <_sk_load_g8_hsw_8bit+0x23>
.byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
- .byte 197,249,196,192,2 // vpinsrw $0x2,%eax,%xmm0,%xmm0
+ .byte 196,163,121,32,68,2,2,2 // vpinsrb $0x2,0x2(%rdx,%r8,1),%xmm0,%xmm0
.byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
- .byte 197,249,110,208 // vmovd %eax,%xmm2
- .byte 196,226,121,48,210 // vpmovzxbw %xmm2,%xmm2
- .byte 196,227,121,2,194,1 // vpblendd $0x1,%xmm2,%xmm0,%xmm0
- .byte 233,124,255,255,255 // jmpq 63b <_sk_load_g8_hsw_8bit+0x23>
- .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax
+ .byte 197,249,110,200 // vmovd %eax,%xmm1
+ .byte 196,227,121,14,193,1 // vpblendw $0x1,%xmm1,%xmm0,%xmm0
+ .byte 233,121,255,255,255 // jmpq f33 <_sk_load_g8_hsw_8bit+0x23>
.byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
- .byte 197,249,196,192,6 // vpinsrw $0x6,%eax,%xmm0,%xmm0
- .byte 66,15,182,68,2,5 // movzbl 0x5(%rdx,%r8,1),%eax
- .byte 197,249,196,192,5 // vpinsrw $0x5,%eax,%xmm0,%xmm0
- .byte 66,15,182,68,2,4 // movzbl 0x4(%rdx,%r8,1),%eax
- .byte 197,249,196,192,4 // vpinsrw $0x4,%eax,%xmm0,%xmm0
- .byte 196,161,121,110,20,2 // vmovd (%rdx,%r8,1),%xmm2
- .byte 196,226,121,48,210 // vpmovzxbw %xmm2,%xmm2
- .byte 196,227,105,2,192,12 // vpblendd $0xc,%xmm0,%xmm2,%xmm0
- .byte 233,65,255,255,255 // jmpq 63b <_sk_load_g8_hsw_8bit+0x23>
- .byte 102,144 // xchg %ax,%ax
- .byte 144 // nop
+ .byte 196,163,121,32,68,2,6,6 // vpinsrb $0x6,0x6(%rdx,%r8,1),%xmm0,%xmm0
+ .byte 196,163,121,32,68,2,5,5 // vpinsrb $0x5,0x5(%rdx,%r8,1),%xmm0,%xmm0
+ .byte 196,163,121,32,68,2,4,4 // vpinsrb $0x4,0x4(%rdx,%r8,1),%xmm0,%xmm0
+ .byte 196,161,121,110,12,2 // vmovd (%rdx,%r8,1),%xmm1
+ .byte 196,227,121,2,193,1 // vpblendd $0x1,%xmm1,%xmm0,%xmm0
+ .byte 233,76,255,255,255 // jmpq f33 <_sk_load_g8_hsw_8bit+0x23>
+ .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
+ .byte 196,163,121,32,68,2,10,10 // vpinsrb $0xa,0xa(%rdx,%r8,1),%xmm0,%xmm0
+ .byte 196,163,121,32,68,2,9,9 // vpinsrb $0x9,0x9(%rdx,%r8,1),%xmm0,%xmm0
+ .byte 196,163,121,32,68,2,8,8 // vpinsrb $0x8,0x8(%rdx,%r8,1),%xmm0,%xmm0
+ .byte 196,161,122,126,12,2 // vmovq (%rdx,%r8,1),%xmm1
+ .byte 196,227,113,2,192,12 // vpblendd $0xc,%xmm0,%xmm1,%xmm0
+ .byte 233,31,255,255,255 // jmpq f33 <_sk_load_g8_hsw_8bit+0x23>
+ .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
+ .byte 196,163,121,32,68,2,14,14 // vpinsrb $0xe,0xe(%rdx,%r8,1),%xmm0,%xmm0
+ .byte 196,163,121,32,68,2,13,13 // vpinsrb $0xd,0xd(%rdx,%r8,1),%xmm0,%xmm0
+ .byte 196,163,121,32,68,2,12,12 // vpinsrb $0xc,0xc(%rdx,%r8,1),%xmm0,%xmm0
+ .byte 196,161,122,126,12,2 // vmovq (%rdx,%r8,1),%xmm1
+ .byte 196,163,113,34,76,2,8,2 // vpinsrd $0x2,0x8(%rdx,%r8,1),%xmm1,%xmm1
+ .byte 196,227,113,2,192,8 // vpblendd $0x8,%xmm0,%xmm1,%xmm0
+ .byte 233,234,254,255,255 // jmpq f33 <_sk_load_g8_hsw_8bit+0x23>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 67,255 // rex.XB (bad)
.byte 255 // (bad)
+ .byte 255,90,255 // lcall *-0x1(%rdx)
.byte 255 // (bad)
- .byte 255,170,255,255,255,155 // ljmp *-0x64000001(%rdx)
+ .byte 255,78,255 // decl -0x1(%rsi)
.byte 255 // (bad)
+ .byte 255,138,255,255,255,130 // decl -0x7d000001(%rdx)
.byte 255 // (bad)
.byte 255 // (bad)
- .byte 232,255,255,255,221 // callq ffffffffde00070c <_sk_xor__hsw_8bit+0xffffffffddfff4d5>
.byte 255 // (bad)
+ .byte 122,255 // jp 1061 <_sk_load_g8_hsw_8bit+0x151>
.byte 255 // (bad)
- .byte 255,210 // callq *%rdx
+ .byte 255,110,255 // ljmp *-0x1(%rsi)
.byte 255 // (bad)
+ .byte 255,183,255,255,255,175 // pushq -0x50000001(%rdi)
.byte 255 // (bad)
- .byte 255,195 // inc %ebx
+ .byte 255 // (bad)
+ .byte 255,167,255,255,255,155 // jmpq *-0x64000001(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,228 // jmpq *%rsp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 220,255 // fdivr %st,%st(7)
+ .byte 255 // (bad)
+ .byte 255,212 // callq *%rsp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,200 // dec %eax
.byte 255 // (bad)
.byte 255 // (bad)
.byte 255 // .byte 0xff
@@ -56656,63 +57260,91 @@ _sk_load_g8_dst_hsw_8bit:
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,50 // jne 767 <_sk_load_g8_dst_hsw_8bit+0x4f>
- .byte 196,162,121,48,12,2 // vpmovzxbw (%rdx,%r8,1),%xmm1
- .byte 197,241,219,13,29,15,0,0 // vpand 0xf1d(%rip),%xmm1,%xmm1 # 1660 <_sk_xor__hsw_8bit+0x429>
- .byte 196,226,125,51,201 // vpmovzxwd %xmm1,%ymm1
- .byte 196,226,125,88,21,175,11,0,0 // vpbroadcastd 0xbaf(%rip),%ymm2 # 1300 <_sk_xor__hsw_8bit+0xc9>
- .byte 196,226,117,64,202 // vpmulld %ymm2,%ymm1,%ymm1
- .byte 196,226,125,88,21,165,11,0,0 // vpbroadcastd 0xba5(%rip),%ymm2 # 1304 <_sk_xor__hsw_8bit+0xcd>
- .byte 197,245,235,202 // vpor %ymm2,%ymm1,%ymm1
+ .byte 117,61 // jne 10e2 <_sk_load_g8_dst_hsw_8bit+0x5a>
+ .byte 196,161,122,111,20,2 // vmovdqu (%rdx,%r8,1),%xmm2
+ .byte 196,226,125,49,218 // vpmovzxbd %xmm2,%ymm3
+ .byte 197,249,112,210,78 // vpshufd $0x4e,%xmm2,%xmm2
+ .byte 196,226,125,49,210 // vpmovzxbd %xmm2,%ymm2
+ .byte 196,226,125,88,37,73,23,0,0 // vpbroadcastd 0x1749(%rip),%ymm4 # 280c <_sk_xor__hsw_8bit+0x181>
+ .byte 196,226,109,64,236 // vpmulld %ymm4,%ymm2,%ymm5
+ .byte 196,226,101,64,212 // vpmulld %ymm4,%ymm3,%ymm2
+ .byte 196,226,125,88,29,58,23,0,0 // vpbroadcastd 0x173a(%rip),%ymm3 # 2810 <_sk_xor__hsw_8bit+0x185>
+ .byte 197,237,235,211 // vpor %ymm3,%ymm2,%ymm2
+ .byte 197,213,235,219 // vpor %ymm3,%ymm5,%ymm3
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,7 // and $0x7,%r9b
- .byte 197,241,239,201 // vpxor %xmm1,%xmm1,%xmm1
+ .byte 65,128,225,15 // and $0xf,%r9b
+ .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2
.byte 65,254,201 // dec %r9b
- .byte 65,128,249,6 // cmp $0x6,%r9b
- .byte 119,195 // ja 73b <_sk_load_g8_dst_hsw_8bit+0x23>
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 119,184 // ja 10ab <_sk_load_g8_dst_hsw_8bit+0x23>
.byte 65,15,182,193 // movzbl %r9b,%eax
- .byte 72,141,13,121,0,0,0 // lea 0x79(%rip),%rcx # 7fc <_sk_load_g8_dst_hsw_8bit+0xe4>
+ .byte 72,141,13,198,0,0,0 // lea 0xc6(%rip),%rcx # 11c4 <_sk_load_g8_dst_hsw_8bit+0x13c>
.byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
.byte 72,1,200 // add %rcx,%rax
.byte 255,224 // jmpq *%rax
.byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
- .byte 197,249,110,200 // vmovd %eax,%xmm1
- .byte 235,164 // jmp 73b <_sk_load_g8_dst_hsw_8bit+0x23>
- .byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
- .byte 197,241,239,201 // vpxor %xmm1,%xmm1,%xmm1
- .byte 197,241,196,200,2 // vpinsrw $0x2,%eax,%xmm1,%xmm1
- .byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
.byte 197,249,110,208 // vmovd %eax,%xmm2
- .byte 196,226,121,48,210 // vpmovzxbw %xmm2,%xmm2
- .byte 196,227,113,2,202,1 // vpblendd $0x1,%xmm2,%xmm1,%xmm1
- .byte 233,124,255,255,255 // jmpq 73b <_sk_load_g8_dst_hsw_8bit+0x23>
- .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax
- .byte 197,241,239,201 // vpxor %xmm1,%xmm1,%xmm1
- .byte 197,241,196,200,6 // vpinsrw $0x6,%eax,%xmm1,%xmm1
- .byte 66,15,182,68,2,5 // movzbl 0x5(%rdx,%r8,1),%eax
- .byte 197,241,196,200,5 // vpinsrw $0x5,%eax,%xmm1,%xmm1
- .byte 66,15,182,68,2,4 // movzbl 0x4(%rdx,%r8,1),%eax
- .byte 197,241,196,200,4 // vpinsrw $0x4,%eax,%xmm1,%xmm1
- .byte 196,161,121,110,20,2 // vmovd (%rdx,%r8,1),%xmm2
- .byte 196,226,121,48,210 // vpmovzxbw %xmm2,%xmm2
- .byte 196,227,105,2,201,12 // vpblendd $0xc,%xmm1,%xmm2,%xmm1
- .byte 233,65,255,255,255 // jmpq 73b <_sk_load_g8_dst_hsw_8bit+0x23>
- .byte 102,144 // xchg %ax,%ax
- .byte 144 // nop
+ .byte 235,153 // jmp 10ab <_sk_load_g8_dst_hsw_8bit+0x23>
+ .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,2,2 // vpinsrb $0x2,0x2(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
+ .byte 197,249,110,216 // vmovd %eax,%xmm3
+ .byte 196,227,105,14,211,1 // vpblendw $0x1,%xmm3,%xmm2,%xmm2
+ .byte 233,121,255,255,255 // jmpq 10ab <_sk_load_g8_dst_hsw_8bit+0x23>
+ .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,6,6 // vpinsrb $0x6,0x6(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,5,5 // vpinsrb $0x5,0x5(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,4,4 // vpinsrb $0x4,0x4(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 196,161,121,110,28,2 // vmovd (%rdx,%r8,1),%xmm3
+ .byte 196,227,105,2,211,1 // vpblendd $0x1,%xmm3,%xmm2,%xmm2
+ .byte 233,76,255,255,255 // jmpq 10ab <_sk_load_g8_dst_hsw_8bit+0x23>
+ .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,10,10 // vpinsrb $0xa,0xa(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,9,9 // vpinsrb $0x9,0x9(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,8,8 // vpinsrb $0x8,0x8(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 196,161,122,126,28,2 // vmovq (%rdx,%r8,1),%xmm3
+ .byte 196,227,97,2,210,12 // vpblendd $0xc,%xmm2,%xmm3,%xmm2
+ .byte 233,31,255,255,255 // jmpq 10ab <_sk_load_g8_dst_hsw_8bit+0x23>
+ .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,14,14 // vpinsrb $0xe,0xe(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,13,13 // vpinsrb $0xd,0xd(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 196,163,105,32,84,2,12,12 // vpinsrb $0xc,0xc(%rdx,%r8,1),%xmm2,%xmm2
+ .byte 196,161,122,126,28,2 // vmovq (%rdx,%r8,1),%xmm3
+ .byte 196,163,97,34,92,2,8,2 // vpinsrd $0x2,0x8(%rdx,%r8,1),%xmm3,%xmm3
+ .byte 196,227,97,2,210,8 // vpblendd $0x8,%xmm2,%xmm3,%xmm2
+ .byte 233,234,254,255,255 // jmpq 10ab <_sk_load_g8_dst_hsw_8bit+0x23>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 67,255 // rex.XB (bad)
.byte 255 // (bad)
+ .byte 255,90,255 // lcall *-0x1(%rdx)
.byte 255 // (bad)
- .byte 255,170,255,255,255,155 // ljmp *-0x64000001(%rdx)
+ .byte 255,78,255 // decl -0x1(%rsi)
.byte 255 // (bad)
+ .byte 255,138,255,255,255,130 // decl -0x7d000001(%rdx)
.byte 255 // (bad)
.byte 255 // (bad)
- .byte 232,255,255,255,221 // callq ffffffffde00080c <_sk_xor__hsw_8bit+0xffffffffddfff5d5>
.byte 255 // (bad)
+ .byte 122,255 // jp 11d9 <_sk_load_g8_dst_hsw_8bit+0x151>
.byte 255 // (bad)
- .byte 255,210 // callq *%rdx
+ .byte 255,110,255 // ljmp *-0x1(%rsi)
.byte 255 // (bad)
+ .byte 255,183,255,255,255,175 // pushq -0x50000001(%rdi)
.byte 255 // (bad)
- .byte 255,195 // inc %ebx
+ .byte 255 // (bad)
+ .byte 255,167,255,255,255,155 // jmpq *-0x64000001(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,228 // jmpq *%rsp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 220,255 // fdivr %st,%st(7)
+ .byte 255 // (bad)
+ .byte 255,212 // callq *%rsp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,200 // dec %eax
.byte 255 // (bad)
.byte 255 // (bad)
.byte 255 // .byte 0xff
@@ -56724,80 +57356,252 @@ _sk_srcover_rgba_8888_hsw_8bit:
.byte 76,99,15 // movslq (%rdi),%r9
.byte 76,139,71,16 // mov 0x10(%rdi),%r8
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 72,99,80,8 // movslq 0x8(%rax),%rdx
- .byte 72,99,79,8 // movslq 0x8(%rdi),%rcx
- .byte 72,15,175,202 // imul %rdx,%rcx
- .byte 72,193,225,2 // shl $0x2,%rcx
- .byte 72,3,8 // add (%rax),%rcx
- .byte 74,141,4,137 // lea (%rcx,%r9,4),%rax
+ .byte 72,99,72,8 // movslq 0x8(%rax),%rcx
+ .byte 72,99,87,8 // movslq 0x8(%rdi),%rdx
+ .byte 72,15,175,209 // imul %rcx,%rdx
+ .byte 72,193,226,2 // shl $0x2,%rdx
+ .byte 72,3,16 // add (%rax),%rdx
.byte 77,133,192 // test %r8,%r8
- .byte 117,108 // jne 8a9 <_sk_srcover_rgba_8888_hsw_8bit+0x91>
- .byte 197,254,111,16 // vmovdqu (%rax),%ymm2
- .byte 196,226,125,0,29,214,11,0,0 // vpshufb 0xbd6(%rip),%ymm0,%ymm3 # 1420 <_sk_xor__hsw_8bit+0x1e9>
- .byte 196,226,125,48,226 // vpmovzxbw %xmm2,%ymm4
- .byte 196,227,125,57,213,1 // vextracti128 $0x1,%ymm2,%xmm5
+ .byte 15,133,222,0,0,0 // jne 1303 <_sk_srcover_rgba_8888_hsw_8bit+0x103>
+ .byte 196,33,126,111,76,138,32 // vmovdqu 0x20(%rdx,%r9,4),%ymm9
+ .byte 196,33,126,111,28,138 // vmovdqu (%rdx,%r9,4),%ymm11
+ .byte 197,253,111,53,230,22,0,0 // vmovdqa 0x16e6(%rip),%ymm6 # 2920 <_sk_xor__hsw_8bit+0x295>
+ .byte 196,226,117,0,254 // vpshufb %ymm6,%ymm1,%ymm7
+ .byte 196,226,125,0,246 // vpshufb %ymm6,%ymm0,%ymm6
+ .byte 196,66,125,48,195 // vpmovzxbw %xmm11,%ymm8
+ .byte 196,99,125,57,220,1 // vextracti128 $0x1,%ymm11,%xmm4
+ .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
+ .byte 196,66,125,48,209 // vpmovzxbw %xmm9,%ymm10
+ .byte 196,99,125,57,205,1 // vextracti128 $0x1,%ymm9,%xmm5
.byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5
- .byte 196,226,125,48,243 // vpmovzxbw %xmm3,%ymm6
- .byte 196,227,125,57,219,1 // vextracti128 $0x1,%ymm3,%xmm3
- .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3
- .byte 197,213,213,219 // vpmullw %ymm3,%ymm5,%ymm3
+ .byte 196,98,125,48,230 // vpmovzxbw %xmm6,%ymm12
+ .byte 196,227,125,57,246,1 // vextracti128 $0x1,%ymm6,%xmm6
+ .byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6
+ .byte 196,98,125,48,239 // vpmovzxbw %xmm7,%ymm13
+ .byte 196,227,125,57,255,1 // vextracti128 $0x1,%ymm7,%xmm7
+ .byte 196,226,125,48,255 // vpmovzxbw %xmm7,%ymm7
+ .byte 197,213,213,255 // vpmullw %ymm7,%ymm5,%ymm7
+ .byte 196,65,45,213,237 // vpmullw %ymm13,%ymm10,%ymm13
.byte 197,221,213,246 // vpmullw %ymm6,%ymm4,%ymm6
+ .byte 196,65,61,213,228 // vpmullw %ymm12,%ymm8,%ymm12
+ .byte 196,65,29,253,192 // vpaddw %ymm8,%ymm12,%ymm8
.byte 197,205,253,228 // vpaddw %ymm4,%ymm6,%ymm4
- .byte 197,229,253,221 // vpaddw %ymm5,%ymm3,%ymm3
- .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3
+ .byte 196,193,21,253,242 // vpaddw %ymm10,%ymm13,%ymm6
+ .byte 197,197,253,237 // vpaddw %ymm5,%ymm7,%ymm5
+ .byte 197,213,113,213,8 // vpsrlw $0x8,%ymm5,%ymm5
+ .byte 197,205,113,214,8 // vpsrlw $0x8,%ymm6,%ymm6
.byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4
- .byte 196,227,93,56,235,1 // vinserti128 $0x1,%xmm3,%ymm4,%ymm5
- .byte 196,227,93,70,219,49 // vperm2i128 $0x31,%ymm3,%ymm4,%ymm3
- .byte 197,213,103,219 // vpackuswb %ymm3,%ymm5,%ymm3
- .byte 197,237,248,211 // vpsubb %ymm3,%ymm2,%ymm2
- .byte 197,237,252,208 // vpaddb %ymm0,%ymm2,%ymm2
+ .byte 196,193,69,113,208,8 // vpsrlw $0x8,%ymm8,%ymm7
+ .byte 196,99,69,56,196,1 // vinserti128 $0x1,%xmm4,%ymm7,%ymm8
+ .byte 196,227,69,70,228,49 // vperm2i128 $0x31,%ymm4,%ymm7,%ymm4
+ .byte 197,189,103,228 // vpackuswb %ymm4,%ymm8,%ymm4
+ .byte 196,227,77,56,253,1 // vinserti128 $0x1,%xmm5,%ymm6,%ymm7
+ .byte 196,227,77,70,237,49 // vperm2i128 $0x31,%ymm5,%ymm6,%ymm5
+ .byte 197,197,103,237 // vpackuswb %ymm5,%ymm7,%ymm5
+ .byte 197,181,248,237 // vpsubb %ymm5,%ymm9,%ymm5
+ .byte 197,165,248,228 // vpsubb %ymm4,%ymm11,%ymm4
+ .byte 197,221,252,224 // vpaddb %ymm0,%ymm4,%ymm4
+ .byte 197,213,252,233 // vpaddb %ymm1,%ymm5,%ymm5
.byte 77,133,192 // test %r8,%r8
- .byte 117,49 // jne 8d2 <_sk_srcover_rgba_8888_hsw_8bit+0xba>
- .byte 197,254,127,16 // vmovdqu %ymm2,(%rax)
+ .byte 117,72 // jne 133a <_sk_srcover_rgba_8888_hsw_8bit+0x13a>
+ .byte 196,161,126,127,36,138 // vmovdqu %ymm4,(%rdx,%r9,4)
+ .byte 196,161,126,127,108,138,32 // vmovdqu %ymm5,0x20(%rdx,%r9,4)
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 185,8,0,0,0 // mov $0x8,%ecx
- .byte 68,41,193 // sub %r8d,%ecx
- .byte 192,225,3 // shl $0x3,%cl
- .byte 72,199,194,255,255,255,255 // mov $0xffffffffffffffff,%rdx
- .byte 72,211,234 // shr %cl,%rdx
- .byte 196,225,249,110,210 // vmovq %rdx,%xmm2
- .byte 196,226,125,33,210 // vpmovsxbd %xmm2,%ymm2
- .byte 196,226,109,140,16 // vpmaskmovd (%rax),%ymm2,%ymm2
- .byte 233,111,255,255,255 // jmpq 841 <_sk_srcover_rgba_8888_hsw_8bit+0x29>
- .byte 185,8,0,0,0 // mov $0x8,%ecx
- .byte 68,41,193 // sub %r8d,%ecx
- .byte 192,225,3 // shl $0x3,%cl
- .byte 72,199,194,255,255,255,255 // mov $0xffffffffffffffff,%rdx
- .byte 72,211,234 // shr %cl,%rdx
- .byte 196,225,249,110,218 // vmovq %rdx,%xmm3
- .byte 196,226,125,33,219 // vpmovsxbd %xmm3,%ymm3
- .byte 196,226,101,142,16 // vpmaskmovd %ymm2,%ymm3,(%rax)
- .byte 235,173 // jmp 8a5 <_sk_srcover_rgba_8888_hsw_8bit+0x8d>
+ .byte 68,137,192 // mov %r8d,%eax
+ .byte 36,15 // and $0xf,%al
+ .byte 196,65,53,239,201 // vpxor %ymm9,%ymm9,%ymm9
+ .byte 196,65,37,239,219 // vpxor %ymm11,%ymm11,%ymm11
+ .byte 254,200 // dec %al
+ .byte 60,14 // cmp $0xe,%al
+ .byte 15,135,22,255,255,255 // ja 1232 <_sk_srcover_rgba_8888_hsw_8bit+0x32>
+ .byte 15,182,192 // movzbl %al,%eax
+ .byte 72,141,13,234,1,0,0 // lea 0x1ea(%rip),%rcx # 1510 <_sk_srcover_rgba_8888_hsw_8bit+0x310>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 196,33,121,110,28,138 // vmovd (%rdx,%r9,4),%xmm11
+ .byte 233,248,254,255,255 // jmpq 1232 <_sk_srcover_rgba_8888_hsw_8bit+0x32>
+ .byte 65,128,224,15 // and $0xf,%r8b
+ .byte 65,254,200 // dec %r8b
+ .byte 65,128,248,14 // cmp $0xe,%r8b
+ .byte 119,184 // ja 12ff <_sk_srcover_rgba_8888_hsw_8bit+0xff>
+ .byte 65,15,182,192 // movzbl %r8b,%eax
+ .byte 72,141,13,250,1,0,0 // lea 0x1fa(%rip),%rcx # 154c <_sk_srcover_rgba_8888_hsw_8bit+0x34c>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 196,161,121,126,36,138 // vmovd %xmm4,(%rdx,%r9,4)
+ .byte 235,156 // jmp 12ff <_sk_srcover_rgba_8888_hsw_8bit+0xff>
+ .byte 196,161,121,110,100,138,8 // vmovd 0x8(%rdx,%r9,4),%xmm4
+ .byte 196,226,121,89,236 // vpbroadcastq %xmm4,%xmm5
+ .byte 196,65,53,239,201 // vpxor %ymm9,%ymm9,%ymm9
+ .byte 196,99,53,2,221,4 // vpblendd $0x4,%ymm5,%ymm9,%ymm11
+ .byte 196,162,121,53,52,138 // vpmovzxdq (%rdx,%r9,4),%xmm6
+ .byte 197,249,112,246,232 // vpshufd $0xe8,%xmm6,%xmm6
+ .byte 196,99,37,2,222,3 // vpblendd $0x3,%ymm6,%ymm11,%ymm11
+ .byte 233,162,254,255,255 // jmpq 1232 <_sk_srcover_rgba_8888_hsw_8bit+0x32>
+ .byte 196,161,121,110,100,138,24 // vmovd 0x18(%rdx,%r9,4),%xmm4
+ .byte 196,226,125,89,236 // vpbroadcastq %xmm4,%ymm5
+ .byte 196,65,53,239,201 // vpxor %ymm9,%ymm9,%ymm9
+ .byte 196,99,53,2,221,64 // vpblendd $0x40,%ymm5,%ymm9,%ymm11
+ .byte 196,99,125,57,222,1 // vextracti128 $0x1,%ymm11,%xmm6
+ .byte 196,163,73,34,116,138,20,1 // vpinsrd $0x1,0x14(%rdx,%r9,4),%xmm6,%xmm6
+ .byte 196,99,37,56,222,1 // vinserti128 $0x1,%xmm6,%ymm11,%ymm11
+ .byte 196,99,125,57,222,1 // vextracti128 $0x1,%ymm11,%xmm6
+ .byte 196,163,73,34,116,138,16,0 // vpinsrd $0x0,0x10(%rdx,%r9,4),%xmm6,%xmm6
+ .byte 196,99,37,56,222,1 // vinserti128 $0x1,%xmm6,%ymm11,%ymm11
+ .byte 196,161,122,111,52,138 // vmovdqu (%rdx,%r9,4),%xmm6
+ .byte 196,67,77,2,219,240 // vpblendd $0xf0,%ymm11,%ymm6,%ymm11
+ .byte 233,82,254,255,255 // jmpq 1232 <_sk_srcover_rgba_8888_hsw_8bit+0x32>
+ .byte 196,161,121,110,100,138,40 // vmovd 0x28(%rdx,%r9,4),%xmm4
+ .byte 196,226,121,89,228 // vpbroadcastq %xmm4,%xmm4
+ .byte 197,213,239,237 // vpxor %ymm5,%ymm5,%ymm5
+ .byte 196,99,85,2,204,4 // vpblendd $0x4,%ymm4,%ymm5,%ymm9
+ .byte 196,163,49,34,108,138,36,1 // vpinsrd $0x1,0x24(%rdx,%r9,4),%xmm9,%xmm5
+ .byte 196,99,53,2,205,15 // vpblendd $0xf,%ymm5,%ymm9,%ymm9
+ .byte 196,161,121,110,108,138,32 // vmovd 0x20(%rdx,%r9,4),%xmm5
+ .byte 196,99,53,2,205,1 // vpblendd $0x1,%ymm5,%ymm9,%ymm9
+ .byte 233,22,254,255,255 // jmpq 122c <_sk_srcover_rgba_8888_hsw_8bit+0x2c>
+ .byte 196,161,121,110,100,138,56 // vmovd 0x38(%rdx,%r9,4),%xmm4
+ .byte 196,226,125,89,228 // vpbroadcastq %xmm4,%ymm4
+ .byte 197,213,239,237 // vpxor %ymm5,%ymm5,%ymm5
+ .byte 196,99,85,2,204,64 // vpblendd $0x40,%ymm4,%ymm5,%ymm9
+ .byte 196,99,125,57,205,1 // vextracti128 $0x1,%ymm9,%xmm5
+ .byte 196,163,81,34,108,138,52,1 // vpinsrd $0x1,0x34(%rdx,%r9,4),%xmm5,%xmm5
+ .byte 196,99,53,56,205,1 // vinserti128 $0x1,%xmm5,%ymm9,%ymm9
+ .byte 196,99,125,57,205,1 // vextracti128 $0x1,%ymm9,%xmm5
+ .byte 196,163,81,34,108,138,48,0 // vpinsrd $0x0,0x30(%rdx,%r9,4),%xmm5,%xmm5
+ .byte 196,99,53,56,205,1 // vinserti128 $0x1,%xmm5,%ymm9,%ymm9
+ .byte 196,33,126,111,28,138 // vmovdqu (%rdx,%r9,4),%ymm11
+ .byte 196,161,122,111,116,138,32 // vmovdqu 0x20(%rdx,%r9,4),%xmm6
+ .byte 196,67,77,2,201,240 // vpblendd $0xf0,%ymm9,%ymm6,%ymm9
+ .byte 233,198,253,255,255 // jmpq 1232 <_sk_srcover_rgba_8888_hsw_8bit+0x32>
+ .byte 196,163,121,22,100,138,8,2 // vpextrd $0x2,%xmm4,0x8(%rdx,%r9,4)
+ .byte 196,161,121,214,36,138 // vmovq %xmm4,(%rdx,%r9,4)
+ .byte 233,128,254,255,255 // jmpq 12ff <_sk_srcover_rgba_8888_hsw_8bit+0xff>
+ .byte 196,227,125,57,229,1 // vextracti128 $0x1,%ymm4,%xmm5
+ .byte 196,163,121,22,108,138,24,2 // vpextrd $0x2,%xmm5,0x18(%rdx,%r9,4)
+ .byte 196,227,125,57,229,1 // vextracti128 $0x1,%ymm4,%xmm5
+ .byte 196,163,121,22,108,138,20,1 // vpextrd $0x1,%xmm5,0x14(%rdx,%r9,4)
+ .byte 196,227,125,57,229,1 // vextracti128 $0x1,%ymm4,%xmm5
+ .byte 196,161,121,126,108,138,16 // vmovd %xmm5,0x10(%rdx,%r9,4)
+ .byte 196,161,122,127,36,138 // vmovdqu %xmm4,(%rdx,%r9,4)
+ .byte 233,76,254,255,255 // jmpq 12ff <_sk_srcover_rgba_8888_hsw_8bit+0xff>
+ .byte 196,163,121,22,108,138,40,2 // vpextrd $0x2,%xmm5,0x28(%rdx,%r9,4)
+ .byte 196,163,121,22,108,138,36,1 // vpextrd $0x1,%xmm5,0x24(%rdx,%r9,4)
+ .byte 196,161,121,126,108,138,32 // vmovd %xmm5,0x20(%rdx,%r9,4)
+ .byte 196,161,126,127,36,138 // vmovdqu %ymm4,(%rdx,%r9,4)
+ .byte 233,42,254,255,255 // jmpq 12ff <_sk_srcover_rgba_8888_hsw_8bit+0xff>
+ .byte 196,227,125,57,238,1 // vextracti128 $0x1,%ymm5,%xmm6
+ .byte 196,163,121,22,116,138,56,2 // vpextrd $0x2,%xmm6,0x38(%rdx,%r9,4)
+ .byte 196,227,125,57,238,1 // vextracti128 $0x1,%ymm5,%xmm6
+ .byte 196,163,121,22,116,138,52,1 // vpextrd $0x1,%xmm6,0x34(%rdx,%r9,4)
+ .byte 196,227,125,57,238,1 // vextracti128 $0x1,%ymm5,%xmm6
+ .byte 196,161,121,126,116,138,48 // vmovd %xmm6,0x30(%rdx,%r9,4)
+ .byte 196,161,126,127,36,138 // vmovdqu %ymm4,(%rdx,%r9,4)
+ .byte 196,161,122,127,108,138,32 // vmovdqu %xmm5,0x20(%rdx,%r9,4)
+ .byte 233,239,253,255,255 // jmpq 12ff <_sk_srcover_rgba_8888_hsw_8bit+0xff>
+ .byte 31 // (bad)
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,106,254 // ljmp *-0x2(%rdx)
+ .byte 255 // (bad)
+ .byte 255,83,254 // callq *-0x2(%rbx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 191,254,255,255,171 // mov $0xabfffffe,%edi
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,151,254,255,255,128 // callq *-0x7f000002(%rdi)
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,28,253,255,255,244,254 // lcall *-0x10b0001(,%rdi,8)
+ .byte 255 // (bad)
+ .byte 255,230 // jmpq *%rsi
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,208 // callq *%rax
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,68,255,255 // incl -0x1(%rdi,%rdi,8)
+ .byte 255,48 // pushq (%rax)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,28,255 // lcall *(%rdi,%rdi,8)
+ .byte 255 // (bad)
+ .byte 255,6 // incl (%rsi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,15 // decl (%rdi)
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,40 // ljmp *(%rax)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,32 // jmpq *(%rax)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,92,255,255 // lcall *-0x1(%rdi,%rdi,8)
+ .byte 255,79,255 // decl -0x1(%rdi)
+ .byte 255 // (bad)
+ .byte 255,65,255 // incl -0x1(%rcx)
+ .byte 255 // (bad)
+ .byte 255,51 // pushq (%rbx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 126,255 // jle 1569 <_sk_srcover_rgba_8888_hsw_8bit+0x369>
+ .byte 255 // (bad)
+ .byte 255,119,255 // pushq -0x1(%rdi)
+ .byte 255 // (bad)
+ .byte 255,111,255 // ljmp *-0x1(%rdi)
+ .byte 255 // (bad)
+ .byte 255,103,255 // jmpq *-0x1(%rdi)
+ .byte 255 // (bad)
+ .byte 255,178,255,255,255,165 // pushq -0x5a000001(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,151,255,255,255,137 // callq *-0x76000001(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_scale_1_float_hsw_8bit
.globl _sk_scale_1_float_hsw_8bit
FUNCTION(_sk_scale_1_float_hsw_8bit)
_sk_scale_1_float_hsw_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 197,250,16,16 // vmovss (%rax),%xmm2
- .byte 197,234,89,21,2,10,0,0 // vmulss 0xa02(%rip),%xmm2,%xmm2 # 1308 <_sk_xor__hsw_8bit+0xd1>
- .byte 197,250,44,194 // vcvttss2si %xmm2,%eax
- .byte 197,249,110,208 // vmovd %eax,%xmm2
- .byte 196,226,125,120,210 // vpbroadcastb %xmm2,%ymm2
- .byte 196,226,125,48,216 // vpmovzxbw %xmm0,%ymm3
+ .byte 197,250,16,32 // vmovss (%rax),%xmm4
+ .byte 197,218,89,37,126,18,0,0 // vmulss 0x127e(%rip),%xmm4,%xmm4 # 2814 <_sk_xor__hsw_8bit+0x189>
+ .byte 197,250,44,196 // vcvttss2si %xmm4,%eax
+ .byte 197,249,110,224 // vmovd %eax,%xmm4
+ .byte 196,226,125,120,228 // vpbroadcastb %xmm4,%ymm4
+ .byte 196,226,125,48,232 // vpmovzxbw %xmm0,%ymm5
.byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0
.byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0
- .byte 197,237,219,21,21,11,0,0 // vpand 0xb15(%rip),%ymm2,%ymm2 # 1440 <_sk_xor__hsw_8bit+0x209>
- .byte 197,237,213,224 // vpmullw %ymm0,%ymm2,%ymm4
- .byte 197,237,213,211 // vpmullw %ymm3,%ymm2,%ymm2
- .byte 197,237,253,211 // vpaddw %ymm3,%ymm2,%ymm2
- .byte 197,221,253,192 // vpaddw %ymm0,%ymm4,%ymm0
+ .byte 196,226,125,48,241 // vpmovzxbw %xmm1,%ymm6
+ .byte 196,227,125,57,201,1 // vextracti128 $0x1,%ymm1,%xmm1
+ .byte 196,226,125,48,201 // vpmovzxbw %xmm1,%ymm1
+ .byte 197,221,219,37,117,19,0,0 // vpand 0x1375(%rip),%ymm4,%ymm4 # 2940 <_sk_xor__hsw_8bit+0x2b5>
+ .byte 197,221,213,249 // vpmullw %ymm1,%ymm4,%ymm7
+ .byte 197,93,213,198 // vpmullw %ymm6,%ymm4,%ymm8
+ .byte 197,93,213,200 // vpmullw %ymm0,%ymm4,%ymm9
+ .byte 197,221,213,229 // vpmullw %ymm5,%ymm4,%ymm4
+ .byte 197,221,253,229 // vpaddw %ymm5,%ymm4,%ymm4
+ .byte 197,181,253,192 // vpaddw %ymm0,%ymm9,%ymm0
+ .byte 197,189,253,238 // vpaddw %ymm6,%ymm8,%ymm5
+ .byte 197,197,253,201 // vpaddw %ymm1,%ymm7,%ymm1
+ .byte 197,245,113,209,8 // vpsrlw $0x8,%ymm1,%ymm1
+ .byte 197,213,113,213,8 // vpsrlw $0x8,%ymm5,%ymm5
.byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0
- .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2
- .byte 196,227,109,56,216,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm3
- .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- .byte 197,229,103,192 // vpackuswb %ymm0,%ymm3,%ymm0
+ .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4
+ .byte 196,227,93,56,240,1 // vinserti128 $0x1,%xmm0,%ymm4,%ymm6
+ .byte 196,227,93,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm4,%ymm0
+ .byte 197,205,103,192 // vpackuswb %ymm0,%ymm6,%ymm0
+ .byte 196,227,85,56,225,1 // vinserti128 $0x1,%xmm1,%ymm5,%ymm4
+ .byte 196,227,85,70,201,49 // vperm2i128 $0x31,%ymm1,%ymm5,%ymm1
+ .byte 197,221,103,201 // vpackuswb %ymm1,%ymm4,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -56813,75 +57617,118 @@ _sk_scale_u8_hsw_8bit:
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,106 // jne 9e0 <_sk_scale_u8_hsw_8bit+0x87>
- .byte 196,162,121,48,20,2 // vpmovzxbw (%rdx,%r8,1),%xmm2
- .byte 197,233,219,21,236,12,0,0 // vpand 0xcec(%rip),%xmm2,%xmm2 # 1670 <_sk_xor__hsw_8bit+0x439>
- .byte 196,226,125,51,210 // vpmovzxwd %xmm2,%ymm2
- .byte 196,226,109,0,21,206,10,0,0 // vpshufb 0xace(%rip),%ymm2,%ymm2 # 1460 <_sk_xor__hsw_8bit+0x229>
- .byte 196,226,125,48,216 // vpmovzxbw %xmm0,%ymm3
+ .byte 15,133,191,0,0,0 // jne 1703 <_sk_scale_u8_hsw_8bit+0xe0>
+ .byte 196,161,122,111,36,2 // vmovdqu (%rdx,%r8,1),%xmm4
+ .byte 196,226,125,49,236 // vpmovzxbd %xmm4,%ymm5
+ .byte 197,249,112,228,78 // vpshufd $0x4e,%xmm4,%xmm4
+ .byte 196,226,125,49,228 // vpmovzxbd %xmm4,%ymm4
+ .byte 197,253,111,53,255,18,0,0 // vmovdqa 0x12ff(%rip),%ymm6 # 2960 <_sk_xor__hsw_8bit+0x2d5>
+ .byte 196,226,93,0,230 // vpshufb %ymm6,%ymm4,%ymm4
+ .byte 196,226,85,0,238 // vpshufb %ymm6,%ymm5,%ymm5
+ .byte 196,226,125,48,240 // vpmovzxbw %xmm0,%ymm6
.byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0
.byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0
- .byte 196,226,125,48,226 // vpmovzxbw %xmm2,%ymm4
- .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2
- .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2
- .byte 197,237,213,208 // vpmullw %ymm0,%ymm2,%ymm2
- .byte 197,221,213,227 // vpmullw %ymm3,%ymm4,%ymm4
- .byte 197,221,253,219 // vpaddw %ymm3,%ymm4,%ymm3
- .byte 197,237,253,192 // vpaddw %ymm0,%ymm2,%ymm0
+ .byte 196,226,125,48,249 // vpmovzxbw %xmm1,%ymm7
+ .byte 196,227,125,57,201,1 // vextracti128 $0x1,%ymm1,%xmm1
+ .byte 196,226,125,48,201 // vpmovzxbw %xmm1,%ymm1
+ .byte 196,98,125,48,197 // vpmovzxbw %xmm5,%ymm8
+ .byte 196,227,125,57,237,1 // vextracti128 $0x1,%ymm5,%xmm5
+ .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5
+ .byte 196,98,125,48,204 // vpmovzxbw %xmm4,%ymm9
+ .byte 196,227,125,57,228,1 // vextracti128 $0x1,%ymm4,%xmm4
+ .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
+ .byte 197,221,213,225 // vpmullw %ymm1,%ymm4,%ymm4
+ .byte 197,53,213,207 // vpmullw %ymm7,%ymm9,%ymm9
+ .byte 197,213,213,232 // vpmullw %ymm0,%ymm5,%ymm5
+ .byte 197,61,213,198 // vpmullw %ymm6,%ymm8,%ymm8
+ .byte 197,189,253,246 // vpaddw %ymm6,%ymm8,%ymm6
+ .byte 197,213,253,192 // vpaddw %ymm0,%ymm5,%ymm0
+ .byte 197,181,253,239 // vpaddw %ymm7,%ymm9,%ymm5
+ .byte 197,221,253,201 // vpaddw %ymm1,%ymm4,%ymm1
+ .byte 197,245,113,209,8 // vpsrlw $0x8,%ymm1,%ymm1
+ .byte 197,221,113,213,8 // vpsrlw $0x8,%ymm5,%ymm4
.byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0
- .byte 197,237,113,211,8 // vpsrlw $0x8,%ymm3,%ymm2
- .byte 196,227,109,56,216,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm3
- .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- .byte 197,229,103,192 // vpackuswb %ymm0,%ymm3,%ymm0
+ .byte 197,213,113,214,8 // vpsrlw $0x8,%ymm6,%ymm5
+ .byte 196,227,85,56,240,1 // vinserti128 $0x1,%xmm0,%ymm5,%ymm6
+ .byte 196,227,85,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm5,%ymm0
+ .byte 197,205,103,192 // vpackuswb %ymm0,%ymm6,%ymm0
+ .byte 196,227,93,56,233,1 // vinserti128 $0x1,%xmm1,%ymm4,%ymm5
+ .byte 196,227,93,70,201,49 // vperm2i128 $0x31,%ymm1,%ymm4,%ymm1
+ .byte 197,213,103,201 // vpackuswb %ymm1,%ymm5,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,7 // and $0x7,%r9b
- .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2
+ .byte 65,128,225,15 // and $0xf,%r9b
+ .byte 197,217,239,228 // vpxor %xmm4,%xmm4,%xmm4
.byte 65,254,201 // dec %r9b
- .byte 65,128,249,6 // cmp $0x6,%r9b
- .byte 119,139 // ja 97c <_sk_scale_u8_hsw_8bit+0x23>
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 15,135,50,255,255,255 // ja 164a <_sk_scale_u8_hsw_8bit+0x27>
.byte 65,15,182,193 // movzbl %r9b,%eax
- .byte 72,141,13,124,0,0,0 // lea 0x7c(%rip),%rcx # a78 <_sk_scale_u8_hsw_8bit+0x11f>
+ .byte 72,141,13,201,0,0,0 // lea 0xc9(%rip),%rcx # 17ec <_sk_scale_u8_hsw_8bit+0x1c9>
.byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
.byte 72,1,200 // add %rcx,%rax
.byte 255,224 // jmpq *%rax
.byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
- .byte 197,249,110,208 // vmovd %eax,%xmm2
- .byte 233,105,255,255,255 // jmpq 97c <_sk_scale_u8_hsw_8bit+0x23>
- .byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
- .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2
- .byte 197,233,196,208,2 // vpinsrw $0x2,%eax,%xmm2,%xmm2
+ .byte 197,249,110,224 // vmovd %eax,%xmm4
+ .byte 233,16,255,255,255 // jmpq 164a <_sk_scale_u8_hsw_8bit+0x27>
+ .byte 197,217,239,228 // vpxor %xmm4,%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,2,2 // vpinsrb $0x2,0x2(%rdx,%r8,1),%xmm4,%xmm4
.byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
- .byte 197,249,110,216 // vmovd %eax,%xmm3
- .byte 196,226,121,48,219 // vpmovzxbw %xmm3,%xmm3
- .byte 196,227,105,2,211,1 // vpblendd $0x1,%xmm3,%xmm2,%xmm2
- .byte 233,65,255,255,255 // jmpq 97c <_sk_scale_u8_hsw_8bit+0x23>
- .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax
- .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2
- .byte 197,233,196,208,6 // vpinsrw $0x6,%eax,%xmm2,%xmm2
- .byte 66,15,182,68,2,5 // movzbl 0x5(%rdx,%r8,1),%eax
- .byte 197,233,196,208,5 // vpinsrw $0x5,%eax,%xmm2,%xmm2
- .byte 66,15,182,68,2,4 // movzbl 0x4(%rdx,%r8,1),%eax
- .byte 197,233,196,208,4 // vpinsrw $0x4,%eax,%xmm2,%xmm2
- .byte 196,161,121,110,28,2 // vmovd (%rdx,%r8,1),%xmm3
- .byte 196,226,121,48,219 // vpmovzxbw %xmm3,%xmm3
- .byte 196,227,97,2,210,12 // vpblendd $0xc,%xmm2,%xmm3,%xmm2
- .byte 233,6,255,255,255 // jmpq 97c <_sk_scale_u8_hsw_8bit+0x23>
- .byte 102,144 // xchg %ax,%ax
- .byte 141 // (bad)
+ .byte 197,249,110,232 // vmovd %eax,%xmm5
+ .byte 196,227,89,14,229,1 // vpblendw $0x1,%xmm5,%xmm4,%xmm4
+ .byte 233,240,254,255,255 // jmpq 164a <_sk_scale_u8_hsw_8bit+0x27>
+ .byte 197,217,239,228 // vpxor %xmm4,%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,6,6 // vpinsrb $0x6,0x6(%rdx,%r8,1),%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,5,5 // vpinsrb $0x5,0x5(%rdx,%r8,1),%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,4,4 // vpinsrb $0x4,0x4(%rdx,%r8,1),%xmm4,%xmm4
+ .byte 196,161,121,110,44,2 // vmovd (%rdx,%r8,1),%xmm5
+ .byte 196,227,89,2,229,1 // vpblendd $0x1,%xmm5,%xmm4,%xmm4
+ .byte 233,195,254,255,255 // jmpq 164a <_sk_scale_u8_hsw_8bit+0x27>
+ .byte 197,217,239,228 // vpxor %xmm4,%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,10,10 // vpinsrb $0xa,0xa(%rdx,%r8,1),%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,9,9 // vpinsrb $0x9,0x9(%rdx,%r8,1),%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,8,8 // vpinsrb $0x8,0x8(%rdx,%r8,1),%xmm4,%xmm4
+ .byte 196,161,122,126,44,2 // vmovq (%rdx,%r8,1),%xmm5
+ .byte 196,227,81,2,228,12 // vpblendd $0xc,%xmm4,%xmm5,%xmm4
+ .byte 233,150,254,255,255 // jmpq 164a <_sk_scale_u8_hsw_8bit+0x27>
+ .byte 197,217,239,228 // vpxor %xmm4,%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,14,14 // vpinsrb $0xe,0xe(%rdx,%r8,1),%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,13,13 // vpinsrb $0xd,0xd(%rdx,%r8,1),%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,12,12 // vpinsrb $0xc,0xc(%rdx,%r8,1),%xmm4,%xmm4
+ .byte 196,161,122,126,44,2 // vmovq (%rdx,%r8,1),%xmm5
+ .byte 196,163,81,34,108,2,8,2 // vpinsrd $0x2,0x8(%rdx,%r8,1),%xmm5,%xmm5
+ .byte 196,227,81,2,228,8 // vpblendd $0x8,%xmm4,%xmm5,%xmm4
+ .byte 233,97,254,255,255 // jmpq 164a <_sk_scale_u8_hsw_8bit+0x27>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 64,255 // rex (bad)
.byte 255 // (bad)
+ .byte 255,90,255 // lcall *-0x1(%rdx)
.byte 255 // (bad)
- .byte 255,170,255,255,255,155 // ljmp *-0x64000001(%rdx)
+ .byte 255,78,255 // decl -0x1(%rsi)
.byte 255 // (bad)
+ .byte 255,138,255,255,255,130 // decl -0x7d000001(%rdx)
.byte 255 // (bad)
.byte 255 // (bad)
- .byte 232,255,255,255,221 // callq ffffffffde000a88 <_sk_xor__hsw_8bit+0xffffffffddfff851>
.byte 255 // (bad)
+ .byte 122,255 // jp 1801 <_sk_scale_u8_hsw_8bit+0x1de>
.byte 255 // (bad)
- .byte 255,210 // callq *%rdx
+ .byte 255,110,255 // ljmp *-0x1(%rsi)
.byte 255 // (bad)
+ .byte 255,183,255,255,255,175 // pushq -0x50000001(%rdi)
.byte 255 // (bad)
- .byte 255,195 // inc %ebx
+ .byte 255 // (bad)
+ .byte 255,167,255,255,255,155 // jmpq *-0x64000001(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,228 // jmpq *%rsp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 220,255 // fdivr %st,%st(7)
+ .byte 255 // (bad)
+ .byte 255,212 // callq *%rsp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,200 // dec %eax
.byte 255 // (bad)
.byte 255 // (bad)
.byte 255 // .byte 0xff
@@ -56891,42 +57738,67 @@ HIDDEN _sk_lerp_1_float_hsw_8bit
FUNCTION(_sk_lerp_1_float_hsw_8bit)
_sk_lerp_1_float_hsw_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 197,250,16,16 // vmovss (%rax),%xmm2
- .byte 197,234,89,21,106,8,0,0 // vmulss 0x86a(%rip),%xmm2,%xmm2 # 130c <_sk_xor__hsw_8bit+0xd5>
- .byte 197,250,44,194 // vcvttss2si %xmm2,%eax
- .byte 197,249,110,208 // vmovd %eax,%xmm2
- .byte 196,226,125,120,210 // vpbroadcastb %xmm2,%ymm2
- .byte 196,226,125,48,216 // vpmovzxbw %xmm0,%ymm3
+ .byte 197,250,16,32 // vmovss (%rax),%xmm4
+ .byte 197,218,89,37,226,15,0,0 // vmulss 0xfe2(%rip),%xmm4,%xmm4 # 2818 <_sk_xor__hsw_8bit+0x18d>
+ .byte 197,250,44,196 // vcvttss2si %xmm4,%eax
+ .byte 197,249,110,224 // vmovd %eax,%xmm4
+ .byte 196,226,125,120,228 // vpbroadcastb %xmm4,%ymm4
+ .byte 196,226,125,48,232 // vpmovzxbw %xmm0,%ymm5
.byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0
.byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0
- .byte 197,237,219,37,185,9,0,0 // vpand 0x9b9(%rip),%ymm2,%ymm4 # 1480 <_sk_xor__hsw_8bit+0x249>
- .byte 197,221,213,232 // vpmullw %ymm0,%ymm4,%ymm5
- .byte 197,221,213,227 // vpmullw %ymm3,%ymm4,%ymm4
- .byte 197,221,253,219 // vpaddw %ymm3,%ymm4,%ymm3
- .byte 197,213,253,192 // vpaddw %ymm0,%ymm5,%ymm0
+ .byte 196,226,125,48,241 // vpmovzxbw %xmm1,%ymm6
+ .byte 196,227,125,57,201,1 // vextracti128 $0x1,%ymm1,%xmm1
+ .byte 196,226,125,48,201 // vpmovzxbw %xmm1,%ymm1
+ .byte 197,221,219,61,21,17,0,0 // vpand 0x1115(%rip),%ymm4,%ymm7 # 2980 <_sk_xor__hsw_8bit+0x2f5>
+ .byte 197,69,213,193 // vpmullw %ymm1,%ymm7,%ymm8
+ .byte 197,69,213,206 // vpmullw %ymm6,%ymm7,%ymm9
+ .byte 197,69,213,208 // vpmullw %ymm0,%ymm7,%ymm10
+ .byte 197,197,213,253 // vpmullw %ymm5,%ymm7,%ymm7
+ .byte 197,197,253,237 // vpaddw %ymm5,%ymm7,%ymm5
+ .byte 197,173,253,192 // vpaddw %ymm0,%ymm10,%ymm0
+ .byte 197,181,253,246 // vpaddw %ymm6,%ymm9,%ymm6
+ .byte 197,189,253,201 // vpaddw %ymm1,%ymm8,%ymm1
+ .byte 197,245,113,209,8 // vpsrlw $0x8,%ymm1,%ymm1
+ .byte 197,205,113,214,8 // vpsrlw $0x8,%ymm6,%ymm6
.byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0
- .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3
- .byte 196,227,101,56,224,1 // vinserti128 $0x1,%xmm0,%ymm3,%ymm4
- .byte 196,227,101,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm3,%ymm0
- .byte 197,221,103,192 // vpackuswb %ymm0,%ymm4,%ymm0
- .byte 197,229,118,219 // vpcmpeqd %ymm3,%ymm3,%ymm3
- .byte 197,237,239,211 // vpxor %ymm3,%ymm2,%ymm2
- .byte 196,226,125,48,217 // vpmovzxbw %xmm1,%ymm3
- .byte 196,227,125,57,204,1 // vextracti128 $0x1,%ymm1,%xmm4
- .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
+ .byte 197,213,113,213,8 // vpsrlw $0x8,%ymm5,%ymm5
+ .byte 196,227,85,56,248,1 // vinserti128 $0x1,%xmm0,%ymm5,%ymm7
+ .byte 196,227,85,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm5,%ymm0
+ .byte 197,69,103,192 // vpackuswb %ymm0,%ymm7,%ymm8
+ .byte 196,227,77,56,233,1 // vinserti128 $0x1,%xmm1,%ymm6,%ymm5
+ .byte 196,227,77,70,201,49 // vperm2i128 $0x31,%ymm1,%ymm6,%ymm1
+ .byte 197,213,103,201 // vpackuswb %ymm1,%ymm5,%ymm1
+ .byte 197,213,118,237 // vpcmpeqd %ymm5,%ymm5,%ymm5
+ .byte 197,221,239,229 // vpxor %ymm5,%ymm4,%ymm4
.byte 196,226,125,48,234 // vpmovzxbw %xmm2,%ymm5
- .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2
- .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2
- .byte 197,237,213,212 // vpmullw %ymm4,%ymm2,%ymm2
- .byte 197,213,213,235 // vpmullw %ymm3,%ymm5,%ymm5
- .byte 197,213,253,219 // vpaddw %ymm3,%ymm5,%ymm3
- .byte 197,237,253,212 // vpaddw %ymm4,%ymm2,%ymm2
- .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2
- .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3
- .byte 196,227,101,56,226,1 // vinserti128 $0x1,%xmm2,%ymm3,%ymm4
- .byte 196,227,101,70,210,49 // vperm2i128 $0x31,%ymm2,%ymm3,%ymm2
- .byte 197,221,103,210 // vpackuswb %ymm2,%ymm4,%ymm2
- .byte 197,237,252,192 // vpaddb %ymm0,%ymm2,%ymm0
+ .byte 196,227,125,57,214,1 // vextracti128 $0x1,%ymm2,%xmm6
+ .byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6
+ .byte 196,226,125,48,251 // vpmovzxbw %xmm3,%ymm7
+ .byte 196,227,125,57,216,1 // vextracti128 $0x1,%ymm3,%xmm0
+ .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0
+ .byte 196,98,125,48,204 // vpmovzxbw %xmm4,%ymm9
+ .byte 196,227,125,57,228,1 // vextracti128 $0x1,%ymm4,%xmm4
+ .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
+ .byte 197,93,213,208 // vpmullw %ymm0,%ymm4,%ymm10
+ .byte 197,53,213,223 // vpmullw %ymm7,%ymm9,%ymm11
+ .byte 197,221,213,230 // vpmullw %ymm6,%ymm4,%ymm4
+ .byte 197,53,213,205 // vpmullw %ymm5,%ymm9,%ymm9
+ .byte 197,181,253,237 // vpaddw %ymm5,%ymm9,%ymm5
+ .byte 197,221,253,230 // vpaddw %ymm6,%ymm4,%ymm4
+ .byte 197,165,253,247 // vpaddw %ymm7,%ymm11,%ymm6
+ .byte 197,173,253,192 // vpaddw %ymm0,%ymm10,%ymm0
+ .byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0
+ .byte 197,205,113,214,8 // vpsrlw $0x8,%ymm6,%ymm6
+ .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4
+ .byte 197,213,113,213,8 // vpsrlw $0x8,%ymm5,%ymm5
+ .byte 196,227,85,56,252,1 // vinserti128 $0x1,%xmm4,%ymm5,%ymm7
+ .byte 196,227,85,70,228,49 // vperm2i128 $0x31,%ymm4,%ymm5,%ymm4
+ .byte 197,197,103,228 // vpackuswb %ymm4,%ymm7,%ymm4
+ .byte 196,227,77,56,232,1 // vinserti128 $0x1,%xmm0,%ymm6,%ymm5
+ .byte 196,227,77,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm6,%ymm0
+ .byte 197,213,103,232 // vpackuswb %ymm0,%ymm5,%ymm5
+ .byte 196,193,93,252,192 // vpaddb %ymm8,%ymm4,%ymm0
+ .byte 197,213,252,201 // vpaddb %ymm1,%ymm5,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -56942,93 +57814,153 @@ _sk_lerp_u8_hsw_8bit:
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 15,133,192,0,0,0 // jne c2c <_sk_lerp_u8_hsw_8bit+0xe1>
- .byte 196,162,121,48,20,2 // vpmovzxbw (%rdx,%r8,1),%xmm2
- .byte 197,233,219,21,6,11,0,0 // vpand 0xb06(%rip),%xmm2,%xmm2 # 1680 <_sk_xor__hsw_8bit+0x449>
- .byte 196,226,125,51,210 // vpmovzxwd %xmm2,%ymm2
- .byte 196,226,109,0,21,24,9,0,0 // vpshufb 0x918(%rip),%ymm2,%ymm2 # 14a0 <_sk_xor__hsw_8bit+0x269>
- .byte 196,226,125,48,216 // vpmovzxbw %xmm0,%ymm3
+ .byte 15,133,106,1,0,0 // jne 1ae3 <_sk_lerp_u8_hsw_8bit+0x18b>
+ .byte 196,161,122,111,36,2 // vmovdqu (%rdx,%r8,1),%xmm4
+ .byte 196,226,125,49,236 // vpmovzxbd %xmm4,%ymm5
+ .byte 197,249,112,228,78 // vpshufd $0x4e,%xmm4,%xmm4
+ .byte 196,226,125,49,228 // vpmovzxbd %xmm4,%ymm4
+ .byte 197,253,111,53,10,16,0,0 // vmovdqa 0x100a(%rip),%ymm6 # 29a0 <_sk_xor__hsw_8bit+0x315>
+ .byte 196,98,93,0,206 // vpshufb %ymm6,%ymm4,%ymm9
+ .byte 196,98,85,0,222 // vpshufb %ymm6,%ymm5,%ymm11
+ .byte 196,226,125,48,240 // vpmovzxbw %xmm0,%ymm6
.byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0
.byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0
- .byte 196,226,125,48,226 // vpmovzxbw %xmm2,%ymm4
- .byte 196,227,125,57,213,1 // vextracti128 $0x1,%ymm2,%xmm5
+ .byte 196,226,125,48,249 // vpmovzxbw %xmm1,%ymm7
+ .byte 196,227,125,57,201,1 // vextracti128 $0x1,%ymm1,%xmm1
+ .byte 196,226,125,48,201 // vpmovzxbw %xmm1,%ymm1
+ .byte 196,66,125,48,195 // vpmovzxbw %xmm11,%ymm8
+ .byte 196,99,125,57,220,1 // vextracti128 $0x1,%ymm11,%xmm4
+ .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
+ .byte 196,66,125,48,209 // vpmovzxbw %xmm9,%ymm10
+ .byte 196,99,125,57,205,1 // vextracti128 $0x1,%ymm9,%xmm5
+ .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5
+ .byte 197,213,213,233 // vpmullw %ymm1,%ymm5,%ymm5
+ .byte 197,45,213,215 // vpmullw %ymm7,%ymm10,%ymm10
+ .byte 197,221,213,224 // vpmullw %ymm0,%ymm4,%ymm4
+ .byte 197,61,213,198 // vpmullw %ymm6,%ymm8,%ymm8
+ .byte 197,189,253,246 // vpaddw %ymm6,%ymm8,%ymm6
+ .byte 197,221,253,192 // vpaddw %ymm0,%ymm4,%ymm0
+ .byte 197,173,253,231 // vpaddw %ymm7,%ymm10,%ymm4
+ .byte 197,213,253,201 // vpaddw %ymm1,%ymm5,%ymm1
+ .byte 197,245,113,209,8 // vpsrlw $0x8,%ymm1,%ymm1
+ .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4
+ .byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0
+ .byte 197,213,113,214,8 // vpsrlw $0x8,%ymm6,%ymm5
+ .byte 196,227,85,56,240,1 // vinserti128 $0x1,%xmm0,%ymm5,%ymm6
+ .byte 196,227,85,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm5,%ymm0
+ .byte 197,77,103,208 // vpackuswb %ymm0,%ymm6,%ymm10
+ .byte 196,227,93,56,233,1 // vinserti128 $0x1,%xmm1,%ymm4,%ymm5
+ .byte 196,227,93,70,201,49 // vperm2i128 $0x31,%ymm1,%ymm4,%ymm1
+ .byte 197,213,103,201 // vpackuswb %ymm1,%ymm5,%ymm1
+ .byte 197,221,118,228 // vpcmpeqd %ymm4,%ymm4,%ymm4
+ .byte 197,181,239,236 // vpxor %ymm4,%ymm9,%ymm5
+ .byte 197,165,239,228 // vpxor %ymm4,%ymm11,%ymm4
+ .byte 196,226,125,48,242 // vpmovzxbw %xmm2,%ymm6
+ .byte 196,227,125,57,215,1 // vextracti128 $0x1,%ymm2,%xmm7
+ .byte 196,226,125,48,255 // vpmovzxbw %xmm7,%ymm7
+ .byte 196,98,125,48,195 // vpmovzxbw %xmm3,%ymm8
+ .byte 196,227,125,57,216,1 // vextracti128 $0x1,%ymm3,%xmm0
+ .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0
+ .byte 196,98,125,48,204 // vpmovzxbw %xmm4,%ymm9
+ .byte 196,227,125,57,228,1 // vextracti128 $0x1,%ymm4,%xmm4
+ .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
+ .byte 196,98,125,48,221 // vpmovzxbw %xmm5,%ymm11
+ .byte 196,227,125,57,237,1 // vextracti128 $0x1,%ymm5,%xmm5
.byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5
.byte 197,213,213,232 // vpmullw %ymm0,%ymm5,%ymm5
- .byte 197,221,213,227 // vpmullw %ymm3,%ymm4,%ymm4
- .byte 197,221,253,219 // vpaddw %ymm3,%ymm4,%ymm3
+ .byte 196,65,37,213,216 // vpmullw %ymm8,%ymm11,%ymm11
+ .byte 197,221,213,231 // vpmullw %ymm7,%ymm4,%ymm4
+ .byte 197,53,213,206 // vpmullw %ymm6,%ymm9,%ymm9
+ .byte 197,181,253,246 // vpaddw %ymm6,%ymm9,%ymm6
+ .byte 197,221,253,231 // vpaddw %ymm7,%ymm4,%ymm4
+ .byte 196,193,37,253,248 // vpaddw %ymm8,%ymm11,%ymm7
.byte 197,213,253,192 // vpaddw %ymm0,%ymm5,%ymm0
.byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0
- .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3
- .byte 196,227,101,56,224,1 // vinserti128 $0x1,%xmm0,%ymm3,%ymm4
- .byte 196,227,101,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm3,%ymm0
- .byte 197,221,103,192 // vpackuswb %ymm0,%ymm4,%ymm0
- .byte 197,229,118,219 // vpcmpeqd %ymm3,%ymm3,%ymm3
- .byte 197,237,239,211 // vpxor %ymm3,%ymm2,%ymm2
- .byte 196,226,125,48,217 // vpmovzxbw %xmm1,%ymm3
- .byte 196,227,125,57,204,1 // vextracti128 $0x1,%ymm1,%xmm4
- .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
- .byte 196,226,125,48,234 // vpmovzxbw %xmm2,%ymm5
- .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2
- .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2
- .byte 197,237,213,212 // vpmullw %ymm4,%ymm2,%ymm2
- .byte 197,213,213,235 // vpmullw %ymm3,%ymm5,%ymm5
- .byte 197,213,253,219 // vpaddw %ymm3,%ymm5,%ymm3
- .byte 197,237,253,212 // vpaddw %ymm4,%ymm2,%ymm2
- .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2
- .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3
- .byte 196,227,101,56,226,1 // vinserti128 $0x1,%xmm2,%ymm3,%ymm4
- .byte 196,227,101,70,210,49 // vperm2i128 $0x31,%ymm2,%ymm3,%ymm2
- .byte 197,221,103,210 // vpackuswb %ymm2,%ymm4,%ymm2
- .byte 197,237,252,192 // vpaddb %ymm0,%ymm2,%ymm0
+ .byte 197,213,113,215,8 // vpsrlw $0x8,%ymm7,%ymm5
+ .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4
+ .byte 197,205,113,214,8 // vpsrlw $0x8,%ymm6,%ymm6
+ .byte 196,227,77,56,252,1 // vinserti128 $0x1,%xmm4,%ymm6,%ymm7
+ .byte 196,227,77,70,228,49 // vperm2i128 $0x31,%ymm4,%ymm6,%ymm4
+ .byte 197,197,103,228 // vpackuswb %ymm4,%ymm7,%ymm4
+ .byte 196,227,85,56,240,1 // vinserti128 $0x1,%xmm0,%ymm5,%ymm6
+ .byte 196,227,85,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm5,%ymm0
+ .byte 197,205,103,232 // vpackuswb %ymm0,%ymm6,%ymm5
+ .byte 196,193,93,252,194 // vpaddb %ymm10,%ymm4,%ymm0
+ .byte 197,213,252,201 // vpaddb %ymm1,%ymm5,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,7 // and $0x7,%r9b
- .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2
+ .byte 65,128,225,15 // and $0xf,%r9b
+ .byte 197,217,239,228 // vpxor %xmm4,%xmm4,%xmm4
.byte 65,254,201 // dec %r9b
- .byte 65,128,249,6 // cmp $0x6,%r9b
- .byte 15,135,49,255,255,255 // ja b72 <_sk_lerp_u8_hsw_8bit+0x27>
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 15,135,135,254,255,255 // ja 197f <_sk_lerp_u8_hsw_8bit+0x27>
.byte 65,15,182,193 // movzbl %r9b,%eax
- .byte 72,141,13,124,0,0,0 // lea 0x7c(%rip),%rcx # cc8 <_sk_lerp_u8_hsw_8bit+0x17d>
+ .byte 72,141,13,201,0,0,0 // lea 0xc9(%rip),%rcx # 1bcc <_sk_lerp_u8_hsw_8bit+0x274>
.byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
.byte 72,1,200 // add %rcx,%rax
.byte 255,224 // jmpq *%rax
.byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
- .byte 197,249,110,208 // vmovd %eax,%xmm2
- .byte 233,15,255,255,255 // jmpq b72 <_sk_lerp_u8_hsw_8bit+0x27>
- .byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
- .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2
- .byte 197,233,196,208,2 // vpinsrw $0x2,%eax,%xmm2,%xmm2
+ .byte 197,249,110,224 // vmovd %eax,%xmm4
+ .byte 233,101,254,255,255 // jmpq 197f <_sk_lerp_u8_hsw_8bit+0x27>
+ .byte 197,217,239,228 // vpxor %xmm4,%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,2,2 // vpinsrb $0x2,0x2(%rdx,%r8,1),%xmm4,%xmm4
.byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
- .byte 197,249,110,216 // vmovd %eax,%xmm3
- .byte 196,226,121,48,219 // vpmovzxbw %xmm3,%xmm3
- .byte 196,227,105,2,211,1 // vpblendd $0x1,%xmm3,%xmm2,%xmm2
- .byte 233,231,254,255,255 // jmpq b72 <_sk_lerp_u8_hsw_8bit+0x27>
- .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax
- .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2
- .byte 197,233,196,208,6 // vpinsrw $0x6,%eax,%xmm2,%xmm2
- .byte 66,15,182,68,2,5 // movzbl 0x5(%rdx,%r8,1),%eax
- .byte 197,233,196,208,5 // vpinsrw $0x5,%eax,%xmm2,%xmm2
- .byte 66,15,182,68,2,4 // movzbl 0x4(%rdx,%r8,1),%eax
- .byte 197,233,196,208,4 // vpinsrw $0x4,%eax,%xmm2,%xmm2
- .byte 196,161,121,110,28,2 // vmovd (%rdx,%r8,1),%xmm3
- .byte 196,226,121,48,219 // vpmovzxbw %xmm3,%xmm3
- .byte 196,227,97,2,210,12 // vpblendd $0xc,%xmm2,%xmm3,%xmm2
- .byte 233,172,254,255,255 // jmpq b72 <_sk_lerp_u8_hsw_8bit+0x27>
- .byte 102,144 // xchg %ax,%ax
- .byte 141 // (bad)
+ .byte 197,249,110,232 // vmovd %eax,%xmm5
+ .byte 196,227,89,14,229,1 // vpblendw $0x1,%xmm5,%xmm4,%xmm4
+ .byte 233,69,254,255,255 // jmpq 197f <_sk_lerp_u8_hsw_8bit+0x27>
+ .byte 197,217,239,228 // vpxor %xmm4,%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,6,6 // vpinsrb $0x6,0x6(%rdx,%r8,1),%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,5,5 // vpinsrb $0x5,0x5(%rdx,%r8,1),%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,4,4 // vpinsrb $0x4,0x4(%rdx,%r8,1),%xmm4,%xmm4
+ .byte 196,161,121,110,44,2 // vmovd (%rdx,%r8,1),%xmm5
+ .byte 196,227,89,2,229,1 // vpblendd $0x1,%xmm5,%xmm4,%xmm4
+ .byte 233,24,254,255,255 // jmpq 197f <_sk_lerp_u8_hsw_8bit+0x27>
+ .byte 197,217,239,228 // vpxor %xmm4,%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,10,10 // vpinsrb $0xa,0xa(%rdx,%r8,1),%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,9,9 // vpinsrb $0x9,0x9(%rdx,%r8,1),%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,8,8 // vpinsrb $0x8,0x8(%rdx,%r8,1),%xmm4,%xmm4
+ .byte 196,161,122,126,44,2 // vmovq (%rdx,%r8,1),%xmm5
+ .byte 196,227,81,2,228,12 // vpblendd $0xc,%xmm4,%xmm5,%xmm4
+ .byte 233,235,253,255,255 // jmpq 197f <_sk_lerp_u8_hsw_8bit+0x27>
+ .byte 197,217,239,228 // vpxor %xmm4,%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,14,14 // vpinsrb $0xe,0xe(%rdx,%r8,1),%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,13,13 // vpinsrb $0xd,0xd(%rdx,%r8,1),%xmm4,%xmm4
+ .byte 196,163,89,32,100,2,12,12 // vpinsrb $0xc,0xc(%rdx,%r8,1),%xmm4,%xmm4
+ .byte 196,161,122,126,44,2 // vmovq (%rdx,%r8,1),%xmm5
+ .byte 196,163,81,34,108,2,8,2 // vpinsrd $0x2,0x8(%rdx,%r8,1),%xmm5,%xmm5
+ .byte 196,227,81,2,228,8 // vpblendd $0x8,%xmm4,%xmm5,%xmm4
+ .byte 233,182,253,255,255 // jmpq 197f <_sk_lerp_u8_hsw_8bit+0x27>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 64,255 // rex (bad)
.byte 255 // (bad)
+ .byte 255,90,255 // lcall *-0x1(%rdx)
.byte 255 // (bad)
- .byte 255,170,255,255,255,155 // ljmp *-0x64000001(%rdx)
+ .byte 255,78,255 // decl -0x1(%rsi)
.byte 255 // (bad)
+ .byte 255,138,255,255,255,130 // decl -0x7d000001(%rdx)
.byte 255 // (bad)
.byte 255 // (bad)
- .byte 232,255,255,255,221 // callq ffffffffde000cd8 <_sk_xor__hsw_8bit+0xffffffffddfffaa1>
.byte 255 // (bad)
+ .byte 122,255 // jp 1be1 <_sk_lerp_u8_hsw_8bit+0x289>
.byte 255 // (bad)
- .byte 255,210 // callq *%rdx
+ .byte 255,110,255 // ljmp *-0x1(%rsi)
.byte 255 // (bad)
+ .byte 255,183,255,255,255,175 // pushq -0x50000001(%rdi)
.byte 255 // (bad)
- .byte 255,195 // inc %ebx
+ .byte 255 // (bad)
+ .byte 255,167,255,255,255,155 // jmpq *-0x64000001(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,228 // jmpq *%rsp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 220,255 // fdivr %st,%st(7)
+ .byte 255 // (bad)
+ .byte 255,212 // callq *%rsp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,200 // dec %eax
.byte 255 // (bad)
.byte 255 // (bad)
.byte 255 // .byte 0xff
@@ -57038,7 +57970,8 @@ HIDDEN _sk_move_src_dst_hsw_8bit
FUNCTION(_sk_move_src_dst_hsw_8bit)
_sk_move_src_dst_hsw_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 197,252,40,200 // vmovaps %ymm0,%ymm1
+ .byte 197,252,40,208 // vmovaps %ymm0,%ymm2
+ .byte 197,252,40,217 // vmovaps %ymm1,%ymm3
.byte 255,224 // jmpq *%rax
HIDDEN _sk_move_dst_src_hsw_8bit
@@ -57046,7 +57979,8 @@ HIDDEN _sk_move_dst_src_hsw_8bit
FUNCTION(_sk_move_dst_src_hsw_8bit)
_sk_move_dst_src_hsw_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 197,252,40,193 // vmovaps %ymm1,%ymm0
+ .byte 197,252,40,194 // vmovaps %ymm2,%ymm0
+ .byte 197,252,40,203 // vmovaps %ymm3,%ymm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_black_color_hsw_8bit
@@ -57054,7 +57988,8 @@ HIDDEN _sk_black_color_hsw_8bit
FUNCTION(_sk_black_color_hsw_8bit)
_sk_black_color_hsw_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 196,226,125,25,5,145,9,0,0 // vbroadcastsd 0x991(%rip),%ymm0 # 1690 <_sk_xor__hsw_8bit+0x459>
+ .byte 196,226,125,24,5,241,11,0,0 // vbroadcastss 0xbf1(%rip),%ymm0 # 281c <_sk_xor__hsw_8bit+0x191>
+ .byte 197,252,40,200 // vmovaps %ymm0,%ymm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_white_color_hsw_8bit
@@ -57063,6 +57998,7 @@ FUNCTION(_sk_white_color_hsw_8bit)
_sk_white_color_hsw_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 197,253,118,192 // vpcmpeqd %ymm0,%ymm0,%ymm0
+ .byte 197,245,118,201 // vpcmpeqd %ymm1,%ymm1,%ymm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_clear_hsw_8bit
@@ -57071,48 +58007,83 @@ FUNCTION(_sk_clear_hsw_8bit)
_sk_clear_hsw_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 197,252,87,192 // vxorps %ymm0,%ymm0,%ymm0
+ .byte 197,244,87,201 // vxorps %ymm1,%ymm1,%ymm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_srcatop_hsw_8bit
.globl _sk_srcatop_hsw_8bit
FUNCTION(_sk_srcatop_hsw_8bit)
_sk_srcatop_hsw_8bit:
- .byte 197,253,111,21,167,7,0,0 // vmovdqa 0x7a7(%rip),%ymm2 # 14c0 <_sk_xor__hsw_8bit+0x289>
- .byte 196,226,117,0,218 // vpshufb %ymm2,%ymm1,%ymm3
- .byte 196,226,125,48,224 // vpmovzxbw %xmm0,%ymm4
- .byte 196,227,125,57,197,1 // vextracti128 $0x1,%ymm0,%xmm5
+ .byte 197,125,111,5,111,13,0,0 // vmovdqa 0xd6f(%rip),%ymm8 # 29c0 <_sk_xor__hsw_8bit+0x335>
+ .byte 196,194,101,0,224 // vpshufb %ymm8,%ymm3,%ymm4
+ .byte 196,194,109,0,232 // vpshufb %ymm8,%ymm2,%ymm5
+ .byte 196,98,125,48,208 // vpmovzxbw %xmm0,%ymm10
+ .byte 196,227,125,57,198,1 // vextracti128 $0x1,%ymm0,%xmm6
+ .byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6
+ .byte 196,98,125,48,201 // vpmovzxbw %xmm1,%ymm9
+ .byte 196,227,125,57,207,1 // vextracti128 $0x1,%ymm1,%xmm7
+ .byte 196,226,125,48,255 // vpmovzxbw %xmm7,%ymm7
+ .byte 196,98,125,48,221 // vpmovzxbw %xmm5,%ymm11
+ .byte 196,227,125,57,237,1 // vextracti128 $0x1,%ymm5,%xmm5
.byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5
- .byte 196,226,125,48,243 // vpmovzxbw %xmm3,%ymm6
- .byte 196,227,125,57,219,1 // vextracti128 $0x1,%ymm3,%xmm3
- .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3
- .byte 197,229,213,221 // vpmullw %ymm5,%ymm3,%ymm3
- .byte 197,205,213,244 // vpmullw %ymm4,%ymm6,%ymm6
- .byte 197,205,253,228 // vpaddw %ymm4,%ymm6,%ymm4
- .byte 197,229,253,221 // vpaddw %ymm5,%ymm3,%ymm3
- .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3
- .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4
- .byte 196,227,93,56,235,1 // vinserti128 $0x1,%xmm3,%ymm4,%ymm5
- .byte 196,227,93,70,219,49 // vperm2i128 $0x31,%ymm3,%ymm4,%ymm3
- .byte 197,213,103,219 // vpackuswb %ymm3,%ymm5,%ymm3
- .byte 196,226,125,0,194 // vpshufb %ymm2,%ymm0,%ymm0
- .byte 197,237,118,210 // vpcmpeqd %ymm2,%ymm2,%ymm2
- .byte 197,253,239,194 // vpxor %ymm2,%ymm0,%ymm0
- .byte 196,226,125,48,209 // vpmovzxbw %xmm1,%ymm2
- .byte 196,227,125,57,204,1 // vextracti128 $0x1,%ymm1,%xmm4
+ .byte 196,98,125,48,228 // vpmovzxbw %xmm4,%ymm12
+ .byte 196,227,125,57,228,1 // vextracti128 $0x1,%ymm4,%xmm4
.byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
- .byte 196,226,125,48,232 // vpmovzxbw %xmm0,%ymm5
+ .byte 197,221,213,231 // vpmullw %ymm7,%ymm4,%ymm4
+ .byte 196,65,29,213,225 // vpmullw %ymm9,%ymm12,%ymm12
+ .byte 197,213,213,238 // vpmullw %ymm6,%ymm5,%ymm5
+ .byte 196,65,37,213,218 // vpmullw %ymm10,%ymm11,%ymm11
+ .byte 196,65,37,253,210 // vpaddw %ymm10,%ymm11,%ymm10
+ .byte 197,213,253,238 // vpaddw %ymm6,%ymm5,%ymm5
+ .byte 196,193,29,253,241 // vpaddw %ymm9,%ymm12,%ymm6
+ .byte 197,221,253,231 // vpaddw %ymm7,%ymm4,%ymm4
+ .byte 197,197,113,212,8 // vpsrlw $0x8,%ymm4,%ymm7
+ .byte 197,205,113,214,8 // vpsrlw $0x8,%ymm6,%ymm6
+ .byte 197,221,113,213,8 // vpsrlw $0x8,%ymm5,%ymm4
+ .byte 196,193,85,113,210,8 // vpsrlw $0x8,%ymm10,%ymm5
+ .byte 196,99,85,56,204,1 // vinserti128 $0x1,%xmm4,%ymm5,%ymm9
+ .byte 196,227,85,70,228,49 // vperm2i128 $0x31,%ymm4,%ymm5,%ymm4
+ .byte 197,53,103,204 // vpackuswb %ymm4,%ymm9,%ymm9
+ .byte 196,227,77,56,239,1 // vinserti128 $0x1,%xmm7,%ymm6,%ymm5
+ .byte 196,227,77,70,247,49 // vperm2i128 $0x31,%ymm7,%ymm6,%ymm6
+ .byte 197,213,103,238 // vpackuswb %ymm6,%ymm5,%ymm5
+ .byte 196,194,125,0,192 // vpshufb %ymm8,%ymm0,%ymm0
+ .byte 196,194,117,0,200 // vpshufb %ymm8,%ymm1,%ymm1
+ .byte 197,205,118,246 // vpcmpeqd %ymm6,%ymm6,%ymm6
+ .byte 197,245,239,206 // vpxor %ymm6,%ymm1,%ymm1
+ .byte 197,253,239,198 // vpxor %ymm6,%ymm0,%ymm0
+ .byte 196,226,125,48,242 // vpmovzxbw %xmm2,%ymm6
+ .byte 196,227,125,57,215,1 // vextracti128 $0x1,%ymm2,%xmm7
+ .byte 196,226,125,48,255 // vpmovzxbw %xmm7,%ymm7
+ .byte 196,98,125,48,195 // vpmovzxbw %xmm3,%ymm8
+ .byte 196,227,125,57,220,1 // vextracti128 $0x1,%ymm3,%xmm4
+ .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
+ .byte 196,98,125,48,208 // vpmovzxbw %xmm0,%ymm10
.byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0
.byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0
- .byte 197,253,213,196 // vpmullw %ymm4,%ymm0,%ymm0
- .byte 197,213,213,234 // vpmullw %ymm2,%ymm5,%ymm5
- .byte 197,213,253,210 // vpaddw %ymm2,%ymm5,%ymm2
- .byte 197,253,253,196 // vpaddw %ymm4,%ymm0,%ymm0
+ .byte 196,98,125,48,217 // vpmovzxbw %xmm1,%ymm11
+ .byte 196,227,125,57,201,1 // vextracti128 $0x1,%ymm1,%xmm1
+ .byte 196,226,125,48,201 // vpmovzxbw %xmm1,%ymm1
+ .byte 197,221,213,201 // vpmullw %ymm1,%ymm4,%ymm1
+ .byte 196,65,61,213,219 // vpmullw %ymm11,%ymm8,%ymm11
+ .byte 197,197,213,192 // vpmullw %ymm0,%ymm7,%ymm0
+ .byte 196,65,77,213,210 // vpmullw %ymm10,%ymm6,%ymm10
+ .byte 197,173,253,246 // vpaddw %ymm6,%ymm10,%ymm6
+ .byte 197,253,253,199 // vpaddw %ymm7,%ymm0,%ymm0
+ .byte 196,193,37,253,248 // vpaddw %ymm8,%ymm11,%ymm7
+ .byte 197,245,253,204 // vpaddw %ymm4,%ymm1,%ymm1
+ .byte 197,245,113,209,8 // vpsrlw $0x8,%ymm1,%ymm1
+ .byte 197,221,113,215,8 // vpsrlw $0x8,%ymm7,%ymm4
.byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0
- .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2
- .byte 196,227,109,56,224,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm4
- .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- .byte 197,221,103,192 // vpackuswb %ymm0,%ymm4,%ymm0
- .byte 197,253,252,195 // vpaddb %ymm3,%ymm0,%ymm0
+ .byte 197,205,113,214,8 // vpsrlw $0x8,%ymm6,%ymm6
+ .byte 196,227,77,56,248,1 // vinserti128 $0x1,%xmm0,%ymm6,%ymm7
+ .byte 196,227,77,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm6,%ymm0
+ .byte 197,197,103,192 // vpackuswb %ymm0,%ymm7,%ymm0
+ .byte 196,227,93,56,241,1 // vinserti128 $0x1,%xmm1,%ymm4,%ymm6
+ .byte 196,227,93,70,201,49 // vperm2i128 $0x31,%ymm1,%ymm4,%ymm1
+ .byte 197,205,103,201 // vpackuswb %ymm1,%ymm6,%ymm1
+ .byte 196,193,125,252,193 // vpaddb %ymm9,%ymm0,%ymm0
+ .byte 197,245,252,205 // vpaddb %ymm5,%ymm1,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -57120,42 +58091,76 @@ HIDDEN _sk_dstatop_hsw_8bit
.globl _sk_dstatop_hsw_8bit
FUNCTION(_sk_dstatop_hsw_8bit)
_sk_dstatop_hsw_8bit:
- .byte 197,253,111,21,17,7,0,0 // vmovdqa 0x711(%rip),%ymm2 # 14e0 <_sk_xor__hsw_8bit+0x2a9>
- .byte 196,226,125,0,218 // vpshufb %ymm2,%ymm0,%ymm3
- .byte 196,226,125,48,225 // vpmovzxbw %xmm1,%ymm4
- .byte 196,227,125,57,205,1 // vextracti128 $0x1,%ymm1,%xmm5
+ .byte 197,125,111,5,42,12,0,0 // vmovdqa 0xc2a(%rip),%ymm8 # 29e0 <_sk_xor__hsw_8bit+0x355>
+ .byte 196,194,117,0,224 // vpshufb %ymm8,%ymm1,%ymm4
+ .byte 196,194,125,0,232 // vpshufb %ymm8,%ymm0,%ymm5
+ .byte 196,98,125,48,210 // vpmovzxbw %xmm2,%ymm10
+ .byte 196,227,125,57,214,1 // vextracti128 $0x1,%ymm2,%xmm6
+ .byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6
+ .byte 196,98,125,48,203 // vpmovzxbw %xmm3,%ymm9
+ .byte 196,227,125,57,223,1 // vextracti128 $0x1,%ymm3,%xmm7
+ .byte 196,226,125,48,255 // vpmovzxbw %xmm7,%ymm7
+ .byte 196,98,125,48,221 // vpmovzxbw %xmm5,%ymm11
+ .byte 196,227,125,57,237,1 // vextracti128 $0x1,%ymm5,%xmm5
.byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5
- .byte 196,226,125,48,243 // vpmovzxbw %xmm3,%ymm6
- .byte 196,227,125,57,219,1 // vextracti128 $0x1,%ymm3,%xmm3
- .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3
- .byte 197,229,213,221 // vpmullw %ymm5,%ymm3,%ymm3
- .byte 197,205,213,244 // vpmullw %ymm4,%ymm6,%ymm6
- .byte 197,205,253,228 // vpaddw %ymm4,%ymm6,%ymm4
- .byte 197,229,253,221 // vpaddw %ymm5,%ymm3,%ymm3
- .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3
- .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4
- .byte 196,227,93,56,235,1 // vinserti128 $0x1,%xmm3,%ymm4,%ymm5
- .byte 196,227,93,70,219,49 // vperm2i128 $0x31,%ymm3,%ymm4,%ymm3
- .byte 197,213,103,219 // vpackuswb %ymm3,%ymm5,%ymm3
- .byte 196,226,117,0,210 // vpshufb %ymm2,%ymm1,%ymm2
- .byte 197,221,118,228 // vpcmpeqd %ymm4,%ymm4,%ymm4
- .byte 197,237,239,212 // vpxor %ymm4,%ymm2,%ymm2
- .byte 196,226,125,48,224 // vpmovzxbw %xmm0,%ymm4
+ .byte 196,98,125,48,228 // vpmovzxbw %xmm4,%ymm12
+ .byte 196,227,125,57,228,1 // vextracti128 $0x1,%ymm4,%xmm4
+ .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
+ .byte 197,197,213,228 // vpmullw %ymm4,%ymm7,%ymm4
+ .byte 196,65,53,213,228 // vpmullw %ymm12,%ymm9,%ymm12
+ .byte 197,205,213,237 // vpmullw %ymm5,%ymm6,%ymm5
+ .byte 196,65,45,213,219 // vpmullw %ymm11,%ymm10,%ymm11
+ .byte 196,65,37,253,210 // vpaddw %ymm10,%ymm11,%ymm10
+ .byte 197,213,253,238 // vpaddw %ymm6,%ymm5,%ymm5
+ .byte 196,193,29,253,241 // vpaddw %ymm9,%ymm12,%ymm6
+ .byte 197,221,253,231 // vpaddw %ymm7,%ymm4,%ymm4
+ .byte 197,197,113,212,8 // vpsrlw $0x8,%ymm4,%ymm7
+ .byte 197,205,113,214,8 // vpsrlw $0x8,%ymm6,%ymm6
+ .byte 197,221,113,213,8 // vpsrlw $0x8,%ymm5,%ymm4
+ .byte 196,193,85,113,210,8 // vpsrlw $0x8,%ymm10,%ymm5
+ .byte 196,99,85,56,204,1 // vinserti128 $0x1,%xmm4,%ymm5,%ymm9
+ .byte 196,227,85,70,228,49 // vperm2i128 $0x31,%ymm4,%ymm5,%ymm4
+ .byte 197,181,103,228 // vpackuswb %ymm4,%ymm9,%ymm4
+ .byte 196,227,77,56,239,1 // vinserti128 $0x1,%xmm7,%ymm6,%ymm5
+ .byte 196,227,77,70,247,49 // vperm2i128 $0x31,%ymm7,%ymm6,%ymm6
+ .byte 197,213,103,238 // vpackuswb %ymm6,%ymm5,%ymm5
+ .byte 196,194,109,0,240 // vpshufb %ymm8,%ymm2,%ymm6
+ .byte 196,194,101,0,248 // vpshufb %ymm8,%ymm3,%ymm7
+ .byte 196,65,61,118,192 // vpcmpeqd %ymm8,%ymm8,%ymm8
+ .byte 196,193,69,239,248 // vpxor %ymm8,%ymm7,%ymm7
+ .byte 196,193,77,239,240 // vpxor %ymm8,%ymm6,%ymm6
+ .byte 196,98,125,48,192 // vpmovzxbw %xmm0,%ymm8
.byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0
.byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0
- .byte 196,226,125,48,234 // vpmovzxbw %xmm2,%ymm5
- .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2
- .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2
- .byte 197,237,213,208 // vpmullw %ymm0,%ymm2,%ymm2
- .byte 197,213,213,236 // vpmullw %ymm4,%ymm5,%ymm5
- .byte 197,213,253,228 // vpaddw %ymm4,%ymm5,%ymm4
- .byte 197,237,253,192 // vpaddw %ymm0,%ymm2,%ymm0
+ .byte 196,98,125,48,201 // vpmovzxbw %xmm1,%ymm9
+ .byte 196,227,125,57,201,1 // vextracti128 $0x1,%ymm1,%xmm1
+ .byte 196,226,125,48,201 // vpmovzxbw %xmm1,%ymm1
+ .byte 196,98,125,48,214 // vpmovzxbw %xmm6,%ymm10
+ .byte 196,227,125,57,246,1 // vextracti128 $0x1,%ymm6,%xmm6
+ .byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6
+ .byte 196,98,125,48,223 // vpmovzxbw %xmm7,%ymm11
+ .byte 196,227,125,57,255,1 // vextracti128 $0x1,%ymm7,%xmm7
+ .byte 196,226,125,48,255 // vpmovzxbw %xmm7,%ymm7
+ .byte 197,197,213,249 // vpmullw %ymm1,%ymm7,%ymm7
+ .byte 196,65,37,213,217 // vpmullw %ymm9,%ymm11,%ymm11
+ .byte 197,205,213,240 // vpmullw %ymm0,%ymm6,%ymm6
+ .byte 196,65,45,213,208 // vpmullw %ymm8,%ymm10,%ymm10
+ .byte 196,65,45,253,192 // vpaddw %ymm8,%ymm10,%ymm8
+ .byte 197,205,253,192 // vpaddw %ymm0,%ymm6,%ymm0
+ .byte 196,193,37,253,241 // vpaddw %ymm9,%ymm11,%ymm6
+ .byte 197,197,253,201 // vpaddw %ymm1,%ymm7,%ymm1
+ .byte 197,245,113,209,8 // vpsrlw $0x8,%ymm1,%ymm1
+ .byte 197,205,113,214,8 // vpsrlw $0x8,%ymm6,%ymm6
.byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0
- .byte 197,237,113,212,8 // vpsrlw $0x8,%ymm4,%ymm2
- .byte 196,227,109,56,224,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm4
- .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- .byte 197,221,103,192 // vpackuswb %ymm0,%ymm4,%ymm0
- .byte 197,253,252,195 // vpaddb %ymm3,%ymm0,%ymm0
+ .byte 196,193,69,113,208,8 // vpsrlw $0x8,%ymm8,%ymm7
+ .byte 196,99,69,56,192,1 // vinserti128 $0x1,%xmm0,%ymm7,%ymm8
+ .byte 196,227,69,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm7,%ymm0
+ .byte 197,189,103,192 // vpackuswb %ymm0,%ymm8,%ymm0
+ .byte 196,227,77,56,249,1 // vinserti128 $0x1,%xmm1,%ymm6,%ymm7
+ .byte 196,227,77,70,201,49 // vperm2i128 $0x31,%ymm1,%ymm6,%ymm1
+ .byte 197,197,103,201 // vpackuswb %ymm1,%ymm7,%ymm1
+ .byte 197,253,252,196 // vpaddb %ymm4,%ymm0,%ymm0
+ .byte 197,245,252,205 // vpaddb %ymm5,%ymm1,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -57163,22 +58168,39 @@ HIDDEN _sk_srcin_hsw_8bit
.globl _sk_srcin_hsw_8bit
FUNCTION(_sk_srcin_hsw_8bit)
_sk_srcin_hsw_8bit:
- .byte 196,226,117,0,21,122,6,0,0 // vpshufb 0x67a(%rip),%ymm1,%ymm2 # 1500 <_sk_xor__hsw_8bit+0x2c9>
- .byte 196,226,125,48,216 // vpmovzxbw %xmm0,%ymm3
+ .byte 197,253,111,37,225,10,0,0 // vmovdqa 0xae1(%rip),%ymm4 # 2a00 <_sk_xor__hsw_8bit+0x375>
+ .byte 196,226,101,0,236 // vpshufb %ymm4,%ymm3,%ymm5
+ .byte 196,226,109,0,228 // vpshufb %ymm4,%ymm2,%ymm4
+ .byte 196,226,125,48,240 // vpmovzxbw %xmm0,%ymm6
.byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0
.byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0
- .byte 196,226,125,48,226 // vpmovzxbw %xmm2,%ymm4
- .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2
- .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2
- .byte 197,237,213,208 // vpmullw %ymm0,%ymm2,%ymm2
- .byte 197,221,213,227 // vpmullw %ymm3,%ymm4,%ymm4
- .byte 197,221,253,219 // vpaddw %ymm3,%ymm4,%ymm3
- .byte 197,237,253,192 // vpaddw %ymm0,%ymm2,%ymm0
+ .byte 196,226,125,48,249 // vpmovzxbw %xmm1,%ymm7
+ .byte 196,227,125,57,201,1 // vextracti128 $0x1,%ymm1,%xmm1
+ .byte 196,226,125,48,201 // vpmovzxbw %xmm1,%ymm1
+ .byte 196,98,125,48,196 // vpmovzxbw %xmm4,%ymm8
+ .byte 196,227,125,57,228,1 // vextracti128 $0x1,%ymm4,%xmm4
+ .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
+ .byte 196,98,125,48,205 // vpmovzxbw %xmm5,%ymm9
+ .byte 196,227,125,57,237,1 // vextracti128 $0x1,%ymm5,%xmm5
+ .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5
+ .byte 197,213,213,233 // vpmullw %ymm1,%ymm5,%ymm5
+ .byte 197,53,213,207 // vpmullw %ymm7,%ymm9,%ymm9
+ .byte 197,221,213,224 // vpmullw %ymm0,%ymm4,%ymm4
+ .byte 197,61,213,198 // vpmullw %ymm6,%ymm8,%ymm8
+ .byte 197,189,253,246 // vpaddw %ymm6,%ymm8,%ymm6
+ .byte 197,221,253,192 // vpaddw %ymm0,%ymm4,%ymm0
+ .byte 197,181,253,231 // vpaddw %ymm7,%ymm9,%ymm4
+ .byte 197,213,253,201 // vpaddw %ymm1,%ymm5,%ymm1
+ .byte 197,245,113,209,8 // vpsrlw $0x8,%ymm1,%ymm1
+ .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4
.byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0
- .byte 197,237,113,211,8 // vpsrlw $0x8,%ymm3,%ymm2
- .byte 196,227,109,56,216,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm3
- .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- .byte 197,229,103,192 // vpackuswb %ymm0,%ymm3,%ymm0
+ .byte 197,213,113,214,8 // vpsrlw $0x8,%ymm6,%ymm5
+ .byte 196,227,85,56,240,1 // vinserti128 $0x1,%xmm0,%ymm5,%ymm6
+ .byte 196,227,85,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm5,%ymm0
+ .byte 197,205,103,192 // vpackuswb %ymm0,%ymm6,%ymm0
+ .byte 196,227,93,56,233,1 // vinserti128 $0x1,%xmm1,%ymm4,%ymm5
+ .byte 196,227,93,70,201,49 // vperm2i128 $0x31,%ymm1,%ymm4,%ymm1
+ .byte 197,213,103,201 // vpackuswb %ymm1,%ymm5,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -57186,22 +58208,39 @@ HIDDEN _sk_dstin_hsw_8bit
.globl _sk_dstin_hsw_8bit
FUNCTION(_sk_dstin_hsw_8bit)
_sk_dstin_hsw_8bit:
- .byte 196,226,125,0,5,67,6,0,0 // vpshufb 0x643(%rip),%ymm0,%ymm0 # 1520 <_sk_xor__hsw_8bit+0x2e9>
- .byte 196,226,125,48,209 // vpmovzxbw %xmm1,%ymm2
- .byte 196,227,125,57,203,1 // vextracti128 $0x1,%ymm1,%xmm3
- .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3
- .byte 196,226,125,48,224 // vpmovzxbw %xmm0,%ymm4
+ .byte 197,253,111,37,87,10,0,0 // vmovdqa 0xa57(%rip),%ymm4 # 2a20 <_sk_xor__hsw_8bit+0x395>
+ .byte 196,226,117,0,204 // vpshufb %ymm4,%ymm1,%ymm1
+ .byte 196,226,125,0,196 // vpshufb %ymm4,%ymm0,%ymm0
+ .byte 196,226,125,48,226 // vpmovzxbw %xmm2,%ymm4
+ .byte 196,227,125,57,213,1 // vextracti128 $0x1,%ymm2,%xmm5
+ .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5
+ .byte 196,226,125,48,243 // vpmovzxbw %xmm3,%ymm6
+ .byte 196,227,125,57,223,1 // vextracti128 $0x1,%ymm3,%xmm7
+ .byte 196,226,125,48,255 // vpmovzxbw %xmm7,%ymm7
+ .byte 196,98,125,48,192 // vpmovzxbw %xmm0,%ymm8
.byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0
.byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0
- .byte 197,253,213,195 // vpmullw %ymm3,%ymm0,%ymm0
- .byte 197,221,213,226 // vpmullw %ymm2,%ymm4,%ymm4
- .byte 197,221,253,210 // vpaddw %ymm2,%ymm4,%ymm2
- .byte 197,253,253,195 // vpaddw %ymm3,%ymm0,%ymm0
+ .byte 196,98,125,48,201 // vpmovzxbw %xmm1,%ymm9
+ .byte 196,227,125,57,201,1 // vextracti128 $0x1,%ymm1,%xmm1
+ .byte 196,226,125,48,201 // vpmovzxbw %xmm1,%ymm1
+ .byte 197,197,213,201 // vpmullw %ymm1,%ymm7,%ymm1
+ .byte 196,65,77,213,201 // vpmullw %ymm9,%ymm6,%ymm9
+ .byte 197,213,213,192 // vpmullw %ymm0,%ymm5,%ymm0
+ .byte 196,65,93,213,192 // vpmullw %ymm8,%ymm4,%ymm8
+ .byte 197,189,253,228 // vpaddw %ymm4,%ymm8,%ymm4
+ .byte 197,253,253,197 // vpaddw %ymm5,%ymm0,%ymm0
+ .byte 197,181,253,238 // vpaddw %ymm6,%ymm9,%ymm5
+ .byte 197,245,253,207 // vpaddw %ymm7,%ymm1,%ymm1
+ .byte 197,245,113,209,8 // vpsrlw $0x8,%ymm1,%ymm1
+ .byte 197,213,113,213,8 // vpsrlw $0x8,%ymm5,%ymm5
.byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0
- .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2
- .byte 196,227,109,56,216,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm3
- .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- .byte 197,229,103,192 // vpackuswb %ymm0,%ymm3,%ymm0
+ .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4
+ .byte 196,227,93,56,240,1 // vinserti128 $0x1,%xmm0,%ymm4,%ymm6
+ .byte 196,227,93,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm4,%ymm0
+ .byte 197,205,103,192 // vpackuswb %ymm0,%ymm6,%ymm0
+ .byte 196,227,85,56,225,1 // vinserti128 $0x1,%xmm1,%ymm5,%ymm4
+ .byte 196,227,85,70,201,49 // vperm2i128 $0x31,%ymm1,%ymm5,%ymm1
+ .byte 197,221,103,201 // vpackuswb %ymm1,%ymm4,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -57209,24 +58248,42 @@ HIDDEN _sk_srcout_hsw_8bit
.globl _sk_srcout_hsw_8bit
FUNCTION(_sk_srcout_hsw_8bit)
_sk_srcout_hsw_8bit:
- .byte 196,226,117,0,21,12,6,0,0 // vpshufb 0x60c(%rip),%ymm1,%ymm2 # 1540 <_sk_xor__hsw_8bit+0x309>
- .byte 197,229,118,219 // vpcmpeqd %ymm3,%ymm3,%ymm3
- .byte 197,237,239,211 // vpxor %ymm3,%ymm2,%ymm2
- .byte 196,226,125,48,216 // vpmovzxbw %xmm0,%ymm3
+ .byte 197,253,111,37,203,9,0,0 // vmovdqa 0x9cb(%rip),%ymm4 # 2a40 <_sk_xor__hsw_8bit+0x3b5>
+ .byte 196,226,109,0,236 // vpshufb %ymm4,%ymm2,%ymm5
+ .byte 196,226,101,0,228 // vpshufb %ymm4,%ymm3,%ymm4
+ .byte 197,205,118,246 // vpcmpeqd %ymm6,%ymm6,%ymm6
+ .byte 197,221,239,230 // vpxor %ymm6,%ymm4,%ymm4
+ .byte 197,213,239,238 // vpxor %ymm6,%ymm5,%ymm5
+ .byte 196,226,125,48,240 // vpmovzxbw %xmm0,%ymm6
.byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0
.byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0
- .byte 196,226,125,48,226 // vpmovzxbw %xmm2,%ymm4
- .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2
- .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2
- .byte 197,237,213,208 // vpmullw %ymm0,%ymm2,%ymm2
- .byte 197,221,213,227 // vpmullw %ymm3,%ymm4,%ymm4
- .byte 197,221,253,219 // vpaddw %ymm3,%ymm4,%ymm3
- .byte 197,237,253,192 // vpaddw %ymm0,%ymm2,%ymm0
+ .byte 196,226,125,48,249 // vpmovzxbw %xmm1,%ymm7
+ .byte 196,227,125,57,201,1 // vextracti128 $0x1,%ymm1,%xmm1
+ .byte 196,226,125,48,201 // vpmovzxbw %xmm1,%ymm1
+ .byte 196,98,125,48,197 // vpmovzxbw %xmm5,%ymm8
+ .byte 196,227,125,57,237,1 // vextracti128 $0x1,%ymm5,%xmm5
+ .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5
+ .byte 196,98,125,48,204 // vpmovzxbw %xmm4,%ymm9
+ .byte 196,227,125,57,228,1 // vextracti128 $0x1,%ymm4,%xmm4
+ .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
+ .byte 197,221,213,225 // vpmullw %ymm1,%ymm4,%ymm4
+ .byte 197,53,213,207 // vpmullw %ymm7,%ymm9,%ymm9
+ .byte 197,213,213,232 // vpmullw %ymm0,%ymm5,%ymm5
+ .byte 197,61,213,198 // vpmullw %ymm6,%ymm8,%ymm8
+ .byte 197,189,253,246 // vpaddw %ymm6,%ymm8,%ymm6
+ .byte 197,213,253,192 // vpaddw %ymm0,%ymm5,%ymm0
+ .byte 197,181,253,239 // vpaddw %ymm7,%ymm9,%ymm5
+ .byte 197,221,253,201 // vpaddw %ymm1,%ymm4,%ymm1
+ .byte 197,245,113,209,8 // vpsrlw $0x8,%ymm1,%ymm1
+ .byte 197,221,113,213,8 // vpsrlw $0x8,%ymm5,%ymm4
.byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0
- .byte 197,237,113,211,8 // vpsrlw $0x8,%ymm3,%ymm2
- .byte 196,227,109,56,216,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm3
- .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- .byte 197,229,103,192 // vpackuswb %ymm0,%ymm3,%ymm0
+ .byte 197,213,113,214,8 // vpsrlw $0x8,%ymm6,%ymm5
+ .byte 196,227,85,56,240,1 // vinserti128 $0x1,%xmm0,%ymm5,%ymm6
+ .byte 196,227,85,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm5,%ymm0
+ .byte 197,205,103,192 // vpackuswb %ymm0,%ymm6,%ymm0
+ .byte 196,227,93,56,233,1 // vinserti128 $0x1,%xmm1,%ymm4,%ymm5
+ .byte 196,227,93,70,201,49 // vperm2i128 $0x31,%ymm1,%ymm4,%ymm1
+ .byte 197,213,103,201 // vpackuswb %ymm1,%ymm5,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -57234,24 +58291,42 @@ HIDDEN _sk_dstout_hsw_8bit
.globl _sk_dstout_hsw_8bit
FUNCTION(_sk_dstout_hsw_8bit)
_sk_dstout_hsw_8bit:
- .byte 196,226,125,0,5,205,5,0,0 // vpshufb 0x5cd(%rip),%ymm0,%ymm0 # 1560 <_sk_xor__hsw_8bit+0x329>
- .byte 197,237,118,210 // vpcmpeqd %ymm2,%ymm2,%ymm2
- .byte 197,253,239,194 // vpxor %ymm2,%ymm0,%ymm0
- .byte 196,226,125,48,209 // vpmovzxbw %xmm1,%ymm2
- .byte 196,227,125,57,203,1 // vextracti128 $0x1,%ymm1,%xmm3
- .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3
- .byte 196,226,125,48,224 // vpmovzxbw %xmm0,%ymm4
+ .byte 197,253,111,37,53,9,0,0 // vmovdqa 0x935(%rip),%ymm4 # 2a60 <_sk_xor__hsw_8bit+0x3d5>
+ .byte 196,226,125,0,196 // vpshufb %ymm4,%ymm0,%ymm0
+ .byte 196,226,117,0,204 // vpshufb %ymm4,%ymm1,%ymm1
+ .byte 197,221,118,228 // vpcmpeqd %ymm4,%ymm4,%ymm4
+ .byte 197,245,239,204 // vpxor %ymm4,%ymm1,%ymm1
+ .byte 197,253,239,196 // vpxor %ymm4,%ymm0,%ymm0
+ .byte 196,226,125,48,226 // vpmovzxbw %xmm2,%ymm4
+ .byte 196,227,125,57,213,1 // vextracti128 $0x1,%ymm2,%xmm5
+ .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5
+ .byte 196,226,125,48,243 // vpmovzxbw %xmm3,%ymm6
+ .byte 196,227,125,57,223,1 // vextracti128 $0x1,%ymm3,%xmm7
+ .byte 196,226,125,48,255 // vpmovzxbw %xmm7,%ymm7
+ .byte 196,98,125,48,192 // vpmovzxbw %xmm0,%ymm8
.byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0
.byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0
- .byte 197,253,213,195 // vpmullw %ymm3,%ymm0,%ymm0
- .byte 197,221,213,226 // vpmullw %ymm2,%ymm4,%ymm4
- .byte 197,221,253,210 // vpaddw %ymm2,%ymm4,%ymm2
- .byte 197,253,253,195 // vpaddw %ymm3,%ymm0,%ymm0
+ .byte 196,98,125,48,201 // vpmovzxbw %xmm1,%ymm9
+ .byte 196,227,125,57,201,1 // vextracti128 $0x1,%ymm1,%xmm1
+ .byte 196,226,125,48,201 // vpmovzxbw %xmm1,%ymm1
+ .byte 197,197,213,201 // vpmullw %ymm1,%ymm7,%ymm1
+ .byte 196,65,77,213,201 // vpmullw %ymm9,%ymm6,%ymm9
+ .byte 197,213,213,192 // vpmullw %ymm0,%ymm5,%ymm0
+ .byte 196,65,93,213,192 // vpmullw %ymm8,%ymm4,%ymm8
+ .byte 197,189,253,228 // vpaddw %ymm4,%ymm8,%ymm4
+ .byte 197,253,253,197 // vpaddw %ymm5,%ymm0,%ymm0
+ .byte 197,181,253,238 // vpaddw %ymm6,%ymm9,%ymm5
+ .byte 197,245,253,207 // vpaddw %ymm7,%ymm1,%ymm1
+ .byte 197,245,113,209,8 // vpsrlw $0x8,%ymm1,%ymm1
+ .byte 197,213,113,213,8 // vpsrlw $0x8,%ymm5,%ymm5
.byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0
- .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2
- .byte 196,227,109,56,216,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm3
- .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- .byte 197,229,103,192 // vpackuswb %ymm0,%ymm3,%ymm0
+ .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4
+ .byte 196,227,93,56,240,1 // vinserti128 $0x1,%xmm0,%ymm4,%ymm6
+ .byte 196,227,93,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm4,%ymm0
+ .byte 197,205,103,192 // vpackuswb %ymm0,%ymm6,%ymm0
+ .byte 196,227,85,56,225,1 // vinserti128 $0x1,%xmm1,%ymm5,%ymm4
+ .byte 196,227,85,70,201,49 // vperm2i128 $0x31,%ymm1,%ymm5,%ymm1
+ .byte 197,221,103,201 // vpackuswb %ymm1,%ymm4,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -57259,24 +58334,43 @@ HIDDEN _sk_srcover_hsw_8bit
.globl _sk_srcover_hsw_8bit
FUNCTION(_sk_srcover_hsw_8bit)
_sk_srcover_hsw_8bit:
- .byte 196,226,125,0,21,142,5,0,0 // vpshufb 0x58e(%rip),%ymm0,%ymm2 # 1580 <_sk_xor__hsw_8bit+0x349>
- .byte 196,226,125,48,217 // vpmovzxbw %xmm1,%ymm3
- .byte 196,227,125,57,204,1 // vextracti128 $0x1,%ymm1,%xmm4
+ .byte 197,253,111,37,157,8,0,0 // vmovdqa 0x89d(%rip),%ymm4 # 2a80 <_sk_xor__hsw_8bit+0x3f5>
+ .byte 196,226,117,0,236 // vpshufb %ymm4,%ymm1,%ymm5
+ .byte 196,226,125,0,228 // vpshufb %ymm4,%ymm0,%ymm4
+ .byte 196,98,125,48,202 // vpmovzxbw %xmm2,%ymm9
+ .byte 196,227,125,57,215,1 // vextracti128 $0x1,%ymm2,%xmm7
+ .byte 196,226,125,48,255 // vpmovzxbw %xmm7,%ymm7
+ .byte 196,98,125,48,195 // vpmovzxbw %xmm3,%ymm8
+ .byte 196,227,125,57,222,1 // vextracti128 $0x1,%ymm3,%xmm6
+ .byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6
+ .byte 196,98,125,48,212 // vpmovzxbw %xmm4,%ymm10
+ .byte 196,227,125,57,228,1 // vextracti128 $0x1,%ymm4,%xmm4
.byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
- .byte 196,226,125,48,234 // vpmovzxbw %xmm2,%ymm5
- .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2
- .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2
- .byte 197,237,213,212 // vpmullw %ymm4,%ymm2,%ymm2
- .byte 197,213,213,235 // vpmullw %ymm3,%ymm5,%ymm5
- .byte 197,213,253,219 // vpaddw %ymm3,%ymm5,%ymm3
- .byte 197,237,253,212 // vpaddw %ymm4,%ymm2,%ymm2
- .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2
- .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3
- .byte 196,227,101,56,226,1 // vinserti128 $0x1,%xmm2,%ymm3,%ymm4
- .byte 196,227,101,70,210,49 // vperm2i128 $0x31,%ymm2,%ymm3,%ymm2
- .byte 197,221,103,210 // vpackuswb %ymm2,%ymm4,%ymm2
- .byte 197,245,252,192 // vpaddb %ymm0,%ymm1,%ymm0
- .byte 197,253,248,194 // vpsubb %ymm2,%ymm0,%ymm0
+ .byte 196,98,125,48,221 // vpmovzxbw %xmm5,%ymm11
+ .byte 196,227,125,57,237,1 // vextracti128 $0x1,%ymm5,%xmm5
+ .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5
+ .byte 197,205,213,237 // vpmullw %ymm5,%ymm6,%ymm5
+ .byte 196,65,61,213,219 // vpmullw %ymm11,%ymm8,%ymm11
+ .byte 197,197,213,228 // vpmullw %ymm4,%ymm7,%ymm4
+ .byte 196,65,53,213,210 // vpmullw %ymm10,%ymm9,%ymm10
+ .byte 196,65,45,253,201 // vpaddw %ymm9,%ymm10,%ymm9
+ .byte 197,221,253,231 // vpaddw %ymm7,%ymm4,%ymm4
+ .byte 196,193,37,253,248 // vpaddw %ymm8,%ymm11,%ymm7
+ .byte 197,213,253,238 // vpaddw %ymm6,%ymm5,%ymm5
+ .byte 197,213,113,213,8 // vpsrlw $0x8,%ymm5,%ymm5
+ .byte 197,205,113,215,8 // vpsrlw $0x8,%ymm7,%ymm6
+ .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4
+ .byte 196,193,69,113,209,8 // vpsrlw $0x8,%ymm9,%ymm7
+ .byte 196,99,69,56,196,1 // vinserti128 $0x1,%xmm4,%ymm7,%ymm8
+ .byte 196,227,69,70,228,49 // vperm2i128 $0x31,%ymm4,%ymm7,%ymm4
+ .byte 197,189,103,228 // vpackuswb %ymm4,%ymm8,%ymm4
+ .byte 196,227,77,56,253,1 // vinserti128 $0x1,%xmm5,%ymm6,%ymm7
+ .byte 196,227,77,70,237,49 // vperm2i128 $0x31,%ymm5,%ymm6,%ymm5
+ .byte 197,197,103,237 // vpackuswb %ymm5,%ymm7,%ymm5
+ .byte 197,229,252,201 // vpaddb %ymm1,%ymm3,%ymm1
+ .byte 197,237,252,192 // vpaddb %ymm0,%ymm2,%ymm0
+ .byte 197,253,248,196 // vpsubb %ymm4,%ymm0,%ymm0
+ .byte 197,245,248,205 // vpsubb %ymm5,%ymm1,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -57284,24 +58378,43 @@ HIDDEN _sk_dstover_hsw_8bit
.globl _sk_dstover_hsw_8bit
FUNCTION(_sk_dstover_hsw_8bit)
_sk_dstover_hsw_8bit:
- .byte 196,226,117,0,21,79,5,0,0 // vpshufb 0x54f(%rip),%ymm1,%ymm2 # 15a0 <_sk_xor__hsw_8bit+0x369>
- .byte 196,226,125,48,216 // vpmovzxbw %xmm0,%ymm3
- .byte 196,227,125,57,196,1 // vextracti128 $0x1,%ymm0,%xmm4
+ .byte 197,253,111,37,254,7,0,0 // vmovdqa 0x7fe(%rip),%ymm4 # 2aa0 <_sk_xor__hsw_8bit+0x415>
+ .byte 196,226,101,0,236 // vpshufb %ymm4,%ymm3,%ymm5
+ .byte 196,226,109,0,228 // vpshufb %ymm4,%ymm2,%ymm4
+ .byte 196,98,125,48,200 // vpmovzxbw %xmm0,%ymm9
+ .byte 196,227,125,57,199,1 // vextracti128 $0x1,%ymm0,%xmm7
+ .byte 196,226,125,48,255 // vpmovzxbw %xmm7,%ymm7
+ .byte 196,98,125,48,193 // vpmovzxbw %xmm1,%ymm8
+ .byte 196,227,125,57,206,1 // vextracti128 $0x1,%ymm1,%xmm6
+ .byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6
+ .byte 196,98,125,48,212 // vpmovzxbw %xmm4,%ymm10
+ .byte 196,227,125,57,228,1 // vextracti128 $0x1,%ymm4,%xmm4
.byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
- .byte 196,226,125,48,234 // vpmovzxbw %xmm2,%ymm5
- .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2
- .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2
- .byte 197,237,213,212 // vpmullw %ymm4,%ymm2,%ymm2
- .byte 197,213,213,235 // vpmullw %ymm3,%ymm5,%ymm5
- .byte 197,213,253,219 // vpaddw %ymm3,%ymm5,%ymm3
- .byte 197,237,253,212 // vpaddw %ymm4,%ymm2,%ymm2
- .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2
- .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3
- .byte 196,227,101,56,226,1 // vinserti128 $0x1,%xmm2,%ymm3,%ymm4
- .byte 196,227,101,70,210,49 // vperm2i128 $0x31,%ymm2,%ymm3,%ymm2
- .byte 197,221,103,210 // vpackuswb %ymm2,%ymm4,%ymm2
- .byte 197,245,252,192 // vpaddb %ymm0,%ymm1,%ymm0
- .byte 197,253,248,194 // vpsubb %ymm2,%ymm0,%ymm0
+ .byte 196,98,125,48,221 // vpmovzxbw %xmm5,%ymm11
+ .byte 196,227,125,57,237,1 // vextracti128 $0x1,%ymm5,%xmm5
+ .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5
+ .byte 197,213,213,238 // vpmullw %ymm6,%ymm5,%ymm5
+ .byte 196,65,37,213,216 // vpmullw %ymm8,%ymm11,%ymm11
+ .byte 197,221,213,231 // vpmullw %ymm7,%ymm4,%ymm4
+ .byte 196,65,45,213,209 // vpmullw %ymm9,%ymm10,%ymm10
+ .byte 196,65,45,253,201 // vpaddw %ymm9,%ymm10,%ymm9
+ .byte 197,221,253,231 // vpaddw %ymm7,%ymm4,%ymm4
+ .byte 196,193,37,253,248 // vpaddw %ymm8,%ymm11,%ymm7
+ .byte 197,213,253,238 // vpaddw %ymm6,%ymm5,%ymm5
+ .byte 197,213,113,213,8 // vpsrlw $0x8,%ymm5,%ymm5
+ .byte 197,205,113,215,8 // vpsrlw $0x8,%ymm7,%ymm6
+ .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4
+ .byte 196,193,69,113,209,8 // vpsrlw $0x8,%ymm9,%ymm7
+ .byte 196,99,69,56,196,1 // vinserti128 $0x1,%xmm4,%ymm7,%ymm8
+ .byte 196,227,69,70,228,49 // vperm2i128 $0x31,%ymm4,%ymm7,%ymm4
+ .byte 197,189,103,228 // vpackuswb %ymm4,%ymm8,%ymm4
+ .byte 196,227,77,56,253,1 // vinserti128 $0x1,%xmm5,%ymm6,%ymm7
+ .byte 196,227,77,70,237,49 // vperm2i128 $0x31,%ymm5,%ymm6,%ymm5
+ .byte 197,197,103,237 // vpackuswb %ymm5,%ymm7,%ymm5
+ .byte 197,229,252,201 // vpaddb %ymm1,%ymm3,%ymm1
+ .byte 197,237,252,192 // vpaddb %ymm0,%ymm2,%ymm0
+ .byte 197,253,248,196 // vpsubb %ymm4,%ymm0,%ymm0
+ .byte 197,245,248,205 // vpsubb %ymm5,%ymm1,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -57309,21 +58422,36 @@ HIDDEN _sk_modulate_hsw_8bit
.globl _sk_modulate_hsw_8bit
FUNCTION(_sk_modulate_hsw_8bit)
_sk_modulate_hsw_8bit:
- .byte 196,226,125,48,208 // vpmovzxbw %xmm0,%ymm2
+ .byte 196,226,125,48,224 // vpmovzxbw %xmm0,%ymm4
.byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0
.byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0
- .byte 196,226,125,48,217 // vpmovzxbw %xmm1,%ymm3
- .byte 196,227,125,57,204,1 // vextracti128 $0x1,%ymm1,%xmm4
- .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
- .byte 197,221,213,224 // vpmullw %ymm0,%ymm4,%ymm4
- .byte 197,229,213,218 // vpmullw %ymm2,%ymm3,%ymm3
- .byte 197,229,253,210 // vpaddw %ymm2,%ymm3,%ymm2
- .byte 197,221,253,192 // vpaddw %ymm0,%ymm4,%ymm0
+ .byte 196,226,125,48,233 // vpmovzxbw %xmm1,%ymm5
+ .byte 196,227,125,57,201,1 // vextracti128 $0x1,%ymm1,%xmm1
+ .byte 196,226,125,48,201 // vpmovzxbw %xmm1,%ymm1
+ .byte 196,98,125,48,202 // vpmovzxbw %xmm2,%ymm9
+ .byte 196,227,125,57,215,1 // vextracti128 $0x1,%ymm2,%xmm7
+ .byte 196,226,125,48,255 // vpmovzxbw %xmm7,%ymm7
+ .byte 196,98,125,48,195 // vpmovzxbw %xmm3,%ymm8
+ .byte 196,227,125,57,222,1 // vextracti128 $0x1,%ymm3,%xmm6
+ .byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6
+ .byte 197,205,213,241 // vpmullw %ymm1,%ymm6,%ymm6
+ .byte 197,61,213,197 // vpmullw %ymm5,%ymm8,%ymm8
+ .byte 197,197,213,248 // vpmullw %ymm0,%ymm7,%ymm7
+ .byte 197,53,213,204 // vpmullw %ymm4,%ymm9,%ymm9
+ .byte 197,181,253,228 // vpaddw %ymm4,%ymm9,%ymm4
+ .byte 197,197,253,192 // vpaddw %ymm0,%ymm7,%ymm0
+ .byte 197,189,253,237 // vpaddw %ymm5,%ymm8,%ymm5
+ .byte 197,205,253,201 // vpaddw %ymm1,%ymm6,%ymm1
+ .byte 197,245,113,209,8 // vpsrlw $0x8,%ymm1,%ymm1
+ .byte 197,213,113,213,8 // vpsrlw $0x8,%ymm5,%ymm5
.byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0
- .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2
- .byte 196,227,109,56,216,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm3
- .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- .byte 197,229,103,192 // vpackuswb %ymm0,%ymm3,%ymm0
+ .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4
+ .byte 196,227,93,56,240,1 // vinserti128 $0x1,%xmm0,%ymm4,%ymm6
+ .byte 196,227,93,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm4,%ymm0
+ .byte 197,205,103,192 // vpackuswb %ymm0,%ymm6,%ymm0
+ .byte 196,227,85,56,225,1 // vinserti128 $0x1,%xmm1,%ymm5,%ymm4
+ .byte 196,227,85,70,201,49 // vperm2i128 $0x31,%ymm1,%ymm5,%ymm1
+ .byte 197,221,103,201 // vpackuswb %ymm1,%ymm4,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -57331,78 +58459,145 @@ HIDDEN _sk_multiply_hsw_8bit
.globl _sk_multiply_hsw_8bit
FUNCTION(_sk_multiply_hsw_8bit)
_sk_multiply_hsw_8bit:
- .byte 197,253,111,37,195,4,0,0 // vmovdqa 0x4c3(%rip),%ymm4 # 15c0 <_sk_xor__hsw_8bit+0x389>
- .byte 196,226,117,0,212 // vpshufb %ymm4,%ymm1,%ymm2
- .byte 197,213,118,237 // vpcmpeqd %ymm5,%ymm5,%ymm5
- .byte 197,237,239,245 // vpxor %ymm5,%ymm2,%ymm6
- .byte 196,226,125,48,208 // vpmovzxbw %xmm0,%ymm2
- .byte 196,227,125,57,195,1 // vextracti128 $0x1,%ymm0,%xmm3
- .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3
- .byte 196,226,125,48,254 // vpmovzxbw %xmm6,%ymm7
- .byte 196,227,125,57,246,1 // vextracti128 $0x1,%ymm6,%xmm6
+ .byte 197,253,111,243 // vmovdqa %ymm3,%ymm6
+ .byte 197,253,111,218 // vmovdqa %ymm2,%ymm3
+ .byte 197,125,111,13,191,6,0,0 // vmovdqa 0x6bf(%rip),%ymm9 # 2ac0 <_sk_xor__hsw_8bit+0x435>
+ .byte 196,194,101,0,225 // vpshufb %ymm9,%ymm3,%ymm4
+ .byte 196,194,77,0,233 // vpshufb %ymm9,%ymm6,%ymm5
+ .byte 196,65,45,118,210 // vpcmpeqd %ymm10,%ymm10,%ymm10
+ .byte 196,65,85,239,194 // vpxor %ymm10,%ymm5,%ymm8
+ .byte 196,65,93,239,218 // vpxor %ymm10,%ymm4,%ymm11
+ .byte 196,98,125,48,232 // vpmovzxbw %xmm0,%ymm13
+ .byte 196,227,125,57,197,1 // vextracti128 $0x1,%ymm0,%xmm5
+ .byte 196,98,125,48,245 // vpmovzxbw %xmm5,%ymm14
+ .byte 196,226,125,48,209 // vpmovzxbw %xmm1,%ymm2
+ .byte 196,227,125,57,207,1 // vextracti128 $0x1,%ymm1,%xmm7
+ .byte 196,226,125,48,255 // vpmovzxbw %xmm7,%ymm7
+ .byte 196,66,125,48,227 // vpmovzxbw %xmm11,%ymm12
+ .byte 196,99,125,57,220,1 // vextracti128 $0x1,%ymm11,%xmm4
+ .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
+ .byte 196,66,125,48,216 // vpmovzxbw %xmm8,%ymm11
+ .byte 196,99,125,57,197,1 // vextracti128 $0x1,%ymm8,%xmm5
+ .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5
+ .byte 197,213,213,239 // vpmullw %ymm7,%ymm5,%ymm5
+ .byte 197,37,213,194 // vpmullw %ymm2,%ymm11,%ymm8
+ .byte 196,193,93,213,230 // vpmullw %ymm14,%ymm4,%ymm4
+ .byte 196,65,29,213,221 // vpmullw %ymm13,%ymm12,%ymm11
+ .byte 196,65,37,253,221 // vpaddw %ymm13,%ymm11,%ymm11
+ .byte 196,193,93,253,230 // vpaddw %ymm14,%ymm4,%ymm4
+ .byte 197,61,253,194 // vpaddw %ymm2,%ymm8,%ymm8
+ .byte 197,213,253,239 // vpaddw %ymm7,%ymm5,%ymm5
+ .byte 197,213,113,213,8 // vpsrlw $0x8,%ymm5,%ymm5
+ .byte 196,193,29,113,208,8 // vpsrlw $0x8,%ymm8,%ymm12
+ .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4
+ .byte 196,193,61,113,211,8 // vpsrlw $0x8,%ymm11,%ymm8
+ .byte 196,99,61,56,220,1 // vinserti128 $0x1,%xmm4,%ymm8,%ymm11
+ .byte 196,227,61,70,228,49 // vperm2i128 $0x31,%ymm4,%ymm8,%ymm4
+ .byte 197,37,103,196 // vpackuswb %ymm4,%ymm11,%ymm8
+ .byte 196,227,29,56,229,1 // vinserti128 $0x1,%xmm5,%ymm12,%ymm4
+ .byte 196,227,29,70,237,49 // vperm2i128 $0x31,%ymm5,%ymm12,%ymm5
+ .byte 197,93,103,221 // vpackuswb %ymm5,%ymm4,%ymm11
+ .byte 196,194,125,0,193 // vpshufb %ymm9,%ymm0,%ymm0
+ .byte 196,194,117,0,201 // vpshufb %ymm9,%ymm1,%ymm1
+ .byte 196,65,117,239,226 // vpxor %ymm10,%ymm1,%ymm12
+ .byte 196,193,125,239,226 // vpxor %ymm10,%ymm0,%ymm4
+ .byte 196,226,125,48,195 // vpmovzxbw %xmm3,%ymm0
+ .byte 196,227,125,57,217,1 // vextracti128 $0x1,%ymm3,%xmm1
+ .byte 196,226,125,48,201 // vpmovzxbw %xmm1,%ymm1
+ .byte 197,254,127,116,36,200 // vmovdqu %ymm6,-0x38(%rsp)
+ .byte 196,98,125,48,206 // vpmovzxbw %xmm6,%ymm9
+ .byte 196,227,125,57,245,1 // vextracti128 $0x1,%ymm6,%xmm5
+ .byte 196,98,125,48,213 // vpmovzxbw %xmm5,%ymm10
+ .byte 196,226,125,48,236 // vpmovzxbw %xmm4,%ymm5
+ .byte 196,227,125,57,228,1 // vextracti128 $0x1,%ymm4,%xmm4
+ .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
+ .byte 197,245,213,228 // vpmullw %ymm4,%ymm1,%ymm4
+ .byte 197,253,213,237 // vpmullw %ymm5,%ymm0,%ymm5
+ .byte 197,213,253,232 // vpaddw %ymm0,%ymm5,%ymm5
+ .byte 197,221,253,225 // vpaddw %ymm1,%ymm4,%ymm4
+ .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4
+ .byte 197,213,113,213,8 // vpsrlw $0x8,%ymm5,%ymm5
+ .byte 196,99,85,56,252,1 // vinserti128 $0x1,%xmm4,%ymm5,%ymm15
+ .byte 196,227,85,70,228,49 // vperm2i128 $0x31,%ymm4,%ymm5,%ymm4
+ .byte 196,194,125,48,236 // vpmovzxbw %xmm12,%ymm5
+ .byte 196,99,125,57,230,1 // vextracti128 $0x1,%ymm12,%xmm6
.byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6
- .byte 197,205,213,243 // vpmullw %ymm3,%ymm6,%ymm6
- .byte 197,197,213,250 // vpmullw %ymm2,%ymm7,%ymm7
- .byte 197,197,253,250 // vpaddw %ymm2,%ymm7,%ymm7
- .byte 197,205,253,243 // vpaddw %ymm3,%ymm6,%ymm6
+ .byte 197,173,213,246 // vpmullw %ymm6,%ymm10,%ymm6
+ .byte 197,181,213,237 // vpmullw %ymm5,%ymm9,%ymm5
+ .byte 196,193,85,253,233 // vpaddw %ymm9,%ymm5,%ymm5
+ .byte 196,193,77,253,242 // vpaddw %ymm10,%ymm6,%ymm6
.byte 197,205,113,214,8 // vpsrlw $0x8,%ymm6,%ymm6
- .byte 197,197,113,215,8 // vpsrlw $0x8,%ymm7,%ymm7
- .byte 196,99,69,56,198,1 // vinserti128 $0x1,%xmm6,%ymm7,%ymm8
- .byte 196,227,69,70,246,49 // vperm2i128 $0x31,%ymm6,%ymm7,%ymm6
- .byte 197,189,103,246 // vpackuswb %ymm6,%ymm8,%ymm6
- .byte 196,226,125,0,196 // vpshufb %ymm4,%ymm0,%ymm0
- .byte 197,253,239,197 // vpxor %ymm5,%ymm0,%ymm0
- .byte 196,226,125,48,225 // vpmovzxbw %xmm1,%ymm4
- .byte 196,227,125,57,205,1 // vextracti128 $0x1,%ymm1,%xmm5
- .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5
- .byte 196,226,125,48,248 // vpmovzxbw %xmm0,%ymm7
- .byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0
- .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0
- .byte 197,253,213,197 // vpmullw %ymm5,%ymm0,%ymm0
- .byte 197,197,213,252 // vpmullw %ymm4,%ymm7,%ymm7
- .byte 197,197,253,252 // vpaddw %ymm4,%ymm7,%ymm7
- .byte 197,253,253,197 // vpaddw %ymm5,%ymm0,%ymm0
+ .byte 197,213,113,213,8 // vpsrlw $0x8,%ymm5,%ymm5
+ .byte 197,133,103,228 // vpackuswb %ymm4,%ymm15,%ymm4
+ .byte 196,99,85,56,230,1 // vinserti128 $0x1,%xmm6,%ymm5,%ymm12
+ .byte 196,227,85,70,238,49 // vperm2i128 $0x31,%ymm6,%ymm5,%ymm5
+ .byte 197,157,103,237 // vpackuswb %ymm5,%ymm12,%ymm5
+ .byte 196,193,85,252,235 // vpaddb %ymm11,%ymm5,%ymm5
+ .byte 196,193,93,252,224 // vpaddb %ymm8,%ymm4,%ymm4
+ .byte 196,193,125,213,197 // vpmullw %ymm13,%ymm0,%ymm0
+ .byte 196,193,125,253,197 // vpaddw %ymm13,%ymm0,%ymm0
+ .byte 196,193,117,213,206 // vpmullw %ymm14,%ymm1,%ymm1
+ .byte 196,193,117,253,206 // vpaddw %ymm14,%ymm1,%ymm1
+ .byte 197,181,213,242 // vpmullw %ymm2,%ymm9,%ymm6
+ .byte 197,205,253,210 // vpaddw %ymm2,%ymm6,%ymm2
+ .byte 197,173,213,247 // vpmullw %ymm7,%ymm10,%ymm6
+ .byte 197,205,253,247 // vpaddw %ymm7,%ymm6,%ymm6
+ .byte 197,245,113,209,8 // vpsrlw $0x8,%ymm1,%ymm1
.byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0
- .byte 197,197,113,215,8 // vpsrlw $0x8,%ymm7,%ymm7
- .byte 196,99,69,56,192,1 // vinserti128 $0x1,%xmm0,%ymm7,%ymm8
- .byte 196,227,69,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm7,%ymm0
- .byte 197,189,103,192 // vpackuswb %ymm0,%ymm8,%ymm0
- .byte 197,253,252,198 // vpaddb %ymm6,%ymm0,%ymm0
- .byte 197,213,213,235 // vpmullw %ymm3,%ymm5,%ymm5
- .byte 197,221,213,226 // vpmullw %ymm2,%ymm4,%ymm4
- .byte 197,221,253,210 // vpaddw %ymm2,%ymm4,%ymm2
- .byte 197,213,253,219 // vpaddw %ymm3,%ymm5,%ymm3
- .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3
+ .byte 196,227,125,56,249,1 // vinserti128 $0x1,%xmm1,%ymm0,%ymm7
+ .byte 196,227,125,70,193,49 // vperm2i128 $0x31,%ymm1,%ymm0,%ymm0
+ .byte 197,245,113,214,8 // vpsrlw $0x8,%ymm6,%ymm1
.byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2
- .byte 196,227,109,56,227,1 // vinserti128 $0x1,%xmm3,%ymm2,%ymm4
- .byte 196,227,109,70,211,49 // vperm2i128 $0x31,%ymm3,%ymm2,%ymm2
- .byte 197,221,103,210 // vpackuswb %ymm2,%ymm4,%ymm2
- .byte 197,253,252,194 // vpaddb %ymm2,%ymm0,%ymm0
+ .byte 197,197,103,192 // vpackuswb %ymm0,%ymm7,%ymm0
+ .byte 196,227,109,56,241,1 // vinserti128 $0x1,%xmm1,%ymm2,%ymm6
+ .byte 196,227,109,70,201,49 // vperm2i128 $0x31,%ymm1,%ymm2,%ymm1
+ .byte 197,205,103,201 // vpackuswb %ymm1,%ymm6,%ymm1
+ .byte 197,221,252,192 // vpaddb %ymm0,%ymm4,%ymm0
+ .byte 197,213,252,201 // vpaddb %ymm1,%ymm5,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 197,253,111,211 // vmovdqa %ymm3,%ymm2
+ .byte 197,252,16,92,36,200 // vmovups -0x38(%rsp),%ymm3
.byte 255,224 // jmpq *%rax
HIDDEN _sk_screen_hsw_8bit
.globl _sk_screen_hsw_8bit
FUNCTION(_sk_screen_hsw_8bit)
_sk_screen_hsw_8bit:
- .byte 197,237,118,210 // vpcmpeqd %ymm2,%ymm2,%ymm2
- .byte 197,253,239,210 // vpxor %ymm2,%ymm0,%ymm2
- .byte 196,226,125,48,218 // vpmovzxbw %xmm2,%ymm3
- .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2
- .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2
- .byte 196,226,125,48,225 // vpmovzxbw %xmm1,%ymm4
- .byte 196,227,125,57,205,1 // vextracti128 $0x1,%ymm1,%xmm5
+ .byte 197,221,118,228 // vpcmpeqd %ymm4,%ymm4,%ymm4
+ .byte 197,245,239,236 // vpxor %ymm4,%ymm1,%ymm5
+ .byte 197,253,239,228 // vpxor %ymm4,%ymm0,%ymm4
+ .byte 196,98,125,48,204 // vpmovzxbw %xmm4,%ymm9
+ .byte 196,227,125,57,228,1 // vextracti128 $0x1,%ymm4,%xmm4
+ .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
+ .byte 196,226,125,48,253 // vpmovzxbw %xmm5,%ymm7
+ .byte 196,227,125,57,237,1 // vextracti128 $0x1,%ymm5,%xmm5
.byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5
- .byte 197,213,213,234 // vpmullw %ymm2,%ymm5,%ymm5
- .byte 197,221,213,227 // vpmullw %ymm3,%ymm4,%ymm4
- .byte 197,221,253,219 // vpaddw %ymm3,%ymm4,%ymm3
- .byte 197,213,253,210 // vpaddw %ymm2,%ymm5,%ymm2
- .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2
- .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3
- .byte 196,227,101,56,226,1 // vinserti128 $0x1,%xmm2,%ymm3,%ymm4
- .byte 196,227,101,70,210,49 // vperm2i128 $0x31,%ymm2,%ymm3,%ymm2
- .byte 197,221,103,210 // vpackuswb %ymm2,%ymm4,%ymm2
- .byte 197,237,252,192 // vpaddb %ymm0,%ymm2,%ymm0
+ .byte 196,98,125,48,194 // vpmovzxbw %xmm2,%ymm8
+ .byte 196,227,125,57,214,1 // vextracti128 $0x1,%ymm2,%xmm6
+ .byte 196,98,125,48,222 // vpmovzxbw %xmm6,%ymm11
+ .byte 196,98,125,48,211 // vpmovzxbw %xmm3,%ymm10
+ .byte 196,227,125,57,222,1 // vextracti128 $0x1,%ymm3,%xmm6
+ .byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6
+ .byte 197,205,213,245 // vpmullw %ymm5,%ymm6,%ymm6
+ .byte 197,45,213,215 // vpmullw %ymm7,%ymm10,%ymm10
+ .byte 197,37,213,220 // vpmullw %ymm4,%ymm11,%ymm11
+ .byte 196,65,61,213,193 // vpmullw %ymm9,%ymm8,%ymm8
+ .byte 196,65,61,253,193 // vpaddw %ymm9,%ymm8,%ymm8
+ .byte 197,165,253,228 // vpaddw %ymm4,%ymm11,%ymm4
+ .byte 197,173,253,255 // vpaddw %ymm7,%ymm10,%ymm7
+ .byte 197,205,253,237 // vpaddw %ymm5,%ymm6,%ymm5
+ .byte 197,213,113,213,8 // vpsrlw $0x8,%ymm5,%ymm5
+ .byte 197,205,113,215,8 // vpsrlw $0x8,%ymm7,%ymm6
+ .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4
+ .byte 196,193,69,113,208,8 // vpsrlw $0x8,%ymm8,%ymm7
+ .byte 196,99,69,56,196,1 // vinserti128 $0x1,%xmm4,%ymm7,%ymm8
+ .byte 196,227,69,70,228,49 // vperm2i128 $0x31,%ymm4,%ymm7,%ymm4
+ .byte 197,189,103,228 // vpackuswb %ymm4,%ymm8,%ymm4
+ .byte 196,227,77,56,253,1 // vinserti128 $0x1,%xmm5,%ymm6,%ymm7
+ .byte 196,227,77,70,237,49 // vperm2i128 $0x31,%ymm5,%ymm6,%ymm5
+ .byte 197,197,103,237 // vpackuswb %ymm5,%ymm7,%ymm5
+ .byte 197,221,252,192 // vpaddb %ymm0,%ymm4,%ymm0
+ .byte 197,213,252,201 // vpaddb %ymm1,%ymm5,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -57410,49 +58605,84 @@ HIDDEN _sk_xor__hsw_8bit
.globl _sk_xor__hsw_8bit
FUNCTION(_sk_xor__hsw_8bit)
_sk_xor__hsw_8bit:
- .byte 197,253,111,21,161,3,0,0 // vmovdqa 0x3a1(%rip),%ymm2 # 15e0 <_sk_xor__hsw_8bit+0x3a9>
- .byte 196,226,117,0,218 // vpshufb %ymm2,%ymm1,%ymm3
- .byte 197,221,118,228 // vpcmpeqd %ymm4,%ymm4,%ymm4
- .byte 197,229,239,220 // vpxor %ymm4,%ymm3,%ymm3
- .byte 196,226,125,48,232 // vpmovzxbw %xmm0,%ymm5
- .byte 196,227,125,57,198,1 // vextracti128 $0x1,%ymm0,%xmm6
+ .byte 197,125,111,13,77,4,0,0 // vmovdqa 0x44d(%rip),%ymm9 # 2ae0 <_sk_xor__hsw_8bit+0x455>
+ .byte 196,194,109,0,225 // vpshufb %ymm9,%ymm2,%ymm4
+ .byte 196,194,101,0,249 // vpshufb %ymm9,%ymm3,%ymm7
+ .byte 196,65,37,118,219 // vpcmpeqd %ymm11,%ymm11,%ymm11
+ .byte 196,193,69,239,251 // vpxor %ymm11,%ymm7,%ymm7
+ .byte 196,193,93,239,227 // vpxor %ymm11,%ymm4,%ymm4
+ .byte 196,98,125,48,192 // vpmovzxbw %xmm0,%ymm8
+ .byte 196,227,125,57,197,1 // vextracti128 $0x1,%ymm0,%xmm5
+ .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5
+ .byte 196,98,125,48,209 // vpmovzxbw %xmm1,%ymm10
+ .byte 196,227,125,57,206,1 // vextracti128 $0x1,%ymm1,%xmm6
.byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6
- .byte 196,226,125,48,251 // vpmovzxbw %xmm3,%ymm7
- .byte 196,227,125,57,219,1 // vextracti128 $0x1,%ymm3,%xmm3
- .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3
- .byte 197,229,213,222 // vpmullw %ymm6,%ymm3,%ymm3
- .byte 197,197,213,253 // vpmullw %ymm5,%ymm7,%ymm7
- .byte 197,197,253,237 // vpaddw %ymm5,%ymm7,%ymm5
- .byte 197,229,253,222 // vpaddw %ymm6,%ymm3,%ymm3
- .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3
+ .byte 196,98,125,48,228 // vpmovzxbw %xmm4,%ymm12
+ .byte 196,227,125,57,228,1 // vextracti128 $0x1,%ymm4,%xmm4
+ .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
+ .byte 196,98,125,48,239 // vpmovzxbw %xmm7,%ymm13
+ .byte 196,227,125,57,255,1 // vextracti128 $0x1,%ymm7,%xmm7
+ .byte 196,226,125,48,255 // vpmovzxbw %xmm7,%ymm7
+ .byte 197,197,213,254 // vpmullw %ymm6,%ymm7,%ymm7
+ .byte 196,65,21,213,234 // vpmullw %ymm10,%ymm13,%ymm13
+ .byte 197,221,213,229 // vpmullw %ymm5,%ymm4,%ymm4
+ .byte 196,65,29,213,224 // vpmullw %ymm8,%ymm12,%ymm12
+ .byte 196,65,29,253,192 // vpaddw %ymm8,%ymm12,%ymm8
+ .byte 197,221,253,229 // vpaddw %ymm5,%ymm4,%ymm4
+ .byte 196,193,21,253,234 // vpaddw %ymm10,%ymm13,%ymm5
+ .byte 197,197,253,246 // vpaddw %ymm6,%ymm7,%ymm6
+ .byte 197,205,113,214,8 // vpsrlw $0x8,%ymm6,%ymm6
.byte 197,213,113,213,8 // vpsrlw $0x8,%ymm5,%ymm5
- .byte 196,227,85,56,243,1 // vinserti128 $0x1,%xmm3,%ymm5,%ymm6
- .byte 196,227,85,70,219,49 // vperm2i128 $0x31,%ymm3,%ymm5,%ymm3
- .byte 197,205,103,219 // vpackuswb %ymm3,%ymm6,%ymm3
- .byte 196,226,125,0,194 // vpshufb %ymm2,%ymm0,%ymm0
- .byte 197,253,239,196 // vpxor %ymm4,%ymm0,%ymm0
- .byte 196,226,125,48,209 // vpmovzxbw %xmm1,%ymm2
- .byte 196,227,125,57,204,1 // vextracti128 $0x1,%ymm1,%xmm4
+ .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4
+ .byte 196,193,69,113,208,8 // vpsrlw $0x8,%ymm8,%ymm7
+ .byte 196,99,69,56,196,1 // vinserti128 $0x1,%xmm4,%ymm7,%ymm8
+ .byte 196,227,69,70,228,49 // vperm2i128 $0x31,%ymm4,%ymm7,%ymm4
+ .byte 197,61,103,212 // vpackuswb %ymm4,%ymm8,%ymm10
+ .byte 196,227,85,56,254,1 // vinserti128 $0x1,%xmm6,%ymm5,%ymm7
+ .byte 196,227,85,70,238,49 // vperm2i128 $0x31,%ymm6,%ymm5,%ymm5
+ .byte 197,197,103,253 // vpackuswb %ymm5,%ymm7,%ymm7
+ .byte 196,194,125,0,193 // vpshufb %ymm9,%ymm0,%ymm0
+ .byte 196,194,117,0,201 // vpshufb %ymm9,%ymm1,%ymm1
+ .byte 196,193,117,239,203 // vpxor %ymm11,%ymm1,%ymm1
+ .byte 196,193,125,239,195 // vpxor %ymm11,%ymm0,%ymm0
+ .byte 196,226,125,48,234 // vpmovzxbw %xmm2,%ymm5
+ .byte 196,227,125,57,214,1 // vextracti128 $0x1,%ymm2,%xmm6
+ .byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6
+ .byte 196,98,125,48,195 // vpmovzxbw %xmm3,%ymm8
+ .byte 196,227,125,57,220,1 // vextracti128 $0x1,%ymm3,%xmm4
.byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4
- .byte 196,226,125,48,232 // vpmovzxbw %xmm0,%ymm5
+ .byte 196,98,125,48,200 // vpmovzxbw %xmm0,%ymm9
.byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0
.byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0
- .byte 197,253,213,196 // vpmullw %ymm4,%ymm0,%ymm0
- .byte 197,213,213,234 // vpmullw %ymm2,%ymm5,%ymm5
- .byte 197,213,253,210 // vpaddw %ymm2,%ymm5,%ymm2
- .byte 197,253,253,196 // vpaddw %ymm4,%ymm0,%ymm0
+ .byte 196,98,125,48,217 // vpmovzxbw %xmm1,%ymm11
+ .byte 196,227,125,57,201,1 // vextracti128 $0x1,%ymm1,%xmm1
+ .byte 196,226,125,48,201 // vpmovzxbw %xmm1,%ymm1
+ .byte 197,221,213,201 // vpmullw %ymm1,%ymm4,%ymm1
+ .byte 196,65,61,213,219 // vpmullw %ymm11,%ymm8,%ymm11
+ .byte 197,205,213,192 // vpmullw %ymm0,%ymm6,%ymm0
+ .byte 196,65,85,213,201 // vpmullw %ymm9,%ymm5,%ymm9
+ .byte 197,181,253,237 // vpaddw %ymm5,%ymm9,%ymm5
+ .byte 197,253,253,198 // vpaddw %ymm6,%ymm0,%ymm0
+ .byte 196,193,37,253,240 // vpaddw %ymm8,%ymm11,%ymm6
+ .byte 197,245,253,204 // vpaddw %ymm4,%ymm1,%ymm1
+ .byte 197,245,113,209,8 // vpsrlw $0x8,%ymm1,%ymm1
+ .byte 197,221,113,214,8 // vpsrlw $0x8,%ymm6,%ymm4
.byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0
- .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2
- .byte 196,227,109,56,224,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm4
- .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- .byte 197,221,103,192 // vpackuswb %ymm0,%ymm4,%ymm0
- .byte 197,253,252,195 // vpaddb %ymm3,%ymm0,%ymm0
+ .byte 197,213,113,213,8 // vpsrlw $0x8,%ymm5,%ymm5
+ .byte 196,227,85,56,240,1 // vinserti128 $0x1,%xmm0,%ymm5,%ymm6
+ .byte 196,227,85,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm5,%ymm0
+ .byte 197,205,103,192 // vpackuswb %ymm0,%ymm6,%ymm0
+ .byte 196,227,93,56,233,1 // vinserti128 $0x1,%xmm1,%ymm4,%ymm5
+ .byte 196,227,93,70,201,49 // vperm2i128 $0x31,%ymm1,%ymm4,%ymm1
+ .byte 197,213,103,201 // vpackuswb %ymm1,%ymm5,%ymm1
+ .byte 196,193,125,252,194 // vpaddb %ymm10,%ymm0,%ymm0
+ .byte 197,245,252,207 // vpaddb %ymm7,%ymm1,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
BALIGN4
.byte 0,0 // add %al,(%rax)
- .byte 127,67 // jg 133b <_sk_xor__hsw_8bit+0x104>
+ .byte 127,67 // jg 2847 <_sk_xor__hsw_8bit+0x1bc>
.byte 1,1 // add %eax,(%rcx)
.byte 1,0 // add %eax,(%rax)
.byte 0,0 // add %al,(%rax)
@@ -57462,9 +58692,11 @@ BALIGN4
.byte 0,0 // add %al,(%rax)
.byte 0,255 // add %bh,%bh
.byte 0,0 // add %al,(%rax)
- .byte 127,67 // jg 134f <_sk_xor__hsw_8bit+0x118>
+ .byte 127,67 // jg 285b <_sk_xor__hsw_8bit+0x1d0>
+ .byte 0,0 // add %al,(%rax)
+ .byte 127,67 // jg 285f <_sk_xor__hsw_8bit+0x1d4>
.byte 0,0 // add %al,(%rax)
- .byte 127,67 // jg 1353 <_sk_xor__hsw_8bit+0x11c>
+ .byte 0,255 // add %bh,%bh
BALIGN32
.byte 0,0 // add %al,(%rax)
@@ -57902,38 +59134,6 @@ BALIGN32
.byte 15 // .byte 0xf
BALIGN16
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 0,2 // add %al,(%rdx)
- .byte 0,0 // add %al,(%rax)
- .byte 0,0 // add %al,(%rax)
- .byte 0,0 // add %al,(%rax)
- .byte 0,0 // add %al,(%rax)
- .byte 0,0 // add %al,(%rax)
- .byte 0,0 // add %al,(%rax)
- .byte 0,0 // add %al,(%rax)
- .byte 0,2 // add %al,(%rdx)
- .byte 4,6 // add $0x6,%al
- .byte 0,0 // add %al,(%rax)
- .byte 0,0 // add %al,(%rax)
- .byte 0,0 // add %al,(%rax)
- .byte 0,0 // add %al,(%rax)
- .byte 0,0 // add %al,(%rax)
- .byte 0,0 // add %al,(%rax)
.byte 0,2 // add %al,(%rdx)
.byte 4,6 // add $0x6,%al
.byte 8,10 // or %cl,(%rdx)
@@ -57942,44 +59142,6 @@ BALIGN16
.byte 0,0 // add %al,(%rax)
.byte 0,0 // add %al,(%rax)
.byte 0,0 // add %al,(%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
-
-BALIGN8
- .byte 0,0 // add %al,(%rax)
- .byte 0,255 // add %bh,%bh
- .byte 0,0 // add %al,(%rax)
- .byte 0,255 // add %bh,%bh
BALIGN32
HIDDEN _sk_start_pipeline_sse41_8bit
@@ -58005,7 +59167,7 @@ _sk_start_pipeline_sse41_8bit:
.byte 73,57,207 // cmp %rcx,%r15
.byte 115,102 // jae 95 <_sk_start_pipeline_sse41_8bit+0x95>
.byte 72,139,69,208 // mov -0x30(%rbp),%rax
- .byte 72,141,64,4 // lea 0x4(%rax),%rax
+ .byte 72,141,64,8 // lea 0x8(%rax),%rax
.byte 72,137,69,176 // mov %rax,-0x50(%rbp)
.byte 76,141,101,184 // lea -0x48(%rbp),%r12
.byte 72,57,93,176 // cmp %rbx,-0x50(%rbp)
@@ -58018,9 +59180,9 @@ _sk_start_pipeline_sse41_8bit:
.byte 76,137,246 // mov %r14,%rsi
.byte 65,255,213 // callq *%r13
.byte 72,139,77,184 // mov -0x48(%rbp),%rcx
- .byte 72,141,65,4 // lea 0x4(%rcx),%rax
+ .byte 72,141,65,8 // lea 0x8(%rcx),%rax
.byte 72,137,69,184 // mov %rax,-0x48(%rbp)
- .byte 72,131,193,8 // add $0x8,%rcx
+ .byte 72,131,193,16 // add $0x10,%rcx
.byte 72,57,217 // cmp %rbx,%rcx
.byte 118,226 // jbe 59 <_sk_start_pipeline_sse41_8bit+0x59>
.byte 72,137,217 // mov %rbx,%rcx
@@ -58056,6 +59218,7 @@ _sk_uniform_color_sse41_8bit:
.byte 102,15,110,64,16 // movd 0x10(%rax),%xmm0
.byte 102,15,112,192,0 // pshufd $0x0,%xmm0,%xmm0
.byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 102,15,111,200 // movdqa %xmm0,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_set_rgb_sse41_8bit
@@ -58063,23 +59226,26 @@ HIDDEN _sk_set_rgb_sse41_8bit
FUNCTION(_sk_set_rgb_sse41_8bit)
_sk_set_rgb_sse41_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 243,15,16,21,77,14,0,0 // movss 0xe4d(%rip),%xmm2 # f0c <_sk_xor__sse41_8bit+0xa2>
- .byte 243,15,16,24 // movss (%rax),%xmm3
- .byte 243,15,89,218 // mulss %xmm2,%xmm3
- .byte 243,72,15,44,203 // cvttss2si %xmm3,%rcx
- .byte 243,15,16,88,4 // movss 0x4(%rax),%xmm3
- .byte 243,15,89,218 // mulss %xmm2,%xmm3
- .byte 243,72,15,44,211 // cvttss2si %xmm3,%rdx
+ .byte 243,15,16,37,97,28,0,0 // movss 0x1c61(%rip),%xmm4 # 1d24 <_sk_xor__sse41_8bit+0x148>
+ .byte 243,15,16,40 // movss (%rax),%xmm5
+ .byte 243,15,89,236 // mulss %xmm4,%xmm5
+ .byte 243,72,15,44,205 // cvttss2si %xmm5,%rcx
+ .byte 243,15,16,104,4 // movss 0x4(%rax),%xmm5
+ .byte 243,15,89,236 // mulss %xmm4,%xmm5
+ .byte 243,72,15,44,213 // cvttss2si %xmm5,%rdx
.byte 193,226,8 // shl $0x8,%edx
.byte 9,202 // or %ecx,%edx
- .byte 243,15,89,80,8 // mulss 0x8(%rax),%xmm2
- .byte 243,72,15,44,194 // cvttss2si %xmm2,%rax
+ .byte 243,15,89,96,8 // mulss 0x8(%rax),%xmm4
+ .byte 243,72,15,44,196 // cvttss2si %xmm4,%rax
.byte 193,224,16 // shl $0x10,%eax
.byte 9,208 // or %edx,%eax
- .byte 102,15,110,208 // movd %eax,%xmm2
- .byte 102,15,112,210,0 // pshufd $0x0,%xmm2,%xmm2
- .byte 102,15,219,5,33,14,0,0 // pand 0xe21(%rip),%xmm0 # f20 <_sk_xor__sse41_8bit+0xb6>
- .byte 102,15,235,194 // por %xmm2,%xmm0
+ .byte 102,15,110,224 // movd %eax,%xmm4
+ .byte 102,15,112,228,0 // pshufd $0x0,%xmm4,%xmm4
+ .byte 102,15,111,45,45,28,0,0 // movdqa 0x1c2d(%rip),%xmm5 # 1d30 <_sk_xor__sse41_8bit+0x154>
+ .byte 102,15,219,205 // pand %xmm5,%xmm1
+ .byte 102,15,219,197 // pand %xmm5,%xmm0
+ .byte 102,15,235,196 // por %xmm4,%xmm0
+ .byte 102,15,235,204 // por %xmm4,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -58087,30 +59253,49 @@ HIDDEN _sk_premul_sse41_8bit
.globl _sk_premul_sse41_8bit
FUNCTION(_sk_premul_sse41_8bit)
_sk_premul_sse41_8bit:
- .byte 102,15,111,216 // movdqa %xmm0,%xmm3
- .byte 102,15,56,0,29,28,14,0,0 // pshufb 0xe1c(%rip),%xmm3 # f30 <_sk_xor__sse41_8bit+0xc6>
- .byte 102,15,235,29,36,14,0,0 // por 0xe24(%rip),%xmm3 # f40 <_sk_xor__sse41_8bit+0xd6>
- .byte 102,15,239,228 // pxor %xmm4,%xmm4
- .byte 102,15,56,48,232 // pmovzxbw %xmm0,%xmm5
- .byte 102,15,104,196 // punpckhbw %xmm4,%xmm0
- .byte 102,15,56,48,211 // pmovzxbw %xmm3,%xmm2
- .byte 102,15,104,220 // punpckhbw %xmm4,%xmm3
- .byte 102,15,213,216 // pmullw %xmm0,%xmm3
- .byte 102,15,213,213 // pmullw %xmm5,%xmm2
- .byte 102,15,253,216 // paddw %xmm0,%xmm3
- .byte 102,15,253,213 // paddw %xmm5,%xmm2
- .byte 102,15,113,211,8 // psrlw $0x8,%xmm3
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,211 // packuswb %xmm3,%xmm2
+ .byte 102,15,111,225 // movdqa %xmm1,%xmm4
+ .byte 102,15,111,232 // movdqa %xmm0,%xmm5
+ .byte 102,15,111,5,25,28,0,0 // movdqa 0x1c19(%rip),%xmm0 # 1d40 <_sk_xor__sse41_8bit+0x164>
+ .byte 102,15,111,253 // movdqa %xmm5,%xmm7
+ .byte 102,15,56,0,248 // pshufb %xmm0,%xmm7
+ .byte 102,15,111,244 // movdqa %xmm4,%xmm6
+ .byte 102,15,56,0,240 // pshufb %xmm0,%xmm6
+ .byte 102,15,111,5,15,28,0,0 // movdqa 0x1c0f(%rip),%xmm0 # 1d50 <_sk_xor__sse41_8bit+0x174>
+ .byte 102,15,235,240 // por %xmm0,%xmm6
+ .byte 102,15,235,248 // por %xmm0,%xmm7
+ .byte 102,69,15,239,192 // pxor %xmm8,%xmm8
+ .byte 102,68,15,56,48,205 // pmovzxbw %xmm5,%xmm9
+ .byte 102,65,15,104,232 // punpckhbw %xmm8,%xmm5
+ .byte 102,68,15,56,48,212 // pmovzxbw %xmm4,%xmm10
+ .byte 102,65,15,104,224 // punpckhbw %xmm8,%xmm4
+ .byte 102,15,56,48,199 // pmovzxbw %xmm7,%xmm0
+ .byte 102,15,56,48,206 // pmovzxbw %xmm6,%xmm1
+ .byte 102,65,15,104,248 // punpckhbw %xmm8,%xmm7
+ .byte 102,65,15,104,240 // punpckhbw %xmm8,%xmm6
+ .byte 102,15,213,244 // pmullw %xmm4,%xmm6
+ .byte 102,15,213,253 // pmullw %xmm5,%xmm7
+ .byte 102,65,15,213,202 // pmullw %xmm10,%xmm1
+ .byte 102,65,15,213,193 // pmullw %xmm9,%xmm0
+ .byte 102,15,253,253 // paddw %xmm5,%xmm7
+ .byte 102,15,253,244 // paddw %xmm4,%xmm6
+ .byte 102,65,15,253,193 // paddw %xmm9,%xmm0
+ .byte 102,65,15,253,202 // paddw %xmm10,%xmm1
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
+ .byte 102,15,113,208,8 // psrlw $0x8,%xmm0
+ .byte 102,15,103,199 // packuswb %xmm7,%xmm0
+ .byte 102,15,103,206 // packuswb %xmm6,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 102,15,111,194 // movdqa %xmm2,%xmm0
.byte 255,224 // jmpq *%rax
HIDDEN _sk_swap_rb_sse41_8bit
.globl _sk_swap_rb_sse41_8bit
FUNCTION(_sk_swap_rb_sse41_8bit)
_sk_swap_rb_sse41_8bit:
- .byte 102,15,56,0,5,239,13,0,0 // pshufb 0xdef(%rip),%xmm0 # f50 <_sk_xor__sse41_8bit+0xe6>
+ .byte 102,15,111,37,156,27,0,0 // movdqa 0x1b9c(%rip),%xmm4 # 1d60 <_sk_xor__sse41_8bit+0x184>
+ .byte 102,15,56,0,196 // pshufb %xmm4,%xmm0
+ .byte 102,15,56,0,204 // pshufb %xmm4,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -58118,8 +59303,9 @@ HIDDEN _sk_invert_sse41_8bit
.globl _sk_invert_sse41_8bit
FUNCTION(_sk_invert_sse41_8bit)
_sk_invert_sse41_8bit:
- .byte 102,15,118,210 // pcmpeqd %xmm2,%xmm2
- .byte 102,15,239,194 // pxor %xmm2,%xmm0
+ .byte 102,15,118,228 // pcmpeqd %xmm4,%xmm4
+ .byte 102,15,239,196 // pxor %xmm4,%xmm0
+ .byte 102,15,239,204 // pxor %xmm4,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -58136,25 +59322,51 @@ _sk_load_8888_sse41_8bit:
.byte 72,193,226,2 // shl $0x2,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,10 // jne 19c <_sk_load_8888_sse41_8bit+0x2b>
+ .byte 117,17 // jne 214 <_sk_load_8888_sse41_8bit+0x32>
+ .byte 243,66,15,111,76,130,16 // movdqu 0x10(%rdx,%r8,4),%xmm1
.byte 243,66,15,111,4,130 // movdqu (%rdx,%r8,4),%xmm0
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,42 // je 1d0 <_sk_load_8888_sse41_8bit+0x5f>
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,201 // pxor %xmm1,%xmm1
.byte 102,15,239,192 // pxor %xmm0,%xmm0
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,18 // je 1c2 <_sk_load_8888_sse41_8bit+0x51>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,226 // jne 198 <_sk_load_8888_sse41_8bit+0x27>
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,231 // ja 210 <_sk_load_8888_sse41_8bit+0x2e>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,76,0,0,0 // lea 0x4c(%rip),%rcx # 280 <_sk_load_8888_sse41_8bit+0x9e>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 102,66,15,110,4,130 // movd (%rdx,%r8,4),%xmm0
+ .byte 235,203 // jmp 210 <_sk_load_8888_sse41_8bit+0x2e>
.byte 102,66,15,110,68,130,8 // movd 0x8(%rdx,%r8,4),%xmm0
.byte 102,15,112,192,69 // pshufd $0x45,%xmm0,%xmm0
- .byte 243,66,15,126,20,130 // movq (%rdx,%r8,4),%xmm2
- .byte 102,15,58,14,194,15 // pblendw $0xf,%xmm2,%xmm0
- .byte 235,200 // jmp 198 <_sk_load_8888_sse41_8bit+0x27>
- .byte 102,66,15,110,4,130 // movd (%rdx,%r8,4),%xmm0
- .byte 235,192 // jmp 198 <_sk_load_8888_sse41_8bit+0x27>
+ .byte 243,66,15,126,36,130 // movq (%rdx,%r8,4),%xmm4
+ .byte 102,15,58,14,196,15 // pblendw $0xf,%xmm4,%xmm0
+ .byte 235,177 // jmp 210 <_sk_load_8888_sse41_8bit+0x2e>
+ .byte 102,66,15,110,68,130,24 // movd 0x18(%rdx,%r8,4),%xmm0
+ .byte 102,15,112,200,69 // pshufd $0x45,%xmm0,%xmm1
+ .byte 102,66,15,58,34,76,130,20,1 // pinsrd $0x1,0x14(%rdx,%r8,4),%xmm1
+ .byte 102,66,15,58,34,76,130,16,0 // pinsrd $0x0,0x10(%rdx,%r8,4),%xmm1
+ .byte 235,139 // jmp 20a <_sk_load_8888_sse41_8bit+0x28>
+ .byte 144 // nop
+ .byte 189,255,255,255,209 // mov $0xd1ffffff,%ebp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,197 // inc %ebp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,138,255,255,255,244 // decl -0xb000001(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 235,255 // jmp 295 <_sk_load_8888_sse41_8bit+0xb3>
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 223,255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_load_8888_dst_sse41_8bit
.globl _sk_load_8888_dst_sse41_8bit
@@ -58169,55 +59381,111 @@ _sk_load_8888_dst_sse41_8bit:
.byte 72,193,226,2 // shl $0x2,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,10 // jne 203 <_sk_load_8888_dst_sse41_8bit+0x2b>
- .byte 243,66,15,111,12,130 // movdqu (%rdx,%r8,4),%xmm1
+ .byte 117,17 // jne 2ce <_sk_load_8888_dst_sse41_8bit+0x32>
+ .byte 243,66,15,111,92,130,16 // movdqu 0x10(%rdx,%r8,4),%xmm3
+ .byte 243,66,15,111,20,130 // movdqu (%rdx,%r8,4),%xmm2
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,42 // je 237 <_sk_load_8888_dst_sse41_8bit+0x5f>
- .byte 102,15,239,201 // pxor %xmm1,%xmm1
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,18 // je 229 <_sk_load_8888_dst_sse41_8bit+0x51>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,226 // jne 1ff <_sk_load_8888_dst_sse41_8bit+0x27>
- .byte 102,66,15,110,76,130,8 // movd 0x8(%rdx,%r8,4),%xmm1
- .byte 102,15,112,201,69 // pshufd $0x45,%xmm1,%xmm1
- .byte 243,66,15,126,20,130 // movq (%rdx,%r8,4),%xmm2
- .byte 102,15,58,14,202,15 // pblendw $0xf,%xmm2,%xmm1
- .byte 235,200 // jmp 1ff <_sk_load_8888_dst_sse41_8bit+0x27>
- .byte 102,66,15,110,12,130 // movd (%rdx,%r8,4),%xmm1
- .byte 235,192 // jmp 1ff <_sk_load_8888_dst_sse41_8bit+0x27>
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,219 // pxor %xmm3,%xmm3
+ .byte 102,15,239,210 // pxor %xmm2,%xmm2
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,231 // ja 2ca <_sk_load_8888_dst_sse41_8bit+0x2e>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,78,0,0,0 // lea 0x4e(%rip),%rcx # 33c <_sk_load_8888_dst_sse41_8bit+0xa0>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 102,66,15,110,20,130 // movd (%rdx,%r8,4),%xmm2
+ .byte 235,203 // jmp 2ca <_sk_load_8888_dst_sse41_8bit+0x2e>
+ .byte 102,66,15,110,84,130,8 // movd 0x8(%rdx,%r8,4),%xmm2
+ .byte 102,15,112,210,69 // pshufd $0x45,%xmm2,%xmm2
+ .byte 243,66,15,126,36,130 // movq (%rdx,%r8,4),%xmm4
+ .byte 102,15,58,14,212,15 // pblendw $0xf,%xmm4,%xmm2
+ .byte 235,177 // jmp 2ca <_sk_load_8888_dst_sse41_8bit+0x2e>
+ .byte 102,66,15,110,84,130,24 // movd 0x18(%rdx,%r8,4),%xmm2
+ .byte 102,15,112,218,69 // pshufd $0x45,%xmm2,%xmm3
+ .byte 102,66,15,58,34,92,130,20,1 // pinsrd $0x1,0x14(%rdx,%r8,4),%xmm3
+ .byte 102,66,15,58,34,92,130,16,0 // pinsrd $0x0,0x10(%rdx,%r8,4),%xmm3
+ .byte 235,139 // jmp 2c4 <_sk_load_8888_dst_sse41_8bit+0x28>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 187,255,255,255,207 // mov $0xcfffffff,%ebx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,195 // inc %ebx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,136,255,255,255,242 // decl -0xd000001(%rax)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 233,255,255,255,221 // jmpq ffffffffde000354 <_sk_xor__sse41_8bit+0xffffffffddffe778>
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_store_8888_sse41_8bit
.globl _sk_store_8888_sse41_8bit
FUNCTION(_sk_store_8888_sse41_8bit)
_sk_store_8888_sse41_8bit:
- .byte 76,99,7 // movslq (%rdi),%r8
- .byte 76,139,79,16 // mov 0x10(%rdi),%r9
+ .byte 76,99,15 // movslq (%rdi),%r9
+ .byte 76,139,71,16 // mov 0x10(%rdi),%r8
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 72,99,72,8 // movslq 0x8(%rax),%rcx
.byte 72,99,87,8 // movslq 0x8(%rdi),%rdx
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,193,226,2 // shl $0x2,%rdx
.byte 72,3,16 // add (%rax),%rdx
- .byte 77,133,201 // test %r9,%r9
- .byte 117,10 // jne 26a <_sk_store_8888_sse41_8bit+0x2b>
- .byte 243,66,15,127,4,130 // movdqu %xmm0,(%rdx,%r8,4)
- .byte 72,173 // lods %ds:(%rsi),%rax
- .byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,29 // je 291 <_sk_store_8888_sse41_8bit+0x52>
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,15 // je 289 <_sk_store_8888_sse41_8bit+0x4a>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,230 // jne 266 <_sk_store_8888_sse41_8bit+0x27>
- .byte 102,66,15,58,22,68,130,8,2 // pextrd $0x2,%xmm0,0x8(%rdx,%r8,4)
- .byte 102,66,15,214,4,130 // movq %xmm0,(%rdx,%r8,4)
- .byte 235,213 // jmp 266 <_sk_store_8888_sse41_8bit+0x27>
- .byte 102,66,15,126,4,130 // movd %xmm0,(%rdx,%r8,4)
- .byte 235,205 // jmp 266 <_sk_store_8888_sse41_8bit+0x27>
+ .byte 77,133,192 // test %r8,%r8
+ .byte 117,17 // jne 38a <_sk_store_8888_sse41_8bit+0x32>
+ .byte 243,66,15,127,4,138 // movdqu %xmm0,(%rdx,%r9,4)
+ .byte 243,66,15,127,76,138,16 // movdqu %xmm1,0x10(%rdx,%r9,4)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 65,128,224,7 // and $0x7,%r8b
+ .byte 65,254,200 // dec %r8b
+ .byte 65,128,248,6 // cmp $0x6,%r8b
+ .byte 119,239 // ja 386 <_sk_store_8888_sse41_8bit+0x2e>
+ .byte 65,15,182,192 // movzbl %r8b,%eax
+ .byte 72,141,13,70,0,0,0 // lea 0x46(%rip),%rcx # 3e8 <_sk_store_8888_sse41_8bit+0x90>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 102,66,15,126,4,138 // movd %xmm0,(%rdx,%r9,4)
+ .byte 235,211 // jmp 386 <_sk_store_8888_sse41_8bit+0x2e>
+ .byte 102,66,15,58,22,68,138,8,2 // pextrd $0x2,%xmm0,0x8(%rdx,%r9,4)
+ .byte 102,66,15,214,4,138 // movq %xmm0,(%rdx,%r9,4)
+ .byte 235,194 // jmp 386 <_sk_store_8888_sse41_8bit+0x2e>
+ .byte 102,66,15,58,22,76,138,24,2 // pextrd $0x2,%xmm1,0x18(%rdx,%r9,4)
+ .byte 102,66,15,58,22,76,138,20,1 // pextrd $0x1,%xmm1,0x14(%rdx,%r9,4)
+ .byte 102,66,15,126,76,138,16 // movd %xmm1,0x10(%rdx,%r9,4)
+ .byte 243,66,15,127,4,138 // movdqu %xmm0,(%rdx,%r9,4)
+ .byte 235,161 // jmp 386 <_sk_store_8888_sse41_8bit+0x2e>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 195 // retq
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,212 // callq *%rsp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,203 // dec %ebx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,245 // push %rbp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 238 // out %al,(%dx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,229 // jmpq *%rbp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 220,255 // fdivr %st,%st(7)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_load_bgra_sse41_8bit
.globl _sk_load_bgra_sse41_8bit
@@ -58232,26 +59500,55 @@ _sk_load_bgra_sse41_8bit:
.byte 72,193,226,2 // shl $0x2,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,19 // jne 2cd <_sk_load_bgra_sse41_8bit+0x34>
+ .byte 117,35 // jne 448 <_sk_load_bgra_sse41_8bit+0x44>
+ .byte 243,66,15,111,76,130,16 // movdqu 0x10(%rdx,%r8,4),%xmm1
.byte 243,66,15,111,4,130 // movdqu (%rdx,%r8,4),%xmm0
- .byte 102,15,56,0,5,151,12,0,0 // pshufb 0xc97(%rip),%xmm0 # f60 <_sk_xor__sse41_8bit+0xf6>
+ .byte 102,15,111,37,54,25,0,0 // movdqa 0x1936(%rip),%xmm4 # 1d70 <_sk_xor__sse41_8bit+0x194>
+ .byte 102,15,56,0,196 // pshufb %xmm4,%xmm0
+ .byte 102,15,56,0,204 // pshufb %xmm4,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,42 // je 301 <_sk_load_bgra_sse41_8bit+0x68>
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,201 // pxor %xmm1,%xmm1
.byte 102,15,239,192 // pxor %xmm0,%xmm0
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,18 // je 2f3 <_sk_load_bgra_sse41_8bit+0x5a>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,217 // jne 2c0 <_sk_load_bgra_sse41_8bit+0x27>
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,213 // ja 432 <_sk_load_bgra_sse41_8bit+0x2e>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,80,0,0,0 // lea 0x50(%rip),%rcx # 4b8 <_sk_load_bgra_sse41_8bit+0xb4>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 102,66,15,110,4,130 // movd (%rdx,%r8,4),%xmm0
+ .byte 235,185 // jmp 432 <_sk_load_bgra_sse41_8bit+0x2e>
.byte 102,66,15,110,68,130,8 // movd 0x8(%rdx,%r8,4),%xmm0
.byte 102,15,112,192,69 // pshufd $0x45,%xmm0,%xmm0
- .byte 243,66,15,126,20,130 // movq (%rdx,%r8,4),%xmm2
- .byte 102,15,58,14,194,15 // pblendw $0xf,%xmm2,%xmm0
- .byte 235,191 // jmp 2c0 <_sk_load_bgra_sse41_8bit+0x27>
- .byte 102,66,15,110,4,130 // movd (%rdx,%r8,4),%xmm0
- .byte 235,183 // jmp 2c0 <_sk_load_bgra_sse41_8bit+0x27>
+ .byte 243,66,15,126,36,130 // movq (%rdx,%r8,4),%xmm4
+ .byte 102,15,58,14,196,15 // pblendw $0xf,%xmm4,%xmm0
+ .byte 235,159 // jmp 432 <_sk_load_bgra_sse41_8bit+0x2e>
+ .byte 102,66,15,110,68,130,24 // movd 0x18(%rdx,%r8,4),%xmm0
+ .byte 102,15,112,200,69 // pshufd $0x45,%xmm0,%xmm1
+ .byte 102,66,15,58,34,76,130,20,1 // pinsrd $0x1,0x14(%rdx,%r8,4),%xmm1
+ .byte 102,66,15,58,34,76,130,16,0 // pinsrd $0x0,0x10(%rdx,%r8,4),%xmm1
+ .byte 233,118,255,255,255 // jmpq 42c <_sk_load_bgra_sse41_8bit+0x28>
+ .byte 102,144 // xchg %ax,%ax
+ .byte 185,255,255,255,205 // mov $0xcdffffff,%ecx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,193 // inc %ecx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,116,255,255 // pushq -0x1(%rdi,%rdi,8)
+ .byte 255,240 // push %rax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,231 // jmpq *%rdi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 219,255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_load_bgra_dst_sse41_8bit
.globl _sk_load_bgra_dst_sse41_8bit
@@ -58266,58 +59563,119 @@ _sk_load_bgra_dst_sse41_8bit:
.byte 72,193,226,2 // shl $0x2,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,19 // jne 33d <_sk_load_bgra_dst_sse41_8bit+0x34>
- .byte 243,66,15,111,12,130 // movdqu (%rdx,%r8,4),%xmm1
- .byte 102,15,56,0,13,55,12,0,0 // pshufb 0xc37(%rip),%xmm1 # f70 <_sk_xor__sse41_8bit+0x106>
+ .byte 117,35 // jne 518 <_sk_load_bgra_dst_sse41_8bit+0x44>
+ .byte 243,66,15,111,92,130,16 // movdqu 0x10(%rdx,%r8,4),%xmm3
+ .byte 243,66,15,111,20,130 // movdqu (%rdx,%r8,4),%xmm2
+ .byte 102,15,111,37,118,24,0,0 // movdqa 0x1876(%rip),%xmm4 # 1d80 <_sk_xor__sse41_8bit+0x1a4>
+ .byte 102,15,56,0,212 // pshufb %xmm4,%xmm2
+ .byte 102,15,56,0,220 // pshufb %xmm4,%xmm3
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,42 // je 371 <_sk_load_bgra_dst_sse41_8bit+0x68>
- .byte 102,15,239,201 // pxor %xmm1,%xmm1
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,18 // je 363 <_sk_load_bgra_dst_sse41_8bit+0x5a>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,217 // jne 330 <_sk_load_bgra_dst_sse41_8bit+0x27>
- .byte 102,66,15,110,76,130,8 // movd 0x8(%rdx,%r8,4),%xmm1
- .byte 102,15,112,201,69 // pshufd $0x45,%xmm1,%xmm1
- .byte 243,66,15,126,20,130 // movq (%rdx,%r8,4),%xmm2
- .byte 102,15,58,14,202,15 // pblendw $0xf,%xmm2,%xmm1
- .byte 235,191 // jmp 330 <_sk_load_bgra_dst_sse41_8bit+0x27>
- .byte 102,66,15,110,12,130 // movd (%rdx,%r8,4),%xmm1
- .byte 235,183 // jmp 330 <_sk_load_bgra_dst_sse41_8bit+0x27>
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,219 // pxor %xmm3,%xmm3
+ .byte 102,15,239,210 // pxor %xmm2,%xmm2
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,213 // ja 502 <_sk_load_bgra_dst_sse41_8bit+0x2e>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,80,0,0,0 // lea 0x50(%rip),%rcx # 588 <_sk_load_bgra_dst_sse41_8bit+0xb4>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 102,66,15,110,20,130 // movd (%rdx,%r8,4),%xmm2
+ .byte 235,185 // jmp 502 <_sk_load_bgra_dst_sse41_8bit+0x2e>
+ .byte 102,66,15,110,84,130,8 // movd 0x8(%rdx,%r8,4),%xmm2
+ .byte 102,15,112,210,69 // pshufd $0x45,%xmm2,%xmm2
+ .byte 243,66,15,126,36,130 // movq (%rdx,%r8,4),%xmm4
+ .byte 102,15,58,14,212,15 // pblendw $0xf,%xmm4,%xmm2
+ .byte 235,159 // jmp 502 <_sk_load_bgra_dst_sse41_8bit+0x2e>
+ .byte 102,66,15,110,84,130,24 // movd 0x18(%rdx,%r8,4),%xmm2
+ .byte 102,15,112,218,69 // pshufd $0x45,%xmm2,%xmm3
+ .byte 102,66,15,58,34,92,130,20,1 // pinsrd $0x1,0x14(%rdx,%r8,4),%xmm3
+ .byte 102,66,15,58,34,92,130,16,0 // pinsrd $0x0,0x10(%rdx,%r8,4),%xmm3
+ .byte 233,118,255,255,255 // jmpq 4fc <_sk_load_bgra_dst_sse41_8bit+0x28>
+ .byte 102,144 // xchg %ax,%ax
+ .byte 185,255,255,255,205 // mov $0xcdffffff,%ecx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,193 // inc %ecx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,116,255,255 // pushq -0x1(%rdi,%rdi,8)
+ .byte 255,240 // push %rax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,231 // jmpq *%rdi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 219,255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_store_bgra_sse41_8bit
.globl _sk_store_bgra_sse41_8bit
FUNCTION(_sk_store_bgra_sse41_8bit)
_sk_store_bgra_sse41_8bit:
- .byte 76,99,7 // movslq (%rdi),%r8
- .byte 76,139,79,16 // mov 0x10(%rdi),%r9
+ .byte 76,99,15 // movslq (%rdi),%r9
+ .byte 76,139,71,16 // mov 0x10(%rdi),%r8
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 72,99,72,8 // movslq 0x8(%rax),%rcx
.byte 72,99,87,8 // movslq 0x8(%rdi),%rdx
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,193,226,2 // shl $0x2,%rdx
.byte 72,3,16 // add (%rax),%rdx
- .byte 102,15,111,208 // movdqa %xmm0,%xmm2
- .byte 102,15,56,0,21,222,11,0,0 // pshufb 0xbde(%rip),%xmm2 # f80 <_sk_xor__sse41_8bit+0x116>
- .byte 77,133,201 // test %r9,%r9
- .byte 117,10 // jne 3b1 <_sk_store_bgra_sse41_8bit+0x38>
- .byte 243,66,15,127,20,130 // movdqu %xmm2,(%rdx,%r8,4)
- .byte 72,173 // lods %ds:(%rsi),%rax
- .byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,29 // je 3d8 <_sk_store_bgra_sse41_8bit+0x5f>
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,15 // je 3d0 <_sk_store_bgra_sse41_8bit+0x57>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,230 // jne 3ad <_sk_store_bgra_sse41_8bit+0x34>
- .byte 102,66,15,58,22,84,130,8,2 // pextrd $0x2,%xmm2,0x8(%rdx,%r8,4)
- .byte 102,66,15,214,20,130 // movq %xmm2,(%rdx,%r8,4)
- .byte 235,213 // jmp 3ad <_sk_store_bgra_sse41_8bit+0x34>
- .byte 102,66,15,126,20,130 // movd %xmm2,(%rdx,%r8,4)
- .byte 235,205 // jmp 3ad <_sk_store_bgra_sse41_8bit+0x34>
+ .byte 102,15,111,53,200,23,0,0 // movdqa 0x17c8(%rip),%xmm6 # 1d90 <_sk_xor__sse41_8bit+0x1b4>
+ .byte 102,15,111,233 // movdqa %xmm1,%xmm5
+ .byte 102,15,56,0,238 // pshufb %xmm6,%xmm5
+ .byte 102,15,111,224 // movdqa %xmm0,%xmm4
+ .byte 102,15,56,0,230 // pshufb %xmm6,%xmm4
+ .byte 77,133,192 // test %r8,%r8
+ .byte 117,17 // jne 5f0 <_sk_store_bgra_sse41_8bit+0x4c>
+ .byte 243,66,15,127,36,138 // movdqu %xmm4,(%rdx,%r9,4)
+ .byte 243,66,15,127,108,138,16 // movdqu %xmm5,0x10(%rdx,%r9,4)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 65,128,224,7 // and $0x7,%r8b
+ .byte 65,254,200 // dec %r8b
+ .byte 65,128,248,6 // cmp $0x6,%r8b
+ .byte 119,239 // ja 5ec <_sk_store_bgra_sse41_8bit+0x48>
+ .byte 65,15,182,192 // movzbl %r8b,%eax
+ .byte 72,141,13,68,0,0,0 // lea 0x44(%rip),%rcx # 64c <_sk_store_bgra_sse41_8bit+0xa8>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 102,66,15,126,36,138 // movd %xmm4,(%rdx,%r9,4)
+ .byte 235,211 // jmp 5ec <_sk_store_bgra_sse41_8bit+0x48>
+ .byte 102,66,15,58,22,100,138,8,2 // pextrd $0x2,%xmm4,0x8(%rdx,%r9,4)
+ .byte 102,66,15,214,36,138 // movq %xmm4,(%rdx,%r9,4)
+ .byte 235,194 // jmp 5ec <_sk_store_bgra_sse41_8bit+0x48>
+ .byte 102,66,15,58,22,108,138,24,2 // pextrd $0x2,%xmm5,0x18(%rdx,%r9,4)
+ .byte 102,66,15,58,22,108,138,20,1 // pextrd $0x1,%xmm5,0x14(%rdx,%r9,4)
+ .byte 102,66,15,126,108,138,16 // movd %xmm5,0x10(%rdx,%r9,4)
+ .byte 243,66,15,127,36,138 // movdqu %xmm4,(%rdx,%r9,4)
+ .byte 235,161 // jmp 5ec <_sk_store_bgra_sse41_8bit+0x48>
+ .byte 144 // nop
+ .byte 197,255,255 // (bad)
+ .byte 255,214 // callq *%rsi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,205 // dec %ebp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,247 // push %rdi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,240 // push %rax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,231 // jmpq *%rdi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 222,255 // fdivrp %st,%st(7)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_load_a8_sse41_8bit
.globl _sk_load_a8_sse41_8bit
@@ -58331,30 +59689,66 @@ _sk_load_a8_sse41_8bit:
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,16 // jne 40d <_sk_load_a8_sse41_8bit+0x2d>
- .byte 102,66,15,56,49,4,2 // pmovzxbd (%rdx,%r8,1),%xmm0
+ .byte 117,42 // jne 6af <_sk_load_a8_sse41_8bit+0x47>
+ .byte 102,66,15,56,48,12,2 // pmovzxbw (%rdx,%r8,1),%xmm1
+ .byte 102,15,219,13,12,23,0,0 // pand 0x170c(%rip),%xmm1 # 1da0 <_sk_xor__sse41_8bit+0x1c4>
+ .byte 102,15,239,228 // pxor %xmm4,%xmm4
+ .byte 102,15,56,51,193 // pmovzxwd %xmm1,%xmm0
+ .byte 102,15,105,204 // punpckhwd %xmm4,%xmm1
+ .byte 102,15,114,241,24 // pslld $0x18,%xmm1
.byte 102,15,114,240,24 // pslld $0x18,%xmm0
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,53 // je 44c <_sk_load_a8_sse41_8bit+0x6c>
- .byte 102,15,239,192 // pxor %xmm0,%xmm0
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,21 // je 436 <_sk_load_a8_sse41_8bit+0x56>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,221 // jne 404 <_sk_load_a8_sse41_8bit+0x24>
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,201 // pxor %xmm1,%xmm1
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,204 // ja 68c <_sk_load_a8_sse41_8bit+0x24>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,117,0,0,0 // lea 0x75(%rip),%rcx # 740 <_sk_load_a8_sse41_8bit+0xd8>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
+ .byte 102,15,110,200 // movd %eax,%xmm1
+ .byte 235,173 // jmp 68c <_sk_load_a8_sse41_8bit+0x24>
.byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
- .byte 102,15,110,192 // movd %eax,%xmm0
- .byte 102,15,112,192,69 // pshufd $0x45,%xmm0,%xmm0
+ .byte 102,15,239,201 // pxor %xmm1,%xmm1
+ .byte 102,15,196,200,2 // pinsrw $0x2,%eax,%xmm1
.byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
- .byte 102,15,110,208 // movd %eax,%xmm2
- .byte 102,15,56,49,210 // pmovzxbd %xmm2,%xmm2
- .byte 102,15,58,14,194,15 // pblendw $0xf,%xmm2,%xmm0
- .byte 235,184 // jmp 404 <_sk_load_a8_sse41_8bit+0x24>
- .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
.byte 102,15,110,192 // movd %eax,%xmm0
- .byte 235,173 // jmp 404 <_sk_load_a8_sse41_8bit+0x24>
+ .byte 102,15,56,48,192 // pmovzxbw %xmm0,%xmm0
+ .byte 102,15,58,14,200,3 // pblendw $0x3,%xmm0,%xmm1
+ .byte 235,136 // jmp 68c <_sk_load_a8_sse41_8bit+0x24>
+ .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax
+ .byte 102,15,239,201 // pxor %xmm1,%xmm1
+ .byte 102,15,196,200,6 // pinsrw $0x6,%eax,%xmm1
+ .byte 66,15,182,68,2,5 // movzbl 0x5(%rdx,%r8,1),%eax
+ .byte 102,15,196,200,5 // pinsrw $0x5,%eax,%xmm1
+ .byte 66,15,182,68,2,4 // movzbl 0x4(%rdx,%r8,1),%eax
+ .byte 102,15,196,200,4 // pinsrw $0x4,%eax,%xmm1
+ .byte 102,66,15,110,4,2 // movd (%rdx,%r8,1),%xmm0
+ .byte 102,15,56,48,192 // pmovzxbw %xmm0,%xmm0
+ .byte 102,15,58,14,200,15 // pblendw $0xf,%xmm0,%xmm1
+ .byte 233,77,255,255,255 // jmpq 68c <_sk_load_a8_sse41_8bit+0x24>
+ .byte 144 // nop
+ .byte 148 // xchg %eax,%esp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,174,255,255,255,159 // ljmp *-0x60000001(%rsi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 233,255,255,255,222 // jmpq ffffffffdf000750 <_sk_xor__sse41_8bit+0xffffffffdeffeb74>
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,211 // callq *%rbx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,196 // inc %esp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_load_a8_dst_sse41_8bit
.globl _sk_load_a8_dst_sse41_8bit
@@ -58368,30 +59762,66 @@ _sk_load_a8_dst_sse41_8bit:
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,16 // jne 484 <_sk_load_a8_dst_sse41_8bit+0x2d>
- .byte 102,66,15,56,49,12,2 // pmovzxbd (%rdx,%r8,1),%xmm1
- .byte 102,15,114,241,24 // pslld $0x18,%xmm1
+ .byte 117,42 // jne 7a3 <_sk_load_a8_dst_sse41_8bit+0x47>
+ .byte 102,66,15,56,48,28,2 // pmovzxbw (%rdx,%r8,1),%xmm3
+ .byte 102,15,219,29,40,22,0,0 // pand 0x1628(%rip),%xmm3 # 1db0 <_sk_xor__sse41_8bit+0x1d4>
+ .byte 102,15,239,228 // pxor %xmm4,%xmm4
+ .byte 102,15,56,51,211 // pmovzxwd %xmm3,%xmm2
+ .byte 102,15,105,220 // punpckhwd %xmm4,%xmm3
+ .byte 102,15,114,243,24 // pslld $0x18,%xmm3
+ .byte 102,15,114,242,24 // pslld $0x18,%xmm2
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,53 // je 4c3 <_sk_load_a8_dst_sse41_8bit+0x6c>
- .byte 102,15,239,201 // pxor %xmm1,%xmm1
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,21 // je 4ad <_sk_load_a8_dst_sse41_8bit+0x56>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,221 // jne 47b <_sk_load_a8_dst_sse41_8bit+0x24>
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,219 // pxor %xmm3,%xmm3
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,204 // ja 780 <_sk_load_a8_dst_sse41_8bit+0x24>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,117,0,0,0 // lea 0x75(%rip),%rcx # 834 <_sk_load_a8_dst_sse41_8bit+0xd8>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
+ .byte 102,15,110,216 // movd %eax,%xmm3
+ .byte 235,173 // jmp 780 <_sk_load_a8_dst_sse41_8bit+0x24>
.byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
- .byte 102,15,110,200 // movd %eax,%xmm1
- .byte 102,15,112,201,69 // pshufd $0x45,%xmm1,%xmm1
+ .byte 102,15,239,219 // pxor %xmm3,%xmm3
+ .byte 102,15,196,216,2 // pinsrw $0x2,%eax,%xmm3
.byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
.byte 102,15,110,208 // movd %eax,%xmm2
- .byte 102,15,56,49,210 // pmovzxbd %xmm2,%xmm2
- .byte 102,15,58,14,202,15 // pblendw $0xf,%xmm2,%xmm1
- .byte 235,184 // jmp 47b <_sk_load_a8_dst_sse41_8bit+0x24>
- .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
- .byte 102,15,110,200 // movd %eax,%xmm1
- .byte 235,173 // jmp 47b <_sk_load_a8_dst_sse41_8bit+0x24>
+ .byte 102,15,56,48,210 // pmovzxbw %xmm2,%xmm2
+ .byte 102,15,58,14,218,3 // pblendw $0x3,%xmm2,%xmm3
+ .byte 235,136 // jmp 780 <_sk_load_a8_dst_sse41_8bit+0x24>
+ .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax
+ .byte 102,15,239,219 // pxor %xmm3,%xmm3
+ .byte 102,15,196,216,6 // pinsrw $0x6,%eax,%xmm3
+ .byte 66,15,182,68,2,5 // movzbl 0x5(%rdx,%r8,1),%eax
+ .byte 102,15,196,216,5 // pinsrw $0x5,%eax,%xmm3
+ .byte 66,15,182,68,2,4 // movzbl 0x4(%rdx,%r8,1),%eax
+ .byte 102,15,196,216,4 // pinsrw $0x4,%eax,%xmm3
+ .byte 102,66,15,110,20,2 // movd (%rdx,%r8,1),%xmm2
+ .byte 102,15,56,48,210 // pmovzxbw %xmm2,%xmm2
+ .byte 102,15,58,14,218,15 // pblendw $0xf,%xmm2,%xmm3
+ .byte 233,77,255,255,255 // jmpq 780 <_sk_load_a8_dst_sse41_8bit+0x24>
+ .byte 144 // nop
+ .byte 148 // xchg %eax,%esp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,174,255,255,255,159 // ljmp *-0x60000001(%rsi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 233,255,255,255,222 // jmpq ffffffffdf000844 <_sk_xor__sse41_8bit+0xffffffffdeffec68>
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,211 // callq *%rbx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,196 // inc %esp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_store_a8_sse41_8bit
.globl _sk_store_a8_sse41_8bit
@@ -58404,27 +59834,61 @@ _sk_store_a8_sse41_8bit:
.byte 72,99,87,8 // movslq 0x8(%rdi),%rdx
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
- .byte 102,15,111,208 // movdqa %xmm0,%xmm2
- .byte 102,15,114,210,24 // psrld $0x18,%xmm2
+ .byte 102,15,111,45,80,21,0,0 // movdqa 0x1550(%rip),%xmm5 # 1dc0 <_sk_xor__sse41_8bit+0x1e4>
+ .byte 102,15,111,241 // movdqa %xmm1,%xmm6
+ .byte 102,15,56,0,245 // pshufb %xmm5,%xmm6
+ .byte 102,15,111,224 // movdqa %xmm0,%xmm4
+ .byte 102,15,56,0,229 // pshufb %xmm5,%xmm4
+ .byte 102,15,108,230 // punpcklqdq %xmm6,%xmm4
.byte 77,133,201 // test %r9,%r9
- .byte 117,19 // jne 507 <_sk_store_a8_sse41_8bit+0x39>
- .byte 102,15,56,0,21,163,10,0,0 // pshufb 0xaa3(%rip),%xmm2 # fa0 <_sk_xor__sse41_8bit+0x136>
- .byte 102,66,15,126,20,2 // movd %xmm2,(%rdx,%r8,1)
- .byte 72,173 // lods %ds:(%rsi),%rax
- .byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,40 // je 539 <_sk_store_a8_sse41_8bit+0x6b>
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,15 // je 526 <_sk_store_a8_sse41_8bit+0x58>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,230 // jne 503 <_sk_store_a8_sse41_8bit+0x35>
- .byte 102,66,15,58,20,84,2,2,8 // pextrb $0x8,%xmm2,0x2(%rdx,%r8,1)
- .byte 102,15,56,0,21,97,10,0,0 // pshufb 0xa61(%rip),%xmm2 # f90 <_sk_xor__sse41_8bit+0x126>
- .byte 102,66,15,58,21,20,2,0 // pextrw $0x0,%xmm2,(%rdx,%r8,1)
- .byte 235,202 // jmp 503 <_sk_store_a8_sse41_8bit+0x35>
- .byte 102,66,15,58,20,20,2,0 // pextrb $0x0,%xmm2,(%rdx,%r8,1)
- .byte 235,192 // jmp 503 <_sk_store_a8_sse41_8bit+0x35>
+ .byte 117,19 // jne 89e <_sk_store_a8_sse41_8bit+0x4e>
+ .byte 102,15,56,0,37,92,21,0,0 // pshufb 0x155c(%rip),%xmm4 # 1df0 <_sk_xor__sse41_8bit+0x214>
+ .byte 102,66,15,214,36,2 // movq %xmm4,(%rdx,%r8,1)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,239 // ja 89a <_sk_store_a8_sse41_8bit+0x4a>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,94,0,0,0 // lea 0x5e(%rip),%rcx # 914 <_sk_store_a8_sse41_8bit+0xc4>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 102,66,15,58,20,36,2,0 // pextrb $0x0,%xmm4,(%rdx,%r8,1)
+ .byte 235,209 // jmp 89a <_sk_store_a8_sse41_8bit+0x4a>
+ .byte 102,66,15,58,20,100,2,2,4 // pextrb $0x4,%xmm4,0x2(%rdx,%r8,1)
+ .byte 102,15,56,0,37,5,21,0,0 // pshufb 0x1505(%rip),%xmm4 # 1de0 <_sk_xor__sse41_8bit+0x204>
+ .byte 102,66,15,58,21,36,2,0 // pextrw $0x0,%xmm4,(%rdx,%r8,1)
+ .byte 235,181 // jmp 89a <_sk_store_a8_sse41_8bit+0x4a>
+ .byte 102,66,15,58,20,100,2,6,12 // pextrb $0xc,%xmm4,0x6(%rdx,%r8,1)
+ .byte 102,66,15,58,20,100,2,5,10 // pextrb $0xa,%xmm4,0x5(%rdx,%r8,1)
+ .byte 102,66,15,58,20,100,2,4,8 // pextrb $0x8,%xmm4,0x4(%rdx,%r8,1)
+ .byte 102,15,56,0,37,199,20,0,0 // pshufb 0x14c7(%rip),%xmm4 # 1dd0 <_sk_xor__sse41_8bit+0x1f4>
+ .byte 102,66,15,126,36,2 // movd %xmm4,(%rdx,%r8,1)
+ .byte 235,137 // jmp 89a <_sk_store_a8_sse41_8bit+0x4a>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 171 // stos %eax,%es:(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 190,255,255,255,181 // mov $0xb5ffffff,%esi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 236 // in (%dx),%al
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,227 // jmpq *%rbx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 218,255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,209 // callq *%rcx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_load_g8_sse41_8bit
.globl _sk_load_g8_sse41_8bit
@@ -58438,32 +59902,70 @@ _sk_load_g8_sse41_8bit:
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,36 // jne 584 <_sk_load_g8_sse41_8bit+0x41>
- .byte 102,66,15,56,49,4,2 // pmovzxbd (%rdx,%r8,1),%xmm0
- .byte 102,15,219,5,65,10,0,0 // pand 0xa41(%rip),%xmm0 # fb0 <_sk_xor__sse41_8bit+0x146>
- .byte 102,15,56,64,5,72,10,0,0 // pmulld 0xa48(%rip),%xmm0 # fc0 <_sk_xor__sse41_8bit+0x156>
- .byte 102,15,235,5,80,10,0,0 // por 0xa50(%rip),%xmm0 # fd0 <_sk_xor__sse41_8bit+0x166>
+ .byte 117,66 // jne 98f <_sk_load_g8_sse41_8bit+0x5f>
+ .byte 102,66,15,56,48,12,2 // pmovzxbw (%rdx,%r8,1),%xmm1
+ .byte 102,15,219,13,164,20,0,0 // pand 0x14a4(%rip),%xmm1 # 1e00 <_sk_xor__sse41_8bit+0x224>
+ .byte 102,15,239,228 // pxor %xmm4,%xmm4
+ .byte 102,15,56,51,193 // pmovzxwd %xmm1,%xmm0
+ .byte 102,15,105,204 // punpckhwd %xmm4,%xmm1
+ .byte 102,15,111,37,159,20,0,0 // movdqa 0x149f(%rip),%xmm4 # 1e10 <_sk_xor__sse41_8bit+0x234>
+ .byte 102,15,56,64,204 // pmulld %xmm4,%xmm1
+ .byte 102,15,56,64,196 // pmulld %xmm4,%xmm0
+ .byte 102,15,111,37,157,20,0,0 // movdqa 0x149d(%rip),%xmm4 # 1e20 <_sk_xor__sse41_8bit+0x244>
+ .byte 102,15,235,196 // por %xmm4,%xmm0
+ .byte 102,15,235,204 // por %xmm4,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,53 // je 5c3 <_sk_load_g8_sse41_8bit+0x80>
- .byte 102,15,239,192 // pxor %xmm0,%xmm0
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,21 // je 5ad <_sk_load_g8_sse41_8bit+0x6a>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,201 // jne 567 <_sk_load_g8_sse41_8bit+0x24>
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,201 // pxor %xmm1,%xmm1
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,180 // ja 954 <_sk_load_g8_sse41_8bit+0x24>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,121,0,0,0 // lea 0x79(%rip),%rcx # a24 <_sk_load_g8_sse41_8bit+0xf4>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
+ .byte 102,15,110,200 // movd %eax,%xmm1
+ .byte 235,149 // jmp 954 <_sk_load_g8_sse41_8bit+0x24>
.byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
- .byte 102,15,110,192 // movd %eax,%xmm0
- .byte 102,15,112,192,69 // pshufd $0x45,%xmm0,%xmm0
+ .byte 102,15,239,201 // pxor %xmm1,%xmm1
+ .byte 102,15,196,200,2 // pinsrw $0x2,%eax,%xmm1
.byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
- .byte 102,15,110,208 // movd %eax,%xmm2
- .byte 102,15,56,49,210 // pmovzxbd %xmm2,%xmm2
- .byte 102,15,58,14,194,15 // pblendw $0xf,%xmm2,%xmm0
- .byte 235,164 // jmp 567 <_sk_load_g8_sse41_8bit+0x24>
- .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
.byte 102,15,110,192 // movd %eax,%xmm0
- .byte 235,153 // jmp 567 <_sk_load_g8_sse41_8bit+0x24>
+ .byte 102,15,56,48,192 // pmovzxbw %xmm0,%xmm0
+ .byte 102,15,58,14,200,3 // pblendw $0x3,%xmm0,%xmm1
+ .byte 233,109,255,255,255 // jmpq 954 <_sk_load_g8_sse41_8bit+0x24>
+ .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax
+ .byte 102,15,239,201 // pxor %xmm1,%xmm1
+ .byte 102,15,196,200,6 // pinsrw $0x6,%eax,%xmm1
+ .byte 66,15,182,68,2,5 // movzbl 0x5(%rdx,%r8,1),%eax
+ .byte 102,15,196,200,5 // pinsrw $0x5,%eax,%xmm1
+ .byte 66,15,182,68,2,4 // movzbl 0x4(%rdx,%r8,1),%eax
+ .byte 102,15,196,200,4 // pinsrw $0x4,%eax,%xmm1
+ .byte 102,66,15,110,4,2 // movd (%rdx,%r8,1),%xmm0
+ .byte 102,15,56,48,192 // pmovzxbw %xmm0,%xmm0
+ .byte 102,15,58,14,200,15 // pblendw $0xf,%xmm0,%xmm1
+ .byte 233,50,255,255,255 // jmpq 954 <_sk_load_g8_sse41_8bit+0x24>
+ .byte 102,144 // xchg %ax,%ax
+ .byte 144 // nop
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,170,255,255,255,155 // ljmp *-0x64000001(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 232,255,255,255,221 // callq ffffffffde000a34 <_sk_xor__sse41_8bit+0xffffffffddffee58>
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,210 // callq *%rdx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,195 // inc %ebx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_load_g8_dst_sse41_8bit
.globl _sk_load_g8_dst_sse41_8bit
@@ -58477,122 +59979,237 @@ _sk_load_g8_dst_sse41_8bit:
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,36 // jne 60f <_sk_load_g8_dst_sse41_8bit+0x41>
- .byte 102,66,15,56,49,12,2 // pmovzxbd (%rdx,%r8,1),%xmm1
- .byte 102,15,219,13,230,9,0,0 // pand 0x9e6(%rip),%xmm1 # fe0 <_sk_xor__sse41_8bit+0x176>
- .byte 102,15,56,64,13,237,9,0,0 // pmulld 0x9ed(%rip),%xmm1 # ff0 <_sk_xor__sse41_8bit+0x186>
- .byte 102,15,235,13,245,9,0,0 // por 0x9f5(%rip),%xmm1 # 1000 <_sk_xor__sse41_8bit+0x196>
+ .byte 117,66 // jne a9f <_sk_load_g8_dst_sse41_8bit+0x5f>
+ .byte 102,66,15,56,48,28,2 // pmovzxbw (%rdx,%r8,1),%xmm3
+ .byte 102,15,219,29,196,19,0,0 // pand 0x13c4(%rip),%xmm3 # 1e30 <_sk_xor__sse41_8bit+0x254>
+ .byte 102,15,239,228 // pxor %xmm4,%xmm4
+ .byte 102,15,56,51,211 // pmovzxwd %xmm3,%xmm2
+ .byte 102,15,105,220 // punpckhwd %xmm4,%xmm3
+ .byte 102,15,111,37,191,19,0,0 // movdqa 0x13bf(%rip),%xmm4 # 1e40 <_sk_xor__sse41_8bit+0x264>
+ .byte 102,15,56,64,220 // pmulld %xmm4,%xmm3
+ .byte 102,15,56,64,212 // pmulld %xmm4,%xmm2
+ .byte 102,15,111,37,189,19,0,0 // movdqa 0x13bd(%rip),%xmm4 # 1e50 <_sk_xor__sse41_8bit+0x274>
+ .byte 102,15,235,212 // por %xmm4,%xmm2
+ .byte 102,15,235,220 // por %xmm4,%xmm3
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,53 // je 64e <_sk_load_g8_dst_sse41_8bit+0x80>
- .byte 102,15,239,201 // pxor %xmm1,%xmm1
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,21 // je 638 <_sk_load_g8_dst_sse41_8bit+0x6a>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,201 // jne 5f2 <_sk_load_g8_dst_sse41_8bit+0x24>
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,219 // pxor %xmm3,%xmm3
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,180 // ja a64 <_sk_load_g8_dst_sse41_8bit+0x24>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,121,0,0,0 // lea 0x79(%rip),%rcx # b34 <_sk_load_g8_dst_sse41_8bit+0xf4>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
+ .byte 102,15,110,216 // movd %eax,%xmm3
+ .byte 235,149 // jmp a64 <_sk_load_g8_dst_sse41_8bit+0x24>
.byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
- .byte 102,15,110,200 // movd %eax,%xmm1
- .byte 102,15,112,201,69 // pshufd $0x45,%xmm1,%xmm1
+ .byte 102,15,239,219 // pxor %xmm3,%xmm3
+ .byte 102,15,196,216,2 // pinsrw $0x2,%eax,%xmm3
.byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
.byte 102,15,110,208 // movd %eax,%xmm2
- .byte 102,15,56,49,210 // pmovzxbd %xmm2,%xmm2
- .byte 102,15,58,14,202,15 // pblendw $0xf,%xmm2,%xmm1
- .byte 235,164 // jmp 5f2 <_sk_load_g8_dst_sse41_8bit+0x24>
- .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
- .byte 102,15,110,200 // movd %eax,%xmm1
- .byte 235,153 // jmp 5f2 <_sk_load_g8_dst_sse41_8bit+0x24>
+ .byte 102,15,56,48,210 // pmovzxbw %xmm2,%xmm2
+ .byte 102,15,58,14,218,3 // pblendw $0x3,%xmm2,%xmm3
+ .byte 233,109,255,255,255 // jmpq a64 <_sk_load_g8_dst_sse41_8bit+0x24>
+ .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax
+ .byte 102,15,239,219 // pxor %xmm3,%xmm3
+ .byte 102,15,196,216,6 // pinsrw $0x6,%eax,%xmm3
+ .byte 66,15,182,68,2,5 // movzbl 0x5(%rdx,%r8,1),%eax
+ .byte 102,15,196,216,5 // pinsrw $0x5,%eax,%xmm3
+ .byte 66,15,182,68,2,4 // movzbl 0x4(%rdx,%r8,1),%eax
+ .byte 102,15,196,216,4 // pinsrw $0x4,%eax,%xmm3
+ .byte 102,66,15,110,20,2 // movd (%rdx,%r8,1),%xmm2
+ .byte 102,15,56,48,210 // pmovzxbw %xmm2,%xmm2
+ .byte 102,15,58,14,218,15 // pblendw $0xf,%xmm2,%xmm3
+ .byte 233,50,255,255,255 // jmpq a64 <_sk_load_g8_dst_sse41_8bit+0x24>
+ .byte 102,144 // xchg %ax,%ax
+ .byte 144 // nop
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,170,255,255,255,155 // ljmp *-0x64000001(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 232,255,255,255,221 // callq ffffffffde000b44 <_sk_xor__sse41_8bit+0xffffffffddffef68>
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,210 // callq *%rdx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,195 // inc %ebx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_srcover_rgba_8888_sse41_8bit
.globl _sk_srcover_rgba_8888_sse41_8bit
FUNCTION(_sk_srcover_rgba_8888_sse41_8bit)
_sk_srcover_rgba_8888_sse41_8bit:
- .byte 76,99,7 // movslq (%rdi),%r8
- .byte 76,139,79,16 // mov 0x10(%rdi),%r9
+ .byte 76,99,15 // movslq (%rdi),%r9
+ .byte 76,139,71,16 // mov 0x10(%rdi),%r8
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 72,99,72,8 // movslq 0x8(%rax),%rcx
.byte 72,99,87,8 // movslq 0x8(%rdi),%rdx
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,193,226,2 // shl $0x2,%rdx
.byte 72,3,16 // add (%rax),%rdx
- .byte 77,133,201 // test %r9,%r9
- .byte 117,98 // jne 6dc <_sk_srcover_rgba_8888_sse41_8bit+0x83>
- .byte 243,66,15,111,20,130 // movdqu (%rdx,%r8,4),%xmm2
- .byte 77,133,201 // test %r9,%r9
- .byte 102,15,111,216 // movdqa %xmm0,%xmm3
- .byte 102,15,56,0,29,128,9,0,0 // pshufb 0x980(%rip),%xmm3 # 1010 <_sk_xor__sse41_8bit+0x1a6>
- .byte 102,15,239,228 // pxor %xmm4,%xmm4
- .byte 102,15,111,234 // movdqa %xmm2,%xmm5
- .byte 102,15,104,236 // punpckhbw %xmm4,%xmm5
- .byte 102,15,56,48,242 // pmovzxbw %xmm2,%xmm6
- .byte 102,15,56,48,251 // pmovzxbw %xmm3,%xmm7
- .byte 102,15,104,220 // punpckhbw %xmm4,%xmm3
- .byte 102,15,213,221 // pmullw %xmm5,%xmm3
- .byte 102,15,213,254 // pmullw %xmm6,%xmm7
- .byte 102,15,253,221 // paddw %xmm5,%xmm3
- .byte 102,15,253,254 // paddw %xmm6,%xmm7
- .byte 102,15,113,211,8 // psrlw $0x8,%xmm3
+ .byte 77,133,192 // test %r8,%r8
+ .byte 15,133,206,0,0,0 // jne c43 <_sk_srcover_rgba_8888_sse41_8bit+0xf3>
+ .byte 243,70,15,111,68,138,16 // movdqu 0x10(%rdx,%r9,4),%xmm8
+ .byte 243,70,15,111,12,138 // movdqu (%rdx,%r9,4),%xmm9
+ .byte 77,133,192 // test %r8,%r8
+ .byte 102,15,111,37,211,18,0,0 // movdqa 0x12d3(%rip),%xmm4 # 1e60 <_sk_xor__sse41_8bit+0x284>
+ .byte 102,15,111,241 // movdqa %xmm1,%xmm6
+ .byte 102,15,56,0,244 // pshufb %xmm4,%xmm6
+ .byte 102,15,111,248 // movdqa %xmm0,%xmm7
+ .byte 102,15,56,0,252 // pshufb %xmm4,%xmm7
+ .byte 102,69,15,239,210 // pxor %xmm10,%xmm10
+ .byte 102,69,15,111,217 // movdqa %xmm9,%xmm11
+ .byte 102,69,15,104,218 // punpckhbw %xmm10,%xmm11
+ .byte 102,69,15,111,224 // movdqa %xmm8,%xmm12
+ .byte 102,69,15,104,226 // punpckhbw %xmm10,%xmm12
+ .byte 102,69,15,56,48,233 // pmovzxbw %xmm9,%xmm13
+ .byte 102,69,15,56,48,240 // pmovzxbw %xmm8,%xmm14
+ .byte 102,15,56,48,231 // pmovzxbw %xmm7,%xmm4
+ .byte 102,15,56,48,238 // pmovzxbw %xmm6,%xmm5
+ .byte 102,65,15,104,250 // punpckhbw %xmm10,%xmm7
+ .byte 102,65,15,104,242 // punpckhbw %xmm10,%xmm6
+ .byte 102,65,15,213,244 // pmullw %xmm12,%xmm6
+ .byte 102,65,15,213,251 // pmullw %xmm11,%xmm7
+ .byte 102,65,15,213,238 // pmullw %xmm14,%xmm5
+ .byte 102,65,15,213,229 // pmullw %xmm13,%xmm4
+ .byte 102,65,15,253,251 // paddw %xmm11,%xmm7
+ .byte 102,65,15,253,244 // paddw %xmm12,%xmm6
+ .byte 102,65,15,253,229 // paddw %xmm13,%xmm4
+ .byte 102,65,15,253,238 // paddw %xmm14,%xmm5
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
.byte 102,15,113,215,8 // psrlw $0x8,%xmm7
- .byte 102,15,103,251 // packuswb %xmm3,%xmm7
- .byte 102,15,248,215 // psubb %xmm7,%xmm2
- .byte 102,15,252,208 // paddb %xmm0,%xmm2
- .byte 117,60 // jne 70e <_sk_srcover_rgba_8888_sse41_8bit+0xb5>
- .byte 243,66,15,127,20,130 // movdqu %xmm2,(%rdx,%r8,4)
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,15,103,231 // packuswb %xmm7,%xmm4
+ .byte 102,15,103,238 // packuswb %xmm6,%xmm5
+ .byte 102,68,15,248,197 // psubb %xmm5,%xmm8
+ .byte 102,68,15,248,204 // psubb %xmm4,%xmm9
+ .byte 102,68,15,252,200 // paddb %xmm0,%xmm9
+ .byte 102,68,15,252,193 // paddb %xmm1,%xmm8
+ .byte 117,72 // jne c7a <_sk_srcover_rgba_8888_sse41_8bit+0x12a>
+ .byte 243,70,15,127,12,138 // movdqu %xmm9,(%rdx,%r9,4)
+ .byte 243,70,15,127,68,138,16 // movdqu %xmm8,0x10(%rdx,%r9,4)
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 68,137,200 // mov %r9d,%eax
- .byte 36,3 // and $0x3,%al
- .byte 60,1 // cmp $0x1,%al
- .byte 116,80 // je 735 <_sk_srcover_rgba_8888_sse41_8bit+0xdc>
- .byte 102,15,239,210 // pxor %xmm2,%xmm2
- .byte 60,2 // cmp $0x2,%al
- .byte 116,16 // je 6fd <_sk_srcover_rgba_8888_sse41_8bit+0xa4>
- .byte 60,3 // cmp $0x3,%al
- .byte 117,143 // jne 680 <_sk_srcover_rgba_8888_sse41_8bit+0x27>
- .byte 102,66,15,110,84,130,8 // movd 0x8(%rdx,%r8,4),%xmm2
- .byte 102,15,112,210,69 // pshufd $0x45,%xmm2,%xmm2
- .byte 243,66,15,126,28,130 // movq (%rdx,%r8,4),%xmm3
- .byte 102,15,58,14,211,15 // pblendw $0xf,%xmm3,%xmm2
- .byte 233,114,255,255,255 // jmpq 680 <_sk_srcover_rgba_8888_sse41_8bit+0x27>
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,40 // je 740 <_sk_srcover_rgba_8888_sse41_8bit+0xe7>
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,15 // je 72d <_sk_srcover_rgba_8888_sse41_8bit+0xd4>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,180 // jne 6d8 <_sk_srcover_rgba_8888_sse41_8bit+0x7f>
- .byte 102,66,15,58,22,84,130,8,2 // pextrd $0x2,%xmm2,0x8(%rdx,%r8,4)
- .byte 102,66,15,214,20,130 // movq %xmm2,(%rdx,%r8,4)
- .byte 235,163 // jmp 6d8 <_sk_srcover_rgba_8888_sse41_8bit+0x7f>
- .byte 102,66,15,110,20,130 // movd (%rdx,%r8,4),%xmm2
- .byte 233,64,255,255,255 // jmpq 680 <_sk_srcover_rgba_8888_sse41_8bit+0x27>
- .byte 102,66,15,126,20,130 // movd %xmm2,(%rdx,%r8,4)
- .byte 235,144 // jmp 6d8 <_sk_srcover_rgba_8888_sse41_8bit+0x7f>
+ .byte 68,137,192 // mov %r8d,%eax
+ .byte 36,7 // and $0x7,%al
+ .byte 102,69,15,239,192 // pxor %xmm8,%xmm8
+ .byte 102,69,15,239,201 // pxor %xmm9,%xmm9
+ .byte 254,200 // dec %al
+ .byte 60,6 // cmp $0x6,%al
+ .byte 15,135,38,255,255,255 // ja b82 <_sk_srcover_rgba_8888_sse41_8bit+0x32>
+ .byte 15,182,192 // movzbl %al,%eax
+ .byte 72,141,13,186,0,0,0 // lea 0xba(%rip),%rcx # d20 <_sk_srcover_rgba_8888_sse41_8bit+0x1d0>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 102,70,15,110,12,138 // movd (%rdx,%r9,4),%xmm9
+ .byte 233,8,255,255,255 // jmpq b82 <_sk_srcover_rgba_8888_sse41_8bit+0x32>
+ .byte 65,128,224,7 // and $0x7,%r8b
+ .byte 65,254,200 // dec %r8b
+ .byte 65,128,248,6 // cmp $0x6,%r8b
+ .byte 119,184 // ja c3f <_sk_srcover_rgba_8888_sse41_8bit+0xef>
+ .byte 65,15,182,192 // movzbl %r8b,%eax
+ .byte 72,141,13,170,0,0,0 // lea 0xaa(%rip),%rcx # d3c <_sk_srcover_rgba_8888_sse41_8bit+0x1ec>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 102,70,15,126,12,138 // movd %xmm9,(%rdx,%r9,4)
+ .byte 235,156 // jmp c3f <_sk_srcover_rgba_8888_sse41_8bit+0xef>
+ .byte 102,66,15,110,100,138,8 // movd 0x8(%rdx,%r9,4),%xmm4
+ .byte 102,68,15,112,204,69 // pshufd $0x45,%xmm4,%xmm9
+ .byte 243,66,15,126,36,138 // movq (%rdx,%r9,4),%xmm4
+ .byte 102,68,15,58,14,204,15 // pblendw $0xf,%xmm4,%xmm9
+ .byte 233,192,254,255,255 // jmpq b82 <_sk_srcover_rgba_8888_sse41_8bit+0x32>
+ .byte 102,66,15,110,100,138,24 // movd 0x18(%rdx,%r9,4),%xmm4
+ .byte 102,68,15,112,196,69 // pshufd $0x45,%xmm4,%xmm8
+ .byte 102,70,15,58,34,68,138,20,1 // pinsrd $0x1,0x14(%rdx,%r9,4),%xmm8
+ .byte 102,70,15,58,34,68,138,16,0 // pinsrd $0x0,0x10(%rdx,%r9,4),%xmm8
+ .byte 233,150,254,255,255 // jmpq b7c <_sk_srcover_rgba_8888_sse41_8bit+0x2c>
+ .byte 102,70,15,58,22,76,138,8,2 // pextrd $0x2,%xmm9,0x8(%rdx,%r9,4)
+ .byte 102,70,15,214,12,138 // movq %xmm9,(%rdx,%r9,4)
+ .byte 233,69,255,255,255 // jmpq c3f <_sk_srcover_rgba_8888_sse41_8bit+0xef>
+ .byte 102,70,15,58,22,68,138,24,2 // pextrd $0x2,%xmm8,0x18(%rdx,%r9,4)
+ .byte 102,70,15,58,22,68,138,20,1 // pextrd $0x1,%xmm8,0x14(%rdx,%r9,4)
+ .byte 102,70,15,126,68,138,16 // movd %xmm8,0x10(%rdx,%r9,4)
+ .byte 243,70,15,127,12,138 // movdqu %xmm9,(%rdx,%r9,4)
+ .byte 233,33,255,255,255 // jmpq c3f <_sk_srcover_rgba_8888_sse41_8bit+0xef>
+ .byte 102,144 // xchg %ax,%ax
+ .byte 79,255 // rex.WRXB (bad)
+ .byte 255 // (bad)
+ .byte 255,144,255,255,255,131 // callq *-0x7c000001(%rax)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,92,254,255 // lcall *-0x1(%rsi,%rdi,8)
+ .byte 255 // (bad)
+ .byte 184,255,255,255,175 // mov $0xafffffff,%eax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,162,255,255,255,95 // jmpq *0x5fffffff(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,179,255,255,255,170 // pushq -0x55000001(%rbx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,215 // callq *%rdi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,208 // callq *%rax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,199 // inc %edi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 190 // .byte 0xbe
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_scale_1_float_sse41_8bit
.globl _sk_scale_1_float_sse41_8bit
FUNCTION(_sk_scale_1_float_sse41_8bit)
_sk_scale_1_float_sse41_8bit:
+ .byte 102,15,111,225 // movdqa %xmm1,%xmm4
+ .byte 102,15,111,232 // movdqa %xmm0,%xmm5
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 243,15,16,16 // movss (%rax),%xmm2
- .byte 243,15,89,21,186,7,0,0 // mulss 0x7ba(%rip),%xmm2 # f10 <_sk_xor__sse41_8bit+0xa6>
- .byte 243,15,44,194 // cvttss2si %xmm2,%eax
- .byte 102,15,110,216 // movd %eax,%xmm3
- .byte 15,87,210 // xorps %xmm2,%xmm2
- .byte 102,15,56,48,224 // pmovzxbw %xmm0,%xmm4
- .byte 102,15,104,194 // punpckhbw %xmm2,%xmm0
- .byte 102,15,56,0,29,173,8,0,0 // pshufb 0x8ad(%rip),%xmm3 # 1020 <_sk_xor__sse41_8bit+0x1b6>
- .byte 102,15,111,211 // movdqa %xmm3,%xmm2
- .byte 102,15,213,212 // pmullw %xmm4,%xmm2
- .byte 102,15,213,216 // pmullw %xmm0,%xmm3
- .byte 102,15,253,216 // paddw %xmm0,%xmm3
- .byte 102,15,253,212 // paddw %xmm4,%xmm2
- .byte 102,15,113,211,8 // psrlw $0x8,%xmm3
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,211 // packuswb %xmm3,%xmm2
+ .byte 243,15,16,0 // movss (%rax),%xmm0
+ .byte 243,15,89,5,186,15,0,0 // mulss 0xfba(%rip),%xmm0 # 1d28 <_sk_xor__sse41_8bit+0x14c>
+ .byte 243,15,44,192 // cvttss2si %xmm0,%eax
+ .byte 15,87,192 // xorps %xmm0,%xmm0
+ .byte 102,68,15,56,48,197 // pmovzxbw %xmm5,%xmm8
+ .byte 102,15,104,232 // punpckhbw %xmm0,%xmm5
+ .byte 102,68,15,56,48,204 // pmovzxbw %xmm4,%xmm9
+ .byte 102,15,104,224 // punpckhbw %xmm0,%xmm4
+ .byte 102,15,110,240 // movd %eax,%xmm6
+ .byte 102,15,56,0,53,218,16,0,0 // pshufb 0x10da(%rip),%xmm6 # 1e70 <_sk_xor__sse41_8bit+0x294>
+ .byte 102,15,111,206 // movdqa %xmm6,%xmm1
+ .byte 102,65,15,213,201 // pmullw %xmm9,%xmm1
+ .byte 102,15,111,198 // movdqa %xmm6,%xmm0
+ .byte 102,65,15,213,192 // pmullw %xmm8,%xmm0
+ .byte 102,15,111,254 // movdqa %xmm6,%xmm7
+ .byte 102,15,213,252 // pmullw %xmm4,%xmm7
+ .byte 102,15,213,245 // pmullw %xmm5,%xmm6
+ .byte 102,15,253,245 // paddw %xmm5,%xmm6
+ .byte 102,15,253,252 // paddw %xmm4,%xmm7
+ .byte 102,65,15,253,192 // paddw %xmm8,%xmm0
+ .byte 102,65,15,253,201 // paddw %xmm9,%xmm1
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
+ .byte 102,15,113,208,8 // psrlw $0x8,%xmm0
+ .byte 102,15,103,198 // packuswb %xmm6,%xmm0
+ .byte 102,15,103,207 // packuswb %xmm7,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 102,15,111,194 // movdqa %xmm2,%xmm0
.byte 255,224 // jmpq *%rax
HIDDEN _sk_scale_u8_sse41_8bit
@@ -58607,82 +60224,154 @@ _sk_scale_u8_sse41_8bit:
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,76 // jne 806 <_sk_scale_u8_sse41_8bit+0x69>
- .byte 102,66,15,56,49,28,2 // pmovzxbd (%rdx,%r8,1),%xmm3
- .byte 102,15,239,228 // pxor %xmm4,%xmm4
- .byte 102,15,56,0,29,98,8,0,0 // pshufb 0x862(%rip),%xmm3 # 1030 <_sk_xor__sse41_8bit+0x1c6>
- .byte 102,15,56,48,232 // pmovzxbw %xmm0,%xmm5
- .byte 102,15,104,196 // punpckhbw %xmm4,%xmm0
- .byte 102,15,56,48,211 // pmovzxbw %xmm3,%xmm2
- .byte 102,15,104,220 // punpckhbw %xmm4,%xmm3
- .byte 102,15,213,216 // pmullw %xmm0,%xmm3
- .byte 102,15,213,213 // pmullw %xmm5,%xmm2
- .byte 102,15,253,216 // paddw %xmm0,%xmm3
- .byte 102,15,253,213 // paddw %xmm5,%xmm2
- .byte 102,15,113,211,8 // psrlw $0x8,%xmm3
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,211 // packuswb %xmm3,%xmm2
+ .byte 15,133,160,0,0,0 // jne ea7 <_sk_scale_u8_sse41_8bit+0xc1>
+ .byte 102,66,15,56,48,52,2 // pmovzxbw (%rdx,%r8,1),%xmm6
+ .byte 102,15,219,53,106,16,0,0 // pand 0x106a(%rip),%xmm6 # 1e80 <_sk_xor__sse41_8bit+0x2a4>
+ .byte 102,69,15,239,192 // pxor %xmm8,%xmm8
+ .byte 102,15,111,254 // movdqa %xmm6,%xmm7
+ .byte 102,15,56,0,61,104,16,0,0 // pshufb 0x1068(%rip),%xmm7 # 1e90 <_sk_xor__sse41_8bit+0x2b4>
+ .byte 102,15,56,0,53,111,16,0,0 // pshufb 0x106f(%rip),%xmm6 # 1ea0 <_sk_xor__sse41_8bit+0x2c4>
+ .byte 102,68,15,56,48,200 // pmovzxbw %xmm0,%xmm9
+ .byte 102,65,15,104,192 // punpckhbw %xmm8,%xmm0
+ .byte 102,68,15,56,48,209 // pmovzxbw %xmm1,%xmm10
+ .byte 102,65,15,104,200 // punpckhbw %xmm8,%xmm1
+ .byte 102,15,56,48,230 // pmovzxbw %xmm6,%xmm4
+ .byte 102,65,15,104,240 // punpckhbw %xmm8,%xmm6
+ .byte 102,15,56,48,239 // pmovzxbw %xmm7,%xmm5
+ .byte 102,65,15,104,248 // punpckhbw %xmm8,%xmm7
+ .byte 102,15,213,249 // pmullw %xmm1,%xmm7
+ .byte 102,15,213,240 // pmullw %xmm0,%xmm6
+ .byte 102,65,15,213,234 // pmullw %xmm10,%xmm5
+ .byte 102,65,15,213,225 // pmullw %xmm9,%xmm4
+ .byte 102,15,253,240 // paddw %xmm0,%xmm6
+ .byte 102,15,253,249 // paddw %xmm1,%xmm7
+ .byte 102,65,15,253,225 // paddw %xmm9,%xmm4
+ .byte 102,65,15,253,234 // paddw %xmm10,%xmm5
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,15,103,230 // packuswb %xmm6,%xmm4
+ .byte 102,15,103,239 // packuswb %xmm7,%xmm5
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 102,15,111,194 // movdqa %xmm2,%xmm0
+ .byte 102,15,111,196 // movdqa %xmm4,%xmm0
+ .byte 102,15,111,205 // movdqa %xmm5,%xmm1
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,56 // je 848 <_sk_scale_u8_sse41_8bit+0xab>
- .byte 102,15,239,219 // pxor %xmm3,%xmm3
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,21 // je 82f <_sk_scale_u8_sse41_8bit+0x92>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,161 // jne 7c1 <_sk_scale_u8_sse41_8bit+0x24>
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,246 // pxor %xmm6,%xmm6
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 15,135,82,255,255,255 // ja e0e <_sk_scale_u8_sse41_8bit+0x28>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,125,0,0,0 // lea 0x7d(%rip),%rcx # f44 <_sk_scale_u8_sse41_8bit+0x15e>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
+ .byte 102,15,110,240 // movd %eax,%xmm6
+ .byte 233,48,255,255,255 // jmpq e0e <_sk_scale_u8_sse41_8bit+0x28>
.byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
- .byte 102,15,110,208 // movd %eax,%xmm2
- .byte 102,15,112,218,69 // pshufd $0x45,%xmm2,%xmm3
+ .byte 102,15,239,246 // pxor %xmm6,%xmm6
+ .byte 102,15,196,240,2 // pinsrw $0x2,%eax,%xmm6
.byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
- .byte 102,15,110,208 // movd %eax,%xmm2
- .byte 102,15,56,49,210 // pmovzxbd %xmm2,%xmm2
- .byte 102,15,58,14,218,15 // pblendw $0xf,%xmm2,%xmm3
- .byte 233,121,255,255,255 // jmpq 7c1 <_sk_scale_u8_sse41_8bit+0x24>
- .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
- .byte 102,15,110,216 // movd %eax,%xmm3
- .byte 233,107,255,255,255 // jmpq 7c1 <_sk_scale_u8_sse41_8bit+0x24>
+ .byte 102,15,110,224 // movd %eax,%xmm4
+ .byte 102,15,56,48,228 // pmovzxbw %xmm4,%xmm4
+ .byte 102,15,58,14,244,3 // pblendw $0x3,%xmm4,%xmm6
+ .byte 233,8,255,255,255 // jmpq e0e <_sk_scale_u8_sse41_8bit+0x28>
+ .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax
+ .byte 102,15,239,246 // pxor %xmm6,%xmm6
+ .byte 102,15,196,240,6 // pinsrw $0x6,%eax,%xmm6
+ .byte 66,15,182,68,2,5 // movzbl 0x5(%rdx,%r8,1),%eax
+ .byte 102,15,196,240,5 // pinsrw $0x5,%eax,%xmm6
+ .byte 66,15,182,68,2,4 // movzbl 0x4(%rdx,%r8,1),%eax
+ .byte 102,15,196,240,4 // pinsrw $0x4,%eax,%xmm6
+ .byte 102,66,15,110,36,2 // movd (%rdx,%r8,1),%xmm4
+ .byte 102,15,56,48,228 // pmovzxbw %xmm4,%xmm4
+ .byte 102,15,58,14,244,15 // pblendw $0xf,%xmm4,%xmm6
+ .byte 233,205,254,255,255 // jmpq e0e <_sk_scale_u8_sse41_8bit+0x28>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 140,255 // mov %?,%edi
+ .byte 255 // (bad)
+ .byte 255,169,255,255,255,154 // ljmp *-0x65000001(%rcx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,231 // jmpq *%rdi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 220,255 // fdivr %st,%st(7)
+ .byte 255 // (bad)
+ .byte 255,209 // callq *%rcx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,194 // inc %edx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_lerp_1_float_sse41_8bit
.globl _sk_lerp_1_float_sse41_8bit
FUNCTION(_sk_lerp_1_float_sse41_8bit)
_sk_lerp_1_float_sse41_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 243,15,16,16 // movss (%rax),%xmm2
- .byte 243,15,89,21,176,6,0,0 // mulss 0x6b0(%rip),%xmm2 # f14 <_sk_xor__sse41_8bit+0xaa>
- .byte 243,15,44,194 // cvttss2si %xmm2,%eax
- .byte 102,15,110,216 // movd %eax,%xmm3
- .byte 102,15,239,228 // pxor %xmm4,%xmm4
- .byte 102,15,56,0,220 // pshufb %xmm4,%xmm3
- .byte 102,15,56,48,232 // pmovzxbw %xmm0,%xmm5
- .byte 102,15,104,196 // punpckhbw %xmm4,%xmm0
- .byte 102,15,111,21,186,7,0,0 // movdqa 0x7ba(%rip),%xmm2 # 1040 <_sk_xor__sse41_8bit+0x1d6>
- .byte 102,15,219,211 // pand %xmm3,%xmm2
- .byte 102,15,111,242 // movdqa %xmm2,%xmm6
- .byte 102,15,213,240 // pmullw %xmm0,%xmm6
- .byte 102,15,213,213 // pmullw %xmm5,%xmm2
- .byte 102,15,253,240 // paddw %xmm0,%xmm6
- .byte 102,15,253,213 // paddw %xmm5,%xmm2
+ .byte 243,15,16,32 // movss (%rax),%xmm4
+ .byte 243,15,89,37,190,13,0,0 // mulss 0xdbe(%rip),%xmm4 # 1d2c <_sk_xor__sse41_8bit+0x150>
+ .byte 243,15,44,196 // cvttss2si %xmm4,%eax
+ .byte 102,15,110,224 // movd %eax,%xmm4
+ .byte 102,15,96,228 // punpcklbw %xmm4,%xmm4
+ .byte 242,15,112,252,0 // pshuflw $0x0,%xmm4,%xmm7
+ .byte 102,68,15,112,199,80 // pshufd $0x50,%xmm7,%xmm8
+ .byte 102,69,15,239,201 // pxor %xmm9,%xmm9
+ .byte 102,68,15,56,48,208 // pmovzxbw %xmm0,%xmm10
+ .byte 102,65,15,104,193 // punpckhbw %xmm9,%xmm0
+ .byte 102,68,15,56,48,217 // pmovzxbw %xmm1,%xmm11
+ .byte 102,65,15,104,201 // punpckhbw %xmm9,%xmm1
+ .byte 102,15,56,0,61,7,15,0,0 // pshufb 0xf07(%rip),%xmm7 # 1eb0 <_sk_xor__sse41_8bit+0x2d4>
+ .byte 102,68,15,111,231 // movdqa %xmm7,%xmm12
+ .byte 102,69,15,213,227 // pmullw %xmm11,%xmm12
+ .byte 102,68,15,111,239 // movdqa %xmm7,%xmm13
+ .byte 102,69,15,213,234 // pmullw %xmm10,%xmm13
+ .byte 102,15,111,247 // movdqa %xmm7,%xmm6
+ .byte 102,15,213,241 // pmullw %xmm1,%xmm6
+ .byte 102,15,213,248 // pmullw %xmm0,%xmm7
+ .byte 102,15,253,248 // paddw %xmm0,%xmm7
+ .byte 102,15,253,241 // paddw %xmm1,%xmm6
+ .byte 102,69,15,253,234 // paddw %xmm10,%xmm13
+ .byte 102,69,15,253,227 // paddw %xmm11,%xmm12
.byte 102,15,113,214,8 // psrlw $0x8,%xmm6
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,214 // packuswb %xmm6,%xmm2
- .byte 102,15,118,237 // pcmpeqd %xmm5,%xmm5
- .byte 102,15,239,235 // pxor %xmm3,%xmm5
- .byte 102,15,56,48,217 // pmovzxbw %xmm1,%xmm3
- .byte 102,15,111,241 // movdqa %xmm1,%xmm6
- .byte 102,15,104,244 // punpckhbw %xmm4,%xmm6
- .byte 102,15,56,48,197 // pmovzxbw %xmm5,%xmm0
- .byte 102,15,104,236 // punpckhbw %xmm4,%xmm5
- .byte 102,15,213,238 // pmullw %xmm6,%xmm5
- .byte 102,15,213,195 // pmullw %xmm3,%xmm0
- .byte 102,15,253,238 // paddw %xmm6,%xmm5
- .byte 102,15,253,195 // paddw %xmm3,%xmm0
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,65,15,113,212,8 // psrlw $0x8,%xmm12
+ .byte 102,65,15,113,213,8 // psrlw $0x8,%xmm13
+ .byte 102,68,15,103,239 // packuswb %xmm7,%xmm13
+ .byte 102,68,15,103,230 // packuswb %xmm6,%xmm12
+ .byte 102,15,118,255 // pcmpeqd %xmm7,%xmm7
+ .byte 102,65,15,239,248 // pxor %xmm8,%xmm7
+ .byte 102,68,15,56,48,194 // pmovzxbw %xmm2,%xmm8
+ .byte 102,15,111,242 // movdqa %xmm2,%xmm6
+ .byte 102,65,15,104,241 // punpckhbw %xmm9,%xmm6
+ .byte 102,68,15,56,48,211 // pmovzxbw %xmm3,%xmm10
+ .byte 102,15,111,227 // movdqa %xmm3,%xmm4
+ .byte 102,65,15,104,225 // punpckhbw %xmm9,%xmm4
+ .byte 102,15,56,48,199 // pmovzxbw %xmm7,%xmm0
+ .byte 102,65,15,104,249 // punpckhbw %xmm9,%xmm7
+ .byte 102,15,111,239 // movdqa %xmm7,%xmm5
+ .byte 102,15,213,236 // pmullw %xmm4,%xmm5
+ .byte 102,15,213,254 // pmullw %xmm6,%xmm7
+ .byte 102,15,111,200 // movdqa %xmm0,%xmm1
+ .byte 102,65,15,213,202 // pmullw %xmm10,%xmm1
+ .byte 102,65,15,213,192 // pmullw %xmm8,%xmm0
+ .byte 102,15,253,254 // paddw %xmm6,%xmm7
+ .byte 102,15,253,236 // paddw %xmm4,%xmm5
+ .byte 102,65,15,253,192 // paddw %xmm8,%xmm0
+ .byte 102,65,15,253,202 // paddw %xmm10,%xmm1
.byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
.byte 102,15,113,208,8 // psrlw $0x8,%xmm0
- .byte 102,15,103,197 // packuswb %xmm5,%xmm0
- .byte 102,15,252,194 // paddb %xmm2,%xmm0
+ .byte 102,15,103,199 // packuswb %xmm7,%xmm0
+ .byte 102,15,103,205 // packuswb %xmm5,%xmm1
+ .byte 102,65,15,252,197 // paddb %xmm13,%xmm0
+ .byte 102,65,15,252,204 // paddb %xmm12,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -58698,65 +60387,125 @@ _sk_lerp_u8_sse41_8bit:
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 15,133,140,0,0,0 // jne 99d <_sk_lerp_u8_sse41_8bit+0xad>
- .byte 102,66,15,56,49,20,2 // pmovzxbd (%rdx,%r8,1),%xmm2
- .byte 102,15,239,228 // pxor %xmm4,%xmm4
- .byte 102,15,56,0,21,43,7,0,0 // pshufb 0x72b(%rip),%xmm2 # 1050 <_sk_xor__sse41_8bit+0x1e6>
- .byte 102,15,56,48,232 // pmovzxbw %xmm0,%xmm5
- .byte 102,15,104,196 // punpckhbw %xmm4,%xmm0
- .byte 102,15,111,242 // movdqa %xmm2,%xmm6
- .byte 102,15,104,244 // punpckhbw %xmm4,%xmm6
- .byte 102,15,56,48,218 // pmovzxbw %xmm2,%xmm3
- .byte 102,15,213,240 // pmullw %xmm0,%xmm6
- .byte 102,15,213,221 // pmullw %xmm5,%xmm3
- .byte 102,15,253,240 // paddw %xmm0,%xmm6
- .byte 102,15,253,221 // paddw %xmm5,%xmm3
- .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
- .byte 102,15,113,211,8 // psrlw $0x8,%xmm3
- .byte 102,15,103,222 // packuswb %xmm6,%xmm3
- .byte 102,15,118,237 // pcmpeqd %xmm5,%xmm5
- .byte 102,15,239,234 // pxor %xmm2,%xmm5
- .byte 102,15,111,209 // movdqa %xmm1,%xmm2
- .byte 102,15,104,212 // punpckhbw %xmm4,%xmm2
- .byte 102,15,56,48,241 // pmovzxbw %xmm1,%xmm6
- .byte 102,15,56,48,197 // pmovzxbw %xmm5,%xmm0
- .byte 102,15,104,236 // punpckhbw %xmm4,%xmm5
- .byte 102,15,213,234 // pmullw %xmm2,%xmm5
- .byte 102,15,213,198 // pmullw %xmm6,%xmm0
- .byte 102,15,253,234 // paddw %xmm2,%xmm5
- .byte 102,15,253,198 // paddw %xmm6,%xmm0
+ .byte 15,133,46,1,0,0 // jne 11d1 <_sk_lerp_u8_sse41_8bit+0x14f>
+ .byte 102,66,15,56,48,60,2 // pmovzxbw (%rdx,%r8,1),%xmm7
+ .byte 102,15,219,61,14,14,0,0 // pand 0xe0e(%rip),%xmm7 # 1ec0 <_sk_xor__sse41_8bit+0x2e4>
+ .byte 102,69,15,239,192 // pxor %xmm8,%xmm8
+ .byte 102,15,111,247 // movdqa %xmm7,%xmm6
+ .byte 102,15,56,0,53,12,14,0,0 // pshufb 0xe0c(%rip),%xmm6 # 1ed0 <_sk_xor__sse41_8bit+0x2f4>
+ .byte 102,15,56,0,61,19,14,0,0 // pshufb 0xe13(%rip),%xmm7 # 1ee0 <_sk_xor__sse41_8bit+0x304>
+ .byte 102,68,15,56,48,200 // pmovzxbw %xmm0,%xmm9
+ .byte 102,65,15,104,192 // punpckhbw %xmm8,%xmm0
+ .byte 102,68,15,56,48,209 // pmovzxbw %xmm1,%xmm10
+ .byte 102,65,15,104,200 // punpckhbw %xmm8,%xmm1
+ .byte 102,15,111,231 // movdqa %xmm7,%xmm4
+ .byte 102,65,15,104,224 // punpckhbw %xmm8,%xmm4
+ .byte 102,15,111,238 // movdqa %xmm6,%xmm5
+ .byte 102,65,15,104,232 // punpckhbw %xmm8,%xmm5
+ .byte 102,68,15,56,48,231 // pmovzxbw %xmm7,%xmm12
+ .byte 102,68,15,56,48,222 // pmovzxbw %xmm6,%xmm11
+ .byte 102,15,213,233 // pmullw %xmm1,%xmm5
+ .byte 102,15,213,224 // pmullw %xmm0,%xmm4
+ .byte 102,69,15,213,218 // pmullw %xmm10,%xmm11
+ .byte 102,69,15,213,225 // pmullw %xmm9,%xmm12
+ .byte 102,15,253,224 // paddw %xmm0,%xmm4
+ .byte 102,15,253,233 // paddw %xmm1,%xmm5
+ .byte 102,69,15,253,225 // paddw %xmm9,%xmm12
+ .byte 102,69,15,253,218 // paddw %xmm10,%xmm11
.byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,65,15,113,211,8 // psrlw $0x8,%xmm11
+ .byte 102,65,15,113,212,8 // psrlw $0x8,%xmm12
+ .byte 102,68,15,103,228 // packuswb %xmm4,%xmm12
+ .byte 102,68,15,103,221 // packuswb %xmm5,%xmm11
+ .byte 102,15,118,192 // pcmpeqd %xmm0,%xmm0
+ .byte 102,15,239,240 // pxor %xmm0,%xmm6
+ .byte 102,15,239,248 // pxor %xmm0,%xmm7
+ .byte 102,15,111,227 // movdqa %xmm3,%xmm4
+ .byte 102,65,15,104,224 // punpckhbw %xmm8,%xmm4
+ .byte 102,68,15,56,48,202 // pmovzxbw %xmm2,%xmm9
+ .byte 102,15,111,234 // movdqa %xmm2,%xmm5
+ .byte 102,65,15,104,232 // punpckhbw %xmm8,%xmm5
+ .byte 102,68,15,56,48,211 // pmovzxbw %xmm3,%xmm10
+ .byte 102,15,56,48,199 // pmovzxbw %xmm7,%xmm0
+ .byte 102,15,56,48,206 // pmovzxbw %xmm6,%xmm1
+ .byte 102,65,15,104,248 // punpckhbw %xmm8,%xmm7
+ .byte 102,65,15,104,240 // punpckhbw %xmm8,%xmm6
+ .byte 102,15,213,244 // pmullw %xmm4,%xmm6
+ .byte 102,15,213,253 // pmullw %xmm5,%xmm7
+ .byte 102,65,15,213,202 // pmullw %xmm10,%xmm1
+ .byte 102,65,15,213,193 // pmullw %xmm9,%xmm0
+ .byte 102,15,253,253 // paddw %xmm5,%xmm7
+ .byte 102,15,253,244 // paddw %xmm4,%xmm6
+ .byte 102,65,15,253,193 // paddw %xmm9,%xmm0
+ .byte 102,65,15,253,202 // paddw %xmm10,%xmm1
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
.byte 102,15,113,208,8 // psrlw $0x8,%xmm0
- .byte 102,15,103,197 // packuswb %xmm5,%xmm0
- .byte 102,15,252,195 // paddb %xmm3,%xmm0
+ .byte 102,15,103,199 // packuswb %xmm7,%xmm0
+ .byte 102,15,103,206 // packuswb %xmm6,%xmm1
+ .byte 102,65,15,252,196 // paddb %xmm12,%xmm0
+ .byte 102,65,15,252,203 // paddb %xmm11,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,60 // je 9e3 <_sk_lerp_u8_sse41_8bit+0xf3>
- .byte 102,15,239,210 // pxor %xmm2,%xmm2
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,25 // je 9ca <_sk_lerp_u8_sse41_8bit+0xda>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 15,133,93,255,255,255 // jne 918 <_sk_lerp_u8_sse41_8bit+0x28>
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,255 // pxor %xmm7,%xmm7
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 15,135,196,254,255,255 // ja 10aa <_sk_lerp_u8_sse41_8bit+0x28>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,123,0,0,0 // lea 0x7b(%rip),%rcx # 126c <_sk_lerp_u8_sse41_8bit+0x1ea>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
+ .byte 102,15,110,248 // movd %eax,%xmm7
+ .byte 233,162,254,255,255 // jmpq 10aa <_sk_lerp_u8_sse41_8bit+0x28>
.byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
- .byte 102,15,110,208 // movd %eax,%xmm2
- .byte 102,15,112,210,69 // pshufd $0x45,%xmm2,%xmm2
+ .byte 102,15,239,255 // pxor %xmm7,%xmm7
+ .byte 102,15,196,248,2 // pinsrw $0x2,%eax,%xmm7
.byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
- .byte 102,15,110,216 // movd %eax,%xmm3
- .byte 102,15,56,49,219 // pmovzxbd %xmm3,%xmm3
- .byte 102,15,58,14,211,15 // pblendw $0xf,%xmm3,%xmm2
- .byte 233,53,255,255,255 // jmpq 918 <_sk_lerp_u8_sse41_8bit+0x28>
- .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
- .byte 102,15,110,208 // movd %eax,%xmm2
- .byte 233,39,255,255,255 // jmpq 918 <_sk_lerp_u8_sse41_8bit+0x28>
+ .byte 102,15,110,232 // movd %eax,%xmm5
+ .byte 102,15,56,48,237 // pmovzxbw %xmm5,%xmm5
+ .byte 102,15,58,14,253,3 // pblendw $0x3,%xmm5,%xmm7
+ .byte 233,122,254,255,255 // jmpq 10aa <_sk_lerp_u8_sse41_8bit+0x28>
+ .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax
+ .byte 102,15,239,255 // pxor %xmm7,%xmm7
+ .byte 102,15,196,248,6 // pinsrw $0x6,%eax,%xmm7
+ .byte 66,15,182,68,2,5 // movzbl 0x5(%rdx,%r8,1),%eax
+ .byte 102,15,196,248,5 // pinsrw $0x5,%eax,%xmm7
+ .byte 66,15,182,68,2,4 // movzbl 0x4(%rdx,%r8,1),%eax
+ .byte 102,15,196,248,4 // pinsrw $0x4,%eax,%xmm7
+ .byte 102,66,15,110,44,2 // movd (%rdx,%r8,1),%xmm5
+ .byte 102,15,56,48,237 // pmovzxbw %xmm5,%xmm5
+ .byte 102,15,58,14,253,15 // pblendw $0xf,%xmm5,%xmm7
+ .byte 233,63,254,255,255 // jmpq 10aa <_sk_lerp_u8_sse41_8bit+0x28>
+ .byte 144 // nop
+ .byte 142,255 // mov %edi,%?
+ .byte 255 // (bad)
+ .byte 255,171,255,255,255,156 // ljmp *-0x63000001(%rbx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 233,255,255,255,222 // jmpq ffffffffdf00127c <_sk_xor__sse41_8bit+0xffffffffdefff6a0>
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,211 // callq *%rbx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,196 // inc %esp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_move_src_dst_sse41_8bit
.globl _sk_move_src_dst_sse41_8bit
FUNCTION(_sk_move_src_dst_sse41_8bit)
_sk_move_src_dst_sse41_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 15,40,200 // movaps %xmm0,%xmm1
+ .byte 15,40,208 // movaps %xmm0,%xmm2
+ .byte 15,40,217 // movaps %xmm1,%xmm3
.byte 255,224 // jmpq *%rax
HIDDEN _sk_move_dst_src_sse41_8bit
@@ -58764,7 +60513,8 @@ HIDDEN _sk_move_dst_src_sse41_8bit
FUNCTION(_sk_move_dst_src_sse41_8bit)
_sk_move_dst_src_sse41_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 15,40,193 // movaps %xmm1,%xmm0
+ .byte 15,40,194 // movaps %xmm2,%xmm0
+ .byte 15,40,203 // movaps %xmm3,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_black_color_sse41_8bit
@@ -58772,7 +60522,8 @@ HIDDEN _sk_black_color_sse41_8bit
FUNCTION(_sk_black_color_sse41_8bit)
_sk_black_color_sse41_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 15,40,5,88,6,0,0 // movaps 0x658(%rip),%xmm0 # 1060 <_sk_xor__sse41_8bit+0x1f6>
+ .byte 15,40,5,75,12,0,0 // movaps 0xc4b(%rip),%xmm0 # 1ef0 <_sk_xor__sse41_8bit+0x314>
+ .byte 15,40,200 // movaps %xmm0,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_white_color_sse41_8bit
@@ -58781,6 +60532,7 @@ FUNCTION(_sk_white_color_sse41_8bit)
_sk_white_color_sse41_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 102,15,118,192 // pcmpeqd %xmm0,%xmm0
+ .byte 102,15,118,201 // pcmpeqd %xmm1,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_clear_sse41_8bit
@@ -58789,199 +60541,341 @@ FUNCTION(_sk_clear_sse41_8bit)
_sk_clear_sse41_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 15,87,192 // xorps %xmm0,%xmm0
+ .byte 15,87,201 // xorps %xmm1,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_srcatop_sse41_8bit
.globl _sk_srcatop_sse41_8bit
FUNCTION(_sk_srcatop_sse41_8bit)
_sk_srcatop_sse41_8bit:
- .byte 102,68,15,111,5,78,6,0,0 // movdqa 0x64e(%rip),%xmm8 # 1070 <_sk_xor__sse41_8bit+0x206>
- .byte 102,15,111,217 // movdqa %xmm1,%xmm3
- .byte 102,15,56,48,225 // pmovzxbw %xmm1,%xmm4
- .byte 102,15,111,233 // movdqa %xmm1,%xmm5
- .byte 102,65,15,56,0,232 // pshufb %xmm8,%xmm5
- .byte 102,15,239,246 // pxor %xmm6,%xmm6
+ .byte 102,68,15,111,21,55,12,0,0 // movdqa 0xc37(%rip),%xmm10 # 1f00 <_sk_xor__sse41_8bit+0x324>
+ .byte 102,68,15,111,219 // movdqa %xmm3,%xmm11
+ .byte 102,68,15,56,48,195 // pmovzxbw %xmm3,%xmm8
+ .byte 102,15,111,235 // movdqa %xmm3,%xmm5
+ .byte 102,65,15,56,0,234 // pshufb %xmm10,%xmm5
+ .byte 102,68,15,111,226 // movdqa %xmm2,%xmm12
+ .byte 102,68,15,56,48,202 // pmovzxbw %xmm2,%xmm9
+ .byte 102,15,111,226 // movdqa %xmm2,%xmm4
+ .byte 102,65,15,56,0,226 // pshufb %xmm10,%xmm4
+ .byte 102,69,15,239,237 // pxor %xmm13,%xmm13
.byte 102,15,111,248 // movdqa %xmm0,%xmm7
- .byte 102,15,104,254 // punpckhbw %xmm6,%xmm7
- .byte 102,15,56,48,213 // pmovzxbw %xmm5,%xmm2
- .byte 102,15,104,238 // punpckhbw %xmm6,%xmm5
- .byte 102,15,213,239 // pmullw %xmm7,%xmm5
- .byte 102,15,253,239 // paddw %xmm7,%xmm5
- .byte 102,15,56,48,248 // pmovzxbw %xmm0,%xmm7
- .byte 102,15,213,215 // pmullw %xmm7,%xmm2
- .byte 102,15,253,215 // paddw %xmm7,%xmm2
- .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,213 // packuswb %xmm5,%xmm2
- .byte 102,65,15,56,0,192 // pshufb %xmm8,%xmm0
- .byte 102,15,118,237 // pcmpeqd %xmm5,%xmm5
- .byte 102,15,239,232 // pxor %xmm0,%xmm5
- .byte 102,15,104,222 // punpckhbw %xmm6,%xmm3
- .byte 102,15,56,48,197 // pmovzxbw %xmm5,%xmm0
- .byte 102,15,104,238 // punpckhbw %xmm6,%xmm5
- .byte 102,15,213,235 // pmullw %xmm3,%xmm5
- .byte 102,15,213,196 // pmullw %xmm4,%xmm0
- .byte 102,15,253,235 // paddw %xmm3,%xmm5
- .byte 102,15,253,196 // paddw %xmm4,%xmm0
+ .byte 102,65,15,104,253 // punpckhbw %xmm13,%xmm7
+ .byte 102,68,15,111,241 // movdqa %xmm1,%xmm14
+ .byte 102,69,15,104,245 // punpckhbw %xmm13,%xmm14
+ .byte 102,15,56,48,244 // pmovzxbw %xmm4,%xmm6
+ .byte 102,65,15,104,229 // punpckhbw %xmm13,%xmm4
+ .byte 102,15,213,231 // pmullw %xmm7,%xmm4
+ .byte 102,15,253,231 // paddw %xmm7,%xmm4
+ .byte 102,15,56,48,253 // pmovzxbw %xmm5,%xmm7
+ .byte 102,65,15,104,237 // punpckhbw %xmm13,%xmm5
+ .byte 102,65,15,213,238 // pmullw %xmm14,%xmm5
+ .byte 102,65,15,253,238 // paddw %xmm14,%xmm5
+ .byte 102,68,15,56,48,240 // pmovzxbw %xmm0,%xmm14
+ .byte 102,65,15,213,246 // pmullw %xmm14,%xmm6
+ .byte 102,65,15,253,246 // paddw %xmm14,%xmm6
+ .byte 102,68,15,56,48,241 // pmovzxbw %xmm1,%xmm14
+ .byte 102,65,15,213,254 // pmullw %xmm14,%xmm7
+ .byte 102,65,15,253,254 // paddw %xmm14,%xmm7
.byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,103,244 // packuswb %xmm4,%xmm6
+ .byte 102,15,103,253 // packuswb %xmm5,%xmm7
+ .byte 102,65,15,56,0,194 // pshufb %xmm10,%xmm0
+ .byte 102,65,15,56,0,202 // pshufb %xmm10,%xmm1
+ .byte 102,15,118,228 // pcmpeqd %xmm4,%xmm4
+ .byte 102,15,239,204 // pxor %xmm4,%xmm1
+ .byte 102,15,239,196 // pxor %xmm4,%xmm0
+ .byte 102,69,15,104,229 // punpckhbw %xmm13,%xmm12
+ .byte 102,69,15,104,221 // punpckhbw %xmm13,%xmm11
+ .byte 102,15,56,48,224 // pmovzxbw %xmm0,%xmm4
+ .byte 102,15,56,48,233 // pmovzxbw %xmm1,%xmm5
+ .byte 102,65,15,104,197 // punpckhbw %xmm13,%xmm0
+ .byte 102,65,15,104,205 // punpckhbw %xmm13,%xmm1
+ .byte 102,65,15,213,203 // pmullw %xmm11,%xmm1
+ .byte 102,65,15,213,196 // pmullw %xmm12,%xmm0
+ .byte 102,65,15,213,232 // pmullw %xmm8,%xmm5
+ .byte 102,65,15,213,225 // pmullw %xmm9,%xmm4
+ .byte 102,65,15,253,196 // paddw %xmm12,%xmm0
+ .byte 102,65,15,253,203 // paddw %xmm11,%xmm1
+ .byte 102,65,15,253,225 // paddw %xmm9,%xmm4
+ .byte 102,65,15,253,232 // paddw %xmm8,%xmm5
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
.byte 102,15,113,208,8 // psrlw $0x8,%xmm0
- .byte 102,15,103,197 // packuswb %xmm5,%xmm0
- .byte 102,15,252,194 // paddb %xmm2,%xmm0
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,15,103,224 // packuswb %xmm0,%xmm4
+ .byte 102,15,103,233 // packuswb %xmm1,%xmm5
+ .byte 102,15,252,230 // paddb %xmm6,%xmm4
+ .byte 102,15,252,239 // paddb %xmm7,%xmm5
.byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 102,15,111,196 // movdqa %xmm4,%xmm0
+ .byte 102,15,111,205 // movdqa %xmm5,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_dstatop_sse41_8bit
.globl _sk_dstatop_sse41_8bit
FUNCTION(_sk_dstatop_sse41_8bit)
_sk_dstatop_sse41_8bit:
- .byte 102,15,111,21,202,5,0,0 // movdqa 0x5ca(%rip),%xmm2 # 1080 <_sk_xor__sse41_8bit+0x216>
- .byte 102,15,111,216 // movdqa %xmm0,%xmm3
- .byte 102,15,56,0,218 // pshufb %xmm2,%xmm3
+ .byte 102,68,15,111,29,12,11,0,0 // movdqa 0xb0c(%rip),%xmm11 # 1f10 <_sk_xor__sse41_8bit+0x334>
+ .byte 102,68,15,111,233 // movdqa %xmm1,%xmm13
+ .byte 102,69,15,56,0,235 // pshufb %xmm11,%xmm13
+ .byte 102,68,15,111,248 // movdqa %xmm0,%xmm15
+ .byte 102,69,15,56,0,251 // pshufb %xmm11,%xmm15
.byte 102,69,15,239,192 // pxor %xmm8,%xmm8
- .byte 102,15,56,48,233 // pmovzxbw %xmm1,%xmm5
- .byte 102,15,111,241 // movdqa %xmm1,%xmm6
- .byte 102,15,111,249 // movdqa %xmm1,%xmm7
+ .byte 102,68,15,56,48,226 // pmovzxbw %xmm2,%xmm12
+ .byte 102,15,111,242 // movdqa %xmm2,%xmm6
+ .byte 102,15,111,234 // movdqa %xmm2,%xmm5
+ .byte 102,65,15,104,232 // punpckhbw %xmm8,%xmm5
+ .byte 102,68,15,56,48,243 // pmovzxbw %xmm3,%xmm14
+ .byte 102,15,111,251 // movdqa %xmm3,%xmm7
+ .byte 102,15,111,227 // movdqa %xmm3,%xmm4
+ .byte 102,65,15,104,224 // punpckhbw %xmm8,%xmm4
+ .byte 102,69,15,56,48,215 // pmovzxbw %xmm15,%xmm10
+ .byte 102,69,15,104,248 // punpckhbw %xmm8,%xmm15
+ .byte 102,68,15,213,253 // pmullw %xmm5,%xmm15
+ .byte 102,68,15,253,253 // paddw %xmm5,%xmm15
+ .byte 102,69,15,56,48,205 // pmovzxbw %xmm13,%xmm9
+ .byte 102,69,15,104,232 // punpckhbw %xmm8,%xmm13
+ .byte 102,68,15,213,236 // pmullw %xmm4,%xmm13
+ .byte 102,68,15,253,236 // paddw %xmm4,%xmm13
+ .byte 102,69,15,213,206 // pmullw %xmm14,%xmm9
+ .byte 102,69,15,213,212 // pmullw %xmm12,%xmm10
+ .byte 102,69,15,253,212 // paddw %xmm12,%xmm10
+ .byte 102,69,15,253,206 // paddw %xmm14,%xmm9
+ .byte 102,65,15,113,213,8 // psrlw $0x8,%xmm13
+ .byte 102,65,15,113,215,8 // psrlw $0x8,%xmm15
+ .byte 102,65,15,113,209,8 // psrlw $0x8,%xmm9
+ .byte 102,65,15,113,210,8 // psrlw $0x8,%xmm10
+ .byte 102,69,15,103,215 // packuswb %xmm15,%xmm10
+ .byte 102,69,15,103,205 // packuswb %xmm13,%xmm9
+ .byte 102,65,15,56,0,243 // pshufb %xmm11,%xmm6
+ .byte 102,65,15,56,0,251 // pshufb %xmm11,%xmm7
+ .byte 102,15,118,228 // pcmpeqd %xmm4,%xmm4
+ .byte 102,15,239,252 // pxor %xmm4,%xmm7
+ .byte 102,15,239,244 // pxor %xmm4,%xmm6
+ .byte 102,68,15,56,48,216 // pmovzxbw %xmm0,%xmm11
+ .byte 102,65,15,104,192 // punpckhbw %xmm8,%xmm0
+ .byte 102,68,15,56,48,225 // pmovzxbw %xmm1,%xmm12
+ .byte 102,65,15,104,200 // punpckhbw %xmm8,%xmm1
+ .byte 102,15,56,48,230 // pmovzxbw %xmm6,%xmm4
+ .byte 102,15,56,48,239 // pmovzxbw %xmm7,%xmm5
+ .byte 102,65,15,104,240 // punpckhbw %xmm8,%xmm6
.byte 102,65,15,104,248 // punpckhbw %xmm8,%xmm7
- .byte 102,15,56,48,227 // pmovzxbw %xmm3,%xmm4
- .byte 102,65,15,104,216 // punpckhbw %xmm8,%xmm3
- .byte 102,15,213,223 // pmullw %xmm7,%xmm3
- .byte 102,15,253,223 // paddw %xmm7,%xmm3
- .byte 102,15,213,229 // pmullw %xmm5,%xmm4
- .byte 102,15,253,229 // paddw %xmm5,%xmm4
- .byte 102,15,113,211,8 // psrlw $0x8,%xmm3
+ .byte 102,15,213,249 // pmullw %xmm1,%xmm7
+ .byte 102,15,213,240 // pmullw %xmm0,%xmm6
+ .byte 102,65,15,213,236 // pmullw %xmm12,%xmm5
+ .byte 102,65,15,213,227 // pmullw %xmm11,%xmm4
+ .byte 102,15,253,240 // paddw %xmm0,%xmm6
+ .byte 102,15,253,249 // paddw %xmm1,%xmm7
+ .byte 102,65,15,253,227 // paddw %xmm11,%xmm4
+ .byte 102,65,15,253,236 // paddw %xmm12,%xmm5
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
.byte 102,15,113,212,8 // psrlw $0x8,%xmm4
- .byte 102,15,103,227 // packuswb %xmm3,%xmm4
- .byte 102,15,56,0,242 // pshufb %xmm2,%xmm6
- .byte 102,15,118,219 // pcmpeqd %xmm3,%xmm3
- .byte 102,15,239,222 // pxor %xmm6,%xmm3
- .byte 102,15,56,48,232 // pmovzxbw %xmm0,%xmm5
- .byte 102,65,15,104,192 // punpckhbw %xmm8,%xmm0
- .byte 102,15,56,48,211 // pmovzxbw %xmm3,%xmm2
- .byte 102,65,15,104,216 // punpckhbw %xmm8,%xmm3
- .byte 102,15,213,216 // pmullw %xmm0,%xmm3
- .byte 102,15,213,213 // pmullw %xmm5,%xmm2
- .byte 102,15,253,216 // paddw %xmm0,%xmm3
- .byte 102,15,253,213 // paddw %xmm5,%xmm2
- .byte 102,15,113,211,8 // psrlw $0x8,%xmm3
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,211 // packuswb %xmm3,%xmm2
- .byte 102,15,252,212 // paddb %xmm4,%xmm2
+ .byte 102,15,103,230 // packuswb %xmm6,%xmm4
+ .byte 102,15,103,239 // packuswb %xmm7,%xmm5
+ .byte 102,65,15,252,226 // paddb %xmm10,%xmm4
+ .byte 102,65,15,252,233 // paddb %xmm9,%xmm5
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 102,15,111,194 // movdqa %xmm2,%xmm0
+ .byte 102,15,111,196 // movdqa %xmm4,%xmm0
+ .byte 102,15,111,205 // movdqa %xmm5,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_srcin_sse41_8bit
.globl _sk_srcin_sse41_8bit
FUNCTION(_sk_srcin_sse41_8bit)
_sk_srcin_sse41_8bit:
- .byte 102,15,111,217 // movdqa %xmm1,%xmm3
- .byte 102,15,56,0,29,58,5,0,0 // pshufb 0x53a(%rip),%xmm3 # 1090 <_sk_xor__sse41_8bit+0x226>
- .byte 102,15,239,228 // pxor %xmm4,%xmm4
- .byte 102,15,56,48,232 // pmovzxbw %xmm0,%xmm5
- .byte 102,15,104,196 // punpckhbw %xmm4,%xmm0
- .byte 102,15,56,48,211 // pmovzxbw %xmm3,%xmm2
- .byte 102,15,104,220 // punpckhbw %xmm4,%xmm3
- .byte 102,15,213,216 // pmullw %xmm0,%xmm3
- .byte 102,15,213,213 // pmullw %xmm5,%xmm2
- .byte 102,15,253,216 // paddw %xmm0,%xmm3
- .byte 102,15,253,213 // paddw %xmm5,%xmm2
- .byte 102,15,113,211,8 // psrlw $0x8,%xmm3
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,211 // packuswb %xmm3,%xmm2
+ .byte 102,15,111,225 // movdqa %xmm1,%xmm4
+ .byte 102,15,111,232 // movdqa %xmm0,%xmm5
+ .byte 102,15,111,5,211,9,0,0 // movdqa 0x9d3(%rip),%xmm0 # 1f20 <_sk_xor__sse41_8bit+0x344>
+ .byte 102,15,111,243 // movdqa %xmm3,%xmm6
+ .byte 102,15,56,0,240 // pshufb %xmm0,%xmm6
+ .byte 102,15,111,250 // movdqa %xmm2,%xmm7
+ .byte 102,15,56,0,248 // pshufb %xmm0,%xmm7
+ .byte 102,69,15,239,192 // pxor %xmm8,%xmm8
+ .byte 102,68,15,56,48,205 // pmovzxbw %xmm5,%xmm9
+ .byte 102,65,15,104,232 // punpckhbw %xmm8,%xmm5
+ .byte 102,68,15,56,48,212 // pmovzxbw %xmm4,%xmm10
+ .byte 102,65,15,104,224 // punpckhbw %xmm8,%xmm4
+ .byte 102,15,56,48,199 // pmovzxbw %xmm7,%xmm0
+ .byte 102,15,56,48,206 // pmovzxbw %xmm6,%xmm1
+ .byte 102,65,15,104,248 // punpckhbw %xmm8,%xmm7
+ .byte 102,65,15,104,240 // punpckhbw %xmm8,%xmm6
+ .byte 102,15,213,244 // pmullw %xmm4,%xmm6
+ .byte 102,15,213,253 // pmullw %xmm5,%xmm7
+ .byte 102,65,15,213,202 // pmullw %xmm10,%xmm1
+ .byte 102,65,15,213,193 // pmullw %xmm9,%xmm0
+ .byte 102,15,253,253 // paddw %xmm5,%xmm7
+ .byte 102,15,253,244 // paddw %xmm4,%xmm6
+ .byte 102,65,15,253,193 // paddw %xmm9,%xmm0
+ .byte 102,65,15,253,202 // paddw %xmm10,%xmm1
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
+ .byte 102,15,113,208,8 // psrlw $0x8,%xmm0
+ .byte 102,15,103,199 // packuswb %xmm7,%xmm0
+ .byte 102,15,103,206 // packuswb %xmm6,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 102,15,111,194 // movdqa %xmm2,%xmm0
.byte 255,224 // jmpq *%rax
HIDDEN _sk_dstin_sse41_8bit
.globl _sk_dstin_sse41_8bit
FUNCTION(_sk_dstin_sse41_8bit)
_sk_dstin_sse41_8bit:
- .byte 102,15,56,0,5,5,5,0,0 // pshufb 0x505(%rip),%xmm0 # 10a0 <_sk_xor__sse41_8bit+0x236>
- .byte 102,15,239,219 // pxor %xmm3,%xmm3
- .byte 102,15,56,48,225 // pmovzxbw %xmm1,%xmm4
- .byte 102,15,111,233 // movdqa %xmm1,%xmm5
- .byte 102,15,104,235 // punpckhbw %xmm3,%xmm5
- .byte 102,15,56,48,208 // pmovzxbw %xmm0,%xmm2
- .byte 102,15,104,195 // punpckhbw %xmm3,%xmm0
- .byte 102,15,213,197 // pmullw %xmm5,%xmm0
- .byte 102,15,213,212 // pmullw %xmm4,%xmm2
- .byte 102,15,253,197 // paddw %xmm5,%xmm0
- .byte 102,15,253,212 // paddw %xmm4,%xmm2
+ .byte 102,15,111,37,86,9,0,0 // movdqa 0x956(%rip),%xmm4 # 1f30 <_sk_xor__sse41_8bit+0x354>
+ .byte 102,15,56,0,204 // pshufb %xmm4,%xmm1
+ .byte 102,15,56,0,196 // pshufb %xmm4,%xmm0
+ .byte 102,69,15,239,210 // pxor %xmm10,%xmm10
+ .byte 102,68,15,56,48,194 // pmovzxbw %xmm2,%xmm8
+ .byte 102,15,111,250 // movdqa %xmm2,%xmm7
+ .byte 102,65,15,104,250 // punpckhbw %xmm10,%xmm7
+ .byte 102,68,15,56,48,203 // pmovzxbw %xmm3,%xmm9
+ .byte 102,15,111,243 // movdqa %xmm3,%xmm6
+ .byte 102,65,15,104,242 // punpckhbw %xmm10,%xmm6
+ .byte 102,15,56,48,224 // pmovzxbw %xmm0,%xmm4
+ .byte 102,15,56,48,233 // pmovzxbw %xmm1,%xmm5
+ .byte 102,65,15,104,194 // punpckhbw %xmm10,%xmm0
+ .byte 102,65,15,104,202 // punpckhbw %xmm10,%xmm1
+ .byte 102,15,213,206 // pmullw %xmm6,%xmm1
+ .byte 102,15,213,199 // pmullw %xmm7,%xmm0
+ .byte 102,65,15,213,233 // pmullw %xmm9,%xmm5
+ .byte 102,65,15,213,224 // pmullw %xmm8,%xmm4
+ .byte 102,15,253,199 // paddw %xmm7,%xmm0
+ .byte 102,15,253,206 // paddw %xmm6,%xmm1
+ .byte 102,65,15,253,224 // paddw %xmm8,%xmm4
+ .byte 102,65,15,253,233 // paddw %xmm9,%xmm5
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
.byte 102,15,113,208,8 // psrlw $0x8,%xmm0
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,208 // packuswb %xmm0,%xmm2
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,15,103,224 // packuswb %xmm0,%xmm4
+ .byte 102,15,103,233 // packuswb %xmm1,%xmm5
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 102,15,111,194 // movdqa %xmm2,%xmm0
+ .byte 102,15,111,196 // movdqa %xmm4,%xmm0
+ .byte 102,15,111,205 // movdqa %xmm5,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_srcout_sse41_8bit
.globl _sk_srcout_sse41_8bit
FUNCTION(_sk_srcout_sse41_8bit)
_sk_srcout_sse41_8bit:
- .byte 102,15,111,209 // movdqa %xmm1,%xmm2
- .byte 102,15,56,0,21,200,4,0,0 // pshufb 0x4c8(%rip),%xmm2 # 10b0 <_sk_xor__sse41_8bit+0x246>
- .byte 102,15,118,219 // pcmpeqd %xmm3,%xmm3
- .byte 102,15,239,218 // pxor %xmm2,%xmm3
- .byte 102,15,239,228 // pxor %xmm4,%xmm4
- .byte 102,15,56,48,232 // pmovzxbw %xmm0,%xmm5
- .byte 102,15,104,196 // punpckhbw %xmm4,%xmm0
- .byte 102,15,56,48,211 // pmovzxbw %xmm3,%xmm2
- .byte 102,15,104,220 // punpckhbw %xmm4,%xmm3
- .byte 102,15,213,216 // pmullw %xmm0,%xmm3
- .byte 102,15,213,213 // pmullw %xmm5,%xmm2
- .byte 102,15,253,216 // paddw %xmm0,%xmm3
- .byte 102,15,253,213 // paddw %xmm5,%xmm2
- .byte 102,15,113,211,8 // psrlw $0x8,%xmm3
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,211 // packuswb %xmm3,%xmm2
+ .byte 102,15,111,225 // movdqa %xmm1,%xmm4
+ .byte 102,15,111,232 // movdqa %xmm0,%xmm5
+ .byte 102,15,111,5,201,8,0,0 // movdqa 0x8c9(%rip),%xmm0 # 1f40 <_sk_xor__sse41_8bit+0x364>
+ .byte 102,15,111,250 // movdqa %xmm2,%xmm7
+ .byte 102,15,56,0,248 // pshufb %xmm0,%xmm7
+ .byte 102,15,111,243 // movdqa %xmm3,%xmm6
+ .byte 102,15,56,0,240 // pshufb %xmm0,%xmm6
+ .byte 102,15,118,192 // pcmpeqd %xmm0,%xmm0
+ .byte 102,15,239,240 // pxor %xmm0,%xmm6
+ .byte 102,15,239,248 // pxor %xmm0,%xmm7
+ .byte 102,69,15,239,192 // pxor %xmm8,%xmm8
+ .byte 102,68,15,56,48,205 // pmovzxbw %xmm5,%xmm9
+ .byte 102,65,15,104,232 // punpckhbw %xmm8,%xmm5
+ .byte 102,68,15,56,48,212 // pmovzxbw %xmm4,%xmm10
+ .byte 102,65,15,104,224 // punpckhbw %xmm8,%xmm4
+ .byte 102,15,56,48,199 // pmovzxbw %xmm7,%xmm0
+ .byte 102,15,56,48,206 // pmovzxbw %xmm6,%xmm1
+ .byte 102,65,15,104,248 // punpckhbw %xmm8,%xmm7
+ .byte 102,65,15,104,240 // punpckhbw %xmm8,%xmm6
+ .byte 102,15,213,244 // pmullw %xmm4,%xmm6
+ .byte 102,15,213,253 // pmullw %xmm5,%xmm7
+ .byte 102,65,15,213,202 // pmullw %xmm10,%xmm1
+ .byte 102,65,15,213,193 // pmullw %xmm9,%xmm0
+ .byte 102,15,253,253 // paddw %xmm5,%xmm7
+ .byte 102,15,253,244 // paddw %xmm4,%xmm6
+ .byte 102,65,15,253,193 // paddw %xmm9,%xmm0
+ .byte 102,65,15,253,202 // paddw %xmm10,%xmm1
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
+ .byte 102,15,113,208,8 // psrlw $0x8,%xmm0
+ .byte 102,15,103,199 // packuswb %xmm7,%xmm0
+ .byte 102,15,103,206 // packuswb %xmm6,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 102,15,111,194 // movdqa %xmm2,%xmm0
.byte 255,224 // jmpq *%rax
HIDDEN _sk_dstout_sse41_8bit
.globl _sk_dstout_sse41_8bit
FUNCTION(_sk_dstout_sse41_8bit)
_sk_dstout_sse41_8bit:
- .byte 102,15,56,0,5,139,4,0,0 // pshufb 0x48b(%rip),%xmm0 # 10c0 <_sk_xor__sse41_8bit+0x256>
- .byte 102,15,118,210 // pcmpeqd %xmm2,%xmm2
- .byte 102,15,239,208 // pxor %xmm0,%xmm2
- .byte 102,15,239,219 // pxor %xmm3,%xmm3
- .byte 102,15,56,48,225 // pmovzxbw %xmm1,%xmm4
- .byte 102,15,111,233 // movdqa %xmm1,%xmm5
- .byte 102,15,104,235 // punpckhbw %xmm3,%xmm5
- .byte 102,15,56,48,194 // pmovzxbw %xmm2,%xmm0
- .byte 102,15,104,211 // punpckhbw %xmm3,%xmm2
- .byte 102,15,213,213 // pmullw %xmm5,%xmm2
- .byte 102,15,213,196 // pmullw %xmm4,%xmm0
- .byte 102,15,253,213 // paddw %xmm5,%xmm2
- .byte 102,15,253,196 // paddw %xmm4,%xmm0
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
+ .byte 102,15,111,37,64,8,0,0 // movdqa 0x840(%rip),%xmm4 # 1f50 <_sk_xor__sse41_8bit+0x374>
+ .byte 102,15,56,0,196 // pshufb %xmm4,%xmm0
+ .byte 102,15,56,0,204 // pshufb %xmm4,%xmm1
+ .byte 102,15,118,228 // pcmpeqd %xmm4,%xmm4
+ .byte 102,15,239,204 // pxor %xmm4,%xmm1
+ .byte 102,15,239,196 // pxor %xmm4,%xmm0
+ .byte 102,69,15,239,210 // pxor %xmm10,%xmm10
+ .byte 102,68,15,56,48,194 // pmovzxbw %xmm2,%xmm8
+ .byte 102,15,111,250 // movdqa %xmm2,%xmm7
+ .byte 102,65,15,104,250 // punpckhbw %xmm10,%xmm7
+ .byte 102,68,15,56,48,203 // pmovzxbw %xmm3,%xmm9
+ .byte 102,15,111,243 // movdqa %xmm3,%xmm6
+ .byte 102,65,15,104,242 // punpckhbw %xmm10,%xmm6
+ .byte 102,15,56,48,224 // pmovzxbw %xmm0,%xmm4
+ .byte 102,15,56,48,233 // pmovzxbw %xmm1,%xmm5
+ .byte 102,65,15,104,194 // punpckhbw %xmm10,%xmm0
+ .byte 102,65,15,104,202 // punpckhbw %xmm10,%xmm1
+ .byte 102,15,213,206 // pmullw %xmm6,%xmm1
+ .byte 102,15,213,199 // pmullw %xmm7,%xmm0
+ .byte 102,65,15,213,233 // pmullw %xmm9,%xmm5
+ .byte 102,65,15,213,224 // pmullw %xmm8,%xmm4
+ .byte 102,15,253,199 // paddw %xmm7,%xmm0
+ .byte 102,15,253,206 // paddw %xmm6,%xmm1
+ .byte 102,65,15,253,224 // paddw %xmm8,%xmm4
+ .byte 102,65,15,253,233 // paddw %xmm9,%xmm5
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
.byte 102,15,113,208,8 // psrlw $0x8,%xmm0
- .byte 102,15,103,194 // packuswb %xmm2,%xmm0
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,15,103,224 // packuswb %xmm0,%xmm4
+ .byte 102,15,103,233 // packuswb %xmm1,%xmm5
.byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 102,15,111,196 // movdqa %xmm4,%xmm0
+ .byte 102,15,111,205 // movdqa %xmm5,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_srcover_sse41_8bit
.globl _sk_srcover_sse41_8bit
FUNCTION(_sk_srcover_sse41_8bit)
_sk_srcover_sse41_8bit:
- .byte 102,15,111,208 // movdqa %xmm0,%xmm2
- .byte 102,15,56,0,21,74,4,0,0 // pshufb 0x44a(%rip),%xmm2 # 10d0 <_sk_xor__sse41_8bit+0x266>
- .byte 102,15,239,219 // pxor %xmm3,%xmm3
- .byte 102,15,56,48,225 // pmovzxbw %xmm1,%xmm4
- .byte 102,15,252,193 // paddb %xmm1,%xmm0
- .byte 102,15,111,233 // movdqa %xmm1,%xmm5
- .byte 102,15,104,235 // punpckhbw %xmm3,%xmm5
- .byte 102,15,56,48,242 // pmovzxbw %xmm2,%xmm6
- .byte 102,15,104,211 // punpckhbw %xmm3,%xmm2
- .byte 102,15,213,213 // pmullw %xmm5,%xmm2
- .byte 102,15,213,244 // pmullw %xmm4,%xmm6
- .byte 102,15,253,213 // paddw %xmm5,%xmm2
- .byte 102,15,253,244 // paddw %xmm4,%xmm6
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
+ .byte 102,15,111,53,175,7,0,0 // movdqa 0x7af(%rip),%xmm6 # 1f60 <_sk_xor__sse41_8bit+0x384>
+ .byte 102,68,15,111,217 // movdqa %xmm1,%xmm11
+ .byte 102,68,15,56,0,222 // pshufb %xmm6,%xmm11
+ .byte 102,15,111,232 // movdqa %xmm0,%xmm5
+ .byte 102,15,56,0,238 // pshufb %xmm6,%xmm5
+ .byte 102,69,15,239,192 // pxor %xmm8,%xmm8
+ .byte 102,68,15,56,48,202 // pmovzxbw %xmm2,%xmm9
+ .byte 102,15,252,194 // paddb %xmm2,%xmm0
+ .byte 102,68,15,111,226 // movdqa %xmm2,%xmm12
+ .byte 102,69,15,104,224 // punpckhbw %xmm8,%xmm12
+ .byte 102,68,15,56,48,211 // pmovzxbw %xmm3,%xmm10
+ .byte 102,15,252,203 // paddb %xmm3,%xmm1
+ .byte 102,15,111,251 // movdqa %xmm3,%xmm7
+ .byte 102,65,15,104,248 // punpckhbw %xmm8,%xmm7
+ .byte 102,15,56,48,229 // pmovzxbw %xmm5,%xmm4
+ .byte 102,65,15,56,48,243 // pmovzxbw %xmm11,%xmm6
+ .byte 102,65,15,104,232 // punpckhbw %xmm8,%xmm5
+ .byte 102,69,15,104,216 // punpckhbw %xmm8,%xmm11
+ .byte 102,68,15,213,223 // pmullw %xmm7,%xmm11
+ .byte 102,65,15,213,236 // pmullw %xmm12,%xmm5
+ .byte 102,65,15,213,242 // pmullw %xmm10,%xmm6
+ .byte 102,65,15,213,225 // pmullw %xmm9,%xmm4
+ .byte 102,65,15,253,236 // paddw %xmm12,%xmm5
+ .byte 102,68,15,253,223 // paddw %xmm7,%xmm11
+ .byte 102,65,15,253,225 // paddw %xmm9,%xmm4
+ .byte 102,65,15,253,242 // paddw %xmm10,%xmm6
+ .byte 102,65,15,113,211,8 // psrlw $0x8,%xmm11
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
.byte 102,15,113,214,8 // psrlw $0x8,%xmm6
- .byte 102,15,103,242 // packuswb %xmm2,%xmm6
- .byte 102,15,248,198 // psubb %xmm6,%xmm0
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,15,103,229 // packuswb %xmm5,%xmm4
+ .byte 102,65,15,103,243 // packuswb %xmm11,%xmm6
+ .byte 102,15,248,196 // psubb %xmm4,%xmm0
+ .byte 102,15,248,206 // psubb %xmm6,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -58989,23 +60883,40 @@ HIDDEN _sk_dstover_sse41_8bit
.globl _sk_dstover_sse41_8bit
FUNCTION(_sk_dstover_sse41_8bit)
_sk_dstover_sse41_8bit:
- .byte 102,15,111,208 // movdqa %xmm0,%xmm2
- .byte 102,15,56,48,216 // pmovzxbw %xmm0,%xmm3
- .byte 102,15,252,193 // paddb %xmm1,%xmm0
- .byte 102,15,111,225 // movdqa %xmm1,%xmm4
- .byte 102,15,56,0,37,252,3,0,0 // pshufb 0x3fc(%rip),%xmm4 # 10e0 <_sk_xor__sse41_8bit+0x276>
- .byte 102,15,239,237 // pxor %xmm5,%xmm5
- .byte 102,15,104,213 // punpckhbw %xmm5,%xmm2
- .byte 102,15,56,48,244 // pmovzxbw %xmm4,%xmm6
- .byte 102,15,104,229 // punpckhbw %xmm5,%xmm4
- .byte 102,15,213,226 // pmullw %xmm2,%xmm4
- .byte 102,15,213,243 // pmullw %xmm3,%xmm6
- .byte 102,15,253,226 // paddw %xmm2,%xmm4
- .byte 102,15,253,243 // paddw %xmm3,%xmm6
+ .byte 102,68,15,111,5,15,7,0,0 // movdqa 0x70f(%rip),%xmm8 # 1f70 <_sk_xor__sse41_8bit+0x394>
+ .byte 102,68,15,111,209 // movdqa %xmm1,%xmm10
+ .byte 102,68,15,56,48,201 // pmovzxbw %xmm1,%xmm9
+ .byte 102,15,252,203 // paddb %xmm3,%xmm1
+ .byte 102,15,111,251 // movdqa %xmm3,%xmm7
+ .byte 102,65,15,56,0,248 // pshufb %xmm8,%xmm7
+ .byte 102,68,15,111,224 // movdqa %xmm0,%xmm12
+ .byte 102,68,15,56,48,216 // pmovzxbw %xmm0,%xmm11
+ .byte 102,15,252,194 // paddb %xmm2,%xmm0
+ .byte 102,15,111,234 // movdqa %xmm2,%xmm5
+ .byte 102,65,15,56,0,232 // pshufb %xmm8,%xmm5
+ .byte 102,69,15,239,192 // pxor %xmm8,%xmm8
+ .byte 102,69,15,104,224 // punpckhbw %xmm8,%xmm12
+ .byte 102,69,15,104,208 // punpckhbw %xmm8,%xmm10
+ .byte 102,15,56,48,245 // pmovzxbw %xmm5,%xmm6
+ .byte 102,15,56,48,231 // pmovzxbw %xmm7,%xmm4
+ .byte 102,65,15,104,232 // punpckhbw %xmm8,%xmm5
+ .byte 102,65,15,104,248 // punpckhbw %xmm8,%xmm7
+ .byte 102,65,15,213,250 // pmullw %xmm10,%xmm7
+ .byte 102,65,15,213,236 // pmullw %xmm12,%xmm5
+ .byte 102,65,15,213,225 // pmullw %xmm9,%xmm4
+ .byte 102,65,15,213,243 // pmullw %xmm11,%xmm6
+ .byte 102,65,15,253,236 // paddw %xmm12,%xmm5
+ .byte 102,65,15,253,250 // paddw %xmm10,%xmm7
+ .byte 102,65,15,253,243 // paddw %xmm11,%xmm6
+ .byte 102,65,15,253,225 // paddw %xmm9,%xmm4
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
.byte 102,15,113,212,8 // psrlw $0x8,%xmm4
.byte 102,15,113,214,8 // psrlw $0x8,%xmm6
- .byte 102,15,103,244 // packuswb %xmm4,%xmm6
+ .byte 102,15,103,245 // packuswb %xmm5,%xmm6
+ .byte 102,15,103,231 // packuswb %xmm7,%xmm4
.byte 102,15,248,198 // psubb %xmm6,%xmm0
+ .byte 102,15,248,204 // psubb %xmm4,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -59013,92 +60924,166 @@ HIDDEN _sk_modulate_sse41_8bit
.globl _sk_modulate_sse41_8bit
FUNCTION(_sk_modulate_sse41_8bit)
_sk_modulate_sse41_8bit:
- .byte 102,15,239,219 // pxor %xmm3,%xmm3
- .byte 102,15,56,48,224 // pmovzxbw %xmm0,%xmm4
- .byte 102,15,104,195 // punpckhbw %xmm3,%xmm0
- .byte 102,15,56,48,209 // pmovzxbw %xmm1,%xmm2
- .byte 102,15,111,233 // movdqa %xmm1,%xmm5
- .byte 102,15,104,235 // punpckhbw %xmm3,%xmm5
- .byte 102,15,213,232 // pmullw %xmm0,%xmm5
- .byte 102,15,213,212 // pmullw %xmm4,%xmm2
- .byte 102,15,253,232 // paddw %xmm0,%xmm5
- .byte 102,15,253,212 // paddw %xmm4,%xmm2
- .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,213 // packuswb %xmm5,%xmm2
+ .byte 102,15,111,225 // movdqa %xmm1,%xmm4
+ .byte 102,15,111,232 // movdqa %xmm0,%xmm5
+ .byte 102,69,15,239,210 // pxor %xmm10,%xmm10
+ .byte 102,68,15,56,48,197 // pmovzxbw %xmm5,%xmm8
+ .byte 102,65,15,104,234 // punpckhbw %xmm10,%xmm5
+ .byte 102,68,15,56,48,204 // pmovzxbw %xmm4,%xmm9
+ .byte 102,65,15,104,226 // punpckhbw %xmm10,%xmm4
+ .byte 102,15,56,48,194 // pmovzxbw %xmm2,%xmm0
+ .byte 102,15,111,250 // movdqa %xmm2,%xmm7
+ .byte 102,65,15,104,250 // punpckhbw %xmm10,%xmm7
+ .byte 102,15,56,48,203 // pmovzxbw %xmm3,%xmm1
+ .byte 102,15,111,243 // movdqa %xmm3,%xmm6
+ .byte 102,65,15,104,242 // punpckhbw %xmm10,%xmm6
+ .byte 102,15,213,244 // pmullw %xmm4,%xmm6
+ .byte 102,15,213,253 // pmullw %xmm5,%xmm7
+ .byte 102,65,15,213,201 // pmullw %xmm9,%xmm1
+ .byte 102,65,15,213,192 // pmullw %xmm8,%xmm0
+ .byte 102,15,253,253 // paddw %xmm5,%xmm7
+ .byte 102,15,253,244 // paddw %xmm4,%xmm6
+ .byte 102,65,15,253,192 // paddw %xmm8,%xmm0
+ .byte 102,65,15,253,201 // paddw %xmm9,%xmm1
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
+ .byte 102,15,113,208,8 // psrlw $0x8,%xmm0
+ .byte 102,15,103,199 // packuswb %xmm7,%xmm0
+ .byte 102,15,103,206 // packuswb %xmm6,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 102,15,111,194 // movdqa %xmm2,%xmm0
.byte 255,224 // jmpq *%rax
HIDDEN _sk_multiply_sse41_8bit
.globl _sk_multiply_sse41_8bit
FUNCTION(_sk_multiply_sse41_8bit)
_sk_multiply_sse41_8bit:
- .byte 102,68,15,111,5,140,3,0,0 // movdqa 0x38c(%rip),%xmm8 # 10f0 <_sk_xor__sse41_8bit+0x286>
- .byte 102,15,111,225 // movdqa %xmm1,%xmm4
- .byte 102,15,56,48,209 // pmovzxbw %xmm1,%xmm2
- .byte 102,15,111,233 // movdqa %xmm1,%xmm5
- .byte 102,65,15,56,0,232 // pshufb %xmm8,%xmm5
- .byte 102,69,15,118,201 // pcmpeqd %xmm9,%xmm9
- .byte 102,65,15,239,233 // pxor %xmm9,%xmm5
- .byte 102,69,15,239,210 // pxor %xmm10,%xmm10
- .byte 102,15,111,216 // movdqa %xmm0,%xmm3
- .byte 102,65,15,104,218 // punpckhbw %xmm10,%xmm3
- .byte 102,15,56,48,240 // pmovzxbw %xmm0,%xmm6
- .byte 102,15,56,48,253 // pmovzxbw %xmm5,%xmm7
- .byte 102,65,15,104,234 // punpckhbw %xmm10,%xmm5
- .byte 102,15,213,235 // pmullw %xmm3,%xmm5
- .byte 102,15,213,254 // pmullw %xmm6,%xmm7
- .byte 102,15,253,235 // paddw %xmm3,%xmm5
- .byte 102,15,253,254 // paddw %xmm6,%xmm7
+ .byte 102,68,15,111,211 // movdqa %xmm3,%xmm10
+ .byte 102,15,111,218 // movdqa %xmm2,%xmm3
+ .byte 102,15,111,209 // movdqa %xmm1,%xmm2
+ .byte 102,15,111,200 // movdqa %xmm0,%xmm1
+ .byte 102,68,15,111,53,221,5,0,0 // movdqa 0x5dd(%rip),%xmm14 # 1f80 <_sk_xor__sse41_8bit+0x3a4>
+ .byte 102,68,15,111,195 // movdqa %xmm3,%xmm8
+ .byte 102,15,111,235 // movdqa %xmm3,%xmm5
+ .byte 102,65,15,56,0,238 // pshufb %xmm14,%xmm5
+ .byte 102,65,15,111,250 // movdqa %xmm10,%xmm7
+ .byte 102,65,15,56,0,254 // pshufb %xmm14,%xmm7
+ .byte 102,69,15,118,255 // pcmpeqd %xmm15,%xmm15
+ .byte 102,65,15,239,255 // pxor %xmm15,%xmm7
+ .byte 102,65,15,239,239 // pxor %xmm15,%xmm5
+ .byte 102,15,239,228 // pxor %xmm4,%xmm4
+ .byte 102,68,15,111,233 // movdqa %xmm1,%xmm13
+ .byte 102,68,15,104,236 // punpckhbw %xmm4,%xmm13
+ .byte 102,68,15,111,226 // movdqa %xmm2,%xmm12
+ .byte 102,68,15,104,228 // punpckhbw %xmm4,%xmm12
+ .byte 102,68,15,56,48,217 // pmovzxbw %xmm1,%xmm11
+ .byte 102,68,15,56,48,202 // pmovzxbw %xmm2,%xmm9
+ .byte 102,15,56,48,245 // pmovzxbw %xmm5,%xmm6
+ .byte 102,15,104,236 // punpckhbw %xmm4,%xmm5
+ .byte 102,65,15,213,237 // pmullw %xmm13,%xmm5
+ .byte 102,65,15,213,243 // pmullw %xmm11,%xmm6
+ .byte 102,65,15,253,237 // paddw %xmm13,%xmm5
+ .byte 102,65,15,253,243 // paddw %xmm11,%xmm6
.byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,103,245 // packuswb %xmm5,%xmm6
+ .byte 102,15,56,48,199 // pmovzxbw %xmm7,%xmm0
+ .byte 102,15,104,252 // punpckhbw %xmm4,%xmm7
+ .byte 102,65,15,213,252 // pmullw %xmm12,%xmm7
+ .byte 102,65,15,213,193 // pmullw %xmm9,%xmm0
+ .byte 102,65,15,253,252 // paddw %xmm12,%xmm7
+ .byte 102,65,15,253,193 // paddw %xmm9,%xmm0
.byte 102,15,113,215,8 // psrlw $0x8,%xmm7
- .byte 102,15,103,253 // packuswb %xmm5,%xmm7
- .byte 102,65,15,56,0,192 // pshufb %xmm8,%xmm0
- .byte 102,65,15,239,193 // pxor %xmm9,%xmm0
- .byte 102,65,15,104,226 // punpckhbw %xmm10,%xmm4
- .byte 102,15,56,48,232 // pmovzxbw %xmm0,%xmm5
- .byte 102,65,15,104,194 // punpckhbw %xmm10,%xmm0
- .byte 102,15,213,196 // pmullw %xmm4,%xmm0
- .byte 102,15,213,234 // pmullw %xmm2,%xmm5
- .byte 102,15,253,196 // paddw %xmm4,%xmm0
- .byte 102,15,253,234 // paddw %xmm2,%xmm5
.byte 102,15,113,208,8 // psrlw $0x8,%xmm0
- .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
- .byte 102,15,103,232 // packuswb %xmm0,%xmm5
- .byte 102,15,252,239 // paddb %xmm7,%xmm5
- .byte 102,15,213,227 // pmullw %xmm3,%xmm4
- .byte 102,15,213,214 // pmullw %xmm6,%xmm2
- .byte 102,15,253,227 // paddw %xmm3,%xmm4
- .byte 102,15,253,214 // paddw %xmm6,%xmm2
- .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,15,103,199 // packuswb %xmm7,%xmm0
+ .byte 102,65,15,111,234 // movdqa %xmm10,%xmm5
+ .byte 102,65,15,56,0,206 // pshufb %xmm14,%xmm1
+ .byte 102,65,15,56,0,214 // pshufb %xmm14,%xmm2
+ .byte 102,65,15,239,215 // pxor %xmm15,%xmm2
+ .byte 102,65,15,239,207 // pxor %xmm15,%xmm1
+ .byte 102,68,15,104,196 // punpckhbw %xmm4,%xmm8
+ .byte 102,68,15,104,212 // punpckhbw %xmm4,%xmm10
+ .byte 102,15,56,48,249 // pmovzxbw %xmm1,%xmm7
+ .byte 102,68,15,56,48,242 // pmovzxbw %xmm2,%xmm14
+ .byte 102,15,104,204 // punpckhbw %xmm4,%xmm1
+ .byte 102,15,104,212 // punpckhbw %xmm4,%xmm2
+ .byte 102,68,15,111,251 // movdqa %xmm3,%xmm15
+ .byte 102,65,15,56,48,231 // pmovzxbw %xmm15,%xmm4
+ .byte 102,65,15,213,200 // pmullw %xmm8,%xmm1
+ .byte 102,15,213,252 // pmullw %xmm4,%xmm7
+ .byte 102,65,15,253,200 // paddw %xmm8,%xmm1
+ .byte 102,15,253,252 // paddw %xmm4,%xmm7
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,103,249 // packuswb %xmm1,%xmm7
+ .byte 102,15,111,221 // movdqa %xmm5,%xmm3
+ .byte 102,15,56,48,235 // pmovzxbw %xmm3,%xmm5
+ .byte 102,65,15,213,210 // pmullw %xmm10,%xmm2
+ .byte 102,68,15,213,245 // pmullw %xmm5,%xmm14
+ .byte 102,65,15,253,210 // paddw %xmm10,%xmm2
+ .byte 102,68,15,253,245 // paddw %xmm5,%xmm14
.byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,212 // packuswb %xmm4,%xmm2
- .byte 102,15,252,213 // paddb %xmm5,%xmm2
+ .byte 102,65,15,113,214,8 // psrlw $0x8,%xmm14
+ .byte 102,68,15,103,242 // packuswb %xmm2,%xmm14
+ .byte 102,68,15,252,240 // paddb %xmm0,%xmm14
+ .byte 102,15,252,254 // paddb %xmm6,%xmm7
+ .byte 102,69,15,213,197 // pmullw %xmm13,%xmm8
+ .byte 102,69,15,253,197 // paddw %xmm13,%xmm8
+ .byte 102,69,15,213,212 // pmullw %xmm12,%xmm10
+ .byte 102,69,15,253,212 // paddw %xmm12,%xmm10
+ .byte 102,65,15,213,227 // pmullw %xmm11,%xmm4
+ .byte 102,65,15,253,227 // paddw %xmm11,%xmm4
+ .byte 102,65,15,213,233 // pmullw %xmm9,%xmm5
+ .byte 102,65,15,253,233 // paddw %xmm9,%xmm5
+ .byte 102,65,15,113,208,8 // psrlw $0x8,%xmm8
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,65,15,103,224 // packuswb %xmm8,%xmm4
+ .byte 102,65,15,113,210,8 // psrlw $0x8,%xmm10
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,65,15,103,234 // packuswb %xmm10,%xmm5
+ .byte 102,15,252,231 // paddb %xmm7,%xmm4
+ .byte 102,65,15,252,238 // paddb %xmm14,%xmm5
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 102,15,111,194 // movdqa %xmm2,%xmm0
+ .byte 102,65,15,111,215 // movdqa %xmm15,%xmm2
+ .byte 102,15,111,196 // movdqa %xmm4,%xmm0
+ .byte 102,15,111,205 // movdqa %xmm5,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_screen_sse41_8bit
.globl _sk_screen_sse41_8bit
FUNCTION(_sk_screen_sse41_8bit)
_sk_screen_sse41_8bit:
- .byte 102,15,118,210 // pcmpeqd %xmm2,%xmm2
- .byte 102,15,239,208 // pxor %xmm0,%xmm2
- .byte 102,15,56,48,218 // pmovzxbw %xmm2,%xmm3
- .byte 102,15,239,228 // pxor %xmm4,%xmm4
- .byte 102,15,104,212 // punpckhbw %xmm4,%xmm2
- .byte 102,15,56,48,233 // pmovzxbw %xmm1,%xmm5
- .byte 102,15,111,241 // movdqa %xmm1,%xmm6
- .byte 102,15,104,244 // punpckhbw %xmm4,%xmm6
- .byte 102,15,213,242 // pmullw %xmm2,%xmm6
- .byte 102,15,213,235 // pmullw %xmm3,%xmm5
- .byte 102,15,253,235 // paddw %xmm3,%xmm5
- .byte 102,15,253,242 // paddw %xmm2,%xmm6
- .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,69,15,118,228 // pcmpeqd %xmm12,%xmm12
+ .byte 102,68,15,111,217 // movdqa %xmm1,%xmm11
+ .byte 102,69,15,239,220 // pxor %xmm12,%xmm11
+ .byte 102,68,15,239,224 // pxor %xmm0,%xmm12
+ .byte 102,69,15,56,48,204 // pmovzxbw %xmm12,%xmm9
+ .byte 102,69,15,56,48,195 // pmovzxbw %xmm11,%xmm8
+ .byte 102,69,15,239,210 // pxor %xmm10,%xmm10
+ .byte 102,69,15,104,226 // punpckhbw %xmm10,%xmm12
+ .byte 102,69,15,104,218 // punpckhbw %xmm10,%xmm11
+ .byte 102,15,56,48,242 // pmovzxbw %xmm2,%xmm6
+ .byte 102,15,111,250 // movdqa %xmm2,%xmm7
+ .byte 102,65,15,104,250 // punpckhbw %xmm10,%xmm7
+ .byte 102,15,56,48,235 // pmovzxbw %xmm3,%xmm5
+ .byte 102,15,111,227 // movdqa %xmm3,%xmm4
+ .byte 102,65,15,104,226 // punpckhbw %xmm10,%xmm4
+ .byte 102,65,15,213,227 // pmullw %xmm11,%xmm4
+ .byte 102,65,15,213,252 // pmullw %xmm12,%xmm7
+ .byte 102,65,15,213,232 // pmullw %xmm8,%xmm5
+ .byte 102,65,15,213,241 // pmullw %xmm9,%xmm6
+ .byte 102,65,15,253,241 // paddw %xmm9,%xmm6
+ .byte 102,65,15,253,252 // paddw %xmm12,%xmm7
+ .byte 102,65,15,253,232 // paddw %xmm8,%xmm5
+ .byte 102,65,15,253,227 // paddw %xmm11,%xmm4
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
.byte 102,15,113,213,8 // psrlw $0x8,%xmm5
- .byte 102,15,103,238 // packuswb %xmm6,%xmm5
- .byte 102,15,252,197 // paddb %xmm5,%xmm0
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,103,247 // packuswb %xmm7,%xmm6
+ .byte 102,15,103,236 // packuswb %xmm4,%xmm5
+ .byte 102,15,252,198 // paddb %xmm6,%xmm0
+ .byte 102,15,252,205 // paddb %xmm5,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -59106,50 +61091,81 @@ HIDDEN _sk_xor__sse41_8bit
.globl _sk_xor__sse41_8bit
FUNCTION(_sk_xor__sse41_8bit)
_sk_xor__sse41_8bit:
- .byte 102,68,15,111,5,141,2,0,0 // movdqa 0x28d(%rip),%xmm8 # 1100 <_sk_xor__sse41_8bit+0x296>
- .byte 102,15,111,217 // movdqa %xmm1,%xmm3
- .byte 102,15,56,48,225 // pmovzxbw %xmm1,%xmm4
- .byte 102,15,111,233 // movdqa %xmm1,%xmm5
- .byte 102,65,15,56,0,232 // pshufb %xmm8,%xmm5
- .byte 102,69,15,118,201 // pcmpeqd %xmm9,%xmm9
- .byte 102,65,15,239,233 // pxor %xmm9,%xmm5
- .byte 102,15,239,255 // pxor %xmm7,%xmm7
- .byte 102,15,111,208 // movdqa %xmm0,%xmm2
- .byte 102,15,104,215 // punpckhbw %xmm7,%xmm2
+ .byte 102,68,15,111,21,171,3,0,0 // movdqa 0x3ab(%rip),%xmm10 # 1f90 <_sk_xor__sse41_8bit+0x3b4>
+ .byte 102,68,15,111,226 // movdqa %xmm2,%xmm12
+ .byte 102,68,15,56,48,194 // pmovzxbw %xmm2,%xmm8
+ .byte 102,15,111,234 // movdqa %xmm2,%xmm5
+ .byte 102,65,15,56,0,234 // pshufb %xmm10,%xmm5
+ .byte 102,68,15,111,235 // movdqa %xmm3,%xmm13
+ .byte 102,68,15,56,48,203 // pmovzxbw %xmm3,%xmm9
+ .byte 102,15,111,227 // movdqa %xmm3,%xmm4
+ .byte 102,65,15,56,0,226 // pshufb %xmm10,%xmm4
+ .byte 102,69,15,118,219 // pcmpeqd %xmm11,%xmm11
+ .byte 102,65,15,239,227 // pxor %xmm11,%xmm4
+ .byte 102,65,15,239,235 // pxor %xmm11,%xmm5
+ .byte 102,69,15,239,246 // pxor %xmm14,%xmm14
+ .byte 102,15,111,248 // movdqa %xmm0,%xmm7
+ .byte 102,65,15,104,254 // punpckhbw %xmm14,%xmm7
+ .byte 102,68,15,111,249 // movdqa %xmm1,%xmm15
+ .byte 102,69,15,104,254 // punpckhbw %xmm14,%xmm15
.byte 102,15,56,48,245 // pmovzxbw %xmm5,%xmm6
- .byte 102,15,104,239 // punpckhbw %xmm7,%xmm5
- .byte 102,15,213,234 // pmullw %xmm2,%xmm5
- .byte 102,15,253,234 // paddw %xmm2,%xmm5
- .byte 102,15,56,48,208 // pmovzxbw %xmm0,%xmm2
- .byte 102,15,213,242 // pmullw %xmm2,%xmm6
- .byte 102,15,253,242 // paddw %xmm2,%xmm6
+ .byte 102,65,15,104,238 // punpckhbw %xmm14,%xmm5
+ .byte 102,15,213,239 // pmullw %xmm7,%xmm5
+ .byte 102,15,253,239 // paddw %xmm7,%xmm5
+ .byte 102,15,56,48,252 // pmovzxbw %xmm4,%xmm7
+ .byte 102,65,15,104,230 // punpckhbw %xmm14,%xmm4
+ .byte 102,65,15,213,231 // pmullw %xmm15,%xmm4
+ .byte 102,65,15,253,231 // paddw %xmm15,%xmm4
+ .byte 102,68,15,56,48,248 // pmovzxbw %xmm0,%xmm15
+ .byte 102,65,15,213,247 // pmullw %xmm15,%xmm6
+ .byte 102,65,15,253,247 // paddw %xmm15,%xmm6
+ .byte 102,68,15,56,48,249 // pmovzxbw %xmm1,%xmm15
+ .byte 102,65,15,213,255 // pmullw %xmm15,%xmm7
+ .byte 102,65,15,253,255 // paddw %xmm15,%xmm7
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
.byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
.byte 102,15,113,214,8 // psrlw $0x8,%xmm6
.byte 102,15,103,245 // packuswb %xmm5,%xmm6
- .byte 102,65,15,56,0,192 // pshufb %xmm8,%xmm0
- .byte 102,65,15,239,193 // pxor %xmm9,%xmm0
- .byte 102,15,104,223 // punpckhbw %xmm7,%xmm3
- .byte 102,15,56,48,208 // pmovzxbw %xmm0,%xmm2
- .byte 102,15,104,199 // punpckhbw %xmm7,%xmm0
- .byte 102,15,213,195 // pmullw %xmm3,%xmm0
- .byte 102,15,213,212 // pmullw %xmm4,%xmm2
- .byte 102,15,253,195 // paddw %xmm3,%xmm0
- .byte 102,15,253,212 // paddw %xmm4,%xmm2
+ .byte 102,15,103,252 // packuswb %xmm4,%xmm7
+ .byte 102,65,15,56,0,194 // pshufb %xmm10,%xmm0
+ .byte 102,65,15,56,0,202 // pshufb %xmm10,%xmm1
+ .byte 102,65,15,239,203 // pxor %xmm11,%xmm1
+ .byte 102,65,15,239,195 // pxor %xmm11,%xmm0
+ .byte 102,69,15,104,230 // punpckhbw %xmm14,%xmm12
+ .byte 102,69,15,104,238 // punpckhbw %xmm14,%xmm13
+ .byte 102,15,56,48,224 // pmovzxbw %xmm0,%xmm4
+ .byte 102,15,56,48,233 // pmovzxbw %xmm1,%xmm5
+ .byte 102,65,15,104,198 // punpckhbw %xmm14,%xmm0
+ .byte 102,65,15,104,206 // punpckhbw %xmm14,%xmm1
+ .byte 102,65,15,213,205 // pmullw %xmm13,%xmm1
+ .byte 102,65,15,213,196 // pmullw %xmm12,%xmm0
+ .byte 102,65,15,213,233 // pmullw %xmm9,%xmm5
+ .byte 102,65,15,213,224 // pmullw %xmm8,%xmm4
+ .byte 102,65,15,253,196 // paddw %xmm12,%xmm0
+ .byte 102,65,15,253,205 // paddw %xmm13,%xmm1
+ .byte 102,65,15,253,224 // paddw %xmm8,%xmm4
+ .byte 102,65,15,253,233 // paddw %xmm9,%xmm5
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
.byte 102,15,113,208,8 // psrlw $0x8,%xmm0
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,208 // packuswb %xmm0,%xmm2
- .byte 102,15,252,214 // paddb %xmm6,%xmm2
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,15,103,224 // packuswb %xmm0,%xmm4
+ .byte 102,15,103,233 // packuswb %xmm1,%xmm5
+ .byte 102,15,252,230 // paddb %xmm6,%xmm4
+ .byte 102,15,252,239 // paddb %xmm7,%xmm5
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 102,15,111,194 // movdqa %xmm2,%xmm0
+ .byte 102,15,111,196 // movdqa %xmm4,%xmm0
+ .byte 102,15,111,205 // movdqa %xmm5,%xmm1
.byte 255,224 // jmpq *%rax
BALIGN4
.byte 0,0 // add %al,(%rax)
- .byte 127,67 // jg f53 <_sk_xor__sse41_8bit+0xe9>
+ .byte 127,67 // jg 1d6b <_sk_xor__sse41_8bit+0x18f>
.byte 0,0 // add %al,(%rax)
- .byte 127,67 // jg f57 <_sk_xor__sse41_8bit+0xed>
+ .byte 127,67 // jg 1d6f <_sk_xor__sse41_8bit+0x193>
.byte 0,0 // add %al,(%rax)
- .byte 127,67 // jg f5b <_sk_xor__sse41_8bit+0xf1>
+ .byte 127,67 // jg 1d73 <_sk_xor__sse41_8bit+0x197>
BALIGN16
.byte 0,0 // add %al,(%rax)
@@ -59203,54 +61219,87 @@ BALIGN16
.byte 5,4,7,10,9 // add $0x90a0704,%eax
.byte 8,11 // or %cl,(%rbx)
.byte 14 // (bad)
- .byte 13,12,15,0,4 // or $0x4000f0c,%eax
+ .byte 13,12,15,255,0 // or $0xff0f0c,%eax
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 3,255 // add %edi,%edi
+ .byte 7 // (bad)
+ .byte 255,11 // decl (%rbx)
+ .byte 255,15 // decl (%rdi)
+ .byte 255,11 // decl (%rbx)
+ .byte 255,15 // decl (%rdi)
+ .byte 255,15 // decl (%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,0 // incl (%rax)
+ .byte 2,4,6 // add (%rsi,%rax,1),%al
.byte 0,0 // add %al,(%rax)
.byte 0,0 // add %al,(%rax)
.byte 0,0 // add %al,(%rax)
.byte 0,0 // add %al,(%rax)
.byte 0,0 // add %al,(%rax)
.byte 0,0 // add %al,(%rax)
+ .byte 0,2 // add %al,(%rdx)
.byte 0,0 // add %al,(%rax)
- .byte 0,4,8 // add %al,(%rax,%rcx,1)
- .byte 12,0 // or $0x0,%al
.byte 0,0 // add %al,(%rax)
.byte 0,0 // add %al,(%rax)
.byte 0,0 // add %al,(%rax)
.byte 0,0 // add %al,(%rax)
.byte 0,0 // add %al,(%rax)
- .byte 0,255 // add %bh,%bh
.byte 0,0 // add %al,(%rax)
- .byte 0,255 // add %bh,%bh
+ .byte 0,2 // add %al,(%rdx)
+ .byte 4,6 // add $0x6,%al
+ .byte 8,10 // or %cl,(%rdx)
+ .byte 12,14 // or $0xe,%al
.byte 0,0 // add %al,(%rax)
- .byte 0,255 // add %bh,%bh
.byte 0,0 // add %al,(%rax)
- .byte 0,255 // add %bh,%bh
.byte 0,0 // add %al,(%rax)
- .byte 0,1 // add %al,(%rcx)
+ .byte 0,0 // add %al,(%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
.byte 1,1 // add %eax,(%rcx)
- .byte 0,1 // add %al,(%rcx)
+ .byte 1,0 // add %eax,(%rax)
.byte 1,1 // add %eax,(%rcx)
- .byte 0,1 // add %al,(%rcx)
+ .byte 1,0 // add %eax,(%rax)
.byte 1,1 // add %eax,(%rcx)
- .byte 0,1 // add %al,(%rcx)
+ .byte 1,0 // add %eax,(%rax)
.byte 1,1 // add %eax,(%rcx)
+ .byte 1,0 // add %eax,(%rax)
.byte 0,0 // add %al,(%rax)
+ .byte 0,255 // add %bh,%bh
.byte 0,0 // add %al,(%rax)
- .byte 255,0 // incl (%rax)
+ .byte 0,255 // add %bh,%bh
.byte 0,0 // add %al,(%rax)
- .byte 255,0 // incl (%rax)
+ .byte 0,255 // add %bh,%bh
.byte 0,0 // add %al,(%rax)
+ .byte 0,255 // add %bh,%bh
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
.byte 255,0 // incl (%rax)
- .byte 0,0 // add %al,(%rax)
- .byte 255 // (bad)
.byte 255,0 // incl (%rax)
- .byte 0,0 // add %al,(%rax)
.byte 255,0 // incl (%rax)
- .byte 0,0 // add %al,(%rax)
.byte 255,0 // incl (%rax)
- .byte 0,0 // add %al,(%rax)
.byte 255,0 // incl (%rax)
- .byte 0,0 // add %al,(%rax)
.byte 1,1 // add %eax,(%rcx)
.byte 1,0 // add %eax,(%rax)
.byte 1,1 // add %eax,(%rcx)
@@ -59286,14 +61335,7 @@ BALIGN16
.byte 255,0 // incl (%rax)
.byte 255,0 // incl (%rax)
.byte 255,0 // incl (%rax)
- .byte 255,0 // incl (%rax)
- .byte 0,0 // add %al,(%rax)
- .byte 0,4,4 // add %al,(%rsp,%rax,1)
- .byte 4,4 // add $0x4,%al
- .byte 8,8 // or %cl,(%rax)
- .byte 8,8 // or %cl,(%rax)
- .byte 12,12 // or $0xc,%al
- .byte 12,12 // or $0xc,%al
+ .byte 255 // (bad)
.byte 255,0 // incl (%rax)
.byte 255,0 // incl (%rax)
.byte 255,0 // incl (%rax)
@@ -59302,14 +61344,59 @@ BALIGN16
.byte 255,0 // incl (%rax)
.byte 255,0 // incl (%rax)
.byte 255,0 // incl (%rax)
+ .byte 8,8 // or %cl,(%rax)
+ .byte 8,8 // or %cl,(%rax)
+ .byte 10,10 // or (%rdx),%cl
+ .byte 10,10 // or (%rdx),%cl
+ .byte 12,12 // or $0xc,%al
+ .byte 12,12 // or $0xc,%al
+ .byte 14 // (bad)
+ .byte 14 // (bad)
+ .byte 14 // (bad)
+ .byte 14 // (bad)
.byte 0,0 // add %al,(%rax)
.byte 0,0 // add %al,(%rax)
+ .byte 2,2 // add (%rdx),%al
+ .byte 2,2 // add (%rdx),%al
.byte 4,4 // add $0x4,%al
.byte 4,4 // add $0x4,%al
+ .byte 6 // (bad)
+ .byte 6 // (bad)
+ .byte 6 // (bad)
+ .byte 6 // (bad)
+ .byte 0,128,2,128,4,128 // add %al,-0x7ffb7ffe(%rax)
+ .byte 6 // (bad)
+ .byte 128,4,128,5 // addb $0x5,(%rax,%rax,4)
+ .byte 128,6,128 // addb $0x80,(%rsi)
+ .byte 7 // (bad)
+ .byte 128,255,0 // cmp $0x0,%bh
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
.byte 8,8 // or %cl,(%rax)
.byte 8,8 // or %cl,(%rax)
+ .byte 10,10 // or (%rdx),%cl
+ .byte 10,10 // or (%rdx),%cl
.byte 12,12 // or $0xc,%al
.byte 12,12 // or $0xc,%al
+ .byte 14 // (bad)
+ .byte 14 // (bad)
+ .byte 14 // (bad)
+ .byte 14 // (bad)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 2,2 // add (%rdx),%al
+ .byte 2,2 // add (%rdx),%al
+ .byte 4,4 // add $0x4,%al
+ .byte 4,4 // add $0x4,%al
+ .byte 6 // (bad)
+ .byte 6 // (bad)
+ .byte 6 // (bad)
+ .byte 6 // (bad)
.byte 0,0 // add %al,(%rax)
.byte 0,255 // add %bh,%bh
.byte 0,0 // add %al,(%rax)
@@ -59454,7 +61541,7 @@ _sk_start_pipeline_sse2_8bit:
.byte 73,57,207 // cmp %rcx,%r15
.byte 115,102 // jae 95 <_sk_start_pipeline_sse2_8bit+0x95>
.byte 72,139,69,208 // mov -0x30(%rbp),%rax
- .byte 72,141,64,4 // lea 0x4(%rax),%rax
+ .byte 72,141,64,8 // lea 0x8(%rax),%rax
.byte 72,137,69,176 // mov %rax,-0x50(%rbp)
.byte 76,141,101,184 // lea -0x48(%rbp),%r12
.byte 72,57,93,176 // cmp %rbx,-0x50(%rbp)
@@ -59467,9 +61554,9 @@ _sk_start_pipeline_sse2_8bit:
.byte 76,137,246 // mov %r14,%rsi
.byte 65,255,213 // callq *%r13
.byte 72,139,77,184 // mov -0x48(%rbp),%rcx
- .byte 72,141,65,4 // lea 0x4(%rcx),%rax
+ .byte 72,141,65,8 // lea 0x8(%rcx),%rax
.byte 72,137,69,184 // mov %rax,-0x48(%rbp)
- .byte 72,131,193,8 // add $0x8,%rcx
+ .byte 72,131,193,16 // add $0x10,%rcx
.byte 72,57,217 // cmp %rbx,%rcx
.byte 118,226 // jbe 59 <_sk_start_pipeline_sse2_8bit+0x59>
.byte 72,137,217 // mov %rbx,%rcx
@@ -59505,6 +61592,7 @@ _sk_uniform_color_sse2_8bit:
.byte 102,15,110,64,16 // movd 0x10(%rax),%xmm0
.byte 102,15,112,192,0 // pshufd $0x0,%xmm0,%xmm0
.byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 102,15,111,200 // movdqa %xmm0,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_set_rgb_sse2_8bit
@@ -59512,23 +61600,26 @@ HIDDEN _sk_set_rgb_sse2_8bit
FUNCTION(_sk_set_rgb_sse2_8bit)
_sk_set_rgb_sse2_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 243,15,16,21,1,17,0,0 // movss 0x1101(%rip),%xmm2 # 11c0 <_sk_xor__sse2_8bit+0xc5>
- .byte 243,15,16,24 // movss (%rax),%xmm3
- .byte 243,15,89,218 // mulss %xmm2,%xmm3
- .byte 243,72,15,44,203 // cvttss2si %xmm3,%rcx
- .byte 243,15,16,88,4 // movss 0x4(%rax),%xmm3
- .byte 243,15,89,218 // mulss %xmm2,%xmm3
- .byte 243,72,15,44,211 // cvttss2si %xmm3,%rdx
+ .byte 243,15,16,37,253,33,0,0 // movss 0x21fd(%rip),%xmm4 # 22c0 <_sk_xor__sse2_8bit+0x1a7>
+ .byte 243,15,16,40 // movss (%rax),%xmm5
+ .byte 243,15,89,236 // mulss %xmm4,%xmm5
+ .byte 243,72,15,44,205 // cvttss2si %xmm5,%rcx
+ .byte 243,15,16,104,4 // movss 0x4(%rax),%xmm5
+ .byte 243,15,89,236 // mulss %xmm4,%xmm5
+ .byte 243,72,15,44,213 // cvttss2si %xmm5,%rdx
.byte 193,226,8 // shl $0x8,%edx
.byte 9,202 // or %ecx,%edx
- .byte 243,15,89,80,8 // mulss 0x8(%rax),%xmm2
- .byte 243,72,15,44,194 // cvttss2si %xmm2,%rax
+ .byte 243,15,89,96,8 // mulss 0x8(%rax),%xmm4
+ .byte 243,72,15,44,196 // cvttss2si %xmm4,%rax
.byte 193,224,16 // shl $0x10,%eax
.byte 9,208 // or %edx,%eax
- .byte 102,15,110,208 // movd %eax,%xmm2
- .byte 102,15,112,210,0 // pshufd $0x0,%xmm2,%xmm2
- .byte 102,15,219,5,209,16,0,0 // pand 0x10d1(%rip),%xmm0 # 11d0 <_sk_xor__sse2_8bit+0xd5>
- .byte 102,15,235,194 // por %xmm2,%xmm0
+ .byte 102,15,110,224 // movd %eax,%xmm4
+ .byte 102,15,112,228,0 // pshufd $0x0,%xmm4,%xmm4
+ .byte 102,15,111,45,205,33,0,0 // movdqa 0x21cd(%rip),%xmm5 # 22d0 <_sk_xor__sse2_8bit+0x1b7>
+ .byte 102,15,219,205 // pand %xmm5,%xmm1
+ .byte 102,15,219,197 // pand %xmm5,%xmm0
+ .byte 102,15,235,196 // por %xmm4,%xmm0
+ .byte 102,15,235,204 // por %xmm4,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -59536,44 +61627,74 @@ HIDDEN _sk_premul_sse2_8bit
.globl _sk_premul_sse2_8bit
FUNCTION(_sk_premul_sse2_8bit)
_sk_premul_sse2_8bit:
- .byte 242,15,112,208,231 // pshuflw $0xe7,%xmm0,%xmm2
- .byte 243,15,112,210,231 // pshufhw $0xe7,%xmm2,%xmm2
- .byte 102,15,112,210,232 // pshufd $0xe8,%xmm2,%xmm2
- .byte 102,15,96,210 // punpcklbw %xmm2,%xmm2
- .byte 242,15,112,210,95 // pshuflw $0x5f,%xmm2,%xmm2
- .byte 243,15,112,218,95 // pshufhw $0x5f,%xmm2,%xmm3
- .byte 102,15,235,29,180,16,0,0 // por 0x10b4(%rip),%xmm3 # 11e0 <_sk_xor__sse2_8bit+0xe5>
- .byte 102,15,239,228 // pxor %xmm4,%xmm4
- .byte 102,15,111,208 // movdqa %xmm0,%xmm2
- .byte 102,15,96,212 // punpcklbw %xmm4,%xmm2
- .byte 102,15,104,196 // punpckhbw %xmm4,%xmm0
- .byte 102,15,111,235 // movdqa %xmm3,%xmm5
- .byte 102,15,96,236 // punpcklbw %xmm4,%xmm5
- .byte 102,15,104,220 // punpckhbw %xmm4,%xmm3
- .byte 102,15,213,216 // pmullw %xmm0,%xmm3
- .byte 102,15,213,234 // pmullw %xmm2,%xmm5
- .byte 102,15,253,213 // paddw %xmm5,%xmm2
- .byte 102,15,253,216 // paddw %xmm0,%xmm3
- .byte 102,15,113,211,8 // psrlw $0x8,%xmm3
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,211 // packuswb %xmm3,%xmm2
+ .byte 102,68,15,111,192 // movdqa %xmm0,%xmm8
+ .byte 242,65,15,112,192,231 // pshuflw $0xe7,%xmm8,%xmm0
+ .byte 243,15,112,192,231 // pshufhw $0xe7,%xmm0,%xmm0
+ .byte 102,15,112,192,232 // pshufd $0xe8,%xmm0,%xmm0
+ .byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
+ .byte 242,15,112,192,95 // pshuflw $0x5f,%xmm0,%xmm0
+ .byte 243,15,112,240,95 // pshufhw $0x5f,%xmm0,%xmm6
+ .byte 242,15,112,193,231 // pshuflw $0xe7,%xmm1,%xmm0
+ .byte 243,15,112,192,231 // pshufhw $0xe7,%xmm0,%xmm0
+ .byte 102,15,112,192,232 // pshufd $0xe8,%xmm0,%xmm0
+ .byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
+ .byte 242,15,112,192,95 // pshuflw $0x5f,%xmm0,%xmm0
+ .byte 243,15,112,248,95 // pshufhw $0x5f,%xmm0,%xmm7
+ .byte 102,15,111,5,129,33,0,0 // movdqa 0x2181(%rip),%xmm0 # 22e0 <_sk_xor__sse2_8bit+0x1c7>
+ .byte 102,15,235,248 // por %xmm0,%xmm7
+ .byte 102,15,235,240 // por %xmm0,%xmm6
+ .byte 102,69,15,239,201 // pxor %xmm9,%xmm9
+ .byte 102,65,15,111,192 // movdqa %xmm8,%xmm0
+ .byte 102,65,15,96,193 // punpcklbw %xmm9,%xmm0
+ .byte 102,69,15,104,193 // punpckhbw %xmm9,%xmm8
+ .byte 102,15,111,233 // movdqa %xmm1,%xmm5
+ .byte 102,65,15,96,233 // punpcklbw %xmm9,%xmm5
+ .byte 102,65,15,104,201 // punpckhbw %xmm9,%xmm1
+ .byte 102,15,111,230 // movdqa %xmm6,%xmm4
+ .byte 102,65,15,96,225 // punpcklbw %xmm9,%xmm4
+ .byte 102,65,15,104,241 // punpckhbw %xmm9,%xmm6
+ .byte 102,68,15,111,215 // movdqa %xmm7,%xmm10
+ .byte 102,69,15,96,209 // punpcklbw %xmm9,%xmm10
+ .byte 102,65,15,104,249 // punpckhbw %xmm9,%xmm7
+ .byte 102,15,213,249 // pmullw %xmm1,%xmm7
+ .byte 102,68,15,213,213 // pmullw %xmm5,%xmm10
+ .byte 102,65,15,213,240 // pmullw %xmm8,%xmm6
+ .byte 102,15,213,224 // pmullw %xmm0,%xmm4
+ .byte 102,15,253,196 // paddw %xmm4,%xmm0
+ .byte 102,65,15,253,240 // paddw %xmm8,%xmm6
+ .byte 102,65,15,253,234 // paddw %xmm10,%xmm5
+ .byte 102,15,253,249 // paddw %xmm1,%xmm7
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,113,208,8 // psrlw $0x8,%xmm0
+ .byte 102,15,103,198 // packuswb %xmm6,%xmm0
+ .byte 102,15,103,239 // packuswb %xmm7,%xmm5
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 102,15,111,194 // movdqa %xmm2,%xmm0
+ .byte 102,15,111,205 // movdqa %xmm5,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_swap_rb_sse2_8bit
.globl _sk_swap_rb_sse2_8bit
FUNCTION(_sk_swap_rb_sse2_8bit)
_sk_swap_rb_sse2_8bit:
- .byte 102,15,239,210 // pxor %xmm2,%xmm2
- .byte 102,15,111,216 // movdqa %xmm0,%xmm3
- .byte 102,15,104,218 // punpckhbw %xmm2,%xmm3
- .byte 242,15,112,219,198 // pshuflw $0xc6,%xmm3,%xmm3
- .byte 243,15,112,219,198 // pshufhw $0xc6,%xmm3,%xmm3
- .byte 102,15,96,194 // punpcklbw %xmm2,%xmm0
+ .byte 102,15,239,228 // pxor %xmm4,%xmm4
+ .byte 102,15,111,232 // movdqa %xmm0,%xmm5
+ .byte 102,15,104,236 // punpckhbw %xmm4,%xmm5
+ .byte 242,15,112,237,198 // pshuflw $0xc6,%xmm5,%xmm5
+ .byte 243,15,112,237,198 // pshufhw $0xc6,%xmm5,%xmm5
+ .byte 102,15,96,196 // punpcklbw %xmm4,%xmm0
.byte 242,15,112,192,198 // pshuflw $0xc6,%xmm0,%xmm0
.byte 243,15,112,192,198 // pshufhw $0xc6,%xmm0,%xmm0
- .byte 102,15,103,195 // packuswb %xmm3,%xmm0
+ .byte 102,15,103,197 // packuswb %xmm5,%xmm0
+ .byte 102,15,111,233 // movdqa %xmm1,%xmm5
+ .byte 102,15,104,236 // punpckhbw %xmm4,%xmm5
+ .byte 242,15,112,237,198 // pshuflw $0xc6,%xmm5,%xmm5
+ .byte 243,15,112,237,198 // pshufhw $0xc6,%xmm5,%xmm5
+ .byte 102,15,96,204 // punpcklbw %xmm4,%xmm1
+ .byte 242,15,112,201,198 // pshuflw $0xc6,%xmm1,%xmm1
+ .byte 243,15,112,201,198 // pshufhw $0xc6,%xmm1,%xmm1
+ .byte 102,15,103,205 // packuswb %xmm5,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -59581,8 +61702,9 @@ HIDDEN _sk_invert_sse2_8bit
.globl _sk_invert_sse2_8bit
FUNCTION(_sk_invert_sse2_8bit)
_sk_invert_sse2_8bit:
- .byte 102,15,118,210 // pcmpeqd %xmm2,%xmm2
- .byte 102,15,239,194 // pxor %xmm2,%xmm0
+ .byte 102,15,118,228 // pcmpeqd %xmm4,%xmm4
+ .byte 102,15,239,196 // pxor %xmm4,%xmm0
+ .byte 102,15,239,204 // pxor %xmm4,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -59599,24 +61721,54 @@ _sk_load_8888_sse2_8bit:
.byte 72,193,226,2 // shl $0x2,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,10 // jne 1d1 <_sk_load_8888_sse2_8bit+0x2b>
- .byte 243,66,15,111,4,130 // movdqu (%rdx,%r8,4),%xmm0
+ .byte 117,16 // jne 27f <_sk_load_8888_sse2_8bit+0x31>
+ .byte 66,15,16,76,130,16 // movups 0x10(%rdx,%r8,4),%xmm1
+ .byte 102,66,15,16,4,130 // movupd (%rdx,%r8,4),%xmm0
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,36 // je 1ff <_sk_load_8888_sse2_8bit+0x59>
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,201 // pxor %xmm1,%xmm1
.byte 102,15,239,192 // pxor %xmm0,%xmm0
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,18 // je 1f7 <_sk_load_8888_sse2_8bit+0x51>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,226 // jne 1cd <_sk_load_8888_sse2_8bit+0x27>
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,231 // ja 27b <_sk_load_8888_sse2_8bit+0x2d>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,81,0,0,0 // lea 0x51(%rip),%rcx # 2f0 <_sk_load_8888_sse2_8bit+0xa2>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 243,66,15,16,4,130 // movss (%rdx,%r8,4),%xmm0
+ .byte 235,203 // jmp 27b <_sk_load_8888_sse2_8bit+0x2d>
.byte 102,66,15,110,68,130,8 // movd 0x8(%rdx,%r8,4),%xmm0
.byte 102,15,112,192,69 // pshufd $0x45,%xmm0,%xmm0
.byte 102,66,15,18,4,130 // movlpd (%rdx,%r8,4),%xmm0
- .byte 235,206 // jmp 1cd <_sk_load_8888_sse2_8bit+0x27>
- .byte 102,66,15,110,4,130 // movd (%rdx,%r8,4),%xmm0
- .byte 235,198 // jmp 1cd <_sk_load_8888_sse2_8bit+0x27>
+ .byte 235,183 // jmp 27b <_sk_load_8888_sse2_8bit+0x2d>
+ .byte 102,66,15,110,68,130,24 // movd 0x18(%rdx,%r8,4),%xmm0
+ .byte 102,15,112,200,69 // pshufd $0x45,%xmm0,%xmm1
+ .byte 243,66,15,16,68,130,20 // movss 0x14(%rdx,%r8,4),%xmm0
+ .byte 15,198,193,0 // shufps $0x0,%xmm1,%xmm0
+ .byte 15,198,193,226 // shufps $0xe2,%xmm1,%xmm0
+ .byte 15,40,200 // movaps %xmm0,%xmm1
+ .byte 243,66,15,16,68,130,16 // movss 0x10(%rdx,%r8,4),%xmm0
+ .byte 243,15,16,200 // movss %xmm0,%xmm1
+ .byte 235,134 // jmp 275 <_sk_load_8888_sse2_8bit+0x27>
+ .byte 144 // nop
+ .byte 184,255,255,255,204 // mov $0xccffffff,%eax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,192 // inc %eax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,133,255,255,255,242 // incl -0xd000001(%rbp)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,224 // jmpq *%rax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,212 // callq *%rsp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_load_8888_dst_sse2_8bit
.globl _sk_load_8888_dst_sse2_8bit
@@ -59631,55 +61783,119 @@ _sk_load_8888_dst_sse2_8bit:
.byte 72,193,226,2 // shl $0x2,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,10 // jne 232 <_sk_load_8888_dst_sse2_8bit+0x2b>
- .byte 243,66,15,111,12,130 // movdqu (%rdx,%r8,4),%xmm1
+ .byte 117,16 // jne 33d <_sk_load_8888_dst_sse2_8bit+0x31>
+ .byte 66,15,16,92,130,16 // movups 0x10(%rdx,%r8,4),%xmm3
+ .byte 102,66,15,16,20,130 // movupd (%rdx,%r8,4),%xmm2
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,36 // je 260 <_sk_load_8888_dst_sse2_8bit+0x59>
- .byte 102,15,239,201 // pxor %xmm1,%xmm1
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,18 // je 258 <_sk_load_8888_dst_sse2_8bit+0x51>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,226 // jne 22e <_sk_load_8888_dst_sse2_8bit+0x27>
- .byte 102,66,15,110,76,130,8 // movd 0x8(%rdx,%r8,4),%xmm1
- .byte 102,15,112,201,69 // pshufd $0x45,%xmm1,%xmm1
- .byte 102,66,15,18,12,130 // movlpd (%rdx,%r8,4),%xmm1
- .byte 235,206 // jmp 22e <_sk_load_8888_dst_sse2_8bit+0x27>
- .byte 102,66,15,110,12,130 // movd (%rdx,%r8,4),%xmm1
- .byte 235,198 // jmp 22e <_sk_load_8888_dst_sse2_8bit+0x27>
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,219 // pxor %xmm3,%xmm3
+ .byte 102,15,239,210 // pxor %xmm2,%xmm2
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,231 // ja 339 <_sk_load_8888_dst_sse2_8bit+0x2d>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,83,0,0,0 // lea 0x53(%rip),%rcx # 3b0 <_sk_load_8888_dst_sse2_8bit+0xa4>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 243,66,15,16,20,130 // movss (%rdx,%r8,4),%xmm2
+ .byte 235,203 // jmp 339 <_sk_load_8888_dst_sse2_8bit+0x2d>
+ .byte 102,66,15,110,84,130,8 // movd 0x8(%rdx,%r8,4),%xmm2
+ .byte 102,15,112,210,69 // pshufd $0x45,%xmm2,%xmm2
+ .byte 102,66,15,18,20,130 // movlpd (%rdx,%r8,4),%xmm2
+ .byte 235,183 // jmp 339 <_sk_load_8888_dst_sse2_8bit+0x2d>
+ .byte 102,66,15,110,84,130,24 // movd 0x18(%rdx,%r8,4),%xmm2
+ .byte 102,15,112,218,69 // pshufd $0x45,%xmm2,%xmm3
+ .byte 243,66,15,16,84,130,20 // movss 0x14(%rdx,%r8,4),%xmm2
+ .byte 15,198,211,0 // shufps $0x0,%xmm3,%xmm2
+ .byte 15,198,211,226 // shufps $0xe2,%xmm3,%xmm2
+ .byte 15,40,218 // movaps %xmm2,%xmm3
+ .byte 243,66,15,16,84,130,16 // movss 0x10(%rdx,%r8,4),%xmm2
+ .byte 243,15,16,218 // movss %xmm2,%xmm3
+ .byte 235,134 // jmp 333 <_sk_load_8888_dst_sse2_8bit+0x27>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 182,255 // mov $0xff,%dh
+ .byte 255 // (bad)
+ .byte 255,202 // dec %edx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 190,255,255,255,131 // mov $0x83ffffff,%esi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,240 // push %rax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 222,255 // fdivrp %st,%st(7)
+ .byte 255 // (bad)
+ .byte 255,210 // callq *%rdx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_store_8888_sse2_8bit
.globl _sk_store_8888_sse2_8bit
FUNCTION(_sk_store_8888_sse2_8bit)
_sk_store_8888_sse2_8bit:
- .byte 76,99,7 // movslq (%rdi),%r8
- .byte 76,139,79,16 // mov 0x10(%rdi),%r9
+ .byte 76,99,15 // movslq (%rdi),%r9
+ .byte 76,139,71,16 // mov 0x10(%rdi),%r8
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 72,99,72,8 // movslq 0x8(%rax),%rcx
.byte 72,99,87,8 // movslq 0x8(%rdi),%rdx
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,193,226,2 // shl $0x2,%rdx
.byte 72,3,16 // add (%rax),%rdx
- .byte 77,133,201 // test %r9,%r9
- .byte 117,10 // jne 293 <_sk_store_8888_sse2_8bit+0x2b>
- .byte 243,66,15,127,4,130 // movdqu %xmm0,(%rdx,%r8,4)
+ .byte 77,133,192 // test %r8,%r8
+ .byte 117,17 // jne 3fe <_sk_store_8888_sse2_8bit+0x32>
+ .byte 243,66,15,127,4,138 // movdqu %xmm0,(%rdx,%r9,4)
+ .byte 243,66,15,127,76,138,16 // movdqu %xmm1,0x10(%rdx,%r9,4)
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,32 // je 2bd <_sk_store_8888_sse2_8bit+0x55>
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,18 // je 2b5 <_sk_store_8888_sse2_8bit+0x4d>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,230 // jne 28f <_sk_store_8888_sse2_8bit+0x27>
- .byte 102,15,112,208,78 // pshufd $0x4e,%xmm0,%xmm2
- .byte 102,66,15,126,84,130,8 // movd %xmm2,0x8(%rdx,%r8,4)
- .byte 102,66,15,214,4,130 // movq %xmm0,(%rdx,%r8,4)
- .byte 235,210 // jmp 28f <_sk_store_8888_sse2_8bit+0x27>
- .byte 102,66,15,126,4,130 // movd %xmm0,(%rdx,%r8,4)
- .byte 235,202 // jmp 28f <_sk_store_8888_sse2_8bit+0x27>
+ .byte 65,128,224,7 // and $0x7,%r8b
+ .byte 65,254,200 // dec %r8b
+ .byte 65,128,248,6 // cmp $0x6,%r8b
+ .byte 119,239 // ja 3fa <_sk_store_8888_sse2_8bit+0x2e>
+ .byte 65,15,182,192 // movzbl %r8b,%eax
+ .byte 72,141,13,78,0,0,0 // lea 0x4e(%rip),%rcx # 464 <_sk_store_8888_sse2_8bit+0x98>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 102,66,15,126,4,138 // movd %xmm0,(%rdx,%r9,4)
+ .byte 235,211 // jmp 3fa <_sk_store_8888_sse2_8bit+0x2e>
+ .byte 102,15,112,224,78 // pshufd $0x4e,%xmm0,%xmm4
+ .byte 102,66,15,126,100,138,8 // movd %xmm4,0x8(%rdx,%r9,4)
+ .byte 102,66,15,214,4,138 // movq %xmm0,(%rdx,%r9,4)
+ .byte 235,191 // jmp 3fa <_sk_store_8888_sse2_8bit+0x2e>
+ .byte 102,15,112,225,78 // pshufd $0x4e,%xmm1,%xmm4
+ .byte 102,66,15,126,100,138,24 // movd %xmm4,0x18(%rdx,%r9,4)
+ .byte 102,15,112,225,229 // pshufd $0xe5,%xmm1,%xmm4
+ .byte 102,66,15,126,100,138,20 // movd %xmm4,0x14(%rdx,%r9,4)
+ .byte 102,66,15,126,76,138,16 // movd %xmm1,0x10(%rdx,%r9,4)
+ .byte 243,66,15,127,4,138 // movdqu %xmm0,(%rdx,%r9,4)
+ .byte 235,152 // jmp 3fa <_sk_store_8888_sse2_8bit+0x2e>
+ .byte 102,144 // xchg %ax,%ax
+ .byte 187,255,255,255,207 // mov $0xcfffffff,%ebx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,195 // inc %ebx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,246 // push %rsi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 239 // out %eax,(%dx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,227 // jmpq *%rbx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,215 // callq *%rdi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_load_bgra_sse2_8bit
.globl _sk_load_bgra_sse2_8bit
@@ -59694,33 +61910,76 @@ _sk_load_bgra_sse2_8bit:
.byte 72,193,226,2 // shl $0x2,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,50 // jne 318 <_sk_load_bgra_sse2_8bit+0x53>
- .byte 243,66,15,111,4,130 // movdqu (%rdx,%r8,4),%xmm0
- .byte 102,15,239,210 // pxor %xmm2,%xmm2
- .byte 102,15,111,216 // movdqa %xmm0,%xmm3
- .byte 102,15,104,218 // punpckhbw %xmm2,%xmm3
- .byte 242,15,112,219,198 // pshuflw $0xc6,%xmm3,%xmm3
- .byte 243,15,112,219,198 // pshufhw $0xc6,%xmm3,%xmm3
- .byte 102,15,96,194 // punpcklbw %xmm2,%xmm0
+ .byte 117,92 // jne 4fd <_sk_load_bgra_sse2_8bit+0x7d>
+ .byte 66,15,16,76,130,16 // movups 0x10(%rdx,%r8,4),%xmm1
+ .byte 102,66,15,16,4,130 // movupd (%rdx,%r8,4),%xmm0
+ .byte 102,15,239,228 // pxor %xmm4,%xmm4
+ .byte 102,15,40,232 // movapd %xmm0,%xmm5
+ .byte 102,15,104,236 // punpckhbw %xmm4,%xmm5
+ .byte 242,15,112,237,198 // pshuflw $0xc6,%xmm5,%xmm5
+ .byte 243,15,112,237,198 // pshufhw $0xc6,%xmm5,%xmm5
+ .byte 102,15,96,196 // punpcklbw %xmm4,%xmm0
.byte 242,15,112,192,198 // pshuflw $0xc6,%xmm0,%xmm0
.byte 243,15,112,192,198 // pshufhw $0xc6,%xmm0,%xmm0
- .byte 102,15,103,195 // packuswb %xmm3,%xmm0
+ .byte 102,15,103,197 // packuswb %xmm5,%xmm0
+ .byte 102,15,111,233 // movdqa %xmm1,%xmm5
+ .byte 102,15,104,236 // punpckhbw %xmm4,%xmm5
+ .byte 242,15,112,237,198 // pshuflw $0xc6,%xmm5,%xmm5
+ .byte 243,15,112,237,198 // pshufhw $0xc6,%xmm5,%xmm5
+ .byte 102,15,96,204 // punpcklbw %xmm4,%xmm1
+ .byte 242,15,112,201,198 // pshuflw $0xc6,%xmm1,%xmm1
+ .byte 243,15,112,201,198 // pshufhw $0xc6,%xmm1,%xmm1
+ .byte 102,15,103,205 // packuswb %xmm5,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,36 // je 346 <_sk_load_bgra_sse2_8bit+0x81>
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,201 // pxor %xmm1,%xmm1
.byte 102,15,239,192 // pxor %xmm0,%xmm0
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,18 // je 33e <_sk_load_bgra_sse2_8bit+0x79>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,186 // jne 2ec <_sk_load_bgra_sse2_8bit+0x27>
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,155 // ja 4ad <_sk_load_bgra_sse2_8bit+0x2d>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,91,0,0,0 // lea 0x5b(%rip),%rcx # 578 <_sk_load_bgra_sse2_8bit+0xf8>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 243,66,15,16,4,130 // movss (%rdx,%r8,4),%xmm0
+ .byte 233,124,255,255,255 // jmpq 4ad <_sk_load_bgra_sse2_8bit+0x2d>
.byte 102,66,15,110,68,130,8 // movd 0x8(%rdx,%r8,4),%xmm0
.byte 102,15,112,192,69 // pshufd $0x45,%xmm0,%xmm0
.byte 102,66,15,18,4,130 // movlpd (%rdx,%r8,4),%xmm0
- .byte 235,166 // jmp 2ec <_sk_load_bgra_sse2_8bit+0x27>
- .byte 102,66,15,110,4,130 // movd (%rdx,%r8,4),%xmm0
- .byte 235,158 // jmp 2ec <_sk_load_bgra_sse2_8bit+0x27>
+ .byte 233,101,255,255,255 // jmpq 4ad <_sk_load_bgra_sse2_8bit+0x2d>
+ .byte 102,66,15,110,68,130,24 // movd 0x18(%rdx,%r8,4),%xmm0
+ .byte 102,15,112,200,69 // pshufd $0x45,%xmm0,%xmm1
+ .byte 243,66,15,16,68,130,20 // movss 0x14(%rdx,%r8,4),%xmm0
+ .byte 15,198,193,0 // shufps $0x0,%xmm1,%xmm0
+ .byte 15,198,193,226 // shufps $0xe2,%xmm1,%xmm0
+ .byte 15,40,200 // movaps %xmm0,%xmm1
+ .byte 243,66,15,16,68,130,16 // movss 0x10(%rdx,%r8,4),%xmm0
+ .byte 243,15,16,200 // movss %xmm0,%xmm1
+ .byte 233,49,255,255,255 // jmpq 4a7 <_sk_load_bgra_sse2_8bit+0x27>
+ .byte 102,144 // xchg %ax,%ax
+ .byte 174 // scas %es:(%rdi),%al
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,197 // inc %ebp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 185,255,255,255,47 // mov $0x2fffffff,%ecx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 238 // out %al,(%dx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 220,255 // fdivr %st,%st(7)
+ .byte 255 // (bad)
+ .byte 255,208 // callq *%rax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_load_bgra_dst_sse2_8bit
.globl _sk_load_bgra_dst_sse2_8bit
@@ -59735,74 +61994,157 @@ _sk_load_bgra_dst_sse2_8bit:
.byte 72,193,226,2 // shl $0x2,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,50 // jne 3a1 <_sk_load_bgra_dst_sse2_8bit+0x53>
- .byte 243,66,15,111,12,130 // movdqu (%rdx,%r8,4),%xmm1
- .byte 102,15,239,210 // pxor %xmm2,%xmm2
- .byte 102,15,111,217 // movdqa %xmm1,%xmm3
- .byte 102,15,104,218 // punpckhbw %xmm2,%xmm3
+ .byte 117,92 // jne 611 <_sk_load_bgra_dst_sse2_8bit+0x7d>
+ .byte 66,15,16,92,130,16 // movups 0x10(%rdx,%r8,4),%xmm3
+ .byte 102,66,15,16,20,130 // movupd (%rdx,%r8,4),%xmm2
+ .byte 102,15,239,228 // pxor %xmm4,%xmm4
+ .byte 102,15,40,234 // movapd %xmm2,%xmm5
+ .byte 102,15,104,236 // punpckhbw %xmm4,%xmm5
+ .byte 242,15,112,237,198 // pshuflw $0xc6,%xmm5,%xmm5
+ .byte 243,15,112,237,198 // pshufhw $0xc6,%xmm5,%xmm5
+ .byte 102,15,96,212 // punpcklbw %xmm4,%xmm2
+ .byte 242,15,112,210,198 // pshuflw $0xc6,%xmm2,%xmm2
+ .byte 243,15,112,210,198 // pshufhw $0xc6,%xmm2,%xmm2
+ .byte 102,15,103,213 // packuswb %xmm5,%xmm2
+ .byte 102,15,111,235 // movdqa %xmm3,%xmm5
+ .byte 102,15,104,236 // punpckhbw %xmm4,%xmm5
+ .byte 242,15,112,237,198 // pshuflw $0xc6,%xmm5,%xmm5
+ .byte 243,15,112,237,198 // pshufhw $0xc6,%xmm5,%xmm5
+ .byte 102,15,96,220 // punpcklbw %xmm4,%xmm3
.byte 242,15,112,219,198 // pshuflw $0xc6,%xmm3,%xmm3
.byte 243,15,112,219,198 // pshufhw $0xc6,%xmm3,%xmm3
- .byte 102,15,96,202 // punpcklbw %xmm2,%xmm1
- .byte 242,15,112,201,198 // pshuflw $0xc6,%xmm1,%xmm1
- .byte 243,15,112,201,198 // pshufhw $0xc6,%xmm1,%xmm1
- .byte 102,15,103,203 // packuswb %xmm3,%xmm1
+ .byte 102,15,103,221 // packuswb %xmm5,%xmm3
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,36 // je 3cf <_sk_load_bgra_dst_sse2_8bit+0x81>
- .byte 102,15,239,201 // pxor %xmm1,%xmm1
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,18 // je 3c7 <_sk_load_bgra_dst_sse2_8bit+0x79>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,186 // jne 375 <_sk_load_bgra_dst_sse2_8bit+0x27>
- .byte 102,66,15,110,76,130,8 // movd 0x8(%rdx,%r8,4),%xmm1
- .byte 102,15,112,201,69 // pshufd $0x45,%xmm1,%xmm1
- .byte 102,66,15,18,12,130 // movlpd (%rdx,%r8,4),%xmm1
- .byte 235,166 // jmp 375 <_sk_load_bgra_dst_sse2_8bit+0x27>
- .byte 102,66,15,110,12,130 // movd (%rdx,%r8,4),%xmm1
- .byte 235,158 // jmp 375 <_sk_load_bgra_dst_sse2_8bit+0x27>
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,219 // pxor %xmm3,%xmm3
+ .byte 102,15,239,210 // pxor %xmm2,%xmm2
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,155 // ja 5c1 <_sk_load_bgra_dst_sse2_8bit+0x2d>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,91,0,0,0 // lea 0x5b(%rip),%rcx # 68c <_sk_load_bgra_dst_sse2_8bit+0xf8>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 243,66,15,16,20,130 // movss (%rdx,%r8,4),%xmm2
+ .byte 233,124,255,255,255 // jmpq 5c1 <_sk_load_bgra_dst_sse2_8bit+0x2d>
+ .byte 102,66,15,110,84,130,8 // movd 0x8(%rdx,%r8,4),%xmm2
+ .byte 102,15,112,210,69 // pshufd $0x45,%xmm2,%xmm2
+ .byte 102,66,15,18,20,130 // movlpd (%rdx,%r8,4),%xmm2
+ .byte 233,101,255,255,255 // jmpq 5c1 <_sk_load_bgra_dst_sse2_8bit+0x2d>
+ .byte 102,66,15,110,84,130,24 // movd 0x18(%rdx,%r8,4),%xmm2
+ .byte 102,15,112,218,69 // pshufd $0x45,%xmm2,%xmm3
+ .byte 243,66,15,16,84,130,20 // movss 0x14(%rdx,%r8,4),%xmm2
+ .byte 15,198,211,0 // shufps $0x0,%xmm3,%xmm2
+ .byte 15,198,211,226 // shufps $0xe2,%xmm3,%xmm2
+ .byte 15,40,218 // movaps %xmm2,%xmm3
+ .byte 243,66,15,16,84,130,16 // movss 0x10(%rdx,%r8,4),%xmm2
+ .byte 243,15,16,218 // movss %xmm2,%xmm3
+ .byte 233,49,255,255,255 // jmpq 5bb <_sk_load_bgra_dst_sse2_8bit+0x27>
+ .byte 102,144 // xchg %ax,%ax
+ .byte 174 // scas %es:(%rdi),%al
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,197 // inc %ebp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 185,255,255,255,47 // mov $0x2fffffff,%ecx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 238 // out %al,(%dx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 220,255 // fdivr %st,%st(7)
+ .byte 255 // (bad)
+ .byte 255,208 // callq *%rax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_store_bgra_sse2_8bit
.globl _sk_store_bgra_sse2_8bit
FUNCTION(_sk_store_bgra_sse2_8bit)
_sk_store_bgra_sse2_8bit:
- .byte 76,99,7 // movslq (%rdi),%r8
- .byte 76,139,79,16 // mov 0x10(%rdi),%r9
+ .byte 76,99,15 // movslq (%rdi),%r9
+ .byte 76,139,71,16 // mov 0x10(%rdi),%r8
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 72,99,72,8 // movslq 0x8(%rax),%rcx
.byte 72,99,87,8 // movslq 0x8(%rdi),%rdx
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,193,226,2 // shl $0x2,%rdx
.byte 72,3,16 // add (%rax),%rdx
- .byte 102,15,239,210 // pxor %xmm2,%xmm2
- .byte 102,15,111,216 // movdqa %xmm0,%xmm3
- .byte 102,15,104,218 // punpckhbw %xmm2,%xmm3
- .byte 242,15,112,219,198 // pshuflw $0xc6,%xmm3,%xmm3
- .byte 243,15,112,219,198 // pshufhw $0xc6,%xmm3,%xmm3
- .byte 102,15,111,224 // movdqa %xmm0,%xmm4
- .byte 102,15,96,226 // punpcklbw %xmm2,%xmm4
- .byte 242,15,112,212,198 // pshuflw $0xc6,%xmm4,%xmm2
- .byte 243,15,112,210,198 // pshufhw $0xc6,%xmm2,%xmm2
- .byte 102,15,103,211 // packuswb %xmm3,%xmm2
- .byte 77,133,201 // test %r9,%r9
- .byte 117,10 // jne 42e <_sk_store_bgra_sse2_8bit+0x57>
- .byte 243,66,15,127,20,130 // movdqu %xmm2,(%rdx,%r8,4)
+ .byte 102,15,239,237 // pxor %xmm5,%xmm5
+ .byte 102,15,111,225 // movdqa %xmm1,%xmm4
+ .byte 102,15,104,229 // punpckhbw %xmm5,%xmm4
+ .byte 242,15,112,228,198 // pshuflw $0xc6,%xmm4,%xmm4
+ .byte 243,15,112,244,198 // pshufhw $0xc6,%xmm4,%xmm6
+ .byte 102,15,111,225 // movdqa %xmm1,%xmm4
+ .byte 102,15,96,229 // punpcklbw %xmm5,%xmm4
+ .byte 242,15,112,228,198 // pshuflw $0xc6,%xmm4,%xmm4
+ .byte 243,15,112,228,198 // pshufhw $0xc6,%xmm4,%xmm4
+ .byte 102,15,103,230 // packuswb %xmm6,%xmm4
+ .byte 102,15,111,240 // movdqa %xmm0,%xmm6
+ .byte 102,15,104,245 // punpckhbw %xmm5,%xmm6
+ .byte 242,15,112,246,198 // pshuflw $0xc6,%xmm6,%xmm6
+ .byte 243,15,112,246,198 // pshufhw $0xc6,%xmm6,%xmm6
+ .byte 102,15,111,248 // movdqa %xmm0,%xmm7
+ .byte 102,15,96,253 // punpcklbw %xmm5,%xmm7
+ .byte 242,15,112,239,198 // pshuflw $0xc6,%xmm7,%xmm5
+ .byte 243,15,112,237,198 // pshufhw $0xc6,%xmm5,%xmm5
+ .byte 102,15,103,238 // packuswb %xmm6,%xmm5
+ .byte 77,133,192 // test %r8,%r8
+ .byte 117,17 // jne 72e <_sk_store_bgra_sse2_8bit+0x86>
+ .byte 243,66,15,127,44,138 // movdqu %xmm5,(%rdx,%r9,4)
+ .byte 243,66,15,127,100,138,16 // movdqu %xmm4,0x10(%rdx,%r9,4)
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,32 // je 458 <_sk_store_bgra_sse2_8bit+0x81>
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,18 // je 450 <_sk_store_bgra_sse2_8bit+0x79>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,230 // jne 42a <_sk_store_bgra_sse2_8bit+0x53>
- .byte 102,15,112,218,78 // pshufd $0x4e,%xmm2,%xmm3
- .byte 102,66,15,126,92,130,8 // movd %xmm3,0x8(%rdx,%r8,4)
- .byte 102,66,15,214,20,130 // movq %xmm2,(%rdx,%r8,4)
- .byte 235,210 // jmp 42a <_sk_store_bgra_sse2_8bit+0x53>
- .byte 102,66,15,126,20,130 // movd %xmm2,(%rdx,%r8,4)
- .byte 235,202 // jmp 42a <_sk_store_bgra_sse2_8bit+0x53>
+ .byte 65,128,224,7 // and $0x7,%r8b
+ .byte 65,254,200 // dec %r8b
+ .byte 65,128,248,6 // cmp $0x6,%r8b
+ .byte 119,239 // ja 72a <_sk_store_bgra_sse2_8bit+0x82>
+ .byte 65,15,182,192 // movzbl %r8b,%eax
+ .byte 72,141,13,78,0,0,0 // lea 0x4e(%rip),%rcx # 794 <_sk_store_bgra_sse2_8bit+0xec>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 102,66,15,126,44,138 // movd %xmm5,(%rdx,%r9,4)
+ .byte 235,211 // jmp 72a <_sk_store_bgra_sse2_8bit+0x82>
+ .byte 102,15,112,229,78 // pshufd $0x4e,%xmm5,%xmm4
+ .byte 102,66,15,126,100,138,8 // movd %xmm4,0x8(%rdx,%r9,4)
+ .byte 102,66,15,214,44,138 // movq %xmm5,(%rdx,%r9,4)
+ .byte 235,191 // jmp 72a <_sk_store_bgra_sse2_8bit+0x82>
+ .byte 102,15,112,244,78 // pshufd $0x4e,%xmm4,%xmm6
+ .byte 102,66,15,126,116,138,24 // movd %xmm6,0x18(%rdx,%r9,4)
+ .byte 102,15,112,244,229 // pshufd $0xe5,%xmm4,%xmm6
+ .byte 102,66,15,126,116,138,20 // movd %xmm6,0x14(%rdx,%r9,4)
+ .byte 102,66,15,126,100,138,16 // movd %xmm4,0x10(%rdx,%r9,4)
+ .byte 243,66,15,127,44,138 // movdqu %xmm5,(%rdx,%r9,4)
+ .byte 235,152 // jmp 72a <_sk_store_bgra_sse2_8bit+0x82>
+ .byte 102,144 // xchg %ax,%ax
+ .byte 187,255,255,255,207 // mov $0xcfffffff,%ebx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,195 // inc %ebx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,246 // push %rsi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 239 // out %eax,(%dx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,227 // jmpq *%rbx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,215 // callq *%rdi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_load_a8_sse2_8bit
.globl _sk_load_a8_sse2_8bit
@@ -59816,33 +62158,70 @@ _sk_load_a8_sse2_8bit:
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,23 // jne 494 <_sk_load_a8_sse2_8bit+0x34>
- .byte 102,66,15,110,4,2 // movd (%rdx,%r8,1),%xmm0
+ .byte 117,48 // jne 7fd <_sk_load_a8_sse2_8bit+0x4d>
+ .byte 243,66,15,126,4,2 // movq (%rdx,%r8,1),%xmm0
.byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
- .byte 102,15,97,192 // punpcklwd %xmm0,%xmm0
+ .byte 102,15,84,5,17,27,0,0 // andpd 0x1b11(%rip),%xmm0 # 22f0 <_sk_xor__sse2_8bit+0x1d7>
+ .byte 102,15,239,228 // pxor %xmm4,%xmm4
+ .byte 102,15,40,200 // movapd %xmm0,%xmm1
+ .byte 102,15,105,204 // punpckhwd %xmm4,%xmm1
+ .byte 102,15,97,196 // punpcklwd %xmm4,%xmm0
.byte 102,15,114,240,24 // pslld $0x18,%xmm0
+ .byte 102,15,114,241,24 // pslld $0x18,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,54 // je 4d4 <_sk_load_a8_sse2_8bit+0x74>
+ .byte 65,128,225,7 // and $0x7,%r9b
.byte 102,15,239,192 // pxor %xmm0,%xmm0
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,21 // je 4bd <_sk_load_a8_sse2_8bit+0x5d>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,221 // jne 48b <_sk_load_a8_sse2_8bit+0x2b>
- .byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
- .byte 102,15,110,192 // movd %eax,%xmm0
- .byte 102,15,112,192,69 // pshufd $0x45,%xmm0,%xmm0
- .byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
- .byte 102,15,110,208 // movd %eax,%xmm2
- .byte 102,15,96,208 // punpcklbw %xmm0,%xmm2
- .byte 102,15,97,208 // punpcklwd %xmm0,%xmm2
- .byte 242,15,16,194 // movsd %xmm2,%xmm0
- .byte 235,183 // jmp 48b <_sk_load_a8_sse2_8bit+0x2b>
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,201 // ja 7d7 <_sk_load_a8_sse2_8bit+0x27>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,111,0,0,0 // lea 0x6f(%rip),%rcx # 888 <_sk_load_a8_sse2_8bit+0xd8>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
.byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
.byte 102,15,110,192 // movd %eax,%xmm0
- .byte 235,172 // jmp 48b <_sk_load_a8_sse2_8bit+0x2b>
+ .byte 235,170 // jmp 7d7 <_sk_load_a8_sse2_8bit+0x27>
+ .byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
+ .byte 102,15,239,192 // pxor %xmm0,%xmm0
+ .byte 102,15,196,192,2 // pinsrw $0x2,%eax,%xmm0
+ .byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
+ .byte 102,15,110,200 // movd %eax,%xmm1
+ .byte 102,15,96,200 // punpcklbw %xmm0,%xmm1
+ .byte 243,15,16,193 // movss %xmm1,%xmm0
+ .byte 235,136 // jmp 7d7 <_sk_load_a8_sse2_8bit+0x27>
+ .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax
+ .byte 102,15,239,192 // pxor %xmm0,%xmm0
+ .byte 102,15,196,192,6 // pinsrw $0x6,%eax,%xmm0
+ .byte 66,15,182,68,2,5 // movzbl 0x5(%rdx,%r8,1),%eax
+ .byte 102,15,196,192,5 // pinsrw $0x5,%eax,%xmm0
+ .byte 66,15,182,68,2,4 // movzbl 0x4(%rdx,%r8,1),%eax
+ .byte 102,15,196,192,4 // pinsrw $0x4,%eax,%xmm0
+ .byte 102,66,15,110,12,2 // movd (%rdx,%r8,1),%xmm1
+ .byte 102,15,96,200 // punpcklbw %xmm0,%xmm1
+ .byte 242,15,16,193 // movsd %xmm1,%xmm0
+ .byte 233,80,255,255,255 // jmpq 7d7 <_sk_load_a8_sse2_8bit+0x27>
+ .byte 144 // nop
+ .byte 154 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,180,255,255,255,165,255 // pushq -0x5a0001(%rdi,%rdi,8)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 236 // in (%dx),%al
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,225 // jmpq *%rcx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,214 // callq *%rsi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,199 // inc %edi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_load_a8_dst_sse2_8bit
.globl _sk_load_a8_dst_sse2_8bit
@@ -59856,33 +62235,70 @@ _sk_load_a8_dst_sse2_8bit:
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,23 // jne 513 <_sk_load_a8_dst_sse2_8bit+0x34>
- .byte 102,66,15,110,12,2 // movd (%rdx,%r8,1),%xmm1
- .byte 102,15,96,200 // punpcklbw %xmm0,%xmm1
- .byte 102,15,97,200 // punpcklwd %xmm0,%xmm1
- .byte 102,15,114,241,24 // pslld $0x18,%xmm1
+ .byte 117,48 // jne 8f1 <_sk_load_a8_dst_sse2_8bit+0x4d>
+ .byte 243,66,15,126,20,2 // movq (%rdx,%r8,1),%xmm2
+ .byte 102,15,96,208 // punpcklbw %xmm0,%xmm2
+ .byte 102,15,84,21,45,26,0,0 // andpd 0x1a2d(%rip),%xmm2 # 2300 <_sk_xor__sse2_8bit+0x1e7>
+ .byte 102,15,239,228 // pxor %xmm4,%xmm4
+ .byte 102,15,40,218 // movapd %xmm2,%xmm3
+ .byte 102,15,105,220 // punpckhwd %xmm4,%xmm3
+ .byte 102,15,97,212 // punpcklwd %xmm4,%xmm2
+ .byte 102,15,114,242,24 // pslld $0x18,%xmm2
+ .byte 102,15,114,243,24 // pslld $0x18,%xmm3
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,54 // je 553 <_sk_load_a8_dst_sse2_8bit+0x74>
- .byte 102,15,239,201 // pxor %xmm1,%xmm1
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,21 // je 53c <_sk_load_a8_dst_sse2_8bit+0x5d>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,221 // jne 50a <_sk_load_a8_dst_sse2_8bit+0x2b>
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,210 // pxor %xmm2,%xmm2
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,201 // ja 8cb <_sk_load_a8_dst_sse2_8bit+0x27>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,111,0,0,0 // lea 0x6f(%rip),%rcx # 97c <_sk_load_a8_dst_sse2_8bit+0xd8>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
+ .byte 102,15,110,208 // movd %eax,%xmm2
+ .byte 235,170 // jmp 8cb <_sk_load_a8_dst_sse2_8bit+0x27>
.byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
- .byte 102,15,110,200 // movd %eax,%xmm1
- .byte 102,15,112,201,69 // pshufd $0x45,%xmm1,%xmm1
+ .byte 102,15,239,210 // pxor %xmm2,%xmm2
+ .byte 102,15,196,208,2 // pinsrw $0x2,%eax,%xmm2
.byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
- .byte 102,15,110,208 // movd %eax,%xmm2
- .byte 102,15,96,208 // punpcklbw %xmm0,%xmm2
- .byte 102,15,97,208 // punpcklwd %xmm0,%xmm2
- .byte 242,15,16,202 // movsd %xmm2,%xmm1
- .byte 235,183 // jmp 50a <_sk_load_a8_dst_sse2_8bit+0x2b>
- .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
- .byte 102,15,110,200 // movd %eax,%xmm1
- .byte 235,172 // jmp 50a <_sk_load_a8_dst_sse2_8bit+0x2b>
+ .byte 102,15,110,216 // movd %eax,%xmm3
+ .byte 102,15,96,216 // punpcklbw %xmm0,%xmm3
+ .byte 243,15,16,211 // movss %xmm3,%xmm2
+ .byte 235,136 // jmp 8cb <_sk_load_a8_dst_sse2_8bit+0x27>
+ .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax
+ .byte 102,15,239,210 // pxor %xmm2,%xmm2
+ .byte 102,15,196,208,6 // pinsrw $0x6,%eax,%xmm2
+ .byte 66,15,182,68,2,5 // movzbl 0x5(%rdx,%r8,1),%eax
+ .byte 102,15,196,208,5 // pinsrw $0x5,%eax,%xmm2
+ .byte 66,15,182,68,2,4 // movzbl 0x4(%rdx,%r8,1),%eax
+ .byte 102,15,196,208,4 // pinsrw $0x4,%eax,%xmm2
+ .byte 102,66,15,110,28,2 // movd (%rdx,%r8,1),%xmm3
+ .byte 102,15,96,216 // punpcklbw %xmm0,%xmm3
+ .byte 242,15,16,211 // movsd %xmm3,%xmm2
+ .byte 233,80,255,255,255 // jmpq 8cb <_sk_load_a8_dst_sse2_8bit+0x27>
+ .byte 144 // nop
+ .byte 154 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,180,255,255,255,165,255 // pushq -0x5a0001(%rdi,%rdi,8)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 236 // in (%dx),%al
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,225 // jmpq *%rcx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,214 // callq *%rsi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,199 // inc %edi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_store_a8_sse2_8bit
.globl _sk_store_a8_sse2_8bit
@@ -59895,36 +62311,73 @@ _sk_store_a8_sse2_8bit:
.byte 72,99,87,8 // movslq 0x8(%rdi),%rdx
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
- .byte 102,15,111,208 // movdqa %xmm0,%xmm2
- .byte 102,15,114,210,24 // psrld $0x18,%xmm2
+ .byte 102,15,111,224 // movdqa %xmm0,%xmm4
+ .byte 102,15,114,212,24 // psrld $0x18,%xmm4
+ .byte 102,15,111,233 // movdqa %xmm1,%xmm5
+ .byte 102,15,114,213,24 // psrld $0x18,%xmm5
+ .byte 102,15,114,245,16 // pslld $0x10,%xmm5
+ .byte 102,15,114,229,16 // psrad $0x10,%xmm5
+ .byte 102,15,114,244,16 // pslld $0x10,%xmm4
+ .byte 102,15,114,228,16 // psrad $0x10,%xmm4
+ .byte 102,15,107,229 // packssdw %xmm5,%xmm4
.byte 77,133,201 // test %r9,%r9
- .byte 117,26 // jne 59e <_sk_store_a8_sse2_8bit+0x40>
- .byte 102,15,219,21,100,12,0,0 // pand 0xc64(%rip),%xmm2 # 11f0 <_sk_xor__sse2_8bit+0xf5>
- .byte 102,15,103,210 // packuswb %xmm2,%xmm2
- .byte 102,15,103,210 // packuswb %xmm2,%xmm2
- .byte 102,66,15,126,20,2 // movd %xmm2,(%rdx,%r8,1)
- .byte 72,173 // lods %ds:(%rsi),%rax
- .byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,54 // je 5de <_sk_store_a8_sse2_8bit+0x80>
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,21 // je 5c3 <_sk_store_a8_sse2_8bit+0x65>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,230 // jne 59a <_sk_store_a8_sse2_8bit+0x3c>
- .byte 102,15,127,84,36,232 // movdqa %xmm2,-0x18(%rsp)
- .byte 138,68,36,240 // mov -0x10(%rsp),%al
+ .byte 117,22 // jne 9f5 <_sk_store_a8_sse2_8bit+0x5d>
+ .byte 102,15,219,37,41,25,0,0 // pand 0x1929(%rip),%xmm4 # 2310 <_sk_xor__sse2_8bit+0x1f7>
+ .byte 102,15,103,228 // packuswb %xmm4,%xmm4
+ .byte 102,66,15,214,36,2 // movq %xmm4,(%rdx,%r8,1)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,239 // ja 9f1 <_sk_store_a8_sse2_8bit+0x59>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,131,0,0,0 // lea 0x83(%rip),%rcx # a90 <_sk_store_a8_sse2_8bit+0xf8>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 102,15,127,100,36,168 // movdqa %xmm4,-0x58(%rsp)
+ .byte 138,68,36,168 // mov -0x58(%rsp),%al
+ .byte 66,136,4,2 // mov %al,(%rdx,%r8,1)
+ .byte 235,203 // jmp 9f1 <_sk_store_a8_sse2_8bit+0x59>
+ .byte 102,15,127,100,36,184 // movdqa %xmm4,-0x48(%rsp)
+ .byte 138,68,36,188 // mov -0x44(%rsp),%al
.byte 66,136,68,2,2 // mov %al,0x2(%rdx,%r8,1)
- .byte 102,15,219,21,37,12,0,0 // pand 0xc25(%rip),%xmm2 # 11f0 <_sk_xor__sse2_8bit+0xf5>
- .byte 102,15,103,210 // packuswb %xmm2,%xmm2
- .byte 102,15,103,210 // packuswb %xmm2,%xmm2
- .byte 102,15,126,208 // movd %xmm2,%eax
+ .byte 102,15,219,37,211,24,0,0 // pand 0x18d3(%rip),%xmm4 # 2310 <_sk_xor__sse2_8bit+0x1f7>
+ .byte 102,15,103,228 // packuswb %xmm4,%xmm4
+ .byte 102,15,126,224 // movd %xmm4,%eax
.byte 102,66,137,4,2 // mov %ax,(%rdx,%r8,1)
- .byte 235,188 // jmp 59a <_sk_store_a8_sse2_8bit+0x3c>
- .byte 102,15,127,84,36,216 // movdqa %xmm2,-0x28(%rsp)
- .byte 138,68,36,216 // mov -0x28(%rsp),%al
- .byte 66,136,4,2 // mov %al,(%rdx,%r8,1)
- .byte 235,172 // jmp 59a <_sk_store_a8_sse2_8bit+0x3c>
+ .byte 235,165 // jmp 9f1 <_sk_store_a8_sse2_8bit+0x59>
+ .byte 102,15,127,100,36,232 // movdqa %xmm4,-0x18(%rsp)
+ .byte 138,68,36,244 // mov -0xc(%rsp),%al
+ .byte 66,136,68,2,6 // mov %al,0x6(%rdx,%r8,1)
+ .byte 102,15,127,100,36,216 // movdqa %xmm4,-0x28(%rsp)
+ .byte 138,68,36,226 // mov -0x1e(%rsp),%al
+ .byte 66,136,68,2,5 // mov %al,0x5(%rdx,%r8,1)
+ .byte 102,15,127,100,36,200 // movdqa %xmm4,-0x38(%rsp)
+ .byte 138,68,36,208 // mov -0x30(%rsp),%al
+ .byte 66,136,68,2,4 // mov %al,0x4(%rdx,%r8,1)
+ .byte 102,15,219,37,143,24,0,0 // pand 0x188f(%rip),%xmm4 # 2310 <_sk_xor__sse2_8bit+0x1f7>
+ .byte 102,15,103,228 // packuswb %xmm4,%xmm4
+ .byte 102,66,15,126,36,2 // movd %xmm4,(%rdx,%r8,1)
+ .byte 233,97,255,255,255 // jmpq 9f1 <_sk_store_a8_sse2_8bit+0x59>
+ .byte 134,255 // xchg %bh,%bh
+ .byte 255 // (bad)
+ .byte 255,165,255,255,255,150 // jmpq *-0x69000001(%rbp)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 233,255,255,255,218 // jmpq ffffffffdb000aa0 <_sk_xor__sse2_8bit+0xffffffffdaffe987>
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,203 // dec %ebx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 188 // .byte 0xbc
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_load_g8_sse2_8bit
.globl _sk_load_g8_sse2_8bit
@@ -59938,41 +62391,85 @@ _sk_load_g8_sse2_8bit:
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,69 // jne 650 <_sk_load_g8_sse2_8bit+0x62>
- .byte 102,66,15,110,4,2 // movd (%rdx,%r8,1),%xmm0
+ .byte 117,116 // jne b3d <_sk_load_g8_sse2_8bit+0x91>
+ .byte 243,66,15,126,4,2 // movq (%rdx,%r8,1),%xmm0
.byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
- .byte 102,15,97,192 // punpcklwd %xmm0,%xmm0
- .byte 102,15,219,5,223,11,0,0 // pand 0xbdf(%rip),%xmm0 # 1200 <_sk_xor__sse2_8bit+0x105>
- .byte 102,15,111,21,231,11,0,0 // movdqa 0xbe7(%rip),%xmm2 # 1210 <_sk_xor__sse2_8bit+0x115>
- .byte 102,15,112,216,245 // pshufd $0xf5,%xmm0,%xmm3
- .byte 102,15,244,194 // pmuludq %xmm2,%xmm0
- .byte 102,15,112,192,232 // pshufd $0xe8,%xmm0,%xmm0
- .byte 102,15,244,218 // pmuludq %xmm2,%xmm3
- .byte 102,15,112,211,232 // pshufd $0xe8,%xmm3,%xmm2
- .byte 102,15,98,194 // punpckldq %xmm2,%xmm0
- .byte 102,15,235,5,212,11,0,0 // por 0xbd4(%rip),%xmm0 # 1220 <_sk_xor__sse2_8bit+0x125>
+ .byte 102,15,84,5,69,24,0,0 // andpd 0x1845(%rip),%xmm0 # 2320 <_sk_xor__sse2_8bit+0x207>
+ .byte 102,15,239,201 // pxor %xmm1,%xmm1
+ .byte 102,15,40,224 // movapd %xmm0,%xmm4
+ .byte 102,15,97,225 // punpcklwd %xmm1,%xmm4
+ .byte 102,15,105,193 // punpckhwd %xmm1,%xmm0
+ .byte 102,15,111,45,61,24,0,0 // movdqa 0x183d(%rip),%xmm5 # 2330 <_sk_xor__sse2_8bit+0x217>
+ .byte 102,15,112,240,245 // pshufd $0xf5,%xmm0,%xmm6
+ .byte 102,15,244,197 // pmuludq %xmm5,%xmm0
+ .byte 102,15,112,200,232 // pshufd $0xe8,%xmm0,%xmm1
+ .byte 102,15,244,245 // pmuludq %xmm5,%xmm6
+ .byte 102,15,112,198,232 // pshufd $0xe8,%xmm6,%xmm0
+ .byte 102,15,98,200 // punpckldq %xmm0,%xmm1
+ .byte 102,15,112,244,245 // pshufd $0xf5,%xmm4,%xmm6
+ .byte 102,15,244,229 // pmuludq %xmm5,%xmm4
+ .byte 102,15,112,196,232 // pshufd $0xe8,%xmm4,%xmm0
+ .byte 102,15,244,245 // pmuludq %xmm5,%xmm6
+ .byte 102,15,112,230,232 // pshufd $0xe8,%xmm6,%xmm4
+ .byte 102,15,98,196 // punpckldq %xmm4,%xmm0
+ .byte 102,15,111,37,15,24,0,0 // movdqa 0x180f(%rip),%xmm4 # 2340 <_sk_xor__sse2_8bit+0x227>
+ .byte 102,15,235,196 // por %xmm4,%xmm0
+ .byte 102,15,235,204 // por %xmm4,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,54 // je 690 <_sk_load_g8_sse2_8bit+0xa2>
+ .byte 65,128,225,7 // and $0x7,%r9b
.byte 102,15,239,192 // pxor %xmm0,%xmm0
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,21 // je 679 <_sk_load_g8_sse2_8bit+0x8b>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,175 // jne 619 <_sk_load_g8_sse2_8bit+0x2b>
- .byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
- .byte 102,15,110,192 // movd %eax,%xmm0
- .byte 102,15,112,192,69 // pshufd $0x45,%xmm0,%xmm0
- .byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
- .byte 102,15,110,208 // movd %eax,%xmm2
- .byte 102,15,96,208 // punpcklbw %xmm0,%xmm2
- .byte 102,15,97,208 // punpcklwd %xmm0,%xmm2
- .byte 242,15,16,194 // movsd %xmm2,%xmm0
- .byte 235,137 // jmp 619 <_sk_load_g8_sse2_8bit+0x2b>
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,133 // ja ad3 <_sk_load_g8_sse2_8bit+0x27>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,119,0,0,0 // lea 0x77(%rip),%rcx # bd0 <_sk_load_g8_sse2_8bit+0x124>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
.byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
.byte 102,15,110,192 // movd %eax,%xmm0
- .byte 233,123,255,255,255 // jmpq 619 <_sk_load_g8_sse2_8bit+0x2b>
+ .byte 233,99,255,255,255 // jmpq ad3 <_sk_load_g8_sse2_8bit+0x27>
+ .byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
+ .byte 102,15,239,192 // pxor %xmm0,%xmm0
+ .byte 102,15,196,192,2 // pinsrw $0x2,%eax,%xmm0
+ .byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
+ .byte 102,15,110,200 // movd %eax,%xmm1
+ .byte 102,15,96,200 // punpcklbw %xmm0,%xmm1
+ .byte 243,15,16,193 // movss %xmm1,%xmm0
+ .byte 233,62,255,255,255 // jmpq ad3 <_sk_load_g8_sse2_8bit+0x27>
+ .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax
+ .byte 102,15,239,192 // pxor %xmm0,%xmm0
+ .byte 102,15,196,192,6 // pinsrw $0x6,%eax,%xmm0
+ .byte 66,15,182,68,2,5 // movzbl 0x5(%rdx,%r8,1),%eax
+ .byte 102,15,196,192,5 // pinsrw $0x5,%eax,%xmm0
+ .byte 66,15,182,68,2,4 // movzbl 0x4(%rdx,%r8,1),%eax
+ .byte 102,15,196,192,4 // pinsrw $0x4,%eax,%xmm0
+ .byte 102,66,15,110,12,2 // movd (%rdx,%r8,1),%xmm1
+ .byte 102,15,96,200 // punpcklbw %xmm0,%xmm1
+ .byte 242,15,16,193 // movsd %xmm1,%xmm0
+ .byte 233,6,255,255,255 // jmpq ad3 <_sk_load_g8_sse2_8bit+0x27>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 146 // xchg %eax,%edx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,175,255,255,255,160 // ljmp *-0x5f000001(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 234 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 223,255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,212 // callq *%rsp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,197 // inc %ebp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_load_g8_dst_sse2_8bit
.globl _sk_load_g8_dst_sse2_8bit
@@ -59986,141 +62483,275 @@ _sk_load_g8_dst_sse2_8bit:
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,69 // jne 700 <_sk_load_g8_dst_sse2_8bit+0x62>
- .byte 102,66,15,110,12,2 // movd (%rdx,%r8,1),%xmm1
- .byte 102,15,96,200 // punpcklbw %xmm0,%xmm1
- .byte 102,15,97,200 // punpcklwd %xmm0,%xmm1
- .byte 102,15,219,13,95,11,0,0 // pand 0xb5f(%rip),%xmm1 # 1230 <_sk_xor__sse2_8bit+0x135>
- .byte 102,15,111,21,103,11,0,0 // movdqa 0xb67(%rip),%xmm2 # 1240 <_sk_xor__sse2_8bit+0x145>
- .byte 102,15,112,217,245 // pshufd $0xf5,%xmm1,%xmm3
- .byte 102,15,244,202 // pmuludq %xmm2,%xmm1
- .byte 102,15,112,201,232 // pshufd $0xe8,%xmm1,%xmm1
- .byte 102,15,244,218 // pmuludq %xmm2,%xmm3
- .byte 102,15,112,211,232 // pshufd $0xe8,%xmm3,%xmm2
- .byte 102,15,98,202 // punpckldq %xmm2,%xmm1
- .byte 102,15,235,13,84,11,0,0 // por 0xb54(%rip),%xmm1 # 1250 <_sk_xor__sse2_8bit+0x155>
+ .byte 117,116 // jne c7d <_sk_load_g8_dst_sse2_8bit+0x91>
+ .byte 243,66,15,126,20,2 // movq (%rdx,%r8,1),%xmm2
+ .byte 102,15,96,208 // punpcklbw %xmm0,%xmm2
+ .byte 102,15,84,21,53,23,0,0 // andpd 0x1735(%rip),%xmm2 # 2350 <_sk_xor__sse2_8bit+0x237>
+ .byte 102,15,239,219 // pxor %xmm3,%xmm3
+ .byte 102,15,40,226 // movapd %xmm2,%xmm4
+ .byte 102,15,97,227 // punpcklwd %xmm3,%xmm4
+ .byte 102,15,105,211 // punpckhwd %xmm3,%xmm2
+ .byte 102,15,111,45,45,23,0,0 // movdqa 0x172d(%rip),%xmm5 # 2360 <_sk_xor__sse2_8bit+0x247>
+ .byte 102,15,112,242,245 // pshufd $0xf5,%xmm2,%xmm6
+ .byte 102,15,244,213 // pmuludq %xmm5,%xmm2
+ .byte 102,15,112,218,232 // pshufd $0xe8,%xmm2,%xmm3
+ .byte 102,15,244,245 // pmuludq %xmm5,%xmm6
+ .byte 102,15,112,214,232 // pshufd $0xe8,%xmm6,%xmm2
+ .byte 102,15,98,218 // punpckldq %xmm2,%xmm3
+ .byte 102,15,112,244,245 // pshufd $0xf5,%xmm4,%xmm6
+ .byte 102,15,244,229 // pmuludq %xmm5,%xmm4
+ .byte 102,15,112,212,232 // pshufd $0xe8,%xmm4,%xmm2
+ .byte 102,15,244,245 // pmuludq %xmm5,%xmm6
+ .byte 102,15,112,230,232 // pshufd $0xe8,%xmm6,%xmm4
+ .byte 102,15,98,212 // punpckldq %xmm4,%xmm2
+ .byte 102,15,111,37,255,22,0,0 // movdqa 0x16ff(%rip),%xmm4 # 2370 <_sk_xor__sse2_8bit+0x257>
+ .byte 102,15,235,212 // por %xmm4,%xmm2
+ .byte 102,15,235,220 // por %xmm4,%xmm3
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,54 // je 740 <_sk_load_g8_dst_sse2_8bit+0xa2>
- .byte 102,15,239,201 // pxor %xmm1,%xmm1
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,21 // je 729 <_sk_load_g8_dst_sse2_8bit+0x8b>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,175 // jne 6c9 <_sk_load_g8_dst_sse2_8bit+0x2b>
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,210 // pxor %xmm2,%xmm2
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,133 // ja c13 <_sk_load_g8_dst_sse2_8bit+0x27>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,119,0,0,0 // lea 0x77(%rip),%rcx # d10 <_sk_load_g8_dst_sse2_8bit+0x124>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
+ .byte 102,15,110,208 // movd %eax,%xmm2
+ .byte 233,99,255,255,255 // jmpq c13 <_sk_load_g8_dst_sse2_8bit+0x27>
.byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
- .byte 102,15,110,200 // movd %eax,%xmm1
- .byte 102,15,112,201,69 // pshufd $0x45,%xmm1,%xmm1
+ .byte 102,15,239,210 // pxor %xmm2,%xmm2
+ .byte 102,15,196,208,2 // pinsrw $0x2,%eax,%xmm2
.byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
- .byte 102,15,110,208 // movd %eax,%xmm2
- .byte 102,15,96,208 // punpcklbw %xmm0,%xmm2
- .byte 102,15,97,208 // punpcklwd %xmm0,%xmm2
- .byte 242,15,16,202 // movsd %xmm2,%xmm1
- .byte 235,137 // jmp 6c9 <_sk_load_g8_dst_sse2_8bit+0x2b>
- .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
- .byte 102,15,110,200 // movd %eax,%xmm1
- .byte 233,123,255,255,255 // jmpq 6c9 <_sk_load_g8_dst_sse2_8bit+0x2b>
+ .byte 102,15,110,216 // movd %eax,%xmm3
+ .byte 102,15,96,216 // punpcklbw %xmm0,%xmm3
+ .byte 243,15,16,211 // movss %xmm3,%xmm2
+ .byte 233,62,255,255,255 // jmpq c13 <_sk_load_g8_dst_sse2_8bit+0x27>
+ .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax
+ .byte 102,15,239,210 // pxor %xmm2,%xmm2
+ .byte 102,15,196,208,6 // pinsrw $0x6,%eax,%xmm2
+ .byte 66,15,182,68,2,5 // movzbl 0x5(%rdx,%r8,1),%eax
+ .byte 102,15,196,208,5 // pinsrw $0x5,%eax,%xmm2
+ .byte 66,15,182,68,2,4 // movzbl 0x4(%rdx,%r8,1),%eax
+ .byte 102,15,196,208,4 // pinsrw $0x4,%eax,%xmm2
+ .byte 102,66,15,110,28,2 // movd (%rdx,%r8,1),%xmm3
+ .byte 102,15,96,216 // punpcklbw %xmm0,%xmm3
+ .byte 242,15,16,211 // movsd %xmm3,%xmm2
+ .byte 233,6,255,255,255 // jmpq c13 <_sk_load_g8_dst_sse2_8bit+0x27>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 146 // xchg %eax,%edx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,175,255,255,255,160 // ljmp *-0x5f000001(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 234 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 223,255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,212 // callq *%rsp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,197 // inc %ebp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_srcover_rgba_8888_sse2_8bit
.globl _sk_srcover_rgba_8888_sse2_8bit
FUNCTION(_sk_srcover_rgba_8888_sse2_8bit)
_sk_srcover_rgba_8888_sse2_8bit:
- .byte 76,99,7 // movslq (%rdi),%r8
- .byte 76,139,79,16 // mov 0x10(%rdi),%r9
+ .byte 76,99,15 // movslq (%rdi),%r9
+ .byte 76,139,71,16 // mov 0x10(%rdi),%r8
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 72,99,72,8 // movslq 0x8(%rax),%rcx
.byte 72,99,87,8 // movslq 0x8(%rdi),%rdx
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,193,226,2 // shl $0x2,%rdx
.byte 72,3,16 // add (%rax),%rdx
- .byte 77,133,201 // test %r9,%r9
- .byte 117,120 // jne 7e7 <_sk_srcover_rgba_8888_sse2_8bit+0x99>
- .byte 243,66,15,111,20,130 // movdqu (%rdx,%r8,4),%xmm2
- .byte 77,133,201 // test %r9,%r9
- .byte 242,15,112,216,231 // pshuflw $0xe7,%xmm0,%xmm3
- .byte 243,15,112,219,231 // pshufhw $0xe7,%xmm3,%xmm3
- .byte 102,15,112,219,232 // pshufd $0xe8,%xmm3,%xmm3
- .byte 102,15,96,219 // punpcklbw %xmm3,%xmm3
- .byte 242,15,112,219,95 // pshuflw $0x5f,%xmm3,%xmm3
- .byte 243,15,112,219,95 // pshufhw $0x5f,%xmm3,%xmm3
- .byte 102,15,239,228 // pxor %xmm4,%xmm4
- .byte 102,15,111,234 // movdqa %xmm2,%xmm5
- .byte 102,15,96,236 // punpcklbw %xmm4,%xmm5
- .byte 102,15,111,242 // movdqa %xmm2,%xmm6
- .byte 102,15,104,244 // punpckhbw %xmm4,%xmm6
- .byte 102,15,111,251 // movdqa %xmm3,%xmm7
- .byte 102,15,96,252 // punpcklbw %xmm4,%xmm7
- .byte 102,15,104,220 // punpckhbw %xmm4,%xmm3
- .byte 102,15,213,222 // pmullw %xmm6,%xmm3
- .byte 102,15,213,253 // pmullw %xmm5,%xmm7
- .byte 102,15,253,253 // paddw %xmm5,%xmm7
- .byte 102,15,253,222 // paddw %xmm6,%xmm3
- .byte 102,15,113,211,8 // psrlw $0x8,%xmm3
+ .byte 77,133,192 // test %r8,%r8
+ .byte 15,133,253,0,0,0 // jne e4e <_sk_srcover_rgba_8888_sse2_8bit+0x122>
+ .byte 70,15,16,68,138,16 // movups 0x10(%rdx,%r9,4),%xmm8
+ .byte 102,70,15,16,12,138 // movupd (%rdx,%r9,4),%xmm9
+ .byte 77,133,192 // test %r8,%r8
+ .byte 242,15,112,225,231 // pshuflw $0xe7,%xmm1,%xmm4
+ .byte 243,15,112,228,231 // pshufhw $0xe7,%xmm4,%xmm4
+ .byte 102,15,112,228,232 // pshufd $0xe8,%xmm4,%xmm4
+ .byte 102,15,96,228 // punpcklbw %xmm4,%xmm4
+ .byte 242,15,112,228,95 // pshuflw $0x5f,%xmm4,%xmm4
+ .byte 243,15,112,236,95 // pshufhw $0x5f,%xmm4,%xmm5
+ .byte 242,15,112,224,231 // pshuflw $0xe7,%xmm0,%xmm4
+ .byte 243,15,112,228,231 // pshufhw $0xe7,%xmm4,%xmm4
+ .byte 102,15,112,228,232 // pshufd $0xe8,%xmm4,%xmm4
+ .byte 102,15,96,228 // punpcklbw %xmm4,%xmm4
+ .byte 242,15,112,228,95 // pshuflw $0x5f,%xmm4,%xmm4
+ .byte 243,15,112,228,95 // pshufhw $0x5f,%xmm4,%xmm4
+ .byte 102,69,15,239,210 // pxor %xmm10,%xmm10
+ .byte 102,69,15,40,217 // movapd %xmm9,%xmm11
+ .byte 102,69,15,96,218 // punpcklbw %xmm10,%xmm11
+ .byte 102,69,15,40,225 // movapd %xmm9,%xmm12
+ .byte 102,69,15,104,226 // punpckhbw %xmm10,%xmm12
+ .byte 102,69,15,111,232 // movdqa %xmm8,%xmm13
+ .byte 102,69,15,96,234 // punpcklbw %xmm10,%xmm13
+ .byte 102,69,15,111,240 // movdqa %xmm8,%xmm14
+ .byte 102,69,15,104,242 // punpckhbw %xmm10,%xmm14
+ .byte 102,15,111,252 // movdqa %xmm4,%xmm7
+ .byte 102,65,15,96,250 // punpcklbw %xmm10,%xmm7
+ .byte 102,65,15,104,226 // punpckhbw %xmm10,%xmm4
+ .byte 102,15,111,245 // movdqa %xmm5,%xmm6
+ .byte 102,65,15,96,242 // punpcklbw %xmm10,%xmm6
+ .byte 102,65,15,104,234 // punpckhbw %xmm10,%xmm5
+ .byte 102,65,15,213,238 // pmullw %xmm14,%xmm5
+ .byte 102,65,15,213,245 // pmullw %xmm13,%xmm6
+ .byte 102,65,15,213,228 // pmullw %xmm12,%xmm4
+ .byte 102,65,15,213,251 // pmullw %xmm11,%xmm7
+ .byte 102,65,15,253,251 // paddw %xmm11,%xmm7
+ .byte 102,65,15,253,228 // paddw %xmm12,%xmm4
+ .byte 102,65,15,253,245 // paddw %xmm13,%xmm6
+ .byte 102,65,15,253,238 // paddw %xmm14,%xmm5
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
.byte 102,15,113,215,8 // psrlw $0x8,%xmm7
- .byte 102,15,103,251 // packuswb %xmm3,%xmm7
- .byte 102,15,248,215 // psubb %xmm7,%xmm2
- .byte 102,15,252,208 // paddb %xmm0,%xmm2
- .byte 117,58 // jne 817 <_sk_srcover_rgba_8888_sse2_8bit+0xc9>
- .byte 243,66,15,127,20,130 // movdqu %xmm2,(%rdx,%r8,4)
+ .byte 102,15,103,252 // packuswb %xmm4,%xmm7
+ .byte 102,15,103,245 // packuswb %xmm5,%xmm6
+ .byte 102,68,15,248,198 // psubb %xmm6,%xmm8
+ .byte 102,68,15,248,207 // psubb %xmm7,%xmm9
+ .byte 102,68,15,252,200 // paddb %xmm0,%xmm9
+ .byte 102,68,15,252,193 // paddb %xmm1,%xmm8
+ .byte 117,72 // jne e85 <_sk_srcover_rgba_8888_sse2_8bit+0x159>
+ .byte 243,70,15,127,12,138 // movdqu %xmm9,(%rdx,%r9,4)
+ .byte 243,70,15,127,68,138,16 // movdqu %xmm8,0x10(%rdx,%r9,4)
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 68,137,200 // mov %r9d,%eax
- .byte 36,3 // and $0x3,%al
- .byte 60,1 // cmp $0x1,%al
- .byte 116,81 // je 841 <_sk_srcover_rgba_8888_sse2_8bit+0xf3>
- .byte 102,15,239,210 // pxor %xmm2,%xmm2
- .byte 60,2 // cmp $0x2,%al
- .byte 116,20 // je 80c <_sk_srcover_rgba_8888_sse2_8bit+0xbe>
- .byte 60,3 // cmp $0x3,%al
- .byte 15,133,117,255,255,255 // jne 775 <_sk_srcover_rgba_8888_sse2_8bit+0x27>
- .byte 102,66,15,110,84,130,8 // movd 0x8(%rdx,%r8,4),%xmm2
- .byte 102,15,112,210,69 // pshufd $0x45,%xmm2,%xmm2
- .byte 102,66,15,18,20,130 // movlpd (%rdx,%r8,4),%xmm2
- .byte 233,94,255,255,255 // jmpq 775 <_sk_srcover_rgba_8888_sse2_8bit+0x27>
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,43 // je 84c <_sk_srcover_rgba_8888_sse2_8bit+0xfe>
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,18 // je 839 <_sk_srcover_rgba_8888_sse2_8bit+0xeb>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,182 // jne 7e3 <_sk_srcover_rgba_8888_sse2_8bit+0x95>
- .byte 102,15,112,218,78 // pshufd $0x4e,%xmm2,%xmm3
- .byte 102,66,15,126,92,130,8 // movd %xmm3,0x8(%rdx,%r8,4)
- .byte 102,66,15,214,20,130 // movq %xmm2,(%rdx,%r8,4)
- .byte 235,162 // jmp 7e3 <_sk_srcover_rgba_8888_sse2_8bit+0x95>
- .byte 102,66,15,110,20,130 // movd (%rdx,%r8,4),%xmm2
- .byte 233,41,255,255,255 // jmpq 775 <_sk_srcover_rgba_8888_sse2_8bit+0x27>
- .byte 102,66,15,126,20,130 // movd %xmm2,(%rdx,%r8,4)
- .byte 235,143 // jmp 7e3 <_sk_srcover_rgba_8888_sse2_8bit+0x95>
+ .byte 68,137,192 // mov %r8d,%eax
+ .byte 36,7 // and $0x7,%al
+ .byte 102,69,15,239,192 // pxor %xmm8,%xmm8
+ .byte 102,69,15,239,201 // pxor %xmm9,%xmm9
+ .byte 254,200 // dec %al
+ .byte 60,6 // cmp $0x6,%al
+ .byte 15,135,246,254,255,255 // ja d5d <_sk_srcover_rgba_8888_sse2_8bit+0x31>
+ .byte 15,182,192 // movzbl %al,%eax
+ .byte 72,141,13,207,0,0,0 // lea 0xcf(%rip),%rcx # f40 <_sk_srcover_rgba_8888_sse2_8bit+0x214>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 243,70,15,16,12,138 // movss (%rdx,%r9,4),%xmm9
+ .byte 233,216,254,255,255 // jmpq d5d <_sk_srcover_rgba_8888_sse2_8bit+0x31>
+ .byte 65,128,224,7 // and $0x7,%r8b
+ .byte 65,254,200 // dec %r8b
+ .byte 65,128,248,6 // cmp $0x6,%r8b
+ .byte 119,184 // ja e4a <_sk_srcover_rgba_8888_sse2_8bit+0x11e>
+ .byte 65,15,182,192 // movzbl %r8b,%eax
+ .byte 72,141,13,191,0,0,0 // lea 0xbf(%rip),%rcx # f5c <_sk_srcover_rgba_8888_sse2_8bit+0x230>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 102,70,15,126,12,138 // movd %xmm9,(%rdx,%r9,4)
+ .byte 235,156 // jmp e4a <_sk_srcover_rgba_8888_sse2_8bit+0x11e>
+ .byte 102,66,15,110,100,138,8 // movd 0x8(%rdx,%r9,4),%xmm4
+ .byte 102,68,15,112,204,69 // pshufd $0x45,%xmm4,%xmm9
+ .byte 102,70,15,18,12,138 // movlpd (%rdx,%r9,4),%xmm9
+ .byte 233,151,254,255,255 // jmpq d5d <_sk_srcover_rgba_8888_sse2_8bit+0x31>
+ .byte 102,66,15,110,100,138,24 // movd 0x18(%rdx,%r9,4),%xmm4
+ .byte 102,68,15,112,196,69 // pshufd $0x45,%xmm4,%xmm8
+ .byte 243,66,15,16,100,138,20 // movss 0x14(%rdx,%r9,4),%xmm4
+ .byte 65,15,198,224,0 // shufps $0x0,%xmm8,%xmm4
+ .byte 65,15,198,224,226 // shufps $0xe2,%xmm8,%xmm4
+ .byte 68,15,40,196 // movaps %xmm4,%xmm8
+ .byte 243,66,15,16,100,138,16 // movss 0x10(%rdx,%r9,4),%xmm4
+ .byte 243,68,15,16,196 // movss %xmm4,%xmm8
+ .byte 233,94,254,255,255 // jmpq d57 <_sk_srcover_rgba_8888_sse2_8bit+0x2b>
+ .byte 102,65,15,112,225,78 // pshufd $0x4e,%xmm9,%xmm4
+ .byte 102,66,15,126,100,138,8 // movd %xmm4,0x8(%rdx,%r9,4)
+ .byte 102,70,15,214,12,138 // movq %xmm9,(%rdx,%r9,4)
+ .byte 233,57,255,255,255 // jmpq e4a <_sk_srcover_rgba_8888_sse2_8bit+0x11e>
+ .byte 102,65,15,112,224,78 // pshufd $0x4e,%xmm8,%xmm4
+ .byte 102,66,15,126,100,138,24 // movd %xmm4,0x18(%rdx,%r9,4)
+ .byte 102,65,15,112,224,229 // pshufd $0xe5,%xmm8,%xmm4
+ .byte 102,66,15,126,100,138,20 // movd %xmm4,0x14(%rdx,%r9,4)
+ .byte 102,70,15,126,68,138,16 // movd %xmm8,0x10(%rdx,%r9,4)
+ .byte 243,70,15,127,12,138 // movdqu %xmm9,(%rdx,%r9,4)
+ .byte 233,13,255,255,255 // jmpq e4a <_sk_srcover_rgba_8888_sse2_8bit+0x11e>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 58,255 // cmp %bh,%bh
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 123,255 // jnp f45 <_sk_srcover_rgba_8888_sse2_8bit+0x219>
+ .byte 255 // (bad)
+ .byte 255,110,255 // ljmp *-0x1(%rsi)
+ .byte 255 // (bad)
+ .byte 255,23 // callq *(%rdi)
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,168,255,255,255,147 // ljmp *-0x6c000001(%rax)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,134,255,255,255,74 // incl 0x4affffff(%rsi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,170,255,255,255,157 // ljmp *-0x62000001(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,214 // callq *%rsi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,207 // dec %edi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,194 // inc %edx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
+ .byte 181,255 // mov $0xff,%ch
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_scale_1_float_sse2_8bit
.globl _sk_scale_1_float_sse2_8bit
FUNCTION(_sk_scale_1_float_sse2_8bit)
_sk_scale_1_float_sse2_8bit:
+ .byte 102,68,15,111,193 // movdqa %xmm1,%xmm8
+ .byte 102,68,15,111,200 // movdqa %xmm0,%xmm9
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 243,15,16,16 // movss (%rax),%xmm2
- .byte 243,15,89,21,98,9,0,0 // mulss 0x962(%rip),%xmm2 # 11c4 <_sk_xor__sse2_8bit+0xc9>
- .byte 243,15,44,194 // cvttss2si %xmm2,%eax
- .byte 102,15,239,219 // pxor %xmm3,%xmm3
- .byte 102,15,111,208 // movdqa %xmm0,%xmm2
- .byte 102,15,96,211 // punpcklbw %xmm3,%xmm2
- .byte 102,15,104,195 // punpckhbw %xmm3,%xmm0
- .byte 102,15,110,216 // movd %eax,%xmm3
- .byte 102,15,96,219 // punpcklbw %xmm3,%xmm3
- .byte 242,15,112,219,0 // pshuflw $0x0,%xmm3,%xmm3
- .byte 102,15,112,219,80 // pshufd $0x50,%xmm3,%xmm3
- .byte 102,15,219,29,208,9,0,0 // pand 0x9d0(%rip),%xmm3 # 1260 <_sk_xor__sse2_8bit+0x165>
- .byte 102,15,111,227 // movdqa %xmm3,%xmm4
- .byte 102,15,213,224 // pmullw %xmm0,%xmm4
- .byte 102,15,213,218 // pmullw %xmm2,%xmm3
- .byte 102,15,253,211 // paddw %xmm3,%xmm2
- .byte 102,15,253,224 // paddw %xmm0,%xmm4
- .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,212 // packuswb %xmm4,%xmm2
+ .byte 243,15,16,0 // movss (%rax),%xmm0
+ .byte 243,15,89,5,52,19,0,0 // mulss 0x1334(%rip),%xmm0 # 22c4 <_sk_xor__sse2_8bit+0x1ab>
+ .byte 243,15,44,192 // cvttss2si %xmm0,%eax
+ .byte 102,15,239,246 // pxor %xmm6,%xmm6
+ .byte 102,65,15,111,193 // movdqa %xmm9,%xmm0
+ .byte 102,15,96,198 // punpcklbw %xmm6,%xmm0
+ .byte 102,68,15,104,206 // punpckhbw %xmm6,%xmm9
+ .byte 102,15,96,206 // punpcklbw %xmm6,%xmm1
+ .byte 102,68,15,104,198 // punpckhbw %xmm6,%xmm8
+ .byte 102,15,110,240 // movd %eax,%xmm6
+ .byte 102,15,96,246 // punpcklbw %xmm6,%xmm6
+ .byte 242,15,112,246,0 // pshuflw $0x0,%xmm6,%xmm6
+ .byte 102,15,112,246,80 // pshufd $0x50,%xmm6,%xmm6
+ .byte 102,15,219,53,183,19,0,0 // pand 0x13b7(%rip),%xmm6 # 2380 <_sk_xor__sse2_8bit+0x267>
+ .byte 102,15,111,254 // movdqa %xmm6,%xmm7
+ .byte 102,65,15,213,248 // pmullw %xmm8,%xmm7
+ .byte 102,15,111,230 // movdqa %xmm6,%xmm4
+ .byte 102,15,213,225 // pmullw %xmm1,%xmm4
+ .byte 102,15,111,238 // movdqa %xmm6,%xmm5
+ .byte 102,65,15,213,233 // pmullw %xmm9,%xmm5
+ .byte 102,15,213,240 // pmullw %xmm0,%xmm6
+ .byte 102,15,253,198 // paddw %xmm6,%xmm0
+ .byte 102,65,15,253,233 // paddw %xmm9,%xmm5
+ .byte 102,15,253,204 // paddw %xmm4,%xmm1
+ .byte 102,65,15,253,248 // paddw %xmm8,%xmm7
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,208,8 // psrlw $0x8,%xmm0
+ .byte 102,15,103,197 // packuswb %xmm5,%xmm0
+ .byte 102,15,103,207 // packuswb %xmm7,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 102,15,111,194 // movdqa %xmm2,%xmm0
.byte 255,224 // jmpq *%rax
HIDDEN _sk_scale_u8_sse2_8bit
@@ -60135,98 +62766,181 @@ _sk_scale_u8_sse2_8bit:
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 117,114 // jne 949 <_sk_scale_u8_sse2_8bit+0x8f>
- .byte 102,66,15,110,20,2 // movd (%rdx,%r8,1),%xmm2
- .byte 102,15,96,208 // punpcklbw %xmm0,%xmm2
- .byte 102,15,97,208 // punpcklwd %xmm0,%xmm2
- .byte 102,15,114,242,24 // pslld $0x18,%xmm2
- .byte 242,15,112,210,231 // pshuflw $0xe7,%xmm2,%xmm2
- .byte 243,15,112,210,231 // pshufhw $0xe7,%xmm2,%xmm2
- .byte 102,15,112,210,232 // pshufd $0xe8,%xmm2,%xmm2
- .byte 102,15,96,210 // punpcklbw %xmm2,%xmm2
- .byte 242,15,112,210,95 // pshuflw $0x5f,%xmm2,%xmm2
- .byte 243,15,112,218,95 // pshufhw $0x5f,%xmm2,%xmm3
- .byte 102,15,239,228 // pxor %xmm4,%xmm4
- .byte 102,15,111,232 // movdqa %xmm0,%xmm5
- .byte 102,15,96,236 // punpcklbw %xmm4,%xmm5
- .byte 102,15,104,196 // punpckhbw %xmm4,%xmm0
- .byte 102,15,111,211 // movdqa %xmm3,%xmm2
- .byte 102,15,96,212 // punpcklbw %xmm4,%xmm2
- .byte 102,15,104,220 // punpckhbw %xmm4,%xmm3
- .byte 102,15,213,216 // pmullw %xmm0,%xmm3
- .byte 102,15,213,213 // pmullw %xmm5,%xmm2
- .byte 102,15,253,213 // paddw %xmm5,%xmm2
- .byte 102,15,253,216 // paddw %xmm0,%xmm3
- .byte 102,15,113,211,8 // psrlw $0x8,%xmm3
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,211 // packuswb %xmm3,%xmm2
+ .byte 15,133,239,0,0,0 // jne 1129 <_sk_scale_u8_sse2_8bit+0x110>
+ .byte 243,66,15,126,36,2 // movq (%rdx,%r8,1),%xmm4
+ .byte 102,15,96,224 // punpcklbw %xmm0,%xmm4
+ .byte 102,15,84,37,68,19,0,0 // andpd 0x1344(%rip),%xmm4 # 2390 <_sk_xor__sse2_8bit+0x277>
+ .byte 102,69,15,239,192 // pxor %xmm8,%xmm8
+ .byte 102,15,40,236 // movapd %xmm4,%xmm5
+ .byte 102,65,15,105,232 // punpckhwd %xmm8,%xmm5
+ .byte 102,65,15,97,224 // punpcklwd %xmm8,%xmm4
+ .byte 102,15,114,244,24 // pslld $0x18,%xmm4
+ .byte 102,15,114,245,24 // pslld $0x18,%xmm5
+ .byte 242,15,112,237,231 // pshuflw $0xe7,%xmm5,%xmm5
+ .byte 243,15,112,237,231 // pshufhw $0xe7,%xmm5,%xmm5
+ .byte 102,15,112,237,232 // pshufd $0xe8,%xmm5,%xmm5
+ .byte 102,15,96,237 // punpcklbw %xmm5,%xmm5
+ .byte 242,15,112,237,95 // pshuflw $0x5f,%xmm5,%xmm5
+ .byte 243,15,112,245,95 // pshufhw $0x5f,%xmm5,%xmm6
+ .byte 242,15,112,228,231 // pshuflw $0xe7,%xmm4,%xmm4
+ .byte 243,15,112,228,231 // pshufhw $0xe7,%xmm4,%xmm4
+ .byte 102,15,112,228,232 // pshufd $0xe8,%xmm4,%xmm4
+ .byte 102,15,96,228 // punpcklbw %xmm4,%xmm4
+ .byte 242,15,112,228,95 // pshuflw $0x5f,%xmm4,%xmm4
+ .byte 243,15,112,252,95 // pshufhw $0x5f,%xmm4,%xmm7
+ .byte 102,68,15,111,200 // movdqa %xmm0,%xmm9
+ .byte 102,69,15,96,200 // punpcklbw %xmm8,%xmm9
+ .byte 102,65,15,104,192 // punpckhbw %xmm8,%xmm0
+ .byte 102,68,15,111,209 // movdqa %xmm1,%xmm10
+ .byte 102,69,15,96,208 // punpcklbw %xmm8,%xmm10
+ .byte 102,65,15,104,200 // punpckhbw %xmm8,%xmm1
+ .byte 102,15,111,231 // movdqa %xmm7,%xmm4
+ .byte 102,65,15,96,224 // punpcklbw %xmm8,%xmm4
+ .byte 102,65,15,104,248 // punpckhbw %xmm8,%xmm7
+ .byte 102,15,111,238 // movdqa %xmm6,%xmm5
+ .byte 102,65,15,96,232 // punpcklbw %xmm8,%xmm5
+ .byte 102,65,15,104,240 // punpckhbw %xmm8,%xmm6
+ .byte 102,15,213,241 // pmullw %xmm1,%xmm6
+ .byte 102,65,15,213,234 // pmullw %xmm10,%xmm5
+ .byte 102,15,213,248 // pmullw %xmm0,%xmm7
+ .byte 102,65,15,213,225 // pmullw %xmm9,%xmm4
+ .byte 102,65,15,253,225 // paddw %xmm9,%xmm4
+ .byte 102,15,253,248 // paddw %xmm0,%xmm7
+ .byte 102,65,15,253,234 // paddw %xmm10,%xmm5
+ .byte 102,15,253,241 // paddw %xmm1,%xmm6
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,15,103,231 // packuswb %xmm7,%xmm4
+ .byte 102,15,103,238 // packuswb %xmm6,%xmm5
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 102,15,111,194 // movdqa %xmm2,%xmm0
+ .byte 102,15,111,196 // movdqa %xmm4,%xmm0
+ .byte 102,15,111,205 // movdqa %xmm5,%xmm1
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,57 // je 98c <_sk_scale_u8_sse2_8bit+0xd2>
- .byte 102,15,239,210 // pxor %xmm2,%xmm2
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,21 // je 972 <_sk_scale_u8_sse2_8bit+0xb8>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 117,130 // jne 8e5 <_sk_scale_u8_sse2_8bit+0x2b>
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,228 // pxor %xmm4,%xmm4
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 15,135,6,255,255,255 // ja 1044 <_sk_scale_u8_sse2_8bit+0x2b>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,119,0,0,0 // lea 0x77(%rip),%rcx # 11c0 <_sk_scale_u8_sse2_8bit+0x1a7>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
+ .byte 102,15,110,224 // movd %eax,%xmm4
+ .byte 233,228,254,255,255 // jmpq 1044 <_sk_scale_u8_sse2_8bit+0x2b>
.byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
- .byte 102,15,110,208 // movd %eax,%xmm2
- .byte 102,15,112,210,69 // pshufd $0x45,%xmm2,%xmm2
+ .byte 102,15,239,228 // pxor %xmm4,%xmm4
+ .byte 102,15,196,224,2 // pinsrw $0x2,%eax,%xmm4
.byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
- .byte 102,15,110,216 // movd %eax,%xmm3
- .byte 102,15,96,216 // punpcklbw %xmm0,%xmm3
- .byte 102,15,97,216 // punpcklwd %xmm0,%xmm3
- .byte 242,15,16,211 // movsd %xmm3,%xmm2
- .byte 233,89,255,255,255 // jmpq 8e5 <_sk_scale_u8_sse2_8bit+0x2b>
- .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
- .byte 102,15,110,208 // movd %eax,%xmm2
- .byte 233,75,255,255,255 // jmpq 8e5 <_sk_scale_u8_sse2_8bit+0x2b>
+ .byte 102,15,110,232 // movd %eax,%xmm5
+ .byte 102,15,96,232 // punpcklbw %xmm0,%xmm5
+ .byte 243,15,16,229 // movss %xmm5,%xmm4
+ .byte 233,191,254,255,255 // jmpq 1044 <_sk_scale_u8_sse2_8bit+0x2b>
+ .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax
+ .byte 102,15,239,228 // pxor %xmm4,%xmm4
+ .byte 102,15,196,224,6 // pinsrw $0x6,%eax,%xmm4
+ .byte 66,15,182,68,2,5 // movzbl 0x5(%rdx,%r8,1),%eax
+ .byte 102,15,196,224,5 // pinsrw $0x5,%eax,%xmm4
+ .byte 66,15,182,68,2,4 // movzbl 0x4(%rdx,%r8,1),%eax
+ .byte 102,15,196,224,4 // pinsrw $0x4,%eax,%xmm4
+ .byte 102,66,15,110,44,2 // movd (%rdx,%r8,1),%xmm5
+ .byte 102,15,96,232 // punpcklbw %xmm0,%xmm5
+ .byte 242,15,16,229 // movsd %xmm5,%xmm4
+ .byte 233,135,254,255,255 // jmpq 1044 <_sk_scale_u8_sse2_8bit+0x2b>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 146 // xchg %eax,%edx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,175,255,255,255,160 // ljmp *-0x5f000001(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 234 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 223,255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,212 // callq *%rsp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,197 // inc %ebp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_lerp_1_float_sse2_8bit
.globl _sk_lerp_1_float_sse2_8bit
FUNCTION(_sk_lerp_1_float_sse2_8bit)
_sk_lerp_1_float_sse2_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 243,15,16,16 // movss (%rax),%xmm2
- .byte 243,15,89,21,32,8,0,0 // mulss 0x820(%rip),%xmm2 # 11c8 <_sk_xor__sse2_8bit+0xcd>
- .byte 243,15,44,194 // cvttss2si %xmm2,%eax
- .byte 102,15,110,208 // movd %eax,%xmm2
- .byte 102,15,96,210 // punpcklbw %xmm2,%xmm2
- .byte 242,15,112,210,0 // pshuflw $0x0,%xmm2,%xmm2
- .byte 102,15,112,218,80 // pshufd $0x50,%xmm2,%xmm3
- .byte 102,15,239,228 // pxor %xmm4,%xmm4
- .byte 102,15,111,232 // movdqa %xmm0,%xmm5
- .byte 102,15,96,236 // punpcklbw %xmm4,%xmm5
- .byte 102,15,104,196 // punpckhbw %xmm4,%xmm0
- .byte 102,15,111,21,154,8,0,0 // movdqa 0x89a(%rip),%xmm2 # 1270 <_sk_xor__sse2_8bit+0x175>
- .byte 102,15,219,211 // pand %xmm3,%xmm2
- .byte 102,15,111,242 // movdqa %xmm2,%xmm6
- .byte 102,15,213,240 // pmullw %xmm0,%xmm6
- .byte 102,15,213,213 // pmullw %xmm5,%xmm2
- .byte 102,15,253,213 // paddw %xmm5,%xmm2
- .byte 102,15,253,240 // paddw %xmm0,%xmm6
- .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,214 // packuswb %xmm6,%xmm2
- .byte 102,15,118,237 // pcmpeqd %xmm5,%xmm5
- .byte 102,15,239,235 // pxor %xmm3,%xmm5
- .byte 102,15,111,217 // movdqa %xmm1,%xmm3
- .byte 102,15,111,241 // movdqa %xmm1,%xmm6
- .byte 102,15,96,244 // punpcklbw %xmm4,%xmm6
- .byte 102,15,104,220 // punpckhbw %xmm4,%xmm3
- .byte 102,15,111,197 // movdqa %xmm5,%xmm0
- .byte 102,15,96,196 // punpcklbw %xmm4,%xmm0
- .byte 102,15,104,236 // punpckhbw %xmm4,%xmm5
- .byte 102,15,213,235 // pmullw %xmm3,%xmm5
- .byte 102,15,213,198 // pmullw %xmm6,%xmm0
- .byte 102,15,253,198 // paddw %xmm6,%xmm0
- .byte 102,15,253,235 // paddw %xmm3,%xmm5
+ .byte 243,15,16,32 // movss (%rax),%xmm4
+ .byte 243,15,89,37,222,16,0,0 // mulss 0x10de(%rip),%xmm4 # 22c8 <_sk_xor__sse2_8bit+0x1af>
+ .byte 243,15,44,196 // cvttss2si %xmm4,%eax
+ .byte 102,15,110,224 // movd %eax,%xmm4
+ .byte 102,15,96,228 // punpcklbw %xmm4,%xmm4
+ .byte 242,15,112,228,0 // pshuflw $0x0,%xmm4,%xmm4
+ .byte 102,68,15,112,196,80 // pshufd $0x50,%xmm4,%xmm8
+ .byte 102,69,15,239,201 // pxor %xmm9,%xmm9
+ .byte 102,15,111,248 // movdqa %xmm0,%xmm7
+ .byte 102,65,15,96,249 // punpcklbw %xmm9,%xmm7
+ .byte 102,65,15,104,193 // punpckhbw %xmm9,%xmm0
+ .byte 102,68,15,111,217 // movdqa %xmm1,%xmm11
+ .byte 102,69,15,96,217 // punpcklbw %xmm9,%xmm11
+ .byte 102,65,15,104,201 // punpckhbw %xmm9,%xmm1
+ .byte 102,15,111,53,117,17,0,0 // movdqa 0x1175(%rip),%xmm6 # 23a0 <_sk_xor__sse2_8bit+0x287>
+ .byte 102,65,15,219,240 // pand %xmm8,%xmm6
+ .byte 102,15,111,230 // movdqa %xmm6,%xmm4
+ .byte 102,15,213,225 // pmullw %xmm1,%xmm4
+ .byte 102,68,15,111,214 // movdqa %xmm6,%xmm10
+ .byte 102,69,15,213,211 // pmullw %xmm11,%xmm10
+ .byte 102,15,111,238 // movdqa %xmm6,%xmm5
+ .byte 102,15,213,232 // pmullw %xmm0,%xmm5
+ .byte 102,15,213,247 // pmullw %xmm7,%xmm6
+ .byte 102,15,253,247 // paddw %xmm7,%xmm6
+ .byte 102,15,253,232 // paddw %xmm0,%xmm5
+ .byte 102,69,15,253,211 // paddw %xmm11,%xmm10
+ .byte 102,15,253,225 // paddw %xmm1,%xmm4
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,65,15,113,210,8 // psrlw $0x8,%xmm10
.byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,103,245 // packuswb %xmm5,%xmm6
+ .byte 102,68,15,103,212 // packuswb %xmm4,%xmm10
+ .byte 102,15,118,255 // pcmpeqd %xmm7,%xmm7
+ .byte 102,65,15,239,248 // pxor %xmm8,%xmm7
+ .byte 102,68,15,111,218 // movdqa %xmm2,%xmm11
+ .byte 102,15,111,234 // movdqa %xmm2,%xmm5
+ .byte 102,65,15,96,233 // punpcklbw %xmm9,%xmm5
+ .byte 102,69,15,104,217 // punpckhbw %xmm9,%xmm11
+ .byte 102,68,15,111,195 // movdqa %xmm3,%xmm8
+ .byte 102,68,15,111,227 // movdqa %xmm3,%xmm12
+ .byte 102,69,15,96,225 // punpcklbw %xmm9,%xmm12
+ .byte 102,69,15,104,193 // punpckhbw %xmm9,%xmm8
+ .byte 102,15,111,199 // movdqa %xmm7,%xmm0
+ .byte 102,65,15,96,193 // punpcklbw %xmm9,%xmm0
+ .byte 102,65,15,104,249 // punpckhbw %xmm9,%xmm7
+ .byte 102,15,111,231 // movdqa %xmm7,%xmm4
+ .byte 102,65,15,213,224 // pmullw %xmm8,%xmm4
+ .byte 102,15,111,200 // movdqa %xmm0,%xmm1
+ .byte 102,65,15,213,204 // pmullw %xmm12,%xmm1
+ .byte 102,65,15,213,251 // pmullw %xmm11,%xmm7
+ .byte 102,15,213,197 // pmullw %xmm5,%xmm0
+ .byte 102,15,253,197 // paddw %xmm5,%xmm0
+ .byte 102,65,15,253,251 // paddw %xmm11,%xmm7
+ .byte 102,65,15,253,204 // paddw %xmm12,%xmm1
+ .byte 102,65,15,253,224 // paddw %xmm8,%xmm4
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
.byte 102,15,113,208,8 // psrlw $0x8,%xmm0
- .byte 102,15,103,197 // packuswb %xmm5,%xmm0
- .byte 102,15,252,194 // paddb %xmm2,%xmm0
+ .byte 102,15,103,199 // packuswb %xmm7,%xmm0
+ .byte 102,15,103,204 // packuswb %xmm4,%xmm1
+ .byte 102,15,252,198 // paddb %xmm6,%xmm0
+ .byte 102,65,15,252,202 // paddb %xmm10,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -60242,77 +62956,151 @@ _sk_lerp_u8_sse2_8bit:
.byte 72,15,175,209 // imul %rcx,%rdx
.byte 72,3,16 // add (%rax),%rdx
.byte 77,133,201 // test %r9,%r9
- .byte 15,133,180,0,0,0 // jne b1b <_sk_lerp_u8_sse2_8bit+0xd5>
- .byte 102,66,15,110,20,2 // movd (%rdx,%r8,1),%xmm2
- .byte 102,15,96,208 // punpcklbw %xmm0,%xmm2
- .byte 102,15,97,208 // punpcklwd %xmm0,%xmm2
- .byte 102,15,114,242,24 // pslld $0x18,%xmm2
- .byte 242,15,112,210,231 // pshuflw $0xe7,%xmm2,%xmm2
- .byte 243,15,112,210,231 // pshufhw $0xe7,%xmm2,%xmm2
- .byte 102,15,112,210,232 // pshufd $0xe8,%xmm2,%xmm2
- .byte 102,15,96,210 // punpcklbw %xmm2,%xmm2
- .byte 242,15,112,210,95 // pshuflw $0x5f,%xmm2,%xmm2
- .byte 243,15,112,218,95 // pshufhw $0x5f,%xmm2,%xmm3
- .byte 102,15,239,210 // pxor %xmm2,%xmm2
- .byte 102,15,111,224 // movdqa %xmm0,%xmm4
- .byte 102,15,96,226 // punpcklbw %xmm2,%xmm4
- .byte 102,15,104,194 // punpckhbw %xmm2,%xmm0
- .byte 102,15,111,235 // movdqa %xmm3,%xmm5
- .byte 102,15,118,246 // pcmpeqd %xmm6,%xmm6
- .byte 102,15,239,243 // pxor %xmm3,%xmm6
- .byte 102,15,96,218 // punpcklbw %xmm2,%xmm3
- .byte 102,15,104,234 // punpckhbw %xmm2,%xmm5
+ .byte 15,133,141,1,0,0 // jne 14c0 <_sk_lerp_u8_sse2_8bit+0x1ae>
+ .byte 243,66,15,126,44,2 // movq (%rdx,%r8,1),%xmm5
+ .byte 102,15,96,232 // punpcklbw %xmm0,%xmm5
+ .byte 102,15,84,45,107,16,0,0 // andpd 0x106b(%rip),%xmm5 # 23b0 <_sk_xor__sse2_8bit+0x297>
+ .byte 102,69,15,239,192 // pxor %xmm8,%xmm8
+ .byte 102,15,40,229 // movapd %xmm5,%xmm4
+ .byte 102,65,15,105,224 // punpckhwd %xmm8,%xmm4
+ .byte 102,65,15,97,232 // punpcklwd %xmm8,%xmm5
+ .byte 102,15,114,245,24 // pslld $0x18,%xmm5
+ .byte 102,15,114,244,24 // pslld $0x18,%xmm4
+ .byte 242,15,112,228,231 // pshuflw $0xe7,%xmm4,%xmm4
+ .byte 243,15,112,228,231 // pshufhw $0xe7,%xmm4,%xmm4
+ .byte 102,15,112,228,232 // pshufd $0xe8,%xmm4,%xmm4
+ .byte 102,15,96,228 // punpcklbw %xmm4,%xmm4
+ .byte 242,15,112,228,95 // pshuflw $0x5f,%xmm4,%xmm4
+ .byte 243,15,112,244,95 // pshufhw $0x5f,%xmm4,%xmm6
+ .byte 242,15,112,237,231 // pshuflw $0xe7,%xmm5,%xmm5
+ .byte 243,15,112,237,231 // pshufhw $0xe7,%xmm5,%xmm5
+ .byte 102,15,112,237,232 // pshufd $0xe8,%xmm5,%xmm5
+ .byte 102,15,96,237 // punpcklbw %xmm5,%xmm5
+ .byte 242,15,112,237,95 // pshuflw $0x5f,%xmm5,%xmm5
+ .byte 243,15,112,253,95 // pshufhw $0x5f,%xmm5,%xmm7
+ .byte 102,68,15,111,200 // movdqa %xmm0,%xmm9
+ .byte 102,69,15,96,200 // punpcklbw %xmm8,%xmm9
+ .byte 102,65,15,104,192 // punpckhbw %xmm8,%xmm0
+ .byte 102,68,15,111,209 // movdqa %xmm1,%xmm10
+ .byte 102,69,15,96,208 // punpcklbw %xmm8,%xmm10
+ .byte 102,65,15,104,200 // punpckhbw %xmm8,%xmm1
+ .byte 102,68,15,111,223 // movdqa %xmm7,%xmm11
+ .byte 102,69,15,96,216 // punpcklbw %xmm8,%xmm11
+ .byte 102,15,111,239 // movdqa %xmm7,%xmm5
+ .byte 102,65,15,104,232 // punpckhbw %xmm8,%xmm5
+ .byte 102,68,15,111,230 // movdqa %xmm6,%xmm12
+ .byte 102,69,15,96,224 // punpcklbw %xmm8,%xmm12
+ .byte 102,15,111,230 // movdqa %xmm6,%xmm4
+ .byte 102,65,15,104,224 // punpckhbw %xmm8,%xmm4
+ .byte 102,15,213,225 // pmullw %xmm1,%xmm4
+ .byte 102,69,15,213,226 // pmullw %xmm10,%xmm12
.byte 102,15,213,232 // pmullw %xmm0,%xmm5
- .byte 102,15,213,220 // pmullw %xmm4,%xmm3
- .byte 102,15,253,220 // paddw %xmm4,%xmm3
+ .byte 102,69,15,213,217 // pmullw %xmm9,%xmm11
+ .byte 102,69,15,253,217 // paddw %xmm9,%xmm11
.byte 102,15,253,232 // paddw %xmm0,%xmm5
+ .byte 102,69,15,253,226 // paddw %xmm10,%xmm12
+ .byte 102,15,253,225 // paddw %xmm1,%xmm4
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,65,15,113,212,8 // psrlw $0x8,%xmm12
.byte 102,15,113,213,8 // psrlw $0x8,%xmm5
- .byte 102,15,113,211,8 // psrlw $0x8,%xmm3
- .byte 102,15,103,221 // packuswb %xmm5,%xmm3
- .byte 102,15,111,225 // movdqa %xmm1,%xmm4
- .byte 102,15,96,226 // punpcklbw %xmm2,%xmm4
- .byte 102,15,111,233 // movdqa %xmm1,%xmm5
- .byte 102,15,104,234 // punpckhbw %xmm2,%xmm5
- .byte 102,15,111,198 // movdqa %xmm6,%xmm0
- .byte 102,15,96,194 // punpcklbw %xmm2,%xmm0
- .byte 102,15,104,242 // punpckhbw %xmm2,%xmm6
- .byte 102,15,213,245 // pmullw %xmm5,%xmm6
+ .byte 102,65,15,113,211,8 // psrlw $0x8,%xmm11
+ .byte 102,68,15,103,221 // packuswb %xmm5,%xmm11
+ .byte 102,68,15,103,228 // packuswb %xmm4,%xmm12
+ .byte 102,15,118,192 // pcmpeqd %xmm0,%xmm0
+ .byte 102,15,239,240 // pxor %xmm0,%xmm6
+ .byte 102,15,239,248 // pxor %xmm0,%xmm7
+ .byte 102,15,111,226 // movdqa %xmm2,%xmm4
+ .byte 102,65,15,96,224 // punpcklbw %xmm8,%xmm4
+ .byte 102,15,111,234 // movdqa %xmm2,%xmm5
+ .byte 102,65,15,104,232 // punpckhbw %xmm8,%xmm5
+ .byte 102,68,15,111,203 // movdqa %xmm3,%xmm9
+ .byte 102,69,15,96,200 // punpcklbw %xmm8,%xmm9
+ .byte 102,68,15,111,211 // movdqa %xmm3,%xmm10
+ .byte 102,69,15,104,208 // punpckhbw %xmm8,%xmm10
+ .byte 102,15,111,199 // movdqa %xmm7,%xmm0
+ .byte 102,65,15,96,192 // punpcklbw %xmm8,%xmm0
+ .byte 102,65,15,104,248 // punpckhbw %xmm8,%xmm7
+ .byte 102,15,111,206 // movdqa %xmm6,%xmm1
+ .byte 102,65,15,96,200 // punpcklbw %xmm8,%xmm1
+ .byte 102,65,15,104,240 // punpckhbw %xmm8,%xmm6
+ .byte 102,65,15,213,242 // pmullw %xmm10,%xmm6
+ .byte 102,65,15,213,201 // pmullw %xmm9,%xmm1
+ .byte 102,15,213,253 // pmullw %xmm5,%xmm7
.byte 102,15,213,196 // pmullw %xmm4,%xmm0
.byte 102,15,253,196 // paddw %xmm4,%xmm0
- .byte 102,15,253,245 // paddw %xmm5,%xmm6
+ .byte 102,15,253,253 // paddw %xmm5,%xmm7
+ .byte 102,65,15,253,201 // paddw %xmm9,%xmm1
+ .byte 102,65,15,253,242 // paddw %xmm10,%xmm6
.byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
.byte 102,15,113,208,8 // psrlw $0x8,%xmm0
- .byte 102,15,103,198 // packuswb %xmm6,%xmm0
- .byte 102,15,252,195 // paddb %xmm3,%xmm0
+ .byte 102,15,103,199 // packuswb %xmm7,%xmm0
+ .byte 102,15,103,206 // packuswb %xmm6,%xmm1
+ .byte 102,65,15,252,195 // paddb %xmm11,%xmm0
+ .byte 102,65,15,252,204 // paddb %xmm12,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
- .byte 65,128,225,3 // and $0x3,%r9b
- .byte 65,128,249,1 // cmp $0x1,%r9b
- .byte 116,61 // je b62 <_sk_lerp_u8_sse2_8bit+0x11c>
- .byte 102,15,239,210 // pxor %xmm2,%xmm2
- .byte 65,128,249,2 // cmp $0x2,%r9b
- .byte 116,25 // je b48 <_sk_lerp_u8_sse2_8bit+0x102>
- .byte 65,128,249,3 // cmp $0x3,%r9b
- .byte 15,133,60,255,255,255 // jne a75 <_sk_lerp_u8_sse2_8bit+0x2f>
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,237 // pxor %xmm5,%xmm5
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 15,135,104,254,255,255 // ja 133d <_sk_lerp_u8_sse2_8bit+0x2b>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 72,141,13,116,0,0,0 // lea 0x74(%rip),%rcx # 1554 <_sk_lerp_u8_sse2_8bit+0x242>
+ .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
+ .byte 72,1,200 // add %rcx,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
+ .byte 102,15,110,232 // movd %eax,%xmm5
+ .byte 233,70,254,255,255 // jmpq 133d <_sk_lerp_u8_sse2_8bit+0x2b>
.byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax
- .byte 102,15,110,208 // movd %eax,%xmm2
- .byte 102,15,112,210,69 // pshufd $0x45,%xmm2,%xmm2
+ .byte 102,15,239,237 // pxor %xmm5,%xmm5
+ .byte 102,15,196,232,2 // pinsrw $0x2,%eax,%xmm5
.byte 66,15,183,4,2 // movzwl (%rdx,%r8,1),%eax
- .byte 102,15,110,216 // movd %eax,%xmm3
- .byte 102,15,96,216 // punpcklbw %xmm0,%xmm3
- .byte 102,15,97,216 // punpcklwd %xmm0,%xmm3
- .byte 242,15,16,211 // movsd %xmm3,%xmm2
- .byte 233,19,255,255,255 // jmpq a75 <_sk_lerp_u8_sse2_8bit+0x2f>
- .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax
- .byte 102,15,110,208 // movd %eax,%xmm2
- .byte 233,5,255,255,255 // jmpq a75 <_sk_lerp_u8_sse2_8bit+0x2f>
+ .byte 102,15,110,224 // movd %eax,%xmm4
+ .byte 102,15,96,224 // punpcklbw %xmm0,%xmm4
+ .byte 243,15,16,236 // movss %xmm4,%xmm5
+ .byte 233,33,254,255,255 // jmpq 133d <_sk_lerp_u8_sse2_8bit+0x2b>
+ .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax
+ .byte 102,15,239,237 // pxor %xmm5,%xmm5
+ .byte 102,15,196,232,6 // pinsrw $0x6,%eax,%xmm5
+ .byte 66,15,182,68,2,5 // movzbl 0x5(%rdx,%r8,1),%eax
+ .byte 102,15,196,232,5 // pinsrw $0x5,%eax,%xmm5
+ .byte 66,15,182,68,2,4 // movzbl 0x4(%rdx,%r8,1),%eax
+ .byte 102,15,196,232,4 // pinsrw $0x4,%eax,%xmm5
+ .byte 102,66,15,110,36,2 // movd (%rdx,%r8,1),%xmm4
+ .byte 102,15,96,224 // punpcklbw %xmm0,%xmm4
+ .byte 242,15,16,236 // movsd %xmm4,%xmm5
+ .byte 233,233,253,255,255 // jmpq 133d <_sk_lerp_u8_sse2_8bit+0x2b>
+ .byte 149 // xchg %eax,%ebp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,178,255,255,255,163 // pushq -0x5c000001(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 237 // in (%dx),%eax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,226 // jmpq *%rdx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,215 // callq *%rdi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,200 // dec %eax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
HIDDEN _sk_move_src_dst_sse2_8bit
.globl _sk_move_src_dst_sse2_8bit
FUNCTION(_sk_move_src_dst_sse2_8bit)
_sk_move_src_dst_sse2_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 15,40,200 // movaps %xmm0,%xmm1
+ .byte 15,40,208 // movaps %xmm0,%xmm2
+ .byte 15,40,217 // movaps %xmm1,%xmm3
.byte 255,224 // jmpq *%rax
HIDDEN _sk_move_dst_src_sse2_8bit
@@ -60320,7 +63108,8 @@ HIDDEN _sk_move_dst_src_sse2_8bit
FUNCTION(_sk_move_dst_src_sse2_8bit)
_sk_move_dst_src_sse2_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 15,40,193 // movaps %xmm1,%xmm0
+ .byte 15,40,194 // movaps %xmm2,%xmm0
+ .byte 15,40,203 // movaps %xmm3,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_black_color_sse2_8bit
@@ -60328,7 +63117,8 @@ HIDDEN _sk_black_color_sse2_8bit
FUNCTION(_sk_black_color_sse2_8bit)
_sk_black_color_sse2_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 15,40,5,249,6,0,0 // movaps 0x6f9(%rip),%xmm0 # 1280 <_sk_xor__sse2_8bit+0x185>
+ .byte 15,40,5,51,14,0,0 // movaps 0xe33(%rip),%xmm0 # 23c0 <_sk_xor__sse2_8bit+0x2a7>
+ .byte 15,40,200 // movaps %xmm0,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_white_color_sse2_8bit
@@ -60337,6 +63127,7 @@ FUNCTION(_sk_white_color_sse2_8bit)
_sk_white_color_sse2_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 102,15,118,192 // pcmpeqd %xmm0,%xmm0
+ .byte 102,15,118,201 // pcmpeqd %xmm1,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_clear_sse2_8bit
@@ -60345,55 +63136,97 @@ FUNCTION(_sk_clear_sse2_8bit)
_sk_clear_sse2_8bit:
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 15,87,192 // xorps %xmm0,%xmm0
+ .byte 15,87,201 // xorps %xmm1,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_srcatop_sse2_8bit
.globl _sk_srcatop_sse2_8bit
FUNCTION(_sk_srcatop_sse2_8bit)
_sk_srcatop_sse2_8bit:
- .byte 242,15,112,209,231 // pshuflw $0xe7,%xmm1,%xmm2
- .byte 243,15,112,210,231 // pshufhw $0xe7,%xmm2,%xmm2
- .byte 102,15,112,210,232 // pshufd $0xe8,%xmm2,%xmm2
- .byte 102,15,96,210 // punpcklbw %xmm2,%xmm2
- .byte 242,15,112,210,95 // pshuflw $0x5f,%xmm2,%xmm2
- .byte 243,15,112,226,95 // pshufhw $0x5f,%xmm2,%xmm4
- .byte 102,15,239,219 // pxor %xmm3,%xmm3
- .byte 102,15,111,232 // movdqa %xmm0,%xmm5
- .byte 102,15,96,235 // punpcklbw %xmm3,%xmm5
- .byte 242,15,112,240,231 // pshuflw $0xe7,%xmm0,%xmm6
- .byte 102,15,104,195 // punpckhbw %xmm3,%xmm0
- .byte 102,15,111,212 // movdqa %xmm4,%xmm2
- .byte 102,15,96,211 // punpcklbw %xmm3,%xmm2
- .byte 102,15,104,227 // punpckhbw %xmm3,%xmm4
- .byte 102,15,213,224 // pmullw %xmm0,%xmm4
- .byte 102,15,213,213 // pmullw %xmm5,%xmm2
- .byte 102,15,253,213 // paddw %xmm5,%xmm2
- .byte 102,15,253,224 // paddw %xmm0,%xmm4
+ .byte 242,15,112,227,231 // pshuflw $0xe7,%xmm3,%xmm4
+ .byte 243,15,112,228,231 // pshufhw $0xe7,%xmm4,%xmm4
+ .byte 102,15,112,228,232 // pshufd $0xe8,%xmm4,%xmm4
+ .byte 102,15,96,228 // punpcklbw %xmm4,%xmm4
+ .byte 242,15,112,228,95 // pshuflw $0x5f,%xmm4,%xmm4
+ .byte 243,68,15,112,220,95 // pshufhw $0x5f,%xmm4,%xmm11
+ .byte 242,15,112,226,231 // pshuflw $0xe7,%xmm2,%xmm4
+ .byte 243,15,112,228,231 // pshufhw $0xe7,%xmm4,%xmm4
+ .byte 102,15,112,228,232 // pshufd $0xe8,%xmm4,%xmm4
+ .byte 102,15,96,228 // punpcklbw %xmm4,%xmm4
+ .byte 242,15,112,228,95 // pshuflw $0x5f,%xmm4,%xmm4
+ .byte 243,15,112,228,95 // pshufhw $0x5f,%xmm4,%xmm4
+ .byte 102,69,15,239,201 // pxor %xmm9,%xmm9
+ .byte 102,68,15,111,192 // movdqa %xmm0,%xmm8
+ .byte 242,68,15,112,208,231 // pshuflw $0xe7,%xmm0,%xmm10
+ .byte 102,65,15,96,193 // punpcklbw %xmm9,%xmm0
+ .byte 102,69,15,104,193 // punpckhbw %xmm9,%xmm8
+ .byte 102,15,111,249 // movdqa %xmm1,%xmm7
+ .byte 242,68,15,112,225,231 // pshuflw $0xe7,%xmm1,%xmm12
+ .byte 102,65,15,96,201 // punpcklbw %xmm9,%xmm1
+ .byte 102,65,15,104,249 // punpckhbw %xmm9,%xmm7
+ .byte 102,15,111,244 // movdqa %xmm4,%xmm6
+ .byte 102,65,15,96,241 // punpcklbw %xmm9,%xmm6
+ .byte 102,65,15,104,225 // punpckhbw %xmm9,%xmm4
+ .byte 102,65,15,111,235 // movdqa %xmm11,%xmm5
+ .byte 102,65,15,96,233 // punpcklbw %xmm9,%xmm5
+ .byte 102,69,15,104,217 // punpckhbw %xmm9,%xmm11
+ .byte 102,68,15,213,223 // pmullw %xmm7,%xmm11
+ .byte 102,15,213,233 // pmullw %xmm1,%xmm5
+ .byte 102,65,15,213,224 // pmullw %xmm8,%xmm4
+ .byte 102,15,213,240 // pmullw %xmm0,%xmm6
+ .byte 102,15,253,240 // paddw %xmm0,%xmm6
+ .byte 102,65,15,253,224 // paddw %xmm8,%xmm4
+ .byte 102,15,253,233 // paddw %xmm1,%xmm5
+ .byte 102,68,15,253,223 // paddw %xmm7,%xmm11
+ .byte 102,65,15,113,211,8 // psrlw $0x8,%xmm11
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
.byte 102,15,113,212,8 // psrlw $0x8,%xmm4
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,212 // packuswb %xmm4,%xmm2
- .byte 243,15,112,198,231 // pshufhw $0xe7,%xmm6,%xmm0
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,103,244 // packuswb %xmm4,%xmm6
+ .byte 102,65,15,103,235 // packuswb %xmm11,%xmm5
+ .byte 243,65,15,112,194,231 // pshufhw $0xe7,%xmm10,%xmm0
.byte 102,15,112,192,232 // pshufd $0xe8,%xmm0,%xmm0
.byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
.byte 242,15,112,192,95 // pshuflw $0x5f,%xmm0,%xmm0
- .byte 243,15,112,192,95 // pshufhw $0x5f,%xmm0,%xmm0
- .byte 102,15,118,228 // pcmpeqd %xmm4,%xmm4
+ .byte 243,15,112,224,95 // pshufhw $0x5f,%xmm0,%xmm4
+ .byte 243,65,15,112,196,231 // pshufhw $0xe7,%xmm12,%xmm0
+ .byte 102,15,112,192,232 // pshufd $0xe8,%xmm0,%xmm0
+ .byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
+ .byte 242,15,112,192,95 // pshuflw $0x5f,%xmm0,%xmm0
+ .byte 243,15,112,248,95 // pshufhw $0x5f,%xmm0,%xmm7
+ .byte 102,15,118,192 // pcmpeqd %xmm0,%xmm0
+ .byte 102,15,239,248 // pxor %xmm0,%xmm7
.byte 102,15,239,224 // pxor %xmm0,%xmm4
- .byte 102,15,111,233 // movdqa %xmm1,%xmm5
- .byte 102,15,111,241 // movdqa %xmm1,%xmm6
- .byte 102,15,96,243 // punpcklbw %xmm3,%xmm6
- .byte 102,15,104,235 // punpckhbw %xmm3,%xmm5
+ .byte 102,68,15,111,194 // movdqa %xmm2,%xmm8
+ .byte 102,68,15,111,210 // movdqa %xmm2,%xmm10
+ .byte 102,69,15,96,209 // punpcklbw %xmm9,%xmm10
+ .byte 102,69,15,104,193 // punpckhbw %xmm9,%xmm8
+ .byte 102,68,15,111,219 // movdqa %xmm3,%xmm11
+ .byte 102,68,15,111,227 // movdqa %xmm3,%xmm12
+ .byte 102,69,15,96,225 // punpcklbw %xmm9,%xmm12
+ .byte 102,69,15,104,217 // punpckhbw %xmm9,%xmm11
.byte 102,15,111,196 // movdqa %xmm4,%xmm0
- .byte 102,15,96,195 // punpcklbw %xmm3,%xmm0
- .byte 102,15,104,227 // punpckhbw %xmm3,%xmm4
- .byte 102,15,213,229 // pmullw %xmm5,%xmm4
- .byte 102,15,213,198 // pmullw %xmm6,%xmm0
- .byte 102,15,253,198 // paddw %xmm6,%xmm0
- .byte 102,15,253,229 // paddw %xmm5,%xmm4
+ .byte 102,65,15,96,193 // punpcklbw %xmm9,%xmm0
+ .byte 102,65,15,104,225 // punpckhbw %xmm9,%xmm4
+ .byte 102,15,111,207 // movdqa %xmm7,%xmm1
+ .byte 102,65,15,96,201 // punpcklbw %xmm9,%xmm1
+ .byte 102,65,15,104,249 // punpckhbw %xmm9,%xmm7
+ .byte 102,65,15,213,251 // pmullw %xmm11,%xmm7
+ .byte 102,65,15,213,204 // pmullw %xmm12,%xmm1
+ .byte 102,65,15,213,224 // pmullw %xmm8,%xmm4
+ .byte 102,65,15,213,194 // pmullw %xmm10,%xmm0
+ .byte 102,65,15,253,194 // paddw %xmm10,%xmm0
+ .byte 102,65,15,253,224 // paddw %xmm8,%xmm4
+ .byte 102,65,15,253,204 // paddw %xmm12,%xmm1
+ .byte 102,65,15,253,251 // paddw %xmm11,%xmm7
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
.byte 102,15,113,212,8 // psrlw $0x8,%xmm4
.byte 102,15,113,208,8 // psrlw $0x8,%xmm0
.byte 102,15,103,196 // packuswb %xmm4,%xmm0
- .byte 102,15,252,194 // paddb %xmm2,%xmm0
+ .byte 102,15,103,207 // packuswb %xmm7,%xmm1
+ .byte 102,15,252,198 // paddb %xmm6,%xmm0
+ .byte 102,15,252,205 // paddb %xmm5,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -60401,106 +63234,188 @@ HIDDEN _sk_dstatop_sse2_8bit
.globl _sk_dstatop_sse2_8bit
FUNCTION(_sk_dstatop_sse2_8bit)
_sk_dstatop_sse2_8bit:
- .byte 242,15,112,208,231 // pshuflw $0xe7,%xmm0,%xmm2
- .byte 243,15,112,210,231 // pshufhw $0xe7,%xmm2,%xmm2
- .byte 102,15,112,210,232 // pshufd $0xe8,%xmm2,%xmm2
- .byte 102,15,96,210 // punpcklbw %xmm2,%xmm2
- .byte 242,15,112,210,95 // pshuflw $0x5f,%xmm2,%xmm2
- .byte 243,15,112,210,95 // pshufhw $0x5f,%xmm2,%xmm2
- .byte 102,15,239,219 // pxor %xmm3,%xmm3
- .byte 102,15,111,225 // movdqa %xmm1,%xmm4
- .byte 242,15,112,233,231 // pshuflw $0xe7,%xmm1,%xmm5
- .byte 102,15,111,241 // movdqa %xmm1,%xmm6
- .byte 102,15,96,243 // punpcklbw %xmm3,%xmm6
+ .byte 242,15,112,225,231 // pshuflw $0xe7,%xmm1,%xmm4
+ .byte 243,15,112,228,231 // pshufhw $0xe7,%xmm4,%xmm4
+ .byte 102,15,112,228,232 // pshufd $0xe8,%xmm4,%xmm4
+ .byte 102,15,96,228 // punpcklbw %xmm4,%xmm4
+ .byte 242,15,112,228,95 // pshuflw $0x5f,%xmm4,%xmm4
+ .byte 243,15,112,228,95 // pshufhw $0x5f,%xmm4,%xmm4
+ .byte 242,15,112,232,231 // pshuflw $0xe7,%xmm0,%xmm5
+ .byte 243,15,112,237,231 // pshufhw $0xe7,%xmm5,%xmm5
+ .byte 102,15,112,237,232 // pshufd $0xe8,%xmm5,%xmm5
+ .byte 102,15,96,237 // punpcklbw %xmm5,%xmm5
+ .byte 242,15,112,237,95 // pshuflw $0x5f,%xmm5,%xmm5
+ .byte 243,15,112,237,95 // pshufhw $0x5f,%xmm5,%xmm5
+ .byte 102,69,15,239,192 // pxor %xmm8,%xmm8
+ .byte 102,68,15,111,210 // movdqa %xmm2,%xmm10
+ .byte 242,68,15,112,218,231 // pshuflw $0xe7,%xmm2,%xmm11
.byte 102,15,111,250 // movdqa %xmm2,%xmm7
- .byte 102,15,96,251 // punpcklbw %xmm3,%xmm7
- .byte 102,15,213,254 // pmullw %xmm6,%xmm7
- .byte 102,15,253,254 // paddw %xmm6,%xmm7
- .byte 102,15,104,227 // punpckhbw %xmm3,%xmm4
- .byte 102,15,104,211 // punpckhbw %xmm3,%xmm2
- .byte 102,15,213,212 // pmullw %xmm4,%xmm2
- .byte 102,15,253,212 // paddw %xmm4,%xmm2
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
- .byte 102,15,103,250 // packuswb %xmm2,%xmm7
- .byte 243,15,112,213,231 // pshufhw $0xe7,%xmm5,%xmm2
- .byte 102,15,112,210,232 // pshufd $0xe8,%xmm2,%xmm2
- .byte 102,15,96,210 // punpcklbw %xmm2,%xmm2
- .byte 242,15,112,210,95 // pshuflw $0x5f,%xmm2,%xmm2
- .byte 243,15,112,210,95 // pshufhw $0x5f,%xmm2,%xmm2
+ .byte 102,65,15,96,248 // punpcklbw %xmm8,%xmm7
+ .byte 102,69,15,104,208 // punpckhbw %xmm8,%xmm10
+ .byte 102,15,111,243 // movdqa %xmm3,%xmm6
+ .byte 102,68,15,111,205 // movdqa %xmm5,%xmm9
+ .byte 102,69,15,96,200 // punpcklbw %xmm8,%xmm9
+ .byte 102,68,15,213,207 // pmullw %xmm7,%xmm9
+ .byte 102,68,15,253,207 // paddw %xmm7,%xmm9
+ .byte 242,68,15,112,227,231 // pshuflw $0xe7,%xmm3,%xmm12
+ .byte 102,15,111,251 // movdqa %xmm3,%xmm7
+ .byte 102,65,15,96,248 // punpcklbw %xmm8,%xmm7
+ .byte 102,65,15,104,240 // punpckhbw %xmm8,%xmm6
+ .byte 102,65,15,104,232 // punpckhbw %xmm8,%xmm5
+ .byte 102,65,15,213,234 // pmullw %xmm10,%xmm5
+ .byte 102,65,15,253,234 // paddw %xmm10,%xmm5
+ .byte 102,68,15,111,212 // movdqa %xmm4,%xmm10
+ .byte 102,69,15,96,208 // punpcklbw %xmm8,%xmm10
+ .byte 102,65,15,104,224 // punpckhbw %xmm8,%xmm4
+ .byte 102,15,213,230 // pmullw %xmm6,%xmm4
+ .byte 102,68,15,213,215 // pmullw %xmm7,%xmm10
+ .byte 102,68,15,253,215 // paddw %xmm7,%xmm10
+ .byte 102,15,253,230 // paddw %xmm6,%xmm4
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,65,15,113,210,8 // psrlw $0x8,%xmm10
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,65,15,113,209,8 // psrlw $0x8,%xmm9
+ .byte 102,68,15,103,205 // packuswb %xmm5,%xmm9
+ .byte 102,68,15,103,212 // packuswb %xmm4,%xmm10
+ .byte 243,65,15,112,227,231 // pshufhw $0xe7,%xmm11,%xmm4
+ .byte 102,15,112,228,232 // pshufd $0xe8,%xmm4,%xmm4
+ .byte 102,15,96,228 // punpcklbw %xmm4,%xmm4
+ .byte 242,15,112,228,95 // pshuflw $0x5f,%xmm4,%xmm4
+ .byte 243,15,112,252,95 // pshufhw $0x5f,%xmm4,%xmm7
+ .byte 243,65,15,112,228,231 // pshufhw $0xe7,%xmm12,%xmm4
+ .byte 102,15,112,228,232 // pshufd $0xe8,%xmm4,%xmm4
+ .byte 102,15,96,228 // punpcklbw %xmm4,%xmm4
+ .byte 242,15,112,228,95 // pshuflw $0x5f,%xmm4,%xmm4
+ .byte 243,15,112,244,95 // pshufhw $0x5f,%xmm4,%xmm6
.byte 102,15,118,228 // pcmpeqd %xmm4,%xmm4
- .byte 102,15,239,226 // pxor %xmm2,%xmm4
- .byte 102,15,111,208 // movdqa %xmm0,%xmm2
- .byte 102,15,96,211 // punpcklbw %xmm3,%xmm2
- .byte 102,15,104,195 // punpckhbw %xmm3,%xmm0
- .byte 102,15,111,236 // movdqa %xmm4,%xmm5
- .byte 102,15,96,235 // punpcklbw %xmm3,%xmm5
- .byte 102,15,104,227 // punpckhbw %xmm3,%xmm4
- .byte 102,15,213,224 // pmullw %xmm0,%xmm4
- .byte 102,15,213,234 // pmullw %xmm2,%xmm5
- .byte 102,15,253,213 // paddw %xmm5,%xmm2
- .byte 102,15,253,224 // paddw %xmm0,%xmm4
+ .byte 102,15,239,244 // pxor %xmm4,%xmm6
+ .byte 102,15,239,252 // pxor %xmm4,%xmm7
+ .byte 102,15,111,224 // movdqa %xmm0,%xmm4
+ .byte 102,65,15,96,224 // punpcklbw %xmm8,%xmm4
+ .byte 102,65,15,104,192 // punpckhbw %xmm8,%xmm0
+ .byte 102,15,111,233 // movdqa %xmm1,%xmm5
+ .byte 102,65,15,96,232 // punpcklbw %xmm8,%xmm5
+ .byte 102,65,15,104,200 // punpckhbw %xmm8,%xmm1
+ .byte 102,68,15,111,223 // movdqa %xmm7,%xmm11
+ .byte 102,69,15,96,216 // punpcklbw %xmm8,%xmm11
+ .byte 102,65,15,104,248 // punpckhbw %xmm8,%xmm7
+ .byte 102,68,15,111,230 // movdqa %xmm6,%xmm12
+ .byte 102,69,15,96,224 // punpcklbw %xmm8,%xmm12
+ .byte 102,65,15,104,240 // punpckhbw %xmm8,%xmm6
+ .byte 102,15,213,241 // pmullw %xmm1,%xmm6
+ .byte 102,68,15,213,229 // pmullw %xmm5,%xmm12
+ .byte 102,15,213,248 // pmullw %xmm0,%xmm7
+ .byte 102,68,15,213,220 // pmullw %xmm4,%xmm11
+ .byte 102,65,15,253,227 // paddw %xmm11,%xmm4
+ .byte 102,15,253,248 // paddw %xmm0,%xmm7
+ .byte 102,65,15,253,236 // paddw %xmm12,%xmm5
+ .byte 102,15,253,241 // paddw %xmm1,%xmm6
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
.byte 102,15,113,212,8 // psrlw $0x8,%xmm4
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,212 // packuswb %xmm4,%xmm2
- .byte 102,15,252,215 // paddb %xmm7,%xmm2
+ .byte 102,15,103,231 // packuswb %xmm7,%xmm4
+ .byte 102,15,103,238 // packuswb %xmm6,%xmm5
+ .byte 102,65,15,252,225 // paddb %xmm9,%xmm4
+ .byte 102,65,15,252,234 // paddb %xmm10,%xmm5
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 102,15,111,194 // movdqa %xmm2,%xmm0
+ .byte 102,15,111,196 // movdqa %xmm4,%xmm0
+ .byte 102,15,111,205 // movdqa %xmm5,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_srcin_sse2_8bit
.globl _sk_srcin_sse2_8bit
FUNCTION(_sk_srcin_sse2_8bit)
_sk_srcin_sse2_8bit:
- .byte 242,15,112,209,231 // pshuflw $0xe7,%xmm1,%xmm2
- .byte 243,15,112,210,231 // pshufhw $0xe7,%xmm2,%xmm2
- .byte 102,15,112,210,232 // pshufd $0xe8,%xmm2,%xmm2
- .byte 102,15,96,210 // punpcklbw %xmm2,%xmm2
- .byte 242,15,112,210,95 // pshuflw $0x5f,%xmm2,%xmm2
- .byte 243,15,112,218,95 // pshufhw $0x5f,%xmm2,%xmm3
- .byte 102,15,239,228 // pxor %xmm4,%xmm4
- .byte 102,15,111,208 // movdqa %xmm0,%xmm2
- .byte 102,15,96,212 // punpcklbw %xmm4,%xmm2
- .byte 102,15,104,196 // punpckhbw %xmm4,%xmm0
- .byte 102,15,111,235 // movdqa %xmm3,%xmm5
- .byte 102,15,96,236 // punpcklbw %xmm4,%xmm5
- .byte 102,15,104,220 // punpckhbw %xmm4,%xmm3
- .byte 102,15,213,216 // pmullw %xmm0,%xmm3
- .byte 102,15,213,234 // pmullw %xmm2,%xmm5
- .byte 102,15,253,213 // paddw %xmm5,%xmm2
- .byte 102,15,253,216 // paddw %xmm0,%xmm3
- .byte 102,15,113,211,8 // psrlw $0x8,%xmm3
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,211 // packuswb %xmm3,%xmm2
+ .byte 102,68,15,111,192 // movdqa %xmm0,%xmm8
+ .byte 242,15,112,195,231 // pshuflw $0xe7,%xmm3,%xmm0
+ .byte 243,15,112,192,231 // pshufhw $0xe7,%xmm0,%xmm0
+ .byte 102,15,112,192,232 // pshufd $0xe8,%xmm0,%xmm0
+ .byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
+ .byte 242,15,112,192,95 // pshuflw $0x5f,%xmm0,%xmm0
+ .byte 243,15,112,240,95 // pshufhw $0x5f,%xmm0,%xmm6
+ .byte 242,15,112,194,231 // pshuflw $0xe7,%xmm2,%xmm0
+ .byte 243,15,112,192,231 // pshufhw $0xe7,%xmm0,%xmm0
+ .byte 102,15,112,192,232 // pshufd $0xe8,%xmm0,%xmm0
+ .byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
+ .byte 242,15,112,192,95 // pshuflw $0x5f,%xmm0,%xmm0
+ .byte 243,15,112,248,95 // pshufhw $0x5f,%xmm0,%xmm7
+ .byte 102,69,15,239,201 // pxor %xmm9,%xmm9
+ .byte 102,65,15,111,192 // movdqa %xmm8,%xmm0
+ .byte 102,65,15,96,193 // punpcklbw %xmm9,%xmm0
+ .byte 102,69,15,104,193 // punpckhbw %xmm9,%xmm8
+ .byte 102,15,111,225 // movdqa %xmm1,%xmm4
+ .byte 102,65,15,96,225 // punpcklbw %xmm9,%xmm4
+ .byte 102,65,15,104,201 // punpckhbw %xmm9,%xmm1
+ .byte 102,15,111,239 // movdqa %xmm7,%xmm5
+ .byte 102,65,15,96,233 // punpcklbw %xmm9,%xmm5
+ .byte 102,65,15,104,249 // punpckhbw %xmm9,%xmm7
+ .byte 102,68,15,111,214 // movdqa %xmm6,%xmm10
+ .byte 102,69,15,96,209 // punpcklbw %xmm9,%xmm10
+ .byte 102,65,15,104,241 // punpckhbw %xmm9,%xmm6
+ .byte 102,15,213,241 // pmullw %xmm1,%xmm6
+ .byte 102,68,15,213,212 // pmullw %xmm4,%xmm10
+ .byte 102,65,15,213,248 // pmullw %xmm8,%xmm7
+ .byte 102,15,213,232 // pmullw %xmm0,%xmm5
+ .byte 102,15,253,197 // paddw %xmm5,%xmm0
+ .byte 102,65,15,253,248 // paddw %xmm8,%xmm7
+ .byte 102,65,15,253,226 // paddw %xmm10,%xmm4
+ .byte 102,15,253,241 // paddw %xmm1,%xmm6
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,208,8 // psrlw $0x8,%xmm0
+ .byte 102,15,103,199 // packuswb %xmm7,%xmm0
+ .byte 102,15,103,230 // packuswb %xmm6,%xmm4
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 102,15,111,194 // movdqa %xmm2,%xmm0
+ .byte 102,15,111,204 // movdqa %xmm4,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_dstin_sse2_8bit
.globl _sk_dstin_sse2_8bit
FUNCTION(_sk_dstin_sse2_8bit)
_sk_dstin_sse2_8bit:
+ .byte 242,15,112,201,231 // pshuflw $0xe7,%xmm1,%xmm1
+ .byte 243,15,112,201,231 // pshufhw $0xe7,%xmm1,%xmm1
+ .byte 102,15,112,201,232 // pshufd $0xe8,%xmm1,%xmm1
+ .byte 102,15,96,201 // punpcklbw %xmm1,%xmm1
+ .byte 242,15,112,201,95 // pshuflw $0x5f,%xmm1,%xmm1
+ .byte 243,15,112,225,95 // pshufhw $0x5f,%xmm1,%xmm4
.byte 242,15,112,192,231 // pshuflw $0xe7,%xmm0,%xmm0
.byte 243,15,112,192,231 // pshufhw $0xe7,%xmm0,%xmm0
.byte 102,15,112,192,232 // pshufd $0xe8,%xmm0,%xmm0
.byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
.byte 242,15,112,192,95 // pshuflw $0x5f,%xmm0,%xmm0
- .byte 243,15,112,208,95 // pshufhw $0x5f,%xmm0,%xmm2
- .byte 102,15,239,219 // pxor %xmm3,%xmm3
- .byte 102,15,111,225 // movdqa %xmm1,%xmm4
- .byte 102,15,111,233 // movdqa %xmm1,%xmm5
- .byte 102,15,96,235 // punpcklbw %xmm3,%xmm5
- .byte 102,15,104,227 // punpckhbw %xmm3,%xmm4
- .byte 102,15,111,194 // movdqa %xmm2,%xmm0
- .byte 102,15,96,195 // punpcklbw %xmm3,%xmm0
- .byte 102,15,104,211 // punpckhbw %xmm3,%xmm2
- .byte 102,15,213,212 // pmullw %xmm4,%xmm2
- .byte 102,15,213,197 // pmullw %xmm5,%xmm0
- .byte 102,15,253,197 // paddw %xmm5,%xmm0
- .byte 102,15,253,212 // paddw %xmm4,%xmm2
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
+ .byte 243,15,112,232,95 // pshufhw $0x5f,%xmm0,%xmm5
+ .byte 102,15,239,246 // pxor %xmm6,%xmm6
+ .byte 102,68,15,111,194 // movdqa %xmm2,%xmm8
+ .byte 102,15,111,250 // movdqa %xmm2,%xmm7
+ .byte 102,15,96,254 // punpcklbw %xmm6,%xmm7
+ .byte 102,68,15,104,198 // punpckhbw %xmm6,%xmm8
+ .byte 102,68,15,111,203 // movdqa %xmm3,%xmm9
+ .byte 102,68,15,111,211 // movdqa %xmm3,%xmm10
+ .byte 102,68,15,96,214 // punpcklbw %xmm6,%xmm10
+ .byte 102,68,15,104,206 // punpckhbw %xmm6,%xmm9
+ .byte 102,15,111,197 // movdqa %xmm5,%xmm0
+ .byte 102,15,96,198 // punpcklbw %xmm6,%xmm0
+ .byte 102,15,104,238 // punpckhbw %xmm6,%xmm5
+ .byte 102,15,111,204 // movdqa %xmm4,%xmm1
+ .byte 102,15,96,206 // punpcklbw %xmm6,%xmm1
+ .byte 102,15,104,230 // punpckhbw %xmm6,%xmm4
+ .byte 102,65,15,213,225 // pmullw %xmm9,%xmm4
+ .byte 102,65,15,213,202 // pmullw %xmm10,%xmm1
+ .byte 102,65,15,213,232 // pmullw %xmm8,%xmm5
+ .byte 102,15,213,199 // pmullw %xmm7,%xmm0
+ .byte 102,15,253,199 // paddw %xmm7,%xmm0
+ .byte 102,65,15,253,232 // paddw %xmm8,%xmm5
+ .byte 102,65,15,253,202 // paddw %xmm10,%xmm1
+ .byte 102,65,15,253,225 // paddw %xmm9,%xmm4
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
.byte 102,15,113,208,8 // psrlw $0x8,%xmm0
- .byte 102,15,103,194 // packuswb %xmm2,%xmm0
+ .byte 102,15,103,197 // packuswb %xmm5,%xmm0
+ .byte 102,15,103,204 // packuswb %xmm4,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -60508,30 +63423,51 @@ HIDDEN _sk_srcout_sse2_8bit
.globl _sk_srcout_sse2_8bit
FUNCTION(_sk_srcout_sse2_8bit)
_sk_srcout_sse2_8bit:
- .byte 242,15,112,209,231 // pshuflw $0xe7,%xmm1,%xmm2
- .byte 243,15,112,210,231 // pshufhw $0xe7,%xmm2,%xmm2
- .byte 102,15,112,210,232 // pshufd $0xe8,%xmm2,%xmm2
- .byte 102,15,96,210 // punpcklbw %xmm2,%xmm2
- .byte 242,15,112,210,95 // pshuflw $0x5f,%xmm2,%xmm2
- .byte 243,15,112,210,95 // pshufhw $0x5f,%xmm2,%xmm2
- .byte 102,15,118,219 // pcmpeqd %xmm3,%xmm3
- .byte 102,15,239,218 // pxor %xmm2,%xmm3
- .byte 102,15,239,228 // pxor %xmm4,%xmm4
- .byte 102,15,111,208 // movdqa %xmm0,%xmm2
- .byte 102,15,96,212 // punpcklbw %xmm4,%xmm2
- .byte 102,15,104,196 // punpckhbw %xmm4,%xmm0
- .byte 102,15,111,235 // movdqa %xmm3,%xmm5
- .byte 102,15,96,236 // punpcklbw %xmm4,%xmm5
- .byte 102,15,104,220 // punpckhbw %xmm4,%xmm3
- .byte 102,15,213,216 // pmullw %xmm0,%xmm3
- .byte 102,15,213,234 // pmullw %xmm2,%xmm5
- .byte 102,15,253,213 // paddw %xmm5,%xmm2
- .byte 102,15,253,216 // paddw %xmm0,%xmm3
- .byte 102,15,113,211,8 // psrlw $0x8,%xmm3
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,211 // packuswb %xmm3,%xmm2
+ .byte 102,68,15,111,192 // movdqa %xmm0,%xmm8
+ .byte 242,15,112,194,231 // pshuflw $0xe7,%xmm2,%xmm0
+ .byte 243,15,112,192,231 // pshufhw $0xe7,%xmm0,%xmm0
+ .byte 102,15,112,192,232 // pshufd $0xe8,%xmm0,%xmm0
+ .byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
+ .byte 242,15,112,192,95 // pshuflw $0x5f,%xmm0,%xmm0
+ .byte 243,15,112,240,95 // pshufhw $0x5f,%xmm0,%xmm6
+ .byte 242,15,112,195,231 // pshuflw $0xe7,%xmm3,%xmm0
+ .byte 243,15,112,192,231 // pshufhw $0xe7,%xmm0,%xmm0
+ .byte 102,15,112,192,232 // pshufd $0xe8,%xmm0,%xmm0
+ .byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
+ .byte 242,15,112,192,95 // pshuflw $0x5f,%xmm0,%xmm0
+ .byte 243,15,112,248,95 // pshufhw $0x5f,%xmm0,%xmm7
+ .byte 102,15,118,192 // pcmpeqd %xmm0,%xmm0
+ .byte 102,15,239,248 // pxor %xmm0,%xmm7
+ .byte 102,15,239,240 // pxor %xmm0,%xmm6
+ .byte 102,69,15,239,201 // pxor %xmm9,%xmm9
+ .byte 102,65,15,111,192 // movdqa %xmm8,%xmm0
+ .byte 102,65,15,96,193 // punpcklbw %xmm9,%xmm0
+ .byte 102,69,15,104,193 // punpckhbw %xmm9,%xmm8
+ .byte 102,15,111,233 // movdqa %xmm1,%xmm5
+ .byte 102,65,15,96,233 // punpcklbw %xmm9,%xmm5
+ .byte 102,65,15,104,201 // punpckhbw %xmm9,%xmm1
+ .byte 102,15,111,230 // movdqa %xmm6,%xmm4
+ .byte 102,65,15,96,225 // punpcklbw %xmm9,%xmm4
+ .byte 102,65,15,104,241 // punpckhbw %xmm9,%xmm6
+ .byte 102,68,15,111,215 // movdqa %xmm7,%xmm10
+ .byte 102,69,15,96,209 // punpcklbw %xmm9,%xmm10
+ .byte 102,65,15,104,249 // punpckhbw %xmm9,%xmm7
+ .byte 102,15,213,249 // pmullw %xmm1,%xmm7
+ .byte 102,68,15,213,213 // pmullw %xmm5,%xmm10
+ .byte 102,65,15,213,240 // pmullw %xmm8,%xmm6
+ .byte 102,15,213,224 // pmullw %xmm0,%xmm4
+ .byte 102,15,253,196 // paddw %xmm4,%xmm0
+ .byte 102,65,15,253,240 // paddw %xmm8,%xmm6
+ .byte 102,65,15,253,234 // paddw %xmm10,%xmm5
+ .byte 102,15,253,249 // paddw %xmm1,%xmm7
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,113,208,8 // psrlw $0x8,%xmm0
+ .byte 102,15,103,198 // packuswb %xmm6,%xmm0
+ .byte 102,15,103,239 // packuswb %xmm7,%xmm5
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 102,15,111,194 // movdqa %xmm2,%xmm0
+ .byte 102,15,111,205 // movdqa %xmm5,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_dstout_sse2_8bit
@@ -60543,24 +63479,45 @@ _sk_dstout_sse2_8bit:
.byte 102,15,112,192,232 // pshufd $0xe8,%xmm0,%xmm0
.byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
.byte 242,15,112,192,95 // pshuflw $0x5f,%xmm0,%xmm0
- .byte 243,15,112,192,95 // pshufhw $0x5f,%xmm0,%xmm0
- .byte 102,15,118,210 // pcmpeqd %xmm2,%xmm2
- .byte 102,15,239,208 // pxor %xmm0,%xmm2
- .byte 102,15,239,219 // pxor %xmm3,%xmm3
- .byte 102,15,111,225 // movdqa %xmm1,%xmm4
- .byte 102,15,111,233 // movdqa %xmm1,%xmm5
- .byte 102,15,96,235 // punpcklbw %xmm3,%xmm5
- .byte 102,15,104,227 // punpckhbw %xmm3,%xmm4
- .byte 102,15,111,194 // movdqa %xmm2,%xmm0
- .byte 102,15,96,195 // punpcklbw %xmm3,%xmm0
- .byte 102,15,104,211 // punpckhbw %xmm3,%xmm2
- .byte 102,15,213,212 // pmullw %xmm4,%xmm2
- .byte 102,15,213,197 // pmullw %xmm5,%xmm0
- .byte 102,15,253,197 // paddw %xmm5,%xmm0
- .byte 102,15,253,212 // paddw %xmm4,%xmm2
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
+ .byte 243,15,112,224,95 // pshufhw $0x5f,%xmm0,%xmm4
+ .byte 242,15,112,193,231 // pshuflw $0xe7,%xmm1,%xmm0
+ .byte 243,15,112,192,231 // pshufhw $0xe7,%xmm0,%xmm0
+ .byte 102,15,112,192,232 // pshufd $0xe8,%xmm0,%xmm0
+ .byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
+ .byte 242,15,112,192,95 // pshuflw $0x5f,%xmm0,%xmm0
+ .byte 243,15,112,232,95 // pshufhw $0x5f,%xmm0,%xmm5
+ .byte 102,15,118,192 // pcmpeqd %xmm0,%xmm0
+ .byte 102,15,239,232 // pxor %xmm0,%xmm5
+ .byte 102,15,239,224 // pxor %xmm0,%xmm4
+ .byte 102,15,239,246 // pxor %xmm6,%xmm6
+ .byte 102,68,15,111,194 // movdqa %xmm2,%xmm8
+ .byte 102,15,111,250 // movdqa %xmm2,%xmm7
+ .byte 102,15,96,254 // punpcklbw %xmm6,%xmm7
+ .byte 102,68,15,104,198 // punpckhbw %xmm6,%xmm8
+ .byte 102,68,15,111,203 // movdqa %xmm3,%xmm9
+ .byte 102,68,15,111,211 // movdqa %xmm3,%xmm10
+ .byte 102,68,15,96,214 // punpcklbw %xmm6,%xmm10
+ .byte 102,68,15,104,206 // punpckhbw %xmm6,%xmm9
+ .byte 102,15,111,196 // movdqa %xmm4,%xmm0
+ .byte 102,15,96,198 // punpcklbw %xmm6,%xmm0
+ .byte 102,15,104,230 // punpckhbw %xmm6,%xmm4
+ .byte 102,15,111,205 // movdqa %xmm5,%xmm1
+ .byte 102,15,96,206 // punpcklbw %xmm6,%xmm1
+ .byte 102,15,104,238 // punpckhbw %xmm6,%xmm5
+ .byte 102,65,15,213,233 // pmullw %xmm9,%xmm5
+ .byte 102,65,15,213,202 // pmullw %xmm10,%xmm1
+ .byte 102,65,15,213,224 // pmullw %xmm8,%xmm4
+ .byte 102,15,213,199 // pmullw %xmm7,%xmm0
+ .byte 102,15,253,199 // paddw %xmm7,%xmm0
+ .byte 102,65,15,253,224 // paddw %xmm8,%xmm4
+ .byte 102,65,15,253,202 // paddw %xmm10,%xmm1
+ .byte 102,65,15,253,233 // paddw %xmm9,%xmm5
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
.byte 102,15,113,208,8 // psrlw $0x8,%xmm0
- .byte 102,15,103,194 // packuswb %xmm2,%xmm0
+ .byte 102,15,103,196 // packuswb %xmm4,%xmm0
+ .byte 102,15,103,205 // packuswb %xmm5,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -60568,29 +63525,51 @@ HIDDEN _sk_srcover_sse2_8bit
.globl _sk_srcover_sse2_8bit
FUNCTION(_sk_srcover_sse2_8bit)
_sk_srcover_sse2_8bit:
- .byte 242,15,112,208,231 // pshuflw $0xe7,%xmm0,%xmm2
- .byte 243,15,112,210,231 // pshufhw $0xe7,%xmm2,%xmm2
- .byte 102,15,112,210,232 // pshufd $0xe8,%xmm2,%xmm2
- .byte 102,15,96,210 // punpcklbw %xmm2,%xmm2
- .byte 242,15,112,210,95 // pshuflw $0x5f,%xmm2,%xmm2
- .byte 243,15,112,210,95 // pshufhw $0x5f,%xmm2,%xmm2
- .byte 102,15,239,219 // pxor %xmm3,%xmm3
- .byte 102,15,111,225 // movdqa %xmm1,%xmm4
- .byte 102,15,252,193 // paddb %xmm1,%xmm0
- .byte 102,15,111,233 // movdqa %xmm1,%xmm5
- .byte 102,15,96,235 // punpcklbw %xmm3,%xmm5
- .byte 102,15,104,227 // punpckhbw %xmm3,%xmm4
+ .byte 242,15,112,225,231 // pshuflw $0xe7,%xmm1,%xmm4
+ .byte 243,15,112,228,231 // pshufhw $0xe7,%xmm4,%xmm4
+ .byte 102,15,112,228,232 // pshufd $0xe8,%xmm4,%xmm4
+ .byte 102,15,96,228 // punpcklbw %xmm4,%xmm4
+ .byte 242,15,112,228,95 // pshuflw $0x5f,%xmm4,%xmm4
+ .byte 243,68,15,112,212,95 // pshufhw $0x5f,%xmm4,%xmm10
+ .byte 242,15,112,232,231 // pshuflw $0xe7,%xmm0,%xmm5
+ .byte 243,15,112,237,231 // pshufhw $0xe7,%xmm5,%xmm5
+ .byte 102,15,112,237,232 // pshufd $0xe8,%xmm5,%xmm5
+ .byte 102,15,96,237 // punpcklbw %xmm5,%xmm5
+ .byte 242,15,112,237,95 // pshuflw $0x5f,%xmm5,%xmm5
+ .byte 243,68,15,112,221,95 // pshufhw $0x5f,%xmm5,%xmm11
+ .byte 102,69,15,239,192 // pxor %xmm8,%xmm8
+ .byte 102,68,15,111,202 // movdqa %xmm2,%xmm9
+ .byte 102,15,252,194 // paddb %xmm2,%xmm0
.byte 102,15,111,242 // movdqa %xmm2,%xmm6
- .byte 102,15,96,243 // punpcklbw %xmm3,%xmm6
- .byte 102,15,104,211 // punpckhbw %xmm3,%xmm2
- .byte 102,15,213,212 // pmullw %xmm4,%xmm2
- .byte 102,15,213,245 // pmullw %xmm5,%xmm6
- .byte 102,15,253,245 // paddw %xmm5,%xmm6
- .byte 102,15,253,212 // paddw %xmm4,%xmm2
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
- .byte 102,15,103,242 // packuswb %xmm2,%xmm6
- .byte 102,15,248,198 // psubb %xmm6,%xmm0
+ .byte 102,65,15,96,240 // punpcklbw %xmm8,%xmm6
+ .byte 102,69,15,104,200 // punpckhbw %xmm8,%xmm9
+ .byte 102,68,15,111,227 // movdqa %xmm3,%xmm12
+ .byte 102,15,252,203 // paddb %xmm3,%xmm1
+ .byte 102,15,111,227 // movdqa %xmm3,%xmm4
+ .byte 102,65,15,96,224 // punpcklbw %xmm8,%xmm4
+ .byte 102,69,15,104,224 // punpckhbw %xmm8,%xmm12
+ .byte 102,65,15,111,235 // movdqa %xmm11,%xmm5
+ .byte 102,65,15,96,232 // punpcklbw %xmm8,%xmm5
+ .byte 102,69,15,104,216 // punpckhbw %xmm8,%xmm11
+ .byte 102,65,15,111,250 // movdqa %xmm10,%xmm7
+ .byte 102,65,15,96,248 // punpcklbw %xmm8,%xmm7
+ .byte 102,69,15,104,208 // punpckhbw %xmm8,%xmm10
+ .byte 102,69,15,213,212 // pmullw %xmm12,%xmm10
+ .byte 102,15,213,252 // pmullw %xmm4,%xmm7
+ .byte 102,69,15,213,217 // pmullw %xmm9,%xmm11
+ .byte 102,15,213,238 // pmullw %xmm6,%xmm5
+ .byte 102,15,253,238 // paddw %xmm6,%xmm5
+ .byte 102,69,15,253,217 // paddw %xmm9,%xmm11
+ .byte 102,15,253,252 // paddw %xmm4,%xmm7
+ .byte 102,69,15,253,212 // paddw %xmm12,%xmm10
+ .byte 102,65,15,113,210,8 // psrlw $0x8,%xmm10
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,65,15,113,211,8 // psrlw $0x8,%xmm11
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,65,15,103,235 // packuswb %xmm11,%xmm5
+ .byte 102,65,15,103,250 // packuswb %xmm10,%xmm7
+ .byte 102,15,248,197 // psubb %xmm5,%xmm0
+ .byte 102,15,248,207 // psubb %xmm7,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -60598,30 +63577,51 @@ HIDDEN _sk_dstover_sse2_8bit
.globl _sk_dstover_sse2_8bit
FUNCTION(_sk_dstover_sse2_8bit)
_sk_dstover_sse2_8bit:
- .byte 102,15,111,208 // movdqa %xmm0,%xmm2
- .byte 242,15,112,193,231 // pshuflw $0xe7,%xmm1,%xmm0
- .byte 243,15,112,192,231 // pshufhw $0xe7,%xmm0,%xmm0
- .byte 102,15,112,192,232 // pshufd $0xe8,%xmm0,%xmm0
- .byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
- .byte 242,15,112,192,95 // pshuflw $0x5f,%xmm0,%xmm0
- .byte 243,15,112,216,95 // pshufhw $0x5f,%xmm0,%xmm3
- .byte 102,15,239,228 // pxor %xmm4,%xmm4
- .byte 102,15,111,234 // movdqa %xmm2,%xmm5
- .byte 102,15,96,236 // punpcklbw %xmm4,%xmm5
- .byte 102,15,111,193 // movdqa %xmm1,%xmm0
- .byte 102,15,252,194 // paddb %xmm2,%xmm0
- .byte 102,15,104,212 // punpckhbw %xmm4,%xmm2
- .byte 102,15,111,243 // movdqa %xmm3,%xmm6
- .byte 102,15,96,244 // punpcklbw %xmm4,%xmm6
- .byte 102,15,104,220 // punpckhbw %xmm4,%xmm3
- .byte 102,15,213,218 // pmullw %xmm2,%xmm3
- .byte 102,15,213,245 // pmullw %xmm5,%xmm6
- .byte 102,15,253,245 // paddw %xmm5,%xmm6
- .byte 102,15,253,218 // paddw %xmm2,%xmm3
- .byte 102,15,113,211,8 // psrlw $0x8,%xmm3
+ .byte 242,15,112,227,231 // pshuflw $0xe7,%xmm3,%xmm4
+ .byte 243,15,112,228,231 // pshufhw $0xe7,%xmm4,%xmm4
+ .byte 102,15,112,228,232 // pshufd $0xe8,%xmm4,%xmm4
+ .byte 102,15,96,228 // punpcklbw %xmm4,%xmm4
+ .byte 242,15,112,228,95 // pshuflw $0x5f,%xmm4,%xmm4
+ .byte 243,15,112,228,95 // pshufhw $0x5f,%xmm4,%xmm4
+ .byte 242,15,112,234,231 // pshuflw $0xe7,%xmm2,%xmm5
+ .byte 243,15,112,237,231 // pshufhw $0xe7,%xmm5,%xmm5
+ .byte 102,15,112,237,232 // pshufd $0xe8,%xmm5,%xmm5
+ .byte 102,15,96,237 // punpcklbw %xmm5,%xmm5
+ .byte 242,15,112,237,95 // pshuflw $0x5f,%xmm5,%xmm5
+ .byte 243,15,112,237,95 // pshufhw $0x5f,%xmm5,%xmm5
+ .byte 102,69,15,239,192 // pxor %xmm8,%xmm8
+ .byte 102,68,15,111,200 // movdqa %xmm0,%xmm9
+ .byte 102,69,15,96,200 // punpcklbw %xmm8,%xmm9
+ .byte 102,68,15,111,208 // movdqa %xmm0,%xmm10
+ .byte 102,69,15,104,208 // punpckhbw %xmm8,%xmm10
+ .byte 102,68,15,111,217 // movdqa %xmm1,%xmm11
+ .byte 102,69,15,96,216 // punpcklbw %xmm8,%xmm11
+ .byte 102,68,15,111,225 // movdqa %xmm1,%xmm12
+ .byte 102,69,15,104,224 // punpckhbw %xmm8,%xmm12
+ .byte 102,15,111,245 // movdqa %xmm5,%xmm6
+ .byte 102,65,15,96,240 // punpcklbw %xmm8,%xmm6
+ .byte 102,65,15,104,232 // punpckhbw %xmm8,%xmm5
+ .byte 102,15,111,252 // movdqa %xmm4,%xmm7
+ .byte 102,65,15,96,248 // punpcklbw %xmm8,%xmm7
+ .byte 102,65,15,104,224 // punpckhbw %xmm8,%xmm4
+ .byte 102,65,15,213,228 // pmullw %xmm12,%xmm4
+ .byte 102,65,15,213,251 // pmullw %xmm11,%xmm7
+ .byte 102,65,15,213,234 // pmullw %xmm10,%xmm5
+ .byte 102,65,15,213,241 // pmullw %xmm9,%xmm6
+ .byte 102,65,15,253,241 // paddw %xmm9,%xmm6
+ .byte 102,65,15,253,234 // paddw %xmm10,%xmm5
+ .byte 102,65,15,253,251 // paddw %xmm11,%xmm7
+ .byte 102,65,15,253,228 // paddw %xmm12,%xmm4
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
.byte 102,15,113,214,8 // psrlw $0x8,%xmm6
- .byte 102,15,103,243 // packuswb %xmm3,%xmm6
+ .byte 102,15,103,245 // packuswb %xmm5,%xmm6
+ .byte 102,15,103,252 // packuswb %xmm4,%xmm7
+ .byte 102,15,252,203 // paddb %xmm3,%xmm1
+ .byte 102,15,252,194 // paddb %xmm2,%xmm0
.byte 102,15,248,198 // psubb %xmm6,%xmm0
+ .byte 102,15,248,207 // psubb %xmm7,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -60629,168 +63629,298 @@ HIDDEN _sk_modulate_sse2_8bit
.globl _sk_modulate_sse2_8bit
FUNCTION(_sk_modulate_sse2_8bit)
_sk_modulate_sse2_8bit:
- .byte 102,15,239,219 // pxor %xmm3,%xmm3
- .byte 102,15,111,208 // movdqa %xmm0,%xmm2
- .byte 102,15,96,211 // punpcklbw %xmm3,%xmm2
- .byte 102,15,104,195 // punpckhbw %xmm3,%xmm0
- .byte 102,15,111,225 // movdqa %xmm1,%xmm4
- .byte 102,15,111,233 // movdqa %xmm1,%xmm5
- .byte 102,15,96,235 // punpcklbw %xmm3,%xmm5
- .byte 102,15,104,227 // punpckhbw %xmm3,%xmm4
+ .byte 102,68,15,111,193 // movdqa %xmm1,%xmm8
+ .byte 102,68,15,111,200 // movdqa %xmm0,%xmm9
+ .byte 102,69,15,239,210 // pxor %xmm10,%xmm10
+ .byte 102,65,15,96,194 // punpcklbw %xmm10,%xmm0
+ .byte 102,69,15,104,202 // punpckhbw %xmm10,%xmm9
+ .byte 102,65,15,96,202 // punpcklbw %xmm10,%xmm1
+ .byte 102,69,15,104,194 // punpckhbw %xmm10,%xmm8
+ .byte 102,15,111,250 // movdqa %xmm2,%xmm7
+ .byte 102,15,111,226 // movdqa %xmm2,%xmm4
+ .byte 102,65,15,96,226 // punpcklbw %xmm10,%xmm4
+ .byte 102,65,15,104,250 // punpckhbw %xmm10,%xmm7
+ .byte 102,15,111,235 // movdqa %xmm3,%xmm5
+ .byte 102,15,111,243 // movdqa %xmm3,%xmm6
+ .byte 102,65,15,96,242 // punpcklbw %xmm10,%xmm6
+ .byte 102,65,15,104,234 // punpckhbw %xmm10,%xmm5
+ .byte 102,65,15,213,232 // pmullw %xmm8,%xmm5
+ .byte 102,15,213,241 // pmullw %xmm1,%xmm6
+ .byte 102,65,15,213,249 // pmullw %xmm9,%xmm7
.byte 102,15,213,224 // pmullw %xmm0,%xmm4
- .byte 102,15,213,234 // pmullw %xmm2,%xmm5
- .byte 102,15,253,213 // paddw %xmm5,%xmm2
- .byte 102,15,253,224 // paddw %xmm0,%xmm4
- .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,212 // packuswb %xmm4,%xmm2
+ .byte 102,15,253,196 // paddw %xmm4,%xmm0
+ .byte 102,65,15,253,249 // paddw %xmm9,%xmm7
+ .byte 102,15,253,206 // paddw %xmm6,%xmm1
+ .byte 102,65,15,253,232 // paddw %xmm8,%xmm5
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,15,113,208,8 // psrlw $0x8,%xmm0
+ .byte 102,15,103,199 // packuswb %xmm7,%xmm0
+ .byte 102,15,103,205 // packuswb %xmm5,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 102,15,111,194 // movdqa %xmm2,%xmm0
.byte 255,224 // jmpq *%rax
HIDDEN _sk_multiply_sse2_8bit
.globl _sk_multiply_sse2_8bit
FUNCTION(_sk_multiply_sse2_8bit)
_sk_multiply_sse2_8bit:
- .byte 242,15,112,209,231 // pshuflw $0xe7,%xmm1,%xmm2
+ .byte 102,68,15,111,243 // movdqa %xmm3,%xmm14
+ .byte 102,15,111,218 // movdqa %xmm2,%xmm3
+ .byte 242,15,112,211,231 // pshuflw $0xe7,%xmm3,%xmm2
.byte 243,15,112,210,231 // pshufhw $0xe7,%xmm2,%xmm2
.byte 102,15,112,210,232 // pshufd $0xe8,%xmm2,%xmm2
.byte 102,15,96,210 // punpcklbw %xmm2,%xmm2
.byte 242,15,112,210,95 // pshuflw $0x5f,%xmm2,%xmm2
- .byte 243,15,112,242,95 // pshufhw $0x5f,%xmm2,%xmm6
- .byte 102,15,118,237 // pcmpeqd %xmm5,%xmm5
- .byte 102,15,239,245 // pxor %xmm5,%xmm6
- .byte 102,69,15,239,192 // pxor %xmm8,%xmm8
- .byte 102,15,111,216 // movdqa %xmm0,%xmm3
- .byte 102,65,15,96,216 // punpcklbw %xmm8,%xmm3
- .byte 242,15,112,248,231 // pshuflw $0xe7,%xmm0,%xmm7
- .byte 102,65,15,104,192 // punpckhbw %xmm8,%xmm0
- .byte 102,15,111,230 // movdqa %xmm6,%xmm4
- .byte 102,65,15,96,224 // punpcklbw %xmm8,%xmm4
- .byte 102,65,15,104,240 // punpckhbw %xmm8,%xmm6
- .byte 102,15,213,240 // pmullw %xmm0,%xmm6
- .byte 102,15,213,227 // pmullw %xmm3,%xmm4
- .byte 102,15,253,227 // paddw %xmm3,%xmm4
- .byte 102,15,253,240 // paddw %xmm0,%xmm6
- .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 243,15,112,250,95 // pshufhw $0x5f,%xmm2,%xmm7
+ .byte 242,65,15,112,214,231 // pshuflw $0xe7,%xmm14,%xmm2
+ .byte 243,15,112,210,231 // pshufhw $0xe7,%xmm2,%xmm2
+ .byte 102,15,112,210,232 // pshufd $0xe8,%xmm2,%xmm2
+ .byte 102,15,96,210 // punpcklbw %xmm2,%xmm2
+ .byte 242,15,112,210,95 // pshuflw $0x5f,%xmm2,%xmm2
+ .byte 243,15,112,226,95 // pshufhw $0x5f,%xmm2,%xmm4
+ .byte 102,69,15,118,192 // pcmpeqd %xmm8,%xmm8
+ .byte 102,65,15,239,224 // pxor %xmm8,%xmm4
+ .byte 102,65,15,239,248 // pxor %xmm8,%xmm7
+ .byte 102,69,15,239,237 // pxor %xmm13,%xmm13
+ .byte 102,68,15,111,200 // movdqa %xmm0,%xmm9
+ .byte 242,15,112,208,231 // pshuflw $0xe7,%xmm0,%xmm2
+ .byte 102,68,15,111,216 // movdqa %xmm0,%xmm11
+ .byte 102,69,15,96,221 // punpcklbw %xmm13,%xmm11
+ .byte 102,69,15,104,205 // punpckhbw %xmm13,%xmm9
+ .byte 102,68,15,111,209 // movdqa %xmm1,%xmm10
+ .byte 242,15,112,241,231 // pshuflw $0xe7,%xmm1,%xmm6
+ .byte 102,68,15,111,225 // movdqa %xmm1,%xmm12
+ .byte 102,69,15,96,229 // punpcklbw %xmm13,%xmm12
+ .byte 102,69,15,104,213 // punpckhbw %xmm13,%xmm10
+ .byte 102,68,15,111,255 // movdqa %xmm7,%xmm15
+ .byte 102,69,15,96,253 // punpcklbw %xmm13,%xmm15
+ .byte 102,65,15,104,253 // punpckhbw %xmm13,%xmm7
+ .byte 102,15,111,236 // movdqa %xmm4,%xmm5
+ .byte 102,65,15,96,237 // punpcklbw %xmm13,%xmm5
+ .byte 102,65,15,104,229 // punpckhbw %xmm13,%xmm4
+ .byte 102,65,15,213,226 // pmullw %xmm10,%xmm4
+ .byte 102,65,15,213,236 // pmullw %xmm12,%xmm5
+ .byte 102,65,15,213,249 // pmullw %xmm9,%xmm7
+ .byte 102,69,15,213,251 // pmullw %xmm11,%xmm15
+ .byte 102,69,15,253,251 // paddw %xmm11,%xmm15
+ .byte 102,65,15,253,249 // paddw %xmm9,%xmm7
+ .byte 102,65,15,253,236 // paddw %xmm12,%xmm5
+ .byte 102,65,15,253,226 // paddw %xmm10,%xmm4
.byte 102,15,113,212,8 // psrlw $0x8,%xmm4
- .byte 102,15,103,230 // packuswb %xmm6,%xmm4
- .byte 243,15,112,247,231 // pshufhw $0xe7,%xmm7,%xmm6
- .byte 102,15,112,246,232 // pshufd $0xe8,%xmm6,%xmm6
- .byte 102,15,96,246 // punpcklbw %xmm6,%xmm6
- .byte 242,15,112,246,95 // pshuflw $0x5f,%xmm6,%xmm6
- .byte 243,15,112,214,95 // pshufhw $0x5f,%xmm6,%xmm2
- .byte 102,15,239,213 // pxor %xmm5,%xmm2
- .byte 102,15,111,249 // movdqa %xmm1,%xmm7
- .byte 102,15,111,241 // movdqa %xmm1,%xmm6
- .byte 102,65,15,96,240 // punpcklbw %xmm8,%xmm6
- .byte 102,65,15,104,248 // punpckhbw %xmm8,%xmm7
- .byte 102,15,111,234 // movdqa %xmm2,%xmm5
- .byte 102,65,15,96,232 // punpcklbw %xmm8,%xmm5
- .byte 102,65,15,104,208 // punpckhbw %xmm8,%xmm2
- .byte 102,15,213,215 // pmullw %xmm7,%xmm2
- .byte 102,15,213,238 // pmullw %xmm6,%xmm5
- .byte 102,15,253,238 // paddw %xmm6,%xmm5
- .byte 102,15,253,215 // paddw %xmm7,%xmm2
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
.byte 102,15,113,213,8 // psrlw $0x8,%xmm5
- .byte 102,15,103,234 // packuswb %xmm2,%xmm5
- .byte 102,15,252,236 // paddb %xmm4,%xmm5
- .byte 102,15,213,248 // pmullw %xmm0,%xmm7
- .byte 102,15,213,243 // pmullw %xmm3,%xmm6
- .byte 102,15,253,243 // paddw %xmm3,%xmm6
- .byte 102,15,253,248 // paddw %xmm0,%xmm7
.byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,65,15,113,215,8 // psrlw $0x8,%xmm15
+ .byte 102,68,15,103,255 // packuswb %xmm7,%xmm15
+ .byte 102,15,103,236 // packuswb %xmm4,%xmm5
+ .byte 243,15,112,194,231 // pshufhw $0xe7,%xmm2,%xmm0
+ .byte 102,15,112,192,232 // pshufd $0xe8,%xmm0,%xmm0
+ .byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
+ .byte 242,15,112,192,95 // pshuflw $0x5f,%xmm0,%xmm0
+ .byte 243,15,112,192,95 // pshufhw $0x5f,%xmm0,%xmm0
+ .byte 243,15,112,206,231 // pshufhw $0xe7,%xmm6,%xmm1
+ .byte 102,15,112,201,232 // pshufd $0xe8,%xmm1,%xmm1
+ .byte 102,15,96,201 // punpcklbw %xmm1,%xmm1
+ .byte 242,15,112,201,95 // pshuflw $0x5f,%xmm1,%xmm1
+ .byte 243,15,112,241,95 // pshufhw $0x5f,%xmm1,%xmm6
+ .byte 102,65,15,239,240 // pxor %xmm8,%xmm6
+ .byte 102,65,15,239,192 // pxor %xmm8,%xmm0
+ .byte 102,68,15,111,195 // movdqa %xmm3,%xmm8
+ .byte 102,15,111,211 // movdqa %xmm3,%xmm2
+ .byte 102,65,15,96,213 // punpcklbw %xmm13,%xmm2
+ .byte 102,69,15,104,197 // punpckhbw %xmm13,%xmm8
+ .byte 102,65,15,111,206 // movdqa %xmm14,%xmm1
+ .byte 102,15,127,76,36,232 // movdqa %xmm1,-0x18(%rsp)
+ .byte 102,15,111,249 // movdqa %xmm1,%xmm7
+ .byte 102,65,15,96,253 // punpcklbw %xmm13,%xmm7
+ .byte 102,69,15,104,245 // punpckhbw %xmm13,%xmm14
+ .byte 102,15,111,224 // movdqa %xmm0,%xmm4
+ .byte 102,65,15,96,229 // punpcklbw %xmm13,%xmm4
+ .byte 102,65,15,104,197 // punpckhbw %xmm13,%xmm0
+ .byte 102,15,111,206 // movdqa %xmm6,%xmm1
+ .byte 102,65,15,96,205 // punpcklbw %xmm13,%xmm1
+ .byte 102,65,15,104,245 // punpckhbw %xmm13,%xmm6
+ .byte 102,65,15,213,192 // pmullw %xmm8,%xmm0
+ .byte 102,15,213,226 // pmullw %xmm2,%xmm4
+ .byte 102,15,253,226 // paddw %xmm2,%xmm4
+ .byte 102,65,15,253,192 // paddw %xmm8,%xmm0
+ .byte 102,15,113,208,8 // psrlw $0x8,%xmm0
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
+ .byte 102,15,103,224 // packuswb %xmm0,%xmm4
+ .byte 102,65,15,213,246 // pmullw %xmm14,%xmm6
+ .byte 102,15,213,207 // pmullw %xmm7,%xmm1
+ .byte 102,15,253,207 // paddw %xmm7,%xmm1
+ .byte 102,65,15,253,246 // paddw %xmm14,%xmm6
.byte 102,15,113,214,8 // psrlw $0x8,%xmm6
- .byte 102,15,103,247 // packuswb %xmm7,%xmm6
- .byte 102,15,252,238 // paddb %xmm6,%xmm5
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
+ .byte 102,15,103,206 // packuswb %xmm6,%xmm1
+ .byte 102,15,252,205 // paddb %xmm5,%xmm1
+ .byte 102,65,15,252,231 // paddb %xmm15,%xmm4
+ .byte 102,65,15,213,211 // pmullw %xmm11,%xmm2
+ .byte 102,65,15,253,211 // paddw %xmm11,%xmm2
+ .byte 102,69,15,213,193 // pmullw %xmm9,%xmm8
+ .byte 102,69,15,253,193 // paddw %xmm9,%xmm8
+ .byte 102,65,15,213,252 // pmullw %xmm12,%xmm7
+ .byte 102,65,15,253,252 // paddw %xmm12,%xmm7
+ .byte 102,69,15,213,242 // pmullw %xmm10,%xmm14
+ .byte 102,69,15,253,242 // paddw %xmm10,%xmm14
+ .byte 102,65,15,113,208,8 // psrlw $0x8,%xmm8
+ .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
+ .byte 102,65,15,103,208 // packuswb %xmm8,%xmm2
+ .byte 102,65,15,113,214,8 // psrlw $0x8,%xmm14
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,65,15,103,254 // packuswb %xmm14,%xmm7
+ .byte 102,15,252,226 // paddb %xmm2,%xmm4
+ .byte 102,15,252,207 // paddb %xmm7,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 102,15,111,197 // movdqa %xmm5,%xmm0
+ .byte 102,15,111,211 // movdqa %xmm3,%xmm2
+ .byte 15,40,92,36,232 // movaps -0x18(%rsp),%xmm3
+ .byte 102,15,111,196 // movdqa %xmm4,%xmm0
.byte 255,224 // jmpq *%rax
HIDDEN _sk_screen_sse2_8bit
.globl _sk_screen_sse2_8bit
FUNCTION(_sk_screen_sse2_8bit)
_sk_screen_sse2_8bit:
- .byte 102,15,118,210 // pcmpeqd %xmm2,%xmm2
- .byte 102,15,239,208 // pxor %xmm0,%xmm2
- .byte 102,15,239,219 // pxor %xmm3,%xmm3
- .byte 102,15,111,226 // movdqa %xmm2,%xmm4
- .byte 102,15,96,227 // punpcklbw %xmm3,%xmm4
- .byte 102,15,104,211 // punpckhbw %xmm3,%xmm2
- .byte 102,15,111,233 // movdqa %xmm1,%xmm5
- .byte 102,15,111,241 // movdqa %xmm1,%xmm6
- .byte 102,15,96,243 // punpcklbw %xmm3,%xmm6
- .byte 102,15,104,235 // punpckhbw %xmm3,%xmm5
- .byte 102,15,213,234 // pmullw %xmm2,%xmm5
- .byte 102,15,213,244 // pmullw %xmm4,%xmm6
- .byte 102,15,253,230 // paddw %xmm6,%xmm4
- .byte 102,15,253,234 // paddw %xmm2,%xmm5
- .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,69,15,118,219 // pcmpeqd %xmm11,%xmm11
+ .byte 102,68,15,111,201 // movdqa %xmm1,%xmm9
+ .byte 102,69,15,239,203 // pxor %xmm11,%xmm9
+ .byte 102,68,15,239,216 // pxor %xmm0,%xmm11
+ .byte 102,69,15,239,192 // pxor %xmm8,%xmm8
+ .byte 102,65,15,111,235 // movdqa %xmm11,%xmm5
+ .byte 102,65,15,96,232 // punpcklbw %xmm8,%xmm5
+ .byte 102,69,15,104,216 // punpckhbw %xmm8,%xmm11
+ .byte 102,65,15,111,225 // movdqa %xmm9,%xmm4
+ .byte 102,65,15,96,224 // punpcklbw %xmm8,%xmm4
+ .byte 102,69,15,104,200 // punpckhbw %xmm8,%xmm9
+ .byte 102,15,111,242 // movdqa %xmm2,%xmm6
+ .byte 102,68,15,111,210 // movdqa %xmm2,%xmm10
+ .byte 102,69,15,96,208 // punpcklbw %xmm8,%xmm10
+ .byte 102,65,15,104,240 // punpckhbw %xmm8,%xmm6
+ .byte 102,15,111,251 // movdqa %xmm3,%xmm7
+ .byte 102,68,15,111,227 // movdqa %xmm3,%xmm12
+ .byte 102,69,15,96,224 // punpcklbw %xmm8,%xmm12
+ .byte 102,65,15,104,248 // punpckhbw %xmm8,%xmm7
+ .byte 102,65,15,213,249 // pmullw %xmm9,%xmm7
+ .byte 102,68,15,213,228 // pmullw %xmm4,%xmm12
+ .byte 102,65,15,213,243 // pmullw %xmm11,%xmm6
+ .byte 102,68,15,213,213 // pmullw %xmm5,%xmm10
+ .byte 102,65,15,253,234 // paddw %xmm10,%xmm5
+ .byte 102,65,15,253,243 // paddw %xmm11,%xmm6
+ .byte 102,65,15,253,228 // paddw %xmm12,%xmm4
+ .byte 102,65,15,253,249 // paddw %xmm9,%xmm7
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
.byte 102,15,113,212,8 // psrlw $0x8,%xmm4
- .byte 102,15,103,229 // packuswb %xmm5,%xmm4
- .byte 102,15,252,196 // paddb %xmm4,%xmm0
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,103,238 // packuswb %xmm6,%xmm5
+ .byte 102,15,103,231 // packuswb %xmm7,%xmm4
+ .byte 102,15,252,197 // paddb %xmm5,%xmm0
+ .byte 102,15,252,225 // paddb %xmm1,%xmm4
.byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 102,15,111,204 // movdqa %xmm4,%xmm1
.byte 255,224 // jmpq *%rax
HIDDEN _sk_xor__sse2_8bit
.globl _sk_xor__sse2_8bit
FUNCTION(_sk_xor__sse2_8bit)
_sk_xor__sse2_8bit:
- .byte 242,15,112,209,231 // pshuflw $0xe7,%xmm1,%xmm2
- .byte 243,15,112,210,231 // pshufhw $0xe7,%xmm2,%xmm2
- .byte 102,15,112,210,232 // pshufd $0xe8,%xmm2,%xmm2
- .byte 102,15,96,210 // punpcklbw %xmm2,%xmm2
- .byte 242,15,112,210,95 // pshuflw $0x5f,%xmm2,%xmm2
- .byte 243,15,112,234,95 // pshufhw $0x5f,%xmm2,%xmm5
- .byte 102,15,118,228 // pcmpeqd %xmm4,%xmm4
- .byte 102,15,239,236 // pxor %xmm4,%xmm5
- .byte 102,15,239,219 // pxor %xmm3,%xmm3
- .byte 102,15,111,240 // movdqa %xmm0,%xmm6
- .byte 102,15,96,243 // punpcklbw %xmm3,%xmm6
- .byte 242,15,112,248,231 // pshuflw $0xe7,%xmm0,%xmm7
- .byte 102,15,104,195 // punpckhbw %xmm3,%xmm0
- .byte 102,15,111,213 // movdqa %xmm5,%xmm2
- .byte 102,15,96,211 // punpcklbw %xmm3,%xmm2
- .byte 102,15,104,235 // punpckhbw %xmm3,%xmm5
- .byte 102,15,213,232 // pmullw %xmm0,%xmm5
- .byte 102,15,213,214 // pmullw %xmm6,%xmm2
- .byte 102,15,253,214 // paddw %xmm6,%xmm2
- .byte 102,15,253,232 // paddw %xmm0,%xmm5
+ .byte 242,15,112,226,231 // pshuflw $0xe7,%xmm2,%xmm4
+ .byte 243,15,112,228,231 // pshufhw $0xe7,%xmm4,%xmm4
+ .byte 102,15,112,228,232 // pshufd $0xe8,%xmm4,%xmm4
+ .byte 102,15,96,228 // punpcklbw %xmm4,%xmm4
+ .byte 242,15,112,228,95 // pshuflw $0x5f,%xmm4,%xmm4
+ .byte 243,68,15,112,228,95 // pshufhw $0x5f,%xmm4,%xmm12
+ .byte 242,15,112,235,231 // pshuflw $0xe7,%xmm3,%xmm5
+ .byte 243,15,112,237,231 // pshufhw $0xe7,%xmm5,%xmm5
+ .byte 102,15,112,237,232 // pshufd $0xe8,%xmm5,%xmm5
+ .byte 102,15,96,237 // punpcklbw %xmm5,%xmm5
+ .byte 242,15,112,237,95 // pshuflw $0x5f,%xmm5,%xmm5
+ .byte 243,15,112,237,95 // pshufhw $0x5f,%xmm5,%xmm5
+ .byte 102,69,15,118,192 // pcmpeqd %xmm8,%xmm8
+ .byte 102,65,15,239,232 // pxor %xmm8,%xmm5
+ .byte 102,69,15,239,224 // pxor %xmm8,%xmm12
+ .byte 102,69,15,239,210 // pxor %xmm10,%xmm10
+ .byte 102,68,15,111,200 // movdqa %xmm0,%xmm9
+ .byte 242,68,15,112,216,231 // pshuflw $0xe7,%xmm0,%xmm11
+ .byte 102,65,15,96,194 // punpcklbw %xmm10,%xmm0
+ .byte 102,69,15,104,202 // punpckhbw %xmm10,%xmm9
+ .byte 102,15,111,225 // movdqa %xmm1,%xmm4
+ .byte 242,68,15,112,233,231 // pshuflw $0xe7,%xmm1,%xmm13
+ .byte 102,65,15,96,202 // punpcklbw %xmm10,%xmm1
+ .byte 102,65,15,104,226 // punpckhbw %xmm10,%xmm4
+ .byte 102,65,15,111,244 // movdqa %xmm12,%xmm6
+ .byte 102,65,15,96,242 // punpcklbw %xmm10,%xmm6
+ .byte 102,69,15,104,226 // punpckhbw %xmm10,%xmm12
+ .byte 102,15,111,253 // movdqa %xmm5,%xmm7
+ .byte 102,65,15,96,250 // punpcklbw %xmm10,%xmm7
+ .byte 102,65,15,104,234 // punpckhbw %xmm10,%xmm5
+ .byte 102,15,213,236 // pmullw %xmm4,%xmm5
+ .byte 102,15,213,249 // pmullw %xmm1,%xmm7
+ .byte 102,69,15,213,225 // pmullw %xmm9,%xmm12
+ .byte 102,15,213,240 // pmullw %xmm0,%xmm6
+ .byte 102,15,253,240 // paddw %xmm0,%xmm6
+ .byte 102,69,15,253,225 // paddw %xmm9,%xmm12
+ .byte 102,15,253,249 // paddw %xmm1,%xmm7
+ .byte 102,15,253,236 // paddw %xmm4,%xmm5
.byte 102,15,113,213,8 // psrlw $0x8,%xmm5
- .byte 102,15,113,210,8 // psrlw $0x8,%xmm2
- .byte 102,15,103,213 // packuswb %xmm5,%xmm2
- .byte 243,15,112,199,231 // pshufhw $0xe7,%xmm7,%xmm0
+ .byte 102,15,113,215,8 // psrlw $0x8,%xmm7
+ .byte 102,65,15,113,212,8 // psrlw $0x8,%xmm12
+ .byte 102,15,113,214,8 // psrlw $0x8,%xmm6
+ .byte 102,65,15,103,244 // packuswb %xmm12,%xmm6
+ .byte 102,15,103,253 // packuswb %xmm5,%xmm7
+ .byte 243,65,15,112,195,231 // pshufhw $0xe7,%xmm11,%xmm0
+ .byte 102,15,112,192,232 // pshufd $0xe8,%xmm0,%xmm0
+ .byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
+ .byte 242,15,112,192,95 // pshuflw $0x5f,%xmm0,%xmm0
+ .byte 243,15,112,224,95 // pshufhw $0x5f,%xmm0,%xmm4
+ .byte 243,65,15,112,197,231 // pshufhw $0xe7,%xmm13,%xmm0
.byte 102,15,112,192,232 // pshufd $0xe8,%xmm0,%xmm0
.byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
.byte 242,15,112,192,95 // pshuflw $0x5f,%xmm0,%xmm0
.byte 243,15,112,232,95 // pshufhw $0x5f,%xmm0,%xmm5
- .byte 102,15,239,236 // pxor %xmm4,%xmm5
- .byte 102,15,111,225 // movdqa %xmm1,%xmm4
- .byte 102,15,111,241 // movdqa %xmm1,%xmm6
- .byte 102,15,96,243 // punpcklbw %xmm3,%xmm6
- .byte 102,15,104,227 // punpckhbw %xmm3,%xmm4
- .byte 102,15,111,197 // movdqa %xmm5,%xmm0
- .byte 102,15,96,195 // punpcklbw %xmm3,%xmm0
- .byte 102,15,104,235 // punpckhbw %xmm3,%xmm5
- .byte 102,15,213,236 // pmullw %xmm4,%xmm5
- .byte 102,15,213,198 // pmullw %xmm6,%xmm0
- .byte 102,15,253,198 // paddw %xmm6,%xmm0
- .byte 102,15,253,236 // paddw %xmm4,%xmm5
+ .byte 102,65,15,239,232 // pxor %xmm8,%xmm5
+ .byte 102,65,15,239,224 // pxor %xmm8,%xmm4
+ .byte 102,68,15,111,194 // movdqa %xmm2,%xmm8
+ .byte 102,68,15,111,202 // movdqa %xmm2,%xmm9
+ .byte 102,69,15,96,202 // punpcklbw %xmm10,%xmm9
+ .byte 102,69,15,104,194 // punpckhbw %xmm10,%xmm8
+ .byte 102,68,15,111,219 // movdqa %xmm3,%xmm11
+ .byte 102,68,15,111,227 // movdqa %xmm3,%xmm12
+ .byte 102,69,15,96,226 // punpcklbw %xmm10,%xmm12
+ .byte 102,69,15,104,218 // punpckhbw %xmm10,%xmm11
+ .byte 102,15,111,196 // movdqa %xmm4,%xmm0
+ .byte 102,65,15,96,194 // punpcklbw %xmm10,%xmm0
+ .byte 102,65,15,104,226 // punpckhbw %xmm10,%xmm4
+ .byte 102,15,111,205 // movdqa %xmm5,%xmm1
+ .byte 102,65,15,96,202 // punpcklbw %xmm10,%xmm1
+ .byte 102,65,15,104,234 // punpckhbw %xmm10,%xmm5
+ .byte 102,65,15,213,235 // pmullw %xmm11,%xmm5
+ .byte 102,65,15,213,204 // pmullw %xmm12,%xmm1
+ .byte 102,65,15,213,224 // pmullw %xmm8,%xmm4
+ .byte 102,65,15,213,193 // pmullw %xmm9,%xmm0
+ .byte 102,65,15,253,193 // paddw %xmm9,%xmm0
+ .byte 102,65,15,253,224 // paddw %xmm8,%xmm4
+ .byte 102,65,15,253,204 // paddw %xmm12,%xmm1
+ .byte 102,65,15,253,235 // paddw %xmm11,%xmm5
.byte 102,15,113,213,8 // psrlw $0x8,%xmm5
+ .byte 102,15,113,209,8 // psrlw $0x8,%xmm1
+ .byte 102,15,113,212,8 // psrlw $0x8,%xmm4
.byte 102,15,113,208,8 // psrlw $0x8,%xmm0
- .byte 102,15,103,197 // packuswb %xmm5,%xmm0
- .byte 102,15,252,194 // paddb %xmm2,%xmm0
+ .byte 102,15,103,196 // packuswb %xmm4,%xmm0
+ .byte 102,15,103,205 // packuswb %xmm5,%xmm1
+ .byte 102,15,252,198 // paddb %xmm6,%xmm0
+ .byte 102,15,252,207 // paddb %xmm7,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
BALIGN4
.byte 0,0 // add %al,(%rax)
- .byte 127,67 // jg 1207 <_sk_xor__sse2_8bit+0x10c>
+ .byte 127,67 // jg 2307 <_sk_xor__sse2_8bit+0x1ee>
.byte 0,0 // add %al,(%rax)
- .byte 127,67 // jg 120b <_sk_xor__sse2_8bit+0x110>
+ .byte 127,67 // jg 230b <_sk_xor__sse2_8bit+0x1f2>
.byte 0,0 // add %al,(%rax)
- .byte 127,67 // jg 120f <_sk_xor__sse2_8bit+0x114>
+ .byte 127,67 // jg 230f <_sk_xor__sse2_8bit+0x1f6>
BALIGN16
.byte 0,0 // add %al,(%rax)
@@ -60810,21 +63940,37 @@ BALIGN16
.byte 0,0 // add %al,(%rax)
.byte 0,255 // add %bh,%bh
.byte 255,0 // incl (%rax)
- .byte 0,0 // add %al,(%rax)
.byte 255,0 // incl (%rax)
- .byte 0,0 // add %al,(%rax)
.byte 255,0 // incl (%rax)
- .byte 0,0 // add %al,(%rax)
.byte 255,0 // incl (%rax)
- .byte 0,0 // add %al,(%rax)
.byte 255,0 // incl (%rax)
- .byte 0,0 // add %al,(%rax)
.byte 255,0 // incl (%rax)
- .byte 0,0 // add %al,(%rax)
.byte 255,0 // incl (%rax)
- .byte 0,0 // add %al,(%rax)
.byte 255,0 // incl (%rax)
- .byte 0,0 // add %al,(%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
.byte 1,1 // add %eax,(%rcx)
.byte 1,0 // add %eax,(%rax)
.byte 1,1 // add %eax,(%rcx)
@@ -60842,13 +63988,13 @@ BALIGN16
.byte 0,0 // add %al,(%rax)
.byte 0,255 // add %bh,%bh
.byte 255,0 // incl (%rax)
- .byte 0,0 // add %al,(%rax)
.byte 255,0 // incl (%rax)
- .byte 0,0 // add %al,(%rax)
.byte 255,0 // incl (%rax)
- .byte 0,0 // add %al,(%rax)
.byte 255,0 // incl (%rax)
- .byte 0,0 // add %al,(%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
.byte 1,1 // add %eax,(%rcx)
.byte 1,0 // add %eax,(%rax)
.byte 1,1 // add %eax,(%rcx)
@@ -60881,6 +64027,22 @@ BALIGN16
.byte 255,0 // incl (%rax)
.byte 255,0 // incl (%rax)
.byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
+ .byte 255,0 // incl (%rax)
.byte 0,0 // add %al,(%rax)
.byte 0,255 // add %bh,%bh
.byte 0,0 // add %al,(%rax)
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index 4937e80125..ef9b9c94dd 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -38480,7 +38480,7 @@ _sk_start_pipeline_hsw_8bit LABEL PROC
DB 77,57,207 ; cmp %r9,%r15
DB 15,131,138,0,0,0 ; jae 10d <_sk_start_pipeline_hsw_8bit+0x10d>
DB 72,139,133,24,255,255,255 ; mov -0xe8(%rbp),%rax
- DB 72,141,64,8 ; lea 0x8(%rax),%rax
+ DB 72,141,64,16 ; lea 0x10(%rax),%rax
DB 72,137,133,248,254,255,255 ; mov %rax,-0x108(%rbp)
DB 76,141,165,0,255,255,255 ; lea -0x100(%rbp),%r12
DB 72,139,133,24,255,255,255 ; mov -0xe8(%rbp),%rax
@@ -38493,9 +38493,9 @@ _sk_start_pipeline_hsw_8bit LABEL PROC
DB 76,137,246 ; mov %r14,%rsi
DB 65,255,213 ; callq *%r13
DB 72,139,141,0,255,255,255 ; mov -0x100(%rbp),%rcx
- DB 72,141,65,8 ; lea 0x8(%rcx),%rax
+ DB 72,141,65,16 ; lea 0x10(%rcx),%rax
DB 72,137,133,0,255,255,255 ; mov %rax,-0x100(%rbp)
- DB 72,131,193,16 ; add $0x10,%rcx
+ DB 72,131,193,32 ; add $0x20,%rcx
DB 72,57,217 ; cmp %rbx,%rcx
DB 118,220 ; jbe c5 <_sk_start_pipeline_hsw_8bit+0xc5>
DB 72,137,217 ; mov %rbx,%rcx
@@ -38539,222 +38539,720 @@ _sk_uniform_color_hsw_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
DB 196,226,125,24,64,16 ; vbroadcastss 0x10(%rax),%ymm0
DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,252,40,200 ; vmovaps %ymm0,%ymm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_set_rgb_hsw_8bit
_sk_set_rgb_hsw_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 197,250,16,21,57,18,0,0 ; vmovss 0x1239(%rip),%xmm2 # 13b8 <_sk_xor__hsw_8bit+0xbd>
- DB 197,234,89,24 ; vmulss (%rax),%xmm2,%xmm3
- DB 196,225,250,44,203 ; vcvttss2si %xmm3,%rcx
- DB 197,234,89,88,4 ; vmulss 0x4(%rax),%xmm2,%xmm3
- DB 196,225,250,44,211 ; vcvttss2si %xmm3,%rdx
+ DB 197,250,16,37,65,39,0,0 ; vmovss 0x2741(%rip),%xmm4 # 28c4 <_sk_xor__hsw_8bit+0x173>
+ DB 197,218,89,40 ; vmulss (%rax),%xmm4,%xmm5
+ DB 196,225,250,44,205 ; vcvttss2si %xmm5,%rcx
+ DB 197,218,89,104,4 ; vmulss 0x4(%rax),%xmm4,%xmm5
+ DB 196,225,250,44,213 ; vcvttss2si %xmm5,%rdx
DB 193,226,8 ; shl $0x8,%edx
DB 9,202 ; or %ecx,%edx
- DB 197,234,89,80,8 ; vmulss 0x8(%rax),%xmm2,%xmm2
- DB 196,225,250,44,194 ; vcvttss2si %xmm2,%rax
+ DB 197,218,89,96,8 ; vmulss 0x8(%rax),%xmm4,%xmm4
+ DB 196,225,250,44,196 ; vcvttss2si %xmm4,%rax
DB 193,224,16 ; shl $0x10,%eax
DB 9,208 ; or %edx,%eax
- DB 197,249,110,208 ; vmovd %eax,%xmm2
- DB 196,226,125,88,210 ; vpbroadcastd %xmm2,%ymm2
- DB 197,253,219,5,41,18,0,0 ; vpand 0x1229(%rip),%ymm0,%ymm0 # 13e0 <_sk_xor__hsw_8bit+0xe5>
- DB 197,237,235,192 ; vpor %ymm0,%ymm2,%ymm0
+ DB 197,249,110,224 ; vmovd %eax,%xmm4
+ DB 196,226,125,88,228 ; vpbroadcastd %xmm4,%ymm4
+ DB 197,253,111,45,69,39,0,0 ; vmovdqa 0x2745(%rip),%ymm5 # 2900 <_sk_xor__hsw_8bit+0x1af>
+ DB 197,245,219,205 ; vpand %ymm5,%ymm1,%ymm1
+ DB 197,253,219,197 ; vpand %ymm5,%ymm0,%ymm0
+ DB 197,221,235,192 ; vpor %ymm0,%ymm4,%ymm0
+ DB 197,221,235,201 ; vpor %ymm1,%ymm4,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_premul_hsw_8bit
_sk_premul_hsw_8bit LABEL PROC
- DB 196,226,125,0,21,56,18,0,0 ; vpshufb 0x1238(%rip),%ymm0,%ymm2 # 1400 <_sk_xor__hsw_8bit+0x105>
- DB 197,237,235,21,80,18,0,0 ; vpor 0x1250(%rip),%ymm2,%ymm2 # 1420 <_sk_xor__hsw_8bit+0x125>
- DB 196,226,125,48,216 ; vpmovzxbw %xmm0,%ymm3
+ DB 197,253,111,37,73,39,0,0 ; vmovdqa 0x2749(%rip),%ymm4 # 2920 <_sk_xor__hsw_8bit+0x1cf>
+ DB 196,226,125,0,236 ; vpshufb %ymm4,%ymm0,%ymm5
+ DB 196,226,117,0,228 ; vpshufb %ymm4,%ymm1,%ymm4
+ DB 197,253,111,53,87,39,0,0 ; vmovdqa 0x2757(%rip),%ymm6 # 2940 <_sk_xor__hsw_8bit+0x1ef>
+ DB 197,221,235,230 ; vpor %ymm6,%ymm4,%ymm4
+ DB 197,213,235,238 ; vpor %ymm6,%ymm5,%ymm5
+ DB 196,226,125,48,240 ; vpmovzxbw %xmm0,%ymm6
DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0
DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0
- DB 196,226,125,48,226 ; vpmovzxbw %xmm2,%ymm4
- DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2
- DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2
- DB 197,237,213,208 ; vpmullw %ymm0,%ymm2,%ymm2
- DB 197,221,213,227 ; vpmullw %ymm3,%ymm4,%ymm4
- DB 197,221,253,219 ; vpaddw %ymm3,%ymm4,%ymm3
- DB 197,237,253,192 ; vpaddw %ymm0,%ymm2,%ymm0
+ DB 196,226,125,48,249 ; vpmovzxbw %xmm1,%ymm7
+ DB 196,227,125,57,201,1 ; vextracti128 $0x1,%ymm1,%xmm1
+ DB 196,226,125,48,201 ; vpmovzxbw %xmm1,%ymm1
+ DB 196,98,125,48,197 ; vpmovzxbw %xmm5,%ymm8
+ DB 196,227,125,57,237,1 ; vextracti128 $0x1,%ymm5,%xmm5
+ DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5
+ DB 196,98,125,48,204 ; vpmovzxbw %xmm4,%ymm9
+ DB 196,227,125,57,228,1 ; vextracti128 $0x1,%ymm4,%xmm4
+ DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
+ DB 197,221,213,225 ; vpmullw %ymm1,%ymm4,%ymm4
+ DB 197,53,213,207 ; vpmullw %ymm7,%ymm9,%ymm9
+ DB 197,213,213,232 ; vpmullw %ymm0,%ymm5,%ymm5
+ DB 197,61,213,198 ; vpmullw %ymm6,%ymm8,%ymm8
+ DB 197,189,253,246 ; vpaddw %ymm6,%ymm8,%ymm6
+ DB 197,213,253,192 ; vpaddw %ymm0,%ymm5,%ymm0
+ DB 197,181,253,239 ; vpaddw %ymm7,%ymm9,%ymm5
+ DB 197,221,253,201 ; vpaddw %ymm1,%ymm4,%ymm1
+ DB 197,245,113,209,8 ; vpsrlw $0x8,%ymm1,%ymm1
+ DB 197,221,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm4
DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0
- DB 197,237,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm2
- DB 196,227,109,56,216,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm3
- DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- DB 197,229,103,192 ; vpackuswb %ymm0,%ymm3,%ymm0
+ DB 197,213,113,214,8 ; vpsrlw $0x8,%ymm6,%ymm5
+ DB 196,227,85,56,240,1 ; vinserti128 $0x1,%xmm0,%ymm5,%ymm6
+ DB 196,227,85,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm5,%ymm0
+ DB 197,205,103,192 ; vpackuswb %ymm0,%ymm6,%ymm0
+ DB 196,227,93,56,233,1 ; vinserti128 $0x1,%xmm1,%ymm4,%ymm5
+ DB 196,227,93,70,201,49 ; vperm2i128 $0x31,%ymm1,%ymm4,%ymm1
+ DB 197,213,103,201 ; vpackuswb %ymm1,%ymm5,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_swap_rb_hsw_8bit
_sk_swap_rb_hsw_8bit LABEL PROC
- DB 196,226,125,0,5,25,18,0,0 ; vpshufb 0x1219(%rip),%ymm0,%ymm0 # 1440 <_sk_xor__hsw_8bit+0x145>
+ DB 197,253,111,37,207,38,0,0 ; vmovdqa 0x26cf(%rip),%ymm4 # 2960 <_sk_xor__hsw_8bit+0x20f>
+ DB 196,226,125,0,196 ; vpshufb %ymm4,%ymm0,%ymm0
+ DB 196,226,117,0,204 ; vpshufb %ymm4,%ymm1,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_invert_hsw_8bit
_sk_invert_hsw_8bit LABEL PROC
- DB 197,237,118,210 ; vpcmpeqd %ymm2,%ymm2,%ymm2
- DB 197,253,239,194 ; vpxor %ymm2,%ymm0,%ymm0
+ DB 197,221,118,228 ; vpcmpeqd %ymm4,%ymm4,%ymm4
+ DB 197,253,239,196 ; vpxor %ymm4,%ymm0,%ymm0
+ DB 197,245,239,204 ; vpxor %ymm4,%ymm1,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_load_8888_hsw_8bit
_sk_load_8888_hsw_8bit LABEL PROC
- DB 76,99,15 ; movslq (%rdi),%r9
- DB 76,139,71,16 ; mov 0x10(%rdi),%r8
+ DB 76,99,7 ; movslq (%rdi),%r8
+ DB 76,139,79,16 ; mov 0x10(%rdi),%r9
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 72,99,80,8 ; movslq 0x8(%rax),%rdx
- DB 72,99,79,8 ; movslq 0x8(%rdi),%rcx
- DB 72,15,175,202 ; imul %rdx,%rcx
- DB 72,193,225,2 ; shl $0x2,%rcx
- DB 72,3,8 ; add (%rax),%rcx
- DB 74,141,4,137 ; lea (%rcx,%r9,4),%rax
- DB 77,133,192 ; test %r8,%r8
- DB 117,8 ; jne 264 <_sk_load_8888_hsw_8bit+0x2d>
- DB 197,254,111,0 ; vmovdqu (%rax),%ymm0
+ DB 72,99,72,8 ; movslq 0x8(%rax),%rcx
+ DB 72,99,87,8 ; movslq 0x8(%rdi),%rdx
+ DB 72,15,175,209 ; imul %rcx,%rdx
+ DB 72,193,226,2 ; shl $0x2,%rdx
+ DB 72,3,16 ; add (%rax),%rdx
+ DB 77,133,201 ; test %r9,%r9
+ DB 117,17 ; jne 2e1 <_sk_load_8888_hsw_8bit+0x32>
+ DB 196,161,126,111,76,130,32 ; vmovdqu 0x20(%rdx,%r8,4),%ymm1
+ DB 196,161,126,111,4,130 ; vmovdqu (%rdx,%r8,4),%ymm0
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 185,8,0,0,0 ; mov $0x8,%ecx
- DB 68,41,193 ; sub %r8d,%ecx
- DB 192,225,3 ; shl $0x3,%cl
- DB 72,199,194,255,255,255,255 ; mov $0xffffffffffffffff,%rdx
- DB 72,211,234 ; shr %cl,%rdx
- DB 196,225,249,110,194 ; vmovq %rdx,%xmm0
- DB 196,226,125,33,192 ; vpmovsxbd %xmm0,%ymm0
- DB 196,226,125,140,0 ; vpmaskmovd (%rax),%ymm0,%ymm0
- DB 235,214 ; jmp 260 <_sk_load_8888_hsw_8bit+0x29>
+ DB 65,128,225,15 ; and $0xf,%r9b
+ DB 197,245,239,201 ; vpxor %ymm1,%ymm1,%ymm1
+ DB 197,253,239,192 ; vpxor %ymm0,%ymm0,%ymm0
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 119,231 ; ja 2dd <_sk_load_8888_hsw_8bit+0x2e>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,23,1,0,0 ; lea 0x117(%rip),%rcx # 418 <_sk_load_8888_hsw_8bit+0x169>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 196,161,121,110,4,130 ; vmovd (%rdx,%r8,4),%xmm0
+ DB 235,203 ; jmp 2dd <_sk_load_8888_hsw_8bit+0x2e>
+ DB 196,161,121,110,68,130,8 ; vmovd 0x8(%rdx,%r8,4),%xmm0
+ DB 196,226,121,89,192 ; vpbroadcastq %xmm0,%xmm0
+ DB 197,245,239,201 ; vpxor %ymm1,%ymm1,%ymm1
+ DB 196,227,117,2,192,4 ; vpblendd $0x4,%ymm0,%ymm1,%ymm0
+ DB 196,162,121,53,36,130 ; vpmovzxdq (%rdx,%r8,4),%xmm4
+ DB 197,249,112,228,232 ; vpshufd $0xe8,%xmm4,%xmm4
+ DB 196,227,125,2,196,3 ; vpblendd $0x3,%ymm4,%ymm0,%ymm0
+ DB 235,162 ; jmp 2dd <_sk_load_8888_hsw_8bit+0x2e>
+ DB 196,161,121,110,68,130,24 ; vmovd 0x18(%rdx,%r8,4),%xmm0
+ DB 196,226,125,89,192 ; vpbroadcastq %xmm0,%ymm0
+ DB 197,245,239,201 ; vpxor %ymm1,%ymm1,%ymm1
+ DB 196,227,117,2,192,64 ; vpblendd $0x40,%ymm0,%ymm1,%ymm0
+ DB 196,227,125,57,196,1 ; vextracti128 $0x1,%ymm0,%xmm4
+ DB 196,163,89,34,100,130,20,1 ; vpinsrd $0x1,0x14(%rdx,%r8,4),%xmm4,%xmm4
+ DB 196,227,125,56,196,1 ; vinserti128 $0x1,%xmm4,%ymm0,%ymm0
+ DB 196,227,125,57,196,1 ; vextracti128 $0x1,%ymm0,%xmm4
+ DB 196,163,89,34,100,130,16,0 ; vpinsrd $0x0,0x10(%rdx,%r8,4),%xmm4,%xmm4
+ DB 196,227,125,56,196,1 ; vinserti128 $0x1,%xmm4,%ymm0,%ymm0
+ DB 196,161,122,111,36,130 ; vmovdqu (%rdx,%r8,4),%xmm4
+ DB 196,227,93,2,192,240 ; vpblendd $0xf0,%ymm0,%ymm4,%ymm0
+ DB 233,83,255,255,255 ; jmpq 2dd <_sk_load_8888_hsw_8bit+0x2e>
+ DB 196,161,121,110,68,130,40 ; vmovd 0x28(%rdx,%r8,4),%xmm0
+ DB 196,226,121,89,192 ; vpbroadcastq %xmm0,%xmm0
+ DB 197,245,239,201 ; vpxor %ymm1,%ymm1,%ymm1
+ DB 196,227,117,2,200,4 ; vpblendd $0x4,%ymm0,%ymm1,%ymm1
+ DB 196,163,113,34,68,130,36,1 ; vpinsrd $0x1,0x24(%rdx,%r8,4),%xmm1,%xmm0
+ DB 196,227,117,2,200,15 ; vpblendd $0xf,%ymm0,%ymm1,%ymm1
+ DB 196,161,121,110,68,130,32 ; vmovd 0x20(%rdx,%r8,4),%xmm0
+ DB 196,227,117,2,200,1 ; vpblendd $0x1,%ymm0,%ymm1,%ymm1
+ DB 233,23,255,255,255 ; jmpq 2d7 <_sk_load_8888_hsw_8bit+0x28>
+ DB 196,161,121,110,68,130,56 ; vmovd 0x38(%rdx,%r8,4),%xmm0
+ DB 196,226,125,89,192 ; vpbroadcastq %xmm0,%ymm0
+ DB 197,245,239,201 ; vpxor %ymm1,%ymm1,%ymm1
+ DB 196,227,117,2,200,64 ; vpblendd $0x40,%ymm0,%ymm1,%ymm1
+ DB 196,227,125,57,200,1 ; vextracti128 $0x1,%ymm1,%xmm0
+ DB 196,163,121,34,68,130,52,1 ; vpinsrd $0x1,0x34(%rdx,%r8,4),%xmm0,%xmm0
+ DB 196,227,117,56,200,1 ; vinserti128 $0x1,%xmm0,%ymm1,%ymm1
+ DB 196,227,125,57,200,1 ; vextracti128 $0x1,%ymm1,%xmm0
+ DB 196,163,121,34,68,130,48,0 ; vpinsrd $0x0,0x30(%rdx,%r8,4),%xmm0,%xmm0
+ DB 196,227,117,56,200,1 ; vinserti128 $0x1,%xmm0,%ymm1,%ymm1
+ DB 196,161,126,111,4,130 ; vmovdqu (%rdx,%r8,4),%ymm0
+ DB 196,161,122,111,100,130,32 ; vmovdqu 0x20(%rdx,%r8,4),%xmm4
+ DB 196,227,93,2,201,240 ; vpblendd $0xf0,%ymm1,%ymm4,%ymm1
+ DB 233,199,254,255,255 ; jmpq 2dd <_sk_load_8888_hsw_8bit+0x2e>
+ DB 102,144 ; xchg %ax,%ax
+ DB 242,254 ; repnz (bad)
+ DB 255 ; (bad)
+ DB 255,16 ; callq *(%rax)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 250 ; cli
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,97,255 ; jmpq *-0x1(%rcx)
+ DB 255 ; (bad)
+ DB 255,77,255 ; decl -0x1(%rbp)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 57,255 ; cmp %edi,%edi
+ DB 255 ; (bad)
+ DB 255,35 ; jmpq *(%rbx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 191,254,255,255,150 ; mov $0x96fffffe,%edi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,136,255,255,255,114 ; decl 0x72ffffff(%rax)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,230 ; jmpq *%rsi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,210 ; callq *%rdx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 190,255,255,255,168 ; mov $0xa8ffffff,%esi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_load_8888_dst_hsw_8bit
_sk_load_8888_dst_hsw_8bit LABEL PROC
- DB 76,99,15 ; movslq (%rdi),%r9
- DB 76,139,71,16 ; mov 0x10(%rdi),%r8
+ DB 76,99,7 ; movslq (%rdi),%r8
+ DB 76,139,79,16 ; mov 0x10(%rdi),%r9
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 72,99,80,8 ; movslq 0x8(%rax),%rdx
- DB 72,99,79,8 ; movslq 0x8(%rdi),%rcx
- DB 72,15,175,202 ; imul %rdx,%rcx
- DB 72,193,225,2 ; shl $0x2,%rcx
- DB 72,3,8 ; add (%rax),%rcx
- DB 74,141,4,137 ; lea (%rcx,%r9,4),%rax
- DB 77,133,192 ; test %r8,%r8
- DB 117,8 ; jne 2b7 <_sk_load_8888_dst_hsw_8bit+0x2d>
- DB 197,254,111,8 ; vmovdqu (%rax),%ymm1
+ DB 72,99,72,8 ; movslq 0x8(%rax),%rcx
+ DB 72,99,87,8 ; movslq 0x8(%rdi),%rdx
+ DB 72,15,175,209 ; imul %rcx,%rdx
+ DB 72,193,226,2 ; shl $0x2,%rdx
+ DB 72,3,16 ; add (%rax),%rdx
+ DB 77,133,201 ; test %r9,%r9
+ DB 117,17 ; jne 486 <_sk_load_8888_dst_hsw_8bit+0x32>
+ DB 196,161,126,111,92,130,32 ; vmovdqu 0x20(%rdx,%r8,4),%ymm3
+ DB 196,161,126,111,20,130 ; vmovdqu (%rdx,%r8,4),%ymm2
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 185,8,0,0,0 ; mov $0x8,%ecx
- DB 68,41,193 ; sub %r8d,%ecx
- DB 192,225,3 ; shl $0x3,%cl
- DB 72,199,194,255,255,255,255 ; mov $0xffffffffffffffff,%rdx
- DB 72,211,234 ; shr %cl,%rdx
- DB 196,225,249,110,202 ; vmovq %rdx,%xmm1
- DB 196,226,125,33,201 ; vpmovsxbd %xmm1,%ymm1
- DB 196,226,117,140,8 ; vpmaskmovd (%rax),%ymm1,%ymm1
- DB 235,214 ; jmp 2b3 <_sk_load_8888_dst_hsw_8bit+0x29>
+ DB 65,128,225,15 ; and $0xf,%r9b
+ DB 197,229,239,219 ; vpxor %ymm3,%ymm3,%ymm3
+ DB 197,237,239,210 ; vpxor %ymm2,%ymm2,%ymm2
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 119,231 ; ja 482 <_sk_load_8888_dst_hsw_8bit+0x2e>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,22,1,0,0 ; lea 0x116(%rip),%rcx # 5bc <_sk_load_8888_dst_hsw_8bit+0x168>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 196,161,121,110,20,130 ; vmovd (%rdx,%r8,4),%xmm2
+ DB 235,203 ; jmp 482 <_sk_load_8888_dst_hsw_8bit+0x2e>
+ DB 196,161,121,110,84,130,8 ; vmovd 0x8(%rdx,%r8,4),%xmm2
+ DB 196,226,121,89,210 ; vpbroadcastq %xmm2,%xmm2
+ DB 197,229,239,219 ; vpxor %ymm3,%ymm3,%ymm3
+ DB 196,227,101,2,210,4 ; vpblendd $0x4,%ymm2,%ymm3,%ymm2
+ DB 196,162,121,53,36,130 ; vpmovzxdq (%rdx,%r8,4),%xmm4
+ DB 197,249,112,228,232 ; vpshufd $0xe8,%xmm4,%xmm4
+ DB 196,227,109,2,212,3 ; vpblendd $0x3,%ymm4,%ymm2,%ymm2
+ DB 235,162 ; jmp 482 <_sk_load_8888_dst_hsw_8bit+0x2e>
+ DB 196,161,121,110,84,130,24 ; vmovd 0x18(%rdx,%r8,4),%xmm2
+ DB 196,226,125,89,210 ; vpbroadcastq %xmm2,%ymm2
+ DB 197,229,239,219 ; vpxor %ymm3,%ymm3,%ymm3
+ DB 196,227,101,2,210,64 ; vpblendd $0x40,%ymm2,%ymm3,%ymm2
+ DB 196,227,125,57,212,1 ; vextracti128 $0x1,%ymm2,%xmm4
+ DB 196,163,89,34,100,130,20,1 ; vpinsrd $0x1,0x14(%rdx,%r8,4),%xmm4,%xmm4
+ DB 196,227,109,56,212,1 ; vinserti128 $0x1,%xmm4,%ymm2,%ymm2
+ DB 196,227,125,57,212,1 ; vextracti128 $0x1,%ymm2,%xmm4
+ DB 196,163,89,34,100,130,16,0 ; vpinsrd $0x0,0x10(%rdx,%r8,4),%xmm4,%xmm4
+ DB 196,227,109,56,212,1 ; vinserti128 $0x1,%xmm4,%ymm2,%ymm2
+ DB 196,161,122,111,36,130 ; vmovdqu (%rdx,%r8,4),%xmm4
+ DB 196,227,93,2,210,240 ; vpblendd $0xf0,%ymm2,%ymm4,%ymm2
+ DB 233,83,255,255,255 ; jmpq 482 <_sk_load_8888_dst_hsw_8bit+0x2e>
+ DB 196,161,121,110,84,130,40 ; vmovd 0x28(%rdx,%r8,4),%xmm2
+ DB 196,226,121,89,210 ; vpbroadcastq %xmm2,%xmm2
+ DB 197,229,239,219 ; vpxor %ymm3,%ymm3,%ymm3
+ DB 196,227,101,2,218,4 ; vpblendd $0x4,%ymm2,%ymm3,%ymm3
+ DB 196,163,97,34,84,130,36,1 ; vpinsrd $0x1,0x24(%rdx,%r8,4),%xmm3,%xmm2
+ DB 196,227,101,2,218,15 ; vpblendd $0xf,%ymm2,%ymm3,%ymm3
+ DB 196,161,121,110,84,130,32 ; vmovd 0x20(%rdx,%r8,4),%xmm2
+ DB 196,227,101,2,218,1 ; vpblendd $0x1,%ymm2,%ymm3,%ymm3
+ DB 233,23,255,255,255 ; jmpq 47c <_sk_load_8888_dst_hsw_8bit+0x28>
+ DB 196,161,121,110,84,130,56 ; vmovd 0x38(%rdx,%r8,4),%xmm2
+ DB 196,226,125,89,210 ; vpbroadcastq %xmm2,%ymm2
+ DB 197,229,239,219 ; vpxor %ymm3,%ymm3,%ymm3
+ DB 196,227,101,2,218,64 ; vpblendd $0x40,%ymm2,%ymm3,%ymm3
+ DB 196,227,125,57,218,1 ; vextracti128 $0x1,%ymm3,%xmm2
+ DB 196,163,105,34,84,130,52,1 ; vpinsrd $0x1,0x34(%rdx,%r8,4),%xmm2,%xmm2
+ DB 196,227,101,56,218,1 ; vinserti128 $0x1,%xmm2,%ymm3,%ymm3
+ DB 196,227,125,57,218,1 ; vextracti128 $0x1,%ymm3,%xmm2
+ DB 196,163,105,34,84,130,48,0 ; vpinsrd $0x0,0x30(%rdx,%r8,4),%xmm2,%xmm2
+ DB 196,227,101,56,218,1 ; vinserti128 $0x1,%xmm2,%ymm3,%ymm3
+ DB 196,161,126,111,20,130 ; vmovdqu (%rdx,%r8,4),%ymm2
+ DB 196,161,122,111,100,130,32 ; vmovdqu 0x20(%rdx,%r8,4),%xmm4
+ DB 196,227,93,2,219,240 ; vpblendd $0xf0,%ymm3,%ymm4,%ymm3
+ DB 233,199,254,255,255 ; jmpq 482 <_sk_load_8888_dst_hsw_8bit+0x2e>
+ DB 144 ; nop
+ DB 243,254 ; repz (bad)
+ DB 255 ; (bad)
+ DB 255,17 ; callq *(%rcx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 251 ; sti
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,98,255 ; jmpq *-0x1(%rdx)
+ DB 255 ; (bad)
+ DB 255,78,255 ; decl -0x1(%rsi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 58,255 ; cmp %bh,%bh
+ DB 255 ; (bad)
+ DB 255,36,255 ; jmpq *(%rdi,%rdi,8)
+ DB 255 ; (bad)
+ DB 255,192 ; inc %eax
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,151,255,255,255,137 ; callq *-0x76000001(%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,115,255 ; pushq -0x1(%rbx)
+ DB 255 ; (bad)
+ DB 255,231 ; jmpq *%rdi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,211 ; callq *%rbx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 191,255,255,255,169 ; mov $0xa9ffffff,%edi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_store_8888_hsw_8bit
_sk_store_8888_hsw_8bit LABEL PROC
- DB 76,99,15 ; movslq (%rdi),%r9
- DB 76,139,71,16 ; mov 0x10(%rdi),%r8
+ DB 76,99,7 ; movslq (%rdi),%r8
+ DB 76,139,79,16 ; mov 0x10(%rdi),%r9
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 72,99,80,8 ; movslq 0x8(%rax),%rdx
- DB 72,99,79,8 ; movslq 0x8(%rdi),%rcx
- DB 72,15,175,202 ; imul %rdx,%rcx
- DB 72,193,225,2 ; shl $0x2,%rcx
- DB 72,3,8 ; add (%rax),%rcx
- DB 74,141,4,137 ; lea (%rcx,%r9,4),%rax
- DB 77,133,192 ; test %r8,%r8
- DB 117,8 ; jne 30a <_sk_store_8888_hsw_8bit+0x2d>
- DB 197,254,127,0 ; vmovdqu %ymm0,(%rax)
+ DB 72,99,72,8 ; movslq 0x8(%rax),%rcx
+ DB 72,99,87,8 ; movslq 0x8(%rdi),%rdx
+ DB 72,15,175,209 ; imul %rcx,%rdx
+ DB 72,193,226,2 ; shl $0x2,%rdx
+ DB 72,3,16 ; add (%rax),%rdx
+ DB 77,133,201 ; test %r9,%r9
+ DB 117,17 ; jne 62a <_sk_store_8888_hsw_8bit+0x32>
+ DB 196,161,126,127,4,130 ; vmovdqu %ymm0,(%rdx,%r8,4)
+ DB 196,161,126,127,76,130,32 ; vmovdqu %ymm1,0x20(%rdx,%r8,4)
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 185,8,0,0,0 ; mov $0x8,%ecx
- DB 68,41,193 ; sub %r8d,%ecx
- DB 192,225,3 ; shl $0x3,%cl
- DB 72,199,194,255,255,255,255 ; mov $0xffffffffffffffff,%rdx
- DB 72,211,234 ; shr %cl,%rdx
- DB 196,225,249,110,210 ; vmovq %rdx,%xmm2
- DB 196,226,125,33,210 ; vpmovsxbd %xmm2,%ymm2
- DB 196,226,109,142,0 ; vpmaskmovd %ymm0,%ymm2,(%rax)
- DB 235,214 ; jmp 306 <_sk_store_8888_hsw_8bit+0x29>
+ DB 65,128,225,15 ; and $0xf,%r9b
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 119,239 ; ja 626 <_sk_store_8888_hsw_8bit+0x2e>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,178,0,0,0 ; lea 0xb2(%rip),%rcx # 6f4 <_sk_store_8888_hsw_8bit+0xfc>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 196,161,121,126,4,130 ; vmovd %xmm0,(%rdx,%r8,4)
+ DB 235,211 ; jmp 626 <_sk_store_8888_hsw_8bit+0x2e>
+ DB 196,163,121,22,68,130,8,2 ; vpextrd $0x2,%xmm0,0x8(%rdx,%r8,4)
+ DB 196,161,121,214,4,130 ; vmovq %xmm0,(%rdx,%r8,4)
+ DB 235,195 ; jmp 626 <_sk_store_8888_hsw_8bit+0x2e>
+ DB 196,227,125,57,196,1 ; vextracti128 $0x1,%ymm0,%xmm4
+ DB 196,163,121,22,100,130,24,2 ; vpextrd $0x2,%xmm4,0x18(%rdx,%r8,4)
+ DB 196,227,125,57,196,1 ; vextracti128 $0x1,%ymm0,%xmm4
+ DB 196,163,121,22,100,130,20,1 ; vpextrd $0x1,%xmm4,0x14(%rdx,%r8,4)
+ DB 196,227,125,57,196,1 ; vextracti128 $0x1,%ymm0,%xmm4
+ DB 196,161,121,126,100,130,16 ; vmovd %xmm4,0x10(%rdx,%r8,4)
+ DB 196,161,122,127,4,130 ; vmovdqu %xmm0,(%rdx,%r8,4)
+ DB 235,146 ; jmp 626 <_sk_store_8888_hsw_8bit+0x2e>
+ DB 196,163,121,22,76,130,40,2 ; vpextrd $0x2,%xmm1,0x28(%rdx,%r8,4)
+ DB 196,163,121,22,76,130,36,1 ; vpextrd $0x1,%xmm1,0x24(%rdx,%r8,4)
+ DB 196,161,121,126,76,130,32 ; vmovd %xmm1,0x20(%rdx,%r8,4)
+ DB 196,161,126,127,4,130 ; vmovdqu %ymm0,(%rdx,%r8,4)
+ DB 233,112,255,255,255 ; jmpq 626 <_sk_store_8888_hsw_8bit+0x2e>
+ DB 196,227,125,57,204,1 ; vextracti128 $0x1,%ymm1,%xmm4
+ DB 196,163,121,22,100,130,56,2 ; vpextrd $0x2,%xmm4,0x38(%rdx,%r8,4)
+ DB 196,227,125,57,204,1 ; vextracti128 $0x1,%ymm1,%xmm4
+ DB 196,163,121,22,100,130,52,1 ; vpextrd $0x1,%xmm4,0x34(%rdx,%r8,4)
+ DB 196,227,125,57,204,1 ; vextracti128 $0x1,%ymm1,%xmm4
+ DB 196,161,121,126,100,130,48 ; vmovd %xmm4,0x30(%rdx,%r8,4)
+ DB 196,161,126,127,4,130 ; vmovdqu %ymm0,(%rdx,%r8,4)
+ DB 196,161,122,127,76,130,32 ; vmovdqu %xmm1,0x20(%rdx,%r8,4)
+ DB 233,53,255,255,255 ; jmpq 626 <_sk_store_8888_hsw_8bit+0x2e>
+ DB 15,31,0 ; nopl (%rax)
+ DB 87 ; push %rdi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,103,255 ; jmpq *-0x1(%rdi)
+ DB 255 ; (bad)
+ DB 255,95,255 ; lcall *-0x1(%rdi)
+ DB 255 ; (bad)
+ DB 255,152,255,255,255,139 ; lcall *-0x74000001(%rax)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 125,255 ; jge 709 <_sk_store_8888_hsw_8bit+0x111>
+ DB 255 ; (bad)
+ DB 255,111,255 ; ljmp *-0x1(%rdi)
+ DB 255 ; (bad)
+ DB 255,183,255,255,255,176 ; pushq -0x4f000001(%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,168,255,255,255,160 ; ljmp *-0x5f000001(%rax)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 235,255 ; jmp 721 <_sk_store_8888_hsw_8bit+0x129>
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 222,255 ; fdivrp %st,%st(7)
+ DB 255 ; (bad)
+ DB 255,208 ; callq *%rax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,194 ; inc %edx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_load_bgra_hsw_8bit
_sk_load_bgra_hsw_8bit LABEL PROC
- DB 76,99,15 ; movslq (%rdi),%r9
- DB 76,139,71,16 ; mov 0x10(%rdi),%r8
+ DB 76,99,7 ; movslq (%rdi),%r8
+ DB 76,139,79,16 ; mov 0x10(%rdi),%r9
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 72,99,80,8 ; movslq 0x8(%rax),%rdx
- DB 72,99,79,8 ; movslq 0x8(%rdi),%rcx
- DB 72,15,175,202 ; imul %rdx,%rcx
- DB 72,193,225,2 ; shl $0x2,%rcx
- DB 72,3,8 ; add (%rax),%rcx
- DB 74,141,4,137 ; lea (%rcx,%r9,4),%rax
- DB 77,133,192 ; test %r8,%r8
- DB 117,17 ; jne 366 <_sk_load_bgra_hsw_8bit+0x36>
- DB 197,254,111,0 ; vmovdqu (%rax),%ymm0
- DB 196,226,125,0,5,254,16,0,0 ; vpshufb 0x10fe(%rip),%ymm0,%ymm0 # 1460 <_sk_xor__hsw_8bit+0x165>
+ DB 72,99,72,8 ; movslq 0x8(%rax),%rcx
+ DB 72,99,87,8 ; movslq 0x8(%rdi),%rdx
+ DB 72,15,175,209 ; imul %rcx,%rdx
+ DB 72,193,226,2 ; shl $0x2,%rdx
+ DB 72,3,16 ; add (%rax),%rdx
+ DB 77,133,201 ; test %r9,%r9
+ DB 117,35 ; jne 774 <_sk_load_bgra_hsw_8bit+0x44>
+ DB 196,161,126,111,76,130,32 ; vmovdqu 0x20(%rdx,%r8,4),%ymm1
+ DB 196,161,126,111,4,130 ; vmovdqu (%rdx,%r8,4),%ymm0
+ DB 197,253,111,37,26,34,0,0 ; vmovdqa 0x221a(%rip),%ymm4 # 2980 <_sk_xor__hsw_8bit+0x22f>
+ DB 196,226,125,0,196 ; vpshufb %ymm4,%ymm0,%ymm0
+ DB 196,226,117,0,204 ; vpshufb %ymm4,%ymm1,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 185,8,0,0,0 ; mov $0x8,%ecx
- DB 68,41,193 ; sub %r8d,%ecx
- DB 192,225,3 ; shl $0x3,%cl
- DB 72,199,194,255,255,255,255 ; mov $0xffffffffffffffff,%rdx
- DB 72,211,234 ; shr %cl,%rdx
- DB 196,225,249,110,194 ; vmovq %rdx,%xmm0
- DB 196,226,125,33,192 ; vpmovsxbd %xmm0,%ymm0
- DB 196,226,125,140,0 ; vpmaskmovd (%rax),%ymm0,%ymm0
- DB 235,205 ; jmp 359 <_sk_load_bgra_hsw_8bit+0x29>
+ DB 65,128,225,15 ; and $0xf,%r9b
+ DB 197,245,239,201 ; vpxor %ymm1,%ymm1,%ymm1
+ DB 197,253,239,192 ; vpxor %ymm0,%ymm0,%ymm0
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 119,213 ; ja 75e <_sk_load_bgra_hsw_8bit+0x2e>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,24,1,0,0 ; lea 0x118(%rip),%rcx # 8ac <_sk_load_bgra_hsw_8bit+0x17c>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 196,161,121,110,4,130 ; vmovd (%rdx,%r8,4),%xmm0
+ DB 235,185 ; jmp 75e <_sk_load_bgra_hsw_8bit+0x2e>
+ DB 196,161,121,110,68,130,8 ; vmovd 0x8(%rdx,%r8,4),%xmm0
+ DB 196,226,121,89,192 ; vpbroadcastq %xmm0,%xmm0
+ DB 197,245,239,201 ; vpxor %ymm1,%ymm1,%ymm1
+ DB 196,227,117,2,192,4 ; vpblendd $0x4,%ymm0,%ymm1,%ymm0
+ DB 196,162,121,53,36,130 ; vpmovzxdq (%rdx,%r8,4),%xmm4
+ DB 197,249,112,228,232 ; vpshufd $0xe8,%xmm4,%xmm4
+ DB 196,227,125,2,196,3 ; vpblendd $0x3,%ymm4,%ymm0,%ymm0
+ DB 235,144 ; jmp 75e <_sk_load_bgra_hsw_8bit+0x2e>
+ DB 196,161,121,110,68,130,24 ; vmovd 0x18(%rdx,%r8,4),%xmm0
+ DB 196,226,125,89,192 ; vpbroadcastq %xmm0,%ymm0
+ DB 197,245,239,201 ; vpxor %ymm1,%ymm1,%ymm1
+ DB 196,227,117,2,192,64 ; vpblendd $0x40,%ymm0,%ymm1,%ymm0
+ DB 196,227,125,57,196,1 ; vextracti128 $0x1,%ymm0,%xmm4
+ DB 196,163,89,34,100,130,20,1 ; vpinsrd $0x1,0x14(%rdx,%r8,4),%xmm4,%xmm4
+ DB 196,227,125,56,196,1 ; vinserti128 $0x1,%xmm4,%ymm0,%ymm0
+ DB 196,227,125,57,196,1 ; vextracti128 $0x1,%ymm0,%xmm4
+ DB 196,163,89,34,100,130,16,0 ; vpinsrd $0x0,0x10(%rdx,%r8,4),%xmm4,%xmm4
+ DB 196,227,125,56,196,1 ; vinserti128 $0x1,%xmm4,%ymm0,%ymm0
+ DB 196,161,122,111,36,130 ; vmovdqu (%rdx,%r8,4),%xmm4
+ DB 196,227,93,2,192,240 ; vpblendd $0xf0,%ymm0,%ymm4,%ymm0
+ DB 233,65,255,255,255 ; jmpq 75e <_sk_load_bgra_hsw_8bit+0x2e>
+ DB 196,161,121,110,68,130,40 ; vmovd 0x28(%rdx,%r8,4),%xmm0
+ DB 196,226,121,89,192 ; vpbroadcastq %xmm0,%xmm0
+ DB 197,245,239,201 ; vpxor %ymm1,%ymm1,%ymm1
+ DB 196,227,117,2,200,4 ; vpblendd $0x4,%ymm0,%ymm1,%ymm1
+ DB 196,163,113,34,68,130,36,1 ; vpinsrd $0x1,0x24(%rdx,%r8,4),%xmm1,%xmm0
+ DB 196,227,117,2,200,15 ; vpblendd $0xf,%ymm0,%ymm1,%ymm1
+ DB 196,161,121,110,68,130,32 ; vmovd 0x20(%rdx,%r8,4),%xmm0
+ DB 196,227,117,2,200,1 ; vpblendd $0x1,%ymm0,%ymm1,%ymm1
+ DB 233,5,255,255,255 ; jmpq 758 <_sk_load_bgra_hsw_8bit+0x28>
+ DB 196,161,121,110,68,130,56 ; vmovd 0x38(%rdx,%r8,4),%xmm0
+ DB 196,226,125,89,192 ; vpbroadcastq %xmm0,%ymm0
+ DB 197,245,239,201 ; vpxor %ymm1,%ymm1,%ymm1
+ DB 196,227,117,2,200,64 ; vpblendd $0x40,%ymm0,%ymm1,%ymm1
+ DB 196,227,125,57,200,1 ; vextracti128 $0x1,%ymm1,%xmm0
+ DB 196,163,121,34,68,130,52,1 ; vpinsrd $0x1,0x34(%rdx,%r8,4),%xmm0,%xmm0
+ DB 196,227,117,56,200,1 ; vinserti128 $0x1,%xmm0,%ymm1,%ymm1
+ DB 196,227,125,57,200,1 ; vextracti128 $0x1,%ymm1,%xmm0
+ DB 196,163,121,34,68,130,48,0 ; vpinsrd $0x0,0x30(%rdx,%r8,4),%xmm0,%xmm0
+ DB 196,227,117,56,200,1 ; vinserti128 $0x1,%xmm0,%ymm1,%ymm1
+ DB 196,161,126,111,4,130 ; vmovdqu (%rdx,%r8,4),%ymm0
+ DB 196,161,122,111,100,130,32 ; vmovdqu 0x20(%rdx,%r8,4),%xmm4
+ DB 196,227,93,2,201,240 ; vpblendd $0xf0,%ymm1,%ymm4,%ymm1
+ DB 233,181,254,255,255 ; jmpq 75e <_sk_load_bgra_hsw_8bit+0x2e>
+ DB 15,31,0 ; nopl (%rax)
+ DB 241 ; icebp
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,15 ; decl (%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 249 ; stc
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,96,255 ; jmpq *-0x1(%rax)
+ DB 255 ; (bad)
+ DB 255,76,255,255 ; decl -0x1(%rdi,%rdi,8)
+ DB 255 ; (bad)
+ DB 56,255 ; cmp %bh,%bh
+ DB 255 ; (bad)
+ DB 255,34 ; jmpq *(%rdx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,172,254,255,255,149,255 ; ljmp *-0x6a0001(%rsi,%rdi,8)
+ DB 255 ; (bad)
+ DB 255,135,255,255,255,113 ; incl 0x71ffffff(%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,229 ; jmpq *%rbp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,209 ; callq *%rcx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 189,255,255,255,167 ; mov $0xa7ffffff,%ebp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_load_bgra_dst_hsw_8bit
_sk_load_bgra_dst_hsw_8bit LABEL PROC
- DB 76,99,15 ; movslq (%rdi),%r9
- DB 76,139,71,16 ; mov 0x10(%rdi),%r8
+ DB 76,99,7 ; movslq (%rdi),%r8
+ DB 76,139,79,16 ; mov 0x10(%rdi),%r9
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 72,99,80,8 ; movslq 0x8(%rax),%rdx
- DB 72,99,79,8 ; movslq 0x8(%rdi),%rcx
- DB 72,15,175,202 ; imul %rdx,%rcx
- DB 72,193,225,2 ; shl $0x2,%rcx
- DB 72,3,8 ; add (%rax),%rcx
- DB 74,141,4,137 ; lea (%rcx,%r9,4),%rax
- DB 77,133,192 ; test %r8,%r8
- DB 117,17 ; jne 3c2 <_sk_load_bgra_dst_hsw_8bit+0x36>
- DB 197,254,111,8 ; vmovdqu (%rax),%ymm1
- DB 196,226,117,0,13,194,16,0,0 ; vpshufb 0x10c2(%rip),%ymm1,%ymm1 # 1480 <_sk_xor__hsw_8bit+0x185>
+ DB 72,99,72,8 ; movslq 0x8(%rax),%rcx
+ DB 72,99,87,8 ; movslq 0x8(%rdi),%rdx
+ DB 72,15,175,209 ; imul %rcx,%rdx
+ DB 72,193,226,2 ; shl $0x2,%rdx
+ DB 72,3,16 ; add (%rax),%rdx
+ DB 77,133,201 ; test %r9,%r9
+ DB 117,35 ; jne 92c <_sk_load_bgra_dst_hsw_8bit+0x44>
+ DB 196,161,126,111,92,130,32 ; vmovdqu 0x20(%rdx,%r8,4),%ymm3
+ DB 196,161,126,111,20,130 ; vmovdqu (%rdx,%r8,4),%ymm2
+ DB 197,253,111,37,130,32,0,0 ; vmovdqa 0x2082(%rip),%ymm4 # 29a0 <_sk_xor__hsw_8bit+0x24f>
+ DB 196,226,109,0,212 ; vpshufb %ymm4,%ymm2,%ymm2
+ DB 196,226,101,0,220 ; vpshufb %ymm4,%ymm3,%ymm3
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 185,8,0,0,0 ; mov $0x8,%ecx
- DB 68,41,193 ; sub %r8d,%ecx
- DB 192,225,3 ; shl $0x3,%cl
- DB 72,199,194,255,255,255,255 ; mov $0xffffffffffffffff,%rdx
- DB 72,211,234 ; shr %cl,%rdx
- DB 196,225,249,110,202 ; vmovq %rdx,%xmm1
- DB 196,226,125,33,201 ; vpmovsxbd %xmm1,%ymm1
- DB 196,226,117,140,8 ; vpmaskmovd (%rax),%ymm1,%ymm1
- DB 235,205 ; jmp 3b5 <_sk_load_bgra_dst_hsw_8bit+0x29>
+ DB 65,128,225,15 ; and $0xf,%r9b
+ DB 197,229,239,219 ; vpxor %ymm3,%ymm3,%ymm3
+ DB 197,237,239,210 ; vpxor %ymm2,%ymm2,%ymm2
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 119,213 ; ja 916 <_sk_load_bgra_dst_hsw_8bit+0x2e>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,24,1,0,0 ; lea 0x118(%rip),%rcx # a64 <_sk_load_bgra_dst_hsw_8bit+0x17c>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 196,161,121,110,20,130 ; vmovd (%rdx,%r8,4),%xmm2
+ DB 235,185 ; jmp 916 <_sk_load_bgra_dst_hsw_8bit+0x2e>
+ DB 196,161,121,110,84,130,8 ; vmovd 0x8(%rdx,%r8,4),%xmm2
+ DB 196,226,121,89,210 ; vpbroadcastq %xmm2,%xmm2
+ DB 197,229,239,219 ; vpxor %ymm3,%ymm3,%ymm3
+ DB 196,227,101,2,210,4 ; vpblendd $0x4,%ymm2,%ymm3,%ymm2
+ DB 196,162,121,53,36,130 ; vpmovzxdq (%rdx,%r8,4),%xmm4
+ DB 197,249,112,228,232 ; vpshufd $0xe8,%xmm4,%xmm4
+ DB 196,227,109,2,212,3 ; vpblendd $0x3,%ymm4,%ymm2,%ymm2
+ DB 235,144 ; jmp 916 <_sk_load_bgra_dst_hsw_8bit+0x2e>
+ DB 196,161,121,110,84,130,24 ; vmovd 0x18(%rdx,%r8,4),%xmm2
+ DB 196,226,125,89,210 ; vpbroadcastq %xmm2,%ymm2
+ DB 197,229,239,219 ; vpxor %ymm3,%ymm3,%ymm3
+ DB 196,227,101,2,210,64 ; vpblendd $0x40,%ymm2,%ymm3,%ymm2
+ DB 196,227,125,57,212,1 ; vextracti128 $0x1,%ymm2,%xmm4
+ DB 196,163,89,34,100,130,20,1 ; vpinsrd $0x1,0x14(%rdx,%r8,4),%xmm4,%xmm4
+ DB 196,227,109,56,212,1 ; vinserti128 $0x1,%xmm4,%ymm2,%ymm2
+ DB 196,227,125,57,212,1 ; vextracti128 $0x1,%ymm2,%xmm4
+ DB 196,163,89,34,100,130,16,0 ; vpinsrd $0x0,0x10(%rdx,%r8,4),%xmm4,%xmm4
+ DB 196,227,109,56,212,1 ; vinserti128 $0x1,%xmm4,%ymm2,%ymm2
+ DB 196,161,122,111,36,130 ; vmovdqu (%rdx,%r8,4),%xmm4
+ DB 196,227,93,2,210,240 ; vpblendd $0xf0,%ymm2,%ymm4,%ymm2
+ DB 233,65,255,255,255 ; jmpq 916 <_sk_load_bgra_dst_hsw_8bit+0x2e>
+ DB 196,161,121,110,84,130,40 ; vmovd 0x28(%rdx,%r8,4),%xmm2
+ DB 196,226,121,89,210 ; vpbroadcastq %xmm2,%xmm2
+ DB 197,229,239,219 ; vpxor %ymm3,%ymm3,%ymm3
+ DB 196,227,101,2,218,4 ; vpblendd $0x4,%ymm2,%ymm3,%ymm3
+ DB 196,163,97,34,84,130,36,1 ; vpinsrd $0x1,0x24(%rdx,%r8,4),%xmm3,%xmm2
+ DB 196,227,101,2,218,15 ; vpblendd $0xf,%ymm2,%ymm3,%ymm3
+ DB 196,161,121,110,84,130,32 ; vmovd 0x20(%rdx,%r8,4),%xmm2
+ DB 196,227,101,2,218,1 ; vpblendd $0x1,%ymm2,%ymm3,%ymm3
+ DB 233,5,255,255,255 ; jmpq 910 <_sk_load_bgra_dst_hsw_8bit+0x28>
+ DB 196,161,121,110,84,130,56 ; vmovd 0x38(%rdx,%r8,4),%xmm2
+ DB 196,226,125,89,210 ; vpbroadcastq %xmm2,%ymm2
+ DB 197,229,239,219 ; vpxor %ymm3,%ymm3,%ymm3
+ DB 196,227,101,2,218,64 ; vpblendd $0x40,%ymm2,%ymm3,%ymm3
+ DB 196,227,125,57,218,1 ; vextracti128 $0x1,%ymm3,%xmm2
+ DB 196,163,105,34,84,130,52,1 ; vpinsrd $0x1,0x34(%rdx,%r8,4),%xmm2,%xmm2
+ DB 196,227,101,56,218,1 ; vinserti128 $0x1,%xmm2,%ymm3,%ymm3
+ DB 196,227,125,57,218,1 ; vextracti128 $0x1,%ymm3,%xmm2
+ DB 196,163,105,34,84,130,48,0 ; vpinsrd $0x0,0x30(%rdx,%r8,4),%xmm2,%xmm2
+ DB 196,227,101,56,218,1 ; vinserti128 $0x1,%xmm2,%ymm3,%ymm3
+ DB 196,161,126,111,20,130 ; vmovdqu (%rdx,%r8,4),%ymm2
+ DB 196,161,122,111,100,130,32 ; vmovdqu 0x20(%rdx,%r8,4),%xmm4
+ DB 196,227,93,2,219,240 ; vpblendd $0xf0,%ymm3,%ymm4,%ymm3
+ DB 233,181,254,255,255 ; jmpq 916 <_sk_load_bgra_dst_hsw_8bit+0x2e>
+ DB 15,31,0 ; nopl (%rax)
+ DB 241 ; icebp
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,15 ; decl (%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 249 ; stc
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,96,255 ; jmpq *-0x1(%rax)
+ DB 255 ; (bad)
+ DB 255,76,255,255 ; decl -0x1(%rdi,%rdi,8)
+ DB 255 ; (bad)
+ DB 56,255 ; cmp %bh,%bh
+ DB 255 ; (bad)
+ DB 255,34 ; jmpq *(%rdx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,172,254,255,255,149,255 ; ljmp *-0x6a0001(%rsi,%rdi,8)
+ DB 255 ; (bad)
+ DB 255,135,255,255,255,113 ; incl 0x71ffffff(%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,229 ; jmpq *%rbp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,209 ; callq *%rcx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 189,255,255,255,167 ; mov $0xa7ffffff,%ebp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_store_bgra_hsw_8bit
_sk_store_bgra_hsw_8bit LABEL PROC
- DB 76,99,15 ; movslq (%rdi),%r9
- DB 76,139,71,16 ; mov 0x10(%rdi),%r8
+ DB 76,99,7 ; movslq (%rdi),%r8
+ DB 76,139,79,16 ; mov 0x10(%rdi),%r9
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 72,99,80,8 ; movslq 0x8(%rax),%rdx
- DB 72,99,79,8 ; movslq 0x8(%rdi),%rcx
- DB 72,15,175,202 ; imul %rdx,%rcx
- DB 72,193,225,2 ; shl $0x2,%rcx
- DB 72,3,8 ; add (%rax),%rcx
- DB 74,141,4,137 ; lea (%rcx,%r9,4),%rax
- DB 196,226,125,0,21,143,16,0,0 ; vpshufb 0x108f(%rip),%ymm0,%ymm2 # 14a0 <_sk_xor__hsw_8bit+0x1a5>
- DB 77,133,192 ; test %r8,%r8
- DB 117,8 ; jne 41e <_sk_store_bgra_hsw_8bit+0x36>
- DB 197,254,127,16 ; vmovdqu %ymm2,(%rax)
+ DB 72,99,72,8 ; movslq 0x8(%rax),%rcx
+ DB 72,99,87,8 ; movslq 0x8(%rdi),%rdx
+ DB 72,15,175,209 ; imul %rcx,%rdx
+ DB 72,193,226,2 ; shl $0x2,%rdx
+ DB 72,3,16 ; add (%rax),%rdx
+ DB 197,253,111,37,252,30,0,0 ; vmovdqa 0x1efc(%rip),%ymm4 # 29c0 <_sk_xor__hsw_8bit+0x26f>
+ DB 196,226,117,0,236 ; vpshufb %ymm4,%ymm1,%ymm5
+ DB 196,226,125,0,228 ; vpshufb %ymm4,%ymm0,%ymm4
+ DB 77,133,201 ; test %r9,%r9
+ DB 117,17 ; jne ae4 <_sk_store_bgra_hsw_8bit+0x44>
+ DB 196,161,126,127,36,130 ; vmovdqu %ymm4,(%rdx,%r8,4)
+ DB 196,161,126,127,108,130,32 ; vmovdqu %ymm5,0x20(%rdx,%r8,4)
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 185,8,0,0,0 ; mov $0x8,%ecx
- DB 68,41,193 ; sub %r8d,%ecx
- DB 192,225,3 ; shl $0x3,%cl
- DB 72,199,194,255,255,255,255 ; mov $0xffffffffffffffff,%rdx
- DB 72,211,234 ; shr %cl,%rdx
- DB 196,225,249,110,218 ; vmovq %rdx,%xmm3
- DB 196,226,125,33,219 ; vpmovsxbd %xmm3,%ymm3
- DB 196,226,101,142,16 ; vpmaskmovd %ymm2,%ymm3,(%rax)
- DB 235,214 ; jmp 41a <_sk_store_bgra_hsw_8bit+0x32>
+ DB 65,128,225,15 ; and $0xf,%r9b
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 119,239 ; ja ae0 <_sk_store_bgra_hsw_8bit+0x40>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,176,0,0,0 ; lea 0xb0(%rip),%rcx # bac <_sk_store_bgra_hsw_8bit+0x10c>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 196,161,121,126,36,130 ; vmovd %xmm4,(%rdx,%r8,4)
+ DB 235,211 ; jmp ae0 <_sk_store_bgra_hsw_8bit+0x40>
+ DB 196,163,121,22,100,130,8,2 ; vpextrd $0x2,%xmm4,0x8(%rdx,%r8,4)
+ DB 196,161,121,214,36,130 ; vmovq %xmm4,(%rdx,%r8,4)
+ DB 235,195 ; jmp ae0 <_sk_store_bgra_hsw_8bit+0x40>
+ DB 196,227,125,57,229,1 ; vextracti128 $0x1,%ymm4,%xmm5
+ DB 196,163,121,22,108,130,24,2 ; vpextrd $0x2,%xmm5,0x18(%rdx,%r8,4)
+ DB 196,227,125,57,229,1 ; vextracti128 $0x1,%ymm4,%xmm5
+ DB 196,163,121,22,108,130,20,1 ; vpextrd $0x1,%xmm5,0x14(%rdx,%r8,4)
+ DB 196,227,125,57,229,1 ; vextracti128 $0x1,%ymm4,%xmm5
+ DB 196,161,121,126,108,130,16 ; vmovd %xmm5,0x10(%rdx,%r8,4)
+ DB 196,161,122,127,36,130 ; vmovdqu %xmm4,(%rdx,%r8,4)
+ DB 235,146 ; jmp ae0 <_sk_store_bgra_hsw_8bit+0x40>
+ DB 196,163,121,22,108,130,40,2 ; vpextrd $0x2,%xmm5,0x28(%rdx,%r8,4)
+ DB 196,163,121,22,108,130,36,1 ; vpextrd $0x1,%xmm5,0x24(%rdx,%r8,4)
+ DB 196,161,121,126,108,130,32 ; vmovd %xmm5,0x20(%rdx,%r8,4)
+ DB 196,161,126,127,36,130 ; vmovdqu %ymm4,(%rdx,%r8,4)
+ DB 233,112,255,255,255 ; jmpq ae0 <_sk_store_bgra_hsw_8bit+0x40>
+ DB 196,227,125,57,238,1 ; vextracti128 $0x1,%ymm5,%xmm6
+ DB 196,163,121,22,116,130,56,2 ; vpextrd $0x2,%xmm6,0x38(%rdx,%r8,4)
+ DB 196,227,125,57,238,1 ; vextracti128 $0x1,%ymm5,%xmm6
+ DB 196,163,121,22,116,130,52,1 ; vpextrd $0x1,%xmm6,0x34(%rdx,%r8,4)
+ DB 196,227,125,57,238,1 ; vextracti128 $0x1,%ymm5,%xmm6
+ DB 196,161,121,126,116,130,48 ; vmovd %xmm6,0x30(%rdx,%r8,4)
+ DB 196,161,126,127,36,130 ; vmovdqu %ymm4,(%rdx,%r8,4)
+ DB 196,161,122,127,108,130,32 ; vmovdqu %xmm5,0x20(%rdx,%r8,4)
+ DB 233,53,255,255,255 ; jmpq ae0 <_sk_store_bgra_hsw_8bit+0x40>
+ DB 144 ; nop
+ DB 89 ; pop %rcx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,105,255 ; ljmp *-0x1(%rcx)
+ DB 255 ; (bad)
+ DB 255,97,255 ; jmpq *-0x1(%rcx)
+ DB 255 ; (bad)
+ DB 255,154,255,255,255,141 ; lcall *-0x72000001(%rdx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 127,255 ; jg bc1 <_sk_store_bgra_hsw_8bit+0x121>
+ DB 255 ; (bad)
+ DB 255,113,255 ; pushq -0x1(%rcx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 185,255,255,255,178 ; mov $0xb2ffffff,%ecx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,170,255,255,255,162 ; ljmp *-0x5d000001(%rdx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 237 ; in (%dx),%eax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,224 ; jmpq *%rax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,210 ; callq *%rdx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,196 ; inc %esp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_load_a8_hsw_8bit
_sk_load_a8_hsw_8bit LABEL PROC
@@ -38766,61 +39264,86 @@ _sk_load_a8_hsw_8bit LABEL PROC
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,28 ; jne 47d <_sk_load_a8_hsw_8bit+0x39>
- DB 196,162,121,48,4,2 ; vpmovzxbw (%rdx,%r8,1),%xmm0
- DB 197,249,219,5,81,18,0,0 ; vpand 0x1251(%rip),%xmm0,%xmm0 # 16c0 <_sk_xor__hsw_8bit+0x3c5>
- DB 196,226,125,51,192 ; vpmovzxwd %xmm0,%ymm0
+ DB 117,35 ; jne c28 <_sk_load_a8_hsw_8bit+0x40>
+ DB 196,161,122,111,4,2 ; vmovdqu (%rdx,%r8,1),%xmm0
+ DB 197,249,112,200,78 ; vpshufd $0x4e,%xmm0,%xmm1
+ DB 196,226,125,49,201 ; vpmovzxbd %xmm1,%ymm1
+ DB 196,226,125,49,192 ; vpmovzxbd %xmm0,%ymm0
DB 197,253,114,240,24 ; vpslld $0x18,%ymm0,%ymm0
+ DB 197,245,114,241,24 ; vpslld $0x18,%ymm1,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,7 ; and $0x7,%r9b
+ DB 65,128,225,15 ; and $0xf,%r9b
DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
DB 65,254,201 ; dec %r9b
- DB 65,128,249,6 ; cmp $0x6,%r9b
- DB 119,217 ; ja 467 <_sk_load_a8_hsw_8bit+0x23>
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 119,210 ; ja c0b <_sk_load_a8_hsw_8bit+0x23>
DB 65,15,182,193 ; movzbl %r9b,%eax
- DB 72,141,13,119,0,0,0 ; lea 0x77(%rip),%rcx # 510 <_sk_load_a8_hsw_8bit+0xcc>
+ DB 72,141,13,192,0,0,0 ; lea 0xc0(%rip),%rcx # d04 <_sk_load_a8_hsw_8bit+0x11c>
DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
DB 72,1,200 ; add %rcx,%rax
DB 255,224 ; jmpq *%rax
DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
DB 197,249,110,192 ; vmovd %eax,%xmm0
- DB 235,186 ; jmp 467 <_sk_load_a8_hsw_8bit+0x23>
- DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
+ DB 235,179 ; jmp c0b <_sk_load_a8_hsw_8bit+0x23>
DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
- DB 197,249,196,192,2 ; vpinsrw $0x2,%eax,%xmm0,%xmm0
+ DB 196,163,121,32,68,2,2,2 ; vpinsrb $0x2,0x2(%rdx,%r8,1),%xmm0,%xmm0
DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
- DB 197,249,110,208 ; vmovd %eax,%xmm2
- DB 196,226,121,48,210 ; vpmovzxbw %xmm2,%xmm2
- DB 196,227,121,2,194,1 ; vpblendd $0x1,%xmm2,%xmm0,%xmm0
- DB 235,149 ; jmp 467 <_sk_load_a8_hsw_8bit+0x23>
- DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax
+ DB 197,249,110,200 ; vmovd %eax,%xmm1
+ DB 196,227,121,14,193,1 ; vpblendw $0x1,%xmm1,%xmm0,%xmm0
+ DB 235,150 ; jmp c0b <_sk_load_a8_hsw_8bit+0x23>
DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
- DB 197,249,196,192,6 ; vpinsrw $0x6,%eax,%xmm0,%xmm0
- DB 66,15,182,68,2,5 ; movzbl 0x5(%rdx,%r8,1),%eax
- DB 197,249,196,192,5 ; vpinsrw $0x5,%eax,%xmm0,%xmm0
- DB 66,15,182,68,2,4 ; movzbl 0x4(%rdx,%r8,1),%eax
- DB 197,249,196,192,4 ; vpinsrw $0x4,%eax,%xmm0,%xmm0
- DB 196,161,121,110,20,2 ; vmovd (%rdx,%r8,1),%xmm2
- DB 196,226,121,48,210 ; vpmovzxbw %xmm2,%xmm2
- DB 196,227,105,2,192,12 ; vpblendd $0xc,%xmm0,%xmm2,%xmm0
- DB 233,90,255,255,255 ; jmpq 467 <_sk_load_a8_hsw_8bit+0x23>
- DB 15,31,0 ; nopl (%rax)
- DB 146 ; xchg %eax,%edx
+ DB 196,163,121,32,68,2,6,6 ; vpinsrb $0x6,0x6(%rdx,%r8,1),%xmm0,%xmm0
+ DB 196,163,121,32,68,2,5,5 ; vpinsrb $0x5,0x5(%rdx,%r8,1),%xmm0,%xmm0
+ DB 196,163,121,32,68,2,4,4 ; vpinsrb $0x4,0x4(%rdx,%r8,1),%xmm0,%xmm0
+ DB 196,161,121,110,12,2 ; vmovd (%rdx,%r8,1),%xmm1
+ DB 196,227,121,2,193,1 ; vpblendd $0x1,%xmm1,%xmm0,%xmm0
+ DB 233,105,255,255,255 ; jmpq c0b <_sk_load_a8_hsw_8bit+0x23>
+ DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
+ DB 196,163,121,32,68,2,10,10 ; vpinsrb $0xa,0xa(%rdx,%r8,1),%xmm0,%xmm0
+ DB 196,163,121,32,68,2,9,9 ; vpinsrb $0x9,0x9(%rdx,%r8,1),%xmm0,%xmm0
+ DB 196,163,121,32,68,2,8,8 ; vpinsrb $0x8,0x8(%rdx,%r8,1),%xmm0,%xmm0
+ DB 196,161,122,126,12,2 ; vmovq (%rdx,%r8,1),%xmm1
+ DB 196,227,113,2,192,12 ; vpblendd $0xc,%xmm0,%xmm1,%xmm0
+ DB 233,60,255,255,255 ; jmpq c0b <_sk_load_a8_hsw_8bit+0x23>
+ DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
+ DB 196,163,121,32,68,2,14,14 ; vpinsrb $0xe,0xe(%rdx,%r8,1),%xmm0,%xmm0
+ DB 196,163,121,32,68,2,13,13 ; vpinsrb $0xd,0xd(%rdx,%r8,1),%xmm0,%xmm0
+ DB 196,163,121,32,68,2,12,12 ; vpinsrb $0xc,0xc(%rdx,%r8,1),%xmm0,%xmm0
+ DB 196,161,122,126,12,2 ; vmovq (%rdx,%r8,1),%xmm1
+ DB 196,163,113,34,76,2,8,2 ; vpinsrd $0x2,0x8(%rdx,%r8,1),%xmm1,%xmm1
+ DB 196,227,113,2,192,8 ; vpblendd $0x8,%xmm0,%xmm1,%xmm0
+ DB 233,7,255,255,255 ; jmpq c0b <_sk_load_a8_hsw_8bit+0x23>
+ DB 73,255 ; rex.WB (bad)
DB 255 ; (bad)
+ DB 255,96,255 ; jmpq *-0x1(%rax)
+ DB 255 ; (bad)
+ DB 255,84,255,255 ; callq *-0x1(%rdi,%rdi,8)
+ DB 255,141,255,255,255,133 ; decl -0x7a000001(%rbp)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 125,255 ; jge d19 <_sk_load_a8_hsw_8bit+0x131>
+ DB 255 ; (bad)
+ DB 255,113,255 ; pushq -0x1(%rcx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 186,255,255,255,178 ; mov $0xb2ffffff,%edx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,170,255,255,255,158 ; ljmp *-0x61000001(%rdx)
DB 255 ; (bad)
- DB 255,172,255,255,255,157,255 ; ljmp *-0x620001(%rdi,%rdi,8)
DB 255 ; (bad)
DB 255,231 ; jmpq *%rdi
DB 255 ; (bad)
DB 255 ; (bad)
DB 255 ; (bad)
- DB 220,255 ; fdivr %st,%st(7)
+ DB 223,255 ; (bad)
DB 255 ; (bad)
- DB 255,209 ; callq *%rcx
+ DB 255,215 ; callq *%rdi
DB 255 ; (bad)
DB 255 ; (bad)
- DB 255,194 ; inc %edx
+ DB 255,203 ; dec %ebx
DB 255 ; (bad)
DB 255 ; (bad)
DB 255 ; .byte 0xff
@@ -38835,61 +39358,86 @@ _sk_load_a8_dst_hsw_8bit LABEL PROC
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,28 ; jne 565 <_sk_load_a8_dst_hsw_8bit+0x39>
- DB 196,162,121,48,12,2 ; vpmovzxbw (%rdx,%r8,1),%xmm1
- DB 197,241,219,13,121,17,0,0 ; vpand 0x1179(%rip),%xmm1,%xmm1 # 16d0 <_sk_xor__hsw_8bit+0x3d5>
- DB 196,226,125,51,201 ; vpmovzxwd %xmm1,%ymm1
- DB 197,245,114,241,24 ; vpslld $0x18,%ymm1,%ymm1
+ DB 117,35 ; jne d80 <_sk_load_a8_dst_hsw_8bit+0x40>
+ DB 196,161,122,111,20,2 ; vmovdqu (%rdx,%r8,1),%xmm2
+ DB 197,249,112,218,78 ; vpshufd $0x4e,%xmm2,%xmm3
+ DB 196,226,125,49,219 ; vpmovzxbd %xmm3,%ymm3
+ DB 196,226,125,49,210 ; vpmovzxbd %xmm2,%ymm2
+ DB 197,237,114,242,24 ; vpslld $0x18,%ymm2,%ymm2
+ DB 197,229,114,243,24 ; vpslld $0x18,%ymm3,%ymm3
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,7 ; and $0x7,%r9b
- DB 197,241,239,201 ; vpxor %xmm1,%xmm1,%xmm1
+ DB 65,128,225,15 ; and $0xf,%r9b
+ DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2
DB 65,254,201 ; dec %r9b
- DB 65,128,249,6 ; cmp $0x6,%r9b
- DB 119,217 ; ja 54f <_sk_load_a8_dst_hsw_8bit+0x23>
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 119,210 ; ja d63 <_sk_load_a8_dst_hsw_8bit+0x23>
DB 65,15,182,193 ; movzbl %r9b,%eax
- DB 72,141,13,119,0,0,0 ; lea 0x77(%rip),%rcx # 5f8 <_sk_load_a8_dst_hsw_8bit+0xcc>
+ DB 72,141,13,192,0,0,0 ; lea 0xc0(%rip),%rcx # e5c <_sk_load_a8_dst_hsw_8bit+0x11c>
DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
DB 72,1,200 ; add %rcx,%rax
DB 255,224 ; jmpq *%rax
DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
- DB 197,249,110,200 ; vmovd %eax,%xmm1
- DB 235,186 ; jmp 54f <_sk_load_a8_dst_hsw_8bit+0x23>
- DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
- DB 197,241,239,201 ; vpxor %xmm1,%xmm1,%xmm1
- DB 197,241,196,200,2 ; vpinsrw $0x2,%eax,%xmm1,%xmm1
- DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
DB 197,249,110,208 ; vmovd %eax,%xmm2
- DB 196,226,121,48,210 ; vpmovzxbw %xmm2,%xmm2
- DB 196,227,113,2,202,1 ; vpblendd $0x1,%xmm2,%xmm1,%xmm1
- DB 235,149 ; jmp 54f <_sk_load_a8_dst_hsw_8bit+0x23>
- DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax
- DB 197,241,239,201 ; vpxor %xmm1,%xmm1,%xmm1
- DB 197,241,196,200,6 ; vpinsrw $0x6,%eax,%xmm1,%xmm1
- DB 66,15,182,68,2,5 ; movzbl 0x5(%rdx,%r8,1),%eax
- DB 197,241,196,200,5 ; vpinsrw $0x5,%eax,%xmm1,%xmm1
- DB 66,15,182,68,2,4 ; movzbl 0x4(%rdx,%r8,1),%eax
- DB 197,241,196,200,4 ; vpinsrw $0x4,%eax,%xmm1,%xmm1
- DB 196,161,121,110,20,2 ; vmovd (%rdx,%r8,1),%xmm2
- DB 196,226,121,48,210 ; vpmovzxbw %xmm2,%xmm2
- DB 196,227,105,2,201,12 ; vpblendd $0xc,%xmm1,%xmm2,%xmm1
- DB 233,90,255,255,255 ; jmpq 54f <_sk_load_a8_dst_hsw_8bit+0x23>
- DB 15,31,0 ; nopl (%rax)
- DB 146 ; xchg %eax,%edx
+ DB 235,179 ; jmp d63 <_sk_load_a8_dst_hsw_8bit+0x23>
+ DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2
+ DB 196,163,105,32,84,2,2,2 ; vpinsrb $0x2,0x2(%rdx,%r8,1),%xmm2,%xmm2
+ DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
+ DB 197,249,110,216 ; vmovd %eax,%xmm3
+ DB 196,227,105,14,211,1 ; vpblendw $0x1,%xmm3,%xmm2,%xmm2
+ DB 235,150 ; jmp d63 <_sk_load_a8_dst_hsw_8bit+0x23>
+ DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2
+ DB 196,163,105,32,84,2,6,6 ; vpinsrb $0x6,0x6(%rdx,%r8,1),%xmm2,%xmm2
+ DB 196,163,105,32,84,2,5,5 ; vpinsrb $0x5,0x5(%rdx,%r8,1),%xmm2,%xmm2
+ DB 196,163,105,32,84,2,4,4 ; vpinsrb $0x4,0x4(%rdx,%r8,1),%xmm2,%xmm2
+ DB 196,161,121,110,28,2 ; vmovd (%rdx,%r8,1),%xmm3
+ DB 196,227,105,2,211,1 ; vpblendd $0x1,%xmm3,%xmm2,%xmm2
+ DB 233,105,255,255,255 ; jmpq d63 <_sk_load_a8_dst_hsw_8bit+0x23>
+ DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2
+ DB 196,163,105,32,84,2,10,10 ; vpinsrb $0xa,0xa(%rdx,%r8,1),%xmm2,%xmm2
+ DB 196,163,105,32,84,2,9,9 ; vpinsrb $0x9,0x9(%rdx,%r8,1),%xmm2,%xmm2
+ DB 196,163,105,32,84,2,8,8 ; vpinsrb $0x8,0x8(%rdx,%r8,1),%xmm2,%xmm2
+ DB 196,161,122,126,28,2 ; vmovq (%rdx,%r8,1),%xmm3
+ DB 196,227,97,2,210,12 ; vpblendd $0xc,%xmm2,%xmm3,%xmm2
+ DB 233,60,255,255,255 ; jmpq d63 <_sk_load_a8_dst_hsw_8bit+0x23>
+ DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2
+ DB 196,163,105,32,84,2,14,14 ; vpinsrb $0xe,0xe(%rdx,%r8,1),%xmm2,%xmm2
+ DB 196,163,105,32,84,2,13,13 ; vpinsrb $0xd,0xd(%rdx,%r8,1),%xmm2,%xmm2
+ DB 196,163,105,32,84,2,12,12 ; vpinsrb $0xc,0xc(%rdx,%r8,1),%xmm2,%xmm2
+ DB 196,161,122,126,28,2 ; vmovq (%rdx,%r8,1),%xmm3
+ DB 196,163,97,34,92,2,8,2 ; vpinsrd $0x2,0x8(%rdx,%r8,1),%xmm3,%xmm3
+ DB 196,227,97,2,210,8 ; vpblendd $0x8,%xmm2,%xmm3,%xmm2
+ DB 233,7,255,255,255 ; jmpq d63 <_sk_load_a8_dst_hsw_8bit+0x23>
+ DB 73,255 ; rex.WB (bad)
DB 255 ; (bad)
+ DB 255,96,255 ; jmpq *-0x1(%rax)
+ DB 255 ; (bad)
+ DB 255,84,255,255 ; callq *-0x1(%rdi,%rdi,8)
+ DB 255,141,255,255,255,133 ; decl -0x7a000001(%rbp)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 125,255 ; jge e71 <_sk_load_a8_dst_hsw_8bit+0x131>
+ DB 255 ; (bad)
+ DB 255,113,255 ; pushq -0x1(%rcx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 186,255,255,255,178 ; mov $0xb2ffffff,%edx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,170,255,255,255,158 ; ljmp *-0x61000001(%rdx)
DB 255 ; (bad)
- DB 255,172,255,255,255,157,255 ; ljmp *-0x620001(%rdi,%rdi,8)
DB 255 ; (bad)
DB 255,231 ; jmpq *%rdi
DB 255 ; (bad)
DB 255 ; (bad)
DB 255 ; (bad)
- DB 220,255 ; fdivr %st,%st(7)
+ DB 223,255 ; (bad)
DB 255 ; (bad)
- DB 255,209 ; callq *%rcx
+ DB 255,215 ; callq *%rdi
DB 255 ; (bad)
DB 255 ; (bad)
- DB 255,194 ; inc %edx
+ DB 255,203 ; dec %ebx
DB 255 ; (bad)
DB 255 ; (bad)
DB 255 ; .byte 0xff
@@ -38903,51 +39451,80 @@ _sk_store_a8_hsw_8bit LABEL PROC
DB 72,99,87,8 ; movslq 0x8(%rdi),%rdx
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
- DB 196,226,125,0,21,139,14,0,0 ; vpshufb 0xe8b(%rip),%ymm0,%ymm2 # 14c0 <_sk_xor__hsw_8bit+0x1c5>
- DB 196,227,253,0,210,232 ; vpermq $0xe8,%ymm2,%ymm2
+ DB 197,253,111,37,40,27,0,0 ; vmovdqa 0x1b28(%rip),%ymm4 # 29e0 <_sk_xor__hsw_8bit+0x28f>
+ DB 196,226,117,0,236 ; vpshufb %ymm4,%ymm1,%ymm5
+ DB 196,227,253,0,237,232 ; vpermq $0xe8,%ymm5,%ymm5
+ DB 197,249,111,53,21,29,0,0 ; vmovdqa 0x1d15(%rip),%xmm6 # 2be0 <_sk_xor__hsw_8bit+0x48f>
+ DB 196,226,81,0,238 ; vpshufb %xmm6,%xmm5,%xmm5
+ DB 196,226,125,0,228 ; vpshufb %ymm4,%ymm0,%ymm4
+ DB 196,227,253,0,228,232 ; vpermq $0xe8,%ymm4,%ymm4
+ DB 196,226,89,0,230 ; vpshufb %xmm6,%xmm4,%xmm4
+ DB 197,217,108,229 ; vpunpcklqdq %xmm5,%xmm4,%xmm4
DB 77,133,201 ; test %r9,%r9
- DB 117,19 ; jne 653 <_sk_store_a8_hsw_8bit+0x3f>
- DB 196,226,105,0,21,183,16,0,0 ; vpshufb 0x10b7(%rip),%xmm2,%xmm2 # 1700 <_sk_xor__hsw_8bit+0x405>
- DB 196,161,121,214,20,2 ; vmovq %xmm2,(%rdx,%r8,1)
+ DB 117,10 ; jne ef3 <_sk_store_a8_hsw_8bit+0x5b>
+ DB 196,161,122,127,36,2 ; vmovdqu %xmm4,(%rdx,%r8,1)
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,7 ; and $0x7,%r9b
+ DB 65,128,225,15 ; and $0xf,%r9b
DB 65,254,201 ; dec %r9b
- DB 65,128,249,6 ; cmp $0x6,%r9b
- DB 119,239 ; ja 64f <_sk_store_a8_hsw_8bit+0x3b>
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 119,239 ; ja eef <_sk_store_a8_hsw_8bit+0x57>
DB 65,15,182,193 ; movzbl %r9b,%eax
- DB 72,141,13,85,0,0,0 ; lea 0x55(%rip),%rcx # 6c0 <_sk_store_a8_hsw_8bit+0xac>
+ DB 72,141,13,137,0,0,0 ; lea 0x89(%rip),%rcx # f94 <_sk_store_a8_hsw_8bit+0xfc>
DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
DB 72,1,200 ; add %rcx,%rax
DB 255,224 ; jmpq *%rax
- DB 196,163,121,20,20,2,0 ; vpextrb $0x0,%xmm2,(%rdx,%r8,1)
- DB 235,210 ; jmp 64f <_sk_store_a8_hsw_8bit+0x3b>
- DB 196,163,121,20,84,2,2,4 ; vpextrb $0x4,%xmm2,0x2(%rdx,%r8,1)
- DB 196,226,105,0,21,82,16,0,0 ; vpshufb 0x1052(%rip),%xmm2,%xmm2 # 16e0 <_sk_xor__hsw_8bit+0x3e5>
- DB 196,163,121,21,20,2,0 ; vpextrw $0x0,%xmm2,(%rdx,%r8,1)
- DB 235,184 ; jmp 64f <_sk_store_a8_hsw_8bit+0x3b>
- DB 196,163,121,20,84,2,6,12 ; vpextrb $0xc,%xmm2,0x6(%rdx,%r8,1)
- DB 196,163,121,20,84,2,5,10 ; vpextrb $0xa,%xmm2,0x5(%rdx,%r8,1)
- DB 196,163,121,20,84,2,4,8 ; vpextrb $0x8,%xmm2,0x4(%rdx,%r8,1)
- DB 196,226,105,0,21,56,16,0,0 ; vpshufb 0x1038(%rip),%xmm2,%xmm2 # 16f0 <_sk_xor__hsw_8bit+0x3f5>
- DB 196,161,121,126,20,2 ; vmovd %xmm2,(%rdx,%r8,1)
- DB 235,143 ; jmp 64f <_sk_store_a8_hsw_8bit+0x3b>
- DB 180,255 ; mov $0xff,%ah
+ DB 196,163,121,20,36,2,0 ; vpextrb $0x0,%xmm4,(%rdx,%r8,1)
+ DB 235,210 ; jmp eef <_sk_store_a8_hsw_8bit+0x57>
+ DB 196,163,121,20,100,2,2,2 ; vpextrb $0x2,%xmm4,0x2(%rdx,%r8,1)
+ DB 196,163,121,21,36,2,0 ; vpextrw $0x0,%xmm4,(%rdx,%r8,1)
+ DB 235,193 ; jmp eef <_sk_store_a8_hsw_8bit+0x57>
+ DB 196,163,121,20,100,2,6,6 ; vpextrb $0x6,%xmm4,0x6(%rdx,%r8,1)
+ DB 196,163,121,20,100,2,5,5 ; vpextrb $0x5,%xmm4,0x5(%rdx,%r8,1)
+ DB 196,163,121,20,100,2,4,4 ; vpextrb $0x4,%xmm4,0x4(%rdx,%r8,1)
+ DB 196,161,121,126,36,2 ; vmovd %xmm4,(%rdx,%r8,1)
+ DB 235,161 ; jmp eef <_sk_store_a8_hsw_8bit+0x57>
+ DB 196,163,121,20,100,2,10,10 ; vpextrb $0xa,%xmm4,0xa(%rdx,%r8,1)
+ DB 196,163,121,20,100,2,9,9 ; vpextrb $0x9,%xmm4,0x9(%rdx,%r8,1)
+ DB 196,163,121,20,100,2,8,8 ; vpextrb $0x8,%xmm4,0x8(%rdx,%r8,1)
+ DB 235,32 ; jmp f88 <_sk_store_a8_hsw_8bit+0xf0>
+ DB 196,163,121,20,100,2,14,14 ; vpextrb $0xe,%xmm4,0xe(%rdx,%r8,1)
+ DB 196,163,121,20,100,2,13,13 ; vpextrb $0xd,%xmm4,0xd(%rdx,%r8,1)
+ DB 196,163,121,20,100,2,12,12 ; vpextrb $0xc,%xmm4,0xc(%rdx,%r8,1)
+ DB 196,163,121,22,100,2,8,2 ; vpextrd $0x2,%xmm4,0x8(%rdx,%r8,1)
+ DB 196,161,121,214,36,2 ; vmovq %xmm4,(%rdx,%r8,1)
+ DB 233,92,255,255,255 ; jmpq eef <_sk_store_a8_hsw_8bit+0x57>
+ DB 144 ; nop
+ DB 128,255,255 ; cmp $0xff,%bh
+ DB 255,145,255,255,255,137 ; callq *-0x76000001(%rcx)
DB 255 ; (bad)
- DB 255,197 ; inc %ebp
DB 255 ; (bad)
+ DB 255,178,255,255,255,170 ; pushq -0x55000001(%rdx)
DB 255 ; (bad)
DB 255 ; (bad)
- DB 189,255,255,255,239 ; mov $0xefffffff,%ebp
+ DB 255,162,255,255,255,154 ; jmpq *-0x65000001(%rdx)
DB 255 ; (bad)
DB 255 ; (bad)
- DB 255,231 ; jmpq *%rdi
+ DB 255,244 ; push %rsp
DB 255 ; (bad)
DB 255 ; (bad)
+ DB 255,202 ; dec %edx
DB 255 ; (bad)
- DB 223,255 ; (bad)
DB 255 ; (bad)
- DB 255,215 ; callq *%rdi
+ DB 255,194 ; inc %edx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 186,255,255,255,236 ; mov $0xecffffff,%edx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,228 ; jmpq *%rsp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 220,255 ; fdivr %st,%st(7)
+ DB 255 ; (bad)
+ DB 255,212 ; callq *%rsp
DB 255 ; (bad)
DB 255 ; (bad)
DB 255 ; .byte 0xff
@@ -38962,63 +39539,91 @@ _sk_load_g8_hsw_8bit LABEL PROC
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,50 ; jne 72b <_sk_load_g8_hsw_8bit+0x4f>
- DB 196,162,121,48,4,2 ; vpmovzxbw (%rdx,%r8,1),%xmm0
- DB 197,249,219,5,9,16,0,0 ; vpand 0x1009(%rip),%xmm0,%xmm0 # 1710 <_sk_xor__hsw_8bit+0x415>
- DB 196,226,125,51,192 ; vpmovzxwd %xmm0,%ymm0
- DB 196,226,125,88,21,167,12,0,0 ; vpbroadcastd 0xca7(%rip),%ymm2 # 13bc <_sk_xor__hsw_8bit+0xc1>
- DB 196,226,125,64,194 ; vpmulld %ymm2,%ymm0,%ymm0
- DB 196,226,125,88,21,157,12,0,0 ; vpbroadcastd 0xc9d(%rip),%ymm2 # 13c0 <_sk_xor__hsw_8bit+0xc5>
- DB 197,253,235,194 ; vpor %ymm2,%ymm0,%ymm0
+ DB 117,61 ; jne 102a <_sk_load_g8_hsw_8bit+0x5a>
+ DB 196,161,122,111,4,2 ; vmovdqu (%rdx,%r8,1),%xmm0
+ DB 196,226,125,49,200 ; vpmovzxbd %xmm0,%ymm1
+ DB 197,249,112,192,78 ; vpshufd $0x4e,%xmm0,%xmm0
+ DB 196,226,125,49,192 ; vpmovzxbd %xmm0,%ymm0
+ DB 196,226,125,88,37,189,24,0,0 ; vpbroadcastd 0x18bd(%rip),%ymm4 # 28c8 <_sk_xor__hsw_8bit+0x177>
+ DB 196,226,125,64,236 ; vpmulld %ymm4,%ymm0,%ymm5
+ DB 196,226,117,64,196 ; vpmulld %ymm4,%ymm1,%ymm0
+ DB 196,226,125,88,13,174,24,0,0 ; vpbroadcastd 0x18ae(%rip),%ymm1 # 28cc <_sk_xor__hsw_8bit+0x17b>
+ DB 197,253,235,193 ; vpor %ymm1,%ymm0,%ymm0
+ DB 197,213,235,201 ; vpor %ymm1,%ymm5,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,7 ; and $0x7,%r9b
+ DB 65,128,225,15 ; and $0xf,%r9b
DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
DB 65,254,201 ; dec %r9b
- DB 65,128,249,6 ; cmp $0x6,%r9b
- DB 119,195 ; ja 6ff <_sk_load_g8_hsw_8bit+0x23>
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 119,184 ; ja ff3 <_sk_load_g8_hsw_8bit+0x23>
DB 65,15,182,193 ; movzbl %r9b,%eax
- DB 72,141,13,121,0,0,0 ; lea 0x79(%rip),%rcx # 7c0 <_sk_load_g8_hsw_8bit+0xe4>
+ DB 72,141,13,198,0,0,0 ; lea 0xc6(%rip),%rcx # 110c <_sk_load_g8_hsw_8bit+0x13c>
DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
DB 72,1,200 ; add %rcx,%rax
DB 255,224 ; jmpq *%rax
DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
DB 197,249,110,192 ; vmovd %eax,%xmm0
- DB 235,164 ; jmp 6ff <_sk_load_g8_hsw_8bit+0x23>
- DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
+ DB 235,153 ; jmp ff3 <_sk_load_g8_hsw_8bit+0x23>
DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
- DB 197,249,196,192,2 ; vpinsrw $0x2,%eax,%xmm0,%xmm0
+ DB 196,163,121,32,68,2,2,2 ; vpinsrb $0x2,0x2(%rdx,%r8,1),%xmm0,%xmm0
DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
- DB 197,249,110,208 ; vmovd %eax,%xmm2
- DB 196,226,121,48,210 ; vpmovzxbw %xmm2,%xmm2
- DB 196,227,121,2,194,1 ; vpblendd $0x1,%xmm2,%xmm0,%xmm0
- DB 233,124,255,255,255 ; jmpq 6ff <_sk_load_g8_hsw_8bit+0x23>
- DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax
+ DB 197,249,110,200 ; vmovd %eax,%xmm1
+ DB 196,227,121,14,193,1 ; vpblendw $0x1,%xmm1,%xmm0,%xmm0
+ DB 233,121,255,255,255 ; jmpq ff3 <_sk_load_g8_hsw_8bit+0x23>
DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
- DB 197,249,196,192,6 ; vpinsrw $0x6,%eax,%xmm0,%xmm0
- DB 66,15,182,68,2,5 ; movzbl 0x5(%rdx,%r8,1),%eax
- DB 197,249,196,192,5 ; vpinsrw $0x5,%eax,%xmm0,%xmm0
- DB 66,15,182,68,2,4 ; movzbl 0x4(%rdx,%r8,1),%eax
- DB 197,249,196,192,4 ; vpinsrw $0x4,%eax,%xmm0,%xmm0
- DB 196,161,121,110,20,2 ; vmovd (%rdx,%r8,1),%xmm2
- DB 196,226,121,48,210 ; vpmovzxbw %xmm2,%xmm2
- DB 196,227,105,2,192,12 ; vpblendd $0xc,%xmm0,%xmm2,%xmm0
- DB 233,65,255,255,255 ; jmpq 6ff <_sk_load_g8_hsw_8bit+0x23>
- DB 102,144 ; xchg %ax,%ax
- DB 144 ; nop
+ DB 196,163,121,32,68,2,6,6 ; vpinsrb $0x6,0x6(%rdx,%r8,1),%xmm0,%xmm0
+ DB 196,163,121,32,68,2,5,5 ; vpinsrb $0x5,0x5(%rdx,%r8,1),%xmm0,%xmm0
+ DB 196,163,121,32,68,2,4,4 ; vpinsrb $0x4,0x4(%rdx,%r8,1),%xmm0,%xmm0
+ DB 196,161,121,110,12,2 ; vmovd (%rdx,%r8,1),%xmm1
+ DB 196,227,121,2,193,1 ; vpblendd $0x1,%xmm1,%xmm0,%xmm0
+ DB 233,76,255,255,255 ; jmpq ff3 <_sk_load_g8_hsw_8bit+0x23>
+ DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
+ DB 196,163,121,32,68,2,10,10 ; vpinsrb $0xa,0xa(%rdx,%r8,1),%xmm0,%xmm0
+ DB 196,163,121,32,68,2,9,9 ; vpinsrb $0x9,0x9(%rdx,%r8,1),%xmm0,%xmm0
+ DB 196,163,121,32,68,2,8,8 ; vpinsrb $0x8,0x8(%rdx,%r8,1),%xmm0,%xmm0
+ DB 196,161,122,126,12,2 ; vmovq (%rdx,%r8,1),%xmm1
+ DB 196,227,113,2,192,12 ; vpblendd $0xc,%xmm0,%xmm1,%xmm0
+ DB 233,31,255,255,255 ; jmpq ff3 <_sk_load_g8_hsw_8bit+0x23>
+ DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
+ DB 196,163,121,32,68,2,14,14 ; vpinsrb $0xe,0xe(%rdx,%r8,1),%xmm0,%xmm0
+ DB 196,163,121,32,68,2,13,13 ; vpinsrb $0xd,0xd(%rdx,%r8,1),%xmm0,%xmm0
+ DB 196,163,121,32,68,2,12,12 ; vpinsrb $0xc,0xc(%rdx,%r8,1),%xmm0,%xmm0
+ DB 196,161,122,126,12,2 ; vmovq (%rdx,%r8,1),%xmm1
+ DB 196,163,113,34,76,2,8,2 ; vpinsrd $0x2,0x8(%rdx,%r8,1),%xmm1,%xmm1
+ DB 196,227,113,2,192,8 ; vpblendd $0x8,%xmm0,%xmm1,%xmm0
+ DB 233,234,254,255,255 ; jmpq ff3 <_sk_load_g8_hsw_8bit+0x23>
+ DB 15,31,0 ; nopl (%rax)
+ DB 67,255 ; rex.XB (bad)
DB 255 ; (bad)
+ DB 255,90,255 ; lcall *-0x1(%rdx)
DB 255 ; (bad)
- DB 255,170,255,255,255,155 ; ljmp *-0x64000001(%rdx)
+ DB 255,78,255 ; decl -0x1(%rsi)
DB 255 ; (bad)
+ DB 255,138,255,255,255,130 ; decl -0x7d000001(%rdx)
DB 255 ; (bad)
DB 255 ; (bad)
- DB 232,255,255,255,221 ; callq ffffffffde0007d0 <_sk_xor__hsw_8bit+0xffffffffddfff4d5>
DB 255 ; (bad)
+ DB 122,255 ; jp 1121 <_sk_load_g8_hsw_8bit+0x151>
DB 255 ; (bad)
- DB 255,210 ; callq *%rdx
+ DB 255,110,255 ; ljmp *-0x1(%rsi)
DB 255 ; (bad)
+ DB 255,183,255,255,255,175 ; pushq -0x50000001(%rdi)
DB 255 ; (bad)
- DB 255,195 ; inc %ebx
+ DB 255 ; (bad)
+ DB 255,167,255,255,255,155 ; jmpq *-0x64000001(%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,228 ; jmpq *%rsp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 220,255 ; fdivr %st,%st(7)
+ DB 255 ; (bad)
+ DB 255,212 ; callq *%rsp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,200 ; dec %eax
DB 255 ; (bad)
DB 255 ; (bad)
DB 255 ; .byte 0xff
@@ -39033,63 +39638,91 @@ _sk_load_g8_dst_hsw_8bit LABEL PROC
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,50 ; jne 82b <_sk_load_g8_dst_hsw_8bit+0x4f>
- DB 196,162,121,48,12,2 ; vpmovzxbw (%rdx,%r8,1),%xmm1
- DB 197,241,219,13,25,15,0,0 ; vpand 0xf19(%rip),%xmm1,%xmm1 # 1720 <_sk_xor__hsw_8bit+0x425>
- DB 196,226,125,51,201 ; vpmovzxwd %xmm1,%ymm1
- DB 196,226,125,88,21,175,11,0,0 ; vpbroadcastd 0xbaf(%rip),%ymm2 # 13c4 <_sk_xor__hsw_8bit+0xc9>
- DB 196,226,117,64,202 ; vpmulld %ymm2,%ymm1,%ymm1
- DB 196,226,125,88,21,165,11,0,0 ; vpbroadcastd 0xba5(%rip),%ymm2 # 13c8 <_sk_xor__hsw_8bit+0xcd>
- DB 197,245,235,202 ; vpor %ymm2,%ymm1,%ymm1
+ DB 117,61 ; jne 11a2 <_sk_load_g8_dst_hsw_8bit+0x5a>
+ DB 196,161,122,111,20,2 ; vmovdqu (%rdx,%r8,1),%xmm2
+ DB 196,226,125,49,218 ; vpmovzxbd %xmm2,%ymm3
+ DB 197,249,112,210,78 ; vpshufd $0x4e,%xmm2,%xmm2
+ DB 196,226,125,49,210 ; vpmovzxbd %xmm2,%ymm2
+ DB 196,226,125,88,37,77,23,0,0 ; vpbroadcastd 0x174d(%rip),%ymm4 # 28d0 <_sk_xor__hsw_8bit+0x17f>
+ DB 196,226,109,64,236 ; vpmulld %ymm4,%ymm2,%ymm5
+ DB 196,226,101,64,212 ; vpmulld %ymm4,%ymm3,%ymm2
+ DB 196,226,125,88,29,62,23,0,0 ; vpbroadcastd 0x173e(%rip),%ymm3 # 28d4 <_sk_xor__hsw_8bit+0x183>
+ DB 197,237,235,211 ; vpor %ymm3,%ymm2,%ymm2
+ DB 197,213,235,219 ; vpor %ymm3,%ymm5,%ymm3
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,7 ; and $0x7,%r9b
- DB 197,241,239,201 ; vpxor %xmm1,%xmm1,%xmm1
+ DB 65,128,225,15 ; and $0xf,%r9b
+ DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2
DB 65,254,201 ; dec %r9b
- DB 65,128,249,6 ; cmp $0x6,%r9b
- DB 119,195 ; ja 7ff <_sk_load_g8_dst_hsw_8bit+0x23>
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 119,184 ; ja 116b <_sk_load_g8_dst_hsw_8bit+0x23>
DB 65,15,182,193 ; movzbl %r9b,%eax
- DB 72,141,13,121,0,0,0 ; lea 0x79(%rip),%rcx # 8c0 <_sk_load_g8_dst_hsw_8bit+0xe4>
+ DB 72,141,13,198,0,0,0 ; lea 0xc6(%rip),%rcx # 1284 <_sk_load_g8_dst_hsw_8bit+0x13c>
DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
DB 72,1,200 ; add %rcx,%rax
DB 255,224 ; jmpq *%rax
DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
- DB 197,249,110,200 ; vmovd %eax,%xmm1
- DB 235,164 ; jmp 7ff <_sk_load_g8_dst_hsw_8bit+0x23>
- DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
- DB 197,241,239,201 ; vpxor %xmm1,%xmm1,%xmm1
- DB 197,241,196,200,2 ; vpinsrw $0x2,%eax,%xmm1,%xmm1
- DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
DB 197,249,110,208 ; vmovd %eax,%xmm2
- DB 196,226,121,48,210 ; vpmovzxbw %xmm2,%xmm2
- DB 196,227,113,2,202,1 ; vpblendd $0x1,%xmm2,%xmm1,%xmm1
- DB 233,124,255,255,255 ; jmpq 7ff <_sk_load_g8_dst_hsw_8bit+0x23>
- DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax
- DB 197,241,239,201 ; vpxor %xmm1,%xmm1,%xmm1
- DB 197,241,196,200,6 ; vpinsrw $0x6,%eax,%xmm1,%xmm1
- DB 66,15,182,68,2,5 ; movzbl 0x5(%rdx,%r8,1),%eax
- DB 197,241,196,200,5 ; vpinsrw $0x5,%eax,%xmm1,%xmm1
- DB 66,15,182,68,2,4 ; movzbl 0x4(%rdx,%r8,1),%eax
- DB 197,241,196,200,4 ; vpinsrw $0x4,%eax,%xmm1,%xmm1
- DB 196,161,121,110,20,2 ; vmovd (%rdx,%r8,1),%xmm2
- DB 196,226,121,48,210 ; vpmovzxbw %xmm2,%xmm2
- DB 196,227,105,2,201,12 ; vpblendd $0xc,%xmm1,%xmm2,%xmm1
- DB 233,65,255,255,255 ; jmpq 7ff <_sk_load_g8_dst_hsw_8bit+0x23>
- DB 102,144 ; xchg %ax,%ax
- DB 144 ; nop
+ DB 235,153 ; jmp 116b <_sk_load_g8_dst_hsw_8bit+0x23>
+ DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2
+ DB 196,163,105,32,84,2,2,2 ; vpinsrb $0x2,0x2(%rdx,%r8,1),%xmm2,%xmm2
+ DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
+ DB 197,249,110,216 ; vmovd %eax,%xmm3
+ DB 196,227,105,14,211,1 ; vpblendw $0x1,%xmm3,%xmm2,%xmm2
+ DB 233,121,255,255,255 ; jmpq 116b <_sk_load_g8_dst_hsw_8bit+0x23>
+ DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2
+ DB 196,163,105,32,84,2,6,6 ; vpinsrb $0x6,0x6(%rdx,%r8,1),%xmm2,%xmm2
+ DB 196,163,105,32,84,2,5,5 ; vpinsrb $0x5,0x5(%rdx,%r8,1),%xmm2,%xmm2
+ DB 196,163,105,32,84,2,4,4 ; vpinsrb $0x4,0x4(%rdx,%r8,1),%xmm2,%xmm2
+ DB 196,161,121,110,28,2 ; vmovd (%rdx,%r8,1),%xmm3
+ DB 196,227,105,2,211,1 ; vpblendd $0x1,%xmm3,%xmm2,%xmm2
+ DB 233,76,255,255,255 ; jmpq 116b <_sk_load_g8_dst_hsw_8bit+0x23>
+ DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2
+ DB 196,163,105,32,84,2,10,10 ; vpinsrb $0xa,0xa(%rdx,%r8,1),%xmm2,%xmm2
+ DB 196,163,105,32,84,2,9,9 ; vpinsrb $0x9,0x9(%rdx,%r8,1),%xmm2,%xmm2
+ DB 196,163,105,32,84,2,8,8 ; vpinsrb $0x8,0x8(%rdx,%r8,1),%xmm2,%xmm2
+ DB 196,161,122,126,28,2 ; vmovq (%rdx,%r8,1),%xmm3
+ DB 196,227,97,2,210,12 ; vpblendd $0xc,%xmm2,%xmm3,%xmm2
+ DB 233,31,255,255,255 ; jmpq 116b <_sk_load_g8_dst_hsw_8bit+0x23>
+ DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2
+ DB 196,163,105,32,84,2,14,14 ; vpinsrb $0xe,0xe(%rdx,%r8,1),%xmm2,%xmm2
+ DB 196,163,105,32,84,2,13,13 ; vpinsrb $0xd,0xd(%rdx,%r8,1),%xmm2,%xmm2
+ DB 196,163,105,32,84,2,12,12 ; vpinsrb $0xc,0xc(%rdx,%r8,1),%xmm2,%xmm2
+ DB 196,161,122,126,28,2 ; vmovq (%rdx,%r8,1),%xmm3
+ DB 196,163,97,34,92,2,8,2 ; vpinsrd $0x2,0x8(%rdx,%r8,1),%xmm3,%xmm3
+ DB 196,227,97,2,210,8 ; vpblendd $0x8,%xmm2,%xmm3,%xmm2
+ DB 233,234,254,255,255 ; jmpq 116b <_sk_load_g8_dst_hsw_8bit+0x23>
+ DB 15,31,0 ; nopl (%rax)
+ DB 67,255 ; rex.XB (bad)
DB 255 ; (bad)
+ DB 255,90,255 ; lcall *-0x1(%rdx)
DB 255 ; (bad)
- DB 255,170,255,255,255,155 ; ljmp *-0x64000001(%rdx)
+ DB 255,78,255 ; decl -0x1(%rsi)
DB 255 ; (bad)
+ DB 255,138,255,255,255,130 ; decl -0x7d000001(%rdx)
DB 255 ; (bad)
DB 255 ; (bad)
- DB 232,255,255,255,221 ; callq ffffffffde0008d0 <_sk_xor__hsw_8bit+0xffffffffddfff5d5>
DB 255 ; (bad)
+ DB 122,255 ; jp 1299 <_sk_load_g8_dst_hsw_8bit+0x151>
DB 255 ; (bad)
- DB 255,210 ; callq *%rdx
+ DB 255,110,255 ; ljmp *-0x1(%rsi)
DB 255 ; (bad)
+ DB 255,183,255,255,255,175 ; pushq -0x50000001(%rdi)
DB 255 ; (bad)
- DB 255,195 ; inc %ebx
+ DB 255 ; (bad)
+ DB 255,167,255,255,255,155 ; jmpq *-0x64000001(%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,228 ; jmpq *%rsp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 220,255 ; fdivr %st,%st(7)
+ DB 255 ; (bad)
+ DB 255,212 ; callq *%rsp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,200 ; dec %eax
DB 255 ; (bad)
DB 255 ; (bad)
DB 255 ; .byte 0xff
@@ -39099,78 +39732,250 @@ _sk_srcover_rgba_8888_hsw_8bit LABEL PROC
DB 76,99,15 ; movslq (%rdi),%r9
DB 76,139,71,16 ; mov 0x10(%rdi),%r8
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 72,99,80,8 ; movslq 0x8(%rax),%rdx
- DB 72,99,79,8 ; movslq 0x8(%rdi),%rcx
- DB 72,15,175,202 ; imul %rdx,%rcx
- DB 72,193,225,2 ; shl $0x2,%rcx
- DB 72,3,8 ; add (%rax),%rcx
- DB 74,141,4,137 ; lea (%rcx,%r9,4),%rax
+ DB 72,99,72,8 ; movslq 0x8(%rax),%rcx
+ DB 72,99,87,8 ; movslq 0x8(%rdi),%rdx
+ DB 72,15,175,209 ; imul %rcx,%rdx
+ DB 72,193,226,2 ; shl $0x2,%rdx
+ DB 72,3,16 ; add (%rax),%rdx
DB 77,133,192 ; test %r8,%r8
- DB 117,108 ; jne 96d <_sk_srcover_rgba_8888_hsw_8bit+0x91>
- DB 197,254,111,16 ; vmovdqu (%rax),%ymm2
- DB 196,226,125,0,29,210,11,0,0 ; vpshufb 0xbd2(%rip),%ymm0,%ymm3 # 14e0 <_sk_xor__hsw_8bit+0x1e5>
- DB 196,226,125,48,226 ; vpmovzxbw %xmm2,%ymm4
- DB 196,227,125,57,213,1 ; vextracti128 $0x1,%ymm2,%xmm5
+ DB 15,133,222,0,0,0 ; jne 13c3 <_sk_srcover_rgba_8888_hsw_8bit+0x103>
+ DB 196,33,126,111,76,138,32 ; vmovdqu 0x20(%rdx,%r9,4),%ymm9
+ DB 196,33,126,111,28,138 ; vmovdqu (%rdx,%r9,4),%ymm11
+ DB 197,253,111,53,6,23,0,0 ; vmovdqa 0x1706(%rip),%ymm6 # 2a00 <_sk_xor__hsw_8bit+0x2af>
+ DB 196,226,117,0,254 ; vpshufb %ymm6,%ymm1,%ymm7
+ DB 196,226,125,0,246 ; vpshufb %ymm6,%ymm0,%ymm6
+ DB 196,66,125,48,195 ; vpmovzxbw %xmm11,%ymm8
+ DB 196,99,125,57,220,1 ; vextracti128 $0x1,%ymm11,%xmm4
+ DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
+ DB 196,66,125,48,209 ; vpmovzxbw %xmm9,%ymm10
+ DB 196,99,125,57,205,1 ; vextracti128 $0x1,%ymm9,%xmm5
DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5
- DB 196,226,125,48,243 ; vpmovzxbw %xmm3,%ymm6
- DB 196,227,125,57,219,1 ; vextracti128 $0x1,%ymm3,%xmm3
- DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3
- DB 197,213,213,219 ; vpmullw %ymm3,%ymm5,%ymm3
+ DB 196,98,125,48,230 ; vpmovzxbw %xmm6,%ymm12
+ DB 196,227,125,57,246,1 ; vextracti128 $0x1,%ymm6,%xmm6
+ DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6
+ DB 196,98,125,48,239 ; vpmovzxbw %xmm7,%ymm13
+ DB 196,227,125,57,255,1 ; vextracti128 $0x1,%ymm7,%xmm7
+ DB 196,226,125,48,255 ; vpmovzxbw %xmm7,%ymm7
+ DB 197,213,213,255 ; vpmullw %ymm7,%ymm5,%ymm7
+ DB 196,65,45,213,237 ; vpmullw %ymm13,%ymm10,%ymm13
DB 197,221,213,246 ; vpmullw %ymm6,%ymm4,%ymm6
+ DB 196,65,61,213,228 ; vpmullw %ymm12,%ymm8,%ymm12
+ DB 196,65,29,253,192 ; vpaddw %ymm8,%ymm12,%ymm8
DB 197,205,253,228 ; vpaddw %ymm4,%ymm6,%ymm4
- DB 197,229,253,221 ; vpaddw %ymm5,%ymm3,%ymm3
- DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3
+ DB 196,193,21,253,242 ; vpaddw %ymm10,%ymm13,%ymm6
+ DB 197,197,253,237 ; vpaddw %ymm5,%ymm7,%ymm5
+ DB 197,213,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm5
+ DB 197,205,113,214,8 ; vpsrlw $0x8,%ymm6,%ymm6
DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4
- DB 196,227,93,56,235,1 ; vinserti128 $0x1,%xmm3,%ymm4,%ymm5
- DB 196,227,93,70,219,49 ; vperm2i128 $0x31,%ymm3,%ymm4,%ymm3
- DB 197,213,103,219 ; vpackuswb %ymm3,%ymm5,%ymm3
- DB 197,237,248,211 ; vpsubb %ymm3,%ymm2,%ymm2
- DB 197,237,252,208 ; vpaddb %ymm0,%ymm2,%ymm2
+ DB 196,193,69,113,208,8 ; vpsrlw $0x8,%ymm8,%ymm7
+ DB 196,99,69,56,196,1 ; vinserti128 $0x1,%xmm4,%ymm7,%ymm8
+ DB 196,227,69,70,228,49 ; vperm2i128 $0x31,%ymm4,%ymm7,%ymm4
+ DB 197,189,103,228 ; vpackuswb %ymm4,%ymm8,%ymm4
+ DB 196,227,77,56,253,1 ; vinserti128 $0x1,%xmm5,%ymm6,%ymm7
+ DB 196,227,77,70,237,49 ; vperm2i128 $0x31,%ymm5,%ymm6,%ymm5
+ DB 197,197,103,237 ; vpackuswb %ymm5,%ymm7,%ymm5
+ DB 197,181,248,237 ; vpsubb %ymm5,%ymm9,%ymm5
+ DB 197,165,248,228 ; vpsubb %ymm4,%ymm11,%ymm4
+ DB 197,221,252,224 ; vpaddb %ymm0,%ymm4,%ymm4
+ DB 197,213,252,233 ; vpaddb %ymm1,%ymm5,%ymm5
DB 77,133,192 ; test %r8,%r8
- DB 117,49 ; jne 996 <_sk_srcover_rgba_8888_hsw_8bit+0xba>
- DB 197,254,127,16 ; vmovdqu %ymm2,(%rax)
+ DB 117,72 ; jne 13fa <_sk_srcover_rgba_8888_hsw_8bit+0x13a>
+ DB 196,161,126,127,36,138 ; vmovdqu %ymm4,(%rdx,%r9,4)
+ DB 196,161,126,127,108,138,32 ; vmovdqu %ymm5,0x20(%rdx,%r9,4)
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 185,8,0,0,0 ; mov $0x8,%ecx
- DB 68,41,193 ; sub %r8d,%ecx
- DB 192,225,3 ; shl $0x3,%cl
- DB 72,199,194,255,255,255,255 ; mov $0xffffffffffffffff,%rdx
- DB 72,211,234 ; shr %cl,%rdx
- DB 196,225,249,110,210 ; vmovq %rdx,%xmm2
- DB 196,226,125,33,210 ; vpmovsxbd %xmm2,%ymm2
- DB 196,226,109,140,16 ; vpmaskmovd (%rax),%ymm2,%ymm2
- DB 233,111,255,255,255 ; jmpq 905 <_sk_srcover_rgba_8888_hsw_8bit+0x29>
- DB 185,8,0,0,0 ; mov $0x8,%ecx
- DB 68,41,193 ; sub %r8d,%ecx
- DB 192,225,3 ; shl $0x3,%cl
- DB 72,199,194,255,255,255,255 ; mov $0xffffffffffffffff,%rdx
- DB 72,211,234 ; shr %cl,%rdx
- DB 196,225,249,110,218 ; vmovq %rdx,%xmm3
- DB 196,226,125,33,219 ; vpmovsxbd %xmm3,%ymm3
- DB 196,226,101,142,16 ; vpmaskmovd %ymm2,%ymm3,(%rax)
- DB 235,173 ; jmp 969 <_sk_srcover_rgba_8888_hsw_8bit+0x8d>
+ DB 68,137,192 ; mov %r8d,%eax
+ DB 36,15 ; and $0xf,%al
+ DB 196,65,53,239,201 ; vpxor %ymm9,%ymm9,%ymm9
+ DB 196,65,37,239,219 ; vpxor %ymm11,%ymm11,%ymm11
+ DB 254,200 ; dec %al
+ DB 60,14 ; cmp $0xe,%al
+ DB 15,135,22,255,255,255 ; ja 12f2 <_sk_srcover_rgba_8888_hsw_8bit+0x32>
+ DB 15,182,192 ; movzbl %al,%eax
+ DB 72,141,13,234,1,0,0 ; lea 0x1ea(%rip),%rcx # 15d0 <_sk_srcover_rgba_8888_hsw_8bit+0x310>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 196,33,121,110,28,138 ; vmovd (%rdx,%r9,4),%xmm11
+ DB 233,248,254,255,255 ; jmpq 12f2 <_sk_srcover_rgba_8888_hsw_8bit+0x32>
+ DB 65,128,224,15 ; and $0xf,%r8b
+ DB 65,254,200 ; dec %r8b
+ DB 65,128,248,14 ; cmp $0xe,%r8b
+ DB 119,184 ; ja 13bf <_sk_srcover_rgba_8888_hsw_8bit+0xff>
+ DB 65,15,182,192 ; movzbl %r8b,%eax
+ DB 72,141,13,250,1,0,0 ; lea 0x1fa(%rip),%rcx # 160c <_sk_srcover_rgba_8888_hsw_8bit+0x34c>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 196,161,121,126,36,138 ; vmovd %xmm4,(%rdx,%r9,4)
+ DB 235,156 ; jmp 13bf <_sk_srcover_rgba_8888_hsw_8bit+0xff>
+ DB 196,161,121,110,100,138,8 ; vmovd 0x8(%rdx,%r9,4),%xmm4
+ DB 196,226,121,89,236 ; vpbroadcastq %xmm4,%xmm5
+ DB 196,65,53,239,201 ; vpxor %ymm9,%ymm9,%ymm9
+ DB 196,99,53,2,221,4 ; vpblendd $0x4,%ymm5,%ymm9,%ymm11
+ DB 196,162,121,53,52,138 ; vpmovzxdq (%rdx,%r9,4),%xmm6
+ DB 197,249,112,246,232 ; vpshufd $0xe8,%xmm6,%xmm6
+ DB 196,99,37,2,222,3 ; vpblendd $0x3,%ymm6,%ymm11,%ymm11
+ DB 233,162,254,255,255 ; jmpq 12f2 <_sk_srcover_rgba_8888_hsw_8bit+0x32>
+ DB 196,161,121,110,100,138,24 ; vmovd 0x18(%rdx,%r9,4),%xmm4
+ DB 196,226,125,89,236 ; vpbroadcastq %xmm4,%ymm5
+ DB 196,65,53,239,201 ; vpxor %ymm9,%ymm9,%ymm9
+ DB 196,99,53,2,221,64 ; vpblendd $0x40,%ymm5,%ymm9,%ymm11
+ DB 196,99,125,57,222,1 ; vextracti128 $0x1,%ymm11,%xmm6
+ DB 196,163,73,34,116,138,20,1 ; vpinsrd $0x1,0x14(%rdx,%r9,4),%xmm6,%xmm6
+ DB 196,99,37,56,222,1 ; vinserti128 $0x1,%xmm6,%ymm11,%ymm11
+ DB 196,99,125,57,222,1 ; vextracti128 $0x1,%ymm11,%xmm6
+ DB 196,163,73,34,116,138,16,0 ; vpinsrd $0x0,0x10(%rdx,%r9,4),%xmm6,%xmm6
+ DB 196,99,37,56,222,1 ; vinserti128 $0x1,%xmm6,%ymm11,%ymm11
+ DB 196,161,122,111,52,138 ; vmovdqu (%rdx,%r9,4),%xmm6
+ DB 196,67,77,2,219,240 ; vpblendd $0xf0,%ymm11,%ymm6,%ymm11
+ DB 233,82,254,255,255 ; jmpq 12f2 <_sk_srcover_rgba_8888_hsw_8bit+0x32>
+ DB 196,161,121,110,100,138,40 ; vmovd 0x28(%rdx,%r9,4),%xmm4
+ DB 196,226,121,89,228 ; vpbroadcastq %xmm4,%xmm4
+ DB 197,213,239,237 ; vpxor %ymm5,%ymm5,%ymm5
+ DB 196,99,85,2,204,4 ; vpblendd $0x4,%ymm4,%ymm5,%ymm9
+ DB 196,163,49,34,108,138,36,1 ; vpinsrd $0x1,0x24(%rdx,%r9,4),%xmm9,%xmm5
+ DB 196,99,53,2,205,15 ; vpblendd $0xf,%ymm5,%ymm9,%ymm9
+ DB 196,161,121,110,108,138,32 ; vmovd 0x20(%rdx,%r9,4),%xmm5
+ DB 196,99,53,2,205,1 ; vpblendd $0x1,%ymm5,%ymm9,%ymm9
+ DB 233,22,254,255,255 ; jmpq 12ec <_sk_srcover_rgba_8888_hsw_8bit+0x2c>
+ DB 196,161,121,110,100,138,56 ; vmovd 0x38(%rdx,%r9,4),%xmm4
+ DB 196,226,125,89,228 ; vpbroadcastq %xmm4,%ymm4
+ DB 197,213,239,237 ; vpxor %ymm5,%ymm5,%ymm5
+ DB 196,99,85,2,204,64 ; vpblendd $0x40,%ymm4,%ymm5,%ymm9
+ DB 196,99,125,57,205,1 ; vextracti128 $0x1,%ymm9,%xmm5
+ DB 196,163,81,34,108,138,52,1 ; vpinsrd $0x1,0x34(%rdx,%r9,4),%xmm5,%xmm5
+ DB 196,99,53,56,205,1 ; vinserti128 $0x1,%xmm5,%ymm9,%ymm9
+ DB 196,99,125,57,205,1 ; vextracti128 $0x1,%ymm9,%xmm5
+ DB 196,163,81,34,108,138,48,0 ; vpinsrd $0x0,0x30(%rdx,%r9,4),%xmm5,%xmm5
+ DB 196,99,53,56,205,1 ; vinserti128 $0x1,%xmm5,%ymm9,%ymm9
+ DB 196,33,126,111,28,138 ; vmovdqu (%rdx,%r9,4),%ymm11
+ DB 196,161,122,111,116,138,32 ; vmovdqu 0x20(%rdx,%r9,4),%xmm6
+ DB 196,67,77,2,201,240 ; vpblendd $0xf0,%ymm9,%ymm6,%ymm9
+ DB 233,198,253,255,255 ; jmpq 12f2 <_sk_srcover_rgba_8888_hsw_8bit+0x32>
+ DB 196,163,121,22,100,138,8,2 ; vpextrd $0x2,%xmm4,0x8(%rdx,%r9,4)
+ DB 196,161,121,214,36,138 ; vmovq %xmm4,(%rdx,%r9,4)
+ DB 233,128,254,255,255 ; jmpq 13bf <_sk_srcover_rgba_8888_hsw_8bit+0xff>
+ DB 196,227,125,57,229,1 ; vextracti128 $0x1,%ymm4,%xmm5
+ DB 196,163,121,22,108,138,24,2 ; vpextrd $0x2,%xmm5,0x18(%rdx,%r9,4)
+ DB 196,227,125,57,229,1 ; vextracti128 $0x1,%ymm4,%xmm5
+ DB 196,163,121,22,108,138,20,1 ; vpextrd $0x1,%xmm5,0x14(%rdx,%r9,4)
+ DB 196,227,125,57,229,1 ; vextracti128 $0x1,%ymm4,%xmm5
+ DB 196,161,121,126,108,138,16 ; vmovd %xmm5,0x10(%rdx,%r9,4)
+ DB 196,161,122,127,36,138 ; vmovdqu %xmm4,(%rdx,%r9,4)
+ DB 233,76,254,255,255 ; jmpq 13bf <_sk_srcover_rgba_8888_hsw_8bit+0xff>
+ DB 196,163,121,22,108,138,40,2 ; vpextrd $0x2,%xmm5,0x28(%rdx,%r9,4)
+ DB 196,163,121,22,108,138,36,1 ; vpextrd $0x1,%xmm5,0x24(%rdx,%r9,4)
+ DB 196,161,121,126,108,138,32 ; vmovd %xmm5,0x20(%rdx,%r9,4)
+ DB 196,161,126,127,36,138 ; vmovdqu %ymm4,(%rdx,%r9,4)
+ DB 233,42,254,255,255 ; jmpq 13bf <_sk_srcover_rgba_8888_hsw_8bit+0xff>
+ DB 196,227,125,57,238,1 ; vextracti128 $0x1,%ymm5,%xmm6
+ DB 196,163,121,22,116,138,56,2 ; vpextrd $0x2,%xmm6,0x38(%rdx,%r9,4)
+ DB 196,227,125,57,238,1 ; vextracti128 $0x1,%ymm5,%xmm6
+ DB 196,163,121,22,116,138,52,1 ; vpextrd $0x1,%xmm6,0x34(%rdx,%r9,4)
+ DB 196,227,125,57,238,1 ; vextracti128 $0x1,%ymm5,%xmm6
+ DB 196,161,121,126,116,138,48 ; vmovd %xmm6,0x30(%rdx,%r9,4)
+ DB 196,161,126,127,36,138 ; vmovdqu %ymm4,(%rdx,%r9,4)
+ DB 196,161,122,127,108,138,32 ; vmovdqu %xmm5,0x20(%rdx,%r9,4)
+ DB 233,239,253,255,255 ; jmpq 13bf <_sk_srcover_rgba_8888_hsw_8bit+0xff>
+ DB 31 ; (bad)
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,106,254 ; ljmp *-0x2(%rdx)
+ DB 255 ; (bad)
+ DB 255,83,254 ; callq *-0x2(%rbx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 191,254,255,255,171 ; mov $0xabfffffe,%edi
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,151,254,255,255,128 ; callq *-0x7f000002(%rdi)
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,28,253,255,255,244,254 ; lcall *-0x10b0001(,%rdi,8)
+ DB 255 ; (bad)
+ DB 255,230 ; jmpq *%rsi
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,208 ; callq *%rax
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,68,255,255 ; incl -0x1(%rdi,%rdi,8)
+ DB 255,48 ; pushq (%rax)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,28,255 ; lcall *(%rdi,%rdi,8)
+ DB 255 ; (bad)
+ DB 255,6 ; incl (%rsi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,15 ; decl (%rdi)
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,40 ; ljmp *(%rax)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,32 ; jmpq *(%rax)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,92,255,255 ; lcall *-0x1(%rdi,%rdi,8)
+ DB 255,79,255 ; decl -0x1(%rdi)
+ DB 255 ; (bad)
+ DB 255,65,255 ; incl -0x1(%rcx)
+ DB 255 ; (bad)
+ DB 255,51 ; pushq (%rbx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 126,255 ; jle 1629 <_sk_srcover_rgba_8888_hsw_8bit+0x369>
+ DB 255 ; (bad)
+ DB 255,119,255 ; pushq -0x1(%rdi)
+ DB 255 ; (bad)
+ DB 255,111,255 ; ljmp *-0x1(%rdi)
+ DB 255 ; (bad)
+ DB 255,103,255 ; jmpq *-0x1(%rdi)
+ DB 255 ; (bad)
+ DB 255,178,255,255,255,165 ; pushq -0x5a000001(%rdx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,151,255,255,255,137 ; callq *-0x76000001(%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_scale_1_float_hsw_8bit
_sk_scale_1_float_hsw_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 197,250,16,16 ; vmovss (%rax),%xmm2
- DB 197,234,89,21,2,10,0,0 ; vmulss 0xa02(%rip),%xmm2,%xmm2 # 13cc <_sk_xor__hsw_8bit+0xd1>
- DB 197,250,44,194 ; vcvttss2si %xmm2,%eax
- DB 197,249,110,208 ; vmovd %eax,%xmm2
- DB 196,226,125,120,210 ; vpbroadcastb %xmm2,%ymm2
- DB 196,226,125,48,216 ; vpmovzxbw %xmm0,%ymm3
+ DB 197,250,16,32 ; vmovss (%rax),%xmm4
+ DB 197,218,89,37,130,18,0,0 ; vmulss 0x1282(%rip),%xmm4,%xmm4 # 28d8 <_sk_xor__hsw_8bit+0x187>
+ DB 197,250,44,196 ; vcvttss2si %xmm4,%eax
+ DB 197,249,110,224 ; vmovd %eax,%xmm4
+ DB 196,226,125,120,228 ; vpbroadcastb %xmm4,%ymm4
+ DB 196,226,125,48,232 ; vpmovzxbw %xmm0,%ymm5
DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0
DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0
- DB 197,237,219,21,17,11,0,0 ; vpand 0xb11(%rip),%ymm2,%ymm2 # 1500 <_sk_xor__hsw_8bit+0x205>
- DB 197,237,213,224 ; vpmullw %ymm0,%ymm2,%ymm4
- DB 197,237,213,211 ; vpmullw %ymm3,%ymm2,%ymm2
- DB 197,237,253,211 ; vpaddw %ymm3,%ymm2,%ymm2
- DB 197,221,253,192 ; vpaddw %ymm0,%ymm4,%ymm0
+ DB 196,226,125,48,241 ; vpmovzxbw %xmm1,%ymm6
+ DB 196,227,125,57,201,1 ; vextracti128 $0x1,%ymm1,%xmm1
+ DB 196,226,125,48,201 ; vpmovzxbw %xmm1,%ymm1
+ DB 197,221,219,37,149,19,0,0 ; vpand 0x1395(%rip),%ymm4,%ymm4 # 2a20 <_sk_xor__hsw_8bit+0x2cf>
+ DB 197,221,213,249 ; vpmullw %ymm1,%ymm4,%ymm7
+ DB 197,93,213,198 ; vpmullw %ymm6,%ymm4,%ymm8
+ DB 197,93,213,200 ; vpmullw %ymm0,%ymm4,%ymm9
+ DB 197,221,213,229 ; vpmullw %ymm5,%ymm4,%ymm4
+ DB 197,221,253,229 ; vpaddw %ymm5,%ymm4,%ymm4
+ DB 197,181,253,192 ; vpaddw %ymm0,%ymm9,%ymm0
+ DB 197,189,253,238 ; vpaddw %ymm6,%ymm8,%ymm5
+ DB 197,197,253,201 ; vpaddw %ymm1,%ymm7,%ymm1
+ DB 197,245,113,209,8 ; vpsrlw $0x8,%ymm1,%ymm1
+ DB 197,213,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm5
DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0
- DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2
- DB 196,227,109,56,216,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm3
- DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- DB 197,229,103,192 ; vpackuswb %ymm0,%ymm3,%ymm0
+ DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4
+ DB 196,227,93,56,240,1 ; vinserti128 $0x1,%xmm0,%ymm4,%ymm6
+ DB 196,227,93,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm4,%ymm0
+ DB 197,205,103,192 ; vpackuswb %ymm0,%ymm6,%ymm0
+ DB 196,227,85,56,225,1 ; vinserti128 $0x1,%xmm1,%ymm5,%ymm4
+ DB 196,227,85,70,201,49 ; vperm2i128 $0x31,%ymm1,%ymm5,%ymm1
+ DB 197,221,103,201 ; vpackuswb %ymm1,%ymm4,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
@@ -39184,75 +39989,118 @@ _sk_scale_u8_hsw_8bit LABEL PROC
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,106 ; jne aa4 <_sk_scale_u8_hsw_8bit+0x87>
- DB 196,162,121,48,20,2 ; vpmovzxbw (%rdx,%r8,1),%xmm2
- DB 197,233,219,21,232,12,0,0 ; vpand 0xce8(%rip),%xmm2,%xmm2 # 1730 <_sk_xor__hsw_8bit+0x435>
- DB 196,226,125,51,210 ; vpmovzxwd %xmm2,%ymm2
- DB 196,226,109,0,21,202,10,0,0 ; vpshufb 0xaca(%rip),%ymm2,%ymm2 # 1520 <_sk_xor__hsw_8bit+0x225>
- DB 196,226,125,48,216 ; vpmovzxbw %xmm0,%ymm3
+ DB 15,133,191,0,0,0 ; jne 17c3 <_sk_scale_u8_hsw_8bit+0xe0>
+ DB 196,161,122,111,36,2 ; vmovdqu (%rdx,%r8,1),%xmm4
+ DB 196,226,125,49,236 ; vpmovzxbd %xmm4,%ymm5
+ DB 197,249,112,228,78 ; vpshufd $0x4e,%xmm4,%xmm4
+ DB 196,226,125,49,228 ; vpmovzxbd %xmm4,%ymm4
+ DB 197,253,111,53,31,19,0,0 ; vmovdqa 0x131f(%rip),%ymm6 # 2a40 <_sk_xor__hsw_8bit+0x2ef>
+ DB 196,226,93,0,230 ; vpshufb %ymm6,%ymm4,%ymm4
+ DB 196,226,85,0,238 ; vpshufb %ymm6,%ymm5,%ymm5
+ DB 196,226,125,48,240 ; vpmovzxbw %xmm0,%ymm6
DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0
DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0
- DB 196,226,125,48,226 ; vpmovzxbw %xmm2,%ymm4
- DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2
- DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2
- DB 197,237,213,208 ; vpmullw %ymm0,%ymm2,%ymm2
- DB 197,221,213,227 ; vpmullw %ymm3,%ymm4,%ymm4
- DB 197,221,253,219 ; vpaddw %ymm3,%ymm4,%ymm3
- DB 197,237,253,192 ; vpaddw %ymm0,%ymm2,%ymm0
+ DB 196,226,125,48,249 ; vpmovzxbw %xmm1,%ymm7
+ DB 196,227,125,57,201,1 ; vextracti128 $0x1,%ymm1,%xmm1
+ DB 196,226,125,48,201 ; vpmovzxbw %xmm1,%ymm1
+ DB 196,98,125,48,197 ; vpmovzxbw %xmm5,%ymm8
+ DB 196,227,125,57,237,1 ; vextracti128 $0x1,%ymm5,%xmm5
+ DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5
+ DB 196,98,125,48,204 ; vpmovzxbw %xmm4,%ymm9
+ DB 196,227,125,57,228,1 ; vextracti128 $0x1,%ymm4,%xmm4
+ DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
+ DB 197,221,213,225 ; vpmullw %ymm1,%ymm4,%ymm4
+ DB 197,53,213,207 ; vpmullw %ymm7,%ymm9,%ymm9
+ DB 197,213,213,232 ; vpmullw %ymm0,%ymm5,%ymm5
+ DB 197,61,213,198 ; vpmullw %ymm6,%ymm8,%ymm8
+ DB 197,189,253,246 ; vpaddw %ymm6,%ymm8,%ymm6
+ DB 197,213,253,192 ; vpaddw %ymm0,%ymm5,%ymm0
+ DB 197,181,253,239 ; vpaddw %ymm7,%ymm9,%ymm5
+ DB 197,221,253,201 ; vpaddw %ymm1,%ymm4,%ymm1
+ DB 197,245,113,209,8 ; vpsrlw $0x8,%ymm1,%ymm1
+ DB 197,221,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm4
DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0
- DB 197,237,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm2
- DB 196,227,109,56,216,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm3
- DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- DB 197,229,103,192 ; vpackuswb %ymm0,%ymm3,%ymm0
+ DB 197,213,113,214,8 ; vpsrlw $0x8,%ymm6,%ymm5
+ DB 196,227,85,56,240,1 ; vinserti128 $0x1,%xmm0,%ymm5,%ymm6
+ DB 196,227,85,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm5,%ymm0
+ DB 197,205,103,192 ; vpackuswb %ymm0,%ymm6,%ymm0
+ DB 196,227,93,56,233,1 ; vinserti128 $0x1,%xmm1,%ymm4,%ymm5
+ DB 196,227,93,70,201,49 ; vperm2i128 $0x31,%ymm1,%ymm4,%ymm1
+ DB 197,213,103,201 ; vpackuswb %ymm1,%ymm5,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,7 ; and $0x7,%r9b
- DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2
+ DB 65,128,225,15 ; and $0xf,%r9b
+ DB 197,217,239,228 ; vpxor %xmm4,%xmm4,%xmm4
DB 65,254,201 ; dec %r9b
- DB 65,128,249,6 ; cmp $0x6,%r9b
- DB 119,139 ; ja a40 <_sk_scale_u8_hsw_8bit+0x23>
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 15,135,50,255,255,255 ; ja 170a <_sk_scale_u8_hsw_8bit+0x27>
DB 65,15,182,193 ; movzbl %r9b,%eax
- DB 72,141,13,124,0,0,0 ; lea 0x7c(%rip),%rcx # b3c <_sk_scale_u8_hsw_8bit+0x11f>
+ DB 72,141,13,201,0,0,0 ; lea 0xc9(%rip),%rcx # 18ac <_sk_scale_u8_hsw_8bit+0x1c9>
DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
DB 72,1,200 ; add %rcx,%rax
DB 255,224 ; jmpq *%rax
DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
- DB 197,249,110,208 ; vmovd %eax,%xmm2
- DB 233,105,255,255,255 ; jmpq a40 <_sk_scale_u8_hsw_8bit+0x23>
- DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
- DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2
- DB 197,233,196,208,2 ; vpinsrw $0x2,%eax,%xmm2,%xmm2
+ DB 197,249,110,224 ; vmovd %eax,%xmm4
+ DB 233,16,255,255,255 ; jmpq 170a <_sk_scale_u8_hsw_8bit+0x27>
+ DB 197,217,239,228 ; vpxor %xmm4,%xmm4,%xmm4
+ DB 196,163,89,32,100,2,2,2 ; vpinsrb $0x2,0x2(%rdx,%r8,1),%xmm4,%xmm4
DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
- DB 197,249,110,216 ; vmovd %eax,%xmm3
- DB 196,226,121,48,219 ; vpmovzxbw %xmm3,%xmm3
- DB 196,227,105,2,211,1 ; vpblendd $0x1,%xmm3,%xmm2,%xmm2
- DB 233,65,255,255,255 ; jmpq a40 <_sk_scale_u8_hsw_8bit+0x23>
- DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax
- DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2
- DB 197,233,196,208,6 ; vpinsrw $0x6,%eax,%xmm2,%xmm2
- DB 66,15,182,68,2,5 ; movzbl 0x5(%rdx,%r8,1),%eax
- DB 197,233,196,208,5 ; vpinsrw $0x5,%eax,%xmm2,%xmm2
- DB 66,15,182,68,2,4 ; movzbl 0x4(%rdx,%r8,1),%eax
- DB 197,233,196,208,4 ; vpinsrw $0x4,%eax,%xmm2,%xmm2
- DB 196,161,121,110,28,2 ; vmovd (%rdx,%r8,1),%xmm3
- DB 196,226,121,48,219 ; vpmovzxbw %xmm3,%xmm3
- DB 196,227,97,2,210,12 ; vpblendd $0xc,%xmm2,%xmm3,%xmm2
- DB 233,6,255,255,255 ; jmpq a40 <_sk_scale_u8_hsw_8bit+0x23>
- DB 102,144 ; xchg %ax,%ax
- DB 141 ; (bad)
+ DB 197,249,110,232 ; vmovd %eax,%xmm5
+ DB 196,227,89,14,229,1 ; vpblendw $0x1,%xmm5,%xmm4,%xmm4
+ DB 233,240,254,255,255 ; jmpq 170a <_sk_scale_u8_hsw_8bit+0x27>
+ DB 197,217,239,228 ; vpxor %xmm4,%xmm4,%xmm4
+ DB 196,163,89,32,100,2,6,6 ; vpinsrb $0x6,0x6(%rdx,%r8,1),%xmm4,%xmm4
+ DB 196,163,89,32,100,2,5,5 ; vpinsrb $0x5,0x5(%rdx,%r8,1),%xmm4,%xmm4
+ DB 196,163,89,32,100,2,4,4 ; vpinsrb $0x4,0x4(%rdx,%r8,1),%xmm4,%xmm4
+ DB 196,161,121,110,44,2 ; vmovd (%rdx,%r8,1),%xmm5
+ DB 196,227,89,2,229,1 ; vpblendd $0x1,%xmm5,%xmm4,%xmm4
+ DB 233,195,254,255,255 ; jmpq 170a <_sk_scale_u8_hsw_8bit+0x27>
+ DB 197,217,239,228 ; vpxor %xmm4,%xmm4,%xmm4
+ DB 196,163,89,32,100,2,10,10 ; vpinsrb $0xa,0xa(%rdx,%r8,1),%xmm4,%xmm4
+ DB 196,163,89,32,100,2,9,9 ; vpinsrb $0x9,0x9(%rdx,%r8,1),%xmm4,%xmm4
+ DB 196,163,89,32,100,2,8,8 ; vpinsrb $0x8,0x8(%rdx,%r8,1),%xmm4,%xmm4
+ DB 196,161,122,126,44,2 ; vmovq (%rdx,%r8,1),%xmm5
+ DB 196,227,81,2,228,12 ; vpblendd $0xc,%xmm4,%xmm5,%xmm4
+ DB 233,150,254,255,255 ; jmpq 170a <_sk_scale_u8_hsw_8bit+0x27>
+ DB 197,217,239,228 ; vpxor %xmm4,%xmm4,%xmm4
+ DB 196,163,89,32,100,2,14,14 ; vpinsrb $0xe,0xe(%rdx,%r8,1),%xmm4,%xmm4
+ DB 196,163,89,32,100,2,13,13 ; vpinsrb $0xd,0xd(%rdx,%r8,1),%xmm4,%xmm4
+ DB 196,163,89,32,100,2,12,12 ; vpinsrb $0xc,0xc(%rdx,%r8,1),%xmm4,%xmm4
+ DB 196,161,122,126,44,2 ; vmovq (%rdx,%r8,1),%xmm5
+ DB 196,163,81,34,108,2,8,2 ; vpinsrd $0x2,0x8(%rdx,%r8,1),%xmm5,%xmm5
+ DB 196,227,81,2,228,8 ; vpblendd $0x8,%xmm4,%xmm5,%xmm4
+ DB 233,97,254,255,255 ; jmpq 170a <_sk_scale_u8_hsw_8bit+0x27>
+ DB 15,31,0 ; nopl (%rax)
+ DB 64,255 ; rex (bad)
DB 255 ; (bad)
+ DB 255,90,255 ; lcall *-0x1(%rdx)
DB 255 ; (bad)
- DB 255,170,255,255,255,155 ; ljmp *-0x64000001(%rdx)
+ DB 255,78,255 ; decl -0x1(%rsi)
DB 255 ; (bad)
+ DB 255,138,255,255,255,130 ; decl -0x7d000001(%rdx)
DB 255 ; (bad)
DB 255 ; (bad)
- DB 232,255,255,255,221 ; callq ffffffffde000b4c <_sk_xor__hsw_8bit+0xffffffffddfff851>
DB 255 ; (bad)
+ DB 122,255 ; jp 18c1 <_sk_scale_u8_hsw_8bit+0x1de>
DB 255 ; (bad)
- DB 255,210 ; callq *%rdx
+ DB 255,110,255 ; ljmp *-0x1(%rsi)
DB 255 ; (bad)
+ DB 255,183,255,255,255,175 ; pushq -0x50000001(%rdi)
DB 255 ; (bad)
- DB 255,195 ; inc %ebx
+ DB 255 ; (bad)
+ DB 255,167,255,255,255,155 ; jmpq *-0x64000001(%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,228 ; jmpq *%rsp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 220,255 ; fdivr %st,%st(7)
+ DB 255 ; (bad)
+ DB 255,212 ; callq *%rsp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,200 ; dec %eax
DB 255 ; (bad)
DB 255 ; (bad)
DB 255 ; .byte 0xff
@@ -39260,42 +40108,67 @@ _sk_scale_u8_hsw_8bit LABEL PROC
PUBLIC _sk_lerp_1_float_hsw_8bit
_sk_lerp_1_float_hsw_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 197,250,16,16 ; vmovss (%rax),%xmm2
- DB 197,234,89,21,106,8,0,0 ; vmulss 0x86a(%rip),%xmm2,%xmm2 # 13d0 <_sk_xor__hsw_8bit+0xd5>
- DB 197,250,44,194 ; vcvttss2si %xmm2,%eax
- DB 197,249,110,208 ; vmovd %eax,%xmm2
- DB 196,226,125,120,210 ; vpbroadcastb %xmm2,%ymm2
- DB 196,226,125,48,216 ; vpmovzxbw %xmm0,%ymm3
+ DB 197,250,16,32 ; vmovss (%rax),%xmm4
+ DB 197,218,89,37,230,15,0,0 ; vmulss 0xfe6(%rip),%xmm4,%xmm4 # 28dc <_sk_xor__hsw_8bit+0x18b>
+ DB 197,250,44,196 ; vcvttss2si %xmm4,%eax
+ DB 197,249,110,224 ; vmovd %eax,%xmm4
+ DB 196,226,125,120,228 ; vpbroadcastb %xmm4,%ymm4
+ DB 196,226,125,48,232 ; vpmovzxbw %xmm0,%ymm5
DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0
DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0
- DB 197,237,219,37,181,9,0,0 ; vpand 0x9b5(%rip),%ymm2,%ymm4 # 1540 <_sk_xor__hsw_8bit+0x245>
- DB 197,221,213,232 ; vpmullw %ymm0,%ymm4,%ymm5
- DB 197,221,213,227 ; vpmullw %ymm3,%ymm4,%ymm4
- DB 197,221,253,219 ; vpaddw %ymm3,%ymm4,%ymm3
- DB 197,213,253,192 ; vpaddw %ymm0,%ymm5,%ymm0
+ DB 196,226,125,48,241 ; vpmovzxbw %xmm1,%ymm6
+ DB 196,227,125,57,201,1 ; vextracti128 $0x1,%ymm1,%xmm1
+ DB 196,226,125,48,201 ; vpmovzxbw %xmm1,%ymm1
+ DB 197,221,219,61,53,17,0,0 ; vpand 0x1135(%rip),%ymm4,%ymm7 # 2a60 <_sk_xor__hsw_8bit+0x30f>
+ DB 197,69,213,193 ; vpmullw %ymm1,%ymm7,%ymm8
+ DB 197,69,213,206 ; vpmullw %ymm6,%ymm7,%ymm9
+ DB 197,69,213,208 ; vpmullw %ymm0,%ymm7,%ymm10
+ DB 197,197,213,253 ; vpmullw %ymm5,%ymm7,%ymm7
+ DB 197,197,253,237 ; vpaddw %ymm5,%ymm7,%ymm5
+ DB 197,173,253,192 ; vpaddw %ymm0,%ymm10,%ymm0
+ DB 197,181,253,246 ; vpaddw %ymm6,%ymm9,%ymm6
+ DB 197,189,253,201 ; vpaddw %ymm1,%ymm8,%ymm1
+ DB 197,245,113,209,8 ; vpsrlw $0x8,%ymm1,%ymm1
+ DB 197,205,113,214,8 ; vpsrlw $0x8,%ymm6,%ymm6
DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0
- DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3
- DB 196,227,101,56,224,1 ; vinserti128 $0x1,%xmm0,%ymm3,%ymm4
- DB 196,227,101,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm3,%ymm0
- DB 197,221,103,192 ; vpackuswb %ymm0,%ymm4,%ymm0
- DB 197,229,118,219 ; vpcmpeqd %ymm3,%ymm3,%ymm3
- DB 197,237,239,211 ; vpxor %ymm3,%ymm2,%ymm2
- DB 196,226,125,48,217 ; vpmovzxbw %xmm1,%ymm3
- DB 196,227,125,57,204,1 ; vextracti128 $0x1,%ymm1,%xmm4
- DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
+ DB 197,213,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm5
+ DB 196,227,85,56,248,1 ; vinserti128 $0x1,%xmm0,%ymm5,%ymm7
+ DB 196,227,85,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm5,%ymm0
+ DB 197,69,103,192 ; vpackuswb %ymm0,%ymm7,%ymm8
+ DB 196,227,77,56,233,1 ; vinserti128 $0x1,%xmm1,%ymm6,%ymm5
+ DB 196,227,77,70,201,49 ; vperm2i128 $0x31,%ymm1,%ymm6,%ymm1
+ DB 197,213,103,201 ; vpackuswb %ymm1,%ymm5,%ymm1
+ DB 197,213,118,237 ; vpcmpeqd %ymm5,%ymm5,%ymm5
+ DB 197,221,239,229 ; vpxor %ymm5,%ymm4,%ymm4
DB 196,226,125,48,234 ; vpmovzxbw %xmm2,%ymm5
- DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2
- DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2
- DB 197,237,213,212 ; vpmullw %ymm4,%ymm2,%ymm2
- DB 197,213,213,235 ; vpmullw %ymm3,%ymm5,%ymm5
- DB 197,213,253,219 ; vpaddw %ymm3,%ymm5,%ymm3
- DB 197,237,253,212 ; vpaddw %ymm4,%ymm2,%ymm2
- DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2
- DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3
- DB 196,227,101,56,226,1 ; vinserti128 $0x1,%xmm2,%ymm3,%ymm4
- DB 196,227,101,70,210,49 ; vperm2i128 $0x31,%ymm2,%ymm3,%ymm2
- DB 197,221,103,210 ; vpackuswb %ymm2,%ymm4,%ymm2
- DB 197,237,252,192 ; vpaddb %ymm0,%ymm2,%ymm0
+ DB 196,227,125,57,214,1 ; vextracti128 $0x1,%ymm2,%xmm6
+ DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6
+ DB 196,226,125,48,251 ; vpmovzxbw %xmm3,%ymm7
+ DB 196,227,125,57,216,1 ; vextracti128 $0x1,%ymm3,%xmm0
+ DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0
+ DB 196,98,125,48,204 ; vpmovzxbw %xmm4,%ymm9
+ DB 196,227,125,57,228,1 ; vextracti128 $0x1,%ymm4,%xmm4
+ DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
+ DB 197,93,213,208 ; vpmullw %ymm0,%ymm4,%ymm10
+ DB 197,53,213,223 ; vpmullw %ymm7,%ymm9,%ymm11
+ DB 197,221,213,230 ; vpmullw %ymm6,%ymm4,%ymm4
+ DB 197,53,213,205 ; vpmullw %ymm5,%ymm9,%ymm9
+ DB 197,181,253,237 ; vpaddw %ymm5,%ymm9,%ymm5
+ DB 197,221,253,230 ; vpaddw %ymm6,%ymm4,%ymm4
+ DB 197,165,253,247 ; vpaddw %ymm7,%ymm11,%ymm6
+ DB 197,173,253,192 ; vpaddw %ymm0,%ymm10,%ymm0
+ DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0
+ DB 197,205,113,214,8 ; vpsrlw $0x8,%ymm6,%ymm6
+ DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4
+ DB 197,213,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm5
+ DB 196,227,85,56,252,1 ; vinserti128 $0x1,%xmm4,%ymm5,%ymm7
+ DB 196,227,85,70,228,49 ; vperm2i128 $0x31,%ymm4,%ymm5,%ymm4
+ DB 197,197,103,228 ; vpackuswb %ymm4,%ymm7,%ymm4
+ DB 196,227,77,56,232,1 ; vinserti128 $0x1,%xmm0,%ymm6,%ymm5
+ DB 196,227,77,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm6,%ymm0
+ DB 197,213,103,232 ; vpackuswb %ymm0,%ymm5,%ymm5
+ DB 196,193,93,252,192 ; vpaddb %ymm8,%ymm4,%ymm0
+ DB 197,213,252,201 ; vpaddb %ymm1,%ymm5,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
@@ -39309,93 +40182,153 @@ _sk_lerp_u8_hsw_8bit LABEL PROC
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 15,133,192,0,0,0 ; jne cf0 <_sk_lerp_u8_hsw_8bit+0xe1>
- DB 196,162,121,48,20,2 ; vpmovzxbw (%rdx,%r8,1),%xmm2
- DB 197,233,219,21,2,11,0,0 ; vpand 0xb02(%rip),%xmm2,%xmm2 # 1740 <_sk_xor__hsw_8bit+0x445>
- DB 196,226,125,51,210 ; vpmovzxwd %xmm2,%ymm2
- DB 196,226,109,0,21,20,9,0,0 ; vpshufb 0x914(%rip),%ymm2,%ymm2 # 1560 <_sk_xor__hsw_8bit+0x265>
- DB 196,226,125,48,216 ; vpmovzxbw %xmm0,%ymm3
+ DB 15,133,106,1,0,0 ; jne 1ba3 <_sk_lerp_u8_hsw_8bit+0x18b>
+ DB 196,161,122,111,36,2 ; vmovdqu (%rdx,%r8,1),%xmm4
+ DB 196,226,125,49,236 ; vpmovzxbd %xmm4,%ymm5
+ DB 197,249,112,228,78 ; vpshufd $0x4e,%xmm4,%xmm4
+ DB 196,226,125,49,228 ; vpmovzxbd %xmm4,%ymm4
+ DB 197,253,111,53,42,16,0,0 ; vmovdqa 0x102a(%rip),%ymm6 # 2a80 <_sk_xor__hsw_8bit+0x32f>
+ DB 196,98,93,0,206 ; vpshufb %ymm6,%ymm4,%ymm9
+ DB 196,98,85,0,222 ; vpshufb %ymm6,%ymm5,%ymm11
+ DB 196,226,125,48,240 ; vpmovzxbw %xmm0,%ymm6
DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0
DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0
- DB 196,226,125,48,226 ; vpmovzxbw %xmm2,%ymm4
- DB 196,227,125,57,213,1 ; vextracti128 $0x1,%ymm2,%xmm5
+ DB 196,226,125,48,249 ; vpmovzxbw %xmm1,%ymm7
+ DB 196,227,125,57,201,1 ; vextracti128 $0x1,%ymm1,%xmm1
+ DB 196,226,125,48,201 ; vpmovzxbw %xmm1,%ymm1
+ DB 196,66,125,48,195 ; vpmovzxbw %xmm11,%ymm8
+ DB 196,99,125,57,220,1 ; vextracti128 $0x1,%ymm11,%xmm4
+ DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
+ DB 196,66,125,48,209 ; vpmovzxbw %xmm9,%ymm10
+ DB 196,99,125,57,205,1 ; vextracti128 $0x1,%ymm9,%xmm5
+ DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5
+ DB 197,213,213,233 ; vpmullw %ymm1,%ymm5,%ymm5
+ DB 197,45,213,215 ; vpmullw %ymm7,%ymm10,%ymm10
+ DB 197,221,213,224 ; vpmullw %ymm0,%ymm4,%ymm4
+ DB 197,61,213,198 ; vpmullw %ymm6,%ymm8,%ymm8
+ DB 197,189,253,246 ; vpaddw %ymm6,%ymm8,%ymm6
+ DB 197,221,253,192 ; vpaddw %ymm0,%ymm4,%ymm0
+ DB 197,173,253,231 ; vpaddw %ymm7,%ymm10,%ymm4
+ DB 197,213,253,201 ; vpaddw %ymm1,%ymm5,%ymm1
+ DB 197,245,113,209,8 ; vpsrlw $0x8,%ymm1,%ymm1
+ DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4
+ DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0
+ DB 197,213,113,214,8 ; vpsrlw $0x8,%ymm6,%ymm5
+ DB 196,227,85,56,240,1 ; vinserti128 $0x1,%xmm0,%ymm5,%ymm6
+ DB 196,227,85,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm5,%ymm0
+ DB 197,77,103,208 ; vpackuswb %ymm0,%ymm6,%ymm10
+ DB 196,227,93,56,233,1 ; vinserti128 $0x1,%xmm1,%ymm4,%ymm5
+ DB 196,227,93,70,201,49 ; vperm2i128 $0x31,%ymm1,%ymm4,%ymm1
+ DB 197,213,103,201 ; vpackuswb %ymm1,%ymm5,%ymm1
+ DB 197,221,118,228 ; vpcmpeqd %ymm4,%ymm4,%ymm4
+ DB 197,181,239,236 ; vpxor %ymm4,%ymm9,%ymm5
+ DB 197,165,239,228 ; vpxor %ymm4,%ymm11,%ymm4
+ DB 196,226,125,48,242 ; vpmovzxbw %xmm2,%ymm6
+ DB 196,227,125,57,215,1 ; vextracti128 $0x1,%ymm2,%xmm7
+ DB 196,226,125,48,255 ; vpmovzxbw %xmm7,%ymm7
+ DB 196,98,125,48,195 ; vpmovzxbw %xmm3,%ymm8
+ DB 196,227,125,57,216,1 ; vextracti128 $0x1,%ymm3,%xmm0
+ DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0
+ DB 196,98,125,48,204 ; vpmovzxbw %xmm4,%ymm9
+ DB 196,227,125,57,228,1 ; vextracti128 $0x1,%ymm4,%xmm4
+ DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
+ DB 196,98,125,48,221 ; vpmovzxbw %xmm5,%ymm11
+ DB 196,227,125,57,237,1 ; vextracti128 $0x1,%ymm5,%xmm5
DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5
DB 197,213,213,232 ; vpmullw %ymm0,%ymm5,%ymm5
- DB 197,221,213,227 ; vpmullw %ymm3,%ymm4,%ymm4
- DB 197,221,253,219 ; vpaddw %ymm3,%ymm4,%ymm3
+ DB 196,65,37,213,216 ; vpmullw %ymm8,%ymm11,%ymm11
+ DB 197,221,213,231 ; vpmullw %ymm7,%ymm4,%ymm4
+ DB 197,53,213,206 ; vpmullw %ymm6,%ymm9,%ymm9
+ DB 197,181,253,246 ; vpaddw %ymm6,%ymm9,%ymm6
+ DB 197,221,253,231 ; vpaddw %ymm7,%ymm4,%ymm4
+ DB 196,193,37,253,248 ; vpaddw %ymm8,%ymm11,%ymm7
DB 197,213,253,192 ; vpaddw %ymm0,%ymm5,%ymm0
DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0
- DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3
- DB 196,227,101,56,224,1 ; vinserti128 $0x1,%xmm0,%ymm3,%ymm4
- DB 196,227,101,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm3,%ymm0
- DB 197,221,103,192 ; vpackuswb %ymm0,%ymm4,%ymm0
- DB 197,229,118,219 ; vpcmpeqd %ymm3,%ymm3,%ymm3
- DB 197,237,239,211 ; vpxor %ymm3,%ymm2,%ymm2
- DB 196,226,125,48,217 ; vpmovzxbw %xmm1,%ymm3
- DB 196,227,125,57,204,1 ; vextracti128 $0x1,%ymm1,%xmm4
- DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
- DB 196,226,125,48,234 ; vpmovzxbw %xmm2,%ymm5
- DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2
- DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2
- DB 197,237,213,212 ; vpmullw %ymm4,%ymm2,%ymm2
- DB 197,213,213,235 ; vpmullw %ymm3,%ymm5,%ymm5
- DB 197,213,253,219 ; vpaddw %ymm3,%ymm5,%ymm3
- DB 197,237,253,212 ; vpaddw %ymm4,%ymm2,%ymm2
- DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2
- DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3
- DB 196,227,101,56,226,1 ; vinserti128 $0x1,%xmm2,%ymm3,%ymm4
- DB 196,227,101,70,210,49 ; vperm2i128 $0x31,%ymm2,%ymm3,%ymm2
- DB 197,221,103,210 ; vpackuswb %ymm2,%ymm4,%ymm2
- DB 197,237,252,192 ; vpaddb %ymm0,%ymm2,%ymm0
+ DB 197,213,113,215,8 ; vpsrlw $0x8,%ymm7,%ymm5
+ DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4
+ DB 197,205,113,214,8 ; vpsrlw $0x8,%ymm6,%ymm6
+ DB 196,227,77,56,252,1 ; vinserti128 $0x1,%xmm4,%ymm6,%ymm7
+ DB 196,227,77,70,228,49 ; vperm2i128 $0x31,%ymm4,%ymm6,%ymm4
+ DB 197,197,103,228 ; vpackuswb %ymm4,%ymm7,%ymm4
+ DB 196,227,85,56,240,1 ; vinserti128 $0x1,%xmm0,%ymm5,%ymm6
+ DB 196,227,85,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm5,%ymm0
+ DB 197,205,103,232 ; vpackuswb %ymm0,%ymm6,%ymm5
+ DB 196,193,93,252,194 ; vpaddb %ymm10,%ymm4,%ymm0
+ DB 197,213,252,201 ; vpaddb %ymm1,%ymm5,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,7 ; and $0x7,%r9b
- DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2
+ DB 65,128,225,15 ; and $0xf,%r9b
+ DB 197,217,239,228 ; vpxor %xmm4,%xmm4,%xmm4
DB 65,254,201 ; dec %r9b
- DB 65,128,249,6 ; cmp $0x6,%r9b
- DB 15,135,49,255,255,255 ; ja c36 <_sk_lerp_u8_hsw_8bit+0x27>
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 15,135,135,254,255,255 ; ja 1a3f <_sk_lerp_u8_hsw_8bit+0x27>
DB 65,15,182,193 ; movzbl %r9b,%eax
- DB 72,141,13,124,0,0,0 ; lea 0x7c(%rip),%rcx # d8c <_sk_lerp_u8_hsw_8bit+0x17d>
+ DB 72,141,13,201,0,0,0 ; lea 0xc9(%rip),%rcx # 1c8c <_sk_lerp_u8_hsw_8bit+0x274>
DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
DB 72,1,200 ; add %rcx,%rax
DB 255,224 ; jmpq *%rax
DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
- DB 197,249,110,208 ; vmovd %eax,%xmm2
- DB 233,15,255,255,255 ; jmpq c36 <_sk_lerp_u8_hsw_8bit+0x27>
- DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
- DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2
- DB 197,233,196,208,2 ; vpinsrw $0x2,%eax,%xmm2,%xmm2
+ DB 197,249,110,224 ; vmovd %eax,%xmm4
+ DB 233,101,254,255,255 ; jmpq 1a3f <_sk_lerp_u8_hsw_8bit+0x27>
+ DB 197,217,239,228 ; vpxor %xmm4,%xmm4,%xmm4
+ DB 196,163,89,32,100,2,2,2 ; vpinsrb $0x2,0x2(%rdx,%r8,1),%xmm4,%xmm4
DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
- DB 197,249,110,216 ; vmovd %eax,%xmm3
- DB 196,226,121,48,219 ; vpmovzxbw %xmm3,%xmm3
- DB 196,227,105,2,211,1 ; vpblendd $0x1,%xmm3,%xmm2,%xmm2
- DB 233,231,254,255,255 ; jmpq c36 <_sk_lerp_u8_hsw_8bit+0x27>
- DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax
- DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2
- DB 197,233,196,208,6 ; vpinsrw $0x6,%eax,%xmm2,%xmm2
- DB 66,15,182,68,2,5 ; movzbl 0x5(%rdx,%r8,1),%eax
- DB 197,233,196,208,5 ; vpinsrw $0x5,%eax,%xmm2,%xmm2
- DB 66,15,182,68,2,4 ; movzbl 0x4(%rdx,%r8,1),%eax
- DB 197,233,196,208,4 ; vpinsrw $0x4,%eax,%xmm2,%xmm2
- DB 196,161,121,110,28,2 ; vmovd (%rdx,%r8,1),%xmm3
- DB 196,226,121,48,219 ; vpmovzxbw %xmm3,%xmm3
- DB 196,227,97,2,210,12 ; vpblendd $0xc,%xmm2,%xmm3,%xmm2
- DB 233,172,254,255,255 ; jmpq c36 <_sk_lerp_u8_hsw_8bit+0x27>
- DB 102,144 ; xchg %ax,%ax
- DB 141 ; (bad)
+ DB 197,249,110,232 ; vmovd %eax,%xmm5
+ DB 196,227,89,14,229,1 ; vpblendw $0x1,%xmm5,%xmm4,%xmm4
+ DB 233,69,254,255,255 ; jmpq 1a3f <_sk_lerp_u8_hsw_8bit+0x27>
+ DB 197,217,239,228 ; vpxor %xmm4,%xmm4,%xmm4
+ DB 196,163,89,32,100,2,6,6 ; vpinsrb $0x6,0x6(%rdx,%r8,1),%xmm4,%xmm4
+ DB 196,163,89,32,100,2,5,5 ; vpinsrb $0x5,0x5(%rdx,%r8,1),%xmm4,%xmm4
+ DB 196,163,89,32,100,2,4,4 ; vpinsrb $0x4,0x4(%rdx,%r8,1),%xmm4,%xmm4
+ DB 196,161,121,110,44,2 ; vmovd (%rdx,%r8,1),%xmm5
+ DB 196,227,89,2,229,1 ; vpblendd $0x1,%xmm5,%xmm4,%xmm4
+ DB 233,24,254,255,255 ; jmpq 1a3f <_sk_lerp_u8_hsw_8bit+0x27>
+ DB 197,217,239,228 ; vpxor %xmm4,%xmm4,%xmm4
+ DB 196,163,89,32,100,2,10,10 ; vpinsrb $0xa,0xa(%rdx,%r8,1),%xmm4,%xmm4
+ DB 196,163,89,32,100,2,9,9 ; vpinsrb $0x9,0x9(%rdx,%r8,1),%xmm4,%xmm4
+ DB 196,163,89,32,100,2,8,8 ; vpinsrb $0x8,0x8(%rdx,%r8,1),%xmm4,%xmm4
+ DB 196,161,122,126,44,2 ; vmovq (%rdx,%r8,1),%xmm5
+ DB 196,227,81,2,228,12 ; vpblendd $0xc,%xmm4,%xmm5,%xmm4
+ DB 233,235,253,255,255 ; jmpq 1a3f <_sk_lerp_u8_hsw_8bit+0x27>
+ DB 197,217,239,228 ; vpxor %xmm4,%xmm4,%xmm4
+ DB 196,163,89,32,100,2,14,14 ; vpinsrb $0xe,0xe(%rdx,%r8,1),%xmm4,%xmm4
+ DB 196,163,89,32,100,2,13,13 ; vpinsrb $0xd,0xd(%rdx,%r8,1),%xmm4,%xmm4
+ DB 196,163,89,32,100,2,12,12 ; vpinsrb $0xc,0xc(%rdx,%r8,1),%xmm4,%xmm4
+ DB 196,161,122,126,44,2 ; vmovq (%rdx,%r8,1),%xmm5
+ DB 196,163,81,34,108,2,8,2 ; vpinsrd $0x2,0x8(%rdx,%r8,1),%xmm5,%xmm5
+ DB 196,227,81,2,228,8 ; vpblendd $0x8,%xmm4,%xmm5,%xmm4
+ DB 233,182,253,255,255 ; jmpq 1a3f <_sk_lerp_u8_hsw_8bit+0x27>
+ DB 15,31,0 ; nopl (%rax)
+ DB 64,255 ; rex (bad)
DB 255 ; (bad)
+ DB 255,90,255 ; lcall *-0x1(%rdx)
DB 255 ; (bad)
- DB 255,170,255,255,255,155 ; ljmp *-0x64000001(%rdx)
+ DB 255,78,255 ; decl -0x1(%rsi)
DB 255 ; (bad)
+ DB 255,138,255,255,255,130 ; decl -0x7d000001(%rdx)
DB 255 ; (bad)
DB 255 ; (bad)
- DB 232,255,255,255,221 ; callq ffffffffde000d9c <_sk_xor__hsw_8bit+0xffffffffddfffaa1>
DB 255 ; (bad)
+ DB 122,255 ; jp 1ca1 <_sk_lerp_u8_hsw_8bit+0x289>
DB 255 ; (bad)
- DB 255,210 ; callq *%rdx
+ DB 255,110,255 ; ljmp *-0x1(%rsi)
DB 255 ; (bad)
+ DB 255,183,255,255,255,175 ; pushq -0x50000001(%rdi)
DB 255 ; (bad)
- DB 255,195 ; inc %ebx
+ DB 255 ; (bad)
+ DB 255,167,255,255,255,155 ; jmpq *-0x64000001(%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,228 ; jmpq *%rsp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 220,255 ; fdivr %st,%st(7)
+ DB 255 ; (bad)
+ DB 255,212 ; callq *%rsp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,200 ; dec %eax
DB 255 ; (bad)
DB 255 ; (bad)
DB 255 ; .byte 0xff
@@ -39403,389 +40336,689 @@ _sk_lerp_u8_hsw_8bit LABEL PROC
PUBLIC _sk_move_src_dst_hsw_8bit
_sk_move_src_dst_hsw_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 197,252,40,200 ; vmovaps %ymm0,%ymm1
+ DB 197,252,40,208 ; vmovaps %ymm0,%ymm2
+ DB 197,252,40,217 ; vmovaps %ymm1,%ymm3
DB 255,224 ; jmpq *%rax
PUBLIC _sk_move_dst_src_hsw_8bit
_sk_move_dst_src_hsw_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 197,252,40,193 ; vmovaps %ymm1,%ymm0
+ DB 197,252,40,194 ; vmovaps %ymm2,%ymm0
+ DB 197,252,40,203 ; vmovaps %ymm3,%ymm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_black_color_hsw_8bit
_sk_black_color_hsw_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 196,226,125,25,5,141,9,0,0 ; vbroadcastsd 0x98d(%rip),%ymm0 # 1750 <_sk_xor__hsw_8bit+0x455>
+ DB 196,226,125,24,5,245,11,0,0 ; vbroadcastss 0xbf5(%rip),%ymm0 # 28e0 <_sk_xor__hsw_8bit+0x18f>
+ DB 197,252,40,200 ; vmovaps %ymm0,%ymm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_white_color_hsw_8bit
_sk_white_color_hsw_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
DB 197,253,118,192 ; vpcmpeqd %ymm0,%ymm0,%ymm0
+ DB 197,245,118,201 ; vpcmpeqd %ymm1,%ymm1,%ymm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_clear_hsw_8bit
_sk_clear_hsw_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
DB 197,252,87,192 ; vxorps %ymm0,%ymm0,%ymm0
+ DB 197,244,87,201 ; vxorps %ymm1,%ymm1,%ymm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_srcatop_hsw_8bit
_sk_srcatop_hsw_8bit LABEL PROC
- DB 197,253,111,21,163,7,0,0 ; vmovdqa 0x7a3(%rip),%ymm2 # 1580 <_sk_xor__hsw_8bit+0x285>
- DB 196,226,117,0,218 ; vpshufb %ymm2,%ymm1,%ymm3
- DB 196,226,125,48,224 ; vpmovzxbw %xmm0,%ymm4
- DB 196,227,125,57,197,1 ; vextracti128 $0x1,%ymm0,%xmm5
+ DB 197,125,111,5,143,13,0,0 ; vmovdqa 0xd8f(%rip),%ymm8 # 2aa0 <_sk_xor__hsw_8bit+0x34f>
+ DB 196,194,101,0,224 ; vpshufb %ymm8,%ymm3,%ymm4
+ DB 196,194,109,0,232 ; vpshufb %ymm8,%ymm2,%ymm5
+ DB 196,98,125,48,208 ; vpmovzxbw %xmm0,%ymm10
+ DB 196,227,125,57,198,1 ; vextracti128 $0x1,%ymm0,%xmm6
+ DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6
+ DB 196,98,125,48,201 ; vpmovzxbw %xmm1,%ymm9
+ DB 196,227,125,57,207,1 ; vextracti128 $0x1,%ymm1,%xmm7
+ DB 196,226,125,48,255 ; vpmovzxbw %xmm7,%ymm7
+ DB 196,98,125,48,221 ; vpmovzxbw %xmm5,%ymm11
+ DB 196,227,125,57,237,1 ; vextracti128 $0x1,%ymm5,%xmm5
DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5
- DB 196,226,125,48,243 ; vpmovzxbw %xmm3,%ymm6
- DB 196,227,125,57,219,1 ; vextracti128 $0x1,%ymm3,%xmm3
- DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3
- DB 197,229,213,221 ; vpmullw %ymm5,%ymm3,%ymm3
- DB 197,205,213,244 ; vpmullw %ymm4,%ymm6,%ymm6
- DB 197,205,253,228 ; vpaddw %ymm4,%ymm6,%ymm4
- DB 197,229,253,221 ; vpaddw %ymm5,%ymm3,%ymm3
- DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3
- DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4
- DB 196,227,93,56,235,1 ; vinserti128 $0x1,%xmm3,%ymm4,%ymm5
- DB 196,227,93,70,219,49 ; vperm2i128 $0x31,%ymm3,%ymm4,%ymm3
- DB 197,213,103,219 ; vpackuswb %ymm3,%ymm5,%ymm3
- DB 196,226,125,0,194 ; vpshufb %ymm2,%ymm0,%ymm0
- DB 197,237,118,210 ; vpcmpeqd %ymm2,%ymm2,%ymm2
- DB 197,253,239,194 ; vpxor %ymm2,%ymm0,%ymm0
- DB 196,226,125,48,209 ; vpmovzxbw %xmm1,%ymm2
- DB 196,227,125,57,204,1 ; vextracti128 $0x1,%ymm1,%xmm4
+ DB 196,98,125,48,228 ; vpmovzxbw %xmm4,%ymm12
+ DB 196,227,125,57,228,1 ; vextracti128 $0x1,%ymm4,%xmm4
DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
- DB 196,226,125,48,232 ; vpmovzxbw %xmm0,%ymm5
+ DB 197,221,213,231 ; vpmullw %ymm7,%ymm4,%ymm4
+ DB 196,65,29,213,225 ; vpmullw %ymm9,%ymm12,%ymm12
+ DB 197,213,213,238 ; vpmullw %ymm6,%ymm5,%ymm5
+ DB 196,65,37,213,218 ; vpmullw %ymm10,%ymm11,%ymm11
+ DB 196,65,37,253,210 ; vpaddw %ymm10,%ymm11,%ymm10
+ DB 197,213,253,238 ; vpaddw %ymm6,%ymm5,%ymm5
+ DB 196,193,29,253,241 ; vpaddw %ymm9,%ymm12,%ymm6
+ DB 197,221,253,231 ; vpaddw %ymm7,%ymm4,%ymm4
+ DB 197,197,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm7
+ DB 197,205,113,214,8 ; vpsrlw $0x8,%ymm6,%ymm6
+ DB 197,221,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm4
+ DB 196,193,85,113,210,8 ; vpsrlw $0x8,%ymm10,%ymm5
+ DB 196,99,85,56,204,1 ; vinserti128 $0x1,%xmm4,%ymm5,%ymm9
+ DB 196,227,85,70,228,49 ; vperm2i128 $0x31,%ymm4,%ymm5,%ymm4
+ DB 197,53,103,204 ; vpackuswb %ymm4,%ymm9,%ymm9
+ DB 196,227,77,56,239,1 ; vinserti128 $0x1,%xmm7,%ymm6,%ymm5
+ DB 196,227,77,70,247,49 ; vperm2i128 $0x31,%ymm7,%ymm6,%ymm6
+ DB 197,213,103,238 ; vpackuswb %ymm6,%ymm5,%ymm5
+ DB 196,194,125,0,192 ; vpshufb %ymm8,%ymm0,%ymm0
+ DB 196,194,117,0,200 ; vpshufb %ymm8,%ymm1,%ymm1
+ DB 197,205,118,246 ; vpcmpeqd %ymm6,%ymm6,%ymm6
+ DB 197,245,239,206 ; vpxor %ymm6,%ymm1,%ymm1
+ DB 197,253,239,198 ; vpxor %ymm6,%ymm0,%ymm0
+ DB 196,226,125,48,242 ; vpmovzxbw %xmm2,%ymm6
+ DB 196,227,125,57,215,1 ; vextracti128 $0x1,%ymm2,%xmm7
+ DB 196,226,125,48,255 ; vpmovzxbw %xmm7,%ymm7
+ DB 196,98,125,48,195 ; vpmovzxbw %xmm3,%ymm8
+ DB 196,227,125,57,220,1 ; vextracti128 $0x1,%ymm3,%xmm4
+ DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
+ DB 196,98,125,48,208 ; vpmovzxbw %xmm0,%ymm10
DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0
DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0
- DB 197,253,213,196 ; vpmullw %ymm4,%ymm0,%ymm0
- DB 197,213,213,234 ; vpmullw %ymm2,%ymm5,%ymm5
- DB 197,213,253,210 ; vpaddw %ymm2,%ymm5,%ymm2
- DB 197,253,253,196 ; vpaddw %ymm4,%ymm0,%ymm0
+ DB 196,98,125,48,217 ; vpmovzxbw %xmm1,%ymm11
+ DB 196,227,125,57,201,1 ; vextracti128 $0x1,%ymm1,%xmm1
+ DB 196,226,125,48,201 ; vpmovzxbw %xmm1,%ymm1
+ DB 197,221,213,201 ; vpmullw %ymm1,%ymm4,%ymm1
+ DB 196,65,61,213,219 ; vpmullw %ymm11,%ymm8,%ymm11
+ DB 197,197,213,192 ; vpmullw %ymm0,%ymm7,%ymm0
+ DB 196,65,77,213,210 ; vpmullw %ymm10,%ymm6,%ymm10
+ DB 197,173,253,246 ; vpaddw %ymm6,%ymm10,%ymm6
+ DB 197,253,253,199 ; vpaddw %ymm7,%ymm0,%ymm0
+ DB 196,193,37,253,248 ; vpaddw %ymm8,%ymm11,%ymm7
+ DB 197,245,253,204 ; vpaddw %ymm4,%ymm1,%ymm1
+ DB 197,245,113,209,8 ; vpsrlw $0x8,%ymm1,%ymm1
+ DB 197,221,113,215,8 ; vpsrlw $0x8,%ymm7,%ymm4
DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0
- DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2
- DB 196,227,109,56,224,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm4
- DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- DB 197,221,103,192 ; vpackuswb %ymm0,%ymm4,%ymm0
- DB 197,253,252,195 ; vpaddb %ymm3,%ymm0,%ymm0
+ DB 197,205,113,214,8 ; vpsrlw $0x8,%ymm6,%ymm6
+ DB 196,227,77,56,248,1 ; vinserti128 $0x1,%xmm0,%ymm6,%ymm7
+ DB 196,227,77,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm6,%ymm0
+ DB 197,197,103,192 ; vpackuswb %ymm0,%ymm7,%ymm0
+ DB 196,227,93,56,241,1 ; vinserti128 $0x1,%xmm1,%ymm4,%ymm6
+ DB 196,227,93,70,201,49 ; vperm2i128 $0x31,%ymm1,%ymm4,%ymm1
+ DB 197,205,103,201 ; vpackuswb %ymm1,%ymm6,%ymm1
+ DB 196,193,125,252,193 ; vpaddb %ymm9,%ymm0,%ymm0
+ DB 197,245,252,205 ; vpaddb %ymm5,%ymm1,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_dstatop_hsw_8bit
_sk_dstatop_hsw_8bit LABEL PROC
- DB 197,253,111,21,13,7,0,0 ; vmovdqa 0x70d(%rip),%ymm2 # 15a0 <_sk_xor__hsw_8bit+0x2a5>
- DB 196,226,125,0,218 ; vpshufb %ymm2,%ymm0,%ymm3
- DB 196,226,125,48,225 ; vpmovzxbw %xmm1,%ymm4
- DB 196,227,125,57,205,1 ; vextracti128 $0x1,%ymm1,%xmm5
+ DB 197,125,111,5,74,12,0,0 ; vmovdqa 0xc4a(%rip),%ymm8 # 2ac0 <_sk_xor__hsw_8bit+0x36f>
+ DB 196,194,117,0,224 ; vpshufb %ymm8,%ymm1,%ymm4
+ DB 196,194,125,0,232 ; vpshufb %ymm8,%ymm0,%ymm5
+ DB 196,98,125,48,210 ; vpmovzxbw %xmm2,%ymm10
+ DB 196,227,125,57,214,1 ; vextracti128 $0x1,%ymm2,%xmm6
+ DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6
+ DB 196,98,125,48,203 ; vpmovzxbw %xmm3,%ymm9
+ DB 196,227,125,57,223,1 ; vextracti128 $0x1,%ymm3,%xmm7
+ DB 196,226,125,48,255 ; vpmovzxbw %xmm7,%ymm7
+ DB 196,98,125,48,221 ; vpmovzxbw %xmm5,%ymm11
+ DB 196,227,125,57,237,1 ; vextracti128 $0x1,%ymm5,%xmm5
DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5
- DB 196,226,125,48,243 ; vpmovzxbw %xmm3,%ymm6
- DB 196,227,125,57,219,1 ; vextracti128 $0x1,%ymm3,%xmm3
- DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3
- DB 197,229,213,221 ; vpmullw %ymm5,%ymm3,%ymm3
- DB 197,205,213,244 ; vpmullw %ymm4,%ymm6,%ymm6
- DB 197,205,253,228 ; vpaddw %ymm4,%ymm6,%ymm4
- DB 197,229,253,221 ; vpaddw %ymm5,%ymm3,%ymm3
- DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3
- DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4
- DB 196,227,93,56,235,1 ; vinserti128 $0x1,%xmm3,%ymm4,%ymm5
- DB 196,227,93,70,219,49 ; vperm2i128 $0x31,%ymm3,%ymm4,%ymm3
- DB 197,213,103,219 ; vpackuswb %ymm3,%ymm5,%ymm3
- DB 196,226,117,0,210 ; vpshufb %ymm2,%ymm1,%ymm2
- DB 197,221,118,228 ; vpcmpeqd %ymm4,%ymm4,%ymm4
- DB 197,237,239,212 ; vpxor %ymm4,%ymm2,%ymm2
- DB 196,226,125,48,224 ; vpmovzxbw %xmm0,%ymm4
+ DB 196,98,125,48,228 ; vpmovzxbw %xmm4,%ymm12
+ DB 196,227,125,57,228,1 ; vextracti128 $0x1,%ymm4,%xmm4
+ DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
+ DB 197,197,213,228 ; vpmullw %ymm4,%ymm7,%ymm4
+ DB 196,65,53,213,228 ; vpmullw %ymm12,%ymm9,%ymm12
+ DB 197,205,213,237 ; vpmullw %ymm5,%ymm6,%ymm5
+ DB 196,65,45,213,219 ; vpmullw %ymm11,%ymm10,%ymm11
+ DB 196,65,37,253,210 ; vpaddw %ymm10,%ymm11,%ymm10
+ DB 197,213,253,238 ; vpaddw %ymm6,%ymm5,%ymm5
+ DB 196,193,29,253,241 ; vpaddw %ymm9,%ymm12,%ymm6
+ DB 197,221,253,231 ; vpaddw %ymm7,%ymm4,%ymm4
+ DB 197,197,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm7
+ DB 197,205,113,214,8 ; vpsrlw $0x8,%ymm6,%ymm6
+ DB 197,221,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm4
+ DB 196,193,85,113,210,8 ; vpsrlw $0x8,%ymm10,%ymm5
+ DB 196,99,85,56,204,1 ; vinserti128 $0x1,%xmm4,%ymm5,%ymm9
+ DB 196,227,85,70,228,49 ; vperm2i128 $0x31,%ymm4,%ymm5,%ymm4
+ DB 197,181,103,228 ; vpackuswb %ymm4,%ymm9,%ymm4
+ DB 196,227,77,56,239,1 ; vinserti128 $0x1,%xmm7,%ymm6,%ymm5
+ DB 196,227,77,70,247,49 ; vperm2i128 $0x31,%ymm7,%ymm6,%ymm6
+ DB 197,213,103,238 ; vpackuswb %ymm6,%ymm5,%ymm5
+ DB 196,194,109,0,240 ; vpshufb %ymm8,%ymm2,%ymm6
+ DB 196,194,101,0,248 ; vpshufb %ymm8,%ymm3,%ymm7
+ DB 196,65,61,118,192 ; vpcmpeqd %ymm8,%ymm8,%ymm8
+ DB 196,193,69,239,248 ; vpxor %ymm8,%ymm7,%ymm7
+ DB 196,193,77,239,240 ; vpxor %ymm8,%ymm6,%ymm6
+ DB 196,98,125,48,192 ; vpmovzxbw %xmm0,%ymm8
DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0
DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0
- DB 196,226,125,48,234 ; vpmovzxbw %xmm2,%ymm5
- DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2
- DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2
- DB 197,237,213,208 ; vpmullw %ymm0,%ymm2,%ymm2
- DB 197,213,213,236 ; vpmullw %ymm4,%ymm5,%ymm5
- DB 197,213,253,228 ; vpaddw %ymm4,%ymm5,%ymm4
- DB 197,237,253,192 ; vpaddw %ymm0,%ymm2,%ymm0
+ DB 196,98,125,48,201 ; vpmovzxbw %xmm1,%ymm9
+ DB 196,227,125,57,201,1 ; vextracti128 $0x1,%ymm1,%xmm1
+ DB 196,226,125,48,201 ; vpmovzxbw %xmm1,%ymm1
+ DB 196,98,125,48,214 ; vpmovzxbw %xmm6,%ymm10
+ DB 196,227,125,57,246,1 ; vextracti128 $0x1,%ymm6,%xmm6
+ DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6
+ DB 196,98,125,48,223 ; vpmovzxbw %xmm7,%ymm11
+ DB 196,227,125,57,255,1 ; vextracti128 $0x1,%ymm7,%xmm7
+ DB 196,226,125,48,255 ; vpmovzxbw %xmm7,%ymm7
+ DB 197,197,213,249 ; vpmullw %ymm1,%ymm7,%ymm7
+ DB 196,65,37,213,217 ; vpmullw %ymm9,%ymm11,%ymm11
+ DB 197,205,213,240 ; vpmullw %ymm0,%ymm6,%ymm6
+ DB 196,65,45,213,208 ; vpmullw %ymm8,%ymm10,%ymm10
+ DB 196,65,45,253,192 ; vpaddw %ymm8,%ymm10,%ymm8
+ DB 197,205,253,192 ; vpaddw %ymm0,%ymm6,%ymm0
+ DB 196,193,37,253,241 ; vpaddw %ymm9,%ymm11,%ymm6
+ DB 197,197,253,201 ; vpaddw %ymm1,%ymm7,%ymm1
+ DB 197,245,113,209,8 ; vpsrlw $0x8,%ymm1,%ymm1
+ DB 197,205,113,214,8 ; vpsrlw $0x8,%ymm6,%ymm6
DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0
- DB 197,237,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm2
- DB 196,227,109,56,224,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm4
- DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- DB 197,221,103,192 ; vpackuswb %ymm0,%ymm4,%ymm0
- DB 197,253,252,195 ; vpaddb %ymm3,%ymm0,%ymm0
+ DB 196,193,69,113,208,8 ; vpsrlw $0x8,%ymm8,%ymm7
+ DB 196,99,69,56,192,1 ; vinserti128 $0x1,%xmm0,%ymm7,%ymm8
+ DB 196,227,69,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm7,%ymm0
+ DB 197,189,103,192 ; vpackuswb %ymm0,%ymm8,%ymm0
+ DB 196,227,77,56,249,1 ; vinserti128 $0x1,%xmm1,%ymm6,%ymm7
+ DB 196,227,77,70,201,49 ; vperm2i128 $0x31,%ymm1,%ymm6,%ymm1
+ DB 197,197,103,201 ; vpackuswb %ymm1,%ymm7,%ymm1
+ DB 197,253,252,196 ; vpaddb %ymm4,%ymm0,%ymm0
+ DB 197,245,252,205 ; vpaddb %ymm5,%ymm1,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_srcin_hsw_8bit
_sk_srcin_hsw_8bit LABEL PROC
- DB 196,226,117,0,21,118,6,0,0 ; vpshufb 0x676(%rip),%ymm1,%ymm2 # 15c0 <_sk_xor__hsw_8bit+0x2c5>
- DB 196,226,125,48,216 ; vpmovzxbw %xmm0,%ymm3
+ DB 197,253,111,37,1,11,0,0 ; vmovdqa 0xb01(%rip),%ymm4 # 2ae0 <_sk_xor__hsw_8bit+0x38f>
+ DB 196,226,101,0,236 ; vpshufb %ymm4,%ymm3,%ymm5
+ DB 196,226,109,0,228 ; vpshufb %ymm4,%ymm2,%ymm4
+ DB 196,226,125,48,240 ; vpmovzxbw %xmm0,%ymm6
DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0
DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0
- DB 196,226,125,48,226 ; vpmovzxbw %xmm2,%ymm4
- DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2
- DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2
- DB 197,237,213,208 ; vpmullw %ymm0,%ymm2,%ymm2
- DB 197,221,213,227 ; vpmullw %ymm3,%ymm4,%ymm4
- DB 197,221,253,219 ; vpaddw %ymm3,%ymm4,%ymm3
- DB 197,237,253,192 ; vpaddw %ymm0,%ymm2,%ymm0
+ DB 196,226,125,48,249 ; vpmovzxbw %xmm1,%ymm7
+ DB 196,227,125,57,201,1 ; vextracti128 $0x1,%ymm1,%xmm1
+ DB 196,226,125,48,201 ; vpmovzxbw %xmm1,%ymm1
+ DB 196,98,125,48,196 ; vpmovzxbw %xmm4,%ymm8
+ DB 196,227,125,57,228,1 ; vextracti128 $0x1,%ymm4,%xmm4
+ DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
+ DB 196,98,125,48,205 ; vpmovzxbw %xmm5,%ymm9
+ DB 196,227,125,57,237,1 ; vextracti128 $0x1,%ymm5,%xmm5
+ DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5
+ DB 197,213,213,233 ; vpmullw %ymm1,%ymm5,%ymm5
+ DB 197,53,213,207 ; vpmullw %ymm7,%ymm9,%ymm9
+ DB 197,221,213,224 ; vpmullw %ymm0,%ymm4,%ymm4
+ DB 197,61,213,198 ; vpmullw %ymm6,%ymm8,%ymm8
+ DB 197,189,253,246 ; vpaddw %ymm6,%ymm8,%ymm6
+ DB 197,221,253,192 ; vpaddw %ymm0,%ymm4,%ymm0
+ DB 197,181,253,231 ; vpaddw %ymm7,%ymm9,%ymm4
+ DB 197,213,253,201 ; vpaddw %ymm1,%ymm5,%ymm1
+ DB 197,245,113,209,8 ; vpsrlw $0x8,%ymm1,%ymm1
+ DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4
DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0
- DB 197,237,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm2
- DB 196,227,109,56,216,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm3
- DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- DB 197,229,103,192 ; vpackuswb %ymm0,%ymm3,%ymm0
+ DB 197,213,113,214,8 ; vpsrlw $0x8,%ymm6,%ymm5
+ DB 196,227,85,56,240,1 ; vinserti128 $0x1,%xmm0,%ymm5,%ymm6
+ DB 196,227,85,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm5,%ymm0
+ DB 197,205,103,192 ; vpackuswb %ymm0,%ymm6,%ymm0
+ DB 196,227,93,56,233,1 ; vinserti128 $0x1,%xmm1,%ymm4,%ymm5
+ DB 196,227,93,70,201,49 ; vperm2i128 $0x31,%ymm1,%ymm4,%ymm1
+ DB 197,213,103,201 ; vpackuswb %ymm1,%ymm5,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_dstin_hsw_8bit
_sk_dstin_hsw_8bit LABEL PROC
- DB 196,226,125,0,5,63,6,0,0 ; vpshufb 0x63f(%rip),%ymm0,%ymm0 # 15e0 <_sk_xor__hsw_8bit+0x2e5>
- DB 196,226,125,48,209 ; vpmovzxbw %xmm1,%ymm2
- DB 196,227,125,57,203,1 ; vextracti128 $0x1,%ymm1,%xmm3
- DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3
- DB 196,226,125,48,224 ; vpmovzxbw %xmm0,%ymm4
+ DB 197,253,111,37,119,10,0,0 ; vmovdqa 0xa77(%rip),%ymm4 # 2b00 <_sk_xor__hsw_8bit+0x3af>
+ DB 196,226,117,0,204 ; vpshufb %ymm4,%ymm1,%ymm1
+ DB 196,226,125,0,196 ; vpshufb %ymm4,%ymm0,%ymm0
+ DB 196,226,125,48,226 ; vpmovzxbw %xmm2,%ymm4
+ DB 196,227,125,57,213,1 ; vextracti128 $0x1,%ymm2,%xmm5
+ DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5
+ DB 196,226,125,48,243 ; vpmovzxbw %xmm3,%ymm6
+ DB 196,227,125,57,223,1 ; vextracti128 $0x1,%ymm3,%xmm7
+ DB 196,226,125,48,255 ; vpmovzxbw %xmm7,%ymm7
+ DB 196,98,125,48,192 ; vpmovzxbw %xmm0,%ymm8
DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0
DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0
- DB 197,253,213,195 ; vpmullw %ymm3,%ymm0,%ymm0
- DB 197,221,213,226 ; vpmullw %ymm2,%ymm4,%ymm4
- DB 197,221,253,210 ; vpaddw %ymm2,%ymm4,%ymm2
- DB 197,253,253,195 ; vpaddw %ymm3,%ymm0,%ymm0
+ DB 196,98,125,48,201 ; vpmovzxbw %xmm1,%ymm9
+ DB 196,227,125,57,201,1 ; vextracti128 $0x1,%ymm1,%xmm1
+ DB 196,226,125,48,201 ; vpmovzxbw %xmm1,%ymm1
+ DB 197,197,213,201 ; vpmullw %ymm1,%ymm7,%ymm1
+ DB 196,65,77,213,201 ; vpmullw %ymm9,%ymm6,%ymm9
+ DB 197,213,213,192 ; vpmullw %ymm0,%ymm5,%ymm0
+ DB 196,65,93,213,192 ; vpmullw %ymm8,%ymm4,%ymm8
+ DB 197,189,253,228 ; vpaddw %ymm4,%ymm8,%ymm4
+ DB 197,253,253,197 ; vpaddw %ymm5,%ymm0,%ymm0
+ DB 197,181,253,238 ; vpaddw %ymm6,%ymm9,%ymm5
+ DB 197,245,253,207 ; vpaddw %ymm7,%ymm1,%ymm1
+ DB 197,245,113,209,8 ; vpsrlw $0x8,%ymm1,%ymm1
+ DB 197,213,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm5
DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0
- DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2
- DB 196,227,109,56,216,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm3
- DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- DB 197,229,103,192 ; vpackuswb %ymm0,%ymm3,%ymm0
+ DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4
+ DB 196,227,93,56,240,1 ; vinserti128 $0x1,%xmm0,%ymm4,%ymm6
+ DB 196,227,93,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm4,%ymm0
+ DB 197,205,103,192 ; vpackuswb %ymm0,%ymm6,%ymm0
+ DB 196,227,85,56,225,1 ; vinserti128 $0x1,%xmm1,%ymm5,%ymm4
+ DB 196,227,85,70,201,49 ; vperm2i128 $0x31,%ymm1,%ymm5,%ymm1
+ DB 197,221,103,201 ; vpackuswb %ymm1,%ymm4,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_srcout_hsw_8bit
_sk_srcout_hsw_8bit LABEL PROC
- DB 196,226,117,0,21,8,6,0,0 ; vpshufb 0x608(%rip),%ymm1,%ymm2 # 1600 <_sk_xor__hsw_8bit+0x305>
- DB 197,229,118,219 ; vpcmpeqd %ymm3,%ymm3,%ymm3
- DB 197,237,239,211 ; vpxor %ymm3,%ymm2,%ymm2
- DB 196,226,125,48,216 ; vpmovzxbw %xmm0,%ymm3
+ DB 197,253,111,37,235,9,0,0 ; vmovdqa 0x9eb(%rip),%ymm4 # 2b20 <_sk_xor__hsw_8bit+0x3cf>
+ DB 196,226,109,0,236 ; vpshufb %ymm4,%ymm2,%ymm5
+ DB 196,226,101,0,228 ; vpshufb %ymm4,%ymm3,%ymm4
+ DB 197,205,118,246 ; vpcmpeqd %ymm6,%ymm6,%ymm6
+ DB 197,221,239,230 ; vpxor %ymm6,%ymm4,%ymm4
+ DB 197,213,239,238 ; vpxor %ymm6,%ymm5,%ymm5
+ DB 196,226,125,48,240 ; vpmovzxbw %xmm0,%ymm6
DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0
DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0
- DB 196,226,125,48,226 ; vpmovzxbw %xmm2,%ymm4
- DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2
- DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2
- DB 197,237,213,208 ; vpmullw %ymm0,%ymm2,%ymm2
- DB 197,221,213,227 ; vpmullw %ymm3,%ymm4,%ymm4
- DB 197,221,253,219 ; vpaddw %ymm3,%ymm4,%ymm3
- DB 197,237,253,192 ; vpaddw %ymm0,%ymm2,%ymm0
+ DB 196,226,125,48,249 ; vpmovzxbw %xmm1,%ymm7
+ DB 196,227,125,57,201,1 ; vextracti128 $0x1,%ymm1,%xmm1
+ DB 196,226,125,48,201 ; vpmovzxbw %xmm1,%ymm1
+ DB 196,98,125,48,197 ; vpmovzxbw %xmm5,%ymm8
+ DB 196,227,125,57,237,1 ; vextracti128 $0x1,%ymm5,%xmm5
+ DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5
+ DB 196,98,125,48,204 ; vpmovzxbw %xmm4,%ymm9
+ DB 196,227,125,57,228,1 ; vextracti128 $0x1,%ymm4,%xmm4
+ DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
+ DB 197,221,213,225 ; vpmullw %ymm1,%ymm4,%ymm4
+ DB 197,53,213,207 ; vpmullw %ymm7,%ymm9,%ymm9
+ DB 197,213,213,232 ; vpmullw %ymm0,%ymm5,%ymm5
+ DB 197,61,213,198 ; vpmullw %ymm6,%ymm8,%ymm8
+ DB 197,189,253,246 ; vpaddw %ymm6,%ymm8,%ymm6
+ DB 197,213,253,192 ; vpaddw %ymm0,%ymm5,%ymm0
+ DB 197,181,253,239 ; vpaddw %ymm7,%ymm9,%ymm5
+ DB 197,221,253,201 ; vpaddw %ymm1,%ymm4,%ymm1
+ DB 197,245,113,209,8 ; vpsrlw $0x8,%ymm1,%ymm1
+ DB 197,221,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm4
DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0
- DB 197,237,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm2
- DB 196,227,109,56,216,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm3
- DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- DB 197,229,103,192 ; vpackuswb %ymm0,%ymm3,%ymm0
+ DB 197,213,113,214,8 ; vpsrlw $0x8,%ymm6,%ymm5
+ DB 196,227,85,56,240,1 ; vinserti128 $0x1,%xmm0,%ymm5,%ymm6
+ DB 196,227,85,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm5,%ymm0
+ DB 197,205,103,192 ; vpackuswb %ymm0,%ymm6,%ymm0
+ DB 196,227,93,56,233,1 ; vinserti128 $0x1,%xmm1,%ymm4,%ymm5
+ DB 196,227,93,70,201,49 ; vperm2i128 $0x31,%ymm1,%ymm4,%ymm1
+ DB 197,213,103,201 ; vpackuswb %ymm1,%ymm5,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_dstout_hsw_8bit
_sk_dstout_hsw_8bit LABEL PROC
- DB 196,226,125,0,5,201,5,0,0 ; vpshufb 0x5c9(%rip),%ymm0,%ymm0 # 1620 <_sk_xor__hsw_8bit+0x325>
- DB 197,237,118,210 ; vpcmpeqd %ymm2,%ymm2,%ymm2
- DB 197,253,239,194 ; vpxor %ymm2,%ymm0,%ymm0
- DB 196,226,125,48,209 ; vpmovzxbw %xmm1,%ymm2
- DB 196,227,125,57,203,1 ; vextracti128 $0x1,%ymm1,%xmm3
- DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3
- DB 196,226,125,48,224 ; vpmovzxbw %xmm0,%ymm4
+ DB 197,253,111,37,85,9,0,0 ; vmovdqa 0x955(%rip),%ymm4 # 2b40 <_sk_xor__hsw_8bit+0x3ef>
+ DB 196,226,125,0,196 ; vpshufb %ymm4,%ymm0,%ymm0
+ DB 196,226,117,0,204 ; vpshufb %ymm4,%ymm1,%ymm1
+ DB 197,221,118,228 ; vpcmpeqd %ymm4,%ymm4,%ymm4
+ DB 197,245,239,204 ; vpxor %ymm4,%ymm1,%ymm1
+ DB 197,253,239,196 ; vpxor %ymm4,%ymm0,%ymm0
+ DB 196,226,125,48,226 ; vpmovzxbw %xmm2,%ymm4
+ DB 196,227,125,57,213,1 ; vextracti128 $0x1,%ymm2,%xmm5
+ DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5
+ DB 196,226,125,48,243 ; vpmovzxbw %xmm3,%ymm6
+ DB 196,227,125,57,223,1 ; vextracti128 $0x1,%ymm3,%xmm7
+ DB 196,226,125,48,255 ; vpmovzxbw %xmm7,%ymm7
+ DB 196,98,125,48,192 ; vpmovzxbw %xmm0,%ymm8
DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0
DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0
- DB 197,253,213,195 ; vpmullw %ymm3,%ymm0,%ymm0
- DB 197,221,213,226 ; vpmullw %ymm2,%ymm4,%ymm4
- DB 197,221,253,210 ; vpaddw %ymm2,%ymm4,%ymm2
- DB 197,253,253,195 ; vpaddw %ymm3,%ymm0,%ymm0
+ DB 196,98,125,48,201 ; vpmovzxbw %xmm1,%ymm9
+ DB 196,227,125,57,201,1 ; vextracti128 $0x1,%ymm1,%xmm1
+ DB 196,226,125,48,201 ; vpmovzxbw %xmm1,%ymm1
+ DB 197,197,213,201 ; vpmullw %ymm1,%ymm7,%ymm1
+ DB 196,65,77,213,201 ; vpmullw %ymm9,%ymm6,%ymm9
+ DB 197,213,213,192 ; vpmullw %ymm0,%ymm5,%ymm0
+ DB 196,65,93,213,192 ; vpmullw %ymm8,%ymm4,%ymm8
+ DB 197,189,253,228 ; vpaddw %ymm4,%ymm8,%ymm4
+ DB 197,253,253,197 ; vpaddw %ymm5,%ymm0,%ymm0
+ DB 197,181,253,238 ; vpaddw %ymm6,%ymm9,%ymm5
+ DB 197,245,253,207 ; vpaddw %ymm7,%ymm1,%ymm1
+ DB 197,245,113,209,8 ; vpsrlw $0x8,%ymm1,%ymm1
+ DB 197,213,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm5
DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0
- DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2
- DB 196,227,109,56,216,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm3
- DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- DB 197,229,103,192 ; vpackuswb %ymm0,%ymm3,%ymm0
+ DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4
+ DB 196,227,93,56,240,1 ; vinserti128 $0x1,%xmm0,%ymm4,%ymm6
+ DB 196,227,93,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm4,%ymm0
+ DB 197,205,103,192 ; vpackuswb %ymm0,%ymm6,%ymm0
+ DB 196,227,85,56,225,1 ; vinserti128 $0x1,%xmm1,%ymm5,%ymm4
+ DB 196,227,85,70,201,49 ; vperm2i128 $0x31,%ymm1,%ymm5,%ymm1
+ DB 197,221,103,201 ; vpackuswb %ymm1,%ymm4,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_srcover_hsw_8bit
_sk_srcover_hsw_8bit LABEL PROC
- DB 196,226,125,0,21,138,5,0,0 ; vpshufb 0x58a(%rip),%ymm0,%ymm2 # 1640 <_sk_xor__hsw_8bit+0x345>
- DB 196,226,125,48,217 ; vpmovzxbw %xmm1,%ymm3
- DB 196,227,125,57,204,1 ; vextracti128 $0x1,%ymm1,%xmm4
+ DB 197,253,111,37,189,8,0,0 ; vmovdqa 0x8bd(%rip),%ymm4 # 2b60 <_sk_xor__hsw_8bit+0x40f>
+ DB 196,226,117,0,236 ; vpshufb %ymm4,%ymm1,%ymm5
+ DB 196,226,125,0,228 ; vpshufb %ymm4,%ymm0,%ymm4
+ DB 196,98,125,48,202 ; vpmovzxbw %xmm2,%ymm9
+ DB 196,227,125,57,215,1 ; vextracti128 $0x1,%ymm2,%xmm7
+ DB 196,226,125,48,255 ; vpmovzxbw %xmm7,%ymm7
+ DB 196,98,125,48,195 ; vpmovzxbw %xmm3,%ymm8
+ DB 196,227,125,57,222,1 ; vextracti128 $0x1,%ymm3,%xmm6
+ DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6
+ DB 196,98,125,48,212 ; vpmovzxbw %xmm4,%ymm10
+ DB 196,227,125,57,228,1 ; vextracti128 $0x1,%ymm4,%xmm4
DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
- DB 196,226,125,48,234 ; vpmovzxbw %xmm2,%ymm5
- DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2
- DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2
- DB 197,237,213,212 ; vpmullw %ymm4,%ymm2,%ymm2
- DB 197,213,213,235 ; vpmullw %ymm3,%ymm5,%ymm5
- DB 197,213,253,219 ; vpaddw %ymm3,%ymm5,%ymm3
- DB 197,237,253,212 ; vpaddw %ymm4,%ymm2,%ymm2
- DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2
- DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3
- DB 196,227,101,56,226,1 ; vinserti128 $0x1,%xmm2,%ymm3,%ymm4
- DB 196,227,101,70,210,49 ; vperm2i128 $0x31,%ymm2,%ymm3,%ymm2
- DB 197,221,103,210 ; vpackuswb %ymm2,%ymm4,%ymm2
- DB 197,245,252,192 ; vpaddb %ymm0,%ymm1,%ymm0
- DB 197,253,248,194 ; vpsubb %ymm2,%ymm0,%ymm0
+ DB 196,98,125,48,221 ; vpmovzxbw %xmm5,%ymm11
+ DB 196,227,125,57,237,1 ; vextracti128 $0x1,%ymm5,%xmm5
+ DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5
+ DB 197,205,213,237 ; vpmullw %ymm5,%ymm6,%ymm5
+ DB 196,65,61,213,219 ; vpmullw %ymm11,%ymm8,%ymm11
+ DB 197,197,213,228 ; vpmullw %ymm4,%ymm7,%ymm4
+ DB 196,65,53,213,210 ; vpmullw %ymm10,%ymm9,%ymm10
+ DB 196,65,45,253,201 ; vpaddw %ymm9,%ymm10,%ymm9
+ DB 197,221,253,231 ; vpaddw %ymm7,%ymm4,%ymm4
+ DB 196,193,37,253,248 ; vpaddw %ymm8,%ymm11,%ymm7
+ DB 197,213,253,238 ; vpaddw %ymm6,%ymm5,%ymm5
+ DB 197,213,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm5
+ DB 197,205,113,215,8 ; vpsrlw $0x8,%ymm7,%ymm6
+ DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4
+ DB 196,193,69,113,209,8 ; vpsrlw $0x8,%ymm9,%ymm7
+ DB 196,99,69,56,196,1 ; vinserti128 $0x1,%xmm4,%ymm7,%ymm8
+ DB 196,227,69,70,228,49 ; vperm2i128 $0x31,%ymm4,%ymm7,%ymm4
+ DB 197,189,103,228 ; vpackuswb %ymm4,%ymm8,%ymm4
+ DB 196,227,77,56,253,1 ; vinserti128 $0x1,%xmm5,%ymm6,%ymm7
+ DB 196,227,77,70,237,49 ; vperm2i128 $0x31,%ymm5,%ymm6,%ymm5
+ DB 197,197,103,237 ; vpackuswb %ymm5,%ymm7,%ymm5
+ DB 197,229,252,201 ; vpaddb %ymm1,%ymm3,%ymm1
+ DB 197,237,252,192 ; vpaddb %ymm0,%ymm2,%ymm0
+ DB 197,253,248,196 ; vpsubb %ymm4,%ymm0,%ymm0
+ DB 197,245,248,205 ; vpsubb %ymm5,%ymm1,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_dstover_hsw_8bit
_sk_dstover_hsw_8bit LABEL PROC
- DB 196,226,117,0,21,75,5,0,0 ; vpshufb 0x54b(%rip),%ymm1,%ymm2 # 1660 <_sk_xor__hsw_8bit+0x365>
- DB 196,226,125,48,216 ; vpmovzxbw %xmm0,%ymm3
- DB 196,227,125,57,196,1 ; vextracti128 $0x1,%ymm0,%xmm4
+ DB 197,253,111,37,30,8,0,0 ; vmovdqa 0x81e(%rip),%ymm4 # 2b80 <_sk_xor__hsw_8bit+0x42f>
+ DB 196,226,101,0,236 ; vpshufb %ymm4,%ymm3,%ymm5
+ DB 196,226,109,0,228 ; vpshufb %ymm4,%ymm2,%ymm4
+ DB 196,98,125,48,200 ; vpmovzxbw %xmm0,%ymm9
+ DB 196,227,125,57,199,1 ; vextracti128 $0x1,%ymm0,%xmm7
+ DB 196,226,125,48,255 ; vpmovzxbw %xmm7,%ymm7
+ DB 196,98,125,48,193 ; vpmovzxbw %xmm1,%ymm8
+ DB 196,227,125,57,206,1 ; vextracti128 $0x1,%ymm1,%xmm6
+ DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6
+ DB 196,98,125,48,212 ; vpmovzxbw %xmm4,%ymm10
+ DB 196,227,125,57,228,1 ; vextracti128 $0x1,%ymm4,%xmm4
DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
- DB 196,226,125,48,234 ; vpmovzxbw %xmm2,%ymm5
- DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2
- DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2
- DB 197,237,213,212 ; vpmullw %ymm4,%ymm2,%ymm2
- DB 197,213,213,235 ; vpmullw %ymm3,%ymm5,%ymm5
- DB 197,213,253,219 ; vpaddw %ymm3,%ymm5,%ymm3
- DB 197,237,253,212 ; vpaddw %ymm4,%ymm2,%ymm2
- DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2
- DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3
- DB 196,227,101,56,226,1 ; vinserti128 $0x1,%xmm2,%ymm3,%ymm4
- DB 196,227,101,70,210,49 ; vperm2i128 $0x31,%ymm2,%ymm3,%ymm2
- DB 197,221,103,210 ; vpackuswb %ymm2,%ymm4,%ymm2
- DB 197,245,252,192 ; vpaddb %ymm0,%ymm1,%ymm0
- DB 197,253,248,194 ; vpsubb %ymm2,%ymm0,%ymm0
+ DB 196,98,125,48,221 ; vpmovzxbw %xmm5,%ymm11
+ DB 196,227,125,57,237,1 ; vextracti128 $0x1,%ymm5,%xmm5
+ DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5
+ DB 197,213,213,238 ; vpmullw %ymm6,%ymm5,%ymm5
+ DB 196,65,37,213,216 ; vpmullw %ymm8,%ymm11,%ymm11
+ DB 197,221,213,231 ; vpmullw %ymm7,%ymm4,%ymm4
+ DB 196,65,45,213,209 ; vpmullw %ymm9,%ymm10,%ymm10
+ DB 196,65,45,253,201 ; vpaddw %ymm9,%ymm10,%ymm9
+ DB 197,221,253,231 ; vpaddw %ymm7,%ymm4,%ymm4
+ DB 196,193,37,253,248 ; vpaddw %ymm8,%ymm11,%ymm7
+ DB 197,213,253,238 ; vpaddw %ymm6,%ymm5,%ymm5
+ DB 197,213,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm5
+ DB 197,205,113,215,8 ; vpsrlw $0x8,%ymm7,%ymm6
+ DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4
+ DB 196,193,69,113,209,8 ; vpsrlw $0x8,%ymm9,%ymm7
+ DB 196,99,69,56,196,1 ; vinserti128 $0x1,%xmm4,%ymm7,%ymm8
+ DB 196,227,69,70,228,49 ; vperm2i128 $0x31,%ymm4,%ymm7,%ymm4
+ DB 197,189,103,228 ; vpackuswb %ymm4,%ymm8,%ymm4
+ DB 196,227,77,56,253,1 ; vinserti128 $0x1,%xmm5,%ymm6,%ymm7
+ DB 196,227,77,70,237,49 ; vperm2i128 $0x31,%ymm5,%ymm6,%ymm5
+ DB 197,197,103,237 ; vpackuswb %ymm5,%ymm7,%ymm5
+ DB 197,229,252,201 ; vpaddb %ymm1,%ymm3,%ymm1
+ DB 197,237,252,192 ; vpaddb %ymm0,%ymm2,%ymm0
+ DB 197,253,248,196 ; vpsubb %ymm4,%ymm0,%ymm0
+ DB 197,245,248,205 ; vpsubb %ymm5,%ymm1,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_modulate_hsw_8bit
_sk_modulate_hsw_8bit LABEL PROC
- DB 196,226,125,48,208 ; vpmovzxbw %xmm0,%ymm2
+ DB 196,226,125,48,224 ; vpmovzxbw %xmm0,%ymm4
DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0
DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0
- DB 196,226,125,48,217 ; vpmovzxbw %xmm1,%ymm3
- DB 196,227,125,57,204,1 ; vextracti128 $0x1,%ymm1,%xmm4
- DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
- DB 197,221,213,224 ; vpmullw %ymm0,%ymm4,%ymm4
- DB 197,229,213,218 ; vpmullw %ymm2,%ymm3,%ymm3
- DB 197,229,253,210 ; vpaddw %ymm2,%ymm3,%ymm2
- DB 197,221,253,192 ; vpaddw %ymm0,%ymm4,%ymm0
+ DB 196,226,125,48,233 ; vpmovzxbw %xmm1,%ymm5
+ DB 196,227,125,57,201,1 ; vextracti128 $0x1,%ymm1,%xmm1
+ DB 196,226,125,48,201 ; vpmovzxbw %xmm1,%ymm1
+ DB 196,98,125,48,202 ; vpmovzxbw %xmm2,%ymm9
+ DB 196,227,125,57,215,1 ; vextracti128 $0x1,%ymm2,%xmm7
+ DB 196,226,125,48,255 ; vpmovzxbw %xmm7,%ymm7
+ DB 196,98,125,48,195 ; vpmovzxbw %xmm3,%ymm8
+ DB 196,227,125,57,222,1 ; vextracti128 $0x1,%ymm3,%xmm6
+ DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6
+ DB 197,205,213,241 ; vpmullw %ymm1,%ymm6,%ymm6
+ DB 197,61,213,197 ; vpmullw %ymm5,%ymm8,%ymm8
+ DB 197,197,213,248 ; vpmullw %ymm0,%ymm7,%ymm7
+ DB 197,53,213,204 ; vpmullw %ymm4,%ymm9,%ymm9
+ DB 197,181,253,228 ; vpaddw %ymm4,%ymm9,%ymm4
+ DB 197,197,253,192 ; vpaddw %ymm0,%ymm7,%ymm0
+ DB 197,189,253,237 ; vpaddw %ymm5,%ymm8,%ymm5
+ DB 197,205,253,201 ; vpaddw %ymm1,%ymm6,%ymm1
+ DB 197,245,113,209,8 ; vpsrlw $0x8,%ymm1,%ymm1
+ DB 197,213,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm5
DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0
- DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2
- DB 196,227,109,56,216,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm3
- DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- DB 197,229,103,192 ; vpackuswb %ymm0,%ymm3,%ymm0
+ DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4
+ DB 196,227,93,56,240,1 ; vinserti128 $0x1,%xmm0,%ymm4,%ymm6
+ DB 196,227,93,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm4,%ymm0
+ DB 197,205,103,192 ; vpackuswb %ymm0,%ymm6,%ymm0
+ DB 196,227,85,56,225,1 ; vinserti128 $0x1,%xmm1,%ymm5,%ymm4
+ DB 196,227,85,70,201,49 ; vperm2i128 $0x31,%ymm1,%ymm5,%ymm1
+ DB 197,221,103,201 ; vpackuswb %ymm1,%ymm4,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_multiply_hsw_8bit
_sk_multiply_hsw_8bit LABEL PROC
- DB 197,253,111,37,191,4,0,0 ; vmovdqa 0x4bf(%rip),%ymm4 # 1680 <_sk_xor__hsw_8bit+0x385>
- DB 196,226,117,0,212 ; vpshufb %ymm4,%ymm1,%ymm2
- DB 197,213,118,237 ; vpcmpeqd %ymm5,%ymm5,%ymm5
- DB 197,237,239,245 ; vpxor %ymm5,%ymm2,%ymm6
- DB 196,226,125,48,208 ; vpmovzxbw %xmm0,%ymm2
- DB 196,227,125,57,195,1 ; vextracti128 $0x1,%ymm0,%xmm3
- DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3
- DB 196,226,125,48,254 ; vpmovzxbw %xmm6,%ymm7
- DB 196,227,125,57,246,1 ; vextracti128 $0x1,%ymm6,%xmm6
+ DB 72,131,236,56 ; sub $0x38,%rsp
+ DB 197,253,111,243 ; vmovdqa %ymm3,%ymm6
+ DB 197,253,111,218 ; vmovdqa %ymm2,%ymm3
+ DB 197,125,111,13,219,6,0,0 ; vmovdqa 0x6db(%rip),%ymm9 # 2ba0 <_sk_xor__hsw_8bit+0x44f>
+ DB 196,194,101,0,225 ; vpshufb %ymm9,%ymm3,%ymm4
+ DB 196,194,77,0,233 ; vpshufb %ymm9,%ymm6,%ymm5
+ DB 196,65,45,118,210 ; vpcmpeqd %ymm10,%ymm10,%ymm10
+ DB 196,65,85,239,194 ; vpxor %ymm10,%ymm5,%ymm8
+ DB 196,65,93,239,218 ; vpxor %ymm10,%ymm4,%ymm11
+ DB 196,98,125,48,232 ; vpmovzxbw %xmm0,%ymm13
+ DB 196,227,125,57,197,1 ; vextracti128 $0x1,%ymm0,%xmm5
+ DB 196,98,125,48,245 ; vpmovzxbw %xmm5,%ymm14
+ DB 196,226,125,48,209 ; vpmovzxbw %xmm1,%ymm2
+ DB 196,227,125,57,207,1 ; vextracti128 $0x1,%ymm1,%xmm7
+ DB 196,226,125,48,255 ; vpmovzxbw %xmm7,%ymm7
+ DB 196,66,125,48,227 ; vpmovzxbw %xmm11,%ymm12
+ DB 196,99,125,57,220,1 ; vextracti128 $0x1,%ymm11,%xmm4
+ DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
+ DB 196,66,125,48,216 ; vpmovzxbw %xmm8,%ymm11
+ DB 196,99,125,57,197,1 ; vextracti128 $0x1,%ymm8,%xmm5
+ DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5
+ DB 197,213,213,239 ; vpmullw %ymm7,%ymm5,%ymm5
+ DB 197,37,213,194 ; vpmullw %ymm2,%ymm11,%ymm8
+ DB 196,193,93,213,230 ; vpmullw %ymm14,%ymm4,%ymm4
+ DB 196,65,29,213,221 ; vpmullw %ymm13,%ymm12,%ymm11
+ DB 196,65,37,253,221 ; vpaddw %ymm13,%ymm11,%ymm11
+ DB 196,193,93,253,230 ; vpaddw %ymm14,%ymm4,%ymm4
+ DB 197,61,253,194 ; vpaddw %ymm2,%ymm8,%ymm8
+ DB 197,213,253,239 ; vpaddw %ymm7,%ymm5,%ymm5
+ DB 197,213,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm5
+ DB 196,193,29,113,208,8 ; vpsrlw $0x8,%ymm8,%ymm12
+ DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4
+ DB 196,193,61,113,211,8 ; vpsrlw $0x8,%ymm11,%ymm8
+ DB 196,99,61,56,220,1 ; vinserti128 $0x1,%xmm4,%ymm8,%ymm11
+ DB 196,227,61,70,228,49 ; vperm2i128 $0x31,%ymm4,%ymm8,%ymm4
+ DB 197,37,103,196 ; vpackuswb %ymm4,%ymm11,%ymm8
+ DB 196,227,29,56,229,1 ; vinserti128 $0x1,%xmm5,%ymm12,%ymm4
+ DB 196,227,29,70,237,49 ; vperm2i128 $0x31,%ymm5,%ymm12,%ymm5
+ DB 197,93,103,221 ; vpackuswb %ymm5,%ymm4,%ymm11
+ DB 196,194,125,0,193 ; vpshufb %ymm9,%ymm0,%ymm0
+ DB 196,194,117,0,201 ; vpshufb %ymm9,%ymm1,%ymm1
+ DB 196,65,117,239,226 ; vpxor %ymm10,%ymm1,%ymm12
+ DB 196,193,125,239,226 ; vpxor %ymm10,%ymm0,%ymm4
+ DB 196,226,125,48,195 ; vpmovzxbw %xmm3,%ymm0
+ DB 196,227,125,57,217,1 ; vextracti128 $0x1,%ymm3,%xmm1
+ DB 196,226,125,48,201 ; vpmovzxbw %xmm1,%ymm1
+ DB 197,254,127,52,36 ; vmovdqu %ymm6,(%rsp)
+ DB 196,98,125,48,206 ; vpmovzxbw %xmm6,%ymm9
+ DB 196,227,125,57,245,1 ; vextracti128 $0x1,%ymm6,%xmm5
+ DB 196,98,125,48,213 ; vpmovzxbw %xmm5,%ymm10
+ DB 196,226,125,48,236 ; vpmovzxbw %xmm4,%ymm5
+ DB 196,227,125,57,228,1 ; vextracti128 $0x1,%ymm4,%xmm4
+ DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
+ DB 197,245,213,228 ; vpmullw %ymm4,%ymm1,%ymm4
+ DB 197,253,213,237 ; vpmullw %ymm5,%ymm0,%ymm5
+ DB 197,213,253,232 ; vpaddw %ymm0,%ymm5,%ymm5
+ DB 197,221,253,225 ; vpaddw %ymm1,%ymm4,%ymm4
+ DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4
+ DB 197,213,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm5
+ DB 196,99,85,56,252,1 ; vinserti128 $0x1,%xmm4,%ymm5,%ymm15
+ DB 196,227,85,70,228,49 ; vperm2i128 $0x31,%ymm4,%ymm5,%ymm4
+ DB 196,194,125,48,236 ; vpmovzxbw %xmm12,%ymm5
+ DB 196,99,125,57,230,1 ; vextracti128 $0x1,%ymm12,%xmm6
DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6
- DB 197,205,213,243 ; vpmullw %ymm3,%ymm6,%ymm6
- DB 197,197,213,250 ; vpmullw %ymm2,%ymm7,%ymm7
- DB 197,197,253,250 ; vpaddw %ymm2,%ymm7,%ymm7
- DB 197,205,253,243 ; vpaddw %ymm3,%ymm6,%ymm6
+ DB 197,173,213,246 ; vpmullw %ymm6,%ymm10,%ymm6
+ DB 197,181,213,237 ; vpmullw %ymm5,%ymm9,%ymm5
+ DB 196,193,85,253,233 ; vpaddw %ymm9,%ymm5,%ymm5
+ DB 196,193,77,253,242 ; vpaddw %ymm10,%ymm6,%ymm6
DB 197,205,113,214,8 ; vpsrlw $0x8,%ymm6,%ymm6
- DB 197,197,113,215,8 ; vpsrlw $0x8,%ymm7,%ymm7
- DB 196,99,69,56,198,1 ; vinserti128 $0x1,%xmm6,%ymm7,%ymm8
- DB 196,227,69,70,246,49 ; vperm2i128 $0x31,%ymm6,%ymm7,%ymm6
- DB 197,189,103,246 ; vpackuswb %ymm6,%ymm8,%ymm6
- DB 196,226,125,0,196 ; vpshufb %ymm4,%ymm0,%ymm0
- DB 197,253,239,197 ; vpxor %ymm5,%ymm0,%ymm0
- DB 196,226,125,48,225 ; vpmovzxbw %xmm1,%ymm4
- DB 196,227,125,57,205,1 ; vextracti128 $0x1,%ymm1,%xmm5
- DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5
- DB 196,226,125,48,248 ; vpmovzxbw %xmm0,%ymm7
- DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0
- DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0
- DB 197,253,213,197 ; vpmullw %ymm5,%ymm0,%ymm0
- DB 197,197,213,252 ; vpmullw %ymm4,%ymm7,%ymm7
- DB 197,197,253,252 ; vpaddw %ymm4,%ymm7,%ymm7
- DB 197,253,253,197 ; vpaddw %ymm5,%ymm0,%ymm0
+ DB 197,213,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm5
+ DB 197,133,103,228 ; vpackuswb %ymm4,%ymm15,%ymm4
+ DB 196,99,85,56,230,1 ; vinserti128 $0x1,%xmm6,%ymm5,%ymm12
+ DB 196,227,85,70,238,49 ; vperm2i128 $0x31,%ymm6,%ymm5,%ymm5
+ DB 197,157,103,237 ; vpackuswb %ymm5,%ymm12,%ymm5
+ DB 196,193,85,252,235 ; vpaddb %ymm11,%ymm5,%ymm5
+ DB 196,193,93,252,224 ; vpaddb %ymm8,%ymm4,%ymm4
+ DB 196,193,125,213,197 ; vpmullw %ymm13,%ymm0,%ymm0
+ DB 196,193,125,253,197 ; vpaddw %ymm13,%ymm0,%ymm0
+ DB 196,193,117,213,206 ; vpmullw %ymm14,%ymm1,%ymm1
+ DB 196,193,117,253,206 ; vpaddw %ymm14,%ymm1,%ymm1
+ DB 197,181,213,242 ; vpmullw %ymm2,%ymm9,%ymm6
+ DB 197,205,253,210 ; vpaddw %ymm2,%ymm6,%ymm2
+ DB 197,173,213,247 ; vpmullw %ymm7,%ymm10,%ymm6
+ DB 197,205,253,247 ; vpaddw %ymm7,%ymm6,%ymm6
+ DB 197,245,113,209,8 ; vpsrlw $0x8,%ymm1,%ymm1
DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0
- DB 197,197,113,215,8 ; vpsrlw $0x8,%ymm7,%ymm7
- DB 196,99,69,56,192,1 ; vinserti128 $0x1,%xmm0,%ymm7,%ymm8
- DB 196,227,69,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm7,%ymm0
- DB 197,189,103,192 ; vpackuswb %ymm0,%ymm8,%ymm0
- DB 197,253,252,198 ; vpaddb %ymm6,%ymm0,%ymm0
- DB 197,213,213,235 ; vpmullw %ymm3,%ymm5,%ymm5
- DB 197,221,213,226 ; vpmullw %ymm2,%ymm4,%ymm4
- DB 197,221,253,210 ; vpaddw %ymm2,%ymm4,%ymm2
- DB 197,213,253,219 ; vpaddw %ymm3,%ymm5,%ymm3
- DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3
+ DB 196,227,125,56,249,1 ; vinserti128 $0x1,%xmm1,%ymm0,%ymm7
+ DB 196,227,125,70,193,49 ; vperm2i128 $0x31,%ymm1,%ymm0,%ymm0
+ DB 197,245,113,214,8 ; vpsrlw $0x8,%ymm6,%ymm1
DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2
- DB 196,227,109,56,227,1 ; vinserti128 $0x1,%xmm3,%ymm2,%ymm4
- DB 196,227,109,70,211,49 ; vperm2i128 $0x31,%ymm3,%ymm2,%ymm2
- DB 197,221,103,210 ; vpackuswb %ymm2,%ymm4,%ymm2
- DB 197,253,252,194 ; vpaddb %ymm2,%ymm0,%ymm0
- DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,197,103,192 ; vpackuswb %ymm0,%ymm7,%ymm0
+ DB 196,227,109,56,241,1 ; vinserti128 $0x1,%xmm1,%ymm2,%ymm6
+ DB 196,227,109,70,201,49 ; vperm2i128 $0x31,%ymm1,%ymm2,%ymm1
+ DB 197,205,103,201 ; vpackuswb %ymm1,%ymm6,%ymm1
+ DB 197,221,252,192 ; vpaddb %ymm0,%ymm4,%ymm0
+ DB 197,213,252,201 ; vpaddb %ymm1,%ymm5,%ymm1
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,253,111,211 ; vmovdqa %ymm3,%ymm2
+ DB 197,252,16,28,36 ; vmovups (%rsp),%ymm3
+ DB 72,131,196,56 ; add $0x38,%rsp
DB 255,224 ; jmpq *%rax
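The multiply stage above combines three such products before its final vpaddb adds: each source pixel scaled by the inverted destination alpha (the vpshufb against the 32-byte mask broadcasts a pixel's alpha byte across its four channels), each destination pixel scaled by the inverted source alpha, and the plain product of the two. A hedged per-channel scalar sketch, with sa/da standing for the two alpha bytes:

    #include <cstdint>

    // (x*y + x) >> 8, the div-255 approximation used throughout these stages.
    static inline uint8_t scale(uint8_t x, uint8_t y) {
        return uint8_t((uint16_t(x) * y + x) >> 8);
    }

    // multiply: s*(1-da) + d*(1-sa) + s*d, with wrapping byte adds like vpaddb.
    static inline uint8_t multiply_channel(uint8_t s, uint8_t d, uint8_t sa, uint8_t da) {
        return uint8_t(scale(s, uint8_t(255 - da)) + scale(d, uint8_t(255 - sa)) + scale(s, d));
    }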
PUBLIC _sk_screen_hsw_8bit
_sk_screen_hsw_8bit LABEL PROC
- DB 197,237,118,210 ; vpcmpeqd %ymm2,%ymm2,%ymm2
- DB 197,253,239,210 ; vpxor %ymm2,%ymm0,%ymm2
- DB 196,226,125,48,218 ; vpmovzxbw %xmm2,%ymm3
- DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2
- DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2
- DB 196,226,125,48,225 ; vpmovzxbw %xmm1,%ymm4
- DB 196,227,125,57,205,1 ; vextracti128 $0x1,%ymm1,%xmm5
+ DB 197,221,118,228 ; vpcmpeqd %ymm4,%ymm4,%ymm4
+ DB 197,245,239,236 ; vpxor %ymm4,%ymm1,%ymm5
+ DB 197,253,239,228 ; vpxor %ymm4,%ymm0,%ymm4
+ DB 196,98,125,48,204 ; vpmovzxbw %xmm4,%ymm9
+ DB 196,227,125,57,228,1 ; vextracti128 $0x1,%ymm4,%xmm4
+ DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
+ DB 196,226,125,48,253 ; vpmovzxbw %xmm5,%ymm7
+ DB 196,227,125,57,237,1 ; vextracti128 $0x1,%ymm5,%xmm5
DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5
- DB 197,213,213,234 ; vpmullw %ymm2,%ymm5,%ymm5
- DB 197,221,213,227 ; vpmullw %ymm3,%ymm4,%ymm4
- DB 197,221,253,219 ; vpaddw %ymm3,%ymm4,%ymm3
- DB 197,213,253,210 ; vpaddw %ymm2,%ymm5,%ymm2
- DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2
- DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3
- DB 196,227,101,56,226,1 ; vinserti128 $0x1,%xmm2,%ymm3,%ymm4
- DB 196,227,101,70,210,49 ; vperm2i128 $0x31,%ymm2,%ymm3,%ymm2
- DB 197,221,103,210 ; vpackuswb %ymm2,%ymm4,%ymm2
- DB 197,237,252,192 ; vpaddb %ymm0,%ymm2,%ymm0
+ DB 196,98,125,48,194 ; vpmovzxbw %xmm2,%ymm8
+ DB 196,227,125,57,214,1 ; vextracti128 $0x1,%ymm2,%xmm6
+ DB 196,98,125,48,222 ; vpmovzxbw %xmm6,%ymm11
+ DB 196,98,125,48,211 ; vpmovzxbw %xmm3,%ymm10
+ DB 196,227,125,57,222,1 ; vextracti128 $0x1,%ymm3,%xmm6
+ DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6
+ DB 197,205,213,245 ; vpmullw %ymm5,%ymm6,%ymm6
+ DB 197,45,213,215 ; vpmullw %ymm7,%ymm10,%ymm10
+ DB 197,37,213,220 ; vpmullw %ymm4,%ymm11,%ymm11
+ DB 196,65,61,213,193 ; vpmullw %ymm9,%ymm8,%ymm8
+ DB 196,65,61,253,193 ; vpaddw %ymm9,%ymm8,%ymm8
+ DB 197,165,253,228 ; vpaddw %ymm4,%ymm11,%ymm4
+ DB 197,173,253,255 ; vpaddw %ymm7,%ymm10,%ymm7
+ DB 197,205,253,237 ; vpaddw %ymm5,%ymm6,%ymm5
+ DB 197,213,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm5
+ DB 197,205,113,215,8 ; vpsrlw $0x8,%ymm7,%ymm6
+ DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4
+ DB 196,193,69,113,208,8 ; vpsrlw $0x8,%ymm8,%ymm7
+ DB 196,99,69,56,196,1 ; vinserti128 $0x1,%xmm4,%ymm7,%ymm8
+ DB 196,227,69,70,228,49 ; vperm2i128 $0x31,%ymm4,%ymm7,%ymm4
+ DB 197,189,103,228 ; vpackuswb %ymm4,%ymm8,%ymm4
+ DB 196,227,77,56,253,1 ; vinserti128 $0x1,%xmm5,%ymm6,%ymm7
+ DB 196,227,77,70,237,49 ; vperm2i128 $0x31,%ymm5,%ymm6,%ymm5
+ DB 197,197,103,237 ; vpackuswb %ymm5,%ymm7,%ymm5
+ DB 197,221,252,192 ; vpaddb %ymm0,%ymm4,%ymm0
+ DB 197,213,252,201 ; vpaddb %ymm1,%ymm5,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
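Screen is one-sided: the source bytes stay as they are, the destination is scaled by the inverted source, and the two halves meet in the trailing vpaddb. A scalar sketch under the same fixed-point approximation:

    #include <cstdint>

    // screen: s + d*(255 - s)/255, matching the vpxor (invert), vpmullw/vpaddw,
    // vpsrlw $0x8 and final vpaddb in the block above.
    static inline uint8_t screen_channel(uint8_t s, uint8_t d) {
        uint8_t inv_s = uint8_t(255 - s);
        uint8_t t = uint8_t((uint16_t(d) * inv_s + inv_s) >> 8);
        return uint8_t(s + t);   // wrapping add, as vpaddb does
    }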
PUBLIC _sk_xor__hsw_8bit
_sk_xor__hsw_8bit LABEL PROC
- DB 197,253,111,21,157,3,0,0 ; vmovdqa 0x39d(%rip),%ymm2 # 16a0 <_sk_xor__hsw_8bit+0x3a5>
- DB 196,226,117,0,218 ; vpshufb %ymm2,%ymm1,%ymm3
- DB 197,221,118,228 ; vpcmpeqd %ymm4,%ymm4,%ymm4
- DB 197,229,239,220 ; vpxor %ymm4,%ymm3,%ymm3
- DB 196,226,125,48,232 ; vpmovzxbw %xmm0,%ymm5
- DB 196,227,125,57,198,1 ; vextracti128 $0x1,%ymm0,%xmm6
+ DB 197,125,111,13,103,4,0,0 ; vmovdqa 0x467(%rip),%ymm9 # 2bc0 <_sk_xor__hsw_8bit+0x46f>
+ DB 196,194,109,0,225 ; vpshufb %ymm9,%ymm2,%ymm4
+ DB 196,194,101,0,249 ; vpshufb %ymm9,%ymm3,%ymm7
+ DB 196,65,37,118,219 ; vpcmpeqd %ymm11,%ymm11,%ymm11
+ DB 196,193,69,239,251 ; vpxor %ymm11,%ymm7,%ymm7
+ DB 196,193,93,239,227 ; vpxor %ymm11,%ymm4,%ymm4
+ DB 196,98,125,48,192 ; vpmovzxbw %xmm0,%ymm8
+ DB 196,227,125,57,197,1 ; vextracti128 $0x1,%ymm0,%xmm5
+ DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5
+ DB 196,98,125,48,209 ; vpmovzxbw %xmm1,%ymm10
+ DB 196,227,125,57,206,1 ; vextracti128 $0x1,%ymm1,%xmm6
DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6
- DB 196,226,125,48,251 ; vpmovzxbw %xmm3,%ymm7
- DB 196,227,125,57,219,1 ; vextracti128 $0x1,%ymm3,%xmm3
- DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3
- DB 197,229,213,222 ; vpmullw %ymm6,%ymm3,%ymm3
- DB 197,197,213,253 ; vpmullw %ymm5,%ymm7,%ymm7
- DB 197,197,253,237 ; vpaddw %ymm5,%ymm7,%ymm5
- DB 197,229,253,222 ; vpaddw %ymm6,%ymm3,%ymm3
- DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3
+ DB 196,98,125,48,228 ; vpmovzxbw %xmm4,%ymm12
+ DB 196,227,125,57,228,1 ; vextracti128 $0x1,%ymm4,%xmm4
+ DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
+ DB 196,98,125,48,239 ; vpmovzxbw %xmm7,%ymm13
+ DB 196,227,125,57,255,1 ; vextracti128 $0x1,%ymm7,%xmm7
+ DB 196,226,125,48,255 ; vpmovzxbw %xmm7,%ymm7
+ DB 197,197,213,254 ; vpmullw %ymm6,%ymm7,%ymm7
+ DB 196,65,21,213,234 ; vpmullw %ymm10,%ymm13,%ymm13
+ DB 197,221,213,229 ; vpmullw %ymm5,%ymm4,%ymm4
+ DB 196,65,29,213,224 ; vpmullw %ymm8,%ymm12,%ymm12
+ DB 196,65,29,253,192 ; vpaddw %ymm8,%ymm12,%ymm8
+ DB 197,221,253,229 ; vpaddw %ymm5,%ymm4,%ymm4
+ DB 196,193,21,253,234 ; vpaddw %ymm10,%ymm13,%ymm5
+ DB 197,197,253,246 ; vpaddw %ymm6,%ymm7,%ymm6
+ DB 197,205,113,214,8 ; vpsrlw $0x8,%ymm6,%ymm6
DB 197,213,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm5
- DB 196,227,85,56,243,1 ; vinserti128 $0x1,%xmm3,%ymm5,%ymm6
- DB 196,227,85,70,219,49 ; vperm2i128 $0x31,%ymm3,%ymm5,%ymm3
- DB 197,205,103,219 ; vpackuswb %ymm3,%ymm6,%ymm3
- DB 196,226,125,0,194 ; vpshufb %ymm2,%ymm0,%ymm0
- DB 197,253,239,196 ; vpxor %ymm4,%ymm0,%ymm0
- DB 196,226,125,48,209 ; vpmovzxbw %xmm1,%ymm2
- DB 196,227,125,57,204,1 ; vextracti128 $0x1,%ymm1,%xmm4
+ DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4
+ DB 196,193,69,113,208,8 ; vpsrlw $0x8,%ymm8,%ymm7
+ DB 196,99,69,56,196,1 ; vinserti128 $0x1,%xmm4,%ymm7,%ymm8
+ DB 196,227,69,70,228,49 ; vperm2i128 $0x31,%ymm4,%ymm7,%ymm4
+ DB 197,61,103,212 ; vpackuswb %ymm4,%ymm8,%ymm10
+ DB 196,227,85,56,254,1 ; vinserti128 $0x1,%xmm6,%ymm5,%ymm7
+ DB 196,227,85,70,238,49 ; vperm2i128 $0x31,%ymm6,%ymm5,%ymm5
+ DB 197,197,103,253 ; vpackuswb %ymm5,%ymm7,%ymm7
+ DB 196,194,125,0,193 ; vpshufb %ymm9,%ymm0,%ymm0
+ DB 196,194,117,0,201 ; vpshufb %ymm9,%ymm1,%ymm1
+ DB 196,193,117,239,203 ; vpxor %ymm11,%ymm1,%ymm1
+ DB 196,193,125,239,195 ; vpxor %ymm11,%ymm0,%ymm0
+ DB 196,226,125,48,234 ; vpmovzxbw %xmm2,%ymm5
+ DB 196,227,125,57,214,1 ; vextracti128 $0x1,%ymm2,%xmm6
+ DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6
+ DB 196,98,125,48,195 ; vpmovzxbw %xmm3,%ymm8
+ DB 196,227,125,57,220,1 ; vextracti128 $0x1,%ymm3,%xmm4
DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4
- DB 196,226,125,48,232 ; vpmovzxbw %xmm0,%ymm5
+ DB 196,98,125,48,200 ; vpmovzxbw %xmm0,%ymm9
DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0
DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0
- DB 197,253,213,196 ; vpmullw %ymm4,%ymm0,%ymm0
- DB 197,213,213,234 ; vpmullw %ymm2,%ymm5,%ymm5
- DB 197,213,253,210 ; vpaddw %ymm2,%ymm5,%ymm2
- DB 197,253,253,196 ; vpaddw %ymm4,%ymm0,%ymm0
+ DB 196,98,125,48,217 ; vpmovzxbw %xmm1,%ymm11
+ DB 196,227,125,57,201,1 ; vextracti128 $0x1,%ymm1,%xmm1
+ DB 196,226,125,48,201 ; vpmovzxbw %xmm1,%ymm1
+ DB 197,221,213,201 ; vpmullw %ymm1,%ymm4,%ymm1
+ DB 196,65,61,213,219 ; vpmullw %ymm11,%ymm8,%ymm11
+ DB 197,205,213,192 ; vpmullw %ymm0,%ymm6,%ymm0
+ DB 196,65,85,213,201 ; vpmullw %ymm9,%ymm5,%ymm9
+ DB 197,181,253,237 ; vpaddw %ymm5,%ymm9,%ymm5
+ DB 197,253,253,198 ; vpaddw %ymm6,%ymm0,%ymm0
+ DB 196,193,37,253,240 ; vpaddw %ymm8,%ymm11,%ymm6
+ DB 197,245,253,204 ; vpaddw %ymm4,%ymm1,%ymm1
+ DB 197,245,113,209,8 ; vpsrlw $0x8,%ymm1,%ymm1
+ DB 197,221,113,214,8 ; vpsrlw $0x8,%ymm6,%ymm4
DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0
- DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2
- DB 196,227,109,56,224,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm4
- DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
- DB 197,221,103,192 ; vpackuswb %ymm0,%ymm4,%ymm0
- DB 197,253,252,195 ; vpaddb %ymm3,%ymm0,%ymm0
+ DB 197,213,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm5
+ DB 196,227,85,56,240,1 ; vinserti128 $0x1,%xmm0,%ymm5,%ymm6
+ DB 196,227,85,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm5,%ymm0
+ DB 197,205,103,192 ; vpackuswb %ymm0,%ymm6,%ymm0
+ DB 196,227,93,56,233,1 ; vinserti128 $0x1,%xmm1,%ymm4,%ymm5
+ DB 196,227,93,70,201,49 ; vperm2i128 $0x31,%ymm1,%ymm4,%ymm1
+ DB 197,213,103,201 ; vpackuswb %ymm1,%ymm5,%ymm1
+ DB 196,193,125,252,194 ; vpaddb %ymm10,%ymm0,%ymm0
+ DB 197,245,252,207 ; vpaddb %ymm7,%ymm1,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
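The xor_ stage is the symmetric case: source scaled by the inverted destination alpha plus destination scaled by the inverted source alpha, with the alpha broadcasts again coming from the vpshufb mask loaded at the top of the block. A hedged scalar sketch:

    #include <cstdint>

    static inline uint8_t scale(uint8_t x, uint8_t y) {   // (x*y + x) >> 8
        return uint8_t((uint16_t(x) * y + x) >> 8);
    }

    // xor_: s*(1 - da) + d*(1 - sa) per channel, sa/da being the pixels' alpha bytes.
    static inline uint8_t xor_channel(uint8_t s, uint8_t d, uint8_t sa, uint8_t da) {
        return uint8_t(scale(s, uint8_t(255 - da)) + scale(d, uint8_t(255 - sa)));
    }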
ALIGN 4
DB 0,0 ; add %al,(%rax)
- DB 127,67 ; jg 13ff <_sk_xor__hsw_8bit+0x104>
+ DB 127,67 ; jg 290b <_sk_xor__hsw_8bit+0x1ba>
DB 1,1 ; add %eax,(%rcx)
DB 1,0 ; add %eax,(%rax)
DB 0,0 ; add %al,(%rax)
@@ -39795,9 +41028,11 @@ ALIGN 4
DB 0,0 ; add %al,(%rax)
DB 0,255 ; add %bh,%bh
DB 0,0 ; add %al,(%rax)
- DB 127,67 ; jg 1413 <_sk_xor__hsw_8bit+0x118>
+ DB 127,67 ; jg 291f <_sk_xor__hsw_8bit+0x1ce>
+ DB 0,0 ; add %al,(%rax)
+ DB 127,67 ; jg 2923 <_sk_xor__hsw_8bit+0x1d2>
DB 0,0 ; add %al,(%rax)
- DB 127,67 ; jg 1417 <_sk_xor__hsw_8bit+0x11c>
+ DB 0,255 ; add %bh,%bh
ALIGN 32
DB 0,0 ; add %al,(%rax)
@@ -40235,38 +41470,6 @@ ALIGN 32
DB 15 ; .byte 0xf
ALIGN 16
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 0,2 ; add %al,(%rdx)
- DB 0,0 ; add %al,(%rax)
- DB 0,0 ; add %al,(%rax)
- DB 0,0 ; add %al,(%rax)
- DB 0,0 ; add %al,(%rax)
- DB 0,0 ; add %al,(%rax)
- DB 0,0 ; add %al,(%rax)
- DB 0,0 ; add %al,(%rax)
- DB 0,2 ; add %al,(%rdx)
- DB 4,6 ; add $0x6,%al
- DB 0,0 ; add %al,(%rax)
- DB 0,0 ; add %al,(%rax)
- DB 0,0 ; add %al,(%rax)
- DB 0,0 ; add %al,(%rax)
- DB 0,0 ; add %al,(%rax)
- DB 0,0 ; add %al,(%rax)
DB 0,2 ; add %al,(%rdx)
DB 4,6 ; add $0x6,%al
DB 8,10 ; or %cl,(%rdx)
@@ -40275,44 +41478,6 @@ ALIGN 16
DB 0,0 ; add %al,(%rax)
DB 0,0 ; add %al,(%rax)
DB 0,0 ; add %al,(%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
-
-ALIGN 8
- DB 0,0 ; add %al,(%rax)
- DB 0,255 ; add %bh,%bh
- DB 0,0 ; add %al,(%rax)
- DB 0,255 ; add %bh,%bh
ALIGN 32
PUBLIC _sk_start_pipeline_sse41_8bit
@@ -40348,7 +41513,7 @@ _sk_start_pipeline_sse41_8bit LABEL PROC
DB 77,57,207 ; cmp %r9,%r15
DB 15,131,138,0,0,0 ; jae 10b <_sk_start_pipeline_sse41_8bit+0x10b>
DB 72,139,133,24,255,255,255 ; mov -0xe8(%rbp),%rax
- DB 72,141,64,4 ; lea 0x4(%rax),%rax
+ DB 72,141,64,8 ; lea 0x8(%rax),%rax
DB 72,137,133,248,254,255,255 ; mov %rax,-0x108(%rbp)
DB 76,141,165,0,255,255,255 ; lea -0x100(%rbp),%r12
DB 72,57,157,248,254,255,255 ; cmp %rbx,-0x108(%rbp)
@@ -40361,9 +41526,9 @@ _sk_start_pipeline_sse41_8bit LABEL PROC
DB 76,137,246 ; mov %r14,%rsi
DB 65,255,213 ; callq *%r13
DB 72,139,141,0,255,255,255 ; mov -0x100(%rbp),%rcx
- DB 72,141,65,4 ; lea 0x4(%rcx),%rax
+ DB 72,141,65,8 ; lea 0x8(%rcx),%rax
DB 72,137,133,0,255,255,255 ; mov %rax,-0x100(%rbp)
- DB 72,131,193,8 ; add $0x8,%rcx
+ DB 72,131,193,16 ; add $0x10,%rcx
DB 72,57,217 ; cmp %rbx,%rcx
DB 118,220 ; jbe c3 <_sk_start_pipeline_sse41_8bit+0xc3>
DB 72,137,217 ; mov %rbx,%rcx
@@ -40407,62 +41572,86 @@ _sk_uniform_color_sse41_8bit LABEL PROC
DB 102,15,110,64,16 ; movd 0x10(%rax),%xmm0
DB 102,15,112,192,0 ; pshufd $0x0,%xmm0,%xmm0
DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 102,15,111,200 ; movdqa %xmm0,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_set_rgb_sse41_8bit
_sk_set_rgb_sse41_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 243,15,16,21,76,14,0,0 ; movss 0xe4c(%rip),%xmm2 # fc8 <_sk_xor__sse41_8bit+0xa1>
- DB 243,15,16,24 ; movss (%rax),%xmm3
- DB 243,15,89,218 ; mulss %xmm2,%xmm3
- DB 243,72,15,44,203 ; cvttss2si %xmm3,%rcx
- DB 243,15,16,88,4 ; movss 0x4(%rax),%xmm3
- DB 243,15,89,218 ; mulss %xmm2,%xmm3
- DB 243,72,15,44,211 ; cvttss2si %xmm3,%rdx
+ DB 243,15,16,37,96,28,0,0 ; movss 0x1c60(%rip),%xmm4 # 1de0 <_sk_xor__sse41_8bit+0x148>
+ DB 243,15,16,40 ; movss (%rax),%xmm5
+ DB 243,15,89,236 ; mulss %xmm4,%xmm5
+ DB 243,72,15,44,205 ; cvttss2si %xmm5,%rcx
+ DB 243,15,16,104,4 ; movss 0x4(%rax),%xmm5
+ DB 243,15,89,236 ; mulss %xmm4,%xmm5
+ DB 243,72,15,44,213 ; cvttss2si %xmm5,%rdx
DB 193,226,8 ; shl $0x8,%edx
DB 9,202 ; or %ecx,%edx
- DB 243,15,89,80,8 ; mulss 0x8(%rax),%xmm2
- DB 243,72,15,44,194 ; cvttss2si %xmm2,%rax
+ DB 243,15,89,96,8 ; mulss 0x8(%rax),%xmm4
+ DB 243,72,15,44,196 ; cvttss2si %xmm4,%rax
DB 193,224,16 ; shl $0x10,%eax
DB 9,208 ; or %edx,%eax
- DB 102,15,110,208 ; movd %eax,%xmm2
- DB 102,15,112,210,0 ; pshufd $0x0,%xmm2,%xmm2
- DB 102,15,219,5,36,14,0,0 ; pand 0xe24(%rip),%xmm0 # fe0 <_sk_xor__sse41_8bit+0xb9>
- DB 102,15,235,194 ; por %xmm2,%xmm0
+ DB 102,15,110,224 ; movd %eax,%xmm4
+ DB 102,15,112,228,0 ; pshufd $0x0,%xmm4,%xmm4
+ DB 102,15,111,45,48,28,0,0 ; movdqa 0x1c30(%rip),%xmm5 # 1df0 <_sk_xor__sse41_8bit+0x158>
+ DB 102,15,219,205 ; pand %xmm5,%xmm1
+ DB 102,15,219,197 ; pand %xmm5,%xmm0
+ DB 102,15,235,196 ; por %xmm4,%xmm0
+ DB 102,15,235,204 ; por %xmm4,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
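set_rgb above scales the three floats from the context pointer by the constant loaded via movss, truncates them with cvttss2si, and packs them into the low three bytes of every pixel while the pand/por pair keeps each pixel's existing alpha. A scalar sketch; the 255.0f scale and the 0xff000000 alpha mask are assumptions, since both constants live in the data table at the end of the file:

    #include <cstdint>

    // Pack r,g,b (0..1 floats) into the low 24 bits, keep the pixel's alpha byte.
    static inline uint32_t set_rgb(uint32_t px, float r, float g, float b) {
        uint32_t rgb = uint32_t(r * 255.0f)          // cvttss2si truncates
                     | uint32_t(g * 255.0f) << 8
                     | uint32_t(b * 255.0f) << 16;
        return (px & 0xff000000u) | rgb;             // pand alpha mask, then por
    }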
PUBLIC _sk_premul_sse41_8bit
_sk_premul_sse41_8bit LABEL PROC
- DB 102,15,111,216 ; movdqa %xmm0,%xmm3
- DB 102,15,56,0,29,31,14,0,0 ; pshufb 0xe1f(%rip),%xmm3 # ff0 <_sk_xor__sse41_8bit+0xc9>
- DB 102,15,235,29,39,14,0,0 ; por 0xe27(%rip),%xmm3 # 1000 <_sk_xor__sse41_8bit+0xd9>
- DB 102,15,239,228 ; pxor %xmm4,%xmm4
- DB 102,15,56,48,232 ; pmovzxbw %xmm0,%xmm5
- DB 102,15,104,196 ; punpckhbw %xmm4,%xmm0
- DB 102,15,56,48,211 ; pmovzxbw %xmm3,%xmm2
- DB 102,15,104,220 ; punpckhbw %xmm4,%xmm3
- DB 102,15,213,216 ; pmullw %xmm0,%xmm3
- DB 102,15,213,213 ; pmullw %xmm5,%xmm2
- DB 102,15,253,216 ; paddw %xmm0,%xmm3
- DB 102,15,253,213 ; paddw %xmm5,%xmm2
- DB 102,15,113,211,8 ; psrlw $0x8,%xmm3
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,211 ; packuswb %xmm3,%xmm2
+ DB 102,15,111,225 ; movdqa %xmm1,%xmm4
+ DB 102,15,111,232 ; movdqa %xmm0,%xmm5
+ DB 102,15,111,5,28,28,0,0 ; movdqa 0x1c1c(%rip),%xmm0 # 1e00 <_sk_xor__sse41_8bit+0x168>
+ DB 102,15,111,253 ; movdqa %xmm5,%xmm7
+ DB 102,15,56,0,248 ; pshufb %xmm0,%xmm7
+ DB 102,15,111,244 ; movdqa %xmm4,%xmm6
+ DB 102,15,56,0,240 ; pshufb %xmm0,%xmm6
+ DB 102,15,111,5,18,28,0,0 ; movdqa 0x1c12(%rip),%xmm0 # 1e10 <_sk_xor__sse41_8bit+0x178>
+ DB 102,15,235,240 ; por %xmm0,%xmm6
+ DB 102,15,235,248 ; por %xmm0,%xmm7
+ DB 102,69,15,239,192 ; pxor %xmm8,%xmm8
+ DB 102,68,15,56,48,205 ; pmovzxbw %xmm5,%xmm9
+ DB 102,65,15,104,232 ; punpckhbw %xmm8,%xmm5
+ DB 102,68,15,56,48,212 ; pmovzxbw %xmm4,%xmm10
+ DB 102,65,15,104,224 ; punpckhbw %xmm8,%xmm4
+ DB 102,15,56,48,199 ; pmovzxbw %xmm7,%xmm0
+ DB 102,15,56,48,206 ; pmovzxbw %xmm6,%xmm1
+ DB 102,65,15,104,248 ; punpckhbw %xmm8,%xmm7
+ DB 102,65,15,104,240 ; punpckhbw %xmm8,%xmm6
+ DB 102,15,213,244 ; pmullw %xmm4,%xmm6
+ DB 102,15,213,253 ; pmullw %xmm5,%xmm7
+ DB 102,65,15,213,202 ; pmullw %xmm10,%xmm1
+ DB 102,65,15,213,193 ; pmullw %xmm9,%xmm0
+ DB 102,15,253,253 ; paddw %xmm5,%xmm7
+ DB 102,15,253,244 ; paddw %xmm4,%xmm6
+ DB 102,65,15,253,193 ; paddw %xmm9,%xmm0
+ DB 102,65,15,253,202 ; paddw %xmm10,%xmm1
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
+ DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
+ DB 102,15,103,199 ; packuswb %xmm7,%xmm0
+ DB 102,15,103,206 ; packuswb %xmm6,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 102,15,111,194 ; movdqa %xmm2,%xmm0
DB 255,224 ; jmpq *%rax
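In premul, the pshufb broadcasts each pixel's alpha byte across its channels and the following por appears to force the alpha lane of that multiplier to 0xff (the constant itself sits in the trailing data table), so r, g and b get scaled by alpha while alpha passes through unchanged. A per-pixel scalar sketch, assuming the same alpha-in-the-top-byte layout that set_rgb above works with:

    #include <cstdint>

    // premul: r,g,b scaled by a with the (x*a + x) >> 8 approximation; alpha kept.
    static inline uint32_t premul_pixel(uint32_t px) {
        uint8_t a = uint8_t(px >> 24);
        auto scale = [a](uint8_t x) { return uint8_t((uint16_t(x) * a + x) >> 8); };
        return uint32_t(scale(uint8_t(px)))
             | uint32_t(scale(uint8_t(px >> 8)))  << 8
             | uint32_t(scale(uint8_t(px >> 16))) << 16
             | uint32_t(a) << 24;
    }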
PUBLIC _sk_swap_rb_sse41_8bit
_sk_swap_rb_sse41_8bit LABEL PROC
- DB 102,15,56,0,5,242,13,0,0 ; pshufb 0xdf2(%rip),%xmm0 # 1010 <_sk_xor__sse41_8bit+0xe9>
+ DB 102,15,111,37,159,27,0,0 ; movdqa 0x1b9f(%rip),%xmm4 # 1e20 <_sk_xor__sse41_8bit+0x188>
+ DB 102,15,56,0,196 ; pshufb %xmm4,%xmm0
+ DB 102,15,56,0,204 ; pshufb %xmm4,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_invert_sse41_8bit
_sk_invert_sse41_8bit LABEL PROC
- DB 102,15,118,210 ; pcmpeqd %xmm2,%xmm2
- DB 102,15,239,194 ; pxor %xmm2,%xmm0
+ DB 102,15,118,228 ; pcmpeqd %xmm4,%xmm4
+ DB 102,15,239,196 ; pxor %xmm4,%xmm0
+ DB 102,15,239,204 ; pxor %xmm4,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
@@ -40477,25 +41666,51 @@ _sk_load_8888_sse41_8bit LABEL PROC
DB 72,193,226,2 ; shl $0x2,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,10 ; jne 259 <_sk_load_8888_sse41_8bit+0x2b>
+ DB 117,17 ; jne 2d1 <_sk_load_8888_sse41_8bit+0x32>
+ DB 243,66,15,111,76,130,16 ; movdqu 0x10(%rdx,%r8,4),%xmm1
DB 243,66,15,111,4,130 ; movdqu (%rdx,%r8,4),%xmm0
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,42 ; je 28d <_sk_load_8888_sse41_8bit+0x5f>
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,201 ; pxor %xmm1,%xmm1
DB 102,15,239,192 ; pxor %xmm0,%xmm0
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,18 ; je 27f <_sk_load_8888_sse41_8bit+0x51>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,226 ; jne 255 <_sk_load_8888_sse41_8bit+0x27>
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,231 ; ja 2cd <_sk_load_8888_sse41_8bit+0x2e>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,75,0,0,0 ; lea 0x4b(%rip),%rcx # 33c <_sk_load_8888_sse41_8bit+0x9d>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 102,66,15,110,4,130 ; movd (%rdx,%r8,4),%xmm0
+ DB 235,203 ; jmp 2cd <_sk_load_8888_sse41_8bit+0x2e>
DB 102,66,15,110,68,130,8 ; movd 0x8(%rdx,%r8,4),%xmm0
DB 102,15,112,192,69 ; pshufd $0x45,%xmm0,%xmm0
- DB 243,66,15,126,20,130 ; movq (%rdx,%r8,4),%xmm2
- DB 102,15,58,14,194,15 ; pblendw $0xf,%xmm2,%xmm0
- DB 235,200 ; jmp 255 <_sk_load_8888_sse41_8bit+0x27>
- DB 102,66,15,110,4,130 ; movd (%rdx,%r8,4),%xmm0
- DB 235,192 ; jmp 255 <_sk_load_8888_sse41_8bit+0x27>
+ DB 243,66,15,126,36,130 ; movq (%rdx,%r8,4),%xmm4
+ DB 102,15,58,14,196,15 ; pblendw $0xf,%xmm4,%xmm0
+ DB 235,177 ; jmp 2cd <_sk_load_8888_sse41_8bit+0x2e>
+ DB 102,66,15,110,68,130,24 ; movd 0x18(%rdx,%r8,4),%xmm0
+ DB 102,15,112,200,69 ; pshufd $0x45,%xmm0,%xmm1
+ DB 102,66,15,58,34,76,130,20,1 ; pinsrd $0x1,0x14(%rdx,%r8,4),%xmm1
+ DB 102,66,15,58,34,76,130,16,0 ; pinsrd $0x0,0x10(%rdx,%r8,4),%xmm1
+ DB 235,139 ; jmp 2c7 <_sk_load_8888_sse41_8bit+0x28>
+ DB 190,255,255,255,210 ; mov $0xd2ffffff,%esi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,198 ; inc %esi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,139,255,255,255,245 ; decl -0xa000001(%rbx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 236 ; in (%dx),%al
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,224 ; jmpq *%rax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
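The branchy code after the movdqu pair in load_8888 handles partial strides: the remaining pixel count (1..7) indexes a table of four-byte relative offsets (the runs objdump prints as "(bad)" are that table, not real instructions), and each entry point loads just those pixels with movd/movq/pinsrd, leaving the rest zero. A sketch of the equivalent scalar tail, with the function name chosen here for illustration:

    #include <cstdint>
    #include <cstring>

    // Load 'tail' (1..7) of the 8 pixels in a stride; untouched lanes stay zero,
    // mirroring the pxor + partial-load ladder above.
    static inline void load_8888_tail(const uint32_t* src, uint32_t dst[8], unsigned tail) {
        std::memset(dst, 0, 8 * sizeof(uint32_t));
        std::memcpy(dst, src, tail * sizeof(uint32_t));
    }

The same pattern, with the same kind of offset table, repeats after every load and store variant below.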
PUBLIC _sk_load_8888_dst_sse41_8bit
_sk_load_8888_dst_sse41_8bit LABEL PROC
@@ -40508,53 +41723,109 @@ _sk_load_8888_dst_sse41_8bit LABEL PROC
DB 72,193,226,2 ; shl $0x2,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,10 ; jne 2c0 <_sk_load_8888_dst_sse41_8bit+0x2b>
- DB 243,66,15,111,12,130 ; movdqu (%rdx,%r8,4),%xmm1
+ DB 117,17 ; jne 38a <_sk_load_8888_dst_sse41_8bit+0x32>
+ DB 243,66,15,111,92,130,16 ; movdqu 0x10(%rdx,%r8,4),%xmm3
+ DB 243,66,15,111,20,130 ; movdqu (%rdx,%r8,4),%xmm2
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,42 ; je 2f4 <_sk_load_8888_dst_sse41_8bit+0x5f>
- DB 102,15,239,201 ; pxor %xmm1,%xmm1
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,18 ; je 2e6 <_sk_load_8888_dst_sse41_8bit+0x51>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,226 ; jne 2bc <_sk_load_8888_dst_sse41_8bit+0x27>
- DB 102,66,15,110,76,130,8 ; movd 0x8(%rdx,%r8,4),%xmm1
- DB 102,15,112,201,69 ; pshufd $0x45,%xmm1,%xmm1
- DB 243,66,15,126,20,130 ; movq (%rdx,%r8,4),%xmm2
- DB 102,15,58,14,202,15 ; pblendw $0xf,%xmm2,%xmm1
- DB 235,200 ; jmp 2bc <_sk_load_8888_dst_sse41_8bit+0x27>
- DB 102,66,15,110,12,130 ; movd (%rdx,%r8,4),%xmm1
- DB 235,192 ; jmp 2bc <_sk_load_8888_dst_sse41_8bit+0x27>
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,219 ; pxor %xmm3,%xmm3
+ DB 102,15,239,210 ; pxor %xmm2,%xmm2
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,231 ; ja 386 <_sk_load_8888_dst_sse41_8bit+0x2e>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,78,0,0,0 ; lea 0x4e(%rip),%rcx # 3f8 <_sk_load_8888_dst_sse41_8bit+0xa0>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 102,66,15,110,20,130 ; movd (%rdx,%r8,4),%xmm2
+ DB 235,203 ; jmp 386 <_sk_load_8888_dst_sse41_8bit+0x2e>
+ DB 102,66,15,110,84,130,8 ; movd 0x8(%rdx,%r8,4),%xmm2
+ DB 102,15,112,210,69 ; pshufd $0x45,%xmm2,%xmm2
+ DB 243,66,15,126,36,130 ; movq (%rdx,%r8,4),%xmm4
+ DB 102,15,58,14,212,15 ; pblendw $0xf,%xmm4,%xmm2
+ DB 235,177 ; jmp 386 <_sk_load_8888_dst_sse41_8bit+0x2e>
+ DB 102,66,15,110,84,130,24 ; movd 0x18(%rdx,%r8,4),%xmm2
+ DB 102,15,112,218,69 ; pshufd $0x45,%xmm2,%xmm3
+ DB 102,66,15,58,34,92,130,20,1 ; pinsrd $0x1,0x14(%rdx,%r8,4),%xmm3
+ DB 102,66,15,58,34,92,130,16,0 ; pinsrd $0x0,0x10(%rdx,%r8,4),%xmm3
+ DB 235,139 ; jmp 380 <_sk_load_8888_dst_sse41_8bit+0x28>
+ DB 15,31,0 ; nopl (%rax)
+ DB 187,255,255,255,207 ; mov $0xcfffffff,%ebx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,195 ; inc %ebx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,136,255,255,255,242 ; decl -0xd000001(%rax)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 233,255,255,255,221 ; jmpq ffffffffde000410 <_sk_xor__sse41_8bit+0xffffffffddffe778>
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_store_8888_sse41_8bit
_sk_store_8888_sse41_8bit LABEL PROC
- DB 76,99,7 ; movslq (%rdi),%r8
- DB 76,139,79,16 ; mov 0x10(%rdi),%r9
+ DB 76,99,15 ; movslq (%rdi),%r9
+ DB 76,139,71,16 ; mov 0x10(%rdi),%r8
DB 72,173 ; lods %ds:(%rsi),%rax
DB 72,99,72,8 ; movslq 0x8(%rax),%rcx
DB 72,99,87,8 ; movslq 0x8(%rdi),%rdx
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,193,226,2 ; shl $0x2,%rdx
DB 72,3,16 ; add (%rax),%rdx
- DB 77,133,201 ; test %r9,%r9
- DB 117,10 ; jne 327 <_sk_store_8888_sse41_8bit+0x2b>
- DB 243,66,15,127,4,130 ; movdqu %xmm0,(%rdx,%r8,4)
- DB 72,173 ; lods %ds:(%rsi),%rax
- DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,29 ; je 34e <_sk_store_8888_sse41_8bit+0x52>
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,15 ; je 346 <_sk_store_8888_sse41_8bit+0x4a>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,230 ; jne 323 <_sk_store_8888_sse41_8bit+0x27>
- DB 102,66,15,58,22,68,130,8,2 ; pextrd $0x2,%xmm0,0x8(%rdx,%r8,4)
- DB 102,66,15,214,4,130 ; movq %xmm0,(%rdx,%r8,4)
- DB 235,213 ; jmp 323 <_sk_store_8888_sse41_8bit+0x27>
- DB 102,66,15,126,4,130 ; movd %xmm0,(%rdx,%r8,4)
- DB 235,205 ; jmp 323 <_sk_store_8888_sse41_8bit+0x27>
+ DB 77,133,192 ; test %r8,%r8
+ DB 117,17 ; jne 446 <_sk_store_8888_sse41_8bit+0x32>
+ DB 243,66,15,127,4,138 ; movdqu %xmm0,(%rdx,%r9,4)
+ DB 243,66,15,127,76,138,16 ; movdqu %xmm1,0x10(%rdx,%r9,4)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+ DB 65,128,224,7 ; and $0x7,%r8b
+ DB 65,254,200 ; dec %r8b
+ DB 65,128,248,6 ; cmp $0x6,%r8b
+ DB 119,239 ; ja 442 <_sk_store_8888_sse41_8bit+0x2e>
+ DB 65,15,182,192 ; movzbl %r8b,%eax
+ DB 72,141,13,70,0,0,0 ; lea 0x46(%rip),%rcx # 4a4 <_sk_store_8888_sse41_8bit+0x90>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 102,66,15,126,4,138 ; movd %xmm0,(%rdx,%r9,4)
+ DB 235,211 ; jmp 442 <_sk_store_8888_sse41_8bit+0x2e>
+ DB 102,66,15,58,22,68,138,8,2 ; pextrd $0x2,%xmm0,0x8(%rdx,%r9,4)
+ DB 102,66,15,214,4,138 ; movq %xmm0,(%rdx,%r9,4)
+ DB 235,194 ; jmp 442 <_sk_store_8888_sse41_8bit+0x2e>
+ DB 102,66,15,58,22,76,138,24,2 ; pextrd $0x2,%xmm1,0x18(%rdx,%r9,4)
+ DB 102,66,15,58,22,76,138,20,1 ; pextrd $0x1,%xmm1,0x14(%rdx,%r9,4)
+ DB 102,66,15,126,76,138,16 ; movd %xmm1,0x10(%rdx,%r9,4)
+ DB 243,66,15,127,4,138 ; movdqu %xmm0,(%rdx,%r9,4)
+ DB 235,161 ; jmp 442 <_sk_store_8888_sse41_8bit+0x2e>
+ DB 15,31,0 ; nopl (%rax)
+ DB 195 ; retq
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,212 ; callq *%rsp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,203 ; dec %ebx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,245 ; push %rbp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 238 ; out %al,(%dx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,229 ; jmpq *%rbp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 220,255 ; fdivr %st,%st(7)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_load_bgra_sse41_8bit
_sk_load_bgra_sse41_8bit LABEL PROC
@@ -40567,26 +41838,55 @@ _sk_load_bgra_sse41_8bit LABEL PROC
DB 72,193,226,2 ; shl $0x2,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,19 ; jne 38a <_sk_load_bgra_sse41_8bit+0x34>
+ DB 117,35 ; jne 504 <_sk_load_bgra_sse41_8bit+0x44>
+ DB 243,66,15,111,76,130,16 ; movdqu 0x10(%rdx,%r8,4),%xmm1
DB 243,66,15,111,4,130 ; movdqu (%rdx,%r8,4),%xmm0
- DB 102,15,56,0,5,154,12,0,0 ; pshufb 0xc9a(%rip),%xmm0 # 1020 <_sk_xor__sse41_8bit+0xf9>
+ DB 102,15,111,37,58,25,0,0 ; movdqa 0x193a(%rip),%xmm4 # 1e30 <_sk_xor__sse41_8bit+0x198>
+ DB 102,15,56,0,196 ; pshufb %xmm4,%xmm0
+ DB 102,15,56,0,204 ; pshufb %xmm4,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,42 ; je 3be <_sk_load_bgra_sse41_8bit+0x68>
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,201 ; pxor %xmm1,%xmm1
DB 102,15,239,192 ; pxor %xmm0,%xmm0
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,18 ; je 3b0 <_sk_load_bgra_sse41_8bit+0x5a>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,217 ; jne 37d <_sk_load_bgra_sse41_8bit+0x27>
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,213 ; ja 4ee <_sk_load_bgra_sse41_8bit+0x2e>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,80,0,0,0 ; lea 0x50(%rip),%rcx # 574 <_sk_load_bgra_sse41_8bit+0xb4>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 102,66,15,110,4,130 ; movd (%rdx,%r8,4),%xmm0
+ DB 235,185 ; jmp 4ee <_sk_load_bgra_sse41_8bit+0x2e>
DB 102,66,15,110,68,130,8 ; movd 0x8(%rdx,%r8,4),%xmm0
DB 102,15,112,192,69 ; pshufd $0x45,%xmm0,%xmm0
- DB 243,66,15,126,20,130 ; movq (%rdx,%r8,4),%xmm2
- DB 102,15,58,14,194,15 ; pblendw $0xf,%xmm2,%xmm0
- DB 235,191 ; jmp 37d <_sk_load_bgra_sse41_8bit+0x27>
- DB 102,66,15,110,4,130 ; movd (%rdx,%r8,4),%xmm0
- DB 235,183 ; jmp 37d <_sk_load_bgra_sse41_8bit+0x27>
+ DB 243,66,15,126,36,130 ; movq (%rdx,%r8,4),%xmm4
+ DB 102,15,58,14,196,15 ; pblendw $0xf,%xmm4,%xmm0
+ DB 235,159 ; jmp 4ee <_sk_load_bgra_sse41_8bit+0x2e>
+ DB 102,66,15,110,68,130,24 ; movd 0x18(%rdx,%r8,4),%xmm0
+ DB 102,15,112,200,69 ; pshufd $0x45,%xmm0,%xmm1
+ DB 102,66,15,58,34,76,130,20,1 ; pinsrd $0x1,0x14(%rdx,%r8,4),%xmm1
+ DB 102,66,15,58,34,76,130,16,0 ; pinsrd $0x0,0x10(%rdx,%r8,4),%xmm1
+ DB 233,118,255,255,255 ; jmpq 4e8 <_sk_load_bgra_sse41_8bit+0x28>
+ DB 102,144 ; xchg %ax,%ax
+ DB 185,255,255,255,205 ; mov $0xcdffffff,%ecx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,193 ; inc %ecx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,116,255,255 ; pushq -0x1(%rdi,%rdi,8)
+ DB 255,240 ; push %rax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,231 ; jmpq *%rdi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 219,255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_load_bgra_dst_sse41_8bit
_sk_load_bgra_dst_sse41_8bit LABEL PROC
@@ -40599,56 +41899,117 @@ _sk_load_bgra_dst_sse41_8bit LABEL PROC
DB 72,193,226,2 ; shl $0x2,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,19 ; jne 3fa <_sk_load_bgra_dst_sse41_8bit+0x34>
- DB 243,66,15,111,12,130 ; movdqu (%rdx,%r8,4),%xmm1
- DB 102,15,56,0,13,58,12,0,0 ; pshufb 0xc3a(%rip),%xmm1 # 1030 <_sk_xor__sse41_8bit+0x109>
+ DB 117,35 ; jne 5d4 <_sk_load_bgra_dst_sse41_8bit+0x44>
+ DB 243,66,15,111,92,130,16 ; movdqu 0x10(%rdx,%r8,4),%xmm3
+ DB 243,66,15,111,20,130 ; movdqu (%rdx,%r8,4),%xmm2
+ DB 102,15,111,37,122,24,0,0 ; movdqa 0x187a(%rip),%xmm4 # 1e40 <_sk_xor__sse41_8bit+0x1a8>
+ DB 102,15,56,0,212 ; pshufb %xmm4,%xmm2
+ DB 102,15,56,0,220 ; pshufb %xmm4,%xmm3
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,42 ; je 42e <_sk_load_bgra_dst_sse41_8bit+0x68>
- DB 102,15,239,201 ; pxor %xmm1,%xmm1
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,18 ; je 420 <_sk_load_bgra_dst_sse41_8bit+0x5a>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,217 ; jne 3ed <_sk_load_bgra_dst_sse41_8bit+0x27>
- DB 102,66,15,110,76,130,8 ; movd 0x8(%rdx,%r8,4),%xmm1
- DB 102,15,112,201,69 ; pshufd $0x45,%xmm1,%xmm1
- DB 243,66,15,126,20,130 ; movq (%rdx,%r8,4),%xmm2
- DB 102,15,58,14,202,15 ; pblendw $0xf,%xmm2,%xmm1
- DB 235,191 ; jmp 3ed <_sk_load_bgra_dst_sse41_8bit+0x27>
- DB 102,66,15,110,12,130 ; movd (%rdx,%r8,4),%xmm1
- DB 235,183 ; jmp 3ed <_sk_load_bgra_dst_sse41_8bit+0x27>
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,219 ; pxor %xmm3,%xmm3
+ DB 102,15,239,210 ; pxor %xmm2,%xmm2
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,213 ; ja 5be <_sk_load_bgra_dst_sse41_8bit+0x2e>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,80,0,0,0 ; lea 0x50(%rip),%rcx # 644 <_sk_load_bgra_dst_sse41_8bit+0xb4>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 102,66,15,110,20,130 ; movd (%rdx,%r8,4),%xmm2
+ DB 235,185 ; jmp 5be <_sk_load_bgra_dst_sse41_8bit+0x2e>
+ DB 102,66,15,110,84,130,8 ; movd 0x8(%rdx,%r8,4),%xmm2
+ DB 102,15,112,210,69 ; pshufd $0x45,%xmm2,%xmm2
+ DB 243,66,15,126,36,130 ; movq (%rdx,%r8,4),%xmm4
+ DB 102,15,58,14,212,15 ; pblendw $0xf,%xmm4,%xmm2
+ DB 235,159 ; jmp 5be <_sk_load_bgra_dst_sse41_8bit+0x2e>
+ DB 102,66,15,110,84,130,24 ; movd 0x18(%rdx,%r8,4),%xmm2
+ DB 102,15,112,218,69 ; pshufd $0x45,%xmm2,%xmm3
+ DB 102,66,15,58,34,92,130,20,1 ; pinsrd $0x1,0x14(%rdx,%r8,4),%xmm3
+ DB 102,66,15,58,34,92,130,16,0 ; pinsrd $0x0,0x10(%rdx,%r8,4),%xmm3
+ DB 233,118,255,255,255 ; jmpq 5b8 <_sk_load_bgra_dst_sse41_8bit+0x28>
+ DB 102,144 ; xchg %ax,%ax
+ DB 185,255,255,255,205 ; mov $0xcdffffff,%ecx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,193 ; inc %ecx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,116,255,255 ; pushq -0x1(%rdi,%rdi,8)
+ DB 255,240 ; push %rax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,231 ; jmpq *%rdi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 219,255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_store_bgra_sse41_8bit
_sk_store_bgra_sse41_8bit LABEL PROC
- DB 76,99,7 ; movslq (%rdi),%r8
- DB 76,139,79,16 ; mov 0x10(%rdi),%r9
+ DB 76,99,15 ; movslq (%rdi),%r9
+ DB 76,139,71,16 ; mov 0x10(%rdi),%r8
DB 72,173 ; lods %ds:(%rsi),%rax
DB 72,99,72,8 ; movslq 0x8(%rax),%rcx
DB 72,99,87,8 ; movslq 0x8(%rdi),%rdx
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,193,226,2 ; shl $0x2,%rdx
DB 72,3,16 ; add (%rax),%rdx
- DB 102,15,111,208 ; movdqa %xmm0,%xmm2
- DB 102,15,56,0,21,225,11,0,0 ; pshufb 0xbe1(%rip),%xmm2 # 1040 <_sk_xor__sse41_8bit+0x119>
- DB 77,133,201 ; test %r9,%r9
- DB 117,10 ; jne 46e <_sk_store_bgra_sse41_8bit+0x38>
- DB 243,66,15,127,20,130 ; movdqu %xmm2,(%rdx,%r8,4)
- DB 72,173 ; lods %ds:(%rsi),%rax
- DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,29 ; je 495 <_sk_store_bgra_sse41_8bit+0x5f>
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,15 ; je 48d <_sk_store_bgra_sse41_8bit+0x57>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,230 ; jne 46a <_sk_store_bgra_sse41_8bit+0x34>
- DB 102,66,15,58,22,84,130,8,2 ; pextrd $0x2,%xmm2,0x8(%rdx,%r8,4)
- DB 102,66,15,214,20,130 ; movq %xmm2,(%rdx,%r8,4)
- DB 235,213 ; jmp 46a <_sk_store_bgra_sse41_8bit+0x34>
- DB 102,66,15,126,20,130 ; movd %xmm2,(%rdx,%r8,4)
- DB 235,205 ; jmp 46a <_sk_store_bgra_sse41_8bit+0x34>
+ DB 102,15,111,53,204,23,0,0 ; movdqa 0x17cc(%rip),%xmm6 # 1e50 <_sk_xor__sse41_8bit+0x1b8>
+ DB 102,15,111,233 ; movdqa %xmm1,%xmm5
+ DB 102,15,56,0,238 ; pshufb %xmm6,%xmm5
+ DB 102,15,111,224 ; movdqa %xmm0,%xmm4
+ DB 102,15,56,0,230 ; pshufb %xmm6,%xmm4
+ DB 77,133,192 ; test %r8,%r8
+ DB 117,17 ; jne 6ac <_sk_store_bgra_sse41_8bit+0x4c>
+ DB 243,66,15,127,36,138 ; movdqu %xmm4,(%rdx,%r9,4)
+ DB 243,66,15,127,108,138,16 ; movdqu %xmm5,0x10(%rdx,%r9,4)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+ DB 65,128,224,7 ; and $0x7,%r8b
+ DB 65,254,200 ; dec %r8b
+ DB 65,128,248,6 ; cmp $0x6,%r8b
+ DB 119,239 ; ja 6a8 <_sk_store_bgra_sse41_8bit+0x48>
+ DB 65,15,182,192 ; movzbl %r8b,%eax
+ DB 72,141,13,68,0,0,0 ; lea 0x44(%rip),%rcx # 708 <_sk_store_bgra_sse41_8bit+0xa8>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 102,66,15,126,36,138 ; movd %xmm4,(%rdx,%r9,4)
+ DB 235,211 ; jmp 6a8 <_sk_store_bgra_sse41_8bit+0x48>
+ DB 102,66,15,58,22,100,138,8,2 ; pextrd $0x2,%xmm4,0x8(%rdx,%r9,4)
+ DB 102,66,15,214,36,138 ; movq %xmm4,(%rdx,%r9,4)
+ DB 235,194 ; jmp 6a8 <_sk_store_bgra_sse41_8bit+0x48>
+ DB 102,66,15,58,22,108,138,24,2 ; pextrd $0x2,%xmm5,0x18(%rdx,%r9,4)
+ DB 102,66,15,58,22,108,138,20,1 ; pextrd $0x1,%xmm5,0x14(%rdx,%r9,4)
+ DB 102,66,15,126,108,138,16 ; movd %xmm5,0x10(%rdx,%r9,4)
+ DB 243,66,15,127,36,138 ; movdqu %xmm4,(%rdx,%r9,4)
+ DB 235,161 ; jmp 6a8 <_sk_store_bgra_sse41_8bit+0x48>
+ DB 144 ; nop
+ DB 197,255,255 ; (bad)
+ DB 255,214 ; callq *%rsi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,205 ; dec %ebp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,247 ; push %rdi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,240 ; push %rax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,231 ; jmpq *%rdi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 222,255 ; fdivrp %st,%st(7)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_load_a8_sse41_8bit
_sk_load_a8_sse41_8bit LABEL PROC
@@ -40660,30 +42021,66 @@ _sk_load_a8_sse41_8bit LABEL PROC
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,16 ; jne 4ca <_sk_load_a8_sse41_8bit+0x2d>
- DB 102,66,15,56,49,4,2 ; pmovzxbd (%rdx,%r8,1),%xmm0
+ DB 117,42 ; jne 76b <_sk_load_a8_sse41_8bit+0x47>
+ DB 102,66,15,56,48,12,2 ; pmovzxbw (%rdx,%r8,1),%xmm1
+ DB 102,15,219,13,16,23,0,0 ; pand 0x1710(%rip),%xmm1 # 1e60 <_sk_xor__sse41_8bit+0x1c8>
+ DB 102,15,239,228 ; pxor %xmm4,%xmm4
+ DB 102,15,56,51,193 ; pmovzxwd %xmm1,%xmm0
+ DB 102,15,105,204 ; punpckhwd %xmm4,%xmm1
+ DB 102,15,114,241,24 ; pslld $0x18,%xmm1
DB 102,15,114,240,24 ; pslld $0x18,%xmm0
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,53 ; je 509 <_sk_load_a8_sse41_8bit+0x6c>
- DB 102,15,239,192 ; pxor %xmm0,%xmm0
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,21 ; je 4f3 <_sk_load_a8_sse41_8bit+0x56>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,221 ; jne 4c1 <_sk_load_a8_sse41_8bit+0x24>
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,201 ; pxor %xmm1,%xmm1
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,204 ; ja 748 <_sk_load_a8_sse41_8bit+0x24>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,117,0,0,0 ; lea 0x75(%rip),%rcx # 7fc <_sk_load_a8_sse41_8bit+0xd8>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
+ DB 102,15,110,200 ; movd %eax,%xmm1
+ DB 235,173 ; jmp 748 <_sk_load_a8_sse41_8bit+0x24>
DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
- DB 102,15,110,192 ; movd %eax,%xmm0
- DB 102,15,112,192,69 ; pshufd $0x45,%xmm0,%xmm0
+ DB 102,15,239,201 ; pxor %xmm1,%xmm1
+ DB 102,15,196,200,2 ; pinsrw $0x2,%eax,%xmm1
DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
- DB 102,15,110,208 ; movd %eax,%xmm2
- DB 102,15,56,49,210 ; pmovzxbd %xmm2,%xmm2
- DB 102,15,58,14,194,15 ; pblendw $0xf,%xmm2,%xmm0
- DB 235,184 ; jmp 4c1 <_sk_load_a8_sse41_8bit+0x24>
- DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
DB 102,15,110,192 ; movd %eax,%xmm0
- DB 235,173 ; jmp 4c1 <_sk_load_a8_sse41_8bit+0x24>
+ DB 102,15,56,48,192 ; pmovzxbw %xmm0,%xmm0
+ DB 102,15,58,14,200,3 ; pblendw $0x3,%xmm0,%xmm1
+ DB 235,136 ; jmp 748 <_sk_load_a8_sse41_8bit+0x24>
+ DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax
+ DB 102,15,239,201 ; pxor %xmm1,%xmm1
+ DB 102,15,196,200,6 ; pinsrw $0x6,%eax,%xmm1
+ DB 66,15,182,68,2,5 ; movzbl 0x5(%rdx,%r8,1),%eax
+ DB 102,15,196,200,5 ; pinsrw $0x5,%eax,%xmm1
+ DB 66,15,182,68,2,4 ; movzbl 0x4(%rdx,%r8,1),%eax
+ DB 102,15,196,200,4 ; pinsrw $0x4,%eax,%xmm1
+ DB 102,66,15,110,4,2 ; movd (%rdx,%r8,1),%xmm0
+ DB 102,15,56,48,192 ; pmovzxbw %xmm0,%xmm0
+ DB 102,15,58,14,200,15 ; pblendw $0xf,%xmm0,%xmm1
+ DB 233,77,255,255,255 ; jmpq 748 <_sk_load_a8_sse41_8bit+0x24>
+ DB 144 ; nop
+ DB 148 ; xchg %eax,%esp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,174,255,255,255,159 ; ljmp *-0x60000001(%rsi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 233,255,255,255,222 ; jmpq ffffffffdf00080c <_sk_xor__sse41_8bit+0xffffffffdeffeb74>
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,211 ; callq *%rbx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,196 ; inc %esp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
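load_a8 widens eight coverage bytes and shifts them into the top byte of each pixel, so the result is alpha-only color. A one-line scalar sketch:

    #include <cstdint>

    // Each a8 byte becomes a pixel with that alpha and zero r,g,b (the pslld $0x18 above).
    static inline uint32_t load_a8_pixel(uint8_t a) {
        return uint32_t(a) << 24;
    }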
PUBLIC _sk_load_a8_dst_sse41_8bit
_sk_load_a8_dst_sse41_8bit LABEL PROC
@@ -40695,30 +42092,66 @@ _sk_load_a8_dst_sse41_8bit LABEL PROC
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,16 ; jne 541 <_sk_load_a8_dst_sse41_8bit+0x2d>
- DB 102,66,15,56,49,12,2 ; pmovzxbd (%rdx,%r8,1),%xmm1
- DB 102,15,114,241,24 ; pslld $0x18,%xmm1
+ DB 117,42 ; jne 85f <_sk_load_a8_dst_sse41_8bit+0x47>
+ DB 102,66,15,56,48,28,2 ; pmovzxbw (%rdx,%r8,1),%xmm3
+ DB 102,15,219,29,44,22,0,0 ; pand 0x162c(%rip),%xmm3 # 1e70 <_sk_xor__sse41_8bit+0x1d8>
+ DB 102,15,239,228 ; pxor %xmm4,%xmm4
+ DB 102,15,56,51,211 ; pmovzxwd %xmm3,%xmm2
+ DB 102,15,105,220 ; punpckhwd %xmm4,%xmm3
+ DB 102,15,114,243,24 ; pslld $0x18,%xmm3
+ DB 102,15,114,242,24 ; pslld $0x18,%xmm2
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,53 ; je 580 <_sk_load_a8_dst_sse41_8bit+0x6c>
- DB 102,15,239,201 ; pxor %xmm1,%xmm1
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,21 ; je 56a <_sk_load_a8_dst_sse41_8bit+0x56>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,221 ; jne 538 <_sk_load_a8_dst_sse41_8bit+0x24>
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,219 ; pxor %xmm3,%xmm3
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,204 ; ja 83c <_sk_load_a8_dst_sse41_8bit+0x24>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,117,0,0,0 ; lea 0x75(%rip),%rcx # 8f0 <_sk_load_a8_dst_sse41_8bit+0xd8>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
+ DB 102,15,110,216 ; movd %eax,%xmm3
+ DB 235,173 ; jmp 83c <_sk_load_a8_dst_sse41_8bit+0x24>
DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
- DB 102,15,110,200 ; movd %eax,%xmm1
- DB 102,15,112,201,69 ; pshufd $0x45,%xmm1,%xmm1
+ DB 102,15,239,219 ; pxor %xmm3,%xmm3
+ DB 102,15,196,216,2 ; pinsrw $0x2,%eax,%xmm3
DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
DB 102,15,110,208 ; movd %eax,%xmm2
- DB 102,15,56,49,210 ; pmovzxbd %xmm2,%xmm2
- DB 102,15,58,14,202,15 ; pblendw $0xf,%xmm2,%xmm1
- DB 235,184 ; jmp 538 <_sk_load_a8_dst_sse41_8bit+0x24>
- DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
- DB 102,15,110,200 ; movd %eax,%xmm1
- DB 235,173 ; jmp 538 <_sk_load_a8_dst_sse41_8bit+0x24>
+ DB 102,15,56,48,210 ; pmovzxbw %xmm2,%xmm2
+ DB 102,15,58,14,218,3 ; pblendw $0x3,%xmm2,%xmm3
+ DB 235,136 ; jmp 83c <_sk_load_a8_dst_sse41_8bit+0x24>
+ DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax
+ DB 102,15,239,219 ; pxor %xmm3,%xmm3
+ DB 102,15,196,216,6 ; pinsrw $0x6,%eax,%xmm3
+ DB 66,15,182,68,2,5 ; movzbl 0x5(%rdx,%r8,1),%eax
+ DB 102,15,196,216,5 ; pinsrw $0x5,%eax,%xmm3
+ DB 66,15,182,68,2,4 ; movzbl 0x4(%rdx,%r8,1),%eax
+ DB 102,15,196,216,4 ; pinsrw $0x4,%eax,%xmm3
+ DB 102,66,15,110,20,2 ; movd (%rdx,%r8,1),%xmm2
+ DB 102,15,56,48,210 ; pmovzxbw %xmm2,%xmm2
+ DB 102,15,58,14,218,15 ; pblendw $0xf,%xmm2,%xmm3
+ DB 233,77,255,255,255 ; jmpq 83c <_sk_load_a8_dst_sse41_8bit+0x24>
+ DB 144 ; nop
+ DB 148 ; xchg %eax,%esp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,174,255,255,255,159 ; ljmp *-0x60000001(%rsi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 233,255,255,255,222 ; jmpq ffffffffdf000900 <_sk_xor__sse41_8bit+0xffffffffdeffec68>
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,211 ; callq *%rbx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,196 ; inc %esp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_store_a8_sse41_8bit
_sk_store_a8_sse41_8bit LABEL PROC
@@ -40729,27 +42162,61 @@ _sk_store_a8_sse41_8bit LABEL PROC
DB 72,99,87,8 ; movslq 0x8(%rdi),%rdx
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
- DB 102,15,111,208 ; movdqa %xmm0,%xmm2
- DB 102,15,114,210,24 ; psrld $0x18,%xmm2
+ DB 102,15,111,45,84,21,0,0 ; movdqa 0x1554(%rip),%xmm5 # 1e80 <_sk_xor__sse41_8bit+0x1e8>
+ DB 102,15,111,241 ; movdqa %xmm1,%xmm6
+ DB 102,15,56,0,245 ; pshufb %xmm5,%xmm6
+ DB 102,15,111,224 ; movdqa %xmm0,%xmm4
+ DB 102,15,56,0,229 ; pshufb %xmm5,%xmm4
+ DB 102,15,108,230 ; punpcklqdq %xmm6,%xmm4
DB 77,133,201 ; test %r9,%r9
- DB 117,19 ; jne 5c4 <_sk_store_a8_sse41_8bit+0x39>
- DB 102,15,56,0,21,166,10,0,0 ; pshufb 0xaa6(%rip),%xmm2 # 1060 <_sk_xor__sse41_8bit+0x139>
- DB 102,66,15,126,20,2 ; movd %xmm2,(%rdx,%r8,1)
- DB 72,173 ; lods %ds:(%rsi),%rax
- DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,40 ; je 5f6 <_sk_store_a8_sse41_8bit+0x6b>
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,15 ; je 5e3 <_sk_store_a8_sse41_8bit+0x58>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,230 ; jne 5c0 <_sk_store_a8_sse41_8bit+0x35>
- DB 102,66,15,58,20,84,2,2,8 ; pextrb $0x8,%xmm2,0x2(%rdx,%r8,1)
- DB 102,15,56,0,21,100,10,0,0 ; pshufb 0xa64(%rip),%xmm2 # 1050 <_sk_xor__sse41_8bit+0x129>
- DB 102,66,15,58,21,20,2,0 ; pextrw $0x0,%xmm2,(%rdx,%r8,1)
- DB 235,202 ; jmp 5c0 <_sk_store_a8_sse41_8bit+0x35>
- DB 102,66,15,58,20,20,2,0 ; pextrb $0x0,%xmm2,(%rdx,%r8,1)
- DB 235,192 ; jmp 5c0 <_sk_store_a8_sse41_8bit+0x35>
+ DB 117,19 ; jne 95a <_sk_store_a8_sse41_8bit+0x4e>
+ DB 102,15,56,0,37,96,21,0,0 ; pshufb 0x1560(%rip),%xmm4 # 1eb0 <_sk_xor__sse41_8bit+0x218>
+ DB 102,66,15,214,36,2 ; movq %xmm4,(%rdx,%r8,1)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,239 ; ja 956 <_sk_store_a8_sse41_8bit+0x4a>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,94,0,0,0 ; lea 0x5e(%rip),%rcx # 9d0 <_sk_store_a8_sse41_8bit+0xc4>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 102,66,15,58,20,36,2,0 ; pextrb $0x0,%xmm4,(%rdx,%r8,1)
+ DB 235,209 ; jmp 956 <_sk_store_a8_sse41_8bit+0x4a>
+ DB 102,66,15,58,20,100,2,2,4 ; pextrb $0x4,%xmm4,0x2(%rdx,%r8,1)
+ DB 102,15,56,0,37,9,21,0,0 ; pshufb 0x1509(%rip),%xmm4 # 1ea0 <_sk_xor__sse41_8bit+0x208>
+ DB 102,66,15,58,21,36,2,0 ; pextrw $0x0,%xmm4,(%rdx,%r8,1)
+ DB 235,181 ; jmp 956 <_sk_store_a8_sse41_8bit+0x4a>
+ DB 102,66,15,58,20,100,2,6,12 ; pextrb $0xc,%xmm4,0x6(%rdx,%r8,1)
+ DB 102,66,15,58,20,100,2,5,10 ; pextrb $0xa,%xmm4,0x5(%rdx,%r8,1)
+ DB 102,66,15,58,20,100,2,4,8 ; pextrb $0x8,%xmm4,0x4(%rdx,%r8,1)
+ DB 102,15,56,0,37,203,20,0,0 ; pshufb 0x14cb(%rip),%xmm4 # 1e90 <_sk_xor__sse41_8bit+0x1f8>
+ DB 102,66,15,126,36,2 ; movd %xmm4,(%rdx,%r8,1)
+ DB 235,137 ; jmp 956 <_sk_store_a8_sse41_8bit+0x4a>
+ DB 15,31,0 ; nopl (%rax)
+ DB 171 ; stos %eax,%es:(%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 190,255,255,255,181 ; mov $0xb5ffffff,%esi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 236 ; in (%dx),%al
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,227 ; jmpq *%rbx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 218,255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,209 ; callq *%rcx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_load_g8_sse41_8bit
_sk_load_g8_sse41_8bit LABEL PROC
@@ -40761,32 +42228,70 @@ _sk_load_g8_sse41_8bit LABEL PROC
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,36 ; jne 641 <_sk_load_g8_sse41_8bit+0x41>
- DB 102,66,15,56,49,4,2 ; pmovzxbd (%rdx,%r8,1),%xmm0
- DB 102,15,219,5,68,10,0,0 ; pand 0xa44(%rip),%xmm0 # 1070 <_sk_xor__sse41_8bit+0x149>
- DB 102,15,56,64,5,75,10,0,0 ; pmulld 0xa4b(%rip),%xmm0 # 1080 <_sk_xor__sse41_8bit+0x159>
- DB 102,15,235,5,83,10,0,0 ; por 0xa53(%rip),%xmm0 # 1090 <_sk_xor__sse41_8bit+0x169>
+ DB 117,66 ; jne a4b <_sk_load_g8_sse41_8bit+0x5f>
+ DB 102,66,15,56,48,12,2 ; pmovzxbw (%rdx,%r8,1),%xmm1
+ DB 102,15,219,13,168,20,0,0 ; pand 0x14a8(%rip),%xmm1 # 1ec0 <_sk_xor__sse41_8bit+0x228>
+ DB 102,15,239,228 ; pxor %xmm4,%xmm4
+ DB 102,15,56,51,193 ; pmovzxwd %xmm1,%xmm0
+ DB 102,15,105,204 ; punpckhwd %xmm4,%xmm1
+ DB 102,15,111,37,163,20,0,0 ; movdqa 0x14a3(%rip),%xmm4 # 1ed0 <_sk_xor__sse41_8bit+0x238>
+ DB 102,15,56,64,204 ; pmulld %xmm4,%xmm1
+ DB 102,15,56,64,196 ; pmulld %xmm4,%xmm0
+ DB 102,15,111,37,161,20,0,0 ; movdqa 0x14a1(%rip),%xmm4 # 1ee0 <_sk_xor__sse41_8bit+0x248>
+ DB 102,15,235,196 ; por %xmm4,%xmm0
+ DB 102,15,235,204 ; por %xmm4,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,53 ; je 680 <_sk_load_g8_sse41_8bit+0x80>
- DB 102,15,239,192 ; pxor %xmm0,%xmm0
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,21 ; je 66a <_sk_load_g8_sse41_8bit+0x6a>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,201 ; jne 624 <_sk_load_g8_sse41_8bit+0x24>
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,201 ; pxor %xmm1,%xmm1
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,180 ; ja a10 <_sk_load_g8_sse41_8bit+0x24>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,121,0,0,0 ; lea 0x79(%rip),%rcx # ae0 <_sk_load_g8_sse41_8bit+0xf4>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
+ DB 102,15,110,200 ; movd %eax,%xmm1
+ DB 235,149 ; jmp a10 <_sk_load_g8_sse41_8bit+0x24>
DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
- DB 102,15,110,192 ; movd %eax,%xmm0
- DB 102,15,112,192,69 ; pshufd $0x45,%xmm0,%xmm0
+ DB 102,15,239,201 ; pxor %xmm1,%xmm1
+ DB 102,15,196,200,2 ; pinsrw $0x2,%eax,%xmm1
DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
- DB 102,15,110,208 ; movd %eax,%xmm2
- DB 102,15,56,49,210 ; pmovzxbd %xmm2,%xmm2
- DB 102,15,58,14,194,15 ; pblendw $0xf,%xmm2,%xmm0
- DB 235,164 ; jmp 624 <_sk_load_g8_sse41_8bit+0x24>
- DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
DB 102,15,110,192 ; movd %eax,%xmm0
- DB 235,153 ; jmp 624 <_sk_load_g8_sse41_8bit+0x24>
+ DB 102,15,56,48,192 ; pmovzxbw %xmm0,%xmm0
+ DB 102,15,58,14,200,3 ; pblendw $0x3,%xmm0,%xmm1
+ DB 233,109,255,255,255 ; jmpq a10 <_sk_load_g8_sse41_8bit+0x24>
+ DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax
+ DB 102,15,239,201 ; pxor %xmm1,%xmm1
+ DB 102,15,196,200,6 ; pinsrw $0x6,%eax,%xmm1
+ DB 66,15,182,68,2,5 ; movzbl 0x5(%rdx,%r8,1),%eax
+ DB 102,15,196,200,5 ; pinsrw $0x5,%eax,%xmm1
+ DB 66,15,182,68,2,4 ; movzbl 0x4(%rdx,%r8,1),%eax
+ DB 102,15,196,200,4 ; pinsrw $0x4,%eax,%xmm1
+ DB 102,66,15,110,4,2 ; movd (%rdx,%r8,1),%xmm0
+ DB 102,15,56,48,192 ; pmovzxbw %xmm0,%xmm0
+ DB 102,15,58,14,200,15 ; pblendw $0xf,%xmm0,%xmm1
+ DB 233,50,255,255,255 ; jmpq a10 <_sk_load_g8_sse41_8bit+0x24>
+ DB 102,144 ; xchg %ax,%ax
+ DB 144 ; nop
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,170,255,255,255,155 ; ljmp *-0x64000001(%rdx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 232,255,255,255,221 ; callq ffffffffde000af0 <_sk_xor__sse41_8bit+0xffffffffddffee58>
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,210 ; callq *%rdx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,195 ; inc %ebx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
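load_g8 widens the gray bytes, multiplies by one per-lane constant and ors in another; both constants live in the trailing data table and are assumed here to be 0x00010101 and 0xff000000, which is what replicating gray into r, g, b and forcing opaque alpha requires:

    #include <cstdint>

    // g8 -> opaque gray pixel: gray replicated into the three color bytes.
    static inline uint32_t load_g8_pixel(uint8_t g) {
        return 0xff000000u | uint32_t(g) * 0x00010101u;
    }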
PUBLIC _sk_load_g8_dst_sse41_8bit
_sk_load_g8_dst_sse41_8bit LABEL PROC
@@ -40798,118 +42303,233 @@ _sk_load_g8_dst_sse41_8bit LABEL PROC
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,36 ; jne 6cc <_sk_load_g8_dst_sse41_8bit+0x41>
- DB 102,66,15,56,49,12,2 ; pmovzxbd (%rdx,%r8,1),%xmm1
- DB 102,15,219,13,233,9,0,0 ; pand 0x9e9(%rip),%xmm1 # 10a0 <_sk_xor__sse41_8bit+0x179>
- DB 102,15,56,64,13,240,9,0,0 ; pmulld 0x9f0(%rip),%xmm1 # 10b0 <_sk_xor__sse41_8bit+0x189>
- DB 102,15,235,13,248,9,0,0 ; por 0x9f8(%rip),%xmm1 # 10c0 <_sk_xor__sse41_8bit+0x199>
+ DB 117,66 ; jne b5b <_sk_load_g8_dst_sse41_8bit+0x5f>
+ DB 102,66,15,56,48,28,2 ; pmovzxbw (%rdx,%r8,1),%xmm3
+ DB 102,15,219,29,200,19,0,0 ; pand 0x13c8(%rip),%xmm3 # 1ef0 <_sk_xor__sse41_8bit+0x258>
+ DB 102,15,239,228 ; pxor %xmm4,%xmm4
+ DB 102,15,56,51,211 ; pmovzxwd %xmm3,%xmm2
+ DB 102,15,105,220 ; punpckhwd %xmm4,%xmm3
+ DB 102,15,111,37,195,19,0,0 ; movdqa 0x13c3(%rip),%xmm4 # 1f00 <_sk_xor__sse41_8bit+0x268>
+ DB 102,15,56,64,220 ; pmulld %xmm4,%xmm3
+ DB 102,15,56,64,212 ; pmulld %xmm4,%xmm2
+ DB 102,15,111,37,193,19,0,0 ; movdqa 0x13c1(%rip),%xmm4 # 1f10 <_sk_xor__sse41_8bit+0x278>
+ DB 102,15,235,212 ; por %xmm4,%xmm2
+ DB 102,15,235,220 ; por %xmm4,%xmm3
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,53 ; je 70b <_sk_load_g8_dst_sse41_8bit+0x80>
- DB 102,15,239,201 ; pxor %xmm1,%xmm1
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,21 ; je 6f5 <_sk_load_g8_dst_sse41_8bit+0x6a>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,201 ; jne 6af <_sk_load_g8_dst_sse41_8bit+0x24>
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,219 ; pxor %xmm3,%xmm3
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,180 ; ja b20 <_sk_load_g8_dst_sse41_8bit+0x24>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,121,0,0,0 ; lea 0x79(%rip),%rcx # bf0 <_sk_load_g8_dst_sse41_8bit+0xf4>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
+ DB 102,15,110,216 ; movd %eax,%xmm3
+ DB 235,149 ; jmp b20 <_sk_load_g8_dst_sse41_8bit+0x24>
DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
- DB 102,15,110,200 ; movd %eax,%xmm1
- DB 102,15,112,201,69 ; pshufd $0x45,%xmm1,%xmm1
+ DB 102,15,239,219 ; pxor %xmm3,%xmm3
+ DB 102,15,196,216,2 ; pinsrw $0x2,%eax,%xmm3
DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
DB 102,15,110,208 ; movd %eax,%xmm2
- DB 102,15,56,49,210 ; pmovzxbd %xmm2,%xmm2
- DB 102,15,58,14,202,15 ; pblendw $0xf,%xmm2,%xmm1
- DB 235,164 ; jmp 6af <_sk_load_g8_dst_sse41_8bit+0x24>
- DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
- DB 102,15,110,200 ; movd %eax,%xmm1
- DB 235,153 ; jmp 6af <_sk_load_g8_dst_sse41_8bit+0x24>
+ DB 102,15,56,48,210 ; pmovzxbw %xmm2,%xmm2
+ DB 102,15,58,14,218,3 ; pblendw $0x3,%xmm2,%xmm3
+ DB 233,109,255,255,255 ; jmpq b20 <_sk_load_g8_dst_sse41_8bit+0x24>
+ DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax
+ DB 102,15,239,219 ; pxor %xmm3,%xmm3
+ DB 102,15,196,216,6 ; pinsrw $0x6,%eax,%xmm3
+ DB 66,15,182,68,2,5 ; movzbl 0x5(%rdx,%r8,1),%eax
+ DB 102,15,196,216,5 ; pinsrw $0x5,%eax,%xmm3
+ DB 66,15,182,68,2,4 ; movzbl 0x4(%rdx,%r8,1),%eax
+ DB 102,15,196,216,4 ; pinsrw $0x4,%eax,%xmm3
+ DB 102,66,15,110,20,2 ; movd (%rdx,%r8,1),%xmm2
+ DB 102,15,56,48,210 ; pmovzxbw %xmm2,%xmm2
+ DB 102,15,58,14,218,15 ; pblendw $0xf,%xmm2,%xmm3
+ DB 233,50,255,255,255 ; jmpq b20 <_sk_load_g8_dst_sse41_8bit+0x24>
+ DB 102,144 ; xchg %ax,%ax
+ DB 144 ; nop
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,170,255,255,255,155 ; ljmp *-0x64000001(%rdx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 232,255,255,255,221 ; callq ffffffffde000c00 <_sk_xor__sse41_8bit+0xffffffffddffef68>
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,210 ; callq *%rdx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,195 ; inc %ebx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_srcover_rgba_8888_sse41_8bit
_sk_srcover_rgba_8888_sse41_8bit LABEL PROC
- DB 76,99,7 ; movslq (%rdi),%r8
- DB 76,139,79,16 ; mov 0x10(%rdi),%r9
+ DB 76,99,15 ; movslq (%rdi),%r9
+ DB 76,139,71,16 ; mov 0x10(%rdi),%r8
DB 72,173 ; lods %ds:(%rsi),%rax
DB 72,99,72,8 ; movslq 0x8(%rax),%rcx
DB 72,99,87,8 ; movslq 0x8(%rdi),%rdx
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,193,226,2 ; shl $0x2,%rdx
DB 72,3,16 ; add (%rax),%rdx
- DB 77,133,201 ; test %r9,%r9
- DB 117,98 ; jne 799 <_sk_srcover_rgba_8888_sse41_8bit+0x83>
- DB 243,66,15,111,20,130 ; movdqu (%rdx,%r8,4),%xmm2
- DB 77,133,201 ; test %r9,%r9
- DB 102,15,111,216 ; movdqa %xmm0,%xmm3
- DB 102,15,56,0,29,131,9,0,0 ; pshufb 0x983(%rip),%xmm3 # 10d0 <_sk_xor__sse41_8bit+0x1a9>
- DB 102,15,239,228 ; pxor %xmm4,%xmm4
- DB 102,15,111,234 ; movdqa %xmm2,%xmm5
- DB 102,15,104,236 ; punpckhbw %xmm4,%xmm5
- DB 102,15,56,48,242 ; pmovzxbw %xmm2,%xmm6
- DB 102,15,56,48,251 ; pmovzxbw %xmm3,%xmm7
- DB 102,15,104,220 ; punpckhbw %xmm4,%xmm3
- DB 102,15,213,221 ; pmullw %xmm5,%xmm3
- DB 102,15,213,254 ; pmullw %xmm6,%xmm7
- DB 102,15,253,221 ; paddw %xmm5,%xmm3
- DB 102,15,253,254 ; paddw %xmm6,%xmm7
- DB 102,15,113,211,8 ; psrlw $0x8,%xmm3
+ DB 77,133,192 ; test %r8,%r8
+ DB 15,133,206,0,0,0 ; jne cff <_sk_srcover_rgba_8888_sse41_8bit+0xf3>
+ DB 243,70,15,111,68,138,16 ; movdqu 0x10(%rdx,%r9,4),%xmm8
+ DB 243,70,15,111,12,138 ; movdqu (%rdx,%r9,4),%xmm9
+ DB 77,133,192 ; test %r8,%r8
+ DB 102,15,111,37,215,18,0,0 ; movdqa 0x12d7(%rip),%xmm4 # 1f20 <_sk_xor__sse41_8bit+0x288>
+ DB 102,15,111,241 ; movdqa %xmm1,%xmm6
+ DB 102,15,56,0,244 ; pshufb %xmm4,%xmm6
+ DB 102,15,111,248 ; movdqa %xmm0,%xmm7
+ DB 102,15,56,0,252 ; pshufb %xmm4,%xmm7
+ DB 102,69,15,239,210 ; pxor %xmm10,%xmm10
+ DB 102,69,15,111,217 ; movdqa %xmm9,%xmm11
+ DB 102,69,15,104,218 ; punpckhbw %xmm10,%xmm11
+ DB 102,69,15,111,224 ; movdqa %xmm8,%xmm12
+ DB 102,69,15,104,226 ; punpckhbw %xmm10,%xmm12
+ DB 102,69,15,56,48,233 ; pmovzxbw %xmm9,%xmm13
+ DB 102,69,15,56,48,240 ; pmovzxbw %xmm8,%xmm14
+ DB 102,15,56,48,231 ; pmovzxbw %xmm7,%xmm4
+ DB 102,15,56,48,238 ; pmovzxbw %xmm6,%xmm5
+ DB 102,65,15,104,250 ; punpckhbw %xmm10,%xmm7
+ DB 102,65,15,104,242 ; punpckhbw %xmm10,%xmm6
+ DB 102,65,15,213,244 ; pmullw %xmm12,%xmm6
+ DB 102,65,15,213,251 ; pmullw %xmm11,%xmm7
+ DB 102,65,15,213,238 ; pmullw %xmm14,%xmm5
+ DB 102,65,15,213,229 ; pmullw %xmm13,%xmm4
+ DB 102,65,15,253,251 ; paddw %xmm11,%xmm7
+ DB 102,65,15,253,244 ; paddw %xmm12,%xmm6
+ DB 102,65,15,253,229 ; paddw %xmm13,%xmm4
+ DB 102,65,15,253,238 ; paddw %xmm14,%xmm5
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
- DB 102,15,103,251 ; packuswb %xmm3,%xmm7
- DB 102,15,248,215 ; psubb %xmm7,%xmm2
- DB 102,15,252,208 ; paddb %xmm0,%xmm2
- DB 117,60 ; jne 7cb <_sk_srcover_rgba_8888_sse41_8bit+0xb5>
- DB 243,66,15,127,20,130 ; movdqu %xmm2,(%rdx,%r8,4)
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,15,103,231 ; packuswb %xmm7,%xmm4
+ DB 102,15,103,238 ; packuswb %xmm6,%xmm5
+ DB 102,68,15,248,197 ; psubb %xmm5,%xmm8
+ DB 102,68,15,248,204 ; psubb %xmm4,%xmm9
+ DB 102,68,15,252,200 ; paddb %xmm0,%xmm9
+ DB 102,68,15,252,193 ; paddb %xmm1,%xmm8
+ DB 117,72 ; jne d36 <_sk_srcover_rgba_8888_sse41_8bit+0x12a>
+ DB 243,70,15,127,12,138 ; movdqu %xmm9,(%rdx,%r9,4)
+ DB 243,70,15,127,68,138,16 ; movdqu %xmm8,0x10(%rdx,%r9,4)
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 68,137,200 ; mov %r9d,%eax
- DB 36,3 ; and $0x3,%al
- DB 60,1 ; cmp $0x1,%al
- DB 116,80 ; je 7f2 <_sk_srcover_rgba_8888_sse41_8bit+0xdc>
- DB 102,15,239,210 ; pxor %xmm2,%xmm2
- DB 60,2 ; cmp $0x2,%al
- DB 116,16 ; je 7ba <_sk_srcover_rgba_8888_sse41_8bit+0xa4>
- DB 60,3 ; cmp $0x3,%al
- DB 117,143 ; jne 73d <_sk_srcover_rgba_8888_sse41_8bit+0x27>
- DB 102,66,15,110,84,130,8 ; movd 0x8(%rdx,%r8,4),%xmm2
- DB 102,15,112,210,69 ; pshufd $0x45,%xmm2,%xmm2
- DB 243,66,15,126,28,130 ; movq (%rdx,%r8,4),%xmm3
- DB 102,15,58,14,211,15 ; pblendw $0xf,%xmm3,%xmm2
- DB 233,114,255,255,255 ; jmpq 73d <_sk_srcover_rgba_8888_sse41_8bit+0x27>
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,40 ; je 7fd <_sk_srcover_rgba_8888_sse41_8bit+0xe7>
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,15 ; je 7ea <_sk_srcover_rgba_8888_sse41_8bit+0xd4>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,180 ; jne 795 <_sk_srcover_rgba_8888_sse41_8bit+0x7f>
- DB 102,66,15,58,22,84,130,8,2 ; pextrd $0x2,%xmm2,0x8(%rdx,%r8,4)
- DB 102,66,15,214,20,130 ; movq %xmm2,(%rdx,%r8,4)
- DB 235,163 ; jmp 795 <_sk_srcover_rgba_8888_sse41_8bit+0x7f>
- DB 102,66,15,110,20,130 ; movd (%rdx,%r8,4),%xmm2
- DB 233,64,255,255,255 ; jmpq 73d <_sk_srcover_rgba_8888_sse41_8bit+0x27>
- DB 102,66,15,126,20,130 ; movd %xmm2,(%rdx,%r8,4)
- DB 235,144 ; jmp 795 <_sk_srcover_rgba_8888_sse41_8bit+0x7f>
+ DB 68,137,192 ; mov %r8d,%eax
+ DB 36,7 ; and $0x7,%al
+ DB 102,69,15,239,192 ; pxor %xmm8,%xmm8
+ DB 102,69,15,239,201 ; pxor %xmm9,%xmm9
+ DB 254,200 ; dec %al
+ DB 60,6 ; cmp $0x6,%al
+ DB 15,135,38,255,255,255 ; ja c3e <_sk_srcover_rgba_8888_sse41_8bit+0x32>
+ DB 15,182,192 ; movzbl %al,%eax
+ DB 72,141,13,186,0,0,0 ; lea 0xba(%rip),%rcx # ddc <_sk_srcover_rgba_8888_sse41_8bit+0x1d0>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 102,70,15,110,12,138 ; movd (%rdx,%r9,4),%xmm9
+ DB 233,8,255,255,255 ; jmpq c3e <_sk_srcover_rgba_8888_sse41_8bit+0x32>
+ DB 65,128,224,7 ; and $0x7,%r8b
+ DB 65,254,200 ; dec %r8b
+ DB 65,128,248,6 ; cmp $0x6,%r8b
+ DB 119,184 ; ja cfb <_sk_srcover_rgba_8888_sse41_8bit+0xef>
+ DB 65,15,182,192 ; movzbl %r8b,%eax
+ DB 72,141,13,170,0,0,0 ; lea 0xaa(%rip),%rcx # df8 <_sk_srcover_rgba_8888_sse41_8bit+0x1ec>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 102,70,15,126,12,138 ; movd %xmm9,(%rdx,%r9,4)
+ DB 235,156 ; jmp cfb <_sk_srcover_rgba_8888_sse41_8bit+0xef>
+ DB 102,66,15,110,100,138,8 ; movd 0x8(%rdx,%r9,4),%xmm4
+ DB 102,68,15,112,204,69 ; pshufd $0x45,%xmm4,%xmm9
+ DB 243,66,15,126,36,138 ; movq (%rdx,%r9,4),%xmm4
+ DB 102,68,15,58,14,204,15 ; pblendw $0xf,%xmm4,%xmm9
+ DB 233,192,254,255,255 ; jmpq c3e <_sk_srcover_rgba_8888_sse41_8bit+0x32>
+ DB 102,66,15,110,100,138,24 ; movd 0x18(%rdx,%r9,4),%xmm4
+ DB 102,68,15,112,196,69 ; pshufd $0x45,%xmm4,%xmm8
+ DB 102,70,15,58,34,68,138,20,1 ; pinsrd $0x1,0x14(%rdx,%r9,4),%xmm8
+ DB 102,70,15,58,34,68,138,16,0 ; pinsrd $0x0,0x10(%rdx,%r9,4),%xmm8
+ DB 233,150,254,255,255 ; jmpq c38 <_sk_srcover_rgba_8888_sse41_8bit+0x2c>
+ DB 102,70,15,58,22,76,138,8,2 ; pextrd $0x2,%xmm9,0x8(%rdx,%r9,4)
+ DB 102,70,15,214,12,138 ; movq %xmm9,(%rdx,%r9,4)
+ DB 233,69,255,255,255 ; jmpq cfb <_sk_srcover_rgba_8888_sse41_8bit+0xef>
+ DB 102,70,15,58,22,68,138,24,2 ; pextrd $0x2,%xmm8,0x18(%rdx,%r9,4)
+ DB 102,70,15,58,22,68,138,20,1 ; pextrd $0x1,%xmm8,0x14(%rdx,%r9,4)
+ DB 102,70,15,126,68,138,16 ; movd %xmm8,0x10(%rdx,%r9,4)
+ DB 243,70,15,127,12,138 ; movdqu %xmm9,(%rdx,%r9,4)
+ DB 233,33,255,255,255 ; jmpq cfb <_sk_srcover_rgba_8888_sse41_8bit+0xef>
+ DB 102,144 ; xchg %ax,%ax
+ DB 79,255 ; rex.WRXB (bad)
+ DB 255 ; (bad)
+ DB 255,144,255,255,255,131 ; callq *-0x7c000001(%rax)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,92,254,255 ; lcall *-0x1(%rsi,%rdi,8)
+ DB 255 ; (bad)
+ DB 184,255,255,255,175 ; mov $0xafffffff,%eax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,162,255,255,255,95 ; jmpq *0x5fffffff(%rdx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,179,255,255,255,170 ; pushq -0x55000001(%rbx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,215 ; callq *%rdi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,208 ; callq *%rax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,199 ; inc %edi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 190 ; .byte 0xbe
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_scale_1_float_sse41_8bit
_sk_scale_1_float_sse41_8bit LABEL PROC
+ DB 102,15,111,225 ; movdqa %xmm1,%xmm4
+ DB 102,15,111,232 ; movdqa %xmm0,%xmm5
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 243,15,16,16 ; movss (%rax),%xmm2
- DB 243,15,89,21,185,7,0,0 ; mulss 0x7b9(%rip),%xmm2 # fcc <_sk_xor__sse41_8bit+0xa5>
- DB 243,15,44,194 ; cvttss2si %xmm2,%eax
- DB 102,15,110,216 ; movd %eax,%xmm3
- DB 15,87,210 ; xorps %xmm2,%xmm2
- DB 102,15,56,48,224 ; pmovzxbw %xmm0,%xmm4
- DB 102,15,104,194 ; punpckhbw %xmm2,%xmm0
- DB 102,15,56,0,29,176,8,0,0 ; pshufb 0x8b0(%rip),%xmm3 # 10e0 <_sk_xor__sse41_8bit+0x1b9>
- DB 102,15,111,211 ; movdqa %xmm3,%xmm2
- DB 102,15,213,212 ; pmullw %xmm4,%xmm2
- DB 102,15,213,216 ; pmullw %xmm0,%xmm3
- DB 102,15,253,216 ; paddw %xmm0,%xmm3
- DB 102,15,253,212 ; paddw %xmm4,%xmm2
- DB 102,15,113,211,8 ; psrlw $0x8,%xmm3
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,211 ; packuswb %xmm3,%xmm2
+ DB 243,15,16,0 ; movss (%rax),%xmm0
+ DB 243,15,89,5,186,15,0,0 ; mulss 0xfba(%rip),%xmm0 # 1de4 <_sk_xor__sse41_8bit+0x14c>
+ DB 243,15,44,192 ; cvttss2si %xmm0,%eax
+ DB 15,87,192 ; xorps %xmm0,%xmm0
+ DB 102,68,15,56,48,197 ; pmovzxbw %xmm5,%xmm8
+ DB 102,15,104,232 ; punpckhbw %xmm0,%xmm5
+ DB 102,68,15,56,48,204 ; pmovzxbw %xmm4,%xmm9
+ DB 102,15,104,224 ; punpckhbw %xmm0,%xmm4
+ DB 102,15,110,240 ; movd %eax,%xmm6
+ DB 102,15,56,0,53,222,16,0,0 ; pshufb 0x10de(%rip),%xmm6 # 1f30 <_sk_xor__sse41_8bit+0x298>
+ DB 102,15,111,206 ; movdqa %xmm6,%xmm1
+ DB 102,65,15,213,201 ; pmullw %xmm9,%xmm1
+ DB 102,15,111,198 ; movdqa %xmm6,%xmm0
+ DB 102,65,15,213,192 ; pmullw %xmm8,%xmm0
+ DB 102,15,111,254 ; movdqa %xmm6,%xmm7
+ DB 102,15,213,252 ; pmullw %xmm4,%xmm7
+ DB 102,15,213,245 ; pmullw %xmm5,%xmm6
+ DB 102,15,253,245 ; paddw %xmm5,%xmm6
+ DB 102,15,253,252 ; paddw %xmm4,%xmm7
+ DB 102,65,15,253,192 ; paddw %xmm8,%xmm0
+ DB 102,65,15,253,201 ; paddw %xmm9,%xmm1
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
+ DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
+ DB 102,15,103,198 ; packuswb %xmm6,%xmm0
+ DB 102,15,103,207 ; packuswb %xmm7,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 102,15,111,194 ; movdqa %xmm2,%xmm0
DB 255,224 ; jmpq *%rax
PUBLIC _sk_scale_u8_sse41_8bit
@@ -40922,80 +42542,152 @@ _sk_scale_u8_sse41_8bit LABEL PROC
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,76 ; jne 8c3 <_sk_scale_u8_sse41_8bit+0x69>
- DB 102,66,15,56,49,28,2 ; pmovzxbd (%rdx,%r8,1),%xmm3
- DB 102,15,239,228 ; pxor %xmm4,%xmm4
- DB 102,15,56,0,29,101,8,0,0 ; pshufb 0x865(%rip),%xmm3 # 10f0 <_sk_xor__sse41_8bit+0x1c9>
- DB 102,15,56,48,232 ; pmovzxbw %xmm0,%xmm5
- DB 102,15,104,196 ; punpckhbw %xmm4,%xmm0
- DB 102,15,56,48,211 ; pmovzxbw %xmm3,%xmm2
- DB 102,15,104,220 ; punpckhbw %xmm4,%xmm3
- DB 102,15,213,216 ; pmullw %xmm0,%xmm3
- DB 102,15,213,213 ; pmullw %xmm5,%xmm2
- DB 102,15,253,216 ; paddw %xmm0,%xmm3
- DB 102,15,253,213 ; paddw %xmm5,%xmm2
- DB 102,15,113,211,8 ; psrlw $0x8,%xmm3
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,211 ; packuswb %xmm3,%xmm2
+ DB 15,133,160,0,0,0 ; jne f63 <_sk_scale_u8_sse41_8bit+0xc1>
+ DB 102,66,15,56,48,52,2 ; pmovzxbw (%rdx,%r8,1),%xmm6
+ DB 102,15,219,53,110,16,0,0 ; pand 0x106e(%rip),%xmm6 # 1f40 <_sk_xor__sse41_8bit+0x2a8>
+ DB 102,69,15,239,192 ; pxor %xmm8,%xmm8
+ DB 102,15,111,254 ; movdqa %xmm6,%xmm7
+ DB 102,15,56,0,61,108,16,0,0 ; pshufb 0x106c(%rip),%xmm7 # 1f50 <_sk_xor__sse41_8bit+0x2b8>
+ DB 102,15,56,0,53,115,16,0,0 ; pshufb 0x1073(%rip),%xmm6 # 1f60 <_sk_xor__sse41_8bit+0x2c8>
+ DB 102,68,15,56,48,200 ; pmovzxbw %xmm0,%xmm9
+ DB 102,65,15,104,192 ; punpckhbw %xmm8,%xmm0
+ DB 102,68,15,56,48,209 ; pmovzxbw %xmm1,%xmm10
+ DB 102,65,15,104,200 ; punpckhbw %xmm8,%xmm1
+ DB 102,15,56,48,230 ; pmovzxbw %xmm6,%xmm4
+ DB 102,65,15,104,240 ; punpckhbw %xmm8,%xmm6
+ DB 102,15,56,48,239 ; pmovzxbw %xmm7,%xmm5
+ DB 102,65,15,104,248 ; punpckhbw %xmm8,%xmm7
+ DB 102,15,213,249 ; pmullw %xmm1,%xmm7
+ DB 102,15,213,240 ; pmullw %xmm0,%xmm6
+ DB 102,65,15,213,234 ; pmullw %xmm10,%xmm5
+ DB 102,65,15,213,225 ; pmullw %xmm9,%xmm4
+ DB 102,15,253,240 ; paddw %xmm0,%xmm6
+ DB 102,15,253,249 ; paddw %xmm1,%xmm7
+ DB 102,65,15,253,225 ; paddw %xmm9,%xmm4
+ DB 102,65,15,253,234 ; paddw %xmm10,%xmm5
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,15,103,230 ; packuswb %xmm6,%xmm4
+ DB 102,15,103,239 ; packuswb %xmm7,%xmm5
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 102,15,111,194 ; movdqa %xmm2,%xmm0
+ DB 102,15,111,196 ; movdqa %xmm4,%xmm0
+ DB 102,15,111,205 ; movdqa %xmm5,%xmm1
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,56 ; je 905 <_sk_scale_u8_sse41_8bit+0xab>
- DB 102,15,239,219 ; pxor %xmm3,%xmm3
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,21 ; je 8ec <_sk_scale_u8_sse41_8bit+0x92>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,161 ; jne 87e <_sk_scale_u8_sse41_8bit+0x24>
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,246 ; pxor %xmm6,%xmm6
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 15,135,82,255,255,255 ; ja eca <_sk_scale_u8_sse41_8bit+0x28>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,125,0,0,0 ; lea 0x7d(%rip),%rcx # 1000 <_sk_scale_u8_sse41_8bit+0x15e>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
+ DB 102,15,110,240 ; movd %eax,%xmm6
+ DB 233,48,255,255,255 ; jmpq eca <_sk_scale_u8_sse41_8bit+0x28>
DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
- DB 102,15,110,208 ; movd %eax,%xmm2
- DB 102,15,112,218,69 ; pshufd $0x45,%xmm2,%xmm3
+ DB 102,15,239,246 ; pxor %xmm6,%xmm6
+ DB 102,15,196,240,2 ; pinsrw $0x2,%eax,%xmm6
DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
- DB 102,15,110,208 ; movd %eax,%xmm2
- DB 102,15,56,49,210 ; pmovzxbd %xmm2,%xmm2
- DB 102,15,58,14,218,15 ; pblendw $0xf,%xmm2,%xmm3
- DB 233,121,255,255,255 ; jmpq 87e <_sk_scale_u8_sse41_8bit+0x24>
- DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
- DB 102,15,110,216 ; movd %eax,%xmm3
- DB 233,107,255,255,255 ; jmpq 87e <_sk_scale_u8_sse41_8bit+0x24>
+ DB 102,15,110,224 ; movd %eax,%xmm4
+ DB 102,15,56,48,228 ; pmovzxbw %xmm4,%xmm4
+ DB 102,15,58,14,244,3 ; pblendw $0x3,%xmm4,%xmm6
+ DB 233,8,255,255,255 ; jmpq eca <_sk_scale_u8_sse41_8bit+0x28>
+ DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax
+ DB 102,15,239,246 ; pxor %xmm6,%xmm6
+ DB 102,15,196,240,6 ; pinsrw $0x6,%eax,%xmm6
+ DB 66,15,182,68,2,5 ; movzbl 0x5(%rdx,%r8,1),%eax
+ DB 102,15,196,240,5 ; pinsrw $0x5,%eax,%xmm6
+ DB 66,15,182,68,2,4 ; movzbl 0x4(%rdx,%r8,1),%eax
+ DB 102,15,196,240,4 ; pinsrw $0x4,%eax,%xmm6
+ DB 102,66,15,110,36,2 ; movd (%rdx,%r8,1),%xmm4
+ DB 102,15,56,48,228 ; pmovzxbw %xmm4,%xmm4
+ DB 102,15,58,14,244,15 ; pblendw $0xf,%xmm4,%xmm6
+ DB 233,205,254,255,255 ; jmpq eca <_sk_scale_u8_sse41_8bit+0x28>
+ DB 15,31,0 ; nopl (%rax)
+ DB 140,255 ; mov %?,%edi
+ DB 255 ; (bad)
+ DB 255,169,255,255,255,154 ; ljmp *-0x65000001(%rcx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,231 ; jmpq *%rdi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 220,255 ; fdivr %st,%st(7)
+ DB 255 ; (bad)
+ DB 255,209 ; callq *%rcx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,194 ; inc %edx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_lerp_1_float_sse41_8bit
_sk_lerp_1_float_sse41_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 243,15,16,16 ; movss (%rax),%xmm2
- DB 243,15,89,21,175,6,0,0 ; mulss 0x6af(%rip),%xmm2 # fd0 <_sk_xor__sse41_8bit+0xa9>
- DB 243,15,44,194 ; cvttss2si %xmm2,%eax
- DB 102,15,110,216 ; movd %eax,%xmm3
- DB 102,15,239,228 ; pxor %xmm4,%xmm4
- DB 102,15,56,0,220 ; pshufb %xmm4,%xmm3
- DB 102,15,56,48,232 ; pmovzxbw %xmm0,%xmm5
- DB 102,15,104,196 ; punpckhbw %xmm4,%xmm0
- DB 102,15,111,21,189,7,0,0 ; movdqa 0x7bd(%rip),%xmm2 # 1100 <_sk_xor__sse41_8bit+0x1d9>
- DB 102,15,219,211 ; pand %xmm3,%xmm2
- DB 102,15,111,242 ; movdqa %xmm2,%xmm6
- DB 102,15,213,240 ; pmullw %xmm0,%xmm6
- DB 102,15,213,213 ; pmullw %xmm5,%xmm2
- DB 102,15,253,240 ; paddw %xmm0,%xmm6
- DB 102,15,253,213 ; paddw %xmm5,%xmm2
+ DB 243,15,16,32 ; movss (%rax),%xmm4
+ DB 243,15,89,37,190,13,0,0 ; mulss 0xdbe(%rip),%xmm4 # 1de8 <_sk_xor__sse41_8bit+0x150>
+ DB 243,15,44,196 ; cvttss2si %xmm4,%eax
+ DB 102,15,110,224 ; movd %eax,%xmm4
+ DB 102,15,96,228 ; punpcklbw %xmm4,%xmm4
+ DB 242,15,112,252,0 ; pshuflw $0x0,%xmm4,%xmm7
+ DB 102,68,15,112,199,80 ; pshufd $0x50,%xmm7,%xmm8
+ DB 102,69,15,239,201 ; pxor %xmm9,%xmm9
+ DB 102,68,15,56,48,208 ; pmovzxbw %xmm0,%xmm10
+ DB 102,65,15,104,193 ; punpckhbw %xmm9,%xmm0
+ DB 102,68,15,56,48,217 ; pmovzxbw %xmm1,%xmm11
+ DB 102,65,15,104,201 ; punpckhbw %xmm9,%xmm1
+ DB 102,15,56,0,61,11,15,0,0 ; pshufb 0xf0b(%rip),%xmm7 # 1f70 <_sk_xor__sse41_8bit+0x2d8>
+ DB 102,68,15,111,231 ; movdqa %xmm7,%xmm12
+ DB 102,69,15,213,227 ; pmullw %xmm11,%xmm12
+ DB 102,68,15,111,239 ; movdqa %xmm7,%xmm13
+ DB 102,69,15,213,234 ; pmullw %xmm10,%xmm13
+ DB 102,15,111,247 ; movdqa %xmm7,%xmm6
+ DB 102,15,213,241 ; pmullw %xmm1,%xmm6
+ DB 102,15,213,248 ; pmullw %xmm0,%xmm7
+ DB 102,15,253,248 ; paddw %xmm0,%xmm7
+ DB 102,15,253,241 ; paddw %xmm1,%xmm6
+ DB 102,69,15,253,234 ; paddw %xmm10,%xmm13
+ DB 102,69,15,253,227 ; paddw %xmm11,%xmm12
DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,214 ; packuswb %xmm6,%xmm2
- DB 102,15,118,237 ; pcmpeqd %xmm5,%xmm5
- DB 102,15,239,235 ; pxor %xmm3,%xmm5
- DB 102,15,56,48,217 ; pmovzxbw %xmm1,%xmm3
- DB 102,15,111,241 ; movdqa %xmm1,%xmm6
- DB 102,15,104,244 ; punpckhbw %xmm4,%xmm6
- DB 102,15,56,48,197 ; pmovzxbw %xmm5,%xmm0
- DB 102,15,104,236 ; punpckhbw %xmm4,%xmm5
- DB 102,15,213,238 ; pmullw %xmm6,%xmm5
- DB 102,15,213,195 ; pmullw %xmm3,%xmm0
- DB 102,15,253,238 ; paddw %xmm6,%xmm5
- DB 102,15,253,195 ; paddw %xmm3,%xmm0
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,65,15,113,212,8 ; psrlw $0x8,%xmm12
+ DB 102,65,15,113,213,8 ; psrlw $0x8,%xmm13
+ DB 102,68,15,103,239 ; packuswb %xmm7,%xmm13
+ DB 102,68,15,103,230 ; packuswb %xmm6,%xmm12
+ DB 102,15,118,255 ; pcmpeqd %xmm7,%xmm7
+ DB 102,65,15,239,248 ; pxor %xmm8,%xmm7
+ DB 102,68,15,56,48,194 ; pmovzxbw %xmm2,%xmm8
+ DB 102,15,111,242 ; movdqa %xmm2,%xmm6
+ DB 102,65,15,104,241 ; punpckhbw %xmm9,%xmm6
+ DB 102,68,15,56,48,211 ; pmovzxbw %xmm3,%xmm10
+ DB 102,15,111,227 ; movdqa %xmm3,%xmm4
+ DB 102,65,15,104,225 ; punpckhbw %xmm9,%xmm4
+ DB 102,15,56,48,199 ; pmovzxbw %xmm7,%xmm0
+ DB 102,65,15,104,249 ; punpckhbw %xmm9,%xmm7
+ DB 102,15,111,239 ; movdqa %xmm7,%xmm5
+ DB 102,15,213,236 ; pmullw %xmm4,%xmm5
+ DB 102,15,213,254 ; pmullw %xmm6,%xmm7
+ DB 102,15,111,200 ; movdqa %xmm0,%xmm1
+ DB 102,65,15,213,202 ; pmullw %xmm10,%xmm1
+ DB 102,65,15,213,192 ; pmullw %xmm8,%xmm0
+ DB 102,15,253,254 ; paddw %xmm6,%xmm7
+ DB 102,15,253,236 ; paddw %xmm4,%xmm5
+ DB 102,65,15,253,192 ; paddw %xmm8,%xmm0
+ DB 102,65,15,253,202 ; paddw %xmm10,%xmm1
DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
- DB 102,15,103,197 ; packuswb %xmm5,%xmm0
- DB 102,15,252,194 ; paddb %xmm2,%xmm0
+ DB 102,15,103,199 ; packuswb %xmm7,%xmm0
+ DB 102,15,103,205 ; packuswb %xmm5,%xmm1
+ DB 102,65,15,252,197 ; paddb %xmm13,%xmm0
+ DB 102,65,15,252,204 ; paddb %xmm12,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
@@ -41009,424 +42701,751 @@ _sk_lerp_u8_sse41_8bit LABEL PROC
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 15,133,140,0,0,0 ; jne a5a <_sk_lerp_u8_sse41_8bit+0xad>
- DB 102,66,15,56,49,20,2 ; pmovzxbd (%rdx,%r8,1),%xmm2
- DB 102,15,239,228 ; pxor %xmm4,%xmm4
- DB 102,15,56,0,21,46,7,0,0 ; pshufb 0x72e(%rip),%xmm2 # 1110 <_sk_xor__sse41_8bit+0x1e9>
- DB 102,15,56,48,232 ; pmovzxbw %xmm0,%xmm5
- DB 102,15,104,196 ; punpckhbw %xmm4,%xmm0
- DB 102,15,111,242 ; movdqa %xmm2,%xmm6
- DB 102,15,104,244 ; punpckhbw %xmm4,%xmm6
- DB 102,15,56,48,218 ; pmovzxbw %xmm2,%xmm3
- DB 102,15,213,240 ; pmullw %xmm0,%xmm6
- DB 102,15,213,221 ; pmullw %xmm5,%xmm3
- DB 102,15,253,240 ; paddw %xmm0,%xmm6
- DB 102,15,253,221 ; paddw %xmm5,%xmm3
- DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
- DB 102,15,113,211,8 ; psrlw $0x8,%xmm3
- DB 102,15,103,222 ; packuswb %xmm6,%xmm3
- DB 102,15,118,237 ; pcmpeqd %xmm5,%xmm5
- DB 102,15,239,234 ; pxor %xmm2,%xmm5
- DB 102,15,111,209 ; movdqa %xmm1,%xmm2
- DB 102,15,104,212 ; punpckhbw %xmm4,%xmm2
- DB 102,15,56,48,241 ; pmovzxbw %xmm1,%xmm6
- DB 102,15,56,48,197 ; pmovzxbw %xmm5,%xmm0
- DB 102,15,104,236 ; punpckhbw %xmm4,%xmm5
- DB 102,15,213,234 ; pmullw %xmm2,%xmm5
- DB 102,15,213,198 ; pmullw %xmm6,%xmm0
- DB 102,15,253,234 ; paddw %xmm2,%xmm5
- DB 102,15,253,198 ; paddw %xmm6,%xmm0
+ DB 15,133,46,1,0,0 ; jne 128d <_sk_lerp_u8_sse41_8bit+0x14f>
+ DB 102,66,15,56,48,60,2 ; pmovzxbw (%rdx,%r8,1),%xmm7
+ DB 102,15,219,61,18,14,0,0 ; pand 0xe12(%rip),%xmm7 # 1f80 <_sk_xor__sse41_8bit+0x2e8>
+ DB 102,69,15,239,192 ; pxor %xmm8,%xmm8
+ DB 102,15,111,247 ; movdqa %xmm7,%xmm6
+ DB 102,15,56,0,53,16,14,0,0 ; pshufb 0xe10(%rip),%xmm6 # 1f90 <_sk_xor__sse41_8bit+0x2f8>
+ DB 102,15,56,0,61,23,14,0,0 ; pshufb 0xe17(%rip),%xmm7 # 1fa0 <_sk_xor__sse41_8bit+0x308>
+ DB 102,68,15,56,48,200 ; pmovzxbw %xmm0,%xmm9
+ DB 102,65,15,104,192 ; punpckhbw %xmm8,%xmm0
+ DB 102,68,15,56,48,209 ; pmovzxbw %xmm1,%xmm10
+ DB 102,65,15,104,200 ; punpckhbw %xmm8,%xmm1
+ DB 102,15,111,231 ; movdqa %xmm7,%xmm4
+ DB 102,65,15,104,224 ; punpckhbw %xmm8,%xmm4
+ DB 102,15,111,238 ; movdqa %xmm6,%xmm5
+ DB 102,65,15,104,232 ; punpckhbw %xmm8,%xmm5
+ DB 102,68,15,56,48,231 ; pmovzxbw %xmm7,%xmm12
+ DB 102,68,15,56,48,222 ; pmovzxbw %xmm6,%xmm11
+ DB 102,15,213,233 ; pmullw %xmm1,%xmm5
+ DB 102,15,213,224 ; pmullw %xmm0,%xmm4
+ DB 102,69,15,213,218 ; pmullw %xmm10,%xmm11
+ DB 102,69,15,213,225 ; pmullw %xmm9,%xmm12
+ DB 102,15,253,224 ; paddw %xmm0,%xmm4
+ DB 102,15,253,233 ; paddw %xmm1,%xmm5
+ DB 102,69,15,253,225 ; paddw %xmm9,%xmm12
+ DB 102,69,15,253,218 ; paddw %xmm10,%xmm11
DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,65,15,113,211,8 ; psrlw $0x8,%xmm11
+ DB 102,65,15,113,212,8 ; psrlw $0x8,%xmm12
+ DB 102,68,15,103,228 ; packuswb %xmm4,%xmm12
+ DB 102,68,15,103,221 ; packuswb %xmm5,%xmm11
+ DB 102,15,118,192 ; pcmpeqd %xmm0,%xmm0
+ DB 102,15,239,240 ; pxor %xmm0,%xmm6
+ DB 102,15,239,248 ; pxor %xmm0,%xmm7
+ DB 102,15,111,227 ; movdqa %xmm3,%xmm4
+ DB 102,65,15,104,224 ; punpckhbw %xmm8,%xmm4
+ DB 102,68,15,56,48,202 ; pmovzxbw %xmm2,%xmm9
+ DB 102,15,111,234 ; movdqa %xmm2,%xmm5
+ DB 102,65,15,104,232 ; punpckhbw %xmm8,%xmm5
+ DB 102,68,15,56,48,211 ; pmovzxbw %xmm3,%xmm10
+ DB 102,15,56,48,199 ; pmovzxbw %xmm7,%xmm0
+ DB 102,15,56,48,206 ; pmovzxbw %xmm6,%xmm1
+ DB 102,65,15,104,248 ; punpckhbw %xmm8,%xmm7
+ DB 102,65,15,104,240 ; punpckhbw %xmm8,%xmm6
+ DB 102,15,213,244 ; pmullw %xmm4,%xmm6
+ DB 102,15,213,253 ; pmullw %xmm5,%xmm7
+ DB 102,65,15,213,202 ; pmullw %xmm10,%xmm1
+ DB 102,65,15,213,193 ; pmullw %xmm9,%xmm0
+ DB 102,15,253,253 ; paddw %xmm5,%xmm7
+ DB 102,15,253,244 ; paddw %xmm4,%xmm6
+ DB 102,65,15,253,193 ; paddw %xmm9,%xmm0
+ DB 102,65,15,253,202 ; paddw %xmm10,%xmm1
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
- DB 102,15,103,197 ; packuswb %xmm5,%xmm0
- DB 102,15,252,195 ; paddb %xmm3,%xmm0
+ DB 102,15,103,199 ; packuswb %xmm7,%xmm0
+ DB 102,15,103,206 ; packuswb %xmm6,%xmm1
+ DB 102,65,15,252,196 ; paddb %xmm12,%xmm0
+ DB 102,65,15,252,203 ; paddb %xmm11,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,60 ; je aa0 <_sk_lerp_u8_sse41_8bit+0xf3>
- DB 102,15,239,210 ; pxor %xmm2,%xmm2
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,25 ; je a87 <_sk_lerp_u8_sse41_8bit+0xda>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 15,133,93,255,255,255 ; jne 9d5 <_sk_lerp_u8_sse41_8bit+0x28>
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,255 ; pxor %xmm7,%xmm7
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 15,135,196,254,255,255 ; ja 1166 <_sk_lerp_u8_sse41_8bit+0x28>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,123,0,0,0 ; lea 0x7b(%rip),%rcx # 1328 <_sk_lerp_u8_sse41_8bit+0x1ea>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
+ DB 102,15,110,248 ; movd %eax,%xmm7
+ DB 233,162,254,255,255 ; jmpq 1166 <_sk_lerp_u8_sse41_8bit+0x28>
DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
- DB 102,15,110,208 ; movd %eax,%xmm2
- DB 102,15,112,210,69 ; pshufd $0x45,%xmm2,%xmm2
+ DB 102,15,239,255 ; pxor %xmm7,%xmm7
+ DB 102,15,196,248,2 ; pinsrw $0x2,%eax,%xmm7
DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
- DB 102,15,110,216 ; movd %eax,%xmm3
- DB 102,15,56,49,219 ; pmovzxbd %xmm3,%xmm3
- DB 102,15,58,14,211,15 ; pblendw $0xf,%xmm3,%xmm2
- DB 233,53,255,255,255 ; jmpq 9d5 <_sk_lerp_u8_sse41_8bit+0x28>
- DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
- DB 102,15,110,208 ; movd %eax,%xmm2
- DB 233,39,255,255,255 ; jmpq 9d5 <_sk_lerp_u8_sse41_8bit+0x28>
+ DB 102,15,110,232 ; movd %eax,%xmm5
+ DB 102,15,56,48,237 ; pmovzxbw %xmm5,%xmm5
+ DB 102,15,58,14,253,3 ; pblendw $0x3,%xmm5,%xmm7
+ DB 233,122,254,255,255 ; jmpq 1166 <_sk_lerp_u8_sse41_8bit+0x28>
+ DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax
+ DB 102,15,239,255 ; pxor %xmm7,%xmm7
+ DB 102,15,196,248,6 ; pinsrw $0x6,%eax,%xmm7
+ DB 66,15,182,68,2,5 ; movzbl 0x5(%rdx,%r8,1),%eax
+ DB 102,15,196,248,5 ; pinsrw $0x5,%eax,%xmm7
+ DB 66,15,182,68,2,4 ; movzbl 0x4(%rdx,%r8,1),%eax
+ DB 102,15,196,248,4 ; pinsrw $0x4,%eax,%xmm7
+ DB 102,66,15,110,44,2 ; movd (%rdx,%r8,1),%xmm5
+ DB 102,15,56,48,237 ; pmovzxbw %xmm5,%xmm5
+ DB 102,15,58,14,253,15 ; pblendw $0xf,%xmm5,%xmm7
+ DB 233,63,254,255,255 ; jmpq 1166 <_sk_lerp_u8_sse41_8bit+0x28>
+ DB 144 ; nop
+ DB 142,255 ; mov %edi,%?
+ DB 255 ; (bad)
+ DB 255,171,255,255,255,156 ; ljmp *-0x63000001(%rbx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 233,255,255,255,222 ; jmpq ffffffffdf001338 <_sk_xor__sse41_8bit+0xffffffffdefff6a0>
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,211 ; callq *%rbx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,196 ; inc %esp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_move_src_dst_sse41_8bit
_sk_move_src_dst_sse41_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 15,40,200 ; movaps %xmm0,%xmm1
+ DB 15,40,208 ; movaps %xmm0,%xmm2
+ DB 15,40,217 ; movaps %xmm1,%xmm3
DB 255,224 ; jmpq *%rax
PUBLIC _sk_move_dst_src_sse41_8bit
_sk_move_dst_src_sse41_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 15,40,193 ; movaps %xmm1,%xmm0
+ DB 15,40,194 ; movaps %xmm2,%xmm0
+ DB 15,40,203 ; movaps %xmm3,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_black_color_sse41_8bit
_sk_black_color_sse41_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 15,40,5,91,6,0,0 ; movaps 0x65b(%rip),%xmm0 # 1120 <_sk_xor__sse41_8bit+0x1f9>
+ DB 15,40,5,79,12,0,0 ; movaps 0xc4f(%rip),%xmm0 # 1fb0 <_sk_xor__sse41_8bit+0x318>
+ DB 15,40,200 ; movaps %xmm0,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_white_color_sse41_8bit
_sk_white_color_sse41_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
DB 102,15,118,192 ; pcmpeqd %xmm0,%xmm0
+ DB 102,15,118,201 ; pcmpeqd %xmm1,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_clear_sse41_8bit
_sk_clear_sse41_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
DB 15,87,192 ; xorps %xmm0,%xmm0
+ DB 15,87,201 ; xorps %xmm1,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_srcatop_sse41_8bit
_sk_srcatop_sse41_8bit LABEL PROC
- DB 102,68,15,111,5,81,6,0,0 ; movdqa 0x651(%rip),%xmm8 # 1130 <_sk_xor__sse41_8bit+0x209>
- DB 102,15,111,217 ; movdqa %xmm1,%xmm3
- DB 102,15,56,48,225 ; pmovzxbw %xmm1,%xmm4
- DB 102,15,111,233 ; movdqa %xmm1,%xmm5
- DB 102,65,15,56,0,232 ; pshufb %xmm8,%xmm5
- DB 102,15,239,246 ; pxor %xmm6,%xmm6
+ DB 102,68,15,111,21,59,12,0,0 ; movdqa 0xc3b(%rip),%xmm10 # 1fc0 <_sk_xor__sse41_8bit+0x328>
+ DB 102,68,15,111,219 ; movdqa %xmm3,%xmm11
+ DB 102,68,15,56,48,195 ; pmovzxbw %xmm3,%xmm8
+ DB 102,15,111,235 ; movdqa %xmm3,%xmm5
+ DB 102,65,15,56,0,234 ; pshufb %xmm10,%xmm5
+ DB 102,68,15,111,226 ; movdqa %xmm2,%xmm12
+ DB 102,68,15,56,48,202 ; pmovzxbw %xmm2,%xmm9
+ DB 102,15,111,226 ; movdqa %xmm2,%xmm4
+ DB 102,65,15,56,0,226 ; pshufb %xmm10,%xmm4
+ DB 102,69,15,239,237 ; pxor %xmm13,%xmm13
DB 102,15,111,248 ; movdqa %xmm0,%xmm7
- DB 102,15,104,254 ; punpckhbw %xmm6,%xmm7
- DB 102,15,56,48,213 ; pmovzxbw %xmm5,%xmm2
- DB 102,15,104,238 ; punpckhbw %xmm6,%xmm5
- DB 102,15,213,239 ; pmullw %xmm7,%xmm5
- DB 102,15,253,239 ; paddw %xmm7,%xmm5
- DB 102,15,56,48,248 ; pmovzxbw %xmm0,%xmm7
- DB 102,15,213,215 ; pmullw %xmm7,%xmm2
- DB 102,15,253,215 ; paddw %xmm7,%xmm2
- DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,213 ; packuswb %xmm5,%xmm2
- DB 102,65,15,56,0,192 ; pshufb %xmm8,%xmm0
- DB 102,15,118,237 ; pcmpeqd %xmm5,%xmm5
- DB 102,15,239,232 ; pxor %xmm0,%xmm5
- DB 102,15,104,222 ; punpckhbw %xmm6,%xmm3
- DB 102,15,56,48,197 ; pmovzxbw %xmm5,%xmm0
- DB 102,15,104,238 ; punpckhbw %xmm6,%xmm5
- DB 102,15,213,235 ; pmullw %xmm3,%xmm5
- DB 102,15,213,196 ; pmullw %xmm4,%xmm0
- DB 102,15,253,235 ; paddw %xmm3,%xmm5
- DB 102,15,253,196 ; paddw %xmm4,%xmm0
+ DB 102,65,15,104,253 ; punpckhbw %xmm13,%xmm7
+ DB 102,68,15,111,241 ; movdqa %xmm1,%xmm14
+ DB 102,69,15,104,245 ; punpckhbw %xmm13,%xmm14
+ DB 102,15,56,48,244 ; pmovzxbw %xmm4,%xmm6
+ DB 102,65,15,104,229 ; punpckhbw %xmm13,%xmm4
+ DB 102,15,213,231 ; pmullw %xmm7,%xmm4
+ DB 102,15,253,231 ; paddw %xmm7,%xmm4
+ DB 102,15,56,48,253 ; pmovzxbw %xmm5,%xmm7
+ DB 102,65,15,104,237 ; punpckhbw %xmm13,%xmm5
+ DB 102,65,15,213,238 ; pmullw %xmm14,%xmm5
+ DB 102,65,15,253,238 ; paddw %xmm14,%xmm5
+ DB 102,68,15,56,48,240 ; pmovzxbw %xmm0,%xmm14
+ DB 102,65,15,213,246 ; pmullw %xmm14,%xmm6
+ DB 102,65,15,253,246 ; paddw %xmm14,%xmm6
+ DB 102,68,15,56,48,241 ; pmovzxbw %xmm1,%xmm14
+ DB 102,65,15,213,254 ; pmullw %xmm14,%xmm7
+ DB 102,65,15,253,254 ; paddw %xmm14,%xmm7
DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,103,244 ; packuswb %xmm4,%xmm6
+ DB 102,15,103,253 ; packuswb %xmm5,%xmm7
+ DB 102,65,15,56,0,194 ; pshufb %xmm10,%xmm0
+ DB 102,65,15,56,0,202 ; pshufb %xmm10,%xmm1
+ DB 102,15,118,228 ; pcmpeqd %xmm4,%xmm4
+ DB 102,15,239,204 ; pxor %xmm4,%xmm1
+ DB 102,15,239,196 ; pxor %xmm4,%xmm0
+ DB 102,69,15,104,229 ; punpckhbw %xmm13,%xmm12
+ DB 102,69,15,104,221 ; punpckhbw %xmm13,%xmm11
+ DB 102,15,56,48,224 ; pmovzxbw %xmm0,%xmm4
+ DB 102,15,56,48,233 ; pmovzxbw %xmm1,%xmm5
+ DB 102,65,15,104,197 ; punpckhbw %xmm13,%xmm0
+ DB 102,65,15,104,205 ; punpckhbw %xmm13,%xmm1
+ DB 102,65,15,213,203 ; pmullw %xmm11,%xmm1
+ DB 102,65,15,213,196 ; pmullw %xmm12,%xmm0
+ DB 102,65,15,213,232 ; pmullw %xmm8,%xmm5
+ DB 102,65,15,213,225 ; pmullw %xmm9,%xmm4
+ DB 102,65,15,253,196 ; paddw %xmm12,%xmm0
+ DB 102,65,15,253,203 ; paddw %xmm11,%xmm1
+ DB 102,65,15,253,225 ; paddw %xmm9,%xmm4
+ DB 102,65,15,253,232 ; paddw %xmm8,%xmm5
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
- DB 102,15,103,197 ; packuswb %xmm5,%xmm0
- DB 102,15,252,194 ; paddb %xmm2,%xmm0
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,15,103,224 ; packuswb %xmm0,%xmm4
+ DB 102,15,103,233 ; packuswb %xmm1,%xmm5
+ DB 102,15,252,230 ; paddb %xmm6,%xmm4
+ DB 102,15,252,239 ; paddb %xmm7,%xmm5
DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 102,15,111,196 ; movdqa %xmm4,%xmm0
+ DB 102,15,111,205 ; movdqa %xmm5,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_dstatop_sse41_8bit
_sk_dstatop_sse41_8bit LABEL PROC
- DB 102,15,111,21,205,5,0,0 ; movdqa 0x5cd(%rip),%xmm2 # 1140 <_sk_xor__sse41_8bit+0x219>
- DB 102,15,111,216 ; movdqa %xmm0,%xmm3
- DB 102,15,56,0,218 ; pshufb %xmm2,%xmm3
+ DB 102,68,15,111,29,16,11,0,0 ; movdqa 0xb10(%rip),%xmm11 # 1fd0 <_sk_xor__sse41_8bit+0x338>
+ DB 102,68,15,111,233 ; movdqa %xmm1,%xmm13
+ DB 102,69,15,56,0,235 ; pshufb %xmm11,%xmm13
+ DB 102,68,15,111,248 ; movdqa %xmm0,%xmm15
+ DB 102,69,15,56,0,251 ; pshufb %xmm11,%xmm15
DB 102,69,15,239,192 ; pxor %xmm8,%xmm8
- DB 102,15,56,48,233 ; pmovzxbw %xmm1,%xmm5
- DB 102,15,111,241 ; movdqa %xmm1,%xmm6
- DB 102,15,111,249 ; movdqa %xmm1,%xmm7
+ DB 102,68,15,56,48,226 ; pmovzxbw %xmm2,%xmm12
+ DB 102,15,111,242 ; movdqa %xmm2,%xmm6
+ DB 102,15,111,234 ; movdqa %xmm2,%xmm5
+ DB 102,65,15,104,232 ; punpckhbw %xmm8,%xmm5
+ DB 102,68,15,56,48,243 ; pmovzxbw %xmm3,%xmm14
+ DB 102,15,111,251 ; movdqa %xmm3,%xmm7
+ DB 102,15,111,227 ; movdqa %xmm3,%xmm4
+ DB 102,65,15,104,224 ; punpckhbw %xmm8,%xmm4
+ DB 102,69,15,56,48,215 ; pmovzxbw %xmm15,%xmm10
+ DB 102,69,15,104,248 ; punpckhbw %xmm8,%xmm15
+ DB 102,68,15,213,253 ; pmullw %xmm5,%xmm15
+ DB 102,68,15,253,253 ; paddw %xmm5,%xmm15
+ DB 102,69,15,56,48,205 ; pmovzxbw %xmm13,%xmm9
+ DB 102,69,15,104,232 ; punpckhbw %xmm8,%xmm13
+ DB 102,68,15,213,236 ; pmullw %xmm4,%xmm13
+ DB 102,68,15,253,236 ; paddw %xmm4,%xmm13
+ DB 102,69,15,213,206 ; pmullw %xmm14,%xmm9
+ DB 102,69,15,213,212 ; pmullw %xmm12,%xmm10
+ DB 102,69,15,253,212 ; paddw %xmm12,%xmm10
+ DB 102,69,15,253,206 ; paddw %xmm14,%xmm9
+ DB 102,65,15,113,213,8 ; psrlw $0x8,%xmm13
+ DB 102,65,15,113,215,8 ; psrlw $0x8,%xmm15
+ DB 102,65,15,113,209,8 ; psrlw $0x8,%xmm9
+ DB 102,65,15,113,210,8 ; psrlw $0x8,%xmm10
+ DB 102,69,15,103,215 ; packuswb %xmm15,%xmm10
+ DB 102,69,15,103,205 ; packuswb %xmm13,%xmm9
+ DB 102,65,15,56,0,243 ; pshufb %xmm11,%xmm6
+ DB 102,65,15,56,0,251 ; pshufb %xmm11,%xmm7
+ DB 102,15,118,228 ; pcmpeqd %xmm4,%xmm4
+ DB 102,15,239,252 ; pxor %xmm4,%xmm7
+ DB 102,15,239,244 ; pxor %xmm4,%xmm6
+ DB 102,68,15,56,48,216 ; pmovzxbw %xmm0,%xmm11
+ DB 102,65,15,104,192 ; punpckhbw %xmm8,%xmm0
+ DB 102,68,15,56,48,225 ; pmovzxbw %xmm1,%xmm12
+ DB 102,65,15,104,200 ; punpckhbw %xmm8,%xmm1
+ DB 102,15,56,48,230 ; pmovzxbw %xmm6,%xmm4
+ DB 102,15,56,48,239 ; pmovzxbw %xmm7,%xmm5
+ DB 102,65,15,104,240 ; punpckhbw %xmm8,%xmm6
DB 102,65,15,104,248 ; punpckhbw %xmm8,%xmm7
- DB 102,15,56,48,227 ; pmovzxbw %xmm3,%xmm4
- DB 102,65,15,104,216 ; punpckhbw %xmm8,%xmm3
- DB 102,15,213,223 ; pmullw %xmm7,%xmm3
- DB 102,15,253,223 ; paddw %xmm7,%xmm3
- DB 102,15,213,229 ; pmullw %xmm5,%xmm4
- DB 102,15,253,229 ; paddw %xmm5,%xmm4
- DB 102,15,113,211,8 ; psrlw $0x8,%xmm3
+ DB 102,15,213,249 ; pmullw %xmm1,%xmm7
+ DB 102,15,213,240 ; pmullw %xmm0,%xmm6
+ DB 102,65,15,213,236 ; pmullw %xmm12,%xmm5
+ DB 102,65,15,213,227 ; pmullw %xmm11,%xmm4
+ DB 102,15,253,240 ; paddw %xmm0,%xmm6
+ DB 102,15,253,249 ; paddw %xmm1,%xmm7
+ DB 102,65,15,253,227 ; paddw %xmm11,%xmm4
+ DB 102,65,15,253,236 ; paddw %xmm12,%xmm5
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
- DB 102,15,103,227 ; packuswb %xmm3,%xmm4
- DB 102,15,56,0,242 ; pshufb %xmm2,%xmm6
- DB 102,15,118,219 ; pcmpeqd %xmm3,%xmm3
- DB 102,15,239,222 ; pxor %xmm6,%xmm3
- DB 102,15,56,48,232 ; pmovzxbw %xmm0,%xmm5
- DB 102,65,15,104,192 ; punpckhbw %xmm8,%xmm0
- DB 102,15,56,48,211 ; pmovzxbw %xmm3,%xmm2
- DB 102,65,15,104,216 ; punpckhbw %xmm8,%xmm3
- DB 102,15,213,216 ; pmullw %xmm0,%xmm3
- DB 102,15,213,213 ; pmullw %xmm5,%xmm2
- DB 102,15,253,216 ; paddw %xmm0,%xmm3
- DB 102,15,253,213 ; paddw %xmm5,%xmm2
- DB 102,15,113,211,8 ; psrlw $0x8,%xmm3
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,211 ; packuswb %xmm3,%xmm2
- DB 102,15,252,212 ; paddb %xmm4,%xmm2
+ DB 102,15,103,230 ; packuswb %xmm6,%xmm4
+ DB 102,15,103,239 ; packuswb %xmm7,%xmm5
+ DB 102,65,15,252,226 ; paddb %xmm10,%xmm4
+ DB 102,65,15,252,233 ; paddb %xmm9,%xmm5
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 102,15,111,194 ; movdqa %xmm2,%xmm0
+ DB 102,15,111,196 ; movdqa %xmm4,%xmm0
+ DB 102,15,111,205 ; movdqa %xmm5,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_srcin_sse41_8bit
_sk_srcin_sse41_8bit LABEL PROC
- DB 102,15,111,217 ; movdqa %xmm1,%xmm3
- DB 102,15,56,0,29,61,5,0,0 ; pshufb 0x53d(%rip),%xmm3 # 1150 <_sk_xor__sse41_8bit+0x229>
- DB 102,15,239,228 ; pxor %xmm4,%xmm4
- DB 102,15,56,48,232 ; pmovzxbw %xmm0,%xmm5
- DB 102,15,104,196 ; punpckhbw %xmm4,%xmm0
- DB 102,15,56,48,211 ; pmovzxbw %xmm3,%xmm2
- DB 102,15,104,220 ; punpckhbw %xmm4,%xmm3
- DB 102,15,213,216 ; pmullw %xmm0,%xmm3
- DB 102,15,213,213 ; pmullw %xmm5,%xmm2
- DB 102,15,253,216 ; paddw %xmm0,%xmm3
- DB 102,15,253,213 ; paddw %xmm5,%xmm2
- DB 102,15,113,211,8 ; psrlw $0x8,%xmm3
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,211 ; packuswb %xmm3,%xmm2
+ DB 102,15,111,225 ; movdqa %xmm1,%xmm4
+ DB 102,15,111,232 ; movdqa %xmm0,%xmm5
+ DB 102,15,111,5,215,9,0,0 ; movdqa 0x9d7(%rip),%xmm0 # 1fe0 <_sk_xor__sse41_8bit+0x348>
+ DB 102,15,111,243 ; movdqa %xmm3,%xmm6
+ DB 102,15,56,0,240 ; pshufb %xmm0,%xmm6
+ DB 102,15,111,250 ; movdqa %xmm2,%xmm7
+ DB 102,15,56,0,248 ; pshufb %xmm0,%xmm7
+ DB 102,69,15,239,192 ; pxor %xmm8,%xmm8
+ DB 102,68,15,56,48,205 ; pmovzxbw %xmm5,%xmm9
+ DB 102,65,15,104,232 ; punpckhbw %xmm8,%xmm5
+ DB 102,68,15,56,48,212 ; pmovzxbw %xmm4,%xmm10
+ DB 102,65,15,104,224 ; punpckhbw %xmm8,%xmm4
+ DB 102,15,56,48,199 ; pmovzxbw %xmm7,%xmm0
+ DB 102,15,56,48,206 ; pmovzxbw %xmm6,%xmm1
+ DB 102,65,15,104,248 ; punpckhbw %xmm8,%xmm7
+ DB 102,65,15,104,240 ; punpckhbw %xmm8,%xmm6
+ DB 102,15,213,244 ; pmullw %xmm4,%xmm6
+ DB 102,15,213,253 ; pmullw %xmm5,%xmm7
+ DB 102,65,15,213,202 ; pmullw %xmm10,%xmm1
+ DB 102,65,15,213,193 ; pmullw %xmm9,%xmm0
+ DB 102,15,253,253 ; paddw %xmm5,%xmm7
+ DB 102,15,253,244 ; paddw %xmm4,%xmm6
+ DB 102,65,15,253,193 ; paddw %xmm9,%xmm0
+ DB 102,65,15,253,202 ; paddw %xmm10,%xmm1
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
+ DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
+ DB 102,15,103,199 ; packuswb %xmm7,%xmm0
+ DB 102,15,103,206 ; packuswb %xmm6,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 102,15,111,194 ; movdqa %xmm2,%xmm0
DB 255,224 ; jmpq *%rax
PUBLIC _sk_dstin_sse41_8bit
_sk_dstin_sse41_8bit LABEL PROC
- DB 102,15,56,0,5,8,5,0,0 ; pshufb 0x508(%rip),%xmm0 # 1160 <_sk_xor__sse41_8bit+0x239>
- DB 102,15,239,219 ; pxor %xmm3,%xmm3
- DB 102,15,56,48,225 ; pmovzxbw %xmm1,%xmm4
- DB 102,15,111,233 ; movdqa %xmm1,%xmm5
- DB 102,15,104,235 ; punpckhbw %xmm3,%xmm5
- DB 102,15,56,48,208 ; pmovzxbw %xmm0,%xmm2
- DB 102,15,104,195 ; punpckhbw %xmm3,%xmm0
- DB 102,15,213,197 ; pmullw %xmm5,%xmm0
- DB 102,15,213,212 ; pmullw %xmm4,%xmm2
- DB 102,15,253,197 ; paddw %xmm5,%xmm0
- DB 102,15,253,212 ; paddw %xmm4,%xmm2
+ DB 102,15,111,37,90,9,0,0 ; movdqa 0x95a(%rip),%xmm4 # 1ff0 <_sk_xor__sse41_8bit+0x358>
+ DB 102,15,56,0,204 ; pshufb %xmm4,%xmm1
+ DB 102,15,56,0,196 ; pshufb %xmm4,%xmm0
+ DB 102,69,15,239,210 ; pxor %xmm10,%xmm10
+ DB 102,68,15,56,48,194 ; pmovzxbw %xmm2,%xmm8
+ DB 102,15,111,250 ; movdqa %xmm2,%xmm7
+ DB 102,65,15,104,250 ; punpckhbw %xmm10,%xmm7
+ DB 102,68,15,56,48,203 ; pmovzxbw %xmm3,%xmm9
+ DB 102,15,111,243 ; movdqa %xmm3,%xmm6
+ DB 102,65,15,104,242 ; punpckhbw %xmm10,%xmm6
+ DB 102,15,56,48,224 ; pmovzxbw %xmm0,%xmm4
+ DB 102,15,56,48,233 ; pmovzxbw %xmm1,%xmm5
+ DB 102,65,15,104,194 ; punpckhbw %xmm10,%xmm0
+ DB 102,65,15,104,202 ; punpckhbw %xmm10,%xmm1
+ DB 102,15,213,206 ; pmullw %xmm6,%xmm1
+ DB 102,15,213,199 ; pmullw %xmm7,%xmm0
+ DB 102,65,15,213,233 ; pmullw %xmm9,%xmm5
+ DB 102,65,15,213,224 ; pmullw %xmm8,%xmm4
+ DB 102,15,253,199 ; paddw %xmm7,%xmm0
+ DB 102,15,253,206 ; paddw %xmm6,%xmm1
+ DB 102,65,15,253,224 ; paddw %xmm8,%xmm4
+ DB 102,65,15,253,233 ; paddw %xmm9,%xmm5
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,208 ; packuswb %xmm0,%xmm2
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,15,103,224 ; packuswb %xmm0,%xmm4
+ DB 102,15,103,233 ; packuswb %xmm1,%xmm5
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 102,15,111,194 ; movdqa %xmm2,%xmm0
+ DB 102,15,111,196 ; movdqa %xmm4,%xmm0
+ DB 102,15,111,205 ; movdqa %xmm5,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_srcout_sse41_8bit
_sk_srcout_sse41_8bit LABEL PROC
- DB 102,15,111,209 ; movdqa %xmm1,%xmm2
- DB 102,15,56,0,21,203,4,0,0 ; pshufb 0x4cb(%rip),%xmm2 # 1170 <_sk_xor__sse41_8bit+0x249>
- DB 102,15,118,219 ; pcmpeqd %xmm3,%xmm3
- DB 102,15,239,218 ; pxor %xmm2,%xmm3
- DB 102,15,239,228 ; pxor %xmm4,%xmm4
- DB 102,15,56,48,232 ; pmovzxbw %xmm0,%xmm5
- DB 102,15,104,196 ; punpckhbw %xmm4,%xmm0
- DB 102,15,56,48,211 ; pmovzxbw %xmm3,%xmm2
- DB 102,15,104,220 ; punpckhbw %xmm4,%xmm3
- DB 102,15,213,216 ; pmullw %xmm0,%xmm3
- DB 102,15,213,213 ; pmullw %xmm5,%xmm2
- DB 102,15,253,216 ; paddw %xmm0,%xmm3
- DB 102,15,253,213 ; paddw %xmm5,%xmm2
- DB 102,15,113,211,8 ; psrlw $0x8,%xmm3
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,211 ; packuswb %xmm3,%xmm2
+ DB 102,15,111,225 ; movdqa %xmm1,%xmm4
+ DB 102,15,111,232 ; movdqa %xmm0,%xmm5
+ DB 102,15,111,5,205,8,0,0 ; movdqa 0x8cd(%rip),%xmm0 # 2000 <_sk_xor__sse41_8bit+0x368>
+ DB 102,15,111,250 ; movdqa %xmm2,%xmm7
+ DB 102,15,56,0,248 ; pshufb %xmm0,%xmm7
+ DB 102,15,111,243 ; movdqa %xmm3,%xmm6
+ DB 102,15,56,0,240 ; pshufb %xmm0,%xmm6
+ DB 102,15,118,192 ; pcmpeqd %xmm0,%xmm0
+ DB 102,15,239,240 ; pxor %xmm0,%xmm6
+ DB 102,15,239,248 ; pxor %xmm0,%xmm7
+ DB 102,69,15,239,192 ; pxor %xmm8,%xmm8
+ DB 102,68,15,56,48,205 ; pmovzxbw %xmm5,%xmm9
+ DB 102,65,15,104,232 ; punpckhbw %xmm8,%xmm5
+ DB 102,68,15,56,48,212 ; pmovzxbw %xmm4,%xmm10
+ DB 102,65,15,104,224 ; punpckhbw %xmm8,%xmm4
+ DB 102,15,56,48,199 ; pmovzxbw %xmm7,%xmm0
+ DB 102,15,56,48,206 ; pmovzxbw %xmm6,%xmm1
+ DB 102,65,15,104,248 ; punpckhbw %xmm8,%xmm7
+ DB 102,65,15,104,240 ; punpckhbw %xmm8,%xmm6
+ DB 102,15,213,244 ; pmullw %xmm4,%xmm6
+ DB 102,15,213,253 ; pmullw %xmm5,%xmm7
+ DB 102,65,15,213,202 ; pmullw %xmm10,%xmm1
+ DB 102,65,15,213,193 ; pmullw %xmm9,%xmm0
+ DB 102,15,253,253 ; paddw %xmm5,%xmm7
+ DB 102,15,253,244 ; paddw %xmm4,%xmm6
+ DB 102,65,15,253,193 ; paddw %xmm9,%xmm0
+ DB 102,65,15,253,202 ; paddw %xmm10,%xmm1
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
+ DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
+ DB 102,15,103,199 ; packuswb %xmm7,%xmm0
+ DB 102,15,103,206 ; packuswb %xmm6,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 102,15,111,194 ; movdqa %xmm2,%xmm0
DB 255,224 ; jmpq *%rax
PUBLIC _sk_dstout_sse41_8bit
_sk_dstout_sse41_8bit LABEL PROC
- DB 102,15,56,0,5,142,4,0,0 ; pshufb 0x48e(%rip),%xmm0 # 1180 <_sk_xor__sse41_8bit+0x259>
- DB 102,15,118,210 ; pcmpeqd %xmm2,%xmm2
- DB 102,15,239,208 ; pxor %xmm0,%xmm2
- DB 102,15,239,219 ; pxor %xmm3,%xmm3
- DB 102,15,56,48,225 ; pmovzxbw %xmm1,%xmm4
- DB 102,15,111,233 ; movdqa %xmm1,%xmm5
- DB 102,15,104,235 ; punpckhbw %xmm3,%xmm5
- DB 102,15,56,48,194 ; pmovzxbw %xmm2,%xmm0
- DB 102,15,104,211 ; punpckhbw %xmm3,%xmm2
- DB 102,15,213,213 ; pmullw %xmm5,%xmm2
- DB 102,15,213,196 ; pmullw %xmm4,%xmm0
- DB 102,15,253,213 ; paddw %xmm5,%xmm2
- DB 102,15,253,196 ; paddw %xmm4,%xmm0
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
+ DB 102,15,111,37,68,8,0,0 ; movdqa 0x844(%rip),%xmm4 # 2010 <_sk_xor__sse41_8bit+0x378>
+ DB 102,15,56,0,196 ; pshufb %xmm4,%xmm0
+ DB 102,15,56,0,204 ; pshufb %xmm4,%xmm1
+ DB 102,15,118,228 ; pcmpeqd %xmm4,%xmm4
+ DB 102,15,239,204 ; pxor %xmm4,%xmm1
+ DB 102,15,239,196 ; pxor %xmm4,%xmm0
+ DB 102,69,15,239,210 ; pxor %xmm10,%xmm10
+ DB 102,68,15,56,48,194 ; pmovzxbw %xmm2,%xmm8
+ DB 102,15,111,250 ; movdqa %xmm2,%xmm7
+ DB 102,65,15,104,250 ; punpckhbw %xmm10,%xmm7
+ DB 102,68,15,56,48,203 ; pmovzxbw %xmm3,%xmm9
+ DB 102,15,111,243 ; movdqa %xmm3,%xmm6
+ DB 102,65,15,104,242 ; punpckhbw %xmm10,%xmm6
+ DB 102,15,56,48,224 ; pmovzxbw %xmm0,%xmm4
+ DB 102,15,56,48,233 ; pmovzxbw %xmm1,%xmm5
+ DB 102,65,15,104,194 ; punpckhbw %xmm10,%xmm0
+ DB 102,65,15,104,202 ; punpckhbw %xmm10,%xmm1
+ DB 102,15,213,206 ; pmullw %xmm6,%xmm1
+ DB 102,15,213,199 ; pmullw %xmm7,%xmm0
+ DB 102,65,15,213,233 ; pmullw %xmm9,%xmm5
+ DB 102,65,15,213,224 ; pmullw %xmm8,%xmm4
+ DB 102,15,253,199 ; paddw %xmm7,%xmm0
+ DB 102,15,253,206 ; paddw %xmm6,%xmm1
+ DB 102,65,15,253,224 ; paddw %xmm8,%xmm4
+ DB 102,65,15,253,233 ; paddw %xmm9,%xmm5
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
- DB 102,15,103,194 ; packuswb %xmm2,%xmm0
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,15,103,224 ; packuswb %xmm0,%xmm4
+ DB 102,15,103,233 ; packuswb %xmm1,%xmm5
DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 102,15,111,196 ; movdqa %xmm4,%xmm0
+ DB 102,15,111,205 ; movdqa %xmm5,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_srcover_sse41_8bit
_sk_srcover_sse41_8bit LABEL PROC
- DB 102,15,111,208 ; movdqa %xmm0,%xmm2
- DB 102,15,56,0,21,77,4,0,0 ; pshufb 0x44d(%rip),%xmm2 # 1190 <_sk_xor__sse41_8bit+0x269>
- DB 102,15,239,219 ; pxor %xmm3,%xmm3
- DB 102,15,56,48,225 ; pmovzxbw %xmm1,%xmm4
- DB 102,15,252,193 ; paddb %xmm1,%xmm0
- DB 102,15,111,233 ; movdqa %xmm1,%xmm5
- DB 102,15,104,235 ; punpckhbw %xmm3,%xmm5
- DB 102,15,56,48,242 ; pmovzxbw %xmm2,%xmm6
- DB 102,15,104,211 ; punpckhbw %xmm3,%xmm2
- DB 102,15,213,213 ; pmullw %xmm5,%xmm2
- DB 102,15,213,244 ; pmullw %xmm4,%xmm6
- DB 102,15,253,213 ; paddw %xmm5,%xmm2
- DB 102,15,253,244 ; paddw %xmm4,%xmm6
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
+ DB 102,15,111,53,179,7,0,0 ; movdqa 0x7b3(%rip),%xmm6 # 2020 <_sk_xor__sse41_8bit+0x388>
+ DB 102,68,15,111,217 ; movdqa %xmm1,%xmm11
+ DB 102,68,15,56,0,222 ; pshufb %xmm6,%xmm11
+ DB 102,15,111,232 ; movdqa %xmm0,%xmm5
+ DB 102,15,56,0,238 ; pshufb %xmm6,%xmm5
+ DB 102,69,15,239,192 ; pxor %xmm8,%xmm8
+ DB 102,68,15,56,48,202 ; pmovzxbw %xmm2,%xmm9
+ DB 102,15,252,194 ; paddb %xmm2,%xmm0
+ DB 102,68,15,111,226 ; movdqa %xmm2,%xmm12
+ DB 102,69,15,104,224 ; punpckhbw %xmm8,%xmm12
+ DB 102,68,15,56,48,211 ; pmovzxbw %xmm3,%xmm10
+ DB 102,15,252,203 ; paddb %xmm3,%xmm1
+ DB 102,15,111,251 ; movdqa %xmm3,%xmm7
+ DB 102,65,15,104,248 ; punpckhbw %xmm8,%xmm7
+ DB 102,15,56,48,229 ; pmovzxbw %xmm5,%xmm4
+ DB 102,65,15,56,48,243 ; pmovzxbw %xmm11,%xmm6
+ DB 102,65,15,104,232 ; punpckhbw %xmm8,%xmm5
+ DB 102,69,15,104,216 ; punpckhbw %xmm8,%xmm11
+ DB 102,68,15,213,223 ; pmullw %xmm7,%xmm11
+ DB 102,65,15,213,236 ; pmullw %xmm12,%xmm5
+ DB 102,65,15,213,242 ; pmullw %xmm10,%xmm6
+ DB 102,65,15,213,225 ; pmullw %xmm9,%xmm4
+ DB 102,65,15,253,236 ; paddw %xmm12,%xmm5
+ DB 102,68,15,253,223 ; paddw %xmm7,%xmm11
+ DB 102,65,15,253,225 ; paddw %xmm9,%xmm4
+ DB 102,65,15,253,242 ; paddw %xmm10,%xmm6
+ DB 102,65,15,113,211,8 ; psrlw $0x8,%xmm11
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
- DB 102,15,103,242 ; packuswb %xmm2,%xmm6
- DB 102,15,248,198 ; psubb %xmm6,%xmm0
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,15,103,229 ; packuswb %xmm5,%xmm4
+ DB 102,65,15,103,243 ; packuswb %xmm11,%xmm6
+ DB 102,15,248,196 ; psubb %xmm4,%xmm0
+ DB 102,15,248,206 ; psubb %xmm6,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_dstover_sse41_8bit
_sk_dstover_sse41_8bit LABEL PROC
- DB 102,15,111,208 ; movdqa %xmm0,%xmm2
- DB 102,15,56,48,216 ; pmovzxbw %xmm0,%xmm3
- DB 102,15,252,193 ; paddb %xmm1,%xmm0
- DB 102,15,111,225 ; movdqa %xmm1,%xmm4
- DB 102,15,56,0,37,255,3,0,0 ; pshufb 0x3ff(%rip),%xmm4 # 11a0 <_sk_xor__sse41_8bit+0x279>
- DB 102,15,239,237 ; pxor %xmm5,%xmm5
- DB 102,15,104,213 ; punpckhbw %xmm5,%xmm2
- DB 102,15,56,48,244 ; pmovzxbw %xmm4,%xmm6
- DB 102,15,104,229 ; punpckhbw %xmm5,%xmm4
- DB 102,15,213,226 ; pmullw %xmm2,%xmm4
- DB 102,15,213,243 ; pmullw %xmm3,%xmm6
- DB 102,15,253,226 ; paddw %xmm2,%xmm4
- DB 102,15,253,243 ; paddw %xmm3,%xmm6
+ DB 102,68,15,111,5,19,7,0,0 ; movdqa 0x713(%rip),%xmm8 # 2030 <_sk_xor__sse41_8bit+0x398>
+ DB 102,68,15,111,209 ; movdqa %xmm1,%xmm10
+ DB 102,68,15,56,48,201 ; pmovzxbw %xmm1,%xmm9
+ DB 102,15,252,203 ; paddb %xmm3,%xmm1
+ DB 102,15,111,251 ; movdqa %xmm3,%xmm7
+ DB 102,65,15,56,0,248 ; pshufb %xmm8,%xmm7
+ DB 102,68,15,111,224 ; movdqa %xmm0,%xmm12
+ DB 102,68,15,56,48,216 ; pmovzxbw %xmm0,%xmm11
+ DB 102,15,252,194 ; paddb %xmm2,%xmm0
+ DB 102,15,111,234 ; movdqa %xmm2,%xmm5
+ DB 102,65,15,56,0,232 ; pshufb %xmm8,%xmm5
+ DB 102,69,15,239,192 ; pxor %xmm8,%xmm8
+ DB 102,69,15,104,224 ; punpckhbw %xmm8,%xmm12
+ DB 102,69,15,104,208 ; punpckhbw %xmm8,%xmm10
+ DB 102,15,56,48,245 ; pmovzxbw %xmm5,%xmm6
+ DB 102,15,56,48,231 ; pmovzxbw %xmm7,%xmm4
+ DB 102,65,15,104,232 ; punpckhbw %xmm8,%xmm5
+ DB 102,65,15,104,248 ; punpckhbw %xmm8,%xmm7
+ DB 102,65,15,213,250 ; pmullw %xmm10,%xmm7
+ DB 102,65,15,213,236 ; pmullw %xmm12,%xmm5
+ DB 102,65,15,213,225 ; pmullw %xmm9,%xmm4
+ DB 102,65,15,213,243 ; pmullw %xmm11,%xmm6
+ DB 102,65,15,253,236 ; paddw %xmm12,%xmm5
+ DB 102,65,15,253,250 ; paddw %xmm10,%xmm7
+ DB 102,65,15,253,243 ; paddw %xmm11,%xmm6
+ DB 102,65,15,253,225 ; paddw %xmm9,%xmm4
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
- DB 102,15,103,244 ; packuswb %xmm4,%xmm6
+ DB 102,15,103,245 ; packuswb %xmm5,%xmm6
+ DB 102,15,103,231 ; packuswb %xmm7,%xmm4
DB 102,15,248,198 ; psubb %xmm6,%xmm0
+ DB 102,15,248,204 ; psubb %xmm4,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_modulate_sse41_8bit
_sk_modulate_sse41_8bit LABEL PROC
- DB 102,15,239,219 ; pxor %xmm3,%xmm3
- DB 102,15,56,48,224 ; pmovzxbw %xmm0,%xmm4
- DB 102,15,104,195 ; punpckhbw %xmm3,%xmm0
- DB 102,15,56,48,209 ; pmovzxbw %xmm1,%xmm2
- DB 102,15,111,233 ; movdqa %xmm1,%xmm5
- DB 102,15,104,235 ; punpckhbw %xmm3,%xmm5
- DB 102,15,213,232 ; pmullw %xmm0,%xmm5
- DB 102,15,213,212 ; pmullw %xmm4,%xmm2
- DB 102,15,253,232 ; paddw %xmm0,%xmm5
- DB 102,15,253,212 ; paddw %xmm4,%xmm2
- DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,213 ; packuswb %xmm5,%xmm2
+ DB 102,15,111,225 ; movdqa %xmm1,%xmm4
+ DB 102,15,111,232 ; movdqa %xmm0,%xmm5
+ DB 102,69,15,239,210 ; pxor %xmm10,%xmm10
+ DB 102,68,15,56,48,197 ; pmovzxbw %xmm5,%xmm8
+ DB 102,65,15,104,234 ; punpckhbw %xmm10,%xmm5
+ DB 102,68,15,56,48,204 ; pmovzxbw %xmm4,%xmm9
+ DB 102,65,15,104,226 ; punpckhbw %xmm10,%xmm4
+ DB 102,15,56,48,194 ; pmovzxbw %xmm2,%xmm0
+ DB 102,15,111,250 ; movdqa %xmm2,%xmm7
+ DB 102,65,15,104,250 ; punpckhbw %xmm10,%xmm7
+ DB 102,15,56,48,203 ; pmovzxbw %xmm3,%xmm1
+ DB 102,15,111,243 ; movdqa %xmm3,%xmm6
+ DB 102,65,15,104,242 ; punpckhbw %xmm10,%xmm6
+ DB 102,15,213,244 ; pmullw %xmm4,%xmm6
+ DB 102,15,213,253 ; pmullw %xmm5,%xmm7
+ DB 102,65,15,213,201 ; pmullw %xmm9,%xmm1
+ DB 102,65,15,213,192 ; pmullw %xmm8,%xmm0
+ DB 102,15,253,253 ; paddw %xmm5,%xmm7
+ DB 102,15,253,244 ; paddw %xmm4,%xmm6
+ DB 102,65,15,253,192 ; paddw %xmm8,%xmm0
+ DB 102,65,15,253,201 ; paddw %xmm9,%xmm1
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
+ DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
+ DB 102,15,103,199 ; packuswb %xmm7,%xmm0
+ DB 102,15,103,206 ; packuswb %xmm6,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 102,15,111,194 ; movdqa %xmm2,%xmm0
DB 255,224 ; jmpq *%rax
PUBLIC _sk_multiply_sse41_8bit
_sk_multiply_sse41_8bit LABEL PROC
- DB 102,68,15,111,5,143,3,0,0 ; movdqa 0x38f(%rip),%xmm8 # 11b0 <_sk_xor__sse41_8bit+0x289>
- DB 102,15,111,225 ; movdqa %xmm1,%xmm4
- DB 102,15,56,48,209 ; pmovzxbw %xmm1,%xmm2
- DB 102,15,111,233 ; movdqa %xmm1,%xmm5
- DB 102,65,15,56,0,232 ; pshufb %xmm8,%xmm5
- DB 102,69,15,118,201 ; pcmpeqd %xmm9,%xmm9
- DB 102,65,15,239,233 ; pxor %xmm9,%xmm5
- DB 102,69,15,239,210 ; pxor %xmm10,%xmm10
- DB 102,15,111,216 ; movdqa %xmm0,%xmm3
- DB 102,65,15,104,218 ; punpckhbw %xmm10,%xmm3
- DB 102,15,56,48,240 ; pmovzxbw %xmm0,%xmm6
- DB 102,15,56,48,253 ; pmovzxbw %xmm5,%xmm7
- DB 102,65,15,104,234 ; punpckhbw %xmm10,%xmm5
- DB 102,15,213,235 ; pmullw %xmm3,%xmm5
- DB 102,15,213,254 ; pmullw %xmm6,%xmm7
- DB 102,15,253,235 ; paddw %xmm3,%xmm5
- DB 102,15,253,254 ; paddw %xmm6,%xmm7
+ DB 102,68,15,111,211 ; movdqa %xmm3,%xmm10
+ DB 102,15,111,218 ; movdqa %xmm2,%xmm3
+ DB 102,15,111,209 ; movdqa %xmm1,%xmm2
+ DB 102,15,111,200 ; movdqa %xmm0,%xmm1
+ DB 102,68,15,111,53,225,5,0,0 ; movdqa 0x5e1(%rip),%xmm14 # 2040 <_sk_xor__sse41_8bit+0x3a8>
+ DB 102,68,15,111,195 ; movdqa %xmm3,%xmm8
+ DB 102,15,111,235 ; movdqa %xmm3,%xmm5
+ DB 102,65,15,56,0,238 ; pshufb %xmm14,%xmm5
+ DB 102,65,15,111,250 ; movdqa %xmm10,%xmm7
+ DB 102,65,15,56,0,254 ; pshufb %xmm14,%xmm7
+ DB 102,69,15,118,255 ; pcmpeqd %xmm15,%xmm15
+ DB 102,65,15,239,255 ; pxor %xmm15,%xmm7
+ DB 102,65,15,239,239 ; pxor %xmm15,%xmm5
+ DB 102,15,239,228 ; pxor %xmm4,%xmm4
+ DB 102,68,15,111,233 ; movdqa %xmm1,%xmm13
+ DB 102,68,15,104,236 ; punpckhbw %xmm4,%xmm13
+ DB 102,68,15,111,226 ; movdqa %xmm2,%xmm12
+ DB 102,68,15,104,228 ; punpckhbw %xmm4,%xmm12
+ DB 102,68,15,56,48,217 ; pmovzxbw %xmm1,%xmm11
+ DB 102,68,15,56,48,202 ; pmovzxbw %xmm2,%xmm9
+ DB 102,15,56,48,245 ; pmovzxbw %xmm5,%xmm6
+ DB 102,15,104,236 ; punpckhbw %xmm4,%xmm5
+ DB 102,65,15,213,237 ; pmullw %xmm13,%xmm5
+ DB 102,65,15,213,243 ; pmullw %xmm11,%xmm6
+ DB 102,65,15,253,237 ; paddw %xmm13,%xmm5
+ DB 102,65,15,253,243 ; paddw %xmm11,%xmm6
DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,103,245 ; packuswb %xmm5,%xmm6
+ DB 102,15,56,48,199 ; pmovzxbw %xmm7,%xmm0
+ DB 102,15,104,252 ; punpckhbw %xmm4,%xmm7
+ DB 102,65,15,213,252 ; pmullw %xmm12,%xmm7
+ DB 102,65,15,213,193 ; pmullw %xmm9,%xmm0
+ DB 102,65,15,253,252 ; paddw %xmm12,%xmm7
+ DB 102,65,15,253,193 ; paddw %xmm9,%xmm0
DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
- DB 102,15,103,253 ; packuswb %xmm5,%xmm7
- DB 102,65,15,56,0,192 ; pshufb %xmm8,%xmm0
- DB 102,65,15,239,193 ; pxor %xmm9,%xmm0
- DB 102,65,15,104,226 ; punpckhbw %xmm10,%xmm4
- DB 102,15,56,48,232 ; pmovzxbw %xmm0,%xmm5
- DB 102,65,15,104,194 ; punpckhbw %xmm10,%xmm0
- DB 102,15,213,196 ; pmullw %xmm4,%xmm0
- DB 102,15,213,234 ; pmullw %xmm2,%xmm5
- DB 102,15,253,196 ; paddw %xmm4,%xmm0
- DB 102,15,253,234 ; paddw %xmm2,%xmm5
DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
- DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
- DB 102,15,103,232 ; packuswb %xmm0,%xmm5
- DB 102,15,252,239 ; paddb %xmm7,%xmm5
- DB 102,15,213,227 ; pmullw %xmm3,%xmm4
- DB 102,15,213,214 ; pmullw %xmm6,%xmm2
- DB 102,15,253,227 ; paddw %xmm3,%xmm4
- DB 102,15,253,214 ; paddw %xmm6,%xmm2
- DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,15,103,199 ; packuswb %xmm7,%xmm0
+ DB 102,65,15,111,234 ; movdqa %xmm10,%xmm5
+ DB 102,65,15,56,0,206 ; pshufb %xmm14,%xmm1
+ DB 102,65,15,56,0,214 ; pshufb %xmm14,%xmm2
+ DB 102,65,15,239,215 ; pxor %xmm15,%xmm2
+ DB 102,65,15,239,207 ; pxor %xmm15,%xmm1
+ DB 102,68,15,104,196 ; punpckhbw %xmm4,%xmm8
+ DB 102,68,15,104,212 ; punpckhbw %xmm4,%xmm10
+ DB 102,15,56,48,249 ; pmovzxbw %xmm1,%xmm7
+ DB 102,68,15,56,48,242 ; pmovzxbw %xmm2,%xmm14
+ DB 102,15,104,204 ; punpckhbw %xmm4,%xmm1
+ DB 102,15,104,212 ; punpckhbw %xmm4,%xmm2
+ DB 102,68,15,111,251 ; movdqa %xmm3,%xmm15
+ DB 102,65,15,56,48,231 ; pmovzxbw %xmm15,%xmm4
+ DB 102,65,15,213,200 ; pmullw %xmm8,%xmm1
+ DB 102,15,213,252 ; pmullw %xmm4,%xmm7
+ DB 102,65,15,253,200 ; paddw %xmm8,%xmm1
+ DB 102,15,253,252 ; paddw %xmm4,%xmm7
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,103,249 ; packuswb %xmm1,%xmm7
+ DB 102,15,111,221 ; movdqa %xmm5,%xmm3
+ DB 102,15,56,48,235 ; pmovzxbw %xmm3,%xmm5
+ DB 102,65,15,213,210 ; pmullw %xmm10,%xmm2
+ DB 102,68,15,213,245 ; pmullw %xmm5,%xmm14
+ DB 102,65,15,253,210 ; paddw %xmm10,%xmm2
+ DB 102,68,15,253,245 ; paddw %xmm5,%xmm14
DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,212 ; packuswb %xmm4,%xmm2
- DB 102,15,252,213 ; paddb %xmm5,%xmm2
+ DB 102,65,15,113,214,8 ; psrlw $0x8,%xmm14
+ DB 102,68,15,103,242 ; packuswb %xmm2,%xmm14
+ DB 102,68,15,252,240 ; paddb %xmm0,%xmm14
+ DB 102,15,252,254 ; paddb %xmm6,%xmm7
+ DB 102,69,15,213,197 ; pmullw %xmm13,%xmm8
+ DB 102,69,15,253,197 ; paddw %xmm13,%xmm8
+ DB 102,69,15,213,212 ; pmullw %xmm12,%xmm10
+ DB 102,69,15,253,212 ; paddw %xmm12,%xmm10
+ DB 102,65,15,213,227 ; pmullw %xmm11,%xmm4
+ DB 102,65,15,253,227 ; paddw %xmm11,%xmm4
+ DB 102,65,15,213,233 ; pmullw %xmm9,%xmm5
+ DB 102,65,15,253,233 ; paddw %xmm9,%xmm5
+ DB 102,65,15,113,208,8 ; psrlw $0x8,%xmm8
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,65,15,103,224 ; packuswb %xmm8,%xmm4
+ DB 102,65,15,113,210,8 ; psrlw $0x8,%xmm10
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,65,15,103,234 ; packuswb %xmm10,%xmm5
+ DB 102,15,252,231 ; paddb %xmm7,%xmm4
+ DB 102,65,15,252,238 ; paddb %xmm14,%xmm5
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 102,15,111,194 ; movdqa %xmm2,%xmm0
+ DB 102,65,15,111,215 ; movdqa %xmm15,%xmm2
+ DB 102,15,111,196 ; movdqa %xmm4,%xmm0
+ DB 102,15,111,205 ; movdqa %xmm5,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_screen_sse41_8bit
_sk_screen_sse41_8bit LABEL PROC
- DB 102,15,118,210 ; pcmpeqd %xmm2,%xmm2
- DB 102,15,239,208 ; pxor %xmm0,%xmm2
- DB 102,15,56,48,218 ; pmovzxbw %xmm2,%xmm3
- DB 102,15,239,228 ; pxor %xmm4,%xmm4
- DB 102,15,104,212 ; punpckhbw %xmm4,%xmm2
- DB 102,15,56,48,233 ; pmovzxbw %xmm1,%xmm5
- DB 102,15,111,241 ; movdqa %xmm1,%xmm6
- DB 102,15,104,244 ; punpckhbw %xmm4,%xmm6
- DB 102,15,213,242 ; pmullw %xmm2,%xmm6
- DB 102,15,213,235 ; pmullw %xmm3,%xmm5
- DB 102,15,253,235 ; paddw %xmm3,%xmm5
- DB 102,15,253,242 ; paddw %xmm2,%xmm6
- DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,69,15,118,228 ; pcmpeqd %xmm12,%xmm12
+ DB 102,68,15,111,217 ; movdqa %xmm1,%xmm11
+ DB 102,69,15,239,220 ; pxor %xmm12,%xmm11
+ DB 102,68,15,239,224 ; pxor %xmm0,%xmm12
+ DB 102,69,15,56,48,204 ; pmovzxbw %xmm12,%xmm9
+ DB 102,69,15,56,48,195 ; pmovzxbw %xmm11,%xmm8
+ DB 102,69,15,239,210 ; pxor %xmm10,%xmm10
+ DB 102,69,15,104,226 ; punpckhbw %xmm10,%xmm12
+ DB 102,69,15,104,218 ; punpckhbw %xmm10,%xmm11
+ DB 102,15,56,48,242 ; pmovzxbw %xmm2,%xmm6
+ DB 102,15,111,250 ; movdqa %xmm2,%xmm7
+ DB 102,65,15,104,250 ; punpckhbw %xmm10,%xmm7
+ DB 102,15,56,48,235 ; pmovzxbw %xmm3,%xmm5
+ DB 102,15,111,227 ; movdqa %xmm3,%xmm4
+ DB 102,65,15,104,226 ; punpckhbw %xmm10,%xmm4
+ DB 102,65,15,213,227 ; pmullw %xmm11,%xmm4
+ DB 102,65,15,213,252 ; pmullw %xmm12,%xmm7
+ DB 102,65,15,213,232 ; pmullw %xmm8,%xmm5
+ DB 102,65,15,213,241 ; pmullw %xmm9,%xmm6
+ DB 102,65,15,253,241 ; paddw %xmm9,%xmm6
+ DB 102,65,15,253,252 ; paddw %xmm12,%xmm7
+ DB 102,65,15,253,232 ; paddw %xmm8,%xmm5
+ DB 102,65,15,253,227 ; paddw %xmm11,%xmm4
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
- DB 102,15,103,238 ; packuswb %xmm6,%xmm5
- DB 102,15,252,197 ; paddb %xmm5,%xmm0
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,103,247 ; packuswb %xmm7,%xmm6
+ DB 102,15,103,236 ; packuswb %xmm4,%xmm5
+ DB 102,15,252,198 ; paddb %xmm6,%xmm0
+ DB 102,15,252,205 ; paddb %xmm5,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_xor__sse41_8bit
_sk_xor__sse41_8bit LABEL PROC
- DB 102,68,15,111,5,144,2,0,0 ; movdqa 0x290(%rip),%xmm8 # 11c0 <_sk_xor__sse41_8bit+0x299>
- DB 102,15,111,217 ; movdqa %xmm1,%xmm3
- DB 102,15,56,48,225 ; pmovzxbw %xmm1,%xmm4
- DB 102,15,111,233 ; movdqa %xmm1,%xmm5
- DB 102,65,15,56,0,232 ; pshufb %xmm8,%xmm5
- DB 102,69,15,118,201 ; pcmpeqd %xmm9,%xmm9
- DB 102,65,15,239,233 ; pxor %xmm9,%xmm5
- DB 102,15,239,255 ; pxor %xmm7,%xmm7
- DB 102,15,111,208 ; movdqa %xmm0,%xmm2
- DB 102,15,104,215 ; punpckhbw %xmm7,%xmm2
+ DB 102,68,15,111,21,175,3,0,0 ; movdqa 0x3af(%rip),%xmm10 # 2050 <_sk_xor__sse41_8bit+0x3b8>
+ DB 102,68,15,111,226 ; movdqa %xmm2,%xmm12
+ DB 102,68,15,56,48,194 ; pmovzxbw %xmm2,%xmm8
+ DB 102,15,111,234 ; movdqa %xmm2,%xmm5
+ DB 102,65,15,56,0,234 ; pshufb %xmm10,%xmm5
+ DB 102,68,15,111,235 ; movdqa %xmm3,%xmm13
+ DB 102,68,15,56,48,203 ; pmovzxbw %xmm3,%xmm9
+ DB 102,15,111,227 ; movdqa %xmm3,%xmm4
+ DB 102,65,15,56,0,226 ; pshufb %xmm10,%xmm4
+ DB 102,69,15,118,219 ; pcmpeqd %xmm11,%xmm11
+ DB 102,65,15,239,227 ; pxor %xmm11,%xmm4
+ DB 102,65,15,239,235 ; pxor %xmm11,%xmm5
+ DB 102,69,15,239,246 ; pxor %xmm14,%xmm14
+ DB 102,15,111,248 ; movdqa %xmm0,%xmm7
+ DB 102,65,15,104,254 ; punpckhbw %xmm14,%xmm7
+ DB 102,68,15,111,249 ; movdqa %xmm1,%xmm15
+ DB 102,69,15,104,254 ; punpckhbw %xmm14,%xmm15
DB 102,15,56,48,245 ; pmovzxbw %xmm5,%xmm6
- DB 102,15,104,239 ; punpckhbw %xmm7,%xmm5
- DB 102,15,213,234 ; pmullw %xmm2,%xmm5
- DB 102,15,253,234 ; paddw %xmm2,%xmm5
- DB 102,15,56,48,208 ; pmovzxbw %xmm0,%xmm2
- DB 102,15,213,242 ; pmullw %xmm2,%xmm6
- DB 102,15,253,242 ; paddw %xmm2,%xmm6
+ DB 102,65,15,104,238 ; punpckhbw %xmm14,%xmm5
+ DB 102,15,213,239 ; pmullw %xmm7,%xmm5
+ DB 102,15,253,239 ; paddw %xmm7,%xmm5
+ DB 102,15,56,48,252 ; pmovzxbw %xmm4,%xmm7
+ DB 102,65,15,104,230 ; punpckhbw %xmm14,%xmm4
+ DB 102,65,15,213,231 ; pmullw %xmm15,%xmm4
+ DB 102,65,15,253,231 ; paddw %xmm15,%xmm4
+ DB 102,68,15,56,48,248 ; pmovzxbw %xmm0,%xmm15
+ DB 102,65,15,213,247 ; pmullw %xmm15,%xmm6
+ DB 102,65,15,253,247 ; paddw %xmm15,%xmm6
+ DB 102,68,15,56,48,249 ; pmovzxbw %xmm1,%xmm15
+ DB 102,65,15,213,255 ; pmullw %xmm15,%xmm7
+ DB 102,65,15,253,255 ; paddw %xmm15,%xmm7
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
DB 102,15,103,245 ; packuswb %xmm5,%xmm6
- DB 102,65,15,56,0,192 ; pshufb %xmm8,%xmm0
- DB 102,65,15,239,193 ; pxor %xmm9,%xmm0
- DB 102,15,104,223 ; punpckhbw %xmm7,%xmm3
- DB 102,15,56,48,208 ; pmovzxbw %xmm0,%xmm2
- DB 102,15,104,199 ; punpckhbw %xmm7,%xmm0
- DB 102,15,213,195 ; pmullw %xmm3,%xmm0
- DB 102,15,213,212 ; pmullw %xmm4,%xmm2
- DB 102,15,253,195 ; paddw %xmm3,%xmm0
- DB 102,15,253,212 ; paddw %xmm4,%xmm2
+ DB 102,15,103,252 ; packuswb %xmm4,%xmm7
+ DB 102,65,15,56,0,194 ; pshufb %xmm10,%xmm0
+ DB 102,65,15,56,0,202 ; pshufb %xmm10,%xmm1
+ DB 102,65,15,239,203 ; pxor %xmm11,%xmm1
+ DB 102,65,15,239,195 ; pxor %xmm11,%xmm0
+ DB 102,69,15,104,230 ; punpckhbw %xmm14,%xmm12
+ DB 102,69,15,104,238 ; punpckhbw %xmm14,%xmm13
+ DB 102,15,56,48,224 ; pmovzxbw %xmm0,%xmm4
+ DB 102,15,56,48,233 ; pmovzxbw %xmm1,%xmm5
+ DB 102,65,15,104,198 ; punpckhbw %xmm14,%xmm0
+ DB 102,65,15,104,206 ; punpckhbw %xmm14,%xmm1
+ DB 102,65,15,213,205 ; pmullw %xmm13,%xmm1
+ DB 102,65,15,213,196 ; pmullw %xmm12,%xmm0
+ DB 102,65,15,213,233 ; pmullw %xmm9,%xmm5
+ DB 102,65,15,213,224 ; pmullw %xmm8,%xmm4
+ DB 102,65,15,253,196 ; paddw %xmm12,%xmm0
+ DB 102,65,15,253,205 ; paddw %xmm13,%xmm1
+ DB 102,65,15,253,224 ; paddw %xmm8,%xmm4
+ DB 102,65,15,253,233 ; paddw %xmm9,%xmm5
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,208 ; packuswb %xmm0,%xmm2
- DB 102,15,252,214 ; paddb %xmm6,%xmm2
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,15,103,224 ; packuswb %xmm0,%xmm4
+ DB 102,15,103,233 ; packuswb %xmm1,%xmm5
+ DB 102,15,252,230 ; paddb %xmm6,%xmm4
+ DB 102,15,252,239 ; paddb %xmm7,%xmm5
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 102,15,111,194 ; movdqa %xmm2,%xmm0
+ DB 102,15,111,196 ; movdqa %xmm4,%xmm0
+ DB 102,15,111,205 ; movdqa %xmm5,%xmm1
DB 255,224 ; jmpq *%rax
ALIGN 4
DB 0,0 ; add %al,(%rax)
- DB 127,67 ; jg 100f <_sk_xor__sse41_8bit+0xe8>
+ DB 127,67 ; jg 1e27 <_sk_xor__sse41_8bit+0x18f>
DB 0,0 ; add %al,(%rax)
- DB 127,67 ; jg 1013 <_sk_xor__sse41_8bit+0xec>
+ DB 127,67 ; jg 1e2b <_sk_xor__sse41_8bit+0x193>
DB 0,0 ; add %al,(%rax)
- DB 127,67 ; jg 1017 <_sk_xor__sse41_8bit+0xf0>
+ DB 127,67 ; jg 1e2f <_sk_xor__sse41_8bit+0x197>
ALIGN 16
DB 0,0 ; add %al,(%rax)
@@ -41480,54 +43499,87 @@ ALIGN 16
DB 5,4,7,10,9 ; add $0x90a0704,%eax
DB 8,11 ; or %cl,(%rbx)
DB 14 ; (bad)
- DB 13,12,15,0,4 ; or $0x4000f0c,%eax
+ DB 13,12,15,255,0 ; or $0xff0f0c,%eax
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 3,255 ; add %edi,%edi
+ DB 7 ; (bad)
+ DB 255,11 ; decl (%rbx)
+ DB 255,15 ; decl (%rdi)
+ DB 255,11 ; decl (%rbx)
+ DB 255,15 ; decl (%rdi)
+ DB 255,15 ; decl (%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,0 ; incl (%rax)
+ DB 2,4,6 ; add (%rsi,%rax,1),%al
DB 0,0 ; add %al,(%rax)
DB 0,0 ; add %al,(%rax)
DB 0,0 ; add %al,(%rax)
DB 0,0 ; add %al,(%rax)
DB 0,0 ; add %al,(%rax)
DB 0,0 ; add %al,(%rax)
+ DB 0,2 ; add %al,(%rdx)
DB 0,0 ; add %al,(%rax)
- DB 0,4,8 ; add %al,(%rax,%rcx,1)
- DB 12,0 ; or $0x0,%al
DB 0,0 ; add %al,(%rax)
DB 0,0 ; add %al,(%rax)
DB 0,0 ; add %al,(%rax)
DB 0,0 ; add %al,(%rax)
DB 0,0 ; add %al,(%rax)
- DB 0,255 ; add %bh,%bh
DB 0,0 ; add %al,(%rax)
- DB 0,255 ; add %bh,%bh
+ DB 0,2 ; add %al,(%rdx)
+ DB 4,6 ; add $0x6,%al
+ DB 8,10 ; or %cl,(%rdx)
+ DB 12,14 ; or $0xe,%al
DB 0,0 ; add %al,(%rax)
- DB 0,255 ; add %bh,%bh
DB 0,0 ; add %al,(%rax)
- DB 0,255 ; add %bh,%bh
DB 0,0 ; add %al,(%rax)
- DB 0,1 ; add %al,(%rcx)
+ DB 0,0 ; add %al,(%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
DB 1,1 ; add %eax,(%rcx)
- DB 0,1 ; add %al,(%rcx)
+ DB 1,0 ; add %eax,(%rax)
DB 1,1 ; add %eax,(%rcx)
- DB 0,1 ; add %al,(%rcx)
+ DB 1,0 ; add %eax,(%rax)
DB 1,1 ; add %eax,(%rcx)
- DB 0,1 ; add %al,(%rcx)
+ DB 1,0 ; add %eax,(%rax)
DB 1,1 ; add %eax,(%rcx)
+ DB 1,0 ; add %eax,(%rax)
DB 0,0 ; add %al,(%rax)
+ DB 0,255 ; add %bh,%bh
DB 0,0 ; add %al,(%rax)
- DB 255,0 ; incl (%rax)
+ DB 0,255 ; add %bh,%bh
DB 0,0 ; add %al,(%rax)
- DB 255,0 ; incl (%rax)
+ DB 0,255 ; add %bh,%bh
DB 0,0 ; add %al,(%rax)
+ DB 0,255 ; add %bh,%bh
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
DB 255,0 ; incl (%rax)
- DB 0,0 ; add %al,(%rax)
- DB 255 ; (bad)
DB 255,0 ; incl (%rax)
- DB 0,0 ; add %al,(%rax)
DB 255,0 ; incl (%rax)
- DB 0,0 ; add %al,(%rax)
DB 255,0 ; incl (%rax)
- DB 0,0 ; add %al,(%rax)
DB 255,0 ; incl (%rax)
- DB 0,0 ; add %al,(%rax)
DB 1,1 ; add %eax,(%rcx)
DB 1,0 ; add %eax,(%rax)
DB 1,1 ; add %eax,(%rcx)
@@ -41563,14 +43615,7 @@ ALIGN 16
DB 255,0 ; incl (%rax)
DB 255,0 ; incl (%rax)
DB 255,0 ; incl (%rax)
- DB 255,0 ; incl (%rax)
- DB 0,0 ; add %al,(%rax)
- DB 0,4,4 ; add %al,(%rsp,%rax,1)
- DB 4,4 ; add $0x4,%al
- DB 8,8 ; or %cl,(%rax)
- DB 8,8 ; or %cl,(%rax)
- DB 12,12 ; or $0xc,%al
- DB 12,12 ; or $0xc,%al
+ DB 255 ; (bad)
DB 255,0 ; incl (%rax)
DB 255,0 ; incl (%rax)
DB 255,0 ; incl (%rax)
@@ -41579,14 +43624,59 @@ ALIGN 16
DB 255,0 ; incl (%rax)
DB 255,0 ; incl (%rax)
DB 255,0 ; incl (%rax)
+ DB 8,8 ; or %cl,(%rax)
+ DB 8,8 ; or %cl,(%rax)
+ DB 10,10 ; or (%rdx),%cl
+ DB 10,10 ; or (%rdx),%cl
+ DB 12,12 ; or $0xc,%al
+ DB 12,12 ; or $0xc,%al
+ DB 14 ; (bad)
+ DB 14 ; (bad)
+ DB 14 ; (bad)
+ DB 14 ; (bad)
DB 0,0 ; add %al,(%rax)
DB 0,0 ; add %al,(%rax)
+ DB 2,2 ; add (%rdx),%al
+ DB 2,2 ; add (%rdx),%al
DB 4,4 ; add $0x4,%al
DB 4,4 ; add $0x4,%al
+ DB 6 ; (bad)
+ DB 6 ; (bad)
+ DB 6 ; (bad)
+ DB 6 ; (bad)
+ DB 0,128,2,128,4,128 ; add %al,-0x7ffb7ffe(%rax)
+ DB 6 ; (bad)
+ DB 128,4,128,5 ; addb $0x5,(%rax,%rax,4)
+ DB 128,6,128 ; addb $0x80,(%rsi)
+ DB 7 ; (bad)
+ DB 128,255,0 ; cmp $0x0,%bh
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
DB 8,8 ; or %cl,(%rax)
DB 8,8 ; or %cl,(%rax)
+ DB 10,10 ; or (%rdx),%cl
+ DB 10,10 ; or (%rdx),%cl
DB 12,12 ; or $0xc,%al
DB 12,12 ; or $0xc,%al
+ DB 14 ; (bad)
+ DB 14 ; (bad)
+ DB 14 ; (bad)
+ DB 14 ; (bad)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 2,2 ; add (%rdx),%al
+ DB 2,2 ; add (%rdx),%al
+ DB 4,4 ; add $0x4,%al
+ DB 4,4 ; add $0x4,%al
+ DB 6 ; (bad)
+ DB 6 ; (bad)
+ DB 6 ; (bad)
+ DB 6 ; (bad)
DB 0,0 ; add %al,(%rax)
DB 0,255 ; add %bh,%bh
DB 0,0 ; add %al,(%rax)
@@ -41741,7 +43831,7 @@ _sk_start_pipeline_sse2_8bit LABEL PROC
DB 77,57,207 ; cmp %r9,%r15
DB 15,131,138,0,0,0 ; jae 10b <_sk_start_pipeline_sse2_8bit+0x10b>
DB 72,139,133,24,255,255,255 ; mov -0xe8(%rbp),%rax
- DB 72,141,64,4 ; lea 0x4(%rax),%rax
+ DB 72,141,64,8 ; lea 0x8(%rax),%rax
DB 72,137,133,248,254,255,255 ; mov %rax,-0x108(%rbp)
DB 76,141,165,0,255,255,255 ; lea -0x100(%rbp),%r12
DB 72,57,157,248,254,255,255 ; cmp %rbx,-0x108(%rbp)
@@ -41754,9 +43844,9 @@ _sk_start_pipeline_sse2_8bit LABEL PROC
DB 76,137,246 ; mov %r14,%rsi
DB 65,255,213 ; callq *%r13
DB 72,139,141,0,255,255,255 ; mov -0x100(%rbp),%rcx
- DB 72,141,65,4 ; lea 0x4(%rcx),%rax
+ DB 72,141,65,8 ; lea 0x8(%rcx),%rax
DB 72,137,133,0,255,255,255 ; mov %rax,-0x100(%rbp)
- DB 72,131,193,8 ; add $0x8,%rcx
+ DB 72,131,193,16 ; add $0x10,%rcx
DB 72,57,217 ; cmp %rbx,%rcx
DB 118,220 ; jbe c3 <_sk_start_pipeline_sse2_8bit+0xc3>
DB 72,137,217 ; mov %rbx,%rcx
@@ -41800,76 +43890,111 @@ _sk_uniform_color_sse2_8bit LABEL PROC
DB 102,15,110,64,16 ; movd 0x10(%rax),%xmm0
DB 102,15,112,192,0 ; pshufd $0x0,%xmm0,%xmm0
DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 102,15,111,200 ; movdqa %xmm0,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_set_rgb_sse2_8bit
_sk_set_rgb_sse2_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 243,15,16,21,4,17,0,0 ; movss 0x1104(%rip),%xmm2 # 1280 <_sk_xor__sse2_8bit+0xc2>
- DB 243,15,16,24 ; movss (%rax),%xmm3
- DB 243,15,89,218 ; mulss %xmm2,%xmm3
- DB 243,72,15,44,203 ; cvttss2si %xmm3,%rcx
- DB 243,15,16,88,4 ; movss 0x4(%rax),%xmm3
- DB 243,15,89,218 ; mulss %xmm2,%xmm3
- DB 243,72,15,44,211 ; cvttss2si %xmm3,%rdx
+ DB 243,15,16,37,8,34,0,0 ; movss 0x2208(%rip),%xmm4 # 2388 <_sk_xor__sse2_8bit+0x1a5>
+ DB 243,15,16,40 ; movss (%rax),%xmm5
+ DB 243,15,89,236 ; mulss %xmm4,%xmm5
+ DB 243,72,15,44,205 ; cvttss2si %xmm5,%rcx
+ DB 243,15,16,104,4 ; movss 0x4(%rax),%xmm5
+ DB 243,15,89,236 ; mulss %xmm4,%xmm5
+ DB 243,72,15,44,213 ; cvttss2si %xmm5,%rdx
DB 193,226,8 ; shl $0x8,%edx
DB 9,202 ; or %ecx,%edx
- DB 243,15,89,80,8 ; mulss 0x8(%rax),%xmm2
- DB 243,72,15,44,194 ; cvttss2si %xmm2,%rax
+ DB 243,15,89,96,8 ; mulss 0x8(%rax),%xmm4
+ DB 243,72,15,44,196 ; cvttss2si %xmm4,%rax
DB 193,224,16 ; shl $0x10,%eax
DB 9,208 ; or %edx,%eax
- DB 102,15,110,208 ; movd %eax,%xmm2
- DB 102,15,112,210,0 ; pshufd $0x0,%xmm2,%xmm2
- DB 102,15,219,5,212,16,0,0 ; pand 0x10d4(%rip),%xmm0 # 1290 <_sk_xor__sse2_8bit+0xd2>
- DB 102,15,235,194 ; por %xmm2,%xmm0
+ DB 102,15,110,224 ; movd %eax,%xmm4
+ DB 102,15,112,228,0 ; pshufd $0x0,%xmm4,%xmm4
+ DB 102,15,111,45,224,33,0,0 ; movdqa 0x21e0(%rip),%xmm5 # 23a0 <_sk_xor__sse2_8bit+0x1bd>
+ DB 102,15,219,205 ; pand %xmm5,%xmm1
+ DB 102,15,219,197 ; pand %xmm5,%xmm0
+ DB 102,15,235,196 ; por %xmm4,%xmm0
+ DB 102,15,235,204 ; por %xmm4,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_premul_sse2_8bit
_sk_premul_sse2_8bit LABEL PROC
- DB 242,15,112,208,231 ; pshuflw $0xe7,%xmm0,%xmm2
- DB 243,15,112,210,231 ; pshufhw $0xe7,%xmm2,%xmm2
- DB 102,15,112,210,232 ; pshufd $0xe8,%xmm2,%xmm2
- DB 102,15,96,210 ; punpcklbw %xmm2,%xmm2
- DB 242,15,112,210,95 ; pshuflw $0x5f,%xmm2,%xmm2
- DB 243,15,112,218,95 ; pshufhw $0x5f,%xmm2,%xmm3
- DB 102,15,235,29,183,16,0,0 ; por 0x10b7(%rip),%xmm3 # 12a0 <_sk_xor__sse2_8bit+0xe2>
- DB 102,15,239,228 ; pxor %xmm4,%xmm4
- DB 102,15,111,208 ; movdqa %xmm0,%xmm2
- DB 102,15,96,212 ; punpcklbw %xmm4,%xmm2
- DB 102,15,104,196 ; punpckhbw %xmm4,%xmm0
- DB 102,15,111,235 ; movdqa %xmm3,%xmm5
- DB 102,15,96,236 ; punpcklbw %xmm4,%xmm5
- DB 102,15,104,220 ; punpckhbw %xmm4,%xmm3
- DB 102,15,213,216 ; pmullw %xmm0,%xmm3
- DB 102,15,213,234 ; pmullw %xmm2,%xmm5
- DB 102,15,253,213 ; paddw %xmm5,%xmm2
- DB 102,15,253,216 ; paddw %xmm0,%xmm3
- DB 102,15,113,211,8 ; psrlw $0x8,%xmm3
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,211 ; packuswb %xmm3,%xmm2
+ DB 102,68,15,111,192 ; movdqa %xmm0,%xmm8
+ DB 242,65,15,112,192,231 ; pshuflw $0xe7,%xmm8,%xmm0
+ DB 243,15,112,192,231 ; pshufhw $0xe7,%xmm0,%xmm0
+ DB 102,15,112,192,232 ; pshufd $0xe8,%xmm0,%xmm0
+ DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
+ DB 242,15,112,192,95 ; pshuflw $0x5f,%xmm0,%xmm0
+ DB 243,15,112,240,95 ; pshufhw $0x5f,%xmm0,%xmm6
+ DB 242,15,112,193,231 ; pshuflw $0xe7,%xmm1,%xmm0
+ DB 243,15,112,192,231 ; pshufhw $0xe7,%xmm0,%xmm0
+ DB 102,15,112,192,232 ; pshufd $0xe8,%xmm0,%xmm0
+ DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
+ DB 242,15,112,192,95 ; pshuflw $0x5f,%xmm0,%xmm0
+ DB 243,15,112,248,95 ; pshufhw $0x5f,%xmm0,%xmm7
+ DB 102,15,111,5,148,33,0,0 ; movdqa 0x2194(%rip),%xmm0 # 23b0 <_sk_xor__sse2_8bit+0x1cd>
+ DB 102,15,235,248 ; por %xmm0,%xmm7
+ DB 102,15,235,240 ; por %xmm0,%xmm6
+ DB 102,69,15,239,201 ; pxor %xmm9,%xmm9
+ DB 102,65,15,111,192 ; movdqa %xmm8,%xmm0
+ DB 102,65,15,96,193 ; punpcklbw %xmm9,%xmm0
+ DB 102,69,15,104,193 ; punpckhbw %xmm9,%xmm8
+ DB 102,15,111,233 ; movdqa %xmm1,%xmm5
+ DB 102,65,15,96,233 ; punpcklbw %xmm9,%xmm5
+ DB 102,65,15,104,201 ; punpckhbw %xmm9,%xmm1
+ DB 102,15,111,230 ; movdqa %xmm6,%xmm4
+ DB 102,65,15,96,225 ; punpcklbw %xmm9,%xmm4
+ DB 102,65,15,104,241 ; punpckhbw %xmm9,%xmm6
+ DB 102,68,15,111,215 ; movdqa %xmm7,%xmm10
+ DB 102,69,15,96,209 ; punpcklbw %xmm9,%xmm10
+ DB 102,65,15,104,249 ; punpckhbw %xmm9,%xmm7
+ DB 102,15,213,249 ; pmullw %xmm1,%xmm7
+ DB 102,68,15,213,213 ; pmullw %xmm5,%xmm10
+ DB 102,65,15,213,240 ; pmullw %xmm8,%xmm6
+ DB 102,15,213,224 ; pmullw %xmm0,%xmm4
+ DB 102,15,253,196 ; paddw %xmm4,%xmm0
+ DB 102,65,15,253,240 ; paddw %xmm8,%xmm6
+ DB 102,65,15,253,234 ; paddw %xmm10,%xmm5
+ DB 102,15,253,249 ; paddw %xmm1,%xmm7
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
+ DB 102,15,103,198 ; packuswb %xmm6,%xmm0
+ DB 102,15,103,239 ; packuswb %xmm7,%xmm5
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 102,15,111,194 ; movdqa %xmm2,%xmm0
+ DB 102,15,111,205 ; movdqa %xmm5,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_swap_rb_sse2_8bit
_sk_swap_rb_sse2_8bit LABEL PROC
- DB 102,15,239,210 ; pxor %xmm2,%xmm2
- DB 102,15,111,216 ; movdqa %xmm0,%xmm3
- DB 102,15,104,218 ; punpckhbw %xmm2,%xmm3
- DB 242,15,112,219,198 ; pshuflw $0xc6,%xmm3,%xmm3
- DB 243,15,112,219,198 ; pshufhw $0xc6,%xmm3,%xmm3
- DB 102,15,96,194 ; punpcklbw %xmm2,%xmm0
+ DB 102,15,239,228 ; pxor %xmm4,%xmm4
+ DB 102,15,111,232 ; movdqa %xmm0,%xmm5
+ DB 102,15,104,236 ; punpckhbw %xmm4,%xmm5
+ DB 242,15,112,237,198 ; pshuflw $0xc6,%xmm5,%xmm5
+ DB 243,15,112,237,198 ; pshufhw $0xc6,%xmm5,%xmm5
+ DB 102,15,96,196 ; punpcklbw %xmm4,%xmm0
DB 242,15,112,192,198 ; pshuflw $0xc6,%xmm0,%xmm0
DB 243,15,112,192,198 ; pshufhw $0xc6,%xmm0,%xmm0
- DB 102,15,103,195 ; packuswb %xmm3,%xmm0
+ DB 102,15,103,197 ; packuswb %xmm5,%xmm0
+ DB 102,15,111,233 ; movdqa %xmm1,%xmm5
+ DB 102,15,104,236 ; punpckhbw %xmm4,%xmm5
+ DB 242,15,112,237,198 ; pshuflw $0xc6,%xmm5,%xmm5
+ DB 243,15,112,237,198 ; pshufhw $0xc6,%xmm5,%xmm5
+ DB 102,15,96,204 ; punpcklbw %xmm4,%xmm1
+ DB 242,15,112,201,198 ; pshuflw $0xc6,%xmm1,%xmm1
+ DB 243,15,112,201,198 ; pshufhw $0xc6,%xmm1,%xmm1
+ DB 102,15,103,205 ; packuswb %xmm5,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_invert_sse2_8bit
_sk_invert_sse2_8bit LABEL PROC
- DB 102,15,118,210 ; pcmpeqd %xmm2,%xmm2
- DB 102,15,239,194 ; pxor %xmm2,%xmm0
+ DB 102,15,118,228 ; pcmpeqd %xmm4,%xmm4
+ DB 102,15,239,196 ; pxor %xmm4,%xmm0
+ DB 102,15,239,204 ; pxor %xmm4,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
@@ -41884,24 +44009,53 @@ _sk_load_8888_sse2_8bit LABEL PROC
DB 72,193,226,2 ; shl $0x2,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,10 ; jne 28e <_sk_load_8888_sse2_8bit+0x2b>
- DB 243,66,15,111,4,130 ; movdqu (%rdx,%r8,4),%xmm0
+ DB 117,16 ; jne 33c <_sk_load_8888_sse2_8bit+0x31>
+ DB 66,15,16,76,130,16 ; movups 0x10(%rdx,%r8,4),%xmm1
+ DB 102,66,15,16,4,130 ; movupd (%rdx,%r8,4),%xmm0
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,36 ; je 2bc <_sk_load_8888_sse2_8bit+0x59>
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,201 ; pxor %xmm1,%xmm1
DB 102,15,239,192 ; pxor %xmm0,%xmm0
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,18 ; je 2b4 <_sk_load_8888_sse2_8bit+0x51>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,226 ; jne 28a <_sk_load_8888_sse2_8bit+0x27>
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,231 ; ja 338 <_sk_load_8888_sse2_8bit+0x2d>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,80,0,0,0 ; lea 0x50(%rip),%rcx # 3ac <_sk_load_8888_sse2_8bit+0xa1>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 243,66,15,16,4,130 ; movss (%rdx,%r8,4),%xmm0
+ DB 235,203 ; jmp 338 <_sk_load_8888_sse2_8bit+0x2d>
DB 102,66,15,110,68,130,8 ; movd 0x8(%rdx,%r8,4),%xmm0
DB 102,15,112,192,69 ; pshufd $0x45,%xmm0,%xmm0
DB 102,66,15,18,4,130 ; movlpd (%rdx,%r8,4),%xmm0
- DB 235,206 ; jmp 28a <_sk_load_8888_sse2_8bit+0x27>
- DB 102,66,15,110,4,130 ; movd (%rdx,%r8,4),%xmm0
- DB 235,198 ; jmp 28a <_sk_load_8888_sse2_8bit+0x27>
+ DB 235,183 ; jmp 338 <_sk_load_8888_sse2_8bit+0x2d>
+ DB 102,66,15,110,68,130,24 ; movd 0x18(%rdx,%r8,4),%xmm0
+ DB 102,15,112,200,69 ; pshufd $0x45,%xmm0,%xmm1
+ DB 243,66,15,16,68,130,20 ; movss 0x14(%rdx,%r8,4),%xmm0
+ DB 15,198,193,0 ; shufps $0x0,%xmm1,%xmm0
+ DB 15,198,193,226 ; shufps $0xe2,%xmm1,%xmm0
+ DB 15,40,200 ; movaps %xmm0,%xmm1
+ DB 243,66,15,16,68,130,16 ; movss 0x10(%rdx,%r8,4),%xmm0
+ DB 243,15,16,200 ; movss %xmm0,%xmm1
+ DB 235,134 ; jmp 332 <_sk_load_8888_sse2_8bit+0x27>
+ DB 185,255,255,255,205 ; mov $0xcdffffff,%ecx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,193 ; inc %ecx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,134,255,255,255,243 ; incl -0xc000001(%rsi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,225 ; jmpq *%rcx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,213 ; callq *%rbp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_load_8888_dst_sse2_8bit
_sk_load_8888_dst_sse2_8bit LABEL PROC
@@ -41914,53 +44068,117 @@ _sk_load_8888_dst_sse2_8bit LABEL PROC
DB 72,193,226,2 ; shl $0x2,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,10 ; jne 2ef <_sk_load_8888_dst_sse2_8bit+0x2b>
- DB 243,66,15,111,12,130 ; movdqu (%rdx,%r8,4),%xmm1
+ DB 117,16 ; jne 3f9 <_sk_load_8888_dst_sse2_8bit+0x31>
+ DB 66,15,16,92,130,16 ; movups 0x10(%rdx,%r8,4),%xmm3
+ DB 102,66,15,16,20,130 ; movupd (%rdx,%r8,4),%xmm2
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,36 ; je 31d <_sk_load_8888_dst_sse2_8bit+0x59>
- DB 102,15,239,201 ; pxor %xmm1,%xmm1
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,18 ; je 315 <_sk_load_8888_dst_sse2_8bit+0x51>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,226 ; jne 2eb <_sk_load_8888_dst_sse2_8bit+0x27>
- DB 102,66,15,110,76,130,8 ; movd 0x8(%rdx,%r8,4),%xmm1
- DB 102,15,112,201,69 ; pshufd $0x45,%xmm1,%xmm1
- DB 102,66,15,18,12,130 ; movlpd (%rdx,%r8,4),%xmm1
- DB 235,206 ; jmp 2eb <_sk_load_8888_dst_sse2_8bit+0x27>
- DB 102,66,15,110,12,130 ; movd (%rdx,%r8,4),%xmm1
- DB 235,198 ; jmp 2eb <_sk_load_8888_dst_sse2_8bit+0x27>
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,219 ; pxor %xmm3,%xmm3
+ DB 102,15,239,210 ; pxor %xmm2,%xmm2
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,231 ; ja 3f5 <_sk_load_8888_dst_sse2_8bit+0x2d>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,83,0,0,0 ; lea 0x53(%rip),%rcx # 46c <_sk_load_8888_dst_sse2_8bit+0xa4>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 243,66,15,16,20,130 ; movss (%rdx,%r8,4),%xmm2
+ DB 235,203 ; jmp 3f5 <_sk_load_8888_dst_sse2_8bit+0x2d>
+ DB 102,66,15,110,84,130,8 ; movd 0x8(%rdx,%r8,4),%xmm2
+ DB 102,15,112,210,69 ; pshufd $0x45,%xmm2,%xmm2
+ DB 102,66,15,18,20,130 ; movlpd (%rdx,%r8,4),%xmm2
+ DB 235,183 ; jmp 3f5 <_sk_load_8888_dst_sse2_8bit+0x2d>
+ DB 102,66,15,110,84,130,24 ; movd 0x18(%rdx,%r8,4),%xmm2
+ DB 102,15,112,218,69 ; pshufd $0x45,%xmm2,%xmm3
+ DB 243,66,15,16,84,130,20 ; movss 0x14(%rdx,%r8,4),%xmm2
+ DB 15,198,211,0 ; shufps $0x0,%xmm3,%xmm2
+ DB 15,198,211,226 ; shufps $0xe2,%xmm3,%xmm2
+ DB 15,40,218 ; movaps %xmm2,%xmm3
+ DB 243,66,15,16,84,130,16 ; movss 0x10(%rdx,%r8,4),%xmm2
+ DB 243,15,16,218 ; movss %xmm2,%xmm3
+ DB 235,134 ; jmp 3ef <_sk_load_8888_dst_sse2_8bit+0x27>
+ DB 15,31,0 ; nopl (%rax)
+ DB 182,255 ; mov $0xff,%dh
+ DB 255 ; (bad)
+ DB 255,202 ; dec %edx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 190,255,255,255,131 ; mov $0x83ffffff,%esi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,240 ; push %rax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 222,255 ; fdivrp %st,%st(7)
+ DB 255 ; (bad)
+ DB 255,210 ; callq *%rdx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_store_8888_sse2_8bit
_sk_store_8888_sse2_8bit LABEL PROC
- DB 76,99,7 ; movslq (%rdi),%r8
- DB 76,139,79,16 ; mov 0x10(%rdi),%r9
+ DB 76,99,15 ; movslq (%rdi),%r9
+ DB 76,139,71,16 ; mov 0x10(%rdi),%r8
DB 72,173 ; lods %ds:(%rsi),%rax
DB 72,99,72,8 ; movslq 0x8(%rax),%rcx
DB 72,99,87,8 ; movslq 0x8(%rdi),%rdx
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,193,226,2 ; shl $0x2,%rdx
DB 72,3,16 ; add (%rax),%rdx
- DB 77,133,201 ; test %r9,%r9
- DB 117,10 ; jne 350 <_sk_store_8888_sse2_8bit+0x2b>
- DB 243,66,15,127,4,130 ; movdqu %xmm0,(%rdx,%r8,4)
+ DB 77,133,192 ; test %r8,%r8
+ DB 117,17 ; jne 4ba <_sk_store_8888_sse2_8bit+0x32>
+ DB 243,66,15,127,4,138 ; movdqu %xmm0,(%rdx,%r9,4)
+ DB 243,66,15,127,76,138,16 ; movdqu %xmm1,0x10(%rdx,%r9,4)
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,32 ; je 37a <_sk_store_8888_sse2_8bit+0x55>
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,18 ; je 372 <_sk_store_8888_sse2_8bit+0x4d>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,230 ; jne 34c <_sk_store_8888_sse2_8bit+0x27>
- DB 102,15,112,208,78 ; pshufd $0x4e,%xmm0,%xmm2
- DB 102,66,15,126,84,130,8 ; movd %xmm2,0x8(%rdx,%r8,4)
- DB 102,66,15,214,4,130 ; movq %xmm0,(%rdx,%r8,4)
- DB 235,210 ; jmp 34c <_sk_store_8888_sse2_8bit+0x27>
- DB 102,66,15,126,4,130 ; movd %xmm0,(%rdx,%r8,4)
- DB 235,202 ; jmp 34c <_sk_store_8888_sse2_8bit+0x27>
+ DB 65,128,224,7 ; and $0x7,%r8b
+ DB 65,254,200 ; dec %r8b
+ DB 65,128,248,6 ; cmp $0x6,%r8b
+ DB 119,239 ; ja 4b6 <_sk_store_8888_sse2_8bit+0x2e>
+ DB 65,15,182,192 ; movzbl %r8b,%eax
+ DB 72,141,13,78,0,0,0 ; lea 0x4e(%rip),%rcx # 520 <_sk_store_8888_sse2_8bit+0x98>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 102,66,15,126,4,138 ; movd %xmm0,(%rdx,%r9,4)
+ DB 235,211 ; jmp 4b6 <_sk_store_8888_sse2_8bit+0x2e>
+ DB 102,15,112,224,78 ; pshufd $0x4e,%xmm0,%xmm4
+ DB 102,66,15,126,100,138,8 ; movd %xmm4,0x8(%rdx,%r9,4)
+ DB 102,66,15,214,4,138 ; movq %xmm0,(%rdx,%r9,4)
+ DB 235,191 ; jmp 4b6 <_sk_store_8888_sse2_8bit+0x2e>
+ DB 102,15,112,225,78 ; pshufd $0x4e,%xmm1,%xmm4
+ DB 102,66,15,126,100,138,24 ; movd %xmm4,0x18(%rdx,%r9,4)
+ DB 102,15,112,225,229 ; pshufd $0xe5,%xmm1,%xmm4
+ DB 102,66,15,126,100,138,20 ; movd %xmm4,0x14(%rdx,%r9,4)
+ DB 102,66,15,126,76,138,16 ; movd %xmm1,0x10(%rdx,%r9,4)
+ DB 243,66,15,127,4,138 ; movdqu %xmm0,(%rdx,%r9,4)
+ DB 235,152 ; jmp 4b6 <_sk_store_8888_sse2_8bit+0x2e>
+ DB 102,144 ; xchg %ax,%ax
+ DB 187,255,255,255,207 ; mov $0xcfffffff,%ebx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,195 ; inc %ebx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,246 ; push %rsi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 239 ; out %eax,(%dx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,227 ; jmpq *%rbx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,215 ; callq *%rdi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_load_bgra_sse2_8bit
_sk_load_bgra_sse2_8bit LABEL PROC
@@ -41973,33 +44191,76 @@ _sk_load_bgra_sse2_8bit LABEL PROC
DB 72,193,226,2 ; shl $0x2,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,50 ; jne 3d5 <_sk_load_bgra_sse2_8bit+0x53>
- DB 243,66,15,111,4,130 ; movdqu (%rdx,%r8,4),%xmm0
- DB 102,15,239,210 ; pxor %xmm2,%xmm2
- DB 102,15,111,216 ; movdqa %xmm0,%xmm3
- DB 102,15,104,218 ; punpckhbw %xmm2,%xmm3
- DB 242,15,112,219,198 ; pshuflw $0xc6,%xmm3,%xmm3
- DB 243,15,112,219,198 ; pshufhw $0xc6,%xmm3,%xmm3
- DB 102,15,96,194 ; punpcklbw %xmm2,%xmm0
+ DB 117,92 ; jne 5b9 <_sk_load_bgra_sse2_8bit+0x7d>
+ DB 66,15,16,76,130,16 ; movups 0x10(%rdx,%r8,4),%xmm1
+ DB 102,66,15,16,4,130 ; movupd (%rdx,%r8,4),%xmm0
+ DB 102,15,239,228 ; pxor %xmm4,%xmm4
+ DB 102,15,40,232 ; movapd %xmm0,%xmm5
+ DB 102,15,104,236 ; punpckhbw %xmm4,%xmm5
+ DB 242,15,112,237,198 ; pshuflw $0xc6,%xmm5,%xmm5
+ DB 243,15,112,237,198 ; pshufhw $0xc6,%xmm5,%xmm5
+ DB 102,15,96,196 ; punpcklbw %xmm4,%xmm0
DB 242,15,112,192,198 ; pshuflw $0xc6,%xmm0,%xmm0
DB 243,15,112,192,198 ; pshufhw $0xc6,%xmm0,%xmm0
- DB 102,15,103,195 ; packuswb %xmm3,%xmm0
+ DB 102,15,103,197 ; packuswb %xmm5,%xmm0
+ DB 102,15,111,233 ; movdqa %xmm1,%xmm5
+ DB 102,15,104,236 ; punpckhbw %xmm4,%xmm5
+ DB 242,15,112,237,198 ; pshuflw $0xc6,%xmm5,%xmm5
+ DB 243,15,112,237,198 ; pshufhw $0xc6,%xmm5,%xmm5
+ DB 102,15,96,204 ; punpcklbw %xmm4,%xmm1
+ DB 242,15,112,201,198 ; pshuflw $0xc6,%xmm1,%xmm1
+ DB 243,15,112,201,198 ; pshufhw $0xc6,%xmm1,%xmm1
+ DB 102,15,103,205 ; packuswb %xmm5,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,36 ; je 403 <_sk_load_bgra_sse2_8bit+0x81>
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,201 ; pxor %xmm1,%xmm1
DB 102,15,239,192 ; pxor %xmm0,%xmm0
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,18 ; je 3fb <_sk_load_bgra_sse2_8bit+0x79>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,186 ; jne 3a9 <_sk_load_bgra_sse2_8bit+0x27>
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,155 ; ja 569 <_sk_load_bgra_sse2_8bit+0x2d>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,91,0,0,0 ; lea 0x5b(%rip),%rcx # 634 <_sk_load_bgra_sse2_8bit+0xf8>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 243,66,15,16,4,130 ; movss (%rdx,%r8,4),%xmm0
+ DB 233,124,255,255,255 ; jmpq 569 <_sk_load_bgra_sse2_8bit+0x2d>
DB 102,66,15,110,68,130,8 ; movd 0x8(%rdx,%r8,4),%xmm0
DB 102,15,112,192,69 ; pshufd $0x45,%xmm0,%xmm0
DB 102,66,15,18,4,130 ; movlpd (%rdx,%r8,4),%xmm0
- DB 235,166 ; jmp 3a9 <_sk_load_bgra_sse2_8bit+0x27>
- DB 102,66,15,110,4,130 ; movd (%rdx,%r8,4),%xmm0
- DB 235,158 ; jmp 3a9 <_sk_load_bgra_sse2_8bit+0x27>
+ DB 233,101,255,255,255 ; jmpq 569 <_sk_load_bgra_sse2_8bit+0x2d>
+ DB 102,66,15,110,68,130,24 ; movd 0x18(%rdx,%r8,4),%xmm0
+ DB 102,15,112,200,69 ; pshufd $0x45,%xmm0,%xmm1
+ DB 243,66,15,16,68,130,20 ; movss 0x14(%rdx,%r8,4),%xmm0
+ DB 15,198,193,0 ; shufps $0x0,%xmm1,%xmm0
+ DB 15,198,193,226 ; shufps $0xe2,%xmm1,%xmm0
+ DB 15,40,200 ; movaps %xmm0,%xmm1
+ DB 243,66,15,16,68,130,16 ; movss 0x10(%rdx,%r8,4),%xmm0
+ DB 243,15,16,200 ; movss %xmm0,%xmm1
+ DB 233,49,255,255,255 ; jmpq 563 <_sk_load_bgra_sse2_8bit+0x27>
+ DB 102,144 ; xchg %ax,%ax
+ DB 174 ; scas %es:(%rdi),%al
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,197 ; inc %ebp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 185,255,255,255,47 ; mov $0x2fffffff,%ecx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 238 ; out %al,(%dx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 220,255 ; fdivr %st,%st(7)
+ DB 255 ; (bad)
+ DB 255,208 ; callq *%rax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_load_bgra_dst_sse2_8bit
_sk_load_bgra_dst_sse2_8bit LABEL PROC
@@ -42012,72 +44273,155 @@ _sk_load_bgra_dst_sse2_8bit LABEL PROC
DB 72,193,226,2 ; shl $0x2,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,50 ; jne 45e <_sk_load_bgra_dst_sse2_8bit+0x53>
- DB 243,66,15,111,12,130 ; movdqu (%rdx,%r8,4),%xmm1
- DB 102,15,239,210 ; pxor %xmm2,%xmm2
- DB 102,15,111,217 ; movdqa %xmm1,%xmm3
- DB 102,15,104,218 ; punpckhbw %xmm2,%xmm3
+ DB 117,92 ; jne 6cd <_sk_load_bgra_dst_sse2_8bit+0x7d>
+ DB 66,15,16,92,130,16 ; movups 0x10(%rdx,%r8,4),%xmm3
+ DB 102,66,15,16,20,130 ; movupd (%rdx,%r8,4),%xmm2
+ DB 102,15,239,228 ; pxor %xmm4,%xmm4
+ DB 102,15,40,234 ; movapd %xmm2,%xmm5
+ DB 102,15,104,236 ; punpckhbw %xmm4,%xmm5
+ DB 242,15,112,237,198 ; pshuflw $0xc6,%xmm5,%xmm5
+ DB 243,15,112,237,198 ; pshufhw $0xc6,%xmm5,%xmm5
+ DB 102,15,96,212 ; punpcklbw %xmm4,%xmm2
+ DB 242,15,112,210,198 ; pshuflw $0xc6,%xmm2,%xmm2
+ DB 243,15,112,210,198 ; pshufhw $0xc6,%xmm2,%xmm2
+ DB 102,15,103,213 ; packuswb %xmm5,%xmm2
+ DB 102,15,111,235 ; movdqa %xmm3,%xmm5
+ DB 102,15,104,236 ; punpckhbw %xmm4,%xmm5
+ DB 242,15,112,237,198 ; pshuflw $0xc6,%xmm5,%xmm5
+ DB 243,15,112,237,198 ; pshufhw $0xc6,%xmm5,%xmm5
+ DB 102,15,96,220 ; punpcklbw %xmm4,%xmm3
DB 242,15,112,219,198 ; pshuflw $0xc6,%xmm3,%xmm3
DB 243,15,112,219,198 ; pshufhw $0xc6,%xmm3,%xmm3
- DB 102,15,96,202 ; punpcklbw %xmm2,%xmm1
- DB 242,15,112,201,198 ; pshuflw $0xc6,%xmm1,%xmm1
- DB 243,15,112,201,198 ; pshufhw $0xc6,%xmm1,%xmm1
- DB 102,15,103,203 ; packuswb %xmm3,%xmm1
+ DB 102,15,103,221 ; packuswb %xmm5,%xmm3
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,36 ; je 48c <_sk_load_bgra_dst_sse2_8bit+0x81>
- DB 102,15,239,201 ; pxor %xmm1,%xmm1
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,18 ; je 484 <_sk_load_bgra_dst_sse2_8bit+0x79>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,186 ; jne 432 <_sk_load_bgra_dst_sse2_8bit+0x27>
- DB 102,66,15,110,76,130,8 ; movd 0x8(%rdx,%r8,4),%xmm1
- DB 102,15,112,201,69 ; pshufd $0x45,%xmm1,%xmm1
- DB 102,66,15,18,12,130 ; movlpd (%rdx,%r8,4),%xmm1
- DB 235,166 ; jmp 432 <_sk_load_bgra_dst_sse2_8bit+0x27>
- DB 102,66,15,110,12,130 ; movd (%rdx,%r8,4),%xmm1
- DB 235,158 ; jmp 432 <_sk_load_bgra_dst_sse2_8bit+0x27>
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,219 ; pxor %xmm3,%xmm3
+ DB 102,15,239,210 ; pxor %xmm2,%xmm2
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,155 ; ja 67d <_sk_load_bgra_dst_sse2_8bit+0x2d>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,91,0,0,0 ; lea 0x5b(%rip),%rcx # 748 <_sk_load_bgra_dst_sse2_8bit+0xf8>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 243,66,15,16,20,130 ; movss (%rdx,%r8,4),%xmm2
+ DB 233,124,255,255,255 ; jmpq 67d <_sk_load_bgra_dst_sse2_8bit+0x2d>
+ DB 102,66,15,110,84,130,8 ; movd 0x8(%rdx,%r8,4),%xmm2
+ DB 102,15,112,210,69 ; pshufd $0x45,%xmm2,%xmm2
+ DB 102,66,15,18,20,130 ; movlpd (%rdx,%r8,4),%xmm2
+ DB 233,101,255,255,255 ; jmpq 67d <_sk_load_bgra_dst_sse2_8bit+0x2d>
+ DB 102,66,15,110,84,130,24 ; movd 0x18(%rdx,%r8,4),%xmm2
+ DB 102,15,112,218,69 ; pshufd $0x45,%xmm2,%xmm3
+ DB 243,66,15,16,84,130,20 ; movss 0x14(%rdx,%r8,4),%xmm2
+ DB 15,198,211,0 ; shufps $0x0,%xmm3,%xmm2
+ DB 15,198,211,226 ; shufps $0xe2,%xmm3,%xmm2
+ DB 15,40,218 ; movaps %xmm2,%xmm3
+ DB 243,66,15,16,84,130,16 ; movss 0x10(%rdx,%r8,4),%xmm2
+ DB 243,15,16,218 ; movss %xmm2,%xmm3
+ DB 233,49,255,255,255 ; jmpq 677 <_sk_load_bgra_dst_sse2_8bit+0x27>
+ DB 102,144 ; xchg %ax,%ax
+ DB 174 ; scas %es:(%rdi),%al
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,197 ; inc %ebp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 185,255,255,255,47 ; mov $0x2fffffff,%ecx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 238 ; out %al,(%dx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 220,255 ; fdivr %st,%st(7)
+ DB 255 ; (bad)
+ DB 255,208 ; callq *%rax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_store_bgra_sse2_8bit
_sk_store_bgra_sse2_8bit LABEL PROC
- DB 76,99,7 ; movslq (%rdi),%r8
- DB 76,139,79,16 ; mov 0x10(%rdi),%r9
+ DB 76,99,15 ; movslq (%rdi),%r9
+ DB 76,139,71,16 ; mov 0x10(%rdi),%r8
DB 72,173 ; lods %ds:(%rsi),%rax
DB 72,99,72,8 ; movslq 0x8(%rax),%rcx
DB 72,99,87,8 ; movslq 0x8(%rdi),%rdx
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,193,226,2 ; shl $0x2,%rdx
DB 72,3,16 ; add (%rax),%rdx
- DB 102,15,239,210 ; pxor %xmm2,%xmm2
- DB 102,15,111,216 ; movdqa %xmm0,%xmm3
- DB 102,15,104,218 ; punpckhbw %xmm2,%xmm3
- DB 242,15,112,219,198 ; pshuflw $0xc6,%xmm3,%xmm3
- DB 243,15,112,219,198 ; pshufhw $0xc6,%xmm3,%xmm3
- DB 102,15,111,224 ; movdqa %xmm0,%xmm4
- DB 102,15,96,226 ; punpcklbw %xmm2,%xmm4
- DB 242,15,112,212,198 ; pshuflw $0xc6,%xmm4,%xmm2
- DB 243,15,112,210,198 ; pshufhw $0xc6,%xmm2,%xmm2
- DB 102,15,103,211 ; packuswb %xmm3,%xmm2
- DB 77,133,201 ; test %r9,%r9
- DB 117,10 ; jne 4eb <_sk_store_bgra_sse2_8bit+0x57>
- DB 243,66,15,127,20,130 ; movdqu %xmm2,(%rdx,%r8,4)
+ DB 102,15,239,237 ; pxor %xmm5,%xmm5
+ DB 102,15,111,225 ; movdqa %xmm1,%xmm4
+ DB 102,15,104,229 ; punpckhbw %xmm5,%xmm4
+ DB 242,15,112,228,198 ; pshuflw $0xc6,%xmm4,%xmm4
+ DB 243,15,112,244,198 ; pshufhw $0xc6,%xmm4,%xmm6
+ DB 102,15,111,225 ; movdqa %xmm1,%xmm4
+ DB 102,15,96,229 ; punpcklbw %xmm5,%xmm4
+ DB 242,15,112,228,198 ; pshuflw $0xc6,%xmm4,%xmm4
+ DB 243,15,112,228,198 ; pshufhw $0xc6,%xmm4,%xmm4
+ DB 102,15,103,230 ; packuswb %xmm6,%xmm4
+ DB 102,15,111,240 ; movdqa %xmm0,%xmm6
+ DB 102,15,104,245 ; punpckhbw %xmm5,%xmm6
+ DB 242,15,112,246,198 ; pshuflw $0xc6,%xmm6,%xmm6
+ DB 243,15,112,246,198 ; pshufhw $0xc6,%xmm6,%xmm6
+ DB 102,15,111,248 ; movdqa %xmm0,%xmm7
+ DB 102,15,96,253 ; punpcklbw %xmm5,%xmm7
+ DB 242,15,112,239,198 ; pshuflw $0xc6,%xmm7,%xmm5
+ DB 243,15,112,237,198 ; pshufhw $0xc6,%xmm5,%xmm5
+ DB 102,15,103,238 ; packuswb %xmm6,%xmm5
+ DB 77,133,192 ; test %r8,%r8
+ DB 117,17 ; jne 7ea <_sk_store_bgra_sse2_8bit+0x86>
+ DB 243,66,15,127,44,138 ; movdqu %xmm5,(%rdx,%r9,4)
+ DB 243,66,15,127,100,138,16 ; movdqu %xmm4,0x10(%rdx,%r9,4)
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,32 ; je 515 <_sk_store_bgra_sse2_8bit+0x81>
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,18 ; je 50d <_sk_store_bgra_sse2_8bit+0x79>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,230 ; jne 4e7 <_sk_store_bgra_sse2_8bit+0x53>
- DB 102,15,112,218,78 ; pshufd $0x4e,%xmm2,%xmm3
- DB 102,66,15,126,92,130,8 ; movd %xmm3,0x8(%rdx,%r8,4)
- DB 102,66,15,214,20,130 ; movq %xmm2,(%rdx,%r8,4)
- DB 235,210 ; jmp 4e7 <_sk_store_bgra_sse2_8bit+0x53>
- DB 102,66,15,126,20,130 ; movd %xmm2,(%rdx,%r8,4)
- DB 235,202 ; jmp 4e7 <_sk_store_bgra_sse2_8bit+0x53>
+ DB 65,128,224,7 ; and $0x7,%r8b
+ DB 65,254,200 ; dec %r8b
+ DB 65,128,248,6 ; cmp $0x6,%r8b
+ DB 119,239 ; ja 7e6 <_sk_store_bgra_sse2_8bit+0x82>
+ DB 65,15,182,192 ; movzbl %r8b,%eax
+ DB 72,141,13,78,0,0,0 ; lea 0x4e(%rip),%rcx # 850 <_sk_store_bgra_sse2_8bit+0xec>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 102,66,15,126,44,138 ; movd %xmm5,(%rdx,%r9,4)
+ DB 235,211 ; jmp 7e6 <_sk_store_bgra_sse2_8bit+0x82>
+ DB 102,15,112,229,78 ; pshufd $0x4e,%xmm5,%xmm4
+ DB 102,66,15,126,100,138,8 ; movd %xmm4,0x8(%rdx,%r9,4)
+ DB 102,66,15,214,44,138 ; movq %xmm5,(%rdx,%r9,4)
+ DB 235,191 ; jmp 7e6 <_sk_store_bgra_sse2_8bit+0x82>
+ DB 102,15,112,244,78 ; pshufd $0x4e,%xmm4,%xmm6
+ DB 102,66,15,126,116,138,24 ; movd %xmm6,0x18(%rdx,%r9,4)
+ DB 102,15,112,244,229 ; pshufd $0xe5,%xmm4,%xmm6
+ DB 102,66,15,126,116,138,20 ; movd %xmm6,0x14(%rdx,%r9,4)
+ DB 102,66,15,126,100,138,16 ; movd %xmm4,0x10(%rdx,%r9,4)
+ DB 243,66,15,127,44,138 ; movdqu %xmm5,(%rdx,%r9,4)
+ DB 235,152 ; jmp 7e6 <_sk_store_bgra_sse2_8bit+0x82>
+ DB 102,144 ; xchg %ax,%ax
+ DB 187,255,255,255,207 ; mov $0xcfffffff,%ebx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,195 ; inc %ebx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,246 ; push %rsi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 239 ; out %eax,(%dx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,227 ; jmpq *%rbx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,215 ; callq *%rdi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_load_a8_sse2_8bit
_sk_load_a8_sse2_8bit LABEL PROC
@@ -42089,33 +44433,70 @@ _sk_load_a8_sse2_8bit LABEL PROC
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,23 ; jne 551 <_sk_load_a8_sse2_8bit+0x34>
- DB 102,66,15,110,4,2 ; movd (%rdx,%r8,1),%xmm0
+ DB 117,48 ; jne 8b9 <_sk_load_a8_sse2_8bit+0x4d>
+ DB 243,66,15,126,4,2 ; movq (%rdx,%r8,1),%xmm0
DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
- DB 102,15,97,192 ; punpcklwd %xmm0,%xmm0
+ DB 102,15,84,5,37,27,0,0 ; andpd 0x1b25(%rip),%xmm0 # 23c0 <_sk_xor__sse2_8bit+0x1dd>
+ DB 102,15,239,228 ; pxor %xmm4,%xmm4
+ DB 102,15,40,200 ; movapd %xmm0,%xmm1
+ DB 102,15,105,204 ; punpckhwd %xmm4,%xmm1
+ DB 102,15,97,196 ; punpcklwd %xmm4,%xmm0
DB 102,15,114,240,24 ; pslld $0x18,%xmm0
+ DB 102,15,114,241,24 ; pslld $0x18,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,54 ; je 591 <_sk_load_a8_sse2_8bit+0x74>
+ DB 65,128,225,7 ; and $0x7,%r9b
DB 102,15,239,192 ; pxor %xmm0,%xmm0
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,21 ; je 57a <_sk_load_a8_sse2_8bit+0x5d>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,221 ; jne 548 <_sk_load_a8_sse2_8bit+0x2b>
- DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
- DB 102,15,110,192 ; movd %eax,%xmm0
- DB 102,15,112,192,69 ; pshufd $0x45,%xmm0,%xmm0
- DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
- DB 102,15,110,208 ; movd %eax,%xmm2
- DB 102,15,96,208 ; punpcklbw %xmm0,%xmm2
- DB 102,15,97,208 ; punpcklwd %xmm0,%xmm2
- DB 242,15,16,194 ; movsd %xmm2,%xmm0
- DB 235,183 ; jmp 548 <_sk_load_a8_sse2_8bit+0x2b>
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,201 ; ja 893 <_sk_load_a8_sse2_8bit+0x27>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,111,0,0,0 ; lea 0x6f(%rip),%rcx # 944 <_sk_load_a8_sse2_8bit+0xd8>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
DB 102,15,110,192 ; movd %eax,%xmm0
- DB 235,172 ; jmp 548 <_sk_load_a8_sse2_8bit+0x2b>
+ DB 235,170 ; jmp 893 <_sk_load_a8_sse2_8bit+0x27>
+ DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
+ DB 102,15,239,192 ; pxor %xmm0,%xmm0
+ DB 102,15,196,192,2 ; pinsrw $0x2,%eax,%xmm0
+ DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
+ DB 102,15,110,200 ; movd %eax,%xmm1
+ DB 102,15,96,200 ; punpcklbw %xmm0,%xmm1
+ DB 243,15,16,193 ; movss %xmm1,%xmm0
+ DB 235,136 ; jmp 893 <_sk_load_a8_sse2_8bit+0x27>
+ DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax
+ DB 102,15,239,192 ; pxor %xmm0,%xmm0
+ DB 102,15,196,192,6 ; pinsrw $0x6,%eax,%xmm0
+ DB 66,15,182,68,2,5 ; movzbl 0x5(%rdx,%r8,1),%eax
+ DB 102,15,196,192,5 ; pinsrw $0x5,%eax,%xmm0
+ DB 66,15,182,68,2,4 ; movzbl 0x4(%rdx,%r8,1),%eax
+ DB 102,15,196,192,4 ; pinsrw $0x4,%eax,%xmm0
+ DB 102,66,15,110,12,2 ; movd (%rdx,%r8,1),%xmm1
+ DB 102,15,96,200 ; punpcklbw %xmm0,%xmm1
+ DB 242,15,16,193 ; movsd %xmm1,%xmm0
+ DB 233,80,255,255,255 ; jmpq 893 <_sk_load_a8_sse2_8bit+0x27>
+ DB 144 ; nop
+ DB 154 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,180,255,255,255,165,255 ; pushq -0x5a0001(%rdi,%rdi,8)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 236 ; in (%dx),%al
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,225 ; jmpq *%rcx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,214 ; callq *%rsi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,199 ; inc %edi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_load_a8_dst_sse2_8bit
_sk_load_a8_dst_sse2_8bit LABEL PROC
@@ -42127,37 +44508,74 @@ _sk_load_a8_dst_sse2_8bit LABEL PROC
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,23 ; jne 5d0 <_sk_load_a8_dst_sse2_8bit+0x34>
- DB 102,66,15,110,12,2 ; movd (%rdx,%r8,1),%xmm1
- DB 102,15,96,200 ; punpcklbw %xmm0,%xmm1
- DB 102,15,97,200 ; punpcklwd %xmm0,%xmm1
- DB 102,15,114,241,24 ; pslld $0x18,%xmm1
+ DB 117,48 ; jne 9ad <_sk_load_a8_dst_sse2_8bit+0x4d>
+ DB 243,66,15,126,20,2 ; movq (%rdx,%r8,1),%xmm2
+ DB 102,15,96,208 ; punpcklbw %xmm0,%xmm2
+ DB 102,15,84,21,65,26,0,0 ; andpd 0x1a41(%rip),%xmm2 # 23d0 <_sk_xor__sse2_8bit+0x1ed>
+ DB 102,15,239,228 ; pxor %xmm4,%xmm4
+ DB 102,15,40,218 ; movapd %xmm2,%xmm3
+ DB 102,15,105,220 ; punpckhwd %xmm4,%xmm3
+ DB 102,15,97,212 ; punpcklwd %xmm4,%xmm2
+ DB 102,15,114,242,24 ; pslld $0x18,%xmm2
+ DB 102,15,114,243,24 ; pslld $0x18,%xmm3
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,54 ; je 610 <_sk_load_a8_dst_sse2_8bit+0x74>
- DB 102,15,239,201 ; pxor %xmm1,%xmm1
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,21 ; je 5f9 <_sk_load_a8_dst_sse2_8bit+0x5d>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,221 ; jne 5c7 <_sk_load_a8_dst_sse2_8bit+0x2b>
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,210 ; pxor %xmm2,%xmm2
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,201 ; ja 987 <_sk_load_a8_dst_sse2_8bit+0x27>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,111,0,0,0 ; lea 0x6f(%rip),%rcx # a38 <_sk_load_a8_dst_sse2_8bit+0xd8>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
+ DB 102,15,110,208 ; movd %eax,%xmm2
+ DB 235,170 ; jmp 987 <_sk_load_a8_dst_sse2_8bit+0x27>
DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
- DB 102,15,110,200 ; movd %eax,%xmm1
- DB 102,15,112,201,69 ; pshufd $0x45,%xmm1,%xmm1
+ DB 102,15,239,210 ; pxor %xmm2,%xmm2
+ DB 102,15,196,208,2 ; pinsrw $0x2,%eax,%xmm2
DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
- DB 102,15,110,208 ; movd %eax,%xmm2
- DB 102,15,96,208 ; punpcklbw %xmm0,%xmm2
- DB 102,15,97,208 ; punpcklwd %xmm0,%xmm2
- DB 242,15,16,202 ; movsd %xmm2,%xmm1
- DB 235,183 ; jmp 5c7 <_sk_load_a8_dst_sse2_8bit+0x2b>
- DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
- DB 102,15,110,200 ; movd %eax,%xmm1
- DB 235,172 ; jmp 5c7 <_sk_load_a8_dst_sse2_8bit+0x2b>
+ DB 102,15,110,216 ; movd %eax,%xmm3
+ DB 102,15,96,216 ; punpcklbw %xmm0,%xmm3
+ DB 243,15,16,211 ; movss %xmm3,%xmm2
+ DB 235,136 ; jmp 987 <_sk_load_a8_dst_sse2_8bit+0x27>
+ DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax
+ DB 102,15,239,210 ; pxor %xmm2,%xmm2
+ DB 102,15,196,208,6 ; pinsrw $0x6,%eax,%xmm2
+ DB 66,15,182,68,2,5 ; movzbl 0x5(%rdx,%r8,1),%eax
+ DB 102,15,196,208,5 ; pinsrw $0x5,%eax,%xmm2
+ DB 66,15,182,68,2,4 ; movzbl 0x4(%rdx,%r8,1),%eax
+ DB 102,15,196,208,4 ; pinsrw $0x4,%eax,%xmm2
+ DB 102,66,15,110,28,2 ; movd (%rdx,%r8,1),%xmm3
+ DB 102,15,96,216 ; punpcklbw %xmm0,%xmm3
+ DB 242,15,16,211 ; movsd %xmm3,%xmm2
+ DB 233,80,255,255,255 ; jmpq 987 <_sk_load_a8_dst_sse2_8bit+0x27>
+ DB 144 ; nop
+ DB 154 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,180,255,255,255,165,255 ; pushq -0x5a0001(%rdi,%rdi,8)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 236 ; in (%dx),%al
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,225 ; jmpq *%rcx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,214 ; callq *%rsi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,199 ; inc %edi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_store_a8_sse2_8bit
_sk_store_a8_sse2_8bit LABEL PROC
- DB 72,131,236,40 ; sub $0x28,%rsp
+ DB 72,131,236,88 ; sub $0x58,%rsp
DB 76,99,7 ; movslq (%rdi),%r8
DB 76,139,79,16 ; mov 0x10(%rdi),%r9
DB 72,173 ; lods %ds:(%rsi),%rax
@@ -42165,37 +44583,77 @@ _sk_store_a8_sse2_8bit LABEL PROC
DB 72,99,87,8 ; movslq 0x8(%rdi),%rdx
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
- DB 102,15,111,208 ; movdqa %xmm0,%xmm2
- DB 102,15,114,210,24 ; psrld $0x18,%xmm2
+ DB 102,15,111,224 ; movdqa %xmm0,%xmm4
+ DB 102,15,114,212,24 ; psrld $0x18,%xmm4
+ DB 102,15,111,233 ; movdqa %xmm1,%xmm5
+ DB 102,15,114,213,24 ; psrld $0x18,%xmm5
+ DB 102,15,114,245,16 ; pslld $0x10,%xmm5
+ DB 102,15,114,229,16 ; psrad $0x10,%xmm5
+ DB 102,15,114,244,16 ; pslld $0x10,%xmm4
+ DB 102,15,114,228,16 ; psrad $0x10,%xmm4
+ DB 102,15,107,229 ; packssdw %xmm5,%xmm4
DB 77,133,201 ; test %r9,%r9
- DB 117,30 ; jne 663 <_sk_store_a8_sse2_8bit+0x48>
- DB 102,15,219,21,99,12,0,0 ; pand 0xc63(%rip),%xmm2 # 12b0 <_sk_xor__sse2_8bit+0xf2>
- DB 102,15,103,210 ; packuswb %xmm2,%xmm2
- DB 102,15,103,210 ; packuswb %xmm2,%xmm2
- DB 102,66,15,126,20,2 ; movd %xmm2,(%rdx,%r8,1)
+ DB 117,26 ; jne ab9 <_sk_store_a8_sse2_8bit+0x65>
+ DB 102,15,219,37,57,25,0,0 ; pand 0x1939(%rip),%xmm4 # 23e0 <_sk_xor__sse2_8bit+0x1fd>
+ DB 102,15,103,228 ; packuswb %xmm4,%xmm4
+ DB 102,66,15,214,36,2 ; movq %xmm4,(%rdx,%r8,1)
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 72,131,196,40 ; add $0x28,%rsp
+ DB 72,131,196,88 ; add $0x58,%rsp
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,54 ; je 6a3 <_sk_store_a8_sse2_8bit+0x88>
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,21 ; je 688 <_sk_store_a8_sse2_8bit+0x6d>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,226 ; jne 65b <_sk_store_a8_sse2_8bit+0x40>
- DB 102,15,127,84,36,16 ; movdqa %xmm2,0x10(%rsp)
- DB 138,68,36,24 ; mov 0x18(%rsp),%al
- DB 66,136,68,2,2 ; mov %al,0x2(%rdx,%r8,1)
- DB 102,15,219,21,32,12,0,0 ; pand 0xc20(%rip),%xmm2 # 12b0 <_sk_xor__sse2_8bit+0xf2>
- DB 102,15,103,210 ; packuswb %xmm2,%xmm2
- DB 102,15,103,210 ; packuswb %xmm2,%xmm2
- DB 102,15,126,208 ; movd %xmm2,%eax
- DB 102,66,137,4,2 ; mov %ax,(%rdx,%r8,1)
- DB 235,184 ; jmp 65b <_sk_store_a8_sse2_8bit+0x40>
- DB 102,15,127,20,36 ; movdqa %xmm2,(%rsp)
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,235 ; ja ab1 <_sk_store_a8_sse2_8bit+0x5d>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,131,0,0,0 ; lea 0x83(%rip),%rcx # b54 <_sk_store_a8_sse2_8bit+0x100>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 102,15,127,36,36 ; movdqa %xmm4,(%rsp)
DB 138,4,36 ; mov (%rsp),%al
DB 66,136,4,2 ; mov %al,(%rdx,%r8,1)
- DB 235,170 ; jmp 65b <_sk_store_a8_sse2_8bit+0x40>
+ DB 235,201 ; jmp ab1 <_sk_store_a8_sse2_8bit+0x5d>
+ DB 102,15,127,100,36,16 ; movdqa %xmm4,0x10(%rsp)
+ DB 138,68,36,20 ; mov 0x14(%rsp),%al
+ DB 66,136,68,2,2 ; mov %al,0x2(%rdx,%r8,1)
+ DB 102,15,219,37,225,24,0,0 ; pand 0x18e1(%rip),%xmm4 # 23e0 <_sk_xor__sse2_8bit+0x1fd>
+ DB 102,15,103,228 ; packuswb %xmm4,%xmm4
+ DB 102,15,126,224 ; movd %xmm4,%eax
+ DB 102,66,137,4,2 ; mov %ax,(%rdx,%r8,1)
+ DB 235,163 ; jmp ab1 <_sk_store_a8_sse2_8bit+0x5d>
+ DB 102,15,127,100,36,64 ; movdqa %xmm4,0x40(%rsp)
+ DB 138,68,36,76 ; mov 0x4c(%rsp),%al
+ DB 66,136,68,2,6 ; mov %al,0x6(%rdx,%r8,1)
+ DB 102,15,127,100,36,48 ; movdqa %xmm4,0x30(%rsp)
+ DB 138,68,36,58 ; mov 0x3a(%rsp),%al
+ DB 66,136,68,2,5 ; mov %al,0x5(%rdx,%r8,1)
+ DB 102,15,127,100,36,32 ; movdqa %xmm4,0x20(%rsp)
+ DB 138,68,36,40 ; mov 0x28(%rsp),%al
+ DB 66,136,68,2,4 ; mov %al,0x4(%rdx,%r8,1)
+ DB 102,15,219,37,157,24,0,0 ; pand 0x189d(%rip),%xmm4 # 23e0 <_sk_xor__sse2_8bit+0x1fd>
+ DB 102,15,103,228 ; packuswb %xmm4,%xmm4
+ DB 102,66,15,126,36,2 ; movd %xmm4,(%rdx,%r8,1)
+ DB 233,95,255,255,255 ; jmpq ab1 <_sk_store_a8_sse2_8bit+0x5d>
+ DB 102,144 ; xchg %ax,%ax
+ DB 134,255 ; xchg %bh,%bh
+ DB 255 ; (bad)
+ DB 255,163,255,255,255,148 ; jmpq *-0x6b000001(%rbx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,231 ; jmpq *%rdi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 216,255 ; fdivr %st(7),%st
+ DB 255 ; (bad)
+ DB 255,201 ; dec %ecx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 186 ; .byte 0xba
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_load_g8_sse2_8bit
_sk_load_g8_sse2_8bit LABEL PROC
@@ -42207,41 +44665,85 @@ _sk_load_g8_sse2_8bit LABEL PROC
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,69 ; jne 713 <_sk_load_g8_sse2_8bit+0x62>
- DB 102,66,15,110,4,2 ; movd (%rdx,%r8,1),%xmm0
+ DB 117,116 ; jne c01 <_sk_load_g8_sse2_8bit+0x91>
+ DB 243,66,15,126,4,2 ; movq (%rdx,%r8,1),%xmm0
DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
- DB 102,15,97,192 ; punpcklwd %xmm0,%xmm0
- DB 102,15,219,5,220,11,0,0 ; pand 0xbdc(%rip),%xmm0 # 12c0 <_sk_xor__sse2_8bit+0x102>
- DB 102,15,111,21,228,11,0,0 ; movdqa 0xbe4(%rip),%xmm2 # 12d0 <_sk_xor__sse2_8bit+0x112>
- DB 102,15,112,216,245 ; pshufd $0xf5,%xmm0,%xmm3
- DB 102,15,244,194 ; pmuludq %xmm2,%xmm0
- DB 102,15,112,192,232 ; pshufd $0xe8,%xmm0,%xmm0
- DB 102,15,244,218 ; pmuludq %xmm2,%xmm3
- DB 102,15,112,211,232 ; pshufd $0xe8,%xmm3,%xmm2
- DB 102,15,98,194 ; punpckldq %xmm2,%xmm0
- DB 102,15,235,5,209,11,0,0 ; por 0xbd1(%rip),%xmm0 # 12e0 <_sk_xor__sse2_8bit+0x122>
+ DB 102,15,84,5,81,24,0,0 ; andpd 0x1851(%rip),%xmm0 # 23f0 <_sk_xor__sse2_8bit+0x20d>
+ DB 102,15,239,201 ; pxor %xmm1,%xmm1
+ DB 102,15,40,224 ; movapd %xmm0,%xmm4
+ DB 102,15,97,225 ; punpcklwd %xmm1,%xmm4
+ DB 102,15,105,193 ; punpckhwd %xmm1,%xmm0
+ DB 102,15,111,45,73,24,0,0 ; movdqa 0x1849(%rip),%xmm5 # 2400 <_sk_xor__sse2_8bit+0x21d>
+ DB 102,15,112,240,245 ; pshufd $0xf5,%xmm0,%xmm6
+ DB 102,15,244,197 ; pmuludq %xmm5,%xmm0
+ DB 102,15,112,200,232 ; pshufd $0xe8,%xmm0,%xmm1
+ DB 102,15,244,245 ; pmuludq %xmm5,%xmm6
+ DB 102,15,112,198,232 ; pshufd $0xe8,%xmm6,%xmm0
+ DB 102,15,98,200 ; punpckldq %xmm0,%xmm1
+ DB 102,15,112,244,245 ; pshufd $0xf5,%xmm4,%xmm6
+ DB 102,15,244,229 ; pmuludq %xmm5,%xmm4
+ DB 102,15,112,196,232 ; pshufd $0xe8,%xmm4,%xmm0
+ DB 102,15,244,245 ; pmuludq %xmm5,%xmm6
+ DB 102,15,112,230,232 ; pshufd $0xe8,%xmm6,%xmm4
+ DB 102,15,98,196 ; punpckldq %xmm4,%xmm0
+ DB 102,15,111,37,27,24,0,0 ; movdqa 0x181b(%rip),%xmm4 # 2410 <_sk_xor__sse2_8bit+0x22d>
+ DB 102,15,235,196 ; por %xmm4,%xmm0
+ DB 102,15,235,204 ; por %xmm4,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,54 ; je 753 <_sk_load_g8_sse2_8bit+0xa2>
+ DB 65,128,225,7 ; and $0x7,%r9b
DB 102,15,239,192 ; pxor %xmm0,%xmm0
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,21 ; je 73c <_sk_load_g8_sse2_8bit+0x8b>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,175 ; jne 6dc <_sk_load_g8_sse2_8bit+0x2b>
- DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
- DB 102,15,110,192 ; movd %eax,%xmm0
- DB 102,15,112,192,69 ; pshufd $0x45,%xmm0,%xmm0
- DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
- DB 102,15,110,208 ; movd %eax,%xmm2
- DB 102,15,96,208 ; punpcklbw %xmm0,%xmm2
- DB 102,15,97,208 ; punpcklwd %xmm0,%xmm2
- DB 242,15,16,194 ; movsd %xmm2,%xmm0
- DB 235,137 ; jmp 6dc <_sk_load_g8_sse2_8bit+0x2b>
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,133 ; ja b97 <_sk_load_g8_sse2_8bit+0x27>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,119,0,0,0 ; lea 0x77(%rip),%rcx # c94 <_sk_load_g8_sse2_8bit+0x124>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
DB 102,15,110,192 ; movd %eax,%xmm0
- DB 233,123,255,255,255 ; jmpq 6dc <_sk_load_g8_sse2_8bit+0x2b>
+ DB 233,99,255,255,255 ; jmpq b97 <_sk_load_g8_sse2_8bit+0x27>
+ DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
+ DB 102,15,239,192 ; pxor %xmm0,%xmm0
+ DB 102,15,196,192,2 ; pinsrw $0x2,%eax,%xmm0
+ DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
+ DB 102,15,110,200 ; movd %eax,%xmm1
+ DB 102,15,96,200 ; punpcklbw %xmm0,%xmm1
+ DB 243,15,16,193 ; movss %xmm1,%xmm0
+ DB 233,62,255,255,255 ; jmpq b97 <_sk_load_g8_sse2_8bit+0x27>
+ DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax
+ DB 102,15,239,192 ; pxor %xmm0,%xmm0
+ DB 102,15,196,192,6 ; pinsrw $0x6,%eax,%xmm0
+ DB 66,15,182,68,2,5 ; movzbl 0x5(%rdx,%r8,1),%eax
+ DB 102,15,196,192,5 ; pinsrw $0x5,%eax,%xmm0
+ DB 66,15,182,68,2,4 ; movzbl 0x4(%rdx,%r8,1),%eax
+ DB 102,15,196,192,4 ; pinsrw $0x4,%eax,%xmm0
+ DB 102,66,15,110,12,2 ; movd (%rdx,%r8,1),%xmm1
+ DB 102,15,96,200 ; punpcklbw %xmm0,%xmm1
+ DB 242,15,16,193 ; movsd %xmm1,%xmm0
+ DB 233,6,255,255,255 ; jmpq b97 <_sk_load_g8_sse2_8bit+0x27>
+ DB 15,31,0 ; nopl (%rax)
+ DB 146 ; xchg %eax,%edx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,175,255,255,255,160 ; ljmp *-0x5f000001(%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 234 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 223,255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,212 ; callq *%rsp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,197 ; inc %ebp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_load_g8_dst_sse2_8bit
_sk_load_g8_dst_sse2_8bit LABEL PROC
@@ -42253,137 +44755,271 @@ _sk_load_g8_dst_sse2_8bit LABEL PROC
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,69 ; jne 7c3 <_sk_load_g8_dst_sse2_8bit+0x62>
- DB 102,66,15,110,12,2 ; movd (%rdx,%r8,1),%xmm1
- DB 102,15,96,200 ; punpcklbw %xmm0,%xmm1
- DB 102,15,97,200 ; punpcklwd %xmm0,%xmm1
- DB 102,15,219,13,92,11,0,0 ; pand 0xb5c(%rip),%xmm1 # 12f0 <_sk_xor__sse2_8bit+0x132>
- DB 102,15,111,21,100,11,0,0 ; movdqa 0xb64(%rip),%xmm2 # 1300 <_sk_xor__sse2_8bit+0x142>
- DB 102,15,112,217,245 ; pshufd $0xf5,%xmm1,%xmm3
- DB 102,15,244,202 ; pmuludq %xmm2,%xmm1
- DB 102,15,112,201,232 ; pshufd $0xe8,%xmm1,%xmm1
- DB 102,15,244,218 ; pmuludq %xmm2,%xmm3
- DB 102,15,112,211,232 ; pshufd $0xe8,%xmm3,%xmm2
- DB 102,15,98,202 ; punpckldq %xmm2,%xmm1
- DB 102,15,235,13,81,11,0,0 ; por 0xb51(%rip),%xmm1 # 1310 <_sk_xor__sse2_8bit+0x152>
+ DB 117,116 ; jne d41 <_sk_load_g8_dst_sse2_8bit+0x91>
+ DB 243,66,15,126,20,2 ; movq (%rdx,%r8,1),%xmm2
+ DB 102,15,96,208 ; punpcklbw %xmm0,%xmm2
+ DB 102,15,84,21,65,23,0,0 ; andpd 0x1741(%rip),%xmm2 # 2420 <_sk_xor__sse2_8bit+0x23d>
+ DB 102,15,239,219 ; pxor %xmm3,%xmm3
+ DB 102,15,40,226 ; movapd %xmm2,%xmm4
+ DB 102,15,97,227 ; punpcklwd %xmm3,%xmm4
+ DB 102,15,105,211 ; punpckhwd %xmm3,%xmm2
+ DB 102,15,111,45,57,23,0,0 ; movdqa 0x1739(%rip),%xmm5 # 2430 <_sk_xor__sse2_8bit+0x24d>
+ DB 102,15,112,242,245 ; pshufd $0xf5,%xmm2,%xmm6
+ DB 102,15,244,213 ; pmuludq %xmm5,%xmm2
+ DB 102,15,112,218,232 ; pshufd $0xe8,%xmm2,%xmm3
+ DB 102,15,244,245 ; pmuludq %xmm5,%xmm6
+ DB 102,15,112,214,232 ; pshufd $0xe8,%xmm6,%xmm2
+ DB 102,15,98,218 ; punpckldq %xmm2,%xmm3
+ DB 102,15,112,244,245 ; pshufd $0xf5,%xmm4,%xmm6
+ DB 102,15,244,229 ; pmuludq %xmm5,%xmm4
+ DB 102,15,112,212,232 ; pshufd $0xe8,%xmm4,%xmm2
+ DB 102,15,244,245 ; pmuludq %xmm5,%xmm6
+ DB 102,15,112,230,232 ; pshufd $0xe8,%xmm6,%xmm4
+ DB 102,15,98,212 ; punpckldq %xmm4,%xmm2
+ DB 102,15,111,37,11,23,0,0 ; movdqa 0x170b(%rip),%xmm4 # 2440 <_sk_xor__sse2_8bit+0x25d>
+ DB 102,15,235,212 ; por %xmm4,%xmm2
+ DB 102,15,235,220 ; por %xmm4,%xmm3
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,54 ; je 803 <_sk_load_g8_dst_sse2_8bit+0xa2>
- DB 102,15,239,201 ; pxor %xmm1,%xmm1
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,21 ; je 7ec <_sk_load_g8_dst_sse2_8bit+0x8b>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,175 ; jne 78c <_sk_load_g8_dst_sse2_8bit+0x2b>
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,210 ; pxor %xmm2,%xmm2
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,133 ; ja cd7 <_sk_load_g8_dst_sse2_8bit+0x27>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,119,0,0,0 ; lea 0x77(%rip),%rcx # dd4 <_sk_load_g8_dst_sse2_8bit+0x124>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
+ DB 102,15,110,208 ; movd %eax,%xmm2
+ DB 233,99,255,255,255 ; jmpq cd7 <_sk_load_g8_dst_sse2_8bit+0x27>
DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
- DB 102,15,110,200 ; movd %eax,%xmm1
- DB 102,15,112,201,69 ; pshufd $0x45,%xmm1,%xmm1
+ DB 102,15,239,210 ; pxor %xmm2,%xmm2
+ DB 102,15,196,208,2 ; pinsrw $0x2,%eax,%xmm2
DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
- DB 102,15,110,208 ; movd %eax,%xmm2
- DB 102,15,96,208 ; punpcklbw %xmm0,%xmm2
- DB 102,15,97,208 ; punpcklwd %xmm0,%xmm2
- DB 242,15,16,202 ; movsd %xmm2,%xmm1
- DB 235,137 ; jmp 78c <_sk_load_g8_dst_sse2_8bit+0x2b>
- DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
- DB 102,15,110,200 ; movd %eax,%xmm1
- DB 233,123,255,255,255 ; jmpq 78c <_sk_load_g8_dst_sse2_8bit+0x2b>
+ DB 102,15,110,216 ; movd %eax,%xmm3
+ DB 102,15,96,216 ; punpcklbw %xmm0,%xmm3
+ DB 243,15,16,211 ; movss %xmm3,%xmm2
+ DB 233,62,255,255,255 ; jmpq cd7 <_sk_load_g8_dst_sse2_8bit+0x27>
+ DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax
+ DB 102,15,239,210 ; pxor %xmm2,%xmm2
+ DB 102,15,196,208,6 ; pinsrw $0x6,%eax,%xmm2
+ DB 66,15,182,68,2,5 ; movzbl 0x5(%rdx,%r8,1),%eax
+ DB 102,15,196,208,5 ; pinsrw $0x5,%eax,%xmm2
+ DB 66,15,182,68,2,4 ; movzbl 0x4(%rdx,%r8,1),%eax
+ DB 102,15,196,208,4 ; pinsrw $0x4,%eax,%xmm2
+ DB 102,66,15,110,28,2 ; movd (%rdx,%r8,1),%xmm3
+ DB 102,15,96,216 ; punpcklbw %xmm0,%xmm3
+ DB 242,15,16,211 ; movsd %xmm3,%xmm2
+ DB 233,6,255,255,255 ; jmpq cd7 <_sk_load_g8_dst_sse2_8bit+0x27>
+ DB 15,31,0 ; nopl (%rax)
+ DB 146 ; xchg %eax,%edx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,175,255,255,255,160 ; ljmp *-0x5f000001(%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 234 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 223,255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,212 ; callq *%rsp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,197 ; inc %ebp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_srcover_rgba_8888_sse2_8bit
_sk_srcover_rgba_8888_sse2_8bit LABEL PROC
- DB 76,99,7 ; movslq (%rdi),%r8
- DB 76,139,79,16 ; mov 0x10(%rdi),%r9
+ DB 76,99,15 ; movslq (%rdi),%r9
+ DB 76,139,71,16 ; mov 0x10(%rdi),%r8
DB 72,173 ; lods %ds:(%rsi),%rax
DB 72,99,72,8 ; movslq 0x8(%rax),%rcx
DB 72,99,87,8 ; movslq 0x8(%rdi),%rdx
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,193,226,2 ; shl $0x2,%rdx
DB 72,3,16 ; add (%rax),%rdx
- DB 77,133,201 ; test %r9,%r9
- DB 117,120 ; jne 8aa <_sk_srcover_rgba_8888_sse2_8bit+0x99>
- DB 243,66,15,111,20,130 ; movdqu (%rdx,%r8,4),%xmm2
- DB 77,133,201 ; test %r9,%r9
- DB 242,15,112,216,231 ; pshuflw $0xe7,%xmm0,%xmm3
- DB 243,15,112,219,231 ; pshufhw $0xe7,%xmm3,%xmm3
- DB 102,15,112,219,232 ; pshufd $0xe8,%xmm3,%xmm3
- DB 102,15,96,219 ; punpcklbw %xmm3,%xmm3
- DB 242,15,112,219,95 ; pshuflw $0x5f,%xmm3,%xmm3
- DB 243,15,112,219,95 ; pshufhw $0x5f,%xmm3,%xmm3
- DB 102,15,239,228 ; pxor %xmm4,%xmm4
- DB 102,15,111,234 ; movdqa %xmm2,%xmm5
- DB 102,15,96,236 ; punpcklbw %xmm4,%xmm5
- DB 102,15,111,242 ; movdqa %xmm2,%xmm6
- DB 102,15,104,244 ; punpckhbw %xmm4,%xmm6
- DB 102,15,111,251 ; movdqa %xmm3,%xmm7
- DB 102,15,96,252 ; punpcklbw %xmm4,%xmm7
- DB 102,15,104,220 ; punpckhbw %xmm4,%xmm3
- DB 102,15,213,222 ; pmullw %xmm6,%xmm3
- DB 102,15,213,253 ; pmullw %xmm5,%xmm7
- DB 102,15,253,253 ; paddw %xmm5,%xmm7
- DB 102,15,253,222 ; paddw %xmm6,%xmm3
- DB 102,15,113,211,8 ; psrlw $0x8,%xmm3
+ DB 77,133,192 ; test %r8,%r8
+ DB 15,133,253,0,0,0 ; jne f12 <_sk_srcover_rgba_8888_sse2_8bit+0x122>
+ DB 70,15,16,68,138,16 ; movups 0x10(%rdx,%r9,4),%xmm8
+ DB 102,70,15,16,12,138 ; movupd (%rdx,%r9,4),%xmm9
+ DB 77,133,192 ; test %r8,%r8
+ DB 242,15,112,225,231 ; pshuflw $0xe7,%xmm1,%xmm4
+ DB 243,15,112,228,231 ; pshufhw $0xe7,%xmm4,%xmm4
+ DB 102,15,112,228,232 ; pshufd $0xe8,%xmm4,%xmm4
+ DB 102,15,96,228 ; punpcklbw %xmm4,%xmm4
+ DB 242,15,112,228,95 ; pshuflw $0x5f,%xmm4,%xmm4
+ DB 243,15,112,236,95 ; pshufhw $0x5f,%xmm4,%xmm5
+ DB 242,15,112,224,231 ; pshuflw $0xe7,%xmm0,%xmm4
+ DB 243,15,112,228,231 ; pshufhw $0xe7,%xmm4,%xmm4
+ DB 102,15,112,228,232 ; pshufd $0xe8,%xmm4,%xmm4
+ DB 102,15,96,228 ; punpcklbw %xmm4,%xmm4
+ DB 242,15,112,228,95 ; pshuflw $0x5f,%xmm4,%xmm4
+ DB 243,15,112,228,95 ; pshufhw $0x5f,%xmm4,%xmm4
+ DB 102,69,15,239,210 ; pxor %xmm10,%xmm10
+ DB 102,69,15,40,217 ; movapd %xmm9,%xmm11
+ DB 102,69,15,96,218 ; punpcklbw %xmm10,%xmm11
+ DB 102,69,15,40,225 ; movapd %xmm9,%xmm12
+ DB 102,69,15,104,226 ; punpckhbw %xmm10,%xmm12
+ DB 102,69,15,111,232 ; movdqa %xmm8,%xmm13
+ DB 102,69,15,96,234 ; punpcklbw %xmm10,%xmm13
+ DB 102,69,15,111,240 ; movdqa %xmm8,%xmm14
+ DB 102,69,15,104,242 ; punpckhbw %xmm10,%xmm14
+ DB 102,15,111,252 ; movdqa %xmm4,%xmm7
+ DB 102,65,15,96,250 ; punpcklbw %xmm10,%xmm7
+ DB 102,65,15,104,226 ; punpckhbw %xmm10,%xmm4
+ DB 102,15,111,245 ; movdqa %xmm5,%xmm6
+ DB 102,65,15,96,242 ; punpcklbw %xmm10,%xmm6
+ DB 102,65,15,104,234 ; punpckhbw %xmm10,%xmm5
+ DB 102,65,15,213,238 ; pmullw %xmm14,%xmm5
+ DB 102,65,15,213,245 ; pmullw %xmm13,%xmm6
+ DB 102,65,15,213,228 ; pmullw %xmm12,%xmm4
+ DB 102,65,15,213,251 ; pmullw %xmm11,%xmm7
+ DB 102,65,15,253,251 ; paddw %xmm11,%xmm7
+ DB 102,65,15,253,228 ; paddw %xmm12,%xmm4
+ DB 102,65,15,253,245 ; paddw %xmm13,%xmm6
+ DB 102,65,15,253,238 ; paddw %xmm14,%xmm5
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
- DB 102,15,103,251 ; packuswb %xmm3,%xmm7
- DB 102,15,248,215 ; psubb %xmm7,%xmm2
- DB 102,15,252,208 ; paddb %xmm0,%xmm2
- DB 117,58 ; jne 8da <_sk_srcover_rgba_8888_sse2_8bit+0xc9>
- DB 243,66,15,127,20,130 ; movdqu %xmm2,(%rdx,%r8,4)
+ DB 102,15,103,252 ; packuswb %xmm4,%xmm7
+ DB 102,15,103,245 ; packuswb %xmm5,%xmm6
+ DB 102,68,15,248,198 ; psubb %xmm6,%xmm8
+ DB 102,68,15,248,207 ; psubb %xmm7,%xmm9
+ DB 102,68,15,252,200 ; paddb %xmm0,%xmm9
+ DB 102,68,15,252,193 ; paddb %xmm1,%xmm8
+ DB 117,72 ; jne f49 <_sk_srcover_rgba_8888_sse2_8bit+0x159>
+ DB 243,70,15,127,12,138 ; movdqu %xmm9,(%rdx,%r9,4)
+ DB 243,70,15,127,68,138,16 ; movdqu %xmm8,0x10(%rdx,%r9,4)
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 68,137,200 ; mov %r9d,%eax
- DB 36,3 ; and $0x3,%al
- DB 60,1 ; cmp $0x1,%al
- DB 116,81 ; je 904 <_sk_srcover_rgba_8888_sse2_8bit+0xf3>
- DB 102,15,239,210 ; pxor %xmm2,%xmm2
- DB 60,2 ; cmp $0x2,%al
- DB 116,20 ; je 8cf <_sk_srcover_rgba_8888_sse2_8bit+0xbe>
- DB 60,3 ; cmp $0x3,%al
- DB 15,133,117,255,255,255 ; jne 838 <_sk_srcover_rgba_8888_sse2_8bit+0x27>
- DB 102,66,15,110,84,130,8 ; movd 0x8(%rdx,%r8,4),%xmm2
- DB 102,15,112,210,69 ; pshufd $0x45,%xmm2,%xmm2
- DB 102,66,15,18,20,130 ; movlpd (%rdx,%r8,4),%xmm2
- DB 233,94,255,255,255 ; jmpq 838 <_sk_srcover_rgba_8888_sse2_8bit+0x27>
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,43 ; je 90f <_sk_srcover_rgba_8888_sse2_8bit+0xfe>
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,18 ; je 8fc <_sk_srcover_rgba_8888_sse2_8bit+0xeb>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,182 ; jne 8a6 <_sk_srcover_rgba_8888_sse2_8bit+0x95>
- DB 102,15,112,218,78 ; pshufd $0x4e,%xmm2,%xmm3
- DB 102,66,15,126,92,130,8 ; movd %xmm3,0x8(%rdx,%r8,4)
- DB 102,66,15,214,20,130 ; movq %xmm2,(%rdx,%r8,4)
- DB 235,162 ; jmp 8a6 <_sk_srcover_rgba_8888_sse2_8bit+0x95>
- DB 102,66,15,110,20,130 ; movd (%rdx,%r8,4),%xmm2
- DB 233,41,255,255,255 ; jmpq 838 <_sk_srcover_rgba_8888_sse2_8bit+0x27>
- DB 102,66,15,126,20,130 ; movd %xmm2,(%rdx,%r8,4)
- DB 235,143 ; jmp 8a6 <_sk_srcover_rgba_8888_sse2_8bit+0x95>
+ DB 68,137,192 ; mov %r8d,%eax
+ DB 36,7 ; and $0x7,%al
+ DB 102,69,15,239,192 ; pxor %xmm8,%xmm8
+ DB 102,69,15,239,201 ; pxor %xmm9,%xmm9
+ DB 254,200 ; dec %al
+ DB 60,6 ; cmp $0x6,%al
+ DB 15,135,246,254,255,255 ; ja e21 <_sk_srcover_rgba_8888_sse2_8bit+0x31>
+ DB 15,182,192 ; movzbl %al,%eax
+ DB 72,141,13,207,0,0,0 ; lea 0xcf(%rip),%rcx # 1004 <_sk_srcover_rgba_8888_sse2_8bit+0x214>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 243,70,15,16,12,138 ; movss (%rdx,%r9,4),%xmm9
+ DB 233,216,254,255,255 ; jmpq e21 <_sk_srcover_rgba_8888_sse2_8bit+0x31>
+ DB 65,128,224,7 ; and $0x7,%r8b
+ DB 65,254,200 ; dec %r8b
+ DB 65,128,248,6 ; cmp $0x6,%r8b
+ DB 119,184 ; ja f0e <_sk_srcover_rgba_8888_sse2_8bit+0x11e>
+ DB 65,15,182,192 ; movzbl %r8b,%eax
+ DB 72,141,13,191,0,0,0 ; lea 0xbf(%rip),%rcx # 1020 <_sk_srcover_rgba_8888_sse2_8bit+0x230>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 102,70,15,126,12,138 ; movd %xmm9,(%rdx,%r9,4)
+ DB 235,156 ; jmp f0e <_sk_srcover_rgba_8888_sse2_8bit+0x11e>
+ DB 102,66,15,110,100,138,8 ; movd 0x8(%rdx,%r9,4),%xmm4
+ DB 102,68,15,112,204,69 ; pshufd $0x45,%xmm4,%xmm9
+ DB 102,70,15,18,12,138 ; movlpd (%rdx,%r9,4),%xmm9
+ DB 233,151,254,255,255 ; jmpq e21 <_sk_srcover_rgba_8888_sse2_8bit+0x31>
+ DB 102,66,15,110,100,138,24 ; movd 0x18(%rdx,%r9,4),%xmm4
+ DB 102,68,15,112,196,69 ; pshufd $0x45,%xmm4,%xmm8
+ DB 243,66,15,16,100,138,20 ; movss 0x14(%rdx,%r9,4),%xmm4
+ DB 65,15,198,224,0 ; shufps $0x0,%xmm8,%xmm4
+ DB 65,15,198,224,226 ; shufps $0xe2,%xmm8,%xmm4
+ DB 68,15,40,196 ; movaps %xmm4,%xmm8
+ DB 243,66,15,16,100,138,16 ; movss 0x10(%rdx,%r9,4),%xmm4
+ DB 243,68,15,16,196 ; movss %xmm4,%xmm8
+ DB 233,94,254,255,255 ; jmpq e1b <_sk_srcover_rgba_8888_sse2_8bit+0x2b>
+ DB 102,65,15,112,225,78 ; pshufd $0x4e,%xmm9,%xmm4
+ DB 102,66,15,126,100,138,8 ; movd %xmm4,0x8(%rdx,%r9,4)
+ DB 102,70,15,214,12,138 ; movq %xmm9,(%rdx,%r9,4)
+ DB 233,57,255,255,255 ; jmpq f0e <_sk_srcover_rgba_8888_sse2_8bit+0x11e>
+ DB 102,65,15,112,224,78 ; pshufd $0x4e,%xmm8,%xmm4
+ DB 102,66,15,126,100,138,24 ; movd %xmm4,0x18(%rdx,%r9,4)
+ DB 102,65,15,112,224,229 ; pshufd $0xe5,%xmm8,%xmm4
+ DB 102,66,15,126,100,138,20 ; movd %xmm4,0x14(%rdx,%r9,4)
+ DB 102,70,15,126,68,138,16 ; movd %xmm8,0x10(%rdx,%r9,4)
+ DB 243,70,15,127,12,138 ; movdqu %xmm9,(%rdx,%r9,4)
+ DB 233,13,255,255,255 ; jmpq f0e <_sk_srcover_rgba_8888_sse2_8bit+0x11e>
+ DB 15,31,0 ; nopl (%rax)
+ DB 58,255 ; cmp %bh,%bh
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 123,255 ; jnp 1009 <_sk_srcover_rgba_8888_sse2_8bit+0x219>
+ DB 255 ; (bad)
+ DB 255,110,255 ; ljmp *-0x1(%rsi)
+ DB 255 ; (bad)
+ DB 255,23 ; callq *(%rdi)
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,168,255,255,255,147 ; ljmp *-0x6c000001(%rax)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,134,255,255,255,74 ; incl 0x4affffff(%rsi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,170,255,255,255,157 ; ljmp *-0x62000001(%rdx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,214 ; callq *%rsi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,207 ; dec %edi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,194 ; inc %edx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
+ DB 181,255 ; mov $0xff,%ch
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_scale_1_float_sse2_8bit
_sk_scale_1_float_sse2_8bit LABEL PROC
+ DB 102,68,15,111,193 ; movdqa %xmm1,%xmm8
+ DB 102,68,15,111,200 ; movdqa %xmm0,%xmm9
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 243,15,16,16 ; movss (%rax),%xmm2
- DB 243,15,89,21,95,9,0,0 ; mulss 0x95f(%rip),%xmm2 # 1284 <_sk_xor__sse2_8bit+0xc6>
- DB 243,15,44,194 ; cvttss2si %xmm2,%eax
- DB 102,15,239,219 ; pxor %xmm3,%xmm3
- DB 102,15,111,208 ; movdqa %xmm0,%xmm2
- DB 102,15,96,211 ; punpcklbw %xmm3,%xmm2
- DB 102,15,104,195 ; punpckhbw %xmm3,%xmm0
- DB 102,15,110,216 ; movd %eax,%xmm3
- DB 102,15,96,219 ; punpcklbw %xmm3,%xmm3
- DB 242,15,112,219,0 ; pshuflw $0x0,%xmm3,%xmm3
- DB 102,15,112,219,80 ; pshufd $0x50,%xmm3,%xmm3
- DB 102,15,219,29,205,9,0,0 ; pand 0x9cd(%rip),%xmm3 # 1320 <_sk_xor__sse2_8bit+0x162>
- DB 102,15,111,227 ; movdqa %xmm3,%xmm4
- DB 102,15,213,224 ; pmullw %xmm0,%xmm4
- DB 102,15,213,218 ; pmullw %xmm2,%xmm3
- DB 102,15,253,211 ; paddw %xmm3,%xmm2
- DB 102,15,253,224 ; paddw %xmm0,%xmm4
- DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,212 ; packuswb %xmm4,%xmm2
+ DB 243,15,16,0 ; movss (%rax),%xmm0
+ DB 243,15,89,5,56,19,0,0 ; mulss 0x1338(%rip),%xmm0 # 238c <_sk_xor__sse2_8bit+0x1a9>
+ DB 243,15,44,192 ; cvttss2si %xmm0,%eax
+ DB 102,15,239,246 ; pxor %xmm6,%xmm6
+ DB 102,65,15,111,193 ; movdqa %xmm9,%xmm0
+ DB 102,15,96,198 ; punpcklbw %xmm6,%xmm0
+ DB 102,68,15,104,206 ; punpckhbw %xmm6,%xmm9
+ DB 102,15,96,206 ; punpcklbw %xmm6,%xmm1
+ DB 102,68,15,104,198 ; punpckhbw %xmm6,%xmm8
+ DB 102,15,110,240 ; movd %eax,%xmm6
+ DB 102,15,96,246 ; punpcklbw %xmm6,%xmm6
+ DB 242,15,112,246,0 ; pshuflw $0x0,%xmm6,%xmm6
+ DB 102,15,112,246,80 ; pshufd $0x50,%xmm6,%xmm6
+ DB 102,15,219,53,195,19,0,0 ; pand 0x13c3(%rip),%xmm6 # 2450 <_sk_xor__sse2_8bit+0x26d>
+ DB 102,15,111,254 ; movdqa %xmm6,%xmm7
+ DB 102,65,15,213,248 ; pmullw %xmm8,%xmm7
+ DB 102,15,111,230 ; movdqa %xmm6,%xmm4
+ DB 102,15,213,225 ; pmullw %xmm1,%xmm4
+ DB 102,15,111,238 ; movdqa %xmm6,%xmm5
+ DB 102,65,15,213,233 ; pmullw %xmm9,%xmm5
+ DB 102,15,213,240 ; pmullw %xmm0,%xmm6
+ DB 102,15,253,198 ; paddw %xmm6,%xmm0
+ DB 102,65,15,253,233 ; paddw %xmm9,%xmm5
+ DB 102,15,253,204 ; paddw %xmm4,%xmm1
+ DB 102,65,15,253,248 ; paddw %xmm8,%xmm7
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
+ DB 102,15,103,197 ; packuswb %xmm5,%xmm0
+ DB 102,15,103,207 ; packuswb %xmm7,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 102,15,111,194 ; movdqa %xmm2,%xmm0
DB 255,224 ; jmpq *%rax
PUBLIC _sk_scale_u8_sse2_8bit
@@ -42396,96 +45032,179 @@ _sk_scale_u8_sse2_8bit LABEL PROC
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 117,114 ; jne a0c <_sk_scale_u8_sse2_8bit+0x8f>
- DB 102,66,15,110,20,2 ; movd (%rdx,%r8,1),%xmm2
- DB 102,15,96,208 ; punpcklbw %xmm0,%xmm2
- DB 102,15,97,208 ; punpcklwd %xmm0,%xmm2
- DB 102,15,114,242,24 ; pslld $0x18,%xmm2
- DB 242,15,112,210,231 ; pshuflw $0xe7,%xmm2,%xmm2
- DB 243,15,112,210,231 ; pshufhw $0xe7,%xmm2,%xmm2
- DB 102,15,112,210,232 ; pshufd $0xe8,%xmm2,%xmm2
- DB 102,15,96,210 ; punpcklbw %xmm2,%xmm2
- DB 242,15,112,210,95 ; pshuflw $0x5f,%xmm2,%xmm2
- DB 243,15,112,218,95 ; pshufhw $0x5f,%xmm2,%xmm3
- DB 102,15,239,228 ; pxor %xmm4,%xmm4
- DB 102,15,111,232 ; movdqa %xmm0,%xmm5
- DB 102,15,96,236 ; punpcklbw %xmm4,%xmm5
- DB 102,15,104,196 ; punpckhbw %xmm4,%xmm0
- DB 102,15,111,211 ; movdqa %xmm3,%xmm2
- DB 102,15,96,212 ; punpcklbw %xmm4,%xmm2
- DB 102,15,104,220 ; punpckhbw %xmm4,%xmm3
- DB 102,15,213,216 ; pmullw %xmm0,%xmm3
- DB 102,15,213,213 ; pmullw %xmm5,%xmm2
- DB 102,15,253,213 ; paddw %xmm5,%xmm2
- DB 102,15,253,216 ; paddw %xmm0,%xmm3
- DB 102,15,113,211,8 ; psrlw $0x8,%xmm3
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,211 ; packuswb %xmm3,%xmm2
+ DB 15,133,239,0,0,0 ; jne 11ed <_sk_scale_u8_sse2_8bit+0x110>
+ DB 243,66,15,126,36,2 ; movq (%rdx,%r8,1),%xmm4
+ DB 102,15,96,224 ; punpcklbw %xmm0,%xmm4
+ DB 102,15,84,37,80,19,0,0 ; andpd 0x1350(%rip),%xmm4 # 2460 <_sk_xor__sse2_8bit+0x27d>
+ DB 102,69,15,239,192 ; pxor %xmm8,%xmm8
+ DB 102,15,40,236 ; movapd %xmm4,%xmm5
+ DB 102,65,15,105,232 ; punpckhwd %xmm8,%xmm5
+ DB 102,65,15,97,224 ; punpcklwd %xmm8,%xmm4
+ DB 102,15,114,244,24 ; pslld $0x18,%xmm4
+ DB 102,15,114,245,24 ; pslld $0x18,%xmm5
+ DB 242,15,112,237,231 ; pshuflw $0xe7,%xmm5,%xmm5
+ DB 243,15,112,237,231 ; pshufhw $0xe7,%xmm5,%xmm5
+ DB 102,15,112,237,232 ; pshufd $0xe8,%xmm5,%xmm5
+ DB 102,15,96,237 ; punpcklbw %xmm5,%xmm5
+ DB 242,15,112,237,95 ; pshuflw $0x5f,%xmm5,%xmm5
+ DB 243,15,112,245,95 ; pshufhw $0x5f,%xmm5,%xmm6
+ DB 242,15,112,228,231 ; pshuflw $0xe7,%xmm4,%xmm4
+ DB 243,15,112,228,231 ; pshufhw $0xe7,%xmm4,%xmm4
+ DB 102,15,112,228,232 ; pshufd $0xe8,%xmm4,%xmm4
+ DB 102,15,96,228 ; punpcklbw %xmm4,%xmm4
+ DB 242,15,112,228,95 ; pshuflw $0x5f,%xmm4,%xmm4
+ DB 243,15,112,252,95 ; pshufhw $0x5f,%xmm4,%xmm7
+ DB 102,68,15,111,200 ; movdqa %xmm0,%xmm9
+ DB 102,69,15,96,200 ; punpcklbw %xmm8,%xmm9
+ DB 102,65,15,104,192 ; punpckhbw %xmm8,%xmm0
+ DB 102,68,15,111,209 ; movdqa %xmm1,%xmm10
+ DB 102,69,15,96,208 ; punpcklbw %xmm8,%xmm10
+ DB 102,65,15,104,200 ; punpckhbw %xmm8,%xmm1
+ DB 102,15,111,231 ; movdqa %xmm7,%xmm4
+ DB 102,65,15,96,224 ; punpcklbw %xmm8,%xmm4
+ DB 102,65,15,104,248 ; punpckhbw %xmm8,%xmm7
+ DB 102,15,111,238 ; movdqa %xmm6,%xmm5
+ DB 102,65,15,96,232 ; punpcklbw %xmm8,%xmm5
+ DB 102,65,15,104,240 ; punpckhbw %xmm8,%xmm6
+ DB 102,15,213,241 ; pmullw %xmm1,%xmm6
+ DB 102,65,15,213,234 ; pmullw %xmm10,%xmm5
+ DB 102,15,213,248 ; pmullw %xmm0,%xmm7
+ DB 102,65,15,213,225 ; pmullw %xmm9,%xmm4
+ DB 102,65,15,253,225 ; paddw %xmm9,%xmm4
+ DB 102,15,253,248 ; paddw %xmm0,%xmm7
+ DB 102,65,15,253,234 ; paddw %xmm10,%xmm5
+ DB 102,15,253,241 ; paddw %xmm1,%xmm6
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,15,103,231 ; packuswb %xmm7,%xmm4
+ DB 102,15,103,238 ; packuswb %xmm6,%xmm5
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 102,15,111,194 ; movdqa %xmm2,%xmm0
+ DB 102,15,111,196 ; movdqa %xmm4,%xmm0
+ DB 102,15,111,205 ; movdqa %xmm5,%xmm1
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,57 ; je a4f <_sk_scale_u8_sse2_8bit+0xd2>
- DB 102,15,239,210 ; pxor %xmm2,%xmm2
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,21 ; je a35 <_sk_scale_u8_sse2_8bit+0xb8>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 117,130 ; jne 9a8 <_sk_scale_u8_sse2_8bit+0x2b>
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,228 ; pxor %xmm4,%xmm4
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 15,135,6,255,255,255 ; ja 1108 <_sk_scale_u8_sse2_8bit+0x2b>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,119,0,0,0 ; lea 0x77(%rip),%rcx # 1284 <_sk_scale_u8_sse2_8bit+0x1a7>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
+ DB 102,15,110,224 ; movd %eax,%xmm4
+ DB 233,228,254,255,255 ; jmpq 1108 <_sk_scale_u8_sse2_8bit+0x2b>
DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
- DB 102,15,110,208 ; movd %eax,%xmm2
- DB 102,15,112,210,69 ; pshufd $0x45,%xmm2,%xmm2
+ DB 102,15,239,228 ; pxor %xmm4,%xmm4
+ DB 102,15,196,224,2 ; pinsrw $0x2,%eax,%xmm4
DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
- DB 102,15,110,216 ; movd %eax,%xmm3
- DB 102,15,96,216 ; punpcklbw %xmm0,%xmm3
- DB 102,15,97,216 ; punpcklwd %xmm0,%xmm3
- DB 242,15,16,211 ; movsd %xmm3,%xmm2
- DB 233,89,255,255,255 ; jmpq 9a8 <_sk_scale_u8_sse2_8bit+0x2b>
- DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
- DB 102,15,110,208 ; movd %eax,%xmm2
- DB 233,75,255,255,255 ; jmpq 9a8 <_sk_scale_u8_sse2_8bit+0x2b>
+ DB 102,15,110,232 ; movd %eax,%xmm5
+ DB 102,15,96,232 ; punpcklbw %xmm0,%xmm5
+ DB 243,15,16,229 ; movss %xmm5,%xmm4
+ DB 233,191,254,255,255 ; jmpq 1108 <_sk_scale_u8_sse2_8bit+0x2b>
+ DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax
+ DB 102,15,239,228 ; pxor %xmm4,%xmm4
+ DB 102,15,196,224,6 ; pinsrw $0x6,%eax,%xmm4
+ DB 66,15,182,68,2,5 ; movzbl 0x5(%rdx,%r8,1),%eax
+ DB 102,15,196,224,5 ; pinsrw $0x5,%eax,%xmm4
+ DB 66,15,182,68,2,4 ; movzbl 0x4(%rdx,%r8,1),%eax
+ DB 102,15,196,224,4 ; pinsrw $0x4,%eax,%xmm4
+ DB 102,66,15,110,44,2 ; movd (%rdx,%r8,1),%xmm5
+ DB 102,15,96,232 ; punpcklbw %xmm0,%xmm5
+ DB 242,15,16,229 ; movsd %xmm5,%xmm4
+ DB 233,135,254,255,255 ; jmpq 1108 <_sk_scale_u8_sse2_8bit+0x2b>
+ DB 15,31,0 ; nopl (%rax)
+ DB 146 ; xchg %eax,%edx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,175,255,255,255,160 ; ljmp *-0x5f000001(%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 234 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 223,255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,212 ; callq *%rsp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,197 ; inc %ebp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_lerp_1_float_sse2_8bit
_sk_lerp_1_float_sse2_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 243,15,16,16 ; movss (%rax),%xmm2
- DB 243,15,89,21,29,8,0,0 ; mulss 0x81d(%rip),%xmm2 # 1288 <_sk_xor__sse2_8bit+0xca>
- DB 243,15,44,194 ; cvttss2si %xmm2,%eax
- DB 102,15,110,208 ; movd %eax,%xmm2
- DB 102,15,96,210 ; punpcklbw %xmm2,%xmm2
- DB 242,15,112,210,0 ; pshuflw $0x0,%xmm2,%xmm2
- DB 102,15,112,218,80 ; pshufd $0x50,%xmm2,%xmm3
- DB 102,15,239,228 ; pxor %xmm4,%xmm4
- DB 102,15,111,232 ; movdqa %xmm0,%xmm5
- DB 102,15,96,236 ; punpcklbw %xmm4,%xmm5
- DB 102,15,104,196 ; punpckhbw %xmm4,%xmm0
- DB 102,15,111,21,151,8,0,0 ; movdqa 0x897(%rip),%xmm2 # 1330 <_sk_xor__sse2_8bit+0x172>
- DB 102,15,219,211 ; pand %xmm3,%xmm2
- DB 102,15,111,242 ; movdqa %xmm2,%xmm6
- DB 102,15,213,240 ; pmullw %xmm0,%xmm6
- DB 102,15,213,213 ; pmullw %xmm5,%xmm2
- DB 102,15,253,213 ; paddw %xmm5,%xmm2
- DB 102,15,253,240 ; paddw %xmm0,%xmm6
- DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,214 ; packuswb %xmm6,%xmm2
- DB 102,15,118,237 ; pcmpeqd %xmm5,%xmm5
- DB 102,15,239,235 ; pxor %xmm3,%xmm5
- DB 102,15,111,217 ; movdqa %xmm1,%xmm3
- DB 102,15,111,241 ; movdqa %xmm1,%xmm6
- DB 102,15,96,244 ; punpcklbw %xmm4,%xmm6
- DB 102,15,104,220 ; punpckhbw %xmm4,%xmm3
- DB 102,15,111,197 ; movdqa %xmm5,%xmm0
- DB 102,15,96,196 ; punpcklbw %xmm4,%xmm0
- DB 102,15,104,236 ; punpckhbw %xmm4,%xmm5
- DB 102,15,213,235 ; pmullw %xmm3,%xmm5
- DB 102,15,213,198 ; pmullw %xmm6,%xmm0
- DB 102,15,253,198 ; paddw %xmm6,%xmm0
- DB 102,15,253,235 ; paddw %xmm3,%xmm5
+ DB 243,15,16,32 ; movss (%rax),%xmm4
+ DB 243,15,89,37,226,16,0,0 ; mulss 0x10e2(%rip),%xmm4 # 2390 <_sk_xor__sse2_8bit+0x1ad>
+ DB 243,15,44,196 ; cvttss2si %xmm4,%eax
+ DB 102,15,110,224 ; movd %eax,%xmm4
+ DB 102,15,96,228 ; punpcklbw %xmm4,%xmm4
+ DB 242,15,112,228,0 ; pshuflw $0x0,%xmm4,%xmm4
+ DB 102,68,15,112,196,80 ; pshufd $0x50,%xmm4,%xmm8
+ DB 102,69,15,239,201 ; pxor %xmm9,%xmm9
+ DB 102,15,111,248 ; movdqa %xmm0,%xmm7
+ DB 102,65,15,96,249 ; punpcklbw %xmm9,%xmm7
+ DB 102,65,15,104,193 ; punpckhbw %xmm9,%xmm0
+ DB 102,68,15,111,217 ; movdqa %xmm1,%xmm11
+ DB 102,69,15,96,217 ; punpcklbw %xmm9,%xmm11
+ DB 102,65,15,104,201 ; punpckhbw %xmm9,%xmm1
+ DB 102,15,111,53,129,17,0,0 ; movdqa 0x1181(%rip),%xmm6 # 2470 <_sk_xor__sse2_8bit+0x28d>
+ DB 102,65,15,219,240 ; pand %xmm8,%xmm6
+ DB 102,15,111,230 ; movdqa %xmm6,%xmm4
+ DB 102,15,213,225 ; pmullw %xmm1,%xmm4
+ DB 102,68,15,111,214 ; movdqa %xmm6,%xmm10
+ DB 102,69,15,213,211 ; pmullw %xmm11,%xmm10
+ DB 102,15,111,238 ; movdqa %xmm6,%xmm5
+ DB 102,15,213,232 ; pmullw %xmm0,%xmm5
+ DB 102,15,213,247 ; pmullw %xmm7,%xmm6
+ DB 102,15,253,247 ; paddw %xmm7,%xmm6
+ DB 102,15,253,232 ; paddw %xmm0,%xmm5
+ DB 102,69,15,253,211 ; paddw %xmm11,%xmm10
+ DB 102,15,253,225 ; paddw %xmm1,%xmm4
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,65,15,113,210,8 ; psrlw $0x8,%xmm10
DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,103,245 ; packuswb %xmm5,%xmm6
+ DB 102,68,15,103,212 ; packuswb %xmm4,%xmm10
+ DB 102,15,118,255 ; pcmpeqd %xmm7,%xmm7
+ DB 102,65,15,239,248 ; pxor %xmm8,%xmm7
+ DB 102,68,15,111,218 ; movdqa %xmm2,%xmm11
+ DB 102,15,111,234 ; movdqa %xmm2,%xmm5
+ DB 102,65,15,96,233 ; punpcklbw %xmm9,%xmm5
+ DB 102,69,15,104,217 ; punpckhbw %xmm9,%xmm11
+ DB 102,68,15,111,195 ; movdqa %xmm3,%xmm8
+ DB 102,68,15,111,227 ; movdqa %xmm3,%xmm12
+ DB 102,69,15,96,225 ; punpcklbw %xmm9,%xmm12
+ DB 102,69,15,104,193 ; punpckhbw %xmm9,%xmm8
+ DB 102,15,111,199 ; movdqa %xmm7,%xmm0
+ DB 102,65,15,96,193 ; punpcklbw %xmm9,%xmm0
+ DB 102,65,15,104,249 ; punpckhbw %xmm9,%xmm7
+ DB 102,15,111,231 ; movdqa %xmm7,%xmm4
+ DB 102,65,15,213,224 ; pmullw %xmm8,%xmm4
+ DB 102,15,111,200 ; movdqa %xmm0,%xmm1
+ DB 102,65,15,213,204 ; pmullw %xmm12,%xmm1
+ DB 102,65,15,213,251 ; pmullw %xmm11,%xmm7
+ DB 102,15,213,197 ; pmullw %xmm5,%xmm0
+ DB 102,15,253,197 ; paddw %xmm5,%xmm0
+ DB 102,65,15,253,251 ; paddw %xmm11,%xmm7
+ DB 102,65,15,253,204 ; paddw %xmm12,%xmm1
+ DB 102,65,15,253,224 ; paddw %xmm8,%xmm4
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
- DB 102,15,103,197 ; packuswb %xmm5,%xmm0
- DB 102,15,252,194 ; paddb %xmm2,%xmm0
+ DB 102,15,103,199 ; packuswb %xmm7,%xmm0
+ DB 102,15,103,204 ; packuswb %xmm4,%xmm1
+ DB 102,15,252,198 ; paddb %xmm6,%xmm0
+ DB 102,65,15,252,202 ; paddb %xmm10,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
@@ -42499,276 +45218,498 @@ _sk_lerp_u8_sse2_8bit LABEL PROC
DB 72,15,175,209 ; imul %rcx,%rdx
DB 72,3,16 ; add (%rax),%rdx
DB 77,133,201 ; test %r9,%r9
- DB 15,133,180,0,0,0 ; jne bde <_sk_lerp_u8_sse2_8bit+0xd5>
- DB 102,66,15,110,20,2 ; movd (%rdx,%r8,1),%xmm2
- DB 102,15,96,208 ; punpcklbw %xmm0,%xmm2
- DB 102,15,97,208 ; punpcklwd %xmm0,%xmm2
- DB 102,15,114,242,24 ; pslld $0x18,%xmm2
- DB 242,15,112,210,231 ; pshuflw $0xe7,%xmm2,%xmm2
- DB 243,15,112,210,231 ; pshufhw $0xe7,%xmm2,%xmm2
- DB 102,15,112,210,232 ; pshufd $0xe8,%xmm2,%xmm2
- DB 102,15,96,210 ; punpcklbw %xmm2,%xmm2
- DB 242,15,112,210,95 ; pshuflw $0x5f,%xmm2,%xmm2
- DB 243,15,112,218,95 ; pshufhw $0x5f,%xmm2,%xmm3
- DB 102,15,239,210 ; pxor %xmm2,%xmm2
- DB 102,15,111,224 ; movdqa %xmm0,%xmm4
- DB 102,15,96,226 ; punpcklbw %xmm2,%xmm4
- DB 102,15,104,194 ; punpckhbw %xmm2,%xmm0
- DB 102,15,111,235 ; movdqa %xmm3,%xmm5
- DB 102,15,118,246 ; pcmpeqd %xmm6,%xmm6
- DB 102,15,239,243 ; pxor %xmm3,%xmm6
- DB 102,15,96,218 ; punpcklbw %xmm2,%xmm3
- DB 102,15,104,234 ; punpckhbw %xmm2,%xmm5
+ DB 15,133,141,1,0,0 ; jne 1584 <_sk_lerp_u8_sse2_8bit+0x1ae>
+ DB 243,66,15,126,44,2 ; movq (%rdx,%r8,1),%xmm5
+ DB 102,15,96,232 ; punpcklbw %xmm0,%xmm5
+ DB 102,15,84,45,119,16,0,0 ; andpd 0x1077(%rip),%xmm5 # 2480 <_sk_xor__sse2_8bit+0x29d>
+ DB 102,69,15,239,192 ; pxor %xmm8,%xmm8
+ DB 102,15,40,229 ; movapd %xmm5,%xmm4
+ DB 102,65,15,105,224 ; punpckhwd %xmm8,%xmm4
+ DB 102,65,15,97,232 ; punpcklwd %xmm8,%xmm5
+ DB 102,15,114,245,24 ; pslld $0x18,%xmm5
+ DB 102,15,114,244,24 ; pslld $0x18,%xmm4
+ DB 242,15,112,228,231 ; pshuflw $0xe7,%xmm4,%xmm4
+ DB 243,15,112,228,231 ; pshufhw $0xe7,%xmm4,%xmm4
+ DB 102,15,112,228,232 ; pshufd $0xe8,%xmm4,%xmm4
+ DB 102,15,96,228 ; punpcklbw %xmm4,%xmm4
+ DB 242,15,112,228,95 ; pshuflw $0x5f,%xmm4,%xmm4
+ DB 243,15,112,244,95 ; pshufhw $0x5f,%xmm4,%xmm6
+ DB 242,15,112,237,231 ; pshuflw $0xe7,%xmm5,%xmm5
+ DB 243,15,112,237,231 ; pshufhw $0xe7,%xmm5,%xmm5
+ DB 102,15,112,237,232 ; pshufd $0xe8,%xmm5,%xmm5
+ DB 102,15,96,237 ; punpcklbw %xmm5,%xmm5
+ DB 242,15,112,237,95 ; pshuflw $0x5f,%xmm5,%xmm5
+ DB 243,15,112,253,95 ; pshufhw $0x5f,%xmm5,%xmm7
+ DB 102,68,15,111,200 ; movdqa %xmm0,%xmm9
+ DB 102,69,15,96,200 ; punpcklbw %xmm8,%xmm9
+ DB 102,65,15,104,192 ; punpckhbw %xmm8,%xmm0
+ DB 102,68,15,111,209 ; movdqa %xmm1,%xmm10
+ DB 102,69,15,96,208 ; punpcklbw %xmm8,%xmm10
+ DB 102,65,15,104,200 ; punpckhbw %xmm8,%xmm1
+ DB 102,68,15,111,223 ; movdqa %xmm7,%xmm11
+ DB 102,69,15,96,216 ; punpcklbw %xmm8,%xmm11
+ DB 102,15,111,239 ; movdqa %xmm7,%xmm5
+ DB 102,65,15,104,232 ; punpckhbw %xmm8,%xmm5
+ DB 102,68,15,111,230 ; movdqa %xmm6,%xmm12
+ DB 102,69,15,96,224 ; punpcklbw %xmm8,%xmm12
+ DB 102,15,111,230 ; movdqa %xmm6,%xmm4
+ DB 102,65,15,104,224 ; punpckhbw %xmm8,%xmm4
+ DB 102,15,213,225 ; pmullw %xmm1,%xmm4
+ DB 102,69,15,213,226 ; pmullw %xmm10,%xmm12
DB 102,15,213,232 ; pmullw %xmm0,%xmm5
- DB 102,15,213,220 ; pmullw %xmm4,%xmm3
- DB 102,15,253,220 ; paddw %xmm4,%xmm3
+ DB 102,69,15,213,217 ; pmullw %xmm9,%xmm11
+ DB 102,69,15,253,217 ; paddw %xmm9,%xmm11
DB 102,15,253,232 ; paddw %xmm0,%xmm5
+ DB 102,69,15,253,226 ; paddw %xmm10,%xmm12
+ DB 102,15,253,225 ; paddw %xmm1,%xmm4
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,65,15,113,212,8 ; psrlw $0x8,%xmm12
DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
- DB 102,15,113,211,8 ; psrlw $0x8,%xmm3
- DB 102,15,103,221 ; packuswb %xmm5,%xmm3
- DB 102,15,111,225 ; movdqa %xmm1,%xmm4
- DB 102,15,96,226 ; punpcklbw %xmm2,%xmm4
- DB 102,15,111,233 ; movdqa %xmm1,%xmm5
- DB 102,15,104,234 ; punpckhbw %xmm2,%xmm5
- DB 102,15,111,198 ; movdqa %xmm6,%xmm0
- DB 102,15,96,194 ; punpcklbw %xmm2,%xmm0
- DB 102,15,104,242 ; punpckhbw %xmm2,%xmm6
- DB 102,15,213,245 ; pmullw %xmm5,%xmm6
+ DB 102,65,15,113,211,8 ; psrlw $0x8,%xmm11
+ DB 102,68,15,103,221 ; packuswb %xmm5,%xmm11
+ DB 102,68,15,103,228 ; packuswb %xmm4,%xmm12
+ DB 102,15,118,192 ; pcmpeqd %xmm0,%xmm0
+ DB 102,15,239,240 ; pxor %xmm0,%xmm6
+ DB 102,15,239,248 ; pxor %xmm0,%xmm7
+ DB 102,15,111,226 ; movdqa %xmm2,%xmm4
+ DB 102,65,15,96,224 ; punpcklbw %xmm8,%xmm4
+ DB 102,15,111,234 ; movdqa %xmm2,%xmm5
+ DB 102,65,15,104,232 ; punpckhbw %xmm8,%xmm5
+ DB 102,68,15,111,203 ; movdqa %xmm3,%xmm9
+ DB 102,69,15,96,200 ; punpcklbw %xmm8,%xmm9
+ DB 102,68,15,111,211 ; movdqa %xmm3,%xmm10
+ DB 102,69,15,104,208 ; punpckhbw %xmm8,%xmm10
+ DB 102,15,111,199 ; movdqa %xmm7,%xmm0
+ DB 102,65,15,96,192 ; punpcklbw %xmm8,%xmm0
+ DB 102,65,15,104,248 ; punpckhbw %xmm8,%xmm7
+ DB 102,15,111,206 ; movdqa %xmm6,%xmm1
+ DB 102,65,15,96,200 ; punpcklbw %xmm8,%xmm1
+ DB 102,65,15,104,240 ; punpckhbw %xmm8,%xmm6
+ DB 102,65,15,213,242 ; pmullw %xmm10,%xmm6
+ DB 102,65,15,213,201 ; pmullw %xmm9,%xmm1
+ DB 102,15,213,253 ; pmullw %xmm5,%xmm7
DB 102,15,213,196 ; pmullw %xmm4,%xmm0
DB 102,15,253,196 ; paddw %xmm4,%xmm0
- DB 102,15,253,245 ; paddw %xmm5,%xmm6
+ DB 102,15,253,253 ; paddw %xmm5,%xmm7
+ DB 102,65,15,253,201 ; paddw %xmm9,%xmm1
+ DB 102,65,15,253,242 ; paddw %xmm10,%xmm6
DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
- DB 102,15,103,198 ; packuswb %xmm6,%xmm0
- DB 102,15,252,195 ; paddb %xmm3,%xmm0
+ DB 102,15,103,199 ; packuswb %xmm7,%xmm0
+ DB 102,15,103,206 ; packuswb %xmm6,%xmm1
+ DB 102,65,15,252,195 ; paddb %xmm11,%xmm0
+ DB 102,65,15,252,204 ; paddb %xmm12,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
- DB 65,128,225,3 ; and $0x3,%r9b
- DB 65,128,249,1 ; cmp $0x1,%r9b
- DB 116,61 ; je c25 <_sk_lerp_u8_sse2_8bit+0x11c>
- DB 102,15,239,210 ; pxor %xmm2,%xmm2
- DB 65,128,249,2 ; cmp $0x2,%r9b
- DB 116,25 ; je c0b <_sk_lerp_u8_sse2_8bit+0x102>
- DB 65,128,249,3 ; cmp $0x3,%r9b
- DB 15,133,60,255,255,255 ; jne b38 <_sk_lerp_u8_sse2_8bit+0x2f>
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,237 ; pxor %xmm5,%xmm5
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 15,135,104,254,255,255 ; ja 1401 <_sk_lerp_u8_sse2_8bit+0x2b>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 72,141,13,116,0,0,0 ; lea 0x74(%rip),%rcx # 1618 <_sk_lerp_u8_sse2_8bit+0x242>
+ DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
+ DB 72,1,200 ; add %rcx,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
+ DB 102,15,110,232 ; movd %eax,%xmm5
+ DB 233,70,254,255,255 ; jmpq 1401 <_sk_lerp_u8_sse2_8bit+0x2b>
DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax
- DB 102,15,110,208 ; movd %eax,%xmm2
- DB 102,15,112,210,69 ; pshufd $0x45,%xmm2,%xmm2
+ DB 102,15,239,237 ; pxor %xmm5,%xmm5
+ DB 102,15,196,232,2 ; pinsrw $0x2,%eax,%xmm5
DB 66,15,183,4,2 ; movzwl (%rdx,%r8,1),%eax
- DB 102,15,110,216 ; movd %eax,%xmm3
- DB 102,15,96,216 ; punpcklbw %xmm0,%xmm3
- DB 102,15,97,216 ; punpcklwd %xmm0,%xmm3
- DB 242,15,16,211 ; movsd %xmm3,%xmm2
- DB 233,19,255,255,255 ; jmpq b38 <_sk_lerp_u8_sse2_8bit+0x2f>
- DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax
- DB 102,15,110,208 ; movd %eax,%xmm2
- DB 233,5,255,255,255 ; jmpq b38 <_sk_lerp_u8_sse2_8bit+0x2f>
+ DB 102,15,110,224 ; movd %eax,%xmm4
+ DB 102,15,96,224 ; punpcklbw %xmm0,%xmm4
+ DB 243,15,16,236 ; movss %xmm4,%xmm5
+ DB 233,33,254,255,255 ; jmpq 1401 <_sk_lerp_u8_sse2_8bit+0x2b>
+ DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax
+ DB 102,15,239,237 ; pxor %xmm5,%xmm5
+ DB 102,15,196,232,6 ; pinsrw $0x6,%eax,%xmm5
+ DB 66,15,182,68,2,5 ; movzbl 0x5(%rdx,%r8,1),%eax
+ DB 102,15,196,232,5 ; pinsrw $0x5,%eax,%xmm5
+ DB 66,15,182,68,2,4 ; movzbl 0x4(%rdx,%r8,1),%eax
+ DB 102,15,196,232,4 ; pinsrw $0x4,%eax,%xmm5
+ DB 102,66,15,110,36,2 ; movd (%rdx,%r8,1),%xmm4
+ DB 102,15,96,224 ; punpcklbw %xmm0,%xmm4
+ DB 242,15,16,236 ; movsd %xmm4,%xmm5
+ DB 233,233,253,255,255 ; jmpq 1401 <_sk_lerp_u8_sse2_8bit+0x2b>
+ DB 149 ; xchg %eax,%ebp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,178,255,255,255,163 ; pushq -0x5c000001(%rdx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 237 ; in (%dx),%eax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,226 ; jmpq *%rdx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,215 ; callq *%rdi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,200 ; dec %eax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
PUBLIC _sk_move_src_dst_sse2_8bit
_sk_move_src_dst_sse2_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 15,40,200 ; movaps %xmm0,%xmm1
+ DB 15,40,208 ; movaps %xmm0,%xmm2
+ DB 15,40,217 ; movaps %xmm1,%xmm3
DB 255,224 ; jmpq *%rax
PUBLIC _sk_move_dst_src_sse2_8bit
_sk_move_dst_src_sse2_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 15,40,193 ; movaps %xmm1,%xmm0
+ DB 15,40,194 ; movaps %xmm2,%xmm0
+ DB 15,40,203 ; movaps %xmm3,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_black_color_sse2_8bit
_sk_black_color_sse2_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 15,40,5,246,6,0,0 ; movaps 0x6f6(%rip),%xmm0 # 1340 <_sk_xor__sse2_8bit+0x182>
+ DB 15,40,5,63,14,0,0 ; movaps 0xe3f(%rip),%xmm0 # 2490 <_sk_xor__sse2_8bit+0x2ad>
+ DB 15,40,200 ; movaps %xmm0,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_white_color_sse2_8bit
_sk_white_color_sse2_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
DB 102,15,118,192 ; pcmpeqd %xmm0,%xmm0
+ DB 102,15,118,201 ; pcmpeqd %xmm1,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_clear_sse2_8bit
_sk_clear_sse2_8bit LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
DB 15,87,192 ; xorps %xmm0,%xmm0
+ DB 15,87,201 ; xorps %xmm1,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_srcatop_sse2_8bit
_sk_srcatop_sse2_8bit LABEL PROC
- DB 242,15,112,209,231 ; pshuflw $0xe7,%xmm1,%xmm2
- DB 243,15,112,210,231 ; pshufhw $0xe7,%xmm2,%xmm2
- DB 102,15,112,210,232 ; pshufd $0xe8,%xmm2,%xmm2
- DB 102,15,96,210 ; punpcklbw %xmm2,%xmm2
- DB 242,15,112,210,95 ; pshuflw $0x5f,%xmm2,%xmm2
- DB 243,15,112,226,95 ; pshufhw $0x5f,%xmm2,%xmm4
- DB 102,15,239,219 ; pxor %xmm3,%xmm3
- DB 102,15,111,232 ; movdqa %xmm0,%xmm5
- DB 102,15,96,235 ; punpcklbw %xmm3,%xmm5
- DB 242,15,112,240,231 ; pshuflw $0xe7,%xmm0,%xmm6
- DB 102,15,104,195 ; punpckhbw %xmm3,%xmm0
- DB 102,15,111,212 ; movdqa %xmm4,%xmm2
- DB 102,15,96,211 ; punpcklbw %xmm3,%xmm2
- DB 102,15,104,227 ; punpckhbw %xmm3,%xmm4
- DB 102,15,213,224 ; pmullw %xmm0,%xmm4
- DB 102,15,213,213 ; pmullw %xmm5,%xmm2
- DB 102,15,253,213 ; paddw %xmm5,%xmm2
- DB 102,15,253,224 ; paddw %xmm0,%xmm4
+ DB 242,15,112,227,231 ; pshuflw $0xe7,%xmm3,%xmm4
+ DB 243,15,112,228,231 ; pshufhw $0xe7,%xmm4,%xmm4
+ DB 102,15,112,228,232 ; pshufd $0xe8,%xmm4,%xmm4
+ DB 102,15,96,228 ; punpcklbw %xmm4,%xmm4
+ DB 242,15,112,228,95 ; pshuflw $0x5f,%xmm4,%xmm4
+ DB 243,68,15,112,220,95 ; pshufhw $0x5f,%xmm4,%xmm11
+ DB 242,15,112,226,231 ; pshuflw $0xe7,%xmm2,%xmm4
+ DB 243,15,112,228,231 ; pshufhw $0xe7,%xmm4,%xmm4
+ DB 102,15,112,228,232 ; pshufd $0xe8,%xmm4,%xmm4
+ DB 102,15,96,228 ; punpcklbw %xmm4,%xmm4
+ DB 242,15,112,228,95 ; pshuflw $0x5f,%xmm4,%xmm4
+ DB 243,15,112,228,95 ; pshufhw $0x5f,%xmm4,%xmm4
+ DB 102,69,15,239,201 ; pxor %xmm9,%xmm9
+ DB 102,68,15,111,192 ; movdqa %xmm0,%xmm8
+ DB 242,68,15,112,208,231 ; pshuflw $0xe7,%xmm0,%xmm10
+ DB 102,65,15,96,193 ; punpcklbw %xmm9,%xmm0
+ DB 102,69,15,104,193 ; punpckhbw %xmm9,%xmm8
+ DB 102,15,111,249 ; movdqa %xmm1,%xmm7
+ DB 242,68,15,112,225,231 ; pshuflw $0xe7,%xmm1,%xmm12
+ DB 102,65,15,96,201 ; punpcklbw %xmm9,%xmm1
+ DB 102,65,15,104,249 ; punpckhbw %xmm9,%xmm7
+ DB 102,15,111,244 ; movdqa %xmm4,%xmm6
+ DB 102,65,15,96,241 ; punpcklbw %xmm9,%xmm6
+ DB 102,65,15,104,225 ; punpckhbw %xmm9,%xmm4
+ DB 102,65,15,111,235 ; movdqa %xmm11,%xmm5
+ DB 102,65,15,96,233 ; punpcklbw %xmm9,%xmm5
+ DB 102,69,15,104,217 ; punpckhbw %xmm9,%xmm11
+ DB 102,68,15,213,223 ; pmullw %xmm7,%xmm11
+ DB 102,15,213,233 ; pmullw %xmm1,%xmm5
+ DB 102,65,15,213,224 ; pmullw %xmm8,%xmm4
+ DB 102,15,213,240 ; pmullw %xmm0,%xmm6
+ DB 102,15,253,240 ; paddw %xmm0,%xmm6
+ DB 102,65,15,253,224 ; paddw %xmm8,%xmm4
+ DB 102,15,253,233 ; paddw %xmm1,%xmm5
+ DB 102,68,15,253,223 ; paddw %xmm7,%xmm11
+ DB 102,65,15,113,211,8 ; psrlw $0x8,%xmm11
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,212 ; packuswb %xmm4,%xmm2
- DB 243,15,112,198,231 ; pshufhw $0xe7,%xmm6,%xmm0
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,103,244 ; packuswb %xmm4,%xmm6
+ DB 102,65,15,103,235 ; packuswb %xmm11,%xmm5
+ DB 243,65,15,112,194,231 ; pshufhw $0xe7,%xmm10,%xmm0
DB 102,15,112,192,232 ; pshufd $0xe8,%xmm0,%xmm0
DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
DB 242,15,112,192,95 ; pshuflw $0x5f,%xmm0,%xmm0
- DB 243,15,112,192,95 ; pshufhw $0x5f,%xmm0,%xmm0
- DB 102,15,118,228 ; pcmpeqd %xmm4,%xmm4
+ DB 243,15,112,224,95 ; pshufhw $0x5f,%xmm0,%xmm4
+ DB 243,65,15,112,196,231 ; pshufhw $0xe7,%xmm12,%xmm0
+ DB 102,15,112,192,232 ; pshufd $0xe8,%xmm0,%xmm0
+ DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
+ DB 242,15,112,192,95 ; pshuflw $0x5f,%xmm0,%xmm0
+ DB 243,15,112,248,95 ; pshufhw $0x5f,%xmm0,%xmm7
+ DB 102,15,118,192 ; pcmpeqd %xmm0,%xmm0
+ DB 102,15,239,248 ; pxor %xmm0,%xmm7
DB 102,15,239,224 ; pxor %xmm0,%xmm4
- DB 102,15,111,233 ; movdqa %xmm1,%xmm5
- DB 102,15,111,241 ; movdqa %xmm1,%xmm6
- DB 102,15,96,243 ; punpcklbw %xmm3,%xmm6
- DB 102,15,104,235 ; punpckhbw %xmm3,%xmm5
+ DB 102,68,15,111,194 ; movdqa %xmm2,%xmm8
+ DB 102,68,15,111,210 ; movdqa %xmm2,%xmm10
+ DB 102,69,15,96,209 ; punpcklbw %xmm9,%xmm10
+ DB 102,69,15,104,193 ; punpckhbw %xmm9,%xmm8
+ DB 102,68,15,111,219 ; movdqa %xmm3,%xmm11
+ DB 102,68,15,111,227 ; movdqa %xmm3,%xmm12
+ DB 102,69,15,96,225 ; punpcklbw %xmm9,%xmm12
+ DB 102,69,15,104,217 ; punpckhbw %xmm9,%xmm11
DB 102,15,111,196 ; movdqa %xmm4,%xmm0
- DB 102,15,96,195 ; punpcklbw %xmm3,%xmm0
- DB 102,15,104,227 ; punpckhbw %xmm3,%xmm4
- DB 102,15,213,229 ; pmullw %xmm5,%xmm4
- DB 102,15,213,198 ; pmullw %xmm6,%xmm0
- DB 102,15,253,198 ; paddw %xmm6,%xmm0
- DB 102,15,253,229 ; paddw %xmm5,%xmm4
+ DB 102,65,15,96,193 ; punpcklbw %xmm9,%xmm0
+ DB 102,65,15,104,225 ; punpckhbw %xmm9,%xmm4
+ DB 102,15,111,207 ; movdqa %xmm7,%xmm1
+ DB 102,65,15,96,201 ; punpcklbw %xmm9,%xmm1
+ DB 102,65,15,104,249 ; punpckhbw %xmm9,%xmm7
+ DB 102,65,15,213,251 ; pmullw %xmm11,%xmm7
+ DB 102,65,15,213,204 ; pmullw %xmm12,%xmm1
+ DB 102,65,15,213,224 ; pmullw %xmm8,%xmm4
+ DB 102,65,15,213,194 ; pmullw %xmm10,%xmm0
+ DB 102,65,15,253,194 ; paddw %xmm10,%xmm0
+ DB 102,65,15,253,224 ; paddw %xmm8,%xmm4
+ DB 102,65,15,253,204 ; paddw %xmm12,%xmm1
+ DB 102,65,15,253,251 ; paddw %xmm11,%xmm7
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
DB 102,15,103,196 ; packuswb %xmm4,%xmm0
- DB 102,15,252,194 ; paddb %xmm2,%xmm0
+ DB 102,15,103,207 ; packuswb %xmm7,%xmm1
+ DB 102,15,252,198 ; paddb %xmm6,%xmm0
+ DB 102,15,252,205 ; paddb %xmm5,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_dstatop_sse2_8bit
_sk_dstatop_sse2_8bit LABEL PROC
- DB 242,15,112,208,231 ; pshuflw $0xe7,%xmm0,%xmm2
- DB 243,15,112,210,231 ; pshufhw $0xe7,%xmm2,%xmm2
- DB 102,15,112,210,232 ; pshufd $0xe8,%xmm2,%xmm2
- DB 102,15,96,210 ; punpcklbw %xmm2,%xmm2
- DB 242,15,112,210,95 ; pshuflw $0x5f,%xmm2,%xmm2
- DB 243,15,112,210,95 ; pshufhw $0x5f,%xmm2,%xmm2
- DB 102,15,239,219 ; pxor %xmm3,%xmm3
- DB 102,15,111,225 ; movdqa %xmm1,%xmm4
- DB 242,15,112,233,231 ; pshuflw $0xe7,%xmm1,%xmm5
- DB 102,15,111,241 ; movdqa %xmm1,%xmm6
- DB 102,15,96,243 ; punpcklbw %xmm3,%xmm6
+ DB 242,15,112,225,231 ; pshuflw $0xe7,%xmm1,%xmm4
+ DB 243,15,112,228,231 ; pshufhw $0xe7,%xmm4,%xmm4
+ DB 102,15,112,228,232 ; pshufd $0xe8,%xmm4,%xmm4
+ DB 102,15,96,228 ; punpcklbw %xmm4,%xmm4
+ DB 242,15,112,228,95 ; pshuflw $0x5f,%xmm4,%xmm4
+ DB 243,15,112,228,95 ; pshufhw $0x5f,%xmm4,%xmm4
+ DB 242,15,112,232,231 ; pshuflw $0xe7,%xmm0,%xmm5
+ DB 243,15,112,237,231 ; pshufhw $0xe7,%xmm5,%xmm5
+ DB 102,15,112,237,232 ; pshufd $0xe8,%xmm5,%xmm5
+ DB 102,15,96,237 ; punpcklbw %xmm5,%xmm5
+ DB 242,15,112,237,95 ; pshuflw $0x5f,%xmm5,%xmm5
+ DB 243,15,112,237,95 ; pshufhw $0x5f,%xmm5,%xmm5
+ DB 102,69,15,239,192 ; pxor %xmm8,%xmm8
+ DB 102,68,15,111,210 ; movdqa %xmm2,%xmm10
+ DB 242,68,15,112,218,231 ; pshuflw $0xe7,%xmm2,%xmm11
DB 102,15,111,250 ; movdqa %xmm2,%xmm7
- DB 102,15,96,251 ; punpcklbw %xmm3,%xmm7
- DB 102,15,213,254 ; pmullw %xmm6,%xmm7
- DB 102,15,253,254 ; paddw %xmm6,%xmm7
- DB 102,15,104,227 ; punpckhbw %xmm3,%xmm4
- DB 102,15,104,211 ; punpckhbw %xmm3,%xmm2
- DB 102,15,213,212 ; pmullw %xmm4,%xmm2
- DB 102,15,253,212 ; paddw %xmm4,%xmm2
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
- DB 102,15,103,250 ; packuswb %xmm2,%xmm7
- DB 243,15,112,213,231 ; pshufhw $0xe7,%xmm5,%xmm2
- DB 102,15,112,210,232 ; pshufd $0xe8,%xmm2,%xmm2
- DB 102,15,96,210 ; punpcklbw %xmm2,%xmm2
- DB 242,15,112,210,95 ; pshuflw $0x5f,%xmm2,%xmm2
- DB 243,15,112,210,95 ; pshufhw $0x5f,%xmm2,%xmm2
+ DB 102,65,15,96,248 ; punpcklbw %xmm8,%xmm7
+ DB 102,69,15,104,208 ; punpckhbw %xmm8,%xmm10
+ DB 102,15,111,243 ; movdqa %xmm3,%xmm6
+ DB 102,68,15,111,205 ; movdqa %xmm5,%xmm9
+ DB 102,69,15,96,200 ; punpcklbw %xmm8,%xmm9
+ DB 102,68,15,213,207 ; pmullw %xmm7,%xmm9
+ DB 102,68,15,253,207 ; paddw %xmm7,%xmm9
+ DB 242,68,15,112,227,231 ; pshuflw $0xe7,%xmm3,%xmm12
+ DB 102,15,111,251 ; movdqa %xmm3,%xmm7
+ DB 102,65,15,96,248 ; punpcklbw %xmm8,%xmm7
+ DB 102,65,15,104,240 ; punpckhbw %xmm8,%xmm6
+ DB 102,65,15,104,232 ; punpckhbw %xmm8,%xmm5
+ DB 102,65,15,213,234 ; pmullw %xmm10,%xmm5
+ DB 102,65,15,253,234 ; paddw %xmm10,%xmm5
+ DB 102,68,15,111,212 ; movdqa %xmm4,%xmm10
+ DB 102,69,15,96,208 ; punpcklbw %xmm8,%xmm10
+ DB 102,65,15,104,224 ; punpckhbw %xmm8,%xmm4
+ DB 102,15,213,230 ; pmullw %xmm6,%xmm4
+ DB 102,68,15,213,215 ; pmullw %xmm7,%xmm10
+ DB 102,68,15,253,215 ; paddw %xmm7,%xmm10
+ DB 102,15,253,230 ; paddw %xmm6,%xmm4
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,65,15,113,210,8 ; psrlw $0x8,%xmm10
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,65,15,113,209,8 ; psrlw $0x8,%xmm9
+ DB 102,68,15,103,205 ; packuswb %xmm5,%xmm9
+ DB 102,68,15,103,212 ; packuswb %xmm4,%xmm10
+ DB 243,65,15,112,227,231 ; pshufhw $0xe7,%xmm11,%xmm4
+ DB 102,15,112,228,232 ; pshufd $0xe8,%xmm4,%xmm4
+ DB 102,15,96,228 ; punpcklbw %xmm4,%xmm4
+ DB 242,15,112,228,95 ; pshuflw $0x5f,%xmm4,%xmm4
+ DB 243,15,112,252,95 ; pshufhw $0x5f,%xmm4,%xmm7
+ DB 243,65,15,112,228,231 ; pshufhw $0xe7,%xmm12,%xmm4
+ DB 102,15,112,228,232 ; pshufd $0xe8,%xmm4,%xmm4
+ DB 102,15,96,228 ; punpcklbw %xmm4,%xmm4
+ DB 242,15,112,228,95 ; pshuflw $0x5f,%xmm4,%xmm4
+ DB 243,15,112,244,95 ; pshufhw $0x5f,%xmm4,%xmm6
DB 102,15,118,228 ; pcmpeqd %xmm4,%xmm4
- DB 102,15,239,226 ; pxor %xmm2,%xmm4
- DB 102,15,111,208 ; movdqa %xmm0,%xmm2
- DB 102,15,96,211 ; punpcklbw %xmm3,%xmm2
- DB 102,15,104,195 ; punpckhbw %xmm3,%xmm0
- DB 102,15,111,236 ; movdqa %xmm4,%xmm5
- DB 102,15,96,235 ; punpcklbw %xmm3,%xmm5
- DB 102,15,104,227 ; punpckhbw %xmm3,%xmm4
- DB 102,15,213,224 ; pmullw %xmm0,%xmm4
- DB 102,15,213,234 ; pmullw %xmm2,%xmm5
- DB 102,15,253,213 ; paddw %xmm5,%xmm2
- DB 102,15,253,224 ; paddw %xmm0,%xmm4
+ DB 102,15,239,244 ; pxor %xmm4,%xmm6
+ DB 102,15,239,252 ; pxor %xmm4,%xmm7
+ DB 102,15,111,224 ; movdqa %xmm0,%xmm4
+ DB 102,65,15,96,224 ; punpcklbw %xmm8,%xmm4
+ DB 102,65,15,104,192 ; punpckhbw %xmm8,%xmm0
+ DB 102,15,111,233 ; movdqa %xmm1,%xmm5
+ DB 102,65,15,96,232 ; punpcklbw %xmm8,%xmm5
+ DB 102,65,15,104,200 ; punpckhbw %xmm8,%xmm1
+ DB 102,68,15,111,223 ; movdqa %xmm7,%xmm11
+ DB 102,69,15,96,216 ; punpcklbw %xmm8,%xmm11
+ DB 102,65,15,104,248 ; punpckhbw %xmm8,%xmm7
+ DB 102,68,15,111,230 ; movdqa %xmm6,%xmm12
+ DB 102,69,15,96,224 ; punpcklbw %xmm8,%xmm12
+ DB 102,65,15,104,240 ; punpckhbw %xmm8,%xmm6
+ DB 102,15,213,241 ; pmullw %xmm1,%xmm6
+ DB 102,68,15,213,229 ; pmullw %xmm5,%xmm12
+ DB 102,15,213,248 ; pmullw %xmm0,%xmm7
+ DB 102,68,15,213,220 ; pmullw %xmm4,%xmm11
+ DB 102,65,15,253,227 ; paddw %xmm11,%xmm4
+ DB 102,15,253,248 ; paddw %xmm0,%xmm7
+ DB 102,65,15,253,236 ; paddw %xmm12,%xmm5
+ DB 102,15,253,241 ; paddw %xmm1,%xmm6
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,212 ; packuswb %xmm4,%xmm2
- DB 102,15,252,215 ; paddb %xmm7,%xmm2
+ DB 102,15,103,231 ; packuswb %xmm7,%xmm4
+ DB 102,15,103,238 ; packuswb %xmm6,%xmm5
+ DB 102,65,15,252,225 ; paddb %xmm9,%xmm4
+ DB 102,65,15,252,234 ; paddb %xmm10,%xmm5
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 102,15,111,194 ; movdqa %xmm2,%xmm0
+ DB 102,15,111,196 ; movdqa %xmm4,%xmm0
+ DB 102,15,111,205 ; movdqa %xmm5,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_srcin_sse2_8bit
_sk_srcin_sse2_8bit LABEL PROC
- DB 242,15,112,209,231 ; pshuflw $0xe7,%xmm1,%xmm2
- DB 243,15,112,210,231 ; pshufhw $0xe7,%xmm2,%xmm2
- DB 102,15,112,210,232 ; pshufd $0xe8,%xmm2,%xmm2
- DB 102,15,96,210 ; punpcklbw %xmm2,%xmm2
- DB 242,15,112,210,95 ; pshuflw $0x5f,%xmm2,%xmm2
- DB 243,15,112,218,95 ; pshufhw $0x5f,%xmm2,%xmm3
- DB 102,15,239,228 ; pxor %xmm4,%xmm4
- DB 102,15,111,208 ; movdqa %xmm0,%xmm2
- DB 102,15,96,212 ; punpcklbw %xmm4,%xmm2
- DB 102,15,104,196 ; punpckhbw %xmm4,%xmm0
- DB 102,15,111,235 ; movdqa %xmm3,%xmm5
- DB 102,15,96,236 ; punpcklbw %xmm4,%xmm5
- DB 102,15,104,220 ; punpckhbw %xmm4,%xmm3
- DB 102,15,213,216 ; pmullw %xmm0,%xmm3
- DB 102,15,213,234 ; pmullw %xmm2,%xmm5
- DB 102,15,253,213 ; paddw %xmm5,%xmm2
- DB 102,15,253,216 ; paddw %xmm0,%xmm3
- DB 102,15,113,211,8 ; psrlw $0x8,%xmm3
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,211 ; packuswb %xmm3,%xmm2
+ DB 102,68,15,111,192 ; movdqa %xmm0,%xmm8
+ DB 242,15,112,195,231 ; pshuflw $0xe7,%xmm3,%xmm0
+ DB 243,15,112,192,231 ; pshufhw $0xe7,%xmm0,%xmm0
+ DB 102,15,112,192,232 ; pshufd $0xe8,%xmm0,%xmm0
+ DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
+ DB 242,15,112,192,95 ; pshuflw $0x5f,%xmm0,%xmm0
+ DB 243,15,112,240,95 ; pshufhw $0x5f,%xmm0,%xmm6
+ DB 242,15,112,194,231 ; pshuflw $0xe7,%xmm2,%xmm0
+ DB 243,15,112,192,231 ; pshufhw $0xe7,%xmm0,%xmm0
+ DB 102,15,112,192,232 ; pshufd $0xe8,%xmm0,%xmm0
+ DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
+ DB 242,15,112,192,95 ; pshuflw $0x5f,%xmm0,%xmm0
+ DB 243,15,112,248,95 ; pshufhw $0x5f,%xmm0,%xmm7
+ DB 102,69,15,239,201 ; pxor %xmm9,%xmm9
+ DB 102,65,15,111,192 ; movdqa %xmm8,%xmm0
+ DB 102,65,15,96,193 ; punpcklbw %xmm9,%xmm0
+ DB 102,69,15,104,193 ; punpckhbw %xmm9,%xmm8
+ DB 102,15,111,225 ; movdqa %xmm1,%xmm4
+ DB 102,65,15,96,225 ; punpcklbw %xmm9,%xmm4
+ DB 102,65,15,104,201 ; punpckhbw %xmm9,%xmm1
+ DB 102,15,111,239 ; movdqa %xmm7,%xmm5
+ DB 102,65,15,96,233 ; punpcklbw %xmm9,%xmm5
+ DB 102,65,15,104,249 ; punpckhbw %xmm9,%xmm7
+ DB 102,68,15,111,214 ; movdqa %xmm6,%xmm10
+ DB 102,69,15,96,209 ; punpcklbw %xmm9,%xmm10
+ DB 102,65,15,104,241 ; punpckhbw %xmm9,%xmm6
+ DB 102,15,213,241 ; pmullw %xmm1,%xmm6
+ DB 102,68,15,213,212 ; pmullw %xmm4,%xmm10
+ DB 102,65,15,213,248 ; pmullw %xmm8,%xmm7
+ DB 102,15,213,232 ; pmullw %xmm0,%xmm5
+ DB 102,15,253,197 ; paddw %xmm5,%xmm0
+ DB 102,65,15,253,248 ; paddw %xmm8,%xmm7
+ DB 102,65,15,253,226 ; paddw %xmm10,%xmm4
+ DB 102,15,253,241 ; paddw %xmm1,%xmm6
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
+ DB 102,15,103,199 ; packuswb %xmm7,%xmm0
+ DB 102,15,103,230 ; packuswb %xmm6,%xmm4
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 102,15,111,194 ; movdqa %xmm2,%xmm0
+ DB 102,15,111,204 ; movdqa %xmm4,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_dstin_sse2_8bit
_sk_dstin_sse2_8bit LABEL PROC
+ DB 242,15,112,201,231 ; pshuflw $0xe7,%xmm1,%xmm1
+ DB 243,15,112,201,231 ; pshufhw $0xe7,%xmm1,%xmm1
+ DB 102,15,112,201,232 ; pshufd $0xe8,%xmm1,%xmm1
+ DB 102,15,96,201 ; punpcklbw %xmm1,%xmm1
+ DB 242,15,112,201,95 ; pshuflw $0x5f,%xmm1,%xmm1
+ DB 243,15,112,225,95 ; pshufhw $0x5f,%xmm1,%xmm4
DB 242,15,112,192,231 ; pshuflw $0xe7,%xmm0,%xmm0
DB 243,15,112,192,231 ; pshufhw $0xe7,%xmm0,%xmm0
DB 102,15,112,192,232 ; pshufd $0xe8,%xmm0,%xmm0
DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
DB 242,15,112,192,95 ; pshuflw $0x5f,%xmm0,%xmm0
- DB 243,15,112,208,95 ; pshufhw $0x5f,%xmm0,%xmm2
- DB 102,15,239,219 ; pxor %xmm3,%xmm3
- DB 102,15,111,225 ; movdqa %xmm1,%xmm4
- DB 102,15,111,233 ; movdqa %xmm1,%xmm5
- DB 102,15,96,235 ; punpcklbw %xmm3,%xmm5
- DB 102,15,104,227 ; punpckhbw %xmm3,%xmm4
- DB 102,15,111,194 ; movdqa %xmm2,%xmm0
- DB 102,15,96,195 ; punpcklbw %xmm3,%xmm0
- DB 102,15,104,211 ; punpckhbw %xmm3,%xmm2
- DB 102,15,213,212 ; pmullw %xmm4,%xmm2
- DB 102,15,213,197 ; pmullw %xmm5,%xmm0
- DB 102,15,253,197 ; paddw %xmm5,%xmm0
- DB 102,15,253,212 ; paddw %xmm4,%xmm2
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
+ DB 243,15,112,232,95 ; pshufhw $0x5f,%xmm0,%xmm5
+ DB 102,15,239,246 ; pxor %xmm6,%xmm6
+ DB 102,68,15,111,194 ; movdqa %xmm2,%xmm8
+ DB 102,15,111,250 ; movdqa %xmm2,%xmm7
+ DB 102,15,96,254 ; punpcklbw %xmm6,%xmm7
+ DB 102,68,15,104,198 ; punpckhbw %xmm6,%xmm8
+ DB 102,68,15,111,203 ; movdqa %xmm3,%xmm9
+ DB 102,68,15,111,211 ; movdqa %xmm3,%xmm10
+ DB 102,68,15,96,214 ; punpcklbw %xmm6,%xmm10
+ DB 102,68,15,104,206 ; punpckhbw %xmm6,%xmm9
+ DB 102,15,111,197 ; movdqa %xmm5,%xmm0
+ DB 102,15,96,198 ; punpcklbw %xmm6,%xmm0
+ DB 102,15,104,238 ; punpckhbw %xmm6,%xmm5
+ DB 102,15,111,204 ; movdqa %xmm4,%xmm1
+ DB 102,15,96,206 ; punpcklbw %xmm6,%xmm1
+ DB 102,15,104,230 ; punpckhbw %xmm6,%xmm4
+ DB 102,65,15,213,225 ; pmullw %xmm9,%xmm4
+ DB 102,65,15,213,202 ; pmullw %xmm10,%xmm1
+ DB 102,65,15,213,232 ; pmullw %xmm8,%xmm5
+ DB 102,15,213,199 ; pmullw %xmm7,%xmm0
+ DB 102,15,253,199 ; paddw %xmm7,%xmm0
+ DB 102,65,15,253,232 ; paddw %xmm8,%xmm5
+ DB 102,65,15,253,202 ; paddw %xmm10,%xmm1
+ DB 102,65,15,253,225 ; paddw %xmm9,%xmm4
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
- DB 102,15,103,194 ; packuswb %xmm2,%xmm0
+ DB 102,15,103,197 ; packuswb %xmm5,%xmm0
+ DB 102,15,103,204 ; packuswb %xmm4,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_srcout_sse2_8bit
_sk_srcout_sse2_8bit LABEL PROC
- DB 242,15,112,209,231 ; pshuflw $0xe7,%xmm1,%xmm2
- DB 243,15,112,210,231 ; pshufhw $0xe7,%xmm2,%xmm2
- DB 102,15,112,210,232 ; pshufd $0xe8,%xmm2,%xmm2
- DB 102,15,96,210 ; punpcklbw %xmm2,%xmm2
- DB 242,15,112,210,95 ; pshuflw $0x5f,%xmm2,%xmm2
- DB 243,15,112,210,95 ; pshufhw $0x5f,%xmm2,%xmm2
- DB 102,15,118,219 ; pcmpeqd %xmm3,%xmm3
- DB 102,15,239,218 ; pxor %xmm2,%xmm3
- DB 102,15,239,228 ; pxor %xmm4,%xmm4
- DB 102,15,111,208 ; movdqa %xmm0,%xmm2
- DB 102,15,96,212 ; punpcklbw %xmm4,%xmm2
- DB 102,15,104,196 ; punpckhbw %xmm4,%xmm0
- DB 102,15,111,235 ; movdqa %xmm3,%xmm5
- DB 102,15,96,236 ; punpcklbw %xmm4,%xmm5
- DB 102,15,104,220 ; punpckhbw %xmm4,%xmm3
- DB 102,15,213,216 ; pmullw %xmm0,%xmm3
- DB 102,15,213,234 ; pmullw %xmm2,%xmm5
- DB 102,15,253,213 ; paddw %xmm5,%xmm2
- DB 102,15,253,216 ; paddw %xmm0,%xmm3
- DB 102,15,113,211,8 ; psrlw $0x8,%xmm3
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,211 ; packuswb %xmm3,%xmm2
+ DB 102,68,15,111,192 ; movdqa %xmm0,%xmm8
+ DB 242,15,112,194,231 ; pshuflw $0xe7,%xmm2,%xmm0
+ DB 243,15,112,192,231 ; pshufhw $0xe7,%xmm0,%xmm0
+ DB 102,15,112,192,232 ; pshufd $0xe8,%xmm0,%xmm0
+ DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
+ DB 242,15,112,192,95 ; pshuflw $0x5f,%xmm0,%xmm0
+ DB 243,15,112,240,95 ; pshufhw $0x5f,%xmm0,%xmm6
+ DB 242,15,112,195,231 ; pshuflw $0xe7,%xmm3,%xmm0
+ DB 243,15,112,192,231 ; pshufhw $0xe7,%xmm0,%xmm0
+ DB 102,15,112,192,232 ; pshufd $0xe8,%xmm0,%xmm0
+ DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
+ DB 242,15,112,192,95 ; pshuflw $0x5f,%xmm0,%xmm0
+ DB 243,15,112,248,95 ; pshufhw $0x5f,%xmm0,%xmm7
+ DB 102,15,118,192 ; pcmpeqd %xmm0,%xmm0
+ DB 102,15,239,248 ; pxor %xmm0,%xmm7
+ DB 102,15,239,240 ; pxor %xmm0,%xmm6
+ DB 102,69,15,239,201 ; pxor %xmm9,%xmm9
+ DB 102,65,15,111,192 ; movdqa %xmm8,%xmm0
+ DB 102,65,15,96,193 ; punpcklbw %xmm9,%xmm0
+ DB 102,69,15,104,193 ; punpckhbw %xmm9,%xmm8
+ DB 102,15,111,233 ; movdqa %xmm1,%xmm5
+ DB 102,65,15,96,233 ; punpcklbw %xmm9,%xmm5
+ DB 102,65,15,104,201 ; punpckhbw %xmm9,%xmm1
+ DB 102,15,111,230 ; movdqa %xmm6,%xmm4
+ DB 102,65,15,96,225 ; punpcklbw %xmm9,%xmm4
+ DB 102,65,15,104,241 ; punpckhbw %xmm9,%xmm6
+ DB 102,68,15,111,215 ; movdqa %xmm7,%xmm10
+ DB 102,69,15,96,209 ; punpcklbw %xmm9,%xmm10
+ DB 102,65,15,104,249 ; punpckhbw %xmm9,%xmm7
+ DB 102,15,213,249 ; pmullw %xmm1,%xmm7
+ DB 102,68,15,213,213 ; pmullw %xmm5,%xmm10
+ DB 102,65,15,213,240 ; pmullw %xmm8,%xmm6
+ DB 102,15,213,224 ; pmullw %xmm0,%xmm4
+ DB 102,15,253,196 ; paddw %xmm4,%xmm0
+ DB 102,65,15,253,240 ; paddw %xmm8,%xmm6
+ DB 102,65,15,253,234 ; paddw %xmm10,%xmm5
+ DB 102,15,253,249 ; paddw %xmm1,%xmm7
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
+ DB 102,15,103,198 ; packuswb %xmm6,%xmm0
+ DB 102,15,103,239 ; packuswb %xmm7,%xmm5
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 102,15,111,194 ; movdqa %xmm2,%xmm0
+ DB 102,15,111,205 ; movdqa %xmm5,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_dstout_sse2_8bit
@@ -42778,242 +45719,438 @@ _sk_dstout_sse2_8bit LABEL PROC
DB 102,15,112,192,232 ; pshufd $0xe8,%xmm0,%xmm0
DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
DB 242,15,112,192,95 ; pshuflw $0x5f,%xmm0,%xmm0
- DB 243,15,112,192,95 ; pshufhw $0x5f,%xmm0,%xmm0
- DB 102,15,118,210 ; pcmpeqd %xmm2,%xmm2
- DB 102,15,239,208 ; pxor %xmm0,%xmm2
- DB 102,15,239,219 ; pxor %xmm3,%xmm3
- DB 102,15,111,225 ; movdqa %xmm1,%xmm4
- DB 102,15,111,233 ; movdqa %xmm1,%xmm5
- DB 102,15,96,235 ; punpcklbw %xmm3,%xmm5
- DB 102,15,104,227 ; punpckhbw %xmm3,%xmm4
- DB 102,15,111,194 ; movdqa %xmm2,%xmm0
- DB 102,15,96,195 ; punpcklbw %xmm3,%xmm0
- DB 102,15,104,211 ; punpckhbw %xmm3,%xmm2
- DB 102,15,213,212 ; pmullw %xmm4,%xmm2
- DB 102,15,213,197 ; pmullw %xmm5,%xmm0
- DB 102,15,253,197 ; paddw %xmm5,%xmm0
- DB 102,15,253,212 ; paddw %xmm4,%xmm2
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
+ DB 243,15,112,224,95 ; pshufhw $0x5f,%xmm0,%xmm4
+ DB 242,15,112,193,231 ; pshuflw $0xe7,%xmm1,%xmm0
+ DB 243,15,112,192,231 ; pshufhw $0xe7,%xmm0,%xmm0
+ DB 102,15,112,192,232 ; pshufd $0xe8,%xmm0,%xmm0
+ DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
+ DB 242,15,112,192,95 ; pshuflw $0x5f,%xmm0,%xmm0
+ DB 243,15,112,232,95 ; pshufhw $0x5f,%xmm0,%xmm5
+ DB 102,15,118,192 ; pcmpeqd %xmm0,%xmm0
+ DB 102,15,239,232 ; pxor %xmm0,%xmm5
+ DB 102,15,239,224 ; pxor %xmm0,%xmm4
+ DB 102,15,239,246 ; pxor %xmm6,%xmm6
+ DB 102,68,15,111,194 ; movdqa %xmm2,%xmm8
+ DB 102,15,111,250 ; movdqa %xmm2,%xmm7
+ DB 102,15,96,254 ; punpcklbw %xmm6,%xmm7
+ DB 102,68,15,104,198 ; punpckhbw %xmm6,%xmm8
+ DB 102,68,15,111,203 ; movdqa %xmm3,%xmm9
+ DB 102,68,15,111,211 ; movdqa %xmm3,%xmm10
+ DB 102,68,15,96,214 ; punpcklbw %xmm6,%xmm10
+ DB 102,68,15,104,206 ; punpckhbw %xmm6,%xmm9
+ DB 102,15,111,196 ; movdqa %xmm4,%xmm0
+ DB 102,15,96,198 ; punpcklbw %xmm6,%xmm0
+ DB 102,15,104,230 ; punpckhbw %xmm6,%xmm4
+ DB 102,15,111,205 ; movdqa %xmm5,%xmm1
+ DB 102,15,96,206 ; punpcklbw %xmm6,%xmm1
+ DB 102,15,104,238 ; punpckhbw %xmm6,%xmm5
+ DB 102,65,15,213,233 ; pmullw %xmm9,%xmm5
+ DB 102,65,15,213,202 ; pmullw %xmm10,%xmm1
+ DB 102,65,15,213,224 ; pmullw %xmm8,%xmm4
+ DB 102,15,213,199 ; pmullw %xmm7,%xmm0
+ DB 102,15,253,199 ; paddw %xmm7,%xmm0
+ DB 102,65,15,253,224 ; paddw %xmm8,%xmm4
+ DB 102,65,15,253,202 ; paddw %xmm10,%xmm1
+ DB 102,65,15,253,233 ; paddw %xmm9,%xmm5
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
- DB 102,15,103,194 ; packuswb %xmm2,%xmm0
+ DB 102,15,103,196 ; packuswb %xmm4,%xmm0
+ DB 102,15,103,205 ; packuswb %xmm5,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_srcover_sse2_8bit
_sk_srcover_sse2_8bit LABEL PROC
- DB 242,15,112,208,231 ; pshuflw $0xe7,%xmm0,%xmm2
- DB 243,15,112,210,231 ; pshufhw $0xe7,%xmm2,%xmm2
- DB 102,15,112,210,232 ; pshufd $0xe8,%xmm2,%xmm2
- DB 102,15,96,210 ; punpcklbw %xmm2,%xmm2
- DB 242,15,112,210,95 ; pshuflw $0x5f,%xmm2,%xmm2
- DB 243,15,112,210,95 ; pshufhw $0x5f,%xmm2,%xmm2
- DB 102,15,239,219 ; pxor %xmm3,%xmm3
- DB 102,15,111,225 ; movdqa %xmm1,%xmm4
- DB 102,15,252,193 ; paddb %xmm1,%xmm0
- DB 102,15,111,233 ; movdqa %xmm1,%xmm5
- DB 102,15,96,235 ; punpcklbw %xmm3,%xmm5
- DB 102,15,104,227 ; punpckhbw %xmm3,%xmm4
+ DB 242,15,112,225,231 ; pshuflw $0xe7,%xmm1,%xmm4
+ DB 243,15,112,228,231 ; pshufhw $0xe7,%xmm4,%xmm4
+ DB 102,15,112,228,232 ; pshufd $0xe8,%xmm4,%xmm4
+ DB 102,15,96,228 ; punpcklbw %xmm4,%xmm4
+ DB 242,15,112,228,95 ; pshuflw $0x5f,%xmm4,%xmm4
+ DB 243,68,15,112,212,95 ; pshufhw $0x5f,%xmm4,%xmm10
+ DB 242,15,112,232,231 ; pshuflw $0xe7,%xmm0,%xmm5
+ DB 243,15,112,237,231 ; pshufhw $0xe7,%xmm5,%xmm5
+ DB 102,15,112,237,232 ; pshufd $0xe8,%xmm5,%xmm5
+ DB 102,15,96,237 ; punpcklbw %xmm5,%xmm5
+ DB 242,15,112,237,95 ; pshuflw $0x5f,%xmm5,%xmm5
+ DB 243,68,15,112,221,95 ; pshufhw $0x5f,%xmm5,%xmm11
+ DB 102,69,15,239,192 ; pxor %xmm8,%xmm8
+ DB 102,68,15,111,202 ; movdqa %xmm2,%xmm9
+ DB 102,15,252,194 ; paddb %xmm2,%xmm0
DB 102,15,111,242 ; movdqa %xmm2,%xmm6
- DB 102,15,96,243 ; punpcklbw %xmm3,%xmm6
- DB 102,15,104,211 ; punpckhbw %xmm3,%xmm2
- DB 102,15,213,212 ; pmullw %xmm4,%xmm2
- DB 102,15,213,245 ; pmullw %xmm5,%xmm6
- DB 102,15,253,245 ; paddw %xmm5,%xmm6
- DB 102,15,253,212 ; paddw %xmm4,%xmm2
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
- DB 102,15,103,242 ; packuswb %xmm2,%xmm6
- DB 102,15,248,198 ; psubb %xmm6,%xmm0
+ DB 102,65,15,96,240 ; punpcklbw %xmm8,%xmm6
+ DB 102,69,15,104,200 ; punpckhbw %xmm8,%xmm9
+ DB 102,68,15,111,227 ; movdqa %xmm3,%xmm12
+ DB 102,15,252,203 ; paddb %xmm3,%xmm1
+ DB 102,15,111,227 ; movdqa %xmm3,%xmm4
+ DB 102,65,15,96,224 ; punpcklbw %xmm8,%xmm4
+ DB 102,69,15,104,224 ; punpckhbw %xmm8,%xmm12
+ DB 102,65,15,111,235 ; movdqa %xmm11,%xmm5
+ DB 102,65,15,96,232 ; punpcklbw %xmm8,%xmm5
+ DB 102,69,15,104,216 ; punpckhbw %xmm8,%xmm11
+ DB 102,65,15,111,250 ; movdqa %xmm10,%xmm7
+ DB 102,65,15,96,248 ; punpcklbw %xmm8,%xmm7
+ DB 102,69,15,104,208 ; punpckhbw %xmm8,%xmm10
+ DB 102,69,15,213,212 ; pmullw %xmm12,%xmm10
+ DB 102,15,213,252 ; pmullw %xmm4,%xmm7
+ DB 102,69,15,213,217 ; pmullw %xmm9,%xmm11
+ DB 102,15,213,238 ; pmullw %xmm6,%xmm5
+ DB 102,15,253,238 ; paddw %xmm6,%xmm5
+ DB 102,69,15,253,217 ; paddw %xmm9,%xmm11
+ DB 102,15,253,252 ; paddw %xmm4,%xmm7
+ DB 102,69,15,253,212 ; paddw %xmm12,%xmm10
+ DB 102,65,15,113,210,8 ; psrlw $0x8,%xmm10
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,65,15,113,211,8 ; psrlw $0x8,%xmm11
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,65,15,103,235 ; packuswb %xmm11,%xmm5
+ DB 102,65,15,103,250 ; packuswb %xmm10,%xmm7
+ DB 102,15,248,197 ; psubb %xmm5,%xmm0
+ DB 102,15,248,207 ; psubb %xmm7,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_dstover_sse2_8bit
_sk_dstover_sse2_8bit LABEL PROC
- DB 102,15,111,208 ; movdqa %xmm0,%xmm2
- DB 242,15,112,193,231 ; pshuflw $0xe7,%xmm1,%xmm0
- DB 243,15,112,192,231 ; pshufhw $0xe7,%xmm0,%xmm0
- DB 102,15,112,192,232 ; pshufd $0xe8,%xmm0,%xmm0
- DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
- DB 242,15,112,192,95 ; pshuflw $0x5f,%xmm0,%xmm0
- DB 243,15,112,216,95 ; pshufhw $0x5f,%xmm0,%xmm3
- DB 102,15,239,228 ; pxor %xmm4,%xmm4
- DB 102,15,111,234 ; movdqa %xmm2,%xmm5
- DB 102,15,96,236 ; punpcklbw %xmm4,%xmm5
- DB 102,15,111,193 ; movdqa %xmm1,%xmm0
- DB 102,15,252,194 ; paddb %xmm2,%xmm0
- DB 102,15,104,212 ; punpckhbw %xmm4,%xmm2
- DB 102,15,111,243 ; movdqa %xmm3,%xmm6
- DB 102,15,96,244 ; punpcklbw %xmm4,%xmm6
- DB 102,15,104,220 ; punpckhbw %xmm4,%xmm3
- DB 102,15,213,218 ; pmullw %xmm2,%xmm3
- DB 102,15,213,245 ; pmullw %xmm5,%xmm6
- DB 102,15,253,245 ; paddw %xmm5,%xmm6
- DB 102,15,253,218 ; paddw %xmm2,%xmm3
- DB 102,15,113,211,8 ; psrlw $0x8,%xmm3
+ DB 242,15,112,227,231 ; pshuflw $0xe7,%xmm3,%xmm4
+ DB 243,15,112,228,231 ; pshufhw $0xe7,%xmm4,%xmm4
+ DB 102,15,112,228,232 ; pshufd $0xe8,%xmm4,%xmm4
+ DB 102,15,96,228 ; punpcklbw %xmm4,%xmm4
+ DB 242,15,112,228,95 ; pshuflw $0x5f,%xmm4,%xmm4
+ DB 243,15,112,228,95 ; pshufhw $0x5f,%xmm4,%xmm4
+ DB 242,15,112,234,231 ; pshuflw $0xe7,%xmm2,%xmm5
+ DB 243,15,112,237,231 ; pshufhw $0xe7,%xmm5,%xmm5
+ DB 102,15,112,237,232 ; pshufd $0xe8,%xmm5,%xmm5
+ DB 102,15,96,237 ; punpcklbw %xmm5,%xmm5
+ DB 242,15,112,237,95 ; pshuflw $0x5f,%xmm5,%xmm5
+ DB 243,15,112,237,95 ; pshufhw $0x5f,%xmm5,%xmm5
+ DB 102,69,15,239,192 ; pxor %xmm8,%xmm8
+ DB 102,68,15,111,200 ; movdqa %xmm0,%xmm9
+ DB 102,69,15,96,200 ; punpcklbw %xmm8,%xmm9
+ DB 102,68,15,111,208 ; movdqa %xmm0,%xmm10
+ DB 102,69,15,104,208 ; punpckhbw %xmm8,%xmm10
+ DB 102,68,15,111,217 ; movdqa %xmm1,%xmm11
+ DB 102,69,15,96,216 ; punpcklbw %xmm8,%xmm11
+ DB 102,68,15,111,225 ; movdqa %xmm1,%xmm12
+ DB 102,69,15,104,224 ; punpckhbw %xmm8,%xmm12
+ DB 102,15,111,245 ; movdqa %xmm5,%xmm6
+ DB 102,65,15,96,240 ; punpcklbw %xmm8,%xmm6
+ DB 102,65,15,104,232 ; punpckhbw %xmm8,%xmm5
+ DB 102,15,111,252 ; movdqa %xmm4,%xmm7
+ DB 102,65,15,96,248 ; punpcklbw %xmm8,%xmm7
+ DB 102,65,15,104,224 ; punpckhbw %xmm8,%xmm4
+ DB 102,65,15,213,228 ; pmullw %xmm12,%xmm4
+ DB 102,65,15,213,251 ; pmullw %xmm11,%xmm7
+ DB 102,65,15,213,234 ; pmullw %xmm10,%xmm5
+ DB 102,65,15,213,241 ; pmullw %xmm9,%xmm6
+ DB 102,65,15,253,241 ; paddw %xmm9,%xmm6
+ DB 102,65,15,253,234 ; paddw %xmm10,%xmm5
+ DB 102,65,15,253,251 ; paddw %xmm11,%xmm7
+ DB 102,65,15,253,228 ; paddw %xmm12,%xmm4
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
- DB 102,15,103,243 ; packuswb %xmm3,%xmm6
+ DB 102,15,103,245 ; packuswb %xmm5,%xmm6
+ DB 102,15,103,252 ; packuswb %xmm4,%xmm7
+ DB 102,15,252,203 ; paddb %xmm3,%xmm1
+ DB 102,15,252,194 ; paddb %xmm2,%xmm0
DB 102,15,248,198 ; psubb %xmm6,%xmm0
+ DB 102,15,248,207 ; psubb %xmm7,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_modulate_sse2_8bit
_sk_modulate_sse2_8bit LABEL PROC
- DB 102,15,239,219 ; pxor %xmm3,%xmm3
- DB 102,15,111,208 ; movdqa %xmm0,%xmm2
- DB 102,15,96,211 ; punpcklbw %xmm3,%xmm2
- DB 102,15,104,195 ; punpckhbw %xmm3,%xmm0
- DB 102,15,111,225 ; movdqa %xmm1,%xmm4
- DB 102,15,111,233 ; movdqa %xmm1,%xmm5
- DB 102,15,96,235 ; punpcklbw %xmm3,%xmm5
- DB 102,15,104,227 ; punpckhbw %xmm3,%xmm4
+ DB 102,68,15,111,193 ; movdqa %xmm1,%xmm8
+ DB 102,68,15,111,200 ; movdqa %xmm0,%xmm9
+ DB 102,69,15,239,210 ; pxor %xmm10,%xmm10
+ DB 102,65,15,96,194 ; punpcklbw %xmm10,%xmm0
+ DB 102,69,15,104,202 ; punpckhbw %xmm10,%xmm9
+ DB 102,65,15,96,202 ; punpcklbw %xmm10,%xmm1
+ DB 102,69,15,104,194 ; punpckhbw %xmm10,%xmm8
+ DB 102,15,111,250 ; movdqa %xmm2,%xmm7
+ DB 102,15,111,226 ; movdqa %xmm2,%xmm4
+ DB 102,65,15,96,226 ; punpcklbw %xmm10,%xmm4
+ DB 102,65,15,104,250 ; punpckhbw %xmm10,%xmm7
+ DB 102,15,111,235 ; movdqa %xmm3,%xmm5
+ DB 102,15,111,243 ; movdqa %xmm3,%xmm6
+ DB 102,65,15,96,242 ; punpcklbw %xmm10,%xmm6
+ DB 102,65,15,104,234 ; punpckhbw %xmm10,%xmm5
+ DB 102,65,15,213,232 ; pmullw %xmm8,%xmm5
+ DB 102,15,213,241 ; pmullw %xmm1,%xmm6
+ DB 102,65,15,213,249 ; pmullw %xmm9,%xmm7
DB 102,15,213,224 ; pmullw %xmm0,%xmm4
- DB 102,15,213,234 ; pmullw %xmm2,%xmm5
- DB 102,15,253,213 ; paddw %xmm5,%xmm2
- DB 102,15,253,224 ; paddw %xmm0,%xmm4
- DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,212 ; packuswb %xmm4,%xmm2
+ DB 102,15,253,196 ; paddw %xmm4,%xmm0
+ DB 102,65,15,253,249 ; paddw %xmm9,%xmm7
+ DB 102,15,253,206 ; paddw %xmm6,%xmm1
+ DB 102,65,15,253,232 ; paddw %xmm8,%xmm5
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
+ DB 102,15,103,199 ; packuswb %xmm7,%xmm0
+ DB 102,15,103,205 ; packuswb %xmm5,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 102,15,111,194 ; movdqa %xmm2,%xmm0
DB 255,224 ; jmpq *%rax
PUBLIC _sk_multiply_sse2_8bit
_sk_multiply_sse2_8bit LABEL PROC
- DB 242,15,112,209,231 ; pshuflw $0xe7,%xmm1,%xmm2
+ DB 72,131,236,24 ; sub $0x18,%rsp
+ DB 102,68,15,111,243 ; movdqa %xmm3,%xmm14
+ DB 102,15,111,218 ; movdqa %xmm2,%xmm3
+ DB 242,15,112,211,231 ; pshuflw $0xe7,%xmm3,%xmm2
DB 243,15,112,210,231 ; pshufhw $0xe7,%xmm2,%xmm2
DB 102,15,112,210,232 ; pshufd $0xe8,%xmm2,%xmm2
DB 102,15,96,210 ; punpcklbw %xmm2,%xmm2
DB 242,15,112,210,95 ; pshuflw $0x5f,%xmm2,%xmm2
- DB 243,15,112,242,95 ; pshufhw $0x5f,%xmm2,%xmm6
- DB 102,15,118,237 ; pcmpeqd %xmm5,%xmm5
- DB 102,15,239,245 ; pxor %xmm5,%xmm6
- DB 102,69,15,239,192 ; pxor %xmm8,%xmm8
- DB 102,15,111,216 ; movdqa %xmm0,%xmm3
- DB 102,65,15,96,216 ; punpcklbw %xmm8,%xmm3
- DB 242,15,112,248,231 ; pshuflw $0xe7,%xmm0,%xmm7
- DB 102,65,15,104,192 ; punpckhbw %xmm8,%xmm0
- DB 102,15,111,230 ; movdqa %xmm6,%xmm4
- DB 102,65,15,96,224 ; punpcklbw %xmm8,%xmm4
- DB 102,65,15,104,240 ; punpckhbw %xmm8,%xmm6
- DB 102,15,213,240 ; pmullw %xmm0,%xmm6
- DB 102,15,213,227 ; pmullw %xmm3,%xmm4
- DB 102,15,253,227 ; paddw %xmm3,%xmm4
- DB 102,15,253,240 ; paddw %xmm0,%xmm6
- DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 243,15,112,250,95 ; pshufhw $0x5f,%xmm2,%xmm7
+ DB 242,65,15,112,214,231 ; pshuflw $0xe7,%xmm14,%xmm2
+ DB 243,15,112,210,231 ; pshufhw $0xe7,%xmm2,%xmm2
+ DB 102,15,112,210,232 ; pshufd $0xe8,%xmm2,%xmm2
+ DB 102,15,96,210 ; punpcklbw %xmm2,%xmm2
+ DB 242,15,112,210,95 ; pshuflw $0x5f,%xmm2,%xmm2
+ DB 243,15,112,226,95 ; pshufhw $0x5f,%xmm2,%xmm4
+ DB 102,69,15,118,192 ; pcmpeqd %xmm8,%xmm8
+ DB 102,65,15,239,224 ; pxor %xmm8,%xmm4
+ DB 102,65,15,239,248 ; pxor %xmm8,%xmm7
+ DB 102,69,15,239,237 ; pxor %xmm13,%xmm13
+ DB 102,68,15,111,200 ; movdqa %xmm0,%xmm9
+ DB 242,15,112,208,231 ; pshuflw $0xe7,%xmm0,%xmm2
+ DB 102,68,15,111,216 ; movdqa %xmm0,%xmm11
+ DB 102,69,15,96,221 ; punpcklbw %xmm13,%xmm11
+ DB 102,69,15,104,205 ; punpckhbw %xmm13,%xmm9
+ DB 102,68,15,111,209 ; movdqa %xmm1,%xmm10
+ DB 242,15,112,241,231 ; pshuflw $0xe7,%xmm1,%xmm6
+ DB 102,68,15,111,225 ; movdqa %xmm1,%xmm12
+ DB 102,69,15,96,229 ; punpcklbw %xmm13,%xmm12
+ DB 102,69,15,104,213 ; punpckhbw %xmm13,%xmm10
+ DB 102,68,15,111,255 ; movdqa %xmm7,%xmm15
+ DB 102,69,15,96,253 ; punpcklbw %xmm13,%xmm15
+ DB 102,65,15,104,253 ; punpckhbw %xmm13,%xmm7
+ DB 102,15,111,236 ; movdqa %xmm4,%xmm5
+ DB 102,65,15,96,237 ; punpcklbw %xmm13,%xmm5
+ DB 102,65,15,104,229 ; punpckhbw %xmm13,%xmm4
+ DB 102,65,15,213,226 ; pmullw %xmm10,%xmm4
+ DB 102,65,15,213,236 ; pmullw %xmm12,%xmm5
+ DB 102,65,15,213,249 ; pmullw %xmm9,%xmm7
+ DB 102,69,15,213,251 ; pmullw %xmm11,%xmm15
+ DB 102,69,15,253,251 ; paddw %xmm11,%xmm15
+ DB 102,65,15,253,249 ; paddw %xmm9,%xmm7
+ DB 102,65,15,253,236 ; paddw %xmm12,%xmm5
+ DB 102,65,15,253,226 ; paddw %xmm10,%xmm4
DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
- DB 102,15,103,230 ; packuswb %xmm6,%xmm4
- DB 243,15,112,247,231 ; pshufhw $0xe7,%xmm7,%xmm6
- DB 102,15,112,246,232 ; pshufd $0xe8,%xmm6,%xmm6
- DB 102,15,96,246 ; punpcklbw %xmm6,%xmm6
- DB 242,15,112,246,95 ; pshuflw $0x5f,%xmm6,%xmm6
- DB 243,15,112,214,95 ; pshufhw $0x5f,%xmm6,%xmm2
- DB 102,15,239,213 ; pxor %xmm5,%xmm2
- DB 102,15,111,249 ; movdqa %xmm1,%xmm7
- DB 102,15,111,241 ; movdqa %xmm1,%xmm6
- DB 102,65,15,96,240 ; punpcklbw %xmm8,%xmm6
- DB 102,65,15,104,248 ; punpckhbw %xmm8,%xmm7
- DB 102,15,111,234 ; movdqa %xmm2,%xmm5
- DB 102,65,15,96,232 ; punpcklbw %xmm8,%xmm5
- DB 102,65,15,104,208 ; punpckhbw %xmm8,%xmm2
- DB 102,15,213,215 ; pmullw %xmm7,%xmm2
- DB 102,15,213,238 ; pmullw %xmm6,%xmm5
- DB 102,15,253,238 ; paddw %xmm6,%xmm5
- DB 102,15,253,215 ; paddw %xmm7,%xmm2
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
- DB 102,15,103,234 ; packuswb %xmm2,%xmm5
- DB 102,15,252,236 ; paddb %xmm4,%xmm5
- DB 102,15,213,248 ; pmullw %xmm0,%xmm7
- DB 102,15,213,243 ; pmullw %xmm3,%xmm6
- DB 102,15,253,243 ; paddw %xmm3,%xmm6
- DB 102,15,253,248 ; paddw %xmm0,%xmm7
DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,65,15,113,215,8 ; psrlw $0x8,%xmm15
+ DB 102,68,15,103,255 ; packuswb %xmm7,%xmm15
+ DB 102,15,103,236 ; packuswb %xmm4,%xmm5
+ DB 243,15,112,194,231 ; pshufhw $0xe7,%xmm2,%xmm0
+ DB 102,15,112,192,232 ; pshufd $0xe8,%xmm0,%xmm0
+ DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
+ DB 242,15,112,192,95 ; pshuflw $0x5f,%xmm0,%xmm0
+ DB 243,15,112,192,95 ; pshufhw $0x5f,%xmm0,%xmm0
+ DB 243,15,112,206,231 ; pshufhw $0xe7,%xmm6,%xmm1
+ DB 102,15,112,201,232 ; pshufd $0xe8,%xmm1,%xmm1
+ DB 102,15,96,201 ; punpcklbw %xmm1,%xmm1
+ DB 242,15,112,201,95 ; pshuflw $0x5f,%xmm1,%xmm1
+ DB 243,15,112,241,95 ; pshufhw $0x5f,%xmm1,%xmm6
+ DB 102,65,15,239,240 ; pxor %xmm8,%xmm6
+ DB 102,65,15,239,192 ; pxor %xmm8,%xmm0
+ DB 102,68,15,111,195 ; movdqa %xmm3,%xmm8
+ DB 102,15,111,211 ; movdqa %xmm3,%xmm2
+ DB 102,65,15,96,213 ; punpcklbw %xmm13,%xmm2
+ DB 102,69,15,104,197 ; punpckhbw %xmm13,%xmm8
+ DB 102,65,15,111,206 ; movdqa %xmm14,%xmm1
+ DB 102,15,127,12,36 ; movdqa %xmm1,(%rsp)
+ DB 102,15,111,249 ; movdqa %xmm1,%xmm7
+ DB 102,65,15,96,253 ; punpcklbw %xmm13,%xmm7
+ DB 102,69,15,104,245 ; punpckhbw %xmm13,%xmm14
+ DB 102,15,111,224 ; movdqa %xmm0,%xmm4
+ DB 102,65,15,96,229 ; punpcklbw %xmm13,%xmm4
+ DB 102,65,15,104,197 ; punpckhbw %xmm13,%xmm0
+ DB 102,15,111,206 ; movdqa %xmm6,%xmm1
+ DB 102,65,15,96,205 ; punpcklbw %xmm13,%xmm1
+ DB 102,65,15,104,245 ; punpckhbw %xmm13,%xmm6
+ DB 102,65,15,213,192 ; pmullw %xmm8,%xmm0
+ DB 102,15,213,226 ; pmullw %xmm2,%xmm4
+ DB 102,15,253,226 ; paddw %xmm2,%xmm4
+ DB 102,65,15,253,192 ; paddw %xmm8,%xmm0
+ DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
+ DB 102,15,103,224 ; packuswb %xmm0,%xmm4
+ DB 102,65,15,213,246 ; pmullw %xmm14,%xmm6
+ DB 102,15,213,207 ; pmullw %xmm7,%xmm1
+ DB 102,15,253,207 ; paddw %xmm7,%xmm1
+ DB 102,65,15,253,246 ; paddw %xmm14,%xmm6
DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
- DB 102,15,103,247 ; packuswb %xmm7,%xmm6
- DB 102,15,252,238 ; paddb %xmm6,%xmm5
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
+ DB 102,15,103,206 ; packuswb %xmm6,%xmm1
+ DB 102,15,252,205 ; paddb %xmm5,%xmm1
+ DB 102,65,15,252,231 ; paddb %xmm15,%xmm4
+ DB 102,65,15,213,211 ; pmullw %xmm11,%xmm2
+ DB 102,65,15,253,211 ; paddw %xmm11,%xmm2
+ DB 102,69,15,213,193 ; pmullw %xmm9,%xmm8
+ DB 102,69,15,253,193 ; paddw %xmm9,%xmm8
+ DB 102,65,15,213,252 ; pmullw %xmm12,%xmm7
+ DB 102,65,15,253,252 ; paddw %xmm12,%xmm7
+ DB 102,69,15,213,242 ; pmullw %xmm10,%xmm14
+ DB 102,69,15,253,242 ; paddw %xmm10,%xmm14
+ DB 102,65,15,113,208,8 ; psrlw $0x8,%xmm8
+ DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
+ DB 102,65,15,103,208 ; packuswb %xmm8,%xmm2
+ DB 102,65,15,113,214,8 ; psrlw $0x8,%xmm14
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,65,15,103,254 ; packuswb %xmm14,%xmm7
+ DB 102,15,252,226 ; paddb %xmm2,%xmm4
+ DB 102,15,252,207 ; paddb %xmm7,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 102,15,111,197 ; movdqa %xmm5,%xmm0
+ DB 102,15,111,211 ; movdqa %xmm3,%xmm2
+ DB 15,40,28,36 ; movaps (%rsp),%xmm3
+ DB 102,15,111,196 ; movdqa %xmm4,%xmm0
+ DB 72,131,196,24 ; add $0x18,%rsp
DB 255,224 ; jmpq *%rax
PUBLIC _sk_screen_sse2_8bit
_sk_screen_sse2_8bit LABEL PROC
- DB 102,15,118,210 ; pcmpeqd %xmm2,%xmm2
- DB 102,15,239,208 ; pxor %xmm0,%xmm2
- DB 102,15,239,219 ; pxor %xmm3,%xmm3
- DB 102,15,111,226 ; movdqa %xmm2,%xmm4
- DB 102,15,96,227 ; punpcklbw %xmm3,%xmm4
- DB 102,15,104,211 ; punpckhbw %xmm3,%xmm2
- DB 102,15,111,233 ; movdqa %xmm1,%xmm5
- DB 102,15,111,241 ; movdqa %xmm1,%xmm6
- DB 102,15,96,243 ; punpcklbw %xmm3,%xmm6
- DB 102,15,104,235 ; punpckhbw %xmm3,%xmm5
- DB 102,15,213,234 ; pmullw %xmm2,%xmm5
- DB 102,15,213,244 ; pmullw %xmm4,%xmm6
- DB 102,15,253,230 ; paddw %xmm6,%xmm4
- DB 102,15,253,234 ; paddw %xmm2,%xmm5
- DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,69,15,118,219 ; pcmpeqd %xmm11,%xmm11
+ DB 102,68,15,111,201 ; movdqa %xmm1,%xmm9
+ DB 102,69,15,239,203 ; pxor %xmm11,%xmm9
+ DB 102,68,15,239,216 ; pxor %xmm0,%xmm11
+ DB 102,69,15,239,192 ; pxor %xmm8,%xmm8
+ DB 102,65,15,111,235 ; movdqa %xmm11,%xmm5
+ DB 102,65,15,96,232 ; punpcklbw %xmm8,%xmm5
+ DB 102,69,15,104,216 ; punpckhbw %xmm8,%xmm11
+ DB 102,65,15,111,225 ; movdqa %xmm9,%xmm4
+ DB 102,65,15,96,224 ; punpcklbw %xmm8,%xmm4
+ DB 102,69,15,104,200 ; punpckhbw %xmm8,%xmm9
+ DB 102,15,111,242 ; movdqa %xmm2,%xmm6
+ DB 102,68,15,111,210 ; movdqa %xmm2,%xmm10
+ DB 102,69,15,96,208 ; punpcklbw %xmm8,%xmm10
+ DB 102,65,15,104,240 ; punpckhbw %xmm8,%xmm6
+ DB 102,15,111,251 ; movdqa %xmm3,%xmm7
+ DB 102,68,15,111,227 ; movdqa %xmm3,%xmm12
+ DB 102,69,15,96,224 ; punpcklbw %xmm8,%xmm12
+ DB 102,65,15,104,248 ; punpckhbw %xmm8,%xmm7
+ DB 102,65,15,213,249 ; pmullw %xmm9,%xmm7
+ DB 102,68,15,213,228 ; pmullw %xmm4,%xmm12
+ DB 102,65,15,213,243 ; pmullw %xmm11,%xmm6
+ DB 102,68,15,213,213 ; pmullw %xmm5,%xmm10
+ DB 102,65,15,253,234 ; paddw %xmm10,%xmm5
+ DB 102,65,15,253,243 ; paddw %xmm11,%xmm6
+ DB 102,65,15,253,228 ; paddw %xmm12,%xmm4
+ DB 102,65,15,253,249 ; paddw %xmm9,%xmm7
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
- DB 102,15,103,229 ; packuswb %xmm5,%xmm4
- DB 102,15,252,196 ; paddb %xmm4,%xmm0
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,103,238 ; packuswb %xmm6,%xmm5
+ DB 102,15,103,231 ; packuswb %xmm7,%xmm4
+ DB 102,15,252,197 ; paddb %xmm5,%xmm0
+ DB 102,15,252,225 ; paddb %xmm1,%xmm4
DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 102,15,111,204 ; movdqa %xmm4,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_xor__sse2_8bit
_sk_xor__sse2_8bit LABEL PROC
- DB 242,15,112,209,231 ; pshuflw $0xe7,%xmm1,%xmm2
- DB 243,15,112,210,231 ; pshufhw $0xe7,%xmm2,%xmm2
- DB 102,15,112,210,232 ; pshufd $0xe8,%xmm2,%xmm2
- DB 102,15,96,210 ; punpcklbw %xmm2,%xmm2
- DB 242,15,112,210,95 ; pshuflw $0x5f,%xmm2,%xmm2
- DB 243,15,112,234,95 ; pshufhw $0x5f,%xmm2,%xmm5
- DB 102,15,118,228 ; pcmpeqd %xmm4,%xmm4
- DB 102,15,239,236 ; pxor %xmm4,%xmm5
- DB 102,15,239,219 ; pxor %xmm3,%xmm3
- DB 102,15,111,240 ; movdqa %xmm0,%xmm6
- DB 102,15,96,243 ; punpcklbw %xmm3,%xmm6
- DB 242,15,112,248,231 ; pshuflw $0xe7,%xmm0,%xmm7
- DB 102,15,104,195 ; punpckhbw %xmm3,%xmm0
- DB 102,15,111,213 ; movdqa %xmm5,%xmm2
- DB 102,15,96,211 ; punpcklbw %xmm3,%xmm2
- DB 102,15,104,235 ; punpckhbw %xmm3,%xmm5
- DB 102,15,213,232 ; pmullw %xmm0,%xmm5
- DB 102,15,213,214 ; pmullw %xmm6,%xmm2
- DB 102,15,253,214 ; paddw %xmm6,%xmm2
- DB 102,15,253,232 ; paddw %xmm0,%xmm5
+ DB 242,15,112,226,231 ; pshuflw $0xe7,%xmm2,%xmm4
+ DB 243,15,112,228,231 ; pshufhw $0xe7,%xmm4,%xmm4
+ DB 102,15,112,228,232 ; pshufd $0xe8,%xmm4,%xmm4
+ DB 102,15,96,228 ; punpcklbw %xmm4,%xmm4
+ DB 242,15,112,228,95 ; pshuflw $0x5f,%xmm4,%xmm4
+ DB 243,68,15,112,228,95 ; pshufhw $0x5f,%xmm4,%xmm12
+ DB 242,15,112,235,231 ; pshuflw $0xe7,%xmm3,%xmm5
+ DB 243,15,112,237,231 ; pshufhw $0xe7,%xmm5,%xmm5
+ DB 102,15,112,237,232 ; pshufd $0xe8,%xmm5,%xmm5
+ DB 102,15,96,237 ; punpcklbw %xmm5,%xmm5
+ DB 242,15,112,237,95 ; pshuflw $0x5f,%xmm5,%xmm5
+ DB 243,15,112,237,95 ; pshufhw $0x5f,%xmm5,%xmm5
+ DB 102,69,15,118,192 ; pcmpeqd %xmm8,%xmm8
+ DB 102,65,15,239,232 ; pxor %xmm8,%xmm5
+ DB 102,69,15,239,224 ; pxor %xmm8,%xmm12
+ DB 102,69,15,239,210 ; pxor %xmm10,%xmm10
+ DB 102,68,15,111,200 ; movdqa %xmm0,%xmm9
+ DB 242,68,15,112,216,231 ; pshuflw $0xe7,%xmm0,%xmm11
+ DB 102,65,15,96,194 ; punpcklbw %xmm10,%xmm0
+ DB 102,69,15,104,202 ; punpckhbw %xmm10,%xmm9
+ DB 102,15,111,225 ; movdqa %xmm1,%xmm4
+ DB 242,68,15,112,233,231 ; pshuflw $0xe7,%xmm1,%xmm13
+ DB 102,65,15,96,202 ; punpcklbw %xmm10,%xmm1
+ DB 102,65,15,104,226 ; punpckhbw %xmm10,%xmm4
+ DB 102,65,15,111,244 ; movdqa %xmm12,%xmm6
+ DB 102,65,15,96,242 ; punpcklbw %xmm10,%xmm6
+ DB 102,69,15,104,226 ; punpckhbw %xmm10,%xmm12
+ DB 102,15,111,253 ; movdqa %xmm5,%xmm7
+ DB 102,65,15,96,250 ; punpcklbw %xmm10,%xmm7
+ DB 102,65,15,104,234 ; punpckhbw %xmm10,%xmm5
+ DB 102,15,213,236 ; pmullw %xmm4,%xmm5
+ DB 102,15,213,249 ; pmullw %xmm1,%xmm7
+ DB 102,69,15,213,225 ; pmullw %xmm9,%xmm12
+ DB 102,15,213,240 ; pmullw %xmm0,%xmm6
+ DB 102,15,253,240 ; paddw %xmm0,%xmm6
+ DB 102,69,15,253,225 ; paddw %xmm9,%xmm12
+ DB 102,15,253,249 ; paddw %xmm1,%xmm7
+ DB 102,15,253,236 ; paddw %xmm4,%xmm5
DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
- DB 102,15,113,210,8 ; psrlw $0x8,%xmm2
- DB 102,15,103,213 ; packuswb %xmm5,%xmm2
- DB 243,15,112,199,231 ; pshufhw $0xe7,%xmm7,%xmm0
+ DB 102,15,113,215,8 ; psrlw $0x8,%xmm7
+ DB 102,65,15,113,212,8 ; psrlw $0x8,%xmm12
+ DB 102,15,113,214,8 ; psrlw $0x8,%xmm6
+ DB 102,65,15,103,244 ; packuswb %xmm12,%xmm6
+ DB 102,15,103,253 ; packuswb %xmm5,%xmm7
+ DB 243,65,15,112,195,231 ; pshufhw $0xe7,%xmm11,%xmm0
+ DB 102,15,112,192,232 ; pshufd $0xe8,%xmm0,%xmm0
+ DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
+ DB 242,15,112,192,95 ; pshuflw $0x5f,%xmm0,%xmm0
+ DB 243,15,112,224,95 ; pshufhw $0x5f,%xmm0,%xmm4
+ DB 243,65,15,112,197,231 ; pshufhw $0xe7,%xmm13,%xmm0
DB 102,15,112,192,232 ; pshufd $0xe8,%xmm0,%xmm0
DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
DB 242,15,112,192,95 ; pshuflw $0x5f,%xmm0,%xmm0
DB 243,15,112,232,95 ; pshufhw $0x5f,%xmm0,%xmm5
- DB 102,15,239,236 ; pxor %xmm4,%xmm5
- DB 102,15,111,225 ; movdqa %xmm1,%xmm4
- DB 102,15,111,241 ; movdqa %xmm1,%xmm6
- DB 102,15,96,243 ; punpcklbw %xmm3,%xmm6
- DB 102,15,104,227 ; punpckhbw %xmm3,%xmm4
- DB 102,15,111,197 ; movdqa %xmm5,%xmm0
- DB 102,15,96,195 ; punpcklbw %xmm3,%xmm0
- DB 102,15,104,235 ; punpckhbw %xmm3,%xmm5
- DB 102,15,213,236 ; pmullw %xmm4,%xmm5
- DB 102,15,213,198 ; pmullw %xmm6,%xmm0
- DB 102,15,253,198 ; paddw %xmm6,%xmm0
- DB 102,15,253,236 ; paddw %xmm4,%xmm5
+ DB 102,65,15,239,232 ; pxor %xmm8,%xmm5
+ DB 102,65,15,239,224 ; pxor %xmm8,%xmm4
+ DB 102,68,15,111,194 ; movdqa %xmm2,%xmm8
+ DB 102,68,15,111,202 ; movdqa %xmm2,%xmm9
+ DB 102,69,15,96,202 ; punpcklbw %xmm10,%xmm9
+ DB 102,69,15,104,194 ; punpckhbw %xmm10,%xmm8
+ DB 102,68,15,111,219 ; movdqa %xmm3,%xmm11
+ DB 102,68,15,111,227 ; movdqa %xmm3,%xmm12
+ DB 102,69,15,96,226 ; punpcklbw %xmm10,%xmm12
+ DB 102,69,15,104,218 ; punpckhbw %xmm10,%xmm11
+ DB 102,15,111,196 ; movdqa %xmm4,%xmm0
+ DB 102,65,15,96,194 ; punpcklbw %xmm10,%xmm0
+ DB 102,65,15,104,226 ; punpckhbw %xmm10,%xmm4
+ DB 102,15,111,205 ; movdqa %xmm5,%xmm1
+ DB 102,65,15,96,202 ; punpcklbw %xmm10,%xmm1
+ DB 102,65,15,104,234 ; punpckhbw %xmm10,%xmm5
+ DB 102,65,15,213,235 ; pmullw %xmm11,%xmm5
+ DB 102,65,15,213,204 ; pmullw %xmm12,%xmm1
+ DB 102,65,15,213,224 ; pmullw %xmm8,%xmm4
+ DB 102,65,15,213,193 ; pmullw %xmm9,%xmm0
+ DB 102,65,15,253,193 ; paddw %xmm9,%xmm0
+ DB 102,65,15,253,224 ; paddw %xmm8,%xmm4
+ DB 102,65,15,253,204 ; paddw %xmm12,%xmm1
+ DB 102,65,15,253,235 ; paddw %xmm11,%xmm5
DB 102,15,113,213,8 ; psrlw $0x8,%xmm5
+ DB 102,15,113,209,8 ; psrlw $0x8,%xmm1
+ DB 102,15,113,212,8 ; psrlw $0x8,%xmm4
DB 102,15,113,208,8 ; psrlw $0x8,%xmm0
- DB 102,15,103,197 ; packuswb %xmm5,%xmm0
- DB 102,15,252,194 ; paddb %xmm2,%xmm0
+ DB 102,15,103,196 ; packuswb %xmm4,%xmm0
+ DB 102,15,103,205 ; packuswb %xmm5,%xmm1
+ DB 102,15,252,198 ; paddb %xmm6,%xmm0
+ DB 102,15,252,207 ; paddb %xmm7,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
ALIGN 4
DB 0,0 ; add %al,(%rax)
- DB 127,67 ; jg 12c7 <_sk_xor__sse2_8bit+0x109>
+ DB 127,67 ; jg 23cf <_sk_xor__sse2_8bit+0x1ec>
DB 0,0 ; add %al,(%rax)
- DB 127,67 ; jg 12cb <_sk_xor__sse2_8bit+0x10d>
+ DB 127,67 ; jg 23d3 <_sk_xor__sse2_8bit+0x1f0>
DB 0,0 ; add %al,(%rax)
- DB 127,67 ; jg 12cf <_sk_xor__sse2_8bit+0x111>
+ DB 127,67 ; jg 23d7 <_sk_xor__sse2_8bit+0x1f4>
ALIGN 16
DB 0,0 ; add %al,(%rax)
@@ -43033,21 +46170,37 @@ ALIGN 16
DB 0,0 ; add %al,(%rax)
DB 0,255 ; add %bh,%bh
DB 255,0 ; incl (%rax)
- DB 0,0 ; add %al,(%rax)
DB 255,0 ; incl (%rax)
- DB 0,0 ; add %al,(%rax)
DB 255,0 ; incl (%rax)
- DB 0,0 ; add %al,(%rax)
DB 255,0 ; incl (%rax)
- DB 0,0 ; add %al,(%rax)
DB 255,0 ; incl (%rax)
- DB 0,0 ; add %al,(%rax)
DB 255,0 ; incl (%rax)
- DB 0,0 ; add %al,(%rax)
DB 255,0 ; incl (%rax)
- DB 0,0 ; add %al,(%rax)
DB 255,0 ; incl (%rax)
- DB 0,0 ; add %al,(%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
DB 1,1 ; add %eax,(%rcx)
DB 1,0 ; add %eax,(%rax)
DB 1,1 ; add %eax,(%rcx)
@@ -43065,13 +46218,13 @@ ALIGN 16
DB 0,0 ; add %al,(%rax)
DB 0,255 ; add %bh,%bh
DB 255,0 ; incl (%rax)
- DB 0,0 ; add %al,(%rax)
DB 255,0 ; incl (%rax)
- DB 0,0 ; add %al,(%rax)
DB 255,0 ; incl (%rax)
- DB 0,0 ; add %al,(%rax)
DB 255,0 ; incl (%rax)
- DB 0,0 ; add %al,(%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
DB 1,1 ; add %eax,(%rcx)
DB 1,0 ; add %eax,(%rax)
DB 1,1 ; add %eax,(%rcx)
@@ -43104,6 +46257,22 @@ ALIGN 16
DB 255,0 ; incl (%rax)
DB 255,0 ; incl (%rax)
DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
+ DB 255,0 ; incl (%rax)
DB 0,0 ; add %al,(%rax)
DB 0,255 ; add %bh,%bh
DB 0,0 ; add %al,(%rax)
diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp
index cbef3fc5db..1d8c2fcb61 100644
--- a/src/jumper/SkJumper_stages_8bit.cpp
+++ b/src/jumper/SkJumper_stages_8bit.cpp
@@ -31,17 +31,36 @@
#endif
#if defined(__AVX2__)
+ using U8 = uint8_t __attribute__((ext_vector_type(16)));
+ using U32 = uint32_t __attribute__((ext_vector_type(16)));
+ using U8x4 = uint8_t __attribute__((ext_vector_type(64)));
+ using U16x4 = uint16_t __attribute__((ext_vector_type(64)));
+ using R = uint8_t __attribute__((ext_vector_type(32)));
+#else
using U8 = uint8_t __attribute__((ext_vector_type( 8)));
using U32 = uint32_t __attribute__((ext_vector_type( 8)));
using U8x4 = uint8_t __attribute__((ext_vector_type(32)));
using U16x4 = uint16_t __attribute__((ext_vector_type(32)));
-#else
- using U8 = uint8_t __attribute__((ext_vector_type( 4)));
- using U32 = uint32_t __attribute__((ext_vector_type( 4)));
- using U8x4 = uint8_t __attribute__((ext_vector_type(16)));
- using U16x4 = uint16_t __attribute__((ext_vector_type(16)));
+ using R = uint8_t __attribute__((ext_vector_type(16)));
#endif
+// We double pump our math, making each U32 or U8x4 twice as wide as a native
+// vector register, and each U16x4 occupy four.
+//
+// These would be tricky to pass around directly because of ABI restrictions,
+// so we split them across two R to pass data between stages. This is
+// typically only a virtual operation, with no runtime cost.
+SI U8x4 join(R lo, R hi) {
+ U8x4 u8x4;
+ memcpy((char*)&u8x4 , &lo, sizeof(R));
+ memcpy((char*)&u8x4 + sizeof(R), &hi, sizeof(R));
+ return u8x4;
+}
+SI void split(U8x4 u8x4, R* lo, R* hi) {
+ memcpy(lo, (char*)&u8x4 , sizeof(R));
+ memcpy(hi, (char*)&u8x4 + sizeof(R), sizeof(R));
+}
+
union V {
U32 u32;
U8x4 u8x4;
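
For readers skimming the diff, here is a minimal, self-contained sketch of the join/split round trip added above, using the SSE2 widths (R = 16 bytes, U8x4 = 32 bytes) and clang's ext_vector_type as the file does; the sizes and the main() driver are illustrative only, not part of the change:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    using R    = uint8_t __attribute__((ext_vector_type(16)));   // one native register
    using U8x4 = uint8_t __attribute__((ext_vector_type(32)));   // double-pumped value

    static U8x4 join(R lo, R hi) {
        U8x4 u8x4;
        memcpy((char*)&u8x4,             &lo, sizeof(R));   // low half  -> bytes 0..15
        memcpy((char*)&u8x4 + sizeof(R), &hi, sizeof(R));   // high half -> bytes 16..31
        return u8x4;
    }
    static void split(U8x4 u8x4, R* lo, R* hi) {
        memcpy(lo, (char*)&u8x4,             sizeof(R));
        memcpy(hi, (char*)&u8x4 + sizeof(R), sizeof(R));
    }

    int main() {
        R lo, hi;
        for (int i = 0; i < 16; i++) { lo[i] = (uint8_t)i; hi[i] = (uint8_t)(16 + i); }
        U8x4 wide = join(lo, hi);      // lanes land in order 0..31
        R lo2, hi2;
        split(wide, &lo2, &hi2);       // recovers the original halves
        for (int i = 0; i < 16; i++) { assert(lo2[i] == lo[i] && hi2[i] == hi[i]); }
        return 0;
    }

Because join() and split() are just byte copies of register-width chunks, the compiler can usually lower them to nothing: the halves stay in the same registers on either side, which is why the comment calls this a virtual operation.
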
@@ -57,18 +76,25 @@ static const size_t kStride = sizeof(V) / sizeof(uint32_t);
// Usually __builtin_convertvector() is pretty good, but sometimes we can do better.
SI U8x4 pack(U16x4 v) {
#if defined(__AVX2__)
- static_assert(sizeof(v) == 64, "");
- auto lo = unaligned_load<__m256i>((char*)&v + 0),
- hi = unaligned_load<__m256i>((char*)&v + 32);
-
- auto _02 = _mm256_permute2x128_si256(lo,hi, 0x20),
- _13 = _mm256_permute2x128_si256(lo,hi, 0x31);
- return _mm256_packus_epi16(_02, _13);
+ static_assert(sizeof(v) == 128, "");
+ auto A = unaligned_load<__m256i>((char*)&v + 0),
+ B = unaligned_load<__m256i>((char*)&v + 32),
+ C = unaligned_load<__m256i>((char*)&v + 64),
+ D = unaligned_load<__m256i>((char*)&v + 96);
+
+ auto pack = [](__m256i lo, __m256i hi) {
+ auto _02 = _mm256_permute2x128_si256(lo,hi, 0x20),
+ _13 = _mm256_permute2x128_si256(lo,hi, 0x31);
+ return _mm256_packus_epi16(_02, _13);
+ };
+ return join(pack(A,B), pack(C,D));
#elif defined(__SSE2__)
- static_assert(sizeof(v) == 32, "");
- auto lo = unaligned_load<__m128i>((char*)&v + 0),
- hi = unaligned_load<__m128i>((char*)&v + 16);
- return _mm_packus_epi16(lo,hi);
+ static_assert(sizeof(v) == 64, "");
+ auto A = unaligned_load<__m128i>((char*)&v + 0),
+ B = unaligned_load<__m128i>((char*)&v + 16),
+ C = unaligned_load<__m128i>((char*)&v + 32),
+ D = unaligned_load<__m128i>((char*)&v + 48);
+ return join(_mm_packus_epi16(A,B), _mm_packus_epi16(C,D));
#else
return __builtin_convertvector(v, U8x4);
#endif
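
Two notes on pack(). Per lane it is an unsigned saturating narrow of each 16-bit value to 8 bits (the packus instructions saturate; the __builtin_convertvector fallback truncates, which matches in practice because the inputs have already been shifted right by 8). The AVX2 permutes exist because _mm256_packus_epi16 packs its two 128-bit lanes independently: packing A=[w0..w15] and B=[w16..w31] directly would interleave the bytes as [b0..b7, b16..b23 | b8..b15, b24..b31], while permuting into _02=[w0..w7 | w16..w23] and _13=[w8..w15 | w24..w31] first makes the result come out in linear order. A scalar reference for the per-lane behaviour (pack_reference is an illustrative name, not from the file):

    #include <stddef.h>
    #include <stdint.h>

    // Narrow each 16-bit lane to 8 bits with unsigned saturation, lane by lane,
    // which is what the packus-based paths above compute.
    static void pack_reference(const uint16_t* v, uint8_t* out, size_t lanes) {
        for (size_t i = 0; i < lanes; i++) {
            out[i] = v[i] > 255 ? 255 : (uint8_t)v[i];
        }
    }
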
@@ -90,9 +116,13 @@ SI V alpha(V v) {
#if defined(__AVX2__)
return __builtin_shufflevector(v.u8x4,v.u8x4,
3, 3, 3, 3, 7, 7, 7, 7, 11,11,11,11, 15,15,15,15,
- 19,19,19,19, 23,23,23,23, 27,27,27,27, 31,31,31,31);
+ 19,19,19,19, 23,23,23,23, 27,27,27,27, 31,31,31,31,
+ 35,35,35,35, 39,39,39,39, 43,43,43,43, 47,47,47,47,
+ 51,51,51,51, 55,55,55,55, 59,59,59,59, 63,63,63,63);
#else
- return __builtin_shufflevector(v.u8x4,v.u8x4, 3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15);
+ return __builtin_shufflevector(v.u8x4,v.u8x4,
+ 3, 3, 3, 3, 7, 7, 7, 7, 11,11,11,11, 15,15,15,15,
+ 19,19,19,19, 23,23,23,23, 27,27,27,27, 31,31,31,31);
#endif
}
@@ -100,9 +130,13 @@ SI V swap_rb(V v) {
#if defined(__AVX2__)
return __builtin_shufflevector(v.u8x4,v.u8x4,
2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8,11, 14,13,12,15,
- 18,17,16,19, 22,21,20,23, 26,25,24,27, 30,29,28,31);
+ 18,17,16,19, 22,21,20,23, 26,25,24,27, 30,29,28,31,
+ 34,33,32,35, 38,37,36,39, 42,41,40,43, 46,45,44,47,
+ 50,49,48,51, 54,53,52,55, 58,57,56,59, 62,61,60,63);
#else
- return __builtin_shufflevector(v.u8x4,v.u8x4, 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
+ return __builtin_shufflevector(v.u8x4,v.u8x4,
+ 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8,11, 14,13,12,15,
+ 18,17,16,19, 22,21,20,23, 26,25,24,27, 30,29,28,31);
#endif
}
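
The longer shuffle index lists above are the same per-pixel byte permutations as before, just covering twice as many pixels. Per 32-bit pixel (loaded little-endian, so byte 3 is the high byte), they amount to the scalar operations below; alpha_px and swap_rb_px are illustrative helper names, not code from the file:

    #include <stdint.h>

    // alpha(): broadcast byte 3 of each 4-byte pixel into all four bytes.
    static uint32_t alpha_px(uint32_t px) {
        return (px >> 24) * 0x01010101u;
    }

    // swap_rb(): exchange bytes 0 and 2 of each pixel, leaving bytes 1 and 3 alone.
    static uint32_t swap_rb_px(uint32_t px) {
        return (px & 0xff00ff00u)
             | ((px >> 16) & 0x000000ffu)
             | ((px & 0x000000ffu) << 16);
    }
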
@@ -110,7 +144,7 @@ struct Params {
size_t x,y,tail;
};
-using Stage = void(const Params* params, void** program, V src, V dst);
+using Stage = void(const Params* params, void** program, R src_lo, R src_hi, R dst_lo, R dst_hi);
#if defined(__AVX__)
// We really want to make sure all paths go through this function's (implicit) vzeroupper.
@@ -120,31 +154,36 @@ using Stage = void(const Params* params, void** program, V src, V dst);
MAYBE_MSABI
extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t ylimit,
void** program, const SkJumper_constants*) {
- V v;
+ R r;
auto start = (Stage*)load_and_inc(program);
for (; y < ylimit; y++) {
Params params = { x,y,0 };
while (params.x + kStride <= xlimit) {
- start(&params,program, v,v);
+ start(&params,program, r,r,r,r);
params.x += kStride;
}
if (size_t tail = xlimit - params.x) {
params.tail = tail;
- start(&params,program, v,v);
+ start(&params,program, r,r,r,r);
}
}
}
-extern "C" void WRAP(just_return)(const Params*, void**, V,V) {}
-
-#define STAGE(name) \
- SI void name##_k(LazyCtx ctx, size_t x, size_t y, size_t tail, V& src, V& dst); \
- extern "C" void WRAP(name)(const Params* params, void** program, V src, V dst) { \
- LazyCtx ctx(program); \
- name##_k(ctx, params->x, params->y, params->tail, src, dst); \
- auto next = (Stage*)load_and_inc(program); \
- next(params,program, src,dst); \
- } \
+extern "C" void WRAP(just_return)(const Params*, void**, R,R,R,R) {}
+
+#define STAGE(name) \
+ SI void name##_k(LazyCtx ctx, size_t x, size_t y, size_t tail, V& src, V& dst); \
+ extern "C" void WRAP(name)(const Params* params, void** program, \
+ R src_lo, R src_hi, R dst_lo, R dst_hi) { \
+ V src = join(src_lo, src_hi), \
+ dst = join(dst_lo, dst_hi); \
+ LazyCtx ctx(program); \
+ name##_k(ctx, params->x, params->y, params->tail, src, dst); \
+ split(src.u8x4, &src_lo, &src_hi); \
+ split(dst.u8x4, &dst_lo, &dst_hi); \
+ auto next = (Stage*)load_and_inc(program); \
+ next(params,program, src_lo,src_hi, dst_lo,dst_hi); \
+ } \
SI void name##_k(LazyCtx ctx, size_t x, size_t y, size_t tail, V& src, V& dst)
template <typename V, typename T>
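
To make the calling convention above concrete: each wrapped stage joins the four register-width halves back into double-wide values, runs its body, splits the results again, then loads the next stage's address from program and calls it. A toy, self-contained version of that chaining, with a single scalar standing in for the four R halves and hypothetical stage names (invert, just_return here is modelled on the real one):

    #include <stddef.h>
    #include <stdint.h>

    struct Params { size_t x, y, tail; };
    using Stage = void(const Params*, void**, uint32_t src, uint32_t dst);

    static void* load_and_inc(void**& program) { return *program++; }

    // Hypothetical stage: transform the "pixel", then hand off to the next stage.
    static void invert(const Params* params, void** program, uint32_t src, uint32_t dst) {
        src = ~src;
        auto next = (Stage*)load_and_inc(program);
        next(params, program, src, dst);
    }
    static void just_return(const Params*, void**, uint32_t, uint32_t) {}

    int main() {
        void* program[] = { (void*)invert, (void*)just_return };
        void** ip = program;
        Params p{0, 0, 0};
        auto start = (Stage*)load_and_inc(ip);
        start(&p, ip, 0x00ff00ffu, 0);   // runs invert, then just_return
        return 0;
    }
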
@@ -153,13 +192,21 @@ SI V load(const T* src, size_t tail) {
if (__builtin_expect(tail, 0)) {
V v = 0;
switch (tail) {
- case 7: v[6] = src[6];
- case 6: v[5] = src[5];
- case 5: v[4] = src[4];
- case 4: memcpy(&v, src, 4*sizeof(T)); break;
- case 3: v[2] = src[2];
- case 2: memcpy(&v, src, 2*sizeof(T)); break;
- case 1: memcpy(&v, src, 1*sizeof(T)); break;
+ case 15: v[14] = src[14];
+ case 14: v[13] = src[13];
+ case 13: v[12] = src[12];
+ case 12: memcpy(&v, src, 12*sizeof(T)); break;
+ case 11: v[10] = src[10];
+ case 10: v[ 9] = src[ 9];
+ case 9: v[ 8] = src[ 8];
+ case 8: memcpy(&v, src, 8*sizeof(T)); break;
+ case 7: v[6] = src[6];
+ case 6: v[5] = src[5];
+ case 5: v[4] = src[4];
+ case 4: memcpy(&v, src, 4*sizeof(T)); break;
+ case 3: v[2] = src[2];
+ case 2: memcpy(&v, src, 2*sizeof(T)); break;
+ case 1: memcpy(&v, src, 1*sizeof(T)); break;
}
return v;
}
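
The widened switch keeps the original pattern: the cases deliberately fall through, so a tail of 6, for example, loads lanes 5 and 4 individually and then memcpy's the first four in one go, while multiples of four are handled entirely by memcpy. The same structure on a plain 8-lane array, as a runnable illustration (load_tail and the main() driver are illustrative, not code from the file):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    // Same fall-through structure as load() above, for an 8-lane tail on a plain
    // array; the real code does this on a vector, for up to 16 lanes.
    static void load_tail(const uint32_t* src, uint32_t v[8], size_t tail) {
        memset(v, 0, 8 * sizeof(uint32_t));
        switch (tail) {
            case 7: v[6] = src[6];                            // fall through
            case 6: v[5] = src[5];                            // fall through
            case 5: v[4] = src[4];                            // fall through
            case 4: memcpy(v, src, 4 * sizeof(uint32_t)); break;
            case 3: v[2] = src[2];                            // fall through
            case 2: memcpy(v, src, 2 * sizeof(uint32_t)); break;
            case 1: memcpy(v, src, 1 * sizeof(uint32_t)); break;
        }
    }

    int main() {
        uint32_t src[8] = {1,2,3,4,5,6,7,8}, v[8];
        load_tail(src, v, 6);
        for (int i = 0; i < 8; i++) printf("%u ", v[i]);      // prints: 1 2 3 4 5 6 0 0
        printf("\n");
        return 0;
    }
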
@@ -171,20 +218,28 @@ SI void store(T* dst, V v, size_t tail) {
__builtin_assume(tail < kStride);
if (__builtin_expect(tail, 0)) {
switch (tail) {
- case 7: dst[6] = v[6];
- case 6: dst[5] = v[5];
- case 5: dst[4] = v[4];
- case 4: memcpy(dst, &v, 4*sizeof(T)); break;
- case 3: dst[2] = v[2];
- case 2: memcpy(dst, &v, 2*sizeof(T)); break;
- case 1: memcpy(dst, &v, 1*sizeof(T)); break;
+ case 15: dst[14] = v[14];
+ case 14: dst[13] = v[13];
+ case 13: dst[12] = v[12];
+ case 12: memcpy(dst, &v, 12*sizeof(T)); break;
+ case 11: dst[10] = v[10];
+ case 10: dst[ 9] = v[ 9];
+ case 9: dst[ 8] = v[ 8];
+ case 8: memcpy(dst, &v, 8*sizeof(T)); break;
+ case 7: dst[6] = v[6];
+ case 6: dst[5] = v[5];
+ case 5: dst[4] = v[4];
+ case 4: memcpy(dst, &v, 4*sizeof(T)); break;
+ case 3: dst[2] = v[2];
+ case 2: memcpy(dst, &v, 2*sizeof(T)); break;
+ case 1: memcpy(dst, &v, 1*sizeof(T)); break;
}
return;
}
unaligned_store(dst, v);
}
-#if 1 && defined(__AVX2__)
+#if 0 && defined(__AVX2__)
SI U32 mask(size_t tail) {
// We go a little out of our way to avoid needing large constant values here.