diff options
author | Mike Klein <mtklein@chromium.org> | 2017-08-03 00:04:12 -0400 |
---|---|---|
committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2017-08-03 13:24:46 +0000 |
commit | e7f89fc257a5ddd83a314e7bbdd23cb17a461ae5 (patch) | |
tree | 77b99ba0f8714c42f56f1638c39d68fc2fd5e9f9 /src | |
parent | 698edfecef121d8575eee6af207ce8a9525032ee (diff) |
improve HSW 16->8 bit pack
__builtin_convertvector(..., U8x4) produces a fairly long
instruction sequence to convert U16x4 to U8x4 on HSW (Haswell, AVX2):
vextracti128 $0x1,%ymm2,%xmm3
vmovdqa 0x1848(%rip),%xmm4
vpshufb %xmm4,%xmm3,%xmm3
vpshufb %xmm4,%xmm2,%xmm2
vpunpcklqdq %xmm3,%xmm2,%xmm2
vextracti128 $0x1,%ymm0,%xmm3
vpshufb %xmm4,%xmm3,%xmm3
vpshufb %xmm4,%xmm0,%xmm0
vpunpcklqdq %xmm3,%xmm0,%xmm0
vinserti128 $0x1,%xmm2,%ymm0,%ymm0
We can do much better with _mm256_packus_epi16:
vinserti128 $0x1,%xmm0,%ymm2,%ymm3
vperm2i128 $0x31,%ymm0,%ymm2,%ymm0
vpackuswb %ymm0,%ymm3,%ymm0
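vpackuswb packs the values in a somewhat surprising order: it works
per 128-bit lane, packing the low lanes of both inputs into the low
lane of the result and the high lanes of both inputs into the high lane.
The first two instructions (vinserti128 and vperm2i128) rearrange the
lanes so that the packed bytes come out in their original order.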
This is a pretty noticeable speedup, 7-8% on some benchmarks.
A similar change could be made for SSE2 and SSE4.1 using
_mm_packus_epi16, but the gain there is much less dramatic,
so this change focuses on HSW only.
Change-Id: I0d6765bd67e0d024d658a61d19e6f6826b4d392c
Reviewed-on: https://skia-review.googlesource.com/30420
Reviewed-by: Florin Malita <fmalita@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src')
-rw-r--r-- | src/jumper/SkJumper_generated.S | 1066 | ||||
-rw-r--r-- | src/jumper/SkJumper_generated_win.S | 1067 | ||||
-rw-r--r-- | src/jumper/SkJumper_stages_8bit.cpp | 23 |
3 files changed, 773 insertions, 1383 deletions
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S index c6afb7cb95..1b005b621f 100644 --- a/src/jumper/SkJumper_generated.S +++ b/src/jumper/SkJumper_generated.S @@ -50443,7 +50443,7 @@ HIDDEN _sk_uniform_color_hsw_8bit FUNCTION(_sk_uniform_color_hsw_8bit) _sk_uniform_color_hsw_8bit: .byte 72,173 // lods %ds:(%rsi),%rax - .byte 197,250,16,5,18,22,0,0 // vmovss 0x1612(%rip),%xmm0 # 16c4 <_sk_xor__hsw_8bit+0xff> + .byte 197,250,16,5,134,18,0,0 // vmovss 0x1286(%rip),%xmm0 # 1338 <_sk_xor__hsw_8bit+0xbd> .byte 197,250,89,16 // vmulss (%rax),%xmm0,%xmm2 .byte 196,225,250,44,202 // vcvttss2si %xmm2,%rcx .byte 197,250,89,80,4 // vmulss 0x4(%rax),%xmm0,%xmm2 @@ -50468,7 +50468,7 @@ HIDDEN _sk_set_rgb_hsw_8bit FUNCTION(_sk_set_rgb_hsw_8bit) _sk_set_rgb_hsw_8bit: .byte 72,173 // lods %ds:(%rsi),%rax - .byte 197,250,16,21,201,21,0,0 // vmovss 0x15c9(%rip),%xmm2 # 16c8 <_sk_xor__hsw_8bit+0x103> + .byte 197,250,16,21,61,18,0,0 // vmovss 0x123d(%rip),%xmm2 # 133c <_sk_xor__hsw_8bit+0xc1> .byte 197,234,89,24 // vmulss (%rax),%xmm2,%xmm3 .byte 196,225,250,44,203 // vcvttss2si %xmm3,%rcx .byte 197,234,89,88,4 // vmulss 0x4(%rax),%xmm2,%xmm3 @@ -50481,7 +50481,7 @@ _sk_set_rgb_hsw_8bit: .byte 9,208 // or %edx,%eax .byte 197,249,110,208 // vmovd %eax,%xmm2 .byte 196,226,125,88,210 // vpbroadcastd %xmm2,%ymm2 - .byte 197,253,219,5,201,21,0,0 // vpand 0x15c9(%rip),%ymm0,%ymm0 # 1700 <_sk_xor__hsw_8bit+0x13b> + .byte 197,253,219,5,41,18,0,0 // vpand 0x1229(%rip),%ymm0,%ymm0 # 1360 <_sk_xor__hsw_8bit+0xe5> .byte 197,237,235,192 // vpor %ymm0,%ymm2,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -50490,13 +50490,13 @@ HIDDEN _sk_premul_hsw_8bit .globl _sk_premul_hsw_8bit FUNCTION(_sk_premul_hsw_8bit) _sk_premul_hsw_8bit: - .byte 196,226,125,0,21,216,21,0,0 // vpshufb 0x15d8(%rip),%ymm0,%ymm2 # 1720 <_sk_xor__hsw_8bit+0x15b> - .byte 197,237,235,21,240,21,0,0 // vpor 0x15f0(%rip),%ymm2,%ymm2 # 1740 <_sk_xor__hsw_8bit+0x17b> - .byte 
196,227,125,57,195,1 // vextracti128 $0x1,%ymm0,%xmm3 - .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 + .byte 196,226,125,0,21,56,18,0,0 // vpshufb 0x1238(%rip),%ymm0,%ymm2 # 1380 <_sk_xor__hsw_8bit+0x105> + .byte 197,237,235,21,80,18,0,0 // vpor 0x1250(%rip),%ymm2,%ymm2 # 13a0 <_sk_xor__hsw_8bit+0x125> + .byte 196,226,125,48,216 // vpmovzxbw %xmm0,%ymm3 + .byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0 .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0 - .byte 196,227,125,57,212,1 // vextracti128 $0x1,%ymm2,%xmm4 - .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4 + .byte 196,226,125,48,226 // vpmovzxbw %xmm2,%ymm4 + .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2 .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2 .byte 197,237,213,208 // vpmullw %ymm0,%ymm2,%ymm2 .byte 197,221,213,227 // vpmullw %ymm3,%ymm4,%ymm4 @@ -50504,16 +50504,9 @@ _sk_premul_hsw_8bit: .byte 197,237,253,192 // vpaddw %ymm0,%ymm2,%ymm0 .byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0 .byte 197,237,113,211,8 // vpsrlw $0x8,%ymm3,%ymm2 - .byte 196,227,125,57,211,1 // vextracti128 $0x1,%ymm2,%xmm3 - .byte 197,249,111,37,72,24,0,0 // vmovdqa 0x1848(%rip),%xmm4 # 19e0 <_sk_xor__hsw_8bit+0x41b> - .byte 196,226,97,0,220 // vpshufb %xmm4,%xmm3,%xmm3 - .byte 196,226,105,0,212 // vpshufb %xmm4,%xmm2,%xmm2 - .byte 197,233,108,211 // vpunpcklqdq %xmm3,%xmm2,%xmm2 - .byte 196,227,125,57,195,1 // vextracti128 $0x1,%ymm0,%xmm3 - .byte 196,226,97,0,220 // vpshufb %xmm4,%xmm3,%xmm3 - .byte 196,226,121,0,196 // vpshufb %xmm4,%xmm0,%xmm0 - .byte 197,249,108,195 // vpunpcklqdq %xmm3,%xmm0,%xmm0 - .byte 196,227,125,56,194,1 // vinserti128 $0x1,%xmm2,%ymm0,%ymm0 + .byte 196,227,109,56,216,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm3 + .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + .byte 197,229,103,192 // vpackuswb %ymm0,%ymm3,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -50521,7 +50514,7 @@ HIDDEN _sk_swap_rb_hsw_8bit .globl 
_sk_swap_rb_hsw_8bit FUNCTION(_sk_swap_rb_hsw_8bit) _sk_swap_rb_hsw_8bit: - .byte 196,226,125,0,5,147,21,0,0 // vpshufb 0x1593(%rip),%ymm0,%ymm0 # 1760 <_sk_xor__hsw_8bit+0x19b> + .byte 196,226,125,0,5,25,18,0,0 // vpshufb 0x1219(%rip),%ymm0,%ymm0 # 13c0 <_sk_xor__hsw_8bit+0x145> .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -50539,7 +50532,7 @@ _sk_load_8888_hsw_8bit: .byte 72,3,8 // add (%rax),%rcx .byte 74,141,4,137 // lea (%rcx,%r9,4),%rax .byte 77,133,192 // test %r8,%r8 - .byte 117,8 // jne 1fe <_sk_load_8888_hsw_8bit+0x2d> + .byte 117,8 // jne 1d8 <_sk_load_8888_hsw_8bit+0x2d> .byte 197,254,111,0 // vmovdqu (%rax),%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -50551,7 +50544,7 @@ _sk_load_8888_hsw_8bit: .byte 196,225,249,110,194 // vmovq %rdx,%xmm0 .byte 196,226,125,33,192 // vpmovsxbd %xmm0,%ymm0 .byte 196,226,125,140,0 // vpmaskmovd (%rax),%ymm0,%ymm0 - .byte 235,214 // jmp 1fa <_sk_load_8888_hsw_8bit+0x29> + .byte 235,214 // jmp 1d4 <_sk_load_8888_hsw_8bit+0x29> HIDDEN _sk_load_8888_dst_hsw_8bit .globl _sk_load_8888_dst_hsw_8bit @@ -50567,7 +50560,7 @@ _sk_load_8888_dst_hsw_8bit: .byte 72,3,8 // add (%rax),%rcx .byte 74,141,4,137 // lea (%rcx,%r9,4),%rax .byte 77,133,192 // test %r8,%r8 - .byte 117,8 // jne 251 <_sk_load_8888_dst_hsw_8bit+0x2d> + .byte 117,8 // jne 22b <_sk_load_8888_dst_hsw_8bit+0x2d> .byte 197,254,111,8 // vmovdqu (%rax),%ymm1 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -50579,7 +50572,7 @@ _sk_load_8888_dst_hsw_8bit: .byte 196,225,249,110,202 // vmovq %rdx,%xmm1 .byte 196,226,125,33,201 // vpmovsxbd %xmm1,%ymm1 .byte 196,226,117,140,8 // vpmaskmovd (%rax),%ymm1,%ymm1 - .byte 235,214 // jmp 24d <_sk_load_8888_dst_hsw_8bit+0x29> + .byte 235,214 // jmp 227 <_sk_load_8888_dst_hsw_8bit+0x29> HIDDEN _sk_store_8888_hsw_8bit .globl _sk_store_8888_hsw_8bit @@ -50595,7 +50588,7 @@ _sk_store_8888_hsw_8bit: .byte 72,3,8 // add (%rax),%rcx .byte 74,141,4,137 // lea 
(%rcx,%r9,4),%rax .byte 77,133,192 // test %r8,%r8 - .byte 117,8 // jne 2a4 <_sk_store_8888_hsw_8bit+0x2d> + .byte 117,8 // jne 27e <_sk_store_8888_hsw_8bit+0x2d> .byte 197,254,127,0 // vmovdqu %ymm0,(%rax) .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -50607,7 +50600,7 @@ _sk_store_8888_hsw_8bit: .byte 196,225,249,110,210 // vmovq %rdx,%xmm2 .byte 196,226,125,33,210 // vpmovsxbd %xmm2,%ymm2 .byte 196,226,109,142,0 // vpmaskmovd %ymm0,%ymm2,(%rax) - .byte 235,214 // jmp 2a0 <_sk_store_8888_hsw_8bit+0x29> + .byte 235,214 // jmp 27a <_sk_store_8888_hsw_8bit+0x29> HIDDEN _sk_load_bgra_hsw_8bit .globl _sk_load_bgra_hsw_8bit @@ -50623,9 +50616,9 @@ _sk_load_bgra_hsw_8bit: .byte 72,3,8 // add (%rax),%rcx .byte 74,141,4,137 // lea (%rcx,%r9,4),%rax .byte 77,133,192 // test %r8,%r8 - .byte 117,17 // jne 300 <_sk_load_bgra_hsw_8bit+0x36> + .byte 117,17 // jne 2da <_sk_load_bgra_hsw_8bit+0x36> .byte 197,254,111,0 // vmovdqu (%rax),%ymm0 - .byte 196,226,125,0,5,132,20,0,0 // vpshufb 0x1484(%rip),%ymm0,%ymm0 # 1780 <_sk_xor__hsw_8bit+0x1bb> + .byte 196,226,125,0,5,10,17,0,0 // vpshufb 0x110a(%rip),%ymm0,%ymm0 # 13e0 <_sk_xor__hsw_8bit+0x165> .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax .byte 185,8,0,0,0 // mov $0x8,%ecx @@ -50636,7 +50629,7 @@ _sk_load_bgra_hsw_8bit: .byte 196,225,249,110,194 // vmovq %rdx,%xmm0 .byte 196,226,125,33,192 // vpmovsxbd %xmm0,%ymm0 .byte 196,226,125,140,0 // vpmaskmovd (%rax),%ymm0,%ymm0 - .byte 235,205 // jmp 2f3 <_sk_load_bgra_hsw_8bit+0x29> + .byte 235,205 // jmp 2cd <_sk_load_bgra_hsw_8bit+0x29> HIDDEN _sk_load_bgra_dst_hsw_8bit .globl _sk_load_bgra_dst_hsw_8bit @@ -50652,9 +50645,9 @@ _sk_load_bgra_dst_hsw_8bit: .byte 72,3,8 // add (%rax),%rcx .byte 74,141,4,137 // lea (%rcx,%r9,4),%rax .byte 77,133,192 // test %r8,%r8 - .byte 117,17 // jne 35c <_sk_load_bgra_dst_hsw_8bit+0x36> + .byte 117,17 // jne 336 <_sk_load_bgra_dst_hsw_8bit+0x36> .byte 197,254,111,8 // vmovdqu (%rax),%ymm1 - .byte 
196,226,117,0,13,72,20,0,0 // vpshufb 0x1448(%rip),%ymm1,%ymm1 # 17a0 <_sk_xor__hsw_8bit+0x1db> + .byte 196,226,117,0,13,206,16,0,0 // vpshufb 0x10ce(%rip),%ymm1,%ymm1 # 1400 <_sk_xor__hsw_8bit+0x185> .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax .byte 185,8,0,0,0 // mov $0x8,%ecx @@ -50665,7 +50658,7 @@ _sk_load_bgra_dst_hsw_8bit: .byte 196,225,249,110,202 // vmovq %rdx,%xmm1 .byte 196,226,125,33,201 // vpmovsxbd %xmm1,%ymm1 .byte 196,226,117,140,8 // vpmaskmovd (%rax),%ymm1,%ymm1 - .byte 235,205 // jmp 34f <_sk_load_bgra_dst_hsw_8bit+0x29> + .byte 235,205 // jmp 329 <_sk_load_bgra_dst_hsw_8bit+0x29> HIDDEN _sk_store_bgra_hsw_8bit .globl _sk_store_bgra_hsw_8bit @@ -50680,9 +50673,9 @@ _sk_store_bgra_hsw_8bit: .byte 72,193,225,2 // shl $0x2,%rcx .byte 72,3,8 // add (%rax),%rcx .byte 74,141,4,137 // lea (%rcx,%r9,4),%rax - .byte 196,226,125,0,21,21,20,0,0 // vpshufb 0x1415(%rip),%ymm0,%ymm2 # 17c0 <_sk_xor__hsw_8bit+0x1fb> + .byte 196,226,125,0,21,155,16,0,0 // vpshufb 0x109b(%rip),%ymm0,%ymm2 # 1420 <_sk_xor__hsw_8bit+0x1a5> .byte 77,133,192 // test %r8,%r8 - .byte 117,8 // jne 3b8 <_sk_store_bgra_hsw_8bit+0x36> + .byte 117,8 // jne 392 <_sk_store_bgra_hsw_8bit+0x36> .byte 197,254,127,16 // vmovdqu %ymm2,(%rax) .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -50694,7 +50687,7 @@ _sk_store_bgra_hsw_8bit: .byte 196,225,249,110,218 // vmovq %rdx,%xmm3 .byte 196,226,125,33,219 // vpmovsxbd %xmm3,%ymm3 .byte 196,226,101,142,16 // vpmaskmovd %ymm2,%ymm3,(%rax) - .byte 235,214 // jmp 3b4 <_sk_store_bgra_hsw_8bit+0x32> + .byte 235,214 // jmp 38e <_sk_store_bgra_hsw_8bit+0x32> HIDDEN _sk_load_a8_hsw_8bit .globl _sk_load_a8_hsw_8bit @@ -50708,9 +50701,9 @@ _sk_load_a8_hsw_8bit: .byte 72,15,175,209 // imul %rcx,%rdx .byte 72,3,16 // add (%rax),%rdx .byte 77,133,201 // test %r9,%r9 - .byte 117,28 // jne 417 <_sk_load_a8_hsw_8bit+0x39> + .byte 117,28 // jne 3f1 <_sk_load_a8_hsw_8bit+0x39> .byte 196,162,121,48,4,2 // vpmovzxbw 
(%rdx,%r8,1),%xmm0 - .byte 197,249,219,5,231,21,0,0 // vpand 0x15e7(%rip),%xmm0,%xmm0 # 19f0 <_sk_xor__hsw_8bit+0x42b> + .byte 197,249,219,5,93,18,0,0 // vpand 0x125d(%rip),%xmm0,%xmm0 # 1640 <_sk_xor__hsw_8bit+0x3c5> .byte 196,226,125,51,192 // vpmovzxwd %xmm0,%ymm0 .byte 197,253,114,240,24 // vpslld $0x18,%ymm0,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax @@ -50719,15 +50712,15 @@ _sk_load_a8_hsw_8bit: .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0 .byte 65,254,201 // dec %r9b .byte 65,128,249,6 // cmp $0x6,%r9b - .byte 119,217 // ja 401 <_sk_load_a8_hsw_8bit+0x23> + .byte 119,217 // ja 3db <_sk_load_a8_hsw_8bit+0x23> .byte 65,15,182,193 // movzbl %r9b,%eax - .byte 72,141,13,117,0,0,0 // lea 0x75(%rip),%rcx # 4a8 <_sk_load_a8_hsw_8bit+0xca> + .byte 72,141,13,119,0,0,0 // lea 0x77(%rip),%rcx # 484 <_sk_load_a8_hsw_8bit+0xcc> .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax .byte 72,1,200 // add %rcx,%rax .byte 255,224 // jmpq *%rax .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax .byte 197,249,110,192 // vmovd %eax,%xmm0 - .byte 235,186 // jmp 401 <_sk_load_a8_hsw_8bit+0x23> + .byte 235,186 // jmp 3db <_sk_load_a8_hsw_8bit+0x23> .byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0 .byte 197,249,196,192,2 // vpinsrw $0x2,%eax,%xmm0,%xmm0 @@ -50735,7 +50728,7 @@ _sk_load_a8_hsw_8bit: .byte 197,249,110,208 // vmovd %eax,%xmm2 .byte 196,226,121,48,210 // vpmovzxbw %xmm2,%xmm2 .byte 196,227,121,2,194,1 // vpblendd $0x1,%xmm2,%xmm0,%xmm0 - .byte 235,149 // jmp 401 <_sk_load_a8_hsw_8bit+0x23> + .byte 235,149 // jmp 3db <_sk_load_a8_hsw_8bit+0x23> .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0 .byte 197,249,196,192,6 // vpinsrw $0x6,%eax,%xmm0,%xmm0 @@ -50746,22 +50739,23 @@ _sk_load_a8_hsw_8bit: .byte 196,161,121,110,20,2 // vmovd (%rdx,%r8,1),%xmm2 .byte 196,226,121,48,210 // vpmovzxbw %xmm2,%xmm2 .byte 196,227,105,2,192,12 // vpblendd $0xc,%xmm0,%xmm2,%xmm0 - .byte 
233,90,255,255,255 // jmpq 401 <_sk_load_a8_hsw_8bit+0x23> - .byte 144 // nop - .byte 148 // xchg %eax,%esp + .byte 233,90,255,255,255 // jmpq 3db <_sk_load_a8_hsw_8bit+0x23> + .byte 15,31,0 // nopl (%rax) + .byte 146 // xchg %eax,%edx .byte 255 // (bad) .byte 255 // (bad) - .byte 255,174,255,255,255,159 // ljmp *-0x60000001(%rsi) + .byte 255,172,255,255,255,157,255 // ljmp *-0x620001(%rdi,%rdi,8) .byte 255 // (bad) + .byte 255,231 // jmpq *%rdi .byte 255 // (bad) .byte 255 // (bad) - .byte 233,255,255,255,222 // jmpq ffffffffdf0004b8 <_sk_xor__hsw_8bit+0xffffffffdeffeef3> .byte 255 // (bad) + .byte 220,255 // fdivr %st,%st(7) .byte 255 // (bad) - .byte 255,211 // callq *%rbx + .byte 255,209 // callq *%rcx .byte 255 // (bad) .byte 255 // (bad) - .byte 255,196 // inc %esp + .byte 255,194 // inc %edx .byte 255 // (bad) .byte 255 // (bad) .byte 255 // .byte 0xff @@ -50778,9 +50772,9 @@ _sk_load_a8_dst_hsw_8bit: .byte 72,15,175,209 // imul %rcx,%rdx .byte 72,3,16 // add (%rax),%rdx .byte 77,133,201 // test %r9,%r9 - .byte 117,28 // jne 4fd <_sk_load_a8_dst_hsw_8bit+0x39> + .byte 117,28 // jne 4d9 <_sk_load_a8_dst_hsw_8bit+0x39> .byte 196,162,121,48,12,2 // vpmovzxbw (%rdx,%r8,1),%xmm1 - .byte 197,241,219,13,17,21,0,0 // vpand 0x1511(%rip),%xmm1,%xmm1 # 1a00 <_sk_xor__hsw_8bit+0x43b> + .byte 197,241,219,13,133,17,0,0 // vpand 0x1185(%rip),%xmm1,%xmm1 # 1650 <_sk_xor__hsw_8bit+0x3d5> .byte 196,226,125,51,201 // vpmovzxwd %xmm1,%ymm1 .byte 197,245,114,241,24 // vpslld $0x18,%ymm1,%ymm1 .byte 72,173 // lods %ds:(%rsi),%rax @@ -50789,15 +50783,15 @@ _sk_load_a8_dst_hsw_8bit: .byte 197,241,239,201 // vpxor %xmm1,%xmm1,%xmm1 .byte 65,254,201 // dec %r9b .byte 65,128,249,6 // cmp $0x6,%r9b - .byte 119,217 // ja 4e7 <_sk_load_a8_dst_hsw_8bit+0x23> + .byte 119,217 // ja 4c3 <_sk_load_a8_dst_hsw_8bit+0x23> .byte 65,15,182,193 // movzbl %r9b,%eax - .byte 72,141,13,119,0,0,0 // lea 0x77(%rip),%rcx # 590 <_sk_load_a8_dst_hsw_8bit+0xcc> + .byte 72,141,13,119,0,0,0 // lea 
0x77(%rip),%rcx # 56c <_sk_load_a8_dst_hsw_8bit+0xcc> .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax .byte 72,1,200 // add %rcx,%rax .byte 255,224 // jmpq *%rax .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax .byte 197,249,110,200 // vmovd %eax,%xmm1 - .byte 235,186 // jmp 4e7 <_sk_load_a8_dst_hsw_8bit+0x23> + .byte 235,186 // jmp 4c3 <_sk_load_a8_dst_hsw_8bit+0x23> .byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax .byte 197,241,239,201 // vpxor %xmm1,%xmm1,%xmm1 .byte 197,241,196,200,2 // vpinsrw $0x2,%eax,%xmm1,%xmm1 @@ -50805,7 +50799,7 @@ _sk_load_a8_dst_hsw_8bit: .byte 197,249,110,208 // vmovd %eax,%xmm2 .byte 196,226,121,48,210 // vpmovzxbw %xmm2,%xmm2 .byte 196,227,113,2,202,1 // vpblendd $0x1,%xmm2,%xmm1,%xmm1 - .byte 235,149 // jmp 4e7 <_sk_load_a8_dst_hsw_8bit+0x23> + .byte 235,149 // jmp 4c3 <_sk_load_a8_dst_hsw_8bit+0x23> .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax .byte 197,241,239,201 // vpxor %xmm1,%xmm1,%xmm1 .byte 197,241,196,200,6 // vpinsrw $0x6,%eax,%xmm1,%xmm1 @@ -50816,7 +50810,7 @@ _sk_load_a8_dst_hsw_8bit: .byte 196,161,121,110,20,2 // vmovd (%rdx,%r8,1),%xmm2 .byte 196,226,121,48,210 // vpmovzxbw %xmm2,%xmm2 .byte 196,227,105,2,201,12 // vpblendd $0xc,%xmm1,%xmm2,%xmm1 - .byte 233,90,255,255,255 // jmpq 4e7 <_sk_load_a8_dst_hsw_8bit+0x23> + .byte 233,90,255,255,255 // jmpq 4c3 <_sk_load_a8_dst_hsw_8bit+0x23> .byte 15,31,0 // nopl (%rax) .byte 146 // xchg %eax,%edx .byte 255 // (bad) @@ -50848,35 +50842,35 @@ _sk_store_a8_hsw_8bit: .byte 72,99,87,8 // movslq 0x8(%rdi),%rdx .byte 72,15,175,209 // imul %rcx,%rdx .byte 72,3,16 // add (%rax),%rdx - .byte 196,226,125,0,21,19,18,0,0 // vpshufb 0x1213(%rip),%ymm0,%ymm2 # 17e0 <_sk_xor__hsw_8bit+0x21b> + .byte 196,226,125,0,21,151,14,0,0 // vpshufb 0xe97(%rip),%ymm0,%ymm2 # 1440 <_sk_xor__hsw_8bit+0x1c5> .byte 196,227,253,0,210,232 // vpermq $0xe8,%ymm2,%ymm2 .byte 77,133,201 // test %r9,%r9 - .byte 117,19 // jne 5eb <_sk_store_a8_hsw_8bit+0x3f> - .byte 196,226,105,0,21,79,20,0,0 // 
vpshufb 0x144f(%rip),%xmm2,%xmm2 # 1a30 <_sk_xor__hsw_8bit+0x46b> + .byte 117,19 // jne 5c7 <_sk_store_a8_hsw_8bit+0x3f> + .byte 196,226,105,0,21,195,16,0,0 // vpshufb 0x10c3(%rip),%xmm2,%xmm2 # 1680 <_sk_xor__hsw_8bit+0x405> .byte 196,161,121,214,20,2 // vmovq %xmm2,(%rdx,%r8,1) .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax .byte 65,128,225,7 // and $0x7,%r9b .byte 65,254,201 // dec %r9b .byte 65,128,249,6 // cmp $0x6,%r9b - .byte 119,239 // ja 5e7 <_sk_store_a8_hsw_8bit+0x3b> + .byte 119,239 // ja 5c3 <_sk_store_a8_hsw_8bit+0x3b> .byte 65,15,182,193 // movzbl %r9b,%eax - .byte 72,141,13,85,0,0,0 // lea 0x55(%rip),%rcx # 658 <_sk_store_a8_hsw_8bit+0xac> + .byte 72,141,13,85,0,0,0 // lea 0x55(%rip),%rcx # 634 <_sk_store_a8_hsw_8bit+0xac> .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax .byte 72,1,200 // add %rcx,%rax .byte 255,224 // jmpq *%rax .byte 196,163,121,20,20,2,0 // vpextrb $0x0,%xmm2,(%rdx,%r8,1) - .byte 235,210 // jmp 5e7 <_sk_store_a8_hsw_8bit+0x3b> + .byte 235,210 // jmp 5c3 <_sk_store_a8_hsw_8bit+0x3b> .byte 196,163,121,20,84,2,2,4 // vpextrb $0x4,%xmm2,0x2(%rdx,%r8,1) - .byte 196,226,105,0,21,234,19,0,0 // vpshufb 0x13ea(%rip),%xmm2,%xmm2 # 1a10 <_sk_xor__hsw_8bit+0x44b> + .byte 196,226,105,0,21,94,16,0,0 // vpshufb 0x105e(%rip),%xmm2,%xmm2 # 1660 <_sk_xor__hsw_8bit+0x3e5> .byte 196,163,121,21,20,2,0 // vpextrw $0x0,%xmm2,(%rdx,%r8,1) - .byte 235,184 // jmp 5e7 <_sk_store_a8_hsw_8bit+0x3b> + .byte 235,184 // jmp 5c3 <_sk_store_a8_hsw_8bit+0x3b> .byte 196,163,121,20,84,2,6,12 // vpextrb $0xc,%xmm2,0x6(%rdx,%r8,1) .byte 196,163,121,20,84,2,5,10 // vpextrb $0xa,%xmm2,0x5(%rdx,%r8,1) .byte 196,163,121,20,84,2,4,8 // vpextrb $0x8,%xmm2,0x4(%rdx,%r8,1) - .byte 196,226,105,0,21,208,19,0,0 // vpshufb 0x13d0(%rip),%xmm2,%xmm2 # 1a20 <_sk_xor__hsw_8bit+0x45b> + .byte 196,226,105,0,21,68,16,0,0 // vpshufb 0x1044(%rip),%xmm2,%xmm2 # 1670 <_sk_xor__hsw_8bit+0x3f5> .byte 196,161,121,126,20,2 // vmovd %xmm2,(%rdx,%r8,1) - .byte 235,143 // jmp 5e7 
<_sk_store_a8_hsw_8bit+0x3b> + .byte 235,143 // jmp 5c3 <_sk_store_a8_hsw_8bit+0x3b> .byte 180,255 // mov $0xff,%ah .byte 255 // (bad) .byte 255,197 // inc %ebp @@ -50909,13 +50903,13 @@ _sk_load_g8_hsw_8bit: .byte 72,15,175,209 // imul %rcx,%rdx .byte 72,3,16 // add (%rax),%rdx .byte 77,133,201 // test %r9,%r9 - .byte 117,50 // jne 6c3 <_sk_load_g8_hsw_8bit+0x4f> + .byte 117,50 // jne 69f <_sk_load_g8_hsw_8bit+0x4f> .byte 196,162,121,48,4,2 // vpmovzxbw (%rdx,%r8,1),%xmm0 - .byte 197,249,219,5,161,19,0,0 // vpand 0x13a1(%rip),%xmm0,%xmm0 # 1a40 <_sk_xor__hsw_8bit+0x47b> + .byte 197,249,219,5,21,16,0,0 // vpand 0x1015(%rip),%xmm0,%xmm0 # 1690 <_sk_xor__hsw_8bit+0x415> .byte 196,226,125,51,192 // vpmovzxwd %xmm0,%ymm0 - .byte 196,226,125,88,21,31,16,0,0 // vpbroadcastd 0x101f(%rip),%ymm2 # 16cc <_sk_xor__hsw_8bit+0x107> + .byte 196,226,125,88,21,183,12,0,0 // vpbroadcastd 0xcb7(%rip),%ymm2 # 1340 <_sk_xor__hsw_8bit+0xc5> .byte 196,226,125,64,194 // vpmulld %ymm2,%ymm0,%ymm0 - .byte 196,226,125,88,21,21,16,0,0 // vpbroadcastd 0x1015(%rip),%ymm2 # 16d0 <_sk_xor__hsw_8bit+0x10b> + .byte 196,226,125,88,21,173,12,0,0 // vpbroadcastd 0xcad(%rip),%ymm2 # 1344 <_sk_xor__hsw_8bit+0xc9> .byte 197,253,235,194 // vpor %ymm2,%ymm0,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -50923,15 +50917,15 @@ _sk_load_g8_hsw_8bit: .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0 .byte 65,254,201 // dec %r9b .byte 65,128,249,6 // cmp $0x6,%r9b - .byte 119,195 // ja 697 <_sk_load_g8_hsw_8bit+0x23> + .byte 119,195 // ja 673 <_sk_load_g8_hsw_8bit+0x23> .byte 65,15,182,193 // movzbl %r9b,%eax - .byte 72,141,13,121,0,0,0 // lea 0x79(%rip),%rcx # 758 <_sk_load_g8_hsw_8bit+0xe4> + .byte 72,141,13,121,0,0,0 // lea 0x79(%rip),%rcx # 734 <_sk_load_g8_hsw_8bit+0xe4> .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax .byte 72,1,200 // add %rcx,%rax .byte 255,224 // jmpq *%rax .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax .byte 197,249,110,192 // vmovd %eax,%xmm0 - .byte 
235,164 // jmp 697 <_sk_load_g8_hsw_8bit+0x23> + .byte 235,164 // jmp 673 <_sk_load_g8_hsw_8bit+0x23> .byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0 .byte 197,249,196,192,2 // vpinsrw $0x2,%eax,%xmm0,%xmm0 @@ -50939,7 +50933,7 @@ _sk_load_g8_hsw_8bit: .byte 197,249,110,208 // vmovd %eax,%xmm2 .byte 196,226,121,48,210 // vpmovzxbw %xmm2,%xmm2 .byte 196,227,121,2,194,1 // vpblendd $0x1,%xmm2,%xmm0,%xmm0 - .byte 233,124,255,255,255 // jmpq 697 <_sk_load_g8_hsw_8bit+0x23> + .byte 233,124,255,255,255 // jmpq 673 <_sk_load_g8_hsw_8bit+0x23> .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0 .byte 197,249,196,192,6 // vpinsrw $0x6,%eax,%xmm0,%xmm0 @@ -50950,7 +50944,7 @@ _sk_load_g8_hsw_8bit: .byte 196,161,121,110,20,2 // vmovd (%rdx,%r8,1),%xmm2 .byte 196,226,121,48,210 // vpmovzxbw %xmm2,%xmm2 .byte 196,227,105,2,192,12 // vpblendd $0xc,%xmm0,%xmm2,%xmm0 - .byte 233,65,255,255,255 // jmpq 697 <_sk_load_g8_hsw_8bit+0x23> + .byte 233,65,255,255,255 // jmpq 673 <_sk_load_g8_hsw_8bit+0x23> .byte 102,144 // xchg %ax,%ax .byte 144 // nop .byte 255 // (bad) @@ -50959,7 +50953,7 @@ _sk_load_g8_hsw_8bit: .byte 255 // (bad) .byte 255 // (bad) .byte 255 // (bad) - .byte 232,255,255,255,221 // callq ffffffffde000768 <_sk_xor__hsw_8bit+0xffffffffddfff1a3> + .byte 232,255,255,255,221 // callq ffffffffde000744 <_sk_xor__hsw_8bit+0xffffffffddfff4c9> .byte 255 // (bad) .byte 255 // (bad) .byte 255,210 // callq *%rdx @@ -50982,13 +50976,13 @@ _sk_load_g8_dst_hsw_8bit: .byte 72,15,175,209 // imul %rcx,%rdx .byte 72,3,16 // add (%rax),%rdx .byte 77,133,201 // test %r9,%r9 - .byte 117,50 // jne 7c3 <_sk_load_g8_dst_hsw_8bit+0x4f> + .byte 117,50 // jne 79f <_sk_load_g8_dst_hsw_8bit+0x4f> .byte 196,162,121,48,12,2 // vpmovzxbw (%rdx,%r8,1),%xmm1 - .byte 197,241,219,13,177,18,0,0 // vpand 0x12b1(%rip),%xmm1,%xmm1 # 1a50 <_sk_xor__hsw_8bit+0x48b> + .byte 197,241,219,13,37,15,0,0 // vpand 
0xf25(%rip),%xmm1,%xmm1 # 16a0 <_sk_xor__hsw_8bit+0x425> .byte 196,226,125,51,201 // vpmovzxwd %xmm1,%ymm1 - .byte 196,226,125,88,21,39,15,0,0 // vpbroadcastd 0xf27(%rip),%ymm2 # 16d4 <_sk_xor__hsw_8bit+0x10f> + .byte 196,226,125,88,21,191,11,0,0 // vpbroadcastd 0xbbf(%rip),%ymm2 # 1348 <_sk_xor__hsw_8bit+0xcd> .byte 196,226,117,64,202 // vpmulld %ymm2,%ymm1,%ymm1 - .byte 196,226,125,88,21,29,15,0,0 // vpbroadcastd 0xf1d(%rip),%ymm2 # 16d8 <_sk_xor__hsw_8bit+0x113> + .byte 196,226,125,88,21,181,11,0,0 // vpbroadcastd 0xbb5(%rip),%ymm2 # 134c <_sk_xor__hsw_8bit+0xd1> .byte 197,245,235,202 // vpor %ymm2,%ymm1,%ymm1 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -50996,15 +50990,15 @@ _sk_load_g8_dst_hsw_8bit: .byte 197,241,239,201 // vpxor %xmm1,%xmm1,%xmm1 .byte 65,254,201 // dec %r9b .byte 65,128,249,6 // cmp $0x6,%r9b - .byte 119,195 // ja 797 <_sk_load_g8_dst_hsw_8bit+0x23> + .byte 119,195 // ja 773 <_sk_load_g8_dst_hsw_8bit+0x23> .byte 65,15,182,193 // movzbl %r9b,%eax - .byte 72,141,13,121,0,0,0 // lea 0x79(%rip),%rcx # 858 <_sk_load_g8_dst_hsw_8bit+0xe4> + .byte 72,141,13,121,0,0,0 // lea 0x79(%rip),%rcx # 834 <_sk_load_g8_dst_hsw_8bit+0xe4> .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax .byte 72,1,200 // add %rcx,%rax .byte 255,224 // jmpq *%rax .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax .byte 197,249,110,200 // vmovd %eax,%xmm1 - .byte 235,164 // jmp 797 <_sk_load_g8_dst_hsw_8bit+0x23> + .byte 235,164 // jmp 773 <_sk_load_g8_dst_hsw_8bit+0x23> .byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax .byte 197,241,239,201 // vpxor %xmm1,%xmm1,%xmm1 .byte 197,241,196,200,2 // vpinsrw $0x2,%eax,%xmm1,%xmm1 @@ -51012,7 +51006,7 @@ _sk_load_g8_dst_hsw_8bit: .byte 197,249,110,208 // vmovd %eax,%xmm2 .byte 196,226,121,48,210 // vpmovzxbw %xmm2,%xmm2 .byte 196,227,113,2,202,1 // vpblendd $0x1,%xmm2,%xmm1,%xmm1 - .byte 233,124,255,255,255 // jmpq 797 <_sk_load_g8_dst_hsw_8bit+0x23> + .byte 233,124,255,255,255 // jmpq 773 
<_sk_load_g8_dst_hsw_8bit+0x23> .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax .byte 197,241,239,201 // vpxor %xmm1,%xmm1,%xmm1 .byte 197,241,196,200,6 // vpinsrw $0x6,%eax,%xmm1,%xmm1 @@ -51023,7 +51017,7 @@ _sk_load_g8_dst_hsw_8bit: .byte 196,161,121,110,20,2 // vmovd (%rdx,%r8,1),%xmm2 .byte 196,226,121,48,210 // vpmovzxbw %xmm2,%xmm2 .byte 196,227,105,2,201,12 // vpblendd $0xc,%xmm1,%xmm2,%xmm1 - .byte 233,65,255,255,255 // jmpq 797 <_sk_load_g8_dst_hsw_8bit+0x23> + .byte 233,65,255,255,255 // jmpq 773 <_sk_load_g8_dst_hsw_8bit+0x23> .byte 102,144 // xchg %ax,%ax .byte 144 // nop .byte 255 // (bad) @@ -51032,7 +51026,7 @@ _sk_load_g8_dst_hsw_8bit: .byte 255 // (bad) .byte 255 // (bad) .byte 255 // (bad) - .byte 232,255,255,255,221 // callq ffffffffde000868 <_sk_xor__hsw_8bit+0xffffffffddfff2a3> + .byte 232,255,255,255,221 // callq ffffffffde000844 <_sk_xor__hsw_8bit+0xffffffffddfff5c9> .byte 255 // (bad) .byte 255 // (bad) .byte 255,210 // callq *%rdx @@ -51057,16 +51051,16 @@ _sk_srcover_rgba_8888_hsw_8bit: .byte 72,3,8 // add (%rax),%rcx .byte 74,141,4,137 // lea (%rcx,%r9,4),%rax .byte 77,133,192 // test %r8,%r8 - .byte 15,133,150,0,0,0 // jne 933 <_sk_srcover_rgba_8888_hsw_8bit+0xbf> + .byte 117,112 // jne 8e5 <_sk_srcover_rgba_8888_hsw_8bit+0x95> .byte 197,254,111,16 // vmovdqu (%rax),%ymm2 - .byte 196,226,125,0,29,86,15,0,0 // vpshufb 0xf56(%rip),%ymm0,%ymm3 # 1800 <_sk_xor__hsw_8bit+0x23b> + .byte 196,226,125,0,29,222,11,0,0 // vpshufb 0xbde(%rip),%ymm0,%ymm3 # 1460 <_sk_xor__hsw_8bit+0x1e5> .byte 197,221,118,228 // vpcmpeqd %ymm4,%ymm4,%ymm4 .byte 197,229,239,220 // vpxor %ymm4,%ymm3,%ymm3 - .byte 196,227,125,57,212,1 // vextracti128 $0x1,%ymm2,%xmm4 - .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4 + .byte 196,226,125,48,226 // vpmovzxbw %xmm2,%ymm4 + .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2 .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2 - .byte 196,227,125,57,221,1 // vextracti128 $0x1,%ymm3,%xmm5 - .byte 
196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5 + .byte 196,226,125,48,235 // vpmovzxbw %xmm3,%ymm5 + .byte 196,227,125,57,219,1 // vextracti128 $0x1,%ymm3,%xmm3 .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 .byte 197,237,213,219 // vpmullw %ymm3,%ymm2,%ymm3 .byte 197,221,213,237 // vpmullw %ymm5,%ymm4,%ymm5 @@ -51074,19 +51068,12 @@ _sk_srcover_rgba_8888_hsw_8bit: .byte 197,229,253,210 // vpaddw %ymm2,%ymm3,%ymm2 .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2 .byte 197,229,113,212,8 // vpsrlw $0x8,%ymm4,%ymm3 - .byte 196,227,125,57,220,1 // vextracti128 $0x1,%ymm3,%xmm4 - .byte 197,249,111,45,102,17,0,0 // vmovdqa 0x1166(%rip),%xmm5 # 1a60 <_sk_xor__hsw_8bit+0x49b> - .byte 196,226,89,0,229 // vpshufb %xmm5,%xmm4,%xmm4 - .byte 196,226,97,0,221 // vpshufb %xmm5,%xmm3,%xmm3 - .byte 197,225,108,220 // vpunpcklqdq %xmm4,%xmm3,%xmm3 - .byte 196,227,125,57,212,1 // vextracti128 $0x1,%ymm2,%xmm4 - .byte 196,226,89,0,229 // vpshufb %xmm5,%xmm4,%xmm4 - .byte 196,226,105,0,213 // vpshufb %xmm5,%xmm2,%xmm2 - .byte 197,233,108,212 // vpunpcklqdq %xmm4,%xmm2,%xmm2 - .byte 196,227,109,56,211,1 // vinserti128 $0x1,%xmm3,%ymm2,%ymm2 + .byte 196,227,101,56,226,1 // vinserti128 $0x1,%xmm2,%ymm3,%ymm4 + .byte 196,227,101,70,210,49 // vperm2i128 $0x31,%ymm2,%ymm3,%ymm2 + .byte 197,221,103,210 // vpackuswb %ymm2,%ymm4,%ymm2 .byte 197,237,252,208 // vpaddb %ymm0,%ymm2,%ymm2 .byte 77,133,192 // test %r8,%r8 - .byte 117,49 // jne 95c <_sk_srcover_rgba_8888_hsw_8bit+0xe8> + .byte 117,49 // jne 90e <_sk_srcover_rgba_8888_hsw_8bit+0xbe> .byte 197,254,127,16 // vmovdqu %ymm2,(%rax) .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -51098,7 +51085,7 @@ _sk_srcover_rgba_8888_hsw_8bit: .byte 196,225,249,110,210 // vmovq %rdx,%xmm2 .byte 196,226,125,33,210 // vpmovsxbd %xmm2,%ymm2 .byte 196,226,109,140,16 // vpmaskmovd (%rax),%ymm2,%ymm2 - .byte 233,69,255,255,255 // jmpq 8a1 <_sk_srcover_rgba_8888_hsw_8bit+0x2d> + .byte 233,107,255,255,255 // jmpq 879 
<_sk_srcover_rgba_8888_hsw_8bit+0x29> .byte 185,8,0,0,0 // mov $0x8,%ecx .byte 68,41,193 // sub %r8d,%ecx .byte 192,225,3 // shl $0x3,%cl @@ -51107,7 +51094,7 @@ _sk_srcover_rgba_8888_hsw_8bit: .byte 196,225,249,110,218 // vmovq %rdx,%xmm3 .byte 196,226,125,33,219 // vpmovsxbd %xmm3,%ymm3 .byte 196,226,101,142,16 // vpmaskmovd %ymm2,%ymm3,(%rax) - .byte 235,173 // jmp 92f <_sk_srcover_rgba_8888_hsw_8bit+0xbb> + .byte 235,173 // jmp 8e1 <_sk_srcover_rgba_8888_hsw_8bit+0x91> HIDDEN _sk_scale_1_float_hsw_8bit .globl _sk_scale_1_float_hsw_8bit @@ -51115,30 +51102,23 @@ FUNCTION(_sk_scale_1_float_hsw_8bit) _sk_scale_1_float_hsw_8bit: .byte 72,173 // lods %ds:(%rsi),%rax .byte 197,250,16,16 // vmovss (%rax),%xmm2 - .byte 197,234,89,21,76,13,0,0 // vmulss 0xd4c(%rip),%xmm2,%xmm2 # 16dc <_sk_xor__hsw_8bit+0x117> + .byte 197,234,89,21,14,10,0,0 // vmulss 0xa0e(%rip),%xmm2,%xmm2 # 1350 <_sk_xor__hsw_8bit+0xd5> .byte 197,250,44,194 // vcvttss2si %xmm2,%eax .byte 197,249,110,208 // vmovd %eax,%xmm2 .byte 196,226,125,120,210 // vpbroadcastb %xmm2,%ymm2 - .byte 196,227,125,57,195,1 // vextracti128 $0x1,%ymm0,%xmm3 - .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 + .byte 196,226,125,48,216 // vpmovzxbw %xmm0,%ymm3 + .byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0 .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0 - .byte 197,237,219,21,107,14,0,0 // vpand 0xe6b(%rip),%ymm2,%ymm2 # 1820 <_sk_xor__hsw_8bit+0x25b> + .byte 197,237,219,21,25,11,0,0 // vpand 0xb19(%rip),%ymm2,%ymm2 # 1480 <_sk_xor__hsw_8bit+0x205> .byte 197,237,213,224 // vpmullw %ymm0,%ymm2,%ymm4 .byte 197,237,213,211 // vpmullw %ymm3,%ymm2,%ymm2 .byte 197,237,253,211 // vpaddw %ymm3,%ymm2,%ymm2 .byte 197,221,253,192 // vpaddw %ymm0,%ymm4,%ymm0 .byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0 .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2 - .byte 196,227,125,57,211,1 // vextracti128 $0x1,%ymm2,%xmm3 - .byte 197,249,111,37,147,16,0,0 // vmovdqa 0x1093(%rip),%xmm4 # 1a70 <_sk_xor__hsw_8bit+0x4ab> - 
.byte 196,226,97,0,220 // vpshufb %xmm4,%xmm3,%xmm3 - .byte 196,226,105,0,212 // vpshufb %xmm4,%xmm2,%xmm2 - .byte 197,233,108,211 // vpunpcklqdq %xmm3,%xmm2,%xmm2 - .byte 196,227,125,57,195,1 // vextracti128 $0x1,%ymm0,%xmm3 - .byte 196,226,97,0,220 // vpshufb %xmm4,%xmm3,%xmm3 - .byte 196,226,121,0,196 // vpshufb %xmm4,%xmm0,%xmm0 - .byte 197,249,108,195 // vpunpcklqdq %xmm3,%xmm0,%xmm0 - .byte 196,227,125,56,194,1 // vinserti128 $0x1,%xmm2,%ymm0,%ymm0 + .byte 196,227,109,56,216,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm3 + .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + .byte 197,229,103,192 // vpackuswb %ymm0,%ymm3,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -51154,16 +51134,16 @@ _sk_scale_u8_hsw_8bit: .byte 72,15,175,209 // imul %rcx,%rdx .byte 72,3,16 // add (%rax),%rdx .byte 77,133,201 // test %r9,%r9 - .byte 15,133,144,0,0,0 // jne aba <_sk_scale_u8_hsw_8bit+0xb1> + .byte 117,106 // jne a1c <_sk_scale_u8_hsw_8bit+0x87> .byte 196,162,121,48,20,2 // vpmovzxbw (%rdx,%r8,1),%xmm2 - .byte 197,233,219,21,72,16,0,0 // vpand 0x1048(%rip),%xmm2,%xmm2 # 1a80 <_sk_xor__hsw_8bit+0x4bb> + .byte 197,233,219,21,240,12,0,0 // vpand 0xcf0(%rip),%xmm2,%xmm2 # 16b0 <_sk_xor__hsw_8bit+0x435> .byte 196,226,125,51,210 // vpmovzxwd %xmm2,%ymm2 - .byte 196,226,109,0,21,250,13,0,0 // vpshufb 0xdfa(%rip),%ymm2,%ymm2 # 1840 <_sk_xor__hsw_8bit+0x27b> - .byte 196,227,125,57,195,1 // vextracti128 $0x1,%ymm0,%xmm3 - .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 + .byte 196,226,109,0,21,210,10,0,0 // vpshufb 0xad2(%rip),%ymm2,%ymm2 # 14a0 <_sk_xor__hsw_8bit+0x225> + .byte 196,226,125,48,216 // vpmovzxbw %xmm0,%ymm3 + .byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0 .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0 - .byte 196,227,125,57,212,1 // vextracti128 $0x1,%ymm2,%xmm4 - .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4 + .byte 196,226,125,48,226 // vpmovzxbw %xmm2,%ymm4 + .byte 196,227,125,57,210,1 // vextracti128 
$0x1,%ymm2,%xmm2 .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2 .byte 197,237,213,208 // vpmullw %ymm0,%ymm2,%ymm2 .byte 197,221,213,227 // vpmullw %ymm3,%ymm4,%ymm4 @@ -51171,31 +51151,24 @@ _sk_scale_u8_hsw_8bit: .byte 197,237,253,192 // vpaddw %ymm0,%ymm2,%ymm0 .byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0 .byte 197,237,113,211,8 // vpsrlw $0x8,%ymm3,%ymm2 - .byte 196,227,125,57,211,1 // vextracti128 $0x1,%ymm2,%xmm3 - .byte 197,249,111,37,2,16,0,0 // vmovdqa 0x1002(%rip),%xmm4 # 1a90 <_sk_xor__hsw_8bit+0x4cb> - .byte 196,226,97,0,220 // vpshufb %xmm4,%xmm3,%xmm3 - .byte 196,226,105,0,212 // vpshufb %xmm4,%xmm2,%xmm2 - .byte 197,233,108,211 // vpunpcklqdq %xmm3,%xmm2,%xmm2 - .byte 196,227,125,57,195,1 // vextracti128 $0x1,%ymm0,%xmm3 - .byte 196,226,97,0,220 // vpshufb %xmm4,%xmm3,%xmm3 - .byte 196,226,121,0,196 // vpshufb %xmm4,%xmm0,%xmm0 - .byte 197,249,108,195 // vpunpcklqdq %xmm3,%xmm0,%xmm0 - .byte 196,227,125,56,194,1 // vinserti128 $0x1,%xmm2,%ymm0,%ymm0 + .byte 196,227,109,56,216,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm3 + .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + .byte 197,229,103,192 // vpackuswb %ymm0,%ymm3,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax .byte 65,128,225,7 // and $0x7,%r9b .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2 .byte 65,254,201 // dec %r9b .byte 65,128,249,6 // cmp $0x6,%r9b - .byte 15,135,97,255,255,255 // ja a30 <_sk_scale_u8_hsw_8bit+0x27> + .byte 119,139 // ja 9b8 <_sk_scale_u8_hsw_8bit+0x23> .byte 65,15,182,193 // movzbl %r9b,%eax - .byte 72,141,13,122,0,0,0 // lea 0x7a(%rip),%rcx # b54 <_sk_scale_u8_hsw_8bit+0x14b> + .byte 72,141,13,124,0,0,0 // lea 0x7c(%rip),%rcx # ab4 <_sk_scale_u8_hsw_8bit+0x11f> .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax .byte 72,1,200 // add %rcx,%rax .byte 255,224 // jmpq *%rax .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax .byte 197,249,110,208 // vmovd %eax,%xmm2 - .byte 233,63,255,255,255 // jmpq a30 <_sk_scale_u8_hsw_8bit+0x27> + 
.byte 233,105,255,255,255 // jmpq 9b8 <_sk_scale_u8_hsw_8bit+0x23> .byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2 .byte 197,233,196,208,2 // vpinsrw $0x2,%eax,%xmm2,%xmm2 @@ -51203,7 +51176,7 @@ _sk_scale_u8_hsw_8bit: .byte 197,249,110,216 // vmovd %eax,%xmm3 .byte 196,226,121,48,219 // vpmovzxbw %xmm3,%xmm3 .byte 196,227,105,2,211,1 // vpblendd $0x1,%xmm3,%xmm2,%xmm2 - .byte 233,23,255,255,255 // jmpq a30 <_sk_scale_u8_hsw_8bit+0x27> + .byte 233,65,255,255,255 // jmpq 9b8 <_sk_scale_u8_hsw_8bit+0x23> .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2 .byte 197,233,196,208,6 // vpinsrw $0x6,%eax,%xmm2,%xmm2 @@ -51214,23 +51187,22 @@ _sk_scale_u8_hsw_8bit: .byte 196,161,121,110,28,2 // vmovd (%rdx,%r8,1),%xmm3 .byte 196,226,121,48,219 // vpmovzxbw %xmm3,%xmm3 .byte 196,227,97,2,210,12 // vpblendd $0xc,%xmm2,%xmm3,%xmm2 - .byte 233,220,254,255,255 // jmpq a30 <_sk_scale_u8_hsw_8bit+0x27> - .byte 143 // (bad) - .byte 255 // (bad) + .byte 233,6,255,255,255 // jmpq 9b8 <_sk_scale_u8_hsw_8bit+0x23> + .byte 102,144 // xchg %ax,%ax + .byte 141 // (bad) .byte 255 // (bad) - .byte 255,172,255,255,255,157,255 // ljmp *-0x620001(%rdi,%rdi,8) .byte 255 // (bad) + .byte 255,170,255,255,255,155 // ljmp *-0x64000001(%rdx) .byte 255 // (bad) - .byte 234 // (bad) .byte 255 // (bad) .byte 255 // (bad) + .byte 232,255,255,255,221 // callq ffffffffde000ac4 <_sk_xor__hsw_8bit+0xffffffffddfff849> .byte 255 // (bad) - .byte 223,255 // (bad) .byte 255 // (bad) - .byte 255,212 // callq *%rsp + .byte 255,210 // callq *%rdx .byte 255 // (bad) .byte 255 // (bad) - .byte 255,197 // inc %ebp + .byte 255,195 // inc %ebx .byte 255 // (bad) .byte 255 // (bad) .byte 255 // .byte 0xff @@ -51241,54 +51213,41 @@ FUNCTION(_sk_lerp_1_float_hsw_8bit) _sk_lerp_1_float_hsw_8bit: .byte 72,173 // lods %ds:(%rsi),%rax .byte 197,250,16,16 // vmovss (%rax),%xmm2 - .byte 197,234,89,21,98,11,0,0 // vmulss 
0xb62(%rip),%xmm2,%xmm2 # 16e0 <_sk_xor__hsw_8bit+0x11b> + .byte 197,234,89,21,118,8,0,0 // vmulss 0x876(%rip),%xmm2,%xmm2 # 1354 <_sk_xor__hsw_8bit+0xd9> .byte 197,250,44,194 // vcvttss2si %xmm2,%eax .byte 197,249,110,208 // vmovd %eax,%xmm2 .byte 196,226,125,120,210 // vpbroadcastb %xmm2,%ymm2 - .byte 196,227,125,57,195,1 // vextracti128 $0x1,%ymm0,%xmm3 - .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 + .byte 196,226,125,48,216 // vpmovzxbw %xmm0,%ymm3 + .byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0 .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0 - .byte 197,237,219,37,189,12,0,0 // vpand 0xcbd(%rip),%ymm2,%ymm4 # 1860 <_sk_xor__hsw_8bit+0x29b> + .byte 197,237,219,37,189,9,0,0 // vpand 0x9bd(%rip),%ymm2,%ymm4 # 14c0 <_sk_xor__hsw_8bit+0x245> .byte 197,221,213,232 // vpmullw %ymm0,%ymm4,%ymm5 .byte 197,221,213,227 // vpmullw %ymm3,%ymm4,%ymm4 .byte 197,221,253,219 // vpaddw %ymm3,%ymm4,%ymm3 .byte 197,213,253,192 // vpaddw %ymm0,%ymm5,%ymm0 - .byte 197,221,113,208,8 // vpsrlw $0x8,%ymm0,%ymm4 + .byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0 .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3 - .byte 196,227,125,57,221,1 // vextracti128 $0x1,%ymm3,%xmm5 - .byte 197,249,111,5,213,14,0,0 // vmovdqa 0xed5(%rip),%xmm0 # 1aa0 <_sk_xor__hsw_8bit+0x4db> - .byte 196,226,81,0,232 // vpshufb %xmm0,%xmm5,%xmm5 - .byte 196,226,97,0,216 // vpshufb %xmm0,%xmm3,%xmm3 - .byte 197,225,108,221 // vpunpcklqdq %xmm5,%xmm3,%xmm3 - .byte 196,227,125,57,229,1 // vextracti128 $0x1,%ymm4,%xmm5 - .byte 196,226,81,0,232 // vpshufb %xmm0,%xmm5,%xmm5 - .byte 196,226,89,0,224 // vpshufb %xmm0,%xmm4,%xmm4 - .byte 197,217,108,229 // vpunpcklqdq %xmm5,%xmm4,%xmm4 - .byte 196,227,93,56,219,1 // vinserti128 $0x1,%xmm3,%ymm4,%ymm3 - .byte 197,221,118,228 // vpcmpeqd %ymm4,%ymm4,%ymm4 - .byte 197,237,239,212 // vpxor %ymm4,%ymm2,%ymm2 + .byte 196,227,101,56,224,1 // vinserti128 $0x1,%xmm0,%ymm3,%ymm4 + .byte 196,227,101,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm3,%ymm0 + .byte 
197,221,103,192 // vpackuswb %ymm0,%ymm4,%ymm0 + .byte 197,229,118,219 // vpcmpeqd %ymm3,%ymm3,%ymm3 + .byte 197,237,239,211 // vpxor %ymm3,%ymm2,%ymm2 + .byte 196,226,125,48,217 // vpmovzxbw %xmm1,%ymm3 .byte 196,227,125,57,204,1 // vextracti128 $0x1,%ymm1,%xmm4 .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4 - .byte 196,226,125,48,233 // vpmovzxbw %xmm1,%ymm5 - .byte 196,227,125,57,214,1 // vextracti128 $0x1,%ymm2,%xmm6 - .byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6 + .byte 196,226,125,48,234 // vpmovzxbw %xmm2,%ymm5 + .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2 .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2 - .byte 197,237,213,213 // vpmullw %ymm5,%ymm2,%ymm2 - .byte 197,205,213,244 // vpmullw %ymm4,%ymm6,%ymm6 - .byte 197,205,253,228 // vpaddw %ymm4,%ymm6,%ymm4 - .byte 197,237,253,213 // vpaddw %ymm5,%ymm2,%ymm2 + .byte 197,237,213,212 // vpmullw %ymm4,%ymm2,%ymm2 + .byte 197,213,213,235 // vpmullw %ymm3,%ymm5,%ymm5 + .byte 197,213,253,219 // vpaddw %ymm3,%ymm5,%ymm3 + .byte 197,237,253,212 // vpaddw %ymm4,%ymm2,%ymm2 .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2 - .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4 - .byte 196,227,125,57,229,1 // vextracti128 $0x1,%ymm4,%xmm5 - .byte 196,226,81,0,232 // vpshufb %xmm0,%xmm5,%xmm5 - .byte 196,226,89,0,224 // vpshufb %xmm0,%xmm4,%xmm4 - .byte 197,217,108,229 // vpunpcklqdq %xmm5,%xmm4,%xmm4 - .byte 196,227,125,57,213,1 // vextracti128 $0x1,%ymm2,%xmm5 - .byte 196,226,81,0,232 // vpshufb %xmm0,%xmm5,%xmm5 - .byte 196,226,105,0,192 // vpshufb %xmm0,%xmm2,%xmm0 - .byte 197,249,108,197 // vpunpcklqdq %xmm5,%xmm0,%xmm0 - .byte 196,227,125,56,196,1 // vinserti128 $0x1,%xmm4,%ymm0,%ymm0 - .byte 197,253,252,195 // vpaddb %ymm3,%ymm0,%ymm0 + .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3 + .byte 196,227,101,56,226,1 // vinserti128 $0x1,%xmm2,%ymm3,%ymm4 + .byte 196,227,101,70,210,49 // vperm2i128 $0x31,%ymm2,%ymm3,%ymm2 + .byte 197,221,103,210 // vpackuswb %ymm2,%ymm4,%ymm2 + .byte 
197,237,252,192 // vpaddb %ymm0,%ymm2,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -51304,72 +51263,59 @@ _sk_lerp_u8_hsw_8bit: .byte 72,15,175,209 // imul %rcx,%rdx .byte 72,3,16 // add (%rax),%rdx .byte 77,133,201 // test %r9,%r9 - .byte 15,133,4,1,0,0 // jne d90 <_sk_lerp_u8_hsw_8bit+0x125> + .byte 15,133,192,0,0,0 // jne c68 <_sk_lerp_u8_hsw_8bit+0xe1> .byte 196,162,121,48,20,2 // vpmovzxbw (%rdx,%r8,1),%xmm2 - .byte 197,233,219,21,22,14,0,0 // vpand 0xe16(%rip),%xmm2,%xmm2 # 1ab0 <_sk_xor__hsw_8bit+0x4eb> + .byte 197,233,219,21,10,11,0,0 // vpand 0xb0a(%rip),%xmm2,%xmm2 # 16c0 <_sk_xor__hsw_8bit+0x445> .byte 196,226,125,51,210 // vpmovzxwd %xmm2,%ymm2 - .byte 196,226,109,0,21,216,11,0,0 // vpshufb 0xbd8(%rip),%ymm2,%ymm2 # 1880 <_sk_xor__hsw_8bit+0x2bb> - .byte 196,227,125,57,195,1 // vextracti128 $0x1,%ymm0,%xmm3 - .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 + .byte 196,226,109,0,21,28,9,0,0 // vpshufb 0x91c(%rip),%ymm2,%ymm2 # 14e0 <_sk_xor__hsw_8bit+0x265> + .byte 196,226,125,48,216 // vpmovzxbw %xmm0,%ymm3 + .byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0 .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0 - .byte 196,227,125,57,212,1 // vextracti128 $0x1,%ymm2,%xmm4 - .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4 - .byte 196,226,125,48,234 // vpmovzxbw %xmm2,%ymm5 + .byte 196,226,125,48,226 // vpmovzxbw %xmm2,%ymm4 + .byte 196,227,125,57,213,1 // vextracti128 $0x1,%ymm2,%xmm5 + .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5 .byte 197,213,213,232 // vpmullw %ymm0,%ymm5,%ymm5 .byte 197,221,213,227 // vpmullw %ymm3,%ymm4,%ymm4 .byte 197,221,253,219 // vpaddw %ymm3,%ymm4,%ymm3 .byte 197,213,253,192 // vpaddw %ymm0,%ymm5,%ymm0 - .byte 197,221,113,208,8 // vpsrlw $0x8,%ymm0,%ymm4 + .byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0 .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3 - .byte 196,227,125,57,221,1 // vextracti128 $0x1,%ymm3,%xmm5 - .byte 197,249,111,5,208,13,0,0 // vmovdqa 0xdd0(%rip),%xmm0 # 1ac0 
<_sk_xor__hsw_8bit+0x4fb> - .byte 196,226,81,0,232 // vpshufb %xmm0,%xmm5,%xmm5 - .byte 196,226,97,0,216 // vpshufb %xmm0,%xmm3,%xmm3 - .byte 197,225,108,221 // vpunpcklqdq %xmm5,%xmm3,%xmm3 - .byte 196,227,125,57,229,1 // vextracti128 $0x1,%ymm4,%xmm5 - .byte 196,226,81,0,232 // vpshufb %xmm0,%xmm5,%xmm5 - .byte 196,226,89,0,224 // vpshufb %xmm0,%xmm4,%xmm4 - .byte 197,217,108,229 // vpunpcklqdq %xmm5,%xmm4,%xmm4 - .byte 196,227,93,56,219,1 // vinserti128 $0x1,%xmm3,%ymm4,%ymm3 - .byte 197,221,118,228 // vpcmpeqd %ymm4,%ymm4,%ymm4 - .byte 197,237,239,212 // vpxor %ymm4,%ymm2,%ymm2 + .byte 196,227,101,56,224,1 // vinserti128 $0x1,%xmm0,%ymm3,%ymm4 + .byte 196,227,101,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm3,%ymm0 + .byte 197,221,103,192 // vpackuswb %ymm0,%ymm4,%ymm0 + .byte 197,229,118,219 // vpcmpeqd %ymm3,%ymm3,%ymm3 + .byte 197,237,239,211 // vpxor %ymm3,%ymm2,%ymm2 + .byte 196,226,125,48,217 // vpmovzxbw %xmm1,%ymm3 .byte 196,227,125,57,204,1 // vextracti128 $0x1,%ymm1,%xmm4 .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4 - .byte 196,226,125,48,233 // vpmovzxbw %xmm1,%ymm5 - .byte 196,227,125,57,214,1 // vextracti128 $0x1,%ymm2,%xmm6 - .byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6 + .byte 196,226,125,48,234 // vpmovzxbw %xmm2,%ymm5 + .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2 .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2 - .byte 197,237,213,213 // vpmullw %ymm5,%ymm2,%ymm2 - .byte 197,205,213,244 // vpmullw %ymm4,%ymm6,%ymm6 - .byte 197,205,253,228 // vpaddw %ymm4,%ymm6,%ymm4 - .byte 197,237,253,213 // vpaddw %ymm5,%ymm2,%ymm2 + .byte 197,237,213,212 // vpmullw %ymm4,%ymm2,%ymm2 + .byte 197,213,213,235 // vpmullw %ymm3,%ymm5,%ymm5 + .byte 197,213,253,219 // vpaddw %ymm3,%ymm5,%ymm3 + .byte 197,237,253,212 // vpaddw %ymm4,%ymm2,%ymm2 .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2 - .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4 - .byte 196,227,125,57,229,1 // vextracti128 $0x1,%ymm4,%xmm5 - .byte 196,226,81,0,232 // vpshufb 
%xmm0,%xmm5,%xmm5 - .byte 196,226,89,0,224 // vpshufb %xmm0,%xmm4,%xmm4 - .byte 197,217,108,229 // vpunpcklqdq %xmm5,%xmm4,%xmm4 - .byte 196,227,125,57,213,1 // vextracti128 $0x1,%ymm2,%xmm5 - .byte 196,226,81,0,232 // vpshufb %xmm0,%xmm5,%xmm5 - .byte 196,226,105,0,192 // vpshufb %xmm0,%xmm2,%xmm0 - .byte 197,249,108,197 // vpunpcklqdq %xmm5,%xmm0,%xmm0 - .byte 196,227,125,56,196,1 // vinserti128 $0x1,%xmm4,%ymm0,%ymm0 - .byte 197,253,252,195 // vpaddb %ymm3,%ymm0,%ymm0 + .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3 + .byte 196,227,101,56,226,1 // vinserti128 $0x1,%xmm2,%ymm3,%ymm4 + .byte 196,227,101,70,210,49 // vperm2i128 $0x31,%ymm2,%ymm3,%ymm2 + .byte 197,221,103,210 // vpackuswb %ymm2,%ymm4,%ymm2 + .byte 197,237,252,192 // vpaddb %ymm0,%ymm2,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax .byte 65,128,225,7 // and $0x7,%r9b .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2 .byte 65,254,201 // dec %r9b .byte 65,128,249,6 // cmp $0x6,%r9b - .byte 15,135,237,254,255,255 // ja c92 <_sk_lerp_u8_hsw_8bit+0x27> + .byte 15,135,49,255,255,255 // ja bae <_sk_lerp_u8_hsw_8bit+0x27> .byte 65,15,182,193 // movzbl %r9b,%eax - .byte 72,141,13,124,0,0,0 // lea 0x7c(%rip),%rcx # e2c <_sk_lerp_u8_hsw_8bit+0x1c1> + .byte 72,141,13,124,0,0,0 // lea 0x7c(%rip),%rcx # d04 <_sk_lerp_u8_hsw_8bit+0x17d> .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax .byte 72,1,200 // add %rcx,%rax .byte 255,224 // jmpq *%rax .byte 66,15,182,4,2 // movzbl (%rdx,%r8,1),%eax .byte 197,249,110,208 // vmovd %eax,%xmm2 - .byte 233,203,254,255,255 // jmpq c92 <_sk_lerp_u8_hsw_8bit+0x27> + .byte 233,15,255,255,255 // jmpq bae <_sk_lerp_u8_hsw_8bit+0x27> .byte 66,15,182,68,2,2 // movzbl 0x2(%rdx,%r8,1),%eax .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2 .byte 197,233,196,208,2 // vpinsrw $0x2,%eax,%xmm2,%xmm2 @@ -51377,7 +51323,7 @@ _sk_lerp_u8_hsw_8bit: .byte 197,249,110,216 // vmovd %eax,%xmm3 .byte 196,226,121,48,219 // vpmovzxbw %xmm3,%xmm3 .byte 196,227,105,2,211,1 // 
vpblendd $0x1,%xmm3,%xmm2,%xmm2 - .byte 233,163,254,255,255 // jmpq c92 <_sk_lerp_u8_hsw_8bit+0x27> + .byte 233,231,254,255,255 // jmpq bae <_sk_lerp_u8_hsw_8bit+0x27> .byte 66,15,182,68,2,6 // movzbl 0x6(%rdx,%r8,1),%eax .byte 197,233,239,210 // vpxor %xmm2,%xmm2,%xmm2 .byte 197,233,196,208,6 // vpinsrw $0x6,%eax,%xmm2,%xmm2 @@ -51388,7 +51334,7 @@ _sk_lerp_u8_hsw_8bit: .byte 196,161,121,110,28,2 // vmovd (%rdx,%r8,1),%xmm3 .byte 196,226,121,48,219 // vpmovzxbw %xmm3,%xmm3 .byte 196,227,97,2,210,12 // vpblendd $0xc,%xmm2,%xmm3,%xmm2 - .byte 233,104,254,255,255 // jmpq c92 <_sk_lerp_u8_hsw_8bit+0x27> + .byte 233,172,254,255,255 // jmpq bae <_sk_lerp_u8_hsw_8bit+0x27> .byte 102,144 // xchg %ax,%ax .byte 141 // (bad) .byte 255 // (bad) @@ -51397,7 +51343,7 @@ _sk_lerp_u8_hsw_8bit: .byte 255 // (bad) .byte 255 // (bad) .byte 255 // (bad) - .byte 232,255,255,255,221 // callq ffffffffde000e3c <_sk_xor__hsw_8bit+0xffffffffddfff877> + .byte 232,255,255,255,221 // callq ffffffffde000d14 <_sk_xor__hsw_8bit+0xffffffffddfffa99> .byte 255 // (bad) .byte 255 // (bad) .byte 255,210 // callq *%rdx @@ -51429,7 +51375,7 @@ HIDDEN _sk_black_color_hsw_8bit FUNCTION(_sk_black_color_hsw_8bit) _sk_black_color_hsw_8bit: .byte 72,173 // lods %ds:(%rsi),%rax - .byte 196,226,125,25,5,45,13,0,0 // vbroadcastsd 0xd2d(%rip),%ymm0 # 1b90 <_sk_xor__hsw_8bit+0x5cb> + .byte 196,226,125,25,5,149,9,0,0 // vbroadcastsd 0x995(%rip),%ymm0 # 16d0 <_sk_xor__hsw_8bit+0x455> .byte 255,224 // jmpq *%rax HIDDEN _sk_white_color_hsw_8bit @@ -51452,55 +51398,42 @@ HIDDEN _sk_srcatop_hsw_8bit .globl _sk_srcatop_hsw_8bit FUNCTION(_sk_srcatop_hsw_8bit) _sk_srcatop_hsw_8bit: - .byte 197,253,111,29,35,10,0,0 // vmovdqa 0xa23(%rip),%ymm3 # 18a0 <_sk_xor__hsw_8bit+0x2db> - .byte 196,226,117,0,211 // vpshufb %ymm3,%ymm1,%ymm2 - .byte 196,227,125,57,196,1 // vextracti128 $0x1,%ymm0,%xmm4 - .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4 - .byte 196,226,125,48,232 // vpmovzxbw %xmm0,%ymm5 - .byte 196,227,125,57,214,1 
// vextracti128 $0x1,%ymm2,%xmm6 - .byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6 - .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2 - .byte 197,237,213,213 // vpmullw %ymm5,%ymm2,%ymm2 + .byte 197,253,111,21,171,7,0,0 // vmovdqa 0x7ab(%rip),%ymm2 # 1500 <_sk_xor__hsw_8bit+0x285> + .byte 196,226,117,0,218 // vpshufb %ymm2,%ymm1,%ymm3 + .byte 196,226,125,48,224 // vpmovzxbw %xmm0,%ymm4 + .byte 196,227,125,57,197,1 // vextracti128 $0x1,%ymm0,%xmm5 + .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5 + .byte 196,226,125,48,243 // vpmovzxbw %xmm3,%ymm6 + .byte 196,227,125,57,219,1 // vextracti128 $0x1,%ymm3,%xmm3 + .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 + .byte 197,229,213,221 // vpmullw %ymm5,%ymm3,%ymm3 .byte 197,205,213,244 // vpmullw %ymm4,%ymm6,%ymm6 .byte 197,205,253,228 // vpaddw %ymm4,%ymm6,%ymm4 - .byte 197,237,253,213 // vpaddw %ymm5,%ymm2,%ymm2 - .byte 197,213,113,210,8 // vpsrlw $0x8,%ymm2,%ymm5 + .byte 197,229,253,221 // vpaddw %ymm5,%ymm3,%ymm3 + .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3 .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4 - .byte 196,227,125,57,230,1 // vextracti128 $0x1,%ymm4,%xmm6 - .byte 197,249,111,21,6,12,0,0 // vmovdqa 0xc06(%rip),%xmm2 # 1ad0 <_sk_xor__hsw_8bit+0x50b> - .byte 196,226,73,0,242 // vpshufb %xmm2,%xmm6,%xmm6 - .byte 196,226,89,0,226 // vpshufb %xmm2,%xmm4,%xmm4 - .byte 197,217,108,230 // vpunpcklqdq %xmm6,%xmm4,%xmm4 - .byte 196,227,125,57,238,1 // vextracti128 $0x1,%ymm5,%xmm6 - .byte 196,226,73,0,242 // vpshufb %xmm2,%xmm6,%xmm6 - .byte 196,226,81,0,234 // vpshufb %xmm2,%xmm5,%xmm5 - .byte 197,209,108,238 // vpunpcklqdq %xmm6,%xmm5,%xmm5 - .byte 196,227,85,56,228,1 // vinserti128 $0x1,%xmm4,%ymm5,%ymm4 - .byte 196,226,125,0,195 // vpshufb %ymm3,%ymm0,%ymm0 - .byte 197,229,118,219 // vpcmpeqd %ymm3,%ymm3,%ymm3 - .byte 197,253,239,195 // vpxor %ymm3,%ymm0,%ymm0 - .byte 196,227,125,57,203,1 // vextracti128 $0x1,%ymm1,%xmm3 - .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 - .byte 
196,226,125,48,233 // vpmovzxbw %xmm1,%ymm5 - .byte 196,227,125,57,198,1 // vextracti128 $0x1,%ymm0,%xmm6 - .byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6 + .byte 196,227,93,56,235,1 // vinserti128 $0x1,%xmm3,%ymm4,%ymm5 + .byte 196,227,93,70,219,49 // vperm2i128 $0x31,%ymm3,%ymm4,%ymm3 + .byte 197,213,103,219 // vpackuswb %ymm3,%ymm5,%ymm3 + .byte 196,226,125,0,194 // vpshufb %ymm2,%ymm0,%ymm0 + .byte 197,237,118,210 // vpcmpeqd %ymm2,%ymm2,%ymm2 + .byte 197,253,239,194 // vpxor %ymm2,%ymm0,%ymm0 + .byte 196,226,125,48,209 // vpmovzxbw %xmm1,%ymm2 + .byte 196,227,125,57,204,1 // vextracti128 $0x1,%ymm1,%xmm4 + .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4 + .byte 196,226,125,48,232 // vpmovzxbw %xmm0,%ymm5 + .byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0 .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0 - .byte 197,253,213,197 // vpmullw %ymm5,%ymm0,%ymm0 - .byte 197,205,213,243 // vpmullw %ymm3,%ymm6,%ymm6 - .byte 197,205,253,219 // vpaddw %ymm3,%ymm6,%ymm3 - .byte 197,253,253,197 // vpaddw %ymm5,%ymm0,%ymm0 + .byte 197,253,213,196 // vpmullw %ymm4,%ymm0,%ymm0 + .byte 197,213,213,234 // vpmullw %ymm2,%ymm5,%ymm5 + .byte 197,213,253,210 // vpaddw %ymm2,%ymm5,%ymm2 + .byte 197,253,253,196 // vpaddw %ymm4,%ymm0,%ymm0 .byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0 - .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3 - .byte 196,227,125,57,221,1 // vextracti128 $0x1,%ymm3,%xmm5 - .byte 196,226,81,0,234 // vpshufb %xmm2,%xmm5,%xmm5 - .byte 196,226,97,0,218 // vpshufb %xmm2,%xmm3,%xmm3 - .byte 197,225,108,221 // vpunpcklqdq %xmm5,%xmm3,%xmm3 - .byte 196,227,125,57,197,1 // vextracti128 $0x1,%ymm0,%xmm5 - .byte 196,226,81,0,234 // vpshufb %xmm2,%xmm5,%xmm5 - .byte 196,226,121,0,194 // vpshufb %xmm2,%xmm0,%xmm0 - .byte 197,249,108,197 // vpunpcklqdq %xmm5,%xmm0,%xmm0 - .byte 196,227,125,56,195,1 // vinserti128 $0x1,%xmm3,%ymm0,%ymm0 - .byte 197,221,252,192 // vpaddb %ymm0,%ymm4,%ymm0 + .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2 + .byte 
196,227,109,56,224,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm4 + .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + .byte 197,221,103,192 // vpackuswb %ymm0,%ymm4,%ymm0 + .byte 197,253,252,195 // vpaddb %ymm3,%ymm0,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -51508,55 +51441,42 @@ HIDDEN _sk_dstatop_hsw_8bit .globl _sk_dstatop_hsw_8bit FUNCTION(_sk_dstatop_hsw_8bit) _sk_dstatop_hsw_8bit: - .byte 197,253,111,29,73,9,0,0 // vmovdqa 0x949(%rip),%ymm3 # 18c0 <_sk_xor__hsw_8bit+0x2fb> - .byte 196,226,125,0,211 // vpshufb %ymm3,%ymm0,%ymm2 - .byte 196,227,125,57,204,1 // vextracti128 $0x1,%ymm1,%xmm4 - .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4 - .byte 196,226,125,48,233 // vpmovzxbw %xmm1,%ymm5 - .byte 196,227,125,57,214,1 // vextracti128 $0x1,%ymm2,%xmm6 - .byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6 - .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2 - .byte 197,237,213,213 // vpmullw %ymm5,%ymm2,%ymm2 + .byte 197,253,111,21,21,7,0,0 // vmovdqa 0x715(%rip),%ymm2 # 1520 <_sk_xor__hsw_8bit+0x2a5> + .byte 196,226,125,0,218 // vpshufb %ymm2,%ymm0,%ymm3 + .byte 196,226,125,48,225 // vpmovzxbw %xmm1,%ymm4 + .byte 196,227,125,57,205,1 // vextracti128 $0x1,%ymm1,%xmm5 + .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5 + .byte 196,226,125,48,243 // vpmovzxbw %xmm3,%ymm6 + .byte 196,227,125,57,219,1 // vextracti128 $0x1,%ymm3,%xmm3 + .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 + .byte 197,229,213,221 // vpmullw %ymm5,%ymm3,%ymm3 .byte 197,205,213,244 // vpmullw %ymm4,%ymm6,%ymm6 .byte 197,205,253,228 // vpaddw %ymm4,%ymm6,%ymm4 - .byte 197,237,253,213 // vpaddw %ymm5,%ymm2,%ymm2 - .byte 197,213,113,210,8 // vpsrlw $0x8,%ymm2,%ymm5 + .byte 197,229,253,221 // vpaddw %ymm5,%ymm3,%ymm3 + .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3 .byte 197,221,113,212,8 // vpsrlw $0x8,%ymm4,%ymm4 - .byte 196,227,125,57,230,1 // vextracti128 $0x1,%ymm4,%xmm6 - .byte 197,249,111,21,28,11,0,0 // vmovdqa 0xb1c(%rip),%xmm2 # 1ae0 
<_sk_xor__hsw_8bit+0x51b> - .byte 196,226,73,0,242 // vpshufb %xmm2,%xmm6,%xmm6 - .byte 196,226,89,0,226 // vpshufb %xmm2,%xmm4,%xmm4 - .byte 197,217,108,230 // vpunpcklqdq %xmm6,%xmm4,%xmm4 - .byte 196,227,125,57,238,1 // vextracti128 $0x1,%ymm5,%xmm6 - .byte 196,226,73,0,242 // vpshufb %xmm2,%xmm6,%xmm6 - .byte 196,226,81,0,234 // vpshufb %xmm2,%xmm5,%xmm5 - .byte 197,209,108,238 // vpunpcklqdq %xmm6,%xmm5,%xmm5 - .byte 196,227,85,56,228,1 // vinserti128 $0x1,%xmm4,%ymm5,%ymm4 - .byte 196,226,117,0,219 // vpshufb %ymm3,%ymm1,%ymm3 - .byte 197,213,118,237 // vpcmpeqd %ymm5,%ymm5,%ymm5 - .byte 197,229,239,221 // vpxor %ymm5,%ymm3,%ymm3 - .byte 196,227,125,57,197,1 // vextracti128 $0x1,%ymm0,%xmm5 - .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5 + .byte 196,227,93,56,235,1 // vinserti128 $0x1,%xmm3,%ymm4,%ymm5 + .byte 196,227,93,70,219,49 // vperm2i128 $0x31,%ymm3,%ymm4,%ymm3 + .byte 197,213,103,219 // vpackuswb %ymm3,%ymm5,%ymm3 + .byte 196,226,117,0,210 // vpshufb %ymm2,%ymm1,%ymm2 + .byte 197,221,118,228 // vpcmpeqd %ymm4,%ymm4,%ymm4 + .byte 197,237,239,212 // vpxor %ymm4,%ymm2,%ymm2 + .byte 196,226,125,48,224 // vpmovzxbw %xmm0,%ymm4 + .byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0 .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0 - .byte 196,227,125,57,222,1 // vextracti128 $0x1,%ymm3,%xmm6 - .byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6 - .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 - .byte 197,229,213,216 // vpmullw %ymm0,%ymm3,%ymm3 - .byte 197,205,213,245 // vpmullw %ymm5,%ymm6,%ymm6 - .byte 197,205,253,237 // vpaddw %ymm5,%ymm6,%ymm5 - .byte 197,229,253,192 // vpaddw %ymm0,%ymm3,%ymm0 + .byte 196,226,125,48,234 // vpmovzxbw %xmm2,%ymm5 + .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2 + .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2 + .byte 197,237,213,208 // vpmullw %ymm0,%ymm2,%ymm2 + .byte 197,213,213,236 // vpmullw %ymm4,%ymm5,%ymm5 + .byte 197,213,253,228 // vpaddw %ymm4,%ymm5,%ymm4 + .byte 197,237,253,192 // 
vpaddw %ymm0,%ymm2,%ymm0 .byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0 - .byte 197,229,113,213,8 // vpsrlw $0x8,%ymm5,%ymm3 - .byte 196,227,125,57,221,1 // vextracti128 $0x1,%ymm3,%xmm5 - .byte 196,226,81,0,234 // vpshufb %xmm2,%xmm5,%xmm5 - .byte 196,226,97,0,218 // vpshufb %xmm2,%xmm3,%xmm3 - .byte 197,225,108,221 // vpunpcklqdq %xmm5,%xmm3,%xmm3 - .byte 196,227,125,57,197,1 // vextracti128 $0x1,%ymm0,%xmm5 - .byte 196,226,81,0,234 // vpshufb %xmm2,%xmm5,%xmm5 - .byte 196,226,121,0,194 // vpshufb %xmm2,%xmm0,%xmm0 - .byte 197,249,108,197 // vpunpcklqdq %xmm5,%xmm0,%xmm0 - .byte 196,227,125,56,195,1 // vinserti128 $0x1,%xmm3,%ymm0,%ymm0 - .byte 197,253,252,196 // vpaddb %ymm4,%ymm0,%ymm0 + .byte 197,237,113,212,8 // vpsrlw $0x8,%ymm4,%ymm2 + .byte 196,227,109,56,224,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm4 + .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + .byte 197,221,103,192 // vpackuswb %ymm0,%ymm4,%ymm0 + .byte 197,253,252,195 // vpaddb %ymm3,%ymm0,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -51564,12 +51484,12 @@ HIDDEN _sk_srcin_hsw_8bit .globl _sk_srcin_hsw_8bit FUNCTION(_sk_srcin_hsw_8bit) _sk_srcin_hsw_8bit: - .byte 196,226,117,0,21,110,8,0,0 // vpshufb 0x86e(%rip),%ymm1,%ymm2 # 18e0 <_sk_xor__hsw_8bit+0x31b> - .byte 196,227,125,57,195,1 // vextracti128 $0x1,%ymm0,%xmm3 - .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 + .byte 196,226,117,0,21,126,6,0,0 // vpshufb 0x67e(%rip),%ymm1,%ymm2 # 1540 <_sk_xor__hsw_8bit+0x2c5> + .byte 196,226,125,48,216 // vpmovzxbw %xmm0,%ymm3 + .byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0 .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0 - .byte 196,227,125,57,212,1 // vextracti128 $0x1,%ymm2,%xmm4 - .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4 + .byte 196,226,125,48,226 // vpmovzxbw %xmm2,%ymm4 + .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2 .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2 .byte 197,237,213,208 // vpmullw 
%ymm0,%ymm2,%ymm2 .byte 197,221,213,227 // vpmullw %ymm3,%ymm4,%ymm4 @@ -51577,16 +51497,9 @@ _sk_srcin_hsw_8bit: .byte 197,237,253,192 // vpaddw %ymm0,%ymm2,%ymm0 .byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0 .byte 197,237,113,211,8 // vpsrlw $0x8,%ymm3,%ymm2 - .byte 196,227,125,57,211,1 // vextracti128 $0x1,%ymm2,%xmm3 - .byte 197,249,111,37,54,10,0,0 // vmovdqa 0xa36(%rip),%xmm4 # 1af0 <_sk_xor__hsw_8bit+0x52b> - .byte 196,226,97,0,220 // vpshufb %xmm4,%xmm3,%xmm3 - .byte 196,226,105,0,212 // vpshufb %xmm4,%xmm2,%xmm2 - .byte 197,233,108,211 // vpunpcklqdq %xmm3,%xmm2,%xmm2 - .byte 196,227,125,57,195,1 // vextracti128 $0x1,%ymm0,%xmm3 - .byte 196,226,97,0,220 // vpshufb %xmm4,%xmm3,%xmm3 - .byte 196,226,121,0,196 // vpshufb %xmm4,%xmm0,%xmm0 - .byte 197,249,108,195 // vpunpcklqdq %xmm3,%xmm0,%xmm0 - .byte 196,227,125,56,194,1 // vinserti128 $0x1,%xmm2,%ymm0,%ymm0 + .byte 196,227,109,56,216,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm3 + .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + .byte 197,229,103,192 // vpackuswb %ymm0,%ymm3,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -51594,12 +51507,12 @@ HIDDEN _sk_dstin_hsw_8bit .globl _sk_dstin_hsw_8bit FUNCTION(_sk_dstin_hsw_8bit) _sk_dstin_hsw_8bit: - .byte 196,226,125,0,5,17,8,0,0 // vpshufb 0x811(%rip),%ymm0,%ymm0 # 1900 <_sk_xor__hsw_8bit+0x33b> - .byte 196,227,125,57,202,1 // vextracti128 $0x1,%ymm1,%xmm2 - .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2 - .byte 196,226,125,48,217 // vpmovzxbw %xmm1,%ymm3 - .byte 196,227,125,57,196,1 // vextracti128 $0x1,%ymm0,%xmm4 - .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4 + .byte 196,226,125,0,5,71,6,0,0 // vpshufb 0x647(%rip),%ymm0,%ymm0 # 1560 <_sk_xor__hsw_8bit+0x2e5> + .byte 196,226,125,48,209 // vpmovzxbw %xmm1,%ymm2 + .byte 196,227,125,57,203,1 // vextracti128 $0x1,%ymm1,%xmm3 + .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 + .byte 196,226,125,48,224 // vpmovzxbw %xmm0,%ymm4 + .byte 196,227,125,57,192,1 // 
vextracti128 $0x1,%ymm0,%xmm0 .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0 .byte 197,253,213,195 // vpmullw %ymm3,%ymm0,%ymm0 .byte 197,221,213,226 // vpmullw %ymm2,%ymm4,%ymm4 @@ -51607,16 +51520,9 @@ _sk_dstin_hsw_8bit: .byte 197,253,253,195 // vpaddw %ymm3,%ymm0,%ymm0 .byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0 .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2 - .byte 196,227,125,57,211,1 // vextracti128 $0x1,%ymm2,%xmm3 - .byte 197,249,111,37,201,9,0,0 // vmovdqa 0x9c9(%rip),%xmm4 # 1b00 <_sk_xor__hsw_8bit+0x53b> - .byte 196,226,97,0,220 // vpshufb %xmm4,%xmm3,%xmm3 - .byte 196,226,105,0,212 // vpshufb %xmm4,%xmm2,%xmm2 - .byte 197,233,108,211 // vpunpcklqdq %xmm3,%xmm2,%xmm2 - .byte 196,227,125,57,195,1 // vextracti128 $0x1,%ymm0,%xmm3 - .byte 196,226,97,0,220 // vpshufb %xmm4,%xmm3,%xmm3 - .byte 196,226,121,0,196 // vpshufb %xmm4,%xmm0,%xmm0 - .byte 197,249,108,195 // vpunpcklqdq %xmm3,%xmm0,%xmm0 - .byte 196,227,125,56,194,1 // vinserti128 $0x1,%xmm2,%ymm0,%ymm0 + .byte 196,227,109,56,216,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm3 + .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + .byte 197,229,103,192 // vpackuswb %ymm0,%ymm3,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -51624,14 +51530,14 @@ HIDDEN _sk_srcout_hsw_8bit .globl _sk_srcout_hsw_8bit FUNCTION(_sk_srcout_hsw_8bit) _sk_srcout_hsw_8bit: - .byte 196,226,117,0,21,180,7,0,0 // vpshufb 0x7b4(%rip),%ymm1,%ymm2 # 1920 <_sk_xor__hsw_8bit+0x35b> + .byte 196,226,117,0,21,16,6,0,0 // vpshufb 0x610(%rip),%ymm1,%ymm2 # 1580 <_sk_xor__hsw_8bit+0x305> .byte 197,229,118,219 // vpcmpeqd %ymm3,%ymm3,%ymm3 .byte 197,237,239,211 // vpxor %ymm3,%ymm2,%ymm2 - .byte 196,227,125,57,195,1 // vextracti128 $0x1,%ymm0,%xmm3 - .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 + .byte 196,226,125,48,216 // vpmovzxbw %xmm0,%ymm3 + .byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0 .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0 - .byte 196,227,125,57,212,1 // 
vextracti128 $0x1,%ymm2,%xmm4 - .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4 + .byte 196,226,125,48,226 // vpmovzxbw %xmm2,%ymm4 + .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2 .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2 .byte 197,237,213,208 // vpmullw %ymm0,%ymm2,%ymm2 .byte 197,221,213,227 // vpmullw %ymm3,%ymm4,%ymm4 @@ -51639,16 +51545,9 @@ _sk_srcout_hsw_8bit: .byte 197,237,253,192 // vpaddw %ymm0,%ymm2,%ymm0 .byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0 .byte 197,237,113,211,8 // vpsrlw $0x8,%ymm3,%ymm2 - .byte 196,227,125,57,211,1 // vextracti128 $0x1,%ymm2,%xmm3 - .byte 197,249,111,37,84,9,0,0 // vmovdqa 0x954(%rip),%xmm4 # 1b10 <_sk_xor__hsw_8bit+0x54b> - .byte 196,226,97,0,220 // vpshufb %xmm4,%xmm3,%xmm3 - .byte 196,226,105,0,212 // vpshufb %xmm4,%xmm2,%xmm2 - .byte 197,233,108,211 // vpunpcklqdq %xmm3,%xmm2,%xmm2 - .byte 196,227,125,57,195,1 // vextracti128 $0x1,%ymm0,%xmm3 - .byte 196,226,97,0,220 // vpshufb %xmm4,%xmm3,%xmm3 - .byte 196,226,121,0,196 // vpshufb %xmm4,%xmm0,%xmm0 - .byte 197,249,108,195 // vpunpcklqdq %xmm3,%xmm0,%xmm0 - .byte 196,227,125,56,194,1 // vinserti128 $0x1,%xmm2,%ymm0,%ymm0 + .byte 196,227,109,56,216,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm3 + .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + .byte 197,229,103,192 // vpackuswb %ymm0,%ymm3,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -51656,14 +51555,14 @@ HIDDEN _sk_dstout_hsw_8bit .globl _sk_dstout_hsw_8bit FUNCTION(_sk_dstout_hsw_8bit) _sk_dstout_hsw_8bit: - .byte 196,226,125,0,5,79,7,0,0 // vpshufb 0x74f(%rip),%ymm0,%ymm0 # 1940 <_sk_xor__hsw_8bit+0x37b> + .byte 196,226,125,0,5,209,5,0,0 // vpshufb 0x5d1(%rip),%ymm0,%ymm0 # 15a0 <_sk_xor__hsw_8bit+0x325> .byte 197,237,118,210 // vpcmpeqd %ymm2,%ymm2,%ymm2 .byte 197,253,239,194 // vpxor %ymm2,%ymm0,%ymm0 - .byte 196,227,125,57,202,1 // vextracti128 $0x1,%ymm1,%xmm2 - .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2 - .byte 196,226,125,48,217 // 
vpmovzxbw %xmm1,%ymm3 - .byte 196,227,125,57,196,1 // vextracti128 $0x1,%ymm0,%xmm4 - .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4 + .byte 196,226,125,48,209 // vpmovzxbw %xmm1,%ymm2 + .byte 196,227,125,57,203,1 // vextracti128 $0x1,%ymm1,%xmm3 + .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 + .byte 196,226,125,48,224 // vpmovzxbw %xmm0,%ymm4 + .byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0 .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0 .byte 197,253,213,195 // vpmullw %ymm3,%ymm0,%ymm0 .byte 197,221,213,226 // vpmullw %ymm2,%ymm4,%ymm4 @@ -51671,16 +51570,9 @@ _sk_dstout_hsw_8bit: .byte 197,253,253,195 // vpaddw %ymm3,%ymm0,%ymm0 .byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0 .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2 - .byte 196,227,125,57,211,1 // vextracti128 $0x1,%ymm2,%xmm3 - .byte 197,249,111,37,223,8,0,0 // vmovdqa 0x8df(%rip),%xmm4 # 1b20 <_sk_xor__hsw_8bit+0x55b> - .byte 196,226,97,0,220 // vpshufb %xmm4,%xmm3,%xmm3 - .byte 196,226,105,0,212 // vpshufb %xmm4,%xmm2,%xmm2 - .byte 197,233,108,211 // vpunpcklqdq %xmm3,%xmm2,%xmm2 - .byte 196,227,125,57,195,1 // vextracti128 $0x1,%ymm0,%xmm3 - .byte 196,226,97,0,220 // vpshufb %xmm4,%xmm3,%xmm3 - .byte 196,226,121,0,196 // vpshufb %xmm4,%xmm0,%xmm0 - .byte 197,249,108,195 // vpunpcklqdq %xmm3,%xmm0,%xmm0 - .byte 196,227,125,56,194,1 // vinserti128 $0x1,%xmm2,%ymm0,%ymm0 + .byte 196,227,109,56,216,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm3 + .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + .byte 197,229,103,192 // vpackuswb %ymm0,%ymm3,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -51688,14 +51580,14 @@ HIDDEN _sk_srcover_hsw_8bit .globl _sk_srcover_hsw_8bit FUNCTION(_sk_srcover_hsw_8bit) _sk_srcover_hsw_8bit: - .byte 196,226,125,0,21,234,6,0,0 // vpshufb 0x6ea(%rip),%ymm0,%ymm2 # 1960 <_sk_xor__hsw_8bit+0x39b> + .byte 196,226,125,0,21,146,5,0,0 // vpshufb 0x592(%rip),%ymm0,%ymm2 # 15c0 <_sk_xor__hsw_8bit+0x345> .byte 
197,229,118,219 // vpcmpeqd %ymm3,%ymm3,%ymm3 .byte 197,237,239,211 // vpxor %ymm3,%ymm2,%ymm2 - .byte 196,227,125,57,203,1 // vextracti128 $0x1,%ymm1,%xmm3 - .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 - .byte 196,226,125,48,225 // vpmovzxbw %xmm1,%ymm4 - .byte 196,227,125,57,213,1 // vextracti128 $0x1,%ymm2,%xmm5 - .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5 + .byte 196,226,125,48,217 // vpmovzxbw %xmm1,%ymm3 + .byte 196,227,125,57,204,1 // vextracti128 $0x1,%ymm1,%xmm4 + .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4 + .byte 196,226,125,48,234 // vpmovzxbw %xmm2,%ymm5 + .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2 .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2 .byte 197,237,213,212 // vpmullw %ymm4,%ymm2,%ymm2 .byte 197,213,213,235 // vpmullw %ymm3,%ymm5,%ymm5 @@ -51703,16 +51595,9 @@ _sk_srcover_hsw_8bit: .byte 197,237,253,212 // vpaddw %ymm4,%ymm2,%ymm2 .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2 .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3 - .byte 196,227,125,57,220,1 // vextracti128 $0x1,%ymm3,%xmm4 - .byte 197,249,111,45,106,8,0,0 // vmovdqa 0x86a(%rip),%xmm5 # 1b30 <_sk_xor__hsw_8bit+0x56b> - .byte 196,226,89,0,229 // vpshufb %xmm5,%xmm4,%xmm4 - .byte 196,226,97,0,221 // vpshufb %xmm5,%xmm3,%xmm3 - .byte 197,225,108,220 // vpunpcklqdq %xmm4,%xmm3,%xmm3 - .byte 196,227,125,57,212,1 // vextracti128 $0x1,%ymm2,%xmm4 - .byte 196,226,89,0,229 // vpshufb %xmm5,%xmm4,%xmm4 - .byte 196,226,105,0,213 // vpshufb %xmm5,%xmm2,%xmm2 - .byte 197,233,108,212 // vpunpcklqdq %xmm4,%xmm2,%xmm2 - .byte 196,227,109,56,211,1 // vinserti128 $0x1,%xmm3,%ymm2,%ymm2 + .byte 196,227,101,56,226,1 // vinserti128 $0x1,%xmm2,%ymm3,%ymm4 + .byte 196,227,101,70,210,49 // vperm2i128 $0x31,%ymm2,%ymm3,%ymm2 + .byte 197,221,103,210 // vpackuswb %ymm2,%ymm4,%ymm2 .byte 197,237,252,192 // vpaddb %ymm0,%ymm2,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -51721,14 +51606,14 @@ HIDDEN _sk_dstover_hsw_8bit .globl 
_sk_dstover_hsw_8bit FUNCTION(_sk_dstover_hsw_8bit) _sk_dstover_hsw_8bit: - .byte 196,226,117,0,21,129,6,0,0 // vpshufb 0x681(%rip),%ymm1,%ymm2 # 1980 <_sk_xor__hsw_8bit+0x3bb> + .byte 196,226,117,0,21,79,5,0,0 // vpshufb 0x54f(%rip),%ymm1,%ymm2 # 15e0 <_sk_xor__hsw_8bit+0x365> .byte 197,229,118,219 // vpcmpeqd %ymm3,%ymm3,%ymm3 .byte 197,237,239,211 // vpxor %ymm3,%ymm2,%ymm2 - .byte 196,227,125,57,195,1 // vextracti128 $0x1,%ymm0,%xmm3 - .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 + .byte 196,226,125,48,216 // vpmovzxbw %xmm0,%ymm3 + .byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0 .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0 - .byte 196,227,125,57,212,1 // vextracti128 $0x1,%ymm2,%xmm4 - .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4 + .byte 196,226,125,48,226 // vpmovzxbw %xmm2,%ymm4 + .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2 .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2 .byte 197,237,213,208 // vpmullw %ymm0,%ymm2,%ymm2 .byte 197,221,213,227 // vpmullw %ymm3,%ymm4,%ymm4 @@ -51736,16 +51621,9 @@ _sk_dstover_hsw_8bit: .byte 197,237,253,192 // vpaddw %ymm0,%ymm2,%ymm0 .byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0 .byte 197,237,113,211,8 // vpsrlw $0x8,%ymm3,%ymm2 - .byte 196,227,125,57,211,1 // vextracti128 $0x1,%ymm2,%xmm3 - .byte 197,249,111,37,241,7,0,0 // vmovdqa 0x7f1(%rip),%xmm4 # 1b40 <_sk_xor__hsw_8bit+0x57b> - .byte 196,226,97,0,220 // vpshufb %xmm4,%xmm3,%xmm3 - .byte 196,226,105,0,212 // vpshufb %xmm4,%xmm2,%xmm2 - .byte 197,233,108,211 // vpunpcklqdq %xmm3,%xmm2,%xmm2 - .byte 196,227,125,57,195,1 // vextracti128 $0x1,%ymm0,%xmm3 - .byte 196,226,97,0,220 // vpshufb %xmm4,%xmm3,%xmm3 - .byte 196,226,121,0,196 // vpshufb %xmm4,%xmm0,%xmm0 - .byte 197,249,108,195 // vpunpcklqdq %xmm3,%xmm0,%xmm0 - .byte 196,227,125,56,194,1 // vinserti128 $0x1,%xmm2,%ymm0,%ymm0 + .byte 196,227,109,56,216,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm3 + .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + .byte 
197,229,103,192 // vpackuswb %ymm0,%ymm3,%ymm0 .byte 197,253,252,193 // vpaddb %ymm1,%ymm0,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -51754,28 +51632,21 @@ HIDDEN _sk_modulate_hsw_8bit .globl _sk_modulate_hsw_8bit FUNCTION(_sk_modulate_hsw_8bit) _sk_modulate_hsw_8bit: - .byte 196,227,125,57,194,1 // vextracti128 $0x1,%ymm0,%xmm2 - .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2 + .byte 196,226,125,48,208 // vpmovzxbw %xmm0,%ymm2 + .byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0 .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0 - .byte 196,227,125,57,203,1 // vextracti128 $0x1,%ymm1,%xmm3 - .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 - .byte 196,226,125,48,225 // vpmovzxbw %xmm1,%ymm4 + .byte 196,226,125,48,217 // vpmovzxbw %xmm1,%ymm3 + .byte 196,227,125,57,204,1 // vextracti128 $0x1,%ymm1,%xmm4 + .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4 .byte 197,221,213,224 // vpmullw %ymm0,%ymm4,%ymm4 .byte 197,229,213,218 // vpmullw %ymm2,%ymm3,%ymm3 .byte 197,229,253,210 // vpaddw %ymm2,%ymm3,%ymm2 .byte 197,221,253,192 // vpaddw %ymm0,%ymm4,%ymm0 .byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0 .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2 - .byte 196,227,125,57,211,1 // vextracti128 $0x1,%ymm2,%xmm3 - .byte 197,249,111,37,137,7,0,0 // vmovdqa 0x789(%rip),%xmm4 # 1b50 <_sk_xor__hsw_8bit+0x58b> - .byte 196,226,97,0,220 // vpshufb %xmm4,%xmm3,%xmm3 - .byte 196,226,105,0,212 // vpshufb %xmm4,%xmm2,%xmm2 - .byte 197,233,108,211 // vpunpcklqdq %xmm3,%xmm2,%xmm2 - .byte 196,227,125,57,195,1 // vextracti128 $0x1,%ymm0,%xmm3 - .byte 196,226,97,0,220 // vpshufb %xmm4,%xmm3,%xmm3 - .byte 196,226,121,0,196 // vpshufb %xmm4,%xmm0,%xmm0 - .byte 197,249,108,195 // vpunpcklqdq %xmm3,%xmm0,%xmm0 - .byte 196,227,125,56,194,1 // vinserti128 $0x1,%xmm2,%ymm0,%ymm0 + .byte 196,227,109,56,216,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm3 + .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + .byte 197,229,103,192 // 
vpackuswb %ymm0,%ymm3,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -51783,72 +51654,53 @@ HIDDEN _sk_multiply_hsw_8bit .globl _sk_multiply_hsw_8bit FUNCTION(_sk_multiply_hsw_8bit) _sk_multiply_hsw_8bit: - .byte 197,125,111,13,165,5,0,0 // vmovdqa 0x5a5(%rip),%ymm9 # 19a0 <_sk_xor__hsw_8bit+0x3db> - .byte 196,194,117,0,209 // vpshufb %ymm9,%ymm1,%ymm2 - .byte 196,65,45,118,210 // vpcmpeqd %ymm10,%ymm10,%ymm10 - .byte 196,193,109,239,226 // vpxor %ymm10,%ymm2,%ymm4 - .byte 196,227,125,57,194,1 // vextracti128 $0x1,%ymm0,%xmm2 - .byte 196,98,125,48,218 // vpmovzxbw %xmm2,%ymm11 - .byte 196,226,125,48,216 // vpmovzxbw %xmm0,%ymm3 - .byte 196,227,125,57,229,1 // vextracti128 $0x1,%ymm4,%xmm5 - .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5 - .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4 - .byte 197,221,213,227 // vpmullw %ymm3,%ymm4,%ymm4 - .byte 196,193,85,213,235 // vpmullw %ymm11,%ymm5,%ymm5 - .byte 196,193,85,253,235 // vpaddw %ymm11,%ymm5,%ymm5 - .byte 197,221,253,227 // vpaddw %ymm3,%ymm4,%ymm4 - .byte 197,189,113,212,8 // vpsrlw $0x8,%ymm4,%ymm8 - .byte 197,213,113,213,8 // vpsrlw $0x8,%ymm5,%ymm5 - .byte 196,227,125,57,238,1 // vextracti128 $0x1,%ymm5,%xmm6 - .byte 197,249,111,37,12,7,0,0 // vmovdqa 0x70c(%rip),%xmm4 # 1b60 <_sk_xor__hsw_8bit+0x59b> - .byte 196,226,73,0,244 // vpshufb %xmm4,%xmm6,%xmm6 - .byte 196,226,81,0,236 // vpshufb %xmm4,%xmm5,%xmm5 - .byte 197,209,108,238 // vpunpcklqdq %xmm6,%xmm5,%xmm5 - .byte 196,99,125,57,198,1 // vextracti128 $0x1,%ymm8,%xmm6 - .byte 196,226,73,0,244 // vpshufb %xmm4,%xmm6,%xmm6 - .byte 196,226,57,0,252 // vpshufb %xmm4,%xmm8,%xmm7 - .byte 197,193,108,246 // vpunpcklqdq %xmm6,%xmm7,%xmm6 - .byte 196,99,77,56,197,1 // vinserti128 $0x1,%xmm5,%ymm6,%ymm8 - .byte 196,194,125,0,193 // vpshufb %ymm9,%ymm0,%ymm0 - .byte 196,193,125,239,194 // vpxor %ymm10,%ymm0,%ymm0 - .byte 196,227,125,57,206,1 // vextracti128 $0x1,%ymm1,%xmm6 + .byte 197,253,111,37,191,4,0,0 // vmovdqa 0x4bf(%rip),%ymm4 
# 1600 <_sk_xor__hsw_8bit+0x385> + .byte 196,226,117,0,212 // vpshufb %ymm4,%ymm1,%ymm2 + .byte 197,213,118,237 // vpcmpeqd %ymm5,%ymm5,%ymm5 + .byte 197,237,239,245 // vpxor %ymm5,%ymm2,%ymm6 + .byte 196,226,125,48,208 // vpmovzxbw %xmm0,%ymm2 + .byte 196,227,125,57,195,1 // vextracti128 $0x1,%ymm0,%xmm3 + .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 + .byte 196,226,125,48,254 // vpmovzxbw %xmm6,%ymm7 + .byte 196,227,125,57,246,1 // vextracti128 $0x1,%ymm6,%xmm6 .byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6 - .byte 196,226,125,48,249 // vpmovzxbw %xmm1,%ymm7 - .byte 196,227,125,57,197,1 // vextracti128 $0x1,%ymm0,%xmm5 + .byte 197,205,213,243 // vpmullw %ymm3,%ymm6,%ymm6 + .byte 197,197,213,250 // vpmullw %ymm2,%ymm7,%ymm7 + .byte 197,197,253,250 // vpaddw %ymm2,%ymm7,%ymm7 + .byte 197,205,253,243 // vpaddw %ymm3,%ymm6,%ymm6 + .byte 197,205,113,214,8 // vpsrlw $0x8,%ymm6,%ymm6 + .byte 197,197,113,215,8 // vpsrlw $0x8,%ymm7,%ymm7 + .byte 196,99,69,56,198,1 // vinserti128 $0x1,%xmm6,%ymm7,%ymm8 + .byte 196,227,69,70,246,49 // vperm2i128 $0x31,%ymm6,%ymm7,%ymm6 + .byte 197,189,103,246 // vpackuswb %ymm6,%ymm8,%ymm6 + .byte 196,226,125,0,196 // vpshufb %ymm4,%ymm0,%ymm0 + .byte 197,253,239,197 // vpxor %ymm5,%ymm0,%ymm0 + .byte 196,226,125,48,225 // vpmovzxbw %xmm1,%ymm4 + .byte 196,227,125,57,205,1 // vextracti128 $0x1,%ymm1,%xmm5 .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5 + .byte 196,226,125,48,248 // vpmovzxbw %xmm0,%ymm7 + .byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0 .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0 - .byte 197,253,213,199 // vpmullw %ymm7,%ymm0,%ymm0 - .byte 197,213,213,238 // vpmullw %ymm6,%ymm5,%ymm5 - .byte 197,213,253,238 // vpaddw %ymm6,%ymm5,%ymm5 - .byte 197,253,253,199 // vpaddw %ymm7,%ymm0,%ymm0 + .byte 197,253,213,197 // vpmullw %ymm5,%ymm0,%ymm0 + .byte 197,197,213,252 // vpmullw %ymm4,%ymm7,%ymm7 + .byte 197,197,253,252 // vpaddw %ymm4,%ymm7,%ymm7 + .byte 197,253,253,197 // vpaddw %ymm5,%ymm0,%ymm0 .byte 
197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0 - .byte 197,213,113,213,8 // vpsrlw $0x8,%ymm5,%ymm5 - .byte 196,227,125,57,234,1 // vextracti128 $0x1,%ymm5,%xmm2 - .byte 196,226,105,0,212 // vpshufb %xmm4,%xmm2,%xmm2 - .byte 196,226,81,0,236 // vpshufb %xmm4,%xmm5,%xmm5 - .byte 197,209,108,210 // vpunpcklqdq %xmm2,%xmm5,%xmm2 - .byte 196,227,125,57,197,1 // vextracti128 $0x1,%ymm0,%xmm5 - .byte 196,226,81,0,236 // vpshufb %xmm4,%xmm5,%xmm5 - .byte 196,226,121,0,196 // vpshufb %xmm4,%xmm0,%xmm0 - .byte 197,249,108,197 // vpunpcklqdq %xmm5,%xmm0,%xmm0 - .byte 196,227,125,56,194,1 // vinserti128 $0x1,%xmm2,%ymm0,%ymm0 - .byte 197,197,213,211 // vpmullw %ymm3,%ymm7,%ymm2 - .byte 196,193,77,213,235 // vpmullw %ymm11,%ymm6,%ymm5 - .byte 196,193,85,253,235 // vpaddw %ymm11,%ymm5,%ymm5 - .byte 197,237,253,211 // vpaddw %ymm3,%ymm2,%ymm2 + .byte 197,197,113,215,8 // vpsrlw $0x8,%ymm7,%ymm7 + .byte 196,99,69,56,192,1 // vinserti128 $0x1,%xmm0,%ymm7,%ymm8 + .byte 196,227,69,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm7,%ymm0 + .byte 197,189,103,192 // vpackuswb %ymm0,%ymm8,%ymm0 + .byte 197,253,252,198 // vpaddb %ymm6,%ymm0,%ymm0 + .byte 197,213,213,235 // vpmullw %ymm3,%ymm5,%ymm5 + .byte 197,221,213,226 // vpmullw %ymm2,%ymm4,%ymm4 + .byte 197,221,253,210 // vpaddw %ymm2,%ymm4,%ymm2 + .byte 197,213,253,219 // vpaddw %ymm3,%ymm5,%ymm3 + .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3 .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2 - .byte 197,229,113,213,8 // vpsrlw $0x8,%ymm5,%ymm3 - .byte 196,227,125,57,221,1 // vextracti128 $0x1,%ymm3,%xmm5 - .byte 196,226,81,0,236 // vpshufb %xmm4,%xmm5,%xmm5 - .byte 196,226,97,0,220 // vpshufb %xmm4,%xmm3,%xmm3 - .byte 197,225,108,221 // vpunpcklqdq %xmm5,%xmm3,%xmm3 - .byte 196,227,125,57,213,1 // vextracti128 $0x1,%ymm2,%xmm5 - .byte 196,226,81,0,236 // vpshufb %xmm4,%xmm5,%xmm5 - .byte 196,226,105,0,212 // vpshufb %xmm4,%xmm2,%xmm2 - .byte 197,233,108,213 // vpunpcklqdq %xmm5,%xmm2,%xmm2 - .byte 196,227,109,56,211,1 // vinserti128 
$0x1,%xmm3,%ymm2,%ymm2 - .byte 197,237,252,192 // vpaddb %ymm0,%ymm2,%ymm0 - .byte 196,193,125,252,192 // vpaddb %ymm8,%ymm0,%ymm0 + .byte 196,227,109,56,227,1 // vinserti128 $0x1,%xmm3,%ymm2,%ymm4 + .byte 196,227,109,70,211,49 // vperm2i128 $0x31,%ymm3,%ymm2,%ymm2 + .byte 197,221,103,210 // vpackuswb %ymm2,%ymm4,%ymm2 + .byte 197,253,252,194 // vpaddb %ymm2,%ymm0,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -51858,28 +51710,21 @@ FUNCTION(_sk_screen_hsw_8bit) _sk_screen_hsw_8bit: .byte 197,237,118,210 // vpcmpeqd %ymm2,%ymm2,%ymm2 .byte 197,253,239,210 // vpxor %ymm2,%ymm0,%ymm2 - .byte 196,227,125,57,211,1 // vextracti128 $0x1,%ymm2,%xmm3 - .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 + .byte 196,226,125,48,218 // vpmovzxbw %xmm2,%ymm3 + .byte 196,227,125,57,210,1 // vextracti128 $0x1,%ymm2,%xmm2 .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2 - .byte 196,227,125,57,204,1 // vextracti128 $0x1,%ymm1,%xmm4 - .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4 - .byte 196,226,125,48,233 // vpmovzxbw %xmm1,%ymm5 + .byte 196,226,125,48,225 // vpmovzxbw %xmm1,%ymm4 + .byte 196,227,125,57,205,1 // vextracti128 $0x1,%ymm1,%xmm5 + .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5 .byte 197,213,213,234 // vpmullw %ymm2,%ymm5,%ymm5 .byte 197,221,213,227 // vpmullw %ymm3,%ymm4,%ymm4 .byte 197,221,253,219 // vpaddw %ymm3,%ymm4,%ymm3 .byte 197,213,253,210 // vpaddw %ymm2,%ymm5,%ymm2 .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2 .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3 - .byte 196,227,125,57,220,1 // vextracti128 $0x1,%ymm3,%xmm4 - .byte 197,249,111,45,219,5,0,0 // vmovdqa 0x5db(%rip),%xmm5 # 1b70 <_sk_xor__hsw_8bit+0x5ab> - .byte 196,226,89,0,229 // vpshufb %xmm5,%xmm4,%xmm4 - .byte 196,226,97,0,221 // vpshufb %xmm5,%xmm3,%xmm3 - .byte 197,225,108,220 // vpunpcklqdq %xmm4,%xmm3,%xmm3 - .byte 196,227,125,57,212,1 // vextracti128 $0x1,%ymm2,%xmm4 - .byte 196,226,89,0,229 // vpshufb %xmm5,%xmm4,%xmm4 - .byte 196,226,105,0,213 // 
vpshufb %xmm5,%xmm2,%xmm2 - .byte 197,233,108,212 // vpunpcklqdq %xmm4,%xmm2,%xmm2 - .byte 196,227,109,56,211,1 // vinserti128 $0x1,%xmm3,%ymm2,%ymm2 + .byte 196,227,101,56,226,1 // vinserti128 $0x1,%xmm2,%ymm3,%ymm4 + .byte 196,227,101,70,210,49 // vperm2i128 $0x31,%ymm2,%ymm3,%ymm2 + .byte 197,221,103,210 // vpackuswb %ymm2,%ymm4,%ymm2 .byte 197,237,252,192 // vpaddb %ymm0,%ymm2,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -51888,64 +51733,51 @@ HIDDEN _sk_xor__hsw_8bit .globl _sk_xor__hsw_8bit FUNCTION(_sk_xor__hsw_8bit) _sk_xor__hsw_8bit: - .byte 197,253,111,29,243,3,0,0 // vmovdqa 0x3f3(%rip),%ymm3 # 19c0 <_sk_xor__hsw_8bit+0x3fb> - .byte 196,226,117,0,211 // vpshufb %ymm3,%ymm1,%ymm2 + .byte 197,253,111,21,157,3,0,0 // vmovdqa 0x39d(%rip),%ymm2 # 1620 <_sk_xor__hsw_8bit+0x3a5> + .byte 196,226,117,0,218 // vpshufb %ymm2,%ymm1,%ymm3 .byte 197,221,118,228 // vpcmpeqd %ymm4,%ymm4,%ymm4 - .byte 197,237,239,212 // vpxor %ymm4,%ymm2,%ymm2 - .byte 196,227,125,57,197,1 // vextracti128 $0x1,%ymm0,%xmm5 - .byte 196,226,125,48,237 // vpmovzxbw %xmm5,%ymm5 - .byte 196,226,125,48,240 // vpmovzxbw %xmm0,%ymm6 - .byte 196,227,125,57,215,1 // vextracti128 $0x1,%ymm2,%xmm7 - .byte 196,226,125,48,255 // vpmovzxbw %xmm7,%ymm7 - .byte 196,226,125,48,210 // vpmovzxbw %xmm2,%ymm2 - .byte 197,237,213,214 // vpmullw %ymm6,%ymm2,%ymm2 + .byte 197,229,239,220 // vpxor %ymm4,%ymm3,%ymm3 + .byte 196,226,125,48,232 // vpmovzxbw %xmm0,%ymm5 + .byte 196,227,125,57,198,1 // vextracti128 $0x1,%ymm0,%xmm6 + .byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6 + .byte 196,226,125,48,251 // vpmovzxbw %xmm3,%ymm7 + .byte 196,227,125,57,219,1 // vextracti128 $0x1,%ymm3,%xmm3 + .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 + .byte 197,229,213,222 // vpmullw %ymm6,%ymm3,%ymm3 .byte 197,197,213,253 // vpmullw %ymm5,%ymm7,%ymm7 .byte 197,197,253,237 // vpaddw %ymm5,%ymm7,%ymm5 - .byte 197,237,253,214 // vpaddw %ymm6,%ymm2,%ymm2 - .byte 197,205,113,210,8 // vpsrlw 
$0x8,%ymm2,%ymm6 + .byte 197,229,253,222 // vpaddw %ymm6,%ymm3,%ymm3 + .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3 .byte 197,213,113,213,8 // vpsrlw $0x8,%ymm5,%ymm5 - .byte 196,227,125,57,239,1 // vextracti128 $0x1,%ymm5,%xmm7 - .byte 197,249,111,21,94,5,0,0 // vmovdqa 0x55e(%rip),%xmm2 # 1b80 <_sk_xor__hsw_8bit+0x5bb> - .byte 196,226,65,0,250 // vpshufb %xmm2,%xmm7,%xmm7 - .byte 196,226,81,0,234 // vpshufb %xmm2,%xmm5,%xmm5 - .byte 197,209,108,239 // vpunpcklqdq %xmm7,%xmm5,%xmm5 - .byte 196,227,125,57,247,1 // vextracti128 $0x1,%ymm6,%xmm7 - .byte 196,226,65,0,250 // vpshufb %xmm2,%xmm7,%xmm7 - .byte 196,226,73,0,242 // vpshufb %xmm2,%xmm6,%xmm6 - .byte 197,201,108,247 // vpunpcklqdq %xmm7,%xmm6,%xmm6 - .byte 196,227,77,56,237,1 // vinserti128 $0x1,%xmm5,%ymm6,%ymm5 - .byte 196,226,125,0,195 // vpshufb %ymm3,%ymm0,%ymm0 + .byte 196,227,85,56,243,1 // vinserti128 $0x1,%xmm3,%ymm5,%ymm6 + .byte 196,227,85,70,219,49 // vperm2i128 $0x31,%ymm3,%ymm5,%ymm3 + .byte 197,205,103,219 // vpackuswb %ymm3,%ymm6,%ymm3 + .byte 196,226,125,0,194 // vpshufb %ymm2,%ymm0,%ymm0 .byte 197,253,239,196 // vpxor %ymm4,%ymm0,%ymm0 - .byte 196,227,125,57,203,1 // vextracti128 $0x1,%ymm1,%xmm3 - .byte 196,226,125,48,219 // vpmovzxbw %xmm3,%ymm3 - .byte 196,226,125,48,225 // vpmovzxbw %xmm1,%ymm4 - .byte 196,227,125,57,198,1 // vextracti128 $0x1,%ymm0,%xmm6 - .byte 196,226,125,48,246 // vpmovzxbw %xmm6,%ymm6 + .byte 196,226,125,48,209 // vpmovzxbw %xmm1,%ymm2 + .byte 196,227,125,57,204,1 // vextracti128 $0x1,%ymm1,%xmm4 + .byte 196,226,125,48,228 // vpmovzxbw %xmm4,%ymm4 + .byte 196,226,125,48,232 // vpmovzxbw %xmm0,%ymm5 + .byte 196,227,125,57,192,1 // vextracti128 $0x1,%ymm0,%xmm0 .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0 .byte 197,253,213,196 // vpmullw %ymm4,%ymm0,%ymm0 - .byte 197,205,213,243 // vpmullw %ymm3,%ymm6,%ymm6 - .byte 197,205,253,219 // vpaddw %ymm3,%ymm6,%ymm3 + .byte 197,213,213,234 // vpmullw %ymm2,%ymm5,%ymm5 + .byte 197,213,253,210 // vpaddw 
%ymm2,%ymm5,%ymm2 .byte 197,253,253,196 // vpaddw %ymm4,%ymm0,%ymm0 .byte 197,253,113,208,8 // vpsrlw $0x8,%ymm0,%ymm0 - .byte 197,229,113,211,8 // vpsrlw $0x8,%ymm3,%ymm3 - .byte 196,227,125,57,220,1 // vextracti128 $0x1,%ymm3,%xmm4 - .byte 196,226,89,0,226 // vpshufb %xmm2,%xmm4,%xmm4 - .byte 196,226,97,0,218 // vpshufb %xmm2,%xmm3,%xmm3 - .byte 197,225,108,220 // vpunpcklqdq %xmm4,%xmm3,%xmm3 - .byte 196,227,125,57,196,1 // vextracti128 $0x1,%ymm0,%xmm4 - .byte 196,226,89,0,226 // vpshufb %xmm2,%xmm4,%xmm4 - .byte 196,226,121,0,194 // vpshufb %xmm2,%xmm0,%xmm0 - .byte 197,249,108,196 // vpunpcklqdq %xmm4,%xmm0,%xmm0 - .byte 196,227,125,56,195,1 // vinserti128 $0x1,%xmm3,%ymm0,%ymm0 - .byte 197,213,252,192 // vpaddb %ymm0,%ymm5,%ymm0 + .byte 197,237,113,210,8 // vpsrlw $0x8,%ymm2,%ymm2 + .byte 196,227,109,56,224,1 // vinserti128 $0x1,%xmm0,%ymm2,%ymm4 + .byte 196,227,109,70,192,49 // vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + .byte 197,221,103,192 // vpackuswb %ymm0,%ymm4,%ymm0 + .byte 197,253,252,195 // vpaddb %ymm3,%ymm0,%ymm0 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax BALIGN4 .byte 0,0 // add %al,(%rax) - .byte 127,67 // jg 170b <_sk_xor__hsw_8bit+0x146> + .byte 127,67 // jg 137f <_sk_xor__hsw_8bit+0x104> .byte 0,0 // add %al,(%rax) - .byte 127,67 // jg 170f <_sk_xor__hsw_8bit+0x14a> + .byte 127,67 // jg 1383 <_sk_xor__hsw_8bit+0x108> .byte 1,1 // add %eax,(%rcx) .byte 1,0 // add %eax,(%rax) .byte 0,0 // add %al,(%rax) @@ -51955,9 +51787,9 @@ BALIGN4 .byte 0,0 // add %al,(%rax) .byte 0,255 // add %bh,%bh .byte 0,0 // add %al,(%rax) - .byte 127,67 // jg 1723 <_sk_xor__hsw_8bit+0x15e> + .byte 127,67 // jg 1397 <_sk_xor__hsw_8bit+0x11c> .byte 0,0 // add %al,(%rax) - .byte 127,67 // jg 1727 <_sk_xor__hsw_8bit+0x162> + .byte 127,67 // jg 139b <_sk_xor__hsw_8bit+0x120> BALIGN32 .byte 0,0 // add %al,(%rax) @@ -52395,14 +52227,6 @@ BALIGN32 .byte 15 // .byte 0xf BALIGN16 - .byte 0,2 // add %al,(%rdx) - .byte 4,6 // add $0x6,%al - .byte 8,10 // or 
%cl,(%rdx) - .byte 12,14 // or $0xe,%al - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) .byte 255,0 // incl (%rax) .byte 255,0 // incl (%rax) .byte 255,0 // incl (%rax) @@ -52459,22 +52283,6 @@ BALIGN16 .byte 255,0 // incl (%rax) .byte 255,0 // incl (%rax) .byte 255,0 // incl (%rax) - .byte 0,2 // add %al,(%rdx) - .byte 4,6 // add $0x6,%al - .byte 8,10 // or %cl,(%rdx) - .byte 12,14 // or $0xe,%al - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,2 // add %al,(%rdx) - .byte 4,6 // add $0x6,%al - .byte 8,10 // or %cl,(%rdx) - .byte 12,14 // or $0xe,%al - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) .byte 255,0 // incl (%rax) .byte 255,0 // incl (%rax) .byte 255,0 // incl (%rax) @@ -52483,22 +52291,6 @@ BALIGN16 .byte 255,0 // incl (%rax) .byte 255,0 // incl (%rax) .byte 255,0 // incl (%rax) - .byte 0,2 // add %al,(%rdx) - .byte 4,6 // add $0x6,%al - .byte 8,10 // or %cl,(%rdx) - .byte 12,14 // or $0xe,%al - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,2 // add %al,(%rdx) - .byte 4,6 // add $0x6,%al - .byte 8,10 // or %cl,(%rdx) - .byte 12,14 // or $0xe,%al - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) .byte 255,0 // incl (%rax) .byte 255,0 // incl (%rax) .byte 255,0 // incl (%rax) @@ -52507,110 +52299,6 @@ BALIGN16 .byte 255,0 // incl (%rax) .byte 255,0 // incl (%rax) .byte 255,0 // incl (%rax) - .byte 0,2 // add %al,(%rdx) - .byte 4,6 // add $0x6,%al - .byte 8,10 // or %cl,(%rdx) - .byte 12,14 // or $0xe,%al - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,2 // add %al,(%rdx) - .byte 4,6 // add $0x6,%al - .byte 
8,10 // or %cl,(%rdx) - .byte 12,14 // or $0xe,%al - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,2 // add %al,(%rdx) - .byte 4,6 // add $0x6,%al - .byte 8,10 // or %cl,(%rdx) - .byte 12,14 // or $0xe,%al - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,2 // add %al,(%rdx) - .byte 4,6 // add $0x6,%al - .byte 8,10 // or %cl,(%rdx) - .byte 12,14 // or $0xe,%al - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,2 // add %al,(%rdx) - .byte 4,6 // add $0x6,%al - .byte 8,10 // or %cl,(%rdx) - .byte 12,14 // or $0xe,%al - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,2 // add %al,(%rdx) - .byte 4,6 // add $0x6,%al - .byte 8,10 // or %cl,(%rdx) - .byte 12,14 // or $0xe,%al - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,2 // add %al,(%rdx) - .byte 4,6 // add $0x6,%al - .byte 8,10 // or %cl,(%rdx) - .byte 12,14 // or $0xe,%al - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,2 // add %al,(%rdx) - .byte 4,6 // add $0x6,%al - .byte 8,10 // or %cl,(%rdx) - .byte 12,14 // or $0xe,%al - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,2 // add %al,(%rdx) - .byte 4,6 // add $0x6,%al - .byte 8,10 // or %cl,(%rdx) - .byte 12,14 // or $0xe,%al - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,2 // add %al,(%rdx) - .byte 4,6 // add $0x6,%al - .byte 8,10 // or %cl,(%rdx) - .byte 12,14 // or $0xe,%al - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add 
%al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,2 // add %al,(%rdx) - .byte 4,6 // add $0x6,%al - .byte 8,10 // or %cl,(%rdx) - .byte 12,14 // or $0xe,%al - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,2 // add %al,(%rdx) - .byte 4,6 // add $0x6,%al - .byte 8,10 // or %cl,(%rdx) - .byte 12,14 // or $0xe,%al - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,2 // add %al,(%rdx) - .byte 4,6 // add $0x6,%al - .byte 8,10 // or %cl,(%rdx) - .byte 12,14 // or $0xe,%al - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) - .byte 0,0 // add %al,(%rax) BALIGN8 .byte 0,0 // add %al,(%rax) diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S index 73f2c31d28..7d4325cac8 100644 --- a/src/jumper/SkJumper_generated_win.S +++ b/src/jumper/SkJumper_generated_win.S @@ -36244,7 +36244,7 @@ _sk_just_return_hsw_8bit LABEL PROC PUBLIC _sk_uniform_color_hsw_8bit _sk_uniform_color_hsw_8bit LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax - DB 197,250,16,5,17,22,0,0 ; vmovss 0x1611(%rip),%xmm0 # 1784 <_sk_xor__hsw_8bit+0xff> + DB 197,250,16,5,133,18,0,0 ; vmovss 0x1285(%rip),%xmm0 # 13f8 <_sk_xor__hsw_8bit+0xbd> DB 197,250,89,16 ; vmulss (%rax),%xmm0,%xmm2 DB 196,225,250,44,202 ; vcvttss2si %xmm2,%rcx DB 197,250,89,80,4 ; vmulss 0x4(%rax),%xmm0,%xmm2 @@ -36267,7 +36267,7 @@ _sk_uniform_color_hsw_8bit LABEL PROC PUBLIC _sk_set_rgb_hsw_8bit _sk_set_rgb_hsw_8bit LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax - DB 197,250,16,21,200,21,0,0 ; vmovss 0x15c8(%rip),%xmm2 # 1788 <_sk_xor__hsw_8bit+0x103> + DB 197,250,16,21,60,18,0,0 ; vmovss 0x123c(%rip),%xmm2 # 13fc <_sk_xor__hsw_8bit+0xc1> DB 197,234,89,24 ; vmulss (%rax),%xmm2,%xmm3 DB 196,225,250,44,203 ; vcvttss2si %xmm3,%rcx DB 197,234,89,88,4 ; vmulss 0x4(%rax),%xmm2,%xmm3 @@ -36280,20 
+36280,20 @@ _sk_set_rgb_hsw_8bit LABEL PROC DB 9,208 ; or %edx,%eax DB 197,249,110,208 ; vmovd %eax,%xmm2 DB 196,226,125,88,210 ; vpbroadcastd %xmm2,%ymm2 - DB 197,253,219,5,200,21,0,0 ; vpand 0x15c8(%rip),%ymm0,%ymm0 # 17c0 <_sk_xor__hsw_8bit+0x13b> + DB 197,253,219,5,40,18,0,0 ; vpand 0x1228(%rip),%ymm0,%ymm0 # 1420 <_sk_xor__hsw_8bit+0xe5> DB 197,237,235,192 ; vpor %ymm0,%ymm2,%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax PUBLIC _sk_premul_hsw_8bit _sk_premul_hsw_8bit LABEL PROC - DB 196,226,125,0,21,215,21,0,0 ; vpshufb 0x15d7(%rip),%ymm0,%ymm2 # 17e0 <_sk_xor__hsw_8bit+0x15b> - DB 197,237,235,21,239,21,0,0 ; vpor 0x15ef(%rip),%ymm2,%ymm2 # 1800 <_sk_xor__hsw_8bit+0x17b> - DB 196,227,125,57,195,1 ; vextracti128 $0x1,%ymm0,%xmm3 - DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 + DB 196,226,125,0,21,55,18,0,0 ; vpshufb 0x1237(%rip),%ymm0,%ymm2 # 1440 <_sk_xor__hsw_8bit+0x105> + DB 197,237,235,21,79,18,0,0 ; vpor 0x124f(%rip),%ymm2,%ymm2 # 1460 <_sk_xor__hsw_8bit+0x125> + DB 196,226,125,48,216 ; vpmovzxbw %xmm0,%ymm3 + DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0 DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0 - DB 196,227,125,57,212,1 ; vextracti128 $0x1,%ymm2,%xmm4 - DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4 + DB 196,226,125,48,226 ; vpmovzxbw %xmm2,%ymm4 + DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2 DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2 DB 197,237,213,208 ; vpmullw %ymm0,%ymm2,%ymm2 DB 197,221,213,227 ; vpmullw %ymm3,%ymm4,%ymm4 @@ -36301,22 +36301,15 @@ _sk_premul_hsw_8bit LABEL PROC DB 197,237,253,192 ; vpaddw %ymm0,%ymm2,%ymm0 DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0 DB 197,237,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm2 - DB 196,227,125,57,211,1 ; vextracti128 $0x1,%ymm2,%xmm3 - DB 197,249,111,37,71,24,0,0 ; vmovdqa 0x1847(%rip),%xmm4 # 1aa0 <_sk_xor__hsw_8bit+0x41b> - DB 196,226,97,0,220 ; vpshufb %xmm4,%xmm3,%xmm3 - DB 196,226,105,0,212 ; vpshufb %xmm4,%xmm2,%xmm2 - DB 197,233,108,211 ; vpunpcklqdq 
%xmm3,%xmm2,%xmm2 - DB 196,227,125,57,195,1 ; vextracti128 $0x1,%ymm0,%xmm3 - DB 196,226,97,0,220 ; vpshufb %xmm4,%xmm3,%xmm3 - DB 196,226,121,0,196 ; vpshufb %xmm4,%xmm0,%xmm0 - DB 197,249,108,195 ; vpunpcklqdq %xmm3,%xmm0,%xmm0 - DB 196,227,125,56,194,1 ; vinserti128 $0x1,%xmm2,%ymm0,%ymm0 + DB 196,227,109,56,216,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm3 + DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + DB 197,229,103,192 ; vpackuswb %ymm0,%ymm3,%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax PUBLIC _sk_swap_rb_hsw_8bit _sk_swap_rb_hsw_8bit LABEL PROC - DB 196,226,125,0,5,146,21,0,0 ; vpshufb 0x1592(%rip),%ymm0,%ymm0 # 1820 <_sk_xor__hsw_8bit+0x19b> + DB 196,226,125,0,5,24,18,0,0 ; vpshufb 0x1218(%rip),%ymm0,%ymm0 # 1480 <_sk_xor__hsw_8bit+0x145> DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax @@ -36332,7 +36325,7 @@ _sk_load_8888_hsw_8bit LABEL PROC DB 72,3,8 ; add (%rax),%rcx DB 74,141,4,137 ; lea (%rcx,%r9,4),%rax DB 77,133,192 ; test %r8,%r8 - DB 117,8 ; jne 2bf <_sk_load_8888_hsw_8bit+0x2d> + DB 117,8 ; jne 299 <_sk_load_8888_hsw_8bit+0x2d> DB 197,254,111,0 ; vmovdqu (%rax),%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax @@ -36344,7 +36337,7 @@ _sk_load_8888_hsw_8bit LABEL PROC DB 196,225,249,110,194 ; vmovq %rdx,%xmm0 DB 196,226,125,33,192 ; vpmovsxbd %xmm0,%ymm0 DB 196,226,125,140,0 ; vpmaskmovd (%rax),%ymm0,%ymm0 - DB 235,214 ; jmp 2bb <_sk_load_8888_hsw_8bit+0x29> + DB 235,214 ; jmp 295 <_sk_load_8888_hsw_8bit+0x29> PUBLIC _sk_load_8888_dst_hsw_8bit _sk_load_8888_dst_hsw_8bit LABEL PROC @@ -36358,7 +36351,7 @@ _sk_load_8888_dst_hsw_8bit LABEL PROC DB 72,3,8 ; add (%rax),%rcx DB 74,141,4,137 ; lea (%rcx,%r9,4),%rax DB 77,133,192 ; test %r8,%r8 - DB 117,8 ; jne 312 <_sk_load_8888_dst_hsw_8bit+0x2d> + DB 117,8 ; jne 2ec <_sk_load_8888_dst_hsw_8bit+0x2d> DB 197,254,111,8 ; vmovdqu (%rax),%ymm1 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax @@ -36370,7 +36363,7 @@ _sk_load_8888_dst_hsw_8bit LABEL 
PROC DB 196,225,249,110,202 ; vmovq %rdx,%xmm1 DB 196,226,125,33,201 ; vpmovsxbd %xmm1,%ymm1 DB 196,226,117,140,8 ; vpmaskmovd (%rax),%ymm1,%ymm1 - DB 235,214 ; jmp 30e <_sk_load_8888_dst_hsw_8bit+0x29> + DB 235,214 ; jmp 2e8 <_sk_load_8888_dst_hsw_8bit+0x29> PUBLIC _sk_store_8888_hsw_8bit _sk_store_8888_hsw_8bit LABEL PROC @@ -36384,7 +36377,7 @@ _sk_store_8888_hsw_8bit LABEL PROC DB 72,3,8 ; add (%rax),%rcx DB 74,141,4,137 ; lea (%rcx,%r9,4),%rax DB 77,133,192 ; test %r8,%r8 - DB 117,8 ; jne 365 <_sk_store_8888_hsw_8bit+0x2d> + DB 117,8 ; jne 33f <_sk_store_8888_hsw_8bit+0x2d> DB 197,254,127,0 ; vmovdqu %ymm0,(%rax) DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax @@ -36396,7 +36389,7 @@ _sk_store_8888_hsw_8bit LABEL PROC DB 196,225,249,110,210 ; vmovq %rdx,%xmm2 DB 196,226,125,33,210 ; vpmovsxbd %xmm2,%ymm2 DB 196,226,109,142,0 ; vpmaskmovd %ymm0,%ymm2,(%rax) - DB 235,214 ; jmp 361 <_sk_store_8888_hsw_8bit+0x29> + DB 235,214 ; jmp 33b <_sk_store_8888_hsw_8bit+0x29> PUBLIC _sk_load_bgra_hsw_8bit _sk_load_bgra_hsw_8bit LABEL PROC @@ -36410,9 +36403,9 @@ _sk_load_bgra_hsw_8bit LABEL PROC DB 72,3,8 ; add (%rax),%rcx DB 74,141,4,137 ; lea (%rcx,%r9,4),%rax DB 77,133,192 ; test %r8,%r8 - DB 117,17 ; jne 3c1 <_sk_load_bgra_hsw_8bit+0x36> + DB 117,17 ; jne 39b <_sk_load_bgra_hsw_8bit+0x36> DB 197,254,111,0 ; vmovdqu (%rax),%ymm0 - DB 196,226,125,0,5,131,20,0,0 ; vpshufb 0x1483(%rip),%ymm0,%ymm0 # 1840 <_sk_xor__hsw_8bit+0x1bb> + DB 196,226,125,0,5,9,17,0,0 ; vpshufb 0x1109(%rip),%ymm0,%ymm0 # 14a0 <_sk_xor__hsw_8bit+0x165> DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax DB 185,8,0,0,0 ; mov $0x8,%ecx @@ -36423,7 +36416,7 @@ _sk_load_bgra_hsw_8bit LABEL PROC DB 196,225,249,110,194 ; vmovq %rdx,%xmm0 DB 196,226,125,33,192 ; vpmovsxbd %xmm0,%ymm0 DB 196,226,125,140,0 ; vpmaskmovd (%rax),%ymm0,%ymm0 - DB 235,205 ; jmp 3b4 <_sk_load_bgra_hsw_8bit+0x29> + DB 235,205 ; jmp 38e <_sk_load_bgra_hsw_8bit+0x29> PUBLIC _sk_load_bgra_dst_hsw_8bit 
_sk_load_bgra_dst_hsw_8bit LABEL PROC @@ -36437,9 +36430,9 @@ _sk_load_bgra_dst_hsw_8bit LABEL PROC DB 72,3,8 ; add (%rax),%rcx DB 74,141,4,137 ; lea (%rcx,%r9,4),%rax DB 77,133,192 ; test %r8,%r8 - DB 117,17 ; jne 41d <_sk_load_bgra_dst_hsw_8bit+0x36> + DB 117,17 ; jne 3f7 <_sk_load_bgra_dst_hsw_8bit+0x36> DB 197,254,111,8 ; vmovdqu (%rax),%ymm1 - DB 196,226,117,0,13,71,20,0,0 ; vpshufb 0x1447(%rip),%ymm1,%ymm1 # 1860 <_sk_xor__hsw_8bit+0x1db> + DB 196,226,117,0,13,205,16,0,0 ; vpshufb 0x10cd(%rip),%ymm1,%ymm1 # 14c0 <_sk_xor__hsw_8bit+0x185> DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax DB 185,8,0,0,0 ; mov $0x8,%ecx @@ -36450,7 +36443,7 @@ _sk_load_bgra_dst_hsw_8bit LABEL PROC DB 196,225,249,110,202 ; vmovq %rdx,%xmm1 DB 196,226,125,33,201 ; vpmovsxbd %xmm1,%ymm1 DB 196,226,117,140,8 ; vpmaskmovd (%rax),%ymm1,%ymm1 - DB 235,205 ; jmp 410 <_sk_load_bgra_dst_hsw_8bit+0x29> + DB 235,205 ; jmp 3ea <_sk_load_bgra_dst_hsw_8bit+0x29> PUBLIC _sk_store_bgra_hsw_8bit _sk_store_bgra_hsw_8bit LABEL PROC @@ -36463,9 +36456,9 @@ _sk_store_bgra_hsw_8bit LABEL PROC DB 72,193,225,2 ; shl $0x2,%rcx DB 72,3,8 ; add (%rax),%rcx DB 74,141,4,137 ; lea (%rcx,%r9,4),%rax - DB 196,226,125,0,21,20,20,0,0 ; vpshufb 0x1414(%rip),%ymm0,%ymm2 # 1880 <_sk_xor__hsw_8bit+0x1fb> + DB 196,226,125,0,21,154,16,0,0 ; vpshufb 0x109a(%rip),%ymm0,%ymm2 # 14e0 <_sk_xor__hsw_8bit+0x1a5> DB 77,133,192 ; test %r8,%r8 - DB 117,8 ; jne 479 <_sk_store_bgra_hsw_8bit+0x36> + DB 117,8 ; jne 453 <_sk_store_bgra_hsw_8bit+0x36> DB 197,254,127,16 ; vmovdqu %ymm2,(%rax) DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax @@ -36477,7 +36470,7 @@ _sk_store_bgra_hsw_8bit LABEL PROC DB 196,225,249,110,218 ; vmovq %rdx,%xmm3 DB 196,226,125,33,219 ; vpmovsxbd %xmm3,%ymm3 DB 196,226,101,142,16 ; vpmaskmovd %ymm2,%ymm3,(%rax) - DB 235,214 ; jmp 475 <_sk_store_bgra_hsw_8bit+0x32> + DB 235,214 ; jmp 44f <_sk_store_bgra_hsw_8bit+0x32> PUBLIC _sk_load_a8_hsw_8bit _sk_load_a8_hsw_8bit LABEL PROC @@ -36489,9 
+36482,9 @@ _sk_load_a8_hsw_8bit LABEL PROC DB 72,15,175,209 ; imul %rcx,%rdx DB 72,3,16 ; add (%rax),%rdx DB 77,133,201 ; test %r9,%r9 - DB 117,28 ; jne 4d8 <_sk_load_a8_hsw_8bit+0x39> + DB 117,28 ; jne 4b2 <_sk_load_a8_hsw_8bit+0x39> DB 196,162,121,48,4,2 ; vpmovzxbw (%rdx,%r8,1),%xmm0 - DB 197,249,219,5,230,21,0,0 ; vpand 0x15e6(%rip),%xmm0,%xmm0 # 1ab0 <_sk_xor__hsw_8bit+0x42b> + DB 197,249,219,5,92,18,0,0 ; vpand 0x125c(%rip),%xmm0,%xmm0 # 1700 <_sk_xor__hsw_8bit+0x3c5> DB 196,226,125,51,192 ; vpmovzxwd %xmm0,%ymm0 DB 197,253,114,240,24 ; vpslld $0x18,%ymm0,%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax @@ -36500,15 +36493,15 @@ _sk_load_a8_hsw_8bit LABEL PROC DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0 DB 65,254,201 ; dec %r9b DB 65,128,249,6 ; cmp $0x6,%r9b - DB 119,217 ; ja 4c2 <_sk_load_a8_hsw_8bit+0x23> + DB 119,217 ; ja 49c <_sk_load_a8_hsw_8bit+0x23> DB 65,15,182,193 ; movzbl %r9b,%eax - DB 72,141,13,116,0,0,0 ; lea 0x74(%rip),%rcx # 568 <_sk_load_a8_hsw_8bit+0xc9> + DB 72,141,13,118,0,0,0 ; lea 0x76(%rip),%rcx # 544 <_sk_load_a8_hsw_8bit+0xcb> DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax DB 72,1,200 ; add %rcx,%rax DB 255,224 ; jmpq *%rax DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax DB 197,249,110,192 ; vmovd %eax,%xmm0 - DB 235,186 ; jmp 4c2 <_sk_load_a8_hsw_8bit+0x23> + DB 235,186 ; jmp 49c <_sk_load_a8_hsw_8bit+0x23> DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0 DB 197,249,196,192,2 ; vpinsrw $0x2,%eax,%xmm0,%xmm0 @@ -36516,7 +36509,7 @@ _sk_load_a8_hsw_8bit LABEL PROC DB 197,249,110,208 ; vmovd %eax,%xmm2 DB 196,226,121,48,210 ; vpmovzxbw %xmm2,%xmm2 DB 196,227,121,2,194,1 ; vpblendd $0x1,%xmm2,%xmm0,%xmm0 - DB 235,149 ; jmp 4c2 <_sk_load_a8_hsw_8bit+0x23> + DB 235,149 ; jmp 49c <_sk_load_a8_hsw_8bit+0x23> DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0 DB 197,249,196,192,6 ; vpinsrw $0x6,%eax,%xmm0,%xmm0 @@ -36527,24 +36520,22 @@ _sk_load_a8_hsw_8bit LABEL PROC 
DB 196,161,121,110,20,2 ; vmovd (%rdx,%r8,1),%xmm2 DB 196,226,121,48,210 ; vpmovzxbw %xmm2,%xmm2 DB 196,227,105,2,192,12 ; vpblendd $0xc,%xmm0,%xmm2,%xmm0 - DB 233,90,255,255,255 ; jmpq 4c2 <_sk_load_a8_hsw_8bit+0x23> - DB 149 ; xchg %eax,%ebp - DB 255 ; (bad) - DB 255 ; (bad) - DB 255,175,255,255,255,160 ; ljmp *-0x5f000001(%rdi) + DB 233,90,255,255,255 ; jmpq 49c <_sk_load_a8_hsw_8bit+0x23> + DB 102,144 ; xchg %ax,%ax + DB 147 ; xchg %eax,%ebx DB 255 ; (bad) DB 255 ; (bad) + DB 255,173,255,255,255,158 ; ljmp *-0x61000001(%rbp) DB 255 ; (bad) - DB 234 ; (bad) DB 255 ; (bad) DB 255 ; (bad) + DB 232,255,255,255,221 ; callq ffffffffde000554 <_sk_xor__hsw_8bit+0xffffffffddfff219> DB 255 ; (bad) - DB 223,255 ; (bad) DB 255 ; (bad) - DB 255,212 ; callq *%rsp + DB 255,210 ; callq *%rdx DB 255 ; (bad) DB 255 ; (bad) - DB 255,197 ; inc %ebp + DB 255,195 ; inc %ebx DB 255 ; (bad) DB 255 ; (bad) DB 255 ; .byte 0xff @@ -36559,9 +36550,9 @@ _sk_load_a8_dst_hsw_8bit LABEL PROC DB 72,15,175,209 ; imul %rcx,%rdx DB 72,3,16 ; add (%rax),%rdx DB 77,133,201 ; test %r9,%r9 - DB 117,28 ; jne 5bd <_sk_load_a8_dst_hsw_8bit+0x39> + DB 117,28 ; jne 599 <_sk_load_a8_dst_hsw_8bit+0x39> DB 196,162,121,48,12,2 ; vpmovzxbw (%rdx,%r8,1),%xmm1 - DB 197,241,219,13,17,21,0,0 ; vpand 0x1511(%rip),%xmm1,%xmm1 # 1ac0 <_sk_xor__hsw_8bit+0x43b> + DB 197,241,219,13,133,17,0,0 ; vpand 0x1185(%rip),%xmm1,%xmm1 # 1710 <_sk_xor__hsw_8bit+0x3d5> DB 196,226,125,51,201 ; vpmovzxwd %xmm1,%ymm1 DB 197,245,114,241,24 ; vpslld $0x18,%ymm1,%ymm1 DB 72,173 ; lods %ds:(%rsi),%rax @@ -36570,15 +36561,15 @@ _sk_load_a8_dst_hsw_8bit LABEL PROC DB 197,241,239,201 ; vpxor %xmm1,%xmm1,%xmm1 DB 65,254,201 ; dec %r9b DB 65,128,249,6 ; cmp $0x6,%r9b - DB 119,217 ; ja 5a7 <_sk_load_a8_dst_hsw_8bit+0x23> + DB 119,217 ; ja 583 <_sk_load_a8_dst_hsw_8bit+0x23> DB 65,15,182,193 ; movzbl %r9b,%eax - DB 72,141,13,119,0,0,0 ; lea 0x77(%rip),%rcx # 650 <_sk_load_a8_dst_hsw_8bit+0xcc> + DB 72,141,13,119,0,0,0 ; lea 0x77(%rip),%rcx # 62c 
<_sk_load_a8_dst_hsw_8bit+0xcc> DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax DB 72,1,200 ; add %rcx,%rax DB 255,224 ; jmpq *%rax DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax DB 197,249,110,200 ; vmovd %eax,%xmm1 - DB 235,186 ; jmp 5a7 <_sk_load_a8_dst_hsw_8bit+0x23> + DB 235,186 ; jmp 583 <_sk_load_a8_dst_hsw_8bit+0x23> DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax DB 197,241,239,201 ; vpxor %xmm1,%xmm1,%xmm1 DB 197,241,196,200,2 ; vpinsrw $0x2,%eax,%xmm1,%xmm1 @@ -36586,7 +36577,7 @@ _sk_load_a8_dst_hsw_8bit LABEL PROC DB 197,249,110,208 ; vmovd %eax,%xmm2 DB 196,226,121,48,210 ; vpmovzxbw %xmm2,%xmm2 DB 196,227,113,2,202,1 ; vpblendd $0x1,%xmm2,%xmm1,%xmm1 - DB 235,149 ; jmp 5a7 <_sk_load_a8_dst_hsw_8bit+0x23> + DB 235,149 ; jmp 583 <_sk_load_a8_dst_hsw_8bit+0x23> DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax DB 197,241,239,201 ; vpxor %xmm1,%xmm1,%xmm1 DB 197,241,196,200,6 ; vpinsrw $0x6,%eax,%xmm1,%xmm1 @@ -36597,7 +36588,7 @@ _sk_load_a8_dst_hsw_8bit LABEL PROC DB 196,161,121,110,20,2 ; vmovd (%rdx,%r8,1),%xmm2 DB 196,226,121,48,210 ; vpmovzxbw %xmm2,%xmm2 DB 196,227,105,2,201,12 ; vpblendd $0xc,%xmm1,%xmm2,%xmm1 - DB 233,90,255,255,255 ; jmpq 5a7 <_sk_load_a8_dst_hsw_8bit+0x23> + DB 233,90,255,255,255 ; jmpq 583 <_sk_load_a8_dst_hsw_8bit+0x23> DB 15,31,0 ; nopl (%rax) DB 146 ; xchg %eax,%edx DB 255 ; (bad) @@ -36627,35 +36618,35 @@ _sk_store_a8_hsw_8bit LABEL PROC DB 72,99,87,8 ; movslq 0x8(%rdi),%rdx DB 72,15,175,209 ; imul %rcx,%rdx DB 72,3,16 ; add (%rax),%rdx - DB 196,226,125,0,21,19,18,0,0 ; vpshufb 0x1213(%rip),%ymm0,%ymm2 # 18a0 <_sk_xor__hsw_8bit+0x21b> + DB 196,226,125,0,21,151,14,0,0 ; vpshufb 0xe97(%rip),%ymm0,%ymm2 # 1500 <_sk_xor__hsw_8bit+0x1c5> DB 196,227,253,0,210,232 ; vpermq $0xe8,%ymm2,%ymm2 DB 77,133,201 ; test %r9,%r9 - DB 117,19 ; jne 6ab <_sk_store_a8_hsw_8bit+0x3f> - DB 196,226,105,0,21,79,20,0,0 ; vpshufb 0x144f(%rip),%xmm2,%xmm2 # 1af0 <_sk_xor__hsw_8bit+0x46b> + DB 117,19 ; jne 687 <_sk_store_a8_hsw_8bit+0x3f> + DB 
196,226,105,0,21,195,16,0,0 ; vpshufb 0x10c3(%rip),%xmm2,%xmm2 # 1740 <_sk_xor__hsw_8bit+0x405> DB 196,161,121,214,20,2 ; vmovq %xmm2,(%rdx,%r8,1) DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax DB 65,128,225,7 ; and $0x7,%r9b DB 65,254,201 ; dec %r9b DB 65,128,249,6 ; cmp $0x6,%r9b - DB 119,239 ; ja 6a7 <_sk_store_a8_hsw_8bit+0x3b> + DB 119,239 ; ja 683 <_sk_store_a8_hsw_8bit+0x3b> DB 65,15,182,193 ; movzbl %r9b,%eax - DB 72,141,13,85,0,0,0 ; lea 0x55(%rip),%rcx # 718 <_sk_store_a8_hsw_8bit+0xac> + DB 72,141,13,85,0,0,0 ; lea 0x55(%rip),%rcx # 6f4 <_sk_store_a8_hsw_8bit+0xac> DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax DB 72,1,200 ; add %rcx,%rax DB 255,224 ; jmpq *%rax DB 196,163,121,20,20,2,0 ; vpextrb $0x0,%xmm2,(%rdx,%r8,1) - DB 235,210 ; jmp 6a7 <_sk_store_a8_hsw_8bit+0x3b> + DB 235,210 ; jmp 683 <_sk_store_a8_hsw_8bit+0x3b> DB 196,163,121,20,84,2,2,4 ; vpextrb $0x4,%xmm2,0x2(%rdx,%r8,1) - DB 196,226,105,0,21,234,19,0,0 ; vpshufb 0x13ea(%rip),%xmm2,%xmm2 # 1ad0 <_sk_xor__hsw_8bit+0x44b> + DB 196,226,105,0,21,94,16,0,0 ; vpshufb 0x105e(%rip),%xmm2,%xmm2 # 1720 <_sk_xor__hsw_8bit+0x3e5> DB 196,163,121,21,20,2,0 ; vpextrw $0x0,%xmm2,(%rdx,%r8,1) - DB 235,184 ; jmp 6a7 <_sk_store_a8_hsw_8bit+0x3b> + DB 235,184 ; jmp 683 <_sk_store_a8_hsw_8bit+0x3b> DB 196,163,121,20,84,2,6,12 ; vpextrb $0xc,%xmm2,0x6(%rdx,%r8,1) DB 196,163,121,20,84,2,5,10 ; vpextrb $0xa,%xmm2,0x5(%rdx,%r8,1) DB 196,163,121,20,84,2,4,8 ; vpextrb $0x8,%xmm2,0x4(%rdx,%r8,1) - DB 196,226,105,0,21,208,19,0,0 ; vpshufb 0x13d0(%rip),%xmm2,%xmm2 # 1ae0 <_sk_xor__hsw_8bit+0x45b> + DB 196,226,105,0,21,68,16,0,0 ; vpshufb 0x1044(%rip),%xmm2,%xmm2 # 1730 <_sk_xor__hsw_8bit+0x3f5> DB 196,161,121,126,20,2 ; vmovd %xmm2,(%rdx,%r8,1) - DB 235,143 ; jmp 6a7 <_sk_store_a8_hsw_8bit+0x3b> + DB 235,143 ; jmp 683 <_sk_store_a8_hsw_8bit+0x3b> DB 180,255 ; mov $0xff,%ah DB 255 ; (bad) DB 255,197 ; inc %ebp @@ -36686,13 +36677,13 @@ _sk_load_g8_hsw_8bit LABEL PROC DB 72,15,175,209 ; imul %rcx,%rdx DB 72,3,16 ; 
add (%rax),%rdx DB 77,133,201 ; test %r9,%r9 - DB 117,50 ; jne 783 <_sk_load_g8_hsw_8bit+0x4f> + DB 117,50 ; jne 75f <_sk_load_g8_hsw_8bit+0x4f> DB 196,162,121,48,4,2 ; vpmovzxbw (%rdx,%r8,1),%xmm0 - DB 197,249,219,5,161,19,0,0 ; vpand 0x13a1(%rip),%xmm0,%xmm0 # 1b00 <_sk_xor__hsw_8bit+0x47b> + DB 197,249,219,5,21,16,0,0 ; vpand 0x1015(%rip),%xmm0,%xmm0 # 1750 <_sk_xor__hsw_8bit+0x415> DB 196,226,125,51,192 ; vpmovzxwd %xmm0,%ymm0 - DB 196,226,125,88,21,31,16,0,0 ; vpbroadcastd 0x101f(%rip),%ymm2 # 178c <_sk_xor__hsw_8bit+0x107> + DB 196,226,125,88,21,183,12,0,0 ; vpbroadcastd 0xcb7(%rip),%ymm2 # 1400 <_sk_xor__hsw_8bit+0xc5> DB 196,226,125,64,194 ; vpmulld %ymm2,%ymm0,%ymm0 - DB 196,226,125,88,21,21,16,0,0 ; vpbroadcastd 0x1015(%rip),%ymm2 # 1790 <_sk_xor__hsw_8bit+0x10b> + DB 196,226,125,88,21,173,12,0,0 ; vpbroadcastd 0xcad(%rip),%ymm2 # 1404 <_sk_xor__hsw_8bit+0xc9> DB 197,253,235,194 ; vpor %ymm2,%ymm0,%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax @@ -36700,15 +36691,15 @@ _sk_load_g8_hsw_8bit LABEL PROC DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0 DB 65,254,201 ; dec %r9b DB 65,128,249,6 ; cmp $0x6,%r9b - DB 119,195 ; ja 757 <_sk_load_g8_hsw_8bit+0x23> + DB 119,195 ; ja 733 <_sk_load_g8_hsw_8bit+0x23> DB 65,15,182,193 ; movzbl %r9b,%eax - DB 72,141,13,121,0,0,0 ; lea 0x79(%rip),%rcx # 818 <_sk_load_g8_hsw_8bit+0xe4> + DB 72,141,13,121,0,0,0 ; lea 0x79(%rip),%rcx # 7f4 <_sk_load_g8_hsw_8bit+0xe4> DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax DB 72,1,200 ; add %rcx,%rax DB 255,224 ; jmpq *%rax DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax DB 197,249,110,192 ; vmovd %eax,%xmm0 - DB 235,164 ; jmp 757 <_sk_load_g8_hsw_8bit+0x23> + DB 235,164 ; jmp 733 <_sk_load_g8_hsw_8bit+0x23> DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0 DB 197,249,196,192,2 ; vpinsrw $0x2,%eax,%xmm0,%xmm0 @@ -36716,7 +36707,7 @@ _sk_load_g8_hsw_8bit LABEL PROC DB 197,249,110,208 ; vmovd %eax,%xmm2 DB 196,226,121,48,210 ; vpmovzxbw 
%xmm2,%xmm2 DB 196,227,121,2,194,1 ; vpblendd $0x1,%xmm2,%xmm0,%xmm0 - DB 233,124,255,255,255 ; jmpq 757 <_sk_load_g8_hsw_8bit+0x23> + DB 233,124,255,255,255 ; jmpq 733 <_sk_load_g8_hsw_8bit+0x23> DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0 DB 197,249,196,192,6 ; vpinsrw $0x6,%eax,%xmm0,%xmm0 @@ -36727,7 +36718,7 @@ _sk_load_g8_hsw_8bit LABEL PROC DB 196,161,121,110,20,2 ; vmovd (%rdx,%r8,1),%xmm2 DB 196,226,121,48,210 ; vpmovzxbw %xmm2,%xmm2 DB 196,227,105,2,192,12 ; vpblendd $0xc,%xmm0,%xmm2,%xmm0 - DB 233,65,255,255,255 ; jmpq 757 <_sk_load_g8_hsw_8bit+0x23> + DB 233,65,255,255,255 ; jmpq 733 <_sk_load_g8_hsw_8bit+0x23> DB 102,144 ; xchg %ax,%ax DB 144 ; nop DB 255 ; (bad) @@ -36736,7 +36727,7 @@ _sk_load_g8_hsw_8bit LABEL PROC DB 255 ; (bad) DB 255 ; (bad) DB 255 ; (bad) - DB 232,255,255,255,221 ; callq ffffffffde000828 <_sk_xor__hsw_8bit+0xffffffffddfff1a3> + DB 232,255,255,255,221 ; callq ffffffffde000804 <_sk_xor__hsw_8bit+0xffffffffddfff4c9> DB 255 ; (bad) DB 255 ; (bad) DB 255,210 ; callq *%rdx @@ -36757,13 +36748,13 @@ _sk_load_g8_dst_hsw_8bit LABEL PROC DB 72,15,175,209 ; imul %rcx,%rdx DB 72,3,16 ; add (%rax),%rdx DB 77,133,201 ; test %r9,%r9 - DB 117,50 ; jne 883 <_sk_load_g8_dst_hsw_8bit+0x4f> + DB 117,50 ; jne 85f <_sk_load_g8_dst_hsw_8bit+0x4f> DB 196,162,121,48,12,2 ; vpmovzxbw (%rdx,%r8,1),%xmm1 - DB 197,241,219,13,177,18,0,0 ; vpand 0x12b1(%rip),%xmm1,%xmm1 # 1b10 <_sk_xor__hsw_8bit+0x48b> + DB 197,241,219,13,37,15,0,0 ; vpand 0xf25(%rip),%xmm1,%xmm1 # 1760 <_sk_xor__hsw_8bit+0x425> DB 196,226,125,51,201 ; vpmovzxwd %xmm1,%ymm1 - DB 196,226,125,88,21,39,15,0,0 ; vpbroadcastd 0xf27(%rip),%ymm2 # 1794 <_sk_xor__hsw_8bit+0x10f> + DB 196,226,125,88,21,191,11,0,0 ; vpbroadcastd 0xbbf(%rip),%ymm2 # 1408 <_sk_xor__hsw_8bit+0xcd> DB 196,226,117,64,202 ; vpmulld %ymm2,%ymm1,%ymm1 - DB 196,226,125,88,21,29,15,0,0 ; vpbroadcastd 0xf1d(%rip),%ymm2 # 1798 <_sk_xor__hsw_8bit+0x113> + DB 
196,226,125,88,21,181,11,0,0 ; vpbroadcastd 0xbb5(%rip),%ymm2 # 140c <_sk_xor__hsw_8bit+0xd1> DB 197,245,235,202 ; vpor %ymm2,%ymm1,%ymm1 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax @@ -36771,15 +36762,15 @@ _sk_load_g8_dst_hsw_8bit LABEL PROC DB 197,241,239,201 ; vpxor %xmm1,%xmm1,%xmm1 DB 65,254,201 ; dec %r9b DB 65,128,249,6 ; cmp $0x6,%r9b - DB 119,195 ; ja 857 <_sk_load_g8_dst_hsw_8bit+0x23> + DB 119,195 ; ja 833 <_sk_load_g8_dst_hsw_8bit+0x23> DB 65,15,182,193 ; movzbl %r9b,%eax - DB 72,141,13,121,0,0,0 ; lea 0x79(%rip),%rcx # 918 <_sk_load_g8_dst_hsw_8bit+0xe4> + DB 72,141,13,121,0,0,0 ; lea 0x79(%rip),%rcx # 8f4 <_sk_load_g8_dst_hsw_8bit+0xe4> DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax DB 72,1,200 ; add %rcx,%rax DB 255,224 ; jmpq *%rax DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax DB 197,249,110,200 ; vmovd %eax,%xmm1 - DB 235,164 ; jmp 857 <_sk_load_g8_dst_hsw_8bit+0x23> + DB 235,164 ; jmp 833 <_sk_load_g8_dst_hsw_8bit+0x23> DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax DB 197,241,239,201 ; vpxor %xmm1,%xmm1,%xmm1 DB 197,241,196,200,2 ; vpinsrw $0x2,%eax,%xmm1,%xmm1 @@ -36787,7 +36778,7 @@ _sk_load_g8_dst_hsw_8bit LABEL PROC DB 197,249,110,208 ; vmovd %eax,%xmm2 DB 196,226,121,48,210 ; vpmovzxbw %xmm2,%xmm2 DB 196,227,113,2,202,1 ; vpblendd $0x1,%xmm2,%xmm1,%xmm1 - DB 233,124,255,255,255 ; jmpq 857 <_sk_load_g8_dst_hsw_8bit+0x23> + DB 233,124,255,255,255 ; jmpq 833 <_sk_load_g8_dst_hsw_8bit+0x23> DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax DB 197,241,239,201 ; vpxor %xmm1,%xmm1,%xmm1 DB 197,241,196,200,6 ; vpinsrw $0x6,%eax,%xmm1,%xmm1 @@ -36798,7 +36789,7 @@ _sk_load_g8_dst_hsw_8bit LABEL PROC DB 196,161,121,110,20,2 ; vmovd (%rdx,%r8,1),%xmm2 DB 196,226,121,48,210 ; vpmovzxbw %xmm2,%xmm2 DB 196,227,105,2,201,12 ; vpblendd $0xc,%xmm1,%xmm2,%xmm1 - DB 233,65,255,255,255 ; jmpq 857 <_sk_load_g8_dst_hsw_8bit+0x23> + DB 233,65,255,255,255 ; jmpq 833 <_sk_load_g8_dst_hsw_8bit+0x23> DB 102,144 ; xchg %ax,%ax DB 144 ; nop DB 255 ; (bad) @@ 
-36807,7 +36798,7 @@ _sk_load_g8_dst_hsw_8bit LABEL PROC DB 255 ; (bad) DB 255 ; (bad) DB 255 ; (bad) - DB 232,255,255,255,221 ; callq ffffffffde000928 <_sk_xor__hsw_8bit+0xffffffffddfff2a3> + DB 232,255,255,255,221 ; callq ffffffffde000904 <_sk_xor__hsw_8bit+0xffffffffddfff5c9> DB 255 ; (bad) DB 255 ; (bad) DB 255,210 ; callq *%rdx @@ -36830,16 +36821,16 @@ _sk_srcover_rgba_8888_hsw_8bit LABEL PROC DB 72,3,8 ; add (%rax),%rcx DB 74,141,4,137 ; lea (%rcx,%r9,4),%rax DB 77,133,192 ; test %r8,%r8 - DB 15,133,150,0,0,0 ; jne 9f3 <_sk_srcover_rgba_8888_hsw_8bit+0xbf> + DB 117,112 ; jne 9a5 <_sk_srcover_rgba_8888_hsw_8bit+0x95> DB 197,254,111,16 ; vmovdqu (%rax),%ymm2 - DB 196,226,125,0,29,86,15,0,0 ; vpshufb 0xf56(%rip),%ymm0,%ymm3 # 18c0 <_sk_xor__hsw_8bit+0x23b> + DB 196,226,125,0,29,222,11,0,0 ; vpshufb 0xbde(%rip),%ymm0,%ymm3 # 1520 <_sk_xor__hsw_8bit+0x1e5> DB 197,221,118,228 ; vpcmpeqd %ymm4,%ymm4,%ymm4 DB 197,229,239,220 ; vpxor %ymm4,%ymm3,%ymm3 - DB 196,227,125,57,212,1 ; vextracti128 $0x1,%ymm2,%xmm4 - DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4 + DB 196,226,125,48,226 ; vpmovzxbw %xmm2,%ymm4 + DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2 DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2 - DB 196,227,125,57,221,1 ; vextracti128 $0x1,%ymm3,%xmm5 - DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5 + DB 196,226,125,48,235 ; vpmovzxbw %xmm3,%ymm5 + DB 196,227,125,57,219,1 ; vextracti128 $0x1,%ymm3,%xmm3 DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 DB 197,237,213,219 ; vpmullw %ymm3,%ymm2,%ymm3 DB 197,221,213,237 ; vpmullw %ymm5,%ymm4,%ymm5 @@ -36847,19 +36838,12 @@ _sk_srcover_rgba_8888_hsw_8bit LABEL PROC DB 197,229,253,210 ; vpaddw %ymm2,%ymm3,%ymm2 DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2 DB 197,229,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm3 - DB 196,227,125,57,220,1 ; vextracti128 $0x1,%ymm3,%xmm4 - DB 197,249,111,45,102,17,0,0 ; vmovdqa 0x1166(%rip),%xmm5 # 1b20 <_sk_xor__hsw_8bit+0x49b> - DB 196,226,89,0,229 ; vpshufb %xmm5,%xmm4,%xmm4 - DB 
196,226,97,0,221 ; vpshufb %xmm5,%xmm3,%xmm3 - DB 197,225,108,220 ; vpunpcklqdq %xmm4,%xmm3,%xmm3 - DB 196,227,125,57,212,1 ; vextracti128 $0x1,%ymm2,%xmm4 - DB 196,226,89,0,229 ; vpshufb %xmm5,%xmm4,%xmm4 - DB 196,226,105,0,213 ; vpshufb %xmm5,%xmm2,%xmm2 - DB 197,233,108,212 ; vpunpcklqdq %xmm4,%xmm2,%xmm2 - DB 196,227,109,56,211,1 ; vinserti128 $0x1,%xmm3,%ymm2,%ymm2 + DB 196,227,101,56,226,1 ; vinserti128 $0x1,%xmm2,%ymm3,%ymm4 + DB 196,227,101,70,210,49 ; vperm2i128 $0x31,%ymm2,%ymm3,%ymm2 + DB 197,221,103,210 ; vpackuswb %ymm2,%ymm4,%ymm2 DB 197,237,252,208 ; vpaddb %ymm0,%ymm2,%ymm2 DB 77,133,192 ; test %r8,%r8 - DB 117,49 ; jne a1c <_sk_srcover_rgba_8888_hsw_8bit+0xe8> + DB 117,49 ; jne 9ce <_sk_srcover_rgba_8888_hsw_8bit+0xbe> DB 197,254,127,16 ; vmovdqu %ymm2,(%rax) DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax @@ -36871,7 +36855,7 @@ _sk_srcover_rgba_8888_hsw_8bit LABEL PROC DB 196,225,249,110,210 ; vmovq %rdx,%xmm2 DB 196,226,125,33,210 ; vpmovsxbd %xmm2,%ymm2 DB 196,226,109,140,16 ; vpmaskmovd (%rax),%ymm2,%ymm2 - DB 233,69,255,255,255 ; jmpq 961 <_sk_srcover_rgba_8888_hsw_8bit+0x2d> + DB 233,107,255,255,255 ; jmpq 939 <_sk_srcover_rgba_8888_hsw_8bit+0x29> DB 185,8,0,0,0 ; mov $0x8,%ecx DB 68,41,193 ; sub %r8d,%ecx DB 192,225,3 ; shl $0x3,%cl @@ -36880,36 +36864,29 @@ _sk_srcover_rgba_8888_hsw_8bit LABEL PROC DB 196,225,249,110,218 ; vmovq %rdx,%xmm3 DB 196,226,125,33,219 ; vpmovsxbd %xmm3,%ymm3 DB 196,226,101,142,16 ; vpmaskmovd %ymm2,%ymm3,(%rax) - DB 235,173 ; jmp 9ef <_sk_srcover_rgba_8888_hsw_8bit+0xbb> + DB 235,173 ; jmp 9a1 <_sk_srcover_rgba_8888_hsw_8bit+0x91> PUBLIC _sk_scale_1_float_hsw_8bit _sk_scale_1_float_hsw_8bit LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax DB 197,250,16,16 ; vmovss (%rax),%xmm2 - DB 197,234,89,21,76,13,0,0 ; vmulss 0xd4c(%rip),%xmm2,%xmm2 # 179c <_sk_xor__hsw_8bit+0x117> + DB 197,234,89,21,14,10,0,0 ; vmulss 0xa0e(%rip),%xmm2,%xmm2 # 1410 <_sk_xor__hsw_8bit+0xd5> DB 197,250,44,194 ; vcvttss2si %xmm2,%eax DB 
197,249,110,208 ; vmovd %eax,%xmm2 DB 196,226,125,120,210 ; vpbroadcastb %xmm2,%ymm2 - DB 196,227,125,57,195,1 ; vextracti128 $0x1,%ymm0,%xmm3 - DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 + DB 196,226,125,48,216 ; vpmovzxbw %xmm0,%ymm3 + DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0 DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0 - DB 197,237,219,21,107,14,0,0 ; vpand 0xe6b(%rip),%ymm2,%ymm2 # 18e0 <_sk_xor__hsw_8bit+0x25b> + DB 197,237,219,21,25,11,0,0 ; vpand 0xb19(%rip),%ymm2,%ymm2 # 1540 <_sk_xor__hsw_8bit+0x205> DB 197,237,213,224 ; vpmullw %ymm0,%ymm2,%ymm4 DB 197,237,213,211 ; vpmullw %ymm3,%ymm2,%ymm2 DB 197,237,253,211 ; vpaddw %ymm3,%ymm2,%ymm2 DB 197,221,253,192 ; vpaddw %ymm0,%ymm4,%ymm0 DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0 DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2 - DB 196,227,125,57,211,1 ; vextracti128 $0x1,%ymm2,%xmm3 - DB 197,249,111,37,147,16,0,0 ; vmovdqa 0x1093(%rip),%xmm4 # 1b30 <_sk_xor__hsw_8bit+0x4ab> - DB 196,226,97,0,220 ; vpshufb %xmm4,%xmm3,%xmm3 - DB 196,226,105,0,212 ; vpshufb %xmm4,%xmm2,%xmm2 - DB 197,233,108,211 ; vpunpcklqdq %xmm3,%xmm2,%xmm2 - DB 196,227,125,57,195,1 ; vextracti128 $0x1,%ymm0,%xmm3 - DB 196,226,97,0,220 ; vpshufb %xmm4,%xmm3,%xmm3 - DB 196,226,121,0,196 ; vpshufb %xmm4,%xmm0,%xmm0 - DB 197,249,108,195 ; vpunpcklqdq %xmm3,%xmm0,%xmm0 - DB 196,227,125,56,194,1 ; vinserti128 $0x1,%xmm2,%ymm0,%ymm0 + DB 196,227,109,56,216,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm3 + DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + DB 197,229,103,192 ; vpackuswb %ymm0,%ymm3,%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax @@ -36923,16 +36900,16 @@ _sk_scale_u8_hsw_8bit LABEL PROC DB 72,15,175,209 ; imul %rcx,%rdx DB 72,3,16 ; add (%rax),%rdx DB 77,133,201 ; test %r9,%r9 - DB 15,133,144,0,0,0 ; jne b7a <_sk_scale_u8_hsw_8bit+0xb1> + DB 117,106 ; jne adc <_sk_scale_u8_hsw_8bit+0x87> DB 196,162,121,48,20,2 ; vpmovzxbw (%rdx,%r8,1),%xmm2 - DB 197,233,219,21,72,16,0,0 ; vpand 
0x1048(%rip),%xmm2,%xmm2 # 1b40 <_sk_xor__hsw_8bit+0x4bb> + DB 197,233,219,21,240,12,0,0 ; vpand 0xcf0(%rip),%xmm2,%xmm2 # 1770 <_sk_xor__hsw_8bit+0x435> DB 196,226,125,51,210 ; vpmovzxwd %xmm2,%ymm2 - DB 196,226,109,0,21,250,13,0,0 ; vpshufb 0xdfa(%rip),%ymm2,%ymm2 # 1900 <_sk_xor__hsw_8bit+0x27b> - DB 196,227,125,57,195,1 ; vextracti128 $0x1,%ymm0,%xmm3 - DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 + DB 196,226,109,0,21,210,10,0,0 ; vpshufb 0xad2(%rip),%ymm2,%ymm2 # 1560 <_sk_xor__hsw_8bit+0x225> + DB 196,226,125,48,216 ; vpmovzxbw %xmm0,%ymm3 + DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0 DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0 - DB 196,227,125,57,212,1 ; vextracti128 $0x1,%ymm2,%xmm4 - DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4 + DB 196,226,125,48,226 ; vpmovzxbw %xmm2,%ymm4 + DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2 DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2 DB 197,237,213,208 ; vpmullw %ymm0,%ymm2,%ymm2 DB 197,221,213,227 ; vpmullw %ymm3,%ymm4,%ymm4 @@ -36940,31 +36917,24 @@ _sk_scale_u8_hsw_8bit LABEL PROC DB 197,237,253,192 ; vpaddw %ymm0,%ymm2,%ymm0 DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0 DB 197,237,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm2 - DB 196,227,125,57,211,1 ; vextracti128 $0x1,%ymm2,%xmm3 - DB 197,249,111,37,2,16,0,0 ; vmovdqa 0x1002(%rip),%xmm4 # 1b50 <_sk_xor__hsw_8bit+0x4cb> - DB 196,226,97,0,220 ; vpshufb %xmm4,%xmm3,%xmm3 - DB 196,226,105,0,212 ; vpshufb %xmm4,%xmm2,%xmm2 - DB 197,233,108,211 ; vpunpcklqdq %xmm3,%xmm2,%xmm2 - DB 196,227,125,57,195,1 ; vextracti128 $0x1,%ymm0,%xmm3 - DB 196,226,97,0,220 ; vpshufb %xmm4,%xmm3,%xmm3 - DB 196,226,121,0,196 ; vpshufb %xmm4,%xmm0,%xmm0 - DB 197,249,108,195 ; vpunpcklqdq %xmm3,%xmm0,%xmm0 - DB 196,227,125,56,194,1 ; vinserti128 $0x1,%xmm2,%ymm0,%ymm0 + DB 196,227,109,56,216,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm3 + DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + DB 197,229,103,192 ; vpackuswb %ymm0,%ymm3,%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax 
DB 255,224 ; jmpq *%rax DB 65,128,225,7 ; and $0x7,%r9b DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2 DB 65,254,201 ; dec %r9b DB 65,128,249,6 ; cmp $0x6,%r9b - DB 15,135,97,255,255,255 ; ja af0 <_sk_scale_u8_hsw_8bit+0x27> + DB 119,139 ; ja a78 <_sk_scale_u8_hsw_8bit+0x23> DB 65,15,182,193 ; movzbl %r9b,%eax - DB 72,141,13,122,0,0,0 ; lea 0x7a(%rip),%rcx # c14 <_sk_scale_u8_hsw_8bit+0x14b> + DB 72,141,13,124,0,0,0 ; lea 0x7c(%rip),%rcx # b74 <_sk_scale_u8_hsw_8bit+0x11f> DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax DB 72,1,200 ; add %rcx,%rax DB 255,224 ; jmpq *%rax DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax DB 197,249,110,208 ; vmovd %eax,%xmm2 - DB 233,63,255,255,255 ; jmpq af0 <_sk_scale_u8_hsw_8bit+0x27> + DB 233,105,255,255,255 ; jmpq a78 <_sk_scale_u8_hsw_8bit+0x23> DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2 DB 197,233,196,208,2 ; vpinsrw $0x2,%eax,%xmm2,%xmm2 @@ -36972,7 +36942,7 @@ _sk_scale_u8_hsw_8bit LABEL PROC DB 197,249,110,216 ; vmovd %eax,%xmm3 DB 196,226,121,48,219 ; vpmovzxbw %xmm3,%xmm3 DB 196,227,105,2,211,1 ; vpblendd $0x1,%xmm3,%xmm2,%xmm2 - DB 233,23,255,255,255 ; jmpq af0 <_sk_scale_u8_hsw_8bit+0x27> + DB 233,65,255,255,255 ; jmpq a78 <_sk_scale_u8_hsw_8bit+0x23> DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2 DB 197,233,196,208,6 ; vpinsrw $0x6,%eax,%xmm2,%xmm2 @@ -36983,23 +36953,22 @@ _sk_scale_u8_hsw_8bit LABEL PROC DB 196,161,121,110,28,2 ; vmovd (%rdx,%r8,1),%xmm3 DB 196,226,121,48,219 ; vpmovzxbw %xmm3,%xmm3 DB 196,227,97,2,210,12 ; vpblendd $0xc,%xmm2,%xmm3,%xmm2 - DB 233,220,254,255,255 ; jmpq af0 <_sk_scale_u8_hsw_8bit+0x27> - DB 143 ; (bad) - DB 255 ; (bad) + DB 233,6,255,255,255 ; jmpq a78 <_sk_scale_u8_hsw_8bit+0x23> + DB 102,144 ; xchg %ax,%ax + DB 141 ; (bad) DB 255 ; (bad) - DB 255,172,255,255,255,157,255 ; ljmp *-0x620001(%rdi,%rdi,8) DB 255 ; (bad) + DB 255,170,255,255,255,155 ; ljmp *-0x64000001(%rdx) DB 255 ; (bad) - DB 234 ; 
(bad) DB 255 ; (bad) DB 255 ; (bad) + DB 232,255,255,255,221 ; callq ffffffffde000b84 <_sk_xor__hsw_8bit+0xffffffffddfff849> DB 255 ; (bad) - DB 223,255 ; (bad) DB 255 ; (bad) - DB 255,212 ; callq *%rsp + DB 255,210 ; callq *%rdx DB 255 ; (bad) DB 255 ; (bad) - DB 255,197 ; inc %ebp + DB 255,195 ; inc %ebx DB 255 ; (bad) DB 255 ; (bad) DB 255 ; .byte 0xff @@ -37008,54 +36977,41 @@ PUBLIC _sk_lerp_1_float_hsw_8bit _sk_lerp_1_float_hsw_8bit LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax DB 197,250,16,16 ; vmovss (%rax),%xmm2 - DB 197,234,89,21,98,11,0,0 ; vmulss 0xb62(%rip),%xmm2,%xmm2 # 17a0 <_sk_xor__hsw_8bit+0x11b> + DB 197,234,89,21,118,8,0,0 ; vmulss 0x876(%rip),%xmm2,%xmm2 # 1414 <_sk_xor__hsw_8bit+0xd9> DB 197,250,44,194 ; vcvttss2si %xmm2,%eax DB 197,249,110,208 ; vmovd %eax,%xmm2 DB 196,226,125,120,210 ; vpbroadcastb %xmm2,%ymm2 - DB 196,227,125,57,195,1 ; vextracti128 $0x1,%ymm0,%xmm3 - DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 + DB 196,226,125,48,216 ; vpmovzxbw %xmm0,%ymm3 + DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0 DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0 - DB 197,237,219,37,189,12,0,0 ; vpand 0xcbd(%rip),%ymm2,%ymm4 # 1920 <_sk_xor__hsw_8bit+0x29b> + DB 197,237,219,37,189,9,0,0 ; vpand 0x9bd(%rip),%ymm2,%ymm4 # 1580 <_sk_xor__hsw_8bit+0x245> DB 197,221,213,232 ; vpmullw %ymm0,%ymm4,%ymm5 DB 197,221,213,227 ; vpmullw %ymm3,%ymm4,%ymm4 DB 197,221,253,219 ; vpaddw %ymm3,%ymm4,%ymm3 DB 197,213,253,192 ; vpaddw %ymm0,%ymm5,%ymm0 - DB 197,221,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm4 + DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0 DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3 - DB 196,227,125,57,221,1 ; vextracti128 $0x1,%ymm3,%xmm5 - DB 197,249,111,5,213,14,0,0 ; vmovdqa 0xed5(%rip),%xmm0 # 1b60 <_sk_xor__hsw_8bit+0x4db> - DB 196,226,81,0,232 ; vpshufb %xmm0,%xmm5,%xmm5 - DB 196,226,97,0,216 ; vpshufb %xmm0,%xmm3,%xmm3 - DB 197,225,108,221 ; vpunpcklqdq %xmm5,%xmm3,%xmm3 - DB 196,227,125,57,229,1 ; vextracti128 $0x1,%ymm4,%xmm5 - DB 
196,226,81,0,232 ; vpshufb %xmm0,%xmm5,%xmm5 - DB 196,226,89,0,224 ; vpshufb %xmm0,%xmm4,%xmm4 - DB 197,217,108,229 ; vpunpcklqdq %xmm5,%xmm4,%xmm4 - DB 196,227,93,56,219,1 ; vinserti128 $0x1,%xmm3,%ymm4,%ymm3 - DB 197,221,118,228 ; vpcmpeqd %ymm4,%ymm4,%ymm4 - DB 197,237,239,212 ; vpxor %ymm4,%ymm2,%ymm2 + DB 196,227,101,56,224,1 ; vinserti128 $0x1,%xmm0,%ymm3,%ymm4 + DB 196,227,101,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm3,%ymm0 + DB 197,221,103,192 ; vpackuswb %ymm0,%ymm4,%ymm0 + DB 197,229,118,219 ; vpcmpeqd %ymm3,%ymm3,%ymm3 + DB 197,237,239,211 ; vpxor %ymm3,%ymm2,%ymm2 + DB 196,226,125,48,217 ; vpmovzxbw %xmm1,%ymm3 DB 196,227,125,57,204,1 ; vextracti128 $0x1,%ymm1,%xmm4 DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4 - DB 196,226,125,48,233 ; vpmovzxbw %xmm1,%ymm5 - DB 196,227,125,57,214,1 ; vextracti128 $0x1,%ymm2,%xmm6 - DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6 + DB 196,226,125,48,234 ; vpmovzxbw %xmm2,%ymm5 + DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2 DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2 - DB 197,237,213,213 ; vpmullw %ymm5,%ymm2,%ymm2 - DB 197,205,213,244 ; vpmullw %ymm4,%ymm6,%ymm6 - DB 197,205,253,228 ; vpaddw %ymm4,%ymm6,%ymm4 - DB 197,237,253,213 ; vpaddw %ymm5,%ymm2,%ymm2 + DB 197,237,213,212 ; vpmullw %ymm4,%ymm2,%ymm2 + DB 197,213,213,235 ; vpmullw %ymm3,%ymm5,%ymm5 + DB 197,213,253,219 ; vpaddw %ymm3,%ymm5,%ymm3 + DB 197,237,253,212 ; vpaddw %ymm4,%ymm2,%ymm2 DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2 - DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4 - DB 196,227,125,57,229,1 ; vextracti128 $0x1,%ymm4,%xmm5 - DB 196,226,81,0,232 ; vpshufb %xmm0,%xmm5,%xmm5 - DB 196,226,89,0,224 ; vpshufb %xmm0,%xmm4,%xmm4 - DB 197,217,108,229 ; vpunpcklqdq %xmm5,%xmm4,%xmm4 - DB 196,227,125,57,213,1 ; vextracti128 $0x1,%ymm2,%xmm5 - DB 196,226,81,0,232 ; vpshufb %xmm0,%xmm5,%xmm5 - DB 196,226,105,0,192 ; vpshufb %xmm0,%xmm2,%xmm0 - DB 197,249,108,197 ; vpunpcklqdq %xmm5,%xmm0,%xmm0 - DB 196,227,125,56,196,1 ; vinserti128 
$0x1,%xmm4,%ymm0,%ymm0 - DB 197,253,252,195 ; vpaddb %ymm3,%ymm0,%ymm0 + DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3 + DB 196,227,101,56,226,1 ; vinserti128 $0x1,%xmm2,%ymm3,%ymm4 + DB 196,227,101,70,210,49 ; vperm2i128 $0x31,%ymm2,%ymm3,%ymm2 + DB 197,221,103,210 ; vpackuswb %ymm2,%ymm4,%ymm2 + DB 197,237,252,192 ; vpaddb %ymm0,%ymm2,%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax @@ -37069,72 +37025,59 @@ _sk_lerp_u8_hsw_8bit LABEL PROC DB 72,15,175,209 ; imul %rcx,%rdx DB 72,3,16 ; add (%rax),%rdx DB 77,133,201 ; test %r9,%r9 - DB 15,133,4,1,0,0 ; jne e50 <_sk_lerp_u8_hsw_8bit+0x125> + DB 15,133,192,0,0,0 ; jne d28 <_sk_lerp_u8_hsw_8bit+0xe1> DB 196,162,121,48,20,2 ; vpmovzxbw (%rdx,%r8,1),%xmm2 - DB 197,233,219,21,22,14,0,0 ; vpand 0xe16(%rip),%xmm2,%xmm2 # 1b70 <_sk_xor__hsw_8bit+0x4eb> + DB 197,233,219,21,10,11,0,0 ; vpand 0xb0a(%rip),%xmm2,%xmm2 # 1780 <_sk_xor__hsw_8bit+0x445> DB 196,226,125,51,210 ; vpmovzxwd %xmm2,%ymm2 - DB 196,226,109,0,21,216,11,0,0 ; vpshufb 0xbd8(%rip),%ymm2,%ymm2 # 1940 <_sk_xor__hsw_8bit+0x2bb> - DB 196,227,125,57,195,1 ; vextracti128 $0x1,%ymm0,%xmm3 - DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 + DB 196,226,109,0,21,28,9,0,0 ; vpshufb 0x91c(%rip),%ymm2,%ymm2 # 15a0 <_sk_xor__hsw_8bit+0x265> + DB 196,226,125,48,216 ; vpmovzxbw %xmm0,%ymm3 + DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0 DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0 - DB 196,227,125,57,212,1 ; vextracti128 $0x1,%ymm2,%xmm4 - DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4 - DB 196,226,125,48,234 ; vpmovzxbw %xmm2,%ymm5 + DB 196,226,125,48,226 ; vpmovzxbw %xmm2,%ymm4 + DB 196,227,125,57,213,1 ; vextracti128 $0x1,%ymm2,%xmm5 + DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5 DB 197,213,213,232 ; vpmullw %ymm0,%ymm5,%ymm5 DB 197,221,213,227 ; vpmullw %ymm3,%ymm4,%ymm4 DB 197,221,253,219 ; vpaddw %ymm3,%ymm4,%ymm3 DB 197,213,253,192 ; vpaddw %ymm0,%ymm5,%ymm0 - DB 197,221,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm4 + DB 197,253,113,208,8 ; vpsrlw 
$0x8,%ymm0,%ymm0 DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3 - DB 196,227,125,57,221,1 ; vextracti128 $0x1,%ymm3,%xmm5 - DB 197,249,111,5,208,13,0,0 ; vmovdqa 0xdd0(%rip),%xmm0 # 1b80 <_sk_xor__hsw_8bit+0x4fb> - DB 196,226,81,0,232 ; vpshufb %xmm0,%xmm5,%xmm5 - DB 196,226,97,0,216 ; vpshufb %xmm0,%xmm3,%xmm3 - DB 197,225,108,221 ; vpunpcklqdq %xmm5,%xmm3,%xmm3 - DB 196,227,125,57,229,1 ; vextracti128 $0x1,%ymm4,%xmm5 - DB 196,226,81,0,232 ; vpshufb %xmm0,%xmm5,%xmm5 - DB 196,226,89,0,224 ; vpshufb %xmm0,%xmm4,%xmm4 - DB 197,217,108,229 ; vpunpcklqdq %xmm5,%xmm4,%xmm4 - DB 196,227,93,56,219,1 ; vinserti128 $0x1,%xmm3,%ymm4,%ymm3 - DB 197,221,118,228 ; vpcmpeqd %ymm4,%ymm4,%ymm4 - DB 197,237,239,212 ; vpxor %ymm4,%ymm2,%ymm2 + DB 196,227,101,56,224,1 ; vinserti128 $0x1,%xmm0,%ymm3,%ymm4 + DB 196,227,101,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm3,%ymm0 + DB 197,221,103,192 ; vpackuswb %ymm0,%ymm4,%ymm0 + DB 197,229,118,219 ; vpcmpeqd %ymm3,%ymm3,%ymm3 + DB 197,237,239,211 ; vpxor %ymm3,%ymm2,%ymm2 + DB 196,226,125,48,217 ; vpmovzxbw %xmm1,%ymm3 DB 196,227,125,57,204,1 ; vextracti128 $0x1,%ymm1,%xmm4 DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4 - DB 196,226,125,48,233 ; vpmovzxbw %xmm1,%ymm5 - DB 196,227,125,57,214,1 ; vextracti128 $0x1,%ymm2,%xmm6 - DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6 + DB 196,226,125,48,234 ; vpmovzxbw %xmm2,%ymm5 + DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2 DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2 - DB 197,237,213,213 ; vpmullw %ymm5,%ymm2,%ymm2 - DB 197,205,213,244 ; vpmullw %ymm4,%ymm6,%ymm6 - DB 197,205,253,228 ; vpaddw %ymm4,%ymm6,%ymm4 - DB 197,237,253,213 ; vpaddw %ymm5,%ymm2,%ymm2 + DB 197,237,213,212 ; vpmullw %ymm4,%ymm2,%ymm2 + DB 197,213,213,235 ; vpmullw %ymm3,%ymm5,%ymm5 + DB 197,213,253,219 ; vpaddw %ymm3,%ymm5,%ymm3 + DB 197,237,253,212 ; vpaddw %ymm4,%ymm2,%ymm2 DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2 - DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4 - DB 196,227,125,57,229,1 ; vextracti128 
$0x1,%ymm4,%xmm5 - DB 196,226,81,0,232 ; vpshufb %xmm0,%xmm5,%xmm5 - DB 196,226,89,0,224 ; vpshufb %xmm0,%xmm4,%xmm4 - DB 197,217,108,229 ; vpunpcklqdq %xmm5,%xmm4,%xmm4 - DB 196,227,125,57,213,1 ; vextracti128 $0x1,%ymm2,%xmm5 - DB 196,226,81,0,232 ; vpshufb %xmm0,%xmm5,%xmm5 - DB 196,226,105,0,192 ; vpshufb %xmm0,%xmm2,%xmm0 - DB 197,249,108,197 ; vpunpcklqdq %xmm5,%xmm0,%xmm0 - DB 196,227,125,56,196,1 ; vinserti128 $0x1,%xmm4,%ymm0,%ymm0 - DB 197,253,252,195 ; vpaddb %ymm3,%ymm0,%ymm0 + DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3 + DB 196,227,101,56,226,1 ; vinserti128 $0x1,%xmm2,%ymm3,%ymm4 + DB 196,227,101,70,210,49 ; vperm2i128 $0x31,%ymm2,%ymm3,%ymm2 + DB 197,221,103,210 ; vpackuswb %ymm2,%ymm4,%ymm2 + DB 197,237,252,192 ; vpaddb %ymm0,%ymm2,%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax DB 65,128,225,7 ; and $0x7,%r9b DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2 DB 65,254,201 ; dec %r9b DB 65,128,249,6 ; cmp $0x6,%r9b - DB 15,135,237,254,255,255 ; ja d52 <_sk_lerp_u8_hsw_8bit+0x27> + DB 15,135,49,255,255,255 ; ja c6e <_sk_lerp_u8_hsw_8bit+0x27> DB 65,15,182,193 ; movzbl %r9b,%eax - DB 72,141,13,124,0,0,0 ; lea 0x7c(%rip),%rcx # eec <_sk_lerp_u8_hsw_8bit+0x1c1> + DB 72,141,13,124,0,0,0 ; lea 0x7c(%rip),%rcx # dc4 <_sk_lerp_u8_hsw_8bit+0x17d> DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax DB 72,1,200 ; add %rcx,%rax DB 255,224 ; jmpq *%rax DB 66,15,182,4,2 ; movzbl (%rdx,%r8,1),%eax DB 197,249,110,208 ; vmovd %eax,%xmm2 - DB 233,203,254,255,255 ; jmpq d52 <_sk_lerp_u8_hsw_8bit+0x27> + DB 233,15,255,255,255 ; jmpq c6e <_sk_lerp_u8_hsw_8bit+0x27> DB 66,15,182,68,2,2 ; movzbl 0x2(%rdx,%r8,1),%eax DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2 DB 197,233,196,208,2 ; vpinsrw $0x2,%eax,%xmm2,%xmm2 @@ -37142,7 +37085,7 @@ _sk_lerp_u8_hsw_8bit LABEL PROC DB 197,249,110,216 ; vmovd %eax,%xmm3 DB 196,226,121,48,219 ; vpmovzxbw %xmm3,%xmm3 DB 196,227,105,2,211,1 ; vpblendd $0x1,%xmm3,%xmm2,%xmm2 - DB 233,163,254,255,255 ; jmpq d52 
<_sk_lerp_u8_hsw_8bit+0x27> + DB 233,231,254,255,255 ; jmpq c6e <_sk_lerp_u8_hsw_8bit+0x27> DB 66,15,182,68,2,6 ; movzbl 0x6(%rdx,%r8,1),%eax DB 197,233,239,210 ; vpxor %xmm2,%xmm2,%xmm2 DB 197,233,196,208,6 ; vpinsrw $0x6,%eax,%xmm2,%xmm2 @@ -37153,7 +37096,7 @@ _sk_lerp_u8_hsw_8bit LABEL PROC DB 196,161,121,110,28,2 ; vmovd (%rdx,%r8,1),%xmm3 DB 196,226,121,48,219 ; vpmovzxbw %xmm3,%xmm3 DB 196,227,97,2,210,12 ; vpblendd $0xc,%xmm2,%xmm3,%xmm2 - DB 233,104,254,255,255 ; jmpq d52 <_sk_lerp_u8_hsw_8bit+0x27> + DB 233,172,254,255,255 ; jmpq c6e <_sk_lerp_u8_hsw_8bit+0x27> DB 102,144 ; xchg %ax,%ax DB 141 ; (bad) DB 255 ; (bad) @@ -37162,7 +37105,7 @@ _sk_lerp_u8_hsw_8bit LABEL PROC DB 255 ; (bad) DB 255 ; (bad) DB 255 ; (bad) - DB 232,255,255,255,221 ; callq ffffffffde000efc <_sk_xor__hsw_8bit+0xffffffffddfff877> + DB 232,255,255,255,221 ; callq ffffffffde000dd4 <_sk_xor__hsw_8bit+0xffffffffddfffa99> DB 255 ; (bad) DB 255 ; (bad) DB 255,210 ; callq *%rdx @@ -37188,7 +37131,7 @@ _sk_move_dst_src_hsw_8bit LABEL PROC PUBLIC _sk_black_color_hsw_8bit _sk_black_color_hsw_8bit LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax - DB 196,226,125,25,5,45,13,0,0 ; vbroadcastsd 0xd2d(%rip),%ymm0 # 1c50 <_sk_xor__hsw_8bit+0x5cb> + DB 196,226,125,25,5,149,9,0,0 ; vbroadcastsd 0x995(%rip),%ymm0 # 1790 <_sk_xor__hsw_8bit+0x455> DB 255,224 ; jmpq *%rax PUBLIC _sk_white_color_hsw_8bit @@ -37205,120 +37148,94 @@ _sk_clear_hsw_8bit LABEL PROC PUBLIC _sk_srcatop_hsw_8bit _sk_srcatop_hsw_8bit LABEL PROC - DB 197,253,111,29,35,10,0,0 ; vmovdqa 0xa23(%rip),%ymm3 # 1960 <_sk_xor__hsw_8bit+0x2db> - DB 196,226,117,0,211 ; vpshufb %ymm3,%ymm1,%ymm2 - DB 196,227,125,57,196,1 ; vextracti128 $0x1,%ymm0,%xmm4 - DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4 - DB 196,226,125,48,232 ; vpmovzxbw %xmm0,%ymm5 - DB 196,227,125,57,214,1 ; vextracti128 $0x1,%ymm2,%xmm6 - DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6 - DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2 - DB 197,237,213,213 ; vpmullw %ymm5,%ymm2,%ymm2 + 
DB 197,253,111,21,171,7,0,0 ; vmovdqa 0x7ab(%rip),%ymm2 # 15c0 <_sk_xor__hsw_8bit+0x285> + DB 196,226,117,0,218 ; vpshufb %ymm2,%ymm1,%ymm3 + DB 196,226,125,48,224 ; vpmovzxbw %xmm0,%ymm4 + DB 196,227,125,57,197,1 ; vextracti128 $0x1,%ymm0,%xmm5 + DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5 + DB 196,226,125,48,243 ; vpmovzxbw %xmm3,%ymm6 + DB 196,227,125,57,219,1 ; vextracti128 $0x1,%ymm3,%xmm3 + DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 + DB 197,229,213,221 ; vpmullw %ymm5,%ymm3,%ymm3 DB 197,205,213,244 ; vpmullw %ymm4,%ymm6,%ymm6 DB 197,205,253,228 ; vpaddw %ymm4,%ymm6,%ymm4 - DB 197,237,253,213 ; vpaddw %ymm5,%ymm2,%ymm2 - DB 197,213,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm5 + DB 197,229,253,221 ; vpaddw %ymm5,%ymm3,%ymm3 + DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3 DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4 - DB 196,227,125,57,230,1 ; vextracti128 $0x1,%ymm4,%xmm6 - DB 197,249,111,21,6,12,0,0 ; vmovdqa 0xc06(%rip),%xmm2 # 1b90 <_sk_xor__hsw_8bit+0x50b> - DB 196,226,73,0,242 ; vpshufb %xmm2,%xmm6,%xmm6 - DB 196,226,89,0,226 ; vpshufb %xmm2,%xmm4,%xmm4 - DB 197,217,108,230 ; vpunpcklqdq %xmm6,%xmm4,%xmm4 - DB 196,227,125,57,238,1 ; vextracti128 $0x1,%ymm5,%xmm6 - DB 196,226,73,0,242 ; vpshufb %xmm2,%xmm6,%xmm6 - DB 196,226,81,0,234 ; vpshufb %xmm2,%xmm5,%xmm5 - DB 197,209,108,238 ; vpunpcklqdq %xmm6,%xmm5,%xmm5 - DB 196,227,85,56,228,1 ; vinserti128 $0x1,%xmm4,%ymm5,%ymm4 - DB 196,226,125,0,195 ; vpshufb %ymm3,%ymm0,%ymm0 - DB 197,229,118,219 ; vpcmpeqd %ymm3,%ymm3,%ymm3 - DB 197,253,239,195 ; vpxor %ymm3,%ymm0,%ymm0 - DB 196,227,125,57,203,1 ; vextracti128 $0x1,%ymm1,%xmm3 - DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 - DB 196,226,125,48,233 ; vpmovzxbw %xmm1,%ymm5 - DB 196,227,125,57,198,1 ; vextracti128 $0x1,%ymm0,%xmm6 - DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6 + DB 196,227,93,56,235,1 ; vinserti128 $0x1,%xmm3,%ymm4,%ymm5 + DB 196,227,93,70,219,49 ; vperm2i128 $0x31,%ymm3,%ymm4,%ymm3 + DB 197,213,103,219 ; vpackuswb %ymm3,%ymm5,%ymm3 + DB 
196,226,125,0,194 ; vpshufb %ymm2,%ymm0,%ymm0 + DB 197,237,118,210 ; vpcmpeqd %ymm2,%ymm2,%ymm2 + DB 197,253,239,194 ; vpxor %ymm2,%ymm0,%ymm0 + DB 196,226,125,48,209 ; vpmovzxbw %xmm1,%ymm2 + DB 196,227,125,57,204,1 ; vextracti128 $0x1,%ymm1,%xmm4 + DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4 + DB 196,226,125,48,232 ; vpmovzxbw %xmm0,%ymm5 + DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0 DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0 - DB 197,253,213,197 ; vpmullw %ymm5,%ymm0,%ymm0 - DB 197,205,213,243 ; vpmullw %ymm3,%ymm6,%ymm6 - DB 197,205,253,219 ; vpaddw %ymm3,%ymm6,%ymm3 - DB 197,253,253,197 ; vpaddw %ymm5,%ymm0,%ymm0 + DB 197,253,213,196 ; vpmullw %ymm4,%ymm0,%ymm0 + DB 197,213,213,234 ; vpmullw %ymm2,%ymm5,%ymm5 + DB 197,213,253,210 ; vpaddw %ymm2,%ymm5,%ymm2 + DB 197,253,253,196 ; vpaddw %ymm4,%ymm0,%ymm0 DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0 - DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3 - DB 196,227,125,57,221,1 ; vextracti128 $0x1,%ymm3,%xmm5 - DB 196,226,81,0,234 ; vpshufb %xmm2,%xmm5,%xmm5 - DB 196,226,97,0,218 ; vpshufb %xmm2,%xmm3,%xmm3 - DB 197,225,108,221 ; vpunpcklqdq %xmm5,%xmm3,%xmm3 - DB 196,227,125,57,197,1 ; vextracti128 $0x1,%ymm0,%xmm5 - DB 196,226,81,0,234 ; vpshufb %xmm2,%xmm5,%xmm5 - DB 196,226,121,0,194 ; vpshufb %xmm2,%xmm0,%xmm0 - DB 197,249,108,197 ; vpunpcklqdq %xmm5,%xmm0,%xmm0 - DB 196,227,125,56,195,1 ; vinserti128 $0x1,%xmm3,%ymm0,%ymm0 - DB 197,221,252,192 ; vpaddb %ymm0,%ymm4,%ymm0 + DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2 + DB 196,227,109,56,224,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm4 + DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + DB 197,221,103,192 ; vpackuswb %ymm0,%ymm4,%ymm0 + DB 197,253,252,195 ; vpaddb %ymm3,%ymm0,%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax PUBLIC _sk_dstatop_hsw_8bit _sk_dstatop_hsw_8bit LABEL PROC - DB 197,253,111,29,73,9,0,0 ; vmovdqa 0x949(%rip),%ymm3 # 1980 <_sk_xor__hsw_8bit+0x2fb> - DB 196,226,125,0,211 ; vpshufb %ymm3,%ymm0,%ymm2 
- DB 196,227,125,57,204,1 ; vextracti128 $0x1,%ymm1,%xmm4 - DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4 - DB 196,226,125,48,233 ; vpmovzxbw %xmm1,%ymm5 - DB 196,227,125,57,214,1 ; vextracti128 $0x1,%ymm2,%xmm6 - DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6 - DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2 - DB 197,237,213,213 ; vpmullw %ymm5,%ymm2,%ymm2 + DB 197,253,111,21,21,7,0,0 ; vmovdqa 0x715(%rip),%ymm2 # 15e0 <_sk_xor__hsw_8bit+0x2a5> + DB 196,226,125,0,218 ; vpshufb %ymm2,%ymm0,%ymm3 + DB 196,226,125,48,225 ; vpmovzxbw %xmm1,%ymm4 + DB 196,227,125,57,205,1 ; vextracti128 $0x1,%ymm1,%xmm5 + DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5 + DB 196,226,125,48,243 ; vpmovzxbw %xmm3,%ymm6 + DB 196,227,125,57,219,1 ; vextracti128 $0x1,%ymm3,%xmm3 + DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 + DB 197,229,213,221 ; vpmullw %ymm5,%ymm3,%ymm3 DB 197,205,213,244 ; vpmullw %ymm4,%ymm6,%ymm6 DB 197,205,253,228 ; vpaddw %ymm4,%ymm6,%ymm4 - DB 197,237,253,213 ; vpaddw %ymm5,%ymm2,%ymm2 - DB 197,213,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm5 + DB 197,229,253,221 ; vpaddw %ymm5,%ymm3,%ymm3 + DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3 DB 197,221,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm4 - DB 196,227,125,57,230,1 ; vextracti128 $0x1,%ymm4,%xmm6 - DB 197,249,111,21,28,11,0,0 ; vmovdqa 0xb1c(%rip),%xmm2 # 1ba0 <_sk_xor__hsw_8bit+0x51b> - DB 196,226,73,0,242 ; vpshufb %xmm2,%xmm6,%xmm6 - DB 196,226,89,0,226 ; vpshufb %xmm2,%xmm4,%xmm4 - DB 197,217,108,230 ; vpunpcklqdq %xmm6,%xmm4,%xmm4 - DB 196,227,125,57,238,1 ; vextracti128 $0x1,%ymm5,%xmm6 - DB 196,226,73,0,242 ; vpshufb %xmm2,%xmm6,%xmm6 - DB 196,226,81,0,234 ; vpshufb %xmm2,%xmm5,%xmm5 - DB 197,209,108,238 ; vpunpcklqdq %xmm6,%xmm5,%xmm5 - DB 196,227,85,56,228,1 ; vinserti128 $0x1,%xmm4,%ymm5,%ymm4 - DB 196,226,117,0,219 ; vpshufb %ymm3,%ymm1,%ymm3 - DB 197,213,118,237 ; vpcmpeqd %ymm5,%ymm5,%ymm5 - DB 197,229,239,221 ; vpxor %ymm5,%ymm3,%ymm3 - DB 196,227,125,57,197,1 ; vextracti128 $0x1,%ymm0,%xmm5 - DB 196,226,125,48,237 ; 
vpmovzxbw %xmm5,%ymm5 + DB 196,227,93,56,235,1 ; vinserti128 $0x1,%xmm3,%ymm4,%ymm5 + DB 196,227,93,70,219,49 ; vperm2i128 $0x31,%ymm3,%ymm4,%ymm3 + DB 197,213,103,219 ; vpackuswb %ymm3,%ymm5,%ymm3 + DB 196,226,117,0,210 ; vpshufb %ymm2,%ymm1,%ymm2 + DB 197,221,118,228 ; vpcmpeqd %ymm4,%ymm4,%ymm4 + DB 197,237,239,212 ; vpxor %ymm4,%ymm2,%ymm2 + DB 196,226,125,48,224 ; vpmovzxbw %xmm0,%ymm4 + DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0 DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0 - DB 196,227,125,57,222,1 ; vextracti128 $0x1,%ymm3,%xmm6 - DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6 - DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 - DB 197,229,213,216 ; vpmullw %ymm0,%ymm3,%ymm3 - DB 197,205,213,245 ; vpmullw %ymm5,%ymm6,%ymm6 - DB 197,205,253,237 ; vpaddw %ymm5,%ymm6,%ymm5 - DB 197,229,253,192 ; vpaddw %ymm0,%ymm3,%ymm0 + DB 196,226,125,48,234 ; vpmovzxbw %xmm2,%ymm5 + DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2 + DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2 + DB 197,237,213,208 ; vpmullw %ymm0,%ymm2,%ymm2 + DB 197,213,213,236 ; vpmullw %ymm4,%ymm5,%ymm5 + DB 197,213,253,228 ; vpaddw %ymm4,%ymm5,%ymm4 + DB 197,237,253,192 ; vpaddw %ymm0,%ymm2,%ymm0 DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0 - DB 197,229,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm3 - DB 196,227,125,57,221,1 ; vextracti128 $0x1,%ymm3,%xmm5 - DB 196,226,81,0,234 ; vpshufb %xmm2,%xmm5,%xmm5 - DB 196,226,97,0,218 ; vpshufb %xmm2,%xmm3,%xmm3 - DB 197,225,108,221 ; vpunpcklqdq %xmm5,%xmm3,%xmm3 - DB 196,227,125,57,197,1 ; vextracti128 $0x1,%ymm0,%xmm5 - DB 196,226,81,0,234 ; vpshufb %xmm2,%xmm5,%xmm5 - DB 196,226,121,0,194 ; vpshufb %xmm2,%xmm0,%xmm0 - DB 197,249,108,197 ; vpunpcklqdq %xmm5,%xmm0,%xmm0 - DB 196,227,125,56,195,1 ; vinserti128 $0x1,%xmm3,%ymm0,%ymm0 - DB 197,253,252,196 ; vpaddb %ymm4,%ymm0,%ymm0 + DB 197,237,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm2 + DB 196,227,109,56,224,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm4 + DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 
+ DB 197,221,103,192 ; vpackuswb %ymm0,%ymm4,%ymm0 + DB 197,253,252,195 ; vpaddb %ymm3,%ymm0,%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax PUBLIC _sk_srcin_hsw_8bit _sk_srcin_hsw_8bit LABEL PROC - DB 196,226,117,0,21,110,8,0,0 ; vpshufb 0x86e(%rip),%ymm1,%ymm2 # 19a0 <_sk_xor__hsw_8bit+0x31b> - DB 196,227,125,57,195,1 ; vextracti128 $0x1,%ymm0,%xmm3 - DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 + DB 196,226,117,0,21,126,6,0,0 ; vpshufb 0x67e(%rip),%ymm1,%ymm2 # 1600 <_sk_xor__hsw_8bit+0x2c5> + DB 196,226,125,48,216 ; vpmovzxbw %xmm0,%ymm3 + DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0 DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0 - DB 196,227,125,57,212,1 ; vextracti128 $0x1,%ymm2,%xmm4 - DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4 + DB 196,226,125,48,226 ; vpmovzxbw %xmm2,%ymm4 + DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2 DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2 DB 197,237,213,208 ; vpmullw %ymm0,%ymm2,%ymm2 DB 197,221,213,227 ; vpmullw %ymm3,%ymm4,%ymm4 @@ -37326,27 +37243,20 @@ _sk_srcin_hsw_8bit LABEL PROC DB 197,237,253,192 ; vpaddw %ymm0,%ymm2,%ymm0 DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0 DB 197,237,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm2 - DB 196,227,125,57,211,1 ; vextracti128 $0x1,%ymm2,%xmm3 - DB 197,249,111,37,54,10,0,0 ; vmovdqa 0xa36(%rip),%xmm4 # 1bb0 <_sk_xor__hsw_8bit+0x52b> - DB 196,226,97,0,220 ; vpshufb %xmm4,%xmm3,%xmm3 - DB 196,226,105,0,212 ; vpshufb %xmm4,%xmm2,%xmm2 - DB 197,233,108,211 ; vpunpcklqdq %xmm3,%xmm2,%xmm2 - DB 196,227,125,57,195,1 ; vextracti128 $0x1,%ymm0,%xmm3 - DB 196,226,97,0,220 ; vpshufb %xmm4,%xmm3,%xmm3 - DB 196,226,121,0,196 ; vpshufb %xmm4,%xmm0,%xmm0 - DB 197,249,108,195 ; vpunpcklqdq %xmm3,%xmm0,%xmm0 - DB 196,227,125,56,194,1 ; vinserti128 $0x1,%xmm2,%ymm0,%ymm0 + DB 196,227,109,56,216,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm3 + DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + DB 197,229,103,192 ; vpackuswb %ymm0,%ymm3,%ymm0 DB 72,173 ; lods 
%ds:(%rsi),%rax DB 255,224 ; jmpq *%rax PUBLIC _sk_dstin_hsw_8bit _sk_dstin_hsw_8bit LABEL PROC - DB 196,226,125,0,5,17,8,0,0 ; vpshufb 0x811(%rip),%ymm0,%ymm0 # 19c0 <_sk_xor__hsw_8bit+0x33b> - DB 196,227,125,57,202,1 ; vextracti128 $0x1,%ymm1,%xmm2 - DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2 - DB 196,226,125,48,217 ; vpmovzxbw %xmm1,%ymm3 - DB 196,227,125,57,196,1 ; vextracti128 $0x1,%ymm0,%xmm4 - DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4 + DB 196,226,125,0,5,71,6,0,0 ; vpshufb 0x647(%rip),%ymm0,%ymm0 # 1620 <_sk_xor__hsw_8bit+0x2e5> + DB 196,226,125,48,209 ; vpmovzxbw %xmm1,%ymm2 + DB 196,227,125,57,203,1 ; vextracti128 $0x1,%ymm1,%xmm3 + DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 + DB 196,226,125,48,224 ; vpmovzxbw %xmm0,%ymm4 + DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0 DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0 DB 197,253,213,195 ; vpmullw %ymm3,%ymm0,%ymm0 DB 197,221,213,226 ; vpmullw %ymm2,%ymm4,%ymm4 @@ -37354,29 +37264,22 @@ _sk_dstin_hsw_8bit LABEL PROC DB 197,253,253,195 ; vpaddw %ymm3,%ymm0,%ymm0 DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0 DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2 - DB 196,227,125,57,211,1 ; vextracti128 $0x1,%ymm2,%xmm3 - DB 197,249,111,37,201,9,0,0 ; vmovdqa 0x9c9(%rip),%xmm4 # 1bc0 <_sk_xor__hsw_8bit+0x53b> - DB 196,226,97,0,220 ; vpshufb %xmm4,%xmm3,%xmm3 - DB 196,226,105,0,212 ; vpshufb %xmm4,%xmm2,%xmm2 - DB 197,233,108,211 ; vpunpcklqdq %xmm3,%xmm2,%xmm2 - DB 196,227,125,57,195,1 ; vextracti128 $0x1,%ymm0,%xmm3 - DB 196,226,97,0,220 ; vpshufb %xmm4,%xmm3,%xmm3 - DB 196,226,121,0,196 ; vpshufb %xmm4,%xmm0,%xmm0 - DB 197,249,108,195 ; vpunpcklqdq %xmm3,%xmm0,%xmm0 - DB 196,227,125,56,194,1 ; vinserti128 $0x1,%xmm2,%ymm0,%ymm0 + DB 196,227,109,56,216,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm3 + DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + DB 197,229,103,192 ; vpackuswb %ymm0,%ymm3,%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax PUBLIC _sk_srcout_hsw_8bit 
_sk_srcout_hsw_8bit LABEL PROC - DB 196,226,117,0,21,180,7,0,0 ; vpshufb 0x7b4(%rip),%ymm1,%ymm2 # 19e0 <_sk_xor__hsw_8bit+0x35b> + DB 196,226,117,0,21,16,6,0,0 ; vpshufb 0x610(%rip),%ymm1,%ymm2 # 1640 <_sk_xor__hsw_8bit+0x305> DB 197,229,118,219 ; vpcmpeqd %ymm3,%ymm3,%ymm3 DB 197,237,239,211 ; vpxor %ymm3,%ymm2,%ymm2 - DB 196,227,125,57,195,1 ; vextracti128 $0x1,%ymm0,%xmm3 - DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 + DB 196,226,125,48,216 ; vpmovzxbw %xmm0,%ymm3 + DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0 DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0 - DB 196,227,125,57,212,1 ; vextracti128 $0x1,%ymm2,%xmm4 - DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4 + DB 196,226,125,48,226 ; vpmovzxbw %xmm2,%ymm4 + DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2 DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2 DB 197,237,213,208 ; vpmullw %ymm0,%ymm2,%ymm2 DB 197,221,213,227 ; vpmullw %ymm3,%ymm4,%ymm4 @@ -37384,29 +37287,22 @@ _sk_srcout_hsw_8bit LABEL PROC DB 197,237,253,192 ; vpaddw %ymm0,%ymm2,%ymm0 DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0 DB 197,237,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm2 - DB 196,227,125,57,211,1 ; vextracti128 $0x1,%ymm2,%xmm3 - DB 197,249,111,37,84,9,0,0 ; vmovdqa 0x954(%rip),%xmm4 # 1bd0 <_sk_xor__hsw_8bit+0x54b> - DB 196,226,97,0,220 ; vpshufb %xmm4,%xmm3,%xmm3 - DB 196,226,105,0,212 ; vpshufb %xmm4,%xmm2,%xmm2 - DB 197,233,108,211 ; vpunpcklqdq %xmm3,%xmm2,%xmm2 - DB 196,227,125,57,195,1 ; vextracti128 $0x1,%ymm0,%xmm3 - DB 196,226,97,0,220 ; vpshufb %xmm4,%xmm3,%xmm3 - DB 196,226,121,0,196 ; vpshufb %xmm4,%xmm0,%xmm0 - DB 197,249,108,195 ; vpunpcklqdq %xmm3,%xmm0,%xmm0 - DB 196,227,125,56,194,1 ; vinserti128 $0x1,%xmm2,%ymm0,%ymm0 + DB 196,227,109,56,216,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm3 + DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + DB 197,229,103,192 ; vpackuswb %ymm0,%ymm3,%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax PUBLIC _sk_dstout_hsw_8bit _sk_dstout_hsw_8bit LABEL PROC - 
DB 196,226,125,0,5,79,7,0,0 ; vpshufb 0x74f(%rip),%ymm0,%ymm0 # 1a00 <_sk_xor__hsw_8bit+0x37b> + DB 196,226,125,0,5,209,5,0,0 ; vpshufb 0x5d1(%rip),%ymm0,%ymm0 # 1660 <_sk_xor__hsw_8bit+0x325> DB 197,237,118,210 ; vpcmpeqd %ymm2,%ymm2,%ymm2 DB 197,253,239,194 ; vpxor %ymm2,%ymm0,%ymm0 - DB 196,227,125,57,202,1 ; vextracti128 $0x1,%ymm1,%xmm2 - DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2 - DB 196,226,125,48,217 ; vpmovzxbw %xmm1,%ymm3 - DB 196,227,125,57,196,1 ; vextracti128 $0x1,%ymm0,%xmm4 - DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4 + DB 196,226,125,48,209 ; vpmovzxbw %xmm1,%ymm2 + DB 196,227,125,57,203,1 ; vextracti128 $0x1,%ymm1,%xmm3 + DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 + DB 196,226,125,48,224 ; vpmovzxbw %xmm0,%ymm4 + DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0 DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0 DB 197,253,213,195 ; vpmullw %ymm3,%ymm0,%ymm0 DB 197,221,213,226 ; vpmullw %ymm2,%ymm4,%ymm4 @@ -37414,29 +37310,22 @@ _sk_dstout_hsw_8bit LABEL PROC DB 197,253,253,195 ; vpaddw %ymm3,%ymm0,%ymm0 DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0 DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2 - DB 196,227,125,57,211,1 ; vextracti128 $0x1,%ymm2,%xmm3 - DB 197,249,111,37,223,8,0,0 ; vmovdqa 0x8df(%rip),%xmm4 # 1be0 <_sk_xor__hsw_8bit+0x55b> - DB 196,226,97,0,220 ; vpshufb %xmm4,%xmm3,%xmm3 - DB 196,226,105,0,212 ; vpshufb %xmm4,%xmm2,%xmm2 - DB 197,233,108,211 ; vpunpcklqdq %xmm3,%xmm2,%xmm2 - DB 196,227,125,57,195,1 ; vextracti128 $0x1,%ymm0,%xmm3 - DB 196,226,97,0,220 ; vpshufb %xmm4,%xmm3,%xmm3 - DB 196,226,121,0,196 ; vpshufb %xmm4,%xmm0,%xmm0 - DB 197,249,108,195 ; vpunpcklqdq %xmm3,%xmm0,%xmm0 - DB 196,227,125,56,194,1 ; vinserti128 $0x1,%xmm2,%ymm0,%ymm0 + DB 196,227,109,56,216,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm3 + DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + DB 197,229,103,192 ; vpackuswb %ymm0,%ymm3,%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax PUBLIC _sk_srcover_hsw_8bit 
_sk_srcover_hsw_8bit LABEL PROC - DB 196,226,125,0,21,234,6,0,0 ; vpshufb 0x6ea(%rip),%ymm0,%ymm2 # 1a20 <_sk_xor__hsw_8bit+0x39b> + DB 196,226,125,0,21,146,5,0,0 ; vpshufb 0x592(%rip),%ymm0,%ymm2 # 1680 <_sk_xor__hsw_8bit+0x345> DB 197,229,118,219 ; vpcmpeqd %ymm3,%ymm3,%ymm3 DB 197,237,239,211 ; vpxor %ymm3,%ymm2,%ymm2 - DB 196,227,125,57,203,1 ; vextracti128 $0x1,%ymm1,%xmm3 - DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 - DB 196,226,125,48,225 ; vpmovzxbw %xmm1,%ymm4 - DB 196,227,125,57,213,1 ; vextracti128 $0x1,%ymm2,%xmm5 - DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5 + DB 196,226,125,48,217 ; vpmovzxbw %xmm1,%ymm3 + DB 196,227,125,57,204,1 ; vextracti128 $0x1,%ymm1,%xmm4 + DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4 + DB 196,226,125,48,234 ; vpmovzxbw %xmm2,%ymm5 + DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2 DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2 DB 197,237,213,212 ; vpmullw %ymm4,%ymm2,%ymm2 DB 197,213,213,235 ; vpmullw %ymm3,%ymm5,%ymm5 @@ -37444,30 +37333,23 @@ _sk_srcover_hsw_8bit LABEL PROC DB 197,237,253,212 ; vpaddw %ymm4,%ymm2,%ymm2 DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2 DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3 - DB 196,227,125,57,220,1 ; vextracti128 $0x1,%ymm3,%xmm4 - DB 197,249,111,45,106,8,0,0 ; vmovdqa 0x86a(%rip),%xmm5 # 1bf0 <_sk_xor__hsw_8bit+0x56b> - DB 196,226,89,0,229 ; vpshufb %xmm5,%xmm4,%xmm4 - DB 196,226,97,0,221 ; vpshufb %xmm5,%xmm3,%xmm3 - DB 197,225,108,220 ; vpunpcklqdq %xmm4,%xmm3,%xmm3 - DB 196,227,125,57,212,1 ; vextracti128 $0x1,%ymm2,%xmm4 - DB 196,226,89,0,229 ; vpshufb %xmm5,%xmm4,%xmm4 - DB 196,226,105,0,213 ; vpshufb %xmm5,%xmm2,%xmm2 - DB 197,233,108,212 ; vpunpcklqdq %xmm4,%xmm2,%xmm2 - DB 196,227,109,56,211,1 ; vinserti128 $0x1,%xmm3,%ymm2,%ymm2 + DB 196,227,101,56,226,1 ; vinserti128 $0x1,%xmm2,%ymm3,%ymm4 + DB 196,227,101,70,210,49 ; vperm2i128 $0x31,%ymm2,%ymm3,%ymm2 + DB 197,221,103,210 ; vpackuswb %ymm2,%ymm4,%ymm2 DB 197,237,252,192 ; vpaddb %ymm0,%ymm2,%ymm0 DB 72,173 ; lods 
%ds:(%rsi),%rax DB 255,224 ; jmpq *%rax PUBLIC _sk_dstover_hsw_8bit _sk_dstover_hsw_8bit LABEL PROC - DB 196,226,117,0,21,129,6,0,0 ; vpshufb 0x681(%rip),%ymm1,%ymm2 # 1a40 <_sk_xor__hsw_8bit+0x3bb> + DB 196,226,117,0,21,79,5,0,0 ; vpshufb 0x54f(%rip),%ymm1,%ymm2 # 16a0 <_sk_xor__hsw_8bit+0x365> DB 197,229,118,219 ; vpcmpeqd %ymm3,%ymm3,%ymm3 DB 197,237,239,211 ; vpxor %ymm3,%ymm2,%ymm2 - DB 196,227,125,57,195,1 ; vextracti128 $0x1,%ymm0,%xmm3 - DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 + DB 196,226,125,48,216 ; vpmovzxbw %xmm0,%ymm3 + DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0 DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0 - DB 196,227,125,57,212,1 ; vextracti128 $0x1,%ymm2,%xmm4 - DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4 + DB 196,226,125,48,226 ; vpmovzxbw %xmm2,%ymm4 + DB 196,227,125,57,210,1 ; vextracti128 $0x1,%ymm2,%xmm2 DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2 DB 197,237,213,208 ; vpmullw %ymm0,%ymm2,%ymm2 DB 197,221,213,227 ; vpmullw %ymm3,%ymm4,%ymm4 @@ -37475,115 +37357,82 @@ _sk_dstover_hsw_8bit LABEL PROC DB 197,237,253,192 ; vpaddw %ymm0,%ymm2,%ymm0 DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0 DB 197,237,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm2 - DB 196,227,125,57,211,1 ; vextracti128 $0x1,%ymm2,%xmm3 - DB 197,249,111,37,241,7,0,0 ; vmovdqa 0x7f1(%rip),%xmm4 # 1c00 <_sk_xor__hsw_8bit+0x57b> - DB 196,226,97,0,220 ; vpshufb %xmm4,%xmm3,%xmm3 - DB 196,226,105,0,212 ; vpshufb %xmm4,%xmm2,%xmm2 - DB 197,233,108,211 ; vpunpcklqdq %xmm3,%xmm2,%xmm2 - DB 196,227,125,57,195,1 ; vextracti128 $0x1,%ymm0,%xmm3 - DB 196,226,97,0,220 ; vpshufb %xmm4,%xmm3,%xmm3 - DB 196,226,121,0,196 ; vpshufb %xmm4,%xmm0,%xmm0 - DB 197,249,108,195 ; vpunpcklqdq %xmm3,%xmm0,%xmm0 - DB 196,227,125,56,194,1 ; vinserti128 $0x1,%xmm2,%ymm0,%ymm0 + DB 196,227,109,56,216,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm3 + DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + DB 197,229,103,192 ; vpackuswb %ymm0,%ymm3,%ymm0 DB 197,253,252,193 ; vpaddb 
%ymm1,%ymm0,%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax PUBLIC _sk_modulate_hsw_8bit _sk_modulate_hsw_8bit LABEL PROC - DB 196,227,125,57,194,1 ; vextracti128 $0x1,%ymm0,%xmm2 - DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2 + DB 196,226,125,48,208 ; vpmovzxbw %xmm0,%ymm2 + DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0 DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0 - DB 196,227,125,57,203,1 ; vextracti128 $0x1,%ymm1,%xmm3 - DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 - DB 196,226,125,48,225 ; vpmovzxbw %xmm1,%ymm4 + DB 196,226,125,48,217 ; vpmovzxbw %xmm1,%ymm3 + DB 196,227,125,57,204,1 ; vextracti128 $0x1,%ymm1,%xmm4 + DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4 DB 197,221,213,224 ; vpmullw %ymm0,%ymm4,%ymm4 DB 197,229,213,218 ; vpmullw %ymm2,%ymm3,%ymm3 DB 197,229,253,210 ; vpaddw %ymm2,%ymm3,%ymm2 DB 197,221,253,192 ; vpaddw %ymm0,%ymm4,%ymm0 DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0 DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2 - DB 196,227,125,57,211,1 ; vextracti128 $0x1,%ymm2,%xmm3 - DB 197,249,111,37,137,7,0,0 ; vmovdqa 0x789(%rip),%xmm4 # 1c10 <_sk_xor__hsw_8bit+0x58b> - DB 196,226,97,0,220 ; vpshufb %xmm4,%xmm3,%xmm3 - DB 196,226,105,0,212 ; vpshufb %xmm4,%xmm2,%xmm2 - DB 197,233,108,211 ; vpunpcklqdq %xmm3,%xmm2,%xmm2 - DB 196,227,125,57,195,1 ; vextracti128 $0x1,%ymm0,%xmm3 - DB 196,226,97,0,220 ; vpshufb %xmm4,%xmm3,%xmm3 - DB 196,226,121,0,196 ; vpshufb %xmm4,%xmm0,%xmm0 - DB 197,249,108,195 ; vpunpcklqdq %xmm3,%xmm0,%xmm0 - DB 196,227,125,56,194,1 ; vinserti128 $0x1,%xmm2,%ymm0,%ymm0 + DB 196,227,109,56,216,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm3 + DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + DB 197,229,103,192 ; vpackuswb %ymm0,%ymm3,%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax PUBLIC _sk_multiply_hsw_8bit _sk_multiply_hsw_8bit LABEL PROC - DB 197,125,111,13,165,5,0,0 ; vmovdqa 0x5a5(%rip),%ymm9 # 1a60 <_sk_xor__hsw_8bit+0x3db> - DB 196,194,117,0,209 ; vpshufb %ymm9,%ymm1,%ymm2 
- DB 196,65,45,118,210 ; vpcmpeqd %ymm10,%ymm10,%ymm10 - DB 196,193,109,239,226 ; vpxor %ymm10,%ymm2,%ymm4 - DB 196,227,125,57,194,1 ; vextracti128 $0x1,%ymm0,%xmm2 - DB 196,98,125,48,218 ; vpmovzxbw %xmm2,%ymm11 - DB 196,226,125,48,216 ; vpmovzxbw %xmm0,%ymm3 - DB 196,227,125,57,229,1 ; vextracti128 $0x1,%ymm4,%xmm5 - DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5 - DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4 - DB 197,221,213,227 ; vpmullw %ymm3,%ymm4,%ymm4 - DB 196,193,85,213,235 ; vpmullw %ymm11,%ymm5,%ymm5 - DB 196,193,85,253,235 ; vpaddw %ymm11,%ymm5,%ymm5 - DB 197,221,253,227 ; vpaddw %ymm3,%ymm4,%ymm4 - DB 197,189,113,212,8 ; vpsrlw $0x8,%ymm4,%ymm8 - DB 197,213,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm5 - DB 196,227,125,57,238,1 ; vextracti128 $0x1,%ymm5,%xmm6 - DB 197,249,111,37,12,7,0,0 ; vmovdqa 0x70c(%rip),%xmm4 # 1c20 <_sk_xor__hsw_8bit+0x59b> - DB 196,226,73,0,244 ; vpshufb %xmm4,%xmm6,%xmm6 - DB 196,226,81,0,236 ; vpshufb %xmm4,%xmm5,%xmm5 - DB 197,209,108,238 ; vpunpcklqdq %xmm6,%xmm5,%xmm5 - DB 196,99,125,57,198,1 ; vextracti128 $0x1,%ymm8,%xmm6 - DB 196,226,73,0,244 ; vpshufb %xmm4,%xmm6,%xmm6 - DB 196,226,57,0,252 ; vpshufb %xmm4,%xmm8,%xmm7 - DB 197,193,108,246 ; vpunpcklqdq %xmm6,%xmm7,%xmm6 - DB 196,99,77,56,197,1 ; vinserti128 $0x1,%xmm5,%ymm6,%ymm8 - DB 196,194,125,0,193 ; vpshufb %ymm9,%ymm0,%ymm0 - DB 196,193,125,239,194 ; vpxor %ymm10,%ymm0,%ymm0 - DB 196,227,125,57,206,1 ; vextracti128 $0x1,%ymm1,%xmm6 + DB 197,253,111,37,191,4,0,0 ; vmovdqa 0x4bf(%rip),%ymm4 # 16c0 <_sk_xor__hsw_8bit+0x385> + DB 196,226,117,0,212 ; vpshufb %ymm4,%ymm1,%ymm2 + DB 197,213,118,237 ; vpcmpeqd %ymm5,%ymm5,%ymm5 + DB 197,237,239,245 ; vpxor %ymm5,%ymm2,%ymm6 + DB 196,226,125,48,208 ; vpmovzxbw %xmm0,%ymm2 + DB 196,227,125,57,195,1 ; vextracti128 $0x1,%ymm0,%xmm3 + DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 + DB 196,226,125,48,254 ; vpmovzxbw %xmm6,%ymm7 + DB 196,227,125,57,246,1 ; vextracti128 $0x1,%ymm6,%xmm6 DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6 - DB 
196,226,125,48,249 ; vpmovzxbw %xmm1,%ymm7 - DB 196,227,125,57,197,1 ; vextracti128 $0x1,%ymm0,%xmm5 + DB 197,205,213,243 ; vpmullw %ymm3,%ymm6,%ymm6 + DB 197,197,213,250 ; vpmullw %ymm2,%ymm7,%ymm7 + DB 197,197,253,250 ; vpaddw %ymm2,%ymm7,%ymm7 + DB 197,205,253,243 ; vpaddw %ymm3,%ymm6,%ymm6 + DB 197,205,113,214,8 ; vpsrlw $0x8,%ymm6,%ymm6 + DB 197,197,113,215,8 ; vpsrlw $0x8,%ymm7,%ymm7 + DB 196,99,69,56,198,1 ; vinserti128 $0x1,%xmm6,%ymm7,%ymm8 + DB 196,227,69,70,246,49 ; vperm2i128 $0x31,%ymm6,%ymm7,%ymm6 + DB 197,189,103,246 ; vpackuswb %ymm6,%ymm8,%ymm6 + DB 196,226,125,0,196 ; vpshufb %ymm4,%ymm0,%ymm0 + DB 197,253,239,197 ; vpxor %ymm5,%ymm0,%ymm0 + DB 196,226,125,48,225 ; vpmovzxbw %xmm1,%ymm4 + DB 196,227,125,57,205,1 ; vextracti128 $0x1,%ymm1,%xmm5 DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5 + DB 196,226,125,48,248 ; vpmovzxbw %xmm0,%ymm7 + DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0 DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0 - DB 197,253,213,199 ; vpmullw %ymm7,%ymm0,%ymm0 - DB 197,213,213,238 ; vpmullw %ymm6,%ymm5,%ymm5 - DB 197,213,253,238 ; vpaddw %ymm6,%ymm5,%ymm5 - DB 197,253,253,199 ; vpaddw %ymm7,%ymm0,%ymm0 + DB 197,253,213,197 ; vpmullw %ymm5,%ymm0,%ymm0 + DB 197,197,213,252 ; vpmullw %ymm4,%ymm7,%ymm7 + DB 197,197,253,252 ; vpaddw %ymm4,%ymm7,%ymm7 + DB 197,253,253,197 ; vpaddw %ymm5,%ymm0,%ymm0 DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0 - DB 197,213,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm5 - DB 196,227,125,57,234,1 ; vextracti128 $0x1,%ymm5,%xmm2 - DB 196,226,105,0,212 ; vpshufb %xmm4,%xmm2,%xmm2 - DB 196,226,81,0,236 ; vpshufb %xmm4,%xmm5,%xmm5 - DB 197,209,108,210 ; vpunpcklqdq %xmm2,%xmm5,%xmm2 - DB 196,227,125,57,197,1 ; vextracti128 $0x1,%ymm0,%xmm5 - DB 196,226,81,0,236 ; vpshufb %xmm4,%xmm5,%xmm5 - DB 196,226,121,0,196 ; vpshufb %xmm4,%xmm0,%xmm0 - DB 197,249,108,197 ; vpunpcklqdq %xmm5,%xmm0,%xmm0 - DB 196,227,125,56,194,1 ; vinserti128 $0x1,%xmm2,%ymm0,%ymm0 - DB 197,197,213,211 ; vpmullw %ymm3,%ymm7,%ymm2 - DB 
196,193,77,213,235 ; vpmullw %ymm11,%ymm6,%ymm5 - DB 196,193,85,253,235 ; vpaddw %ymm11,%ymm5,%ymm5 - DB 197,237,253,211 ; vpaddw %ymm3,%ymm2,%ymm2 + DB 197,197,113,215,8 ; vpsrlw $0x8,%ymm7,%ymm7 + DB 196,99,69,56,192,1 ; vinserti128 $0x1,%xmm0,%ymm7,%ymm8 + DB 196,227,69,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm7,%ymm0 + DB 197,189,103,192 ; vpackuswb %ymm0,%ymm8,%ymm0 + DB 197,253,252,198 ; vpaddb %ymm6,%ymm0,%ymm0 + DB 197,213,213,235 ; vpmullw %ymm3,%ymm5,%ymm5 + DB 197,221,213,226 ; vpmullw %ymm2,%ymm4,%ymm4 + DB 197,221,253,210 ; vpaddw %ymm2,%ymm4,%ymm2 + DB 197,213,253,219 ; vpaddw %ymm3,%ymm5,%ymm3 + DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3 DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2 - DB 197,229,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm3 - DB 196,227,125,57,221,1 ; vextracti128 $0x1,%ymm3,%xmm5 - DB 196,226,81,0,236 ; vpshufb %xmm4,%xmm5,%xmm5 - DB 196,226,97,0,220 ; vpshufb %xmm4,%xmm3,%xmm3 - DB 197,225,108,221 ; vpunpcklqdq %xmm5,%xmm3,%xmm3 - DB 196,227,125,57,213,1 ; vextracti128 $0x1,%ymm2,%xmm5 - DB 196,226,81,0,236 ; vpshufb %xmm4,%xmm5,%xmm5 - DB 196,226,105,0,212 ; vpshufb %xmm4,%xmm2,%xmm2 - DB 197,233,108,213 ; vpunpcklqdq %xmm5,%xmm2,%xmm2 - DB 196,227,109,56,211,1 ; vinserti128 $0x1,%xmm3,%ymm2,%ymm2 - DB 197,237,252,192 ; vpaddb %ymm0,%ymm2,%ymm0 - DB 196,193,125,252,192 ; vpaddb %ymm8,%ymm0,%ymm0 + DB 196,227,109,56,227,1 ; vinserti128 $0x1,%xmm3,%ymm2,%ymm4 + DB 196,227,109,70,211,49 ; vperm2i128 $0x31,%ymm3,%ymm2,%ymm2 + DB 197,221,103,210 ; vpackuswb %ymm2,%ymm4,%ymm2 + DB 197,253,252,194 ; vpaddb %ymm2,%ymm0,%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax @@ -37591,92 +37440,72 @@ PUBLIC _sk_screen_hsw_8bit _sk_screen_hsw_8bit LABEL PROC DB 197,237,118,210 ; vpcmpeqd %ymm2,%ymm2,%ymm2 DB 197,253,239,210 ; vpxor %ymm2,%ymm0,%ymm2 - DB 196,227,125,57,211,1 ; vextracti128 $0x1,%ymm2,%xmm3 - DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 + DB 196,226,125,48,218 ; vpmovzxbw %xmm2,%ymm3 + DB 196,227,125,57,210,1 ; vextracti128 
$0x1,%ymm2,%xmm2 DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2 - DB 196,227,125,57,204,1 ; vextracti128 $0x1,%ymm1,%xmm4 - DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4 - DB 196,226,125,48,233 ; vpmovzxbw %xmm1,%ymm5 + DB 196,226,125,48,225 ; vpmovzxbw %xmm1,%ymm4 + DB 196,227,125,57,205,1 ; vextracti128 $0x1,%ymm1,%xmm5 + DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5 DB 197,213,213,234 ; vpmullw %ymm2,%ymm5,%ymm5 DB 197,221,213,227 ; vpmullw %ymm3,%ymm4,%ymm4 DB 197,221,253,219 ; vpaddw %ymm3,%ymm4,%ymm3 DB 197,213,253,210 ; vpaddw %ymm2,%ymm5,%ymm2 DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2 DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3 - DB 196,227,125,57,220,1 ; vextracti128 $0x1,%ymm3,%xmm4 - DB 197,249,111,45,219,5,0,0 ; vmovdqa 0x5db(%rip),%xmm5 # 1c30 <_sk_xor__hsw_8bit+0x5ab> - DB 196,226,89,0,229 ; vpshufb %xmm5,%xmm4,%xmm4 - DB 196,226,97,0,221 ; vpshufb %xmm5,%xmm3,%xmm3 - DB 197,225,108,220 ; vpunpcklqdq %xmm4,%xmm3,%xmm3 - DB 196,227,125,57,212,1 ; vextracti128 $0x1,%ymm2,%xmm4 - DB 196,226,89,0,229 ; vpshufb %xmm5,%xmm4,%xmm4 - DB 196,226,105,0,213 ; vpshufb %xmm5,%xmm2,%xmm2 - DB 197,233,108,212 ; vpunpcklqdq %xmm4,%xmm2,%xmm2 - DB 196,227,109,56,211,1 ; vinserti128 $0x1,%xmm3,%ymm2,%ymm2 + DB 196,227,101,56,226,1 ; vinserti128 $0x1,%xmm2,%ymm3,%ymm4 + DB 196,227,101,70,210,49 ; vperm2i128 $0x31,%ymm2,%ymm3,%ymm2 + DB 197,221,103,210 ; vpackuswb %ymm2,%ymm4,%ymm2 DB 197,237,252,192 ; vpaddb %ymm0,%ymm2,%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax PUBLIC _sk_xor__hsw_8bit _sk_xor__hsw_8bit LABEL PROC - DB 197,253,111,29,243,3,0,0 ; vmovdqa 0x3f3(%rip),%ymm3 # 1a80 <_sk_xor__hsw_8bit+0x3fb> - DB 196,226,117,0,211 ; vpshufb %ymm3,%ymm1,%ymm2 + DB 197,253,111,21,157,3,0,0 ; vmovdqa 0x39d(%rip),%ymm2 # 16e0 <_sk_xor__hsw_8bit+0x3a5> + DB 196,226,117,0,218 ; vpshufb %ymm2,%ymm1,%ymm3 DB 197,221,118,228 ; vpcmpeqd %ymm4,%ymm4,%ymm4 - DB 197,237,239,212 ; vpxor %ymm4,%ymm2,%ymm2 - DB 196,227,125,57,197,1 ; vextracti128 $0x1,%ymm0,%xmm5 
- DB 196,226,125,48,237 ; vpmovzxbw %xmm5,%ymm5 - DB 196,226,125,48,240 ; vpmovzxbw %xmm0,%ymm6 - DB 196,227,125,57,215,1 ; vextracti128 $0x1,%ymm2,%xmm7 - DB 196,226,125,48,255 ; vpmovzxbw %xmm7,%ymm7 - DB 196,226,125,48,210 ; vpmovzxbw %xmm2,%ymm2 - DB 197,237,213,214 ; vpmullw %ymm6,%ymm2,%ymm2 + DB 197,229,239,220 ; vpxor %ymm4,%ymm3,%ymm3 + DB 196,226,125,48,232 ; vpmovzxbw %xmm0,%ymm5 + DB 196,227,125,57,198,1 ; vextracti128 $0x1,%ymm0,%xmm6 + DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6 + DB 196,226,125,48,251 ; vpmovzxbw %xmm3,%ymm7 + DB 196,227,125,57,219,1 ; vextracti128 $0x1,%ymm3,%xmm3 + DB 196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 + DB 197,229,213,222 ; vpmullw %ymm6,%ymm3,%ymm3 DB 197,197,213,253 ; vpmullw %ymm5,%ymm7,%ymm7 DB 197,197,253,237 ; vpaddw %ymm5,%ymm7,%ymm5 - DB 197,237,253,214 ; vpaddw %ymm6,%ymm2,%ymm2 - DB 197,205,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm6 + DB 197,229,253,222 ; vpaddw %ymm6,%ymm3,%ymm3 + DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3 DB 197,213,113,213,8 ; vpsrlw $0x8,%ymm5,%ymm5 - DB 196,227,125,57,239,1 ; vextracti128 $0x1,%ymm5,%xmm7 - DB 197,249,111,21,94,5,0,0 ; vmovdqa 0x55e(%rip),%xmm2 # 1c40 <_sk_xor__hsw_8bit+0x5bb> - DB 196,226,65,0,250 ; vpshufb %xmm2,%xmm7,%xmm7 - DB 196,226,81,0,234 ; vpshufb %xmm2,%xmm5,%xmm5 - DB 197,209,108,239 ; vpunpcklqdq %xmm7,%xmm5,%xmm5 - DB 196,227,125,57,247,1 ; vextracti128 $0x1,%ymm6,%xmm7 - DB 196,226,65,0,250 ; vpshufb %xmm2,%xmm7,%xmm7 - DB 196,226,73,0,242 ; vpshufb %xmm2,%xmm6,%xmm6 - DB 197,201,108,247 ; vpunpcklqdq %xmm7,%xmm6,%xmm6 - DB 196,227,77,56,237,1 ; vinserti128 $0x1,%xmm5,%ymm6,%ymm5 - DB 196,226,125,0,195 ; vpshufb %ymm3,%ymm0,%ymm0 + DB 196,227,85,56,243,1 ; vinserti128 $0x1,%xmm3,%ymm5,%ymm6 + DB 196,227,85,70,219,49 ; vperm2i128 $0x31,%ymm3,%ymm5,%ymm3 + DB 197,205,103,219 ; vpackuswb %ymm3,%ymm6,%ymm3 + DB 196,226,125,0,194 ; vpshufb %ymm2,%ymm0,%ymm0 DB 197,253,239,196 ; vpxor %ymm4,%ymm0,%ymm0 - DB 196,227,125,57,203,1 ; vextracti128 $0x1,%ymm1,%xmm3 - DB 
196,226,125,48,219 ; vpmovzxbw %xmm3,%ymm3 - DB 196,226,125,48,225 ; vpmovzxbw %xmm1,%ymm4 - DB 196,227,125,57,198,1 ; vextracti128 $0x1,%ymm0,%xmm6 - DB 196,226,125,48,246 ; vpmovzxbw %xmm6,%ymm6 + DB 196,226,125,48,209 ; vpmovzxbw %xmm1,%ymm2 + DB 196,227,125,57,204,1 ; vextracti128 $0x1,%ymm1,%xmm4 + DB 196,226,125,48,228 ; vpmovzxbw %xmm4,%ymm4 + DB 196,226,125,48,232 ; vpmovzxbw %xmm0,%ymm5 + DB 196,227,125,57,192,1 ; vextracti128 $0x1,%ymm0,%xmm0 DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0 DB 197,253,213,196 ; vpmullw %ymm4,%ymm0,%ymm0 - DB 197,205,213,243 ; vpmullw %ymm3,%ymm6,%ymm6 - DB 197,205,253,219 ; vpaddw %ymm3,%ymm6,%ymm3 + DB 197,213,213,234 ; vpmullw %ymm2,%ymm5,%ymm5 + DB 197,213,253,210 ; vpaddw %ymm2,%ymm5,%ymm2 DB 197,253,253,196 ; vpaddw %ymm4,%ymm0,%ymm0 DB 197,253,113,208,8 ; vpsrlw $0x8,%ymm0,%ymm0 - DB 197,229,113,211,8 ; vpsrlw $0x8,%ymm3,%ymm3 - DB 196,227,125,57,220,1 ; vextracti128 $0x1,%ymm3,%xmm4 - DB 196,226,89,0,226 ; vpshufb %xmm2,%xmm4,%xmm4 - DB 196,226,97,0,218 ; vpshufb %xmm2,%xmm3,%xmm3 - DB 197,225,108,220 ; vpunpcklqdq %xmm4,%xmm3,%xmm3 - DB 196,227,125,57,196,1 ; vextracti128 $0x1,%ymm0,%xmm4 - DB 196,226,89,0,226 ; vpshufb %xmm2,%xmm4,%xmm4 - DB 196,226,121,0,194 ; vpshufb %xmm2,%xmm0,%xmm0 - DB 197,249,108,196 ; vpunpcklqdq %xmm4,%xmm0,%xmm0 - DB 196,227,125,56,195,1 ; vinserti128 $0x1,%xmm3,%ymm0,%ymm0 - DB 197,213,252,192 ; vpaddb %ymm0,%ymm5,%ymm0 + DB 197,237,113,210,8 ; vpsrlw $0x8,%ymm2,%ymm2 + DB 196,227,109,56,224,1 ; vinserti128 $0x1,%xmm0,%ymm2,%ymm4 + DB 196,227,109,70,192,49 ; vperm2i128 $0x31,%ymm0,%ymm2,%ymm0 + DB 197,221,103,192 ; vpackuswb %ymm0,%ymm4,%ymm0 + DB 197,253,252,195 ; vpaddb %ymm3,%ymm0,%ymm0 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax ALIGN 4 DB 0,0 ; add %al,(%rax) - DB 127,67 ; jg 17cb <_sk_xor__hsw_8bit+0x146> + DB 127,67 ; jg 143f <_sk_xor__hsw_8bit+0x104> DB 0,0 ; add %al,(%rax) - DB 127,67 ; jg 17cf <_sk_xor__hsw_8bit+0x14a> + DB 127,67 ; jg 1443 <_sk_xor__hsw_8bit+0x108> 
DB 1,1 ; add %eax,(%rcx) DB 1,0 ; add %eax,(%rax) DB 0,0 ; add %al,(%rax) @@ -37686,9 +37515,9 @@ ALIGN 4 DB 0,0 ; add %al,(%rax) DB 0,255 ; add %bh,%bh DB 0,0 ; add %al,(%rax) - DB 127,67 ; jg 17e3 <_sk_xor__hsw_8bit+0x15e> + DB 127,67 ; jg 1457 <_sk_xor__hsw_8bit+0x11c> DB 0,0 ; add %al,(%rax) - DB 127,67 ; jg 17e7 <_sk_xor__hsw_8bit+0x162> + DB 127,67 ; jg 145b <_sk_xor__hsw_8bit+0x120> ALIGN 32 DB 0,0 ; add %al,(%rax) @@ -38126,14 +37955,6 @@ ALIGN 32 DB 15 ; .byte 0xf ALIGN 16 - DB 0,2 ; add %al,(%rdx) - DB 4,6 ; add $0x6,%al - DB 8,10 ; or %cl,(%rdx) - DB 12,14 ; or $0xe,%al - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) DB 255,0 ; incl (%rax) DB 255,0 ; incl (%rax) DB 255,0 ; incl (%rax) @@ -38190,22 +38011,6 @@ ALIGN 16 DB 255,0 ; incl (%rax) DB 255,0 ; incl (%rax) DB 255,0 ; incl (%rax) - DB 0,2 ; add %al,(%rdx) - DB 4,6 ; add $0x6,%al - DB 8,10 ; or %cl,(%rdx) - DB 12,14 ; or $0xe,%al - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,2 ; add %al,(%rdx) - DB 4,6 ; add $0x6,%al - DB 8,10 ; or %cl,(%rdx) - DB 12,14 ; or $0xe,%al - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) DB 255,0 ; incl (%rax) DB 255,0 ; incl (%rax) DB 255,0 ; incl (%rax) @@ -38214,22 +38019,6 @@ ALIGN 16 DB 255,0 ; incl (%rax) DB 255,0 ; incl (%rax) DB 255,0 ; incl (%rax) - DB 0,2 ; add %al,(%rdx) - DB 4,6 ; add $0x6,%al - DB 8,10 ; or %cl,(%rdx) - DB 12,14 ; or $0xe,%al - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,2 ; add %al,(%rdx) - DB 4,6 ; add $0x6,%al - DB 8,10 ; or %cl,(%rdx) - DB 12,14 ; or $0xe,%al - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) DB 255,0 ; incl (%rax) DB 255,0 ; incl (%rax) DB 255,0 ; incl (%rax) @@ -38238,110 +38027,6 @@ ALIGN 16 DB 255,0 ; incl (%rax) DB 255,0 ; incl 
(%rax) DB 255,0 ; incl (%rax) - DB 0,2 ; add %al,(%rdx) - DB 4,6 ; add $0x6,%al - DB 8,10 ; or %cl,(%rdx) - DB 12,14 ; or $0xe,%al - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,2 ; add %al,(%rdx) - DB 4,6 ; add $0x6,%al - DB 8,10 ; or %cl,(%rdx) - DB 12,14 ; or $0xe,%al - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,2 ; add %al,(%rdx) - DB 4,6 ; add $0x6,%al - DB 8,10 ; or %cl,(%rdx) - DB 12,14 ; or $0xe,%al - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,2 ; add %al,(%rdx) - DB 4,6 ; add $0x6,%al - DB 8,10 ; or %cl,(%rdx) - DB 12,14 ; or $0xe,%al - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,2 ; add %al,(%rdx) - DB 4,6 ; add $0x6,%al - DB 8,10 ; or %cl,(%rdx) - DB 12,14 ; or $0xe,%al - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,2 ; add %al,(%rdx) - DB 4,6 ; add $0x6,%al - DB 8,10 ; or %cl,(%rdx) - DB 12,14 ; or $0xe,%al - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,2 ; add %al,(%rdx) - DB 4,6 ; add $0x6,%al - DB 8,10 ; or %cl,(%rdx) - DB 12,14 ; or $0xe,%al - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,2 ; add %al,(%rdx) - DB 4,6 ; add $0x6,%al - DB 8,10 ; or %cl,(%rdx) - DB 12,14 ; or $0xe,%al - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,2 ; add %al,(%rdx) - DB 4,6 ; add $0x6,%al - DB 8,10 ; or %cl,(%rdx) - DB 12,14 ; or $0xe,%al - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,2 ; add %al,(%rdx) - DB 4,6 ; add $0x6,%al - DB 8,10 ; or %cl,(%rdx) - DB 12,14 ; or $0xe,%al - DB 0,0 ; add 
%al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,2 ; add %al,(%rdx) - DB 4,6 ; add $0x6,%al - DB 8,10 ; or %cl,(%rdx) - DB 12,14 ; or $0xe,%al - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,2 ; add %al,(%rdx) - DB 4,6 ; add $0x6,%al - DB 8,10 ; or %cl,(%rdx) - DB 12,14 ; or $0xe,%al - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,2 ; add %al,(%rdx) - DB 4,6 ; add $0x6,%al - DB 8,10 ; or %cl,(%rdx) - DB 12,14 ; or $0xe,%al - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) - DB 0,0 ; add %al,(%rax) ALIGN 8 DB 0,0 ; add %al,(%rax) diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp index 4d314979a8..49f8358fdc 100644 --- a/src/jumper/SkJumper_stages_8bit.cpp +++ b/src/jumper/SkJumper_stages_8bit.cpp @@ -8,6 +8,10 @@ #include "SkJumper.h" #include "SkJumper_misc.h" +#if defined(__SSE2__) + #include <immintrin.h> +#endif + // We're going to try going even lower precision than _lowp.cpp, // 8-bit per channel, and while we're at it keep our pixels interlaced. // This is the natural format for kN32_SkColorType buffers, and we hope @@ -51,13 +55,28 @@ union V { }; static const size_t kStride = sizeof(V) / sizeof(uint32_t); +// Usually __builtin_convertvector() is pretty good, but sometimes we can do better. 
+SI U8x4 pack(U16x4 v) { +#if defined(__AVX2__) + static_assert(sizeof(v) == 64, ""); + auto lo = unaligned_load<__m256i>((char*)&v + 0), + hi = unaligned_load<__m256i>((char*)&v + 32); + + auto _02 = _mm256_permute2x128_si256(lo,hi, 0x20), + _13 = _mm256_permute2x128_si256(lo,hi, 0x31); + return _mm256_packus_epi16(_02, _13); +#else + return __builtin_convertvector(v, U8x4); +#endif +} + SI V operator+(V x, V y) { return x.u8x4 + y.u8x4; } SI V operator-(V x, V y) { return x.u8x4 - y.u8x4; } SI V operator*(V x, V y) { // (x*y + x)/256 is a very good approximation of (x*y + 127)/255. U16x4 X = __builtin_convertvector(x.u8x4, U16x4), Y = __builtin_convertvector(y.u8x4, U16x4); - return __builtin_convertvector((X*Y + X)>>8, U8x4); + return pack((X*Y + X)>>8); } SI V inv(V v) { return 0xff - v; } @@ -162,8 +181,6 @@ SI void store(T* dst, V v, size_t tail) { } #if 1 && defined(__AVX2__) - #include <immintrin.h> - SI U32 mask(size_t tail) { // We go a little out of our way to avoid needing large constant values here. |