path: root/src/jumper
author     Mike Klein <mtklein@chromium.org>                  2017-06-19 14:37:10 -0700
committer  Skia Commit-Bot <skia-commit-bot@chromium.org>     2017-06-27 13:31:05 +0000
commit     8c3d5156c7ab2bf723c307043841815d670895c5 (patch)
tree       eade40cfacdb4a3a276bbec8342e9f8119e8fe74 /src/jumper
parent     7f7b902d51a0abb5216ebb3593e890f3c2b51547 (diff)
add _hsw lowp backend
CQ_INCLUDE_TRYBOTS=skia.primary:Build-Ubuntu-Clang-x86_64-Debug-MSAN
Change-Id: Id53279c17589b3434629bb644358ee238af8649f
Reviewed-on: https://skia-review.googlesource.com/20269
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
Reviewed-by: Mike Reed <reed@google.com>
Diffstat (limited to 'src/jumper')
-rw-r--r--  src/jumper/SkJumper.cpp               106
-rw-r--r--  src/jumper/SkJumper_generated.S      1783
-rw-r--r--  src/jumper/SkJumper_generated_win.S  1762
-rw-r--r--  src/jumper/SkJumper_stages_lowp.cpp    97
-rwxr-xr-x  src/jumper/build_stages.py             10
5 files changed, 3642 insertions, 116 deletions
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 459ca6233c..8c9eb73325 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -31,8 +31,7 @@ static const int kNumStages = SK_RASTER_PIPELINE_STAGES(M);
#undef M
#ifndef SK_DISABLE_SSSE3_RUNTIME_CHECK_FOR_LOWP_STAGES
-#if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
- #if 0
+ #if 0 && !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
#include <atomic>
#define M(st) #st,
@@ -57,7 +56,6 @@ static const int kNumStages = SK_RASTER_PIPELINE_STAGES(M);
static void log_missing(SkRasterPipeline::StockStage) {}
#endif
#endif
-#endif
// We can't express the real types of most stage functions portably, so we use a stand-in.
// We'll only ever call start_pipeline(), which then chains into the rest for us.
@@ -128,12 +126,14 @@ extern "C" {
ASM(start_pipeline,avx ),
ASM(start_pipeline,sse41 ),
ASM(start_pipeline,sse2 ),
+ ASM(start_pipeline,hsw_lowp ),
ASM(start_pipeline,ssse3_lowp);
StageFn ASM(just_return,hsw),
ASM(just_return,avx),
ASM(just_return,sse41),
ASM(just_return,sse2),
+ ASM(just_return,hsw_lowp ),
ASM(just_return,ssse3_lowp);
#define M(st) StageFn ASM(st,hsw);
@@ -149,6 +149,9 @@ extern "C" {
SK_RASTER_PIPELINE_STAGES(M)
#undef M
+ #define M(st) StageFn ASM(st,hsw_lowp);
+ LOWP_STAGES(M)
+ #undef M
#define M(st) StageFn ASM(st,ssse3_lowp);
LOWP_STAGES(M)
#undef M
@@ -162,6 +165,24 @@ extern "C" {
#undef M
}
+#if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
+ template <SkRasterPipeline::StockStage st>
+ static constexpr StageFn* hsw_lowp() { return nullptr; }
+
+ template <SkRasterPipeline::StockStage st>
+ static constexpr StageFn* ssse3_lowp() { return nullptr; }
+
+ #define M(st) \
+ template <> constexpr StageFn* hsw_lowp<SkRasterPipeline::st>() { \
+ return ASM(st,hsw_lowp); \
+ } \
+ template <> constexpr StageFn* ssse3_lowp<SkRasterPipeline::st>() { \
+ return ASM(st,ssse3_lowp); \
+ }
+ LOWP_STAGES(M)
+ #undef M
+#endif
+
// Engines comprise everything we need to run SkRasterPipelines.
struct SkJumper_Engine {
StageFn* stages[kNumStages];
@@ -239,41 +260,70 @@ static SkJumper_Engine choose_engine() {
return kPortable;
}
-StartPipelineFn* SkRasterPipeline::build_pipeline(void** ip) const {
#ifndef SK_DISABLE_SSSE3_RUNTIME_CHECK_FOR_LOWP_STAGES
-#if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
- if (SkCpu::Supports(SkCpu::SSSE3)) {
- void** reset_point = ip;
-
- *--ip = (void*)ASM(just_return,ssse3_lowp);
- for (const StageList* st = fStages; st; st = st->prev) {
- StageFn* fn = nullptr;
- switch (st->stage) {
- #define M(st) case SkRasterPipeline::st: fn = ASM(st, ssse3_lowp); break;
- LOWP_STAGES(M)
+ static const SkJumper_Engine kNone = {
+ #define M(stage) nullptr,
+ { SK_RASTER_PIPELINE_STAGES(M) },
+ #undef M
+ nullptr,
+ nullptr,
+ };
+ static SkJumper_Engine gLowp = kNone;
+ static SkOnce gChooseLowpOnce;
+
+ static SkJumper_Engine choose_lowp() {
+ #if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
+ if (1 && SkCpu::Supports(SkCpu::HSW)) {
+ return {
+ #define M(st) hsw_lowp<SkRasterPipeline::st>(),
+ { SK_RASTER_PIPELINE_STAGES(M) },
+ ASM(start_pipeline,hsw_lowp),
+ ASM(just_return,hsw_lowp)
#undef M
- case SkRasterPipeline::clamp_0: continue; // clamp_0 is a no-op in lowp.
- default:
- log_missing(st->stage);
- ip = reset_point;
- }
- if (ip == reset_point) {
- break;
- }
+ };
+ }
+ if (1 && SkCpu::Supports(SkCpu::SSSE3)) {
+ return {
+ #define M(st) ssse3_lowp<SkRasterPipeline::st>(),
+ { SK_RASTER_PIPELINE_STAGES(M) },
+ ASM(start_pipeline,ssse3_lowp),
+ ASM(just_return,ssse3_lowp)
+ #undef M
+ };
+ }
+ #endif
+ return kNone;
+ }
+#endif
+
+StartPipelineFn* SkRasterPipeline::build_pipeline(void** ip) const {
+#ifndef SK_DISABLE_SSSE3_RUNTIME_CHECK_FOR_LOWP_STAGES
+ gChooseLowpOnce([]{ gLowp = choose_lowp(); });
+
+ // First try to build a lowp pipeline. If that fails, fall back to normal float gEngine.
+ void** reset_point = ip;
+ *--ip = (void*)gLowp.just_return;
+ for (const StageList* st = fStages; st; st = st->prev) {
+ if (st->stage == SkRasterPipeline::clamp_0) {
+ continue; // No-op in lowp.
+ }
+ if (StageFn* fn = gLowp.stages[st->stage]) {
if (st->ctx) {
*--ip = st->ctx;
}
*--ip = (void*)fn;
- }
-
- if (ip != reset_point) {
- return ASM(start_pipeline,ssse3_lowp);
+ } else {
+ log_missing(st->stage);
+ ip = reset_point;
+ break;
}
}
+ if (ip != reset_point) {
+ return gLowp.start_pipeline;
+ }
#endif
-#endif
- gChooseEngineOnce([]{ gEngine = choose_engine(); });
+ gChooseEngineOnce([]{ gEngine = choose_engine(); });
// We're building the pipeline backwards, so we start with the final stage just_return.
*--ip = (void*)gEngine.just_return;
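
The SkJumper.cpp hunk above comes down to two ideas: choose_lowp() picks a table of lowp stage function pointers once, based on runtime CPU features (HSW first, then SSSE3), leaving nullptr for any stage without a lowp implementation; build_pipeline() then tries to assemble the program from that table and falls back to the ordinary float engine if a stage is missing. The real code writes the program backwards into ip and terminates it with just_return; the sketch below flattens that into a forward list for readability. It is a minimal illustration with hypothetical names (Stage, Engine, has_avx2), not Skia's actual API.

```cpp
// Minimal sketch of the engine-table / lowp-fallback pattern shown in the
// diff above. All names here are hypothetical stand-ins.
#include <cstdio>
#include <vector>

enum Stage { kLoad, kPremul, kStore, kNumStages };
using StageFn = void(*)();

struct Engine {
    StageFn stages[kNumStages];   // nullptr means "no lowp implementation"
    const char* name;
};

static void load_lowp()   { std::puts("load (lowp)"); }
static void premul_lowp() { std::puts("premul (lowp)"); }
static void load_f32()    { std::puts("load (float)"); }
static void premul_f32()  { std::puts("premul (float)"); }
static void store_f32()   { std::puts("store (float)"); }

static bool has_avx2() { return true; }   // stand-in for SkCpu::Supports(SkCpu::HSW)

// Chosen once at runtime (the real code caches this behind SkOnce).
static Engine choose_lowp() {
    if (has_avx2()) {
        // kStore has no lowp stage in this sketch, so its slot stays nullptr.
        return { { load_lowp, premul_lowp, nullptr }, "hsw_lowp" };
    }
    return { { nullptr, nullptr, nullptr }, "none" };
}

static const Engine kFloat = { { load_f32, premul_f32, store_f32 }, "float" };

// Try the lowp engine first; if any requested stage is missing, fall back
// to the float engine (analogous to log_missing() + resetting ip above).
static std::vector<StageFn> build_pipeline(const std::vector<Stage>& wanted) {
    Engine lowp = choose_lowp();
    std::vector<StageFn> program;
    bool ok = true;
    for (Stage st : wanted) {
        if (StageFn fn = lowp.stages[st]) {
            program.push_back(fn);
        } else {
            ok = false;
            break;
        }
    }
    if (!ok) {
        program.clear();
        for (Stage st : wanted) {
            program.push_back(kFloat.stages[st]);
        }
    }
    return program;
}

int main() {
    for (StageFn fn : build_pipeline({kLoad, kPremul, kStore})) {
        fn();   // runs the float engine here, since kStore has no lowp stage
    }
}
```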
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index ed8d74ada9..b1ec96fff1 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -37386,6 +37386,1733 @@ BALIGN4
.byte 0,63 // add %bh,(%rdi)
BALIGN32
+HIDDEN _sk_start_pipeline_hsw_lowp
+.globl _sk_start_pipeline_hsw_lowp
+FUNCTION(_sk_start_pipeline_hsw_lowp)
+_sk_start_pipeline_hsw_lowp:
+ .byte 85 // push %rbp
+ .byte 72,137,229 // mov %rsp,%rbp
+ .byte 65,87 // push %r15
+ .byte 65,86 // push %r14
+ .byte 65,85 // push %r13
+ .byte 65,84 // push %r12
+ .byte 83 // push %rbx
+ .byte 80 // push %rax
+ .byte 76,137,195 // mov %r8,%rbx
+ .byte 73,137,215 // mov %rdx,%r15
+ .byte 73,137,244 // mov %rsi,%r12
+ .byte 73,137,254 // mov %rdi,%r14
+ .byte 72,137,206 // mov %rcx,%rsi
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 73,137,197 // mov %rax,%r13
+ .byte 73,141,78,16 // lea 0x10(%r14),%rcx
+ .byte 76,57,249 // cmp %r15,%rcx
+ .byte 118,5 // jbe 30 <_sk_start_pipeline_hsw_lowp+0x30>
+ .byte 76,137,242 // mov %r14,%rdx
+ .byte 235,80 // jmp 80 <_sk_start_pipeline_hsw_lowp+0x80>
+ .byte 76,137,125,208 // mov %r15,-0x30(%rbp)
+ .byte 65,184,0,0,0,0 // mov $0x0,%r8d
+ .byte 197,252,87,192 // vxorps %ymm0,%ymm0,%ymm0
+ .byte 197,244,87,201 // vxorps %ymm1,%ymm1,%ymm1
+ .byte 197,236,87,210 // vxorps %ymm2,%ymm2,%ymm2
+ .byte 197,228,87,219 // vxorps %ymm3,%ymm3,%ymm3
+ .byte 197,220,87,228 // vxorps %ymm4,%ymm4,%ymm4
+ .byte 197,212,87,237 // vxorps %ymm5,%ymm5,%ymm5
+ .byte 197,204,87,246 // vxorps %ymm6,%ymm6,%ymm6
+ .byte 197,196,87,255 // vxorps %ymm7,%ymm7,%ymm7
+ .byte 72,137,223 // mov %rbx,%rdi
+ .byte 73,137,247 // mov %rsi,%r15
+ .byte 76,137,242 // mov %r14,%rdx
+ .byte 76,137,225 // mov %r12,%rcx
+ .byte 65,255,213 // callq *%r13
+ .byte 76,137,254 // mov %r15,%rsi
+ .byte 76,139,125,208 // mov -0x30(%rbp),%r15
+ .byte 73,141,86,16 // lea 0x10(%r14),%rdx
+ .byte 73,131,198,32 // add $0x20,%r14
+ .byte 77,57,254 // cmp %r15,%r14
+ .byte 73,137,214 // mov %rdx,%r14
+ .byte 118,180 // jbe 34 <_sk_start_pipeline_hsw_lowp+0x34>
+ .byte 77,137,248 // mov %r15,%r8
+ .byte 73,41,208 // sub %rdx,%r8
+ .byte 116,41 // je b1 <_sk_start_pipeline_hsw_lowp+0xb1>
+ .byte 197,252,87,192 // vxorps %ymm0,%ymm0,%ymm0
+ .byte 197,244,87,201 // vxorps %ymm1,%ymm1,%ymm1
+ .byte 197,236,87,210 // vxorps %ymm2,%ymm2,%ymm2
+ .byte 197,228,87,219 // vxorps %ymm3,%ymm3,%ymm3
+ .byte 197,220,87,228 // vxorps %ymm4,%ymm4,%ymm4
+ .byte 197,212,87,237 // vxorps %ymm5,%ymm5,%ymm5
+ .byte 197,204,87,246 // vxorps %ymm6,%ymm6,%ymm6
+ .byte 197,196,87,255 // vxorps %ymm7,%ymm7,%ymm7
+ .byte 72,137,223 // mov %rbx,%rdi
+ .byte 76,137,225 // mov %r12,%rcx
+ .byte 65,255,213 // callq *%r13
+ .byte 76,137,248 // mov %r15,%rax
+ .byte 72,131,196,8 // add $0x8,%rsp
+ .byte 91 // pop %rbx
+ .byte 65,92 // pop %r12
+ .byte 65,93 // pop %r13
+ .byte 65,94 // pop %r14
+ .byte 65,95 // pop %r15
+ .byte 93 // pop %rbp
+ .byte 197,248,119 // vzeroupper
+ .byte 195 // retq
+
+HIDDEN _sk_just_return_hsw_lowp
+.globl _sk_just_return_hsw_lowp
+FUNCTION(_sk_just_return_hsw_lowp)
+_sk_just_return_hsw_lowp:
+ .byte 195 // retq
+
+HIDDEN _sk_constant_color_hsw_lowp
+.globl _sk_constant_color_hsw_lowp
+FUNCTION(_sk_constant_color_hsw_lowp)
+_sk_constant_color_hsw_lowp:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 196,226,121,24,5,130,21,0,0 // vbroadcastss 0x1582(%rip),%xmm0 # 1654 <_sk_xor__hsw_lowp+0x78>
+ .byte 197,248,88,24 // vaddps (%rax),%xmm0,%xmm3
+ .byte 196,226,125,121,195 // vpbroadcastw %xmm3,%ymm0
+ .byte 197,251,112,203,234 // vpshuflw $0xea,%xmm3,%xmm1
+ .byte 196,226,125,88,201 // vpbroadcastd %xmm1,%ymm1
+ .byte 196,227,121,4,211,230 // vpermilps $0xe6,%xmm3,%xmm2
+ .byte 197,251,112,210,224 // vpshuflw $0xe0,%xmm2,%xmm2
+ .byte 196,226,125,88,210 // vpbroadcastd %xmm2,%ymm2
+ .byte 196,227,121,4,219,236 // vpermilps $0xec,%xmm3,%xmm3
+ .byte 197,251,112,219,234 // vpshuflw $0xea,%xmm3,%xmm3
+ .byte 196,226,125,88,219 // vpbroadcastd %xmm3,%ymm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_set_rgb_hsw_lowp
+.globl _sk_set_rgb_hsw_lowp
+FUNCTION(_sk_set_rgb_hsw_lowp)
+_sk_set_rgb_hsw_lowp:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 197,250,16,21,69,21,0,0 // vmovss 0x1545(%rip),%xmm2 # 1658 <_sk_xor__hsw_lowp+0x7c>
+ .byte 197,234,88,0 // vaddss (%rax),%xmm2,%xmm0
+ .byte 196,193,121,126,193 // vmovd %xmm0,%r9d
+ .byte 196,193,121,110,193 // vmovd %r9d,%xmm0
+ .byte 196,226,125,121,192 // vpbroadcastw %xmm0,%ymm0
+ .byte 197,234,88,72,4 // vaddss 0x4(%rax),%xmm2,%xmm1
+ .byte 196,193,121,126,201 // vmovd %xmm1,%r9d
+ .byte 196,193,121,110,201 // vmovd %r9d,%xmm1
+ .byte 196,226,125,121,201 // vpbroadcastw %xmm1,%ymm1
+ .byte 197,234,88,80,8 // vaddss 0x8(%rax),%xmm2,%xmm2
+ .byte 197,249,126,208 // vmovd %xmm2,%eax
+ .byte 197,249,110,208 // vmovd %eax,%xmm2
+ .byte 196,226,125,121,210 // vpbroadcastw %xmm2,%ymm2
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_premul_hsw_lowp
+.globl _sk_premul_hsw_lowp
+FUNCTION(_sk_premul_hsw_lowp)
+_sk_premul_hsw_lowp:
+ .byte 196,226,125,11,195 // vpmulhrsw %ymm3,%ymm0,%ymm0
+ .byte 196,226,125,29,192 // vpabsw %ymm0,%ymm0
+ .byte 196,226,117,11,203 // vpmulhrsw %ymm3,%ymm1,%ymm1
+ .byte 196,226,125,29,201 // vpabsw %ymm1,%ymm1
+ .byte 196,226,109,11,211 // vpmulhrsw %ymm3,%ymm2,%ymm2
+ .byte 196,226,125,29,210 // vpabsw %ymm2,%ymm2
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_load_8888_hsw_lowp
+.globl _sk_load_8888_hsw_lowp
+FUNCTION(_sk_load_8888_hsw_lowp)
+_sk_load_8888_hsw_lowp:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 76,139,24 // mov (%rax),%r11
+ .byte 77,133,192 // test %r8,%r8
+ .byte 15,133,210,0,0,0 // jne 252 <_sk_load_8888_hsw_lowp+0xe0>
+ .byte 196,193,126,111,92,147,32 // vmovdqu 0x20(%r11,%rdx,4),%ymm3
+ .byte 196,65,126,111,4,147 // vmovdqu (%r11,%rdx,4),%ymm8
+ .byte 197,253,111,5,235,20,0,0 // vmovdqa 0x14eb(%rip),%ymm0 # 1680 <_sk_xor__hsw_lowp+0xa4>
+ .byte 196,226,61,0,200 // vpshufb %ymm0,%ymm8,%ymm1
+ .byte 196,227,253,0,201,232 // vpermq $0xe8,%ymm1,%ymm1
+ .byte 196,226,101,0,192 // vpshufb %ymm0,%ymm3,%ymm0
+ .byte 196,227,253,0,192,232 // vpermq $0xe8,%ymm0,%ymm0
+ .byte 196,227,117,56,192,1 // vinserti128 $0x1,%xmm0,%ymm1,%ymm0
+ .byte 197,253,113,240,8 // vpsllw $0x8,%ymm0,%ymm0
+ .byte 196,98,125,121,13,225,20,0,0 // vpbroadcastw 0x14e1(%rip),%ymm9 # 16a0 <_sk_xor__hsw_lowp+0xc4>
+ .byte 196,193,125,228,193 // vpmulhuw %ymm9,%ymm0,%ymm0
+ .byte 197,253,111,13,244,20,0,0 // vmovdqa 0x14f4(%rip),%ymm1 # 16c0 <_sk_xor__hsw_lowp+0xe4>
+ .byte 196,226,61,0,209 // vpshufb %ymm1,%ymm8,%ymm2
+ .byte 196,227,253,0,210,232 // vpermq $0xe8,%ymm2,%ymm2
+ .byte 196,226,101,0,201 // vpshufb %ymm1,%ymm3,%ymm1
+ .byte 196,227,253,0,201,232 // vpermq $0xe8,%ymm1,%ymm1
+ .byte 196,227,109,56,201,1 // vinserti128 $0x1,%xmm1,%ymm2,%ymm1
+ .byte 197,245,113,241,8 // vpsllw $0x8,%ymm1,%ymm1
+ .byte 196,193,117,228,201 // vpmulhuw %ymm9,%ymm1,%ymm1
+ .byte 197,253,111,21,230,20,0,0 // vmovdqa 0x14e6(%rip),%ymm2 # 16e0 <_sk_xor__hsw_lowp+0x104>
+ .byte 196,98,61,0,210 // vpshufb %ymm2,%ymm8,%ymm10
+ .byte 196,67,253,0,210,232 // vpermq $0xe8,%ymm10,%ymm10
+ .byte 196,226,101,0,210 // vpshufb %ymm2,%ymm3,%ymm2
+ .byte 196,227,253,0,210,232 // vpermq $0xe8,%ymm2,%ymm2
+ .byte 196,227,45,56,210,1 // vinserti128 $0x1,%xmm2,%ymm10,%ymm2
+ .byte 197,237,113,242,8 // vpsllw $0x8,%ymm2,%ymm2
+ .byte 196,193,109,228,209 // vpmulhuw %ymm9,%ymm2,%ymm2
+ .byte 197,125,111,21,216,20,0,0 // vmovdqa 0x14d8(%rip),%ymm10 # 1700 <_sk_xor__hsw_lowp+0x124>
+ .byte 196,66,61,0,194 // vpshufb %ymm10,%ymm8,%ymm8
+ .byte 196,67,253,0,192,232 // vpermq $0xe8,%ymm8,%ymm8
+ .byte 196,194,101,0,218 // vpshufb %ymm10,%ymm3,%ymm3
+ .byte 196,227,253,0,219,232 // vpermq $0xe8,%ymm3,%ymm3
+ .byte 196,227,61,56,219,1 // vinserti128 $0x1,%xmm3,%ymm8,%ymm3
+ .byte 197,229,113,243,8 // vpsllw $0x8,%ymm3,%ymm3
+ .byte 196,193,101,228,217 // vpmulhuw %ymm9,%ymm3,%ymm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 69,137,193 // mov %r8d,%r9d
+ .byte 65,128,225,15 // and $0xf,%r9b
+ .byte 197,229,239,219 // vpxor %ymm3,%ymm3,%ymm3
+ .byte 196,65,61,239,192 // vpxor %ymm8,%ymm8,%ymm8
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 15,135,30,255,255,255 // ja 18d <_sk_load_8888_hsw_lowp+0x1b>
+ .byte 69,15,182,201 // movzbl %r9b,%r9d
+ .byte 76,141,21,30,1,0,0 // lea 0x11e(%rip),%r10 # 398 <_sk_load_8888_hsw_lowp+0x226>
+ .byte 75,99,4,138 // movslq (%r10,%r9,4),%rax
+ .byte 76,1,208 // add %r10,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 196,65,121,110,4,147 // vmovd (%r11,%rdx,4),%xmm8
+ .byte 233,255,254,255,255 // jmpq 18d <_sk_load_8888_hsw_lowp+0x1b>
+ .byte 196,193,121,110,68,147,8 // vmovd 0x8(%r11,%rdx,4),%xmm0
+ .byte 196,226,121,89,192 // vpbroadcastq %xmm0,%xmm0
+ .byte 197,229,239,219 // vpxor %ymm3,%ymm3,%ymm3
+ .byte 196,99,101,2,192,4 // vpblendd $0x4,%ymm0,%ymm3,%ymm8
+ .byte 196,194,121,53,4,147 // vpmovzxdq (%r11,%rdx,4),%xmm0
+ .byte 197,249,112,192,232 // vpshufd $0xe8,%xmm0,%xmm0
+ .byte 196,99,61,2,192,3 // vpblendd $0x3,%ymm0,%ymm8,%ymm8
+ .byte 233,211,254,255,255 // jmpq 18d <_sk_load_8888_hsw_lowp+0x1b>
+ .byte 196,193,121,110,68,147,24 // vmovd 0x18(%r11,%rdx,4),%xmm0
+ .byte 196,226,125,89,192 // vpbroadcastq %xmm0,%ymm0
+ .byte 197,229,239,219 // vpxor %ymm3,%ymm3,%ymm3
+ .byte 196,99,101,2,192,64 // vpblendd $0x40,%ymm0,%ymm3,%ymm8
+ .byte 196,99,125,57,192,1 // vextracti128 $0x1,%ymm8,%xmm0
+ .byte 196,195,121,34,68,147,20,1 // vpinsrd $0x1,0x14(%r11,%rdx,4),%xmm0,%xmm0
+ .byte 196,99,61,56,192,1 // vinserti128 $0x1,%xmm0,%ymm8,%ymm8
+ .byte 196,99,125,57,192,1 // vextracti128 $0x1,%ymm8,%xmm0
+ .byte 196,195,121,34,68,147,16,0 // vpinsrd $0x0,0x10(%r11,%rdx,4),%xmm0,%xmm0
+ .byte 196,99,61,56,192,1 // vinserti128 $0x1,%xmm0,%ymm8,%ymm8
+ .byte 196,193,122,111,4,147 // vmovdqu (%r11,%rdx,4),%xmm0
+ .byte 196,67,125,2,192,240 // vpblendd $0xf0,%ymm8,%ymm0,%ymm8
+ .byte 233,132,254,255,255 // jmpq 18d <_sk_load_8888_hsw_lowp+0x1b>
+ .byte 196,193,121,110,68,147,40 // vmovd 0x28(%r11,%rdx,4),%xmm0
+ .byte 196,226,121,89,192 // vpbroadcastq %xmm0,%xmm0
+ .byte 197,245,239,201 // vpxor %ymm1,%ymm1,%ymm1
+ .byte 196,227,117,2,216,4 // vpblendd $0x4,%ymm0,%ymm1,%ymm3
+ .byte 196,195,97,34,68,147,36,1 // vpinsrd $0x1,0x24(%r11,%rdx,4),%xmm3,%xmm0
+ .byte 196,227,101,2,216,15 // vpblendd $0xf,%ymm0,%ymm3,%ymm3
+ .byte 196,193,121,110,68,147,32 // vmovd 0x20(%r11,%rdx,4),%xmm0
+ .byte 196,227,101,2,216,1 // vpblendd $0x1,%ymm0,%ymm3,%ymm3
+ .byte 233,72,254,255,255 // jmpq 187 <_sk_load_8888_hsw_lowp+0x15>
+ .byte 196,193,121,110,68,147,56 // vmovd 0x38(%r11,%rdx,4),%xmm0
+ .byte 196,226,125,89,192 // vpbroadcastq %xmm0,%ymm0
+ .byte 197,245,239,201 // vpxor %ymm1,%ymm1,%ymm1
+ .byte 196,227,117,2,216,64 // vpblendd $0x40,%ymm0,%ymm1,%ymm3
+ .byte 196,227,125,57,216,1 // vextracti128 $0x1,%ymm3,%xmm0
+ .byte 196,195,121,34,68,147,52,1 // vpinsrd $0x1,0x34(%r11,%rdx,4),%xmm0,%xmm0
+ .byte 196,227,101,56,216,1 // vinserti128 $0x1,%xmm0,%ymm3,%ymm3
+ .byte 196,227,125,57,216,1 // vextracti128 $0x1,%ymm3,%xmm0
+ .byte 196,195,121,34,68,147,48,0 // vpinsrd $0x0,0x30(%r11,%rdx,4),%xmm0,%xmm0
+ .byte 196,227,101,56,216,1 // vinserti128 $0x1,%xmm0,%ymm3,%ymm3
+ .byte 196,65,126,111,4,147 // vmovdqu (%r11,%rdx,4),%ymm8
+ .byte 196,193,122,111,68,147,32 // vmovdqu 0x20(%r11,%rdx,4),%xmm0
+ .byte 196,227,125,2,219,240 // vpblendd $0xf0,%ymm3,%ymm0,%ymm3
+ .byte 233,248,253,255,255 // jmpq 18d <_sk_load_8888_hsw_lowp+0x1b>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 235,254 // jmp 398 <_sk_load_8888_hsw_lowp+0x226>
+ .byte 255 // (bad)
+ .byte 255,12,255 // decl (%rdi,%rdi,8)
+ .byte 255 // (bad)
+ .byte 255,246 // push %rsi
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,96,255 // jmpq *-0x1(%rax)
+ .byte 255 // (bad)
+ .byte 255,76,255,255 // decl -0x1(%rdi,%rdi,8)
+ .byte 255 // (bad)
+ .byte 56,255 // cmp %bh,%bh
+ .byte 255 // (bad)
+ .byte 255,34 // jmpq *(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 239 // out %eax,(%dx)
+ .byte 253 // std
+ .byte 255 // (bad)
+ .byte 255,149,255,255,255,135 // callq *-0x78000001(%rbp)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,113,255 // pushq -0x1(%rcx)
+ .byte 255 // (bad)
+ .byte 255,229 // jmpq *%rbp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,209 // callq *%rcx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 189,255,255,255,167 // mov $0xa7ffffff,%ebp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
+
+HIDDEN _sk_store_8888_hsw_lowp
+.globl _sk_store_8888_hsw_lowp
+FUNCTION(_sk_store_8888_hsw_lowp)
+_sk_store_8888_hsw_lowp:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 76,139,24 // mov (%rax),%r11
+ .byte 197,189,113,208,7 // vpsrlw $0x7,%ymm0,%ymm8
+ .byte 196,98,125,121,13,57,19,0,0 // vpbroadcastw 0x1339(%rip),%ymm9 # 1720 <_sk_xor__hsw_lowp+0x144>
+ .byte 196,65,61,234,193 // vpminsw %ymm9,%ymm8,%ymm8
+ .byte 196,66,125,51,208 // vpmovzxwd %xmm8,%ymm10
+ .byte 196,67,125,57,192,1 // vextracti128 $0x1,%ymm8,%xmm8
+ .byte 196,66,125,51,192 // vpmovzxwd %xmm8,%ymm8
+ .byte 197,165,113,209,7 // vpsrlw $0x7,%ymm1,%ymm11
+ .byte 196,65,37,234,217 // vpminsw %ymm9,%ymm11,%ymm11
+ .byte 196,67,125,57,220,1 // vextracti128 $0x1,%ymm11,%xmm12
+ .byte 196,66,125,51,228 // vpmovzxwd %xmm12,%ymm12
+ .byte 196,66,125,51,219 // vpmovzxwd %xmm11,%ymm11
+ .byte 196,193,37,114,243,8 // vpslld $0x8,%ymm11,%ymm11
+ .byte 196,193,29,114,244,8 // vpslld $0x8,%ymm12,%ymm12
+ .byte 196,65,29,235,192 // vpor %ymm8,%ymm12,%ymm8
+ .byte 196,65,37,235,210 // vpor %ymm10,%ymm11,%ymm10
+ .byte 197,165,113,210,7 // vpsrlw $0x7,%ymm2,%ymm11
+ .byte 196,65,37,234,217 // vpminsw %ymm9,%ymm11,%ymm11
+ .byte 196,66,125,51,227 // vpmovzxwd %xmm11,%ymm12
+ .byte 196,67,125,57,219,1 // vextracti128 $0x1,%ymm11,%xmm11
+ .byte 196,66,125,51,219 // vpmovzxwd %xmm11,%ymm11
+ .byte 196,193,37,114,243,16 // vpslld $0x10,%ymm11,%ymm11
+ .byte 196,193,29,114,244,16 // vpslld $0x10,%ymm12,%ymm12
+ .byte 197,149,113,211,7 // vpsrlw $0x7,%ymm3,%ymm13
+ .byte 196,65,21,234,201 // vpminsw %ymm9,%ymm13,%ymm9
+ .byte 196,67,125,57,205,1 // vextracti128 $0x1,%ymm9,%xmm13
+ .byte 196,66,125,51,237 // vpmovzxwd %xmm13,%ymm13
+ .byte 196,66,125,51,201 // vpmovzxwd %xmm9,%ymm9
+ .byte 196,193,13,114,241,24 // vpslld $0x18,%ymm9,%ymm14
+ .byte 196,193,53,114,245,24 // vpslld $0x18,%ymm13,%ymm9
+ .byte 196,65,37,235,201 // vpor %ymm9,%ymm11,%ymm9
+ .byte 196,65,61,235,201 // vpor %ymm9,%ymm8,%ymm9
+ .byte 196,65,29,235,198 // vpor %ymm14,%ymm12,%ymm8
+ .byte 196,65,45,235,192 // vpor %ymm8,%ymm10,%ymm8
+ .byte 77,133,192 // test %r8,%r8
+ .byte 117,17 // jne 4a2 <_sk_store_8888_hsw_lowp+0xce>
+ .byte 196,65,126,127,4,147 // vmovdqu %ymm8,(%r11,%rdx,4)
+ .byte 196,65,126,127,76,147,32 // vmovdqu %ymm9,0x20(%r11,%rdx,4)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 69,137,193 // mov %r8d,%r9d
+ .byte 65,128,225,15 // and $0xf,%r9b
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 119,236 // ja 49e <_sk_store_8888_hsw_lowp+0xca>
+ .byte 69,15,182,201 // movzbl %r9b,%r9d
+ .byte 76,141,21,175,0,0,0 // lea 0xaf(%rip),%r10 # 56c <_sk_store_8888_hsw_lowp+0x198>
+ .byte 75,99,4,138 // movslq (%r10,%r9,4),%rax
+ .byte 76,1,208 // add %r10,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 196,65,121,126,4,147 // vmovd %xmm8,(%r11,%rdx,4)
+ .byte 235,208 // jmp 49e <_sk_store_8888_hsw_lowp+0xca>
+ .byte 196,67,121,22,68,147,8,2 // vpextrd $0x2,%xmm8,0x8(%r11,%rdx,4)
+ .byte 196,65,121,214,4,147 // vmovq %xmm8,(%r11,%rdx,4)
+ .byte 235,192 // jmp 49e <_sk_store_8888_hsw_lowp+0xca>
+ .byte 196,67,125,57,193,1 // vextracti128 $0x1,%ymm8,%xmm9
+ .byte 196,67,121,22,76,147,24,2 // vpextrd $0x2,%xmm9,0x18(%r11,%rdx,4)
+ .byte 196,67,125,57,193,1 // vextracti128 $0x1,%ymm8,%xmm9
+ .byte 196,67,121,22,76,147,20,1 // vpextrd $0x1,%xmm9,0x14(%r11,%rdx,4)
+ .byte 196,67,125,57,193,1 // vextracti128 $0x1,%ymm8,%xmm9
+ .byte 196,65,121,126,76,147,16 // vmovd %xmm9,0x10(%r11,%rdx,4)
+ .byte 196,65,122,127,4,147 // vmovdqu %xmm8,(%r11,%rdx,4)
+ .byte 235,143 // jmp 49e <_sk_store_8888_hsw_lowp+0xca>
+ .byte 196,67,121,22,76,147,40,2 // vpextrd $0x2,%xmm9,0x28(%r11,%rdx,4)
+ .byte 196,67,121,22,76,147,36,1 // vpextrd $0x1,%xmm9,0x24(%r11,%rdx,4)
+ .byte 196,65,121,126,76,147,32 // vmovd %xmm9,0x20(%r11,%rdx,4)
+ .byte 196,65,126,127,4,147 // vmovdqu %ymm8,(%r11,%rdx,4)
+ .byte 233,109,255,255,255 // jmpq 49e <_sk_store_8888_hsw_lowp+0xca>
+ .byte 196,67,125,57,202,1 // vextracti128 $0x1,%ymm9,%xmm10
+ .byte 196,67,121,22,84,147,56,2 // vpextrd $0x2,%xmm10,0x38(%r11,%rdx,4)
+ .byte 196,67,125,57,202,1 // vextracti128 $0x1,%ymm9,%xmm10
+ .byte 196,67,121,22,84,147,52,1 // vpextrd $0x1,%xmm10,0x34(%r11,%rdx,4)
+ .byte 196,67,125,57,202,1 // vextracti128 $0x1,%ymm9,%xmm10
+ .byte 196,65,121,126,84,147,48 // vmovd %xmm10,0x30(%r11,%rdx,4)
+ .byte 196,65,126,127,4,147 // vmovdqu %ymm8,(%r11,%rdx,4)
+ .byte 196,65,122,127,76,147,32 // vmovdqu %xmm9,0x20(%r11,%rdx,4)
+ .byte 233,50,255,255,255 // jmpq 49e <_sk_store_8888_hsw_lowp+0xca>
+ .byte 90 // pop %rdx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,106,255 // ljmp *-0x1(%rdx)
+ .byte 255 // (bad)
+ .byte 255,98,255 // jmpq *-0x1(%rdx)
+ .byte 255 // (bad)
+ .byte 255,155,255,255,255,142 // lcall *-0x71000001(%rbx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,128,255,255,255,114 // incl 0x72ffffff(%rax)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 186,255,255,255,179 // mov $0xb3ffffff,%edx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,171,255,255,255,163 // ljmp *-0x5c000001(%rbx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 238 // out %al,(%dx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,225 // jmpq *%rcx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,211 // callq *%rbx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,197 // inc %ebp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
+
+HIDDEN _sk_load_a8_hsw_lowp
+.globl _sk_load_a8_hsw_lowp
+FUNCTION(_sk_load_a8_hsw_lowp)
+_sk_load_a8_hsw_lowp:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 76,139,24 // mov (%rax),%r11
+ .byte 77,133,192 // test %r8,%r8
+ .byte 117,45 // jne 5df <_sk_load_a8_hsw_lowp+0x37>
+ .byte 196,193,122,111,4,19 // vmovdqu (%r11,%rdx,1),%xmm0
+ .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0
+ .byte 197,253,113,240,8 // vpsllw $0x8,%ymm0,%ymm0
+ .byte 196,226,125,121,13,87,17,0,0 // vpbroadcastw 0x1157(%rip),%ymm1 # 1722 <_sk_xor__hsw_lowp+0x146>
+ .byte 197,253,228,217 // vpmulhuw %ymm1,%ymm0,%ymm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 197,253,239,192 // vpxor %ymm0,%ymm0,%ymm0
+ .byte 197,245,239,201 // vpxor %ymm1,%ymm1,%ymm1
+ .byte 197,236,87,210 // vxorps %ymm2,%ymm2,%ymm2
+ .byte 255,224 // jmpq *%rax
+ .byte 69,137,193 // mov %r8d,%r9d
+ .byte 65,128,225,15 // and $0xf,%r9b
+ .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 119,197 // ja 5b8 <_sk_load_a8_hsw_lowp+0x10>
+ .byte 69,15,182,201 // movzbl %r9b,%r9d
+ .byte 76,141,21,194,0,0,0 // lea 0xc2(%rip),%r10 # 6c0 <_sk_load_a8_hsw_lowp+0x118>
+ .byte 75,99,4,138 // movslq (%r10,%r9,4),%rax
+ .byte 76,1,208 // add %r10,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 65,15,182,4,19 // movzbl (%r11,%rdx,1),%eax
+ .byte 197,249,110,192 // vmovd %eax,%xmm0
+ .byte 235,166 // jmp 5b8 <_sk_load_a8_hsw_lowp+0x10>
+ .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,2,2 // vpinsrb $0x2,0x2(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 65,15,183,4,19 // movzwl (%r11,%rdx,1),%eax
+ .byte 197,249,110,200 // vmovd %eax,%xmm1
+ .byte 196,227,121,14,193,1 // vpblendw $0x1,%xmm1,%xmm0,%xmm0
+ .byte 235,137 // jmp 5b8 <_sk_load_a8_hsw_lowp+0x10>
+ .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,6,6 // vpinsrb $0x6,0x6(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,5,5 // vpinsrb $0x5,0x5(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,4,4 // vpinsrb $0x4,0x4(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 196,193,121,110,12,19 // vmovd (%r11,%rdx,1),%xmm1
+ .byte 196,227,121,2,193,1 // vpblendd $0x1,%xmm1,%xmm0,%xmm0
+ .byte 233,92,255,255,255 // jmpq 5b8 <_sk_load_a8_hsw_lowp+0x10>
+ .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,10,10 // vpinsrb $0xa,0xa(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,9,9 // vpinsrb $0x9,0x9(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,8,8 // vpinsrb $0x8,0x8(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 196,193,122,126,12,19 // vmovq (%r11,%rdx,1),%xmm1
+ .byte 196,227,113,2,192,12 // vpblendd $0xc,%xmm0,%xmm1,%xmm0
+ .byte 233,47,255,255,255 // jmpq 5b8 <_sk_load_a8_hsw_lowp+0x10>
+ .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,14,14 // vpinsrb $0xe,0xe(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,13,13 // vpinsrb $0xd,0xd(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,12,12 // vpinsrb $0xc,0xc(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 196,193,122,126,12,19 // vmovq (%r11,%rdx,1),%xmm1
+ .byte 196,195,113,34,76,19,8,2 // vpinsrd $0x2,0x8(%r11,%rdx,1),%xmm1,%xmm1
+ .byte 196,227,113,2,192,8 // vpblendd $0x8,%xmm0,%xmm1,%xmm0
+ .byte 233,250,254,255,255 // jmpq 5b8 <_sk_load_a8_hsw_lowp+0x10>
+ .byte 102,144 // xchg %ax,%ax
+ .byte 71,255 // rex.RXB (bad)
+ .byte 255 // (bad)
+ .byte 255,94,255 // lcall *-0x1(%rsi)
+ .byte 255 // (bad)
+ .byte 255,82,255 // callq *-0x1(%rdx)
+ .byte 255 // (bad)
+ .byte 255,139,255,255,255,131 // decl -0x7c000001(%rbx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 123,255 // jnp 6d5 <_sk_load_a8_hsw_lowp+0x12d>
+ .byte 255 // (bad)
+ .byte 255,111,255 // ljmp *-0x1(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 184,255,255,255,176 // mov $0xb0ffffff,%eax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,168,255,255,255,156 // ljmp *-0x63000001(%rax)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,229 // jmpq *%rbp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 221,255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,213 // callq *%rbp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,201 // dec %ecx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
+
+HIDDEN _sk_store_a8_hsw_lowp
+.globl _sk_store_a8_hsw_lowp
+FUNCTION(_sk_store_a8_hsw_lowp)
+_sk_store_a8_hsw_lowp:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 76,139,24 // mov (%rax),%r11
+ .byte 197,189,113,211,7 // vpsrlw $0x7,%ymm3,%ymm8
+ .byte 196,67,125,57,193,1 // vextracti128 $0x1,%ymm8,%xmm9
+ .byte 196,65,57,103,193 // vpackuswb %xmm9,%xmm8,%xmm8
+ .byte 77,133,192 // test %r8,%r8
+ .byte 117,10 // jne 720 <_sk_store_a8_hsw_lowp+0x24>
+ .byte 196,65,122,127,4,19 // vmovdqu %xmm8,(%r11,%rdx,1)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 69,137,193 // mov %r8d,%r9d
+ .byte 65,128,225,15 // and $0xf,%r9b
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 119,236 // ja 71c <_sk_store_a8_hsw_lowp+0x20>
+ .byte 69,15,182,201 // movzbl %r9b,%r9d
+ .byte 76,141,21,137,0,0,0 // lea 0x89(%rip),%r10 # 7c4 <_sk_store_a8_hsw_lowp+0xc8>
+ .byte 75,99,4,138 // movslq (%r10,%r9,4),%rax
+ .byte 76,1,208 // add %r10,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 196,67,121,20,4,19,0 // vpextrb $0x0,%xmm8,(%r11,%rdx,1)
+ .byte 235,207 // jmp 71c <_sk_store_a8_hsw_lowp+0x20>
+ .byte 196,67,121,20,68,19,2,2 // vpextrb $0x2,%xmm8,0x2(%r11,%rdx,1)
+ .byte 196,67,121,21,4,19,0 // vpextrw $0x0,%xmm8,(%r11,%rdx,1)
+ .byte 235,190 // jmp 71c <_sk_store_a8_hsw_lowp+0x20>
+ .byte 196,67,121,20,68,19,6,6 // vpextrb $0x6,%xmm8,0x6(%r11,%rdx,1)
+ .byte 196,67,121,20,68,19,5,5 // vpextrb $0x5,%xmm8,0x5(%r11,%rdx,1)
+ .byte 196,67,121,20,68,19,4,4 // vpextrb $0x4,%xmm8,0x4(%r11,%rdx,1)
+ .byte 196,65,121,126,4,19 // vmovd %xmm8,(%r11,%rdx,1)
+ .byte 235,158 // jmp 71c <_sk_store_a8_hsw_lowp+0x20>
+ .byte 196,67,121,20,68,19,10,10 // vpextrb $0xa,%xmm8,0xa(%r11,%rdx,1)
+ .byte 196,67,121,20,68,19,9,9 // vpextrb $0x9,%xmm8,0x9(%r11,%rdx,1)
+ .byte 196,67,121,20,68,19,8,8 // vpextrb $0x8,%xmm8,0x8(%r11,%rdx,1)
+ .byte 235,32 // jmp 7b8 <_sk_store_a8_hsw_lowp+0xbc>
+ .byte 196,67,121,20,68,19,14,14 // vpextrb $0xe,%xmm8,0xe(%r11,%rdx,1)
+ .byte 196,67,121,20,68,19,13,13 // vpextrb $0xd,%xmm8,0xd(%r11,%rdx,1)
+ .byte 196,67,121,20,68,19,12,12 // vpextrb $0xc,%xmm8,0xc(%r11,%rdx,1)
+ .byte 196,67,121,22,68,19,8,2 // vpextrd $0x2,%xmm8,0x8(%r11,%rdx,1)
+ .byte 196,65,121,214,4,19 // vmovq %xmm8,(%r11,%rdx,1)
+ .byte 233,89,255,255,255 // jmpq 71c <_sk_store_a8_hsw_lowp+0x20>
+ .byte 144 // nop
+ .byte 128,255,255 // cmp $0xff,%bh
+ .byte 255,145,255,255,255,137 // callq *-0x76000001(%rcx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,178,255,255,255,170 // pushq -0x55000001(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,162,255,255,255,154 // jmpq *-0x65000001(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,244 // push %rsp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,202 // dec %edx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,194 // inc %edx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 186,255,255,255,236 // mov $0xecffffff,%edx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,228 // jmpq *%rsp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 220,255 // fdivr %st,%st(7)
+ .byte 255 // (bad)
+ .byte 255,212 // callq *%rsp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
+
+HIDDEN _sk_load_g8_hsw_lowp
+.globl _sk_load_g8_hsw_lowp
+FUNCTION(_sk_load_g8_hsw_lowp)
+_sk_load_g8_hsw_lowp:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 76,139,24 // mov (%rax),%r11
+ .byte 77,133,192 // test %r8,%r8
+ .byte 117,50 // jne 83c <_sk_load_g8_hsw_lowp+0x3c>
+ .byte 196,193,122,111,4,19 // vmovdqu (%r11,%rdx,1),%xmm0
+ .byte 196,226,125,48,192 // vpmovzxbw %xmm0,%ymm0
+ .byte 197,253,113,240,8 // vpsllw $0x8,%ymm0,%ymm0
+ .byte 196,226,125,121,13,1,15,0,0 // vpbroadcastw 0xf01(%rip),%ymm1 # 1724 <_sk_xor__hsw_lowp+0x148>
+ .byte 197,253,228,193 // vpmulhuw %ymm1,%ymm0,%ymm0
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 196,226,125,121,29,244,14,0,0 // vpbroadcastw 0xef4(%rip),%ymm3 # 1726 <_sk_xor__hsw_lowp+0x14a>
+ .byte 197,253,111,200 // vmovdqa %ymm0,%ymm1
+ .byte 197,253,111,208 // vmovdqa %ymm0,%ymm2
+ .byte 255,224 // jmpq *%rax
+ .byte 69,137,193 // mov %r8d,%r9d
+ .byte 65,128,225,15 // and $0xf,%r9b
+ .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 119,192 // ja 810 <_sk_load_g8_hsw_lowp+0x10>
+ .byte 69,15,182,201 // movzbl %r9b,%r9d
+ .byte 76,141,21,193,0,0,0 // lea 0xc1(%rip),%r10 # 91c <_sk_load_g8_hsw_lowp+0x11c>
+ .byte 75,99,4,138 // movslq (%r10,%r9,4),%rax
+ .byte 76,1,208 // add %r10,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 65,15,182,4,19 // movzbl (%r11,%rdx,1),%eax
+ .byte 197,249,110,192 // vmovd %eax,%xmm0
+ .byte 235,161 // jmp 810 <_sk_load_g8_hsw_lowp+0x10>
+ .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,2,2 // vpinsrb $0x2,0x2(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 65,15,183,4,19 // movzwl (%r11,%rdx,1),%eax
+ .byte 197,249,110,200 // vmovd %eax,%xmm1
+ .byte 196,227,121,14,193,1 // vpblendw $0x1,%xmm1,%xmm0,%xmm0
+ .byte 235,132 // jmp 810 <_sk_load_g8_hsw_lowp+0x10>
+ .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,6,6 // vpinsrb $0x6,0x6(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,5,5 // vpinsrb $0x5,0x5(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,4,4 // vpinsrb $0x4,0x4(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 196,193,121,110,12,19 // vmovd (%r11,%rdx,1),%xmm1
+ .byte 196,227,121,2,193,1 // vpblendd $0x1,%xmm1,%xmm0,%xmm0
+ .byte 233,87,255,255,255 // jmpq 810 <_sk_load_g8_hsw_lowp+0x10>
+ .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,10,10 // vpinsrb $0xa,0xa(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,9,9 // vpinsrb $0x9,0x9(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,8,8 // vpinsrb $0x8,0x8(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 196,193,122,126,12,19 // vmovq (%r11,%rdx,1),%xmm1
+ .byte 196,227,113,2,192,12 // vpblendd $0xc,%xmm0,%xmm1,%xmm0
+ .byte 233,42,255,255,255 // jmpq 810 <_sk_load_g8_hsw_lowp+0x10>
+ .byte 197,249,239,192 // vpxor %xmm0,%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,14,14 // vpinsrb $0xe,0xe(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,13,13 // vpinsrb $0xd,0xd(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 196,195,121,32,68,19,12,12 // vpinsrb $0xc,0xc(%r11,%rdx,1),%xmm0,%xmm0
+ .byte 196,193,122,126,12,19 // vmovq (%r11,%rdx,1),%xmm1
+ .byte 196,195,113,34,76,19,8,2 // vpinsrd $0x2,0x8(%r11,%rdx,1),%xmm1,%xmm1
+ .byte 196,227,113,2,192,8 // vpblendd $0x8,%xmm0,%xmm1,%xmm0
+ .byte 233,245,254,255,255 // jmpq 810 <_sk_load_g8_hsw_lowp+0x10>
+ .byte 144 // nop
+ .byte 72,255 // rex.W (bad)
+ .byte 255 // (bad)
+ .byte 255,95,255 // lcall *-0x1(%rdi)
+ .byte 255 // (bad)
+ .byte 255,83,255 // callq *-0x1(%rbx)
+ .byte 255 // (bad)
+ .byte 255,140,255,255,255,132,255 // decl -0x7b0001(%rdi,%rdi,8)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 124,255 // jl 931 <_sk_load_g8_hsw_lowp+0x131>
+ .byte 255 // (bad)
+ .byte 255,112,255 // pushq -0x1(%rax)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 185,255,255,255,177 // mov $0xb1ffffff,%ecx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,169,255,255,255,157 // ljmp *-0x62000001(%rcx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,230 // jmpq *%rsi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 222,255 // fdivrp %st,%st(7)
+ .byte 255 // (bad)
+ .byte 255,214 // callq *%rsi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,202 // dec %edx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
+
+HIDDEN _sk_srcover_rgba_8888_hsw_lowp
+.globl _sk_srcover_rgba_8888_hsw_lowp
+FUNCTION(_sk_srcover_rgba_8888_hsw_lowp)
+_sk_srcover_rgba_8888_hsw_lowp:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 76,139,24 // mov (%rax),%r11
+ .byte 77,133,192 // test %r8,%r8
+ .byte 15,133,220,1,0,0 // jne b42 <_sk_srcover_rgba_8888_hsw_lowp+0x1ea>
+ .byte 196,193,126,111,124,147,32 // vmovdqu 0x20(%r11,%rdx,4),%ymm7
+ .byte 196,65,126,111,4,147 // vmovdqu (%r11,%rdx,4),%ymm8
+ .byte 197,253,111,37,197,13,0,0 // vmovdqa 0xdc5(%rip),%ymm4 # 1740 <_sk_xor__hsw_lowp+0x164>
+ .byte 196,226,61,0,236 // vpshufb %ymm4,%ymm8,%ymm5
+ .byte 196,227,253,0,237,232 // vpermq $0xe8,%ymm5,%ymm5
+ .byte 196,226,69,0,228 // vpshufb %ymm4,%ymm7,%ymm4
+ .byte 196,227,253,0,228,232 // vpermq $0xe8,%ymm4,%ymm4
+ .byte 196,227,85,56,228,1 // vinserti128 $0x1,%xmm4,%ymm5,%ymm4
+ .byte 196,98,125,121,13,192,13,0,0 // vpbroadcastw 0xdc0(%rip),%ymm9 # 1760 <_sk_xor__hsw_lowp+0x184>
+ .byte 197,221,113,244,8 // vpsllw $0x8,%ymm4,%ymm4
+ .byte 196,98,125,121,21,180,13,0,0 // vpbroadcastw 0xdb4(%rip),%ymm10 # 1762 <_sk_xor__hsw_lowp+0x186>
+ .byte 196,193,93,228,226 // vpmulhuw %ymm10,%ymm4,%ymm4
+ .byte 197,253,111,45,197,13,0,0 // vmovdqa 0xdc5(%rip),%ymm5 # 1780 <_sk_xor__hsw_lowp+0x1a4>
+ .byte 196,226,61,0,245 // vpshufb %ymm5,%ymm8,%ymm6
+ .byte 196,227,253,0,246,232 // vpermq $0xe8,%ymm6,%ymm6
+ .byte 196,226,69,0,237 // vpshufb %ymm5,%ymm7,%ymm5
+ .byte 196,227,253,0,237,232 // vpermq $0xe8,%ymm5,%ymm5
+ .byte 196,227,77,56,237,1 // vinserti128 $0x1,%xmm5,%ymm6,%ymm5
+ .byte 197,213,113,245,8 // vpsllw $0x8,%ymm5,%ymm5
+ .byte 196,193,85,228,234 // vpmulhuw %ymm10,%ymm5,%ymm5
+ .byte 197,253,111,53,183,13,0,0 // vmovdqa 0xdb7(%rip),%ymm6 # 17a0 <_sk_xor__hsw_lowp+0x1c4>
+ .byte 196,98,61,0,222 // vpshufb %ymm6,%ymm8,%ymm11
+ .byte 196,67,253,0,219,232 // vpermq $0xe8,%ymm11,%ymm11
+ .byte 196,226,69,0,246 // vpshufb %ymm6,%ymm7,%ymm6
+ .byte 196,227,253,0,246,232 // vpermq $0xe8,%ymm6,%ymm6
+ .byte 196,227,37,56,246,1 // vinserti128 $0x1,%xmm6,%ymm11,%ymm6
+ .byte 197,205,113,246,8 // vpsllw $0x8,%ymm6,%ymm6
+ .byte 196,193,77,228,242 // vpmulhuw %ymm10,%ymm6,%ymm6
+ .byte 197,125,111,29,169,13,0,0 // vmovdqa 0xda9(%rip),%ymm11 # 17c0 <_sk_xor__hsw_lowp+0x1e4>
+ .byte 196,66,61,0,195 // vpshufb %ymm11,%ymm8,%ymm8
+ .byte 196,67,253,0,192,232 // vpermq $0xe8,%ymm8,%ymm8
+ .byte 196,194,69,0,251 // vpshufb %ymm11,%ymm7,%ymm7
+ .byte 196,227,253,0,255,232 // vpermq $0xe8,%ymm7,%ymm7
+ .byte 196,227,61,56,255,1 // vinserti128 $0x1,%xmm7,%ymm8,%ymm7
+ .byte 197,197,113,247,8 // vpsllw $0x8,%ymm7,%ymm7
+ .byte 196,193,69,228,250 // vpmulhuw %ymm10,%ymm7,%ymm7
+ .byte 196,98,125,121,5,154,13,0,0 // vpbroadcastw 0xd9a(%rip),%ymm8 # 17e0 <_sk_xor__hsw_lowp+0x204>
+ .byte 197,61,249,195 // vpsubw %ymm3,%ymm8,%ymm8
+ .byte 196,66,93,11,208 // vpmulhrsw %ymm8,%ymm4,%ymm10
+ .byte 196,66,125,29,210 // vpabsw %ymm10,%ymm10
+ .byte 197,173,253,192 // vpaddw %ymm0,%ymm10,%ymm0
+ .byte 196,66,85,11,208 // vpmulhrsw %ymm8,%ymm5,%ymm10
+ .byte 196,66,125,29,210 // vpabsw %ymm10,%ymm10
+ .byte 197,173,253,201 // vpaddw %ymm1,%ymm10,%ymm1
+ .byte 196,66,77,11,208 // vpmulhrsw %ymm8,%ymm6,%ymm10
+ .byte 196,66,125,29,210 // vpabsw %ymm10,%ymm10
+ .byte 197,173,253,210 // vpaddw %ymm2,%ymm10,%ymm2
+ .byte 196,66,69,11,192 // vpmulhrsw %ymm8,%ymm7,%ymm8
+ .byte 196,66,125,29,192 // vpabsw %ymm8,%ymm8
+ .byte 197,189,253,219 // vpaddw %ymm3,%ymm8,%ymm3
+ .byte 197,189,113,208,7 // vpsrlw $0x7,%ymm0,%ymm8
+ .byte 196,65,61,234,193 // vpminsw %ymm9,%ymm8,%ymm8
+ .byte 196,66,125,51,208 // vpmovzxwd %xmm8,%ymm10
+ .byte 196,67,125,57,192,1 // vextracti128 $0x1,%ymm8,%xmm8
+ .byte 196,66,125,51,192 // vpmovzxwd %xmm8,%ymm8
+ .byte 197,165,113,209,7 // vpsrlw $0x7,%ymm1,%ymm11
+ .byte 196,65,37,234,217 // vpminsw %ymm9,%ymm11,%ymm11
+ .byte 196,67,125,57,220,1 // vextracti128 $0x1,%ymm11,%xmm12
+ .byte 196,66,125,51,228 // vpmovzxwd %xmm12,%ymm12
+ .byte 196,66,125,51,219 // vpmovzxwd %xmm11,%ymm11
+ .byte 196,193,37,114,243,8 // vpslld $0x8,%ymm11,%ymm11
+ .byte 196,193,29,114,244,8 // vpslld $0x8,%ymm12,%ymm12
+ .byte 197,149,113,210,7 // vpsrlw $0x7,%ymm2,%ymm13
+ .byte 196,65,21,234,233 // vpminsw %ymm9,%ymm13,%ymm13
+ .byte 196,66,125,51,245 // vpmovzxwd %xmm13,%ymm14
+ .byte 196,67,125,57,237,1 // vextracti128 $0x1,%ymm13,%xmm13
+ .byte 196,66,125,51,237 // vpmovzxwd %xmm13,%ymm13
+ .byte 196,193,21,114,245,16 // vpslld $0x10,%ymm13,%ymm13
+ .byte 196,193,13,114,246,16 // vpslld $0x10,%ymm14,%ymm14
+ .byte 197,133,113,211,7 // vpsrlw $0x7,%ymm3,%ymm15
+ .byte 196,65,5,234,201 // vpminsw %ymm9,%ymm15,%ymm9
+ .byte 196,67,125,57,207,1 // vextracti128 $0x1,%ymm9,%xmm15
+ .byte 196,66,125,51,255 // vpmovzxwd %xmm15,%ymm15
+ .byte 196,66,125,51,201 // vpmovzxwd %xmm9,%ymm9
+ .byte 196,193,53,114,241,24 // vpslld $0x18,%ymm9,%ymm9
+ .byte 196,193,5,114,247,24 // vpslld $0x18,%ymm15,%ymm15
+ .byte 196,65,29,235,192 // vpor %ymm8,%ymm12,%ymm8
+ .byte 196,65,37,235,218 // vpor %ymm10,%ymm11,%ymm11
+ .byte 196,65,21,235,215 // vpor %ymm15,%ymm13,%ymm10
+ .byte 196,65,61,235,210 // vpor %ymm10,%ymm8,%ymm10
+ .byte 196,65,13,235,193 // vpor %ymm9,%ymm14,%ymm8
+ .byte 196,65,37,235,192 // vpor %ymm8,%ymm11,%ymm8
+ .byte 77,133,192 // test %r8,%r8
+ .byte 117,77 // jne b7e <_sk_srcover_rgba_8888_hsw_lowp+0x226>
+ .byte 196,65,126,127,4,147 // vmovdqu %ymm8,(%r11,%rdx,4)
+ .byte 196,65,126,127,84,147,32 // vmovdqu %ymm10,0x20(%r11,%rdx,4)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 69,137,193 // mov %r8d,%r9d
+ .byte 65,128,225,15 // and $0xf,%r9b
+ .byte 197,197,239,255 // vpxor %ymm7,%ymm7,%ymm7
+ .byte 196,65,61,239,192 // vpxor %ymm8,%ymm8,%ymm8
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 15,135,20,254,255,255 // ja 973 <_sk_srcover_rgba_8888_hsw_lowp+0x1b>
+ .byte 69,15,182,201 // movzbl %r9b,%r9d
+ .byte 76,141,21,238,1,0,0 // lea 0x1ee(%rip),%r10 # d58 <_sk_srcover_rgba_8888_hsw_lowp+0x400>
+ .byte 75,99,4,138 // movslq (%r10,%r9,4),%rax
+ .byte 76,1,208 // add %r10,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 196,65,121,110,4,147 // vmovd (%r11,%rdx,4),%xmm8
+ .byte 233,245,253,255,255 // jmpq 973 <_sk_srcover_rgba_8888_hsw_lowp+0x1b>
+ .byte 69,137,193 // mov %r8d,%r9d
+ .byte 65,128,225,15 // and $0xf,%r9b
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 119,176 // ja b3e <_sk_srcover_rgba_8888_hsw_lowp+0x1e6>
+ .byte 65,15,182,193 // movzbl %r9b,%eax
+ .byte 76,141,13,251,1,0,0 // lea 0x1fb(%rip),%r9 # d94 <_sk_srcover_rgba_8888_hsw_lowp+0x43c>
+ .byte 73,99,4,129 // movslq (%r9,%rax,4),%rax
+ .byte 76,1,200 // add %r9,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 196,65,121,126,4,147 // vmovd %xmm8,(%r11,%rdx,4)
+ .byte 235,148 // jmp b3e <_sk_srcover_rgba_8888_hsw_lowp+0x1e6>
+ .byte 196,193,121,110,100,147,8 // vmovd 0x8(%r11,%rdx,4),%xmm4
+ .byte 196,226,121,89,228 // vpbroadcastq %xmm4,%xmm4
+ .byte 197,197,239,255 // vpxor %ymm7,%ymm7,%ymm7
+ .byte 196,99,69,2,196,4 // vpblendd $0x4,%ymm4,%ymm7,%ymm8
+ .byte 196,194,121,53,36,147 // vpmovzxdq (%r11,%rdx,4),%xmm4
+ .byte 197,249,112,228,232 // vpshufd $0xe8,%xmm4,%xmm4
+ .byte 196,99,61,2,196,3 // vpblendd $0x3,%ymm4,%ymm8,%ymm8
+ .byte 233,157,253,255,255 // jmpq 973 <_sk_srcover_rgba_8888_hsw_lowp+0x1b>
+ .byte 196,193,121,110,100,147,24 // vmovd 0x18(%r11,%rdx,4),%xmm4
+ .byte 196,226,125,89,228 // vpbroadcastq %xmm4,%ymm4
+ .byte 197,197,239,255 // vpxor %ymm7,%ymm7,%ymm7
+ .byte 196,99,69,2,196,64 // vpblendd $0x40,%ymm4,%ymm7,%ymm8
+ .byte 196,99,125,57,196,1 // vextracti128 $0x1,%ymm8,%xmm4
+ .byte 196,195,89,34,100,147,20,1 // vpinsrd $0x1,0x14(%r11,%rdx,4),%xmm4,%xmm4
+ .byte 196,99,61,56,196,1 // vinserti128 $0x1,%xmm4,%ymm8,%ymm8
+ .byte 196,99,125,57,196,1 // vextracti128 $0x1,%ymm8,%xmm4
+ .byte 196,195,89,34,100,147,16,0 // vpinsrd $0x0,0x10(%r11,%rdx,4),%xmm4,%xmm4
+ .byte 196,99,61,56,196,1 // vinserti128 $0x1,%xmm4,%ymm8,%ymm8
+ .byte 196,193,122,111,36,147 // vmovdqu (%r11,%rdx,4),%xmm4
+ .byte 196,67,93,2,192,240 // vpblendd $0xf0,%ymm8,%ymm4,%ymm8
+ .byte 233,78,253,255,255 // jmpq 973 <_sk_srcover_rgba_8888_hsw_lowp+0x1b>
+ .byte 196,193,121,110,100,147,40 // vmovd 0x28(%r11,%rdx,4),%xmm4
+ .byte 196,226,121,89,228 // vpbroadcastq %xmm4,%xmm4
+ .byte 197,213,239,237 // vpxor %ymm5,%ymm5,%ymm5
+ .byte 196,227,85,2,252,4 // vpblendd $0x4,%ymm4,%ymm5,%ymm7
+ .byte 196,195,65,34,100,147,36,1 // vpinsrd $0x1,0x24(%r11,%rdx,4),%xmm7,%xmm4
+ .byte 196,227,69,2,252,15 // vpblendd $0xf,%ymm4,%ymm7,%ymm7
+ .byte 196,193,121,110,100,147,32 // vmovd 0x20(%r11,%rdx,4),%xmm4
+ .byte 196,227,69,2,252,1 // vpblendd $0x1,%ymm4,%ymm7,%ymm7
+ .byte 233,18,253,255,255 // jmpq 96d <_sk_srcover_rgba_8888_hsw_lowp+0x15>
+ .byte 196,193,121,110,100,147,56 // vmovd 0x38(%r11,%rdx,4),%xmm4
+ .byte 196,226,125,89,228 // vpbroadcastq %xmm4,%ymm4
+ .byte 197,213,239,237 // vpxor %ymm5,%ymm5,%ymm5
+ .byte 196,227,85,2,252,64 // vpblendd $0x40,%ymm4,%ymm5,%ymm7
+ .byte 196,227,125,57,252,1 // vextracti128 $0x1,%ymm7,%xmm4
+ .byte 196,195,89,34,100,147,52,1 // vpinsrd $0x1,0x34(%r11,%rdx,4),%xmm4,%xmm4
+ .byte 196,227,69,56,252,1 // vinserti128 $0x1,%xmm4,%ymm7,%ymm7
+ .byte 196,227,125,57,252,1 // vextracti128 $0x1,%ymm7,%xmm4
+ .byte 196,195,89,34,100,147,48,0 // vpinsrd $0x0,0x30(%r11,%rdx,4),%xmm4,%xmm4
+ .byte 196,227,69,56,252,1 // vinserti128 $0x1,%xmm4,%ymm7,%ymm7
+ .byte 196,65,126,111,4,147 // vmovdqu (%r11,%rdx,4),%ymm8
+ .byte 196,193,122,111,100,147,32 // vmovdqu 0x20(%r11,%rdx,4),%xmm4
+ .byte 196,227,93,2,255,240 // vpblendd $0xf0,%ymm7,%ymm4,%ymm7
+ .byte 233,194,252,255,255 // jmpq 973 <_sk_srcover_rgba_8888_hsw_lowp+0x1b>
+ .byte 196,67,121,22,68,147,8,2 // vpextrd $0x2,%xmm8,0x8(%r11,%rdx,4)
+ .byte 196,65,121,214,4,147 // vmovq %xmm8,(%r11,%rdx,4)
+ .byte 233,122,254,255,255 // jmpq b3e <_sk_srcover_rgba_8888_hsw_lowp+0x1e6>
+ .byte 196,67,125,57,193,1 // vextracti128 $0x1,%ymm8,%xmm9
+ .byte 196,67,121,22,76,147,24,2 // vpextrd $0x2,%xmm9,0x18(%r11,%rdx,4)
+ .byte 196,67,125,57,193,1 // vextracti128 $0x1,%ymm8,%xmm9
+ .byte 196,67,121,22,76,147,20,1 // vpextrd $0x1,%xmm9,0x14(%r11,%rdx,4)
+ .byte 196,67,125,57,193,1 // vextracti128 $0x1,%ymm8,%xmm9
+ .byte 196,65,121,126,76,147,16 // vmovd %xmm9,0x10(%r11,%rdx,4)
+ .byte 196,65,122,127,4,147 // vmovdqu %xmm8,(%r11,%rdx,4)
+ .byte 233,70,254,255,255 // jmpq b3e <_sk_srcover_rgba_8888_hsw_lowp+0x1e6>
+ .byte 196,67,121,22,84,147,40,2 // vpextrd $0x2,%xmm10,0x28(%r11,%rdx,4)
+ .byte 196,67,121,22,84,147,36,1 // vpextrd $0x1,%xmm10,0x24(%r11,%rdx,4)
+ .byte 196,65,121,126,84,147,32 // vmovd %xmm10,0x20(%r11,%rdx,4)
+ .byte 196,65,126,127,4,147 // vmovdqu %ymm8,(%r11,%rdx,4)
+ .byte 233,36,254,255,255 // jmpq b3e <_sk_srcover_rgba_8888_hsw_lowp+0x1e6>
+ .byte 196,67,125,57,209,1 // vextracti128 $0x1,%ymm10,%xmm9
+ .byte 196,67,121,22,76,147,56,2 // vpextrd $0x2,%xmm9,0x38(%r11,%rdx,4)
+ .byte 196,67,125,57,209,1 // vextracti128 $0x1,%ymm10,%xmm9
+ .byte 196,67,121,22,76,147,52,1 // vpextrd $0x1,%xmm9,0x34(%r11,%rdx,4)
+ .byte 196,67,125,57,209,1 // vextracti128 $0x1,%ymm10,%xmm9
+ .byte 196,65,121,126,76,147,48 // vmovd %xmm9,0x30(%r11,%rdx,4)
+ .byte 196,65,126,127,4,147 // vmovdqu %ymm8,(%r11,%rdx,4)
+ .byte 196,65,122,127,84,147,32 // vmovdqu %xmm10,0x20(%r11,%rdx,4)
+ .byte 233,233,253,255,255 // jmpq b3e <_sk_srcover_rgba_8888_hsw_lowp+0x1e6>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 27,254 // sbb %esi,%edi
+ .byte 255 // (bad)
+ .byte 255,104,254 // ljmp *-0x2(%rax)
+ .byte 255 // (bad)
+ .byte 255,82,254 // callq *-0x2(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 188,254,255,255,168 // mov $0xa8fffffe,%esp
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,148,254,255,255,126,254 // callq *-0x1810001(%rsi,%rdi,8)
+ .byte 255 // (bad)
+ .byte 255,21,252,255,255,241 // callq *-0xe000004(%rip) # fffffffff2000d75 <_sk_xor__hsw_lowp+0xfffffffff1fff799>
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,227 // jmpq *%rbx
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,205 // dec %ebp
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,65,255 // incl -0x1(%rcx)
+ .byte 255 // (bad)
+ .byte 255,45,255,255,255,25 // ljmp *0x19ffffff(%rip) # 1a000d8c <_sk_xor__hsw_lowp+0x19fff7b0>
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,3 // incl (%rbx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,14 // decl (%rsi)
+ .byte 254 // (bad)
+ .byte 255 // (bad)
+ .byte 255,37,255,255,255,29 // jmpq *0x1dffffff(%rip) # 1e000d9c <_sk_xor__hsw_lowp+0x1dfff7c0>
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,89,255 // lcall *-0x1(%rcx)
+ .byte 255 // (bad)
+ .byte 255,76,255,255 // decl -0x1(%rdi,%rdi,8)
+ .byte 255 // (bad)
+ .byte 62,255 // ds (bad)
+ .byte 255 // (bad)
+ .byte 255,48 // pushq (%rax)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 123,255 // jnp db1 <_sk_srcover_rgba_8888_hsw_lowp+0x459>
+ .byte 255 // (bad)
+ .byte 255,116,255,255 // pushq -0x1(%rdi,%rdi,8)
+ .byte 255,108,255,255 // ljmp *-0x1(%rdi,%rdi,8)
+ .byte 255,100,255,255 // jmpq *-0x1(%rdi,%rdi,8)
+ .byte 255,175,255,255,255,162 // ljmp *-0x5d000001(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,148,255,255,255,134,255 // callq *-0x790001(%rdi,%rdi,8)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
+
+HIDDEN _sk_scale_1_float_hsw_lowp
+.globl _sk_scale_1_float_hsw_lowp
+FUNCTION(_sk_scale_1_float_hsw_lowp)
+_sk_scale_1_float_hsw_lowp:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 197,122,16,0 // vmovss (%rax),%xmm8
+ .byte 197,58,88,5,126,8,0,0 // vaddss 0x87e(%rip),%xmm8,%xmm8 # 165c <_sk_xor__hsw_lowp+0x80>
+ .byte 197,121,126,192 // vmovd %xmm8,%eax
+ .byte 197,121,110,192 // vmovd %eax,%xmm8
+ .byte 196,66,125,121,192 // vpbroadcastw %xmm8,%ymm8
+ .byte 196,194,125,11,192 // vpmulhrsw %ymm8,%ymm0,%ymm0
+ .byte 196,226,125,29,192 // vpabsw %ymm0,%ymm0
+ .byte 196,194,117,11,200 // vpmulhrsw %ymm8,%ymm1,%ymm1
+ .byte 196,226,125,29,201 // vpabsw %ymm1,%ymm1
+ .byte 196,194,109,11,208 // vpmulhrsw %ymm8,%ymm2,%ymm2
+ .byte 196,226,125,29,210 // vpabsw %ymm2,%ymm2
+ .byte 196,194,101,11,216 // vpmulhrsw %ymm8,%ymm3,%ymm3
+ .byte 196,226,125,29,219 // vpabsw %ymm3,%ymm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_scale_u8_hsw_lowp
+.globl _sk_scale_u8_hsw_lowp
+FUNCTION(_sk_scale_u8_hsw_lowp)
+_sk_scale_u8_hsw_lowp:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 76,139,24 // mov (%rax),%r11
+ .byte 77,133,192 // test %r8,%r8
+ .byte 117,75 // jne e6c <_sk_scale_u8_hsw_lowp+0x55>
+ .byte 196,65,122,111,4,19 // vmovdqu (%r11,%rdx,1),%xmm8
+ .byte 196,66,125,48,192 // vpmovzxbw %xmm8,%ymm8
+ .byte 196,193,61,113,240,8 // vpsllw $0x8,%ymm8,%ymm8
+ .byte 196,98,125,121,13,167,9,0,0 // vpbroadcastw 0x9a7(%rip),%ymm9 # 17e2 <_sk_xor__hsw_lowp+0x206>
+ .byte 196,65,61,228,193 // vpmulhuw %ymm9,%ymm8,%ymm8
+ .byte 196,194,125,11,192 // vpmulhrsw %ymm8,%ymm0,%ymm0
+ .byte 196,226,125,29,192 // vpabsw %ymm0,%ymm0
+ .byte 196,194,117,11,200 // vpmulhrsw %ymm8,%ymm1,%ymm1
+ .byte 196,226,125,29,201 // vpabsw %ymm1,%ymm1
+ .byte 196,194,109,11,208 // vpmulhrsw %ymm8,%ymm2,%ymm2
+ .byte 196,226,125,29,210 // vpabsw %ymm2,%ymm2
+ .byte 196,194,101,11,216 // vpmulhrsw %ymm8,%ymm3,%ymm3
+ .byte 196,226,125,29,219 // vpabsw %ymm3,%ymm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 69,137,193 // mov %r8d,%r9d
+ .byte 65,128,225,15 // and $0xf,%r9b
+ .byte 196,65,57,239,192 // vpxor %xmm8,%xmm8,%xmm8
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 119,166 // ja e27 <_sk_scale_u8_hsw_lowp+0x10>
+ .byte 69,15,182,201 // movzbl %r9b,%r9d
+ .byte 76,141,21,200,0,0,0 // lea 0xc8(%rip),%r10 # f54 <_sk_scale_u8_hsw_lowp+0x13d>
+ .byte 75,99,4,138 // movslq (%r10,%r9,4),%rax
+ .byte 76,1,208 // add %r10,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 65,15,182,4,19 // movzbl (%r11,%rdx,1),%eax
+ .byte 197,121,110,192 // vmovd %eax,%xmm8
+ .byte 235,135 // jmp e27 <_sk_scale_u8_hsw_lowp+0x10>
+ .byte 196,65,57,239,192 // vpxor %xmm8,%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,2,2 // vpinsrb $0x2,0x2(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 65,15,183,4,19 // movzwl (%r11,%rdx,1),%eax
+ .byte 197,121,110,200 // vmovd %eax,%xmm9
+ .byte 196,67,57,14,193,1 // vpblendw $0x1,%xmm9,%xmm8,%xmm8
+ .byte 233,102,255,255,255 // jmpq e27 <_sk_scale_u8_hsw_lowp+0x10>
+ .byte 196,65,57,239,192 // vpxor %xmm8,%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,6,6 // vpinsrb $0x6,0x6(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,5,5 // vpinsrb $0x5,0x5(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,4,4 // vpinsrb $0x4,0x4(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 196,65,121,110,12,19 // vmovd (%r11,%rdx,1),%xmm9
+ .byte 196,67,57,2,193,1 // vpblendd $0x1,%xmm9,%xmm8,%xmm8
+ .byte 233,56,255,255,255 // jmpq e27 <_sk_scale_u8_hsw_lowp+0x10>
+ .byte 196,65,57,239,192 // vpxor %xmm8,%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,10,10 // vpinsrb $0xa,0xa(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,9,9 // vpinsrb $0x9,0x9(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,8,8 // vpinsrb $0x8,0x8(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 196,65,122,126,12,19 // vmovq (%r11,%rdx,1),%xmm9
+ .byte 196,67,49,2,192,12 // vpblendd $0xc,%xmm8,%xmm9,%xmm8
+ .byte 233,10,255,255,255 // jmpq e27 <_sk_scale_u8_hsw_lowp+0x10>
+ .byte 196,65,57,239,192 // vpxor %xmm8,%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,14,14 // vpinsrb $0xe,0xe(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,13,13 // vpinsrb $0xd,0xd(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,12,12 // vpinsrb $0xc,0xc(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 196,65,122,126,12,19 // vmovq (%r11,%rdx,1),%xmm9
+ .byte 196,67,49,34,76,19,8,2 // vpinsrd $0x2,0x8(%r11,%rdx,1),%xmm9,%xmm9
+ .byte 196,67,49,2,192,8 // vpblendd $0x8,%xmm8,%xmm9,%xmm8
+ .byte 233,212,254,255,255 // jmpq e27 <_sk_scale_u8_hsw_lowp+0x10>
+ .byte 144 // nop
+ .byte 65,255 // rex.B (bad)
+ .byte 255 // (bad)
+ .byte 255,89,255 // lcall *-0x1(%rcx)
+ .byte 255 // (bad)
+ .byte 255,76,255,255 // decl -0x1(%rdi,%rdi,8)
+ .byte 255,138,255,255,255,130 // decl -0x7d000001(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 122,255 // jp f69 <_sk_scale_u8_hsw_lowp+0x152>
+ .byte 255 // (bad)
+ .byte 255,109,255 // ljmp *-0x1(%rbp)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 184,255,255,255,176 // mov $0xb0ffffff,%eax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,168,255,255,255,155 // ljmp *-0x64000001(%rax)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,230 // jmpq *%rsi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 222,255 // fdivrp %st,%st(7)
+ .byte 255 // (bad)
+ .byte 255,214 // callq *%rsi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,201 // dec %ecx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
+
+HIDDEN _sk_lerp_1_float_hsw_lowp
+.globl _sk_lerp_1_float_hsw_lowp
+FUNCTION(_sk_lerp_1_float_hsw_lowp)
+_sk_lerp_1_float_hsw_lowp:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 197,122,16,0 // vmovss (%rax),%xmm8
+ .byte 197,58,88,5,194,6,0,0 // vaddss 0x6c2(%rip),%xmm8,%xmm8 # 1660 <_sk_xor__hsw_lowp+0x84>
+ .byte 197,121,126,192 // vmovd %xmm8,%eax
+ .byte 197,121,110,192 // vmovd %eax,%xmm8
+ .byte 196,66,125,121,192 // vpbroadcastw %xmm8,%ymm8
+ .byte 196,194,125,11,192 // vpmulhrsw %ymm8,%ymm0,%ymm0
+ .byte 196,226,125,29,192 // vpabsw %ymm0,%ymm0
+ .byte 196,98,125,121,13,38,8,0,0 // vpbroadcastw 0x826(%rip),%ymm9 # 17e4 <_sk_xor__hsw_lowp+0x208>
+ .byte 196,65,53,249,200 // vpsubw %ymm8,%ymm9,%ymm9
+ .byte 196,66,93,11,209 // vpmulhrsw %ymm9,%ymm4,%ymm10
+ .byte 196,66,125,29,210 // vpabsw %ymm10,%ymm10
+ .byte 197,173,253,192 // vpaddw %ymm0,%ymm10,%ymm0
+ .byte 196,194,117,11,200 // vpmulhrsw %ymm8,%ymm1,%ymm1
+ .byte 196,226,125,29,201 // vpabsw %ymm1,%ymm1
+ .byte 196,66,85,11,209 // vpmulhrsw %ymm9,%ymm5,%ymm10
+ .byte 196,66,125,29,210 // vpabsw %ymm10,%ymm10
+ .byte 197,173,253,201 // vpaddw %ymm1,%ymm10,%ymm1
+ .byte 196,194,109,11,208 // vpmulhrsw %ymm8,%ymm2,%ymm2
+ .byte 196,226,125,29,210 // vpabsw %ymm2,%ymm2
+ .byte 196,66,77,11,209 // vpmulhrsw %ymm9,%ymm6,%ymm10
+ .byte 196,66,125,29,210 // vpabsw %ymm10,%ymm10
+ .byte 197,173,253,210 // vpaddw %ymm2,%ymm10,%ymm2
+ .byte 196,194,101,11,216 // vpmulhrsw %ymm8,%ymm3,%ymm3
+ .byte 196,226,125,29,219 // vpabsw %ymm3,%ymm3
+ .byte 196,66,69,11,193 // vpmulhrsw %ymm9,%ymm7,%ymm8
+ .byte 196,66,125,29,192 // vpabsw %ymm8,%ymm8
+ .byte 197,189,253,219 // vpaddw %ymm3,%ymm8,%ymm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_lerp_u8_hsw_lowp
+.globl _sk_lerp_u8_hsw_lowp
+FUNCTION(_sk_lerp_u8_hsw_lowp)
+_sk_lerp_u8_hsw_lowp:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 76,139,24 // mov (%rax),%r11
+ .byte 77,133,192 // test %r8,%r8
+ .byte 15,133,145,0,0,0 // jne 10bc <_sk_lerp_u8_hsw_lowp+0x9f>
+ .byte 196,65,122,111,4,19 // vmovdqu (%r11,%rdx,1),%xmm8
+ .byte 196,66,125,48,192 // vpmovzxbw %xmm8,%ymm8
+ .byte 196,193,61,113,240,8 // vpsllw $0x8,%ymm8,%ymm8
+ .byte 196,98,125,121,13,161,7,0,0 // vpbroadcastw 0x7a1(%rip),%ymm9 # 17e6 <_sk_xor__hsw_lowp+0x20a>
+ .byte 196,65,61,228,193 // vpmulhuw %ymm9,%ymm8,%ymm8
+ .byte 196,194,125,11,192 // vpmulhrsw %ymm8,%ymm0,%ymm0
+ .byte 196,226,125,29,192 // vpabsw %ymm0,%ymm0
+ .byte 196,98,125,121,13,139,7,0,0 // vpbroadcastw 0x78b(%rip),%ymm9 # 17e8 <_sk_xor__hsw_lowp+0x20c>
+ .byte 196,65,53,249,200 // vpsubw %ymm8,%ymm9,%ymm9
+ .byte 196,66,93,11,209 // vpmulhrsw %ymm9,%ymm4,%ymm10
+ .byte 196,66,125,29,210 // vpabsw %ymm10,%ymm10
+ .byte 197,173,253,192 // vpaddw %ymm0,%ymm10,%ymm0
+ .byte 196,194,117,11,200 // vpmulhrsw %ymm8,%ymm1,%ymm1
+ .byte 196,226,125,29,201 // vpabsw %ymm1,%ymm1
+ .byte 196,66,85,11,209 // vpmulhrsw %ymm9,%ymm5,%ymm10
+ .byte 196,66,125,29,210 // vpabsw %ymm10,%ymm10
+ .byte 197,173,253,201 // vpaddw %ymm1,%ymm10,%ymm1
+ .byte 196,194,109,11,208 // vpmulhrsw %ymm8,%ymm2,%ymm2
+ .byte 196,226,125,29,210 // vpabsw %ymm2,%ymm2
+ .byte 196,66,77,11,209 // vpmulhrsw %ymm9,%ymm6,%ymm10
+ .byte 196,66,125,29,210 // vpabsw %ymm10,%ymm10
+ .byte 197,173,253,210 // vpaddw %ymm2,%ymm10,%ymm2
+ .byte 196,194,101,11,216 // vpmulhrsw %ymm8,%ymm3,%ymm3
+ .byte 196,226,125,29,219 // vpabsw %ymm3,%ymm3
+ .byte 196,66,69,11,193 // vpmulhrsw %ymm9,%ymm7,%ymm8
+ .byte 196,66,125,29,192 // vpabsw %ymm8,%ymm8
+ .byte 197,189,253,219 // vpaddw %ymm3,%ymm8,%ymm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 69,137,193 // mov %r8d,%r9d
+ .byte 65,128,225,15 // and $0xf,%r9b
+ .byte 196,65,57,239,192 // vpxor %xmm8,%xmm8,%xmm8
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,14 // cmp $0xe,%r9b
+ .byte 15,135,92,255,255,255 // ja 1031 <_sk_lerp_u8_hsw_lowp+0x14>
+ .byte 69,15,182,201 // movzbl %r9b,%r9d
+ .byte 76,141,21,204,0,0,0 // lea 0xcc(%rip),%r10 # 11ac <_sk_lerp_u8_hsw_lowp+0x18f>
+ .byte 75,99,4,138 // movslq (%r10,%r9,4),%rax
+ .byte 76,1,208 // add %r10,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 65,15,182,4,19 // movzbl (%r11,%rdx,1),%eax
+ .byte 197,121,110,192 // vmovd %eax,%xmm8
+ .byte 233,58,255,255,255 // jmpq 1031 <_sk_lerp_u8_hsw_lowp+0x14>
+ .byte 196,65,57,239,192 // vpxor %xmm8,%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,2,2 // vpinsrb $0x2,0x2(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 65,15,183,4,19 // movzwl (%r11,%rdx,1),%eax
+ .byte 197,121,110,200 // vmovd %eax,%xmm9
+ .byte 196,67,57,14,193,1 // vpblendw $0x1,%xmm9,%xmm8,%xmm8
+ .byte 233,25,255,255,255 // jmpq 1031 <_sk_lerp_u8_hsw_lowp+0x14>
+ .byte 196,65,57,239,192 // vpxor %xmm8,%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,6,6 // vpinsrb $0x6,0x6(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,5,5 // vpinsrb $0x5,0x5(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,4,4 // vpinsrb $0x4,0x4(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 196,65,121,110,12,19 // vmovd (%r11,%rdx,1),%xmm9
+ .byte 196,67,57,2,193,1 // vpblendd $0x1,%xmm9,%xmm8,%xmm8
+ .byte 233,235,254,255,255 // jmpq 1031 <_sk_lerp_u8_hsw_lowp+0x14>
+ .byte 196,65,57,239,192 // vpxor %xmm8,%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,10,10 // vpinsrb $0xa,0xa(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,9,9 // vpinsrb $0x9,0x9(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,8,8 // vpinsrb $0x8,0x8(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 196,65,122,126,12,19 // vmovq (%r11,%rdx,1),%xmm9
+ .byte 196,67,49,2,192,12 // vpblendd $0xc,%xmm8,%xmm9,%xmm8
+ .byte 233,189,254,255,255 // jmpq 1031 <_sk_lerp_u8_hsw_lowp+0x14>
+ .byte 196,65,57,239,192 // vpxor %xmm8,%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,14,14 // vpinsrb $0xe,0xe(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,13,13 // vpinsrb $0xd,0xd(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 196,67,57,32,68,19,12,12 // vpinsrb $0xc,0xc(%r11,%rdx,1),%xmm8,%xmm8
+ .byte 196,65,122,126,12,19 // vmovq (%r11,%rdx,1),%xmm9
+ .byte 196,67,49,34,76,19,8,2 // vpinsrd $0x2,0x8(%r11,%rdx,1),%xmm9,%xmm9
+ .byte 196,67,49,2,192,8 // vpblendd $0x8,%xmm8,%xmm9,%xmm8
+ .byte 233,135,254,255,255 // jmpq 1031 <_sk_lerp_u8_hsw_lowp+0x14>
+ .byte 102,144 // xchg %ax,%ax
+ .byte 61,255,255,255,88 // cmp $0x58ffffff,%eax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,75,255 // decl -0x1(%rbx)
+ .byte 255 // (bad)
+ .byte 255,137,255,255,255,129 // decl -0x7e000001(%rcx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 121,255 // jns 11c1 <_sk_lerp_u8_hsw_lowp+0x1a4>
+ .byte 255 // (bad)
+ .byte 255,108,255,255 // ljmp *-0x1(%rdi,%rdi,8)
+ .byte 255,183,255,255,255,175 // pushq -0x50000001(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,167,255,255,255,154 // jmpq *-0x65000001(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,229 // jmpq *%rbp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 221,255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,213 // callq *%rbp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,200 // dec %eax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
+
+HIDDEN _sk_swap_rb_hsw_lowp
+.globl _sk_swap_rb_hsw_lowp
+FUNCTION(_sk_swap_rb_hsw_lowp)
+_sk_swap_rb_hsw_lowp:
+ .byte 197,124,40,192 // vmovaps %ymm0,%ymm8
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 197,252,40,194 // vmovaps %ymm2,%ymm0
+ .byte 197,124,41,194 // vmovaps %ymm8,%ymm2
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_swap_hsw_lowp
+.globl _sk_swap_hsw_lowp
+FUNCTION(_sk_swap_hsw_lowp)
+_sk_swap_hsw_lowp:
+ .byte 197,124,40,195 // vmovaps %ymm3,%ymm8
+ .byte 197,124,40,202 // vmovaps %ymm2,%ymm9
+ .byte 197,124,40,209 // vmovaps %ymm1,%ymm10
+ .byte 197,124,40,216 // vmovaps %ymm0,%ymm11
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 197,252,40,196 // vmovaps %ymm4,%ymm0
+ .byte 197,252,40,205 // vmovaps %ymm5,%ymm1
+ .byte 197,252,40,214 // vmovaps %ymm6,%ymm2
+ .byte 197,252,40,223 // vmovaps %ymm7,%ymm3
+ .byte 197,124,41,220 // vmovaps %ymm11,%ymm4
+ .byte 197,124,41,213 // vmovaps %ymm10,%ymm5
+ .byte 197,124,41,206 // vmovaps %ymm9,%ymm6
+ .byte 197,124,41,199 // vmovaps %ymm8,%ymm7
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_move_src_dst_hsw_lowp
+.globl _sk_move_src_dst_hsw_lowp
+FUNCTION(_sk_move_src_dst_hsw_lowp)
+_sk_move_src_dst_hsw_lowp:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 197,252,40,224 // vmovaps %ymm0,%ymm4
+ .byte 197,252,40,233 // vmovaps %ymm1,%ymm5
+ .byte 197,252,40,242 // vmovaps %ymm2,%ymm6
+ .byte 197,252,40,251 // vmovaps %ymm3,%ymm7
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_move_dst_src_hsw_lowp
+.globl _sk_move_dst_src_hsw_lowp
+FUNCTION(_sk_move_dst_src_hsw_lowp)
+_sk_move_dst_src_hsw_lowp:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 197,252,40,196 // vmovaps %ymm4,%ymm0
+ .byte 197,252,40,205 // vmovaps %ymm5,%ymm1
+ .byte 197,252,40,214 // vmovaps %ymm6,%ymm2
+ .byte 197,252,40,223 // vmovaps %ymm7,%ymm3
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_clear_hsw_lowp
+.globl _sk_clear_hsw_lowp
+FUNCTION(_sk_clear_hsw_lowp)
+_sk_clear_hsw_lowp:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 197,252,87,192 // vxorps %ymm0,%ymm0,%ymm0
+ .byte 197,244,87,201 // vxorps %ymm1,%ymm1,%ymm1
+ .byte 197,236,87,210 // vxorps %ymm2,%ymm2,%ymm2
+ .byte 197,228,87,219 // vxorps %ymm3,%ymm3,%ymm3
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_srcatop_hsw_lowp
+.globl _sk_srcatop_hsw_lowp
+FUNCTION(_sk_srcatop_hsw_lowp)
+_sk_srcatop_hsw_lowp:
+ .byte 196,226,125,11,199 // vpmulhrsw %ymm7,%ymm0,%ymm0
+ .byte 196,226,125,29,192 // vpabsw %ymm0,%ymm0
+ .byte 196,98,125,121,5,111,5,0,0 // vpbroadcastw 0x56f(%rip),%ymm8 # 17ea <_sk_xor__hsw_lowp+0x20e>
+ .byte 197,61,249,195 // vpsubw %ymm3,%ymm8,%ymm8
+ .byte 196,66,93,11,200 // vpmulhrsw %ymm8,%ymm4,%ymm9
+ .byte 196,66,125,29,201 // vpabsw %ymm9,%ymm9
+ .byte 197,181,253,192 // vpaddw %ymm0,%ymm9,%ymm0
+ .byte 196,226,117,11,207 // vpmulhrsw %ymm7,%ymm1,%ymm1
+ .byte 196,226,125,29,201 // vpabsw %ymm1,%ymm1
+ .byte 196,66,85,11,200 // vpmulhrsw %ymm8,%ymm5,%ymm9
+ .byte 196,66,125,29,201 // vpabsw %ymm9,%ymm9
+ .byte 197,181,253,201 // vpaddw %ymm1,%ymm9,%ymm1
+ .byte 196,226,109,11,215 // vpmulhrsw %ymm7,%ymm2,%ymm2
+ .byte 196,226,125,29,210 // vpabsw %ymm2,%ymm2
+ .byte 196,66,77,11,200 // vpmulhrsw %ymm8,%ymm6,%ymm9
+ .byte 196,66,125,29,201 // vpabsw %ymm9,%ymm9
+ .byte 197,181,253,210 // vpaddw %ymm2,%ymm9,%ymm2
+ .byte 196,226,101,11,223 // vpmulhrsw %ymm7,%ymm3,%ymm3
+ .byte 196,226,125,29,219 // vpabsw %ymm3,%ymm3
+ .byte 196,66,69,11,192 // vpmulhrsw %ymm8,%ymm7,%ymm8
+ .byte 196,66,125,29,192 // vpabsw %ymm8,%ymm8
+ .byte 197,189,253,219 // vpaddw %ymm3,%ymm8,%ymm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_dstatop_hsw_lowp
+.globl _sk_dstatop_hsw_lowp
+FUNCTION(_sk_dstatop_hsw_lowp)
+_sk_dstatop_hsw_lowp:
+ .byte 196,98,93,11,195 // vpmulhrsw %ymm3,%ymm4,%ymm8
+ .byte 196,66,125,29,192 // vpabsw %ymm8,%ymm8
+ .byte 196,98,125,121,13,0,5,0,0 // vpbroadcastw 0x500(%rip),%ymm9 # 17ec <_sk_xor__hsw_lowp+0x210>
+ .byte 197,53,249,207 // vpsubw %ymm7,%ymm9,%ymm9
+ .byte 196,194,125,11,193 // vpmulhrsw %ymm9,%ymm0,%ymm0
+ .byte 196,226,125,29,192 // vpabsw %ymm0,%ymm0
+ .byte 196,193,125,253,192 // vpaddw %ymm8,%ymm0,%ymm0
+ .byte 196,98,85,11,195 // vpmulhrsw %ymm3,%ymm5,%ymm8
+ .byte 196,66,125,29,192 // vpabsw %ymm8,%ymm8
+ .byte 196,194,117,11,201 // vpmulhrsw %ymm9,%ymm1,%ymm1
+ .byte 196,226,125,29,201 // vpabsw %ymm1,%ymm1
+ .byte 196,193,117,253,200 // vpaddw %ymm8,%ymm1,%ymm1
+ .byte 196,98,77,11,195 // vpmulhrsw %ymm3,%ymm6,%ymm8
+ .byte 196,66,125,29,192 // vpabsw %ymm8,%ymm8
+ .byte 196,194,109,11,209 // vpmulhrsw %ymm9,%ymm2,%ymm2
+ .byte 196,226,125,29,210 // vpabsw %ymm2,%ymm2
+ .byte 196,193,109,253,208 // vpaddw %ymm8,%ymm2,%ymm2
+ .byte 196,98,69,11,195 // vpmulhrsw %ymm3,%ymm7,%ymm8
+ .byte 196,66,125,29,192 // vpabsw %ymm8,%ymm8
+ .byte 196,194,101,11,217 // vpmulhrsw %ymm9,%ymm3,%ymm3
+ .byte 196,226,125,29,219 // vpabsw %ymm3,%ymm3
+ .byte 196,193,101,253,216 // vpaddw %ymm8,%ymm3,%ymm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_srcin_hsw_lowp
+.globl _sk_srcin_hsw_lowp
+FUNCTION(_sk_srcin_hsw_lowp)
+_sk_srcin_hsw_lowp:
+ .byte 196,226,125,11,199 // vpmulhrsw %ymm7,%ymm0,%ymm0
+ .byte 196,226,125,29,192 // vpabsw %ymm0,%ymm0
+ .byte 196,226,117,11,207 // vpmulhrsw %ymm7,%ymm1,%ymm1
+ .byte 196,226,125,29,201 // vpabsw %ymm1,%ymm1
+ .byte 196,226,109,11,215 // vpmulhrsw %ymm7,%ymm2,%ymm2
+ .byte 196,226,125,29,210 // vpabsw %ymm2,%ymm2
+ .byte 196,226,101,11,223 // vpmulhrsw %ymm7,%ymm3,%ymm3
+ .byte 196,226,125,29,219 // vpabsw %ymm3,%ymm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_dstin_hsw_lowp
+.globl _sk_dstin_hsw_lowp
+FUNCTION(_sk_dstin_hsw_lowp)
+_sk_dstin_hsw_lowp:
+ .byte 196,226,93,11,195 // vpmulhrsw %ymm3,%ymm4,%ymm0
+ .byte 196,226,125,29,192 // vpabsw %ymm0,%ymm0
+ .byte 196,226,85,11,203 // vpmulhrsw %ymm3,%ymm5,%ymm1
+ .byte 196,226,125,29,201 // vpabsw %ymm1,%ymm1
+ .byte 196,226,77,11,211 // vpmulhrsw %ymm3,%ymm6,%ymm2
+ .byte 196,226,125,29,210 // vpabsw %ymm2,%ymm2
+ .byte 196,226,69,11,219 // vpmulhrsw %ymm3,%ymm7,%ymm3
+ .byte 196,226,125,29,219 // vpabsw %ymm3,%ymm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_srcout_hsw_lowp
+.globl _sk_srcout_hsw_lowp
+FUNCTION(_sk_srcout_hsw_lowp)
+_sk_srcout_hsw_lowp:
+ .byte 196,98,125,121,5,63,4,0,0 // vpbroadcastw 0x43f(%rip),%ymm8 # 17ee <_sk_xor__hsw_lowp+0x212>
+ .byte 197,61,249,199 // vpsubw %ymm7,%ymm8,%ymm8
+ .byte 196,194,125,11,192 // vpmulhrsw %ymm8,%ymm0,%ymm0
+ .byte 196,226,125,29,192 // vpabsw %ymm0,%ymm0
+ .byte 196,194,117,11,200 // vpmulhrsw %ymm8,%ymm1,%ymm1
+ .byte 196,226,125,29,201 // vpabsw %ymm1,%ymm1
+ .byte 196,194,109,11,208 // vpmulhrsw %ymm8,%ymm2,%ymm2
+ .byte 196,226,125,29,210 // vpabsw %ymm2,%ymm2
+ .byte 196,194,101,11,216 // vpmulhrsw %ymm8,%ymm3,%ymm3
+ .byte 196,226,125,29,219 // vpabsw %ymm3,%ymm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_dstout_hsw_lowp
+.globl _sk_dstout_hsw_lowp
+FUNCTION(_sk_dstout_hsw_lowp)
+_sk_dstout_hsw_lowp:
+ .byte 196,226,125,121,5,8,4,0,0 // vpbroadcastw 0x408(%rip),%ymm0 # 17f0 <_sk_xor__hsw_lowp+0x214>
+ .byte 197,253,249,219 // vpsubw %ymm3,%ymm0,%ymm3
+ .byte 196,226,93,11,195 // vpmulhrsw %ymm3,%ymm4,%ymm0
+ .byte 196,226,125,29,192 // vpabsw %ymm0,%ymm0
+ .byte 196,226,85,11,203 // vpmulhrsw %ymm3,%ymm5,%ymm1
+ .byte 196,226,125,29,201 // vpabsw %ymm1,%ymm1
+ .byte 196,226,77,11,211 // vpmulhrsw %ymm3,%ymm6,%ymm2
+ .byte 196,226,125,29,210 // vpabsw %ymm2,%ymm2
+ .byte 196,226,69,11,219 // vpmulhrsw %ymm3,%ymm7,%ymm3
+ .byte 196,226,125,29,219 // vpabsw %ymm3,%ymm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_srcover_hsw_lowp
+.globl _sk_srcover_hsw_lowp
+FUNCTION(_sk_srcover_hsw_lowp)
+_sk_srcover_hsw_lowp:
+ .byte 196,98,125,121,5,209,3,0,0 // vpbroadcastw 0x3d1(%rip),%ymm8 # 17f2 <_sk_xor__hsw_lowp+0x216>
+ .byte 197,61,249,195 // vpsubw %ymm3,%ymm8,%ymm8
+ .byte 196,66,93,11,200 // vpmulhrsw %ymm8,%ymm4,%ymm9
+ .byte 196,66,125,29,201 // vpabsw %ymm9,%ymm9
+ .byte 197,181,253,192 // vpaddw %ymm0,%ymm9,%ymm0
+ .byte 196,66,85,11,200 // vpmulhrsw %ymm8,%ymm5,%ymm9
+ .byte 196,66,125,29,201 // vpabsw %ymm9,%ymm9
+ .byte 197,181,253,201 // vpaddw %ymm1,%ymm9,%ymm1
+ .byte 196,66,77,11,200 // vpmulhrsw %ymm8,%ymm6,%ymm9
+ .byte 196,66,125,29,201 // vpabsw %ymm9,%ymm9
+ .byte 197,181,253,210 // vpaddw %ymm2,%ymm9,%ymm2
+ .byte 196,66,69,11,192 // vpmulhrsw %ymm8,%ymm7,%ymm8
+ .byte 196,66,125,29,192 // vpabsw %ymm8,%ymm8
+ .byte 197,189,253,219 // vpaddw %ymm3,%ymm8,%ymm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_dstover_hsw_lowp
+.globl _sk_dstover_hsw_lowp
+FUNCTION(_sk_dstover_hsw_lowp)
+_sk_dstover_hsw_lowp:
+ .byte 196,98,125,121,5,138,3,0,0 // vpbroadcastw 0x38a(%rip),%ymm8 # 17f4 <_sk_xor__hsw_lowp+0x218>
+ .byte 197,61,249,199 // vpsubw %ymm7,%ymm8,%ymm8
+ .byte 196,194,125,11,192 // vpmulhrsw %ymm8,%ymm0,%ymm0
+ .byte 196,226,125,29,192 // vpabsw %ymm0,%ymm0
+ .byte 197,253,253,196 // vpaddw %ymm4,%ymm0,%ymm0
+ .byte 196,194,117,11,200 // vpmulhrsw %ymm8,%ymm1,%ymm1
+ .byte 196,226,125,29,201 // vpabsw %ymm1,%ymm1
+ .byte 197,245,253,205 // vpaddw %ymm5,%ymm1,%ymm1
+ .byte 196,194,109,11,208 // vpmulhrsw %ymm8,%ymm2,%ymm2
+ .byte 196,226,125,29,210 // vpabsw %ymm2,%ymm2
+ .byte 197,237,253,214 // vpaddw %ymm6,%ymm2,%ymm2
+ .byte 196,194,101,11,216 // vpmulhrsw %ymm8,%ymm3,%ymm3
+ .byte 196,226,125,29,219 // vpabsw %ymm3,%ymm3
+ .byte 197,229,253,223 // vpaddw %ymm7,%ymm3,%ymm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_modulate_hsw_lowp
+.globl _sk_modulate_hsw_lowp
+FUNCTION(_sk_modulate_hsw_lowp)
+_sk_modulate_hsw_lowp:
+ .byte 196,226,125,11,196 // vpmulhrsw %ymm4,%ymm0,%ymm0
+ .byte 196,226,125,29,192 // vpabsw %ymm0,%ymm0
+ .byte 196,226,117,11,205 // vpmulhrsw %ymm5,%ymm1,%ymm1
+ .byte 196,226,125,29,201 // vpabsw %ymm1,%ymm1
+ .byte 196,226,109,11,214 // vpmulhrsw %ymm6,%ymm2,%ymm2
+ .byte 196,226,125,29,210 // vpabsw %ymm2,%ymm2
+ .byte 196,226,101,11,223 // vpmulhrsw %ymm7,%ymm3,%ymm3
+ .byte 196,226,125,29,219 // vpabsw %ymm3,%ymm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_multiply_hsw_lowp
+.globl _sk_multiply_hsw_lowp
+FUNCTION(_sk_multiply_hsw_lowp)
+_sk_multiply_hsw_lowp:
+ .byte 196,98,125,121,5,23,3,0,0 // vpbroadcastw 0x317(%rip),%ymm8 # 17f6 <_sk_xor__hsw_lowp+0x21a>
+ .byte 197,61,249,207 // vpsubw %ymm7,%ymm8,%ymm9
+ .byte 196,66,125,11,209 // vpmulhrsw %ymm9,%ymm0,%ymm10
+ .byte 196,66,125,29,210 // vpabsw %ymm10,%ymm10
+ .byte 197,61,249,195 // vpsubw %ymm3,%ymm8,%ymm8
+ .byte 196,66,93,11,216 // vpmulhrsw %ymm8,%ymm4,%ymm11
+ .byte 196,66,125,29,219 // vpabsw %ymm11,%ymm11
+ .byte 196,65,37,253,210 // vpaddw %ymm10,%ymm11,%ymm10
+ .byte 196,226,125,11,196 // vpmulhrsw %ymm4,%ymm0,%ymm0
+ .byte 196,226,125,29,192 // vpabsw %ymm0,%ymm0
+ .byte 197,173,253,192 // vpaddw %ymm0,%ymm10,%ymm0
+ .byte 196,66,117,11,209 // vpmulhrsw %ymm9,%ymm1,%ymm10
+ .byte 196,66,125,29,210 // vpabsw %ymm10,%ymm10
+ .byte 196,66,85,11,216 // vpmulhrsw %ymm8,%ymm5,%ymm11
+ .byte 196,66,125,29,219 // vpabsw %ymm11,%ymm11
+ .byte 196,65,37,253,210 // vpaddw %ymm10,%ymm11,%ymm10
+ .byte 196,226,117,11,205 // vpmulhrsw %ymm5,%ymm1,%ymm1
+ .byte 196,226,125,29,201 // vpabsw %ymm1,%ymm1
+ .byte 197,173,253,201 // vpaddw %ymm1,%ymm10,%ymm1
+ .byte 196,66,109,11,209 // vpmulhrsw %ymm9,%ymm2,%ymm10
+ .byte 196,66,125,29,210 // vpabsw %ymm10,%ymm10
+ .byte 196,66,77,11,216 // vpmulhrsw %ymm8,%ymm6,%ymm11
+ .byte 196,66,125,29,219 // vpabsw %ymm11,%ymm11
+ .byte 196,65,37,253,210 // vpaddw %ymm10,%ymm11,%ymm10
+ .byte 196,226,109,11,214 // vpmulhrsw %ymm6,%ymm2,%ymm2
+ .byte 196,226,125,29,210 // vpabsw %ymm2,%ymm2
+ .byte 197,173,253,210 // vpaddw %ymm2,%ymm10,%ymm2
+ .byte 196,66,101,11,201 // vpmulhrsw %ymm9,%ymm3,%ymm9
+ .byte 196,66,125,29,201 // vpabsw %ymm9,%ymm9
+ .byte 196,66,69,11,192 // vpmulhrsw %ymm8,%ymm7,%ymm8
+ .byte 196,66,125,29,192 // vpabsw %ymm8,%ymm8
+ .byte 196,65,61,253,193 // vpaddw %ymm9,%ymm8,%ymm8
+ .byte 196,226,101,11,223 // vpmulhrsw %ymm7,%ymm3,%ymm3
+ .byte 196,226,125,29,219 // vpabsw %ymm3,%ymm3
+ .byte 197,189,253,219 // vpaddw %ymm3,%ymm8,%ymm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_screen_hsw_lowp
+.globl _sk_screen_hsw_lowp
+FUNCTION(_sk_screen_hsw_lowp)
+_sk_screen_hsw_lowp:
+ .byte 196,98,125,121,5,104,2,0,0 // vpbroadcastw 0x268(%rip),%ymm8 # 17f8 <_sk_xor__hsw_lowp+0x21c>
+ .byte 197,61,249,200 // vpsubw %ymm0,%ymm8,%ymm9
+ .byte 196,98,53,11,204 // vpmulhrsw %ymm4,%ymm9,%ymm9
+ .byte 196,66,125,29,201 // vpabsw %ymm9,%ymm9
+ .byte 197,181,253,192 // vpaddw %ymm0,%ymm9,%ymm0
+ .byte 197,61,249,201 // vpsubw %ymm1,%ymm8,%ymm9
+ .byte 196,98,53,11,205 // vpmulhrsw %ymm5,%ymm9,%ymm9
+ .byte 196,66,125,29,201 // vpabsw %ymm9,%ymm9
+ .byte 197,181,253,201 // vpaddw %ymm1,%ymm9,%ymm1
+ .byte 197,61,249,202 // vpsubw %ymm2,%ymm8,%ymm9
+ .byte 196,98,53,11,206 // vpmulhrsw %ymm6,%ymm9,%ymm9
+ .byte 196,66,125,29,201 // vpabsw %ymm9,%ymm9
+ .byte 197,181,253,210 // vpaddw %ymm2,%ymm9,%ymm2
+ .byte 197,61,249,195 // vpsubw %ymm3,%ymm8,%ymm8
+ .byte 196,98,61,11,199 // vpmulhrsw %ymm7,%ymm8,%ymm8
+ .byte 196,66,125,29,192 // vpabsw %ymm8,%ymm8
+ .byte 197,189,253,219 // vpaddw %ymm3,%ymm8,%ymm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_xor__hsw_lowp
+.globl _sk_xor__hsw_lowp
+FUNCTION(_sk_xor__hsw_lowp)
+_sk_xor__hsw_lowp:
+ .byte 196,98,125,121,5,21,2,0,0 // vpbroadcastw 0x215(%rip),%ymm8 # 17fa <_sk_xor__hsw_lowp+0x21e>
+ .byte 197,61,249,207 // vpsubw %ymm7,%ymm8,%ymm9
+ .byte 196,194,125,11,193 // vpmulhrsw %ymm9,%ymm0,%ymm0
+ .byte 196,226,125,29,192 // vpabsw %ymm0,%ymm0
+ .byte 197,61,249,195 // vpsubw %ymm3,%ymm8,%ymm8
+ .byte 196,66,93,11,208 // vpmulhrsw %ymm8,%ymm4,%ymm10
+ .byte 196,66,125,29,210 // vpabsw %ymm10,%ymm10
+ .byte 197,173,253,192 // vpaddw %ymm0,%ymm10,%ymm0
+ .byte 196,194,117,11,201 // vpmulhrsw %ymm9,%ymm1,%ymm1
+ .byte 196,226,125,29,201 // vpabsw %ymm1,%ymm1
+ .byte 196,66,85,11,208 // vpmulhrsw %ymm8,%ymm5,%ymm10
+ .byte 196,66,125,29,210 // vpabsw %ymm10,%ymm10
+ .byte 197,173,253,201 // vpaddw %ymm1,%ymm10,%ymm1
+ .byte 196,194,109,11,209 // vpmulhrsw %ymm9,%ymm2,%ymm2
+ .byte 196,226,125,29,210 // vpabsw %ymm2,%ymm2
+ .byte 196,66,77,11,208 // vpmulhrsw %ymm8,%ymm6,%ymm10
+ .byte 196,66,125,29,210 // vpabsw %ymm10,%ymm10
+ .byte 197,173,253,210 // vpaddw %ymm2,%ymm10,%ymm2
+ .byte 196,194,101,11,217 // vpmulhrsw %ymm9,%ymm3,%ymm3
+ .byte 196,226,125,29,219 // vpabsw %ymm3,%ymm3
+ .byte 196,66,69,11,192 // vpmulhrsw %ymm8,%ymm7,%ymm8
+ .byte 196,66,125,29,192 // vpabsw %ymm8,%ymm8
+ .byte 197,189,253,219 // vpaddw %ymm3,%ymm8,%ymm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+BALIGN4
+ .byte 0,0 // add %al,(%rax)
+ .byte 128,67,0,0 // addb $0x0,0x0(%rbx)
+ .byte 128,67,0,0 // addb $0x0,0x0(%rbx)
+ .byte 128,67,0,0 // addb $0x0,0x0(%rbx)
+ .byte 128 // .byte 0x80
+ .byte 67 // rex.XB
+
+BALIGN32
+ .byte 0,1 // add %al,(%rcx)
+ .byte 4,5 // add $0x5,%al
+ .byte 8,9 // or %cl,(%rcx)
+ .byte 12,13 // or $0xd,%al
+ .byte 128,128,128,128,128,128,128 // addb $0x80,-0x7f7f7f80(%rax)
+ .byte 128,0,1 // addb $0x1,(%rax)
+ .byte 4,5 // add $0x5,%al
+ .byte 8,9 // or %cl,(%rcx)
+ .byte 12,13 // or $0xd,%al
+ .byte 128,128,128,128,128,128,128 // addb $0x80,-0x7f7f7f80(%rax)
+ .byte 128,129,128,0,0,0,0 // addb $0x0,0x80(%rcx)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 1,2 // add %eax,(%rdx)
+ .byte 5,6,9,10,13 // add $0xd0a0906,%eax
+ .byte 14 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,17 // callq *(%rcx)
+ .byte 18,21,22,25,26,29 // adc 0x1d1a1916(%rip),%dl # 1d1a2fed <_sk_xor__hsw_lowp+0x1d1a1a11>
+ .byte 30 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,2 // incl (%rdx)
+ .byte 3,6 // add (%rsi),%eax
+ .byte 7 // (bad)
+ .byte 10,11 // or (%rbx),%cl
+ .byte 14 // (bad)
+ .byte 15,255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,18 // callq *(%rdx)
+ .byte 19,22 // adc (%rsi),%edx
+ .byte 23 // (bad)
+ .byte 26,27 // sbb (%rbx),%bl
+ .byte 30 // (bad)
+ .byte 31 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,3 // incl (%rbx)
+ .byte 255,7 // incl (%rdi)
+ .byte 255,11 // decl (%rbx)
+ .byte 255,15 // decl (%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,19 // callq *(%rbx)
+ .byte 255,23 // callq *(%rdi)
+ .byte 255,27 // lcall *(%rbx)
+ .byte 255,31 // lcall *(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,0 // incl (%rax)
+ .byte 129,128,129,128,0,128,0,0,0,0 // addl $0x0,-0x7fff7f7f(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,1 // add %al,(%rcx)
+ .byte 4,5 // add $0x5,%al
+ .byte 8,9 // or %cl,(%rcx)
+ .byte 12,13 // or $0xd,%al
+ .byte 128,128,128,128,128,128,128 // addb $0x80,-0x7f7f7f80(%rax)
+ .byte 128,0,1 // addb $0x1,(%rax)
+ .byte 4,5 // add $0x5,%al
+ .byte 8,9 // or %cl,(%rcx)
+ .byte 12,13 // or $0xd,%al
+ .byte 128,128,128,128,128,128,128 // addb $0x80,-0x7f7f7f80(%rax)
+ .byte 128,255,0 // cmp $0x0,%bh
+ .byte 129,128,0,0,0,0,0,0,0,0 // addl $0x0,0x0(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 0,0 // add %al,(%rax)
+ .byte 1,2 // add %eax,(%rdx)
+ .byte 5,6,9,10,13 // add $0xd0a0906,%eax
+ .byte 14 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,17 // callq *(%rcx)
+ .byte 18,21,22,25,26,29 // adc 0x1d1a1916(%rip),%dl # 1d1a30ad <_sk_xor__hsw_lowp+0x1d1a1ad1>
+ .byte 30 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,2 // incl (%rdx)
+ .byte 3,6 // add (%rsi),%eax
+ .byte 7 // (bad)
+ .byte 10,11 // or (%rbx),%cl
+ .byte 14 // (bad)
+ .byte 15,255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,18 // callq *(%rdx)
+ .byte 19,22 // adc (%rsi),%edx
+ .byte 23 // (bad)
+ .byte 26,27 // sbb (%rbx),%bl
+ .byte 30 // (bad)
+ .byte 31 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,3 // incl (%rbx)
+ .byte 255,7 // incl (%rdi)
+ .byte 255,11 // decl (%rbx)
+ .byte 255,15 // decl (%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,19 // callq *(%rbx)
+ .byte 255,23 // callq *(%rdi)
+ .byte 255,27 // lcall *(%rbx)
+ .byte 255,31 // lcall *(%rdi)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,0 // incl (%rax)
+ .byte 128,129,128,0,128,129,128 // addb $0x80,-0x7e7fff80(%rcx)
+ .byte 0,128,0,128,0,128 // add %al,-0x7fff8000(%rax)
+ .byte 0,128,0,128,0,128 // add %al,-0x7fff8000(%rax)
+ .byte 0,128,0,128,0,128 // add %al,-0x7fff8000(%rax)
+ .byte 0 // .byte 0x0
+ .byte 128 // .byte 0x80
+BALIGN32
+
HIDDEN _sk_start_pipeline_ssse3_lowp
.globl _sk_start_pipeline_ssse3_lowp
FUNCTION(_sk_start_pipeline_ssse3_lowp)
@@ -37713,13 +39440,13 @@ _sk_load_a8_ssse3_lowp:
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 76,139,24 // mov (%rax),%r11
.byte 77,133,192 // test %r8,%r8
- .byte 117,36 // jne 47a <_sk_load_a8_ssse3_lowp+0x2e>
+ .byte 117,37 // jne 47b <_sk_load_a8_ssse3_lowp+0x2f>
.byte 243,65,15,126,28,19 // movq (%r11,%rdx,1),%xmm3
.byte 102,15,96,216 // punpcklbw %xmm0,%xmm3
.byte 102,15,113,243,8 // psllw $0x8,%xmm3
.byte 102,15,228,29,35,15,0,0 // pmulhuw 0xf23(%rip),%xmm3 # 1390 <_sk_xor__ssse3_lowp+0x10b>
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 15,87,192 // xorps %xmm0,%xmm0
+ .byte 102,15,87,192 // xorpd %xmm0,%xmm0
.byte 15,87,201 // xorps %xmm1,%xmm1
.byte 15,87,210 // xorps %xmm2,%xmm2
.byte 255,224 // jmpq *%rax
@@ -37728,15 +39455,15 @@ _sk_load_a8_ssse3_lowp:
.byte 102,15,239,219 // pxor %xmm3,%xmm3
.byte 65,254,201 // dec %r9b
.byte 65,128,249,6 // cmp $0x6,%r9b
- .byte 119,210 // ja 460 <_sk_load_a8_ssse3_lowp+0x14>
+ .byte 119,209 // ja 460 <_sk_load_a8_ssse3_lowp+0x14>
.byte 69,15,182,201 // movzbl %r9b,%r9d
- .byte 76,141,21,111,0,0,0 // lea 0x6f(%rip),%r10 # 508 <_sk_load_a8_ssse3_lowp+0xbc>
+ .byte 76,141,21,110,0,0,0 // lea 0x6e(%rip),%r10 # 508 <_sk_load_a8_ssse3_lowp+0xbc>
.byte 75,99,4,138 // movslq (%r10,%r9,4),%rax
.byte 76,1,208 // add %r10,%rax
.byte 255,224 // jmpq *%rax
.byte 65,15,182,4,19 // movzbl (%r11,%rdx,1),%eax
.byte 102,15,110,216 // movd %eax,%xmm3
- .byte 235,179 // jmp 460 <_sk_load_a8_ssse3_lowp+0x14>
+ .byte 235,178 // jmp 460 <_sk_load_a8_ssse3_lowp+0x14>
.byte 65,15,182,68,19,2 // movzbl 0x2(%r11,%rdx,1),%eax
.byte 102,15,239,219 // pxor %xmm3,%xmm3
.byte 102,15,196,216,2 // pinsrw $0x2,%eax,%xmm3
@@ -37744,7 +39471,7 @@ _sk_load_a8_ssse3_lowp:
.byte 102,15,110,192 // movd %eax,%xmm0
.byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
.byte 243,15,16,216 // movss %xmm0,%xmm3
- .byte 235,145 // jmp 460 <_sk_load_a8_ssse3_lowp+0x14>
+ .byte 235,144 // jmp 460 <_sk_load_a8_ssse3_lowp+0x14>
.byte 65,15,182,68,19,6 // movzbl 0x6(%r11,%rdx,1),%eax
.byte 102,15,239,219 // pxor %xmm3,%xmm3
.byte 102,15,196,216,6 // pinsrw $0x6,%eax,%xmm3
@@ -37755,24 +39482,24 @@ _sk_load_a8_ssse3_lowp:
.byte 102,65,15,110,4,19 // movd (%r11,%rdx,1),%xmm0
.byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
.byte 242,15,16,216 // movsd %xmm0,%xmm3
- .byte 233,89,255,255,255 // jmpq 460 <_sk_load_a8_ssse3_lowp+0x14>
- .byte 144 // nop
- .byte 154 // (bad)
+ .byte 233,88,255,255,255 // jmpq 460 <_sk_load_a8_ssse3_lowp+0x14>
+ .byte 155 // fwait
.byte 255 // (bad)
.byte 255 // (bad)
- .byte 255,180,255,255,255,165,255 // pushq -0x5a0001(%rdi,%rdi,8)
+ .byte 255,181,255,255,255,166 // pushq -0x59000001(%rbp)
.byte 255 // (bad)
.byte 255 // (bad)
- .byte 236 // in (%dx),%al
.byte 255 // (bad)
+ .byte 237 // in (%dx),%eax
.byte 255 // (bad)
- .byte 255,225 // jmpq *%rcx
.byte 255 // (bad)
+ .byte 255,226 // jmpq *%rdx
.byte 255 // (bad)
- .byte 255,214 // callq *%rsi
.byte 255 // (bad)
+ .byte 255,215 // callq *%rdi
.byte 255 // (bad)
- .byte 255,199 // inc %edi
+ .byte 255 // (bad)
+ .byte 255,200 // dec %eax
.byte 255 // (bad)
.byte 255 // (bad)
.byte 255 // .byte 0xff
@@ -37802,27 +39529,27 @@ _sk_store_a8_ssse3_lowp:
.byte 75,99,4,138 // movslq (%r10,%r9,4),%rax
.byte 76,1,208 // add %r10,%rax
.byte 255,224 // jmpq *%rax
- .byte 102,68,15,127,68,36,232 // movdqa %xmm8,-0x18(%rsp)
- .byte 138,68,36,232 // mov -0x18(%rsp),%al
+ .byte 102,68,15,127,68,36,168 // movdqa %xmm8,-0x58(%rsp)
+ .byte 138,68,36,168 // mov -0x58(%rsp),%al
.byte 65,136,4,19 // mov %al,(%r11,%rdx,1)
.byte 235,194 // jmp 544 <_sk_store_a8_ssse3_lowp+0x20>
- .byte 102,68,15,127,68,36,216 // movdqa %xmm8,-0x28(%rsp)
- .byte 138,68,36,220 // mov -0x24(%rsp),%al
+ .byte 102,68,15,127,68,36,184 // movdqa %xmm8,-0x48(%rsp)
+ .byte 138,68,36,188 // mov -0x44(%rsp),%al
.byte 65,136,68,19,2 // mov %al,0x2(%r11,%rdx,1)
- .byte 102,68,15,56,0,5,4,14,0,0 // pshufb 0xe04(%rip),%xmm8 # 13a0 <_sk_xor__ssse3_lowp+0x11b>
+ .byte 102,68,15,56,0,5,20,14,0,0 // pshufb 0xe14(%rip),%xmm8 # 13b0 <_sk_xor__ssse3_lowp+0x12b>
.byte 102,68,15,126,192 // movd %xmm8,%eax
.byte 102,65,137,4,19 // mov %ax,(%r11,%rdx,1)
.byte 235,156 // jmp 544 <_sk_store_a8_ssse3_lowp+0x20>
- .byte 102,68,15,127,68,36,200 // movdqa %xmm8,-0x38(%rsp)
- .byte 138,68,36,212 // mov -0x2c(%rsp),%al
+ .byte 102,68,15,127,68,36,232 // movdqa %xmm8,-0x18(%rsp)
+ .byte 138,68,36,244 // mov -0xc(%rsp),%al
.byte 65,136,68,19,6 // mov %al,0x6(%r11,%rdx,1)
- .byte 102,68,15,127,68,36,184 // movdqa %xmm8,-0x48(%rsp)
- .byte 138,68,36,194 // mov -0x3e(%rsp),%al
+ .byte 102,68,15,127,68,36,216 // movdqa %xmm8,-0x28(%rsp)
+ .byte 138,68,36,226 // mov -0x1e(%rsp),%al
.byte 65,136,68,19,5 // mov %al,0x5(%r11,%rdx,1)
- .byte 102,68,15,127,68,36,168 // movdqa %xmm8,-0x58(%rsp)
- .byte 138,68,36,176 // mov -0x50(%rsp),%al
+ .byte 102,68,15,127,68,36,200 // movdqa %xmm8,-0x38(%rsp)
+ .byte 138,68,36,208 // mov -0x30(%rsp),%al
.byte 65,136,68,19,4 // mov %al,0x4(%r11,%rdx,1)
- .byte 102,68,15,56,0,5,206,13,0,0 // pshufb 0xdce(%rip),%xmm8 # 13b0 <_sk_xor__ssse3_lowp+0x12b>
+ .byte 102,68,15,56,0,5,190,13,0,0 // pshufb 0xdbe(%rip),%xmm8 # 13a0 <_sk_xor__ssse3_lowp+0x11b>
.byte 102,69,15,126,4,19 // movd %xmm8,(%r11,%rdx,1)
.byte 233,87,255,255,255 // jmpq 544 <_sk_store_a8_ssse3_lowp+0x20>
.byte 15,31,0 // nopl (%rax)
@@ -38746,7 +40473,7 @@ BALIGN16
.byte 255,0 // incl (%rax)
.byte 255,0 // incl (%rax)
.byte 129,128,129,128,129,128,129,128,129,128// addl $0x80818081,-0x7f7e7f7f(%rax)
- .byte 129,128,129,128,129,128,0,2,0,0 // addl $0x200,-0x7f7e7f7f(%rax)
+ .byte 129,128,129,128,129,128,0,2,4,6 // addl $0x6040200,-0x7f7e7f7f(%rax)
.byte 0,0 // add %al,(%rax)
.byte 0,0 // add %al,(%rax)
.byte 0,0 // add %al,(%rax)
@@ -38754,7 +40481,7 @@ BALIGN16
.byte 0,0 // add %al,(%rax)
.byte 0,0 // add %al,(%rax)
.byte 0,2 // add %al,(%rdx)
- .byte 4,6 // add $0x6,%al
+ .byte 0,0 // add %al,(%rax)
.byte 0,0 // add %al,(%rax)
.byte 0,0 // add %al,(%rax)
.byte 0,0 // add %al,(%rax)
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index d64d125590..51b23da617 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -26683,6 +26683,1695 @@ ALIGN 4
DB 0,63 ; add %bh,(%rdi)
ALIGN 32
+PUBLIC _sk_start_pipeline_hsw_lowp
+_sk_start_pipeline_hsw_lowp LABEL PROC
+ DB 85 ; push %rbp
+ DB 72,137,229 ; mov %rsp,%rbp
+ DB 65,87 ; push %r15
+ DB 65,86 ; push %r14
+ DB 65,85 ; push %r13
+ DB 65,84 ; push %r12
+ DB 86 ; push %rsi
+ DB 87 ; push %rdi
+ DB 83 ; push %rbx
+ DB 72,129,236,184,0,0,0 ; sub $0xb8,%rsp
+ DB 197,120,41,125,176 ; vmovaps %xmm15,-0x50(%rbp)
+ DB 197,120,41,117,160 ; vmovaps %xmm14,-0x60(%rbp)
+ DB 197,120,41,109,144 ; vmovaps %xmm13,-0x70(%rbp)
+ DB 197,120,41,101,128 ; vmovaps %xmm12,-0x80(%rbp)
+ DB 197,120,41,157,112,255,255,255 ; vmovaps %xmm11,-0x90(%rbp)
+ DB 197,120,41,149,96,255,255,255 ; vmovaps %xmm10,-0xa0(%rbp)
+ DB 197,120,41,141,80,255,255,255 ; vmovaps %xmm9,-0xb0(%rbp)
+ DB 197,120,41,133,64,255,255,255 ; vmovaps %xmm8,-0xc0(%rbp)
+ DB 197,248,41,189,48,255,255,255 ; vmovaps %xmm7,-0xd0(%rbp)
+ DB 197,248,41,181,32,255,255,255 ; vmovaps %xmm6,-0xe0(%rbp)
+ DB 76,137,195 ; mov %r8,%rbx
+ DB 73,137,210 ; mov %rdx,%r10
+ DB 73,137,207 ; mov %rcx,%r15
+ DB 76,139,117,48 ; mov 0x30(%rbp),%r14
+ DB 76,137,206 ; mov %r9,%rsi
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 73,137,197 ; mov %rax,%r13
+ DB 73,137,244 ; mov %rsi,%r12
+ DB 73,141,79,16 ; lea 0x10(%r15),%rcx
+ DB 72,57,217 ; cmp %rbx,%rcx
+ DB 118,5 ; jbe 80 <_sk_start_pipeline_hsw_lowp+0x80>
+ DB 76,137,250 ; mov %r15,%rdx
+ DB 235,89 ; jmp d9 <_sk_start_pipeline_hsw_lowp+0xd9>
+ DB 72,137,157,24,255,255,255 ; mov %rbx,-0xe8(%rbp)
+ DB 65,184,0,0,0,0 ; mov $0x0,%r8d
+ DB 197,252,87,192 ; vxorps %ymm0,%ymm0,%ymm0
+ DB 197,244,87,201 ; vxorps %ymm1,%ymm1,%ymm1
+ DB 197,236,87,210 ; vxorps %ymm2,%ymm2,%ymm2
+ DB 197,228,87,219 ; vxorps %ymm3,%ymm3,%ymm3
+ DB 197,220,87,228 ; vxorps %ymm4,%ymm4,%ymm4
+ DB 197,212,87,237 ; vxorps %ymm5,%ymm5,%ymm5
+ DB 197,204,87,246 ; vxorps %ymm6,%ymm6,%ymm6
+ DB 197,196,87,255 ; vxorps %ymm7,%ymm7,%ymm7
+ DB 76,137,247 ; mov %r14,%rdi
+ DB 76,137,230 ; mov %r12,%rsi
+ DB 76,137,250 ; mov %r15,%rdx
+ DB 76,137,209 ; mov %r10,%rcx
+ DB 76,137,211 ; mov %r10,%rbx
+ DB 65,255,213 ; callq *%r13
+ DB 73,137,218 ; mov %rbx,%r10
+ DB 72,139,157,24,255,255,255 ; mov -0xe8(%rbp),%rbx
+ DB 73,141,87,16 ; lea 0x10(%r15),%rdx
+ DB 73,131,199,32 ; add $0x20,%r15
+ DB 73,57,223 ; cmp %rbx,%r15
+ DB 73,137,215 ; mov %rdx,%r15
+ DB 118,174 ; jbe 87 <_sk_start_pipeline_hsw_lowp+0x87>
+ DB 73,137,216 ; mov %rbx,%r8
+ DB 73,41,208 ; sub %rdx,%r8
+ DB 116,44 ; je 10d <_sk_start_pipeline_hsw_lowp+0x10d>
+ DB 197,252,87,192 ; vxorps %ymm0,%ymm0,%ymm0
+ DB 197,244,87,201 ; vxorps %ymm1,%ymm1,%ymm1
+ DB 197,236,87,210 ; vxorps %ymm2,%ymm2,%ymm2
+ DB 197,228,87,219 ; vxorps %ymm3,%ymm3,%ymm3
+ DB 197,220,87,228 ; vxorps %ymm4,%ymm4,%ymm4
+ DB 197,212,87,237 ; vxorps %ymm5,%ymm5,%ymm5
+ DB 197,204,87,246 ; vxorps %ymm6,%ymm6,%ymm6
+ DB 197,196,87,255 ; vxorps %ymm7,%ymm7,%ymm7
+ DB 76,137,247 ; mov %r14,%rdi
+ DB 76,137,230 ; mov %r12,%rsi
+ DB 76,137,209 ; mov %r10,%rcx
+ DB 65,255,213 ; callq *%r13
+ DB 72,137,216 ; mov %rbx,%rax
+ DB 197,248,40,181,32,255,255,255 ; vmovaps -0xe0(%rbp),%xmm6
+ DB 197,248,40,189,48,255,255,255 ; vmovaps -0xd0(%rbp),%xmm7
+ DB 197,120,40,133,64,255,255,255 ; vmovaps -0xc0(%rbp),%xmm8
+ DB 197,120,40,141,80,255,255,255 ; vmovaps -0xb0(%rbp),%xmm9
+ DB 197,120,40,149,96,255,255,255 ; vmovaps -0xa0(%rbp),%xmm10
+ DB 197,120,40,157,112,255,255,255 ; vmovaps -0x90(%rbp),%xmm11
+ DB 197,120,40,101,128 ; vmovaps -0x80(%rbp),%xmm12
+ DB 197,120,40,109,144 ; vmovaps -0x70(%rbp),%xmm13
+ DB 197,120,40,117,160 ; vmovaps -0x60(%rbp),%xmm14
+ DB 197,120,40,125,176 ; vmovaps -0x50(%rbp),%xmm15
+ DB 72,129,196,184,0,0,0 ; add $0xb8,%rsp
+ DB 91 ; pop %rbx
+ DB 95 ; pop %rdi
+ DB 94 ; pop %rsi
+ DB 65,92 ; pop %r12
+ DB 65,93 ; pop %r13
+ DB 65,94 ; pop %r14
+ DB 65,95 ; pop %r15
+ DB 93 ; pop %rbp
+ DB 197,248,119 ; vzeroupper
+ DB 195 ; retq
+
+PUBLIC _sk_just_return_hsw_lowp
+_sk_just_return_hsw_lowp LABEL PROC
+ DB 195 ; retq
+
+PUBLIC _sk_constant_color_hsw_lowp
+_sk_constant_color_hsw_lowp LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,226,121,24,5,129,21,0,0 ; vbroadcastss 0x1581(%rip),%xmm0 # 16f8 <_sk_xor__hsw_lowp+0x78>
+ DB 197,248,88,24 ; vaddps (%rax),%xmm0,%xmm3
+ DB 196,226,125,121,195 ; vpbroadcastw %xmm3,%ymm0
+ DB 197,251,112,203,234 ; vpshuflw $0xea,%xmm3,%xmm1
+ DB 196,226,125,88,201 ; vpbroadcastd %xmm1,%ymm1
+ DB 196,227,121,4,211,230 ; vpermilps $0xe6,%xmm3,%xmm2
+ DB 197,251,112,210,224 ; vpshuflw $0xe0,%xmm2,%xmm2
+ DB 196,226,125,88,210 ; vpbroadcastd %xmm2,%ymm2
+ DB 196,227,121,4,219,236 ; vpermilps $0xec,%xmm3,%xmm3
+ DB 197,251,112,219,234 ; vpshuflw $0xea,%xmm3,%xmm3
+ DB 196,226,125,88,219 ; vpbroadcastd %xmm3,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_set_rgb_hsw_lowp
+_sk_set_rgb_hsw_lowp LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,250,16,21,68,21,0,0 ; vmovss 0x1544(%rip),%xmm2 # 16fc <_sk_xor__hsw_lowp+0x7c>
+ DB 197,234,88,0 ; vaddss (%rax),%xmm2,%xmm0
+ DB 196,193,121,126,193 ; vmovd %xmm0,%r9d
+ DB 196,193,121,110,193 ; vmovd %r9d,%xmm0
+ DB 196,226,125,121,192 ; vpbroadcastw %xmm0,%ymm0
+ DB 197,234,88,72,4 ; vaddss 0x4(%rax),%xmm2,%xmm1
+ DB 196,193,121,126,201 ; vmovd %xmm1,%r9d
+ DB 196,193,121,110,201 ; vmovd %r9d,%xmm1
+ DB 196,226,125,121,201 ; vpbroadcastw %xmm1,%ymm1
+ DB 197,234,88,80,8 ; vaddss 0x8(%rax),%xmm2,%xmm2
+ DB 197,249,126,208 ; vmovd %xmm2,%eax
+ DB 197,249,110,208 ; vmovd %eax,%xmm2
+ DB 196,226,125,121,210 ; vpbroadcastw %xmm2,%ymm2
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_premul_hsw_lowp
+_sk_premul_hsw_lowp LABEL PROC
+ DB 196,226,125,11,195 ; vpmulhrsw %ymm3,%ymm0,%ymm0
+ DB 196,226,125,29,192 ; vpabsw %ymm0,%ymm0
+ DB 196,226,117,11,203 ; vpmulhrsw %ymm3,%ymm1,%ymm1
+ DB 196,226,125,29,201 ; vpabsw %ymm1,%ymm1
+ DB 196,226,109,11,211 ; vpmulhrsw %ymm3,%ymm2,%ymm2
+ DB 196,226,125,29,210 ; vpabsw %ymm2,%ymm2
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_load_8888_hsw_lowp
+_sk_load_8888_hsw_lowp LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 76,139,24 ; mov (%rax),%r11
+ DB 77,133,192 ; test %r8,%r8
+ DB 15,133,210,0,0,0 ; jne 2f7 <_sk_load_8888_hsw_lowp+0xe0>
+ DB 196,193,126,111,92,147,32 ; vmovdqu 0x20(%r11,%rdx,4),%ymm3
+ DB 196,65,126,111,4,147 ; vmovdqu (%r11,%rdx,4),%ymm8
+ DB 197,253,111,5,230,20,0,0 ; vmovdqa 0x14e6(%rip),%ymm0 # 1720 <_sk_xor__hsw_lowp+0xa0>
+ DB 196,226,61,0,200 ; vpshufb %ymm0,%ymm8,%ymm1
+ DB 196,227,253,0,201,232 ; vpermq $0xe8,%ymm1,%ymm1
+ DB 196,226,101,0,192 ; vpshufb %ymm0,%ymm3,%ymm0
+ DB 196,227,253,0,192,232 ; vpermq $0xe8,%ymm0,%ymm0
+ DB 196,227,117,56,192,1 ; vinserti128 $0x1,%xmm0,%ymm1,%ymm0
+ DB 197,253,113,240,8 ; vpsllw $0x8,%ymm0,%ymm0
+ DB 196,98,125,121,13,220,20,0,0 ; vpbroadcastw 0x14dc(%rip),%ymm9 # 1740 <_sk_xor__hsw_lowp+0xc0>
+ DB 196,193,125,228,193 ; vpmulhuw %ymm9,%ymm0,%ymm0
+ DB 197,253,111,13,239,20,0,0 ; vmovdqa 0x14ef(%rip),%ymm1 # 1760 <_sk_xor__hsw_lowp+0xe0>
+ DB 196,226,61,0,209 ; vpshufb %ymm1,%ymm8,%ymm2
+ DB 196,227,253,0,210,232 ; vpermq $0xe8,%ymm2,%ymm2
+ DB 196,226,101,0,201 ; vpshufb %ymm1,%ymm3,%ymm1
+ DB 196,227,253,0,201,232 ; vpermq $0xe8,%ymm1,%ymm1
+ DB 196,227,109,56,201,1 ; vinserti128 $0x1,%xmm1,%ymm2,%ymm1
+ DB 197,245,113,241,8 ; vpsllw $0x8,%ymm1,%ymm1
+ DB 196,193,117,228,201 ; vpmulhuw %ymm9,%ymm1,%ymm1
+ DB 197,253,111,21,225,20,0,0 ; vmovdqa 0x14e1(%rip),%ymm2 # 1780 <_sk_xor__hsw_lowp+0x100>
+ DB 196,98,61,0,210 ; vpshufb %ymm2,%ymm8,%ymm10
+ DB 196,67,253,0,210,232 ; vpermq $0xe8,%ymm10,%ymm10
+ DB 196,226,101,0,210 ; vpshufb %ymm2,%ymm3,%ymm2
+ DB 196,227,253,0,210,232 ; vpermq $0xe8,%ymm2,%ymm2
+ DB 196,227,45,56,210,1 ; vinserti128 $0x1,%xmm2,%ymm10,%ymm2
+ DB 197,237,113,242,8 ; vpsllw $0x8,%ymm2,%ymm2
+ DB 196,193,109,228,209 ; vpmulhuw %ymm9,%ymm2,%ymm2
+ DB 197,125,111,21,211,20,0,0 ; vmovdqa 0x14d3(%rip),%ymm10 # 17a0 <_sk_xor__hsw_lowp+0x120>
+ DB 196,66,61,0,194 ; vpshufb %ymm10,%ymm8,%ymm8
+ DB 196,67,253,0,192,232 ; vpermq $0xe8,%ymm8,%ymm8
+ DB 196,194,101,0,218 ; vpshufb %ymm10,%ymm3,%ymm3
+ DB 196,227,253,0,219,232 ; vpermq $0xe8,%ymm3,%ymm3
+ DB 196,227,61,56,219,1 ; vinserti128 $0x1,%xmm3,%ymm8,%ymm3
+ DB 197,229,113,243,8 ; vpsllw $0x8,%ymm3,%ymm3
+ DB 196,193,101,228,217 ; vpmulhuw %ymm9,%ymm3,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+ DB 69,137,193 ; mov %r8d,%r9d
+ DB 65,128,225,15 ; and $0xf,%r9b
+ DB 197,229,239,219 ; vpxor %ymm3,%ymm3,%ymm3
+ DB 196,65,61,239,192 ; vpxor %ymm8,%ymm8,%ymm8
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 15,135,30,255,255,255 ; ja 232 <_sk_load_8888_hsw_lowp+0x1b>
+ DB 69,15,182,201 ; movzbl %r9b,%r9d
+ DB 76,141,21,29,1,0,0 ; lea 0x11d(%rip),%r10 # 43c <_sk_load_8888_hsw_lowp+0x225>
+ DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax
+ DB 76,1,208 ; add %r10,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 196,65,121,110,4,147 ; vmovd (%r11,%rdx,4),%xmm8
+ DB 233,255,254,255,255 ; jmpq 232 <_sk_load_8888_hsw_lowp+0x1b>
+ DB 196,193,121,110,68,147,8 ; vmovd 0x8(%r11,%rdx,4),%xmm0
+ DB 196,226,121,89,192 ; vpbroadcastq %xmm0,%xmm0
+ DB 197,229,239,219 ; vpxor %ymm3,%ymm3,%ymm3
+ DB 196,99,101,2,192,4 ; vpblendd $0x4,%ymm0,%ymm3,%ymm8
+ DB 196,194,121,53,4,147 ; vpmovzxdq (%r11,%rdx,4),%xmm0
+ DB 197,249,112,192,232 ; vpshufd $0xe8,%xmm0,%xmm0
+ DB 196,99,61,2,192,3 ; vpblendd $0x3,%ymm0,%ymm8,%ymm8
+ DB 233,211,254,255,255 ; jmpq 232 <_sk_load_8888_hsw_lowp+0x1b>
+ DB 196,193,121,110,68,147,24 ; vmovd 0x18(%r11,%rdx,4),%xmm0
+ DB 196,226,125,89,192 ; vpbroadcastq %xmm0,%ymm0
+ DB 197,229,239,219 ; vpxor %ymm3,%ymm3,%ymm3
+ DB 196,99,101,2,192,64 ; vpblendd $0x40,%ymm0,%ymm3,%ymm8
+ DB 196,99,125,57,192,1 ; vextracti128 $0x1,%ymm8,%xmm0
+ DB 196,195,121,34,68,147,20,1 ; vpinsrd $0x1,0x14(%r11,%rdx,4),%xmm0,%xmm0
+ DB 196,99,61,56,192,1 ; vinserti128 $0x1,%xmm0,%ymm8,%ymm8
+ DB 196,99,125,57,192,1 ; vextracti128 $0x1,%ymm8,%xmm0
+ DB 196,195,121,34,68,147,16,0 ; vpinsrd $0x0,0x10(%r11,%rdx,4),%xmm0,%xmm0
+ DB 196,99,61,56,192,1 ; vinserti128 $0x1,%xmm0,%ymm8,%ymm8
+ DB 196,193,122,111,4,147 ; vmovdqu (%r11,%rdx,4),%xmm0
+ DB 196,67,125,2,192,240 ; vpblendd $0xf0,%ymm8,%ymm0,%ymm8
+ DB 233,132,254,255,255 ; jmpq 232 <_sk_load_8888_hsw_lowp+0x1b>
+ DB 196,193,121,110,68,147,40 ; vmovd 0x28(%r11,%rdx,4),%xmm0
+ DB 196,226,121,89,192 ; vpbroadcastq %xmm0,%xmm0
+ DB 197,245,239,201 ; vpxor %ymm1,%ymm1,%ymm1
+ DB 196,227,117,2,216,4 ; vpblendd $0x4,%ymm0,%ymm1,%ymm3
+ DB 196,195,97,34,68,147,36,1 ; vpinsrd $0x1,0x24(%r11,%rdx,4),%xmm3,%xmm0
+ DB 196,227,101,2,216,15 ; vpblendd $0xf,%ymm0,%ymm3,%ymm3
+ DB 196,193,121,110,68,147,32 ; vmovd 0x20(%r11,%rdx,4),%xmm0
+ DB 196,227,101,2,216,1 ; vpblendd $0x1,%ymm0,%ymm3,%ymm3
+ DB 233,72,254,255,255 ; jmpq 22c <_sk_load_8888_hsw_lowp+0x15>
+ DB 196,193,121,110,68,147,56 ; vmovd 0x38(%r11,%rdx,4),%xmm0
+ DB 196,226,125,89,192 ; vpbroadcastq %xmm0,%ymm0
+ DB 197,245,239,201 ; vpxor %ymm1,%ymm1,%ymm1
+ DB 196,227,117,2,216,64 ; vpblendd $0x40,%ymm0,%ymm1,%ymm3
+ DB 196,227,125,57,216,1 ; vextracti128 $0x1,%ymm3,%xmm0
+ DB 196,195,121,34,68,147,52,1 ; vpinsrd $0x1,0x34(%r11,%rdx,4),%xmm0,%xmm0
+ DB 196,227,101,56,216,1 ; vinserti128 $0x1,%xmm0,%ymm3,%ymm3
+ DB 196,227,125,57,216,1 ; vextracti128 $0x1,%ymm3,%xmm0
+ DB 196,195,121,34,68,147,48,0 ; vpinsrd $0x0,0x30(%r11,%rdx,4),%xmm0,%xmm0
+ DB 196,227,101,56,216,1 ; vinserti128 $0x1,%xmm0,%ymm3,%ymm3
+ DB 196,65,126,111,4,147 ; vmovdqu (%r11,%rdx,4),%ymm8
+ DB 196,193,122,111,68,147,32 ; vmovdqu 0x20(%r11,%rdx,4),%xmm0
+ DB 196,227,125,2,219,240 ; vpblendd $0xf0,%ymm3,%ymm0,%ymm3
+ DB 233,248,253,255,255 ; jmpq 232 <_sk_load_8888_hsw_lowp+0x1b>
+ DB 102,144 ; xchg %ax,%ax
+ DB 236 ; in (%dx),%al
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,13,255,255,255,247 ; decl -0x8000001(%rip) # fffffffff8000444 <_sk_xor__hsw_lowp+0xfffffffff7ffedc4>
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,97,255 ; jmpq *-0x1(%rcx)
+ DB 255 ; (bad)
+ DB 255,77,255 ; decl -0x1(%rbp)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 57,255 ; cmp %edi,%edi
+ DB 255 ; (bad)
+ DB 255,35 ; jmpq *(%rbx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,240 ; push %rax
+ DB 253 ; std
+ DB 255 ; (bad)
+ DB 255,150,255,255,255,136 ; callq *-0x77000001(%rsi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,114,255 ; pushq -0x1(%rdx)
+ DB 255 ; (bad)
+ DB 255,230 ; jmpq *%rsi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,210 ; callq *%rdx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 190,255,255,255,168 ; mov $0xa8ffffff,%esi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
+
+PUBLIC _sk_store_8888_hsw_lowp
+_sk_store_8888_hsw_lowp LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 76,139,24 ; mov (%rax),%r11
+ DB 197,189,113,208,7 ; vpsrlw $0x7,%ymm0,%ymm8
+ DB 196,98,125,121,13,53,19,0,0 ; vpbroadcastw 0x1335(%rip),%ymm9 # 17c0 <_sk_xor__hsw_lowp+0x140>
+ DB 196,65,61,234,193 ; vpminsw %ymm9,%ymm8,%ymm8
+ DB 196,66,125,51,208 ; vpmovzxwd %xmm8,%ymm10
+ DB 196,67,125,57,192,1 ; vextracti128 $0x1,%ymm8,%xmm8
+ DB 196,66,125,51,192 ; vpmovzxwd %xmm8,%ymm8
+ DB 197,165,113,209,7 ; vpsrlw $0x7,%ymm1,%ymm11
+ DB 196,65,37,234,217 ; vpminsw %ymm9,%ymm11,%ymm11
+ DB 196,67,125,57,220,1 ; vextracti128 $0x1,%ymm11,%xmm12
+ DB 196,66,125,51,228 ; vpmovzxwd %xmm12,%ymm12
+ DB 196,66,125,51,219 ; vpmovzxwd %xmm11,%ymm11
+ DB 196,193,37,114,243,8 ; vpslld $0x8,%ymm11,%ymm11
+ DB 196,193,29,114,244,8 ; vpslld $0x8,%ymm12,%ymm12
+ DB 196,65,29,235,192 ; vpor %ymm8,%ymm12,%ymm8
+ DB 196,65,37,235,210 ; vpor %ymm10,%ymm11,%ymm10
+ DB 197,165,113,210,7 ; vpsrlw $0x7,%ymm2,%ymm11
+ DB 196,65,37,234,217 ; vpminsw %ymm9,%ymm11,%ymm11
+ DB 196,66,125,51,227 ; vpmovzxwd %xmm11,%ymm12
+ DB 196,67,125,57,219,1 ; vextracti128 $0x1,%ymm11,%xmm11
+ DB 196,66,125,51,219 ; vpmovzxwd %xmm11,%ymm11
+ DB 196,193,37,114,243,16 ; vpslld $0x10,%ymm11,%ymm11
+ DB 196,193,29,114,244,16 ; vpslld $0x10,%ymm12,%ymm12
+ DB 197,149,113,211,7 ; vpsrlw $0x7,%ymm3,%ymm13
+ DB 196,65,21,234,201 ; vpminsw %ymm9,%ymm13,%ymm9
+ DB 196,67,125,57,205,1 ; vextracti128 $0x1,%ymm9,%xmm13
+ DB 196,66,125,51,237 ; vpmovzxwd %xmm13,%ymm13
+ DB 196,66,125,51,201 ; vpmovzxwd %xmm9,%ymm9
+ DB 196,193,13,114,241,24 ; vpslld $0x18,%ymm9,%ymm14
+ DB 196,193,53,114,245,24 ; vpslld $0x18,%ymm13,%ymm9
+ DB 196,65,37,235,201 ; vpor %ymm9,%ymm11,%ymm9
+ DB 196,65,61,235,201 ; vpor %ymm9,%ymm8,%ymm9
+ DB 196,65,29,235,198 ; vpor %ymm14,%ymm12,%ymm8
+ DB 196,65,45,235,192 ; vpor %ymm8,%ymm10,%ymm8
+ DB 77,133,192 ; test %r8,%r8
+ DB 117,17 ; jne 546 <_sk_store_8888_hsw_lowp+0xce>
+ DB 196,65,126,127,4,147 ; vmovdqu %ymm8,(%r11,%rdx,4)
+ DB 196,65,126,127,76,147,32 ; vmovdqu %ymm9,0x20(%r11,%rdx,4)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+ DB 69,137,193 ; mov %r8d,%r9d
+ DB 65,128,225,15 ; and $0xf,%r9b
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 119,236 ; ja 542 <_sk_store_8888_hsw_lowp+0xca>
+ DB 69,15,182,201 ; movzbl %r9b,%r9d
+ DB 76,141,21,175,0,0,0 ; lea 0xaf(%rip),%r10 # 610 <_sk_store_8888_hsw_lowp+0x198>
+ DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax
+ DB 76,1,208 ; add %r10,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 196,65,121,126,4,147 ; vmovd %xmm8,(%r11,%rdx,4)
+ DB 235,208 ; jmp 542 <_sk_store_8888_hsw_lowp+0xca>
+ DB 196,67,121,22,68,147,8,2 ; vpextrd $0x2,%xmm8,0x8(%r11,%rdx,4)
+ DB 196,65,121,214,4,147 ; vmovq %xmm8,(%r11,%rdx,4)
+ DB 235,192 ; jmp 542 <_sk_store_8888_hsw_lowp+0xca>
+ DB 196,67,125,57,193,1 ; vextracti128 $0x1,%ymm8,%xmm9
+ DB 196,67,121,22,76,147,24,2 ; vpextrd $0x2,%xmm9,0x18(%r11,%rdx,4)
+ DB 196,67,125,57,193,1 ; vextracti128 $0x1,%ymm8,%xmm9
+ DB 196,67,121,22,76,147,20,1 ; vpextrd $0x1,%xmm9,0x14(%r11,%rdx,4)
+ DB 196,67,125,57,193,1 ; vextracti128 $0x1,%ymm8,%xmm9
+ DB 196,65,121,126,76,147,16 ; vmovd %xmm9,0x10(%r11,%rdx,4)
+ DB 196,65,122,127,4,147 ; vmovdqu %xmm8,(%r11,%rdx,4)
+ DB 235,143 ; jmp 542 <_sk_store_8888_hsw_lowp+0xca>
+ DB 196,67,121,22,76,147,40,2 ; vpextrd $0x2,%xmm9,0x28(%r11,%rdx,4)
+ DB 196,67,121,22,76,147,36,1 ; vpextrd $0x1,%xmm9,0x24(%r11,%rdx,4)
+ DB 196,65,121,126,76,147,32 ; vmovd %xmm9,0x20(%r11,%rdx,4)
+ DB 196,65,126,127,4,147 ; vmovdqu %ymm8,(%r11,%rdx,4)
+ DB 233,109,255,255,255 ; jmpq 542 <_sk_store_8888_hsw_lowp+0xca>
+ DB 196,67,125,57,202,1 ; vextracti128 $0x1,%ymm9,%xmm10
+ DB 196,67,121,22,84,147,56,2 ; vpextrd $0x2,%xmm10,0x38(%r11,%rdx,4)
+ DB 196,67,125,57,202,1 ; vextracti128 $0x1,%ymm9,%xmm10
+ DB 196,67,121,22,84,147,52,1 ; vpextrd $0x1,%xmm10,0x34(%r11,%rdx,4)
+ DB 196,67,125,57,202,1 ; vextracti128 $0x1,%ymm9,%xmm10
+ DB 196,65,121,126,84,147,48 ; vmovd %xmm10,0x30(%r11,%rdx,4)
+ DB 196,65,126,127,4,147 ; vmovdqu %ymm8,(%r11,%rdx,4)
+ DB 196,65,122,127,76,147,32 ; vmovdqu %xmm9,0x20(%r11,%rdx,4)
+ DB 233,50,255,255,255 ; jmpq 542 <_sk_store_8888_hsw_lowp+0xca>
+ DB 90 ; pop %rdx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,106,255 ; ljmp *-0x1(%rdx)
+ DB 255 ; (bad)
+ DB 255,98,255 ; jmpq *-0x1(%rdx)
+ DB 255 ; (bad)
+ DB 255,155,255,255,255,142 ; lcall *-0x71000001(%rbx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,128,255,255,255,114 ; incl 0x72ffffff(%rax)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 186,255,255,255,179 ; mov $0xb3ffffff,%edx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,171,255,255,255,163 ; ljmp *-0x5c000001(%rbx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 238 ; out %al,(%dx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,225 ; jmpq *%rcx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,211 ; callq *%rbx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,197 ; inc %ebp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
+
+PUBLIC _sk_load_a8_hsw_lowp
+_sk_load_a8_hsw_lowp LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 76,139,24 ; mov (%rax),%r11
+ DB 77,133,192 ; test %r8,%r8
+ DB 117,45 ; jne 683 <_sk_load_a8_hsw_lowp+0x37>
+ DB 196,193,122,111,4,19 ; vmovdqu (%r11,%rdx,1),%xmm0
+ DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0
+ DB 197,253,113,240,8 ; vpsllw $0x8,%ymm0,%ymm0
+ DB 196,226,125,121,13,83,17,0,0 ; vpbroadcastw 0x1153(%rip),%ymm1 # 17c2 <_sk_xor__hsw_lowp+0x142>
+ DB 197,253,228,217 ; vpmulhuw %ymm1,%ymm0,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,253,239,192 ; vpxor %ymm0,%ymm0,%ymm0
+ DB 197,245,239,201 ; vpxor %ymm1,%ymm1,%ymm1
+ DB 197,236,87,210 ; vxorps %ymm2,%ymm2,%ymm2
+ DB 255,224 ; jmpq *%rax
+ DB 69,137,193 ; mov %r8d,%r9d
+ DB 65,128,225,15 ; and $0xf,%r9b
+ DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 119,197 ; ja 65c <_sk_load_a8_hsw_lowp+0x10>
+ DB 69,15,182,201 ; movzbl %r9b,%r9d
+ DB 76,141,21,194,0,0,0 ; lea 0xc2(%rip),%r10 # 764 <_sk_load_a8_hsw_lowp+0x118>
+ DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax
+ DB 76,1,208 ; add %r10,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 65,15,182,4,19 ; movzbl (%r11,%rdx,1),%eax
+ DB 197,249,110,192 ; vmovd %eax,%xmm0
+ DB 235,166 ; jmp 65c <_sk_load_a8_hsw_lowp+0x10>
+ DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
+ DB 196,195,121,32,68,19,2,2 ; vpinsrb $0x2,0x2(%r11,%rdx,1),%xmm0,%xmm0
+ DB 65,15,183,4,19 ; movzwl (%r11,%rdx,1),%eax
+ DB 197,249,110,200 ; vmovd %eax,%xmm1
+ DB 196,227,121,14,193,1 ; vpblendw $0x1,%xmm1,%xmm0,%xmm0
+ DB 235,137 ; jmp 65c <_sk_load_a8_hsw_lowp+0x10>
+ DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
+ DB 196,195,121,32,68,19,6,6 ; vpinsrb $0x6,0x6(%r11,%rdx,1),%xmm0,%xmm0
+ DB 196,195,121,32,68,19,5,5 ; vpinsrb $0x5,0x5(%r11,%rdx,1),%xmm0,%xmm0
+ DB 196,195,121,32,68,19,4,4 ; vpinsrb $0x4,0x4(%r11,%rdx,1),%xmm0,%xmm0
+ DB 196,193,121,110,12,19 ; vmovd (%r11,%rdx,1),%xmm1
+ DB 196,227,121,2,193,1 ; vpblendd $0x1,%xmm1,%xmm0,%xmm0
+ DB 233,92,255,255,255 ; jmpq 65c <_sk_load_a8_hsw_lowp+0x10>
+ DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
+ DB 196,195,121,32,68,19,10,10 ; vpinsrb $0xa,0xa(%r11,%rdx,1),%xmm0,%xmm0
+ DB 196,195,121,32,68,19,9,9 ; vpinsrb $0x9,0x9(%r11,%rdx,1),%xmm0,%xmm0
+ DB 196,195,121,32,68,19,8,8 ; vpinsrb $0x8,0x8(%r11,%rdx,1),%xmm0,%xmm0
+ DB 196,193,122,126,12,19 ; vmovq (%r11,%rdx,1),%xmm1
+ DB 196,227,113,2,192,12 ; vpblendd $0xc,%xmm0,%xmm1,%xmm0
+ DB 233,47,255,255,255 ; jmpq 65c <_sk_load_a8_hsw_lowp+0x10>
+ DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
+ DB 196,195,121,32,68,19,14,14 ; vpinsrb $0xe,0xe(%r11,%rdx,1),%xmm0,%xmm0
+ DB 196,195,121,32,68,19,13,13 ; vpinsrb $0xd,0xd(%r11,%rdx,1),%xmm0,%xmm0
+ DB 196,195,121,32,68,19,12,12 ; vpinsrb $0xc,0xc(%r11,%rdx,1),%xmm0,%xmm0
+ DB 196,193,122,126,12,19 ; vmovq (%r11,%rdx,1),%xmm1
+ DB 196,195,113,34,76,19,8,2 ; vpinsrd $0x2,0x8(%r11,%rdx,1),%xmm1,%xmm1
+ DB 196,227,113,2,192,8 ; vpblendd $0x8,%xmm0,%xmm1,%xmm0
+ DB 233,250,254,255,255 ; jmpq 65c <_sk_load_a8_hsw_lowp+0x10>
+ DB 102,144 ; xchg %ax,%ax
+ DB 71,255 ; rex.RXB (bad)
+ DB 255 ; (bad)
+ DB 255,94,255 ; lcall *-0x1(%rsi)
+ DB 255 ; (bad)
+ DB 255,82,255 ; callq *-0x1(%rdx)
+ DB 255 ; (bad)
+ DB 255,139,255,255,255,131 ; decl -0x7c000001(%rbx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 123,255 ; jnp 779 <_sk_load_a8_hsw_lowp+0x12d>
+ DB 255 ; (bad)
+ DB 255,111,255 ; ljmp *-0x1(%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 184,255,255,255,176 ; mov $0xb0ffffff,%eax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,168,255,255,255,156 ; ljmp *-0x63000001(%rax)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,229 ; jmpq *%rbp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 221,255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,213 ; callq *%rbp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,201 ; dec %ecx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
+
+PUBLIC _sk_store_a8_hsw_lowp
+_sk_store_a8_hsw_lowp LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 76,139,24 ; mov (%rax),%r11
+ DB 197,189,113,211,7 ; vpsrlw $0x7,%ymm3,%ymm8
+ DB 196,67,125,57,193,1 ; vextracti128 $0x1,%ymm8,%xmm9
+ DB 196,65,57,103,193 ; vpackuswb %xmm9,%xmm8,%xmm8
+ DB 77,133,192 ; test %r8,%r8
+ DB 117,10 ; jne 7c4 <_sk_store_a8_hsw_lowp+0x24>
+ DB 196,65,122,127,4,19 ; vmovdqu %xmm8,(%r11,%rdx,1)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+ DB 69,137,193 ; mov %r8d,%r9d
+ DB 65,128,225,15 ; and $0xf,%r9b
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 119,236 ; ja 7c0 <_sk_store_a8_hsw_lowp+0x20>
+ DB 69,15,182,201 ; movzbl %r9b,%r9d
+ DB 76,141,21,137,0,0,0 ; lea 0x89(%rip),%r10 # 868 <_sk_store_a8_hsw_lowp+0xc8>
+ DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax
+ DB 76,1,208 ; add %r10,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 196,67,121,20,4,19,0 ; vpextrb $0x0,%xmm8,(%r11,%rdx,1)
+ DB 235,207 ; jmp 7c0 <_sk_store_a8_hsw_lowp+0x20>
+ DB 196,67,121,20,68,19,2,2 ; vpextrb $0x2,%xmm8,0x2(%r11,%rdx,1)
+ DB 196,67,121,21,4,19,0 ; vpextrw $0x0,%xmm8,(%r11,%rdx,1)
+ DB 235,190 ; jmp 7c0 <_sk_store_a8_hsw_lowp+0x20>
+ DB 196,67,121,20,68,19,6,6 ; vpextrb $0x6,%xmm8,0x6(%r11,%rdx,1)
+ DB 196,67,121,20,68,19,5,5 ; vpextrb $0x5,%xmm8,0x5(%r11,%rdx,1)
+ DB 196,67,121,20,68,19,4,4 ; vpextrb $0x4,%xmm8,0x4(%r11,%rdx,1)
+ DB 196,65,121,126,4,19 ; vmovd %xmm8,(%r11,%rdx,1)
+ DB 235,158 ; jmp 7c0 <_sk_store_a8_hsw_lowp+0x20>
+ DB 196,67,121,20,68,19,10,10 ; vpextrb $0xa,%xmm8,0xa(%r11,%rdx,1)
+ DB 196,67,121,20,68,19,9,9 ; vpextrb $0x9,%xmm8,0x9(%r11,%rdx,1)
+ DB 196,67,121,20,68,19,8,8 ; vpextrb $0x8,%xmm8,0x8(%r11,%rdx,1)
+ DB 235,32 ; jmp 85c <_sk_store_a8_hsw_lowp+0xbc>
+ DB 196,67,121,20,68,19,14,14 ; vpextrb $0xe,%xmm8,0xe(%r11,%rdx,1)
+ DB 196,67,121,20,68,19,13,13 ; vpextrb $0xd,%xmm8,0xd(%r11,%rdx,1)
+ DB 196,67,121,20,68,19,12,12 ; vpextrb $0xc,%xmm8,0xc(%r11,%rdx,1)
+ DB 196,67,121,22,68,19,8,2 ; vpextrd $0x2,%xmm8,0x8(%r11,%rdx,1)
+ DB 196,65,121,214,4,19 ; vmovq %xmm8,(%r11,%rdx,1)
+ DB 233,89,255,255,255 ; jmpq 7c0 <_sk_store_a8_hsw_lowp+0x20>
+ DB 144 ; nop
+ DB 128,255,255 ; cmp $0xff,%bh
+ DB 255,145,255,255,255,137 ; callq *-0x76000001(%rcx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,178,255,255,255,170 ; pushq -0x55000001(%rdx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,162,255,255,255,154 ; jmpq *-0x65000001(%rdx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,244 ; push %rsp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,202 ; dec %edx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,194 ; inc %edx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 186,255,255,255,236 ; mov $0xecffffff,%edx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,228 ; jmpq *%rsp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 220,255 ; fdivr %st,%st(7)
+ DB 255 ; (bad)
+ DB 255,212 ; callq *%rsp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
+
+PUBLIC _sk_load_g8_hsw_lowp
+_sk_load_g8_hsw_lowp LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 76,139,24 ; mov (%rax),%r11
+ DB 77,133,192 ; test %r8,%r8
+ DB 117,50 ; jne 8e0 <_sk_load_g8_hsw_lowp+0x3c>
+ DB 196,193,122,111,4,19 ; vmovdqu (%r11,%rdx,1),%xmm0
+ DB 196,226,125,48,192 ; vpmovzxbw %xmm0,%ymm0
+ DB 197,253,113,240,8 ; vpsllw $0x8,%ymm0,%ymm0
+ DB 196,226,125,121,13,253,14,0,0 ; vpbroadcastw 0xefd(%rip),%ymm1 # 17c4 <_sk_xor__hsw_lowp+0x144>
+ DB 197,253,228,193 ; vpmulhuw %ymm1,%ymm0,%ymm0
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,226,125,121,29,240,14,0,0 ; vpbroadcastw 0xef0(%rip),%ymm3 # 17c6 <_sk_xor__hsw_lowp+0x146>
+ DB 197,253,111,200 ; vmovdqa %ymm0,%ymm1
+ DB 197,253,111,208 ; vmovdqa %ymm0,%ymm2
+ DB 255,224 ; jmpq *%rax
+ DB 69,137,193 ; mov %r8d,%r9d
+ DB 65,128,225,15 ; and $0xf,%r9b
+ DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 119,192 ; ja 8b4 <_sk_load_g8_hsw_lowp+0x10>
+ DB 69,15,182,201 ; movzbl %r9b,%r9d
+ DB 76,141,21,193,0,0,0 ; lea 0xc1(%rip),%r10 # 9c0 <_sk_load_g8_hsw_lowp+0x11c>
+ DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax
+ DB 76,1,208 ; add %r10,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 65,15,182,4,19 ; movzbl (%r11,%rdx,1),%eax
+ DB 197,249,110,192 ; vmovd %eax,%xmm0
+ DB 235,161 ; jmp 8b4 <_sk_load_g8_hsw_lowp+0x10>
+ DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
+ DB 196,195,121,32,68,19,2,2 ; vpinsrb $0x2,0x2(%r11,%rdx,1),%xmm0,%xmm0
+ DB 65,15,183,4,19 ; movzwl (%r11,%rdx,1),%eax
+ DB 197,249,110,200 ; vmovd %eax,%xmm1
+ DB 196,227,121,14,193,1 ; vpblendw $0x1,%xmm1,%xmm0,%xmm0
+ DB 235,132 ; jmp 8b4 <_sk_load_g8_hsw_lowp+0x10>
+ DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
+ DB 196,195,121,32,68,19,6,6 ; vpinsrb $0x6,0x6(%r11,%rdx,1),%xmm0,%xmm0
+ DB 196,195,121,32,68,19,5,5 ; vpinsrb $0x5,0x5(%r11,%rdx,1),%xmm0,%xmm0
+ DB 196,195,121,32,68,19,4,4 ; vpinsrb $0x4,0x4(%r11,%rdx,1),%xmm0,%xmm0
+ DB 196,193,121,110,12,19 ; vmovd (%r11,%rdx,1),%xmm1
+ DB 196,227,121,2,193,1 ; vpblendd $0x1,%xmm1,%xmm0,%xmm0
+ DB 233,87,255,255,255 ; jmpq 8b4 <_sk_load_g8_hsw_lowp+0x10>
+ DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
+ DB 196,195,121,32,68,19,10,10 ; vpinsrb $0xa,0xa(%r11,%rdx,1),%xmm0,%xmm0
+ DB 196,195,121,32,68,19,9,9 ; vpinsrb $0x9,0x9(%r11,%rdx,1),%xmm0,%xmm0
+ DB 196,195,121,32,68,19,8,8 ; vpinsrb $0x8,0x8(%r11,%rdx,1),%xmm0,%xmm0
+ DB 196,193,122,126,12,19 ; vmovq (%r11,%rdx,1),%xmm1
+ DB 196,227,113,2,192,12 ; vpblendd $0xc,%xmm0,%xmm1,%xmm0
+ DB 233,42,255,255,255 ; jmpq 8b4 <_sk_load_g8_hsw_lowp+0x10>
+ DB 197,249,239,192 ; vpxor %xmm0,%xmm0,%xmm0
+ DB 196,195,121,32,68,19,14,14 ; vpinsrb $0xe,0xe(%r11,%rdx,1),%xmm0,%xmm0
+ DB 196,195,121,32,68,19,13,13 ; vpinsrb $0xd,0xd(%r11,%rdx,1),%xmm0,%xmm0
+ DB 196,195,121,32,68,19,12,12 ; vpinsrb $0xc,0xc(%r11,%rdx,1),%xmm0,%xmm0
+ DB 196,193,122,126,12,19 ; vmovq (%r11,%rdx,1),%xmm1
+ DB 196,195,113,34,76,19,8,2 ; vpinsrd $0x2,0x8(%r11,%rdx,1),%xmm1,%xmm1
+ DB 196,227,113,2,192,8 ; vpblendd $0x8,%xmm0,%xmm1,%xmm0
+ DB 233,245,254,255,255 ; jmpq 8b4 <_sk_load_g8_hsw_lowp+0x10>
+ DB 144 ; nop
+ DB 72,255 ; rex.W (bad)
+ DB 255 ; (bad)
+ DB 255,95,255 ; lcall *-0x1(%rdi)
+ DB 255 ; (bad)
+ DB 255,83,255 ; callq *-0x1(%rbx)
+ DB 255 ; (bad)
+ DB 255,140,255,255,255,132,255 ; decl -0x7b0001(%rdi,%rdi,8)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 124,255 ; jl 9d5 <_sk_load_g8_hsw_lowp+0x131>
+ DB 255 ; (bad)
+ DB 255,112,255 ; pushq -0x1(%rax)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 185,255,255,255,177 ; mov $0xb1ffffff,%ecx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,169,255,255,255,157 ; ljmp *-0x62000001(%rcx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,230 ; jmpq *%rsi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 222,255 ; fdivrp %st,%st(7)
+ DB 255 ; (bad)
+ DB 255,214 ; callq *%rsi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,202 ; dec %edx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
+
+PUBLIC _sk_srcover_rgba_8888_hsw_lowp
+_sk_srcover_rgba_8888_hsw_lowp LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 76,139,24 ; mov (%rax),%r11
+ DB 77,133,192 ; test %r8,%r8
+ DB 15,133,220,1,0,0 ; jne be6 <_sk_srcover_rgba_8888_hsw_lowp+0x1ea>
+ DB 196,193,126,111,124,147,32 ; vmovdqu 0x20(%r11,%rdx,4),%ymm7
+ DB 196,65,126,111,4,147 ; vmovdqu (%r11,%rdx,4),%ymm8
+ DB 197,253,111,37,193,13,0,0 ; vmovdqa 0xdc1(%rip),%ymm4 # 17e0 <_sk_xor__hsw_lowp+0x160>
+ DB 196,226,61,0,236 ; vpshufb %ymm4,%ymm8,%ymm5
+ DB 196,227,253,0,237,232 ; vpermq $0xe8,%ymm5,%ymm5
+ DB 196,226,69,0,228 ; vpshufb %ymm4,%ymm7,%ymm4
+ DB 196,227,253,0,228,232 ; vpermq $0xe8,%ymm4,%ymm4
+ DB 196,227,85,56,228,1 ; vinserti128 $0x1,%xmm4,%ymm5,%ymm4
+ DB 196,98,125,121,13,188,13,0,0 ; vpbroadcastw 0xdbc(%rip),%ymm9 # 1800 <_sk_xor__hsw_lowp+0x180>
+ DB 197,221,113,244,8 ; vpsllw $0x8,%ymm4,%ymm4
+ DB 196,98,125,121,21,176,13,0,0 ; vpbroadcastw 0xdb0(%rip),%ymm10 # 1802 <_sk_xor__hsw_lowp+0x182>
+ DB 196,193,93,228,226 ; vpmulhuw %ymm10,%ymm4,%ymm4
+ DB 197,253,111,45,193,13,0,0 ; vmovdqa 0xdc1(%rip),%ymm5 # 1820 <_sk_xor__hsw_lowp+0x1a0>
+ DB 196,226,61,0,245 ; vpshufb %ymm5,%ymm8,%ymm6
+ DB 196,227,253,0,246,232 ; vpermq $0xe8,%ymm6,%ymm6
+ DB 196,226,69,0,237 ; vpshufb %ymm5,%ymm7,%ymm5
+ DB 196,227,253,0,237,232 ; vpermq $0xe8,%ymm5,%ymm5
+ DB 196,227,77,56,237,1 ; vinserti128 $0x1,%xmm5,%ymm6,%ymm5
+ DB 197,213,113,245,8 ; vpsllw $0x8,%ymm5,%ymm5
+ DB 196,193,85,228,234 ; vpmulhuw %ymm10,%ymm5,%ymm5
+ DB 197,253,111,53,179,13,0,0 ; vmovdqa 0xdb3(%rip),%ymm6 # 1840 <_sk_xor__hsw_lowp+0x1c0>
+ DB 196,98,61,0,222 ; vpshufb %ymm6,%ymm8,%ymm11
+ DB 196,67,253,0,219,232 ; vpermq $0xe8,%ymm11,%ymm11
+ DB 196,226,69,0,246 ; vpshufb %ymm6,%ymm7,%ymm6
+ DB 196,227,253,0,246,232 ; vpermq $0xe8,%ymm6,%ymm6
+ DB 196,227,37,56,246,1 ; vinserti128 $0x1,%xmm6,%ymm11,%ymm6
+ DB 197,205,113,246,8 ; vpsllw $0x8,%ymm6,%ymm6
+ DB 196,193,77,228,242 ; vpmulhuw %ymm10,%ymm6,%ymm6
+ DB 197,125,111,29,165,13,0,0 ; vmovdqa 0xda5(%rip),%ymm11 # 1860 <_sk_xor__hsw_lowp+0x1e0>
+ DB 196,66,61,0,195 ; vpshufb %ymm11,%ymm8,%ymm8
+ DB 196,67,253,0,192,232 ; vpermq $0xe8,%ymm8,%ymm8
+ DB 196,194,69,0,251 ; vpshufb %ymm11,%ymm7,%ymm7
+ DB 196,227,253,0,255,232 ; vpermq $0xe8,%ymm7,%ymm7
+ DB 196,227,61,56,255,1 ; vinserti128 $0x1,%xmm7,%ymm8,%ymm7
+ DB 197,197,113,247,8 ; vpsllw $0x8,%ymm7,%ymm7
+ DB 196,193,69,228,250 ; vpmulhuw %ymm10,%ymm7,%ymm7
+ DB 196,98,125,121,5,150,13,0,0 ; vpbroadcastw 0xd96(%rip),%ymm8 # 1880 <_sk_xor__hsw_lowp+0x200>
+ DB 197,61,249,195 ; vpsubw %ymm3,%ymm8,%ymm8
+ DB 196,66,93,11,208 ; vpmulhrsw %ymm8,%ymm4,%ymm10
+ DB 196,66,125,29,210 ; vpabsw %ymm10,%ymm10
+ DB 197,173,253,192 ; vpaddw %ymm0,%ymm10,%ymm0
+ DB 196,66,85,11,208 ; vpmulhrsw %ymm8,%ymm5,%ymm10
+ DB 196,66,125,29,210 ; vpabsw %ymm10,%ymm10
+ DB 197,173,253,201 ; vpaddw %ymm1,%ymm10,%ymm1
+ DB 196,66,77,11,208 ; vpmulhrsw %ymm8,%ymm6,%ymm10
+ DB 196,66,125,29,210 ; vpabsw %ymm10,%ymm10
+ DB 197,173,253,210 ; vpaddw %ymm2,%ymm10,%ymm2
+ DB 196,66,69,11,192 ; vpmulhrsw %ymm8,%ymm7,%ymm8
+ DB 196,66,125,29,192 ; vpabsw %ymm8,%ymm8
+ DB 197,189,253,219 ; vpaddw %ymm3,%ymm8,%ymm3
+ DB 197,189,113,208,7 ; vpsrlw $0x7,%ymm0,%ymm8
+ DB 196,65,61,234,193 ; vpminsw %ymm9,%ymm8,%ymm8
+ DB 196,66,125,51,208 ; vpmovzxwd %xmm8,%ymm10
+ DB 196,67,125,57,192,1 ; vextracti128 $0x1,%ymm8,%xmm8
+ DB 196,66,125,51,192 ; vpmovzxwd %xmm8,%ymm8
+ DB 197,165,113,209,7 ; vpsrlw $0x7,%ymm1,%ymm11
+ DB 196,65,37,234,217 ; vpminsw %ymm9,%ymm11,%ymm11
+ DB 196,67,125,57,220,1 ; vextracti128 $0x1,%ymm11,%xmm12
+ DB 196,66,125,51,228 ; vpmovzxwd %xmm12,%ymm12
+ DB 196,66,125,51,219 ; vpmovzxwd %xmm11,%ymm11
+ DB 196,193,37,114,243,8 ; vpslld $0x8,%ymm11,%ymm11
+ DB 196,193,29,114,244,8 ; vpslld $0x8,%ymm12,%ymm12
+ DB 197,149,113,210,7 ; vpsrlw $0x7,%ymm2,%ymm13
+ DB 196,65,21,234,233 ; vpminsw %ymm9,%ymm13,%ymm13
+ DB 196,66,125,51,245 ; vpmovzxwd %xmm13,%ymm14
+ DB 196,67,125,57,237,1 ; vextracti128 $0x1,%ymm13,%xmm13
+ DB 196,66,125,51,237 ; vpmovzxwd %xmm13,%ymm13
+ DB 196,193,21,114,245,16 ; vpslld $0x10,%ymm13,%ymm13
+ DB 196,193,13,114,246,16 ; vpslld $0x10,%ymm14,%ymm14
+ DB 197,133,113,211,7 ; vpsrlw $0x7,%ymm3,%ymm15
+ DB 196,65,5,234,201 ; vpminsw %ymm9,%ymm15,%ymm9
+ DB 196,67,125,57,207,1 ; vextracti128 $0x1,%ymm9,%xmm15
+ DB 196,66,125,51,255 ; vpmovzxwd %xmm15,%ymm15
+ DB 196,66,125,51,201 ; vpmovzxwd %xmm9,%ymm9
+ DB 196,193,53,114,241,24 ; vpslld $0x18,%ymm9,%ymm9
+ DB 196,193,5,114,247,24 ; vpslld $0x18,%ymm15,%ymm15
+ DB 196,65,29,235,192 ; vpor %ymm8,%ymm12,%ymm8
+ DB 196,65,37,235,218 ; vpor %ymm10,%ymm11,%ymm11
+ DB 196,65,21,235,215 ; vpor %ymm15,%ymm13,%ymm10
+ DB 196,65,61,235,210 ; vpor %ymm10,%ymm8,%ymm10
+ DB 196,65,13,235,193 ; vpor %ymm9,%ymm14,%ymm8
+ DB 196,65,37,235,192 ; vpor %ymm8,%ymm11,%ymm8
+ DB 77,133,192 ; test %r8,%r8
+ DB 117,77 ; jne c22 <_sk_srcover_rgba_8888_hsw_lowp+0x226>
+ DB 196,65,126,127,4,147 ; vmovdqu %ymm8,(%r11,%rdx,4)
+ DB 196,65,126,127,84,147,32 ; vmovdqu %ymm10,0x20(%r11,%rdx,4)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+ DB 69,137,193 ; mov %r8d,%r9d
+ DB 65,128,225,15 ; and $0xf,%r9b
+ DB 197,197,239,255 ; vpxor %ymm7,%ymm7,%ymm7
+ DB 196,65,61,239,192 ; vpxor %ymm8,%ymm8,%ymm8
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 15,135,20,254,255,255 ; ja a17 <_sk_srcover_rgba_8888_hsw_lowp+0x1b>
+ DB 69,15,182,201 ; movzbl %r9b,%r9d
+ DB 76,141,21,238,1,0,0 ; lea 0x1ee(%rip),%r10 # dfc <_sk_srcover_rgba_8888_hsw_lowp+0x400>
+ DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax
+ DB 76,1,208 ; add %r10,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 196,65,121,110,4,147 ; vmovd (%r11,%rdx,4),%xmm8
+ DB 233,245,253,255,255 ; jmpq a17 <_sk_srcover_rgba_8888_hsw_lowp+0x1b>
+ DB 69,137,193 ; mov %r8d,%r9d
+ DB 65,128,225,15 ; and $0xf,%r9b
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 119,176 ; ja be2 <_sk_srcover_rgba_8888_hsw_lowp+0x1e6>
+ DB 65,15,182,193 ; movzbl %r9b,%eax
+ DB 76,141,13,251,1,0,0 ; lea 0x1fb(%rip),%r9 # e38 <_sk_srcover_rgba_8888_hsw_lowp+0x43c>
+ DB 73,99,4,129 ; movslq (%r9,%rax,4),%rax
+ DB 76,1,200 ; add %r9,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 196,65,121,126,4,147 ; vmovd %xmm8,(%r11,%rdx,4)
+ DB 235,148 ; jmp be2 <_sk_srcover_rgba_8888_hsw_lowp+0x1e6>
+ DB 196,193,121,110,100,147,8 ; vmovd 0x8(%r11,%rdx,4),%xmm4
+ DB 196,226,121,89,228 ; vpbroadcastq %xmm4,%xmm4
+ DB 197,197,239,255 ; vpxor %ymm7,%ymm7,%ymm7
+ DB 196,99,69,2,196,4 ; vpblendd $0x4,%ymm4,%ymm7,%ymm8
+ DB 196,194,121,53,36,147 ; vpmovzxdq (%r11,%rdx,4),%xmm4
+ DB 197,249,112,228,232 ; vpshufd $0xe8,%xmm4,%xmm4
+ DB 196,99,61,2,196,3 ; vpblendd $0x3,%ymm4,%ymm8,%ymm8
+ DB 233,157,253,255,255 ; jmpq a17 <_sk_srcover_rgba_8888_hsw_lowp+0x1b>
+ DB 196,193,121,110,100,147,24 ; vmovd 0x18(%r11,%rdx,4),%xmm4
+ DB 196,226,125,89,228 ; vpbroadcastq %xmm4,%ymm4
+ DB 197,197,239,255 ; vpxor %ymm7,%ymm7,%ymm7
+ DB 196,99,69,2,196,64 ; vpblendd $0x40,%ymm4,%ymm7,%ymm8
+ DB 196,99,125,57,196,1 ; vextracti128 $0x1,%ymm8,%xmm4
+ DB 196,195,89,34,100,147,20,1 ; vpinsrd $0x1,0x14(%r11,%rdx,4),%xmm4,%xmm4
+ DB 196,99,61,56,196,1 ; vinserti128 $0x1,%xmm4,%ymm8,%ymm8
+ DB 196,99,125,57,196,1 ; vextracti128 $0x1,%ymm8,%xmm4
+ DB 196,195,89,34,100,147,16,0 ; vpinsrd $0x0,0x10(%r11,%rdx,4),%xmm4,%xmm4
+ DB 196,99,61,56,196,1 ; vinserti128 $0x1,%xmm4,%ymm8,%ymm8
+ DB 196,193,122,111,36,147 ; vmovdqu (%r11,%rdx,4),%xmm4
+ DB 196,67,93,2,192,240 ; vpblendd $0xf0,%ymm8,%ymm4,%ymm8
+ DB 233,78,253,255,255 ; jmpq a17 <_sk_srcover_rgba_8888_hsw_lowp+0x1b>
+ DB 196,193,121,110,100,147,40 ; vmovd 0x28(%r11,%rdx,4),%xmm4
+ DB 196,226,121,89,228 ; vpbroadcastq %xmm4,%xmm4
+ DB 197,213,239,237 ; vpxor %ymm5,%ymm5,%ymm5
+ DB 196,227,85,2,252,4 ; vpblendd $0x4,%ymm4,%ymm5,%ymm7
+ DB 196,195,65,34,100,147,36,1 ; vpinsrd $0x1,0x24(%r11,%rdx,4),%xmm7,%xmm4
+ DB 196,227,69,2,252,15 ; vpblendd $0xf,%ymm4,%ymm7,%ymm7
+ DB 196,193,121,110,100,147,32 ; vmovd 0x20(%r11,%rdx,4),%xmm4
+ DB 196,227,69,2,252,1 ; vpblendd $0x1,%ymm4,%ymm7,%ymm7
+ DB 233,18,253,255,255 ; jmpq a11 <_sk_srcover_rgba_8888_hsw_lowp+0x15>
+ DB 196,193,121,110,100,147,56 ; vmovd 0x38(%r11,%rdx,4),%xmm4
+ DB 196,226,125,89,228 ; vpbroadcastq %xmm4,%ymm4
+ DB 197,213,239,237 ; vpxor %ymm5,%ymm5,%ymm5
+ DB 196,227,85,2,252,64 ; vpblendd $0x40,%ymm4,%ymm5,%ymm7
+ DB 196,227,125,57,252,1 ; vextracti128 $0x1,%ymm7,%xmm4
+ DB 196,195,89,34,100,147,52,1 ; vpinsrd $0x1,0x34(%r11,%rdx,4),%xmm4,%xmm4
+ DB 196,227,69,56,252,1 ; vinserti128 $0x1,%xmm4,%ymm7,%ymm7
+ DB 196,227,125,57,252,1 ; vextracti128 $0x1,%ymm7,%xmm4
+ DB 196,195,89,34,100,147,48,0 ; vpinsrd $0x0,0x30(%r11,%rdx,4),%xmm4,%xmm4
+ DB 196,227,69,56,252,1 ; vinserti128 $0x1,%xmm4,%ymm7,%ymm7
+ DB 196,65,126,111,4,147 ; vmovdqu (%r11,%rdx,4),%ymm8
+ DB 196,193,122,111,100,147,32 ; vmovdqu 0x20(%r11,%rdx,4),%xmm4
+ DB 196,227,93,2,255,240 ; vpblendd $0xf0,%ymm7,%ymm4,%ymm7
+ DB 233,194,252,255,255 ; jmpq a17 <_sk_srcover_rgba_8888_hsw_lowp+0x1b>
+ DB 196,67,121,22,68,147,8,2 ; vpextrd $0x2,%xmm8,0x8(%r11,%rdx,4)
+ DB 196,65,121,214,4,147 ; vmovq %xmm8,(%r11,%rdx,4)
+ DB 233,122,254,255,255 ; jmpq be2 <_sk_srcover_rgba_8888_hsw_lowp+0x1e6>
+ DB 196,67,125,57,193,1 ; vextracti128 $0x1,%ymm8,%xmm9
+ DB 196,67,121,22,76,147,24,2 ; vpextrd $0x2,%xmm9,0x18(%r11,%rdx,4)
+ DB 196,67,125,57,193,1 ; vextracti128 $0x1,%ymm8,%xmm9
+ DB 196,67,121,22,76,147,20,1 ; vpextrd $0x1,%xmm9,0x14(%r11,%rdx,4)
+ DB 196,67,125,57,193,1 ; vextracti128 $0x1,%ymm8,%xmm9
+ DB 196,65,121,126,76,147,16 ; vmovd %xmm9,0x10(%r11,%rdx,4)
+ DB 196,65,122,127,4,147 ; vmovdqu %xmm8,(%r11,%rdx,4)
+ DB 233,70,254,255,255 ; jmpq be2 <_sk_srcover_rgba_8888_hsw_lowp+0x1e6>
+ DB 196,67,121,22,84,147,40,2 ; vpextrd $0x2,%xmm10,0x28(%r11,%rdx,4)
+ DB 196,67,121,22,84,147,36,1 ; vpextrd $0x1,%xmm10,0x24(%r11,%rdx,4)
+ DB 196,65,121,126,84,147,32 ; vmovd %xmm10,0x20(%r11,%rdx,4)
+ DB 196,65,126,127,4,147 ; vmovdqu %ymm8,(%r11,%rdx,4)
+ DB 233,36,254,255,255 ; jmpq be2 <_sk_srcover_rgba_8888_hsw_lowp+0x1e6>
+ DB 196,67,125,57,209,1 ; vextracti128 $0x1,%ymm10,%xmm9
+ DB 196,67,121,22,76,147,56,2 ; vpextrd $0x2,%xmm9,0x38(%r11,%rdx,4)
+ DB 196,67,125,57,209,1 ; vextracti128 $0x1,%ymm10,%xmm9
+ DB 196,67,121,22,76,147,52,1 ; vpextrd $0x1,%xmm9,0x34(%r11,%rdx,4)
+ DB 196,67,125,57,209,1 ; vextracti128 $0x1,%ymm10,%xmm9
+ DB 196,65,121,126,76,147,48 ; vmovd %xmm9,0x30(%r11,%rdx,4)
+ DB 196,65,126,127,4,147 ; vmovdqu %ymm8,(%r11,%rdx,4)
+ DB 196,65,122,127,84,147,32 ; vmovdqu %xmm10,0x20(%r11,%rdx,4)
+ DB 233,233,253,255,255 ; jmpq be2 <_sk_srcover_rgba_8888_hsw_lowp+0x1e6>
+ DB 15,31,0 ; nopl (%rax)
+ DB 27,254 ; sbb %esi,%edi
+ DB 255 ; (bad)
+ DB 255,104,254 ; ljmp *-0x2(%rax)
+ DB 255 ; (bad)
+ DB 255,82,254 ; callq *-0x2(%rdx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 188,254,255,255,168 ; mov $0xa8fffffe,%esp
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,148,254,255,255,126,254 ; callq *-0x1810001(%rsi,%rdi,8)
+ DB 255 ; (bad)
+ DB 255,21,252,255,255,241 ; callq *-0xe000004(%rip) # fffffffff2000e19 <_sk_xor__hsw_lowp+0xfffffffff1fff799>
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,227 ; jmpq *%rbx
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,205 ; dec %ebp
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,65,255 ; incl -0x1(%rcx)
+ DB 255 ; (bad)
+ DB 255,45,255,255,255,25 ; ljmp *0x19ffffff(%rip) # 1a000e30 <_sk_xor__hsw_lowp+0x19fff7b0>
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,3 ; incl (%rbx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,14 ; decl (%rsi)
+ DB 254 ; (bad)
+ DB 255 ; (bad)
+ DB 255,37,255,255,255,29 ; jmpq *0x1dffffff(%rip) # 1e000e40 <_sk_xor__hsw_lowp+0x1dfff7c0>
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,89,255 ; lcall *-0x1(%rcx)
+ DB 255 ; (bad)
+ DB 255,76,255,255 ; decl -0x1(%rdi,%rdi,8)
+ DB 255 ; (bad)
+ DB 62,255 ; ds (bad)
+ DB 255 ; (bad)
+ DB 255,48 ; pushq (%rax)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 123,255 ; jnp e55 <_sk_srcover_rgba_8888_hsw_lowp+0x459>
+ DB 255 ; (bad)
+ DB 255,116,255,255 ; pushq -0x1(%rdi,%rdi,8)
+ DB 255,108,255,255 ; ljmp *-0x1(%rdi,%rdi,8)
+ DB 255,100,255,255 ; jmpq *-0x1(%rdi,%rdi,8)
+ DB 255,175,255,255,255,162 ; ljmp *-0x5d000001(%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,148,255,255,255,134,255 ; callq *-0x790001(%rdi,%rdi,8)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
+
+PUBLIC _sk_scale_1_float_hsw_lowp
+_sk_scale_1_float_hsw_lowp LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,122,16,0 ; vmovss (%rax),%xmm8
+ DB 197,58,88,5,126,8,0,0 ; vaddss 0x87e(%rip),%xmm8,%xmm8 # 1700 <_sk_xor__hsw_lowp+0x80>
+ DB 197,121,126,192 ; vmovd %xmm8,%eax
+ DB 197,121,110,192 ; vmovd %eax,%xmm8
+ DB 196,66,125,121,192 ; vpbroadcastw %xmm8,%ymm8
+ DB 196,194,125,11,192 ; vpmulhrsw %ymm8,%ymm0,%ymm0
+ DB 196,226,125,29,192 ; vpabsw %ymm0,%ymm0
+ DB 196,194,117,11,200 ; vpmulhrsw %ymm8,%ymm1,%ymm1
+ DB 196,226,125,29,201 ; vpabsw %ymm1,%ymm1
+ DB 196,194,109,11,208 ; vpmulhrsw %ymm8,%ymm2,%ymm2
+ DB 196,226,125,29,210 ; vpabsw %ymm2,%ymm2
+ DB 196,194,101,11,216 ; vpmulhrsw %ymm8,%ymm3,%ymm3
+ DB 196,226,125,29,219 ; vpabsw %ymm3,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_scale_u8_hsw_lowp
+_sk_scale_u8_hsw_lowp LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 76,139,24 ; mov (%rax),%r11
+ DB 77,133,192 ; test %r8,%r8
+ DB 117,75 ; jne f10 <_sk_scale_u8_hsw_lowp+0x55>
+ DB 196,65,122,111,4,19 ; vmovdqu (%r11,%rdx,1),%xmm8
+ DB 196,66,125,48,192 ; vpmovzxbw %xmm8,%ymm8
+ DB 196,193,61,113,240,8 ; vpsllw $0x8,%ymm8,%ymm8
+ DB 196,98,125,121,13,163,9,0,0 ; vpbroadcastw 0x9a3(%rip),%ymm9 # 1882 <_sk_xor__hsw_lowp+0x202>
+ DB 196,65,61,228,193 ; vpmulhuw %ymm9,%ymm8,%ymm8
+ DB 196,194,125,11,192 ; vpmulhrsw %ymm8,%ymm0,%ymm0
+ DB 196,226,125,29,192 ; vpabsw %ymm0,%ymm0
+ DB 196,194,117,11,200 ; vpmulhrsw %ymm8,%ymm1,%ymm1
+ DB 196,226,125,29,201 ; vpabsw %ymm1,%ymm1
+ DB 196,194,109,11,208 ; vpmulhrsw %ymm8,%ymm2,%ymm2
+ DB 196,226,125,29,210 ; vpabsw %ymm2,%ymm2
+ DB 196,194,101,11,216 ; vpmulhrsw %ymm8,%ymm3,%ymm3
+ DB 196,226,125,29,219 ; vpabsw %ymm3,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+ DB 69,137,193 ; mov %r8d,%r9d
+ DB 65,128,225,15 ; and $0xf,%r9b
+ DB 196,65,57,239,192 ; vpxor %xmm8,%xmm8,%xmm8
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 119,166 ; ja ecb <_sk_scale_u8_hsw_lowp+0x10>
+ DB 69,15,182,201 ; movzbl %r9b,%r9d
+ DB 76,141,21,200,0,0,0 ; lea 0xc8(%rip),%r10 # ff8 <_sk_scale_u8_hsw_lowp+0x13d>
+ DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax
+ DB 76,1,208 ; add %r10,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 65,15,182,4,19 ; movzbl (%r11,%rdx,1),%eax
+ DB 197,121,110,192 ; vmovd %eax,%xmm8
+ DB 235,135 ; jmp ecb <_sk_scale_u8_hsw_lowp+0x10>
+ DB 196,65,57,239,192 ; vpxor %xmm8,%xmm8,%xmm8
+ DB 196,67,57,32,68,19,2,2 ; vpinsrb $0x2,0x2(%r11,%rdx,1),%xmm8,%xmm8
+ DB 65,15,183,4,19 ; movzwl (%r11,%rdx,1),%eax
+ DB 197,121,110,200 ; vmovd %eax,%xmm9
+ DB 196,67,57,14,193,1 ; vpblendw $0x1,%xmm9,%xmm8,%xmm8
+ DB 233,102,255,255,255 ; jmpq ecb <_sk_scale_u8_hsw_lowp+0x10>
+ DB 196,65,57,239,192 ; vpxor %xmm8,%xmm8,%xmm8
+ DB 196,67,57,32,68,19,6,6 ; vpinsrb $0x6,0x6(%r11,%rdx,1),%xmm8,%xmm8
+ DB 196,67,57,32,68,19,5,5 ; vpinsrb $0x5,0x5(%r11,%rdx,1),%xmm8,%xmm8
+ DB 196,67,57,32,68,19,4,4 ; vpinsrb $0x4,0x4(%r11,%rdx,1),%xmm8,%xmm8
+ DB 196,65,121,110,12,19 ; vmovd (%r11,%rdx,1),%xmm9
+ DB 196,67,57,2,193,1 ; vpblendd $0x1,%xmm9,%xmm8,%xmm8
+ DB 233,56,255,255,255 ; jmpq ecb <_sk_scale_u8_hsw_lowp+0x10>
+ DB 196,65,57,239,192 ; vpxor %xmm8,%xmm8,%xmm8
+ DB 196,67,57,32,68,19,10,10 ; vpinsrb $0xa,0xa(%r11,%rdx,1),%xmm8,%xmm8
+ DB 196,67,57,32,68,19,9,9 ; vpinsrb $0x9,0x9(%r11,%rdx,1),%xmm8,%xmm8
+ DB 196,67,57,32,68,19,8,8 ; vpinsrb $0x8,0x8(%r11,%rdx,1),%xmm8,%xmm8
+ DB 196,65,122,126,12,19 ; vmovq (%r11,%rdx,1),%xmm9
+ DB 196,67,49,2,192,12 ; vpblendd $0xc,%xmm8,%xmm9,%xmm8
+ DB 233,10,255,255,255 ; jmpq ecb <_sk_scale_u8_hsw_lowp+0x10>
+ DB 196,65,57,239,192 ; vpxor %xmm8,%xmm8,%xmm8
+ DB 196,67,57,32,68,19,14,14 ; vpinsrb $0xe,0xe(%r11,%rdx,1),%xmm8,%xmm8
+ DB 196,67,57,32,68,19,13,13 ; vpinsrb $0xd,0xd(%r11,%rdx,1),%xmm8,%xmm8
+ DB 196,67,57,32,68,19,12,12 ; vpinsrb $0xc,0xc(%r11,%rdx,1),%xmm8,%xmm8
+ DB 196,65,122,126,12,19 ; vmovq (%r11,%rdx,1),%xmm9
+ DB 196,67,49,34,76,19,8,2 ; vpinsrd $0x2,0x8(%r11,%rdx,1),%xmm9,%xmm9
+ DB 196,67,49,2,192,8 ; vpblendd $0x8,%xmm8,%xmm9,%xmm8
+ DB 233,212,254,255,255 ; jmpq ecb <_sk_scale_u8_hsw_lowp+0x10>
+ DB 144 ; nop
+ DB 65,255 ; rex.B (bad)
+ DB 255 ; (bad)
+ DB 255,89,255 ; lcall *-0x1(%rcx)
+ DB 255 ; (bad)
+ DB 255,76,255,255 ; decl -0x1(%rdi,%rdi,8)
+ DB 255,138,255,255,255,130 ; decl -0x7d000001(%rdx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 122,255 ; jp 100d <_sk_scale_u8_hsw_lowp+0x152>
+ DB 255 ; (bad)
+ DB 255,109,255 ; ljmp *-0x1(%rbp)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 184,255,255,255,176 ; mov $0xb0ffffff,%eax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,168,255,255,255,155 ; ljmp *-0x64000001(%rax)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,230 ; jmpq *%rsi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 222,255 ; fdivrp %st,%st(7)
+ DB 255 ; (bad)
+ DB 255,214 ; callq *%rsi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,201 ; dec %ecx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
+
+PUBLIC _sk_lerp_1_float_hsw_lowp
+_sk_lerp_1_float_hsw_lowp LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,122,16,0 ; vmovss (%rax),%xmm8
+ DB 197,58,88,5,194,6,0,0 ; vaddss 0x6c2(%rip),%xmm8,%xmm8 # 1704 <_sk_xor__hsw_lowp+0x84>
+ DB 197,121,126,192 ; vmovd %xmm8,%eax
+ DB 197,121,110,192 ; vmovd %eax,%xmm8
+ DB 196,66,125,121,192 ; vpbroadcastw %xmm8,%ymm8
+ DB 196,194,125,11,192 ; vpmulhrsw %ymm8,%ymm0,%ymm0
+ DB 196,226,125,29,192 ; vpabsw %ymm0,%ymm0
+ DB 196,98,125,121,13,34,8,0,0 ; vpbroadcastw 0x822(%rip),%ymm9 # 1884 <_sk_xor__hsw_lowp+0x204>
+ DB 196,65,53,249,200 ; vpsubw %ymm8,%ymm9,%ymm9
+ DB 196,66,93,11,209 ; vpmulhrsw %ymm9,%ymm4,%ymm10
+ DB 196,66,125,29,210 ; vpabsw %ymm10,%ymm10
+ DB 197,173,253,192 ; vpaddw %ymm0,%ymm10,%ymm0
+ DB 196,194,117,11,200 ; vpmulhrsw %ymm8,%ymm1,%ymm1
+ DB 196,226,125,29,201 ; vpabsw %ymm1,%ymm1
+ DB 196,66,85,11,209 ; vpmulhrsw %ymm9,%ymm5,%ymm10
+ DB 196,66,125,29,210 ; vpabsw %ymm10,%ymm10
+ DB 197,173,253,201 ; vpaddw %ymm1,%ymm10,%ymm1
+ DB 196,194,109,11,208 ; vpmulhrsw %ymm8,%ymm2,%ymm2
+ DB 196,226,125,29,210 ; vpabsw %ymm2,%ymm2
+ DB 196,66,77,11,209 ; vpmulhrsw %ymm9,%ymm6,%ymm10
+ DB 196,66,125,29,210 ; vpabsw %ymm10,%ymm10
+ DB 197,173,253,210 ; vpaddw %ymm2,%ymm10,%ymm2
+ DB 196,194,101,11,216 ; vpmulhrsw %ymm8,%ymm3,%ymm3
+ DB 196,226,125,29,219 ; vpabsw %ymm3,%ymm3
+ DB 196,66,69,11,193 ; vpmulhrsw %ymm9,%ymm7,%ymm8
+ DB 196,66,125,29,192 ; vpabsw %ymm8,%ymm8
+ DB 197,189,253,219 ; vpaddw %ymm3,%ymm8,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_lerp_u8_hsw_lowp
+_sk_lerp_u8_hsw_lowp LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 76,139,24 ; mov (%rax),%r11
+ DB 77,133,192 ; test %r8,%r8
+ DB 15,133,145,0,0,0 ; jne 1160 <_sk_lerp_u8_hsw_lowp+0x9f>
+ DB 196,65,122,111,4,19 ; vmovdqu (%r11,%rdx,1),%xmm8
+ DB 196,66,125,48,192 ; vpmovzxbw %xmm8,%ymm8
+ DB 196,193,61,113,240,8 ; vpsllw $0x8,%ymm8,%ymm8
+ DB 196,98,125,121,13,157,7,0,0 ; vpbroadcastw 0x79d(%rip),%ymm9 # 1886 <_sk_xor__hsw_lowp+0x206>
+ DB 196,65,61,228,193 ; vpmulhuw %ymm9,%ymm8,%ymm8
+ DB 196,194,125,11,192 ; vpmulhrsw %ymm8,%ymm0,%ymm0
+ DB 196,226,125,29,192 ; vpabsw %ymm0,%ymm0
+ DB 196,98,125,121,13,135,7,0,0 ; vpbroadcastw 0x787(%rip),%ymm9 # 1888 <_sk_xor__hsw_lowp+0x208>
+ DB 196,65,53,249,200 ; vpsubw %ymm8,%ymm9,%ymm9
+ DB 196,66,93,11,209 ; vpmulhrsw %ymm9,%ymm4,%ymm10
+ DB 196,66,125,29,210 ; vpabsw %ymm10,%ymm10
+ DB 197,173,253,192 ; vpaddw %ymm0,%ymm10,%ymm0
+ DB 196,194,117,11,200 ; vpmulhrsw %ymm8,%ymm1,%ymm1
+ DB 196,226,125,29,201 ; vpabsw %ymm1,%ymm1
+ DB 196,66,85,11,209 ; vpmulhrsw %ymm9,%ymm5,%ymm10
+ DB 196,66,125,29,210 ; vpabsw %ymm10,%ymm10
+ DB 197,173,253,201 ; vpaddw %ymm1,%ymm10,%ymm1
+ DB 196,194,109,11,208 ; vpmulhrsw %ymm8,%ymm2,%ymm2
+ DB 196,226,125,29,210 ; vpabsw %ymm2,%ymm2
+ DB 196,66,77,11,209 ; vpmulhrsw %ymm9,%ymm6,%ymm10
+ DB 196,66,125,29,210 ; vpabsw %ymm10,%ymm10
+ DB 197,173,253,210 ; vpaddw %ymm2,%ymm10,%ymm2
+ DB 196,194,101,11,216 ; vpmulhrsw %ymm8,%ymm3,%ymm3
+ DB 196,226,125,29,219 ; vpabsw %ymm3,%ymm3
+ DB 196,66,69,11,193 ; vpmulhrsw %ymm9,%ymm7,%ymm8
+ DB 196,66,125,29,192 ; vpabsw %ymm8,%ymm8
+ DB 197,189,253,219 ; vpaddw %ymm3,%ymm8,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+ DB 69,137,193 ; mov %r8d,%r9d
+ DB 65,128,225,15 ; and $0xf,%r9b
+ DB 196,65,57,239,192 ; vpxor %xmm8,%xmm8,%xmm8
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,14 ; cmp $0xe,%r9b
+ DB 15,135,92,255,255,255 ; ja 10d5 <_sk_lerp_u8_hsw_lowp+0x14>
+ DB 69,15,182,201 ; movzbl %r9b,%r9d
+ DB 76,141,21,204,0,0,0 ; lea 0xcc(%rip),%r10 # 1250 <_sk_lerp_u8_hsw_lowp+0x18f>
+ DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax
+ DB 76,1,208 ; add %r10,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 65,15,182,4,19 ; movzbl (%r11,%rdx,1),%eax
+ DB 197,121,110,192 ; vmovd %eax,%xmm8
+ DB 233,58,255,255,255 ; jmpq 10d5 <_sk_lerp_u8_hsw_lowp+0x14>
+ DB 196,65,57,239,192 ; vpxor %xmm8,%xmm8,%xmm8
+ DB 196,67,57,32,68,19,2,2 ; vpinsrb $0x2,0x2(%r11,%rdx,1),%xmm8,%xmm8
+ DB 65,15,183,4,19 ; movzwl (%r11,%rdx,1),%eax
+ DB 197,121,110,200 ; vmovd %eax,%xmm9
+ DB 196,67,57,14,193,1 ; vpblendw $0x1,%xmm9,%xmm8,%xmm8
+ DB 233,25,255,255,255 ; jmpq 10d5 <_sk_lerp_u8_hsw_lowp+0x14>
+ DB 196,65,57,239,192 ; vpxor %xmm8,%xmm8,%xmm8
+ DB 196,67,57,32,68,19,6,6 ; vpinsrb $0x6,0x6(%r11,%rdx,1),%xmm8,%xmm8
+ DB 196,67,57,32,68,19,5,5 ; vpinsrb $0x5,0x5(%r11,%rdx,1),%xmm8,%xmm8
+ DB 196,67,57,32,68,19,4,4 ; vpinsrb $0x4,0x4(%r11,%rdx,1),%xmm8,%xmm8
+ DB 196,65,121,110,12,19 ; vmovd (%r11,%rdx,1),%xmm9
+ DB 196,67,57,2,193,1 ; vpblendd $0x1,%xmm9,%xmm8,%xmm8
+ DB 233,235,254,255,255 ; jmpq 10d5 <_sk_lerp_u8_hsw_lowp+0x14>
+ DB 196,65,57,239,192 ; vpxor %xmm8,%xmm8,%xmm8
+ DB 196,67,57,32,68,19,10,10 ; vpinsrb $0xa,0xa(%r11,%rdx,1),%xmm8,%xmm8
+ DB 196,67,57,32,68,19,9,9 ; vpinsrb $0x9,0x9(%r11,%rdx,1),%xmm8,%xmm8
+ DB 196,67,57,32,68,19,8,8 ; vpinsrb $0x8,0x8(%r11,%rdx,1),%xmm8,%xmm8
+ DB 196,65,122,126,12,19 ; vmovq (%r11,%rdx,1),%xmm9
+ DB 196,67,49,2,192,12 ; vpblendd $0xc,%xmm8,%xmm9,%xmm8
+ DB 233,189,254,255,255 ; jmpq 10d5 <_sk_lerp_u8_hsw_lowp+0x14>
+ DB 196,65,57,239,192 ; vpxor %xmm8,%xmm8,%xmm8
+ DB 196,67,57,32,68,19,14,14 ; vpinsrb $0xe,0xe(%r11,%rdx,1),%xmm8,%xmm8
+ DB 196,67,57,32,68,19,13,13 ; vpinsrb $0xd,0xd(%r11,%rdx,1),%xmm8,%xmm8
+ DB 196,67,57,32,68,19,12,12 ; vpinsrb $0xc,0xc(%r11,%rdx,1),%xmm8,%xmm8
+ DB 196,65,122,126,12,19 ; vmovq (%r11,%rdx,1),%xmm9
+ DB 196,67,49,34,76,19,8,2 ; vpinsrd $0x2,0x8(%r11,%rdx,1),%xmm9,%xmm9
+ DB 196,67,49,2,192,8 ; vpblendd $0x8,%xmm8,%xmm9,%xmm8
+ DB 233,135,254,255,255 ; jmpq 10d5 <_sk_lerp_u8_hsw_lowp+0x14>
+ DB 102,144 ; xchg %ax,%ax
+ DB 61,255,255,255,88 ; cmp $0x58ffffff,%eax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,75,255 ; decl -0x1(%rbx)
+ DB 255 ; (bad)
+ DB 255,137,255,255,255,129 ; decl -0x7e000001(%rcx)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 121,255 ; jns 1265 <_sk_lerp_u8_hsw_lowp+0x1a4>
+ DB 255 ; (bad)
+ DB 255,108,255,255 ; ljmp *-0x1(%rdi,%rdi,8)
+ DB 255,183,255,255,255,175 ; pushq -0x50000001(%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,167,255,255,255,154 ; jmpq *-0x65000001(%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,229 ; jmpq *%rbp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 221,255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,213 ; callq *%rbp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,200 ; dec %eax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
+
+PUBLIC _sk_swap_rb_hsw_lowp
+_sk_swap_rb_hsw_lowp LABEL PROC
+ DB 197,124,40,192 ; vmovaps %ymm0,%ymm8
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,252,40,194 ; vmovaps %ymm2,%ymm0
+ DB 197,124,41,194 ; vmovaps %ymm8,%ymm2
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_swap_hsw_lowp
+_sk_swap_hsw_lowp LABEL PROC
+ DB 197,124,40,195 ; vmovaps %ymm3,%ymm8
+ DB 197,124,40,202 ; vmovaps %ymm2,%ymm9
+ DB 197,124,40,209 ; vmovaps %ymm1,%ymm10
+ DB 197,124,40,216 ; vmovaps %ymm0,%ymm11
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,252,40,196 ; vmovaps %ymm4,%ymm0
+ DB 197,252,40,205 ; vmovaps %ymm5,%ymm1
+ DB 197,252,40,214 ; vmovaps %ymm6,%ymm2
+ DB 197,252,40,223 ; vmovaps %ymm7,%ymm3
+ DB 197,124,41,220 ; vmovaps %ymm11,%ymm4
+ DB 197,124,41,213 ; vmovaps %ymm10,%ymm5
+ DB 197,124,41,206 ; vmovaps %ymm9,%ymm6
+ DB 197,124,41,199 ; vmovaps %ymm8,%ymm7
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_move_src_dst_hsw_lowp
+_sk_move_src_dst_hsw_lowp LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,252,40,224 ; vmovaps %ymm0,%ymm4
+ DB 197,252,40,233 ; vmovaps %ymm1,%ymm5
+ DB 197,252,40,242 ; vmovaps %ymm2,%ymm6
+ DB 197,252,40,251 ; vmovaps %ymm3,%ymm7
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_move_dst_src_hsw_lowp
+_sk_move_dst_src_hsw_lowp LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,252,40,196 ; vmovaps %ymm4,%ymm0
+ DB 197,252,40,205 ; vmovaps %ymm5,%ymm1
+ DB 197,252,40,214 ; vmovaps %ymm6,%ymm2
+ DB 197,252,40,223 ; vmovaps %ymm7,%ymm3
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_clear_hsw_lowp
+_sk_clear_hsw_lowp LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,252,87,192 ; vxorps %ymm0,%ymm0,%ymm0
+ DB 197,244,87,201 ; vxorps %ymm1,%ymm1,%ymm1
+ DB 197,236,87,210 ; vxorps %ymm2,%ymm2,%ymm2
+ DB 197,228,87,219 ; vxorps %ymm3,%ymm3,%ymm3
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_srcatop_hsw_lowp
+_sk_srcatop_hsw_lowp LABEL PROC
+ DB 196,226,125,11,199 ; vpmulhrsw %ymm7,%ymm0,%ymm0
+ DB 196,226,125,29,192 ; vpabsw %ymm0,%ymm0
+ DB 196,98,125,121,5,107,5,0,0 ; vpbroadcastw 0x56b(%rip),%ymm8 # 188a <_sk_xor__hsw_lowp+0x20a>
+ DB 197,61,249,195 ; vpsubw %ymm3,%ymm8,%ymm8
+ DB 196,66,93,11,200 ; vpmulhrsw %ymm8,%ymm4,%ymm9
+ DB 196,66,125,29,201 ; vpabsw %ymm9,%ymm9
+ DB 197,181,253,192 ; vpaddw %ymm0,%ymm9,%ymm0
+ DB 196,226,117,11,207 ; vpmulhrsw %ymm7,%ymm1,%ymm1
+ DB 196,226,125,29,201 ; vpabsw %ymm1,%ymm1
+ DB 196,66,85,11,200 ; vpmulhrsw %ymm8,%ymm5,%ymm9
+ DB 196,66,125,29,201 ; vpabsw %ymm9,%ymm9
+ DB 197,181,253,201 ; vpaddw %ymm1,%ymm9,%ymm1
+ DB 196,226,109,11,215 ; vpmulhrsw %ymm7,%ymm2,%ymm2
+ DB 196,226,125,29,210 ; vpabsw %ymm2,%ymm2
+ DB 196,66,77,11,200 ; vpmulhrsw %ymm8,%ymm6,%ymm9
+ DB 196,66,125,29,201 ; vpabsw %ymm9,%ymm9
+ DB 197,181,253,210 ; vpaddw %ymm2,%ymm9,%ymm2
+ DB 196,226,101,11,223 ; vpmulhrsw %ymm7,%ymm3,%ymm3
+ DB 196,226,125,29,219 ; vpabsw %ymm3,%ymm3
+ DB 196,66,69,11,192 ; vpmulhrsw %ymm8,%ymm7,%ymm8
+ DB 196,66,125,29,192 ; vpabsw %ymm8,%ymm8
+ DB 197,189,253,219 ; vpaddw %ymm3,%ymm8,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_dstatop_hsw_lowp
+_sk_dstatop_hsw_lowp LABEL PROC
+ DB 196,98,93,11,195 ; vpmulhrsw %ymm3,%ymm4,%ymm8
+ DB 196,66,125,29,192 ; vpabsw %ymm8,%ymm8
+ DB 196,98,125,121,13,252,4,0,0 ; vpbroadcastw 0x4fc(%rip),%ymm9 # 188c <_sk_xor__hsw_lowp+0x20c>
+ DB 197,53,249,207 ; vpsubw %ymm7,%ymm9,%ymm9
+ DB 196,194,125,11,193 ; vpmulhrsw %ymm9,%ymm0,%ymm0
+ DB 196,226,125,29,192 ; vpabsw %ymm0,%ymm0
+ DB 196,193,125,253,192 ; vpaddw %ymm8,%ymm0,%ymm0
+ DB 196,98,85,11,195 ; vpmulhrsw %ymm3,%ymm5,%ymm8
+ DB 196,66,125,29,192 ; vpabsw %ymm8,%ymm8
+ DB 196,194,117,11,201 ; vpmulhrsw %ymm9,%ymm1,%ymm1
+ DB 196,226,125,29,201 ; vpabsw %ymm1,%ymm1
+ DB 196,193,117,253,200 ; vpaddw %ymm8,%ymm1,%ymm1
+ DB 196,98,77,11,195 ; vpmulhrsw %ymm3,%ymm6,%ymm8
+ DB 196,66,125,29,192 ; vpabsw %ymm8,%ymm8
+ DB 196,194,109,11,209 ; vpmulhrsw %ymm9,%ymm2,%ymm2
+ DB 196,226,125,29,210 ; vpabsw %ymm2,%ymm2
+ DB 196,193,109,253,208 ; vpaddw %ymm8,%ymm2,%ymm2
+ DB 196,98,69,11,195 ; vpmulhrsw %ymm3,%ymm7,%ymm8
+ DB 196,66,125,29,192 ; vpabsw %ymm8,%ymm8
+ DB 196,194,101,11,217 ; vpmulhrsw %ymm9,%ymm3,%ymm3
+ DB 196,226,125,29,219 ; vpabsw %ymm3,%ymm3
+ DB 196,193,101,253,216 ; vpaddw %ymm8,%ymm3,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_srcin_hsw_lowp
+_sk_srcin_hsw_lowp LABEL PROC
+ DB 196,226,125,11,199 ; vpmulhrsw %ymm7,%ymm0,%ymm0
+ DB 196,226,125,29,192 ; vpabsw %ymm0,%ymm0
+ DB 196,226,117,11,207 ; vpmulhrsw %ymm7,%ymm1,%ymm1
+ DB 196,226,125,29,201 ; vpabsw %ymm1,%ymm1
+ DB 196,226,109,11,215 ; vpmulhrsw %ymm7,%ymm2,%ymm2
+ DB 196,226,125,29,210 ; vpabsw %ymm2,%ymm2
+ DB 196,226,101,11,223 ; vpmulhrsw %ymm7,%ymm3,%ymm3
+ DB 196,226,125,29,219 ; vpabsw %ymm3,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_dstin_hsw_lowp
+_sk_dstin_hsw_lowp LABEL PROC
+ DB 196,226,93,11,195 ; vpmulhrsw %ymm3,%ymm4,%ymm0
+ DB 196,226,125,29,192 ; vpabsw %ymm0,%ymm0
+ DB 196,226,85,11,203 ; vpmulhrsw %ymm3,%ymm5,%ymm1
+ DB 196,226,125,29,201 ; vpabsw %ymm1,%ymm1
+ DB 196,226,77,11,211 ; vpmulhrsw %ymm3,%ymm6,%ymm2
+ DB 196,226,125,29,210 ; vpabsw %ymm2,%ymm2
+ DB 196,226,69,11,219 ; vpmulhrsw %ymm3,%ymm7,%ymm3
+ DB 196,226,125,29,219 ; vpabsw %ymm3,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_srcout_hsw_lowp
+_sk_srcout_hsw_lowp LABEL PROC
+ DB 196,98,125,121,5,59,4,0,0 ; vpbroadcastw 0x43b(%rip),%ymm8 # 188e <_sk_xor__hsw_lowp+0x20e>
+ DB 197,61,249,199 ; vpsubw %ymm7,%ymm8,%ymm8
+ DB 196,194,125,11,192 ; vpmulhrsw %ymm8,%ymm0,%ymm0
+ DB 196,226,125,29,192 ; vpabsw %ymm0,%ymm0
+ DB 196,194,117,11,200 ; vpmulhrsw %ymm8,%ymm1,%ymm1
+ DB 196,226,125,29,201 ; vpabsw %ymm1,%ymm1
+ DB 196,194,109,11,208 ; vpmulhrsw %ymm8,%ymm2,%ymm2
+ DB 196,226,125,29,210 ; vpabsw %ymm2,%ymm2
+ DB 196,194,101,11,216 ; vpmulhrsw %ymm8,%ymm3,%ymm3
+ DB 196,226,125,29,219 ; vpabsw %ymm3,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_dstout_hsw_lowp
+_sk_dstout_hsw_lowp LABEL PROC
+ DB 196,226,125,121,5,4,4,0,0 ; vpbroadcastw 0x404(%rip),%ymm0 # 1890 <_sk_xor__hsw_lowp+0x210>
+ DB 197,253,249,219 ; vpsubw %ymm3,%ymm0,%ymm3
+ DB 196,226,93,11,195 ; vpmulhrsw %ymm3,%ymm4,%ymm0
+ DB 196,226,125,29,192 ; vpabsw %ymm0,%ymm0
+ DB 196,226,85,11,203 ; vpmulhrsw %ymm3,%ymm5,%ymm1
+ DB 196,226,125,29,201 ; vpabsw %ymm1,%ymm1
+ DB 196,226,77,11,211 ; vpmulhrsw %ymm3,%ymm6,%ymm2
+ DB 196,226,125,29,210 ; vpabsw %ymm2,%ymm2
+ DB 196,226,69,11,219 ; vpmulhrsw %ymm3,%ymm7,%ymm3
+ DB 196,226,125,29,219 ; vpabsw %ymm3,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_srcover_hsw_lowp
+_sk_srcover_hsw_lowp LABEL PROC
+ DB 196,98,125,121,5,205,3,0,0 ; vpbroadcastw 0x3cd(%rip),%ymm8 # 1892 <_sk_xor__hsw_lowp+0x212>
+ DB 197,61,249,195 ; vpsubw %ymm3,%ymm8,%ymm8
+ DB 196,66,93,11,200 ; vpmulhrsw %ymm8,%ymm4,%ymm9
+ DB 196,66,125,29,201 ; vpabsw %ymm9,%ymm9
+ DB 197,181,253,192 ; vpaddw %ymm0,%ymm9,%ymm0
+ DB 196,66,85,11,200 ; vpmulhrsw %ymm8,%ymm5,%ymm9
+ DB 196,66,125,29,201 ; vpabsw %ymm9,%ymm9
+ DB 197,181,253,201 ; vpaddw %ymm1,%ymm9,%ymm1
+ DB 196,66,77,11,200 ; vpmulhrsw %ymm8,%ymm6,%ymm9
+ DB 196,66,125,29,201 ; vpabsw %ymm9,%ymm9
+ DB 197,181,253,210 ; vpaddw %ymm2,%ymm9,%ymm2
+ DB 196,66,69,11,192 ; vpmulhrsw %ymm8,%ymm7,%ymm8
+ DB 196,66,125,29,192 ; vpabsw %ymm8,%ymm8
+ DB 197,189,253,219 ; vpaddw %ymm3,%ymm8,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_dstover_hsw_lowp
+_sk_dstover_hsw_lowp LABEL PROC
+ DB 196,98,125,121,5,134,3,0,0 ; vpbroadcastw 0x386(%rip),%ymm8 # 1894 <_sk_xor__hsw_lowp+0x214>
+ DB 197,61,249,199 ; vpsubw %ymm7,%ymm8,%ymm8
+ DB 196,194,125,11,192 ; vpmulhrsw %ymm8,%ymm0,%ymm0
+ DB 196,226,125,29,192 ; vpabsw %ymm0,%ymm0
+ DB 197,253,253,196 ; vpaddw %ymm4,%ymm0,%ymm0
+ DB 196,194,117,11,200 ; vpmulhrsw %ymm8,%ymm1,%ymm1
+ DB 196,226,125,29,201 ; vpabsw %ymm1,%ymm1
+ DB 197,245,253,205 ; vpaddw %ymm5,%ymm1,%ymm1
+ DB 196,194,109,11,208 ; vpmulhrsw %ymm8,%ymm2,%ymm2
+ DB 196,226,125,29,210 ; vpabsw %ymm2,%ymm2
+ DB 197,237,253,214 ; vpaddw %ymm6,%ymm2,%ymm2
+ DB 196,194,101,11,216 ; vpmulhrsw %ymm8,%ymm3,%ymm3
+ DB 196,226,125,29,219 ; vpabsw %ymm3,%ymm3
+ DB 197,229,253,223 ; vpaddw %ymm7,%ymm3,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_modulate_hsw_lowp
+_sk_modulate_hsw_lowp LABEL PROC
+ DB 196,226,125,11,196 ; vpmulhrsw %ymm4,%ymm0,%ymm0
+ DB 196,226,125,29,192 ; vpabsw %ymm0,%ymm0
+ DB 196,226,117,11,205 ; vpmulhrsw %ymm5,%ymm1,%ymm1
+ DB 196,226,125,29,201 ; vpabsw %ymm1,%ymm1
+ DB 196,226,109,11,214 ; vpmulhrsw %ymm6,%ymm2,%ymm2
+ DB 196,226,125,29,210 ; vpabsw %ymm2,%ymm2
+ DB 196,226,101,11,223 ; vpmulhrsw %ymm7,%ymm3,%ymm3
+ DB 196,226,125,29,219 ; vpabsw %ymm3,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_multiply_hsw_lowp
+_sk_multiply_hsw_lowp LABEL PROC
+ DB 196,98,125,121,5,19,3,0,0 ; vpbroadcastw 0x313(%rip),%ymm8 # 1896 <_sk_xor__hsw_lowp+0x216>
+ DB 197,61,249,207 ; vpsubw %ymm7,%ymm8,%ymm9
+ DB 196,66,125,11,209 ; vpmulhrsw %ymm9,%ymm0,%ymm10
+ DB 196,66,125,29,210 ; vpabsw %ymm10,%ymm10
+ DB 197,61,249,195 ; vpsubw %ymm3,%ymm8,%ymm8
+ DB 196,66,93,11,216 ; vpmulhrsw %ymm8,%ymm4,%ymm11
+ DB 196,66,125,29,219 ; vpabsw %ymm11,%ymm11
+ DB 196,65,37,253,210 ; vpaddw %ymm10,%ymm11,%ymm10
+ DB 196,226,125,11,196 ; vpmulhrsw %ymm4,%ymm0,%ymm0
+ DB 196,226,125,29,192 ; vpabsw %ymm0,%ymm0
+ DB 197,173,253,192 ; vpaddw %ymm0,%ymm10,%ymm0
+ DB 196,66,117,11,209 ; vpmulhrsw %ymm9,%ymm1,%ymm10
+ DB 196,66,125,29,210 ; vpabsw %ymm10,%ymm10
+ DB 196,66,85,11,216 ; vpmulhrsw %ymm8,%ymm5,%ymm11
+ DB 196,66,125,29,219 ; vpabsw %ymm11,%ymm11
+ DB 196,65,37,253,210 ; vpaddw %ymm10,%ymm11,%ymm10
+ DB 196,226,117,11,205 ; vpmulhrsw %ymm5,%ymm1,%ymm1
+ DB 196,226,125,29,201 ; vpabsw %ymm1,%ymm1
+ DB 197,173,253,201 ; vpaddw %ymm1,%ymm10,%ymm1
+ DB 196,66,109,11,209 ; vpmulhrsw %ymm9,%ymm2,%ymm10
+ DB 196,66,125,29,210 ; vpabsw %ymm10,%ymm10
+ DB 196,66,77,11,216 ; vpmulhrsw %ymm8,%ymm6,%ymm11
+ DB 196,66,125,29,219 ; vpabsw %ymm11,%ymm11
+ DB 196,65,37,253,210 ; vpaddw %ymm10,%ymm11,%ymm10
+ DB 196,226,109,11,214 ; vpmulhrsw %ymm6,%ymm2,%ymm2
+ DB 196,226,125,29,210 ; vpabsw %ymm2,%ymm2
+ DB 197,173,253,210 ; vpaddw %ymm2,%ymm10,%ymm2
+ DB 196,66,101,11,201 ; vpmulhrsw %ymm9,%ymm3,%ymm9
+ DB 196,66,125,29,201 ; vpabsw %ymm9,%ymm9
+ DB 196,66,69,11,192 ; vpmulhrsw %ymm8,%ymm7,%ymm8
+ DB 196,66,125,29,192 ; vpabsw %ymm8,%ymm8
+ DB 196,65,61,253,193 ; vpaddw %ymm9,%ymm8,%ymm8
+ DB 196,226,101,11,223 ; vpmulhrsw %ymm7,%ymm3,%ymm3
+ DB 196,226,125,29,219 ; vpabsw %ymm3,%ymm3
+ DB 197,189,253,219 ; vpaddw %ymm3,%ymm8,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_screen_hsw_lowp
+_sk_screen_hsw_lowp LABEL PROC
+ DB 196,98,125,121,5,100,2,0,0 ; vpbroadcastw 0x264(%rip),%ymm8 # 1898 <_sk_xor__hsw_lowp+0x218>
+ DB 197,61,249,200 ; vpsubw %ymm0,%ymm8,%ymm9
+ DB 196,98,53,11,204 ; vpmulhrsw %ymm4,%ymm9,%ymm9
+ DB 196,66,125,29,201 ; vpabsw %ymm9,%ymm9
+ DB 197,181,253,192 ; vpaddw %ymm0,%ymm9,%ymm0
+ DB 197,61,249,201 ; vpsubw %ymm1,%ymm8,%ymm9
+ DB 196,98,53,11,205 ; vpmulhrsw %ymm5,%ymm9,%ymm9
+ DB 196,66,125,29,201 ; vpabsw %ymm9,%ymm9
+ DB 197,181,253,201 ; vpaddw %ymm1,%ymm9,%ymm1
+ DB 197,61,249,202 ; vpsubw %ymm2,%ymm8,%ymm9
+ DB 196,98,53,11,206 ; vpmulhrsw %ymm6,%ymm9,%ymm9
+ DB 196,66,125,29,201 ; vpabsw %ymm9,%ymm9
+ DB 197,181,253,210 ; vpaddw %ymm2,%ymm9,%ymm2
+ DB 197,61,249,195 ; vpsubw %ymm3,%ymm8,%ymm8
+ DB 196,98,61,11,199 ; vpmulhrsw %ymm7,%ymm8,%ymm8
+ DB 196,66,125,29,192 ; vpabsw %ymm8,%ymm8
+ DB 197,189,253,219 ; vpaddw %ymm3,%ymm8,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_xor__hsw_lowp
+_sk_xor__hsw_lowp LABEL PROC
+ DB 196,98,125,121,5,17,2,0,0 ; vpbroadcastw 0x211(%rip),%ymm8 # 189a <_sk_xor__hsw_lowp+0x21a>
+ DB 197,61,249,207 ; vpsubw %ymm7,%ymm8,%ymm9
+ DB 196,194,125,11,193 ; vpmulhrsw %ymm9,%ymm0,%ymm0
+ DB 196,226,125,29,192 ; vpabsw %ymm0,%ymm0
+ DB 197,61,249,195 ; vpsubw %ymm3,%ymm8,%ymm8
+ DB 196,66,93,11,208 ; vpmulhrsw %ymm8,%ymm4,%ymm10
+ DB 196,66,125,29,210 ; vpabsw %ymm10,%ymm10
+ DB 197,173,253,192 ; vpaddw %ymm0,%ymm10,%ymm0
+ DB 196,194,117,11,201 ; vpmulhrsw %ymm9,%ymm1,%ymm1
+ DB 196,226,125,29,201 ; vpabsw %ymm1,%ymm1
+ DB 196,66,85,11,208 ; vpmulhrsw %ymm8,%ymm5,%ymm10
+ DB 196,66,125,29,210 ; vpabsw %ymm10,%ymm10
+ DB 197,173,253,201 ; vpaddw %ymm1,%ymm10,%ymm1
+ DB 196,194,109,11,209 ; vpmulhrsw %ymm9,%ymm2,%ymm2
+ DB 196,226,125,29,210 ; vpabsw %ymm2,%ymm2
+ DB 196,66,77,11,208 ; vpmulhrsw %ymm8,%ymm6,%ymm10
+ DB 196,66,125,29,210 ; vpabsw %ymm10,%ymm10
+ DB 197,173,253,210 ; vpaddw %ymm2,%ymm10,%ymm2
+ DB 196,194,101,11,217 ; vpmulhrsw %ymm9,%ymm3,%ymm3
+ DB 196,226,125,29,219 ; vpabsw %ymm3,%ymm3
+ DB 196,66,69,11,192 ; vpmulhrsw %ymm8,%ymm7,%ymm8
+ DB 196,66,125,29,192 ; vpabsw %ymm8,%ymm8
+ DB 197,189,253,219 ; vpaddw %ymm3,%ymm8,%ymm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+ALIGN 4
+ DB 0,0 ; add %al,(%rax)
+ DB 128,67,0,0 ; addb $0x0,0x0(%rbx)
+ DB 128,67,0,0 ; addb $0x0,0x0(%rbx)
+ DB 128,67,0,0 ; addb $0x0,0x0(%rbx)
+ DB 128 ; .byte 0x80
+ DB 67 ; rex.XB
+
+ALIGN 32
+ DB 0,1 ; add %al,(%rcx)
+ DB 4,5 ; add $0x5,%al
+ DB 8,9 ; or %cl,(%rcx)
+ DB 12,13 ; or $0xd,%al
+ DB 128,128,128,128,128,128,128 ; addb $0x80,-0x7f7f7f80(%rax)
+ DB 128,0,1 ; addb $0x1,(%rax)
+ DB 4,5 ; add $0x5,%al
+ DB 8,9 ; or %cl,(%rcx)
+ DB 12,13 ; or $0xd,%al
+ DB 128,128,128,128,128,128,128 ; addb $0x80,-0x7f7f7f80(%rax)
+ DB 128,129,128,0,0,0,0 ; addb $0x0,0x80(%rcx)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 1,2 ; add %eax,(%rdx)
+ DB 5,6,9,10,13 ; add $0xd0a0906,%eax
+ DB 14 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,17 ; callq *(%rcx)
+ DB 18,21,22,25,26,29 ; adc 0x1d1a1916(%rip),%dl # 1d1a308d <_sk_xor__hsw_lowp+0x1d1a1a0d>
+ DB 30 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,2 ; incl (%rdx)
+ DB 3,6 ; add (%rsi),%eax
+ DB 7 ; (bad)
+ DB 10,11 ; or (%rbx),%cl
+ DB 14 ; (bad)
+ DB 15,255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,18 ; callq *(%rdx)
+ DB 19,22 ; adc (%rsi),%edx
+ DB 23 ; (bad)
+ DB 26,27 ; sbb (%rbx),%bl
+ DB 30 ; (bad)
+ DB 31 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,3 ; incl (%rbx)
+ DB 255,7 ; incl (%rdi)
+ DB 255,11 ; decl (%rbx)
+ DB 255,15 ; decl (%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,19 ; callq *(%rbx)
+ DB 255,23 ; callq *(%rdi)
+ DB 255,27 ; lcall *(%rbx)
+ DB 255,31 ; lcall *(%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,0 ; incl (%rax)
+ DB 129,128,129,128,0,128,0,0,0,0 ; addl $0x0,-0x7fff7f7f(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,1 ; add %al,(%rcx)
+ DB 4,5 ; add $0x5,%al
+ DB 8,9 ; or %cl,(%rcx)
+ DB 12,13 ; or $0xd,%al
+ DB 128,128,128,128,128,128,128 ; addb $0x80,-0x7f7f7f80(%rax)
+ DB 128,0,1 ; addb $0x1,(%rax)
+ DB 4,5 ; add $0x5,%al
+ DB 8,9 ; or %cl,(%rcx)
+ DB 12,13 ; or $0xd,%al
+ DB 128,128,128,128,128,128,128 ; addb $0x80,-0x7f7f7f80(%rax)
+ DB 128,255,0 ; cmp $0x0,%bh
+ DB 129,128,0,0,0,0,0,0,0,0 ; addl $0x0,0x0(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 0,0 ; add %al,(%rax)
+ DB 1,2 ; add %eax,(%rdx)
+ DB 5,6,9,10,13 ; add $0xd0a0906,%eax
+ DB 14 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,17 ; callq *(%rcx)
+ DB 18,21,22,25,26,29 ; adc 0x1d1a1916(%rip),%dl # 1d1a314d <_sk_xor__hsw_lowp+0x1d1a1acd>
+ DB 30 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,2 ; incl (%rdx)
+ DB 3,6 ; add (%rsi),%eax
+ DB 7 ; (bad)
+ DB 10,11 ; or (%rbx),%cl
+ DB 14 ; (bad)
+ DB 15,255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,18 ; callq *(%rdx)
+ DB 19,22 ; adc (%rsi),%edx
+ DB 23 ; (bad)
+ DB 26,27 ; sbb (%rbx),%bl
+ DB 30 ; (bad)
+ DB 31 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,3 ; incl (%rbx)
+ DB 255,7 ; incl (%rdi)
+ DB 255,11 ; decl (%rbx)
+ DB 255,15 ; decl (%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,19 ; callq *(%rbx)
+ DB 255,23 ; callq *(%rdi)
+ DB 255,27 ; lcall *(%rbx)
+ DB 255,31 ; lcall *(%rdi)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,0 ; incl (%rax)
+ DB 128,129,128,0,128,129,128 ; addb $0x80,-0x7e7fff80(%rcx)
+ DB 0,128,0,128,0,128 ; add %al,-0x7fff8000(%rax)
+ DB 0,128,0,128,0,128 ; add %al,-0x7fff8000(%rax)
+ DB 0,128,0,128,0,128 ; add %al,-0x7fff8000(%rax)
+ DB 0 ; .byte 0x0
+ DB 128 ; .byte 0x80
+ALIGN 32
+
PUBLIC _sk_start_pipeline_ssse3_lowp
_sk_start_pipeline_ssse3_lowp LABEL PROC
DB 85 ; push %rbp
@@ -27020,13 +28709,13 @@ _sk_load_a8_ssse3_lowp LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
DB 76,139,24 ; mov (%rax),%r11
DB 77,133,192 ; test %r8,%r8
- DB 117,36 ; jne 51a <_sk_load_a8_ssse3_lowp+0x2e>
+ DB 117,37 ; jne 51b <_sk_load_a8_ssse3_lowp+0x2f>
DB 243,65,15,126,28,19 ; movq (%r11,%rdx,1),%xmm3
DB 102,15,96,216 ; punpcklbw %xmm0,%xmm3
DB 102,15,113,243,8 ; psllw $0x8,%xmm3
DB 102,15,228,29,51,15,0,0 ; pmulhuw 0xf33(%rip),%xmm3 # 1440 <_sk_xor__ssse3_lowp+0x10f>
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 15,87,192 ; xorps %xmm0,%xmm0
+ DB 102,15,87,192 ; xorpd %xmm0,%xmm0
DB 15,87,201 ; xorps %xmm1,%xmm1
DB 15,87,210 ; xorps %xmm2,%xmm2
DB 255,224 ; jmpq *%rax
@@ -27035,15 +28724,15 @@ _sk_load_a8_ssse3_lowp LABEL PROC
DB 102,15,239,219 ; pxor %xmm3,%xmm3
DB 65,254,201 ; dec %r9b
DB 65,128,249,6 ; cmp $0x6,%r9b
- DB 119,210 ; ja 500 <_sk_load_a8_ssse3_lowp+0x14>
+ DB 119,209 ; ja 500 <_sk_load_a8_ssse3_lowp+0x14>
DB 69,15,182,201 ; movzbl %r9b,%r9d
- DB 76,141,21,111,0,0,0 ; lea 0x6f(%rip),%r10 # 5a8 <_sk_load_a8_ssse3_lowp+0xbc>
+ DB 76,141,21,110,0,0,0 ; lea 0x6e(%rip),%r10 # 5a8 <_sk_load_a8_ssse3_lowp+0xbc>
DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax
DB 76,1,208 ; add %r10,%rax
DB 255,224 ; jmpq *%rax
DB 65,15,182,4,19 ; movzbl (%r11,%rdx,1),%eax
DB 102,15,110,216 ; movd %eax,%xmm3
- DB 235,179 ; jmp 500 <_sk_load_a8_ssse3_lowp+0x14>
+ DB 235,178 ; jmp 500 <_sk_load_a8_ssse3_lowp+0x14>
DB 65,15,182,68,19,2 ; movzbl 0x2(%r11,%rdx,1),%eax
DB 102,15,239,219 ; pxor %xmm3,%xmm3
DB 102,15,196,216,2 ; pinsrw $0x2,%eax,%xmm3
@@ -27051,7 +28740,7 @@ _sk_load_a8_ssse3_lowp LABEL PROC
DB 102,15,110,192 ; movd %eax,%xmm0
DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
DB 243,15,16,216 ; movss %xmm0,%xmm3
- DB 235,145 ; jmp 500 <_sk_load_a8_ssse3_lowp+0x14>
+ DB 235,144 ; jmp 500 <_sk_load_a8_ssse3_lowp+0x14>
DB 65,15,182,68,19,6 ; movzbl 0x6(%r11,%rdx,1),%eax
DB 102,15,239,219 ; pxor %xmm3,%xmm3
DB 102,15,196,216,6 ; pinsrw $0x6,%eax,%xmm3
@@ -27062,24 +28751,24 @@ _sk_load_a8_ssse3_lowp LABEL PROC
DB 102,65,15,110,4,19 ; movd (%r11,%rdx,1),%xmm0
DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
DB 242,15,16,216 ; movsd %xmm0,%xmm3
- DB 233,89,255,255,255 ; jmpq 500 <_sk_load_a8_ssse3_lowp+0x14>
- DB 144 ; nop
- DB 154 ; (bad)
+ DB 233,88,255,255,255 ; jmpq 500 <_sk_load_a8_ssse3_lowp+0x14>
+ DB 155 ; fwait
DB 255 ; (bad)
DB 255 ; (bad)
- DB 255,180,255,255,255,165,255 ; pushq -0x5a0001(%rdi,%rdi,8)
+ DB 255,181,255,255,255,166 ; pushq -0x59000001(%rbp)
DB 255 ; (bad)
DB 255 ; (bad)
- DB 236 ; in (%dx),%al
DB 255 ; (bad)
+ DB 237 ; in (%dx),%eax
DB 255 ; (bad)
- DB 255,225 ; jmpq *%rcx
DB 255 ; (bad)
+ DB 255,226 ; jmpq *%rdx
DB 255 ; (bad)
- DB 255,214 ; callq *%rsi
+ DB 255 ; (bad)
+ DB 255,215 ; callq *%rdi
DB 255 ; (bad)
DB 255 ; (bad)
- DB 255,199 ; inc %edi
+ DB 255,200 ; dec %eax
DB 255 ; (bad)
DB 255 ; (bad)
DB 255 ; .byte 0xff
@@ -27109,45 +28798,46 @@ _sk_store_a8_ssse3_lowp LABEL PROC
DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax
DB 76,1,208 ; add %r10,%rax
DB 255,224 ; jmpq *%rax
- DB 102,68,15,127,68,36,64 ; movdqa %xmm8,0x40(%rsp)
- DB 138,68,36,64 ; mov 0x40(%rsp),%al
+ DB 102,68,15,127,4,36 ; movdqa %xmm8,(%rsp)
+ DB 138,4,36 ; mov (%rsp),%al
DB 65,136,4,19 ; mov %al,(%r11,%rdx,1)
- DB 235,190 ; jmp 5e8 <_sk_store_a8_ssse3_lowp+0x24>
- DB 102,68,15,127,68,36,48 ; movdqa %xmm8,0x30(%rsp)
- DB 138,68,36,52 ; mov 0x34(%rsp),%al
+ DB 235,192 ; jmp 5e8 <_sk_store_a8_ssse3_lowp+0x24>
+ DB 102,68,15,127,68,36,16 ; movdqa %xmm8,0x10(%rsp)
+ DB 138,68,36,20 ; mov 0x14(%rsp),%al
DB 65,136,68,19,2 ; mov %al,0x2(%r11,%rdx,1)
- DB 102,68,15,56,0,5,12,14,0,0 ; pshufb 0xe0c(%rip),%xmm8 # 1450 <_sk_xor__ssse3_lowp+0x11f>
+ DB 102,68,15,56,0,5,30,14,0,0 ; pshufb 0xe1e(%rip),%xmm8 # 1460 <_sk_xor__ssse3_lowp+0x12f>
DB 102,68,15,126,192 ; movd %xmm8,%eax
DB 102,65,137,4,19 ; mov %ax,(%r11,%rdx,1)
- DB 235,152 ; jmp 5e8 <_sk_store_a8_ssse3_lowp+0x24>
- DB 102,68,15,127,68,36,32 ; movdqa %xmm8,0x20(%rsp)
- DB 138,68,36,44 ; mov 0x2c(%rsp),%al
+ DB 235,154 ; jmp 5e8 <_sk_store_a8_ssse3_lowp+0x24>
+ DB 102,68,15,127,68,36,64 ; movdqa %xmm8,0x40(%rsp)
+ DB 138,68,36,76 ; mov 0x4c(%rsp),%al
DB 65,136,68,19,6 ; mov %al,0x6(%r11,%rdx,1)
- DB 102,68,15,127,68,36,16 ; movdqa %xmm8,0x10(%rsp)
- DB 138,68,36,26 ; mov 0x1a(%rsp),%al
+ DB 102,68,15,127,68,36,48 ; movdqa %xmm8,0x30(%rsp)
+ DB 138,68,36,58 ; mov 0x3a(%rsp),%al
DB 65,136,68,19,5 ; mov %al,0x5(%r11,%rdx,1)
- DB 102,68,15,127,4,36 ; movdqa %xmm8,(%rsp)
- DB 138,68,36,8 ; mov 0x8(%rsp),%al
+ DB 102,68,15,127,68,36,32 ; movdqa %xmm8,0x20(%rsp)
+ DB 138,68,36,40 ; mov 0x28(%rsp),%al
DB 65,136,68,19,4 ; mov %al,0x4(%r11,%rdx,1)
- DB 102,68,15,56,0,5,215,13,0,0 ; pshufb 0xdd7(%rip),%xmm8 # 1460 <_sk_xor__ssse3_lowp+0x12f>
+ DB 102,68,15,56,0,5,200,13,0,0 ; pshufb 0xdc8(%rip),%xmm8 # 1450 <_sk_xor__ssse3_lowp+0x11f>
DB 102,69,15,126,4,19 ; movd %xmm8,(%r11,%rdx,1)
- DB 233,84,255,255,255 ; jmpq 5e8 <_sk_store_a8_ssse3_lowp+0x24>
+ DB 233,85,255,255,255 ; jmpq 5e8 <_sk_store_a8_ssse3_lowp+0x24>
+ DB 144 ; nop
DB 133,255 ; test %edi,%edi
DB 255 ; (bad)
- DB 255,166,255,255,255,150 ; jmpq *-0x69000001(%rsi)
+ DB 255,164,255,255,255,148,255 ; jmpq *-0x6b0001(%rdi,%rdi,8)
DB 255 ; (bad)
DB 255 ; (bad)
+ DB 234 ; (bad)
DB 255 ; (bad)
- DB 235,255 ; jmp 6a1 <_sk_store_a8_ssse3_lowp+0xdd>
DB 255 ; (bad)
DB 255 ; (bad)
- DB 220,255 ; fdivr %st,%st(7)
+ DB 218,255 ; (bad)
DB 255 ; (bad)
- DB 255,204 ; dec %esp
+ DB 255,202 ; dec %edx
DB 255 ; (bad)
DB 255 ; (bad)
DB 255 ; (bad)
- DB 188 ; .byte 0xbc
+ DB 186 ; .byte 0xba
DB 255 ; (bad)
DB 255 ; (bad)
DB 255 ; .byte 0xff
@@ -28014,7 +29704,7 @@ ALIGN 16
DB 255,0 ; incl (%rax)
DB 255,0 ; incl (%rax)
DB 129,128,129,128,129,128,129,128,129,128; addl $0x80818081,-0x7f7e7f7f(%rax)
- DB 129,128,129,128,129,128,0,2,0,0 ; addl $0x200,-0x7f7e7f7f(%rax)
+ DB 129,128,129,128,129,128,0,2,4,6 ; addl $0x6040200,-0x7f7e7f7f(%rax)
DB 0,0 ; add %al,(%rax)
DB 0,0 ; add %al,(%rax)
DB 0,0 ; add %al,(%rax)
@@ -28022,7 +29712,7 @@ ALIGN 16
DB 0,0 ; add %al,(%rax)
DB 0,0 ; add %al,(%rax)
DB 0,2 ; add %al,(%rdx)
- DB 4,6 ; add $0x6,%al
+ DB 0,0 ; add %al,(%rax)
DB 0,0 ; add %al,(%rax)
DB 0,0 ; add %al,(%rax)
DB 0,0 ; add %al,(%rax)
diff --git a/src/jumper/SkJumper_stages_lowp.cpp b/src/jumper/SkJumper_stages_lowp.cpp
index d6adc39af6..38dd53079b 100644
--- a/src/jumper/SkJumper_stages_lowp.cpp
+++ b/src/jumper/SkJumper_stages_lowp.cpp
@@ -9,16 +9,22 @@
#include "SkJumper_misc.h"
#include <immintrin.h>
-#if !defined(__SSSE3__) || !defined(__clang__) || !defined(__x86_64__)
- #error "We're starting with just SSSE3 x86-64 for now, and will always require Clang."
+#if !defined(__clang__) || !defined(__x86_64__)
+ #error "We're starting with just x86-64 for now, and will always require Clang."
#endif
-#define WRAP(name) sk_##name##_ssse3_lowp
-
using K = const SkJumper_constants;
-static const size_t kStride = 8;
-template <typename T> using V = T __attribute__((ext_vector_type(8)));
+#if defined(__AVX2__)
+ #define WRAP(name) sk_##name##_hsw_lowp
+ template <typename T> using V = T __attribute__((ext_vector_type(16)));
+ static const size_t kStride = 16;
+#else
+ #define WRAP(name) sk_##name##_ssse3_lowp
+ template <typename T> using V = T __attribute__((ext_vector_type(8)));
+ static const size_t kStride = 8;
+#endif
+
using U8 = V<uint8_t>;
using U16 = V<uint16_t>;
using U32 = V<uint32_t>;
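An aside on how one source file serves both widths: Clang's ext_vector_type attribute gives ordinary arithmetic operators on these lane types, so the identical stage code compiles to 8-lane SSE or 16-lane AVX2 purely from the typedefs above. A standalone sketch with hypothetical names, not part of the patch:

    #include <cstdint>

    template <typename T> using Wide = T __attribute__((ext_vector_type(16)));
    using U16x16 = Wide<uint16_t>;   // 32 bytes, one AVX2 register's worth of 16-bit lanes

    static U16x16 add(U16x16 a, U16x16 b) {
        return a + b;                // built with -mavx2, Clang lowers this to a single vpaddw
    }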
@@ -40,7 +46,14 @@ struct F {
SI F operator+(F x, F y) { return x.vec + y.vec; }
SI F operator-(F x, F y) { return x.vec - y.vec; }
-SI F operator*(F x, F y) { return _mm_abs_epi16(_mm_mulhrs_epi16(x.vec, y.vec)); }
+SI F operator*(F x, F y) {
+#if defined(__AVX2__)
+ return _mm256_abs_epi16(_mm256_mulhrs_epi16(x.vec, y.vec));
+#else
+ return _mm_abs_epi16(_mm_mulhrs_epi16(x.vec, y.vec));
+#endif
+}
+
SI F mad(F f, F m, F a) { return f*m+a; }
SI F inv(F v) { return 1.0f - v; }
SI F two(F v) { return v + v; }
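The multiply above is the heart of the lowp format: channels live in [0, 32768], a 1.15 fixed-point representation of [0.0, 1.0], and vpmulhrsw gives a rounded product divided by 32768, with the trailing vpabsw patching up the one overflowing case, 32768 times 32768. A scalar model of a single lane, assuming only the documented behavior of the intrinsics:

    #include <cstdint>
    #include <cstdlib>

    // What _mm_mulhrs_epi16 computes per lane: round(a*b / 32768) on signed 16-bit inputs.
    static int mulhrs16(int16_t a, int16_t b) {
        return (int16_t)((int32_t(a) * int32_t(b) + (1 << 14)) >> 15);
    }

    // One lane of operator*: 32768*32768 rounds to -32768 in 16 bits; abs restores +32768.
    static uint16_t lowp_mul(uint16_t a, uint16_t b) {   // a, b in [0, 32768]
        return (uint16_t)std::abs(mulhrs16((int16_t)a, (int16_t)b));
    }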
@@ -51,6 +64,11 @@ SI F operator>>(F x, int bits) { return x.vec >> bits; }
using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,F, F,F,F,F);
+#if defined(__AVX__)
+ // We really want to make sure all paths go through this function's (implicit) vzeroupper.
+ // If they don't, we'll experience severe slowdowns when we first use SSE instructions again.
+ __attribute__((disable_tail_calls))
+#endif
MAYBE_MSABI
extern "C" size_t WRAP(start_pipeline)(size_t x, size_t y, size_t limit, void** program, K* k) {
F v{};
@@ -88,13 +106,21 @@ SI V load(const T* src, size_t tail) {
if (__builtin_expect(tail, 0)) {
V v{}; // Any inactive lanes are zeroed.
switch (tail) {
- case 7: v[6] = src[6];
- case 6: v[5] = src[5];
- case 5: v[4] = src[4];
- case 4: memcpy(&v, src, 4*sizeof(T)); break;
- case 3: v[2] = src[2];
- case 2: memcpy(&v, src, 2*sizeof(T)); break;
- case 1: memcpy(&v, src, 1*sizeof(T)); break;
+ case 15: v[14] = src[14];
+ case 14: v[13] = src[13];
+ case 13: v[12] = src[12];
+ case 12: memcpy(&v, src, 12*sizeof(T)); break;
+ case 11: v[10] = src[10];
+ case 10: v[ 9] = src[ 9];
+ case 9: v[ 8] = src[ 8];
+ case 8: memcpy(&v, src, 8*sizeof(T)); break;
+ case 7: v[6] = src[6];
+ case 6: v[5] = src[5];
+ case 5: v[4] = src[4];
+ case 4: memcpy(&v, src, 4*sizeof(T)); break;
+ case 3: v[2] = src[2];
+ case 2: memcpy(&v, src, 2*sizeof(T)); break;
+ case 1: memcpy(&v, src, 1*sizeof(T)); break;
}
return v;
}
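The switch is deliberately fall-through: a tail of, say, 6 writes v[5] and v[4] and then lets case 4 memcpy the first four elements, so exactly six lanes are initialized and the rest stay zero; store() below runs the same ladder in the other direction. The same pattern at a toy width of 4, as a self-contained sketch rather than the real helper:

    #include <cstdint>
    #include <cstring>

    // Fall-through partial load: copies exactly `tail` elements (tail < 4), zeroes the rest.
    static void partial_load4(const uint16_t* src, uint16_t v[4], int tail) {
        std::memset(v, 0, 4 * sizeof(uint16_t));
        switch (tail) {
            case 3: v[2] = src[2];                               // falls through
            case 2: std::memcpy(v, src, 2 * sizeof(uint16_t)); break;
            case 1: std::memcpy(v, src, 1 * sizeof(uint16_t)); break;
        }
    }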
@@ -106,25 +132,39 @@ SI void store(T* dst, V v, size_t tail) {
__builtin_assume(tail < kStride);
if (__builtin_expect(tail, 0)) {
switch (tail) {
- case 7: dst[6] = v[6];
- case 6: dst[5] = v[5];
- case 5: dst[4] = v[4];
- case 4: memcpy(dst, &v, 4*sizeof(T)); break;
- case 3: dst[2] = v[2];
- case 2: memcpy(dst, &v, 2*sizeof(T)); break;
- case 1: memcpy(dst, &v, 1*sizeof(T)); break;
+ case 15: dst[14] = v[14];
+ case 14: dst[13] = v[13];
+ case 13: dst[12] = v[12];
+ case 12: memcpy(dst, &v, 12*sizeof(T)); break;
+ case 11: dst[10] = v[10];
+ case 10: dst[ 9] = v[ 9];
+ case 9: dst[ 8] = v[ 8];
+ case 8: memcpy(dst, &v, 8*sizeof(T)); break;
+ case 7: dst[6] = v[6];
+ case 6: dst[5] = v[5];
+ case 5: dst[4] = v[4];
+ case 4: memcpy(dst, &v, 4*sizeof(T)); break;
+ case 3: dst[2] = v[2];
+ case 2: memcpy(dst, &v, 2*sizeof(T)); break;
+ case 1: memcpy(dst, &v, 1*sizeof(T)); break;
}
return;
}
unaligned_store(dst, v);
}
+// TODO: mask loads and stores with AVX2
+
// Scale from [0,255] up to [0,32768].
SI F from_wide_byte(U16 bytes) {
// Ideally we'd scale by 32768/255 = 128.50196, but instead we'll approximate
// that a little more cheaply as 256*32897/65536 = 128.50391.
// 0 and 255 map to 0 and 32768 correctly, and nothing else is off by more than 1 bit.
- return _mm_mulhi_epu16(bytes << 8, U16(32897));
+#if defined(__AVX2__)
+ return _mm256_mulhi_epu16(bytes << 8, U16(32897));
+#else
+ return _mm_mulhi_epu16(bytes << 8, U16(32897));
+#endif
}
SI F from_byte(U8 bytes) {
return from_wide_byte(__builtin_convertvector(bytes, U16));
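The 32897 constant in from_wide_byte() is easy to sanity-check with plain integer arithmetic: the mulhi of (x << 8) against 32897 is (x * 256 * 32897) >> 16, i.e. a multiply by roughly 128.504. A scalar sketch of one lane, with the endpoint values worked out by hand:

    #include <cassert>
    #include <cstdint>

    // Scalar equivalent of one lane of from_wide_byte(): _mm_mulhi_epu16(x << 8, 32897).
    static uint16_t byte_to_wide(uint8_t x) {
        return (uint16_t)(((uint32_t)x << 8) * 32897u >> 16);
    }

    static void check_endpoints() {
        assert(byte_to_wide(0)   == 0);       // 0 stays 0
        assert(byte_to_wide(255) == 32768);   // 255 lands exactly on "1.0"
    }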
@@ -133,13 +173,22 @@ SI F from_byte(U8 bytes) {
// Pack from [0,32768] down to [0,255].
SI U16 to_wide_byte(F v) {
// The simplest thing works great: divide by 128 and saturate.
- return _mm_min_epi16(v>>7, U16(255));
+#if defined(__AVX2__)
+ return _mm256_min_epi16(v >> 7, U16(255));
+#else
+ return _mm_min_epi16(v >> 7, U16(255));
+#endif
}
SI U8 to_byte(F v) {
// Like to_wide_byte(), but we'll bake the saturation into the 16->8 bit pack.
+#if defined(__AVX2__)
+ return _mm_packus_epi16(_mm256_extracti128_si256(v >> 7, 0),
+ _mm256_extracti128_si256(v >> 7, 1));
+#else
// Only the bottom 8 bytes are of interest... it doesn't matter what we pack on top.
- __m128i packed = _mm_packus_epi16(v>>7, v>>7);
+ __m128i packed = _mm_packus_epi16(v >> 7, v >> 7);
return unaligned_load<U8>(&packed);
+#endif
}
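Going back down is just as mechanical: shift right by 7 (divide by 128) and clamp, which is what the vpminsw and saturating vpackuswb forms above compute per lane. Since the scale-up multiplies by about 128.504 and the scale-down divides by 128, a byte should survive the round trip exactly. A scalar sketch of one lane:

    #include <algorithm>
    #include <cstdint>

    // Scalar equivalent of one lane of to_wide_byte()/to_byte(), for v in [0, 32768].
    static uint8_t wide_to_byte(uint16_t v) {
        return (uint8_t)std::min<uint32_t>(v >> 7, 255);   // 32768 >> 7 == 256, clamped to 255
    }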
SI void from_8888(U32 rgba, F* r, F* g, F* b, F* a) {
diff --git a/src/jumper/build_stages.py b/src/jumper/build_stages.py
index 02ebab0110..ee6f7da7ba 100755
--- a/src/jumper/build_stages.py
+++ b/src/jumper/build_stages.py
@@ -60,6 +60,12 @@ subprocess.check_call(clang + cflags + hsw +
subprocess.check_call(clang + cflags + hsw + win +
['-c', 'src/jumper/SkJumper_stages.cpp'] +
['-o', 'win_hsw.o'])
+subprocess.check_call(clang + cflags + hsw +
+ ['-c', 'src/jumper/SkJumper_stages_lowp.cpp'] +
+ ['-o', 'lowp_hsw.o'])
+subprocess.check_call(clang + cflags + hsw + win +
+ ['-c', 'src/jumper/SkJumper_stages_lowp.cpp'] +
+ ['-o', 'win_lowp_hsw.o'])
aarch64 = [ '--target=aarch64' ]
subprocess.check_call(clang + cflags + aarch64 +
@@ -196,6 +202,8 @@ parse_object_file('sse41.o', '.byte')
print 'BALIGN32'
parse_object_file('sse2.o', '.byte')
print 'BALIGN32'
+parse_object_file('lowp_hsw.o', '.byte')
+print 'BALIGN32'
parse_object_file('lowp_ssse3.o', '.byte')
print '#endif'
@@ -221,6 +229,8 @@ parse_object_file('win_sse41.o', 'DB')
print 'ALIGN 32'
parse_object_file('win_sse2.o', 'DB')
print 'ALIGN 32'
+parse_object_file('win_lowp_hsw.o', 'DB')
+print 'ALIGN 32'
parse_object_file('win_lowp_ssse3.o', 'DB')
print 'ENDIF'
print 'END'