aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
authorGravatar Mike Klein <mtklein@chromium.org>2017-04-12 12:52:48 -0400
committerGravatar Skia Commit-Bot <skia-commit-bot@chromium.org>2017-04-12 18:57:09 +0000
commit0a9044950c1caa1b9dc0c2837889850d044d1d34 (patch)
tree42dcf8677e42006eb560b03b7ed5d0bcdd61f092 /src
parent50130e427c4d02405a38e26c4f020159e6ac295a (diff)
jumper, bilinear and bicubic sampling stages
This splits SkImageShaderContext into three parts: - SkJumper_GatherCtx: always, already done - SkJumper_SamplerCtx: when bilinear or bicubic - MiscCtx: other little bits (the matrix, paint color, tiling limits) Thanks for the snazzy allocator that allows this Herb! Both SkJumper and SkRasterPipeline_opts.h should be speaking all the same types now. I've copied the comments about bilinear/bicubic to SkJumper with little typo fixes and clarifications. Change-Id: I4ba7b7c02feba3f65f5292169a22c060e34933c6 Reviewed-on: https://skia-review.googlesource.com/13269 Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src')
-rw-r--r--src/image/SkImageShader.cpp79
-rw-r--r--src/image/SkImageShaderContext.h37
-rw-r--r--src/jumper/SkJumper.cpp5
-rw-r--r--src/jumper/SkJumper.h14
-rw-r--r--src/jumper/SkJumper_generated.S1939
-rw-r--r--src/jumper/SkJumper_generated_win.S1297
-rw-r--r--src/jumper/SkJumper_stages.cpp112
-rw-r--r--src/opts/SkRasterPipeline_opts.h37
8 files changed, 3428 insertions, 92 deletions
diff --git a/src/image/SkImageShader.cpp b/src/image/SkImageShader.cpp
index cbba65efa8..ce0d6f504b 100644
--- a/src/image/SkImageShader.cpp
+++ b/src/image/SkImageShader.cpp
@@ -13,10 +13,10 @@
#include "SkEmptyShader.h"
#include "SkImage_Base.h"
#include "SkImageShader.h"
-#include "SkImageShaderContext.h"
#include "SkPM4fPriv.h"
#include "SkReadBuffer.h"
#include "SkWriteBuffer.h"
+#include "../jumper/SkJumper.h"
SkImageShader::SkImageShader(sk_sp<SkImage> img, TileMode tmx, TileMode tmy, const SkMatrix* matrix)
: INHERITED(matrix)
@@ -257,41 +257,51 @@ bool SkImageShader::onAppendStages(SkRasterPipeline* p, SkColorSpace* dst, SkAre
}
}
- auto ctx = scratch->make<SkImageShaderContext>();
- ctx->pixels = pm.addr();
- ctx->ctable = pm.ctable() ? pm.ctable()->readColors() : nullptr;
- ctx->stride = pm.rowBytesAsPixels();
- ctx->color4f = SkColor4f_from_SkColor(paint.getColor(), dst);
- ctx->width = (float)pm.width();
- ctx->height = (float)pm.height();
- ctx->state = std::move(state); // Extend lifetime to match the pipeline's.
- if (matrix.asAffine(ctx->matrix)) {
- p->append(SkRasterPipeline::matrix_2x3, ctx->matrix);
+
+ struct MiscCtx {
+ std::unique_ptr<SkBitmapController::State> state;
+ SkColor4f paint_color;
+ float width;
+ float height;
+ float matrix[9];
+ };
+ auto misc = scratch->make<MiscCtx>();
+ misc->state = std::move(state); // Extend lifetime to match the pipeline's.
+ misc->paint_color = SkColor4f_from_SkColor(paint.getColor(), dst);
+ misc->width = (float)pm.width();
+ misc->height = (float)pm.height();
+ if (matrix.asAffine(misc->matrix)) {
+ p->append(SkRasterPipeline::matrix_2x3, misc->matrix);
} else {
- matrix.get9(ctx->matrix);
- p->append(SkRasterPipeline::matrix_perspective, ctx->matrix);
+ matrix.get9(misc->matrix);
+ p->append(SkRasterPipeline::matrix_perspective, misc->matrix);
}
+ auto gather = scratch->make<SkJumper_GatherCtx>();
+ gather->pixels = pm.addr();
+ gather->ctable = pm.ctable() ? pm.ctable()->readColors() : nullptr;
+ gather->stride = pm.rowBytesAsPixels();
+
auto append_tiling_and_gather = [&] {
switch (fTileModeX) {
- case kClamp_TileMode: p->append(SkRasterPipeline::clamp_x, &ctx->width); break;
- case kMirror_TileMode: p->append(SkRasterPipeline::mirror_x, &ctx->width); break;
- case kRepeat_TileMode: p->append(SkRasterPipeline::repeat_x, &ctx->width); break;
+ case kClamp_TileMode: p->append(SkRasterPipeline::clamp_x, &misc->width); break;
+ case kMirror_TileMode: p->append(SkRasterPipeline::mirror_x, &misc->width); break;
+ case kRepeat_TileMode: p->append(SkRasterPipeline::repeat_x, &misc->width); break;
}
switch (fTileModeY) {
- case kClamp_TileMode: p->append(SkRasterPipeline::clamp_y, &ctx->height); break;
- case kMirror_TileMode: p->append(SkRasterPipeline::mirror_y, &ctx->height); break;
- case kRepeat_TileMode: p->append(SkRasterPipeline::repeat_y, &ctx->height); break;
+ case kClamp_TileMode: p->append(SkRasterPipeline::clamp_y, &misc->height); break;
+ case kMirror_TileMode: p->append(SkRasterPipeline::mirror_y, &misc->height); break;
+ case kRepeat_TileMode: p->append(SkRasterPipeline::repeat_y, &misc->height); break;
}
switch (info.colorType()) {
- case kAlpha_8_SkColorType: p->append(SkRasterPipeline::gather_a8, ctx); break;
- case kIndex_8_SkColorType: p->append(SkRasterPipeline::gather_i8, ctx); break;
- case kGray_8_SkColorType: p->append(SkRasterPipeline::gather_g8, ctx); break;
- case kRGB_565_SkColorType: p->append(SkRasterPipeline::gather_565, ctx); break;
- case kARGB_4444_SkColorType: p->append(SkRasterPipeline::gather_4444, ctx); break;
+ case kAlpha_8_SkColorType: p->append(SkRasterPipeline::gather_a8, gather); break;
+ case kIndex_8_SkColorType: p->append(SkRasterPipeline::gather_i8, gather); break;
+ case kGray_8_SkColorType: p->append(SkRasterPipeline::gather_g8, gather); break;
+ case kRGB_565_SkColorType: p->append(SkRasterPipeline::gather_565, gather); break;
+ case kARGB_4444_SkColorType: p->append(SkRasterPipeline::gather_4444, gather); break;
case kRGBA_8888_SkColorType:
- case kBGRA_8888_SkColorType: p->append(SkRasterPipeline::gather_8888, ctx); break;
- case kRGBA_F16_SkColorType: p->append(SkRasterPipeline::gather_f16, ctx); break;
+ case kBGRA_8888_SkColorType: p->append(SkRasterPipeline::gather_8888, gather); break;
+ case kRGBA_F16_SkColorType: p->append(SkRasterPipeline::gather_f16, gather); break;
default: SkASSERT(false);
}
if (info.gammaCloseToSRGB() && dst != nullptr) {
@@ -299,18 +309,23 @@ bool SkImageShader::onAppendStages(SkRasterPipeline* p, SkColorSpace* dst, SkAre
}
};
+ SkJumper_SamplerCtx* sampler = nullptr;
+ if (quality != kNone_SkFilterQuality) {
+ sampler = scratch->make<SkJumper_SamplerCtx>();
+ }
+
auto sample = [&](SkRasterPipeline::StockStage setup_x,
SkRasterPipeline::StockStage setup_y) {
- p->append(setup_x, ctx);
- p->append(setup_y, ctx);
+ p->append(setup_x, sampler);
+ p->append(setup_y, sampler);
append_tiling_and_gather();
- p->append(SkRasterPipeline::accumulate, ctx);
+ p->append(SkRasterPipeline::accumulate, sampler);
};
if (quality == kNone_SkFilterQuality) {
append_tiling_and_gather();
} else if (quality == kLow_SkFilterQuality) {
- p->append(SkRasterPipeline::save_xy, ctx);
+ p->append(SkRasterPipeline::save_xy, sampler);
sample(SkRasterPipeline::bilinear_nx, SkRasterPipeline::bilinear_ny);
sample(SkRasterPipeline::bilinear_px, SkRasterPipeline::bilinear_ny);
@@ -319,7 +334,7 @@ bool SkImageShader::onAppendStages(SkRasterPipeline* p, SkColorSpace* dst, SkAre
p->append(SkRasterPipeline::move_dst_src);
} else {
- p->append(SkRasterPipeline::save_xy, ctx);
+ p->append(SkRasterPipeline::save_xy, sampler);
sample(SkRasterPipeline::bicubic_n3x, SkRasterPipeline::bicubic_n3y);
sample(SkRasterPipeline::bicubic_n1x, SkRasterPipeline::bicubic_n3y);
@@ -352,7 +367,7 @@ bool SkImageShader::onAppendStages(SkRasterPipeline* p, SkColorSpace* dst, SkAre
p->append(SkRasterPipeline::swap_rb);
}
if (info.colorType() == kAlpha_8_SkColorType) {
- p->append(SkRasterPipeline::set_rgb, &ctx->color4f);
+ p->append(SkRasterPipeline::set_rgb, &misc->paint_color);
}
if (info.colorType() == kAlpha_8_SkColorType || info.alphaType() == kUnpremul_SkAlphaType) {
p->append(SkRasterPipeline::premul);
diff --git a/src/image/SkImageShaderContext.h b/src/image/SkImageShaderContext.h
deleted file mode 100644
index 7a8ba6369f..0000000000
--- a/src/image/SkImageShaderContext.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright 2016 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifndef SkImageShaderContext_DEFINED
-#define SkImageShaderContext_DEFINED
-
-#include "SkBitmapController.h"
-#include "SkColor.h"
-#include "SkColorTable.h"
-#include <memory>
-
-// Definition used by SkImageShader.cpp and SkRasterPipeline_opts.h.
-// Otherwise, completely uninteresting.
-
-struct SkImageShaderContext {
- const void* pixels;
- const uint32_t* ctable;
- int stride;
- SkColor4f color4f;
- float width;
- float height;
- float matrix[9];
- float x[8];
- float y[8];
- float fx[8];
- float fy[8];
- float scalex[8];
- float scaley[8];
-
- std::unique_ptr<SkBitmapController::State> state;
-};
-
-#endif//SkImageShaderContext_DEFINED
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 736dad6f06..0d4446ce47 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -123,6 +123,11 @@ static K kConstants = {
M(repeat_y) \
M(mirror_x) \
M(mirror_y) \
+ M(save_xy) \
+ M(accumulate) \
+ M(bilinear_nx) M(bilinear_px) M(bilinear_ny) M(bilinear_py) \
+ M(bicubic_n3x) M(bicubic_n1x) M(bicubic_p1x) M(bicubic_p3x) \
+ M(bicubic_n3y) M(bicubic_n1y) M(bicubic_p1y) M(bicubic_p3y) \
M(linear_gradient) \
M(linear_gradient_2stops)
diff --git a/src/jumper/SkJumper.h b/src/jumper/SkJumper.h
index b440391f38..7a3f4e85f5 100644
--- a/src/jumper/SkJumper.h
+++ b/src/jumper/SkJumper.h
@@ -52,8 +52,10 @@
// - the _i and _f user-defined literal operators call C() for you in a prettier way; or
// - you can load values from this struct.
+static const int SkJumper_kMaxStride = 8;
+
struct SkJumper_constants {
- float iota[8]; // 0,1,2,3,4,5,6,7
+ float iota[SkJumper_kMaxStride]; // 0,1,2,3,4,...
};
struct SkJumper_GatherCtx {
@@ -62,4 +64,14 @@ struct SkJumper_GatherCtx {
int stride;
};
+// State shared by save_xy, accumulate, and bilinear_* / bicubic_*.
+struct SkJumper_SamplerCtx {
+ float x[SkJumper_kMaxStride];
+ float y[SkJumper_kMaxStride];
+ float fx[SkJumper_kMaxStride];
+ float fy[SkJumper_kMaxStride];
+ float scalex[SkJumper_kMaxStride];
+ float scaley[SkJumper_kMaxStride];
+};
+
#endif//SkJumper_DEFINED
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index db0d5933e3..80a08b990f 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -2648,6 +2648,284 @@ _sk_linear_gradient_2stops_aarch64:
.long 0x4f911001 // fmla v1.4s, v0.4s, v17.s[0]
.long 0x4eb01e00 // mov v0.16b, v16.16b
.long 0xd61f0060 // br x3
+
+HIDDEN _sk_save_xy_aarch64
+.globl _sk_save_xy_aarch64
+_sk_save_xy_aarch64:
+ .long 0x4f0167f0 // movi v16.4s, #0x3f, lsl #24
+ .long 0xf9400028 // ldr x8, [x1]
+ .long 0x4e30d411 // fadd v17.4s, v0.4s, v16.4s
+ .long 0x4e30d430 // fadd v16.4s, v1.4s, v16.4s
+ .long 0x4e219a32 // frintm v18.4s, v17.4s
+ .long 0x4eb2d631 // fsub v17.4s, v17.4s, v18.4s
+ .long 0x4e219a12 // frintm v18.4s, v16.4s
+ .long 0x4eb2d610 // fsub v16.4s, v16.4s, v18.4s
+ .long 0x3d800100 // str q0, [x8]
+ .long 0x3d800901 // str q1, [x8, #32]
+ .long 0x3d801111 // str q17, [x8, #64]
+ .long 0x3d801910 // str q16, [x8, #96]
+ .long 0xf9400423 // ldr x3, [x1, #8]
+ .long 0x91004021 // add x1, x1, #0x10
+ .long 0xd61f0060 // br x3
+
+HIDDEN _sk_accumulate_aarch64
+.globl _sk_accumulate_aarch64
+_sk_accumulate_aarch64:
+ .long 0xa8c10c28 // ldp x8, x3, [x1], #16
+ .long 0x3dc02110 // ldr q16, [x8, #128]
+ .long 0x3dc02911 // ldr q17, [x8, #160]
+ .long 0x6e31de10 // fmul v16.4s, v16.4s, v17.4s
+ .long 0x4e30cc04 // fmla v4.4s, v0.4s, v16.4s
+ .long 0x4e30cc25 // fmla v5.4s, v1.4s, v16.4s
+ .long 0x4e30cc46 // fmla v6.4s, v2.4s, v16.4s
+ .long 0x4e30cc67 // fmla v7.4s, v3.4s, v16.4s
+ .long 0xd61f0060 // br x3
+
+HIDDEN _sk_bilinear_nx_aarch64
+.globl _sk_bilinear_nx_aarch64
+_sk_bilinear_nx_aarch64:
+ .long 0xf9400028 // ldr x8, [x1]
+ .long 0x4f03f611 // fmov v17.4s, #1.000000000000000000e+00
+ .long 0x3dc01100 // ldr q0, [x8, #64]
+ .long 0x3dc00110 // ldr q16, [x8]
+ .long 0x4ea0d620 // fsub v0.4s, v17.4s, v0.4s
+ .long 0x3d802100 // str q0, [x8, #128]
+ .long 0xf9400423 // ldr x3, [x1, #8]
+ .long 0x4f0567e0 // movi v0.4s, #0xbf, lsl #24
+ .long 0x4e20d600 // fadd v0.4s, v16.4s, v0.4s
+ .long 0x91004021 // add x1, x1, #0x10
+ .long 0xd61f0060 // br x3
+
+HIDDEN _sk_bilinear_px_aarch64
+.globl _sk_bilinear_px_aarch64
+_sk_bilinear_px_aarch64:
+ .long 0xf9400028 // ldr x8, [x1]
+ .long 0x3dc01100 // ldr q0, [x8, #64]
+ .long 0x3dc00110 // ldr q16, [x8]
+ .long 0x3d802100 // str q0, [x8, #128]
+ .long 0xf9400423 // ldr x3, [x1, #8]
+ .long 0x4f0167e0 // movi v0.4s, #0x3f, lsl #24
+ .long 0x4e20d600 // fadd v0.4s, v16.4s, v0.4s
+ .long 0x91004021 // add x1, x1, #0x10
+ .long 0xd61f0060 // br x3
+
+HIDDEN _sk_bilinear_ny_aarch64
+.globl _sk_bilinear_ny_aarch64
+_sk_bilinear_ny_aarch64:
+ .long 0xf9400028 // ldr x8, [x1]
+ .long 0x4f03f611 // fmov v17.4s, #1.000000000000000000e+00
+ .long 0x3dc01901 // ldr q1, [x8, #96]
+ .long 0x3dc00910 // ldr q16, [x8, #32]
+ .long 0x4ea1d621 // fsub v1.4s, v17.4s, v1.4s
+ .long 0x3d802901 // str q1, [x8, #160]
+ .long 0xf9400423 // ldr x3, [x1, #8]
+ .long 0x4f0567e1 // movi v1.4s, #0xbf, lsl #24
+ .long 0x4e21d601 // fadd v1.4s, v16.4s, v1.4s
+ .long 0x91004021 // add x1, x1, #0x10
+ .long 0xd61f0060 // br x3
+
+HIDDEN _sk_bilinear_py_aarch64
+.globl _sk_bilinear_py_aarch64
+_sk_bilinear_py_aarch64:
+ .long 0xf9400028 // ldr x8, [x1]
+ .long 0x3dc01901 // ldr q1, [x8, #96]
+ .long 0x3dc00910 // ldr q16, [x8, #32]
+ .long 0x3d802901 // str q1, [x8, #160]
+ .long 0xf9400423 // ldr x3, [x1, #8]
+ .long 0x4f0167e1 // movi v1.4s, #0x3f, lsl #24
+ .long 0x4e21d601 // fadd v1.4s, v16.4s, v1.4s
+ .long 0x91004021 // add x1, x1, #0x10
+ .long 0xd61f0060 // br x3
+
+HIDDEN _sk_bicubic_n3x_aarch64
+.globl _sk_bicubic_n3x_aarch64
+_sk_bicubic_n3x_aarch64:
+ .long 0xf9400028 // ldr x8, [x1]
+ .long 0x52a7d8e9 // mov w9, #0x3ec70000
+ .long 0x72838e49 // movk w9, #0x1c72
+ .long 0x4e040d30 // dup v16.4s, w9
+ .long 0x3dc01111 // ldr q17, [x8, #64]
+ .long 0x52b7d549 // mov w9, #0xbeaa0000
+ .long 0x4f03f600 // fmov v0.4s, #1.000000000000000000e+00
+ .long 0x72955569 // movk w9, #0xaaab
+ .long 0x4e040d32 // dup v18.4s, w9
+ .long 0x4eb1d400 // fsub v0.4s, v0.4s, v17.4s
+ .long 0x6e20dc11 // fmul v17.4s, v0.4s, v0.4s
+ .long 0x4e20ce12 // fmla v18.4s, v16.4s, v0.4s
+ .long 0x6e32de20 // fmul v0.4s, v17.4s, v18.4s
+ .long 0x3dc00113 // ldr q19, [x8]
+ .long 0x3d802100 // str q0, [x8, #128]
+ .long 0xf9400423 // ldr x3, [x1, #8]
+ .long 0x4f07f700 // fmov v0.4s, #-1.500000000000000000e+00
+ .long 0x4e20d660 // fadd v0.4s, v19.4s, v0.4s
+ .long 0x91004021 // add x1, x1, #0x10
+ .long 0xd61f0060 // br x3
+
+HIDDEN _sk_bicubic_n1x_aarch64
+.globl _sk_bicubic_n1x_aarch64
+_sk_bicubic_n1x_aarch64:
+ .long 0xf9400028 // ldr x8, [x1]
+ .long 0x52b7f2a9 // mov w9, #0xbf950000
+ .long 0x4f03f600 // fmov v0.4s, #1.000000000000000000e+00
+ .long 0x728aaaa9 // movk w9, #0x5555
+ .long 0x3dc01110 // ldr q16, [x8, #64]
+ .long 0x4f03f711 // fmov v17.4s, #1.500000000000000000e+00
+ .long 0x4f0167f2 // movi v18.4s, #0x3f, lsl #24
+ .long 0x4eb0d400 // fsub v0.4s, v0.4s, v16.4s
+ .long 0x4e040d30 // dup v16.4s, w9
+ .long 0x52a7ac69 // mov w9, #0x3d630000
+ .long 0x7291c729 // movk w9, #0x8e39
+ .long 0x4e20ce11 // fmla v17.4s, v16.4s, v0.4s
+ .long 0x4e20ce32 // fmla v18.4s, v17.4s, v0.4s
+ .long 0x4e040d31 // dup v17.4s, w9
+ .long 0x4e20ce51 // fmla v17.4s, v18.4s, v0.4s
+ .long 0x3dc00110 // ldr q16, [x8]
+ .long 0x3d802111 // str q17, [x8, #128]
+ .long 0xf9400423 // ldr x3, [x1, #8]
+ .long 0x4f0567e0 // movi v0.4s, #0xbf, lsl #24
+ .long 0x4e20d600 // fadd v0.4s, v16.4s, v0.4s
+ .long 0x91004021 // add x1, x1, #0x10
+ .long 0xd61f0060 // br x3
+
+HIDDEN _sk_bicubic_p1x_aarch64
+.globl _sk_bicubic_p1x_aarch64
+_sk_bicubic_p1x_aarch64:
+ .long 0xf9400028 // ldr x8, [x1]
+ .long 0x52b7f2a9 // mov w9, #0xbf950000
+ .long 0x728aaaa9 // movk w9, #0x5555
+ .long 0x4f03f711 // fmov v17.4s, #1.500000000000000000e+00
+ .long 0x3dc01112 // ldr q18, [x8, #64]
+ .long 0x3dc00100 // ldr q0, [x8]
+ .long 0x4e040d33 // dup v19.4s, w9
+ .long 0x52a7ac69 // mov w9, #0x3d630000
+ .long 0x4f0167f0 // movi v16.4s, #0x3f, lsl #24
+ .long 0x7291c729 // movk w9, #0x8e39
+ .long 0x4e32ce71 // fmla v17.4s, v19.4s, v18.4s
+ .long 0x4e30d400 // fadd v0.4s, v0.4s, v16.4s
+ .long 0x4e32ce30 // fmla v16.4s, v17.4s, v18.4s
+ .long 0x4e040d31 // dup v17.4s, w9
+ .long 0x4e32ce11 // fmla v17.4s, v16.4s, v18.4s
+ .long 0x3d802111 // str q17, [x8, #128]
+ .long 0xf9400423 // ldr x3, [x1, #8]
+ .long 0x91004021 // add x1, x1, #0x10
+ .long 0xd61f0060 // br x3
+
+HIDDEN _sk_bicubic_p3x_aarch64
+.globl _sk_bicubic_p3x_aarch64
+_sk_bicubic_p3x_aarch64:
+ .long 0xf9400028 // ldr x8, [x1]
+ .long 0x52a7d8e9 // mov w9, #0x3ec70000
+ .long 0x72838e49 // movk w9, #0x1c72
+ .long 0x4e040d20 // dup v0.4s, w9
+ .long 0x3dc01110 // ldr q16, [x8, #64]
+ .long 0x52b7d549 // mov w9, #0xbeaa0000
+ .long 0x72955569 // movk w9, #0xaaab
+ .long 0x4e040d31 // dup v17.4s, w9
+ .long 0x6e30de13 // fmul v19.4s, v16.4s, v16.4s
+ .long 0x4e30cc11 // fmla v17.4s, v0.4s, v16.4s
+ .long 0x6e31de60 // fmul v0.4s, v19.4s, v17.4s
+ .long 0x3dc00112 // ldr q18, [x8]
+ .long 0x3d802100 // str q0, [x8, #128]
+ .long 0xf9400423 // ldr x3, [x1, #8]
+ .long 0x4f03f700 // fmov v0.4s, #1.500000000000000000e+00
+ .long 0x4e20d640 // fadd v0.4s, v18.4s, v0.4s
+ .long 0x91004021 // add x1, x1, #0x10
+ .long 0xd61f0060 // br x3
+
+HIDDEN _sk_bicubic_n3y_aarch64
+.globl _sk_bicubic_n3y_aarch64
+_sk_bicubic_n3y_aarch64:
+ .long 0xf9400028 // ldr x8, [x1]
+ .long 0x52a7d8e9 // mov w9, #0x3ec70000
+ .long 0x72838e49 // movk w9, #0x1c72
+ .long 0x4e040d30 // dup v16.4s, w9
+ .long 0x3dc01911 // ldr q17, [x8, #96]
+ .long 0x52b7d549 // mov w9, #0xbeaa0000
+ .long 0x4f03f601 // fmov v1.4s, #1.000000000000000000e+00
+ .long 0x72955569 // movk w9, #0xaaab
+ .long 0x4e040d32 // dup v18.4s, w9
+ .long 0x4eb1d421 // fsub v1.4s, v1.4s, v17.4s
+ .long 0x6e21dc31 // fmul v17.4s, v1.4s, v1.4s
+ .long 0x4e21ce12 // fmla v18.4s, v16.4s, v1.4s
+ .long 0x6e32de21 // fmul v1.4s, v17.4s, v18.4s
+ .long 0x3dc00913 // ldr q19, [x8, #32]
+ .long 0x3d802901 // str q1, [x8, #160]
+ .long 0xf9400423 // ldr x3, [x1, #8]
+ .long 0x4f07f701 // fmov v1.4s, #-1.500000000000000000e+00
+ .long 0x4e21d661 // fadd v1.4s, v19.4s, v1.4s
+ .long 0x91004021 // add x1, x1, #0x10
+ .long 0xd61f0060 // br x3
+
+HIDDEN _sk_bicubic_n1y_aarch64
+.globl _sk_bicubic_n1y_aarch64
+_sk_bicubic_n1y_aarch64:
+ .long 0xf9400028 // ldr x8, [x1]
+ .long 0x52b7f2a9 // mov w9, #0xbf950000
+ .long 0x4f03f601 // fmov v1.4s, #1.000000000000000000e+00
+ .long 0x728aaaa9 // movk w9, #0x5555
+ .long 0x3dc01910 // ldr q16, [x8, #96]
+ .long 0x4f03f711 // fmov v17.4s, #1.500000000000000000e+00
+ .long 0x4f0167f2 // movi v18.4s, #0x3f, lsl #24
+ .long 0x4eb0d421 // fsub v1.4s, v1.4s, v16.4s
+ .long 0x4e040d30 // dup v16.4s, w9
+ .long 0x52a7ac69 // mov w9, #0x3d630000
+ .long 0x7291c729 // movk w9, #0x8e39
+ .long 0x4e21ce11 // fmla v17.4s, v16.4s, v1.4s
+ .long 0x4e21ce32 // fmla v18.4s, v17.4s, v1.4s
+ .long 0x4e040d31 // dup v17.4s, w9
+ .long 0x4e21ce51 // fmla v17.4s, v18.4s, v1.4s
+ .long 0x3dc00910 // ldr q16, [x8, #32]
+ .long 0x3d802911 // str q17, [x8, #160]
+ .long 0xf9400423 // ldr x3, [x1, #8]
+ .long 0x4f0567e1 // movi v1.4s, #0xbf, lsl #24
+ .long 0x4e21d601 // fadd v1.4s, v16.4s, v1.4s
+ .long 0x91004021 // add x1, x1, #0x10
+ .long 0xd61f0060 // br x3
+
+HIDDEN _sk_bicubic_p1y_aarch64
+.globl _sk_bicubic_p1y_aarch64
+_sk_bicubic_p1y_aarch64:
+ .long 0xf9400028 // ldr x8, [x1]
+ .long 0x52b7f2a9 // mov w9, #0xbf950000
+ .long 0x728aaaa9 // movk w9, #0x5555
+ .long 0x4f03f711 // fmov v17.4s, #1.500000000000000000e+00
+ .long 0x3dc01912 // ldr q18, [x8, #96]
+ .long 0x3dc00901 // ldr q1, [x8, #32]
+ .long 0x4e040d33 // dup v19.4s, w9
+ .long 0x52a7ac69 // mov w9, #0x3d630000
+ .long 0x4f0167f0 // movi v16.4s, #0x3f, lsl #24
+ .long 0x7291c729 // movk w9, #0x8e39
+ .long 0x4e32ce71 // fmla v17.4s, v19.4s, v18.4s
+ .long 0x4e30d421 // fadd v1.4s, v1.4s, v16.4s
+ .long 0x4e32ce30 // fmla v16.4s, v17.4s, v18.4s
+ .long 0x4e040d31 // dup v17.4s, w9
+ .long 0x4e32ce11 // fmla v17.4s, v16.4s, v18.4s
+ .long 0x3d802911 // str q17, [x8, #160]
+ .long 0xf9400423 // ldr x3, [x1, #8]
+ .long 0x91004021 // add x1, x1, #0x10
+ .long 0xd61f0060 // br x3
+
+HIDDEN _sk_bicubic_p3y_aarch64
+.globl _sk_bicubic_p3y_aarch64
+_sk_bicubic_p3y_aarch64:
+ .long 0xf9400028 // ldr x8, [x1]
+ .long 0x52a7d8e9 // mov w9, #0x3ec70000
+ .long 0x72838e49 // movk w9, #0x1c72
+ .long 0x4e040d21 // dup v1.4s, w9
+ .long 0x3dc01910 // ldr q16, [x8, #96]
+ .long 0x52b7d549 // mov w9, #0xbeaa0000
+ .long 0x72955569 // movk w9, #0xaaab
+ .long 0x4e040d31 // dup v17.4s, w9
+ .long 0x6e30de13 // fmul v19.4s, v16.4s, v16.4s
+ .long 0x4e30cc31 // fmla v17.4s, v1.4s, v16.4s
+ .long 0x6e31de61 // fmul v1.4s, v19.4s, v17.4s
+ .long 0x3dc00912 // ldr q18, [x8, #32]
+ .long 0x3d802901 // str q1, [x8, #160]
+ .long 0xf9400423 // ldr x3, [x1, #8]
+ .long 0x4f03f701 // fmov v1.4s, #1.500000000000000000e+00
+ .long 0x4e21d641 // fadd v1.4s, v18.4s, v1.4s
+ .long 0x91004021 // add x1, x1, #0x10
+ .long 0xd61f0060 // br x3
#elif defined(__arm__)
.balign 4
@@ -5496,6 +5774,316 @@ _sk_linear_gradient_2stops_vfp4:
.long 0xf22001b0 // vorr d0, d16, d16
.long 0xe8bd4010 // pop {r4, lr}
.long 0xe12fff1c // bx ip
+
+HIDDEN _sk_save_xy_vfp4
+.globl _sk_save_xy_vfp4
+_sk_save_xy_vfp4:
+ .long 0xf2c3061f // vmov.i32 d16, #1056964608
+ .long 0xeddf7b17 // vldr d23, [pc, #92]
+ .long 0xf2c06010 // vmov.i32 d22, #0
+ .long 0xe5913000 // ldr r3, [r1]
+ .long 0xf2401d20 // vadd.f32 d17, d0, d16
+ .long 0xf2410d20 // vadd.f32 d16, d1, d16
+ .long 0xed830b00 // vstr d0, [r3]
+ .long 0xed831b08 // vstr d1, [r3, #32]
+ .long 0xf3fb2721 // vcvt.s32.f32 d18, d17
+ .long 0xf3fb3720 // vcvt.s32.f32 d19, d16
+ .long 0xf3fb2622 // vcvt.f32.s32 d18, d18
+ .long 0xf3fb3623 // vcvt.f32.s32 d19, d19
+ .long 0xf3624ea1 // vcgt.f32 d20, d18, d17
+ .long 0xf3635ea0 // vcgt.f32 d21, d19, d16
+ .long 0xf35741b6 // vbsl d20, d23, d22
+ .long 0xf35751b6 // vbsl d21, d23, d22
+ .long 0xf2622da4 // vsub.f32 d18, d18, d20
+ .long 0xf2633da5 // vsub.f32 d19, d19, d21
+ .long 0xf2611da2 // vsub.f32 d17, d17, d18
+ .long 0xf2600da3 // vsub.f32 d16, d16, d19
+ .long 0xedc31b10 // vstr d17, [r3, #64]
+ .long 0xedc30b18 // vstr d16, [r3, #96]
+ .long 0xe2813008 // add r3, r1, #8
+ .long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xe1a01003 // mov r1, r3
+ .long 0xe12fff1c // bx ip
+ .long 0x3f800000 // .word 0x3f800000
+ .long 0x3f800000 // .word 0x3f800000
+
+HIDDEN _sk_accumulate_vfp4
+.globl _sk_accumulate_vfp4
+_sk_accumulate_vfp4:
+ .long 0xe8911008 // ldm r1, {r3, ip}
+ .long 0xe2811008 // add r1, r1, #8
+ .long 0xedd31b28 // vldr d17, [r3, #160]
+ .long 0xedd30b20 // vldr d16, [r3, #128]
+ .long 0xf3400db1 // vmul.f32 d16, d16, d17
+ .long 0xf2004c90 // vfma.f32 d4, d16, d0
+ .long 0xf2005c91 // vfma.f32 d5, d16, d1
+ .long 0xf2006c92 // vfma.f32 d6, d16, d2
+ .long 0xf2007c93 // vfma.f32 d7, d16, d3
+ .long 0xe12fff1c // bx ip
+
+HIDDEN _sk_bilinear_nx_vfp4
+.globl _sk_bilinear_nx_vfp4
+_sk_bilinear_nx_vfp4:
+ .long 0xe5913000 // ldr r3, [r1]
+ .long 0xf2c70f10 // vmov.f32 d16, #1
+ .long 0xedd32b10 // vldr d18, [r3, #64]
+ .long 0xf2600da2 // vsub.f32 d16, d16, d18
+ .long 0xedd31b00 // vldr d17, [r3]
+ .long 0xf3c3261f // vmov.i32 d18, #-1090519040
+ .long 0xf2010da2 // vadd.f32 d0, d17, d18
+ .long 0xedc30b20 // vstr d16, [r3, #128]
+ .long 0xe2813008 // add r3, r1, #8
+ .long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xe1a01003 // mov r1, r3
+ .long 0xe12fff1c // bx ip
+
+HIDDEN _sk_bilinear_px_vfp4
+.globl _sk_bilinear_px_vfp4
+_sk_bilinear_px_vfp4:
+ .long 0xe5913000 // ldr r3, [r1]
+ .long 0xf2c3061f // vmov.i32 d16, #1056964608
+ .long 0xedd31b00 // vldr d17, [r3]
+ .long 0xedd32b10 // vldr d18, [r3, #64]
+ .long 0xf2010da0 // vadd.f32 d0, d17, d16
+ .long 0xedc32b20 // vstr d18, [r3, #128]
+ .long 0xe2813008 // add r3, r1, #8
+ .long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xe1a01003 // mov r1, r3
+ .long 0xe12fff1c // bx ip
+
+HIDDEN _sk_bilinear_ny_vfp4
+.globl _sk_bilinear_ny_vfp4
+_sk_bilinear_ny_vfp4:
+ .long 0xe5913000 // ldr r3, [r1]
+ .long 0xf2c70f10 // vmov.f32 d16, #1
+ .long 0xedd32b18 // vldr d18, [r3, #96]
+ .long 0xf2600da2 // vsub.f32 d16, d16, d18
+ .long 0xedd31b08 // vldr d17, [r3, #32]
+ .long 0xf3c3261f // vmov.i32 d18, #-1090519040
+ .long 0xf2011da2 // vadd.f32 d1, d17, d18
+ .long 0xedc30b28 // vstr d16, [r3, #160]
+ .long 0xe2813008 // add r3, r1, #8
+ .long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xe1a01003 // mov r1, r3
+ .long 0xe12fff1c // bx ip
+
+HIDDEN _sk_bilinear_py_vfp4
+.globl _sk_bilinear_py_vfp4
+_sk_bilinear_py_vfp4:
+ .long 0xe5913000 // ldr r3, [r1]
+ .long 0xf2c3061f // vmov.i32 d16, #1056964608
+ .long 0xedd31b08 // vldr d17, [r3, #32]
+ .long 0xedd32b18 // vldr d18, [r3, #96]
+ .long 0xf2011da0 // vadd.f32 d1, d17, d16
+ .long 0xedc32b28 // vstr d18, [r3, #160]
+ .long 0xe2813008 // add r3, r1, #8
+ .long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xe1a01003 // mov r1, r3
+ .long 0xe12fff1c // bx ip
+
+HIDDEN _sk_bicubic_n3x_vfp4
+.globl _sk_bicubic_n3x_vfp4
+_sk_bicubic_n3x_vfp4:
+ .long 0xe5913000 // ldr r3, [r1]
+ .long 0xf2c70f10 // vmov.f32 d16, #1
+ .long 0xeddf3b10 // vldr d19, [pc, #64]
+ .long 0xedd32b10 // vldr d18, [r3, #64]
+ .long 0xf2600da2 // vsub.f32 d16, d16, d18
+ .long 0xeddf2b0b // vldr d18, [pc, #44]
+ .long 0xedd31b00 // vldr d17, [r3]
+ .long 0xf2403cb2 // vfma.f32 d19, d16, d18
+ .long 0xf3400db0 // vmul.f32 d16, d16, d16
+ .long 0xf3c72f18 // vmov.f32 d18, #-1.5
+ .long 0xf2010da2 // vadd.f32 d0, d17, d18
+ .long 0xf3400db3 // vmul.f32 d16, d16, d19
+ .long 0xedc30b20 // vstr d16, [r3, #128]
+ .long 0xe2813008 // add r3, r1, #8
+ .long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xe1a01003 // mov r1, r3
+ .long 0xe12fff1c // bx ip
+ .long 0xe320f000 // nop {0}
+ .long 0x3ec71c72 // .word 0x3ec71c72
+ .long 0x3ec71c72 // .word 0x3ec71c72
+ .long 0xbeaaaaab // .word 0xbeaaaaab
+ .long 0xbeaaaaab // .word 0xbeaaaaab
+
+HIDDEN _sk_bicubic_n1x_vfp4
+.globl _sk_bicubic_n1x_vfp4
+_sk_bicubic_n1x_vfp4:
+ .long 0xe5913000 // ldr r3, [r1]
+ .long 0xf2c70f10 // vmov.f32 d16, #1
+ .long 0xf2c73f18 // vmov.f32 d19, #1.5
+ .long 0xedd32b10 // vldr d18, [r3, #64]
+ .long 0xf2600da2 // vsub.f32 d16, d16, d18
+ .long 0xeddf2b0d // vldr d18, [pc, #52]
+ .long 0xedd31b00 // vldr d17, [r3]
+ .long 0xf2403cb2 // vfma.f32 d19, d16, d18
+ .long 0xf2c3261f // vmov.i32 d18, #1056964608
+ .long 0xf2402cb3 // vfma.f32 d18, d16, d19
+ .long 0xeddf3b0a // vldr d19, [pc, #40]
+ .long 0xf2403cb2 // vfma.f32 d19, d16, d18
+ .long 0xf3c3061f // vmov.i32 d16, #-1090519040
+ .long 0xf2010da0 // vadd.f32 d0, d17, d16
+ .long 0xedc33b20 // vstr d19, [r3, #128]
+ .long 0xe2813008 // add r3, r1, #8
+ .long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xe1a01003 // mov r1, r3
+ .long 0xe12fff1c // bx ip
+ .long 0xe320f000 // nop {0}
+ .long 0xbf955555 // .word 0xbf955555
+ .long 0xbf955555 // .word 0xbf955555
+ .long 0x3d638e39 // .word 0x3d638e39
+ .long 0x3d638e39 // .word 0x3d638e39
+
+HIDDEN _sk_bicubic_p1x_vfp4
+.globl _sk_bicubic_p1x_vfp4
+_sk_bicubic_p1x_vfp4:
+ .long 0xe5913000 // ldr r3, [r1]
+ .long 0xf2c71f18 // vmov.f32 d17, #1.5
+ .long 0xeddf0b0c // vldr d16, [pc, #48]
+ .long 0xedd33b10 // vldr d19, [r3, #64]
+ .long 0xf2431cb0 // vfma.f32 d17, d19, d16
+ .long 0xedd32b00 // vldr d18, [r3]
+ .long 0xf2c3061f // vmov.i32 d16, #1056964608
+ .long 0xf2020da0 // vadd.f32 d0, d18, d16
+ .long 0xf2430cb1 // vfma.f32 d16, d19, d17
+ .long 0xeddf1b07 // vldr d17, [pc, #28]
+ .long 0xf2431cb0 // vfma.f32 d17, d19, d16
+ .long 0xedc31b20 // vstr d17, [r3, #128]
+ .long 0xe2813008 // add r3, r1, #8
+ .long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xe1a01003 // mov r1, r3
+ .long 0xe12fff1c // bx ip
+ .long 0xbf955555 // .word 0xbf955555
+ .long 0xbf955555 // .word 0xbf955555
+ .long 0x3d638e39 // .word 0x3d638e39
+ .long 0x3d638e39 // .word 0x3d638e39
+
+HIDDEN _sk_bicubic_p3x_vfp4
+.globl _sk_bicubic_p3x_vfp4
+_sk_bicubic_p3x_vfp4:
+ .long 0xe5913000 // ldr r3, [r1]
+ .long 0xeddf0b0d // vldr d16, [pc, #52]
+ .long 0xeddf3b0e // vldr d19, [pc, #56]
+ .long 0xedd32b10 // vldr d18, [r3, #64]
+ .long 0xf2423cb0 // vfma.f32 d19, d18, d16
+ .long 0xedd31b00 // vldr d17, [r3]
+ .long 0xf3420db2 // vmul.f32 d16, d18, d18
+ .long 0xf2c72f18 // vmov.f32 d18, #1.5
+ .long 0xf2010da2 // vadd.f32 d0, d17, d18
+ .long 0xf3400db3 // vmul.f32 d16, d16, d19
+ .long 0xedc30b20 // vstr d16, [r3, #128]
+ .long 0xe2813008 // add r3, r1, #8
+ .long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xe1a01003 // mov r1, r3
+ .long 0xe12fff1c // bx ip
+ .long 0xe320f000 // nop {0}
+ .long 0x3ec71c72 // .word 0x3ec71c72
+ .long 0x3ec71c72 // .word 0x3ec71c72
+ .long 0xbeaaaaab // .word 0xbeaaaaab
+ .long 0xbeaaaaab // .word 0xbeaaaaab
+
+HIDDEN _sk_bicubic_n3y_vfp4
+.globl _sk_bicubic_n3y_vfp4
+_sk_bicubic_n3y_vfp4:
+ .long 0xe5913000 // ldr r3, [r1]
+ .long 0xf2c70f10 // vmov.f32 d16, #1
+ .long 0xeddf3b10 // vldr d19, [pc, #64]
+ .long 0xedd32b18 // vldr d18, [r3, #96]
+ .long 0xf2600da2 // vsub.f32 d16, d16, d18
+ .long 0xeddf2b0b // vldr d18, [pc, #44]
+ .long 0xedd31b08 // vldr d17, [r3, #32]
+ .long 0xf2403cb2 // vfma.f32 d19, d16, d18
+ .long 0xf3400db0 // vmul.f32 d16, d16, d16
+ .long 0xf3c72f18 // vmov.f32 d18, #-1.5
+ .long 0xf2011da2 // vadd.f32 d1, d17, d18
+ .long 0xf3400db3 // vmul.f32 d16, d16, d19
+ .long 0xedc30b28 // vstr d16, [r3, #160]
+ .long 0xe2813008 // add r3, r1, #8
+ .long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xe1a01003 // mov r1, r3
+ .long 0xe12fff1c // bx ip
+ .long 0xe320f000 // nop {0}
+ .long 0x3ec71c72 // .word 0x3ec71c72
+ .long 0x3ec71c72 // .word 0x3ec71c72
+ .long 0xbeaaaaab // .word 0xbeaaaaab
+ .long 0xbeaaaaab // .word 0xbeaaaaab
+
+HIDDEN _sk_bicubic_n1y_vfp4
+.globl _sk_bicubic_n1y_vfp4
+_sk_bicubic_n1y_vfp4:
+ .long 0xe5913000 // ldr r3, [r1]
+ .long 0xf2c70f10 // vmov.f32 d16, #1
+ .long 0xf2c73f18 // vmov.f32 d19, #1.5
+ .long 0xedd32b18 // vldr d18, [r3, #96]
+ .long 0xf2600da2 // vsub.f32 d16, d16, d18
+ .long 0xeddf2b0d // vldr d18, [pc, #52]
+ .long 0xedd31b08 // vldr d17, [r3, #32]
+ .long 0xf2403cb2 // vfma.f32 d19, d16, d18
+ .long 0xf2c3261f // vmov.i32 d18, #1056964608
+ .long 0xf2402cb3 // vfma.f32 d18, d16, d19
+ .long 0xeddf3b0a // vldr d19, [pc, #40]
+ .long 0xf2403cb2 // vfma.f32 d19, d16, d18
+ .long 0xf3c3061f // vmov.i32 d16, #-1090519040
+ .long 0xf2011da0 // vadd.f32 d1, d17, d16
+ .long 0xedc33b28 // vstr d19, [r3, #160]
+ .long 0xe2813008 // add r3, r1, #8
+ .long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xe1a01003 // mov r1, r3
+ .long 0xe12fff1c // bx ip
+ .long 0xe320f000 // nop {0}
+ .long 0xbf955555 // .word 0xbf955555
+ .long 0xbf955555 // .word 0xbf955555
+ .long 0x3d638e39 // .word 0x3d638e39
+ .long 0x3d638e39 // .word 0x3d638e39
+
+HIDDEN _sk_bicubic_p1y_vfp4
+.globl _sk_bicubic_p1y_vfp4
+_sk_bicubic_p1y_vfp4:
+ .long 0xe5913000 // ldr r3, [r1]
+ .long 0xf2c71f18 // vmov.f32 d17, #1.5
+ .long 0xeddf0b0c // vldr d16, [pc, #48]
+ .long 0xedd33b18 // vldr d19, [r3, #96]
+ .long 0xf2431cb0 // vfma.f32 d17, d19, d16
+ .long 0xedd32b08 // vldr d18, [r3, #32]
+ .long 0xf2c3061f // vmov.i32 d16, #1056964608
+ .long 0xf2021da0 // vadd.f32 d1, d18, d16
+ .long 0xf2430cb1 // vfma.f32 d16, d19, d17
+ .long 0xeddf1b07 // vldr d17, [pc, #28]
+ .long 0xf2431cb0 // vfma.f32 d17, d19, d16
+ .long 0xedc31b28 // vstr d17, [r3, #160]
+ .long 0xe2813008 // add r3, r1, #8
+ .long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xe1a01003 // mov r1, r3
+ .long 0xe12fff1c // bx ip
+ .long 0xbf955555 // .word 0xbf955555
+ .long 0xbf955555 // .word 0xbf955555
+ .long 0x3d638e39 // .word 0x3d638e39
+ .long 0x3d638e39 // .word 0x3d638e39
+
+HIDDEN _sk_bicubic_p3y_vfp4
+.globl _sk_bicubic_p3y_vfp4
+_sk_bicubic_p3y_vfp4:
+ .long 0xe5913000 // ldr r3, [r1]
+ .long 0xeddf0b0d // vldr d16, [pc, #52]
+ .long 0xeddf3b0e // vldr d19, [pc, #56]
+ .long 0xedd32b18 // vldr d18, [r3, #96]
+ .long 0xf2423cb0 // vfma.f32 d19, d18, d16
+ .long 0xedd31b08 // vldr d17, [r3, #32]
+ .long 0xf3420db2 // vmul.f32 d16, d18, d18
+ .long 0xf2c72f18 // vmov.f32 d18, #1.5
+ .long 0xf2011da2 // vadd.f32 d1, d17, d18
+ .long 0xf3400db3 // vmul.f32 d16, d16, d19
+ .long 0xedc30b28 // vstr d16, [r3, #160]
+ .long 0xe2813008 // add r3, r1, #8
+ .long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xe1a01003 // mov r1, r3
+ .long 0xe12fff1c // bx ip
+ .long 0xe320f000 // nop {0}
+ .long 0x3ec71c72 // .word 0x3ec71c72
+ .long 0x3ec71c72 // .word 0x3ec71c72
+ .long 0xbeaaaaab // .word 0xbeaaaaab
+ .long 0xbeaaaaab // .word 0xbeaaaaab
#elif defined(__x86_64__)
HIDDEN _sk_start_pipeline_hsw
@@ -7849,7 +8437,7 @@ _sk_load_4444_hsw:
.byte 255 // (bad)
.byte 255 // (bad)
.byte 255 // (bad)
- .byte 233,255,255,255,225 // jmpq ffffffffe2002284 <_sk_linear_gradient_2stops_hsw+0xffffffffe1fff152>
+ .byte 233,255,255,255,225 // jmpq ffffffffe2002284 <_sk_bicubic_p3y_hsw+0xffffffffe1ffecd5>
.byte 255 // (bad)
.byte 255 // (bad)
.byte 255 // (bad)
@@ -8835,6 +9423,304 @@ _sk_linear_gradient_2stops_hsw:
.byte 197,124,41,192 // vmovaps %ymm8,%ymm0
.byte 255,224 // jmpq *%rax
+HIDDEN _sk_save_xy_hsw
+.globl _sk_save_xy_hsw
+_sk_save_xy_hsw:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,0,63 // mov $0x3f000000,%r8d
+ .byte 196,65,121,110,192 // vmovd %r8d,%xmm8
+ .byte 196,66,125,88,192 // vpbroadcastd %xmm8,%ymm8
+ .byte 197,60,88,200 // vaddps %ymm0,%ymm8,%ymm9
+ .byte 196,67,125,8,209,1 // vroundps $0x1,%ymm9,%ymm10
+ .byte 196,65,52,92,202 // vsubps %ymm10,%ymm9,%ymm9
+ .byte 197,60,88,193 // vaddps %ymm1,%ymm8,%ymm8
+ .byte 196,67,125,8,208,1 // vroundps $0x1,%ymm8,%ymm10
+ .byte 196,65,60,92,194 // vsubps %ymm10,%ymm8,%ymm8
+ .byte 197,252,17,0 // vmovups %ymm0,(%rax)
+ .byte 197,252,17,72,32 // vmovups %ymm1,0x20(%rax)
+ .byte 197,124,17,72,64 // vmovups %ymm9,0x40(%rax)
+ .byte 197,124,17,64,96 // vmovups %ymm8,0x60(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_accumulate_hsw
+.globl _sk_accumulate_hsw
+_sk_accumulate_hsw:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 197,124,16,128,128,0,0,0 // vmovups 0x80(%rax),%ymm8
+ .byte 197,60,89,128,160,0,0,0 // vmulps 0xa0(%rax),%ymm8,%ymm8
+ .byte 196,226,61,184,224 // vfmadd231ps %ymm0,%ymm8,%ymm4
+ .byte 196,226,61,184,233 // vfmadd231ps %ymm1,%ymm8,%ymm5
+ .byte 196,226,61,184,242 // vfmadd231ps %ymm2,%ymm8,%ymm6
+ .byte 196,98,101,168,199 // vfmadd213ps %ymm7,%ymm3,%ymm8
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 197,124,41,199 // vmovaps %ymm8,%ymm7
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bilinear_nx_hsw
+.globl _sk_bilinear_nx_hsw
+_sk_bilinear_nx_hsw:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,0,191 // mov $0xbf000000,%r8d
+ .byte 196,193,121,110,192 // vmovd %r8d,%xmm0
+ .byte 196,226,125,88,192 // vpbroadcastd %xmm0,%ymm0
+ .byte 197,252,88,0 // vaddps (%rax),%ymm0,%ymm0
+ .byte 65,184,0,0,128,63 // mov $0x3f800000,%r8d
+ .byte 196,65,121,110,192 // vmovd %r8d,%xmm8
+ .byte 196,66,125,88,192 // vpbroadcastd %xmm8,%ymm8
+ .byte 197,60,92,64,64 // vsubps 0x40(%rax),%ymm8,%ymm8
+ .byte 197,124,17,128,128,0,0,0 // vmovups %ymm8,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bilinear_px_hsw
+.globl _sk_bilinear_px_hsw
+_sk_bilinear_px_hsw:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,0,63 // mov $0x3f000000,%r8d
+ .byte 196,193,121,110,192 // vmovd %r8d,%xmm0
+ .byte 196,226,125,88,192 // vpbroadcastd %xmm0,%ymm0
+ .byte 197,252,88,0 // vaddps (%rax),%ymm0,%ymm0
+ .byte 197,124,16,64,64 // vmovups 0x40(%rax),%ymm8
+ .byte 197,124,17,128,128,0,0,0 // vmovups %ymm8,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bilinear_ny_hsw
+.globl _sk_bilinear_ny_hsw
+_sk_bilinear_ny_hsw:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,0,191 // mov $0xbf000000,%r8d
+ .byte 196,193,121,110,200 // vmovd %r8d,%xmm1
+ .byte 196,226,125,88,201 // vpbroadcastd %xmm1,%ymm1
+ .byte 197,244,88,72,32 // vaddps 0x20(%rax),%ymm1,%ymm1
+ .byte 65,184,0,0,128,63 // mov $0x3f800000,%r8d
+ .byte 196,65,121,110,192 // vmovd %r8d,%xmm8
+ .byte 196,66,125,88,192 // vpbroadcastd %xmm8,%ymm8
+ .byte 197,60,92,64,96 // vsubps 0x60(%rax),%ymm8,%ymm8
+ .byte 197,124,17,128,160,0,0,0 // vmovups %ymm8,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bilinear_py_hsw
+.globl _sk_bilinear_py_hsw
+_sk_bilinear_py_hsw:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,0,63 // mov $0x3f000000,%r8d
+ .byte 196,193,121,110,200 // vmovd %r8d,%xmm1
+ .byte 196,226,125,88,201 // vpbroadcastd %xmm1,%ymm1
+ .byte 197,244,88,72,32 // vaddps 0x20(%rax),%ymm1,%ymm1
+ .byte 197,124,16,64,96 // vmovups 0x60(%rax),%ymm8
+ .byte 197,124,17,128,160,0,0,0 // vmovups %ymm8,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_n3x_hsw
+.globl _sk_bicubic_n3x_hsw
+_sk_bicubic_n3x_hsw:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,192,191 // mov $0xbfc00000,%r8d
+ .byte 196,193,121,110,192 // vmovd %r8d,%xmm0
+ .byte 196,226,125,88,192 // vpbroadcastd %xmm0,%ymm0
+ .byte 197,252,88,0 // vaddps (%rax),%ymm0,%ymm0
+ .byte 65,184,0,0,128,63 // mov $0x3f800000,%r8d
+ .byte 196,65,121,110,192 // vmovd %r8d,%xmm8
+ .byte 196,66,125,88,192 // vpbroadcastd %xmm8,%ymm8
+ .byte 197,60,92,64,64 // vsubps 0x40(%rax),%ymm8,%ymm8
+ .byte 196,65,60,89,200 // vmulps %ymm8,%ymm8,%ymm9
+ .byte 65,184,114,28,199,62 // mov $0x3ec71c72,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,66,125,88,210 // vpbroadcastd %xmm10,%ymm10
+ .byte 65,184,171,170,170,190 // mov $0xbeaaaaab,%r8d
+ .byte 196,65,121,110,216 // vmovd %r8d,%xmm11
+ .byte 196,66,125,88,219 // vpbroadcastd %xmm11,%ymm11
+ .byte 196,66,61,168,211 // vfmadd213ps %ymm11,%ymm8,%ymm10
+ .byte 196,65,44,89,193 // vmulps %ymm9,%ymm10,%ymm8
+ .byte 197,124,17,128,128,0,0,0 // vmovups %ymm8,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_n1x_hsw
+.globl _sk_bicubic_n1x_hsw
+_sk_bicubic_n1x_hsw:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,0,191 // mov $0xbf000000,%r8d
+ .byte 196,193,121,110,192 // vmovd %r8d,%xmm0
+ .byte 196,226,125,88,192 // vpbroadcastd %xmm0,%ymm0
+ .byte 197,252,88,0 // vaddps (%rax),%ymm0,%ymm0
+ .byte 65,184,0,0,128,63 // mov $0x3f800000,%r8d
+ .byte 196,65,121,110,192 // vmovd %r8d,%xmm8
+ .byte 196,66,125,88,192 // vpbroadcastd %xmm8,%ymm8
+ .byte 197,60,92,64,64 // vsubps 0x40(%rax),%ymm8,%ymm8
+ .byte 65,184,85,85,149,191 // mov $0xbf955555,%r8d
+ .byte 196,65,121,110,200 // vmovd %r8d,%xmm9
+ .byte 196,66,125,88,201 // vpbroadcastd %xmm9,%ymm9
+ .byte 65,184,0,0,192,63 // mov $0x3fc00000,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,66,125,88,210 // vpbroadcastd %xmm10,%ymm10
+ .byte 196,66,61,168,202 // vfmadd213ps %ymm10,%ymm8,%ymm9
+ .byte 65,184,0,0,0,63 // mov $0x3f000000,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,66,125,88,210 // vpbroadcastd %xmm10,%ymm10
+ .byte 196,66,61,184,209 // vfmadd231ps %ymm9,%ymm8,%ymm10
+ .byte 65,184,57,142,99,61 // mov $0x3d638e39,%r8d
+ .byte 196,65,121,110,200 // vmovd %r8d,%xmm9
+ .byte 196,66,125,88,201 // vpbroadcastd %xmm9,%ymm9
+ .byte 196,66,61,184,202 // vfmadd231ps %ymm10,%ymm8,%ymm9
+ .byte 197,124,17,136,128,0,0,0 // vmovups %ymm9,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_p1x_hsw
+.globl _sk_bicubic_p1x_hsw
+_sk_bicubic_p1x_hsw:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,0,63 // mov $0x3f000000,%r8d
+ .byte 196,193,121,110,192 // vmovd %r8d,%xmm0
+ .byte 196,98,125,88,192 // vpbroadcastd %xmm0,%ymm8
+ .byte 197,188,88,0 // vaddps (%rax),%ymm8,%ymm0
+ .byte 197,124,16,72,64 // vmovups 0x40(%rax),%ymm9
+ .byte 65,184,85,85,149,191 // mov $0xbf955555,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,66,125,88,210 // vpbroadcastd %xmm10,%ymm10
+ .byte 65,184,0,0,192,63 // mov $0x3fc00000,%r8d
+ .byte 196,65,121,110,216 // vmovd %r8d,%xmm11
+ .byte 196,66,125,88,219 // vpbroadcastd %xmm11,%ymm11
+ .byte 196,66,53,168,211 // vfmadd213ps %ymm11,%ymm9,%ymm10
+ .byte 196,66,53,168,208 // vfmadd213ps %ymm8,%ymm9,%ymm10
+ .byte 65,184,57,142,99,61 // mov $0x3d638e39,%r8d
+ .byte 196,65,121,110,192 // vmovd %r8d,%xmm8
+ .byte 196,66,125,88,192 // vpbroadcastd %xmm8,%ymm8
+ .byte 196,66,53,184,194 // vfmadd231ps %ymm10,%ymm9,%ymm8
+ .byte 197,124,17,128,128,0,0,0 // vmovups %ymm8,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_p3x_hsw
+.globl _sk_bicubic_p3x_hsw
+_sk_bicubic_p3x_hsw:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,192,63 // mov $0x3fc00000,%r8d
+ .byte 196,193,121,110,192 // vmovd %r8d,%xmm0
+ .byte 196,226,125,88,192 // vpbroadcastd %xmm0,%ymm0
+ .byte 197,252,88,0 // vaddps (%rax),%ymm0,%ymm0
+ .byte 197,124,16,64,64 // vmovups 0x40(%rax),%ymm8
+ .byte 196,65,60,89,200 // vmulps %ymm8,%ymm8,%ymm9
+ .byte 65,184,114,28,199,62 // mov $0x3ec71c72,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,66,125,88,210 // vpbroadcastd %xmm10,%ymm10
+ .byte 65,184,171,170,170,190 // mov $0xbeaaaaab,%r8d
+ .byte 196,65,121,110,216 // vmovd %r8d,%xmm11
+ .byte 196,66,125,88,219 // vpbroadcastd %xmm11,%ymm11
+ .byte 196,66,61,168,211 // vfmadd213ps %ymm11,%ymm8,%ymm10
+ .byte 196,65,52,89,194 // vmulps %ymm10,%ymm9,%ymm8
+ .byte 197,124,17,128,128,0,0,0 // vmovups %ymm8,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_n3y_hsw
+.globl _sk_bicubic_n3y_hsw
+_sk_bicubic_n3y_hsw:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,192,191 // mov $0xbfc00000,%r8d
+ .byte 196,193,121,110,200 // vmovd %r8d,%xmm1
+ .byte 196,226,125,88,201 // vpbroadcastd %xmm1,%ymm1
+ .byte 197,244,88,72,32 // vaddps 0x20(%rax),%ymm1,%ymm1
+ .byte 65,184,0,0,128,63 // mov $0x3f800000,%r8d
+ .byte 196,65,121,110,192 // vmovd %r8d,%xmm8
+ .byte 196,66,125,88,192 // vpbroadcastd %xmm8,%ymm8
+ .byte 197,60,92,64,96 // vsubps 0x60(%rax),%ymm8,%ymm8
+ .byte 196,65,60,89,200 // vmulps %ymm8,%ymm8,%ymm9
+ .byte 65,184,114,28,199,62 // mov $0x3ec71c72,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,66,125,88,210 // vpbroadcastd %xmm10,%ymm10
+ .byte 65,184,171,170,170,190 // mov $0xbeaaaaab,%r8d
+ .byte 196,65,121,110,216 // vmovd %r8d,%xmm11
+ .byte 196,66,125,88,219 // vpbroadcastd %xmm11,%ymm11
+ .byte 196,66,61,168,211 // vfmadd213ps %ymm11,%ymm8,%ymm10
+ .byte 196,65,44,89,193 // vmulps %ymm9,%ymm10,%ymm8
+ .byte 197,124,17,128,160,0,0,0 // vmovups %ymm8,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_n1y_hsw
+.globl _sk_bicubic_n1y_hsw
+_sk_bicubic_n1y_hsw:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,0,191 // mov $0xbf000000,%r8d
+ .byte 196,193,121,110,200 // vmovd %r8d,%xmm1
+ .byte 196,226,125,88,201 // vpbroadcastd %xmm1,%ymm1
+ .byte 197,244,88,72,32 // vaddps 0x20(%rax),%ymm1,%ymm1
+ .byte 65,184,0,0,128,63 // mov $0x3f800000,%r8d
+ .byte 196,65,121,110,192 // vmovd %r8d,%xmm8
+ .byte 196,66,125,88,192 // vpbroadcastd %xmm8,%ymm8
+ .byte 197,60,92,64,96 // vsubps 0x60(%rax),%ymm8,%ymm8
+ .byte 65,184,85,85,149,191 // mov $0xbf955555,%r8d
+ .byte 196,65,121,110,200 // vmovd %r8d,%xmm9
+ .byte 196,66,125,88,201 // vpbroadcastd %xmm9,%ymm9
+ .byte 65,184,0,0,192,63 // mov $0x3fc00000,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,66,125,88,210 // vpbroadcastd %xmm10,%ymm10
+ .byte 196,66,61,168,202 // vfmadd213ps %ymm10,%ymm8,%ymm9
+ .byte 65,184,0,0,0,63 // mov $0x3f000000,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,66,125,88,210 // vpbroadcastd %xmm10,%ymm10
+ .byte 196,66,61,184,209 // vfmadd231ps %ymm9,%ymm8,%ymm10
+ .byte 65,184,57,142,99,61 // mov $0x3d638e39,%r8d
+ .byte 196,65,121,110,200 // vmovd %r8d,%xmm9
+ .byte 196,66,125,88,201 // vpbroadcastd %xmm9,%ymm9
+ .byte 196,66,61,184,202 // vfmadd231ps %ymm10,%ymm8,%ymm9
+ .byte 197,124,17,136,160,0,0,0 // vmovups %ymm9,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_p1y_hsw
+.globl _sk_bicubic_p1y_hsw
+_sk_bicubic_p1y_hsw:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,0,63 // mov $0x3f000000,%r8d
+ .byte 196,193,121,110,200 // vmovd %r8d,%xmm1
+ .byte 196,98,125,88,193 // vpbroadcastd %xmm1,%ymm8
+ .byte 197,188,88,72,32 // vaddps 0x20(%rax),%ymm8,%ymm1
+ .byte 197,124,16,72,96 // vmovups 0x60(%rax),%ymm9
+ .byte 65,184,85,85,149,191 // mov $0xbf955555,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,66,125,88,210 // vpbroadcastd %xmm10,%ymm10
+ .byte 65,184,0,0,192,63 // mov $0x3fc00000,%r8d
+ .byte 196,65,121,110,216 // vmovd %r8d,%xmm11
+ .byte 196,66,125,88,219 // vpbroadcastd %xmm11,%ymm11
+ .byte 196,66,53,168,211 // vfmadd213ps %ymm11,%ymm9,%ymm10
+ .byte 196,66,53,168,208 // vfmadd213ps %ymm8,%ymm9,%ymm10
+ .byte 65,184,57,142,99,61 // mov $0x3d638e39,%r8d
+ .byte 196,65,121,110,192 // vmovd %r8d,%xmm8
+ .byte 196,66,125,88,192 // vpbroadcastd %xmm8,%ymm8
+ .byte 196,66,53,184,194 // vfmadd231ps %ymm10,%ymm9,%ymm8
+ .byte 197,124,17,128,160,0,0,0 // vmovups %ymm8,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_p3y_hsw
+.globl _sk_bicubic_p3y_hsw
+_sk_bicubic_p3y_hsw:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,192,63 // mov $0x3fc00000,%r8d
+ .byte 196,193,121,110,200 // vmovd %r8d,%xmm1
+ .byte 196,226,125,88,201 // vpbroadcastd %xmm1,%ymm1
+ .byte 197,244,88,72,32 // vaddps 0x20(%rax),%ymm1,%ymm1
+ .byte 197,124,16,64,96 // vmovups 0x60(%rax),%ymm8
+ .byte 196,65,60,89,200 // vmulps %ymm8,%ymm8,%ymm9
+ .byte 65,184,114,28,199,62 // mov $0x3ec71c72,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,66,125,88,210 // vpbroadcastd %xmm10,%ymm10
+ .byte 65,184,171,170,170,190 // mov $0xbeaaaaab,%r8d
+ .byte 196,65,121,110,216 // vmovd %r8d,%xmm11
+ .byte 196,66,125,88,219 // vpbroadcastd %xmm11,%ymm11
+ .byte 196,66,61,168,211 // vfmadd213ps %ymm11,%ymm8,%ymm10
+ .byte 196,65,52,89,194 // vmulps %ymm10,%ymm9,%ymm8
+ .byte 197,124,17,128,160,0,0,0 // vmovups %ymm8,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
HIDDEN _sk_start_pipeline_avx
.globl _sk_start_pipeline_avx
_sk_start_pipeline_avx:
@@ -12924,6 +13810,364 @@ _sk_linear_gradient_2stops_avx:
.byte 197,124,41,192 // vmovaps %ymm8,%ymm0
.byte 255,224 // jmpq *%rax
+HIDDEN _sk_save_xy_avx
+.globl _sk_save_xy_avx
+_sk_save_xy_avx:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,0,63 // mov $0x3f000000,%r8d
+ .byte 196,65,121,110,192 // vmovd %r8d,%xmm8
+ .byte 196,67,121,4,192,0 // vpermilps $0x0,%xmm8,%xmm8
+ .byte 196,67,61,24,192,1 // vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
+ .byte 197,60,88,200 // vaddps %ymm0,%ymm8,%ymm9
+ .byte 196,67,125,8,209,1 // vroundps $0x1,%ymm9,%ymm10
+ .byte 196,65,52,92,202 // vsubps %ymm10,%ymm9,%ymm9
+ .byte 197,60,88,193 // vaddps %ymm1,%ymm8,%ymm8
+ .byte 196,67,125,8,208,1 // vroundps $0x1,%ymm8,%ymm10
+ .byte 196,65,60,92,194 // vsubps %ymm10,%ymm8,%ymm8
+ .byte 197,252,17,0 // vmovups %ymm0,(%rax)
+ .byte 197,252,17,72,32 // vmovups %ymm1,0x20(%rax)
+ .byte 197,124,17,72,64 // vmovups %ymm9,0x40(%rax)
+ .byte 197,124,17,64,96 // vmovups %ymm8,0x60(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_accumulate_avx
+.globl _sk_accumulate_avx
+_sk_accumulate_avx:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 197,124,16,128,128,0,0,0 // vmovups 0x80(%rax),%ymm8
+ .byte 197,60,89,128,160,0,0,0 // vmulps 0xa0(%rax),%ymm8,%ymm8
+ .byte 197,60,89,200 // vmulps %ymm0,%ymm8,%ymm9
+ .byte 197,180,88,228 // vaddps %ymm4,%ymm9,%ymm4
+ .byte 197,60,89,201 // vmulps %ymm1,%ymm8,%ymm9
+ .byte 197,180,88,237 // vaddps %ymm5,%ymm9,%ymm5
+ .byte 197,60,89,202 // vmulps %ymm2,%ymm8,%ymm9
+ .byte 197,180,88,246 // vaddps %ymm6,%ymm9,%ymm6
+ .byte 197,60,89,195 // vmulps %ymm3,%ymm8,%ymm8
+ .byte 197,188,88,255 // vaddps %ymm7,%ymm8,%ymm7
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bilinear_nx_avx
+.globl _sk_bilinear_nx_avx
+_sk_bilinear_nx_avx:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,0,191 // mov $0xbf000000,%r8d
+ .byte 196,193,121,110,192 // vmovd %r8d,%xmm0
+ .byte 196,227,121,4,192,0 // vpermilps $0x0,%xmm0,%xmm0
+ .byte 196,227,125,24,192,1 // vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
+ .byte 197,252,88,0 // vaddps (%rax),%ymm0,%ymm0
+ .byte 65,184,0,0,128,63 // mov $0x3f800000,%r8d
+ .byte 196,65,121,110,192 // vmovd %r8d,%xmm8
+ .byte 196,67,121,4,192,0 // vpermilps $0x0,%xmm8,%xmm8
+ .byte 196,67,61,24,192,1 // vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
+ .byte 197,60,92,64,64 // vsubps 0x40(%rax),%ymm8,%ymm8
+ .byte 197,124,17,128,128,0,0,0 // vmovups %ymm8,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bilinear_px_avx
+.globl _sk_bilinear_px_avx
+_sk_bilinear_px_avx:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,0,63 // mov $0x3f000000,%r8d
+ .byte 196,193,121,110,192 // vmovd %r8d,%xmm0
+ .byte 196,227,121,4,192,0 // vpermilps $0x0,%xmm0,%xmm0
+ .byte 196,227,125,24,192,1 // vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
+ .byte 197,124,16,64,64 // vmovups 0x40(%rax),%ymm8
+ .byte 197,252,88,0 // vaddps (%rax),%ymm0,%ymm0
+ .byte 197,124,17,128,128,0,0,0 // vmovups %ymm8,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bilinear_ny_avx
+.globl _sk_bilinear_ny_avx
+_sk_bilinear_ny_avx:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,0,191 // mov $0xbf000000,%r8d
+ .byte 196,193,121,110,200 // vmovd %r8d,%xmm1
+ .byte 196,227,121,4,201,0 // vpermilps $0x0,%xmm1,%xmm1
+ .byte 196,227,117,24,201,1 // vinsertf128 $0x1,%xmm1,%ymm1,%ymm1
+ .byte 197,244,88,72,32 // vaddps 0x20(%rax),%ymm1,%ymm1
+ .byte 65,184,0,0,128,63 // mov $0x3f800000,%r8d
+ .byte 196,65,121,110,192 // vmovd %r8d,%xmm8
+ .byte 196,67,121,4,192,0 // vpermilps $0x0,%xmm8,%xmm8
+ .byte 196,67,61,24,192,1 // vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
+ .byte 197,60,92,64,96 // vsubps 0x60(%rax),%ymm8,%ymm8
+ .byte 197,124,17,128,160,0,0,0 // vmovups %ymm8,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bilinear_py_avx
+.globl _sk_bilinear_py_avx
+_sk_bilinear_py_avx:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,0,63 // mov $0x3f000000,%r8d
+ .byte 196,193,121,110,200 // vmovd %r8d,%xmm1
+ .byte 196,227,121,4,201,0 // vpermilps $0x0,%xmm1,%xmm1
+ .byte 196,227,117,24,201,1 // vinsertf128 $0x1,%xmm1,%ymm1,%ymm1
+ .byte 197,124,16,64,96 // vmovups 0x60(%rax),%ymm8
+ .byte 197,244,88,72,32 // vaddps 0x20(%rax),%ymm1,%ymm1
+ .byte 197,124,17,128,160,0,0,0 // vmovups %ymm8,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_n3x_avx
+.globl _sk_bicubic_n3x_avx
+_sk_bicubic_n3x_avx:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,192,191 // mov $0xbfc00000,%r8d
+ .byte 196,193,121,110,192 // vmovd %r8d,%xmm0
+ .byte 196,227,121,4,192,0 // vpermilps $0x0,%xmm0,%xmm0
+ .byte 196,227,125,24,192,1 // vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
+ .byte 197,252,88,0 // vaddps (%rax),%ymm0,%ymm0
+ .byte 65,184,0,0,128,63 // mov $0x3f800000,%r8d
+ .byte 196,65,121,110,192 // vmovd %r8d,%xmm8
+ .byte 196,67,121,4,192,0 // vpermilps $0x0,%xmm8,%xmm8
+ .byte 196,67,61,24,192,1 // vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
+ .byte 197,60,92,64,64 // vsubps 0x40(%rax),%ymm8,%ymm8
+ .byte 196,65,60,89,200 // vmulps %ymm8,%ymm8,%ymm9
+ .byte 65,184,114,28,199,62 // mov $0x3ec71c72,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,67,121,4,210,0 // vpermilps $0x0,%xmm10,%xmm10
+ .byte 196,67,45,24,210,1 // vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ .byte 65,184,171,170,170,190 // mov $0xbeaaaaab,%r8d
+ .byte 196,65,121,110,216 // vmovd %r8d,%xmm11
+ .byte 196,67,121,4,219,0 // vpermilps $0x0,%xmm11,%xmm11
+ .byte 196,67,37,24,219,1 // vinsertf128 $0x1,%xmm11,%ymm11,%ymm11
+ .byte 196,65,44,89,192 // vmulps %ymm8,%ymm10,%ymm8
+ .byte 196,65,60,88,195 // vaddps %ymm11,%ymm8,%ymm8
+ .byte 196,65,52,89,192 // vmulps %ymm8,%ymm9,%ymm8
+ .byte 197,124,17,128,128,0,0,0 // vmovups %ymm8,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_n1x_avx
+.globl _sk_bicubic_n1x_avx
+_sk_bicubic_n1x_avx:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,0,191 // mov $0xbf000000,%r8d
+ .byte 196,193,121,110,192 // vmovd %r8d,%xmm0
+ .byte 196,227,121,4,192,0 // vpermilps $0x0,%xmm0,%xmm0
+ .byte 196,227,125,24,192,1 // vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
+ .byte 197,252,88,0 // vaddps (%rax),%ymm0,%ymm0
+ .byte 65,184,0,0,128,63 // mov $0x3f800000,%r8d
+ .byte 196,65,121,110,192 // vmovd %r8d,%xmm8
+ .byte 196,67,121,4,192,0 // vpermilps $0x0,%xmm8,%xmm8
+ .byte 196,67,61,24,192,1 // vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
+ .byte 197,60,92,64,64 // vsubps 0x40(%rax),%ymm8,%ymm8
+ .byte 65,184,85,85,149,191 // mov $0xbf955555,%r8d
+ .byte 196,65,121,110,200 // vmovd %r8d,%xmm9
+ .byte 196,67,121,4,201,0 // vpermilps $0x0,%xmm9,%xmm9
+ .byte 196,67,53,24,201,1 // vinsertf128 $0x1,%xmm9,%ymm9,%ymm9
+ .byte 65,184,0,0,192,63 // mov $0x3fc00000,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,67,121,4,210,0 // vpermilps $0x0,%xmm10,%xmm10
+ .byte 196,67,45,24,210,1 // vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ .byte 196,65,52,89,200 // vmulps %ymm8,%ymm9,%ymm9
+ .byte 196,65,52,88,202 // vaddps %ymm10,%ymm9,%ymm9
+ .byte 65,184,0,0,0,63 // mov $0x3f000000,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,67,121,4,210,0 // vpermilps $0x0,%xmm10,%xmm10
+ .byte 196,67,45,24,210,1 // vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ .byte 196,65,60,89,201 // vmulps %ymm9,%ymm8,%ymm9
+ .byte 196,65,44,88,201 // vaddps %ymm9,%ymm10,%ymm9
+ .byte 65,184,57,142,99,61 // mov $0x3d638e39,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,67,121,4,210,0 // vpermilps $0x0,%xmm10,%xmm10
+ .byte 196,67,45,24,210,1 // vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ .byte 196,65,60,89,193 // vmulps %ymm9,%ymm8,%ymm8
+ .byte 196,65,44,88,192 // vaddps %ymm8,%ymm10,%ymm8
+ .byte 197,124,17,128,128,0,0,0 // vmovups %ymm8,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_p1x_avx
+.globl _sk_bicubic_p1x_avx
+_sk_bicubic_p1x_avx:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,0,63 // mov $0x3f000000,%r8d
+ .byte 196,193,121,110,192 // vmovd %r8d,%xmm0
+ .byte 196,227,121,4,192,0 // vpermilps $0x0,%xmm0,%xmm0
+ .byte 196,99,125,24,192,1 // vinsertf128 $0x1,%xmm0,%ymm0,%ymm8
+ .byte 197,188,88,0 // vaddps (%rax),%ymm8,%ymm0
+ .byte 197,124,16,72,64 // vmovups 0x40(%rax),%ymm9
+ .byte 65,184,85,85,149,191 // mov $0xbf955555,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,67,121,4,210,0 // vpermilps $0x0,%xmm10,%xmm10
+ .byte 196,67,45,24,210,1 // vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ .byte 65,184,0,0,192,63 // mov $0x3fc00000,%r8d
+ .byte 196,65,121,110,216 // vmovd %r8d,%xmm11
+ .byte 196,67,121,4,219,0 // vpermilps $0x0,%xmm11,%xmm11
+ .byte 196,67,37,24,219,1 // vinsertf128 $0x1,%xmm11,%ymm11,%ymm11
+ .byte 196,65,52,89,210 // vmulps %ymm10,%ymm9,%ymm10
+ .byte 196,65,44,88,211 // vaddps %ymm11,%ymm10,%ymm10
+ .byte 196,65,52,89,210 // vmulps %ymm10,%ymm9,%ymm10
+ .byte 196,65,60,88,194 // vaddps %ymm10,%ymm8,%ymm8
+ .byte 65,184,57,142,99,61 // mov $0x3d638e39,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,67,121,4,210,0 // vpermilps $0x0,%xmm10,%xmm10
+ .byte 196,67,45,24,210,1 // vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ .byte 196,65,52,89,192 // vmulps %ymm8,%ymm9,%ymm8
+ .byte 196,65,44,88,192 // vaddps %ymm8,%ymm10,%ymm8
+ .byte 197,124,17,128,128,0,0,0 // vmovups %ymm8,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_p3x_avx
+.globl _sk_bicubic_p3x_avx
+_sk_bicubic_p3x_avx:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,192,63 // mov $0x3fc00000,%r8d
+ .byte 196,193,121,110,192 // vmovd %r8d,%xmm0
+ .byte 196,227,121,4,192,0 // vpermilps $0x0,%xmm0,%xmm0
+ .byte 196,227,125,24,192,1 // vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
+ .byte 197,252,88,0 // vaddps (%rax),%ymm0,%ymm0
+ .byte 197,124,16,64,64 // vmovups 0x40(%rax),%ymm8
+ .byte 196,65,60,89,200 // vmulps %ymm8,%ymm8,%ymm9
+ .byte 65,184,114,28,199,62 // mov $0x3ec71c72,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,67,121,4,210,0 // vpermilps $0x0,%xmm10,%xmm10
+ .byte 196,67,45,24,210,1 // vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ .byte 65,184,171,170,170,190 // mov $0xbeaaaaab,%r8d
+ .byte 196,65,121,110,216 // vmovd %r8d,%xmm11
+ .byte 196,67,121,4,219,0 // vpermilps $0x0,%xmm11,%xmm11
+ .byte 196,67,37,24,219,1 // vinsertf128 $0x1,%xmm11,%ymm11,%ymm11
+ .byte 196,65,60,89,194 // vmulps %ymm10,%ymm8,%ymm8
+ .byte 196,65,60,88,195 // vaddps %ymm11,%ymm8,%ymm8
+ .byte 196,65,52,89,192 // vmulps %ymm8,%ymm9,%ymm8
+ .byte 197,124,17,128,128,0,0,0 // vmovups %ymm8,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_n3y_avx
+.globl _sk_bicubic_n3y_avx
+_sk_bicubic_n3y_avx:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,192,191 // mov $0xbfc00000,%r8d
+ .byte 196,193,121,110,200 // vmovd %r8d,%xmm1
+ .byte 196,227,121,4,201,0 // vpermilps $0x0,%xmm1,%xmm1
+ .byte 196,227,117,24,201,1 // vinsertf128 $0x1,%xmm1,%ymm1,%ymm1
+ .byte 197,244,88,72,32 // vaddps 0x20(%rax),%ymm1,%ymm1
+ .byte 65,184,0,0,128,63 // mov $0x3f800000,%r8d
+ .byte 196,65,121,110,192 // vmovd %r8d,%xmm8
+ .byte 196,67,121,4,192,0 // vpermilps $0x0,%xmm8,%xmm8
+ .byte 196,67,61,24,192,1 // vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
+ .byte 197,60,92,64,96 // vsubps 0x60(%rax),%ymm8,%ymm8
+ .byte 196,65,60,89,200 // vmulps %ymm8,%ymm8,%ymm9
+ .byte 65,184,114,28,199,62 // mov $0x3ec71c72,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,67,121,4,210,0 // vpermilps $0x0,%xmm10,%xmm10
+ .byte 196,67,45,24,210,1 // vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ .byte 65,184,171,170,170,190 // mov $0xbeaaaaab,%r8d
+ .byte 196,65,121,110,216 // vmovd %r8d,%xmm11
+ .byte 196,67,121,4,219,0 // vpermilps $0x0,%xmm11,%xmm11
+ .byte 196,67,37,24,219,1 // vinsertf128 $0x1,%xmm11,%ymm11,%ymm11
+ .byte 196,65,44,89,192 // vmulps %ymm8,%ymm10,%ymm8
+ .byte 196,65,60,88,195 // vaddps %ymm11,%ymm8,%ymm8
+ .byte 196,65,52,89,192 // vmulps %ymm8,%ymm9,%ymm8
+ .byte 197,124,17,128,160,0,0,0 // vmovups %ymm8,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_n1y_avx
+.globl _sk_bicubic_n1y_avx
+_sk_bicubic_n1y_avx:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,0,191 // mov $0xbf000000,%r8d
+ .byte 196,193,121,110,200 // vmovd %r8d,%xmm1
+ .byte 196,227,121,4,201,0 // vpermilps $0x0,%xmm1,%xmm1
+ .byte 196,227,117,24,201,1 // vinsertf128 $0x1,%xmm1,%ymm1,%ymm1
+ .byte 197,244,88,72,32 // vaddps 0x20(%rax),%ymm1,%ymm1
+ .byte 65,184,0,0,128,63 // mov $0x3f800000,%r8d
+ .byte 196,65,121,110,192 // vmovd %r8d,%xmm8
+ .byte 196,67,121,4,192,0 // vpermilps $0x0,%xmm8,%xmm8
+ .byte 196,67,61,24,192,1 // vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
+ .byte 197,60,92,64,96 // vsubps 0x60(%rax),%ymm8,%ymm8
+ .byte 65,184,85,85,149,191 // mov $0xbf955555,%r8d
+ .byte 196,65,121,110,200 // vmovd %r8d,%xmm9
+ .byte 196,67,121,4,201,0 // vpermilps $0x0,%xmm9,%xmm9
+ .byte 196,67,53,24,201,1 // vinsertf128 $0x1,%xmm9,%ymm9,%ymm9
+ .byte 65,184,0,0,192,63 // mov $0x3fc00000,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,67,121,4,210,0 // vpermilps $0x0,%xmm10,%xmm10
+ .byte 196,67,45,24,210,1 // vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ .byte 196,65,52,89,200 // vmulps %ymm8,%ymm9,%ymm9
+ .byte 196,65,52,88,202 // vaddps %ymm10,%ymm9,%ymm9
+ .byte 65,184,0,0,0,63 // mov $0x3f000000,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,67,121,4,210,0 // vpermilps $0x0,%xmm10,%xmm10
+ .byte 196,67,45,24,210,1 // vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ .byte 196,65,60,89,201 // vmulps %ymm9,%ymm8,%ymm9
+ .byte 196,65,44,88,201 // vaddps %ymm9,%ymm10,%ymm9
+ .byte 65,184,57,142,99,61 // mov $0x3d638e39,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,67,121,4,210,0 // vpermilps $0x0,%xmm10,%xmm10
+ .byte 196,67,45,24,210,1 // vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ .byte 196,65,60,89,193 // vmulps %ymm9,%ymm8,%ymm8
+ .byte 196,65,44,88,192 // vaddps %ymm8,%ymm10,%ymm8
+ .byte 197,124,17,128,160,0,0,0 // vmovups %ymm8,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_p1y_avx
+.globl _sk_bicubic_p1y_avx
+_sk_bicubic_p1y_avx:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,0,63 // mov $0x3f000000,%r8d
+ .byte 196,193,121,110,200 // vmovd %r8d,%xmm1
+ .byte 196,227,121,4,201,0 // vpermilps $0x0,%xmm1,%xmm1
+ .byte 196,99,117,24,193,1 // vinsertf128 $0x1,%xmm1,%ymm1,%ymm8
+ .byte 197,188,88,72,32 // vaddps 0x20(%rax),%ymm8,%ymm1
+ .byte 197,124,16,72,96 // vmovups 0x60(%rax),%ymm9
+ .byte 65,184,85,85,149,191 // mov $0xbf955555,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,67,121,4,210,0 // vpermilps $0x0,%xmm10,%xmm10
+ .byte 196,67,45,24,210,1 // vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ .byte 65,184,0,0,192,63 // mov $0x3fc00000,%r8d
+ .byte 196,65,121,110,216 // vmovd %r8d,%xmm11
+ .byte 196,67,121,4,219,0 // vpermilps $0x0,%xmm11,%xmm11
+ .byte 196,67,37,24,219,1 // vinsertf128 $0x1,%xmm11,%ymm11,%ymm11
+ .byte 196,65,52,89,210 // vmulps %ymm10,%ymm9,%ymm10
+ .byte 196,65,44,88,211 // vaddps %ymm11,%ymm10,%ymm10
+ .byte 196,65,52,89,210 // vmulps %ymm10,%ymm9,%ymm10
+ .byte 196,65,60,88,194 // vaddps %ymm10,%ymm8,%ymm8
+ .byte 65,184,57,142,99,61 // mov $0x3d638e39,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,67,121,4,210,0 // vpermilps $0x0,%xmm10,%xmm10
+ .byte 196,67,45,24,210,1 // vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ .byte 196,65,52,89,192 // vmulps %ymm8,%ymm9,%ymm8
+ .byte 196,65,44,88,192 // vaddps %ymm8,%ymm10,%ymm8
+ .byte 197,124,17,128,160,0,0,0 // vmovups %ymm8,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_p3y_avx
+.globl _sk_bicubic_p3y_avx
+_sk_bicubic_p3y_avx:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,184,0,0,192,63 // mov $0x3fc00000,%r8d
+ .byte 196,193,121,110,200 // vmovd %r8d,%xmm1
+ .byte 196,227,121,4,201,0 // vpermilps $0x0,%xmm1,%xmm1
+ .byte 196,227,117,24,201,1 // vinsertf128 $0x1,%xmm1,%ymm1,%ymm1
+ .byte 197,244,88,72,32 // vaddps 0x20(%rax),%ymm1,%ymm1
+ .byte 197,124,16,64,96 // vmovups 0x60(%rax),%ymm8
+ .byte 196,65,60,89,200 // vmulps %ymm8,%ymm8,%ymm9
+ .byte 65,184,114,28,199,62 // mov $0x3ec71c72,%r8d
+ .byte 196,65,121,110,208 // vmovd %r8d,%xmm10
+ .byte 196,67,121,4,210,0 // vpermilps $0x0,%xmm10,%xmm10
+ .byte 196,67,45,24,210,1 // vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ .byte 65,184,171,170,170,190 // mov $0xbeaaaaab,%r8d
+ .byte 196,65,121,110,216 // vmovd %r8d,%xmm11
+ .byte 196,67,121,4,219,0 // vpermilps $0x0,%xmm11,%xmm11
+ .byte 196,67,37,24,219,1 // vinsertf128 $0x1,%xmm11,%ymm11,%ymm11
+ .byte 196,65,60,89,194 // vmulps %ymm10,%ymm8,%ymm8
+ .byte 196,65,60,88,195 // vaddps %ymm11,%ymm8,%ymm8
+ .byte 196,65,52,89,192 // vmulps %ymm8,%ymm9,%ymm8
+ .byte 197,124,17,128,160,0,0,0 // vmovups %ymm8,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
HIDDEN _sk_start_pipeline_sse41
.globl _sk_start_pipeline_sse41
_sk_start_pipeline_sse41:
@@ -16164,6 +17408,346 @@ _sk_linear_gradient_2stops_sse41:
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
+HIDDEN _sk_save_xy_sse41
+.globl _sk_save_xy_sse41
+_sk_save_xy_sse41:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,0,63 // mov $0x3f000000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,40,200 // movaps %xmm8,%xmm9
+ .byte 68,15,88,200 // addps %xmm0,%xmm9
+ .byte 102,69,15,58,8,209,1 // roundps $0x1,%xmm9,%xmm10
+ .byte 69,15,92,202 // subps %xmm10,%xmm9
+ .byte 68,15,88,193 // addps %xmm1,%xmm8
+ .byte 102,69,15,58,8,208,1 // roundps $0x1,%xmm8,%xmm10
+ .byte 69,15,92,194 // subps %xmm10,%xmm8
+ .byte 15,17,0 // movups %xmm0,(%rax)
+ .byte 15,17,72,32 // movups %xmm1,0x20(%rax)
+ .byte 68,15,17,72,64 // movups %xmm9,0x40(%rax)
+ .byte 68,15,17,64,96 // movups %xmm8,0x60(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_accumulate_sse41
+.globl _sk_accumulate_sse41
+_sk_accumulate_sse41:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 68,15,16,128,128,0,0,0 // movups 0x80(%rax),%xmm8
+ .byte 68,15,16,136,160,0,0,0 // movups 0xa0(%rax),%xmm9
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 69,15,40,193 // movaps %xmm9,%xmm8
+ .byte 68,15,89,192 // mulps %xmm0,%xmm8
+ .byte 65,15,88,224 // addps %xmm8,%xmm4
+ .byte 69,15,40,193 // movaps %xmm9,%xmm8
+ .byte 68,15,89,193 // mulps %xmm1,%xmm8
+ .byte 65,15,88,232 // addps %xmm8,%xmm5
+ .byte 69,15,40,193 // movaps %xmm9,%xmm8
+ .byte 68,15,89,194 // mulps %xmm2,%xmm8
+ .byte 65,15,88,240 // addps %xmm8,%xmm6
+ .byte 68,15,89,203 // mulps %xmm3,%xmm9
+ .byte 65,15,88,249 // addps %xmm9,%xmm7
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bilinear_nx_sse41
+.globl _sk_bilinear_nx_sse41
+_sk_bilinear_nx_sse41:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,0,191 // mov $0xbf000000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,0 // movups (%rax),%xmm0
+ .byte 68,15,16,72,64 // movups 0x40(%rax),%xmm9
+ .byte 65,15,88,192 // addps %xmm8,%xmm0
+ .byte 185,0,0,128,63 // mov $0x3f800000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,92,193 // subps %xmm9,%xmm8
+ .byte 68,15,17,128,128,0,0,0 // movups %xmm8,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bilinear_px_sse41
+.globl _sk_bilinear_px_sse41
+_sk_bilinear_px_sse41:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,0,63 // mov $0x3f000000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,0 // movups (%rax),%xmm0
+ .byte 68,15,16,72,64 // movups 0x40(%rax),%xmm9
+ .byte 65,15,88,192 // addps %xmm8,%xmm0
+ .byte 68,15,17,136,128,0,0,0 // movups %xmm9,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bilinear_ny_sse41
+.globl _sk_bilinear_ny_sse41
+_sk_bilinear_ny_sse41:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,0,191 // mov $0xbf000000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,72,32 // movups 0x20(%rax),%xmm1
+ .byte 68,15,16,72,96 // movups 0x60(%rax),%xmm9
+ .byte 65,15,88,200 // addps %xmm8,%xmm1
+ .byte 185,0,0,128,63 // mov $0x3f800000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,92,193 // subps %xmm9,%xmm8
+ .byte 68,15,17,128,160,0,0,0 // movups %xmm8,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bilinear_py_sse41
+.globl _sk_bilinear_py_sse41
+_sk_bilinear_py_sse41:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,0,63 // mov $0x3f000000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,72,32 // movups 0x20(%rax),%xmm1
+ .byte 68,15,16,72,96 // movups 0x60(%rax),%xmm9
+ .byte 65,15,88,200 // addps %xmm8,%xmm1
+ .byte 68,15,17,136,160,0,0,0 // movups %xmm9,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_n3x_sse41
+.globl _sk_bicubic_n3x_sse41
+_sk_bicubic_n3x_sse41:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,192,191 // mov $0xbfc00000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,0 // movups (%rax),%xmm0
+ .byte 68,15,16,72,64 // movups 0x40(%rax),%xmm9
+ .byte 65,15,88,192 // addps %xmm8,%xmm0
+ .byte 185,0,0,128,63 // mov $0x3f800000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,92,193 // subps %xmm9,%xmm8
+ .byte 185,114,28,199,62 // mov $0x3ec71c72,%ecx
+ .byte 102,68,15,110,201 // movd %ecx,%xmm9
+ .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9
+ .byte 185,171,170,170,190 // mov $0xbeaaaaab,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 69,15,89,192 // mulps %xmm8,%xmm8
+ .byte 69,15,88,202 // addps %xmm10,%xmm9
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 68,15,17,136,128,0,0,0 // movups %xmm9,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_n1x_sse41
+.globl _sk_bicubic_n1x_sse41
+_sk_bicubic_n1x_sse41:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,0,191 // mov $0xbf000000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,0 // movups (%rax),%xmm0
+ .byte 68,15,16,72,64 // movups 0x40(%rax),%xmm9
+ .byte 65,15,88,192 // addps %xmm8,%xmm0
+ .byte 185,0,0,128,63 // mov $0x3f800000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,92,193 // subps %xmm9,%xmm8
+ .byte 185,85,85,149,191 // mov $0xbf955555,%ecx
+ .byte 102,68,15,110,201 // movd %ecx,%xmm9
+ .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9
+ .byte 185,0,0,192,63 // mov $0x3fc00000,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 69,15,88,202 // addps %xmm10,%xmm9
+ .byte 185,0,0,0,63 // mov $0x3f000000,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 69,15,88,202 // addps %xmm10,%xmm9
+ .byte 185,57,142,99,61 // mov $0x3d638e39,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 69,15,88,202 // addps %xmm10,%xmm9
+ .byte 68,15,17,136,128,0,0,0 // movups %xmm9,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_p1x_sse41
+.globl _sk_bicubic_p1x_sse41
+_sk_bicubic_p1x_sse41:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,0,63 // mov $0x3f000000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,0 // movups (%rax),%xmm0
+ .byte 68,15,16,72,64 // movups 0x40(%rax),%xmm9
+ .byte 65,15,88,192 // addps %xmm8,%xmm0
+ .byte 185,85,85,149,191 // mov $0xbf955555,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 185,0,0,192,63 // mov $0x3fc00000,%ecx
+ .byte 102,68,15,110,217 // movd %ecx,%xmm11
+ .byte 69,15,198,219,0 // shufps $0x0,%xmm11,%xmm11
+ .byte 69,15,89,209 // mulps %xmm9,%xmm10
+ .byte 69,15,88,211 // addps %xmm11,%xmm10
+ .byte 69,15,89,209 // mulps %xmm9,%xmm10
+ .byte 69,15,88,208 // addps %xmm8,%xmm10
+ .byte 185,57,142,99,61 // mov $0x3d638e39,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,89,209 // mulps %xmm9,%xmm10
+ .byte 69,15,88,208 // addps %xmm8,%xmm10
+ .byte 68,15,17,144,128,0,0,0 // movups %xmm10,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_p3x_sse41
+.globl _sk_bicubic_p3x_sse41
+_sk_bicubic_p3x_sse41:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,192,63 // mov $0x3fc00000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,0 // movups (%rax),%xmm0
+ .byte 68,15,16,72,64 // movups 0x40(%rax),%xmm9
+ .byte 65,15,88,192 // addps %xmm8,%xmm0
+ .byte 185,114,28,199,62 // mov $0x3ec71c72,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,89,193 // mulps %xmm9,%xmm8
+ .byte 69,15,89,201 // mulps %xmm9,%xmm9
+ .byte 185,171,170,170,190 // mov $0xbeaaaaab,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,88,194 // addps %xmm10,%xmm8
+ .byte 69,15,89,193 // mulps %xmm9,%xmm8
+ .byte 68,15,17,128,128,0,0,0 // movups %xmm8,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_n3y_sse41
+.globl _sk_bicubic_n3y_sse41
+_sk_bicubic_n3y_sse41:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,192,191 // mov $0xbfc00000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,72,32 // movups 0x20(%rax),%xmm1
+ .byte 68,15,16,72,96 // movups 0x60(%rax),%xmm9
+ .byte 65,15,88,200 // addps %xmm8,%xmm1
+ .byte 185,0,0,128,63 // mov $0x3f800000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,92,193 // subps %xmm9,%xmm8
+ .byte 185,114,28,199,62 // mov $0x3ec71c72,%ecx
+ .byte 102,68,15,110,201 // movd %ecx,%xmm9
+ .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9
+ .byte 185,171,170,170,190 // mov $0xbeaaaaab,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 69,15,89,192 // mulps %xmm8,%xmm8
+ .byte 69,15,88,202 // addps %xmm10,%xmm9
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 68,15,17,136,160,0,0,0 // movups %xmm9,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_n1y_sse41
+.globl _sk_bicubic_n1y_sse41
+_sk_bicubic_n1y_sse41:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,0,191 // mov $0xbf000000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,72,32 // movups 0x20(%rax),%xmm1
+ .byte 68,15,16,72,96 // movups 0x60(%rax),%xmm9
+ .byte 65,15,88,200 // addps %xmm8,%xmm1
+ .byte 185,0,0,128,63 // mov $0x3f800000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,92,193 // subps %xmm9,%xmm8
+ .byte 185,85,85,149,191 // mov $0xbf955555,%ecx
+ .byte 102,68,15,110,201 // movd %ecx,%xmm9
+ .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9
+ .byte 185,0,0,192,63 // mov $0x3fc00000,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 69,15,88,202 // addps %xmm10,%xmm9
+ .byte 185,0,0,0,63 // mov $0x3f000000,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 69,15,88,202 // addps %xmm10,%xmm9
+ .byte 185,57,142,99,61 // mov $0x3d638e39,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 69,15,88,202 // addps %xmm10,%xmm9
+ .byte 68,15,17,136,160,0,0,0 // movups %xmm9,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_p1y_sse41
+.globl _sk_bicubic_p1y_sse41
+_sk_bicubic_p1y_sse41:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,0,63 // mov $0x3f000000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,72,32 // movups 0x20(%rax),%xmm1
+ .byte 68,15,16,72,96 // movups 0x60(%rax),%xmm9
+ .byte 65,15,88,200 // addps %xmm8,%xmm1
+ .byte 185,85,85,149,191 // mov $0xbf955555,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 185,0,0,192,63 // mov $0x3fc00000,%ecx
+ .byte 102,68,15,110,217 // movd %ecx,%xmm11
+ .byte 69,15,198,219,0 // shufps $0x0,%xmm11,%xmm11
+ .byte 69,15,89,209 // mulps %xmm9,%xmm10
+ .byte 69,15,88,211 // addps %xmm11,%xmm10
+ .byte 69,15,89,209 // mulps %xmm9,%xmm10
+ .byte 69,15,88,208 // addps %xmm8,%xmm10
+ .byte 185,57,142,99,61 // mov $0x3d638e39,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,89,209 // mulps %xmm9,%xmm10
+ .byte 69,15,88,208 // addps %xmm8,%xmm10
+ .byte 68,15,17,144,160,0,0,0 // movups %xmm10,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_p3y_sse41
+.globl _sk_bicubic_p3y_sse41
+_sk_bicubic_p3y_sse41:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,192,63 // mov $0x3fc00000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,72,32 // movups 0x20(%rax),%xmm1
+ .byte 68,15,16,72,96 // movups 0x60(%rax),%xmm9
+ .byte 65,15,88,200 // addps %xmm8,%xmm1
+ .byte 185,114,28,199,62 // mov $0x3ec71c72,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,89,193 // mulps %xmm9,%xmm8
+ .byte 69,15,89,201 // mulps %xmm9,%xmm9
+ .byte 185,171,170,170,190 // mov $0xbeaaaaab,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,88,194 // addps %xmm10,%xmm8
+ .byte 69,15,89,193 // mulps %xmm9,%xmm8
+ .byte 68,15,17,128,160,0,0,0 // movups %xmm8,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
HIDDEN _sk_start_pipeline_sse2
.globl _sk_start_pipeline_sse2
_sk_start_pipeline_sse2:
@@ -19638,4 +21222,357 @@ _sk_linear_gradient_2stops_sse2:
.byte 65,15,88,217 // addps %xmm9,%xmm3
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_save_xy_sse2
+.globl _sk_save_xy_sse2
+_sk_save_xy_sse2:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,0,63 // mov $0x3f000000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,40,200 // movaps %xmm8,%xmm9
+ .byte 68,15,88,200 // addps %xmm0,%xmm9
+ .byte 243,69,15,91,209 // cvttps2dq %xmm9,%xmm10
+ .byte 69,15,91,210 // cvtdq2ps %xmm10,%xmm10
+ .byte 69,15,40,217 // movaps %xmm9,%xmm11
+ .byte 69,15,194,218,1 // cmpltps %xmm10,%xmm11
+ .byte 185,0,0,128,63 // mov $0x3f800000,%ecx
+ .byte 102,68,15,110,225 // movd %ecx,%xmm12
+ .byte 69,15,198,228,0 // shufps $0x0,%xmm12,%xmm12
+ .byte 69,15,84,220 // andps %xmm12,%xmm11
+ .byte 69,15,92,211 // subps %xmm11,%xmm10
+ .byte 69,15,92,202 // subps %xmm10,%xmm9
+ .byte 68,15,88,193 // addps %xmm1,%xmm8
+ .byte 243,69,15,91,208 // cvttps2dq %xmm8,%xmm10
+ .byte 69,15,91,210 // cvtdq2ps %xmm10,%xmm10
+ .byte 69,15,40,216 // movaps %xmm8,%xmm11
+ .byte 69,15,194,218,1 // cmpltps %xmm10,%xmm11
+ .byte 69,15,84,220 // andps %xmm12,%xmm11
+ .byte 69,15,92,211 // subps %xmm11,%xmm10
+ .byte 69,15,92,194 // subps %xmm10,%xmm8
+ .byte 15,17,0 // movups %xmm0,(%rax)
+ .byte 15,17,72,32 // movups %xmm1,0x20(%rax)
+ .byte 68,15,17,72,64 // movups %xmm9,0x40(%rax)
+ .byte 68,15,17,64,96 // movups %xmm8,0x60(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_accumulate_sse2
+.globl _sk_accumulate_sse2
+_sk_accumulate_sse2:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 68,15,16,128,128,0,0,0 // movups 0x80(%rax),%xmm8
+ .byte 68,15,16,136,160,0,0,0 // movups 0xa0(%rax),%xmm9
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 69,15,40,193 // movaps %xmm9,%xmm8
+ .byte 68,15,89,192 // mulps %xmm0,%xmm8
+ .byte 65,15,88,224 // addps %xmm8,%xmm4
+ .byte 69,15,40,193 // movaps %xmm9,%xmm8
+ .byte 68,15,89,193 // mulps %xmm1,%xmm8
+ .byte 65,15,88,232 // addps %xmm8,%xmm5
+ .byte 69,15,40,193 // movaps %xmm9,%xmm8
+ .byte 68,15,89,194 // mulps %xmm2,%xmm8
+ .byte 65,15,88,240 // addps %xmm8,%xmm6
+ .byte 68,15,89,203 // mulps %xmm3,%xmm9
+ .byte 65,15,88,249 // addps %xmm9,%xmm7
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bilinear_nx_sse2
+.globl _sk_bilinear_nx_sse2
+_sk_bilinear_nx_sse2:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,0,191 // mov $0xbf000000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,0 // movups (%rax),%xmm0
+ .byte 68,15,16,72,64 // movups 0x40(%rax),%xmm9
+ .byte 65,15,88,192 // addps %xmm8,%xmm0
+ .byte 185,0,0,128,63 // mov $0x3f800000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,92,193 // subps %xmm9,%xmm8
+ .byte 68,15,17,128,128,0,0,0 // movups %xmm8,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bilinear_px_sse2
+.globl _sk_bilinear_px_sse2
+_sk_bilinear_px_sse2:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,0,63 // mov $0x3f000000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,0 // movups (%rax),%xmm0
+ .byte 68,15,16,72,64 // movups 0x40(%rax),%xmm9
+ .byte 65,15,88,192 // addps %xmm8,%xmm0
+ .byte 68,15,17,136,128,0,0,0 // movups %xmm9,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bilinear_ny_sse2
+.globl _sk_bilinear_ny_sse2
+_sk_bilinear_ny_sse2:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,0,191 // mov $0xbf000000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,72,32 // movups 0x20(%rax),%xmm1
+ .byte 68,15,16,72,96 // movups 0x60(%rax),%xmm9
+ .byte 65,15,88,200 // addps %xmm8,%xmm1
+ .byte 185,0,0,128,63 // mov $0x3f800000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,92,193 // subps %xmm9,%xmm8
+ .byte 68,15,17,128,160,0,0,0 // movups %xmm8,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bilinear_py_sse2
+.globl _sk_bilinear_py_sse2
+_sk_bilinear_py_sse2:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,0,63 // mov $0x3f000000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,72,32 // movups 0x20(%rax),%xmm1
+ .byte 68,15,16,72,96 // movups 0x60(%rax),%xmm9
+ .byte 65,15,88,200 // addps %xmm8,%xmm1
+ .byte 68,15,17,136,160,0,0,0 // movups %xmm9,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_n3x_sse2
+.globl _sk_bicubic_n3x_sse2
+_sk_bicubic_n3x_sse2:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,192,191 // mov $0xbfc00000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,0 // movups (%rax),%xmm0
+ .byte 68,15,16,72,64 // movups 0x40(%rax),%xmm9
+ .byte 65,15,88,192 // addps %xmm8,%xmm0
+ .byte 185,0,0,128,63 // mov $0x3f800000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,92,193 // subps %xmm9,%xmm8
+ .byte 185,114,28,199,62 // mov $0x3ec71c72,%ecx
+ .byte 102,68,15,110,201 // movd %ecx,%xmm9
+ .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9
+ .byte 185,171,170,170,190 // mov $0xbeaaaaab,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 69,15,89,192 // mulps %xmm8,%xmm8
+ .byte 69,15,88,202 // addps %xmm10,%xmm9
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 68,15,17,136,128,0,0,0 // movups %xmm9,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_n1x_sse2
+.globl _sk_bicubic_n1x_sse2
+_sk_bicubic_n1x_sse2:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,0,191 // mov $0xbf000000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,0 // movups (%rax),%xmm0
+ .byte 68,15,16,72,64 // movups 0x40(%rax),%xmm9
+ .byte 65,15,88,192 // addps %xmm8,%xmm0
+ .byte 185,0,0,128,63 // mov $0x3f800000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,92,193 // subps %xmm9,%xmm8
+ .byte 185,85,85,149,191 // mov $0xbf955555,%ecx
+ .byte 102,68,15,110,201 // movd %ecx,%xmm9
+ .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9
+ .byte 185,0,0,192,63 // mov $0x3fc00000,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 69,15,88,202 // addps %xmm10,%xmm9
+ .byte 185,0,0,0,63 // mov $0x3f000000,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 69,15,88,202 // addps %xmm10,%xmm9
+ .byte 185,57,142,99,61 // mov $0x3d638e39,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 69,15,88,202 // addps %xmm10,%xmm9
+ .byte 68,15,17,136,128,0,0,0 // movups %xmm9,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_p1x_sse2
+.globl _sk_bicubic_p1x_sse2
+_sk_bicubic_p1x_sse2:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,0,63 // mov $0x3f000000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,0 // movups (%rax),%xmm0
+ .byte 68,15,16,72,64 // movups 0x40(%rax),%xmm9
+ .byte 65,15,88,192 // addps %xmm8,%xmm0
+ .byte 185,85,85,149,191 // mov $0xbf955555,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 185,0,0,192,63 // mov $0x3fc00000,%ecx
+ .byte 102,68,15,110,217 // movd %ecx,%xmm11
+ .byte 69,15,198,219,0 // shufps $0x0,%xmm11,%xmm11
+ .byte 69,15,89,209 // mulps %xmm9,%xmm10
+ .byte 69,15,88,211 // addps %xmm11,%xmm10
+ .byte 69,15,89,209 // mulps %xmm9,%xmm10
+ .byte 69,15,88,208 // addps %xmm8,%xmm10
+ .byte 185,57,142,99,61 // mov $0x3d638e39,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,89,209 // mulps %xmm9,%xmm10
+ .byte 69,15,88,208 // addps %xmm8,%xmm10
+ .byte 68,15,17,144,128,0,0,0 // movups %xmm10,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_p3x_sse2
+.globl _sk_bicubic_p3x_sse2
+_sk_bicubic_p3x_sse2:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,192,63 // mov $0x3fc00000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,0 // movups (%rax),%xmm0
+ .byte 68,15,16,72,64 // movups 0x40(%rax),%xmm9
+ .byte 65,15,88,192 // addps %xmm8,%xmm0
+ .byte 185,114,28,199,62 // mov $0x3ec71c72,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,89,193 // mulps %xmm9,%xmm8
+ .byte 69,15,89,201 // mulps %xmm9,%xmm9
+ .byte 185,171,170,170,190 // mov $0xbeaaaaab,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,88,194 // addps %xmm10,%xmm8
+ .byte 69,15,89,193 // mulps %xmm9,%xmm8
+ .byte 68,15,17,128,128,0,0,0 // movups %xmm8,0x80(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_n3y_sse2
+.globl _sk_bicubic_n3y_sse2
+_sk_bicubic_n3y_sse2:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,192,191 // mov $0xbfc00000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,72,32 // movups 0x20(%rax),%xmm1
+ .byte 68,15,16,72,96 // movups 0x60(%rax),%xmm9
+ .byte 65,15,88,200 // addps %xmm8,%xmm1
+ .byte 185,0,0,128,63 // mov $0x3f800000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,92,193 // subps %xmm9,%xmm8
+ .byte 185,114,28,199,62 // mov $0x3ec71c72,%ecx
+ .byte 102,68,15,110,201 // movd %ecx,%xmm9
+ .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9
+ .byte 185,171,170,170,190 // mov $0xbeaaaaab,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 69,15,89,192 // mulps %xmm8,%xmm8
+ .byte 69,15,88,202 // addps %xmm10,%xmm9
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 68,15,17,136,160,0,0,0 // movups %xmm9,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_n1y_sse2
+.globl _sk_bicubic_n1y_sse2
+_sk_bicubic_n1y_sse2:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,0,191 // mov $0xbf000000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,72,32 // movups 0x20(%rax),%xmm1
+ .byte 68,15,16,72,96 // movups 0x60(%rax),%xmm9
+ .byte 65,15,88,200 // addps %xmm8,%xmm1
+ .byte 185,0,0,128,63 // mov $0x3f800000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,92,193 // subps %xmm9,%xmm8
+ .byte 185,85,85,149,191 // mov $0xbf955555,%ecx
+ .byte 102,68,15,110,201 // movd %ecx,%xmm9
+ .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9
+ .byte 185,0,0,192,63 // mov $0x3fc00000,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 69,15,88,202 // addps %xmm10,%xmm9
+ .byte 185,0,0,0,63 // mov $0x3f000000,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 69,15,88,202 // addps %xmm10,%xmm9
+ .byte 185,57,142,99,61 // mov $0x3d638e39,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 69,15,88,202 // addps %xmm10,%xmm9
+ .byte 68,15,17,136,160,0,0,0 // movups %xmm9,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_p1y_sse2
+.globl _sk_bicubic_p1y_sse2
+_sk_bicubic_p1y_sse2:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,0,63 // mov $0x3f000000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,72,32 // movups 0x20(%rax),%xmm1
+ .byte 68,15,16,72,96 // movups 0x60(%rax),%xmm9
+ .byte 65,15,88,200 // addps %xmm8,%xmm1
+ .byte 185,85,85,149,191 // mov $0xbf955555,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 185,0,0,192,63 // mov $0x3fc00000,%ecx
+ .byte 102,68,15,110,217 // movd %ecx,%xmm11
+ .byte 69,15,198,219,0 // shufps $0x0,%xmm11,%xmm11
+ .byte 69,15,89,209 // mulps %xmm9,%xmm10
+ .byte 69,15,88,211 // addps %xmm11,%xmm10
+ .byte 69,15,89,209 // mulps %xmm9,%xmm10
+ .byte 69,15,88,208 // addps %xmm8,%xmm10
+ .byte 185,57,142,99,61 // mov $0x3d638e39,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,89,209 // mulps %xmm9,%xmm10
+ .byte 69,15,88,208 // addps %xmm8,%xmm10
+ .byte 68,15,17,144,160,0,0,0 // movups %xmm10,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+HIDDEN _sk_bicubic_p3y_sse2
+.globl _sk_bicubic_p3y_sse2
+_sk_bicubic_p3y_sse2:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 185,0,0,192,63 // mov $0x3fc00000,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 15,16,72,32 // movups 0x20(%rax),%xmm1
+ .byte 68,15,16,72,96 // movups 0x60(%rax),%xmm9
+ .byte 65,15,88,200 // addps %xmm8,%xmm1
+ .byte 185,114,28,199,62 // mov $0x3ec71c72,%ecx
+ .byte 102,68,15,110,193 // movd %ecx,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 69,15,89,193 // mulps %xmm9,%xmm8
+ .byte 69,15,89,201 // mulps %xmm9,%xmm9
+ .byte 185,171,170,170,190 // mov $0xbeaaaaab,%ecx
+ .byte 102,68,15,110,209 // movd %ecx,%xmm10
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10
+ .byte 69,15,88,194 // addps %xmm10,%xmm8
+ .byte 69,15,89,193 // mulps %xmm9,%xmm8
+ .byte 68,15,17,128,160,0,0,0 // movups %xmm8,0xa0(%rax)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
#endif
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index 043da8576b..b305f23943 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -1357,7 +1357,7 @@ _sk_lerp_565_hsw LABEL PROC
DB 255 ; (bad)
DB 255 ; (bad)
DB 255 ; (bad)
- DB 233,255,255,255,225 ; jmpq ffffffffe2001478 <_sk_linear_gradient_2stops_hsw+0xffffffffe1ffe296>
+ DB 233,255,255,255,225 ; jmpq ffffffffe2001478 <_sk_bicubic_p3y_hsw+0xffffffffe1ffde19>
DB 255 ; (bad)
DB 255 ; (bad)
DB 255 ; (bad)
@@ -2328,7 +2328,7 @@ _sk_load_4444_hsw LABEL PROC
DB 255 ; (bad)
DB 255 ; (bad)
DB 255 ; (bad)
- DB 233,255,255,255,225 ; jmpq ffffffffe2002334 <_sk_linear_gradient_2stops_hsw+0xffffffffe1fff152>
+ DB 233,255,255,255,225 ; jmpq ffffffffe2002334 <_sk_bicubic_p3y_hsw+0xffffffffe1ffecd5>
DB 255 ; (bad)
DB 255 ; (bad)
DB 255 ; (bad)
@@ -3289,6 +3289,290 @@ _sk_linear_gradient_2stops_hsw LABEL PROC
DB 197,124,41,192 ; vmovaps %ymm8,%ymm0
DB 255,224 ; jmpq *%rax
+PUBLIC _sk_save_xy_hsw
+_sk_save_xy_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,0,63 ; mov $0x3f000000,%r8d
+ DB 196,65,121,110,192 ; vmovd %r8d,%xmm8
+ DB 196,66,125,88,192 ; vpbroadcastd %xmm8,%ymm8
+ DB 197,60,88,200 ; vaddps %ymm0,%ymm8,%ymm9
+ DB 196,67,125,8,209,1 ; vroundps $0x1,%ymm9,%ymm10
+ DB 196,65,52,92,202 ; vsubps %ymm10,%ymm9,%ymm9
+ DB 197,60,88,193 ; vaddps %ymm1,%ymm8,%ymm8
+ DB 196,67,125,8,208,1 ; vroundps $0x1,%ymm8,%ymm10
+ DB 196,65,60,92,194 ; vsubps %ymm10,%ymm8,%ymm8
+ DB 197,252,17,0 ; vmovups %ymm0,(%rax)
+ DB 197,252,17,72,32 ; vmovups %ymm1,0x20(%rax)
+ DB 197,124,17,72,64 ; vmovups %ymm9,0x40(%rax)
+ DB 197,124,17,64,96 ; vmovups %ymm8,0x60(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_accumulate_hsw
+_sk_accumulate_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,124,16,128,128,0,0,0 ; vmovups 0x80(%rax),%ymm8
+ DB 197,60,89,128,160,0,0,0 ; vmulps 0xa0(%rax),%ymm8,%ymm8
+ DB 196,226,61,184,224 ; vfmadd231ps %ymm0,%ymm8,%ymm4
+ DB 196,226,61,184,233 ; vfmadd231ps %ymm1,%ymm8,%ymm5
+ DB 196,226,61,184,242 ; vfmadd231ps %ymm2,%ymm8,%ymm6
+ DB 196,98,101,168,199 ; vfmadd213ps %ymm7,%ymm3,%ymm8
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,124,41,199 ; vmovaps %ymm8,%ymm7
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bilinear_nx_hsw
+_sk_bilinear_nx_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,0,191 ; mov $0xbf000000,%r8d
+ DB 196,193,121,110,192 ; vmovd %r8d,%xmm0
+ DB 196,226,125,88,192 ; vpbroadcastd %xmm0,%ymm0
+ DB 197,252,88,0 ; vaddps (%rax),%ymm0,%ymm0
+ DB 65,184,0,0,128,63 ; mov $0x3f800000,%r8d
+ DB 196,65,121,110,192 ; vmovd %r8d,%xmm8
+ DB 196,66,125,88,192 ; vpbroadcastd %xmm8,%ymm8
+ DB 197,60,92,64,64 ; vsubps 0x40(%rax),%ymm8,%ymm8
+ DB 197,124,17,128,128,0,0,0 ; vmovups %ymm8,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bilinear_px_hsw
+_sk_bilinear_px_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,0,63 ; mov $0x3f000000,%r8d
+ DB 196,193,121,110,192 ; vmovd %r8d,%xmm0
+ DB 196,226,125,88,192 ; vpbroadcastd %xmm0,%ymm0
+ DB 197,252,88,0 ; vaddps (%rax),%ymm0,%ymm0
+ DB 197,124,16,64,64 ; vmovups 0x40(%rax),%ymm8
+ DB 197,124,17,128,128,0,0,0 ; vmovups %ymm8,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bilinear_ny_hsw
+_sk_bilinear_ny_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,0,191 ; mov $0xbf000000,%r8d
+ DB 196,193,121,110,200 ; vmovd %r8d,%xmm1
+ DB 196,226,125,88,201 ; vpbroadcastd %xmm1,%ymm1
+ DB 197,244,88,72,32 ; vaddps 0x20(%rax),%ymm1,%ymm1
+ DB 65,184,0,0,128,63 ; mov $0x3f800000,%r8d
+ DB 196,65,121,110,192 ; vmovd %r8d,%xmm8
+ DB 196,66,125,88,192 ; vpbroadcastd %xmm8,%ymm8
+ DB 197,60,92,64,96 ; vsubps 0x60(%rax),%ymm8,%ymm8
+ DB 197,124,17,128,160,0,0,0 ; vmovups %ymm8,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bilinear_py_hsw
+_sk_bilinear_py_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,0,63 ; mov $0x3f000000,%r8d
+ DB 196,193,121,110,200 ; vmovd %r8d,%xmm1
+ DB 196,226,125,88,201 ; vpbroadcastd %xmm1,%ymm1
+ DB 197,244,88,72,32 ; vaddps 0x20(%rax),%ymm1,%ymm1
+ DB 197,124,16,64,96 ; vmovups 0x60(%rax),%ymm8
+ DB 197,124,17,128,160,0,0,0 ; vmovups %ymm8,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_n3x_hsw
+_sk_bicubic_n3x_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,192,191 ; mov $0xbfc00000,%r8d
+ DB 196,193,121,110,192 ; vmovd %r8d,%xmm0
+ DB 196,226,125,88,192 ; vpbroadcastd %xmm0,%ymm0
+ DB 197,252,88,0 ; vaddps (%rax),%ymm0,%ymm0
+ DB 65,184,0,0,128,63 ; mov $0x3f800000,%r8d
+ DB 196,65,121,110,192 ; vmovd %r8d,%xmm8
+ DB 196,66,125,88,192 ; vpbroadcastd %xmm8,%ymm8
+ DB 197,60,92,64,64 ; vsubps 0x40(%rax),%ymm8,%ymm8
+ DB 196,65,60,89,200 ; vmulps %ymm8,%ymm8,%ymm9
+ DB 65,184,114,28,199,62 ; mov $0x3ec71c72,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,66,125,88,210 ; vpbroadcastd %xmm10,%ymm10
+ DB 65,184,171,170,170,190 ; mov $0xbeaaaaab,%r8d
+ DB 196,65,121,110,216 ; vmovd %r8d,%xmm11
+ DB 196,66,125,88,219 ; vpbroadcastd %xmm11,%ymm11
+ DB 196,66,61,168,211 ; vfmadd213ps %ymm11,%ymm8,%ymm10
+ DB 196,65,44,89,193 ; vmulps %ymm9,%ymm10,%ymm8
+ DB 197,124,17,128,128,0,0,0 ; vmovups %ymm8,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_n1x_hsw
+_sk_bicubic_n1x_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,0,191 ; mov $0xbf000000,%r8d
+ DB 196,193,121,110,192 ; vmovd %r8d,%xmm0
+ DB 196,226,125,88,192 ; vpbroadcastd %xmm0,%ymm0
+ DB 197,252,88,0 ; vaddps (%rax),%ymm0,%ymm0
+ DB 65,184,0,0,128,63 ; mov $0x3f800000,%r8d
+ DB 196,65,121,110,192 ; vmovd %r8d,%xmm8
+ DB 196,66,125,88,192 ; vpbroadcastd %xmm8,%ymm8
+ DB 197,60,92,64,64 ; vsubps 0x40(%rax),%ymm8,%ymm8
+ DB 65,184,85,85,149,191 ; mov $0xbf955555,%r8d
+ DB 196,65,121,110,200 ; vmovd %r8d,%xmm9
+ DB 196,66,125,88,201 ; vpbroadcastd %xmm9,%ymm9
+ DB 65,184,0,0,192,63 ; mov $0x3fc00000,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,66,125,88,210 ; vpbroadcastd %xmm10,%ymm10
+ DB 196,66,61,168,202 ; vfmadd213ps %ymm10,%ymm8,%ymm9
+ DB 65,184,0,0,0,63 ; mov $0x3f000000,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,66,125,88,210 ; vpbroadcastd %xmm10,%ymm10
+ DB 196,66,61,184,209 ; vfmadd231ps %ymm9,%ymm8,%ymm10
+ DB 65,184,57,142,99,61 ; mov $0x3d638e39,%r8d
+ DB 196,65,121,110,200 ; vmovd %r8d,%xmm9
+ DB 196,66,125,88,201 ; vpbroadcastd %xmm9,%ymm9
+ DB 196,66,61,184,202 ; vfmadd231ps %ymm10,%ymm8,%ymm9
+ DB 197,124,17,136,128,0,0,0 ; vmovups %ymm9,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_p1x_hsw
+_sk_bicubic_p1x_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,0,63 ; mov $0x3f000000,%r8d
+ DB 196,193,121,110,192 ; vmovd %r8d,%xmm0
+ DB 196,98,125,88,192 ; vpbroadcastd %xmm0,%ymm8
+ DB 197,188,88,0 ; vaddps (%rax),%ymm8,%ymm0
+ DB 197,124,16,72,64 ; vmovups 0x40(%rax),%ymm9
+ DB 65,184,85,85,149,191 ; mov $0xbf955555,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,66,125,88,210 ; vpbroadcastd %xmm10,%ymm10
+ DB 65,184,0,0,192,63 ; mov $0x3fc00000,%r8d
+ DB 196,65,121,110,216 ; vmovd %r8d,%xmm11
+ DB 196,66,125,88,219 ; vpbroadcastd %xmm11,%ymm11
+ DB 196,66,53,168,211 ; vfmadd213ps %ymm11,%ymm9,%ymm10
+ DB 196,66,53,168,208 ; vfmadd213ps %ymm8,%ymm9,%ymm10
+ DB 65,184,57,142,99,61 ; mov $0x3d638e39,%r8d
+ DB 196,65,121,110,192 ; vmovd %r8d,%xmm8
+ DB 196,66,125,88,192 ; vpbroadcastd %xmm8,%ymm8
+ DB 196,66,53,184,194 ; vfmadd231ps %ymm10,%ymm9,%ymm8
+ DB 197,124,17,128,128,0,0,0 ; vmovups %ymm8,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_p3x_hsw
+_sk_bicubic_p3x_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,192,63 ; mov $0x3fc00000,%r8d
+ DB 196,193,121,110,192 ; vmovd %r8d,%xmm0
+ DB 196,226,125,88,192 ; vpbroadcastd %xmm0,%ymm0
+ DB 197,252,88,0 ; vaddps (%rax),%ymm0,%ymm0
+ DB 197,124,16,64,64 ; vmovups 0x40(%rax),%ymm8
+ DB 196,65,60,89,200 ; vmulps %ymm8,%ymm8,%ymm9
+ DB 65,184,114,28,199,62 ; mov $0x3ec71c72,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,66,125,88,210 ; vpbroadcastd %xmm10,%ymm10
+ DB 65,184,171,170,170,190 ; mov $0xbeaaaaab,%r8d
+ DB 196,65,121,110,216 ; vmovd %r8d,%xmm11
+ DB 196,66,125,88,219 ; vpbroadcastd %xmm11,%ymm11
+ DB 196,66,61,168,211 ; vfmadd213ps %ymm11,%ymm8,%ymm10
+ DB 196,65,52,89,194 ; vmulps %ymm10,%ymm9,%ymm8
+ DB 197,124,17,128,128,0,0,0 ; vmovups %ymm8,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_n3y_hsw
+_sk_bicubic_n3y_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,192,191 ; mov $0xbfc00000,%r8d
+ DB 196,193,121,110,200 ; vmovd %r8d,%xmm1
+ DB 196,226,125,88,201 ; vpbroadcastd %xmm1,%ymm1
+ DB 197,244,88,72,32 ; vaddps 0x20(%rax),%ymm1,%ymm1
+ DB 65,184,0,0,128,63 ; mov $0x3f800000,%r8d
+ DB 196,65,121,110,192 ; vmovd %r8d,%xmm8
+ DB 196,66,125,88,192 ; vpbroadcastd %xmm8,%ymm8
+ DB 197,60,92,64,96 ; vsubps 0x60(%rax),%ymm8,%ymm8
+ DB 196,65,60,89,200 ; vmulps %ymm8,%ymm8,%ymm9
+ DB 65,184,114,28,199,62 ; mov $0x3ec71c72,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,66,125,88,210 ; vpbroadcastd %xmm10,%ymm10
+ DB 65,184,171,170,170,190 ; mov $0xbeaaaaab,%r8d
+ DB 196,65,121,110,216 ; vmovd %r8d,%xmm11
+ DB 196,66,125,88,219 ; vpbroadcastd %xmm11,%ymm11
+ DB 196,66,61,168,211 ; vfmadd213ps %ymm11,%ymm8,%ymm10
+ DB 196,65,44,89,193 ; vmulps %ymm9,%ymm10,%ymm8
+ DB 197,124,17,128,160,0,0,0 ; vmovups %ymm8,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_n1y_hsw
+_sk_bicubic_n1y_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,0,191 ; mov $0xbf000000,%r8d
+ DB 196,193,121,110,200 ; vmovd %r8d,%xmm1
+ DB 196,226,125,88,201 ; vpbroadcastd %xmm1,%ymm1
+ DB 197,244,88,72,32 ; vaddps 0x20(%rax),%ymm1,%ymm1
+ DB 65,184,0,0,128,63 ; mov $0x3f800000,%r8d
+ DB 196,65,121,110,192 ; vmovd %r8d,%xmm8
+ DB 196,66,125,88,192 ; vpbroadcastd %xmm8,%ymm8
+ DB 197,60,92,64,96 ; vsubps 0x60(%rax),%ymm8,%ymm8
+ DB 65,184,85,85,149,191 ; mov $0xbf955555,%r8d
+ DB 196,65,121,110,200 ; vmovd %r8d,%xmm9
+ DB 196,66,125,88,201 ; vpbroadcastd %xmm9,%ymm9
+ DB 65,184,0,0,192,63 ; mov $0x3fc00000,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,66,125,88,210 ; vpbroadcastd %xmm10,%ymm10
+ DB 196,66,61,168,202 ; vfmadd213ps %ymm10,%ymm8,%ymm9
+ DB 65,184,0,0,0,63 ; mov $0x3f000000,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,66,125,88,210 ; vpbroadcastd %xmm10,%ymm10
+ DB 196,66,61,184,209 ; vfmadd231ps %ymm9,%ymm8,%ymm10
+ DB 65,184,57,142,99,61 ; mov $0x3d638e39,%r8d
+ DB 196,65,121,110,200 ; vmovd %r8d,%xmm9
+ DB 196,66,125,88,201 ; vpbroadcastd %xmm9,%ymm9
+ DB 196,66,61,184,202 ; vfmadd231ps %ymm10,%ymm8,%ymm9
+ DB 197,124,17,136,160,0,0,0 ; vmovups %ymm9,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_p1y_hsw
+_sk_bicubic_p1y_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,0,63 ; mov $0x3f000000,%r8d
+ DB 196,193,121,110,200 ; vmovd %r8d,%xmm1
+ DB 196,98,125,88,193 ; vpbroadcastd %xmm1,%ymm8
+ DB 197,188,88,72,32 ; vaddps 0x20(%rax),%ymm8,%ymm1
+ DB 197,124,16,72,96 ; vmovups 0x60(%rax),%ymm9
+ DB 65,184,85,85,149,191 ; mov $0xbf955555,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,66,125,88,210 ; vpbroadcastd %xmm10,%ymm10
+ DB 65,184,0,0,192,63 ; mov $0x3fc00000,%r8d
+ DB 196,65,121,110,216 ; vmovd %r8d,%xmm11
+ DB 196,66,125,88,219 ; vpbroadcastd %xmm11,%ymm11
+ DB 196,66,53,168,211 ; vfmadd213ps %ymm11,%ymm9,%ymm10
+ DB 196,66,53,168,208 ; vfmadd213ps %ymm8,%ymm9,%ymm10
+ DB 65,184,57,142,99,61 ; mov $0x3d638e39,%r8d
+ DB 196,65,121,110,192 ; vmovd %r8d,%xmm8
+ DB 196,66,125,88,192 ; vpbroadcastd %xmm8,%ymm8
+ DB 196,66,53,184,194 ; vfmadd231ps %ymm10,%ymm9,%ymm8
+ DB 197,124,17,128,160,0,0,0 ; vmovups %ymm8,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_p3y_hsw
+_sk_bicubic_p3y_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,192,63 ; mov $0x3fc00000,%r8d
+ DB 196,193,121,110,200 ; vmovd %r8d,%xmm1
+ DB 196,226,125,88,201 ; vpbroadcastd %xmm1,%ymm1
+ DB 197,244,88,72,32 ; vaddps 0x20(%rax),%ymm1,%ymm1
+ DB 197,124,16,64,96 ; vmovups 0x60(%rax),%ymm8
+ DB 196,65,60,89,200 ; vmulps %ymm8,%ymm8,%ymm9
+ DB 65,184,114,28,199,62 ; mov $0x3ec71c72,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,66,125,88,210 ; vpbroadcastd %xmm10,%ymm10
+ DB 65,184,171,170,170,190 ; mov $0xbeaaaaab,%r8d
+ DB 196,65,121,110,216 ; vmovd %r8d,%xmm11
+ DB 196,66,125,88,219 ; vpbroadcastd %xmm11,%ymm11
+ DB 196,66,61,168,211 ; vfmadd213ps %ymm11,%ymm8,%ymm10
+ DB 196,65,52,89,194 ; vmulps %ymm10,%ymm9,%ymm8
+ DB 197,124,17,128,160,0,0,0 ; vmovups %ymm8,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
PUBLIC _sk_start_pipeline_avx
_sk_start_pipeline_avx LABEL PROC
DB 65,87 ; push %r15
@@ -7321,6 +7605,350 @@ _sk_linear_gradient_2stops_avx LABEL PROC
DB 197,124,41,192 ; vmovaps %ymm8,%ymm0
DB 255,224 ; jmpq *%rax
+PUBLIC _sk_save_xy_avx
+_sk_save_xy_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,0,63 ; mov $0x3f000000,%r8d
+ DB 196,65,121,110,192 ; vmovd %r8d,%xmm8
+ DB 196,67,121,4,192,0 ; vpermilps $0x0,%xmm8,%xmm8
+ DB 196,67,61,24,192,1 ; vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
+ DB 197,60,88,200 ; vaddps %ymm0,%ymm8,%ymm9
+ DB 196,67,125,8,209,1 ; vroundps $0x1,%ymm9,%ymm10
+ DB 196,65,52,92,202 ; vsubps %ymm10,%ymm9,%ymm9
+ DB 197,60,88,193 ; vaddps %ymm1,%ymm8,%ymm8
+ DB 196,67,125,8,208,1 ; vroundps $0x1,%ymm8,%ymm10
+ DB 196,65,60,92,194 ; vsubps %ymm10,%ymm8,%ymm8
+ DB 197,252,17,0 ; vmovups %ymm0,(%rax)
+ DB 197,252,17,72,32 ; vmovups %ymm1,0x20(%rax)
+ DB 197,124,17,72,64 ; vmovups %ymm9,0x40(%rax)
+ DB 197,124,17,64,96 ; vmovups %ymm8,0x60(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_accumulate_avx
+_sk_accumulate_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,124,16,128,128,0,0,0 ; vmovups 0x80(%rax),%ymm8
+ DB 197,60,89,128,160,0,0,0 ; vmulps 0xa0(%rax),%ymm8,%ymm8
+ DB 197,60,89,200 ; vmulps %ymm0,%ymm8,%ymm9
+ DB 197,180,88,228 ; vaddps %ymm4,%ymm9,%ymm4
+ DB 197,60,89,201 ; vmulps %ymm1,%ymm8,%ymm9
+ DB 197,180,88,237 ; vaddps %ymm5,%ymm9,%ymm5
+ DB 197,60,89,202 ; vmulps %ymm2,%ymm8,%ymm9
+ DB 197,180,88,246 ; vaddps %ymm6,%ymm9,%ymm6
+ DB 197,60,89,195 ; vmulps %ymm3,%ymm8,%ymm8
+ DB 197,188,88,255 ; vaddps %ymm7,%ymm8,%ymm7
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bilinear_nx_avx
+_sk_bilinear_nx_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,0,191 ; mov $0xbf000000,%r8d
+ DB 196,193,121,110,192 ; vmovd %r8d,%xmm0
+ DB 196,227,121,4,192,0 ; vpermilps $0x0,%xmm0,%xmm0
+ DB 196,227,125,24,192,1 ; vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
+ DB 197,252,88,0 ; vaddps (%rax),%ymm0,%ymm0
+ DB 65,184,0,0,128,63 ; mov $0x3f800000,%r8d
+ DB 196,65,121,110,192 ; vmovd %r8d,%xmm8
+ DB 196,67,121,4,192,0 ; vpermilps $0x0,%xmm8,%xmm8
+ DB 196,67,61,24,192,1 ; vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
+ DB 197,60,92,64,64 ; vsubps 0x40(%rax),%ymm8,%ymm8
+ DB 197,124,17,128,128,0,0,0 ; vmovups %ymm8,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bilinear_px_avx
+_sk_bilinear_px_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,0,63 ; mov $0x3f000000,%r8d
+ DB 196,193,121,110,192 ; vmovd %r8d,%xmm0
+ DB 196,227,121,4,192,0 ; vpermilps $0x0,%xmm0,%xmm0
+ DB 196,227,125,24,192,1 ; vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
+ DB 197,124,16,64,64 ; vmovups 0x40(%rax),%ymm8
+ DB 197,252,88,0 ; vaddps (%rax),%ymm0,%ymm0
+ DB 197,124,17,128,128,0,0,0 ; vmovups %ymm8,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bilinear_ny_avx
+_sk_bilinear_ny_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,0,191 ; mov $0xbf000000,%r8d
+ DB 196,193,121,110,200 ; vmovd %r8d,%xmm1
+ DB 196,227,121,4,201,0 ; vpermilps $0x0,%xmm1,%xmm1
+ DB 196,227,117,24,201,1 ; vinsertf128 $0x1,%xmm1,%ymm1,%ymm1
+ DB 197,244,88,72,32 ; vaddps 0x20(%rax),%ymm1,%ymm1
+ DB 65,184,0,0,128,63 ; mov $0x3f800000,%r8d
+ DB 196,65,121,110,192 ; vmovd %r8d,%xmm8
+ DB 196,67,121,4,192,0 ; vpermilps $0x0,%xmm8,%xmm8
+ DB 196,67,61,24,192,1 ; vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
+ DB 197,60,92,64,96 ; vsubps 0x60(%rax),%ymm8,%ymm8
+ DB 197,124,17,128,160,0,0,0 ; vmovups %ymm8,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bilinear_py_avx
+_sk_bilinear_py_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,0,63 ; mov $0x3f000000,%r8d
+ DB 196,193,121,110,200 ; vmovd %r8d,%xmm1
+ DB 196,227,121,4,201,0 ; vpermilps $0x0,%xmm1,%xmm1
+ DB 196,227,117,24,201,1 ; vinsertf128 $0x1,%xmm1,%ymm1,%ymm1
+ DB 197,124,16,64,96 ; vmovups 0x60(%rax),%ymm8
+ DB 197,244,88,72,32 ; vaddps 0x20(%rax),%ymm1,%ymm1
+ DB 197,124,17,128,160,0,0,0 ; vmovups %ymm8,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_n3x_avx
+_sk_bicubic_n3x_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,192,191 ; mov $0xbfc00000,%r8d
+ DB 196,193,121,110,192 ; vmovd %r8d,%xmm0
+ DB 196,227,121,4,192,0 ; vpermilps $0x0,%xmm0,%xmm0
+ DB 196,227,125,24,192,1 ; vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
+ DB 197,252,88,0 ; vaddps (%rax),%ymm0,%ymm0
+ DB 65,184,0,0,128,63 ; mov $0x3f800000,%r8d
+ DB 196,65,121,110,192 ; vmovd %r8d,%xmm8
+ DB 196,67,121,4,192,0 ; vpermilps $0x0,%xmm8,%xmm8
+ DB 196,67,61,24,192,1 ; vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
+ DB 197,60,92,64,64 ; vsubps 0x40(%rax),%ymm8,%ymm8
+ DB 196,65,60,89,200 ; vmulps %ymm8,%ymm8,%ymm9
+ DB 65,184,114,28,199,62 ; mov $0x3ec71c72,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,67,121,4,210,0 ; vpermilps $0x0,%xmm10,%xmm10
+ DB 196,67,45,24,210,1 ; vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ DB 65,184,171,170,170,190 ; mov $0xbeaaaaab,%r8d
+ DB 196,65,121,110,216 ; vmovd %r8d,%xmm11
+ DB 196,67,121,4,219,0 ; vpermilps $0x0,%xmm11,%xmm11
+ DB 196,67,37,24,219,1 ; vinsertf128 $0x1,%xmm11,%ymm11,%ymm11
+ DB 196,65,44,89,192 ; vmulps %ymm8,%ymm10,%ymm8
+ DB 196,65,60,88,195 ; vaddps %ymm11,%ymm8,%ymm8
+ DB 196,65,52,89,192 ; vmulps %ymm8,%ymm9,%ymm8
+ DB 197,124,17,128,128,0,0,0 ; vmovups %ymm8,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_n1x_avx
+_sk_bicubic_n1x_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,0,191 ; mov $0xbf000000,%r8d
+ DB 196,193,121,110,192 ; vmovd %r8d,%xmm0
+ DB 196,227,121,4,192,0 ; vpermilps $0x0,%xmm0,%xmm0
+ DB 196,227,125,24,192,1 ; vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
+ DB 197,252,88,0 ; vaddps (%rax),%ymm0,%ymm0
+ DB 65,184,0,0,128,63 ; mov $0x3f800000,%r8d
+ DB 196,65,121,110,192 ; vmovd %r8d,%xmm8
+ DB 196,67,121,4,192,0 ; vpermilps $0x0,%xmm8,%xmm8
+ DB 196,67,61,24,192,1 ; vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
+ DB 197,60,92,64,64 ; vsubps 0x40(%rax),%ymm8,%ymm8
+ DB 65,184,85,85,149,191 ; mov $0xbf955555,%r8d
+ DB 196,65,121,110,200 ; vmovd %r8d,%xmm9
+ DB 196,67,121,4,201,0 ; vpermilps $0x0,%xmm9,%xmm9
+ DB 196,67,53,24,201,1 ; vinsertf128 $0x1,%xmm9,%ymm9,%ymm9
+ DB 65,184,0,0,192,63 ; mov $0x3fc00000,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,67,121,4,210,0 ; vpermilps $0x0,%xmm10,%xmm10
+ DB 196,67,45,24,210,1 ; vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ DB 196,65,52,89,200 ; vmulps %ymm8,%ymm9,%ymm9
+ DB 196,65,52,88,202 ; vaddps %ymm10,%ymm9,%ymm9
+ DB 65,184,0,0,0,63 ; mov $0x3f000000,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,67,121,4,210,0 ; vpermilps $0x0,%xmm10,%xmm10
+ DB 196,67,45,24,210,1 ; vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ DB 196,65,60,89,201 ; vmulps %ymm9,%ymm8,%ymm9
+ DB 196,65,44,88,201 ; vaddps %ymm9,%ymm10,%ymm9
+ DB 65,184,57,142,99,61 ; mov $0x3d638e39,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,67,121,4,210,0 ; vpermilps $0x0,%xmm10,%xmm10
+ DB 196,67,45,24,210,1 ; vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ DB 196,65,60,89,193 ; vmulps %ymm9,%ymm8,%ymm8
+ DB 196,65,44,88,192 ; vaddps %ymm8,%ymm10,%ymm8
+ DB 197,124,17,128,128,0,0,0 ; vmovups %ymm8,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_p1x_avx
+_sk_bicubic_p1x_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,0,63 ; mov $0x3f000000,%r8d
+ DB 196,193,121,110,192 ; vmovd %r8d,%xmm0
+ DB 196,227,121,4,192,0 ; vpermilps $0x0,%xmm0,%xmm0
+ DB 196,99,125,24,192,1 ; vinsertf128 $0x1,%xmm0,%ymm0,%ymm8
+ DB 197,188,88,0 ; vaddps (%rax),%ymm8,%ymm0
+ DB 197,124,16,72,64 ; vmovups 0x40(%rax),%ymm9
+ DB 65,184,85,85,149,191 ; mov $0xbf955555,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,67,121,4,210,0 ; vpermilps $0x0,%xmm10,%xmm10
+ DB 196,67,45,24,210,1 ; vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ DB 65,184,0,0,192,63 ; mov $0x3fc00000,%r8d
+ DB 196,65,121,110,216 ; vmovd %r8d,%xmm11
+ DB 196,67,121,4,219,0 ; vpermilps $0x0,%xmm11,%xmm11
+ DB 196,67,37,24,219,1 ; vinsertf128 $0x1,%xmm11,%ymm11,%ymm11
+ DB 196,65,52,89,210 ; vmulps %ymm10,%ymm9,%ymm10
+ DB 196,65,44,88,211 ; vaddps %ymm11,%ymm10,%ymm10
+ DB 196,65,52,89,210 ; vmulps %ymm10,%ymm9,%ymm10
+ DB 196,65,60,88,194 ; vaddps %ymm10,%ymm8,%ymm8
+ DB 65,184,57,142,99,61 ; mov $0x3d638e39,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,67,121,4,210,0 ; vpermilps $0x0,%xmm10,%xmm10
+ DB 196,67,45,24,210,1 ; vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ DB 196,65,52,89,192 ; vmulps %ymm8,%ymm9,%ymm8
+ DB 196,65,44,88,192 ; vaddps %ymm8,%ymm10,%ymm8
+ DB 197,124,17,128,128,0,0,0 ; vmovups %ymm8,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_p3x_avx
+_sk_bicubic_p3x_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,192,63 ; mov $0x3fc00000,%r8d
+ DB 196,193,121,110,192 ; vmovd %r8d,%xmm0
+ DB 196,227,121,4,192,0 ; vpermilps $0x0,%xmm0,%xmm0
+ DB 196,227,125,24,192,1 ; vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
+ DB 197,252,88,0 ; vaddps (%rax),%ymm0,%ymm0
+ DB 197,124,16,64,64 ; vmovups 0x40(%rax),%ymm8
+ DB 196,65,60,89,200 ; vmulps %ymm8,%ymm8,%ymm9
+ DB 65,184,114,28,199,62 ; mov $0x3ec71c72,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,67,121,4,210,0 ; vpermilps $0x0,%xmm10,%xmm10
+ DB 196,67,45,24,210,1 ; vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ DB 65,184,171,170,170,190 ; mov $0xbeaaaaab,%r8d
+ DB 196,65,121,110,216 ; vmovd %r8d,%xmm11
+ DB 196,67,121,4,219,0 ; vpermilps $0x0,%xmm11,%xmm11
+ DB 196,67,37,24,219,1 ; vinsertf128 $0x1,%xmm11,%ymm11,%ymm11
+ DB 196,65,60,89,194 ; vmulps %ymm10,%ymm8,%ymm8
+ DB 196,65,60,88,195 ; vaddps %ymm11,%ymm8,%ymm8
+ DB 196,65,52,89,192 ; vmulps %ymm8,%ymm9,%ymm8
+ DB 197,124,17,128,128,0,0,0 ; vmovups %ymm8,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_n3y_avx
+_sk_bicubic_n3y_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,192,191 ; mov $0xbfc00000,%r8d
+ DB 196,193,121,110,200 ; vmovd %r8d,%xmm1
+ DB 196,227,121,4,201,0 ; vpermilps $0x0,%xmm1,%xmm1
+ DB 196,227,117,24,201,1 ; vinsertf128 $0x1,%xmm1,%ymm1,%ymm1
+ DB 197,244,88,72,32 ; vaddps 0x20(%rax),%ymm1,%ymm1
+ DB 65,184,0,0,128,63 ; mov $0x3f800000,%r8d
+ DB 196,65,121,110,192 ; vmovd %r8d,%xmm8
+ DB 196,67,121,4,192,0 ; vpermilps $0x0,%xmm8,%xmm8
+ DB 196,67,61,24,192,1 ; vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
+ DB 197,60,92,64,96 ; vsubps 0x60(%rax),%ymm8,%ymm8
+ DB 196,65,60,89,200 ; vmulps %ymm8,%ymm8,%ymm9
+ DB 65,184,114,28,199,62 ; mov $0x3ec71c72,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,67,121,4,210,0 ; vpermilps $0x0,%xmm10,%xmm10
+ DB 196,67,45,24,210,1 ; vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ DB 65,184,171,170,170,190 ; mov $0xbeaaaaab,%r8d
+ DB 196,65,121,110,216 ; vmovd %r8d,%xmm11
+ DB 196,67,121,4,219,0 ; vpermilps $0x0,%xmm11,%xmm11
+ DB 196,67,37,24,219,1 ; vinsertf128 $0x1,%xmm11,%ymm11,%ymm11
+ DB 196,65,44,89,192 ; vmulps %ymm8,%ymm10,%ymm8
+ DB 196,65,60,88,195 ; vaddps %ymm11,%ymm8,%ymm8
+ DB 196,65,52,89,192 ; vmulps %ymm8,%ymm9,%ymm8
+ DB 197,124,17,128,160,0,0,0 ; vmovups %ymm8,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_n1y_avx
+_sk_bicubic_n1y_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,0,191 ; mov $0xbf000000,%r8d
+ DB 196,193,121,110,200 ; vmovd %r8d,%xmm1
+ DB 196,227,121,4,201,0 ; vpermilps $0x0,%xmm1,%xmm1
+ DB 196,227,117,24,201,1 ; vinsertf128 $0x1,%xmm1,%ymm1,%ymm1
+ DB 197,244,88,72,32 ; vaddps 0x20(%rax),%ymm1,%ymm1
+ DB 65,184,0,0,128,63 ; mov $0x3f800000,%r8d
+ DB 196,65,121,110,192 ; vmovd %r8d,%xmm8
+ DB 196,67,121,4,192,0 ; vpermilps $0x0,%xmm8,%xmm8
+ DB 196,67,61,24,192,1 ; vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
+ DB 197,60,92,64,96 ; vsubps 0x60(%rax),%ymm8,%ymm8
+ DB 65,184,85,85,149,191 ; mov $0xbf955555,%r8d
+ DB 196,65,121,110,200 ; vmovd %r8d,%xmm9
+ DB 196,67,121,4,201,0 ; vpermilps $0x0,%xmm9,%xmm9
+ DB 196,67,53,24,201,1 ; vinsertf128 $0x1,%xmm9,%ymm9,%ymm9
+ DB 65,184,0,0,192,63 ; mov $0x3fc00000,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,67,121,4,210,0 ; vpermilps $0x0,%xmm10,%xmm10
+ DB 196,67,45,24,210,1 ; vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ DB 196,65,52,89,200 ; vmulps %ymm8,%ymm9,%ymm9
+ DB 196,65,52,88,202 ; vaddps %ymm10,%ymm9,%ymm9
+ DB 65,184,0,0,0,63 ; mov $0x3f000000,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,67,121,4,210,0 ; vpermilps $0x0,%xmm10,%xmm10
+ DB 196,67,45,24,210,1 ; vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ DB 196,65,60,89,201 ; vmulps %ymm9,%ymm8,%ymm9
+ DB 196,65,44,88,201 ; vaddps %ymm9,%ymm10,%ymm9
+ DB 65,184,57,142,99,61 ; mov $0x3d638e39,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,67,121,4,210,0 ; vpermilps $0x0,%xmm10,%xmm10
+ DB 196,67,45,24,210,1 ; vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ DB 196,65,60,89,193 ; vmulps %ymm9,%ymm8,%ymm8
+ DB 196,65,44,88,192 ; vaddps %ymm8,%ymm10,%ymm8
+ DB 197,124,17,128,160,0,0,0 ; vmovups %ymm8,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_p1y_avx
+_sk_bicubic_p1y_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,0,63 ; mov $0x3f000000,%r8d
+ DB 196,193,121,110,200 ; vmovd %r8d,%xmm1
+ DB 196,227,121,4,201,0 ; vpermilps $0x0,%xmm1,%xmm1
+ DB 196,99,117,24,193,1 ; vinsertf128 $0x1,%xmm1,%ymm1,%ymm8
+ DB 197,188,88,72,32 ; vaddps 0x20(%rax),%ymm8,%ymm1
+ DB 197,124,16,72,96 ; vmovups 0x60(%rax),%ymm9
+ DB 65,184,85,85,149,191 ; mov $0xbf955555,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,67,121,4,210,0 ; vpermilps $0x0,%xmm10,%xmm10
+ DB 196,67,45,24,210,1 ; vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ DB 65,184,0,0,192,63 ; mov $0x3fc00000,%r8d
+ DB 196,65,121,110,216 ; vmovd %r8d,%xmm11
+ DB 196,67,121,4,219,0 ; vpermilps $0x0,%xmm11,%xmm11
+ DB 196,67,37,24,219,1 ; vinsertf128 $0x1,%xmm11,%ymm11,%ymm11
+ DB 196,65,52,89,210 ; vmulps %ymm10,%ymm9,%ymm10
+ DB 196,65,44,88,211 ; vaddps %ymm11,%ymm10,%ymm10
+ DB 196,65,52,89,210 ; vmulps %ymm10,%ymm9,%ymm10
+ DB 196,65,60,88,194 ; vaddps %ymm10,%ymm8,%ymm8
+ DB 65,184,57,142,99,61 ; mov $0x3d638e39,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,67,121,4,210,0 ; vpermilps $0x0,%xmm10,%xmm10
+ DB 196,67,45,24,210,1 ; vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ DB 196,65,52,89,192 ; vmulps %ymm8,%ymm9,%ymm8
+ DB 196,65,44,88,192 ; vaddps %ymm8,%ymm10,%ymm8
+ DB 197,124,17,128,160,0,0,0 ; vmovups %ymm8,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_p3y_avx
+_sk_bicubic_p3y_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,184,0,0,192,63 ; mov $0x3fc00000,%r8d
+ DB 196,193,121,110,200 ; vmovd %r8d,%xmm1
+ DB 196,227,121,4,201,0 ; vpermilps $0x0,%xmm1,%xmm1
+ DB 196,227,117,24,201,1 ; vinsertf128 $0x1,%xmm1,%ymm1,%ymm1
+ DB 197,244,88,72,32 ; vaddps 0x20(%rax),%ymm1,%ymm1
+ DB 197,124,16,64,96 ; vmovups 0x60(%rax),%ymm8
+ DB 196,65,60,89,200 ; vmulps %ymm8,%ymm8,%ymm9
+ DB 65,184,114,28,199,62 ; mov $0x3ec71c72,%r8d
+ DB 196,65,121,110,208 ; vmovd %r8d,%xmm10
+ DB 196,67,121,4,210,0 ; vpermilps $0x0,%xmm10,%xmm10
+ DB 196,67,45,24,210,1 ; vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
+ DB 65,184,171,170,170,190 ; mov $0xbeaaaaab,%r8d
+ DB 196,65,121,110,216 ; vmovd %r8d,%xmm11
+ DB 196,67,121,4,219,0 ; vpermilps $0x0,%xmm11,%xmm11
+ DB 196,67,37,24,219,1 ; vinsertf128 $0x1,%xmm11,%ymm11,%ymm11
+ DB 196,65,60,89,194 ; vmulps %ymm10,%ymm8,%ymm8
+ DB 196,65,60,88,195 ; vaddps %ymm11,%ymm8,%ymm8
+ DB 196,65,52,89,192 ; vmulps %ymm8,%ymm9,%ymm8
+ DB 197,124,17,128,160,0,0,0 ; vmovups %ymm8,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
PUBLIC _sk_start_pipeline_sse41
_sk_start_pipeline_sse41 LABEL PROC
DB 65,87 ; push %r15
@@ -10512,6 +11140,332 @@ _sk_linear_gradient_2stops_sse41 LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
+PUBLIC _sk_save_xy_sse41
+_sk_save_xy_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,0,63 ; mov $0x3f000000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,40,200 ; movaps %xmm8,%xmm9
+ DB 68,15,88,200 ; addps %xmm0,%xmm9
+ DB 102,69,15,58,8,209,1 ; roundps $0x1,%xmm9,%xmm10
+ DB 69,15,92,202 ; subps %xmm10,%xmm9
+ DB 68,15,88,193 ; addps %xmm1,%xmm8
+ DB 102,69,15,58,8,208,1 ; roundps $0x1,%xmm8,%xmm10
+ DB 69,15,92,194 ; subps %xmm10,%xmm8
+ DB 15,17,0 ; movups %xmm0,(%rax)
+ DB 15,17,72,32 ; movups %xmm1,0x20(%rax)
+ DB 68,15,17,72,64 ; movups %xmm9,0x40(%rax)
+ DB 68,15,17,64,96 ; movups %xmm8,0x60(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_accumulate_sse41
+_sk_accumulate_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 68,15,16,128,128,0,0,0 ; movups 0x80(%rax),%xmm8
+ DB 68,15,16,136,160,0,0,0 ; movups 0xa0(%rax),%xmm9
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 69,15,40,193 ; movaps %xmm9,%xmm8
+ DB 68,15,89,192 ; mulps %xmm0,%xmm8
+ DB 65,15,88,224 ; addps %xmm8,%xmm4
+ DB 69,15,40,193 ; movaps %xmm9,%xmm8
+ DB 68,15,89,193 ; mulps %xmm1,%xmm8
+ DB 65,15,88,232 ; addps %xmm8,%xmm5
+ DB 69,15,40,193 ; movaps %xmm9,%xmm8
+ DB 68,15,89,194 ; mulps %xmm2,%xmm8
+ DB 65,15,88,240 ; addps %xmm8,%xmm6
+ DB 68,15,89,203 ; mulps %xmm3,%xmm9
+ DB 65,15,88,249 ; addps %xmm9,%xmm7
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bilinear_nx_sse41
+_sk_bilinear_nx_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,0,191 ; mov $0xbf000000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,0 ; movups (%rax),%xmm0
+ DB 68,15,16,72,64 ; movups 0x40(%rax),%xmm9
+ DB 65,15,88,192 ; addps %xmm8,%xmm0
+ DB 185,0,0,128,63 ; mov $0x3f800000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,92,193 ; subps %xmm9,%xmm8
+ DB 68,15,17,128,128,0,0,0 ; movups %xmm8,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bilinear_px_sse41
+_sk_bilinear_px_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,0,63 ; mov $0x3f000000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,0 ; movups (%rax),%xmm0
+ DB 68,15,16,72,64 ; movups 0x40(%rax),%xmm9
+ DB 65,15,88,192 ; addps %xmm8,%xmm0
+ DB 68,15,17,136,128,0,0,0 ; movups %xmm9,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bilinear_ny_sse41
+_sk_bilinear_ny_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,0,191 ; mov $0xbf000000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,72,32 ; movups 0x20(%rax),%xmm1
+ DB 68,15,16,72,96 ; movups 0x60(%rax),%xmm9
+ DB 65,15,88,200 ; addps %xmm8,%xmm1
+ DB 185,0,0,128,63 ; mov $0x3f800000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,92,193 ; subps %xmm9,%xmm8
+ DB 68,15,17,128,160,0,0,0 ; movups %xmm8,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bilinear_py_sse41
+_sk_bilinear_py_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,0,63 ; mov $0x3f000000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,72,32 ; movups 0x20(%rax),%xmm1
+ DB 68,15,16,72,96 ; movups 0x60(%rax),%xmm9
+ DB 65,15,88,200 ; addps %xmm8,%xmm1
+ DB 68,15,17,136,160,0,0,0 ; movups %xmm9,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_n3x_sse41
+_sk_bicubic_n3x_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,192,191 ; mov $0xbfc00000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,0 ; movups (%rax),%xmm0
+ DB 68,15,16,72,64 ; movups 0x40(%rax),%xmm9
+ DB 65,15,88,192 ; addps %xmm8,%xmm0
+ DB 185,0,0,128,63 ; mov $0x3f800000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,92,193 ; subps %xmm9,%xmm8
+ DB 185,114,28,199,62 ; mov $0x3ec71c72,%ecx
+ DB 102,68,15,110,201 ; movd %ecx,%xmm9
+ DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9
+ DB 185,171,170,170,190 ; mov $0xbeaaaaab,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 69,15,89,192 ; mulps %xmm8,%xmm8
+ DB 69,15,88,202 ; addps %xmm10,%xmm9
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 68,15,17,136,128,0,0,0 ; movups %xmm9,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_n1x_sse41
+_sk_bicubic_n1x_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,0,191 ; mov $0xbf000000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,0 ; movups (%rax),%xmm0
+ DB 68,15,16,72,64 ; movups 0x40(%rax),%xmm9
+ DB 65,15,88,192 ; addps %xmm8,%xmm0
+ DB 185,0,0,128,63 ; mov $0x3f800000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,92,193 ; subps %xmm9,%xmm8
+ DB 185,85,85,149,191 ; mov $0xbf955555,%ecx
+ DB 102,68,15,110,201 ; movd %ecx,%xmm9
+ DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9
+ DB 185,0,0,192,63 ; mov $0x3fc00000,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 69,15,88,202 ; addps %xmm10,%xmm9
+ DB 185,0,0,0,63 ; mov $0x3f000000,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 69,15,88,202 ; addps %xmm10,%xmm9
+ DB 185,57,142,99,61 ; mov $0x3d638e39,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 69,15,88,202 ; addps %xmm10,%xmm9
+ DB 68,15,17,136,128,0,0,0 ; movups %xmm9,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_p1x_sse41
+_sk_bicubic_p1x_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,0,63 ; mov $0x3f000000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,0 ; movups (%rax),%xmm0
+ DB 68,15,16,72,64 ; movups 0x40(%rax),%xmm9
+ DB 65,15,88,192 ; addps %xmm8,%xmm0
+ DB 185,85,85,149,191 ; mov $0xbf955555,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 185,0,0,192,63 ; mov $0x3fc00000,%ecx
+ DB 102,68,15,110,217 ; movd %ecx,%xmm11
+ DB 69,15,198,219,0 ; shufps $0x0,%xmm11,%xmm11
+ DB 69,15,89,209 ; mulps %xmm9,%xmm10
+ DB 69,15,88,211 ; addps %xmm11,%xmm10
+ DB 69,15,89,209 ; mulps %xmm9,%xmm10
+ DB 69,15,88,208 ; addps %xmm8,%xmm10
+ DB 185,57,142,99,61 ; mov $0x3d638e39,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,89,209 ; mulps %xmm9,%xmm10
+ DB 69,15,88,208 ; addps %xmm8,%xmm10
+ DB 68,15,17,144,128,0,0,0 ; movups %xmm10,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_p3x_sse41
+_sk_bicubic_p3x_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,192,63 ; mov $0x3fc00000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,0 ; movups (%rax),%xmm0
+ DB 68,15,16,72,64 ; movups 0x40(%rax),%xmm9
+ DB 65,15,88,192 ; addps %xmm8,%xmm0
+ DB 185,114,28,199,62 ; mov $0x3ec71c72,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,89,193 ; mulps %xmm9,%xmm8
+ DB 69,15,89,201 ; mulps %xmm9,%xmm9
+ DB 185,171,170,170,190 ; mov $0xbeaaaaab,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,88,194 ; addps %xmm10,%xmm8
+ DB 69,15,89,193 ; mulps %xmm9,%xmm8
+ DB 68,15,17,128,128,0,0,0 ; movups %xmm8,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_n3y_sse41
+_sk_bicubic_n3y_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,192,191 ; mov $0xbfc00000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,72,32 ; movups 0x20(%rax),%xmm1
+ DB 68,15,16,72,96 ; movups 0x60(%rax),%xmm9
+ DB 65,15,88,200 ; addps %xmm8,%xmm1
+ DB 185,0,0,128,63 ; mov $0x3f800000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,92,193 ; subps %xmm9,%xmm8
+ DB 185,114,28,199,62 ; mov $0x3ec71c72,%ecx
+ DB 102,68,15,110,201 ; movd %ecx,%xmm9
+ DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9
+ DB 185,171,170,170,190 ; mov $0xbeaaaaab,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 69,15,89,192 ; mulps %xmm8,%xmm8
+ DB 69,15,88,202 ; addps %xmm10,%xmm9
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 68,15,17,136,160,0,0,0 ; movups %xmm9,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_n1y_sse41
+_sk_bicubic_n1y_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,0,191 ; mov $0xbf000000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,72,32 ; movups 0x20(%rax),%xmm1
+ DB 68,15,16,72,96 ; movups 0x60(%rax),%xmm9
+ DB 65,15,88,200 ; addps %xmm8,%xmm1
+ DB 185,0,0,128,63 ; mov $0x3f800000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,92,193 ; subps %xmm9,%xmm8
+ DB 185,85,85,149,191 ; mov $0xbf955555,%ecx
+ DB 102,68,15,110,201 ; movd %ecx,%xmm9
+ DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9
+ DB 185,0,0,192,63 ; mov $0x3fc00000,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 69,15,88,202 ; addps %xmm10,%xmm9
+ DB 185,0,0,0,63 ; mov $0x3f000000,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 69,15,88,202 ; addps %xmm10,%xmm9
+ DB 185,57,142,99,61 ; mov $0x3d638e39,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 69,15,88,202 ; addps %xmm10,%xmm9
+ DB 68,15,17,136,160,0,0,0 ; movups %xmm9,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_p1y_sse41
+_sk_bicubic_p1y_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,0,63 ; mov $0x3f000000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,72,32 ; movups 0x20(%rax),%xmm1
+ DB 68,15,16,72,96 ; movups 0x60(%rax),%xmm9
+ DB 65,15,88,200 ; addps %xmm8,%xmm1
+ DB 185,85,85,149,191 ; mov $0xbf955555,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 185,0,0,192,63 ; mov $0x3fc00000,%ecx
+ DB 102,68,15,110,217 ; movd %ecx,%xmm11
+ DB 69,15,198,219,0 ; shufps $0x0,%xmm11,%xmm11
+ DB 69,15,89,209 ; mulps %xmm9,%xmm10
+ DB 69,15,88,211 ; addps %xmm11,%xmm10
+ DB 69,15,89,209 ; mulps %xmm9,%xmm10
+ DB 69,15,88,208 ; addps %xmm8,%xmm10
+ DB 185,57,142,99,61 ; mov $0x3d638e39,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,89,209 ; mulps %xmm9,%xmm10
+ DB 69,15,88,208 ; addps %xmm8,%xmm10
+ DB 68,15,17,144,160,0,0,0 ; movups %xmm10,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_p3y_sse41
+_sk_bicubic_p3y_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,192,63 ; mov $0x3fc00000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,72,32 ; movups 0x20(%rax),%xmm1
+ DB 68,15,16,72,96 ; movups 0x60(%rax),%xmm9
+ DB 65,15,88,200 ; addps %xmm8,%xmm1
+ DB 185,114,28,199,62 ; mov $0x3ec71c72,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,89,193 ; mulps %xmm9,%xmm8
+ DB 69,15,89,201 ; mulps %xmm9,%xmm9
+ DB 185,171,170,170,190 ; mov $0xbeaaaaab,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,88,194 ; addps %xmm10,%xmm8
+ DB 69,15,89,193 ; mulps %xmm9,%xmm8
+ DB 68,15,17,128,160,0,0,0 ; movups %xmm8,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
PUBLIC _sk_start_pipeline_sse2
_sk_start_pipeline_sse2 LABEL PROC
DB 65,87 ; push %r15
@@ -13933,5 +14887,344 @@ _sk_linear_gradient_2stops_sse2 LABEL PROC
DB 65,15,88,217 ; addps %xmm9,%xmm3
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_save_xy_sse2
+_sk_save_xy_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,0,63 ; mov $0x3f000000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,40,200 ; movaps %xmm8,%xmm9
+ DB 68,15,88,200 ; addps %xmm0,%xmm9
+ DB 243,69,15,91,209 ; cvttps2dq %xmm9,%xmm10
+ DB 69,15,91,210 ; cvtdq2ps %xmm10,%xmm10
+ DB 69,15,40,217 ; movaps %xmm9,%xmm11
+ DB 69,15,194,218,1 ; cmpltps %xmm10,%xmm11
+ DB 185,0,0,128,63 ; mov $0x3f800000,%ecx
+ DB 102,68,15,110,225 ; movd %ecx,%xmm12
+ DB 69,15,198,228,0 ; shufps $0x0,%xmm12,%xmm12
+ DB 69,15,84,220 ; andps %xmm12,%xmm11
+ DB 69,15,92,211 ; subps %xmm11,%xmm10
+ DB 69,15,92,202 ; subps %xmm10,%xmm9
+ DB 68,15,88,193 ; addps %xmm1,%xmm8
+ DB 243,69,15,91,208 ; cvttps2dq %xmm8,%xmm10
+ DB 69,15,91,210 ; cvtdq2ps %xmm10,%xmm10
+ DB 69,15,40,216 ; movaps %xmm8,%xmm11
+ DB 69,15,194,218,1 ; cmpltps %xmm10,%xmm11
+ DB 69,15,84,220 ; andps %xmm12,%xmm11
+ DB 69,15,92,211 ; subps %xmm11,%xmm10
+ DB 69,15,92,194 ; subps %xmm10,%xmm8
+ DB 15,17,0 ; movups %xmm0,(%rax)
+ DB 15,17,72,32 ; movups %xmm1,0x20(%rax)
+ DB 68,15,17,72,64 ; movups %xmm9,0x40(%rax)
+ DB 68,15,17,64,96 ; movups %xmm8,0x60(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_accumulate_sse2
+_sk_accumulate_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 68,15,16,128,128,0,0,0 ; movups 0x80(%rax),%xmm8
+ DB 68,15,16,136,160,0,0,0 ; movups 0xa0(%rax),%xmm9
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 69,15,40,193 ; movaps %xmm9,%xmm8
+ DB 68,15,89,192 ; mulps %xmm0,%xmm8
+ DB 65,15,88,224 ; addps %xmm8,%xmm4
+ DB 69,15,40,193 ; movaps %xmm9,%xmm8
+ DB 68,15,89,193 ; mulps %xmm1,%xmm8
+ DB 65,15,88,232 ; addps %xmm8,%xmm5
+ DB 69,15,40,193 ; movaps %xmm9,%xmm8
+ DB 68,15,89,194 ; mulps %xmm2,%xmm8
+ DB 65,15,88,240 ; addps %xmm8,%xmm6
+ DB 68,15,89,203 ; mulps %xmm3,%xmm9
+ DB 65,15,88,249 ; addps %xmm9,%xmm7
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bilinear_nx_sse2
+_sk_bilinear_nx_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,0,191 ; mov $0xbf000000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,0 ; movups (%rax),%xmm0
+ DB 68,15,16,72,64 ; movups 0x40(%rax),%xmm9
+ DB 65,15,88,192 ; addps %xmm8,%xmm0
+ DB 185,0,0,128,63 ; mov $0x3f800000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,92,193 ; subps %xmm9,%xmm8
+ DB 68,15,17,128,128,0,0,0 ; movups %xmm8,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bilinear_px_sse2
+_sk_bilinear_px_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,0,63 ; mov $0x3f000000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,0 ; movups (%rax),%xmm0
+ DB 68,15,16,72,64 ; movups 0x40(%rax),%xmm9
+ DB 65,15,88,192 ; addps %xmm8,%xmm0
+ DB 68,15,17,136,128,0,0,0 ; movups %xmm9,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bilinear_ny_sse2
+_sk_bilinear_ny_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,0,191 ; mov $0xbf000000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,72,32 ; movups 0x20(%rax),%xmm1
+ DB 68,15,16,72,96 ; movups 0x60(%rax),%xmm9
+ DB 65,15,88,200 ; addps %xmm8,%xmm1
+ DB 185,0,0,128,63 ; mov $0x3f800000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,92,193 ; subps %xmm9,%xmm8
+ DB 68,15,17,128,160,0,0,0 ; movups %xmm8,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bilinear_py_sse2
+_sk_bilinear_py_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,0,63 ; mov $0x3f000000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,72,32 ; movups 0x20(%rax),%xmm1
+ DB 68,15,16,72,96 ; movups 0x60(%rax),%xmm9
+ DB 65,15,88,200 ; addps %xmm8,%xmm1
+ DB 68,15,17,136,160,0,0,0 ; movups %xmm9,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_n3x_sse2
+_sk_bicubic_n3x_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,192,191 ; mov $0xbfc00000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,0 ; movups (%rax),%xmm0
+ DB 68,15,16,72,64 ; movups 0x40(%rax),%xmm9
+ DB 65,15,88,192 ; addps %xmm8,%xmm0
+ DB 185,0,0,128,63 ; mov $0x3f800000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,92,193 ; subps %xmm9,%xmm8
+ DB 185,114,28,199,62 ; mov $0x3ec71c72,%ecx
+ DB 102,68,15,110,201 ; movd %ecx,%xmm9
+ DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9
+ DB 185,171,170,170,190 ; mov $0xbeaaaaab,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 69,15,89,192 ; mulps %xmm8,%xmm8
+ DB 69,15,88,202 ; addps %xmm10,%xmm9
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 68,15,17,136,128,0,0,0 ; movups %xmm9,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_n1x_sse2
+_sk_bicubic_n1x_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,0,191 ; mov $0xbf000000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,0 ; movups (%rax),%xmm0
+ DB 68,15,16,72,64 ; movups 0x40(%rax),%xmm9
+ DB 65,15,88,192 ; addps %xmm8,%xmm0
+ DB 185,0,0,128,63 ; mov $0x3f800000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,92,193 ; subps %xmm9,%xmm8
+ DB 185,85,85,149,191 ; mov $0xbf955555,%ecx
+ DB 102,68,15,110,201 ; movd %ecx,%xmm9
+ DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9
+ DB 185,0,0,192,63 ; mov $0x3fc00000,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 69,15,88,202 ; addps %xmm10,%xmm9
+ DB 185,0,0,0,63 ; mov $0x3f000000,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 69,15,88,202 ; addps %xmm10,%xmm9
+ DB 185,57,142,99,61 ; mov $0x3d638e39,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 69,15,88,202 ; addps %xmm10,%xmm9
+ DB 68,15,17,136,128,0,0,0 ; movups %xmm9,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_p1x_sse2
+_sk_bicubic_p1x_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,0,63 ; mov $0x3f000000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,0 ; movups (%rax),%xmm0
+ DB 68,15,16,72,64 ; movups 0x40(%rax),%xmm9
+ DB 65,15,88,192 ; addps %xmm8,%xmm0
+ DB 185,85,85,149,191 ; mov $0xbf955555,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 185,0,0,192,63 ; mov $0x3fc00000,%ecx
+ DB 102,68,15,110,217 ; movd %ecx,%xmm11
+ DB 69,15,198,219,0 ; shufps $0x0,%xmm11,%xmm11
+ DB 69,15,89,209 ; mulps %xmm9,%xmm10
+ DB 69,15,88,211 ; addps %xmm11,%xmm10
+ DB 69,15,89,209 ; mulps %xmm9,%xmm10
+ DB 69,15,88,208 ; addps %xmm8,%xmm10
+ DB 185,57,142,99,61 ; mov $0x3d638e39,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,89,209 ; mulps %xmm9,%xmm10
+ DB 69,15,88,208 ; addps %xmm8,%xmm10
+ DB 68,15,17,144,128,0,0,0 ; movups %xmm10,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_p3x_sse2
+_sk_bicubic_p3x_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,192,63 ; mov $0x3fc00000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,0 ; movups (%rax),%xmm0
+ DB 68,15,16,72,64 ; movups 0x40(%rax),%xmm9
+ DB 65,15,88,192 ; addps %xmm8,%xmm0
+ DB 185,114,28,199,62 ; mov $0x3ec71c72,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,89,193 ; mulps %xmm9,%xmm8
+ DB 69,15,89,201 ; mulps %xmm9,%xmm9
+ DB 185,171,170,170,190 ; mov $0xbeaaaaab,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,88,194 ; addps %xmm10,%xmm8
+ DB 69,15,89,193 ; mulps %xmm9,%xmm8
+ DB 68,15,17,128,128,0,0,0 ; movups %xmm8,0x80(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_n3y_sse2
+_sk_bicubic_n3y_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,192,191 ; mov $0xbfc00000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,72,32 ; movups 0x20(%rax),%xmm1
+ DB 68,15,16,72,96 ; movups 0x60(%rax),%xmm9
+ DB 65,15,88,200 ; addps %xmm8,%xmm1
+ DB 185,0,0,128,63 ; mov $0x3f800000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,92,193 ; subps %xmm9,%xmm8
+ DB 185,114,28,199,62 ; mov $0x3ec71c72,%ecx
+ DB 102,68,15,110,201 ; movd %ecx,%xmm9
+ DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9
+ DB 185,171,170,170,190 ; mov $0xbeaaaaab,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 69,15,89,192 ; mulps %xmm8,%xmm8
+ DB 69,15,88,202 ; addps %xmm10,%xmm9
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 68,15,17,136,160,0,0,0 ; movups %xmm9,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_n1y_sse2
+_sk_bicubic_n1y_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,0,191 ; mov $0xbf000000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,72,32 ; movups 0x20(%rax),%xmm1
+ DB 68,15,16,72,96 ; movups 0x60(%rax),%xmm9
+ DB 65,15,88,200 ; addps %xmm8,%xmm1
+ DB 185,0,0,128,63 ; mov $0x3f800000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,92,193 ; subps %xmm9,%xmm8
+ DB 185,85,85,149,191 ; mov $0xbf955555,%ecx
+ DB 102,68,15,110,201 ; movd %ecx,%xmm9
+ DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9
+ DB 185,0,0,192,63 ; mov $0x3fc00000,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 69,15,88,202 ; addps %xmm10,%xmm9
+ DB 185,0,0,0,63 ; mov $0x3f000000,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 69,15,88,202 ; addps %xmm10,%xmm9
+ DB 185,57,142,99,61 ; mov $0x3d638e39,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 69,15,88,202 ; addps %xmm10,%xmm9
+ DB 68,15,17,136,160,0,0,0 ; movups %xmm9,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_p1y_sse2
+_sk_bicubic_p1y_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,0,63 ; mov $0x3f000000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,72,32 ; movups 0x20(%rax),%xmm1
+ DB 68,15,16,72,96 ; movups 0x60(%rax),%xmm9
+ DB 65,15,88,200 ; addps %xmm8,%xmm1
+ DB 185,85,85,149,191 ; mov $0xbf955555,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 185,0,0,192,63 ; mov $0x3fc00000,%ecx
+ DB 102,68,15,110,217 ; movd %ecx,%xmm11
+ DB 69,15,198,219,0 ; shufps $0x0,%xmm11,%xmm11
+ DB 69,15,89,209 ; mulps %xmm9,%xmm10
+ DB 69,15,88,211 ; addps %xmm11,%xmm10
+ DB 69,15,89,209 ; mulps %xmm9,%xmm10
+ DB 69,15,88,208 ; addps %xmm8,%xmm10
+ DB 185,57,142,99,61 ; mov $0x3d638e39,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,89,209 ; mulps %xmm9,%xmm10
+ DB 69,15,88,208 ; addps %xmm8,%xmm10
+ DB 68,15,17,144,160,0,0,0 ; movups %xmm10,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_bicubic_p3y_sse2
+_sk_bicubic_p3y_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 185,0,0,192,63 ; mov $0x3fc00000,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 15,16,72,32 ; movups 0x20(%rax),%xmm1
+ DB 68,15,16,72,96 ; movups 0x60(%rax),%xmm9
+ DB 65,15,88,200 ; addps %xmm8,%xmm1
+ DB 185,114,28,199,62 ; mov $0x3ec71c72,%ecx
+ DB 102,68,15,110,193 ; movd %ecx,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 69,15,89,193 ; mulps %xmm9,%xmm8
+ DB 69,15,89,201 ; mulps %xmm9,%xmm9
+ DB 185,171,170,170,190 ; mov $0xbeaaaaab,%ecx
+ DB 102,68,15,110,209 ; movd %ecx,%xmm10
+ DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10
+ DB 69,15,88,194 ; addps %xmm10,%xmm8
+ DB 69,15,89,193 ; mulps %xmm9,%xmm8
+ DB 68,15,17,128,160,0,0,0 ; movups %xmm8,0xa0(%rax)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
ENDIF
END
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 82191c91b8..2e6746c338 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -910,3 +910,115 @@ STAGE(linear_gradient_2stops) {
b = mad(t, c->f[2], c->b[2]);
a = mad(t, c->f[3], c->b[3]);
}
+
+STAGE(save_xy) {
+ auto c = (SkJumper_SamplerCtx*)ctx;
+
+ // Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy).
+ // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid
+ // surrounding (x,y) at (0.5,0.5) off-center.
+ auto fract = [](F v) { return v - floor_(v); };
+ F fx = fract(r + 0.5_f),
+ fy = fract(g + 0.5_f);
+
+ // Samplers will need to load x and fx, or y and fy.
+ memcpy(c->x, &r, sizeof(F));
+ memcpy(c->y, &g, sizeof(F));
+ memcpy(c->fx, &fx, sizeof(F));
+ memcpy(c->fy, &fy, sizeof(F));
+}
+
+STAGE(accumulate) {
+ auto c = (const SkJumper_SamplerCtx*)ctx;
+
+ // Bilinear and bicubic filters are both separable, so we produce independent contributions
+ // from x and y, multiplying them together here to get each pixel's total scale factor.
+ auto scale = unaligned_load<F>(c->scalex)
+ * unaligned_load<F>(c->scaley);
+ dr = mad(scale, r, dr);
+ dg = mad(scale, g, dg);
+ db = mad(scale, b, db);
+ da = mad(scale, a, da);
+}
+
+// In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
+// are combined in direct proportion to their area overlapping that logical query pixel.
+// At positive offsets, the x-axis contribution to that rectangle is fx, or (1-fx) at negative x.
+// The y-axis is symmetric.
+
+template <int kScale>
+SI void bilinear_x(SkJumper_SamplerCtx* ctx, F* x) {
+ *x = unaligned_load<F>(ctx->x) + C(kScale * 0.5f);
+ F fx = unaligned_load<F>(ctx->fx);
+
+ F scalex;
+ if (kScale == -1) { scalex = 1.0_f - fx; }
+ if (kScale == +1) { scalex = fx; }
+ memcpy(ctx->scalex, &scalex, sizeof(F));
+}
+template <int kScale>
+SI void bilinear_y(SkJumper_SamplerCtx* ctx, F* y) {
+ *y = unaligned_load<F>(ctx->y) + C(kScale * 0.5f);
+ F fy = unaligned_load<F>(ctx->fy);
+
+ F scaley;
+ if (kScale == -1) { scaley = 1.0_f - fy; }
+ if (kScale == +1) { scaley = fy; }
+ memcpy(ctx->scaley, &scaley, sizeof(F));
+}
+
+STAGE(bilinear_nx) { bilinear_x<-1>(ctx, &r); }
+STAGE(bilinear_px) { bilinear_x<+1>(ctx, &r); }
+STAGE(bilinear_ny) { bilinear_y<-1>(ctx, &g); }
+STAGE(bilinear_py) { bilinear_y<+1>(ctx, &g); }
+
+
+// In bicubic interpolation, the 16 pixels and +/- 0.5 and +/- 1.5 offsets from the sample
+// pixel center are combined with a non-uniform cubic filter, with higher values near the center.
+//
+// We break this function into two parts, one for near 0.5 offsets and one for far 1.5 offsets.
+// See GrCubicEffect for details of this particular filter.
+
+SI F bicubic_near(F t) {
+ // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18
+ return mad(t, mad(t, mad(C(-21/18.0f), t, C(27/18.0f)), C(9/18.0f)), C(1/18.0f));
+}
+SI F bicubic_far(F t) {
+ // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18)
+ return (t*t)*mad(C(7/18.0f), t, C(-6/18.0f));
+}
+
+template <int kScale>
+SI void bicubic_x(SkJumper_SamplerCtx* ctx, F* x) {
+ *x = unaligned_load<F>(ctx->x) + C(kScale * 0.5f);
+ F fx = unaligned_load<F>(ctx->fx);
+
+ F scalex;
+ if (kScale == -3) { scalex = bicubic_far (1.0_f - fx); }
+ if (kScale == -1) { scalex = bicubic_near(1.0_f - fx); }
+ if (kScale == +1) { scalex = bicubic_near( fx); }
+ if (kScale == +3) { scalex = bicubic_far ( fx); }
+ memcpy(ctx->scalex, &scalex, sizeof(F));
+}
+template <int kScale>
+SI void bicubic_y(SkJumper_SamplerCtx* ctx, F* y) {
+ *y = unaligned_load<F>(ctx->y) + C(kScale * 0.5f);
+ F fy = unaligned_load<F>(ctx->fy);
+
+ F scaley;
+ if (kScale == -3) { scaley = bicubic_far (1.0_f - fy); }
+ if (kScale == -1) { scaley = bicubic_near(1.0_f - fy); }
+ if (kScale == +1) { scaley = bicubic_near( fy); }
+ if (kScale == +3) { scaley = bicubic_far ( fy); }
+ memcpy(ctx->scaley, &scaley, sizeof(F));
+}
+
+STAGE(bicubic_n3x) { bicubic_x<-3>(ctx, &r); }
+STAGE(bicubic_n1x) { bicubic_x<-1>(ctx, &r); }
+STAGE(bicubic_p1x) { bicubic_x<+1>(ctx, &r); }
+STAGE(bicubic_p3x) { bicubic_x<+3>(ctx, &r); }
+
+STAGE(bicubic_n3y) { bicubic_y<-3>(ctx, &g); }
+STAGE(bicubic_n1y) { bicubic_y<-1>(ctx, &g); }
+STAGE(bicubic_p1y) { bicubic_y<+1>(ctx, &g); }
+STAGE(bicubic_p3y) { bicubic_y<+3>(ctx, &g); }
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index fd7a9e5b1a..1146b3d7fb 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -13,7 +13,6 @@
#include "SkColorSpaceXform_A2B.h"
#include "SkColorSpaceXformPriv.h"
#include "SkHalf.h"
-#include "SkImageShaderContext.h"
#include "SkMSAN.h"
#include "SkPM4f.h"
#include "SkPM4fPriv.h"
@@ -883,7 +882,7 @@ STAGE_CTX( clamp_y, const float*) { g = clamp (g, *ctx); }
STAGE_CTX(repeat_y, const float*) { g = repeat(g, *ctx); }
STAGE_CTX(mirror_y, const float*) { g = mirror(g, *ctx); }
-STAGE_CTX(save_xy, SkImageShaderContext*) {
+STAGE_CTX(save_xy, SkJumper_SamplerCtx*) {
r.store(ctx->x);
g.store(ctx->y);
@@ -895,7 +894,7 @@ STAGE_CTX(save_xy, SkImageShaderContext*) {
fract(g + 0.5f).store(ctx->fy);
}
-STAGE_CTX(accumulate, const SkImageShaderContext*) {
+STAGE_CTX(accumulate, const SkJumper_SamplerCtx*) {
// Bilinear and bicubic filtering are both separable, so we'll end up with independent
// scale contributions in x and y that we multiply together to get each pixel's scale factor.
auto scale = SkNf::Load(ctx->scalex) * SkNf::Load(ctx->scaley);
@@ -910,21 +909,21 @@ STAGE_CTX(accumulate, const SkImageShaderContext*) {
// At positive offsets, the x-axis contribution to that rectangular area is fx; (1-fx)
// at negative x offsets. The y-axis is treated symmetrically.
template <int Scale>
-SI void bilinear_x(SkImageShaderContext* ctx, SkNf* x) {
+SI void bilinear_x(SkJumper_SamplerCtx* ctx, SkNf* x) {
*x = SkNf::Load(ctx->x) + Scale*0.5f;
auto fx = SkNf::Load(ctx->fx);
(Scale > 0 ? fx : (1.0f - fx)).store(ctx->scalex);
}
template <int Scale>
-SI void bilinear_y(SkImageShaderContext* ctx, SkNf* y) {
+SI void bilinear_y(SkJumper_SamplerCtx* ctx, SkNf* y) {
*y = SkNf::Load(ctx->y) + Scale*0.5f;
auto fy = SkNf::Load(ctx->fy);
(Scale > 0 ? fy : (1.0f - fy)).store(ctx->scaley);
}
-STAGE_CTX(bilinear_nx, SkImageShaderContext*) { bilinear_x<-1>(ctx, &r); }
-STAGE_CTX(bilinear_px, SkImageShaderContext*) { bilinear_x<+1>(ctx, &r); }
-STAGE_CTX(bilinear_ny, SkImageShaderContext*) { bilinear_y<-1>(ctx, &g); }
-STAGE_CTX(bilinear_py, SkImageShaderContext*) { bilinear_y<+1>(ctx, &g); }
+STAGE_CTX(bilinear_nx, SkJumper_SamplerCtx*) { bilinear_x<-1>(ctx, &r); }
+STAGE_CTX(bilinear_px, SkJumper_SamplerCtx*) { bilinear_x<+1>(ctx, &r); }
+STAGE_CTX(bilinear_ny, SkJumper_SamplerCtx*) { bilinear_y<-1>(ctx, &g); }
+STAGE_CTX(bilinear_py, SkJumper_SamplerCtx*) { bilinear_y<+1>(ctx, &g); }
// In bicubic interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample
@@ -945,7 +944,7 @@ SI SkNf bicubic_far(const SkNf& t) {
}
template <int Scale>
-SI void bicubic_x(SkImageShaderContext* ctx, SkNf* x) {
+SI void bicubic_x(SkJumper_SamplerCtx* ctx, SkNf* x) {
*x = SkNf::Load(ctx->x) + Scale*0.5f;
auto fx = SkNf::Load(ctx->fx);
if (Scale == -3) { return bicubic_far (1.0f - fx).store(ctx->scalex); }
@@ -955,7 +954,7 @@ SI void bicubic_x(SkImageShaderContext* ctx, SkNf* x) {
SkDEBUGFAIL("unreachable");
}
template <int Scale>
-SI void bicubic_y(SkImageShaderContext* ctx, SkNf* y) {
+SI void bicubic_y(SkJumper_SamplerCtx* ctx, SkNf* y) {
*y = SkNf::Load(ctx->y) + Scale*0.5f;
auto fy = SkNf::Load(ctx->fy);
if (Scale == -3) { return bicubic_far (1.0f - fy).store(ctx->scaley); }
@@ -964,15 +963,15 @@ SI void bicubic_y(SkImageShaderContext* ctx, SkNf* y) {
if (Scale == +3) { return bicubic_far ( fy).store(ctx->scaley); }
SkDEBUGFAIL("unreachable");
}
-STAGE_CTX(bicubic_n3x, SkImageShaderContext*) { bicubic_x<-3>(ctx, &r); }
-STAGE_CTX(bicubic_n1x, SkImageShaderContext*) { bicubic_x<-1>(ctx, &r); }
-STAGE_CTX(bicubic_p1x, SkImageShaderContext*) { bicubic_x<+1>(ctx, &r); }
-STAGE_CTX(bicubic_p3x, SkImageShaderContext*) { bicubic_x<+3>(ctx, &r); }
+STAGE_CTX(bicubic_n3x, SkJumper_SamplerCtx*) { bicubic_x<-3>(ctx, &r); }
+STAGE_CTX(bicubic_n1x, SkJumper_SamplerCtx*) { bicubic_x<-1>(ctx, &r); }
+STAGE_CTX(bicubic_p1x, SkJumper_SamplerCtx*) { bicubic_x<+1>(ctx, &r); }
+STAGE_CTX(bicubic_p3x, SkJumper_SamplerCtx*) { bicubic_x<+3>(ctx, &r); }
-STAGE_CTX(bicubic_n3y, SkImageShaderContext*) { bicubic_y<-3>(ctx, &g); }
-STAGE_CTX(bicubic_n1y, SkImageShaderContext*) { bicubic_y<-1>(ctx, &g); }
-STAGE_CTX(bicubic_p1y, SkImageShaderContext*) { bicubic_y<+1>(ctx, &g); }
-STAGE_CTX(bicubic_p3y, SkImageShaderContext*) { bicubic_y<+3>(ctx, &g); }
+STAGE_CTX(bicubic_n3y, SkJumper_SamplerCtx*) { bicubic_y<-3>(ctx, &g); }
+STAGE_CTX(bicubic_n1y, SkJumper_SamplerCtx*) { bicubic_y<-1>(ctx, &g); }
+STAGE_CTX(bicubic_p1y, SkJumper_SamplerCtx*) { bicubic_y<+1>(ctx, &g); }
+STAGE_CTX(bicubic_p3y, SkJumper_SamplerCtx*) { bicubic_y<+3>(ctx, &g); }
template <typename T>