aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
authorGravatar Mike Klein <mtklein@chromium.org>2017-02-15 15:51:47 -0500
committerGravatar Mike Klein <mtklein@chromium.org>2017-02-15 22:47:58 +0000
commitf6a1a2179ad6bf37dd81a658b30d12bfbdfa5c3f (patch)
treeb02f237e634b929a1eafe7941880e5a3a6affb83 /src
parent6579da25ec17663a2609d1d03d881a4a0673db9e (diff)
Clean up SkSplicer.
SkJumper's looking promising enough that I want this further out of my face. Change-Id: I6dbe71aeabe32f7f4258ba157460e6985733b0ce Reviewed-on: https://skia-review.googlesource.com/8528 Commit-Queue: Mike Klein <mtklein@chromium.org> Reviewed-by: Herb Derby <herb@google.com> Reviewed-by: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src')
-rw-r--r--src/core/SkRasterPipeline.cpp5
-rw-r--r--src/core/SkRasterPipeline.h2
-rw-r--r--src/splicer/SkSplicer.cpp398
-rw-r--r--src/splicer/SkSplicer_generated.h1816
-rw-r--r--src/splicer/SkSplicer_shared.h50
-rw-r--r--src/splicer/SkSplicer_stages.cpp584
-rwxr-xr-xsrc/splicer/build_stages.py122
7 files changed, 0 insertions, 2977 deletions
diff --git a/src/core/SkRasterPipeline.cpp b/src/core/SkRasterPipeline.cpp
index 884a7cbc97..350c2f50b4 100644
--- a/src/core/SkRasterPipeline.cpp
+++ b/src/core/SkRasterPipeline.cpp
@@ -27,11 +27,6 @@ void SkRasterPipeline::run(size_t x, size_t n) const {
}
std::function<void(size_t, size_t)> SkRasterPipeline::compile() const {
-#ifdef SK_RASTER_PIPELINE_HAS_JIT
- if (auto fn = this->jit()) {
- return fn;
- }
-#endif
return SkOpts::compile_pipeline(fStages.data(), SkToInt(fStages.size()));
}
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 9afe2fdd29..9ab6667e5a 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -133,8 +133,6 @@ public:
void append_from_srgb(SkAlphaType);
private:
- std::function<void(size_t, size_t)> jit() const;
-
std::vector<Stage> fStages;
};
diff --git a/src/splicer/SkSplicer.cpp b/src/splicer/SkSplicer.cpp
deleted file mode 100644
index 2fbab90899..0000000000
--- a/src/splicer/SkSplicer.cpp
+++ /dev/null
@@ -1,398 +0,0 @@
-/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "SkCpu.h"
-#include "SkOpts.h"
-#include "SkRasterPipeline.h"
-#include "SkStream.h"
-#if defined(_MSC_VER)
- #include <windows.h>
-#else
- #include <sys/mman.h>
-#endif
-
-#include "SkSplicer_generated.h"
-#include "SkSplicer_shared.h"
-
-// Uncomment to dump output JIT'd pipeline.
-//#define DUMP "/tmp/dump.bin"
-//#define DUMP "/data/local/tmp/dump.bin"
-//
-// On x86, we'll include IACA markers too.
-// https://software.intel.com/en-us/articles/intel-architecture-code-analyzer
-// Running IACA will disassemble, and more.
-// $ ./iaca.sh -arch HSW -64 -mark 0 /tmp/dump.bin | less
-//
-// To disassemble an aarch64 dump,
-// $ adb pull /data/local/tmp/dump.bin; gobjdump -b binary -D dump.bin -m aarch64 | less
-//
-// To disassemble an armv7 dump,
-// $ adb pull /data/local/tmp/dump.bin; gobjdump -b binary -D dump.bin -m arm | less
-
-//#define M(st) #st,
-//static const char* kStageNames[] = { SK_RASTER_PIPELINE_STAGES(M) };
-//#undef M
-
-namespace {
-
- // Stages expect these constants to be set to these values.
- // It's fine to rearrange and add new ones if you update SkSplicer_constants.
- static const SkSplicer_constants kConstants = {
- 1.0f, 0.5f, 255.0f, 1/255.0f, 0x000000ff,
- {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f},
- 0.0025f, 0.6975f, 0.3000f, 1/12.92f, 0.055f, // from_srgb
- 12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f, // to_srgb
- 0x77800000, 0x07800000, // fp16 <-> fp32
- };
-
- // We do this a lot, so it's nice to infer the correct size. Works fine with arrays.
- template <typename T>
- static void splice(SkWStream* buf, const T& val) {
- buf->write(&val, sizeof(val));
- }
-
-#if defined(__aarch64__)
- static constexpr int kStride = 4;
- static void set_ctx(SkWStream* buf, void* ctx) {
- uint16_t parts[4];
- memcpy(parts, &ctx, 8);
- splice(buf, 0xd2f00000 | (parts[3] << 5) | 0x2); // move 16-bit intermediate << 48 into x2
- splice(buf, 0xf2c00000 | (parts[2] << 5) | 0x2); // merge 16-bit intermediate << 32 into x2
- splice(buf, 0xf2a00000 | (parts[1] << 5) | 0x2); // merge 16-bit intermediate << 16 into x2
- splice(buf, 0xf2800000 | (parts[0] << 5) | 0x2); // merge 16-bit intermediate << 0 into x2
- }
- static void loop(SkWStream* buf, int loop_start) {
- splice(buf, 0xeb01001f); // cmp x0, x1
- int off = loop_start - (int)buf->bytesWritten();
- off /= 4; // bytes -> instructions, still signed
- off = (off & 0x7ffff) << 5; // 19 bit maximum range (+- 256K instructions)
- splice(buf, 0x54000003 | off); // b.cc loop_start (cc == "carry clear", unsigned less than)
- }
- static void ret(SkWStream* buf) {
- splice(buf, 0xd65f03c0); // ret
- }
-#elif defined(__ARM_NEON__)
- static constexpr int kStride = 2;
- static void set_ctx(SkWStream* buf, void* ctx) {
- uint16_t parts[2];
- auto encode = [](uint16_t part) -> uint32_t {
- return (part & 0xf000) << 4 | (part & 0xfff);
- };
- memcpy(parts, &ctx, 4);
- splice(buf, 0xe3002000 | encode(parts[0])); // mov r2, <bottom 16 bits>
- splice(buf, 0xe3402000 | encode(parts[1])); // movt r2, <top 16 bits>
- }
- static void loop(SkWStream* buf, int loop_start) {
- splice(buf, 0xe1500001); // cmp r0, r1
- int off = loop_start - ((int)buf->bytesWritten() + 8 /*ARM is weird*/);
- off /= 4; // bytes -> instructions, still signed
- off = (off & 0x00ffffff);
- splice(buf, 0x3a000000 | off); // bcc loop_start
- }
- static void ret(SkWStream* buf) {
- splice(buf, 0xe12fff1e); // bx lr
- }
-#else
- static constexpr int kStride = 8;
- static void set_ctx(SkWStream* buf, void* ctx) {
- static const uint8_t movabsq_rdx[] = { 0x48, 0xba };
- splice(buf, movabsq_rdx); // movabsq <next 8 bytes>, %rdx
- splice(buf, ctx);
- }
- static void loop(SkWStream* buf, int loop_start) {
- static const uint8_t cmp_rsi_rdi[] = { 0x48, 0x39, 0xf7 };
- static const uint8_t jb_near[] = { 0x0f, 0x8c };
- splice(buf, cmp_rsi_rdi); // cmp %rsi, %rdi
- splice(buf, jb_near); // jb <next 4 bytes> (b == "before", unsigned less than)
- splice(buf, loop_start - (int)(buf->bytesWritten() + 4));
- }
-#endif
-
-#if defined(_MSC_VER)
- // Adapt from MS ABI to System V ABI used by stages.
- static void before_loop(SkWStream* buf) {
- // On the way into this adapter the stack is 16-byte aligned plus an 8-byte return address.
- // We need to leave the stack the same way: at an odd 8-byte alignment.
- static const uint8_t ms_to_system_v[] = {
- 0x56, // push %rsi
- 0x48,0x81,0xec,0xa0,0x00,0x00,0x00, // sub $0xa0,%rsp
- 0x44,0x0f,0x29,0xbc,0x24,0x90,0x00,0x00,0x00, // movaps %xmm15,0x90(%rsp)
- 0x44,0x0f,0x29,0xb4,0x24,0x80,0x00,0x00,0x00, // movaps %xmm14,0x80(%rsp)
- 0x44,0x0f,0x29,0x6c,0x24,0x70, // movaps %xmm13,0x70(%rsp)
- 0x44,0x0f,0x29,0x64,0x24,0x60, // movaps %xmm12,0x60(%rsp)
- 0x44,0x0f,0x29,0x5c,0x24,0x50, // movaps %xmm11,0x50(%rsp)
- 0x44,0x0f,0x29,0x54,0x24,0x40, // movaps %xmm10,0x40(%rsp)
- 0x44,0x0f,0x29,0x4c,0x24,0x30, // movaps %xmm9,0x30(%rsp)
- 0x44,0x0f,0x29,0x44,0x24,0x20, // movaps %xmm8,0x20(%rsp)
- 0x0f,0x29,0x7c,0x24,0x10, // movaps %xmm7,0x10(%rsp)
- 0x0f,0x29,0x34,0x24, // movaps %xmm6,(%rsp)
- 0x57, // push %rdi
- 0x48,0x89,0xcf, // mov %rcx,%rdi
- 0x48,0x89,0xd6, // mov %rdx,%rsi
- 0x4c,0x89,0xc2, // mov %r8,%rdx
- 0x4c,0x89,0xc9, // mov %r9,%rcx
- };
- splice(buf, ms_to_system_v);
- }
- static void after_loop(SkWStream* buf) {
- static const uint8_t system_v_to_ms[] = {
- // TODO: vzeroupper here?
- 0x5f, // pop %rdi
- 0x0f,0x28,0x34,0x24, // movaps (%rsp),%xmm6
- 0x0f,0x28,0x7c,0x24,0x10, // movaps 0x10(%rsp),%xmm7
- 0x44,0x0f,0x28,0x44,0x24,0x20, // movaps 0x20(%rsp),%xmm8
- 0x44,0x0f,0x28,0x4c,0x24,0x30, // movaps 0x30(%rsp),%xmm9
- 0x44,0x0f,0x28,0x54,0x24,0x40, // movaps 0x40(%rsp),%xmm10
- 0x44,0x0f,0x28,0x5c,0x24,0x50, // movaps 0x50(%rsp),%xmm11
- 0x44,0x0f,0x28,0x64,0x24,0x60, // movaps 0x60(%rsp),%xmm12
- 0x44,0x0f,0x28,0x6c,0x24,0x70, // movaps 0x70(%rsp),%xmm13
- 0x44,0x0f,0x28,0xb4,0x24,0x80,0x00,0x00,0x00, // movaps 0x80(%rsp),%xmm14
- 0x44,0x0f,0x28,0xbc,0x24,0x90,0x00,0x00,0x00, // movaps 0x90(%rsp),%xmm15
- 0x48,0x81,0xc4,0xa0,0x00,0x00,0x00, // add $0xa0,%rsp
- 0x5e, // pop %rsi
- };
- splice(buf, system_v_to_ms);
- }
-#elif !defined(__aarch64__) && !defined(__ARM_NEON__) && defined(DUMP)
- // IACA start and end markers.
- static const uint8_t ud2[] = { 0x0f, 0x0b }; // undefined... crashes when run
- static const uint8_t nop3[] = { 0x64, 0x67, 0x90 }; // 3 byte no-op
- static const uint8_t movl_ebx[] = { 0xbb }; // move next 4 bytes into ebx
-
- static void before_loop(SkWStream* buf) {
- splice(buf, ud2);
- splice(buf, movl_ebx);
- splice(buf, 111);
- splice(buf, nop3);
- }
- static void after_loop(SkWStream* buf) {
- splice(buf, movl_ebx);
- splice(buf, 222);
- splice(buf, nop3);
- splice(buf, ud2);
- }
-#else
- static void before_loop(SkWStream*) {}
- static void after_loop (SkWStream*) {}
-#endif
-
- // We can only mprotect / VirtualProtect at 4K page granularity.
- static size_t round_up_to_full_pages(size_t len) {
- size_t size = 0;
- while (size < len) {
- size += 4096;
- }
- return size;
- }
-
-#if defined(_MSC_VER)
- // Copy len bytes from src to memory that's executable. cleanup with cleanup_executable_mem().
- static void* copy_to_executable_mem(const void* src, size_t* len) {
- if (!src || !*len) {
- return nullptr;
- }
-
- size_t alloc = round_up_to_full_pages(*len);
-
- auto fn = VirtualAlloc(nullptr, alloc, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
- memcpy(fn, src, *len);
-
- DWORD dont_care;
- VirtualProtect(fn, alloc, PAGE_EXECUTE_READ, &dont_care);
-
- *len = alloc;
- return fn;
- }
- static void cleanup_executable_mem(void* fn, size_t len) {
- if (fn) {
- VirtualFree(fn, 0, MEM_RELEASE);
- }
- }
-#else
- static void* copy_to_executable_mem(const void* src, size_t* len) {
- if (!src || !*len) {
- return nullptr;
- }
-
- size_t alloc = round_up_to_full_pages(*len);
-
- auto fn = mmap(nullptr, alloc, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
- memcpy(fn, src, *len);
-
- mprotect(fn, alloc, PROT_READ|PROT_EXEC);
- __builtin___clear_cache((char*)fn, (char*)fn + *len); // Essential on ARM; no-op on x86.
-
- *len = alloc;
- return fn;
- }
- static void cleanup_executable_mem(void* fn, size_t len) {
- if (fn) {
- munmap(fn, len);
- }
- }
-#endif
-
-#define CASE(prefix, st) case SkRasterPipeline::st: splice(buf, prefix##_##st); break
-#define DEFINE_SPLICE_STAGE(prefix) \
- static bool prefix##_##splice_stage(SkWStream* buf, SkRasterPipeline::StockStage st) { \
- switch (st) { \
- default: return false; \
- CASE(prefix, seed_shader); \
- CASE(prefix, constant_color); \
- CASE(prefix, clear); \
- CASE(prefix, plus_); \
- CASE(prefix, srcover); \
- CASE(prefix, dstover); \
- CASE(prefix, clamp_0); \
- CASE(prefix, clamp_1); \
- CASE(prefix, clamp_a); \
- CASE(prefix, swap); \
- CASE(prefix, move_src_dst); \
- CASE(prefix, move_dst_src); \
- CASE(prefix, premul); \
- CASE(prefix, unpremul); \
- CASE(prefix, from_srgb); \
- CASE(prefix, to_srgb); \
- CASE(prefix, scale_u8); \
- CASE(prefix, load_tables); \
- CASE(prefix, load_8888); \
- CASE(prefix, store_8888); \
- CASE(prefix, load_f16); \
- CASE(prefix, store_f16); \
- CASE(prefix, matrix_2x3); \
- CASE(prefix, matrix_3x4); \
- CASE(prefix, clamp_x); \
- CASE(prefix, clamp_y); \
- CASE(prefix, linear_gradient_2stops); \
- } \
- return true; \
- }
- #if defined(__aarch64__)
- DEFINE_SPLICE_STAGE(aarch64)
- #elif defined(__ARM_NEON__)
- DEFINE_SPLICE_STAGE(armv7)
- #else
- DEFINE_SPLICE_STAGE(hsw)
- DEFINE_SPLICE_STAGE(sse2)
- #endif
-#undef DEFINE_SPLICE
-#undef CASE
-
- struct Spliced {
-
- Spliced(const SkRasterPipeline::Stage* stages, int nstages) {
- // We always create a backup interpreter pipeline,
- // - to handle any program we can't, and
- // - to handle the n < stride tails.
- fBackup = SkOpts::compile_pipeline(stages, nstages);
- fSplicedLen = 0;
- fSpliced = nullptr;
- // If we return early anywhere in here, !fSpliced means we'll use fBackup instead.
-
- #if defined(__aarch64__)
- auto splice_stage = aarch64_splice_stage;
- auto inc_x = [](SkWStream* buf) { splice(buf, aarch64_inc_x); };
- #elif defined(__ARM_NEON__)
- // Late generation ARMv7, e.g. Cortex A15 or Krait.
- if (!SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
- return;
- }
- auto splice_stage = armv7_splice_stage;
- auto inc_x = [](SkWStream* buf) { splice(buf, armv7_inc_x); };
- #else
- // To keep things simple, only x86-64 supported.
- if (sizeof(void*) != 8) {
- return;
- }
- bool hsw = true && SkCpu::Supports(SkCpu::HSW);
-
- auto splice_stage = hsw ? hsw_splice_stage : sse2_splice_stage;
- auto inc_x = [hsw](SkWStream* buf) {
- if (hsw) { splice(buf, hsw_inc_x); }
- else { splice(buf, sse2_inc_x); }
- };
- auto ret = [hsw](SkWStream* buf) {
- static const uint8_t vzeroupper[] = { 0xc5, 0xf8, 0x77 };
- static const uint8_t ret[] = { 0xc3 };
- if (hsw) {
- splice(buf, vzeroupper);
- }
- splice(buf, ret);
- };
- #endif
-
- SkDynamicMemoryWStream buf;
-
- // Our loop is the equivalent of this C++ code:
- // do {
- // ... run spliced stages...
- // x += stride;
- // } while(x < limit);
- before_loop(&buf);
- auto loop_start = buf.bytesWritten(); // Think of this like a label, loop_start:
-
- for (int i = 0; i < nstages; i++) {
- // If a stage has a context pointer, load it into rdx/x2, Stage argument 3 "ctx".
- if (stages[i].ctx) {
- set_ctx(&buf, stages[i].ctx);
- }
-
- // Splice in the code for the Stages, generated offline into SkSplicer_generated.h.
- if (!splice_stage(&buf, stages[i].stage)) {
- //SkDebugf("SkSplicer can't yet handle stage %d %s.\n",
- // stages[i].stage, kStageNames[stages[i].stage]);
- return;
- }
- }
-
- inc_x(&buf);
- loop(&buf, loop_start); // Loop back to handle more pixels if not done.
- after_loop(&buf);
- ret(&buf); // We're done.
-
- auto data = buf.detachAsData();
- fSplicedLen = data->size();
- fSpliced = copy_to_executable_mem(data->data(), &fSplicedLen);
-
- #if defined(DUMP)
- SkFILEWStream(DUMP).write(data->data(), data->size());
- #endif
- }
-
- // Spliced is stored in a std::function, so it needs to be copyable.
- Spliced(const Spliced& o) : fBackup (o.fBackup)
- , fSplicedLen(o.fSplicedLen)
- , fSpliced (copy_to_executable_mem(o.fSpliced, &fSplicedLen)) {}
-
- ~Spliced() {
- cleanup_executable_mem(fSpliced, fSplicedLen);
- }
-
- // Here's where we call fSpliced if we created it, fBackup if not.
- void operator()(size_t x, size_t n) const {
- size_t body = n/kStride*kStride; // Largest multiple of kStride (2, 4, 8, or 16) <= n.
- if (fSpliced && body) { // Can we run fSpliced for at least one stride?
- using Fn = void(size_t x, size_t limit, void* ctx, const void* k);
- ((Fn*)fSpliced)(x, x+body, nullptr, &kConstants);
-
- // Fall through to fBackup for any n<stride last pixels.
- x += body;
- n -= body;
- }
- fBackup(x,n);
- }
-
- std::function<void(size_t, size_t)> fBackup;
- size_t fSplicedLen;
- void* fSpliced;
- };
-
-}
-
-std::function<void(size_t, size_t)> SkRasterPipeline::jit() const {
- return Spliced(fStages.data(), SkToInt(fStages.size()));
-}
diff --git a/src/splicer/SkSplicer_generated.h b/src/splicer/SkSplicer_generated.h
deleted file mode 100644
index cea6a558e6..0000000000
--- a/src/splicer/SkSplicer_generated.h
+++ /dev/null
@@ -1,1816 +0,0 @@
-/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifndef SkSplicer_generated_DEFINED
-#define SkSplicer_generated_DEFINED
-
-// This file is generated semi-automatically with this command:
-// $ src/splicer/build_stages.py
-
-static const unsigned int aarch64_inc_x[] = {
- 0x91001000, // add x0, x0, #0x4
-};
-static const unsigned int aarch64_seed_shader[] = {
- 0xaa0303e8, // mov x8, x3
- 0x4ddfc902, // ld1r {v2.4s}, [x8], #4
- 0x4d40c841, // ld1r {v1.4s}, [x2]
- 0x3cc14066, // ldur q6, [x3,#20]
- 0x4e040c00, // dup v0.4s, w0
- 0x4d40c907, // ld1r {v7.4s}, [x8]
- 0x4e21d800, // scvtf v0.4s, v0.4s
- 0x4e21d821, // scvtf v1.4s, v1.4s
- 0x6f00e403, // movi v3.2d, #0x0
- 0x4e27d400, // fadd v0.4s, v0.4s, v7.4s
- 0x6f00e404, // movi v4.2d, #0x0
- 0x6f00e405, // movi v5.2d, #0x0
- 0x4e20d4c0, // fadd v0.4s, v6.4s, v0.4s
- 0x4e27d421, // fadd v1.4s, v1.4s, v7.4s
- 0x6f00e406, // movi v6.2d, #0x0
- 0x6f00e407, // movi v7.2d, #0x0
-};
-static const unsigned int aarch64_constant_color[] = {
- 0x3dc00043, // ldr q3, [x2]
- 0x4e040460, // dup v0.4s, v3.s[0]
- 0x4e0c0461, // dup v1.4s, v3.s[1]
- 0x4e140462, // dup v2.4s, v3.s[2]
- 0x4e1c0463, // dup v3.4s, v3.s[3]
-};
-static const unsigned int aarch64_clear[] = {
- 0x6f00e400, // movi v0.2d, #0x0
- 0x6f00e401, // movi v1.2d, #0x0
- 0x6f00e402, // movi v2.2d, #0x0
- 0x6f00e403, // movi v3.2d, #0x0
-};
-static const unsigned int aarch64_plus_[] = {
- 0x4e24d400, // fadd v0.4s, v0.4s, v4.4s
- 0x4e25d421, // fadd v1.4s, v1.4s, v5.4s
- 0x4e26d442, // fadd v2.4s, v2.4s, v6.4s
- 0x4e27d463, // fadd v3.4s, v3.4s, v7.4s
-};
-static const unsigned int aarch64_srcover[] = {
- 0x4d40c870, // ld1r {v16.4s}, [x3]
- 0x4ea3d610, // fsub v16.4s, v16.4s, v3.4s
- 0x4e24ce00, // fmla v0.4s, v16.4s, v4.4s
- 0x4e25ce01, // fmla v1.4s, v16.4s, v5.4s
- 0x4e26ce02, // fmla v2.4s, v16.4s, v6.4s
- 0x4e27ce03, // fmla v3.4s, v16.4s, v7.4s
-};
-static const unsigned int aarch64_dstover[] = {
- 0x4d40c871, // ld1r {v17.4s}, [x3]
- 0x4ea41c90, // mov v16.16b, v4.16b
- 0x4ea61cd2, // mov v18.16b, v6.16b
- 0x4ea71cf3, // mov v19.16b, v7.16b
- 0x4ea7d634, // fsub v20.4s, v17.4s, v7.4s
- 0x4ea51cb1, // mov v17.16b, v5.16b
- 0x4e20ce90, // fmla v16.4s, v20.4s, v0.4s
- 0x4e21ce91, // fmla v17.4s, v20.4s, v1.4s
- 0x4e22ce92, // fmla v18.4s, v20.4s, v2.4s
- 0x4e23ce93, // fmla v19.4s, v20.4s, v3.4s
- 0x4eb01e00, // mov v0.16b, v16.16b
- 0x4eb11e21, // mov v1.16b, v17.16b
- 0x4eb21e42, // mov v2.16b, v18.16b
- 0x4eb31e63, // mov v3.16b, v19.16b
-};
-static const unsigned int aarch64_clamp_0[] = {
- 0x6f00e410, // movi v16.2d, #0x0
- 0x4e30f400, // fmax v0.4s, v0.4s, v16.4s
- 0x4e30f421, // fmax v1.4s, v1.4s, v16.4s
- 0x4e30f442, // fmax v2.4s, v2.4s, v16.4s
- 0x4e30f463, // fmax v3.4s, v3.4s, v16.4s
-};
-static const unsigned int aarch64_clamp_1[] = {
- 0x4d40c870, // ld1r {v16.4s}, [x3]
- 0x4eb0f400, // fmin v0.4s, v0.4s, v16.4s
- 0x4eb0f421, // fmin v1.4s, v1.4s, v16.4s
- 0x4eb0f442, // fmin v2.4s, v2.4s, v16.4s
- 0x4eb0f463, // fmin v3.4s, v3.4s, v16.4s
-};
-static const unsigned int aarch64_clamp_a[] = {
- 0x4d40c870, // ld1r {v16.4s}, [x3]
- 0x4eb0f463, // fmin v3.4s, v3.4s, v16.4s
- 0x4ea3f400, // fmin v0.4s, v0.4s, v3.4s
- 0x4ea3f421, // fmin v1.4s, v1.4s, v3.4s
- 0x4ea3f442, // fmin v2.4s, v2.4s, v3.4s
-};
-static const unsigned int aarch64_swap[] = {
- 0x4ea31c70, // mov v16.16b, v3.16b
- 0x4ea21c51, // mov v17.16b, v2.16b
- 0x4ea11c32, // mov v18.16b, v1.16b
- 0x4ea01c13, // mov v19.16b, v0.16b
- 0x4ea41c80, // mov v0.16b, v4.16b
- 0x4ea51ca1, // mov v1.16b, v5.16b
- 0x4ea61cc2, // mov v2.16b, v6.16b
- 0x4ea71ce3, // mov v3.16b, v7.16b
- 0x4eb31e64, // mov v4.16b, v19.16b
- 0x4eb21e45, // mov v5.16b, v18.16b
- 0x4eb11e26, // mov v6.16b, v17.16b
- 0x4eb01e07, // mov v7.16b, v16.16b
-};
-static const unsigned int aarch64_move_src_dst[] = {
- 0x4ea01c04, // mov v4.16b, v0.16b
- 0x4ea11c25, // mov v5.16b, v1.16b
- 0x4ea21c46, // mov v6.16b, v2.16b
- 0x4ea31c67, // mov v7.16b, v3.16b
-};
-static const unsigned int aarch64_move_dst_src[] = {
- 0x4ea41c80, // mov v0.16b, v4.16b
- 0x4ea51ca1, // mov v1.16b, v5.16b
- 0x4ea61cc2, // mov v2.16b, v6.16b
- 0x4ea71ce3, // mov v3.16b, v7.16b
-};
-static const unsigned int aarch64_premul[] = {
- 0x6e23dc00, // fmul v0.4s, v0.4s, v3.4s
- 0x6e23dc21, // fmul v1.4s, v1.4s, v3.4s
- 0x6e23dc42, // fmul v2.4s, v2.4s, v3.4s
-};
-static const unsigned int aarch64_unpremul[] = {
- 0x4d40c870, // ld1r {v16.4s}, [x3]
- 0x4ea0d871, // fcmeq v17.4s, v3.4s, #0.0
- 0x6e23fe10, // fdiv v16.4s, v16.4s, v3.4s
- 0x4e711e10, // bic v16.16b, v16.16b, v17.16b
- 0x6e20de00, // fmul v0.4s, v16.4s, v0.4s
- 0x6e21de01, // fmul v1.4s, v16.4s, v1.4s
- 0x6e22de02, // fmul v2.4s, v16.4s, v2.4s
-};
-static const unsigned int aarch64_from_srgb[] = {
- 0x9100e068, // add x8, x3, #0x38
- 0x4d40c910, // ld1r {v16.4s}, [x8]
- 0x9100d068, // add x8, x3, #0x34
- 0x4d40c911, // ld1r {v17.4s}, [x8]
- 0x2d47cc72, // ldp s18, s19, [x3,#60]
- 0x6e22dc54, // fmul v20.4s, v2.4s, v2.4s
- 0x4eb01e15, // mov v21.16b, v16.16b
- 0x4eb01e17, // mov v23.16b, v16.16b
- 0x4f921050, // fmla v16.4s, v2.4s, v18.s[0]
- 0x4eb11e36, // mov v22.16b, v17.16b
- 0x4eb11e38, // mov v24.16b, v17.16b
- 0x4e34ce11, // fmla v17.4s, v16.4s, v20.4s
- 0x6e20dc10, // fmul v16.4s, v0.4s, v0.4s
- 0x91011068, // add x8, x3, #0x44
- 0x4f921015, // fmla v21.4s, v0.4s, v18.s[0]
- 0x4e30ceb6, // fmla v22.4s, v21.4s, v16.4s
- 0x4d40c910, // ld1r {v16.4s}, [x8]
- 0x6e21dc34, // fmul v20.4s, v1.4s, v1.4s
- 0x4f921037, // fmla v23.4s, v1.4s, v18.s[0]
- 0x4f939015, // fmul v21.4s, v0.4s, v19.s[0]
- 0x4f939032, // fmul v18.4s, v1.4s, v19.s[0]
- 0x4f939053, // fmul v19.4s, v2.4s, v19.s[0]
- 0x6ea0e600, // fcmgt v0.4s, v16.4s, v0.4s
- 0x6ea1e601, // fcmgt v1.4s, v16.4s, v1.4s
- 0x6ea2e602, // fcmgt v2.4s, v16.4s, v2.4s
- 0x4e34cef8, // fmla v24.4s, v23.4s, v20.4s
- 0x6e761ea0, // bsl v0.16b, v21.16b, v22.16b
- 0x6e781e41, // bsl v1.16b, v18.16b, v24.16b
- 0x6e711e62, // bsl v2.16b, v19.16b, v17.16b
-};
-static const unsigned int aarch64_to_srgb[] = {
- 0x6ea1d810, // frsqrte v16.4s, v0.4s
- 0x6ea1d835, // frsqrte v21.4s, v1.4s
- 0x6e30de17, // fmul v23.4s, v16.4s, v16.4s
- 0x6ea1d856, // frsqrte v22.4s, v2.4s
- 0x6e35deb9, // fmul v25.4s, v21.4s, v21.4s
- 0x4eb7fc17, // frsqrts v23.4s, v0.4s, v23.4s
- 0x91015068, // add x8, x3, #0x54
- 0x6e36deda, // fmul v26.4s, v22.4s, v22.4s
- 0x4eb9fc39, // frsqrts v25.4s, v1.4s, v25.4s
- 0x6e37de10, // fmul v16.4s, v16.4s, v23.4s
- 0x2d494871, // ldp s17, s18, [x3,#72]
- 0x4d40c914, // ld1r {v20.4s}, [x8]
- 0x4ebafc5a, // frsqrts v26.4s, v2.4s, v26.4s
- 0x6e39deb5, // fmul v21.4s, v21.4s, v25.4s
- 0x4ea1da17, // frecpe v23.4s, v16.4s
- 0xbd405073, // ldr s19, [x3,#80]
- 0x91016068, // add x8, x3, #0x58
- 0x6e3aded6, // fmul v22.4s, v22.4s, v26.4s
- 0x4ea1dabb, // frecpe v27.4s, v21.4s
- 0x4e37fe1d, // frecps v29.4s, v16.4s, v23.4s
- 0x4d40c918, // ld1r {v24.4s}, [x8]
- 0x4ea1dadc, // frecpe v28.4s, v22.4s
- 0x6e3ddef7, // fmul v23.4s, v23.4s, v29.4s
- 0x4e3bfebd, // frecps v29.4s, v21.4s, v27.4s
- 0x6e3ddf7b, // fmul v27.4s, v27.4s, v29.4s
- 0x4e3cfedd, // frecps v29.4s, v22.4s, v28.4s
- 0x6e3ddf9c, // fmul v28.4s, v28.4s, v29.4s
- 0x4eb41e9d, // mov v29.16b, v20.16b
- 0x6ea1da19, // frsqrte v25.4s, v16.4s
- 0x4f9312fd, // fmla v29.4s, v23.4s, v19.s[0]
- 0x4eb41e97, // mov v23.16b, v20.16b
- 0x4f91901a, // fmul v26.4s, v0.4s, v17.s[0]
- 0x4f931377, // fmla v23.4s, v27.4s, v19.s[0]
- 0x6ea1dabb, // frsqrte v27.4s, v21.4s
- 0x4f931394, // fmla v20.4s, v28.4s, v19.s[0]
- 0x4f919033, // fmul v19.4s, v1.4s, v17.s[0]
- 0x4f919051, // fmul v17.4s, v2.4s, v17.s[0]
- 0x6ea0e700, // fcmgt v0.4s, v24.4s, v0.4s
- 0x6ea1e701, // fcmgt v1.4s, v24.4s, v1.4s
- 0x6ea2e702, // fcmgt v2.4s, v24.4s, v2.4s
- 0x6e39df38, // fmul v24.4s, v25.4s, v25.4s
- 0x6ea1dadc, // frsqrte v28.4s, v22.4s
- 0x4eb8fe10, // frsqrts v16.4s, v16.4s, v24.4s
- 0x6e3bdf78, // fmul v24.4s, v27.4s, v27.4s
- 0x4eb8feb5, // frsqrts v21.4s, v21.4s, v24.4s
- 0x6e3cdf98, // fmul v24.4s, v28.4s, v28.4s
- 0x4eb8fed6, // frsqrts v22.4s, v22.4s, v24.4s
- 0x4d40c878, // ld1r {v24.4s}, [x3]
- 0x6e30df30, // fmul v16.4s, v25.4s, v16.4s
- 0x6e35df75, // fmul v21.4s, v27.4s, v21.4s
- 0x6e36df96, // fmul v22.4s, v28.4s, v22.4s
- 0x4f92121d, // fmla v29.4s, v16.4s, v18.s[0]
- 0x4f9212b7, // fmla v23.4s, v21.4s, v18.s[0]
- 0x4f9212d4, // fmla v20.4s, v22.4s, v18.s[0]
- 0x4ebdf710, // fmin v16.4s, v24.4s, v29.4s
- 0x4eb7f712, // fmin v18.4s, v24.4s, v23.4s
- 0x4eb4f714, // fmin v20.4s, v24.4s, v20.4s
- 0x6e701f40, // bsl v0.16b, v26.16b, v16.16b
- 0x6e721e61, // bsl v1.16b, v19.16b, v18.16b
- 0x6e741e22, // bsl v2.16b, v17.16b, v20.16b
-};
-static const unsigned int aarch64_scale_u8[] = {
- 0xf9400048, // ldr x8, [x2]
- 0xbd400c71, // ldr s17, [x3,#12]
- 0x8b000108, // add x8, x8, x0
- 0x39400109, // ldrb w9, [x8]
- 0x3940050a, // ldrb w10, [x8,#1]
- 0x4e021d30, // mov v16.h[0], w9
- 0x39400909, // ldrb w9, [x8,#2]
- 0x39400d08, // ldrb w8, [x8,#3]
- 0x4e061d50, // mov v16.h[1], w10
- 0x4e0a1d30, // mov v16.h[2], w9
- 0x4e0e1d10, // mov v16.h[3], w8
- 0x2f07b7f0, // bic v16.4h, #0xff, lsl #8
- 0x2f10a610, // uxtl v16.4s, v16.4h
- 0x6e21da10, // ucvtf v16.4s, v16.4s
- 0x4f919210, // fmul v16.4s, v16.4s, v17.s[0]
- 0x6e20de00, // fmul v0.4s, v16.4s, v0.4s
- 0x6e21de01, // fmul v1.4s, v16.4s, v1.4s
- 0x6e22de02, // fmul v2.4s, v16.4s, v2.4s
- 0x6e23de03, // fmul v3.4s, v16.4s, v3.4s
-};
-static const unsigned int aarch64_load_tables[] = {
- 0xa9402849, // ldp x9, x10, [x2]
- 0xd37ef408, // lsl x8, x0, #2
- 0x9100406b, // add x11, x3, #0x10
- 0x4d40c960, // ld1r {v0.4s}, [x11]
- 0x3ce86923, // ldr q3, [x9,x8]
- 0xa9412448, // ldp x8, x9, [x2,#16]
- 0x4e231c01, // and v1.16b, v0.16b, v3.16b
- 0x1e26002e, // fmov w14, s1
- 0x6f380462, // ushr v2.4s, v3.4s, #8
- 0x6f300470, // ushr v16.4s, v3.4s, #16
- 0x8b2e494e, // add x14, x10, w14, uxtw #2
- 0x0e0c3c2b, // mov w11, v1.s[1]
- 0x0e143c2c, // mov w12, v1.s[2]
- 0x0e1c3c2d, // mov w13, v1.s[3]
- 0x4e221c01, // and v1.16b, v0.16b, v2.16b
- 0x4e301c02, // and v2.16b, v0.16b, v16.16b
- 0x0d4081c0, // ld1 {v0.s}[0], [x14]
- 0x1e26002e, // fmov w14, s1
- 0x8b2e490e, // add x14, x8, w14, uxtw #2
- 0x8b2b494b, // add x11, x10, w11, uxtw #2
- 0xbc6c5950, // ldr s16, [x10,w12,uxtw #2]
- 0xbc6d5951, // ldr s17, [x10,w13,uxtw #2]
- 0x0e0c3c2a, // mov w10, v1.s[1]
- 0x0e143c2c, // mov w12, v1.s[2]
- 0x0e1c3c2d, // mov w13, v1.s[3]
- 0x0d4081c1, // ld1 {v1.s}[0], [x14]
- 0x0d409160, // ld1 {v0.s}[1], [x11]
- 0xbc6c5912, // ldr s18, [x8,w12,uxtw #2]
- 0x0e143c4c, // mov w12, v2.s[2]
- 0x1e26004e, // fmov w14, s2
- 0xbc6c5933, // ldr s19, [x9,w12,uxtw #2]
- 0x8b2e492c, // add x12, x9, w14, uxtw #2
- 0x8b2a490a, // add x10, x8, w10, uxtw #2
- 0x0e0c3c4f, // mov w15, v2.s[1]
- 0x0e1c3c4b, // mov w11, v2.s[3]
- 0x0d408182, // ld1 {v2.s}[0], [x12]
- 0x0d409141, // ld1 {v1.s}[1], [x10]
- 0x6e140600, // mov v0.s[2], v16.s[0]
- 0xbc6d5910, // ldr s16, [x8,w13,uxtw #2]
- 0x8b2f492a, // add x10, x9, w15, uxtw #2
- 0x0d409142, // ld1 {v2.s}[1], [x10]
- 0x6e140641, // mov v1.s[2], v18.s[0]
- 0x6e1c0620, // mov v0.s[3], v17.s[0]
- 0xbc6b5931, // ldr s17, [x9,w11,uxtw #2]
- 0x6e1c0601, // mov v1.s[3], v16.s[0]
- 0xbd400c70, // ldr s16, [x3,#12]
- 0x6f280463, // ushr v3.4s, v3.4s, #24
- 0x6e140662, // mov v2.s[2], v19.s[0]
- 0x4e21d863, // scvtf v3.4s, v3.4s
- 0x6e1c0622, // mov v2.s[3], v17.s[0]
- 0x4f909063, // fmul v3.4s, v3.4s, v16.s[0]
-};
-static const unsigned int aarch64_load_8888[] = {
- 0xf9400048, // ldr x8, [x2]
- 0x91004069, // add x9, x3, #0x10
- 0x4d40c920, // ld1r {v0.4s}, [x9]
- 0xd37ef409, // lsl x9, x0, #2
- 0x3ce96901, // ldr q1, [x8,x9]
- 0xbd400c63, // ldr s3, [x3,#12]
- 0x4e211c02, // and v2.16b, v0.16b, v1.16b
- 0x6f380430, // ushr v16.4s, v1.4s, #8
- 0x6f300431, // ushr v17.4s, v1.4s, #16
- 0x6f280421, // ushr v1.4s, v1.4s, #24
- 0x4e21d842, // scvtf v2.4s, v2.4s
- 0x4e301c10, // and v16.16b, v0.16b, v16.16b
- 0x4e311c11, // and v17.16b, v0.16b, v17.16b
- 0x4e21d832, // scvtf v18.4s, v1.4s
- 0x4f839040, // fmul v0.4s, v2.4s, v3.s[0]
- 0x4e21da01, // scvtf v1.4s, v16.4s
- 0x4e21da22, // scvtf v2.4s, v17.4s
- 0x4f839021, // fmul v1.4s, v1.4s, v3.s[0]
- 0x4f839042, // fmul v2.4s, v2.4s, v3.s[0]
- 0x4f839243, // fmul v3.4s, v18.4s, v3.s[0]
-};
-static const unsigned int aarch64_store_8888[] = {
- 0xbd400870, // ldr s16, [x3,#8]
- 0xf9400048, // ldr x8, [x2]
- 0xd37ef409, // lsl x9, x0, #2
- 0x4f909032, // fmul v18.4s, v1.4s, v16.s[0]
- 0x4f909011, // fmul v17.4s, v0.4s, v16.s[0]
- 0x6e21aa52, // fcvtnu v18.4s, v18.4s
- 0x6e21aa31, // fcvtnu v17.4s, v17.4s
- 0x4f285652, // shl v18.4s, v18.4s, #8
- 0x4eb11e51, // orr v17.16b, v18.16b, v17.16b
- 0x4f909052, // fmul v18.4s, v2.4s, v16.s[0]
- 0x4f909070, // fmul v16.4s, v3.4s, v16.s[0]
- 0x6e21aa52, // fcvtnu v18.4s, v18.4s
- 0x6e21aa10, // fcvtnu v16.4s, v16.4s
- 0x4f305652, // shl v18.4s, v18.4s, #16
- 0x4eb21e31, // orr v17.16b, v17.16b, v18.16b
- 0x4f385610, // shl v16.4s, v16.4s, #24
- 0x4eb01e30, // orr v16.16b, v17.16b, v16.16b
- 0x3ca96910, // str q16, [x8,x9]
-};
-static const unsigned int aarch64_load_f16[] = {
- 0xf9400048, // ldr x8, [x2]
- 0x8b000d08, // add x8, x8, x0, lsl #3
- 0x0c400510, // ld4 {v16.4h-v19.4h}, [x8]
- 0x0e217a00, // fcvtl v0.4s, v16.4h
- 0x0e217a21, // fcvtl v1.4s, v17.4h
- 0x0e217a42, // fcvtl v2.4s, v18.4h
- 0x0e217a63, // fcvtl v3.4s, v19.4h
-};
-static const unsigned int aarch64_store_f16[] = {
- 0xf9400048, // ldr x8, [x2]
- 0x0e216810, // fcvtn v16.4h, v0.4s
- 0x0e216831, // fcvtn v17.4h, v1.4s
- 0x0e216852, // fcvtn v18.4h, v2.4s
- 0x8b000d08, // add x8, x8, x0, lsl #3
- 0x0e216873, // fcvtn v19.4h, v3.4s
- 0x0c000510, // st4 {v16.4h-v19.4h}, [x8]
-};
-static const unsigned int aarch64_clamp_x[] = {
- 0x4d40c850, // ld1r {v16.4s}, [x2]
- 0x6f07e7f1, // movi v17.2d, #0xffffffffffffffff
- 0x6f00e412, // movi v18.2d, #0x0
- 0x4eb18610, // add v16.4s, v16.4s, v17.4s
- 0x4eb0f400, // fmin v0.4s, v0.4s, v16.4s
- 0x4e20f640, // fmax v0.4s, v18.4s, v0.4s
-};
-static const unsigned int aarch64_clamp_y[] = {
- 0x4d40c850, // ld1r {v16.4s}, [x2]
- 0x6f07e7f1, // movi v17.2d, #0xffffffffffffffff
- 0x6f00e412, // movi v18.2d, #0x0
- 0x4eb18610, // add v16.4s, v16.4s, v17.4s
- 0x4eb0f421, // fmin v1.4s, v1.4s, v16.4s
- 0x4e21f641, // fmax v1.4s, v18.4s, v1.4s
-};
-static const unsigned int aarch64_matrix_2x3[] = {
- 0xaa0203e8, // mov x8, x2
- 0x4ddfc912, // ld1r {v18.4s}, [x8], #4
- 0x91004049, // add x9, x2, #0x10
- 0x4d40c930, // ld1r {v16.4s}, [x9]
- 0x91005049, // add x9, x2, #0x14
- 0x2d415053, // ldp s19, s20, [x2,#8]
- 0x4d40c931, // ld1r {v17.4s}, [x9]
- 0xbd400115, // ldr s21, [x8]
- 0x4f931030, // fmla v16.4s, v1.4s, v19.s[0]
- 0x4f941031, // fmla v17.4s, v1.4s, v20.s[0]
- 0x4e20ce50, // fmla v16.4s, v18.4s, v0.4s
- 0x4f951011, // fmla v17.4s, v0.4s, v21.s[0]
- 0x4eb01e00, // mov v0.16b, v16.16b
- 0x4eb11e21, // mov v1.16b, v17.16b
-};
-static const unsigned int aarch64_matrix_3x4[] = {
- 0xaa0203e8, // mov x8, x2
- 0x91009049, // add x9, x2, #0x24
- 0x4ddfc913, // ld1r {v19.4s}, [x8], #4
- 0x4d40c930, // ld1r {v16.4s}, [x9]
- 0x9100a049, // add x9, x2, #0x28
- 0x4d40c931, // ld1r {v17.4s}, [x9]
- 0x2d435454, // ldp s20, s21, [x2,#24]
- 0x9100b049, // add x9, x2, #0x2c
- 0xbd402056, // ldr s22, [x2,#32]
- 0x4d40c932, // ld1r {v18.4s}, [x9]
- 0x4f941050, // fmla v16.4s, v2.4s, v20.s[0]
- 0x4f951051, // fmla v17.4s, v2.4s, v21.s[0]
- 0x2d415454, // ldp s20, s21, [x2,#8]
- 0x4f961052, // fmla v18.4s, v2.4s, v22.s[0]
- 0x2d425842, // ldp s2, s22, [x2,#16]
- 0x4f951030, // fmla v16.4s, v1.4s, v21.s[0]
- 0xbd400115, // ldr s21, [x8]
- 0x4f821031, // fmla v17.4s, v1.4s, v2.s[0]
- 0x4f961032, // fmla v18.4s, v1.4s, v22.s[0]
- 0x4e20ce70, // fmla v16.4s, v19.4s, v0.4s
- 0x4f951011, // fmla v17.4s, v0.4s, v21.s[0]
- 0x4f941012, // fmla v18.4s, v0.4s, v20.s[0]
- 0x4eb01e00, // mov v0.16b, v16.16b
- 0x4eb11e21, // mov v1.16b, v17.16b
- 0x4eb21e42, // mov v2.16b, v18.16b
-};
-static const unsigned int aarch64_linear_gradient_2stops[] = {
- 0xad404443, // ldp q3, q17, [x2]
- 0x4e040470, // dup v16.4s, v3.s[0]
- 0x4e0c0461, // dup v1.4s, v3.s[1]
- 0x4e140462, // dup v2.4s, v3.s[2]
- 0x4e1c0463, // dup v3.4s, v3.s[3]
- 0x4f911010, // fmla v16.4s, v0.4s, v17.s[0]
- 0x4fb11001, // fmla v1.4s, v0.4s, v17.s[1]
- 0x4f911802, // fmla v2.4s, v0.4s, v17.s[2]
- 0x4fb11803, // fmla v3.4s, v0.4s, v17.s[3]
- 0x4eb01e00, // mov v0.16b, v16.16b
-};
-static const unsigned int armv7_inc_x[] = {
- 0xe2800002, // add r0, r0, #2
-};
-static const unsigned int armv7_seed_shader[] = {
- 0xee800b90, // vdup.32 d16, r0
- 0xe283c004, // add ip, r3, #4
- 0xf3fb0620, // vcvt.f32.s32 d16, d16
- 0xf4e21c9f, // vld1.32 {d17[]}, [r2 :32]
- 0xf3fb1621, // vcvt.f32.s32 d17, d17
- 0xf4ec2c9f, // vld1.32 {d18[]}, [ip :32]
- 0xf2803010, // vmov.i32 d3, #0
- 0xf2804010, // vmov.i32 d4, #0
- 0xf2400da2, // vadd.f32 d16, d16, d18
- 0xf4a32c9f, // vld1.32 {d2[]}, [r3 :32]
- 0xf2011da2, // vadd.f32 d1, d17, d18
- 0xf2805010, // vmov.i32 d5, #0
- 0xedd33b05, // vldr d19, [r3, #20]
- 0xf2806010, // vmov.i32 d6, #0
- 0xf2030da0, // vadd.f32 d0, d19, d16
- 0xf2807010, // vmov.i32 d7, #0
-};
-static const unsigned int armv7_constant_color[] = {
- 0xf4620a0f, // vld1.8 {d16-d17}, [r2]
- 0xf3b40c20, // vdup.32 d0, d16[0]
- 0xf3bc1c20, // vdup.32 d1, d16[1]
- 0xf3b42c21, // vdup.32 d2, d17[0]
- 0xf3bc3c21, // vdup.32 d3, d17[1]
-};
-static const unsigned int armv7_clear[] = {
- 0xf2800010, // vmov.i32 d0, #0
- 0xf2801010, // vmov.i32 d1, #0
- 0xf2802010, // vmov.i32 d2, #0
- 0xf2803010, // vmov.i32 d3, #0
-};
-static const unsigned int armv7_plus_[] = {
- 0xf2000d04, // vadd.f32 d0, d0, d4
- 0xf2011d05, // vadd.f32 d1, d1, d5
- 0xf2022d06, // vadd.f32 d2, d2, d6
- 0xf2033d07, // vadd.f32 d3, d3, d7
-};
-static const unsigned int armv7_srcover[] = {
- 0xf4e30c9f, // vld1.32 {d16[]}, [r3 :32]
- 0xf2600d83, // vsub.f32 d16, d16, d3
- 0xf2040c30, // vfma.f32 d0, d4, d16
- 0xf2051c30, // vfma.f32 d1, d5, d16
- 0xf2062c30, // vfma.f32 d2, d6, d16
- 0xf2073c30, // vfma.f32 d3, d7, d16
-};
-static const unsigned int armv7_dstover[] = {
- 0xf4e30c9f, // vld1.32 {d16[]}, [r3 :32]
- 0xf2651115, // vorr d17, d5, d5
- 0xf2604d87, // vsub.f32 d20, d16, d7
- 0xf2640114, // vorr d16, d4, d4
- 0xf2662116, // vorr d18, d6, d6
- 0xf2673117, // vorr d19, d7, d7
- 0xf2400c34, // vfma.f32 d16, d0, d20
- 0xf2411c34, // vfma.f32 d17, d1, d20
- 0xf2422c34, // vfma.f32 d18, d2, d20
- 0xf2433c34, // vfma.f32 d19, d3, d20
- 0xf22001b0, // vorr d0, d16, d16
- 0xf22111b1, // vorr d1, d17, d17
- 0xf22221b2, // vorr d2, d18, d18
- 0xf22331b3, // vorr d3, d19, d19
-};
-static const unsigned int armv7_clamp_0[] = {
- 0xf2c00010, // vmov.i32 d16, #0
- 0xf2000f20, // vmax.f32 d0, d0, d16
- 0xf2011f20, // vmax.f32 d1, d1, d16
- 0xf2022f20, // vmax.f32 d2, d2, d16
- 0xf2033f20, // vmax.f32 d3, d3, d16
-};
-static const unsigned int armv7_clamp_1[] = {
- 0xf4e30c9f, // vld1.32 {d16[]}, [r3 :32]
- 0xf2200f20, // vmin.f32 d0, d0, d16
- 0xf2211f20, // vmin.f32 d1, d1, d16
- 0xf2222f20, // vmin.f32 d2, d2, d16
- 0xf2233f20, // vmin.f32 d3, d3, d16
-};
-static const unsigned int armv7_clamp_a[] = {
- 0xf4e30c9f, // vld1.32 {d16[]}, [r3 :32]
- 0xf2233f20, // vmin.f32 d3, d3, d16
- 0xf2200f03, // vmin.f32 d0, d0, d3
- 0xf2211f03, // vmin.f32 d1, d1, d3
- 0xf2222f03, // vmin.f32 d2, d2, d3
-};
-static const unsigned int armv7_swap[] = {
- 0xeef00b43, // vmov.f64 d16, d3
- 0xeef01b42, // vmov.f64 d17, d2
- 0xeef02b41, // vmov.f64 d18, d1
- 0xeef03b40, // vmov.f64 d19, d0
- 0xeeb00b44, // vmov.f64 d0, d4
- 0xeeb01b45, // vmov.f64 d1, d5
- 0xeeb02b46, // vmov.f64 d2, d6
- 0xeeb03b47, // vmov.f64 d3, d7
- 0xeeb04b63, // vmov.f64 d4, d19
- 0xeeb05b62, // vmov.f64 d5, d18
- 0xeeb06b61, // vmov.f64 d6, d17
- 0xeeb07b60, // vmov.f64 d7, d16
-};
-static const unsigned int armv7_move_src_dst[] = {
- 0xeeb04b40, // vmov.f64 d4, d0
- 0xeeb05b41, // vmov.f64 d5, d1
- 0xeeb06b42, // vmov.f64 d6, d2
- 0xeeb07b43, // vmov.f64 d7, d3
-};
-static const unsigned int armv7_move_dst_src[] = {
- 0xeeb00b44, // vmov.f64 d0, d4
- 0xeeb01b45, // vmov.f64 d1, d5
- 0xeeb02b46, // vmov.f64 d2, d6
- 0xeeb03b47, // vmov.f64 d3, d7
-};
-static const unsigned int armv7_premul[] = {
- 0xf3000d13, // vmul.f32 d0, d0, d3
- 0xf3011d13, // vmul.f32 d1, d1, d3
- 0xf3022d13, // vmul.f32 d2, d2, d3
-};
-static const unsigned int armv7_unpremul[] = {
- 0xed2d8b04, // vpush {d8-d9}
- 0xed938a00, // vldr s16, [r3]
- 0xf2c00010, // vmov.i32 d16, #0
- 0xf3f91503, // vceq.f32 d17, d3, #0
- 0xeec89a23, // vdiv.f32 s19, s16, s7
- 0xee889a03, // vdiv.f32 s18, s16, s6
- 0xf3501199, // vbsl d17, d16, d9
- 0xf3010d90, // vmul.f32 d0, d17, d0
- 0xf3011d91, // vmul.f32 d1, d17, d1
- 0xf3012d92, // vmul.f32 d2, d17, d2
- 0xecbd8b04, // vpop {d8-d9}
-};
-static const unsigned int armv7_from_srgb[] = {
- 0xed2d8b02, // vpush {d8}
- 0xe283c03c, // add ip, r3, #60
- 0xed938a10, // vldr s16, [r3, #64]
- 0xf3402d10, // vmul.f32 d18, d0, d0
- 0xf4ec0c9f, // vld1.32 {d16[]}, [ip :32]
- 0xe283c038, // add ip, r3, #56
- 0xf3413d11, // vmul.f32 d19, d1, d1
- 0xf4ec1c9f, // vld1.32 {d17[]}, [ip :32]
- 0xe283c044, // add ip, r3, #68
- 0xf26141b1, // vorr d20, d17, d17
- 0xf26171b1, // vorr d23, d17, d17
- 0xf4ec8c9f, // vld1.32 {d24[]}, [ip :32]
- 0xf2404c30, // vfma.f32 d20, d0, d16
- 0xe283c034, // add ip, r3, #52
- 0xf2417c30, // vfma.f32 d23, d1, d16
- 0xf2421c30, // vfma.f32 d17, d2, d16
- 0xf3425d12, // vmul.f32 d21, d2, d2
- 0xf2e16948, // vmul.f32 d22, d1, d8[0]
- 0xf2e00948, // vmul.f32 d16, d0, d8[0]
- 0xf2e29948, // vmul.f32 d25, d2, d8[0]
- 0xf3282e82, // vcgt.f32 d2, d24, d2
- 0xf3281e81, // vcgt.f32 d1, d24, d1
- 0xf3280e80, // vcgt.f32 d0, d24, d0
- 0xf4ec8c9f, // vld1.32 {d24[]}, [ip :32]
- 0xf268a1b8, // vorr d26, d24, d24
- 0xf242acb4, // vfma.f32 d26, d18, d20
- 0xf26821b8, // vorr d18, d24, d24
- 0xf2432cb7, // vfma.f32 d18, d19, d23
- 0xf2458cb1, // vfma.f32 d24, d21, d17
- 0xf31001ba, // vbsl d0, d16, d26
- 0xf31611b2, // vbsl d1, d22, d18
- 0xf31921b8, // vbsl d2, d25, d24
- 0xecbd8b02, // vpop {d8}
-};
-static const unsigned int armv7_to_srgb[] = {
- 0xed2d8b02, // vpush {d8}
- 0xf3fb0580, // vrsqrte.f32 d16, d0
- 0xe283c050, // add ip, r3, #80
- 0xf3fb1581, // vrsqrte.f32 d17, d1
- 0xed938a12, // vldr s16, [r3, #72]
- 0xf3fb2582, // vrsqrte.f32 d18, d2
- 0xf3403db0, // vmul.f32 d19, d16, d16
- 0xf3414db1, // vmul.f32 d20, d17, d17
- 0xf3425db2, // vmul.f32 d21, d18, d18
- 0xf2603f33, // vrsqrts.f32 d19, d0, d19
- 0xf2614f34, // vrsqrts.f32 d20, d1, d20
- 0xf2625f35, // vrsqrts.f32 d21, d2, d21
- 0xf3400db3, // vmul.f32 d16, d16, d19
- 0xf3411db4, // vmul.f32 d17, d17, d20
- 0xf3422db5, // vmul.f32 d18, d18, d21
- 0xf3fb3520, // vrecpe.f32 d19, d16
- 0xf3fb4521, // vrecpe.f32 d20, d17
- 0xf3fb6522, // vrecpe.f32 d22, d18
- 0xf3fb55a2, // vrsqrte.f32 d21, d18
- 0xf3fb75a0, // vrsqrte.f32 d23, d16
- 0xf3fb85a1, // vrsqrte.f32 d24, d17
- 0xf2409fb3, // vrecps.f32 d25, d16, d19
- 0xf241afb4, // vrecps.f32 d26, d17, d20
- 0xf242bfb6, // vrecps.f32 d27, d18, d22
- 0xf345cdb5, // vmul.f32 d28, d21, d21
- 0xf347ddb7, // vmul.f32 d29, d23, d23
- 0xf348edb8, // vmul.f32 d30, d24, d24
- 0xf2622fbc, // vrsqrts.f32 d18, d18, d28
- 0xf2600fbd, // vrsqrts.f32 d16, d16, d29
- 0xf2611fbe, // vrsqrts.f32 d17, d17, d30
- 0xf3433db9, // vmul.f32 d19, d19, d25
- 0xf4ec9c9f, // vld1.32 {d25[]}, [ip :32]
- 0xe283c054, // add ip, r3, #84
- 0xf3444dba, // vmul.f32 d20, d20, d26
- 0xf3466dbb, // vmul.f32 d22, d22, d27
- 0xf4ecac9f, // vld1.32 {d26[]}, [ip :32]
- 0xe283c04c, // add ip, r3, #76
- 0xf26ab1ba, // vorr d27, d26, d26
- 0xf249bcb3, // vfma.f32 d27, d25, d19
- 0xf26a31ba, // vorr d19, d26, d26
- 0xf2493cb4, // vfma.f32 d19, d25, d20
- 0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32]
- 0xf249acb6, // vfma.f32 d26, d25, d22
- 0xe283c058, // add ip, r3, #88
- 0xf3452db2, // vmul.f32 d18, d21, d18
- 0xf3470db0, // vmul.f32 d16, d23, d16
- 0xf3481db1, // vmul.f32 d17, d24, d17
- 0xf2e05948, // vmul.f32 d21, d0, d8[0]
- 0xf244bcb0, // vfma.f32 d27, d20, d16
- 0xf4ec0c9f, // vld1.32 {d16[]}, [ip :32]
- 0xf2443cb1, // vfma.f32 d19, d20, d17
- 0xf244acb2, // vfma.f32 d26, d20, d18
- 0xf4e34c9f, // vld1.32 {d20[]}, [r3 :32]
- 0xf2e11948, // vmul.f32 d17, d1, d8[0]
- 0xf2e22948, // vmul.f32 d18, d2, d8[0]
- 0xf3201e81, // vcgt.f32 d1, d16, d1
- 0xf3200e80, // vcgt.f32 d0, d16, d0
- 0xf3202e82, // vcgt.f32 d2, d16, d2
- 0xf2640fab, // vmin.f32 d16, d20, d27
- 0xf2643fa3, // vmin.f32 d19, d20, d19
- 0xf2644faa, // vmin.f32 d20, d20, d26
- 0xf31501b0, // vbsl d0, d21, d16
- 0xf31111b3, // vbsl d1, d17, d19
- 0xf31221b4, // vbsl d2, d18, d20
- 0xecbd8b02, // vpop {d8}
-};
-static const unsigned int armv7_scale_u8[] = {
- 0xed2d8b02, // vpush {d8}
- 0xe24dd008, // sub sp, sp, #8
- 0xe592c000, // ldr ip, [r2]
- 0xe08cc000, // add ip, ip, r0
- 0xe1dcc0b0, // ldrh ip, [ip]
- 0xe1cdc0b4, // strh ip, [sp, #4]
- 0xe28dc004, // add ip, sp, #4
- 0xed938a03, // vldr s16, [r3, #12]
- 0xf4ec041f, // vld1.16 {d16[0]}, [ip :16]
- 0xf3c80a30, // vmovl.u8 q8, d16
- 0xf3d00a30, // vmovl.u16 q8, d16
- 0xf3fb06a0, // vcvt.f32.u32 d16, d16
- 0xf2e009c8, // vmul.f32 d16, d16, d8[0]
- 0xf3000d90, // vmul.f32 d0, d16, d0
- 0xf3001d91, // vmul.f32 d1, d16, d1
- 0xf3002d92, // vmul.f32 d2, d16, d2
- 0xf3003d93, // vmul.f32 d3, d16, d3
- 0xe28dd008, // add sp, sp, #8
- 0xecbd8b02, // vpop {d8}
-};
-static const unsigned int armv7_load_tables[] = {
- 0xe92d41f0, // push {r4, r5, r6, r7, r8, lr}
- 0xe2836010, // add r6, r3, #16
- 0xe592c000, // ldr ip, [r2]
- 0xe592e004, // ldr lr, [r2, #4]
- 0xf4e60c9f, // vld1.32 {d16[]}, [r6 :32]
- 0xe08c6100, // add r6, ip, r0, lsl #2
- 0xedd61b00, // vldr d17, [r6]
- 0xf24021b1, // vand d18, d16, d17
- 0xe592800c, // ldr r8, [r2, #12]
- 0xf3f83031, // vshr.u32 d19, d17, #8
- 0xe5924008, // ldr r4, [r2, #8]
- 0xed931a03, // vldr s2, [r3, #12]
- 0xee326b90, // vmov.32 r6, d18[1]
- 0xee125b90, // vmov.32 r5, d18[0]
- 0xf3f02031, // vshr.u32 d18, d17, #16
- 0xf24021b2, // vand d18, d16, d18
- 0xf24001b3, // vand d16, d16, d19
- 0xee107b90, // vmov.32 r7, d16[0]
- 0xe08e6106, // add r6, lr, r6, lsl #2
- 0xedd60a00, // vldr s1, [r6]
- 0xe08e6105, // add r6, lr, r5, lsl #2
- 0xee325b90, // vmov.32 r5, d18[1]
- 0xed960a00, // vldr s0, [r6]
- 0xee306b90, // vmov.32 r6, d16[1]
- 0xf3e80031, // vshr.u32 d16, d17, #24
- 0xf3fb0620, // vcvt.f32.s32 d16, d16
- 0xe0847107, // add r7, r4, r7, lsl #2
- 0xf2a039c1, // vmul.f32 d3, d16, d1[0]
- 0xe088c105, // add ip, r8, r5, lsl #2
- 0xee125b90, // vmov.32 r5, d18[0]
- 0xe0846106, // add r6, r4, r6, lsl #2
- 0xeddc2a00, // vldr s5, [ip]
- 0xedd61a00, // vldr s3, [r6]
- 0xed971a00, // vldr s2, [r7]
- 0xe0887105, // add r7, r8, r5, lsl #2
- 0xed972a00, // vldr s4, [r7]
- 0xe8bd41f0, // pop {r4, r5, r6, r7, r8, lr}
-};
-static const unsigned int armv7_load_8888[] = {
- 0xe92d4800, // push {fp, lr}
- 0xe592c000, // ldr ip, [r2]
- 0xe283e010, // add lr, r3, #16
- 0xed932a03, // vldr s4, [r3, #12]
- 0xe08cc100, // add ip, ip, r0, lsl #2
- 0xf4ee0c9f, // vld1.32 {d16[]}, [lr :32]
- 0xeddc1b00, // vldr d17, [ip]
- 0xf24021b1, // vand d18, d16, d17
- 0xf3f83031, // vshr.u32 d19, d17, #8
- 0xf3e84031, // vshr.u32 d20, d17, #24
- 0xf3f01031, // vshr.u32 d17, d17, #16
- 0xf24031b3, // vand d19, d16, d19
- 0xf24001b1, // vand d16, d16, d17
- 0xf3fb2622, // vcvt.f32.s32 d18, d18
- 0xf3fb4624, // vcvt.f32.s32 d20, d20
- 0xf3fb1623, // vcvt.f32.s32 d17, d19
- 0xf3fb0620, // vcvt.f32.s32 d16, d16
- 0xf2a209c2, // vmul.f32 d0, d18, d2[0]
- 0xf2a439c2, // vmul.f32 d3, d20, d2[0]
- 0xf2a119c2, // vmul.f32 d1, d17, d2[0]
- 0xf2a029c2, // vmul.f32 d2, d16, d2[0]
- 0xe8bd4800, // pop {fp, lr}
-};
-static const unsigned int armv7_store_8888[] = {
- 0xe283c008, // add ip, r3, #8
- 0xf2c3261f, // vmov.i32 d18, #1056964608
- 0xf2c3361f, // vmov.i32 d19, #1056964608
- 0xf4ec1c9f, // vld1.32 {d17[]}, [ip :32]
- 0xf2c3061f, // vmov.i32 d16, #1056964608
- 0xf2412c31, // vfma.f32 d18, d1, d17
- 0xf2423c31, // vfma.f32 d19, d2, d17
- 0xf2c3461f, // vmov.i32 d20, #1056964608
- 0xe592c000, // ldr ip, [r2]
- 0xf2400c31, // vfma.f32 d16, d0, d17
- 0xf2434c31, // vfma.f32 d20, d3, d17
- 0xe08cc100, // add ip, ip, r0, lsl #2
- 0xf3fb17a2, // vcvt.u32.f32 d17, d18
- 0xf3fb27a3, // vcvt.u32.f32 d18, d19
- 0xf3fb07a0, // vcvt.u32.f32 d16, d16
- 0xf3fb37a4, // vcvt.u32.f32 d19, d20
- 0xf2e81531, // vshl.s32 d17, d17, #8
- 0xf2f02532, // vshl.s32 d18, d18, #16
- 0xf26101b0, // vorr d16, d17, d16
- 0xf2f81533, // vshl.s32 d17, d19, #24
- 0xf26001b2, // vorr d16, d16, d18
- 0xf26001b1, // vorr d16, d16, d17
- 0xedcc0b00, // vstr d16, [ip]
-};
-static const unsigned int armv7_load_f16[] = {
- 0xed2d8b04, // vpush {d8-d9}
- 0xe592c000, // ldr ip, [r2]
- 0xe08cc180, // add ip, ip, r0, lsl #3
- 0xf46c084f, // vld2.16 {d16-d17}, [ip]
- 0xf3b62720, // vcvt.f32.f16 q1, d16
- 0xf3b68721, // vcvt.f32.f16 q4, d17
- 0xf2220112, // vorr d0, d2, d2
- 0xeef00a43, // vmov.f32 s1, s6
- 0xf2281118, // vorr d1, d8, d8
- 0xeeb03a62, // vmov.f32 s6, s5
- 0xeef01a49, // vmov.f32 s3, s18
- 0xeeb09a68, // vmov.f32 s18, s17
- 0xeeb02b43, // vmov.f64 d2, d3
- 0xeeb03b49, // vmov.f64 d3, d9
- 0xecbd8b04, // vpop {d8-d9}
-};
-static const unsigned int armv7_store_f16[] = {
- 0xeef00b41, // vmov.f64 d16, d1
- 0xf2631113, // vorr d17, d3, d3
- 0xeef02b40, // vmov.f64 d18, d0
- 0xf2623112, // vorr d19, d2, d2
- 0xf3fa00a1, // vtrn.32 d16, d17
- 0xf3f61620, // vcvt.f16.f32 d17, q8
- 0xf3fa20a3, // vtrn.32 d18, d19
- 0xe592c000, // ldr ip, [r2]
- 0xf3f60622, // vcvt.f16.f32 d16, q9
- 0xe08cc180, // add ip, ip, r0, lsl #3
- 0xf44c084f, // vst2.16 {d16-d17}, [ip]
-};
-static const unsigned int armv7_clamp_x[] = {
- 0xf3c70e1f, // vmov.i8 d16, #255
- 0xf4e21c9f, // vld1.32 {d17[]}, [r2 :32]
- 0xf26108a0, // vadd.i32 d16, d17, d16
- 0xf2c01010, // vmov.i32 d17, #0
- 0xf2600f20, // vmin.f32 d16, d0, d16
- 0xf2010fa0, // vmax.f32 d0, d17, d16
-};
-static const unsigned int armv7_clamp_y[] = {
- 0xf3c70e1f, // vmov.i8 d16, #255
- 0xf4e21c9f, // vld1.32 {d17[]}, [r2 :32]
- 0xf26108a0, // vadd.i32 d16, d17, d16
- 0xf2c01010, // vmov.i32 d17, #0
- 0xf2610f20, // vmin.f32 d16, d1, d16
- 0xf2011fa0, // vmax.f32 d1, d17, d16
-};
-static const unsigned int armv7_matrix_2x3[] = {
- 0xe282c00c, // add ip, r2, #12
- 0xf4ec2c9f, // vld1.32 {d18[]}, [ip :32]
- 0xe282c008, // add ip, r2, #8
- 0xf4ec1c9f, // vld1.32 {d17[]}, [ip :32]
- 0xe282c010, // add ip, r2, #16
- 0xf4ec0c9f, // vld1.32 {d16[]}, [ip :32]
- 0xe282c014, // add ip, r2, #20
- 0xf2410c31, // vfma.f32 d16, d1, d17
- 0xf4ec1c9f, // vld1.32 {d17[]}, [ip :32]
- 0xe282c004, // add ip, r2, #4
- 0xf2411c32, // vfma.f32 d17, d1, d18
- 0xf4e22c9f, // vld1.32 {d18[]}, [r2 :32]
- 0xf4ec3c9f, // vld1.32 {d19[]}, [ip :32]
- 0xf2400c32, // vfma.f32 d16, d0, d18
- 0xf2401c33, // vfma.f32 d17, d0, d19
- 0xf22001b0, // vorr d0, d16, d16
- 0xf22111b1, // vorr d1, d17, d17
-};
-static const unsigned int armv7_matrix_3x4[] = {
- 0xe282c020, // add ip, r2, #32
- 0xf4ec3c9f, // vld1.32 {d19[]}, [ip :32]
- 0xe282c02c, // add ip, r2, #44
- 0xf4ec0c9f, // vld1.32 {d16[]}, [ip :32]
- 0xe282c01c, // add ip, r2, #28
- 0xf2420c33, // vfma.f32 d16, d2, d19
- 0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32]
- 0xe282c018, // add ip, r2, #24
- 0xf4ec2c9f, // vld1.32 {d18[]}, [ip :32]
- 0xe282c024, // add ip, r2, #36
- 0xf4ec1c9f, // vld1.32 {d17[]}, [ip :32]
- 0xe282c028, // add ip, r2, #40
- 0xf2421c32, // vfma.f32 d17, d2, d18
- 0xf4ec2c9f, // vld1.32 {d18[]}, [ip :32]
- 0xe282c010, // add ip, r2, #16
- 0xf2422c34, // vfma.f32 d18, d2, d20
- 0xf4ec3c9f, // vld1.32 {d19[]}, [ip :32]
- 0xe282c00c, // add ip, r2, #12
- 0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32]
- 0xe282c014, // add ip, r2, #20
- 0xf2411c34, // vfma.f32 d17, d1, d20
- 0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32]
- 0xf2410c34, // vfma.f32 d16, d1, d20
- 0xe282c004, // add ip, r2, #4
- 0xf2412c33, // vfma.f32 d18, d1, d19
- 0xf4e23c9f, // vld1.32 {d19[]}, [r2 :32]
- 0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32]
- 0xe282c008, // add ip, r2, #8
- 0xf2401c33, // vfma.f32 d17, d0, d19
- 0xf4ec3c9f, // vld1.32 {d19[]}, [ip :32]
- 0xf2400c33, // vfma.f32 d16, d0, d19
- 0xf2402c34, // vfma.f32 d18, d0, d20
- 0xf22101b1, // vorr d0, d17, d17
- 0xf22021b0, // vorr d2, d16, d16
- 0xf22211b2, // vorr d1, d18, d18
-};
-static const unsigned int armv7_linear_gradient_2stops[] = {
- 0xe1a0c002, // mov ip, r2
- 0xf46c2a0d, // vld1.8 {d18-d19}, [ip]!
- 0xf46c4a0f, // vld1.8 {d20-d21}, [ip]
- 0xf3f40c22, // vdup.32 d16, d18[0]
- 0xf3f41c24, // vdup.32 d17, d20[0]
- 0xf2400c31, // vfma.f32 d16, d0, d17
- 0xf3fc6c24, // vdup.32 d22, d20[1]
- 0xf3bc1c22, // vdup.32 d1, d18[1]
- 0xf3b42c23, // vdup.32 d2, d19[0]
- 0xf2001c36, // vfma.f32 d1, d0, d22
- 0xf3f41c25, // vdup.32 d17, d21[0]
- 0xf3fc4c25, // vdup.32 d20, d21[1]
- 0xf2002c31, // vfma.f32 d2, d0, d17
- 0xf3bc3c23, // vdup.32 d3, d19[1]
- 0xf2003c34, // vfma.f32 d3, d0, d20
- 0xf22001b0, // vorr d0, d16, d16
-};
-static const unsigned char sse2_inc_x[] = {
- 0x48,0x83,0xc7,0x04, // add $0x4,%rdi
-};
-static const unsigned char sse2_seed_shader[] = {
- 0x66,0x0f,0x6e,0xc7, // movd %edi,%xmm0
- 0x66,0x0f,0x70,0xc0,0x00, // pshufd $0x0,%xmm0,%xmm0
- 0x0f,0x5b,0xc8, // cvtdq2ps %xmm0,%xmm1
- 0xf3,0x0f,0x10,0x11, // movss (%rcx),%xmm2
- 0xf3,0x0f,0x10,0x59,0x04, // movss 0x4(%rcx),%xmm3
- 0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm3,%xmm3
- 0x0f,0x58,0xcb, // addps %xmm3,%xmm1
- 0x0f,0x10,0x41,0x14, // movups 0x14(%rcx),%xmm0
- 0x0f,0x58,0xc1, // addps %xmm1,%xmm0
- 0x66,0x0f,0x6e,0x0a, // movd (%rdx),%xmm1
- 0x66,0x0f,0x70,0xc9,0x00, // pshufd $0x0,%xmm1,%xmm1
- 0x0f,0x5b,0xc9, // cvtdq2ps %xmm1,%xmm1
- 0x0f,0x58,0xcb, // addps %xmm3,%xmm1
- 0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm2,%xmm2
- 0x0f,0x57,0xdb, // xorps %xmm3,%xmm3
- 0x0f,0x57,0xe4, // xorps %xmm4,%xmm4
- 0x0f,0x57,0xed, // xorps %xmm5,%xmm5
- 0x0f,0x57,0xf6, // xorps %xmm6,%xmm6
- 0x0f,0x57,0xff, // xorps %xmm7,%xmm7
-};
-static const unsigned char sse2_constant_color[] = {
- 0x0f,0x10,0x1a, // movups (%rdx),%xmm3
- 0x0f,0x28,0xc3, // movaps %xmm3,%xmm0
- 0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm0,%xmm0
- 0x0f,0x28,0xcb, // movaps %xmm3,%xmm1
- 0x0f,0xc6,0xc9,0x55, // shufps $0x55,%xmm1,%xmm1
- 0x0f,0x28,0xd3, // movaps %xmm3,%xmm2
- 0x0f,0xc6,0xd2,0xaa, // shufps $0xaa,%xmm2,%xmm2
- 0x0f,0xc6,0xdb,0xff, // shufps $0xff,%xmm3,%xmm3
-};
-static const unsigned char sse2_clear[] = {
- 0x0f,0x57,0xc0, // xorps %xmm0,%xmm0
- 0x0f,0x57,0xc9, // xorps %xmm1,%xmm1
- 0x0f,0x57,0xd2, // xorps %xmm2,%xmm2
- 0x0f,0x57,0xdb, // xorps %xmm3,%xmm3
-};
-static const unsigned char sse2_plus_[] = {
- 0x0f,0x58,0xc4, // addps %xmm4,%xmm0
- 0x0f,0x58,0xcd, // addps %xmm5,%xmm1
- 0x0f,0x58,0xd6, // addps %xmm6,%xmm2
- 0x0f,0x58,0xdf, // addps %xmm7,%xmm3
-};
-static const unsigned char sse2_srcover[] = {
- 0xf3,0x44,0x0f,0x10,0x01, // movss (%rcx),%xmm8
- 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
- 0x44,0x0f,0x5c,0xc3, // subps %xmm3,%xmm8
- 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9
- 0x44,0x0f,0x59,0xcc, // mulps %xmm4,%xmm9
- 0x41,0x0f,0x58,0xc1, // addps %xmm9,%xmm0
- 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9
- 0x44,0x0f,0x59,0xcd, // mulps %xmm5,%xmm9
- 0x41,0x0f,0x58,0xc9, // addps %xmm9,%xmm1
- 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9
- 0x44,0x0f,0x59,0xce, // mulps %xmm6,%xmm9
- 0x41,0x0f,0x58,0xd1, // addps %xmm9,%xmm2
- 0x44,0x0f,0x59,0xc7, // mulps %xmm7,%xmm8
- 0x41,0x0f,0x58,0xd8, // addps %xmm8,%xmm3
-};
-static const unsigned char sse2_dstover[] = {
- 0xf3,0x44,0x0f,0x10,0x01, // movss (%rcx),%xmm8
- 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
- 0x44,0x0f,0x5c,0xc7, // subps %xmm7,%xmm8
- 0x41,0x0f,0x59,0xc0, // mulps %xmm8,%xmm0
- 0x0f,0x58,0xc4, // addps %xmm4,%xmm0
- 0x41,0x0f,0x59,0xc8, // mulps %xmm8,%xmm1
- 0x0f,0x58,0xcd, // addps %xmm5,%xmm1
- 0x41,0x0f,0x59,0xd0, // mulps %xmm8,%xmm2
- 0x0f,0x58,0xd6, // addps %xmm6,%xmm2
- 0x41,0x0f,0x59,0xd8, // mulps %xmm8,%xmm3
- 0x0f,0x58,0xdf, // addps %xmm7,%xmm3
-};
-static const unsigned char sse2_clamp_0[] = {
- 0x45,0x0f,0x57,0xc0, // xorps %xmm8,%xmm8
- 0x41,0x0f,0x5f,0xc0, // maxps %xmm8,%xmm0
- 0x41,0x0f,0x5f,0xc8, // maxps %xmm8,%xmm1
- 0x41,0x0f,0x5f,0xd0, // maxps %xmm8,%xmm2
- 0x41,0x0f,0x5f,0xd8, // maxps %xmm8,%xmm3
-};
-static const unsigned char sse2_clamp_1[] = {
- 0xf3,0x44,0x0f,0x10,0x01, // movss (%rcx),%xmm8
- 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
- 0x41,0x0f,0x5d,0xc0, // minps %xmm8,%xmm0
- 0x41,0x0f,0x5d,0xc8, // minps %xmm8,%xmm1
- 0x41,0x0f,0x5d,0xd0, // minps %xmm8,%xmm2
- 0x41,0x0f,0x5d,0xd8, // minps %xmm8,%xmm3
-};
-static const unsigned char sse2_clamp_a[] = {
- 0xf3,0x44,0x0f,0x10,0x01, // movss (%rcx),%xmm8
- 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
- 0x41,0x0f,0x5d,0xd8, // minps %xmm8,%xmm3
- 0x0f,0x5d,0xc3, // minps %xmm3,%xmm0
- 0x0f,0x5d,0xcb, // minps %xmm3,%xmm1
- 0x0f,0x5d,0xd3, // minps %xmm3,%xmm2
-};
-static const unsigned char sse2_swap[] = {
- 0x44,0x0f,0x28,0xc3, // movaps %xmm3,%xmm8
- 0x44,0x0f,0x28,0xca, // movaps %xmm2,%xmm9
- 0x44,0x0f,0x28,0xd1, // movaps %xmm1,%xmm10
- 0x44,0x0f,0x28,0xd8, // movaps %xmm0,%xmm11
- 0x0f,0x28,0xc4, // movaps %xmm4,%xmm0
- 0x0f,0x28,0xcd, // movaps %xmm5,%xmm1
- 0x0f,0x28,0xd6, // movaps %xmm6,%xmm2
- 0x0f,0x28,0xdf, // movaps %xmm7,%xmm3
- 0x41,0x0f,0x28,0xe3, // movaps %xmm11,%xmm4
- 0x41,0x0f,0x28,0xea, // movaps %xmm10,%xmm5
- 0x41,0x0f,0x28,0xf1, // movaps %xmm9,%xmm6
- 0x41,0x0f,0x28,0xf8, // movaps %xmm8,%xmm7
-};
-static const unsigned char sse2_move_src_dst[] = {
- 0x0f,0x28,0xe0, // movaps %xmm0,%xmm4
- 0x0f,0x28,0xe9, // movaps %xmm1,%xmm5
- 0x0f,0x28,0xf2, // movaps %xmm2,%xmm6
- 0x0f,0x28,0xfb, // movaps %xmm3,%xmm7
-};
-static const unsigned char sse2_move_dst_src[] = {
- 0x0f,0x28,0xc4, // movaps %xmm4,%xmm0
- 0x0f,0x28,0xcd, // movaps %xmm5,%xmm1
- 0x0f,0x28,0xd6, // movaps %xmm6,%xmm2
- 0x0f,0x28,0xdf, // movaps %xmm7,%xmm3
-};
-static const unsigned char sse2_premul[] = {
- 0x0f,0x59,0xc3, // mulps %xmm3,%xmm0
- 0x0f,0x59,0xcb, // mulps %xmm3,%xmm1
- 0x0f,0x59,0xd3, // mulps %xmm3,%xmm2
-};
-static const unsigned char sse2_unpremul[] = {
- 0x45,0x0f,0x57,0xc0, // xorps %xmm8,%xmm8
- 0x44,0x0f,0xc2,0xc3,0x00, // cmpeqps %xmm3,%xmm8
- 0xf3,0x44,0x0f,0x10,0x09, // movss (%rcx),%xmm9
- 0x45,0x0f,0xc6,0xc9,0x00, // shufps $0x0,%xmm9,%xmm9
- 0x44,0x0f,0x5e,0xcb, // divps %xmm3,%xmm9
- 0x45,0x0f,0x55,0xc1, // andnps %xmm9,%xmm8
- 0x41,0x0f,0x59,0xc0, // mulps %xmm8,%xmm0
- 0x41,0x0f,0x59,0xc8, // mulps %xmm8,%xmm1
- 0x41,0x0f,0x59,0xd0, // mulps %xmm8,%xmm2
-};
-static const unsigned char sse2_from_srgb[] = {
- 0xf3,0x44,0x0f,0x10,0x41,0x40, // movss 0x40(%rcx),%xmm8
- 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
- 0x45,0x0f,0x28,0xe8, // movaps %xmm8,%xmm13
- 0x44,0x0f,0x59,0xe8, // mulps %xmm0,%xmm13
- 0x44,0x0f,0x28,0xe0, // movaps %xmm0,%xmm12
- 0x45,0x0f,0x59,0xe4, // mulps %xmm12,%xmm12
- 0xf3,0x44,0x0f,0x10,0x49,0x3c, // movss 0x3c(%rcx),%xmm9
- 0x45,0x0f,0xc6,0xc9,0x00, // shufps $0x0,%xmm9,%xmm9
- 0xf3,0x44,0x0f,0x10,0x51,0x34, // movss 0x34(%rcx),%xmm10
- 0xf3,0x44,0x0f,0x10,0x59,0x38, // movss 0x38(%rcx),%xmm11
- 0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11
- 0x45,0x0f,0x28,0xf1, // movaps %xmm9,%xmm14
- 0x44,0x0f,0x59,0xf0, // mulps %xmm0,%xmm14
- 0x45,0x0f,0x58,0xf3, // addps %xmm11,%xmm14
- 0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10
- 0x45,0x0f,0x59,0xf4, // mulps %xmm12,%xmm14
- 0x45,0x0f,0x58,0xf2, // addps %xmm10,%xmm14
- 0xf3,0x44,0x0f,0x10,0x61,0x44, // movss 0x44(%rcx),%xmm12
- 0x45,0x0f,0xc6,0xe4,0x00, // shufps $0x0,%xmm12,%xmm12
- 0x41,0x0f,0xc2,0xc4,0x01, // cmpltps %xmm12,%xmm0
- 0x44,0x0f,0x54,0xe8, // andps %xmm0,%xmm13
- 0x41,0x0f,0x55,0xc6, // andnps %xmm14,%xmm0
- 0x41,0x0f,0x56,0xc5, // orps %xmm13,%xmm0
- 0x45,0x0f,0x28,0xe8, // movaps %xmm8,%xmm13
- 0x44,0x0f,0x59,0xe9, // mulps %xmm1,%xmm13
- 0x44,0x0f,0x28,0xf1, // movaps %xmm1,%xmm14
- 0x45,0x0f,0x59,0xf6, // mulps %xmm14,%xmm14
- 0x45,0x0f,0x28,0xf9, // movaps %xmm9,%xmm15
- 0x44,0x0f,0x59,0xf9, // mulps %xmm1,%xmm15
- 0x45,0x0f,0x58,0xfb, // addps %xmm11,%xmm15
- 0x45,0x0f,0x59,0xfe, // mulps %xmm14,%xmm15
- 0x45,0x0f,0x58,0xfa, // addps %xmm10,%xmm15
- 0x41,0x0f,0xc2,0xcc,0x01, // cmpltps %xmm12,%xmm1
- 0x44,0x0f,0x54,0xe9, // andps %xmm1,%xmm13
- 0x41,0x0f,0x55,0xcf, // andnps %xmm15,%xmm1
- 0x41,0x0f,0x56,0xcd, // orps %xmm13,%xmm1
- 0x44,0x0f,0x59,0xc2, // mulps %xmm2,%xmm8
- 0x44,0x0f,0x28,0xea, // movaps %xmm2,%xmm13
- 0x45,0x0f,0x59,0xed, // mulps %xmm13,%xmm13
- 0x44,0x0f,0x59,0xca, // mulps %xmm2,%xmm9
- 0x45,0x0f,0x58,0xcb, // addps %xmm11,%xmm9
- 0x45,0x0f,0x59,0xcd, // mulps %xmm13,%xmm9
- 0x45,0x0f,0x58,0xca, // addps %xmm10,%xmm9
- 0x41,0x0f,0xc2,0xd4,0x01, // cmpltps %xmm12,%xmm2
- 0x44,0x0f,0x54,0xc2, // andps %xmm2,%xmm8
- 0x41,0x0f,0x55,0xd1, // andnps %xmm9,%xmm2
- 0x41,0x0f,0x56,0xd0, // orps %xmm8,%xmm2
-};
-static const unsigned char sse2_to_srgb[] = {
- 0x48,0x83,0xec,0x28, // sub $0x28,%rsp
- 0x0f,0x29,0x7c,0x24,0x10, // movaps %xmm7,0x10(%rsp)
- 0x0f,0x29,0x34,0x24, // movaps %xmm6,(%rsp)
- 0x0f,0x28,0xf5, // movaps %xmm5,%xmm6
- 0x0f,0x28,0xec, // movaps %xmm4,%xmm5
- 0x0f,0x28,0xe3, // movaps %xmm3,%xmm4
- 0x44,0x0f,0x52,0xc0, // rsqrtps %xmm0,%xmm8
- 0x45,0x0f,0x53,0xe8, // rcpps %xmm8,%xmm13
- 0x45,0x0f,0x52,0xf8, // rsqrtps %xmm8,%xmm15
- 0xf3,0x0f,0x10,0x19, // movss (%rcx),%xmm3
- 0xf3,0x44,0x0f,0x10,0x41,0x48, // movss 0x48(%rcx),%xmm8
- 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
- 0x45,0x0f,0x28,0xf0, // movaps %xmm8,%xmm14
- 0x44,0x0f,0x59,0xf0, // mulps %xmm0,%xmm14
- 0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm3,%xmm3
- 0xf3,0x44,0x0f,0x10,0x51,0x4c, // movss 0x4c(%rcx),%xmm10
- 0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10
- 0xf3,0x44,0x0f,0x10,0x59,0x50, // movss 0x50(%rcx),%xmm11
- 0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11
- 0xf3,0x44,0x0f,0x10,0x61,0x54, // movss 0x54(%rcx),%xmm12
- 0x45,0x0f,0xc6,0xe4,0x00, // shufps $0x0,%xmm12,%xmm12
- 0x45,0x0f,0x59,0xeb, // mulps %xmm11,%xmm13
- 0x45,0x0f,0x58,0xec, // addps %xmm12,%xmm13
- 0x45,0x0f,0x59,0xfa, // mulps %xmm10,%xmm15
- 0x45,0x0f,0x58,0xfd, // addps %xmm13,%xmm15
- 0x44,0x0f,0x28,0xcb, // movaps %xmm3,%xmm9
- 0x45,0x0f,0x5d,0xcf, // minps %xmm15,%xmm9
- 0xf3,0x44,0x0f,0x10,0x69,0x58, // movss 0x58(%rcx),%xmm13
- 0x45,0x0f,0xc6,0xed,0x00, // shufps $0x0,%xmm13,%xmm13
- 0x41,0x0f,0xc2,0xc5,0x01, // cmpltps %xmm13,%xmm0
- 0x44,0x0f,0x54,0xf0, // andps %xmm0,%xmm14
- 0x41,0x0f,0x55,0xc1, // andnps %xmm9,%xmm0
- 0x41,0x0f,0x56,0xc6, // orps %xmm14,%xmm0
- 0x44,0x0f,0x52,0xc9, // rsqrtps %xmm1,%xmm9
- 0x45,0x0f,0x53,0xf1, // rcpps %xmm9,%xmm14
- 0x45,0x0f,0x52,0xc9, // rsqrtps %xmm9,%xmm9
- 0x45,0x0f,0x59,0xf3, // mulps %xmm11,%xmm14
- 0x45,0x0f,0x58,0xf4, // addps %xmm12,%xmm14
- 0x45,0x0f,0x59,0xca, // mulps %xmm10,%xmm9
- 0x45,0x0f,0x58,0xce, // addps %xmm14,%xmm9
- 0x44,0x0f,0x28,0xf3, // movaps %xmm3,%xmm14
- 0x45,0x0f,0x5d,0xf1, // minps %xmm9,%xmm14
- 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9
- 0x44,0x0f,0x59,0xc9, // mulps %xmm1,%xmm9
- 0x41,0x0f,0xc2,0xcd,0x01, // cmpltps %xmm13,%xmm1
- 0x44,0x0f,0x54,0xc9, // andps %xmm1,%xmm9
- 0x41,0x0f,0x55,0xce, // andnps %xmm14,%xmm1
- 0x41,0x0f,0x56,0xc9, // orps %xmm9,%xmm1
- 0x44,0x0f,0x52,0xca, // rsqrtps %xmm2,%xmm9
- 0x45,0x0f,0x53,0xf1, // rcpps %xmm9,%xmm14
- 0x45,0x0f,0x59,0xf3, // mulps %xmm11,%xmm14
- 0x45,0x0f,0x58,0xf4, // addps %xmm12,%xmm14
- 0x41,0x0f,0x52,0xf9, // rsqrtps %xmm9,%xmm7
- 0x41,0x0f,0x59,0xfa, // mulps %xmm10,%xmm7
- 0x41,0x0f,0x58,0xfe, // addps %xmm14,%xmm7
- 0x0f,0x5d,0xdf, // minps %xmm7,%xmm3
- 0x44,0x0f,0x59,0xc2, // mulps %xmm2,%xmm8
- 0x41,0x0f,0xc2,0xd5,0x01, // cmpltps %xmm13,%xmm2
- 0x44,0x0f,0x54,0xc2, // andps %xmm2,%xmm8
- 0x0f,0x55,0xd3, // andnps %xmm3,%xmm2
- 0x41,0x0f,0x56,0xd0, // orps %xmm8,%xmm2
- 0x0f,0x28,0xdc, // movaps %xmm4,%xmm3
- 0x0f,0x28,0xe5, // movaps %xmm5,%xmm4
- 0x0f,0x28,0xee, // movaps %xmm6,%xmm5
- 0x0f,0x28,0x34,0x24, // movaps (%rsp),%xmm6
- 0x0f,0x28,0x7c,0x24,0x10, // movaps 0x10(%rsp),%xmm7
- 0x48,0x83,0xc4,0x28, // add $0x28,%rsp
-};
-static const unsigned char sse2_scale_u8[] = {
- 0x48,0x8b,0x02, // mov (%rdx),%rax
- 0x66,0x44,0x0f,0x6e,0x04,0x38, // movd (%rax,%rdi,1),%xmm8
- 0x66,0x45,0x0f,0xef,0xc9, // pxor %xmm9,%xmm9
- 0x66,0x45,0x0f,0x60,0xc1, // punpcklbw %xmm9,%xmm8
- 0x66,0x45,0x0f,0x61,0xc1, // punpcklwd %xmm9,%xmm8
- 0x45,0x0f,0x5b,0xc0, // cvtdq2ps %xmm8,%xmm8
- 0xf3,0x44,0x0f,0x10,0x49,0x0c, // movss 0xc(%rcx),%xmm9
- 0x45,0x0f,0xc6,0xc9,0x00, // shufps $0x0,%xmm9,%xmm9
- 0x45,0x0f,0x59,0xc8, // mulps %xmm8,%xmm9
- 0x41,0x0f,0x59,0xc1, // mulps %xmm9,%xmm0
- 0x41,0x0f,0x59,0xc9, // mulps %xmm9,%xmm1
- 0x41,0x0f,0x59,0xd1, // mulps %xmm9,%xmm2
- 0x41,0x0f,0x59,0xd9, // mulps %xmm9,%xmm3
-};
-static const unsigned char sse2_load_tables[] = {
- 0x48,0x8b,0x02, // mov (%rdx),%rax
- 0x4c,0x8b,0x42,0x08, // mov 0x8(%rdx),%r8
- 0xf3,0x44,0x0f,0x6f,0x04,0xb8, // movdqu (%rax,%rdi,4),%xmm8
- 0x66,0x0f,0x6e,0x41,0x10, // movd 0x10(%rcx),%xmm0
- 0x66,0x0f,0x70,0xc0,0x00, // pshufd $0x0,%xmm0,%xmm0
- 0x66,0x45,0x0f,0x6f,0xc8, // movdqa %xmm8,%xmm9
- 0x66,0x41,0x0f,0x72,0xd1,0x08, // psrld $0x8,%xmm9
- 0x66,0x44,0x0f,0xdb,0xc8, // pand %xmm0,%xmm9
- 0x66,0x45,0x0f,0x6f,0xd0, // movdqa %xmm8,%xmm10
- 0x66,0x41,0x0f,0x72,0xd2,0x10, // psrld $0x10,%xmm10
- 0x66,0x44,0x0f,0xdb,0xd0, // pand %xmm0,%xmm10
- 0x66,0x41,0x0f,0xdb,0xc0, // pand %xmm8,%xmm0
- 0x66,0x0f,0x70,0xd8,0x4e, // pshufd $0x4e,%xmm0,%xmm3
- 0x66,0x48,0x0f,0x7e,0xd8, // movq %xmm3,%rax
- 0x41,0x89,0xc1, // mov %eax,%r9d
- 0x48,0xc1,0xe8,0x20, // shr $0x20,%rax
- 0x66,0x49,0x0f,0x7e,0xc2, // movq %xmm0,%r10
- 0x45,0x89,0xd3, // mov %r10d,%r11d
- 0x49,0xc1,0xea,0x20, // shr $0x20,%r10
- 0xf3,0x43,0x0f,0x10,0x1c,0x90, // movss (%r8,%r10,4),%xmm3
- 0xf3,0x41,0x0f,0x10,0x04,0x80, // movss (%r8,%rax,4),%xmm0
- 0x0f,0x14,0xd8, // unpcklps %xmm0,%xmm3
- 0xf3,0x43,0x0f,0x10,0x04,0x98, // movss (%r8,%r11,4),%xmm0
- 0xf3,0x43,0x0f,0x10,0x0c,0x88, // movss (%r8,%r9,4),%xmm1
- 0x0f,0x14,0xc1, // unpcklps %xmm1,%xmm0
- 0x0f,0x14,0xc3, // unpcklps %xmm3,%xmm0
- 0x48,0x8b,0x42,0x10, // mov 0x10(%rdx),%rax
- 0x66,0x41,0x0f,0x70,0xc9,0x4e, // pshufd $0x4e,%xmm9,%xmm1
- 0x66,0x49,0x0f,0x7e,0xc8, // movq %xmm1,%r8
- 0x45,0x89,0xc1, // mov %r8d,%r9d
- 0x49,0xc1,0xe8,0x20, // shr $0x20,%r8
- 0x66,0x4d,0x0f,0x7e,0xca, // movq %xmm9,%r10
- 0x45,0x89,0xd3, // mov %r10d,%r11d
- 0x49,0xc1,0xea,0x20, // shr $0x20,%r10
- 0xf3,0x42,0x0f,0x10,0x1c,0x90, // movss (%rax,%r10,4),%xmm3
- 0xf3,0x42,0x0f,0x10,0x0c,0x80, // movss (%rax,%r8,4),%xmm1
- 0x0f,0x14,0xd9, // unpcklps %xmm1,%xmm3
- 0xf3,0x42,0x0f,0x10,0x0c,0x98, // movss (%rax,%r11,4),%xmm1
- 0xf3,0x42,0x0f,0x10,0x14,0x88, // movss (%rax,%r9,4),%xmm2
- 0x0f,0x14,0xca, // unpcklps %xmm2,%xmm1
- 0x0f,0x14,0xcb, // unpcklps %xmm3,%xmm1
- 0x48,0x8b,0x42,0x18, // mov 0x18(%rdx),%rax
- 0x66,0x41,0x0f,0x70,0xd2,0x4e, // pshufd $0x4e,%xmm10,%xmm2
- 0x66,0x49,0x0f,0x7e,0xd0, // movq %xmm2,%r8
- 0x45,0x89,0xc1, // mov %r8d,%r9d
- 0x49,0xc1,0xe8,0x20, // shr $0x20,%r8
- 0x66,0x4d,0x0f,0x7e,0xd2, // movq %xmm10,%r10
- 0x45,0x89,0xd3, // mov %r10d,%r11d
- 0x49,0xc1,0xea,0x20, // shr $0x20,%r10
- 0xf3,0x46,0x0f,0x10,0x0c,0x90, // movss (%rax,%r10,4),%xmm9
- 0xf3,0x42,0x0f,0x10,0x14,0x80, // movss (%rax,%r8,4),%xmm2
- 0x44,0x0f,0x14,0xca, // unpcklps %xmm2,%xmm9
- 0xf3,0x42,0x0f,0x10,0x14,0x98, // movss (%rax,%r11,4),%xmm2
- 0xf3,0x42,0x0f,0x10,0x1c,0x88, // movss (%rax,%r9,4),%xmm3
- 0x0f,0x14,0xd3, // unpcklps %xmm3,%xmm2
- 0x41,0x0f,0x14,0xd1, // unpcklps %xmm9,%xmm2
- 0x66,0x41,0x0f,0x72,0xd0,0x18, // psrld $0x18,%xmm8
- 0x45,0x0f,0x5b,0xc0, // cvtdq2ps %xmm8,%xmm8
- 0xf3,0x0f,0x10,0x59,0x0c, // movss 0xc(%rcx),%xmm3
- 0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm3,%xmm3
- 0x41,0x0f,0x59,0xd8, // mulps %xmm8,%xmm3
-};
-static const unsigned char sse2_load_8888[] = {
- 0x48,0x8b,0x02, // mov (%rdx),%rax
- 0xf3,0x0f,0x6f,0x1c,0xb8, // movdqu (%rax,%rdi,4),%xmm3
- 0x66,0x0f,0x6e,0x41,0x10, // movd 0x10(%rcx),%xmm0
- 0x66,0x0f,0x70,0xc0,0x00, // pshufd $0x0,%xmm0,%xmm0
- 0x66,0x0f,0x6f,0xcb, // movdqa %xmm3,%xmm1
- 0x66,0x0f,0x72,0xd1,0x08, // psrld $0x8,%xmm1
- 0x66,0x0f,0xdb,0xc8, // pand %xmm0,%xmm1
- 0x66,0x0f,0x6f,0xd3, // movdqa %xmm3,%xmm2
- 0x66,0x0f,0x72,0xd2,0x10, // psrld $0x10,%xmm2
- 0x66,0x0f,0xdb,0xd0, // pand %xmm0,%xmm2
- 0x66,0x0f,0xdb,0xc3, // pand %xmm3,%xmm0
- 0x0f,0x5b,0xc0, // cvtdq2ps %xmm0,%xmm0
- 0xf3,0x44,0x0f,0x10,0x41,0x0c, // movss 0xc(%rcx),%xmm8
- 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
- 0x41,0x0f,0x59,0xc0, // mulps %xmm8,%xmm0
- 0x0f,0x5b,0xc9, // cvtdq2ps %xmm1,%xmm1
- 0x41,0x0f,0x59,0xc8, // mulps %xmm8,%xmm1
- 0x0f,0x5b,0xd2, // cvtdq2ps %xmm2,%xmm2
- 0x41,0x0f,0x59,0xd0, // mulps %xmm8,%xmm2
- 0x66,0x0f,0x72,0xd3,0x18, // psrld $0x18,%xmm3
- 0x0f,0x5b,0xdb, // cvtdq2ps %xmm3,%xmm3
- 0x41,0x0f,0x59,0xd8, // mulps %xmm8,%xmm3
-};
-static const unsigned char sse2_store_8888[] = {
- 0x48,0x8b,0x02, // mov (%rdx),%rax
- 0xf3,0x44,0x0f,0x10,0x41,0x08, // movss 0x8(%rcx),%xmm8
- 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
- 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9
- 0x44,0x0f,0x59,0xc8, // mulps %xmm0,%xmm9
- 0x66,0x45,0x0f,0x5b,0xc9, // cvtps2dq %xmm9,%xmm9
- 0x45,0x0f,0x28,0xd0, // movaps %xmm8,%xmm10
- 0x44,0x0f,0x59,0xd1, // mulps %xmm1,%xmm10
- 0x66,0x45,0x0f,0x5b,0xd2, // cvtps2dq %xmm10,%xmm10
- 0x66,0x41,0x0f,0x72,0xf2,0x08, // pslld $0x8,%xmm10
- 0x66,0x45,0x0f,0xeb,0xd1, // por %xmm9,%xmm10
- 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9
- 0x44,0x0f,0x59,0xca, // mulps %xmm2,%xmm9
- 0x66,0x45,0x0f,0x5b,0xc9, // cvtps2dq %xmm9,%xmm9
- 0x66,0x41,0x0f,0x72,0xf1,0x10, // pslld $0x10,%xmm9
- 0x44,0x0f,0x59,0xc3, // mulps %xmm3,%xmm8
- 0x66,0x45,0x0f,0x5b,0xc0, // cvtps2dq %xmm8,%xmm8
- 0x66,0x41,0x0f,0x72,0xf0,0x18, // pslld $0x18,%xmm8
- 0x66,0x45,0x0f,0xeb,0xc1, // por %xmm9,%xmm8
- 0x66,0x45,0x0f,0xeb,0xc2, // por %xmm10,%xmm8
- 0xf3,0x44,0x0f,0x7f,0x04,0xb8, // movdqu %xmm8,(%rax,%rdi,4)
-};
-static const unsigned char sse2_load_f16[] = {
- 0x48,0x8b,0x02, // mov (%rdx),%rax
- 0xf3,0x0f,0x6f,0x04,0xf8, // movdqu (%rax,%rdi,8),%xmm0
- 0xf3,0x0f,0x6f,0x4c,0xf8,0x10, // movdqu 0x10(%rax,%rdi,8),%xmm1
- 0x66,0x0f,0x6f,0xd8, // movdqa %xmm0,%xmm3
- 0x66,0x0f,0x61,0xd9, // punpcklwd %xmm1,%xmm3
- 0x66,0x0f,0x69,0xc1, // punpckhwd %xmm1,%xmm0
- 0x66,0x0f,0x6f,0xcb, // movdqa %xmm3,%xmm1
- 0x66,0x0f,0x61,0xc8, // punpcklwd %xmm0,%xmm1
- 0x66,0x0f,0x69,0xd8, // punpckhwd %xmm0,%xmm3
- 0x66,0x45,0x0f,0xef,0xc0, // pxor %xmm8,%xmm8
- 0x66,0x0f,0x6f,0xc1, // movdqa %xmm1,%xmm0
- 0x66,0x41,0x0f,0x61,0xc0, // punpcklwd %xmm8,%xmm0
- 0x66,0x0f,0x72,0xf0,0x0d, // pslld $0xd,%xmm0
- 0x66,0x0f,0x6e,0x51,0x5c, // movd 0x5c(%rcx),%xmm2
- 0x66,0x44,0x0f,0x70,0xca,0x00, // pshufd $0x0,%xmm2,%xmm9
- 0x41,0x0f,0x59,0xc1, // mulps %xmm9,%xmm0
- 0x66,0x41,0x0f,0x69,0xc8, // punpckhwd %xmm8,%xmm1
- 0x66,0x0f,0x72,0xf1,0x0d, // pslld $0xd,%xmm1
- 0x41,0x0f,0x59,0xc9, // mulps %xmm9,%xmm1
- 0x66,0x0f,0x6f,0xd3, // movdqa %xmm3,%xmm2
- 0x66,0x41,0x0f,0x61,0xd0, // punpcklwd %xmm8,%xmm2
- 0x66,0x0f,0x72,0xf2,0x0d, // pslld $0xd,%xmm2
- 0x41,0x0f,0x59,0xd1, // mulps %xmm9,%xmm2
- 0x66,0x41,0x0f,0x69,0xd8, // punpckhwd %xmm8,%xmm3
- 0x66,0x0f,0x72,0xf3,0x0d, // pslld $0xd,%xmm3
- 0x41,0x0f,0x59,0xd9, // mulps %xmm9,%xmm3
-};
-static const unsigned char sse2_store_f16[] = {
- 0x48,0x8b,0x02, // mov (%rdx),%rax
- 0x66,0x44,0x0f,0x6e,0x41,0x60, // movd 0x60(%rcx),%xmm8
- 0x66,0x45,0x0f,0x70,0xc0,0x00, // pshufd $0x0,%xmm8,%xmm8
- 0x66,0x45,0x0f,0x6f,0xc8, // movdqa %xmm8,%xmm9
- 0x44,0x0f,0x59,0xc8, // mulps %xmm0,%xmm9
- 0x66,0x41,0x0f,0x72,0xd1,0x0d, // psrld $0xd,%xmm9
- 0x66,0x45,0x0f,0x6f,0xd0, // movdqa %xmm8,%xmm10
- 0x44,0x0f,0x59,0xd1, // mulps %xmm1,%xmm10
- 0x66,0x41,0x0f,0x72,0xd2,0x0d, // psrld $0xd,%xmm10
- 0x66,0x45,0x0f,0x6f,0xd8, // movdqa %xmm8,%xmm11
- 0x44,0x0f,0x59,0xda, // mulps %xmm2,%xmm11
- 0x66,0x41,0x0f,0x72,0xd3,0x0d, // psrld $0xd,%xmm11
- 0x44,0x0f,0x59,0xc3, // mulps %xmm3,%xmm8
- 0x66,0x41,0x0f,0x72,0xd0,0x0d, // psrld $0xd,%xmm8
- 0x66,0x41,0x0f,0x73,0xfa,0x02, // pslldq $0x2,%xmm10
- 0x66,0x45,0x0f,0xeb,0xd1, // por %xmm9,%xmm10
- 0x66,0x41,0x0f,0x73,0xf8,0x02, // pslldq $0x2,%xmm8
- 0x66,0x45,0x0f,0xeb,0xc3, // por %xmm11,%xmm8
- 0x66,0x45,0x0f,0x6f,0xca, // movdqa %xmm10,%xmm9
- 0x66,0x45,0x0f,0x62,0xc8, // punpckldq %xmm8,%xmm9
- 0xf3,0x44,0x0f,0x7f,0x0c,0xf8, // movdqu %xmm9,(%rax,%rdi,8)
- 0x66,0x45,0x0f,0x6a,0xd0, // punpckhdq %xmm8,%xmm10
- 0xf3,0x44,0x0f,0x7f,0x54,0xf8,0x10, // movdqu %xmm10,0x10(%rax,%rdi,8)
-};
-static const unsigned char sse2_clamp_x[] = {
- 0xf3,0x44,0x0f,0x10,0x02, // movss (%rdx),%xmm8
- 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
- 0x66,0x45,0x0f,0x76,0xc9, // pcmpeqd %xmm9,%xmm9
- 0x66,0x45,0x0f,0xfe,0xc8, // paddd %xmm8,%xmm9
- 0x41,0x0f,0x5d,0xc1, // minps %xmm9,%xmm0
- 0x45,0x0f,0x57,0xc0, // xorps %xmm8,%xmm8
- 0x44,0x0f,0x5f,0xc0, // maxps %xmm0,%xmm8
- 0x41,0x0f,0x28,0xc0, // movaps %xmm8,%xmm0
-};
-static const unsigned char sse2_clamp_y[] = {
- 0xf3,0x44,0x0f,0x10,0x02, // movss (%rdx),%xmm8
- 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
- 0x66,0x45,0x0f,0x76,0xc9, // pcmpeqd %xmm9,%xmm9
- 0x66,0x45,0x0f,0xfe,0xc8, // paddd %xmm8,%xmm9
- 0x41,0x0f,0x5d,0xc9, // minps %xmm9,%xmm1
- 0x45,0x0f,0x57,0xc0, // xorps %xmm8,%xmm8
- 0x44,0x0f,0x5f,0xc1, // maxps %xmm1,%xmm8
- 0x41,0x0f,0x28,0xc8, // movaps %xmm8,%xmm1
-};
-static const unsigned char sse2_matrix_2x3[] = {
- 0x44,0x0f,0x28,0xc9, // movaps %xmm1,%xmm9
- 0x44,0x0f,0x28,0xc0, // movaps %xmm0,%xmm8
- 0xf3,0x0f,0x10,0x02, // movss (%rdx),%xmm0
- 0xf3,0x0f,0x10,0x4a,0x04, // movss 0x4(%rdx),%xmm1
- 0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm0,%xmm0
- 0xf3,0x44,0x0f,0x10,0x52,0x08, // movss 0x8(%rdx),%xmm10
- 0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10
- 0xf3,0x44,0x0f,0x10,0x5a,0x10, // movss 0x10(%rdx),%xmm11
- 0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11
- 0x45,0x0f,0x59,0xd1, // mulps %xmm9,%xmm10
- 0x45,0x0f,0x58,0xd3, // addps %xmm11,%xmm10
- 0x41,0x0f,0x59,0xc0, // mulps %xmm8,%xmm0
- 0x41,0x0f,0x58,0xc2, // addps %xmm10,%xmm0
- 0x0f,0xc6,0xc9,0x00, // shufps $0x0,%xmm1,%xmm1
- 0xf3,0x44,0x0f,0x10,0x52,0x0c, // movss 0xc(%rdx),%xmm10
- 0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10
- 0xf3,0x44,0x0f,0x10,0x5a,0x14, // movss 0x14(%rdx),%xmm11
- 0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11
- 0x45,0x0f,0x59,0xd1, // mulps %xmm9,%xmm10
- 0x45,0x0f,0x58,0xd3, // addps %xmm11,%xmm10
- 0x41,0x0f,0x59,0xc8, // mulps %xmm8,%xmm1
- 0x41,0x0f,0x58,0xca, // addps %xmm10,%xmm1
-};
-static const unsigned char sse2_matrix_3x4[] = {
- 0x44,0x0f,0x28,0xc9, // movaps %xmm1,%xmm9
- 0x44,0x0f,0x28,0xc0, // movaps %xmm0,%xmm8
- 0xf3,0x0f,0x10,0x02, // movss (%rdx),%xmm0
- 0xf3,0x0f,0x10,0x4a,0x04, // movss 0x4(%rdx),%xmm1
- 0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm0,%xmm0
- 0xf3,0x44,0x0f,0x10,0x52,0x0c, // movss 0xc(%rdx),%xmm10
- 0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10
- 0xf3,0x44,0x0f,0x10,0x5a,0x18, // movss 0x18(%rdx),%xmm11
- 0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11
- 0xf3,0x44,0x0f,0x10,0x62,0x24, // movss 0x24(%rdx),%xmm12
- 0x45,0x0f,0xc6,0xe4,0x00, // shufps $0x0,%xmm12,%xmm12
- 0x44,0x0f,0x59,0xda, // mulps %xmm2,%xmm11
- 0x45,0x0f,0x58,0xdc, // addps %xmm12,%xmm11
- 0x45,0x0f,0x59,0xd1, // mulps %xmm9,%xmm10
- 0x45,0x0f,0x58,0xd3, // addps %xmm11,%xmm10
- 0x41,0x0f,0x59,0xc0, // mulps %xmm8,%xmm0
- 0x41,0x0f,0x58,0xc2, // addps %xmm10,%xmm0
- 0x0f,0xc6,0xc9,0x00, // shufps $0x0,%xmm1,%xmm1
- 0xf3,0x44,0x0f,0x10,0x52,0x10, // movss 0x10(%rdx),%xmm10
- 0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10
- 0xf3,0x44,0x0f,0x10,0x5a,0x1c, // movss 0x1c(%rdx),%xmm11
- 0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11
- 0xf3,0x44,0x0f,0x10,0x62,0x28, // movss 0x28(%rdx),%xmm12
- 0x45,0x0f,0xc6,0xe4,0x00, // shufps $0x0,%xmm12,%xmm12
- 0x44,0x0f,0x59,0xda, // mulps %xmm2,%xmm11
- 0x45,0x0f,0x58,0xdc, // addps %xmm12,%xmm11
- 0x45,0x0f,0x59,0xd1, // mulps %xmm9,%xmm10
- 0x45,0x0f,0x58,0xd3, // addps %xmm11,%xmm10
- 0x41,0x0f,0x59,0xc8, // mulps %xmm8,%xmm1
- 0x41,0x0f,0x58,0xca, // addps %xmm10,%xmm1
- 0xf3,0x44,0x0f,0x10,0x52,0x08, // movss 0x8(%rdx),%xmm10
- 0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10
- 0xf3,0x44,0x0f,0x10,0x5a,0x14, // movss 0x14(%rdx),%xmm11
- 0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11
- 0xf3,0x44,0x0f,0x10,0x62,0x20, // movss 0x20(%rdx),%xmm12
- 0x45,0x0f,0xc6,0xe4,0x00, // shufps $0x0,%xmm12,%xmm12
- 0xf3,0x44,0x0f,0x10,0x6a,0x2c, // movss 0x2c(%rdx),%xmm13
- 0x45,0x0f,0xc6,0xed,0x00, // shufps $0x0,%xmm13,%xmm13
- 0x44,0x0f,0x59,0xe2, // mulps %xmm2,%xmm12
- 0x45,0x0f,0x58,0xe5, // addps %xmm13,%xmm12
- 0x45,0x0f,0x59,0xd9, // mulps %xmm9,%xmm11
- 0x45,0x0f,0x58,0xdc, // addps %xmm12,%xmm11
- 0x45,0x0f,0x59,0xd0, // mulps %xmm8,%xmm10
- 0x45,0x0f,0x58,0xd3, // addps %xmm11,%xmm10
- 0x41,0x0f,0x28,0xd2, // movaps %xmm10,%xmm2
-};
-static const unsigned char sse2_linear_gradient_2stops[] = {
- 0x44,0x0f,0x10,0x0a, // movups (%rdx),%xmm9
- 0x0f,0x10,0x5a,0x10, // movups 0x10(%rdx),%xmm3
- 0x44,0x0f,0x28,0xc3, // movaps %xmm3,%xmm8
- 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
- 0x41,0x0f,0x28,0xc9, // movaps %xmm9,%xmm1
- 0x0f,0xc6,0xc9,0x00, // shufps $0x0,%xmm1,%xmm1
- 0x44,0x0f,0x59,0xc0, // mulps %xmm0,%xmm8
- 0x44,0x0f,0x58,0xc1, // addps %xmm1,%xmm8
- 0x0f,0x28,0xcb, // movaps %xmm3,%xmm1
- 0x0f,0xc6,0xc9,0x55, // shufps $0x55,%xmm1,%xmm1
- 0x41,0x0f,0x28,0xd1, // movaps %xmm9,%xmm2
- 0x0f,0xc6,0xd2,0x55, // shufps $0x55,%xmm2,%xmm2
- 0x0f,0x59,0xc8, // mulps %xmm0,%xmm1
- 0x0f,0x58,0xca, // addps %xmm2,%xmm1
- 0x0f,0x28,0xd3, // movaps %xmm3,%xmm2
- 0x0f,0xc6,0xd2,0xaa, // shufps $0xaa,%xmm2,%xmm2
- 0x45,0x0f,0x28,0xd1, // movaps %xmm9,%xmm10
- 0x45,0x0f,0xc6,0xd2,0xaa, // shufps $0xaa,%xmm10,%xmm10
- 0x0f,0x59,0xd0, // mulps %xmm0,%xmm2
- 0x41,0x0f,0x58,0xd2, // addps %xmm10,%xmm2
- 0x0f,0xc6,0xdb,0xff, // shufps $0xff,%xmm3,%xmm3
- 0x45,0x0f,0xc6,0xc9,0xff, // shufps $0xff,%xmm9,%xmm9
- 0x0f,0x59,0xd8, // mulps %xmm0,%xmm3
- 0x41,0x0f,0x58,0xd9, // addps %xmm9,%xmm3
- 0x41,0x0f,0x28,0xc0, // movaps %xmm8,%xmm0
-};
-static const unsigned char hsw_inc_x[] = {
- 0x48,0x83,0xc7,0x08, // add $0x8,%rdi
-};
-static const unsigned char hsw_seed_shader[] = {
- 0xc5,0xf9,0x6e,0xc7, // vmovd %edi,%xmm0
- 0xc4,0xe2,0x7d,0x18,0xc0, // vbroadcastss %xmm0,%ymm0
- 0xc5,0xfc,0x5b,0xc0, // vcvtdq2ps %ymm0,%ymm0
- 0xc4,0xe2,0x7d,0x18,0x49,0x04, // vbroadcastss 0x4(%rcx),%ymm1
- 0xc5,0xfc,0x58,0xc1, // vaddps %ymm1,%ymm0,%ymm0
- 0xc5,0xfc,0x58,0x41,0x14, // vaddps 0x14(%rcx),%ymm0,%ymm0
- 0xc4,0xe2,0x7d,0x18,0x12, // vbroadcastss (%rdx),%ymm2
- 0xc5,0xfc,0x5b,0xd2, // vcvtdq2ps %ymm2,%ymm2
- 0xc5,0xec,0x58,0xc9, // vaddps %ymm1,%ymm2,%ymm1
- 0xc4,0xe2,0x7d,0x18,0x11, // vbroadcastss (%rcx),%ymm2
- 0xc5,0xe4,0x57,0xdb, // vxorps %ymm3,%ymm3,%ymm3
- 0xc5,0xdc,0x57,0xe4, // vxorps %ymm4,%ymm4,%ymm4
- 0xc5,0xd4,0x57,0xed, // vxorps %ymm5,%ymm5,%ymm5
- 0xc5,0xcc,0x57,0xf6, // vxorps %ymm6,%ymm6,%ymm6
- 0xc5,0xc4,0x57,0xff, // vxorps %ymm7,%ymm7,%ymm7
-};
-static const unsigned char hsw_constant_color[] = {
- 0xc4,0xe2,0x7d,0x18,0x02, // vbroadcastss (%rdx),%ymm0
- 0xc4,0xe2,0x7d,0x18,0x4a,0x04, // vbroadcastss 0x4(%rdx),%ymm1
- 0xc4,0xe2,0x7d,0x18,0x52,0x08, // vbroadcastss 0x8(%rdx),%ymm2
- 0xc4,0xe2,0x7d,0x18,0x5a,0x0c, // vbroadcastss 0xc(%rdx),%ymm3
-};
-static const unsigned char hsw_clear[] = {
- 0xc5,0xfc,0x57,0xc0, // vxorps %ymm0,%ymm0,%ymm0
- 0xc5,0xf4,0x57,0xc9, // vxorps %ymm1,%ymm1,%ymm1
- 0xc5,0xec,0x57,0xd2, // vxorps %ymm2,%ymm2,%ymm2
- 0xc5,0xe4,0x57,0xdb, // vxorps %ymm3,%ymm3,%ymm3
-};
-static const unsigned char hsw_plus_[] = {
- 0xc5,0xfc,0x58,0xc4, // vaddps %ymm4,%ymm0,%ymm0
- 0xc5,0xf4,0x58,0xcd, // vaddps %ymm5,%ymm1,%ymm1
- 0xc5,0xec,0x58,0xd6, // vaddps %ymm6,%ymm2,%ymm2
- 0xc5,0xe4,0x58,0xdf, // vaddps %ymm7,%ymm3,%ymm3
-};
-static const unsigned char hsw_srcover[] = {
- 0xc4,0x62,0x7d,0x18,0x01, // vbroadcastss (%rcx),%ymm8
- 0xc5,0x3c,0x5c,0xc3, // vsubps %ymm3,%ymm8,%ymm8
- 0xc4,0xc2,0x5d,0xb8,0xc0, // vfmadd231ps %ymm8,%ymm4,%ymm0
- 0xc4,0xc2,0x55,0xb8,0xc8, // vfmadd231ps %ymm8,%ymm5,%ymm1
- 0xc4,0xc2,0x4d,0xb8,0xd0, // vfmadd231ps %ymm8,%ymm6,%ymm2
- 0xc4,0xc2,0x45,0xb8,0xd8, // vfmadd231ps %ymm8,%ymm7,%ymm3
-};
-static const unsigned char hsw_dstover[] = {
- 0xc4,0x62,0x7d,0x18,0x01, // vbroadcastss (%rcx),%ymm8
- 0xc5,0x3c,0x5c,0xc7, // vsubps %ymm7,%ymm8,%ymm8
- 0xc4,0xe2,0x3d,0xa8,0xc4, // vfmadd213ps %ymm4,%ymm8,%ymm0
- 0xc4,0xe2,0x3d,0xa8,0xcd, // vfmadd213ps %ymm5,%ymm8,%ymm1
- 0xc4,0xe2,0x3d,0xa8,0xd6, // vfmadd213ps %ymm6,%ymm8,%ymm2
- 0xc4,0xe2,0x3d,0xa8,0xdf, // vfmadd213ps %ymm7,%ymm8,%ymm3
-};
-static const unsigned char hsw_clamp_0[] = {
- 0xc4,0x41,0x3c,0x57,0xc0, // vxorps %ymm8,%ymm8,%ymm8
- 0xc4,0xc1,0x7c,0x5f,0xc0, // vmaxps %ymm8,%ymm0,%ymm0
- 0xc4,0xc1,0x74,0x5f,0xc8, // vmaxps %ymm8,%ymm1,%ymm1
- 0xc4,0xc1,0x6c,0x5f,0xd0, // vmaxps %ymm8,%ymm2,%ymm2
- 0xc4,0xc1,0x64,0x5f,0xd8, // vmaxps %ymm8,%ymm3,%ymm3
-};
-static const unsigned char hsw_clamp_1[] = {
- 0xc4,0x62,0x7d,0x18,0x01, // vbroadcastss (%rcx),%ymm8
- 0xc4,0xc1,0x7c,0x5d,0xc0, // vminps %ymm8,%ymm0,%ymm0
- 0xc4,0xc1,0x74,0x5d,0xc8, // vminps %ymm8,%ymm1,%ymm1
- 0xc4,0xc1,0x6c,0x5d,0xd0, // vminps %ymm8,%ymm2,%ymm2
- 0xc4,0xc1,0x64,0x5d,0xd8, // vminps %ymm8,%ymm3,%ymm3
-};
-static const unsigned char hsw_clamp_a[] = {
- 0xc4,0x62,0x7d,0x18,0x01, // vbroadcastss (%rcx),%ymm8
- 0xc4,0xc1,0x64,0x5d,0xd8, // vminps %ymm8,%ymm3,%ymm3
- 0xc5,0xfc,0x5d,0xc3, // vminps %ymm3,%ymm0,%ymm0
- 0xc5,0xf4,0x5d,0xcb, // vminps %ymm3,%ymm1,%ymm1
- 0xc5,0xec,0x5d,0xd3, // vminps %ymm3,%ymm2,%ymm2
-};
-static const unsigned char hsw_swap[] = {
- 0xc5,0x7c,0x28,0xc3, // vmovaps %ymm3,%ymm8
- 0xc5,0x7c,0x28,0xca, // vmovaps %ymm2,%ymm9
- 0xc5,0x7c,0x28,0xd1, // vmovaps %ymm1,%ymm10
- 0xc5,0x7c,0x28,0xd8, // vmovaps %ymm0,%ymm11
- 0xc5,0xfc,0x28,0xc4, // vmovaps %ymm4,%ymm0
- 0xc5,0xfc,0x28,0xcd, // vmovaps %ymm5,%ymm1
- 0xc5,0xfc,0x28,0xd6, // vmovaps %ymm6,%ymm2
- 0xc5,0xfc,0x28,0xdf, // vmovaps %ymm7,%ymm3
- 0xc5,0x7c,0x29,0xdc, // vmovaps %ymm11,%ymm4
- 0xc5,0x7c,0x29,0xd5, // vmovaps %ymm10,%ymm5
- 0xc5,0x7c,0x29,0xce, // vmovaps %ymm9,%ymm6
- 0xc5,0x7c,0x29,0xc7, // vmovaps %ymm8,%ymm7
-};
-static const unsigned char hsw_move_src_dst[] = {
- 0xc5,0xfc,0x28,0xe0, // vmovaps %ymm0,%ymm4
- 0xc5,0xfc,0x28,0xe9, // vmovaps %ymm1,%ymm5
- 0xc5,0xfc,0x28,0xf2, // vmovaps %ymm2,%ymm6
- 0xc5,0xfc,0x28,0xfb, // vmovaps %ymm3,%ymm7
-};
-static const unsigned char hsw_move_dst_src[] = {
- 0xc5,0xfc,0x28,0xc4, // vmovaps %ymm4,%ymm0
- 0xc5,0xfc,0x28,0xcd, // vmovaps %ymm5,%ymm1
- 0xc5,0xfc,0x28,0xd6, // vmovaps %ymm6,%ymm2
- 0xc5,0xfc,0x28,0xdf, // vmovaps %ymm7,%ymm3
-};
-static const unsigned char hsw_premul[] = {
- 0xc5,0xfc,0x59,0xc3, // vmulps %ymm3,%ymm0,%ymm0
- 0xc5,0xf4,0x59,0xcb, // vmulps %ymm3,%ymm1,%ymm1
- 0xc5,0xec,0x59,0xd3, // vmulps %ymm3,%ymm2,%ymm2
-};
-static const unsigned char hsw_unpremul[] = {
- 0xc4,0x41,0x3c,0x57,0xc0, // vxorps %ymm8,%ymm8,%ymm8
- 0xc4,0x41,0x64,0xc2,0xc8,0x00, // vcmpeqps %ymm8,%ymm3,%ymm9
- 0xc4,0x62,0x7d,0x18,0x11, // vbroadcastss (%rcx),%ymm10
- 0xc5,0x2c,0x5e,0xd3, // vdivps %ymm3,%ymm10,%ymm10
- 0xc4,0x43,0x2d,0x4a,0xc0,0x90, // vblendvps %ymm9,%ymm8,%ymm10,%ymm8
- 0xc5,0xbc,0x59,0xc0, // vmulps %ymm0,%ymm8,%ymm0
- 0xc5,0xbc,0x59,0xc9, // vmulps %ymm1,%ymm8,%ymm1
- 0xc5,0xbc,0x59,0xd2, // vmulps %ymm2,%ymm8,%ymm2
-};
-static const unsigned char hsw_from_srgb[] = {
- 0xc4,0x62,0x7d,0x18,0x41,0x40, // vbroadcastss 0x40(%rcx),%ymm8
- 0xc5,0x3c,0x59,0xc8, // vmulps %ymm0,%ymm8,%ymm9
- 0xc5,0x7c,0x59,0xd0, // vmulps %ymm0,%ymm0,%ymm10
- 0xc4,0x62,0x7d,0x18,0x59,0x3c, // vbroadcastss 0x3c(%rcx),%ymm11
- 0xc4,0x62,0x7d,0x18,0x61,0x38, // vbroadcastss 0x38(%rcx),%ymm12
- 0xc4,0x41,0x7c,0x28,0xeb, // vmovaps %ymm11,%ymm13
- 0xc4,0x42,0x7d,0xa8,0xec, // vfmadd213ps %ymm12,%ymm0,%ymm13
- 0xc4,0x62,0x7d,0x18,0x71,0x34, // vbroadcastss 0x34(%rcx),%ymm14
- 0xc4,0x42,0x2d,0xa8,0xee, // vfmadd213ps %ymm14,%ymm10,%ymm13
- 0xc4,0x62,0x7d,0x18,0x51,0x44, // vbroadcastss 0x44(%rcx),%ymm10
- 0xc4,0xc1,0x7c,0xc2,0xc2,0x01, // vcmpltps %ymm10,%ymm0,%ymm0
- 0xc4,0xc3,0x15,0x4a,0xc1,0x00, // vblendvps %ymm0,%ymm9,%ymm13,%ymm0
- 0xc5,0x3c,0x59,0xc9, // vmulps %ymm1,%ymm8,%ymm9
- 0xc5,0x74,0x59,0xe9, // vmulps %ymm1,%ymm1,%ymm13
- 0xc4,0x41,0x7c,0x28,0xfb, // vmovaps %ymm11,%ymm15
- 0xc4,0x42,0x75,0xa8,0xfc, // vfmadd213ps %ymm12,%ymm1,%ymm15
- 0xc4,0x42,0x15,0xa8,0xfe, // vfmadd213ps %ymm14,%ymm13,%ymm15
- 0xc4,0xc1,0x74,0xc2,0xca,0x01, // vcmpltps %ymm10,%ymm1,%ymm1
- 0xc4,0xc3,0x05,0x4a,0xc9,0x10, // vblendvps %ymm1,%ymm9,%ymm15,%ymm1
- 0xc5,0x3c,0x59,0xc2, // vmulps %ymm2,%ymm8,%ymm8
- 0xc5,0x6c,0x59,0xca, // vmulps %ymm2,%ymm2,%ymm9
- 0xc4,0x42,0x6d,0xa8,0xdc, // vfmadd213ps %ymm12,%ymm2,%ymm11
- 0xc4,0x42,0x35,0xa8,0xde, // vfmadd213ps %ymm14,%ymm9,%ymm11
- 0xc4,0xc1,0x6c,0xc2,0xd2,0x01, // vcmpltps %ymm10,%ymm2,%ymm2
- 0xc4,0xc3,0x25,0x4a,0xd0,0x20, // vblendvps %ymm2,%ymm8,%ymm11,%ymm2
-};
-static const unsigned char hsw_to_srgb[] = {
- 0xc5,0x7c,0x52,0xc0, // vrsqrtps %ymm0,%ymm8
- 0xc4,0x41,0x7c,0x53,0xc8, // vrcpps %ymm8,%ymm9
- 0xc4,0x41,0x7c,0x52,0xd0, // vrsqrtps %ymm8,%ymm10
- 0xc4,0x62,0x7d,0x18,0x41,0x48, // vbroadcastss 0x48(%rcx),%ymm8
- 0xc5,0x3c,0x59,0xd8, // vmulps %ymm0,%ymm8,%ymm11
- 0xc4,0x62,0x7d,0x18,0x21, // vbroadcastss (%rcx),%ymm12
- 0xc4,0x62,0x7d,0x18,0x69,0x4c, // vbroadcastss 0x4c(%rcx),%ymm13
- 0xc4,0x62,0x7d,0x18,0x71,0x50, // vbroadcastss 0x50(%rcx),%ymm14
- 0xc4,0x62,0x7d,0x18,0x79,0x54, // vbroadcastss 0x54(%rcx),%ymm15
- 0xc4,0x42,0x0d,0xa8,0xcf, // vfmadd213ps %ymm15,%ymm14,%ymm9
- 0xc4,0x42,0x15,0xb8,0xca, // vfmadd231ps %ymm10,%ymm13,%ymm9
- 0xc4,0x41,0x1c,0x5d,0xc9, // vminps %ymm9,%ymm12,%ymm9
- 0xc4,0x62,0x7d,0x18,0x51,0x58, // vbroadcastss 0x58(%rcx),%ymm10
- 0xc4,0xc1,0x7c,0xc2,0xc2,0x01, // vcmpltps %ymm10,%ymm0,%ymm0
- 0xc4,0xc3,0x35,0x4a,0xc3,0x00, // vblendvps %ymm0,%ymm11,%ymm9,%ymm0
- 0xc5,0x7c,0x52,0xc9, // vrsqrtps %ymm1,%ymm9
- 0xc4,0x41,0x7c,0x53,0xd9, // vrcpps %ymm9,%ymm11
- 0xc4,0x41,0x7c,0x52,0xc9, // vrsqrtps %ymm9,%ymm9
- 0xc4,0x42,0x0d,0xa8,0xdf, // vfmadd213ps %ymm15,%ymm14,%ymm11
- 0xc4,0x42,0x15,0xb8,0xd9, // vfmadd231ps %ymm9,%ymm13,%ymm11
- 0xc5,0x3c,0x59,0xc9, // vmulps %ymm1,%ymm8,%ymm9
- 0xc4,0x41,0x1c,0x5d,0xdb, // vminps %ymm11,%ymm12,%ymm11
- 0xc4,0xc1,0x74,0xc2,0xca,0x01, // vcmpltps %ymm10,%ymm1,%ymm1
- 0xc4,0xc3,0x25,0x4a,0xc9,0x10, // vblendvps %ymm1,%ymm9,%ymm11,%ymm1
- 0xc5,0x7c,0x52,0xca, // vrsqrtps %ymm2,%ymm9
- 0xc4,0x41,0x7c,0x53,0xd9, // vrcpps %ymm9,%ymm11
- 0xc4,0x42,0x0d,0xa8,0xdf, // vfmadd213ps %ymm15,%ymm14,%ymm11
- 0xc4,0x41,0x7c,0x52,0xc9, // vrsqrtps %ymm9,%ymm9
- 0xc4,0x42,0x15,0xb8,0xd9, // vfmadd231ps %ymm9,%ymm13,%ymm11
- 0xc4,0x41,0x1c,0x5d,0xcb, // vminps %ymm11,%ymm12,%ymm9
- 0xc5,0x3c,0x59,0xc2, // vmulps %ymm2,%ymm8,%ymm8
- 0xc4,0xc1,0x6c,0xc2,0xd2,0x01, // vcmpltps %ymm10,%ymm2,%ymm2
- 0xc4,0xc3,0x35,0x4a,0xd0,0x20, // vblendvps %ymm2,%ymm8,%ymm9,%ymm2
-};
-static const unsigned char hsw_scale_u8[] = {
- 0x48,0x8b,0x02, // mov (%rdx),%rax
- 0xc4,0x62,0x7d,0x31,0x04,0x38, // vpmovzxbd (%rax,%rdi,1),%ymm8
- 0xc4,0x41,0x7c,0x5b,0xc0, // vcvtdq2ps %ymm8,%ymm8
- 0xc4,0x62,0x7d,0x18,0x49,0x0c, // vbroadcastss 0xc(%rcx),%ymm9
- 0xc4,0x41,0x3c,0x59,0xc1, // vmulps %ymm9,%ymm8,%ymm8
- 0xc5,0xbc,0x59,0xc0, // vmulps %ymm0,%ymm8,%ymm0
- 0xc5,0xbc,0x59,0xc9, // vmulps %ymm1,%ymm8,%ymm1
- 0xc5,0xbc,0x59,0xd2, // vmulps %ymm2,%ymm8,%ymm2
- 0xc5,0xbc,0x59,0xdb, // vmulps %ymm3,%ymm8,%ymm3
-};
-static const unsigned char hsw_load_tables[] = {
- 0x48,0x8b,0x02, // mov (%rdx),%rax
- 0x4c,0x8b,0x42,0x08, // mov 0x8(%rdx),%r8
- 0xc5,0xfc,0x10,0x1c,0xb8, // vmovups (%rax,%rdi,4),%ymm3
- 0xc4,0xe2,0x7d,0x18,0x51,0x10, // vbroadcastss 0x10(%rcx),%ymm2
- 0xc5,0xec,0x54,0xcb, // vandps %ymm3,%ymm2,%ymm1
- 0xc5,0xfc,0x57,0xc0, // vxorps %ymm0,%ymm0,%ymm0
- 0xc5,0x7c,0xc2,0xc0,0x00, // vcmpeqps %ymm0,%ymm0,%ymm8
- 0xc4,0x41,0x7c,0x28,0xc8, // vmovaps %ymm8,%ymm9
- 0xc4,0xc2,0x35,0x92,0x04,0x88, // vgatherdps %ymm9,(%r8,%ymm1,4),%ymm0
- 0x48,0x8b,0x42,0x10, // mov 0x10(%rdx),%rax
- 0xc5,0xf5,0x72,0xd3,0x08, // vpsrld $0x8,%ymm3,%ymm1
- 0xc5,0x6c,0x54,0xc9, // vandps %ymm1,%ymm2,%ymm9
- 0xc4,0x41,0x7c,0x28,0xd0, // vmovaps %ymm8,%ymm10
- 0xc4,0xa2,0x2d,0x92,0x0c,0x88, // vgatherdps %ymm10,(%rax,%ymm9,4),%ymm1
- 0x48,0x8b,0x42,0x18, // mov 0x18(%rdx),%rax
- 0xc5,0xb5,0x72,0xd3,0x10, // vpsrld $0x10,%ymm3,%ymm9
- 0xc4,0x41,0x6c,0x54,0xc9, // vandps %ymm9,%ymm2,%ymm9
- 0xc4,0xa2,0x3d,0x92,0x14,0x88, // vgatherdps %ymm8,(%rax,%ymm9,4),%ymm2
- 0xc5,0xe5,0x72,0xd3,0x18, // vpsrld $0x18,%ymm3,%ymm3
- 0xc5,0xfc,0x5b,0xdb, // vcvtdq2ps %ymm3,%ymm3
- 0xc4,0x62,0x7d,0x18,0x41,0x0c, // vbroadcastss 0xc(%rcx),%ymm8
- 0xc4,0xc1,0x64,0x59,0xd8, // vmulps %ymm8,%ymm3,%ymm3
-};
-static const unsigned char hsw_load_8888[] = {
- 0x48,0x8b,0x02, // mov (%rdx),%rax
- 0xc5,0xfc,0x10,0x1c,0xb8, // vmovups (%rax,%rdi,4),%ymm3
- 0xc4,0xe2,0x7d,0x18,0x51,0x10, // vbroadcastss 0x10(%rcx),%ymm2
- 0xc5,0xec,0x54,0xc3, // vandps %ymm3,%ymm2,%ymm0
- 0xc5,0xfc,0x5b,0xc0, // vcvtdq2ps %ymm0,%ymm0
- 0xc4,0x62,0x7d,0x18,0x41,0x0c, // vbroadcastss 0xc(%rcx),%ymm8
- 0xc5,0xbc,0x59,0xc0, // vmulps %ymm0,%ymm8,%ymm0
- 0xc5,0xf5,0x72,0xd3,0x08, // vpsrld $0x8,%ymm3,%ymm1
- 0xc5,0xec,0x54,0xc9, // vandps %ymm1,%ymm2,%ymm1
- 0xc5,0xfc,0x5b,0xc9, // vcvtdq2ps %ymm1,%ymm1
- 0xc5,0xbc,0x59,0xc9, // vmulps %ymm1,%ymm8,%ymm1
- 0xc5,0xb5,0x72,0xd3,0x10, // vpsrld $0x10,%ymm3,%ymm9
- 0xc4,0xc1,0x6c,0x54,0xd1, // vandps %ymm9,%ymm2,%ymm2
- 0xc5,0xfc,0x5b,0xd2, // vcvtdq2ps %ymm2,%ymm2
- 0xc5,0xbc,0x59,0xd2, // vmulps %ymm2,%ymm8,%ymm2
- 0xc5,0xe5,0x72,0xd3,0x18, // vpsrld $0x18,%ymm3,%ymm3
- 0xc5,0xfc,0x5b,0xdb, // vcvtdq2ps %ymm3,%ymm3
- 0xc4,0xc1,0x64,0x59,0xd8, // vmulps %ymm8,%ymm3,%ymm3
-};
-static const unsigned char hsw_store_8888[] = {
- 0x48,0x8b,0x02, // mov (%rdx),%rax
- 0xc4,0x62,0x7d,0x18,0x41,0x08, // vbroadcastss 0x8(%rcx),%ymm8
- 0xc5,0x3c,0x59,0xc8, // vmulps %ymm0,%ymm8,%ymm9
- 0xc4,0x41,0x7d,0x5b,0xc9, // vcvtps2dq %ymm9,%ymm9
- 0xc5,0x3c,0x59,0xd1, // vmulps %ymm1,%ymm8,%ymm10
- 0xc4,0x41,0x7d,0x5b,0xd2, // vcvtps2dq %ymm10,%ymm10
- 0xc4,0xc1,0x2d,0x72,0xf2,0x08, // vpslld $0x8,%ymm10,%ymm10
- 0xc4,0x41,0x2d,0xeb,0xc9, // vpor %ymm9,%ymm10,%ymm9
- 0xc5,0x3c,0x59,0xd2, // vmulps %ymm2,%ymm8,%ymm10
- 0xc4,0x41,0x7d,0x5b,0xd2, // vcvtps2dq %ymm10,%ymm10
- 0xc4,0xc1,0x2d,0x72,0xf2,0x10, // vpslld $0x10,%ymm10,%ymm10
- 0xc5,0x3c,0x59,0xc3, // vmulps %ymm3,%ymm8,%ymm8
- 0xc4,0x41,0x7d,0x5b,0xc0, // vcvtps2dq %ymm8,%ymm8
- 0xc4,0xc1,0x3d,0x72,0xf0,0x18, // vpslld $0x18,%ymm8,%ymm8
- 0xc4,0x41,0x2d,0xeb,0xc0, // vpor %ymm8,%ymm10,%ymm8
- 0xc4,0x41,0x35,0xeb,0xc0, // vpor %ymm8,%ymm9,%ymm8
- 0xc5,0x7e,0x7f,0x04,0xb8, // vmovdqu %ymm8,(%rax,%rdi,4)
-};
-static const unsigned char hsw_load_f16[] = {
- 0x48,0x8b,0x02, // mov (%rdx),%rax
- 0xc5,0xfa,0x6f,0x04,0xf8, // vmovdqu (%rax,%rdi,8),%xmm0
- 0xc5,0xfa,0x6f,0x4c,0xf8,0x10, // vmovdqu 0x10(%rax,%rdi,8),%xmm1
- 0xc5,0xfa,0x6f,0x54,0xf8,0x20, // vmovdqu 0x20(%rax,%rdi,8),%xmm2
- 0xc5,0xfa,0x6f,0x5c,0xf8,0x30, // vmovdqu 0x30(%rax,%rdi,8),%xmm3
- 0xc5,0x79,0x61,0xc1, // vpunpcklwd %xmm1,%xmm0,%xmm8
- 0xc5,0xf9,0x69,0xc1, // vpunpckhwd %xmm1,%xmm0,%xmm0
- 0xc5,0xe9,0x61,0xcb, // vpunpcklwd %xmm3,%xmm2,%xmm1
- 0xc5,0xe9,0x69,0xd3, // vpunpckhwd %xmm3,%xmm2,%xmm2
- 0xc5,0x39,0x61,0xc8, // vpunpcklwd %xmm0,%xmm8,%xmm9
- 0xc5,0x39,0x69,0xc0, // vpunpckhwd %xmm0,%xmm8,%xmm8
- 0xc5,0xf1,0x61,0xda, // vpunpcklwd %xmm2,%xmm1,%xmm3
- 0xc5,0x71,0x69,0xd2, // vpunpckhwd %xmm2,%xmm1,%xmm10
- 0xc5,0xb1,0x6c,0xc3, // vpunpcklqdq %xmm3,%xmm9,%xmm0
- 0xc4,0xe2,0x7d,0x13,0xc0, // vcvtph2ps %xmm0,%ymm0
- 0xc5,0xb1,0x6d,0xcb, // vpunpckhqdq %xmm3,%xmm9,%xmm1
- 0xc4,0xe2,0x7d,0x13,0xc9, // vcvtph2ps %xmm1,%ymm1
- 0xc4,0xc1,0x39,0x6c,0xd2, // vpunpcklqdq %xmm10,%xmm8,%xmm2
- 0xc4,0xe2,0x7d,0x13,0xd2, // vcvtph2ps %xmm2,%ymm2
- 0xc4,0xc1,0x39,0x6d,0xda, // vpunpckhqdq %xmm10,%xmm8,%xmm3
- 0xc4,0xe2,0x7d,0x13,0xdb, // vcvtph2ps %xmm3,%ymm3
-};
-static const unsigned char hsw_store_f16[] = {
- 0x48,0x8b,0x02, // mov (%rdx),%rax
- 0xc4,0xc3,0x7d,0x1d,0xc0,0x04, // vcvtps2ph $0x4,%ymm0,%xmm8
- 0xc4,0xc3,0x7d,0x1d,0xc9,0x04, // vcvtps2ph $0x4,%ymm1,%xmm9
- 0xc4,0xc3,0x7d,0x1d,0xd2,0x04, // vcvtps2ph $0x4,%ymm2,%xmm10
- 0xc4,0xc3,0x7d,0x1d,0xdb,0x04, // vcvtps2ph $0x4,%ymm3,%xmm11
- 0xc4,0x41,0x39,0x61,0xe1, // vpunpcklwd %xmm9,%xmm8,%xmm12
- 0xc4,0x41,0x39,0x69,0xc1, // vpunpckhwd %xmm9,%xmm8,%xmm8
- 0xc4,0x41,0x29,0x61,0xcb, // vpunpcklwd %xmm11,%xmm10,%xmm9
- 0xc4,0x41,0x29,0x69,0xd3, // vpunpckhwd %xmm11,%xmm10,%xmm10
- 0xc4,0x41,0x19,0x62,0xd9, // vpunpckldq %xmm9,%xmm12,%xmm11
- 0xc5,0x7a,0x7f,0x1c,0xf8, // vmovdqu %xmm11,(%rax,%rdi,8)
- 0xc4,0x41,0x19,0x6a,0xc9, // vpunpckhdq %xmm9,%xmm12,%xmm9
- 0xc5,0x7a,0x7f,0x4c,0xf8,0x10, // vmovdqu %xmm9,0x10(%rax,%rdi,8)
- 0xc4,0x41,0x39,0x62,0xca, // vpunpckldq %xmm10,%xmm8,%xmm9
- 0xc5,0x7a,0x7f,0x4c,0xf8,0x20, // vmovdqu %xmm9,0x20(%rax,%rdi,8)
- 0xc4,0x41,0x39,0x6a,0xc2, // vpunpckhdq %xmm10,%xmm8,%xmm8
- 0xc5,0x7a,0x7f,0x44,0xf8,0x30, // vmovdqu %xmm8,0x30(%rax,%rdi,8)
-};
-static const unsigned char hsw_clamp_x[] = {
- 0xc4,0x62,0x7d,0x58,0x02, // vpbroadcastd (%rdx),%ymm8
- 0xc4,0x41,0x35,0x76,0xc9, // vpcmpeqd %ymm9,%ymm9,%ymm9
- 0xc4,0x41,0x3d,0xfe,0xc1, // vpaddd %ymm9,%ymm8,%ymm8
- 0xc4,0xc1,0x7c,0x5d,0xc0, // vminps %ymm8,%ymm0,%ymm0
- 0xc4,0x41,0x3c,0x57,0xc0, // vxorps %ymm8,%ymm8,%ymm8
- 0xc5,0xbc,0x5f,0xc0, // vmaxps %ymm0,%ymm8,%ymm0
-};
-static const unsigned char hsw_clamp_y[] = {
- 0xc4,0x62,0x7d,0x58,0x02, // vpbroadcastd (%rdx),%ymm8
- 0xc4,0x41,0x35,0x76,0xc9, // vpcmpeqd %ymm9,%ymm9,%ymm9
- 0xc4,0x41,0x3d,0xfe,0xc1, // vpaddd %ymm9,%ymm8,%ymm8
- 0xc4,0xc1,0x74,0x5d,0xc8, // vminps %ymm8,%ymm1,%ymm1
- 0xc4,0x41,0x3c,0x57,0xc0, // vxorps %ymm8,%ymm8,%ymm8
- 0xc5,0xbc,0x5f,0xc9, // vmaxps %ymm1,%ymm8,%ymm1
-};
-static const unsigned char hsw_matrix_2x3[] = {
- 0xc4,0x62,0x7d,0x18,0x0a, // vbroadcastss (%rdx),%ymm9
- 0xc4,0x62,0x7d,0x18,0x52,0x08, // vbroadcastss 0x8(%rdx),%ymm10
- 0xc4,0x62,0x7d,0x18,0x42,0x10, // vbroadcastss 0x10(%rdx),%ymm8
- 0xc4,0x42,0x75,0xb8,0xc2, // vfmadd231ps %ymm10,%ymm1,%ymm8
- 0xc4,0x42,0x7d,0xb8,0xc1, // vfmadd231ps %ymm9,%ymm0,%ymm8
- 0xc4,0x62,0x7d,0x18,0x52,0x04, // vbroadcastss 0x4(%rdx),%ymm10
- 0xc4,0x62,0x7d,0x18,0x5a,0x0c, // vbroadcastss 0xc(%rdx),%ymm11
- 0xc4,0x62,0x7d,0x18,0x4a,0x14, // vbroadcastss 0x14(%rdx),%ymm9
- 0xc4,0x42,0x75,0xb8,0xcb, // vfmadd231ps %ymm11,%ymm1,%ymm9
- 0xc4,0x42,0x7d,0xb8,0xca, // vfmadd231ps %ymm10,%ymm0,%ymm9
- 0xc5,0x7c,0x29,0xc0, // vmovaps %ymm8,%ymm0
- 0xc5,0x7c,0x29,0xc9, // vmovaps %ymm9,%ymm1
-};
-static const unsigned char hsw_matrix_3x4[] = {
- 0xc4,0x62,0x7d,0x18,0x0a, // vbroadcastss (%rdx),%ymm9
- 0xc4,0x62,0x7d,0x18,0x52,0x0c, // vbroadcastss 0xc(%rdx),%ymm10
- 0xc4,0x62,0x7d,0x18,0x5a,0x18, // vbroadcastss 0x18(%rdx),%ymm11
- 0xc4,0x62,0x7d,0x18,0x42,0x24, // vbroadcastss 0x24(%rdx),%ymm8
- 0xc4,0x42,0x6d,0xb8,0xc3, // vfmadd231ps %ymm11,%ymm2,%ymm8
- 0xc4,0x42,0x75,0xb8,0xc2, // vfmadd231ps %ymm10,%ymm1,%ymm8
- 0xc4,0x42,0x7d,0xb8,0xc1, // vfmadd231ps %ymm9,%ymm0,%ymm8
- 0xc4,0x62,0x7d,0x18,0x52,0x04, // vbroadcastss 0x4(%rdx),%ymm10
- 0xc4,0x62,0x7d,0x18,0x5a,0x10, // vbroadcastss 0x10(%rdx),%ymm11
- 0xc4,0x62,0x7d,0x18,0x62,0x1c, // vbroadcastss 0x1c(%rdx),%ymm12
- 0xc4,0x62,0x7d,0x18,0x4a,0x28, // vbroadcastss 0x28(%rdx),%ymm9
- 0xc4,0x42,0x6d,0xb8,0xcc, // vfmadd231ps %ymm12,%ymm2,%ymm9
- 0xc4,0x42,0x75,0xb8,0xcb, // vfmadd231ps %ymm11,%ymm1,%ymm9
- 0xc4,0x42,0x7d,0xb8,0xca, // vfmadd231ps %ymm10,%ymm0,%ymm9
- 0xc4,0x62,0x7d,0x18,0x5a,0x08, // vbroadcastss 0x8(%rdx),%ymm11
- 0xc4,0x62,0x7d,0x18,0x62,0x14, // vbroadcastss 0x14(%rdx),%ymm12
- 0xc4,0x62,0x7d,0x18,0x6a,0x20, // vbroadcastss 0x20(%rdx),%ymm13
- 0xc4,0x62,0x7d,0x18,0x52,0x2c, // vbroadcastss 0x2c(%rdx),%ymm10
- 0xc4,0x42,0x6d,0xb8,0xd5, // vfmadd231ps %ymm13,%ymm2,%ymm10
- 0xc4,0x42,0x75,0xb8,0xd4, // vfmadd231ps %ymm12,%ymm1,%ymm10
- 0xc4,0x42,0x7d,0xb8,0xd3, // vfmadd231ps %ymm11,%ymm0,%ymm10
- 0xc5,0x7c,0x29,0xc0, // vmovaps %ymm8,%ymm0
- 0xc5,0x7c,0x29,0xc9, // vmovaps %ymm9,%ymm1
- 0xc5,0x7c,0x29,0xd2, // vmovaps %ymm10,%ymm2
-};
-static const unsigned char hsw_linear_gradient_2stops[] = {
- 0xc4,0xe2,0x7d,0x18,0x4a,0x10, // vbroadcastss 0x10(%rdx),%ymm1
- 0xc4,0x62,0x7d,0x18,0x02, // vbroadcastss (%rdx),%ymm8
- 0xc4,0x62,0x7d,0xb8,0xc1, // vfmadd231ps %ymm1,%ymm0,%ymm8
- 0xc4,0xe2,0x7d,0x18,0x52,0x14, // vbroadcastss 0x14(%rdx),%ymm2
- 0xc4,0xe2,0x7d,0x18,0x4a,0x04, // vbroadcastss 0x4(%rdx),%ymm1
- 0xc4,0xe2,0x7d,0xb8,0xca, // vfmadd231ps %ymm2,%ymm0,%ymm1
- 0xc4,0xe2,0x7d,0x18,0x5a,0x18, // vbroadcastss 0x18(%rdx),%ymm3
- 0xc4,0xe2,0x7d,0x18,0x52,0x08, // vbroadcastss 0x8(%rdx),%ymm2
- 0xc4,0xe2,0x7d,0xb8,0xd3, // vfmadd231ps %ymm3,%ymm0,%ymm2
- 0xc4,0x62,0x7d,0x18,0x4a,0x1c, // vbroadcastss 0x1c(%rdx),%ymm9
- 0xc4,0xe2,0x7d,0x18,0x5a,0x0c, // vbroadcastss 0xc(%rdx),%ymm3
- 0xc4,0xc2,0x7d,0xb8,0xd9, // vfmadd231ps %ymm9,%ymm0,%ymm3
- 0xc5,0x7c,0x29,0xc0, // vmovaps %ymm8,%ymm0
-};
-#endif//SkSplicer_generated_DEFINED
diff --git a/src/splicer/SkSplicer_shared.h b/src/splicer/SkSplicer_shared.h
deleted file mode 100644
index 1747e42b4b..0000000000
--- a/src/splicer/SkSplicer_shared.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifndef SkSplicer_shared_DEFINED
-#define SkSplicer_shared_DEFINED
-
-// This file contains definitions shared by SkSplicer.cpp (compiled normally as part of Skia)
-// and SkSplicer_stages.cpp (compiled offline into SkSplicer_generated.h). Keep it simple!
-
-#include <stdint.h>
-
-// SkSplicer Stages can use constant literals only if they end up baked into the instruction,
-// like bit shifts and rounding modes. Any other constant values must be pulled from this struct
-// (except 0 and 0.0f, which always end up as some sort of xor instruction).
-//
-// This constraint makes it much easier to move and reorder the code for each Stage.
-
-struct SkSplicer_constants {
- float _1; // 1.0f
- float _0_5; // 0.5f
- float _255; // 255.0f
- float _1_255; // 1/255.0f
- uint32_t _0x000000ff; // 0x000000ff
-
- float iota[8]; // 0,1,2,3,4,5,6,7
-
- // from_srgb
- float _00025; // 0.0025f
- float _06975; // 0.6975f
- float _03000; // 0.3000f
- float _1_1292; // 1/12.92f
- float _0055; // 0.055f
-
- // to_srgb
- float _1246; // 12.46f
- float _0411192; // 0.411192f
- float _0689206; // 0.689206f
- float n_00988; // -0.0988f
- float _00043; // 0.0043f
-
- // fp16 <-> fp32
- uint32_t _0x77800000;
- uint32_t _0x07800000;
-};
-
-#endif//SkSplicer_shared_DEFINED
diff --git a/src/splicer/SkSplicer_stages.cpp b/src/splicer/SkSplicer_stages.cpp
deleted file mode 100644
index 19daee3fbb..0000000000
--- a/src/splicer/SkSplicer_stages.cpp
+++ /dev/null
@@ -1,584 +0,0 @@
-/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "SkSplicer_shared.h"
-#include <string.h>
-
-// It's tricky to relocate code referencing ordinary constants, so we read them from this struct.
-using K = const SkSplicer_constants;
-
-#if !defined(SPLICER) && !defined(JUMPER)
- // This path should lead to portable code that can be compiled directly into Skia.
- // (All other paths are compiled offline by Clang into SkSplicer_generated.h.)
- #include <math.h>
-
- using F = float;
- using I32 = int32_t;
- using U32 = uint32_t;
- using U8 = uint8_t;
-
- static F fma(F f, F m, F a) { return f*m+a; }
- static F min(F a, F b) { return fminf(a,b); }
- static F max(F a, F b) { return fmaxf(a,b); }
- static F rcp (F v) { return 1.0f / v; }
- static F rsqrt(F v) { return 1.0f / sqrtf(v); }
- static U32 round(F v, F scale) { return (uint32_t)(v*scale); }
-
- static F if_then_else(I32 c, F t, F e) { return c ? t : e; }
-
- static F gather(const float* p, U32 ix) { return p[ix]; }
-
-#elif defined(__aarch64__)
- #include <arm_neon.h>
-
- // Since we know we're using Clang, we can use its vector extensions.
- using F = float __attribute__((ext_vector_type(4)));
- using I32 = int32_t __attribute__((ext_vector_type(4)));
- using U32 = uint32_t __attribute__((ext_vector_type(4)));
- using U8 = uint8_t __attribute__((ext_vector_type(4)));
-
- // We polyfill a few routines that Clang doesn't build into ext_vector_types.
- static F fma(F f, F m, F a) { return vfmaq_f32(a,f,m); }
- static F min(F a, F b) { return vminq_f32(a,b); }
- static F max(F a, F b) { return vmaxq_f32(a,b); }
- static F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; }
- static F rsqrt(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
- static U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
-
- static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
-
- static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
-
-#elif defined(__ARM_NEON__)
- #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
- #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb.
- #endif
- #include <arm_neon.h>
-
- // We can pass {s0-s15} as arguments under AAPCS-VFP. We'll slice that as 8 d-registers.
- using F = float __attribute__((ext_vector_type(2)));
- using I32 = int32_t __attribute__((ext_vector_type(2)));
- using U32 = uint32_t __attribute__((ext_vector_type(2)));
- using U8 = uint8_t __attribute__((ext_vector_type(2)));
-
- static F fma(F f, F m, F a) { return vfma_f32(a,f,m); }
- static F min(F a, F b) { return vmin_f32(a,b); }
- static F max(F a, F b) { return vmax_f32(a,b); }
- static F rcp (F v) { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e ) * e; }
- static F rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
- static U32 round(F v, F scale) { return vcvt_u32_f32(fma(v,scale,0.5f)); }
-
- static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
-
- static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
-
-#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
- #include <immintrin.h>
-
- // These are __m256 and __m256i, but friendlier and strongly-typed.
- using F = float __attribute__((ext_vector_type(8)));
- using I32 = int32_t __attribute__((ext_vector_type(8)));
- using U32 = uint32_t __attribute__((ext_vector_type(8)));
- using U8 = uint8_t __attribute__((ext_vector_type(8)));
-
- static F fma(F f, F m, F a) { return _mm256_fmadd_ps(f,m,a);}
- static F min(F a, F b) { return _mm256_min_ps(a,b); }
- static F max(F a, F b) { return _mm256_max_ps(a,b); }
- static F rcp (F v) { return _mm256_rcp_ps (v); }
- static F rsqrt(F v) { return _mm256_rsqrt_ps(v); }
- static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
-
- static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
-
- static F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, ix, 4); }
-
-#elif defined(__SSE2__)
- #include <immintrin.h>
-
- using F = float __attribute__((ext_vector_type(4)));
- using I32 = int32_t __attribute__((ext_vector_type(4)));
- using U32 = uint32_t __attribute__((ext_vector_type(4)));
- using U8 = uint8_t __attribute__((ext_vector_type(4)));
-
- static F fma(F f, F m, F a) { return f*m+a; }
- static F min(F a, F b) { return _mm_min_ps(a,b); }
- static F max(F a, F b) { return _mm_max_ps(a,b); }
- static F rcp (F v) { return _mm_rcp_ps (v); }
- static F rsqrt(F v) { return _mm_rsqrt_ps(v); }
- static U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }
-
- static F if_then_else(I32 c, F t, F e) {
- #if defined(__SSE4_1__)
- return _mm_blendv_ps(e,t,c);
- #else
- return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e));
- #endif
- }
-
- static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
-#endif
-
-// We need to be a careful with casts.
-// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
-// These named casts and bit_cast() are always what they seem to be.
-#if !defined(SPLICER) && !defined(JUMPER)
- static F cast (U32 v) { return (F)v; }
- static U32 expand(U8 v) { return (U32)v; }
-#else
- static F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
- static U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
-#endif
-
-template <typename T, typename P>
-static T unaligned_load(const P* p) {
- T v;
- memcpy(&v, p, sizeof(v));
- return v;
-}
-
-template <typename Dst, typename Src>
-static Dst bit_cast(const Src& src) {
- static_assert(sizeof(Dst) == sizeof(Src), "");
- return unaligned_load<Dst>(&src);
-}
-
-// Sometimes we want to work with 4 floats directly, regardless of the depth of the F vector.
-#if !defined(SPLICER) && !defined(JUMPER)
- struct F4 {
- float vals[4];
- float operator[](int i) const { return vals[i]; }
- };
-#else
- using F4 = float __attribute__((ext_vector_type(4)));
-#endif
-
-// We'll be compiling this file to an object file, then extracting parts of it into
-// SkSplicer_generated.h. It's easier to do if the function names are not C++ mangled.
-#define C extern "C"
-
-#if defined(SPLICER)
- // Splicer Stages all fit a common interface that allows SkSplicer to splice them together.
- // (This is just for reference... nothing uses this type when we're in Splicer mode.)
- using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F);
-
- // Stage's arguments act as the working set of registers within the final spliced function.
- // Here's a little primer on the x86-64/aarch64 ABIs:
- // x: rdi/x0 x and limit work to drive the loop, see loop_start in SkSplicer.cpp.
- // limit: rsi/x1
- // ctx: rdx/x2 Look for set_ctx in SkSplicer.cpp to see how this works.
- // k: rcx/x3
- // vectors: ymm0-ymm7/v0-v7
-
- // done() is the key to this entire splicing strategy.
- //
- // It matches the signature of Stage, so all the registers are kept live.
- // Every Stage calls done() and so will end in a single jmp (i.e. tail-call) into done(),
- // which marks the point where we can splice one Stage onto the next.
- //
- // The lovely bit is that we don't have to define done(), just declare it.
- C void done(size_t, size_t, void*, K*, F,F,F,F, F,F,F,F);
-
- // This should feel familiar to anyone who's read SkRasterPipeline_opts.h.
- // It's just a convenience to make a valid, spliceable Stage, nothing magic.
- #define STAGE(name) \
- static void name##_k(size_t& x, size_t limit, void* ctx, K* k, \
- F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
- C void name(size_t x, size_t limit, void* ctx, K* k, \
- F r, F g, F b, F a, F dr, F dg, F db, F da) { \
- name##_k(x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \
- done (x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \
- } \
- static void name##_k(size_t& x, size_t limit, void* ctx, K* k, \
- F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
-#else
- // Jumper and portable Stages tail call between each other by following
- // program, an interlaced sequence of Stage pointers and context pointers.
- using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);
-
- static void* load_and_inc(void**& program) {
- #if defined(__GNUC__) && defined(__x86_64__)
- // Passing program as the second Stage argument makes it likely that it's in %rsi,
- // so this is usually a single instruction *program++.
- void* rax;
- asm("lodsq" : "=a"(rax), "+S"(program)); // Write-only %rax, read-write %rsi.
- return rax;
- // When a Stage uses its ctx pointer, this optimization typically cuts an instruction:
- // mov (%rsi), %rcx // ctx = program[0]
- // ...
- // mov 0x8(%rsi), %rax // next = program[1]
- // add $0x10, %rsi // program += 2
- // jmpq *%rax // JUMP!
- // becomes
- // lods %ds:(%rsi),%rax // ctx = *program++;
- // ...
- // lods %ds:(%rsi),%rax // next = *program++;
- // jmpq *%rax // JUMP!
- //
- // When a Stage doesn't use its ctx pointer, it's 3 instructions either way,
- // but using lodsq (a 2-byte instruction) tends to trim a few bytes.
- #else
- // On ARM *program++ compiles into a single instruction without any handholding.
- return *program++;
- #endif
- }
-
- #define STAGE(name) \
- static void name##_k(size_t& x, void* ctx, K* k, \
- F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
- C void name(size_t x, void** program, K* k, \
- F r, F g, F b, F a, F dr, F dg, F db, F da) { \
- auto ctx = load_and_inc(program); \
- name##_k(x,ctx,k, r,g,b,a, dr,dg,db,da); \
- auto next = (Stage*)load_and_inc(program); \
- next(x,program,k, r,g,b,a, dr,dg,db,da); \
- } \
- static void name##_k(size_t& x, void* ctx, K* k, \
- F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
-#endif
-
-// We can now define Stages!
-
-// Some things to keep in mind while writing Stages:
-// - do not branch; (i.e. avoid jmp)
-// - do not call functions that don't inline; (i.e. avoid call, ret)
-// - do not use constant literals other than 0 and 0.0f. (i.e. avoid rip relative addressing)
-//
-// Some things that should work fine:
-// - 0, ~0, and 0.0f;
-// - arithmetic;
-// - functions of F and U32 that we've defined above;
-// - temporary values;
-// - lambdas;
-// - memcpy() with a compile-time constant size argument.
-
-STAGE(inc_x) {
- x += sizeof(F) / sizeof(float);
-}
-
-STAGE(seed_shader) {
- auto y = *(const int*)ctx;
-
- // It's important for speed to explicitly cast(x) and cast(y),
- // which has the effect of splatting them to vectors before converting to floats.
- // On Intel this breaks a data dependency on previous loop iterations' registers.
-
- r = cast(x) + k->_0_5 + unaligned_load<F>(k->iota);
- g = cast(y) + k->_0_5;
- b = k->_1;
- a = 0;
- dr = dg = db = da = 0;
-}
-
-STAGE(constant_color) {
- auto rgba = unaligned_load<F4>(ctx);
- r = rgba[0];
- g = rgba[1];
- b = rgba[2];
- a = rgba[3];
-}
-
-STAGE(clear) {
- r = g = b = a = 0;
-}
-
-STAGE(plus_) {
- r = r + dr;
- g = g + dg;
- b = b + db;
- a = a + da;
-}
-
-STAGE(srcover) {
- auto A = k->_1 - a;
- r = fma(dr, A, r);
- g = fma(dg, A, g);
- b = fma(db, A, b);
- a = fma(da, A, a);
-}
-STAGE(dstover) {
- auto DA = k->_1 - da;
- r = fma(r, DA, dr);
- g = fma(g, DA, dg);
- b = fma(b, DA, db);
- a = fma(a, DA, da);
-}
-
-STAGE(clamp_0) {
- r = max(r, 0);
- g = max(g, 0);
- b = max(b, 0);
- a = max(a, 0);
-}
-
-STAGE(clamp_1) {
- r = min(r, k->_1);
- g = min(g, k->_1);
- b = min(b, k->_1);
- a = min(a, k->_1);
-}
-
-STAGE(clamp_a) {
- a = min(a, k->_1);
- r = min(r, a);
- g = min(g, a);
- b = min(b, a);
-}
-
-STAGE(swap) {
- auto swap = [](F& v, F& dv) {
- auto tmp = v;
- v = dv;
- dv = tmp;
- };
- swap(r, dr);
- swap(g, dg);
- swap(b, db);
- swap(a, da);
-}
-STAGE(move_src_dst) {
- dr = r;
- dg = g;
- db = b;
- da = a;
-}
-STAGE(move_dst_src) {
- r = dr;
- g = dg;
- b = db;
- a = da;
-}
-
-STAGE(premul) {
- r = r * a;
- g = g * a;
- b = b * a;
-}
-STAGE(unpremul) {
- auto scale = if_then_else(a == 0, 0, k->_1 / a);
- r = r * scale;
- g = g * scale;
- b = b * scale;
-}
-
-STAGE(from_srgb) {
- auto fn = [&](F s) {
- auto lo = s * k->_1_1292;
- auto hi = fma(s*s, fma(s, k->_03000, k->_06975), k->_00025);
- return if_then_else(s < k->_0055, lo, hi);
- };
- r = fn(r);
- g = fn(g);
- b = fn(b);
-}
-STAGE(to_srgb) {
- auto fn = [&](F l) {
- F sqrt = rcp (rsqrt(l)),
- ftrt = rsqrt(rsqrt(l));
- auto lo = l * k->_1246;
- auto hi = min(k->_1, fma(k->_0411192, ftrt,
- fma(k->_0689206, sqrt,
- k->n_00988)));
- return if_then_else(l < k->_00043, lo, hi);
- };
- r = fn(r);
- g = fn(g);
- b = fn(b);
-}
-
-STAGE(scale_u8) {
- auto ptr = *(const uint8_t**)ctx + x;
-
- auto scales = unaligned_load<U8>(ptr);
- auto c = cast(expand(scales)) * k->_1_255;
-
- r = r * c;
- g = g * c;
- b = b * c;
- a = a * c;
-}
-
-STAGE(load_tables) {
- struct Ctx {
- const uint32_t* src;
- const float *r, *g, *b;
- };
- auto c = (const Ctx*)ctx;
-
- auto px = unaligned_load<U32>(c->src + x);
- r = gather(c->r, (px ) & k->_0x000000ff);
- g = gather(c->g, (px >> 8) & k->_0x000000ff);
- b = gather(c->b, (px >> 16) & k->_0x000000ff);
- a = cast( (px >> 24)) * k->_1_255;
-}
-
-STAGE(load_8888) {
- auto ptr = *(const uint32_t**)ctx + x;
-
- auto px = unaligned_load<U32>(ptr);
- r = cast((px ) & k->_0x000000ff) * k->_1_255;
- g = cast((px >> 8) & k->_0x000000ff) * k->_1_255;
- b = cast((px >> 16) & k->_0x000000ff) * k->_1_255;
- a = cast((px >> 24) ) * k->_1_255;
-}
-
-STAGE(store_8888) {
- auto ptr = *(uint32_t**)ctx + x;
-
- U32 px = round(r, k->_255)
- | round(g, k->_255) << 8
- | round(b, k->_255) << 16
- | round(a, k->_255) << 24;
- memcpy(ptr, &px, sizeof(px));
-}
-
-STAGE(load_f16) {
- auto ptr = *(const uint64_t**)ctx + x;
-
-#if !defined(SPLICER) && !defined(JUMPER)
- // TODO:
-#elif defined(__aarch64__)
- auto halfs = vld4_f16((const float16_t*)ptr);
- r = vcvt_f32_f16(halfs.val[0]);
- g = vcvt_f32_f16(halfs.val[1]);
- b = vcvt_f32_f16(halfs.val[2]);
- a = vcvt_f32_f16(halfs.val[3]);
-#elif defined(__ARM_NEON__)
- auto rb_ga = vld2_f16((const float16_t*)ptr);
- auto rb = vcvt_f32_f16(rb_ga.val[0]),
- ga = vcvt_f32_f16(rb_ga.val[1]);
- r = {rb[0], rb[2]};
- g = {ga[0], ga[2]};
- b = {rb[1], rb[3]};
- a = {ga[1], ga[3]};
-#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
- auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
- _23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
- _45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
- _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
-
- auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
- _13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3
- _46 = _mm_unpacklo_epi16(_45, _67),
- _57 = _mm_unpackhi_epi16(_45, _67);
-
- auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
- ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3
- rg4567 = _mm_unpacklo_epi16(_46, _57),
- ba4567 = _mm_unpackhi_epi16(_46, _57);
-
- r = _mm256_cvtph_ps(_mm_unpacklo_epi64(rg0123, rg4567));
- g = _mm256_cvtph_ps(_mm_unpackhi_epi64(rg0123, rg4567));
- b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
- a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
-#elif defined(__SSE2__)
- auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
- _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
-
- auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
- _13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3
-
- auto rg = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
- ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3
-
- auto half_to_float = [&](U32 h) {
- return bit_cast<F>(h << 13) // Line up the mantissa,
- * bit_cast<F>(U32(k->_0x77800000)); // then fix up the exponent.
- };
-
- r = half_to_float(_mm_unpacklo_epi16(rg, _mm_setzero_si128()));
- g = half_to_float(_mm_unpackhi_epi16(rg, _mm_setzero_si128()));
- b = half_to_float(_mm_unpacklo_epi16(ba, _mm_setzero_si128()));
- a = half_to_float(_mm_unpackhi_epi16(ba, _mm_setzero_si128()));
-#endif
-}
-
-STAGE(store_f16) {
- auto ptr = *(uint64_t**)ctx + x;
-
-#if !defined(SPLICER) && !defined(JUMPER)
- // TODO:
-#elif defined(__aarch64__)
- float16x4x4_t halfs = {{
- vcvt_f16_f32(r),
- vcvt_f16_f32(g),
- vcvt_f16_f32(b),
- vcvt_f16_f32(a),
- }};
- vst4_f16((float16_t*)ptr, halfs);
-#elif defined(__ARM_NEON__)
- float16x4x2_t rb_ga = {{
- vcvt_f16_f32(float32x4_t{r[0], b[0], r[1], b[1]}),
- vcvt_f16_f32(float32x4_t{g[0], a[0], g[1], a[1]}),
- }};
- vst2_f16((float16_t*)ptr, rb_ga);
-#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
- auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION),
- G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION),
- B = _mm256_cvtps_ph(b, _MM_FROUND_CUR_DIRECTION),
- A = _mm256_cvtps_ph(a, _MM_FROUND_CUR_DIRECTION);
-
- auto rg0123 = _mm_unpacklo_epi16(R, G), // r0 g0 r1 g1 r2 g2 r3 g3
- rg4567 = _mm_unpackhi_epi16(R, G), // r4 g4 r5 g5 r6 g6 r7 g7
- ba0123 = _mm_unpacklo_epi16(B, A),
- ba4567 = _mm_unpackhi_epi16(B, A);
-
- _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg0123, ba0123));
- _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
- _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
- _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
-#elif defined(__SSE2__)
- auto float_to_half = [&](F f) {
- return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000))) // Fix up the exponent,
- >> 13; // then line up the mantissa.
- };
- U32 R = float_to_half(r),
- G = float_to_half(g),
- B = float_to_half(b),
- A = float_to_half(a);
- U32 rg = R | _mm_slli_si128(G,2),
- ba = B | _mm_slli_si128(A,2);
- _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
- _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
-#endif
-}
-
-static F clamp(const F& v, float limit) {
- F l = bit_cast<F>(bit_cast<U32>(F(limit)) + U32(0xffffffff)); // limit - 1 ulp
- return max(0, min(v, l));
-}
-STAGE(clamp_x) { r = clamp(r, *(const float*)ctx); }
-STAGE(clamp_y) { g = clamp(g, *(const float*)ctx); }
-
-STAGE(matrix_2x3) {
- auto m = (const float*)ctx;
-
- auto R = fma(r,m[0], fma(g,m[2], m[4])),
- G = fma(r,m[1], fma(g,m[3], m[5]));
- r = R;
- g = G;
-}
-STAGE(matrix_3x4) {
- auto m = (const float*)ctx;
-
- auto R = fma(r,m[0], fma(g,m[3], fma(b,m[6], m[ 9]))),
- G = fma(r,m[1], fma(g,m[4], fma(b,m[7], m[10]))),
- B = fma(r,m[2], fma(g,m[5], fma(b,m[8], m[11])));
- r = R;
- g = G;
- b = B;
-}
-
-STAGE(linear_gradient_2stops) {
- struct Ctx { F4 c0, dc; };
- auto c = unaligned_load<Ctx>(ctx);
-
- auto t = r;
- r = fma(t, c.dc[0], c.c0[0]);
- g = fma(t, c.dc[1], c.c0[1]);
- b = fma(t, c.dc[2], c.c0[2]);
- a = fma(t, c.dc[3], c.c0[3]);
-}
diff --git a/src/splicer/build_stages.py b/src/splicer/build_stages.py
deleted file mode 100755
index cb7f6faa18..0000000000
--- a/src/splicer/build_stages.py
+++ /dev/null
@@ -1,122 +0,0 @@
-#!/usr/bin/env python2.7
-#
-# Copyright 2017 Google Inc.
-#
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-import re
-import subprocess
-import sys
-
-sys.stdout = open('src/splicer/SkSplicer_generated.h', 'w')
-
-ndk = '/Users/mtklein/brew/opt/android-ndk/'
-objdump = 'gobjdump'
-
-#ndk = '/home/mtklein/ndk/'
-#objdump = '/home/mtklein/binutils-2.27/binutils/objdump'
-
-cflags = '-std=c++11 -Os -fomit-frame-pointer -DSPLICER'.split()
-
-# This won't stay here long-term.
-# It's a stand-in for building SkSplicer_stages.cpp into Skia.
-portable = '-USPLICER'.split()
-subprocess.check_call(['clang++'] + cflags + portable +
- ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
- ['-o', 'portable.o'])
-
-sse2 = '-mno-red-zone -msse2 -mno-sse3 -mno-ssse3 -mno-sse4.1'.split()
-subprocess.check_call(['clang++'] + cflags + sse2 +
- ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
- ['-o', 'sse2.o'])
-
-# We're not using sse41.o right now, so use it to test -DJUMPER.
-sse41 = '-mno-red-zone -msse4.1 -USPLICER -DJUMPER'.split()
-subprocess.check_call(['clang++'] + cflags + sse41 +
- ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
- ['-o', 'sse41.o'])
-
-
-hsw = '-mno-red-zone -mavx2 -mfma -mf16c'.split()
-subprocess.check_call(['clang++'] + cflags + hsw +
- ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
- ['-o', 'hsw.o'])
-
-aarch64 = [
- '--target=aarch64-linux-android',
- '--sysroot=' + ndk + 'platforms/android-21/arch-arm64',
-]
-subprocess.check_call(['clang++'] + cflags + aarch64 +
- ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
- ['-o', 'aarch64.o'])
-
-armv7 = [
- '--target=armv7a-linux-android',
- '--sysroot=' + ndk + 'platforms/android-18/arch-arm',
- '-mfpu=neon-vfpv4',
- '-mfloat-abi=hard',
-]
-subprocess.check_call(['clang++'] + cflags + armv7 +
- ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
- ['-o', 'armv7.o'])
-
-def parse_object_file(dot_o, array_type, jump, target=None):
- prefix = dot_o.replace('.o', '_')
- cmd = [ objdump, '-d', '--insn-width=8', dot_o]
- if target:
- cmd += ['--target', target]
- for line in subprocess.check_output(cmd).split('\n'):
- line = line.strip()
- if not line or line.startswith(dot_o) or line.startswith('Disassembly'):
- continue
-
- # E.g. 00000000000003a4 <_load_f16>:
- m = re.match('''[0-9a-f]+ <_?(.*)>:''', line)
- if m:
- print 'static const', array_type, prefix + m.group(1) + '[] = {'
- continue
-
- columns = line.split('\t')
- code = columns[1]
- if len(columns) >= 4:
- inst = columns[2]
- args = columns[3]
- else:
- inst, args = columns[2].split(' ', 1)
- code, inst, args = code.strip(), inst.strip(), args.strip()
-
- # We can't splice code that uses ip-relative addressing.
- for arg in args:
- assert 'rip' not in arg # TODO: detect on aarch64 too
-
- # At the end of every stage function there's a jump to next().
- # This marks the splice point.
- if code == jump:
- print '};'
- continue
-
- hexed = ''.join('0x'+x+',' for x in code.split(' '))
- print ' ' + hexed + ' '*(44-len(hexed)) + \
- '// ' + inst + (' '*(14-len(inst)) + args if args else '')
-
-print '''/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifndef SkSplicer_generated_DEFINED
-#define SkSplicer_generated_DEFINED
-
-// This file is generated semi-automatically with this command:
-// $ src/splicer/build_stages.py
-'''
-parse_object_file('aarch64.o', 'unsigned int', '14000000')
-parse_object_file( 'armv7.o', 'unsigned int', 'eafffffe',
- target='elf32-littlearm')
-parse_object_file( 'sse2.o', 'unsigned char', 'e9 00 00 00 00')
-#parse_object_file('sse41.o', 'unsigned char', 'e9 00 00 00 00')
-parse_object_file( 'hsw.o', 'unsigned char', 'e9 00 00 00 00')
-print '#endif//SkSplicer_generated_DEFINED'