author     Mike Klein <mtklein@chromium.org>  2017-01-09 17:21:32 -0500
committer  Skia Commit-Bot <skia-commit-bot@chromium.org>  2017-01-10 16:15:00 +0000
commit     8e619a2b4eb31753e6fcb4a9ec494d31ace755da (patch)
tree       366015a3ecea77d92fbd62477e1afc0be48166d6 /src/splicer
parent     1e74cad9b4ed8079433d4e62ab3198d97436f5ec (diff)
SkSplicer: start on arm64
Seems to be working.  The jump to loop_start might be a little off, but not by much.  Correctness is really still a big TODO.

  $ adb shell 'cd /data/local/tmp; ./monobench SkRasterPipeline 200'
  SkRasterPipeline_… 200
     …f16_compile 1x
     …f16_run 1.42x
     …srgb_compile 2.21x
     …srgb_run 2.59x

Change-Id: I0e1acc6404cf3ce8084d9ef8011cbe0b5f1fd6e3
Reviewed-on: https://skia-review.googlesource.com/6811
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/splicer')
-rw-r--r--  src/splicer/SkSplicer.cpp            130
-rw-r--r--  src/splicer/SkSplicer_generated.h    276
-rw-r--r--  src/splicer/SkSplicer_stages.cpp      97
-rwxr-xr-x  src/splicer/build_stages.py           66
4 files changed, 482 insertions(+), 87 deletions(-)
diff --git a/src/splicer/SkSplicer.cpp b/src/splicer/SkSplicer.cpp
index 167a89a22c..f37b95f445 100644
--- a/src/splicer/SkSplicer.cpp
+++ b/src/splicer/SkSplicer.cpp
@@ -29,38 +29,86 @@ namespace {
12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f, // to_srgb
};
- // Short x86-64 instruction sequences that we'll use as glue to splice together Stages.
- static const uint8_t vzeroupper[] = { 0xc5, 0xf8, 0x77 }; // clear top half of all ymm
- static const uint8_t ret[] = { 0xc3 }; // return
- static const uint8_t movabsq_rcx[] = { 0x48, 0xb9 }; // move next 8 bytes into rcx
- static const uint8_t movabsq_rdx[] = { 0x48, 0xba }; // move next 8 bytes into rdx
- static const uint8_t addq_8_rdi[] = { 0x48, 0x83, 0xc7, 0x08 }; // rdi += 8
- static const uint8_t cmpq_rsi_rdi[] = { 0x48, 0x39, 0xf7 }; // rdi cmp? rsi
- static const uint8_t jb_near[] = { 0x0f, 0x8c }; // jump relative next 4 bytes
- // if cmp set unsigned < bit
-
// We do this a lot, so it's nice to infer the correct size. Works fine with arrays.
template <typename T>
- void splice(SkWStream* stream, const T& val) {
- stream->write(&val, sizeof(val));
+ static void splice(SkWStream* buf, const T& val) {
+ buf->write(&val, sizeof(val));
}
+#if defined(__aarch64__)
+ static constexpr int kStride = 4;
+ static void set_k(SkWStream* buf, const SkSplicer_constants* k) {
+ uint16_t parts[4];
+ memcpy(parts, &k, 8);
+ splice(buf, 0xd2e00000 | (parts[3] << 5) | 0x3); // move 16-bit intermediate << 48 into x3
+ splice(buf, 0xf2c00000 | (parts[2] << 5) | 0x3); // merge 16-bit intermediate << 32 into x3
+ splice(buf, 0xf2a00000 | (parts[1] << 5) | 0x3); // merge 16-bit intermediate << 16 into x3
+ splice(buf, 0xf2800000 | (parts[0] << 5) | 0x3); // merge 16-bit intermediate << 0 into x3
+ }
+ static void set_ctx(SkWStream* buf, void* ctx) {
+ uint16_t parts[4];
+ memcpy(parts, &ctx, 8);
+ splice(buf, 0xd2e00000 | (parts[3] << 5) | 0x2); // move 16-bit intermediate << 48 into x2
+ splice(buf, 0xf2c00000 | (parts[2] << 5) | 0x2); // merge 16-bit intermediate << 32 into x2
+ splice(buf, 0xf2a00000 | (parts[1] << 5) | 0x2); // merge 16-bit intermediate << 16 into x2
+ splice(buf, 0xf2800000 | (parts[0] << 5) | 0x2); // merge 16-bit intermediate << 0 into x2
+ }
+ static void loop(SkWStream* buf, int loop_start) {
+ splice(buf, 0x91001000); // add x0, x0, #4
+ splice(buf, 0xeb01001f); // cmp x0, x1
+ int off = loop_start - (int)(buf->bytesWritten() + 4); // TODO: check that this is right
+ off /= 4; // bytes -> instructions, still signed
+ off = (off & 0x7ffff) << 5; // 19 bit maximum range (+- 256K instructions)
+ splice(buf, 0x54000003 | off); // b.cc loop_start (cc == "carry clear", unsigned less than)
+ }
+ static void ret(SkWStream* buf) {
+ splice(buf, 0xd65f03c0); // ret
+ }
+#else
+ static constexpr int kStride = 8;
+ static void set_k(SkWStream* buf, const SkSplicer_constants* k) {
+ static const uint8_t movabsq_rcx[] = { 0x48, 0xb9 };
+ splice(buf, movabsq_rcx); // movabsq <next 8 bytes>, %rcx
+ splice(buf, k);
+ }
+ static void set_ctx(SkWStream* buf, void* ctx) {
+ static const uint8_t movabsq_rdx[] = { 0x48, 0xba };
+ splice(buf, movabsq_rdx); // movabsq <next 8 bytes>, %rdx
+ splice(buf, ctx);
+ }
+ static void loop(SkWStream* buf, int loop_start) {
+ static const uint8_t addq_8_rdi[] = { 0x48, 0x83, 0xc7, 0x08 };
+ static const uint8_t cmp_rsi_rdi[] = { 0x48, 0x39, 0xf7 };
+ static const uint8_t jb_near[] = { 0x0f, 0x8c };
+ splice(buf, addq_8_rdi); // addq $8, %rdi
+ splice(buf, cmp_rsi_rdi); // cmp %rsi, %rdi
+ splice(buf, jb_near); // jb <next 4 bytes> (b == "before", unsigned less than)
+ splice(buf, loop_start - (int)(buf->bytesWritten() + 4));
+ }
+ static void ret(SkWStream* buf) {
+ static const uint8_t vzeroupper[] = { 0xc5, 0xf8, 0x77 };
+ static const uint8_t ret[] = { 0xc3 };
+ splice(buf, vzeroupper);
+ splice(buf, ret);
+ }
+#endif
+
#ifdef IACA_DUMP
static const uint8_t ud2[] = { 0x0f, 0x0b }; // undefined... crashes when run
static const uint8_t nop3[] = { 0x64, 0x67, 0x90 }; // 3 byte no-op
static const uint8_t movl_ebx[] = { 0xbb }; // move next 4 bytes into ebx
- static void iaca_start(SkWStream* stream) {
- splice(stream, ud2);
- splice(stream, movl_ebx);
- splice(stream, 111);
- splice(stream, nop3);
+ static void iaca_start(SkWStream* buf) {
+ splice(buf, ud2);
+ splice(buf, movl_ebx);
+ splice(buf, 111);
+ splice(buf, nop3);
}
- static void iaca_end(SkWStream* stream) {
- splice(stream, movl_ebx);
- splice(stream, 222);
- splice(stream, nop3);
- splice(stream, ud2);
+ static void iaca_end(SkWStream* buf) {
+ splice(buf, movl_ebx);
+ splice(buf, 222);
+ splice(buf, nop3);
+ splice(buf, ud2);
}
#else
static void iaca_start(SkWStream*) {}
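
A note on set_k()/set_ctx() above: with no x86-style movabsq available, AArch64 materializes a 64-bit pointer as one movz (write 16 bits, zero the rest) followed by three movk instructions (merge 16 bits at a shift). The constants 0xd2e00000/0xf2c00000/0xf2a00000/0xf2800000 are those opcodes with hw = 3, 2, 1, 0; the 16-bit immediate lands in bits 20:5 and the destination register in bits 4:0. A minimal standalone sketch of the same encoding (mov64() is a hypothetical helper, not part of this CL):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Encode "mov xN, #imm64" as movz + 3*movk, low halfword merged last.
    static std::vector<uint32_t> mov64(uint32_t xN, uint64_t imm) {
        uint16_t parts[4];
        memcpy(parts, &imm, sizeof(imm));                // parts[0] is the low 16 bits
        return {
            0xd2e00000u | (uint32_t)parts[3] << 5 | xN,  // movz xN, #parts[3], lsl #48
            0xf2c00000u | (uint32_t)parts[2] << 5 | xN,  // movk xN, #parts[2], lsl #32
            0xf2a00000u | (uint32_t)parts[1] << 5 | xN,  // movk xN, #parts[1], lsl #16
            0xf2800000u | (uint32_t)parts[0] << 5 | xN,  // movk xN, #parts[0], lsl #0
        };
    }
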
@@ -87,32 +135,32 @@ namespace {
Spliced(const SkRasterPipeline::Stage* stages, int nstages) {
// We always create a backup interpreter pipeline,
// - to handle any program we can't, and
- // - to handle the n < 8 tails.
+ // - to handle the n < kStride tails.
fBackup = SkOpts::compile_pipeline(stages, nstages);
fSplicedLen = 0;
fSpliced = nullptr;
// If we return early anywhere in here, !fSpliced means we'll use fBackup instead.
+ #if !defined(__aarch64__)
// To keep things simple, only one target supported: Haswell+ x86-64.
if (!SkCpu::Supports(SkCpu::HSW) || sizeof(void*) != 8) {
return;
}
+ #endif
SkDynamicMemoryWStream buf;
- // Put the address of kConstants in rcx, Stage argument 4 "k".
- splice(&buf, movabsq_rcx);
- splice(&buf, &kConstants);
+ // Put the address of kConstants in rcx/x3, Stage argument 4 "k".
+ set_k(&buf, &kConstants);
- // We'll loop back to here as long as x<n after x+=8.
+ // We'll loop back to here as long as x<n after x += kStride.
iaca_start(&buf);
auto loop_start = buf.bytesWritten(); // Think of this like a label, loop_start:
for (int i = 0; i < nstages; i++) {
- // If a stage has a context pointer, load it into rdx, Stage argument 3 "ctx".
+ // If a stage has a context pointer, load it into rdx/x2, Stage argument 3 "ctx".
if (stages[i].ctx) {
- splice(&buf, movabsq_rdx);
- splice(&buf, stages[i].ctx);
+ set_ctx(&buf, stages[i].ctx);
}
// Splice in the code for the Stages, generated offline into SkSplicer_generated.h.
@@ -144,16 +192,9 @@ namespace {
}
}
- // See if we should loop back to handle more pixels.
- splice(&buf, addq_8_rdi); // x += 8
- splice(&buf, cmpq_rsi_rdi); // if (x < n)
- splice(&buf, jb_near); // goto loop_start;
- splice(&buf, (int)loop_start - (int)(buf.bytesWritten() + 4));
+ loop(&buf, loop_start); // Loop back to handle more pixels if not done.
iaca_end(&buf);
-
- // Nope! We're done.
- splice(&buf, vzeroupper);
- splice(&buf, ret);
+ ret(&buf); // We're done.
auto data = buf.detachAsData();
fSplicedLen = data->size();
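
A note on the branch spliced by loop(): AArch64 b.cond packs a signed 19-bit instruction count into bits 23:5, and that count is relative to the address of the branch instruction itself — unlike x86, where the jb rel32 offset is relative to the end of the instruction (which is why the x86 loop() subtracts bytesWritten() + 4). That difference is the likely source of the "jump might be a little off" caveat in the commit message. A sketch of the encoding under that reading (byte addresses in, hypothetical helper):

    #include <cstdint>

    // Encode "b.cc target" for a branch that will live at branch_addr.
    static uint32_t bcc(int32_t branch_addr, int32_t target_addr) {
        int32_t off = (target_addr - branch_addr) / 4;        // bytes -> instructions
        return 0x54000003u | ((uint32_t)off & 0x7ffff) << 5;  // cond 0x3 == cc/lo
    }
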
@@ -175,14 +216,15 @@ namespace {
// Here's where we call fSpliced if we created it, fBackup if not.
void operator()(size_t x, size_t y, size_t n) const {
- // TODO: The looping logic is probably not correct for handling n<8 tails.
- if (fSpliced) {
+ // TODO: The looping logic is probably not correct for n < kStride tails or x != 0.
+
+ size_t body = n/kStride*kStride; // Largest multiple of kStride (4 or 8) <= n.
+ if (fSpliced && body) { // Can we run fSpliced for at least one kStride?
// TODO: At some point we will want to pass in y...
using Fn = void(size_t x, size_t n);
- ((Fn*)fSpliced)(x,n);
+ ((Fn*)fSpliced)(x,body);
- // Fall through to fBackup for any n<8 last pixels.
- size_t body = n/8*8;
+ // Fall through to fBackup for any n<kStride last pixels.
x += body;
n -= body;
}
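
The body/tail split in operator() is easiest to see with concrete numbers: with kStride == 4 (aarch64) and n == 10, body == 8, so fSpliced's loop runs at x == 0 and x == 4, and fBackup mops up the 2-pixel tail at x == 8. A sketch of just that bookkeeping, assuming x starts at 0:

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t kStride = 4;
        size_t x = 0, n = 10;
        size_t body = n/kStride*kStride;    // 8, largest multiple of kStride <= n
        for (; x < body; x += kStride) {    // what the spliced loop covers
            printf("spliced: pixels %zu..%zu\n", x, x + kStride - 1);
        }
        if (x < n) {                        // what falls through to fBackup
            printf("backup:  pixels %zu..%zu\n", x, n - 1);
        }
        return 0;
    }
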
diff --git a/src/splicer/SkSplicer_generated.h b/src/splicer/SkSplicer_generated.h
index ab045839d6..f5f07a2101 100644
--- a/src/splicer/SkSplicer_generated.h
+++ b/src/splicer/SkSplicer_generated.h
@@ -11,6 +11,279 @@
// This file is generated semi-automatically with this command:
// $ src/splicer/build_stages.py > src/splicer/SkSplicer_generated.h
+#if defined(__aarch64__)
+
+static const unsigned int kSplice_clear[] = {
+ 0x6f00e400, // movi v0.2d, #0x0
+ 0x6f00e401, // movi v1.2d, #0x0
+ 0x6f00e402, // movi v2.2d, #0x0
+ 0x6f00e403, // movi v3.2d, #0x0
+};
+static const unsigned int kSplice_plus[] = {
+ 0x4e24d400, // fadd v0.4s, v0.4s, v4.4s
+ 0x4e25d421, // fadd v1.4s, v1.4s, v5.4s
+ 0x4e26d442, // fadd v2.4s, v2.4s, v6.4s
+ 0x4e27d463, // fadd v3.4s, v3.4s, v7.4s
+};
+static const unsigned int kSplice_srcover[] = {
+ 0x91001068, // add x8, x3, #0x4
+ 0x4d40c910, // ld1r {v16.4s}, [x8]
+ 0x4ea3d610, // fsub v16.4s, v16.4s, v3.4s
+ 0x4e24ce00, // fmla v0.4s, v16.4s, v4.4s
+ 0x4e25ce01, // fmla v1.4s, v16.4s, v5.4s
+ 0x4e26ce02, // fmla v2.4s, v16.4s, v6.4s
+ 0x4e27ce03, // fmla v3.4s, v16.4s, v7.4s
+};
+static const unsigned int kSplice_dstover[] = {
+ 0x91001068, // add x8, x3, #0x4
+ 0x4d40c910, // ld1r {v16.4s}, [x8]
+ 0x4ea7d610, // fsub v16.4s, v16.4s, v7.4s
+ 0x4e20ce04, // fmla v4.4s, v16.4s, v0.4s
+ 0x4e21ce05, // fmla v5.4s, v16.4s, v1.4s
+ 0x4e22ce06, // fmla v6.4s, v16.4s, v2.4s
+ 0x4e23ce07, // fmla v7.4s, v16.4s, v3.4s
+};
+static const unsigned int kSplice_clamp_0[] = {
+ 0x6f00e410, // movi v16.2d, #0x0
+ 0x4e30f400, // fmax v0.4s, v0.4s, v16.4s
+ 0x4e30f421, // fmax v1.4s, v1.4s, v16.4s
+ 0x4e30f442, // fmax v2.4s, v2.4s, v16.4s
+ 0x4e30f463, // fmax v3.4s, v3.4s, v16.4s
+};
+static const unsigned int kSplice_clamp_1[] = {
+ 0x91001068, // add x8, x3, #0x4
+ 0x4d40c910, // ld1r {v16.4s}, [x8]
+ 0x4eb0f400, // fmin v0.4s, v0.4s, v16.4s
+ 0x4eb0f421, // fmin v1.4s, v1.4s, v16.4s
+ 0x4eb0f442, // fmin v2.4s, v2.4s, v16.4s
+ 0x4eb0f463, // fmin v3.4s, v3.4s, v16.4s
+};
+static const unsigned int kSplice_clamp_a[] = {
+ 0x91001068, // add x8, x3, #0x4
+ 0x4d40c910, // ld1r {v16.4s}, [x8]
+ 0x4eb0f463, // fmin v3.4s, v3.4s, v16.4s
+ 0x4ea3f400, // fmin v0.4s, v0.4s, v3.4s
+ 0x4ea3f421, // fmin v1.4s, v1.4s, v3.4s
+ 0x4ea3f442, // fmin v2.4s, v2.4s, v3.4s
+};
+static const unsigned int kSplice_swap[] = {
+ 0x4ea31c70, // mov v16.16b, v3.16b
+ 0x4ea21c51, // mov v17.16b, v2.16b
+ 0x4ea11c32, // mov v18.16b, v1.16b
+ 0x4ea01c13, // mov v19.16b, v0.16b
+ 0x4ea41c80, // mov v0.16b, v4.16b
+ 0x4ea51ca1, // mov v1.16b, v5.16b
+ 0x4ea61cc2, // mov v2.16b, v6.16b
+ 0x4ea71ce3, // mov v3.16b, v7.16b
+ 0x4eb31e64, // mov v4.16b, v19.16b
+ 0x4eb21e45, // mov v5.16b, v18.16b
+ 0x4eb11e26, // mov v6.16b, v17.16b
+ 0x4eb01e07, // mov v7.16b, v16.16b
+};
+static const unsigned int kSplice_move_src_dst[] = {
+ 0x4ea01c04, // mov v4.16b, v0.16b
+ 0x4ea11c25, // mov v5.16b, v1.16b
+ 0x4ea21c46, // mov v6.16b, v2.16b
+ 0x4ea31c67, // mov v7.16b, v3.16b
+};
+static const unsigned int kSplice_move_dst_src[] = {
+ 0x4ea41c80, // mov v0.16b, v4.16b
+ 0x4ea51ca1, // mov v1.16b, v5.16b
+ 0x4ea61cc2, // mov v2.16b, v6.16b
+ 0x4ea71ce3, // mov v3.16b, v7.16b
+};
+static const unsigned int kSplice_premul[] = {
+ 0x6e23dc00, // fmul v0.4s, v0.4s, v3.4s
+ 0x6e23dc21, // fmul v1.4s, v1.4s, v3.4s
+ 0x6e23dc42, // fmul v2.4s, v2.4s, v3.4s
+};
+static const unsigned int kSplice_unpremul[] = {
+ 0x91001068, // add x8, x3, #0x4
+ 0x4d40c910, // ld1r {v16.4s}, [x8]
+ 0x4ea0d871, // fcmeq v17.4s, v3.4s, #0.0
+ 0x6e23fe10, // fdiv v16.4s, v16.4s, v3.4s
+ 0x4e711e10, // bic v16.16b, v16.16b, v17.16b
+ 0x6e20de00, // fmul v0.4s, v16.4s, v0.4s
+ 0x6e21de01, // fmul v1.4s, v16.4s, v1.4s
+ 0x6e22de02, // fmul v2.4s, v16.4s, v2.4s
+};
+static const unsigned int kSplice_from_srgb[] = {
+ 0x91005068, // add x8, x3, #0x14
+ 0x4d40c910, // ld1r {v16.4s}, [x8]
+ 0x91004068, // add x8, x3, #0x10
+ 0x4d40c911, // ld1r {v17.4s}, [x8]
+ 0x2d434c72, // ldp s18, s19, [x3,#24]
+ 0x6e22dc54, // fmul v20.4s, v2.4s, v2.4s
+ 0x4eb01e15, // mov v21.16b, v16.16b
+ 0x4eb01e17, // mov v23.16b, v16.16b
+ 0x4f921050, // fmla v16.4s, v2.4s, v18.s[0]
+ 0x4eb11e36, // mov v22.16b, v17.16b
+ 0x4eb11e38, // mov v24.16b, v17.16b
+ 0x4e34ce11, // fmla v17.4s, v16.4s, v20.4s
+ 0x6e20dc10, // fmul v16.4s, v0.4s, v0.4s
+ 0x91008068, // add x8, x3, #0x20
+ 0x4f921015, // fmla v21.4s, v0.4s, v18.s[0]
+ 0x4e30ceb6, // fmla v22.4s, v21.4s, v16.4s
+ 0x4d40c910, // ld1r {v16.4s}, [x8]
+ 0x6e21dc34, // fmul v20.4s, v1.4s, v1.4s
+ 0x4f921037, // fmla v23.4s, v1.4s, v18.s[0]
+ 0x4f939015, // fmul v21.4s, v0.4s, v19.s[0]
+ 0x4f939032, // fmul v18.4s, v1.4s, v19.s[0]
+ 0x4f939053, // fmul v19.4s, v2.4s, v19.s[0]
+ 0x6ea0e600, // fcmgt v0.4s, v16.4s, v0.4s
+ 0x6ea1e601, // fcmgt v1.4s, v16.4s, v1.4s
+ 0x6ea2e602, // fcmgt v2.4s, v16.4s, v2.4s
+ 0x4e34cef8, // fmla v24.4s, v23.4s, v20.4s
+ 0x6e761ea0, // bsl v0.16b, v21.16b, v22.16b
+ 0x6e781e41, // bsl v1.16b, v18.16b, v24.16b
+ 0x6e711e62, // bsl v2.16b, v19.16b, v17.16b
+};
+static const unsigned int kSplice_to_srgb[] = {
+ 0x6ea1d810, // frsqrte v16.4s, v0.4s
+ 0x6ea1d835, // frsqrte v21.4s, v1.4s
+ 0x6e30de17, // fmul v23.4s, v16.4s, v16.4s
+ 0x6ea1d856, // frsqrte v22.4s, v2.4s
+ 0x6e35deb9, // fmul v25.4s, v21.4s, v21.4s
+ 0x4eb7fc17, // frsqrts v23.4s, v0.4s, v23.4s
+ 0x9100c068, // add x8, x3, #0x30
+ 0x6e36deda, // fmul v26.4s, v22.4s, v22.4s
+ 0x4eb9fc39, // frsqrts v25.4s, v1.4s, v25.4s
+ 0x6e37de10, // fmul v16.4s, v16.4s, v23.4s
+ 0x2d44c871, // ldp s17, s18, [x3,#36]
+ 0x4d40c914, // ld1r {v20.4s}, [x8]
+ 0x4ebafc5a, // frsqrts v26.4s, v2.4s, v26.4s
+ 0x6e39deb5, // fmul v21.4s, v21.4s, v25.4s
+ 0x4ea1da17, // frecpe v23.4s, v16.4s
+ 0xbd402c73, // ldr s19, [x3,#44]
+ 0x9100d068, // add x8, x3, #0x34
+ 0x6e3aded6, // fmul v22.4s, v22.4s, v26.4s
+ 0x4ea1dabb, // frecpe v27.4s, v21.4s
+ 0x4e37fe1d, // frecps v29.4s, v16.4s, v23.4s
+ 0x4d40c918, // ld1r {v24.4s}, [x8]
+ 0x4ea1dadc, // frecpe v28.4s, v22.4s
+ 0x6e3ddef7, // fmul v23.4s, v23.4s, v29.4s
+ 0x4e3bfebd, // frecps v29.4s, v21.4s, v27.4s
+ 0x6e3ddf7b, // fmul v27.4s, v27.4s, v29.4s
+ 0x4e3cfedd, // frecps v29.4s, v22.4s, v28.4s
+ 0x6e3ddf9c, // fmul v28.4s, v28.4s, v29.4s
+ 0x4eb41e9d, // mov v29.16b, v20.16b
+ 0x6ea1da19, // frsqrte v25.4s, v16.4s
+ 0x4f9312fd, // fmla v29.4s, v23.4s, v19.s[0]
+ 0x4eb41e97, // mov v23.16b, v20.16b
+ 0x4f91901a, // fmul v26.4s, v0.4s, v17.s[0]
+ 0x4f931377, // fmla v23.4s, v27.4s, v19.s[0]
+ 0x6ea1dabb, // frsqrte v27.4s, v21.4s
+ 0x4f931394, // fmla v20.4s, v28.4s, v19.s[0]
+ 0x4f919033, // fmul v19.4s, v1.4s, v17.s[0]
+ 0x4f919051, // fmul v17.4s, v2.4s, v17.s[0]
+ 0x6ea0e700, // fcmgt v0.4s, v24.4s, v0.4s
+ 0x6ea1e701, // fcmgt v1.4s, v24.4s, v1.4s
+ 0x6ea2e702, // fcmgt v2.4s, v24.4s, v2.4s
+ 0x6e39df38, // fmul v24.4s, v25.4s, v25.4s
+ 0x6ea1dadc, // frsqrte v28.4s, v22.4s
+ 0x4eb8fe10, // frsqrts v16.4s, v16.4s, v24.4s
+ 0x6e3bdf78, // fmul v24.4s, v27.4s, v27.4s
+ 0x4eb8feb5, // frsqrts v21.4s, v21.4s, v24.4s
+ 0x6e3cdf98, // fmul v24.4s, v28.4s, v28.4s
+ 0x91001068, // add x8, x3, #0x4
+ 0x4eb8fed6, // frsqrts v22.4s, v22.4s, v24.4s
+ 0x4d40c918, // ld1r {v24.4s}, [x8]
+ 0x6e30df30, // fmul v16.4s, v25.4s, v16.4s
+ 0x6e35df75, // fmul v21.4s, v27.4s, v21.4s
+ 0x6e36df96, // fmul v22.4s, v28.4s, v22.4s
+ 0x4f92121d, // fmla v29.4s, v16.4s, v18.s[0]
+ 0x4f9212b7, // fmla v23.4s, v21.4s, v18.s[0]
+ 0x4f9212d4, // fmla v20.4s, v22.4s, v18.s[0]
+ 0x4ebdf710, // fmin v16.4s, v24.4s, v29.4s
+ 0x4eb7f712, // fmin v18.4s, v24.4s, v23.4s
+ 0x4eb4f714, // fmin v20.4s, v24.4s, v20.4s
+ 0x6e701f40, // bsl v0.16b, v26.16b, v16.16b
+ 0x6e721e61, // bsl v1.16b, v19.16b, v18.16b
+ 0x6e741e22, // bsl v2.16b, v17.16b, v20.16b
+};
+static const unsigned int kSplice_scale_u8[] = {
+ 0xf9400048, // ldr x8, [x2]
+ 0xbd400c71, // ldr s17, [x3,#12]
+ 0x8b000108, // add x8, x8, x0
+ 0x39400109, // ldrb w9, [x8]
+ 0x3940050a, // ldrb w10, [x8,#1]
+ 0x4e021d30, // mov v16.h[0], w9
+ 0x39400909, // ldrb w9, [x8,#2]
+ 0x39400d08, // ldrb w8, [x8,#3]
+ 0x4e061d50, // mov v16.h[1], w10
+ 0x4e0a1d30, // mov v16.h[2], w9
+ 0x4e0e1d10, // mov v16.h[3], w8
+ 0x2f07b7f0, // bic v16.4h, #0xff, lsl #8
+ 0x2f10a610, // uxtl v16.4s, v16.4h
+ 0x6e21da10, // ucvtf v16.4s, v16.4s
+ 0x4f919210, // fmul v16.4s, v16.4s, v17.s[0]
+ 0x6e20de00, // fmul v0.4s, v16.4s, v0.4s
+ 0x6e21de01, // fmul v1.4s, v16.4s, v1.4s
+ 0x6e22de02, // fmul v2.4s, v16.4s, v2.4s
+ 0x6e23de03, // fmul v3.4s, v16.4s, v3.4s
+};
+static const unsigned int kSplice_load_8888[] = {
+ 0xf9400048, // ldr x8, [x2]
+ 0xd37ef409, // lsl x9, x0, #2
+ 0x4d40c860, // ld1r {v0.4s}, [x3]
+ 0xbd400c63, // ldr s3, [x3,#12]
+ 0x3ce96901, // ldr q1, [x8,x9]
+ 0x4e211c02, // and v2.16b, v0.16b, v1.16b
+ 0x6f380430, // ushr v16.4s, v1.4s, #8
+ 0x6f300431, // ushr v17.4s, v1.4s, #16
+ 0x6f280421, // ushr v1.4s, v1.4s, #24
+ 0x4e21d842, // scvtf v2.4s, v2.4s
+ 0x4e301c10, // and v16.16b, v0.16b, v16.16b
+ 0x4e311c11, // and v17.16b, v0.16b, v17.16b
+ 0x4e21d832, // scvtf v18.4s, v1.4s
+ 0x4f839040, // fmul v0.4s, v2.4s, v3.s[0]
+ 0x4e21da01, // scvtf v1.4s, v16.4s
+ 0x4e21da22, // scvtf v2.4s, v17.4s
+ 0x4f839021, // fmul v1.4s, v1.4s, v3.s[0]
+ 0x4f839042, // fmul v2.4s, v2.4s, v3.s[0]
+ 0x4f839243, // fmul v3.4s, v18.4s, v3.s[0]
+};
+static const unsigned int kSplice_store_8888[] = {
+ 0xbd400870, // ldr s16, [x3,#8]
+ 0xf9400048, // ldr x8, [x2]
+ 0xd37ef409, // lsl x9, x0, #2
+ 0x4f909032, // fmul v18.4s, v1.4s, v16.s[0]
+ 0x4f909011, // fmul v17.4s, v0.4s, v16.s[0]
+ 0x6e21aa52, // fcvtnu v18.4s, v18.4s
+ 0x6e21aa31, // fcvtnu v17.4s, v17.4s
+ 0x4f285652, // shl v18.4s, v18.4s, #8
+ 0x4eb11e51, // orr v17.16b, v18.16b, v17.16b
+ 0x4f909052, // fmul v18.4s, v2.4s, v16.s[0]
+ 0x4f909070, // fmul v16.4s, v3.4s, v16.s[0]
+ 0x6e21aa52, // fcvtnu v18.4s, v18.4s
+ 0x6e21aa10, // fcvtnu v16.4s, v16.4s
+ 0x4f305652, // shl v18.4s, v18.4s, #16
+ 0x4eb21e31, // orr v17.16b, v17.16b, v18.16b
+ 0x4f385610, // shl v16.4s, v16.4s, #24
+ 0x4eb01e30, // orr v16.16b, v17.16b, v16.16b
+ 0x3ca96910, // str q16, [x8,x9]
+};
+static const unsigned int kSplice_load_f16[] = {
+ 0xf9400048, // ldr x8, [x2]
+ 0x8b000d08, // add x8, x8, x0, lsl #3
+ 0x0c400510, // ld4 {v16.4h-v19.4h}, [x8]
+ 0x0e217a00, // fcvtl v0.4s, v16.4h
+ 0x0e217a21, // fcvtl v1.4s, v17.4h
+ 0x0e217a42, // fcvtl v2.4s, v18.4h
+ 0x0e217a63, // fcvtl v3.4s, v19.4h
+};
+static const unsigned int kSplice_store_f16[] = {
+ 0xf9400048, // ldr x8, [x2]
+ 0x0e216810, // fcvtn v16.4h, v0.4s
+ 0x0e216831, // fcvtn v17.4h, v1.4s
+ 0x0e216852, // fcvtn v18.4h, v2.4s
+ 0x8b000d08, // add x8, x8, x0, lsl #3
+ 0x0e216873, // fcvtn v19.4h, v3.4s
+ 0x0c000510, // st4 {v16.4h-v19.4h}, [x8]
+};
+
+#else
+
static const unsigned char kSplice_clear[] = {
0xc5,0xfc,0x57,0xc0, // vxorps %ymm0, %ymm0, %ymm0
0xc5,0xf4,0x57,0xc9, // vxorps %ymm1, %ymm1, %ymm1
@@ -255,4 +528,7 @@ static const unsigned char kSplice_store_f16[] = {
0xc4,0x41,0x39,0x6a,0xc2, // vpunpckhdq %xmm10, %xmm8, %xmm8
0xc5,0x7a,0x7f,0x44,0xf8,0x30, // vmovdqu %xmm8, 0x30(%rax,%rdi,8)
};
+
+#endif
+
#endif//SkSplicer_generated_DEFINED
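
A note on the recurring "add x8, x3, #imm" / "ld1r {vN.4s}, [x8]" pairs in the aarch64 tables above: x3 holds the address of the SkSplicer_constants block, and ld1r loads the single float at that offset and replicates it across all four lanes. With NEON intrinsics, a rough standalone equivalent (hypothetical helper, for illustration only):

    #include <arm_neon.h>

    // Broadcast the float at k + offset into all 4 lanes, as ld1r does.
    static float32x4_t broadcast_constant(const char* k /* x3 */, int offset) {
        return vld1q_dup_f32((const float*)(k + offset));
    }
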
diff --git a/src/splicer/SkSplicer_stages.cpp b/src/splicer/SkSplicer_stages.cpp
index f3608d47c1..0e3290d5d6 100644
--- a/src/splicer/SkSplicer_stages.cpp
+++ b/src/splicer/SkSplicer_stages.cpp
@@ -6,52 +6,71 @@
*/
#include "SkSplicer_shared.h"
-#include <immintrin.h>
#include <string.h>
-#if !defined(__clang__) || !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
- #error This file is not like the rest of Skia.
- #error It must be compiled with clang and with -mavx2 -mfma -mf16c -fomit-frame-pointer.
+#if !defined(__clang__)
+ #error This file is not like the rest of Skia. It must be compiled with clang.
#endif
// We have very specific inlining requirements. It helps to just take total control.
#define AI __attribute__((always_inline)) inline
+#if defined(__aarch64__)
+ #include <arm_neon.h>
+
+ // Since we know we're using Clang, we can use its vector extensions.
+ using F = float __attribute__((ext_vector_type(4)));
+ using I32 = int32_t __attribute__((ext_vector_type(4)));
+ using U32 = uint32_t __attribute__((ext_vector_type(4)));
+ using U8 = uint8_t __attribute__((ext_vector_type(4)));
+
+ // We polyfill a few routines that Clang doesn't build into ext_vector_types.
+ AI static U32 round(F v) { return vcvtnq_u32_f32(v); }
+ AI static F min(F a, F b) { return vminq_f32(a,b); }
+ AI static F max(F a, F b) { return vmaxq_f32(a,b); }
+ AI static F fma(F f, F m, F a) { return vfmaq_f32(a,f,m); }
+ AI static F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; }
+ AI static F rsqrt(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
+ AI static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
+#else
+ #if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
+ #error On x86, compile with -mavx2 -mfma -mf16c.
+ #endif
+ #include <immintrin.h>
+
+ // These are __m256 and __m256i, but friendlier and strongly-typed.
+ using F = float __attribute__((ext_vector_type(8)));
+ using I32 = int32_t __attribute__((ext_vector_type(8)));
+ using U32 = uint32_t __attribute__((ext_vector_type(8)));
+ using U8 = uint8_t __attribute__((ext_vector_type(8)));
+
+ AI static U32 round(F v) { return _mm256_cvtps_epi32(v); }
+ AI static F min(F a, F b) { return _mm256_min_ps (a,b); }
+ AI static F max(F a, F b) { return _mm256_max_ps (a,b); }
+ AI static F fma(F f, F m, F a) { return _mm256_fmadd_ps(f,m,a);}
+ AI static F rcp (F v) { return _mm256_rcp_ps (v); }
+ AI static F rsqrt(F v) { return _mm256_rsqrt_ps (v); }
+ AI static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
+#endif
+
+AI static F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
+AI static U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
+
// We'll be compiling this file to an object file, then extracting parts of it into
// SkSplicer_generated.h. It's easier to do if the function names are not C++ mangled.
#define C extern "C"
-// Since we know we're using Clang, we can use its vector extensions.
-// These are __m256 and __m256i, but friendlier and strongly-typed.
-using F = float __attribute__((ext_vector_type(8)));
-using I32 = int32_t __attribute__((ext_vector_type(8)));
-using U32 = uint32_t __attribute__((ext_vector_type(8)));
-using U8 = uint8_t __attribute__((ext_vector_type(8)));
-
-// We polyfill a few routines that Clang doesn't build into ext_vector_types.
-AI static F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
-AI static U32 round (F v) { return _mm256_cvtps_epi32(v); }
-AI static U32 expand(U8 v) { return __builtin_convertvector(v, U32); }
-
-AI static F rcp (F v) { return _mm256_rcp_ps (v); }
-AI static F rsqrt(F v) { return _mm256_rsqrt_ps(v); }
-AI static F min (F a, F b) { return _mm256_min_ps (a,b); }
-AI static F max (F a, F b) { return _mm256_max_ps (a,b); }
-AI static F fma (F f, F m, F a) { return _mm256_fmadd_ps(f,m,a); }
-
-AI static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
-
// Stages all fit a common interface that allows SkSplicer to splice them together.
using K = const SkSplicer_constants;
using Stage = void(size_t x, size_t n, void* ctx, K* constants, F,F,F,F, F,F,F,F);
// Stage's arguments act as the working set of registers within the final spliced function.
-// Here's a little primer on the ABI:
-// x: rdi x and n work to drive the loop, like for (; x < n; x += 8)
-// n: rsi
-// ctx: rdx Look for movabsq_rdx in SkSplicer.cpp to see how this works.
-// constants: rcx Look for movabsq_rcx in SkSplicer.cpp to see how this works.
-// vectors: ymm0-ymm7
+// Here's a little primer on the x86-64/aarch64 ABIs:
+// x: rdi/x0 x and n work to drive the loop, like for (; x < n; x += 4 or 8)
+// n: rsi/x1
+// ctx: rdx/x2 Look for set_ctx() in SkSplicer.cpp to see how this works.
+// constants: rcx/x3 Look for set_k() in SkSplicer.cpp to see how this works.
+// vectors: ymm0-ymm7/v0-v7
// done() is the key to this entire splicing strategy.
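
A note on the NEON rcp()/rsqrt() polyfills above: where the x86 path uses _mm256_rcp_ps/_mm256_rsqrt_ps directly, vrecpeq_f32/vrsqrteq_f32 only produce a rough estimate, so each polyfill folds in one Newton-Raphson step — vrecpsq_f32(v,e) computes 2 - v*e, and vrsqrtsq_f32(v,e*e) computes (3 - v*e*e)/2. Extra steps buy accuracy at the cost of speed; in this file's notation, a hypothetical two-step variant (not part of this CL) would look like:

    AI static F rcp2(F v) {
        auto e = vrecpeq_f32(v);        // ~8-bit estimate of 1/v
        e = vrecpsq_f32(v, e) * e;      // one Newton-Raphson step: e *= (2 - v*e)
        return vrecpsq_f32(v, e) * e;   // and one more for extra precision
    }
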
@@ -231,6 +250,13 @@ STAGE(store_8888) {
STAGE(load_f16) {
auto ptr = *(const uint64_t**)ctx + x;
+#if defined(__aarch64__)
+ auto halfs = vld4_f16((const float16_t*)ptr);
+ r = vcvt_f32_f16(halfs.val[0]);
+ g = vcvt_f32_f16(halfs.val[1]);
+ b = vcvt_f32_f16(halfs.val[2]);
+ a = vcvt_f32_f16(halfs.val[3]);
+#else
auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
_23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
_45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
@@ -250,11 +276,21 @@ STAGE(load_f16) {
g = _mm256_cvtph_ps(_mm_unpackhi_epi64(rg0123, rg4567));
b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
+#endif
}
STAGE(store_f16) {
auto ptr = *(uint64_t**)ctx + x;
+#if defined(__aarch64__)
+ float16x4x4_t halfs = {{
+ vcvt_f16_f32(r),
+ vcvt_f16_f32(g),
+ vcvt_f16_f32(b),
+ vcvt_f16_f32(a),
+ }};
+ vst4_f16((float16_t*)ptr, halfs);
+#else
auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION),
G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION),
B = _mm256_cvtps_ph(b, _MM_FROUND_CUR_DIRECTION),
@@ -269,4 +305,5 @@ STAGE(store_f16) {
_mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
_mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
_mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
+#endif
}
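
A note on the aarch64 f16 paths above: vld4_f16 and vst4_f16 do the whole transpose in one instruction. Loading four interleaved RGBA pixels

    memory:   R0 G0 B0 A0  R1 G1 B1 A1  R2 G2 B2 A2  R3 G3 B3 A3   (16 halfs)
    vld4_f16: val[0] = {R0,R1,R2,R3}, val[1] = {G0,G1,G2,G3},
              val[2] = {B0,B1,B2,B3}, val[3] = {A0,A1,A2,A3}

yields exactly the planar channel-per-register layout the stages operate on, which is why the explicit _mm_unpack* shuffle dance is only needed on the x86 side of the #else.
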
diff --git a/src/splicer/build_stages.py b/src/splicer/build_stages.py
index 0cb633d6bb..1940690d58 100755
--- a/src/splicer/build_stages.py
+++ b/src/splicer/build_stages.py
@@ -9,11 +9,21 @@ import re
import subprocess
import sys
-cflags = '-std=c++11 -Os -fomit-frame-pointer -mavx2 -mfma -mf16c'
+cflags = '-std=c++11 -Os -fomit-frame-pointer'.split()
-subprocess.check_call(['clang++'] + cflags.split() +
+hsw = '-mavx2 -mfma -mf16c'.split()
+subprocess.check_call(['clang++'] + cflags + hsw +
['-c', 'src/splicer/SkSplicer_stages.cpp'] +
- ['-o', 'stages.o'])
+ ['-o', 'hsw.o'])
+
+aarch64 = [
+ '--target=aarch64-linux-android',
+ '--sysroot=' +
+ '/Users/mtklein/brew/opt/android-ndk/platforms/android-21/arch-arm64',
+]
+subprocess.check_call(['clang++'] + cflags + aarch64 +
+ ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
+ ['-o', 'aarch64.o'])
print '''/*
* Copyright 2017 Google Inc.
@@ -27,11 +37,37 @@ print '''/*
// This file is generated semi-automatically with this command:
// $ src/splicer/build_stages.py > src/splicer/SkSplicer_generated.h
+
+#if defined(__aarch64__)
+'''
+for line in subprocess.check_output(['gobjdump', '-d',
+ 'aarch64.o']).split('\n'):
+ line = line.strip()
+ if not line or line.startswith('aarch64.o') or line.startswith('Disassembly'):
+ continue
+
+ m = re.match('''................ <(.*)>:''', line)
+ if m:
+ print 'static const unsigned int kSplice_' + m.group(1) + '[] = {'
+ continue
+
+ _, code, inst, args = line.split('\t')
+ code = code.strip()
+
+ # b done, where done has not yet been filled in by the linker.
+ if code == '14000000':
+ print '};'
+ continue
+ print ' ', '0x'+code+',' + ' // ' + inst + ' ' + args
+
+print '''
+#else
'''
-for line in subprocess.check_output(['otool', '-tvj', 'stages.o']).split('\n'):
+# TODO: port this to gobjdump too
+for line in subprocess.check_output(['otool', '-tvj', 'hsw.o']).split('\n'):
line = line.strip()
- if line == '' or line == 'stages.o:' or line == '(__TEXT,__text) section':
+ if line == '' or line == 'hsw.o:' or line == '(__TEXT,__text) section':
continue
m = re.match('_(.*):', line)
@@ -41,33 +77,37 @@ for line in subprocess.check_output(['otool', '-tvj', 'stages.o']).split('\n'):
continue
# Skip the leading 16 byte address and a tab,
- # leaving the hex and mnemonics of each instruction.
+ # leaving the code, instruction mnemonic, and its arguments.
line = line[17:]
columns = line.split('\t')
- _hex = columns[0].strip()
- instr = columns[1]
- args = columns[2:]
+ code = columns[0].strip()
+ inst = columns[1]
+ args = columns[2:]
# We can't splice code that uses rip relative addressing.
for arg in args:
assert 'rip' not in arg
# jmp done, the end of each stage (the address of done is not yet filled in)
- if _hex == 'e9 00 00 00 00':
+ if code == 'e9 00 00 00 00':
print '};'
continue
sys.stdout.write(' ')
- _bytes = _hex.split(' ')
+ _bytes = code.split(' ')
# This is the meat of things: copy the code to a C unsigned char array.
for byte in _bytes:
sys.stdout.write('0x' + byte + ',')
# From here on we're just making the generated file readable and pretty.
sys.stdout.write(' ' * (44 - 5*len(_bytes)))
- sys.stdout.write('// ' + instr)
+ sys.stdout.write('// ' + inst)
if args:
- sys.stdout.write(' ' * (13 - len(instr)))
+ sys.stdout.write(' ' * (13 - len(inst)))
sys.stdout.write(' '.join(args))
sys.stdout.write('\n')
+print '''
+#endif
+'''
+
print '''#endif//SkSplicer_generated_DEFINED'''
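
A note on the tab-splitting above: gobjdump -d emits one instruction per line in a fixed tab-separated layout, roughly (illustrative; exact spacing varies by binutils version)

    4:	4e24d400 	fadd	v0.4s, v0.4s, v4.4s

so line.split('\t') yields the address, the hex encoding, the mnemonic, and the operands, matching the _, code, inst, args unpacking. The 0x14000000 special case is an unconditional "b" whose 26-bit offset is still zero because the relocation to done has not yet been applied in the object file, so it reliably marks the end of each stage.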