From 8e619a2b4eb31753e6fcb4a9ec494d31ace755da Mon Sep 17 00:00:00 2001 From: Mike Klein Date: Mon, 9 Jan 2017 17:21:32 -0500 Subject: SkSplicer: start on arm64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Seems to be working. The jump to loop_start might be a little off, but not by much. Correctness is really still a big TODO. $ adb shell 'cd /data/local/tmp; ./monobench SkRasterPipeline 200' SkRasterPipeline_… 200 …f16_compile 1x …f16_run 1.42x …srgb_compile 2.21x …srgb_run 2.59x⏎ Change-Id: I0e1acc6404cf3ce8084d9ef8011cbe0b5f1fd6e3 Reviewed-on: https://skia-review.googlesource.com/6811 Reviewed-by: Herb Derby Commit-Queue: Mike Klein --- src/splicer/SkSplicer.cpp | 130 ++++++++++++------ src/splicer/SkSplicer_generated.h | 276 ++++++++++++++++++++++++++++++++++++++ src/splicer/SkSplicer_stages.cpp | 97 +++++++++----- src/splicer/build_stages.py | 66 +++++++-- 4 files changed, 482 insertions(+), 87 deletions(-) (limited to 'src/splicer') diff --git a/src/splicer/SkSplicer.cpp b/src/splicer/SkSplicer.cpp index 167a89a22c..f37b95f445 100644 --- a/src/splicer/SkSplicer.cpp +++ b/src/splicer/SkSplicer.cpp @@ -29,38 +29,86 @@ namespace { 12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f, // to_srgb }; - // Short x86-64 instruction sequences that we'll use as glue to splice together Stages. - static const uint8_t vzeroupper[] = { 0xc5, 0xf8, 0x77 }; // clear top half of all ymm - static const uint8_t ret[] = { 0xc3 }; // return - static const uint8_t movabsq_rcx[] = { 0x48, 0xb9 }; // move next 8 bytes into rcx - static const uint8_t movabsq_rdx[] = { 0x48, 0xba }; // move next 8 bytes into rdx - static const uint8_t addq_8_rdi[] = { 0x48, 0x83, 0xc7, 0x08 }; // rdi += 8 - static const uint8_t cmpq_rsi_rdi[] = { 0x48, 0x39, 0xf7 }; // rdi cmp? rsi - static const uint8_t jb_near[] = { 0x0f, 0x8c }; // jump relative next 4 bytes - // if cmp set unsigned < bit - // We do this a lot, so it's nice to infer the correct size. Works fine with arrays. template - void splice(SkWStream* stream, const T& val) { - stream->write(&val, sizeof(val)); + static void splice(SkWStream* buf, const T& val) { + buf->write(&val, sizeof(val)); } +#if defined(__aarch64__) + static constexpr int kStride = 4; + static void set_k(SkWStream* buf, const SkSplicer_constants* k) { + uint16_t parts[4]; + memcpy(parts, &k, 8); + splice(buf, 0xd2f00000 | (parts[3] << 5) | 0x3); // move 16-bit intermediate << 48 into x3 + splice(buf, 0xf2c00000 | (parts[2] << 5) | 0x3); // merge 16-bit intermediate << 32 into x3 + splice(buf, 0xf2a00000 | (parts[1] << 5) | 0x3); // merge 16-bit intermediate << 16 into x3 + splice(buf, 0xf2800000 | (parts[0] << 5) | 0x3); // merge 16-bit intermediate << 0 into x3 + } + static void set_ctx(SkWStream* buf, void* ctx) { + uint16_t parts[4]; + memcpy(parts, &ctx, 8); + splice(buf, 0xd2f00000 | (parts[3] << 5) | 0x2); // move 16-bit intermediate << 48 into x2 + splice(buf, 0xf2c00000 | (parts[2] << 5) | 0x2); // merge 16-bit intermediate << 32 into x2 + splice(buf, 0xf2a00000 | (parts[1] << 5) | 0x2); // merge 16-bit intermediate << 16 into x2 + splice(buf, 0xf2800000 | (parts[0] << 5) | 0x2); // merge 16-bit intermediate << 0 into x2 + } + static void loop(SkWStream* buf, int loop_start) { + splice(buf, 0x91001000); // add x0, x0, #4 + splice(buf, 0xeb01001f); // cmp x0, x1 + int off = loop_start - (int)(buf->bytesWritten() + 4); // TODO: check that this is right + off /= 4; // bytes -> instructions, still signed + off = (off & 0x7ffff) << 5; // 19 bit maximum range (+- 256K instructions) + splice(buf, 0x54000003 | off); // b.cc loop_start (cc == "carry clear", unsigned less than) + } + static void ret(SkWStream* buf) { + splice(buf, 0xd65f03c0); // ret + } +#else + static constexpr int kStride = 8; + static void set_k(SkWStream* buf, const SkSplicer_constants* k) { + static const uint8_t movabsq_rcx[] = { 0x48, 0xb9 }; + splice(buf, movabsq_rcx); // movabsq , %rcx + splice(buf, k); + } + static void set_ctx(SkWStream* buf, void* ctx) { + static const uint8_t movabsq_rdx[] = { 0x48, 0xba }; + splice(buf, movabsq_rdx); // movabsq , %rdx + splice(buf, ctx); + } + static void loop(SkWStream* buf, int loop_start) { + static const uint8_t addq_8_rdi[] = { 0x48, 0x83, 0xc7, 0x08 }; + static const uint8_t cmp_rsi_rdi[] = { 0x48, 0x39, 0xf7 }; + static const uint8_t jb_near[] = { 0x0f, 0x8c }; + splice(buf, addq_8_rdi); // addq $8, %rdi + splice(buf, cmp_rsi_rdi); // cmp %rsi, %rdi + splice(buf, jb_near); // jb (b == "before", unsigned less than) + splice(buf, loop_start - (int)(buf->bytesWritten() + 4)); + } + static void ret(SkWStream* buf) { + static const uint8_t vzeroupper[] = { 0xc5, 0xf8, 0x77 }; + static const uint8_t ret[] = { 0xc3 }; + splice(buf, vzeroupper); + splice(buf, ret); + } +#endif + #ifdef IACA_DUMP static const uint8_t ud2[] = { 0x0f, 0x0b }; // undefined... crashes when run static const uint8_t nop3[] = { 0x64, 0x67, 0x90 }; // 3 byte no-op static const uint8_t movl_ebx[] = { 0xbb }; // move next 4 bytes into ebx - static void iaca_start(SkWStream* stream) { - splice(stream, ud2); - splice(stream, movl_ebx); - splice(stream, 111); - splice(stream, nop3); + static void iaca_start(SkWStream* buf) { + splice(buf, ud2); + splice(buf, movl_ebx); + splice(buf, 111); + splice(buf, nop3); } - static void iaca_end(SkWStream* stream) { - splice(stream, movl_ebx); - splice(stream, 222); - splice(stream, nop3); - splice(stream, ud2); + static void iaca_end(SkWStream* buf) { + splice(buf, movl_ebx); + splice(buf, 222); + splice(buf, nop3); + splice(buf, ud2); } #else static void iaca_start(SkWStream*) {} @@ -87,32 +135,32 @@ namespace { Spliced(const SkRasterPipeline::Stage* stages, int nstages) { // We always create a backup interpreter pipeline, // - to handle any program we can't, and - // - to handle the n < 8 tails. + // - to handle the n < kStride tails. fBackup = SkOpts::compile_pipeline(stages, nstages); fSplicedLen = 0; fSpliced = nullptr; // If we return early anywhere in here, !fSpliced means we'll use fBackup instead. + #if !defined(__aarch64__) // To keep things simple, only one target supported: Haswell+ x86-64. if (!SkCpu::Supports(SkCpu::HSW) || sizeof(void*) != 8) { return; } + #endif SkDynamicMemoryWStream buf; - // Put the address of kConstants in rcx, Stage argument 4 "k". - splice(&buf, movabsq_rcx); - splice(&buf, &kConstants); + // Put the address of kConstants in rcx/x3, Stage argument 4 "k". + set_k(&buf, &kConstants); - // We'll loop back to here as long as xsize(); @@ -175,14 +216,15 @@ namespace { // Here's where we call fSpliced if we created it, fBackup if not. void operator()(size_t x, size_t y, size_t n) const { - // TODO: The looping logic is probably not correct for handling n<8 tails. - if (fSpliced) { + // TODO: The looping logic is probably not correct for n < kStride tails or x != 0. + + size_t body = n/kStride*kStride; // Largest multiple of kStride (4 or 8) <= n. + if (fSpliced && body) { // Can we run fSpliced for at least one kStride? // TODO: At some point we will want to pass in y... using Fn = void(size_t x, size_t n); - ((Fn*)fSpliced)(x,n); + ((Fn*)fSpliced)(x,body); - // Fall through to fBackup for any n<8 last pixels. - size_t body = n/8*8; + // Fall through to fBackup for any n src/splicer/SkSplicer_generated.h +#if defined(__aarch64__) + +static const unsigned int kSplice_clear[] = { + 0x6f00e400, // movi v0.2d, #0x0 + 0x6f00e401, // movi v1.2d, #0x0 + 0x6f00e402, // movi v2.2d, #0x0 + 0x6f00e403, // movi v3.2d, #0x0 +}; +static const unsigned int kSplice_plus[] = { + 0x4e24d400, // fadd v0.4s, v0.4s, v4.4s + 0x4e25d421, // fadd v1.4s, v1.4s, v5.4s + 0x4e26d442, // fadd v2.4s, v2.4s, v6.4s + 0x4e27d463, // fadd v3.4s, v3.4s, v7.4s +}; +static const unsigned int kSplice_srcover[] = { + 0x91001068, // add x8, x3, #0x4 + 0x4d40c910, // ld1r {v16.4s}, [x8] + 0x4ea3d610, // fsub v16.4s, v16.4s, v3.4s + 0x4e24ce00, // fmla v0.4s, v16.4s, v4.4s + 0x4e25ce01, // fmla v1.4s, v16.4s, v5.4s + 0x4e26ce02, // fmla v2.4s, v16.4s, v6.4s + 0x4e26ce03, // fmla v3.4s, v16.4s, v6.4s +}; +static const unsigned int kSplice_dstover[] = { + 0x91001068, // add x8, x3, #0x4 + 0x4d40c910, // ld1r {v16.4s}, [x8] + 0x4ea7d610, // fsub v16.4s, v16.4s, v7.4s + 0x4e20ce04, // fmla v4.4s, v16.4s, v0.4s + 0x4e21ce05, // fmla v5.4s, v16.4s, v1.4s + 0x4e22ce06, // fmla v6.4s, v16.4s, v2.4s + 0x4e22ce07, // fmla v7.4s, v16.4s, v2.4s +}; +static const unsigned int kSplice_clamp_0[] = { + 0x6f00e410, // movi v16.2d, #0x0 + 0x4e30f400, // fmax v0.4s, v0.4s, v16.4s + 0x4e30f421, // fmax v1.4s, v1.4s, v16.4s + 0x4e30f442, // fmax v2.4s, v2.4s, v16.4s + 0x4e30f463, // fmax v3.4s, v3.4s, v16.4s +}; +static const unsigned int kSplice_clamp_1[] = { + 0x91001068, // add x8, x3, #0x4 + 0x4d40c910, // ld1r {v16.4s}, [x8] + 0x4eb0f400, // fmin v0.4s, v0.4s, v16.4s + 0x4eb0f421, // fmin v1.4s, v1.4s, v16.4s + 0x4eb0f442, // fmin v2.4s, v2.4s, v16.4s + 0x4eb0f463, // fmin v3.4s, v3.4s, v16.4s +}; +static const unsigned int kSplice_clamp_a[] = { + 0x91001068, // add x8, x3, #0x4 + 0x4d40c910, // ld1r {v16.4s}, [x8] + 0x4eb0f463, // fmin v3.4s, v3.4s, v16.4s + 0x4ea3f400, // fmin v0.4s, v0.4s, v3.4s + 0x4ea3f421, // fmin v1.4s, v1.4s, v3.4s + 0x4ea3f442, // fmin v2.4s, v2.4s, v3.4s +}; +static const unsigned int kSplice_swap[] = { + 0x4ea31c70, // mov v16.16b, v3.16b + 0x4ea21c51, // mov v17.16b, v2.16b + 0x4ea11c32, // mov v18.16b, v1.16b + 0x4ea01c13, // mov v19.16b, v0.16b + 0x4ea41c80, // mov v0.16b, v4.16b + 0x4ea51ca1, // mov v1.16b, v5.16b + 0x4ea61cc2, // mov v2.16b, v6.16b + 0x4ea71ce3, // mov v3.16b, v7.16b + 0x4eb31e64, // mov v4.16b, v19.16b + 0x4eb21e45, // mov v5.16b, v18.16b + 0x4eb11e26, // mov v6.16b, v17.16b + 0x4eb01e07, // mov v7.16b, v16.16b +}; +static const unsigned int kSplice_move_src_dst[] = { + 0x4ea01c04, // mov v4.16b, v0.16b + 0x4ea11c25, // mov v5.16b, v1.16b + 0x4ea21c46, // mov v6.16b, v2.16b + 0x4ea31c67, // mov v7.16b, v3.16b +}; +static const unsigned int kSplice_move_dst_src[] = { + 0x4ea41c80, // mov v0.16b, v4.16b + 0x4ea51ca1, // mov v1.16b, v5.16b + 0x4ea61cc2, // mov v2.16b, v6.16b + 0x4ea71ce3, // mov v3.16b, v7.16b +}; +static const unsigned int kSplice_premul[] = { + 0x6e23dc00, // fmul v0.4s, v0.4s, v3.4s + 0x6e23dc21, // fmul v1.4s, v1.4s, v3.4s + 0x6e23dc42, // fmul v2.4s, v2.4s, v3.4s +}; +static const unsigned int kSplice_unpremul[] = { + 0x91001068, // add x8, x3, #0x4 + 0x4d40c910, // ld1r {v16.4s}, [x8] + 0x4ea0d871, // fcmeq v17.4s, v3.4s, #0.0 + 0x6e23fe10, // fdiv v16.4s, v16.4s, v3.4s + 0x4e711e10, // bic v16.16b, v16.16b, v17.16b + 0x6e20de00, // fmul v0.4s, v16.4s, v0.4s + 0x6e21de01, // fmul v1.4s, v16.4s, v1.4s + 0x6e22de02, // fmul v2.4s, v16.4s, v2.4s +}; +static const unsigned int kSplice_from_srgb[] = { + 0x91005068, // add x8, x3, #0x14 + 0x4d40c910, // ld1r {v16.4s}, [x8] + 0x91004068, // add x8, x3, #0x10 + 0x4d40c911, // ld1r {v17.4s}, [x8] + 0x2d434c72, // ldp s18, s19, [x3,#24] + 0x6e22dc54, // fmul v20.4s, v2.4s, v2.4s + 0x4eb01e15, // mov v21.16b, v16.16b + 0x4eb01e17, // mov v23.16b, v16.16b + 0x4f921050, // fmla v16.4s, v2.4s, v18.s[0] + 0x4eb11e36, // mov v22.16b, v17.16b + 0x4eb11e38, // mov v24.16b, v17.16b + 0x4e34ce11, // fmla v17.4s, v16.4s, v20.4s + 0x6e20dc10, // fmul v16.4s, v0.4s, v0.4s + 0x91008068, // add x8, x3, #0x20 + 0x4f921015, // fmla v21.4s, v0.4s, v18.s[0] + 0x4e30ceb6, // fmla v22.4s, v21.4s, v16.4s + 0x4d40c910, // ld1r {v16.4s}, [x8] + 0x6e21dc34, // fmul v20.4s, v1.4s, v1.4s + 0x4f921037, // fmla v23.4s, v1.4s, v18.s[0] + 0x4f939015, // fmul v21.4s, v0.4s, v19.s[0] + 0x4f939032, // fmul v18.4s, v1.4s, v19.s[0] + 0x4f939053, // fmul v19.4s, v2.4s, v19.s[0] + 0x6ea0e600, // fcmgt v0.4s, v16.4s, v0.4s + 0x6ea1e601, // fcmgt v1.4s, v16.4s, v1.4s + 0x6ea2e602, // fcmgt v2.4s, v16.4s, v2.4s + 0x4e34cef8, // fmla v24.4s, v23.4s, v20.4s + 0x6e761ea0, // bsl v0.16b, v21.16b, v22.16b + 0x6e781e41, // bsl v1.16b, v18.16b, v24.16b + 0x6e711e62, // bsl v2.16b, v19.16b, v17.16b +}; +static const unsigned int kSplice_to_srgb[] = { + 0x6ea1d810, // frsqrte v16.4s, v0.4s + 0x6ea1d835, // frsqrte v21.4s, v1.4s + 0x6e30de17, // fmul v23.4s, v16.4s, v16.4s + 0x6ea1d856, // frsqrte v22.4s, v2.4s + 0x6e35deb9, // fmul v25.4s, v21.4s, v21.4s + 0x4eb7fc17, // frsqrts v23.4s, v0.4s, v23.4s + 0x9100c068, // add x8, x3, #0x30 + 0x6e36deda, // fmul v26.4s, v22.4s, v22.4s + 0x4eb9fc39, // frsqrts v25.4s, v1.4s, v25.4s + 0x6e37de10, // fmul v16.4s, v16.4s, v23.4s + 0x2d44c871, // ldp s17, s18, [x3,#36] + 0x4d40c914, // ld1r {v20.4s}, [x8] + 0x4ebafc5a, // frsqrts v26.4s, v2.4s, v26.4s + 0x6e39deb5, // fmul v21.4s, v21.4s, v25.4s + 0x4ea1da17, // frecpe v23.4s, v16.4s + 0xbd402c73, // ldr s19, [x3,#44] + 0x9100d068, // add x8, x3, #0x34 + 0x6e3aded6, // fmul v22.4s, v22.4s, v26.4s + 0x4ea1dabb, // frecpe v27.4s, v21.4s + 0x4e37fe1d, // frecps v29.4s, v16.4s, v23.4s + 0x4d40c918, // ld1r {v24.4s}, [x8] + 0x4ea1dadc, // frecpe v28.4s, v22.4s + 0x6e3ddef7, // fmul v23.4s, v23.4s, v29.4s + 0x4e3bfebd, // frecps v29.4s, v21.4s, v27.4s + 0x6e3ddf7b, // fmul v27.4s, v27.4s, v29.4s + 0x4e3cfedd, // frecps v29.4s, v22.4s, v28.4s + 0x6e3ddf9c, // fmul v28.4s, v28.4s, v29.4s + 0x4eb41e9d, // mov v29.16b, v20.16b + 0x6ea1da19, // frsqrte v25.4s, v16.4s + 0x4f9312fd, // fmla v29.4s, v23.4s, v19.s[0] + 0x4eb41e97, // mov v23.16b, v20.16b + 0x4f91901a, // fmul v26.4s, v0.4s, v17.s[0] + 0x4f931377, // fmla v23.4s, v27.4s, v19.s[0] + 0x6ea1dabb, // frsqrte v27.4s, v21.4s + 0x4f931394, // fmla v20.4s, v28.4s, v19.s[0] + 0x4f919033, // fmul v19.4s, v1.4s, v17.s[0] + 0x4f919051, // fmul v17.4s, v2.4s, v17.s[0] + 0x6ea0e700, // fcmgt v0.4s, v24.4s, v0.4s + 0x6ea1e701, // fcmgt v1.4s, v24.4s, v1.4s + 0x6ea2e702, // fcmgt v2.4s, v24.4s, v2.4s + 0x6e39df38, // fmul v24.4s, v25.4s, v25.4s + 0x6ea1dadc, // frsqrte v28.4s, v22.4s + 0x4eb8fe10, // frsqrts v16.4s, v16.4s, v24.4s + 0x6e3bdf78, // fmul v24.4s, v27.4s, v27.4s + 0x4eb8feb5, // frsqrts v21.4s, v21.4s, v24.4s + 0x6e3cdf98, // fmul v24.4s, v28.4s, v28.4s + 0x91001068, // add x8, x3, #0x4 + 0x4eb8fed6, // frsqrts v22.4s, v22.4s, v24.4s + 0x4d40c918, // ld1r {v24.4s}, [x8] + 0x6e30df30, // fmul v16.4s, v25.4s, v16.4s + 0x6e35df75, // fmul v21.4s, v27.4s, v21.4s + 0x6e36df96, // fmul v22.4s, v28.4s, v22.4s + 0x4f92121d, // fmla v29.4s, v16.4s, v18.s[0] + 0x4f9212b7, // fmla v23.4s, v21.4s, v18.s[0] + 0x4f9212d4, // fmla v20.4s, v22.4s, v18.s[0] + 0x4ebdf710, // fmin v16.4s, v24.4s, v29.4s + 0x4eb7f712, // fmin v18.4s, v24.4s, v23.4s + 0x4eb4f714, // fmin v20.4s, v24.4s, v20.4s + 0x6e701f40, // bsl v0.16b, v26.16b, v16.16b + 0x6e721e61, // bsl v1.16b, v19.16b, v18.16b + 0x6e741e22, // bsl v2.16b, v17.16b, v20.16b +}; +static const unsigned int kSplice_scale_u8[] = { + 0xf9400048, // ldr x8, [x2] + 0xbd400c71, // ldr s17, [x3,#12] + 0x8b000108, // add x8, x8, x0 + 0x39400109, // ldrb w9, [x8] + 0x3940050a, // ldrb w10, [x8,#1] + 0x4e021d30, // mov v16.h[0], w9 + 0x39400909, // ldrb w9, [x8,#2] + 0x39400d08, // ldrb w8, [x8,#3] + 0x4e061d50, // mov v16.h[1], w10 + 0x4e0a1d30, // mov v16.h[2], w9 + 0x4e0e1d10, // mov v16.h[3], w8 + 0x2f07b7f0, // bic v16.4h, #0xff, lsl #8 + 0x2f10a610, // uxtl v16.4s, v16.4h + 0x6e21da10, // ucvtf v16.4s, v16.4s + 0x4f919210, // fmul v16.4s, v16.4s, v17.s[0] + 0x6e20de00, // fmul v0.4s, v16.4s, v0.4s + 0x6e21de01, // fmul v1.4s, v16.4s, v1.4s + 0x6e22de02, // fmul v2.4s, v16.4s, v2.4s + 0x6e23de03, // fmul v3.4s, v16.4s, v3.4s +}; +static const unsigned int kSplice_load_8888[] = { + 0xf9400048, // ldr x8, [x2] + 0xd37ef409, // lsl x9, x0, #2 + 0x4d40c860, // ld1r {v0.4s}, [x3] + 0xbd400c63, // ldr s3, [x3,#12] + 0x3ce96901, // ldr q1, [x8,x9] + 0x4e211c02, // and v2.16b, v0.16b, v1.16b + 0x6f380430, // ushr v16.4s, v1.4s, #8 + 0x6f300431, // ushr v17.4s, v1.4s, #16 + 0x6f280421, // ushr v1.4s, v1.4s, #24 + 0x4e21d842, // scvtf v2.4s, v2.4s + 0x4e301c10, // and v16.16b, v0.16b, v16.16b + 0x4e311c11, // and v17.16b, v0.16b, v17.16b + 0x4e21d832, // scvtf v18.4s, v1.4s + 0x4f839040, // fmul v0.4s, v2.4s, v3.s[0] + 0x4e21da01, // scvtf v1.4s, v16.4s + 0x4e21da22, // scvtf v2.4s, v17.4s + 0x4f839021, // fmul v1.4s, v1.4s, v3.s[0] + 0x4f839042, // fmul v2.4s, v2.4s, v3.s[0] + 0x4f839243, // fmul v3.4s, v18.4s, v3.s[0] +}; +static const unsigned int kSplice_store_8888[] = { + 0xbd400870, // ldr s16, [x3,#8] + 0xf9400048, // ldr x8, [x2] + 0xd37ef409, // lsl x9, x0, #2 + 0x4f909032, // fmul v18.4s, v1.4s, v16.s[0] + 0x4f909011, // fmul v17.4s, v0.4s, v16.s[0] + 0x6e21aa52, // fcvtnu v18.4s, v18.4s + 0x6e21aa31, // fcvtnu v17.4s, v17.4s + 0x4f285652, // shl v18.4s, v18.4s, #8 + 0x4eb11e51, // orr v17.16b, v18.16b, v17.16b + 0x4f909052, // fmul v18.4s, v2.4s, v16.s[0] + 0x4f909070, // fmul v16.4s, v3.4s, v16.s[0] + 0x6e21aa52, // fcvtnu v18.4s, v18.4s + 0x6e21aa10, // fcvtnu v16.4s, v16.4s + 0x4f305652, // shl v18.4s, v18.4s, #16 + 0x4eb21e31, // orr v17.16b, v17.16b, v18.16b + 0x4f385610, // shl v16.4s, v16.4s, #24 + 0x4eb01e30, // orr v16.16b, v17.16b, v16.16b + 0x3ca96910, // str q16, [x8,x9] +}; +static const unsigned int kSplice_load_f16[] = { + 0xf9400048, // ldr x8, [x2] + 0x8b000d08, // add x8, x8, x0, lsl #3 + 0x0c400510, // ld4 {v16.4h-v19.4h}, [x8] + 0x0e217a00, // fcvtl v0.4s, v16.4h + 0x0e217a21, // fcvtl v1.4s, v17.4h + 0x0e217a42, // fcvtl v2.4s, v18.4h + 0x0e217a63, // fcvtl v3.4s, v19.4h +}; +static const unsigned int kSplice_store_f16[] = { + 0xf9400048, // ldr x8, [x2] + 0x0e216810, // fcvtn v16.4h, v0.4s + 0x0e216831, // fcvtn v17.4h, v1.4s + 0x0e216852, // fcvtn v18.4h, v2.4s + 0x8b000d08, // add x8, x8, x0, lsl #3 + 0x0e216873, // fcvtn v19.4h, v3.4s + 0x0c000510, // st4 {v16.4h-v19.4h}, [x8] +}; + +#else + static const unsigned char kSplice_clear[] = { 0xc5,0xfc,0x57,0xc0, // vxorps %ymm0, %ymm0, %ymm0 0xc5,0xf4,0x57,0xc9, // vxorps %ymm1, %ymm1, %ymm1 @@ -255,4 +528,7 @@ static const unsigned char kSplice_store_f16[] = { 0xc4,0x41,0x39,0x6a,0xc2, // vpunpckhdq %xmm10, %xmm8, %xmm8 0xc5,0x7a,0x7f,0x44,0xf8,0x30, // vmovdqu %xmm8, 0x30(%rax,%rdi,8) }; + +#endif + #endif//SkSplicer_generated_DEFINED diff --git a/src/splicer/SkSplicer_stages.cpp b/src/splicer/SkSplicer_stages.cpp index f3608d47c1..0e3290d5d6 100644 --- a/src/splicer/SkSplicer_stages.cpp +++ b/src/splicer/SkSplicer_stages.cpp @@ -6,52 +6,71 @@ */ #include "SkSplicer_shared.h" -#include #include -#if !defined(__clang__) || !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__) - #error This file is not like the rest of Skia. - #error It must be compiled with clang and with -mavx2 -mfma -mf16c -fomit-frame-pointer. +#if !defined(__clang__) + #error This file is not like the rest of Skia. It must be compiled with clang. #endif // We have very specific inlining requirements. It helps to just take total control. #define AI __attribute__((always_inline)) inline +#if defined(__aarch64__) + #include + + // Since we know we're using Clang, we can use its vector extensions. + using F = float __attribute__((ext_vector_type(4))); + using I32 = int32_t __attribute__((ext_vector_type(4))); + using U32 = uint32_t __attribute__((ext_vector_type(4))); + using U8 = uint8_t __attribute__((ext_vector_type(4))); + + // We polyfill a few routines that Clang doesn't build into ext_vector_types. + AI static U32 round(F v) { return vcvtnq_u32_f32(v); } + AI static F min(F a, F b) { return vminq_f32(a,b); } + AI static F max(F a, F b) { return vmaxq_f32(a,b); } + AI static F fma(F f, F m, F a) { return vfmaq_f32(a,f,m); } + AI static F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; } + AI static F rsqrt(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; } + AI static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); } +#else + #if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__) + #error On x86, compile with -mavx2 -mfma -mf16c. + #endif + #include + + // These are __m256 and __m256i, but friendlier and strongly-typed. + using F = float __attribute__((ext_vector_type(8))); + using I32 = int32_t __attribute__((ext_vector_type(8))); + using U32 = uint32_t __attribute__((ext_vector_type(8))); + using U8 = uint8_t __attribute__((ext_vector_type(8))); + + AI static U32 round(F v) { return _mm256_cvtps_epi32(v); } + AI static F min(F a, F b) { return _mm256_min_ps (a,b); } + AI static F max(F a, F b) { return _mm256_max_ps (a,b); } + AI static F fma(F f, F m, F a) { return _mm256_fmadd_ps(f,m,a);} + AI static F rcp (F v) { return _mm256_rcp_ps (v); } + AI static F rsqrt(F v) { return _mm256_rsqrt_ps (v); } + AI static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); } +#endif + +AI static F cast (U32 v) { return __builtin_convertvector((I32)v, F); } +AI static U32 expand(U8 v) { return __builtin_convertvector( v, U32); } + // We'll be compiling this file to an object file, then extracting parts of it into // SkSplicer_generated.h. It's easier to do if the function names are not C++ mangled. #define C extern "C" -// Since we know we're using Clang, we can use its vector extensions. -// These are __m256 and __m256i, but friendlier and strongly-typed. -using F = float __attribute__((ext_vector_type(8))); -using I32 = int32_t __attribute__((ext_vector_type(8))); -using U32 = uint32_t __attribute__((ext_vector_type(8))); -using U8 = uint8_t __attribute__((ext_vector_type(8))); - -// We polyfill a few routines that Clang doesn't build into ext_vector_types. -AI static F cast (U32 v) { return __builtin_convertvector((I32)v, F); } -AI static U32 round (F v) { return _mm256_cvtps_epi32(v); } -AI static U32 expand(U8 v) { return __builtin_convertvector(v, U32); } - -AI static F rcp (F v) { return _mm256_rcp_ps (v); } -AI static F rsqrt(F v) { return _mm256_rsqrt_ps(v); } -AI static F min (F a, F b) { return _mm256_min_ps (a,b); } -AI static F max (F a, F b) { return _mm256_max_ps (a,b); } -AI static F fma (F f, F m, F a) { return _mm256_fmadd_ps(f,m,a); } - -AI static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); } - // Stages all fit a common interface that allows SkSplicer to splice them together. using K = const SkSplicer_constants; using Stage = void(size_t x, size_t n, void* ctx, K* constants, F,F,F,F, F,F,F,F); // Stage's arguments act as the working set of registers within the final spliced function. -// Here's a little primer on the ABI: -// x: rdi x and n work to drive the loop, like for (; x < n; x += 8) -// n: rsi -// ctx: rdx Look for movabsq_rdx in SkSplicer.cpp to see how this works. -// constants: rcx Look for movabsq_rcx in SkSplicer.cpp to see how this works. -// vectors: ymm0-ymm7 +// Here's a little primer on the x86-64/aarch64 ABIs: +// x: rdi/x0 x and n work to drive the loop, like for (; x < n; x += 4 or 8) +// n: rsi/x1 +// ctx: rdx/x2 Look for movabsq_rdx in SkSplicer.cpp to see how this works. +// constants: rcx/x3 Look for movabsq_rcx in SkSplicer.cpp to see how this works. +// vectors: ymm0-ymm7/v0-v7 // done() is the key to this entire splicing strategy. @@ -231,6 +250,13 @@ STAGE(store_8888) { STAGE(load_f16) { auto ptr = *(const uint64_t**)ctx + x; +#if defined(__aarch64__) + auto halfs = vld4_f16((const float16_t*)ptr); + r = vcvt_f32_f16(halfs.val[0]); + g = vcvt_f32_f16(halfs.val[1]); + b = vcvt_f32_f16(halfs.val[2]); + a = vcvt_f32_f16(halfs.val[3]); +#else auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0), _23 = _mm_loadu_si128(((__m128i*)ptr) + 1), _45 = _mm_loadu_si128(((__m128i*)ptr) + 2), @@ -250,11 +276,21 @@ STAGE(load_f16) { g = _mm256_cvtph_ps(_mm_unpackhi_epi64(rg0123, rg4567)); b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567)); a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567)); +#endif } STAGE(store_f16) { auto ptr = *(uint64_t**)ctx + x; +#if defined(__aarch64__) + float16x4x4_t halfs = {{ + vcvt_f16_f32(r), + vcvt_f16_f32(g), + vcvt_f16_f32(b), + vcvt_f16_f32(a), + }}; + vst4_f16((float16_t*)ptr, halfs); +#else auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION), G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION), B = _mm256_cvtps_ph(b, _MM_FROUND_CUR_DIRECTION), @@ -269,4 +305,5 @@ STAGE(store_f16) { _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123)); _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567)); _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567)); +#endif } diff --git a/src/splicer/build_stages.py b/src/splicer/build_stages.py index 0cb633d6bb..1940690d58 100755 --- a/src/splicer/build_stages.py +++ b/src/splicer/build_stages.py @@ -9,11 +9,21 @@ import re import subprocess import sys -cflags = '-std=c++11 -Os -fomit-frame-pointer -mavx2 -mfma -mf16c' +cflags = '-std=c++11 -Os -fomit-frame-pointer'.split() -subprocess.check_call(['clang++'] + cflags.split() + +hsw = '-mavx2 -mfma -mf16c'.split() +subprocess.check_call(['clang++'] + cflags + hsw + ['-c', 'src/splicer/SkSplicer_stages.cpp'] + - ['-o', 'stages.o']) + ['-o', 'hsw.o']) + +aarch64 = [ + '--target=aarch64-linux-android', + '--sysroot=' + + '/Users/mtklein/brew/opt/android-ndk/platforms/android-21/arch-arm64', +] +subprocess.check_call(['clang++'] + cflags + aarch64 + + ['-c', 'src/splicer/SkSplicer_stages.cpp'] + + ['-o', 'aarch64.o']) print '''/* * Copyright 2017 Google Inc. @@ -27,11 +37,37 @@ print '''/* // This file is generated semi-automatically with this command: // $ src/splicer/build_stages.py > src/splicer/SkSplicer_generated.h + +#if defined(__aarch64__) +''' +for line in subprocess.check_output(['gobjdump', '-d', + 'aarch64.o']).split('\n'): + line = line.strip() + if not line or line.startswith('aarch64.o') or line.startswith('Disassembly'): + continue + + m = re.match('''................ <(.*)>:''', line) + if m: + print 'static const unsigned int kSplice_' + m.group(1) + '[] = {' + continue + + _, code, inst, args = line.split('\t') + code = code.strip() + + # b done, where done has not yet been filled in by the linker. + if code == '14000000': + print '};' + continue + print ' ', '0x'+code+',' + ' // ' + inst + ' ' + args + +print ''' +#else ''' -for line in subprocess.check_output(['otool', '-tvj', 'stages.o']).split('\n'): +# TODO: port this to gobjdump too +for line in subprocess.check_output(['otool', '-tvj', 'hsw.o']).split('\n'): line = line.strip() - if line == '' or line == 'stages.o:' or line == '(__TEXT,__text) section': + if line == '' or line == 'hsw.o:' or line == '(__TEXT,__text) section': continue m = re.match('_(.*):', line) @@ -41,33 +77,37 @@ for line in subprocess.check_output(['otool', '-tvj', 'stages.o']).split('\n'): continue # Skip the leading 16 byte address and a tab, - # leaving the hex and mnemonics of each instruction. + # leaving the code, instruction mnemonic, and its arguments. line = line[17:] columns = line.split('\t') - _hex = columns[0].strip() - instr = columns[1] - args = columns[2:] + code = columns[0].strip() + inst = columns[1] + args = columns[2:] # We can't splice code that uses rip relative addressing. for arg in args: assert 'rip' not in arg # jmp done, the end of each stage (the address of done is not yet filled in) - if _hex == 'e9 00 00 00 00': + if code == 'e9 00 00 00 00': print '};' continue sys.stdout.write(' ') - _bytes = _hex.split(' ') + _bytes = code.split(' ') # This is the meat of things: copy the code to a C unsigned char array. for byte in _bytes: sys.stdout.write('0x' + byte + ',') # From here on we're just making the generated file readable and pretty. sys.stdout.write(' ' * (44 - 5*len(_bytes))) - sys.stdout.write('// ' + instr) + sys.stdout.write('// ' + inst) if args: - sys.stdout.write(' ' * (13 - len(instr))) + sys.stdout.write(' ' * (13 - len(inst))) sys.stdout.write(' '.join(args)) sys.stdout.write('\n') +print ''' +#endif +''' + print '''#endif//SkSplicer_generated_DEFINED''' -- cgit v1.2.3