diff options
author | 2017-02-11 13:21:29 -0500 | |
---|---|---|
committer | 2017-02-13 21:08:10 +0000 | |
commit | 6fcea9d4366f385260e7a59c84a5456c9de2d844 (patch) | |
tree | a595d0258cac7c38cfee2186e8fcaf1463c327af | |
parent | 07f665efb918f68e406b76a78d0b76d5c714f16c (diff) |
Enable sse2 backend for SkSplicer.
One more piece of https://skia-review.googlesource.com/c/8230/.
-mno-red-zone makes it safe for x86 stages to use the stack on Windows
(at the expense of an extra sub and add to the stack pointer on non-Windows platforms).
CQ_INCLUDE_TRYBOTS=skia.primary:Test-Win2k8-MSVC-GCE-CPU-AVX2-x86_64-Debug,Test-Win10-MSVC-Golo-GPU-GT610-x86_64-Release,Test-Mac-Clang-MacMini6.2-CPU-AVX-x86_64-Debug
Change-Id: I81f8220e790b201757a7e1e9752b2fe94520ccbb
Reviewed-on: https://skia-review.googlesource.com/8352
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
-rw-r--r-- | src/splicer/SkSplicer.cpp | 28 | ||||
-rw-r--r-- | src/splicer/SkSplicer_generated.h | 477 | ||||
-rwxr-xr-x | src/splicer/build_stages.py | 8 |
3 files changed, 499 insertions, 14 deletions
diff --git a/src/splicer/SkSplicer.cpp b/src/splicer/SkSplicer.cpp index b3336cccab..8447c9b45f 100644 --- a/src/splicer/SkSplicer.cpp +++ b/src/splicer/SkSplicer.cpp @@ -112,12 +112,6 @@ namespace { splice(buf, jb_near); // jb <next 4 bytes> (b == "before", unsigned less than) splice(buf, loop_start - (int)(buf->bytesWritten() + 4)); } - static void ret(SkWStream* buf) { - static const uint8_t vzeroupper[] = { 0xc5, 0xf8, 0x77 }; - static const uint8_t ret[] = { 0xc3 }; - splice(buf, vzeroupper); - splice(buf, ret); - } #endif #if defined(_MSC_VER) @@ -279,6 +273,7 @@ namespace { DEFINE_SPLICE_STAGE(armv7) #else DEFINE_SPLICE_STAGE(hsw) + DEFINE_SPLICE_STAGE(sse2) #endif #undef DEFINE_SPLICE #undef CASE @@ -305,12 +300,25 @@ namespace { auto splice_stage = armv7_splice_stage; auto inc_x = [](SkWStream* buf) { splice_until_ret(buf, armv7_inc_x); }; #else - // To keep things simple, only one x86 target supported: Haswell+ x86-64. - if (!SkCpu::Supports(SkCpu::HSW) || sizeof(void*) != 8) { + // To keep things simple, only x86-64 supported. + if (sizeof(void*) != 8) { return; } - auto splice_stage = hsw_splice_stage; - auto inc_x = [&](SkWStream* buf) { splice_until_ret(buf, hsw_inc_x); }; + bool hsw = true && SkCpu::Supports(SkCpu::HSW); + + auto splice_stage = hsw ? 
hsw_splice_stage : sse2_splice_stage; + auto inc_x = [hsw](SkWStream* buf) { + if (hsw) { splice_until_ret(buf, hsw_inc_x); } + else { splice_until_ret(buf, sse2_inc_x); } + }; + auto ret = [hsw](SkWStream* buf) { + static const uint8_t vzeroupper[] = { 0xc5, 0xf8, 0x77 }; + static const uint8_t ret[] = { 0xc3 }; + if (hsw) { + splice(buf, vzeroupper); + } + splice(buf, ret); + }; #endif SkDynamicMemoryWStream buf; diff --git a/src/splicer/SkSplicer_generated.h b/src/splicer/SkSplicer_generated.h index fa8a72d8ff..5f16b5c66f 100644 --- a/src/splicer/SkSplicer_generated.h +++ b/src/splicer/SkSplicer_generated.h @@ -775,6 +775,483 @@ static const unsigned int armv7_matrix_3x4[] = { 0xf22211b2, // vorr d1, d18, d18 0xe12fff1e, // return }; +static const unsigned char sse2_inc_x[] = { + 0x48,0x83,0xc7,0x04, // add $0x4,%rdi + 0xc3, // return +}; +static const unsigned char sse2_clear[] = { + 0x0f,0x57,0xc0, // xorps %xmm0,%xmm0 + 0x0f,0x57,0xc9, // xorps %xmm1,%xmm1 + 0x0f,0x57,0xd2, // xorps %xmm2,%xmm2 + 0x0f,0x57,0xdb, // xorps %xmm3,%xmm3 + 0xc3, // return +}; +static const unsigned char sse2_plus_[] = { + 0x0f,0x58,0xc4, // addps %xmm4,%xmm0 + 0x0f,0x58,0xcd, // addps %xmm5,%xmm1 + 0x0f,0x58,0xd6, // addps %xmm6,%xmm2 + 0x0f,0x58,0xdf, // addps %xmm7,%xmm3 + 0xc3, // return +}; +static const unsigned char sse2_srcover[] = { + 0xf3,0x44,0x0f,0x10,0x01, // movss (%rcx),%xmm8 + 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8 + 0x44,0x0f,0x5c,0xc3, // subps %xmm3,%xmm8 + 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9 + 0x44,0x0f,0x59,0xcc, // mulps %xmm4,%xmm9 + 0x41,0x0f,0x58,0xc1, // addps %xmm9,%xmm0 + 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9 + 0x44,0x0f,0x59,0xcd, // mulps %xmm5,%xmm9 + 0x41,0x0f,0x58,0xc9, // addps %xmm9,%xmm1 + 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9 + 0x44,0x0f,0x59,0xce, // mulps %xmm6,%xmm9 + 0x41,0x0f,0x58,0xd1, // addps %xmm9,%xmm2 + 0x44,0x0f,0x59,0xc7, // mulps %xmm7,%xmm8 + 0x41,0x0f,0x58,0xd8, // addps %xmm8,%xmm3 + 0xc3, // 
return +}; +static const unsigned char sse2_dstover[] = { + 0xf3,0x44,0x0f,0x10,0x01, // movss (%rcx),%xmm8 + 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8 + 0x44,0x0f,0x5c,0xc7, // subps %xmm7,%xmm8 + 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9 + 0x44,0x0f,0x59,0xc8, // mulps %xmm0,%xmm9 + 0x41,0x0f,0x58,0xe1, // addps %xmm9,%xmm4 + 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9 + 0x44,0x0f,0x59,0xc9, // mulps %xmm1,%xmm9 + 0x41,0x0f,0x58,0xe9, // addps %xmm9,%xmm5 + 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9 + 0x44,0x0f,0x59,0xca, // mulps %xmm2,%xmm9 + 0x41,0x0f,0x58,0xf1, // addps %xmm9,%xmm6 + 0x44,0x0f,0x59,0xc3, // mulps %xmm3,%xmm8 + 0x41,0x0f,0x58,0xf8, // addps %xmm8,%xmm7 + 0xc3, // return +}; +static const unsigned char sse2_clamp_0[] = { + 0x45,0x0f,0x57,0xc0, // xorps %xmm8,%xmm8 + 0x41,0x0f,0x5f,0xc0, // maxps %xmm8,%xmm0 + 0x41,0x0f,0x5f,0xc8, // maxps %xmm8,%xmm1 + 0x41,0x0f,0x5f,0xd0, // maxps %xmm8,%xmm2 + 0x41,0x0f,0x5f,0xd8, // maxps %xmm8,%xmm3 + 0xc3, // return +}; +static const unsigned char sse2_clamp_1[] = { + 0xf3,0x44,0x0f,0x10,0x01, // movss (%rcx),%xmm8 + 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8 + 0x41,0x0f,0x5d,0xc0, // minps %xmm8,%xmm0 + 0x41,0x0f,0x5d,0xc8, // minps %xmm8,%xmm1 + 0x41,0x0f,0x5d,0xd0, // minps %xmm8,%xmm2 + 0x41,0x0f,0x5d,0xd8, // minps %xmm8,%xmm3 + 0xc3, // return +}; +static const unsigned char sse2_clamp_a[] = { + 0xf3,0x44,0x0f,0x10,0x01, // movss (%rcx),%xmm8 + 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8 + 0x41,0x0f,0x5d,0xd8, // minps %xmm8,%xmm3 + 0x0f,0x5d,0xc3, // minps %xmm3,%xmm0 + 0x0f,0x5d,0xcb, // minps %xmm3,%xmm1 + 0x0f,0x5d,0xd3, // minps %xmm3,%xmm2 + 0xc3, // return +}; +static const unsigned char sse2_swap[] = { + 0x44,0x0f,0x28,0xc3, // movaps %xmm3,%xmm8 + 0x44,0x0f,0x28,0xca, // movaps %xmm2,%xmm9 + 0x44,0x0f,0x28,0xd1, // movaps %xmm1,%xmm10 + 0x44,0x0f,0x28,0xd8, // movaps %xmm0,%xmm11 + 0x0f,0x28,0xc4, // movaps %xmm4,%xmm0 + 0x0f,0x28,0xcd, // movaps %xmm5,%xmm1 + 
0x0f,0x28,0xd6, // movaps %xmm6,%xmm2 + 0x0f,0x28,0xdf, // movaps %xmm7,%xmm3 + 0x41,0x0f,0x28,0xe3, // movaps %xmm11,%xmm4 + 0x41,0x0f,0x28,0xea, // movaps %xmm10,%xmm5 + 0x41,0x0f,0x28,0xf1, // movaps %xmm9,%xmm6 + 0x41,0x0f,0x28,0xf8, // movaps %xmm8,%xmm7 + 0xc3, // return +}; +static const unsigned char sse2_move_src_dst[] = { + 0x0f,0x28,0xe0, // movaps %xmm0,%xmm4 + 0x0f,0x28,0xe9, // movaps %xmm1,%xmm5 + 0x0f,0x28,0xf2, // movaps %xmm2,%xmm6 + 0x0f,0x28,0xfb, // movaps %xmm3,%xmm7 + 0xc3, // return +}; +static const unsigned char sse2_move_dst_src[] = { + 0x0f,0x28,0xc4, // movaps %xmm4,%xmm0 + 0x0f,0x28,0xcd, // movaps %xmm5,%xmm1 + 0x0f,0x28,0xd6, // movaps %xmm6,%xmm2 + 0x0f,0x28,0xdf, // movaps %xmm7,%xmm3 + 0xc3, // return +}; +static const unsigned char sse2_premul[] = { + 0x0f,0x59,0xc3, // mulps %xmm3,%xmm0 + 0x0f,0x59,0xcb, // mulps %xmm3,%xmm1 + 0x0f,0x59,0xd3, // mulps %xmm3,%xmm2 + 0xc3, // return +}; +static const unsigned char sse2_unpremul[] = { + 0x45,0x0f,0x57,0xc0, // xorps %xmm8,%xmm8 + 0x44,0x0f,0xc2,0xc3,0x00, // cmpeqps %xmm3,%xmm8 + 0xf3,0x44,0x0f,0x10,0x09, // movss (%rcx),%xmm9 + 0x45,0x0f,0xc6,0xc9,0x00, // shufps $0x0,%xmm9,%xmm9 + 0x44,0x0f,0x5e,0xcb, // divps %xmm3,%xmm9 + 0x45,0x0f,0x55,0xc1, // andnps %xmm9,%xmm8 + 0x41,0x0f,0x59,0xc0, // mulps %xmm8,%xmm0 + 0x41,0x0f,0x59,0xc8, // mulps %xmm8,%xmm1 + 0x41,0x0f,0x59,0xd0, // mulps %xmm8,%xmm2 + 0xc3, // return +}; +static const unsigned char sse2_from_srgb[] = { + 0xf3,0x44,0x0f,0x10,0x41,0x1c, // movss 0x1c(%rcx),%xmm8 + 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8 + 0x45,0x0f,0x28,0xe8, // movaps %xmm8,%xmm13 + 0x44,0x0f,0x59,0xe8, // mulps %xmm0,%xmm13 + 0x44,0x0f,0x28,0xe0, // movaps %xmm0,%xmm12 + 0x45,0x0f,0x59,0xe4, // mulps %xmm12,%xmm12 + 0xf3,0x44,0x0f,0x10,0x49,0x18, // movss 0x18(%rcx),%xmm9 + 0x45,0x0f,0xc6,0xc9,0x00, // shufps $0x0,%xmm9,%xmm9 + 0xf3,0x44,0x0f,0x10,0x51,0x10, // movss 0x10(%rcx),%xmm10 + 0xf3,0x44,0x0f,0x10,0x59,0x14, // movss 
0x14(%rcx),%xmm11 + 0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11 + 0x45,0x0f,0x28,0xf1, // movaps %xmm9,%xmm14 + 0x44,0x0f,0x59,0xf0, // mulps %xmm0,%xmm14 + 0x45,0x0f,0x58,0xf3, // addps %xmm11,%xmm14 + 0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10 + 0x45,0x0f,0x59,0xf4, // mulps %xmm12,%xmm14 + 0x45,0x0f,0x58,0xf2, // addps %xmm10,%xmm14 + 0xf3,0x44,0x0f,0x10,0x61,0x20, // movss 0x20(%rcx),%xmm12 + 0x45,0x0f,0xc6,0xe4,0x00, // shufps $0x0,%xmm12,%xmm12 + 0x41,0x0f,0xc2,0xc4,0x01, // cmpltps %xmm12,%xmm0 + 0x44,0x0f,0x54,0xe8, // andps %xmm0,%xmm13 + 0x41,0x0f,0x55,0xc6, // andnps %xmm14,%xmm0 + 0x41,0x0f,0x56,0xc5, // orps %xmm13,%xmm0 + 0x45,0x0f,0x28,0xe8, // movaps %xmm8,%xmm13 + 0x44,0x0f,0x59,0xe9, // mulps %xmm1,%xmm13 + 0x44,0x0f,0x28,0xf1, // movaps %xmm1,%xmm14 + 0x45,0x0f,0x59,0xf6, // mulps %xmm14,%xmm14 + 0x45,0x0f,0x28,0xf9, // movaps %xmm9,%xmm15 + 0x44,0x0f,0x59,0xf9, // mulps %xmm1,%xmm15 + 0x45,0x0f,0x58,0xfb, // addps %xmm11,%xmm15 + 0x45,0x0f,0x59,0xfe, // mulps %xmm14,%xmm15 + 0x45,0x0f,0x58,0xfa, // addps %xmm10,%xmm15 + 0x41,0x0f,0xc2,0xcc,0x01, // cmpltps %xmm12,%xmm1 + 0x44,0x0f,0x54,0xe9, // andps %xmm1,%xmm13 + 0x41,0x0f,0x55,0xcf, // andnps %xmm15,%xmm1 + 0x41,0x0f,0x56,0xcd, // orps %xmm13,%xmm1 + 0x44,0x0f,0x59,0xc2, // mulps %xmm2,%xmm8 + 0x44,0x0f,0x28,0xea, // movaps %xmm2,%xmm13 + 0x45,0x0f,0x59,0xed, // mulps %xmm13,%xmm13 + 0x44,0x0f,0x59,0xca, // mulps %xmm2,%xmm9 + 0x45,0x0f,0x58,0xcb, // addps %xmm11,%xmm9 + 0x45,0x0f,0x59,0xcd, // mulps %xmm13,%xmm9 + 0x45,0x0f,0x58,0xca, // addps %xmm10,%xmm9 + 0x41,0x0f,0xc2,0xd4,0x01, // cmpltps %xmm12,%xmm2 + 0x44,0x0f,0x54,0xc2, // andps %xmm2,%xmm8 + 0x41,0x0f,0x55,0xd1, // andnps %xmm9,%xmm2 + 0x41,0x0f,0x56,0xd0, // orps %xmm8,%xmm2 + 0xc3, // return +}; +static const unsigned char sse2_to_srgb[] = { + 0x48,0x83,0xec,0x28, // sub $0x28,%rsp + 0x0f,0x29,0x7c,0x24,0x10, // movaps %xmm7,0x10(%rsp) + 0x0f,0x29,0x34,0x24, // movaps %xmm6,(%rsp) + 0x0f,0x28,0xf5, // 
movaps %xmm5,%xmm6 + 0x0f,0x28,0xec, // movaps %xmm4,%xmm5 + 0x0f,0x28,0xe3, // movaps %xmm3,%xmm4 + 0x44,0x0f,0x52,0xc0, // rsqrtps %xmm0,%xmm8 + 0x45,0x0f,0x53,0xe8, // rcpps %xmm8,%xmm13 + 0x45,0x0f,0x52,0xf8, // rsqrtps %xmm8,%xmm15 + 0xf3,0x0f,0x10,0x19, // movss (%rcx),%xmm3 + 0xf3,0x44,0x0f,0x10,0x41,0x24, // movss 0x24(%rcx),%xmm8 + 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8 + 0x45,0x0f,0x28,0xf0, // movaps %xmm8,%xmm14 + 0x44,0x0f,0x59,0xf0, // mulps %xmm0,%xmm14 + 0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm3,%xmm3 + 0xf3,0x44,0x0f,0x10,0x51,0x28, // movss 0x28(%rcx),%xmm10 + 0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10 + 0xf3,0x44,0x0f,0x10,0x59,0x2c, // movss 0x2c(%rcx),%xmm11 + 0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11 + 0xf3,0x44,0x0f,0x10,0x61,0x30, // movss 0x30(%rcx),%xmm12 + 0x45,0x0f,0xc6,0xe4,0x00, // shufps $0x0,%xmm12,%xmm12 + 0x45,0x0f,0x59,0xeb, // mulps %xmm11,%xmm13 + 0x45,0x0f,0x58,0xec, // addps %xmm12,%xmm13 + 0x45,0x0f,0x59,0xfa, // mulps %xmm10,%xmm15 + 0x45,0x0f,0x58,0xfd, // addps %xmm13,%xmm15 + 0x44,0x0f,0x28,0xcb, // movaps %xmm3,%xmm9 + 0x45,0x0f,0x5d,0xcf, // minps %xmm15,%xmm9 + 0xf3,0x44,0x0f,0x10,0x69,0x34, // movss 0x34(%rcx),%xmm13 + 0x45,0x0f,0xc6,0xed,0x00, // shufps $0x0,%xmm13,%xmm13 + 0x41,0x0f,0xc2,0xc5,0x01, // cmpltps %xmm13,%xmm0 + 0x44,0x0f,0x54,0xf0, // andps %xmm0,%xmm14 + 0x41,0x0f,0x55,0xc1, // andnps %xmm9,%xmm0 + 0x41,0x0f,0x56,0xc6, // orps %xmm14,%xmm0 + 0x44,0x0f,0x52,0xc9, // rsqrtps %xmm1,%xmm9 + 0x45,0x0f,0x53,0xf1, // rcpps %xmm9,%xmm14 + 0x45,0x0f,0x52,0xc9, // rsqrtps %xmm9,%xmm9 + 0x45,0x0f,0x59,0xf3, // mulps %xmm11,%xmm14 + 0x45,0x0f,0x58,0xf4, // addps %xmm12,%xmm14 + 0x45,0x0f,0x59,0xca, // mulps %xmm10,%xmm9 + 0x45,0x0f,0x58,0xce, // addps %xmm14,%xmm9 + 0x44,0x0f,0x28,0xf3, // movaps %xmm3,%xmm14 + 0x45,0x0f,0x5d,0xf1, // minps %xmm9,%xmm14 + 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9 + 0x44,0x0f,0x59,0xc9, // mulps %xmm1,%xmm9 + 0x41,0x0f,0xc2,0xcd,0x01, // 
cmpltps %xmm13,%xmm1 + 0x44,0x0f,0x54,0xc9, // andps %xmm1,%xmm9 + 0x41,0x0f,0x55,0xce, // andnps %xmm14,%xmm1 + 0x41,0x0f,0x56,0xc9, // orps %xmm9,%xmm1 + 0x44,0x0f,0x52,0xca, // rsqrtps %xmm2,%xmm9 + 0x45,0x0f,0x53,0xf1, // rcpps %xmm9,%xmm14 + 0x45,0x0f,0x59,0xf3, // mulps %xmm11,%xmm14 + 0x45,0x0f,0x58,0xf4, // addps %xmm12,%xmm14 + 0x41,0x0f,0x52,0xf9, // rsqrtps %xmm9,%xmm7 + 0x41,0x0f,0x59,0xfa, // mulps %xmm10,%xmm7 + 0x41,0x0f,0x58,0xfe, // addps %xmm14,%xmm7 + 0x0f,0x5d,0xdf, // minps %xmm7,%xmm3 + 0x44,0x0f,0x59,0xc2, // mulps %xmm2,%xmm8 + 0x41,0x0f,0xc2,0xd5,0x01, // cmpltps %xmm13,%xmm2 + 0x44,0x0f,0x54,0xc2, // andps %xmm2,%xmm8 + 0x0f,0x55,0xd3, // andnps %xmm3,%xmm2 + 0x41,0x0f,0x56,0xd0, // orps %xmm8,%xmm2 + 0x0f,0x28,0xdc, // movaps %xmm4,%xmm3 + 0x0f,0x28,0xe5, // movaps %xmm5,%xmm4 + 0x0f,0x28,0xee, // movaps %xmm6,%xmm5 + 0x0f,0x28,0x34,0x24, // movaps (%rsp),%xmm6 + 0x0f,0x28,0x7c,0x24,0x10, // movaps 0x10(%rsp),%xmm7 + 0x48,0x83,0xc4,0x28, // add $0x28,%rsp + 0xc3, // return +}; +static const unsigned char sse2_scale_u8[] = { + 0x48,0x8b,0x02, // mov (%rdx),%rax + 0x66,0x44,0x0f,0x6e,0x04,0x38, // movd (%rax,%rdi,1),%xmm8 + 0x66,0x45,0x0f,0xef,0xc9, // pxor %xmm9,%xmm9 + 0x66,0x45,0x0f,0x60,0xc1, // punpcklbw %xmm9,%xmm8 + 0x66,0x45,0x0f,0x61,0xc1, // punpcklwd %xmm9,%xmm8 + 0x45,0x0f,0x5b,0xc0, // cvtdq2ps %xmm8,%xmm8 + 0xf3,0x44,0x0f,0x10,0x49,0x08, // movss 0x8(%rcx),%xmm9 + 0x45,0x0f,0xc6,0xc9,0x00, // shufps $0x0,%xmm9,%xmm9 + 0x45,0x0f,0x59,0xc8, // mulps %xmm8,%xmm9 + 0x41,0x0f,0x59,0xc1, // mulps %xmm9,%xmm0 + 0x41,0x0f,0x59,0xc9, // mulps %xmm9,%xmm1 + 0x41,0x0f,0x59,0xd1, // mulps %xmm9,%xmm2 + 0x41,0x0f,0x59,0xd9, // mulps %xmm9,%xmm3 + 0xc3, // return +}; +static const unsigned char sse2_load_tables[] = { + 0x48,0x8b,0x02, // mov (%rdx),%rax + 0x4c,0x8b,0x42,0x08, // mov 0x8(%rdx),%r8 + 0xf3,0x44,0x0f,0x6f,0x04,0xb8, // movdqu (%rax,%rdi,4),%xmm8 + 0x66,0x0f,0x6e,0x41,0x0c, // movd 0xc(%rcx),%xmm0 + 0x66,0x0f,0x70,0xc0,0x00, // 
pshufd $0x0,%xmm0,%xmm0 + 0x66,0x45,0x0f,0x6f,0xc8, // movdqa %xmm8,%xmm9 + 0x66,0x41,0x0f,0x72,0xd1,0x08, // psrld $0x8,%xmm9 + 0x66,0x44,0x0f,0xdb,0xc8, // pand %xmm0,%xmm9 + 0x66,0x45,0x0f,0x6f,0xd0, // movdqa %xmm8,%xmm10 + 0x66,0x41,0x0f,0x72,0xd2,0x10, // psrld $0x10,%xmm10 + 0x66,0x44,0x0f,0xdb,0xd0, // pand %xmm0,%xmm10 + 0x66,0x41,0x0f,0xdb,0xc0, // pand %xmm8,%xmm0 + 0x66,0x0f,0x70,0xd8,0x4e, // pshufd $0x4e,%xmm0,%xmm3 + 0x66,0x48,0x0f,0x7e,0xd8, // movq %xmm3,%rax + 0x41,0x89,0xc1, // mov %eax,%r9d + 0x48,0xc1,0xe8,0x20, // shr $0x20,%rax + 0x66,0x49,0x0f,0x7e,0xc2, // movq %xmm0,%r10 + 0x45,0x89,0xd3, // mov %r10d,%r11d + 0x49,0xc1,0xea,0x20, // shr $0x20,%r10 + 0xf3,0x43,0x0f,0x10,0x1c,0x90, // movss (%r8,%r10,4),%xmm3 + 0xf3,0x41,0x0f,0x10,0x04,0x80, // movss (%r8,%rax,4),%xmm0 + 0x0f,0x14,0xd8, // unpcklps %xmm0,%xmm3 + 0xf3,0x43,0x0f,0x10,0x04,0x98, // movss (%r8,%r11,4),%xmm0 + 0xf3,0x43,0x0f,0x10,0x0c,0x88, // movss (%r8,%r9,4),%xmm1 + 0x0f,0x14,0xc1, // unpcklps %xmm1,%xmm0 + 0x0f,0x14,0xc3, // unpcklps %xmm3,%xmm0 + 0x48,0x8b,0x42,0x10, // mov 0x10(%rdx),%rax + 0x66,0x41,0x0f,0x70,0xc9,0x4e, // pshufd $0x4e,%xmm9,%xmm1 + 0x66,0x49,0x0f,0x7e,0xc8, // movq %xmm1,%r8 + 0x45,0x89,0xc1, // mov %r8d,%r9d + 0x49,0xc1,0xe8,0x20, // shr $0x20,%r8 + 0x66,0x4d,0x0f,0x7e,0xca, // movq %xmm9,%r10 + 0x45,0x89,0xd3, // mov %r10d,%r11d + 0x49,0xc1,0xea,0x20, // shr $0x20,%r10 + 0xf3,0x42,0x0f,0x10,0x1c,0x90, // movss (%rax,%r10,4),%xmm3 + 0xf3,0x42,0x0f,0x10,0x0c,0x80, // movss (%rax,%r8,4),%xmm1 + 0x0f,0x14,0xd9, // unpcklps %xmm1,%xmm3 + 0xf3,0x42,0x0f,0x10,0x0c,0x98, // movss (%rax,%r11,4),%xmm1 + 0xf3,0x42,0x0f,0x10,0x14,0x88, // movss (%rax,%r9,4),%xmm2 + 0x0f,0x14,0xca, // unpcklps %xmm2,%xmm1 + 0x0f,0x14,0xcb, // unpcklps %xmm3,%xmm1 + 0x48,0x8b,0x42,0x18, // mov 0x18(%rdx),%rax + 0x66,0x41,0x0f,0x70,0xd2,0x4e, // pshufd $0x4e,%xmm10,%xmm2 + 0x66,0x49,0x0f,0x7e,0xd0, // movq %xmm2,%r8 + 0x45,0x89,0xc1, // mov %r8d,%r9d + 0x49,0xc1,0xe8,0x20, // shr 
$0x20,%r8 + 0x66,0x4d,0x0f,0x7e,0xd2, // movq %xmm10,%r10 + 0x45,0x89,0xd3, // mov %r10d,%r11d + 0x49,0xc1,0xea,0x20, // shr $0x20,%r10 + 0xf3,0x46,0x0f,0x10,0x0c,0x90, // movss (%rax,%r10,4),%xmm9 + 0xf3,0x42,0x0f,0x10,0x14,0x80, // movss (%rax,%r8,4),%xmm2 + 0x44,0x0f,0x14,0xca, // unpcklps %xmm2,%xmm9 + 0xf3,0x42,0x0f,0x10,0x14,0x98, // movss (%rax,%r11,4),%xmm2 + 0xf3,0x42,0x0f,0x10,0x1c,0x88, // movss (%rax,%r9,4),%xmm3 + 0x0f,0x14,0xd3, // unpcklps %xmm3,%xmm2 + 0x41,0x0f,0x14,0xd1, // unpcklps %xmm9,%xmm2 + 0x66,0x41,0x0f,0x72,0xd0,0x18, // psrld $0x18,%xmm8 + 0x45,0x0f,0x5b,0xc0, // cvtdq2ps %xmm8,%xmm8 + 0xf3,0x0f,0x10,0x59,0x08, // movss 0x8(%rcx),%xmm3 + 0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm3,%xmm3 + 0x41,0x0f,0x59,0xd8, // mulps %xmm8,%xmm3 + 0xc3, // return +}; +static const unsigned char sse2_load_8888[] = { + 0x48,0x8b,0x02, // mov (%rdx),%rax + 0xf3,0x0f,0x6f,0x1c,0xb8, // movdqu (%rax,%rdi,4),%xmm3 + 0x66,0x0f,0x6e,0x41,0x0c, // movd 0xc(%rcx),%xmm0 + 0x66,0x0f,0x70,0xc0,0x00, // pshufd $0x0,%xmm0,%xmm0 + 0x66,0x0f,0x6f,0xcb, // movdqa %xmm3,%xmm1 + 0x66,0x0f,0x72,0xd1,0x08, // psrld $0x8,%xmm1 + 0x66,0x0f,0xdb,0xc8, // pand %xmm0,%xmm1 + 0x66,0x0f,0x6f,0xd3, // movdqa %xmm3,%xmm2 + 0x66,0x0f,0x72,0xd2,0x10, // psrld $0x10,%xmm2 + 0x66,0x0f,0xdb,0xd0, // pand %xmm0,%xmm2 + 0x66,0x0f,0xdb,0xc3, // pand %xmm3,%xmm0 + 0x0f,0x5b,0xc0, // cvtdq2ps %xmm0,%xmm0 + 0xf3,0x44,0x0f,0x10,0x41,0x08, // movss 0x8(%rcx),%xmm8 + 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8 + 0x41,0x0f,0x59,0xc0, // mulps %xmm8,%xmm0 + 0x0f,0x5b,0xc9, // cvtdq2ps %xmm1,%xmm1 + 0x41,0x0f,0x59,0xc8, // mulps %xmm8,%xmm1 + 0x0f,0x5b,0xd2, // cvtdq2ps %xmm2,%xmm2 + 0x41,0x0f,0x59,0xd0, // mulps %xmm8,%xmm2 + 0x66,0x0f,0x72,0xd3,0x18, // psrld $0x18,%xmm3 + 0x0f,0x5b,0xdb, // cvtdq2ps %xmm3,%xmm3 + 0x41,0x0f,0x59,0xd8, // mulps %xmm8,%xmm3 + 0xc3, // return +}; +static const unsigned char sse2_store_8888[] = { + 0x48,0x8b,0x02, // mov (%rdx),%rax + 
0xf3,0x44,0x0f,0x10,0x41,0x04, // movss 0x4(%rcx),%xmm8 + 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8 + 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9 + 0x44,0x0f,0x59,0xc8, // mulps %xmm0,%xmm9 + 0x66,0x45,0x0f,0x5b,0xc9, // cvtps2dq %xmm9,%xmm9 + 0x45,0x0f,0x28,0xd0, // movaps %xmm8,%xmm10 + 0x44,0x0f,0x59,0xd1, // mulps %xmm1,%xmm10 + 0x66,0x45,0x0f,0x5b,0xd2, // cvtps2dq %xmm10,%xmm10 + 0x66,0x41,0x0f,0x72,0xf2,0x08, // pslld $0x8,%xmm10 + 0x66,0x45,0x0f,0xeb,0xd1, // por %xmm9,%xmm10 + 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9 + 0x44,0x0f,0x59,0xca, // mulps %xmm2,%xmm9 + 0x66,0x45,0x0f,0x5b,0xc9, // cvtps2dq %xmm9,%xmm9 + 0x66,0x41,0x0f,0x72,0xf1,0x10, // pslld $0x10,%xmm9 + 0x44,0x0f,0x59,0xc3, // mulps %xmm3,%xmm8 + 0x66,0x45,0x0f,0x5b,0xc0, // cvtps2dq %xmm8,%xmm8 + 0x66,0x41,0x0f,0x72,0xf0,0x18, // pslld $0x18,%xmm8 + 0x66,0x45,0x0f,0xeb,0xc1, // por %xmm9,%xmm8 + 0x66,0x45,0x0f,0xeb,0xc2, // por %xmm10,%xmm8 + 0xf3,0x44,0x0f,0x7f,0x04,0xb8, // movdqu %xmm8,(%rax,%rdi,4) + 0xc3, // return +}; +static const unsigned char sse2_load_f16[] = { + 0x48,0x8b,0x02, // mov (%rdx),%rax + 0xf3,0x0f,0x6f,0x04,0xf8, // movdqu (%rax,%rdi,8),%xmm0 + 0xf3,0x0f,0x6f,0x4c,0xf8,0x10, // movdqu 0x10(%rax,%rdi,8),%xmm1 + 0x66,0x0f,0x6f,0xd8, // movdqa %xmm0,%xmm3 + 0x66,0x0f,0x61,0xd9, // punpcklwd %xmm1,%xmm3 + 0x66,0x0f,0x69,0xc1, // punpckhwd %xmm1,%xmm0 + 0x66,0x0f,0x6f,0xcb, // movdqa %xmm3,%xmm1 + 0x66,0x0f,0x61,0xc8, // punpcklwd %xmm0,%xmm1 + 0x66,0x0f,0x69,0xd8, // punpckhwd %xmm0,%xmm3 + 0x66,0x45,0x0f,0xef,0xc0, // pxor %xmm8,%xmm8 + 0x66,0x0f,0x6f,0xc1, // movdqa %xmm1,%xmm0 + 0x66,0x41,0x0f,0x61,0xc0, // punpcklwd %xmm8,%xmm0 + 0x66,0x0f,0x72,0xf0,0x0d, // pslld $0xd,%xmm0 + 0x66,0x0f,0x6e,0x51,0x38, // movd 0x38(%rcx),%xmm2 + 0x66,0x44,0x0f,0x70,0xca,0x00, // pshufd $0x0,%xmm2,%xmm9 + 0x41,0x0f,0x59,0xc1, // mulps %xmm9,%xmm0 + 0x66,0x41,0x0f,0x69,0xc8, // punpckhwd %xmm8,%xmm1 + 0x66,0x0f,0x72,0xf1,0x0d, // pslld $0xd,%xmm1 + 0x41,0x0f,0x59,0xc9, // mulps 
%xmm9,%xmm1 + 0x66,0x0f,0x6f,0xd3, // movdqa %xmm3,%xmm2 + 0x66,0x41,0x0f,0x61,0xd0, // punpcklwd %xmm8,%xmm2 + 0x66,0x0f,0x72,0xf2,0x0d, // pslld $0xd,%xmm2 + 0x41,0x0f,0x59,0xd1, // mulps %xmm9,%xmm2 + 0x66,0x41,0x0f,0x69,0xd8, // punpckhwd %xmm8,%xmm3 + 0x66,0x0f,0x72,0xf3,0x0d, // pslld $0xd,%xmm3 + 0x41,0x0f,0x59,0xd9, // mulps %xmm9,%xmm3 + 0xc3, // return +}; +static const unsigned char sse2_store_f16[] = { + 0x48,0x8b,0x02, // mov (%rdx),%rax + 0x66,0x44,0x0f,0x6e,0x41,0x3c, // movd 0x3c(%rcx),%xmm8 + 0x66,0x45,0x0f,0x70,0xc0,0x00, // pshufd $0x0,%xmm8,%xmm8 + 0x66,0x45,0x0f,0x6f,0xc8, // movdqa %xmm8,%xmm9 + 0x44,0x0f,0x59,0xc8, // mulps %xmm0,%xmm9 + 0x66,0x41,0x0f,0x72,0xd1,0x0d, // psrld $0xd,%xmm9 + 0x66,0x45,0x0f,0x6f,0xd0, // movdqa %xmm8,%xmm10 + 0x44,0x0f,0x59,0xd1, // mulps %xmm1,%xmm10 + 0x66,0x41,0x0f,0x72,0xd2,0x0d, // psrld $0xd,%xmm10 + 0x66,0x45,0x0f,0x6f,0xd8, // movdqa %xmm8,%xmm11 + 0x44,0x0f,0x59,0xda, // mulps %xmm2,%xmm11 + 0x66,0x41,0x0f,0x72,0xd3,0x0d, // psrld $0xd,%xmm11 + 0x44,0x0f,0x59,0xc3, // mulps %xmm3,%xmm8 + 0x66,0x41,0x0f,0x72,0xd0,0x0d, // psrld $0xd,%xmm8 + 0x66,0x41,0x0f,0x73,0xfa,0x02, // pslldq $0x2,%xmm10 + 0x66,0x45,0x0f,0xeb,0xd1, // por %xmm9,%xmm10 + 0x66,0x41,0x0f,0x73,0xf8,0x02, // pslldq $0x2,%xmm8 + 0x66,0x45,0x0f,0xeb,0xc3, // por %xmm11,%xmm8 + 0x66,0x45,0x0f,0x6f,0xca, // movdqa %xmm10,%xmm9 + 0x66,0x45,0x0f,0x62,0xc8, // punpckldq %xmm8,%xmm9 + 0xf3,0x44,0x0f,0x7f,0x0c,0xf8, // movdqu %xmm9,(%rax,%rdi,8) + 0x66,0x45,0x0f,0x6a,0xd0, // punpckhdq %xmm8,%xmm10 + 0xf3,0x44,0x0f,0x7f,0x54,0xf8,0x10, // movdqu %xmm10,0x10(%rax,%rdi,8) + 0xc3, // return +}; +static const unsigned char sse2_matrix_3x4[] = { + 0x44,0x0f,0x28,0xc9, // movaps %xmm1,%xmm9 + 0x44,0x0f,0x28,0xc0, // movaps %xmm0,%xmm8 + 0xf3,0x0f,0x10,0x02, // movss (%rdx),%xmm0 + 0xf3,0x0f,0x10,0x4a,0x04, // movss 0x4(%rdx),%xmm1 + 0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm0,%xmm0 + 0xf3,0x44,0x0f,0x10,0x52,0x0c, // movss 0xc(%rdx),%xmm10 + 
0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10 + 0xf3,0x44,0x0f,0x10,0x5a,0x18, // movss 0x18(%rdx),%xmm11 + 0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11 + 0xf3,0x44,0x0f,0x10,0x62,0x24, // movss 0x24(%rdx),%xmm12 + 0x45,0x0f,0xc6,0xe4,0x00, // shufps $0x0,%xmm12,%xmm12 + 0x44,0x0f,0x59,0xda, // mulps %xmm2,%xmm11 + 0x45,0x0f,0x58,0xdc, // addps %xmm12,%xmm11 + 0x45,0x0f,0x59,0xd1, // mulps %xmm9,%xmm10 + 0x45,0x0f,0x58,0xd3, // addps %xmm11,%xmm10 + 0x41,0x0f,0x59,0xc0, // mulps %xmm8,%xmm0 + 0x41,0x0f,0x58,0xc2, // addps %xmm10,%xmm0 + 0x0f,0xc6,0xc9,0x00, // shufps $0x0,%xmm1,%xmm1 + 0xf3,0x44,0x0f,0x10,0x52,0x10, // movss 0x10(%rdx),%xmm10 + 0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10 + 0xf3,0x44,0x0f,0x10,0x5a,0x1c, // movss 0x1c(%rdx),%xmm11 + 0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11 + 0xf3,0x44,0x0f,0x10,0x62,0x28, // movss 0x28(%rdx),%xmm12 + 0x45,0x0f,0xc6,0xe4,0x00, // shufps $0x0,%xmm12,%xmm12 + 0x44,0x0f,0x59,0xda, // mulps %xmm2,%xmm11 + 0x45,0x0f,0x58,0xdc, // addps %xmm12,%xmm11 + 0x45,0x0f,0x59,0xd1, // mulps %xmm9,%xmm10 + 0x45,0x0f,0x58,0xd3, // addps %xmm11,%xmm10 + 0x41,0x0f,0x59,0xc8, // mulps %xmm8,%xmm1 + 0x41,0x0f,0x58,0xca, // addps %xmm10,%xmm1 + 0xf3,0x44,0x0f,0x10,0x52,0x08, // movss 0x8(%rdx),%xmm10 + 0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10 + 0xf3,0x44,0x0f,0x10,0x5a,0x14, // movss 0x14(%rdx),%xmm11 + 0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11 + 0xf3,0x44,0x0f,0x10,0x62,0x20, // movss 0x20(%rdx),%xmm12 + 0x45,0x0f,0xc6,0xe4,0x00, // shufps $0x0,%xmm12,%xmm12 + 0xf3,0x44,0x0f,0x10,0x6a,0x2c, // movss 0x2c(%rdx),%xmm13 + 0x45,0x0f,0xc6,0xed,0x00, // shufps $0x0,%xmm13,%xmm13 + 0x44,0x0f,0x59,0xe2, // mulps %xmm2,%xmm12 + 0x45,0x0f,0x58,0xe5, // addps %xmm13,%xmm12 + 0x45,0x0f,0x59,0xd9, // mulps %xmm9,%xmm11 + 0x45,0x0f,0x58,0xdc, // addps %xmm12,%xmm11 + 0x45,0x0f,0x59,0xd0, // mulps %xmm8,%xmm10 + 0x45,0x0f,0x58,0xd3, // addps %xmm11,%xmm10 + 0x41,0x0f,0x28,0xd2, // 
movaps %xmm10,%xmm2 + 0xc3, // return +}; static const unsigned char hsw_inc_x[] = { 0x48,0x83,0xc7,0x08, // add $0x8,%rdi 0xc3, // return diff --git a/src/splicer/build_stages.py b/src/splicer/build_stages.py index 354cfb37ef..21d94d8dd4 100755 --- a/src/splicer/build_stages.py +++ b/src/splicer/build_stages.py @@ -19,18 +19,18 @@ objdump = 'gobjdump' cflags = '-std=c++11 -Os -fomit-frame-pointer'.split() -sse2 = '-msse2 -mno-sse3 -mno-ssse3 -mno-sse4.1'.split() +sse2 = '-mno-red-zone -msse2 -mno-sse3 -mno-ssse3 -mno-sse4.1'.split() subprocess.check_call(['clang++'] + cflags + sse2 + ['-c', 'src/splicer/SkSplicer_stages.cpp'] + ['-o', 'sse2.o']) -sse41 = '-msse4.1'.split() +sse41 = '-mno-red-zone -msse4.1'.split() subprocess.check_call(['clang++'] + cflags + sse41 + ['-c', 'src/splicer/SkSplicer_stages.cpp'] + ['-o', 'sse41.o']) -hsw = '-mavx2 -mfma -mf16c'.split() +hsw = '-mno-red-zone -mavx2 -mfma -mf16c'.split() subprocess.check_call(['clang++'] + cflags + hsw + ['-c', 'src/splicer/SkSplicer_stages.cpp'] + ['-o', 'hsw.o']) @@ -111,7 +111,7 @@ print '''/* parse_object_file('aarch64.o', 'unsigned int', '14000000', 'd65f03c0') parse_object_file( 'armv7.o', 'unsigned int', 'eafffffe', 'e12fff1e', target='elf32-littlearm') -#parse_object_file( 'sse2.o', 'unsigned char', 'e9 00 00 00 00', 'c3') +parse_object_file( 'sse2.o', 'unsigned char', 'e9 00 00 00 00', 'c3') #parse_object_file('sse41.o', 'unsigned char', 'e9 00 00 00 00', 'c3') parse_object_file( 'hsw.o', 'unsigned char', 'e9 00 00 00 00', 'c3') print '#endif//SkSplicer_generated_DEFINED' |