aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author: Mike Klein <mtklein@chromium.org>, 2017-02-11 13:21:29 -0500
committer: Skia Commit-Bot <skia-commit-bot@chromium.org>, 2017-02-13 21:08:10 +0000
commit: 6fcea9d4366f385260e7a59c84a5456c9de2d844 (patch)
tree: a595d0258cac7c38cfee2186e8fcaf1463c327af
parent: 07f665efb918f68e406b76a78d0b76d5c714f16c (diff)
Enable sse2 backend for SkSplicer.
One more piece of https://skia-review.googlesource.com/c/8230/.

-mno-red-zone makes it safe for x86 stages to use the stack on Windows (at the expense of an extra sub and add to the stack pointer on !Windows).

CQ_INCLUDE_TRYBOTS=skia.primary:Test-Win2k8-MSVC-GCE-CPU-AVX2-x86_64-Debug,Test-Win10-MSVC-Golo-GPU-GT610-x86_64-Release,Test-Mac-Clang-MacMini6.2-CPU-AVX-x86_64-Debug

Change-Id: I81f8220e790b201757a7e1e9752b2fe94520ccbb
Reviewed-on: https://skia-review.googlesource.com/8352
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
-rw-r--r--  src/splicer/SkSplicer.cpp          28
-rw-r--r--  src/splicer/SkSplicer_generated.h  477
-rwxr-xr-x  src/splicer/build_stages.py        8
3 files changed, 499 insertions, 14 deletions
diff --git a/src/splicer/SkSplicer.cpp b/src/splicer/SkSplicer.cpp
index b3336cccab..8447c9b45f 100644
--- a/src/splicer/SkSplicer.cpp
+++ b/src/splicer/SkSplicer.cpp
@@ -112,12 +112,6 @@ namespace {
splice(buf, jb_near); // jb <next 4 bytes> (b == "before", unsigned less than)
splice(buf, loop_start - (int)(buf->bytesWritten() + 4));
}
- static void ret(SkWStream* buf) {
- static const uint8_t vzeroupper[] = { 0xc5, 0xf8, 0x77 };
- static const uint8_t ret[] = { 0xc3 };
- splice(buf, vzeroupper);
- splice(buf, ret);
- }
#endif
#if defined(_MSC_VER)
@@ -279,6 +273,7 @@ namespace {
DEFINE_SPLICE_STAGE(armv7)
#else
DEFINE_SPLICE_STAGE(hsw)
+ DEFINE_SPLICE_STAGE(sse2)
#endif
#undef DEFINE_SPLICE
#undef CASE
@@ -305,12 +300,25 @@ namespace {
auto splice_stage = armv7_splice_stage;
auto inc_x = [](SkWStream* buf) { splice_until_ret(buf, armv7_inc_x); };
#else
- // To keep things simple, only one x86 target supported: Haswell+ x86-64.
- if (!SkCpu::Supports(SkCpu::HSW) || sizeof(void*) != 8) {
+ // To keep things simple, only x86-64 supported.
+ if (sizeof(void*) != 8) {
return;
}
- auto splice_stage = hsw_splice_stage;
- auto inc_x = [&](SkWStream* buf) { splice_until_ret(buf, hsw_inc_x); };
+ bool hsw = true && SkCpu::Supports(SkCpu::HSW);
+
+ auto splice_stage = hsw ? hsw_splice_stage : sse2_splice_stage;
+ auto inc_x = [hsw](SkWStream* buf) {
+ if (hsw) { splice_until_ret(buf, hsw_inc_x); }
+ else { splice_until_ret(buf, sse2_inc_x); }
+ };
+ auto ret = [hsw](SkWStream* buf) {
+ static const uint8_t vzeroupper[] = { 0xc5, 0xf8, 0x77 };
+ static const uint8_t ret[] = { 0xc3 };
+ if (hsw) {
+ splice(buf, vzeroupper);
+ }
+ splice(buf, ret);
+ };
#endif
SkDynamicMemoryWStream buf;
diff --git a/src/splicer/SkSplicer_generated.h b/src/splicer/SkSplicer_generated.h
index fa8a72d8ff..5f16b5c66f 100644
--- a/src/splicer/SkSplicer_generated.h
+++ b/src/splicer/SkSplicer_generated.h
@@ -775,6 +775,483 @@ static const unsigned int armv7_matrix_3x4[] = {
0xf22211b2, // vorr d1, d18, d18
0xe12fff1e, // return
};
+static const unsigned char sse2_inc_x[] = {
+ 0x48,0x83,0xc7,0x04, // add $0x4,%rdi
+ 0xc3, // return
+};
+static const unsigned char sse2_clear[] = {
+ 0x0f,0x57,0xc0, // xorps %xmm0,%xmm0
+ 0x0f,0x57,0xc9, // xorps %xmm1,%xmm1
+ 0x0f,0x57,0xd2, // xorps %xmm2,%xmm2
+ 0x0f,0x57,0xdb, // xorps %xmm3,%xmm3
+ 0xc3, // return
+};
+static const unsigned char sse2_plus_[] = {
+ 0x0f,0x58,0xc4, // addps %xmm4,%xmm0
+ 0x0f,0x58,0xcd, // addps %xmm5,%xmm1
+ 0x0f,0x58,0xd6, // addps %xmm6,%xmm2
+ 0x0f,0x58,0xdf, // addps %xmm7,%xmm3
+ 0xc3, // return
+};
+static const unsigned char sse2_srcover[] = {
+ 0xf3,0x44,0x0f,0x10,0x01, // movss (%rcx),%xmm8
+ 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
+ 0x44,0x0f,0x5c,0xc3, // subps %xmm3,%xmm8
+ 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9
+ 0x44,0x0f,0x59,0xcc, // mulps %xmm4,%xmm9
+ 0x41,0x0f,0x58,0xc1, // addps %xmm9,%xmm0
+ 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9
+ 0x44,0x0f,0x59,0xcd, // mulps %xmm5,%xmm9
+ 0x41,0x0f,0x58,0xc9, // addps %xmm9,%xmm1
+ 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9
+ 0x44,0x0f,0x59,0xce, // mulps %xmm6,%xmm9
+ 0x41,0x0f,0x58,0xd1, // addps %xmm9,%xmm2
+ 0x44,0x0f,0x59,0xc7, // mulps %xmm7,%xmm8
+ 0x41,0x0f,0x58,0xd8, // addps %xmm8,%xmm3
+ 0xc3, // return
+};
+static const unsigned char sse2_dstover[] = {
+ 0xf3,0x44,0x0f,0x10,0x01, // movss (%rcx),%xmm8
+ 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
+ 0x44,0x0f,0x5c,0xc7, // subps %xmm7,%xmm8
+ 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9
+ 0x44,0x0f,0x59,0xc8, // mulps %xmm0,%xmm9
+ 0x41,0x0f,0x58,0xe1, // addps %xmm9,%xmm4
+ 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9
+ 0x44,0x0f,0x59,0xc9, // mulps %xmm1,%xmm9
+ 0x41,0x0f,0x58,0xe9, // addps %xmm9,%xmm5
+ 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9
+ 0x44,0x0f,0x59,0xca, // mulps %xmm2,%xmm9
+ 0x41,0x0f,0x58,0xf1, // addps %xmm9,%xmm6
+ 0x44,0x0f,0x59,0xc3, // mulps %xmm3,%xmm8
+ 0x41,0x0f,0x58,0xf8, // addps %xmm8,%xmm7
+ 0xc3, // return
+};
+static const unsigned char sse2_clamp_0[] = {
+ 0x45,0x0f,0x57,0xc0, // xorps %xmm8,%xmm8
+ 0x41,0x0f,0x5f,0xc0, // maxps %xmm8,%xmm0
+ 0x41,0x0f,0x5f,0xc8, // maxps %xmm8,%xmm1
+ 0x41,0x0f,0x5f,0xd0, // maxps %xmm8,%xmm2
+ 0x41,0x0f,0x5f,0xd8, // maxps %xmm8,%xmm3
+ 0xc3, // return
+};
+static const unsigned char sse2_clamp_1[] = {
+ 0xf3,0x44,0x0f,0x10,0x01, // movss (%rcx),%xmm8
+ 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
+ 0x41,0x0f,0x5d,0xc0, // minps %xmm8,%xmm0
+ 0x41,0x0f,0x5d,0xc8, // minps %xmm8,%xmm1
+ 0x41,0x0f,0x5d,0xd0, // minps %xmm8,%xmm2
+ 0x41,0x0f,0x5d,0xd8, // minps %xmm8,%xmm3
+ 0xc3, // return
+};
+static const unsigned char sse2_clamp_a[] = {
+ 0xf3,0x44,0x0f,0x10,0x01, // movss (%rcx),%xmm8
+ 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
+ 0x41,0x0f,0x5d,0xd8, // minps %xmm8,%xmm3
+ 0x0f,0x5d,0xc3, // minps %xmm3,%xmm0
+ 0x0f,0x5d,0xcb, // minps %xmm3,%xmm1
+ 0x0f,0x5d,0xd3, // minps %xmm3,%xmm2
+ 0xc3, // return
+};
+static const unsigned char sse2_swap[] = {
+ 0x44,0x0f,0x28,0xc3, // movaps %xmm3,%xmm8
+ 0x44,0x0f,0x28,0xca, // movaps %xmm2,%xmm9
+ 0x44,0x0f,0x28,0xd1, // movaps %xmm1,%xmm10
+ 0x44,0x0f,0x28,0xd8, // movaps %xmm0,%xmm11
+ 0x0f,0x28,0xc4, // movaps %xmm4,%xmm0
+ 0x0f,0x28,0xcd, // movaps %xmm5,%xmm1
+ 0x0f,0x28,0xd6, // movaps %xmm6,%xmm2
+ 0x0f,0x28,0xdf, // movaps %xmm7,%xmm3
+ 0x41,0x0f,0x28,0xe3, // movaps %xmm11,%xmm4
+ 0x41,0x0f,0x28,0xea, // movaps %xmm10,%xmm5
+ 0x41,0x0f,0x28,0xf1, // movaps %xmm9,%xmm6
+ 0x41,0x0f,0x28,0xf8, // movaps %xmm8,%xmm7
+ 0xc3, // return
+};
+static const unsigned char sse2_move_src_dst[] = {
+ 0x0f,0x28,0xe0, // movaps %xmm0,%xmm4
+ 0x0f,0x28,0xe9, // movaps %xmm1,%xmm5
+ 0x0f,0x28,0xf2, // movaps %xmm2,%xmm6
+ 0x0f,0x28,0xfb, // movaps %xmm3,%xmm7
+ 0xc3, // return
+};
+static const unsigned char sse2_move_dst_src[] = {
+ 0x0f,0x28,0xc4, // movaps %xmm4,%xmm0
+ 0x0f,0x28,0xcd, // movaps %xmm5,%xmm1
+ 0x0f,0x28,0xd6, // movaps %xmm6,%xmm2
+ 0x0f,0x28,0xdf, // movaps %xmm7,%xmm3
+ 0xc3, // return
+};
+static const unsigned char sse2_premul[] = {
+ 0x0f,0x59,0xc3, // mulps %xmm3,%xmm0
+ 0x0f,0x59,0xcb, // mulps %xmm3,%xmm1
+ 0x0f,0x59,0xd3, // mulps %xmm3,%xmm2
+ 0xc3, // return
+};
+static const unsigned char sse2_unpremul[] = {
+ 0x45,0x0f,0x57,0xc0, // xorps %xmm8,%xmm8
+ 0x44,0x0f,0xc2,0xc3,0x00, // cmpeqps %xmm3,%xmm8
+ 0xf3,0x44,0x0f,0x10,0x09, // movss (%rcx),%xmm9
+ 0x45,0x0f,0xc6,0xc9,0x00, // shufps $0x0,%xmm9,%xmm9
+ 0x44,0x0f,0x5e,0xcb, // divps %xmm3,%xmm9
+ 0x45,0x0f,0x55,0xc1, // andnps %xmm9,%xmm8
+ 0x41,0x0f,0x59,0xc0, // mulps %xmm8,%xmm0
+ 0x41,0x0f,0x59,0xc8, // mulps %xmm8,%xmm1
+ 0x41,0x0f,0x59,0xd0, // mulps %xmm8,%xmm2
+ 0xc3, // return
+};
+static const unsigned char sse2_from_srgb[] = {
+ 0xf3,0x44,0x0f,0x10,0x41,0x1c, // movss 0x1c(%rcx),%xmm8
+ 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
+ 0x45,0x0f,0x28,0xe8, // movaps %xmm8,%xmm13
+ 0x44,0x0f,0x59,0xe8, // mulps %xmm0,%xmm13
+ 0x44,0x0f,0x28,0xe0, // movaps %xmm0,%xmm12
+ 0x45,0x0f,0x59,0xe4, // mulps %xmm12,%xmm12
+ 0xf3,0x44,0x0f,0x10,0x49,0x18, // movss 0x18(%rcx),%xmm9
+ 0x45,0x0f,0xc6,0xc9,0x00, // shufps $0x0,%xmm9,%xmm9
+ 0xf3,0x44,0x0f,0x10,0x51,0x10, // movss 0x10(%rcx),%xmm10
+ 0xf3,0x44,0x0f,0x10,0x59,0x14, // movss 0x14(%rcx),%xmm11
+ 0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11
+ 0x45,0x0f,0x28,0xf1, // movaps %xmm9,%xmm14
+ 0x44,0x0f,0x59,0xf0, // mulps %xmm0,%xmm14
+ 0x45,0x0f,0x58,0xf3, // addps %xmm11,%xmm14
+ 0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10
+ 0x45,0x0f,0x59,0xf4, // mulps %xmm12,%xmm14
+ 0x45,0x0f,0x58,0xf2, // addps %xmm10,%xmm14
+ 0xf3,0x44,0x0f,0x10,0x61,0x20, // movss 0x20(%rcx),%xmm12
+ 0x45,0x0f,0xc6,0xe4,0x00, // shufps $0x0,%xmm12,%xmm12
+ 0x41,0x0f,0xc2,0xc4,0x01, // cmpltps %xmm12,%xmm0
+ 0x44,0x0f,0x54,0xe8, // andps %xmm0,%xmm13
+ 0x41,0x0f,0x55,0xc6, // andnps %xmm14,%xmm0
+ 0x41,0x0f,0x56,0xc5, // orps %xmm13,%xmm0
+ 0x45,0x0f,0x28,0xe8, // movaps %xmm8,%xmm13
+ 0x44,0x0f,0x59,0xe9, // mulps %xmm1,%xmm13
+ 0x44,0x0f,0x28,0xf1, // movaps %xmm1,%xmm14
+ 0x45,0x0f,0x59,0xf6, // mulps %xmm14,%xmm14
+ 0x45,0x0f,0x28,0xf9, // movaps %xmm9,%xmm15
+ 0x44,0x0f,0x59,0xf9, // mulps %xmm1,%xmm15
+ 0x45,0x0f,0x58,0xfb, // addps %xmm11,%xmm15
+ 0x45,0x0f,0x59,0xfe, // mulps %xmm14,%xmm15
+ 0x45,0x0f,0x58,0xfa, // addps %xmm10,%xmm15
+ 0x41,0x0f,0xc2,0xcc,0x01, // cmpltps %xmm12,%xmm1
+ 0x44,0x0f,0x54,0xe9, // andps %xmm1,%xmm13
+ 0x41,0x0f,0x55,0xcf, // andnps %xmm15,%xmm1
+ 0x41,0x0f,0x56,0xcd, // orps %xmm13,%xmm1
+ 0x44,0x0f,0x59,0xc2, // mulps %xmm2,%xmm8
+ 0x44,0x0f,0x28,0xea, // movaps %xmm2,%xmm13
+ 0x45,0x0f,0x59,0xed, // mulps %xmm13,%xmm13
+ 0x44,0x0f,0x59,0xca, // mulps %xmm2,%xmm9
+ 0x45,0x0f,0x58,0xcb, // addps %xmm11,%xmm9
+ 0x45,0x0f,0x59,0xcd, // mulps %xmm13,%xmm9
+ 0x45,0x0f,0x58,0xca, // addps %xmm10,%xmm9
+ 0x41,0x0f,0xc2,0xd4,0x01, // cmpltps %xmm12,%xmm2
+ 0x44,0x0f,0x54,0xc2, // andps %xmm2,%xmm8
+ 0x41,0x0f,0x55,0xd1, // andnps %xmm9,%xmm2
+ 0x41,0x0f,0x56,0xd0, // orps %xmm8,%xmm2
+ 0xc3, // return
+};
+static const unsigned char sse2_to_srgb[] = {
+ 0x48,0x83,0xec,0x28, // sub $0x28,%rsp
+ 0x0f,0x29,0x7c,0x24,0x10, // movaps %xmm7,0x10(%rsp)
+ 0x0f,0x29,0x34,0x24, // movaps %xmm6,(%rsp)
+ 0x0f,0x28,0xf5, // movaps %xmm5,%xmm6
+ 0x0f,0x28,0xec, // movaps %xmm4,%xmm5
+ 0x0f,0x28,0xe3, // movaps %xmm3,%xmm4
+ 0x44,0x0f,0x52,0xc0, // rsqrtps %xmm0,%xmm8
+ 0x45,0x0f,0x53,0xe8, // rcpps %xmm8,%xmm13
+ 0x45,0x0f,0x52,0xf8, // rsqrtps %xmm8,%xmm15
+ 0xf3,0x0f,0x10,0x19, // movss (%rcx),%xmm3
+ 0xf3,0x44,0x0f,0x10,0x41,0x24, // movss 0x24(%rcx),%xmm8
+ 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
+ 0x45,0x0f,0x28,0xf0, // movaps %xmm8,%xmm14
+ 0x44,0x0f,0x59,0xf0, // mulps %xmm0,%xmm14
+ 0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm3,%xmm3
+ 0xf3,0x44,0x0f,0x10,0x51,0x28, // movss 0x28(%rcx),%xmm10
+ 0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10
+ 0xf3,0x44,0x0f,0x10,0x59,0x2c, // movss 0x2c(%rcx),%xmm11
+ 0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11
+ 0xf3,0x44,0x0f,0x10,0x61,0x30, // movss 0x30(%rcx),%xmm12
+ 0x45,0x0f,0xc6,0xe4,0x00, // shufps $0x0,%xmm12,%xmm12
+ 0x45,0x0f,0x59,0xeb, // mulps %xmm11,%xmm13
+ 0x45,0x0f,0x58,0xec, // addps %xmm12,%xmm13
+ 0x45,0x0f,0x59,0xfa, // mulps %xmm10,%xmm15
+ 0x45,0x0f,0x58,0xfd, // addps %xmm13,%xmm15
+ 0x44,0x0f,0x28,0xcb, // movaps %xmm3,%xmm9
+ 0x45,0x0f,0x5d,0xcf, // minps %xmm15,%xmm9
+ 0xf3,0x44,0x0f,0x10,0x69,0x34, // movss 0x34(%rcx),%xmm13
+ 0x45,0x0f,0xc6,0xed,0x00, // shufps $0x0,%xmm13,%xmm13
+ 0x41,0x0f,0xc2,0xc5,0x01, // cmpltps %xmm13,%xmm0
+ 0x44,0x0f,0x54,0xf0, // andps %xmm0,%xmm14
+ 0x41,0x0f,0x55,0xc1, // andnps %xmm9,%xmm0
+ 0x41,0x0f,0x56,0xc6, // orps %xmm14,%xmm0
+ 0x44,0x0f,0x52,0xc9, // rsqrtps %xmm1,%xmm9
+ 0x45,0x0f,0x53,0xf1, // rcpps %xmm9,%xmm14
+ 0x45,0x0f,0x52,0xc9, // rsqrtps %xmm9,%xmm9
+ 0x45,0x0f,0x59,0xf3, // mulps %xmm11,%xmm14
+ 0x45,0x0f,0x58,0xf4, // addps %xmm12,%xmm14
+ 0x45,0x0f,0x59,0xca, // mulps %xmm10,%xmm9
+ 0x45,0x0f,0x58,0xce, // addps %xmm14,%xmm9
+ 0x44,0x0f,0x28,0xf3, // movaps %xmm3,%xmm14
+ 0x45,0x0f,0x5d,0xf1, // minps %xmm9,%xmm14
+ 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9
+ 0x44,0x0f,0x59,0xc9, // mulps %xmm1,%xmm9
+ 0x41,0x0f,0xc2,0xcd,0x01, // cmpltps %xmm13,%xmm1
+ 0x44,0x0f,0x54,0xc9, // andps %xmm1,%xmm9
+ 0x41,0x0f,0x55,0xce, // andnps %xmm14,%xmm1
+ 0x41,0x0f,0x56,0xc9, // orps %xmm9,%xmm1
+ 0x44,0x0f,0x52,0xca, // rsqrtps %xmm2,%xmm9
+ 0x45,0x0f,0x53,0xf1, // rcpps %xmm9,%xmm14
+ 0x45,0x0f,0x59,0xf3, // mulps %xmm11,%xmm14
+ 0x45,0x0f,0x58,0xf4, // addps %xmm12,%xmm14
+ 0x41,0x0f,0x52,0xf9, // rsqrtps %xmm9,%xmm7
+ 0x41,0x0f,0x59,0xfa, // mulps %xmm10,%xmm7
+ 0x41,0x0f,0x58,0xfe, // addps %xmm14,%xmm7
+ 0x0f,0x5d,0xdf, // minps %xmm7,%xmm3
+ 0x44,0x0f,0x59,0xc2, // mulps %xmm2,%xmm8
+ 0x41,0x0f,0xc2,0xd5,0x01, // cmpltps %xmm13,%xmm2
+ 0x44,0x0f,0x54,0xc2, // andps %xmm2,%xmm8
+ 0x0f,0x55,0xd3, // andnps %xmm3,%xmm2
+ 0x41,0x0f,0x56,0xd0, // orps %xmm8,%xmm2
+ 0x0f,0x28,0xdc, // movaps %xmm4,%xmm3
+ 0x0f,0x28,0xe5, // movaps %xmm5,%xmm4
+ 0x0f,0x28,0xee, // movaps %xmm6,%xmm5
+ 0x0f,0x28,0x34,0x24, // movaps (%rsp),%xmm6
+ 0x0f,0x28,0x7c,0x24,0x10, // movaps 0x10(%rsp),%xmm7
+ 0x48,0x83,0xc4,0x28, // add $0x28,%rsp
+ 0xc3, // return
+};
+static const unsigned char sse2_scale_u8[] = {
+ 0x48,0x8b,0x02, // mov (%rdx),%rax
+ 0x66,0x44,0x0f,0x6e,0x04,0x38, // movd (%rax,%rdi,1),%xmm8
+ 0x66,0x45,0x0f,0xef,0xc9, // pxor %xmm9,%xmm9
+ 0x66,0x45,0x0f,0x60,0xc1, // punpcklbw %xmm9,%xmm8
+ 0x66,0x45,0x0f,0x61,0xc1, // punpcklwd %xmm9,%xmm8
+ 0x45,0x0f,0x5b,0xc0, // cvtdq2ps %xmm8,%xmm8
+ 0xf3,0x44,0x0f,0x10,0x49,0x08, // movss 0x8(%rcx),%xmm9
+ 0x45,0x0f,0xc6,0xc9,0x00, // shufps $0x0,%xmm9,%xmm9
+ 0x45,0x0f,0x59,0xc8, // mulps %xmm8,%xmm9
+ 0x41,0x0f,0x59,0xc1, // mulps %xmm9,%xmm0
+ 0x41,0x0f,0x59,0xc9, // mulps %xmm9,%xmm1
+ 0x41,0x0f,0x59,0xd1, // mulps %xmm9,%xmm2
+ 0x41,0x0f,0x59,0xd9, // mulps %xmm9,%xmm3
+ 0xc3, // return
+};
+static const unsigned char sse2_load_tables[] = {
+ 0x48,0x8b,0x02, // mov (%rdx),%rax
+ 0x4c,0x8b,0x42,0x08, // mov 0x8(%rdx),%r8
+ 0xf3,0x44,0x0f,0x6f,0x04,0xb8, // movdqu (%rax,%rdi,4),%xmm8
+ 0x66,0x0f,0x6e,0x41,0x0c, // movd 0xc(%rcx),%xmm0
+ 0x66,0x0f,0x70,0xc0,0x00, // pshufd $0x0,%xmm0,%xmm0
+ 0x66,0x45,0x0f,0x6f,0xc8, // movdqa %xmm8,%xmm9
+ 0x66,0x41,0x0f,0x72,0xd1,0x08, // psrld $0x8,%xmm9
+ 0x66,0x44,0x0f,0xdb,0xc8, // pand %xmm0,%xmm9
+ 0x66,0x45,0x0f,0x6f,0xd0, // movdqa %xmm8,%xmm10
+ 0x66,0x41,0x0f,0x72,0xd2,0x10, // psrld $0x10,%xmm10
+ 0x66,0x44,0x0f,0xdb,0xd0, // pand %xmm0,%xmm10
+ 0x66,0x41,0x0f,0xdb,0xc0, // pand %xmm8,%xmm0
+ 0x66,0x0f,0x70,0xd8,0x4e, // pshufd $0x4e,%xmm0,%xmm3
+ 0x66,0x48,0x0f,0x7e,0xd8, // movq %xmm3,%rax
+ 0x41,0x89,0xc1, // mov %eax,%r9d
+ 0x48,0xc1,0xe8,0x20, // shr $0x20,%rax
+ 0x66,0x49,0x0f,0x7e,0xc2, // movq %xmm0,%r10
+ 0x45,0x89,0xd3, // mov %r10d,%r11d
+ 0x49,0xc1,0xea,0x20, // shr $0x20,%r10
+ 0xf3,0x43,0x0f,0x10,0x1c,0x90, // movss (%r8,%r10,4),%xmm3
+ 0xf3,0x41,0x0f,0x10,0x04,0x80, // movss (%r8,%rax,4),%xmm0
+ 0x0f,0x14,0xd8, // unpcklps %xmm0,%xmm3
+ 0xf3,0x43,0x0f,0x10,0x04,0x98, // movss (%r8,%r11,4),%xmm0
+ 0xf3,0x43,0x0f,0x10,0x0c,0x88, // movss (%r8,%r9,4),%xmm1
+ 0x0f,0x14,0xc1, // unpcklps %xmm1,%xmm0
+ 0x0f,0x14,0xc3, // unpcklps %xmm3,%xmm0
+ 0x48,0x8b,0x42,0x10, // mov 0x10(%rdx),%rax
+ 0x66,0x41,0x0f,0x70,0xc9,0x4e, // pshufd $0x4e,%xmm9,%xmm1
+ 0x66,0x49,0x0f,0x7e,0xc8, // movq %xmm1,%r8
+ 0x45,0x89,0xc1, // mov %r8d,%r9d
+ 0x49,0xc1,0xe8,0x20, // shr $0x20,%r8
+ 0x66,0x4d,0x0f,0x7e,0xca, // movq %xmm9,%r10
+ 0x45,0x89,0xd3, // mov %r10d,%r11d
+ 0x49,0xc1,0xea,0x20, // shr $0x20,%r10
+ 0xf3,0x42,0x0f,0x10,0x1c,0x90, // movss (%rax,%r10,4),%xmm3
+ 0xf3,0x42,0x0f,0x10,0x0c,0x80, // movss (%rax,%r8,4),%xmm1
+ 0x0f,0x14,0xd9, // unpcklps %xmm1,%xmm3
+ 0xf3,0x42,0x0f,0x10,0x0c,0x98, // movss (%rax,%r11,4),%xmm1
+ 0xf3,0x42,0x0f,0x10,0x14,0x88, // movss (%rax,%r9,4),%xmm2
+ 0x0f,0x14,0xca, // unpcklps %xmm2,%xmm1
+ 0x0f,0x14,0xcb, // unpcklps %xmm3,%xmm1
+ 0x48,0x8b,0x42,0x18, // mov 0x18(%rdx),%rax
+ 0x66,0x41,0x0f,0x70,0xd2,0x4e, // pshufd $0x4e,%xmm10,%xmm2
+ 0x66,0x49,0x0f,0x7e,0xd0, // movq %xmm2,%r8
+ 0x45,0x89,0xc1, // mov %r8d,%r9d
+ 0x49,0xc1,0xe8,0x20, // shr $0x20,%r8
+ 0x66,0x4d,0x0f,0x7e,0xd2, // movq %xmm10,%r10
+ 0x45,0x89,0xd3, // mov %r10d,%r11d
+ 0x49,0xc1,0xea,0x20, // shr $0x20,%r10
+ 0xf3,0x46,0x0f,0x10,0x0c,0x90, // movss (%rax,%r10,4),%xmm9
+ 0xf3,0x42,0x0f,0x10,0x14,0x80, // movss (%rax,%r8,4),%xmm2
+ 0x44,0x0f,0x14,0xca, // unpcklps %xmm2,%xmm9
+ 0xf3,0x42,0x0f,0x10,0x14,0x98, // movss (%rax,%r11,4),%xmm2
+ 0xf3,0x42,0x0f,0x10,0x1c,0x88, // movss (%rax,%r9,4),%xmm3
+ 0x0f,0x14,0xd3, // unpcklps %xmm3,%xmm2
+ 0x41,0x0f,0x14,0xd1, // unpcklps %xmm9,%xmm2
+ 0x66,0x41,0x0f,0x72,0xd0,0x18, // psrld $0x18,%xmm8
+ 0x45,0x0f,0x5b,0xc0, // cvtdq2ps %xmm8,%xmm8
+ 0xf3,0x0f,0x10,0x59,0x08, // movss 0x8(%rcx),%xmm3
+ 0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm3,%xmm3
+ 0x41,0x0f,0x59,0xd8, // mulps %xmm8,%xmm3
+ 0xc3, // return
+};
+static const unsigned char sse2_load_8888[] = {
+ 0x48,0x8b,0x02, // mov (%rdx),%rax
+ 0xf3,0x0f,0x6f,0x1c,0xb8, // movdqu (%rax,%rdi,4),%xmm3
+ 0x66,0x0f,0x6e,0x41,0x0c, // movd 0xc(%rcx),%xmm0
+ 0x66,0x0f,0x70,0xc0,0x00, // pshufd $0x0,%xmm0,%xmm0
+ 0x66,0x0f,0x6f,0xcb, // movdqa %xmm3,%xmm1
+ 0x66,0x0f,0x72,0xd1,0x08, // psrld $0x8,%xmm1
+ 0x66,0x0f,0xdb,0xc8, // pand %xmm0,%xmm1
+ 0x66,0x0f,0x6f,0xd3, // movdqa %xmm3,%xmm2
+ 0x66,0x0f,0x72,0xd2,0x10, // psrld $0x10,%xmm2
+ 0x66,0x0f,0xdb,0xd0, // pand %xmm0,%xmm2
+ 0x66,0x0f,0xdb,0xc3, // pand %xmm3,%xmm0
+ 0x0f,0x5b,0xc0, // cvtdq2ps %xmm0,%xmm0
+ 0xf3,0x44,0x0f,0x10,0x41,0x08, // movss 0x8(%rcx),%xmm8
+ 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
+ 0x41,0x0f,0x59,0xc0, // mulps %xmm8,%xmm0
+ 0x0f,0x5b,0xc9, // cvtdq2ps %xmm1,%xmm1
+ 0x41,0x0f,0x59,0xc8, // mulps %xmm8,%xmm1
+ 0x0f,0x5b,0xd2, // cvtdq2ps %xmm2,%xmm2
+ 0x41,0x0f,0x59,0xd0, // mulps %xmm8,%xmm2
+ 0x66,0x0f,0x72,0xd3,0x18, // psrld $0x18,%xmm3
+ 0x0f,0x5b,0xdb, // cvtdq2ps %xmm3,%xmm3
+ 0x41,0x0f,0x59,0xd8, // mulps %xmm8,%xmm3
+ 0xc3, // return
+};
+static const unsigned char sse2_store_8888[] = {
+ 0x48,0x8b,0x02, // mov (%rdx),%rax
+ 0xf3,0x44,0x0f,0x10,0x41,0x04, // movss 0x4(%rcx),%xmm8
+ 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
+ 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9
+ 0x44,0x0f,0x59,0xc8, // mulps %xmm0,%xmm9
+ 0x66,0x45,0x0f,0x5b,0xc9, // cvtps2dq %xmm9,%xmm9
+ 0x45,0x0f,0x28,0xd0, // movaps %xmm8,%xmm10
+ 0x44,0x0f,0x59,0xd1, // mulps %xmm1,%xmm10
+ 0x66,0x45,0x0f,0x5b,0xd2, // cvtps2dq %xmm10,%xmm10
+ 0x66,0x41,0x0f,0x72,0xf2,0x08, // pslld $0x8,%xmm10
+ 0x66,0x45,0x0f,0xeb,0xd1, // por %xmm9,%xmm10
+ 0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9
+ 0x44,0x0f,0x59,0xca, // mulps %xmm2,%xmm9
+ 0x66,0x45,0x0f,0x5b,0xc9, // cvtps2dq %xmm9,%xmm9
+ 0x66,0x41,0x0f,0x72,0xf1,0x10, // pslld $0x10,%xmm9
+ 0x44,0x0f,0x59,0xc3, // mulps %xmm3,%xmm8
+ 0x66,0x45,0x0f,0x5b,0xc0, // cvtps2dq %xmm8,%xmm8
+ 0x66,0x41,0x0f,0x72,0xf0,0x18, // pslld $0x18,%xmm8
+ 0x66,0x45,0x0f,0xeb,0xc1, // por %xmm9,%xmm8
+ 0x66,0x45,0x0f,0xeb,0xc2, // por %xmm10,%xmm8
+ 0xf3,0x44,0x0f,0x7f,0x04,0xb8, // movdqu %xmm8,(%rax,%rdi,4)
+ 0xc3, // return
+};
+static const unsigned char sse2_load_f16[] = {
+ 0x48,0x8b,0x02, // mov (%rdx),%rax
+ 0xf3,0x0f,0x6f,0x04,0xf8, // movdqu (%rax,%rdi,8),%xmm0
+ 0xf3,0x0f,0x6f,0x4c,0xf8,0x10, // movdqu 0x10(%rax,%rdi,8),%xmm1
+ 0x66,0x0f,0x6f,0xd8, // movdqa %xmm0,%xmm3
+ 0x66,0x0f,0x61,0xd9, // punpcklwd %xmm1,%xmm3
+ 0x66,0x0f,0x69,0xc1, // punpckhwd %xmm1,%xmm0
+ 0x66,0x0f,0x6f,0xcb, // movdqa %xmm3,%xmm1
+ 0x66,0x0f,0x61,0xc8, // punpcklwd %xmm0,%xmm1
+ 0x66,0x0f,0x69,0xd8, // punpckhwd %xmm0,%xmm3
+ 0x66,0x45,0x0f,0xef,0xc0, // pxor %xmm8,%xmm8
+ 0x66,0x0f,0x6f,0xc1, // movdqa %xmm1,%xmm0
+ 0x66,0x41,0x0f,0x61,0xc0, // punpcklwd %xmm8,%xmm0
+ 0x66,0x0f,0x72,0xf0,0x0d, // pslld $0xd,%xmm0
+ 0x66,0x0f,0x6e,0x51,0x38, // movd 0x38(%rcx),%xmm2
+ 0x66,0x44,0x0f,0x70,0xca,0x00, // pshufd $0x0,%xmm2,%xmm9
+ 0x41,0x0f,0x59,0xc1, // mulps %xmm9,%xmm0
+ 0x66,0x41,0x0f,0x69,0xc8, // punpckhwd %xmm8,%xmm1
+ 0x66,0x0f,0x72,0xf1,0x0d, // pslld $0xd,%xmm1
+ 0x41,0x0f,0x59,0xc9, // mulps %xmm9,%xmm1
+ 0x66,0x0f,0x6f,0xd3, // movdqa %xmm3,%xmm2
+ 0x66,0x41,0x0f,0x61,0xd0, // punpcklwd %xmm8,%xmm2
+ 0x66,0x0f,0x72,0xf2,0x0d, // pslld $0xd,%xmm2
+ 0x41,0x0f,0x59,0xd1, // mulps %xmm9,%xmm2
+ 0x66,0x41,0x0f,0x69,0xd8, // punpckhwd %xmm8,%xmm3
+ 0x66,0x0f,0x72,0xf3,0x0d, // pslld $0xd,%xmm3
+ 0x41,0x0f,0x59,0xd9, // mulps %xmm9,%xmm3
+ 0xc3, // return
+};
+static const unsigned char sse2_store_f16[] = {
+ 0x48,0x8b,0x02, // mov (%rdx),%rax
+ 0x66,0x44,0x0f,0x6e,0x41,0x3c, // movd 0x3c(%rcx),%xmm8
+ 0x66,0x45,0x0f,0x70,0xc0,0x00, // pshufd $0x0,%xmm8,%xmm8
+ 0x66,0x45,0x0f,0x6f,0xc8, // movdqa %xmm8,%xmm9
+ 0x44,0x0f,0x59,0xc8, // mulps %xmm0,%xmm9
+ 0x66,0x41,0x0f,0x72,0xd1,0x0d, // psrld $0xd,%xmm9
+ 0x66,0x45,0x0f,0x6f,0xd0, // movdqa %xmm8,%xmm10
+ 0x44,0x0f,0x59,0xd1, // mulps %xmm1,%xmm10
+ 0x66,0x41,0x0f,0x72,0xd2,0x0d, // psrld $0xd,%xmm10
+ 0x66,0x45,0x0f,0x6f,0xd8, // movdqa %xmm8,%xmm11
+ 0x44,0x0f,0x59,0xda, // mulps %xmm2,%xmm11
+ 0x66,0x41,0x0f,0x72,0xd3,0x0d, // psrld $0xd,%xmm11
+ 0x44,0x0f,0x59,0xc3, // mulps %xmm3,%xmm8
+ 0x66,0x41,0x0f,0x72,0xd0,0x0d, // psrld $0xd,%xmm8
+ 0x66,0x41,0x0f,0x73,0xfa,0x02, // pslldq $0x2,%xmm10
+ 0x66,0x45,0x0f,0xeb,0xd1, // por %xmm9,%xmm10
+ 0x66,0x41,0x0f,0x73,0xf8,0x02, // pslldq $0x2,%xmm8
+ 0x66,0x45,0x0f,0xeb,0xc3, // por %xmm11,%xmm8
+ 0x66,0x45,0x0f,0x6f,0xca, // movdqa %xmm10,%xmm9
+ 0x66,0x45,0x0f,0x62,0xc8, // punpckldq %xmm8,%xmm9
+ 0xf3,0x44,0x0f,0x7f,0x0c,0xf8, // movdqu %xmm9,(%rax,%rdi,8)
+ 0x66,0x45,0x0f,0x6a,0xd0, // punpckhdq %xmm8,%xmm10
+ 0xf3,0x44,0x0f,0x7f,0x54,0xf8,0x10, // movdqu %xmm10,0x10(%rax,%rdi,8)
+ 0xc3, // return
+};
+static const unsigned char sse2_matrix_3x4[] = {
+ 0x44,0x0f,0x28,0xc9, // movaps %xmm1,%xmm9
+ 0x44,0x0f,0x28,0xc0, // movaps %xmm0,%xmm8
+ 0xf3,0x0f,0x10,0x02, // movss (%rdx),%xmm0
+ 0xf3,0x0f,0x10,0x4a,0x04, // movss 0x4(%rdx),%xmm1
+ 0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm0,%xmm0
+ 0xf3,0x44,0x0f,0x10,0x52,0x0c, // movss 0xc(%rdx),%xmm10
+ 0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10
+ 0xf3,0x44,0x0f,0x10,0x5a,0x18, // movss 0x18(%rdx),%xmm11
+ 0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11
+ 0xf3,0x44,0x0f,0x10,0x62,0x24, // movss 0x24(%rdx),%xmm12
+ 0x45,0x0f,0xc6,0xe4,0x00, // shufps $0x0,%xmm12,%xmm12
+ 0x44,0x0f,0x59,0xda, // mulps %xmm2,%xmm11
+ 0x45,0x0f,0x58,0xdc, // addps %xmm12,%xmm11
+ 0x45,0x0f,0x59,0xd1, // mulps %xmm9,%xmm10
+ 0x45,0x0f,0x58,0xd3, // addps %xmm11,%xmm10
+ 0x41,0x0f,0x59,0xc0, // mulps %xmm8,%xmm0
+ 0x41,0x0f,0x58,0xc2, // addps %xmm10,%xmm0
+ 0x0f,0xc6,0xc9,0x00, // shufps $0x0,%xmm1,%xmm1
+ 0xf3,0x44,0x0f,0x10,0x52,0x10, // movss 0x10(%rdx),%xmm10
+ 0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10
+ 0xf3,0x44,0x0f,0x10,0x5a,0x1c, // movss 0x1c(%rdx),%xmm11
+ 0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11
+ 0xf3,0x44,0x0f,0x10,0x62,0x28, // movss 0x28(%rdx),%xmm12
+ 0x45,0x0f,0xc6,0xe4,0x00, // shufps $0x0,%xmm12,%xmm12
+ 0x44,0x0f,0x59,0xda, // mulps %xmm2,%xmm11
+ 0x45,0x0f,0x58,0xdc, // addps %xmm12,%xmm11
+ 0x45,0x0f,0x59,0xd1, // mulps %xmm9,%xmm10
+ 0x45,0x0f,0x58,0xd3, // addps %xmm11,%xmm10
+ 0x41,0x0f,0x59,0xc8, // mulps %xmm8,%xmm1
+ 0x41,0x0f,0x58,0xca, // addps %xmm10,%xmm1
+ 0xf3,0x44,0x0f,0x10,0x52,0x08, // movss 0x8(%rdx),%xmm10
+ 0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10
+ 0xf3,0x44,0x0f,0x10,0x5a,0x14, // movss 0x14(%rdx),%xmm11
+ 0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11
+ 0xf3,0x44,0x0f,0x10,0x62,0x20, // movss 0x20(%rdx),%xmm12
+ 0x45,0x0f,0xc6,0xe4,0x00, // shufps $0x0,%xmm12,%xmm12
+ 0xf3,0x44,0x0f,0x10,0x6a,0x2c, // movss 0x2c(%rdx),%xmm13
+ 0x45,0x0f,0xc6,0xed,0x00, // shufps $0x0,%xmm13,%xmm13
+ 0x44,0x0f,0x59,0xe2, // mulps %xmm2,%xmm12
+ 0x45,0x0f,0x58,0xe5, // addps %xmm13,%xmm12
+ 0x45,0x0f,0x59,0xd9, // mulps %xmm9,%xmm11
+ 0x45,0x0f,0x58,0xdc, // addps %xmm12,%xmm11
+ 0x45,0x0f,0x59,0xd0, // mulps %xmm8,%xmm10
+ 0x45,0x0f,0x58,0xd3, // addps %xmm11,%xmm10
+ 0x41,0x0f,0x28,0xd2, // movaps %xmm10,%xmm2
+ 0xc3, // return
+};
static const unsigned char hsw_inc_x[] = {
0x48,0x83,0xc7,0x08, // add $0x8,%rdi
0xc3, // return
diff --git a/src/splicer/build_stages.py b/src/splicer/build_stages.py
index 354cfb37ef..21d94d8dd4 100755
--- a/src/splicer/build_stages.py
+++ b/src/splicer/build_stages.py
@@ -19,18 +19,18 @@ objdump = 'gobjdump'
cflags = '-std=c++11 -Os -fomit-frame-pointer'.split()
-sse2 = '-msse2 -mno-sse3 -mno-ssse3 -mno-sse4.1'.split()
+sse2 = '-mno-red-zone -msse2 -mno-sse3 -mno-ssse3 -mno-sse4.1'.split()
subprocess.check_call(['clang++'] + cflags + sse2 +
['-c', 'src/splicer/SkSplicer_stages.cpp'] +
['-o', 'sse2.o'])
-sse41 = '-msse4.1'.split()
+sse41 = '-mno-red-zone -msse4.1'.split()
subprocess.check_call(['clang++'] + cflags + sse41 +
['-c', 'src/splicer/SkSplicer_stages.cpp'] +
['-o', 'sse41.o'])
-hsw = '-mavx2 -mfma -mf16c'.split()
+hsw = '-mno-red-zone -mavx2 -mfma -mf16c'.split()
subprocess.check_call(['clang++'] + cflags + hsw +
['-c', 'src/splicer/SkSplicer_stages.cpp'] +
['-o', 'hsw.o'])
@@ -111,7 +111,7 @@ print '''/*
parse_object_file('aarch64.o', 'unsigned int', '14000000', 'd65f03c0')
parse_object_file( 'armv7.o', 'unsigned int', 'eafffffe', 'e12fff1e',
target='elf32-littlearm')
-#parse_object_file( 'sse2.o', 'unsigned char', 'e9 00 00 00 00', 'c3')
+parse_object_file( 'sse2.o', 'unsigned char', 'e9 00 00 00 00', 'c3')
#parse_object_file('sse41.o', 'unsigned char', 'e9 00 00 00 00', 'c3')
parse_object_file( 'hsw.o', 'unsigned char', 'e9 00 00 00 00', 'c3')
print '#endif//SkSplicer_generated_DEFINED'