aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--src/splicer/SkSplicer.cpp15
-rw-r--r--src/splicer/SkSplicer_generated.h435
-rw-r--r--src/splicer/SkSplicer_shared.h3
-rw-r--r--src/splicer/SkSplicer_stages.cpp42
4 files changed, 429 insertions, 66 deletions
diff --git a/src/splicer/SkSplicer.cpp b/src/splicer/SkSplicer.cpp
index 8447c9b45f..cde6f3307a 100644
--- a/src/splicer/SkSplicer.cpp
+++ b/src/splicer/SkSplicer.cpp
@@ -33,12 +33,17 @@
// To disassemble an armv7 dump,
// $ adb pull /data/local/tmp/dump.bin; gobjdump -b binary -D dump.bin -m arm | less
+//#define M(st) #st,
+//static const char* kStageNames[] = { SK_RASTER_PIPELINE_STAGES(M) };
+//#undef M
+
namespace {
// Stages expect these constants to be set to these values.
// It's fine to rearrange and add new ones if you update SkSplicer_constants.
static const SkSplicer_constants kConstants = {
- 1.0f, 255.0f, 1/255.0f, 0x000000ff,
+ 1.0f, 0.5f, 255.0f, 1/255.0f, 0x000000ff,
+ {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f},
0.0025f, 0.6975f, 0.3000f, 1/12.92f, 0.055f, // from_srgb
12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f, // to_srgb
0x77800000, 0x07800000, // fp16 <-> fp32
@@ -243,6 +248,7 @@ namespace {
static bool prefix##_##splice_stage(SkWStream* buf, SkRasterPipeline::StockStage st) { \
switch (st) { \
default: return false; \
+ CASE(prefix, seed_shader); \
CASE(prefix, clear); \
CASE(prefix, plus_); \
CASE(prefix, srcover); \
@@ -263,7 +269,11 @@ namespace {
CASE(prefix, store_8888); \
CASE(prefix, load_f16); \
CASE(prefix, store_f16); \
+ CASE(prefix, matrix_2x3); \
CASE(prefix, matrix_3x4); \
+ CASE(prefix, clamp_x); \
+ CASE(prefix, clamp_y); \
+ CASE(prefix, linear_gradient_2stops); \
} \
return true; \
}
@@ -339,7 +349,8 @@ namespace {
// Splice in the code for the Stages, generated offline into SkSplicer_generated.h.
if (!splice_stage(&buf, stages[i].stage)) {
- //SkDebugf("SkSplicer can't yet handle stage %d.\n", stages[i].stage);
+ //SkDebugf("SkSplicer can't yet handle stage %d %s.\n",
+ // stages[i].stage, kStageNames[stages[i].stage]);
return;
}
}
diff --git a/src/splicer/SkSplicer_generated.h b/src/splicer/SkSplicer_generated.h
index 5f16b5c66f..a9d8891c57 100644
--- a/src/splicer/SkSplicer_generated.h
+++ b/src/splicer/SkSplicer_generated.h
@@ -15,6 +15,25 @@ static const unsigned int aarch64_inc_x[] = {
0x91001000, // add x0, x0, #0x4
0xd65f03c0, // return
};
+static const unsigned int aarch64_seed_shader[] = { // v0 = float(w0) + [x3+4] + 4 floats at [x3+20]; v1 = float(int at [x2]) + [x3+4]; v2 = splat [x3]; v3-v7 = 0. NOTE(review): offsets 0/4/20 look like kConstants' 1.0f, 0.5f and iota entries - confirm x3 points there.
+ 0xaa0303e8, // mov x8, x3
+ 0x4ddfc902, // ld1r {v2.4s}, [x8], #4
+ 0x4d40c841, // ld1r {v1.4s}, [x2]
+ 0x3cc14066, // ldur q6, [x3,#20]
+ 0x4e040c00, // dup v0.4s, w0
+ 0x4d40c907, // ld1r {v7.4s}, [x8]
+ 0x4e21d800, // scvtf v0.4s, v0.4s
+ 0x4e21d821, // scvtf v1.4s, v1.4s
+ 0x6f00e403, // movi v3.2d, #0x0
+ 0x4e27d400, // fadd v0.4s, v0.4s, v7.4s
+ 0x6f00e404, // movi v4.2d, #0x0
+ 0x6f00e405, // movi v5.2d, #0x0
+ 0x4e20d4c0, // fadd v0.4s, v6.4s, v0.4s
+ 0x4e27d421, // fadd v1.4s, v1.4s, v7.4s
+ 0x6f00e406, // movi v6.2d, #0x0
+ 0x6f00e407, // movi v7.2d, #0x0
+ 0xd65f03c0, // return
+};
static const unsigned int aarch64_clear[] = {
0x6f00e400, // movi v0.2d, #0x0
0x6f00e401, // movi v1.2d, #0x0
@@ -117,11 +136,11 @@ static const unsigned int aarch64_unpremul[] = {
0xd65f03c0, // return
};
static const unsigned int aarch64_from_srgb[] = {
- 0x91005068, // add x8, x3, #0x14
+ 0x9100e068, // add x8, x3, #0x38
0x4d40c910, // ld1r {v16.4s}, [x8]
- 0x91004068, // add x8, x3, #0x10
+ 0x9100d068, // add x8, x3, #0x34
0x4d40c911, // ld1r {v17.4s}, [x8]
- 0x2d434c72, // ldp s18, s19, [x3,#24]
+ 0x2d47cc72, // ldp s18, s19, [x3,#60]
0x6e22dc54, // fmul v20.4s, v2.4s, v2.4s
0x4eb01e15, // mov v21.16b, v16.16b
0x4eb01e17, // mov v23.16b, v16.16b
@@ -130,7 +149,7 @@ static const unsigned int aarch64_from_srgb[] = {
0x4eb11e38, // mov v24.16b, v17.16b
0x4e34ce11, // fmla v17.4s, v16.4s, v20.4s
0x6e20dc10, // fmul v16.4s, v0.4s, v0.4s
- 0x91008068, // add x8, x3, #0x20
+ 0x91011068, // add x8, x3, #0x44
0x4f921015, // fmla v21.4s, v0.4s, v18.s[0]
0x4e30ceb6, // fmla v22.4s, v21.4s, v16.4s
0x4d40c910, // ld1r {v16.4s}, [x8]
@@ -155,17 +174,17 @@ static const unsigned int aarch64_to_srgb[] = {
0x6ea1d856, // frsqrte v22.4s, v2.4s
0x6e35deb9, // fmul v25.4s, v21.4s, v21.4s
0x4eb7fc17, // frsqrts v23.4s, v0.4s, v23.4s
- 0x9100c068, // add x8, x3, #0x30
+ 0x91015068, // add x8, x3, #0x54
0x6e36deda, // fmul v26.4s, v22.4s, v22.4s
0x4eb9fc39, // frsqrts v25.4s, v1.4s, v25.4s
0x6e37de10, // fmul v16.4s, v16.4s, v23.4s
- 0x2d44c871, // ldp s17, s18, [x3,#36]
+ 0x2d494871, // ldp s17, s18, [x3,#72]
0x4d40c914, // ld1r {v20.4s}, [x8]
0x4ebafc5a, // frsqrts v26.4s, v2.4s, v26.4s
0x6e39deb5, // fmul v21.4s, v21.4s, v25.4s
0x4ea1da17, // frecpe v23.4s, v16.4s
- 0xbd402c73, // ldr s19, [x3,#44]
- 0x9100d068, // add x8, x3, #0x34
+ 0xbd405073, // ldr s19, [x3,#80]
+ 0x91016068, // add x8, x3, #0x58
0x6e3aded6, // fmul v22.4s, v22.4s, v26.4s
0x4ea1dabb, // frecpe v27.4s, v21.4s
0x4e37fe1d, // frecps v29.4s, v16.4s, v23.4s
@@ -213,7 +232,7 @@ static const unsigned int aarch64_to_srgb[] = {
};
static const unsigned int aarch64_scale_u8[] = {
0xf9400048, // ldr x8, [x2]
- 0xbd400871, // ldr s17, [x3,#8]
+ 0xbd400c71, // ldr s17, [x3,#12]
0x8b000108, // add x8, x8, x0
0x39400109, // ldrb w9, [x8]
0x3940050a, // ldrb w10, [x8,#1]
@@ -236,7 +255,7 @@ static const unsigned int aarch64_scale_u8[] = {
static const unsigned int aarch64_load_tables[] = {
0xa9402849, // ldp x9, x10, [x2]
0xd37ef408, // lsl x8, x0, #2
- 0x9100306b, // add x11, x3, #0xc
+ 0x9100406b, // add x11, x3, #0x10
0x4d40c960, // ld1r {v0.4s}, [x11]
0x3ce86923, // ldr q3, [x9,x8]
0xa9412448, // ldp x8, x9, [x2,#16]
@@ -279,7 +298,7 @@ static const unsigned int aarch64_load_tables[] = {
0x6e1c0620, // mov v0.s[3], v17.s[0]
0xbc6b5931, // ldr s17, [x9,w11,uxtw #2]
0x6e1c0601, // mov v1.s[3], v16.s[0]
- 0xbd400870, // ldr s16, [x3,#8]
+ 0xbd400c70, // ldr s16, [x3,#12]
0x6f280463, // ushr v3.4s, v3.4s, #24
0x6e140662, // mov v2.s[2], v19.s[0]
0x4e21d863, // scvtf v3.4s, v3.4s
@@ -289,11 +308,11 @@ static const unsigned int aarch64_load_tables[] = {
};
static const unsigned int aarch64_load_8888[] = {
0xf9400048, // ldr x8, [x2]
- 0x91003069, // add x9, x3, #0xc
+ 0x91004069, // add x9, x3, #0x10
0x4d40c920, // ld1r {v0.4s}, [x9]
0xd37ef409, // lsl x9, x0, #2
0x3ce96901, // ldr q1, [x8,x9]
- 0xbd400863, // ldr s3, [x3,#8]
+ 0xbd400c63, // ldr s3, [x3,#12]
0x4e211c02, // and v2.16b, v0.16b, v1.16b
0x6f380430, // ushr v16.4s, v1.4s, #8
0x6f300431, // ushr v17.4s, v1.4s, #16
@@ -311,7 +330,7 @@ static const unsigned int aarch64_load_8888[] = {
0xd65f03c0, // return
};
static const unsigned int aarch64_store_8888[] = {
- 0xbd400470, // ldr s16, [x3,#4]
+ 0xbd400870, // ldr s16, [x3,#8]
0xf9400048, // ldr x8, [x2]
0xd37ef409, // lsl x9, x0, #2
0x4f909032, // fmul v18.4s, v1.4s, v16.s[0]
@@ -351,6 +370,41 @@ static const unsigned int aarch64_store_f16[] = {
0x0c000510, // st4 {v16.4h-v19.4h}, [x8]
0xd65f03c0, // return
};
+static const unsigned int aarch64_clamp_x[] = { // v0 = max(0, min(v0, limit)); limit is the word splatted from [x2] with -1 added as a 32-bit integer (presumably the next float below a positive bound - confirm).
+ 0x4d40c850, // ld1r {v16.4s}, [x2]
+ 0x6f07e7f1, // movi v17.2d, #0xffffffffffffffff
+ 0x6f00e412, // movi v18.2d, #0x0
+ 0x4eb18610, // add v16.4s, v16.4s, v17.4s (integer add of all-ones, i.e. bit pattern - 1)
+ 0x4eb0f400, // fmin v0.4s, v0.4s, v16.4s
+ 0x4e20f640, // fmax v0.4s, v18.4s, v0.4s
+ 0xd65f03c0, // return
+};
+static const unsigned int aarch64_clamp_y[] = { // v1 = max(0, min(v1, limit)); limit is the word splatted from [x2] with -1 added as a 32-bit integer (presumably the next float below a positive bound - confirm).
+ 0x4d40c850, // ld1r {v16.4s}, [x2]
+ 0x6f07e7f1, // movi v17.2d, #0xffffffffffffffff
+ 0x6f00e412, // movi v18.2d, #0x0
+ 0x4eb18610, // add v16.4s, v16.4s, v17.4s (integer add of all-ones, i.e. bit pattern - 1)
+ 0x4eb0f421, // fmin v1.4s, v1.4s, v16.4s
+ 0x4e21f641, // fmax v1.4s, v18.4s, v1.4s
+ 0xd65f03c0, // return
+};
+static const unsigned int aarch64_matrix_2x3[] = { // With m = six floats at [x2]: new v0 = m[0]*v0 + m[2]*v1 + m[4]; new v1 = m[1]*v0 + m[3]*v1 + m[5] (both from the incoming v0, v1).
+ 0xaa0203e8, // mov x8, x2
+ 0x4ddfc912, // ld1r {v18.4s}, [x8], #4
+ 0x91004049, // add x9, x2, #0x10
+ 0x4d40c930, // ld1r {v16.4s}, [x9]
+ 0x91005049, // add x9, x2, #0x14
+ 0x2d415053, // ldp s19, s20, [x2,#8]
+ 0x4d40c931, // ld1r {v17.4s}, [x9]
+ 0xbd400115, // ldr s21, [x8]
+ 0x4f931030, // fmla v16.4s, v1.4s, v19.s[0]
+ 0x4f941031, // fmla v17.4s, v1.4s, v20.s[0]
+ 0x4e20ce50, // fmla v16.4s, v18.4s, v0.4s
+ 0x4f951011, // fmla v17.4s, v0.4s, v21.s[0]
+ 0x4eb01e00, // mov v0.16b, v16.16b
+ 0x4eb11e21, // mov v1.16b, v17.16b
+ 0xd65f03c0, // return
+};
static const unsigned int aarch64_matrix_3x4[] = {
0xaa0203e8, // mov x8, x2
0x91009049, // add x9, x2, #0x24
@@ -379,10 +433,42 @@ static const unsigned int aarch64_matrix_3x4[] = {
0x4eb21e42, // mov v2.16b, v18.16b
0xd65f03c0, // return
};
+static const unsigned int aarch64_linear_gradient_2stops[] = { // With a = 4 floats at [x2] and b = 4 floats at [x2+16]: (v0,v1,v2,v3)[i] = a[i] + t*b[i], where t is the incoming v0.
+ 0xad404443, // ldp q3, q17, [x2]
+ 0x4e040470, // dup v16.4s, v3.s[0]
+ 0x4e0c0461, // dup v1.4s, v3.s[1]
+ 0x4e140462, // dup v2.4s, v3.s[2]
+ 0x4e1c0463, // dup v3.4s, v3.s[3]
+ 0x4f911010, // fmla v16.4s, v0.4s, v17.s[0]
+ 0x4fb11001, // fmla v1.4s, v0.4s, v17.s[1]
+ 0x4f911802, // fmla v2.4s, v0.4s, v17.s[2]
+ 0x4fb11803, // fmla v3.4s, v0.4s, v17.s[3]
+ 0x4eb01e00, // mov v0.16b, v16.16b
+ 0xd65f03c0, // return
+};
static const unsigned int armv7_inc_x[] = { // Advances r0 by 2.
0xe2800002, // add r0, r0, #2
0xe12fff1e, // return
};
+static const unsigned int armv7_seed_shader[] = { // d0 = float(r0) + [r3+4] + 2 floats at [r3+20]; d1 = float(int at [r2]) + [r3+4]; d2 = splat [r3]; d3-d7 = 0. NOTE(review): offsets 0/4/20 look like kConstants' 1.0f, 0.5f and iota entries - confirm r3 points there.
+ 0xee800b90, // vdup.32 d16, r0
+ 0xe283c004, // add ip, r3, #4
+ 0xf3fb0620, // vcvt.f32.s32 d16, d16
+ 0xf4e21c9f, // vld1.32 {d17[]}, [r2 :32]
+ 0xf3fb1621, // vcvt.f32.s32 d17, d17
+ 0xf4ec2c9f, // vld1.32 {d18[]}, [ip :32]
+ 0xf2803010, // vmov.i32 d3, #0
+ 0xf2804010, // vmov.i32 d4, #0
+ 0xf2400da2, // vadd.f32 d16, d16, d18
+ 0xf4a32c9f, // vld1.32 {d2[]}, [r3 :32]
+ 0xf2011da2, // vadd.f32 d1, d17, d18
+ 0xf2805010, // vmov.i32 d5, #0
+ 0xedd33b05, // vldr d19, [r3, #20]
+ 0xf2806010, // vmov.i32 d6, #0
+ 0xf2030da0, // vadd.f32 d0, d19, d16
+ 0xf2807010, // vmov.i32 d7, #0
+ 0xe12fff1e, // return
+};
static const unsigned int armv7_clear[] = {
0xf2800010, // vmov.i32 d0, #0
0xf2801010, // vmov.i32 d1, #0
@@ -490,19 +576,19 @@ static const unsigned int armv7_unpremul[] = {
};
static const unsigned int armv7_from_srgb[] = {
0xed2d8b02, // vpush {d8}
- 0xe283c018, // add ip, r3, #24
- 0xed938a07, // vldr s16, [r3, #28]
+ 0xe283c03c, // add ip, r3, #60
+ 0xed938a10, // vldr s16, [r3, #64]
0xf3402d10, // vmul.f32 d18, d0, d0
0xf4ec0c9f, // vld1.32 {d16[]}, [ip :32]
- 0xe283c014, // add ip, r3, #20
+ 0xe283c038, // add ip, r3, #56
0xf3413d11, // vmul.f32 d19, d1, d1
0xf4ec1c9f, // vld1.32 {d17[]}, [ip :32]
- 0xe283c020, // add ip, r3, #32
+ 0xe283c044, // add ip, r3, #68
0xf26141b1, // vorr d20, d17, d17
0xf26171b1, // vorr d23, d17, d17
0xf4ec8c9f, // vld1.32 {d24[]}, [ip :32]
0xf2404c30, // vfma.f32 d20, d0, d16
- 0xe283c010, // add ip, r3, #16
+ 0xe283c034, // add ip, r3, #52
0xf2417c30, // vfma.f32 d23, d1, d16
0xf2421c30, // vfma.f32 d17, d2, d16
0xf3425d12, // vmul.f32 d21, d2, d2
@@ -527,9 +613,9 @@ static const unsigned int armv7_from_srgb[] = {
static const unsigned int armv7_to_srgb[] = {
0xed2d8b02, // vpush {d8}
0xf3fb0580, // vrsqrte.f32 d16, d0
- 0xe283c02c, // add ip, r3, #44
+ 0xe283c050, // add ip, r3, #80
0xf3fb1581, // vrsqrte.f32 d17, d1
- 0xed938a09, // vldr s16, [r3, #36]
+ 0xed938a12, // vldr s16, [r3, #72]
0xf3fb2582, // vrsqrte.f32 d18, d2
0xf3403db0, // vmul.f32 d19, d16, d16
0xf3414db1, // vmul.f32 d20, d17, d17
@@ -557,18 +643,18 @@ static const unsigned int armv7_to_srgb[] = {
0xf2611fbe, // vrsqrts.f32 d17, d17, d30
0xf3433db9, // vmul.f32 d19, d19, d25
0xf4ec9c9f, // vld1.32 {d25[]}, [ip :32]
- 0xe283c030, // add ip, r3, #48
+ 0xe283c054, // add ip, r3, #84
0xf3444dba, // vmul.f32 d20, d20, d26
0xf3466dbb, // vmul.f32 d22, d22, d27
0xf4ecac9f, // vld1.32 {d26[]}, [ip :32]
- 0xe283c028, // add ip, r3, #40
+ 0xe283c04c, // add ip, r3, #76
0xf26ab1ba, // vorr d27, d26, d26
0xf249bcb3, // vfma.f32 d27, d25, d19
0xf26a31ba, // vorr d19, d26, d26
0xf2493cb4, // vfma.f32 d19, d25, d20
0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32]
0xf249acb6, // vfma.f32 d26, d25, d22
- 0xe283c034, // add ip, r3, #52
+ 0xe283c058, // add ip, r3, #88
0xf3452db2, // vmul.f32 d18, d21, d18
0xf3470db0, // vmul.f32 d16, d23, d16
0xf3481db1, // vmul.f32 d17, d24, d17
@@ -600,7 +686,7 @@ static const unsigned int armv7_scale_u8[] = {
0xe1dcc0b0, // ldrh ip, [ip]
0xe1cdc0b4, // strh ip, [sp, #4]
0xe28dc004, // add ip, sp, #4
- 0xed938a02, // vldr s16, [r3, #8]
+ 0xed938a03, // vldr s16, [r3, #12]
0xf4ec041f, // vld1.16 {d16[0]}, [ip :16]
0xf3c80a30, // vmovl.u8 q8, d16
0xf3d00a30, // vmovl.u16 q8, d16
@@ -616,7 +702,7 @@ static const unsigned int armv7_scale_u8[] = {
};
static const unsigned int armv7_load_tables[] = {
0xe92d41f0, // push {r4, r5, r6, r7, r8, lr}
- 0xe283600c, // add r6, r3, #12
+ 0xe2836010, // add r6, r3, #16
0xe592c000, // ldr ip, [r2]
0xe592e004, // ldr lr, [r2, #4]
0xf4e60c9f, // vld1.32 {d16[]}, [r6 :32]
@@ -626,7 +712,7 @@ static const unsigned int armv7_load_tables[] = {
0xe592800c, // ldr r8, [r2, #12]
0xf3f83031, // vshr.u32 d19, d17, #8
0xe5924008, // ldr r4, [r2, #8]
- 0xed931a02, // vldr s2, [r3, #8]
+ 0xed931a03, // vldr s2, [r3, #12]
0xee326b90, // vmov.32 r6, d18[1]
0xee125b90, // vmov.32 r5, d18[0]
0xf3f02031, // vshr.u32 d18, d17, #16
@@ -657,8 +743,8 @@ static const unsigned int armv7_load_tables[] = {
static const unsigned int armv7_load_8888[] = {
0xe92d4800, // push {fp, lr}
0xe592c000, // ldr ip, [r2]
- 0xe283e00c, // add lr, r3, #12
- 0xed932a02, // vldr s4, [r3, #8]
+ 0xe283e010, // add lr, r3, #16
+ 0xed932a03, // vldr s4, [r3, #12]
0xe08cc100, // add ip, ip, r0, lsl #2
0xf4ee0c9f, // vld1.32 {d16[]}, [lr :32]
0xeddc1b00, // vldr d17, [ip]
@@ -680,7 +766,7 @@ static const unsigned int armv7_load_8888[] = {
0xe12fff1e, // return
};
static const unsigned int armv7_store_8888[] = {
- 0xe283c004, // add ip, r3, #4
+ 0xe283c008, // add ip, r3, #8
0xf2c3261f, // vmov.i32 d18, #1056964608
0xf2c3361f, // vmov.i32 d19, #1056964608
0xf4ec1c9f, // vld1.32 {d17[]}, [ip :32]
@@ -737,6 +823,44 @@ static const unsigned int armv7_store_f16[] = {
0xf44c084f, // vst2.16 {d16-d17}, [ip]
0xe12fff1e, // return
};
+static const unsigned int armv7_clamp_x[] = { // d0 = max(0, min(d0, limit)); limit is the word splatted from [r2] with -1 added as a 32-bit integer (presumably the next float below a positive bound - confirm).
+ 0xf3c70e1f, // vmov.i8 d16, #255
+ 0xf4e21c9f, // vld1.32 {d17[]}, [r2 :32]
+ 0xf26108a0, // vadd.i32 d16, d17, d16 (integer add of all-ones, i.e. bit pattern - 1)
+ 0xf2c01010, // vmov.i32 d17, #0
+ 0xf2600f20, // vmin.f32 d16, d0, d16
+ 0xf2010fa0, // vmax.f32 d0, d17, d16
+ 0xe12fff1e, // return
+};
+static const unsigned int armv7_clamp_y[] = { // d1 = max(0, min(d1, limit)); limit is the word splatted from [r2] with -1 added as a 32-bit integer (presumably the next float below a positive bound - confirm).
+ 0xf3c70e1f, // vmov.i8 d16, #255
+ 0xf4e21c9f, // vld1.32 {d17[]}, [r2 :32]
+ 0xf26108a0, // vadd.i32 d16, d17, d16 (integer add of all-ones, i.e. bit pattern - 1)
+ 0xf2c01010, // vmov.i32 d17, #0
+ 0xf2610f20, // vmin.f32 d16, d1, d16
+ 0xf2011fa0, // vmax.f32 d1, d17, d16
+ 0xe12fff1e, // return
+};
+static const unsigned int armv7_matrix_2x3[] = { // With m = six floats at [r2]: new d0 = m[0]*d0 + m[2]*d1 + m[4]; new d1 = m[1]*d0 + m[3]*d1 + m[5] (both from the incoming d0, d1).
+ 0xe282c00c, // add ip, r2, #12
+ 0xf4ec2c9f, // vld1.32 {d18[]}, [ip :32]
+ 0xe282c008, // add ip, r2, #8
+ 0xf4ec1c9f, // vld1.32 {d17[]}, [ip :32]
+ 0xe282c010, // add ip, r2, #16
+ 0xf4ec0c9f, // vld1.32 {d16[]}, [ip :32]
+ 0xe282c014, // add ip, r2, #20
+ 0xf2410c31, // vfma.f32 d16, d1, d17
+ 0xf4ec1c9f, // vld1.32 {d17[]}, [ip :32]
+ 0xe282c004, // add ip, r2, #4
+ 0xf2411c32, // vfma.f32 d17, d1, d18
+ 0xf4e22c9f, // vld1.32 {d18[]}, [r2 :32]
+ 0xf4ec3c9f, // vld1.32 {d19[]}, [ip :32]
+ 0xf2400c32, // vfma.f32 d16, d0, d18
+ 0xf2401c33, // vfma.f32 d17, d0, d19
+ 0xf22001b0, // vorr d0, d16, d16
+ 0xf22111b1, // vorr d1, d17, d17
+ 0xe12fff1e, // return
+};
static const unsigned int armv7_matrix_3x4[] = {
0xe282c020, // add ip, r2, #32
0xf4ec3c9f, // vld1.32 {d19[]}, [ip :32]
@@ -775,10 +899,51 @@ static const unsigned int armv7_matrix_3x4[] = {
0xf22211b2, // vorr d1, d18, d18
0xe12fff1e, // return
};
+static const unsigned int armv7_linear_gradient_2stops[] = { // With a = 4 floats at [r2] and b = 4 floats at [r2+16]: (d0,d1,d2,d3)[i] = a[i] + t*b[i], where t is the incoming d0.
+ 0xe1a0c002, // mov ip, r2
+ 0xf46c2a0d, // vld1.8 {d18-d19}, [ip]!
+ 0xf46c4a0f, // vld1.8 {d20-d21}, [ip]
+ 0xf3f40c22, // vdup.32 d16, d18[0]
+ 0xf3f41c24, // vdup.32 d17, d20[0]
+ 0xf2400c31, // vfma.f32 d16, d0, d17
+ 0xf3fc6c24, // vdup.32 d22, d20[1]
+ 0xf3bc1c22, // vdup.32 d1, d18[1]
+ 0xf3b42c23, // vdup.32 d2, d19[0]
+ 0xf2001c36, // vfma.f32 d1, d0, d22
+ 0xf3f41c25, // vdup.32 d17, d21[0]
+ 0xf3fc4c25, // vdup.32 d20, d21[1]
+ 0xf2002c31, // vfma.f32 d2, d0, d17
+ 0xf3bc3c23, // vdup.32 d3, d19[1]
+ 0xf2003c34, // vfma.f32 d3, d0, d20
+ 0xf22001b0, // vorr d0, d16, d16
+ 0xe12fff1e, // return
+};
static const unsigned char sse2_inc_x[] = { // Advances %rdi by 4.
0x48,0x83,0xc7,0x04, // add $0x4,%rdi
0xc3, // return
};
+static const unsigned char sse2_seed_shader[] = { // xmm0 = float(%edi) + 0x4(%rcx) + 4 floats at 0x14(%rcx); xmm1 = float(int at (%rdx)) + 0x4(%rcx); xmm2 = splat (%rcx); xmm3-xmm7 = 0. NOTE(review): offsets 0/4/0x14 look like kConstants' 1.0f, 0.5f and iota entries - confirm %rcx points there.
+ 0x66,0x0f,0x6e,0xc7, // movd %edi,%xmm0
+ 0x66,0x0f,0x70,0xc0,0x00, // pshufd $0x0,%xmm0,%xmm0
+ 0x0f,0x5b,0xc8, // cvtdq2ps %xmm0,%xmm1
+ 0xf3,0x0f,0x10,0x11, // movss (%rcx),%xmm2
+ 0xf3,0x0f,0x10,0x59,0x04, // movss 0x4(%rcx),%xmm3
+ 0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm3,%xmm3
+ 0x0f,0x58,0xcb, // addps %xmm3,%xmm1
+ 0x0f,0x10,0x41,0x14, // movups 0x14(%rcx),%xmm0
+ 0x0f,0x58,0xc1, // addps %xmm1,%xmm0
+ 0x66,0x0f,0x6e,0x0a, // movd (%rdx),%xmm1
+ 0x66,0x0f,0x70,0xc9,0x00, // pshufd $0x0,%xmm1,%xmm1
+ 0x0f,0x5b,0xc9, // cvtdq2ps %xmm1,%xmm1
+ 0x0f,0x58,0xcb, // addps %xmm3,%xmm1
+ 0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm2,%xmm2
+ 0x0f,0x57,0xdb, // xorps %xmm3,%xmm3
+ 0x0f,0x57,0xe4, // xorps %xmm4,%xmm4
+ 0x0f,0x57,0xed, // xorps %xmm5,%xmm5
+ 0x0f,0x57,0xf6, // xorps %xmm6,%xmm6
+ 0x0f,0x57,0xff, // xorps %xmm7,%xmm7
+ 0xc3, // return
+};
static const unsigned char sse2_clear[] = {
0x0f,0x57,0xc0, // xorps %xmm0,%xmm0
0x0f,0x57,0xc9, // xorps %xmm1,%xmm1
@@ -901,16 +1066,16 @@ static const unsigned char sse2_unpremul[] = {
0xc3, // return
};
static const unsigned char sse2_from_srgb[] = {
- 0xf3,0x44,0x0f,0x10,0x41,0x1c, // movss 0x1c(%rcx),%xmm8
+ 0xf3,0x44,0x0f,0x10,0x41,0x40, // movss 0x40(%rcx),%xmm8
0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
0x45,0x0f,0x28,0xe8, // movaps %xmm8,%xmm13
0x44,0x0f,0x59,0xe8, // mulps %xmm0,%xmm13
0x44,0x0f,0x28,0xe0, // movaps %xmm0,%xmm12
0x45,0x0f,0x59,0xe4, // mulps %xmm12,%xmm12
- 0xf3,0x44,0x0f,0x10,0x49,0x18, // movss 0x18(%rcx),%xmm9
+ 0xf3,0x44,0x0f,0x10,0x49,0x3c, // movss 0x3c(%rcx),%xmm9
0x45,0x0f,0xc6,0xc9,0x00, // shufps $0x0,%xmm9,%xmm9
- 0xf3,0x44,0x0f,0x10,0x51,0x10, // movss 0x10(%rcx),%xmm10
- 0xf3,0x44,0x0f,0x10,0x59,0x14, // movss 0x14(%rcx),%xmm11
+ 0xf3,0x44,0x0f,0x10,0x51,0x34, // movss 0x34(%rcx),%xmm10
+ 0xf3,0x44,0x0f,0x10,0x59,0x38, // movss 0x38(%rcx),%xmm11
0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11
0x45,0x0f,0x28,0xf1, // movaps %xmm9,%xmm14
0x44,0x0f,0x59,0xf0, // mulps %xmm0,%xmm14
@@ -918,7 +1083,7 @@ static const unsigned char sse2_from_srgb[] = {
0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10
0x45,0x0f,0x59,0xf4, // mulps %xmm12,%xmm14
0x45,0x0f,0x58,0xf2, // addps %xmm10,%xmm14
- 0xf3,0x44,0x0f,0x10,0x61,0x20, // movss 0x20(%rcx),%xmm12
+ 0xf3,0x44,0x0f,0x10,0x61,0x44, // movss 0x44(%rcx),%xmm12
0x45,0x0f,0xc6,0xe4,0x00, // shufps $0x0,%xmm12,%xmm12
0x41,0x0f,0xc2,0xc4,0x01, // cmpltps %xmm12,%xmm0
0x44,0x0f,0x54,0xe8, // andps %xmm0,%xmm13
@@ -961,16 +1126,16 @@ static const unsigned char sse2_to_srgb[] = {
0x45,0x0f,0x53,0xe8, // rcpps %xmm8,%xmm13
0x45,0x0f,0x52,0xf8, // rsqrtps %xmm8,%xmm15
0xf3,0x0f,0x10,0x19, // movss (%rcx),%xmm3
- 0xf3,0x44,0x0f,0x10,0x41,0x24, // movss 0x24(%rcx),%xmm8
+ 0xf3,0x44,0x0f,0x10,0x41,0x48, // movss 0x48(%rcx),%xmm8
0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
0x45,0x0f,0x28,0xf0, // movaps %xmm8,%xmm14
0x44,0x0f,0x59,0xf0, // mulps %xmm0,%xmm14
0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm3,%xmm3
- 0xf3,0x44,0x0f,0x10,0x51,0x28, // movss 0x28(%rcx),%xmm10
+ 0xf3,0x44,0x0f,0x10,0x51,0x4c, // movss 0x4c(%rcx),%xmm10
0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10
- 0xf3,0x44,0x0f,0x10,0x59,0x2c, // movss 0x2c(%rcx),%xmm11
+ 0xf3,0x44,0x0f,0x10,0x59,0x50, // movss 0x50(%rcx),%xmm11
0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11
- 0xf3,0x44,0x0f,0x10,0x61,0x30, // movss 0x30(%rcx),%xmm12
+ 0xf3,0x44,0x0f,0x10,0x61,0x54, // movss 0x54(%rcx),%xmm12
0x45,0x0f,0xc6,0xe4,0x00, // shufps $0x0,%xmm12,%xmm12
0x45,0x0f,0x59,0xeb, // mulps %xmm11,%xmm13
0x45,0x0f,0x58,0xec, // addps %xmm12,%xmm13
@@ -978,7 +1143,7 @@ static const unsigned char sse2_to_srgb[] = {
0x45,0x0f,0x58,0xfd, // addps %xmm13,%xmm15
0x44,0x0f,0x28,0xcb, // movaps %xmm3,%xmm9
0x45,0x0f,0x5d,0xcf, // minps %xmm15,%xmm9
- 0xf3,0x44,0x0f,0x10,0x69,0x34, // movss 0x34(%rcx),%xmm13
+ 0xf3,0x44,0x0f,0x10,0x69,0x58, // movss 0x58(%rcx),%xmm13
0x45,0x0f,0xc6,0xed,0x00, // shufps $0x0,%xmm13,%xmm13
0x41,0x0f,0xc2,0xc5,0x01, // cmpltps %xmm13,%xmm0
0x44,0x0f,0x54,0xf0, // andps %xmm0,%xmm14
@@ -1027,7 +1192,7 @@ static const unsigned char sse2_scale_u8[] = {
0x66,0x45,0x0f,0x60,0xc1, // punpcklbw %xmm9,%xmm8
0x66,0x45,0x0f,0x61,0xc1, // punpcklwd %xmm9,%xmm8
0x45,0x0f,0x5b,0xc0, // cvtdq2ps %xmm8,%xmm8
- 0xf3,0x44,0x0f,0x10,0x49,0x08, // movss 0x8(%rcx),%xmm9
+ 0xf3,0x44,0x0f,0x10,0x49,0x0c, // movss 0xc(%rcx),%xmm9
0x45,0x0f,0xc6,0xc9,0x00, // shufps $0x0,%xmm9,%xmm9
0x45,0x0f,0x59,0xc8, // mulps %xmm8,%xmm9
0x41,0x0f,0x59,0xc1, // mulps %xmm9,%xmm0
@@ -1040,7 +1205,7 @@ static const unsigned char sse2_load_tables[] = {
0x48,0x8b,0x02, // mov (%rdx),%rax
0x4c,0x8b,0x42,0x08, // mov 0x8(%rdx),%r8
0xf3,0x44,0x0f,0x6f,0x04,0xb8, // movdqu (%rax,%rdi,4),%xmm8
- 0x66,0x0f,0x6e,0x41,0x0c, // movd 0xc(%rcx),%xmm0
+ 0x66,0x0f,0x6e,0x41,0x10, // movd 0x10(%rcx),%xmm0
0x66,0x0f,0x70,0xc0,0x00, // pshufd $0x0,%xmm0,%xmm0
0x66,0x45,0x0f,0x6f,0xc8, // movdqa %xmm8,%xmm9
0x66,0x41,0x0f,0x72,0xd1,0x08, // psrld $0x8,%xmm9
@@ -1095,7 +1260,7 @@ static const unsigned char sse2_load_tables[] = {
0x41,0x0f,0x14,0xd1, // unpcklps %xmm9,%xmm2
0x66,0x41,0x0f,0x72,0xd0,0x18, // psrld $0x18,%xmm8
0x45,0x0f,0x5b,0xc0, // cvtdq2ps %xmm8,%xmm8
- 0xf3,0x0f,0x10,0x59,0x08, // movss 0x8(%rcx),%xmm3
+ 0xf3,0x0f,0x10,0x59,0x0c, // movss 0xc(%rcx),%xmm3
0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm3,%xmm3
0x41,0x0f,0x59,0xd8, // mulps %xmm8,%xmm3
0xc3, // return
@@ -1103,7 +1268,7 @@ static const unsigned char sse2_load_tables[] = {
static const unsigned char sse2_load_8888[] = {
0x48,0x8b,0x02, // mov (%rdx),%rax
0xf3,0x0f,0x6f,0x1c,0xb8, // movdqu (%rax,%rdi,4),%xmm3
- 0x66,0x0f,0x6e,0x41,0x0c, // movd 0xc(%rcx),%xmm0
+ 0x66,0x0f,0x6e,0x41,0x10, // movd 0x10(%rcx),%xmm0
0x66,0x0f,0x70,0xc0,0x00, // pshufd $0x0,%xmm0,%xmm0
0x66,0x0f,0x6f,0xcb, // movdqa %xmm3,%xmm1
0x66,0x0f,0x72,0xd1,0x08, // psrld $0x8,%xmm1
@@ -1113,7 +1278,7 @@ static const unsigned char sse2_load_8888[] = {
0x66,0x0f,0xdb,0xd0, // pand %xmm0,%xmm2
0x66,0x0f,0xdb,0xc3, // pand %xmm3,%xmm0
0x0f,0x5b,0xc0, // cvtdq2ps %xmm0,%xmm0
- 0xf3,0x44,0x0f,0x10,0x41,0x08, // movss 0x8(%rcx),%xmm8
+ 0xf3,0x44,0x0f,0x10,0x41,0x0c, // movss 0xc(%rcx),%xmm8
0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
0x41,0x0f,0x59,0xc0, // mulps %xmm8,%xmm0
0x0f,0x5b,0xc9, // cvtdq2ps %xmm1,%xmm1
@@ -1127,7 +1292,7 @@ static const unsigned char sse2_load_8888[] = {
};
static const unsigned char sse2_store_8888[] = {
0x48,0x8b,0x02, // mov (%rdx),%rax
- 0xf3,0x44,0x0f,0x10,0x41,0x04, // movss 0x4(%rcx),%xmm8
+ 0xf3,0x44,0x0f,0x10,0x41,0x08, // movss 0x8(%rcx),%xmm8
0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
0x45,0x0f,0x28,0xc8, // movaps %xmm8,%xmm9
0x44,0x0f,0x59,0xc8, // mulps %xmm0,%xmm9
@@ -1163,7 +1328,7 @@ static const unsigned char sse2_load_f16[] = {
0x66,0x0f,0x6f,0xc1, // movdqa %xmm1,%xmm0
0x66,0x41,0x0f,0x61,0xc0, // punpcklwd %xmm8,%xmm0
0x66,0x0f,0x72,0xf0,0x0d, // pslld $0xd,%xmm0
- 0x66,0x0f,0x6e,0x51,0x38, // movd 0x38(%rcx),%xmm2
+ 0x66,0x0f,0x6e,0x51,0x5c, // movd 0x5c(%rcx),%xmm2
0x66,0x44,0x0f,0x70,0xca,0x00, // pshufd $0x0,%xmm2,%xmm9
0x41,0x0f,0x59,0xc1, // mulps %xmm9,%xmm0
0x66,0x41,0x0f,0x69,0xc8, // punpckhwd %xmm8,%xmm1
@@ -1180,7 +1345,7 @@ static const unsigned char sse2_load_f16[] = {
};
static const unsigned char sse2_store_f16[] = {
0x48,0x8b,0x02, // mov (%rdx),%rax
- 0x66,0x44,0x0f,0x6e,0x41,0x3c, // movd 0x3c(%rcx),%xmm8
+ 0x66,0x44,0x0f,0x6e,0x41,0x60, // movd 0x60(%rcx),%xmm8
0x66,0x45,0x0f,0x70,0xc0,0x00, // pshufd $0x0,%xmm8,%xmm8
0x66,0x45,0x0f,0x6f,0xc8, // movdqa %xmm8,%xmm9
0x44,0x0f,0x59,0xc8, // mulps %xmm0,%xmm9
@@ -1204,6 +1369,53 @@ static const unsigned char sse2_store_f16[] = {
0xf3,0x44,0x0f,0x7f,0x54,0xf8,0x10, // movdqu %xmm10,0x10(%rax,%rdi,8)
0xc3, // return
};
+static const unsigned char sse2_clamp_x[] = { // xmm0 = max(0, min(xmm0, limit)); limit is the word splatted from (%rdx) with -1 added as a 32-bit integer (presumably the next float below a positive bound - confirm).
+ 0xf3,0x44,0x0f,0x10,0x02, // movss (%rdx),%xmm8
+ 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
+ 0x66,0x45,0x0f,0x76,0xc9, // pcmpeqd %xmm9,%xmm9 (all bits set, i.e. -1 per int32 lane)
+ 0x66,0x45,0x0f,0xfe,0xc8, // paddd %xmm8,%xmm9
+ 0x41,0x0f,0x5d,0xc1, // minps %xmm9,%xmm0
+ 0x45,0x0f,0x57,0xc0, // xorps %xmm8,%xmm8
+ 0x44,0x0f,0x5f,0xc0, // maxps %xmm0,%xmm8
+ 0x41,0x0f,0x28,0xc0, // movaps %xmm8,%xmm0
+ 0xc3, // return
+};
+static const unsigned char sse2_clamp_y[] = { // xmm1 = max(0, min(xmm1, limit)); limit is the word splatted from (%rdx) with -1 added as a 32-bit integer (presumably the next float below a positive bound - confirm).
+ 0xf3,0x44,0x0f,0x10,0x02, // movss (%rdx),%xmm8
+ 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
+ 0x66,0x45,0x0f,0x76,0xc9, // pcmpeqd %xmm9,%xmm9 (all bits set, i.e. -1 per int32 lane)
+ 0x66,0x45,0x0f,0xfe,0xc8, // paddd %xmm8,%xmm9
+ 0x41,0x0f,0x5d,0xc9, // minps %xmm9,%xmm1
+ 0x45,0x0f,0x57,0xc0, // xorps %xmm8,%xmm8
+ 0x44,0x0f,0x5f,0xc1, // maxps %xmm1,%xmm8
+ 0x41,0x0f,0x28,0xc8, // movaps %xmm8,%xmm1
+ 0xc3, // return
+};
+static const unsigned char sse2_matrix_2x3[] = { // With m = six floats at (%rdx): new xmm0 = m[0]*xmm0 + m[2]*xmm1 + m[4]; new xmm1 = m[1]*xmm0 + m[3]*xmm1 + m[5] (inputs saved in xmm8/xmm9 first).
+ 0x44,0x0f,0x28,0xc9, // movaps %xmm1,%xmm9
+ 0x44,0x0f,0x28,0xc0, // movaps %xmm0,%xmm8
+ 0xf3,0x0f,0x10,0x02, // movss (%rdx),%xmm0
+ 0xf3,0x0f,0x10,0x4a,0x04, // movss 0x4(%rdx),%xmm1
+ 0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm0,%xmm0
+ 0xf3,0x44,0x0f,0x10,0x52,0x08, // movss 0x8(%rdx),%xmm10
+ 0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10
+ 0xf3,0x44,0x0f,0x10,0x5a,0x10, // movss 0x10(%rdx),%xmm11
+ 0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11
+ 0x45,0x0f,0x59,0xd1, // mulps %xmm9,%xmm10
+ 0x45,0x0f,0x58,0xd3, // addps %xmm11,%xmm10
+ 0x41,0x0f,0x59,0xc0, // mulps %xmm8,%xmm0
+ 0x41,0x0f,0x58,0xc2, // addps %xmm10,%xmm0
+ 0x0f,0xc6,0xc9,0x00, // shufps $0x0,%xmm1,%xmm1
+ 0xf3,0x44,0x0f,0x10,0x52,0x0c, // movss 0xc(%rdx),%xmm10
+ 0x45,0x0f,0xc6,0xd2,0x00, // shufps $0x0,%xmm10,%xmm10
+ 0xf3,0x44,0x0f,0x10,0x5a,0x14, // movss 0x14(%rdx),%xmm11
+ 0x45,0x0f,0xc6,0xdb,0x00, // shufps $0x0,%xmm11,%xmm11
+ 0x45,0x0f,0x59,0xd1, // mulps %xmm9,%xmm10
+ 0x45,0x0f,0x58,0xd3, // addps %xmm11,%xmm10
+ 0x41,0x0f,0x59,0xc8, // mulps %xmm8,%xmm1
+ 0x41,0x0f,0x58,0xca, // addps %xmm10,%xmm1
+ 0xc3, // return
+};
static const unsigned char sse2_matrix_3x4[] = {
0x44,0x0f,0x28,0xc9, // movaps %xmm1,%xmm9
0x44,0x0f,0x28,0xc0, // movaps %xmm0,%xmm8
@@ -1252,10 +1464,56 @@ static const unsigned char sse2_matrix_3x4[] = {
0x41,0x0f,0x28,0xd2, // movaps %xmm10,%xmm2
0xc3, // return
};
+static const unsigned char sse2_linear_gradient_2stops[] = { // With a = 4 floats at (%rdx) and b = 4 floats at 0x10(%rdx): (xmm0,xmm1,xmm2,xmm3)[i] = a[i] + t*b[i], where t is the incoming xmm0.
+ 0x44,0x0f,0x10,0x0a, // movups (%rdx),%xmm9
+ 0x0f,0x10,0x5a,0x10, // movups 0x10(%rdx),%xmm3
+ 0x44,0x0f,0x28,0xc3, // movaps %xmm3,%xmm8
+ 0x45,0x0f,0xc6,0xc0,0x00, // shufps $0x0,%xmm8,%xmm8
+ 0x41,0x0f,0x28,0xc9, // movaps %xmm9,%xmm1
+ 0x0f,0xc6,0xc9,0x00, // shufps $0x0,%xmm1,%xmm1
+ 0x44,0x0f,0x59,0xc0, // mulps %xmm0,%xmm8
+ 0x44,0x0f,0x58,0xc1, // addps %xmm1,%xmm8
+ 0x0f,0x28,0xcb, // movaps %xmm3,%xmm1
+ 0x0f,0xc6,0xc9,0x55, // shufps $0x55,%xmm1,%xmm1
+ 0x41,0x0f,0x28,0xd1, // movaps %xmm9,%xmm2
+ 0x0f,0xc6,0xd2,0x55, // shufps $0x55,%xmm2,%xmm2
+ 0x0f,0x59,0xc8, // mulps %xmm0,%xmm1
+ 0x0f,0x58,0xca, // addps %xmm2,%xmm1
+ 0x0f,0x28,0xd3, // movaps %xmm3,%xmm2
+ 0x0f,0xc6,0xd2,0xaa, // shufps $0xaa,%xmm2,%xmm2
+ 0x45,0x0f,0x28,0xd1, // movaps %xmm9,%xmm10
+ 0x45,0x0f,0xc6,0xd2,0xaa, // shufps $0xaa,%xmm10,%xmm10
+ 0x0f,0x59,0xd0, // mulps %xmm0,%xmm2
+ 0x41,0x0f,0x58,0xd2, // addps %xmm10,%xmm2
+ 0x0f,0xc6,0xdb,0xff, // shufps $0xff,%xmm3,%xmm3
+ 0x45,0x0f,0xc6,0xc9,0xff, // shufps $0xff,%xmm9,%xmm9
+ 0x0f,0x59,0xd8, // mulps %xmm0,%xmm3
+ 0x41,0x0f,0x58,0xd9, // addps %xmm9,%xmm3
+ 0x41,0x0f,0x28,0xc0, // movaps %xmm8,%xmm0 (publish the first result last; xmm0 was still needed as t above)
+ 0xc3, // return
+};
static const unsigned char hsw_inc_x[] = { // Advances %rdi by 8.
0x48,0x83,0xc7,0x08, // add $0x8,%rdi
0xc3, // return
};
+static const unsigned char hsw_seed_shader[] = { // ymm0 = float(%edi) + 0x4(%rcx) + 8 floats at 0x14(%rcx); ymm1 = float(int at (%rdx)) + 0x4(%rcx); ymm2 = splat (%rcx); ymm3-ymm7 = 0. NOTE(review): offsets 0/4/0x14 look like kConstants' 1.0f, 0.5f and iota entries - confirm %rcx points there.
+ 0xc5,0xf9,0x6e,0xc7, // vmovd %edi,%xmm0
+ 0xc4,0xe2,0x7d,0x18,0xc0, // vbroadcastss %xmm0,%ymm0
+ 0xc5,0xfc,0x5b,0xc0, // vcvtdq2ps %ymm0,%ymm0
+ 0xc4,0xe2,0x7d,0x18,0x49,0x04, // vbroadcastss 0x4(%rcx),%ymm1
+ 0xc5,0xfc,0x58,0xc1, // vaddps %ymm1,%ymm0,%ymm0
+ 0xc5,0xfc,0x58,0x41,0x14, // vaddps 0x14(%rcx),%ymm0,%ymm0
+ 0xc4,0xe2,0x7d,0x18,0x12, // vbroadcastss (%rdx),%ymm2
+ 0xc5,0xfc,0x5b,0xd2, // vcvtdq2ps %ymm2,%ymm2
+ 0xc5,0xec,0x58,0xc9, // vaddps %ymm1,%ymm2,%ymm1
+ 0xc4,0xe2,0x7d,0x18,0x11, // vbroadcastss (%rcx),%ymm2
+ 0xc5,0xe4,0x57,0xdb, // vxorps %ymm3,%ymm3,%ymm3
+ 0xc5,0xdc,0x57,0xe4, // vxorps %ymm4,%ymm4,%ymm4
+ 0xc5,0xd4,0x57,0xed, // vxorps %ymm5,%ymm5,%ymm5
+ 0xc5,0xcc,0x57,0xf6, // vxorps %ymm6,%ymm6,%ymm6
+ 0xc5,0xc4,0x57,0xff, // vxorps %ymm7,%ymm7,%ymm7
+ 0xc3, // return
+};
static const unsigned char hsw_clear[] = {
0xc5,0xfc,0x57,0xc0, // vxorps %ymm0,%ymm0,%ymm0
0xc5,0xf4,0x57,0xc9, // vxorps %ymm1,%ymm1,%ymm1
@@ -1359,16 +1617,16 @@ static const unsigned char hsw_unpremul[] = {
0xc3, // return
};
static const unsigned char hsw_from_srgb[] = {
- 0xc4,0x62,0x7d,0x18,0x41,0x1c, // vbroadcastss 0x1c(%rcx),%ymm8
+ 0xc4,0x62,0x7d,0x18,0x41,0x40, // vbroadcastss 0x40(%rcx),%ymm8
0xc5,0x3c,0x59,0xc8, // vmulps %ymm0,%ymm8,%ymm9
0xc5,0x7c,0x59,0xd0, // vmulps %ymm0,%ymm0,%ymm10
- 0xc4,0x62,0x7d,0x18,0x59,0x18, // vbroadcastss 0x18(%rcx),%ymm11
- 0xc4,0x62,0x7d,0x18,0x61,0x14, // vbroadcastss 0x14(%rcx),%ymm12
+ 0xc4,0x62,0x7d,0x18,0x59,0x3c, // vbroadcastss 0x3c(%rcx),%ymm11
+ 0xc4,0x62,0x7d,0x18,0x61,0x38, // vbroadcastss 0x38(%rcx),%ymm12
0xc4,0x41,0x7c,0x28,0xeb, // vmovaps %ymm11,%ymm13
0xc4,0x42,0x7d,0xa8,0xec, // vfmadd213ps %ymm12,%ymm0,%ymm13
- 0xc4,0x62,0x7d,0x18,0x71,0x10, // vbroadcastss 0x10(%rcx),%ymm14
+ 0xc4,0x62,0x7d,0x18,0x71,0x34, // vbroadcastss 0x34(%rcx),%ymm14
0xc4,0x42,0x2d,0xa8,0xee, // vfmadd213ps %ymm14,%ymm10,%ymm13
- 0xc4,0x62,0x7d,0x18,0x51,0x20, // vbroadcastss 0x20(%rcx),%ymm10
+ 0xc4,0x62,0x7d,0x18,0x51,0x44, // vbroadcastss 0x44(%rcx),%ymm10
0xc4,0xc1,0x7c,0xc2,0xc2,0x01, // vcmpltps %ymm10,%ymm0,%ymm0
0xc4,0xc3,0x15,0x4a,0xc1,0x00, // vblendvps %ymm0,%ymm9,%ymm13,%ymm0
0xc5,0x3c,0x59,0xc9, // vmulps %ymm1,%ymm8,%ymm9
@@ -1390,16 +1648,16 @@ static const unsigned char hsw_to_srgb[] = {
0xc5,0x7c,0x52,0xc0, // vrsqrtps %ymm0,%ymm8
0xc4,0x41,0x7c,0x53,0xc8, // vrcpps %ymm8,%ymm9
0xc4,0x41,0x7c,0x52,0xd0, // vrsqrtps %ymm8,%ymm10
- 0xc4,0x62,0x7d,0x18,0x41,0x24, // vbroadcastss 0x24(%rcx),%ymm8
+ 0xc4,0x62,0x7d,0x18,0x41,0x48, // vbroadcastss 0x48(%rcx),%ymm8
0xc5,0x3c,0x59,0xd8, // vmulps %ymm0,%ymm8,%ymm11
0xc4,0x62,0x7d,0x18,0x21, // vbroadcastss (%rcx),%ymm12
- 0xc4,0x62,0x7d,0x18,0x69,0x28, // vbroadcastss 0x28(%rcx),%ymm13
- 0xc4,0x62,0x7d,0x18,0x71,0x2c, // vbroadcastss 0x2c(%rcx),%ymm14
- 0xc4,0x62,0x7d,0x18,0x79,0x30, // vbroadcastss 0x30(%rcx),%ymm15
+ 0xc4,0x62,0x7d,0x18,0x69,0x4c, // vbroadcastss 0x4c(%rcx),%ymm13
+ 0xc4,0x62,0x7d,0x18,0x71,0x50, // vbroadcastss 0x50(%rcx),%ymm14
+ 0xc4,0x62,0x7d,0x18,0x79,0x54, // vbroadcastss 0x54(%rcx),%ymm15
0xc4,0x42,0x0d,0xa8,0xcf, // vfmadd213ps %ymm15,%ymm14,%ymm9
0xc4,0x42,0x15,0xb8,0xca, // vfmadd231ps %ymm10,%ymm13,%ymm9
0xc4,0x41,0x1c,0x5d,0xc9, // vminps %ymm9,%ymm12,%ymm9
- 0xc4,0x62,0x7d,0x18,0x51,0x34, // vbroadcastss 0x34(%rcx),%ymm10
+ 0xc4,0x62,0x7d,0x18,0x51,0x58, // vbroadcastss 0x58(%rcx),%ymm10
0xc4,0xc1,0x7c,0xc2,0xc2,0x01, // vcmpltps %ymm10,%ymm0,%ymm0
0xc4,0xc3,0x35,0x4a,0xc3,0x00, // vblendvps %ymm0,%ymm11,%ymm9,%ymm0
0xc5,0x7c,0x52,0xc9, // vrsqrtps %ymm1,%ymm9
@@ -1426,7 +1684,7 @@ static const unsigned char hsw_scale_u8[] = {
0x48,0x8b,0x02, // mov (%rdx),%rax
0xc4,0x62,0x7d,0x31,0x04,0x38, // vpmovzxbd (%rax,%rdi,1),%ymm8
0xc4,0x41,0x7c,0x5b,0xc0, // vcvtdq2ps %ymm8,%ymm8
- 0xc4,0x62,0x7d,0x18,0x49,0x08, // vbroadcastss 0x8(%rcx),%ymm9
+ 0xc4,0x62,0x7d,0x18,0x49,0x0c, // vbroadcastss 0xc(%rcx),%ymm9
0xc4,0x41,0x3c,0x59,0xc1, // vmulps %ymm9,%ymm8,%ymm8
0xc5,0xbc,0x59,0xc0, // vmulps %ymm0,%ymm8,%ymm0
0xc5,0xbc,0x59,0xc9, // vmulps %ymm1,%ymm8,%ymm1
@@ -1438,7 +1696,7 @@ static const unsigned char hsw_load_tables[] = {
0x48,0x8b,0x02, // mov (%rdx),%rax
0x4c,0x8b,0x42,0x08, // mov 0x8(%rdx),%r8
0xc5,0xfc,0x10,0x1c,0xb8, // vmovups (%rax,%rdi,4),%ymm3
- 0xc4,0xe2,0x7d,0x18,0x51,0x0c, // vbroadcastss 0xc(%rcx),%ymm2
+ 0xc4,0xe2,0x7d,0x18,0x51,0x10, // vbroadcastss 0x10(%rcx),%ymm2
0xc5,0xec,0x54,0xcb, // vandps %ymm3,%ymm2,%ymm1
0xc5,0xfc,0x57,0xc0, // vxorps %ymm0,%ymm0,%ymm0
0xc5,0x7c,0xc2,0xc0,0x00, // vcmpeqps %ymm0,%ymm0,%ymm8
@@ -1455,17 +1713,17 @@ static const unsigned char hsw_load_tables[] = {
0xc4,0xa2,0x3d,0x92,0x14,0x88, // vgatherdps %ymm8,(%rax,%ymm9,4),%ymm2
0xc5,0xe5,0x72,0xd3,0x18, // vpsrld $0x18,%ymm3,%ymm3
0xc5,0xfc,0x5b,0xdb, // vcvtdq2ps %ymm3,%ymm3
- 0xc4,0x62,0x7d,0x18,0x41,0x08, // vbroadcastss 0x8(%rcx),%ymm8
+ 0xc4,0x62,0x7d,0x18,0x41,0x0c, // vbroadcastss 0xc(%rcx),%ymm8
0xc4,0xc1,0x64,0x59,0xd8, // vmulps %ymm8,%ymm3,%ymm3
0xc3, // return
};
static const unsigned char hsw_load_8888[] = {
0x48,0x8b,0x02, // mov (%rdx),%rax
0xc5,0xfc,0x10,0x1c,0xb8, // vmovups (%rax,%rdi,4),%ymm3
- 0xc4,0xe2,0x7d,0x18,0x51,0x0c, // vbroadcastss 0xc(%rcx),%ymm2
+ 0xc4,0xe2,0x7d,0x18,0x51,0x10, // vbroadcastss 0x10(%rcx),%ymm2
0xc5,0xec,0x54,0xc3, // vandps %ymm3,%ymm2,%ymm0
0xc5,0xfc,0x5b,0xc0, // vcvtdq2ps %ymm0,%ymm0
- 0xc4,0x62,0x7d,0x18,0x41,0x08, // vbroadcastss 0x8(%rcx),%ymm8
+ 0xc4,0x62,0x7d,0x18,0x41,0x0c, // vbroadcastss 0xc(%rcx),%ymm8
0xc5,0xbc,0x59,0xc0, // vmulps %ymm0,%ymm8,%ymm0
0xc5,0xf5,0x72,0xd3,0x08, // vpsrld $0x8,%ymm3,%ymm1
0xc5,0xec,0x54,0xc9, // vandps %ymm1,%ymm2,%ymm1
@@ -1482,7 +1740,7 @@ static const unsigned char hsw_load_8888[] = {
};
static const unsigned char hsw_store_8888[] = {
0x48,0x8b,0x02, // mov (%rdx),%rax
- 0xc4,0x62,0x7d,0x18,0x41,0x04, // vbroadcastss 0x4(%rcx),%ymm8
+ 0xc4,0x62,0x7d,0x18,0x41,0x08, // vbroadcastss 0x8(%rcx),%ymm8
0xc5,0x3c,0x59,0xc8, // vmulps %ymm0,%ymm8,%ymm9
0xc4,0x41,0x7d,0x5b,0xc9, // vcvtps2dq %ymm9,%ymm9
0xc5,0x3c,0x59,0xd1, // vmulps %ymm1,%ymm8,%ymm10
@@ -1544,6 +1802,39 @@ static const unsigned char hsw_store_f16[] = {
0xc5,0x7a,0x7f,0x44,0xf8,0x30, // vmovdqu %xmm8,0x30(%rax,%rdi,8)
0xc3, // return
};
+static const unsigned char hsw_clamp_x[] = { // AVX2 code spliced for the clamp_x stage: clamps each lane of ymm0 to [0, limit - 1 ulp], limit being the 32-bit value at (%rdx)
+ 0xc4,0x62,0x7d,0x58,0x02, // vpbroadcastd (%rdx),%ymm8 -- copy the 32-bit limit at (%rdx) into every lane of ymm8
+ 0xc4,0x41,0x35,0x76,0xc9, // vpcmpeqd %ymm9,%ymm9,%ymm9 -- ymm9 = all ones (0xffffffff in each lane)
+ 0xc4,0x41,0x3d,0xfe,0xc1, // vpaddd %ymm9,%ymm8,%ymm8 -- integer add of 0xffffffff: the limit's bit pattern minus one
+ 0xc4,0xc1,0x7c,0x5d,0xc0, // vminps %ymm8,%ymm0,%ymm0 -- ymm0 = min(ymm0, ymm8)
+ 0xc4,0x41,0x3c,0x57,0xc0, // vxorps %ymm8,%ymm8,%ymm8 -- ymm8 = 0
+ 0xc5,0xbc,0x5f,0xc0, // vmaxps %ymm0,%ymm8,%ymm0 -- ymm0 = max(0, ymm0)
+ 0xc3, // return
+};
+static const unsigned char hsw_clamp_y[] = { // AVX2 code spliced for the clamp_y stage: same sequence as hsw_clamp_x but operating on ymm1 instead of ymm0
+ 0xc4,0x62,0x7d,0x58,0x02, // vpbroadcastd (%rdx),%ymm8 -- copy the 32-bit limit at (%rdx) into every lane of ymm8
+ 0xc4,0x41,0x35,0x76,0xc9, // vpcmpeqd %ymm9,%ymm9,%ymm9 -- ymm9 = all ones (0xffffffff in each lane)
+ 0xc4,0x41,0x3d,0xfe,0xc1, // vpaddd %ymm9,%ymm8,%ymm8 -- integer add of 0xffffffff: the limit's bit pattern minus one
+ 0xc4,0xc1,0x74,0x5d,0xc8, // vminps %ymm8,%ymm1,%ymm1 -- ymm1 = min(ymm1, ymm8)
+ 0xc4,0x41,0x3c,0x57,0xc0, // vxorps %ymm8,%ymm8,%ymm8 -- ymm8 = 0
+ 0xc5,0xbc,0x5f,0xc9, // vmaxps %ymm1,%ymm8,%ymm1 -- ymm1 = max(0, ymm1)
+ 0xc3, // return
+};
+static const unsigned char hsw_matrix_2x3[] = { // AVX2+FMA code spliced for the matrix_2x3 stage; m below is the float array at (%rdx), ymm0/ymm1 are both inputs and outputs
+ 0xc4,0x62,0x7d,0x18,0x0a, // vbroadcastss (%rdx),%ymm9 -- ymm9 = m[0]
+ 0xc4,0x62,0x7d,0x18,0x52,0x08, // vbroadcastss 0x8(%rdx),%ymm10 -- ymm10 = m[2]
+ 0xc4,0x62,0x7d,0x18,0x42,0x10, // vbroadcastss 0x10(%rdx),%ymm8 -- ymm8 = m[4]
+ 0xc4,0x42,0x75,0xb8,0xc2, // vfmadd231ps %ymm10,%ymm1,%ymm8 -- ymm8 += ymm1*m[2]
+ 0xc4,0x42,0x7d,0xb8,0xc1, // vfmadd231ps %ymm9,%ymm0,%ymm8 -- ymm8 += ymm0*m[0]
+ 0xc4,0x62,0x7d,0x18,0x52,0x04, // vbroadcastss 0x4(%rdx),%ymm10 -- ymm10 = m[1]
+ 0xc4,0x62,0x7d,0x18,0x5a,0x0c, // vbroadcastss 0xc(%rdx),%ymm11 -- ymm11 = m[3]
+ 0xc4,0x62,0x7d,0x18,0x4a,0x14, // vbroadcastss 0x14(%rdx),%ymm9 -- ymm9 = m[5]
+ 0xc4,0x42,0x75,0xb8,0xcb, // vfmadd231ps %ymm11,%ymm1,%ymm9 -- ymm9 += ymm1*m[3]
+ 0xc4,0x42,0x7d,0xb8,0xca, // vfmadd231ps %ymm10,%ymm0,%ymm9 -- ymm9 += ymm0*m[1]
+ 0xc5,0x7c,0x29,0xc0, // vmovaps %ymm8,%ymm0 -- write results back only after both sums are done
+ 0xc5,0x7c,0x29,0xc9, // vmovaps %ymm9,%ymm1
+ 0xc3, // return
+};
static const unsigned char hsw_matrix_3x4[] = {
0xc4,0x62,0x7d,0x18,0x0a, // vbroadcastss (%rdx),%ymm9
0xc4,0x62,0x7d,0x18,0x52,0x0c, // vbroadcastss 0xc(%rdx),%ymm10
@@ -1571,4 +1862,20 @@ static const unsigned char hsw_matrix_3x4[] = {
0xc5,0x7c,0x29,0xd2, // vmovaps %ymm10,%ymm2
0xc3, // return
};
+static const unsigned char hsw_linear_gradient_2stops[] = { // AVX2+FMA code spliced for the linear_gradient_2stops stage; reads eight floats at (%rdx), uses ymm0 as the multiplier, writes ymm0..ymm3
+ 0xc4,0xe2,0x7d,0x18,0x4a,0x10, // vbroadcastss 0x10(%rdx),%ymm1 -- ymm1 = float at byte offset 0x10 (dc[0] in the stage source)
+ 0xc4,0x62,0x7d,0x18,0x02, // vbroadcastss (%rdx),%ymm8 -- ymm8 = float at offset 0 (c0[0])
+ 0xc4,0x62,0x7d,0xb8,0xc1, // vfmadd231ps %ymm1,%ymm0,%ymm8 -- ymm8 += ymm0*ymm1; kept in ymm8 because ymm0 is still needed below
+ 0xc4,0xe2,0x7d,0x18,0x52,0x14, // vbroadcastss 0x14(%rdx),%ymm2 -- ymm2 = dc[1]
+ 0xc4,0xe2,0x7d,0x18,0x4a,0x04, // vbroadcastss 0x4(%rdx),%ymm1 -- ymm1 = c0[1]
+ 0xc4,0xe2,0x7d,0xb8,0xca, // vfmadd231ps %ymm2,%ymm0,%ymm1 -- ymm1 += ymm0*ymm2
+ 0xc4,0xe2,0x7d,0x18,0x5a,0x18, // vbroadcastss 0x18(%rdx),%ymm3 -- ymm3 = dc[2]
+ 0xc4,0xe2,0x7d,0x18,0x52,0x08, // vbroadcastss 0x8(%rdx),%ymm2 -- ymm2 = c0[2]
+ 0xc4,0xe2,0x7d,0xb8,0xd3, // vfmadd231ps %ymm3,%ymm0,%ymm2 -- ymm2 += ymm0*ymm3
+ 0xc4,0x62,0x7d,0x18,0x4a,0x1c, // vbroadcastss 0x1c(%rdx),%ymm9 -- ymm9 = dc[3]
+ 0xc4,0xe2,0x7d,0x18,0x5a,0x0c, // vbroadcastss 0xc(%rdx),%ymm3 -- ymm3 = c0[3]
+ 0xc4,0xc2,0x7d,0xb8,0xd9, // vfmadd231ps %ymm9,%ymm0,%ymm3 -- ymm3 += ymm0*ymm9
+ 0xc5,0x7c,0x29,0xc0, // vmovaps %ymm8,%ymm0 -- finally overwrite ymm0 with the first channel's result
+ 0xc3, // return
+};
#endif//SkSplicer_generated_DEFINED
diff --git a/src/splicer/SkSplicer_shared.h b/src/splicer/SkSplicer_shared.h
index 0ad0a09478..1747e42b4b 100644
--- a/src/splicer/SkSplicer_shared.h
+++ b/src/splicer/SkSplicer_shared.h
@@ -21,10 +21,13 @@
struct SkSplicer_constants {
float _1; // 1.0f
+ float _0_5; // 0.5f
float _255; // 255.0f
float _1_255; // 1/255.0f
uint32_t _0x000000ff; // 0x000000ff
+ float iota[8]; // 0,1,2,3,4,5,6,7
+
// from_srgb
float _00025; // 0.0025f
float _06975; // 0.6975f
diff --git a/src/splicer/SkSplicer_stages.cpp b/src/splicer/SkSplicer_stages.cpp
index bff58c2518..badaf78695 100644
--- a/src/splicer/SkSplicer_stages.cpp
+++ b/src/splicer/SkSplicer_stages.cpp
@@ -169,6 +169,20 @@ STAGE(inc_x) {
x += sizeof(F) / sizeof(float);
}
+STAGE(seed_shader) { // Seeds r,g from x and the int y stored at ctx (presumably pixel-center coordinates -- confirm with callers); sets b = 1, a = 0, and zeroes dr,dg,db,da.
+ auto y = *(const int*)ctx; // ctx is read as a pointer to a single int
+
+ // It's important for speed to explicitly cast(x) and cast(y),
+ // which has the effect of splatting them to vectors before converting to floats.
+ // On Intel this breaks a data dependency on previous loop iterations' registers.
+
+ r = cast(x) + k->_0_5 + unaligned_load<F>(k->iota); // x + 0.5 + {0,1,2,...}: iota adds a consecutive offset per lane
+ g = cast(y) + k->_0_5; // y + 0.5, identical in every lane
+ b = k->_1; // the 1.0f constant
+ a = 0;
+ dr = dg = db = da = 0;
+}
+
STAGE(clear) {
r = g = b = a = 0;
}
@@ -424,6 +438,21 @@ STAGE(store_f16) {
#endif
}
+static F clamp(const F& v, float limit) { // Clamps every lane of v to the closed range [0, limit - 1 ulp].
+ F l = (F)((U32)F(limit) + U32(0xffffffff)); // limit - 1 ulp: adding 0xffffffff to the float's bits decrements its integer representation by one
+ return max(0, min(v, l)); // upper bound first, then lower bound of 0
+}
+STAGE(clamp_x) { r = clamp(r, *(const float*)ctx); } // clamp r to [0, limit - 1 ulp]; the limit is the float read from ctx
+STAGE(clamp_y) { g = clamp(g, *(const float*)ctx); } // clamp g to [0, limit - 1 ulp]; the limit is the float read from ctx
+
+STAGE(matrix_2x3) { // Applies a 2x3 affine transform to (r,g) in place; ctx is read as six floats.
+ auto m = (const float*)ctx; // layout used below: m[0],m[2],m[4] produce r and m[1],m[3],m[5] produce g
+
+ auto R = fma(r,m[0], fma(g,m[2], m[4])), // R = r*m[0] + g*m[2] + m[4]
+ G = fma(r,m[1], fma(g,m[3], m[5])); // G = r*m[1] + g*m[3] + m[5]
+ r = R; // assigned only after both sums are computed, so G is built from the original r
+ g = G;
+}
STAGE(matrix_3x4) {
auto m = (const float*)ctx;
@@ -434,3 +463,16 @@ STAGE(matrix_3x4) {
g = G;
b = B;
}
+
+STAGE(linear_gradient_2stops) { // Computes c0 + t*dc for each of r,g,b,a, with t taken from the incoming r.
+ using F4 = float __attribute__((ext_vector_type(4))); // 4-lane float vector type
+
+ struct Ctx { F4 c0, dc; }; // presumably the color at t = 0 and the color change per unit t -- TODO confirm with the code that fills ctx
+ auto c = unaligned_load<Ctx>(ctx); // ctx is read as two consecutive 4-float vectors
+
+ auto t = r; // keep the input before r is overwritten
+ r = fma(t, c.dc[0], c.c0[0]); // each channel: t*dc[i] + c0[i]
+ g = fma(t, c.dc[1], c.c0[1]);
+ b = fma(t, c.dc[2], c.c0[2]);
+ a = fma(t, c.dc[3], c.c0[3]);
+}