author	Mike Klein <mtklein@chromium.org>	2017-03-07 14:41:06 -0500
committer	Skia Commit-Bot <skia-commit-bot@chromium.org>	2017-03-07 20:42:42 +0000
commit	fdf3bbe82110488fa271c5a8ab0f17e5c925c2dd (patch)
tree	0598abe0b4bc4c137ad69185666e2518e3ae921a /src
parent	d85f97d6968c9a287a27d6ba10da8fff3b0230e2 (diff)
SkJumper: constant() embeds constants directly in code
This is a potential alternative to threading around a constants struct.
I've applied it in a smattering of locations to get a feel for it. If we
like it and the performance is fine, I'll follow up with the rest.

Change-Id: Ief469e17d6da2cb94e4545ae96cf5d2604ffba88
Reviewed-on: https://skia-review.googlesource.com/9393
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
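The idea in a minimal, hypothetical C++ sketch (the vector width, helper
name, and constants the sketch uses are assumptions for illustration, not
the exact SkJumper_stages.cpp code): constant() splats a literal so the
compiler can embed it in the instruction stream — an fmov/movi immediate,
a mov+movk pair, or a pc-relative literal-pool load — which is the pattern
that replaces the ld1r/vld1/vbroadcastss loads through the constants
pointer (x2 / r2 / %rdx) in the generated diff below.

    // Hypothetical sketch only: SkJumper's real vector width, helper
    // names, and struct layout may differ.
    #include <cstdio>

    using F = float __attribute__((ext_vector_type(4)));  // clang ext vectors

    // Splat a literal into all lanes. With a compile-time constant the
    // compiler materializes the value inline (an fmov/movi immediate, a
    // mov+movk pair, or a pc-relative literal pool) rather than loading
    // it through a constants-struct pointer.
    static inline F constant(float f) { return f; }

    int main() {
        F one     = constant(1.0f);       // was: ld1r {v16.4s}, [x2]
        F inv_255 = constant(1/255.0f);   // 0x3b808081, seen in the literal pools
        std::printf("%g %g\n", one[0], inv_255[0]);
    }

In the diff this shows up as, e.g., fmov v16.4s, #1.0 replacing
ld1r {v16.4s}, [x2] on aarch64, and .word 0x3b808081 literal pools on ARM
where no immediate encoding exists. Note the struct isn't gone yet: loads
like ldur q6, [x2, #20] remain, hence "I'll follow up with the rest."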
Diffstat (limited to 'src')
-rw-r--r--	src/jumper/SkJumper_generated.cpp	1549
-rw-r--r--	src/jumper/SkJumper_stages.cpp	64
2 files changed, 919 insertions(+), 694 deletions(-)
diff --git a/src/jumper/SkJumper_generated.cpp b/src/jumper/SkJumper_generated.cpp
index ce7fc44945..383922d0ce 100644
--- a/src/jumper/SkJumper_generated.cpp
+++ b/src/jumper/SkJumper_generated.cpp
@@ -63,25 +63,22 @@ CODE const uint32_t sk_just_return_aarch64[] = {
};
CODE const uint32_t sk_seed_shader_aarch64[] = {
- 0xaa0203e9, //mov x9, x2
- 0xa9400c28, //ldp x8, x3, [x1]
- 0x4ddfc922, //ld1r {v2.4s}, [x9], #4
- 0x3cc14047, //ldur q7, [x2, #20]
+ 0xa8c10c28, //ldp x8, x3, [x1], #16
+ 0x3cc14046, //ldur q6, [x2, #20]
0x4e040c00, //dup v0.4s, w0
+ 0x4f0167e7, //movi v7.4s, #0x3f, lsl #24
0x4d40c901, //ld1r {v1.4s}, [x8]
- 0x4d40c926, //ld1r {v6.4s}, [x9]
0x4e21d800, //scvtf v0.4s, v0.4s
- 0x91004028, //add x8, x1, #0x10
+ 0x4e27d400, //fadd v0.4s, v0.4s, v7.4s
+ 0x4f03f602, //fmov v2.4s, #1.000000000000000000e+00
0x4e21d821, //scvtf v1.4s, v1.4s
- 0x4e26d400, //fadd v0.4s, v0.4s, v6.4s
0x6f00e403, //movi v3.2d, #0x0
0x6f00e404, //movi v4.2d, #0x0
0x6f00e405, //movi v5.2d, #0x0
- 0x4e26d421, //fadd v1.4s, v1.4s, v6.4s
+ 0x4e26d400, //fadd v0.4s, v0.4s, v6.4s
0x6f00e406, //movi v6.2d, #0x0
- 0x4e20d4e0, //fadd v0.4s, v7.4s, v0.4s
+ 0x4e27d421, //fadd v1.4s, v1.4s, v7.4s
0x6f00e407, //movi v7.2d, #0x0
- 0xaa0803e1, //mov x1, x8
0xd61f0060, //br x3
};
@@ -114,8 +111,8 @@ CODE const uint32_t sk_plus__aarch64[] = {
};
CODE const uint32_t sk_srcover_aarch64[] = {
- 0x4d40c850, //ld1r {v16.4s}, [x2]
0xf8408423, //ldr x3, [x1], #8
+ 0x4f03f610, //fmov v16.4s, #1.000000000000000000e+00
0x4ea3d610, //fsub v16.4s, v16.4s, v3.4s
0x4e24ce00, //fmla v0.4s, v16.4s, v4.4s
0x4e25ce01, //fmla v1.4s, v16.4s, v5.4s
@@ -125,12 +122,12 @@ CODE const uint32_t sk_srcover_aarch64[] = {
};
CODE const uint32_t sk_dstover_aarch64[] = {
- 0x4d40c851, //ld1r {v17.4s}, [x2]
+ 0x4f03f611, //fmov v17.4s, #1.000000000000000000e+00
0xf8408423, //ldr x3, [x1], #8
0x4ea41c90, //mov v16.16b, v4.16b
- 0x4ea61cd2, //mov v18.16b, v6.16b
0x4ea7d634, //fsub v20.4s, v17.4s, v7.4s
0x4ea51cb1, //mov v17.16b, v5.16b
+ 0x4ea61cd2, //mov v18.16b, v6.16b
0x4ea71cf3, //mov v19.16b, v7.16b
0x4e20ce90, //fmla v16.4s, v20.4s, v0.4s
0x4e21ce91, //fmla v17.4s, v20.4s, v1.4s
@@ -154,8 +151,8 @@ CODE const uint32_t sk_clamp_0_aarch64[] = {
};
CODE const uint32_t sk_clamp_1_aarch64[] = {
- 0x4d40c850, //ld1r {v16.4s}, [x2]
0xf8408423, //ldr x3, [x1], #8
+ 0x4f03f610, //fmov v16.4s, #1.000000000000000000e+00
0x4eb0f400, //fmin v0.4s, v0.4s, v16.4s
0x4eb0f421, //fmin v1.4s, v1.4s, v16.4s
0x4eb0f442, //fmin v2.4s, v2.4s, v16.4s
@@ -164,8 +161,8 @@ CODE const uint32_t sk_clamp_1_aarch64[] = {
};
CODE const uint32_t sk_clamp_a_aarch64[] = {
- 0x4d40c850, //ld1r {v16.4s}, [x2]
0xf8408423, //ldr x3, [x1], #8
+ 0x4f03f610, //fmov v16.4s, #1.000000000000000000e+00
0x4eb0f463, //fmin v3.4s, v3.4s, v16.4s
0x4ea3f400, //fmin v0.4s, v0.4s, v3.4s
0x4ea3f421, //fmin v1.4s, v1.4s, v3.4s
@@ -235,11 +232,11 @@ CODE const uint32_t sk_premul_aarch64[] = {
};
CODE const uint32_t sk_unpremul_aarch64[] = {
- 0x4d40c850, //ld1r {v16.4s}, [x2]
+ 0x4f03f611, //fmov v17.4s, #1.000000000000000000e+00
0xf8408423, //ldr x3, [x1], #8
- 0x4ea0d871, //fcmeq v17.4s, v3.4s, #0.0
- 0x6e23fe10, //fdiv v16.4s, v16.4s, v3.4s
- 0x4e711e10, //bic v16.16b, v16.16b, v17.16b
+ 0x4ea0d870, //fcmeq v16.4s, v3.4s, #0.0
+ 0x6e23fe31, //fdiv v17.4s, v17.4s, v3.4s
+ 0x4e701e30, //bic v16.16b, v17.16b, v16.16b
0x6e20de00, //fmul v0.4s, v16.4s, v0.4s
0x6e21de01, //fmul v1.4s, v16.4s, v1.4s
0x6e22de02, //fmul v2.4s, v16.4s, v2.4s
@@ -357,21 +354,23 @@ CODE const uint32_t sk_scale_1_float_aarch64[] = {
CODE const uint32_t sk_scale_u8_aarch64[] = {
0xa8c10c28, //ldp x8, x3, [x1], #16
- 0xbd400c51, //ldr s17, [x2, #12]
+ 0x52a77009, //mov w9, #0x3b800000
+ 0x72901029, //movk w9, #0x8081
+ 0x4e040d30, //dup v16.4s, w9
0xf9400108, //ldr x8, [x8]
0x8b000108, //add x8, x8, x0
0x39400109, //ldrb w9, [x8]
0x3940050a, //ldrb w10, [x8, #1]
0x3940090b, //ldrb w11, [x8, #2]
0x39400d08, //ldrb w8, [x8, #3]
- 0x4e021d30, //mov v16.h[0], w9
- 0x4e061d50, //mov v16.h[1], w10
- 0x4e0a1d70, //mov v16.h[2], w11
- 0x4e0e1d10, //mov v16.h[3], w8
- 0x2f07b7f0, //bic v16.4h, #0xff, lsl #8
- 0x2f10a610, //uxtl v16.4s, v16.4h
- 0x6e21da10, //ucvtf v16.4s, v16.4s
- 0x4f919210, //fmul v16.4s, v16.4s, v17.s[0]
+ 0x4e021d31, //mov v17.h[0], w9
+ 0x4e061d51, //mov v17.h[1], w10
+ 0x4e0a1d71, //mov v17.h[2], w11
+ 0x4e0e1d11, //mov v17.h[3], w8
+ 0x2f07b7f1, //bic v17.4h, #0xff, lsl #8
+ 0x2f10a631, //uxtl v17.4s, v17.4h
+ 0x6e21da31, //ucvtf v17.4s, v17.4s
+ 0x6e30de30, //fmul v16.4s, v17.4s, v16.4s
0x6e20de00, //fmul v0.4s, v16.4s, v0.4s
0x6e21de01, //fmul v1.4s, v16.4s, v1.4s
0x6e22de02, //fmul v2.4s, v16.4s, v2.4s
@@ -399,22 +398,24 @@ CODE const uint32_t sk_lerp_1_float_aarch64[] = {
CODE const uint32_t sk_lerp_u8_aarch64[] = {
0xa8c10c28, //ldp x8, x3, [x1], #16
- 0xbd400c51, //ldr s17, [x2, #12]
- 0x4ea4d412, //fsub v18.4s, v0.4s, v4.4s
+ 0x52a77009, //mov w9, #0x3b800000
+ 0x72901029, //movk w9, #0x8081
+ 0x4e040d30, //dup v16.4s, w9
0xf9400108, //ldr x8, [x8]
+ 0x4ea4d412, //fsub v18.4s, v0.4s, v4.4s
0x8b000108, //add x8, x8, x0
- 0x39400109, //ldrb w9, [x8]
- 0x3940050a, //ldrb w10, [x8, #1]
+ 0x3940010a, //ldrb w10, [x8]
+ 0x39400509, //ldrb w9, [x8, #1]
0x3940090b, //ldrb w11, [x8, #2]
0x39400d08, //ldrb w8, [x8, #3]
- 0x4e021d30, //mov v16.h[0], w9
- 0x4e061d50, //mov v16.h[1], w10
- 0x4e0a1d70, //mov v16.h[2], w11
- 0x4e0e1d10, //mov v16.h[3], w8
- 0x2f07b7f0, //bic v16.4h, #0xff, lsl #8
- 0x2f10a600, //uxtl v0.4s, v16.4h
+ 0x4e021d51, //mov v17.h[0], w10
+ 0x4e061d31, //mov v17.h[1], w9
+ 0x4e0a1d71, //mov v17.h[2], w11
+ 0x4e0e1d11, //mov v17.h[3], w8
+ 0x2f07b7f1, //bic v17.4h, #0xff, lsl #8
+ 0x2f10a620, //uxtl v0.4s, v17.4h
0x6e21d800, //ucvtf v0.4s, v0.4s
- 0x4f919010, //fmul v16.4s, v0.4s, v17.s[0]
+ 0x6e30dc10, //fmul v16.4s, v0.4s, v16.4s
0x4ea41c80, //mov v0.16b, v4.16b
0x4ea5d431, //fsub v17.4s, v1.4s, v5.4s
0x4ea51ca1, //mov v1.16b, v5.16b
@@ -433,35 +434,35 @@ CODE const uint32_t sk_lerp_565_aarch64[] = {
0xa8c10c28, //ldp x8, x3, [x1], #16
0xd37ff809, //lsl x9, x0, #1
0x2d4ec851, //ldp s17, s18, [x2, #116]
- 0x4ea4d413, //fsub v19.4s, v0.4s, v4.4s
+ 0x4ea4d414, //fsub v20.4s, v0.4s, v4.4s
0xf9400108, //ldr x8, [x8]
- 0x4ea41c80, //mov v0.16b, v4.16b
0xfc696903, //ldr d3, [x8, x9]
0x9101a048, //add x8, x2, #0x68
0x4d40c910, //ld1r {v16.4s}, [x8]
0x9101b048, //add x8, x2, #0x6c
- 0x2f10a463, //uxtl v3.4s, v3.4h
- 0x4e231e10, //and v16.16b, v16.16b, v3.16b
- 0x4e21da10, //scvtf v16.4s, v16.4s
- 0x4f919210, //fmul v16.4s, v16.4s, v17.s[0]
- 0x4d40c911, //ld1r {v17.4s}, [x8]
+ 0x4d40c913, //ld1r {v19.4s}, [x8]
0x9101c048, //add x8, x2, #0x70
- 0x4e33ce00, //fmla v0.4s, v16.4s, v19.4s
- 0x4ea5d430, //fsub v16.4s, v1.4s, v5.4s
- 0x4e231e31, //and v17.16b, v17.16b, v3.16b
- 0x4e21da31, //scvtf v17.4s, v17.4s
- 0x4f929231, //fmul v17.4s, v17.4s, v18.s[0]
- 0x4d40c912, //ld1r {v18.4s}, [x8]
+ 0x2f10a463, //uxtl v3.4s, v3.4h
+ 0x4d40c915, //ld1r {v21.4s}, [x8]
+ 0x4e231e00, //and v0.16b, v16.16b, v3.16b
+ 0x4e21d800, //scvtf v0.4s, v0.4s
+ 0x4f919010, //fmul v16.4s, v0.4s, v17.s[0]
+ 0x4ea41c80, //mov v0.16b, v4.16b
+ 0xbd407c51, //ldr s17, [x2, #124]
+ 0x4e34ce00, //fmla v0.4s, v16.4s, v20.4s
+ 0x4e231e70, //and v16.16b, v19.16b, v3.16b
+ 0x4e231ea3, //and v3.16b, v21.16b, v3.16b
+ 0x4ea5d433, //fsub v19.4s, v1.4s, v5.4s
+ 0x4e21da01, //scvtf v1.4s, v16.4s
+ 0x4f929030, //fmul v16.4s, v1.4s, v18.s[0]
+ 0x4ea6d452, //fsub v18.4s, v2.4s, v6.4s
+ 0x4e21d862, //scvtf v2.4s, v3.4s
0x4ea51ca1, //mov v1.16b, v5.16b
- 0x4e30ce21, //fmla v1.4s, v17.4s, v16.4s
- 0xbd407c50, //ldr s16, [x2, #124]
- 0x4e231e52, //and v18.16b, v18.16b, v3.16b
- 0x4d40c843, //ld1r {v3.4s}, [x2]
- 0x4e21da52, //scvtf v18.4s, v18.4s
- 0x4ea6d451, //fsub v17.4s, v2.4s, v6.4s
+ 0x4f919043, //fmul v3.4s, v2.4s, v17.s[0]
0x4ea61cc2, //mov v2.16b, v6.16b
- 0x4f909250, //fmul v16.4s, v18.4s, v16.s[0]
- 0x4e31ce02, //fmla v2.4s, v16.4s, v17.4s
+ 0x4e33ce01, //fmla v1.4s, v16.4s, v19.4s
+ 0x4e32cc62, //fmla v2.4s, v3.4s, v18.4s
+ 0x4f03f603, //fmov v3.4s, #1.000000000000000000e+00
0xd61f0060, //br x3
};
@@ -617,46 +618,48 @@ CODE const uint32_t sk_store_565_aarch64[] = {
CODE const uint32_t sk_load_8888_aarch64[] = {
0xa8c10c28, //ldp x8, x3, [x1], #16
0xd37ef409, //lsl x9, x0, #2
- 0xbd400c42, //ldr s2, [x2, #12]
+ 0x6f00e621, //movi v1.2d, #0xff000000ff
0xf9400108, //ldr x8, [x8]
0x3ce96900, //ldr q0, [x8, x9]
- 0x91004048, //add x8, x2, #0x10
- 0x4d40c901, //ld1r {v1.4s}, [x8]
+ 0x52a77008, //mov w8, #0x3b800000
+ 0x72901028, //movk w8, #0x8081
+ 0x4e040d02, //dup v2.4s, w8
0x6f380410, //ushr v16.4s, v0.4s, #8
0x6f300411, //ushr v17.4s, v0.4s, #16
- 0x4e201c23, //and v3.16b, v1.16b, v0.16b
+ 0x4e211c03, //and v3.16b, v0.16b, v1.16b
0x6f280400, //ushr v0.4s, v0.4s, #24
- 0x4e301c30, //and v16.16b, v1.16b, v16.16b
- 0x4e311c21, //and v1.16b, v1.16b, v17.16b
+ 0x4e211e10, //and v16.16b, v16.16b, v1.16b
+ 0x4e211e21, //and v1.16b, v17.16b, v1.16b
0x4e21d863, //scvtf v3.4s, v3.4s
0x4e21d811, //scvtf v17.4s, v0.4s
0x4e21da10, //scvtf v16.4s, v16.4s
0x4e21d832, //scvtf v18.4s, v1.4s
- 0x4f829060, //fmul v0.4s, v3.4s, v2.s[0]
- 0x4f829223, //fmul v3.4s, v17.4s, v2.s[0]
- 0x4f829201, //fmul v1.4s, v16.4s, v2.s[0]
- 0x4f829242, //fmul v2.4s, v18.4s, v2.s[0]
+ 0x6e22dc60, //fmul v0.4s, v3.4s, v2.4s
+ 0x6e22de23, //fmul v3.4s, v17.4s, v2.4s
+ 0x6e22de01, //fmul v1.4s, v16.4s, v2.4s
+ 0x6e22de42, //fmul v2.4s, v18.4s, v2.4s
0xd61f0060, //br x3
};
CODE const uint32_t sk_store_8888_aarch64[] = {
- 0xbd400850, //ldr s16, [x2, #8]
+ 0x52a86fea, //mov w10, #0x437f0000
+ 0x4e040d50, //dup v16.4s, w10
0xf9400028, //ldr x8, [x1]
- 0xd37ef409, //lsl x9, x0, #2
- 0x4f909032, //fmul v18.4s, v1.4s, v16.s[0]
- 0x4f909011, //fmul v17.4s, v0.4s, v16.s[0]
+ 0x6e30dc32, //fmul v18.4s, v1.4s, v16.4s
+ 0x6e30dc11, //fmul v17.4s, v0.4s, v16.4s
0x6e21aa52, //fcvtnu v18.4s, v18.4s
0x6e21aa31, //fcvtnu v17.4s, v17.4s
0x4f285652, //shl v18.4s, v18.4s, #8
0x4eb11e51, //orr v17.16b, v18.16b, v17.16b
- 0x4f909052, //fmul v18.4s, v2.4s, v16.s[0]
- 0xf9400108, //ldr x8, [x8]
- 0x4f909070, //fmul v16.4s, v3.4s, v16.s[0]
+ 0x6e30dc52, //fmul v18.4s, v2.4s, v16.4s
+ 0x6e30dc70, //fmul v16.4s, v3.4s, v16.4s
0x6e21aa52, //fcvtnu v18.4s, v18.4s
+ 0xf9400108, //ldr x8, [x8]
0x6e21aa10, //fcvtnu v16.4s, v16.4s
0x4f305652, //shl v18.4s, v18.4s, #16
0x4eb21e31, //orr v17.16b, v17.16b, v18.16b
0x4f385610, //shl v16.4s, v16.4s, #24
+ 0xd37ef409, //lsl x9, x0, #2
0x4eb01e30, //orr v16.16b, v17.16b, v16.16b
0x3ca96910, //str q16, [x8, x9]
0xf9400423, //ldr x3, [x1, #8]
@@ -968,23 +971,22 @@ CODE const uint32_t sk_just_return_vfp4[] = {
};
CODE const uint32_t sk_seed_shader_vfp4[] = {
- 0xe8911008, //ldm r1, {r3, ip}
0xee800b90, //vdup.32 d16, r0
+ 0xe8911008, //ldm r1, {r3, ip}
0xf3fb0620, //vcvt.f32.s32 d16, d16
+ 0xf2c3161f, //vmov.i32 d17, #1056964608
0xedd23b05, //vldr d19, [r2, #20]
- 0xf2803010, //vmov.i32 d3, #0
- 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32]
- 0xe2823004, //add r3, r2, #4
- 0xf3fb1621, //vcvt.f32.s32 d17, d17
- 0xe2811008, //add r1, r1, #8
0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32]
+ 0xf2872f10, //vmov.f32 d2, #1
+ 0xf3fb2622, //vcvt.f32.s32 d18, d18
+ 0xe2811008, //add r1, r1, #8
+ 0xf2400da1, //vadd.f32 d16, d16, d17
+ 0xf2803010, //vmov.i32 d3, #0
0xf2804010, //vmov.i32 d4, #0
- 0xf2400da2, //vadd.f32 d16, d16, d18
+ 0xf2021da1, //vadd.f32 d1, d18, d17
+ 0xf2000da3, //vadd.f32 d0, d16, d19
0xf2805010, //vmov.i32 d5, #0
- 0xf4a22c9f, //vld1.32 {d2[]}, [r2 :32]
- 0xf2011da2, //vadd.f32 d1, d17, d18
0xf2806010, //vmov.i32 d6, #0
- 0xf2030da0, //vadd.f32 d0, d19, d16
0xf2807010, //vmov.i32 d7, #0
0xe12fff1c, //bx ip
};
@@ -1019,7 +1021,7 @@ CODE const uint32_t sk_plus__vfp4[] = {
};
CODE const uint32_t sk_srcover_vfp4[] = {
- 0xf4e20c9f, //vld1.32 {d16[]}, [r2 :32]
+ 0xf2c70f10, //vmov.f32 d16, #1
0xe4913004, //ldr r3, [r1], #4
0xf2600d83, //vsub.f32 d16, d16, d3
0xf2040c30, //vfma.f32 d0, d4, d16
@@ -1030,12 +1032,12 @@ CODE const uint32_t sk_srcover_vfp4[] = {
};
CODE const uint32_t sk_dstover_vfp4[] = {
- 0xf4e20c9f, //vld1.32 {d16[]}, [r2 :32]
+ 0xf2c70f10, //vmov.f32 d16, #1
+ 0xe4913004, //ldr r3, [r1], #4
0xf2651115, //vorr d17, d5, d5
0xf2604d87, //vsub.f32 d20, d16, d7
0xf2640114, //vorr d16, d4, d4
0xf2662116, //vorr d18, d6, d6
- 0xe4913004, //ldr r3, [r1], #4
0xf2673117, //vorr d19, d7, d7
0xf2400c34, //vfma.f32 d16, d0, d20
0xf2411c34, //vfma.f32 d17, d1, d20
@@ -1059,7 +1061,7 @@ CODE const uint32_t sk_clamp_0_vfp4[] = {
};
CODE const uint32_t sk_clamp_1_vfp4[] = {
- 0xf4e20c9f, //vld1.32 {d16[]}, [r2 :32]
+ 0xf2c70f10, //vmov.f32 d16, #1
0xe4913004, //ldr r3, [r1], #4
0xf2200f20, //vmin.f32 d0, d0, d16
0xf2211f20, //vmin.f32 d1, d1, d16
@@ -1069,7 +1071,7 @@ CODE const uint32_t sk_clamp_1_vfp4[] = {
};
CODE const uint32_t sk_clamp_a_vfp4[] = {
- 0xf4e20c9f, //vld1.32 {d16[]}, [r2 :32]
+ 0xf2c70f10, //vmov.f32 d16, #1
0xe4913004, //ldr r3, [r1], #4
0xf2233f20, //vmin.f32 d3, d3, d16
0xf2200f03, //vmin.f32 d0, d0, d3
@@ -1145,9 +1147,9 @@ CODE const uint32_t sk_premul_vfp4[] = {
CODE const uint32_t sk_unpremul_vfp4[] = {
0xed2d8b04, //vpush {d8-d9}
- 0xed928a00, //vldr s16, [r2]
- 0xf2c00010, //vmov.i32 d16, #0
+ 0xeeb78a00, //vmov.f32 s16, #112
0xf3f91503, //vceq.f32 d17, d3, #0
+ 0xf2c00010, //vmov.i32 d16, #0
0xe4913004, //ldr r3, [r1], #4
0xeec89a23, //vdiv.f32 s19, s16, s7
0xee889a03, //vdiv.f32 s18, s16, s6
@@ -1281,28 +1283,28 @@ CODE const uint32_t sk_scale_1_float_vfp4[] = {
};
CODE const uint32_t sk_scale_u8_vfp4[] = {
- 0xed2d8b02, //vpush {d8}
- 0xe24dd008, //sub sp, sp, #8
+ 0xe24dd004, //sub sp, sp, #4
0xe8911008, //ldm r1, {r3, ip}
0xe2811008, //add r1, r1, #8
0xe5933000, //ldr r3, [r3]
0xe0833000, //add r3, r3, r0
0xe1d330b0, //ldrh r3, [r3]
- 0xe1cd30b4, //strh r3, [sp, #4]
- 0xe28d3004, //add r3, sp, #4
- 0xed928a03, //vldr s16, [r2, #12]
+ 0xe1cd30b0, //strh r3, [sp]
+ 0xe1a0300d, //mov r3, sp
0xf4e3041f, //vld1.16 {d16[0]}, [r3 :16]
0xf3c80a30, //vmovl.u8 q8, d16
0xf3d00a30, //vmovl.u16 q8, d16
0xf3fb06a0, //vcvt.f32.u32 d16, d16
- 0xf2e009c8, //vmul.f32 d16, d16, d8[0]
+ 0xeddf1b06, //vldr d17, [pc, #24]
+ 0xf3400db1, //vmul.f32 d16, d16, d17
0xf3000d90, //vmul.f32 d0, d16, d0
0xf3001d91, //vmul.f32 d1, d16, d1
0xf3002d92, //vmul.f32 d2, d16, d2
0xf3003d93, //vmul.f32 d3, d16, d3
- 0xe28dd008, //add sp, sp, #8
- 0xecbd8b02, //vpop {d8}
+ 0xe28dd004, //add sp, sp, #4
0xe12fff1c, //bx ip
+ 0x3b808081, //.word 0x3b808081
+ 0x3b808081, //.word 0x3b808081
};
CODE const uint32_t sk_lerp_1_float_vfp4[] = {
@@ -1325,36 +1327,36 @@ CODE const uint32_t sk_lerp_1_float_vfp4[] = {
};
CODE const uint32_t sk_lerp_u8_vfp4[] = {
- 0xed2d8b02, //vpush {d8}
- 0xe24dd008, //sub sp, sp, #8
+ 0xe24dd004, //sub sp, sp, #4
0xe8911008, //ldm r1, {r3, ip}
- 0xf2612d05, //vsub.f32 d18, d1, d5
+ 0xf2602d04, //vsub.f32 d18, d0, d4
0xf2623d06, //vsub.f32 d19, d2, d6
0xf2634d07, //vsub.f32 d20, d3, d7
0xe2811008, //add r1, r1, #8
0xe5933000, //ldr r3, [r3]
- 0xf2251115, //vorr d1, d5, d5
+ 0xf2240114, //vorr d0, d4, d4
0xf2262116, //vorr d2, d6, d6
0xe0833000, //add r3, r3, r0
0xf2273117, //vorr d3, d7, d7
0xe1d330b0, //ldrh r3, [r3]
- 0xe1cd30b4, //strh r3, [sp, #4]
- 0xe28d3004, //add r3, sp, #4
- 0xed928a03, //vldr s16, [r2, #12]
+ 0xe1cd30b0, //strh r3, [sp]
+ 0xe1a0300d, //mov r3, sp
0xf4e3041f, //vld1.16 {d16[0]}, [r3 :16]
0xf3c80a30, //vmovl.u8 q8, d16
0xf3d00a30, //vmovl.u16 q8, d16
0xf3fb06a0, //vcvt.f32.u32 d16, d16
- 0xf2601d04, //vsub.f32 d17, d0, d4
- 0xf2240114, //vorr d0, d4, d4
- 0xf2e009c8, //vmul.f32 d16, d16, d8[0]
- 0xf2010cb0, //vfma.f32 d0, d17, d16
- 0xf2021cb0, //vfma.f32 d1, d18, d16
+ 0xeddf1b08, //vldr d17, [pc, #32]
+ 0xf3400db1, //vmul.f32 d16, d16, d17
+ 0xf2611d05, //vsub.f32 d17, d1, d5
+ 0xf2251115, //vorr d1, d5, d5
+ 0xf2020cb0, //vfma.f32 d0, d18, d16
+ 0xf2011cb0, //vfma.f32 d1, d17, d16
0xf2032cb0, //vfma.f32 d2, d19, d16
0xf2043cb0, //vfma.f32 d3, d20, d16
- 0xe28dd008, //add sp, sp, #8
- 0xecbd8b02, //vpop {d8}
+ 0xe28dd004, //add sp, sp, #4
0xe12fff1c, //bx ip
+ 0x3b808081, //.word 0x3b808081
+ 0x3b808081, //.word 0x3b808081
};
CODE const uint32_t sk_lerp_565_vfp4[] = {
@@ -1387,7 +1389,6 @@ CODE const uint32_t sk_lerp_565_vfp4[] = {
0xf3fb2622, //vcvt.f32.s32 d18, d18
0xf2614d05, //vsub.f32 d20, d1, d5
0xf2e009c3, //vmul.f32 d16, d16, d3[0]
- 0xf4a23c9f, //vld1.32 {d3[]}, [r2 :32]
0xf2625d06, //vsub.f32 d21, d2, d6
0xf2e119c8, //vmul.f32 d17, d17, d8[0]
0xf2e229c9, //vmul.f32 d18, d18, d9[0]
@@ -1396,6 +1397,7 @@ CODE const uint32_t sk_lerp_565_vfp4[] = {
0xf2030cb0, //vfma.f32 d0, d19, d16
0xf2041cb1, //vfma.f32 d1, d20, d17
0xf2052cb2, //vfma.f32 d2, d21, d18
+ 0xf2873f10, //vmov.f32 d3, #1
0xe28dd008, //add sp, sp, #8
0xecbd8b04, //vpop {d8-d9}
0xe12fff1c, //bx ip
@@ -1542,64 +1544,67 @@ CODE const uint32_t sk_store_565_vfp4[] = {
0xe5913004, //ldr r3, [r1, #4]
0xe2811008, //add r1, r1, #8
0xe12fff13, //bx r3
+ 0xe320f000, //nop {0}
};
CODE const uint32_t sk_load_8888_vfp4[] = {
- 0xe92d4800, //push {fp, lr}
0xe8911008, //ldm r1, {r3, ip}
+ 0xf3c7001f, //vmov.i32 d16, #255
0xe2811008, //add r1, r1, #8
- 0xed922a03, //vldr s4, [r2, #12]
- 0xe593e000, //ldr lr, [r3]
- 0xe2823010, //add r3, r2, #16
- 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32]
- 0xe08e3100, //add r3, lr, r0, lsl #2
+ 0xe5933000, //ldr r3, [r3]
+ 0xe0833100, //add r3, r3, r0, lsl #2
0xedd31b00, //vldr d17, [r3]
- 0xf24021b1, //vand d18, d16, d17
+ 0xf24121b0, //vand d18, d17, d16
0xf3f83031, //vshr.u32 d19, d17, #8
0xf3e84031, //vshr.u32 d20, d17, #24
0xf3f01031, //vshr.u32 d17, d17, #16
- 0xf24031b3, //vand d19, d16, d19
- 0xf24001b1, //vand d16, d16, d17
+ 0xf24331b0, //vand d19, d19, d16
+ 0xf24101b0, //vand d16, d17, d16
+ 0xeddf1b08, //vldr d17, [pc, #32]
0xf3fb2622, //vcvt.f32.s32 d18, d18
0xf3fb4624, //vcvt.f32.s32 d20, d20
- 0xf3fb1623, //vcvt.f32.s32 d17, d19
+ 0xf3fb3623, //vcvt.f32.s32 d19, d19
0xf3fb0620, //vcvt.f32.s32 d16, d16
- 0xf2a209c2, //vmul.f32 d0, d18, d2[0]
- 0xf2a439c2, //vmul.f32 d3, d20, d2[0]
- 0xf2a119c2, //vmul.f32 d1, d17, d2[0]
- 0xf2a029c2, //vmul.f32 d2, d16, d2[0]
- 0xe8bd4800, //pop {fp, lr}
+ 0xf3020db1, //vmul.f32 d0, d18, d17
+ 0xf3043db1, //vmul.f32 d3, d20, d17
+ 0xf3031db1, //vmul.f32 d1, d19, d17
+ 0xf3002db1, //vmul.f32 d2, d16, d17
0xe12fff1c, //bx ip
+ 0x3b808081, //.word 0x3b808081
+ 0x3b808081, //.word 0x3b808081
};
CODE const uint32_t sk_store_8888_vfp4[] = {
- 0xe2823008, //add r3, r2, #8
+ 0xeddf0b1a, //vldr d16, [pc, #104]
0xf2c3261f, //vmov.i32 d18, #1056964608
- 0xf2c3361f, //vmov.i32 d19, #1056964608
- 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32]
- 0xf2c3061f, //vmov.i32 d16, #1056964608
- 0xf2412c31, //vfma.f32 d18, d1, d17
- 0xf2423c31, //vfma.f32 d19, d2, d17
- 0xf2c3461f, //vmov.i32 d20, #1056964608
+ 0xf2412c30, //vfma.f32 d18, d1, d16
0xe5913000, //ldr r3, [r1]
- 0xf2400c31, //vfma.f32 d16, d0, d17
- 0xf2434c31, //vfma.f32 d20, d3, d17
+ 0xf2c3361f, //vmov.i32 d19, #1056964608
+ 0xf2c3161f, //vmov.i32 d17, #1056964608
+ 0xf2423c30, //vfma.f32 d19, d2, d16
0xe5933000, //ldr r3, [r3]
+ 0xf2c3461f, //vmov.i32 d20, #1056964608
+ 0xf2401c30, //vfma.f32 d17, d0, d16
0xe0833100, //add r3, r3, r0, lsl #2
- 0xf3fb17a2, //vcvt.u32.f32 d17, d18
+ 0xf2434c30, //vfma.f32 d20, d3, d16
+ 0xf3fb07a2, //vcvt.u32.f32 d16, d18
0xf3fb27a3, //vcvt.u32.f32 d18, d19
- 0xf3fb07a0, //vcvt.u32.f32 d16, d16
+ 0xf3fb17a1, //vcvt.u32.f32 d17, d17
0xf3fb37a4, //vcvt.u32.f32 d19, d20
- 0xf2e81531, //vshl.s32 d17, d17, #8
+ 0xf2e80530, //vshl.s32 d16, d16, #8
0xf2f02532, //vshl.s32 d18, d18, #16
- 0xf26101b0, //vorr d16, d17, d16
+ 0xf26001b1, //vorr d16, d16, d17
0xf2f81533, //vshl.s32 d17, d19, #24
0xf26001b2, //vorr d16, d16, d18
0xf26001b1, //vorr d16, d16, d17
0xedc30b00, //vstr d16, [r3]
- 0xe5913004, //ldr r3, [r1, #4]
- 0xe2811008, //add r1, r1, #8
- 0xe12fff13, //bx r3
+ 0xe2813008, //add r3, r1, #8
+ 0xe591c004, //ldr ip, [r1, #4]
+ 0xe1a01003, //mov r1, r3
+ 0xe12fff1c, //bx ip
+ 0xe320f000, //nop {0}
+ 0x437f0000, //.word 0x437f0000
+ 0x437f0000, //.word 0x437f0000
};
CODE const uint32_t sk_load_f16_vfp4[] = {
@@ -1681,8 +1686,8 @@ CODE const uint32_t sk_repeat_x_vfp4[] = {
0xed2d8b04, //vpush {d8-d9}
0xe8911008, //ldm r1, {r3, ip}
0xf2c02010, //vmov.i32 d18, #0
- 0xf4e23c9f, //vld1.32 {d19[]}, [r2 :32]
0xe2811008, //add r1, r1, #8
+ 0xeddf3b10, //vldr d19, [pc, #64]
0xed938a00, //vldr s16, [r3]
0xeec09a88, //vdiv.f32 s19, s1, s16
0xee809a08, //vdiv.f32 s18, s0, s16
@@ -1699,14 +1704,17 @@ CODE const uint32_t sk_repeat_x_vfp4[] = {
0xf2200fa1, //vmin.f32 d0, d16, d17
0xecbd8b04, //vpop {d8-d9}
0xe12fff1c, //bx ip
+ 0xe320f000, //nop {0}
+ 0x3f800000, //.word 0x3f800000
+ 0x3f800000, //.word 0x3f800000
};
CODE const uint32_t sk_repeat_y_vfp4[] = {
0xed2d8b04, //vpush {d8-d9}
0xe8911008, //ldm r1, {r3, ip}
0xf2c02010, //vmov.i32 d18, #0
- 0xf4e23c9f, //vld1.32 {d19[]}, [r2 :32]
0xe2811008, //add r1, r1, #8
+ 0xeddf3b10, //vldr d19, [pc, #64]
0xed938a00, //vldr s16, [r3]
0xeec19a88, //vdiv.f32 s19, s3, s16
0xee819a08, //vdiv.f32 s18, s2, s16
@@ -1723,14 +1731,17 @@ CODE const uint32_t sk_repeat_y_vfp4[] = {
0xf2201fa1, //vmin.f32 d1, d16, d17
0xecbd8b04, //vpop {d8-d9}
0xe12fff1c, //bx ip
+ 0xe320f000, //nop {0}
+ 0x3f800000, //.word 0x3f800000
+ 0x3f800000, //.word 0x3f800000
};
CODE const uint32_t sk_mirror_x_vfp4[] = {
0xed2d8b04, //vpush {d8-d9}
0xe8911008, //ldm r1, {r3, ip}
0xf2c03010, //vmov.i32 d19, #0
- 0xf4e24c9f, //vld1.32 {d20[]}, [r2 :32]
0xe2811008, //add r1, r1, #8
+ 0xeddf4b14, //vldr d20, [pc, #80]
0xed938a00, //vldr s16, [r3]
0xee389a08, //vadd.f32 s18, s16, s16
0xf3f40c08, //vdup.32 d16, d8[0]
@@ -1751,14 +1762,17 @@ CODE const uint32_t sk_mirror_x_vfp4[] = {
0xf2210fa0, //vmin.f32 d0, d17, d16
0xecbd8b04, //vpop {d8-d9}
0xe12fff1c, //bx ip
+ 0xe320f000, //nop {0}
+ 0x3f800000, //.word 0x3f800000
+ 0x3f800000, //.word 0x3f800000
};
CODE const uint32_t sk_mirror_y_vfp4[] = {
0xed2d8b04, //vpush {d8-d9}
0xe8911008, //ldm r1, {r3, ip}
0xf2c03010, //vmov.i32 d19, #0
- 0xf4e24c9f, //vld1.32 {d20[]}, [r2 :32]
0xe2811008, //add r1, r1, #8
+ 0xeddf4b14, //vldr d20, [pc, #80]
0xed938a00, //vldr s16, [r3]
0xee389a08, //vadd.f32 s18, s16, s16
0xf3f40c08, //vdup.32 d16, d8[0]
@@ -1779,6 +1793,9 @@ CODE const uint32_t sk_mirror_y_vfp4[] = {
0xf2211fa0, //vmin.f32 d1, d17, d16
0xecbd8b04, //vpop {d8-d9}
0xe12fff1c, //bx ip
+ 0xe320f000, //nop {0}
+ 0x3f800000, //.word 0x3f800000
+ 0x3f800000, //.word 0x3f800000
};
CODE const uint32_t sk_luminance_to_alpha_vfp4[] = {
@@ -2064,13 +2081,17 @@ CODE const uint8_t sk_seed_shader_hsw[] = {
197,249,110,199, //vmovd %edi,%xmm0
196,226,125,24,192, //vbroadcastss %xmm0,%ymm0
197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
- 196,226,125,24,74,4, //vbroadcastss 0x4(%rdx),%ymm1
+ 65,184,0,0,0,63, //mov $0x3f000000,%r8d
+ 196,193,121,110,200, //vmovd %r8d,%xmm1
+ 196,226,125,24,201, //vbroadcastss %xmm1,%ymm1
197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0
197,252,88,66,20, //vaddps 0x14(%rdx),%ymm0,%ymm0
196,226,125,24,16, //vbroadcastss (%rax),%ymm2
197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1
- 196,226,125,24,18, //vbroadcastss (%rdx),%ymm2
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,249,110,208, //vmovd %eax,%xmm2
+ 196,226,125,24,210, //vbroadcastss %xmm2,%ymm2
72,173, //lods %ds:(%rsi),%rax
197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4
@@ -2109,7 +2130,9 @@ CODE const uint8_t sk_plus__hsw[] = {
};
CODE const uint8_t sk_srcover_hsw[] = {
- 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8
197,60,92,195, //vsubps %ymm3,%ymm8,%ymm8
196,194,93,184,192, //vfmadd231ps %ymm8,%ymm4,%ymm0
196,194,85,184,200, //vfmadd231ps %ymm8,%ymm5,%ymm1
@@ -2120,7 +2143,9 @@ CODE const uint8_t sk_srcover_hsw[] = {
};
CODE const uint8_t sk_dstover_hsw[] = {
- 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8
197,60,92,199, //vsubps %ymm7,%ymm8,%ymm8
196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0
196,226,61,168,205, //vfmadd213ps %ymm5,%ymm8,%ymm1
@@ -2141,7 +2166,9 @@ CODE const uint8_t sk_clamp_0_hsw[] = {
};
CODE const uint8_t sk_clamp_1_hsw[] = {
- 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8
196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0
196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1
196,193,108,93,208, //vminps %ymm8,%ymm2,%ymm2
@@ -2151,7 +2178,9 @@ CODE const uint8_t sk_clamp_1_hsw[] = {
};
CODE const uint8_t sk_clamp_a_hsw[] = {
- 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8
196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3
197,252,93,195, //vminps %ymm3,%ymm0,%ymm0
197,244,93,203, //vminps %ymm3,%ymm1,%ymm1
@@ -2223,7 +2252,9 @@ CODE const uint8_t sk_premul_hsw[] = {
CODE const uint8_t sk_unpremul_hsw[] = {
196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
196,65,100,194,200,0, //vcmpeqps %ymm8,%ymm3,%ymm9
- 196,98,125,24,18, //vbroadcastss (%rdx),%ymm10
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,208, //vmovd %eax,%xmm10
+ 196,66,125,24,210, //vbroadcastss %xmm10,%ymm10
197,44,94,211, //vdivps %ymm3,%ymm10,%ymm10
196,67,45,74,192,144, //vblendvps %ymm9,%ymm8,%ymm10,%ymm8
197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
@@ -2318,11 +2349,13 @@ CODE const uint8_t sk_scale_u8_hsw[] = {
72,139,0, //mov (%rax),%rax
72,1,248, //add %rdi,%rax
77,133,192, //test %r8,%r8
- 117,48, //jne 41a <_sk_scale_u8_hsw+0x40>
+ 117,56, //jne 462 <_sk_scale_u8_hsw+0x48>
197,123,16,0, //vmovsd (%rax),%xmm8
196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8
196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
- 196,98,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm9
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 197,121,110,200, //vmovd %eax,%xmm9
+ 196,66,125,24,201, //vbroadcastss %xmm9,%ymm9
196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8
197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
@@ -2340,9 +2373,9 @@ CODE const uint8_t sk_scale_u8_hsw[] = {
77,9,217, //or %r11,%r9
72,131,193,8, //add $0x8,%rcx
73,255,202, //dec %r10
- 117,234, //jne 422 <_sk_scale_u8_hsw+0x48>
+ 117,234, //jne 46a <_sk_scale_u8_hsw+0x50>
196,65,249,110,193, //vmovq %r9,%xmm8
- 235,175, //jmp 3ee <_sk_scale_u8_hsw+0x14>
+ 235,167, //jmp 42e <_sk_scale_u8_hsw+0x14>
};
CODE const uint8_t sk_lerp_1_float_hsw[] = {
@@ -2366,11 +2399,13 @@ CODE const uint8_t sk_lerp_u8_hsw[] = {
72,139,0, //mov (%rax),%rax
72,1,248, //add %rdi,%rax
77,133,192, //test %r8,%r8
- 117,68, //jne 4c2 <_sk_lerp_u8_hsw+0x54>
+ 117,76, //jne 512 <_sk_lerp_u8_hsw+0x5c>
197,123,16,0, //vmovsd (%rax),%xmm8
196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8
196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
- 196,98,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm9
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 197,121,110,200, //vmovd %eax,%xmm9
+ 196,66,125,24,201, //vbroadcastss %xmm9,%ymm9
196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8
197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0
196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0
@@ -2392,16 +2427,16 @@ CODE const uint8_t sk_lerp_u8_hsw[] = {
77,9,217, //or %r11,%r9
72,131,193,8, //add $0x8,%rcx
73,255,202, //dec %r10
- 117,234, //jne 4ca <_sk_lerp_u8_hsw+0x5c>
+ 117,234, //jne 51a <_sk_lerp_u8_hsw+0x64>
196,65,249,110,193, //vmovq %r9,%xmm8
- 235,155, //jmp 482 <_sk_lerp_u8_hsw+0x14>
+ 235,147, //jmp 4ca <_sk_lerp_u8_hsw+0x14>
};
CODE const uint8_t sk_lerp_565_hsw[] = {
72,173, //lods %ds:(%rsi),%rax
76,139,16, //mov (%rax),%r10
72,133,201, //test %rcx,%rcx
- 117,123, //jne 56c <_sk_lerp_565_hsw+0x85>
+ 15,133,132,0,0,0, //jne 5c9 <_sk_lerp_565_hsw+0x92>
196,193,122,111,28,122, //vmovdqu (%r10,%rdi,2),%xmm3
196,226,125,51,219, //vpmovzxwd %xmm3,%ymm3
196,98,125,88,66,104, //vpbroadcastd 0x68(%rdx),%ymm8
@@ -2425,7 +2460,9 @@ CODE const uint8_t sk_lerp_565_hsw[] = {
196,226,53,168,205, //vfmadd213ps %ymm5,%ymm9,%ymm1
197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2
196,226,101,168,214, //vfmadd213ps %ymm6,%ymm3,%ymm2
- 196,226,125,24,26, //vbroadcastss (%rdx),%ymm3
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,249,110,216, //vmovd %eax,%xmm3
+ 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3
72,173, //lods %ds:(%rsi),%rax
255,224, //jmpq *%rax
65,137,200, //mov %ecx,%r8d
@@ -2434,8 +2471,8 @@ CODE const uint8_t sk_lerp_565_hsw[] = {
65,254,200, //dec %r8b
69,15,182,192, //movzbl %r8b,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 15,135,111,255,255,255, //ja 4f7 <_sk_lerp_565_hsw+0x10>
- 76,141,13,73,0,0,0, //lea 0x49(%rip),%r9 # 5d8 <_sk_lerp_565_hsw+0xf1>
+ 15,135,102,255,255,255, //ja 54b <_sk_lerp_565_hsw+0x14>
+ 76,141,13,76,0,0,0, //lea 0x4c(%rip),%r9 # 638 <_sk_lerp_565_hsw+0x101>
75,99,4,129, //movslq (%r9,%r8,4),%rax
76,1,200, //add %r9,%rax
255,224, //jmpq *%rax
@@ -2447,27 +2484,26 @@ CODE const uint8_t sk_lerp_565_hsw[] = {
196,193,97,196,92,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm3,%xmm3
196,193,97,196,92,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm3,%xmm3
196,193,97,196,28,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm3,%xmm3
- 233,31,255,255,255, //jmpq 4f7 <_sk_lerp_565_hsw+0x10>
- 244, //hlt
- 255, //(bad)
+ 233,22,255,255,255, //jmpq 54b <_sk_lerp_565_hsw+0x14>
+ 15,31,0, //nopl (%rax)
+ 241, //icebp
255, //(bad)
255, //(bad)
- 236, //in (%dx),%al
255, //(bad)
+ 233,255,255,255,225, //jmpq ffffffffe2000640 <_sk_linear_gradient_2stops_hsw+0xffffffffe1fff55e>
255, //(bad)
- 255,228, //jmpq *%rsp
255, //(bad)
255, //(bad)
+ 217,255, //fcos
255, //(bad)
- 220,255, //fdivr %st,%st(7)
+ 255,209, //callq *%rcx
255, //(bad)
- 255,212, //callq *%rsp
255, //(bad)
+ 255,201, //dec %ecx
255, //(bad)
- 255,204, //dec %esp
255, //(bad)
255, //(bad)
- 255,192, //inc %eax
+ 189, //.byte 0xbd
255, //(bad)
255, //(bad)
255, //.byte 0xff
@@ -2479,7 +2515,7 @@ CODE const uint8_t sk_load_tables_hsw[] = {
76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9
76,3,8, //add (%rax),%r9
77,133,192, //test %r8,%r8
- 117,106, //jne 673 <_sk_load_tables_hsw+0x7f>
+ 117,106, //jne 6d3 <_sk_load_tables_hsw+0x7f>
196,193,126,111,25, //vmovdqu (%r9),%ymm3
196,226,125,88,82,16, //vpbroadcastd 0x10(%rdx),%ymm2
197,237,219,203, //vpand %ymm3,%ymm2,%ymm1
@@ -2511,7 +2547,7 @@ CODE const uint8_t sk_load_tables_hsw[] = {
196,193,249,110,194, //vmovq %r10,%xmm0
196,226,125,33,192, //vpmovsxbd %xmm0,%ymm0
196,194,125,140,25, //vpmaskmovd (%r9),%ymm0,%ymm3
- 233,114,255,255,255, //jmpq 60e <_sk_load_tables_hsw+0x1a>
+ 233,114,255,255,255, //jmpq 66e <_sk_load_tables_hsw+0x1a>
};
CODE const uint8_t sk_load_a8_hsw[] = {
@@ -2520,7 +2556,7 @@ CODE const uint8_t sk_load_a8_hsw[] = {
72,139,0, //mov (%rax),%rax
72,1,248, //add %rdi,%rax
77,133,192, //test %r8,%r8
- 117,42, //jne 6d6 <_sk_load_a8_hsw+0x3a>
+ 117,42, //jne 736 <_sk_load_a8_hsw+0x3a>
197,251,16,0, //vmovsd (%rax),%xmm0
196,226,125,49,192, //vpmovzxbd %xmm0,%ymm0
197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
@@ -2541,9 +2577,9 @@ CODE const uint8_t sk_load_a8_hsw[] = {
77,9,217, //or %r11,%r9
72,131,193,8, //add $0x8,%rcx
73,255,202, //dec %r10
- 117,234, //jne 6de <_sk_load_a8_hsw+0x42>
+ 117,234, //jne 73e <_sk_load_a8_hsw+0x42>
196,193,249,110,193, //vmovq %r9,%xmm0
- 235,181, //jmp 6b0 <_sk_load_a8_hsw+0x14>
+ 235,181, //jmp 710 <_sk_load_a8_hsw+0x14>
};
CODE const uint8_t sk_store_a8_hsw[] = {
@@ -2556,7 +2592,7 @@ CODE const uint8_t sk_store_a8_hsw[] = {
196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8
196,65,57,103,192, //vpackuswb %xmm8,%xmm8,%xmm8
72,133,201, //test %rcx,%rcx
- 117,10, //jne 72e <_sk_store_a8_hsw+0x33>
+ 117,10, //jne 78e <_sk_store_a8_hsw+0x33>
196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1)
72,173, //lods %ds:(%rsi),%rax
255,224, //jmpq *%rax
@@ -2565,9 +2601,9 @@ CODE const uint8_t sk_store_a8_hsw[] = {
254,200, //dec %al
68,15,182,192, //movzbl %al,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 119,236, //ja 72a <_sk_store_a8_hsw+0x2f>
+ 119,236, //ja 78a <_sk_store_a8_hsw+0x2f>
196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8
- 76,141,21,66,0,0,0, //lea 0x42(%rip),%r10 # 78c <_sk_store_a8_hsw+0x91>
+ 76,141,21,66,0,0,0, //lea 0x42(%rip),%r10 # 7ec <_sk_store_a8_hsw+0x91>
75,99,4,130, //movslq (%r10,%r8,4),%rax
76,1,208, //add %r10,%rax
255,224, //jmpq *%rax
@@ -2578,7 +2614,7 @@ CODE const uint8_t sk_store_a8_hsw[] = {
196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1)
196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1)
196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1)
- 235,158, //jmp 72a <_sk_store_a8_hsw+0x2f>
+ 235,158, //jmp 78a <_sk_store_a8_hsw+0x2f>
247,255, //idiv %edi
255, //(bad)
255, //(bad)
@@ -2607,7 +2643,7 @@ CODE const uint8_t sk_load_565_hsw[] = {
72,173, //lods %ds:(%rsi),%rax
76,139,16, //mov (%rax),%r10
72,133,201, //test %rcx,%rcx
- 117,92, //jne 80e <_sk_load_565_hsw+0x66>
+ 117,92, //jne 86e <_sk_load_565_hsw+0x66>
196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0
196,226,125,51,208, //vpmovzxwd %xmm0,%ymm2
196,226,125,88,66,104, //vpbroadcastd 0x68(%rdx),%ymm0
@@ -2634,8 +2670,8 @@ CODE const uint8_t sk_load_565_hsw[] = {
65,254,200, //dec %r8b
69,15,182,192, //movzbl %r8b,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 119,146, //ja 7b8 <_sk_load_565_hsw+0x10>
- 76,141,13,75,0,0,0, //lea 0x4b(%rip),%r9 # 878 <_sk_load_565_hsw+0xd0>
+ 119,146, //ja 818 <_sk_load_565_hsw+0x10>
+ 76,141,13,75,0,0,0, //lea 0x4b(%rip),%r9 # 8d8 <_sk_load_565_hsw+0xd0>
75,99,4,129, //movslq (%r9,%r8,4),%rax
76,1,200, //add %r9,%rax
255,224, //jmpq *%rax
@@ -2647,7 +2683,7 @@ CODE const uint8_t sk_load_565_hsw[] = {
196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0
- 233,66,255,255,255, //jmpq 7b8 <_sk_load_565_hsw+0x10>
+ 233,66,255,255,255, //jmpq 818 <_sk_load_565_hsw+0x10>
102,144, //xchg %ax,%ax
242,255, //repnz (bad)
255, //(bad)
@@ -2692,7 +2728,7 @@ CODE const uint8_t sk_store_565_hsw[] = {
196,67,125,57,193,1, //vextracti128 $0x1,%ymm8,%xmm9
196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8
72,133,201, //test %rcx,%rcx
- 117,10, //jne 8f6 <_sk_store_565_hsw+0x62>
+ 117,10, //jne 956 <_sk_store_565_hsw+0x62>
196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2)
72,173, //lods %ds:(%rsi),%rax
255,224, //jmpq *%rax
@@ -2701,8 +2737,8 @@ CODE const uint8_t sk_store_565_hsw[] = {
254,200, //dec %al
68,15,182,192, //movzbl %al,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 119,236, //ja 8f2 <_sk_store_565_hsw+0x5e>
- 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # 954 <_sk_store_565_hsw+0xc0>
+ 119,236, //ja 952 <_sk_store_565_hsw+0x5e>
+ 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # 9b4 <_sk_store_565_hsw+0xc0>
75,99,4,130, //movslq (%r10,%r8,4),%rax
76,1,208, //add %r10,%rax
255,224, //jmpq *%rax
@@ -2714,7 +2750,7 @@ CODE const uint8_t sk_store_565_hsw[] = {
196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2)
197,121,126,192, //vmovd %xmm8,%eax
102,65,137,4,121, //mov %ax,(%r9,%rdi,2)
- 235,161, //jmp 8f2 <_sk_store_565_hsw+0x5e>
+ 235,161, //jmp 952 <_sk_store_565_hsw+0x5e>
15,31,0, //nopl (%rax)
242,255, //repnz (bad)
255, //(bad)
@@ -2746,21 +2782,25 @@ CODE const uint8_t sk_load_8888_hsw[] = {
76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9
76,3,8, //add (%rax),%r9
77,133,192, //test %r8,%r8
- 117,85, //jne 9da <_sk_load_8888_hsw+0x6a>
+ 117,104, //jne a4d <_sk_load_8888_hsw+0x7d>
196,193,126,111,25, //vmovdqu (%r9),%ymm3
- 196,226,125,88,82,16, //vpbroadcastd 0x10(%rdx),%ymm2
+ 184,255,0,0,0, //mov $0xff,%eax
+ 197,249,110,192, //vmovd %eax,%xmm0
+ 196,226,125,88,208, //vpbroadcastd %xmm0,%ymm2
197,237,219,195, //vpand %ymm3,%ymm2,%ymm0
197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
- 196,98,125,24,66,12, //vbroadcastss 0xc(%rdx),%ymm8
- 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 197,249,110,200, //vmovd %eax,%xmm1
+ 196,98,125,24,193, //vbroadcastss %xmm1,%ymm8
+ 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0
197,245,114,211,8, //vpsrld $0x8,%ymm3,%ymm1
197,237,219,201, //vpand %ymm1,%ymm2,%ymm1
197,252,91,201, //vcvtdq2ps %ymm1,%ymm1
- 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
+ 196,193,116,89,200, //vmulps %ymm8,%ymm1,%ymm1
197,181,114,211,16, //vpsrld $0x10,%ymm3,%ymm9
196,193,109,219,209, //vpand %ymm9,%ymm2,%ymm2
197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
- 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
+ 196,193,108,89,208, //vmulps %ymm8,%ymm2,%ymm2
197,229,114,211,24, //vpsrld $0x18,%ymm3,%ymm3
197,252,91,219, //vcvtdq2ps %ymm3,%ymm3
196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3
@@ -2775,7 +2815,7 @@ CODE const uint8_t sk_load_8888_hsw[] = {
196,225,249,110,192, //vmovq %rax,%xmm0
196,226,125,33,192, //vpmovsxbd %xmm0,%ymm0
196,194,125,140,25, //vpmaskmovd (%r9),%ymm0,%ymm3
- 235,138, //jmp 98a <_sk_load_8888_hsw+0x1a>
+ 233,116,255,255,255, //jmpq 9ea <_sk_load_8888_hsw+0x1a>
};
CODE const uint8_t sk_store_8888_hsw[] = {
@@ -2783,7 +2823,9 @@ CODE const uint8_t sk_store_8888_hsw[] = {
72,173, //lods %ds:(%rsi),%rax
76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9
76,3,8, //add (%rax),%r9
- 196,98,125,24,66,8, //vbroadcastss 0x8(%rdx),%ymm8
+ 184,0,0,127,67, //mov $0x437f0000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8
197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9
196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9
197,60,89,209, //vmulps %ymm1,%ymm8,%ymm10
@@ -2799,7 +2841,7 @@ CODE const uint8_t sk_store_8888_hsw[] = {
196,65,45,235,192, //vpor %ymm8,%ymm10,%ymm8
196,65,53,235,192, //vpor %ymm8,%ymm9,%ymm8
77,133,192, //test %r8,%r8
- 117,12, //jne a6c <_sk_store_8888_hsw+0x6c>
+ 117,12, //jne aea <_sk_store_8888_hsw+0x74>
196,65,126,127,1, //vmovdqu %ymm8,(%r9)
72,173, //lods %ds:(%rsi),%rax
76,137,193, //mov %r8,%rcx
@@ -2812,14 +2854,14 @@ CODE const uint8_t sk_store_8888_hsw[] = {
196,97,249,110,200, //vmovq %rax,%xmm9
196,66,125,33,201, //vpmovsxbd %xmm9,%ymm9
196,66,53,142,1, //vpmaskmovd %ymm8,%ymm9,(%r9)
- 235,211, //jmp a65 <_sk_store_8888_hsw+0x65>
+ 235,211, //jmp ae3 <_sk_store_8888_hsw+0x6d>
};
CODE const uint8_t sk_load_f16_hsw[] = {
72,173, //lods %ds:(%rsi),%rax
72,139,0, //mov (%rax),%rax
72,133,201, //test %rcx,%rcx
- 117,97, //jne afd <_sk_load_f16_hsw+0x6b>
+ 117,97, //jne b7b <_sk_load_f16_hsw+0x6b>
197,249,16,12,248, //vmovupd (%rax,%rdi,8),%xmm1
197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2
197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3
@@ -2845,35 +2887,35 @@ CODE const uint8_t sk_load_f16_hsw[] = {
197,251,16,12,248, //vmovsd (%rax,%rdi,8),%xmm1
196,65,57,87,192, //vxorpd %xmm8,%xmm8,%xmm8
72,131,249,1, //cmp $0x1,%rcx
- 117,6, //jne b13 <_sk_load_f16_hsw+0x81>
+ 117,6, //jne b91 <_sk_load_f16_hsw+0x81>
197,250,126,201, //vmovq %xmm1,%xmm1
- 235,30, //jmp b31 <_sk_load_f16_hsw+0x9f>
+ 235,30, //jmp baf <_sk_load_f16_hsw+0x9f>
197,241,22,76,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm1,%xmm1
72,131,249,3, //cmp $0x3,%rcx
- 114,18, //jb b31 <_sk_load_f16_hsw+0x9f>
+ 114,18, //jb baf <_sk_load_f16_hsw+0x9f>
197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2
72,131,249,3, //cmp $0x3,%rcx
- 117,19, //jne b3e <_sk_load_f16_hsw+0xac>
+ 117,19, //jne bbc <_sk_load_f16_hsw+0xac>
197,250,126,210, //vmovq %xmm2,%xmm2
- 235,46, //jmp b5f <_sk_load_f16_hsw+0xcd>
+ 235,46, //jmp bdd <_sk_load_f16_hsw+0xcd>
197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3
197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2
- 233,117,255,255,255, //jmpq ab3 <_sk_load_f16_hsw+0x21>
+ 233,117,255,255,255, //jmpq b31 <_sk_load_f16_hsw+0x21>
197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2
72,131,249,5, //cmp $0x5,%rcx
- 114,21, //jb b5f <_sk_load_f16_hsw+0xcd>
+ 114,21, //jb bdd <_sk_load_f16_hsw+0xcd>
197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3
72,131,249,5, //cmp $0x5,%rcx
- 117,18, //jne b68 <_sk_load_f16_hsw+0xd6>
+ 117,18, //jne be6 <_sk_load_f16_hsw+0xd6>
197,250,126,219, //vmovq %xmm3,%xmm3
- 233,84,255,255,255, //jmpq ab3 <_sk_load_f16_hsw+0x21>
+ 233,84,255,255,255, //jmpq b31 <_sk_load_f16_hsw+0x21>
197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3
- 233,75,255,255,255, //jmpq ab3 <_sk_load_f16_hsw+0x21>
+ 233,75,255,255,255, //jmpq b31 <_sk_load_f16_hsw+0x21>
197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3
72,131,249,7, //cmp $0x7,%rcx
- 15,130,59,255,255,255, //jb ab3 <_sk_load_f16_hsw+0x21>
+ 15,130,59,255,255,255, //jb b31 <_sk_load_f16_hsw+0x21>
197,123,16,68,248,48, //vmovsd 0x30(%rax,%rdi,8),%xmm8
- 233,48,255,255,255, //jmpq ab3 <_sk_load_f16_hsw+0x21>
+ 233,48,255,255,255, //jmpq b31 <_sk_load_f16_hsw+0x21>
};
CODE const uint8_t sk_store_f16_hsw[] = {
@@ -2892,7 +2934,7 @@ CODE const uint8_t sk_store_f16_hsw[] = {
196,65,57,98,205, //vpunpckldq %xmm13,%xmm8,%xmm9
196,65,57,106,197, //vpunpckhdq %xmm13,%xmm8,%xmm8
72,133,201, //test %rcx,%rcx
- 117,27, //jne be8 <_sk_store_f16_hsw+0x65>
+ 117,27, //jne c66 <_sk_store_f16_hsw+0x65>
197,120,17,28,248, //vmovups %xmm11,(%rax,%rdi,8)
197,120,17,84,248,16, //vmovups %xmm10,0x10(%rax,%rdi,8)
197,120,17,76,248,32, //vmovups %xmm9,0x20(%rax,%rdi,8)
@@ -2901,22 +2943,22 @@ CODE const uint8_t sk_store_f16_hsw[] = {
255,224, //jmpq *%rax
197,121,214,28,248, //vmovq %xmm11,(%rax,%rdi,8)
72,131,249,1, //cmp $0x1,%rcx
- 116,241, //je be4 <_sk_store_f16_hsw+0x61>
+ 116,241, //je c62 <_sk_store_f16_hsw+0x61>
197,121,23,92,248,8, //vmovhpd %xmm11,0x8(%rax,%rdi,8)
72,131,249,3, //cmp $0x3,%rcx
- 114,229, //jb be4 <_sk_store_f16_hsw+0x61>
+ 114,229, //jb c62 <_sk_store_f16_hsw+0x61>
197,121,214,84,248,16, //vmovq %xmm10,0x10(%rax,%rdi,8)
- 116,221, //je be4 <_sk_store_f16_hsw+0x61>
+ 116,221, //je c62 <_sk_store_f16_hsw+0x61>
197,121,23,84,248,24, //vmovhpd %xmm10,0x18(%rax,%rdi,8)
72,131,249,5, //cmp $0x5,%rcx
- 114,209, //jb be4 <_sk_store_f16_hsw+0x61>
+ 114,209, //jb c62 <_sk_store_f16_hsw+0x61>
197,121,214,76,248,32, //vmovq %xmm9,0x20(%rax,%rdi,8)
- 116,201, //je be4 <_sk_store_f16_hsw+0x61>
+ 116,201, //je c62 <_sk_store_f16_hsw+0x61>
197,121,23,76,248,40, //vmovhpd %xmm9,0x28(%rax,%rdi,8)
72,131,249,7, //cmp $0x7,%rcx
- 114,189, //jb be4 <_sk_store_f16_hsw+0x61>
+ 114,189, //jb c62 <_sk_store_f16_hsw+0x61>
197,121,214,68,248,48, //vmovq %xmm8,0x30(%rax,%rdi,8)
- 235,181, //jmp be4 <_sk_store_f16_hsw+0x61>
+ 235,181, //jmp c62 <_sk_store_f16_hsw+0x61>
};
CODE const uint8_t sk_store_f32_hsw[] = {
@@ -2932,7 +2974,7 @@ CODE const uint8_t sk_store_f32_hsw[] = {
196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8
196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11
72,133,201, //test %rcx,%rcx
- 117,55, //jne c9c <_sk_store_f32_hsw+0x6d>
+ 117,55, //jne d1a <_sk_store_f32_hsw+0x6d>
196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12
196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13
196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9
@@ -2945,22 +2987,22 @@ CODE const uint8_t sk_store_f32_hsw[] = {
255,224, //jmpq *%rax
196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4)
72,131,249,1, //cmp $0x1,%rcx
- 116,240, //je c98 <_sk_store_f32_hsw+0x69>
+ 116,240, //je d16 <_sk_store_f32_hsw+0x69>
196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4)
72,131,249,3, //cmp $0x3,%rcx
- 114,227, //jb c98 <_sk_store_f32_hsw+0x69>
+ 114,227, //jb d16 <_sk_store_f32_hsw+0x69>
196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4)
- 116,218, //je c98 <_sk_store_f32_hsw+0x69>
+ 116,218, //je d16 <_sk_store_f32_hsw+0x69>
196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4)
72,131,249,5, //cmp $0x5,%rcx
- 114,205, //jb c98 <_sk_store_f32_hsw+0x69>
+ 114,205, //jb d16 <_sk_store_f32_hsw+0x69>
196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4)
- 116,195, //je c98 <_sk_store_f32_hsw+0x69>
+ 116,195, //je d16 <_sk_store_f32_hsw+0x69>
196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4)
72,131,249,7, //cmp $0x7,%rcx
- 114,181, //jb c98 <_sk_store_f32_hsw+0x69>
+ 114,181, //jb d16 <_sk_store_f32_hsw+0x69>
196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4)
- 235,171, //jmp c98 <_sk_store_f32_hsw+0x69>
+ 235,171, //jmp d16 <_sk_store_f32_hsw+0x69>
};
CODE const uint8_t sk_clamp_x_hsw[] = {
@@ -3275,13 +3317,19 @@ CODE const uint8_t sk_seed_shader_avx[] = {
197,249,112,192,0, //vpshufd $0x0,%xmm0,%xmm0
196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
- 196,226,125,24,74,4, //vbroadcastss 0x4(%rdx),%ymm1
+ 65,184,0,0,0,63, //mov $0x3f000000,%r8d
+ 196,193,121,110,200, //vmovd %r8d,%xmm1
+ 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1
+ 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1
197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0
197,252,88,66,20, //vaddps 0x14(%rdx),%ymm0,%ymm0
196,226,125,24,16, //vbroadcastss (%rax),%ymm2
197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1
- 196,226,125,24,18, //vbroadcastss (%rdx),%ymm2
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,249,110,208, //vmovd %eax,%xmm2
+ 196,227,121,4,210,0, //vpermilps $0x0,%xmm2,%xmm2
+ 196,227,109,24,210,1, //vinsertf128 $0x1,%xmm2,%ymm2,%ymm2
72,173, //lods %ds:(%rsi),%rax
197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4
@@ -3320,7 +3368,10 @@ CODE const uint8_t sk_plus__avx[] = {
};
CODE const uint8_t sk_srcover_avx[] = {
- 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8
+ 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
197,60,92,195, //vsubps %ymm3,%ymm8,%ymm8
197,60,89,204, //vmulps %ymm4,%ymm8,%ymm9
197,180,88,192, //vaddps %ymm0,%ymm9,%ymm0
@@ -3335,7 +3386,10 @@ CODE const uint8_t sk_srcover_avx[] = {
};
CODE const uint8_t sk_dstover_avx[] = {
- 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8
+ 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
197,60,92,199, //vsubps %ymm7,%ymm8,%ymm8
197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0
@@ -3360,7 +3414,10 @@ CODE const uint8_t sk_clamp_0_avx[] = {
};
CODE const uint8_t sk_clamp_1_avx[] = {
- 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8
+ 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0
196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1
196,193,108,93,208, //vminps %ymm8,%ymm2,%ymm2
@@ -3370,7 +3427,10 @@ CODE const uint8_t sk_clamp_1_avx[] = {
};
CODE const uint8_t sk_clamp_a_avx[] = {
- 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8
+ 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3
197,252,93,195, //vminps %ymm3,%ymm0,%ymm0
197,244,93,203, //vminps %ymm3,%ymm1,%ymm1
@@ -3442,7 +3502,10 @@ CODE const uint8_t sk_premul_avx[] = {
CODE const uint8_t sk_unpremul_avx[] = {
196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
196,65,100,194,200,0, //vcmpeqps %ymm8,%ymm3,%ymm9
- 196,98,125,24,18, //vbroadcastss (%rdx),%ymm10
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,208, //vmovd %eax,%xmm10
+ 196,67,121,4,210,0, //vpermilps $0x0,%xmm10,%xmm10
+ 196,67,45,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
197,44,94,211, //vdivps %ymm3,%ymm10,%ymm10
196,67,45,74,192,144, //vblendvps %ymm9,%ymm8,%ymm10,%ymm8
197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
@@ -3547,14 +3610,17 @@ CODE const uint8_t sk_scale_u8_avx[] = {
72,139,0, //mov (%rax),%rax
72,1,248, //add %rdi,%rax
77,133,192, //test %r8,%r8
- 117,65, //jne 478 <_sk_scale_u8_avx+0x51>
+ 117,80, //jne 4f8 <_sk_scale_u8_avx+0x60>
197,123,16,0, //vmovsd (%rax),%xmm8
196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9
196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8
196,66,121,49,192, //vpmovzxbd %xmm8,%xmm8
196,67,53,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm9,%ymm8
196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
- 196,98,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm9
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 197,121,110,200, //vmovd %eax,%xmm9
+ 196,67,121,4,201,0, //vpermilps $0x0,%xmm9,%xmm9
+ 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9
196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8
197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
@@ -3572,9 +3638,9 @@ CODE const uint8_t sk_scale_u8_avx[] = {
77,9,217, //or %r11,%r9
72,131,193,8, //add $0x8,%rcx
73,255,202, //dec %r10
- 117,234, //jne 480 <_sk_scale_u8_avx+0x59>
+ 117,234, //jne 500 <_sk_scale_u8_avx+0x68>
196,65,249,110,193, //vmovq %r9,%xmm8
- 235,158, //jmp 43b <_sk_scale_u8_avx+0x14>
+ 235,143, //jmp 4ac <_sk_scale_u8_avx+0x14>
};
CODE const uint8_t sk_lerp_1_float_avx[] = {
@@ -3602,14 +3668,17 @@ CODE const uint8_t sk_lerp_u8_avx[] = {
72,139,0, //mov (%rax),%rax
72,1,248, //add %rdi,%rax
77,133,192, //test %r8,%r8
- 117,101, //jne 551 <_sk_lerp_u8_avx+0x75>
+ 117,116, //jne 5e0 <_sk_lerp_u8_avx+0x84>
197,123,16,0, //vmovsd (%rax),%xmm8
196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9
196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8
196,66,121,49,192, //vpmovzxbd %xmm8,%xmm8
196,67,53,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm9,%ymm8
196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
- 196,98,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm9
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 197,121,110,200, //vmovd %eax,%xmm9
+ 196,67,121,4,201,0, //vpermilps $0x0,%xmm9,%xmm9
+ 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9
196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8
197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0
196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0
@@ -3635,16 +3704,16 @@ CODE const uint8_t sk_lerp_u8_avx[] = {
77,9,217, //or %r11,%r9
72,131,193,8, //add $0x8,%rcx
73,255,202, //dec %r10
- 117,234, //jne 559 <_sk_lerp_u8_avx+0x7d>
+ 117,234, //jne 5e8 <_sk_lerp_u8_avx+0x8c>
196,65,249,110,193, //vmovq %r9,%xmm8
- 233,119,255,255,255, //jmpq 4f0 <_sk_lerp_u8_avx+0x14>
+ 233,104,255,255,255, //jmpq 570 <_sk_lerp_u8_avx+0x14>
};
CODE const uint8_t sk_lerp_565_avx[] = {
72,173, //lods %ds:(%rsi),%rax
76,139,16, //mov (%rax),%r10
72,133,201, //test %rcx,%rcx
- 15,133,148,0,0,0, //jne 61b <_sk_lerp_565_avx+0xa2>
+ 15,133,164,0,0,0, //jne 6ba <_sk_lerp_565_avx+0xb2>
196,65,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm8
197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3
197,185,105,219, //vpunpckhwd %xmm3,%xmm8,%xmm3
@@ -3674,7 +3743,10 @@ CODE const uint8_t sk_lerp_565_avx[] = {
197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2
197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2
197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2
- 196,226,125,24,26, //vbroadcastss (%rdx),%ymm3
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,249,110,216, //vmovd %eax,%xmm3
+ 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3
+ 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3
72,173, //lods %ds:(%rsi),%rax
255,224, //jmpq *%rax
65,137,200, //mov %ecx,%r8d
@@ -3683,8 +3755,8 @@ CODE const uint8_t sk_lerp_565_avx[] = {
65,254,200, //dec %r8b
69,15,182,192, //movzbl %r8b,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 15,135,85,255,255,255, //ja 58d <_sk_lerp_565_avx+0x14>
- 76,141,13,73,0,0,0, //lea 0x49(%rip),%r9 # 688 <_sk_lerp_565_avx+0x10f>
+ 15,135,69,255,255,255, //ja 61c <_sk_lerp_565_avx+0x14>
+ 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # 728 <_sk_lerp_565_avx+0x120>
75,99,4,129, //movslq (%r9,%r8,4),%rax
76,1,200, //add %r9,%rax
255,224, //jmpq *%rax
@@ -3696,27 +3768,27 @@ CODE const uint8_t sk_lerp_565_avx[] = {
196,65,57,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm8,%xmm8
196,65,57,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm8,%xmm8
196,65,57,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm8,%xmm8
- 233,5,255,255,255, //jmpq 58d <_sk_lerp_565_avx+0x14>
- 244, //hlt
- 255, //(bad)
+ 233,245,254,255,255, //jmpq 61c <_sk_lerp_565_avx+0x14>
+ 144, //nop
+ 243,255, //repz (bad)
255, //(bad)
255, //(bad)
- 236, //in (%dx),%al
+ 235,255, //jmp 72d <_sk_lerp_565_avx+0x125>
255, //(bad)
+ 255,227, //jmpq *%rbx
255, //(bad)
- 255,228, //jmpq *%rsp
255, //(bad)
255, //(bad)
+ 219,255, //(bad)
255, //(bad)
- 220,255, //fdivr %st,%st(7)
+ 255,211, //callq *%rbx
255, //(bad)
- 255,212, //callq *%rsp
255, //(bad)
+ 255,203, //dec %ebx
255, //(bad)
- 255,204, //dec %esp
255, //(bad)
255, //(bad)
- 255,192, //inc %eax
+ 191, //.byte 0xbf
255, //(bad)
255, //(bad)
255, //.byte 0xff
@@ -3732,7 +3804,7 @@ CODE const uint8_t sk_load_tables_avx[] = {
72,173, //lods %ds:(%rsi),%rax
76,139,0, //mov (%rax),%r8
72,133,201, //test %rcx,%rcx
- 15,133,18,2,0,0, //jne 8ce <_sk_load_tables_avx+0x22a>
+ 15,133,18,2,0,0, //jne 96e <_sk_load_tables_avx+0x22a>
196,65,124,16,4,184, //vmovups (%r8,%rdi,4),%ymm8
196,98,125,24,74,16, //vbroadcastss 0x10(%rdx),%ymm9
196,193,52,84,192, //vandps %ymm8,%ymm9,%ymm0
@@ -3844,8 +3916,8 @@ CODE const uint8_t sk_load_tables_avx[] = {
65,254,201, //dec %r9b
69,15,182,201, //movzbl %r9b,%r9d
65,128,249,6, //cmp $0x6,%r9b
- 15,135,215,253,255,255, //ja 6c2 <_sk_load_tables_avx+0x1e>
- 76,141,21,138,0,0,0, //lea 0x8a(%rip),%r10 # 97c <_sk_load_tables_avx+0x2d8>
+ 15,135,215,253,255,255, //ja 762 <_sk_load_tables_avx+0x1e>
+ 76,141,21,138,0,0,0, //lea 0x8a(%rip),%r10 # a1c <_sk_load_tables_avx+0x2d8>
79,99,12,138, //movslq (%r10,%r9,4),%r9
77,1,209, //add %r10,%r9
65,255,225, //jmpq *%r9
@@ -3868,7 +3940,7 @@ CODE const uint8_t sk_load_tables_avx[] = {
196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8
196,195,57,34,4,184,0, //vpinsrd $0x0,(%r8,%rdi,4),%xmm8,%xmm0
196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8
- 233,70,253,255,255, //jmpq 6c2 <_sk_load_tables_avx+0x1e>
+ 233,70,253,255,255, //jmpq 762 <_sk_load_tables_avx+0x1e>
238, //out %al,(%dx)
255, //(bad)
255, //(bad)
@@ -3895,7 +3967,7 @@ CODE const uint8_t sk_load_a8_avx[] = {
72,139,0, //mov (%rax),%rax
72,1,248, //add %rdi,%rax
77,133,192, //test %r8,%r8
- 117,59, //jne 9e3 <_sk_load_a8_avx+0x4b>
+ 117,59, //jne a83 <_sk_load_a8_avx+0x4b>
197,251,16,0, //vmovsd (%rax),%xmm0
196,226,121,49,200, //vpmovzxbd %xmm0,%xmm1
196,227,121,4,192,229, //vpermilps $0xe5,%xmm0,%xmm0
@@ -3919,9 +3991,9 @@ CODE const uint8_t sk_load_a8_avx[] = {
77,9,217, //or %r11,%r9
72,131,193,8, //add $0x8,%rcx
73,255,202, //dec %r10
- 117,234, //jne 9eb <_sk_load_a8_avx+0x53>
+ 117,234, //jne a8b <_sk_load_a8_avx+0x53>
196,193,249,110,193, //vmovq %r9,%xmm0
- 235,164, //jmp 9ac <_sk_load_a8_avx+0x14>
+ 235,164, //jmp a4c <_sk_load_a8_avx+0x14>
};
CODE const uint8_t sk_store_a8_avx[] = {
@@ -3934,7 +4006,7 @@ CODE const uint8_t sk_store_a8_avx[] = {
196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8
196,65,57,103,192, //vpackuswb %xmm8,%xmm8,%xmm8
72,133,201, //test %rcx,%rcx
- 117,10, //jne a3b <_sk_store_a8_avx+0x33>
+ 117,10, //jne adb <_sk_store_a8_avx+0x33>
196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1)
72,173, //lods %ds:(%rsi),%rax
255,224, //jmpq *%rax
@@ -3943,9 +4015,9 @@ CODE const uint8_t sk_store_a8_avx[] = {
254,200, //dec %al
68,15,182,192, //movzbl %al,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 119,236, //ja a37 <_sk_store_a8_avx+0x2f>
+ 119,236, //ja ad7 <_sk_store_a8_avx+0x2f>
196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8
- 76,141,21,69,0,0,0, //lea 0x45(%rip),%r10 # a9c <_sk_store_a8_avx+0x94>
+ 76,141,21,69,0,0,0, //lea 0x45(%rip),%r10 # b3c <_sk_store_a8_avx+0x94>
75,99,4,130, //movslq (%r10,%r8,4),%rax
76,1,208, //add %r10,%rax
255,224, //jmpq *%rax
@@ -3956,7 +4028,7 @@ CODE const uint8_t sk_store_a8_avx[] = {
196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1)
196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1)
196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1)
- 235,158, //jmp a37 <_sk_store_a8_avx+0x2f>
+ 235,158, //jmp ad7 <_sk_store_a8_avx+0x2f>
15,31,0, //nopl (%rax)
244, //hlt
255, //(bad)
@@ -3987,7 +4059,7 @@ CODE const uint8_t sk_load_565_avx[] = {
72,173, //lods %ds:(%rsi),%rax
76,139,16, //mov (%rax),%r10
72,133,201, //test %rcx,%rcx
- 117,106, //jne b2c <_sk_load_565_avx+0x74>
+ 117,106, //jne bcc <_sk_load_565_avx+0x74>
196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0
197,241,239,201, //vpxor %xmm1,%xmm1,%xmm1
197,249,105,201, //vpunpckhwd %xmm1,%xmm0,%xmm1
@@ -4017,8 +4089,8 @@ CODE const uint8_t sk_load_565_avx[] = {
65,254,200, //dec %r8b
69,15,182,192, //movzbl %r8b,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 119,132, //ja ac8 <_sk_load_565_avx+0x10>
- 76,141,13,73,0,0,0, //lea 0x49(%rip),%r9 # b94 <_sk_load_565_avx+0xdc>
+ 119,132, //ja b68 <_sk_load_565_avx+0x10>
+ 76,141,13,73,0,0,0, //lea 0x49(%rip),%r9 # c34 <_sk_load_565_avx+0xdc>
75,99,4,129, //movslq (%r9,%r8,4),%rax
76,1,200, //add %r9,%rax
255,224, //jmpq *%rax
@@ -4030,7 +4102,7 @@ CODE const uint8_t sk_load_565_avx[] = {
196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0
- 233,52,255,255,255, //jmpq ac8 <_sk_load_565_avx+0x10>
+ 233,52,255,255,255, //jmpq b68 <_sk_load_565_avx+0x10>
244, //hlt
255, //(bad)
255, //(bad)
@@ -4080,7 +4152,7 @@ CODE const uint8_t sk_store_565_avx[] = {
196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9
196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8
72,133,201, //test %rcx,%rcx
- 117,10, //jne c36 <_sk_store_565_avx+0x86>
+ 117,10, //jne cd6 <_sk_store_565_avx+0x86>
196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2)
72,173, //lods %ds:(%rsi),%rax
255,224, //jmpq *%rax
@@ -4089,8 +4161,8 @@ CODE const uint8_t sk_store_565_avx[] = {
254,200, //dec %al
68,15,182,192, //movzbl %al,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 119,236, //ja c32 <_sk_store_565_avx+0x82>
- 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # c94 <_sk_store_565_avx+0xe4>
+ 119,236, //ja cd2 <_sk_store_565_avx+0x82>
+ 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # d34 <_sk_store_565_avx+0xe4>
75,99,4,130, //movslq (%r10,%r8,4),%rax
76,1,208, //add %r10,%rax
255,224, //jmpq *%rax
@@ -4102,7 +4174,7 @@ CODE const uint8_t sk_store_565_avx[] = {
196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2)
197,121,126,192, //vmovd %xmm8,%eax
102,65,137,4,121, //mov %ax,(%r9,%rdi,2)
- 235,161, //jmp c32 <_sk_store_565_avx+0x82>
+ 235,161, //jmp cd2 <_sk_store_565_avx+0x82>
15,31,0, //nopl (%rax)
242,255, //repnz (bad)
255, //(bad)
@@ -4132,26 +4204,32 @@ CODE const uint8_t sk_load_8888_avx[] = {
72,173, //lods %ds:(%rsi),%rax
76,139,16, //mov (%rax),%r10
72,133,201, //test %rcx,%rcx
- 117,125, //jne d37 <_sk_load_8888_avx+0x87>
+ 15,133,157,0,0,0, //jne dfb <_sk_load_8888_avx+0xab>
196,65,124,16,12,186, //vmovups (%r10,%rdi,4),%ymm9
- 196,98,125,24,90,16, //vbroadcastss 0x10(%rdx),%ymm11
+ 184,255,0,0,0, //mov $0xff,%eax
+ 197,249,110,192, //vmovd %eax,%xmm0
+ 197,249,112,192,0, //vpshufd $0x0,%xmm0,%xmm0
+ 196,99,125,24,216,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm11
196,193,36,84,193, //vandps %ymm9,%ymm11,%ymm0
197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
- 196,98,125,24,66,12, //vbroadcastss 0xc(%rdx),%ymm8
- 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 197,249,110,200, //vmovd %eax,%xmm1
+ 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1
+ 196,99,117,24,193,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm8
+ 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0
196,193,41,114,209,8, //vpsrld $0x8,%xmm9,%xmm10
196,99,125,25,203,1, //vextractf128 $0x1,%ymm9,%xmm3
197,241,114,211,8, //vpsrld $0x8,%xmm3,%xmm1
196,227,45,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm10,%ymm1
197,164,84,201, //vandps %ymm1,%ymm11,%ymm1
197,252,91,201, //vcvtdq2ps %ymm1,%ymm1
- 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
+ 196,193,116,89,200, //vmulps %ymm8,%ymm1,%ymm1
196,193,41,114,209,16, //vpsrld $0x10,%xmm9,%xmm10
197,233,114,211,16, //vpsrld $0x10,%xmm3,%xmm2
196,227,45,24,210,1, //vinsertf128 $0x1,%xmm2,%ymm10,%ymm2
197,164,84,210, //vandps %ymm2,%ymm11,%ymm2
197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
- 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
+ 196,193,108,89,208, //vmulps %ymm8,%ymm2,%ymm2
196,193,49,114,209,24, //vpsrld $0x18,%xmm9,%xmm9
197,225,114,211,24, //vpsrld $0x18,%xmm3,%xmm3
196,227,53,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm9,%ymm3
@@ -4165,8 +4243,8 @@ CODE const uint8_t sk_load_8888_avx[] = {
65,254,200, //dec %r8b
69,15,182,192, //movzbl %r8b,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 15,135,108,255,255,255, //ja cc0 <_sk_load_8888_avx+0x10>
- 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # de4 <_sk_load_8888_avx+0x134>
+ 15,135,76,255,255,255, //ja d64 <_sk_load_8888_avx+0x14>
+ 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # ea8 <_sk_load_8888_avx+0x158>
75,99,4,129, //movslq (%r9,%r8,4),%rax
76,1,200, //add %r9,%rax
255,224, //jmpq *%rax
@@ -4189,7 +4267,7 @@ CODE const uint8_t sk_load_8888_avx[] = {
196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9
196,195,49,34,4,186,0, //vpinsrd $0x0,(%r10,%rdi,4),%xmm9,%xmm0
196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9
- 233,220,254,255,255, //jmpq cc0 <_sk_load_8888_avx+0x10>
+ 233,188,254,255,255, //jmpq d64 <_sk_load_8888_avx+0x14>
238, //out %al,(%dx)
255, //(bad)
255, //(bad)
@@ -4213,7 +4291,10 @@ CODE const uint8_t sk_load_8888_avx[] = {
CODE const uint8_t sk_store_8888_avx[] = {
72,173, //lods %ds:(%rsi),%rax
76,139,8, //mov (%rax),%r9
- 196,98,125,24,66,8, //vbroadcastss 0x8(%rdx),%ymm8
+ 184,0,0,127,67, //mov $0x437f0000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8
+ 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9
196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9
197,60,89,209, //vmulps %ymm1,%ymm8,%ymm10
@@ -4238,7 +4319,7 @@ CODE const uint8_t sk_store_8888_avx[] = {
196,65,45,86,192, //vorpd %ymm8,%ymm10,%ymm8
196,65,53,86,192, //vorpd %ymm8,%ymm9,%ymm8
72,133,201, //test %rcx,%rcx
- 117,10, //jne e95 <_sk_store_8888_avx+0x95>
+ 117,10, //jne f68 <_sk_store_8888_avx+0xa4>
196,65,124,17,4,185, //vmovups %ymm8,(%r9,%rdi,4)
72,173, //lods %ds:(%rsi),%rax
255,224, //jmpq *%rax
@@ -4247,8 +4328,8 @@ CODE const uint8_t sk_store_8888_avx[] = {
254,200, //dec %al
68,15,182,192, //movzbl %al,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 119,236, //ja e91 <_sk_store_8888_avx+0x91>
- 76,141,21,84,0,0,0, //lea 0x54(%rip),%r10 # f00 <_sk_store_8888_avx+0x100>
+ 119,236, //ja f64 <_sk_store_8888_avx+0xa0>
+ 76,141,21,85,0,0,0, //lea 0x55(%rip),%r10 # fd4 <_sk_store_8888_avx+0x110>
75,99,4,130, //movslq (%r10,%r8,4),%rax
76,1,208, //add %r10,%rax
255,224, //jmpq *%rax
@@ -4262,28 +4343,29 @@ CODE const uint8_t sk_store_8888_avx[] = {
196,67,121,22,68,185,8,2, //vpextrd $0x2,%xmm8,0x8(%r9,%rdi,4)
196,67,121,22,68,185,4,1, //vpextrd $0x1,%xmm8,0x4(%r9,%rdi,4)
196,65,121,126,4,185, //vmovd %xmm8,(%r9,%rdi,4)
- 235,147, //jmp e91 <_sk_store_8888_avx+0x91>
- 102,144, //xchg %ax,%ax
- 246,255, //idiv %bh
+ 235,147, //jmp f64 <_sk_store_8888_avx+0xa0>
+ 15,31,0, //nopl (%rax)
+ 245, //cmc
255, //(bad)
255, //(bad)
- 238, //out %al,(%dx)
255, //(bad)
+ 237, //in (%dx),%eax
255, //(bad)
- 255,230, //jmpq *%rsi
255, //(bad)
+ 255,229, //jmpq *%rbp
255, //(bad)
255, //(bad)
- 222,255, //fdivrp %st,%st(7)
255, //(bad)
- 255,209, //callq *%rcx
+ 221,255, //(bad)
255, //(bad)
+ 255,208, //callq *%rax
255, //(bad)
- 255,195, //inc %ebx
+ 255, //(bad)
+ 255,194, //inc %edx
255, //(bad)
255, //(bad)
255, //.byte 0xff
- 181,255, //mov $0xff,%ch
+ 180,255, //mov $0xff,%ah
255, //(bad)
255, //.byte 0xff
};
@@ -4292,7 +4374,7 @@ CODE const uint8_t sk_load_f16_avx[] = {
72,173, //lods %ds:(%rsi),%rax
72,139,0, //mov (%rax),%rax
72,133,201, //test %rcx,%rcx
- 15,133,240,0,0,0, //jne 101a <_sk_load_f16_avx+0xfe>
+ 15,133,240,0,0,0, //jne 10ee <_sk_load_f16_avx+0xfe>
197,249,16,12,248, //vmovupd (%rax,%rdi,8),%xmm1
197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2
197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3
@@ -4346,35 +4428,35 @@ CODE const uint8_t sk_load_f16_avx[] = {
197,251,16,12,248, //vmovsd (%rax,%rdi,8),%xmm1
196,65,57,87,192, //vxorpd %xmm8,%xmm8,%xmm8
72,131,249,1, //cmp $0x1,%rcx
- 117,6, //jne 1030 <_sk_load_f16_avx+0x114>
+ 117,6, //jne 1104 <_sk_load_f16_avx+0x114>
197,250,126,201, //vmovq %xmm1,%xmm1
- 235,30, //jmp 104e <_sk_load_f16_avx+0x132>
+ 235,30, //jmp 1122 <_sk_load_f16_avx+0x132>
197,241,22,76,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm1,%xmm1
72,131,249,3, //cmp $0x3,%rcx
- 114,18, //jb 104e <_sk_load_f16_avx+0x132>
+ 114,18, //jb 1122 <_sk_load_f16_avx+0x132>
197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2
72,131,249,3, //cmp $0x3,%rcx
- 117,19, //jne 105b <_sk_load_f16_avx+0x13f>
+ 117,19, //jne 112f <_sk_load_f16_avx+0x13f>
197,250,126,210, //vmovq %xmm2,%xmm2
- 235,46, //jmp 107c <_sk_load_f16_avx+0x160>
+ 235,46, //jmp 1150 <_sk_load_f16_avx+0x160>
197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3
197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2
- 233,230,254,255,255, //jmpq f41 <_sk_load_f16_avx+0x25>
+ 233,230,254,255,255, //jmpq 1015 <_sk_load_f16_avx+0x25>
197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2
72,131,249,5, //cmp $0x5,%rcx
- 114,21, //jb 107c <_sk_load_f16_avx+0x160>
+ 114,21, //jb 1150 <_sk_load_f16_avx+0x160>
197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3
72,131,249,5, //cmp $0x5,%rcx
- 117,18, //jne 1085 <_sk_load_f16_avx+0x169>
+ 117,18, //jne 1159 <_sk_load_f16_avx+0x169>
197,250,126,219, //vmovq %xmm3,%xmm3
- 233,197,254,255,255, //jmpq f41 <_sk_load_f16_avx+0x25>
+ 233,197,254,255,255, //jmpq 1015 <_sk_load_f16_avx+0x25>
197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3
- 233,188,254,255,255, //jmpq f41 <_sk_load_f16_avx+0x25>
+ 233,188,254,255,255, //jmpq 1015 <_sk_load_f16_avx+0x25>
197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3
72,131,249,7, //cmp $0x7,%rcx
- 15,130,172,254,255,255, //jb f41 <_sk_load_f16_avx+0x25>
+ 15,130,172,254,255,255, //jb 1015 <_sk_load_f16_avx+0x25>
197,123,16,68,248,48, //vmovsd 0x30(%rax,%rdi,8),%xmm8
- 233,161,254,255,255, //jmpq f41 <_sk_load_f16_avx+0x25>
+ 233,161,254,255,255, //jmpq 1015 <_sk_load_f16_avx+0x25>
};
CODE const uint8_t sk_store_f16_avx[] = {
@@ -4410,7 +4492,7 @@ CODE const uint8_t sk_store_f16_avx[] = {
196,65,25,98,205, //vpunpckldq %xmm13,%xmm12,%xmm9
196,65,25,106,197, //vpunpckhdq %xmm13,%xmm12,%xmm8
72,133,201, //test %rcx,%rcx
- 117,27, //jne 1163 <_sk_store_f16_avx+0xc3>
+ 117,27, //jne 1237 <_sk_store_f16_avx+0xc3>
197,120,17,28,248, //vmovups %xmm11,(%rax,%rdi,8)
197,120,17,84,248,16, //vmovups %xmm10,0x10(%rax,%rdi,8)
197,120,17,76,248,32, //vmovups %xmm9,0x20(%rax,%rdi,8)
@@ -4419,22 +4501,22 @@ CODE const uint8_t sk_store_f16_avx[] = {
255,224, //jmpq *%rax
197,121,214,28,248, //vmovq %xmm11,(%rax,%rdi,8)
72,131,249,1, //cmp $0x1,%rcx
- 116,241, //je 115f <_sk_store_f16_avx+0xbf>
+ 116,241, //je 1233 <_sk_store_f16_avx+0xbf>
197,121,23,92,248,8, //vmovhpd %xmm11,0x8(%rax,%rdi,8)
72,131,249,3, //cmp $0x3,%rcx
- 114,229, //jb 115f <_sk_store_f16_avx+0xbf>
+ 114,229, //jb 1233 <_sk_store_f16_avx+0xbf>
197,121,214,84,248,16, //vmovq %xmm10,0x10(%rax,%rdi,8)
- 116,221, //je 115f <_sk_store_f16_avx+0xbf>
+ 116,221, //je 1233 <_sk_store_f16_avx+0xbf>
197,121,23,84,248,24, //vmovhpd %xmm10,0x18(%rax,%rdi,8)
72,131,249,5, //cmp $0x5,%rcx
- 114,209, //jb 115f <_sk_store_f16_avx+0xbf>
+ 114,209, //jb 1233 <_sk_store_f16_avx+0xbf>
197,121,214,76,248,32, //vmovq %xmm9,0x20(%rax,%rdi,8)
- 116,201, //je 115f <_sk_store_f16_avx+0xbf>
+ 116,201, //je 1233 <_sk_store_f16_avx+0xbf>
197,121,23,76,248,40, //vmovhpd %xmm9,0x28(%rax,%rdi,8)
72,131,249,7, //cmp $0x7,%rcx
- 114,189, //jb 115f <_sk_store_f16_avx+0xbf>
+ 114,189, //jb 1233 <_sk_store_f16_avx+0xbf>
197,121,214,68,248,48, //vmovq %xmm8,0x30(%rax,%rdi,8)
- 235,181, //jmp 115f <_sk_store_f16_avx+0xbf>
+ 235,181, //jmp 1233 <_sk_store_f16_avx+0xbf>
};
CODE const uint8_t sk_store_f32_avx[] = {
@@ -4450,7 +4532,7 @@ CODE const uint8_t sk_store_f32_avx[] = {
196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8
196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11
72,133,201, //test %rcx,%rcx
- 117,55, //jne 1217 <_sk_store_f32_avx+0x6d>
+ 117,55, //jne 12eb <_sk_store_f32_avx+0x6d>
196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12
196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13
196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9
@@ -4463,22 +4545,22 @@ CODE const uint8_t sk_store_f32_avx[] = {
255,224, //jmpq *%rax
196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4)
72,131,249,1, //cmp $0x1,%rcx
- 116,240, //je 1213 <_sk_store_f32_avx+0x69>
+ 116,240, //je 12e7 <_sk_store_f32_avx+0x69>
196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4)
72,131,249,3, //cmp $0x3,%rcx
- 114,227, //jb 1213 <_sk_store_f32_avx+0x69>
+ 114,227, //jb 12e7 <_sk_store_f32_avx+0x69>
196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4)
- 116,218, //je 1213 <_sk_store_f32_avx+0x69>
+ 116,218, //je 12e7 <_sk_store_f32_avx+0x69>
196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4)
72,131,249,5, //cmp $0x5,%rcx
- 114,205, //jb 1213 <_sk_store_f32_avx+0x69>
+ 114,205, //jb 12e7 <_sk_store_f32_avx+0x69>
196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4)
- 116,195, //je 1213 <_sk_store_f32_avx+0x69>
+ 116,195, //je 12e7 <_sk_store_f32_avx+0x69>
196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4)
72,131,249,7, //cmp $0x7,%rcx
- 114,181, //jb 1213 <_sk_store_f32_avx+0x69>
+ 114,181, //jb 12e7 <_sk_store_f32_avx+0x69>
196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4)
- 235,171, //jmp 1213 <_sk_store_f32_avx+0x69>
+ 235,171, //jmp 12e7 <_sk_store_f32_avx+0x69>
};
CODE const uint8_t sk_clamp_x_avx[] = {
@@ -4839,16 +4921,18 @@ CODE const uint8_t sk_seed_shader_sse41[] = {
102,15,110,199, //movd %edi,%xmm0
102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
15,91,200, //cvtdq2ps %xmm0,%xmm1
- 243,15,16,18, //movss (%rdx),%xmm2
- 243,15,16,90,4, //movss 0x4(%rdx),%xmm3
- 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
- 15,88,203, //addps %xmm3,%xmm1
+ 185,0,0,0,63, //mov $0x3f000000,%ecx
+ 102,15,110,209, //movd %ecx,%xmm2
+ 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
+ 15,88,202, //addps %xmm2,%xmm1
15,16,66,20, //movups 0x14(%rdx),%xmm0
15,88,193, //addps %xmm1,%xmm0
102,15,110,8, //movd (%rax),%xmm1
102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1
15,91,201, //cvtdq2ps %xmm1,%xmm1
- 15,88,203, //addps %xmm3,%xmm1
+ 15,88,202, //addps %xmm2,%xmm1
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,15,110,208, //movd %eax,%xmm2
15,198,210,0, //shufps $0x0,%xmm2,%xmm2
72,173, //lods %ds:(%rsi),%rax
15,87,219, //xorps %xmm3,%xmm3
@@ -4892,7 +4976,8 @@ CODE const uint8_t sk_plus__sse41[] = {
};
CODE const uint8_t sk_srcover_sse41[] = {
- 243,68,15,16,2, //movss (%rdx),%xmm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
68,15,92,195, //subps %xmm3,%xmm8
69,15,40,200, //movaps %xmm8,%xmm9
@@ -4911,7 +4996,8 @@ CODE const uint8_t sk_srcover_sse41[] = {
};
CODE const uint8_t sk_dstover_sse41[] = {
- 243,68,15,16,2, //movss (%rdx),%xmm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
68,15,92,199, //subps %xmm7,%xmm8
65,15,89,192, //mulps %xmm8,%xmm0
@@ -4937,7 +5023,8 @@ CODE const uint8_t sk_clamp_0_sse41[] = {
};
CODE const uint8_t sk_clamp_1_sse41[] = {
- 243,68,15,16,2, //movss (%rdx),%xmm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
65,15,93,192, //minps %xmm8,%xmm0
65,15,93,200, //minps %xmm8,%xmm1
@@ -4948,7 +5035,8 @@ CODE const uint8_t sk_clamp_1_sse41[] = {
};
CODE const uint8_t sk_clamp_a_sse41[] = {
- 243,68,15,16,2, //movss (%rdx),%xmm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
65,15,93,216, //minps %xmm8,%xmm3
15,93,195, //minps %xmm3,%xmm0
@@ -5024,7 +5112,8 @@ CODE const uint8_t sk_premul_sse41[] = {
CODE const uint8_t sk_unpremul_sse41[] = {
68,15,40,192, //movaps %xmm0,%xmm8
69,15,87,201, //xorps %xmm9,%xmm9
- 243,68,15,16,18, //movss (%rdx),%xmm10
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,208, //movd %eax,%xmm10
69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
68,15,94,211, //divps %xmm3,%xmm10
15,40,195, //movaps %xmm3,%xmm0
@@ -5178,7 +5267,8 @@ CODE const uint8_t sk_scale_u8_sse41[] = {
72,139,0, //mov (%rax),%rax
102,68,15,56,49,4,56, //pmovzxbd (%rax,%rdi,1),%xmm8
69,15,91,192, //cvtdq2ps %xmm8,%xmm8
- 243,68,15,16,74,12, //movss 0xc(%rdx),%xmm9
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 102,68,15,110,200, //movd %eax,%xmm9
69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
69,15,89,200, //mulps %xmm8,%xmm9
65,15,89,193, //mulps %xmm9,%xmm0
@@ -5214,7 +5304,8 @@ CODE const uint8_t sk_lerp_u8_sse41[] = {
72,139,0, //mov (%rax),%rax
102,68,15,56,49,4,56, //pmovzxbd (%rax,%rdi,1),%xmm8
69,15,91,192, //cvtdq2ps %xmm8,%xmm8
- 243,68,15,16,74,12, //movss 0xc(%rdx),%xmm9
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 102,68,15,110,200, //movd %eax,%xmm9
69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
69,15,89,200, //mulps %xmm8,%xmm9
15,92,196, //subps %xmm4,%xmm0
@@ -5241,33 +5332,34 @@ CODE const uint8_t sk_lerp_565_sse41[] = {
102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3
102,65,15,219,216, //pand %xmm8,%xmm3
68,15,91,203, //cvtdq2ps %xmm3,%xmm9
- 243,15,16,26, //movss (%rdx),%xmm3
- 243,68,15,16,82,116, //movss 0x74(%rdx),%xmm10
- 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
- 69,15,89,209, //mulps %xmm9,%xmm10
- 102,68,15,110,74,108, //movd 0x6c(%rdx),%xmm9
- 102,69,15,112,201,0, //pshufd $0x0,%xmm9,%xmm9
- 102,69,15,219,200, //pand %xmm8,%xmm9
- 69,15,91,201, //cvtdq2ps %xmm9,%xmm9
- 243,68,15,16,90,120, //movss 0x78(%rdx),%xmm11
+ 243,68,15,16,90,116, //movss 0x74(%rdx),%xmm11
+ 243,68,15,16,82,120, //movss 0x78(%rdx),%xmm10
69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
69,15,89,217, //mulps %xmm9,%xmm11
- 102,68,15,110,74,112, //movd 0x70(%rdx),%xmm9
- 102,69,15,112,201,0, //pshufd $0x0,%xmm9,%xmm9
- 102,69,15,219,200, //pand %xmm8,%xmm9
- 69,15,91,193, //cvtdq2ps %xmm9,%xmm8
- 243,68,15,16,74,124, //movss 0x7c(%rdx),%xmm9
- 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
- 69,15,89,200, //mulps %xmm8,%xmm9
+ 102,15,110,90,108, //movd 0x6c(%rdx),%xmm3
+ 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3
+ 102,65,15,219,216, //pand %xmm8,%xmm3
+ 15,91,219, //cvtdq2ps %xmm3,%xmm3
+ 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
+ 68,15,89,211, //mulps %xmm3,%xmm10
+ 102,15,110,90,112, //movd 0x70(%rdx),%xmm3
+ 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3
+ 102,65,15,219,216, //pand %xmm8,%xmm3
+ 68,15,91,195, //cvtdq2ps %xmm3,%xmm8
+ 243,15,16,90,124, //movss 0x7c(%rdx),%xmm3
+ 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
+ 65,15,89,216, //mulps %xmm8,%xmm3
15,92,196, //subps %xmm4,%xmm0
- 65,15,89,194, //mulps %xmm10,%xmm0
+ 65,15,89,195, //mulps %xmm11,%xmm0
15,88,196, //addps %xmm4,%xmm0
15,92,205, //subps %xmm5,%xmm1
- 65,15,89,203, //mulps %xmm11,%xmm1
+ 65,15,89,202, //mulps %xmm10,%xmm1
15,88,205, //addps %xmm5,%xmm1
15,92,214, //subps %xmm6,%xmm2
- 65,15,89,209, //mulps %xmm9,%xmm2
+ 15,89,211, //mulps %xmm3,%xmm2
15,88,214, //addps %xmm6,%xmm2
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,15,110,216, //movd %eax,%xmm3
15,198,219,0, //shufps $0x0,%xmm3,%xmm3
72,173, //lods %ds:(%rsi),%rax
255,224, //jmpq *%rax
@@ -5420,7 +5512,8 @@ CODE const uint8_t sk_load_8888_sse41[] = {
72,173, //lods %ds:(%rsi),%rax
72,139,0, //mov (%rax),%rax
243,15,111,28,184, //movdqu (%rax,%rdi,4),%xmm3
- 102,15,110,66,16, //movd 0x10(%rdx),%xmm0
+ 184,255,0,0,0, //mov $0xff,%eax
+ 102,15,110,192, //movd %eax,%xmm0
102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
102,15,111,203, //movdqa %xmm3,%xmm1
102,15,114,209,8, //psrld $0x8,%xmm1
@@ -5430,7 +5523,8 @@ CODE const uint8_t sk_load_8888_sse41[] = {
102,15,219,208, //pand %xmm0,%xmm2
102,15,219,195, //pand %xmm3,%xmm0
15,91,192, //cvtdq2ps %xmm0,%xmm0
- 243,68,15,16,66,12, //movss 0xc(%rdx),%xmm8
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
65,15,89,192, //mulps %xmm8,%xmm0
15,91,201, //cvtdq2ps %xmm1,%xmm1
@@ -5447,7 +5541,8 @@ CODE const uint8_t sk_load_8888_sse41[] = {
CODE const uint8_t sk_store_8888_sse41[] = {
72,173, //lods %ds:(%rsi),%rax
72,139,0, //mov (%rax),%rax
- 243,68,15,16,66,8, //movss 0x8(%rdx),%xmm8
+ 185,0,0,127,67, //mov $0x437f0000,%ecx
+ 102,68,15,110,193, //movd %ecx,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
69,15,40,200, //movaps %xmm8,%xmm9
68,15,89,200, //mulps %xmm0,%xmm9
@@ -5975,16 +6070,18 @@ CODE const uint8_t sk_seed_shader_sse2[] = {
102,15,110,199, //movd %edi,%xmm0
102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
15,91,200, //cvtdq2ps %xmm0,%xmm1
- 243,15,16,18, //movss (%rdx),%xmm2
- 243,15,16,90,4, //movss 0x4(%rdx),%xmm3
- 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
- 15,88,203, //addps %xmm3,%xmm1
+ 185,0,0,0,63, //mov $0x3f000000,%ecx
+ 102,15,110,209, //movd %ecx,%xmm2
+ 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
+ 15,88,202, //addps %xmm2,%xmm1
15,16,66,20, //movups 0x14(%rdx),%xmm0
15,88,193, //addps %xmm1,%xmm0
102,15,110,8, //movd (%rax),%xmm1
102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1
15,91,201, //cvtdq2ps %xmm1,%xmm1
- 15,88,203, //addps %xmm3,%xmm1
+ 15,88,202, //addps %xmm2,%xmm1
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,15,110,208, //movd %eax,%xmm2
15,198,210,0, //shufps $0x0,%xmm2,%xmm2
72,173, //lods %ds:(%rsi),%rax
15,87,219, //xorps %xmm3,%xmm3
@@ -6028,7 +6125,8 @@ CODE const uint8_t sk_plus__sse2[] = {
};
CODE const uint8_t sk_srcover_sse2[] = {
- 243,68,15,16,2, //movss (%rdx),%xmm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
68,15,92,195, //subps %xmm3,%xmm8
69,15,40,200, //movaps %xmm8,%xmm9
@@ -6047,7 +6145,8 @@ CODE const uint8_t sk_srcover_sse2[] = {
};
CODE const uint8_t sk_dstover_sse2[] = {
- 243,68,15,16,2, //movss (%rdx),%xmm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
68,15,92,199, //subps %xmm7,%xmm8
65,15,89,192, //mulps %xmm8,%xmm0
@@ -6073,7 +6172,8 @@ CODE const uint8_t sk_clamp_0_sse2[] = {
};
CODE const uint8_t sk_clamp_1_sse2[] = {
- 243,68,15,16,2, //movss (%rdx),%xmm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
65,15,93,192, //minps %xmm8,%xmm0
65,15,93,200, //minps %xmm8,%xmm1
@@ -6084,7 +6184,8 @@ CODE const uint8_t sk_clamp_1_sse2[] = {
};
CODE const uint8_t sk_clamp_a_sse2[] = {
- 243,68,15,16,2, //movss (%rdx),%xmm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
65,15,93,216, //minps %xmm8,%xmm3
15,93,195, //minps %xmm3,%xmm0
@@ -6160,7 +6261,8 @@ CODE const uint8_t sk_premul_sse2[] = {
CODE const uint8_t sk_unpremul_sse2[] = {
69,15,87,192, //xorps %xmm8,%xmm8
68,15,194,195,0, //cmpeqps %xmm3,%xmm8
- 243,68,15,16,10, //movss (%rdx),%xmm9
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,200, //movd %eax,%xmm9
69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
68,15,94,203, //divps %xmm3,%xmm9
69,15,85,193, //andnps %xmm9,%xmm8
@@ -6315,7 +6417,8 @@ CODE const uint8_t sk_scale_u8_sse2[] = {
102,69,15,96,193, //punpcklbw %xmm9,%xmm8
102,69,15,97,193, //punpcklwd %xmm9,%xmm8
69,15,91,192, //cvtdq2ps %xmm8,%xmm8
- 243,68,15,16,74,12, //movss 0xc(%rdx),%xmm9
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 102,68,15,110,200, //movd %eax,%xmm9
69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
69,15,89,200, //mulps %xmm8,%xmm9
65,15,89,193, //mulps %xmm9,%xmm0
@@ -6354,7 +6457,8 @@ CODE const uint8_t sk_lerp_u8_sse2[] = {
102,69,15,96,193, //punpcklbw %xmm9,%xmm8
102,69,15,97,193, //punpcklwd %xmm9,%xmm8
69,15,91,192, //cvtdq2ps %xmm8,%xmm8
- 243,68,15,16,74,12, //movss 0xc(%rdx),%xmm9
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 102,68,15,110,200, //movd %eax,%xmm9
69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
69,15,89,200, //mulps %xmm8,%xmm9
15,92,196, //subps %xmm4,%xmm0
@@ -6376,40 +6480,41 @@ CODE const uint8_t sk_lerp_u8_sse2[] = {
CODE const uint8_t sk_lerp_565_sse2[] = {
72,173, //lods %ds:(%rsi),%rax
72,139,0, //mov (%rax),%rax
- 243,68,15,126,4,120, //movq (%rax,%rdi,2),%xmm8
+ 243,68,15,126,12,120, //movq (%rax,%rdi,2),%xmm9
102,15,239,219, //pxor %xmm3,%xmm3
- 102,68,15,97,195, //punpcklwd %xmm3,%xmm8
+ 102,68,15,97,203, //punpcklwd %xmm3,%xmm9
102,15,110,90,104, //movd 0x68(%rdx),%xmm3
102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3
- 102,65,15,219,216, //pand %xmm8,%xmm3
- 68,15,91,203, //cvtdq2ps %xmm3,%xmm9
- 243,15,16,26, //movss (%rdx),%xmm3
- 243,68,15,16,82,116, //movss 0x74(%rdx),%xmm10
- 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
- 69,15,89,209, //mulps %xmm9,%xmm10
- 102,68,15,110,74,108, //movd 0x6c(%rdx),%xmm9
- 102,69,15,112,201,0, //pshufd $0x0,%xmm9,%xmm9
- 102,69,15,219,200, //pand %xmm8,%xmm9
- 69,15,91,201, //cvtdq2ps %xmm9,%xmm9
- 243,68,15,16,90,120, //movss 0x78(%rdx),%xmm11
+ 102,65,15,219,217, //pand %xmm9,%xmm3
+ 68,15,91,211, //cvtdq2ps %xmm3,%xmm10
+ 243,68,15,16,90,116, //movss 0x74(%rdx),%xmm11
+ 243,68,15,16,66,120, //movss 0x78(%rdx),%xmm8
69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
- 69,15,89,217, //mulps %xmm9,%xmm11
- 102,68,15,110,74,112, //movd 0x70(%rdx),%xmm9
- 102,69,15,112,201,0, //pshufd $0x0,%xmm9,%xmm9
- 102,69,15,219,200, //pand %xmm8,%xmm9
- 69,15,91,193, //cvtdq2ps %xmm9,%xmm8
- 243,68,15,16,74,124, //movss 0x7c(%rdx),%xmm9
- 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
- 69,15,89,200, //mulps %xmm8,%xmm9
+ 69,15,89,218, //mulps %xmm10,%xmm11
+ 102,15,110,90,108, //movd 0x6c(%rdx),%xmm3
+ 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3
+ 102,65,15,219,217, //pand %xmm9,%xmm3
+ 15,91,219, //cvtdq2ps %xmm3,%xmm3
+ 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
+ 68,15,89,195, //mulps %xmm3,%xmm8
+ 102,15,110,90,112, //movd 0x70(%rdx),%xmm3
+ 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3
+ 102,65,15,219,217, //pand %xmm9,%xmm3
+ 68,15,91,203, //cvtdq2ps %xmm3,%xmm9
+ 243,15,16,90,124, //movss 0x7c(%rdx),%xmm3
+ 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
+ 65,15,89,217, //mulps %xmm9,%xmm3
15,92,196, //subps %xmm4,%xmm0
- 65,15,89,194, //mulps %xmm10,%xmm0
+ 65,15,89,195, //mulps %xmm11,%xmm0
15,88,196, //addps %xmm4,%xmm0
15,92,205, //subps %xmm5,%xmm1
- 65,15,89,203, //mulps %xmm11,%xmm1
+ 65,15,89,200, //mulps %xmm8,%xmm1
15,88,205, //addps %xmm5,%xmm1
15,92,214, //subps %xmm6,%xmm2
- 65,15,89,209, //mulps %xmm9,%xmm2
+ 15,89,211, //mulps %xmm3,%xmm2
15,88,214, //addps %xmm6,%xmm2
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,15,110,216, //movd %eax,%xmm3
15,198,219,0, //shufps $0x0,%xmm3,%xmm3
72,173, //lods %ds:(%rsi),%rax
255,224, //jmpq *%rax
@@ -6579,7 +6684,8 @@ CODE const uint8_t sk_load_8888_sse2[] = {
72,173, //lods %ds:(%rsi),%rax
72,139,0, //mov (%rax),%rax
243,15,111,28,184, //movdqu (%rax,%rdi,4),%xmm3
- 102,15,110,66,16, //movd 0x10(%rdx),%xmm0
+ 184,255,0,0,0, //mov $0xff,%eax
+ 102,15,110,192, //movd %eax,%xmm0
102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
102,15,111,203, //movdqa %xmm3,%xmm1
102,15,114,209,8, //psrld $0x8,%xmm1
@@ -6589,7 +6695,8 @@ CODE const uint8_t sk_load_8888_sse2[] = {
102,15,219,208, //pand %xmm0,%xmm2
102,15,219,195, //pand %xmm3,%xmm0
15,91,192, //cvtdq2ps %xmm0,%xmm0
- 243,68,15,16,66,12, //movss 0xc(%rdx),%xmm8
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
65,15,89,192, //mulps %xmm8,%xmm0
15,91,201, //cvtdq2ps %xmm1,%xmm1
@@ -6606,7 +6713,8 @@ CODE const uint8_t sk_load_8888_sse2[] = {
CODE const uint8_t sk_store_8888_sse2[] = {
72,173, //lods %ds:(%rsi),%rax
72,139,0, //mov (%rax),%rax
- 243,68,15,16,66,8, //movss 0x8(%rdx),%xmm8
+ 185,0,0,127,67, //mov $0x437f0000,%ecx
+ 102,68,15,110,193, //movd %ecx,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
69,15,40,200, //movaps %xmm8,%xmm9
68,15,89,200, //mulps %xmm0,%xmm9
@@ -6762,7 +6870,8 @@ CODE const uint8_t sk_repeat_x_sse2[] = {
243,69,15,91,209, //cvttps2dq %xmm9,%xmm10
69,15,91,210, //cvtdq2ps %xmm10,%xmm10
69,15,194,202,1, //cmpltps %xmm10,%xmm9
- 243,68,15,16,26, //movss (%rdx),%xmm11
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,216, //movd %eax,%xmm11
69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
69,15,84,217, //andps %xmm9,%xmm11
69,15,92,211, //subps %xmm11,%xmm10
@@ -6784,7 +6893,8 @@ CODE const uint8_t sk_repeat_y_sse2[] = {
243,69,15,91,209, //cvttps2dq %xmm9,%xmm10
69,15,91,210, //cvtdq2ps %xmm10,%xmm10
69,15,194,202,1, //cmpltps %xmm10,%xmm9
- 243,68,15,16,26, //movss (%rdx),%xmm11
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,216, //movd %eax,%xmm11
69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
69,15,84,217, //andps %xmm9,%xmm11
69,15,92,211, //subps %xmm11,%xmm10
@@ -6810,7 +6920,8 @@ CODE const uint8_t sk_mirror_x_sse2[] = {
243,69,15,91,218, //cvttps2dq %xmm10,%xmm11
69,15,91,219, //cvtdq2ps %xmm11,%xmm11
69,15,194,211,1, //cmpltps %xmm11,%xmm10
- 243,68,15,16,34, //movss (%rdx),%xmm12
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,224, //movd %eax,%xmm12
69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
69,15,84,226, //andps %xmm10,%xmm12
69,15,87,210, //xorps %xmm10,%xmm10
@@ -6840,7 +6951,8 @@ CODE const uint8_t sk_mirror_y_sse2[] = {
243,69,15,91,218, //cvttps2dq %xmm10,%xmm11
69,15,91,219, //cvtdq2ps %xmm11,%xmm11
69,15,194,211,1, //cmpltps %xmm11,%xmm10
- 243,68,15,16,34, //movss (%rdx),%xmm12
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,224, //movd %eax,%xmm12
69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
69,15,84,226, //andps %xmm10,%xmm12
69,15,87,210, //xorps %xmm10,%xmm10
@@ -7205,13 +7317,17 @@ CODE const uint8_t sk_seed_shader_hsw[] = {
197,249,110,199, //vmovd %edi,%xmm0
196,226,125,24,192, //vbroadcastss %xmm0,%ymm0
197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
- 196,226,125,24,74,4, //vbroadcastss 0x4(%rdx),%ymm1
+ 65,184,0,0,0,63, //mov $0x3f000000,%r8d
+ 196,193,121,110,200, //vmovd %r8d,%xmm1
+ 196,226,125,24,201, //vbroadcastss %xmm1,%ymm1
197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0
197,252,88,66,20, //vaddps 0x14(%rdx),%ymm0,%ymm0
196,226,125,24,16, //vbroadcastss (%rax),%ymm2
197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1
- 196,226,125,24,18, //vbroadcastss (%rdx),%ymm2
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,249,110,208, //vmovd %eax,%xmm2
+ 196,226,125,24,210, //vbroadcastss %xmm2,%ymm2
72,173, //lods %ds:(%rsi),%rax
197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4
@@ -7250,7 +7366,9 @@ CODE const uint8_t sk_plus__hsw[] = {
};
CODE const uint8_t sk_srcover_hsw[] = {
- 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8
197,60,92,195, //vsubps %ymm3,%ymm8,%ymm8
196,194,93,184,192, //vfmadd231ps %ymm8,%ymm4,%ymm0
196,194,85,184,200, //vfmadd231ps %ymm8,%ymm5,%ymm1
@@ -7261,7 +7379,9 @@ CODE const uint8_t sk_srcover_hsw[] = {
};
CODE const uint8_t sk_dstover_hsw[] = {
- 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8
197,60,92,199, //vsubps %ymm7,%ymm8,%ymm8
196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0
196,226,61,168,205, //vfmadd213ps %ymm5,%ymm8,%ymm1
@@ -7282,7 +7402,9 @@ CODE const uint8_t sk_clamp_0_hsw[] = {
};
CODE const uint8_t sk_clamp_1_hsw[] = {
- 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8
196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0
196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1
196,193,108,93,208, //vminps %ymm8,%ymm2,%ymm2
@@ -7292,7 +7414,9 @@ CODE const uint8_t sk_clamp_1_hsw[] = {
};
CODE const uint8_t sk_clamp_a_hsw[] = {
- 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8
196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3
197,252,93,195, //vminps %ymm3,%ymm0,%ymm0
197,244,93,203, //vminps %ymm3,%ymm1,%ymm1
@@ -7364,7 +7488,9 @@ CODE const uint8_t sk_premul_hsw[] = {
CODE const uint8_t sk_unpremul_hsw[] = {
196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
196,65,100,194,200,0, //vcmpeqps %ymm8,%ymm3,%ymm9
- 196,98,125,24,18, //vbroadcastss (%rdx),%ymm10
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,208, //vmovd %eax,%xmm10
+ 196,66,125,24,210, //vbroadcastss %xmm10,%ymm10
197,44,94,211, //vdivps %ymm3,%ymm10,%ymm10
196,67,45,74,192,144, //vblendvps %ymm9,%ymm8,%ymm10,%ymm8
197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
@@ -7459,11 +7585,13 @@ CODE const uint8_t sk_scale_u8_hsw[] = {
72,139,0, //mov (%rax),%rax
72,1,248, //add %rdi,%rax
77,133,192, //test %r8,%r8
- 117,48, //jne 4b1 <_sk_scale_u8_hsw+0x40>
+ 117,56, //jne 4f9 <_sk_scale_u8_hsw+0x48>
197,123,16,0, //vmovsd (%rax),%xmm8
196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8
196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
- 196,98,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm9
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 197,121,110,200, //vmovd %eax,%xmm9
+ 196,66,125,24,201, //vbroadcastss %xmm9,%ymm9
196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8
197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
@@ -7481,9 +7609,9 @@ CODE const uint8_t sk_scale_u8_hsw[] = {
77,9,217, //or %r11,%r9
72,131,193,8, //add $0x8,%rcx
73,255,202, //dec %r10
- 117,234, //jne 4b9 <_sk_scale_u8_hsw+0x48>
+ 117,234, //jne 501 <_sk_scale_u8_hsw+0x50>
196,65,249,110,193, //vmovq %r9,%xmm8
- 235,175, //jmp 485 <_sk_scale_u8_hsw+0x14>
+ 235,167, //jmp 4c5 <_sk_scale_u8_hsw+0x14>
};
CODE const uint8_t sk_lerp_1_float_hsw[] = {
@@ -7507,11 +7635,13 @@ CODE const uint8_t sk_lerp_u8_hsw[] = {
72,139,0, //mov (%rax),%rax
72,1,248, //add %rdi,%rax
77,133,192, //test %r8,%r8
- 117,68, //jne 559 <_sk_lerp_u8_hsw+0x54>
+ 117,76, //jne 5a9 <_sk_lerp_u8_hsw+0x5c>
197,123,16,0, //vmovsd (%rax),%xmm8
196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8
196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
- 196,98,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm9
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 197,121,110,200, //vmovd %eax,%xmm9
+ 196,66,125,24,201, //vbroadcastss %xmm9,%ymm9
196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8
197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0
196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0
@@ -7533,16 +7663,16 @@ CODE const uint8_t sk_lerp_u8_hsw[] = {
77,9,217, //or %r11,%r9
72,131,193,8, //add $0x8,%rcx
73,255,202, //dec %r10
- 117,234, //jne 561 <_sk_lerp_u8_hsw+0x5c>
+ 117,234, //jne 5b1 <_sk_lerp_u8_hsw+0x64>
196,65,249,110,193, //vmovq %r9,%xmm8
- 235,155, //jmp 519 <_sk_lerp_u8_hsw+0x14>
+ 235,147, //jmp 561 <_sk_lerp_u8_hsw+0x14>
};
CODE const uint8_t sk_lerp_565_hsw[] = {
72,173, //lods %ds:(%rsi),%rax
76,139,16, //mov (%rax),%r10
72,133,201, //test %rcx,%rcx
- 117,123, //jne 603 <_sk_lerp_565_hsw+0x85>
+ 15,133,132,0,0,0, //jne 660 <_sk_lerp_565_hsw+0x92>
196,193,122,111,28,122, //vmovdqu (%r10,%rdi,2),%xmm3
196,226,125,51,219, //vpmovzxwd %xmm3,%ymm3
196,98,125,88,66,104, //vpbroadcastd 0x68(%rdx),%ymm8
@@ -7566,7 +7696,9 @@ CODE const uint8_t sk_lerp_565_hsw[] = {
196,226,53,168,205, //vfmadd213ps %ymm5,%ymm9,%ymm1
197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2
196,226,101,168,214, //vfmadd213ps %ymm6,%ymm3,%ymm2
- 196,226,125,24,26, //vbroadcastss (%rdx),%ymm3
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,249,110,216, //vmovd %eax,%xmm3
+ 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3
72,173, //lods %ds:(%rsi),%rax
255,224, //jmpq *%rax
65,137,200, //mov %ecx,%r8d
@@ -7575,8 +7707,8 @@ CODE const uint8_t sk_lerp_565_hsw[] = {
65,254,200, //dec %r8b
69,15,182,192, //movzbl %r8b,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 15,135,111,255,255,255, //ja 58e <_sk_lerp_565_hsw+0x10>
- 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # 670 <_sk_lerp_565_hsw+0xf2>
+ 15,135,102,255,255,255, //ja 5e2 <_sk_lerp_565_hsw+0x14>
+ 76,141,13,73,0,0,0, //lea 0x49(%rip),%r9 # 6cc <_sk_lerp_565_hsw+0xfe>
75,99,4,129, //movslq (%r9,%r8,4),%rax
76,1,200, //add %r9,%rax
255,224, //jmpq *%rax
@@ -7588,27 +7720,27 @@ CODE const uint8_t sk_lerp_565_hsw[] = {
196,193,97,196,92,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm3,%xmm3
196,193,97,196,92,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm3,%xmm3
196,193,97,196,28,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm3,%xmm3
- 233,31,255,255,255, //jmpq 58e <_sk_lerp_565_hsw+0x10>
- 144, //nop
- 243,255, //repz (bad)
+ 233,22,255,255,255, //jmpq 5e2 <_sk_lerp_565_hsw+0x14>
+ 244, //hlt
255, //(bad)
255, //(bad)
- 235,255, //jmp 675 <_sk_lerp_565_hsw+0xf7>
255, //(bad)
- 255,227, //jmpq *%rbx
+ 236, //in (%dx),%al
255, //(bad)
255, //(bad)
+ 255,228, //jmpq *%rsp
255, //(bad)
- 219,255, //(bad)
255, //(bad)
- 255,211, //callq *%rbx
255, //(bad)
+ 220,255, //fdivr %st,%st(7)
255, //(bad)
- 255,203, //dec %ebx
+ 255,212, //callq *%rsp
255, //(bad)
255, //(bad)
+ 255,204, //dec %esp
255, //(bad)
- 191, //.byte 0xbf
+ 255, //(bad)
+ 255,192, //inc %eax
255, //(bad)
255, //(bad)
255, //.byte 0xff
@@ -7620,7 +7752,7 @@ CODE const uint8_t sk_load_tables_hsw[] = {
76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9
76,3,8, //add (%rax),%r9
77,133,192, //test %r8,%r8
- 117,106, //jne 70b <_sk_load_tables_hsw+0x7f>
+ 117,106, //jne 767 <_sk_load_tables_hsw+0x7f>
196,193,126,111,25, //vmovdqu (%r9),%ymm3
196,226,125,88,82,16, //vpbroadcastd 0x10(%rdx),%ymm2
197,237,219,203, //vpand %ymm3,%ymm2,%ymm1
@@ -7652,7 +7784,7 @@ CODE const uint8_t sk_load_tables_hsw[] = {
196,193,249,110,194, //vmovq %r10,%xmm0
196,226,125,33,192, //vpmovsxbd %xmm0,%ymm0
196,194,125,140,25, //vpmaskmovd (%r9),%ymm0,%ymm3
- 233,114,255,255,255, //jmpq 6a6 <_sk_load_tables_hsw+0x1a>
+ 233,114,255,255,255, //jmpq 702 <_sk_load_tables_hsw+0x1a>
};
CODE const uint8_t sk_load_a8_hsw[] = {
@@ -7661,7 +7793,7 @@ CODE const uint8_t sk_load_a8_hsw[] = {
72,139,0, //mov (%rax),%rax
72,1,248, //add %rdi,%rax
77,133,192, //test %r8,%r8
- 117,42, //jne 76e <_sk_load_a8_hsw+0x3a>
+ 117,42, //jne 7ca <_sk_load_a8_hsw+0x3a>
197,251,16,0, //vmovsd (%rax),%xmm0
196,226,125,49,192, //vpmovzxbd %xmm0,%ymm0
197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
@@ -7682,9 +7814,9 @@ CODE const uint8_t sk_load_a8_hsw[] = {
77,9,217, //or %r11,%r9
72,131,193,8, //add $0x8,%rcx
73,255,202, //dec %r10
- 117,234, //jne 776 <_sk_load_a8_hsw+0x42>
+ 117,234, //jne 7d2 <_sk_load_a8_hsw+0x42>
196,193,249,110,193, //vmovq %r9,%xmm0
- 235,181, //jmp 748 <_sk_load_a8_hsw+0x14>
+ 235,181, //jmp 7a4 <_sk_load_a8_hsw+0x14>
};
CODE const uint8_t sk_store_a8_hsw[] = {
@@ -7697,7 +7829,7 @@ CODE const uint8_t sk_store_a8_hsw[] = {
196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8
196,65,57,103,192, //vpackuswb %xmm8,%xmm8,%xmm8
72,133,201, //test %rcx,%rcx
- 117,10, //jne 7c6 <_sk_store_a8_hsw+0x33>
+ 117,10, //jne 822 <_sk_store_a8_hsw+0x33>
196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1)
72,173, //lods %ds:(%rsi),%rax
255,224, //jmpq *%rax
@@ -7706,9 +7838,9 @@ CODE const uint8_t sk_store_a8_hsw[] = {
254,200, //dec %al
68,15,182,192, //movzbl %al,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 119,236, //ja 7c2 <_sk_store_a8_hsw+0x2f>
+ 119,236, //ja 81e <_sk_store_a8_hsw+0x2f>
196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8
- 76,141,21,66,0,0,0, //lea 0x42(%rip),%r10 # 824 <_sk_store_a8_hsw+0x91>
+ 76,141,21,66,0,0,0, //lea 0x42(%rip),%r10 # 880 <_sk_store_a8_hsw+0x91>
75,99,4,130, //movslq (%r10,%r8,4),%rax
76,1,208, //add %r10,%rax
255,224, //jmpq *%rax
@@ -7719,7 +7851,7 @@ CODE const uint8_t sk_store_a8_hsw[] = {
196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1)
196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1)
196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1)
- 235,158, //jmp 7c2 <_sk_store_a8_hsw+0x2f>
+ 235,158, //jmp 81e <_sk_store_a8_hsw+0x2f>
247,255, //idiv %edi
255, //(bad)
255, //(bad)
@@ -7748,7 +7880,7 @@ CODE const uint8_t sk_load_565_hsw[] = {
72,173, //lods %ds:(%rsi),%rax
76,139,16, //mov (%rax),%r10
72,133,201, //test %rcx,%rcx
- 117,92, //jne 8a6 <_sk_load_565_hsw+0x66>
+ 117,92, //jne 902 <_sk_load_565_hsw+0x66>
196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0
196,226,125,51,208, //vpmovzxwd %xmm0,%ymm2
196,226,125,88,66,104, //vpbroadcastd 0x68(%rdx),%ymm0
@@ -7775,8 +7907,8 @@ CODE const uint8_t sk_load_565_hsw[] = {
65,254,200, //dec %r8b
69,15,182,192, //movzbl %r8b,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 119,146, //ja 850 <_sk_load_565_hsw+0x10>
- 76,141,13,75,0,0,0, //lea 0x4b(%rip),%r9 # 910 <_sk_load_565_hsw+0xd0>
+ 119,146, //ja 8ac <_sk_load_565_hsw+0x10>
+ 76,141,13,75,0,0,0, //lea 0x4b(%rip),%r9 # 96c <_sk_load_565_hsw+0xd0>
75,99,4,129, //movslq (%r9,%r8,4),%rax
76,1,200, //add %r9,%rax
255,224, //jmpq *%rax
@@ -7788,7 +7920,7 @@ CODE const uint8_t sk_load_565_hsw[] = {
196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0
- 233,66,255,255,255, //jmpq 850 <_sk_load_565_hsw+0x10>
+ 233,66,255,255,255, //jmpq 8ac <_sk_load_565_hsw+0x10>
102,144, //xchg %ax,%ax
242,255, //repnz (bad)
255, //(bad)
@@ -7833,7 +7965,7 @@ CODE const uint8_t sk_store_565_hsw[] = {
196,67,125,57,193,1, //vextracti128 $0x1,%ymm8,%xmm9
196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8
72,133,201, //test %rcx,%rcx
- 117,10, //jne 98e <_sk_store_565_hsw+0x62>
+ 117,10, //jne 9ea <_sk_store_565_hsw+0x62>
196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2)
72,173, //lods %ds:(%rsi),%rax
255,224, //jmpq *%rax
@@ -7842,8 +7974,8 @@ CODE const uint8_t sk_store_565_hsw[] = {
254,200, //dec %al
68,15,182,192, //movzbl %al,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 119,236, //ja 98a <_sk_store_565_hsw+0x5e>
- 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # 9ec <_sk_store_565_hsw+0xc0>
+ 119,236, //ja 9e6 <_sk_store_565_hsw+0x5e>
+ 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # a48 <_sk_store_565_hsw+0xc0>
75,99,4,130, //movslq (%r10,%r8,4),%rax
76,1,208, //add %r10,%rax
255,224, //jmpq *%rax
@@ -7855,7 +7987,7 @@ CODE const uint8_t sk_store_565_hsw[] = {
196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2)
197,121,126,192, //vmovd %xmm8,%eax
102,65,137,4,121, //mov %ax,(%r9,%rdi,2)
- 235,161, //jmp 98a <_sk_store_565_hsw+0x5e>
+ 235,161, //jmp 9e6 <_sk_store_565_hsw+0x5e>
15,31,0, //nopl (%rax)
242,255, //repnz (bad)
255, //(bad)
@@ -7887,21 +8019,25 @@ CODE const uint8_t sk_load_8888_hsw[] = {
76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9
76,3,8, //add (%rax),%r9
77,133,192, //test %r8,%r8
- 117,85, //jne a72 <_sk_load_8888_hsw+0x6a>
+ 117,104, //jne ae1 <_sk_load_8888_hsw+0x7d>
196,193,126,111,25, //vmovdqu (%r9),%ymm3
- 196,226,125,88,82,16, //vpbroadcastd 0x10(%rdx),%ymm2
+ 184,255,0,0,0, //mov $0xff,%eax
+ 197,249,110,192, //vmovd %eax,%xmm0
+ 196,226,125,88,208, //vpbroadcastd %xmm0,%ymm2
197,237,219,195, //vpand %ymm3,%ymm2,%ymm0
197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
- 196,98,125,24,66,12, //vbroadcastss 0xc(%rdx),%ymm8
- 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 197,249,110,200, //vmovd %eax,%xmm1
+ 196,98,125,24,193, //vbroadcastss %xmm1,%ymm8
+ 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0
197,245,114,211,8, //vpsrld $0x8,%ymm3,%ymm1
197,237,219,201, //vpand %ymm1,%ymm2,%ymm1
197,252,91,201, //vcvtdq2ps %ymm1,%ymm1
- 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
+ 196,193,116,89,200, //vmulps %ymm8,%ymm1,%ymm1
197,181,114,211,16, //vpsrld $0x10,%ymm3,%ymm9
196,193,109,219,209, //vpand %ymm9,%ymm2,%ymm2
197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
- 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
+ 196,193,108,89,208, //vmulps %ymm8,%ymm2,%ymm2
197,229,114,211,24, //vpsrld $0x18,%ymm3,%ymm3
197,252,91,219, //vcvtdq2ps %ymm3,%ymm3
196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3
@@ -7916,7 +8052,7 @@ CODE const uint8_t sk_load_8888_hsw[] = {
196,225,249,110,192, //vmovq %rax,%xmm0
196,226,125,33,192, //vpmovsxbd %xmm0,%ymm0
196,194,125,140,25, //vpmaskmovd (%r9),%ymm0,%ymm3
- 235,138, //jmp a22 <_sk_load_8888_hsw+0x1a>
+ 233,116,255,255,255, //jmpq a7e <_sk_load_8888_hsw+0x1a>
};
CODE const uint8_t sk_store_8888_hsw[] = {
@@ -7924,7 +8060,9 @@ CODE const uint8_t sk_store_8888_hsw[] = {
72,173, //lods %ds:(%rsi),%rax
76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9
76,3,8, //add (%rax),%r9
- 196,98,125,24,66,8, //vbroadcastss 0x8(%rdx),%ymm8
+ 184,0,0,127,67, //mov $0x437f0000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8
197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9
196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9
197,60,89,209, //vmulps %ymm1,%ymm8,%ymm10
@@ -7940,7 +8078,7 @@ CODE const uint8_t sk_store_8888_hsw[] = {
196,65,45,235,192, //vpor %ymm8,%ymm10,%ymm8
196,65,53,235,192, //vpor %ymm8,%ymm9,%ymm8
77,133,192, //test %r8,%r8
- 117,12, //jne b04 <_sk_store_8888_hsw+0x6c>
+ 117,12, //jne b7e <_sk_store_8888_hsw+0x74>
196,65,126,127,1, //vmovdqu %ymm8,(%r9)
72,173, //lods %ds:(%rsi),%rax
76,137,193, //mov %r8,%rcx
@@ -7953,14 +8091,14 @@ CODE const uint8_t sk_store_8888_hsw[] = {
196,97,249,110,200, //vmovq %rax,%xmm9
196,66,125,33,201, //vpmovsxbd %xmm9,%ymm9
196,66,53,142,1, //vpmaskmovd %ymm8,%ymm9,(%r9)
- 235,211, //jmp afd <_sk_store_8888_hsw+0x65>
+ 235,211, //jmp b77 <_sk_store_8888_hsw+0x6d>
};
CODE const uint8_t sk_load_f16_hsw[] = {
72,173, //lods %ds:(%rsi),%rax
72,139,0, //mov (%rax),%rax
72,133,201, //test %rcx,%rcx
- 117,97, //jne b95 <_sk_load_f16_hsw+0x6b>
+ 117,97, //jne c0f <_sk_load_f16_hsw+0x6b>
197,249,16,12,248, //vmovupd (%rax,%rdi,8),%xmm1
197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2
197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3
@@ -7986,35 +8124,35 @@ CODE const uint8_t sk_load_f16_hsw[] = {
197,251,16,12,248, //vmovsd (%rax,%rdi,8),%xmm1
196,65,57,87,192, //vxorpd %xmm8,%xmm8,%xmm8
72,131,249,1, //cmp $0x1,%rcx
- 117,6, //jne bab <_sk_load_f16_hsw+0x81>
+ 117,6, //jne c25 <_sk_load_f16_hsw+0x81>
197,250,126,201, //vmovq %xmm1,%xmm1
- 235,30, //jmp bc9 <_sk_load_f16_hsw+0x9f>
+ 235,30, //jmp c43 <_sk_load_f16_hsw+0x9f>
197,241,22,76,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm1,%xmm1
72,131,249,3, //cmp $0x3,%rcx
- 114,18, //jb bc9 <_sk_load_f16_hsw+0x9f>
+ 114,18, //jb c43 <_sk_load_f16_hsw+0x9f>
197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2
72,131,249,3, //cmp $0x3,%rcx
- 117,19, //jne bd6 <_sk_load_f16_hsw+0xac>
+ 117,19, //jne c50 <_sk_load_f16_hsw+0xac>
197,250,126,210, //vmovq %xmm2,%xmm2
- 235,46, //jmp bf7 <_sk_load_f16_hsw+0xcd>
+ 235,46, //jmp c71 <_sk_load_f16_hsw+0xcd>
197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3
197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2
- 233,117,255,255,255, //jmpq b4b <_sk_load_f16_hsw+0x21>
+ 233,117,255,255,255, //jmpq bc5 <_sk_load_f16_hsw+0x21>
197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2
72,131,249,5, //cmp $0x5,%rcx
- 114,21, //jb bf7 <_sk_load_f16_hsw+0xcd>
+ 114,21, //jb c71 <_sk_load_f16_hsw+0xcd>
197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3
72,131,249,5, //cmp $0x5,%rcx
- 117,18, //jne c00 <_sk_load_f16_hsw+0xd6>
+ 117,18, //jne c7a <_sk_load_f16_hsw+0xd6>
197,250,126,219, //vmovq %xmm3,%xmm3
- 233,84,255,255,255, //jmpq b4b <_sk_load_f16_hsw+0x21>
+ 233,84,255,255,255, //jmpq bc5 <_sk_load_f16_hsw+0x21>
197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3
- 233,75,255,255,255, //jmpq b4b <_sk_load_f16_hsw+0x21>
+ 233,75,255,255,255, //jmpq bc5 <_sk_load_f16_hsw+0x21>
197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3
72,131,249,7, //cmp $0x7,%rcx
- 15,130,59,255,255,255, //jb b4b <_sk_load_f16_hsw+0x21>
+ 15,130,59,255,255,255, //jb bc5 <_sk_load_f16_hsw+0x21>
197,123,16,68,248,48, //vmovsd 0x30(%rax,%rdi,8),%xmm8
- 233,48,255,255,255, //jmpq b4b <_sk_load_f16_hsw+0x21>
+ 233,48,255,255,255, //jmpq bc5 <_sk_load_f16_hsw+0x21>
};
CODE const uint8_t sk_store_f16_hsw[] = {
@@ -8033,7 +8171,7 @@ CODE const uint8_t sk_store_f16_hsw[] = {
196,65,57,98,205, //vpunpckldq %xmm13,%xmm8,%xmm9
196,65,57,106,197, //vpunpckhdq %xmm13,%xmm8,%xmm8
72,133,201, //test %rcx,%rcx
- 117,27, //jne c80 <_sk_store_f16_hsw+0x65>
+ 117,27, //jne cfa <_sk_store_f16_hsw+0x65>
197,120,17,28,248, //vmovups %xmm11,(%rax,%rdi,8)
197,120,17,84,248,16, //vmovups %xmm10,0x10(%rax,%rdi,8)
197,120,17,76,248,32, //vmovups %xmm9,0x20(%rax,%rdi,8)
@@ -8042,22 +8180,22 @@ CODE const uint8_t sk_store_f16_hsw[] = {
255,224, //jmpq *%rax
197,121,214,28,248, //vmovq %xmm11,(%rax,%rdi,8)
72,131,249,1, //cmp $0x1,%rcx
- 116,241, //je c7c <_sk_store_f16_hsw+0x61>
+ 116,241, //je cf6 <_sk_store_f16_hsw+0x61>
197,121,23,92,248,8, //vmovhpd %xmm11,0x8(%rax,%rdi,8)
72,131,249,3, //cmp $0x3,%rcx
- 114,229, //jb c7c <_sk_store_f16_hsw+0x61>
+ 114,229, //jb cf6 <_sk_store_f16_hsw+0x61>
197,121,214,84,248,16, //vmovq %xmm10,0x10(%rax,%rdi,8)
- 116,221, //je c7c <_sk_store_f16_hsw+0x61>
+ 116,221, //je cf6 <_sk_store_f16_hsw+0x61>
197,121,23,84,248,24, //vmovhpd %xmm10,0x18(%rax,%rdi,8)
72,131,249,5, //cmp $0x5,%rcx
- 114,209, //jb c7c <_sk_store_f16_hsw+0x61>
+ 114,209, //jb cf6 <_sk_store_f16_hsw+0x61>
197,121,214,76,248,32, //vmovq %xmm9,0x20(%rax,%rdi,8)
- 116,201, //je c7c <_sk_store_f16_hsw+0x61>
+ 116,201, //je cf6 <_sk_store_f16_hsw+0x61>
197,121,23,76,248,40, //vmovhpd %xmm9,0x28(%rax,%rdi,8)
72,131,249,7, //cmp $0x7,%rcx
- 114,189, //jb c7c <_sk_store_f16_hsw+0x61>
+ 114,189, //jb cf6 <_sk_store_f16_hsw+0x61>
197,121,214,68,248,48, //vmovq %xmm8,0x30(%rax,%rdi,8)
- 235,181, //jmp c7c <_sk_store_f16_hsw+0x61>
+ 235,181, //jmp cf6 <_sk_store_f16_hsw+0x61>
};
CODE const uint8_t sk_store_f32_hsw[] = {
@@ -8073,7 +8211,7 @@ CODE const uint8_t sk_store_f32_hsw[] = {
196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8
196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11
72,133,201, //test %rcx,%rcx
- 117,55, //jne d34 <_sk_store_f32_hsw+0x6d>
+ 117,55, //jne dae <_sk_store_f32_hsw+0x6d>
196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12
196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13
196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9
@@ -8086,22 +8224,22 @@ CODE const uint8_t sk_store_f32_hsw[] = {
255,224, //jmpq *%rax
196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4)
72,131,249,1, //cmp $0x1,%rcx
- 116,240, //je d30 <_sk_store_f32_hsw+0x69>
+ 116,240, //je daa <_sk_store_f32_hsw+0x69>
196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4)
72,131,249,3, //cmp $0x3,%rcx
- 114,227, //jb d30 <_sk_store_f32_hsw+0x69>
+ 114,227, //jb daa <_sk_store_f32_hsw+0x69>
196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4)
- 116,218, //je d30 <_sk_store_f32_hsw+0x69>
+ 116,218, //je daa <_sk_store_f32_hsw+0x69>
196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4)
72,131,249,5, //cmp $0x5,%rcx
- 114,205, //jb d30 <_sk_store_f32_hsw+0x69>
+ 114,205, //jb daa <_sk_store_f32_hsw+0x69>
196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4)
- 116,195, //je d30 <_sk_store_f32_hsw+0x69>
+ 116,195, //je daa <_sk_store_f32_hsw+0x69>
196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4)
72,131,249,7, //cmp $0x7,%rcx
- 114,181, //jb d30 <_sk_store_f32_hsw+0x69>
+ 114,181, //jb daa <_sk_store_f32_hsw+0x69>
196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4)
- 235,171, //jmp d30 <_sk_store_f32_hsw+0x69>
+ 235,171, //jmp daa <_sk_store_f32_hsw+0x69>
};
CODE const uint8_t sk_clamp_x_hsw[] = {
@@ -8443,13 +8581,19 @@ CODE const uint8_t sk_seed_shader_avx[] = {
197,249,112,192,0, //vpshufd $0x0,%xmm0,%xmm0
196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
- 196,226,125,24,74,4, //vbroadcastss 0x4(%rdx),%ymm1
+ 65,184,0,0,0,63, //mov $0x3f000000,%r8d
+ 196,193,121,110,200, //vmovd %r8d,%xmm1
+ 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1
+ 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1
197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0
197,252,88,66,20, //vaddps 0x14(%rdx),%ymm0,%ymm0
196,226,125,24,16, //vbroadcastss (%rax),%ymm2
197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1
- 196,226,125,24,18, //vbroadcastss (%rdx),%ymm2
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,249,110,208, //vmovd %eax,%xmm2
+ 196,227,121,4,210,0, //vpermilps $0x0,%xmm2,%xmm2
+ 196,227,109,24,210,1, //vinsertf128 $0x1,%xmm2,%ymm2,%ymm2
72,173, //lods %ds:(%rsi),%rax
197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3
197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4
@@ -8488,7 +8632,10 @@ CODE const uint8_t sk_plus__avx[] = {
};
CODE const uint8_t sk_srcover_avx[] = {
- 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8
+ 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
197,60,92,195, //vsubps %ymm3,%ymm8,%ymm8
197,60,89,204, //vmulps %ymm4,%ymm8,%ymm9
197,180,88,192, //vaddps %ymm0,%ymm9,%ymm0
@@ -8503,7 +8650,10 @@ CODE const uint8_t sk_srcover_avx[] = {
};
CODE const uint8_t sk_dstover_avx[] = {
- 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8
+ 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
197,60,92,199, //vsubps %ymm7,%ymm8,%ymm8
197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0
@@ -8528,7 +8678,10 @@ CODE const uint8_t sk_clamp_0_avx[] = {
};
CODE const uint8_t sk_clamp_1_avx[] = {
- 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8
+ 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0
196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1
196,193,108,93,208, //vminps %ymm8,%ymm2,%ymm2
@@ -8538,7 +8691,10 @@ CODE const uint8_t sk_clamp_1_avx[] = {
};
CODE const uint8_t sk_clamp_a_avx[] = {
- 196,98,125,24,2, //vbroadcastss (%rdx),%ymm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8
+ 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3
197,252,93,195, //vminps %ymm3,%ymm0,%ymm0
197,244,93,203, //vminps %ymm3,%ymm1,%ymm1
@@ -8610,7 +8766,10 @@ CODE const uint8_t sk_premul_avx[] = {
CODE const uint8_t sk_unpremul_avx[] = {
196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8
196,65,100,194,200,0, //vcmpeqps %ymm8,%ymm3,%ymm9
- 196,98,125,24,18, //vbroadcastss (%rdx),%ymm10
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,121,110,208, //vmovd %eax,%xmm10
+ 196,67,121,4,210,0, //vpermilps $0x0,%xmm10,%xmm10
+ 196,67,45,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm10,%ymm10
197,44,94,211, //vdivps %ymm3,%ymm10,%ymm10
196,67,45,74,192,144, //vblendvps %ymm9,%ymm8,%ymm10,%ymm8
197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
@@ -8715,14 +8874,17 @@ CODE const uint8_t sk_scale_u8_avx[] = {
72,139,0, //mov (%rax),%rax
72,1,248, //add %rdi,%rax
77,133,192, //test %r8,%r8
- 117,65, //jne 50f <_sk_scale_u8_avx+0x51>
+ 117,80, //jne 58f <_sk_scale_u8_avx+0x60>
197,123,16,0, //vmovsd (%rax),%xmm8
196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9
196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8
196,66,121,49,192, //vpmovzxbd %xmm8,%xmm8
196,67,53,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm9,%ymm8
196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
- 196,98,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm9
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 197,121,110,200, //vmovd %eax,%xmm9
+ 196,67,121,4,201,0, //vpermilps $0x0,%xmm9,%xmm9
+ 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9
196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8
197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
@@ -8740,9 +8902,9 @@ CODE const uint8_t sk_scale_u8_avx[] = {
77,9,217, //or %r11,%r9
72,131,193,8, //add $0x8,%rcx
73,255,202, //dec %r10
- 117,234, //jne 517 <_sk_scale_u8_avx+0x59>
+ 117,234, //jne 597 <_sk_scale_u8_avx+0x68>
196,65,249,110,193, //vmovq %r9,%xmm8
- 235,158, //jmp 4d2 <_sk_scale_u8_avx+0x14>
+ 235,143, //jmp 543 <_sk_scale_u8_avx+0x14>
};
CODE const uint8_t sk_lerp_1_float_avx[] = {
@@ -8770,14 +8932,17 @@ CODE const uint8_t sk_lerp_u8_avx[] = {
72,139,0, //mov (%rax),%rax
72,1,248, //add %rdi,%rax
77,133,192, //test %r8,%r8
- 117,101, //jne 5e8 <_sk_lerp_u8_avx+0x75>
+ 117,116, //jne 677 <_sk_lerp_u8_avx+0x84>
197,123,16,0, //vmovsd (%rax),%xmm8
196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9
196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8
196,66,121,49,192, //vpmovzxbd %xmm8,%xmm8
196,67,53,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm9,%ymm8
196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8
- 196,98,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm9
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 197,121,110,200, //vmovd %eax,%xmm9
+ 196,67,121,4,201,0, //vpermilps $0x0,%xmm9,%xmm9
+ 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9
196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8
197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0
196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0
@@ -8803,16 +8968,16 @@ CODE const uint8_t sk_lerp_u8_avx[] = {
77,9,217, //or %r11,%r9
72,131,193,8, //add $0x8,%rcx
73,255,202, //dec %r10
- 117,234, //jne 5f0 <_sk_lerp_u8_avx+0x7d>
+ 117,234, //jne 67f <_sk_lerp_u8_avx+0x8c>
196,65,249,110,193, //vmovq %r9,%xmm8
- 233,119,255,255,255, //jmpq 587 <_sk_lerp_u8_avx+0x14>
+ 233,104,255,255,255, //jmpq 607 <_sk_lerp_u8_avx+0x14>
};
CODE const uint8_t sk_lerp_565_avx[] = {
72,173, //lods %ds:(%rsi),%rax
76,139,16, //mov (%rax),%r10
72,133,201, //test %rcx,%rcx
- 15,133,148,0,0,0, //jne 6b2 <_sk_lerp_565_avx+0xa2>
+ 15,133,164,0,0,0, //jne 751 <_sk_lerp_565_avx+0xb2>
196,65,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm8
197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3
197,185,105,219, //vpunpckhwd %xmm3,%xmm8,%xmm3
@@ -8842,7 +9007,10 @@ CODE const uint8_t sk_lerp_565_avx[] = {
197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2
197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2
197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2
- 196,226,125,24,26, //vbroadcastss (%rdx),%ymm3
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 197,249,110,216, //vmovd %eax,%xmm3
+ 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3
+ 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3
72,173, //lods %ds:(%rsi),%rax
255,224, //jmpq *%rax
65,137,200, //mov %ecx,%r8d
@@ -8851,8 +9019,8 @@ CODE const uint8_t sk_lerp_565_avx[] = {
65,254,200, //dec %r8b
69,15,182,192, //movzbl %r8b,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 15,135,85,255,255,255, //ja 624 <_sk_lerp_565_avx+0x14>
- 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # 720 <_sk_lerp_565_avx+0x110>
+ 15,135,69,255,255,255, //ja 6b3 <_sk_lerp_565_avx+0x14>
+ 76,141,13,75,0,0,0, //lea 0x4b(%rip),%r9 # 7c0 <_sk_lerp_565_avx+0x121>
75,99,4,129, //movslq (%r9,%r8,4),%rax
76,1,200, //add %r9,%rax
255,224, //jmpq *%rax
@@ -8864,27 +9032,28 @@ CODE const uint8_t sk_lerp_565_avx[] = {
196,65,57,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm8,%xmm8
196,65,57,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm8,%xmm8
196,65,57,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm8,%xmm8
- 233,5,255,255,255, //jmpq 624 <_sk_lerp_565_avx+0x14>
- 144, //nop
- 243,255, //repz (bad)
+ 233,245,254,255,255, //jmpq 6b3 <_sk_lerp_565_avx+0x14>
+ 102,144, //xchg %ax,%ax
+ 242,255, //repnz (bad)
255, //(bad)
255, //(bad)
- 235,255, //jmp 725 <_sk_lerp_565_avx+0x115>
+ 234, //(bad)
255, //(bad)
- 255,227, //jmpq *%rbx
255, //(bad)
+ 255,226, //jmpq *%rdx
255, //(bad)
255, //(bad)
- 219,255, //(bad)
255, //(bad)
- 255,211, //callq *%rbx
+ 218,255, //(bad)
255, //(bad)
+ 255,210, //callq *%rdx
255, //(bad)
- 255,203, //dec %ebx
255, //(bad)
+ 255,202, //dec %edx
255, //(bad)
255, //(bad)
- 191, //.byte 0xbf
+ 255, //(bad)
+ 190, //.byte 0xbe
255, //(bad)
255, //(bad)
255, //.byte 0xff
@@ -8900,7 +9069,7 @@ CODE const uint8_t sk_load_tables_avx[] = {
72,173, //lods %ds:(%rsi),%rax
76,139,0, //mov (%rax),%r8
72,133,201, //test %rcx,%rcx
- 15,133,18,2,0,0, //jne 966 <_sk_load_tables_avx+0x22a>
+ 15,133,18,2,0,0, //jne a06 <_sk_load_tables_avx+0x22a>
196,65,124,16,4,184, //vmovups (%r8,%rdi,4),%ymm8
196,98,125,24,74,16, //vbroadcastss 0x10(%rdx),%ymm9
196,193,52,84,192, //vandps %ymm8,%ymm9,%ymm0
@@ -9012,8 +9181,8 @@ CODE const uint8_t sk_load_tables_avx[] = {
65,254,201, //dec %r9b
69,15,182,201, //movzbl %r9b,%r9d
65,128,249,6, //cmp $0x6,%r9b
- 15,135,215,253,255,255, //ja 75a <_sk_load_tables_avx+0x1e>
- 76,141,21,138,0,0,0, //lea 0x8a(%rip),%r10 # a14 <_sk_load_tables_avx+0x2d8>
+ 15,135,215,253,255,255, //ja 7fa <_sk_load_tables_avx+0x1e>
+ 76,141,21,138,0,0,0, //lea 0x8a(%rip),%r10 # ab4 <_sk_load_tables_avx+0x2d8>
79,99,12,138, //movslq (%r10,%r9,4),%r9
77,1,209, //add %r10,%r9
65,255,225, //jmpq *%r9
@@ -9036,7 +9205,7 @@ CODE const uint8_t sk_load_tables_avx[] = {
196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8
196,195,57,34,4,184,0, //vpinsrd $0x0,(%r8,%rdi,4),%xmm8,%xmm0
196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8
- 233,70,253,255,255, //jmpq 75a <_sk_load_tables_avx+0x1e>
+ 233,70,253,255,255, //jmpq 7fa <_sk_load_tables_avx+0x1e>
238, //out %al,(%dx)
255, //(bad)
255, //(bad)
@@ -9063,7 +9232,7 @@ CODE const uint8_t sk_load_a8_avx[] = {
72,139,0, //mov (%rax),%rax
72,1,248, //add %rdi,%rax
77,133,192, //test %r8,%r8
- 117,59, //jne a7b <_sk_load_a8_avx+0x4b>
+ 117,59, //jne b1b <_sk_load_a8_avx+0x4b>
197,251,16,0, //vmovsd (%rax),%xmm0
196,226,121,49,200, //vpmovzxbd %xmm0,%xmm1
196,227,121,4,192,229, //vpermilps $0xe5,%xmm0,%xmm0
@@ -9087,9 +9256,9 @@ CODE const uint8_t sk_load_a8_avx[] = {
77,9,217, //or %r11,%r9
72,131,193,8, //add $0x8,%rcx
73,255,202, //dec %r10
- 117,234, //jne a83 <_sk_load_a8_avx+0x53>
+ 117,234, //jne b23 <_sk_load_a8_avx+0x53>
196,193,249,110,193, //vmovq %r9,%xmm0
- 235,164, //jmp a44 <_sk_load_a8_avx+0x14>
+ 235,164, //jmp ae4 <_sk_load_a8_avx+0x14>
};
CODE const uint8_t sk_store_a8_avx[] = {
@@ -9102,7 +9271,7 @@ CODE const uint8_t sk_store_a8_avx[] = {
196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8
196,65,57,103,192, //vpackuswb %xmm8,%xmm8,%xmm8
72,133,201, //test %rcx,%rcx
- 117,10, //jne ad3 <_sk_store_a8_avx+0x33>
+ 117,10, //jne b73 <_sk_store_a8_avx+0x33>
196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1)
72,173, //lods %ds:(%rsi),%rax
255,224, //jmpq *%rax
@@ -9111,9 +9280,9 @@ CODE const uint8_t sk_store_a8_avx[] = {
254,200, //dec %al
68,15,182,192, //movzbl %al,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 119,236, //ja acf <_sk_store_a8_avx+0x2f>
+ 119,236, //ja b6f <_sk_store_a8_avx+0x2f>
196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8
- 76,141,21,69,0,0,0, //lea 0x45(%rip),%r10 # b34 <_sk_store_a8_avx+0x94>
+ 76,141,21,69,0,0,0, //lea 0x45(%rip),%r10 # bd4 <_sk_store_a8_avx+0x94>
75,99,4,130, //movslq (%r10,%r8,4),%rax
76,1,208, //add %r10,%rax
255,224, //jmpq *%rax
@@ -9124,7 +9293,7 @@ CODE const uint8_t sk_store_a8_avx[] = {
196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1)
196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1)
196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1)
- 235,158, //jmp acf <_sk_store_a8_avx+0x2f>
+ 235,158, //jmp b6f <_sk_store_a8_avx+0x2f>
15,31,0, //nopl (%rax)
244, //hlt
255, //(bad)
@@ -9155,7 +9324,7 @@ CODE const uint8_t sk_load_565_avx[] = {
72,173, //lods %ds:(%rsi),%rax
76,139,16, //mov (%rax),%r10
72,133,201, //test %rcx,%rcx
- 117,106, //jne bc4 <_sk_load_565_avx+0x74>
+ 117,106, //jne c64 <_sk_load_565_avx+0x74>
196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0
197,241,239,201, //vpxor %xmm1,%xmm1,%xmm1
197,249,105,201, //vpunpckhwd %xmm1,%xmm0,%xmm1
@@ -9185,8 +9354,8 @@ CODE const uint8_t sk_load_565_avx[] = {
65,254,200, //dec %r8b
69,15,182,192, //movzbl %r8b,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 119,132, //ja b60 <_sk_load_565_avx+0x10>
- 76,141,13,73,0,0,0, //lea 0x49(%rip),%r9 # c2c <_sk_load_565_avx+0xdc>
+ 119,132, //ja c00 <_sk_load_565_avx+0x10>
+ 76,141,13,73,0,0,0, //lea 0x49(%rip),%r9 # ccc <_sk_load_565_avx+0xdc>
75,99,4,129, //movslq (%r9,%r8,4),%rax
76,1,200, //add %r9,%rax
255,224, //jmpq *%rax
@@ -9198,7 +9367,7 @@ CODE const uint8_t sk_load_565_avx[] = {
196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0
- 233,52,255,255,255, //jmpq b60 <_sk_load_565_avx+0x10>
+ 233,52,255,255,255, //jmpq c00 <_sk_load_565_avx+0x10>
244, //hlt
255, //(bad)
255, //(bad)
@@ -9248,7 +9417,7 @@ CODE const uint8_t sk_store_565_avx[] = {
196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9
196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8
72,133,201, //test %rcx,%rcx
- 117,10, //jne cce <_sk_store_565_avx+0x86>
+ 117,10, //jne d6e <_sk_store_565_avx+0x86>
196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2)
72,173, //lods %ds:(%rsi),%rax
255,224, //jmpq *%rax
@@ -9257,8 +9426,8 @@ CODE const uint8_t sk_store_565_avx[] = {
254,200, //dec %al
68,15,182,192, //movzbl %al,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 119,236, //ja cca <_sk_store_565_avx+0x82>
- 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # d2c <_sk_store_565_avx+0xe4>
+ 119,236, //ja d6a <_sk_store_565_avx+0x82>
+ 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # dcc <_sk_store_565_avx+0xe4>
75,99,4,130, //movslq (%r10,%r8,4),%rax
76,1,208, //add %r10,%rax
255,224, //jmpq *%rax
@@ -9270,7 +9439,7 @@ CODE const uint8_t sk_store_565_avx[] = {
196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2)
197,121,126,192, //vmovd %xmm8,%eax
102,65,137,4,121, //mov %ax,(%r9,%rdi,2)
- 235,161, //jmp cca <_sk_store_565_avx+0x82>
+ 235,161, //jmp d6a <_sk_store_565_avx+0x82>
15,31,0, //nopl (%rax)
242,255, //repnz (bad)
255, //(bad)
@@ -9300,26 +9469,32 @@ CODE const uint8_t sk_load_8888_avx[] = {
72,173, //lods %ds:(%rsi),%rax
76,139,16, //mov (%rax),%r10
72,133,201, //test %rcx,%rcx
- 117,125, //jne dcf <_sk_load_8888_avx+0x87>
+ 15,133,157,0,0,0, //jne e93 <_sk_load_8888_avx+0xab>
196,65,124,16,12,186, //vmovups (%r10,%rdi,4),%ymm9
- 196,98,125,24,90,16, //vbroadcastss 0x10(%rdx),%ymm11
+ 184,255,0,0,0, //mov $0xff,%eax
+ 197,249,110,192, //vmovd %eax,%xmm0
+ 197,249,112,192,0, //vpshufd $0x0,%xmm0,%xmm0
+ 196,99,125,24,216,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm11
196,193,36,84,193, //vandps %ymm9,%ymm11,%ymm0
197,252,91,192, //vcvtdq2ps %ymm0,%ymm0
- 196,98,125,24,66,12, //vbroadcastss 0xc(%rdx),%ymm8
- 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 197,249,110,200, //vmovd %eax,%xmm1
+ 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1
+ 196,99,117,24,193,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm8
+ 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0
196,193,41,114,209,8, //vpsrld $0x8,%xmm9,%xmm10
196,99,125,25,203,1, //vextractf128 $0x1,%ymm9,%xmm3
197,241,114,211,8, //vpsrld $0x8,%xmm3,%xmm1
196,227,45,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm10,%ymm1
197,164,84,201, //vandps %ymm1,%ymm11,%ymm1
197,252,91,201, //vcvtdq2ps %ymm1,%ymm1
- 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1
+ 196,193,116,89,200, //vmulps %ymm8,%ymm1,%ymm1
196,193,41,114,209,16, //vpsrld $0x10,%xmm9,%xmm10
197,233,114,211,16, //vpsrld $0x10,%xmm3,%xmm2
196,227,45,24,210,1, //vinsertf128 $0x1,%xmm2,%ymm10,%ymm2
197,164,84,210, //vandps %ymm2,%ymm11,%ymm2
197,252,91,210, //vcvtdq2ps %ymm2,%ymm2
- 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2
+ 196,193,108,89,208, //vmulps %ymm8,%ymm2,%ymm2
196,193,49,114,209,24, //vpsrld $0x18,%xmm9,%xmm9
197,225,114,211,24, //vpsrld $0x18,%xmm3,%xmm3
196,227,53,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm9,%ymm3
@@ -9333,8 +9508,8 @@ CODE const uint8_t sk_load_8888_avx[] = {
65,254,200, //dec %r8b
69,15,182,192, //movzbl %r8b,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 15,135,108,255,255,255, //ja d58 <_sk_load_8888_avx+0x10>
- 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # e7c <_sk_load_8888_avx+0x134>
+ 15,135,76,255,255,255, //ja dfc <_sk_load_8888_avx+0x14>
+ 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # f40 <_sk_load_8888_avx+0x158>
75,99,4,129, //movslq (%r9,%r8,4),%rax
76,1,200, //add %r9,%rax
255,224, //jmpq *%rax
@@ -9357,7 +9532,7 @@ CODE const uint8_t sk_load_8888_avx[] = {
196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9
196,195,49,34,4,186,0, //vpinsrd $0x0,(%r10,%rdi,4),%xmm9,%xmm0
196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9
- 233,220,254,255,255, //jmpq d58 <_sk_load_8888_avx+0x10>
+ 233,188,254,255,255, //jmpq dfc <_sk_load_8888_avx+0x14>
238, //out %al,(%dx)
255, //(bad)
255, //(bad)
@@ -9381,7 +9556,10 @@ CODE const uint8_t sk_load_8888_avx[] = {
CODE const uint8_t sk_store_8888_avx[] = {
72,173, //lods %ds:(%rsi),%rax
76,139,8, //mov (%rax),%r9
- 196,98,125,24,66,8, //vbroadcastss 0x8(%rdx),%ymm8
+ 184,0,0,127,67, //mov $0x437f0000,%eax
+ 197,121,110,192, //vmovd %eax,%xmm8
+ 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8
+ 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9
196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9
197,60,89,209, //vmulps %ymm1,%ymm8,%ymm10
@@ -9406,7 +9584,7 @@ CODE const uint8_t sk_store_8888_avx[] = {
196,65,45,86,192, //vorpd %ymm8,%ymm10,%ymm8
196,65,53,86,192, //vorpd %ymm8,%ymm9,%ymm8
72,133,201, //test %rcx,%rcx
- 117,10, //jne f2d <_sk_store_8888_avx+0x95>
+ 117,10, //jne 1000 <_sk_store_8888_avx+0xa4>
196,65,124,17,4,185, //vmovups %ymm8,(%r9,%rdi,4)
72,173, //lods %ds:(%rsi),%rax
255,224, //jmpq *%rax
@@ -9415,8 +9593,8 @@ CODE const uint8_t sk_store_8888_avx[] = {
254,200, //dec %al
68,15,182,192, //movzbl %al,%r8d
65,128,248,6, //cmp $0x6,%r8b
- 119,236, //ja f29 <_sk_store_8888_avx+0x91>
- 76,141,21,84,0,0,0, //lea 0x54(%rip),%r10 # f98 <_sk_store_8888_avx+0x100>
+ 119,236, //ja ffc <_sk_store_8888_avx+0xa0>
+ 76,141,21,85,0,0,0, //lea 0x55(%rip),%r10 # 106c <_sk_store_8888_avx+0x110>
75,99,4,130, //movslq (%r10,%r8,4),%rax
76,1,208, //add %r10,%rax
255,224, //jmpq *%rax
@@ -9430,28 +9608,29 @@ CODE const uint8_t sk_store_8888_avx[] = {
196,67,121,22,68,185,8,2, //vpextrd $0x2,%xmm8,0x8(%r9,%rdi,4)
196,67,121,22,68,185,4,1, //vpextrd $0x1,%xmm8,0x4(%r9,%rdi,4)
196,65,121,126,4,185, //vmovd %xmm8,(%r9,%rdi,4)
- 235,147, //jmp f29 <_sk_store_8888_avx+0x91>
- 102,144, //xchg %ax,%ax
- 246,255, //idiv %bh
+ 235,147, //jmp ffc <_sk_store_8888_avx+0xa0>
+ 15,31,0, //nopl (%rax)
+ 245, //cmc
255, //(bad)
255, //(bad)
- 238, //out %al,(%dx)
255, //(bad)
+ 237, //in (%dx),%eax
255, //(bad)
- 255,230, //jmpq *%rsi
255, //(bad)
+ 255,229, //jmpq *%rbp
255, //(bad)
255, //(bad)
- 222,255, //fdivrp %st,%st(7)
255, //(bad)
- 255,209, //callq *%rcx
+ 221,255, //(bad)
255, //(bad)
+ 255,208, //callq *%rax
255, //(bad)
- 255,195, //inc %ebx
+ 255, //(bad)
+ 255,194, //inc %edx
255, //(bad)
255, //(bad)
255, //.byte 0xff
- 181,255, //mov $0xff,%ch
+ 180,255, //mov $0xff,%ah
255, //(bad)
255, //.byte 0xff
};
@@ -9460,7 +9639,7 @@ CODE const uint8_t sk_load_f16_avx[] = {
72,173, //lods %ds:(%rsi),%rax
72,139,0, //mov (%rax),%rax
72,133,201, //test %rcx,%rcx
- 15,133,240,0,0,0, //jne 10b2 <_sk_load_f16_avx+0xfe>
+ 15,133,240,0,0,0, //jne 1186 <_sk_load_f16_avx+0xfe>
197,249,16,12,248, //vmovupd (%rax,%rdi,8),%xmm1
197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2
197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3
@@ -9514,35 +9693,35 @@ CODE const uint8_t sk_load_f16_avx[] = {
197,251,16,12,248, //vmovsd (%rax,%rdi,8),%xmm1
196,65,57,87,192, //vxorpd %xmm8,%xmm8,%xmm8
72,131,249,1, //cmp $0x1,%rcx
- 117,6, //jne 10c8 <_sk_load_f16_avx+0x114>
+ 117,6, //jne 119c <_sk_load_f16_avx+0x114>
197,250,126,201, //vmovq %xmm1,%xmm1
- 235,30, //jmp 10e6 <_sk_load_f16_avx+0x132>
+ 235,30, //jmp 11ba <_sk_load_f16_avx+0x132>
197,241,22,76,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm1,%xmm1
72,131,249,3, //cmp $0x3,%rcx
- 114,18, //jb 10e6 <_sk_load_f16_avx+0x132>
+ 114,18, //jb 11ba <_sk_load_f16_avx+0x132>
197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2
72,131,249,3, //cmp $0x3,%rcx
- 117,19, //jne 10f3 <_sk_load_f16_avx+0x13f>
+ 117,19, //jne 11c7 <_sk_load_f16_avx+0x13f>
197,250,126,210, //vmovq %xmm2,%xmm2
- 235,46, //jmp 1114 <_sk_load_f16_avx+0x160>
+ 235,46, //jmp 11e8 <_sk_load_f16_avx+0x160>
197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3
197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2
- 233,230,254,255,255, //jmpq fd9 <_sk_load_f16_avx+0x25>
+ 233,230,254,255,255, //jmpq 10ad <_sk_load_f16_avx+0x25>
197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2
72,131,249,5, //cmp $0x5,%rcx
- 114,21, //jb 1114 <_sk_load_f16_avx+0x160>
+ 114,21, //jb 11e8 <_sk_load_f16_avx+0x160>
197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3
72,131,249,5, //cmp $0x5,%rcx
- 117,18, //jne 111d <_sk_load_f16_avx+0x169>
+ 117,18, //jne 11f1 <_sk_load_f16_avx+0x169>
197,250,126,219, //vmovq %xmm3,%xmm3
- 233,197,254,255,255, //jmpq fd9 <_sk_load_f16_avx+0x25>
+ 233,197,254,255,255, //jmpq 10ad <_sk_load_f16_avx+0x25>
197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3
- 233,188,254,255,255, //jmpq fd9 <_sk_load_f16_avx+0x25>
+ 233,188,254,255,255, //jmpq 10ad <_sk_load_f16_avx+0x25>
197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3
72,131,249,7, //cmp $0x7,%rcx
- 15,130,172,254,255,255, //jb fd9 <_sk_load_f16_avx+0x25>
+ 15,130,172,254,255,255, //jb 10ad <_sk_load_f16_avx+0x25>
197,123,16,68,248,48, //vmovsd 0x30(%rax,%rdi,8),%xmm8
- 233,161,254,255,255, //jmpq fd9 <_sk_load_f16_avx+0x25>
+ 233,161,254,255,255, //jmpq 10ad <_sk_load_f16_avx+0x25>
};
CODE const uint8_t sk_store_f16_avx[] = {
@@ -9578,7 +9757,7 @@ CODE const uint8_t sk_store_f16_avx[] = {
196,65,25,98,205, //vpunpckldq %xmm13,%xmm12,%xmm9
196,65,25,106,197, //vpunpckhdq %xmm13,%xmm12,%xmm8
72,133,201, //test %rcx,%rcx
- 117,27, //jne 11fb <_sk_store_f16_avx+0xc3>
+ 117,27, //jne 12cf <_sk_store_f16_avx+0xc3>
197,120,17,28,248, //vmovups %xmm11,(%rax,%rdi,8)
197,120,17,84,248,16, //vmovups %xmm10,0x10(%rax,%rdi,8)
197,120,17,76,248,32, //vmovups %xmm9,0x20(%rax,%rdi,8)
@@ -9587,22 +9766,22 @@ CODE const uint8_t sk_store_f16_avx[] = {
255,224, //jmpq *%rax
197,121,214,28,248, //vmovq %xmm11,(%rax,%rdi,8)
72,131,249,1, //cmp $0x1,%rcx
- 116,241, //je 11f7 <_sk_store_f16_avx+0xbf>
+ 116,241, //je 12cb <_sk_store_f16_avx+0xbf>
197,121,23,92,248,8, //vmovhpd %xmm11,0x8(%rax,%rdi,8)
72,131,249,3, //cmp $0x3,%rcx
- 114,229, //jb 11f7 <_sk_store_f16_avx+0xbf>
+ 114,229, //jb 12cb <_sk_store_f16_avx+0xbf>
197,121,214,84,248,16, //vmovq %xmm10,0x10(%rax,%rdi,8)
- 116,221, //je 11f7 <_sk_store_f16_avx+0xbf>
+ 116,221, //je 12cb <_sk_store_f16_avx+0xbf>
197,121,23,84,248,24, //vmovhpd %xmm10,0x18(%rax,%rdi,8)
72,131,249,5, //cmp $0x5,%rcx
- 114,209, //jb 11f7 <_sk_store_f16_avx+0xbf>
+ 114,209, //jb 12cb <_sk_store_f16_avx+0xbf>
197,121,214,76,248,32, //vmovq %xmm9,0x20(%rax,%rdi,8)
- 116,201, //je 11f7 <_sk_store_f16_avx+0xbf>
+ 116,201, //je 12cb <_sk_store_f16_avx+0xbf>
197,121,23,76,248,40, //vmovhpd %xmm9,0x28(%rax,%rdi,8)
72,131,249,7, //cmp $0x7,%rcx
- 114,189, //jb 11f7 <_sk_store_f16_avx+0xbf>
+ 114,189, //jb 12cb <_sk_store_f16_avx+0xbf>
197,121,214,68,248,48, //vmovq %xmm8,0x30(%rax,%rdi,8)
- 235,181, //jmp 11f7 <_sk_store_f16_avx+0xbf>
+ 235,181, //jmp 12cb <_sk_store_f16_avx+0xbf>
};
CODE const uint8_t sk_store_f32_avx[] = {
@@ -9618,7 +9797,7 @@ CODE const uint8_t sk_store_f32_avx[] = {
196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8
196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11
72,133,201, //test %rcx,%rcx
- 117,55, //jne 12af <_sk_store_f32_avx+0x6d>
+ 117,55, //jne 1383 <_sk_store_f32_avx+0x6d>
196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12
196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13
196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9
@@ -9631,22 +9810,22 @@ CODE const uint8_t sk_store_f32_avx[] = {
255,224, //jmpq *%rax
196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4)
72,131,249,1, //cmp $0x1,%rcx
- 116,240, //je 12ab <_sk_store_f32_avx+0x69>
+ 116,240, //je 137f <_sk_store_f32_avx+0x69>
196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4)
72,131,249,3, //cmp $0x3,%rcx
- 114,227, //jb 12ab <_sk_store_f32_avx+0x69>
+ 114,227, //jb 137f <_sk_store_f32_avx+0x69>
196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4)
- 116,218, //je 12ab <_sk_store_f32_avx+0x69>
+ 116,218, //je 137f <_sk_store_f32_avx+0x69>
196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4)
72,131,249,5, //cmp $0x5,%rcx
- 114,205, //jb 12ab <_sk_store_f32_avx+0x69>
+ 114,205, //jb 137f <_sk_store_f32_avx+0x69>
196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4)
- 116,195, //je 12ab <_sk_store_f32_avx+0x69>
+ 116,195, //je 137f <_sk_store_f32_avx+0x69>
196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4)
72,131,249,7, //cmp $0x7,%rcx
- 114,181, //jb 12ab <_sk_store_f32_avx+0x69>
+ 114,181, //jb 137f <_sk_store_f32_avx+0x69>
196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4)
- 235,171, //jmp 12ab <_sk_store_f32_avx+0x69>
+ 235,171, //jmp 137f <_sk_store_f32_avx+0x69>
};
CODE const uint8_t sk_clamp_x_avx[] = {
@@ -10034,16 +10213,18 @@ CODE const uint8_t sk_seed_shader_sse41[] = {
102,15,110,199, //movd %edi,%xmm0
102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
15,91,200, //cvtdq2ps %xmm0,%xmm1
- 243,15,16,18, //movss (%rdx),%xmm2
- 243,15,16,90,4, //movss 0x4(%rdx),%xmm3
- 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
- 15,88,203, //addps %xmm3,%xmm1
+ 185,0,0,0,63, //mov $0x3f000000,%ecx
+ 102,15,110,209, //movd %ecx,%xmm2
+ 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
+ 15,88,202, //addps %xmm2,%xmm1
15,16,66,20, //movups 0x14(%rdx),%xmm0
15,88,193, //addps %xmm1,%xmm0
102,15,110,8, //movd (%rax),%xmm1
102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1
15,91,201, //cvtdq2ps %xmm1,%xmm1
- 15,88,203, //addps %xmm3,%xmm1
+ 15,88,202, //addps %xmm2,%xmm1
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,15,110,208, //movd %eax,%xmm2
15,198,210,0, //shufps $0x0,%xmm2,%xmm2
72,173, //lods %ds:(%rsi),%rax
15,87,219, //xorps %xmm3,%xmm3
@@ -10087,7 +10268,8 @@ CODE const uint8_t sk_plus__sse41[] = {
};
CODE const uint8_t sk_srcover_sse41[] = {
- 243,68,15,16,2, //movss (%rdx),%xmm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
68,15,92,195, //subps %xmm3,%xmm8
69,15,40,200, //movaps %xmm8,%xmm9
@@ -10106,7 +10288,8 @@ CODE const uint8_t sk_srcover_sse41[] = {
};
CODE const uint8_t sk_dstover_sse41[] = {
- 243,68,15,16,2, //movss (%rdx),%xmm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
68,15,92,199, //subps %xmm7,%xmm8
65,15,89,192, //mulps %xmm8,%xmm0
@@ -10132,7 +10315,8 @@ CODE const uint8_t sk_clamp_0_sse41[] = {
};
CODE const uint8_t sk_clamp_1_sse41[] = {
- 243,68,15,16,2, //movss (%rdx),%xmm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
65,15,93,192, //minps %xmm8,%xmm0
65,15,93,200, //minps %xmm8,%xmm1
@@ -10143,7 +10327,8 @@ CODE const uint8_t sk_clamp_1_sse41[] = {
};
CODE const uint8_t sk_clamp_a_sse41[] = {
- 243,68,15,16,2, //movss (%rdx),%xmm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
65,15,93,216, //minps %xmm8,%xmm3
15,93,195, //minps %xmm3,%xmm0
@@ -10219,7 +10404,8 @@ CODE const uint8_t sk_premul_sse41[] = {
CODE const uint8_t sk_unpremul_sse41[] = {
68,15,40,192, //movaps %xmm0,%xmm8
69,15,87,201, //xorps %xmm9,%xmm9
- 243,68,15,16,18, //movss (%rdx),%xmm10
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,208, //movd %eax,%xmm10
69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
68,15,94,211, //divps %xmm3,%xmm10
15,40,195, //movaps %xmm3,%xmm0
@@ -10373,7 +10559,8 @@ CODE const uint8_t sk_scale_u8_sse41[] = {
72,139,0, //mov (%rax),%rax
102,68,15,56,49,4,56, //pmovzxbd (%rax,%rdi,1),%xmm8
69,15,91,192, //cvtdq2ps %xmm8,%xmm8
- 243,68,15,16,74,12, //movss 0xc(%rdx),%xmm9
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 102,68,15,110,200, //movd %eax,%xmm9
69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
69,15,89,200, //mulps %xmm8,%xmm9
65,15,89,193, //mulps %xmm9,%xmm0
@@ -10409,7 +10596,8 @@ CODE const uint8_t sk_lerp_u8_sse41[] = {
72,139,0, //mov (%rax),%rax
102,68,15,56,49,4,56, //pmovzxbd (%rax,%rdi,1),%xmm8
69,15,91,192, //cvtdq2ps %xmm8,%xmm8
- 243,68,15,16,74,12, //movss 0xc(%rdx),%xmm9
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 102,68,15,110,200, //movd %eax,%xmm9
69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
69,15,89,200, //mulps %xmm8,%xmm9
15,92,196, //subps %xmm4,%xmm0
@@ -10436,33 +10624,34 @@ CODE const uint8_t sk_lerp_565_sse41[] = {
102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3
102,65,15,219,216, //pand %xmm8,%xmm3
68,15,91,203, //cvtdq2ps %xmm3,%xmm9
- 243,15,16,26, //movss (%rdx),%xmm3
- 243,68,15,16,82,116, //movss 0x74(%rdx),%xmm10
- 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
- 69,15,89,209, //mulps %xmm9,%xmm10
- 102,68,15,110,74,108, //movd 0x6c(%rdx),%xmm9
- 102,69,15,112,201,0, //pshufd $0x0,%xmm9,%xmm9
- 102,69,15,219,200, //pand %xmm8,%xmm9
- 69,15,91,201, //cvtdq2ps %xmm9,%xmm9
- 243,68,15,16,90,120, //movss 0x78(%rdx),%xmm11
+ 243,68,15,16,90,116, //movss 0x74(%rdx),%xmm11
+ 243,68,15,16,82,120, //movss 0x78(%rdx),%xmm10
69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
69,15,89,217, //mulps %xmm9,%xmm11
- 102,68,15,110,74,112, //movd 0x70(%rdx),%xmm9
- 102,69,15,112,201,0, //pshufd $0x0,%xmm9,%xmm9
- 102,69,15,219,200, //pand %xmm8,%xmm9
- 69,15,91,193, //cvtdq2ps %xmm9,%xmm8
- 243,68,15,16,74,124, //movss 0x7c(%rdx),%xmm9
- 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
- 69,15,89,200, //mulps %xmm8,%xmm9
+ 102,15,110,90,108, //movd 0x6c(%rdx),%xmm3
+ 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3
+ 102,65,15,219,216, //pand %xmm8,%xmm3
+ 15,91,219, //cvtdq2ps %xmm3,%xmm3
+ 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
+ 68,15,89,211, //mulps %xmm3,%xmm10
+ 102,15,110,90,112, //movd 0x70(%rdx),%xmm3
+ 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3
+ 102,65,15,219,216, //pand %xmm8,%xmm3
+ 68,15,91,195, //cvtdq2ps %xmm3,%xmm8
+ 243,15,16,90,124, //movss 0x7c(%rdx),%xmm3
+ 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
+ 65,15,89,216, //mulps %xmm8,%xmm3
15,92,196, //subps %xmm4,%xmm0
- 65,15,89,194, //mulps %xmm10,%xmm0
+ 65,15,89,195, //mulps %xmm11,%xmm0
15,88,196, //addps %xmm4,%xmm0
15,92,205, //subps %xmm5,%xmm1
- 65,15,89,203, //mulps %xmm11,%xmm1
+ 65,15,89,202, //mulps %xmm10,%xmm1
15,88,205, //addps %xmm5,%xmm1
15,92,214, //subps %xmm6,%xmm2
- 65,15,89,209, //mulps %xmm9,%xmm2
+ 15,89,211, //mulps %xmm3,%xmm2
15,88,214, //addps %xmm6,%xmm2
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,15,110,216, //movd %eax,%xmm3
15,198,219,0, //shufps $0x0,%xmm3,%xmm3
72,173, //lods %ds:(%rsi),%rax
255,224, //jmpq *%rax
@@ -10615,7 +10804,8 @@ CODE const uint8_t sk_load_8888_sse41[] = {
72,173, //lods %ds:(%rsi),%rax
72,139,0, //mov (%rax),%rax
243,15,111,28,184, //movdqu (%rax,%rdi,4),%xmm3
- 102,15,110,66,16, //movd 0x10(%rdx),%xmm0
+ 184,255,0,0,0, //mov $0xff,%eax
+ 102,15,110,192, //movd %eax,%xmm0
102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
102,15,111,203, //movdqa %xmm3,%xmm1
102,15,114,209,8, //psrld $0x8,%xmm1
@@ -10625,7 +10815,8 @@ CODE const uint8_t sk_load_8888_sse41[] = {
102,15,219,208, //pand %xmm0,%xmm2
102,15,219,195, //pand %xmm3,%xmm0
15,91,192, //cvtdq2ps %xmm0,%xmm0
- 243,68,15,16,66,12, //movss 0xc(%rdx),%xmm8
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
65,15,89,192, //mulps %xmm8,%xmm0
15,91,201, //cvtdq2ps %xmm1,%xmm1
@@ -10642,7 +10833,8 @@ CODE const uint8_t sk_load_8888_sse41[] = {
CODE const uint8_t sk_store_8888_sse41[] = {
72,173, //lods %ds:(%rsi),%rax
72,139,0, //mov (%rax),%rax
- 243,68,15,16,66,8, //movss 0x8(%rdx),%xmm8
+ 185,0,0,127,67, //mov $0x437f0000,%ecx
+ 102,68,15,110,193, //movd %ecx,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
69,15,40,200, //movaps %xmm8,%xmm9
68,15,89,200, //mulps %xmm0,%xmm9
@@ -11197,16 +11389,18 @@ CODE const uint8_t sk_seed_shader_sse2[] = {
102,15,110,199, //movd %edi,%xmm0
102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
15,91,200, //cvtdq2ps %xmm0,%xmm1
- 243,15,16,18, //movss (%rdx),%xmm2
- 243,15,16,90,4, //movss 0x4(%rdx),%xmm3
- 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
- 15,88,203, //addps %xmm3,%xmm1
+ 185,0,0,0,63, //mov $0x3f000000,%ecx
+ 102,15,110,209, //movd %ecx,%xmm2
+ 15,198,210,0, //shufps $0x0,%xmm2,%xmm2
+ 15,88,202, //addps %xmm2,%xmm1
15,16,66,20, //movups 0x14(%rdx),%xmm0
15,88,193, //addps %xmm1,%xmm0
102,15,110,8, //movd (%rax),%xmm1
102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1
15,91,201, //cvtdq2ps %xmm1,%xmm1
- 15,88,203, //addps %xmm3,%xmm1
+ 15,88,202, //addps %xmm2,%xmm1
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,15,110,208, //movd %eax,%xmm2
15,198,210,0, //shufps $0x0,%xmm2,%xmm2
72,173, //lods %ds:(%rsi),%rax
15,87,219, //xorps %xmm3,%xmm3
@@ -11250,7 +11444,8 @@ CODE const uint8_t sk_plus__sse2[] = {
};
CODE const uint8_t sk_srcover_sse2[] = {
- 243,68,15,16,2, //movss (%rdx),%xmm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
68,15,92,195, //subps %xmm3,%xmm8
69,15,40,200, //movaps %xmm8,%xmm9
@@ -11269,7 +11464,8 @@ CODE const uint8_t sk_srcover_sse2[] = {
};
CODE const uint8_t sk_dstover_sse2[] = {
- 243,68,15,16,2, //movss (%rdx),%xmm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
68,15,92,199, //subps %xmm7,%xmm8
65,15,89,192, //mulps %xmm8,%xmm0
@@ -11295,7 +11491,8 @@ CODE const uint8_t sk_clamp_0_sse2[] = {
};
CODE const uint8_t sk_clamp_1_sse2[] = {
- 243,68,15,16,2, //movss (%rdx),%xmm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
65,15,93,192, //minps %xmm8,%xmm0
65,15,93,200, //minps %xmm8,%xmm1
@@ -11306,7 +11503,8 @@ CODE const uint8_t sk_clamp_1_sse2[] = {
};
CODE const uint8_t sk_clamp_a_sse2[] = {
- 243,68,15,16,2, //movss (%rdx),%xmm8
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
65,15,93,216, //minps %xmm8,%xmm3
15,93,195, //minps %xmm3,%xmm0
@@ -11382,7 +11580,8 @@ CODE const uint8_t sk_premul_sse2[] = {
CODE const uint8_t sk_unpremul_sse2[] = {
69,15,87,192, //xorps %xmm8,%xmm8
68,15,194,195,0, //cmpeqps %xmm3,%xmm8
- 243,68,15,16,10, //movss (%rdx),%xmm9
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,200, //movd %eax,%xmm9
69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
68,15,94,203, //divps %xmm3,%xmm9
69,15,85,193, //andnps %xmm9,%xmm8
@@ -11537,7 +11736,8 @@ CODE const uint8_t sk_scale_u8_sse2[] = {
102,69,15,96,193, //punpcklbw %xmm9,%xmm8
102,69,15,97,193, //punpcklwd %xmm9,%xmm8
69,15,91,192, //cvtdq2ps %xmm8,%xmm8
- 243,68,15,16,74,12, //movss 0xc(%rdx),%xmm9
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 102,68,15,110,200, //movd %eax,%xmm9
69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
69,15,89,200, //mulps %xmm8,%xmm9
65,15,89,193, //mulps %xmm9,%xmm0
@@ -11576,7 +11776,8 @@ CODE const uint8_t sk_lerp_u8_sse2[] = {
102,69,15,96,193, //punpcklbw %xmm9,%xmm8
102,69,15,97,193, //punpcklwd %xmm9,%xmm8
69,15,91,192, //cvtdq2ps %xmm8,%xmm8
- 243,68,15,16,74,12, //movss 0xc(%rdx),%xmm9
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 102,68,15,110,200, //movd %eax,%xmm9
69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
69,15,89,200, //mulps %xmm8,%xmm9
15,92,196, //subps %xmm4,%xmm0
@@ -11598,40 +11799,41 @@ CODE const uint8_t sk_lerp_u8_sse2[] = {
CODE const uint8_t sk_lerp_565_sse2[] = {
72,173, //lods %ds:(%rsi),%rax
72,139,0, //mov (%rax),%rax
- 243,68,15,126,4,120, //movq (%rax,%rdi,2),%xmm8
+ 243,68,15,126,12,120, //movq (%rax,%rdi,2),%xmm9
102,15,239,219, //pxor %xmm3,%xmm3
- 102,68,15,97,195, //punpcklwd %xmm3,%xmm8
+ 102,68,15,97,203, //punpcklwd %xmm3,%xmm9
102,15,110,90,104, //movd 0x68(%rdx),%xmm3
102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3
- 102,65,15,219,216, //pand %xmm8,%xmm3
- 68,15,91,203, //cvtdq2ps %xmm3,%xmm9
- 243,15,16,26, //movss (%rdx),%xmm3
- 243,68,15,16,82,116, //movss 0x74(%rdx),%xmm10
- 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
- 69,15,89,209, //mulps %xmm9,%xmm10
- 102,68,15,110,74,108, //movd 0x6c(%rdx),%xmm9
- 102,69,15,112,201,0, //pshufd $0x0,%xmm9,%xmm9
- 102,69,15,219,200, //pand %xmm8,%xmm9
- 69,15,91,201, //cvtdq2ps %xmm9,%xmm9
- 243,68,15,16,90,120, //movss 0x78(%rdx),%xmm11
+ 102,65,15,219,217, //pand %xmm9,%xmm3
+ 68,15,91,211, //cvtdq2ps %xmm3,%xmm10
+ 243,68,15,16,90,116, //movss 0x74(%rdx),%xmm11
+ 243,68,15,16,66,120, //movss 0x78(%rdx),%xmm8
69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
- 69,15,89,217, //mulps %xmm9,%xmm11
- 102,68,15,110,74,112, //movd 0x70(%rdx),%xmm9
- 102,69,15,112,201,0, //pshufd $0x0,%xmm9,%xmm9
- 102,69,15,219,200, //pand %xmm8,%xmm9
- 69,15,91,193, //cvtdq2ps %xmm9,%xmm8
- 243,68,15,16,74,124, //movss 0x7c(%rdx),%xmm9
- 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9
- 69,15,89,200, //mulps %xmm8,%xmm9
+ 69,15,89,218, //mulps %xmm10,%xmm11
+ 102,15,110,90,108, //movd 0x6c(%rdx),%xmm3
+ 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3
+ 102,65,15,219,217, //pand %xmm9,%xmm3
+ 15,91,219, //cvtdq2ps %xmm3,%xmm3
+ 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
+ 68,15,89,195, //mulps %xmm3,%xmm8
+ 102,15,110,90,112, //movd 0x70(%rdx),%xmm3
+ 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3
+ 102,65,15,219,217, //pand %xmm9,%xmm3
+ 68,15,91,203, //cvtdq2ps %xmm3,%xmm9
+ 243,15,16,90,124, //movss 0x7c(%rdx),%xmm3
+ 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
+ 65,15,89,217, //mulps %xmm9,%xmm3
15,92,196, //subps %xmm4,%xmm0
- 65,15,89,194, //mulps %xmm10,%xmm0
+ 65,15,89,195, //mulps %xmm11,%xmm0
15,88,196, //addps %xmm4,%xmm0
15,92,205, //subps %xmm5,%xmm1
- 65,15,89,203, //mulps %xmm11,%xmm1
+ 65,15,89,200, //mulps %xmm8,%xmm1
15,88,205, //addps %xmm5,%xmm1
15,92,214, //subps %xmm6,%xmm2
- 65,15,89,209, //mulps %xmm9,%xmm2
+ 15,89,211, //mulps %xmm3,%xmm2
15,88,214, //addps %xmm6,%xmm2
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,15,110,216, //movd %eax,%xmm3
15,198,219,0, //shufps $0x0,%xmm3,%xmm3
72,173, //lods %ds:(%rsi),%rax
255,224, //jmpq *%rax
@@ -11801,7 +12003,8 @@ CODE const uint8_t sk_load_8888_sse2[] = {
72,173, //lods %ds:(%rsi),%rax
72,139,0, //mov (%rax),%rax
243,15,111,28,184, //movdqu (%rax,%rdi,4),%xmm3
- 102,15,110,66,16, //movd 0x10(%rdx),%xmm0
+ 184,255,0,0,0, //mov $0xff,%eax
+ 102,15,110,192, //movd %eax,%xmm0
102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0
102,15,111,203, //movdqa %xmm3,%xmm1
102,15,114,209,8, //psrld $0x8,%xmm1
@@ -11811,7 +12014,8 @@ CODE const uint8_t sk_load_8888_sse2[] = {
102,15,219,208, //pand %xmm0,%xmm2
102,15,219,195, //pand %xmm3,%xmm0
15,91,192, //cvtdq2ps %xmm0,%xmm0
- 243,68,15,16,66,12, //movss 0xc(%rdx),%xmm8
+ 184,129,128,128,59, //mov $0x3b808081,%eax
+ 102,68,15,110,192, //movd %eax,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
65,15,89,192, //mulps %xmm8,%xmm0
15,91,201, //cvtdq2ps %xmm1,%xmm1
@@ -11828,7 +12032,8 @@ CODE const uint8_t sk_load_8888_sse2[] = {
CODE const uint8_t sk_store_8888_sse2[] = {
72,173, //lods %ds:(%rsi),%rax
72,139,0, //mov (%rax),%rax
- 243,68,15,16,66,8, //movss 0x8(%rdx),%xmm8
+ 185,0,0,127,67, //mov $0x437f0000,%ecx
+ 102,68,15,110,193, //movd %ecx,%xmm8
69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
69,15,40,200, //movaps %xmm8,%xmm9
68,15,89,200, //mulps %xmm0,%xmm9
@@ -11984,7 +12189,8 @@ CODE const uint8_t sk_repeat_x_sse2[] = {
243,69,15,91,209, //cvttps2dq %xmm9,%xmm10
69,15,91,210, //cvtdq2ps %xmm10,%xmm10
69,15,194,202,1, //cmpltps %xmm10,%xmm9
- 243,68,15,16,26, //movss (%rdx),%xmm11
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,216, //movd %eax,%xmm11
69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
69,15,84,217, //andps %xmm9,%xmm11
69,15,92,211, //subps %xmm11,%xmm10
@@ -12006,7 +12212,8 @@ CODE const uint8_t sk_repeat_y_sse2[] = {
243,69,15,91,209, //cvttps2dq %xmm9,%xmm10
69,15,91,210, //cvtdq2ps %xmm10,%xmm10
69,15,194,202,1, //cmpltps %xmm10,%xmm9
- 243,68,15,16,26, //movss (%rdx),%xmm11
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,216, //movd %eax,%xmm11
69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
69,15,84,217, //andps %xmm9,%xmm11
69,15,92,211, //subps %xmm11,%xmm10
@@ -12032,7 +12239,8 @@ CODE const uint8_t sk_mirror_x_sse2[] = {
243,69,15,91,218, //cvttps2dq %xmm10,%xmm11
69,15,91,219, //cvtdq2ps %xmm11,%xmm11
69,15,194,211,1, //cmpltps %xmm11,%xmm10
- 243,68,15,16,34, //movss (%rdx),%xmm12
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,224, //movd %eax,%xmm12
69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
69,15,84,226, //andps %xmm10,%xmm12
69,15,87,210, //xorps %xmm10,%xmm10
@@ -12062,7 +12270,8 @@ CODE const uint8_t sk_mirror_y_sse2[] = {
243,69,15,91,218, //cvttps2dq %xmm10,%xmm11
69,15,91,219, //cvtdq2ps %xmm11,%xmm11
69,15,194,211,1, //cmpltps %xmm11,%xmm10
- 243,68,15,16,34, //movss (%rdx),%xmm12
+ 184,0,0,128,63, //mov $0x3f800000,%eax
+ 102,68,15,110,224, //movd %eax,%xmm12
69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
69,15,84,226, //andps %xmm10,%xmm12
69,15,87,210, //xorps %xmm10,%xmm10
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index c22b89eac8..13c33cc91e 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -24,6 +24,22 @@ static Dst bit_cast(const Src& src) {
return unaligned_load<Dst>(&src);
}
+// A couple of functions for embedding constants directly into code,
+// so that no .const or .literal4 section is created.
+
+static inline int constant(int x) {
+#if defined(JUMPER) && defined(__x86_64__)
+ // Move x-the-compile-time-constant as a literal into x-the-register.
+ asm("mov %1, %0" : "=r"(x) : "i"(x));
+#endif
+ return x;
+}
+
+static inline float constant(float f) {
+ int x = constant(unaligned_load<int>(&f));
+ return unaligned_load<float>(&x);
+}
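+// (x86-64 mov only takes integer immediates, so the float version
+// round-trips its bits through int.  The effect is visible in the diffs
+// above: e.g. mov $0x3f800000,%eax followed by vmovd/movd and a splat,
+// instead of a vbroadcastss or movss load from (%rdx).)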
+
#if !defined(JUMPER)
// This path should lead to portable code that can be compiled directly into Skia.
// (All other paths are compiled offline by Clang into SkJumper_generated.h.)
@@ -107,7 +123,7 @@ static Dst bit_cast(const Src& src) {
static F floor(F v, K* k) {
F roundtrip = vcvt_f32_s32(vcvt_s32_f32(v));
- return roundtrip - if_then_else(roundtrip > v, k->_1, 0);
+ return roundtrip - if_then_else(roundtrip > v, constant(1.0f), 0);
}
static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
@@ -209,7 +225,7 @@ static Dst bit_cast(const Src& src) {
return _mm_floor_ps(v);
#else
F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
- return roundtrip - if_then_else(roundtrip > v, k->_1, 0);
+ return roundtrip - if_then_else(roundtrip > v, constant(1.0f), 0);
#endif
}
@@ -488,9 +504,9 @@ STAGE(seed_shader) {
// which has the effect of splatting them to vectors before converting to floats.
// On Intel this breaks a data dependency on previous loop iterations' registers.
- r = cast(x) + k->_0_5 + unaligned_load<F>(k->iota);
- g = cast(y) + k->_0_5;
- b = k->_1;
+ r = cast(x) + constant(0.5f) + unaligned_load<F>(k->iota);
+ g = cast(y) + constant(0.5f);
+ b = constant(1.0f);
a = 0;
dr = dg = db = da = 0;
}
@@ -515,14 +531,14 @@ STAGE(plus_) {
}
STAGE(srcover) {
- auto A = k->_1 - a;
+ auto A = constant(1.0f) - a;
r = mad(dr, A, r);
g = mad(dg, A, g);
b = mad(db, A, b);
a = mad(da, A, a);
}
STAGE(dstover) {
- auto DA = k->_1 - da;
+ auto DA = constant(1.0f) - da;
r = mad(r, DA, dr);
g = mad(g, DA, dg);
b = mad(b, DA, db);
@@ -537,14 +553,14 @@ STAGE(clamp_0) {
}
STAGE(clamp_1) {
- r = min(r, k->_1);
- g = min(g, k->_1);
- b = min(b, k->_1);
- a = min(a, k->_1);
+ r = min(r, constant(1.0f));
+ g = min(g, constant(1.0f));
+ b = min(b, constant(1.0f));
+ a = min(a, constant(1.0f));
}
STAGE(clamp_a) {
- a = min(a, k->_1);
+ a = min(a, constant(1.0f));
r = min(r, a);
g = min(g, a);
b = min(b, a);
@@ -592,7 +608,7 @@ STAGE(premul) {
b = b * a;
}
STAGE(unpremul) {
- auto scale = if_then_else(a == 0, 0, k->_1 / a);
+ auto scale = if_then_else(a == 0, 0, constant(1.0f) / a);
r = r * scale;
g = g * scale;
b = b * scale;
@@ -635,7 +651,7 @@ STAGE(scale_u8) {
auto ptr = *(const uint8_t**)ctx + x;
auto scales = load<U8>(ptr, tail);
- auto c = cast(expand(scales)) * k->_1_255;
+ auto c = cast(expand(scales)) * constant(1/255.0f);
r = r * c;
g = g * c;
@@ -655,7 +671,7 @@ STAGE(lerp_u8) {
auto ptr = *(const uint8_t**)ctx + x;
auto scales = load<U8>(ptr, tail);
- auto c = cast(expand(scales)) * k->_1_255;
+ auto c = cast(expand(scales)) * constant(1/255.0f);
r = lerp(dr, r, c);
g = lerp(dg, g, c);
@@ -671,7 +687,7 @@ STAGE(lerp_565) {
r = lerp(dr, r, cr);
g = lerp(dg, g, cg);
b = lerp(db, b, cb);
- a = k->_1;
+ a = constant(1.0f);
}
STAGE(load_tables) {
@@ -720,19 +736,19 @@ STAGE(load_8888) {
auto ptr = *(const uint32_t**)ctx + x;
auto px = load<U32>(ptr, tail);
- r = cast((px ) & k->_0x000000ff) * k->_1_255;
- g = cast((px >> 8) & k->_0x000000ff) * k->_1_255;
- b = cast((px >> 16) & k->_0x000000ff) * k->_1_255;
- a = cast((px >> 24) ) * k->_1_255;
+ r = cast((px ) & constant(0xff)) * constant(1/255.0f);
+ g = cast((px >> 8) & constant(0xff)) * constant(1/255.0f);
+ b = cast((px >> 16) & constant(0xff)) * constant(1/255.0f);
+ a = cast((px >> 24) ) * constant(1/255.0f);
}
STAGE(store_8888) {
auto ptr = *(uint32_t**)ctx + x;
- U32 px = round(r, k->_255)
- | round(g, k->_255) << 8
- | round(b, k->_255) << 16
- | round(a, k->_255) << 24;
+ U32 px = round(r, constant(255.0f))
+ | round(g, constant(255.0f)) << 8
+ | round(b, constant(255.0f)) << 16
+ | round(a, constant(255.0f)) << 24;
store(ptr, px, tail);
}
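To try the trick outside SkJumper, here is a minimal standalone sketch (illustrative names only, not SkJumper API; assumes GCC/Clang extended asm on x86-64, with optimization enabled so the "i" immediate constraint is satisfiable after inlining and constant folding):

// constant_demo.cpp: build with  clang -O2 -S constant_demo.cpp
#include <cstring>

static inline int embed(int x) {
    // "i" encodes x as an immediate of the mov; "=r" returns it in a register.
    asm("mov %1, %0" : "=r"(x) : "i"(x));
    return x;
}

static inline float embed(float f) {
    int x;                          // x86-64 mov takes integer immediates only,
    std::memcpy(&x, &f, sizeof x);  // so bit-cast the float through an int...
    x = embed(x);
    std::memcpy(&f, &x, sizeof f);  // ...and back.
    return f;
}

float one() { return embed(1.0f); }

The .s output for one() should be roughly mov $0x3f800000,%eax / movd %eax,%xmm0 / ret, with no .literal4 or .rodata entry for 1.0f.  Without optimization the "i" constraint cannot be met and compilation fails, which is fine for code that, like these stages, is only ever compiled offline by Clang.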