-rw-r--r--  src/jumper/SkJumper.cpp           |    8
-rw-r--r--  src/jumper/SkJumper.h             |   55
-rw-r--r--  src/jumper/SkJumper_generated.cpp | 3780
-rw-r--r--  src/jumper/SkJumper_stages.cpp    |  180
-rwxr-xr-x  src/jumper/build_stages.py        |   10
5 files changed, 2268 insertions(+), 1765 deletions(-)
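The rewritten SkJumper.h comment in this diff names three facilities for getting constants into stage code: the C() function, the _i and _f user-defined literals, and the SkJumper_constants struct. The real definitions live in SkJumper_stages.cpp and are not shown in full here; the following is a hypothetical sketch of one shape C() and the literal operators could take, consistent with the code this change generates. Routing the bit pattern through a general-purpose register behind an empty asm() barrier keeps the compiler from turning the constant into a shared load from a .rodata/.literal section, so it has to materialize the value locally: mov/movk + dup in the aarch64 stages, mov $imm,%eax + vbroadcastss in the hsw stages, or a pc-relative .word pool that travels with the code in the vfp4 stages.

    #include <cstdint>
    #include <cstring>

    // Hypothetical sketch only -- not the actual Skia definitions.
    static inline float C(float f) {
        uint32_t bits;
        memcpy(&bits, &f, sizeof(bits));   // grab the IEEE-754 bit pattern
        asm volatile("" : "+r"(bits));     // opaque to the optimizer: bits must sit in a GPR
        memcpy(&f, &bits, sizeof(f));      // move it back into a float register
        return f;
    }

    static inline uint32_t C(uint32_t x) {
        asm volatile("" : "+r"(x));        // same trick for integer masks like 0xf800
        return x;
    }

    // The prettier spellings the comment mentions, e.g. 255.0_f or 0x7e0_i.
    static inline float    operator"" _f(long double x)        { return C((float)x); }
    static inline uint32_t operator"" _i(unsigned long long x) { return C((uint32_t)x); }

With constants synthesized this way, the float bit patterns read straight off the disassembly comments below: 0x3f800000 is 1.0f, 0x437f0000 is 255.0f, 0x427c0000 is 63.0f, and 0x3b808081 is the nearest float to 1/255, values that previously had to be loaded from kConstants.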
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index caa9db2fd3..85f1231b1d 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -37,15 +37,7 @@
 // It's fine to rearrange and add new ones if you update SkJumper_constants.
 using K = const SkJumper_constants;
 static K kConstants = {
-    1.0f, 0.5f, 255.0f, 1/255.0f, 0x000000ff,
     {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f},
-    0.0025f, 0.6975f, 0.3000f, 1/12.92f, 0.055f, // from_srgb
-    12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f, // to_srgb
-    0x77800000, 0x07800000, 0x04000400, // fp16 <-> fp32
-    0x0000f800, 0x000007e0, 0x0000001f, // 565
-    1.0f/0x0000f800, 1.0f/0x000007e0, 1.0f/0x0000001f,
-    31.0f, 63.0f,
-    SK_LUM_COEFF_R, SK_LUM_COEFF_G, SK_LUM_COEFF_B, // luminance -> alpha
 };

 #define STAGES(M) \
diff --git a/src/jumper/SkJumper.h b/src/jumper/SkJumper.h
index 215284b84c..712417a7de 100644
--- a/src/jumper/SkJumper.h
+++ b/src/jumper/SkJumper.h
@@ -14,54 +14,19 @@

 #include <stdint.h>

-// SkJumper Stages can use constant literals only if they end up baked into the instruction,
-// like bit shifts and rounding modes. Any other constant values must be pulled from this struct
-// (except 0, ~0, and 0.0f, which always end up as some sort of xor or cmpeq instruction).
+// SkJumper_stages.cpp has some unusual constraints on what constants it can use.
 //
-// This constraint makes it much easier to move and reorder the code for each Stage.
+// If the constant is baked into the instruction, that's ok.
+// If the constant is synthesized through code, that's ok.
+// If the constant is loaded from memory, that's no good.
+//
+// We offer a couple facilities to get at any other constants you need:
+//   - the C() function usually constrains constants to be directly baked into an instruction; or
+//   - the _i and _f user-defined literal operators call C() for you in a prettier way; or
+//   - you can load values from this struct.
struct SkJumper_constants { - float _1; // 1.0f - float _0_5; // 0.5f - float _255; // 255.0f - float _1_255; // 1/255.0f - uint32_t _0x000000ff; // 0x000000ff - - float iota[8]; // 0,1,2,3,4,5,6,7 - - // from_srgb - float _00025; // 0.0025f - float _06975; // 0.6975f - float _03000; // 0.3000f - float _1_1292; // 1/12.92f - float _0055; // 0.055f - - // to_srgb - float _1246; // 12.46f - float _0411192; // 0.411192f - float _0689206; // 0.689206f - float n_00988; // -0.0988f - float _00043; // 0.0043f - - // fp16 <-> fp32 - uint32_t _0x77800000; - uint32_t _0x07800000; - uint32_t _0x04000400; - - // 565 - uint32_t r_565_mask; - uint32_t g_565_mask; - uint32_t b_565_mask; - float r_565_scale; - float g_565_scale; - float b_565_scale; - float _31; - float _63; - - // luminance -> alpha - float lum_r; - float lum_g; - float lum_b; + float iota[8]; // 0,1,2,3,4,5,6,7 }; #endif//SkJumper_DEFINED diff --git a/src/jumper/SkJumper_generated.cpp b/src/jumper/SkJumper_generated.cpp index 383922d0ce..df6e75234f 100644 --- a/src/jumper/SkJumper_generated.cpp +++ b/src/jumper/SkJumper_generated.cpp @@ -64,7 +64,7 @@ CODE const uint32_t sk_just_return_aarch64[] = { CODE const uint32_t sk_seed_shader_aarch64[] = { 0xa8c10c28, //ldp x8, x3, [x1], #16 - 0x3cc14046, //ldur q6, [x2, #20] + 0x3dc00046, //ldr q6, [x2] 0x4e040c00, //dup v0.4s, w0 0x4f0167e7, //movi v7.4s, #0x3f, lsl #24 0x4d40c901, //ld1r {v1.4s}, [x8] @@ -244,101 +244,118 @@ CODE const uint32_t sk_unpremul_aarch64[] = { }; CODE const uint32_t sk_from_srgb_aarch64[] = { - 0x9100e048, //add x8, x2, #0x38 - 0x4d40c910, //ld1r {v16.4s}, [x8] - 0x9100d048, //add x8, x2, #0x34 - 0x2d47cc52, //ldp s18, s19, [x2, #60] - 0x4d40c911, //ld1r {v17.4s}, [x8] + 0x52a7d328, //mov w8, #0x3e990000 + 0x72933348, //movk w8, #0x999a + 0x4e040d10, //dup v16.4s, w8 + 0x52a7e648, //mov w8, #0x3f320000 + 0x7291eb88, //movk w8, #0x8f5c + 0x4e040d11, //dup v17.4s, w8 + 0x52a76468, //mov w8, #0x3b230000 + 0x729ae148, //movk w8, #0xd70a + 0x4e040d12, //dup v18.4s, w8 + 0x52a7b3c8, //mov w8, #0x3d9e0000 + 0x72907228, //movk w8, #0x8391 0x6e22dc54, //fmul v20.4s, v2.4s, v2.4s - 0x4eb01e15, //mov v21.16b, v16.16b - 0x4eb01e17, //mov v23.16b, v16.16b - 0x4f921050, //fmla v16.4s, v2.4s, v18.s[0] - 0x4eb11e36, //mov v22.16b, v17.16b - 0x4eb11e38, //mov v24.16b, v17.16b - 0x4e34ce11, //fmla v17.4s, v16.4s, v20.4s - 0x6e20dc10, //fmul v16.4s, v0.4s, v0.4s - 0x91011048, //add x8, x2, #0x44 - 0x4f921015, //fmla v21.4s, v0.4s, v18.s[0] - 0x4e30ceb6, //fmla v22.4s, v21.4s, v16.4s - 0x4d40c910, //ld1r {v16.4s}, [x8] + 0x4eb11e35, //mov v21.16b, v17.16b + 0x4eb11e37, //mov v23.16b, v17.16b + 0x4e22ce11, //fmla v17.4s, v16.4s, v2.4s + 0x4eb21e56, //mov v22.16b, v18.16b + 0x4eb21e58, //mov v24.16b, v18.16b + 0x4e34ce32, //fmla v18.4s, v17.4s, v20.4s + 0x4e040d11, //dup v17.4s, w8 + 0x52a7ac28, //mov w8, #0x3d610000 + 0x6e20dc13, //fmul v19.4s, v0.4s, v0.4s + 0x7288f5c8, //movk w8, #0x47ae + 0x4e20ce15, //fmla v21.4s, v16.4s, v0.4s 0xf8408423, //ldr x3, [x1], #8 0x6e21dc34, //fmul v20.4s, v1.4s, v1.4s - 0x4f921037, //fmla v23.4s, v1.4s, v18.s[0] - 0x4f939015, //fmul v21.4s, v0.4s, v19.s[0] - 0x4f939032, //fmul v18.4s, v1.4s, v19.s[0] - 0x4f939053, //fmul v19.4s, v2.4s, v19.s[0] - 0x6ea0e600, //fcmgt v0.4s, v16.4s, v0.4s - 0x6ea1e601, //fcmgt v1.4s, v16.4s, v1.4s - 0x6ea2e602, //fcmgt v2.4s, v16.4s, v2.4s + 0x4e33ceb6, //fmla v22.4s, v21.4s, v19.4s + 0x4e040d13, //dup v19.4s, w8 + 0x4e21ce17, //fmla v23.4s, v16.4s, v1.4s + 0x6e31dc15, //fmul v21.4s, v0.4s, v17.4s + 0x6ea0e660, //fcmgt 
v0.4s, v19.4s, v0.4s + 0x6e31dc30, //fmul v16.4s, v1.4s, v17.4s + 0x6ea1e661, //fcmgt v1.4s, v19.4s, v1.4s + 0x6e31dc51, //fmul v17.4s, v2.4s, v17.4s + 0x6ea2e662, //fcmgt v2.4s, v19.4s, v2.4s 0x4e34cef8, //fmla v24.4s, v23.4s, v20.4s 0x6e761ea0, //bsl v0.16b, v21.16b, v22.16b - 0x6e781e41, //bsl v1.16b, v18.16b, v24.16b - 0x6e711e62, //bsl v2.16b, v19.16b, v17.16b + 0x6e781e01, //bsl v1.16b, v16.16b, v24.16b + 0x6e721e22, //bsl v2.16b, v17.16b, v18.16b 0xd61f0060, //br x3 }; CODE const uint32_t sk_to_srgb_aarch64[] = { + 0x52a828e8, //mov w8, #0x41470000 + 0x728b8528, //movk w8, #0x5c29 + 0x4e040d12, //dup v18.4s, w8 + 0x52a7e608, //mov w8, #0x3f300000 + 0x728df9c8, //movk w8, #0x6fce 0x6ea1d811, //frsqrte v17.4s, v0.4s - 0x6ea1d835, //frsqrte v21.4s, v1.4s - 0x6e31de37, //fmul v23.4s, v17.4s, v17.4s - 0x6ea1d856, //frsqrte v22.4s, v2.4s + 0x4e040d13, //dup v19.4s, w8 + 0x52b7b948, //mov w8, #0xbdca0000 + 0x728af508, //movk w8, #0x57a8 + 0x6ea1d834, //frsqrte v20.4s, v1.4s + 0x6e31de36, //fmul v22.4s, v17.4s, v17.4s + 0x4e040d10, //dup v16.4s, w8 + 0x52a77188, //mov w8, #0x3b8c0000 + 0x6ea1d855, //frsqrte v21.4s, v2.4s + 0x6e34de98, //fmul v24.4s, v20.4s, v20.4s + 0x4eb6fc16, //frsqrts v22.4s, v0.4s, v22.4s + 0x729ce088, //movk w8, #0xe704 0x6e35deb9, //fmul v25.4s, v21.4s, v21.4s - 0x4eb7fc17, //frsqrts v23.4s, v0.4s, v23.4s - 0x91015048, //add x8, x2, #0x54 - 0x6e36deda, //fmul v26.4s, v22.4s, v22.4s - 0x4eb9fc39, //frsqrts v25.4s, v1.4s, v25.4s - 0x6e37de31, //fmul v17.4s, v17.4s, v23.4s - 0x4d40c914, //ld1r {v20.4s}, [x8] - 0x4ebafc5a, //frsqrts v26.4s, v2.4s, v26.4s + 0x4eb8fc38, //frsqrts v24.4s, v1.4s, v24.4s + 0x6e36de31, //fmul v17.4s, v17.4s, v22.4s + 0x4e040d17, //dup v23.4s, w8 + 0x4eb9fc59, //frsqrts v25.4s, v2.4s, v25.4s + 0x6e38de94, //fmul v20.4s, v20.4s, v24.4s + 0x4ea1da36, //frecpe v22.4s, v17.4s + 0x6e32dc1a, //fmul v26.4s, v0.4s, v18.4s + 0x6ea0e6e0, //fcmgt v0.4s, v23.4s, v0.4s + 0x6e32dc3c, //fmul v28.4s, v1.4s, v18.4s + 0x6ea1e6e1, //fcmgt v1.4s, v23.4s, v1.4s + 0x6e32dc52, //fmul v18.4s, v2.4s, v18.4s + 0x6ea2e6e2, //fcmgt v2.4s, v23.4s, v2.4s 0x6e39deb5, //fmul v21.4s, v21.4s, v25.4s - 0x4ea1da37, //frecpe v23.4s, v17.4s - 0xbd405053, //ldr s19, [x2, #80] - 0x91016048, //add x8, x2, #0x58 - 0x6e3aded6, //fmul v22.4s, v22.4s, v26.4s - 0x4ea1dabb, //frecpe v27.4s, v21.4s - 0x4e37fe3d, //frecps v29.4s, v17.4s, v23.4s - 0x2d494052, //ldp s18, s16, [x2, #72] - 0x4d40c918, //ld1r {v24.4s}, [x8] - 0x4ea1dadc, //frecpe v28.4s, v22.4s - 0x6e3ddef7, //fmul v23.4s, v23.4s, v29.4s - 0x4e3bfebd, //frecps v29.4s, v21.4s, v27.4s - 0x6e3ddf7b, //fmul v27.4s, v27.4s, v29.4s - 0x4e3cfedd, //frecps v29.4s, v22.4s, v28.4s - 0x6e3ddf9c, //fmul v28.4s, v28.4s, v29.4s - 0x4eb41e9d, //mov v29.16b, v20.16b - 0x6ea1da39, //frsqrte v25.4s, v17.4s - 0x4f9312fd, //fmla v29.4s, v23.4s, v19.s[0] - 0x4eb41e97, //mov v23.16b, v20.16b - 0x4f92901a, //fmul v26.4s, v0.4s, v18.s[0] - 0x4f931377, //fmla v23.4s, v27.4s, v19.s[0] - 0x4f931394, //fmla v20.4s, v28.4s, v19.s[0] - 0x4f929033, //fmul v19.4s, v1.4s, v18.s[0] - 0x4f929052, //fmul v18.4s, v2.4s, v18.s[0] - 0x6ea0e700, //fcmgt v0.4s, v24.4s, v0.4s - 0x6ea1e701, //fcmgt v1.4s, v24.4s, v1.4s - 0x6ea2e702, //fcmgt v2.4s, v24.4s, v2.4s - 0x6e39df38, //fmul v24.4s, v25.4s, v25.4s - 0x6ea1dabb, //frsqrte v27.4s, v21.4s + 0x4ea1da97, //frecpe v23.4s, v20.4s + 0x4e36fe39, //frecps v25.4s, v17.4s, v22.4s + 0x4ea1dab8, //frecpe v24.4s, v21.4s + 0x6e39ded6, //fmul v22.4s, v22.4s, v25.4s + 0x4e37fe99, //frecps v25.4s, v20.4s, v23.4s + 0x4eb01e1b, //mov 
v27.16b, v16.16b + 0x6e39def7, //fmul v23.4s, v23.4s, v25.4s + 0x4e38feb9, //frecps v25.4s, v21.4s, v24.4s + 0x6e39df18, //fmul v24.4s, v24.4s, v25.4s + 0x4eb01e19, //mov v25.16b, v16.16b + 0x4e36ce7b, //fmla v27.4s, v19.4s, v22.4s + 0x6ea1da36, //frsqrte v22.4s, v17.4s + 0x4e37ce79, //fmla v25.4s, v19.4s, v23.4s + 0x6ea1da97, //frsqrte v23.4s, v20.4s + 0x4e38ce70, //fmla v16.4s, v19.4s, v24.4s + 0x6e36ded8, //fmul v24.4s, v22.4s, v22.4s + 0x6ea1dab3, //frsqrte v19.4s, v21.4s 0x4eb8fe31, //frsqrts v17.4s, v17.4s, v24.4s - 0x6ea1dadc, //frsqrte v28.4s, v22.4s - 0x6e3bdf78, //fmul v24.4s, v27.4s, v27.4s - 0x6e31df31, //fmul v17.4s, v25.4s, v17.4s + 0x6e37def8, //fmul v24.4s, v23.4s, v23.4s + 0x4eb8fe94, //frsqrts v20.4s, v20.4s, v24.4s + 0x6e33de78, //fmul v24.4s, v19.4s, v19.4s + 0x52a7da48, //mov w8, #0x3ed20000 0x4eb8feb5, //frsqrts v21.4s, v21.4s, v24.4s - 0x6e3cdf98, //fmul v24.4s, v28.4s, v28.4s - 0x4f90123d, //fmla v29.4s, v17.4s, v16.s[0] - 0x4d40c851, //ld1r {v17.4s}, [x2] - 0x4eb8fed6, //frsqrts v22.4s, v22.4s, v24.4s - 0x6e35df75, //fmul v21.4s, v27.4s, v21.4s - 0x6e36df96, //fmul v22.4s, v28.4s, v22.4s + 0x7290f848, //movk w8, #0x87c2 + 0x6e31ded1, //fmul v17.4s, v22.4s, v17.4s + 0x6e34def4, //fmul v20.4s, v23.4s, v20.4s + 0x6e35de73, //fmul v19.4s, v19.4s, v21.4s + 0x4e040d15, //dup v21.4s, w8 0xf8408423, //ldr x3, [x1], #8 - 0x4f9012b7, //fmla v23.4s, v21.4s, v16.s[0] - 0x4f9012d4, //fmla v20.4s, v22.4s, v16.s[0] - 0x4ebdf630, //fmin v16.4s, v17.4s, v29.4s - 0x4eb7f635, //fmin v21.4s, v17.4s, v23.4s - 0x4eb4f631, //fmin v17.4s, v17.4s, v20.4s - 0x6e701f40, //bsl v0.16b, v26.16b, v16.16b - 0x6e751e61, //bsl v1.16b, v19.16b, v21.16b - 0x6e711e42, //bsl v2.16b, v18.16b, v17.16b + 0x4e31cebb, //fmla v27.4s, v21.4s, v17.4s + 0x4f03f611, //fmov v17.4s, #1.000000000000000000e+00 + 0x4e34ceb9, //fmla v25.4s, v21.4s, v20.4s + 0x4e33ceb0, //fmla v16.4s, v21.4s, v19.4s + 0x4ebbf633, //fmin v19.4s, v17.4s, v27.4s + 0x4eb9f634, //fmin v20.4s, v17.4s, v25.4s + 0x4eb0f630, //fmin v16.4s, v17.4s, v16.4s + 0x6e731f40, //bsl v0.16b, v26.16b, v19.16b + 0x6e741f81, //bsl v1.16b, v28.16b, v20.16b + 0x6e701e42, //bsl v2.16b, v18.16b, v16.16b 0xd61f0060, //br x3 }; @@ -433,123 +450,132 @@ CODE const uint32_t sk_lerp_u8_aarch64[] = { CODE const uint32_t sk_lerp_565_aarch64[] = { 0xa8c10c28, //ldp x8, x3, [x1], #16 0xd37ff809, //lsl x9, x0, #1 - 0x2d4ec851, //ldp s17, s18, [x2, #116] - 0x4ea4d414, //fsub v20.4s, v0.4s, v4.4s + 0x4f072710, //movi v16.4s, #0xf8, lsl #8 + 0x4ea4d413, //fsub v19.4s, v0.4s, v4.4s 0xf9400108, //ldr x8, [x8] 0xfc696903, //ldr d3, [x8, x9] - 0x9101a048, //add x8, x2, #0x68 - 0x4d40c910, //ld1r {v16.4s}, [x8] - 0x9101b048, //add x8, x2, #0x6c - 0x4d40c913, //ld1r {v19.4s}, [x8] - 0x9101c048, //add x8, x2, #0x70 + 0x52a6f088, //mov w8, #0x37840000 + 0x72842108, //movk w8, #0x2108 + 0x4e040d11, //dup v17.4s, w8 0x2f10a463, //uxtl v3.4s, v3.4h - 0x4d40c915, //ld1r {v21.4s}, [x8] - 0x4e231e00, //and v0.16b, v16.16b, v3.16b + 0x321b17e8, //orr w8, wzr, #0x7e0 + 0x4e301c60, //and v0.16b, v3.16b, v16.16b + 0x4e040d12, //dup v18.4s, w8 + 0x52a74048, //mov w8, #0x3a020000 0x4e21d800, //scvtf v0.4s, v0.4s - 0x4f919010, //fmul v16.4s, v0.4s, v17.s[0] + 0x72810428, //movk w8, #0x821 + 0x6e31dc10, //fmul v16.4s, v0.4s, v17.4s 0x4ea41c80, //mov v0.16b, v4.16b - 0xbd407c51, //ldr s17, [x2, #124] - 0x4e34ce00, //fmla v0.4s, v16.4s, v20.4s - 0x4e231e70, //and v16.16b, v19.16b, v3.16b - 0x4e231ea3, //and v3.16b, v21.16b, v3.16b + 0x4e33ce00, //fmla v0.4s, v16.4s, v19.4s + 0x4f0007f0, //movi 
v16.4s, #0x1f + 0x4e040d11, //dup v17.4s, w8 + 0x52a7a088, //mov w8, #0x3d040000 + 0x4e321c72, //and v18.16b, v3.16b, v18.16b + 0x72842108, //movk w8, #0x2108 + 0x4e301c63, //and v3.16b, v3.16b, v16.16b + 0x4ea6d450, //fsub v16.4s, v2.4s, v6.4s + 0x4e21da42, //scvtf v2.4s, v18.4s + 0x6e31dc51, //fmul v17.4s, v2.4s, v17.4s + 0x4e040d02, //dup v2.4s, w8 + 0x4e21d863, //scvtf v3.4s, v3.4s 0x4ea5d433, //fsub v19.4s, v1.4s, v5.4s - 0x4e21da01, //scvtf v1.4s, v16.4s - 0x4f929030, //fmul v16.4s, v1.4s, v18.s[0] - 0x4ea6d452, //fsub v18.4s, v2.4s, v6.4s - 0x4e21d862, //scvtf v2.4s, v3.4s 0x4ea51ca1, //mov v1.16b, v5.16b - 0x4f919043, //fmul v3.4s, v2.4s, v17.s[0] + 0x6e22dc63, //fmul v3.4s, v3.4s, v2.4s 0x4ea61cc2, //mov v2.16b, v6.16b - 0x4e33ce01, //fmla v1.4s, v16.4s, v19.4s - 0x4e32cc62, //fmla v2.4s, v3.4s, v18.4s + 0x4e33ce21, //fmla v1.4s, v17.4s, v19.4s + 0x4e30cc62, //fmla v2.4s, v3.4s, v16.4s 0x4f03f603, //fmov v3.4s, #1.000000000000000000e+00 0xd61f0060, //br x3 }; CODE const uint32_t sk_load_tables_aarch64[] = { 0xa8c10c28, //ldp x8, x3, [x1], #16 - 0x9100404b, //add x11, x2, #0x10 - 0x4d40c960, //ld1r {v0.4s}, [x11] 0xd37ef409, //lsl x9, x0, #2 - 0xa9402d0a, //ldp x10, x11, [x8] + 0x6f00e620, //movi v0.2d, #0xff000000ff + 0x52a7700b, //mov w11, #0x3b800000 + 0xa940310a, //ldp x10, x12, [x8] + 0x7290102b, //movk w11, #0x8081 + 0x4e040d63, //dup v3.4s, w11 0x3ce96942, //ldr q2, [x10, x9] 0xa9412109, //ldp x9, x8, [x8, #16] - 0x4e221c01, //and v1.16b, v0.16b, v2.16b - 0x0e143c2c, //mov w12, v1.s[2] - 0xbc6c5971, //ldr s17, [x11, w12, uxtw #2] - 0x1e26002c, //fmov w12, s1 - 0x6f380443, //ushr v3.4s, v2.4s, #8 - 0x6f300450, //ushr v16.4s, v2.4s, #16 - 0x8b2c496c, //add x12, x11, w12, uxtw #2 + 0x4e201c41, //and v1.16b, v2.16b, v0.16b + 0x1e26002e, //fmov w14, s1 + 0x6f380450, //ushr v16.4s, v2.4s, #8 + 0x6f300451, //ushr v17.4s, v2.4s, #16 + 0x8b2e498e, //add x14, x12, w14, uxtw #2 0x0e0c3c2a, //mov w10, v1.s[1] + 0x0e143c2b, //mov w11, v1.s[2] 0x0e1c3c2d, //mov w13, v1.s[3] - 0x4e231c01, //and v1.16b, v0.16b, v3.16b - 0x4e301c03, //and v3.16b, v0.16b, v16.16b - 0x0d408180, //ld1 {v0.s}[0], [x12] - 0x0e143c2c, //mov w12, v1.s[2] - 0xbc6c5932, //ldr s18, [x9, w12, uxtw #2] - 0x1e26002c, //fmov w12, s1 - 0x8b2a496a, //add x10, x11, w10, uxtw #2 - 0xbc6d5970, //ldr s16, [x11, w13, uxtw #2] + 0x4e201e01, //and v1.16b, v16.16b, v0.16b + 0x4e201e30, //and v16.16b, v17.16b, v0.16b + 0x0d4081c0, //ld1 {v0.s}[0], [x14] + 0x8b2a498a, //add x10, x12, w10, uxtw #2 + 0xbc6b5991, //ldr s17, [x12, w11, uxtw #2] + 0xbc6d5992, //ldr s18, [x12, w13, uxtw #2] 0x0e0c3c2b, //mov w11, v1.s[1] + 0x0e143c2c, //mov w12, v1.s[2] 0x0e1c3c2d, //mov w13, v1.s[3] - 0x8b2c492c, //add x12, x9, w12, uxtw #2 - 0xbc6d5933, //ldr s19, [x9, w13, uxtw #2] - 0x0e0c3c6d, //mov w13, v3.s[1] + 0x1e26002e, //fmov w14, s1 + 0x8b2e492e, //add x14, x9, w14, uxtw #2 + 0xbc6c5933, //ldr s19, [x9, w12, uxtw #2] + 0xbc6d5934, //ldr s20, [x9, w13, uxtw #2] 0x8b2b4929, //add x9, x9, w11, uxtw #2 - 0x0e143c6b, //mov w11, v3.s[2] - 0x0d408181, //ld1 {v1.s}[0], [x12] - 0x0e1c3c6c, //mov w12, v3.s[3] - 0x0d409140, //ld1 {v0.s}[1], [x10] - 0x1e26006a, //fmov w10, s3 - 0xbd400c43, //ldr s3, [x2, #12] + 0x1e26020b, //fmov w11, s16 0x6f280442, //ushr v2.4s, v2.4s, #24 + 0x0d409140, //ld1 {v0.s}[1], [x10] 0x4e21d842, //scvtf v2.4s, v2.4s - 0x8b2a490a, //add x10, x8, w10, uxtw #2 - 0x4f839043, //fmul v3.4s, v2.4s, v3.s[0] + 0x8b2b490a, //add x10, x8, w11, uxtw #2 + 0x0d4081c1, //ld1 {v1.s}[0], [x14] + 0x6e23dc43, //fmul v3.4s, v2.4s, v3.4s 
0x0d408142, //ld1 {v2.s}[0], [x10] - 0x8b2d490a, //add x10, x8, w13, uxtw #2 - 0x6e140620, //mov v0.s[2], v17.s[0] - 0xbc6b5911, //ldr s17, [x8, w11, uxtw #2] + 0x0e0c3e0f, //mov w15, v16.s[1] + 0x0e143e0c, //mov w12, v16.s[2] + 0x8b2f490a, //add x10, x8, w15, uxtw #2 + 0x0e1c3e0d, //mov w13, v16.s[3] + 0xbc6c5910, //ldr s16, [x8, w12, uxtw #2] 0x0d409121, //ld1 {v1.s}[1], [x9] 0x0d409142, //ld1 {v2.s}[1], [x10] - 0x6e1c0600, //mov v0.s[3], v16.s[0] - 0xbc6c5910, //ldr s16, [x8, w12, uxtw #2] - 0x6e140641, //mov v1.s[2], v18.s[0] - 0x6e140622, //mov v2.s[2], v17.s[0] - 0x6e1c0661, //mov v1.s[3], v19.s[0] - 0x6e1c0602, //mov v2.s[3], v16.s[0] + 0x6e140620, //mov v0.s[2], v17.s[0] + 0xbc6d5911, //ldr s17, [x8, w13, uxtw #2] + 0x6e140661, //mov v1.s[2], v19.s[0] + 0x6e140602, //mov v2.s[2], v16.s[0] + 0x6e1c0640, //mov v0.s[3], v18.s[0] + 0x6e1c0681, //mov v1.s[3], v20.s[0] + 0x6e1c0622, //mov v2.s[3], v17.s[0] 0xd61f0060, //br x3 }; CODE const uint32_t sk_load_a8_aarch64[] = { 0xa8c10c28, //ldp x8, x3, [x1], #16 - 0xbd400c43, //ldr s3, [x2, #12] + 0x52a77009, //mov w9, #0x3b800000 + 0x72901029, //movk w9, #0x8081 + 0x4e040d22, //dup v2.4s, w9 + 0xf9400108, //ldr x8, [x8] 0x6f00e400, //movi v0.2d, #0x0 0x6f00e401, //movi v1.2d, #0x0 - 0xf9400108, //ldr x8, [x8] 0x8b000108, //add x8, x8, x0 - 0x39400109, //ldrb w9, [x8] - 0x3940050a, //ldrb w10, [x8, #1] - 0x3940090b, //ldrb w11, [x8, #2] + 0x3940010a, //ldrb w10, [x8] + 0x3940050b, //ldrb w11, [x8, #1] + 0x3940090c, //ldrb w12, [x8, #2] 0x39400d08, //ldrb w8, [x8, #3] - 0x4e021d22, //mov v2.h[0], w9 - 0x4e061d42, //mov v2.h[1], w10 - 0x4e0a1d62, //mov v2.h[2], w11 - 0x4e0e1d02, //mov v2.h[3], w8 - 0x2f07b7e2, //bic v2.4h, #0xff, lsl #8 - 0x2f10a442, //uxtl v2.4s, v2.4h - 0x6e21d842, //ucvtf v2.4s, v2.4s - 0x4f839043, //fmul v3.4s, v2.4s, v3.s[0] + 0x4e021d43, //mov v3.h[0], w10 + 0x4e061d63, //mov v3.h[1], w11 + 0x4e0a1d83, //mov v3.h[2], w12 + 0x4e0e1d03, //mov v3.h[3], w8 + 0x2f07b7e3, //bic v3.4h, #0xff, lsl #8 + 0x2f10a463, //uxtl v3.4s, v3.4h + 0x6e21d863, //ucvtf v3.4s, v3.4s + 0x6e22dc63, //fmul v3.4s, v3.4s, v2.4s 0x6f00e402, //movi v2.2d, #0x0 0xd61f0060, //br x3 }; CODE const uint32_t sk_store_a8_aarch64[] = { 0xf9400028, //ldr x8, [x1] - 0xbd400850, //ldr s16, [x2, #8] + 0x52a86fe9, //mov w9, #0x437f0000 + 0x4e040d30, //dup v16.4s, w9 + 0x6e30dc70, //fmul v16.4s, v3.4s, v16.4s 0xf9400108, //ldr x8, [x8] - 0x4f909070, //fmul v16.4s, v3.4s, v16.s[0] 0x6e21aa10, //fcvtnu v16.4s, v16.4s 0x0e612a10, //xtn v16.4h, v16.4s 0x0e0e3e09, //umov w9, v16.h[3] @@ -569,45 +595,52 @@ CODE const uint32_t sk_store_a8_aarch64[] = { CODE const uint32_t sk_load_565_aarch64[] = { 0xa8c10c28, //ldp x8, x3, [x1], #16 0xd37ff809, //lsl x9, x0, #1 + 0x4f072701, //movi v1.4s, #0xf8, lsl #8 + 0x4f0007e3, //movi v3.4s, #0x1f 0xf9400108, //ldr x8, [x8] 0xfc696900, //ldr d0, [x8, x9] - 0x9101a048, //add x8, x2, #0x68 - 0x4d40c901, //ld1r {v1.4s}, [x8] - 0x9101b048, //add x8, x2, #0x6c - 0x4d40c902, //ld1r {v2.4s}, [x8] - 0x9101c048, //add x8, x2, #0x70 - 0x4d40c903, //ld1r {v3.4s}, [x8] + 0x321b17e8, //orr w8, wzr, #0x7e0 + 0x4e040d02, //dup v2.4s, w8 + 0x52a6f088, //mov w8, #0x37840000 + 0x72842108, //movk w8, #0x2108 0x2f10a400, //uxtl v0.4s, v0.4h - 0x4e201c21, //and v1.16b, v1.16b, v0.16b - 0x4e201c42, //and v2.16b, v2.16b, v0.16b - 0x4e201c71, //and v17.16b, v3.16b, v0.16b - 0x2d4e8c50, //ldp s16, s3, [x2, #116] - 0x4e21d820, //scvtf v0.4s, v1.4s - 0x4e21d841, //scvtf v1.4s, v2.4s - 0x4e21da22, //scvtf v2.4s, v17.4s - 0x4f909000, //fmul v0.4s, v0.4s, 
v16.s[0] - 0xbd407c50, //ldr s16, [x2, #124] - 0x4f839021, //fmul v1.4s, v1.4s, v3.s[0] - 0x4d40c843, //ld1r {v3.4s}, [x2] - 0x4f909042, //fmul v2.4s, v2.4s, v16.s[0] + 0x4e211c01, //and v1.16b, v0.16b, v1.16b + 0x4e221c02, //and v2.16b, v0.16b, v2.16b + 0x4e231c03, //and v3.16b, v0.16b, v3.16b + 0x4e040d00, //dup v0.4s, w8 + 0x52a74048, //mov w8, #0x3a020000 + 0x72810428, //movk w8, #0x821 + 0x4e21d821, //scvtf v1.4s, v1.4s + 0x6e20dc20, //fmul v0.4s, v1.4s, v0.4s + 0x4e040d01, //dup v1.4s, w8 + 0x52a7a088, //mov w8, #0x3d040000 + 0x72842108, //movk w8, #0x2108 + 0x4e21d842, //scvtf v2.4s, v2.4s + 0x6e21dc41, //fmul v1.4s, v2.4s, v1.4s + 0x4e040d02, //dup v2.4s, w8 + 0x4e21d863, //scvtf v3.4s, v3.4s + 0x6e22dc62, //fmul v2.4s, v3.4s, v2.4s + 0x4f03f603, //fmov v3.4s, #1.000000000000000000e+00 0xd61f0060, //br x3 }; CODE const uint32_t sk_store_565_aarch64[] = { - 0x2d504450, //ldp s16, s17, [x2, #128] 0xf9400028, //ldr x8, [x1] - 0xd37ff809, //lsl x9, x0, #1 - 0x4f909012, //fmul v18.4s, v0.4s, v16.s[0] - 0x4f919031, //fmul v17.4s, v1.4s, v17.s[0] - 0x6e21aa52, //fcvtnu v18.4s, v18.4s + 0x52a84f8a, //mov w10, #0x427c0000 + 0x4f01f7f0, //fmov v16.4s, #3.100000000000000000e+01 + 0x4e040d52, //dup v18.4s, w10 + 0x6e30dc11, //fmul v17.4s, v0.4s, v16.4s + 0x6e32dc32, //fmul v18.4s, v1.4s, v18.4s 0x6e21aa31, //fcvtnu v17.4s, v17.4s + 0x6e21aa52, //fcvtnu v18.4s, v18.4s + 0x6e30dc50, //fmul v16.4s, v2.4s, v16.4s + 0x4f2b5631, //shl v17.4s, v17.4s, #11 0xf9400108, //ldr x8, [x8] - 0x4f909050, //fmul v16.4s, v2.4s, v16.s[0] - 0x4f2b5652, //shl v18.4s, v18.4s, #11 - 0x4f255631, //shl v17.4s, v17.4s, #5 - 0x4eb21e31, //orr v17.16b, v17.16b, v18.16b + 0x4f255652, //shl v18.4s, v18.4s, #5 + 0x4eb11e51, //orr v17.16b, v18.16b, v17.16b 0x6e21aa10, //fcvtnu v16.4s, v16.4s 0x4eb01e30, //orr v16.16b, v17.16b, v16.16b + 0xd37ff809, //lsl x9, x0, #1 0x0e612a10, //xtn v16.4h, v16.4s 0xfc296910, //str d16, [x8, x9] 0xf9400423, //ldr x3, [x1, #8] @@ -788,14 +821,22 @@ CODE const uint32_t sk_mirror_y_aarch64[] = { }; CODE const uint32_t sk_luminance_to_alpha_aarch64[] = { - 0x2d510c50, //ldp s16, s3, [x2, #136] - 0xbd409051, //ldr s17, [x2, #144] + 0x52a7cb28, //mov w8, #0x3e590000 + 0x72967a08, //movk w8, #0xb3d0 + 0x4e040d11, //dup v17.4s, w8 + 0x52a7e6e8, //mov w8, #0x3f370000 + 0x7282eb28, //movk w8, #0x1759 + 0x4ea01c10, //mov v16.16b, v0.16b + 0x4e040d00, //dup v0.4s, w8 + 0x52a7b268, //mov w8, #0x3d930000 0xf8408423, //ldr x3, [x1], #8 - 0x4f839023, //fmul v3.4s, v1.4s, v3.s[0] - 0x4f901003, //fmla v3.4s, v0.4s, v16.s[0] + 0x729bb308, //movk w8, #0xdd98 + 0x6e20dc23, //fmul v3.4s, v1.4s, v0.4s + 0x4e30ce23, //fmla v3.4s, v17.4s, v16.4s + 0x4e040d10, //dup v16.4s, w8 0x6f00e400, //movi v0.2d, #0x0 0x6f00e401, //movi v1.2d, #0x0 - 0x4f911043, //fmla v3.4s, v2.4s, v17.s[0] + 0x4e22ce03, //fmla v3.4s, v16.4s, v2.4s 0x6f00e402, //movi v2.2d, #0x0 0xd61f0060, //br x3 }; @@ -975,7 +1016,7 @@ CODE const uint32_t sk_seed_shader_vfp4[] = { 0xe8911008, //ldm r1, {r3, ip} 0xf3fb0620, //vcvt.f32.s32 d16, d16 0xf2c3161f, //vmov.i32 d17, #1056964608 - 0xedd23b05, //vldr d19, [r2, #20] + 0xedd23b00, //vldr d19, [r2] 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32] 0xf2872f10, //vmov.f32 d2, #1 0xf3fb2622, //vcvt.f32.s32 d18, d18 @@ -1162,111 +1203,125 @@ CODE const uint32_t sk_unpremul_vfp4[] = { }; CODE const uint32_t sk_from_srgb_vfp4[] = { - 0xed2d8b02, //vpush {d8} - 0xe282303c, //add r3, r2, #60 - 0xed928a10, //vldr s16, [r2, #64] - 0xf3402d10, //vmul.f32 d18, d0, d0 - 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32] - 0xe2823038, 
//add r3, r2, #56 - 0xf3413d11, //vmul.f32 d19, d1, d1 - 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32] - 0xe2823044, //add r3, r2, #68 - 0xf26141b1, //vorr d20, d17, d17 - 0xf26171b1, //vorr d23, d17, d17 - 0xf4e38c9f, //vld1.32 {d24[]}, [r3 :32] + 0xeddf3b20, //vldr d19, [pc, #128] + 0xf3408d10, //vmul.f32 d24, d0, d0 + 0xeddf0b1c, //vldr d16, [pc, #112] + 0xf26341b3, //vorr d20, d19, d19 + 0xf26351b3, //vorr d21, d19, d19 + 0xeddf9b1f, //vldr d25, [pc, #124] 0xf2404c30, //vfma.f32 d20, d0, d16 - 0xe2823034, //add r3, r2, #52 - 0xf2417c30, //vfma.f32 d23, d1, d16 - 0xf2421c30, //vfma.f32 d17, d2, d16 - 0xf3425d12, //vmul.f32 d21, d2, d2 - 0xf2e16948, //vmul.f32 d22, d1, d8[0] - 0xf2e00948, //vmul.f32 d16, d0, d8[0] - 0xf2e29948, //vmul.f32 d25, d2, d8[0] - 0xf3282e82, //vcgt.f32 d2, d24, d2 - 0xf3281e81, //vcgt.f32 d1, d24, d1 - 0xf3280e80, //vcgt.f32 d0, d24, d0 - 0xf4e38c9f, //vld1.32 {d24[]}, [r3 :32] - 0xf268a1b8, //vorr d26, d24, d24 - 0xf242acb4, //vfma.f32 d26, d18, d20 - 0xf26821b8, //vorr d18, d24, d24 + 0xeddf2b1b, //vldr d18, [pc, #108] + 0xf2415c30, //vfma.f32 d21, d1, d16 + 0xeddfcb1d, //vldr d28, [pc, #116] + 0xf2423c30, //vfma.f32 d19, d2, d16 0xe4913004, //ldr r3, [r1], #4 - 0xf2432cb7, //vfma.f32 d18, d19, d23 - 0xf2458cb1, //vfma.f32 d24, d21, d17 - 0xf31001ba, //vbsl d0, d16, d26 - 0xf31611b2, //vbsl d1, d22, d18 - 0xf31921b8, //vbsl d2, d25, d24 - 0xecbd8b02, //vpop {d8} + 0xf3426d12, //vmul.f32 d22, d2, d2 + 0xf3417d11, //vmul.f32 d23, d1, d1 + 0xf3620e80, //vcgt.f32 d16, d18, d0 + 0xf3621e81, //vcgt.f32 d17, d18, d1 + 0xf341ad39, //vmul.f32 d26, d1, d25 + 0xf342bd39, //vmul.f32 d27, d2, d25 + 0xf3622e82, //vcgt.f32 d18, d18, d2 + 0xf3409d39, //vmul.f32 d25, d0, d25 + 0xf26cd1bc, //vorr d29, d28, d28 + 0xf248dcb4, //vfma.f32 d29, d24, d20 + 0xf26c41bc, //vorr d20, d28, d28 + 0xf2474cb5, //vfma.f32 d20, d23, d21 + 0xf246ccb3, //vfma.f32 d28, d22, d19 + 0xf35901bd, //vbsl d16, d25, d29 + 0xf35a11b4, //vbsl d17, d26, d20 + 0xf35b21bc, //vbsl d18, d27, d28 + 0xf22001b0, //vorr d0, d16, d16 + 0xf22111b1, //vorr d1, d17, d17 + 0xf22221b2, //vorr d2, d18, d18 0xe12fff13, //bx r3 + 0x3e99999a, //.word 0x3e99999a + 0x3e99999a, //.word 0x3e99999a + 0x3f328f5c, //.word 0x3f328f5c + 0x3f328f5c, //.word 0x3f328f5c + 0x3d6147ae, //.word 0x3d6147ae + 0x3d6147ae, //.word 0x3d6147ae + 0x3d9e8391, //.word 0x3d9e8391 + 0x3d9e8391, //.word 0x3d9e8391 + 0x3b23d70a, //.word 0x3b23d70a + 0x3b23d70a, //.word 0x3b23d70a }; CODE const uint32_t sk_to_srgb_vfp4[] = { - 0xed2d8b02, //vpush {d8} - 0xf3fb0580, //vrsqrte.f32 d16, d0 - 0xe2823050, //add r3, r2, #80 + 0xf3fb0582, //vrsqrte.f32 d16, d2 + 0xe4913004, //ldr r3, [r1], #4 0xf3fb1581, //vrsqrte.f32 d17, d1 - 0xed928a12, //vldr s16, [r2, #72] - 0xf3fb2582, //vrsqrte.f32 d18, d2 + 0xf3fb2580, //vrsqrte.f32 d18, d0 0xf3403db0, //vmul.f32 d19, d16, d16 0xf3414db1, //vmul.f32 d20, d17, d17 0xf3425db2, //vmul.f32 d21, d18, d18 - 0xf2603f33, //vrsqrts.f32 d19, d0, d19 + 0xf2623f33, //vrsqrts.f32 d19, d2, d19 0xf2614f34, //vrsqrts.f32 d20, d1, d20 - 0xf2625f35, //vrsqrts.f32 d21, d2, d21 + 0xf2605f35, //vrsqrts.f32 d21, d0, d21 0xf3400db3, //vmul.f32 d16, d16, d19 0xf3411db4, //vmul.f32 d17, d17, d20 0xf3422db5, //vmul.f32 d18, d18, d21 0xf3fb3520, //vrecpe.f32 d19, d16 0xf3fb4521, //vrecpe.f32 d20, d17 0xf3fb6522, //vrecpe.f32 d22, d18 - 0xf3fb55a2, //vrsqrte.f32 d21, d18 - 0xf3fb75a0, //vrsqrte.f32 d23, d16 - 0xf3fb85a1, //vrsqrte.f32 d24, d17 + 0xf3fb55a0, //vrsqrte.f32 d21, d16 + 0xf3fb75a1, //vrsqrte.f32 d23, d17 + 0xf3fb85a2, //vrsqrte.f32 d24, d18 
0xf2409fb3, //vrecps.f32 d25, d16, d19 0xf241afb4, //vrecps.f32 d26, d17, d20 0xf242bfb6, //vrecps.f32 d27, d18, d22 0xf345cdb5, //vmul.f32 d28, d21, d21 0xf347ddb7, //vmul.f32 d29, d23, d23 0xf348edb8, //vmul.f32 d30, d24, d24 - 0xf2622fbc, //vrsqrts.f32 d18, d18, d28 - 0xf2600fbd, //vrsqrts.f32 d16, d16, d29 - 0xf2611fbe, //vrsqrts.f32 d17, d17, d30 + 0xf2600fbc, //vrsqrts.f32 d16, d16, d28 + 0xf2611fbd, //vrsqrts.f32 d17, d17, d29 + 0xf2622fbe, //vrsqrts.f32 d18, d18, d30 0xf3433db9, //vmul.f32 d19, d19, d25 - 0xf4e39c9f, //vld1.32 {d25[]}, [r3 :32] - 0xe2823054, //add r3, r2, #84 + 0xeddf9b21, //vldr d25, [pc, #132] 0xf3444dba, //vmul.f32 d20, d20, d26 + 0xeddfab21, //vldr d26, [pc, #132] 0xf3466dbb, //vmul.f32 d22, d22, d27 - 0xf4e3ac9f, //vld1.32 {d26[]}, [r3 :32] - 0xe282304c, //add r3, r2, #76 0xf26ab1ba, //vorr d27, d26, d26 - 0xf249bcb3, //vfma.f32 d27, d25, d19 + 0xf243bcb9, //vfma.f32 d27, d19, d25 0xf26a31ba, //vorr d19, d26, d26 - 0xf2493cb4, //vfma.f32 d19, d25, d20 - 0xf4e34c9f, //vld1.32 {d20[]}, [r3 :32] - 0xf249acb6, //vfma.f32 d26, d25, d22 - 0xe2823058, //add r3, r2, #88 - 0xf3452db2, //vmul.f32 d18, d21, d18 - 0xf3470db0, //vmul.f32 d16, d23, d16 - 0xf3481db1, //vmul.f32 d17, d24, d17 - 0xf2e05948, //vmul.f32 d21, d0, d8[0] - 0xf244bcb0, //vfma.f32 d27, d20, d16 - 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32] - 0xf2443cb1, //vfma.f32 d19, d20, d17 - 0xf244acb2, //vfma.f32 d26, d20, d18 - 0xf4e24c9f, //vld1.32 {d20[]}, [r2 :32] - 0xf2e11948, //vmul.f32 d17, d1, d8[0] - 0xf2e22948, //vmul.f32 d18, d2, d8[0] - 0xf3201e81, //vcgt.f32 d1, d16, d1 - 0xe4913004, //ldr r3, [r1], #4 - 0xf3200e80, //vcgt.f32 d0, d16, d0 - 0xf3202e82, //vcgt.f32 d2, d16, d2 - 0xf2640fab, //vmin.f32 d16, d20, d27 + 0xf2443cb9, //vfma.f32 d19, d20, d25 + 0xeddf4b1d, //vldr d20, [pc, #116] + 0xf246acb9, //vfma.f32 d26, d22, d25 + 0xf3450db0, //vmul.f32 d16, d21, d16 + 0xeddf5b1c, //vldr d21, [pc, #112] + 0xf3471db1, //vmul.f32 d17, d23, d17 + 0xf3482db2, //vmul.f32 d18, d24, d18 + 0xf3406d35, //vmul.f32 d22, d0, d21 + 0xf240bcb4, //vfma.f32 d27, d16, d20 + 0xf2413cb4, //vfma.f32 d19, d17, d20 + 0xf242acb4, //vfma.f32 d26, d18, d20 + 0xeddf2b17, //vldr d18, [pc, #92] + 0xf3417d35, //vmul.f32 d23, d1, d21 + 0xf3620e80, //vcgt.f32 d16, d18, d0 + 0xf3621e81, //vcgt.f32 d17, d18, d1 + 0xf3622e82, //vcgt.f32 d18, d18, d2 + 0xf3425d35, //vmul.f32 d21, d2, d21 + 0xf2c74f10, //vmov.f32 d20, #1 + 0xf2648faa, //vmin.f32 d24, d20, d26 0xf2643fa3, //vmin.f32 d19, d20, d19 - 0xf2644faa, //vmin.f32 d20, d20, d26 - 0xf31501b0, //vbsl d0, d21, d16 - 0xf31111b3, //vbsl d1, d17, d19 - 0xf31221b4, //vbsl d2, d18, d20 - 0xecbd8b02, //vpop {d8} + 0xf2644fab, //vmin.f32 d20, d20, d27 + 0xf35601b8, //vbsl d16, d22, d24 + 0xf35711b3, //vbsl d17, d23, d19 + 0xf35521b4, //vbsl d18, d21, d20 + 0xf22001b0, //vorr d0, d16, d16 + 0xf22111b1, //vorr d1, d17, d17 + 0xf22221b2, //vorr d2, d18, d18 0xe12fff13, //bx r3 + 0x3f306fce, //.word 0x3f306fce + 0x3f306fce, //.word 0x3f306fce + 0xbdca57a8, //.word 0xbdca57a8 + 0xbdca57a8, //.word 0xbdca57a8 + 0x3ed287c2, //.word 0x3ed287c2 + 0x3ed287c2, //.word 0x3ed287c2 + 0x41475c29, //.word 0x41475c29 + 0x41475c29, //.word 0x41475c29 + 0x3b8ce704, //.word 0x3b8ce704 + 0x3b8ce704, //.word 0x3b8ce704 }; CODE const uint32_t sk_scale_1_float_vfp4[] = { @@ -1360,191 +1415,205 @@ CODE const uint32_t sk_lerp_u8_vfp4[] = { }; CODE const uint32_t sk_lerp_565_vfp4[] = { - 0xed2d8b04, //vpush {d8-d9} - 0xe24dd008, //sub sp, sp, #8 + 0xe24dd004, //sub sp, sp, #4 0xe8911008, //ldm r1, {r3, ip} + 0xf3c72218, 
//vmov.i32 d18, #63488 + 0xf2c1101f, //vmov.i32 d17, #31 0xf2603d04, //vsub.f32 d19, d0, d4 - 0xf2240114, //vorr d0, d4, d4 0xe2811008, //add r1, r1, #8 0xe5933000, //ldr r3, [r3] + 0xf2616d05, //vsub.f32 d22, d1, d5 + 0xf2240114, //vorr d0, d4, d4 + 0xf2251115, //vorr d1, d5, d5 0xe7933080, //ldr r3, [r3, r0, lsl #1] - 0xe58d3004, //str r3, [sp, #4] - 0xe28d3004, //add r3, sp, #4 - 0xed923a1d, //vldr s6, [r2, #116] + 0xf2873f10, //vmov.f32 d3, #1 + 0xe58d3000, //str r3, [sp] + 0xe1a0300d, //mov r3, sp 0xf4e3083f, //vld1.32 {d16[0]}, [r3 :32] - 0xe282306c, //add r3, r2, #108 - 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32] - 0xe2823068, //add r3, r2, #104 + 0xe3a03e7e, //mov r3, #2016 0xf3d04a30, //vmovl.u16 q10, d16 - 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32] - 0xe2823070, //add r3, r2, #112 - 0xf24201b4, //vand d16, d18, d20 - 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32] - 0xf24221b4, //vand d18, d18, d20 - 0xf24111b4, //vand d17, d17, d20 + 0xee803b90, //vdup.32 d16, r3 + 0xf24421b2, //vand d18, d20, d18 + 0xf24411b1, //vand d17, d20, d17 + 0xeddf5b12, //vldr d21, [pc, #72] + 0xf24401b0, //vand d16, d20, d16 + 0xeddf4b0e, //vldr d20, [pc, #56] + 0xf3fb2622, //vcvt.f32.s32 d18, d18 0xf3fb0620, //vcvt.f32.s32 d16, d16 - 0xed928a1e, //vldr s16, [r2, #120] 0xf3fb1621, //vcvt.f32.s32 d17, d17 - 0xed929a1f, //vldr s18, [r2, #124] - 0xf3fb2622, //vcvt.f32.s32 d18, d18 - 0xf2614d05, //vsub.f32 d20, d1, d5 - 0xf2e009c3, //vmul.f32 d16, d16, d3[0] + 0xf3422db4, //vmul.f32 d18, d18, d20 + 0xeddf4b0d, //vldr d20, [pc, #52] + 0xf3400db5, //vmul.f32 d16, d16, d21 0xf2625d06, //vsub.f32 d21, d2, d6 - 0xf2e119c8, //vmul.f32 d17, d17, d8[0] - 0xf2e229c9, //vmul.f32 d18, d18, d9[0] - 0xf2251115, //vorr d1, d5, d5 + 0xf3411db4, //vmul.f32 d17, d17, d20 0xf2262116, //vorr d2, d6, d6 - 0xf2030cb0, //vfma.f32 d0, d19, d16 - 0xf2041cb1, //vfma.f32 d1, d20, d17 - 0xf2052cb2, //vfma.f32 d2, d21, d18 - 0xf2873f10, //vmov.f32 d3, #1 - 0xe28dd008, //add sp, sp, #8 - 0xecbd8b04, //vpop {d8-d9} + 0xf2030cb2, //vfma.f32 d0, d19, d18 + 0xf2061cb0, //vfma.f32 d1, d22, d16 + 0xf2052cb1, //vfma.f32 d2, d21, d17 + 0xe28dd004, //add sp, sp, #4 0xe12fff1c, //bx ip + 0xe320f000, //nop {0} + 0x37842108, //.word 0x37842108 + 0x37842108, //.word 0x37842108 + 0x3a020821, //.word 0x3a020821 + 0x3a020821, //.word 0x3a020821 + 0x3d042108, //.word 0x3d042108 + 0x3d042108, //.word 0x3d042108 }; CODE const uint32_t sk_load_tables_vfp4[] = { 0xe92d48f0, //push {r4, r5, r6, r7, fp, lr} 0xe8911008, //ldm r1, {r3, ip} - 0xe2826010, //add r6, r2, #16 + 0xf3c7001f, //vmov.i32 d16, #255 0xe2811008, //add r1, r1, #8 0xe593e000, //ldr lr, [r3] 0xe99300b0, //ldmib r3, {r4, r5, r7} - 0xf4e60c9f, //vld1.32 {d16[]}, [r6 :32] - 0xe08e6100, //add r6, lr, r0, lsl #2 - 0xedd61b00, //vldr d17, [r6] - 0xf24021b1, //vand d18, d16, d17 - 0xed922a03, //vldr s4, [r2, #12] - 0xf3f03031, //vshr.u32 d19, d17, #16 - 0xee326b90, //vmov.32 r6, d18[1] - 0xe0846106, //add r6, r4, r6, lsl #2 - 0xedd60a00, //vldr s1, [r6] + 0xe08e3100, //add r3, lr, r0, lsl #2 + 0xedd31b00, //vldr d17, [r3] + 0xf24121b0, //vand d18, d17, d16 + 0xf3f83031, //vshr.u32 d19, d17, #8 + 0xee323b90, //vmov.32 r3, d18[1] 0xee126b90, //vmov.32 r6, d18[0] - 0xf3f82031, //vshr.u32 d18, d17, #8 - 0xf24021b2, //vand d18, d16, d18 - 0xf24001b3, //vand d16, d16, d19 - 0xee103b90, //vmov.32 r3, d16[0] - 0xe0846106, //add r6, r4, r6, lsl #2 - 0xee304b90, //vmov.32 r4, d16[1] - 0xf3e80031, //vshr.u32 d16, d17, #24 - 0xed960a00, //vldr s0, [r6] + 0xf3f02031, //vshr.u32 d18, d17, #16 + 0xf24221b0, //vand d18, d18, 
d16 + 0xf24301b0, //vand d16, d19, d16 + 0xe0843103, //add r3, r4, r3, lsl #2 + 0xedd30a00, //vldr s1, [r3] + 0xe0843106, //add r3, r4, r6, lsl #2 0xee326b90, //vmov.32 r6, d18[1] + 0xed930a00, //vldr s0, [r3] + 0xee303b90, //vmov.32 r3, d16[1] + 0xee104b90, //vmov.32 r4, d16[0] + 0xf3e80031, //vshr.u32 d16, d17, #24 + 0xeddf1b0d, //vldr d17, [pc, #52] 0xf3fb0620, //vcvt.f32.s32 d16, d16 - 0xe0873103, //add r3, r7, r3, lsl #2 - 0xf2a039c2, //vmul.f32 d3, d16, d2[0] - 0xe0874104, //add r4, r7, r4, lsl #2 - 0xedd42a00, //vldr s5, [r4] - 0xe0856106, //add r6, r5, r6, lsl #2 - 0xed932a00, //vldr s4, [r3] - 0xedd61a00, //vldr s3, [r6] + 0xf3003db1, //vmul.f32 d3, d16, d17 + 0xe087e106, //add lr, r7, r6, lsl #2 0xee126b90, //vmov.32 r6, d18[0] - 0xe0856106, //add r6, r5, r6, lsl #2 - 0xed961a00, //vldr s2, [r6] + 0xe0853103, //add r3, r5, r3, lsl #2 + 0xedde2a00, //vldr s5, [lr] + 0xedd31a00, //vldr s3, [r3] + 0xe0853104, //add r3, r5, r4, lsl #2 + 0xed931a00, //vldr s2, [r3] + 0xe0873106, //add r3, r7, r6, lsl #2 + 0xed932a00, //vldr s4, [r3] 0xe8bd48f0, //pop {r4, r5, r6, r7, fp, lr} 0xe12fff1c, //bx ip + 0xe320f000, //nop {0} + 0x3b808081, //.word 0x3b808081 + 0x3b808081, //.word 0x3b808081 }; CODE const uint32_t sk_load_a8_vfp4[] = { 0xe24dd004, //sub sp, sp, #4 0xe8911008, //ldm r1, {r3, ip} 0xe2811008, //add r1, r1, #8 + 0xf2800010, //vmov.i32 d0, #0 0xf2801010, //vmov.i32 d1, #0 - 0xf2802010, //vmov.i32 d2, #0 0xe5933000, //ldr r3, [r3] + 0xf2802010, //vmov.i32 d2, #0 0xe0833000, //add r3, r3, r0 0xe1d330b0, //ldrh r3, [r3] 0xe1cd30b0, //strh r3, [sp] 0xe1a0300d, //mov r3, sp 0xf4e3041f, //vld1.16 {d16[0]}, [r3 :16] - 0xed920a03, //vldr s0, [r2, #12] 0xf3c80a30, //vmovl.u8 q8, d16 0xf3d00a30, //vmovl.u16 q8, d16 0xf3fb06a0, //vcvt.f32.u32 d16, d16 - 0xf2a039c0, //vmul.f32 d3, d16, d0[0] - 0xf2800010, //vmov.i32 d0, #0 + 0xeddf1b03, //vldr d17, [pc, #12] + 0xf3003db1, //vmul.f32 d3, d16, d17 0xe28dd004, //add sp, sp, #4 0xe12fff1c, //bx ip + 0xe320f000, //nop {0} + 0x3b808081, //.word 0x3b808081 + 0x3b808081, //.word 0x3b808081 }; CODE const uint32_t sk_store_a8_vfp4[] = { 0xe92d4800, //push {fp, lr} - 0xe2823008, //add r3, r2, #8 - 0xf2c3061f, //vmov.i32 d16, #1056964608 - 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32] + 0xeddf0b0d, //vldr d16, [pc, #52] + 0xf2c3161f, //vmov.i32 d17, #1056964608 + 0xf2431c30, //vfma.f32 d17, d3, d16 0xe5913000, //ldr r3, [r1] - 0xf2430c31, //vfma.f32 d16, d3, d17 0xe5933000, //ldr r3, [r3] - 0xf3fb07a0, //vcvt.u32.f32 d16, d16 + 0xf3fb07a1, //vcvt.u32.f32 d16, d17 0xee10eb90, //vmov.32 lr, d16[0] 0xee30cb90, //vmov.32 ip, d16[1] 0xe7e3e000, //strb lr, [r3, r0]! 
0xe5c3c001, //strb ip, [r3, #1] - 0xe5913004, //ldr r3, [r1, #4] - 0xe2811008, //add r1, r1, #8 + 0xe2813008, //add r3, r1, #8 + 0xe591c004, //ldr ip, [r1, #4] + 0xe1a01003, //mov r1, r3 0xe8bd4800, //pop {fp, lr} - 0xe12fff13, //bx r3 + 0xe12fff1c, //bx ip + 0x437f0000, //.word 0x437f0000 + 0x437f0000, //.word 0x437f0000 }; CODE const uint32_t sk_load_565_vfp4[] = { 0xe24dd004, //sub sp, sp, #4 0xe8911008, //ldm r1, {r3, ip} + 0xf2c1101f, //vmov.i32 d17, #31 + 0xf3c72218, //vmov.i32 d18, #63488 + 0xeddf3b16, //vldr d19, [pc, #88] 0xe2811008, //add r1, r1, #8 0xe5933000, //ldr r3, [r3] + 0xf2873f10, //vmov.f32 d3, #1 0xe7933080, //ldr r3, [r3, r0, lsl #1] 0xe58d3000, //str r3, [sp] 0xe1a0300d, //mov r3, sp 0xf4e3083f, //vld1.32 {d16[0]}, [r3 :32] - 0xe282306c, //add r3, r2, #108 - 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32] - 0xe2823068, //add r3, r2, #104 + 0xe3a03e7e, //mov r3, #2016 0xf3d04a30, //vmovl.u16 q10, d16 - 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32] - 0xe2823070, //add r3, r2, #112 - 0xf24201b4, //vand d16, d18, d20 - 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32] - 0xf24111b4, //vand d17, d17, d20 - 0xf24221b4, //vand d18, d18, d20 - 0xf4a23c9f, //vld1.32 {d3[]}, [r2 :32] + 0xee803b90, //vdup.32 d16, r3 + 0xf24411b1, //vand d17, d20, d17 + 0xeddf5b0e, //vldr d21, [pc, #56] + 0xf24421b2, //vand d18, d20, d18 + 0xf24401b0, //vand d16, d20, d16 + 0xeddf4b09, //vldr d20, [pc, #36] + 0xf3fb2622, //vcvt.f32.s32 d18, d18 0xf3fb0620, //vcvt.f32.s32 d16, d16 0xf3fb1621, //vcvt.f32.s32 d17, d17 - 0xf3fb2622, //vcvt.f32.s32 d18, d18 - 0xed920a1d, //vldr s0, [r2, #116] - 0xed921a1e, //vldr s2, [r2, #120] - 0xed922a1f, //vldr s4, [r2, #124] - 0xf2a009c0, //vmul.f32 d0, d16, d0[0] - 0xf2a119c1, //vmul.f32 d1, d17, d1[0] - 0xf2a229c2, //vmul.f32 d2, d18, d2[0] + 0xf3020db3, //vmul.f32 d0, d18, d19 + 0xf3001db4, //vmul.f32 d1, d16, d20 + 0xf3012db5, //vmul.f32 d2, d17, d21 0xe28dd004, //add sp, sp, #4 0xe12fff1c, //bx ip + 0x37842108, //.word 0x37842108 + 0x37842108, //.word 0x37842108 + 0x3a020821, //.word 0x3a020821 + 0x3a020821, //.word 0x3a020821 + 0x3d042108, //.word 0x3d042108 + 0x3d042108, //.word 0x3d042108 }; CODE const uint32_t sk_store_565_vfp4[] = { - 0xe2823080, //add r3, r2, #128 + 0xf2c30f1f, //vmov.f32 d16, #31 + 0xeddf1b15, //vldr d17, [pc, #84] 0xf2c3361f, //vmov.i32 d19, #1056964608 - 0xf2c3461f, //vmov.i32 d20, #1056964608 - 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32] - 0xe2823084, //add r3, r2, #132 - 0xf2403c31, //vfma.f32 d19, d0, d17 - 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32] - 0xf2c3061f, //vmov.i32 d16, #1056964608 - 0xf2414c32, //vfma.f32 d20, d1, d18 - 0xf2420c31, //vfma.f32 d16, d2, d17 0xe5913000, //ldr r3, [r1] + 0xf2413c31, //vfma.f32 d19, d1, d17 + 0xf2c3161f, //vmov.i32 d17, #1056964608 + 0xf2401c30, //vfma.f32 d17, d0, d16 0xe5933000, //ldr r3, [r3] - 0xf3fb17a3, //vcvt.u32.f32 d17, d19 + 0xf2c3261f, //vmov.i32 d18, #1056964608 + 0xf2422c30, //vfma.f32 d18, d2, d16 0xe0833080, //add r3, r3, r0, lsl #1 - 0xf3fb27a4, //vcvt.u32.f32 d18, d20 - 0xf3fb07a0, //vcvt.u32.f32 d16, d16 + 0xf3fb07a3, //vcvt.u32.f32 d16, d19 + 0xf3fb17a1, //vcvt.u32.f32 d17, d17 + 0xf3fb27a2, //vcvt.u32.f32 d18, d18 + 0xf2e50530, //vshl.s32 d16, d16, #5 0xf2eb1531, //vshl.s32 d17, d17, #11 - 0xf2e52532, //vshl.s32 d18, d18, #5 - 0xf26101b0, //vorr d16, d17, d16 + 0xf26001b1, //vorr d16, d16, d17 0xf26001b2, //vorr d16, d16, d18 0xf3f60121, //vuzp.16 d16, d17 0xf4c3080f, //vst1.32 {d16[0]}, [r3] - 0xe5913004, //ldr r3, [r1, #4] - 0xe2811008, //add r1, r1, #8 - 0xe12fff13, //bx r3 - 0xe320f000, //nop {0} + 
0xe2813008, //add r3, r1, #8 + 0xe591c004, //ldr ip, [r1, #4] + 0xe1a01003, //mov r1, r3 + 0xe12fff1c, //bx ip + 0x427c0000, //.word 0x427c0000 + 0x427c0000, //.word 0x427c0000 }; CODE const uint32_t sk_load_8888_vfp4[] = { @@ -1799,21 +1868,24 @@ CODE const uint32_t sk_mirror_y_vfp4[] = { }; CODE const uint32_t sk_luminance_to_alpha_vfp4[] = { - 0xed2d8b02, //vpush {d8} - 0xed923a22, //vldr s6, [r2, #136] - 0xe2823090, //add r3, r2, #144 - 0xed928a23, //vldr s16, [r2, #140] - 0xf2e01943, //vmul.f32 d17, d0, d3[0] - 0xf2e10948, //vmul.f32 d16, d1, d8[0] + 0xeddf0b0a, //vldr d16, [pc, #40] + 0xeddf1b0b, //vldr d17, [pc, #44] + 0xf3410d30, //vmul.f32 d16, d1, d16 + 0xe4913004, //ldr r3, [r1], #4 + 0xf3401d31, //vmul.f32 d17, d0, d17 0xf2800010, //vmov.i32 d0, #0 0xf2801010, //vmov.i32 d1, #0 0xf2013da0, //vadd.f32 d3, d17, d16 - 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32] - 0xf2003c92, //vfma.f32 d3, d16, d2 - 0xe4913004, //ldr r3, [r1], #4 + 0xeddf0b06, //vldr d16, [pc, #24] + 0xf2023c30, //vfma.f32 d3, d2, d16 0xf2802010, //vmov.i32 d2, #0 - 0xecbd8b02, //vpop {d8} 0xe12fff13, //bx r3 + 0x3f371759, //.word 0x3f371759 + 0x3f371759, //.word 0x3f371759 + 0x3e59b3d0, //.word 0x3e59b3d0 + 0x3e59b3d0, //.word 0x3e59b3d0 + 0x3d93dd98, //.word 0x3d93dd98 + 0x3d93dd98, //.word 0x3d93dd98 }; CODE const uint32_t sk_matrix_2x3_vfp4[] = { @@ -2085,7 +2157,7 @@ CODE const uint8_t sk_seed_shader_hsw[] = { 196,193,121,110,200, //vmovd %r8d,%xmm1 196,226,125,24,201, //vbroadcastss %xmm1,%ymm1 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0 - 197,252,88,66,20, //vaddps 0x14(%rdx),%ymm0,%ymm0 + 197,252,88,2, //vaddps (%rdx),%ymm0,%ymm0 196,226,125,24,16, //vbroadcastss (%rax),%ymm2 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1 @@ -2265,16 +2337,26 @@ CODE const uint8_t sk_unpremul_hsw[] = { }; CODE const uint8_t sk_from_srgb_hsw[] = { - 196,98,125,24,66,64, //vbroadcastss 0x40(%rdx),%ymm8 + 184,145,131,158,61, //mov $0x3d9e8391,%eax + 197,121,110,192, //vmovd %eax,%xmm8 + 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 197,124,89,208, //vmulps %ymm0,%ymm0,%ymm10 - 196,98,125,24,90,60, //vbroadcastss 0x3c(%rdx),%ymm11 - 196,98,125,24,98,56, //vbroadcastss 0x38(%rdx),%ymm12 + 184,154,153,153,62, //mov $0x3e99999a,%eax + 197,121,110,216, //vmovd %eax,%xmm11 + 196,66,125,24,219, //vbroadcastss %xmm11,%ymm11 + 184,92,143,50,63, //mov $0x3f328f5c,%eax + 197,121,110,224, //vmovd %eax,%xmm12 + 196,66,125,24,228, //vbroadcastss %xmm12,%ymm12 196,65,124,40,235, //vmovaps %ymm11,%ymm13 196,66,125,168,236, //vfmadd213ps %ymm12,%ymm0,%ymm13 - 196,98,125,24,114,52, //vbroadcastss 0x34(%rdx),%ymm14 + 184,10,215,35,59, //mov $0x3b23d70a,%eax + 197,121,110,240, //vmovd %eax,%xmm14 + 196,66,125,24,246, //vbroadcastss %xmm14,%ymm14 196,66,45,168,238, //vfmadd213ps %ymm14,%ymm10,%ymm13 - 196,98,125,24,82,68, //vbroadcastss 0x44(%rdx),%ymm10 + 184,174,71,97,61, //mov $0x3d6147ae,%eax + 197,121,110,208, //vmovd %eax,%xmm10 + 196,66,125,24,210, //vbroadcastss %xmm10,%ymm10 196,193,124,194,194,1, //vcmpltps %ymm10,%ymm0,%ymm0 196,195,21,74,193,0, //vblendvps %ymm0,%ymm9,%ymm13,%ymm0 197,60,89,201, //vmulps %ymm1,%ymm8,%ymm9 @@ -2296,37 +2378,50 @@ CODE const uint8_t sk_from_srgb_hsw[] = { CODE const uint8_t sk_to_srgb_hsw[] = { 197,124,82,192, //vrsqrtps %ymm0,%ymm8 - 196,65,124,83,200, //vrcpps %ymm8,%ymm9 - 196,65,124,82,208, //vrsqrtps %ymm8,%ymm10 - 196,98,125,24,66,72, //vbroadcastss 0x48(%rdx),%ymm8 - 197,60,89,216, //vmulps %ymm0,%ymm8,%ymm11 - 
196,98,125,24,34, //vbroadcastss (%rdx),%ymm12 - 196,98,125,24,106,76, //vbroadcastss 0x4c(%rdx),%ymm13 - 196,98,125,24,114,80, //vbroadcastss 0x50(%rdx),%ymm14 - 196,98,125,24,122,84, //vbroadcastss 0x54(%rdx),%ymm15 - 196,66,13,168,207, //vfmadd213ps %ymm15,%ymm14,%ymm9 - 196,66,21,184,202, //vfmadd231ps %ymm10,%ymm13,%ymm9 - 196,65,28,93,201, //vminps %ymm9,%ymm12,%ymm9 - 196,98,125,24,82,88, //vbroadcastss 0x58(%rdx),%ymm10 - 196,193,124,194,194,1, //vcmpltps %ymm10,%ymm0,%ymm0 - 196,195,53,74,195,0, //vblendvps %ymm0,%ymm11,%ymm9,%ymm0 - 197,124,82,201, //vrsqrtps %ymm1,%ymm9 - 196,65,124,83,217, //vrcpps %ymm9,%ymm11 - 196,65,124,82,201, //vrsqrtps %ymm9,%ymm9 - 196,66,13,168,223, //vfmadd213ps %ymm15,%ymm14,%ymm11 - 196,66,21,184,217, //vfmadd231ps %ymm9,%ymm13,%ymm11 - 197,60,89,201, //vmulps %ymm1,%ymm8,%ymm9 - 196,65,28,93,219, //vminps %ymm11,%ymm12,%ymm11 - 196,193,116,194,202,1, //vcmpltps %ymm10,%ymm1,%ymm1 - 196,195,37,74,201,16, //vblendvps %ymm1,%ymm9,%ymm11,%ymm1 - 197,124,82,202, //vrsqrtps %ymm2,%ymm9 - 196,65,124,83,217, //vrcpps %ymm9,%ymm11 + 196,65,124,83,216, //vrcpps %ymm8,%ymm11 + 196,65,124,82,224, //vrsqrtps %ymm8,%ymm12 + 184,41,92,71,65, //mov $0x41475c29,%eax + 197,121,110,192, //vmovd %eax,%xmm8 + 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 197,60,89,232, //vmulps %ymm0,%ymm8,%ymm13 + 184,0,0,128,63, //mov $0x3f800000,%eax + 197,121,110,200, //vmovd %eax,%xmm9 + 196,66,125,24,201, //vbroadcastss %xmm9,%ymm9 + 184,194,135,210,62, //mov $0x3ed287c2,%eax + 197,121,110,208, //vmovd %eax,%xmm10 + 196,66,125,24,210, //vbroadcastss %xmm10,%ymm10 + 184,206,111,48,63, //mov $0x3f306fce,%eax + 197,121,110,240, //vmovd %eax,%xmm14 + 196,66,125,24,246, //vbroadcastss %xmm14,%ymm14 + 184,168,87,202,61, //mov $0x3dca57a8,%eax + 53,0,0,0,128, //xor $0x80000000,%eax + 197,121,110,248, //vmovd %eax,%xmm15 + 196,66,125,24,255, //vbroadcastss %xmm15,%ymm15 196,66,13,168,223, //vfmadd213ps %ymm15,%ymm14,%ymm11 - 196,65,124,82,201, //vrsqrtps %ymm9,%ymm9 - 196,66,21,184,217, //vfmadd231ps %ymm9,%ymm13,%ymm11 - 196,65,28,93,203, //vminps %ymm11,%ymm12,%ymm9 + 196,66,45,184,220, //vfmadd231ps %ymm12,%ymm10,%ymm11 + 196,65,52,93,219, //vminps %ymm11,%ymm9,%ymm11 + 184,4,231,140,59, //mov $0x3b8ce704,%eax + 197,121,110,224, //vmovd %eax,%xmm12 + 196,66,125,24,228, //vbroadcastss %xmm12,%ymm12 + 196,193,124,194,196,1, //vcmpltps %ymm12,%ymm0,%ymm0 + 196,195,37,74,197,0, //vblendvps %ymm0,%ymm13,%ymm11,%ymm0 + 197,124,82,217, //vrsqrtps %ymm1,%ymm11 + 196,65,124,83,235, //vrcpps %ymm11,%ymm13 + 196,65,124,82,219, //vrsqrtps %ymm11,%ymm11 + 196,66,13,168,239, //vfmadd213ps %ymm15,%ymm14,%ymm13 + 196,66,45,184,235, //vfmadd231ps %ymm11,%ymm10,%ymm13 + 197,60,89,217, //vmulps %ymm1,%ymm8,%ymm11 + 196,65,52,93,237, //vminps %ymm13,%ymm9,%ymm13 + 196,193,116,194,204,1, //vcmpltps %ymm12,%ymm1,%ymm1 + 196,195,21,74,203,16, //vblendvps %ymm1,%ymm11,%ymm13,%ymm1 + 197,124,82,218, //vrsqrtps %ymm2,%ymm11 + 196,65,124,83,235, //vrcpps %ymm11,%ymm13 + 196,66,13,168,239, //vfmadd213ps %ymm15,%ymm14,%ymm13 + 196,65,124,82,219, //vrsqrtps %ymm11,%ymm11 + 196,66,45,184,235, //vfmadd231ps %ymm11,%ymm10,%ymm13 + 196,65,52,93,205, //vminps %ymm13,%ymm9,%ymm9 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8 - 196,193,108,194,210,1, //vcmpltps %ymm10,%ymm2,%ymm2 + 196,193,108,194,212,1, //vcmpltps %ymm12,%ymm2,%ymm2 196,195,53,74,208,32, //vblendvps %ymm2,%ymm8,%ymm9,%ymm2 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax @@ -2349,7 +2444,7 @@ CODE const uint8_t sk_scale_u8_hsw[] = { 72,139,0, //mov 
(%rax),%rax 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 - 117,56, //jne 462 <_sk_scale_u8_hsw+0x48> + 117,56, //jne 4bf <_sk_scale_u8_hsw+0x48> 197,123,16,0, //vmovsd (%rax),%xmm8 196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 @@ -2373,9 +2468,9 @@ CODE const uint8_t sk_scale_u8_hsw[] = { 77,9,217, //or %r11,%r9 72,131,193,8, //add $0x8,%rcx 73,255,202, //dec %r10 - 117,234, //jne 46a <_sk_scale_u8_hsw+0x50> + 117,234, //jne 4c7 <_sk_scale_u8_hsw+0x50> 196,65,249,110,193, //vmovq %r9,%xmm8 - 235,167, //jmp 42e <_sk_scale_u8_hsw+0x14> + 235,167, //jmp 48b <_sk_scale_u8_hsw+0x14> }; CODE const uint8_t sk_lerp_1_float_hsw[] = { @@ -2399,7 +2494,7 @@ CODE const uint8_t sk_lerp_u8_hsw[] = { 72,139,0, //mov (%rax),%rax 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 - 117,76, //jne 512 <_sk_lerp_u8_hsw+0x5c> + 117,76, //jne 56f <_sk_lerp_u8_hsw+0x5c> 197,123,16,0, //vmovsd (%rax),%xmm8 196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 @@ -2427,37 +2522,49 @@ CODE const uint8_t sk_lerp_u8_hsw[] = { 77,9,217, //or %r11,%r9 72,131,193,8, //add $0x8,%rcx 73,255,202, //dec %r10 - 117,234, //jne 51a <_sk_lerp_u8_hsw+0x64> + 117,234, //jne 577 <_sk_lerp_u8_hsw+0x64> 196,65,249,110,193, //vmovq %r9,%xmm8 - 235,147, //jmp 4ca <_sk_lerp_u8_hsw+0x14> + 235,147, //jmp 527 <_sk_lerp_u8_hsw+0x14> }; CODE const uint8_t sk_lerp_565_hsw[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,16, //mov (%rax),%r10 72,133,201, //test %rcx,%rcx - 15,133,132,0,0,0, //jne 5c9 <_sk_lerp_565_hsw+0x92> + 15,133,179,0,0,0, //jne 655 <_sk_lerp_565_hsw+0xc1> 196,193,122,111,28,122, //vmovdqu (%r10,%rdi,2),%xmm3 - 196,226,125,51,219, //vpmovzxwd %xmm3,%ymm3 - 196,98,125,88,66,104, //vpbroadcastd 0x68(%rdx),%ymm8 - 197,61,219,195, //vpand %ymm3,%ymm8,%ymm8 - 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 - 196,98,125,24,74,116, //vbroadcastss 0x74(%rdx),%ymm9 - 196,65,52,89,192, //vmulps %ymm8,%ymm9,%ymm8 - 196,98,125,88,74,108, //vpbroadcastd 0x6c(%rdx),%ymm9 - 197,53,219,203, //vpand %ymm3,%ymm9,%ymm9 - 196,65,124,91,201, //vcvtdq2ps %ymm9,%ymm9 - 196,98,125,24,82,120, //vbroadcastss 0x78(%rdx),%ymm10 - 196,65,44,89,201, //vmulps %ymm9,%ymm10,%ymm9 - 196,98,125,88,82,112, //vpbroadcastd 0x70(%rdx),%ymm10 - 197,173,219,219, //vpand %ymm3,%ymm10,%ymm3 - 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3 - 196,98,125,24,82,124, //vbroadcastss 0x7c(%rdx),%ymm10 - 197,172,89,219, //vmulps %ymm3,%ymm10,%ymm3 + 196,98,125,51,195, //vpmovzxwd %xmm3,%ymm8 + 184,0,248,0,0, //mov $0xf800,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 + 196,193,101,219,216, //vpand %ymm8,%ymm3,%ymm3 + 197,124,91,203, //vcvtdq2ps %ymm3,%ymm9 + 184,8,33,132,55, //mov $0x37842108,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 197,52,89,203, //vmulps %ymm3,%ymm9,%ymm9 + 184,224,7,0,0, //mov $0x7e0,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 + 196,193,101,219,216, //vpand %ymm8,%ymm3,%ymm3 + 197,124,91,211, //vcvtdq2ps %ymm3,%ymm10 + 184,33,8,2,58, //mov $0x3a020821,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 197,44,89,211, //vmulps %ymm3,%ymm10,%ymm10 + 184,31,0,0,0, //mov $0x1f,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 + 196,193,101,219,216, //vpand %ymm8,%ymm3,%ymm3 + 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 + 184,8,33,4,61, //mov $0x3d042108,%eax + 
197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 - 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0 + 196,226,53,168,196, //vfmadd213ps %ymm4,%ymm9,%ymm0 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1 - 196,226,53,168,205, //vfmadd213ps %ymm5,%ymm9,%ymm1 + 196,226,45,168,205, //vfmadd213ps %ymm5,%ymm10,%ymm1 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2 196,226,101,168,214, //vfmadd213ps %ymm6,%ymm3,%ymm2 184,0,0,128,63, //mov $0x3f800000,%eax @@ -2471,8 +2578,8 @@ CODE const uint8_t sk_lerp_565_hsw[] = { 65,254,200, //dec %r8b 69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 15,135,102,255,255,255, //ja 54b <_sk_lerp_565_hsw+0x14> - 76,141,13,76,0,0,0, //lea 0x4c(%rip),%r9 # 638 <_sk_lerp_565_hsw+0x101> + 15,135,55,255,255,255, //ja 5a8 <_sk_lerp_565_hsw+0x14> + 76,141,13,76,0,0,0, //lea 0x4c(%rip),%r9 # 6c4 <_sk_lerp_565_hsw+0x130> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax 255,224, //jmpq *%rax @@ -2484,13 +2591,13 @@ CODE const uint8_t sk_lerp_565_hsw[] = { 196,193,97,196,92,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm3,%xmm3 196,193,97,196,92,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm3,%xmm3 196,193,97,196,28,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm3,%xmm3 - 233,22,255,255,255, //jmpq 54b <_sk_lerp_565_hsw+0x14> + 233,231,254,255,255, //jmpq 5a8 <_sk_lerp_565_hsw+0x14> 15,31,0, //nopl (%rax) 241, //icebp 255, //(bad) 255, //(bad) 255, //(bad) - 233,255,255,255,225, //jmpq ffffffffe2000640 <_sk_linear_gradient_2stops_hsw+0xffffffffe1fff55e> + 233,255,255,255,225, //jmpq ffffffffe20006cc <_sk_linear_gradient_2stops_hsw+0xffffffffe1fff4e7> 255, //(bad) 255, //(bad) 255, //(bad) @@ -2515,9 +2622,11 @@ CODE const uint8_t sk_load_tables_hsw[] = { 76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9 76,3,8, //add (%rax),%r9 77,133,192, //test %r8,%r8 - 117,106, //jne 6d3 <_sk_load_tables_hsw+0x7f> + 117,121, //jne 76e <_sk_load_tables_hsw+0x8e> 196,193,126,111,25, //vmovdqu (%r9),%ymm3 - 196,226,125,88,82,16, //vpbroadcastd 0x10(%rdx),%ymm2 + 185,255,0,0,0, //mov $0xff,%ecx + 197,249,110,193, //vmovd %ecx,%xmm0 + 196,226,125,88,208, //vpbroadcastd %xmm0,%ymm2 197,237,219,203, //vpand %ymm3,%ymm2,%ymm1 196,65,61,118,192, //vpcmpeqd %ymm8,%ymm8,%ymm8 72,139,72,8, //mov 0x8(%rax),%rcx @@ -2533,9 +2642,11 @@ CODE const uint8_t sk_load_tables_hsw[] = { 196,65,109,219,201, //vpand %ymm9,%ymm2,%ymm9 196,162,61,146,20,136, //vgatherdps %ymm8,(%rax,%ymm9,4),%ymm2 197,229,114,211,24, //vpsrld $0x18,%ymm3,%ymm3 - 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3 - 196,98,125,24,66,12, //vbroadcastss 0xc(%rdx),%ymm8 - 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3 + 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 + 184,129,128,128,59, //mov $0x3b808081,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 72,173, //lods %ds:(%rsi),%rax 76,137,193, //mov %r8,%rcx 255,224, //jmpq *%rax @@ -2547,7 +2658,7 @@ CODE const uint8_t sk_load_tables_hsw[] = { 196,193,249,110,194, //vmovq %r10,%xmm0 196,226,125,33,192, //vpmovsxbd %xmm0,%ymm0 196,194,125,140,25, //vpmaskmovd (%r9),%ymm0,%ymm3 - 233,114,255,255,255, //jmpq 66e <_sk_load_tables_hsw+0x1a> + 233,99,255,255,255, //jmpq 6fa <_sk_load_tables_hsw+0x1a> }; CODE const uint8_t sk_load_a8_hsw[] = { @@ -2556,11 +2667,13 @@ CODE const uint8_t sk_load_a8_hsw[] = { 72,139,0, //mov (%rax),%rax 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 - 
117,42, //jne 736 <_sk_load_a8_hsw+0x3a> + 117,50, //jne 7d9 <_sk_load_a8_hsw+0x42> 197,251,16,0, //vmovsd (%rax),%xmm0 196,226,125,49,192, //vpmovzxbd %xmm0,%ymm0 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 - 196,226,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm1 + 184,129,128,128,59, //mov $0x3b808081,%eax + 197,249,110,200, //vmovd %eax,%xmm1 + 196,226,125,24,201, //vbroadcastss %xmm1,%ymm1 197,252,89,217, //vmulps %ymm1,%ymm0,%ymm3 72,173, //lods %ds:(%rsi),%rax 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 @@ -2577,22 +2690,24 @@ CODE const uint8_t sk_load_a8_hsw[] = { 77,9,217, //or %r11,%r9 72,131,193,8, //add $0x8,%rcx 73,255,202, //dec %r10 - 117,234, //jne 73e <_sk_load_a8_hsw+0x42> + 117,234, //jne 7e1 <_sk_load_a8_hsw+0x4a> 196,193,249,110,193, //vmovq %r9,%xmm0 - 235,181, //jmp 710 <_sk_load_a8_hsw+0x14> + 235,173, //jmp 7ab <_sk_load_a8_hsw+0x14> }; CODE const uint8_t sk_store_a8_hsw[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,8, //mov (%rax),%r9 - 196,98,125,24,66,8, //vbroadcastss 0x8(%rdx),%ymm8 + 184,0,0,127,67, //mov $0x437f0000,%eax + 197,121,110,192, //vmovd %eax,%xmm8 + 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 196,65,57,103,192, //vpackuswb %xmm8,%xmm8,%xmm8 72,133,201, //test %rcx,%rcx - 117,10, //jne 78e <_sk_store_a8_hsw+0x33> + 117,10, //jne 839 <_sk_store_a8_hsw+0x3b> 196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax @@ -2601,9 +2716,9 @@ CODE const uint8_t sk_store_a8_hsw[] = { 254,200, //dec %al 68,15,182,192, //movzbl %al,%r8d 65,128,248,6, //cmp $0x6,%r8b - 119,236, //ja 78a <_sk_store_a8_hsw+0x2f> + 119,236, //ja 835 <_sk_store_a8_hsw+0x37> 196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8 - 76,141,21,66,0,0,0, //lea 0x42(%rip),%r10 # 7ec <_sk_store_a8_hsw+0x91> + 76,141,21,67,0,0,0, //lea 0x43(%rip),%r10 # 898 <_sk_store_a8_hsw+0x9a> 75,99,4,130, //movslq (%r10,%r8,4),%rax 76,1,208, //add %r10,%rax 255,224, //jmpq *%rax @@ -2614,26 +2729,27 @@ CODE const uint8_t sk_store_a8_hsw[] = { 196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1) 196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1) 196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1) - 235,158, //jmp 78a <_sk_store_a8_hsw+0x2f> - 247,255, //idiv %edi + 235,158, //jmp 835 <_sk_store_a8_hsw+0x37> + 144, //nop + 246,255, //idiv %bh 255, //(bad) 255, //(bad) - 239, //out %eax,(%dx) + 238, //out %al,(%dx) 255, //(bad) 255, //(bad) - 255,231, //jmpq *%rdi + 255,230, //jmpq *%rsi 255, //(bad) 255, //(bad) 255, //(bad) - 223,255, //(bad) + 222,255, //fdivrp %st,%st(7) 255, //(bad) - 255,215, //callq *%rdi + 255,214, //callq *%rsi 255, //(bad) 255, //(bad) - 255,207, //dec %edi + 255,206, //dec %esi 255, //(bad) 255, //(bad) - 255,199, //inc %edi + 255,198, //inc %esi 255, //(bad) 255, //(bad) 255, //.byte 0xff @@ -2643,25 +2759,39 @@ CODE const uint8_t sk_load_565_hsw[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,16, //mov (%rax),%r10 72,133,201, //test %rcx,%rcx - 117,92, //jne 86e <_sk_load_565_hsw+0x66> + 15,133,149,0,0,0, //jne 957 <_sk_load_565_hsw+0xa3> 196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0 196,226,125,51,208, //vpmovzxwd %xmm0,%ymm2 - 196,226,125,88,66,104, //vpbroadcastd 0x68(%rdx),%ymm0 + 184,0,248,0,0, //mov $0xf800,%eax + 197,249,110,192, //vmovd %eax,%xmm0 + 196,226,125,88,192, //vpbroadcastd %xmm0,%ymm0 197,253,219,194, //vpand 
%ymm2,%ymm0,%ymm0 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 - 196,226,125,24,74,116, //vbroadcastss 0x74(%rdx),%ymm1 - 197,244,89,192, //vmulps %ymm0,%ymm1,%ymm0 - 196,226,125,88,74,108, //vpbroadcastd 0x6c(%rdx),%ymm1 + 184,8,33,132,55, //mov $0x37842108,%eax + 197,249,110,200, //vmovd %eax,%xmm1 + 196,226,125,24,201, //vbroadcastss %xmm1,%ymm1 + 197,252,89,193, //vmulps %ymm1,%ymm0,%ymm0 + 184,224,7,0,0, //mov $0x7e0,%eax + 197,249,110,200, //vmovd %eax,%xmm1 + 196,226,125,88,201, //vpbroadcastd %xmm1,%ymm1 197,245,219,202, //vpand %ymm2,%ymm1,%ymm1 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1 - 196,226,125,24,90,120, //vbroadcastss 0x78(%rdx),%ymm3 - 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1 - 196,226,125,88,90,112, //vpbroadcastd 0x70(%rdx),%ymm3 + 184,33,8,2,58, //mov $0x3a020821,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 197,244,89,203, //vmulps %ymm3,%ymm1,%ymm1 + 184,31,0,0,0, //mov $0x1f,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 197,229,219,210, //vpand %ymm2,%ymm3,%ymm2 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 - 196,226,125,24,90,124, //vbroadcastss 0x7c(%rdx),%ymm3 - 197,228,89,210, //vmulps %ymm2,%ymm3,%ymm2 - 196,226,125,24,26, //vbroadcastss (%rdx),%ymm3 + 184,8,33,4,61, //mov $0x3d042108,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2 + 184,0,0,128,63, //mov $0x3f800000,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax 65,137,200, //mov %ecx,%r8d @@ -2670,8 +2800,8 @@ CODE const uint8_t sk_load_565_hsw[] = { 65,254,200, //dec %r8b 69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 119,146, //ja 818 <_sk_load_565_hsw+0x10> - 76,141,13,75,0,0,0, //lea 0x4b(%rip),%r9 # 8d8 <_sk_load_565_hsw+0xd0> + 15,135,85,255,255,255, //ja 8c8 <_sk_load_565_hsw+0x14> + 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # 9c4 <_sk_load_565_hsw+0x110> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax 255,224, //jmpq *%rax @@ -2683,28 +2813,27 @@ CODE const uint8_t sk_load_565_hsw[] = { 196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0 196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0 196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0 - 233,66,255,255,255, //jmpq 818 <_sk_load_565_hsw+0x10> - 102,144, //xchg %ax,%ax - 242,255, //repnz (bad) - 255, //(bad) + 233,5,255,255,255, //jmpq 8c8 <_sk_load_565_hsw+0x14> + 144, //nop + 243,255, //repz (bad) 255, //(bad) - 234, //(bad) 255, //(bad) + 235,255, //jmp 9c9 <_sk_load_565_hsw+0x115> 255, //(bad) - 255,226, //jmpq *%rdx + 255,227, //jmpq *%rbx 255, //(bad) 255, //(bad) 255, //(bad) - 218,255, //(bad) + 219,255, //(bad) 255, //(bad) - 255,210, //callq *%rdx + 255,211, //callq *%rbx 255, //(bad) 255, //(bad) - 255,202, //dec %edx + 255,203, //dec %ebx 255, //(bad) 255, //(bad) 255, //(bad) - 190, //.byte 0xbe + 191, //.byte 0xbf 255, //(bad) 255, //(bad) 255, //.byte 0xff @@ -2713,11 +2842,15 @@ CODE const uint8_t sk_load_565_hsw[] = { CODE const uint8_t sk_store_565_hsw[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,8, //mov (%rax),%r9 - 196,98,125,24,130,128,0,0,0, //vbroadcastss 0x80(%rdx),%ymm8 + 184,0,0,248,65, //mov $0x41f80000,%eax + 197,121,110,192, //vmovd %eax,%xmm8 + 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 196,65,125,91,201, //vcvtps2dq 
%ymm9,%ymm9 196,193,53,114,241,11, //vpslld $0xb,%ymm9,%ymm9 - 196,98,125,24,146,132,0,0,0, //vbroadcastss 0x84(%rdx),%ymm10 + 184,0,0,124,66, //mov $0x427c0000,%eax + 197,121,110,208, //vmovd %eax,%xmm10 + 196,66,125,24,210, //vbroadcastss %xmm10,%ymm10 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10 196,193,45,114,242,5, //vpslld $0x5,%ymm10,%ymm10 @@ -2728,7 +2861,7 @@ CODE const uint8_t sk_store_565_hsw[] = { 196,67,125,57,193,1, //vextracti128 $0x1,%ymm8,%xmm9 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 72,133,201, //test %rcx,%rcx - 117,10, //jne 956 <_sk_store_565_hsw+0x62> + 117,10, //jne a4c <_sk_store_565_hsw+0x6c> 196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax @@ -2737,8 +2870,8 @@ CODE const uint8_t sk_store_565_hsw[] = { 254,200, //dec %al 68,15,182,192, //movzbl %al,%r8d 65,128,248,6, //cmp $0x6,%r8b - 119,236, //ja 952 <_sk_store_565_hsw+0x5e> - 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # 9b4 <_sk_store_565_hsw+0xc0> + 119,236, //ja a48 <_sk_store_565_hsw+0x68> + 76,141,21,69,0,0,0, //lea 0x45(%rip),%r10 # aa8 <_sk_store_565_hsw+0xc8> 75,99,4,130, //movslq (%r10,%r8,4),%rax 76,1,208, //add %r10,%rax 255,224, //jmpq *%rax @@ -2750,27 +2883,28 @@ CODE const uint8_t sk_store_565_hsw[] = { 196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2) 197,121,126,192, //vmovd %xmm8,%eax 102,65,137,4,121, //mov %ax,(%r9,%rdi,2) - 235,161, //jmp 952 <_sk_store_565_hsw+0x5e> - 15,31,0, //nopl (%rax) - 242,255, //repnz (bad) + 235,161, //jmp a48 <_sk_store_565_hsw+0x68> + 144, //nop + 244, //hlt 255, //(bad) 255, //(bad) - 234, //(bad) 255, //(bad) + 236, //in (%dx),%al 255, //(bad) - 255,226, //jmpq *%rdx 255, //(bad) + 255,228, //jmpq *%rsp 255, //(bad) 255, //(bad) - 218,255, //(bad) 255, //(bad) - 255,210, //callq *%rdx + 220,255, //fdivr %st,%st(7) 255, //(bad) + 255,212, //callq *%rsp 255, //(bad) - 255,202, //dec %edx + 255, //(bad) + 255,204, //dec %esp 255, //(bad) 255, //(bad) - 255,194, //inc %edx + 255,196, //inc %esp 255, //(bad) 255, //(bad) 255, //.byte 0xff @@ -2782,7 +2916,7 @@ CODE const uint8_t sk_load_8888_hsw[] = { 76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9 76,3,8, //add (%rax),%r9 77,133,192, //test %r8,%r8 - 117,104, //jne a4d <_sk_load_8888_hsw+0x7d> + 117,104, //jne b41 <_sk_load_8888_hsw+0x7d> 196,193,126,111,25, //vmovdqu (%r9),%ymm3 184,255,0,0,0, //mov $0xff,%eax 197,249,110,192, //vmovd %eax,%xmm0 @@ -2815,7 +2949,7 @@ CODE const uint8_t sk_load_8888_hsw[] = { 196,225,249,110,192, //vmovq %rax,%xmm0 196,226,125,33,192, //vpmovsxbd %xmm0,%ymm0 196,194,125,140,25, //vpmaskmovd (%r9),%ymm0,%ymm3 - 233,116,255,255,255, //jmpq 9ea <_sk_load_8888_hsw+0x1a> + 233,116,255,255,255, //jmpq ade <_sk_load_8888_hsw+0x1a> }; CODE const uint8_t sk_store_8888_hsw[] = { @@ -2841,7 +2975,7 @@ CODE const uint8_t sk_store_8888_hsw[] = { 196,65,45,235,192, //vpor %ymm8,%ymm10,%ymm8 196,65,53,235,192, //vpor %ymm8,%ymm9,%ymm8 77,133,192, //test %r8,%r8 - 117,12, //jne aea <_sk_store_8888_hsw+0x74> + 117,12, //jne bde <_sk_store_8888_hsw+0x74> 196,65,126,127,1, //vmovdqu %ymm8,(%r9) 72,173, //lods %ds:(%rsi),%rax 76,137,193, //mov %r8,%rcx @@ -2854,14 +2988,14 @@ CODE const uint8_t sk_store_8888_hsw[] = { 196,97,249,110,200, //vmovq %rax,%xmm9 196,66,125,33,201, //vpmovsxbd %xmm9,%ymm9 196,66,53,142,1, //vpmaskmovd %ymm8,%ymm9,(%r9) - 235,211, //jmp ae3 <_sk_store_8888_hsw+0x6d> + 235,211, //jmp bd7 <_sk_store_8888_hsw+0x6d> }; CODE const uint8_t sk_load_f16_hsw[] = { 
72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax 72,133,201, //test %rcx,%rcx - 117,97, //jne b7b <_sk_load_f16_hsw+0x6b> + 117,97, //jne c6f <_sk_load_f16_hsw+0x6b> 197,249,16,12,248, //vmovupd (%rax,%rdi,8),%xmm1 197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2 197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3 @@ -2887,35 +3021,35 @@ CODE const uint8_t sk_load_f16_hsw[] = { 197,251,16,12,248, //vmovsd (%rax,%rdi,8),%xmm1 196,65,57,87,192, //vxorpd %xmm8,%xmm8,%xmm8 72,131,249,1, //cmp $0x1,%rcx - 117,6, //jne b91 <_sk_load_f16_hsw+0x81> + 117,6, //jne c85 <_sk_load_f16_hsw+0x81> 197,250,126,201, //vmovq %xmm1,%xmm1 - 235,30, //jmp baf <_sk_load_f16_hsw+0x9f> + 235,30, //jmp ca3 <_sk_load_f16_hsw+0x9f> 197,241,22,76,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm1,%xmm1 72,131,249,3, //cmp $0x3,%rcx - 114,18, //jb baf <_sk_load_f16_hsw+0x9f> + 114,18, //jb ca3 <_sk_load_f16_hsw+0x9f> 197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2 72,131,249,3, //cmp $0x3,%rcx - 117,19, //jne bbc <_sk_load_f16_hsw+0xac> + 117,19, //jne cb0 <_sk_load_f16_hsw+0xac> 197,250,126,210, //vmovq %xmm2,%xmm2 - 235,46, //jmp bdd <_sk_load_f16_hsw+0xcd> + 235,46, //jmp cd1 <_sk_load_f16_hsw+0xcd> 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2 - 233,117,255,255,255, //jmpq b31 <_sk_load_f16_hsw+0x21> + 233,117,255,255,255, //jmpq c25 <_sk_load_f16_hsw+0x21> 197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2 72,131,249,5, //cmp $0x5,%rcx - 114,21, //jb bdd <_sk_load_f16_hsw+0xcd> + 114,21, //jb cd1 <_sk_load_f16_hsw+0xcd> 197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3 72,131,249,5, //cmp $0x5,%rcx - 117,18, //jne be6 <_sk_load_f16_hsw+0xd6> + 117,18, //jne cda <_sk_load_f16_hsw+0xd6> 197,250,126,219, //vmovq %xmm3,%xmm3 - 233,84,255,255,255, //jmpq b31 <_sk_load_f16_hsw+0x21> + 233,84,255,255,255, //jmpq c25 <_sk_load_f16_hsw+0x21> 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 - 233,75,255,255,255, //jmpq b31 <_sk_load_f16_hsw+0x21> + 233,75,255,255,255, //jmpq c25 <_sk_load_f16_hsw+0x21> 197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3 72,131,249,7, //cmp $0x7,%rcx - 15,130,59,255,255,255, //jb b31 <_sk_load_f16_hsw+0x21> + 15,130,59,255,255,255, //jb c25 <_sk_load_f16_hsw+0x21> 197,123,16,68,248,48, //vmovsd 0x30(%rax,%rdi,8),%xmm8 - 233,48,255,255,255, //jmpq b31 <_sk_load_f16_hsw+0x21> + 233,48,255,255,255, //jmpq c25 <_sk_load_f16_hsw+0x21> }; CODE const uint8_t sk_store_f16_hsw[] = { @@ -2934,7 +3068,7 @@ CODE const uint8_t sk_store_f16_hsw[] = { 196,65,57,98,205, //vpunpckldq %xmm13,%xmm8,%xmm9 196,65,57,106,197, //vpunpckhdq %xmm13,%xmm8,%xmm8 72,133,201, //test %rcx,%rcx - 117,27, //jne c66 <_sk_store_f16_hsw+0x65> + 117,27, //jne d5a <_sk_store_f16_hsw+0x65> 197,120,17,28,248, //vmovups %xmm11,(%rax,%rdi,8) 197,120,17,84,248,16, //vmovups %xmm10,0x10(%rax,%rdi,8) 197,120,17,76,248,32, //vmovups %xmm9,0x20(%rax,%rdi,8) @@ -2943,22 +3077,22 @@ CODE const uint8_t sk_store_f16_hsw[] = { 255,224, //jmpq *%rax 197,121,214,28,248, //vmovq %xmm11,(%rax,%rdi,8) 72,131,249,1, //cmp $0x1,%rcx - 116,241, //je c62 <_sk_store_f16_hsw+0x61> + 116,241, //je d56 <_sk_store_f16_hsw+0x61> 197,121,23,92,248,8, //vmovhpd %xmm11,0x8(%rax,%rdi,8) 72,131,249,3, //cmp $0x3,%rcx - 114,229, //jb c62 <_sk_store_f16_hsw+0x61> + 114,229, //jb d56 <_sk_store_f16_hsw+0x61> 197,121,214,84,248,16, //vmovq %xmm10,0x10(%rax,%rdi,8) - 116,221, //je c62 <_sk_store_f16_hsw+0x61> + 116,221, //je d56 <_sk_store_f16_hsw+0x61> 197,121,23,84,248,24, //vmovhpd 
%xmm10,0x18(%rax,%rdi,8) 72,131,249,5, //cmp $0x5,%rcx - 114,209, //jb c62 <_sk_store_f16_hsw+0x61> + 114,209, //jb d56 <_sk_store_f16_hsw+0x61> 197,121,214,76,248,32, //vmovq %xmm9,0x20(%rax,%rdi,8) - 116,201, //je c62 <_sk_store_f16_hsw+0x61> + 116,201, //je d56 <_sk_store_f16_hsw+0x61> 197,121,23,76,248,40, //vmovhpd %xmm9,0x28(%rax,%rdi,8) 72,131,249,7, //cmp $0x7,%rcx - 114,189, //jb c62 <_sk_store_f16_hsw+0x61> + 114,189, //jb d56 <_sk_store_f16_hsw+0x61> 197,121,214,68,248,48, //vmovq %xmm8,0x30(%rax,%rdi,8) - 235,181, //jmp c62 <_sk_store_f16_hsw+0x61> + 235,181, //jmp d56 <_sk_store_f16_hsw+0x61> }; CODE const uint8_t sk_store_f32_hsw[] = { @@ -2974,7 +3108,7 @@ CODE const uint8_t sk_store_f32_hsw[] = { 196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8 196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11 72,133,201, //test %rcx,%rcx - 117,55, //jne d1a <_sk_store_f32_hsw+0x6d> + 117,55, //jne e0e <_sk_store_f32_hsw+0x6d> 196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12 196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13 196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9 @@ -2987,22 +3121,22 @@ CODE const uint8_t sk_store_f32_hsw[] = { 255,224, //jmpq *%rax 196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4) 72,131,249,1, //cmp $0x1,%rcx - 116,240, //je d16 <_sk_store_f32_hsw+0x69> + 116,240, //je e0a <_sk_store_f32_hsw+0x69> 196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4) 72,131,249,3, //cmp $0x3,%rcx - 114,227, //jb d16 <_sk_store_f32_hsw+0x69> + 114,227, //jb e0a <_sk_store_f32_hsw+0x69> 196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4) - 116,218, //je d16 <_sk_store_f32_hsw+0x69> + 116,218, //je e0a <_sk_store_f32_hsw+0x69> 196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4) 72,131,249,5, //cmp $0x5,%rcx - 114,205, //jb d16 <_sk_store_f32_hsw+0x69> + 114,205, //jb e0a <_sk_store_f32_hsw+0x69> 196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4) - 116,195, //je d16 <_sk_store_f32_hsw+0x69> + 116,195, //je e0a <_sk_store_f32_hsw+0x69> 196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4) 72,131,249,7, //cmp $0x7,%rcx - 114,181, //jb d16 <_sk_store_f32_hsw+0x69> + 114,181, //jb e0a <_sk_store_f32_hsw+0x69> 196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4) - 235,171, //jmp d16 <_sk_store_f32_hsw+0x69> + 235,171, //jmp e0a <_sk_store_f32_hsw+0x69> }; CODE const uint8_t sk_clamp_x_hsw[] = { @@ -3098,11 +3232,17 @@ CODE const uint8_t sk_mirror_y_hsw[] = { }; CODE const uint8_t sk_luminance_to_alpha_hsw[] = { - 196,98,125,24,130,136,0,0,0, //vbroadcastss 0x88(%rdx),%ymm8 - 196,226,125,24,154,140,0,0,0, //vbroadcastss 0x8c(%rdx),%ymm3 + 184,208,179,89,62, //mov $0x3e59b3d0,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,98,125,24,195, //vbroadcastss %xmm3,%ymm8 + 184,89,23,55,63, //mov $0x3f371759,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1 196,98,125,168,193, //vfmadd213ps %ymm1,%ymm0,%ymm8 - 196,226,125,24,154,144,0,0,0, //vbroadcastss 0x90(%rdx),%ymm3 + 184,152,221,147,61, //mov $0x3d93dd98,%eax + 197,249,110,192, //vmovd %eax,%xmm0 + 196,226,125,24,216, //vbroadcastss %xmm0,%ymm3 196,194,109,168,216, //vfmadd213ps %ymm8,%ymm2,%ymm3 72,173, //lods %ds:(%rsi),%rax 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 @@ -3322,7 +3462,7 @@ CODE const uint8_t sk_seed_shader_avx[] = { 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 197,252,88,193, 
//vaddps %ymm1,%ymm0,%ymm0 - 197,252,88,66,20, //vaddps 0x14(%rdx),%ymm0,%ymm0 + 197,252,88,2, //vaddps (%rdx),%ymm0,%ymm0 196,226,125,24,16, //vbroadcastss (%rax),%ymm2 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1 @@ -3516,23 +3656,38 @@ CODE const uint8_t sk_unpremul_avx[] = { }; CODE const uint8_t sk_from_srgb_avx[] = { - 196,98,125,24,66,64, //vbroadcastss 0x40(%rdx),%ymm8 + 184,145,131,158,61, //mov $0x3d9e8391,%eax + 197,121,110,192, //vmovd %eax,%xmm8 + 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 + 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 197,124,89,208, //vmulps %ymm0,%ymm0,%ymm10 - 196,98,125,24,90,60, //vbroadcastss 0x3c(%rdx),%ymm11 - 196,98,125,24,98,56, //vbroadcastss 0x38(%rdx),%ymm12 + 184,154,153,153,62, //mov $0x3e99999a,%eax + 197,121,110,216, //vmovd %eax,%xmm11 + 196,67,121,4,219,0, //vpermilps $0x0,%xmm11,%xmm11 + 196,67,37,24,219,1, //vinsertf128 $0x1,%xmm11,%ymm11,%ymm11 + 184,92,143,50,63, //mov $0x3f328f5c,%eax + 197,121,110,224, //vmovd %eax,%xmm12 + 196,67,121,4,228,0, //vpermilps $0x0,%xmm12,%xmm12 + 196,67,29,24,228,1, //vinsertf128 $0x1,%xmm12,%ymm12,%ymm12 197,36,89,232, //vmulps %ymm0,%ymm11,%ymm13 196,65,20,88,236, //vaddps %ymm12,%ymm13,%ymm13 - 196,98,125,24,114,52, //vbroadcastss 0x34(%rdx),%ymm14 + 184,10,215,35,59, //mov $0x3b23d70a,%eax + 197,121,110,240, //vmovd %eax,%xmm14 + 196,67,121,4,246,0, //vpermilps $0x0,%xmm14,%xmm14 + 196,67,13,24,246,1, //vinsertf128 $0x1,%xmm14,%ymm14,%ymm14 196,65,44,89,213, //vmulps %ymm13,%ymm10,%ymm10 196,65,12,88,210, //vaddps %ymm10,%ymm14,%ymm10 - 196,98,125,24,106,68, //vbroadcastss 0x44(%rdx),%ymm13 + 184,174,71,97,61, //mov $0x3d6147ae,%eax + 197,121,110,232, //vmovd %eax,%xmm13 + 196,67,121,4,237,0, //vpermilps $0x0,%xmm13,%xmm13 + 196,67,21,24,237,1, //vinsertf128 $0x1,%xmm13,%ymm13,%ymm13 196,193,124,194,197,1, //vcmpltps %ymm13,%ymm0,%ymm0 196,195,45,74,193,0, //vblendvps %ymm0,%ymm9,%ymm10,%ymm0 197,60,89,201, //vmulps %ymm1,%ymm8,%ymm9 197,116,89,209, //vmulps %ymm1,%ymm1,%ymm10 197,36,89,249, //vmulps %ymm1,%ymm11,%ymm15 - 196,65,4,88,252, //vaddps %ymm12,%ymm15,%ymm15 + 196,65,28,88,255, //vaddps %ymm15,%ymm12,%ymm15 196,65,44,89,215, //vmulps %ymm15,%ymm10,%ymm10 196,65,12,88,210, //vaddps %ymm10,%ymm14,%ymm10 196,193,116,194,205,1, //vcmpltps %ymm13,%ymm1,%ymm1 @@ -3540,7 +3695,7 @@ CODE const uint8_t sk_from_srgb_avx[] = { 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8 197,108,89,202, //vmulps %ymm2,%ymm2,%ymm9 197,36,89,210, //vmulps %ymm2,%ymm11,%ymm10 - 196,65,44,88,212, //vaddps %ymm12,%ymm10,%ymm10 + 196,65,28,88,210, //vaddps %ymm10,%ymm12,%ymm10 196,65,52,89,202, //vmulps %ymm10,%ymm9,%ymm9 196,65,12,88,201, //vaddps %ymm9,%ymm14,%ymm9 196,193,108,194,213,1, //vcmpltps %ymm13,%ymm2,%ymm2 @@ -3551,43 +3706,62 @@ CODE const uint8_t sk_from_srgb_avx[] = { CODE const uint8_t sk_to_srgb_avx[] = { 197,124,82,192, //vrsqrtps %ymm0,%ymm8 - 196,65,124,83,200, //vrcpps %ymm8,%ymm9 - 196,65,124,82,208, //vrsqrtps %ymm8,%ymm10 - 196,98,125,24,66,72, //vbroadcastss 0x48(%rdx),%ymm8 - 197,60,89,216, //vmulps %ymm0,%ymm8,%ymm11 - 196,98,125,24,34, //vbroadcastss (%rdx),%ymm12 - 196,98,125,24,106,76, //vbroadcastss 0x4c(%rdx),%ymm13 - 196,98,125,24,114,80, //vbroadcastss 0x50(%rdx),%ymm14 - 196,98,125,24,122,84, //vbroadcastss 0x54(%rdx),%ymm15 - 196,65,52,89,206, //vmulps %ymm14,%ymm9,%ymm9 - 196,65,52,88,207, //vaddps %ymm15,%ymm9,%ymm9 - 196,65,44,89,213, //vmulps %ymm13,%ymm10,%ymm10 - 196,65,44,88,201, //vaddps 
%ymm9,%ymm10,%ymm9 - 196,65,28,93,201, //vminps %ymm9,%ymm12,%ymm9 - 196,98,125,24,82,88, //vbroadcastss 0x58(%rdx),%ymm10 - 196,193,124,194,194,1, //vcmpltps %ymm10,%ymm0,%ymm0 - 196,195,53,74,195,0, //vblendvps %ymm0,%ymm11,%ymm9,%ymm0 - 197,124,82,201, //vrsqrtps %ymm1,%ymm9 - 196,65,124,83,217, //vrcpps %ymm9,%ymm11 - 196,65,124,82,201, //vrsqrtps %ymm9,%ymm9 - 196,65,12,89,219, //vmulps %ymm11,%ymm14,%ymm11 - 196,65,4,88,219, //vaddps %ymm11,%ymm15,%ymm11 - 196,65,20,89,201, //vmulps %ymm9,%ymm13,%ymm9 - 196,65,52,88,203, //vaddps %ymm11,%ymm9,%ymm9 - 197,60,89,217, //vmulps %ymm1,%ymm8,%ymm11 - 196,65,28,93,201, //vminps %ymm9,%ymm12,%ymm9 - 196,193,116,194,202,1, //vcmpltps %ymm10,%ymm1,%ymm1 - 196,195,53,74,203,16, //vblendvps %ymm1,%ymm11,%ymm9,%ymm1 - 197,124,82,202, //vrsqrtps %ymm2,%ymm9 - 196,65,124,83,217, //vrcpps %ymm9,%ymm11 - 196,65,12,89,219, //vmulps %ymm11,%ymm14,%ymm11 + 196,65,124,83,232, //vrcpps %ymm8,%ymm13 + 196,65,124,82,240, //vrsqrtps %ymm8,%ymm14 + 184,41,92,71,65, //mov $0x41475c29,%eax + 197,121,110,192, //vmovd %eax,%xmm8 + 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 + 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 + 197,60,89,224, //vmulps %ymm0,%ymm8,%ymm12 + 184,0,0,128,63, //mov $0x3f800000,%eax + 197,121,110,200, //vmovd %eax,%xmm9 + 196,67,121,4,201,0, //vpermilps $0x0,%xmm9,%xmm9 + 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9 + 184,194,135,210,62, //mov $0x3ed287c2,%eax + 197,121,110,208, //vmovd %eax,%xmm10 + 196,67,121,4,210,0, //vpermilps $0x0,%xmm10,%xmm10 + 196,67,45,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm10,%ymm10 + 184,206,111,48,63, //mov $0x3f306fce,%eax + 197,121,110,216, //vmovd %eax,%xmm11 + 196,67,121,4,219,0, //vpermilps $0x0,%xmm11,%xmm11 + 196,67,37,24,219,1, //vinsertf128 $0x1,%xmm11,%ymm11,%ymm11 + 184,168,87,202,61, //mov $0x3dca57a8,%eax + 53,0,0,0,128, //xor $0x80000000,%eax + 197,121,110,248, //vmovd %eax,%xmm15 + 196,67,121,4,255,0, //vpermilps $0x0,%xmm15,%xmm15 + 196,67,5,24,255,1, //vinsertf128 $0x1,%xmm15,%ymm15,%ymm15 + 196,65,20,89,235, //vmulps %ymm11,%ymm13,%ymm13 + 196,65,20,88,239, //vaddps %ymm15,%ymm13,%ymm13 + 196,65,12,89,242, //vmulps %ymm10,%ymm14,%ymm14 + 196,65,12,88,237, //vaddps %ymm13,%ymm14,%ymm13 + 196,65,52,93,237, //vminps %ymm13,%ymm9,%ymm13 + 184,4,231,140,59, //mov $0x3b8ce704,%eax + 197,121,110,240, //vmovd %eax,%xmm14 + 196,67,121,4,246,0, //vpermilps $0x0,%xmm14,%xmm14 + 196,67,13,24,246,1, //vinsertf128 $0x1,%xmm14,%ymm14,%ymm14 + 196,193,124,194,198,1, //vcmpltps %ymm14,%ymm0,%ymm0 + 196,195,21,74,196,0, //vblendvps %ymm0,%ymm12,%ymm13,%ymm0 + 197,124,82,225, //vrsqrtps %ymm1,%ymm12 + 196,65,124,83,236, //vrcpps %ymm12,%ymm13 + 196,65,124,82,228, //vrsqrtps %ymm12,%ymm12 + 196,65,36,89,237, //vmulps %ymm13,%ymm11,%ymm13 + 196,65,4,88,237, //vaddps %ymm13,%ymm15,%ymm13 + 196,65,44,89,228, //vmulps %ymm12,%ymm10,%ymm12 + 196,65,28,88,229, //vaddps %ymm13,%ymm12,%ymm12 + 197,60,89,233, //vmulps %ymm1,%ymm8,%ymm13 + 196,65,52,93,228, //vminps %ymm12,%ymm9,%ymm12 + 196,193,116,194,206,1, //vcmpltps %ymm14,%ymm1,%ymm1 + 196,195,29,74,205,16, //vblendvps %ymm1,%ymm13,%ymm12,%ymm1 + 197,124,82,226, //vrsqrtps %ymm2,%ymm12 + 196,65,124,83,236, //vrcpps %ymm12,%ymm13 + 196,65,36,89,221, //vmulps %ymm13,%ymm11,%ymm11 196,65,4,88,219, //vaddps %ymm11,%ymm15,%ymm11 - 196,65,124,82,201, //vrsqrtps %ymm9,%ymm9 - 196,65,20,89,201, //vmulps %ymm9,%ymm13,%ymm9 - 196,65,52,88,203, //vaddps %ymm11,%ymm9,%ymm9 - 196,65,28,93,201, //vminps %ymm9,%ymm12,%ymm9 + 196,65,124,82,228, 
//vrsqrtps %ymm12,%ymm12 + 196,65,44,89,212, //vmulps %ymm12,%ymm10,%ymm10 + 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10 + 196,65,52,93,202, //vminps %ymm10,%ymm9,%ymm9 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8 - 196,193,108,194,210,1, //vcmpltps %ymm10,%ymm2,%ymm2 + 196,193,108,194,214,1, //vcmpltps %ymm14,%ymm2,%ymm2 196,195,53,74,208,32, //vblendvps %ymm2,%ymm8,%ymm9,%ymm2 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax @@ -3610,7 +3784,7 @@ CODE const uint8_t sk_scale_u8_avx[] = { 72,139,0, //mov (%rax),%rax 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 - 117,80, //jne 4f8 <_sk_scale_u8_avx+0x60> + 117,80, //jne 5a2 <_sk_scale_u8_avx+0x60> 197,123,16,0, //vmovsd (%rax),%xmm8 196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9 196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8 @@ -3638,9 +3812,9 @@ CODE const uint8_t sk_scale_u8_avx[] = { 77,9,217, //or %r11,%r9 72,131,193,8, //add $0x8,%rcx 73,255,202, //dec %r10 - 117,234, //jne 500 <_sk_scale_u8_avx+0x68> + 117,234, //jne 5aa <_sk_scale_u8_avx+0x68> 196,65,249,110,193, //vmovq %r9,%xmm8 - 235,143, //jmp 4ac <_sk_scale_u8_avx+0x14> + 235,143, //jmp 556 <_sk_scale_u8_avx+0x14> }; CODE const uint8_t sk_lerp_1_float_avx[] = { @@ -3668,7 +3842,7 @@ CODE const uint8_t sk_lerp_u8_avx[] = { 72,139,0, //mov (%rax),%rax 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 - 117,116, //jne 5e0 <_sk_lerp_u8_avx+0x84> + 117,116, //jne 68a <_sk_lerp_u8_avx+0x84> 197,123,16,0, //vmovsd (%rax),%xmm8 196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9 196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8 @@ -3704,41 +3878,59 @@ CODE const uint8_t sk_lerp_u8_avx[] = { 77,9,217, //or %r11,%r9 72,131,193,8, //add $0x8,%rcx 73,255,202, //dec %r10 - 117,234, //jne 5e8 <_sk_lerp_u8_avx+0x8c> + 117,234, //jne 692 <_sk_lerp_u8_avx+0x8c> 196,65,249,110,193, //vmovq %r9,%xmm8 - 233,104,255,255,255, //jmpq 570 <_sk_lerp_u8_avx+0x14> + 233,104,255,255,255, //jmpq 61a <_sk_lerp_u8_avx+0x14> }; CODE const uint8_t sk_lerp_565_avx[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,16, //mov (%rax),%r10 72,133,201, //test %rcx,%rcx - 15,133,164,0,0,0, //jne 6ba <_sk_lerp_565_avx+0xb2> + 15,133,250,0,0,0, //jne 7ba <_sk_lerp_565_avx+0x108> 196,65,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm8 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3 197,185,105,219, //vpunpckhwd %xmm3,%xmm8,%xmm3 196,66,121,51,192, //vpmovzxwd %xmm8,%xmm8 - 196,227,61,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm3 - 196,98,125,24,66,104, //vbroadcastss 0x68(%rdx),%ymm8 - 197,60,84,195, //vandps %ymm3,%ymm8,%ymm8 - 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 - 196,98,125,24,74,116, //vbroadcastss 0x74(%rdx),%ymm9 - 196,65,52,89,192, //vmulps %ymm8,%ymm9,%ymm8 - 196,98,125,24,74,108, //vbroadcastss 0x6c(%rdx),%ymm9 - 197,52,84,203, //vandps %ymm3,%ymm9,%ymm9 - 196,65,124,91,201, //vcvtdq2ps %ymm9,%ymm9 - 196,98,125,24,82,120, //vbroadcastss 0x78(%rdx),%ymm10 - 196,65,44,89,201, //vmulps %ymm9,%ymm10,%ymm9 - 196,98,125,24,82,112, //vbroadcastss 0x70(%rdx),%ymm10 - 197,172,84,219, //vandps %ymm3,%ymm10,%ymm3 - 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3 - 196,98,125,24,82,124, //vbroadcastss 0x7c(%rdx),%ymm10 - 197,172,89,219, //vmulps %ymm3,%ymm10,%ymm3 + 196,99,61,24,195,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm8 + 184,0,248,0,0, //mov $0xf800,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 + 196,193,100,84,216, //vandps %ymm8,%ymm3,%ymm3 + 197,124,91,203, //vcvtdq2ps %ymm3,%ymm9 + 184,8,33,132,55, //mov 
$0x37842108,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 + 197,52,89,203, //vmulps %ymm3,%ymm9,%ymm9 + 184,224,7,0,0, //mov $0x7e0,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 + 196,193,100,84,216, //vandps %ymm8,%ymm3,%ymm3 + 197,124,91,211, //vcvtdq2ps %ymm3,%ymm10 + 184,33,8,2,58, //mov $0x3a020821,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 + 197,44,89,211, //vmulps %ymm3,%ymm10,%ymm10 + 184,31,0,0,0, //mov $0x1f,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 + 196,193,100,84,216, //vandps %ymm8,%ymm3,%ymm3 + 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 + 184,8,33,4,61, //mov $0x3d042108,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 + 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 - 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0 + 196,193,124,89,193, //vmulps %ymm9,%ymm0,%ymm0 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1 - 196,193,116,89,201, //vmulps %ymm9,%ymm1,%ymm1 + 196,193,116,89,202, //vmulps %ymm10,%ymm1,%ymm1 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2 @@ -3755,8 +3947,8 @@ CODE const uint8_t sk_lerp_565_avx[] = { 65,254,200, //dec %r8b 69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 15,135,69,255,255,255, //ja 61c <_sk_lerp_565_avx+0x14> - 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # 728 <_sk_lerp_565_avx+0x120> + 15,135,239,254,255,255, //ja 6c6 <_sk_lerp_565_avx+0x14> + 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # 828 <_sk_lerp_565_avx+0x176> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax 255,224, //jmpq *%rax @@ -3768,12 +3960,12 @@ CODE const uint8_t sk_lerp_565_avx[] = { 196,65,57,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm8,%xmm8 196,65,57,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm8,%xmm8 196,65,57,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm8,%xmm8 - 233,245,254,255,255, //jmpq 61c <_sk_lerp_565_avx+0x14> + 233,159,254,255,255, //jmpq 6c6 <_sk_lerp_565_avx+0x14> 144, //nop 243,255, //repz (bad) 255, //(bad) 255, //(bad) - 235,255, //jmp 72d <_sk_lerp_565_avx+0x125> + 235,255, //jmp 82d <_sk_lerp_565_avx+0x17b> 255, //(bad) 255,227, //jmpq *%rbx 255, //(bad) @@ -3804,9 +3996,12 @@ CODE const uint8_t sk_load_tables_avx[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,0, //mov (%rax),%r8 72,133,201, //test %rcx,%rcx - 15,133,18,2,0,0, //jne 96e <_sk_load_tables_avx+0x22a> + 15,133,56,2,0,0, //jne a94 <_sk_load_tables_avx+0x250> 196,65,124,16,4,184, //vmovups (%r8,%rdi,4),%ymm8 - 196,98,125,24,74,16, //vbroadcastss 0x10(%rdx),%ymm9 + 187,255,0,0,0, //mov $0xff,%ebx + 197,249,110,195, //vmovd %ebx,%xmm0 + 197,249,112,192,0, //vpshufd $0x0,%xmm0,%xmm0 + 196,99,125,24,200,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm9 196,193,52,84,192, //vandps %ymm8,%ymm9,%ymm0 196,193,249,126,193, //vmovq %xmm0,%r9 69,137,203, //mov %r9d,%r11d @@ -3825,12 +4020,14 @@ CODE const uint8_t sk_load_tables_avx[] = { 76,139,64,16, //mov 0x10(%rax),%r8 
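Throughout these _avx hunks, each old vbroadcastss 0x..(%rdx) load becomes a four-instruction sequence — mov $imm,%eax; vmovd %eax,%xmm; vpermilps $0x0 (or vpshufd $0x0 for integer masks); vinsertf128 $0x1 — because AVX1's vbroadcastss only takes a memory source, so a register-resident constant has to be splatted by hand. A minimal intrinsics sketch of the same idea (the helper name is illustrative, not from the Skia sources):

    #include <immintrin.h>
    #include <stdint.h>
    #include <string.h>

    // Splat a float constant across a ymm register without a memory load,
    // mirroring the mov/vmovd/vpermilps/vinsertf128 sequence above.
    static inline __m256 broadcast_avx(float f) {
        uint32_t bits;
        memcpy(&bits, &f, sizeof bits);          // constant starts life in a GPR (mov $imm,%eax)
        __m128 lo = _mm_castsi128_ps(_mm_cvtsi32_si128((int)bits));  // vmovd %eax,%xmm
        lo = _mm_permute_ps(lo, 0x00);           // vpermilps $0x0: splat lane 0 across the xmm
        return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), lo, 1);  // vinsertf128 $0x1
    }

On Haswell the same constants collapse to mov/vmovd/vbroadcastss (or vpbroadcastd for masks), since AVX2 added register-source broadcasts — which is exactly the difference between the _hsw and _avx versions of these stages.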
196,161,122,16,68,189,0, //vmovss 0x0(%rbp,%r15,4),%xmm0 196,163,121,33,68,165,0,16, //vinsertps $0x10,0x0(%rbp,%r12,4),%xmm0,%xmm0 - 196,163,121,33,68,173,0,32, //vinsertps $0x20,0x0(%rbp,%r13,4),%xmm0,%xmm0 + 196,161,122,16,76,173,0, //vmovss 0x0(%rbp,%r13,4),%xmm1 + 196,227,121,33,193,32, //vinsertps $0x20,%xmm1,%xmm0,%xmm0 197,250,16,76,157,0, //vmovss 0x0(%rbp,%rbx,4),%xmm1 196,227,121,33,193,48, //vinsertps $0x30,%xmm1,%xmm0,%xmm0 196,161,122,16,76,157,0, //vmovss 0x0(%rbp,%r11,4),%xmm1 196,163,113,33,76,141,0,16, //vinsertps $0x10,0x0(%rbp,%r9,4),%xmm1,%xmm1 - 196,163,113,33,76,181,0,32, //vinsertps $0x20,0x0(%rbp,%r14,4),%xmm1,%xmm1 + 196,161,122,16,92,181,0, //vmovss 0x0(%rbp,%r14,4),%xmm3 + 196,227,113,33,203,32, //vinsertps $0x20,%xmm3,%xmm1,%xmm1 196,161,122,16,92,149,0, //vmovss 0x0(%rbp,%r10,4),%xmm3 196,227,113,33,203,48, //vinsertps $0x30,%xmm3,%xmm1,%xmm1 196,227,117,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm1,%ymm0 @@ -3899,9 +4096,12 @@ CODE const uint8_t sk_load_tables_avx[] = { 196,193,57,114,208,24, //vpsrld $0x18,%xmm8,%xmm8 196,193,97,114,210,24, //vpsrld $0x18,%xmm10,%xmm3 196,227,61,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm3 - 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3 - 196,98,125,24,66,12, //vbroadcastss 0xc(%rdx),%ymm8 - 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3 + 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 + 184,129,128,128,59, //mov $0x3b808081,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 + 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 72,173, //lods %ds:(%rsi),%rax 91, //pop %rbx 65,92, //pop %r12 @@ -3910,17 +4110,17 @@ CODE const uint8_t sk_load_tables_avx[] = { 65,95, //pop %r15 93, //pop %rbp 255,224, //jmpq *%rax - 65,137,201, //mov %ecx,%r9d - 65,128,225,7, //and $0x7,%r9b + 137,203, //mov %ecx,%ebx + 128,227,7, //and $0x7,%bl 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 - 65,254,201, //dec %r9b - 69,15,182,201, //movzbl %r9b,%r9d - 65,128,249,6, //cmp $0x6,%r9b - 15,135,215,253,255,255, //ja 762 <_sk_load_tables_avx+0x1e> - 76,141,21,138,0,0,0, //lea 0x8a(%rip),%r10 # a1c <_sk_load_tables_avx+0x2d8> - 79,99,12,138, //movslq (%r10,%r9,4),%r9 - 77,1,209, //add %r10,%r9 - 65,255,225, //jmpq *%r9 + 254,203, //dec %bl + 15,182,219, //movzbl %bl,%ebx + 128,251,6, //cmp $0x6,%bl + 15,135,182,253,255,255, //ja 862 <_sk_load_tables_avx+0x1e> + 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # b3c <_sk_load_tables_avx+0x2f8> + 73,99,28,153, //movslq (%r9,%rbx,4),%rbx + 76,1,203, //add %r9,%rbx + 255,227, //jmpq *%rbx 196,193,121,110,68,184,24, //vmovd 0x18(%r8,%rdi,4),%xmm0 197,249,112,192,68, //vpshufd $0x44,%xmm0,%xmm0 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0 @@ -3940,7 +4140,7 @@ CODE const uint8_t sk_load_tables_avx[] = { 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8 196,195,57,34,4,184,0, //vpinsrd $0x0,(%r8,%rdi,4),%xmm8,%xmm0 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8 - 233,70,253,255,255, //jmpq 762 <_sk_load_tables_avx+0x1e> + 233,38,253,255,255, //jmpq 862 <_sk_load_tables_avx+0x1e> 238, //out %al,(%dx) 255, //(bad) 255, //(bad) @@ -3967,14 +4167,17 @@ CODE const uint8_t sk_load_a8_avx[] = { 72,139,0, //mov (%rax),%rax 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 - 117,59, //jne a83 <_sk_load_a8_avx+0x4b> + 117,74, //jne bb2 <_sk_load_a8_avx+0x5a> 197,251,16,0, //vmovsd (%rax),%xmm0 196,226,121,49,200, //vpmovzxbd %xmm0,%xmm1 196,227,121,4,192,229, //vpermilps $0xe5,%xmm0,%xmm0 196,226,121,49,192, 
//vpmovzxbd %xmm0,%xmm0 196,227,117,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm1,%ymm0 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 - 196,226,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm1 + 184,129,128,128,59, //mov $0x3b808081,%eax + 197,249,110,200, //vmovd %eax,%xmm1 + 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 + 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 197,252,89,217, //vmulps %ymm1,%ymm0,%ymm3 72,173, //lods %ds:(%rsi),%rax 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 @@ -3991,22 +4194,25 @@ CODE const uint8_t sk_load_a8_avx[] = { 77,9,217, //or %r11,%r9 72,131,193,8, //add $0x8,%rcx 73,255,202, //dec %r10 - 117,234, //jne a8b <_sk_load_a8_avx+0x53> + 117,234, //jne bba <_sk_load_a8_avx+0x62> 196,193,249,110,193, //vmovq %r9,%xmm0 - 235,164, //jmp a4c <_sk_load_a8_avx+0x14> + 235,149, //jmp b6c <_sk_load_a8_avx+0x14> }; CODE const uint8_t sk_store_a8_avx[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,8, //mov (%rax),%r9 - 196,98,125,24,66,8, //vbroadcastss 0x8(%rdx),%ymm8 + 184,0,0,127,67, //mov $0x437f0000,%eax + 197,121,110,192, //vmovd %eax,%xmm8 + 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 + 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 196,65,57,103,192, //vpackuswb %xmm8,%xmm8,%xmm8 72,133,201, //test %rcx,%rcx - 117,10, //jne adb <_sk_store_a8_avx+0x33> + 117,10, //jne c19 <_sk_store_a8_avx+0x42> 196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax @@ -4015,9 +4221,9 @@ CODE const uint8_t sk_store_a8_avx[] = { 254,200, //dec %al 68,15,182,192, //movzbl %al,%r8d 65,128,248,6, //cmp $0x6,%r8b - 119,236, //ja ad7 <_sk_store_a8_avx+0x2f> + 119,236, //ja c15 <_sk_store_a8_avx+0x3e> 196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8 - 76,141,21,69,0,0,0, //lea 0x45(%rip),%r10 # b3c <_sk_store_a8_avx+0x94> + 76,141,21,67,0,0,0, //lea 0x43(%rip),%r10 # c78 <_sk_store_a8_avx+0xa1> 75,99,4,130, //movslq (%r10,%r8,4),%rax 76,1,208, //add %r10,%rax 255,224, //jmpq *%rax @@ -4028,28 +4234,27 @@ CODE const uint8_t sk_store_a8_avx[] = { 196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1) 196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1) 196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1) - 235,158, //jmp ad7 <_sk_store_a8_avx+0x2f> - 15,31,0, //nopl (%rax) - 244, //hlt - 255, //(bad) + 235,158, //jmp c15 <_sk_store_a8_avx+0x3e> + 144, //nop + 246,255, //idiv %bh 255, //(bad) 255, //(bad) - 236, //in (%dx),%al + 238, //out %al,(%dx) 255, //(bad) 255, //(bad) - 255,228, //jmpq *%rsp + 255,230, //jmpq *%rsi 255, //(bad) 255, //(bad) 255, //(bad) - 220,255, //fdivr %st,%st(7) + 222,255, //fdivrp %st,%st(7) 255, //(bad) - 255,212, //callq *%rsp + 255,214, //callq *%rsi 255, //(bad) 255, //(bad) - 255,204, //dec %esp + 255,206, //dec %esi 255, //(bad) 255, //(bad) - 255,196, //inc %esp + 255,198, //inc %esi 255, //(bad) 255, //(bad) 255, //.byte 0xff @@ -4059,28 +4264,49 @@ CODE const uint8_t sk_load_565_avx[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,16, //mov (%rax),%r10 72,133,201, //test %rcx,%rcx - 117,106, //jne bcc <_sk_load_565_avx+0x74> + 15,133,209,0,0,0, //jne d73 <_sk_load_565_avx+0xdf> 196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0 197,241,239,201, //vpxor %xmm1,%xmm1,%xmm1 197,249,105,201, //vpunpckhwd %xmm1,%xmm0,%xmm1 196,226,121,51,192, //vpmovzxwd %xmm0,%xmm0 196,227,125,24,209,1, //vinsertf128 
$0x1,%xmm1,%ymm0,%ymm2 - 196,226,125,24,66,104, //vbroadcastss 0x68(%rdx),%ymm0 + 184,0,248,0,0, //mov $0xf800,%eax + 197,249,110,192, //vmovd %eax,%xmm0 + 197,249,112,192,0, //vpshufd $0x0,%xmm0,%xmm0 + 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0 197,252,84,194, //vandps %ymm2,%ymm0,%ymm0 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 - 196,226,125,24,74,116, //vbroadcastss 0x74(%rdx),%ymm1 - 197,244,89,192, //vmulps %ymm0,%ymm1,%ymm0 - 196,226,125,24,74,108, //vbroadcastss 0x6c(%rdx),%ymm1 + 184,8,33,132,55, //mov $0x37842108,%eax + 197,249,110,200, //vmovd %eax,%xmm1 + 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 + 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 + 197,252,89,193, //vmulps %ymm1,%ymm0,%ymm0 + 184,224,7,0,0, //mov $0x7e0,%eax + 197,249,110,200, //vmovd %eax,%xmm1 + 197,249,112,201,0, //vpshufd $0x0,%xmm1,%xmm1 + 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 197,244,84,202, //vandps %ymm2,%ymm1,%ymm1 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1 - 196,226,125,24,90,120, //vbroadcastss 0x78(%rdx),%ymm3 - 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1 - 196,226,125,24,90,112, //vbroadcastss 0x70(%rdx),%ymm3 + 184,33,8,2,58, //mov $0x3a020821,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 + 197,244,89,203, //vmulps %ymm3,%ymm1,%ymm1 + 184,31,0,0,0, //mov $0x1f,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 197,228,84,210, //vandps %ymm2,%ymm3,%ymm2 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 - 196,226,125,24,90,124, //vbroadcastss 0x7c(%rdx),%ymm3 - 197,228,89,210, //vmulps %ymm2,%ymm3,%ymm2 - 196,226,125,24,26, //vbroadcastss (%rdx),%ymm3 + 184,8,33,4,61, //mov $0x3d042108,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 + 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2 + 184,0,0,128,63, //mov $0x3f800000,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax 65,137,200, //mov %ecx,%r8d @@ -4089,8 +4315,8 @@ CODE const uint8_t sk_load_565_avx[] = { 65,254,200, //dec %r8b 69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 119,132, //ja b68 <_sk_load_565_avx+0x10> - 76,141,13,73,0,0,0, //lea 0x49(%rip),%r9 # c34 <_sk_load_565_avx+0xdc> + 15,135,25,255,255,255, //ja ca8 <_sk_load_565_avx+0x14> + 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # de0 <_sk_load_565_avx+0x14c> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax 255,224, //jmpq *%rax @@ -4102,27 +4328,27 @@ CODE const uint8_t sk_load_565_avx[] = { 196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0 196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0 196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0 - 233,52,255,255,255, //jmpq b68 <_sk_load_565_avx+0x10> - 244, //hlt - 255, //(bad) + 233,201,254,255,255, //jmpq ca8 <_sk_load_565_avx+0x14> + 144, //nop + 243,255, //repz (bad) 255, //(bad) 255, //(bad) - 236, //in (%dx),%al + 235,255, //jmp de5 <_sk_load_565_avx+0x151> 255, //(bad) + 255,227, //jmpq *%rbx 255, //(bad) - 255,228, //jmpq *%rsp 255, //(bad) 255, //(bad) + 219,255, //(bad) 255, //(bad) - 220,255, //fdivr %st,%st(7) + 255,211, //callq *%rbx 255, 
//(bad) - 255,212, //callq *%rsp 255, //(bad) + 255,203, //dec %ebx 255, //(bad) - 255,204, //dec %esp 255, //(bad) 255, //(bad) - 255,192, //inc %eax + 191, //.byte 0xbf 255, //(bad) 255, //(bad) 255, //.byte 0xff @@ -4131,14 +4357,20 @@ CODE const uint8_t sk_load_565_avx[] = { CODE const uint8_t sk_store_565_avx[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,8, //mov (%rax),%r9 - 196,98,125,24,130,128,0,0,0, //vbroadcastss 0x80(%rdx),%ymm8 + 184,0,0,248,65, //mov $0x41f80000,%eax + 197,121,110,192, //vmovd %eax,%xmm8 + 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 + 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9 196,193,41,114,241,11, //vpslld $0xb,%xmm9,%xmm10 196,67,125,25,201,1, //vextractf128 $0x1,%ymm9,%xmm9 196,193,49,114,241,11, //vpslld $0xb,%xmm9,%xmm9 196,67,45,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm9 - 196,98,125,24,146,132,0,0,0, //vbroadcastss 0x84(%rdx),%ymm10 + 184,0,0,124,66, //mov $0x427c0000,%eax + 197,121,110,208, //vmovd %eax,%xmm10 + 196,67,121,4,210,0, //vpermilps $0x0,%xmm10,%xmm10 + 196,67,45,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm10,%ymm10 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10 196,193,33,114,242,5, //vpslld $0x5,%xmm10,%xmm11 @@ -4152,7 +4384,7 @@ CODE const uint8_t sk_store_565_avx[] = { 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 72,133,201, //test %rcx,%rcx - 117,10, //jne cd6 <_sk_store_565_avx+0x86> + 117,10, //jne e9a <_sk_store_565_avx+0x9e> 196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax @@ -4161,8 +4393,8 @@ CODE const uint8_t sk_store_565_avx[] = { 254,200, //dec %al 68,15,182,192, //movzbl %al,%r8d 65,128,248,6, //cmp $0x6,%r8b - 119,236, //ja cd2 <_sk_store_565_avx+0x82> - 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # d34 <_sk_store_565_avx+0xe4> + 119,236, //ja e96 <_sk_store_565_avx+0x9a> + 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # ef8 <_sk_store_565_avx+0xfc> 75,99,4,130, //movslq (%r10,%r8,4),%rax 76,1,208, //add %r10,%rax 255,224, //jmpq *%rax @@ -4174,7 +4406,7 @@ CODE const uint8_t sk_store_565_avx[] = { 196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2) 197,121,126,192, //vmovd %xmm8,%eax 102,65,137,4,121, //mov %ax,(%r9,%rdi,2) - 235,161, //jmp cd2 <_sk_store_565_avx+0x82> + 235,161, //jmp e96 <_sk_store_565_avx+0x9a> 15,31,0, //nopl (%rax) 242,255, //repnz (bad) 255, //(bad) @@ -4204,7 +4436,7 @@ CODE const uint8_t sk_load_8888_avx[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,16, //mov (%rax),%r10 72,133,201, //test %rcx,%rcx - 15,133,157,0,0,0, //jne dfb <_sk_load_8888_avx+0xab> + 15,133,157,0,0,0, //jne fbf <_sk_load_8888_avx+0xab> 196,65,124,16,12,186, //vmovups (%r10,%rdi,4),%ymm9 184,255,0,0,0, //mov $0xff,%eax 197,249,110,192, //vmovd %eax,%xmm0 @@ -4243,8 +4475,8 @@ CODE const uint8_t sk_load_8888_avx[] = { 65,254,200, //dec %r8b 69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 15,135,76,255,255,255, //ja d64 <_sk_load_8888_avx+0x14> - 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # ea8 <_sk_load_8888_avx+0x158> + 15,135,76,255,255,255, //ja f28 <_sk_load_8888_avx+0x14> + 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # 106c <_sk_load_8888_avx+0x158> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax 255,224, //jmpq *%rax @@ -4267,7 +4499,7 @@ CODE const uint8_t sk_load_8888_avx[] = { 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9 
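In sk_store_565_avx above, the packing constants 31.0f (0x41f80000) and 63.0f (0x427c0000) are now built in %eax as well; the pack itself is unchanged: scale each channel to its field width, convert, shift red into bits 15:11 and green into bits 10:5, then OR. A scalar sketch of that arithmetic (hypothetical helper; rounding is approximated here with +0.5f where the vector code relies on vcvtps2dq's round-to-nearest):

    #include <stdint.h>

    // Pack normalized RGB floats into one 16-bit 565 pixel, following the
    // mul/convert/shift/or structure of sk_store_565 above.
    static inline uint16_t pack_565(float r, float g, float b) {
        uint16_t R = (uint16_t)(r * 31.0f + 0.5f);   // 0x41f80000 == 31.0f
        uint16_t G = (uint16_t)(g * 63.0f + 0.5f);   // 0x427c0000 == 63.0f
        uint16_t B = (uint16_t)(b * 31.0f + 0.5f);
        return (uint16_t)((R << 11) | (G << 5) | B); // vpslld $0xb / $0x5, then or
    }

The matching sk_load_565 stages run this in reverse: mask with 0xf800/0x7e0/0x1f, convert, then multiply by the reciprocal scales (0x37842108 == 1/0xf800, 0x3a020821 == 1/0x7e0, 0x3d042108 == 1/0x1f), all likewise synthesized via %eax.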
196,195,49,34,4,186,0, //vpinsrd $0x0,(%r10,%rdi,4),%xmm9,%xmm0 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9 - 233,188,254,255,255, //jmpq d64 <_sk_load_8888_avx+0x14> + 233,188,254,255,255, //jmpq f28 <_sk_load_8888_avx+0x14> 238, //out %al,(%dx) 255, //(bad) 255, //(bad) @@ -4319,7 +4551,7 @@ CODE const uint8_t sk_store_8888_avx[] = { 196,65,45,86,192, //vorpd %ymm8,%ymm10,%ymm8 196,65,53,86,192, //vorpd %ymm8,%ymm9,%ymm8 72,133,201, //test %rcx,%rcx - 117,10, //jne f68 <_sk_store_8888_avx+0xa4> + 117,10, //jne 112c <_sk_store_8888_avx+0xa4> 196,65,124,17,4,185, //vmovups %ymm8,(%r9,%rdi,4) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax @@ -4328,8 +4560,8 @@ CODE const uint8_t sk_store_8888_avx[] = { 254,200, //dec %al 68,15,182,192, //movzbl %al,%r8d 65,128,248,6, //cmp $0x6,%r8b - 119,236, //ja f64 <_sk_store_8888_avx+0xa0> - 76,141,21,85,0,0,0, //lea 0x55(%rip),%r10 # fd4 <_sk_store_8888_avx+0x110> + 119,236, //ja 1128 <_sk_store_8888_avx+0xa0> + 76,141,21,85,0,0,0, //lea 0x55(%rip),%r10 # 1198 <_sk_store_8888_avx+0x110> 75,99,4,130, //movslq (%r10,%r8,4),%rax 76,1,208, //add %r10,%rax 255,224, //jmpq *%rax @@ -4343,7 +4575,7 @@ CODE const uint8_t sk_store_8888_avx[] = { 196,67,121,22,68,185,8,2, //vpextrd $0x2,%xmm8,0x8(%r9,%rdi,4) 196,67,121,22,68,185,4,1, //vpextrd $0x1,%xmm8,0x4(%r9,%rdi,4) 196,65,121,126,4,185, //vmovd %xmm8,(%r9,%rdi,4) - 235,147, //jmp f64 <_sk_store_8888_avx+0xa0> + 235,147, //jmp 1128 <_sk_store_8888_avx+0xa0> 15,31,0, //nopl (%rax) 245, //cmc 255, //(bad) @@ -4374,7 +4606,7 @@ CODE const uint8_t sk_load_f16_avx[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax 72,133,201, //test %rcx,%rcx - 15,133,240,0,0,0, //jne 10ee <_sk_load_f16_avx+0xfe> + 15,133,8,1,0,0, //jne 12ca <_sk_load_f16_avx+0x116> 197,249,16,12,248, //vmovupd (%rax,%rdi,8),%xmm1 197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2 197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3 @@ -4385,42 +4617,46 @@ CODE const uint8_t sk_load_f16_avx[] = { 196,193,97,105,216, //vpunpckhwd %xmm8,%xmm3,%xmm3 197,121,97,193, //vpunpcklwd %xmm1,%xmm0,%xmm8 197,249,105,193, //vpunpckhwd %xmm1,%xmm0,%xmm0 - 197,233,97,203, //vpunpcklwd %xmm3,%xmm2,%xmm1 + 197,105,97,211, //vpunpcklwd %xmm3,%xmm2,%xmm10 197,105,105,203, //vpunpckhwd %xmm3,%xmm2,%xmm9 - 197,249,110,90,100, //vmovd 0x64(%rdx),%xmm3 + 184,0,4,0,4, //mov $0x4000400,%eax + 197,249,110,216, //vmovd %eax,%xmm3 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 196,193,97,101,208, //vpcmpgtw %xmm8,%xmm3,%xmm2 - 196,65,105,223,192, //vpandn %xmm8,%xmm2,%xmm8 - 197,225,101,208, //vpcmpgtw %xmm0,%xmm3,%xmm2 - 197,233,223,192, //vpandn %xmm0,%xmm2,%xmm0 - 197,225,101,209, //vpcmpgtw %xmm1,%xmm3,%xmm2 - 197,233,223,201, //vpandn %xmm1,%xmm2,%xmm1 - 196,193,97,101,209, //vpcmpgtw %xmm9,%xmm3,%xmm2 - 196,193,105,223,209, //vpandn %xmm9,%xmm2,%xmm2 - 196,66,121,51,208, //vpmovzxwd %xmm8,%xmm10 + 196,193,105,223,208, //vpandn %xmm8,%xmm2,%xmm2 + 197,225,101,200, //vpcmpgtw %xmm0,%xmm3,%xmm1 + 197,241,223,192, //vpandn %xmm0,%xmm1,%xmm0 + 196,193,97,101,202, //vpcmpgtw %xmm10,%xmm3,%xmm1 + 196,193,113,223,202, //vpandn %xmm10,%xmm1,%xmm1 + 196,193,97,101,217, //vpcmpgtw %xmm9,%xmm3,%xmm3 + 196,193,97,223,217, //vpandn %xmm9,%xmm3,%xmm3 + 196,98,121,51,194, //vpmovzxwd %xmm2,%xmm8 196,98,121,51,201, //vpmovzxwd %xmm1,%xmm9 - 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3 - 197,57,105,195, //vpunpckhwd %xmm3,%xmm8,%xmm8 - 197,241,105,203, //vpunpckhwd %xmm3,%xmm1,%xmm1 + 196,65,41,239,210, //vpxor %xmm10,%xmm10,%xmm10 + 
196,193,105,105,210, //vpunpckhwd %xmm10,%xmm2,%xmm2 + 196,193,113,105,202, //vpunpckhwd %xmm10,%xmm1,%xmm1 196,98,121,51,216, //vpmovzxwd %xmm0,%xmm11 - 196,98,121,51,226, //vpmovzxwd %xmm2,%xmm12 - 197,121,105,235, //vpunpckhwd %xmm3,%xmm0,%xmm13 - 197,105,105,243, //vpunpckhwd %xmm3,%xmm2,%xmm14 - 196,193,121,114,242,13, //vpslld $0xd,%xmm10,%xmm0 - 196,193,105,114,241,13, //vpslld $0xd,%xmm9,%xmm2 - 196,227,125,24,194,1, //vinsertf128 $0x1,%xmm2,%ymm0,%ymm0 - 196,98,125,24,74,92, //vbroadcastss 0x5c(%rdx),%ymm9 + 196,98,121,51,227, //vpmovzxwd %xmm3,%xmm12 + 196,65,121,105,234, //vpunpckhwd %xmm10,%xmm0,%xmm13 + 196,65,97,105,210, //vpunpckhwd %xmm10,%xmm3,%xmm10 + 196,193,121,114,240,13, //vpslld $0xd,%xmm8,%xmm0 + 196,193,97,114,241,13, //vpslld $0xd,%xmm9,%xmm3 + 196,227,125,24,195,1, //vinsertf128 $0x1,%xmm3,%ymm0,%ymm0 + 184,0,0,128,119, //mov $0x77800000,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 + 196,99,101,24,203,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm9 197,180,89,192, //vmulps %ymm0,%ymm9,%ymm0 - 196,193,105,114,240,13, //vpslld $0xd,%xmm8,%xmm2 + 197,233,114,242,13, //vpslld $0xd,%xmm2,%xmm2 197,241,114,241,13, //vpslld $0xd,%xmm1,%xmm1 196,227,109,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm2,%ymm1 197,180,89,201, //vmulps %ymm1,%ymm9,%ymm1 - 196,193,105,114,243,13, //vpslld $0xd,%xmm11,%xmm2 - 196,193,97,114,244,13, //vpslld $0xd,%xmm12,%xmm3 - 196,227,109,24,211,1, //vinsertf128 $0x1,%xmm3,%ymm2,%ymm2 + 196,193,57,114,243,13, //vpslld $0xd,%xmm11,%xmm8 + 196,193,105,114,244,13, //vpslld $0xd,%xmm12,%xmm2 + 196,227,61,24,210,1, //vinsertf128 $0x1,%xmm2,%ymm8,%ymm2 197,180,89,210, //vmulps %ymm2,%ymm9,%ymm2 196,193,57,114,245,13, //vpslld $0xd,%xmm13,%xmm8 - 196,193,97,114,246,13, //vpslld $0xd,%xmm14,%xmm3 + 196,193,97,114,242,13, //vpslld $0xd,%xmm10,%xmm3 196,227,61,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm3 197,180,89,219, //vmulps %ymm3,%ymm9,%ymm3 72,173, //lods %ds:(%rsi),%rax @@ -4428,41 +4664,44 @@ CODE const uint8_t sk_load_f16_avx[] = { 197,251,16,12,248, //vmovsd (%rax,%rdi,8),%xmm1 196,65,57,87,192, //vxorpd %xmm8,%xmm8,%xmm8 72,131,249,1, //cmp $0x1,%rcx - 117,6, //jne 1104 <_sk_load_f16_avx+0x114> + 117,6, //jne 12e0 <_sk_load_f16_avx+0x12c> 197,250,126,201, //vmovq %xmm1,%xmm1 - 235,30, //jmp 1122 <_sk_load_f16_avx+0x132> + 235,30, //jmp 12fe <_sk_load_f16_avx+0x14a> 197,241,22,76,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm1,%xmm1 72,131,249,3, //cmp $0x3,%rcx - 114,18, //jb 1122 <_sk_load_f16_avx+0x132> + 114,18, //jb 12fe <_sk_load_f16_avx+0x14a> 197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2 72,131,249,3, //cmp $0x3,%rcx - 117,19, //jne 112f <_sk_load_f16_avx+0x13f> + 117,19, //jne 130b <_sk_load_f16_avx+0x157> 197,250,126,210, //vmovq %xmm2,%xmm2 - 235,46, //jmp 1150 <_sk_load_f16_avx+0x160> + 235,46, //jmp 132c <_sk_load_f16_avx+0x178> 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2 - 233,230,254,255,255, //jmpq 1015 <_sk_load_f16_avx+0x25> + 233,206,254,255,255, //jmpq 11d9 <_sk_load_f16_avx+0x25> 197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2 72,131,249,5, //cmp $0x5,%rcx - 114,21, //jb 1150 <_sk_load_f16_avx+0x160> + 114,21, //jb 132c <_sk_load_f16_avx+0x178> 197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3 72,131,249,5, //cmp $0x5,%rcx - 117,18, //jne 1159 <_sk_load_f16_avx+0x169> + 117,18, //jne 1335 <_sk_load_f16_avx+0x181> 197,250,126,219, //vmovq %xmm3,%xmm3 - 233,197,254,255,255, //jmpq 1015 <_sk_load_f16_avx+0x25> + 233,173,254,255,255, 
//jmpq 11d9 <_sk_load_f16_avx+0x25> 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 - 233,188,254,255,255, //jmpq 1015 <_sk_load_f16_avx+0x25> + 233,164,254,255,255, //jmpq 11d9 <_sk_load_f16_avx+0x25> 197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3 72,131,249,7, //cmp $0x7,%rcx - 15,130,172,254,255,255, //jb 1015 <_sk_load_f16_avx+0x25> + 15,130,148,254,255,255, //jb 11d9 <_sk_load_f16_avx+0x25> 197,123,16,68,248,48, //vmovsd 0x30(%rax,%rdi,8),%xmm8 - 233,161,254,255,255, //jmpq 1015 <_sk_load_f16_avx+0x25> + 233,137,254,255,255, //jmpq 11d9 <_sk_load_f16_avx+0x25> }; CODE const uint8_t sk_store_f16_avx[] = { 72,173, //lods %ds:(%rsi),%rax - 72,139,0, //mov (%rax),%rax - 196,98,125,24,66,96, //vbroadcastss 0x60(%rdx),%ymm8 + 76,139,0, //mov (%rax),%r8 + 184,0,0,128,7, //mov $0x7800000,%eax + 197,121,110,192, //vmovd %eax,%xmm8 + 196,65,121,112,192,0, //vpshufd $0x0,%xmm8,%xmm8 + 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 196,67,125,25,202,1, //vextractf128 $0x1,%ymm9,%xmm10 196,193,41,114,210,13, //vpsrld $0xd,%xmm10,%xmm10 @@ -4492,31 +4731,31 @@ CODE const uint8_t sk_store_f16_avx[] = { 196,65,25,98,205, //vpunpckldq %xmm13,%xmm12,%xmm9 196,65,25,106,197, //vpunpckhdq %xmm13,%xmm12,%xmm8 72,133,201, //test %rcx,%rcx - 117,27, //jne 1237 <_sk_store_f16_avx+0xc3> - 197,120,17,28,248, //vmovups %xmm11,(%rax,%rdi,8) - 197,120,17,84,248,16, //vmovups %xmm10,0x10(%rax,%rdi,8) - 197,120,17,76,248,32, //vmovups %xmm9,0x20(%rax,%rdi,8) - 197,122,127,68,248,48, //vmovdqu %xmm8,0x30(%rax,%rdi,8) + 117,31, //jne 1426 <_sk_store_f16_avx+0xd6> + 196,65,120,17,28,248, //vmovups %xmm11,(%r8,%rdi,8) + 196,65,120,17,84,248,16, //vmovups %xmm10,0x10(%r8,%rdi,8) + 196,65,120,17,76,248,32, //vmovups %xmm9,0x20(%r8,%rdi,8) + 196,65,122,127,68,248,48, //vmovdqu %xmm8,0x30(%r8,%rdi,8) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax - 197,121,214,28,248, //vmovq %xmm11,(%rax,%rdi,8) + 196,65,121,214,28,248, //vmovq %xmm11,(%r8,%rdi,8) 72,131,249,1, //cmp $0x1,%rcx - 116,241, //je 1233 <_sk_store_f16_avx+0xbf> - 197,121,23,92,248,8, //vmovhpd %xmm11,0x8(%rax,%rdi,8) + 116,240, //je 1422 <_sk_store_f16_avx+0xd2> + 196,65,121,23,92,248,8, //vmovhpd %xmm11,0x8(%r8,%rdi,8) 72,131,249,3, //cmp $0x3,%rcx - 114,229, //jb 1233 <_sk_store_f16_avx+0xbf> - 197,121,214,84,248,16, //vmovq %xmm10,0x10(%rax,%rdi,8) - 116,221, //je 1233 <_sk_store_f16_avx+0xbf> - 197,121,23,84,248,24, //vmovhpd %xmm10,0x18(%rax,%rdi,8) + 114,227, //jb 1422 <_sk_store_f16_avx+0xd2> + 196,65,121,214,84,248,16, //vmovq %xmm10,0x10(%r8,%rdi,8) + 116,218, //je 1422 <_sk_store_f16_avx+0xd2> + 196,65,121,23,84,248,24, //vmovhpd %xmm10,0x18(%r8,%rdi,8) 72,131,249,5, //cmp $0x5,%rcx - 114,209, //jb 1233 <_sk_store_f16_avx+0xbf> - 197,121,214,76,248,32, //vmovq %xmm9,0x20(%rax,%rdi,8) - 116,201, //je 1233 <_sk_store_f16_avx+0xbf> - 197,121,23,76,248,40, //vmovhpd %xmm9,0x28(%rax,%rdi,8) + 114,205, //jb 1422 <_sk_store_f16_avx+0xd2> + 196,65,121,214,76,248,32, //vmovq %xmm9,0x20(%r8,%rdi,8) + 116,196, //je 1422 <_sk_store_f16_avx+0xd2> + 196,65,121,23,76,248,40, //vmovhpd %xmm9,0x28(%r8,%rdi,8) 72,131,249,7, //cmp $0x7,%rcx - 114,189, //jb 1233 <_sk_store_f16_avx+0xbf> - 197,121,214,68,248,48, //vmovq %xmm8,0x30(%rax,%rdi,8) - 235,181, //jmp 1233 <_sk_store_f16_avx+0xbf> + 114,183, //jb 1422 <_sk_store_f16_avx+0xd2> + 196,65,121,214,68,248,48, //vmovq %xmm8,0x30(%r8,%rdi,8) + 235,174, //jmp 1422 <_sk_store_f16_avx+0xd2> }; CODE const uint8_t sk_store_f32_avx[] = { @@ -4532,7 +4771,7 @@ 
CODE const uint8_t sk_store_f32_avx[] = { 196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8 196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11 72,133,201, //test %rcx,%rcx - 117,55, //jne 12eb <_sk_store_f32_avx+0x6d> + 117,55, //jne 14e1 <_sk_store_f32_avx+0x6d> 196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12 196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13 196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9 @@ -4545,22 +4784,22 @@ CODE const uint8_t sk_store_f32_avx[] = { 255,224, //jmpq *%rax 196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4) 72,131,249,1, //cmp $0x1,%rcx - 116,240, //je 12e7 <_sk_store_f32_avx+0x69> + 116,240, //je 14dd <_sk_store_f32_avx+0x69> 196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4) 72,131,249,3, //cmp $0x3,%rcx - 114,227, //jb 12e7 <_sk_store_f32_avx+0x69> + 114,227, //jb 14dd <_sk_store_f32_avx+0x69> 196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4) - 116,218, //je 12e7 <_sk_store_f32_avx+0x69> + 116,218, //je 14dd <_sk_store_f32_avx+0x69> 196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4) 72,131,249,5, //cmp $0x5,%rcx - 114,205, //jb 12e7 <_sk_store_f32_avx+0x69> + 114,205, //jb 14dd <_sk_store_f32_avx+0x69> 196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4) - 116,195, //je 12e7 <_sk_store_f32_avx+0x69> + 116,195, //je 14dd <_sk_store_f32_avx+0x69> 196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4) 72,131,249,7, //cmp $0x7,%rcx - 114,181, //jb 12e7 <_sk_store_f32_avx+0x69> + 114,181, //jb 14dd <_sk_store_f32_avx+0x69> 196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4) - 235,171, //jmp 12e7 <_sk_store_f32_avx+0x69> + 235,171, //jmp 14dd <_sk_store_f32_avx+0x69> }; CODE const uint8_t sk_clamp_x_avx[] = { @@ -4682,12 +4921,21 @@ CODE const uint8_t sk_mirror_y_avx[] = { }; CODE const uint8_t sk_luminance_to_alpha_avx[] = { - 196,226,125,24,154,136,0,0,0, //vbroadcastss 0x88(%rdx),%ymm3 + 184,208,179,89,62, //mov $0x3e59b3d0,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 197,228,89,192, //vmulps %ymm0,%ymm3,%ymm0 - 196,226,125,24,154,140,0,0,0, //vbroadcastss 0x8c(%rdx),%ymm3 + 184,89,23,55,63, //mov $0x3f371759,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0 - 196,226,125,24,138,144,0,0,0, //vbroadcastss 0x90(%rdx),%ymm1 + 184,152,221,147,61, //mov $0x3d93dd98,%eax + 197,249,110,200, //vmovd %eax,%xmm1 + 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 + 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 197,244,89,202, //vmulps %ymm2,%ymm1,%ymm1 197,252,88,217, //vaddps %ymm1,%ymm0,%ymm3 72,173, //lods %ds:(%rsi),%rax @@ -4925,7 +5173,7 @@ CODE const uint8_t sk_seed_shader_sse41[] = { 102,15,110,209, //movd %ecx,%xmm2 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 15,88,202, //addps %xmm2,%xmm1 - 15,16,66,20, //movups 0x14(%rdx),%xmm0 + 15,16,2, //movups (%rdx),%xmm0 15,88,193, //addps %xmm1,%xmm0 102,15,110,8, //movd (%rax),%xmm1 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1 @@ -5128,25 +5376,29 @@ CODE const uint8_t sk_unpremul_sse41[] = { }; CODE const uint8_t sk_from_srgb_sse41[] = { - 68,15,40,194, //movaps %xmm2,%xmm8 - 243,68,15,16,90,64, //movss 0x40(%rdx),%xmm11 + 184,145,131,158,61, //mov $0x3d9e8391,%eax + 102,68,15,110,216, //movd 
%eax,%xmm11 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 69,15,40,211, //movaps %xmm11,%xmm10 68,15,89,208, //mulps %xmm0,%xmm10 68,15,40,240, //movaps %xmm0,%xmm14 69,15,89,246, //mulps %xmm14,%xmm14 - 243,15,16,82,60, //movss 0x3c(%rdx),%xmm2 - 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 - 243,68,15,16,98,52, //movss 0x34(%rdx),%xmm12 - 243,68,15,16,106,56, //movss 0x38(%rdx),%xmm13 - 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 - 68,15,40,202, //movaps %xmm2,%xmm9 - 68,15,89,200, //mulps %xmm0,%xmm9 - 69,15,88,205, //addps %xmm13,%xmm9 + 184,154,153,153,62, //mov $0x3e99999a,%eax + 102,68,15,110,192, //movd %eax,%xmm8 + 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 + 184,92,143,50,63, //mov $0x3f328f5c,%eax + 102,68,15,110,224, //movd %eax,%xmm12 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 - 69,15,89,206, //mulps %xmm14,%xmm9 + 69,15,40,200, //movaps %xmm8,%xmm9 + 68,15,89,200, //mulps %xmm0,%xmm9 69,15,88,204, //addps %xmm12,%xmm9 - 243,68,15,16,114,68, //movss 0x44(%rdx),%xmm14 + 184,10,215,35,59, //mov $0x3b23d70a,%eax + 102,68,15,110,232, //movd %eax,%xmm13 + 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 + 69,15,89,206, //mulps %xmm14,%xmm9 + 69,15,88,205, //addps %xmm13,%xmm9 + 184,174,71,97,61, //mov $0x3d6147ae,%eax + 102,68,15,110,240, //movd %eax,%xmm14 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14 65,15,194,198,1, //cmpltps %xmm14,%xmm0 102,69,15,56,20,202, //blendvps %xmm0,%xmm10,%xmm9 @@ -5154,27 +5406,28 @@ CODE const uint8_t sk_from_srgb_sse41[] = { 68,15,89,249, //mulps %xmm1,%xmm15 15,40,193, //movaps %xmm1,%xmm0 15,89,192, //mulps %xmm0,%xmm0 - 68,15,40,210, //movaps %xmm2,%xmm10 + 69,15,40,208, //movaps %xmm8,%xmm10 68,15,89,209, //mulps %xmm1,%xmm10 - 69,15,88,213, //addps %xmm13,%xmm10 - 68,15,89,208, //mulps %xmm0,%xmm10 69,15,88,212, //addps %xmm12,%xmm10 + 68,15,89,208, //mulps %xmm0,%xmm10 + 69,15,88,213, //addps %xmm13,%xmm10 65,15,194,206,1, //cmpltps %xmm14,%xmm1 15,40,193, //movaps %xmm1,%xmm0 102,69,15,56,20,215, //blendvps %xmm0,%xmm15,%xmm10 - 69,15,89,216, //mulps %xmm8,%xmm11 - 65,15,40,192, //movaps %xmm8,%xmm0 + 68,15,89,218, //mulps %xmm2,%xmm11 + 15,40,194, //movaps %xmm2,%xmm0 15,89,192, //mulps %xmm0,%xmm0 - 65,15,89,208, //mulps %xmm8,%xmm2 - 65,15,88,213, //addps %xmm13,%xmm2 - 15,89,208, //mulps %xmm0,%xmm2 - 65,15,88,212, //addps %xmm12,%xmm2 - 69,15,194,198,1, //cmpltps %xmm14,%xmm8 - 65,15,40,192, //movaps %xmm8,%xmm0 - 102,65,15,56,20,211, //blendvps %xmm0,%xmm11,%xmm2 + 68,15,89,194, //mulps %xmm2,%xmm8 + 69,15,88,196, //addps %xmm12,%xmm8 + 68,15,89,192, //mulps %xmm0,%xmm8 + 69,15,88,197, //addps %xmm13,%xmm8 + 65,15,194,214,1, //cmpltps %xmm14,%xmm2 + 15,40,194, //movaps %xmm2,%xmm0 + 102,69,15,56,20,195, //blendvps %xmm0,%xmm11,%xmm8 72,173, //lods %ds:(%rsi),%rax 65,15,40,193, //movaps %xmm9,%xmm0 65,15,40,202, //movaps %xmm10,%xmm1 + 65,15,40,208, //movaps %xmm8,%xmm2 255,224, //jmpq *%rax }; @@ -5185,62 +5438,69 @@ CODE const uint8_t sk_to_srgb_sse41[] = { 15,40,245, //movaps %xmm5,%xmm6 15,40,236, //movaps %xmm4,%xmm5 15,40,227, //movaps %xmm3,%xmm4 - 68,15,40,194, //movaps %xmm2,%xmm8 - 15,40,217, //movaps %xmm1,%xmm3 - 15,82,208, //rsqrtps %xmm0,%xmm2 - 68,15,83,202, //rcpps %xmm2,%xmm9 - 68,15,82,210, //rsqrtps %xmm2,%xmm10 - 243,15,16,18, //movss (%rdx),%xmm2 - 243,68,15,16,90,72, //movss 0x48(%rdx),%xmm11 + 15,40,218, //movaps %xmm2,%xmm3 + 15,40,209, //movaps %xmm1,%xmm2 + 68,15,82,192, //rsqrtps %xmm0,%xmm8 + 69,15,83,200, //rcpps %xmm8,%xmm9 + 69,15,82,248, //rsqrtps %xmm8,%xmm15 + 184,41,92,71,65, //mov $0x41475c29,%eax + 
102,68,15,110,216, //movd %eax,%xmm11 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 - 65,15,40,203, //movaps %xmm11,%xmm1 - 15,89,200, //mulps %xmm0,%xmm1 - 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 - 243,68,15,16,98,76, //movss 0x4c(%rdx),%xmm12 + 69,15,40,211, //movaps %xmm11,%xmm10 + 68,15,89,208, //mulps %xmm0,%xmm10 + 184,0,0,128,63, //mov $0x3f800000,%eax + 102,68,15,110,192, //movd %eax,%xmm8 + 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 + 184,194,135,210,62, //mov $0x3ed287c2,%eax + 102,68,15,110,224, //movd %eax,%xmm12 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 - 243,68,15,16,106,80, //movss 0x50(%rdx),%xmm13 + 184,206,111,48,63, //mov $0x3f306fce,%eax + 102,68,15,110,232, //movd %eax,%xmm13 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 - 243,68,15,16,114,84, //movss 0x54(%rdx),%xmm14 + 184,168,87,202,61, //mov $0x3dca57a8,%eax + 53,0,0,0,128, //xor $0x80000000,%eax + 102,68,15,110,240, //movd %eax,%xmm14 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14 69,15,89,205, //mulps %xmm13,%xmm9 69,15,88,206, //addps %xmm14,%xmm9 - 69,15,89,212, //mulps %xmm12,%xmm10 - 69,15,88,209, //addps %xmm9,%xmm10 - 68,15,40,202, //movaps %xmm2,%xmm9 - 69,15,93,202, //minps %xmm10,%xmm9 - 243,68,15,16,122,88, //movss 0x58(%rdx),%xmm15 + 69,15,89,252, //mulps %xmm12,%xmm15 + 69,15,88,249, //addps %xmm9,%xmm15 + 69,15,40,200, //movaps %xmm8,%xmm9 + 69,15,93,207, //minps %xmm15,%xmm9 + 184,4,231,140,59, //mov $0x3b8ce704,%eax + 102,68,15,110,248, //movd %eax,%xmm15 69,15,198,255,0, //shufps $0x0,%xmm15,%xmm15 65,15,194,199,1, //cmpltps %xmm15,%xmm0 - 102,68,15,56,20,201, //blendvps %xmm0,%xmm1,%xmm9 + 102,69,15,56,20,202, //blendvps %xmm0,%xmm10,%xmm9 + 68,15,82,210, //rsqrtps %xmm2,%xmm10 + 65,15,83,194, //rcpps %xmm10,%xmm0 + 69,15,82,210, //rsqrtps %xmm10,%xmm10 + 65,15,89,197, //mulps %xmm13,%xmm0 + 65,15,88,198, //addps %xmm14,%xmm0 + 69,15,89,212, //mulps %xmm12,%xmm10 + 68,15,88,208, //addps %xmm0,%xmm10 + 65,15,40,200, //movaps %xmm8,%xmm1 + 65,15,93,202, //minps %xmm10,%xmm1 + 69,15,40,211, //movaps %xmm11,%xmm10 + 68,15,89,210, //mulps %xmm2,%xmm10 + 65,15,194,215,1, //cmpltps %xmm15,%xmm2 + 15,40,194, //movaps %xmm2,%xmm0 + 102,65,15,56,20,202, //blendvps %xmm0,%xmm10,%xmm1 15,82,195, //rsqrtps %xmm3,%xmm0 - 15,83,200, //rcpps %xmm0,%xmm1 + 15,83,208, //rcpps %xmm0,%xmm2 + 65,15,89,213, //mulps %xmm13,%xmm2 + 65,15,88,214, //addps %xmm14,%xmm2 15,82,192, //rsqrtps %xmm0,%xmm0 - 65,15,89,205, //mulps %xmm13,%xmm1 - 65,15,88,206, //addps %xmm14,%xmm1 65,15,89,196, //mulps %xmm12,%xmm0 - 15,88,193, //addps %xmm1,%xmm0 - 68,15,40,210, //movaps %xmm2,%xmm10 - 68,15,93,208, //minps %xmm0,%xmm10 - 65,15,40,203, //movaps %xmm11,%xmm1 - 15,89,203, //mulps %xmm3,%xmm1 + 15,88,194, //addps %xmm2,%xmm0 + 68,15,93,192, //minps %xmm0,%xmm8 + 68,15,89,219, //mulps %xmm3,%xmm11 65,15,194,223,1, //cmpltps %xmm15,%xmm3 15,40,195, //movaps %xmm3,%xmm0 - 102,68,15,56,20,209, //blendvps %xmm0,%xmm1,%xmm10 - 65,15,82,192, //rsqrtps %xmm8,%xmm0 - 15,83,200, //rcpps %xmm0,%xmm1 - 65,15,89,205, //mulps %xmm13,%xmm1 - 65,15,88,206, //addps %xmm14,%xmm1 - 15,82,192, //rsqrtps %xmm0,%xmm0 - 65,15,89,196, //mulps %xmm12,%xmm0 - 15,88,193, //addps %xmm1,%xmm0 - 15,93,208, //minps %xmm0,%xmm2 - 69,15,89,216, //mulps %xmm8,%xmm11 - 69,15,194,199,1, //cmpltps %xmm15,%xmm8 - 65,15,40,192, //movaps %xmm8,%xmm0 - 102,65,15,56,20,211, //blendvps %xmm0,%xmm11,%xmm2 + 102,69,15,56,20,195, //blendvps %xmm0,%xmm11,%xmm8 72,173, //lods %ds:(%rsi),%rax 65,15,40,193, //movaps %xmm9,%xmm0 - 65,15,40,202, //movaps %xmm10,%xmm1 + 65,15,40,208, 
//movaps %xmm8,%xmm2 15,40,220, //movaps %xmm4,%xmm3 15,40,229, //movaps %xmm5,%xmm4 15,40,238, //movaps %xmm6,%xmm5 @@ -5328,32 +5588,38 @@ CODE const uint8_t sk_lerp_565_sse41[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax 102,68,15,56,51,4,120, //pmovzxwd (%rax,%rdi,2),%xmm8 - 102,15,110,90,104, //movd 0x68(%rdx),%xmm3 + 184,0,248,0,0, //mov $0xf800,%eax + 102,15,110,216, //movd %eax,%xmm3 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 102,65,15,219,216, //pand %xmm8,%xmm3 68,15,91,203, //cvtdq2ps %xmm3,%xmm9 - 243,68,15,16,90,116, //movss 0x74(%rdx),%xmm11 - 243,68,15,16,82,120, //movss 0x78(%rdx),%xmm10 - 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 - 69,15,89,217, //mulps %xmm9,%xmm11 - 102,15,110,90,108, //movd 0x6c(%rdx),%xmm3 + 184,8,33,132,55, //mov $0x37842108,%eax + 102,68,15,110,208, //movd %eax,%xmm10 + 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 + 69,15,89,209, //mulps %xmm9,%xmm10 + 184,224,7,0,0, //mov $0x7e0,%eax + 102,15,110,216, //movd %eax,%xmm3 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 102,65,15,219,216, //pand %xmm8,%xmm3 - 15,91,219, //cvtdq2ps %xmm3,%xmm3 - 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 - 68,15,89,211, //mulps %xmm3,%xmm10 - 102,15,110,90,112, //movd 0x70(%rdx),%xmm3 + 68,15,91,203, //cvtdq2ps %xmm3,%xmm9 + 184,33,8,2,58, //mov $0x3a020821,%eax + 102,68,15,110,216, //movd %eax,%xmm11 + 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 + 69,15,89,217, //mulps %xmm9,%xmm11 + 184,31,0,0,0, //mov $0x1f,%eax + 102,15,110,216, //movd %eax,%xmm3 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 102,65,15,219,216, //pand %xmm8,%xmm3 68,15,91,195, //cvtdq2ps %xmm3,%xmm8 - 243,15,16,90,124, //movss 0x7c(%rdx),%xmm3 + 184,8,33,4,61, //mov $0x3d042108,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 65,15,89,216, //mulps %xmm8,%xmm3 15,92,196, //subps %xmm4,%xmm0 - 65,15,89,195, //mulps %xmm11,%xmm0 + 65,15,89,194, //mulps %xmm10,%xmm0 15,88,196, //addps %xmm4,%xmm0 15,92,205, //subps %xmm5,%xmm1 - 65,15,89,202, //mulps %xmm10,%xmm1 + 65,15,89,203, //mulps %xmm11,%xmm1 15,88,205, //addps %xmm5,%xmm1 15,92,214, //subps %xmm6,%xmm2 15,89,211, //mulps %xmm3,%xmm2 @@ -5370,7 +5636,8 @@ CODE const uint8_t sk_load_tables_sse41[] = { 72,139,8, //mov (%rax),%rcx 76,139,64,8, //mov 0x8(%rax),%r8 243,68,15,111,4,185, //movdqu (%rcx,%rdi,4),%xmm8 - 102,15,110,66,16, //movd 0x10(%rdx),%xmm0 + 185,255,0,0,0, //mov $0xff,%ecx + 102,15,110,193, //movd %ecx,%xmm0 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 102,65,15,111,200, //movdqa %xmm8,%xmm1 102,15,114,209,8, //psrld $0x8,%xmm1 @@ -5417,7 +5684,8 @@ CODE const uint8_t sk_load_tables_sse41[] = { 102,15,58,33,211,48, //insertps $0x30,%xmm3,%xmm2 102,65,15,114,208,24, //psrld $0x18,%xmm8 69,15,91,192, //cvtdq2ps %xmm8,%xmm8 - 243,15,16,90,12, //movss 0xc(%rdx),%xmm3 + 184,129,128,128,59, //mov $0x3b808081,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 65,15,89,216, //mulps %xmm8,%xmm3 72,173, //lods %ds:(%rsi),%rax @@ -5429,7 +5697,8 @@ CODE const uint8_t sk_load_a8_sse41[] = { 72,139,0, //mov (%rax),%rax 102,15,56,49,4,56, //pmovzxbd (%rax,%rdi,1),%xmm0 15,91,192, //cvtdq2ps %xmm0,%xmm0 - 243,15,16,90,12, //movss 0xc(%rdx),%xmm3 + 184,129,128,128,59, //mov $0x3b808081,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 15,89,216, //mulps %xmm0,%xmm3 72,173, //lods %ds:(%rsi),%rax @@ -5442,7 +5711,8 @@ CODE const uint8_t sk_load_a8_sse41[] = { CODE const uint8_t sk_store_a8_sse41[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, 
//mov (%rax),%rax - 243,68,15,16,66,8, //movss 0x8(%rdx),%xmm8 + 185,0,0,127,67, //mov $0x437f0000,%ecx + 102,68,15,110,193, //movd %ecx,%xmm8 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 68,15,89,195, //mulps %xmm3,%xmm8 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 @@ -5456,29 +5726,36 @@ CODE const uint8_t sk_store_a8_sse41[] = { CODE const uint8_t sk_load_565_sse41[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax - 102,68,15,56,51,12,120, //pmovzxwd (%rax,%rdi,2),%xmm9 - 102,15,110,66,104, //movd 0x68(%rdx),%xmm0 + 102,15,56,51,20,120, //pmovzxwd (%rax,%rdi,2),%xmm2 + 184,0,248,0,0, //mov $0xf800,%eax + 102,15,110,192, //movd %eax,%xmm0 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 - 102,65,15,219,193, //pand %xmm9,%xmm0 + 102,15,219,194, //pand %xmm2,%xmm0 15,91,200, //cvtdq2ps %xmm0,%xmm1 - 243,15,16,26, //movss (%rdx),%xmm3 - 243,15,16,66,116, //movss 0x74(%rdx),%xmm0 + 184,8,33,132,55, //mov $0x37842108,%eax + 102,15,110,192, //movd %eax,%xmm0 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 15,89,193, //mulps %xmm1,%xmm0 - 102,15,110,74,108, //movd 0x6c(%rdx),%xmm1 + 184,224,7,0,0, //mov $0x7e0,%eax + 102,15,110,200, //movd %eax,%xmm1 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1 - 102,65,15,219,201, //pand %xmm9,%xmm1 - 68,15,91,193, //cvtdq2ps %xmm1,%xmm8 - 243,15,16,74,120, //movss 0x78(%rdx),%xmm1 + 102,15,219,202, //pand %xmm2,%xmm1 + 15,91,217, //cvtdq2ps %xmm1,%xmm3 + 184,33,8,2,58, //mov $0x3a020821,%eax + 102,15,110,200, //movd %eax,%xmm1 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 - 65,15,89,200, //mulps %xmm8,%xmm1 - 102,15,110,82,112, //movd 0x70(%rdx),%xmm2 - 102,15,112,210,0, //pshufd $0x0,%xmm2,%xmm2 - 102,65,15,219,209, //pand %xmm9,%xmm2 - 68,15,91,194, //cvtdq2ps %xmm2,%xmm8 - 243,15,16,82,124, //movss 0x7c(%rdx),%xmm2 + 15,89,203, //mulps %xmm3,%xmm1 + 184,31,0,0,0, //mov $0x1f,%eax + 102,15,110,216, //movd %eax,%xmm3 + 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 + 102,15,219,218, //pand %xmm2,%xmm3 + 15,91,219, //cvtdq2ps %xmm3,%xmm3 + 184,8,33,4,61, //mov $0x3d042108,%eax + 102,15,110,208, //movd %eax,%xmm2 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 - 65,15,89,208, //mulps %xmm8,%xmm2 + 15,89,211, //mulps %xmm3,%xmm2 + 184,0,0,128,63, //mov $0x3f800000,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax @@ -5487,21 +5764,23 @@ CODE const uint8_t sk_load_565_sse41[] = { CODE const uint8_t sk_store_565_sse41[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax - 243,68,15,16,130,128,0,0,0, //movss 0x80(%rdx),%xmm8 - 243,68,15,16,138,132,0,0,0, //movss 0x84(%rdx),%xmm9 + 185,0,0,248,65, //mov $0x41f80000,%ecx + 102,68,15,110,193, //movd %ecx,%xmm8 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 - 69,15,40,208, //movaps %xmm8,%xmm10 - 68,15,89,208, //mulps %xmm0,%xmm10 - 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10 - 102,65,15,114,242,11, //pslld $0xb,%xmm10 - 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 - 68,15,89,201, //mulps %xmm1,%xmm9 + 69,15,40,200, //movaps %xmm8,%xmm9 + 68,15,89,200, //mulps %xmm0,%xmm9 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9 - 102,65,15,114,241,5, //pslld $0x5,%xmm9 - 102,69,15,235,202, //por %xmm10,%xmm9 + 102,65,15,114,241,11, //pslld $0xb,%xmm9 + 185,0,0,124,66, //mov $0x427c0000,%ecx + 102,68,15,110,209, //movd %ecx,%xmm10 + 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 + 68,15,89,209, //mulps %xmm1,%xmm10 + 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10 + 102,65,15,114,242,5, //pslld $0x5,%xmm10 + 102,69,15,235,209, //por %xmm9,%xmm10 68,15,89,194, //mulps 
%xmm2,%xmm8 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 - 102,69,15,86,193, //orpd %xmm9,%xmm8 + 102,69,15,86,194, //orpd %xmm10,%xmm8 102,69,15,56,43,192, //packusdw %xmm8,%xmm8 102,68,15,214,4,120, //movq %xmm8,(%rax,%rdi,2) 72,173, //lods %ds:(%rsi),%rax @@ -5577,7 +5856,8 @@ CODE const uint8_t sk_load_f16_sse41[] = { 102,68,15,111,194, //movdqa %xmm2,%xmm8 102,68,15,97,192, //punpcklwd %xmm0,%xmm8 102,15,105,208, //punpckhwd %xmm0,%xmm2 - 102,15,110,66,100, //movd 0x64(%rdx),%xmm0 + 184,0,4,0,4, //mov $0x4000400,%eax + 102,15,110,192, //movd %eax,%xmm0 102,15,112,216,0, //pshufd $0x0,%xmm0,%xmm3 102,15,111,203, //movdqa %xmm3,%xmm1 102,65,15,101,200, //pcmpgtw %xmm8,%xmm1 @@ -5586,7 +5866,8 @@ CODE const uint8_t sk_load_f16_sse41[] = { 102,15,223,218, //pandn %xmm2,%xmm3 102,15,56,51,193, //pmovzxwd %xmm1,%xmm0 102,15,114,240,13, //pslld $0xd,%xmm0 - 102,15,110,82,92, //movd 0x5c(%rdx),%xmm2 + 184,0,0,128,119, //mov $0x77800000,%eax + 102,15,110,208, //movd %eax,%xmm2 102,68,15,112,194,0, //pshufd $0x0,%xmm2,%xmm8 65,15,89,192, //mulps %xmm8,%xmm0 102,69,15,239,201, //pxor %xmm9,%xmm9 @@ -5606,7 +5887,8 @@ CODE const uint8_t sk_load_f16_sse41[] = { CODE const uint8_t sk_store_f16_sse41[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax - 102,68,15,110,66,96, //movd 0x60(%rdx),%xmm8 + 185,0,0,128,7, //mov $0x7800000,%ecx + 102,68,15,110,193, //movd %ecx,%xmm8 102,69,15,112,192,0, //pshufd $0x0,%xmm8,%xmm8 102,69,15,111,200, //movdqa %xmm8,%xmm9 68,15,89,200, //mulps %xmm0,%xmm9 @@ -5768,17 +6050,20 @@ CODE const uint8_t sk_mirror_y_sse41[] = { }; CODE const uint8_t sk_luminance_to_alpha_sse41[] = { - 243,15,16,154,136,0,0,0, //movss 0x88(%rdx),%xmm3 - 243,68,15,16,130,140,0,0,0, //movss 0x8c(%rdx),%xmm8 + 184,208,179,89,62, //mov $0x3e59b3d0,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 15,89,216, //mulps %xmm0,%xmm3 - 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 - 68,15,89,193, //mulps %xmm1,%xmm8 - 68,15,88,195, //addps %xmm3,%xmm8 - 243,15,16,154,144,0,0,0, //movss 0x90(%rdx),%xmm3 + 184,89,23,55,63, //mov $0x3f371759,%eax + 102,15,110,192, //movd %eax,%xmm0 + 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 + 15,89,193, //mulps %xmm1,%xmm0 + 15,88,195, //addps %xmm3,%xmm0 + 184,152,221,147,61, //mov $0x3d93dd98,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 15,89,218, //mulps %xmm2,%xmm3 - 65,15,88,216, //addps %xmm8,%xmm3 + 15,88,216, //addps %xmm0,%xmm3 72,173, //lods %ds:(%rsi),%rax 15,87,192, //xorps %xmm0,%xmm0 15,87,201, //xorps %xmm1,%xmm1 @@ -6074,7 +6359,7 @@ CODE const uint8_t sk_seed_shader_sse2[] = { 102,15,110,209, //movd %ecx,%xmm2 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 15,88,202, //addps %xmm2,%xmm1 - 15,16,66,20, //movups 0x14(%rdx),%xmm0 + 15,16,2, //movups (%rdx),%xmm0 15,88,193, //addps %xmm1,%xmm0 102,15,110,8, //movd (%rax),%xmm1 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1 @@ -6274,24 +6559,29 @@ CODE const uint8_t sk_unpremul_sse2[] = { }; CODE const uint8_t sk_from_srgb_sse2[] = { - 243,68,15,16,66,64, //movss 0x40(%rdx),%xmm8 + 184,145,131,158,61, //mov $0x3d9e8391,%eax + 102,68,15,110,192, //movd %eax,%xmm8 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 69,15,40,232, //movaps %xmm8,%xmm13 68,15,89,232, //mulps %xmm0,%xmm13 68,15,40,224, //movaps %xmm0,%xmm12 69,15,89,228, //mulps %xmm12,%xmm12 - 243,68,15,16,74,60, //movss 0x3c(%rdx),%xmm9 + 184,154,153,153,62, //mov $0x3e99999a,%eax + 102,68,15,110,200, //movd %eax,%xmm9 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 - 243,68,15,16,82,52, //movss 
0x34(%rdx),%xmm10 - 243,68,15,16,90,56, //movss 0x38(%rdx),%xmm11 - 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 + 184,92,143,50,63, //mov $0x3f328f5c,%eax + 102,68,15,110,208, //movd %eax,%xmm10 + 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 69,15,40,241, //movaps %xmm9,%xmm14 68,15,89,240, //mulps %xmm0,%xmm14 - 69,15,88,243, //addps %xmm11,%xmm14 - 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 - 69,15,89,244, //mulps %xmm12,%xmm14 69,15,88,242, //addps %xmm10,%xmm14 - 243,68,15,16,98,68, //movss 0x44(%rdx),%xmm12 + 184,10,215,35,59, //mov $0x3b23d70a,%eax + 102,68,15,110,216, //movd %eax,%xmm11 + 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 + 69,15,89,244, //mulps %xmm12,%xmm14 + 69,15,88,243, //addps %xmm11,%xmm14 + 184,174,71,97,61, //mov $0x3d6147ae,%eax + 102,68,15,110,224, //movd %eax,%xmm12 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 65,15,194,196,1, //cmpltps %xmm12,%xmm0 68,15,84,232, //andps %xmm0,%xmm13 @@ -6303,9 +6593,9 @@ CODE const uint8_t sk_from_srgb_sse2[] = { 69,15,89,246, //mulps %xmm14,%xmm14 69,15,40,249, //movaps %xmm9,%xmm15 68,15,89,249, //mulps %xmm1,%xmm15 - 69,15,88,251, //addps %xmm11,%xmm15 - 69,15,89,254, //mulps %xmm14,%xmm15 69,15,88,250, //addps %xmm10,%xmm15 + 69,15,89,254, //mulps %xmm14,%xmm15 + 69,15,88,251, //addps %xmm11,%xmm15 65,15,194,204,1, //cmpltps %xmm12,%xmm1 68,15,84,233, //andps %xmm1,%xmm13 65,15,85,207, //andnps %xmm15,%xmm1 @@ -6314,9 +6604,9 @@ CODE const uint8_t sk_from_srgb_sse2[] = { 68,15,40,234, //movaps %xmm2,%xmm13 69,15,89,237, //mulps %xmm13,%xmm13 68,15,89,202, //mulps %xmm2,%xmm9 - 69,15,88,203, //addps %xmm11,%xmm9 - 69,15,89,205, //mulps %xmm13,%xmm9 69,15,88,202, //addps %xmm10,%xmm9 + 69,15,89,205, //mulps %xmm13,%xmm9 + 69,15,88,203, //addps %xmm11,%xmm9 65,15,194,212,1, //cmpltps %xmm12,%xmm2 68,15,84,194, //andps %xmm2,%xmm8 65,15,85,209, //andnps %xmm9,%xmm2 @@ -6326,74 +6616,69 @@ CODE const uint8_t sk_from_srgb_sse2[] = { }; CODE const uint8_t sk_to_srgb_sse2[] = { - 72,131,236,40, //sub $0x28,%rsp - 15,41,124,36,16, //movaps %xmm7,0x10(%rsp) - 15,41,52,36, //movaps %xmm6,(%rsp) - 15,40,245, //movaps %xmm5,%xmm6 - 15,40,236, //movaps %xmm4,%xmm5 - 15,40,227, //movaps %xmm3,%xmm4 68,15,82,192, //rsqrtps %xmm0,%xmm8 - 69,15,83,232, //rcpps %xmm8,%xmm13 - 69,15,82,248, //rsqrtps %xmm8,%xmm15 - 243,15,16,26, //movss (%rdx),%xmm3 - 243,68,15,16,66,72, //movss 0x48(%rdx),%xmm8 + 69,15,83,248, //rcpps %xmm8,%xmm15 + 69,15,82,232, //rsqrtps %xmm8,%xmm13 + 184,41,92,71,65, //mov $0x41475c29,%eax + 102,68,15,110,192, //movd %eax,%xmm8 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 69,15,40,240, //movaps %xmm8,%xmm14 68,15,89,240, //mulps %xmm0,%xmm14 - 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 - 243,68,15,16,82,76, //movss 0x4c(%rdx),%xmm10 + 184,0,0,128,63, //mov $0x3f800000,%eax + 102,68,15,110,200, //movd %eax,%xmm9 + 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 + 184,194,135,210,62, //mov $0x3ed287c2,%eax + 102,68,15,110,208, //movd %eax,%xmm10 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 - 243,68,15,16,90,80, //movss 0x50(%rdx),%xmm11 + 184,206,111,48,63, //mov $0x3f306fce,%eax + 102,68,15,110,216, //movd %eax,%xmm11 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 - 243,68,15,16,98,84, //movss 0x54(%rdx),%xmm12 + 184,168,87,202,61, //mov $0x3dca57a8,%eax + 53,0,0,0,128, //xor $0x80000000,%eax + 102,68,15,110,224, //movd %eax,%xmm12 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 - 69,15,89,235, //mulps %xmm11,%xmm13 - 69,15,88,236, //addps %xmm12,%xmm13 - 69,15,89,250, //mulps %xmm10,%xmm15 - 69,15,88,253, //addps %xmm13,%xmm15 - 
68,15,40,203, //movaps %xmm3,%xmm9 - 69,15,93,207, //minps %xmm15,%xmm9 - 243,68,15,16,106,88, //movss 0x58(%rdx),%xmm13 + 69,15,89,251, //mulps %xmm11,%xmm15 + 69,15,88,252, //addps %xmm12,%xmm15 + 69,15,89,234, //mulps %xmm10,%xmm13 + 69,15,88,239, //addps %xmm15,%xmm13 + 69,15,40,249, //movaps %xmm9,%xmm15 + 69,15,93,253, //minps %xmm13,%xmm15 + 184,4,231,140,59, //mov $0x3b8ce704,%eax + 102,68,15,110,232, //movd %eax,%xmm13 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 65,15,194,197,1, //cmpltps %xmm13,%xmm0 68,15,84,240, //andps %xmm0,%xmm14 - 65,15,85,193, //andnps %xmm9,%xmm0 + 65,15,85,199, //andnps %xmm15,%xmm0 65,15,86,198, //orps %xmm14,%xmm0 - 68,15,82,201, //rsqrtps %xmm1,%xmm9 - 69,15,83,241, //rcpps %xmm9,%xmm14 - 69,15,82,201, //rsqrtps %xmm9,%xmm9 - 69,15,89,243, //mulps %xmm11,%xmm14 - 69,15,88,244, //addps %xmm12,%xmm14 - 69,15,89,202, //mulps %xmm10,%xmm9 - 69,15,88,206, //addps %xmm14,%xmm9 - 68,15,40,243, //movaps %xmm3,%xmm14 - 69,15,93,241, //minps %xmm9,%xmm14 - 69,15,40,200, //movaps %xmm8,%xmm9 - 68,15,89,201, //mulps %xmm1,%xmm9 + 68,15,82,241, //rsqrtps %xmm1,%xmm14 + 69,15,83,254, //rcpps %xmm14,%xmm15 + 69,15,82,246, //rsqrtps %xmm14,%xmm14 + 69,15,89,251, //mulps %xmm11,%xmm15 + 69,15,88,252, //addps %xmm12,%xmm15 + 69,15,89,242, //mulps %xmm10,%xmm14 + 69,15,88,247, //addps %xmm15,%xmm14 + 69,15,40,249, //movaps %xmm9,%xmm15 + 69,15,93,254, //minps %xmm14,%xmm15 + 69,15,40,240, //movaps %xmm8,%xmm14 + 68,15,89,241, //mulps %xmm1,%xmm14 65,15,194,205,1, //cmpltps %xmm13,%xmm1 - 68,15,84,201, //andps %xmm1,%xmm9 - 65,15,85,206, //andnps %xmm14,%xmm1 - 65,15,86,201, //orps %xmm9,%xmm1 - 68,15,82,202, //rsqrtps %xmm2,%xmm9 - 69,15,83,241, //rcpps %xmm9,%xmm14 - 69,15,89,243, //mulps %xmm11,%xmm14 - 69,15,88,244, //addps %xmm12,%xmm14 - 65,15,82,249, //rsqrtps %xmm9,%xmm7 - 65,15,89,250, //mulps %xmm10,%xmm7 - 65,15,88,254, //addps %xmm14,%xmm7 - 15,93,223, //minps %xmm7,%xmm3 + 68,15,84,241, //andps %xmm1,%xmm14 + 65,15,85,207, //andnps %xmm15,%xmm1 + 65,15,86,206, //orps %xmm14,%xmm1 + 68,15,82,242, //rsqrtps %xmm2,%xmm14 + 69,15,83,254, //rcpps %xmm14,%xmm15 + 69,15,89,251, //mulps %xmm11,%xmm15 + 69,15,88,252, //addps %xmm12,%xmm15 + 69,15,82,222, //rsqrtps %xmm14,%xmm11 + 69,15,89,218, //mulps %xmm10,%xmm11 + 69,15,88,223, //addps %xmm15,%xmm11 + 69,15,93,203, //minps %xmm11,%xmm9 68,15,89,194, //mulps %xmm2,%xmm8 65,15,194,213,1, //cmpltps %xmm13,%xmm2 68,15,84,194, //andps %xmm2,%xmm8 - 15,85,211, //andnps %xmm3,%xmm2 + 65,15,85,209, //andnps %xmm9,%xmm2 65,15,86,208, //orps %xmm8,%xmm2 72,173, //lods %ds:(%rsi),%rax - 15,40,220, //movaps %xmm4,%xmm3 - 15,40,229, //movaps %xmm5,%xmm4 - 15,40,238, //movaps %xmm6,%xmm5 - 15,40,52,36, //movaps (%rsp),%xmm6 - 15,40,124,36,16, //movaps 0x10(%rsp),%xmm7 - 72,131,196,40, //add $0x28,%rsp 255,224, //jmpq *%rax }; @@ -6480,35 +6765,41 @@ CODE const uint8_t sk_lerp_u8_sse2[] = { CODE const uint8_t sk_lerp_565_sse2[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax - 243,68,15,126,12,120, //movq (%rax,%rdi,2),%xmm9 + 243,68,15,126,4,120, //movq (%rax,%rdi,2),%xmm8 102,15,239,219, //pxor %xmm3,%xmm3 - 102,68,15,97,203, //punpcklwd %xmm3,%xmm9 - 102,15,110,90,104, //movd 0x68(%rdx),%xmm3 - 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 - 102,65,15,219,217, //pand %xmm9,%xmm3 - 68,15,91,211, //cvtdq2ps %xmm3,%xmm10 - 243,68,15,16,90,116, //movss 0x74(%rdx),%xmm11 - 243,68,15,16,66,120, //movss 0x78(%rdx),%xmm8 - 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 - 69,15,89,218, //mulps %xmm10,%xmm11 - 
102,15,110,90,108, //movd 0x6c(%rdx),%xmm3 + 102,68,15,97,195, //punpcklwd %xmm3,%xmm8 + 184,0,248,0,0, //mov $0xf800,%eax + 102,15,110,216, //movd %eax,%xmm3 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 - 102,65,15,219,217, //pand %xmm9,%xmm3 - 15,91,219, //cvtdq2ps %xmm3,%xmm3 - 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 - 68,15,89,195, //mulps %xmm3,%xmm8 - 102,15,110,90,112, //movd 0x70(%rdx),%xmm3 + 102,65,15,219,216, //pand %xmm8,%xmm3 + 68,15,91,203, //cvtdq2ps %xmm3,%xmm9 + 184,8,33,132,55, //mov $0x37842108,%eax + 102,68,15,110,208, //movd %eax,%xmm10 + 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 + 69,15,89,209, //mulps %xmm9,%xmm10 + 184,224,7,0,0, //mov $0x7e0,%eax + 102,15,110,216, //movd %eax,%xmm3 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 - 102,65,15,219,217, //pand %xmm9,%xmm3 + 102,65,15,219,216, //pand %xmm8,%xmm3 68,15,91,203, //cvtdq2ps %xmm3,%xmm9 - 243,15,16,90,124, //movss 0x7c(%rdx),%xmm3 + 184,33,8,2,58, //mov $0x3a020821,%eax + 102,68,15,110,216, //movd %eax,%xmm11 + 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 + 69,15,89,217, //mulps %xmm9,%xmm11 + 184,31,0,0,0, //mov $0x1f,%eax + 102,15,110,216, //movd %eax,%xmm3 + 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 + 102,65,15,219,216, //pand %xmm8,%xmm3 + 68,15,91,195, //cvtdq2ps %xmm3,%xmm8 + 184,8,33,4,61, //mov $0x3d042108,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 - 65,15,89,217, //mulps %xmm9,%xmm3 + 65,15,89,216, //mulps %xmm8,%xmm3 15,92,196, //subps %xmm4,%xmm0 - 65,15,89,195, //mulps %xmm11,%xmm0 + 65,15,89,194, //mulps %xmm10,%xmm0 15,88,196, //addps %xmm4,%xmm0 15,92,205, //subps %xmm5,%xmm1 - 65,15,89,200, //mulps %xmm8,%xmm1 + 65,15,89,203, //mulps %xmm11,%xmm1 15,88,205, //addps %xmm5,%xmm1 15,92,214, //subps %xmm6,%xmm2 15,89,211, //mulps %xmm3,%xmm2 @@ -6525,7 +6816,8 @@ CODE const uint8_t sk_load_tables_sse2[] = { 72,139,8, //mov (%rax),%rcx 76,139,64,8, //mov 0x8(%rax),%r8 243,68,15,111,4,185, //movdqu (%rcx,%rdi,4),%xmm8 - 102,15,110,66,16, //movd 0x10(%rdx),%xmm0 + 185,255,0,0,0, //mov $0xff,%ecx + 102,15,110,193, //movd %ecx,%xmm0 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 102,69,15,111,200, //movdqa %xmm8,%xmm9 102,65,15,114,209,8, //psrld $0x8,%xmm9 @@ -6580,7 +6872,8 @@ CODE const uint8_t sk_load_tables_sse2[] = { 65,15,20,209, //unpcklps %xmm9,%xmm2 102,65,15,114,208,24, //psrld $0x18,%xmm8 69,15,91,192, //cvtdq2ps %xmm8,%xmm8 - 243,15,16,90,12, //movss 0xc(%rdx),%xmm3 + 184,129,128,128,59, //mov $0x3b808081,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 65,15,89,216, //mulps %xmm8,%xmm3 72,173, //lods %ds:(%rsi),%rax @@ -6595,7 +6888,8 @@ CODE const uint8_t sk_load_a8_sse2[] = { 102,15,96,193, //punpcklbw %xmm1,%xmm0 102,15,97,193, //punpcklwd %xmm1,%xmm0 15,91,192, //cvtdq2ps %xmm0,%xmm0 - 243,15,16,90,12, //movss 0xc(%rdx),%xmm3 + 184,129,128,128,59, //mov $0x3b808081,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 15,89,216, //mulps %xmm0,%xmm3 72,173, //lods %ds:(%rsi),%rax @@ -6608,7 +6902,8 @@ CODE const uint8_t sk_load_a8_sse2[] = { CODE const uint8_t sk_store_a8_sse2[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax - 243,68,15,16,66,8, //movss 0x8(%rdx),%xmm8 + 185,0,0,127,67, //mov $0x437f0000,%ecx + 102,68,15,110,193, //movd %ecx,%xmm8 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 68,15,89,195, //mulps %xmm3,%xmm8 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 @@ -6624,31 +6919,38 @@ CODE const uint8_t sk_store_a8_sse2[] = { CODE const uint8_t sk_load_565_sse2[] = { 
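// Descriptive note on the body below (commentary added for readability, not
// generated output): it unpacks four RGB 565 pixels. Each channel is isolated
// with its bit mask (0xf800, 0x7e0, 0x1f), converted to float, then multiplied
// by the float reciprocal of that mask -- 0x37842108 = 1/0xf800,
// 0x3a020821 = 1/0x7e0, 0x3d042108 = 1/0x1f -- which normalizes the
// masked-but-unshifted channel to [0,1] without a separate shift.
// Alpha is set to 1.0f (0x3f800000).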
72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax - 243,68,15,126,12,120, //movq (%rax,%rdi,2),%xmm9 + 243,15,126,20,120, //movq (%rax,%rdi,2),%xmm2 102,15,239,192, //pxor %xmm0,%xmm0 - 102,68,15,97,200, //punpcklwd %xmm0,%xmm9 - 102,15,110,66,104, //movd 0x68(%rdx),%xmm0 + 102,15,97,208, //punpcklwd %xmm0,%xmm2 + 184,0,248,0,0, //mov $0xf800,%eax + 102,15,110,192, //movd %eax,%xmm0 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 - 102,65,15,219,193, //pand %xmm9,%xmm0 + 102,15,219,194, //pand %xmm2,%xmm0 15,91,200, //cvtdq2ps %xmm0,%xmm1 - 243,15,16,26, //movss (%rdx),%xmm3 - 243,15,16,66,116, //movss 0x74(%rdx),%xmm0 + 184,8,33,132,55, //mov $0x37842108,%eax + 102,15,110,192, //movd %eax,%xmm0 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 15,89,193, //mulps %xmm1,%xmm0 - 102,15,110,74,108, //movd 0x6c(%rdx),%xmm1 + 184,224,7,0,0, //mov $0x7e0,%eax + 102,15,110,200, //movd %eax,%xmm1 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1 - 102,65,15,219,201, //pand %xmm9,%xmm1 - 68,15,91,193, //cvtdq2ps %xmm1,%xmm8 - 243,15,16,74,120, //movss 0x78(%rdx),%xmm1 + 102,15,219,202, //pand %xmm2,%xmm1 + 15,91,217, //cvtdq2ps %xmm1,%xmm3 + 184,33,8,2,58, //mov $0x3a020821,%eax + 102,15,110,200, //movd %eax,%xmm1 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 - 65,15,89,200, //mulps %xmm8,%xmm1 - 102,15,110,82,112, //movd 0x70(%rdx),%xmm2 - 102,15,112,210,0, //pshufd $0x0,%xmm2,%xmm2 - 102,65,15,219,209, //pand %xmm9,%xmm2 - 68,15,91,194, //cvtdq2ps %xmm2,%xmm8 - 243,15,16,82,124, //movss 0x7c(%rdx),%xmm2 + 15,89,203, //mulps %xmm3,%xmm1 + 184,31,0,0,0, //mov $0x1f,%eax + 102,15,110,216, //movd %eax,%xmm3 + 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 + 102,15,219,218, //pand %xmm2,%xmm3 + 15,91,219, //cvtdq2ps %xmm3,%xmm3 + 184,8,33,4,61, //mov $0x3d042108,%eax + 102,15,110,208, //movd %eax,%xmm2 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 - 65,15,89,208, //mulps %xmm8,%xmm2 + 15,89,211, //mulps %xmm3,%xmm2 + 184,0,0,128,63, //mov $0x3f800000,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax @@ -6657,21 +6959,23 @@ CODE const uint8_t sk_load_565_sse2[] = { CODE const uint8_t sk_store_565_sse2[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax - 243,68,15,16,130,128,0,0,0, //movss 0x80(%rdx),%xmm8 - 243,68,15,16,138,132,0,0,0, //movss 0x84(%rdx),%xmm9 + 185,0,0,248,65, //mov $0x41f80000,%ecx + 102,68,15,110,193, //movd %ecx,%xmm8 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 - 69,15,40,208, //movaps %xmm8,%xmm10 - 68,15,89,208, //mulps %xmm0,%xmm10 - 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10 - 102,65,15,114,242,11, //pslld $0xb,%xmm10 - 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 - 68,15,89,201, //mulps %xmm1,%xmm9 + 69,15,40,200, //movaps %xmm8,%xmm9 + 68,15,89,200, //mulps %xmm0,%xmm9 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9 - 102,65,15,114,241,5, //pslld $0x5,%xmm9 - 102,69,15,235,202, //por %xmm10,%xmm9 + 102,65,15,114,241,11, //pslld $0xb,%xmm9 + 185,0,0,124,66, //mov $0x427c0000,%ecx + 102,68,15,110,209, //movd %ecx,%xmm10 + 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 + 68,15,89,209, //mulps %xmm1,%xmm10 + 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10 + 102,65,15,114,242,5, //pslld $0x5,%xmm10 + 102,69,15,235,209, //por %xmm9,%xmm10 68,15,89,194, //mulps %xmm2,%xmm8 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 - 102,69,15,86,193, //orpd %xmm9,%xmm8 + 102,69,15,86,194, //orpd %xmm10,%xmm8 102,65,15,114,240,16, //pslld $0x10,%xmm8 102,65,15,114,224,16, //psrad $0x10,%xmm8 102,69,15,107,192, //packssdw %xmm8,%xmm8 @@ -6749,7 
+7053,8 @@ CODE const uint8_t sk_load_f16_sse2[] = { 102,68,15,111,194, //movdqa %xmm2,%xmm8 102,68,15,97,192, //punpcklwd %xmm0,%xmm8 102,15,105,208, //punpckhwd %xmm0,%xmm2 - 102,15,110,66,100, //movd 0x64(%rdx),%xmm0 + 184,0,4,0,4, //mov $0x4000400,%eax + 102,15,110,192, //movd %eax,%xmm0 102,15,112,216,0, //pshufd $0x0,%xmm0,%xmm3 102,15,111,203, //movdqa %xmm3,%xmm1 102,65,15,101,200, //pcmpgtw %xmm8,%xmm1 @@ -6760,7 +7065,8 @@ CODE const uint8_t sk_load_f16_sse2[] = { 102,15,111,193, //movdqa %xmm1,%xmm0 102,65,15,97,192, //punpcklwd %xmm8,%xmm0 102,15,114,240,13, //pslld $0xd,%xmm0 - 102,15,110,82,92, //movd 0x5c(%rdx),%xmm2 + 184,0,0,128,119, //mov $0x77800000,%eax + 102,15,110,208, //movd %eax,%xmm2 102,68,15,112,202,0, //pshufd $0x0,%xmm2,%xmm9 65,15,89,193, //mulps %xmm9,%xmm0 102,65,15,105,200, //punpckhwd %xmm8,%xmm1 @@ -6780,7 +7086,8 @@ CODE const uint8_t sk_load_f16_sse2[] = { CODE const uint8_t sk_store_f16_sse2[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax - 102,68,15,110,66,96, //movd 0x60(%rdx),%xmm8 + 185,0,0,128,7, //mov $0x7800000,%ecx + 102,68,15,110,193, //movd %ecx,%xmm8 102,69,15,112,192,0, //pshufd $0x0,%xmm8,%xmm8 102,69,15,111,200, //movdqa %xmm8,%xmm9 68,15,89,200, //mulps %xmm0,%xmm9 @@ -6970,17 +7277,20 @@ CODE const uint8_t sk_mirror_y_sse2[] = { }; CODE const uint8_t sk_luminance_to_alpha_sse2[] = { - 243,15,16,154,136,0,0,0, //movss 0x88(%rdx),%xmm3 - 243,68,15,16,130,140,0,0,0, //movss 0x8c(%rdx),%xmm8 + 184,208,179,89,62, //mov $0x3e59b3d0,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 15,89,216, //mulps %xmm0,%xmm3 - 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 - 68,15,89,193, //mulps %xmm1,%xmm8 - 68,15,88,195, //addps %xmm3,%xmm8 - 243,15,16,154,144,0,0,0, //movss 0x90(%rdx),%xmm3 + 184,89,23,55,63, //mov $0x3f371759,%eax + 102,15,110,192, //movd %eax,%xmm0 + 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 + 15,89,193, //mulps %xmm1,%xmm0 + 15,88,195, //addps %xmm3,%xmm0 + 184,152,221,147,61, //mov $0x3d93dd98,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 15,89,218, //mulps %xmm2,%xmm3 - 65,15,88,216, //addps %xmm8,%xmm3 + 15,88,216, //addps %xmm0,%xmm3 72,173, //lods %ds:(%rsi),%rax 15,87,192, //xorps %xmm0,%xmm0 15,87,201, //xorps %xmm1,%xmm1 @@ -7321,7 +7631,7 @@ CODE const uint8_t sk_seed_shader_hsw[] = { 196,193,121,110,200, //vmovd %r8d,%xmm1 196,226,125,24,201, //vbroadcastss %xmm1,%ymm1 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0 - 197,252,88,66,20, //vaddps 0x14(%rdx),%ymm0,%ymm0 + 197,252,88,2, //vaddps (%rdx),%ymm0,%ymm0 196,226,125,24,16, //vbroadcastss (%rax),%ymm2 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1 @@ -7501,16 +7811,26 @@ CODE const uint8_t sk_unpremul_hsw[] = { }; CODE const uint8_t sk_from_srgb_hsw[] = { - 196,98,125,24,66,64, //vbroadcastss 0x40(%rdx),%ymm8 + 184,145,131,158,61, //mov $0x3d9e8391,%eax + 197,121,110,192, //vmovd %eax,%xmm8 + 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 197,124,89,208, //vmulps %ymm0,%ymm0,%ymm10 - 196,98,125,24,90,60, //vbroadcastss 0x3c(%rdx),%ymm11 - 196,98,125,24,98,56, //vbroadcastss 0x38(%rdx),%ymm12 + 184,154,153,153,62, //mov $0x3e99999a,%eax + 197,121,110,216, //vmovd %eax,%xmm11 + 196,66,125,24,219, //vbroadcastss %xmm11,%ymm11 + 184,92,143,50,63, //mov $0x3f328f5c,%eax + 197,121,110,224, //vmovd %eax,%xmm12 + 196,66,125,24,228, //vbroadcastss %xmm12,%ymm12 196,65,124,40,235, //vmovaps %ymm11,%ymm13 196,66,125,168,236, 
//vfmadd213ps %ymm12,%ymm0,%ymm13 - 196,98,125,24,114,52, //vbroadcastss 0x34(%rdx),%ymm14 + 184,10,215,35,59, //mov $0x3b23d70a,%eax + 197,121,110,240, //vmovd %eax,%xmm14 + 196,66,125,24,246, //vbroadcastss %xmm14,%ymm14 196,66,45,168,238, //vfmadd213ps %ymm14,%ymm10,%ymm13 - 196,98,125,24,82,68, //vbroadcastss 0x44(%rdx),%ymm10 + 184,174,71,97,61, //mov $0x3d6147ae,%eax + 197,121,110,208, //vmovd %eax,%xmm10 + 196,66,125,24,210, //vbroadcastss %xmm10,%ymm10 196,193,124,194,194,1, //vcmpltps %ymm10,%ymm0,%ymm0 196,195,21,74,193,0, //vblendvps %ymm0,%ymm9,%ymm13,%ymm0 197,60,89,201, //vmulps %ymm1,%ymm8,%ymm9 @@ -7532,37 +7852,50 @@ CODE const uint8_t sk_from_srgb_hsw[] = { CODE const uint8_t sk_to_srgb_hsw[] = { 197,124,82,192, //vrsqrtps %ymm0,%ymm8 - 196,65,124,83,200, //vrcpps %ymm8,%ymm9 - 196,65,124,82,208, //vrsqrtps %ymm8,%ymm10 - 196,98,125,24,66,72, //vbroadcastss 0x48(%rdx),%ymm8 - 197,60,89,216, //vmulps %ymm0,%ymm8,%ymm11 - 196,98,125,24,34, //vbroadcastss (%rdx),%ymm12 - 196,98,125,24,106,76, //vbroadcastss 0x4c(%rdx),%ymm13 - 196,98,125,24,114,80, //vbroadcastss 0x50(%rdx),%ymm14 - 196,98,125,24,122,84, //vbroadcastss 0x54(%rdx),%ymm15 - 196,66,13,168,207, //vfmadd213ps %ymm15,%ymm14,%ymm9 - 196,66,21,184,202, //vfmadd231ps %ymm10,%ymm13,%ymm9 - 196,65,28,93,201, //vminps %ymm9,%ymm12,%ymm9 - 196,98,125,24,82,88, //vbroadcastss 0x58(%rdx),%ymm10 - 196,193,124,194,194,1, //vcmpltps %ymm10,%ymm0,%ymm0 - 196,195,53,74,195,0, //vblendvps %ymm0,%ymm11,%ymm9,%ymm0 - 197,124,82,201, //vrsqrtps %ymm1,%ymm9 - 196,65,124,83,217, //vrcpps %ymm9,%ymm11 - 196,65,124,82,201, //vrsqrtps %ymm9,%ymm9 - 196,66,13,168,223, //vfmadd213ps %ymm15,%ymm14,%ymm11 - 196,66,21,184,217, //vfmadd231ps %ymm9,%ymm13,%ymm11 - 197,60,89,201, //vmulps %ymm1,%ymm8,%ymm9 - 196,65,28,93,219, //vminps %ymm11,%ymm12,%ymm11 - 196,193,116,194,202,1, //vcmpltps %ymm10,%ymm1,%ymm1 - 196,195,37,74,201,16, //vblendvps %ymm1,%ymm9,%ymm11,%ymm1 - 197,124,82,202, //vrsqrtps %ymm2,%ymm9 - 196,65,124,83,217, //vrcpps %ymm9,%ymm11 + 196,65,124,83,216, //vrcpps %ymm8,%ymm11 + 196,65,124,82,224, //vrsqrtps %ymm8,%ymm12 + 184,41,92,71,65, //mov $0x41475c29,%eax + 197,121,110,192, //vmovd %eax,%xmm8 + 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 197,60,89,232, //vmulps %ymm0,%ymm8,%ymm13 + 184,0,0,128,63, //mov $0x3f800000,%eax + 197,121,110,200, //vmovd %eax,%xmm9 + 196,66,125,24,201, //vbroadcastss %xmm9,%ymm9 + 184,194,135,210,62, //mov $0x3ed287c2,%eax + 197,121,110,208, //vmovd %eax,%xmm10 + 196,66,125,24,210, //vbroadcastss %xmm10,%ymm10 + 184,206,111,48,63, //mov $0x3f306fce,%eax + 197,121,110,240, //vmovd %eax,%xmm14 + 196,66,125,24,246, //vbroadcastss %xmm14,%ymm14 + 184,168,87,202,61, //mov $0x3dca57a8,%eax + 53,0,0,0,128, //xor $0x80000000,%eax + 197,121,110,248, //vmovd %eax,%xmm15 + 196,66,125,24,255, //vbroadcastss %xmm15,%ymm15 196,66,13,168,223, //vfmadd213ps %ymm15,%ymm14,%ymm11 - 196,65,124,82,201, //vrsqrtps %ymm9,%ymm9 - 196,66,21,184,217, //vfmadd231ps %ymm9,%ymm13,%ymm11 - 196,65,28,93,203, //vminps %ymm11,%ymm12,%ymm9 + 196,66,45,184,220, //vfmadd231ps %ymm12,%ymm10,%ymm11 + 196,65,52,93,219, //vminps %ymm11,%ymm9,%ymm11 + 184,4,231,140,59, //mov $0x3b8ce704,%eax + 197,121,110,224, //vmovd %eax,%xmm12 + 196,66,125,24,228, //vbroadcastss %xmm12,%ymm12 + 196,193,124,194,196,1, //vcmpltps %ymm12,%ymm0,%ymm0 + 196,195,37,74,197,0, //vblendvps %ymm0,%ymm13,%ymm11,%ymm0 + 197,124,82,217, //vrsqrtps %ymm1,%ymm11 + 196,65,124,83,235, //vrcpps %ymm11,%ymm13 + 196,65,124,82,219, //vrsqrtps %ymm11,%ymm11 + 
196,66,13,168,239, //vfmadd213ps %ymm15,%ymm14,%ymm13 + 196,66,45,184,235, //vfmadd231ps %ymm11,%ymm10,%ymm13 + 197,60,89,217, //vmulps %ymm1,%ymm8,%ymm11 + 196,65,52,93,237, //vminps %ymm13,%ymm9,%ymm13 + 196,193,116,194,204,1, //vcmpltps %ymm12,%ymm1,%ymm1 + 196,195,21,74,203,16, //vblendvps %ymm1,%ymm11,%ymm13,%ymm1 + 197,124,82,218, //vrsqrtps %ymm2,%ymm11 + 196,65,124,83,235, //vrcpps %ymm11,%ymm13 + 196,66,13,168,239, //vfmadd213ps %ymm15,%ymm14,%ymm13 + 196,65,124,82,219, //vrsqrtps %ymm11,%ymm11 + 196,66,45,184,235, //vfmadd231ps %ymm11,%ymm10,%ymm13 + 196,65,52,93,205, //vminps %ymm13,%ymm9,%ymm9 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8 - 196,193,108,194,210,1, //vcmpltps %ymm10,%ymm2,%ymm2 + 196,193,108,194,212,1, //vcmpltps %ymm12,%ymm2,%ymm2 196,195,53,74,208,32, //vblendvps %ymm2,%ymm8,%ymm9,%ymm2 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax @@ -7585,7 +7918,7 @@ CODE const uint8_t sk_scale_u8_hsw[] = { 72,139,0, //mov (%rax),%rax 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 - 117,56, //jne 4f9 <_sk_scale_u8_hsw+0x48> + 117,56, //jne 556 <_sk_scale_u8_hsw+0x48> 197,123,16,0, //vmovsd (%rax),%xmm8 196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 @@ -7609,9 +7942,9 @@ CODE const uint8_t sk_scale_u8_hsw[] = { 77,9,217, //or %r11,%r9 72,131,193,8, //add $0x8,%rcx 73,255,202, //dec %r10 - 117,234, //jne 501 <_sk_scale_u8_hsw+0x50> + 117,234, //jne 55e <_sk_scale_u8_hsw+0x50> 196,65,249,110,193, //vmovq %r9,%xmm8 - 235,167, //jmp 4c5 <_sk_scale_u8_hsw+0x14> + 235,167, //jmp 522 <_sk_scale_u8_hsw+0x14> }; CODE const uint8_t sk_lerp_1_float_hsw[] = { @@ -7635,7 +7968,7 @@ CODE const uint8_t sk_lerp_u8_hsw[] = { 72,139,0, //mov (%rax),%rax 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 - 117,76, //jne 5a9 <_sk_lerp_u8_hsw+0x5c> + 117,76, //jne 606 <_sk_lerp_u8_hsw+0x5c> 197,123,16,0, //vmovsd (%rax),%xmm8 196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 @@ -7663,37 +7996,49 @@ CODE const uint8_t sk_lerp_u8_hsw[] = { 77,9,217, //or %r11,%r9 72,131,193,8, //add $0x8,%rcx 73,255,202, //dec %r10 - 117,234, //jne 5b1 <_sk_lerp_u8_hsw+0x64> + 117,234, //jne 60e <_sk_lerp_u8_hsw+0x64> 196,65,249,110,193, //vmovq %r9,%xmm8 - 235,147, //jmp 561 <_sk_lerp_u8_hsw+0x14> + 235,147, //jmp 5be <_sk_lerp_u8_hsw+0x14> }; CODE const uint8_t sk_lerp_565_hsw[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,16, //mov (%rax),%r10 72,133,201, //test %rcx,%rcx - 15,133,132,0,0,0, //jne 660 <_sk_lerp_565_hsw+0x92> + 15,133,179,0,0,0, //jne 6ec <_sk_lerp_565_hsw+0xc1> 196,193,122,111,28,122, //vmovdqu (%r10,%rdi,2),%xmm3 - 196,226,125,51,219, //vpmovzxwd %xmm3,%ymm3 - 196,98,125,88,66,104, //vpbroadcastd 0x68(%rdx),%ymm8 - 197,61,219,195, //vpand %ymm3,%ymm8,%ymm8 - 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 - 196,98,125,24,74,116, //vbroadcastss 0x74(%rdx),%ymm9 - 196,65,52,89,192, //vmulps %ymm8,%ymm9,%ymm8 - 196,98,125,88,74,108, //vpbroadcastd 0x6c(%rdx),%ymm9 - 197,53,219,203, //vpand %ymm3,%ymm9,%ymm9 - 196,65,124,91,201, //vcvtdq2ps %ymm9,%ymm9 - 196,98,125,24,82,120, //vbroadcastss 0x78(%rdx),%ymm10 - 196,65,44,89,201, //vmulps %ymm9,%ymm10,%ymm9 - 196,98,125,88,82,112, //vpbroadcastd 0x70(%rdx),%ymm10 - 197,173,219,219, //vpand %ymm3,%ymm10,%ymm3 - 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3 - 196,98,125,24,82,124, //vbroadcastss 0x7c(%rdx),%ymm10 - 197,172,89,219, //vmulps %ymm3,%ymm10,%ymm3 + 196,98,125,51,195, //vpmovzxwd %xmm3,%ymm8 + 184,0,248,0,0, //mov $0xf800,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 
196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 + 196,193,101,219,216, //vpand %ymm8,%ymm3,%ymm3 + 197,124,91,203, //vcvtdq2ps %ymm3,%ymm9 + 184,8,33,132,55, //mov $0x37842108,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 197,52,89,203, //vmulps %ymm3,%ymm9,%ymm9 + 184,224,7,0,0, //mov $0x7e0,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 + 196,193,101,219,216, //vpand %ymm8,%ymm3,%ymm3 + 197,124,91,211, //vcvtdq2ps %ymm3,%ymm10 + 184,33,8,2,58, //mov $0x3a020821,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 197,44,89,211, //vmulps %ymm3,%ymm10,%ymm10 + 184,31,0,0,0, //mov $0x1f,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 + 196,193,101,219,216, //vpand %ymm8,%ymm3,%ymm3 + 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 + 184,8,33,4,61, //mov $0x3d042108,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 - 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0 + 196,226,53,168,196, //vfmadd213ps %ymm4,%ymm9,%ymm0 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1 - 196,226,53,168,205, //vfmadd213ps %ymm5,%ymm9,%ymm1 + 196,226,45,168,205, //vfmadd213ps %ymm5,%ymm10,%ymm1 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2 196,226,101,168,214, //vfmadd213ps %ymm6,%ymm3,%ymm2 184,0,0,128,63, //mov $0x3f800000,%eax @@ -7707,8 +8052,8 @@ CODE const uint8_t sk_lerp_565_hsw[] = { 65,254,200, //dec %r8b 69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 15,135,102,255,255,255, //ja 5e2 <_sk_lerp_565_hsw+0x14> - 76,141,13,73,0,0,0, //lea 0x49(%rip),%r9 # 6cc <_sk_lerp_565_hsw+0xfe> + 15,135,55,255,255,255, //ja 63f <_sk_lerp_565_hsw+0x14> + 76,141,13,73,0,0,0, //lea 0x49(%rip),%r9 # 758 <_sk_lerp_565_hsw+0x12d> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax 255,224, //jmpq *%rax @@ -7720,7 +8065,7 @@ CODE const uint8_t sk_lerp_565_hsw[] = { 196,193,97,196,92,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm3,%xmm3 196,193,97,196,92,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm3,%xmm3 196,193,97,196,28,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm3,%xmm3 - 233,22,255,255,255, //jmpq 5e2 <_sk_lerp_565_hsw+0x14> + 233,231,254,255,255, //jmpq 63f <_sk_lerp_565_hsw+0x14> 244, //hlt 255, //(bad) 255, //(bad) @@ -7752,9 +8097,11 @@ CODE const uint8_t sk_load_tables_hsw[] = { 76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9 76,3,8, //add (%rax),%r9 77,133,192, //test %r8,%r8 - 117,106, //jne 767 <_sk_load_tables_hsw+0x7f> + 117,121, //jne 802 <_sk_load_tables_hsw+0x8e> 196,193,126,111,25, //vmovdqu (%r9),%ymm3 - 196,226,125,88,82,16, //vpbroadcastd 0x10(%rdx),%ymm2 + 185,255,0,0,0, //mov $0xff,%ecx + 197,249,110,193, //vmovd %ecx,%xmm0 + 196,226,125,88,208, //vpbroadcastd %xmm0,%ymm2 197,237,219,203, //vpand %ymm3,%ymm2,%ymm1 196,65,61,118,192, //vpcmpeqd %ymm8,%ymm8,%ymm8 72,139,72,8, //mov 0x8(%rax),%rcx @@ -7770,9 +8117,11 @@ CODE const uint8_t sk_load_tables_hsw[] = { 196,65,109,219,201, //vpand %ymm9,%ymm2,%ymm9 196,162,61,146,20,136, //vgatherdps %ymm8,(%rax,%ymm9,4),%ymm2 197,229,114,211,24, //vpsrld $0x18,%ymm3,%ymm3 - 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3 - 196,98,125,24,66,12, //vbroadcastss 0xc(%rdx),%ymm8 - 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3 + 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 + 184,129,128,128,59, //mov $0x3b808081,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 
196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 72,173, //lods %ds:(%rsi),%rax 76,137,193, //mov %r8,%rcx 255,224, //jmpq *%rax @@ -7784,7 +8133,7 @@ CODE const uint8_t sk_load_tables_hsw[] = { 196,193,249,110,194, //vmovq %r10,%xmm0 196,226,125,33,192, //vpmovsxbd %xmm0,%ymm0 196,194,125,140,25, //vpmaskmovd (%r9),%ymm0,%ymm3 - 233,114,255,255,255, //jmpq 702 <_sk_load_tables_hsw+0x1a> + 233,99,255,255,255, //jmpq 78e <_sk_load_tables_hsw+0x1a> }; CODE const uint8_t sk_load_a8_hsw[] = { @@ -7793,11 +8142,13 @@ CODE const uint8_t sk_load_a8_hsw[] = { 72,139,0, //mov (%rax),%rax 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 - 117,42, //jne 7ca <_sk_load_a8_hsw+0x3a> + 117,50, //jne 86d <_sk_load_a8_hsw+0x42> 197,251,16,0, //vmovsd (%rax),%xmm0 196,226,125,49,192, //vpmovzxbd %xmm0,%ymm0 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 - 196,226,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm1 + 184,129,128,128,59, //mov $0x3b808081,%eax + 197,249,110,200, //vmovd %eax,%xmm1 + 196,226,125,24,201, //vbroadcastss %xmm1,%ymm1 197,252,89,217, //vmulps %ymm1,%ymm0,%ymm3 72,173, //lods %ds:(%rsi),%rax 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 @@ -7814,22 +8165,24 @@ CODE const uint8_t sk_load_a8_hsw[] = { 77,9,217, //or %r11,%r9 72,131,193,8, //add $0x8,%rcx 73,255,202, //dec %r10 - 117,234, //jne 7d2 <_sk_load_a8_hsw+0x42> + 117,234, //jne 875 <_sk_load_a8_hsw+0x4a> 196,193,249,110,193, //vmovq %r9,%xmm0 - 235,181, //jmp 7a4 <_sk_load_a8_hsw+0x14> + 235,173, //jmp 83f <_sk_load_a8_hsw+0x14> }; CODE const uint8_t sk_store_a8_hsw[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,8, //mov (%rax),%r9 - 196,98,125,24,66,8, //vbroadcastss 0x8(%rdx),%ymm8 + 184,0,0,127,67, //mov $0x437f0000,%eax + 197,121,110,192, //vmovd %eax,%xmm8 + 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 196,65,57,103,192, //vpackuswb %xmm8,%xmm8,%xmm8 72,133,201, //test %rcx,%rcx - 117,10, //jne 822 <_sk_store_a8_hsw+0x33> + 117,10, //jne 8cd <_sk_store_a8_hsw+0x3b> 196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax @@ -7838,9 +8191,9 @@ CODE const uint8_t sk_store_a8_hsw[] = { 254,200, //dec %al 68,15,182,192, //movzbl %al,%r8d 65,128,248,6, //cmp $0x6,%r8b - 119,236, //ja 81e <_sk_store_a8_hsw+0x2f> + 119,236, //ja 8c9 <_sk_store_a8_hsw+0x37> 196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8 - 76,141,21,66,0,0,0, //lea 0x42(%rip),%r10 # 880 <_sk_store_a8_hsw+0x91> + 76,141,21,67,0,0,0, //lea 0x43(%rip),%r10 # 92c <_sk_store_a8_hsw+0x9a> 75,99,4,130, //movslq (%r10,%r8,4),%rax 76,1,208, //add %r10,%rax 255,224, //jmpq *%rax @@ -7851,26 +8204,27 @@ CODE const uint8_t sk_store_a8_hsw[] = { 196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1) 196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1) 196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1) - 235,158, //jmp 81e <_sk_store_a8_hsw+0x2f> - 247,255, //idiv %edi + 235,158, //jmp 8c9 <_sk_store_a8_hsw+0x37> + 144, //nop + 246,255, //idiv %bh 255, //(bad) 255, //(bad) - 239, //out %eax,(%dx) + 238, //out %al,(%dx) 255, //(bad) 255, //(bad) - 255,231, //jmpq *%rdi + 255,230, //jmpq *%rsi 255, //(bad) 255, //(bad) 255, //(bad) - 223,255, //(bad) + 222,255, //fdivrp %st,%st(7) 255, //(bad) - 255,215, //callq *%rdi + 255,214, //callq *%rsi 255, //(bad) 255, //(bad) - 255,207, //dec %edi + 255,206, //dec 
%esi 255, //(bad) 255, //(bad) - 255,199, //inc %edi + 255,198, //inc %esi 255, //(bad) 255, //(bad) 255, //.byte 0xff @@ -7880,25 +8234,39 @@ CODE const uint8_t sk_load_565_hsw[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,16, //mov (%rax),%r10 72,133,201, //test %rcx,%rcx - 117,92, //jne 902 <_sk_load_565_hsw+0x66> + 15,133,149,0,0,0, //jne 9eb <_sk_load_565_hsw+0xa3> 196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0 196,226,125,51,208, //vpmovzxwd %xmm0,%ymm2 - 196,226,125,88,66,104, //vpbroadcastd 0x68(%rdx),%ymm0 + 184,0,248,0,0, //mov $0xf800,%eax + 197,249,110,192, //vmovd %eax,%xmm0 + 196,226,125,88,192, //vpbroadcastd %xmm0,%ymm0 197,253,219,194, //vpand %ymm2,%ymm0,%ymm0 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 - 196,226,125,24,74,116, //vbroadcastss 0x74(%rdx),%ymm1 - 197,244,89,192, //vmulps %ymm0,%ymm1,%ymm0 - 196,226,125,88,74,108, //vpbroadcastd 0x6c(%rdx),%ymm1 + 184,8,33,132,55, //mov $0x37842108,%eax + 197,249,110,200, //vmovd %eax,%xmm1 + 196,226,125,24,201, //vbroadcastss %xmm1,%ymm1 + 197,252,89,193, //vmulps %ymm1,%ymm0,%ymm0 + 184,224,7,0,0, //mov $0x7e0,%eax + 197,249,110,200, //vmovd %eax,%xmm1 + 196,226,125,88,201, //vpbroadcastd %xmm1,%ymm1 197,245,219,202, //vpand %ymm2,%ymm1,%ymm1 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1 - 196,226,125,24,90,120, //vbroadcastss 0x78(%rdx),%ymm3 - 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1 - 196,226,125,88,90,112, //vpbroadcastd 0x70(%rdx),%ymm3 + 184,33,8,2,58, //mov $0x3a020821,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 197,244,89,203, //vmulps %ymm3,%ymm1,%ymm1 + 184,31,0,0,0, //mov $0x1f,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 197,229,219,210, //vpand %ymm2,%ymm3,%ymm2 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 - 196,226,125,24,90,124, //vbroadcastss 0x7c(%rdx),%ymm3 - 197,228,89,210, //vmulps %ymm2,%ymm3,%ymm2 - 196,226,125,24,26, //vbroadcastss (%rdx),%ymm3 + 184,8,33,4,61, //mov $0x3d042108,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2 + 184,0,0,128,63, //mov $0x3f800000,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax 65,137,200, //mov %ecx,%r8d @@ -7907,8 +8275,8 @@ CODE const uint8_t sk_load_565_hsw[] = { 65,254,200, //dec %r8b 69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 119,146, //ja 8ac <_sk_load_565_hsw+0x10> - 76,141,13,75,0,0,0, //lea 0x4b(%rip),%r9 # 96c <_sk_load_565_hsw+0xd0> + 15,135,85,255,255,255, //ja 95c <_sk_load_565_hsw+0x14> + 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # a58 <_sk_load_565_hsw+0x110> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax 255,224, //jmpq *%rax @@ -7920,28 +8288,27 @@ CODE const uint8_t sk_load_565_hsw[] = { 196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0 196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0 196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0 - 233,66,255,255,255, //jmpq 8ac <_sk_load_565_hsw+0x10> - 102,144, //xchg %ax,%ax - 242,255, //repnz (bad) - 255, //(bad) + 233,5,255,255,255, //jmpq 95c <_sk_load_565_hsw+0x14> + 144, //nop + 243,255, //repz (bad) 255, //(bad) - 234, //(bad) 255, //(bad) + 235,255, //jmp a5d <_sk_load_565_hsw+0x115> 255, //(bad) - 255,226, //jmpq *%rdx + 255,227, //jmpq *%rbx 255, //(bad) 255, //(bad) 255, //(bad) - 218,255, //(bad) + 219,255, //(bad) 255, //(bad) - 
255,210, //callq *%rdx + 255,211, //callq *%rbx 255, //(bad) 255, //(bad) - 255,202, //dec %edx + 255,203, //dec %ebx 255, //(bad) 255, //(bad) 255, //(bad) - 190, //.byte 0xbe + 191, //.byte 0xbf 255, //(bad) 255, //(bad) 255, //.byte 0xff @@ -7950,11 +8317,15 @@ CODE const uint8_t sk_load_565_hsw[] = { CODE const uint8_t sk_store_565_hsw[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,8, //mov (%rax),%r9 - 196,98,125,24,130,128,0,0,0, //vbroadcastss 0x80(%rdx),%ymm8 + 184,0,0,248,65, //mov $0x41f80000,%eax + 197,121,110,192, //vmovd %eax,%xmm8 + 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9 196,193,53,114,241,11, //vpslld $0xb,%ymm9,%ymm9 - 196,98,125,24,146,132,0,0,0, //vbroadcastss 0x84(%rdx),%ymm10 + 184,0,0,124,66, //mov $0x427c0000,%eax + 197,121,110,208, //vmovd %eax,%xmm10 + 196,66,125,24,210, //vbroadcastss %xmm10,%ymm10 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10 196,193,45,114,242,5, //vpslld $0x5,%ymm10,%ymm10 @@ -7965,7 +8336,7 @@ CODE const uint8_t sk_store_565_hsw[] = { 196,67,125,57,193,1, //vextracti128 $0x1,%ymm8,%xmm9 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 72,133,201, //test %rcx,%rcx - 117,10, //jne 9ea <_sk_store_565_hsw+0x62> + 117,10, //jne ae0 <_sk_store_565_hsw+0x6c> 196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax @@ -7974,8 +8345,8 @@ CODE const uint8_t sk_store_565_hsw[] = { 254,200, //dec %al 68,15,182,192, //movzbl %al,%r8d 65,128,248,6, //cmp $0x6,%r8b - 119,236, //ja 9e6 <_sk_store_565_hsw+0x5e> - 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # a48 <_sk_store_565_hsw+0xc0> + 119,236, //ja adc <_sk_store_565_hsw+0x68> + 76,141,21,69,0,0,0, //lea 0x45(%rip),%r10 # b3c <_sk_store_565_hsw+0xc8> 75,99,4,130, //movslq (%r10,%r8,4),%rax 76,1,208, //add %r10,%rax 255,224, //jmpq *%rax @@ -7987,27 +8358,28 @@ CODE const uint8_t sk_store_565_hsw[] = { 196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2) 197,121,126,192, //vmovd %xmm8,%eax 102,65,137,4,121, //mov %ax,(%r9,%rdi,2) - 235,161, //jmp 9e6 <_sk_store_565_hsw+0x5e> - 15,31,0, //nopl (%rax) - 242,255, //repnz (bad) + 235,161, //jmp adc <_sk_store_565_hsw+0x68> + 144, //nop + 244, //hlt 255, //(bad) 255, //(bad) - 234, //(bad) 255, //(bad) + 236, //in (%dx),%al 255, //(bad) - 255,226, //jmpq *%rdx 255, //(bad) + 255,228, //jmpq *%rsp 255, //(bad) 255, //(bad) - 218,255, //(bad) 255, //(bad) - 255,210, //callq *%rdx + 220,255, //fdivr %st,%st(7) 255, //(bad) + 255,212, //callq *%rsp 255, //(bad) - 255,202, //dec %edx 255, //(bad) + 255,204, //dec %esp 255, //(bad) - 255,194, //inc %edx + 255, //(bad) + 255,196, //inc %esp 255, //(bad) 255, //(bad) 255, //.byte 0xff @@ -8019,7 +8391,7 @@ CODE const uint8_t sk_load_8888_hsw[] = { 76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9 76,3,8, //add (%rax),%r9 77,133,192, //test %r8,%r8 - 117,104, //jne ae1 <_sk_load_8888_hsw+0x7d> + 117,104, //jne bd5 <_sk_load_8888_hsw+0x7d> 196,193,126,111,25, //vmovdqu (%r9),%ymm3 184,255,0,0,0, //mov $0xff,%eax 197,249,110,192, //vmovd %eax,%xmm0 @@ -8052,7 +8424,7 @@ CODE const uint8_t sk_load_8888_hsw[] = { 196,225,249,110,192, //vmovq %rax,%xmm0 196,226,125,33,192, //vpmovsxbd %xmm0,%ymm0 196,194,125,140,25, //vpmaskmovd (%r9),%ymm0,%ymm3 - 233,116,255,255,255, //jmpq a7e <_sk_load_8888_hsw+0x1a> + 233,116,255,255,255, //jmpq b72 <_sk_load_8888_hsw+0x1a> }; CODE const uint8_t sk_store_8888_hsw[] = { @@ -8078,7 +8450,7 @@ CODE const uint8_t 
sk_store_8888_hsw[] = { 196,65,45,235,192, //vpor %ymm8,%ymm10,%ymm8 196,65,53,235,192, //vpor %ymm8,%ymm9,%ymm8 77,133,192, //test %r8,%r8 - 117,12, //jne b7e <_sk_store_8888_hsw+0x74> + 117,12, //jne c72 <_sk_store_8888_hsw+0x74> 196,65,126,127,1, //vmovdqu %ymm8,(%r9) 72,173, //lods %ds:(%rsi),%rax 76,137,193, //mov %r8,%rcx @@ -8091,14 +8463,14 @@ CODE const uint8_t sk_store_8888_hsw[] = { 196,97,249,110,200, //vmovq %rax,%xmm9 196,66,125,33,201, //vpmovsxbd %xmm9,%ymm9 196,66,53,142,1, //vpmaskmovd %ymm8,%ymm9,(%r9) - 235,211, //jmp b77 <_sk_store_8888_hsw+0x6d> + 235,211, //jmp c6b <_sk_store_8888_hsw+0x6d> }; CODE const uint8_t sk_load_f16_hsw[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax 72,133,201, //test %rcx,%rcx - 117,97, //jne c0f <_sk_load_f16_hsw+0x6b> + 117,97, //jne d03 <_sk_load_f16_hsw+0x6b> 197,249,16,12,248, //vmovupd (%rax,%rdi,8),%xmm1 197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2 197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3 @@ -8124,35 +8496,35 @@ CODE const uint8_t sk_load_f16_hsw[] = { 197,251,16,12,248, //vmovsd (%rax,%rdi,8),%xmm1 196,65,57,87,192, //vxorpd %xmm8,%xmm8,%xmm8 72,131,249,1, //cmp $0x1,%rcx - 117,6, //jne c25 <_sk_load_f16_hsw+0x81> + 117,6, //jne d19 <_sk_load_f16_hsw+0x81> 197,250,126,201, //vmovq %xmm1,%xmm1 - 235,30, //jmp c43 <_sk_load_f16_hsw+0x9f> + 235,30, //jmp d37 <_sk_load_f16_hsw+0x9f> 197,241,22,76,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm1,%xmm1 72,131,249,3, //cmp $0x3,%rcx - 114,18, //jb c43 <_sk_load_f16_hsw+0x9f> + 114,18, //jb d37 <_sk_load_f16_hsw+0x9f> 197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2 72,131,249,3, //cmp $0x3,%rcx - 117,19, //jne c50 <_sk_load_f16_hsw+0xac> + 117,19, //jne d44 <_sk_load_f16_hsw+0xac> 197,250,126,210, //vmovq %xmm2,%xmm2 - 235,46, //jmp c71 <_sk_load_f16_hsw+0xcd> + 235,46, //jmp d65 <_sk_load_f16_hsw+0xcd> 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2 - 233,117,255,255,255, //jmpq bc5 <_sk_load_f16_hsw+0x21> + 233,117,255,255,255, //jmpq cb9 <_sk_load_f16_hsw+0x21> 197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2 72,131,249,5, //cmp $0x5,%rcx - 114,21, //jb c71 <_sk_load_f16_hsw+0xcd> + 114,21, //jb d65 <_sk_load_f16_hsw+0xcd> 197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3 72,131,249,5, //cmp $0x5,%rcx - 117,18, //jne c7a <_sk_load_f16_hsw+0xd6> + 117,18, //jne d6e <_sk_load_f16_hsw+0xd6> 197,250,126,219, //vmovq %xmm3,%xmm3 - 233,84,255,255,255, //jmpq bc5 <_sk_load_f16_hsw+0x21> + 233,84,255,255,255, //jmpq cb9 <_sk_load_f16_hsw+0x21> 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 - 233,75,255,255,255, //jmpq bc5 <_sk_load_f16_hsw+0x21> + 233,75,255,255,255, //jmpq cb9 <_sk_load_f16_hsw+0x21> 197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3 72,131,249,7, //cmp $0x7,%rcx - 15,130,59,255,255,255, //jb bc5 <_sk_load_f16_hsw+0x21> + 15,130,59,255,255,255, //jb cb9 <_sk_load_f16_hsw+0x21> 197,123,16,68,248,48, //vmovsd 0x30(%rax,%rdi,8),%xmm8 - 233,48,255,255,255, //jmpq bc5 <_sk_load_f16_hsw+0x21> + 233,48,255,255,255, //jmpq cb9 <_sk_load_f16_hsw+0x21> }; CODE const uint8_t sk_store_f16_hsw[] = { @@ -8171,7 +8543,7 @@ CODE const uint8_t sk_store_f16_hsw[] = { 196,65,57,98,205, //vpunpckldq %xmm13,%xmm8,%xmm9 196,65,57,106,197, //vpunpckhdq %xmm13,%xmm8,%xmm8 72,133,201, //test %rcx,%rcx - 117,27, //jne cfa <_sk_store_f16_hsw+0x65> + 117,27, //jne dee <_sk_store_f16_hsw+0x65> 197,120,17,28,248, //vmovups %xmm11,(%rax,%rdi,8) 197,120,17,84,248,16, //vmovups %xmm10,0x10(%rax,%rdi,8) 
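// Descriptive note (added commentary): the four 16-byte vmovups stores around
// this point handle the full 8-pixel case -- 8 pixels x 4 channels x 2 bytes
// of f16 = 64 bytes, written from xmm11..xmm8 at offsets 0x0-0x30(%rax,%rdi,8).
// The tail path below (reached through the jne above) instead stores one pixel
// (8 bytes) at a time: vmovq writes a register's low half, vmovhpd its high
// half, and the cmp $0x1/$0x3/$0x5/$0x7,%rcx checks fall out once the
// remaining pixel count is exhausted.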
197,120,17,76,248,32, //vmovups %xmm9,0x20(%rax,%rdi,8) @@ -8180,22 +8552,22 @@ CODE const uint8_t sk_store_f16_hsw[] = { 255,224, //jmpq *%rax 197,121,214,28,248, //vmovq %xmm11,(%rax,%rdi,8) 72,131,249,1, //cmp $0x1,%rcx - 116,241, //je cf6 <_sk_store_f16_hsw+0x61> + 116,241, //je dea <_sk_store_f16_hsw+0x61> 197,121,23,92,248,8, //vmovhpd %xmm11,0x8(%rax,%rdi,8) 72,131,249,3, //cmp $0x3,%rcx - 114,229, //jb cf6 <_sk_store_f16_hsw+0x61> + 114,229, //jb dea <_sk_store_f16_hsw+0x61> 197,121,214,84,248,16, //vmovq %xmm10,0x10(%rax,%rdi,8) - 116,221, //je cf6 <_sk_store_f16_hsw+0x61> + 116,221, //je dea <_sk_store_f16_hsw+0x61> 197,121,23,84,248,24, //vmovhpd %xmm10,0x18(%rax,%rdi,8) 72,131,249,5, //cmp $0x5,%rcx - 114,209, //jb cf6 <_sk_store_f16_hsw+0x61> + 114,209, //jb dea <_sk_store_f16_hsw+0x61> 197,121,214,76,248,32, //vmovq %xmm9,0x20(%rax,%rdi,8) - 116,201, //je cf6 <_sk_store_f16_hsw+0x61> + 116,201, //je dea <_sk_store_f16_hsw+0x61> 197,121,23,76,248,40, //vmovhpd %xmm9,0x28(%rax,%rdi,8) 72,131,249,7, //cmp $0x7,%rcx - 114,189, //jb cf6 <_sk_store_f16_hsw+0x61> + 114,189, //jb dea <_sk_store_f16_hsw+0x61> 197,121,214,68,248,48, //vmovq %xmm8,0x30(%rax,%rdi,8) - 235,181, //jmp cf6 <_sk_store_f16_hsw+0x61> + 235,181, //jmp dea <_sk_store_f16_hsw+0x61> }; CODE const uint8_t sk_store_f32_hsw[] = { @@ -8211,7 +8583,7 @@ CODE const uint8_t sk_store_f32_hsw[] = { 196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8 196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11 72,133,201, //test %rcx,%rcx - 117,55, //jne dae <_sk_store_f32_hsw+0x6d> + 117,55, //jne ea2 <_sk_store_f32_hsw+0x6d> 196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12 196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13 196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9 @@ -8224,22 +8596,22 @@ CODE const uint8_t sk_store_f32_hsw[] = { 255,224, //jmpq *%rax 196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4) 72,131,249,1, //cmp $0x1,%rcx - 116,240, //je daa <_sk_store_f32_hsw+0x69> + 116,240, //je e9e <_sk_store_f32_hsw+0x69> 196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4) 72,131,249,3, //cmp $0x3,%rcx - 114,227, //jb daa <_sk_store_f32_hsw+0x69> + 114,227, //jb e9e <_sk_store_f32_hsw+0x69> 196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4) - 116,218, //je daa <_sk_store_f32_hsw+0x69> + 116,218, //je e9e <_sk_store_f32_hsw+0x69> 196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4) 72,131,249,5, //cmp $0x5,%rcx - 114,205, //jb daa <_sk_store_f32_hsw+0x69> + 114,205, //jb e9e <_sk_store_f32_hsw+0x69> 196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4) - 116,195, //je daa <_sk_store_f32_hsw+0x69> + 116,195, //je e9e <_sk_store_f32_hsw+0x69> 196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4) 72,131,249,7, //cmp $0x7,%rcx - 114,181, //jb daa <_sk_store_f32_hsw+0x69> + 114,181, //jb e9e <_sk_store_f32_hsw+0x69> 196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4) - 235,171, //jmp daa <_sk_store_f32_hsw+0x69> + 235,171, //jmp e9e <_sk_store_f32_hsw+0x69> }; CODE const uint8_t sk_clamp_x_hsw[] = { @@ -8335,11 +8707,17 @@ CODE const uint8_t sk_mirror_y_hsw[] = { }; CODE const uint8_t sk_luminance_to_alpha_hsw[] = { - 196,98,125,24,130,136,0,0,0, //vbroadcastss 0x88(%rdx),%ymm8 - 196,226,125,24,154,140,0,0,0, //vbroadcastss 0x8c(%rdx),%ymm3 + 184,208,179,89,62, //mov $0x3e59b3d0,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,98,125,24,195, //vbroadcastss %xmm3,%ymm8 + 184,89,23,55,63, //mov $0x3f371759,%eax + 197,249,110,216, //vmovd 
%eax,%xmm3 + 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1 196,98,125,168,193, //vfmadd213ps %ymm1,%ymm0,%ymm8 - 196,226,125,24,154,144,0,0,0, //vbroadcastss 0x90(%rdx),%ymm3 + 184,152,221,147,61, //mov $0x3d93dd98,%eax + 197,249,110,192, //vmovd %eax,%xmm0 + 196,226,125,24,216, //vbroadcastss %xmm0,%ymm3 196,194,109,168,216, //vfmadd213ps %ymm8,%ymm2,%ymm3 72,173, //lods %ds:(%rsi),%rax 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 @@ -8586,7 +8964,7 @@ CODE const uint8_t sk_seed_shader_avx[] = { 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0 - 197,252,88,66,20, //vaddps 0x14(%rdx),%ymm0,%ymm0 + 197,252,88,2, //vaddps (%rdx),%ymm0,%ymm0 196,226,125,24,16, //vbroadcastss (%rax),%ymm2 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1 @@ -8780,23 +9158,38 @@ CODE const uint8_t sk_unpremul_avx[] = { }; CODE const uint8_t sk_from_srgb_avx[] = { - 196,98,125,24,66,64, //vbroadcastss 0x40(%rdx),%ymm8 + 184,145,131,158,61, //mov $0x3d9e8391,%eax + 197,121,110,192, //vmovd %eax,%xmm8 + 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 + 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 197,124,89,208, //vmulps %ymm0,%ymm0,%ymm10 - 196,98,125,24,90,60, //vbroadcastss 0x3c(%rdx),%ymm11 - 196,98,125,24,98,56, //vbroadcastss 0x38(%rdx),%ymm12 + 184,154,153,153,62, //mov $0x3e99999a,%eax + 197,121,110,216, //vmovd %eax,%xmm11 + 196,67,121,4,219,0, //vpermilps $0x0,%xmm11,%xmm11 + 196,67,37,24,219,1, //vinsertf128 $0x1,%xmm11,%ymm11,%ymm11 + 184,92,143,50,63, //mov $0x3f328f5c,%eax + 197,121,110,224, //vmovd %eax,%xmm12 + 196,67,121,4,228,0, //vpermilps $0x0,%xmm12,%xmm12 + 196,67,29,24,228,1, //vinsertf128 $0x1,%xmm12,%ymm12,%ymm12 197,36,89,232, //vmulps %ymm0,%ymm11,%ymm13 196,65,20,88,236, //vaddps %ymm12,%ymm13,%ymm13 - 196,98,125,24,114,52, //vbroadcastss 0x34(%rdx),%ymm14 + 184,10,215,35,59, //mov $0x3b23d70a,%eax + 197,121,110,240, //vmovd %eax,%xmm14 + 196,67,121,4,246,0, //vpermilps $0x0,%xmm14,%xmm14 + 196,67,13,24,246,1, //vinsertf128 $0x1,%xmm14,%ymm14,%ymm14 196,65,44,89,213, //vmulps %ymm13,%ymm10,%ymm10 196,65,12,88,210, //vaddps %ymm10,%ymm14,%ymm10 - 196,98,125,24,106,68, //vbroadcastss 0x44(%rdx),%ymm13 + 184,174,71,97,61, //mov $0x3d6147ae,%eax + 197,121,110,232, //vmovd %eax,%xmm13 + 196,67,121,4,237,0, //vpermilps $0x0,%xmm13,%xmm13 + 196,67,21,24,237,1, //vinsertf128 $0x1,%xmm13,%ymm13,%ymm13 196,193,124,194,197,1, //vcmpltps %ymm13,%ymm0,%ymm0 196,195,45,74,193,0, //vblendvps %ymm0,%ymm9,%ymm10,%ymm0 197,60,89,201, //vmulps %ymm1,%ymm8,%ymm9 197,116,89,209, //vmulps %ymm1,%ymm1,%ymm10 197,36,89,249, //vmulps %ymm1,%ymm11,%ymm15 - 196,65,4,88,252, //vaddps %ymm12,%ymm15,%ymm15 + 196,65,28,88,255, //vaddps %ymm15,%ymm12,%ymm15 196,65,44,89,215, //vmulps %ymm15,%ymm10,%ymm10 196,65,12,88,210, //vaddps %ymm10,%ymm14,%ymm10 196,193,116,194,205,1, //vcmpltps %ymm13,%ymm1,%ymm1 @@ -8804,7 +9197,7 @@ CODE const uint8_t sk_from_srgb_avx[] = { 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8 197,108,89,202, //vmulps %ymm2,%ymm2,%ymm9 197,36,89,210, //vmulps %ymm2,%ymm11,%ymm10 - 196,65,44,88,212, //vaddps %ymm12,%ymm10,%ymm10 + 196,65,28,88,210, //vaddps %ymm10,%ymm12,%ymm10 196,65,52,89,202, //vmulps %ymm10,%ymm9,%ymm9 196,65,12,88,201, //vaddps %ymm9,%ymm14,%ymm9 196,193,108,194,213,1, //vcmpltps %ymm13,%ymm2,%ymm2 @@ -8815,43 +9208,62 @@ CODE const uint8_t 
sk_from_srgb_avx[] = { CODE const uint8_t sk_to_srgb_avx[] = { 197,124,82,192, //vrsqrtps %ymm0,%ymm8 - 196,65,124,83,200, //vrcpps %ymm8,%ymm9 - 196,65,124,82,208, //vrsqrtps %ymm8,%ymm10 - 196,98,125,24,66,72, //vbroadcastss 0x48(%rdx),%ymm8 - 197,60,89,216, //vmulps %ymm0,%ymm8,%ymm11 - 196,98,125,24,34, //vbroadcastss (%rdx),%ymm12 - 196,98,125,24,106,76, //vbroadcastss 0x4c(%rdx),%ymm13 - 196,98,125,24,114,80, //vbroadcastss 0x50(%rdx),%ymm14 - 196,98,125,24,122,84, //vbroadcastss 0x54(%rdx),%ymm15 - 196,65,52,89,206, //vmulps %ymm14,%ymm9,%ymm9 - 196,65,52,88,207, //vaddps %ymm15,%ymm9,%ymm9 - 196,65,44,89,213, //vmulps %ymm13,%ymm10,%ymm10 - 196,65,44,88,201, //vaddps %ymm9,%ymm10,%ymm9 - 196,65,28,93,201, //vminps %ymm9,%ymm12,%ymm9 - 196,98,125,24,82,88, //vbroadcastss 0x58(%rdx),%ymm10 - 196,193,124,194,194,1, //vcmpltps %ymm10,%ymm0,%ymm0 - 196,195,53,74,195,0, //vblendvps %ymm0,%ymm11,%ymm9,%ymm0 - 197,124,82,201, //vrsqrtps %ymm1,%ymm9 - 196,65,124,83,217, //vrcpps %ymm9,%ymm11 - 196,65,124,82,201, //vrsqrtps %ymm9,%ymm9 - 196,65,12,89,219, //vmulps %ymm11,%ymm14,%ymm11 - 196,65,4,88,219, //vaddps %ymm11,%ymm15,%ymm11 - 196,65,20,89,201, //vmulps %ymm9,%ymm13,%ymm9 - 196,65,52,88,203, //vaddps %ymm11,%ymm9,%ymm9 - 197,60,89,217, //vmulps %ymm1,%ymm8,%ymm11 - 196,65,28,93,201, //vminps %ymm9,%ymm12,%ymm9 - 196,193,116,194,202,1, //vcmpltps %ymm10,%ymm1,%ymm1 - 196,195,53,74,203,16, //vblendvps %ymm1,%ymm11,%ymm9,%ymm1 - 197,124,82,202, //vrsqrtps %ymm2,%ymm9 - 196,65,124,83,217, //vrcpps %ymm9,%ymm11 - 196,65,12,89,219, //vmulps %ymm11,%ymm14,%ymm11 + 196,65,124,83,232, //vrcpps %ymm8,%ymm13 + 196,65,124,82,240, //vrsqrtps %ymm8,%ymm14 + 184,41,92,71,65, //mov $0x41475c29,%eax + 197,121,110,192, //vmovd %eax,%xmm8 + 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 + 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 + 197,60,89,224, //vmulps %ymm0,%ymm8,%ymm12 + 184,0,0,128,63, //mov $0x3f800000,%eax + 197,121,110,200, //vmovd %eax,%xmm9 + 196,67,121,4,201,0, //vpermilps $0x0,%xmm9,%xmm9 + 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9 + 184,194,135,210,62, //mov $0x3ed287c2,%eax + 197,121,110,208, //vmovd %eax,%xmm10 + 196,67,121,4,210,0, //vpermilps $0x0,%xmm10,%xmm10 + 196,67,45,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm10,%ymm10 + 184,206,111,48,63, //mov $0x3f306fce,%eax + 197,121,110,216, //vmovd %eax,%xmm11 + 196,67,121,4,219,0, //vpermilps $0x0,%xmm11,%xmm11 + 196,67,37,24,219,1, //vinsertf128 $0x1,%xmm11,%ymm11,%ymm11 + 184,168,87,202,61, //mov $0x3dca57a8,%eax + 53,0,0,0,128, //xor $0x80000000,%eax + 197,121,110,248, //vmovd %eax,%xmm15 + 196,67,121,4,255,0, //vpermilps $0x0,%xmm15,%xmm15 + 196,67,5,24,255,1, //vinsertf128 $0x1,%xmm15,%ymm15,%ymm15 + 196,65,20,89,235, //vmulps %ymm11,%ymm13,%ymm13 + 196,65,20,88,239, //vaddps %ymm15,%ymm13,%ymm13 + 196,65,12,89,242, //vmulps %ymm10,%ymm14,%ymm14 + 196,65,12,88,237, //vaddps %ymm13,%ymm14,%ymm13 + 196,65,52,93,237, //vminps %ymm13,%ymm9,%ymm13 + 184,4,231,140,59, //mov $0x3b8ce704,%eax + 197,121,110,240, //vmovd %eax,%xmm14 + 196,67,121,4,246,0, //vpermilps $0x0,%xmm14,%xmm14 + 196,67,13,24,246,1, //vinsertf128 $0x1,%xmm14,%ymm14,%ymm14 + 196,193,124,194,198,1, //vcmpltps %ymm14,%ymm0,%ymm0 + 196,195,21,74,196,0, //vblendvps %ymm0,%ymm12,%ymm13,%ymm0 + 197,124,82,225, //vrsqrtps %ymm1,%ymm12 + 196,65,124,83,236, //vrcpps %ymm12,%ymm13 + 196,65,124,82,228, //vrsqrtps %ymm12,%ymm12 + 196,65,36,89,237, //vmulps %ymm13,%ymm11,%ymm13 + 196,65,4,88,237, //vaddps %ymm13,%ymm15,%ymm13 + 196,65,44,89,228, //vmulps 
%ymm12,%ymm10,%ymm12 + 196,65,28,88,229, //vaddps %ymm13,%ymm12,%ymm12 + 197,60,89,233, //vmulps %ymm1,%ymm8,%ymm13 + 196,65,52,93,228, //vminps %ymm12,%ymm9,%ymm12 + 196,193,116,194,206,1, //vcmpltps %ymm14,%ymm1,%ymm1 + 196,195,29,74,205,16, //vblendvps %ymm1,%ymm13,%ymm12,%ymm1 + 197,124,82,226, //vrsqrtps %ymm2,%ymm12 + 196,65,124,83,236, //vrcpps %ymm12,%ymm13 + 196,65,36,89,221, //vmulps %ymm13,%ymm11,%ymm11 196,65,4,88,219, //vaddps %ymm11,%ymm15,%ymm11 - 196,65,124,82,201, //vrsqrtps %ymm9,%ymm9 - 196,65,20,89,201, //vmulps %ymm9,%ymm13,%ymm9 - 196,65,52,88,203, //vaddps %ymm11,%ymm9,%ymm9 - 196,65,28,93,201, //vminps %ymm9,%ymm12,%ymm9 + 196,65,124,82,228, //vrsqrtps %ymm12,%ymm12 + 196,65,44,89,212, //vmulps %ymm12,%ymm10,%ymm10 + 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10 + 196,65,52,93,202, //vminps %ymm10,%ymm9,%ymm9 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8 - 196,193,108,194,210,1, //vcmpltps %ymm10,%ymm2,%ymm2 + 196,193,108,194,214,1, //vcmpltps %ymm14,%ymm2,%ymm2 196,195,53,74,208,32, //vblendvps %ymm2,%ymm8,%ymm9,%ymm2 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax @@ -8874,7 +9286,7 @@ CODE const uint8_t sk_scale_u8_avx[] = { 72,139,0, //mov (%rax),%rax 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 - 117,80, //jne 58f <_sk_scale_u8_avx+0x60> + 117,80, //jne 639 <_sk_scale_u8_avx+0x60> 197,123,16,0, //vmovsd (%rax),%xmm8 196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9 196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8 @@ -8902,9 +9314,9 @@ CODE const uint8_t sk_scale_u8_avx[] = { 77,9,217, //or %r11,%r9 72,131,193,8, //add $0x8,%rcx 73,255,202, //dec %r10 - 117,234, //jne 597 <_sk_scale_u8_avx+0x68> + 117,234, //jne 641 <_sk_scale_u8_avx+0x68> 196,65,249,110,193, //vmovq %r9,%xmm8 - 235,143, //jmp 543 <_sk_scale_u8_avx+0x14> + 235,143, //jmp 5ed <_sk_scale_u8_avx+0x14> }; CODE const uint8_t sk_lerp_1_float_avx[] = { @@ -8932,7 +9344,7 @@ CODE const uint8_t sk_lerp_u8_avx[] = { 72,139,0, //mov (%rax),%rax 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 - 117,116, //jne 677 <_sk_lerp_u8_avx+0x84> + 117,116, //jne 721 <_sk_lerp_u8_avx+0x84> 197,123,16,0, //vmovsd (%rax),%xmm8 196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9 196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8 @@ -8968,41 +9380,59 @@ CODE const uint8_t sk_lerp_u8_avx[] = { 77,9,217, //or %r11,%r9 72,131,193,8, //add $0x8,%rcx 73,255,202, //dec %r10 - 117,234, //jne 67f <_sk_lerp_u8_avx+0x8c> + 117,234, //jne 729 <_sk_lerp_u8_avx+0x8c> 196,65,249,110,193, //vmovq %r9,%xmm8 - 233,104,255,255,255, //jmpq 607 <_sk_lerp_u8_avx+0x14> + 233,104,255,255,255, //jmpq 6b1 <_sk_lerp_u8_avx+0x14> }; CODE const uint8_t sk_lerp_565_avx[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,16, //mov (%rax),%r10 72,133,201, //test %rcx,%rcx - 15,133,164,0,0,0, //jne 751 <_sk_lerp_565_avx+0xb2> + 15,133,250,0,0,0, //jne 851 <_sk_lerp_565_avx+0x108> 196,65,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm8 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3 197,185,105,219, //vpunpckhwd %xmm3,%xmm8,%xmm3 196,66,121,51,192, //vpmovzxwd %xmm8,%xmm8 - 196,227,61,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm3 - 196,98,125,24,66,104, //vbroadcastss 0x68(%rdx),%ymm8 - 197,60,84,195, //vandps %ymm3,%ymm8,%ymm8 - 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 - 196,98,125,24,74,116, //vbroadcastss 0x74(%rdx),%ymm9 - 196,65,52,89,192, //vmulps %ymm8,%ymm9,%ymm8 - 196,98,125,24,74,108, //vbroadcastss 0x6c(%rdx),%ymm9 - 197,52,84,203, //vandps %ymm3,%ymm9,%ymm9 - 196,65,124,91,201, //vcvtdq2ps %ymm9,%ymm9 - 196,98,125,24,82,120, //vbroadcastss 
0x78(%rdx),%ymm10 - 196,65,44,89,201, //vmulps %ymm9,%ymm10,%ymm9 - 196,98,125,24,82,112, //vbroadcastss 0x70(%rdx),%ymm10 - 197,172,84,219, //vandps %ymm3,%ymm10,%ymm3 - 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3 - 196,98,125,24,82,124, //vbroadcastss 0x7c(%rdx),%ymm10 - 197,172,89,219, //vmulps %ymm3,%ymm10,%ymm3 + 196,99,61,24,195,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm8 + 184,0,248,0,0, //mov $0xf800,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 + 196,193,100,84,216, //vandps %ymm8,%ymm3,%ymm3 + 197,124,91,203, //vcvtdq2ps %ymm3,%ymm9 + 184,8,33,132,55, //mov $0x37842108,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 + 197,52,89,203, //vmulps %ymm3,%ymm9,%ymm9 + 184,224,7,0,0, //mov $0x7e0,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 + 196,193,100,84,216, //vandps %ymm8,%ymm3,%ymm3 + 197,124,91,211, //vcvtdq2ps %ymm3,%ymm10 + 184,33,8,2,58, //mov $0x3a020821,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 + 197,44,89,211, //vmulps %ymm3,%ymm10,%ymm10 + 184,31,0,0,0, //mov $0x1f,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 + 196,193,100,84,216, //vandps %ymm8,%ymm3,%ymm3 + 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 + 184,8,33,4,61, //mov $0x3d042108,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 + 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 - 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0 + 196,193,124,89,193, //vmulps %ymm9,%ymm0,%ymm0 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1 - 196,193,116,89,201, //vmulps %ymm9,%ymm1,%ymm1 + 196,193,116,89,202, //vmulps %ymm10,%ymm1,%ymm1 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2 @@ -9019,8 +9449,8 @@ CODE const uint8_t sk_lerp_565_avx[] = { 65,254,200, //dec %r8b 69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 15,135,69,255,255,255, //ja 6b3 <_sk_lerp_565_avx+0x14> - 76,141,13,75,0,0,0, //lea 0x4b(%rip),%r9 # 7c0 <_sk_lerp_565_avx+0x121> + 15,135,239,254,255,255, //ja 75d <_sk_lerp_565_avx+0x14> + 76,141,13,75,0,0,0, //lea 0x4b(%rip),%r9 # 8c0 <_sk_lerp_565_avx+0x177> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax 255,224, //jmpq *%rax @@ -9032,7 +9462,7 @@ CODE const uint8_t sk_lerp_565_avx[] = { 196,65,57,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm8,%xmm8 196,65,57,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm8,%xmm8 196,65,57,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm8,%xmm8 - 233,245,254,255,255, //jmpq 6b3 <_sk_lerp_565_avx+0x14> + 233,159,254,255,255, //jmpq 75d <_sk_lerp_565_avx+0x14> 102,144, //xchg %ax,%ax 242,255, //repnz (bad) 255, //(bad) @@ -9069,9 +9499,12 @@ CODE const uint8_t sk_load_tables_avx[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,0, //mov (%rax),%r8 72,133,201, //test %rcx,%rcx - 15,133,18,2,0,0, //jne a06 <_sk_load_tables_avx+0x22a> + 15,133,56,2,0,0, //jne b2c 
<_sk_load_tables_avx+0x250> 196,65,124,16,4,184, //vmovups (%r8,%rdi,4),%ymm8 - 196,98,125,24,74,16, //vbroadcastss 0x10(%rdx),%ymm9 + 187,255,0,0,0, //mov $0xff,%ebx + 197,249,110,195, //vmovd %ebx,%xmm0 + 197,249,112,192,0, //vpshufd $0x0,%xmm0,%xmm0 + 196,99,125,24,200,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm9 196,193,52,84,192, //vandps %ymm8,%ymm9,%ymm0 196,193,249,126,193, //vmovq %xmm0,%r9 69,137,203, //mov %r9d,%r11d @@ -9090,12 +9523,14 @@ CODE const uint8_t sk_load_tables_avx[] = { 76,139,64,16, //mov 0x10(%rax),%r8 196,161,122,16,68,189,0, //vmovss 0x0(%rbp,%r15,4),%xmm0 196,163,121,33,68,165,0,16, //vinsertps $0x10,0x0(%rbp,%r12,4),%xmm0,%xmm0 - 196,163,121,33,68,173,0,32, //vinsertps $0x20,0x0(%rbp,%r13,4),%xmm0,%xmm0 + 196,161,122,16,76,173,0, //vmovss 0x0(%rbp,%r13,4),%xmm1 + 196,227,121,33,193,32, //vinsertps $0x20,%xmm1,%xmm0,%xmm0 197,250,16,76,157,0, //vmovss 0x0(%rbp,%rbx,4),%xmm1 196,227,121,33,193,48, //vinsertps $0x30,%xmm1,%xmm0,%xmm0 196,161,122,16,76,157,0, //vmovss 0x0(%rbp,%r11,4),%xmm1 196,163,113,33,76,141,0,16, //vinsertps $0x10,0x0(%rbp,%r9,4),%xmm1,%xmm1 - 196,163,113,33,76,181,0,32, //vinsertps $0x20,0x0(%rbp,%r14,4),%xmm1,%xmm1 + 196,161,122,16,92,181,0, //vmovss 0x0(%rbp,%r14,4),%xmm3 + 196,227,113,33,203,32, //vinsertps $0x20,%xmm3,%xmm1,%xmm1 196,161,122,16,92,149,0, //vmovss 0x0(%rbp,%r10,4),%xmm3 196,227,113,33,203,48, //vinsertps $0x30,%xmm3,%xmm1,%xmm1 196,227,117,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm1,%ymm0 @@ -9164,9 +9599,12 @@ CODE const uint8_t sk_load_tables_avx[] = { 196,193,57,114,208,24, //vpsrld $0x18,%xmm8,%xmm8 196,193,97,114,210,24, //vpsrld $0x18,%xmm10,%xmm3 196,227,61,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm3 - 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3 - 196,98,125,24,66,12, //vbroadcastss 0xc(%rdx),%ymm8 - 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3 + 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 + 184,129,128,128,59, //mov $0x3b808081,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 + 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 72,173, //lods %ds:(%rsi),%rax 91, //pop %rbx 65,92, //pop %r12 @@ -9175,17 +9613,17 @@ CODE const uint8_t sk_load_tables_avx[] = { 65,95, //pop %r15 93, //pop %rbp 255,224, //jmpq *%rax - 65,137,201, //mov %ecx,%r9d - 65,128,225,7, //and $0x7,%r9b + 137,203, //mov %ecx,%ebx + 128,227,7, //and $0x7,%bl 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 - 65,254,201, //dec %r9b - 69,15,182,201, //movzbl %r9b,%r9d - 65,128,249,6, //cmp $0x6,%r9b - 15,135,215,253,255,255, //ja 7fa <_sk_load_tables_avx+0x1e> - 76,141,21,138,0,0,0, //lea 0x8a(%rip),%r10 # ab4 <_sk_load_tables_avx+0x2d8> - 79,99,12,138, //movslq (%r10,%r9,4),%r9 - 77,1,209, //add %r10,%r9 - 65,255,225, //jmpq *%r9 + 254,203, //dec %bl + 15,182,219, //movzbl %bl,%ebx + 128,251,6, //cmp $0x6,%bl + 15,135,182,253,255,255, //ja 8fa <_sk_load_tables_avx+0x1e> + 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # bd4 <_sk_load_tables_avx+0x2f8> + 73,99,28,153, //movslq (%r9,%rbx,4),%rbx + 76,1,203, //add %r9,%rbx + 255,227, //jmpq *%rbx 196,193,121,110,68,184,24, //vmovd 0x18(%r8,%rdi,4),%xmm0 197,249,112,192,68, //vpshufd $0x44,%xmm0,%xmm0 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0 @@ -9205,7 +9643,7 @@ CODE const uint8_t sk_load_tables_avx[] = { 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8 196,195,57,34,4,184,0, //vpinsrd $0x0,(%r8,%rdi,4),%xmm8,%xmm0 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8 - 233,70,253,255,255, //jmpq 
7fa <_sk_load_tables_avx+0x1e> + 233,38,253,255,255, //jmpq 8fa <_sk_load_tables_avx+0x1e> 238, //out %al,(%dx) 255, //(bad) 255, //(bad) @@ -9232,14 +9670,17 @@ CODE const uint8_t sk_load_a8_avx[] = { 72,139,0, //mov (%rax),%rax 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 - 117,59, //jne b1b <_sk_load_a8_avx+0x4b> + 117,74, //jne c4a <_sk_load_a8_avx+0x5a> 197,251,16,0, //vmovsd (%rax),%xmm0 196,226,121,49,200, //vpmovzxbd %xmm0,%xmm1 196,227,121,4,192,229, //vpermilps $0xe5,%xmm0,%xmm0 196,226,121,49,192, //vpmovzxbd %xmm0,%xmm0 196,227,117,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm1,%ymm0 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 - 196,226,125,24,74,12, //vbroadcastss 0xc(%rdx),%ymm1 + 184,129,128,128,59, //mov $0x3b808081,%eax + 197,249,110,200, //vmovd %eax,%xmm1 + 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 + 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 197,252,89,217, //vmulps %ymm1,%ymm0,%ymm3 72,173, //lods %ds:(%rsi),%rax 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 @@ -9256,22 +9697,25 @@ CODE const uint8_t sk_load_a8_avx[] = { 77,9,217, //or %r11,%r9 72,131,193,8, //add $0x8,%rcx 73,255,202, //dec %r10 - 117,234, //jne b23 <_sk_load_a8_avx+0x53> + 117,234, //jne c52 <_sk_load_a8_avx+0x62> 196,193,249,110,193, //vmovq %r9,%xmm0 - 235,164, //jmp ae4 <_sk_load_a8_avx+0x14> + 235,149, //jmp c04 <_sk_load_a8_avx+0x14> }; CODE const uint8_t sk_store_a8_avx[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,8, //mov (%rax),%r9 - 196,98,125,24,66,8, //vbroadcastss 0x8(%rdx),%ymm8 + 184,0,0,127,67, //mov $0x437f0000,%eax + 197,121,110,192, //vmovd %eax,%xmm8 + 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 + 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 196,65,57,103,192, //vpackuswb %xmm8,%xmm8,%xmm8 72,133,201, //test %rcx,%rcx - 117,10, //jne b73 <_sk_store_a8_avx+0x33> + 117,10, //jne cb1 <_sk_store_a8_avx+0x42> 196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax @@ -9280,9 +9724,9 @@ CODE const uint8_t sk_store_a8_avx[] = { 254,200, //dec %al 68,15,182,192, //movzbl %al,%r8d 65,128,248,6, //cmp $0x6,%r8b - 119,236, //ja b6f <_sk_store_a8_avx+0x2f> + 119,236, //ja cad <_sk_store_a8_avx+0x3e> 196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8 - 76,141,21,69,0,0,0, //lea 0x45(%rip),%r10 # bd4 <_sk_store_a8_avx+0x94> + 76,141,21,67,0,0,0, //lea 0x43(%rip),%r10 # d10 <_sk_store_a8_avx+0xa1> 75,99,4,130, //movslq (%r10,%r8,4),%rax 76,1,208, //add %r10,%rax 255,224, //jmpq *%rax @@ -9293,28 +9737,27 @@ CODE const uint8_t sk_store_a8_avx[] = { 196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1) 196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1) 196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1) - 235,158, //jmp b6f <_sk_store_a8_avx+0x2f> - 15,31,0, //nopl (%rax) - 244, //hlt + 235,158, //jmp cad <_sk_store_a8_avx+0x3e> + 144, //nop + 246,255, //idiv %bh 255, //(bad) 255, //(bad) + 238, //out %al,(%dx) 255, //(bad) - 236, //in (%dx),%al 255, //(bad) + 255,230, //jmpq *%rsi 255, //(bad) - 255,228, //jmpq *%rsp 255, //(bad) 255, //(bad) + 222,255, //fdivrp %st,%st(7) 255, //(bad) - 220,255, //fdivr %st,%st(7) + 255,214, //callq *%rsi 255, //(bad) - 255,212, //callq *%rsp 255, //(bad) + 255,206, //dec %esi 255, //(bad) - 255,204, //dec %esp 255, //(bad) - 255, //(bad) - 255,196, //inc %esp + 255,198, //inc %esi 255, 
//(bad) 255, //(bad) 255, //.byte 0xff @@ -9324,28 +9767,49 @@ CODE const uint8_t sk_load_565_avx[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,16, //mov (%rax),%r10 72,133,201, //test %rcx,%rcx - 117,106, //jne c64 <_sk_load_565_avx+0x74> + 15,133,209,0,0,0, //jne e0b <_sk_load_565_avx+0xdf> 196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0 197,241,239,201, //vpxor %xmm1,%xmm1,%xmm1 197,249,105,201, //vpunpckhwd %xmm1,%xmm0,%xmm1 196,226,121,51,192, //vpmovzxwd %xmm0,%xmm0 196,227,125,24,209,1, //vinsertf128 $0x1,%xmm1,%ymm0,%ymm2 - 196,226,125,24,66,104, //vbroadcastss 0x68(%rdx),%ymm0 + 184,0,248,0,0, //mov $0xf800,%eax + 197,249,110,192, //vmovd %eax,%xmm0 + 197,249,112,192,0, //vpshufd $0x0,%xmm0,%xmm0 + 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0 197,252,84,194, //vandps %ymm2,%ymm0,%ymm0 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 - 196,226,125,24,74,116, //vbroadcastss 0x74(%rdx),%ymm1 - 197,244,89,192, //vmulps %ymm0,%ymm1,%ymm0 - 196,226,125,24,74,108, //vbroadcastss 0x6c(%rdx),%ymm1 + 184,8,33,132,55, //mov $0x37842108,%eax + 197,249,110,200, //vmovd %eax,%xmm1 + 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 + 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 + 197,252,89,193, //vmulps %ymm1,%ymm0,%ymm0 + 184,224,7,0,0, //mov $0x7e0,%eax + 197,249,110,200, //vmovd %eax,%xmm1 + 197,249,112,201,0, //vpshufd $0x0,%xmm1,%xmm1 + 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 197,244,84,202, //vandps %ymm2,%ymm1,%ymm1 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1 - 196,226,125,24,90,120, //vbroadcastss 0x78(%rdx),%ymm3 - 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1 - 196,226,125,24,90,112, //vbroadcastss 0x70(%rdx),%ymm3 + 184,33,8,2,58, //mov $0x3a020821,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 + 197,244,89,203, //vmulps %ymm3,%ymm1,%ymm1 + 184,31,0,0,0, //mov $0x1f,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 197,228,84,210, //vandps %ymm2,%ymm3,%ymm2 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 - 196,226,125,24,90,124, //vbroadcastss 0x7c(%rdx),%ymm3 - 197,228,89,210, //vmulps %ymm2,%ymm3,%ymm2 - 196,226,125,24,26, //vbroadcastss (%rdx),%ymm3 + 184,8,33,4,61, //mov $0x3d042108,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 + 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2 + 184,0,0,128,63, //mov $0x3f800000,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax 65,137,200, //mov %ecx,%r8d @@ -9354,8 +9818,8 @@ CODE const uint8_t sk_load_565_avx[] = { 65,254,200, //dec %r8b 69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 119,132, //ja c00 <_sk_load_565_avx+0x10> - 76,141,13,73,0,0,0, //lea 0x49(%rip),%r9 # ccc <_sk_load_565_avx+0xdc> + 15,135,25,255,255,255, //ja d40 <_sk_load_565_avx+0x14> + 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # e78 <_sk_load_565_avx+0x14c> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax 255,224, //jmpq *%rax @@ -9367,27 +9831,27 @@ CODE const uint8_t sk_load_565_avx[] = { 196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0 196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0 
196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0 - 233,52,255,255,255, //jmpq c00 <_sk_load_565_avx+0x10> - 244, //hlt - 255, //(bad) + 233,201,254,255,255, //jmpq d40 <_sk_load_565_avx+0x14> + 144, //nop + 243,255, //repz (bad) 255, //(bad) 255, //(bad) - 236, //in (%dx),%al + 235,255, //jmp e7d <_sk_load_565_avx+0x151> 255, //(bad) + 255,227, //jmpq *%rbx 255, //(bad) - 255,228, //jmpq *%rsp 255, //(bad) 255, //(bad) + 219,255, //(bad) 255, //(bad) - 220,255, //fdivr %st,%st(7) + 255,211, //callq *%rbx 255, //(bad) - 255,212, //callq *%rsp 255, //(bad) + 255,203, //dec %ebx 255, //(bad) - 255,204, //dec %esp 255, //(bad) 255, //(bad) - 255,192, //inc %eax + 191, //.byte 0xbf 255, //(bad) 255, //(bad) 255, //.byte 0xff @@ -9396,14 +9860,20 @@ CODE const uint8_t sk_load_565_avx[] = { CODE const uint8_t sk_store_565_avx[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,8, //mov (%rax),%r9 - 196,98,125,24,130,128,0,0,0, //vbroadcastss 0x80(%rdx),%ymm8 + 184,0,0,248,65, //mov $0x41f80000,%eax + 197,121,110,192, //vmovd %eax,%xmm8 + 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 + 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9 196,193,41,114,241,11, //vpslld $0xb,%xmm9,%xmm10 196,67,125,25,201,1, //vextractf128 $0x1,%ymm9,%xmm9 196,193,49,114,241,11, //vpslld $0xb,%xmm9,%xmm9 196,67,45,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm9 - 196,98,125,24,146,132,0,0,0, //vbroadcastss 0x84(%rdx),%ymm10 + 184,0,0,124,66, //mov $0x427c0000,%eax + 197,121,110,208, //vmovd %eax,%xmm10 + 196,67,121,4,210,0, //vpermilps $0x0,%xmm10,%xmm10 + 196,67,45,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm10,%ymm10 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10 196,193,33,114,242,5, //vpslld $0x5,%xmm10,%xmm11 @@ -9417,7 +9887,7 @@ CODE const uint8_t sk_store_565_avx[] = { 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 72,133,201, //test %rcx,%rcx - 117,10, //jne d6e <_sk_store_565_avx+0x86> + 117,10, //jne f32 <_sk_store_565_avx+0x9e> 196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax @@ -9426,8 +9896,8 @@ CODE const uint8_t sk_store_565_avx[] = { 254,200, //dec %al 68,15,182,192, //movzbl %al,%r8d 65,128,248,6, //cmp $0x6,%r8b - 119,236, //ja d6a <_sk_store_565_avx+0x82> - 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # dcc <_sk_store_565_avx+0xe4> + 119,236, //ja f2e <_sk_store_565_avx+0x9a> + 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # f90 <_sk_store_565_avx+0xfc> 75,99,4,130, //movslq (%r10,%r8,4),%rax 76,1,208, //add %r10,%rax 255,224, //jmpq *%rax @@ -9439,7 +9909,7 @@ CODE const uint8_t sk_store_565_avx[] = { 196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2) 197,121,126,192, //vmovd %xmm8,%eax 102,65,137,4,121, //mov %ax,(%r9,%rdi,2) - 235,161, //jmp d6a <_sk_store_565_avx+0x82> + 235,161, //jmp f2e <_sk_store_565_avx+0x9a> 15,31,0, //nopl (%rax) 242,255, //repnz (bad) 255, //(bad) @@ -9469,7 +9939,7 @@ CODE const uint8_t sk_load_8888_avx[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,16, //mov (%rax),%r10 72,133,201, //test %rcx,%rcx - 15,133,157,0,0,0, //jne e93 <_sk_load_8888_avx+0xab> + 15,133,157,0,0,0, //jne 1057 <_sk_load_8888_avx+0xab> 196,65,124,16,12,186, //vmovups (%r10,%rdi,4),%ymm9 184,255,0,0,0, //mov $0xff,%eax 197,249,110,192, //vmovd %eax,%xmm0 @@ -9508,8 +9978,8 @@ CODE const uint8_t sk_load_8888_avx[] = { 65,254,200, //dec %r8b 
69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 15,135,76,255,255,255, //ja dfc <_sk_load_8888_avx+0x14> - 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # f40 <_sk_load_8888_avx+0x158> + 15,135,76,255,255,255, //ja fc0 <_sk_load_8888_avx+0x14> + 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # 1104 <_sk_load_8888_avx+0x158> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax 255,224, //jmpq *%rax @@ -9532,7 +10002,7 @@ CODE const uint8_t sk_load_8888_avx[] = { 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9 196,195,49,34,4,186,0, //vpinsrd $0x0,(%r10,%rdi,4),%xmm9,%xmm0 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9 - 233,188,254,255,255, //jmpq dfc <_sk_load_8888_avx+0x14> + 233,188,254,255,255, //jmpq fc0 <_sk_load_8888_avx+0x14> 238, //out %al,(%dx) 255, //(bad) 255, //(bad) @@ -9584,7 +10054,7 @@ CODE const uint8_t sk_store_8888_avx[] = { 196,65,45,86,192, //vorpd %ymm8,%ymm10,%ymm8 196,65,53,86,192, //vorpd %ymm8,%ymm9,%ymm8 72,133,201, //test %rcx,%rcx - 117,10, //jne 1000 <_sk_store_8888_avx+0xa4> + 117,10, //jne 11c4 <_sk_store_8888_avx+0xa4> 196,65,124,17,4,185, //vmovups %ymm8,(%r9,%rdi,4) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax @@ -9593,8 +10063,8 @@ CODE const uint8_t sk_store_8888_avx[] = { 254,200, //dec %al 68,15,182,192, //movzbl %al,%r8d 65,128,248,6, //cmp $0x6,%r8b - 119,236, //ja ffc <_sk_store_8888_avx+0xa0> - 76,141,21,85,0,0,0, //lea 0x55(%rip),%r10 # 106c <_sk_store_8888_avx+0x110> + 119,236, //ja 11c0 <_sk_store_8888_avx+0xa0> + 76,141,21,85,0,0,0, //lea 0x55(%rip),%r10 # 1230 <_sk_store_8888_avx+0x110> 75,99,4,130, //movslq (%r10,%r8,4),%rax 76,1,208, //add %r10,%rax 255,224, //jmpq *%rax @@ -9608,7 +10078,7 @@ CODE const uint8_t sk_store_8888_avx[] = { 196,67,121,22,68,185,8,2, //vpextrd $0x2,%xmm8,0x8(%r9,%rdi,4) 196,67,121,22,68,185,4,1, //vpextrd $0x1,%xmm8,0x4(%r9,%rdi,4) 196,65,121,126,4,185, //vmovd %xmm8,(%r9,%rdi,4) - 235,147, //jmp ffc <_sk_store_8888_avx+0xa0> + 235,147, //jmp 11c0 <_sk_store_8888_avx+0xa0> 15,31,0, //nopl (%rax) 245, //cmc 255, //(bad) @@ -9639,7 +10109,7 @@ CODE const uint8_t sk_load_f16_avx[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax 72,133,201, //test %rcx,%rcx - 15,133,240,0,0,0, //jne 1186 <_sk_load_f16_avx+0xfe> + 15,133,8,1,0,0, //jne 1362 <_sk_load_f16_avx+0x116> 197,249,16,12,248, //vmovupd (%rax,%rdi,8),%xmm1 197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2 197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3 @@ -9650,42 +10120,46 @@ CODE const uint8_t sk_load_f16_avx[] = { 196,193,97,105,216, //vpunpckhwd %xmm8,%xmm3,%xmm3 197,121,97,193, //vpunpcklwd %xmm1,%xmm0,%xmm8 197,249,105,193, //vpunpckhwd %xmm1,%xmm0,%xmm0 - 197,233,97,203, //vpunpcklwd %xmm3,%xmm2,%xmm1 + 197,105,97,211, //vpunpcklwd %xmm3,%xmm2,%xmm10 197,105,105,203, //vpunpckhwd %xmm3,%xmm2,%xmm9 - 197,249,110,90,100, //vmovd 0x64(%rdx),%xmm3 + 184,0,4,0,4, //mov $0x4000400,%eax + 197,249,110,216, //vmovd %eax,%xmm3 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 196,193,97,101,208, //vpcmpgtw %xmm8,%xmm3,%xmm2 - 196,65,105,223,192, //vpandn %xmm8,%xmm2,%xmm8 - 197,225,101,208, //vpcmpgtw %xmm0,%xmm3,%xmm2 - 197,233,223,192, //vpandn %xmm0,%xmm2,%xmm0 - 197,225,101,209, //vpcmpgtw %xmm1,%xmm3,%xmm2 - 197,233,223,201, //vpandn %xmm1,%xmm2,%xmm1 - 196,193,97,101,209, //vpcmpgtw %xmm9,%xmm3,%xmm2 - 196,193,105,223,209, //vpandn %xmm9,%xmm2,%xmm2 - 196,66,121,51,208, //vpmovzxwd %xmm8,%xmm10 + 196,193,105,223,208, //vpandn %xmm8,%xmm2,%xmm2 + 197,225,101,200, //vpcmpgtw %xmm0,%xmm3,%xmm1 
+ 197,241,223,192, //vpandn %xmm0,%xmm1,%xmm0 + 196,193,97,101,202, //vpcmpgtw %xmm10,%xmm3,%xmm1 + 196,193,113,223,202, //vpandn %xmm10,%xmm1,%xmm1 + 196,193,97,101,217, //vpcmpgtw %xmm9,%xmm3,%xmm3 + 196,193,97,223,217, //vpandn %xmm9,%xmm3,%xmm3 + 196,98,121,51,194, //vpmovzxwd %xmm2,%xmm8 196,98,121,51,201, //vpmovzxwd %xmm1,%xmm9 - 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3 - 197,57,105,195, //vpunpckhwd %xmm3,%xmm8,%xmm8 - 197,241,105,203, //vpunpckhwd %xmm3,%xmm1,%xmm1 + 196,65,41,239,210, //vpxor %xmm10,%xmm10,%xmm10 + 196,193,105,105,210, //vpunpckhwd %xmm10,%xmm2,%xmm2 + 196,193,113,105,202, //vpunpckhwd %xmm10,%xmm1,%xmm1 196,98,121,51,216, //vpmovzxwd %xmm0,%xmm11 - 196,98,121,51,226, //vpmovzxwd %xmm2,%xmm12 - 197,121,105,235, //vpunpckhwd %xmm3,%xmm0,%xmm13 - 197,105,105,243, //vpunpckhwd %xmm3,%xmm2,%xmm14 - 196,193,121,114,242,13, //vpslld $0xd,%xmm10,%xmm0 - 196,193,105,114,241,13, //vpslld $0xd,%xmm9,%xmm2 - 196,227,125,24,194,1, //vinsertf128 $0x1,%xmm2,%ymm0,%ymm0 - 196,98,125,24,74,92, //vbroadcastss 0x5c(%rdx),%ymm9 + 196,98,121,51,227, //vpmovzxwd %xmm3,%xmm12 + 196,65,121,105,234, //vpunpckhwd %xmm10,%xmm0,%xmm13 + 196,65,97,105,210, //vpunpckhwd %xmm10,%xmm3,%xmm10 + 196,193,121,114,240,13, //vpslld $0xd,%xmm8,%xmm0 + 196,193,97,114,241,13, //vpslld $0xd,%xmm9,%xmm3 + 196,227,125,24,195,1, //vinsertf128 $0x1,%xmm3,%ymm0,%ymm0 + 184,0,0,128,119, //mov $0x77800000,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 + 196,99,101,24,203,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm9 197,180,89,192, //vmulps %ymm0,%ymm9,%ymm0 - 196,193,105,114,240,13, //vpslld $0xd,%xmm8,%xmm2 + 197,233,114,242,13, //vpslld $0xd,%xmm2,%xmm2 197,241,114,241,13, //vpslld $0xd,%xmm1,%xmm1 196,227,109,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm2,%ymm1 197,180,89,201, //vmulps %ymm1,%ymm9,%ymm1 - 196,193,105,114,243,13, //vpslld $0xd,%xmm11,%xmm2 - 196,193,97,114,244,13, //vpslld $0xd,%xmm12,%xmm3 - 196,227,109,24,211,1, //vinsertf128 $0x1,%xmm3,%ymm2,%ymm2 + 196,193,57,114,243,13, //vpslld $0xd,%xmm11,%xmm8 + 196,193,105,114,244,13, //vpslld $0xd,%xmm12,%xmm2 + 196,227,61,24,210,1, //vinsertf128 $0x1,%xmm2,%ymm8,%ymm2 197,180,89,210, //vmulps %ymm2,%ymm9,%ymm2 196,193,57,114,245,13, //vpslld $0xd,%xmm13,%xmm8 - 196,193,97,114,246,13, //vpslld $0xd,%xmm14,%xmm3 + 196,193,97,114,242,13, //vpslld $0xd,%xmm10,%xmm3 196,227,61,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm3 197,180,89,219, //vmulps %ymm3,%ymm9,%ymm3 72,173, //lods %ds:(%rsi),%rax @@ -9693,41 +10167,44 @@ CODE const uint8_t sk_load_f16_avx[] = { 197,251,16,12,248, //vmovsd (%rax,%rdi,8),%xmm1 196,65,57,87,192, //vxorpd %xmm8,%xmm8,%xmm8 72,131,249,1, //cmp $0x1,%rcx - 117,6, //jne 119c <_sk_load_f16_avx+0x114> + 117,6, //jne 1378 <_sk_load_f16_avx+0x12c> 197,250,126,201, //vmovq %xmm1,%xmm1 - 235,30, //jmp 11ba <_sk_load_f16_avx+0x132> + 235,30, //jmp 1396 <_sk_load_f16_avx+0x14a> 197,241,22,76,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm1,%xmm1 72,131,249,3, //cmp $0x3,%rcx - 114,18, //jb 11ba <_sk_load_f16_avx+0x132> + 114,18, //jb 1396 <_sk_load_f16_avx+0x14a> 197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2 72,131,249,3, //cmp $0x3,%rcx - 117,19, //jne 11c7 <_sk_load_f16_avx+0x13f> + 117,19, //jne 13a3 <_sk_load_f16_avx+0x157> 197,250,126,210, //vmovq %xmm2,%xmm2 - 235,46, //jmp 11e8 <_sk_load_f16_avx+0x160> + 235,46, //jmp 13c4 <_sk_load_f16_avx+0x178> 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2 - 233,230,254,255,255, //jmpq 10ad <_sk_load_f16_avx+0x25> + 
233,230,254,255,255, //jmpq 10ad <_sk_load_f16_avx+0x25>
+
233,206,254,255,255, //jmpq 1271 <_sk_load_f16_avx+0x25> 197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2 72,131,249,5, //cmp $0x5,%rcx - 114,21, //jb 11e8 <_sk_load_f16_avx+0x160> + 114,21, //jb 13c4 <_sk_load_f16_avx+0x178> 197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3 72,131,249,5, //cmp $0x5,%rcx - 117,18, //jne 11f1 <_sk_load_f16_avx+0x169> + 117,18, //jne 13cd <_sk_load_f16_avx+0x181> 197,250,126,219, //vmovq %xmm3,%xmm3 - 233,197,254,255,255, //jmpq 10ad <_sk_load_f16_avx+0x25> + 233,173,254,255,255, //jmpq 1271 <_sk_load_f16_avx+0x25> 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 - 233,188,254,255,255, //jmpq 10ad <_sk_load_f16_avx+0x25> + 233,164,254,255,255, //jmpq 1271 <_sk_load_f16_avx+0x25> 197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3 72,131,249,7, //cmp $0x7,%rcx - 15,130,172,254,255,255, //jb 10ad <_sk_load_f16_avx+0x25> + 15,130,148,254,255,255, //jb 1271 <_sk_load_f16_avx+0x25> 197,123,16,68,248,48, //vmovsd 0x30(%rax,%rdi,8),%xmm8 - 233,161,254,255,255, //jmpq 10ad <_sk_load_f16_avx+0x25> + 233,137,254,255,255, //jmpq 1271 <_sk_load_f16_avx+0x25> }; CODE const uint8_t sk_store_f16_avx[] = { 72,173, //lods %ds:(%rsi),%rax - 72,139,0, //mov (%rax),%rax - 196,98,125,24,66,96, //vbroadcastss 0x60(%rdx),%ymm8 + 76,139,0, //mov (%rax),%r8 + 184,0,0,128,7, //mov $0x7800000,%eax + 197,121,110,192, //vmovd %eax,%xmm8 + 196,65,121,112,192,0, //vpshufd $0x0,%xmm8,%xmm8 + 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 196,67,125,25,202,1, //vextractf128 $0x1,%ymm9,%xmm10 196,193,41,114,210,13, //vpsrld $0xd,%xmm10,%xmm10 @@ -9757,31 +10234,31 @@ CODE const uint8_t sk_store_f16_avx[] = { 196,65,25,98,205, //vpunpckldq %xmm13,%xmm12,%xmm9 196,65,25,106,197, //vpunpckhdq %xmm13,%xmm12,%xmm8 72,133,201, //test %rcx,%rcx - 117,27, //jne 12cf <_sk_store_f16_avx+0xc3> - 197,120,17,28,248, //vmovups %xmm11,(%rax,%rdi,8) - 197,120,17,84,248,16, //vmovups %xmm10,0x10(%rax,%rdi,8) - 197,120,17,76,248,32, //vmovups %xmm9,0x20(%rax,%rdi,8) - 197,122,127,68,248,48, //vmovdqu %xmm8,0x30(%rax,%rdi,8) + 117,31, //jne 14be <_sk_store_f16_avx+0xd6> + 196,65,120,17,28,248, //vmovups %xmm11,(%r8,%rdi,8) + 196,65,120,17,84,248,16, //vmovups %xmm10,0x10(%r8,%rdi,8) + 196,65,120,17,76,248,32, //vmovups %xmm9,0x20(%r8,%rdi,8) + 196,65,122,127,68,248,48, //vmovdqu %xmm8,0x30(%r8,%rdi,8) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax - 197,121,214,28,248, //vmovq %xmm11,(%rax,%rdi,8) + 196,65,121,214,28,248, //vmovq %xmm11,(%r8,%rdi,8) 72,131,249,1, //cmp $0x1,%rcx - 116,241, //je 12cb <_sk_store_f16_avx+0xbf> - 197,121,23,92,248,8, //vmovhpd %xmm11,0x8(%rax,%rdi,8) + 116,240, //je 14ba <_sk_store_f16_avx+0xd2> + 196,65,121,23,92,248,8, //vmovhpd %xmm11,0x8(%r8,%rdi,8) 72,131,249,3, //cmp $0x3,%rcx - 114,229, //jb 12cb <_sk_store_f16_avx+0xbf> - 197,121,214,84,248,16, //vmovq %xmm10,0x10(%rax,%rdi,8) - 116,221, //je 12cb <_sk_store_f16_avx+0xbf> - 197,121,23,84,248,24, //vmovhpd %xmm10,0x18(%rax,%rdi,8) + 114,227, //jb 14ba <_sk_store_f16_avx+0xd2> + 196,65,121,214,84,248,16, //vmovq %xmm10,0x10(%r8,%rdi,8) + 116,218, //je 14ba <_sk_store_f16_avx+0xd2> + 196,65,121,23,84,248,24, //vmovhpd %xmm10,0x18(%r8,%rdi,8) 72,131,249,5, //cmp $0x5,%rcx - 114,209, //jb 12cb <_sk_store_f16_avx+0xbf> - 197,121,214,76,248,32, //vmovq %xmm9,0x20(%rax,%rdi,8) - 116,201, //je 12cb <_sk_store_f16_avx+0xbf> - 197,121,23,76,248,40, //vmovhpd %xmm9,0x28(%rax,%rdi,8) + 114,205, //jb 14ba <_sk_store_f16_avx+0xd2> + 196,65,121,214,76,248,32, 
//vmovq %xmm9,0x20(%r8,%rdi,8) + 116,196, //je 14ba <_sk_store_f16_avx+0xd2> + 196,65,121,23,76,248,40, //vmovhpd %xmm9,0x28(%r8,%rdi,8) 72,131,249,7, //cmp $0x7,%rcx - 114,189, //jb 12cb <_sk_store_f16_avx+0xbf> - 197,121,214,68,248,48, //vmovq %xmm8,0x30(%rax,%rdi,8) - 235,181, //jmp 12cb <_sk_store_f16_avx+0xbf> + 114,183, //jb 14ba <_sk_store_f16_avx+0xd2> + 196,65,121,214,68,248,48, //vmovq %xmm8,0x30(%r8,%rdi,8) + 235,174, //jmp 14ba <_sk_store_f16_avx+0xd2> }; CODE const uint8_t sk_store_f32_avx[] = { @@ -9797,7 +10274,7 @@ CODE const uint8_t sk_store_f32_avx[] = { 196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8 196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11 72,133,201, //test %rcx,%rcx - 117,55, //jne 1383 <_sk_store_f32_avx+0x6d> + 117,55, //jne 1579 <_sk_store_f32_avx+0x6d> 196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12 196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13 196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9 @@ -9810,22 +10287,22 @@ CODE const uint8_t sk_store_f32_avx[] = { 255,224, //jmpq *%rax 196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4) 72,131,249,1, //cmp $0x1,%rcx - 116,240, //je 137f <_sk_store_f32_avx+0x69> + 116,240, //je 1575 <_sk_store_f32_avx+0x69> 196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4) 72,131,249,3, //cmp $0x3,%rcx - 114,227, //jb 137f <_sk_store_f32_avx+0x69> + 114,227, //jb 1575 <_sk_store_f32_avx+0x69> 196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4) - 116,218, //je 137f <_sk_store_f32_avx+0x69> + 116,218, //je 1575 <_sk_store_f32_avx+0x69> 196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4) 72,131,249,5, //cmp $0x5,%rcx - 114,205, //jb 137f <_sk_store_f32_avx+0x69> + 114,205, //jb 1575 <_sk_store_f32_avx+0x69> 196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4) - 116,195, //je 137f <_sk_store_f32_avx+0x69> + 116,195, //je 1575 <_sk_store_f32_avx+0x69> 196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4) 72,131,249,7, //cmp $0x7,%rcx - 114,181, //jb 137f <_sk_store_f32_avx+0x69> + 114,181, //jb 1575 <_sk_store_f32_avx+0x69> 196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4) - 235,171, //jmp 137f <_sk_store_f32_avx+0x69> + 235,171, //jmp 1575 <_sk_store_f32_avx+0x69> }; CODE const uint8_t sk_clamp_x_avx[] = { @@ -9947,12 +10424,21 @@ CODE const uint8_t sk_mirror_y_avx[] = { }; CODE const uint8_t sk_luminance_to_alpha_avx[] = { - 196,226,125,24,154,136,0,0,0, //vbroadcastss 0x88(%rdx),%ymm3 + 184,208,179,89,62, //mov $0x3e59b3d0,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 197,228,89,192, //vmulps %ymm0,%ymm3,%ymm0 - 196,226,125,24,154,140,0,0,0, //vbroadcastss 0x8c(%rdx),%ymm3 + 184,89,23,55,63, //mov $0x3f371759,%eax + 197,249,110,216, //vmovd %eax,%xmm3 + 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 + 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0 - 196,226,125,24,138,144,0,0,0, //vbroadcastss 0x90(%rdx),%ymm1 + 184,152,221,147,61, //mov $0x3d93dd98,%eax + 197,249,110,200, //vmovd %eax,%xmm1 + 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 + 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 197,244,89,202, //vmulps %ymm2,%ymm1,%ymm1 197,252,88,217, //vaddps %ymm1,%ymm0,%ymm3 72,173, //lods %ds:(%rsi),%rax @@ -10217,7 +10703,7 @@ CODE const uint8_t sk_seed_shader_sse41[] = { 102,15,110,209, //movd 
%ecx,%xmm2 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 15,88,202, //addps %xmm2,%xmm1 - 15,16,66,20, //movups 0x14(%rdx),%xmm0 + 15,16,2, //movups (%rdx),%xmm0 15,88,193, //addps %xmm1,%xmm0 102,15,110,8, //movd (%rax),%xmm1 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1 @@ -10420,25 +10906,29 @@ CODE const uint8_t sk_unpremul_sse41[] = { }; CODE const uint8_t sk_from_srgb_sse41[] = { - 68,15,40,194, //movaps %xmm2,%xmm8 - 243,68,15,16,90,64, //movss 0x40(%rdx),%xmm11 + 184,145,131,158,61, //mov $0x3d9e8391,%eax + 102,68,15,110,216, //movd %eax,%xmm11 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 69,15,40,211, //movaps %xmm11,%xmm10 68,15,89,208, //mulps %xmm0,%xmm10 68,15,40,240, //movaps %xmm0,%xmm14 69,15,89,246, //mulps %xmm14,%xmm14 - 243,15,16,82,60, //movss 0x3c(%rdx),%xmm2 - 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 - 243,68,15,16,98,52, //movss 0x34(%rdx),%xmm12 - 243,68,15,16,106,56, //movss 0x38(%rdx),%xmm13 - 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 - 68,15,40,202, //movaps %xmm2,%xmm9 - 68,15,89,200, //mulps %xmm0,%xmm9 - 69,15,88,205, //addps %xmm13,%xmm9 + 184,154,153,153,62, //mov $0x3e99999a,%eax + 102,68,15,110,192, //movd %eax,%xmm8 + 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 + 184,92,143,50,63, //mov $0x3f328f5c,%eax + 102,68,15,110,224, //movd %eax,%xmm12 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 - 69,15,89,206, //mulps %xmm14,%xmm9 + 69,15,40,200, //movaps %xmm8,%xmm9 + 68,15,89,200, //mulps %xmm0,%xmm9 69,15,88,204, //addps %xmm12,%xmm9 - 243,68,15,16,114,68, //movss 0x44(%rdx),%xmm14 + 184,10,215,35,59, //mov $0x3b23d70a,%eax + 102,68,15,110,232, //movd %eax,%xmm13 + 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 + 69,15,89,206, //mulps %xmm14,%xmm9 + 69,15,88,205, //addps %xmm13,%xmm9 + 184,174,71,97,61, //mov $0x3d6147ae,%eax + 102,68,15,110,240, //movd %eax,%xmm14 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14 65,15,194,198,1, //cmpltps %xmm14,%xmm0 102,69,15,56,20,202, //blendvps %xmm0,%xmm10,%xmm9 @@ -10446,27 +10936,28 @@ CODE const uint8_t sk_from_srgb_sse41[] = { 68,15,89,249, //mulps %xmm1,%xmm15 15,40,193, //movaps %xmm1,%xmm0 15,89,192, //mulps %xmm0,%xmm0 - 68,15,40,210, //movaps %xmm2,%xmm10 + 69,15,40,208, //movaps %xmm8,%xmm10 68,15,89,209, //mulps %xmm1,%xmm10 - 69,15,88,213, //addps %xmm13,%xmm10 - 68,15,89,208, //mulps %xmm0,%xmm10 69,15,88,212, //addps %xmm12,%xmm10 + 68,15,89,208, //mulps %xmm0,%xmm10 + 69,15,88,213, //addps %xmm13,%xmm10 65,15,194,206,1, //cmpltps %xmm14,%xmm1 15,40,193, //movaps %xmm1,%xmm0 102,69,15,56,20,215, //blendvps %xmm0,%xmm15,%xmm10 - 69,15,89,216, //mulps %xmm8,%xmm11 - 65,15,40,192, //movaps %xmm8,%xmm0 + 68,15,89,218, //mulps %xmm2,%xmm11 + 15,40,194, //movaps %xmm2,%xmm0 15,89,192, //mulps %xmm0,%xmm0 - 65,15,89,208, //mulps %xmm8,%xmm2 - 65,15,88,213, //addps %xmm13,%xmm2 - 15,89,208, //mulps %xmm0,%xmm2 - 65,15,88,212, //addps %xmm12,%xmm2 - 69,15,194,198,1, //cmpltps %xmm14,%xmm8 - 65,15,40,192, //movaps %xmm8,%xmm0 - 102,65,15,56,20,211, //blendvps %xmm0,%xmm11,%xmm2 + 68,15,89,194, //mulps %xmm2,%xmm8 + 69,15,88,196, //addps %xmm12,%xmm8 + 68,15,89,192, //mulps %xmm0,%xmm8 + 69,15,88,197, //addps %xmm13,%xmm8 + 65,15,194,214,1, //cmpltps %xmm14,%xmm2 + 15,40,194, //movaps %xmm2,%xmm0 + 102,69,15,56,20,195, //blendvps %xmm0,%xmm11,%xmm8 72,173, //lods %ds:(%rsi),%rax 65,15,40,193, //movaps %xmm9,%xmm0 65,15,40,202, //movaps %xmm10,%xmm1 + 65,15,40,208, //movaps %xmm8,%xmm2 255,224, //jmpq *%rax }; @@ -10477,62 +10968,69 @@ CODE const uint8_t sk_to_srgb_sse41[] = { 15,40,245, //movaps %xmm5,%xmm6 15,40,236, //movaps 
%xmm4,%xmm5 15,40,227, //movaps %xmm3,%xmm4 - 68,15,40,194, //movaps %xmm2,%xmm8 - 15,40,217, //movaps %xmm1,%xmm3 - 15,82,208, //rsqrtps %xmm0,%xmm2 - 68,15,83,202, //rcpps %xmm2,%xmm9 - 68,15,82,210, //rsqrtps %xmm2,%xmm10 - 243,15,16,18, //movss (%rdx),%xmm2 - 243,68,15,16,90,72, //movss 0x48(%rdx),%xmm11 + 15,40,218, //movaps %xmm2,%xmm3 + 15,40,209, //movaps %xmm1,%xmm2 + 68,15,82,192, //rsqrtps %xmm0,%xmm8 + 69,15,83,200, //rcpps %xmm8,%xmm9 + 69,15,82,248, //rsqrtps %xmm8,%xmm15 + 184,41,92,71,65, //mov $0x41475c29,%eax + 102,68,15,110,216, //movd %eax,%xmm11 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 - 65,15,40,203, //movaps %xmm11,%xmm1 - 15,89,200, //mulps %xmm0,%xmm1 - 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 - 243,68,15,16,98,76, //movss 0x4c(%rdx),%xmm12 + 69,15,40,211, //movaps %xmm11,%xmm10 + 68,15,89,208, //mulps %xmm0,%xmm10 + 184,0,0,128,63, //mov $0x3f800000,%eax + 102,68,15,110,192, //movd %eax,%xmm8 + 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 + 184,194,135,210,62, //mov $0x3ed287c2,%eax + 102,68,15,110,224, //movd %eax,%xmm12 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 - 243,68,15,16,106,80, //movss 0x50(%rdx),%xmm13 + 184,206,111,48,63, //mov $0x3f306fce,%eax + 102,68,15,110,232, //movd %eax,%xmm13 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 - 243,68,15,16,114,84, //movss 0x54(%rdx),%xmm14 + 184,168,87,202,61, //mov $0x3dca57a8,%eax + 53,0,0,0,128, //xor $0x80000000,%eax + 102,68,15,110,240, //movd %eax,%xmm14 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14 69,15,89,205, //mulps %xmm13,%xmm9 69,15,88,206, //addps %xmm14,%xmm9 - 69,15,89,212, //mulps %xmm12,%xmm10 - 69,15,88,209, //addps %xmm9,%xmm10 - 68,15,40,202, //movaps %xmm2,%xmm9 - 69,15,93,202, //minps %xmm10,%xmm9 - 243,68,15,16,122,88, //movss 0x58(%rdx),%xmm15 + 69,15,89,252, //mulps %xmm12,%xmm15 + 69,15,88,249, //addps %xmm9,%xmm15 + 69,15,40,200, //movaps %xmm8,%xmm9 + 69,15,93,207, //minps %xmm15,%xmm9 + 184,4,231,140,59, //mov $0x3b8ce704,%eax + 102,68,15,110,248, //movd %eax,%xmm15 69,15,198,255,0, //shufps $0x0,%xmm15,%xmm15 65,15,194,199,1, //cmpltps %xmm15,%xmm0 - 102,68,15,56,20,201, //blendvps %xmm0,%xmm1,%xmm9 + 102,69,15,56,20,202, //blendvps %xmm0,%xmm10,%xmm9 + 68,15,82,210, //rsqrtps %xmm2,%xmm10 + 65,15,83,194, //rcpps %xmm10,%xmm0 + 69,15,82,210, //rsqrtps %xmm10,%xmm10 + 65,15,89,197, //mulps %xmm13,%xmm0 + 65,15,88,198, //addps %xmm14,%xmm0 + 69,15,89,212, //mulps %xmm12,%xmm10 + 68,15,88,208, //addps %xmm0,%xmm10 + 65,15,40,200, //movaps %xmm8,%xmm1 + 65,15,93,202, //minps %xmm10,%xmm1 + 69,15,40,211, //movaps %xmm11,%xmm10 + 68,15,89,210, //mulps %xmm2,%xmm10 + 65,15,194,215,1, //cmpltps %xmm15,%xmm2 + 15,40,194, //movaps %xmm2,%xmm0 + 102,65,15,56,20,202, //blendvps %xmm0,%xmm10,%xmm1 15,82,195, //rsqrtps %xmm3,%xmm0 - 15,83,200, //rcpps %xmm0,%xmm1 + 15,83,208, //rcpps %xmm0,%xmm2 + 65,15,89,213, //mulps %xmm13,%xmm2 + 65,15,88,214, //addps %xmm14,%xmm2 15,82,192, //rsqrtps %xmm0,%xmm0 - 65,15,89,205, //mulps %xmm13,%xmm1 - 65,15,88,206, //addps %xmm14,%xmm1 65,15,89,196, //mulps %xmm12,%xmm0 - 15,88,193, //addps %xmm1,%xmm0 - 68,15,40,210, //movaps %xmm2,%xmm10 - 68,15,93,208, //minps %xmm0,%xmm10 - 65,15,40,203, //movaps %xmm11,%xmm1 - 15,89,203, //mulps %xmm3,%xmm1 + 15,88,194, //addps %xmm2,%xmm0 + 68,15,93,192, //minps %xmm0,%xmm8 + 68,15,89,219, //mulps %xmm3,%xmm11 65,15,194,223,1, //cmpltps %xmm15,%xmm3 15,40,195, //movaps %xmm3,%xmm0 - 102,68,15,56,20,209, //blendvps %xmm0,%xmm1,%xmm10 - 65,15,82,192, //rsqrtps %xmm8,%xmm0 - 15,83,200, //rcpps %xmm0,%xmm1 - 65,15,89,205, 
//mulps %xmm13,%xmm1 - 65,15,88,206, //addps %xmm14,%xmm1 - 15,82,192, //rsqrtps %xmm0,%xmm0 - 65,15,89,196, //mulps %xmm12,%xmm0 - 15,88,193, //addps %xmm1,%xmm0 - 15,93,208, //minps %xmm0,%xmm2 - 69,15,89,216, //mulps %xmm8,%xmm11 - 69,15,194,199,1, //cmpltps %xmm15,%xmm8 - 65,15,40,192, //movaps %xmm8,%xmm0 - 102,65,15,56,20,211, //blendvps %xmm0,%xmm11,%xmm2 + 102,69,15,56,20,195, //blendvps %xmm0,%xmm11,%xmm8 72,173, //lods %ds:(%rsi),%rax 65,15,40,193, //movaps %xmm9,%xmm0 - 65,15,40,202, //movaps %xmm10,%xmm1 + 65,15,40,208, //movaps %xmm8,%xmm2 15,40,220, //movaps %xmm4,%xmm3 15,40,229, //movaps %xmm5,%xmm4 15,40,238, //movaps %xmm6,%xmm5 @@ -10620,32 +11118,38 @@ CODE const uint8_t sk_lerp_565_sse41[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax 102,68,15,56,51,4,120, //pmovzxwd (%rax,%rdi,2),%xmm8 - 102,15,110,90,104, //movd 0x68(%rdx),%xmm3 + 184,0,248,0,0, //mov $0xf800,%eax + 102,15,110,216, //movd %eax,%xmm3 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 102,65,15,219,216, //pand %xmm8,%xmm3 68,15,91,203, //cvtdq2ps %xmm3,%xmm9 - 243,68,15,16,90,116, //movss 0x74(%rdx),%xmm11 - 243,68,15,16,82,120, //movss 0x78(%rdx),%xmm10 - 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 - 69,15,89,217, //mulps %xmm9,%xmm11 - 102,15,110,90,108, //movd 0x6c(%rdx),%xmm3 + 184,8,33,132,55, //mov $0x37842108,%eax + 102,68,15,110,208, //movd %eax,%xmm10 + 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 + 69,15,89,209, //mulps %xmm9,%xmm10 + 184,224,7,0,0, //mov $0x7e0,%eax + 102,15,110,216, //movd %eax,%xmm3 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 102,65,15,219,216, //pand %xmm8,%xmm3 - 15,91,219, //cvtdq2ps %xmm3,%xmm3 - 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 - 68,15,89,211, //mulps %xmm3,%xmm10 - 102,15,110,90,112, //movd 0x70(%rdx),%xmm3 + 68,15,91,203, //cvtdq2ps %xmm3,%xmm9 + 184,33,8,2,58, //mov $0x3a020821,%eax + 102,68,15,110,216, //movd %eax,%xmm11 + 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 + 69,15,89,217, //mulps %xmm9,%xmm11 + 184,31,0,0,0, //mov $0x1f,%eax + 102,15,110,216, //movd %eax,%xmm3 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 102,65,15,219,216, //pand %xmm8,%xmm3 68,15,91,195, //cvtdq2ps %xmm3,%xmm8 - 243,15,16,90,124, //movss 0x7c(%rdx),%xmm3 + 184,8,33,4,61, //mov $0x3d042108,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 65,15,89,216, //mulps %xmm8,%xmm3 15,92,196, //subps %xmm4,%xmm0 - 65,15,89,195, //mulps %xmm11,%xmm0 + 65,15,89,194, //mulps %xmm10,%xmm0 15,88,196, //addps %xmm4,%xmm0 15,92,205, //subps %xmm5,%xmm1 - 65,15,89,202, //mulps %xmm10,%xmm1 + 65,15,89,203, //mulps %xmm11,%xmm1 15,88,205, //addps %xmm5,%xmm1 15,92,214, //subps %xmm6,%xmm2 15,89,211, //mulps %xmm3,%xmm2 @@ -10662,7 +11166,8 @@ CODE const uint8_t sk_load_tables_sse41[] = { 72,139,8, //mov (%rax),%rcx 76,139,64,8, //mov 0x8(%rax),%r8 243,68,15,111,4,185, //movdqu (%rcx,%rdi,4),%xmm8 - 102,15,110,66,16, //movd 0x10(%rdx),%xmm0 + 185,255,0,0,0, //mov $0xff,%ecx + 102,15,110,193, //movd %ecx,%xmm0 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 102,65,15,111,200, //movdqa %xmm8,%xmm1 102,15,114,209,8, //psrld $0x8,%xmm1 @@ -10709,7 +11214,8 @@ CODE const uint8_t sk_load_tables_sse41[] = { 102,15,58,33,211,48, //insertps $0x30,%xmm3,%xmm2 102,65,15,114,208,24, //psrld $0x18,%xmm8 69,15,91,192, //cvtdq2ps %xmm8,%xmm8 - 243,15,16,90,12, //movss 0xc(%rdx),%xmm3 + 184,129,128,128,59, //mov $0x3b808081,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 65,15,89,216, //mulps %xmm8,%xmm3 72,173, //lods %ds:(%rsi),%rax @@ 
-10721,7 +11227,8 @@ CODE const uint8_t sk_load_a8_sse41[] = { 72,139,0, //mov (%rax),%rax 102,15,56,49,4,56, //pmovzxbd (%rax,%rdi,1),%xmm0 15,91,192, //cvtdq2ps %xmm0,%xmm0 - 243,15,16,90,12, //movss 0xc(%rdx),%xmm3 + 184,129,128,128,59, //mov $0x3b808081,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 15,89,216, //mulps %xmm0,%xmm3 72,173, //lods %ds:(%rsi),%rax @@ -10734,7 +11241,8 @@ CODE const uint8_t sk_load_a8_sse41[] = { CODE const uint8_t sk_store_a8_sse41[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax - 243,68,15,16,66,8, //movss 0x8(%rdx),%xmm8 + 185,0,0,127,67, //mov $0x437f0000,%ecx + 102,68,15,110,193, //movd %ecx,%xmm8 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 68,15,89,195, //mulps %xmm3,%xmm8 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 @@ -10748,29 +11256,36 @@ CODE const uint8_t sk_store_a8_sse41[] = { CODE const uint8_t sk_load_565_sse41[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax - 102,68,15,56,51,12,120, //pmovzxwd (%rax,%rdi,2),%xmm9 - 102,15,110,66,104, //movd 0x68(%rdx),%xmm0 + 102,15,56,51,20,120, //pmovzxwd (%rax,%rdi,2),%xmm2 + 184,0,248,0,0, //mov $0xf800,%eax + 102,15,110,192, //movd %eax,%xmm0 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 - 102,65,15,219,193, //pand %xmm9,%xmm0 + 102,15,219,194, //pand %xmm2,%xmm0 15,91,200, //cvtdq2ps %xmm0,%xmm1 - 243,15,16,26, //movss (%rdx),%xmm3 - 243,15,16,66,116, //movss 0x74(%rdx),%xmm0 + 184,8,33,132,55, //mov $0x37842108,%eax + 102,15,110,192, //movd %eax,%xmm0 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 15,89,193, //mulps %xmm1,%xmm0 - 102,15,110,74,108, //movd 0x6c(%rdx),%xmm1 + 184,224,7,0,0, //mov $0x7e0,%eax + 102,15,110,200, //movd %eax,%xmm1 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1 - 102,65,15,219,201, //pand %xmm9,%xmm1 - 68,15,91,193, //cvtdq2ps %xmm1,%xmm8 - 243,15,16,74,120, //movss 0x78(%rdx),%xmm1 + 102,15,219,202, //pand %xmm2,%xmm1 + 15,91,217, //cvtdq2ps %xmm1,%xmm3 + 184,33,8,2,58, //mov $0x3a020821,%eax + 102,15,110,200, //movd %eax,%xmm1 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 - 65,15,89,200, //mulps %xmm8,%xmm1 - 102,15,110,82,112, //movd 0x70(%rdx),%xmm2 - 102,15,112,210,0, //pshufd $0x0,%xmm2,%xmm2 - 102,65,15,219,209, //pand %xmm9,%xmm2 - 68,15,91,194, //cvtdq2ps %xmm2,%xmm8 - 243,15,16,82,124, //movss 0x7c(%rdx),%xmm2 + 15,89,203, //mulps %xmm3,%xmm1 + 184,31,0,0,0, //mov $0x1f,%eax + 102,15,110,216, //movd %eax,%xmm3 + 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 + 102,15,219,218, //pand %xmm2,%xmm3 + 15,91,219, //cvtdq2ps %xmm3,%xmm3 + 184,8,33,4,61, //mov $0x3d042108,%eax + 102,15,110,208, //movd %eax,%xmm2 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 - 65,15,89,208, //mulps %xmm8,%xmm2 + 15,89,211, //mulps %xmm3,%xmm2 + 184,0,0,128,63, //mov $0x3f800000,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax @@ -10779,21 +11294,23 @@ CODE const uint8_t sk_load_565_sse41[] = { CODE const uint8_t sk_store_565_sse41[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax - 243,68,15,16,130,128,0,0,0, //movss 0x80(%rdx),%xmm8 - 243,68,15,16,138,132,0,0,0, //movss 0x84(%rdx),%xmm9 + 185,0,0,248,65, //mov $0x41f80000,%ecx + 102,68,15,110,193, //movd %ecx,%xmm8 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 - 69,15,40,208, //movaps %xmm8,%xmm10 - 68,15,89,208, //mulps %xmm0,%xmm10 - 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10 - 102,65,15,114,242,11, //pslld $0xb,%xmm10 - 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 - 68,15,89,201, //mulps %xmm1,%xmm9 + 
69,15,40,200, //movaps %xmm8,%xmm9 + 68,15,89,200, //mulps %xmm0,%xmm9 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9 - 102,65,15,114,241,5, //pslld $0x5,%xmm9 - 102,69,15,235,202, //por %xmm10,%xmm9 + 102,65,15,114,241,11, //pslld $0xb,%xmm9 + 185,0,0,124,66, //mov $0x427c0000,%ecx + 102,68,15,110,209, //movd %ecx,%xmm10 + 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 + 68,15,89,209, //mulps %xmm1,%xmm10 + 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10 + 102,65,15,114,242,5, //pslld $0x5,%xmm10 + 102,69,15,235,209, //por %xmm9,%xmm10 68,15,89,194, //mulps %xmm2,%xmm8 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 - 102,69,15,86,193, //orpd %xmm9,%xmm8 + 102,69,15,86,194, //orpd %xmm10,%xmm8 102,69,15,56,43,192, //packusdw %xmm8,%xmm8 102,68,15,214,4,120, //movq %xmm8,(%rax,%rdi,2) 72,173, //lods %ds:(%rsi),%rax @@ -10869,7 +11386,8 @@ CODE const uint8_t sk_load_f16_sse41[] = { 102,68,15,111,194, //movdqa %xmm2,%xmm8 102,68,15,97,192, //punpcklwd %xmm0,%xmm8 102,15,105,208, //punpckhwd %xmm0,%xmm2 - 102,15,110,66,100, //movd 0x64(%rdx),%xmm0 + 184,0,4,0,4, //mov $0x4000400,%eax + 102,15,110,192, //movd %eax,%xmm0 102,15,112,216,0, //pshufd $0x0,%xmm0,%xmm3 102,15,111,203, //movdqa %xmm3,%xmm1 102,65,15,101,200, //pcmpgtw %xmm8,%xmm1 @@ -10878,7 +11396,8 @@ CODE const uint8_t sk_load_f16_sse41[] = { 102,15,223,218, //pandn %xmm2,%xmm3 102,15,56,51,193, //pmovzxwd %xmm1,%xmm0 102,15,114,240,13, //pslld $0xd,%xmm0 - 102,15,110,82,92, //movd 0x5c(%rdx),%xmm2 + 184,0,0,128,119, //mov $0x77800000,%eax + 102,15,110,208, //movd %eax,%xmm2 102,68,15,112,194,0, //pshufd $0x0,%xmm2,%xmm8 65,15,89,192, //mulps %xmm8,%xmm0 102,69,15,239,201, //pxor %xmm9,%xmm9 @@ -10898,7 +11417,8 @@ CODE const uint8_t sk_load_f16_sse41[] = { CODE const uint8_t sk_store_f16_sse41[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax - 102,68,15,110,66,96, //movd 0x60(%rdx),%xmm8 + 185,0,0,128,7, //mov $0x7800000,%ecx + 102,68,15,110,193, //movd %ecx,%xmm8 102,69,15,112,192,0, //pshufd $0x0,%xmm8,%xmm8 102,69,15,111,200, //movdqa %xmm8,%xmm9 68,15,89,200, //mulps %xmm0,%xmm9 @@ -11060,17 +11580,20 @@ CODE const uint8_t sk_mirror_y_sse41[] = { }; CODE const uint8_t sk_luminance_to_alpha_sse41[] = { - 243,15,16,154,136,0,0,0, //movss 0x88(%rdx),%xmm3 - 243,68,15,16,130,140,0,0,0, //movss 0x8c(%rdx),%xmm8 + 184,208,179,89,62, //mov $0x3e59b3d0,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 15,89,216, //mulps %xmm0,%xmm3 - 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 - 68,15,89,193, //mulps %xmm1,%xmm8 - 68,15,88,195, //addps %xmm3,%xmm8 - 243,15,16,154,144,0,0,0, //movss 0x90(%rdx),%xmm3 + 184,89,23,55,63, //mov $0x3f371759,%eax + 102,15,110,192, //movd %eax,%xmm0 + 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 + 15,89,193, //mulps %xmm1,%xmm0 + 15,88,195, //addps %xmm3,%xmm0 + 184,152,221,147,61, //mov $0x3d93dd98,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 15,89,218, //mulps %xmm2,%xmm3 - 65,15,88,216, //addps %xmm8,%xmm3 + 15,88,216, //addps %xmm0,%xmm3 72,173, //lods %ds:(%rsi),%rax 15,87,192, //xorps %xmm0,%xmm0 15,87,201, //xorps %xmm1,%xmm1 @@ -11393,7 +11916,7 @@ CODE const uint8_t sk_seed_shader_sse2[] = { 102,15,110,209, //movd %ecx,%xmm2 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 15,88,202, //addps %xmm2,%xmm1 - 15,16,66,20, //movups 0x14(%rdx),%xmm0 + 15,16,2, //movups (%rdx),%xmm0 15,88,193, //addps %xmm1,%xmm0 102,15,110,8, //movd (%rax),%xmm1 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1 @@ -11593,24 +12116,29 @@ CODE const uint8_t sk_unpremul_sse2[] = { }; 
CODE const uint8_t sk_from_srgb_sse2[] = { - 243,68,15,16,66,64, //movss 0x40(%rdx),%xmm8 + 184,145,131,158,61, //mov $0x3d9e8391,%eax + 102,68,15,110,192, //movd %eax,%xmm8 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 69,15,40,232, //movaps %xmm8,%xmm13 68,15,89,232, //mulps %xmm0,%xmm13 68,15,40,224, //movaps %xmm0,%xmm12 69,15,89,228, //mulps %xmm12,%xmm12 - 243,68,15,16,74,60, //movss 0x3c(%rdx),%xmm9 + 184,154,153,153,62, //mov $0x3e99999a,%eax + 102,68,15,110,200, //movd %eax,%xmm9 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 - 243,68,15,16,82,52, //movss 0x34(%rdx),%xmm10 - 243,68,15,16,90,56, //movss 0x38(%rdx),%xmm11 - 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 + 184,92,143,50,63, //mov $0x3f328f5c,%eax + 102,68,15,110,208, //movd %eax,%xmm10 + 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 69,15,40,241, //movaps %xmm9,%xmm14 68,15,89,240, //mulps %xmm0,%xmm14 - 69,15,88,243, //addps %xmm11,%xmm14 - 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 - 69,15,89,244, //mulps %xmm12,%xmm14 69,15,88,242, //addps %xmm10,%xmm14 - 243,68,15,16,98,68, //movss 0x44(%rdx),%xmm12 + 184,10,215,35,59, //mov $0x3b23d70a,%eax + 102,68,15,110,216, //movd %eax,%xmm11 + 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 + 69,15,89,244, //mulps %xmm12,%xmm14 + 69,15,88,243, //addps %xmm11,%xmm14 + 184,174,71,97,61, //mov $0x3d6147ae,%eax + 102,68,15,110,224, //movd %eax,%xmm12 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 65,15,194,196,1, //cmpltps %xmm12,%xmm0 68,15,84,232, //andps %xmm0,%xmm13 @@ -11622,9 +12150,9 @@ CODE const uint8_t sk_from_srgb_sse2[] = { 69,15,89,246, //mulps %xmm14,%xmm14 69,15,40,249, //movaps %xmm9,%xmm15 68,15,89,249, //mulps %xmm1,%xmm15 - 69,15,88,251, //addps %xmm11,%xmm15 - 69,15,89,254, //mulps %xmm14,%xmm15 69,15,88,250, //addps %xmm10,%xmm15 + 69,15,89,254, //mulps %xmm14,%xmm15 + 69,15,88,251, //addps %xmm11,%xmm15 65,15,194,204,1, //cmpltps %xmm12,%xmm1 68,15,84,233, //andps %xmm1,%xmm13 65,15,85,207, //andnps %xmm15,%xmm1 @@ -11633,9 +12161,9 @@ CODE const uint8_t sk_from_srgb_sse2[] = { 68,15,40,234, //movaps %xmm2,%xmm13 69,15,89,237, //mulps %xmm13,%xmm13 68,15,89,202, //mulps %xmm2,%xmm9 - 69,15,88,203, //addps %xmm11,%xmm9 - 69,15,89,205, //mulps %xmm13,%xmm9 69,15,88,202, //addps %xmm10,%xmm9 + 69,15,89,205, //mulps %xmm13,%xmm9 + 69,15,88,203, //addps %xmm11,%xmm9 65,15,194,212,1, //cmpltps %xmm12,%xmm2 68,15,84,194, //andps %xmm2,%xmm8 65,15,85,209, //andnps %xmm9,%xmm2 @@ -11645,74 +12173,69 @@ CODE const uint8_t sk_from_srgb_sse2[] = { }; CODE const uint8_t sk_to_srgb_sse2[] = { - 72,131,236,40, //sub $0x28,%rsp - 15,41,124,36,16, //movaps %xmm7,0x10(%rsp) - 15,41,52,36, //movaps %xmm6,(%rsp) - 15,40,245, //movaps %xmm5,%xmm6 - 15,40,236, //movaps %xmm4,%xmm5 - 15,40,227, //movaps %xmm3,%xmm4 68,15,82,192, //rsqrtps %xmm0,%xmm8 - 69,15,83,232, //rcpps %xmm8,%xmm13 - 69,15,82,248, //rsqrtps %xmm8,%xmm15 - 243,15,16,26, //movss (%rdx),%xmm3 - 243,68,15,16,66,72, //movss 0x48(%rdx),%xmm8 + 69,15,83,248, //rcpps %xmm8,%xmm15 + 69,15,82,232, //rsqrtps %xmm8,%xmm13 + 184,41,92,71,65, //mov $0x41475c29,%eax + 102,68,15,110,192, //movd %eax,%xmm8 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 69,15,40,240, //movaps %xmm8,%xmm14 68,15,89,240, //mulps %xmm0,%xmm14 - 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 - 243,68,15,16,82,76, //movss 0x4c(%rdx),%xmm10 + 184,0,0,128,63, //mov $0x3f800000,%eax + 102,68,15,110,200, //movd %eax,%xmm9 + 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 + 184,194,135,210,62, //mov $0x3ed287c2,%eax + 102,68,15,110,208, //movd %eax,%xmm10 69,15,198,210,0, 
//shufps $0x0,%xmm10,%xmm10 - 243,68,15,16,90,80, //movss 0x50(%rdx),%xmm11 + 184,206,111,48,63, //mov $0x3f306fce,%eax + 102,68,15,110,216, //movd %eax,%xmm11 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 - 243,68,15,16,98,84, //movss 0x54(%rdx),%xmm12 + 184,168,87,202,61, //mov $0x3dca57a8,%eax + 53,0,0,0,128, //xor $0x80000000,%eax + 102,68,15,110,224, //movd %eax,%xmm12 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 - 69,15,89,235, //mulps %xmm11,%xmm13 - 69,15,88,236, //addps %xmm12,%xmm13 - 69,15,89,250, //mulps %xmm10,%xmm15 - 69,15,88,253, //addps %xmm13,%xmm15 - 68,15,40,203, //movaps %xmm3,%xmm9 - 69,15,93,207, //minps %xmm15,%xmm9 - 243,68,15,16,106,88, //movss 0x58(%rdx),%xmm13 + 69,15,89,251, //mulps %xmm11,%xmm15 + 69,15,88,252, //addps %xmm12,%xmm15 + 69,15,89,234, //mulps %xmm10,%xmm13 + 69,15,88,239, //addps %xmm15,%xmm13 + 69,15,40,249, //movaps %xmm9,%xmm15 + 69,15,93,253, //minps %xmm13,%xmm15 + 184,4,231,140,59, //mov $0x3b8ce704,%eax + 102,68,15,110,232, //movd %eax,%xmm13 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 65,15,194,197,1, //cmpltps %xmm13,%xmm0 68,15,84,240, //andps %xmm0,%xmm14 - 65,15,85,193, //andnps %xmm9,%xmm0 + 65,15,85,199, //andnps %xmm15,%xmm0 65,15,86,198, //orps %xmm14,%xmm0 - 68,15,82,201, //rsqrtps %xmm1,%xmm9 - 69,15,83,241, //rcpps %xmm9,%xmm14 - 69,15,82,201, //rsqrtps %xmm9,%xmm9 - 69,15,89,243, //mulps %xmm11,%xmm14 - 69,15,88,244, //addps %xmm12,%xmm14 - 69,15,89,202, //mulps %xmm10,%xmm9 - 69,15,88,206, //addps %xmm14,%xmm9 - 68,15,40,243, //movaps %xmm3,%xmm14 - 69,15,93,241, //minps %xmm9,%xmm14 - 69,15,40,200, //movaps %xmm8,%xmm9 - 68,15,89,201, //mulps %xmm1,%xmm9 + 68,15,82,241, //rsqrtps %xmm1,%xmm14 + 69,15,83,254, //rcpps %xmm14,%xmm15 + 69,15,82,246, //rsqrtps %xmm14,%xmm14 + 69,15,89,251, //mulps %xmm11,%xmm15 + 69,15,88,252, //addps %xmm12,%xmm15 + 69,15,89,242, //mulps %xmm10,%xmm14 + 69,15,88,247, //addps %xmm15,%xmm14 + 69,15,40,249, //movaps %xmm9,%xmm15 + 69,15,93,254, //minps %xmm14,%xmm15 + 69,15,40,240, //movaps %xmm8,%xmm14 + 68,15,89,241, //mulps %xmm1,%xmm14 65,15,194,205,1, //cmpltps %xmm13,%xmm1 - 68,15,84,201, //andps %xmm1,%xmm9 - 65,15,85,206, //andnps %xmm14,%xmm1 - 65,15,86,201, //orps %xmm9,%xmm1 - 68,15,82,202, //rsqrtps %xmm2,%xmm9 - 69,15,83,241, //rcpps %xmm9,%xmm14 - 69,15,89,243, //mulps %xmm11,%xmm14 - 69,15,88,244, //addps %xmm12,%xmm14 - 65,15,82,249, //rsqrtps %xmm9,%xmm7 - 65,15,89,250, //mulps %xmm10,%xmm7 - 65,15,88,254, //addps %xmm14,%xmm7 - 15,93,223, //minps %xmm7,%xmm3 + 68,15,84,241, //andps %xmm1,%xmm14 + 65,15,85,207, //andnps %xmm15,%xmm1 + 65,15,86,206, //orps %xmm14,%xmm1 + 68,15,82,242, //rsqrtps %xmm2,%xmm14 + 69,15,83,254, //rcpps %xmm14,%xmm15 + 69,15,89,251, //mulps %xmm11,%xmm15 + 69,15,88,252, //addps %xmm12,%xmm15 + 69,15,82,222, //rsqrtps %xmm14,%xmm11 + 69,15,89,218, //mulps %xmm10,%xmm11 + 69,15,88,223, //addps %xmm15,%xmm11 + 69,15,93,203, //minps %xmm11,%xmm9 68,15,89,194, //mulps %xmm2,%xmm8 65,15,194,213,1, //cmpltps %xmm13,%xmm2 68,15,84,194, //andps %xmm2,%xmm8 - 15,85,211, //andnps %xmm3,%xmm2 + 65,15,85,209, //andnps %xmm9,%xmm2 65,15,86,208, //orps %xmm8,%xmm2 72,173, //lods %ds:(%rsi),%rax - 15,40,220, //movaps %xmm4,%xmm3 - 15,40,229, //movaps %xmm5,%xmm4 - 15,40,238, //movaps %xmm6,%xmm5 - 15,40,52,36, //movaps (%rsp),%xmm6 - 15,40,124,36,16, //movaps 0x10(%rsp),%xmm7 - 72,131,196,40, //add $0x28,%rsp 255,224, //jmpq *%rax }; @@ -11799,35 +12322,41 @@ CODE const uint8_t sk_lerp_u8_sse2[] = { CODE const uint8_t sk_lerp_565_sse2[] = { 72,173, //lods %ds:(%rsi),%rax 
72,139,0, //mov (%rax),%rax - 243,68,15,126,12,120, //movq (%rax,%rdi,2),%xmm9 + 243,68,15,126,4,120, //movq (%rax,%rdi,2),%xmm8 102,15,239,219, //pxor %xmm3,%xmm3 - 102,68,15,97,203, //punpcklwd %xmm3,%xmm9 - 102,15,110,90,104, //movd 0x68(%rdx),%xmm3 - 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 - 102,65,15,219,217, //pand %xmm9,%xmm3 - 68,15,91,211, //cvtdq2ps %xmm3,%xmm10 - 243,68,15,16,90,116, //movss 0x74(%rdx),%xmm11 - 243,68,15,16,66,120, //movss 0x78(%rdx),%xmm8 - 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 - 69,15,89,218, //mulps %xmm10,%xmm11 - 102,15,110,90,108, //movd 0x6c(%rdx),%xmm3 + 102,68,15,97,195, //punpcklwd %xmm3,%xmm8 + 184,0,248,0,0, //mov $0xf800,%eax + 102,15,110,216, //movd %eax,%xmm3 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 - 102,65,15,219,217, //pand %xmm9,%xmm3 - 15,91,219, //cvtdq2ps %xmm3,%xmm3 - 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 - 68,15,89,195, //mulps %xmm3,%xmm8 - 102,15,110,90,112, //movd 0x70(%rdx),%xmm3 + 102,65,15,219,216, //pand %xmm8,%xmm3 + 68,15,91,203, //cvtdq2ps %xmm3,%xmm9 + 184,8,33,132,55, //mov $0x37842108,%eax + 102,68,15,110,208, //movd %eax,%xmm10 + 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 + 69,15,89,209, //mulps %xmm9,%xmm10 + 184,224,7,0,0, //mov $0x7e0,%eax + 102,15,110,216, //movd %eax,%xmm3 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 - 102,65,15,219,217, //pand %xmm9,%xmm3 + 102,65,15,219,216, //pand %xmm8,%xmm3 68,15,91,203, //cvtdq2ps %xmm3,%xmm9 - 243,15,16,90,124, //movss 0x7c(%rdx),%xmm3 + 184,33,8,2,58, //mov $0x3a020821,%eax + 102,68,15,110,216, //movd %eax,%xmm11 + 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 + 69,15,89,217, //mulps %xmm9,%xmm11 + 184,31,0,0,0, //mov $0x1f,%eax + 102,15,110,216, //movd %eax,%xmm3 + 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 + 102,65,15,219,216, //pand %xmm8,%xmm3 + 68,15,91,195, //cvtdq2ps %xmm3,%xmm8 + 184,8,33,4,61, //mov $0x3d042108,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 - 65,15,89,217, //mulps %xmm9,%xmm3 + 65,15,89,216, //mulps %xmm8,%xmm3 15,92,196, //subps %xmm4,%xmm0 - 65,15,89,195, //mulps %xmm11,%xmm0 + 65,15,89,194, //mulps %xmm10,%xmm0 15,88,196, //addps %xmm4,%xmm0 15,92,205, //subps %xmm5,%xmm1 - 65,15,89,200, //mulps %xmm8,%xmm1 + 65,15,89,203, //mulps %xmm11,%xmm1 15,88,205, //addps %xmm5,%xmm1 15,92,214, //subps %xmm6,%xmm2 15,89,211, //mulps %xmm3,%xmm2 @@ -11844,7 +12373,8 @@ CODE const uint8_t sk_load_tables_sse2[] = { 72,139,8, //mov (%rax),%rcx 76,139,64,8, //mov 0x8(%rax),%r8 243,68,15,111,4,185, //movdqu (%rcx,%rdi,4),%xmm8 - 102,15,110,66,16, //movd 0x10(%rdx),%xmm0 + 185,255,0,0,0, //mov $0xff,%ecx + 102,15,110,193, //movd %ecx,%xmm0 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 102,69,15,111,200, //movdqa %xmm8,%xmm9 102,65,15,114,209,8, //psrld $0x8,%xmm9 @@ -11899,7 +12429,8 @@ CODE const uint8_t sk_load_tables_sse2[] = { 65,15,20,209, //unpcklps %xmm9,%xmm2 102,65,15,114,208,24, //psrld $0x18,%xmm8 69,15,91,192, //cvtdq2ps %xmm8,%xmm8 - 243,15,16,90,12, //movss 0xc(%rdx),%xmm3 + 184,129,128,128,59, //mov $0x3b808081,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 65,15,89,216, //mulps %xmm8,%xmm3 72,173, //lods %ds:(%rsi),%rax @@ -11914,7 +12445,8 @@ CODE const uint8_t sk_load_a8_sse2[] = { 102,15,96,193, //punpcklbw %xmm1,%xmm0 102,15,97,193, //punpcklwd %xmm1,%xmm0 15,91,192, //cvtdq2ps %xmm0,%xmm0 - 243,15,16,90,12, //movss 0xc(%rdx),%xmm3 + 184,129,128,128,59, //mov $0x3b808081,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 15,89,216, 
//mulps %xmm0,%xmm3 72,173, //lods %ds:(%rsi),%rax @@ -11927,7 +12459,8 @@ CODE const uint8_t sk_load_a8_sse2[] = { CODE const uint8_t sk_store_a8_sse2[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax - 243,68,15,16,66,8, //movss 0x8(%rdx),%xmm8 + 185,0,0,127,67, //mov $0x437f0000,%ecx + 102,68,15,110,193, //movd %ecx,%xmm8 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 68,15,89,195, //mulps %xmm3,%xmm8 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 @@ -11943,31 +12476,38 @@ CODE const uint8_t sk_store_a8_sse2[] = { CODE const uint8_t sk_load_565_sse2[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax - 243,68,15,126,12,120, //movq (%rax,%rdi,2),%xmm9 + 243,15,126,20,120, //movq (%rax,%rdi,2),%xmm2 102,15,239,192, //pxor %xmm0,%xmm0 - 102,68,15,97,200, //punpcklwd %xmm0,%xmm9 - 102,15,110,66,104, //movd 0x68(%rdx),%xmm0 + 102,15,97,208, //punpcklwd %xmm0,%xmm2 + 184,0,248,0,0, //mov $0xf800,%eax + 102,15,110,192, //movd %eax,%xmm0 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 - 102,65,15,219,193, //pand %xmm9,%xmm0 + 102,15,219,194, //pand %xmm2,%xmm0 15,91,200, //cvtdq2ps %xmm0,%xmm1 - 243,15,16,26, //movss (%rdx),%xmm3 - 243,15,16,66,116, //movss 0x74(%rdx),%xmm0 + 184,8,33,132,55, //mov $0x37842108,%eax + 102,15,110,192, //movd %eax,%xmm0 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 15,89,193, //mulps %xmm1,%xmm0 - 102,15,110,74,108, //movd 0x6c(%rdx),%xmm1 + 184,224,7,0,0, //mov $0x7e0,%eax + 102,15,110,200, //movd %eax,%xmm1 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1 - 102,65,15,219,201, //pand %xmm9,%xmm1 - 68,15,91,193, //cvtdq2ps %xmm1,%xmm8 - 243,15,16,74,120, //movss 0x78(%rdx),%xmm1 + 102,15,219,202, //pand %xmm2,%xmm1 + 15,91,217, //cvtdq2ps %xmm1,%xmm3 + 184,33,8,2,58, //mov $0x3a020821,%eax + 102,15,110,200, //movd %eax,%xmm1 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 - 65,15,89,200, //mulps %xmm8,%xmm1 - 102,15,110,82,112, //movd 0x70(%rdx),%xmm2 - 102,15,112,210,0, //pshufd $0x0,%xmm2,%xmm2 - 102,65,15,219,209, //pand %xmm9,%xmm2 - 68,15,91,194, //cvtdq2ps %xmm2,%xmm8 - 243,15,16,82,124, //movss 0x7c(%rdx),%xmm2 + 15,89,203, //mulps %xmm3,%xmm1 + 184,31,0,0,0, //mov $0x1f,%eax + 102,15,110,216, //movd %eax,%xmm3 + 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 + 102,15,219,218, //pand %xmm2,%xmm3 + 15,91,219, //cvtdq2ps %xmm3,%xmm3 + 184,8,33,4,61, //mov $0x3d042108,%eax + 102,15,110,208, //movd %eax,%xmm2 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 - 65,15,89,208, //mulps %xmm8,%xmm2 + 15,89,211, //mulps %xmm3,%xmm2 + 184,0,0,128,63, //mov $0x3f800000,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax @@ -11976,21 +12516,23 @@ CODE const uint8_t sk_load_565_sse2[] = { CODE const uint8_t sk_store_565_sse2[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax - 243,68,15,16,130,128,0,0,0, //movss 0x80(%rdx),%xmm8 - 243,68,15,16,138,132,0,0,0, //movss 0x84(%rdx),%xmm9 + 185,0,0,248,65, //mov $0x41f80000,%ecx + 102,68,15,110,193, //movd %ecx,%xmm8 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 - 69,15,40,208, //movaps %xmm8,%xmm10 - 68,15,89,208, //mulps %xmm0,%xmm10 - 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10 - 102,65,15,114,242,11, //pslld $0xb,%xmm10 - 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 - 68,15,89,201, //mulps %xmm1,%xmm9 + 69,15,40,200, //movaps %xmm8,%xmm9 + 68,15,89,200, //mulps %xmm0,%xmm9 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9 - 102,65,15,114,241,5, //pslld $0x5,%xmm9 - 102,69,15,235,202, //por %xmm10,%xmm9 + 102,65,15,114,241,11, //pslld $0xb,%xmm9 + 
185,0,0,124,66, //mov $0x427c0000,%ecx + 102,68,15,110,209, //movd %ecx,%xmm10 + 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 + 68,15,89,209, //mulps %xmm1,%xmm10 + 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10 + 102,65,15,114,242,5, //pslld $0x5,%xmm10 + 102,69,15,235,209, //por %xmm9,%xmm10 68,15,89,194, //mulps %xmm2,%xmm8 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 - 102,69,15,86,193, //orpd %xmm9,%xmm8 + 102,69,15,86,194, //orpd %xmm10,%xmm8 102,65,15,114,240,16, //pslld $0x10,%xmm8 102,65,15,114,224,16, //psrad $0x10,%xmm8 102,69,15,107,192, //packssdw %xmm8,%xmm8 @@ -12068,7 +12610,8 @@ CODE const uint8_t sk_load_f16_sse2[] = { 102,68,15,111,194, //movdqa %xmm2,%xmm8 102,68,15,97,192, //punpcklwd %xmm0,%xmm8 102,15,105,208, //punpckhwd %xmm0,%xmm2 - 102,15,110,66,100, //movd 0x64(%rdx),%xmm0 + 184,0,4,0,4, //mov $0x4000400,%eax + 102,15,110,192, //movd %eax,%xmm0 102,15,112,216,0, //pshufd $0x0,%xmm0,%xmm3 102,15,111,203, //movdqa %xmm3,%xmm1 102,65,15,101,200, //pcmpgtw %xmm8,%xmm1 @@ -12079,7 +12622,8 @@ CODE const uint8_t sk_load_f16_sse2[] = { 102,15,111,193, //movdqa %xmm1,%xmm0 102,65,15,97,192, //punpcklwd %xmm8,%xmm0 102,15,114,240,13, //pslld $0xd,%xmm0 - 102,15,110,82,92, //movd 0x5c(%rdx),%xmm2 + 184,0,0,128,119, //mov $0x77800000,%eax + 102,15,110,208, //movd %eax,%xmm2 102,68,15,112,202,0, //pshufd $0x0,%xmm2,%xmm9 65,15,89,193, //mulps %xmm9,%xmm0 102,65,15,105,200, //punpckhwd %xmm8,%xmm1 @@ -12099,7 +12643,8 @@ CODE const uint8_t sk_load_f16_sse2[] = { CODE const uint8_t sk_store_f16_sse2[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax - 102,68,15,110,66,96, //movd 0x60(%rdx),%xmm8 + 185,0,0,128,7, //mov $0x7800000,%ecx + 102,68,15,110,193, //movd %ecx,%xmm8 102,69,15,112,192,0, //pshufd $0x0,%xmm8,%xmm8 102,69,15,111,200, //movdqa %xmm8,%xmm9 68,15,89,200, //mulps %xmm0,%xmm9 @@ -12289,17 +12834,20 @@ CODE const uint8_t sk_mirror_y_sse2[] = { }; CODE const uint8_t sk_luminance_to_alpha_sse2[] = { - 243,15,16,154,136,0,0,0, //movss 0x88(%rdx),%xmm3 - 243,68,15,16,130,140,0,0,0, //movss 0x8c(%rdx),%xmm8 + 184,208,179,89,62, //mov $0x3e59b3d0,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 15,89,216, //mulps %xmm0,%xmm3 - 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 - 68,15,89,193, //mulps %xmm1,%xmm8 - 68,15,88,195, //addps %xmm3,%xmm8 - 243,15,16,154,144,0,0,0, //movss 0x90(%rdx),%xmm3 + 184,89,23,55,63, //mov $0x3f371759,%eax + 102,15,110,192, //movd %eax,%xmm0 + 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 + 15,89,193, //mulps %xmm1,%xmm0 + 15,88,195, //addps %xmm3,%xmm0 + 184,152,221,147,61, //mov $0x3d93dd98,%eax + 102,15,110,216, //movd %eax,%xmm3 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 15,89,218, //mulps %xmm2,%xmm3 - 65,15,88,216, //addps %xmm8,%xmm3 + 15,88,216, //addps %xmm0,%xmm3 72,173, //lods %ds:(%rsi),%rax 15,87,192, //xorps %xmm0,%xmm0 15,87,201, //xorps %xmm1,%xmm1 diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp index 13c33cc91e..a4a8975f67 100644 --- a/src/jumper/SkJumper_stages.cpp +++ b/src/jumper/SkJumper_stages.cpp @@ -8,9 +8,6 @@ #include "SkJumper.h" #include <string.h> -// It's tricky to relocate code referencing ordinary constants, so we read them from this struct. -using K = const SkJumper_constants; - template <typename T, typename P> static T unaligned_load(const P* p) { T v; @@ -26,19 +23,22 @@ static Dst bit_cast(const Src& src) { // A couple functions for embedding constants directly into code, // so that no .const or .literal4 section is created. 
- -static inline int constant(int x) { +static inline int C(int x) { #if defined(JUMPER) && defined(__x86_64__) // Move x-the-compile-time-constant as a literal into x-the-register. asm("mov %1, %0" : "=r"(x) : "i"(x)); #endif return x; } - -static inline float constant(float f) { - int x = constant(unaligned_load<int>(&f)); +static inline float C(float f) { + int x = C(unaligned_load<int>(&f)); return unaligned_load<float>(&x); } +static inline int operator "" _i(unsigned long long int i) { return C( (int)i); } +static inline float operator "" _f( long double f) { return C((float)f); } + +// Not all constants can be generated using C() or _i/_f. We read the rest from this struct. +using K = const SkJumper_constants; #if !defined(JUMPER) // This path should lead to portable code that can be compiled directly into Skia. @@ -55,7 +55,7 @@ static inline float constant(float f) { static F min(F a, F b) { return fminf(a,b); } static F max(F a, F b) { return fmaxf(a,b); } static F abs_ (F v) { return fabsf(v); } - static F floor(F v, K*) { return floorf(v); } + static F floor(F v) { return floorf(v); } static F rcp (F v) { return 1.0f / v; } static F rsqrt(F v) { return 1.0f / sqrtf(v); } static U32 round(F v, F scale) { return (uint32_t)lrintf(v*scale); } @@ -83,7 +83,7 @@ static inline float constant(float f) { static F min(F a, F b) { return vminq_f32(a,b); } static F max(F a, F b) { return vmaxq_f32(a,b); } static F abs_ (F v) { return vabsq_f32(v); } - static F floor(F v, K*) { return vrndmq_f32(v); } + static F floor(F v) { return vrndmq_f32(v); } static F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; } static F rsqrt(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; } static U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); } @@ -121,9 +121,9 @@ static inline float constant(float f) { static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); } - static F floor(F v, K* k) { + static F floor(F v) { F roundtrip = vcvt_f32_s32(vcvt_s32_f32(v)); - return roundtrip - if_then_else(roundtrip > v, constant(1.0f), 0); + return roundtrip - if_then_else(roundtrip > v, 1.0_f, 0); } static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; } @@ -151,7 +151,7 @@ static inline float constant(float f) { static F min(F a, F b) { return _mm256_min_ps(a,b); } static F max(F a, F b) { return _mm256_max_ps(a,b); } static F abs_(F v) { return _mm256_and_ps(v, 0-v); } - static F floor(F v, K*) { return _mm256_floor_ps(v); } + static F floor(F v) { return _mm256_floor_ps(v); } static F rcp (F v) { return _mm256_rcp_ps (v); } static F rsqrt(F v) { return _mm256_rsqrt_ps(v); } static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); } @@ -220,12 +220,12 @@ static inline float constant(float f) { return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e)); } - static F floor(F v, K* k) { + static F floor(F v) { #if defined(__SSE4_1__) return _mm_floor_ps(v); #else F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v)); - return roundtrip - if_then_else(roundtrip > v, constant(1.0f), 0); + return roundtrip - if_then_else(roundtrip > v, 1.0_f, 0); #endif } @@ -345,11 +345,11 @@ static F lerp(F from, F to, F t) { return mad(to-from, t, from); } -static void from_565(U16 _565, F* r, F* g, F* b, K* k) { +static void from_565(U16 _565, F* r, F* g, F* b) { U32 wide = expand(_565); - *r = cast(wide & k->r_565_mask) * k->r_565_scale; - *g = cast(wide & k->g_565_mask) * k->g_565_scale; - *b = cast(wide & k->b_565_mask) * k->b_565_scale; + *r = 
cast(wide & C(31<<11)) * C(1.0f / (31<<11)); + *g = cast(wide & C(63<< 5)) * C(1.0f / (63<< 5)); + *b = cast(wide & C(31<< 0)) * C(1.0f / (31<< 0)); } // Sometimes we want to work with 4 floats directly, regardless of the depth of the F vector. @@ -503,10 +503,9 @@ STAGE(seed_shader) { // It's important for speed to explicitly cast(x) and cast(y), // which has the effect of splatting them to vectors before converting to floats. // On Intel this breaks a data dependency on previous loop iterations' registers. - - r = cast(x) + constant(0.5f) + unaligned_load<F>(k->iota); - g = cast(y) + constant(0.5f); - b = constant(1.0f); + r = cast(x) + 0.5_f + unaligned_load<F>(k->iota); + g = cast(y) + 0.5_f; + b = 1.0_f; a = 0; dr = dg = db = da = 0; } @@ -531,14 +530,14 @@ STAGE(plus_) { } STAGE(srcover) { - auto A = constant(1.0f) - a; + auto A = C(1.0f) - a; r = mad(dr, A, r); g = mad(dg, A, g); b = mad(db, A, b); a = mad(da, A, a); } STAGE(dstover) { - auto DA = constant(1.0f) - da; + auto DA = 1.0_f - da; r = mad(r, DA, dr); g = mad(g, DA, dg); b = mad(b, DA, db); @@ -553,14 +552,14 @@ STAGE(clamp_0) { } STAGE(clamp_1) { - r = min(r, constant(1.0f)); - g = min(g, constant(1.0f)); - b = min(b, constant(1.0f)); - a = min(a, constant(1.0f)); + r = min(r, 1.0_f); + g = min(g, 1.0_f); + b = min(b, 1.0_f); + a = min(a, 1.0_f); } STAGE(clamp_a) { - a = min(a, constant(1.0f)); + a = min(a, 1.0_f); r = min(r, a); g = min(g, a); b = min(b, a); @@ -608,7 +607,7 @@ STAGE(premul) { b = b * a; } STAGE(unpremul) { - auto scale = if_then_else(a == 0, 0, constant(1.0f) / a); + auto scale = if_then_else(a == 0, 0, 1.0_f / a); r = r * scale; g = g * scale; b = b * scale; @@ -616,9 +615,9 @@ STAGE(unpremul) { STAGE(from_srgb) { auto fn = [&](F s) { - auto lo = s * k->_1_1292; - auto hi = mad(s*s, mad(s, k->_03000, k->_06975), k->_00025); - return if_then_else(s < k->_0055, lo, hi); + auto lo = s * C(1/12.92f); + auto hi = mad(s*s, mad(s, 0.3000_f, 0.6975_f), 0.0025_f); + return if_then_else(s < 0.055_f, lo, hi); }; r = fn(r); g = fn(g); @@ -628,11 +627,10 @@ STAGE(to_srgb) { auto fn = [&](F l) { F sqrt = rcp (rsqrt(l)), ftrt = rsqrt(rsqrt(l)); - auto lo = l * k->_1246; - auto hi = min(k->_1, mad(k->_0411192, ftrt, - mad(k->_0689206, sqrt, - k->n_00988))); - return if_then_else(l < k->_00043, lo, hi); + auto lo = l * 12.46_f; + auto hi = min(1.0_f, mad(0.411192_f, ftrt, + mad(0.689206_f, sqrt, -0.0988_f))); + return if_then_else(l < 0.0043_f, lo, hi); }; r = fn(r); g = fn(g); @@ -651,7 +649,7 @@ STAGE(scale_u8) { auto ptr = *(const uint8_t**)ctx + x; auto scales = load<U8>(ptr, tail); - auto c = cast(expand(scales)) * constant(1/255.0f); + auto c = cast(expand(scales)) * C(1/255.0f); r = r * c; g = g * c; @@ -671,7 +669,7 @@ STAGE(lerp_u8) { auto ptr = *(const uint8_t**)ctx + x; auto scales = load<U8>(ptr, tail); - auto c = cast(expand(scales)) * constant(1/255.0f); + auto c = cast(expand(scales)) * C(1/255.0f); r = lerp(dr, r, c); g = lerp(dg, g, c); @@ -682,12 +680,12 @@ STAGE(lerp_565) { auto ptr = *(const uint16_t**)ctx + x; F cr,cg,cb; - from_565(load<U16>(ptr, tail), &cr, &cg, &cb, k); + from_565(load<U16>(ptr, tail), &cr, &cg, &cb); r = lerp(dr, r, cr); g = lerp(dg, g, cg); b = lerp(db, b, cb); - a = constant(1.0f); + a = 1.0_f; } STAGE(load_tables) { @@ -698,37 +696,37 @@ STAGE(load_tables) { auto c = (const Ctx*)ctx; auto px = load<U32>(c->src + x, tail); - r = gather(c->r, (px ) & k->_0x000000ff); - g = gather(c->g, (px >> 8) & k->_0x000000ff); - b = gather(c->b, (px >> 16) & k->_0x000000ff); - a = cast( 
(px >> 24)) * k->_1_255; + r = gather(c->r, (px ) & 0xff_i); + g = gather(c->g, (px >> 8) & 0xff_i); + b = gather(c->b, (px >> 16) & 0xff_i); + a = cast( (px >> 24)) * C(1/255.0f); } STAGE(load_a8) { auto ptr = *(const uint8_t**)ctx + x; r = g = b = 0.0f; - a = cast(expand(load<U8>(ptr, tail))) * k->_1_255; + a = cast(expand(load<U8>(ptr, tail))) * C(1/255.0f); } STAGE(store_a8) { auto ptr = *(uint8_t**)ctx + x; - U8 packed = pack(pack(round(a, k->_255))); + U8 packed = pack(pack(round(a, 255.0_f))); store(ptr, packed, tail); } STAGE(load_565) { auto ptr = *(const uint16_t**)ctx + x; - from_565(load<U16>(ptr, tail), &r,&g,&b, k); - a = k->_1; + from_565(load<U16>(ptr, tail), &r,&g,&b); + a = 1.0_f; } STAGE(store_565) { auto ptr = *(uint16_t**)ctx + x; - U16 px = pack( round(r, k->_31) << 11 - | round(g, k->_63) << 5 - | round(b, k->_31) ); + U16 px = pack( round(r, 31.0_f) << 11 + | round(g, 63.0_f) << 5 + | round(b, 31.0_f) ); store(ptr, px, tail); } @@ -736,19 +734,19 @@ STAGE(load_8888) { auto ptr = *(const uint32_t**)ctx + x; auto px = load<U32>(ptr, tail); - r = cast((px ) & constant(0xff)) * constant(1/255.0f); - g = cast((px >> 8) & constant(0xff)) * constant(1/255.0f); - b = cast((px >> 16) & constant(0xff)) * constant(1/255.0f); - a = cast((px >> 24) ) * constant(1/255.0f); + r = cast((px ) & 0xff_i) * C(1/255.0f); + g = cast((px >> 8) & 0xff_i) * C(1/255.0f); + b = cast((px >> 16) & 0xff_i) * C(1/255.0f); + a = cast((px >> 24) ) * C(1/255.0f); } STAGE(store_8888) { auto ptr = *(uint32_t**)ctx + x; - U32 px = round(r, constant(255.0f)) - | round(g, constant(255.0f)) << 8 - | round(b, constant(255.0f)) << 16 - | round(a, constant(255.0f)) << 24; + U32 px = round(r, 255.0_f) + | round(g, 255.0_f) << 8 + | round(b, 255.0_f) << 16 + | round(a, 255.0_f) << 24; store(ptr, px, tail); } @@ -757,9 +755,9 @@ STAGE(load_f16) { #if !defined(JUMPER) auto half_to_float = [&](int16_t h) { - if (h < 0x0400) { h = 0; } // Flush denorm and negative to zero. - return bit_cast<F>(h << 13) // Line up the mantissa, - * bit_cast<F>(U32(k->_0x77800000)); // then fix up the exponent. + if (h < 0x0400) { h = 0; } // Flush denorm and negative to zero. + return bit_cast<F>(h << 13) // Line up the mantissa, + * bit_cast<F>(U32(0x77800000)); // then fix up the exponent. }; auto rgba = (const int16_t*)ptr; r = half_to_float(rgba[0]); @@ -844,8 +842,8 @@ STAGE(load_f16) { // half_to_float() slows down ~10x for denorm inputs, so we flush them to zero. // With a signed comparison this conveniently also flushes negative half floats to zero. - auto ftz = [k](__m128i v) { - return _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(k->_0x04000400)), v); + auto ftz = [](__m128i v) { + return _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(0x04000400_i)), v); }; rg0123 = ftz(rg0123); ba0123 = ftz(ba0123); @@ -862,8 +860,8 @@ STAGE(load_f16) { _mm_unpackhi_epi16(ba4567, _mm_setzero_si128())); auto half_to_float = [&](U32 h) { - return bit_cast<F>(h << 13) // Line up the mantissa, - * bit_cast<F>(U32(k->_0x77800000)); // then fix up the exponent. + return bit_cast<F>(h << 13) // Line up the mantissa, + * bit_cast<F>(U32(0x77800000_i)); // then fix up the exponent. }; r = half_to_float(R); @@ -882,15 +880,15 @@ STAGE(load_f16) { ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3 // Same deal as AVX, flush denorms and negatives to zero. 
- auto ftz = [k](__m128i v) { - return _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(k->_0x04000400)), v); + auto ftz = [](__m128i v) { + return _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(0x04000400_i)), v); }; rg = ftz(rg); ba = ftz(ba); auto half_to_float = [&](U32 h) { - return bit_cast<F>(h << 13) // Line up the mantissa, - * bit_cast<F>(U32(k->_0x77800000)); // then fix up the exponent. + return bit_cast<F>(h << 13) // Line up the mantissa, + * bit_cast<F>(U32(0x77800000_i)); // then fix up the exponent. }; r = half_to_float(_mm_unpacklo_epi16(rg, _mm_setzero_si128())); @@ -905,8 +903,8 @@ STAGE(store_f16) { #if !defined(JUMPER) auto float_to_half = [&](F f) { - return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000))) // Fix up the exponent, - >> 13; // then line up the mantissa. + return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i))) // Fix up the exponent, + >> 13; // then line up the mantissa. }; auto rgba = (int16_t*)ptr; rgba[0] = float_to_half(r); @@ -960,8 +958,8 @@ STAGE(store_f16) { } #elif defined(__AVX__) auto float_to_half = [&](F f) { - return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000))) // Fix up the exponent, - >> 13; // then line up the mantissa. + return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i))) // Fix up the exponent, + >> 13; // then line up the mantissa. }; U32 R = float_to_half(r), G = float_to_half(g), @@ -1002,8 +1000,8 @@ STAGE(store_f16) { } #elif defined(__SSE2__) auto float_to_half = [&](F f) { - return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000))) // Fix up the exponent, - >> 13; // then line up the mantissa. + return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i))) // Fix up the exponent, + >> 13; // then line up the mantissa. }; U32 R = float_to_half(r), G = float_to_half(g), @@ -1070,27 +1068,27 @@ STAGE(store_f32) { static F ulp_before(F v) { return bit_cast<F>(bit_cast<U32>(v) + U32(0xffffffff)); } -static F clamp(F v, float limit, K*) { +static F clamp(F v, float limit) { v = max(0, v); return min(v, ulp_before(limit)); } -static F repeat(F v, float limit, K* k) { - v = v - floor(v/limit, k)*limit; +static F repeat(F v, float limit) { + v = v - floor(v/limit)*limit; return min(v, ulp_before(limit)); } -static F mirror(F v, float limit, K* k) { - v = abs_( (v-limit) - (limit+limit)*floor((v-limit)/(limit+limit),k) - limit ); +static F mirror(F v, float limit) { + v = abs_( (v-limit) - (limit+limit)*floor((v-limit)/(limit+limit)) - limit ); return min(v, ulp_before(limit)); } -STAGE(clamp_x) { r = clamp (r, *(const float*)ctx, k); } -STAGE(clamp_y) { g = clamp (g, *(const float*)ctx, k); } -STAGE(repeat_x) { r = repeat(r, *(const float*)ctx, k); } -STAGE(repeat_y) { g = repeat(g, *(const float*)ctx, k); } -STAGE(mirror_x) { r = mirror(r, *(const float*)ctx, k); } -STAGE(mirror_y) { g = mirror(g, *(const float*)ctx, k); } +STAGE(clamp_x) { r = clamp (r, *(const float*)ctx); } +STAGE(clamp_y) { g = clamp (g, *(const float*)ctx); } +STAGE(repeat_x) { r = repeat(r, *(const float*)ctx); } +STAGE(repeat_y) { g = repeat(g, *(const float*)ctx); } +STAGE(mirror_x) { r = mirror(r, *(const float*)ctx); } +STAGE(mirror_y) { g = mirror(g, *(const float*)ctx); } STAGE(luminance_to_alpha) { - a = r*k->lum_r + g*k->lum_g + b*k->lum_b; + a = r*0.2126_f + g*0.7152_f + b*0.0722_f; r = g = b = 0; } diff --git a/src/jumper/build_stages.py b/src/jumper/build_stages.py index 118291d16d..7513b0df09 100755 --- a/src/jumper/build_stages.py +++ b/src/jumper/build_stages.py @@ -78,14 +78,14 @@ def parse_object_file(dot_o, array_type, 
target=None):
 # Look for sections we know we can't handle.
 section_headers = subprocess.check_output(cmd + ['-h', dot_o])
- for section in ['.literal4', '.literal8', '.literal16', '.const']:
- if section in section_headers:
- print >>sys.stderr, 'Found %s section, which we cannot handle.' % section
- assert section not in section_headers
+ for snippet in ['.literal', '.const', '.rodata']:
+ if snippet in section_headers:
+ print >>sys.stderr, 'Found %s in section.' % snippet
+ assert snippet not in section_headers
 # Ok. Let's disassemble.
 active = False
- disassemble = ['-d', '--insn-width=9', dot_o]
+ disassemble = ['-d', '--insn-width=10', dot_o]
 for line in subprocess.check_output(cmd + disassemble).split('\n'):
 line = line.strip()
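For reference, here is a minimal standalone sketch of the constant-baking mechanism that the SkJumper_stages.cpp hunk above introduces, assuming an x86-64 GCC or Clang toolchain building with optimization (the "i" constraint only resolves once the call is inlined with a constant argument, which is how the stages are always compiled). scale_to_byte is a hypothetical example function, not part of the commit:

#include <cstring>

// The "i" constraint pins x as an immediate operand, so the value is encoded
// in the mov instruction itself and never lands in .literal*/.const/.rodata,
// which is exactly what build_stages.py now asserts.
static inline int C(int x) {
#if defined(__x86_64__)
    asm("mov %1, %0" : "=r"(x) : "i"(x));   // mov $imm,%reg
#endif
    return x;
}

static inline float C(float f) {
    int x;
    std::memcpy(&x, &f, sizeof x);     // bit pattern of f, without a memory load
    x = C(x);
    float out;
    std::memcpy(&out, &x, sizeof out);
    return out;
}

// The user-defined literals keep call sites tidy: 255.0_f, 0xff_i, etc.
static inline int   operator"" _i(unsigned long long int i) { return C((int)i);   }
static inline float operator"" _f(long double f)            { return C((float)f); }

// Hypothetical usage: with -O2 this compiles to roughly
//   mov $0x437f0000,%eax ; movd %eax,%xmm1 ; mulss %xmm1,%xmm0
static float scale_to_byte(float v) { return v * 255.0_f; }

This is why the generated code above no longer references %rdx for anything except iota: every other constant is synthesized inline, and the SkJumper_constants struct shrinks to just the iota table.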