diff options
author | 2017-03-14 17:35:04 -0700 | |
---|---|---|
committer | 2017-03-15 01:17:04 +0000 | |
commit | 64b974836a594c4f14384ded399ff09e96160215 (patch) | |
tree | 3b421fe84eb9d938f556a019e35c27ece62a1fb7 /src/jumper/SkJumper_generated.cpp | |
parent | 009e68c719a20675cf377f7c3ffbfb874ab3850a (diff) |
SkJumper: update to Clang 4.0
This Clang makes some new decisions about what (not) to inline.
Luckily, liberal use of the 'inline' keyword steers it back in
the right direction.
This new code draws the same, and generally looks improved.
Change-Id: I0ab6e1c884e6b339d01ae46a08a848e36dcc535a
Reviewed-on: https://skia-review.googlesource.com/9702
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/jumper/SkJumper_generated.cpp')
-rw-r--r-- | src/jumper/SkJumper_generated.cpp | 1542 |
1 files changed, 752 insertions, 790 deletions
diff --git a/src/jumper/SkJumper_generated.cpp b/src/jumper/SkJumper_generated.cpp index df6e75234f..2494c1eb05 100644 --- a/src/jumper/SkJumper_generated.cpp +++ b/src/jumper/SkJumper_generated.cpp @@ -25,15 +25,15 @@ CODE const uint32_t sk_start_pipeline_aarch64[] = { 0xa9bd5bf7, //stp x23, x22, [sp, #-48]! 0xa90153f5, //stp x21, x20, [sp, #16] 0xa9027bf3, //stp x19, x30, [sp, #32] - 0xaa0103f5, //mov x21, x1 - 0xf84086b7, //ldr x23, [x21], #8 - 0xaa0003f6, //mov x22, x0 + 0xaa0103f4, //mov x20, x1 + 0xf8408697, //ldr x23, [x20], #8 + 0xaa0003f5, //mov x21, x0 0xaa0303f3, //mov x19, x3 - 0xaa0203f4, //mov x20, x2 - 0x910012c8, //add x8, x22, #0x4 + 0x910012a8, //add x8, x21, #0x4 0xeb13011f, //cmp x8, x19 + 0xaa0203f6, //mov x22, x2 0x54000069, //b.ls 34 <sk_start_pipeline_aarch64+0x34> // b.plast - 0xaa1603e0, //mov x0, x22 + 0xaa1503e0, //mov x0, x21 0x14000012, //b 78 <sk_start_pipeline_aarch64+0x78> 0x6f00e400, //movi v0.2d, #0x0 0x6f00e401, //movi v1.2d, #0x0 @@ -43,14 +43,14 @@ CODE const uint32_t sk_start_pipeline_aarch64[] = { 0x6f00e405, //movi v5.2d, #0x0 0x6f00e406, //movi v6.2d, #0x0 0x6f00e407, //movi v7.2d, #0x0 - 0xaa1603e0, //mov x0, x22 - 0xaa1503e1, //mov x1, x21 - 0xaa1403e2, //mov x2, x20 + 0xaa1503e0, //mov x0, x21 + 0xaa1403e1, //mov x1, x20 + 0xaa1603e2, //mov x2, x22 0xd63f02e0, //blr x23 - 0x910022c8, //add x8, x22, #0x8 - 0x910012c0, //add x0, x22, #0x4 + 0x910012a0, //add x0, x21, #0x4 + 0x910022a8, //add x8, x21, #0x8 0xeb13011f, //cmp x8, x19 - 0xaa0003f6, //mov x22, x0 + 0xaa0003f5, //mov x21, x0 0x54fffe09, //b.ls 34 <sk_start_pipeline_aarch64+0x34> // b.plast 0xa9427bf3, //ldp x19, x30, [sp, #32] 0xa94153f5, //ldp x21, x20, [sp, #16] @@ -384,7 +384,6 @@ CODE const uint32_t sk_scale_u8_aarch64[] = { 0x4e061d51, //mov v17.h[1], w10 0x4e0a1d71, //mov v17.h[2], w11 0x4e0e1d11, //mov v17.h[3], w8 - 0x2f07b7f1, //bic v17.4h, #0xff, lsl #8 0x2f10a631, //uxtl v17.4s, v17.4h 0x6e21da31, //ucvtf v17.4s, v17.4s 0x6e30de30, //fmul v16.4s, v17.4s, v16.4s @@ -429,7 +428,6 @@ CODE const uint32_t sk_lerp_u8_aarch64[] = { 0x4e061d31, //mov v17.h[1], w9 0x4e0a1d71, //mov v17.h[2], w11 0x4e0e1d11, //mov v17.h[3], w8 - 0x2f07b7f1, //bic v17.4h, #0xff, lsl #8 0x2f10a620, //uxtl v0.4s, v17.4h 0x6e21d800, //ucvtf v0.4s, v0.4s 0x6e30dc10, //fmul v16.4s, v0.4s, v16.4s @@ -562,7 +560,6 @@ CODE const uint32_t sk_load_a8_aarch64[] = { 0x4e061d63, //mov v3.h[1], w11 0x4e0a1d83, //mov v3.h[2], w12 0x4e0e1d03, //mov v3.h[3], w8 - 0x2f07b7e3, //bic v3.4h, #0xff, lsl #8 0x2f10a463, //uxtl v3.4s, v3.4h 0x6e21d863, //ucvtf v3.4s, v3.4s 0x6e22dc63, //fmul v3.4s, v3.4s, v2.4s @@ -980,13 +977,14 @@ CODE const uint32_t sk_linear_gradient_2stops_aarch64[] = { CODE const uint32_t sk_start_pipeline_vfp4[] = { 0xe92d41f0, //push {r4, r5, r6, r7, r8, lr} - 0xe1a07001, //mov r7, r1 0xe1a04000, //mov r4, r0 + 0xe2840002, //add r0, r4, #2 0xe1a05003, //mov r5, r3 0xe1a08002, //mov r8, r2 + 0xe1a07001, //mov r7, r1 + 0xe1500005, //cmp r0, r5 + 0x8a000010, //bhi 64 <sk_start_pipeline_vfp4+0x64> 0xe4976004, //ldr r6, [r7], #4 - 0xe2840002, //add r0, r4, #2 - 0xea00000d, //b 58 <sk_start_pipeline_vfp4+0x58> 0xf2800010, //vmov.i32 d0, #0 0xe1a00004, //mov r0, r4 0xf2801010, //vmov.i32 d1, #0 @@ -1002,7 +1000,7 @@ CODE const uint32_t sk_start_pipeline_vfp4[] = { 0xe2840004, //add r0, r4, #4 0xe2844002, //add r4, r4, #2 0xe1500005, //cmp r0, r5 - 0x9affffef, //bls 20 <sk_start_pipeline_vfp4+0x20> + 0x9affffef, //bls 24 <sk_start_pipeline_vfp4+0x24> 0xe1a00004, //mov r0, r4 0xe8bd81f0, //pop {r4, r5, r6, r7, r8, pc} }; @@ -1123,13 +1121,11 @@ CODE const uint32_t sk_clamp_a_vfp4[] = { CODE const uint32_t sk_set_rgb_vfp4[] = { 0xe92d4800, //push {fp, lr} - 0xe591e000, //ldr lr, [r1] - 0xe591c004, //ldr ip, [r1, #4] + 0xe8911008, //ldm r1, {r3, ip} 0xe2811008, //add r1, r1, #8 - 0xe28e3008, //add r3, lr, #8 - 0xf4ae0c9f, //vld1.32 {d0[]}, [lr :32] - 0xf4a32c9f, //vld1.32 {d2[]}, [r3 :32] - 0xe28e3004, //add r3, lr, #4 + 0xe283e008, //add lr, r3, #8 + 0xf4a30c9d, //vld1.32 {d0[]}, [r3 :32]! + 0xf4ae2c9f, //vld1.32 {d2[]}, [lr :32] 0xf4a31c9f, //vld1.32 {d1[]}, [r3 :32] 0xe8bd4800, //pop {fp, lr} 0xe12fff1c, //bx ip @@ -1200,6 +1196,7 @@ CODE const uint32_t sk_unpremul_vfp4[] = { 0xf3012d92, //vmul.f32 d2, d17, d2 0xecbd8b04, //vpop {d8-d9} 0xe12fff13, //bx r3 + 0xe320f000, //nop {0} }; CODE const uint32_t sk_from_srgb_vfp4[] = { @@ -1325,15 +1322,13 @@ CODE const uint32_t sk_to_srgb_vfp4[] = { }; CODE const uint32_t sk_scale_1_float_vfp4[] = { - 0xed2d8b02, //vpush {d8} 0xe8911008, //ldm r1, {r3, ip} 0xe2811008, //add r1, r1, #8 - 0xed938a00, //vldr s16, [r3] - 0xf2a00948, //vmul.f32 d0, d0, d8[0] - 0xf2a11948, //vmul.f32 d1, d1, d8[0] - 0xf2a22948, //vmul.f32 d2, d2, d8[0] - 0xf2a33948, //vmul.f32 d3, d3, d8[0] - 0xecbd8b02, //vpop {d8} + 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32] + 0xf3000d90, //vmul.f32 d0, d16, d0 + 0xf3001d91, //vmul.f32 d1, d16, d1 + 0xf3002d92, //vmul.f32 d2, d16, d2 + 0xf3003d93, //vmul.f32 d3, d16, d3 0xe12fff1c, //bx ip }; @@ -1677,23 +1672,19 @@ CODE const uint32_t sk_store_8888_vfp4[] = { }; CODE const uint32_t sk_load_f16_vfp4[] = { - 0xed2d8b04, //vpush {d8-d9} 0xe8911008, //ldm r1, {r3, ip} 0xe2811008, //add r1, r1, #8 0xe5933000, //ldr r3, [r3] 0xe0833180, //add r3, r3, r0, lsl #3 0xf463084f, //vld2.16 {d16-d17}, [r3] - 0xf3b62720, //vcvt.f32.f16 q1, d16 - 0xf3b68721, //vcvt.f32.f16 q4, d17 - 0xf2220112, //vorr d0, d2, d2 - 0xeef00a43, //vmov.f32 s1, s6 - 0xf2281118, //vorr d1, d8, d8 - 0xeeb03a62, //vmov.f32 s6, s5 - 0xeef01a49, //vmov.f32 s3, s18 - 0xeeb09a68, //vmov.f32 s18, s17 - 0xeeb02b43, //vmov.f64 d2, d3 - 0xeeb03b49, //vmov.f64 d3, d9 - 0xecbd8b04, //vpop {d8-d9} + 0xf3f62720, //vcvt.f32.f16 q9, d16 + 0xf3f60721, //vcvt.f32.f16 q8, d17 + 0xf22201b2, //vorr d0, d18, d18 + 0xf22011b0, //vorr d1, d16, d16 + 0xf3ba00a3, //vtrn.32 d0, d19 + 0xf22321b3, //vorr d2, d19, d19 + 0xf3ba10a1, //vtrn.32 d1, d17 + 0xf22131b1, //vorr d3, d17, d17 0xe12fff1c, //bx ip }; @@ -1902,10 +1893,9 @@ CODE const uint32_t sk_matrix_2x3_vfp4[] = { 0xe28e3014, //add r3, lr, #20 0xf2410c31, //vfma.f32 d16, d1, d17 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32] - 0xe28e3004, //add r3, lr, #4 0xf2411c32, //vfma.f32 d17, d1, d18 - 0xf4ee2c9f, //vld1.32 {d18[]}, [lr :32] - 0xf4e33c9f, //vld1.32 {d19[]}, [r3 :32] + 0xf4ee2c9d, //vld1.32 {d18[]}, [lr :32]! + 0xf4ee3c9f, //vld1.32 {d19[]}, [lr :32] 0xf2400c32, //vfma.f32 d16, d0, d18 0xf2401c33, //vfma.f32 d17, d0, d19 0xf22001b0, //vorr d0, d16, d16 @@ -1942,11 +1932,10 @@ CODE const uint32_t sk_matrix_3x4_vfp4[] = { 0xf2411c34, //vfma.f32 d17, d1, d20 0xf4e34c9f, //vld1.32 {d20[]}, [r3 :32] 0xf2410c34, //vfma.f32 d16, d1, d20 - 0xe28e3004, //add r3, lr, #4 - 0xf2412c33, //vfma.f32 d18, d1, d19 - 0xf4ee3c9f, //vld1.32 {d19[]}, [lr :32] - 0xf4e34c9f, //vld1.32 {d20[]}, [r3 :32] 0xe28e3008, //add r3, lr, #8 + 0xf2412c33, //vfma.f32 d18, d1, d19 + 0xf4ee3c9d, //vld1.32 {d19[]}, [lr :32]! + 0xf4ee4c9f, //vld1.32 {d20[]}, [lr :32] 0xf2401c33, //vfma.f32 d17, d0, d19 0xf4e33c9f, //vld1.32 {d19[]}, [r3 :32] 0xf2400c33, //vfma.f32 d16, d0, d19 @@ -1959,107 +1948,106 @@ CODE const uint32_t sk_matrix_3x4_vfp4[] = { }; CODE const uint32_t sk_matrix_4x5_vfp4[] = { - 0xe92d4800, //push {fp, lr} - 0xe591e000, //ldr lr, [r1] - 0xf2630113, //vorr d16, d3, d3 - 0xf2621112, //vorr d17, d2, d2 - 0xe591c004, //ldr ip, [r1, #4] - 0xe28e301c, //add r3, lr, #28 + 0xe92d4010, //push {r4, lr} + 0xe8911008, //ldm r1, {r3, ip} + 0xf2620112, //vorr d16, d2, d2 0xe2811008, //add r1, r1, #8 - 0xf4ee4c9f, //vld1.32 {d20[]}, [lr :32] - 0xf4e35c9f, //vld1.32 {d21[]}, [r3 :32] - 0xe28e302c, //add r3, lr, #44 - 0xf4e36c9f, //vld1.32 {d22[]}, [r3 :32] - 0xe28e303c, //add r3, lr, #60 - 0xf4e37c9f, //vld1.32 {d23[]}, [r3 :32] - 0xe28e304c, //add r3, lr, #76 - 0xf4a33c9f, //vld1.32 {d3[]}, [r3 :32] - 0xe28e3038, //add r3, lr, #56 - 0xf2003cb7, //vfma.f32 d3, d16, d23 - 0xf4e38c9f, //vld1.32 {d24[]}, [r3 :32] - 0xe28e3048, //add r3, lr, #72 - 0xf4a32c9f, //vld1.32 {d2[]}, [r3 :32] - 0xe28e3034, //add r3, lr, #52 - 0xf2002cb8, //vfma.f32 d2, d16, d24 - 0xf4e39c9f, //vld1.32 {d25[]}, [r3 :32] - 0xe28e3030, //add r3, lr, #48 - 0xf4e33c9f, //vld1.32 {d19[]}, [r3 :32] - 0xe28e3040, //add r3, lr, #64 - 0xf2013cb6, //vfma.f32 d3, d17, d22 - 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32] - 0xe28e3044, //add r3, lr, #68 - 0xf2402cb3, //vfma.f32 d18, d16, d19 - 0xf4e33c9f, //vld1.32 {d19[]}, [r3 :32] - 0xe28e3024, //add r3, lr, #36 - 0xf2403cb9, //vfma.f32 d19, d16, d25 - 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32] - 0xe28e3020, //add r3, lr, #32 - 0xf4e37c9f, //vld1.32 {d23[]}, [r3 :32] - 0xe28e3028, //add r3, lr, #40 - 0xf2013c35, //vfma.f32 d3, d1, d21 - 0xf2412cb7, //vfma.f32 d18, d17, d23 - 0xf4e37c9f, //vld1.32 {d23[]}, [r3 :32] - 0xe28e3014, //add r3, lr, #20 - 0xf2012cb7, //vfma.f32 d2, d17, d23 - 0xf2413cb0, //vfma.f32 d19, d17, d16 - 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32] - 0xe28e3010, //add r3, lr, #16 - 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32] - 0xe28e3018, //add r3, lr, #24 - 0xf2412c31, //vfma.f32 d18, d1, d17 - 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32] - 0xe28e3008, //add r3, lr, #8 - 0xf2012c31, //vfma.f32 d2, d1, d17 + 0xe2834014, //add r4, r3, #20 + 0xe1a0e003, //mov lr, r3 + 0xf4e45c9f, //vld1.32 {d21[]}, [r4 :32] + 0xe2834028, //add r4, r3, #40 + 0xf4e46c9f, //vld1.32 {d22[]}, [r4 :32] + 0xe2834038, //add r4, r3, #56 + 0xf4e47c9f, //vld1.32 {d23[]}, [r4 :32] + 0xe2834048, //add r4, r3, #72 + 0xf4a42c9f, //vld1.32 {d2[]}, [r4 :32] + 0xe2834034, //add r4, r3, #52 + 0xf2032c37, //vfma.f32 d2, d3, d23 + 0xf4e48c9f, //vld1.32 {d24[]}, [r4 :32] + 0xe2834044, //add r4, r3, #68 + 0xf4e41c9f, //vld1.32 {d17[]}, [r4 :32] + 0xe2834030, //add r4, r3, #48 + 0xf2431c38, //vfma.f32 d17, d3, d24 + 0xf4e49c9f, //vld1.32 {d25[]}, [r4 :32] + 0xe283403c, //add r4, r3, #60 + 0xf4e43c9f, //vld1.32 {d19[]}, [r4 :32] + 0xe283404c, //add r4, r3, #76 + 0xf2002cb6, //vfma.f32 d2, d16, d22 + 0xf4e42c9f, //vld1.32 {d18[]}, [r4 :32] + 0xe2834040, //add r4, r3, #64 + 0xf2432c33, //vfma.f32 d18, d3, d19 + 0xf4e43c9f, //vld1.32 {d19[]}, [r4 :32] + 0xe2834020, //add r4, r3, #32 + 0xf2433c39, //vfma.f32 d19, d3, d25 + 0xf4e47c9f, //vld1.32 {d23[]}, [r4 :32] + 0xe283402c, //add r4, r3, #44 + 0xf4e48c9f, //vld1.32 {d24[]}, [r4 :32] + 0xe2834024, //add r4, r3, #36 + 0xf2402cb8, //vfma.f32 d18, d16, d24 + 0xf4e48c9f, //vld1.32 {d24[]}, [r4 :32] + 0xf2401cb8, //vfma.f32 d17, d16, d24 + 0xe2834010, //add r4, r3, #16 + 0xf2403cb7, //vfma.f32 d19, d16, d23 + 0xf4ee4c9d, //vld1.32 {d20[]}, [lr :32]! + 0xf4e40c9f, //vld1.32 {d16[]}, [r4 :32] + 0xe283401c, //add r4, r3, #28 + 0xf4e46c9f, //vld1.32 {d22[]}, [r4 :32] + 0xe2834018, //add r4, r3, #24 + 0xf2412c36, //vfma.f32 d18, d1, d22 + 0xf2411c35, //vfma.f32 d17, d1, d21 + 0xf4ee5c9f, //vld1.32 {d21[]}, [lr :32] 0xf2413c30, //vfma.f32 d19, d1, d16 - 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32] - 0xe28e3004, //add r3, lr, #4 - 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32] - 0xe28e300c, //add r3, lr, #12 + 0xf4e40c9f, //vld1.32 {d16[]}, [r4 :32] + 0xe2834008, //add r4, r3, #8 + 0xe283300c, //add r3, r3, #12 + 0xf2012c30, //vfma.f32 d2, d1, d16 + 0xf4e40c9f, //vld1.32 {d16[]}, [r4 :32] + 0xf2401c35, //vfma.f32 d17, d0, d21 + 0xf2403c34, //vfma.f32 d19, d0, d20 + 0xf4e34c9f, //vld1.32 {d20[]}, [r3 :32] 0xf2402c34, //vfma.f32 d18, d0, d20 0xf2002c30, //vfma.f32 d2, d0, d16 - 0xf2403c31, //vfma.f32 d19, d0, d17 - 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32] - 0xf2003c31, //vfma.f32 d3, d0, d17 - 0xf22201b2, //vorr d0, d18, d18 - 0xf22311b3, //vorr d1, d19, d19 - 0xe8bd4800, //pop {fp, lr} + 0xf22111b1, //vorr d1, d17, d17 + 0xf22301b3, //vorr d0, d19, d19 + 0xf22231b2, //vorr d3, d18, d18 + 0xe8bd4010, //pop {r4, lr} 0xe12fff1c, //bx ip }; CODE const uint32_t sk_matrix_perspective_vfp4[] = { - 0xe92d4800, //push {fp, lr} + 0xe92d4010, //push {r4, lr} 0xe591e000, //ldr lr, [r1] 0xe591c004, //ldr ip, [r1, #4] 0xe2811008, //add r1, r1, #8 0xe28e301c, //add r3, lr, #28 + 0xe28e4010, //add r4, lr, #16 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32] 0xe28e3020, //add r3, lr, #32 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32] 0xe28e3018, //add r3, lr, #24 0xf2411c30, //vfma.f32 d17, d1, d16 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32] - 0xe28e3010, //add r3, lr, #16 + 0xe1a0300e, //mov r3, lr + 0xf4e42c9f, //vld1.32 {d18[]}, [r4 :32] + 0xe28e4008, //add r4, lr, #8 + 0xf4e43c9f, //vld1.32 {d19[]}, [r4 :32] 0xf2401c30, //vfma.f32 d17, d0, d16 - 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32] - 0xe28e3004, //add r3, lr, #4 - 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32] - 0xe28e3008, //add r3, lr, #8 - 0xf4e34c9f, //vld1.32 {d20[]}, [r3 :32] + 0xf4e30c9d, //vld1.32 {d16[]}, [r3 :32]! + 0xf4e35c9f, //vld1.32 {d21[]}, [r3 :32] 0xe28e3014, //add r3, lr, #20 - 0xf2414c32, //vfma.f32 d20, d1, d18 - 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32] - 0xe28e300c, //add r3, lr, #12 - 0xf3fb3521, //vrecpe.f32 d19, d17 - 0xf2412c30, //vfma.f32 d18, d1, d16 + 0xf2413c35, //vfma.f32 d19, d1, d21 0xf4e35c9f, //vld1.32 {d21[]}, [r3 :32] - 0xf2410fb3, //vrecps.f32 d16, d17, d19 - 0xf4ee1c9f, //vld1.32 {d17[]}, [lr :32] - 0xf2404c31, //vfma.f32 d20, d0, d17 - 0xf2402c35, //vfma.f32 d18, d0, d21 - 0xf3430db0, //vmul.f32 d16, d19, d16 - 0xf3040db0, //vmul.f32 d0, d20, d16 - 0xf3021db0, //vmul.f32 d1, d18, d16 - 0xe8bd4800, //pop {fp, lr} + 0xe28e300c, //add r3, lr, #12 + 0xf2415c32, //vfma.f32 d21, d1, d18 + 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32] + 0xf3fb4521, //vrecpe.f32 d20, d17 + 0xf2403c30, //vfma.f32 d19, d0, d16 + 0xf2411fb4, //vrecps.f32 d17, d17, d20 + 0xf2405c32, //vfma.f32 d21, d0, d18 + 0xf3440db1, //vmul.f32 d16, d20, d17 + 0xf3030db0, //vmul.f32 d0, d19, d16 + 0xf3051db0, //vmul.f32 d1, d21, d16 + 0xe8bd4010, //pop {r4, lr} 0xe12fff1c, //bx ip }; @@ -2151,11 +2139,11 @@ CODE const uint8_t sk_just_return_hsw[] = { CODE const uint8_t sk_seed_shader_hsw[] = { 72,173, //lods %ds:(%rsi),%rax 197,249,110,199, //vmovd %edi,%xmm0 - 196,226,125,24,192, //vbroadcastss %xmm0,%ymm0 + 196,226,125,88,192, //vpbroadcastd %xmm0,%ymm0 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 65,184,0,0,0,63, //mov $0x3f000000,%r8d 196,193,121,110,200, //vmovd %r8d,%xmm1 - 196,226,125,24,201, //vbroadcastss %xmm1,%ymm1 + 196,226,125,88,201, //vpbroadcastd %xmm1,%ymm1 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0 197,252,88,2, //vaddps (%rdx),%ymm0,%ymm0 196,226,125,24,16, //vbroadcastss (%rax),%ymm2 @@ -2163,7 +2151,7 @@ CODE const uint8_t sk_seed_shader_hsw[] = { 197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1 184,0,0,128,63, //mov $0x3f800000,%eax 197,249,110,208, //vmovd %eax,%xmm2 - 196,226,125,24,210, //vbroadcastss %xmm2,%ymm2 + 196,226,125,88,210, //vpbroadcastd %xmm2,%ymm2 72,173, //lods %ds:(%rsi),%rax 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4 @@ -2204,7 +2192,7 @@ CODE const uint8_t sk_plus__hsw[] = { CODE const uint8_t sk_srcover_hsw[] = { 184,0,0,128,63, //mov $0x3f800000,%eax 197,121,110,192, //vmovd %eax,%xmm8 - 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 197,60,92,195, //vsubps %ymm3,%ymm8,%ymm8 196,194,93,184,192, //vfmadd231ps %ymm8,%ymm4,%ymm0 196,194,85,184,200, //vfmadd231ps %ymm8,%ymm5,%ymm1 @@ -2217,7 +2205,7 @@ CODE const uint8_t sk_srcover_hsw[] = { CODE const uint8_t sk_dstover_hsw[] = { 184,0,0,128,63, //mov $0x3f800000,%eax 197,121,110,192, //vmovd %eax,%xmm8 - 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 197,60,92,199, //vsubps %ymm7,%ymm8,%ymm8 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0 196,226,61,168,205, //vfmadd213ps %ymm5,%ymm8,%ymm1 @@ -2240,7 +2228,7 @@ CODE const uint8_t sk_clamp_0_hsw[] = { CODE const uint8_t sk_clamp_1_hsw[] = { 184,0,0,128,63, //mov $0x3f800000,%eax 197,121,110,192, //vmovd %eax,%xmm8 - 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0 196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1 196,193,108,93,208, //vminps %ymm8,%ymm2,%ymm2 @@ -2252,7 +2240,7 @@ CODE const uint8_t sk_clamp_1_hsw[] = { CODE const uint8_t sk_clamp_a_hsw[] = { 184,0,0,128,63, //mov $0x3f800000,%eax 197,121,110,192, //vmovd %eax,%xmm8 - 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3 197,252,93,195, //vminps %ymm3,%ymm0,%ymm0 197,244,93,203, //vminps %ymm3,%ymm1,%ymm1 @@ -2326,7 +2314,7 @@ CODE const uint8_t sk_unpremul_hsw[] = { 196,65,100,194,200,0, //vcmpeqps %ymm8,%ymm3,%ymm9 184,0,0,128,63, //mov $0x3f800000,%eax 197,121,110,208, //vmovd %eax,%xmm10 - 196,66,125,24,210, //vbroadcastss %xmm10,%ymm10 + 196,66,125,88,210, //vpbroadcastd %xmm10,%ymm10 197,44,94,211, //vdivps %ymm3,%ymm10,%ymm10 196,67,45,74,192,144, //vblendvps %ymm9,%ymm8,%ymm10,%ymm8 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 @@ -2339,29 +2327,29 @@ CODE const uint8_t sk_unpremul_hsw[] = { CODE const uint8_t sk_from_srgb_hsw[] = { 184,145,131,158,61, //mov $0x3d9e8391,%eax 197,121,110,192, //vmovd %eax,%xmm8 - 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 197,124,89,208, //vmulps %ymm0,%ymm0,%ymm10 184,154,153,153,62, //mov $0x3e99999a,%eax 197,121,110,216, //vmovd %eax,%xmm11 - 196,66,125,24,219, //vbroadcastss %xmm11,%ymm11 + 196,66,125,88,219, //vpbroadcastd %xmm11,%ymm11 184,92,143,50,63, //mov $0x3f328f5c,%eax 197,121,110,224, //vmovd %eax,%xmm12 - 196,66,125,24,228, //vbroadcastss %xmm12,%ymm12 - 196,65,124,40,235, //vmovaps %ymm11,%ymm13 + 196,66,125,88,228, //vpbroadcastd %xmm12,%ymm12 + 196,65,125,111,235, //vmovdqa %ymm11,%ymm13 196,66,125,168,236, //vfmadd213ps %ymm12,%ymm0,%ymm13 184,10,215,35,59, //mov $0x3b23d70a,%eax 197,121,110,240, //vmovd %eax,%xmm14 - 196,66,125,24,246, //vbroadcastss %xmm14,%ymm14 + 196,66,125,88,246, //vpbroadcastd %xmm14,%ymm14 196,66,45,168,238, //vfmadd213ps %ymm14,%ymm10,%ymm13 184,174,71,97,61, //mov $0x3d6147ae,%eax 197,121,110,208, //vmovd %eax,%xmm10 - 196,66,125,24,210, //vbroadcastss %xmm10,%ymm10 + 196,66,125,88,210, //vpbroadcastd %xmm10,%ymm10 196,193,124,194,194,1, //vcmpltps %ymm10,%ymm0,%ymm0 196,195,21,74,193,0, //vblendvps %ymm0,%ymm9,%ymm13,%ymm0 197,60,89,201, //vmulps %ymm1,%ymm8,%ymm9 197,116,89,233, //vmulps %ymm1,%ymm1,%ymm13 - 196,65,124,40,251, //vmovaps %ymm11,%ymm15 + 196,65,125,111,251, //vmovdqa %ymm11,%ymm15 196,66,117,168,252, //vfmadd213ps %ymm12,%ymm1,%ymm15 196,66,21,168,254, //vfmadd213ps %ymm14,%ymm13,%ymm15 196,193,116,194,202,1, //vcmpltps %ymm10,%ymm1,%ymm1 @@ -2382,27 +2370,27 @@ CODE const uint8_t sk_to_srgb_hsw[] = { 196,65,124,82,224, //vrsqrtps %ymm8,%ymm12 184,41,92,71,65, //mov $0x41475c29,%eax 197,121,110,192, //vmovd %eax,%xmm8 - 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 197,60,89,232, //vmulps %ymm0,%ymm8,%ymm13 184,0,0,128,63, //mov $0x3f800000,%eax 197,121,110,200, //vmovd %eax,%xmm9 - 196,66,125,24,201, //vbroadcastss %xmm9,%ymm9 + 196,66,125,88,201, //vpbroadcastd %xmm9,%ymm9 184,194,135,210,62, //mov $0x3ed287c2,%eax 197,121,110,208, //vmovd %eax,%xmm10 - 196,66,125,24,210, //vbroadcastss %xmm10,%ymm10 + 196,66,125,88,210, //vpbroadcastd %xmm10,%ymm10 184,206,111,48,63, //mov $0x3f306fce,%eax 197,121,110,240, //vmovd %eax,%xmm14 - 196,66,125,24,246, //vbroadcastss %xmm14,%ymm14 + 196,66,125,88,246, //vpbroadcastd %xmm14,%ymm14 184,168,87,202,61, //mov $0x3dca57a8,%eax 53,0,0,0,128, //xor $0x80000000,%eax 197,121,110,248, //vmovd %eax,%xmm15 - 196,66,125,24,255, //vbroadcastss %xmm15,%ymm15 + 196,66,125,88,255, //vpbroadcastd %xmm15,%ymm15 196,66,13,168,223, //vfmadd213ps %ymm15,%ymm14,%ymm11 196,66,45,184,220, //vfmadd231ps %ymm12,%ymm10,%ymm11 196,65,52,93,219, //vminps %ymm11,%ymm9,%ymm11 184,4,231,140,59, //mov $0x3b8ce704,%eax 197,121,110,224, //vmovd %eax,%xmm12 - 196,66,125,24,228, //vbroadcastss %xmm12,%ymm12 + 196,66,125,88,228, //vpbroadcastd %xmm12,%ymm12 196,193,124,194,196,1, //vcmpltps %ymm12,%ymm0,%ymm0 196,195,37,74,197,0, //vblendvps %ymm0,%ymm13,%ymm11,%ymm0 197,124,82,217, //vrsqrtps %ymm1,%ymm11 @@ -2445,12 +2433,12 @@ CODE const uint8_t sk_scale_u8_hsw[] = { 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 117,56, //jne 4bf <_sk_scale_u8_hsw+0x48> - 197,123,16,0, //vmovsd (%rax),%xmm8 + 197,122,126,0, //vmovq (%rax),%xmm8 196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 184,129,128,128,59, //mov $0x3b808081,%eax 197,121,110,200, //vmovd %eax,%xmm9 - 196,66,125,24,201, //vbroadcastss %xmm9,%ymm9 + 196,66,125,88,201, //vpbroadcastd %xmm9,%ymm9 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1 @@ -2495,12 +2483,12 @@ CODE const uint8_t sk_lerp_u8_hsw[] = { 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 117,76, //jne 56f <_sk_lerp_u8_hsw+0x5c> - 197,123,16,0, //vmovsd (%rax),%xmm8 + 197,122,126,0, //vmovq (%rax),%xmm8 196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 184,129,128,128,59, //mov $0x3b808081,%eax 197,121,110,200, //vmovd %eax,%xmm9 - 196,66,125,24,201, //vbroadcastss %xmm9,%ymm9 + 196,66,125,88,201, //vpbroadcastd %xmm9,%ymm9 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0 @@ -2541,7 +2529,7 @@ CODE const uint8_t sk_lerp_565_hsw[] = { 197,124,91,203, //vcvtdq2ps %ymm3,%ymm9 184,8,33,132,55, //mov $0x37842108,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 197,52,89,203, //vmulps %ymm3,%ymm9,%ymm9 184,224,7,0,0, //mov $0x7e0,%eax 197,249,110,216, //vmovd %eax,%xmm3 @@ -2550,7 +2538,7 @@ CODE const uint8_t sk_lerp_565_hsw[] = { 197,124,91,211, //vcvtdq2ps %ymm3,%ymm10 184,33,8,2,58, //mov $0x3a020821,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 197,44,89,211, //vmulps %ymm3,%ymm10,%ymm10 184,31,0,0,0, //mov $0x1f,%eax 197,249,110,216, //vmovd %eax,%xmm3 @@ -2559,7 +2547,7 @@ CODE const uint8_t sk_lerp_565_hsw[] = { 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 184,8,33,4,61, //mov $0x3d042108,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 196,226,53,168,196, //vfmadd213ps %ymm4,%ymm9,%ymm0 @@ -2569,16 +2557,16 @@ CODE const uint8_t sk_lerp_565_hsw[] = { 196,226,101,168,214, //vfmadd213ps %ymm6,%ymm3,%ymm2 184,0,0,128,63, //mov $0x3f800000,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax 65,137,200, //mov %ecx,%r8d 65,128,224,7, //and $0x7,%r8b 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3 65,254,200, //dec %r8b - 69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 15,135,55,255,255,255, //ja 5a8 <_sk_lerp_565_hsw+0x14> + 15,135,59,255,255,255, //ja 5a8 <_sk_lerp_565_hsw+0x14> + 69,15,182,192, //movzbl %r8b,%r8d 76,141,13,76,0,0,0, //lea 0x4c(%rip),%r9 # 6c4 <_sk_lerp_565_hsw+0x130> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax @@ -2597,7 +2585,7 @@ CODE const uint8_t sk_lerp_565_hsw[] = { 255, //(bad) 255, //(bad) 255, //(bad) - 233,255,255,255,225, //jmpq ffffffffe20006cc <_sk_linear_gradient_2stops_hsw+0xffffffffe1fff4e7> + 233,255,255,255,225, //jmpq ffffffffe20006cc <_sk_linear_gradient_2stops_hsw+0xffffffffe1fff4f0> 255, //(bad) 255, //(bad) 255, //(bad) @@ -2645,7 +2633,7 @@ CODE const uint8_t sk_load_tables_hsw[] = { 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 184,129,128,128,59, //mov $0x3b808081,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 72,173, //lods %ds:(%rsi),%rax 76,137,193, //mov %r8,%rcx @@ -2668,12 +2656,12 @@ CODE const uint8_t sk_load_a8_hsw[] = { 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 117,50, //jne 7d9 <_sk_load_a8_hsw+0x42> - 197,251,16,0, //vmovsd (%rax),%xmm0 + 197,250,126,0, //vmovq (%rax),%xmm0 196,226,125,49,192, //vpmovzxbd %xmm0,%ymm0 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 184,129,128,128,59, //mov $0x3b808081,%eax 197,249,110,200, //vmovd %eax,%xmm1 - 196,226,125,24,201, //vbroadcastss %xmm1,%ymm1 + 196,226,125,88,201, //vpbroadcastd %xmm1,%ymm1 197,252,89,217, //vmulps %ymm1,%ymm0,%ymm3 72,173, //lods %ds:(%rsi),%rax 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 @@ -2700,7 +2688,7 @@ CODE const uint8_t sk_store_a8_hsw[] = { 76,139,8, //mov (%rax),%r9 184,0,0,127,67, //mov $0x437f0000,%eax 197,121,110,192, //vmovd %eax,%xmm8 - 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 @@ -2711,16 +2699,16 @@ CODE const uint8_t sk_store_a8_hsw[] = { 196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax - 137,200, //mov %ecx,%eax - 36,7, //and $0x7,%al - 254,200, //dec %al - 68,15,182,192, //movzbl %al,%r8d + 65,137,200, //mov %ecx,%r8d + 65,128,224,7, //and $0x7,%r8b + 65,254,200, //dec %r8b 65,128,248,6, //cmp $0x6,%r8b 119,236, //ja 835 <_sk_store_a8_hsw+0x37> 196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8 - 76,141,21,67,0,0,0, //lea 0x43(%rip),%r10 # 898 <_sk_store_a8_hsw+0x9a> - 75,99,4,130, //movslq (%r10,%r8,4),%rax - 76,1,208, //add %r10,%rax + 65,15,182,192, //movzbl %r8b,%eax + 76,141,5,67,0,0,0, //lea 0x43(%rip),%r8 # 89c <_sk_store_a8_hsw+0x9e> + 73,99,4,128, //movslq (%r8,%rax,4),%rax + 76,1,192, //add %r8,%rax 255,224, //jmpq *%rax 196,67,121,20,68,57,6,12, //vpextrb $0xc,%xmm8,0x6(%r9,%rdi,1) 196,67,121,20,68,57,5,10, //vpextrb $0xa,%xmm8,0x5(%r9,%rdi,1) @@ -2729,7 +2717,7 @@ CODE const uint8_t sk_store_a8_hsw[] = { 196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1) 196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1) 196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1) - 235,158, //jmp 835 <_sk_store_a8_hsw+0x37> + 235,154, //jmp 835 <_sk_store_a8_hsw+0x37> 144, //nop 246,255, //idiv %bh 255, //(bad) @@ -2759,7 +2747,7 @@ CODE const uint8_t sk_load_565_hsw[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,16, //mov (%rax),%r10 72,133,201, //test %rcx,%rcx - 15,133,149,0,0,0, //jne 957 <_sk_load_565_hsw+0xa3> + 15,133,149,0,0,0, //jne 95b <_sk_load_565_hsw+0xa3> 196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0 196,226,125,51,208, //vpmovzxwd %xmm0,%ymm2 184,0,248,0,0, //mov $0xf800,%eax @@ -2769,7 +2757,7 @@ CODE const uint8_t sk_load_565_hsw[] = { 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 184,8,33,132,55, //mov $0x37842108,%eax 197,249,110,200, //vmovd %eax,%xmm1 - 196,226,125,24,201, //vbroadcastss %xmm1,%ymm1 + 196,226,125,88,201, //vpbroadcastd %xmm1,%ymm1 197,252,89,193, //vmulps %ymm1,%ymm0,%ymm0 184,224,7,0,0, //mov $0x7e0,%eax 197,249,110,200, //vmovd %eax,%xmm1 @@ -2778,7 +2766,7 @@ CODE const uint8_t sk_load_565_hsw[] = { 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1 184,33,8,2,58, //mov $0x3a020821,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 197,244,89,203, //vmulps %ymm3,%ymm1,%ymm1 184,31,0,0,0, //mov $0x1f,%eax 197,249,110,216, //vmovd %eax,%xmm3 @@ -2787,21 +2775,21 @@ CODE const uint8_t sk_load_565_hsw[] = { 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 184,8,33,4,61, //mov $0x3d042108,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2 184,0,0,128,63, //mov $0x3f800000,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax 65,137,200, //mov %ecx,%r8d 65,128,224,7, //and $0x7,%r8b 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0 65,254,200, //dec %r8b - 69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 15,135,85,255,255,255, //ja 8c8 <_sk_load_565_hsw+0x14> - 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # 9c4 <_sk_load_565_hsw+0x110> + 15,135,89,255,255,255, //ja 8cc <_sk_load_565_hsw+0x14> + 69,15,182,192, //movzbl %r8b,%r8d + 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # 9c8 <_sk_load_565_hsw+0x110> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax 255,224, //jmpq *%rax @@ -2813,12 +2801,12 @@ CODE const uint8_t sk_load_565_hsw[] = { 196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0 196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0 196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0 - 233,5,255,255,255, //jmpq 8c8 <_sk_load_565_hsw+0x14> + 233,5,255,255,255, //jmpq 8cc <_sk_load_565_hsw+0x14> 144, //nop 243,255, //repz (bad) 255, //(bad) 255, //(bad) - 235,255, //jmp 9c9 <_sk_load_565_hsw+0x115> + 235,255, //jmp 9cd <_sk_load_565_hsw+0x115> 255, //(bad) 255,227, //jmpq *%rbx 255, //(bad) @@ -2844,13 +2832,13 @@ CODE const uint8_t sk_store_565_hsw[] = { 76,139,8, //mov (%rax),%r9 184,0,0,248,65, //mov $0x41f80000,%eax 197,121,110,192, //vmovd %eax,%xmm8 - 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9 196,193,53,114,241,11, //vpslld $0xb,%ymm9,%ymm9 184,0,0,124,66, //mov $0x427c0000,%eax 197,121,110,208, //vmovd %eax,%xmm10 - 196,66,125,24,210, //vbroadcastss %xmm10,%ymm10 + 196,66,125,88,210, //vpbroadcastd %xmm10,%ymm10 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10 196,193,45,114,242,5, //vpslld $0x5,%ymm10,%ymm10 @@ -2861,19 +2849,19 @@ CODE const uint8_t sk_store_565_hsw[] = { 196,67,125,57,193,1, //vextracti128 $0x1,%ymm8,%xmm9 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 72,133,201, //test %rcx,%rcx - 117,10, //jne a4c <_sk_store_565_hsw+0x6c> + 117,10, //jne a50 <_sk_store_565_hsw+0x6c> 196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax - 137,200, //mov %ecx,%eax - 36,7, //and $0x7,%al - 254,200, //dec %al - 68,15,182,192, //movzbl %al,%r8d + 65,137,200, //mov %ecx,%r8d + 65,128,224,7, //and $0x7,%r8b + 65,254,200, //dec %r8b 65,128,248,6, //cmp $0x6,%r8b - 119,236, //ja a48 <_sk_store_565_hsw+0x68> - 76,141,21,69,0,0,0, //lea 0x45(%rip),%r10 # aa8 <_sk_store_565_hsw+0xc8> - 75,99,4,130, //movslq (%r10,%r8,4),%rax - 76,1,208, //add %r10,%rax + 119,236, //ja a4c <_sk_store_565_hsw+0x68> + 65,15,182,192, //movzbl %r8b,%eax + 76,141,5,69,0,0,0, //lea 0x45(%rip),%r8 # ab0 <_sk_store_565_hsw+0xcc> + 73,99,4,128, //movslq (%r8,%rax,4),%rax + 76,1,192, //add %r8,%rax 255,224, //jmpq *%rax 196,67,121,21,68,121,12,6, //vpextrw $0x6,%xmm8,0xc(%r9,%rdi,2) 196,67,121,21,68,121,10,5, //vpextrw $0x5,%xmm8,0xa(%r9,%rdi,2) @@ -2881,10 +2869,9 @@ CODE const uint8_t sk_store_565_hsw[] = { 196,67,121,21,68,121,6,3, //vpextrw $0x3,%xmm8,0x6(%r9,%rdi,2) 196,67,121,21,68,121,4,2, //vpextrw $0x2,%xmm8,0x4(%r9,%rdi,2) 196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2) - 197,121,126,192, //vmovd %xmm8,%eax - 102,65,137,4,121, //mov %ax,(%r9,%rdi,2) - 235,161, //jmp a48 <_sk_store_565_hsw+0x68> - 144, //nop + 196,67,121,21,4,121,0, //vpextrw $0x0,%xmm8,(%r9,%rdi,2) + 235,159, //jmp a4c <_sk_store_565_hsw+0x68> + 15,31,0, //nopl (%rax) 244, //hlt 255, //(bad) 255, //(bad) @@ -2916,7 +2903,7 @@ CODE const uint8_t sk_load_8888_hsw[] = { 76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9 76,3,8, //add (%rax),%r9 77,133,192, //test %r8,%r8 - 117,104, //jne b41 <_sk_load_8888_hsw+0x7d> + 117,104, //jne b49 <_sk_load_8888_hsw+0x7d> 196,193,126,111,25, //vmovdqu (%r9),%ymm3 184,255,0,0,0, //mov $0xff,%eax 197,249,110,192, //vmovd %eax,%xmm0 @@ -2925,7 +2912,7 @@ CODE const uint8_t sk_load_8888_hsw[] = { 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 184,129,128,128,59, //mov $0x3b808081,%eax 197,249,110,200, //vmovd %eax,%xmm1 - 196,98,125,24,193, //vbroadcastss %xmm1,%ymm8 + 196,98,125,88,193, //vpbroadcastd %xmm1,%ymm8 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0 197,245,114,211,8, //vpsrld $0x8,%ymm3,%ymm1 197,237,219,201, //vpand %ymm1,%ymm2,%ymm1 @@ -2949,7 +2936,7 @@ CODE const uint8_t sk_load_8888_hsw[] = { 196,225,249,110,192, //vmovq %rax,%xmm0 196,226,125,33,192, //vpmovsxbd %xmm0,%ymm0 196,194,125,140,25, //vpmaskmovd (%r9),%ymm0,%ymm3 - 233,116,255,255,255, //jmpq ade <_sk_load_8888_hsw+0x1a> + 233,116,255,255,255, //jmpq ae6 <_sk_load_8888_hsw+0x1a> }; CODE const uint8_t sk_store_8888_hsw[] = { @@ -2959,7 +2946,7 @@ CODE const uint8_t sk_store_8888_hsw[] = { 76,3,8, //add (%rax),%r9 184,0,0,127,67, //mov $0x437f0000,%eax 197,121,110,192, //vmovd %eax,%xmm8 - 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9 197,60,89,209, //vmulps %ymm1,%ymm8,%ymm10 @@ -2975,7 +2962,7 @@ CODE const uint8_t sk_store_8888_hsw[] = { 196,65,45,235,192, //vpor %ymm8,%ymm10,%ymm8 196,65,53,235,192, //vpor %ymm8,%ymm9,%ymm8 77,133,192, //test %r8,%r8 - 117,12, //jne bde <_sk_store_8888_hsw+0x74> + 117,12, //jne be6 <_sk_store_8888_hsw+0x74> 196,65,126,127,1, //vmovdqu %ymm8,(%r9) 72,173, //lods %ds:(%rsi),%rax 76,137,193, //mov %r8,%rcx @@ -2988,29 +2975,29 @@ CODE const uint8_t sk_store_8888_hsw[] = { 196,97,249,110,200, //vmovq %rax,%xmm9 196,66,125,33,201, //vpmovsxbd %xmm9,%ymm9 196,66,53,142,1, //vpmaskmovd %ymm8,%ymm9,(%r9) - 235,211, //jmp bd7 <_sk_store_8888_hsw+0x6d> + 235,211, //jmp bdf <_sk_store_8888_hsw+0x6d> }; CODE const uint8_t sk_load_f16_hsw[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax 72,133,201, //test %rcx,%rcx - 117,97, //jne c6f <_sk_load_f16_hsw+0x6b> - 197,249,16,12,248, //vmovupd (%rax,%rdi,8),%xmm1 + 117,97, //jne c77 <_sk_load_f16_hsw+0x6b> + 197,121,16,4,248, //vmovupd (%rax,%rdi,8),%xmm8 197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2 197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3 - 197,121,16,68,248,48, //vmovupd 0x30(%rax,%rdi,8),%xmm8 - 197,241,97,194, //vpunpcklwd %xmm2,%xmm1,%xmm0 - 197,241,105,202, //vpunpckhwd %xmm2,%xmm1,%xmm1 - 196,193,97,97,208, //vpunpcklwd %xmm8,%xmm3,%xmm2 - 196,193,97,105,216, //vpunpckhwd %xmm8,%xmm3,%xmm3 - 197,121,97,193, //vpunpcklwd %xmm1,%xmm0,%xmm8 - 197,121,105,201, //vpunpckhwd %xmm1,%xmm0,%xmm9 - 197,233,97,203, //vpunpcklwd %xmm3,%xmm2,%xmm1 - 197,233,105,219, //vpunpckhwd %xmm3,%xmm2,%xmm3 - 197,185,108,193, //vpunpcklqdq %xmm1,%xmm8,%xmm0 + 197,122,111,76,248,48, //vmovdqu 0x30(%rax,%rdi,8),%xmm9 + 197,185,97,194, //vpunpcklwd %xmm2,%xmm8,%xmm0 + 197,185,105,210, //vpunpckhwd %xmm2,%xmm8,%xmm2 + 196,193,97,97,201, //vpunpcklwd %xmm9,%xmm3,%xmm1 + 196,193,97,105,217, //vpunpckhwd %xmm9,%xmm3,%xmm3 + 197,121,97,194, //vpunpcklwd %xmm2,%xmm0,%xmm8 + 197,121,105,202, //vpunpckhwd %xmm2,%xmm0,%xmm9 + 197,241,97,211, //vpunpcklwd %xmm3,%xmm1,%xmm2 + 197,241,105,219, //vpunpckhwd %xmm3,%xmm1,%xmm3 + 197,185,108,194, //vpunpcklqdq %xmm2,%xmm8,%xmm0 196,226,125,19,192, //vcvtph2ps %xmm0,%ymm0 - 197,185,109,201, //vpunpckhqdq %xmm1,%xmm8,%xmm1 + 197,185,109,202, //vpunpckhqdq %xmm2,%xmm8,%xmm1 196,226,125,19,201, //vcvtph2ps %xmm1,%ymm1 197,177,108,211, //vpunpcklqdq %xmm3,%xmm9,%xmm2 196,226,125,19,210, //vcvtph2ps %xmm2,%ymm2 @@ -3018,38 +3005,32 @@ CODE const uint8_t sk_load_f16_hsw[] = { 196,226,125,19,219, //vcvtph2ps %xmm3,%ymm3 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax - 197,251,16,12,248, //vmovsd (%rax,%rdi,8),%xmm1 - 196,65,57,87,192, //vxorpd %xmm8,%xmm8,%xmm8 + 197,123,16,4,248, //vmovsd (%rax,%rdi,8),%xmm8 + 196,65,49,239,201, //vpxor %xmm9,%xmm9,%xmm9 72,131,249,1, //cmp $0x1,%rcx - 117,6, //jne c85 <_sk_load_f16_hsw+0x81> - 197,250,126,201, //vmovq %xmm1,%xmm1 - 235,30, //jmp ca3 <_sk_load_f16_hsw+0x9f> - 197,241,22,76,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm1,%xmm1 + 116,79, //je cd6 <_sk_load_f16_hsw+0xca> + 197,57,22,68,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm8,%xmm8 72,131,249,3, //cmp $0x3,%rcx - 114,18, //jb ca3 <_sk_load_f16_hsw+0x9f> + 114,67, //jb cd6 <_sk_load_f16_hsw+0xca> 197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2 72,131,249,3, //cmp $0x3,%rcx - 117,19, //jne cb0 <_sk_load_f16_hsw+0xac> - 197,250,126,210, //vmovq %xmm2,%xmm2 - 235,46, //jmp cd1 <_sk_load_f16_hsw+0xcd> - 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 - 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2 - 233,117,255,255,255, //jmpq c25 <_sk_load_f16_hsw+0x21> + 116,68, //je ce3 <_sk_load_f16_hsw+0xd7> 197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2 72,131,249,5, //cmp $0x5,%rcx - 114,21, //jb cd1 <_sk_load_f16_hsw+0xcd> + 114,56, //jb ce3 <_sk_load_f16_hsw+0xd7> 197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3 72,131,249,5, //cmp $0x5,%rcx - 117,18, //jne cda <_sk_load_f16_hsw+0xd6> - 197,250,126,219, //vmovq %xmm3,%xmm3 - 233,84,255,255,255, //jmpq c25 <_sk_load_f16_hsw+0x21> - 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 - 233,75,255,255,255, //jmpq c25 <_sk_load_f16_hsw+0x21> + 15,132,114,255,255,255, //je c2d <_sk_load_f16_hsw+0x21> 197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3 72,131,249,7, //cmp $0x7,%rcx - 15,130,59,255,255,255, //jb c25 <_sk_load_f16_hsw+0x21> - 197,123,16,68,248,48, //vmovsd 0x30(%rax,%rdi,8),%xmm8 - 233,48,255,255,255, //jmpq c25 <_sk_load_f16_hsw+0x21> + 15,130,98,255,255,255, //jb c2d <_sk_load_f16_hsw+0x21> + 197,122,126,76,248,48, //vmovq 0x30(%rax,%rdi,8),%xmm9 + 233,87,255,255,255, //jmpq c2d <_sk_load_f16_hsw+0x21> + 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 + 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2 + 233,74,255,255,255, //jmpq c2d <_sk_load_f16_hsw+0x21> + 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 + 233,65,255,255,255, //jmpq c2d <_sk_load_f16_hsw+0x21> }; CODE const uint8_t sk_store_f16_hsw[] = { @@ -3068,7 +3049,7 @@ CODE const uint8_t sk_store_f16_hsw[] = { 196,65,57,98,205, //vpunpckldq %xmm13,%xmm8,%xmm9 196,65,57,106,197, //vpunpckhdq %xmm13,%xmm8,%xmm8 72,133,201, //test %rcx,%rcx - 117,27, //jne d5a <_sk_store_f16_hsw+0x65> + 117,27, //jne d51 <_sk_store_f16_hsw+0x65> 197,120,17,28,248, //vmovups %xmm11,(%rax,%rdi,8) 197,120,17,84,248,16, //vmovups %xmm10,0x10(%rax,%rdi,8) 197,120,17,76,248,32, //vmovups %xmm9,0x20(%rax,%rdi,8) @@ -3077,22 +3058,22 @@ CODE const uint8_t sk_store_f16_hsw[] = { 255,224, //jmpq *%rax 197,121,214,28,248, //vmovq %xmm11,(%rax,%rdi,8) 72,131,249,1, //cmp $0x1,%rcx - 116,241, //je d56 <_sk_store_f16_hsw+0x61> + 116,241, //je d4d <_sk_store_f16_hsw+0x61> 197,121,23,92,248,8, //vmovhpd %xmm11,0x8(%rax,%rdi,8) 72,131,249,3, //cmp $0x3,%rcx - 114,229, //jb d56 <_sk_store_f16_hsw+0x61> + 114,229, //jb d4d <_sk_store_f16_hsw+0x61> 197,121,214,84,248,16, //vmovq %xmm10,0x10(%rax,%rdi,8) - 116,221, //je d56 <_sk_store_f16_hsw+0x61> + 116,221, //je d4d <_sk_store_f16_hsw+0x61> 197,121,23,84,248,24, //vmovhpd %xmm10,0x18(%rax,%rdi,8) 72,131,249,5, //cmp $0x5,%rcx - 114,209, //jb d56 <_sk_store_f16_hsw+0x61> + 114,209, //jb d4d <_sk_store_f16_hsw+0x61> 197,121,214,76,248,32, //vmovq %xmm9,0x20(%rax,%rdi,8) - 116,201, //je d56 <_sk_store_f16_hsw+0x61> + 116,201, //je d4d <_sk_store_f16_hsw+0x61> 197,121,23,76,248,40, //vmovhpd %xmm9,0x28(%rax,%rdi,8) 72,131,249,7, //cmp $0x7,%rcx - 114,189, //jb d56 <_sk_store_f16_hsw+0x61> + 114,189, //jb d4d <_sk_store_f16_hsw+0x61> 197,121,214,68,248,48, //vmovq %xmm8,0x30(%rax,%rdi,8) - 235,181, //jmp d56 <_sk_store_f16_hsw+0x61> + 235,181, //jmp d4d <_sk_store_f16_hsw+0x61> }; CODE const uint8_t sk_store_f32_hsw[] = { @@ -3108,7 +3089,7 @@ CODE const uint8_t sk_store_f32_hsw[] = { 196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8 196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11 72,133,201, //test %rcx,%rcx - 117,55, //jne e0e <_sk_store_f32_hsw+0x6d> + 117,55, //jne e05 <_sk_store_f32_hsw+0x6d> 196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12 196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13 196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9 @@ -3121,22 +3102,22 @@ CODE const uint8_t sk_store_f32_hsw[] = { 255,224, //jmpq *%rax 196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4) 72,131,249,1, //cmp $0x1,%rcx - 116,240, //je e0a <_sk_store_f32_hsw+0x69> + 116,240, //je e01 <_sk_store_f32_hsw+0x69> 196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4) 72,131,249,3, //cmp $0x3,%rcx - 114,227, //jb e0a <_sk_store_f32_hsw+0x69> + 114,227, //jb e01 <_sk_store_f32_hsw+0x69> 196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4) - 116,218, //je e0a <_sk_store_f32_hsw+0x69> + 116,218, //je e01 <_sk_store_f32_hsw+0x69> 196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4) 72,131,249,5, //cmp $0x5,%rcx - 114,205, //jb e0a <_sk_store_f32_hsw+0x69> + 114,205, //jb e01 <_sk_store_f32_hsw+0x69> 196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4) - 116,195, //je e0a <_sk_store_f32_hsw+0x69> + 116,195, //je e01 <_sk_store_f32_hsw+0x69> 196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4) 72,131,249,7, //cmp $0x7,%rcx - 114,181, //jb e0a <_sk_store_f32_hsw+0x69> + 114,181, //jb e01 <_sk_store_f32_hsw+0x69> 196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4) - 235,171, //jmp e0a <_sk_store_f32_hsw+0x69> + 235,171, //jmp e01 <_sk_store_f32_hsw+0x69> }; CODE const uint8_t sk_clamp_x_hsw[] = { @@ -3234,18 +3215,18 @@ CODE const uint8_t sk_mirror_y_hsw[] = { CODE const uint8_t sk_luminance_to_alpha_hsw[] = { 184,208,179,89,62, //mov $0x3e59b3d0,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,98,125,24,195, //vbroadcastss %xmm3,%ymm8 + 196,98,125,88,195, //vpbroadcastd %xmm3,%ymm8 184,89,23,55,63, //mov $0x3f371759,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1 196,98,125,168,193, //vfmadd213ps %ymm1,%ymm0,%ymm8 184,152,221,147,61, //mov $0x3d93dd98,%eax 197,249,110,192, //vmovd %eax,%xmm0 - 196,226,125,24,216, //vbroadcastss %xmm0,%ymm3 + 196,226,125,88,216, //vpbroadcastd %xmm0,%ymm3 196,194,109,168,216, //vfmadd213ps %ymm8,%ymm2,%ymm3 72,173, //lods %ds:(%rsi),%rax - 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 + 197,253,239,192, //vpxor %ymm0,%ymm0,%ymm0 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 255,224, //jmpq *%rax @@ -3785,7 +3766,7 @@ CODE const uint8_t sk_scale_u8_avx[] = { 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 117,80, //jne 5a2 <_sk_scale_u8_avx+0x60> - 197,123,16,0, //vmovsd (%rax),%xmm8 + 197,122,126,0, //vmovq (%rax),%xmm8 196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9 196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8 196,66,121,49,192, //vpmovzxbd %xmm8,%xmm8 @@ -3843,7 +3824,7 @@ CODE const uint8_t sk_lerp_u8_avx[] = { 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 117,116, //jne 68a <_sk_lerp_u8_avx+0x84> - 197,123,16,0, //vmovsd (%rax),%xmm8 + 197,122,126,0, //vmovq (%rax),%xmm8 196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9 196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8 196,66,121,49,192, //vpmovzxbd %xmm8,%xmm8 @@ -3945,9 +3926,9 @@ CODE const uint8_t sk_lerp_565_avx[] = { 65,128,224,7, //and $0x7,%r8b 196,65,57,239,192, //vpxor %xmm8,%xmm8,%xmm8 65,254,200, //dec %r8b - 69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 15,135,239,254,255,255, //ja 6c6 <_sk_lerp_565_avx+0x14> + 15,135,243,254,255,255, //ja 6c6 <_sk_lerp_565_avx+0x14> + 69,15,182,192, //movzbl %r8b,%r8d 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # 828 <_sk_lerp_565_avx+0x176> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax @@ -4114,9 +4095,9 @@ CODE const uint8_t sk_load_tables_avx[] = { 128,227,7, //and $0x7,%bl 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 254,203, //dec %bl - 15,182,219, //movzbl %bl,%ebx 128,251,6, //cmp $0x6,%bl - 15,135,182,253,255,255, //ja 862 <_sk_load_tables_avx+0x1e> + 15,135,185,253,255,255, //ja 862 <_sk_load_tables_avx+0x1e> + 15,182,219, //movzbl %bl,%ebx 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # b3c <_sk_load_tables_avx+0x2f8> 73,99,28,153, //movslq (%r9,%rbx,4),%rbx 76,1,203, //add %r9,%rbx @@ -4168,7 +4149,7 @@ CODE const uint8_t sk_load_a8_avx[] = { 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 117,74, //jne bb2 <_sk_load_a8_avx+0x5a> - 197,251,16,0, //vmovsd (%rax),%xmm0 + 197,250,126,0, //vmovq (%rax),%xmm0 196,226,121,49,200, //vpmovzxbd %xmm0,%xmm1 196,227,121,4,192,229, //vpermilps $0xe5,%xmm0,%xmm0 196,226,121,49,192, //vpmovzxbd %xmm0,%xmm0 @@ -4216,16 +4197,16 @@ CODE const uint8_t sk_store_a8_avx[] = { 196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax - 137,200, //mov %ecx,%eax - 36,7, //and $0x7,%al - 254,200, //dec %al - 68,15,182,192, //movzbl %al,%r8d + 65,137,200, //mov %ecx,%r8d + 65,128,224,7, //and $0x7,%r8b + 65,254,200, //dec %r8b 65,128,248,6, //cmp $0x6,%r8b 119,236, //ja c15 <_sk_store_a8_avx+0x3e> 196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8 - 76,141,21,67,0,0,0, //lea 0x43(%rip),%r10 # c78 <_sk_store_a8_avx+0xa1> - 75,99,4,130, //movslq (%r10,%r8,4),%rax - 76,1,208, //add %r10,%rax + 65,15,182,192, //movzbl %r8b,%eax + 76,141,5,67,0,0,0, //lea 0x43(%rip),%r8 # c7c <_sk_store_a8_avx+0xa5> + 73,99,4,128, //movslq (%r8,%rax,4),%rax + 76,1,192, //add %r8,%rax 255,224, //jmpq *%rax 196,67,121,20,68,57,6,12, //vpextrb $0xc,%xmm8,0x6(%r9,%rdi,1) 196,67,121,20,68,57,5,10, //vpextrb $0xa,%xmm8,0x5(%r9,%rdi,1) @@ -4234,7 +4215,7 @@ CODE const uint8_t sk_store_a8_avx[] = { 196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1) 196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1) 196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1) - 235,158, //jmp c15 <_sk_store_a8_avx+0x3e> + 235,154, //jmp c15 <_sk_store_a8_avx+0x3e> 144, //nop 246,255, //idiv %bh 255, //(bad) @@ -4264,7 +4245,7 @@ CODE const uint8_t sk_load_565_avx[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,16, //mov (%rax),%r10 72,133,201, //test %rcx,%rcx - 15,133,209,0,0,0, //jne d73 <_sk_load_565_avx+0xdf> + 15,133,209,0,0,0, //jne d77 <_sk_load_565_avx+0xdf> 196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0 197,241,239,201, //vpxor %xmm1,%xmm1,%xmm1 197,249,105,201, //vpunpckhwd %xmm1,%xmm0,%xmm1 @@ -4313,10 +4294,10 @@ CODE const uint8_t sk_load_565_avx[] = { 65,128,224,7, //and $0x7,%r8b 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0 65,254,200, //dec %r8b - 69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 15,135,25,255,255,255, //ja ca8 <_sk_load_565_avx+0x14> - 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # de0 <_sk_load_565_avx+0x14c> + 15,135,29,255,255,255, //ja cac <_sk_load_565_avx+0x14> + 69,15,182,192, //movzbl %r8b,%r8d + 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # de4 <_sk_load_565_avx+0x14c> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax 255,224, //jmpq *%rax @@ -4328,12 +4309,12 @@ CODE const uint8_t sk_load_565_avx[] = { 196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0 196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0 196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0 - 233,201,254,255,255, //jmpq ca8 <_sk_load_565_avx+0x14> + 233,201,254,255,255, //jmpq cac <_sk_load_565_avx+0x14> 144, //nop 243,255, //repz (bad) 255, //(bad) 255, //(bad) - 235,255, //jmp de5 <_sk_load_565_avx+0x151> + 235,255, //jmp de9 <_sk_load_565_avx+0x151> 255, //(bad) 255,227, //jmpq *%rbx 255, //(bad) @@ -4384,19 +4365,19 @@ CODE const uint8_t sk_store_565_avx[] = { 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 72,133,201, //test %rcx,%rcx - 117,10, //jne e9a <_sk_store_565_avx+0x9e> + 117,10, //jne e9e <_sk_store_565_avx+0x9e> 196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax - 137,200, //mov %ecx,%eax - 36,7, //and $0x7,%al - 254,200, //dec %al - 68,15,182,192, //movzbl %al,%r8d + 65,137,200, //mov %ecx,%r8d + 65,128,224,7, //and $0x7,%r8b + 65,254,200, //dec %r8b 65,128,248,6, //cmp $0x6,%r8b - 119,236, //ja e96 <_sk_store_565_avx+0x9a> - 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # ef8 <_sk_store_565_avx+0xfc> - 75,99,4,130, //movslq (%r10,%r8,4),%rax - 76,1,208, //add %r10,%rax + 119,236, //ja e9a <_sk_store_565_avx+0x9a> + 65,15,182,192, //movzbl %r8b,%eax + 76,141,5,67,0,0,0, //lea 0x43(%rip),%r8 # efc <_sk_store_565_avx+0xfc> + 73,99,4,128, //movslq (%r8,%rax,4),%rax + 76,1,192, //add %r8,%rax 255,224, //jmpq *%rax 196,67,121,21,68,121,12,6, //vpextrw $0x6,%xmm8,0xc(%r9,%rdi,2) 196,67,121,21,68,121,10,5, //vpextrw $0x5,%xmm8,0xa(%r9,%rdi,2) @@ -4404,29 +4385,28 @@ CODE const uint8_t sk_store_565_avx[] = { 196,67,121,21,68,121,6,3, //vpextrw $0x3,%xmm8,0x6(%r9,%rdi,2) 196,67,121,21,68,121,4,2, //vpextrw $0x2,%xmm8,0x4(%r9,%rdi,2) 196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2) - 197,121,126,192, //vmovd %xmm8,%eax - 102,65,137,4,121, //mov %ax,(%r9,%rdi,2) - 235,161, //jmp e96 <_sk_store_565_avx+0x9a> - 15,31,0, //nopl (%rax) - 242,255, //repnz (bad) + 196,67,121,21,4,121,0, //vpextrw $0x0,%xmm8,(%r9,%rdi,2) + 235,159, //jmp e9a <_sk_store_565_avx+0x9a> + 144, //nop + 246,255, //idiv %bh 255, //(bad) 255, //(bad) - 234, //(bad) + 238, //out %al,(%dx) 255, //(bad) 255, //(bad) - 255,226, //jmpq *%rdx + 255,230, //jmpq *%rsi 255, //(bad) 255, //(bad) 255, //(bad) - 218,255, //(bad) + 222,255, //fdivrp %st,%st(7) 255, //(bad) - 255,210, //callq *%rdx + 255,214, //callq *%rsi 255, //(bad) 255, //(bad) - 255,202, //dec %edx + 255,206, //dec %esi 255, //(bad) 255, //(bad) - 255,194, //inc %edx + 255,198, //inc %esi 255, //(bad) 255, //(bad) 255, //.byte 0xff @@ -4436,7 +4416,7 @@ CODE const uint8_t sk_load_8888_avx[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,16, //mov (%rax),%r10 72,133,201, //test %rcx,%rcx - 15,133,157,0,0,0, //jne fbf <_sk_load_8888_avx+0xab> + 15,133,157,0,0,0, //jne fc3 <_sk_load_8888_avx+0xab> 196,65,124,16,12,186, //vmovups (%r10,%rdi,4),%ymm9 184,255,0,0,0, //mov $0xff,%eax 197,249,110,192, //vmovd %eax,%xmm0 @@ -4473,10 +4453,10 @@ CODE const uint8_t sk_load_8888_avx[] = { 65,128,224,7, //and $0x7,%r8b 196,65,52,87,201, //vxorps %ymm9,%ymm9,%ymm9 65,254,200, //dec %r8b - 69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 15,135,76,255,255,255, //ja f28 <_sk_load_8888_avx+0x14> - 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # 106c <_sk_load_8888_avx+0x158> + 15,135,80,255,255,255, //ja f2c <_sk_load_8888_avx+0x14> + 69,15,182,192, //movzbl %r8b,%r8d + 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # 1070 <_sk_load_8888_avx+0x158> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax 255,224, //jmpq *%rax @@ -4499,7 +4479,7 @@ CODE const uint8_t sk_load_8888_avx[] = { 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9 196,195,49,34,4,186,0, //vpinsrd $0x0,(%r10,%rdi,4),%xmm9,%xmm0 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9 - 233,188,254,255,255, //jmpq f28 <_sk_load_8888_avx+0x14> + 233,188,254,255,255, //jmpq f2c <_sk_load_8888_avx+0x14> 238, //out %al,(%dx) 255, //(bad) 255, //(bad) @@ -4551,31 +4531,31 @@ CODE const uint8_t sk_store_8888_avx[] = { 196,65,45,86,192, //vorpd %ymm8,%ymm10,%ymm8 196,65,53,86,192, //vorpd %ymm8,%ymm9,%ymm8 72,133,201, //test %rcx,%rcx - 117,10, //jne 112c <_sk_store_8888_avx+0xa4> + 117,10, //jne 1130 <_sk_store_8888_avx+0xa4> 196,65,124,17,4,185, //vmovups %ymm8,(%r9,%rdi,4) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax - 137,200, //mov %ecx,%eax - 36,7, //and $0x7,%al - 254,200, //dec %al - 68,15,182,192, //movzbl %al,%r8d + 65,137,200, //mov %ecx,%r8d + 65,128,224,7, //and $0x7,%r8b + 65,254,200, //dec %r8b 65,128,248,6, //cmp $0x6,%r8b - 119,236, //ja 1128 <_sk_store_8888_avx+0xa0> - 76,141,21,85,0,0,0, //lea 0x55(%rip),%r10 # 1198 <_sk_store_8888_avx+0x110> - 75,99,4,130, //movslq (%r10,%r8,4),%rax - 76,1,208, //add %r10,%rax + 119,236, //ja 112c <_sk_store_8888_avx+0xa0> + 65,15,182,192, //movzbl %r8b,%eax + 76,141,5,85,0,0,0, //lea 0x55(%rip),%r8 # 11a0 <_sk_store_8888_avx+0x114> + 73,99,4,128, //movslq (%r8,%rax,4),%rax + 76,1,192, //add %r8,%rax 255,224, //jmpq *%rax 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 196,67,121,22,76,185,24,2, //vpextrd $0x2,%xmm9,0x18(%r9,%rdi,4) 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 196,67,121,22,76,185,20,1, //vpextrd $0x1,%xmm9,0x14(%r9,%rdi,4) 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 - 196,65,121,126,76,185,16, //vmovd %xmm9,0x10(%r9,%rdi,4) + 196,65,122,17,76,185,16, //vmovss %xmm9,0x10(%r9,%rdi,4) 196,67,121,22,68,185,12,3, //vpextrd $0x3,%xmm8,0xc(%r9,%rdi,4) 196,67,121,22,68,185,8,2, //vpextrd $0x2,%xmm8,0x8(%r9,%rdi,4) 196,67,121,22,68,185,4,1, //vpextrd $0x1,%xmm8,0x4(%r9,%rdi,4) 196,65,121,126,4,185, //vmovd %xmm8,(%r9,%rdi,4) - 235,147, //jmp 1128 <_sk_store_8888_avx+0xa0> + 235,143, //jmp 112c <_sk_store_8888_avx+0xa0> 15,31,0, //nopl (%rax) 245, //cmc 255, //(bad) @@ -4606,93 +4586,87 @@ CODE const uint8_t sk_load_f16_avx[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax 72,133,201, //test %rcx,%rcx - 15,133,8,1,0,0, //jne 12ca <_sk_load_f16_avx+0x116> - 197,249,16,12,248, //vmovupd (%rax,%rdi,8),%xmm1 + 15,133,2,1,0,0, //jne 12cc <_sk_load_f16_avx+0x110> + 197,121,16,4,248, //vmovupd (%rax,%rdi,8),%xmm8 197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2 197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3 - 197,121,16,68,248,48, //vmovupd 0x30(%rax,%rdi,8),%xmm8 - 197,241,97,194, //vpunpcklwd %xmm2,%xmm1,%xmm0 - 197,241,105,202, //vpunpckhwd %xmm2,%xmm1,%xmm1 - 196,193,97,97,208, //vpunpcklwd %xmm8,%xmm3,%xmm2 - 196,193,97,105,216, //vpunpckhwd %xmm8,%xmm3,%xmm3 - 197,121,97,193, //vpunpcklwd %xmm1,%xmm0,%xmm8 - 197,249,105,193, //vpunpckhwd %xmm1,%xmm0,%xmm0 - 197,105,97,211, //vpunpcklwd %xmm3,%xmm2,%xmm10 - 197,105,105,203, //vpunpckhwd %xmm3,%xmm2,%xmm9 + 197,122,111,76,248,48, //vmovdqu 0x30(%rax,%rdi,8),%xmm9 + 197,185,97,194, //vpunpcklwd %xmm2,%xmm8,%xmm0 + 197,185,105,210, //vpunpckhwd %xmm2,%xmm8,%xmm2 + 196,193,97,97,201, //vpunpcklwd %xmm9,%xmm3,%xmm1 + 196,193,97,105,217, //vpunpckhwd %xmm9,%xmm3,%xmm3 + 197,121,97,194, //vpunpcklwd %xmm2,%xmm0,%xmm8 + 197,249,105,194, //vpunpckhwd %xmm2,%xmm0,%xmm0 + 197,241,97,211, //vpunpcklwd %xmm3,%xmm1,%xmm2 + 197,113,105,203, //vpunpckhwd %xmm3,%xmm1,%xmm9 184,0,4,0,4, //mov $0x4000400,%eax 197,249,110,216, //vmovd %eax,%xmm3 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 - 196,193,97,101,208, //vpcmpgtw %xmm8,%xmm3,%xmm2 - 196,193,105,223,208, //vpandn %xmm8,%xmm2,%xmm2 + 196,193,97,101,200, //vpcmpgtw %xmm8,%xmm3,%xmm1 + 196,65,113,223,192, //vpandn %xmm8,%xmm1,%xmm8 197,225,101,200, //vpcmpgtw %xmm0,%xmm3,%xmm1 197,241,223,192, //vpandn %xmm0,%xmm1,%xmm0 - 196,193,97,101,202, //vpcmpgtw %xmm10,%xmm3,%xmm1 - 196,193,113,223,202, //vpandn %xmm10,%xmm1,%xmm1 - 196,193,97,101,217, //vpcmpgtw %xmm9,%xmm3,%xmm3 - 196,193,97,223,217, //vpandn %xmm9,%xmm3,%xmm3 - 196,98,121,51,194, //vpmovzxwd %xmm2,%xmm8 + 197,225,101,202, //vpcmpgtw %xmm2,%xmm3,%xmm1 + 197,241,223,202, //vpandn %xmm2,%xmm1,%xmm1 + 196,193,97,101,209, //vpcmpgtw %xmm9,%xmm3,%xmm2 + 196,193,105,223,209, //vpandn %xmm9,%xmm2,%xmm2 + 196,66,121,51,208, //vpmovzxwd %xmm8,%xmm10 196,98,121,51,201, //vpmovzxwd %xmm1,%xmm9 - 196,65,41,239,210, //vpxor %xmm10,%xmm10,%xmm10 - 196,193,105,105,210, //vpunpckhwd %xmm10,%xmm2,%xmm2 - 196,193,113,105,202, //vpunpckhwd %xmm10,%xmm1,%xmm1 + 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3 + 197,57,105,195, //vpunpckhwd %xmm3,%xmm8,%xmm8 + 197,241,105,203, //vpunpckhwd %xmm3,%xmm1,%xmm1 196,98,121,51,216, //vpmovzxwd %xmm0,%xmm11 - 196,98,121,51,227, //vpmovzxwd %xmm3,%xmm12 - 196,65,121,105,234, //vpunpckhwd %xmm10,%xmm0,%xmm13 - 196,65,97,105,210, //vpunpckhwd %xmm10,%xmm3,%xmm10 - 196,193,121,114,240,13, //vpslld $0xd,%xmm8,%xmm0 - 196,193,97,114,241,13, //vpslld $0xd,%xmm9,%xmm3 - 196,227,125,24,195,1, //vinsertf128 $0x1,%xmm3,%ymm0,%ymm0 + 196,98,121,51,226, //vpmovzxwd %xmm2,%xmm12 + 197,121,105,235, //vpunpckhwd %xmm3,%xmm0,%xmm13 + 197,105,105,243, //vpunpckhwd %xmm3,%xmm2,%xmm14 + 196,193,121,114,242,13, //vpslld $0xd,%xmm10,%xmm0 + 196,193,105,114,241,13, //vpslld $0xd,%xmm9,%xmm2 + 196,227,125,24,194,1, //vinsertf128 $0x1,%xmm2,%ymm0,%ymm0 184,0,0,128,119, //mov $0x77800000,%eax - 197,249,110,216, //vmovd %eax,%xmm3 - 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 - 196,99,101,24,203,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm9 + 197,249,110,208, //vmovd %eax,%xmm2 + 197,249,112,210,0, //vpshufd $0x0,%xmm2,%xmm2 + 196,99,109,24,202,1, //vinsertf128 $0x1,%xmm2,%ymm2,%ymm9 197,180,89,192, //vmulps %ymm0,%ymm9,%ymm0 - 197,233,114,242,13, //vpslld $0xd,%xmm2,%xmm2 + 196,193,105,114,240,13, //vpslld $0xd,%xmm8,%xmm2 197,241,114,241,13, //vpslld $0xd,%xmm1,%xmm1 196,227,109,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm2,%ymm1 197,180,89,201, //vmulps %ymm1,%ymm9,%ymm1 - 196,193,57,114,243,13, //vpslld $0xd,%xmm11,%xmm8 - 196,193,105,114,244,13, //vpslld $0xd,%xmm12,%xmm2 - 196,227,61,24,210,1, //vinsertf128 $0x1,%xmm2,%ymm8,%ymm2 + 196,193,105,114,243,13, //vpslld $0xd,%xmm11,%xmm2 + 196,193,97,114,244,13, //vpslld $0xd,%xmm12,%xmm3 + 196,227,109,24,211,1, //vinsertf128 $0x1,%xmm3,%ymm2,%ymm2 197,180,89,210, //vmulps %ymm2,%ymm9,%ymm2 196,193,57,114,245,13, //vpslld $0xd,%xmm13,%xmm8 - 196,193,97,114,242,13, //vpslld $0xd,%xmm10,%xmm3 + 196,193,97,114,246,13, //vpslld $0xd,%xmm14,%xmm3 196,227,61,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm3 197,180,89,219, //vmulps %ymm3,%ymm9,%ymm3 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax - 197,251,16,12,248, //vmovsd (%rax,%rdi,8),%xmm1 - 196,65,57,87,192, //vxorpd %xmm8,%xmm8,%xmm8 + 197,123,16,4,248, //vmovsd (%rax,%rdi,8),%xmm8 + 196,65,49,239,201, //vpxor %xmm9,%xmm9,%xmm9 72,131,249,1, //cmp $0x1,%rcx - 117,6, //jne 12e0 <_sk_load_f16_avx+0x12c> - 197,250,126,201, //vmovq %xmm1,%xmm1 - 235,30, //jmp 12fe <_sk_load_f16_avx+0x14a> - 197,241,22,76,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm1,%xmm1 + 116,79, //je 132b <_sk_load_f16_avx+0x16f> + 197,57,22,68,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm8,%xmm8 72,131,249,3, //cmp $0x3,%rcx - 114,18, //jb 12fe <_sk_load_f16_avx+0x14a> + 114,67, //jb 132b <_sk_load_f16_avx+0x16f> 197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2 72,131,249,3, //cmp $0x3,%rcx - 117,19, //jne 130b <_sk_load_f16_avx+0x157> - 197,250,126,210, //vmovq %xmm2,%xmm2 - 235,46, //jmp 132c <_sk_load_f16_avx+0x178> - 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 - 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2 - 233,206,254,255,255, //jmpq 11d9 <_sk_load_f16_avx+0x25> + 116,68, //je 1338 <_sk_load_f16_avx+0x17c> 197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2 72,131,249,5, //cmp $0x5,%rcx - 114,21, //jb 132c <_sk_load_f16_avx+0x178> + 114,56, //jb 1338 <_sk_load_f16_avx+0x17c> 197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3 72,131,249,5, //cmp $0x5,%rcx - 117,18, //jne 1335 <_sk_load_f16_avx+0x181> - 197,250,126,219, //vmovq %xmm3,%xmm3 - 233,173,254,255,255, //jmpq 11d9 <_sk_load_f16_avx+0x25> - 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 - 233,164,254,255,255, //jmpq 11d9 <_sk_load_f16_avx+0x25> + 15,132,209,254,255,255, //je 11e1 <_sk_load_f16_avx+0x25> 197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3 72,131,249,7, //cmp $0x7,%rcx - 15,130,148,254,255,255, //jb 11d9 <_sk_load_f16_avx+0x25> - 197,123,16,68,248,48, //vmovsd 0x30(%rax,%rdi,8),%xmm8 - 233,137,254,255,255, //jmpq 11d9 <_sk_load_f16_avx+0x25> + 15,130,193,254,255,255, //jb 11e1 <_sk_load_f16_avx+0x25> + 197,122,126,76,248,48, //vmovq 0x30(%rax,%rdi,8),%xmm9 + 233,182,254,255,255, //jmpq 11e1 <_sk_load_f16_avx+0x25> + 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 + 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2 + 233,169,254,255,255, //jmpq 11e1 <_sk_load_f16_avx+0x25> + 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 + 233,160,254,255,255, //jmpq 11e1 <_sk_load_f16_avx+0x25> }; CODE const uint8_t sk_store_f16_avx[] = { @@ -4731,7 +4705,7 @@ CODE const uint8_t sk_store_f16_avx[] = { 196,65,25,98,205, //vpunpckldq %xmm13,%xmm12,%xmm9 196,65,25,106,197, //vpunpckhdq %xmm13,%xmm12,%xmm8 72,133,201, //test %rcx,%rcx - 117,31, //jne 1426 <_sk_store_f16_avx+0xd6> + 117,31, //jne 1417 <_sk_store_f16_avx+0xd6> 196,65,120,17,28,248, //vmovups %xmm11,(%r8,%rdi,8) 196,65,120,17,84,248,16, //vmovups %xmm10,0x10(%r8,%rdi,8) 196,65,120,17,76,248,32, //vmovups %xmm9,0x20(%r8,%rdi,8) @@ -4740,22 +4714,22 @@ CODE const uint8_t sk_store_f16_avx[] = { 255,224, //jmpq *%rax 196,65,121,214,28,248, //vmovq %xmm11,(%r8,%rdi,8) 72,131,249,1, //cmp $0x1,%rcx - 116,240, //je 1422 <_sk_store_f16_avx+0xd2> + 116,240, //je 1413 <_sk_store_f16_avx+0xd2> 196,65,121,23,92,248,8, //vmovhpd %xmm11,0x8(%r8,%rdi,8) 72,131,249,3, //cmp $0x3,%rcx - 114,227, //jb 1422 <_sk_store_f16_avx+0xd2> + 114,227, //jb 1413 <_sk_store_f16_avx+0xd2> 196,65,121,214,84,248,16, //vmovq %xmm10,0x10(%r8,%rdi,8) - 116,218, //je 1422 <_sk_store_f16_avx+0xd2> + 116,218, //je 1413 <_sk_store_f16_avx+0xd2> 196,65,121,23,84,248,24, //vmovhpd %xmm10,0x18(%r8,%rdi,8) 72,131,249,5, //cmp $0x5,%rcx - 114,205, //jb 1422 <_sk_store_f16_avx+0xd2> + 114,205, //jb 1413 <_sk_store_f16_avx+0xd2> 196,65,121,214,76,248,32, //vmovq %xmm9,0x20(%r8,%rdi,8) - 116,196, //je 1422 <_sk_store_f16_avx+0xd2> + 116,196, //je 1413 <_sk_store_f16_avx+0xd2> 196,65,121,23,76,248,40, //vmovhpd %xmm9,0x28(%r8,%rdi,8) 72,131,249,7, //cmp $0x7,%rcx - 114,183, //jb 1422 <_sk_store_f16_avx+0xd2> + 114,183, //jb 1413 <_sk_store_f16_avx+0xd2> 196,65,121,214,68,248,48, //vmovq %xmm8,0x30(%r8,%rdi,8) - 235,174, //jmp 1422 <_sk_store_f16_avx+0xd2> + 235,174, //jmp 1413 <_sk_store_f16_avx+0xd2> }; CODE const uint8_t sk_store_f32_avx[] = { @@ -4771,7 +4745,7 @@ CODE const uint8_t sk_store_f32_avx[] = { 196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8 196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11 72,133,201, //test %rcx,%rcx - 117,55, //jne 14e1 <_sk_store_f32_avx+0x6d> + 117,55, //jne 14d2 <_sk_store_f32_avx+0x6d> 196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12 196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13 196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9 @@ -4784,22 +4758,22 @@ CODE const uint8_t sk_store_f32_avx[] = { 255,224, //jmpq *%rax 196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4) 72,131,249,1, //cmp $0x1,%rcx - 116,240, //je 14dd <_sk_store_f32_avx+0x69> + 116,240, //je 14ce <_sk_store_f32_avx+0x69> 196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4) 72,131,249,3, //cmp $0x3,%rcx - 114,227, //jb 14dd <_sk_store_f32_avx+0x69> + 114,227, //jb 14ce <_sk_store_f32_avx+0x69> 196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4) - 116,218, //je 14dd <_sk_store_f32_avx+0x69> + 116,218, //je 14ce <_sk_store_f32_avx+0x69> 196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4) 72,131,249,5, //cmp $0x5,%rcx - 114,205, //jb 14dd <_sk_store_f32_avx+0x69> + 114,205, //jb 14ce <_sk_store_f32_avx+0x69> 196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4) - 116,195, //je 14dd <_sk_store_f32_avx+0x69> + 116,195, //je 14ce <_sk_store_f32_avx+0x69> 196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4) 72,131,249,7, //cmp $0x7,%rcx - 114,181, //jb 14dd <_sk_store_f32_avx+0x69> + 114,181, //jb 14ce <_sk_store_f32_avx+0x69> 196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4) - 235,171, //jmp 14dd <_sk_store_f32_avx+0x69> + 235,171, //jmp 14ce <_sk_store_f32_avx+0x69> }; CODE const uint8_t sk_clamp_x_avx[] = { @@ -4868,7 +4842,7 @@ CODE const uint8_t sk_repeat_y_avx[] = { CODE const uint8_t sk_mirror_x_avx[] = { 72,173, //lods %ds:(%rsi),%rax - 197,122,16,0, //vmovss (%rax),%xmm8 + 197,121,110,0, //vmovd (%rax),%xmm8 196,65,121,112,200,0, //vpshufd $0x0,%xmm8,%xmm9 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9 196,65,124,92,209, //vsubps %ymm9,%ymm0,%ymm10 @@ -4895,7 +4869,7 @@ CODE const uint8_t sk_mirror_x_avx[] = { CODE const uint8_t sk_mirror_y_avx[] = { 72,173, //lods %ds:(%rsi),%rax - 197,122,16,0, //vmovss (%rax),%xmm8 + 197,121,110,0, //vmovd (%rax),%xmm8 196,65,121,112,200,0, //vpshufd $0x0,%xmm8,%xmm9 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9 196,65,116,92,209, //vsubps %ymm9,%ymm1,%ymm10 @@ -5358,20 +5332,17 @@ CODE const uint8_t sk_premul_sse41[] = { }; CODE const uint8_t sk_unpremul_sse41[] = { - 68,15,40,192, //movaps %xmm0,%xmm8 - 69,15,87,201, //xorps %xmm9,%xmm9 + 69,15,87,192, //xorps %xmm8,%xmm8 184,0,0,128,63, //mov $0x3f800000,%eax - 102,68,15,110,208, //movd %eax,%xmm10 - 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 - 68,15,94,211, //divps %xmm3,%xmm10 - 15,40,195, //movaps %xmm3,%xmm0 - 65,15,194,193,0, //cmpeqps %xmm9,%xmm0 - 102,69,15,56,20,209, //blendvps %xmm0,%xmm9,%xmm10 - 69,15,89,194, //mulps %xmm10,%xmm8 - 65,15,89,202, //mulps %xmm10,%xmm1 - 65,15,89,210, //mulps %xmm10,%xmm2 + 102,68,15,110,200, //movd %eax,%xmm9 + 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 + 68,15,94,203, //divps %xmm3,%xmm9 + 68,15,194,195,4, //cmpneqps %xmm3,%xmm8 + 69,15,84,193, //andps %xmm9,%xmm8 + 65,15,89,192, //mulps %xmm8,%xmm0 + 65,15,89,200, //mulps %xmm8,%xmm1 + 65,15,89,208, //mulps %xmm8,%xmm2 72,173, //lods %ds:(%rsi),%rax - 65,15,40,192, //movaps %xmm8,%xmm0 255,224, //jmpq *%rax }; @@ -5656,31 +5627,33 @@ CODE const uint8_t sk_load_tables_sse41[] = { 102,67,15,58,33,4,144,16, //insertps $0x10,(%r8,%r10,4),%xmm0 102,67,15,58,33,4,136,32, //insertps $0x20,(%r8,%r9,4),%xmm0 102,65,15,58,33,4,136,48, //insertps $0x30,(%r8,%rcx,4),%xmm0 - 72,139,72,16, //mov 0x10(%rax),%rcx - 102,73,15,58,22,200,1, //pextrq $0x1,%xmm1,%r8 - 69,137,193, //mov %r8d,%r9d - 73,193,232,32, //shr $0x20,%r8 - 102,73,15,126,202, //movq %xmm1,%r10 - 69,137,211, //mov %r10d,%r11d - 73,193,234,32, //shr $0x20,%r10 - 243,66,15,16,12,153, //movss (%rcx,%r11,4),%xmm1 - 102,66,15,58,33,12,145,16, //insertps $0x10,(%rcx,%r10,4),%xmm1 - 243,66,15,16,28,137, //movss (%rcx,%r9,4),%xmm3 + 76,139,64,16, //mov 0x10(%rax),%r8 + 102,73,15,58,22,202,1, //pextrq $0x1,%xmm1,%r10 + 77,137,209, //mov %r10,%r9 + 73,193,233,32, //shr $0x20,%r9 + 102,72,15,126,201, //movq %xmm1,%rcx + 65,137,203, //mov %ecx,%r11d + 65,129,227,255,255,255,0, //and $0xffffff,%r11d + 72,193,233,30, //shr $0x1e,%rcx + 65,129,226,255,255,255,0, //and $0xffffff,%r10d + 243,67,15,16,12,152, //movss (%r8,%r11,4),%xmm1 + 102,65,15,58,33,12,8,16, //insertps $0x10,(%r8,%rcx,1),%xmm1 + 243,67,15,16,28,144, //movss (%r8,%r10,4),%xmm3 102,15,58,33,203,32, //insertps $0x20,%xmm3,%xmm1 - 243,66,15,16,28,129, //movss (%rcx,%r8,4),%xmm3 + 243,67,15,16,28,136, //movss (%r8,%r9,4),%xmm3 102,15,58,33,203,48, //insertps $0x30,%xmm3,%xmm1 - 72,139,64,24, //mov 0x18(%rax),%rax + 76,139,72,24, //mov 0x18(%rax),%r9 102,72,15,58,22,209,1, //pextrq $0x1,%xmm2,%rcx - 65,137,200, //mov %ecx,%r8d + 68,15,183,193, //movzwl %cx,%r8d 72,193,233,32, //shr $0x20,%rcx - 102,73,15,126,209, //movq %xmm2,%r9 - 69,137,202, //mov %r9d,%r10d - 73,193,233,32, //shr $0x20,%r9 - 243,66,15,16,20,144, //movss (%rax,%r10,4),%xmm2 - 102,66,15,58,33,20,136,16, //insertps $0x10,(%rax,%r9,4),%xmm2 - 243,66,15,16,28,128, //movss (%rax,%r8,4),%xmm3 + 102,72,15,126,208, //movq %xmm2,%rax + 68,15,183,208, //movzwl %ax,%r10d + 72,193,232,30, //shr $0x1e,%rax + 243,67,15,16,20,145, //movss (%r9,%r10,4),%xmm2 + 102,65,15,58,33,20,1,16, //insertps $0x10,(%r9,%rax,1),%xmm2 + 243,67,15,16,28,129, //movss (%r9,%r8,4),%xmm3 102,15,58,33,211,32, //insertps $0x20,%xmm3,%xmm2 - 243,15,16,28,136, //movss (%rax,%rcx,4),%xmm3 + 243,65,15,16,28,137, //movss (%r9,%rcx,4),%xmm3 102,15,58,33,211,48, //insertps $0x30,%xmm3,%xmm2 102,65,15,114,208,24, //psrld $0x18,%xmm8 69,15,91,192, //cvtdq2ps %xmm8,%xmm8 @@ -5929,14 +5902,14 @@ CODE const uint8_t sk_store_f32_sse41[] = { 68,15,21,211, //unpckhps %xmm3,%xmm10 69,15,40,225, //movaps %xmm9,%xmm12 102,69,15,20,227, //unpcklpd %xmm11,%xmm12 - 102,69,15,21,203, //unpckhpd %xmm11,%xmm9 - 69,15,40,216, //movaps %xmm8,%xmm11 - 102,69,15,20,218, //unpcklpd %xmm10,%xmm11 - 102,69,15,21,194, //unpckhpd %xmm10,%xmm8 + 69,15,18,217, //movhlps %xmm9,%xmm11 + 69,15,40,200, //movaps %xmm8,%xmm9 + 102,69,15,20,202, //unpcklpd %xmm10,%xmm9 + 69,15,18,208, //movhlps %xmm8,%xmm10 102,68,15,17,36,8, //movupd %xmm12,(%rax,%rcx,1) - 102,68,15,17,76,8,16, //movupd %xmm9,0x10(%rax,%rcx,1) - 102,68,15,17,92,8,32, //movupd %xmm11,0x20(%rax,%rcx,1) - 102,68,15,17,68,8,48, //movupd %xmm8,0x30(%rax,%rcx,1) + 68,15,17,92,8,16, //movups %xmm11,0x10(%rax,%rcx,1) + 102,68,15,17,76,8,32, //movupd %xmm9,0x20(%rax,%rcx,1) + 68,15,17,84,8,48, //movups %xmm10,0x30(%rax,%rcx,1) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax }; @@ -6545,12 +6518,12 @@ CODE const uint8_t sk_premul_sse2[] = { CODE const uint8_t sk_unpremul_sse2[] = { 69,15,87,192, //xorps %xmm8,%xmm8 - 68,15,194,195,0, //cmpeqps %xmm3,%xmm8 184,0,0,128,63, //mov $0x3f800000,%eax 102,68,15,110,200, //movd %eax,%xmm9 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 68,15,94,203, //divps %xmm3,%xmm9 - 69,15,85,193, //andnps %xmm9,%xmm8 + 68,15,194,195,4, //cmpneqps %xmm3,%xmm8 + 69,15,84,193, //andps %xmm9,%xmm8 65,15,89,192, //mulps %xmm8,%xmm0 65,15,89,200, //mulps %xmm8,%xmm1 65,15,89,208, //mulps %xmm8,%xmm2 @@ -6840,34 +6813,36 @@ CODE const uint8_t sk_load_tables_sse2[] = { 243,67,15,16,12,136, //movss (%r8,%r9,4),%xmm1 15,20,193, //unpcklps %xmm1,%xmm0 15,20,195, //unpcklps %xmm3,%xmm0 - 72,139,72,16, //mov 0x10(%rax),%rcx + 76,139,64,16, //mov 0x10(%rax),%r8 102,65,15,112,201,78, //pshufd $0x4e,%xmm9,%xmm1 - 102,73,15,126,200, //movq %xmm1,%r8 - 69,137,193, //mov %r8d,%r9d - 73,193,232,32, //shr $0x20,%r8 - 102,77,15,126,202, //movq %xmm9,%r10 - 69,137,211, //mov %r10d,%r11d - 73,193,234,32, //shr $0x20,%r10 - 243,66,15,16,28,145, //movss (%rcx,%r10,4),%xmm3 - 243,66,15,16,12,129, //movss (%rcx,%r8,4),%xmm1 + 102,73,15,126,202, //movq %xmm1,%r10 + 77,137,209, //mov %r10,%r9 + 73,193,233,32, //shr $0x20,%r9 + 102,76,15,126,201, //movq %xmm9,%rcx + 65,137,203, //mov %ecx,%r11d + 65,129,227,255,255,255,0, //and $0xffffff,%r11d + 72,193,233,30, //shr $0x1e,%rcx + 65,129,226,255,255,255,0, //and $0xffffff,%r10d + 243,65,15,16,28,8, //movss (%r8,%rcx,1),%xmm3 + 243,67,15,16,12,136, //movss (%r8,%r9,4),%xmm1 15,20,217, //unpcklps %xmm1,%xmm3 - 243,66,15,16,12,153, //movss (%rcx,%r11,4),%xmm1 - 243,66,15,16,20,137, //movss (%rcx,%r9,4),%xmm2 + 243,67,15,16,12,152, //movss (%r8,%r11,4),%xmm1 + 243,67,15,16,20,144, //movss (%r8,%r10,4),%xmm2 15,20,202, //unpcklps %xmm2,%xmm1 15,20,203, //unpcklps %xmm3,%xmm1 - 72,139,64,24, //mov 0x18(%rax),%rax + 76,139,72,24, //mov 0x18(%rax),%r9 102,65,15,112,210,78, //pshufd $0x4e,%xmm10,%xmm2 102,72,15,126,209, //movq %xmm2,%rcx - 65,137,200, //mov %ecx,%r8d + 68,15,183,193, //movzwl %cx,%r8d 72,193,233,32, //shr $0x20,%rcx - 102,77,15,126,209, //movq %xmm10,%r9 - 69,137,202, //mov %r9d,%r10d - 73,193,233,32, //shr $0x20,%r9 - 243,70,15,16,12,136, //movss (%rax,%r9,4),%xmm9 - 243,15,16,20,136, //movss (%rax,%rcx,4),%xmm2 + 102,76,15,126,208, //movq %xmm10,%rax + 68,15,183,208, //movzwl %ax,%r10d + 72,193,232,30, //shr $0x1e,%rax + 243,69,15,16,12,1, //movss (%r9,%rax,1),%xmm9 + 243,65,15,16,20,137, //movss (%r9,%rcx,4),%xmm2 68,15,20,202, //unpcklps %xmm2,%xmm9 - 243,66,15,16,20,144, //movss (%rax,%r10,4),%xmm2 - 243,66,15,16,28,128, //movss (%rax,%r8,4),%xmm3 + 243,67,15,16,20,145, //movss (%r9,%r10,4),%xmm2 + 243,67,15,16,28,129, //movss (%r9,%r8,4),%xmm3 15,20,211, //unpcklps %xmm3,%xmm2 65,15,20,209, //unpcklps %xmm9,%xmm2 102,65,15,114,208,24, //psrld $0x18,%xmm8 @@ -7128,14 +7103,14 @@ CODE const uint8_t sk_store_f32_sse2[] = { 68,15,21,211, //unpckhps %xmm3,%xmm10 69,15,40,225, //movaps %xmm9,%xmm12 102,69,15,20,227, //unpcklpd %xmm11,%xmm12 - 102,69,15,21,203, //unpckhpd %xmm11,%xmm9 - 69,15,40,216, //movaps %xmm8,%xmm11 - 102,69,15,20,218, //unpcklpd %xmm10,%xmm11 - 102,69,15,21,194, //unpckhpd %xmm10,%xmm8 + 69,15,18,217, //movhlps %xmm9,%xmm11 + 69,15,40,200, //movaps %xmm8,%xmm9 + 102,69,15,20,202, //unpcklpd %xmm10,%xmm9 + 69,15,18,208, //movhlps %xmm8,%xmm10 102,68,15,17,36,8, //movupd %xmm12,(%rax,%rcx,1) - 102,68,15,17,76,8,16, //movupd %xmm9,0x10(%rax,%rcx,1) - 102,68,15,17,92,8,32, //movupd %xmm11,0x20(%rax,%rcx,1) - 102,68,15,17,68,8,48, //movupd %xmm8,0x30(%rax,%rcx,1) + 68,15,17,92,8,16, //movups %xmm11,0x10(%rax,%rcx,1) + 102,68,15,17,76,8,32, //movupd %xmm9,0x20(%rax,%rcx,1) + 68,15,17,84,8,48, //movups %xmm10,0x30(%rax,%rcx,1) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax }; @@ -7625,11 +7600,11 @@ CODE const uint8_t sk_just_return_hsw[] = { CODE const uint8_t sk_seed_shader_hsw[] = { 72,173, //lods %ds:(%rsi),%rax 197,249,110,199, //vmovd %edi,%xmm0 - 196,226,125,24,192, //vbroadcastss %xmm0,%ymm0 + 196,226,125,88,192, //vpbroadcastd %xmm0,%ymm0 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 65,184,0,0,0,63, //mov $0x3f000000,%r8d 196,193,121,110,200, //vmovd %r8d,%xmm1 - 196,226,125,24,201, //vbroadcastss %xmm1,%ymm1 + 196,226,125,88,201, //vpbroadcastd %xmm1,%ymm1 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0 197,252,88,2, //vaddps (%rdx),%ymm0,%ymm0 196,226,125,24,16, //vbroadcastss (%rax),%ymm2 @@ -7637,7 +7612,7 @@ CODE const uint8_t sk_seed_shader_hsw[] = { 197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1 184,0,0,128,63, //mov $0x3f800000,%eax 197,249,110,208, //vmovd %eax,%xmm2 - 196,226,125,24,210, //vbroadcastss %xmm2,%ymm2 + 196,226,125,88,210, //vpbroadcastd %xmm2,%ymm2 72,173, //lods %ds:(%rsi),%rax 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4 @@ -7678,7 +7653,7 @@ CODE const uint8_t sk_plus__hsw[] = { CODE const uint8_t sk_srcover_hsw[] = { 184,0,0,128,63, //mov $0x3f800000,%eax 197,121,110,192, //vmovd %eax,%xmm8 - 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 197,60,92,195, //vsubps %ymm3,%ymm8,%ymm8 196,194,93,184,192, //vfmadd231ps %ymm8,%ymm4,%ymm0 196,194,85,184,200, //vfmadd231ps %ymm8,%ymm5,%ymm1 @@ -7691,7 +7666,7 @@ CODE const uint8_t sk_srcover_hsw[] = { CODE const uint8_t sk_dstover_hsw[] = { 184,0,0,128,63, //mov $0x3f800000,%eax 197,121,110,192, //vmovd %eax,%xmm8 - 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 197,60,92,199, //vsubps %ymm7,%ymm8,%ymm8 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0 196,226,61,168,205, //vfmadd213ps %ymm5,%ymm8,%ymm1 @@ -7714,7 +7689,7 @@ CODE const uint8_t sk_clamp_0_hsw[] = { CODE const uint8_t sk_clamp_1_hsw[] = { 184,0,0,128,63, //mov $0x3f800000,%eax 197,121,110,192, //vmovd %eax,%xmm8 - 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0 196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1 196,193,108,93,208, //vminps %ymm8,%ymm2,%ymm2 @@ -7726,7 +7701,7 @@ CODE const uint8_t sk_clamp_1_hsw[] = { CODE const uint8_t sk_clamp_a_hsw[] = { 184,0,0,128,63, //mov $0x3f800000,%eax 197,121,110,192, //vmovd %eax,%xmm8 - 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3 197,252,93,195, //vminps %ymm3,%ymm0,%ymm0 197,244,93,203, //vminps %ymm3,%ymm1,%ymm1 @@ -7800,7 +7775,7 @@ CODE const uint8_t sk_unpremul_hsw[] = { 196,65,100,194,200,0, //vcmpeqps %ymm8,%ymm3,%ymm9 184,0,0,128,63, //mov $0x3f800000,%eax 197,121,110,208, //vmovd %eax,%xmm10 - 196,66,125,24,210, //vbroadcastss %xmm10,%ymm10 + 196,66,125,88,210, //vpbroadcastd %xmm10,%ymm10 197,44,94,211, //vdivps %ymm3,%ymm10,%ymm10 196,67,45,74,192,144, //vblendvps %ymm9,%ymm8,%ymm10,%ymm8 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 @@ -7813,29 +7788,29 @@ CODE const uint8_t sk_unpremul_hsw[] = { CODE const uint8_t sk_from_srgb_hsw[] = { 184,145,131,158,61, //mov $0x3d9e8391,%eax 197,121,110,192, //vmovd %eax,%xmm8 - 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 197,124,89,208, //vmulps %ymm0,%ymm0,%ymm10 184,154,153,153,62, //mov $0x3e99999a,%eax 197,121,110,216, //vmovd %eax,%xmm11 - 196,66,125,24,219, //vbroadcastss %xmm11,%ymm11 + 196,66,125,88,219, //vpbroadcastd %xmm11,%ymm11 184,92,143,50,63, //mov $0x3f328f5c,%eax 197,121,110,224, //vmovd %eax,%xmm12 - 196,66,125,24,228, //vbroadcastss %xmm12,%ymm12 - 196,65,124,40,235, //vmovaps %ymm11,%ymm13 + 196,66,125,88,228, //vpbroadcastd %xmm12,%ymm12 + 196,65,125,111,235, //vmovdqa %ymm11,%ymm13 196,66,125,168,236, //vfmadd213ps %ymm12,%ymm0,%ymm13 184,10,215,35,59, //mov $0x3b23d70a,%eax 197,121,110,240, //vmovd %eax,%xmm14 - 196,66,125,24,246, //vbroadcastss %xmm14,%ymm14 + 196,66,125,88,246, //vpbroadcastd %xmm14,%ymm14 196,66,45,168,238, //vfmadd213ps %ymm14,%ymm10,%ymm13 184,174,71,97,61, //mov $0x3d6147ae,%eax 197,121,110,208, //vmovd %eax,%xmm10 - 196,66,125,24,210, //vbroadcastss %xmm10,%ymm10 + 196,66,125,88,210, //vpbroadcastd %xmm10,%ymm10 196,193,124,194,194,1, //vcmpltps %ymm10,%ymm0,%ymm0 196,195,21,74,193,0, //vblendvps %ymm0,%ymm9,%ymm13,%ymm0 197,60,89,201, //vmulps %ymm1,%ymm8,%ymm9 197,116,89,233, //vmulps %ymm1,%ymm1,%ymm13 - 196,65,124,40,251, //vmovaps %ymm11,%ymm15 + 196,65,125,111,251, //vmovdqa %ymm11,%ymm15 196,66,117,168,252, //vfmadd213ps %ymm12,%ymm1,%ymm15 196,66,21,168,254, //vfmadd213ps %ymm14,%ymm13,%ymm15 196,193,116,194,202,1, //vcmpltps %ymm10,%ymm1,%ymm1 @@ -7856,27 +7831,27 @@ CODE const uint8_t sk_to_srgb_hsw[] = { 196,65,124,82,224, //vrsqrtps %ymm8,%ymm12 184,41,92,71,65, //mov $0x41475c29,%eax 197,121,110,192, //vmovd %eax,%xmm8 - 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 197,60,89,232, //vmulps %ymm0,%ymm8,%ymm13 184,0,0,128,63, //mov $0x3f800000,%eax 197,121,110,200, //vmovd %eax,%xmm9 - 196,66,125,24,201, //vbroadcastss %xmm9,%ymm9 + 196,66,125,88,201, //vpbroadcastd %xmm9,%ymm9 184,194,135,210,62, //mov $0x3ed287c2,%eax 197,121,110,208, //vmovd %eax,%xmm10 - 196,66,125,24,210, //vbroadcastss %xmm10,%ymm10 + 196,66,125,88,210, //vpbroadcastd %xmm10,%ymm10 184,206,111,48,63, //mov $0x3f306fce,%eax 197,121,110,240, //vmovd %eax,%xmm14 - 196,66,125,24,246, //vbroadcastss %xmm14,%ymm14 + 196,66,125,88,246, //vpbroadcastd %xmm14,%ymm14 184,168,87,202,61, //mov $0x3dca57a8,%eax 53,0,0,0,128, //xor $0x80000000,%eax 197,121,110,248, //vmovd %eax,%xmm15 - 196,66,125,24,255, //vbroadcastss %xmm15,%ymm15 + 196,66,125,88,255, //vpbroadcastd %xmm15,%ymm15 196,66,13,168,223, //vfmadd213ps %ymm15,%ymm14,%ymm11 196,66,45,184,220, //vfmadd231ps %ymm12,%ymm10,%ymm11 196,65,52,93,219, //vminps %ymm11,%ymm9,%ymm11 184,4,231,140,59, //mov $0x3b8ce704,%eax 197,121,110,224, //vmovd %eax,%xmm12 - 196,66,125,24,228, //vbroadcastss %xmm12,%ymm12 + 196,66,125,88,228, //vpbroadcastd %xmm12,%ymm12 196,193,124,194,196,1, //vcmpltps %ymm12,%ymm0,%ymm0 196,195,37,74,197,0, //vblendvps %ymm0,%ymm13,%ymm11,%ymm0 197,124,82,217, //vrsqrtps %ymm1,%ymm11 @@ -7919,12 +7894,12 @@ CODE const uint8_t sk_scale_u8_hsw[] = { 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 117,56, //jne 556 <_sk_scale_u8_hsw+0x48> - 197,123,16,0, //vmovsd (%rax),%xmm8 + 197,122,126,0, //vmovq (%rax),%xmm8 196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 184,129,128,128,59, //mov $0x3b808081,%eax 197,121,110,200, //vmovd %eax,%xmm9 - 196,66,125,24,201, //vbroadcastss %xmm9,%ymm9 + 196,66,125,88,201, //vpbroadcastd %xmm9,%ymm9 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1 @@ -7969,12 +7944,12 @@ CODE const uint8_t sk_lerp_u8_hsw[] = { 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 117,76, //jne 606 <_sk_lerp_u8_hsw+0x5c> - 197,123,16,0, //vmovsd (%rax),%xmm8 + 197,122,126,0, //vmovq (%rax),%xmm8 196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 184,129,128,128,59, //mov $0x3b808081,%eax 197,121,110,200, //vmovd %eax,%xmm9 - 196,66,125,24,201, //vbroadcastss %xmm9,%ymm9 + 196,66,125,88,201, //vpbroadcastd %xmm9,%ymm9 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0 @@ -8015,7 +7990,7 @@ CODE const uint8_t sk_lerp_565_hsw[] = { 197,124,91,203, //vcvtdq2ps %ymm3,%ymm9 184,8,33,132,55, //mov $0x37842108,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 197,52,89,203, //vmulps %ymm3,%ymm9,%ymm9 184,224,7,0,0, //mov $0x7e0,%eax 197,249,110,216, //vmovd %eax,%xmm3 @@ -8024,7 +7999,7 @@ CODE const uint8_t sk_lerp_565_hsw[] = { 197,124,91,211, //vcvtdq2ps %ymm3,%ymm10 184,33,8,2,58, //mov $0x3a020821,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 197,44,89,211, //vmulps %ymm3,%ymm10,%ymm10 184,31,0,0,0, //mov $0x1f,%eax 197,249,110,216, //vmovd %eax,%xmm3 @@ -8033,7 +8008,7 @@ CODE const uint8_t sk_lerp_565_hsw[] = { 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 184,8,33,4,61, //mov $0x3d042108,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 196,226,53,168,196, //vfmadd213ps %ymm4,%ymm9,%ymm0 @@ -8043,16 +8018,16 @@ CODE const uint8_t sk_lerp_565_hsw[] = { 196,226,101,168,214, //vfmadd213ps %ymm6,%ymm3,%ymm2 184,0,0,128,63, //mov $0x3f800000,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax 65,137,200, //mov %ecx,%r8d 65,128,224,7, //and $0x7,%r8b 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3 65,254,200, //dec %r8b - 69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 15,135,55,255,255,255, //ja 63f <_sk_lerp_565_hsw+0x14> + 15,135,59,255,255,255, //ja 63f <_sk_lerp_565_hsw+0x14> + 69,15,182,192, //movzbl %r8b,%r8d 76,141,13,73,0,0,0, //lea 0x49(%rip),%r9 # 758 <_sk_lerp_565_hsw+0x12d> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax @@ -8120,7 +8095,7 @@ CODE const uint8_t sk_load_tables_hsw[] = { 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 184,129,128,128,59, //mov $0x3b808081,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 72,173, //lods %ds:(%rsi),%rax 76,137,193, //mov %r8,%rcx @@ -8143,12 +8118,12 @@ CODE const uint8_t sk_load_a8_hsw[] = { 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 117,50, //jne 86d <_sk_load_a8_hsw+0x42> - 197,251,16,0, //vmovsd (%rax),%xmm0 + 197,250,126,0, //vmovq (%rax),%xmm0 196,226,125,49,192, //vpmovzxbd %xmm0,%ymm0 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 184,129,128,128,59, //mov $0x3b808081,%eax 197,249,110,200, //vmovd %eax,%xmm1 - 196,226,125,24,201, //vbroadcastss %xmm1,%ymm1 + 196,226,125,88,201, //vpbroadcastd %xmm1,%ymm1 197,252,89,217, //vmulps %ymm1,%ymm0,%ymm3 72,173, //lods %ds:(%rsi),%rax 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 @@ -8175,7 +8150,7 @@ CODE const uint8_t sk_store_a8_hsw[] = { 76,139,8, //mov (%rax),%r9 184,0,0,127,67, //mov $0x437f0000,%eax 197,121,110,192, //vmovd %eax,%xmm8 - 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 @@ -8186,16 +8161,16 @@ CODE const uint8_t sk_store_a8_hsw[] = { 196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax - 137,200, //mov %ecx,%eax - 36,7, //and $0x7,%al - 254,200, //dec %al - 68,15,182,192, //movzbl %al,%r8d + 65,137,200, //mov %ecx,%r8d + 65,128,224,7, //and $0x7,%r8b + 65,254,200, //dec %r8b 65,128,248,6, //cmp $0x6,%r8b 119,236, //ja 8c9 <_sk_store_a8_hsw+0x37> 196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8 - 76,141,21,67,0,0,0, //lea 0x43(%rip),%r10 # 92c <_sk_store_a8_hsw+0x9a> - 75,99,4,130, //movslq (%r10,%r8,4),%rax - 76,1,208, //add %r10,%rax + 65,15,182,192, //movzbl %r8b,%eax + 76,141,5,67,0,0,0, //lea 0x43(%rip),%r8 # 930 <_sk_store_a8_hsw+0x9e> + 73,99,4,128, //movslq (%r8,%rax,4),%rax + 76,1,192, //add %r8,%rax 255,224, //jmpq *%rax 196,67,121,20,68,57,6,12, //vpextrb $0xc,%xmm8,0x6(%r9,%rdi,1) 196,67,121,20,68,57,5,10, //vpextrb $0xa,%xmm8,0x5(%r9,%rdi,1) @@ -8204,7 +8179,7 @@ CODE const uint8_t sk_store_a8_hsw[] = { 196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1) 196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1) 196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1) - 235,158, //jmp 8c9 <_sk_store_a8_hsw+0x37> + 235,154, //jmp 8c9 <_sk_store_a8_hsw+0x37> 144, //nop 246,255, //idiv %bh 255, //(bad) @@ -8234,7 +8209,7 @@ CODE const uint8_t sk_load_565_hsw[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,16, //mov (%rax),%r10 72,133,201, //test %rcx,%rcx - 15,133,149,0,0,0, //jne 9eb <_sk_load_565_hsw+0xa3> + 15,133,149,0,0,0, //jne 9ef <_sk_load_565_hsw+0xa3> 196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0 196,226,125,51,208, //vpmovzxwd %xmm0,%ymm2 184,0,248,0,0, //mov $0xf800,%eax @@ -8244,7 +8219,7 @@ CODE const uint8_t sk_load_565_hsw[] = { 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 184,8,33,132,55, //mov $0x37842108,%eax 197,249,110,200, //vmovd %eax,%xmm1 - 196,226,125,24,201, //vbroadcastss %xmm1,%ymm1 + 196,226,125,88,201, //vpbroadcastd %xmm1,%ymm1 197,252,89,193, //vmulps %ymm1,%ymm0,%ymm0 184,224,7,0,0, //mov $0x7e0,%eax 197,249,110,200, //vmovd %eax,%xmm1 @@ -8253,7 +8228,7 @@ CODE const uint8_t sk_load_565_hsw[] = { 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1 184,33,8,2,58, //mov $0x3a020821,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 197,244,89,203, //vmulps %ymm3,%ymm1,%ymm1 184,31,0,0,0, //mov $0x1f,%eax 197,249,110,216, //vmovd %eax,%xmm3 @@ -8262,21 +8237,21 @@ CODE const uint8_t sk_load_565_hsw[] = { 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 184,8,33,4,61, //mov $0x3d042108,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2 184,0,0,128,63, //mov $0x3f800000,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax 65,137,200, //mov %ecx,%r8d 65,128,224,7, //and $0x7,%r8b 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0 65,254,200, //dec %r8b - 69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 15,135,85,255,255,255, //ja 95c <_sk_load_565_hsw+0x14> - 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # a58 <_sk_load_565_hsw+0x110> + 15,135,89,255,255,255, //ja 960 <_sk_load_565_hsw+0x14> + 69,15,182,192, //movzbl %r8b,%r8d + 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # a5c <_sk_load_565_hsw+0x110> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax 255,224, //jmpq *%rax @@ -8288,12 +8263,12 @@ CODE const uint8_t sk_load_565_hsw[] = { 196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0 196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0 196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0 - 233,5,255,255,255, //jmpq 95c <_sk_load_565_hsw+0x14> + 233,5,255,255,255, //jmpq 960 <_sk_load_565_hsw+0x14> 144, //nop 243,255, //repz (bad) 255, //(bad) 255, //(bad) - 235,255, //jmp a5d <_sk_load_565_hsw+0x115> + 235,255, //jmp a61 <_sk_load_565_hsw+0x115> 255, //(bad) 255,227, //jmpq *%rbx 255, //(bad) @@ -8319,13 +8294,13 @@ CODE const uint8_t sk_store_565_hsw[] = { 76,139,8, //mov (%rax),%r9 184,0,0,248,65, //mov $0x41f80000,%eax 197,121,110,192, //vmovd %eax,%xmm8 - 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9 196,193,53,114,241,11, //vpslld $0xb,%ymm9,%ymm9 184,0,0,124,66, //mov $0x427c0000,%eax 197,121,110,208, //vmovd %eax,%xmm10 - 196,66,125,24,210, //vbroadcastss %xmm10,%ymm10 + 196,66,125,88,210, //vpbroadcastd %xmm10,%ymm10 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10 196,193,45,114,242,5, //vpslld $0x5,%ymm10,%ymm10 @@ -8336,19 +8311,19 @@ CODE const uint8_t sk_store_565_hsw[] = { 196,67,125,57,193,1, //vextracti128 $0x1,%ymm8,%xmm9 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 72,133,201, //test %rcx,%rcx - 117,10, //jne ae0 <_sk_store_565_hsw+0x6c> + 117,10, //jne ae4 <_sk_store_565_hsw+0x6c> 196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax - 137,200, //mov %ecx,%eax - 36,7, //and $0x7,%al - 254,200, //dec %al - 68,15,182,192, //movzbl %al,%r8d + 65,137,200, //mov %ecx,%r8d + 65,128,224,7, //and $0x7,%r8b + 65,254,200, //dec %r8b 65,128,248,6, //cmp $0x6,%r8b - 119,236, //ja adc <_sk_store_565_hsw+0x68> - 76,141,21,69,0,0,0, //lea 0x45(%rip),%r10 # b3c <_sk_store_565_hsw+0xc8> - 75,99,4,130, //movslq (%r10,%r8,4),%rax - 76,1,208, //add %r10,%rax + 119,236, //ja ae0 <_sk_store_565_hsw+0x68> + 65,15,182,192, //movzbl %r8b,%eax + 76,141,5,69,0,0,0, //lea 0x45(%rip),%r8 # b44 <_sk_store_565_hsw+0xcc> + 73,99,4,128, //movslq (%r8,%rax,4),%rax + 76,1,192, //add %r8,%rax 255,224, //jmpq *%rax 196,67,121,21,68,121,12,6, //vpextrw $0x6,%xmm8,0xc(%r9,%rdi,2) 196,67,121,21,68,121,10,5, //vpextrw $0x5,%xmm8,0xa(%r9,%rdi,2) @@ -8356,10 +8331,9 @@ CODE const uint8_t sk_store_565_hsw[] = { 196,67,121,21,68,121,6,3, //vpextrw $0x3,%xmm8,0x6(%r9,%rdi,2) 196,67,121,21,68,121,4,2, //vpextrw $0x2,%xmm8,0x4(%r9,%rdi,2) 196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2) - 197,121,126,192, //vmovd %xmm8,%eax - 102,65,137,4,121, //mov %ax,(%r9,%rdi,2) - 235,161, //jmp adc <_sk_store_565_hsw+0x68> - 144, //nop + 196,67,121,21,4,121,0, //vpextrw $0x0,%xmm8,(%r9,%rdi,2) + 235,159, //jmp ae0 <_sk_store_565_hsw+0x68> + 15,31,0, //nopl (%rax) 244, //hlt 255, //(bad) 255, //(bad) @@ -8391,7 +8365,7 @@ CODE const uint8_t sk_load_8888_hsw[] = { 76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9 76,3,8, //add (%rax),%r9 77,133,192, //test %r8,%r8 - 117,104, //jne bd5 <_sk_load_8888_hsw+0x7d> + 117,104, //jne bdd <_sk_load_8888_hsw+0x7d> 196,193,126,111,25, //vmovdqu (%r9),%ymm3 184,255,0,0,0, //mov $0xff,%eax 197,249,110,192, //vmovd %eax,%xmm0 @@ -8400,7 +8374,7 @@ CODE const uint8_t sk_load_8888_hsw[] = { 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 184,129,128,128,59, //mov $0x3b808081,%eax 197,249,110,200, //vmovd %eax,%xmm1 - 196,98,125,24,193, //vbroadcastss %xmm1,%ymm8 + 196,98,125,88,193, //vpbroadcastd %xmm1,%ymm8 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0 197,245,114,211,8, //vpsrld $0x8,%ymm3,%ymm1 197,237,219,201, //vpand %ymm1,%ymm2,%ymm1 @@ -8424,7 +8398,7 @@ CODE const uint8_t sk_load_8888_hsw[] = { 196,225,249,110,192, //vmovq %rax,%xmm0 196,226,125,33,192, //vpmovsxbd %xmm0,%ymm0 196,194,125,140,25, //vpmaskmovd (%r9),%ymm0,%ymm3 - 233,116,255,255,255, //jmpq b72 <_sk_load_8888_hsw+0x1a> + 233,116,255,255,255, //jmpq b7a <_sk_load_8888_hsw+0x1a> }; CODE const uint8_t sk_store_8888_hsw[] = { @@ -8434,7 +8408,7 @@ CODE const uint8_t sk_store_8888_hsw[] = { 76,3,8, //add (%rax),%r9 184,0,0,127,67, //mov $0x437f0000,%eax 197,121,110,192, //vmovd %eax,%xmm8 - 196,66,125,24,192, //vbroadcastss %xmm8,%ymm8 + 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9 197,60,89,209, //vmulps %ymm1,%ymm8,%ymm10 @@ -8450,7 +8424,7 @@ CODE const uint8_t sk_store_8888_hsw[] = { 196,65,45,235,192, //vpor %ymm8,%ymm10,%ymm8 196,65,53,235,192, //vpor %ymm8,%ymm9,%ymm8 77,133,192, //test %r8,%r8 - 117,12, //jne c72 <_sk_store_8888_hsw+0x74> + 117,12, //jne c7a <_sk_store_8888_hsw+0x74> 196,65,126,127,1, //vmovdqu %ymm8,(%r9) 72,173, //lods %ds:(%rsi),%rax 76,137,193, //mov %r8,%rcx @@ -8463,29 +8437,29 @@ CODE const uint8_t sk_store_8888_hsw[] = { 196,97,249,110,200, //vmovq %rax,%xmm9 196,66,125,33,201, //vpmovsxbd %xmm9,%ymm9 196,66,53,142,1, //vpmaskmovd %ymm8,%ymm9,(%r9) - 235,211, //jmp c6b <_sk_store_8888_hsw+0x6d> + 235,211, //jmp c73 <_sk_store_8888_hsw+0x6d> }; CODE const uint8_t sk_load_f16_hsw[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax 72,133,201, //test %rcx,%rcx - 117,97, //jne d03 <_sk_load_f16_hsw+0x6b> - 197,249,16,12,248, //vmovupd (%rax,%rdi,8),%xmm1 + 117,97, //jne d0b <_sk_load_f16_hsw+0x6b> + 197,121,16,4,248, //vmovupd (%rax,%rdi,8),%xmm8 197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2 197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3 - 197,121,16,68,248,48, //vmovupd 0x30(%rax,%rdi,8),%xmm8 - 197,241,97,194, //vpunpcklwd %xmm2,%xmm1,%xmm0 - 197,241,105,202, //vpunpckhwd %xmm2,%xmm1,%xmm1 - 196,193,97,97,208, //vpunpcklwd %xmm8,%xmm3,%xmm2 - 196,193,97,105,216, //vpunpckhwd %xmm8,%xmm3,%xmm3 - 197,121,97,193, //vpunpcklwd %xmm1,%xmm0,%xmm8 - 197,121,105,201, //vpunpckhwd %xmm1,%xmm0,%xmm9 - 197,233,97,203, //vpunpcklwd %xmm3,%xmm2,%xmm1 - 197,233,105,219, //vpunpckhwd %xmm3,%xmm2,%xmm3 - 197,185,108,193, //vpunpcklqdq %xmm1,%xmm8,%xmm0 + 197,122,111,76,248,48, //vmovdqu 0x30(%rax,%rdi,8),%xmm9 + 197,185,97,194, //vpunpcklwd %xmm2,%xmm8,%xmm0 + 197,185,105,210, //vpunpckhwd %xmm2,%xmm8,%xmm2 + 196,193,97,97,201, //vpunpcklwd %xmm9,%xmm3,%xmm1 + 196,193,97,105,217, //vpunpckhwd %xmm9,%xmm3,%xmm3 + 197,121,97,194, //vpunpcklwd %xmm2,%xmm0,%xmm8 + 197,121,105,202, //vpunpckhwd %xmm2,%xmm0,%xmm9 + 197,241,97,211, //vpunpcklwd %xmm3,%xmm1,%xmm2 + 197,241,105,219, //vpunpckhwd %xmm3,%xmm1,%xmm3 + 197,185,108,194, //vpunpcklqdq %xmm2,%xmm8,%xmm0 196,226,125,19,192, //vcvtph2ps %xmm0,%ymm0 - 197,185,109,201, //vpunpckhqdq %xmm1,%xmm8,%xmm1 + 197,185,109,202, //vpunpckhqdq %xmm2,%xmm8,%xmm1 196,226,125,19,201, //vcvtph2ps %xmm1,%ymm1 197,177,108,211, //vpunpcklqdq %xmm3,%xmm9,%xmm2 196,226,125,19,210, //vcvtph2ps %xmm2,%ymm2 @@ -8493,38 +8467,32 @@ CODE const uint8_t sk_load_f16_hsw[] = { 196,226,125,19,219, //vcvtph2ps %xmm3,%ymm3 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax - 197,251,16,12,248, //vmovsd (%rax,%rdi,8),%xmm1 - 196,65,57,87,192, //vxorpd %xmm8,%xmm8,%xmm8 + 197,123,16,4,248, //vmovsd (%rax,%rdi,8),%xmm8 + 196,65,49,239,201, //vpxor %xmm9,%xmm9,%xmm9 72,131,249,1, //cmp $0x1,%rcx - 117,6, //jne d19 <_sk_load_f16_hsw+0x81> - 197,250,126,201, //vmovq %xmm1,%xmm1 - 235,30, //jmp d37 <_sk_load_f16_hsw+0x9f> - 197,241,22,76,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm1,%xmm1 + 116,79, //je d6a <_sk_load_f16_hsw+0xca> + 197,57,22,68,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm8,%xmm8 72,131,249,3, //cmp $0x3,%rcx - 114,18, //jb d37 <_sk_load_f16_hsw+0x9f> + 114,67, //jb d6a <_sk_load_f16_hsw+0xca> 197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2 72,131,249,3, //cmp $0x3,%rcx - 117,19, //jne d44 <_sk_load_f16_hsw+0xac> - 197,250,126,210, //vmovq %xmm2,%xmm2 - 235,46, //jmp d65 <_sk_load_f16_hsw+0xcd> - 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 - 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2 - 233,117,255,255,255, //jmpq cb9 <_sk_load_f16_hsw+0x21> + 116,68, //je d77 <_sk_load_f16_hsw+0xd7> 197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2 72,131,249,5, //cmp $0x5,%rcx - 114,21, //jb d65 <_sk_load_f16_hsw+0xcd> + 114,56, //jb d77 <_sk_load_f16_hsw+0xd7> 197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3 72,131,249,5, //cmp $0x5,%rcx - 117,18, //jne d6e <_sk_load_f16_hsw+0xd6> - 197,250,126,219, //vmovq %xmm3,%xmm3 - 233,84,255,255,255, //jmpq cb9 <_sk_load_f16_hsw+0x21> - 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 - 233,75,255,255,255, //jmpq cb9 <_sk_load_f16_hsw+0x21> + 15,132,114,255,255,255, //je cc1 <_sk_load_f16_hsw+0x21> 197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3 72,131,249,7, //cmp $0x7,%rcx - 15,130,59,255,255,255, //jb cb9 <_sk_load_f16_hsw+0x21> - 197,123,16,68,248,48, //vmovsd 0x30(%rax,%rdi,8),%xmm8 - 233,48,255,255,255, //jmpq cb9 <_sk_load_f16_hsw+0x21> + 15,130,98,255,255,255, //jb cc1 <_sk_load_f16_hsw+0x21> + 197,122,126,76,248,48, //vmovq 0x30(%rax,%rdi,8),%xmm9 + 233,87,255,255,255, //jmpq cc1 <_sk_load_f16_hsw+0x21> + 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 + 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2 + 233,74,255,255,255, //jmpq cc1 <_sk_load_f16_hsw+0x21> + 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 + 233,65,255,255,255, //jmpq cc1 <_sk_load_f16_hsw+0x21> }; CODE const uint8_t sk_store_f16_hsw[] = { @@ -8543,7 +8511,7 @@ CODE const uint8_t sk_store_f16_hsw[] = { 196,65,57,98,205, //vpunpckldq %xmm13,%xmm8,%xmm9 196,65,57,106,197, //vpunpckhdq %xmm13,%xmm8,%xmm8 72,133,201, //test %rcx,%rcx - 117,27, //jne dee <_sk_store_f16_hsw+0x65> + 117,27, //jne de5 <_sk_store_f16_hsw+0x65> 197,120,17,28,248, //vmovups %xmm11,(%rax,%rdi,8) 197,120,17,84,248,16, //vmovups %xmm10,0x10(%rax,%rdi,8) 197,120,17,76,248,32, //vmovups %xmm9,0x20(%rax,%rdi,8) @@ -8552,22 +8520,22 @@ CODE const uint8_t sk_store_f16_hsw[] = { 255,224, //jmpq *%rax 197,121,214,28,248, //vmovq %xmm11,(%rax,%rdi,8) 72,131,249,1, //cmp $0x1,%rcx - 116,241, //je dea <_sk_store_f16_hsw+0x61> + 116,241, //je de1 <_sk_store_f16_hsw+0x61> 197,121,23,92,248,8, //vmovhpd %xmm11,0x8(%rax,%rdi,8) 72,131,249,3, //cmp $0x3,%rcx - 114,229, //jb dea <_sk_store_f16_hsw+0x61> + 114,229, //jb de1 <_sk_store_f16_hsw+0x61> 197,121,214,84,248,16, //vmovq %xmm10,0x10(%rax,%rdi,8) - 116,221, //je dea <_sk_store_f16_hsw+0x61> + 116,221, //je de1 <_sk_store_f16_hsw+0x61> 197,121,23,84,248,24, //vmovhpd %xmm10,0x18(%rax,%rdi,8) 72,131,249,5, //cmp $0x5,%rcx - 114,209, //jb dea <_sk_store_f16_hsw+0x61> + 114,209, //jb de1 <_sk_store_f16_hsw+0x61> 197,121,214,76,248,32, //vmovq %xmm9,0x20(%rax,%rdi,8) - 116,201, //je dea <_sk_store_f16_hsw+0x61> + 116,201, //je de1 <_sk_store_f16_hsw+0x61> 197,121,23,76,248,40, //vmovhpd %xmm9,0x28(%rax,%rdi,8) 72,131,249,7, //cmp $0x7,%rcx - 114,189, //jb dea <_sk_store_f16_hsw+0x61> + 114,189, //jb de1 <_sk_store_f16_hsw+0x61> 197,121,214,68,248,48, //vmovq %xmm8,0x30(%rax,%rdi,8) - 235,181, //jmp dea <_sk_store_f16_hsw+0x61> + 235,181, //jmp de1 <_sk_store_f16_hsw+0x61> }; CODE const uint8_t sk_store_f32_hsw[] = { @@ -8583,7 +8551,7 @@ CODE const uint8_t sk_store_f32_hsw[] = { 196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8 196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11 72,133,201, //test %rcx,%rcx - 117,55, //jne ea2 <_sk_store_f32_hsw+0x6d> + 117,55, //jne e99 <_sk_store_f32_hsw+0x6d> 196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12 196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13 196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9 @@ -8596,22 +8564,22 @@ CODE const uint8_t sk_store_f32_hsw[] = { 255,224, //jmpq *%rax 196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4) 72,131,249,1, //cmp $0x1,%rcx - 116,240, //je e9e <_sk_store_f32_hsw+0x69> + 116,240, //je e95 <_sk_store_f32_hsw+0x69> 196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4) 72,131,249,3, //cmp $0x3,%rcx - 114,227, //jb e9e <_sk_store_f32_hsw+0x69> + 114,227, //jb e95 <_sk_store_f32_hsw+0x69> 196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4) - 116,218, //je e9e <_sk_store_f32_hsw+0x69> + 116,218, //je e95 <_sk_store_f32_hsw+0x69> 196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4) 72,131,249,5, //cmp $0x5,%rcx - 114,205, //jb e9e <_sk_store_f32_hsw+0x69> + 114,205, //jb e95 <_sk_store_f32_hsw+0x69> 196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4) - 116,195, //je e9e <_sk_store_f32_hsw+0x69> + 116,195, //je e95 <_sk_store_f32_hsw+0x69> 196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4) 72,131,249,7, //cmp $0x7,%rcx - 114,181, //jb e9e <_sk_store_f32_hsw+0x69> + 114,181, //jb e95 <_sk_store_f32_hsw+0x69> 196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4) - 235,171, //jmp e9e <_sk_store_f32_hsw+0x69> + 235,171, //jmp e95 <_sk_store_f32_hsw+0x69> }; CODE const uint8_t sk_clamp_x_hsw[] = { @@ -8709,18 +8677,18 @@ CODE const uint8_t sk_mirror_y_hsw[] = { CODE const uint8_t sk_luminance_to_alpha_hsw[] = { 184,208,179,89,62, //mov $0x3e59b3d0,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,98,125,24,195, //vbroadcastss %xmm3,%ymm8 + 196,98,125,88,195, //vpbroadcastd %xmm3,%ymm8 184,89,23,55,63, //mov $0x3f371759,%eax 197,249,110,216, //vmovd %eax,%xmm3 - 196,226,125,24,219, //vbroadcastss %xmm3,%ymm3 + 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1 196,98,125,168,193, //vfmadd213ps %ymm1,%ymm0,%ymm8 184,152,221,147,61, //mov $0x3d93dd98,%eax 197,249,110,192, //vmovd %eax,%xmm0 - 196,226,125,24,216, //vbroadcastss %xmm0,%ymm3 + 196,226,125,88,216, //vpbroadcastd %xmm0,%ymm3 196,194,109,168,216, //vfmadd213ps %ymm8,%ymm2,%ymm3 72,173, //lods %ds:(%rsi),%rax - 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 + 197,253,239,192, //vpxor %ymm0,%ymm0,%ymm0 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 255,224, //jmpq *%rax @@ -9287,7 +9255,7 @@ CODE const uint8_t sk_scale_u8_avx[] = { 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 117,80, //jne 639 <_sk_scale_u8_avx+0x60> - 197,123,16,0, //vmovsd (%rax),%xmm8 + 197,122,126,0, //vmovq (%rax),%xmm8 196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9 196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8 196,66,121,49,192, //vpmovzxbd %xmm8,%xmm8 @@ -9345,7 +9313,7 @@ CODE const uint8_t sk_lerp_u8_avx[] = { 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 117,116, //jne 721 <_sk_lerp_u8_avx+0x84> - 197,123,16,0, //vmovsd (%rax),%xmm8 + 197,122,126,0, //vmovq (%rax),%xmm8 196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9 196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8 196,66,121,49,192, //vpmovzxbd %xmm8,%xmm8 @@ -9447,9 +9415,9 @@ CODE const uint8_t sk_lerp_565_avx[] = { 65,128,224,7, //and $0x7,%r8b 196,65,57,239,192, //vpxor %xmm8,%xmm8,%xmm8 65,254,200, //dec %r8b - 69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 15,135,239,254,255,255, //ja 75d <_sk_lerp_565_avx+0x14> + 15,135,243,254,255,255, //ja 75d <_sk_lerp_565_avx+0x14> + 69,15,182,192, //movzbl %r8b,%r8d 76,141,13,75,0,0,0, //lea 0x4b(%rip),%r9 # 8c0 <_sk_lerp_565_avx+0x177> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax @@ -9617,9 +9585,9 @@ CODE const uint8_t sk_load_tables_avx[] = { 128,227,7, //and $0x7,%bl 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 254,203, //dec %bl - 15,182,219, //movzbl %bl,%ebx 128,251,6, //cmp $0x6,%bl - 15,135,182,253,255,255, //ja 8fa <_sk_load_tables_avx+0x1e> + 15,135,185,253,255,255, //ja 8fa <_sk_load_tables_avx+0x1e> + 15,182,219, //movzbl %bl,%ebx 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # bd4 <_sk_load_tables_avx+0x2f8> 73,99,28,153, //movslq (%r9,%rbx,4),%rbx 76,1,203, //add %r9,%rbx @@ -9671,7 +9639,7 @@ CODE const uint8_t sk_load_a8_avx[] = { 72,1,248, //add %rdi,%rax 77,133,192, //test %r8,%r8 117,74, //jne c4a <_sk_load_a8_avx+0x5a> - 197,251,16,0, //vmovsd (%rax),%xmm0 + 197,250,126,0, //vmovq (%rax),%xmm0 196,226,121,49,200, //vpmovzxbd %xmm0,%xmm1 196,227,121,4,192,229, //vpermilps $0xe5,%xmm0,%xmm0 196,226,121,49,192, //vpmovzxbd %xmm0,%xmm0 @@ -9719,16 +9687,16 @@ CODE const uint8_t sk_store_a8_avx[] = { 196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax - 137,200, //mov %ecx,%eax - 36,7, //and $0x7,%al - 254,200, //dec %al - 68,15,182,192, //movzbl %al,%r8d + 65,137,200, //mov %ecx,%r8d + 65,128,224,7, //and $0x7,%r8b + 65,254,200, //dec %r8b 65,128,248,6, //cmp $0x6,%r8b 119,236, //ja cad <_sk_store_a8_avx+0x3e> 196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8 - 76,141,21,67,0,0,0, //lea 0x43(%rip),%r10 # d10 <_sk_store_a8_avx+0xa1> - 75,99,4,130, //movslq (%r10,%r8,4),%rax - 76,1,208, //add %r10,%rax + 65,15,182,192, //movzbl %r8b,%eax + 76,141,5,67,0,0,0, //lea 0x43(%rip),%r8 # d14 <_sk_store_a8_avx+0xa5> + 73,99,4,128, //movslq (%r8,%rax,4),%rax + 76,1,192, //add %r8,%rax 255,224, //jmpq *%rax 196,67,121,20,68,57,6,12, //vpextrb $0xc,%xmm8,0x6(%r9,%rdi,1) 196,67,121,20,68,57,5,10, //vpextrb $0xa,%xmm8,0x5(%r9,%rdi,1) @@ -9737,7 +9705,7 @@ CODE const uint8_t sk_store_a8_avx[] = { 196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1) 196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1) 196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1) - 235,158, //jmp cad <_sk_store_a8_avx+0x3e> + 235,154, //jmp cad <_sk_store_a8_avx+0x3e> 144, //nop 246,255, //idiv %bh 255, //(bad) @@ -9767,7 +9735,7 @@ CODE const uint8_t sk_load_565_avx[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,16, //mov (%rax),%r10 72,133,201, //test %rcx,%rcx - 15,133,209,0,0,0, //jne e0b <_sk_load_565_avx+0xdf> + 15,133,209,0,0,0, //jne e0f <_sk_load_565_avx+0xdf> 196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0 197,241,239,201, //vpxor %xmm1,%xmm1,%xmm1 197,249,105,201, //vpunpckhwd %xmm1,%xmm0,%xmm1 @@ -9816,10 +9784,10 @@ CODE const uint8_t sk_load_565_avx[] = { 65,128,224,7, //and $0x7,%r8b 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0 65,254,200, //dec %r8b - 69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 15,135,25,255,255,255, //ja d40 <_sk_load_565_avx+0x14> - 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # e78 <_sk_load_565_avx+0x14c> + 15,135,29,255,255,255, //ja d44 <_sk_load_565_avx+0x14> + 69,15,182,192, //movzbl %r8b,%r8d + 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # e7c <_sk_load_565_avx+0x14c> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax 255,224, //jmpq *%rax @@ -9831,12 +9799,12 @@ CODE const uint8_t sk_load_565_avx[] = { 196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0 196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0 196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0 - 233,201,254,255,255, //jmpq d40 <_sk_load_565_avx+0x14> + 233,201,254,255,255, //jmpq d44 <_sk_load_565_avx+0x14> 144, //nop 243,255, //repz (bad) 255, //(bad) 255, //(bad) - 235,255, //jmp e7d <_sk_load_565_avx+0x151> + 235,255, //jmp e81 <_sk_load_565_avx+0x151> 255, //(bad) 255,227, //jmpq *%rbx 255, //(bad) @@ -9887,19 +9855,19 @@ CODE const uint8_t sk_store_565_avx[] = { 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 72,133,201, //test %rcx,%rcx - 117,10, //jne f32 <_sk_store_565_avx+0x9e> + 117,10, //jne f36 <_sk_store_565_avx+0x9e> 196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax - 137,200, //mov %ecx,%eax - 36,7, //and $0x7,%al - 254,200, //dec %al - 68,15,182,192, //movzbl %al,%r8d + 65,137,200, //mov %ecx,%r8d + 65,128,224,7, //and $0x7,%r8b + 65,254,200, //dec %r8b 65,128,248,6, //cmp $0x6,%r8b - 119,236, //ja f2e <_sk_store_565_avx+0x9a> - 76,141,21,71,0,0,0, //lea 0x47(%rip),%r10 # f90 <_sk_store_565_avx+0xfc> - 75,99,4,130, //movslq (%r10,%r8,4),%rax - 76,1,208, //add %r10,%rax + 119,236, //ja f32 <_sk_store_565_avx+0x9a> + 65,15,182,192, //movzbl %r8b,%eax + 76,141,5,67,0,0,0, //lea 0x43(%rip),%r8 # f94 <_sk_store_565_avx+0xfc> + 73,99,4,128, //movslq (%r8,%rax,4),%rax + 76,1,192, //add %r8,%rax 255,224, //jmpq *%rax 196,67,121,21,68,121,12,6, //vpextrw $0x6,%xmm8,0xc(%r9,%rdi,2) 196,67,121,21,68,121,10,5, //vpextrw $0x5,%xmm8,0xa(%r9,%rdi,2) @@ -9907,29 +9875,28 @@ CODE const uint8_t sk_store_565_avx[] = { 196,67,121,21,68,121,6,3, //vpextrw $0x3,%xmm8,0x6(%r9,%rdi,2) 196,67,121,21,68,121,4,2, //vpextrw $0x2,%xmm8,0x4(%r9,%rdi,2) 196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2) - 197,121,126,192, //vmovd %xmm8,%eax - 102,65,137,4,121, //mov %ax,(%r9,%rdi,2) - 235,161, //jmp f2e <_sk_store_565_avx+0x9a> - 15,31,0, //nopl (%rax) - 242,255, //repnz (bad) + 196,67,121,21,4,121,0, //vpextrw $0x0,%xmm8,(%r9,%rdi,2) + 235,159, //jmp f32 <_sk_store_565_avx+0x9a> + 144, //nop + 246,255, //idiv %bh 255, //(bad) 255, //(bad) - 234, //(bad) + 238, //out %al,(%dx) 255, //(bad) 255, //(bad) - 255,226, //jmpq *%rdx + 255,230, //jmpq *%rsi 255, //(bad) 255, //(bad) 255, //(bad) - 218,255, //(bad) + 222,255, //fdivrp %st,%st(7) 255, //(bad) - 255,210, //callq *%rdx + 255,214, //callq *%rsi 255, //(bad) 255, //(bad) - 255,202, //dec %edx + 255,206, //dec %esi 255, //(bad) 255, //(bad) - 255,194, //inc %edx + 255,198, //inc %esi 255, //(bad) 255, //(bad) 255, //.byte 0xff @@ -9939,7 +9906,7 @@ CODE const uint8_t sk_load_8888_avx[] = { 72,173, //lods %ds:(%rsi),%rax 76,139,16, //mov (%rax),%r10 72,133,201, //test %rcx,%rcx - 15,133,157,0,0,0, //jne 1057 <_sk_load_8888_avx+0xab> + 15,133,157,0,0,0, //jne 105b <_sk_load_8888_avx+0xab> 196,65,124,16,12,186, //vmovups (%r10,%rdi,4),%ymm9 184,255,0,0,0, //mov $0xff,%eax 197,249,110,192, //vmovd %eax,%xmm0 @@ -9976,10 +9943,10 @@ CODE const uint8_t sk_load_8888_avx[] = { 65,128,224,7, //and $0x7,%r8b 196,65,52,87,201, //vxorps %ymm9,%ymm9,%ymm9 65,254,200, //dec %r8b - 69,15,182,192, //movzbl %r8b,%r8d 65,128,248,6, //cmp $0x6,%r8b - 15,135,76,255,255,255, //ja fc0 <_sk_load_8888_avx+0x14> - 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # 1104 <_sk_load_8888_avx+0x158> + 15,135,80,255,255,255, //ja fc4 <_sk_load_8888_avx+0x14> + 69,15,182,192, //movzbl %r8b,%r8d + 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # 1108 <_sk_load_8888_avx+0x158> 75,99,4,129, //movslq (%r9,%r8,4),%rax 76,1,200, //add %r9,%rax 255,224, //jmpq *%rax @@ -10002,7 +9969,7 @@ CODE const uint8_t sk_load_8888_avx[] = { 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9 196,195,49,34,4,186,0, //vpinsrd $0x0,(%r10,%rdi,4),%xmm9,%xmm0 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9 - 233,188,254,255,255, //jmpq fc0 <_sk_load_8888_avx+0x14> + 233,188,254,255,255, //jmpq fc4 <_sk_load_8888_avx+0x14> 238, //out %al,(%dx) 255, //(bad) 255, //(bad) @@ -10054,31 +10021,31 @@ CODE const uint8_t sk_store_8888_avx[] = { 196,65,45,86,192, //vorpd %ymm8,%ymm10,%ymm8 196,65,53,86,192, //vorpd %ymm8,%ymm9,%ymm8 72,133,201, //test %rcx,%rcx - 117,10, //jne 11c4 <_sk_store_8888_avx+0xa4> + 117,10, //jne 11c8 <_sk_store_8888_avx+0xa4> 196,65,124,17,4,185, //vmovups %ymm8,(%r9,%rdi,4) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax - 137,200, //mov %ecx,%eax - 36,7, //and $0x7,%al - 254,200, //dec %al - 68,15,182,192, //movzbl %al,%r8d + 65,137,200, //mov %ecx,%r8d + 65,128,224,7, //and $0x7,%r8b + 65,254,200, //dec %r8b 65,128,248,6, //cmp $0x6,%r8b - 119,236, //ja 11c0 <_sk_store_8888_avx+0xa0> - 76,141,21,85,0,0,0, //lea 0x55(%rip),%r10 # 1230 <_sk_store_8888_avx+0x110> - 75,99,4,130, //movslq (%r10,%r8,4),%rax - 76,1,208, //add %r10,%rax + 119,236, //ja 11c4 <_sk_store_8888_avx+0xa0> + 65,15,182,192, //movzbl %r8b,%eax + 76,141,5,85,0,0,0, //lea 0x55(%rip),%r8 # 1238 <_sk_store_8888_avx+0x114> + 73,99,4,128, //movslq (%r8,%rax,4),%rax + 76,1,192, //add %r8,%rax 255,224, //jmpq *%rax 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 196,67,121,22,76,185,24,2, //vpextrd $0x2,%xmm9,0x18(%r9,%rdi,4) 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 196,67,121,22,76,185,20,1, //vpextrd $0x1,%xmm9,0x14(%r9,%rdi,4) 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 - 196,65,121,126,76,185,16, //vmovd %xmm9,0x10(%r9,%rdi,4) + 196,65,122,17,76,185,16, //vmovss %xmm9,0x10(%r9,%rdi,4) 196,67,121,22,68,185,12,3, //vpextrd $0x3,%xmm8,0xc(%r9,%rdi,4) 196,67,121,22,68,185,8,2, //vpextrd $0x2,%xmm8,0x8(%r9,%rdi,4) 196,67,121,22,68,185,4,1, //vpextrd $0x1,%xmm8,0x4(%r9,%rdi,4) 196,65,121,126,4,185, //vmovd %xmm8,(%r9,%rdi,4) - 235,147, //jmp 11c0 <_sk_store_8888_avx+0xa0> + 235,143, //jmp 11c4 <_sk_store_8888_avx+0xa0> 15,31,0, //nopl (%rax) 245, //cmc 255, //(bad) @@ -10109,93 +10076,87 @@ CODE const uint8_t sk_load_f16_avx[] = { 72,173, //lods %ds:(%rsi),%rax 72,139,0, //mov (%rax),%rax 72,133,201, //test %rcx,%rcx - 15,133,8,1,0,0, //jne 1362 <_sk_load_f16_avx+0x116> - 197,249,16,12,248, //vmovupd (%rax,%rdi,8),%xmm1 + 15,133,2,1,0,0, //jne 1364 <_sk_load_f16_avx+0x110> + 197,121,16,4,248, //vmovupd (%rax,%rdi,8),%xmm8 197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2 197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3 - 197,121,16,68,248,48, //vmovupd 0x30(%rax,%rdi,8),%xmm8 - 197,241,97,194, //vpunpcklwd %xmm2,%xmm1,%xmm0 - 197,241,105,202, //vpunpckhwd %xmm2,%xmm1,%xmm1 - 196,193,97,97,208, //vpunpcklwd %xmm8,%xmm3,%xmm2 - 196,193,97,105,216, //vpunpckhwd %xmm8,%xmm3,%xmm3 - 197,121,97,193, //vpunpcklwd %xmm1,%xmm0,%xmm8 - 197,249,105,193, //vpunpckhwd %xmm1,%xmm0,%xmm0 - 197,105,97,211, //vpunpcklwd %xmm3,%xmm2,%xmm10 - 197,105,105,203, //vpunpckhwd %xmm3,%xmm2,%xmm9 + 197,122,111,76,248,48, //vmovdqu 0x30(%rax,%rdi,8),%xmm9 + 197,185,97,194, //vpunpcklwd %xmm2,%xmm8,%xmm0 + 197,185,105,210, //vpunpckhwd %xmm2,%xmm8,%xmm2 + 196,193,97,97,201, //vpunpcklwd %xmm9,%xmm3,%xmm1 + 196,193,97,105,217, //vpunpckhwd %xmm9,%xmm3,%xmm3 + 197,121,97,194, //vpunpcklwd %xmm2,%xmm0,%xmm8 + 197,249,105,194, //vpunpckhwd %xmm2,%xmm0,%xmm0 + 197,241,97,211, //vpunpcklwd %xmm3,%xmm1,%xmm2 + 197,113,105,203, //vpunpckhwd %xmm3,%xmm1,%xmm9 184,0,4,0,4, //mov $0x4000400,%eax 197,249,110,216, //vmovd %eax,%xmm3 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 - 196,193,97,101,208, //vpcmpgtw %xmm8,%xmm3,%xmm2 - 196,193,105,223,208, //vpandn %xmm8,%xmm2,%xmm2 + 196,193,97,101,200, //vpcmpgtw %xmm8,%xmm3,%xmm1 + 196,65,113,223,192, //vpandn %xmm8,%xmm1,%xmm8 197,225,101,200, //vpcmpgtw %xmm0,%xmm3,%xmm1 197,241,223,192, //vpandn %xmm0,%xmm1,%xmm0 - 196,193,97,101,202, //vpcmpgtw %xmm10,%xmm3,%xmm1 - 196,193,113,223,202, //vpandn %xmm10,%xmm1,%xmm1 - 196,193,97,101,217, //vpcmpgtw %xmm9,%xmm3,%xmm3 - 196,193,97,223,217, //vpandn %xmm9,%xmm3,%xmm3 - 196,98,121,51,194, //vpmovzxwd %xmm2,%xmm8 + 197,225,101,202, //vpcmpgtw %xmm2,%xmm3,%xmm1 + 197,241,223,202, //vpandn %xmm2,%xmm1,%xmm1 + 196,193,97,101,209, //vpcmpgtw %xmm9,%xmm3,%xmm2 + 196,193,105,223,209, //vpandn %xmm9,%xmm2,%xmm2 + 196,66,121,51,208, //vpmovzxwd %xmm8,%xmm10 196,98,121,51,201, //vpmovzxwd %xmm1,%xmm9 - 196,65,41,239,210, //vpxor %xmm10,%xmm10,%xmm10 - 196,193,105,105,210, //vpunpckhwd %xmm10,%xmm2,%xmm2 - 196,193,113,105,202, //vpunpckhwd %xmm10,%xmm1,%xmm1 + 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3 + 197,57,105,195, //vpunpckhwd %xmm3,%xmm8,%xmm8 + 197,241,105,203, //vpunpckhwd %xmm3,%xmm1,%xmm1 196,98,121,51,216, //vpmovzxwd %xmm0,%xmm11 - 196,98,121,51,227, //vpmovzxwd %xmm3,%xmm12 - 196,65,121,105,234, //vpunpckhwd %xmm10,%xmm0,%xmm13 - 196,65,97,105,210, //vpunpckhwd %xmm10,%xmm3,%xmm10 - 196,193,121,114,240,13, //vpslld $0xd,%xmm8,%xmm0 - 196,193,97,114,241,13, //vpslld $0xd,%xmm9,%xmm3 - 196,227,125,24,195,1, //vinsertf128 $0x1,%xmm3,%ymm0,%ymm0 + 196,98,121,51,226, //vpmovzxwd %xmm2,%xmm12 + 197,121,105,235, //vpunpckhwd %xmm3,%xmm0,%xmm13 + 197,105,105,243, //vpunpckhwd %xmm3,%xmm2,%xmm14 + 196,193,121,114,242,13, //vpslld $0xd,%xmm10,%xmm0 + 196,193,105,114,241,13, //vpslld $0xd,%xmm9,%xmm2 + 196,227,125,24,194,1, //vinsertf128 $0x1,%xmm2,%ymm0,%ymm0 184,0,0,128,119, //mov $0x77800000,%eax - 197,249,110,216, //vmovd %eax,%xmm3 - 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 - 196,99,101,24,203,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm9 + 197,249,110,208, //vmovd %eax,%xmm2 + 197,249,112,210,0, //vpshufd $0x0,%xmm2,%xmm2 + 196,99,109,24,202,1, //vinsertf128 $0x1,%xmm2,%ymm2,%ymm9 197,180,89,192, //vmulps %ymm0,%ymm9,%ymm0 - 197,233,114,242,13, //vpslld $0xd,%xmm2,%xmm2 + 196,193,105,114,240,13, //vpslld $0xd,%xmm8,%xmm2 197,241,114,241,13, //vpslld $0xd,%xmm1,%xmm1 196,227,109,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm2,%ymm1 197,180,89,201, //vmulps %ymm1,%ymm9,%ymm1 - 196,193,57,114,243,13, //vpslld $0xd,%xmm11,%xmm8 - 196,193,105,114,244,13, //vpslld $0xd,%xmm12,%xmm2 - 196,227,61,24,210,1, //vinsertf128 $0x1,%xmm2,%ymm8,%ymm2 + 196,193,105,114,243,13, //vpslld $0xd,%xmm11,%xmm2 + 196,193,97,114,244,13, //vpslld $0xd,%xmm12,%xmm3 + 196,227,109,24,211,1, //vinsertf128 $0x1,%xmm3,%ymm2,%ymm2 197,180,89,210, //vmulps %ymm2,%ymm9,%ymm2 196,193,57,114,245,13, //vpslld $0xd,%xmm13,%xmm8 - 196,193,97,114,242,13, //vpslld $0xd,%xmm10,%xmm3 + 196,193,97,114,246,13, //vpslld $0xd,%xmm14,%xmm3 196,227,61,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm3 197,180,89,219, //vmulps %ymm3,%ymm9,%ymm3 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax - 197,251,16,12,248, //vmovsd (%rax,%rdi,8),%xmm1 - 196,65,57,87,192, //vxorpd %xmm8,%xmm8,%xmm8 + 197,123,16,4,248, //vmovsd (%rax,%rdi,8),%xmm8 + 196,65,49,239,201, //vpxor %xmm9,%xmm9,%xmm9 72,131,249,1, //cmp $0x1,%rcx - 117,6, //jne 1378 <_sk_load_f16_avx+0x12c> - 197,250,126,201, //vmovq %xmm1,%xmm1 - 235,30, //jmp 1396 <_sk_load_f16_avx+0x14a> - 197,241,22,76,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm1,%xmm1 + 116,79, //je 13c3 <_sk_load_f16_avx+0x16f> + 197,57,22,68,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm8,%xmm8 72,131,249,3, //cmp $0x3,%rcx - 114,18, //jb 1396 <_sk_load_f16_avx+0x14a> + 114,67, //jb 13c3 <_sk_load_f16_avx+0x16f> 197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2 72,131,249,3, //cmp $0x3,%rcx - 117,19, //jne 13a3 <_sk_load_f16_avx+0x157> - 197,250,126,210, //vmovq %xmm2,%xmm2 - 235,46, //jmp 13c4 <_sk_load_f16_avx+0x178> - 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 - 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2 - 233,206,254,255,255, //jmpq 1271 <_sk_load_f16_avx+0x25> + 116,68, //je 13d0 <_sk_load_f16_avx+0x17c> 197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2 72,131,249,5, //cmp $0x5,%rcx - 114,21, //jb 13c4 <_sk_load_f16_avx+0x178> + 114,56, //jb 13d0 <_sk_load_f16_avx+0x17c> 197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3 72,131,249,5, //cmp $0x5,%rcx - 117,18, //jne 13cd <_sk_load_f16_avx+0x181> - 197,250,126,219, //vmovq %xmm3,%xmm3 - 233,173,254,255,255, //jmpq 1271 <_sk_load_f16_avx+0x25> - 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 - 233,164,254,255,255, //jmpq 1271 <_sk_load_f16_avx+0x25> + 15,132,209,254,255,255, //je 1279 <_sk_load_f16_avx+0x25> 197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3 72,131,249,7, //cmp $0x7,%rcx - 15,130,148,254,255,255, //jb 1271 <_sk_load_f16_avx+0x25> - 197,123,16,68,248,48, //vmovsd 0x30(%rax,%rdi,8),%xmm8 - 233,137,254,255,255, //jmpq 1271 <_sk_load_f16_avx+0x25> + 15,130,193,254,255,255, //jb 1279 <_sk_load_f16_avx+0x25> + 197,122,126,76,248,48, //vmovq 0x30(%rax,%rdi,8),%xmm9 + 233,182,254,255,255, //jmpq 1279 <_sk_load_f16_avx+0x25> + 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 + 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2 + 233,169,254,255,255, //jmpq 1279 <_sk_load_f16_avx+0x25> + 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 + 233,160,254,255,255, //jmpq 1279 <_sk_load_f16_avx+0x25> }; CODE const uint8_t sk_store_f16_avx[] = { @@ -10234,7 +10195,7 @@ CODE const uint8_t sk_store_f16_avx[] = { 196,65,25,98,205, //vpunpckldq %xmm13,%xmm12,%xmm9 196,65,25,106,197, //vpunpckhdq %xmm13,%xmm12,%xmm8 72,133,201, //test %rcx,%rcx - 117,31, //jne 14be <_sk_store_f16_avx+0xd6> + 117,31, //jne 14af <_sk_store_f16_avx+0xd6> 196,65,120,17,28,248, //vmovups %xmm11,(%r8,%rdi,8) 196,65,120,17,84,248,16, //vmovups %xmm10,0x10(%r8,%rdi,8) 196,65,120,17,76,248,32, //vmovups %xmm9,0x20(%r8,%rdi,8) @@ -10243,22 +10204,22 @@ CODE const uint8_t sk_store_f16_avx[] = { 255,224, //jmpq *%rax 196,65,121,214,28,248, //vmovq %xmm11,(%r8,%rdi,8) 72,131,249,1, //cmp $0x1,%rcx - 116,240, //je 14ba <_sk_store_f16_avx+0xd2> + 116,240, //je 14ab <_sk_store_f16_avx+0xd2> 196,65,121,23,92,248,8, //vmovhpd %xmm11,0x8(%r8,%rdi,8) 72,131,249,3, //cmp $0x3,%rcx - 114,227, //jb 14ba <_sk_store_f16_avx+0xd2> + 114,227, //jb 14ab <_sk_store_f16_avx+0xd2> 196,65,121,214,84,248,16, //vmovq %xmm10,0x10(%r8,%rdi,8) - 116,218, //je 14ba <_sk_store_f16_avx+0xd2> + 116,218, //je 14ab <_sk_store_f16_avx+0xd2> 196,65,121,23,84,248,24, //vmovhpd %xmm10,0x18(%r8,%rdi,8) 72,131,249,5, //cmp $0x5,%rcx - 114,205, //jb 14ba <_sk_store_f16_avx+0xd2> + 114,205, //jb 14ab <_sk_store_f16_avx+0xd2> 196,65,121,214,76,248,32, //vmovq %xmm9,0x20(%r8,%rdi,8) - 116,196, //je 14ba <_sk_store_f16_avx+0xd2> + 116,196, //je 14ab <_sk_store_f16_avx+0xd2> 196,65,121,23,76,248,40, //vmovhpd %xmm9,0x28(%r8,%rdi,8) 72,131,249,7, //cmp $0x7,%rcx - 114,183, //jb 14ba <_sk_store_f16_avx+0xd2> + 114,183, //jb 14ab <_sk_store_f16_avx+0xd2> 196,65,121,214,68,248,48, //vmovq %xmm8,0x30(%r8,%rdi,8) - 235,174, //jmp 14ba <_sk_store_f16_avx+0xd2> + 235,174, //jmp 14ab <_sk_store_f16_avx+0xd2> }; CODE const uint8_t sk_store_f32_avx[] = { @@ -10274,7 +10235,7 @@ CODE const uint8_t sk_store_f32_avx[] = { 196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8 196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11 72,133,201, //test %rcx,%rcx - 117,55, //jne 1579 <_sk_store_f32_avx+0x6d> + 117,55, //jne 156a <_sk_store_f32_avx+0x6d> 196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12 196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13 196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9 @@ -10287,22 +10248,22 @@ CODE const uint8_t sk_store_f32_avx[] = { 255,224, //jmpq *%rax 196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4) 72,131,249,1, //cmp $0x1,%rcx - 116,240, //je 1575 <_sk_store_f32_avx+0x69> + 116,240, //je 1566 <_sk_store_f32_avx+0x69> 196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4) 72,131,249,3, //cmp $0x3,%rcx - 114,227, //jb 1575 <_sk_store_f32_avx+0x69> + 114,227, //jb 1566 <_sk_store_f32_avx+0x69> 196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4) - 116,218, //je 1575 <_sk_store_f32_avx+0x69> + 116,218, //je 1566 <_sk_store_f32_avx+0x69> 196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4) 72,131,249,5, //cmp $0x5,%rcx - 114,205, //jb 1575 <_sk_store_f32_avx+0x69> + 114,205, //jb 1566 <_sk_store_f32_avx+0x69> 196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4) - 116,195, //je 1575 <_sk_store_f32_avx+0x69> + 116,195, //je 1566 <_sk_store_f32_avx+0x69> 196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4) 72,131,249,7, //cmp $0x7,%rcx - 114,181, //jb 1575 <_sk_store_f32_avx+0x69> + 114,181, //jb 1566 <_sk_store_f32_avx+0x69> 196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4) - 235,171, //jmp 1575 <_sk_store_f32_avx+0x69> + 235,171, //jmp 1566 <_sk_store_f32_avx+0x69> }; CODE const uint8_t sk_clamp_x_avx[] = { @@ -10371,7 +10332,7 @@ CODE const uint8_t sk_repeat_y_avx[] = { CODE const uint8_t sk_mirror_x_avx[] = { 72,173, //lods %ds:(%rsi),%rax - 197,122,16,0, //vmovss (%rax),%xmm8 + 197,121,110,0, //vmovd (%rax),%xmm8 196,65,121,112,200,0, //vpshufd $0x0,%xmm8,%xmm9 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9 196,65,124,92,209, //vsubps %ymm9,%ymm0,%ymm10 @@ -10398,7 +10359,7 @@ CODE const uint8_t sk_mirror_x_avx[] = { CODE const uint8_t sk_mirror_y_avx[] = { 72,173, //lods %ds:(%rsi),%rax - 197,122,16,0, //vmovss (%rax),%xmm8 + 197,121,110,0, //vmovd (%rax),%xmm8 196,65,121,112,200,0, //vpshufd $0x0,%xmm8,%xmm9 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9 196,65,116,92,209, //vsubps %ymm9,%ymm1,%ymm10 @@ -10888,20 +10849,17 @@ CODE const uint8_t sk_premul_sse41[] = { }; CODE const uint8_t sk_unpremul_sse41[] = { - 68,15,40,192, //movaps %xmm0,%xmm8 - 69,15,87,201, //xorps %xmm9,%xmm9 + 69,15,87,192, //xorps %xmm8,%xmm8 184,0,0,128,63, //mov $0x3f800000,%eax - 102,68,15,110,208, //movd %eax,%xmm10 - 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 - 68,15,94,211, //divps %xmm3,%xmm10 - 15,40,195, //movaps %xmm3,%xmm0 - 65,15,194,193,0, //cmpeqps %xmm9,%xmm0 - 102,69,15,56,20,209, //blendvps %xmm0,%xmm9,%xmm10 - 69,15,89,194, //mulps %xmm10,%xmm8 - 65,15,89,202, //mulps %xmm10,%xmm1 - 65,15,89,210, //mulps %xmm10,%xmm2 + 102,68,15,110,200, //movd %eax,%xmm9 + 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 + 68,15,94,203, //divps %xmm3,%xmm9 + 68,15,194,195,4, //cmpneqps %xmm3,%xmm8 + 69,15,84,193, //andps %xmm9,%xmm8 + 65,15,89,192, //mulps %xmm8,%xmm0 + 65,15,89,200, //mulps %xmm8,%xmm1 + 65,15,89,208, //mulps %xmm8,%xmm2 72,173, //lods %ds:(%rsi),%rax - 65,15,40,192, //movaps %xmm8,%xmm0 255,224, //jmpq *%rax }; @@ -11186,31 +11144,33 @@ CODE const uint8_t sk_load_tables_sse41[] = { 102,67,15,58,33,4,144,16, //insertps $0x10,(%r8,%r10,4),%xmm0 102,67,15,58,33,4,136,32, //insertps $0x20,(%r8,%r9,4),%xmm0 102,65,15,58,33,4,136,48, //insertps $0x30,(%r8,%rcx,4),%xmm0 - 72,139,72,16, //mov 0x10(%rax),%rcx - 102,73,15,58,22,200,1, //pextrq $0x1,%xmm1,%r8 - 69,137,193, //mov %r8d,%r9d - 73,193,232,32, //shr $0x20,%r8 - 102,73,15,126,202, //movq %xmm1,%r10 - 69,137,211, //mov %r10d,%r11d - 73,193,234,32, //shr $0x20,%r10 - 243,66,15,16,12,153, //movss (%rcx,%r11,4),%xmm1 - 102,66,15,58,33,12,145,16, //insertps $0x10,(%rcx,%r10,4),%xmm1 - 243,66,15,16,28,137, //movss (%rcx,%r9,4),%xmm3 + 76,139,64,16, //mov 0x10(%rax),%r8 + 102,73,15,58,22,202,1, //pextrq $0x1,%xmm1,%r10 + 77,137,209, //mov %r10,%r9 + 73,193,233,32, //shr $0x20,%r9 + 102,72,15,126,201, //movq %xmm1,%rcx + 65,137,203, //mov %ecx,%r11d + 65,129,227,255,255,255,0, //and $0xffffff,%r11d + 72,193,233,30, //shr $0x1e,%rcx + 65,129,226,255,255,255,0, //and $0xffffff,%r10d + 243,67,15,16,12,152, //movss (%r8,%r11,4),%xmm1 + 102,65,15,58,33,12,8,16, //insertps $0x10,(%r8,%rcx,1),%xmm1 + 243,67,15,16,28,144, //movss (%r8,%r10,4),%xmm3 102,15,58,33,203,32, //insertps $0x20,%xmm3,%xmm1 - 243,66,15,16,28,129, //movss (%rcx,%r8,4),%xmm3 + 243,67,15,16,28,136, //movss (%r8,%r9,4),%xmm3 102,15,58,33,203,48, //insertps $0x30,%xmm3,%xmm1 - 72,139,64,24, //mov 0x18(%rax),%rax + 76,139,72,24, //mov 0x18(%rax),%r9 102,72,15,58,22,209,1, //pextrq $0x1,%xmm2,%rcx - 65,137,200, //mov %ecx,%r8d + 68,15,183,193, //movzwl %cx,%r8d 72,193,233,32, //shr $0x20,%rcx - 102,73,15,126,209, //movq %xmm2,%r9 - 69,137,202, //mov %r9d,%r10d - 73,193,233,32, //shr $0x20,%r9 - 243,66,15,16,20,144, //movss (%rax,%r10,4),%xmm2 - 102,66,15,58,33,20,136,16, //insertps $0x10,(%rax,%r9,4),%xmm2 - 243,66,15,16,28,128, //movss (%rax,%r8,4),%xmm3 + 102,72,15,126,208, //movq %xmm2,%rax + 68,15,183,208, //movzwl %ax,%r10d + 72,193,232,30, //shr $0x1e,%rax + 243,67,15,16,20,145, //movss (%r9,%r10,4),%xmm2 + 102,65,15,58,33,20,1,16, //insertps $0x10,(%r9,%rax,1),%xmm2 + 243,67,15,16,28,129, //movss (%r9,%r8,4),%xmm3 102,15,58,33,211,32, //insertps $0x20,%xmm3,%xmm2 - 243,15,16,28,136, //movss (%rax,%rcx,4),%xmm3 + 243,65,15,16,28,137, //movss (%r9,%rcx,4),%xmm3 102,15,58,33,211,48, //insertps $0x30,%xmm3,%xmm2 102,65,15,114,208,24, //psrld $0x18,%xmm8 69,15,91,192, //cvtdq2ps %xmm8,%xmm8 @@ -11459,14 +11419,14 @@ CODE const uint8_t sk_store_f32_sse41[] = { 68,15,21,211, //unpckhps %xmm3,%xmm10 69,15,40,225, //movaps %xmm9,%xmm12 102,69,15,20,227, //unpcklpd %xmm11,%xmm12 - 102,69,15,21,203, //unpckhpd %xmm11,%xmm9 - 69,15,40,216, //movaps %xmm8,%xmm11 - 102,69,15,20,218, //unpcklpd %xmm10,%xmm11 - 102,69,15,21,194, //unpckhpd %xmm10,%xmm8 + 69,15,18,217, //movhlps %xmm9,%xmm11 + 69,15,40,200, //movaps %xmm8,%xmm9 + 102,69,15,20,202, //unpcklpd %xmm10,%xmm9 + 69,15,18,208, //movhlps %xmm8,%xmm10 102,68,15,17,36,8, //movupd %xmm12,(%rax,%rcx,1) - 102,68,15,17,76,8,16, //movupd %xmm9,0x10(%rax,%rcx,1) - 102,68,15,17,92,8,32, //movupd %xmm11,0x20(%rax,%rcx,1) - 102,68,15,17,68,8,48, //movupd %xmm8,0x30(%rax,%rcx,1) + 68,15,17,92,8,16, //movups %xmm11,0x10(%rax,%rcx,1) + 102,68,15,17,76,8,32, //movupd %xmm9,0x20(%rax,%rcx,1) + 68,15,17,84,8,48, //movups %xmm10,0x30(%rax,%rcx,1) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax }; @@ -12102,12 +12062,12 @@ CODE const uint8_t sk_premul_sse2[] = { CODE const uint8_t sk_unpremul_sse2[] = { 69,15,87,192, //xorps %xmm8,%xmm8 - 68,15,194,195,0, //cmpeqps %xmm3,%xmm8 184,0,0,128,63, //mov $0x3f800000,%eax 102,68,15,110,200, //movd %eax,%xmm9 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 68,15,94,203, //divps %xmm3,%xmm9 - 69,15,85,193, //andnps %xmm9,%xmm8 + 68,15,194,195,4, //cmpneqps %xmm3,%xmm8 + 69,15,84,193, //andps %xmm9,%xmm8 65,15,89,192, //mulps %xmm8,%xmm0 65,15,89,200, //mulps %xmm8,%xmm1 65,15,89,208, //mulps %xmm8,%xmm2 @@ -12397,34 +12357,36 @@ CODE const uint8_t sk_load_tables_sse2[] = { 243,67,15,16,12,136, //movss (%r8,%r9,4),%xmm1 15,20,193, //unpcklps %xmm1,%xmm0 15,20,195, //unpcklps %xmm3,%xmm0 - 72,139,72,16, //mov 0x10(%rax),%rcx + 76,139,64,16, //mov 0x10(%rax),%r8 102,65,15,112,201,78, //pshufd $0x4e,%xmm9,%xmm1 - 102,73,15,126,200, //movq %xmm1,%r8 - 69,137,193, //mov %r8d,%r9d - 73,193,232,32, //shr $0x20,%r8 - 102,77,15,126,202, //movq %xmm9,%r10 - 69,137,211, //mov %r10d,%r11d - 73,193,234,32, //shr $0x20,%r10 - 243,66,15,16,28,145, //movss (%rcx,%r10,4),%xmm3 - 243,66,15,16,12,129, //movss (%rcx,%r8,4),%xmm1 + 102,73,15,126,202, //movq %xmm1,%r10 + 77,137,209, //mov %r10,%r9 + 73,193,233,32, //shr $0x20,%r9 + 102,76,15,126,201, //movq %xmm9,%rcx + 65,137,203, //mov %ecx,%r11d + 65,129,227,255,255,255,0, //and $0xffffff,%r11d + 72,193,233,30, //shr $0x1e,%rcx + 65,129,226,255,255,255,0, //and $0xffffff,%r10d + 243,65,15,16,28,8, //movss (%r8,%rcx,1),%xmm3 + 243,67,15,16,12,136, //movss (%r8,%r9,4),%xmm1 15,20,217, //unpcklps %xmm1,%xmm3 - 243,66,15,16,12,153, //movss (%rcx,%r11,4),%xmm1 - 243,66,15,16,20,137, //movss (%rcx,%r9,4),%xmm2 + 243,67,15,16,12,152, //movss (%r8,%r11,4),%xmm1 + 243,67,15,16,20,144, //movss (%r8,%r10,4),%xmm2 15,20,202, //unpcklps %xmm2,%xmm1 15,20,203, //unpcklps %xmm3,%xmm1 - 72,139,64,24, //mov 0x18(%rax),%rax + 76,139,72,24, //mov 0x18(%rax),%r9 102,65,15,112,210,78, //pshufd $0x4e,%xmm10,%xmm2 102,72,15,126,209, //movq %xmm2,%rcx - 65,137,200, //mov %ecx,%r8d + 68,15,183,193, //movzwl %cx,%r8d 72,193,233,32, //shr $0x20,%rcx - 102,77,15,126,209, //movq %xmm10,%r9 - 69,137,202, //mov %r9d,%r10d - 73,193,233,32, //shr $0x20,%r9 - 243,70,15,16,12,136, //movss (%rax,%r9,4),%xmm9 - 243,15,16,20,136, //movss (%rax,%rcx,4),%xmm2 + 102,76,15,126,208, //movq %xmm10,%rax + 68,15,183,208, //movzwl %ax,%r10d + 72,193,232,30, //shr $0x1e,%rax + 243,69,15,16,12,1, //movss (%r9,%rax,1),%xmm9 + 243,65,15,16,20,137, //movss (%r9,%rcx,4),%xmm2 68,15,20,202, //unpcklps %xmm2,%xmm9 - 243,66,15,16,20,144, //movss (%rax,%r10,4),%xmm2 - 243,66,15,16,28,128, //movss (%rax,%r8,4),%xmm3 + 243,67,15,16,20,145, //movss (%r9,%r10,4),%xmm2 + 243,67,15,16,28,129, //movss (%r9,%r8,4),%xmm3 15,20,211, //unpcklps %xmm3,%xmm2 65,15,20,209, //unpcklps %xmm9,%xmm2 102,65,15,114,208,24, //psrld $0x18,%xmm8 @@ -12685,14 +12647,14 @@ CODE const uint8_t sk_store_f32_sse2[] = { 68,15,21,211, //unpckhps %xmm3,%xmm10 69,15,40,225, //movaps %xmm9,%xmm12 102,69,15,20,227, //unpcklpd %xmm11,%xmm12 - 102,69,15,21,203, //unpckhpd %xmm11,%xmm9 - 69,15,40,216, //movaps %xmm8,%xmm11 - 102,69,15,20,218, //unpcklpd %xmm10,%xmm11 - 102,69,15,21,194, //unpckhpd %xmm10,%xmm8 + 69,15,18,217, //movhlps %xmm9,%xmm11 + 69,15,40,200, //movaps %xmm8,%xmm9 + 102,69,15,20,202, //unpcklpd %xmm10,%xmm9 + 69,15,18,208, //movhlps %xmm8,%xmm10 102,68,15,17,36,8, //movupd %xmm12,(%rax,%rcx,1) - 102,68,15,17,76,8,16, //movupd %xmm9,0x10(%rax,%rcx,1) - 102,68,15,17,92,8,32, //movupd %xmm11,0x20(%rax,%rcx,1) - 102,68,15,17,68,8,48, //movupd %xmm8,0x30(%rax,%rcx,1) + 68,15,17,92,8,16, //movups %xmm11,0x10(%rax,%rcx,1) + 102,68,15,17,76,8,32, //movupd %xmm9,0x20(%rax,%rcx,1) + 68,15,17,84,8,48, //movups %xmm10,0x30(%rax,%rcx,1) 72,173, //lods %ds:(%rsi),%rax 255,224, //jmpq *%rax }; |