-rw-r--r--  src/jumper/SkJumper.cpp               9
-rw-r--r--  src/jumper/SkJumper_generated.S     616
-rw-r--r--  src/jumper/SkJumper_generated_win.S 402
-rw-r--r--  src/jumper/SkJumper_stages.cpp       69
4 files changed, 1001 insertions(+), 95 deletions(-)
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index d38090d180..0821666ba9 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -7,13 +7,14 @@
#include "SkCpu.h"
#include "SkJumper.h"
-#include "SkOnce.h"
#include "SkRasterPipeline.h"
#include "SkTemplates.h"
-#include <atomic>
// A debugging mode that helps prioritize porting stages to SkJumper.
#if 0
+ #include "SkOnce.h"
+ #include <atomic>
+
#define M(st) {0},
static std::atomic<int> gMissing[] = { SK_RASTER_PIPELINE_STAGES(M) };
#undef M
@@ -82,6 +83,10 @@ static K kConstants = {
M(matrix_3x4) \
M(clamp_x) \
M(clamp_y) \
+ M(repeat_x) \
+ M(repeat_y) \
+ M(mirror_x) \
+ M(mirror_y) \
M(linear_gradient_2stops)
// We can't express the real types of most stage functions portably, so we use a stand-in.
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index 973534a65b..071aeea230 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -698,23 +698,89 @@ _sk_store_f16_aarch64:
.globl _sk_clamp_x_aarch64
_sk_clamp_x_aarch64:
.long 0xa8c10c28 // ldp x8, x3, [x1],#16
- .long 0x6f07e7f0 // movi v16.2d, #0xffffffffffffffff
- .long 0x6f00e411 // movi v17.2d, #0x0
- .long 0x4d40c912 // ld1r {v18.4s}, [x8]
- .long 0x4eb08650 // add v16.4s, v18.4s, v16.4s
+ .long 0x6f00e410 // movi v16.2d, #0x0
+ .long 0x4e20f600 // fmax v0.4s, v16.4s, v0.4s
+ .long 0x6f07e7f1 // movi v17.2d, #0xffffffffffffffff
+ .long 0x4d40c910 // ld1r {v16.4s}, [x8]
+ .long 0x4eb18610 // add v16.4s, v16.4s, v17.4s
.long 0x4eb0f400 // fmin v0.4s, v0.4s, v16.4s
- .long 0x4e20f620 // fmax v0.4s, v17.4s, v0.4s
.long 0xd61f0060 // br x3
.globl _sk_clamp_y_aarch64
_sk_clamp_y_aarch64:
.long 0xa8c10c28 // ldp x8, x3, [x1],#16
+ .long 0x6f00e410 // movi v16.2d, #0x0
+ .long 0x4e21f601 // fmax v1.4s, v16.4s, v1.4s
+ .long 0x6f07e7f1 // movi v17.2d, #0xffffffffffffffff
+ .long 0x4d40c910 // ld1r {v16.4s}, [x8]
+ .long 0x4eb18610 // add v16.4s, v16.4s, v17.4s
+ .long 0x4eb0f421 // fmin v1.4s, v1.4s, v16.4s
+ .long 0xd61f0060 // br x3
+
+.globl _sk_repeat_x_aarch64
+_sk_repeat_x_aarch64:
+ .long 0xa8c10c28 // ldp x8, x3, [x1],#16
.long 0x6f07e7f0 // movi v16.2d, #0xffffffffffffffff
- .long 0x6f00e411 // movi v17.2d, #0x0
- .long 0x4d40c912 // ld1r {v18.4s}, [x8]
+ .long 0xbd400111 // ldr s17, [x8]
+ .long 0x4e040632 // dup v18.4s, v17.s[0]
+ .long 0x4eb08650 // add v16.4s, v18.4s, v16.4s
+ .long 0x6e32fc12 // fdiv v18.4s, v0.4s, v18.4s
+ .long 0x4e219a52 // frintm v18.4s, v18.4s
+ .long 0x4f919251 // fmul v17.4s, v18.4s, v17.s[0]
+ .long 0x4eb1d400 // fsub v0.4s, v0.4s, v17.4s
+ .long 0x4eb0f400 // fmin v0.4s, v0.4s, v16.4s
+ .long 0xd61f0060 // br x3
+
+.globl _sk_repeat_y_aarch64
+_sk_repeat_y_aarch64:
+ .long 0xa8c10c28 // ldp x8, x3, [x1],#16
+ .long 0x6f07e7f0 // movi v16.2d, #0xffffffffffffffff
+ .long 0xbd400111 // ldr s17, [x8]
+ .long 0x4e040632 // dup v18.4s, v17.s[0]
+ .long 0x4eb08650 // add v16.4s, v18.4s, v16.4s
+ .long 0x6e32fc32 // fdiv v18.4s, v1.4s, v18.4s
+ .long 0x4e219a52 // frintm v18.4s, v18.4s
+ .long 0x4f919251 // fmul v17.4s, v18.4s, v17.s[0]
+ .long 0x4eb1d421 // fsub v1.4s, v1.4s, v17.4s
+ .long 0x4eb0f421 // fmin v1.4s, v1.4s, v16.4s
+ .long 0xd61f0060 // br x3
+
+.globl _sk_mirror_x_aarch64
+_sk_mirror_x_aarch64:
+ .long 0xa8c10c28 // ldp x8, x3, [x1],#16
+ .long 0x6f07e7f0 // movi v16.2d, #0xffffffffffffffff
+ .long 0xbd400111 // ldr s17, [x8]
+ .long 0x4e040632 // dup v18.4s, v17.s[0]
+ .long 0x1e312a31 // fadd s17, s17, s17
+ .long 0x4eb2d400 // fsub v0.4s, v0.4s, v18.4s
+ .long 0x4e040633 // dup v19.4s, v17.s[0]
+ .long 0x6e33fc13 // fdiv v19.4s, v0.4s, v19.4s
+ .long 0x4e219a73 // frintm v19.4s, v19.4s
+ .long 0x4f919271 // fmul v17.4s, v19.4s, v17.s[0]
+ .long 0x4eb1d400 // fsub v0.4s, v0.4s, v17.4s
+ .long 0x4eb2d400 // fsub v0.4s, v0.4s, v18.4s
+ .long 0x4ea0f800 // fabs v0.4s, v0.4s
+ .long 0x4eb08650 // add v16.4s, v18.4s, v16.4s
+ .long 0x4eb0f400 // fmin v0.4s, v0.4s, v16.4s
+ .long 0xd61f0060 // br x3
+
+.globl _sk_mirror_y_aarch64
+_sk_mirror_y_aarch64:
+ .long 0xa8c10c28 // ldp x8, x3, [x1],#16
+ .long 0x6f07e7f0 // movi v16.2d, #0xffffffffffffffff
+ .long 0xbd400111 // ldr s17, [x8]
+ .long 0x4e040632 // dup v18.4s, v17.s[0]
+ .long 0x1e312a31 // fadd s17, s17, s17
+ .long 0x4eb2d421 // fsub v1.4s, v1.4s, v18.4s
+ .long 0x4e040633 // dup v19.4s, v17.s[0]
+ .long 0x6e33fc33 // fdiv v19.4s, v1.4s, v19.4s
+ .long 0x4e219a73 // frintm v19.4s, v19.4s
+ .long 0x4f919271 // fmul v17.4s, v19.4s, v17.s[0]
+ .long 0x4eb1d421 // fsub v1.4s, v1.4s, v17.4s
+ .long 0x4eb2d421 // fsub v1.4s, v1.4s, v18.4s
+ .long 0x4ea0f821 // fabs v1.4s, v1.4s
.long 0x4eb08650 // add v16.4s, v18.4s, v16.4s
.long 0x4eb0f421 // fmin v1.4s, v1.4s, v16.4s
- .long 0x4e21f621 // fmax v1.4s, v17.4s, v1.4s
.long 0xd61f0060 // br x3
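
For reference, the four new aarch64 stages above implement the same math as the portable versions in SkJumper_stages.cpp at the end of this diff (frintm rounds toward negative infinity, i.e. floor). A rough scalar sketch of what _sk_mirror_x computes, assuming limit is the float the stage loads from its context pointer; the names here are illustrative, not from the source:

    #include <math.h>

    // Reflect v into [0, limit): wrap with period 2*limit, fold the upper
    // half back down, then clamp to one ulp below limit.
    static float mirror1(float v, float limit) {
        float two = limit + limit;
        v -= limit;
        v -= two * floorf(v / two);                 // now v is in [0, 2*limit)
        v  = fabsf(v - limit);                      // fold into [0, limit]
        return fminf(v, nextafterf(limit, 0.0f));   // keep strictly below limit
    }
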
.globl _sk_matrix_2x3_aarch64
@@ -1523,28 +1589,136 @@ _sk_store_f16_vfp4:
.globl _sk_clamp_x_vfp4
_sk_clamp_x_vfp4:
+ .long 0xf2c00010 // vmov.i32 d16, #0
.long 0xe5913000 // ldr r3, [r1]
- .long 0xf3c70e1f // vmov.i8 d16, #255
+ .long 0xf3c71e1f // vmov.i8 d17, #255
.long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xf2400f80 // vmax.f32 d16, d16, d0
.long 0xe2811008 // add r1, r1, #8
- .long 0xf4e31c9f // vld1.32 {d17[]}, [r3 :32]
- .long 0xf26108a0 // vadd.i32 d16, d17, d16
- .long 0xf2c01010 // vmov.i32 d17, #0
- .long 0xf2600f20 // vmin.f32 d16, d0, d16
- .long 0xf2010fa0 // vmax.f32 d0, d17, d16
+ .long 0xf4e32c9f // vld1.32 {d18[]}, [r3 :32]
+ .long 0xf26218a1 // vadd.i32 d17, d18, d17
+ .long 0xf2200fa1 // vmin.f32 d0, d16, d17
.long 0xe12fff1c // bx ip
.globl _sk_clamp_y_vfp4
_sk_clamp_y_vfp4:
+ .long 0xf2c00010 // vmov.i32 d16, #0
.long 0xe5913000 // ldr r3, [r1]
- .long 0xf3c70e1f // vmov.i8 d16, #255
+ .long 0xf3c71e1f // vmov.i8 d17, #255
.long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xf2400f81 // vmax.f32 d16, d16, d1
.long 0xe2811008 // add r1, r1, #8
- .long 0xf4e31c9f // vld1.32 {d17[]}, [r3 :32]
- .long 0xf26108a0 // vadd.i32 d16, d17, d16
- .long 0xf2c01010 // vmov.i32 d17, #0
- .long 0xf2610f20 // vmin.f32 d16, d1, d16
- .long 0xf2011fa0 // vmax.f32 d1, d17, d16
+ .long 0xf4e32c9f // vld1.32 {d18[]}, [r3 :32]
+ .long 0xf26218a1 // vadd.i32 d17, d18, d17
+ .long 0xf2201fa1 // vmin.f32 d1, d16, d17
+ .long 0xe12fff1c // bx ip
+
+.globl _sk_repeat_x_vfp4
+_sk_repeat_x_vfp4:
+ .long 0xed2d8b04 // vpush {d8-d9}
+ .long 0xe5913000 // ldr r3, [r1]
+ .long 0xf2c02010 // vmov.i32 d18, #0
+ .long 0xf4e23c9f // vld1.32 {d19[]}, [r2 :32]
+ .long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xe2811008 // add r1, r1, #8
+ .long 0xed938a00 // vldr s16, [r3]
+ .long 0xeec09a88 // vdiv.f32 s19, s1, s16
+ .long 0xee809a08 // vdiv.f32 s18, s0, s16
+ .long 0xf3fb0709 // vcvt.s32.f32 d16, d9
+ .long 0xf3fb0620 // vcvt.f32.s32 d16, d16
+ .long 0xf3601e89 // vcgt.f32 d17, d16, d9
+ .long 0xf35311b2 // vbsl d17, d19, d18
+ .long 0xf3f42c08 // vdup.32 d18, d8[0]
+ .long 0xf2600da1 // vsub.f32 d16, d16, d17
+ .long 0xf3c71e1f // vmov.i8 d17, #255
+ .long 0xf26218a1 // vadd.i32 d17, d18, d17
+ .long 0xf2e009c8 // vmul.f32 d16, d16, d8[0]
+ .long 0xf2600d20 // vsub.f32 d16, d0, d16
+ .long 0xf2200fa1 // vmin.f32 d0, d16, d17
+ .long 0xecbd8b04 // vpop {d8-d9}
+ .long 0xe12fff1c // bx ip
+
+.globl _sk_repeat_y_vfp4
+_sk_repeat_y_vfp4:
+ .long 0xed2d8b04 // vpush {d8-d9}
+ .long 0xe5913000 // ldr r3, [r1]
+ .long 0xf2c02010 // vmov.i32 d18, #0
+ .long 0xf4e23c9f // vld1.32 {d19[]}, [r2 :32]
+ .long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xe2811008 // add r1, r1, #8
+ .long 0xed938a00 // vldr s16, [r3]
+ .long 0xeec19a88 // vdiv.f32 s19, s3, s16
+ .long 0xee819a08 // vdiv.f32 s18, s2, s16
+ .long 0xf3fb0709 // vcvt.s32.f32 d16, d9
+ .long 0xf3fb0620 // vcvt.f32.s32 d16, d16
+ .long 0xf3601e89 // vcgt.f32 d17, d16, d9
+ .long 0xf35311b2 // vbsl d17, d19, d18
+ .long 0xf3f42c08 // vdup.32 d18, d8[0]
+ .long 0xf2600da1 // vsub.f32 d16, d16, d17
+ .long 0xf3c71e1f // vmov.i8 d17, #255
+ .long 0xf26218a1 // vadd.i32 d17, d18, d17
+ .long 0xf2e009c8 // vmul.f32 d16, d16, d8[0]
+ .long 0xf2610d20 // vsub.f32 d16, d1, d16
+ .long 0xf2201fa1 // vmin.f32 d1, d16, d17
+ .long 0xecbd8b04 // vpop {d8-d9}
+ .long 0xe12fff1c // bx ip
+
+.globl _sk_mirror_x_vfp4
+_sk_mirror_x_vfp4:
+ .long 0xed2d8b04 // vpush {d8-d9}
+ .long 0xe5913000 // ldr r3, [r1]
+ .long 0xf2c03010 // vmov.i32 d19, #0
+ .long 0xf4e24c9f // vld1.32 {d20[]}, [r2 :32]
+ .long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xe2811008 // add r1, r1, #8
+ .long 0xed938a00 // vldr s16, [r3]
+ .long 0xee389a08 // vadd.f32 s18, s16, s16
+ .long 0xf3f40c08 // vdup.32 d16, d8[0]
+ .long 0xf2200d20 // vsub.f32 d0, d0, d16
+ .long 0xeec08a89 // vdiv.f32 s17, s1, s18
+ .long 0xee808a09 // vdiv.f32 s16, s0, s18
+ .long 0xf3fb1708 // vcvt.s32.f32 d17, d8
+ .long 0xf3fb1621 // vcvt.f32.s32 d17, d17
+ .long 0xf3612e88 // vcgt.f32 d18, d17, d8
+ .long 0xf35421b3 // vbsl d18, d20, d19
+ .long 0xf2611da2 // vsub.f32 d17, d17, d18
+ .long 0xf3c72e1f // vmov.i8 d18, #255
+ .long 0xf2e119c9 // vmul.f32 d17, d17, d9[0]
+ .long 0xf2601d21 // vsub.f32 d17, d0, d17
+ .long 0xf2611da0 // vsub.f32 d17, d17, d16
+ .long 0xf26008a2 // vadd.i32 d16, d16, d18
+ .long 0xf3f91721 // vabs.f32 d17, d17
+ .long 0xf2210fa0 // vmin.f32 d0, d17, d16
+ .long 0xecbd8b04 // vpop {d8-d9}
+ .long 0xe12fff1c // bx ip
+
+.globl _sk_mirror_y_vfp4
+_sk_mirror_y_vfp4:
+ .long 0xed2d8b04 // vpush {d8-d9}
+ .long 0xe5913000 // ldr r3, [r1]
+ .long 0xf2c03010 // vmov.i32 d19, #0
+ .long 0xf4e24c9f // vld1.32 {d20[]}, [r2 :32]
+ .long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xe2811008 // add r1, r1, #8
+ .long 0xed938a00 // vldr s16, [r3]
+ .long 0xee389a08 // vadd.f32 s18, s16, s16
+ .long 0xf3f40c08 // vdup.32 d16, d8[0]
+ .long 0xf2211d20 // vsub.f32 d1, d1, d16
+ .long 0xeec18a89 // vdiv.f32 s17, s3, s18
+ .long 0xee818a09 // vdiv.f32 s16, s2, s18
+ .long 0xf3fb1708 // vcvt.s32.f32 d17, d8
+ .long 0xf3fb1621 // vcvt.f32.s32 d17, d17
+ .long 0xf3612e88 // vcgt.f32 d18, d17, d8
+ .long 0xf35421b3 // vbsl d18, d20, d19
+ .long 0xf2611da2 // vsub.f32 d17, d17, d18
+ .long 0xf3c72e1f // vmov.i8 d18, #255
+ .long 0xf2e119c9 // vmul.f32 d17, d17, d9[0]
+ .long 0xf2611d21 // vsub.f32 d17, d1, d17
+ .long 0xf2611da0 // vsub.f32 d17, d17, d16
+ .long 0xf26008a2 // vadd.i32 d16, d16, d18
+ .long 0xf3f91721 // vabs.f32 d17, d17
+ .long 0xf2211fa0 // vmin.f32 d1, d17, d16
+ .long 0xecbd8b04 // vpop {d8-d9}
.long 0xe12fff1c // bx ip
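
Note: in these ARMv7/VFP4 variants the divides are scalar vdiv.f32 on s-registers (NEON on ARMv7 has no packed float divide), and d8/d9 are callee-saved under AAPCS, which is why the repeat and mirror bodies are bracketed by vpush/vpop {d8-d9}.
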
.globl _sk_matrix_2x3_vfp4
@@ -2236,24 +2410,96 @@ _sk_store_f16_hsw:
.globl _sk_clamp_x_hsw
_sk_clamp_x_hsw:
.byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 196,65,60,87,192 // vxorps %ymm8,%ymm8,%ymm8
+ .byte 197,188,95,192 // vmaxps %ymm0,%ymm8,%ymm0
.byte 196,98,125,88,0 // vpbroadcastd (%rax),%ymm8
.byte 196,65,53,118,201 // vpcmpeqd %ymm9,%ymm9,%ymm9
.byte 196,65,61,254,193 // vpaddd %ymm9,%ymm8,%ymm8
.byte 196,193,124,93,192 // vminps %ymm8,%ymm0,%ymm0
- .byte 196,65,60,87,192 // vxorps %ymm8,%ymm8,%ymm8
- .byte 197,188,95,192 // vmaxps %ymm0,%ymm8,%ymm0
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
.globl _sk_clamp_y_hsw
_sk_clamp_y_hsw:
.byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 196,65,60,87,192 // vxorps %ymm8,%ymm8,%ymm8
+ .byte 197,188,95,201 // vmaxps %ymm1,%ymm8,%ymm1
.byte 196,98,125,88,0 // vpbroadcastd (%rax),%ymm8
.byte 196,65,53,118,201 // vpcmpeqd %ymm9,%ymm9,%ymm9
.byte 196,65,61,254,193 // vpaddd %ymm9,%ymm8,%ymm8
.byte 196,193,116,93,200 // vminps %ymm8,%ymm1,%ymm1
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+.globl _sk_repeat_x_hsw
+_sk_repeat_x_hsw:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 196,98,125,24,0 // vbroadcastss (%rax),%ymm8
+ .byte 196,65,124,94,200 // vdivps %ymm8,%ymm0,%ymm9
+ .byte 196,67,125,8,201,1 // vroundps $0x1,%ymm9,%ymm9
+ .byte 196,65,52,89,200 // vmulps %ymm8,%ymm9,%ymm9
+ .byte 196,193,124,92,193 // vsubps %ymm9,%ymm0,%ymm0
+ .byte 196,65,53,118,201 // vpcmpeqd %ymm9,%ymm9,%ymm9
+ .byte 196,65,61,254,193 // vpaddd %ymm9,%ymm8,%ymm8
+ .byte 196,193,124,93,192 // vminps %ymm8,%ymm0,%ymm0
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+.globl _sk_repeat_y_hsw
+_sk_repeat_y_hsw:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 196,98,125,24,0 // vbroadcastss (%rax),%ymm8
+ .byte 196,65,116,94,200 // vdivps %ymm8,%ymm1,%ymm9
+ .byte 196,67,125,8,201,1 // vroundps $0x1,%ymm9,%ymm9
+ .byte 196,65,52,89,200 // vmulps %ymm8,%ymm9,%ymm9
+ .byte 196,193,116,92,201 // vsubps %ymm9,%ymm1,%ymm1
+ .byte 196,65,53,118,201 // vpcmpeqd %ymm9,%ymm9,%ymm9
+ .byte 196,65,61,254,193 // vpaddd %ymm9,%ymm8,%ymm8
+ .byte 196,193,116,93,200 // vminps %ymm8,%ymm1,%ymm1
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+.globl _sk_mirror_x_hsw
+_sk_mirror_x_hsw:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 197,122,16,0 // vmovss (%rax),%xmm8
+ .byte 196,66,125,24,200 // vbroadcastss %xmm8,%ymm9
+ .byte 196,65,124,92,209 // vsubps %ymm9,%ymm0,%ymm10
+ .byte 196,193,58,88,192 // vaddss %xmm8,%xmm8,%xmm0
+ .byte 196,226,125,24,192 // vbroadcastss %xmm0,%ymm0
+ .byte 197,44,94,192 // vdivps %ymm0,%ymm10,%ymm8
+ .byte 196,67,125,8,192,1 // vroundps $0x1,%ymm8,%ymm8
+ .byte 197,188,89,192 // vmulps %ymm0,%ymm8,%ymm0
+ .byte 197,172,92,192 // vsubps %ymm0,%ymm10,%ymm0
+ .byte 196,193,124,92,193 // vsubps %ymm9,%ymm0,%ymm0
.byte 196,65,60,87,192 // vxorps %ymm8,%ymm8,%ymm8
- .byte 197,188,95,201 // vmaxps %ymm1,%ymm8,%ymm1
+ .byte 197,60,92,192 // vsubps %ymm0,%ymm8,%ymm8
+ .byte 197,188,84,192 // vandps %ymm0,%ymm8,%ymm0
+ .byte 196,65,61,118,192 // vpcmpeqd %ymm8,%ymm8,%ymm8
+ .byte 196,65,53,254,192 // vpaddd %ymm8,%ymm9,%ymm8
+ .byte 196,193,124,93,192 // vminps %ymm8,%ymm0,%ymm0
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+.globl _sk_mirror_y_hsw
+_sk_mirror_y_hsw:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 197,122,16,0 // vmovss (%rax),%xmm8
+ .byte 196,66,125,24,200 // vbroadcastss %xmm8,%ymm9
+ .byte 196,65,116,92,209 // vsubps %ymm9,%ymm1,%ymm10
+ .byte 196,193,58,88,200 // vaddss %xmm8,%xmm8,%xmm1
+ .byte 196,226,125,24,201 // vbroadcastss %xmm1,%ymm1
+ .byte 197,44,94,193 // vdivps %ymm1,%ymm10,%ymm8
+ .byte 196,67,125,8,192,1 // vroundps $0x1,%ymm8,%ymm8
+ .byte 197,188,89,201 // vmulps %ymm1,%ymm8,%ymm1
+ .byte 197,172,92,201 // vsubps %ymm1,%ymm10,%ymm1
+ .byte 196,193,116,92,201 // vsubps %ymm9,%ymm1,%ymm1
+ .byte 196,65,60,87,192 // vxorps %ymm8,%ymm8,%ymm8
+ .byte 197,60,92,193 // vsubps %ymm1,%ymm8,%ymm8
+ .byte 197,188,84,201 // vandps %ymm1,%ymm8,%ymm1
+ .byte 196,65,61,118,192 // vpcmpeqd %ymm8,%ymm8,%ymm8
+ .byte 196,65,53,254,192 // vpaddd %ymm8,%ymm9,%ymm8
+ .byte 196,193,116,93,200 // vminps %ymm8,%ymm1,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
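
Note: vroundps with immediate 0x1 rounds toward negative infinity, so it acts as a vectorized floor here. The abs in the mirror stages needs no mask constant: vxorps zeroes a register, vsubps forms 0-x, and vandps of x with 0-x clears the sign bit, since the two values differ only there.
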
@@ -3131,30 +3377,118 @@ _sk_store_f16_avx:
.globl _sk_clamp_x_avx
_sk_clamp_x_avx:
.byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 196,65,60,87,192 // vxorps %ymm8,%ymm8,%ymm8
+ .byte 197,60,95,200 // vmaxps %ymm0,%ymm8,%ymm9
.byte 196,98,125,24,0 // vbroadcastss (%rax),%ymm8
- .byte 196,67,125,25,193,1 // vextractf128 $0x1,%ymm8,%xmm9
+ .byte 196,99,125,25,192,1 // vextractf128 $0x1,%ymm8,%xmm0
.byte 196,65,41,118,210 // vpcmpeqd %xmm10,%xmm10,%xmm10
- .byte 196,65,49,254,202 // vpaddd %xmm10,%xmm9,%xmm9
+ .byte 196,193,121,254,194 // vpaddd %xmm10,%xmm0,%xmm0
.byte 196,65,57,254,194 // vpaddd %xmm10,%xmm8,%xmm8
- .byte 196,67,61,24,193,1 // vinsertf128 $0x1,%xmm9,%ymm8,%ymm8
- .byte 196,193,124,93,192 // vminps %ymm8,%ymm0,%ymm0
- .byte 196,65,60,87,192 // vxorps %ymm8,%ymm8,%ymm8
- .byte 197,188,95,192 // vmaxps %ymm0,%ymm8,%ymm0
+ .byte 196,227,61,24,192,1 // vinsertf128 $0x1,%xmm0,%ymm8,%ymm0
+ .byte 197,180,93,192 // vminps %ymm0,%ymm9,%ymm0
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
.globl _sk_clamp_y_avx
_sk_clamp_y_avx:
.byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 196,65,60,87,192 // vxorps %ymm8,%ymm8,%ymm8
+ .byte 197,60,95,201 // vmaxps %ymm1,%ymm8,%ymm9
.byte 196,98,125,24,0 // vbroadcastss (%rax),%ymm8
- .byte 196,67,125,25,193,1 // vextractf128 $0x1,%ymm8,%xmm9
+ .byte 196,99,125,25,193,1 // vextractf128 $0x1,%ymm8,%xmm1
.byte 196,65,41,118,210 // vpcmpeqd %xmm10,%xmm10,%xmm10
- .byte 196,65,49,254,202 // vpaddd %xmm10,%xmm9,%xmm9
+ .byte 196,193,113,254,202 // vpaddd %xmm10,%xmm1,%xmm1
.byte 196,65,57,254,194 // vpaddd %xmm10,%xmm8,%xmm8
- .byte 196,67,61,24,193,1 // vinsertf128 $0x1,%xmm9,%ymm8,%ymm8
- .byte 196,193,116,93,200 // vminps %ymm8,%ymm1,%ymm1
+ .byte 196,227,61,24,201,1 // vinsertf128 $0x1,%xmm1,%ymm8,%ymm1
+ .byte 197,180,93,201 // vminps %ymm1,%ymm9,%ymm1
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+.globl _sk_repeat_x_avx
+_sk_repeat_x_avx:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 196,98,125,24,0 // vbroadcastss (%rax),%ymm8
+ .byte 196,65,124,94,200 // vdivps %ymm8,%ymm0,%ymm9
+ .byte 196,67,125,8,201,1 // vroundps $0x1,%ymm9,%ymm9
+ .byte 196,65,52,89,200 // vmulps %ymm8,%ymm9,%ymm9
+ .byte 196,65,124,92,201 // vsubps %ymm9,%ymm0,%ymm9
+ .byte 196,99,125,25,192,1 // vextractf128 $0x1,%ymm8,%xmm0
+ .byte 196,65,41,118,210 // vpcmpeqd %xmm10,%xmm10,%xmm10
+ .byte 196,193,121,254,194 // vpaddd %xmm10,%xmm0,%xmm0
+ .byte 196,65,57,254,194 // vpaddd %xmm10,%xmm8,%xmm8
+ .byte 196,227,61,24,192,1 // vinsertf128 $0x1,%xmm0,%ymm8,%ymm0
+ .byte 197,180,93,192 // vminps %ymm0,%ymm9,%ymm0
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+.globl _sk_repeat_y_avx
+_sk_repeat_y_avx:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 196,98,125,24,0 // vbroadcastss (%rax),%ymm8
+ .byte 196,65,116,94,200 // vdivps %ymm8,%ymm1,%ymm9
+ .byte 196,67,125,8,201,1 // vroundps $0x1,%ymm9,%ymm9
+ .byte 196,65,52,89,200 // vmulps %ymm8,%ymm9,%ymm9
+ .byte 196,65,116,92,201 // vsubps %ymm9,%ymm1,%ymm9
+ .byte 196,99,125,25,193,1 // vextractf128 $0x1,%ymm8,%xmm1
+ .byte 196,65,41,118,210 // vpcmpeqd %xmm10,%xmm10,%xmm10
+ .byte 196,193,113,254,202 // vpaddd %xmm10,%xmm1,%xmm1
+ .byte 196,65,57,254,194 // vpaddd %xmm10,%xmm8,%xmm8
+ .byte 196,227,61,24,201,1 // vinsertf128 $0x1,%xmm1,%ymm8,%ymm1
+ .byte 197,180,93,201 // vminps %ymm1,%ymm9,%ymm1
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+.globl _sk_mirror_x_avx
+_sk_mirror_x_avx:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 197,122,16,0 // vmovss (%rax),%xmm8
+ .byte 196,65,121,112,200,0 // vpshufd $0x0,%xmm8,%xmm9
+ .byte 196,67,53,24,201,1 // vinsertf128 $0x1,%xmm9,%ymm9,%ymm9
+ .byte 196,65,124,92,209 // vsubps %ymm9,%ymm0,%ymm10
+ .byte 196,193,58,88,192 // vaddss %xmm8,%xmm8,%xmm0
+ .byte 196,227,121,4,192,0 // vpermilps $0x0,%xmm0,%xmm0
+ .byte 196,227,125,24,192,1 // vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
+ .byte 197,44,94,192 // vdivps %ymm0,%ymm10,%ymm8
+ .byte 196,67,125,8,192,1 // vroundps $0x1,%ymm8,%ymm8
+ .byte 197,188,89,192 // vmulps %ymm0,%ymm8,%ymm0
+ .byte 197,172,92,192 // vsubps %ymm0,%ymm10,%ymm0
+ .byte 196,193,124,92,193 // vsubps %ymm9,%ymm0,%ymm0
.byte 196,65,60,87,192 // vxorps %ymm8,%ymm8,%ymm8
- .byte 197,188,95,201 // vmaxps %ymm1,%ymm8,%ymm1
+ .byte 197,60,92,192 // vsubps %ymm0,%ymm8,%ymm8
+ .byte 197,60,84,192 // vandps %ymm0,%ymm8,%ymm8
+ .byte 196,99,125,25,200,1 // vextractf128 $0x1,%ymm9,%xmm0
+ .byte 196,65,41,118,210 // vpcmpeqd %xmm10,%xmm10,%xmm10
+ .byte 196,193,121,254,194 // vpaddd %xmm10,%xmm0,%xmm0
+ .byte 196,65,49,254,202 // vpaddd %xmm10,%xmm9,%xmm9
+ .byte 196,227,53,24,192,1 // vinsertf128 $0x1,%xmm0,%ymm9,%ymm0
+ .byte 197,188,93,192 // vminps %ymm0,%ymm8,%ymm0
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+.globl _sk_mirror_y_avx
+_sk_mirror_y_avx:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 197,122,16,0 // vmovss (%rax),%xmm8
+ .byte 196,65,121,112,200,0 // vpshufd $0x0,%xmm8,%xmm9
+ .byte 196,67,53,24,201,1 // vinsertf128 $0x1,%xmm9,%ymm9,%ymm9
+ .byte 196,65,116,92,209 // vsubps %ymm9,%ymm1,%ymm10
+ .byte 196,193,58,88,200 // vaddss %xmm8,%xmm8,%xmm1
+ .byte 196,227,121,4,201,0 // vpermilps $0x0,%xmm1,%xmm1
+ .byte 196,227,117,24,201,1 // vinsertf128 $0x1,%xmm1,%ymm1,%ymm1
+ .byte 197,44,94,193 // vdivps %ymm1,%ymm10,%ymm8
+ .byte 196,67,125,8,192,1 // vroundps $0x1,%ymm8,%ymm8
+ .byte 197,188,89,201 // vmulps %ymm1,%ymm8,%ymm1
+ .byte 197,172,92,201 // vsubps %ymm1,%ymm10,%ymm1
+ .byte 196,193,116,92,201 // vsubps %ymm9,%ymm1,%ymm1
+ .byte 196,65,60,87,192 // vxorps %ymm8,%ymm8,%ymm8
+ .byte 197,60,92,193 // vsubps %ymm1,%ymm8,%ymm8
+ .byte 197,60,84,193 // vandps %ymm1,%ymm8,%ymm8
+ .byte 196,99,125,25,201,1 // vextractf128 $0x1,%ymm9,%xmm1
+ .byte 196,65,41,118,210 // vpcmpeqd %xmm10,%xmm10,%xmm10
+ .byte 196,193,113,254,202 // vpaddd %xmm10,%xmm1,%xmm1
+ .byte 196,65,49,254,202 // vpaddd %xmm10,%xmm9,%xmm9
+ .byte 196,227,53,24,201,1 // vinsertf128 $0x1,%xmm1,%ymm9,%ymm1
+ .byte 197,188,93,201 // vminps %ymm1,%ymm8,%ymm1
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
@@ -4007,13 +4341,13 @@ _sk_store_f16_sse41:
.globl _sk_clamp_x_sse41
_sk_clamp_x_sse41:
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 243,68,15,16,0 // movss (%rax),%xmm8
- .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
- .byte 102,69,15,118,201 // pcmpeqd %xmm9,%xmm9
- .byte 102,69,15,254,200 // paddd %xmm8,%xmm9
- .byte 65,15,93,193 // minps %xmm9,%xmm0
.byte 69,15,87,192 // xorps %xmm8,%xmm8
.byte 68,15,95,192 // maxps %xmm0,%xmm8
+ .byte 243,68,15,16,8 // movss (%rax),%xmm9
+ .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9
+ .byte 102,15,118,192 // pcmpeqd %xmm0,%xmm0
+ .byte 102,65,15,254,193 // paddd %xmm9,%xmm0
+ .byte 68,15,93,192 // minps %xmm0,%xmm8
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 65,15,40,192 // movaps %xmm8,%xmm0
.byte 255,224 // jmpq *%rax
@@ -4021,15 +4355,95 @@ _sk_clamp_x_sse41:
.globl _sk_clamp_y_sse41
_sk_clamp_y_sse41:
.byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 69,15,87,192 // xorps %xmm8,%xmm8
+ .byte 68,15,95,193 // maxps %xmm1,%xmm8
+ .byte 243,68,15,16,8 // movss (%rax),%xmm9
+ .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9
+ .byte 102,15,118,201 // pcmpeqd %xmm1,%xmm1
+ .byte 102,65,15,254,201 // paddd %xmm9,%xmm1
+ .byte 68,15,93,193 // minps %xmm1,%xmm8
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,15,40,200 // movaps %xmm8,%xmm1
+ .byte 255,224 // jmpq *%rax
+
+.globl _sk_repeat_x_sse41
+_sk_repeat_x_sse41:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 243,68,15,16,0 // movss (%rax),%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 68,15,40,200 // movaps %xmm0,%xmm9
+ .byte 69,15,94,200 // divps %xmm8,%xmm9
+ .byte 102,69,15,58,8,201,1 // roundps $0x1,%xmm9,%xmm9
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 65,15,92,193 // subps %xmm9,%xmm0
+ .byte 102,69,15,118,201 // pcmpeqd %xmm9,%xmm9
+ .byte 102,69,15,254,200 // paddd %xmm8,%xmm9
+ .byte 65,15,93,193 // minps %xmm9,%xmm0
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+.globl _sk_repeat_y_sse41
+_sk_repeat_y_sse41:
+ .byte 72,173 // lods %ds:(%rsi),%rax
.byte 243,68,15,16,0 // movss (%rax),%xmm8
.byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 68,15,40,201 // movaps %xmm1,%xmm9
+ .byte 69,15,94,200 // divps %xmm8,%xmm9
+ .byte 102,69,15,58,8,201,1 // roundps $0x1,%xmm9,%xmm9
+ .byte 69,15,89,200 // mulps %xmm8,%xmm9
+ .byte 65,15,92,201 // subps %xmm9,%xmm1
.byte 102,69,15,118,201 // pcmpeqd %xmm9,%xmm9
.byte 102,69,15,254,200 // paddd %xmm8,%xmm9
.byte 65,15,93,201 // minps %xmm9,%xmm1
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+.globl _sk_mirror_x_sse41
+_sk_mirror_x_sse41:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 243,68,15,16,0 // movss (%rax),%xmm8
+ .byte 69,15,40,200 // movaps %xmm8,%xmm9
+ .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9
+ .byte 65,15,92,193 // subps %xmm9,%xmm0
+ .byte 243,69,15,88,192 // addss %xmm8,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 68,15,40,208 // movaps %xmm0,%xmm10
+ .byte 69,15,94,208 // divps %xmm8,%xmm10
+ .byte 102,69,15,58,8,210,1 // roundps $0x1,%xmm10,%xmm10
+ .byte 69,15,89,208 // mulps %xmm8,%xmm10
+ .byte 65,15,92,194 // subps %xmm10,%xmm0
+ .byte 65,15,92,193 // subps %xmm9,%xmm0
.byte 69,15,87,192 // xorps %xmm8,%xmm8
- .byte 68,15,95,193 // maxps %xmm1,%xmm8
+ .byte 68,15,92,192 // subps %xmm0,%xmm8
+ .byte 65,15,84,192 // andps %xmm8,%xmm0
+ .byte 102,69,15,118,192 // pcmpeqd %xmm8,%xmm8
+ .byte 102,69,15,254,193 // paddd %xmm9,%xmm8
+ .byte 65,15,93,192 // minps %xmm8,%xmm0
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+.globl _sk_mirror_y_sse41
+_sk_mirror_y_sse41:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 243,68,15,16,0 // movss (%rax),%xmm8
+ .byte 69,15,40,200 // movaps %xmm8,%xmm9
+ .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9
+ .byte 65,15,92,201 // subps %xmm9,%xmm1
+ .byte 243,69,15,88,192 // addss %xmm8,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 68,15,40,209 // movaps %xmm1,%xmm10
+ .byte 69,15,94,208 // divps %xmm8,%xmm10
+ .byte 102,69,15,58,8,210,1 // roundps $0x1,%xmm10,%xmm10
+ .byte 69,15,89,208 // mulps %xmm8,%xmm10
+ .byte 65,15,92,202 // subps %xmm10,%xmm1
+ .byte 65,15,92,201 // subps %xmm9,%xmm1
+ .byte 69,15,87,192 // xorps %xmm8,%xmm8
+ .byte 68,15,92,193 // subps %xmm1,%xmm8
+ .byte 65,15,84,200 // andps %xmm8,%xmm1
+ .byte 102,69,15,118,192 // pcmpeqd %xmm8,%xmm8
+ .byte 102,69,15,254,193 // paddd %xmm9,%xmm8
+ .byte 65,15,93,200 // minps %xmm8,%xmm1
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 65,15,40,200 // movaps %xmm8,%xmm1
.byte 255,224 // jmpq *%rax
.globl _sk_matrix_2x3_sse41
@@ -4934,13 +5348,13 @@ _sk_store_f16_sse2:
.globl _sk_clamp_x_sse2
_sk_clamp_x_sse2:
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 243,68,15,16,0 // movss (%rax),%xmm8
- .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
- .byte 102,69,15,118,201 // pcmpeqd %xmm9,%xmm9
- .byte 102,69,15,254,200 // paddd %xmm8,%xmm9
- .byte 65,15,93,193 // minps %xmm9,%xmm0
.byte 69,15,87,192 // xorps %xmm8,%xmm8
.byte 68,15,95,192 // maxps %xmm0,%xmm8
+ .byte 243,68,15,16,8 // movss (%rax),%xmm9
+ .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9
+ .byte 102,15,118,192 // pcmpeqd %xmm0,%xmm0
+ .byte 102,65,15,254,193 // paddd %xmm9,%xmm0
+ .byte 68,15,93,192 // minps %xmm0,%xmm8
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 65,15,40,192 // movaps %xmm8,%xmm0
.byte 255,224 // jmpq *%rax
@@ -4948,15 +5362,119 @@ _sk_clamp_x_sse2:
.globl _sk_clamp_y_sse2
_sk_clamp_y_sse2:
.byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 69,15,87,192 // xorps %xmm8,%xmm8
+ .byte 68,15,95,193 // maxps %xmm1,%xmm8
+ .byte 243,68,15,16,8 // movss (%rax),%xmm9
+ .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9
+ .byte 102,15,118,201 // pcmpeqd %xmm1,%xmm1
+ .byte 102,65,15,254,201 // paddd %xmm9,%xmm1
+ .byte 68,15,93,193 // minps %xmm1,%xmm8
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 65,15,40,200 // movaps %xmm8,%xmm1
+ .byte 255,224 // jmpq *%rax
+
+.globl _sk_repeat_x_sse2
+_sk_repeat_x_sse2:
+ .byte 72,173 // lods %ds:(%rsi),%rax
.byte 243,68,15,16,0 // movss (%rax),%xmm8
.byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 68,15,40,200 // movaps %xmm0,%xmm9
+ .byte 69,15,94,200 // divps %xmm8,%xmm9
+ .byte 243,69,15,91,209 // cvttps2dq %xmm9,%xmm10
+ .byte 69,15,91,210 // cvtdq2ps %xmm10,%xmm10
+ .byte 69,15,194,202,1 // cmpltps %xmm10,%xmm9
+ .byte 243,68,15,16,26 // movss (%rdx),%xmm11
+ .byte 69,15,198,219,0 // shufps $0x0,%xmm11,%xmm11
+ .byte 69,15,84,217 // andps %xmm9,%xmm11
+ .byte 69,15,92,211 // subps %xmm11,%xmm10
+ .byte 69,15,89,208 // mulps %xmm8,%xmm10
+ .byte 65,15,92,194 // subps %xmm10,%xmm0
+ .byte 102,69,15,118,201 // pcmpeqd %xmm9,%xmm9
+ .byte 102,69,15,254,200 // paddd %xmm8,%xmm9
+ .byte 65,15,93,193 // minps %xmm9,%xmm0
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+.globl _sk_repeat_y_sse2
+_sk_repeat_y_sse2:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 243,68,15,16,0 // movss (%rax),%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 68,15,40,201 // movaps %xmm1,%xmm9
+ .byte 69,15,94,200 // divps %xmm8,%xmm9
+ .byte 243,69,15,91,209 // cvttps2dq %xmm9,%xmm10
+ .byte 69,15,91,210 // cvtdq2ps %xmm10,%xmm10
+ .byte 69,15,194,202,1 // cmpltps %xmm10,%xmm9
+ .byte 243,68,15,16,26 // movss (%rdx),%xmm11
+ .byte 69,15,198,219,0 // shufps $0x0,%xmm11,%xmm11
+ .byte 69,15,84,217 // andps %xmm9,%xmm11
+ .byte 69,15,92,211 // subps %xmm11,%xmm10
+ .byte 69,15,89,208 // mulps %xmm8,%xmm10
+ .byte 65,15,92,202 // subps %xmm10,%xmm1
+ .byte 102,69,15,118,201 // pcmpeqd %xmm9,%xmm9
+ .byte 102,69,15,254,200 // paddd %xmm8,%xmm9
+ .byte 65,15,93,201 // minps %xmm9,%xmm1
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+.globl _sk_mirror_x_sse2
+_sk_mirror_x_sse2:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 243,68,15,16,8 // movss (%rax),%xmm9
+ .byte 69,15,40,193 // movaps %xmm9,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 65,15,92,192 // subps %xmm8,%xmm0
+ .byte 243,69,15,88,201 // addss %xmm9,%xmm9
+ .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9
+ .byte 68,15,40,208 // movaps %xmm0,%xmm10
+ .byte 69,15,94,209 // divps %xmm9,%xmm10
+ .byte 243,69,15,91,218 // cvttps2dq %xmm10,%xmm11
+ .byte 69,15,91,219 // cvtdq2ps %xmm11,%xmm11
+ .byte 69,15,194,211,1 // cmpltps %xmm11,%xmm10
+ .byte 243,68,15,16,34 // movss (%rdx),%xmm12
+ .byte 69,15,198,228,0 // shufps $0x0,%xmm12,%xmm12
+ .byte 69,15,84,226 // andps %xmm10,%xmm12
+ .byte 69,15,87,210 // xorps %xmm10,%xmm10
+ .byte 69,15,92,220 // subps %xmm12,%xmm11
+ .byte 69,15,89,217 // mulps %xmm9,%xmm11
+ .byte 65,15,92,195 // subps %xmm11,%xmm0
+ .byte 65,15,92,192 // subps %xmm8,%xmm0
+ .byte 68,15,92,208 // subps %xmm0,%xmm10
+ .byte 65,15,84,194 // andps %xmm10,%xmm0
+ .byte 102,69,15,118,201 // pcmpeqd %xmm9,%xmm9
+ .byte 102,69,15,254,200 // paddd %xmm8,%xmm9
+ .byte 65,15,93,193 // minps %xmm9,%xmm0
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
+.globl _sk_mirror_y_sse2
+_sk_mirror_y_sse2:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 243,68,15,16,8 // movss (%rax),%xmm9
+ .byte 69,15,40,193 // movaps %xmm9,%xmm8
+ .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8
+ .byte 65,15,92,200 // subps %xmm8,%xmm1
+ .byte 243,69,15,88,201 // addss %xmm9,%xmm9
+ .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9
+ .byte 68,15,40,209 // movaps %xmm1,%xmm10
+ .byte 69,15,94,209 // divps %xmm9,%xmm10
+ .byte 243,69,15,91,218 // cvttps2dq %xmm10,%xmm11
+ .byte 69,15,91,219 // cvtdq2ps %xmm11,%xmm11
+ .byte 69,15,194,211,1 // cmpltps %xmm11,%xmm10
+ .byte 243,68,15,16,34 // movss (%rdx),%xmm12
+ .byte 69,15,198,228,0 // shufps $0x0,%xmm12,%xmm12
+ .byte 69,15,84,226 // andps %xmm10,%xmm12
+ .byte 69,15,87,210 // xorps %xmm10,%xmm10
+ .byte 69,15,92,220 // subps %xmm12,%xmm11
+ .byte 69,15,89,217 // mulps %xmm9,%xmm11
+ .byte 65,15,92,203 // subps %xmm11,%xmm1
+ .byte 65,15,92,200 // subps %xmm8,%xmm1
+ .byte 68,15,92,209 // subps %xmm1,%xmm10
+ .byte 65,15,84,202 // andps %xmm10,%xmm1
.byte 102,69,15,118,201 // pcmpeqd %xmm9,%xmm9
.byte 102,69,15,254,200 // paddd %xmm8,%xmm9
.byte 65,15,93,201 // minps %xmm9,%xmm1
- .byte 69,15,87,192 // xorps %xmm8,%xmm8
- .byte 68,15,95,193 // maxps %xmm1,%xmm8
.byte 72,173 // lods %ds:(%rsi),%rax
- .byte 65,15,40,200 // movaps %xmm8,%xmm1
.byte 255,224 // jmpq *%rax
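
Note: SSE2 has no roundps, so floor in these variants is emulated: cvttps2dq truncates toward zero, cvtdq2ps converts back, cmpltps flags the lanes where truncation rounded up (negative non-integer inputs), and a 1.0f correction, loaded with movss from (%rdx) (presumably the K constants block, whose first field appears to be _1), is masked in with andps and subtracted.
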
.globl _sk_matrix_2x3_sse2
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index e22e4cdfe3..7c38fc0517 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -631,24 +631,96 @@ _sk_store_f16_hsw LABEL PROC
PUBLIC _sk_clamp_x_hsw
_sk_clamp_x_hsw LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8
+ DB 197,188,95,192 ; vmaxps %ymm0,%ymm8,%ymm0
DB 196,98,125,88,0 ; vpbroadcastd (%rax),%ymm8
DB 196,65,53,118,201 ; vpcmpeqd %ymm9,%ymm9,%ymm9
DB 196,65,61,254,193 ; vpaddd %ymm9,%ymm8,%ymm8
DB 196,193,124,93,192 ; vminps %ymm8,%ymm0,%ymm0
- DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8
- DB 197,188,95,192 ; vmaxps %ymm0,%ymm8,%ymm0
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_clamp_y_hsw
_sk_clamp_y_hsw LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8
+ DB 197,188,95,201 ; vmaxps %ymm1,%ymm8,%ymm1
DB 196,98,125,88,0 ; vpbroadcastd (%rax),%ymm8
DB 196,65,53,118,201 ; vpcmpeqd %ymm9,%ymm9,%ymm9
DB 196,65,61,254,193 ; vpaddd %ymm9,%ymm8,%ymm8
DB 196,193,116,93,200 ; vminps %ymm8,%ymm1,%ymm1
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_repeat_x_hsw
+_sk_repeat_x_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8
+ DB 196,65,124,94,200 ; vdivps %ymm8,%ymm0,%ymm9
+ DB 196,67,125,8,201,1 ; vroundps $0x1,%ymm9,%ymm9
+ DB 196,65,52,89,200 ; vmulps %ymm8,%ymm9,%ymm9
+ DB 196,193,124,92,193 ; vsubps %ymm9,%ymm0,%ymm0
+ DB 196,65,53,118,201 ; vpcmpeqd %ymm9,%ymm9,%ymm9
+ DB 196,65,61,254,193 ; vpaddd %ymm9,%ymm8,%ymm8
+ DB 196,193,124,93,192 ; vminps %ymm8,%ymm0,%ymm0
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_repeat_y_hsw
+_sk_repeat_y_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8
+ DB 196,65,116,94,200 ; vdivps %ymm8,%ymm1,%ymm9
+ DB 196,67,125,8,201,1 ; vroundps $0x1,%ymm9,%ymm9
+ DB 196,65,52,89,200 ; vmulps %ymm8,%ymm9,%ymm9
+ DB 196,193,116,92,201 ; vsubps %ymm9,%ymm1,%ymm1
+ DB 196,65,53,118,201 ; vpcmpeqd %ymm9,%ymm9,%ymm9
+ DB 196,65,61,254,193 ; vpaddd %ymm9,%ymm8,%ymm8
+ DB 196,193,116,93,200 ; vminps %ymm8,%ymm1,%ymm1
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_mirror_x_hsw
+_sk_mirror_x_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,122,16,0 ; vmovss (%rax),%xmm8
+ DB 196,66,125,24,200 ; vbroadcastss %xmm8,%ymm9
+ DB 196,65,124,92,209 ; vsubps %ymm9,%ymm0,%ymm10
+ DB 196,193,58,88,192 ; vaddss %xmm8,%xmm8,%xmm0
+ DB 196,226,125,24,192 ; vbroadcastss %xmm0,%ymm0
+ DB 197,44,94,192 ; vdivps %ymm0,%ymm10,%ymm8
+ DB 196,67,125,8,192,1 ; vroundps $0x1,%ymm8,%ymm8
+ DB 197,188,89,192 ; vmulps %ymm0,%ymm8,%ymm0
+ DB 197,172,92,192 ; vsubps %ymm0,%ymm10,%ymm0
+ DB 196,193,124,92,193 ; vsubps %ymm9,%ymm0,%ymm0
DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8
- DB 197,188,95,201 ; vmaxps %ymm1,%ymm8,%ymm1
+ DB 197,60,92,192 ; vsubps %ymm0,%ymm8,%ymm8
+ DB 197,188,84,192 ; vandps %ymm0,%ymm8,%ymm0
+ DB 196,65,61,118,192 ; vpcmpeqd %ymm8,%ymm8,%ymm8
+ DB 196,65,53,254,192 ; vpaddd %ymm8,%ymm9,%ymm8
+ DB 196,193,124,93,192 ; vminps %ymm8,%ymm0,%ymm0
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_mirror_y_hsw
+_sk_mirror_y_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,122,16,0 ; vmovss (%rax),%xmm8
+ DB 196,66,125,24,200 ; vbroadcastss %xmm8,%ymm9
+ DB 196,65,116,92,209 ; vsubps %ymm9,%ymm1,%ymm10
+ DB 196,193,58,88,200 ; vaddss %xmm8,%xmm8,%xmm1
+ DB 196,226,125,24,201 ; vbroadcastss %xmm1,%ymm1
+ DB 197,44,94,193 ; vdivps %ymm1,%ymm10,%ymm8
+ DB 196,67,125,8,192,1 ; vroundps $0x1,%ymm8,%ymm8
+ DB 197,188,89,201 ; vmulps %ymm1,%ymm8,%ymm1
+ DB 197,172,92,201 ; vsubps %ymm1,%ymm10,%ymm1
+ DB 196,193,116,92,201 ; vsubps %ymm9,%ymm1,%ymm1
+ DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8
+ DB 197,60,92,193 ; vsubps %ymm1,%ymm8,%ymm8
+ DB 197,188,84,201 ; vandps %ymm1,%ymm8,%ymm1
+ DB 196,65,61,118,192 ; vpcmpeqd %ymm8,%ymm8,%ymm8
+ DB 196,65,53,254,192 ; vpaddd %ymm8,%ymm9,%ymm8
+ DB 196,193,116,93,200 ; vminps %ymm8,%ymm1,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
@@ -1553,30 +1625,118 @@ _sk_store_f16_avx LABEL PROC
PUBLIC _sk_clamp_x_avx
_sk_clamp_x_avx LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8
+ DB 197,60,95,200 ; vmaxps %ymm0,%ymm8,%ymm9
DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8
- DB 196,67,125,25,193,1 ; vextractf128 $0x1,%ymm8,%xmm9
+ DB 196,99,125,25,192,1 ; vextractf128 $0x1,%ymm8,%xmm0
DB 196,65,41,118,210 ; vpcmpeqd %xmm10,%xmm10,%xmm10
- DB 196,65,49,254,202 ; vpaddd %xmm10,%xmm9,%xmm9
+ DB 196,193,121,254,194 ; vpaddd %xmm10,%xmm0,%xmm0
DB 196,65,57,254,194 ; vpaddd %xmm10,%xmm8,%xmm8
- DB 196,67,61,24,193,1 ; vinsertf128 $0x1,%xmm9,%ymm8,%ymm8
- DB 196,193,124,93,192 ; vminps %ymm8,%ymm0,%ymm0
- DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8
- DB 197,188,95,192 ; vmaxps %ymm0,%ymm8,%ymm0
+ DB 196,227,61,24,192,1 ; vinsertf128 $0x1,%xmm0,%ymm8,%ymm0
+ DB 197,180,93,192 ; vminps %ymm0,%ymm9,%ymm0
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_clamp_y_avx
_sk_clamp_y_avx LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8
+ DB 197,60,95,201 ; vmaxps %ymm1,%ymm8,%ymm9
DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8
- DB 196,67,125,25,193,1 ; vextractf128 $0x1,%ymm8,%xmm9
+ DB 196,99,125,25,193,1 ; vextractf128 $0x1,%ymm8,%xmm1
DB 196,65,41,118,210 ; vpcmpeqd %xmm10,%xmm10,%xmm10
- DB 196,65,49,254,202 ; vpaddd %xmm10,%xmm9,%xmm9
+ DB 196,193,113,254,202 ; vpaddd %xmm10,%xmm1,%xmm1
DB 196,65,57,254,194 ; vpaddd %xmm10,%xmm8,%xmm8
- DB 196,67,61,24,193,1 ; vinsertf128 $0x1,%xmm9,%ymm8,%ymm8
- DB 196,193,116,93,200 ; vminps %ymm8,%ymm1,%ymm1
+ DB 196,227,61,24,201,1 ; vinsertf128 $0x1,%xmm1,%ymm8,%ymm1
+ DB 197,180,93,201 ; vminps %ymm1,%ymm9,%ymm1
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_repeat_x_avx
+_sk_repeat_x_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8
+ DB 196,65,124,94,200 ; vdivps %ymm8,%ymm0,%ymm9
+ DB 196,67,125,8,201,1 ; vroundps $0x1,%ymm9,%ymm9
+ DB 196,65,52,89,200 ; vmulps %ymm8,%ymm9,%ymm9
+ DB 196,65,124,92,201 ; vsubps %ymm9,%ymm0,%ymm9
+ DB 196,99,125,25,192,1 ; vextractf128 $0x1,%ymm8,%xmm0
+ DB 196,65,41,118,210 ; vpcmpeqd %xmm10,%xmm10,%xmm10
+ DB 196,193,121,254,194 ; vpaddd %xmm10,%xmm0,%xmm0
+ DB 196,65,57,254,194 ; vpaddd %xmm10,%xmm8,%xmm8
+ DB 196,227,61,24,192,1 ; vinsertf128 $0x1,%xmm0,%ymm8,%ymm0
+ DB 197,180,93,192 ; vminps %ymm0,%ymm9,%ymm0
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_repeat_y_avx
+_sk_repeat_y_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8
+ DB 196,65,116,94,200 ; vdivps %ymm8,%ymm1,%ymm9
+ DB 196,67,125,8,201,1 ; vroundps $0x1,%ymm9,%ymm9
+ DB 196,65,52,89,200 ; vmulps %ymm8,%ymm9,%ymm9
+ DB 196,65,116,92,201 ; vsubps %ymm9,%ymm1,%ymm9
+ DB 196,99,125,25,193,1 ; vextractf128 $0x1,%ymm8,%xmm1
+ DB 196,65,41,118,210 ; vpcmpeqd %xmm10,%xmm10,%xmm10
+ DB 196,193,113,254,202 ; vpaddd %xmm10,%xmm1,%xmm1
+ DB 196,65,57,254,194 ; vpaddd %xmm10,%xmm8,%xmm8
+ DB 196,227,61,24,201,1 ; vinsertf128 $0x1,%xmm1,%ymm8,%ymm1
+ DB 197,180,93,201 ; vminps %ymm1,%ymm9,%ymm1
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_mirror_x_avx
+_sk_mirror_x_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,122,16,0 ; vmovss (%rax),%xmm8
+ DB 196,65,121,112,200,0 ; vpshufd $0x0,%xmm8,%xmm9
+ DB 196,67,53,24,201,1 ; vinsertf128 $0x1,%xmm9,%ymm9,%ymm9
+ DB 196,65,124,92,209 ; vsubps %ymm9,%ymm0,%ymm10
+ DB 196,193,58,88,192 ; vaddss %xmm8,%xmm8,%xmm0
+ DB 196,227,121,4,192,0 ; vpermilps $0x0,%xmm0,%xmm0
+ DB 196,227,125,24,192,1 ; vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
+ DB 197,44,94,192 ; vdivps %ymm0,%ymm10,%ymm8
+ DB 196,67,125,8,192,1 ; vroundps $0x1,%ymm8,%ymm8
+ DB 197,188,89,192 ; vmulps %ymm0,%ymm8,%ymm0
+ DB 197,172,92,192 ; vsubps %ymm0,%ymm10,%ymm0
+ DB 196,193,124,92,193 ; vsubps %ymm9,%ymm0,%ymm0
DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8
- DB 197,188,95,201 ; vmaxps %ymm1,%ymm8,%ymm1
+ DB 197,60,92,192 ; vsubps %ymm0,%ymm8,%ymm8
+ DB 197,60,84,192 ; vandps %ymm0,%ymm8,%ymm8
+ DB 196,99,125,25,200,1 ; vextractf128 $0x1,%ymm9,%xmm0
+ DB 196,65,41,118,210 ; vpcmpeqd %xmm10,%xmm10,%xmm10
+ DB 196,193,121,254,194 ; vpaddd %xmm10,%xmm0,%xmm0
+ DB 196,65,49,254,202 ; vpaddd %xmm10,%xmm9,%xmm9
+ DB 196,227,53,24,192,1 ; vinsertf128 $0x1,%xmm0,%ymm9,%ymm0
+ DB 197,188,93,192 ; vminps %ymm0,%ymm8,%ymm0
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_mirror_y_avx
+_sk_mirror_y_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 197,122,16,0 ; vmovss (%rax),%xmm8
+ DB 196,65,121,112,200,0 ; vpshufd $0x0,%xmm8,%xmm9
+ DB 196,67,53,24,201,1 ; vinsertf128 $0x1,%xmm9,%ymm9,%ymm9
+ DB 196,65,116,92,209 ; vsubps %ymm9,%ymm1,%ymm10
+ DB 196,193,58,88,200 ; vaddss %xmm8,%xmm8,%xmm1
+ DB 196,227,121,4,201,0 ; vpermilps $0x0,%xmm1,%xmm1
+ DB 196,227,117,24,201,1 ; vinsertf128 $0x1,%xmm1,%ymm1,%ymm1
+ DB 197,44,94,193 ; vdivps %ymm1,%ymm10,%ymm8
+ DB 196,67,125,8,192,1 ; vroundps $0x1,%ymm8,%ymm8
+ DB 197,188,89,201 ; vmulps %ymm1,%ymm8,%ymm1
+ DB 197,172,92,201 ; vsubps %ymm1,%ymm10,%ymm1
+ DB 196,193,116,92,201 ; vsubps %ymm9,%ymm1,%ymm1
+ DB 196,65,60,87,192 ; vxorps %ymm8,%ymm8,%ymm8
+ DB 197,60,92,193 ; vsubps %ymm1,%ymm8,%ymm8
+ DB 197,60,84,193 ; vandps %ymm1,%ymm8,%ymm8
+ DB 196,99,125,25,201,1 ; vextractf128 $0x1,%ymm9,%xmm1
+ DB 196,65,41,118,210 ; vpcmpeqd %xmm10,%xmm10,%xmm10
+ DB 196,193,113,254,202 ; vpaddd %xmm10,%xmm1,%xmm1
+ DB 196,65,49,254,202 ; vpaddd %xmm10,%xmm9,%xmm9
+ DB 196,227,53,24,201,1 ; vinsertf128 $0x1,%xmm1,%ymm9,%ymm1
+ DB 197,188,93,201 ; vminps %ymm1,%ymm8,%ymm1
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
@@ -2456,13 +2616,13 @@ _sk_store_f16_sse41 LABEL PROC
PUBLIC _sk_clamp_x_sse41
_sk_clamp_x_sse41 LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 243,68,15,16,0 ; movss (%rax),%xmm8
- DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
- DB 102,69,15,118,201 ; pcmpeqd %xmm9,%xmm9
- DB 102,69,15,254,200 ; paddd %xmm8,%xmm9
- DB 65,15,93,193 ; minps %xmm9,%xmm0
DB 69,15,87,192 ; xorps %xmm8,%xmm8
DB 68,15,95,192 ; maxps %xmm0,%xmm8
+ DB 243,68,15,16,8 ; movss (%rax),%xmm9
+ DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9
+ DB 102,15,118,192 ; pcmpeqd %xmm0,%xmm0
+ DB 102,65,15,254,193 ; paddd %xmm9,%xmm0
+ DB 68,15,93,192 ; minps %xmm0,%xmm8
DB 72,173 ; lods %ds:(%rsi),%rax
DB 65,15,40,192 ; movaps %xmm8,%xmm0
DB 255,224 ; jmpq *%rax
@@ -2470,15 +2630,95 @@ _sk_clamp_x_sse41 LABEL PROC
PUBLIC _sk_clamp_y_sse41
_sk_clamp_y_sse41 LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 69,15,87,192 ; xorps %xmm8,%xmm8
+ DB 68,15,95,193 ; maxps %xmm1,%xmm8
+ DB 243,68,15,16,8 ; movss (%rax),%xmm9
+ DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9
+ DB 102,15,118,201 ; pcmpeqd %xmm1,%xmm1
+ DB 102,65,15,254,201 ; paddd %xmm9,%xmm1
+ DB 68,15,93,193 ; minps %xmm1,%xmm8
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,15,40,200 ; movaps %xmm8,%xmm1
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_repeat_x_sse41
+_sk_repeat_x_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
DB 243,68,15,16,0 ; movss (%rax),%xmm8
DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 68,15,40,200 ; movaps %xmm0,%xmm9
+ DB 69,15,94,200 ; divps %xmm8,%xmm9
+ DB 102,69,15,58,8,201,1 ; roundps $0x1,%xmm9,%xmm9
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 65,15,92,193 ; subps %xmm9,%xmm0
+ DB 102,69,15,118,201 ; pcmpeqd %xmm9,%xmm9
+ DB 102,69,15,254,200 ; paddd %xmm8,%xmm9
+ DB 65,15,93,193 ; minps %xmm9,%xmm0
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_repeat_y_sse41
+_sk_repeat_y_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 243,68,15,16,0 ; movss (%rax),%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 68,15,40,201 ; movaps %xmm1,%xmm9
+ DB 69,15,94,200 ; divps %xmm8,%xmm9
+ DB 102,69,15,58,8,201,1 ; roundps $0x1,%xmm9,%xmm9
+ DB 69,15,89,200 ; mulps %xmm8,%xmm9
+ DB 65,15,92,201 ; subps %xmm9,%xmm1
DB 102,69,15,118,201 ; pcmpeqd %xmm9,%xmm9
DB 102,69,15,254,200 ; paddd %xmm8,%xmm9
DB 65,15,93,201 ; minps %xmm9,%xmm1
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_mirror_x_sse41
+_sk_mirror_x_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 243,68,15,16,0 ; movss (%rax),%xmm8
+ DB 69,15,40,200 ; movaps %xmm8,%xmm9
+ DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9
+ DB 65,15,92,193 ; subps %xmm9,%xmm0
+ DB 243,69,15,88,192 ; addss %xmm8,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 68,15,40,208 ; movaps %xmm0,%xmm10
+ DB 69,15,94,208 ; divps %xmm8,%xmm10
+ DB 102,69,15,58,8,210,1 ; roundps $0x1,%xmm10,%xmm10
+ DB 69,15,89,208 ; mulps %xmm8,%xmm10
+ DB 65,15,92,194 ; subps %xmm10,%xmm0
+ DB 65,15,92,193 ; subps %xmm9,%xmm0
DB 69,15,87,192 ; xorps %xmm8,%xmm8
- DB 68,15,95,193 ; maxps %xmm1,%xmm8
+ DB 68,15,92,192 ; subps %xmm0,%xmm8
+ DB 65,15,84,192 ; andps %xmm8,%xmm0
+ DB 102,69,15,118,192 ; pcmpeqd %xmm8,%xmm8
+ DB 102,69,15,254,193 ; paddd %xmm9,%xmm8
+ DB 65,15,93,192 ; minps %xmm8,%xmm0
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_mirror_y_sse41
+_sk_mirror_y_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 243,68,15,16,0 ; movss (%rax),%xmm8
+ DB 69,15,40,200 ; movaps %xmm8,%xmm9
+ DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9
+ DB 65,15,92,201 ; subps %xmm9,%xmm1
+ DB 243,69,15,88,192 ; addss %xmm8,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 68,15,40,209 ; movaps %xmm1,%xmm10
+ DB 69,15,94,208 ; divps %xmm8,%xmm10
+ DB 102,69,15,58,8,210,1 ; roundps $0x1,%xmm10,%xmm10
+ DB 69,15,89,208 ; mulps %xmm8,%xmm10
+ DB 65,15,92,202 ; subps %xmm10,%xmm1
+ DB 65,15,92,201 ; subps %xmm9,%xmm1
+ DB 69,15,87,192 ; xorps %xmm8,%xmm8
+ DB 68,15,92,193 ; subps %xmm1,%xmm8
+ DB 65,15,84,200 ; andps %xmm8,%xmm1
+ DB 102,69,15,118,192 ; pcmpeqd %xmm8,%xmm8
+ DB 102,69,15,254,193 ; paddd %xmm9,%xmm8
+ DB 65,15,93,200 ; minps %xmm8,%xmm1
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 65,15,40,200 ; movaps %xmm8,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_matrix_2x3_sse41
@@ -3410,13 +3650,13 @@ _sk_store_f16_sse2 LABEL PROC
PUBLIC _sk_clamp_x_sse2
_sk_clamp_x_sse2 LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 243,68,15,16,0 ; movss (%rax),%xmm8
- DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
- DB 102,69,15,118,201 ; pcmpeqd %xmm9,%xmm9
- DB 102,69,15,254,200 ; paddd %xmm8,%xmm9
- DB 65,15,93,193 ; minps %xmm9,%xmm0
DB 69,15,87,192 ; xorps %xmm8,%xmm8
DB 68,15,95,192 ; maxps %xmm0,%xmm8
+ DB 243,68,15,16,8 ; movss (%rax),%xmm9
+ DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9
+ DB 102,15,118,192 ; pcmpeqd %xmm0,%xmm0
+ DB 102,65,15,254,193 ; paddd %xmm9,%xmm0
+ DB 68,15,93,192 ; minps %xmm0,%xmm8
DB 72,173 ; lods %ds:(%rsi),%rax
DB 65,15,40,192 ; movaps %xmm8,%xmm0
DB 255,224 ; jmpq *%rax
@@ -3424,15 +3664,119 @@ _sk_clamp_x_sse2 LABEL PROC
PUBLIC _sk_clamp_y_sse2
_sk_clamp_y_sse2 LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 69,15,87,192 ; xorps %xmm8,%xmm8
+ DB 68,15,95,193 ; maxps %xmm1,%xmm8
+ DB 243,68,15,16,8 ; movss (%rax),%xmm9
+ DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9
+ DB 102,15,118,201 ; pcmpeqd %xmm1,%xmm1
+ DB 102,65,15,254,201 ; paddd %xmm9,%xmm1
+ DB 68,15,93,193 ; minps %xmm1,%xmm8
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 65,15,40,200 ; movaps %xmm8,%xmm1
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_repeat_x_sse2
+_sk_repeat_x_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
DB 243,68,15,16,0 ; movss (%rax),%xmm8
DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 68,15,40,200 ; movaps %xmm0,%xmm9
+ DB 69,15,94,200 ; divps %xmm8,%xmm9
+ DB 243,69,15,91,209 ; cvttps2dq %xmm9,%xmm10
+ DB 69,15,91,210 ; cvtdq2ps %xmm10,%xmm10
+ DB 69,15,194,202,1 ; cmpltps %xmm10,%xmm9
+ DB 243,68,15,16,26 ; movss (%rdx),%xmm11
+ DB 69,15,198,219,0 ; shufps $0x0,%xmm11,%xmm11
+ DB 69,15,84,217 ; andps %xmm9,%xmm11
+ DB 69,15,92,211 ; subps %xmm11,%xmm10
+ DB 69,15,89,208 ; mulps %xmm8,%xmm10
+ DB 65,15,92,194 ; subps %xmm10,%xmm0
+ DB 102,69,15,118,201 ; pcmpeqd %xmm9,%xmm9
+ DB 102,69,15,254,200 ; paddd %xmm8,%xmm9
+ DB 65,15,93,193 ; minps %xmm9,%xmm0
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_repeat_y_sse2
+_sk_repeat_y_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 243,68,15,16,0 ; movss (%rax),%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 68,15,40,201 ; movaps %xmm1,%xmm9
+ DB 69,15,94,200 ; divps %xmm8,%xmm9
+ DB 243,69,15,91,209 ; cvttps2dq %xmm9,%xmm10
+ DB 69,15,91,210 ; cvtdq2ps %xmm10,%xmm10
+ DB 69,15,194,202,1 ; cmpltps %xmm10,%xmm9
+ DB 243,68,15,16,26 ; movss (%rdx),%xmm11
+ DB 69,15,198,219,0 ; shufps $0x0,%xmm11,%xmm11
+ DB 69,15,84,217 ; andps %xmm9,%xmm11
+ DB 69,15,92,211 ; subps %xmm11,%xmm10
+ DB 69,15,89,208 ; mulps %xmm8,%xmm10
+ DB 65,15,92,202 ; subps %xmm10,%xmm1
+ DB 102,69,15,118,201 ; pcmpeqd %xmm9,%xmm9
+ DB 102,69,15,254,200 ; paddd %xmm8,%xmm9
+ DB 65,15,93,201 ; minps %xmm9,%xmm1
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_mirror_x_sse2
+_sk_mirror_x_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 243,68,15,16,8 ; movss (%rax),%xmm9
+ DB 69,15,40,193 ; movaps %xmm9,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 65,15,92,192 ; subps %xmm8,%xmm0
+ DB 243,69,15,88,201 ; addss %xmm9,%xmm9
+ DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9
+ DB 68,15,40,208 ; movaps %xmm0,%xmm10
+ DB 69,15,94,209 ; divps %xmm9,%xmm10
+ DB 243,69,15,91,218 ; cvttps2dq %xmm10,%xmm11
+ DB 69,15,91,219 ; cvtdq2ps %xmm11,%xmm11
+ DB 69,15,194,211,1 ; cmpltps %xmm11,%xmm10
+ DB 243,68,15,16,34 ; movss (%rdx),%xmm12
+ DB 69,15,198,228,0 ; shufps $0x0,%xmm12,%xmm12
+ DB 69,15,84,226 ; andps %xmm10,%xmm12
+ DB 69,15,87,210 ; xorps %xmm10,%xmm10
+ DB 69,15,92,220 ; subps %xmm12,%xmm11
+ DB 69,15,89,217 ; mulps %xmm9,%xmm11
+ DB 65,15,92,195 ; subps %xmm11,%xmm0
+ DB 65,15,92,192 ; subps %xmm8,%xmm0
+ DB 68,15,92,208 ; subps %xmm0,%xmm10
+ DB 65,15,84,194 ; andps %xmm10,%xmm0
+ DB 102,69,15,118,201 ; pcmpeqd %xmm9,%xmm9
+ DB 102,69,15,254,200 ; paddd %xmm8,%xmm9
+ DB 65,15,93,193 ; minps %xmm9,%xmm0
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
+PUBLIC _sk_mirror_y_sse2
+_sk_mirror_y_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 243,68,15,16,8 ; movss (%rax),%xmm9
+ DB 69,15,40,193 ; movaps %xmm9,%xmm8
+ DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8
+ DB 65,15,92,200 ; subps %xmm8,%xmm1
+ DB 243,69,15,88,201 ; addss %xmm9,%xmm9
+ DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9
+ DB 68,15,40,209 ; movaps %xmm1,%xmm10
+ DB 69,15,94,209 ; divps %xmm9,%xmm10
+ DB 243,69,15,91,218 ; cvttps2dq %xmm10,%xmm11
+ DB 69,15,91,219 ; cvtdq2ps %xmm11,%xmm11
+ DB 69,15,194,211,1 ; cmpltps %xmm11,%xmm10
+ DB 243,68,15,16,34 ; movss (%rdx),%xmm12
+ DB 69,15,198,228,0 ; shufps $0x0,%xmm12,%xmm12
+ DB 69,15,84,226 ; andps %xmm10,%xmm12
+ DB 69,15,87,210 ; xorps %xmm10,%xmm10
+ DB 69,15,92,220 ; subps %xmm12,%xmm11
+ DB 69,15,89,217 ; mulps %xmm9,%xmm11
+ DB 65,15,92,203 ; subps %xmm11,%xmm1
+ DB 65,15,92,200 ; subps %xmm8,%xmm1
+ DB 68,15,92,209 ; subps %xmm1,%xmm10
+ DB 65,15,84,202 ; andps %xmm10,%xmm1
DB 102,69,15,118,201 ; pcmpeqd %xmm9,%xmm9
DB 102,69,15,254,200 ; paddd %xmm8,%xmm9
DB 65,15,93,201 ; minps %xmm9,%xmm1
- DB 69,15,87,192 ; xorps %xmm8,%xmm8
- DB 68,15,95,193 ; maxps %xmm1,%xmm8
DB 72,173 ; lods %ds:(%rsi),%rax
- DB 65,15,40,200 ; movaps %xmm8,%xmm1
DB 255,224 ; jmpq *%rax
PUBLIC _sk_matrix_2x3_sse2
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 899323b0db..dfcd786629 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -38,6 +38,8 @@ static Dst bit_cast(const Src& src) {
static F mad(F f, F m, F a) { return f*m+a; }
static F min(F a, F b) { return fminf(a,b); }
static F max(F a, F b) { return fmaxf(a,b); }
+ static F abs (F v) { return fabsf(v); }
+ static F floor(F v, K*) { return floorf(v); }
static F rcp (F v) { return 1.0f / v; }
static F rsqrt(F v) { return 1.0f / sqrtf(v); }
static U32 round(F v, F scale) { return (uint32_t)lrintf(v*scale); }
@@ -64,6 +66,8 @@ static Dst bit_cast(const Src& src) {
static F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); }
static F min(F a, F b) { return vminq_f32(a,b); }
static F max(F a, F b) { return vmaxq_f32(a,b); }
+ static F abs (F v) { return vabsq_f32(v); }
+ static F floor(F v, K*) { return vrndmq_f32(v); }
static F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; }
static F rsqrt(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
static U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
@@ -92,6 +96,7 @@ static Dst bit_cast(const Src& src) {
static F mad(F f, F m, F a) { return vfma_f32(a,f,m); }
static F min(F a, F b) { return vmin_f32(a,b); }
static F max(F a, F b) { return vmax_f32(a,b); }
+ static F abs (F v) { return vabs_f32(v); }
static F rcp (F v) { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e ) * e; }
static F rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
static U32 round(F v, F scale) { return vcvt_u32_f32(mad(v,scale,0.5f)); }
@@ -100,6 +105,11 @@ static Dst bit_cast(const Src& src) {
static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
+ static F floor(F v, K* k) {
+ F roundtrip = vcvt_f32_s32(vcvt_s32_f32(v));
+ return roundtrip - if_then_else(roundtrip > v, k->_1, 0);
+ }
+
static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
#define WRAP(name) sk_##name##_vfp4
@@ -117,6 +127,8 @@ static Dst bit_cast(const Src& src) {
static F mad(F f, F m, F a) { return _mm256_fmadd_ps(f,m,a);}
static F min(F a, F b) { return _mm256_min_ps(a,b); }
static F max(F a, F b) { return _mm256_max_ps(a,b); }
+ static F abs(F v) { return _mm256_and_ps(v, 0-v); }
+ static F floor(F v, K*) { return _mm256_floor_ps(v); }
static F rcp (F v) { return _mm256_rcp_ps (v); }
static F rsqrt(F v) { return _mm256_rsqrt_ps(v); }
static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
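
The AVX abs() above avoids a sign-mask constant: for any finite input, v and 0-v differ only in the sign bit, so their bitwise AND clears the sign and preserves the magnitude. A minimal scalar model of the same trick (illustrative, not part of the source):

    #include <stdint.h>
    #include <string.h>

    // abs(v) == v & (0 - v): the sign bits of v and -v always differ, so the
    // AND zeroes the sign bit and leaves exponent and mantissa untouched.
    static float abs_by_and(float v) {
        float n = 0.0f - v;
        uint32_t a, b, r;
        memcpy(&a, &v, sizeof a);
        memcpy(&b, &n, sizeof b);
        r = a & b;
        memcpy(&v, &r, sizeof v);
        return v;   // matches fabsf(v) for finite v, including -0.0f
    }
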
@@ -146,11 +158,13 @@ static Dst bit_cast(const Src& src) {
using U16 = uint16_t __attribute__((ext_vector_type(8)));
using U8 = uint8_t __attribute__((ext_vector_type(8)));
- static F mad(F f, F m, F a) { return f*m+a; }
- static F min(F a, F b) { return _mm256_min_ps(a,b); }
- static F max(F a, F b) { return _mm256_max_ps(a,b); }
- static F rcp (F v) { return _mm256_rcp_ps (v); }
- static F rsqrt(F v) { return _mm256_rsqrt_ps(v); }
+ static F mad(F f, F m, F a) { return f*m+a; }
+ static F min(F a, F b) { return _mm256_min_ps(a,b); }
+ static F max(F a, F b) { return _mm256_max_ps(a,b); }
+ static F abs(F v) { return _mm256_and_ps(v, 0-v); }
+ static F floor(F v, K*) { return _mm256_floor_ps(v); }
+ static F rcp (F v) { return _mm256_rcp_ps (v); }
+ static F rsqrt(F v) { return _mm256_rsqrt_ps(v); }
static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
static U16 pack(U32 v) {
@@ -181,11 +195,12 @@ static Dst bit_cast(const Src& src) {
using U16 = uint16_t __attribute__((ext_vector_type(4)));
using U8 = uint8_t __attribute__((ext_vector_type(4)));
- static F mad(F f, F m, F a) { return f*m+a; }
- static F min(F a, F b) { return _mm_min_ps(a,b); }
- static F max(F a, F b) { return _mm_max_ps(a,b); }
- static F rcp (F v) { return _mm_rcp_ps (v); }
- static F rsqrt(F v) { return _mm_rsqrt_ps(v); }
+ static F mad(F f, F m, F a) { return f*m+a; }
+ static F min(F a, F b) { return _mm_min_ps(a,b); }
+ static F max(F a, F b) { return _mm_max_ps(a,b); }
+ static F abs(F v) { return _mm_and_ps(v, 0-v); }
+ static F rcp (F v) { return _mm_rcp_ps (v); }
+ static F rsqrt(F v) { return _mm_rsqrt_ps(v); }
static U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }
static U16 pack(U32 v) {
@@ -213,6 +228,15 @@ static Dst bit_cast(const Src& src) {
#endif
}
+ static F floor(F v, K* k) {
+ #if defined(__SSE4_1__)
+ return _mm_floor_ps(v);
+ #else
+ F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
+ return roundtrip - if_then_else(roundtrip > v, k->_1, 0);
+ #endif
+ }
+
static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
#if defined(__SSE4_1__)
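
Without SSE4.1's _mm_floor_ps, floor() falls back to a truncating round-trip through int32: the conversion rounds toward zero, so for negative non-integers the round-trip lands above v and 1 must be subtracted; k->_1 supplies the 1.0f constant from the shared K struct. A scalar sketch of the fallback, assuming v fits in an int32 (as texture coordinates do here):

    #include <stdint.h>

    // Model of the SSE2/NEON floor fallback: truncate, then correct the
    // negative-fraction case where truncation rounded toward zero, i.e. up.
    static float floor_fallback(float v) {
        float roundtrip = (float)(int32_t)v;
        return roundtrip - (roundtrip > v ? 1.0f : 0.0f);
    }
    // floor_fallback( 1.5f) ==  1.0f
    // floor_fallback(-1.5f) == -2.0f
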
@@ -795,12 +819,27 @@ STAGE(store_f16) {
#endif
}
-static F clamp(const F& v, float limit) {
- F l = bit_cast<F>(bit_cast<U32>(F(limit)) + U32(0xffffffff)); // limit - 1 ulp
- return max(0, min(v, l));
+static F ulp_before(F v) {
+ return bit_cast<F>(bit_cast<U32>(v) + U32(0xffffffff));
+}
+static F clamp(F v, float limit, K*) {
+ v = max(0, v);
+ return min(v, ulp_before(limit));
+}
+static F repeat(F v, float limit, K* k) {
+ v = v - floor(v/limit, k)*limit;
+ return min(v, ulp_before(limit));
+}
+static F mirror(F v, float limit, K* k) {
+ v = abs( (v-limit) - (limit+limit)*floor((v-limit)/(limit+limit),k) - limit );
+ return min(v, ulp_before(limit));
}
-STAGE(clamp_x) { r = clamp(r, *(const float*)ctx); }
-STAGE(clamp_y) { g = clamp(g, *(const float*)ctx); }
+STAGE(clamp_x) { r = clamp (r, *(const float*)ctx, k); }
+STAGE(clamp_y) { g = clamp (g, *(const float*)ctx, k); }
+STAGE(repeat_x) { r = repeat(r, *(const float*)ctx, k); }
+STAGE(repeat_y) { g = repeat(g, *(const float*)ctx, k); }
+STAGE(mirror_x) { r = mirror(r, *(const float*)ctx, k); }
+STAGE(mirror_y) { g = mirror(g, *(const float*)ctx, k); }
STAGE(matrix_2x3) {
auto m = (const float*)ctx;
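
Taken together, the three tilers map any coordinate into [0, limit): clamp pins it to the ends, repeat wraps it with period limit, and mirror reflects it back and forth with period 2*limit. The final min against ulp_before(limit) keeps the result strictly below limit, so a subsequent gather can never index one past the last texel. A worked example with limit = 4 (values chosen for illustration):

    v       clamp(v,4)     repeat(v,4)   mirror(v,4)
    -1.0    0.0            3.0           1.0
     2.5    2.5            2.5           2.5
     5.0    4.0 - 1 ulp    1.0           3.0
     9.0    4.0 - 1 ulp    1.0           1.0
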