aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
author Mike Klein <mtklein@chromium.org> 2017-02-21 16:50:52 -0500
committer Skia Commit-Bot <skia-commit-bot@chromium.org> 2017-02-21 22:52:13 +0000
commit 9ef63754a7262f57097b39318adf2f7789d23ecf (patch)
tree e67f511f5e301564da500a00588793b9f04ec262 /src
parent c663953504e7cd6ad0e673e926203a3a38478d05 (diff)
Move looping logic into start_pipeline().
This should be a big win on Windows, but I haven't timed there yet. On my Mac, it's a solid 2% speedup.

PS1 was insufficiently ambitious, but here is its description, kept for posterity:

No need to vzeroupper twice on Windows. On Windows start_pipeline() will vzeroupper, so no need to do it in just_return().

Change-Id: I099320b95da85900a60ce96fdb7a216a36db1858
Reviewed-on: https://skia-review.googlesource.com/8821
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
Diffstat (limited to 'src')
-rw-r--r-- src/jumper/SkJumper.cpp 19
-rw-r--r-- src/jumper/SkJumper_generated.S 151
-rw-r--r-- src/jumper/SkJumper_generated_win.S 109
-rw-r--r-- src/jumper/SkJumper_stages.cpp 17
4 files changed, 251 insertions, 45 deletions
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 4923f6f92b..2140fd06a0 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -65,23 +65,23 @@ using StageFn = void(void);
extern "C" {
#if defined(__aarch64__)
- void ASM(start_pipeline,aarch64)(size_t, void**, K*);
+ size_t ASM(start_pipeline,aarch64)(size_t, void**, K*, size_t);
StageFn ASM(just_return,aarch64);
#define M(st) StageFn ASM(st,aarch64);
STAGES(M)
#undef M
#elif defined(__arm__)
- void ASM(start_pipeline,vfp4)(size_t, void**, K*);
+ size_t ASM(start_pipeline,vfp4)(size_t, void**, K*, size_t);
StageFn ASM(just_return,vfp4);
#define M(st) StageFn ASM(st,vfp4);
STAGES(M)
#undef M
#elif defined(__x86_64__) || defined(_M_X64)
- void ASM(start_pipeline,hsw )(size_t, void**, K*);
- void ASM(start_pipeline,sse41)(size_t, void**, K*);
- void ASM(start_pipeline,sse2 )(size_t, void**, K*);
+ size_t ASM(start_pipeline,hsw )(size_t, void**, K*, size_t);
+ size_t ASM(start_pipeline,sse41)(size_t, void**, K*, size_t);
+ size_t ASM(start_pipeline,sse2 )(size_t, void**, K*, size_t);
StageFn ASM(just_return,hsw),
ASM(just_return,sse41),
@@ -99,7 +99,7 @@ extern "C" {
#endif
// Portable, single-pixel stages.
- void sk_start_pipeline(size_t, void**, K*);
+ size_t sk_start_pipeline(size_t, void**, K*, size_t);
StageFn sk_just_return;
#define M(st) StageFn sk_##st;
STAGES(M)
@@ -171,7 +171,7 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
auto build_and_run = [&](size_t stride,
StageFn* (*lookup)(SkRasterPipeline::StockStage),
StageFn* just_return,
- void (*start_pipeline)(size_t, void**, K*)) {
+ size_t (*start_pipeline)(size_t, void**, K*, size_t)) {
if (x + stride <= limit) {
void** ip = program.get();
for (auto&& st : fStages) {
@@ -184,10 +184,7 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
}
*ip = (void*)just_return;
- while (x + stride <= limit) {
- start_pipeline(x, program.get(), &kConstants);
- x += stride;
- }
+ x = start_pipeline(x, program.get(), &kConstants, limit);
}
return true;
};
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index 77a212e3df..0f6b1f3c37 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -12,7 +12,21 @@
.globl _sk_start_pipeline_aarch64
_sk_start_pipeline_aarch64:
- .long 0xf8408423 // ldr x3, [x1],#8
+ .long 0xa9bc5ff8 // stp x24, x23, [sp,#-64]!
+ .long 0xa90157f6 // stp x22, x21, [sp,#16]
+ .long 0xa9024ff4 // stp x20, x19, [sp,#32]
+ .long 0xa9037bfd // stp x29, x30, [sp,#48]
+ .long 0x9100c3fd // add x29, sp, #0x30
+ .long 0xaa0103f3 // mov x19, x1
+ .long 0xf8408677 // ldr x23, [x19],#8
+ .long 0xaa0003f6 // mov x22, x0
+ .long 0xaa0303f4 // mov x20, x3
+ .long 0xaa0203f5 // mov x21, x2
+ .long 0x910012c8 // add x8, x22, #0x4
+ .long 0xeb14011f // cmp x8, x20
+ .long 0x54000069 // b.ls 3c <sk_start_pipeline_aarch64+0x3c>
+ .long 0xaa1603e0 // mov x0, x22
+ .long 0x14000012 // b 80 <sk_start_pipeline_aarch64+0x80>
.long 0x6f00e400 // movi v0.2d, #0x0
.long 0x6f00e401 // movi v1.2d, #0x0
.long 0x6f00e402 // movi v2.2d, #0x0
@@ -21,7 +35,20 @@ _sk_start_pipeline_aarch64:
.long 0x6f00e405 // movi v5.2d, #0x0
.long 0x6f00e406 // movi v6.2d, #0x0
.long 0x6f00e407 // movi v7.2d, #0x0
- .long 0xd61f0060 // br x3
+ .long 0xaa1603e0 // mov x0, x22
+ .long 0xaa1303e1 // mov x1, x19
+ .long 0xaa1503e2 // mov x2, x21
+ .long 0xd63f02e0 // blr x23
+ .long 0x910022c8 // add x8, x22, #0x8
+ .long 0x910012c0 // add x0, x22, #0x4
+ .long 0xeb14011f // cmp x8, x20
+ .long 0xaa0003f6 // mov x22, x0
+ .long 0x54fffe09 // b.ls 3c <sk_start_pipeline_aarch64+0x3c>
+ .long 0xa9437bfd // ldp x29, x30, [sp,#48]
+ .long 0xa9424ff4 // ldp x20, x19, [sp,#32]
+ .long 0xa94157f6 // ldp x22, x21, [sp,#16]
+ .long 0xa8c45ff8 // ldp x24, x23, [sp],#64
+ .long 0xd65f03c0 // ret
.globl _sk_just_return_aarch64
_sk_just_return_aarch64:
@@ -551,16 +578,32 @@ _sk_linear_gradient_2stops_aarch64:
.globl _sk_start_pipeline_vfp4
_sk_start_pipeline_vfp4:
- .long 0xe4913004 // ldr r3, [r1], #4
+ .long 0xe92d41f0 // push {r4, r5, r6, r7, r8, lr}
+ .long 0xe1a07001 // mov r7, r1
+ .long 0xe1a04000 // mov r4, r0
+ .long 0xe1a05003 // mov r5, r3
+ .long 0xe1a08002 // mov r8, r2
+ .long 0xe4976004 // ldr r6, [r7], #4
+ .long 0xe2840002 // add r0, r4, #2
+ .long 0xea00000d // b 58 <sk_start_pipeline_vfp4+0x58>
.long 0xf2800010 // vmov.i32 d0, #0
+ .long 0xe1a00004 // mov r0, r4
.long 0xf2801010 // vmov.i32 d1, #0
+ .long 0xe1a01007 // mov r1, r7
.long 0xf2802010 // vmov.i32 d2, #0
+ .long 0xe1a02008 // mov r2, r8
.long 0xf2803010 // vmov.i32 d3, #0
.long 0xf2804010 // vmov.i32 d4, #0
.long 0xf2805010 // vmov.i32 d5, #0
.long 0xf2806010 // vmov.i32 d6, #0
.long 0xf2807010 // vmov.i32 d7, #0
- .long 0xe12fff13 // bx r3
+ .long 0xe12fff36 // blx r6
+ .long 0xe2840004 // add r0, r4, #4
+ .long 0xe2844002 // add r4, r4, #2
+ .long 0xe1500005 // cmp r0, r5
+ .long 0x9affffef // bls 20 <sk_start_pipeline_vfp4+0x20>
+ .long 0xe1a00004 // mov r0, r4
+ .long 0xe8bd81f0 // pop {r4, r5, r6, r7, r8, pc}
.globl _sk_just_return_vfp4
_sk_just_return_vfp4:
@@ -1152,7 +1195,22 @@ _sk_linear_gradient_2stops_vfp4:
.globl _sk_start_pipeline_hsw
_sk_start_pipeline_hsw:
- .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x41,0x57 // push %r15
+ .byte 0x41,0x56 // push %r14
+ .byte 0x41,0x55 // push %r13
+ .byte 0x41,0x54 // push %r12
+ .byte 0x53 // push %rbx
+ .byte 0x49,0x89,0xcf // mov %rcx,%r15
+ .byte 0x49,0x89,0xd6 // mov %rdx,%r14
+ .byte 0x48,0x89,0xfb // mov %rdi,%rbx
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x49,0x89,0xc4 // mov %rax,%r12
+ .byte 0x49,0x89,0xf5 // mov %rsi,%r13
+ .byte 0x48,0x8d,0x43,0x08 // lea 0x8(%rbx),%rax
+ .byte 0x4c,0x39,0xf8 // cmp %r15,%rax
+ .byte 0x76,0x05 // jbe 28 <_sk_start_pipeline_hsw+0x28>
+ .byte 0x48,0x89,0xd8 // mov %rbx,%rax
+ .byte 0xeb,0x3c // jmp 64 <_sk_start_pipeline_hsw+0x64>
.byte 0xc5,0xfc,0x57,0xc0 // vxorps %ymm0,%ymm0,%ymm0
.byte 0xc5,0xf4,0x57,0xc9 // vxorps %ymm1,%ymm1,%ymm1
.byte 0xc5,0xec,0x57,0xd2 // vxorps %ymm2,%ymm2,%ymm2
@@ -1161,7 +1219,22 @@ _sk_start_pipeline_hsw:
.byte 0xc5,0xd4,0x57,0xed // vxorps %ymm5,%ymm5,%ymm5
.byte 0xc5,0xcc,0x57,0xf6 // vxorps %ymm6,%ymm6,%ymm6
.byte 0xc5,0xc4,0x57,0xff // vxorps %ymm7,%ymm7,%ymm7
- .byte 0xff,0xe0 // jmpq *%rax
+ .byte 0x48,0x89,0xdf // mov %rbx,%rdi
+ .byte 0x4c,0x89,0xee // mov %r13,%rsi
+ .byte 0x4c,0x89,0xf2 // mov %r14,%rdx
+ .byte 0x41,0xff,0xd4 // callq *%r12
+ .byte 0x48,0x8d,0x43,0x08 // lea 0x8(%rbx),%rax
+ .byte 0x48,0x83,0xc3,0x10 // add $0x10,%rbx
+ .byte 0x4c,0x39,0xfb // cmp %r15,%rbx
+ .byte 0x48,0x89,0xc3 // mov %rax,%rbx
+ .byte 0x76,0xc4 // jbe 28 <_sk_start_pipeline_hsw+0x28>
+ .byte 0x5b // pop %rbx
+ .byte 0x41,0x5c // pop %r12
+ .byte 0x41,0x5d // pop %r13
+ .byte 0x41,0x5e // pop %r14
+ .byte 0x41,0x5f // pop %r15
+ .byte 0xc5,0xf8,0x77 // vzeroupper
+ .byte 0xc3 // retq
.globl _sk_just_return_hsw
_sk_just_return_hsw:
@@ -1640,7 +1713,22 @@ _sk_linear_gradient_2stops_hsw:
.globl _sk_start_pipeline_sse41
_sk_start_pipeline_sse41:
- .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x41,0x57 // push %r15
+ .byte 0x41,0x56 // push %r14
+ .byte 0x41,0x55 // push %r13
+ .byte 0x41,0x54 // push %r12
+ .byte 0x53 // push %rbx
+ .byte 0x49,0x89,0xcf // mov %rcx,%r15
+ .byte 0x49,0x89,0xd6 // mov %rdx,%r14
+ .byte 0x48,0x89,0xfb // mov %rdi,%rbx
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x49,0x89,0xc4 // mov %rax,%r12
+ .byte 0x49,0x89,0xf5 // mov %rsi,%r13
+ .byte 0x48,0x8d,0x43,0x04 // lea 0x4(%rbx),%rax
+ .byte 0x4c,0x39,0xf8 // cmp %r15,%rax
+ .byte 0x76,0x05 // jbe 28 <_sk_start_pipeline_sse41+0x28>
+ .byte 0x48,0x89,0xd8 // mov %rbx,%rax
+ .byte 0xeb,0x34 // jmp 5c <_sk_start_pipeline_sse41+0x5c>
.byte 0x0f,0x57,0xc0 // xorps %xmm0,%xmm0
.byte 0x0f,0x57,0xc9 // xorps %xmm1,%xmm1
.byte 0x0f,0x57,0xd2 // xorps %xmm2,%xmm2
@@ -1649,7 +1737,21 @@ _sk_start_pipeline_sse41:
.byte 0x0f,0x57,0xed // xorps %xmm5,%xmm5
.byte 0x0f,0x57,0xf6 // xorps %xmm6,%xmm6
.byte 0x0f,0x57,0xff // xorps %xmm7,%xmm7
- .byte 0xff,0xe0 // jmpq *%rax
+ .byte 0x48,0x89,0xdf // mov %rbx,%rdi
+ .byte 0x4c,0x89,0xee // mov %r13,%rsi
+ .byte 0x4c,0x89,0xf2 // mov %r14,%rdx
+ .byte 0x41,0xff,0xd4 // callq *%r12
+ .byte 0x48,0x8d,0x43,0x04 // lea 0x4(%rbx),%rax
+ .byte 0x48,0x83,0xc3,0x08 // add $0x8,%rbx
+ .byte 0x4c,0x39,0xfb // cmp %r15,%rbx
+ .byte 0x48,0x89,0xc3 // mov %rax,%rbx
+ .byte 0x76,0xcc // jbe 28 <_sk_start_pipeline_sse41+0x28>
+ .byte 0x5b // pop %rbx
+ .byte 0x41,0x5c // pop %r12
+ .byte 0x41,0x5d // pop %r13
+ .byte 0x41,0x5e // pop %r14
+ .byte 0x41,0x5f // pop %r15
+ .byte 0xc3 // retq
.globl _sk_just_return_sse41
_sk_just_return_sse41:
@@ -2312,7 +2414,22 @@ _sk_linear_gradient_2stops_sse41:
.globl _sk_start_pipeline_sse2
_sk_start_pipeline_sse2:
- .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x41,0x57 // push %r15
+ .byte 0x41,0x56 // push %r14
+ .byte 0x41,0x55 // push %r13
+ .byte 0x41,0x54 // push %r12
+ .byte 0x53 // push %rbx
+ .byte 0x49,0x89,0xcf // mov %rcx,%r15
+ .byte 0x49,0x89,0xd6 // mov %rdx,%r14
+ .byte 0x48,0x89,0xfb // mov %rdi,%rbx
+ .byte 0x48,0xad // lods %ds:(%rsi),%rax
+ .byte 0x49,0x89,0xc4 // mov %rax,%r12
+ .byte 0x49,0x89,0xf5 // mov %rsi,%r13
+ .byte 0x48,0x8d,0x43,0x04 // lea 0x4(%rbx),%rax
+ .byte 0x4c,0x39,0xf8 // cmp %r15,%rax
+ .byte 0x76,0x05 // jbe 28 <_sk_start_pipeline_sse2+0x28>
+ .byte 0x48,0x89,0xd8 // mov %rbx,%rax
+ .byte 0xeb,0x34 // jmp 5c <_sk_start_pipeline_sse2+0x5c>
.byte 0x0f,0x57,0xc0 // xorps %xmm0,%xmm0
.byte 0x0f,0x57,0xc9 // xorps %xmm1,%xmm1
.byte 0x0f,0x57,0xd2 // xorps %xmm2,%xmm2
@@ -2321,7 +2438,21 @@ _sk_start_pipeline_sse2:
.byte 0x0f,0x57,0xed // xorps %xmm5,%xmm5
.byte 0x0f,0x57,0xf6 // xorps %xmm6,%xmm6
.byte 0x0f,0x57,0xff // xorps %xmm7,%xmm7
- .byte 0xff,0xe0 // jmpq *%rax
+ .byte 0x48,0x89,0xdf // mov %rbx,%rdi
+ .byte 0x4c,0x89,0xee // mov %r13,%rsi
+ .byte 0x4c,0x89,0xf2 // mov %r14,%rdx
+ .byte 0x41,0xff,0xd4 // callq *%r12
+ .byte 0x48,0x8d,0x43,0x04 // lea 0x4(%rbx),%rax
+ .byte 0x48,0x83,0xc3,0x08 // add $0x8,%rbx
+ .byte 0x4c,0x39,0xfb // cmp %r15,%rbx
+ .byte 0x48,0x89,0xc3 // mov %rax,%rbx
+ .byte 0x76,0xcc // jbe 28 <_sk_start_pipeline_sse2+0x28>
+ .byte 0x5b // pop %rbx
+ .byte 0x41,0x5c // pop %r12
+ .byte 0x41,0x5d // pop %r13
+ .byte 0x41,0x5e // pop %r14
+ .byte 0x41,0x5f // pop %r15
+ .byte 0xc3 // retq
.globl _sk_just_return_sse2
_sk_just_return_sse2:
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index d6aaa2b7ea..27b8d584aa 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -10,9 +10,14 @@ _text SEGMENT
PUBLIC _sk_start_pipeline_hsw
_sk_start_pipeline_hsw LABEL PROC
+ DB 65,87 ; push %r15
+ DB 65,86 ; push %r14
+ DB 65,85 ; push %r13
+ DB 65,84 ; push %r12
DB 86 ; push %rsi
DB 87 ; push %rdi
- DB 72,129,236,168,0,0,0 ; sub $0xa8,%rsp
+ DB 83 ; push %rbx
+ DB 72,129,236,160,0,0,0 ; sub $0xa0,%rsp
DB 197,120,41,188,36,144,0,0,0 ; vmovaps %xmm15,0x90(%rsp)
DB 197,120,41,180,36,128,0,0,0 ; vmovaps %xmm14,0x80(%rsp)
DB 197,120,41,108,36,112 ; vmovaps %xmm13,0x70(%rsp)
@@ -23,8 +28,18 @@ _sk_start_pipeline_hsw LABEL PROC
DB 197,120,41,68,36,32 ; vmovaps %xmm8,0x20(%rsp)
DB 197,248,41,124,36,16 ; vmovaps %xmm7,0x10(%rsp)
DB 197,248,41,52,36 ; vmovaps %xmm6,(%rsp)
+ DB 77,137,207 ; mov %r9,%r15
+ DB 77,137,198 ; mov %r8,%r14
+ DB 72,137,203 ; mov %rcx,%rbx
DB 72,137,214 ; mov %rdx,%rsi
DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 73,137,196 ; mov %rax,%r12
+ DB 73,137,245 ; mov %rsi,%r13
+ DB 72,141,67,8 ; lea 0x8(%rbx),%rax
+ DB 76,57,248 ; cmp %r15,%rax
+ DB 118,5 ; jbe 75 <_sk_start_pipeline_hsw+0x75>
+ DB 72,137,216 ; mov %rbx,%rax
+ DB 235,60 ; jmp b1 <_sk_start_pipeline_hsw+0xb1>
DB 197,252,87,192 ; vxorps %ymm0,%ymm0,%ymm0
DB 197,244,87,201 ; vxorps %ymm1,%ymm1,%ymm1
DB 197,236,87,210 ; vxorps %ymm2,%ymm2,%ymm2
@@ -33,9 +48,15 @@ _sk_start_pipeline_hsw LABEL PROC
DB 197,212,87,237 ; vxorps %ymm5,%ymm5,%ymm5
DB 197,204,87,246 ; vxorps %ymm6,%ymm6,%ymm6
DB 197,196,87,255 ; vxorps %ymm7,%ymm7,%ymm7
- DB 72,137,207 ; mov %rcx,%rdi
- DB 76,137,194 ; mov %r8,%rdx
- DB 255,208 ; callq *%rax
+ DB 72,137,223 ; mov %rbx,%rdi
+ DB 76,137,238 ; mov %r13,%rsi
+ DB 76,137,242 ; mov %r14,%rdx
+ DB 65,255,212 ; callq *%r12
+ DB 72,141,67,8 ; lea 0x8(%rbx),%rax
+ DB 72,131,195,16 ; add $0x10,%rbx
+ DB 76,57,251 ; cmp %r15,%rbx
+ DB 72,137,195 ; mov %rax,%rbx
+ DB 118,196 ; jbe 75 <_sk_start_pipeline_hsw+0x75>
DB 197,248,40,52,36 ; vmovaps (%rsp),%xmm6
DB 197,248,40,124,36,16 ; vmovaps 0x10(%rsp),%xmm7
DB 197,120,40,68,36,32 ; vmovaps 0x20(%rsp),%xmm8
@@ -46,15 +67,19 @@ _sk_start_pipeline_hsw LABEL PROC
DB 197,120,40,108,36,112 ; vmovaps 0x70(%rsp),%xmm13
DB 197,120,40,180,36,128,0,0,0 ; vmovaps 0x80(%rsp),%xmm14
DB 197,120,40,188,36,144,0,0,0 ; vmovaps 0x90(%rsp),%xmm15
- DB 72,129,196,168,0,0,0 ; add $0xa8,%rsp
+ DB 72,129,196,160,0,0,0 ; add $0xa0,%rsp
+ DB 91 ; pop %rbx
DB 95 ; pop %rdi
DB 94 ; pop %rsi
+ DB 65,92 ; pop %r12
+ DB 65,93 ; pop %r13
+ DB 65,94 ; pop %r14
+ DB 65,95 ; pop %r15
DB 197,248,119 ; vzeroupper
DB 195 ; retq
PUBLIC _sk_just_return_hsw
_sk_just_return_hsw LABEL PROC
- DB 197,248,119 ; vzeroupper
DB 195 ; retq
PUBLIC _sk_seed_shader_hsw
@@ -529,9 +554,14 @@ _sk_linear_gradient_2stops_hsw LABEL PROC
PUBLIC _sk_start_pipeline_sse41
_sk_start_pipeline_sse41 LABEL PROC
+ DB 65,87 ; push %r15
+ DB 65,86 ; push %r14
+ DB 65,85 ; push %r13
+ DB 65,84 ; push %r12
DB 86 ; push %rsi
DB 87 ; push %rdi
- DB 72,129,236,168,0,0,0 ; sub $0xa8,%rsp
+ DB 83 ; push %rbx
+ DB 72,129,236,160,0,0,0 ; sub $0xa0,%rsp
DB 68,15,41,188,36,144,0,0,0 ; movaps %xmm15,0x90(%rsp)
DB 68,15,41,180,36,128,0,0,0 ; movaps %xmm14,0x80(%rsp)
DB 68,15,41,108,36,112 ; movaps %xmm13,0x70(%rsp)
@@ -542,8 +572,18 @@ _sk_start_pipeline_sse41 LABEL PROC
DB 68,15,41,68,36,32 ; movaps %xmm8,0x20(%rsp)
DB 15,41,124,36,16 ; movaps %xmm7,0x10(%rsp)
DB 15,41,52,36 ; movaps %xmm6,(%rsp)
+ DB 77,137,207 ; mov %r9,%r15
+ DB 77,137,198 ; mov %r8,%r14
+ DB 72,137,203 ; mov %rcx,%rbx
DB 72,137,214 ; mov %rdx,%rsi
DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 73,137,196 ; mov %rax,%r12
+ DB 73,137,245 ; mov %rsi,%r13
+ DB 72,141,67,4 ; lea 0x4(%rbx),%rax
+ DB 76,57,248 ; cmp %r15,%rax
+ DB 118,5 ; jbe 73 <_sk_start_pipeline_sse41+0x73>
+ DB 72,137,216 ; mov %rbx,%rax
+ DB 235,52 ; jmp a7 <_sk_start_pipeline_sse41+0xa7>
DB 15,87,192 ; xorps %xmm0,%xmm0
DB 15,87,201 ; xorps %xmm1,%xmm1
DB 15,87,210 ; xorps %xmm2,%xmm2
@@ -552,9 +592,15 @@ _sk_start_pipeline_sse41 LABEL PROC
DB 15,87,237 ; xorps %xmm5,%xmm5
DB 15,87,246 ; xorps %xmm6,%xmm6
DB 15,87,255 ; xorps %xmm7,%xmm7
- DB 72,137,207 ; mov %rcx,%rdi
- DB 76,137,194 ; mov %r8,%rdx
- DB 255,208 ; callq *%rax
+ DB 72,137,223 ; mov %rbx,%rdi
+ DB 76,137,238 ; mov %r13,%rsi
+ DB 76,137,242 ; mov %r14,%rdx
+ DB 65,255,212 ; callq *%r12
+ DB 72,141,67,4 ; lea 0x4(%rbx),%rax
+ DB 72,131,195,8 ; add $0x8,%rbx
+ DB 76,57,251 ; cmp %r15,%rbx
+ DB 72,137,195 ; mov %rax,%rbx
+ DB 118,204 ; jbe 73 <_sk_start_pipeline_sse41+0x73>
DB 15,40,52,36 ; movaps (%rsp),%xmm6
DB 15,40,124,36,16 ; movaps 0x10(%rsp),%xmm7
DB 68,15,40,68,36,32 ; movaps 0x20(%rsp),%xmm8
@@ -565,9 +611,14 @@ _sk_start_pipeline_sse41 LABEL PROC
DB 68,15,40,108,36,112 ; movaps 0x70(%rsp),%xmm13
DB 68,15,40,180,36,128,0,0,0 ; movaps 0x80(%rsp),%xmm14
DB 68,15,40,188,36,144,0,0,0 ; movaps 0x90(%rsp),%xmm15
- DB 72,129,196,168,0,0,0 ; add $0xa8,%rsp
+ DB 72,129,196,160,0,0,0 ; add $0xa0,%rsp
+ DB 91 ; pop %rbx
DB 95 ; pop %rdi
DB 94 ; pop %rsi
+ DB 65,92 ; pop %r12
+ DB 65,93 ; pop %r13
+ DB 65,94 ; pop %r14
+ DB 65,95 ; pop %r15
DB 195 ; retq
PUBLIC _sk_just_return_sse41
@@ -1231,9 +1282,14 @@ _sk_linear_gradient_2stops_sse41 LABEL PROC
PUBLIC _sk_start_pipeline_sse2
_sk_start_pipeline_sse2 LABEL PROC
+ DB 65,87 ; push %r15
+ DB 65,86 ; push %r14
+ DB 65,85 ; push %r13
+ DB 65,84 ; push %r12
DB 86 ; push %rsi
DB 87 ; push %rdi
- DB 72,129,236,168,0,0,0 ; sub $0xa8,%rsp
+ DB 83 ; push %rbx
+ DB 72,129,236,160,0,0,0 ; sub $0xa0,%rsp
DB 68,15,41,188,36,144,0,0,0 ; movaps %xmm15,0x90(%rsp)
DB 68,15,41,180,36,128,0,0,0 ; movaps %xmm14,0x80(%rsp)
DB 68,15,41,108,36,112 ; movaps %xmm13,0x70(%rsp)
@@ -1244,8 +1300,18 @@ _sk_start_pipeline_sse2 LABEL PROC
DB 68,15,41,68,36,32 ; movaps %xmm8,0x20(%rsp)
DB 15,41,124,36,16 ; movaps %xmm7,0x10(%rsp)
DB 15,41,52,36 ; movaps %xmm6,(%rsp)
+ DB 77,137,207 ; mov %r9,%r15
+ DB 77,137,198 ; mov %r8,%r14
+ DB 72,137,203 ; mov %rcx,%rbx
DB 72,137,214 ; mov %rdx,%rsi
DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 73,137,196 ; mov %rax,%r12
+ DB 73,137,245 ; mov %rsi,%r13
+ DB 72,141,67,4 ; lea 0x4(%rbx),%rax
+ DB 76,57,248 ; cmp %r15,%rax
+ DB 118,5 ; jbe 73 <_sk_start_pipeline_sse2+0x73>
+ DB 72,137,216 ; mov %rbx,%rax
+ DB 235,52 ; jmp a7 <_sk_start_pipeline_sse2+0xa7>
DB 15,87,192 ; xorps %xmm0,%xmm0
DB 15,87,201 ; xorps %xmm1,%xmm1
DB 15,87,210 ; xorps %xmm2,%xmm2
@@ -1254,9 +1320,15 @@ _sk_start_pipeline_sse2 LABEL PROC
DB 15,87,237 ; xorps %xmm5,%xmm5
DB 15,87,246 ; xorps %xmm6,%xmm6
DB 15,87,255 ; xorps %xmm7,%xmm7
- DB 72,137,207 ; mov %rcx,%rdi
- DB 76,137,194 ; mov %r8,%rdx
- DB 255,208 ; callq *%rax
+ DB 72,137,223 ; mov %rbx,%rdi
+ DB 76,137,238 ; mov %r13,%rsi
+ DB 76,137,242 ; mov %r14,%rdx
+ DB 65,255,212 ; callq *%r12
+ DB 72,141,67,4 ; lea 0x4(%rbx),%rax
+ DB 72,131,195,8 ; add $0x8,%rbx
+ DB 76,57,251 ; cmp %r15,%rbx
+ DB 72,137,195 ; mov %rax,%rbx
+ DB 118,204 ; jbe 73 <_sk_start_pipeline_sse2+0x73>
DB 15,40,52,36 ; movaps (%rsp),%xmm6
DB 15,40,124,36,16 ; movaps 0x10(%rsp),%xmm7
DB 68,15,40,68,36,32 ; movaps 0x20(%rsp),%xmm8
@@ -1267,9 +1339,14 @@ _sk_start_pipeline_sse2 LABEL PROC
DB 68,15,40,108,36,112 ; movaps 0x70(%rsp),%xmm13
DB 68,15,40,180,36,128,0,0,0 ; movaps 0x80(%rsp),%xmm14
DB 68,15,40,188,36,144,0,0,0 ; movaps 0x90(%rsp),%xmm15
- DB 72,129,196,168,0,0,0 ; add $0xa8,%rsp
+ DB 72,129,196,160,0,0,0 ; add $0xa0,%rsp
+ DB 91 ; pop %rbx
DB 95 ; pop %rdi
DB 94 ; pop %rsi
+ DB 65,92 ; pop %r12
+ DB 65,93 ; pop %r13
+ DB 65,94 ; pop %r14
+ DB 65,95 ; pop %r15
DB 195 ; retq
PUBLIC _sk_just_return_sse2
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index e47137fd0d..6a3be91467 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -219,18 +219,19 @@ static void* load_and_inc(void**& program) {
#if defined(JUMPER) && defined(WIN)
__attribute__((ms_abi))
#endif
-extern "C" void WRAP(start_pipeline)(size_t x, void** program, K* k) {
- auto next = (Stage*)load_and_inc(program);
+extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
F v{}; // TODO: faster uninitialized?
- next(x,program,k, v,v,v,v, v,v,v,v);
+ size_t stride = sizeof(F) / sizeof(float);
+ auto start = (Stage*)load_and_inc(program);
+ while (x + stride <= limit) {
+ start(x,program,k, v,v,v,v, v,v,v,v);
+ x += stride;
+ }
+ return x;
}
// Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller).
-extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {
-#if defined(JUMPER) && defined(__AVX2__)
- asm("vzeroupper");
-#endif
-}
+extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {}
// We can now define Stages!