diff options
author | Mike Klein <mtklein@chromium.org> | 2017-02-21 16:50:52 -0500 |
---|---|---|
committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2017-02-21 22:52:13 +0000 |
commit | 9ef63754a7262f57097b39318adf2f7789d23ecf (patch) | |
tree | e67f511f5e301564da500a00588793b9f04ec262 /src | |
parent | c663953504e7cd6ad0e673e926203a3a38478d05 (diff) |
Move looping logic into start_pipeline().
This should be a big win on Windows, but I haven't timed it there yet.
On my Mac, it's a solid 2% speedup.
PS1 was insufficiently ambitious, but here is its description for posterity:
No need to vzeroupper twice on Windows.
On Windows start_pipeline() will vzeroupper,
so no need to do it in just_return().
Change-Id: I099320b95da85900a60ce96fdb7a216a36db1858
Reviewed-on: https://skia-review.googlesource.com/8821
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/jumper/SkJumper.cpp | 19 | ||||
-rw-r--r-- | src/jumper/SkJumper_generated.S | 151 | ||||
-rw-r--r-- | src/jumper/SkJumper_generated_win.S | 109 | ||||
-rw-r--r-- | src/jumper/SkJumper_stages.cpp | 17 |
4 files changed, 251 insertions, 45 deletions
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp index 4923f6f92b..2140fd06a0 100644 --- a/src/jumper/SkJumper.cpp +++ b/src/jumper/SkJumper.cpp @@ -65,23 +65,23 @@ using StageFn = void(void); extern "C" { #if defined(__aarch64__) - void ASM(start_pipeline,aarch64)(size_t, void**, K*); + size_t ASM(start_pipeline,aarch64)(size_t, void**, K*, size_t); StageFn ASM(just_return,aarch64); #define M(st) StageFn ASM(st,aarch64); STAGES(M) #undef M #elif defined(__arm__) - void ASM(start_pipeline,vfp4)(size_t, void**, K*); + size_t ASM(start_pipeline,vfp4)(size_t, void**, K*, size_t); StageFn ASM(just_return,vfp4); #define M(st) StageFn ASM(st,vfp4); STAGES(M) #undef M #elif defined(__x86_64__) || defined(_M_X64) - void ASM(start_pipeline,hsw )(size_t, void**, K*); - void ASM(start_pipeline,sse41)(size_t, void**, K*); - void ASM(start_pipeline,sse2 )(size_t, void**, K*); + size_t ASM(start_pipeline,hsw )(size_t, void**, K*, size_t); + size_t ASM(start_pipeline,sse41)(size_t, void**, K*, size_t); + size_t ASM(start_pipeline,sse2 )(size_t, void**, K*, size_t); StageFn ASM(just_return,hsw), ASM(just_return,sse41), @@ -99,7 +99,7 @@ extern "C" { #endif // Portable, single-pixel stages. 
- void sk_start_pipeline(size_t, void**, K*); + size_t sk_start_pipeline(size_t, void**, K*, size_t); StageFn sk_just_return; #define M(st) StageFn sk_##st; STAGES(M) @@ -171,7 +171,7 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const { auto build_and_run = [&](size_t stride, StageFn* (*lookup)(SkRasterPipeline::StockStage), StageFn* just_return, - void (*start_pipeline)(size_t, void**, K*)) { + size_t (*start_pipeline)(size_t, void**, K*, size_t)) { if (x + stride <= limit) { void** ip = program.get(); for (auto&& st : fStages) { @@ -184,10 +184,7 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const { } *ip = (void*)just_return; - while (x + stride <= limit) { - start_pipeline(x, program.get(), &kConstants); - x += stride; - } + x = start_pipeline(x, program.get(), &kConstants, limit); } return true; }; diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S index 77a212e3df..0f6b1f3c37 100644 --- a/src/jumper/SkJumper_generated.S +++ b/src/jumper/SkJumper_generated.S @@ -12,7 +12,21 @@ .globl _sk_start_pipeline_aarch64 _sk_start_pipeline_aarch64: - .long 0xf8408423 // ldr x3, [x1],#8 + .long 0xa9bc5ff8 // stp x24, x23, [sp,#-64]! 
+ .long 0xa90157f6 // stp x22, x21, [sp,#16] + .long 0xa9024ff4 // stp x20, x19, [sp,#32] + .long 0xa9037bfd // stp x29, x30, [sp,#48] + .long 0x9100c3fd // add x29, sp, #0x30 + .long 0xaa0103f3 // mov x19, x1 + .long 0xf8408677 // ldr x23, [x19],#8 + .long 0xaa0003f6 // mov x22, x0 + .long 0xaa0303f4 // mov x20, x3 + .long 0xaa0203f5 // mov x21, x2 + .long 0x910012c8 // add x8, x22, #0x4 + .long 0xeb14011f // cmp x8, x20 + .long 0x54000069 // b.ls 3c <sk_start_pipeline_aarch64+0x3c> + .long 0xaa1603e0 // mov x0, x22 + .long 0x14000012 // b 80 <sk_start_pipeline_aarch64+0x80> .long 0x6f00e400 // movi v0.2d, #0x0 .long 0x6f00e401 // movi v1.2d, #0x0 .long 0x6f00e402 // movi v2.2d, #0x0 @@ -21,7 +35,20 @@ _sk_start_pipeline_aarch64: .long 0x6f00e405 // movi v5.2d, #0x0 .long 0x6f00e406 // movi v6.2d, #0x0 .long 0x6f00e407 // movi v7.2d, #0x0 - .long 0xd61f0060 // br x3 + .long 0xaa1603e0 // mov x0, x22 + .long 0xaa1303e1 // mov x1, x19 + .long 0xaa1503e2 // mov x2, x21 + .long 0xd63f02e0 // blr x23 + .long 0x910022c8 // add x8, x22, #0x8 + .long 0x910012c0 // add x0, x22, #0x4 + .long 0xeb14011f // cmp x8, x20 + .long 0xaa0003f6 // mov x22, x0 + .long 0x54fffe09 // b.ls 3c <sk_start_pipeline_aarch64+0x3c> + .long 0xa9437bfd // ldp x29, x30, [sp,#48] + .long 0xa9424ff4 // ldp x20, x19, [sp,#32] + .long 0xa94157f6 // ldp x22, x21, [sp,#16] + .long 0xa8c45ff8 // ldp x24, x23, [sp],#64 + .long 0xd65f03c0 // ret .globl _sk_just_return_aarch64 _sk_just_return_aarch64: @@ -551,16 +578,32 @@ _sk_linear_gradient_2stops_aarch64: .globl _sk_start_pipeline_vfp4 _sk_start_pipeline_vfp4: - .long 0xe4913004 // ldr r3, [r1], #4 + .long 0xe92d41f0 // push {r4, r5, r6, r7, r8, lr} + .long 0xe1a07001 // mov r7, r1 + .long 0xe1a04000 // mov r4, r0 + .long 0xe1a05003 // mov r5, r3 + .long 0xe1a08002 // mov r8, r2 + .long 0xe4976004 // ldr r6, [r7], #4 + .long 0xe2840002 // add r0, r4, #2 + .long 0xea00000d // b 58 <sk_start_pipeline_vfp4+0x58> .long 0xf2800010 // vmov.i32 d0, #0 + .long 
0xe1a00004 // mov r0, r4 .long 0xf2801010 // vmov.i32 d1, #0 + .long 0xe1a01007 // mov r1, r7 .long 0xf2802010 // vmov.i32 d2, #0 + .long 0xe1a02008 // mov r2, r8 .long 0xf2803010 // vmov.i32 d3, #0 .long 0xf2804010 // vmov.i32 d4, #0 .long 0xf2805010 // vmov.i32 d5, #0 .long 0xf2806010 // vmov.i32 d6, #0 .long 0xf2807010 // vmov.i32 d7, #0 - .long 0xe12fff13 // bx r3 + .long 0xe12fff36 // blx r6 + .long 0xe2840004 // add r0, r4, #4 + .long 0xe2844002 // add r4, r4, #2 + .long 0xe1500005 // cmp r0, r5 + .long 0x9affffef // bls 20 <sk_start_pipeline_vfp4+0x20> + .long 0xe1a00004 // mov r0, r4 + .long 0xe8bd81f0 // pop {r4, r5, r6, r7, r8, pc} .globl _sk_just_return_vfp4 _sk_just_return_vfp4: @@ -1152,7 +1195,22 @@ _sk_linear_gradient_2stops_vfp4: .globl _sk_start_pipeline_hsw _sk_start_pipeline_hsw: - .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x41,0x57 // push %r15 + .byte 0x41,0x56 // push %r14 + .byte 0x41,0x55 // push %r13 + .byte 0x41,0x54 // push %r12 + .byte 0x53 // push %rbx + .byte 0x49,0x89,0xcf // mov %rcx,%r15 + .byte 0x49,0x89,0xd6 // mov %rdx,%r14 + .byte 0x48,0x89,0xfb // mov %rdi,%rbx + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x49,0x89,0xc4 // mov %rax,%r12 + .byte 0x49,0x89,0xf5 // mov %rsi,%r13 + .byte 0x48,0x8d,0x43,0x08 // lea 0x8(%rbx),%rax + .byte 0x4c,0x39,0xf8 // cmp %r15,%rax + .byte 0x76,0x05 // jbe 28 <_sk_start_pipeline_hsw+0x28> + .byte 0x48,0x89,0xd8 // mov %rbx,%rax + .byte 0xeb,0x3c // jmp 64 <_sk_start_pipeline_hsw+0x64> .byte 0xc5,0xfc,0x57,0xc0 // vxorps %ymm0,%ymm0,%ymm0 .byte 0xc5,0xf4,0x57,0xc9 // vxorps %ymm1,%ymm1,%ymm1 .byte 0xc5,0xec,0x57,0xd2 // vxorps %ymm2,%ymm2,%ymm2 @@ -1161,7 +1219,22 @@ _sk_start_pipeline_hsw: .byte 0xc5,0xd4,0x57,0xed // vxorps %ymm5,%ymm5,%ymm5 .byte 0xc5,0xcc,0x57,0xf6 // vxorps %ymm6,%ymm6,%ymm6 .byte 0xc5,0xc4,0x57,0xff // vxorps %ymm7,%ymm7,%ymm7 - .byte 0xff,0xe0 // jmpq *%rax + .byte 0x48,0x89,0xdf // mov %rbx,%rdi + .byte 0x4c,0x89,0xee // mov %r13,%rsi + .byte 0x4c,0x89,0xf2 // 
mov %r14,%rdx + .byte 0x41,0xff,0xd4 // callq *%r12 + .byte 0x48,0x8d,0x43,0x08 // lea 0x8(%rbx),%rax + .byte 0x48,0x83,0xc3,0x10 // add $0x10,%rbx + .byte 0x4c,0x39,0xfb // cmp %r15,%rbx + .byte 0x48,0x89,0xc3 // mov %rax,%rbx + .byte 0x76,0xc4 // jbe 28 <_sk_start_pipeline_hsw+0x28> + .byte 0x5b // pop %rbx + .byte 0x41,0x5c // pop %r12 + .byte 0x41,0x5d // pop %r13 + .byte 0x41,0x5e // pop %r14 + .byte 0x41,0x5f // pop %r15 + .byte 0xc5,0xf8,0x77 // vzeroupper + .byte 0xc3 // retq .globl _sk_just_return_hsw _sk_just_return_hsw: @@ -1640,7 +1713,22 @@ _sk_linear_gradient_2stops_hsw: .globl _sk_start_pipeline_sse41 _sk_start_pipeline_sse41: - .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x41,0x57 // push %r15 + .byte 0x41,0x56 // push %r14 + .byte 0x41,0x55 // push %r13 + .byte 0x41,0x54 // push %r12 + .byte 0x53 // push %rbx + .byte 0x49,0x89,0xcf // mov %rcx,%r15 + .byte 0x49,0x89,0xd6 // mov %rdx,%r14 + .byte 0x48,0x89,0xfb // mov %rdi,%rbx + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x49,0x89,0xc4 // mov %rax,%r12 + .byte 0x49,0x89,0xf5 // mov %rsi,%r13 + .byte 0x48,0x8d,0x43,0x04 // lea 0x4(%rbx),%rax + .byte 0x4c,0x39,0xf8 // cmp %r15,%rax + .byte 0x76,0x05 // jbe 28 <_sk_start_pipeline_sse41+0x28> + .byte 0x48,0x89,0xd8 // mov %rbx,%rax + .byte 0xeb,0x34 // jmp 5c <_sk_start_pipeline_sse41+0x5c> .byte 0x0f,0x57,0xc0 // xorps %xmm0,%xmm0 .byte 0x0f,0x57,0xc9 // xorps %xmm1,%xmm1 .byte 0x0f,0x57,0xd2 // xorps %xmm2,%xmm2 @@ -1649,7 +1737,21 @@ _sk_start_pipeline_sse41: .byte 0x0f,0x57,0xed // xorps %xmm5,%xmm5 .byte 0x0f,0x57,0xf6 // xorps %xmm6,%xmm6 .byte 0x0f,0x57,0xff // xorps %xmm7,%xmm7 - .byte 0xff,0xe0 // jmpq *%rax + .byte 0x48,0x89,0xdf // mov %rbx,%rdi + .byte 0x4c,0x89,0xee // mov %r13,%rsi + .byte 0x4c,0x89,0xf2 // mov %r14,%rdx + .byte 0x41,0xff,0xd4 // callq *%r12 + .byte 0x48,0x8d,0x43,0x04 // lea 0x4(%rbx),%rax + .byte 0x48,0x83,0xc3,0x08 // add $0x8,%rbx + .byte 0x4c,0x39,0xfb // cmp %r15,%rbx + .byte 0x48,0x89,0xc3 // mov %rax,%rbx 
+ .byte 0x76,0xcc // jbe 28 <_sk_start_pipeline_sse41+0x28> + .byte 0x5b // pop %rbx + .byte 0x41,0x5c // pop %r12 + .byte 0x41,0x5d // pop %r13 + .byte 0x41,0x5e // pop %r14 + .byte 0x41,0x5f // pop %r15 + .byte 0xc3 // retq .globl _sk_just_return_sse41 _sk_just_return_sse41: @@ -2312,7 +2414,22 @@ _sk_linear_gradient_2stops_sse41: .globl _sk_start_pipeline_sse2 _sk_start_pipeline_sse2: - .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x41,0x57 // push %r15 + .byte 0x41,0x56 // push %r14 + .byte 0x41,0x55 // push %r13 + .byte 0x41,0x54 // push %r12 + .byte 0x53 // push %rbx + .byte 0x49,0x89,0xcf // mov %rcx,%r15 + .byte 0x49,0x89,0xd6 // mov %rdx,%r14 + .byte 0x48,0x89,0xfb // mov %rdi,%rbx + .byte 0x48,0xad // lods %ds:(%rsi),%rax + .byte 0x49,0x89,0xc4 // mov %rax,%r12 + .byte 0x49,0x89,0xf5 // mov %rsi,%r13 + .byte 0x48,0x8d,0x43,0x04 // lea 0x4(%rbx),%rax + .byte 0x4c,0x39,0xf8 // cmp %r15,%rax + .byte 0x76,0x05 // jbe 28 <_sk_start_pipeline_sse2+0x28> + .byte 0x48,0x89,0xd8 // mov %rbx,%rax + .byte 0xeb,0x34 // jmp 5c <_sk_start_pipeline_sse2+0x5c> .byte 0x0f,0x57,0xc0 // xorps %xmm0,%xmm0 .byte 0x0f,0x57,0xc9 // xorps %xmm1,%xmm1 .byte 0x0f,0x57,0xd2 // xorps %xmm2,%xmm2 @@ -2321,7 +2438,21 @@ _sk_start_pipeline_sse2: .byte 0x0f,0x57,0xed // xorps %xmm5,%xmm5 .byte 0x0f,0x57,0xf6 // xorps %xmm6,%xmm6 .byte 0x0f,0x57,0xff // xorps %xmm7,%xmm7 - .byte 0xff,0xe0 // jmpq *%rax + .byte 0x48,0x89,0xdf // mov %rbx,%rdi + .byte 0x4c,0x89,0xee // mov %r13,%rsi + .byte 0x4c,0x89,0xf2 // mov %r14,%rdx + .byte 0x41,0xff,0xd4 // callq *%r12 + .byte 0x48,0x8d,0x43,0x04 // lea 0x4(%rbx),%rax + .byte 0x48,0x83,0xc3,0x08 // add $0x8,%rbx + .byte 0x4c,0x39,0xfb // cmp %r15,%rbx + .byte 0x48,0x89,0xc3 // mov %rax,%rbx + .byte 0x76,0xcc // jbe 28 <_sk_start_pipeline_sse2+0x28> + .byte 0x5b // pop %rbx + .byte 0x41,0x5c // pop %r12 + .byte 0x41,0x5d // pop %r13 + .byte 0x41,0x5e // pop %r14 + .byte 0x41,0x5f // pop %r15 + .byte 0xc3 // retq .globl _sk_just_return_sse2 
_sk_just_return_sse2: diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S index d6aaa2b7ea..27b8d584aa 100644 --- a/src/jumper/SkJumper_generated_win.S +++ b/src/jumper/SkJumper_generated_win.S @@ -10,9 +10,14 @@ _text SEGMENT PUBLIC _sk_start_pipeline_hsw _sk_start_pipeline_hsw LABEL PROC + DB 65,87 ; push %r15 + DB 65,86 ; push %r14 + DB 65,85 ; push %r13 + DB 65,84 ; push %r12 DB 86 ; push %rsi DB 87 ; push %rdi - DB 72,129,236,168,0,0,0 ; sub $0xa8,%rsp + DB 83 ; push %rbx + DB 72,129,236,160,0,0,0 ; sub $0xa0,%rsp DB 197,120,41,188,36,144,0,0,0 ; vmovaps %xmm15,0x90(%rsp) DB 197,120,41,180,36,128,0,0,0 ; vmovaps %xmm14,0x80(%rsp) DB 197,120,41,108,36,112 ; vmovaps %xmm13,0x70(%rsp) @@ -23,8 +28,18 @@ _sk_start_pipeline_hsw LABEL PROC DB 197,120,41,68,36,32 ; vmovaps %xmm8,0x20(%rsp) DB 197,248,41,124,36,16 ; vmovaps %xmm7,0x10(%rsp) DB 197,248,41,52,36 ; vmovaps %xmm6,(%rsp) + DB 77,137,207 ; mov %r9,%r15 + DB 77,137,198 ; mov %r8,%r14 + DB 72,137,203 ; mov %rcx,%rbx DB 72,137,214 ; mov %rdx,%rsi DB 72,173 ; lods %ds:(%rsi),%rax + DB 73,137,196 ; mov %rax,%r12 + DB 73,137,245 ; mov %rsi,%r13 + DB 72,141,67,8 ; lea 0x8(%rbx),%rax + DB 76,57,248 ; cmp %r15,%rax + DB 118,5 ; jbe 75 <_sk_start_pipeline_hsw+0x75> + DB 72,137,216 ; mov %rbx,%rax + DB 235,60 ; jmp b1 <_sk_start_pipeline_hsw+0xb1> DB 197,252,87,192 ; vxorps %ymm0,%ymm0,%ymm0 DB 197,244,87,201 ; vxorps %ymm1,%ymm1,%ymm1 DB 197,236,87,210 ; vxorps %ymm2,%ymm2,%ymm2 @@ -33,9 +48,15 @@ _sk_start_pipeline_hsw LABEL PROC DB 197,212,87,237 ; vxorps %ymm5,%ymm5,%ymm5 DB 197,204,87,246 ; vxorps %ymm6,%ymm6,%ymm6 DB 197,196,87,255 ; vxorps %ymm7,%ymm7,%ymm7 - DB 72,137,207 ; mov %rcx,%rdi - DB 76,137,194 ; mov %r8,%rdx - DB 255,208 ; callq *%rax + DB 72,137,223 ; mov %rbx,%rdi + DB 76,137,238 ; mov %r13,%rsi + DB 76,137,242 ; mov %r14,%rdx + DB 65,255,212 ; callq *%r12 + DB 72,141,67,8 ; lea 0x8(%rbx),%rax + DB 72,131,195,16 ; add $0x10,%rbx + DB 76,57,251 ; cmp %r15,%rbx + DB 
72,137,195 ; mov %rax,%rbx + DB 118,196 ; jbe 75 <_sk_start_pipeline_hsw+0x75> DB 197,248,40,52,36 ; vmovaps (%rsp),%xmm6 DB 197,248,40,124,36,16 ; vmovaps 0x10(%rsp),%xmm7 DB 197,120,40,68,36,32 ; vmovaps 0x20(%rsp),%xmm8 @@ -46,15 +67,19 @@ _sk_start_pipeline_hsw LABEL PROC DB 197,120,40,108,36,112 ; vmovaps 0x70(%rsp),%xmm13 DB 197,120,40,180,36,128,0,0,0 ; vmovaps 0x80(%rsp),%xmm14 DB 197,120,40,188,36,144,0,0,0 ; vmovaps 0x90(%rsp),%xmm15 - DB 72,129,196,168,0,0,0 ; add $0xa8,%rsp + DB 72,129,196,160,0,0,0 ; add $0xa0,%rsp + DB 91 ; pop %rbx DB 95 ; pop %rdi DB 94 ; pop %rsi + DB 65,92 ; pop %r12 + DB 65,93 ; pop %r13 + DB 65,94 ; pop %r14 + DB 65,95 ; pop %r15 DB 197,248,119 ; vzeroupper DB 195 ; retq PUBLIC _sk_just_return_hsw _sk_just_return_hsw LABEL PROC - DB 197,248,119 ; vzeroupper DB 195 ; retq PUBLIC _sk_seed_shader_hsw @@ -529,9 +554,14 @@ _sk_linear_gradient_2stops_hsw LABEL PROC PUBLIC _sk_start_pipeline_sse41 _sk_start_pipeline_sse41 LABEL PROC + DB 65,87 ; push %r15 + DB 65,86 ; push %r14 + DB 65,85 ; push %r13 + DB 65,84 ; push %r12 DB 86 ; push %rsi DB 87 ; push %rdi - DB 72,129,236,168,0,0,0 ; sub $0xa8,%rsp + DB 83 ; push %rbx + DB 72,129,236,160,0,0,0 ; sub $0xa0,%rsp DB 68,15,41,188,36,144,0,0,0 ; movaps %xmm15,0x90(%rsp) DB 68,15,41,180,36,128,0,0,0 ; movaps %xmm14,0x80(%rsp) DB 68,15,41,108,36,112 ; movaps %xmm13,0x70(%rsp) @@ -542,8 +572,18 @@ _sk_start_pipeline_sse41 LABEL PROC DB 68,15,41,68,36,32 ; movaps %xmm8,0x20(%rsp) DB 15,41,124,36,16 ; movaps %xmm7,0x10(%rsp) DB 15,41,52,36 ; movaps %xmm6,(%rsp) + DB 77,137,207 ; mov %r9,%r15 + DB 77,137,198 ; mov %r8,%r14 + DB 72,137,203 ; mov %rcx,%rbx DB 72,137,214 ; mov %rdx,%rsi DB 72,173 ; lods %ds:(%rsi),%rax + DB 73,137,196 ; mov %rax,%r12 + DB 73,137,245 ; mov %rsi,%r13 + DB 72,141,67,4 ; lea 0x4(%rbx),%rax + DB 76,57,248 ; cmp %r15,%rax + DB 118,5 ; jbe 73 <_sk_start_pipeline_sse41+0x73> + DB 72,137,216 ; mov %rbx,%rax + DB 235,52 ; jmp a7 <_sk_start_pipeline_sse41+0xa7> DB 15,87,192 
; xorps %xmm0,%xmm0 DB 15,87,201 ; xorps %xmm1,%xmm1 DB 15,87,210 ; xorps %xmm2,%xmm2 @@ -552,9 +592,15 @@ _sk_start_pipeline_sse41 LABEL PROC DB 15,87,237 ; xorps %xmm5,%xmm5 DB 15,87,246 ; xorps %xmm6,%xmm6 DB 15,87,255 ; xorps %xmm7,%xmm7 - DB 72,137,207 ; mov %rcx,%rdi - DB 76,137,194 ; mov %r8,%rdx - DB 255,208 ; callq *%rax + DB 72,137,223 ; mov %rbx,%rdi + DB 76,137,238 ; mov %r13,%rsi + DB 76,137,242 ; mov %r14,%rdx + DB 65,255,212 ; callq *%r12 + DB 72,141,67,4 ; lea 0x4(%rbx),%rax + DB 72,131,195,8 ; add $0x8,%rbx + DB 76,57,251 ; cmp %r15,%rbx + DB 72,137,195 ; mov %rax,%rbx + DB 118,204 ; jbe 73 <_sk_start_pipeline_sse41+0x73> DB 15,40,52,36 ; movaps (%rsp),%xmm6 DB 15,40,124,36,16 ; movaps 0x10(%rsp),%xmm7 DB 68,15,40,68,36,32 ; movaps 0x20(%rsp),%xmm8 @@ -565,9 +611,14 @@ _sk_start_pipeline_sse41 LABEL PROC DB 68,15,40,108,36,112 ; movaps 0x70(%rsp),%xmm13 DB 68,15,40,180,36,128,0,0,0 ; movaps 0x80(%rsp),%xmm14 DB 68,15,40,188,36,144,0,0,0 ; movaps 0x90(%rsp),%xmm15 - DB 72,129,196,168,0,0,0 ; add $0xa8,%rsp + DB 72,129,196,160,0,0,0 ; add $0xa0,%rsp + DB 91 ; pop %rbx DB 95 ; pop %rdi DB 94 ; pop %rsi + DB 65,92 ; pop %r12 + DB 65,93 ; pop %r13 + DB 65,94 ; pop %r14 + DB 65,95 ; pop %r15 DB 195 ; retq PUBLIC _sk_just_return_sse41 @@ -1231,9 +1282,14 @@ _sk_linear_gradient_2stops_sse41 LABEL PROC PUBLIC _sk_start_pipeline_sse2 _sk_start_pipeline_sse2 LABEL PROC + DB 65,87 ; push %r15 + DB 65,86 ; push %r14 + DB 65,85 ; push %r13 + DB 65,84 ; push %r12 DB 86 ; push %rsi DB 87 ; push %rdi - DB 72,129,236,168,0,0,0 ; sub $0xa8,%rsp + DB 83 ; push %rbx + DB 72,129,236,160,0,0,0 ; sub $0xa0,%rsp DB 68,15,41,188,36,144,0,0,0 ; movaps %xmm15,0x90(%rsp) DB 68,15,41,180,36,128,0,0,0 ; movaps %xmm14,0x80(%rsp) DB 68,15,41,108,36,112 ; movaps %xmm13,0x70(%rsp) @@ -1244,8 +1300,18 @@ _sk_start_pipeline_sse2 LABEL PROC DB 68,15,41,68,36,32 ; movaps %xmm8,0x20(%rsp) DB 15,41,124,36,16 ; movaps %xmm7,0x10(%rsp) DB 15,41,52,36 ; movaps %xmm6,(%rsp) + DB 77,137,207 ; 
mov %r9,%r15 + DB 77,137,198 ; mov %r8,%r14 + DB 72,137,203 ; mov %rcx,%rbx DB 72,137,214 ; mov %rdx,%rsi DB 72,173 ; lods %ds:(%rsi),%rax + DB 73,137,196 ; mov %rax,%r12 + DB 73,137,245 ; mov %rsi,%r13 + DB 72,141,67,4 ; lea 0x4(%rbx),%rax + DB 76,57,248 ; cmp %r15,%rax + DB 118,5 ; jbe 73 <_sk_start_pipeline_sse2+0x73> + DB 72,137,216 ; mov %rbx,%rax + DB 235,52 ; jmp a7 <_sk_start_pipeline_sse2+0xa7> DB 15,87,192 ; xorps %xmm0,%xmm0 DB 15,87,201 ; xorps %xmm1,%xmm1 DB 15,87,210 ; xorps %xmm2,%xmm2 @@ -1254,9 +1320,15 @@ _sk_start_pipeline_sse2 LABEL PROC DB 15,87,237 ; xorps %xmm5,%xmm5 DB 15,87,246 ; xorps %xmm6,%xmm6 DB 15,87,255 ; xorps %xmm7,%xmm7 - DB 72,137,207 ; mov %rcx,%rdi - DB 76,137,194 ; mov %r8,%rdx - DB 255,208 ; callq *%rax + DB 72,137,223 ; mov %rbx,%rdi + DB 76,137,238 ; mov %r13,%rsi + DB 76,137,242 ; mov %r14,%rdx + DB 65,255,212 ; callq *%r12 + DB 72,141,67,4 ; lea 0x4(%rbx),%rax + DB 72,131,195,8 ; add $0x8,%rbx + DB 76,57,251 ; cmp %r15,%rbx + DB 72,137,195 ; mov %rax,%rbx + DB 118,204 ; jbe 73 <_sk_start_pipeline_sse2+0x73> DB 15,40,52,36 ; movaps (%rsp),%xmm6 DB 15,40,124,36,16 ; movaps 0x10(%rsp),%xmm7 DB 68,15,40,68,36,32 ; movaps 0x20(%rsp),%xmm8 @@ -1267,9 +1339,14 @@ _sk_start_pipeline_sse2 LABEL PROC DB 68,15,40,108,36,112 ; movaps 0x70(%rsp),%xmm13 DB 68,15,40,180,36,128,0,0,0 ; movaps 0x80(%rsp),%xmm14 DB 68,15,40,188,36,144,0,0,0 ; movaps 0x90(%rsp),%xmm15 - DB 72,129,196,168,0,0,0 ; add $0xa8,%rsp + DB 72,129,196,160,0,0,0 ; add $0xa0,%rsp + DB 91 ; pop %rbx DB 95 ; pop %rdi DB 94 ; pop %rsi + DB 65,92 ; pop %r12 + DB 65,93 ; pop %r13 + DB 65,94 ; pop %r14 + DB 65,95 ; pop %r15 DB 195 ; retq PUBLIC _sk_just_return_sse2 diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp index e47137fd0d..6a3be91467 100644 --- a/src/jumper/SkJumper_stages.cpp +++ b/src/jumper/SkJumper_stages.cpp @@ -219,18 +219,19 @@ static void* load_and_inc(void**& program) { #if defined(JUMPER) && defined(WIN) __attribute__((ms_abi)) 
#endif -extern "C" void WRAP(start_pipeline)(size_t x, void** program, K* k) { - auto next = (Stage*)load_and_inc(program); +extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) { F v{}; // TODO: faster uninitialized? - next(x,program,k, v,v,v,v, v,v,v,v); + size_t stride = sizeof(F) / sizeof(float); + auto start = (Stage*)load_and_inc(program); + while (x + stride <= limit) { + start(x,program,k, v,v,v,v, v,v,v,v); + x += stride; + } + return x; } // Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller). -extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) { -#if defined(JUMPER) && defined(__AVX2__) - asm("vzeroupper"); -#endif -} +extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {} // We can now define Stages! |