Move looping logic into start_pipeline().

This should be a big win on Windows, but I haven't timed it there yet. On my Mac, it's a solid 2% speedup. PS1 was insufficiently ambitious, but here is its description for posterity: No need to vzeroupper twice on Windows. On Windows start_pipeline() will vzeroupper, so no need to do it in just_return(). Change-Id: I099320b95da85900a60ce96fdb7a216a36db1858 Reviewed-on: https://skia-review.googlesource.com/8821 Commit-Queue: Mike Klein <mtklein@chromium.org> Reviewed-by: Herb Derby <herb@google.com>
author: Mike Klein <mtklein@chromium.org> 2017-02-21 16:50:52 -0500
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> 2017-02-21 22:52:13 +0000
commit: 9ef63754a7262f57097b39318adf2f7789d23ecf (patch)
tree: e67f511f5e301564da500a00588793b9f04ec262 /src
parent: c663953504e7cd6ad0e673e926203a3a38478d05 (diff)
4 files changed, 251 insertions, 45 deletions
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 4923f6f92b..2140fd06a0 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -65,23 +65,23 @@ using StageFn = void(void);
 extern "C" {
 
 #if defined(__aarch64__)
-    void ASM(start_pipeline,aarch64)(size_t, void**, K*);
+    size_t ASM(start_pipeline,aarch64)(size_t, void**, K*, size_t);
     StageFn ASM(just_return,aarch64);
     #define M(st) StageFn ASM(st,aarch64);
         STAGES(M)
     #undef M
 
 #elif defined(__arm__)
-    void ASM(start_pipeline,vfp4)(size_t, void**, K*);
+    size_t ASM(start_pipeline,vfp4)(size_t, void**, K*, size_t);
     StageFn ASM(just_return,vfp4);
     #define M(st) StageFn ASM(st,vfp4);
         STAGES(M)
     #undef M
 
 #elif defined(__x86_64__) || defined(_M_X64)
-    void ASM(start_pipeline,hsw  )(size_t, void**, K*);
-    void ASM(start_pipeline,sse41)(size_t, void**, K*);
-    void ASM(start_pipeline,sse2 )(size_t, void**, K*);
+    size_t ASM(start_pipeline,hsw  )(size_t, void**, K*, size_t);
+    size_t ASM(start_pipeline,sse41)(size_t, void**, K*, size_t);
+    size_t ASM(start_pipeline,sse2 )(size_t, void**, K*, size_t);
 
     StageFn ASM(just_return,hsw),
             ASM(just_return,sse41),
@@ -99,7 +99,7 @@ extern "C" {
 #endif
 
     // Portable, single-pixel stages.
-    void sk_start_pipeline(size_t, void**, K*);
+    size_t sk_start_pipeline(size_t, void**, K*, size_t);
     StageFn sk_just_return;
     #define M(st) StageFn sk_##st;
         STAGES(M)
@@ -171,7 +171,7 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
     auto build_and_run = [&](size_t   stride,
                              StageFn* (*lookup)(SkRasterPipeline::StockStage),
                              StageFn* just_return,
-                             void     (*start_pipeline)(size_t, void**, K*)) {
+                             size_t   (*start_pipeline)(size_t, void**, K*, size_t)) {
         if (x + stride <= limit) {
             void** ip = program.get();
             for (auto&& st : fStages) {
@@ -184,10 +184,7 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
             }
             *ip = (void*)just_return;
 
-            while (x + stride <= limit) {
-                start_pipeline(x, program.get(), &kConstants);
-                x += stride;
-            }
+            x = start_pipeline(x, program.get(), &kConstants, limit);
         }
         return true;
     };
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index 77a212e3df..0f6b1f3c37 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -12,7 +12,21 @@
 
 .globl _sk_start_pipeline_aarch64
 _sk_start_pipeline_aarch64:
-  .long  0xf8408423                                      // ldr           x3, [x1],#8
+  .long  0xa9bc5ff8                                      // stp           x24, x23, [sp,#-64]!
+  .long  0xa90157f6                                      // stp           x22, x21, [sp,#16]
+  .long  0xa9024ff4                                      // stp           x20, x19, [sp,#32]
+  .long  0xa9037bfd                                      // stp           x29, x30, [sp,#48]
+  .long  0x9100c3fd                                      // add           x29, sp, #0x30
+  .long  0xaa0103f3                                      // mov           x19, x1
+  .long  0xf8408677                                      // ldr           x23, [x19],#8
+  .long  0xaa0003f6                                      // mov           x22, x0
+  .long  0xaa0303f4                                      // mov           x20, x3
+  .long  0xaa0203f5                                      // mov           x21, x2
+  .long  0x910012c8                                      // add           x8, x22, #0x4
+  .long  0xeb14011f                                      // cmp           x8, x20
+  .long  0x54000069                                      // b.ls          3c <sk_start_pipeline_aarch64+0x3c>
+  .long  0xaa1603e0                                      // mov           x0, x22
+  .long  0x14000012                                      // b             80 <sk_start_pipeline_aarch64+0x80>
   .long  0x6f00e400                                      // movi          v0.2d, #0x0
   .long  0x6f00e401                                      // movi          v1.2d, #0x0
   .long  0x6f00e402                                      // movi          v2.2d, #0x0
@@ -21,7 +35,20 @@ _sk_start_pipeline_aarch64:
   .long  0x6f00e405                                      // movi          v5.2d, #0x0
   .long  0x6f00e406                                      // movi          v6.2d, #0x0
   .long  0x6f00e407                                      // movi          v7.2d, #0x0
-  .long  0xd61f0060                                      // br            x3
+  .long  0xaa1603e0                                      // mov           x0, x22
+  .long  0xaa1303e1                                      // mov           x1, x19
+  .long  0xaa1503e2                                      // mov           x2, x21
+  .long  0xd63f02e0                                      // blr           x23
+  .long  0x910022c8                                      // add           x8, x22, #0x8
+  .long  0x910012c0                                      // add           x0, x22, #0x4
+  .long  0xeb14011f                                      // cmp           x8, x20
+  .long  0xaa0003f6                                      // mov           x22, x0
+  .long  0x54fffe09                                      // b.ls          3c <sk_start_pipeline_aarch64+0x3c>
+  .long  0xa9437bfd                                      // ldp           x29, x30, [sp,#48]
+  .long  0xa9424ff4                                      // ldp           x20, x19, [sp,#32]
+  .long  0xa94157f6                                      // ldp           x22, x21, [sp,#16]
+  .long  0xa8c45ff8                                      // ldp           x24, x23, [sp],#64
+  .long  0xd65f03c0                                      // ret
 
 .globl _sk_just_return_aarch64
 _sk_just_return_aarch64:
@@ -551,16 +578,32 @@ _sk_linear_gradient_2stops_aarch64:
 
 .globl _sk_start_pipeline_vfp4
 _sk_start_pipeline_vfp4:
-  .long  0xe4913004                                      // ldr           r3, [r1], #4
+  .long  0xe92d41f0                                      // push          {r4, r5, r6, r7, r8, lr}
+  .long  0xe1a07001                                      // mov           r7, r1
+  .long  0xe1a04000                                      // mov           r4, r0
+  .long  0xe1a05003                                      // mov           r5, r3
+  .long  0xe1a08002                                      // mov           r8, r2
+  .long  0xe4976004                                      // ldr           r6, [r7], #4
+  .long  0xe2840002                                      // add           r0, r4, #2
+  .long  0xea00000d                                      // b             58 <sk_start_pipeline_vfp4+0x58>
   .long  0xf2800010                                      // vmov.i32      d0, #0
+  .long  0xe1a00004                                      // mov           r0, r4
   .long  0xf2801010                                      // vmov.i32      d1, #0
+  .long  0xe1a01007                                      // mov           r1, r7
   .long  0xf2802010                                      // vmov.i32      d2, #0
+  .long  0xe1a02008                                      // mov           r2, r8
   .long  0xf2803010                                      // vmov.i32      d3, #0
   .long  0xf2804010                                      // vmov.i32      d4, #0
   .long  0xf2805010                                      // vmov.i32      d5, #0
   .long  0xf2806010                                      // vmov.i32      d6, #0
   .long  0xf2807010                                      // vmov.i32      d7, #0
-  .long  0xe12fff13                                      // bx            r3
+  .long  0xe12fff36                                      // blx           r6
+  .long  0xe2840004                                      // add           r0, r4, #4
+  .long  0xe2844002                                      // add           r4, r4, #2
+  .long  0xe1500005                                      // cmp           r0, r5
+  .long  0x9affffef                                      // bls           20 <sk_start_pipeline_vfp4+0x20>
+  .long  0xe1a00004                                      // mov           r0, r4
+  .long  0xe8bd81f0                                      // pop           {r4, r5, r6, r7, r8, pc}
 
 .globl _sk_just_return_vfp4
 _sk_just_return_vfp4:
@@ -1152,7 +1195,22 @@ _sk_linear_gradient_2stops_vfp4:
 
 .globl _sk_start_pipeline_hsw
 _sk_start_pipeline_hsw:
-  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x41,0x57                                       // push          %r15
+  .byte  0x41,0x56                                       // push          %r14
+  .byte  0x41,0x55                                       // push          %r13
+  .byte  0x41,0x54                                       // push          %r12
+  .byte  0x53                                            // push          %rbx
+  .byte  0x49,0x89,0xcf                                  // mov           %rcx,%r15
+  .byte  0x49,0x89,0xd6                                  // mov           %rdx,%r14
+  .byte  0x48,0x89,0xfb                                  // mov           %rdi,%rbx
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x49,0x89,0xc4                                  // mov           %rax,%r12
+  .byte  0x49,0x89,0xf5                                  // mov           %rsi,%r13
+  .byte  0x48,0x8d,0x43,0x08                             // lea           0x8(%rbx),%rax
+  .byte  0x4c,0x39,0xf8                                  // cmp           %r15,%rax
+  .byte  0x76,0x05                                       // jbe           28 <_sk_start_pipeline_hsw+0x28>
+  .byte  0x48,0x89,0xd8                                  // mov           %rbx,%rax
+  .byte  0xeb,0x3c                                       // jmp           64 <_sk_start_pipeline_hsw+0x64>
   .byte  0xc5,0xfc,0x57,0xc0                             // vxorps        %ymm0,%ymm0,%ymm0
   .byte  0xc5,0xf4,0x57,0xc9                             // vxorps        %ymm1,%ymm1,%ymm1
   .byte  0xc5,0xec,0x57,0xd2                             // vxorps        %ymm2,%ymm2,%ymm2
@@ -1161,7 +1219,22 @@ _sk_start_pipeline_hsw:
   .byte  0xc5,0xd4,0x57,0xed                             // vxorps        %ymm5,%ymm5,%ymm5
   .byte  0xc5,0xcc,0x57,0xf6                             // vxorps        %ymm6,%ymm6,%ymm6
   .byte  0xc5,0xc4,0x57,0xff                             // vxorps        %ymm7,%ymm7,%ymm7
-  .byte  0xff,0xe0                                       // jmpq          *%rax
+  .byte  0x48,0x89,0xdf                                  // mov           %rbx,%rdi
+  .byte  0x4c,0x89,0xee                                  // mov           %r13,%rsi
+  .byte  0x4c,0x89,0xf2                                  // mov           %r14,%rdx
+  .byte  0x41,0xff,0xd4                                  // callq         *%r12
+  .byte  0x48,0x8d,0x43,0x08                             // lea           0x8(%rbx),%rax
+  .byte  0x48,0x83,0xc3,0x10                             // add           $0x10,%rbx
+  .byte  0x4c,0x39,0xfb                                  // cmp           %r15,%rbx
+  .byte  0x48,0x89,0xc3                                  // mov           %rax,%rbx
+  .byte  0x76,0xc4                                       // jbe           28 <_sk_start_pipeline_hsw+0x28>
+  .byte  0x5b                                            // pop           %rbx
+  .byte  0x41,0x5c                                       // pop           %r12
+  .byte  0x41,0x5d                                       // pop           %r13
+  .byte  0x41,0x5e                                       // pop           %r14
+  .byte  0x41,0x5f                                       // pop           %r15
+  .byte  0xc5,0xf8,0x77                                  // vzeroupper
+  .byte  0xc3                                            // retq
 
 .globl _sk_just_return_hsw
 _sk_just_return_hsw:
@@ -1640,7 +1713,22 @@ _sk_linear_gradient_2stops_hsw:
 
 .globl _sk_start_pipeline_sse41
 _sk_start_pipeline_sse41:
-  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x41,0x57                                       // push          %r15
+  .byte  0x41,0x56                                       // push          %r14
+  .byte  0x41,0x55                                       // push          %r13
+  .byte  0x41,0x54                                       // push          %r12
+  .byte  0x53                                            // push          %rbx
+  .byte  0x49,0x89,0xcf                                  // mov           %rcx,%r15
+  .byte  0x49,0x89,0xd6                                  // mov           %rdx,%r14
+  .byte  0x48,0x89,0xfb                                  // mov           %rdi,%rbx
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x49,0x89,0xc4                                  // mov           %rax,%r12
+  .byte  0x49,0x89,0xf5                                  // mov           %rsi,%r13
+  .byte  0x48,0x8d,0x43,0x04                             // lea           0x4(%rbx),%rax
+  .byte  0x4c,0x39,0xf8                                  // cmp           %r15,%rax
+  .byte  0x76,0x05                                       // jbe           28 <_sk_start_pipeline_sse41+0x28>
+  .byte  0x48,0x89,0xd8                                  // mov           %rbx,%rax
+  .byte  0xeb,0x34                                       // jmp           5c <_sk_start_pipeline_sse41+0x5c>
   .byte  0x0f,0x57,0xc0                                  // xorps         %xmm0,%xmm0
   .byte  0x0f,0x57,0xc9                                  // xorps         %xmm1,%xmm1
   .byte  0x0f,0x57,0xd2                                  // xorps         %xmm2,%xmm2
@@ -1649,7 +1737,21 @@ _sk_start_pipeline_sse41:
   .byte  0x0f,0x57,0xed                                  // xorps         %xmm5,%xmm5
   .byte  0x0f,0x57,0xf6                                  // xorps         %xmm6,%xmm6
   .byte  0x0f,0x57,0xff                                  // xorps         %xmm7,%xmm7
-  .byte  0xff,0xe0                                       // jmpq          *%rax
+  .byte  0x48,0x89,0xdf                                  // mov           %rbx,%rdi
+  .byte  0x4c,0x89,0xee                                  // mov           %r13,%rsi
+  .byte  0x4c,0x89,0xf2                                  // mov           %r14,%rdx
+  .byte  0x41,0xff,0xd4                                  // callq         *%r12
+  .byte  0x48,0x8d,0x43,0x04                             // lea           0x4(%rbx),%rax
+  .byte  0x48,0x83,0xc3,0x08                             // add           $0x8,%rbx
+  .byte  0x4c,0x39,0xfb                                  // cmp           %r15,%rbx
+  .byte  0x48,0x89,0xc3                                  // mov           %rax,%rbx
+  .byte  0x76,0xcc                                       // jbe           28 <_sk_start_pipeline_sse41+0x28>
+  .byte  0x5b                                            // pop           %rbx
+  .byte  0x41,0x5c                                       // pop           %r12
+  .byte  0x41,0x5d                                       // pop           %r13
+  .byte  0x41,0x5e                                       // pop           %r14
+  .byte  0x41,0x5f                                       // pop           %r15
+  .byte  0xc3                                            // retq
 
 .globl _sk_just_return_sse41
 _sk_just_return_sse41:
@@ -2312,7 +2414,22 @@ _sk_linear_gradient_2stops_sse41:
 
 .globl _sk_start_pipeline_sse2
 _sk_start_pipeline_sse2:
-  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x41,0x57                                       // push          %r15
+  .byte  0x41,0x56                                       // push          %r14
+  .byte  0x41,0x55                                       // push          %r13
+  .byte  0x41,0x54                                       // push          %r12
+  .byte  0x53                                            // push          %rbx
+  .byte  0x49,0x89,0xcf                                  // mov           %rcx,%r15
+  .byte  0x49,0x89,0xd6                                  // mov           %rdx,%r14
+  .byte  0x48,0x89,0xfb                                  // mov           %rdi,%rbx
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x49,0x89,0xc4                                  // mov           %rax,%r12
+  .byte  0x49,0x89,0xf5                                  // mov           %rsi,%r13
+  .byte  0x48,0x8d,0x43,0x04                             // lea           0x4(%rbx),%rax
+  .byte  0x4c,0x39,0xf8                                  // cmp           %r15,%rax
+  .byte  0x76,0x05                                       // jbe           28 <_sk_start_pipeline_sse2+0x28>
+  .byte  0x48,0x89,0xd8                                  // mov           %rbx,%rax
+  .byte  0xeb,0x34                                       // jmp           5c <_sk_start_pipeline_sse2+0x5c>
   .byte  0x0f,0x57,0xc0                                  // xorps         %xmm0,%xmm0
   .byte  0x0f,0x57,0xc9                                  // xorps         %xmm1,%xmm1
   .byte  0x0f,0x57,0xd2                                  // xorps         %xmm2,%xmm2
@@ -2321,7 +2438,21 @@ _sk_start_pipeline_sse2:
   .byte  0x0f,0x57,0xed                                  // xorps         %xmm5,%xmm5
   .byte  0x0f,0x57,0xf6                                  // xorps         %xmm6,%xmm6
   .byte  0x0f,0x57,0xff                                  // xorps         %xmm7,%xmm7
-  .byte  0xff,0xe0                                       // jmpq          *%rax
+  .byte  0x48,0x89,0xdf                                  // mov           %rbx,%rdi
+  .byte  0x4c,0x89,0xee                                  // mov           %r13,%rsi
+  .byte  0x4c,0x89,0xf2                                  // mov           %r14,%rdx
+  .byte  0x41,0xff,0xd4                                  // callq         *%r12
+  .byte  0x48,0x8d,0x43,0x04                             // lea           0x4(%rbx),%rax
+  .byte  0x48,0x83,0xc3,0x08                             // add           $0x8,%rbx
+  .byte  0x4c,0x39,0xfb                                  // cmp           %r15,%rbx
+  .byte  0x48,0x89,0xc3                                  // mov           %rax,%rbx
+  .byte  0x76,0xcc                                       // jbe           28 <_sk_start_pipeline_sse2+0x28>
+  .byte  0x5b                                            // pop           %rbx
+  .byte  0x41,0x5c                                       // pop           %r12
+  .byte  0x41,0x5d                                       // pop           %r13
+  .byte  0x41,0x5e                                       // pop           %r14
+  .byte  0x41,0x5f                                       // pop           %r15
+  .byte  0xc3                                            // retq
 
 .globl _sk_just_return_sse2
 _sk_just_return_sse2:
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index d6aaa2b7ea..27b8d584aa 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -10,9 +10,14 @@ _text SEGMENT
 
 PUBLIC _sk_start_pipeline_hsw
 _sk_start_pipeline_hsw LABEL PROC
+  DB  65,87                                           ; push          %r15
+  DB  65,86                                           ; push          %r14
+  DB  65,85                                           ; push          %r13
+  DB  65,84                                           ; push          %r12
   DB  86                                              ; push          %rsi
   DB  87                                              ; push          %rdi
-  DB  72,129,236,168,0,0,0                            ; sub           $0xa8,%rsp
+  DB  83                                              ; push          %rbx
+  DB  72,129,236,160,0,0,0                            ; sub           $0xa0,%rsp
   DB  197,120,41,188,36,144,0,0,0                     ; vmovaps       %xmm15,0x90(%rsp)
   DB  197,120,41,180,36,128,0,0,0                     ; vmovaps       %xmm14,0x80(%rsp)
   DB  197,120,41,108,36,112                           ; vmovaps       %xmm13,0x70(%rsp)
@@ -23,8 +28,18 @@ _sk_start_pipeline_hsw LABEL PROC
   DB  197,120,41,68,36,32                             ; vmovaps       %xmm8,0x20(%rsp)
   DB  197,248,41,124,36,16                            ; vmovaps       %xmm7,0x10(%rsp)
   DB  197,248,41,52,36                                ; vmovaps       %xmm6,(%rsp)
+  DB  77,137,207                                      ; mov           %r9,%r15
+  DB  77,137,198                                      ; mov           %r8,%r14
+  DB  72,137,203                                      ; mov           %rcx,%rbx
   DB  72,137,214                                      ; mov           %rdx,%rsi
   DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  73,137,196                                      ; mov           %rax,%r12
+  DB  73,137,245                                      ; mov           %rsi,%r13
+  DB  72,141,67,8                                     ; lea           0x8(%rbx),%rax
+  DB  76,57,248                                       ; cmp           %r15,%rax
+  DB  118,5                                           ; jbe           75 <_sk_start_pipeline_hsw+0x75>
+  DB  72,137,216                                      ; mov           %rbx,%rax
+  DB  235,60                                          ; jmp           b1 <_sk_start_pipeline_hsw+0xb1>
   DB  197,252,87,192                                  ; vxorps        %ymm0,%ymm0,%ymm0
   DB  197,244,87,201                                  ; vxorps        %ymm1,%ymm1,%ymm1
   DB  197,236,87,210                                  ; vxorps        %ymm2,%ymm2,%ymm2
@@ -33,9 +48,15 @@ _sk_start_pipeline_hsw LABEL PROC
   DB  197,212,87,237                                  ; vxorps        %ymm5,%ymm5,%ymm5
   DB  197,204,87,246                                  ; vxorps        %ymm6,%ymm6,%ymm6
   DB  197,196,87,255                                  ; vxorps        %ymm7,%ymm7,%ymm7
-  DB  72,137,207                                      ; mov           %rcx,%rdi
-  DB  76,137,194                                      ; mov           %r8,%rdx
-  DB  255,208                                         ; callq         *%rax
+  DB  72,137,223                                      ; mov           %rbx,%rdi
+  DB  76,137,238                                      ; mov           %r13,%rsi
+  DB  76,137,242                                      ; mov           %r14,%rdx
+  DB  65,255,212                                      ; callq         *%r12
+  DB  72,141,67,8                                     ; lea           0x8(%rbx),%rax
+  DB  72,131,195,16                                   ; add           $0x10,%rbx
+  DB  76,57,251                                       ; cmp           %r15,%rbx
+  DB  72,137,195                                      ; mov           %rax,%rbx
+  DB  118,196                                         ; jbe           75 <_sk_start_pipeline_hsw+0x75>
   DB  197,248,40,52,36                                ; vmovaps       (%rsp),%xmm6
   DB  197,248,40,124,36,16                            ; vmovaps       0x10(%rsp),%xmm7
   DB  197,120,40,68,36,32                             ; vmovaps       0x20(%rsp),%xmm8
@@ -46,15 +67,19 @@ _sk_start_pipeline_hsw LABEL PROC
   DB  197,120,40,108,36,112                           ; vmovaps       0x70(%rsp),%xmm13
   DB  197,120,40,180,36,128,0,0,0                     ; vmovaps       0x80(%rsp),%xmm14
   DB  197,120,40,188,36,144,0,0,0                     ; vmovaps       0x90(%rsp),%xmm15
-  DB  72,129,196,168,0,0,0                            ; add           $0xa8,%rsp
+  DB  72,129,196,160,0,0,0                            ; add           $0xa0,%rsp
+  DB  91                                              ; pop           %rbx
   DB  95                                              ; pop           %rdi
   DB  94                                              ; pop           %rsi
+  DB  65,92                                           ; pop           %r12
+  DB  65,93                                           ; pop           %r13
+  DB  65,94                                           ; pop           %r14
+  DB  65,95                                           ; pop           %r15
   DB  197,248,119                                     ; vzeroupper
   DB  195                                             ; retq
 
 PUBLIC _sk_just_return_hsw
 _sk_just_return_hsw LABEL PROC
-  DB  197,248,119                                     ; vzeroupper
   DB  195                                             ; retq
 
 PUBLIC _sk_seed_shader_hsw
@@ -529,9 +554,14 @@ _sk_linear_gradient_2stops_hsw LABEL PROC
 
 PUBLIC _sk_start_pipeline_sse41
 _sk_start_pipeline_sse41 LABEL PROC
+  DB  65,87                                           ; push          %r15
+  DB  65,86                                           ; push          %r14
+  DB  65,85                                           ; push          %r13
+  DB  65,84                                           ; push          %r12
   DB  86                                              ; push          %rsi
   DB  87                                              ; push          %rdi
-  DB  72,129,236,168,0,0,0                            ; sub           $0xa8,%rsp
+  DB  83                                              ; push          %rbx
+  DB  72,129,236,160,0,0,0                            ; sub           $0xa0,%rsp
   DB  68,15,41,188,36,144,0,0,0                       ; movaps        %xmm15,0x90(%rsp)
   DB  68,15,41,180,36,128,0,0,0                       ; movaps        %xmm14,0x80(%rsp)
   DB  68,15,41,108,36,112                             ; movaps        %xmm13,0x70(%rsp)
@@ -542,8 +572,18 @@ _sk_start_pipeline_sse41 LABEL PROC
   DB  68,15,41,68,36,32                               ; movaps        %xmm8,0x20(%rsp)
   DB  15,41,124,36,16                                 ; movaps        %xmm7,0x10(%rsp)
   DB  15,41,52,36                                     ; movaps        %xmm6,(%rsp)
+  DB  77,137,207                                      ; mov           %r9,%r15
+  DB  77,137,198                                      ; mov           %r8,%r14
+  DB  72,137,203                                      ; mov           %rcx,%rbx
   DB  72,137,214                                      ; mov           %rdx,%rsi
   DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  73,137,196                                      ; mov           %rax,%r12
+  DB  73,137,245                                      ; mov           %rsi,%r13
+  DB  72,141,67,4                                     ; lea           0x4(%rbx),%rax
+  DB  76,57,248                                       ; cmp           %r15,%rax
+  DB  118,5                                           ; jbe           73 <_sk_start_pipeline_sse41+0x73>
+  DB  72,137,216                                      ; mov           %rbx,%rax
+  DB  235,52                                          ; jmp           a7 <_sk_start_pipeline_sse41+0xa7>
   DB  15,87,192                                       ; xorps         %xmm0,%xmm0
   DB  15,87,201                                       ; xorps         %xmm1,%xmm1
   DB  15,87,210                                       ; xorps         %xmm2,%xmm2
@@ -552,9 +592,15 @@ _sk_start_pipeline_sse41 LABEL PROC
   DB  15,87,237                                       ; xorps         %xmm5,%xmm5
   DB  15,87,246                                       ; xorps         %xmm6,%xmm6
   DB  15,87,255                                       ; xorps         %xmm7,%xmm7
-  DB  72,137,207                                      ; mov           %rcx,%rdi
-  DB  76,137,194                                      ; mov           %r8,%rdx
-  DB  255,208                                         ; callq         *%rax
+  DB  72,137,223                                      ; mov           %rbx,%rdi
+  DB  76,137,238                                      ; mov           %r13,%rsi
+  DB  76,137,242                                      ; mov           %r14,%rdx
+  DB  65,255,212                                      ; callq         *%r12
+  DB  72,141,67,4                                     ; lea           0x4(%rbx),%rax
+  DB  72,131,195,8                                    ; add           $0x8,%rbx
+  DB  76,57,251                                       ; cmp           %r15,%rbx
+  DB  72,137,195                                      ; mov           %rax,%rbx
+  DB  118,204                                         ; jbe           73 <_sk_start_pipeline_sse41+0x73>
   DB  15,40,52,36                                     ; movaps        (%rsp),%xmm6
   DB  15,40,124,36,16                                 ; movaps        0x10(%rsp),%xmm7
   DB  68,15,40,68,36,32                               ; movaps        0x20(%rsp),%xmm8
@@ -565,9 +611,14 @@ _sk_start_pipeline_sse41 LABEL PROC
   DB  68,15,40,108,36,112                             ; movaps        0x70(%rsp),%xmm13
   DB  68,15,40,180,36,128,0,0,0                       ; movaps        0x80(%rsp),%xmm14
   DB  68,15,40,188,36,144,0,0,0                       ; movaps        0x90(%rsp),%xmm15
-  DB  72,129,196,168,0,0,0                            ; add           $0xa8,%rsp
+  DB  72,129,196,160,0,0,0                            ; add           $0xa0,%rsp
+  DB  91                                              ; pop           %rbx
   DB  95                                              ; pop           %rdi
   DB  94                                              ; pop           %rsi
+  DB  65,92                                           ; pop           %r12
+  DB  65,93                                           ; pop           %r13
+  DB  65,94                                           ; pop           %r14
+  DB  65,95                                           ; pop           %r15
   DB  195                                             ; retq
 
 PUBLIC _sk_just_return_sse41
@@ -1231,9 +1282,14 @@ _sk_linear_gradient_2stops_sse41 LABEL PROC
 
 PUBLIC _sk_start_pipeline_sse2
 _sk_start_pipeline_sse2 LABEL PROC
+  DB  65,87                                           ; push          %r15
+  DB  65,86                                           ; push          %r14
+  DB  65,85                                           ; push          %r13
+  DB  65,84                                           ; push          %r12
   DB  86                                              ; push          %rsi
   DB  87                                              ; push          %rdi
-  DB  72,129,236,168,0,0,0                            ; sub           $0xa8,%rsp
+  DB  83                                              ; push          %rbx
+  DB  72,129,236,160,0,0,0                            ; sub           $0xa0,%rsp
   DB  68,15,41,188,36,144,0,0,0                       ; movaps        %xmm15,0x90(%rsp)
   DB  68,15,41,180,36,128,0,0,0                       ; movaps        %xmm14,0x80(%rsp)
   DB  68,15,41,108,36,112                             ; movaps        %xmm13,0x70(%rsp)
@@ -1244,8 +1300,18 @@ _sk_start_pipeline_sse2 LABEL PROC
   DB  68,15,41,68,36,32                               ; movaps        %xmm8,0x20(%rsp)
   DB  15,41,124,36,16                                 ; movaps        %xmm7,0x10(%rsp)
   DB  15,41,52,36                                     ; movaps        %xmm6,(%rsp)
+  DB  77,137,207                                      ; mov           %r9,%r15
+  DB  77,137,198                                      ; mov           %r8,%r14
+  DB  72,137,203                                      ; mov           %rcx,%rbx
   DB  72,137,214                                      ; mov           %rdx,%rsi
   DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  73,137,196                                      ; mov           %rax,%r12
+  DB  73,137,245                                      ; mov           %rsi,%r13
+  DB  72,141,67,4                                     ; lea           0x4(%rbx),%rax
+  DB  76,57,248                                       ; cmp           %r15,%rax
+  DB  118,5                                           ; jbe           73 <_sk_start_pipeline_sse2+0x73>
+  DB  72,137,216                                      ; mov           %rbx,%rax
+  DB  235,52                                          ; jmp           a7 <_sk_start_pipeline_sse2+0xa7>
   DB  15,87,192                                       ; xorps         %xmm0,%xmm0
   DB  15,87,201                                       ; xorps         %xmm1,%xmm1
   DB  15,87,210                                       ; xorps         %xmm2,%xmm2
@@ -1254,9 +1320,15 @@ _sk_start_pipeline_sse2 LABEL PROC
   DB  15,87,237                                       ; xorps         %xmm5,%xmm5
   DB  15,87,246                                       ; xorps         %xmm6,%xmm6
   DB  15,87,255                                       ; xorps         %xmm7,%xmm7
-  DB  72,137,207                                      ; mov           %rcx,%rdi
-  DB  76,137,194                                      ; mov           %r8,%rdx
-  DB  255,208                                         ; callq         *%rax
+  DB  72,137,223                                      ; mov           %rbx,%rdi
+  DB  76,137,238                                      ; mov           %r13,%rsi
+  DB  76,137,242                                      ; mov           %r14,%rdx
+  DB  65,255,212                                      ; callq         *%r12
+  DB  72,141,67,4                                     ; lea           0x4(%rbx),%rax
+  DB  72,131,195,8                                    ; add           $0x8,%rbx
+  DB  76,57,251                                       ; cmp           %r15,%rbx
+  DB  72,137,195                                      ; mov           %rax,%rbx
+  DB  118,204                                         ; jbe           73 <_sk_start_pipeline_sse2+0x73>
   DB  15,40,52,36                                     ; movaps        (%rsp),%xmm6
   DB  15,40,124,36,16                                 ; movaps        0x10(%rsp),%xmm7
   DB  68,15,40,68,36,32                               ; movaps        0x20(%rsp),%xmm8
@@ -1267,9 +1339,14 @@ _sk_start_pipeline_sse2 LABEL PROC
   DB  68,15,40,108,36,112                             ; movaps        0x70(%rsp),%xmm13
   DB  68,15,40,180,36,128,0,0,0                       ; movaps        0x80(%rsp),%xmm14
   DB  68,15,40,188,36,144,0,0,0                       ; movaps        0x90(%rsp),%xmm15
-  DB  72,129,196,168,0,0,0                            ; add           $0xa8,%rsp
+  DB  72,129,196,160,0,0,0                            ; add           $0xa0,%rsp
+  DB  91                                              ; pop           %rbx
   DB  95                                              ; pop           %rdi
   DB  94                                              ; pop           %rsi
+  DB  65,92                                           ; pop           %r12
+  DB  65,93                                           ; pop           %r13
+  DB  65,94                                           ; pop           %r14
+  DB  65,95                                           ; pop           %r15
   DB  195                                             ; retq
 
 PUBLIC _sk_just_return_sse2
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index e47137fd0d..6a3be91467 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -219,18 +219,19 @@ static void* load_and_inc(void**& program) {
 #if defined(JUMPER) && defined(WIN)
 __attribute__((ms_abi))
 #endif
-extern "C" void WRAP(start_pipeline)(size_t x, void** program, K* k) {
-    auto next = (Stage*)load_and_inc(program);
+extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {  // runs the pipeline from x in whole strides while x + stride <= limit; returns the x it stopped at
     F v{};   // TODO: faster uninitialized?
-    next(x,program,k, v,v,v,v, v,v,v,v);
+    size_t stride = sizeof(F) / sizeof(float);   // amount x advances per call; presumably the float lane count of F (confirm F is a float vector)
+    auto start = (Stage*)load_and_inc(program);  // first stage is fetched once, outside the loop, and reused for every iteration
+    while (x + stride <= limit) {                // whole strides only: a remainder shorter than stride is not run here
+        start(x,program,k, v,v,v,v, v,v,v,v);    // every iteration passes the same value-initialized v for all eight F arguments
+        x += stride;
+    }
+    return x;                                    // first x not covered by a full stride (equals the incoming x if no iteration ran)
 }
 
 // Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller).
-extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {
-#if defined(JUMPER) && defined(__AVX2__)
-    asm("vzeroupper");
-#endif
-}
+extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {}  // intentionally empty. NOTE(review): the explicit AVX2 vzeroupper was removed from this stage; confirm the upper lanes are still cleared before control leaves start_pipeline
 
 // We can now define Stages!
author	Mike Klein <mtklein@chromium.org>	2017-02-21 16:50:52 -0500
committer	Skia Commit-Bot <skia-commit-bot@chromium.org>	2017-02-21 22:52:13 +0000
commit	9ef63754a7262f57097b39318adf2f7789d23ecf (patch)
tree	e67f511f5e301564da500a00588793b9f04ec262 /src
parent	c663953504e7cd6ad0e673e926203a3a38478d05 (diff)