-rw-r--r--  src/core/SkRasterPipeline.h            6
-rw-r--r--  src/jumper/SkJumper.cpp               74
-rw-r--r--  src/jumper/SkJumper_generated.S      261
-rw-r--r--  src/jumper/SkJumper_generated_win.S  269
-rw-r--r--  src/jumper/SkJumper_stages_lowp.cpp  180
-rwxr-xr-x  src/jumper/build_stages.py            12
-rw-r--r--  tests/SkRasterPipelineTest.cpp        28

7 files changed, 809 insertions, 21 deletions
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 353039171f..bd54e3f4df 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -16,7 +16,7 @@
#include <functional>
#include <vector>
-struct SkJumper_Engine;
+struct SkJumper_constants;
/**
* SkRasterPipeline provides a cheap way to chain together a pixel processing pipeline.
@@ -149,13 +149,15 @@ public:
bool empty() const { return fStages == nullptr; }
private:
+ using StartPipelineFn = void(size_t,size_t,size_t,void**,const SkJumper_constants*);
+
struct StageList {
StageList* prev;
StockStage stage;
void* ctx;
};
- void build_pipeline(const SkJumper_Engine&, void**) const;
+ StartPipelineFn* build_pipeline(void**) const;
void unchecked_append(StockStage, void*);
SkArenaAlloc* fAlloc;
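With this change, build_pipeline() fills the caller-supplied void** array from back to front and returns the start_pipeline entry point matched to the stages it emitted. As a sketch (stage names here are illustrative, not from this patch), a two-stage load/store pipeline ends up laid out as:

    // program, read front to back after build_pipeline():
    //   [ &load_8888_fn, load_ctx, &store_8888_fn, store_ctx, &just_return_fn ]
    //
    // run() then does, in effect:
    //   StartPipelineFn* start = this->build_pipeline(program + fSlotsNeeded);
    //   start(x, y, x+n, program, &kConstants);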
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index c5840d3522..77a5856027 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -29,6 +29,7 @@ static K kConstants = {
// We can't express the real types of most stage functions portably, so we use a stand-in.
// We'll only ever call start_pipeline(), which then chains into the rest for us.
using StageFn = void(void);
+using StartPipelineFn = void(size_t,size_t,size_t,void**,K*);
// Some platforms expect C "name" maps to asm "_name", others to "name".
#if defined(__APPLE__)
@@ -43,29 +44,31 @@ extern "C" {
// We'll just run portable code.
#elif defined(__aarch64__)
- void ASM(start_pipeline,aarch64)(size_t,size_t,size_t, void**, K*);
+ StartPipelineFn ASM(start_pipeline,aarch64);
StageFn ASM(just_return,aarch64);
#define M(st) StageFn ASM(st,aarch64);
SK_RASTER_PIPELINE_STAGES(M)
#undef M
#elif defined(__arm__)
- void ASM(start_pipeline,vfp4)(size_t,size_t,size_t, void**, K*);
+ StartPipelineFn ASM(start_pipeline,vfp4);
StageFn ASM(just_return,vfp4);
#define M(st) StageFn ASM(st,vfp4);
SK_RASTER_PIPELINE_STAGES(M)
#undef M
#elif defined(__x86_64__) || defined(_M_X64)
- void ASM(start_pipeline,hsw )(size_t,size_t,size_t, void**, K*);
- void ASM(start_pipeline,avx )(size_t,size_t,size_t, void**, K*);
- void ASM(start_pipeline,sse41)(size_t,size_t,size_t, void**, K*);
- void ASM(start_pipeline,sse2 )(size_t,size_t,size_t, void**, K*);
+ StartPipelineFn ASM(start_pipeline,hsw ),
+ ASM(start_pipeline,avx ),
+ ASM(start_pipeline,sse41 ),
+ ASM(start_pipeline,sse2 ),
+ ASM(start_pipeline,ssse3_lowp);
StageFn ASM(just_return,hsw),
ASM(just_return,avx),
ASM(just_return,sse41),
- ASM(just_return,sse2);
+ ASM(just_return,sse2),
+ ASM(just_return,ssse3_lowp);
#define M(st) StageFn ASM(st,hsw);
SK_RASTER_PIPELINE_STAGES(M)
@@ -79,10 +82,14 @@ extern "C" {
#define M(st) StageFn ASM(st,sse2);
SK_RASTER_PIPELINE_STAGES(M)
#undef M
+
+ StageFn ASM(load_8888, ssse3_lowp),
+ ASM(store_8888, ssse3_lowp),
+ ASM(swap_rb, ssse3_lowp);
#endif
// Portable, single-pixel stages.
- void sk_start_pipeline(size_t,size_t,size_t, void**, K*);
+ StartPipelineFn sk_start_pipeline;
StageFn sk_just_return;
#define M(st) StageFn sk_##st;
SK_RASTER_PIPELINE_STAGES(M)
@@ -170,43 +177,72 @@ static SkJumper_Engine choose_engine() {
return kPortable;
}
-void SkRasterPipeline::build_pipeline(const SkJumper_Engine& engine, void** ip) const {
- const StageList* st = fStages;
+StartPipelineFn* SkRasterPipeline::build_pipeline(void** ip) const {
+#if defined(__x86_64__) || defined(_M_X64)
+ if (SkCpu::Supports(SkCpu::SSSE3)) {
+ void** reset_point = ip;
+
+ *--ip = (void*)ASM(just_return,ssse3_lowp);
+ for (const StageList* st = fStages; st; st = st->prev) {
+ StageFn* fn = nullptr;
+ switch (st->stage) {
+ case SkRasterPipeline::load_8888: fn = ASM(load_8888, ssse3_lowp); break;
+ case SkRasterPipeline::store_8888: fn = ASM(store_8888,ssse3_lowp); break;
+ case SkRasterPipeline::swap_rb: fn = ASM(swap_rb, ssse3_lowp); break;
+ default:
+ //SkDebugf("can't %d\n", st->stage);
+ ip = reset_point;
+ }
+ if (ip == reset_point) {
+ break;
+ }
+ if (st->ctx) {
+ *--ip = st->ctx;
+ }
+ *--ip = (void*)fn;
+ }
+
+ if (ip != reset_point) {
+ return ASM(start_pipeline,ssse3_lowp);
+ }
+ }
+#endif
+ gChooseEngineOnce([]{ gEngine = choose_engine(); });
+
// We're building the pipeline backwards, so we start with the final stage just_return.
- *--ip = (void*)engine.just_return;
+ *--ip = (void*)gEngine.just_return;
// Still going backwards, each stage's context pointer then its StageFn.
- for (; st; st = st->prev) {
+ for (const StageList* st = fStages; st; st = st->prev) {
if (st->ctx) {
*--ip = st->ctx;
}
- *--ip = (void*)engine.stages[st->stage];
+ *--ip = (void*)gEngine.stages[st->stage];
}
+ return gEngine.start_pipeline;
}
void SkRasterPipeline::run(size_t x, size_t y, size_t n) const {
if (this->empty()) {
return;
}
- gChooseEngineOnce([]{ gEngine = choose_engine(); });
// Best to not use fAlloc here... we can't bound how often run() will be called.
SkAutoSTMalloc<64, void*> program(fSlotsNeeded);
- this->build_pipeline(gEngine, program.get() + fSlotsNeeded);
- gEngine.start_pipeline(x,y,x+n, program.get(), &kConstants);
+ auto start_pipeline = this->build_pipeline(program.get() + fSlotsNeeded);
+ start_pipeline(x,y,x+n, program.get(), &kConstants);
}
std::function<void(size_t, size_t, size_t)> SkRasterPipeline::compile() const {
if (this->empty()) {
return [](size_t, size_t, size_t) {};
}
- gChooseEngineOnce([]{ gEngine = choose_engine(); });
void** program = fAlloc->makeArray<void*>(fSlotsNeeded);
- this->build_pipeline(gEngine, program + fSlotsNeeded);
+ auto start_pipeline = this->build_pipeline(program + fSlotsNeeded);
return [=](size_t x, size_t y, size_t n) {
- gEngine.start_pipeline(x,y,x+n, program, &kConstants);
+ start_pipeline(x,y,x+n, program, &kConstants);
};
}
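The lowp fast path in build_pipeline() above is a try-then-reset scheme: speculatively emit the 8-bit-per-channel program, and if any stage lacks a lowp implementation, rewind the program pointer and fall back to the float engine. Boiled down, with a hypothetical helper (lowp_lookup() does not exist in this patch; the switch plays its role):

    void** reset_point = ip;                // remember the untouched end of the program
    *--ip = (void*)lowp_just_return;        // speculatively emit the lowp terminator
    for (const StageList* st = fStages; st; st = st->prev) {
        if (StageFn* fn = lowp_lookup(st->stage)) {  // maps a stage to its lowp version
            if (st->ctx) { *--ip = st->ctx; }
            *--ip = (void*)fn;
        } else {
            ip = reset_point;               // unsupported stage: discard everything emitted
            break;
        }
    }
    if (ip != reset_point) {
        return lowp_start_pipeline;         // the whole pipeline fit in lowp
    }
    // ...otherwise fall through and rebuild the program with the float stages.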
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index 8379663481..84eb7a5e13 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -36884,4 +36884,265 @@ BALIGN16
.byte 170 // stos %al,%es:(%rdi)
.byte 170 // stos %al,%es:(%rdi)
.byte 190,171,170,170,190 // mov $0xbeaaaaab,%esi
+BALIGN32
+
+HIDDEN _sk_start_pipeline_ssse3_lowp
+.globl _sk_start_pipeline_ssse3_lowp
+FUNCTION(_sk_start_pipeline_ssse3_lowp)
+_sk_start_pipeline_ssse3_lowp:
+ .byte 85 // push %rbp
+ .byte 65,87 // push %r15
+ .byte 65,86 // push %r14
+ .byte 65,85 // push %r13
+ .byte 65,84 // push %r12
+ .byte 83 // push %rbx
+ .byte 80 // push %rax
+ .byte 77,137,198 // mov %r8,%r14
+ .byte 73,137,213 // mov %rdx,%r13
+ .byte 73,137,247 // mov %rsi,%r15
+ .byte 72,137,251 // mov %rdi,%rbx
+ .byte 72,137,206 // mov %rcx,%rsi
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 73,137,244 // mov %rsi,%r12
+ .byte 72,141,75,8 // lea 0x8(%rbx),%rcx
+ .byte 76,57,233 // cmp %r13,%rcx
+ .byte 118,5 // jbe 2d <_sk_start_pipeline_ssse3_lowp+0x2d>
+ .byte 72,137,218 // mov %rbx,%rdx
+ .byte 235,66 // jmp 6f <_sk_start_pipeline_ssse3_lowp+0x6f>
+ .byte 65,184,0,0,0,0 // mov $0x0,%r8d
+ .byte 15,87,192 // xorps %xmm0,%xmm0
+ .byte 15,87,201 // xorps %xmm1,%xmm1
+ .byte 15,87,210 // xorps %xmm2,%xmm2
+ .byte 15,87,219 // xorps %xmm3,%xmm3
+ .byte 15,87,228 // xorps %xmm4,%xmm4
+ .byte 15,87,237 // xorps %xmm5,%xmm5
+ .byte 15,87,246 // xorps %xmm6,%xmm6
+ .byte 15,87,255 // xorps %xmm7,%xmm7
+ .byte 76,137,247 // mov %r14,%rdi
+ .byte 76,137,230 // mov %r12,%rsi
+ .byte 72,137,218 // mov %rbx,%rdx
+ .byte 76,137,249 // mov %r15,%rcx
+ .byte 72,137,197 // mov %rax,%rbp
+ .byte 255,213 // callq *%rbp
+ .byte 72,137,232 // mov %rbp,%rax
+ .byte 72,141,83,8 // lea 0x8(%rbx),%rdx
+ .byte 72,131,195,16 // add $0x10,%rbx
+ .byte 76,57,235 // cmp %r13,%rbx
+ .byte 72,137,211 // mov %rdx,%rbx
+ .byte 118,190 // jbe 2d <_sk_start_pipeline_ssse3_lowp+0x2d>
+ .byte 73,41,213 // sub %rdx,%r13
+ .byte 116,52 // je a8 <_sk_start_pipeline_ssse3_lowp+0xa8>
+ .byte 15,87,192 // xorps %xmm0,%xmm0
+ .byte 15,87,201 // xorps %xmm1,%xmm1
+ .byte 15,87,210 // xorps %xmm2,%xmm2
+ .byte 15,87,219 // xorps %xmm3,%xmm3
+ .byte 15,87,228 // xorps %xmm4,%xmm4
+ .byte 15,87,237 // xorps %xmm5,%xmm5
+ .byte 15,87,246 // xorps %xmm6,%xmm6
+ .byte 15,87,255 // xorps %xmm7,%xmm7
+ .byte 76,137,247 // mov %r14,%rdi
+ .byte 76,137,230 // mov %r12,%rsi
+ .byte 76,137,249 // mov %r15,%rcx
+ .byte 77,137,232 // mov %r13,%r8
+ .byte 72,131,196,8 // add $0x8,%rsp
+ .byte 91 // pop %rbx
+ .byte 65,92 // pop %r12
+ .byte 65,93 // pop %r13
+ .byte 65,94 // pop %r14
+ .byte 65,95 // pop %r15
+ .byte 93 // pop %rbp
+ .byte 255,224 // jmpq *%rax
+ .byte 72,131,196,8 // add $0x8,%rsp
+ .byte 91 // pop %rbx
+ .byte 65,92 // pop %r12
+ .byte 65,93 // pop %r13
+ .byte 65,94 // pop %r14
+ .byte 65,95 // pop %r15
+ .byte 93 // pop %rbp
+ .byte 195 // retq
+
+HIDDEN _sk_just_return_ssse3_lowp
+.globl _sk_just_return_ssse3_lowp
+FUNCTION(_sk_just_return_ssse3_lowp)
+_sk_just_return_ssse3_lowp:
+ .byte 195 // retq
+
+HIDDEN _sk_load_8888_ssse3_lowp
+.globl _sk_load_8888_ssse3_lowp
+FUNCTION(_sk_load_8888_ssse3_lowp)
+_sk_load_8888_ssse3_lowp:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 76,139,24 // mov (%rax),%r11
+ .byte 77,133,192 // test %r8,%r8
+ .byte 117,120 // jne 13a <_sk_load_8888_ssse3_lowp+0x82>
+ .byte 65,15,16,76,147,16 // movups 0x10(%r11,%rdx,4),%xmm1
+ .byte 65,15,16,4,147 // movups (%r11,%rdx,4),%xmm0
+ .byte 15,40,208 // movaps %xmm0,%xmm2
+ .byte 102,15,96,209 // punpcklbw %xmm1,%xmm2
+ .byte 102,15,104,193 // punpckhbw %xmm1,%xmm0
+ .byte 102,68,15,111,194 // movdqa %xmm2,%xmm8
+ .byte 102,68,15,96,192 // punpcklbw %xmm0,%xmm8
+ .byte 102,15,104,208 // punpckhbw %xmm0,%xmm2
+ .byte 102,69,15,111,200 // movdqa %xmm8,%xmm9
+ .byte 102,68,15,96,202 // punpcklbw %xmm2,%xmm9
+ .byte 102,68,15,104,194 // punpckhbw %xmm2,%xmm8
+ .byte 102,15,239,219 // pxor %xmm3,%xmm3
+ .byte 102,15,239,192 // pxor %xmm0,%xmm0
+ .byte 102,65,15,96,193 // punpcklbw %xmm9,%xmm0
+ .byte 102,15,239,201 // pxor %xmm1,%xmm1
+ .byte 102,65,15,104,201 // punpckhbw %xmm9,%xmm1
+ .byte 102,15,239,210 // pxor %xmm2,%xmm2
+ .byte 102,65,15,96,208 // punpcklbw %xmm8,%xmm2
+ .byte 102,65,15,104,216 // punpckhbw %xmm8,%xmm3
+ .byte 102,68,15,111,5,238,1,0,0 // movdqa 0x1ee(%rip),%xmm8 # 310 <_sk_swap_rb_ssse3_lowp+0x1c>
+ .byte 102,65,15,228,192 // pmulhuw %xmm8,%xmm0
+ .byte 102,65,15,228,200 // pmulhuw %xmm8,%xmm1
+ .byte 102,65,15,228,208 // pmulhuw %xmm8,%xmm2
+ .byte 102,65,15,228,216 // pmulhuw %xmm8,%xmm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 69,137,193 // mov %r8d,%r9d
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,201 // pxor %xmm1,%xmm1
+ .byte 102,15,239,192 // pxor %xmm0,%xmm0
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 15,135,119,255,255,255 // ja cd <_sk_load_8888_ssse3_lowp+0x15>
+ .byte 69,15,182,201 // movzbl %r9b,%r9d
+ .byte 76,141,21,119,0,0,0 // lea 0x77(%rip),%r10 # 1d8 <_sk_load_8888_ssse3_lowp+0x120>
+ .byte 75,99,4,138 // movslq (%r10,%r9,4),%rax
+ .byte 76,1,208 // add %r10,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 102,65,15,110,68,147,24 // movd 0x18(%r11,%rdx,4),%xmm0
+ .byte 102,15,112,200,69 // pshufd $0x45,%xmm0,%xmm1
+ .byte 102,15,239,192 // pxor %xmm0,%xmm0
+ .byte 243,65,15,16,84,147,20 // movss 0x14(%r11,%rdx,4),%xmm2
+ .byte 15,198,209,0 // shufps $0x0,%xmm1,%xmm2
+ .byte 15,198,209,226 // shufps $0xe2,%xmm1,%xmm2
+ .byte 15,40,202 // movaps %xmm2,%xmm1
+ .byte 243,65,15,16,84,147,16 // movss 0x10(%r11,%rdx,4),%xmm2
+ .byte 243,15,16,202 // movss %xmm2,%xmm1
+ .byte 243,65,15,16,84,147,12 // movss 0xc(%r11,%rdx,4),%xmm2
+ .byte 15,198,208,32 // shufps $0x20,%xmm0,%xmm2
+ .byte 15,198,194,36 // shufps $0x24,%xmm2,%xmm0
+ .byte 243,65,15,16,84,147,8 // movss 0x8(%r11,%rdx,4),%xmm2
+ .byte 15,198,208,48 // shufps $0x30,%xmm0,%xmm2
+ .byte 15,198,194,132 // shufps $0x84,%xmm2,%xmm0
+ .byte 243,65,15,16,84,147,4 // movss 0x4(%r11,%rdx,4),%xmm2
+ .byte 15,198,208,0 // shufps $0x0,%xmm0,%xmm2
+ .byte 15,198,208,226 // shufps $0xe2,%xmm0,%xmm2
+ .byte 15,40,194 // movaps %xmm2,%xmm0
+ .byte 243,65,15,16,20,147 // movss (%r11,%rdx,4),%xmm2
+ .byte 243,15,16,194 // movss %xmm2,%xmm0
+ .byte 233,247,254,255,255 // jmpq cd <_sk_load_8888_ssse3_lowp+0x15>
+ .byte 102,144 // xchg %ax,%ax
+ .byte 239 // out %eax,(%dx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 221,255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,206 // dec %esi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 191,255,255,255,180 // mov $0xb4ffffff,%edi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,162,255,255,255,146 // jmpq *-0x6d000001(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
+
+HIDDEN _sk_store_8888_ssse3_lowp
+.globl _sk_store_8888_ssse3_lowp
+FUNCTION(_sk_store_8888_ssse3_lowp)
+_sk_store_8888_ssse3_lowp:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 76,139,24 // mov (%rax),%r11
+ .byte 102,68,15,111,208 // movdqa %xmm0,%xmm10
+ .byte 102,65,15,113,210,7 // psrlw $0x7,%xmm10
+ .byte 102,68,15,111,194 // movdqa %xmm2,%xmm8
+ .byte 102,65,15,113,208,7 // psrlw $0x7,%xmm8
+ .byte 102,69,15,103,208 // packuswb %xmm8,%xmm10
+ .byte 102,68,15,111,193 // movdqa %xmm1,%xmm8
+ .byte 102,65,15,113,208,7 // psrlw $0x7,%xmm8
+ .byte 102,68,15,111,203 // movdqa %xmm3,%xmm9
+ .byte 102,65,15,113,209,7 // psrlw $0x7,%xmm9
+ .byte 102,69,15,103,193 // packuswb %xmm9,%xmm8
+ .byte 102,69,15,111,202 // movdqa %xmm10,%xmm9
+ .byte 102,69,15,96,200 // punpcklbw %xmm8,%xmm9
+ .byte 102,69,15,104,208 // punpckhbw %xmm8,%xmm10
+ .byte 102,69,15,111,193 // movdqa %xmm9,%xmm8
+ .byte 102,69,15,97,194 // punpcklwd %xmm10,%xmm8
+ .byte 102,69,15,105,202 // punpckhwd %xmm10,%xmm9
+ .byte 77,133,192 // test %r8,%r8
+ .byte 117,17 // jne 263 <_sk_store_8888_ssse3_lowp+0x6f>
+ .byte 243,69,15,127,76,147,16 // movdqu %xmm9,0x10(%r11,%rdx,4)
+ .byte 243,69,15,127,4,147 // movdqu %xmm8,(%r11,%rdx,4)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 69,137,193 // mov %r8d,%r9d
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,236 // ja 25f <_sk_store_8888_ssse3_lowp+0x6b>
+ .byte 69,15,182,201 // movzbl %r9b,%r9d
+ .byte 76,141,21,90,0,0,0 // lea 0x5a(%rip),%r10 # 2d8 <_sk_store_8888_ssse3_lowp+0xe4>
+ .byte 75,99,4,138 // movslq (%r10,%r9,4),%rax
+ .byte 76,1,208 // add %r10,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 102,69,15,112,209,78 // pshufd $0x4e,%xmm9,%xmm10
+ .byte 102,69,15,126,84,147,24 // movd %xmm10,0x18(%r11,%rdx,4)
+ .byte 102,69,15,112,209,229 // pshufd $0xe5,%xmm9,%xmm10
+ .byte 102,69,15,126,84,147,20 // movd %xmm10,0x14(%r11,%rdx,4)
+ .byte 102,69,15,126,76,147,16 // movd %xmm9,0x10(%r11,%rdx,4)
+ .byte 102,69,15,112,200,231 // pshufd $0xe7,%xmm8,%xmm9
+ .byte 102,69,15,126,76,147,12 // movd %xmm9,0xc(%r11,%rdx,4)
+ .byte 102,69,15,112,200,78 // pshufd $0x4e,%xmm8,%xmm9
+ .byte 102,69,15,126,76,147,8 // movd %xmm9,0x8(%r11,%rdx,4)
+ .byte 102,69,15,112,200,229 // pshufd $0xe5,%xmm8,%xmm9
+ .byte 102,69,15,126,76,147,4 // movd %xmm9,0x4(%r11,%rdx,4)
+ .byte 102,69,15,126,4,147 // movd %xmm8,(%r11,%rdx,4)
+ .byte 235,136 // jmp 25f <_sk_store_8888_ssse3_lowp+0x6b>
+ .byte 144 // nop
+ .byte 247,255 // idiv %edi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 234 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 221,255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,208 // callq *%rax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,201 // dec %ecx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 188,255,255,255,175 // mov $0xafffffff,%esp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
+
+HIDDEN _sk_swap_rb_ssse3_lowp
+.globl _sk_swap_rb_ssse3_lowp
+FUNCTION(_sk_swap_rb_ssse3_lowp)
+_sk_swap_rb_ssse3_lowp:
+ .byte 68,15,40,192 // movaps %xmm0,%xmm8
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 15,40,194 // movaps %xmm2,%xmm0
+ .byte 65,15,40,208 // movaps %xmm8,%xmm2
+ .byte 255,224 // jmpq *%rax
+
+BALIGN16
+ .byte 129,128,129,128,129,128,129,128,129,128// addl $0x80818081,-0x7f7e7f7f(%rax)
+ .byte 129 // .byte 0x81
+ .byte 128 // .byte 0x80
+ .byte 129 // .byte 0x81
+ .byte 128 // .byte 0x80
+ .byte 129 // .byte 0x81
+ .byte 128 // .byte 0x80
#endif
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index 33d209ea38..ebdd645410 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -26262,5 +26262,274 @@ ALIGN 16
DB 170 ; stos %al,%es:(%rdi)
DB 170 ; stos %al,%es:(%rdi)
DB 190,171,170,170,190 ; mov $0xbeaaaaab,%esi
+ALIGN 32
+
+PUBLIC _sk_start_pipeline_ssse3_lowp
+_sk_start_pipeline_ssse3_lowp LABEL PROC
+ DB 65,87 ; push %r15
+ DB 65,86 ; push %r14
+ DB 65,85 ; push %r13
+ DB 65,84 ; push %r12
+ DB 86 ; push %rsi
+ DB 87 ; push %rdi
+ DB 85 ; push %rbp
+ DB 83 ; push %rbx
+ DB 72,129,236,168,0,0,0 ; sub $0xa8,%rsp
+ DB 68,15,41,188,36,144,0,0,0 ; movaps %xmm15,0x90(%rsp)
+ DB 68,15,41,180,36,128,0,0,0 ; movaps %xmm14,0x80(%rsp)
+ DB 68,15,41,108,36,112 ; movaps %xmm13,0x70(%rsp)
+ DB 68,15,41,100,36,96 ; movaps %xmm12,0x60(%rsp)
+ DB 68,15,41,92,36,80 ; movaps %xmm11,0x50(%rsp)
+ DB 68,15,41,84,36,64 ; movaps %xmm10,0x40(%rsp)
+ DB 68,15,41,76,36,48 ; movaps %xmm9,0x30(%rsp)
+ DB 68,15,41,68,36,32 ; movaps %xmm8,0x20(%rsp)
+ DB 15,41,124,36,16 ; movaps %xmm7,0x10(%rsp)
+ DB 15,41,52,36 ; movaps %xmm6,(%rsp)
+ DB 76,137,195 ; mov %r8,%rbx
+ DB 73,137,214 ; mov %rdx,%r14
+ DB 72,137,205 ; mov %rcx,%rbp
+ DB 76,139,188,36,16,1,0,0 ; mov 0x110(%rsp),%r15
+ DB 76,137,206 ; mov %r9,%rsi
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 73,137,196 ; mov %rax,%r12
+ DB 73,137,245 ; mov %rsi,%r13
+ DB 72,141,69,8 ; lea 0x8(%rbp),%rax
+ DB 72,57,216 ; cmp %rbx,%rax
+ DB 118,5 ; jbe 7c <_sk_start_pipeline_ssse3_lowp+0x7c>
+ DB 72,137,234 ; mov %rbp,%rdx
+ DB 235,61 ; jmp b9 <_sk_start_pipeline_ssse3_lowp+0xb9>
+ DB 65,184,0,0,0,0 ; mov $0x0,%r8d
+ DB 15,87,192 ; xorps %xmm0,%xmm0
+ DB 15,87,201 ; xorps %xmm1,%xmm1
+ DB 15,87,210 ; xorps %xmm2,%xmm2
+ DB 15,87,219 ; xorps %xmm3,%xmm3
+ DB 15,87,228 ; xorps %xmm4,%xmm4
+ DB 15,87,237 ; xorps %xmm5,%xmm5
+ DB 15,87,246 ; xorps %xmm6,%xmm6
+ DB 15,87,255 ; xorps %xmm7,%xmm7
+ DB 76,137,255 ; mov %r15,%rdi
+ DB 76,137,238 ; mov %r13,%rsi
+ DB 72,137,234 ; mov %rbp,%rdx
+ DB 76,137,241 ; mov %r14,%rcx
+ DB 65,255,212 ; callq *%r12
+ DB 72,141,85,8 ; lea 0x8(%rbp),%rdx
+ DB 72,131,197,16 ; add $0x10,%rbp
+ DB 72,57,221 ; cmp %rbx,%rbp
+ DB 72,137,213 ; mov %rdx,%rbp
+ DB 118,195 ; jbe 7c <_sk_start_pipeline_ssse3_lowp+0x7c>
+ DB 72,41,211 ; sub %rdx,%rbx
+ DB 116,39 ; je e5 <_sk_start_pipeline_ssse3_lowp+0xe5>
+ DB 15,87,192 ; xorps %xmm0,%xmm0
+ DB 15,87,201 ; xorps %xmm1,%xmm1
+ DB 15,87,210 ; xorps %xmm2,%xmm2
+ DB 15,87,219 ; xorps %xmm3,%xmm3
+ DB 15,87,228 ; xorps %xmm4,%xmm4
+ DB 15,87,237 ; xorps %xmm5,%xmm5
+ DB 15,87,246 ; xorps %xmm6,%xmm6
+ DB 15,87,255 ; xorps %xmm7,%xmm7
+ DB 76,137,255 ; mov %r15,%rdi
+ DB 76,137,238 ; mov %r13,%rsi
+ DB 76,137,241 ; mov %r14,%rcx
+ DB 73,137,216 ; mov %rbx,%r8
+ DB 65,255,212 ; callq *%r12
+ DB 15,40,52,36 ; movaps (%rsp),%xmm6
+ DB 15,40,124,36,16 ; movaps 0x10(%rsp),%xmm7
+ DB 68,15,40,68,36,32 ; movaps 0x20(%rsp),%xmm8
+ DB 68,15,40,76,36,48 ; movaps 0x30(%rsp),%xmm9
+ DB 68,15,40,84,36,64 ; movaps 0x40(%rsp),%xmm10
+ DB 68,15,40,92,36,80 ; movaps 0x50(%rsp),%xmm11
+ DB 68,15,40,100,36,96 ; movaps 0x60(%rsp),%xmm12
+ DB 68,15,40,108,36,112 ; movaps 0x70(%rsp),%xmm13
+ DB 68,15,40,180,36,128,0,0,0 ; movaps 0x80(%rsp),%xmm14
+ DB 68,15,40,188,36,144,0,0,0 ; movaps 0x90(%rsp),%xmm15
+ DB 72,129,196,168,0,0,0 ; add $0xa8,%rsp
+ DB 91 ; pop %rbx
+ DB 93 ; pop %rbp
+ DB 95 ; pop %rdi
+ DB 94 ; pop %rsi
+ DB 65,92 ; pop %r12
+ DB 65,93 ; pop %r13
+ DB 65,94 ; pop %r14
+ DB 65,95 ; pop %r15
+ DB 195 ; retq
+
+PUBLIC _sk_just_return_ssse3_lowp
+_sk_just_return_ssse3_lowp LABEL PROC
+ DB 195 ; retq
+
+PUBLIC _sk_load_8888_ssse3_lowp
+_sk_load_8888_ssse3_lowp LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 76,139,24 ; mov (%rax),%r11
+ DB 77,133,192 ; test %r8,%r8
+ DB 117,120 ; jne 1bb <_sk_load_8888_ssse3_lowp+0x82>
+ DB 65,15,16,76,147,16 ; movups 0x10(%r11,%rdx,4),%xmm1
+ DB 65,15,16,4,147 ; movups (%r11,%rdx,4),%xmm0
+ DB 15,40,208 ; movaps %xmm0,%xmm2
+ DB 102,15,96,209 ; punpcklbw %xmm1,%xmm2
+ DB 102,15,104,193 ; punpckhbw %xmm1,%xmm0
+ DB 102,68,15,111,194 ; movdqa %xmm2,%xmm8
+ DB 102,68,15,96,192 ; punpcklbw %xmm0,%xmm8
+ DB 102,15,104,208 ; punpckhbw %xmm0,%xmm2
+ DB 102,69,15,111,200 ; movdqa %xmm8,%xmm9
+ DB 102,68,15,96,202 ; punpcklbw %xmm2,%xmm9
+ DB 102,68,15,104,194 ; punpckhbw %xmm2,%xmm8
+ DB 102,15,239,219 ; pxor %xmm3,%xmm3
+ DB 102,15,239,192 ; pxor %xmm0,%xmm0
+ DB 102,65,15,96,193 ; punpcklbw %xmm9,%xmm0
+ DB 102,15,239,201 ; pxor %xmm1,%xmm1
+ DB 102,65,15,104,201 ; punpckhbw %xmm9,%xmm1
+ DB 102,15,239,210 ; pxor %xmm2,%xmm2
+ DB 102,65,15,96,208 ; punpcklbw %xmm8,%xmm2
+ DB 102,65,15,104,216 ; punpckhbw %xmm8,%xmm3
+ DB 102,68,15,111,5,237,1,0,0 ; movdqa 0x1ed(%rip),%xmm8 # 390 <_sk_swap_rb_ssse3_lowp+0x1c>
+ DB 102,65,15,228,192 ; pmulhuw %xmm8,%xmm0
+ DB 102,65,15,228,200 ; pmulhuw %xmm8,%xmm1
+ DB 102,65,15,228,208 ; pmulhuw %xmm8,%xmm2
+ DB 102,65,15,228,216 ; pmulhuw %xmm8,%xmm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+ DB 69,137,193 ; mov %r8d,%r9d
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,201 ; pxor %xmm1,%xmm1
+ DB 102,15,239,192 ; pxor %xmm0,%xmm0
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 15,135,119,255,255,255 ; ja 14e <_sk_load_8888_ssse3_lowp+0x15>
+ DB 69,15,182,201 ; movzbl %r9b,%r9d
+ DB 76,141,21,118,0,0,0 ; lea 0x76(%rip),%r10 # 258 <_sk_load_8888_ssse3_lowp+0x11f>
+ DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax
+ DB 76,1,208 ; add %r10,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 102,65,15,110,68,147,24 ; movd 0x18(%r11,%rdx,4),%xmm0
+ DB 102,15,112,200,69 ; pshufd $0x45,%xmm0,%xmm1
+ DB 102,15,239,192 ; pxor %xmm0,%xmm0
+ DB 243,65,15,16,84,147,20 ; movss 0x14(%r11,%rdx,4),%xmm2
+ DB 15,198,209,0 ; shufps $0x0,%xmm1,%xmm2
+ DB 15,198,209,226 ; shufps $0xe2,%xmm1,%xmm2
+ DB 15,40,202 ; movaps %xmm2,%xmm1
+ DB 243,65,15,16,84,147,16 ; movss 0x10(%r11,%rdx,4),%xmm2
+ DB 243,15,16,202 ; movss %xmm2,%xmm1
+ DB 243,65,15,16,84,147,12 ; movss 0xc(%r11,%rdx,4),%xmm2
+ DB 15,198,208,32 ; shufps $0x20,%xmm0,%xmm2
+ DB 15,198,194,36 ; shufps $0x24,%xmm2,%xmm0
+ DB 243,65,15,16,84,147,8 ; movss 0x8(%r11,%rdx,4),%xmm2
+ DB 15,198,208,48 ; shufps $0x30,%xmm0,%xmm2
+ DB 15,198,194,132 ; shufps $0x84,%xmm2,%xmm0
+ DB 243,65,15,16,84,147,4 ; movss 0x4(%r11,%rdx,4),%xmm2
+ DB 15,198,208,0 ; shufps $0x0,%xmm0,%xmm2
+ DB 15,198,208,226 ; shufps $0xe2,%xmm0,%xmm2
+ DB 15,40,194 ; movaps %xmm2,%xmm0
+ DB 243,65,15,16,20,147 ; movss (%r11,%rdx,4),%xmm2
+ DB 243,15,16,194 ; movss %xmm2,%xmm0
+ DB 233,247,254,255,255 ; jmpq 14e <_sk_load_8888_ssse3_lowp+0x15>
+ DB 144 ; nop
+ DB 240,255 ; lock (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 222,255 ; fdivrp %st,%st(7)
+ DB 255 ; (bad)
+ DB 255,207 ; dec %edi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,192 ; inc %eax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,181,255,255,255,163 ; pushq -0x5c000001(%rbp)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
+ DB 147 ; xchg %eax,%ebx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
+
+PUBLIC _sk_store_8888_ssse3_lowp
+_sk_store_8888_ssse3_lowp LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 76,139,24 ; mov (%rax),%r11
+ DB 102,68,15,111,208 ; movdqa %xmm0,%xmm10
+ DB 102,65,15,113,210,7 ; psrlw $0x7,%xmm10
+ DB 102,68,15,111,194 ; movdqa %xmm2,%xmm8
+ DB 102,65,15,113,208,7 ; psrlw $0x7,%xmm8
+ DB 102,69,15,103,208 ; packuswb %xmm8,%xmm10
+ DB 102,68,15,111,193 ; movdqa %xmm1,%xmm8
+ DB 102,65,15,113,208,7 ; psrlw $0x7,%xmm8
+ DB 102,68,15,111,203 ; movdqa %xmm3,%xmm9
+ DB 102,65,15,113,209,7 ; psrlw $0x7,%xmm9
+ DB 102,69,15,103,193 ; packuswb %xmm9,%xmm8
+ DB 102,69,15,111,202 ; movdqa %xmm10,%xmm9
+ DB 102,69,15,96,200 ; punpcklbw %xmm8,%xmm9
+ DB 102,69,15,104,208 ; punpckhbw %xmm8,%xmm10
+ DB 102,69,15,111,193 ; movdqa %xmm9,%xmm8
+ DB 102,69,15,97,194 ; punpcklwd %xmm10,%xmm8
+ DB 102,69,15,105,202 ; punpckhwd %xmm10,%xmm9
+ DB 77,133,192 ; test %r8,%r8
+ DB 117,17 ; jne 2e3 <_sk_store_8888_ssse3_lowp+0x6f>
+ DB 243,69,15,127,76,147,16 ; movdqu %xmm9,0x10(%r11,%rdx,4)
+ DB 243,69,15,127,4,147 ; movdqu %xmm8,(%r11,%rdx,4)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+ DB 69,137,193 ; mov %r8d,%r9d
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,236 ; ja 2df <_sk_store_8888_ssse3_lowp+0x6b>
+ DB 69,15,182,201 ; movzbl %r9b,%r9d
+ DB 76,141,21,90,0,0,0 ; lea 0x5a(%rip),%r10 # 358 <_sk_store_8888_ssse3_lowp+0xe4>
+ DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax
+ DB 76,1,208 ; add %r10,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 102,69,15,112,209,78 ; pshufd $0x4e,%xmm9,%xmm10
+ DB 102,69,15,126,84,147,24 ; movd %xmm10,0x18(%r11,%rdx,4)
+ DB 102,69,15,112,209,229 ; pshufd $0xe5,%xmm9,%xmm10
+ DB 102,69,15,126,84,147,20 ; movd %xmm10,0x14(%r11,%rdx,4)
+ DB 102,69,15,126,76,147,16 ; movd %xmm9,0x10(%r11,%rdx,4)
+ DB 102,69,15,112,200,231 ; pshufd $0xe7,%xmm8,%xmm9
+ DB 102,69,15,126,76,147,12 ; movd %xmm9,0xc(%r11,%rdx,4)
+ DB 102,69,15,112,200,78 ; pshufd $0x4e,%xmm8,%xmm9
+ DB 102,69,15,126,76,147,8 ; movd %xmm9,0x8(%r11,%rdx,4)
+ DB 102,69,15,112,200,229 ; pshufd $0xe5,%xmm8,%xmm9
+ DB 102,69,15,126,76,147,4 ; movd %xmm9,0x4(%r11,%rdx,4)
+ DB 102,69,15,126,4,147 ; movd %xmm8,(%r11,%rdx,4)
+ DB 235,136 ; jmp 2df <_sk_store_8888_ssse3_lowp+0x6b>
+ DB 144 ; nop
+ DB 247,255 ; idiv %edi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 234 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 221,255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,208 ; callq *%rax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,201 ; dec %ecx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 188,255,255,255,175 ; mov $0xafffffff,%esp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
+
+PUBLIC _sk_swap_rb_ssse3_lowp
+_sk_swap_rb_ssse3_lowp LABEL PROC
+ DB 68,15,40,192 ; movaps %xmm0,%xmm8
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 15,40,194 ; movaps %xmm2,%xmm0
+ DB 65,15,40,208 ; movaps %xmm8,%xmm2
+ DB 255,224 ; jmpq *%rax
+
+ALIGN 16
+ DB 129,128,129,128,129,128,129,128,129,128; addl $0x80818081,-0x7f7e7f7f(%rax)
+ DB 129 ; .byte 0x81
+ DB 128 ; .byte 0x80
+ DB 129 ; .byte 0x81
+ DB 128 ; .byte 0x80
+ DB 129 ; .byte 0x81
+ DB 128 ; .byte 0x80
ENDIF
END
diff --git a/src/jumper/SkJumper_stages_lowp.cpp b/src/jumper/SkJumper_stages_lowp.cpp
new file mode 100644
index 0000000000..1abaff76b4
--- /dev/null
+++ b/src/jumper/SkJumper_stages_lowp.cpp
@@ -0,0 +1,180 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkJumper.h"
+#include "SkJumper_misc.h"
+#include <immintrin.h>
+
+#if !defined(__SSSE3__) || !defined(__clang__) || !defined(__x86_64__)
+ #error "We're starting with just SSSE3 x86-64 for now, and will always require Clang."
+#endif
+
+#define WRAP(name) sk_##name##_ssse3_lowp
+
+using K = const SkJumper_constants;
+static const size_t kStride = 8;
+
+template <typename T> using V = T __attribute__((ext_vector_type(8)));
+using U8 = V<uint8_t>;
+using U16 = V<uint16_t>;
+using U32 = V<uint32_t>;
+
+// See SkFixed15.h for details on this format and its operations.
+struct F {
+ U16 vec;
+
+ F() = default;
+ F(uint16_t bits) : vec(bits) {}
+
+ F(U16 v) : vec(v) {}
+ operator U16() const { return vec; }
+};
+
+SI F operator+(F x, F y) { return x.vec + y.vec; }
+SI F operator-(F x, F y) { return x.vec - y.vec; }
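+// _mm_mulhrs_epi16 is a signed multiply, (x*y + 0x4000) >> 15 with rounding.
+// Our values are unsigned [0,32768]; 32768 is 0x8000, which mulhrs reads as
+// -32768, negating the product, so _mm_abs_epi16 flips it back.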
+SI F operator*(F x, F y) { return _mm_abs_epi16(_mm_mulhrs_epi16(x.vec, y.vec)); }
+SI F mad(F f, F m, F a) { return f*m+a; }
+
+SI F operator<<(F x, int bits) { return x.vec << bits; }
+SI F operator>>(F x, int bits) { return x.vec >> bits; }
+
+using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,F, F,F,F,F);
+
+MAYBE_MSABI
+extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t limit, void** program, K* k) {
+ F v{};
+ auto start = (Stage*)load_and_inc(program);
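+ // March kStride (8) pixels at a time; any remainder (1-7 pixels) makes one last pass with tail set.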
+ while (x + kStride <= limit) {
+ start(k,program,x,y,0, v,v,v,v, v,v,v,v);
+ x += kStride;
+ }
+ if (size_t tail = limit - x) {
+ start(k,program,x,y,tail, v,v,v,v, v,v,v,v);
+ }
+}
+extern "C" void WRAP(just_return)(K*, void**, size_t,size_t,size_t, F,F,F,F, F,F,F,F) {}
+
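+// STAGE() wraps each stage body in an extern "C" shim: it peels the stage's
+// context off the program, runs the body, then chains into the next stage.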
+#define STAGE(name) \
+ SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail, \
+ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
+ extern "C" void WRAP(name)(K* k, void** program, size_t x, size_t y, size_t tail, \
+ F r, F g, F b, F a, F dr, F dg, F db, F da) { \
+ LazyCtx ctx(program); \
+ name##_k(k,ctx,x,y,tail, r,g,b,a, dr,dg,db,da); \
+ auto next = (Stage*)load_and_inc(program); \
+ next(k,program,x,y,tail, r,g,b,a, dr,dg,db,da); \
+ } \
+ SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail, \
+ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+
+
+// Helper functions used by multiple stages.
+
+template <typename V, typename T>
+SI V load(const T* src, size_t tail) {
+#if defined(JUMPER)
+ __builtin_assume(tail < kStride);
+ if (__builtin_expect(tail, 0)) {
+ V v{}; // Any inactive lanes are zeroed.
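+ // Deliberate fall-through: entering at case tail-1 copies exactly lanes [0, tail).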
+ switch (tail-1) {
+ case 6: v[6] = src[6];
+ case 5: v[5] = src[5];
+ case 4: v[4] = src[4];
+ case 3: v[3] = src[3];
+ case 2: v[2] = src[2];
+ case 1: v[1] = src[1];
+ case 0: v[0] = src[0];
+ }
+ return v;
+ }
+#endif
+ return unaligned_load<V>(src);
+}
+
+template <typename V, typename T>
+SI void store(T* dst, V v, size_t tail) {
+#if defined(JUMPER)
+ __builtin_assume(tail < kStride);
+ if (__builtin_expect(tail, 0)) {
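+ // Same fall-through trick as load(): write back only lanes [0, tail).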
+ switch (tail-1) {
+ case 6: dst[6] = v[6];
+ case 5: dst[5] = v[5];
+ case 4: dst[4] = v[4];
+ case 3: dst[3] = v[3];
+ case 2: dst[2] = v[2];
+ case 1: dst[1] = v[1];
+ case 0: dst[0] = v[0];
+ }
+ return;
+ }
+#endif
+ unaligned_store(dst, v);
+}
+
+SI void from_8888(U32 rgba, F* r, F* g, F* b, F* a) {
+ // Split the 8 pixels into low and high halves, and reinterpret as vectors of 16-bit values.
+ U16 lo = unaligned_load<U16>((const uint32_t*)&rgba + 0),
+ hi = unaligned_load<U16>((const uint32_t*)&rgba + 4);
+
+ U16 _0415 = _mm_unpacklo_epi8(lo, hi), // r0 r4 g0 g4 b0 b4 a0 a4 r1 r5 g1 g5 b1 b5 a1 a5
+ _2637 = _mm_unpackhi_epi8(lo, hi);
+
+ U16 even = _mm_unpacklo_epi8(_0415, _2637), // r0 r2 r4 r6 g0 g2 g4 g6 b0 b2 b4 b6 a0 a2 a4 a6
+ odd = _mm_unpackhi_epi8(_0415, _2637);
+
+ U16 rg = _mm_unpacklo_epi8(even, odd), // r0 r1 r2 r3 r4 r5 r6 r7 g0 g1 g2 g3 g4 g5 g6 g7
+ ba = _mm_unpackhi_epi8(even, odd);
+
+ // Unpack as 16-bit values into the high half of each 16-bit lane, to get a free *256.
+ U16 R = _mm_unpacklo_epi8(U16(0), rg),
+ G = _mm_unpackhi_epi8(U16(0), rg),
+ B = _mm_unpacklo_epi8(U16(0), ba),
+ A = _mm_unpackhi_epi8(U16(0), ba);
+
+ // Now we scale from [0,255] to [0,32768]. Ideally that's 32768/255 = 128.50196,
+ // but we can approximate that very cheaply as 256*32897/65536 = 128.50391.
+ // 0 and 255 map to 0 and 32768 correctly, and nothing else is off by more than 1.
+ *r = _mm_mulhi_epu16(R, U16(32897));
+ *g = _mm_mulhi_epu16(G, U16(32897));
+ *b = _mm_mulhi_epu16(B, U16(32897));
+ *a = _mm_mulhi_epu16(A, U16(32897));
+}
+SI U32 to_8888(F r, F g, F b, F a) {
+ // We want to interlace and pack these values from [0,32768] to [0,255].
+ // Luckily the simplest possible thing works great: >>7, then saturate.
+ // The 'u' in packus handles the saturation to [0,255] we need.
+ U16 rb = _mm_packus_epi16(r>>7,b>>7), // r0 r1 r2 r3 r4 r5 r6 r7 b0 b1 b2 b3 b4 b5 b6 b7
+ ga = _mm_packus_epi16(g>>7,a>>7);
+
+ U16 rg = _mm_unpacklo_epi8(rb, ga), // r0 g0 r1 g1 ... r7 g7
+ ba = _mm_unpackhi_epi8(rb, ga); // b0 a0 ... b7 a7
+
+ U16 lo = _mm_unpacklo_epi16(rg, ba), // r0 g0 b0 a0 ... r3 g3 b3 a3
+ hi = _mm_unpackhi_epi16(rg, ba); // r4 g4 b4 a4 ... r7 g7 b7 a7
+
+ U32 px;
+ memcpy((uint32_t*)&px + 0, &lo, sizeof(lo));
+ memcpy((uint32_t*)&px + 4, &hi, sizeof(hi));
+ return px;
+}
+
+// Stages!
+
+STAGE(load_8888) {
+ auto ptr = *(const uint32_t**)ctx + x;
+ from_8888(load<U32>(ptr, tail), &r,&g,&b,&a);
+}
+STAGE(store_8888) {
+ auto ptr = *(uint32_t**)ctx + x;
+ store(ptr, to_8888(r,g,b,a), tail);
+}
+
+STAGE(swap_rb) {
+ auto tmp = r;
+ r = b;
+ b = tmp;
+}
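The 32897 scaling in from_8888()/to_8888() round-trips every byte value exactly. A standalone scalar check of the arithmetic (a sketch, independent of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
        for (uint32_t v = 0; v <= 255; v++) {
            // load: the byte sits in the high half of a 16-bit lane (v*256),
            // then pmulhuw scales by 32897/65536, landing in [0,32768].
            uint32_t fixed15 = (v * 256 * 32897) >> 16;
            // store: >>7, then packuswb saturates to [0,255].
            uint32_t back = fixed15 >> 7;
            if (back > 255) { back = 255; }
            assert(back == v);   // lossless for all 256 byte values
        }
    }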
diff --git a/src/jumper/build_stages.py b/src/jumper/build_stages.py
index 301295fa68..971087b3d5 100755
--- a/src/jumper/build_stages.py
+++ b/src/jumper/build_stages.py
@@ -29,6 +29,14 @@ subprocess.check_call(clang + cflags + sse2 + win +
['-c', 'src/jumper/SkJumper_stages.cpp'] +
['-o', 'win_sse2.o'])
+ssse3 = ['-mssse3', '-mno-sse4.1']
+subprocess.check_call(clang + cflags + ssse3 +
+ ['-c', 'src/jumper/SkJumper_stages_lowp.cpp'] +
+ ['-o', 'lowp_ssse3.o'])
+subprocess.check_call(clang + cflags + ssse3 + win +
+ ['-c', 'src/jumper/SkJumper_stages_lowp.cpp'] +
+ ['-o', 'win_lowp_ssse3.o'])
+
sse41 = ['-msse4.1']
subprocess.check_call(clang + cflags + sse41 +
['-c', 'src/jumper/SkJumper_stages.cpp'] +
@@ -187,6 +195,8 @@ print 'BALIGN32'
parse_object_file('sse41.o', '.byte')
print 'BALIGN32'
parse_object_file('sse2.o', '.byte')
+print 'BALIGN32'
+parse_object_file('lowp_ssse3.o', '.byte')
print '#endif'
@@ -210,5 +220,7 @@ print 'ALIGN 32'
parse_object_file('win_sse41.o', 'DB')
print 'ALIGN 32'
parse_object_file('win_sse2.o', 'DB')
+print 'ALIGN 32'
+parse_object_file('win_lowp_ssse3.o', 'DB')
print 'ENDIF'
print 'END'
diff --git a/tests/SkRasterPipelineTest.cpp b/tests/SkRasterPipelineTest.cpp
index f65a2c9868..014683c9b9 100644
--- a/tests/SkRasterPipelineTest.cpp
+++ b/tests/SkRasterPipelineTest.cpp
@@ -205,3 +205,31 @@ DEF_TEST(SkRasterPipeline_tail, r) {
}
}
}
+
+DEF_TEST(SkRasterPipeline_lowp, r) {
+ uint32_t rgba[64];
+ for (int i = 0; i < 64; i++) {
+ rgba[i] = (4*i+0) << 0
+ | (4*i+1) << 8
+ | (4*i+2) << 16
+ | (4*i+3) << 24;
+ }
+
+ void* ptr = rgba;
+
+ SkRasterPipeline_<256> p;
+ p.append(SkRasterPipeline::load_8888, &ptr);
+ p.append(SkRasterPipeline::swap_rb);
+ p.append(SkRasterPipeline::store_8888, &ptr);
+ p.run(0,0,64);
+
+ for (int i = 0; i < 64; i++) {
+ uint32_t want = (4*i+0) << 16
+ | (4*i+1) << 8
+ | (4*i+2) << 0
+ | (4*i+3) << 24;
+ if (rgba[i] != want) {
+ ERRORF(r, "got %08x, want %08x\n", rgba[i], want);
+ }
+ }
+}
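A natural follow-on (hypothetical, not part of this patch) would pin down the lowp tail path too, by running a pixel count that is not a multiple of the 8-pixel stride:

    DEF_TEST(SkRasterPipeline_lowp_tail, r) {
        uint32_t rgba[11];
        for (int i = 0; i < 11; i++) {
            rgba[i] = (4*i+0) << 0 | (4*i+1) << 8 | (4*i+2) << 16 | (4*i+3) << 24;
        }

        void* ptr = rgba;

        SkRasterPipeline_<256> p;
        p.append(SkRasterPipeline::load_8888,  &ptr);
        p.append(SkRasterPipeline::swap_rb);
        p.append(SkRasterPipeline::store_8888, &ptr);
        p.run(0,0,11);   // one full 8-pixel stride, then a 3-pixel tail

        for (int i = 0; i < 11; i++) {
            uint32_t want = (4*i+0) << 16 | (4*i+1) << 8 | (4*i+2) << 0 | (4*i+3) << 24;
            if (rgba[i] != want) {
                ERRORF(r, "got %08x, want %08x\n", rgba[i], want);
            }
        }
    }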