 bench/SkRasterPipelineBench.cpp      | 121
 src/core/SkRasterPipeline.h          |  35
 src/core/SkRasterPipelineBlitter.cpp | 172
 tests/SkRasterPipelineTest.cpp       |  71
 4 files changed, 180 insertions(+), 219 deletions(-)
diff --git a/bench/SkRasterPipelineBench.cpp b/bench/SkRasterPipelineBench.cpp
index 0243940a10..a5263d770f 100644
--- a/bench/SkRasterPipelineBench.cpp
+++ b/bench/SkRasterPipelineBench.cpp
@@ -23,10 +23,8 @@ static uint8_t mask[N];
 //   - store src back as srgb
 // Every stage except for srcover interacts with memory, and so will need _tail variants.
 
-static void SK_VECTORCALL load_s_srgb(SkRasterPipeline::Stage* st, size_t x,
-                                      Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                      Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<const uint32_t*>() + x;
+SK_RASTER_STAGE(load_s_srgb) {
+    auto ptr = (const uint32_t*)ctx + x;
 
     r = Sk4f{ sk_linear_from_srgb[(ptr[0] >>  0) & 0xff],
               sk_linear_from_srgb[(ptr[1] >>  0) & 0xff],
@@ -44,27 +42,19 @@ static void SK_VECTORCALL load_s_srgb(SkRasterPipeline::Stage* st, size_t x,
               sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] };
 
     a = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);
-
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
-static void SK_VECTORCALL load_s_srgb_tail(SkRasterPipeline::Stage* st, size_t x,
-                                           Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                           Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<const uint32_t*>() + x;
+SK_RASTER_STAGE(load_s_srgb_tail) {
+    auto ptr = (const uint32_t*)ctx + x;
 
     r = Sk4f{ sk_linear_from_srgb[(*ptr >>  0) & 0xff], 0,0,0 };
     g = Sk4f{ sk_linear_from_srgb[(*ptr >>  8) & 0xff], 0,0,0 };
     b = Sk4f{ sk_linear_from_srgb[(*ptr >> 16) & 0xff], 0,0,0 };
     a = Sk4f{                     (*ptr >> 24) * (1/255.0f), 0,0,0 };
-
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
-static void SK_VECTORCALL load_d_srgb(SkRasterPipeline::Stage* st, size_t x,
-                                      Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                      Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<const uint32_t*>() + x;
+SK_RASTER_STAGE(load_d_srgb) {
+    auto ptr = (const uint32_t*)ctx + x;
 
     dr = Sk4f{ sk_linear_from_srgb[(ptr[0] >>  0) & 0xff],
               sk_linear_from_srgb[(ptr[1] >>  0) & 0xff],
@@ -82,67 +72,47 @@ static void SK_VECTORCALL load_d_srgb(SkRasterPipeline::Stage* st, size_t x,
               sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] };
 
     da = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);
-
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
-static void SK_VECTORCALL load_d_srgb_tail(SkRasterPipeline::Stage* st, size_t x,
-                                           Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                           Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<const uint32_t*>() + x;
+SK_RASTER_STAGE(load_d_srgb_tail) {
+    auto ptr = (const uint32_t*)ctx + x;
 
     dr = Sk4f{ sk_linear_from_srgb[(*ptr >>  0) & 0xff], 0,0,0 };
     dg = Sk4f{ sk_linear_from_srgb[(*ptr >>  8) & 0xff], 0,0,0 };
     db = Sk4f{ sk_linear_from_srgb[(*ptr >> 16) & 0xff], 0,0,0 };
     da = Sk4f{                     (*ptr >> 24) * (1/255.0f), 0,0,0 };
-
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
-static void SK_VECTORCALL scale_u8(SkRasterPipeline::Stage* st, size_t x,
-                                   Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                   Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<const uint8_t*>() + x;
+SK_RASTER_STAGE(scale_u8) {
+    auto ptr = (const uint8_t*)ctx + x;
 
     auto c = SkNx_cast<float>(Sk4b::Load(ptr)) * (1/255.0f);
     r *= c;
     g *= c;
    b *= c;
     a *= c;
-
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
-static void SK_VECTORCALL scale_u8_tail(SkRasterPipeline::Stage* st, size_t x,
-                                        Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                        Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<const uint8_t*>() + x;
+SK_RASTER_STAGE(scale_u8_tail) {
+    auto ptr = (const uint8_t*)ctx + x;
 
     auto c = *ptr * (1/255.0f);
     r *= c;
     g *= c;
     b *= c;
     a *= c;
-
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
-static void SK_VECTORCALL srcover(SkRasterPipeline::Stage* st, size_t x,
-                                  Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                  Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+SK_RASTER_STAGE(srcover) {
     auto A = 1.0f - a;
     r += dr * A;
     g += dg * A;
     b += db * A;
     a += da * A;
-
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
-static void SK_VECTORCALL store_srgb(SkRasterPipeline::Stage* st, size_t x,
-                                     Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                     Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<uint32_t*>() + x;
+SK_RASTER_STAGE(store_srgb) {
+    auto ptr = (uint32_t*)ctx + x;
 
     (  sk_linear_to_srgb(r)
      | sk_linear_to_srgb(g) <<  8
@@ -150,10 +120,8 @@ static void SK_VECTORCALL store_srgb(SkRasterPipeline::Stage* st, size_t x,
      | Sk4f_round(255.0f*a) << 24).store(ptr);
 }
 
-static void SK_VECTORCALL store_srgb_tail(SkRasterPipeline::Stage* st, size_t x,
-                                          Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                          Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<uint32_t*>() + x;
+SK_RASTER_STAGE(store_srgb_tail) {
+    auto ptr = (uint32_t*)ctx + x;
 
     Sk4i rgba = sk_linear_to_srgb({r[0], g[0], b[0], 0});
     rgba = {rgba[0], rgba[1], rgba[2], (int)(255.0f * a[0] + 0.5f)};
@@ -163,23 +131,56 @@ static void SK_VECTORCALL store_srgb_tail(SkRasterPipeline::Stage* st, size_t x,
 class SkRasterPipelineBench : public Benchmark {
 public:
-    SkRasterPipelineBench() {}
+    SkRasterPipelineBench(bool fused) : fFused(fused) {}
 
     bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
-    const char* onGetName() override { return "SkRasterPipelineBench"; }
+    const char* onGetName() override { return fFused ? "SkRasterPipelineBench_fused"
+                                                     : "SkRasterPipelineBench_pipeline"; }
 
     void onDraw(int loops, SkCanvas*) override {
-        SkRasterPipeline p;
-        p.append(load_s_srgb, load_s_srgb_tail,  src);
-        p.append(   scale_u8,    scale_u8_tail, mask);
-        p.append(load_d_srgb, load_d_srgb_tail,  dst);
-        p.append(srcover);
-        p.append( store_srgb,  store_srgb_tail,  dst);
-
         while (loops --> 0) {
-            p.run(N);
+            fFused ? this->runFused() : this->runPipeline();
         }
     }
+
+    void runFused() {
+        Sk4f r,g,b,a, dr,dg,db,da;
+        size_t x = 0, n = N;
+        while (n >= 4) {
+            load_s_srgb(src ,    x, r,g,b,a, dr,dg,db,da);
+            scale_u8   (mask,    x, r,g,b,a, dr,dg,db,da);
+            load_d_srgb(dst ,    x, r,g,b,a, dr,dg,db,da);
+            srcover    (nullptr, x, r,g,b,a, dr,dg,db,da);
+            store_srgb (dst ,    x, r,g,b,a, dr,dg,db,da);
+
+            x += 4;
+            n -= 4;
+        }
+        while (n > 0) {
+            load_s_srgb_tail(src ,    x, r,g,b,a, dr,dg,db,da);
+            scale_u8_tail   (mask,    x, r,g,b,a, dr,dg,db,da);
+            load_d_srgb_tail(dst ,    x, r,g,b,a, dr,dg,db,da);
+            srcover         (nullptr, x, r,g,b,a, dr,dg,db,da);
+            store_srgb_tail (dst ,    x, r,g,b,a, dr,dg,db,da);
+
+            x += 1;
+            n -= 1;
+        }
+    }
+
+    void runPipeline() {
+        SkRasterPipeline p;
+        p.append<load_s_srgb, load_s_srgb_tail>( src);
+        p.append<   scale_u8,    scale_u8_tail>(mask);
+        p.append<load_d_srgb, load_d_srgb_tail>( dst);
+        p.append<srcover>();
+        p.append< store_srgb,  store_srgb_tail>( dst);
+
+        p.run(N);
+    }
+
+    bool fFused;
 };
 
-DEF_BENCH( return new SkRasterPipelineBench; )
+DEF_BENCH( return new SkRasterPipelineBench(true); )
+DEF_BENCH( return new SkRasterPipelineBench(false); )
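A note on the two paths above: runFused() hand-inlines the five stages, while runPipeline() dispatches the same work through SkRasterPipeline's stored function pointers, so comparing the two benches isolates the per-stage call overhead. The direct calls are possible because an EasyFn stage is just a static inline function that mutates its Sk4f arguments through references. A minimal sketch of the same pattern, using a hypothetical fill_white stage (illustration only, not part of this CL):

    SK_RASTER_STAGE(fill_white) {   // hypothetical stage; ctx, x, and dr..da go unused
        r = g = b = a = 1.0f;
    }

    Sk4f r,g,b,a, dr,dg,db,da;
    fill_white(nullptr, 0, r,g,b,a, dr,dg,db,da);   // direct, fused call: no Stage*, no next()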
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index acbabcbadc..475f517b18 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -45,6 +45,8 @@
  *
  * Some obvious stages that typically return are those that write a color to a destination pointer,
  * but any stage can short-circuit the rest of the pipeline by returning instead of calling next().
+ *
+ * TODO: explain EasyFn and SK_RASTER_STAGE
  */
 
 class SkRasterPipeline {
@@ -52,6 +54,9 @@ public:
     struct Stage;
     using Fn = void(SK_VECTORCALL *)(Stage*, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
                                                      Sk4f,Sk4f,Sk4f,Sk4f);
+    using EasyFn = void(void*, size_t, Sk4f&, Sk4f&, Sk4f&, Sk4f&,
+                                       Sk4f&, Sk4f&, Sk4f&, Sk4f&);
+
     struct Stage {
         template <typename T>
         T ctx() { return static_cast<T>(fCtx); }
@@ -94,6 +99,21 @@ public:
         this->append(body, ctx, tail, ctx);
     }
 
+    // Versions of append that can be used with static EasyFns (see SK_RASTER_STAGE).
+    template <EasyFn body, EasyFn tail>
+    void append(const void* body_ctx, const void* tail_ctx) {
+        this->append(Easy<body>, body_ctx,
+                     Easy<tail>, tail_ctx);
+    }
+
+    template <EasyFn fn>
+    void append(const void* ctx = nullptr) { this->append<fn, fn>(ctx, ctx); }
+
+    template <EasyFn body, EasyFn tail>
+    void append(const void* ctx = nullptr) { this->append<body, tail>(ctx, ctx); }
+
+    // Append all stages to this pipeline.
     void extend(const SkRasterPipeline&);
 
@@ -106,10 +126,25 @@ private:
     static void SK_VECTORCALL JustReturn(Stage*, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
                                                          Sk4f,Sk4f,Sk4f,Sk4f);
 
+    template <EasyFn kernel>
+    static void SK_VECTORCALL Easy(SkRasterPipeline::Stage* st, size_t x,
+                                   Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
+                                   Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+        kernel(st->ctx<void*>(), x, r,g,b,a, dr,dg,db,da);
+        st->next(x, r,g,b,a, dr,dg,db,da);
+    }
+
     Stages fBody,
            fTail;
     Fn fBodyStart = &JustReturn,
        fTailStart = &JustReturn;
 };
 
+// These are always static, and we _really_ want them to inline.
+// If you find yourself wanting a non-inline stage, write a SkRasterPipeline::Fn directly.
+#define SK_RASTER_STAGE(name)                                                  \
+    static SK_ALWAYS_INLINE void name(void* ctx, size_t x,                     \
+                                      Sk4f&  r, Sk4f&  g, Sk4f&  b, Sk4f&  a,  \
+                                      Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da)
+
 #endif//SkRasterPipeline_DEFINED
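Filling in the TODO above, briefly: SK_RASTER_STAGE(name) declares name as a static, always-inline function with the EasyFn signature. Modulo whitespace, SK_RASTER_STAGE(srcover) expands to:

    static SK_ALWAYS_INLINE void srcover(void* ctx, size_t x,
                                         Sk4f&  r, Sk4f&  g, Sk4f&  b, Sk4f&  a,
                                         Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da)

The templated append<fn>() then instantiates Easy<fn>, a real SK_VECTORCALL Fn that unpacks the stage's context, runs the kernel with the color registers passed by reference, and calls next() itself. EasyFn stages therefore never touch pipeline control flow; the tradeoff is that, unlike hand-written Fns, they cannot short-circuit the rest of the pipeline by returning early.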
diff --git a/src/core/SkRasterPipelineBlitter.cpp b/src/core/SkRasterPipelineBlitter.cpp
index f3fc76ee79..0fae6dd5a2 100644
--- a/src/core/SkRasterPipelineBlitter.cpp
+++ b/src/core/SkRasterPipelineBlitter.cpp
@@ -58,9 +58,7 @@ SkBlitter* SkCreateRasterPipelineBlitter(const SkPixmap& dst,
 }
 
 // Clamp colors into [0,1] premul (e.g. just before storing back to memory).
-static void SK_VECTORCALL clamp_01_premul(SkRasterPipeline::Stage* st, size_t x,
-                                          Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                          Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+SK_RASTER_STAGE(clamp_01_premul) {
     a = Sk4f::Max(a, 0.0f);
     r = Sk4f::Max(r, 0.0f);
     g = Sk4f::Max(g, 0.0f);
@@ -70,32 +68,23 @@ static void SK_VECTORCALL clamp_01_premul(SkRasterPipeline::Stage* st, size_t x,
     r = Sk4f::Min(r, a);
     g = Sk4f::Min(g, a);
     b = Sk4f::Min(b, a);
-
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
 // The default shader produces a constant color (from the SkPaint).
-static void SK_VECTORCALL constant_color(SkRasterPipeline::Stage* st, size_t x,
-                                         Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                         Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto color = st->ctx<const SkPM4f*>();
+SK_RASTER_STAGE(constant_color) {
+    auto color = (const SkPM4f*)ctx;
     r = color->r();
     g = color->g();
     b = color->b();
     a = color->a();
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
 // The default transfer mode is srcover, s' = s + d*(1-sa).
-static void SK_VECTORCALL srcover(SkRasterPipeline::Stage* st, size_t x,
-                                  Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                  Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto A = 1.0f - a;
-    r += dr*A;
-    g += dg*A;
-    b += db*A;
-    a += da*A;
-    st->next(x, r,g,b,a, dr,dg,db,da);
+SK_RASTER_STAGE(srcover) {
+    r += dr*(1.0f - a);
+    g += dg*(1.0f - a);
+    b += db*(1.0f - a);
+    a += da*(1.0f - a);
 }
 
 static Sk4f lerp(const Sk4f& from, const Sk4f& to, const Sk4f& cov) {
@@ -103,44 +92,35 @@ static Sk4f lerp(const Sk4f& from, const Sk4f& to, const Sk4f& cov) {
 }
 
 // s' = d(1-c) + sc, for a constant c.
-static void SK_VECTORCALL lerp_constant_float(SkRasterPipeline::Stage* st, size_t x,
-                                              Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                              Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    Sk4f c = *st->ctx<const float*>();
+SK_RASTER_STAGE(lerp_constant_float) {
+    Sk4f c = *(const float*)ctx;
 
     r = lerp(dr, r, c);
     g = lerp(dg, g, c);
     b = lerp(db, b, c);
     a = lerp(da, a, c);
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
 // s' = d(1-c) + sc, 4 pixels at a time for 8-bit coverage.
-static void SK_VECTORCALL lerp_a8(SkRasterPipeline::Stage* st, size_t x,
-                                  Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                  Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<const uint8_t*>() + x;
+SK_RASTER_STAGE(lerp_a8) {
+    auto ptr = (const uint8_t*)ctx + x;
     Sk4f c = SkNx_cast<float>(Sk4b::Load(ptr)) * (1/255.0f);
 
     r = lerp(dr, r, c);
     g = lerp(dg, g, c);
     b = lerp(db, b, c);
     a = lerp(da, a, c);
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
 // Tail variant of lerp_a8() handling 1 pixel at a time.
-static void SK_VECTORCALL lerp_a8_1(SkRasterPipeline::Stage* st, size_t x,
-                                    Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                    Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<const uint8_t*>() + x;
+SK_RASTER_STAGE(lerp_a8_1) {
+    auto ptr = (const uint8_t*)ctx + x;
     Sk4f c = *ptr * (1/255.0f);
 
     r = lerp(dr, r, c);
     g = lerp(dg, g, c);
     b = lerp(db, b, c);
     a = lerp(da, a, c);
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
 static void from_565(const Sk4h& _565, Sk4f* r, Sk4f* g, Sk4f* b) {
@@ -158,10 +138,8 @@ static Sk4h to_565(const Sk4f& r, const Sk4f& g, const Sk4f& b) {
 }
 
 // s' = d(1-c) + sc, 4 pixels at a time for 565 coverage.
-static void SK_VECTORCALL lerp_lcd16(SkRasterPipeline::Stage* st, size_t x,
-                                     Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                     Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<const uint16_t*>() + x;
+SK_RASTER_STAGE(lerp_lcd16) {
+    auto ptr = (const uint16_t*)ctx + x;
     Sk4f cr, cg, cb;
     from_565(Sk4h::Load(ptr), &cr, &cg, &cb);
 
@@ -169,14 +147,11 @@ static void SK_VECTORCALL lerp_lcd16(SkRasterPipeline::Stage* st, size_t x,
     g = lerp(dg, g, cg);
     b = lerp(db, b, cb);
     a = 1.0f;
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
 // Tail variant of lerp_lcd16() handling 1 pixel at a time.
-static void SK_VECTORCALL lerp_lcd16_1(SkRasterPipeline::Stage* st, size_t x,
-                                       Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                       Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<const uint16_t*>() + x;
+SK_RASTER_STAGE(lerp_lcd16_1) {
+    auto ptr = (const uint16_t*)ctx + x;
     Sk4f cr, cg, cb;
     from_565({*ptr,0,0,0}, &cr, &cg, &cb);
 
@@ -184,52 +159,39 @@ static void SK_VECTORCALL lerp_lcd16_1(SkRasterPipeline::Stage* st, size_t x,
     g = lerp(dg, g, cg);
     b = lerp(db, b, cb);
     a = 1.0f;
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
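All the lerp_* stages above apply the coverage equation from their comments, s' = d(1-c) + sc, channel by channel. lerp()'s body sits outside these hunks, so take the equation rather than any particular implementation as the contract; a scalar sanity check with concrete numbers:

    // With d = 0.5, s = 1.0, c = 0.25:
    float s_new = 0.5f*(1 - 0.25f) + 1.0f*0.25f;   // = 0.625, a quarter of the way from d toward s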
 // Load 4 565 dst pixels.
-static void SK_VECTORCALL load_d_565(SkRasterPipeline::Stage* st, size_t x,
-                                     Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                     Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<const uint16_t*>() + x;
+SK_RASTER_STAGE(load_d_565) {
+    auto ptr = (const uint16_t*)ctx + x;
 
     from_565(Sk4h::Load(ptr), &dr,&dg,&db);
     da = 1.0f;
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
 // Load 1 565 dst pixel.
-static void SK_VECTORCALL load_d_565_1(SkRasterPipeline::Stage* st, size_t x,
-                                       Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                       Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<const uint16_t*>() + x;
+SK_RASTER_STAGE(load_d_565_1) {
+    auto ptr = (const uint16_t*)ctx + x;
 
     from_565({*ptr,0,0,0}, &dr,&dg,&db);
     da = 1.0f;
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
 // Store 4 565 pixels.
-static void SK_VECTORCALL store_565(SkRasterPipeline::Stage* st, size_t x,
-                                    Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                    Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<uint16_t*>() + x;
+SK_RASTER_STAGE(store_565) {
+    auto ptr = (uint16_t*)ctx + x;
     to_565(r,g,b).store(ptr);
 }
 
 // Store 1 565 pixel.
-static void SK_VECTORCALL store_565_1(SkRasterPipeline::Stage* st, size_t x,
-                                      Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                      Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<uint16_t*>() + x;
+SK_RASTER_STAGE(store_565_1) {
+    auto ptr = (uint16_t*)ctx + x;
     *ptr = to_565(r,g,b)[0];
 }
 
 // Load 4 F16 pixels.
-static void SK_VECTORCALL load_d_f16(SkRasterPipeline::Stage* st, size_t x,
-                                     Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                     Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<const uint64_t*>() + x;
+SK_RASTER_STAGE(load_d_f16) {
+    auto ptr = (const uint64_t*)ctx + x;
 
     Sk4h rh, gh, bh, ah;
     Sk4h_load4(ptr, &rh, &gh, &bh, &ah);
@@ -238,49 +200,37 @@ static void SK_VECTORCALL load_d_f16(SkRasterPipeline::Stage* st, size_t x,
     dg = SkHalfToFloat_finite(gh);
     db = SkHalfToFloat_finite(bh);
     da = SkHalfToFloat_finite(ah);
-
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
 // Load 1 F16 pixel.
-static void SK_VECTORCALL load_d_f16_1(SkRasterPipeline::Stage* st, size_t x,
-                                       Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                       Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<const uint64_t*>() + x;
+SK_RASTER_STAGE(load_d_f16_1) {
+    auto ptr = (const uint64_t*)ctx + x;
 
     auto p0 = SkHalfToFloat_finite(ptr[0]);
     dr = { p0[0],0,0,0 };
     dg = { p0[1],0,0,0 };
     db = { p0[2],0,0,0 };
     da = { p0[3],0,0,0 };
-
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
 // Store 4 F16 pixels.
-static void SK_VECTORCALL store_f16(SkRasterPipeline::Stage* st, size_t x,
-                                    Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                    Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<uint64_t*>() + x;
+SK_RASTER_STAGE(store_f16) {
+    auto ptr = (uint64_t*)ctx + x;
 
     Sk4h_store4(ptr, SkFloatToHalf_finite(r),
                      SkFloatToHalf_finite(g),
                      SkFloatToHalf_finite(b),
                      SkFloatToHalf_finite(a));
 }
 
 // Store 1 F16 pixel.
-static void SK_VECTORCALL store_f16_1(SkRasterPipeline::Stage* st, size_t x,
-                                      Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                      Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<uint64_t*>() + x;
+SK_RASTER_STAGE(store_f16_1) {
+    auto ptr = (uint64_t*)ctx + x;
 
     SkFloatToHalf_finite({r[0], g[0], b[0], a[0]}).store(ptr);
 }
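from_565() and to_565() also sit outside these hunks. For orientation: kRGB_565 packs a pixel into 16 bits as 5 bits of red (bits 15..11), 6 of green (bits 10..5), and 5 of blue (bits 4..0), which is why the 565 load stages above synthesize da = 1.0f; the format has no alpha. A standalone sketch of that packing (an illustration of the layout, not Skia's actual helpers):

    #include <stdint.h>

    // r, g, b in [0,1] -> RRRRRGGGGGGBBBBB
    static inline uint16_t pack565(float r, float g, float b) {
        return (uint16_t)( (unsigned)(r*31 + 0.5f) << 11
                         | (unsigned)(g*63 + 0.5f) <<  5
                         | (unsigned)(b*31 + 0.5f) <<  0 );
    }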
 // Load 4 8-bit sRGB pixels from SkPMColor order to RGBA.
-static void SK_VECTORCALL load_d_srgb(SkRasterPipeline::Stage* st, size_t x,
-                                      Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                      Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<const uint32_t*>() + x;
+SK_RASTER_STAGE(load_d_srgb) {
+    auto ptr = (const uint32_t*)ctx + x;
 
     dr = { sk_linear_from_srgb[(ptr[0] >> SK_R32_SHIFT) & 0xff],
            sk_linear_from_srgb[(ptr[1] >> SK_R32_SHIFT) & 0xff],
@@ -298,46 +248,36 @@ static void SK_VECTORCALL load_d_srgb(SkRasterPipeline::Stage* st, size_t x,
            sk_linear_from_srgb[(ptr[3] >> SK_B32_SHIFT) & 0xff] };
 
     da = SkNx_cast<float>(Sk4u::Load(ptr) >> SK_A32_SHIFT) * (1/255.0f);
-
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
 // Tail variant of load_d_srgb() handling 1 pixel at a time.
-static void SK_VECTORCALL load_d_srgb_1(SkRasterPipeline::Stage* st, size_t x,
-                                        Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                        Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto ptr = st->ctx<const uint32_t*>() + x;
+SK_RASTER_STAGE(load_d_srgb_1) {
+    auto ptr = (const uint32_t*)ctx + x;
 
     dr = { sk_linear_from_srgb[(*ptr >> SK_R32_SHIFT) & 0xff], 0,0,0 };
     dg = { sk_linear_from_srgb[(*ptr >> SK_G32_SHIFT) & 0xff], 0,0,0 };
     db = { sk_linear_from_srgb[(*ptr >> SK_B32_SHIFT) & 0xff], 0,0,0 };
     da = { (1/255.0f) * (*ptr >> SK_A32_SHIFT), 0,0,0 };
-
-    st->next(x, r,g,b,a, dr,dg,db,da);
 }
 
 // Write out 4 pixels as 8-bit SkPMColor-order sRGB.
-static void SK_VECTORCALL store_srgb(SkRasterPipeline::Stage* st, size_t x,
-                                     Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                     Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto dst = st->ctx<uint32_t*>() + x;
+SK_RASTER_STAGE(store_srgb) {
+    auto ptr = (uint32_t*)ctx + x;
 
     (          sk_linear_to_srgb_noclamp(r) << SK_R32_SHIFT
      |         sk_linear_to_srgb_noclamp(g) << SK_G32_SHIFT
      |         sk_linear_to_srgb_noclamp(b) << SK_B32_SHIFT
-     | Sk4f_round(255.0f * a)               << SK_A32_SHIFT).store(dst);
+     | Sk4f_round(255.0f * a)               << SK_A32_SHIFT).store(ptr);
 }
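The sRGB stages above are table-driven: sk_linear_from_srgb maps each 8-bit sRGB channel straight to its linear float value, so a gamma decode costs one table lookup per color channel, while alpha (already linear) needs only a multiply by 1/255. A sketch of how such a 256-entry table can be built from the standard sRGB transfer function (an assumption about the table's contents; Skia's actual generator is not part of this diff):

    #include <math.h>

    static float linear_from_srgb[256];   // stand-in for sk_linear_from_srgb

    static void build_table() {
        for (int i = 0; i < 256; i++) {
            float c = i * (1/255.0f);     // normalized 8-bit sRGB code
            linear_from_srgb[i] = c <= 0.04045f ? c * (1/12.92f)
                                                : powf((c + 0.055f) * (1/1.055f), 2.4f);
        }
    }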
 // Tail variant of store_srgb() handling 1 pixel at a time.
-static void SK_VECTORCALL store_srgb_1(SkRasterPipeline::Stage* st, size_t x,
-                                       Sk4f r, Sk4f g, Sk4f b, Sk4f a,
-                                       Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-    auto dst = st->ctx<uint32_t*>() + x;
+SK_RASTER_STAGE(store_srgb_1) {
+    auto ptr = (uint32_t*)ctx + x;
 
     Sk4i rgb = sk_linear_to_srgb_noclamp(swizzle_rb_if_bgra({ r[0], g[0], b[0], 0.0f }));
 
     uint32_t rgba;
     SkNx_cast<uint8_t>(rgb).store(&rgba);
     rgba |= (uint32_t)(255.0f * a[0] + 0.5f) << 24;
-    *dst = rgba;
+    *ptr = rgba;
 }
 
 static bool supported(const SkImageInfo& info) {
@@ -386,10 +326,10 @@ SkBlitter* SkRasterPipelineBlitter::Create(const SkPixmap& dst,
                                  color.premul());
 
     if (!paint.getShader()) {
-        blitter->fShader.append(constant_color, &blitter->fPaintColor);
+        blitter->fShader.append<constant_color>(&blitter->fPaintColor);
     }
     if (!paint.getXfermode()) {
-        blitter->fXfermode.append(srcover);
+        blitter->fXfermode.append<srcover>();
     }
 
     return blitter;
@@ -401,14 +341,14 @@ void SkRasterPipelineBlitter::append_load_d(SkRasterPipeline* p, const void* dst
     switch (fDst.info().colorType()) {
         case kN32_SkColorType:
             if (fDst.info().gammaCloseToSRGB()) {
-                p->append(load_d_srgb, load_d_srgb_1, dst);
+                p->append<load_d_srgb, load_d_srgb_1>(dst);
             }
             break;
         case kRGBA_F16_SkColorType:
-            p->append(load_d_f16, load_d_f16_1, dst);
+            p->append<load_d_f16, load_d_f16_1>(dst);
             break;
         case kRGB_565_SkColorType:
-            p->append(load_d_565, load_d_565_1, dst);
+            p->append<load_d_565, load_d_565_1>(dst);
             break;
         default: break;
     }
@@ -417,18 +357,18 @@ void SkRasterPipelineBlitter::append_store(SkRasterPipeline* p, void* dst) const
     SkASSERT(supported(fDst.info()));
 
-    p->append(clamp_01_premul);
+    p->append<clamp_01_premul>();
     switch (fDst.info().colorType()) {
         case kN32_SkColorType:
             if (fDst.info().gammaCloseToSRGB()) {
-                p->append(store_srgb, store_srgb_1, dst);
+                p->append<store_srgb, store_srgb_1>(dst);
             }
             break;
         case kRGBA_F16_SkColorType:
-            p->append(store_f16, store_f16_1, dst);
+            p->append<store_f16, store_f16_1>(dst);
             break;
         case kRGB_565_SkColorType:
-            p->append(store_565, store_565_1, dst);
+            p->append<store_565, store_565_1>(dst);
             break;
         default: break;
     }
@@ -456,7 +396,7 @@ void SkRasterPipelineBlitter::blitAntiH(int x, int y, const SkAlpha aa[], const
     p.extend(fColorFilter);
     this->append_load_d(&p, dst);
     p.extend(fXfermode);
-    p.append(lerp_constant_float, &coverage);
+    p.append<lerp_constant_float>(&coverage);
     this->append_store(&p, dst);
 
     for (int16_t run = *runs; run > 0; run = *runs) {
@@ -486,10 +426,10 @@ void SkRasterPipelineBlitter::blitMask(const SkMask& mask, const SkIRect& clip)
     p.extend(fXfermode);
     switch (mask.fFormat) {
         case SkMask::kA8_Format:
-            p.append(lerp_a8, lerp_a8_1, mask.getAddr8(x,y)-x);
+            p.append<lerp_a8, lerp_a8_1>(mask.getAddr8(x,y)-x);
             break;
         case SkMask::kLCD16_Format:
-            p.append(lerp_lcd16, lerp_lcd16_1, mask.getAddrLCD16(x,y)-x);
+            p.append<lerp_lcd16, lerp_lcd16_1>(mask.getAddrLCD16(x,y)-x);
             break;
         default: break;
     }
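A design note on the blitMask() hunk above: every memory stage computes its pointer as (T*)ctx + x, so the blitter biases the context by the starting column, passing mask.getAddr8(x,y) - x. The stage's own x then cancels the bias:

    // stage:    auto ptr = (const uint8_t*)ctx + x;
    // blitter:  ctx = mask.getAddr8(x,y) - x;
    // => ptr == mask.getAddr8(x,y) at the first pixel, and tracks x from there.

This keeps the stage signature down to one context pointer plus x, with no separate per-run offset threaded through the pipeline.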
diff --git a/tests/SkRasterPipelineTest.cpp b/tests/SkRasterPipelineTest.cpp
index beb517b3f7..29fe59d31b 100644
--- a/tests/SkRasterPipelineTest.cpp
+++ b/tests/SkRasterPipelineTest.cpp
@@ -9,55 +9,40 @@
 #include "SkRasterPipeline.h"
 
 // load needs two variants, one to load 4 values...
-static void SK_VECTORCALL load(SkRasterPipeline::Stage* st, size_t x,
-                               Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
-                               Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
-    auto ptr = st->ctx<const float*>();
-    v0 = Sk4f{ptr[x+0]};
-    v1 = Sk4f{ptr[x+1]};
-    v2 = Sk4f{ptr[x+2]};
-    v3 = Sk4f{ptr[x+3]};
-
-    st->next(x, v0,v1,v2,v3, v4,v5,v6,v7);
+SK_RASTER_STAGE(load) {
+    auto ptr = (const float*)ctx + x;
+    r = Sk4f{ptr[0]};
+    g = Sk4f{ptr[1]};
+    b = Sk4f{ptr[2]};
+    a = Sk4f{ptr[3]};
 }
 
 // ...and one to load a single value.
-static void SK_VECTORCALL load_tail(SkRasterPipeline::Stage* st, size_t x,
-                                    Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
-                                    Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
-    auto ptr = st->ctx<const float*>();
-    v0 = Sk4f{ptr[x]};
-
-    st->next(x, v0,v1,v2,v3, v4,v5,v6,v7);
+SK_RASTER_STAGE(load_tail) {
+    auto ptr = (const float*)ctx + x;
+    r = Sk4f{*ptr};
 }
 
 // square doesn't really care how many of its inputs are active, nor does it need a context.
-static void SK_VECTORCALL square(SkRasterPipeline::Stage* st, size_t x,
-                                 Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
-                                 Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
-    v0 *= v0;
-    v1 *= v1;
-    v2 *= v2;
-    v3 *= v3;
-    st->next(x, v0,v1,v2,v3, v4,v5,v6,v7);
+SK_RASTER_STAGE(square) {
+    r *= r;
+    g *= g;
+    b *= b;
+    a *= a;
 }
 
-// Like load, store has a _tail variant. It ends the pipeline by returning.
-static void SK_VECTORCALL store(SkRasterPipeline::Stage* st, size_t x,
-                                Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
-                                Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
-    auto ptr = st->ctx<float*>();
-    ptr[x+0] = v0[0];
-    ptr[x+1] = v1[0];
-    ptr[x+2] = v2[0];
-    ptr[x+3] = v3[0];
+// Like load, store has a _tail variant.
+SK_RASTER_STAGE(store) {
+    auto ptr = (float*)ctx + x;
+    ptr[0] = r[0];
+    ptr[1] = g[0];
+    ptr[2] = b[0];
+    ptr[3] = a[0];
 }
 
-static void SK_VECTORCALL store_tail(SkRasterPipeline::Stage* st, size_t x,
-                                     Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
-                                     Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
-    auto ptr = st->ctx<float*>();
-    ptr[x+0] = v0[0];
+SK_RASTER_STAGE(store_tail) {
+    auto ptr = (float*)ctx + x;
+    *ptr = r[0];
 }
 
 DEF_TEST(SkRasterPipeline, r) {
@@ -73,9 +58,9 @@ DEF_TEST(SkRasterPipeline, r) {
     float dst_vals[] = { 0,0,0,0,0 };
 
     SkRasterPipeline p;
-    p.append(load, load_tail, src_vals);
-    p.append(square);
-    p.append(store, store_tail, dst_vals);
+    p.append<load, load_tail>(src_vals);
+    p.append<square>();
+    p.append<store, store_tail>(dst_vals);
 
     p.run(5);
 
@@ -96,6 +81,6 @@ DEF_TEST(SkRasterPipeline_nonsense, r) {
     // No asserts... just a test that this is safe to run and terminates.
     // square() always calls st->next(); this makes sure we've always got something there to call.
     SkRasterPipeline p;
-    p.append(square);
+    p.append<square>();
     p.run(20);
 }
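The test uses 5 values deliberately: run(5) must take the 4-wide body path once and the 1-wide tail path once. A sketch of the dispatch run(n) performs, inferred from the fused bench loop above rather than from run()'s own source (which is outside this diff):

    size_t x = 0;
    while (n >= 4) { body(x); x += 4; n -= 4; }   // here: x = 0, load/square/store 4 floats
    while (n >  0) { tail(x); x += 1; n -= 1; }   // here: x = 4, the one leftover float

(One context line above is now stale: the comment saying square() always calls st->next() describes the old scheme; with EasyFns it is the Easy<> wrapper, not the stage itself, that calls next().)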