1 files changed, 52 insertions, 61 deletions
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 03fab998e7..7e934f1731 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -26,13 +26,16 @@
  * are designed to maximize the amount of data we can pass along the pipeline cheaply.
  * On many machines all arguments stay in registers the entire time.
  *
- * The meaning of the arguments to Fn are sometimes fixed...
+ * The meanings of the arguments to Fn are sometimes fixed:
  *    - The Stage* always represents the current stage, mainly providing access to ctx().
- *    - The size_t is always the destination x coordinate.  If you need y, put it in your context.
+ *    - The first size_t is always the destination x coordinate.
+ *      (If you need y, put it in your context.)
+ *    - The second size_t is always tail: 0 when working on a full 4-pixel slab,
+ *      or 1..3 when using only the bottom 1..3 lanes of each register.
  *    - By the time the shader's done, the first four vectors should hold source red,
  *      green, blue, and alpha, up to 4 pixels' worth each.
  *
- * ...and sometimes flexible:
+ * Sometimes arguments are flexible:
  *    - In the shader, the first four vectors can be used for anything, e.g. sample coordinates.
  *    - The last four vectors are scratch registers that can be used to communicate between
  *      stages; transfer modes use these to hold the original destination pixel components.
@@ -43,7 +46,7 @@
  *   1) call st->next() with its mutated arguments, chaining to the next stage of the pipeline; or
  *   2) return, indicating the pipeline is complete for these pixels.
  *
- * Some obvious stages that typically return are those that write a color to a destination pointer,
+ * Stages that typically return are those that write a color to a destination pointer,
  * but any stage can short-circuit the rest of the pipeline by returning instead of calling next().
  *
  * Most simple pipeline stages can use the SK_RASTER_STAGE macro to define a static EasyFn,
@@ -52,26 +55,29 @@
  *    - instead of manually calling a next() function, just modify registers in place.
  *
  * To add an EasyFn stage to the pipeline, call append<fn>() instead of append(&fn).
- * For the last stage of a pipeline, it's a slight performance benefit to call last<fn>().
+ * It's a slight performance benefit to call last<fn>() for the last stage of a pipeline.
  */
 
+// TODO: There may be a better place to stuff tail, e.g. in the bottom alignment bits of
+// the Stage*.  This mostly matters on 64-bit Windows where every register is precious.
+
 class SkRasterPipeline {
 public:
     struct Stage;
-    using Fn = void(SK_VECTORCALL *)(Stage*, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
-                                                     Sk4f,Sk4f,Sk4f,Sk4f);
-    using EasyFn = void(void*, size_t, Sk4f&, Sk4f&, Sk4f&, Sk4f&,
-                                       Sk4f&, Sk4f&, Sk4f&, Sk4f&);
+    using Fn = void(SK_VECTORCALL *)(Stage*, size_t, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
+                                                             Sk4f,Sk4f,Sk4f,Sk4f);
+    using EasyFn = void(void*, size_t, size_t, Sk4f&, Sk4f&, Sk4f&, Sk4f&,
+                                               Sk4f&, Sk4f&, Sk4f&, Sk4f&);
 
     struct Stage {
         template <typename T>
         T ctx() { return static_cast<T>(fCtx); }
 
-        void SK_VECTORCALL next(size_t x, Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
-                                          Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
+        void SK_VECTORCALL next(size_t x, size_t tail, Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
+                                                       Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
             // Stages are logically a pipeline, and physically are contiguous in an array.
             // To get to the next stage, we just increment our pointer to the next array element.
-            fNext(this+1, x, v0,v1,v2,v3, v4,v5,v6,v7);
+            fNext(this+1, x,tail, v0,v1,v2,v3, v4,v5,v6,v7);
         }
 
         // It makes next() a good bit cheaper if we hold the next function to call here,
@@ -84,50 +90,26 @@ public:
     SkRasterPipeline();
 
     // Run the pipeline constructed with append(), walking x through [x,x+n),
-    // generally in 4 pixel steps, but sometimes 1 pixel at a time.
+    // generally in 4-pixel steps, with perhaps one jagged tail step.
     void run(size_t x, size_t n);
     void run(size_t n) { this->run(0, n); }
 
-    // Use this append() if your stage is sensitive to the number of pixels you're working with:
-    //   - body will always be called for a full 4 pixels
-    //   - tail will always be called for a single pixel
-    // Typically this is only an essential distintion for stages that read or write memory.
-    void append(Fn body, const void* body_ctx,
-                Fn tail, const void* tail_ctx);
-
-    // Most stages don't actually care if they're working on 4 or 1 pixel.
-    void append(Fn fn, const void* ctx = nullptr) {
-        this->append(fn, ctx, fn, ctx);
-    }
+    // body() will only be called with tail=0, indicating it always works on a full 4 pixels.
+    // tail() will only be called with tail=1..3 to handle the jagged end of n%4 pixels.
+    void append(Fn body, Fn tail, const void* ctx = nullptr);
+    void append(Fn fn, const void* ctx = nullptr) { this->append(fn, fn, ctx); }
 
-    // Most 4 pixel or 1 pixel variants share the same context pointer.
-    void append(Fn body, Fn tail, const void* ctx = nullptr) {
-        this->append(body, ctx, tail, ctx);
-    }
-
-
-    // Versions of append that can be used with static EasyFns (see SK_RASTER_STAGE).
-    template <EasyFn body, EasyFn tail>
-    void append(const void* body_ctx, const void* tail_ctx) {
-        this->append(Easy<body>, body_ctx,
-                     Easy<tail>, tail_ctx);
-    }
-    template <EasyFn body, EasyFn tail>
-    void last(const void* body_ctx, const void* tail_ctx) {
-        this->append(Last<body>, body_ctx,
-                     Last<tail>, tail_ctx);
+    // Version of append() that can be used with a static EasyFn (see SK_RASTER_STAGE).
+    template <EasyFn fn>
+    void append(const void* ctx = nullptr) {
+        this->append(Body<fn,true>, Tail<fn,true>, ctx);
     }
 
+    // If this is the last stage of the pipeline, last() is a bit faster than append().
     template <EasyFn fn>
-    void append(const void* ctx = nullptr) { this->append<fn, fn>(ctx, ctx); }
-    template <EasyFn fn>
-    void last(const void* ctx = nullptr) { this->last<fn, fn>(ctx, ctx); }
-
-    template <EasyFn body, EasyFn tail>
-    void append(const void* ctx = nullptr) { this->append<body, tail>(ctx, ctx); }
-    template <EasyFn body, EasyFn tail>
-    void last(const void* ctx = nullptr) { this->last<body, tail>(ctx, ctx); }
-
+    void last(const void* ctx = nullptr) {
+        this->append(Body<fn,false>, Tail<fn,false>, ctx);
+    }
 
     // Append all stages to this pipeline.
     void extend(const SkRasterPipeline&);
@@ -138,22 +120,31 @@ private:
     // This no-op default makes fBodyStart and fTailStart unconditionally safe to call,
     // and is always the last stage's fNext as a sort of safety net to make sure even a
     // buggy pipeline can't walk off its own end.
-    static void SK_VECTORCALL JustReturn(Stage*, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
-                                                         Sk4f,Sk4f,Sk4f,Sk4f);
+    static void SK_VECTORCALL JustReturn(Stage*, size_t, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
+                                                                 Sk4f,Sk4f,Sk4f,Sk4f);
 
-    template <EasyFn kernel>
-    static void SK_VECTORCALL Easy(SkRasterPipeline::Stage* st, size_t x,
+    template <EasyFn kernel, bool kCallNext>
+    static void SK_VECTORCALL Body(SkRasterPipeline::Stage* st, size_t x, size_t tail,
                                    Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
                                    Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-        kernel(st->ctx<void*>(), x, r,g,b,a, dr,dg,db,da);
-        st->next(x, r,g,b,a, dr,dg,db,da);
+        // Passing 0 lets the optimizer completely drop any "if (tail) {...}" code in kernel.
+        kernel(st->ctx<void*>(), x,0, r,g,b,a, dr,dg,db,da);
+        if (kCallNext) {
+            st->next(x,tail, r,g,b,a, dr,dg,db,da);  // It's faster to pass tail here than 0.
+        }
     }
 
-    template <EasyFn kernel>
-    static void SK_VECTORCALL Last(SkRasterPipeline::Stage* st, size_t x,
+    template <EasyFn kernel, bool kCallNext>
+    static void SK_VECTORCALL Tail(SkRasterPipeline::Stage* st, size_t x, size_t tail,
                                    Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
                                    Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-        kernel(st->ctx<void*>(), x, r,g,b,a, dr,dg,db,da);
+    #if defined(__clang__)
+        __builtin_assume(tail > 0);  // This flourish lets Clang compile away any tail==0 code.
+    #endif
+        kernel(st->ctx<void*>(), x,tail, r,g,b,a, dr,dg,db,da);
+        if (kCallNext) {
+            st->next(x,tail, r,g,b,a, dr,dg,db,da);
+        }
     }
 
     Stages fBody,
@@ -164,9 +155,9 @@ private:
 
 // These are always static, and we _really_ want them to inline.
 // If you find yourself wanting a non-inline stage, write a SkRasterPipeline::Fn directly.
-#define SK_RASTER_STAGE(name)                                       \
-    static SK_ALWAYS_INLINE void name(void* ctx, size_t x,          \
-                            Sk4f&  r, Sk4f&  g, Sk4f&  b, Sk4f&  a, \
+#define SK_RASTER_STAGE(name)                                           \
+    static SK_ALWAYS_INLINE void name(void* ctx, size_t x, size_t tail, \
+                            Sk4f&  r, Sk4f&  g, Sk4f&  b, Sk4f&  a,     \
                             Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da)
 
 #endif//SkRasterPipeline_DEFINED