diff options
author | 2016-09-28 10:43:53 -0400 | |
---|---|---|
committer | 2016-09-28 15:28:24 +0000 | |
commit | c8dd6bc3e7a4b01c848ba15b808ea6ffdf249b06 (patch) | |
tree | e58beeb3fd18659cf8ebfa11c11349c17675a46c /src/core/SkRasterPipeline.cpp | |
parent | b37eb0e968c5082e021244d4baf9b7721e3f360a (diff) |
Rearrange SkRasterPipeline scanline tail handling.
We used to step at a 4-pixel stride as long as possible, then run up to 3 times, one pixel at a time. Now replace those 1-at-a-time runs with a single tail stamp if there are 1-3 remaining pixels.
This style is simply more efficient: e.g. we'll blend and lerp once for 3 pixels instead of 3 times. This should make short blits significantly more efficient. It's also more future-oriented... AVX+ on Intel and SVE on ARM support masked loads and stores, so we can do the entire tail in one direct step.
This also makes it possible to re-arrange the code a bit to encapsulate each stage better. I think generally this code reads more clearly than the old code, but YMMV. I've arranged things so you write one function, but it's compiled into two specializations, one for tail=0 (Body) and one for tail>0 (Tail). It's pretty tidy.
For now I've just burned a register to pass around tail. It's 2 bits now, maybe soon 3 with AVX, and capped at 4 for even the craziest new toys, so there are plenty of places we can pack it if we want to get clever.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2717
Change-Id: I45852a3e5d4c5b5e9315302c46601aee0d32265f
Reviewed-on: https://skia-review.googlesource.com/2717
Reviewed-by: Mike Reed <reed@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/core/SkRasterPipeline.cpp')
-rw-r--r-- | src/core/SkRasterPipeline.cpp | 37 |
1 file changed, 18 insertions, 19 deletions
diff --git a/src/core/SkRasterPipeline.cpp b/src/core/SkRasterPipeline.cpp
index c50383af01..71c993dfb8 100644
--- a/src/core/SkRasterPipeline.cpp
+++ b/src/core/SkRasterPipeline.cpp
@@ -9,29 +9,30 @@
 SkRasterPipeline::SkRasterPipeline() {}
 
-void SkRasterPipeline::append(SkRasterPipeline::Fn body_fn, const void* body_ctx,
-                              SkRasterPipeline::Fn tail_fn, const void* tail_ctx) {
+void SkRasterPipeline::append(SkRasterPipeline::Fn body,
+                              SkRasterPipeline::Fn tail,
+                              const void* ctx) {
     // Each stage holds its own context and the next function to call.
     // So the pipeline itself has to hold onto the first function that starts the pipeline.
-    (fBody.empty() ? fBodyStart : fBody.back().fNext) = body_fn;
-    (fTail.empty() ? fTailStart : fTail.back().fNext) = tail_fn;
+    (fBody.empty() ? fBodyStart : fBody.back().fNext) = body;
+    (fTail.empty() ? fTailStart : fTail.back().fNext) = tail;
 
     // Each last stage starts with its next function set to JustReturn as a safety net.
     // It'll be overwritten by the next call to append().
-    fBody.push_back({ &JustReturn, const_cast<void*>(body_ctx) });
-    fTail.push_back({ &JustReturn, const_cast<void*>(tail_ctx) });
+    fBody.push_back({ &JustReturn, const_cast<void*>(ctx) });
+    fTail.push_back({ &JustReturn, const_cast<void*>(ctx) });
 }
 
 void SkRasterPipeline::extend(const SkRasterPipeline& src) {
     SkASSERT(src.fBody.count() == src.fTail.count());
 
-    Fn body_fn = src.fBodyStart,
-       tail_fn = src.fTailStart;
+    Fn body = src.fBodyStart,
+       tail = src.fTailStart;
     for (int i = 0; i < src.fBody.count(); i++) {
-        this->append(body_fn, src.fBody[i].fCtx,
-                     tail_fn, src.fTail[i].fCtx);
-        body_fn = src.fBody[i].fNext;
-        tail_fn = src.fTail[i].fNext;
+        SkASSERT(src.fBody[i].fCtx == src.fTail[i].fCtx);
+        this->append(body, tail, src.fBody[i].fCtx);
+        body = src.fBody[i].fNext;
+        tail = src.fTail[i].fNext;
     }
 }
 
@@ -40,16 +41,14 @@ void SkRasterPipeline::run(size_t x, size_t n) {
     Sk4f v;
 
     while (n >= 4) {
-        fBodyStart(fBody.begin(), x, v,v,v,v, v,v,v,v);
+        fBodyStart(fBody.begin(), x,0, v,v,v,v, v,v,v,v);
         x += 4;
         n -= 4;
     }
-    while (n > 0) {
-        fTailStart(fTail.begin(), x, v,v,v,v, v,v,v,v);
-        x += 1;
-        n -= 1;
+    if (n > 0) {
+        fTailStart(fTail.begin(), x,n, v,v,v,v, v,v,v,v);
     }
 }
 
-void SK_VECTORCALL SkRasterPipeline::JustReturn(Stage*, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
-                                                Sk4f,Sk4f,Sk4f,Sk4f) {}
+void SK_VECTORCALL SkRasterPipeline::JustReturn(Stage*, size_t, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
+                                                Sk4f,Sk4f,Sk4f,Sk4f) {}