From c8dd6bc3e7a4b01c848ba15b808ea6ffdf249b06 Mon Sep 17 00:00:00 2001
From: Mike Klein <mtklein@chromium.org>
Date: Wed, 28 Sep 2016 10:43:53 -0400
Subject: Rearrange SkRasterPipeline scanline tail handling.

We used to step at a 4-pixel stride as long as possible, then run up to 3 more times, one pixel at a time.  Now we replace those 1-at-a-time runs with a single tail stamp when there are 1-3 remaining pixels.

This style is simply more efficient: e.g. we'll blend and lerp once for 3 pixels instead of 3 times.  This should make short blits significantly more efficient.  It's also more future-oriented... AVX+ on Intel and SVE on ARM support masked loads and stores, so we can do the entire tail in one direct step.

This also makes it possible to re-arrange the code a bit to encapsulate each stage better.  I think generally this code reads more clearly than the old code, but YMMV.  I've arranged things so you write one function, but it's compiled into two specializations, one for tail=0 (Body) and one for tail>0 (Tail).  It's pretty tidy.

For now I've just burned a register to pass around tail.  It's 2 bits now, maybe soon 3 bits with AVX, and capped at 4 bits for even the craziest new toys, so there are plenty of places we can pack it if we want to get clever.

BUG=skia:

GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2717

Change-Id: I45852a3e5d4c5b5e9315302c46601aee0d32265f
Reviewed-on: https://skia-review.googlesource.com/2717
Reviewed-by: Mike Reed <reed@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
---
 bench/SkRasterPipelineBench.cpp | 135 +++++++++++++++++++++++-----------------
 1 file changed, 78 insertions(+), 57 deletions(-)

(limited to 'bench/SkRasterPipelineBench.cpp')
diff --git a/bench/SkRasterPipelineBench.cpp b/bench/SkRasterPipelineBench.cpp
index 5ef64ce603..aa3e355e45 100644
--- a/bench/SkRasterPipelineBench.cpp
+++ b/bench/SkRasterPipelineBench.cpp
@@ -21,11 +21,28 @@ static uint8_t mask[N];
 //   - load srgb dst
 //   - src = srcover(dst, src)
 //   - store src back as srgb
-// Every stage except for srcover interacts with memory, and so will need _tail variants.
 
 SK_RASTER_STAGE(load_s_srgb) {
     auto ptr = (const uint32_t*)ctx + x;
 
+    if (tail) {
+        float rs[] = {0,0,0,0},
+              gs[] = {0,0,0,0},
+              bs[] = {0,0,0,0},
+              as[] = {0,0,0,0};
+        for (size_t i = 0; i < (tail&3); i++) {
+            rs[i] = sk_linear_from_srgb[(ptr[i] >>  0) & 0xff];
+            gs[i] = sk_linear_from_srgb[(ptr[i] >>  8) & 0xff];
+            bs[i] = sk_linear_from_srgb[(ptr[i] >> 16) & 0xff];
+            as[i] = (ptr[i] >> 24) * (1/255.0f);
+        }
+        r = Sk4f::Load(rs);
+        g = Sk4f::Load(gs);
+        b = Sk4f::Load(bs);
+        a = Sk4f::Load(as);
+        return;
+    }
+
     r = Sk4f{ sk_linear_from_srgb[(ptr[0] >>  0) & 0xff],
               sk_linear_from_srgb[(ptr[1] >>  0) & 0xff],
               sk_linear_from_srgb[(ptr[2] >>  0) & 0xff],
@@ -40,22 +57,30 @@ SK_RASTER_STAGE(load_s_srgb) {
               sk_linear_from_srgb[(ptr[1] >> 16) & 0xff],
               sk_linear_from_srgb[(ptr[2] >> 16) & 0xff],
               sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] };
-
     a = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);
 }
 
-SK_RASTER_STAGE(load_s_srgb_tail) {
-    auto ptr = (const uint32_t*)ctx + x;
-
-    r = Sk4f{ sk_linear_from_srgb[(*ptr >>  0) & 0xff], 0,0,0 };
-    g = Sk4f{ sk_linear_from_srgb[(*ptr >>  8) & 0xff], 0,0,0 };
-    b = Sk4f{ sk_linear_from_srgb[(*ptr >> 16) & 0xff], 0,0,0 };
-    a = Sk4f{                (*ptr >> 24) * (1/255.0f), 0,0,0 };
-}
-
 SK_RASTER_STAGE(load_d_srgb) {
     auto ptr = (const uint32_t*)ctx + x;
 
+    if (tail) {
+        float rs[] = {0,0,0,0},
+              gs[] = {0,0,0,0},
+              bs[] = {0,0,0,0},
+              as[] = {0,0,0,0};
+        for (size_t i = 0; i < (tail&3); i++) {
+            rs[i] = sk_linear_from_srgb[(ptr[i] >>  0) & 0xff];
+            gs[i] = sk_linear_from_srgb[(ptr[i] >>  8) & 0xff];
+            bs[i] = sk_linear_from_srgb[(ptr[i] >> 16) & 0xff];
+            as[i] = (ptr[i] >> 24) * (1/255.0f);
+        }
+        dr = Sk4f::Load(rs);
+        dg = Sk4f::Load(gs);
+        db = Sk4f::Load(bs);
+        da = Sk4f::Load(as);
+        return;
+    }
+
     dr = Sk4f{ sk_linear_from_srgb[(ptr[0] >>  0) & 0xff],
                sk_linear_from_srgb[(ptr[1] >>  0) & 0xff],
                sk_linear_from_srgb[(ptr[2] >>  0) & 0xff],
@@ -74,29 +99,24 @@ SK_RASTER_STAGE(load_d_srgb) {
     da = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);
 }
 
-SK_RASTER_STAGE(load_d_srgb_tail) {
-    auto ptr = (const uint32_t*)ctx + x;
-
-    dr = Sk4f{ sk_linear_from_srgb[(*ptr >>  0) & 0xff], 0,0,0 };
-    dg = Sk4f{ sk_linear_from_srgb[(*ptr >>  8) & 0xff], 0,0,0 };
-    db = Sk4f{ sk_linear_from_srgb[(*ptr >> 16) & 0xff], 0,0,0 };
-    da = Sk4f{                (*ptr >> 24) * (1/255.0f), 0,0,0 };
-}
-
 SK_RASTER_STAGE(scale_u8) {
     auto ptr = (const uint8_t*)ctx + x;
 
-    auto c = SkNx_cast<float>(Sk4b::Load(ptr)) * (1/255.0f);
-    r *= c;
-    g *= c;
-    b *= c;
-    a *= c;
-}
+    Sk4b cov;
 
-SK_RASTER_STAGE(scale_u8_tail) {
-    auto ptr = (const uint8_t*)ctx + x;
+    if (tail) {
+        uint8_t cs[] = {0,0,0,0};
+        switch (tail&3) {
+            case 3: cs[2] = ptr[2];
+            case 2: cs[1] = ptr[1];
+            case 1: cs[0] = ptr[0];
+        }
+        cov = Sk4b::Load(cs);
+    } else {
+        cov = Sk4b::Load(ptr);
+    }
 
-    auto c = *ptr * (1/255.0f);
+    auto c = SkNx_cast<float>(cov) * (1/255.0f);
     r *= c;
     g *= c;
     b *= c;
@@ -114,19 +134,24 @@ SK_RASTER_STAGE(srcover) {
 SK_RASTER_STAGE(store_srgb) {
     auto ptr = (uint32_t*)ctx + x;
 
+    uint32_t* dst = nullptr;
+    uint32_t stack[4];
+
+    if (tail) {
+        dst = ptr;
+        ptr = stack;
+    }
+
     ( sk_linear_to_srgb(r)
     | sk_linear_to_srgb(g) << 8
     | sk_linear_to_srgb(b) << 16
     | Sk4f_round(255.0f*a) << 24).store(ptr);
-}
-
-SK_RASTER_STAGE(store_srgb_tail) {
-    auto ptr = (uint32_t*)ctx + x;
-
-    Sk4i rgba = sk_linear_to_srgb({r[0], g[0], b[0], 0});
-    rgba = {rgba[0], rgba[1], rgba[2], (int)(255.0f * a[0] + 0.5f)};
 
-    SkNx_cast<uint8_t>(rgba).store(ptr);
+    switch (tail&3) {
+        case 3: dst[2] = ptr[2];
+        case 2: dst[1] = ptr[1];
+        case 1: dst[0] = ptr[0];
+    }
 }
 
 class SkRasterPipelineBench : public Benchmark {
@@ -147,35 +172,31 @@ public:
         Sk4f r,g,b,a, dr,dg,db,da;
         size_t x = 0, n = N;
         while (n >= 4) {
-            load_s_srgb(src    , x, r,g,b,a, dr,dg,db,da);
-            scale_u8   (mask   , x, r,g,b,a, dr,dg,da,da);
-            load_d_srgb(dst    , x, r,g,b,a, dr,dg,da,da);
-            srcover    (nullptr, x, r,g,b,a, dr,dg,da,da);
-            store_srgb (dst    , x, r,g,b,a, dr,dg,da,da);
+            load_s_srgb(src    , x,0, r,g,b,a, dr,dg,db,da);
+            scale_u8   (mask   , x,0, r,g,b,a, dr,dg,da,da);
+            load_d_srgb(dst    , x,0, r,g,b,a, dr,dg,da,da);
+            srcover    (nullptr, x,0, r,g,b,a, dr,dg,da,da);
+            store_srgb (dst    , x,0, r,g,b,a, dr,dg,da,da);
 
             x += 4;
             n -= 4;
         }
-        while (n > 0) {
-            load_s_srgb_tail(src    , x, r,g,b,a, dr,dg,db,da);
-            scale_u8_tail   (mask   , x, r,g,b,a, dr,dg,da,da);
-            load_d_srgb_tail(dst    , x, r,g,b,a, dr,dg,da,da);
-            srcover         (nullptr, x, r,g,b,a, dr,dg,da,da);
-            store_srgb_tail (dst    , x, r,g,b,a, dr,dg,da,da);
-
-            x += 1;
-            n -= 1;
+        if (n > 0) {
+            load_s_srgb(src    , x,n, r,g,b,a, dr,dg,db,da);
+            scale_u8   (mask   , x,n, r,g,b,a, dr,dg,da,da);
+            load_d_srgb(dst    , x,n, r,g,b,a, dr,dg,da,da);
+            srcover    (nullptr, x,n, r,g,b,a, dr,dg,da,da);
+            store_srgb (dst    , x,n, r,g,b,a, dr,dg,da,da);
         }
     }
 
     void runPipeline() {
         SkRasterPipeline p;
-        p.append<load_s_srgb, load_s_srgb_tail>( src);
-        p.append<   scale_u8,    scale_u8_tail>(mask);
-        p.append<load_d_srgb, load_d_srgb_tail>( dst);
-        p.append<srcover>();
-        p.last< store_srgb,  store_srgb_tail>( dst);
-
+        p.append<load_s_srgb>(src);
+        p.append<   scale_u8>(mask);
+        p.append<load_d_srgb>(dst);
+        p.append<    srcover>();
+        p.last  < store_srgb>(dst);
         p.run(N);
     }
 
-- 
cgit v1.2.3