From c8dd6bc3e7a4b01c848ba15b808ea6ffdf249b06 Mon Sep 17 00:00:00 2001 From: Mike Klein Date: Wed, 28 Sep 2016 10:43:53 -0400 Subject: Rearrange SkRasterPipeline scanline tail handling. We used to step at a 4-pixel stride as long as possible, then run up to 3 times, one pixel at a time. Now replace those 1-at-a-time runs with a single tail stamp if there are 1-3 remaining pixels. This style is simply more efficient: e.g. we'll blend and lerp once for 3 pixels instead of 3 times. This should make short blits significantly more efficient. It's also more future-oriented... AVX+ on Intel and SVE on ARM support masked loads and stores, so we can do the entire tail in one direct step. This also makes it possible to re-arrange the code a bit to encapsulate each stage better. I think generally this code reads more clearly than the old code, but YMMV. I've arranged things so you write one function, but it's compiled into two specializations, one for tail=0 (Body) and one for tail>0 (Tail). It's pretty tidy. For now I've just burned a register to pass around tail. It's 2 bits now, maybe soon 3 with AVX, and capped at 4 for even the craziest new toys, so there are plenty of places we can pack it if we want to get clever. BUG=skia: GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2717 Change-Id: I45852a3e5d4c5b5e9315302c46601aee0d32265f Reviewed-on: https://skia-review.googlesource.com/2717 Reviewed-by: Mike Reed Commit-Queue: Mike Klein --- tests/SkRasterPipelineTest.cpp | 44 ++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) (limited to 'tests/SkRasterPipelineTest.cpp') diff --git a/tests/SkRasterPipelineTest.cpp b/tests/SkRasterPipelineTest.cpp index 29fe59d31b..867baf7918 100644 --- a/tests/SkRasterPipelineTest.cpp +++ b/tests/SkRasterPipelineTest.cpp @@ -8,22 +8,16 @@ #include "Test.h" #include "SkRasterPipeline.h" -// load needs two variants, one to load 4 values... 
SK_RASTER_STAGE(load) { auto ptr = (const float*)ctx + x; - r = Sk4f{ptr[0]}; - g = Sk4f{ptr[1]}; - b = Sk4f{ptr[2]}; - a = Sk4f{ptr[3]}; + switch(tail&3) { + case 0: a = Sk4f{ptr[3]}; + case 3: b = Sk4f{ptr[2]}; + case 2: g = Sk4f{ptr[1]}; + case 1: r = Sk4f{ptr[0]}; + } } -// ...and one to load a single value. -SK_RASTER_STAGE(load_tail) { - auto ptr = (const float*)ctx + x; - r = Sk4f{*ptr}; -} - -// square doesn't really care how many of its inputs are active, nor does it need a context. SK_RASTER_STAGE(square) { r *= r; g *= g; @@ -31,26 +25,22 @@ SK_RASTER_STAGE(square) { a *= a; } -// Like load, store has a _tail variant. SK_RASTER_STAGE(store) { auto ptr = (float*)ctx + x; - ptr[0] = r[0]; - ptr[1] = g[0]; - ptr[2] = b[0]; - ptr[3] = a[0]; -} - -SK_RASTER_STAGE(store_tail) { - auto ptr = (float*)ctx + x; - *ptr = r[0]; + switch (tail&3) { + case 0: ptr[3] = a[0]; + case 3: ptr[2] = b[0]; + case 2: ptr[1] = g[0]; + case 1: ptr[0] = r[0]; + } } DEF_TEST(SkRasterPipeline, r) { // We'll build up and run a simple pipeline that exercises the salient // mechanics of SkRasterPipeline: - // - context pointers - // - stages sensitive to the number of pixels - // - stages insensitive to the number of pixels + // - context pointers (load,store) + // - stages sensitive to the number of pixels (load,store) + // - stages insensitive to the number of pixels (square) // // This pipeline loads up some values, squares them, then writes them back to memory. @@ -58,9 +48,9 @@ DEF_TEST(SkRasterPipeline, r) { float dst_vals[] = { 0,0,0,0,0 }; SkRasterPipeline p; - p.append<load, load_tail>(src_vals); + p.append<load>(src_vals); p.append<square>(); - p.append<store, store_tail>(dst_vals); + p.append<store>(dst_vals); p.run(5); -- cgit v1.2.3