author    Mike Klein <mtklein@chromium.org>    2016-09-28 10:43:53 -0400
committer Skia Commit-Bot <skia-commit-bot@chromium.org>    2016-09-28 15:28:24 +0000
commitc8dd6bc3e7a4b01c848ba15b808ea6ffdf249b06 (patch)
treee58beeb3fd18659cf8ebfa11c11349c17675a46c /bench
parentb37eb0e968c5082e021244d4baf9b7721e3f360a (diff)
Rearrange SkRasterPipeline scanline tail handling.
We used to step at a 4-pixel stride as long as possible, then run up to 3
times, one pixel at a time. Now replace those 1-at-a-time runs with a single
tail stamp if there are 1-3 remaining pixels. This style is simply more
efficient: e.g. we'll blend and lerp once for 3 pixels instead of 3 times.
This should make short blits significantly more efficient.

It's also more future-oriented... AVX+ on Intel and SVE on ARM support masked
loads and stores, so we can do the entire tail in one direct step.

This also makes it possible to re-arrange the code a bit to encapsulate each
stage better. I think generally this code reads more clearly than the old
code, but YMMV.

I've arranged things so you write one function, but it's compiled into two
specializations, one for tail=0 (Body) and one for tail>0 (Tail). It's pretty
tidy.

For now I've just burned a register to pass around tail. It's 2 bits now,
maybe soon 3 with AVX, and capped at 4 for even the craziest new toys, so
there are plenty of places we can pack it if we want to get clever.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2717
Change-Id: I45852a3e5d4c5b5e9315302c46601aee0d32265f
Reviewed-on: https://skia-review.googlesource.com/2717
Reviewed-by: Mike Reed <reed@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
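[Editor's sketch] The "write one function, get Body and Tail" idea above can
be illustrated with a template parameter. This is a minimal sketch of the
mechanism, not Skia's actual plumbing; StageFn, kBody, and kTail are
hypothetical names:

    #include <cstddef>

    // One stage, written once. `tail` is 0 on the full 4-pixel stride,
    // or 1-3 when only that many pixels remain at the end of a scanline.
    template <bool kIsTail>
    static void stage(const void* ctx, size_t x, size_t tail) {
        if (kIsTail) {
            // Touch only the first `tail` lanes, e.g. by loading through a
            // zeroed stack buffer, or storing to one and copying out.
        } else {
            // Hot path: kIsTail is a compile-time false, so the optimizer
            // strips every trace of tail handling from this instantiation.
        }
        (void)ctx; (void)x; (void)tail;
    }

    // The same source yields two specializations the pipeline can call:
    using StageFn = void(*)(const void*, size_t, size_t);
    static const StageFn kBody = stage<false>;  // tail == 0
    static const StageFn kTail = stage<true>;   // tail  > 0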
Diffstat (limited to 'bench')
-rw-r--r--    bench/SkRasterPipelineBench.cpp    135
1 file changed, 78 insertions, 57 deletions
diff --git a/bench/SkRasterPipelineBench.cpp b/bench/SkRasterPipelineBench.cpp
index 5ef64ce603..aa3e355e45 100644
--- a/bench/SkRasterPipelineBench.cpp
+++ b/bench/SkRasterPipelineBench.cpp
@@ -21,11 +21,28 @@ static uint8_t mask[N];
// - load srgb dst
// - src = srcover(dst, src)
// - store src back as srgb
-// Every stage except for srcover interacts with memory, and so will need _tail variants.
SK_RASTER_STAGE(load_s_srgb) {
auto ptr = (const uint32_t*)ctx + x;
+ if (tail) {
+ float rs[] = {0,0,0,0},
+ gs[] = {0,0,0,0},
+ bs[] = {0,0,0,0},
+ as[] = {0,0,0,0};
+ for (size_t i = 0; i < (tail&3); i++) {
+ rs[i] = sk_linear_from_srgb[(ptr[i] >> 0) & 0xff];
+ gs[i] = sk_linear_from_srgb[(ptr[i] >> 8) & 0xff];
+ bs[i] = sk_linear_from_srgb[(ptr[i] >> 16) & 0xff];
+ as[i] = (ptr[i] >> 24) * (1/255.0f);
+ }
+ r = Sk4f::Load(rs);
+ g = Sk4f::Load(gs);
+ b = Sk4f::Load(bs);
+ a = Sk4f::Load(as);
+ return;
+ }
+
r = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 0) & 0xff],
sk_linear_from_srgb[(ptr[1] >> 0) & 0xff],
sk_linear_from_srgb[(ptr[2] >> 0) & 0xff],
@@ -40,22 +57,30 @@ SK_RASTER_STAGE(load_s_srgb) {
sk_linear_from_srgb[(ptr[1] >> 16) & 0xff],
sk_linear_from_srgb[(ptr[2] >> 16) & 0xff],
sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] };
-
a = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);
}
-SK_RASTER_STAGE(load_s_srgb_tail) {
- auto ptr = (const uint32_t*)ctx + x;
-
- r = Sk4f{ sk_linear_from_srgb[(*ptr >> 0) & 0xff], 0,0,0 };
- g = Sk4f{ sk_linear_from_srgb[(*ptr >> 8) & 0xff], 0,0,0 };
- b = Sk4f{ sk_linear_from_srgb[(*ptr >> 16) & 0xff], 0,0,0 };
- a = Sk4f{ (*ptr >> 24) * (1/255.0f), 0,0,0 };
-}
-
SK_RASTER_STAGE(load_d_srgb) {
auto ptr = (const uint32_t*)ctx + x;
+ if (tail) {
+ float rs[] = {0,0,0,0},
+ gs[] = {0,0,0,0},
+ bs[] = {0,0,0,0},
+ as[] = {0,0,0,0};
+ for (size_t i = 0; i < (tail&3); i++) {
+ rs[i] = sk_linear_from_srgb[(ptr[i] >> 0) & 0xff];
+ gs[i] = sk_linear_from_srgb[(ptr[i] >> 8) & 0xff];
+ bs[i] = sk_linear_from_srgb[(ptr[i] >> 16) & 0xff];
+ as[i] = (ptr[i] >> 24) * (1/255.0f);
+ }
+ dr = Sk4f::Load(rs);
+ dg = Sk4f::Load(gs);
+ db = Sk4f::Load(bs);
+ da = Sk4f::Load(as);
+ return;
+ }
+
dr = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 0) & 0xff],
sk_linear_from_srgb[(ptr[1] >> 0) & 0xff],
sk_linear_from_srgb[(ptr[2] >> 0) & 0xff],
@@ -74,29 +99,24 @@ SK_RASTER_STAGE(load_d_srgb) {
da = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);
}
-SK_RASTER_STAGE(load_d_srgb_tail) {
- auto ptr = (const uint32_t*)ctx + x;
-
- dr = Sk4f{ sk_linear_from_srgb[(*ptr >> 0) & 0xff], 0,0,0 };
- dg = Sk4f{ sk_linear_from_srgb[(*ptr >> 8) & 0xff], 0,0,0 };
- db = Sk4f{ sk_linear_from_srgb[(*ptr >> 16) & 0xff], 0,0,0 };
- da = Sk4f{ (*ptr >> 24) * (1/255.0f), 0,0,0 };
-}
-
SK_RASTER_STAGE(scale_u8) {
auto ptr = (const uint8_t*)ctx + x;
- auto c = SkNx_cast<float>(Sk4b::Load(ptr)) * (1/255.0f);
- r *= c;
- g *= c;
- b *= c;
- a *= c;
-}
+ Sk4b cov;
-SK_RASTER_STAGE(scale_u8_tail) {
- auto ptr = (const uint8_t*)ctx + x;
+ if (tail) {
+ uint8_t cs[] = {0,0,0,0};
+ switch (tail&3) {
+ case 3: cs[2] = ptr[2];
+ case 2: cs[1] = ptr[1];
+ case 1: cs[0] = ptr[0];
+ }
+ cov = Sk4b::Load(cs);
+ } else {
+ cov = Sk4b::Load(ptr);
+ }
- auto c = *ptr * (1/255.0f);
+ auto c = SkNx_cast<float>(cov) * (1/255.0f);
r *= c;
g *= c;
b *= c;
@@ -114,19 +134,24 @@ SK_RASTER_STAGE(srcover) {
SK_RASTER_STAGE(store_srgb) {
auto ptr = (uint32_t*)ctx + x;
+ uint32_t* dst = nullptr;
+ uint32_t stack[4];
+
+ if (tail) {
+ dst = ptr;
+ ptr = stack;
+ }
+
( sk_linear_to_srgb(r)
| sk_linear_to_srgb(g) << 8
| sk_linear_to_srgb(b) << 16
| Sk4f_round(255.0f*a) << 24).store(ptr);
-}
-
-SK_RASTER_STAGE(store_srgb_tail) {
- auto ptr = (uint32_t*)ctx + x;
-
- Sk4i rgba = sk_linear_to_srgb({r[0], g[0], b[0], 0});
- rgba = {rgba[0], rgba[1], rgba[2], (int)(255.0f * a[0] + 0.5f)};
- SkNx_cast<uint8_t>(rgba).store(ptr);
+ switch (tail&3) {
+ case 3: dst[2] = ptr[2];
+ case 2: dst[1] = ptr[1];
+ case 1: dst[0] = ptr[0];
+ }
}
class SkRasterPipelineBench : public Benchmark {
@@ -147,35 +172,31 @@ public:
Sk4f r,g,b,a, dr,dg,db,da;
size_t x = 0, n = N;
while (n >= 4) {
- load_s_srgb(src , x, r,g,b,a, dr,dg,db,da);
- scale_u8 (mask , x, r,g,b,a, dr,dg,da,da);
- load_d_srgb(dst , x, r,g,b,a, dr,dg,da,da);
- srcover (nullptr, x, r,g,b,a, dr,dg,da,da);
- store_srgb (dst , x, r,g,b,a, dr,dg,da,da);
+ load_s_srgb(src , x,0, r,g,b,a, dr,dg,db,da);
+ scale_u8 (mask , x,0, r,g,b,a, dr,dg,da,da);
+ load_d_srgb(dst , x,0, r,g,b,a, dr,dg,da,da);
+ srcover (nullptr, x,0, r,g,b,a, dr,dg,da,da);
+ store_srgb (dst , x,0, r,g,b,a, dr,dg,da,da);
x += 4;
n -= 4;
}
- while (n > 0) {
- load_s_srgb_tail(src , x, r,g,b,a, dr,dg,db,da);
- scale_u8_tail (mask , x, r,g,b,a, dr,dg,da,da);
- load_d_srgb_tail(dst , x, r,g,b,a, dr,dg,da,da);
- srcover (nullptr, x, r,g,b,a, dr,dg,da,da);
- store_srgb_tail (dst , x, r,g,b,a, dr,dg,da,da);
-
- x += 1;
- n -= 1;
+ if (n > 0) {
+ load_s_srgb(src , x,n, r,g,b,a, dr,dg,db,da);
+ scale_u8 (mask , x,n, r,g,b,a, dr,dg,da,da);
+ load_d_srgb(dst , x,n, r,g,b,a, dr,dg,da,da);
+ srcover (nullptr, x,n, r,g,b,a, dr,dg,da,da);
+ store_srgb (dst , x,n, r,g,b,a, dr,dg,da,da);
}
}
void runPipeline() {
SkRasterPipeline p;
- p.append<load_s_srgb, load_s_srgb_tail>( src);
- p.append< scale_u8, scale_u8_tail>(mask);
- p.append<load_d_srgb, load_d_srgb_tail>( dst);
- p.append<srcover>();
- p.last< store_srgb, store_srgb_tail>( dst);
-
+ p.append<load_s_srgb>(src);
+ p.append< scale_u8>(mask);
+ p.append<load_d_srgb>(dst);
+ p.append< srcover>();
+ p.last < store_srgb>(dst);
p.run(N);
}
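[Editor's sketch] p.run(N) presumably dispatches the compiled stages the same
way the handwritten benchmark loop above does: Body for full strides, one
Tail stamp for the leftovers. A sketch under that assumption; run_body and
run_tail are hypothetical stand-ins for the pipeline's stage chains:

    #include <cstddef>

    // Hypothetical stand-ins for the pipeline's Body and Tail chains.
    static void run_body(size_t x)              { (void)x; }
    static void run_tail(size_t x, size_t tail) { (void)x; (void)tail; }

    static void run(size_t n) {
        size_t x = 0;
        while (n >= 4) {      // full 4-pixel strides through the Body chain
            run_body(x);
            x += 4;
            n -= 4;
        }
        if (n > 0) {          // one Tail stamp covers the last 1-3 pixels
            run_tail(x, n);
        }
    }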