From c8dd6bc3e7a4b01c848ba15b808ea6ffdf249b06 Mon Sep 17 00:00:00 2001
From: Mike Klein
Date: Wed, 28 Sep 2016 10:43:53 -0400
Subject: Rearrange SkRasterPipeline scanline tail handling.

We used to step at a 4-pixel stride as long as possible, then run up to 3 times, one pixel at a time. Now replace those 1-at-a-time runs with a single tail stamp if there are 1-3 remaining pixels.

This style is simply more efficient: e.g. we'll blend and lerp once for 3 pixels instead of 3 times. This should make short blits significantly more efficient. It's also more future-oriented... AVX+ on Intel and SVE on ARM support masked loads and stores, so we can do the entire tail in one direct step.

This also makes it possible to re-arrange the code a bit to encapsulate each stage better. I think generally this code reads more clearly than the old code, but YMMV. I've arranged things so you write one function, but it's compiled into two specializations, one for tail=0 (Body) and one for tail>0 (Tail). It's pretty tidy.

For now I've just burned a register to pass around tail. It's 2 bits now, maybe soon 3 with AVX, and capped at 4 for even the craziest new toys, so there are plenty of places we can pack it if we want to get clever.
BUG=skia: GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2717 Change-Id: I45852a3e5d4c5b5e9315302c46601aee0d32265f Reviewed-on: https://skia-review.googlesource.com/2717 Reviewed-by: Mike Reed Commit-Queue: Mike Klein --- bench/SkRasterPipelineBench.cpp | 135 +++++++++++++++++++++++----------------- 1 file changed, 78 insertions(+), 57 deletions(-) (limited to 'bench/SkRasterPipelineBench.cpp') diff --git a/bench/SkRasterPipelineBench.cpp b/bench/SkRasterPipelineBench.cpp index 5ef64ce603..aa3e355e45 100644 --- a/bench/SkRasterPipelineBench.cpp +++ b/bench/SkRasterPipelineBench.cpp @@ -21,11 +21,28 @@ static uint8_t mask[N]; // - load srgb dst // - src = srcover(dst, src) // - store src back as srgb -// Every stage except for srcover interacts with memory, and so will need _tail variants. SK_RASTER_STAGE(load_s_srgb) { auto ptr = (const uint32_t*)ctx + x; + if (tail) { + float rs[] = {0,0,0,0}, + gs[] = {0,0,0,0}, + bs[] = {0,0,0,0}, + as[] = {0,0,0,0}; + for (size_t i = 0; i < (tail&3); i++) { + rs[i] = sk_linear_from_srgb[(ptr[i] >> 0) & 0xff]; + gs[i] = sk_linear_from_srgb[(ptr[i] >> 8) & 0xff]; + bs[i] = sk_linear_from_srgb[(ptr[i] >> 16) & 0xff]; + as[i] = (ptr[i] >> 24) * (1/255.0f); + } + r = Sk4f::Load(rs); + g = Sk4f::Load(gs); + b = Sk4f::Load(bs); + a = Sk4f::Load(as); + return; + } + r = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 0) & 0xff], sk_linear_from_srgb[(ptr[1] >> 0) & 0xff], sk_linear_from_srgb[(ptr[2] >> 0) & 0xff], @@ -40,22 +57,30 @@ SK_RASTER_STAGE(load_s_srgb) { sk_linear_from_srgb[(ptr[1] >> 16) & 0xff], sk_linear_from_srgb[(ptr[2] >> 16) & 0xff], sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] }; - a = SkNx_cast((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f); } -SK_RASTER_STAGE(load_s_srgb_tail) { - auto ptr = (const uint32_t*)ctx + x; - - r = Sk4f{ sk_linear_from_srgb[(*ptr >> 0) & 0xff], 0,0,0 }; - g = Sk4f{ sk_linear_from_srgb[(*ptr >> 8) & 0xff], 0,0,0 }; - b = Sk4f{ sk_linear_from_srgb[(*ptr >> 16) & 0xff], 0,0,0 }; - a = Sk4f{ (*ptr >> 24) 
* (1/255.0f), 0,0,0 }; -} - SK_RASTER_STAGE(load_d_srgb) { auto ptr = (const uint32_t*)ctx + x; + if (tail) { + float rs[] = {0,0,0,0}, + gs[] = {0,0,0,0}, + bs[] = {0,0,0,0}, + as[] = {0,0,0,0}; + for (size_t i = 0; i < (tail&3); i++) { + rs[i] = sk_linear_from_srgb[(ptr[i] >> 0) & 0xff]; + gs[i] = sk_linear_from_srgb[(ptr[i] >> 8) & 0xff]; + bs[i] = sk_linear_from_srgb[(ptr[i] >> 16) & 0xff]; + as[i] = (ptr[i] >> 24) * (1/255.0f); + } + dr = Sk4f::Load(rs); + dg = Sk4f::Load(gs); + db = Sk4f::Load(bs); + da = Sk4f::Load(as); + return; + } + dr = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 0) & 0xff], sk_linear_from_srgb[(ptr[1] >> 0) & 0xff], sk_linear_from_srgb[(ptr[2] >> 0) & 0xff], @@ -74,29 +99,24 @@ SK_RASTER_STAGE(load_d_srgb) { da = SkNx_cast((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f); } -SK_RASTER_STAGE(load_d_srgb_tail) { - auto ptr = (const uint32_t*)ctx + x; - - dr = Sk4f{ sk_linear_from_srgb[(*ptr >> 0) & 0xff], 0,0,0 }; - dg = Sk4f{ sk_linear_from_srgb[(*ptr >> 8) & 0xff], 0,0,0 }; - db = Sk4f{ sk_linear_from_srgb[(*ptr >> 16) & 0xff], 0,0,0 }; - da = Sk4f{ (*ptr >> 24) * (1/255.0f), 0,0,0 }; -} - SK_RASTER_STAGE(scale_u8) { auto ptr = (const uint8_t*)ctx + x; - auto c = SkNx_cast(Sk4b::Load(ptr)) * (1/255.0f); - r *= c; - g *= c; - b *= c; - a *= c; -} + Sk4b cov; -SK_RASTER_STAGE(scale_u8_tail) { - auto ptr = (const uint8_t*)ctx + x; + if (tail) { + uint8_t cs[] = {0,0,0,0}; + switch (tail&3) { + case 3: cs[2] = ptr[2]; + case 2: cs[1] = ptr[1]; + case 1: cs[0] = ptr[0]; + } + cov = Sk4b::Load(cs); + } else { + cov = Sk4b::Load(ptr); + } - auto c = *ptr * (1/255.0f); + auto c = SkNx_cast(cov) * (1/255.0f); r *= c; g *= c; b *= c; @@ -114,19 +134,24 @@ SK_RASTER_STAGE(srcover) { SK_RASTER_STAGE(store_srgb) { auto ptr = (uint32_t*)ctx + x; + uint32_t* dst = nullptr; + uint32_t stack[4]; + + if (tail) { + dst = ptr; + ptr = stack; + } + ( sk_linear_to_srgb(r) | sk_linear_to_srgb(g) << 8 | sk_linear_to_srgb(b) << 16 | Sk4f_round(255.0f*a) << 24).store(ptr); 
-} - -SK_RASTER_STAGE(store_srgb_tail) { - auto ptr = (uint32_t*)ctx + x; - - Sk4i rgba = sk_linear_to_srgb({r[0], g[0], b[0], 0}); - rgba = {rgba[0], rgba[1], rgba[2], (int)(255.0f * a[0] + 0.5f)}; - SkNx_cast(rgba).store(ptr); + switch (tail&3) { + case 3: dst[2] = ptr[2]; + case 2: dst[1] = ptr[1]; + case 1: dst[0] = ptr[0]; + } } class SkRasterPipelineBench : public Benchmark { @@ -147,35 +172,31 @@ public: Sk4f r,g,b,a, dr,dg,db,da; size_t x = 0, n = N; while (n >= 4) { - load_s_srgb(src , x, r,g,b,a, dr,dg,db,da); - scale_u8 (mask , x, r,g,b,a, dr,dg,da,da); - load_d_srgb(dst , x, r,g,b,a, dr,dg,da,da); - srcover (nullptr, x, r,g,b,a, dr,dg,da,da); - store_srgb (dst , x, r,g,b,a, dr,dg,da,da); + load_s_srgb(src , x,0, r,g,b,a, dr,dg,db,da); + scale_u8 (mask , x,0, r,g,b,a, dr,dg,da,da); + load_d_srgb(dst , x,0, r,g,b,a, dr,dg,da,da); + srcover (nullptr, x,0, r,g,b,a, dr,dg,da,da); + store_srgb (dst , x,0, r,g,b,a, dr,dg,da,da); x += 4; n -= 4; } - while (n > 0) { - load_s_srgb_tail(src , x, r,g,b,a, dr,dg,db,da); - scale_u8_tail (mask , x, r,g,b,a, dr,dg,da,da); - load_d_srgb_tail(dst , x, r,g,b,a, dr,dg,da,da); - srcover (nullptr, x, r,g,b,a, dr,dg,da,da); - store_srgb_tail (dst , x, r,g,b,a, dr,dg,da,da); - - x += 1; - n -= 1; + if (n > 0) { + load_s_srgb(src , x,n, r,g,b,a, dr,dg,db,da); + scale_u8 (mask , x,n, r,g,b,a, dr,dg,da,da); + load_d_srgb(dst , x,n, r,g,b,a, dr,dg,da,da); + srcover (nullptr, x,n, r,g,b,a, dr,dg,da,da); + store_srgb (dst , x,n, r,g,b,a, dr,dg,da,da); } } void runPipeline() { SkRasterPipeline p; - p.append( src); - p.append< scale_u8, scale_u8_tail>(mask); - p.append( dst); - p.append(); - p.last< store_srgb, store_srgb_tail>( dst); - + p.append(src); + p.append< scale_u8>(mask); + p.append(dst); + p.append< srcover>(); + p.last < store_srgb>(dst); p.run(N); } -- cgit v1.2.3