author    Mike Klein <mtklein@chromium.org>  2016-09-28 10:43:53 -0400
committer Skia Commit-Bot <skia-commit-bot@chromium.org>  2016-09-28 15:28:24 +0000
commit    c8dd6bc3e7a4b01c848ba15b808ea6ffdf249b06 (patch)
tree      e58beeb3fd18659cf8ebfa11c11349c17675a46c /src/core/SkRasterPipelineBlitter.cpp
parent    b37eb0e968c5082e021244d4baf9b7721e3f360a (diff)
Rearrange SkRasterPipeline scanline tail handling.
We used to step at a 4-pixel stride as long as possible, then run up to 3 times, one pixel at a time. Now replace those 1-at-a-time runs with a single tail stamp if there are 1-3 remaining pixels. This style is simply more efficient: e.g. we'll blend and lerp once for 3 pixels instead of 3 times. This should make short blits significantly more efficient.

It's also more future-oriented: AVX+ on Intel and SVE on ARM support masked loads and stores, so we can do the entire tail in one direct step.

This also makes it possible to rearrange the code a bit to encapsulate each stage better. I think generally this code reads more clearly than the old code, but YMMV. I've arranged things so you write one function, but it's compiled into two specializations, one for tail=0 (Body) and one for tail>0 (Tail). It's pretty tidy.

For now I've just burned a register to pass tail around. It's 2 bits now, maybe soon 3 with AVX, and capped at 4 for even the craziest new toys, so there are plenty of places we can pack it if we want to get clever.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2717
Change-Id: I45852a3e5d4c5b5e9315302c46601aee0d32265f
Reviewed-on: https://skia-review.googlesource.com/2717
Reviewed-by: Mike Reed <reed@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
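To make the "single tail stamp" concrete, here is a minimal standalone sketch in plain C++ rather than Skia's SkNx types. Vec4 and scale() are illustrative stand-ins (not names from this CL), and memcpy stands in for the per-lane guards the CL's real load_tail()/store_tail() helpers use:

    #include <cstddef>
    #include <cstring>

    // 4-lane float "register", standing in for Sk4f.
    struct Vec4 { float v[4]; };

    // tail == 0: full 4-wide load.  tail in 1..3: load that many floats,
    // zero-filling the remaining lanes.
    static Vec4 load_tail(size_t tail, const float* src) {
        Vec4 r = {{0, 0, 0, 0}};
        std::memcpy(r.v, src, (tail ? tail : 4) * sizeof(float));
        return r;
    }

    // tail == 0: full 4-wide store.  tail in 1..3: store only that many.
    static void store_tail(size_t tail, const Vec4& v, float* dst) {
        std::memcpy(dst, v.v, (tail ? tail : 4) * sizeof(float));
    }

    // Scale n floats by c: 4 at a time while possible, then at most one
    // 1-3 wide tail stamp instead of up to three 1-at-a-time passes.
    static void scale(float* px, size_t n, float c) {
        size_t x = 0;
        for (; x + 4 <= n; x += 4) {
            Vec4 p = load_tail(0, px + x);
            for (float& f : p.v) { f *= c; }
            store_tail(0, p, px + x);
        }
        if (size_t tail = n - x) {
            Vec4 p = load_tail(tail, px + x);
            for (float& f : p.v) { f *= c; }
            store_tail(tail, p, px + x);
        }
    }

With masked loads and stores (AVX+ or SVE), both helpers collapse to a single hardware instruction, which is the future-proofing the message refers to.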
Diffstat (limited to 'src/core/SkRasterPipelineBlitter.cpp')
-rw-r--r--  src/core/SkRasterPipelineBlitter.cpp  173
1 file changed, 72 insertions, 101 deletions
diff --git a/src/core/SkRasterPipelineBlitter.cpp b/src/core/SkRasterPipelineBlitter.cpp
index e407d013f5..d618e8b961 100644
--- a/src/core/SkRasterPipelineBlitter.cpp
+++ b/src/core/SkRasterPipelineBlitter.cpp
@@ -101,22 +101,29 @@ SK_RASTER_STAGE(lerp_constant_float) {
a = lerp(da, a, c);
}
-// s' = d(1-c) + sc, 4 pixels at a time for 8-bit coverage.
-SK_RASTER_STAGE(lerp_a8) {
- auto ptr = (const uint8_t*)ctx + x;
- Sk4f c = SkNx_cast<float>(Sk4b::Load(ptr)) * (1/255.0f);
+template <typename T>
+static SkNx<4,T> load_tail(size_t tail, const T* src) {
+ if (tail) {
+ return SkNx<4,T>(src[0], (tail>1 ? src[1] : 0), (tail>2 ? src[2] : 0), 0);
+ }
+ return SkNx<4,T>::Load(src);
+}
- r = lerp(dr, r, c);
- g = lerp(dg, g, c);
- b = lerp(db, b, c);
- a = lerp(da, a, c);
+template <typename T>
+static void store_tail(size_t tail, const SkNx<4,T>& v, T* dst) {
+ switch(tail) {
+ case 0: return v.store(dst);
+ case 3: dst[2] = v[2];
+ case 2: dst[1] = v[1];
+ case 1: dst[0] = v[0];
+ }
}
-// Tail variant of lerp_a8() handling 1 pixel at a time.
-SK_RASTER_STAGE(lerp_a8_1) {
+// s' = d(1-c) + sc for 8-bit c.
+SK_RASTER_STAGE(lerp_a8) {
auto ptr = (const uint8_t*)ctx + x;
- Sk4f c = *ptr * (1/255.0f);
+ Sk4f c = SkNx_cast<float>(load_tail(tail, ptr)) * (1/255.0f);
r = lerp(dr, r, c);
g = lerp(dg, g, c);
b = lerp(db, b, c);
@@ -137,11 +144,11 @@ static Sk4h to_565(const Sk4f& r, const Sk4f& g, const Sk4f& b) {
| Sk4f_round(b * SK_B16_MASK) << SK_B16_SHIFT);
}
-// s' = d(1-c) + sc, 4 pixels at a time for 565 coverage.
+// s' = d(1-c) + sc for 565 c.
SK_RASTER_STAGE(lerp_lcd16) {
auto ptr = (const uint16_t*)ctx + x;
Sk4f cr, cg, cb;
- from_565(Sk4h::Load(ptr), &cr, &cg, &cb);
+ from_565(load_tail(tail, ptr), &cr, &cg, &cb);
r = lerp(dr, r, cr);
g = lerp(dg, g, cg);
@@ -149,89 +156,74 @@ SK_RASTER_STAGE(lerp_lcd16) {
a = 1.0f;
}
-// Tail variant of lerp_lcd16() handling 1 pixel at a time.
-SK_RASTER_STAGE(lerp_lcd16_1) {
- auto ptr = (const uint16_t*)ctx + x;
- Sk4f cr, cg, cb;
- from_565({*ptr,0,0,0}, &cr, &cg, &cb);
-
- r = lerp(dr, r, cr);
- g = lerp(dg, g, cg);
- b = lerp(db, b, cb);
- a = 1.0f;
-}
-
-// Load 4 565 dst pixels.
SK_RASTER_STAGE(load_d_565) {
auto ptr = (const uint16_t*)ctx + x;
-
- from_565(Sk4h::Load(ptr), &dr,&dg,&db);
- da = 1.0f;
-}
-
-// Load 1 565 dst pixel.
-SK_RASTER_STAGE(load_d_565_1) {
- auto ptr = (const uint16_t*)ctx + x;
-
- from_565({*ptr,0,0,0}, &dr,&dg,&db);
+ from_565(load_tail(tail, ptr), &dr,&dg,&db);
da = 1.0f;
}
-// Store 4 565 pixels.
SK_RASTER_STAGE(store_565) {
auto ptr = (uint16_t*)ctx + x;
- to_565(r,g,b).store(ptr);
+ store_tail(tail, to_565(r,g,b), ptr);
}
-// Store 1 565 pixel.
-SK_RASTER_STAGE(store_565_1) {
- auto ptr = (uint16_t*)ctx + x;
- *ptr = to_565(r,g,b)[0];
-}
-
-// Load 4 F16 pixels.
SK_RASTER_STAGE(load_d_f16) {
auto ptr = (const uint64_t*)ctx + x;
+ if (tail) {
+ auto p0 = SkHalfToFloat_finite_ftz(ptr[0]) ,
+ p1 = tail>1 ? SkHalfToFloat_finite_ftz(ptr[1]) : Sk4f{0},
+ p2 = tail>2 ? SkHalfToFloat_finite_ftz(ptr[2]) : Sk4f{0};
+ dr = { p0[0],p1[0],p2[0],0 };
+ dg = { p0[1],p1[1],p2[1],0 };
+ db = { p0[2],p1[2],p2[2],0 };
+ da = { p0[3],p1[3],p2[3],0 };
+ return;
+ }
+
Sk4h rh, gh, bh, ah;
Sk4h_load4(ptr, &rh, &gh, &bh, &ah);
-
dr = SkHalfToFloat_finite_ftz(rh);
dg = SkHalfToFloat_finite_ftz(gh);
db = SkHalfToFloat_finite_ftz(bh);
da = SkHalfToFloat_finite_ftz(ah);
}
-// Load 1 F16 pixel.
-SK_RASTER_STAGE(load_d_f16_1) {
- auto ptr = (const uint64_t*)ctx + x;
-
- auto p0 = SkHalfToFloat_finite_ftz(ptr[0]);
- dr = { p0[0],0,0,0 };
- dg = { p0[1],0,0,0 };
- db = { p0[2],0,0,0 };
- da = { p0[3],0,0,0 };
-}
-
-// Store 4 F16 pixels.
SK_RASTER_STAGE(store_f16) {
auto ptr = (uint64_t*)ctx + x;
- Sk4h_store4(ptr, SkFloatToHalf_finite_ftz(r), SkFloatToHalf_finite_ftz(g),
- SkFloatToHalf_finite_ftz(b), SkFloatToHalf_finite_ftz(a));
-}
-
-// Store 1 F16 pixel.
-SK_RASTER_STAGE(store_f16_1) {
- auto ptr = (uint64_t*)ctx + x;
+ switch (tail) {
+ case 0: return Sk4h_store4(ptr, SkFloatToHalf_finite_ftz(r), SkFloatToHalf_finite_ftz(g),
+ SkFloatToHalf_finite_ftz(b), SkFloatToHalf_finite_ftz(a));
- SkFloatToHalf_finite_ftz({r[0], g[0], b[0], a[0]}).store(ptr);
+ case 3: SkFloatToHalf_finite_ftz({r[2], g[2], b[2], a[2]}).store(ptr+2);
+ case 2: SkFloatToHalf_finite_ftz({r[1], g[1], b[1], a[1]}).store(ptr+1);
+ case 1: SkFloatToHalf_finite_ftz({r[0], g[0], b[0], a[0]}).store(ptr+0);
+ }
}
-// Load 4 8-bit sRGB pixels from SkPMColor order to RGBA.
+// Load 8-bit SkPMColor-order sRGB.
SK_RASTER_STAGE(load_d_srgb) {
auto ptr = (const uint32_t*)ctx + x;
+ if (tail) {
+ float rs[] = {0,0,0,0},
+ gs[] = {0,0,0,0},
+ bs[] = {0,0,0,0},
+ as[] = {0,0,0,0};
+ for (size_t i = 0; i < tail; i++) {
+ rs[i] = sk_linear_from_srgb[(ptr[i] >> SK_R32_SHIFT) & 0xff];
+ gs[i] = sk_linear_from_srgb[(ptr[i] >> SK_G32_SHIFT) & 0xff];
+ bs[i] = sk_linear_from_srgb[(ptr[i] >> SK_B32_SHIFT) & 0xff];
+ as[i] = (1/255.0f) * (ptr[i] >> SK_A32_SHIFT) ;
+ }
+ dr = Sk4f::Load(rs);
+ dg = Sk4f::Load(gs);
+ db = Sk4f::Load(bs);
+ da = Sk4f::Load(as);
+ return;
+ }
+
dr = { sk_linear_from_srgb[(ptr[0] >> SK_R32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[1] >> SK_R32_SHIFT) & 0xff],
sk_linear_from_srgb[(ptr[2] >> SK_R32_SHIFT) & 0xff],
@@ -250,34 +242,13 @@ SK_RASTER_STAGE(load_d_srgb) {
da = SkNx_cast<float>(Sk4u::Load(ptr) >> SK_A32_SHIFT) * (1/255.0f);
}
-// Tail variant of load_d_srgb() handling 1 pixel at a time.
-SK_RASTER_STAGE(load_d_srgb_1) {
- auto ptr = (const uint32_t*)ctx + x;
-
- dr = { sk_linear_from_srgb[(*ptr >> SK_R32_SHIFT) & 0xff], 0,0,0 };
- dg = { sk_linear_from_srgb[(*ptr >> SK_G32_SHIFT) & 0xff], 0,0,0 };
- db = { sk_linear_from_srgb[(*ptr >> SK_B32_SHIFT) & 0xff], 0,0,0 };
- da = { (1/255.0f) * (*ptr >> SK_A32_SHIFT) , 0,0,0 };
-}
-
-// Write out 4 pixels as 8-bit SkPMColor-order sRGB.
+// Store 8-bit SkPMColor-order sRGB.
SK_RASTER_STAGE(store_srgb) {
auto ptr = (uint32_t*)ctx + x;
- ( sk_linear_to_srgb_noclamp(r) << SK_R32_SHIFT
- | sk_linear_to_srgb_noclamp(g) << SK_G32_SHIFT
- | sk_linear_to_srgb_noclamp(b) << SK_B32_SHIFT
- | Sk4f_round(255.0f * a) << SK_A32_SHIFT).store(ptr);
-}
-
-// Tail variant of store_srgb() handling 1 pixel at a time.
-SK_RASTER_STAGE(store_srgb_1) {
- auto ptr = (uint32_t*)ctx + x;
- Sk4i rgb = sk_linear_to_srgb_noclamp(swizzle_rb_if_bgra({ r[0], g[0], b[0], 0.0f }));
-
- uint32_t rgba;
- SkNx_cast<uint8_t>(rgb).store(&rgba);
- rgba |= (uint32_t)(255.0f * a[0] + 0.5f) << 24;
- *ptr = rgba;
+ store_tail(tail, ( sk_linear_to_srgb_noclamp(r) << SK_R32_SHIFT
+ | sk_linear_to_srgb_noclamp(g) << SK_G32_SHIFT
+ | sk_linear_to_srgb_noclamp(b) << SK_B32_SHIFT
+ | Sk4f_round(255.0f * a) << SK_A32_SHIFT), (int*)ptr);
}
static bool supported(const SkImageInfo& info) {
@@ -341,14 +312,14 @@ void SkRasterPipelineBlitter::append_load_d(SkRasterPipeline* p, const void* dst
switch (fDst.info().colorType()) {
case kN32_SkColorType:
if (fDst.info().gammaCloseToSRGB()) {
- p->append<load_d_srgb, load_d_srgb_1>(dst);
+ p->append<load_d_srgb>(dst);
}
break;
case kRGBA_F16_SkColorType:
- p->append<load_d_f16, load_d_f16_1>(dst);
+ p->append<load_d_f16>(dst);
break;
case kRGB_565_SkColorType:
- p->append<load_d_565, load_d_565_1>(dst);
+ p->append<load_d_565>(dst);
break;
default: break;
}
@@ -361,14 +332,14 @@ void SkRasterPipelineBlitter::append_store(SkRasterPipeline* p, void* dst) const
switch (fDst.info().colorType()) {
case kN32_SkColorType:
if (fDst.info().gammaCloseToSRGB()) {
- p->last<store_srgb, store_srgb_1>(dst);
+ p->last<store_srgb>(dst);
}
break;
case kRGBA_F16_SkColorType:
- p->last<store_f16, store_f16_1>(dst);
+ p->last<store_f16>(dst);
break;
case kRGB_565_SkColorType:
- p->last<store_565, store_565_1>(dst);
+ p->last<store_565>(dst);
break;
default: break;
}
@@ -426,10 +397,10 @@ void SkRasterPipelineBlitter::blitMask(const SkMask& mask, const SkIRect& clip)
p.extend(fXfermode);
switch (mask.fFormat) {
case SkMask::kA8_Format:
- p.append<lerp_a8, lerp_a8_1>(mask.getAddr8(x,y)-x);
+ p.append<lerp_a8>(mask.getAddr8(x,y)-x);
break;
case SkMask::kLCD16_Format:
- p.append<lerp_lcd16, lerp_lcd16_1>(mask.getAddrLCD16(x,y)-x);
+ p.append<lerp_lcd16>(mask.getAddrLCD16(x,y)-x);
break;
default: break;
}