path: root/src/opts/SkRasterPipeline_opts.h
author    Mike Klein <mtklein@chromium.org>  2016-12-11 11:42:07 -0500
committer Skia Commit-Bot <skia-commit-bot@chromium.org>  2016-12-12 15:56:38 +0000
commit  4958006657dc3d6bbad933217b687e275f1554a3 (patch)
tree    d07455d91618a2ce0b1dece9b1856c16f47a2b54 /src/opts/SkRasterPipeline_opts.h
parent  ebfce6d9b42198e04288a15953f40c395a7b6139 (diff)
funnel f16 through standard load/store/gather
This is a precursor to using mask load, mask store, and gather instructions
for f16. It is also a slight performance win, thanks to slightly simpler
code generation.

Having done this, it now makes sense to give the f16->f32 conversion a name:
from_f16().

Finally, while we're at it, store_f32 now also goes through store(), so all
formats use load(), gather(), and store() uniformly.

CQ_INCLUDE_TRYBOTS=skia.primary:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD

Change-Id: I403f16f712936e2bcf3294e72c863cb6c6fbcf0c
Reviewed-on: https://skia-review.googlesource.com/5731
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/opts/SkRasterPipeline_opts.h')
-rw-r--r--  src/opts/SkRasterPipeline_opts.h  111
1 file changed, 30 insertions, 81 deletions
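
Before the diff itself, a minimal standalone sketch of the pattern this patch
converges on may help: one generic, tail-aware load()/store() pair that every
format funnels through. Vec, N, load(), and store() here are hypothetical
stand-ins, not Skia's actual SkNx API; the deleted code below suggests the
real helpers stage partial vectors through a zeroed buffer in much the same way.

    #include <cstddef>
    #include <cstring>

    // Hypothetical stand-ins for SkNx<N,T> and Skia's generic helpers.
    constexpr size_t N = 8;              // assumed vector width (e.g. AVX2)

    template <typename T>
    struct Vec { T lanes[N] = {}; };     // lanes zero-initialized by default

    // Load a full vector of N values, or only the first `tail` with the
    // remaining lanes left zeroed.
    template <typename T>
    Vec<T> load(size_t tail, const T* src) {
        Vec<T> v;
        std::memcpy(v.lanes, src, (tail ? tail : N) * sizeof(T));
        return v;
    }

    // Store all N values, or only the first `tail`.
    template <typename T>
    void store(size_t tail, const Vec<T>& v, T* dst) {
        std::memcpy(dst, v.lanes, (tail ? tail : N) * sizeof(T));
    }

With helpers shaped like this, a stage only has to decide whether to point at
the destination directly (full vector) or at a zero-padded local copy (tail),
which is exactly the shape STAGE(load_f16) takes after this patch.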
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index e06b68062c..3cc3f2f01a 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -264,6 +264,15 @@ SI void from_565(const SkNh& _565, SkNf* r, SkNf* g, SkNf* b) {
*g = SkNx_cast<float>(_32_bit & SK_G16_MASK_IN_PLACE) * (1.0f / SK_G16_MASK_IN_PLACE);
*b = SkNx_cast<float>(_32_bit & SK_B16_MASK_IN_PLACE) * (1.0f / SK_B16_MASK_IN_PLACE);
}
+SI void from_f16(const void* px, SkNf* r, SkNf* g, SkNf* b, SkNf* a) {
+ SkNh rh, gh, bh, ah;
+ SkNh::Load4(px, &rh, &gh, &bh, &ah);
+
+ *r = SkHalfToFloat_finite_ftz(rh);
+ *g = SkHalfToFloat_finite_ftz(gh);
+ *b = SkHalfToFloat_finite_ftz(bh);
+ *a = SkHalfToFloat_finite_ftz(ah);
+}
STAGE(trace) {
SkDebugf("%s\n", (const char*)ctx);
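
The new from_f16() above gives a name to the f16->f32 step: de-interleave the
four half channels of each pixel with SkNh::Load4(), then widen each channel
with SkHalfToFloat_finite_ftz(). For intuition, a scalar sketch of the same
work; half_to_float() is a hypothetical per-lane stand-in that, like the real
routine, assumes finite inputs and flushes denormals to zero.

    #include <cstdint>
    #include <cstring>

    // Minimal half->float for finite inputs, flushing denormals to zero
    // (a scalar sketch of what SkHalfToFloat_finite_ftz() does per lane).
    static float half_to_float(uint16_t h) {
        uint32_t sign = (uint32_t)(h & 0x8000) << 16;
        uint32_t em   = h & 0x7fff;            // exponent + mantissa bits
        if ((em >> 10) == 0) return 0.0f;      // denormal or zero: flush
        uint32_t bits = sign | ((em << 13) + ((127 - 15) << 23));  // rebias
        float f;
        std::memcpy(&f, &bits, sizeof f);
        return f;
    }

    // De-interleave n packed RGBA f16 pixels into float channels.
    static void from_f16_scalar(const uint64_t* px, size_t n,
                                float* r, float* g, float* b, float* a) {
        for (size_t i = 0; i < n; i++) {
            r[i] = half_to_float((uint16_t)(px[i] >>  0));
            g[i] = half_to_float((uint16_t)(px[i] >> 16));
            b[i] = half_to_float((uint16_t)(px[i] >> 32));
            a[i] = half_to_float((uint16_t)(px[i] >> 48));
        }
    }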
@@ -459,89 +468,45 @@ STAGE(store_565) {
STAGE(load_f16) {
auto ptr = *(const uint64_t**)ctx + x;
- SkNh rh, gh, bh, ah;
+ const void* src = ptr;
+ SkNx<N, uint64_t> px;
if (tail) {
- uint64_t buf[8] = {0};
- switch (tail & (N-1)) {
- case 7: buf[6] = ptr[6];
- case 6: buf[5] = ptr[5];
- case 5: buf[4] = ptr[4];
- case 4: buf[3] = ptr[3];
- case 3: buf[2] = ptr[2];
- case 2: buf[1] = ptr[1];
- }
- buf[0] = ptr[0];
- SkNh::Load4(buf, &rh, &gh, &bh, &ah);
- } else {
- SkNh::Load4(ptr, &rh, &gh, &bh, &ah);
+ px = load(tail, ptr);
+ src = &px;
}
-
- r = SkHalfToFloat_finite_ftz(rh);
- g = SkHalfToFloat_finite_ftz(gh);
- b = SkHalfToFloat_finite_ftz(bh);
- a = SkHalfToFloat_finite_ftz(ah);
+ from_f16(src, &r, &g, &b, &a);
}
STAGE(load_f16_d) {
auto ptr = *(const uint64_t**)ctx + x;
- SkNh rh, gh, bh, ah;
+ const void* src = ptr;
+ SkNx<N, uint64_t> px;
if (tail) {
- uint64_t buf[8] = {0};
- switch (tail & (N-1)) {
- case 7: buf[6] = ptr[6];
- case 6: buf[5] = ptr[5];
- case 5: buf[4] = ptr[4];
- case 4: buf[3] = ptr[3];
- case 3: buf[2] = ptr[2];
- case 2: buf[1] = ptr[1];
- }
- buf[0] = ptr[0];
- SkNh::Load4(buf, &rh, &gh, &bh, &ah);
- } else {
- SkNh::Load4(ptr, &rh, &gh, &bh, &ah);
+ px = load(tail, ptr);
+ src = &px;
}
-
- dr = SkHalfToFloat_finite_ftz(rh);
- dg = SkHalfToFloat_finite_ftz(gh);
- db = SkHalfToFloat_finite_ftz(bh);
- da = SkHalfToFloat_finite_ftz(ah);
+ from_f16(src, &dr, &dg, &db, &da);
}
STAGE(store_f16) {
auto ptr = *(uint64_t**)ctx + x;
- uint64_t buf[8];
- SkNh::Store4(tail ? buf : ptr, SkFloatToHalf_finite_ftz(r),
- SkFloatToHalf_finite_ftz(g),
- SkFloatToHalf_finite_ftz(b),
- SkFloatToHalf_finite_ftz(a));
+ SkNx<N, uint64_t> px;
+ SkNh::Store4(tail ? (void*)&px : (void*)ptr, SkFloatToHalf_finite_ftz(r),
+ SkFloatToHalf_finite_ftz(g),
+ SkFloatToHalf_finite_ftz(b),
+ SkFloatToHalf_finite_ftz(a));
if (tail) {
- switch (tail & (N-1)) {
- case 7: ptr[6] = buf[6];
- case 6: ptr[5] = buf[5];
- case 5: ptr[4] = buf[4];
- case 4: ptr[3] = buf[3];
- case 3: ptr[2] = buf[2];
- case 2: ptr[1] = buf[1];
- }
- ptr[0] = buf[0];
+ store(tail, px, ptr);
}
}
STAGE(store_f32) {
auto ptr = *(SkPM4f**)ctx + x;
- SkPM4f buf[8];
- SkNf::Store4(tail ? buf : ptr, r,g,b,a);
+ SkNx<N, SkPM4f> px;
+ SkNf::Store4(tail ? (void*)&px : (void*)ptr, r,g,b,a);
if (tail) {
- switch (tail & (N-1)) {
- case 7: ptr[6] = buf[6];
- case 6: ptr[5] = buf[5];
- case 5: ptr[4] = buf[4];
- case 4: ptr[3] = buf[3];
- case 3: ptr[2] = buf[2];
- case 2: ptr[1] = buf[1];
- }
- ptr[0] = buf[0];
+ store(tail, px, ptr);
}
}
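
STAGE(store_f16) now runs the funnel in reverse: convert each channel to half,
interleave with SkNh::Store4() into either the destination or a local vector,
and let store() clamp the tail write; store_f32 follows the same shape with
SkNf::Store4(). For intuition, a scalar sketch of the f16 packing step;
float_to_half() is a hypothetical stand-in that truncates rather than rounds,
unlike what SkFloatToHalf_finite_ftz() presumably does across whole vectors.

    #include <cstdint>
    #include <cstring>

    // Minimal float->half for finite, in-range inputs: truncates the mantissa
    // and flushes values too small for a normal half to zero.
    static uint16_t float_to_half(float f) {
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof bits);
        uint16_t sign = (uint16_t)((bits >> 16) & 0x8000);
        int32_t  em   = (int32_t)(bits & 0x7fffffff) - ((127 - 15) << 23);
        if (em < (1 << 23)) return sign;       // underflows half: flush to zero
        return (uint16_t)(sign | (em >> 13));  // truncate to 10 mantissa bits
    }

    // Interleave n float channels back into packed RGBA f16 pixels.
    static void store_f16_scalar(uint64_t* dst, size_t n,
                                 const float* r, const float* g,
                                 const float* b, const float* a) {
        for (size_t i = 0; i < n; i++) {
            dst[i] = (uint64_t)float_to_half(r[i]) <<  0
                   | (uint64_t)float_to_half(g[i]) << 16
                   | (uint64_t)float_to_half(b[i]) << 32
                   | (uint64_t)float_to_half(a[i]) << 48;
        }
    }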
@@ -964,24 +929,8 @@ STAGE(gather_f16) {
const uint64_t* p;
SkNi offset = offset_and_ptr(&p, ctx, r, g);
- // f16 -> f32 conversion works best with tightly packed f16s,
- // so we gather each component rather than using gather().
- uint16_t R[N], G[N], B[N], A[N];
- size_t n = tail ? tail : N;
- for (size_t i = 0; i < n; i++) {
- uint64_t rgba = p[offset[i]];
- R[i] = rgba >> 0;
- G[i] = rgba >> 16;
- B[i] = rgba >> 32;
- A[i] = rgba >> 48;
- }
- for (size_t i = n; i < N; i++) {
- R[i] = G[i] = B[i] = A[i] = 0;
- }
- r = SkHalfToFloat_finite_ftz(SkNh::Load(R));
- g = SkHalfToFloat_finite_ftz(SkNh::Load(G));
- b = SkHalfToFloat_finite_ftz(SkNh::Load(B));
- a = SkHalfToFloat_finite_ftz(SkNh::Load(A));
+ auto px = gather(tail, p, offset);
+ from_f16(&px, &r, &g, &b, &a);
}
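
With load and store unified, gather_f16 gets the same treatment: a generic
gather() fetches one whole uint64_t pixel per lane at the computed offsets,
zeroing lanes past the tail, so the result feeds from_f16() exactly like a
contiguous load. A sketch reusing the hypothetical Vec/N stand-ins from the
first sketch; the mask-load/gather follow-up the commit message mentions would
specialize a helper like this to hardware gather instructions.

    // Generic gather: fetch one value per lane at the given offsets,
    // leaving lanes past `tail` zeroed.
    template <typename T, typename I>
    Vec<T> gather(size_t tail, const T* base, const Vec<I>& offsets) {
        Vec<T> v;                              // lanes start zeroed
        size_t n = tail ? tail : N;
        for (size_t i = 0; i < n; i++) {
            v.lanes[i] = base[offsets.lanes[i]];
        }
        return v;
    }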