diff options
author | 2016-10-20 16:20:46 -0400 | |
---|---|---|
committer | 2016-10-20 21:33:01 +0000 | |
commit | 050ffa9ad5d2bafc935c0a48ce3caed47446be12 (patch) | |
tree | 551f12d226aee77e2f95ef67968f9371e613ff21 | |
parent | 958788ab461bf84e8c604a17dba5ebc2d6c27dfa (diff) |
SkRasterPipeline: memcpy-free tail code.
We don't call the tail code nearly as often as the body code, but when we do, calling memcpy() first forces a vzeroupper to get back into the non-AVX world. That seems to slow things down considerably. You wouldn't think it, but replacing memcpy() with a fall-through switch that copies the tail elements one at a time gives a nice speedup (tested on Windows).
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=3783
Change-Id: I40cbe1e529f2431825edec7638265601b64e7ec5
Reviewed-on: https://skia-review.googlesource.com/3783
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
-rw-r--r-- | src/opts/SkRasterPipeline_opts.h | 66 |
1 files changed, 53 insertions, 13 deletions
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h index 7e884c7cd3..f266433cd3 100644 --- a/src/opts/SkRasterPipeline_opts.h +++ b/src/opts/SkRasterPipeline_opts.h @@ -158,10 +158,18 @@ namespace SK_OPTS_NS { template <bool kIsTail, typename T> SI SkNx<N,T> load(size_t tail, const T* src) { SkASSERT(kIsTail == (tail > 0)); - // TODO: better tail, maskload for 32- and 64-bit T - T buf[N] = {0}; + // TODO: maskload for 32- and 64-bit T + T buf[8]; if (kIsTail) { - memcpy(buf, src, tail*sizeof(T)); + switch (tail & (N-1)) { + case 7: buf[6] = src[6]; + case 6: buf[5] = src[5]; + case 5: buf[4] = src[4]; + case 4: buf[3] = src[3]; + case 3: buf[2] = src[2]; + case 2: buf[1] = src[1]; + } + buf[0] = src[0]; src = buf; } return SkNx<N,T>::Load(src); @@ -170,12 +178,20 @@ namespace SK_OPTS_NS { template <bool kIsTail, typename T> SI void store(size_t tail, const SkNx<N,T>& v, T* dst) { SkASSERT(kIsTail == (tail > 0)); - // TODO: better tail, maskstore for 32- and 64-bit T - T buf[N] = {0}; - v.store(kIsTail ? 
buf : dst); + // TODO: maskstore for 32- and 64-bit T if (kIsTail) { - memcpy(dst, buf, tail*sizeof(T)); + switch (tail & (N-1)) { + case 7: dst[6] = v[6]; + case 6: dst[5] = v[5]; + case 5: dst[4] = v[4]; + case 4: dst[3] = v[3]; + case 3: dst[2] = v[2]; + case 2: dst[1] = v[1]; + } + dst[0] = v[0]; + return; } + v.store(dst); } SI void from_565(const SkNh& _565, SkNf* r, SkNf* g, SkNf* b) { @@ -275,9 +291,17 @@ namespace SK_OPTS_NS { STAGE(load_d_f16, true) { auto ptr = (const uint64_t*)ctx + x; - uint64_t buf[N] = {0}; + uint64_t buf[8]; if (kIsTail) { - memcpy(buf, ptr, tail*sizeof(uint64_t)); + switch (tail & (N-1)) { + case 7: buf[6] = ptr[6]; + case 6: buf[5] = ptr[5]; + case 5: buf[4] = ptr[4]; + case 4: buf[3] = ptr[3]; + case 3: buf[2] = ptr[2]; + case 2: buf[1] = ptr[1]; + } + buf[0] = ptr[0]; ptr = buf; } @@ -292,9 +316,17 @@ namespace SK_OPTS_NS { STAGE(load_s_f16, true) { auto ptr = (const uint64_t*)ctx + x; - uint64_t buf[N] = {0}; + uint64_t buf[8]; if (kIsTail) { - memcpy(buf, ptr, tail*sizeof(uint64_t)); + switch (tail & (N-1)) { + case 7: buf[6] = ptr[6]; + case 6: buf[5] = ptr[5]; + case 5: buf[4] = ptr[4]; + case 4: buf[3] = ptr[3]; + case 3: buf[2] = ptr[2]; + case 2: buf[1] = ptr[1]; + } + buf[0] = ptr[0]; ptr = buf; } @@ -310,13 +342,21 @@ namespace SK_OPTS_NS { clamp_01_premul(r,g,b,a); auto ptr = (uint64_t*)ctx + x; - uint64_t buf[N] = {0}; + uint64_t buf[8]; SkNh::Store4(kIsTail ? buf : ptr, SkFloatToHalf_finite_ftz(r), SkFloatToHalf_finite_ftz(g), SkFloatToHalf_finite_ftz(b), SkFloatToHalf_finite_ftz(a)); if (kIsTail) { - memcpy(ptr, buf, tail*sizeof(uint64_t)); + switch (tail & (N-1)) { + case 7: ptr[6] = buf[6]; + case 6: ptr[5] = buf[5]; + case 5: ptr[4] = buf[4]; + case 4: ptr[3] = buf[3]; + case 3: ptr[2] = buf[2]; + case 2: ptr[1] = buf[1]; + } + ptr[0] = buf[0]; } } |