diff options
author | mtklein <mtklein@google.com> | 2015-04-27 14:51:16 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2015-04-27 14:51:16 -0700 |
commit | 641c3ff7c680ef7935d47d2e68f8301acc79e3de (patch) | |
tree | 0739bd83543af31c22ef25e1b352cedb56a4ee77 /src/opts/SkBlitRow_opts_SSE2.cpp | |
parent | d65dc0cedd5b50dd407b6ff8fdc39123f11511cc (diff) |
Revert of De-proc Color32 (patchset #5 id:80001 of https://codereview.chromium.org/1104183004/)
Reason for revert:
duh
Original issue's description:
> De-proc Color32
>
> Also strips SK_SUPPORT_LEGACY_COLOR32_MATH,
> which is no longer needed.
>
> Seems handy to have SkTypes include the relevant intrinsics when
> we know we've got them, but I'm not married to it.
>
> Locally this looks like a pointlessly small perf win, but I'm mostly
> keen to get all the code together.
>
> BUG=skia:
>
> Committed: https://skia.googlesource.com/skia/+/376e9bc206b69d9190f38dfebb132a8769bbd72b
>
> Committed: https://skia.googlesource.com/skia/+/d65dc0cedd5b50dd407b6ff8fdc39123f11511cc
TBR=reed@google.com,mtklein@chromium.org
NOPRESUBMIT=true
NOTREECHECKS=true
NOTRY=true
BUG=skia:
Review URL: https://codereview.chromium.org/1102363006
Diffstat (limited to 'src/opts/SkBlitRow_opts_SSE2.cpp')
-rw-r--r-- | src/opts/SkBlitRow_opts_SSE2.cpp | 65 |
1 files changed, 65 insertions, 0 deletions
diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp
index 7f5b6779cf..59375f1831 100644
--- a/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/src/opts/SkBlitRow_opts_SSE2.cpp
@@ -232,6 +232,71 @@ void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
     }
 }
 
+#define SK_SUPPORT_LEGACY_COLOR32_MATHx
+
+/* SSE2 version of Color32()
+ * portable version is in core/SkBlitRow_D32.cpp
+ */
+// Color32 and its SIMD specializations use the blend_256_round_alt algorithm
+// from tests/BlendTest.cpp. It's not quite perfect, but it's never wrong in the
+// interesting edge cases, and it's quite a bit faster than blend_perfect.
+//
+// blend_256_round_alt is our currently blessed algorithm. Please use it or an analogous one.
+void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color) {
+    switch (SkGetPackedA32(color)) {
+        case 0: memmove(dst, src, count * sizeof(SkPMColor)); return;
+        case 255: sk_memset32(dst, color, count); return;
+    }
+
+    __m128i colorHigh = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_set1_epi32(color));
+#ifdef SK_SUPPORT_LEGACY_COLOR32_MATH // blend_256_plus1_trunc, busted
+    __m128i colorAndRound = colorHigh;
+#else // blend_256_round_alt, good
+    __m128i colorAndRound = _mm_add_epi16(colorHigh, _mm_set1_epi16(128));
+#endif
+
+    unsigned invA = 255 - SkGetPackedA32(color);
+#ifdef SK_SUPPORT_LEGACY_COLOR32_MATH // blend_256_plus1_trunc, busted
+    __m128i invA16 = _mm_set1_epi16(invA);
+#else // blend_256_round_alt, good
+    SkASSERT(invA + (invA >> 7) < 256); // We should still fit in the low byte here.
+    __m128i invA16 = _mm_set1_epi16(invA + (invA >> 7));
+#endif
+
+    // Does the core work of blending color onto 4 pixels, returning the resulting 4 pixels.
+    auto kernel = [&](const __m128i& src4) -> __m128i {
+        __m128i lo = _mm_mullo_epi16(invA16, _mm_unpacklo_epi8(src4, _mm_setzero_si128())),
+                hi = _mm_mullo_epi16(invA16, _mm_unpackhi_epi8(src4, _mm_setzero_si128()));
+        return _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(colorAndRound, lo), 8),
+                                _mm_srli_epi16(_mm_add_epi16(colorAndRound, hi), 8));
+    };
+
+    while (count >= 8) {
+        __m128i dst0 = kernel(_mm_loadu_si128((const __m128i*)(src+0))),
+                dst4 = kernel(_mm_loadu_si128((const __m128i*)(src+4)));
+        _mm_storeu_si128((__m128i*)(dst+0), dst0);
+        _mm_storeu_si128((__m128i*)(dst+4), dst4);
+        src += 8;
+        dst += 8;
+        count -= 8;
+    }
+    if (count >= 4) {
+        _mm_storeu_si128((__m128i*)dst, kernel(_mm_loadu_si128((const __m128i*)src)));
+        src += 4;
+        dst += 4;
+        count -= 4;
+    }
+    if (count >= 2) {
+        _mm_storel_epi64((__m128i*)dst, kernel(_mm_loadl_epi64((const __m128i*)src)));
+        src += 2;
+        dst += 2;
+        count -= 2;
+    }
+    if (count >= 1) {
+        *dst = _mm_cvtsi128_si32(kernel(_mm_cvtsi32_si128(*src)));
+    }
+}
+
 void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
     SkASSERT(count > 0);
 
 |