From bc927548db17accec2195af6e15053f7918bb3f5 Mon Sep 17 00:00:00 2001 From: herb Date: Mon, 16 May 2016 14:27:51 -0700 Subject: Add tests and benches to support the sRGB blitter for SkOpts 1,370.85 LinearSrcOvericonstrip.pngVSkOptsSSE41 2,359.69 LinearSrcOvericonstrip.pngVSkOptsDefault 1,828.72 LinearSrcOvericonstrip.pngVSkOptsNonSimdCore 3,277.40 LinearSrcOvericonstrip.pngVSkOptsTrivial 9,862.34 LinearSrcOvericonstrip.pngVSkOptsBruteForce 633.55 LinearSrcOvermandrill_512.pngVSkOptsSSE41 684.29 LinearSrcOvermandrill_512.pngVSkOptsDefault 1,201.88 LinearSrcOvermandrill_512.pngVSkOptsNonSimdCore 2,382.63 LinearSrcOvermandrill_512.pngVSkOptsTrivial 10,888.74 LinearSrcOvermandrill_512.pngVSkOptsBruteForce 209.14 LinearSrcOverplane.pngVSkOptsSSE41 562.24 LinearSrcOverplane.pngVSkOptsDefault 272.64 LinearSrcOverplane.pngVSkOptsNonSimdCore 436.46 LinearSrcOverplane.pngVSkOptsTrivial 1,327.23 LinearSrcOverplane.pngVSkOptsBruteForce 318.01 LinearSrcOverbaby_tux.pngVSkOptsSSE41 529.05 LinearSrcOverbaby_tux.pngVSkOptsDefault 441.33 LinearSrcOverbaby_tux.pngVSkOptsNonSimdCore 720.50 LinearSrcOverbaby_tux.pngVSkOptsTrivial 2,191.10 LinearSrcOverbaby_tux.pngVSkOptsBruteForce 479.68 LinearSrcOveryellow_rose.pngVSkOptsSSE41 1,095.03 LinearSrcOveryellow_rose.pngVSkOptsDefault 668.60 LinearSrcOveryellow_rose.pngVSkOptsNonSimdCore 1,257.19 LinearSrcOveryellow_rose.pngVSkOptsTrivial 4,970.25 LinearSrcOveryellow_rose.pngVSkOptsBruteForce BUG=skia: GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1939513002 CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot Committed: https://skia.googlesource.com/skia/+/554784cd85029c05d9ed04b1aeb71520d196153a Review-Url: https://codereview.chromium.org/1939513002 --- src/opts/SkBlend_opts.h | 235 ++++++++++++++++++++++++++++++++++++++++------ src/opts/SkOpts_sse41.cpp | 9 +- 2 files changed, 213 insertions(+), 31 deletions(-) (limited to 'src/opts') diff --git 
a/src/opts/SkBlend_opts.h b/src/opts/SkBlend_opts.h index a1067407be..93946438e5 100644 --- a/src/opts/SkBlend_opts.h +++ b/src/opts/SkBlend_opts.h @@ -5,52 +5,233 @@ * found in the LICENSE file. */ +/* +ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q + */ + #ifndef SkBlend_opts_DEFINED #define SkBlend_opts_DEFINED +#include "SkNx.h" +#include "SkPM4fPriv.h" + namespace SK_OPTS_NS { -#if 0 +// An implementation of SrcOver from bytes to bytes in linear space that takes advantage of the +// observation that the 255's cancel. +// invA = 1 - (As / 255); +// +// R = 255 * sqrt((Rs/255)^2 + (Rd/255)^2 * invA) +// => R = 255 * sqrt((Rs^2 + Rd^2 * invA)/255^2) +// => R = sqrt(Rs^2 + Rd^2 * invA) +static inline void blend_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) { + Sk4f s = srgb_to_linear(to_4f(pixel)); + Sk4f d = srgb_to_linear(to_4f(*dst)); + Sk4f invAlpha = 1.0f - Sk4f{s[SkPM4f::A]} * (1.0f / 255.0f); + Sk4f r = linear_to_srgb(s + d * invAlpha) + 0.5f; + *dst = to_4b(r); +} -#else +static inline void srcover_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) { + if ((~pixel & 0xFF000000) == 0) { + *dst = pixel; + } else if ((pixel & 0xFF000000) != 0) { + blend_srgb_srgb_1(dst, pixel); + } +} + +static inline void srcover_srgb_srgb_2(uint32_t* dst, const uint32_t* src) { + srcover_srgb_srgb_1(dst++, *src++); + srcover_srgb_srgb_1(dst, *src); +} + +static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) { + srcover_srgb_srgb_1(dst++, *src++); + srcover_srgb_srgb_1(dst++, *src++); + srcover_srgb_srgb_1(dst++, *src++); + srcover_srgb_srgb_1(dst, *src); +} + +void best_non_simd_srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { + uint64_t* ddst = reinterpret_cast<uint64_t*>(dst); - static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) { - switch (src >> 24) { - case 0x00: return; - case 0xff: *dst = src; 
return; + while (ndst >0) { + int count = SkTMin(ndst, nsrc); + ndst -= count; + const uint64_t* dsrc = reinterpret_cast<const uint64_t*>(src); + const uint64_t* end = dsrc + (count >> 1); + do { + if ((~*dsrc & 0xFF000000FF000000) == 0) { + do { + *ddst++ = *dsrc++; + } while (dsrc < end && (~*dsrc & 0xFF000000FF000000) == 0); + } else if ((*dsrc & 0xFF000000FF000000) == 0) { + do { + dsrc++; + ddst++; + } while (dsrc < end && (*dsrc & 0xFF000000FF000000) == 0); + } else { + srcover_srgb_srgb_2(reinterpret_cast<uint32_t*>(ddst++), + reinterpret_cast<const uint32_t*>(dsrc++)); + } + } while (dsrc < end); + + if ((count & 1) != 0) { + srcover_srgb_srgb_1(reinterpret_cast<uint32_t*>(ddst), + *reinterpret_cast<const uint32_t*>(dsrc)); + } + } +} + +void brute_force_srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { + while (ndst > 0) { + int n = SkTMin(ndst, nsrc); + + for (int i = 0; i < n; i++) { + blend_srgb_srgb_1(dst++, src[i]); } + ndst -= n; + } +} - Sk4f d = SkNx_cast<float>(Sk4b::Load( dst)), - s = SkNx_cast<float>(Sk4b::Load(&src)); +void trivial_srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { + while (ndst > 0) { + int n = SkTMin(ndst, nsrc); - // Approximate sRGB gamma as 2.0. - Sk4f d_sq = d*d, - s_sq = s*s; - d = Sk4f{d_sq[0], d_sq[1], d_sq[2], d[3]}; - s = Sk4f{s_sq[0], s_sq[1], s_sq[2], s[3]}; + for (int i = 0; i < n; i++) { + srcover_srgb_srgb_1(dst++, src[i]); + } + ndst -= n; + } +} - // SrcOver. - Sk4f invA = 1.0f - s[3]*(1/255.0f); - d = s + d * invA; +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 - // Re-apply approximate sRGB gamma. 
- Sk4f d_sqrt = d.sqrt(); - d = Sk4f{d_sqrt[0], d_sqrt[1], d_sqrt[2], d[3]}; + static inline __m128i load(const uint32_t* p) { + return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)); + } - SkNx_cast<uint8_t>(d).store(dst); + static inline void store(uint32_t* p, __m128i v) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v); } - static inline void srcover_srgb_srgb(uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { - while (ndst > 0) { - int n = SkTMin(ndst, nsrc); + #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 + + void srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { + const __m128i alphaMask = _mm_set1_epi32(0xFF000000); + while (ndst > 0) { + int count = SkTMin(ndst, nsrc); + ndst -= count; + const uint32_t* src = srcStart; + const uint32_t* end = src + (count & ~3); - for (int i = 0; i < n; i++) { - srcover_srgb_srgb_1(dst++, src[i]); + while (src < end) { + __m128i pixels = load(src); + if (_mm_testc_si128(pixels, alphaMask)) { + do { + store(dst, pixels); + dst += 4; + src += 4; + } while (src < end && _mm_testc_si128(pixels = load(src), alphaMask)); + } else if (_mm_testz_si128(pixels, alphaMask)) { + do { + dst += 4; + src += 4; + } while (src < end && _mm_testz_si128(pixels = load(src), alphaMask)); + } else { + do { + srcover_srgb_srgb_4(dst, src); + dst += 4; + src += 4; + } while (src < end && _mm_testnzc_si128(pixels = load(src), alphaMask)); + } + } + + count = count & 3; + while (count-- > 0) { + srcover_srgb_srgb_1(dst++, *src++); + } } - ndst -= n; } + #else + // SSE2 versions + static inline bool check_opaque_alphas(__m128i pixels) { + int mask = + _mm_movemask_epi8( + _mm_cmpeq_epi32( + _mm_andnot_si128(pixels, _mm_set1_epi32(0xFF000000)), + _mm_setzero_si128())); + return mask == 0xFFFF; + } + + static inline bool check_transparent_alphas(__m128i pixels) { + int mask = + _mm_movemask_epi8( + _mm_cmpeq_epi32( + _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)), + _mm_setzero_si128())); + return 
mask == 0xFFFF; + } + + static inline bool check_partial_alphas(__m128i pixels) { + __m128i alphas = _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)); + int mask = + _mm_movemask_epi8( + _mm_cmpeq_epi8( + _mm_srai_epi32(alphas, 8), + alphas)); + return mask == 0xFFFF; + } + + void srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { + while (ndst > 0) { + int count = SkTMin(ndst, nsrc); + ndst -= count; + const uint32_t* src = srcStart; + const uint32_t* end = src + (count & ~3); + + __m128i pixels = load(src); + do { + if (check_opaque_alphas(pixels)) { + do { + store(dst, pixels); + dst += 4; + src += 4; + } while (src < end && check_opaque_alphas(pixels = load(src))); + } else if (check_transparent_alphas(pixels)) { + const uint32_t* start = src; + do { + src += 4; + } while (src < end && check_transparent_alphas(pixels = load(src))); + dst += src - start; + } else { + do { + srcover_srgb_srgb_4(dst, src); + dst += 4; + src += 4; + } while (src < end && check_partial_alphas(pixels = load(src))); + } + } while (src < end); + + count = count & 3; + while (count-- > 0) { + srcover_srgb_srgb_1(dst++, *src++); + } + } + } + #endif +#else + + void srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { + trivial_srcover_srgb_srgb(dst, src, ndst, nsrc); } - + #endif } // namespace SK_OPTS_NS diff --git a/src/opts/SkOpts_sse41.cpp b/src/opts/SkOpts_sse41.cpp index 34b078c2ca..f0561a69c6 100644 --- a/src/opts/SkOpts_sse41.cpp +++ b/src/opts/SkOpts_sse41.cpp @@ -10,6 +10,7 @@ #define SK_OPTS_NS sk_sse41 #include "SkBlurImageFilter_opts.h" #include "SkBlitRow_opts.h" +#include "SkBlend_opts.h" #ifndef SK_SUPPORT_LEGACY_X86_BLITS @@ -211,16 +212,16 @@ static void blit_mask_d32_a8(SkPMColor* dst, size_t dstRB, } } } - } // namespace sk_sse41_new #endif namespace SkOpts { void Init_sse41() { - box_blur_xx = sk_sse41::box_blur_xx; - box_blur_xy = sk_sse41::box_blur_xy; - box_blur_yx = 
sk_sse41::box_blur_yx; + box_blur_xx = sk_sse41::box_blur_xx; + box_blur_xy = sk_sse41::box_blur_xy; + box_blur_yx = sk_sse41::box_blur_yx; + srcover_srgb_srgb = sk_sse41::srcover_srgb_srgb; #ifndef SK_SUPPORT_LEGACY_X86_BLITS blit_row_color32 = sk_sse41_new::blit_row_color32; -- cgit v1.2.3