/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

/*
ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q
*/

#ifndef SkBlend_opts_DEFINED
#define SkBlend_opts_DEFINED

#include "SkNx.h"
#include "SkPM4fPriv.h"

namespace SK_OPTS_NS {

// An implementation of SrcOver from bytes to bytes in linear space that takes advantage of the
// observation that the 255's cancel.
//    invA = 1 - (As / 255);
//
//    R = 255 * sqrt((Rs/255)^2 + (Rd/255)^2 * invA)
// => R = 255 * sqrt((Rs^2 + Rd^2 * invA)/255^2)
// => R = sqrt(Rs^2 + Rd^2 * invA)
static inline void blend_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) {
    Sk4f s = srgb_to_linear(to_4f(pixel));
    Sk4f d = srgb_to_linear(to_4f(*dst));
    Sk4f invAlpha = 1.0f - Sk4f{s[SkPM4f::A]} * (1.0f / 255.0f);
    Sk4f r = linear_to_srgb(s + d * invAlpha) + 0.5f;
    *dst = to_4b(r);
}
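// For intuition, a minimal scalar sketch of what blend_srgb_srgb_1 computes per color channel,
// written against the gamma-2.0 approximation used in the derivation above. Illustrative only:
// plain floats instead of Sk4f, and the helper name is ours, not Skia's. The real code above
// uses the exact srgb_to_linear / linear_to_srgb transforms instead.
//
//     #include <cmath>
//     #include <cstdint>
//
//     // One color channel of SrcOver on sRGB bytes; invA = 1 - As/255 from the source alpha.
//     inline uint8_t srcover_channel_sketch(uint8_t sc, uint8_t dc, float invA) {
//         // Squaring approximates sRGB -> linear; the 255's cancel, so no rescaling is needed.
//         float r = std::sqrt(float(sc) * float(sc) + float(dc) * float(dc) * invA);
//         return uint8_t(r + 0.5f);   // round back to a byte: R = sqrt(Rs^2 + Rd^2 * invA)
//     }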
// SrcOver one pixel: a fully opaque source replaces dst, a fully transparent source leaves dst
// untouched, and only partial coverage pays for the full blend.
static inline void srcover_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) {
    if ((~pixel & 0xFF000000) == 0) {        // alpha == 0xFF: opaque, just copy.
        *dst = pixel;
    } else if ((pixel & 0xFF000000) != 0) {  // 0 < alpha < 0xFF: blend.
        blend_srgb_srgb_1(dst, pixel);
    }                                        // alpha == 0: dst is unchanged.
}

static inline void srcover_srgb_srgb_2(uint32_t* dst, const uint32_t* src) {
    srcover_srgb_srgb_1(dst++, *src++);
    srcover_srgb_srgb_1(dst,   *src);
}

static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) {
    srcover_srgb_srgb_1(dst++, *src++);
    srcover_srgb_srgb_1(dst++, *src++);
    srcover_srgb_srgb_1(dst++, *src++);
    srcover_srgb_srgb_1(dst,   *src);
}

void best_non_simd_srcover_srgb_srgb(
    uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
    uint64_t* ddst = reinterpret_cast<uint64_t*>(dst);

    while (ndst > 0) {
        int count = SkTMin(ndst, nsrc);
        ndst -= count;
        const uint64_t* dsrc = reinterpret_cast<const uint64_t*>(src);
        const uint64_t* end  = dsrc + (count >> 1);
        // Work on pixel pairs, classifying each pair by its two alphas.
        while (dsrc < end) {
            if ((~*dsrc & 0xFF000000FF000000) == 0) {
                // Both pixels opaque: copy runs of opaque pairs.
                do {
                    *ddst++ = *dsrc++;
                } while (dsrc < end && (~*dsrc & 0xFF000000FF000000) == 0);
            } else if ((*dsrc & 0xFF000000FF000000) == 0) {
                // Both pixels transparent: skip runs of transparent pairs.
                do {
                    dsrc++;
                    ddst++;
                } while (dsrc < end && (*dsrc & 0xFF000000FF000000) == 0);
            } else {
                // Mixed alphas: blend this pair.
                srcover_srgb_srgb_2(reinterpret_cast<uint32_t*>(ddst++),
                                    reinterpret_cast<const uint32_t*>(dsrc++));
            }
        }

        // Handle the odd trailing pixel, if any.
        if ((count & 1) != 0) {
            srcover_srgb_srgb_1(reinterpret_cast<uint32_t*>(ddst),
                                *reinterpret_cast<const uint32_t*>(dsrc));
        }
    }
}

void brute_force_srcover_srgb_srgb(
    uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
    while (ndst > 0) {
        int n = SkTMin(ndst, nsrc);

        for (int i = 0; i < n; i++) {
            blend_srgb_srgb_1(dst++, src[i]);
        }
        ndst -= n;
    }
}

void trivial_srcover_srgb_srgb(
    uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
    while (ndst > 0) {
        int n = SkTMin(ndst, nsrc);

        for (int i = 0; i < n; i++) {
            srcover_srgb_srgb_1(dst++, src[i]);
        }
        ndst -= n;
    }
}

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

    static inline __m128i load(const uint32_t* p) {
        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
    }

    static inline void store(uint32_t* p, __m128i v) {
        _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v);
    }

    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41

    void srcover_srgb_srgb(
        uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
        const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
        while (ndst > 0) {
            int count = SkTMin(ndst, nsrc);
            ndst -= count;
            const uint32_t* src = srcStart;
            const uint32_t* end = dst + (count & ~3);
            ptrdiff_t delta = src - dst;

            while (dst < end) {
                __m128i pixels = load(src);
                if (_mm_testc_si128(pixels, alphaMask)) {
                    // All four alphas are 0xFF: copy runs of opaque pixels.
                    uint32_t* start = dst;
                    do {
                        store(dst, pixels);
                        dst += 4;
                    } while (dst < end
                             && _mm_testc_si128(pixels = load(dst + delta), alphaMask));
                    src += dst - start;
                } else if (_mm_testz_si128(pixels, alphaMask)) {
                    // All four alphas are 0: skip runs of transparent pixels.
                    do {
                        dst += 4;
                        src += 4;
                    } while (dst < end
                             && _mm_testz_si128(pixels = load(src), alphaMask));
                } else {
                    // Mixed alphas: blend four pixels at a time.
                    uint32_t* start = dst;
                    do {
                        srcover_srgb_srgb_4(dst, dst + delta);
                        dst += 4;
                    } while (dst < end
                             && _mm_testnzc_si128(pixels = load(dst + delta), alphaMask));
                    src += dst - start;
                }
            }

            // Handle the 0-3 remaining pixels.
            count = count & 3;
            while (count-- > 0) {
                srcover_srgb_srgb_1(dst++, *src++);
            }
        }
    }

    #else
    // SSE2 versions

    // Note: in the next three comparisons a group of 4 pixels is converted to a group of
    // "signed" pixels because SSE2 does not have an unsigned comparison.
    // Biasing by 0x80000000 lets us use the signed comparison operators: it maps
    // 0x00xxxxxx to 0x80xxxxxx, the smallest signed values, and 0xFFxxxxxx to
    // 0x7Fxxxxxx, the largest.
    static inline bool check_opaque_alphas(__m128i pixels) {
        __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
        int mask = _mm_movemask_epi8(
            _mm_cmplt_epi32(signedPixels, _mm_set1_epi32(0x7F000000)));
        return mask == 0;
    }

    static inline bool check_transparent_alphas(__m128i pixels) {
        __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
        int mask = _mm_movemask_epi8(
            _mm_cmpgt_epi32(signedPixels, _mm_set1_epi32(0x80FFFFFF)));
        return mask == 0;
    }

    static inline bool check_partial_alphas(__m128i pixels) {
        __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
        __m128i opaque       = _mm_cmplt_epi32(signedPixels, _mm_set1_epi32(0x7F000000));
        __m128i transparent  = _mm_cmpgt_epi32(signedPixels, _mm_set1_epi32(0x80FFFFFF));
        int mask = _mm_movemask_epi8(_mm_xor_si128(opaque, transparent));
        return mask == 0;
    }

    void srcover_srgb_srgb(
        uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
        while (ndst > 0) {
            int count = SkTMin(ndst, nsrc);
            ndst -= count;
            const uint32_t* src = srcStart;
            const uint32_t* end = dst + (count & ~3);
            const ptrdiff_t delta = src - dst;

            __m128i pixels = load(src);
            while (dst < end) {
                if (check_opaque_alphas(pixels)) {
                    // All four alphas are 0xFF: copy runs of opaque pixels.
                    uint32_t* start = dst;
                    do {
                        store(dst, pixels);
                        dst += 4;
                    } while (dst < end && check_opaque_alphas(pixels = load(dst + delta)));
                    src += dst - start;
                } else if (check_transparent_alphas(pixels)) {
                    // All four alphas are 0: skip runs of transparent pixels.
                    const uint32_t* start = dst;
                    do {
                        dst += 4;
                    } while (dst < end && check_transparent_alphas(pixels = load(dst + delta)));
                    src += dst - start;
                } else {
                    // Mixed alphas: blend four pixels at a time.
                    const uint32_t* start = dst;
                    do {
                        srcover_srgb_srgb_4(dst, dst + delta);
                        dst += 4;
                    } while (dst < end && check_partial_alphas(pixels = load(dst + delta)));
                    src += dst - start;
                }
            }

            // Handle the 0-3 remaining pixels.
            count = count & 3;
            while (count-- > 0) {
                srcover_srgb_srgb_1(dst++, *src++);
            }
        }
    }
    #endif

#else

    void srcover_srgb_srgb(
        uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
        trivial_srcover_srgb_srgb(dst, src, ndst, nsrc);
    }

#endif

}  // namespace SK_OPTS_NS

#endif  // SkBlend_opts_DEFINED
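// A minimal usage sketch (hypothetical caller, not part of this header). All of the
// *_srcover_srgb_srgb routines above reset src to its start after every nsrc pixels, i.e.
// they tile an nsrc-pixel source pattern across ndst destination pixels:
//
//     uint32_t dst[256];   // destination pixels, assumed already initialized
//     uint32_t src[8];     // an 8-pixel source pattern, alpha in the top byte (0xAAxxxxxx)
//     // ... fill dst and src ...
//     SK_OPTS_NS::srcover_srgb_srgb(dst, src, 256, 8);   // tiles src over all 256 dst pixels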