diff options
author | herb <herb@google.com> | 2016-05-23 13:50:12 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2016-05-23 13:50:12 -0700 |
commit | 074b48ecb5ed8f9b25039477794437ae853d85c4 (patch) | |
tree | 3188dbfc96a1e64c52c22d6c383c82f3dfd31af6 /src | |
parent | 1d1559620058365e0de25636f1bcf07fcc071c3d (diff) |
I have found a more efficient way of detecting 1 and 0 alpha in SSE2. In addition, I found a stall on an execution unit for the lea instruction and rearranged the code to avoid that.
Before
1,362.01 LinearSrcOvericonstrip.pngVSkOptsSSE41
2,132.54 LinearSrcOvericonstrip.pngVSkOptsDefault
1,717.77 LinearSrcOvericonstrip.pngVSkOptsNonSimdCore
3,525.14 LinearSrcOvericonstrip.pngVSkOptsTrivial
11,181.78 LinearSrcOvericonstrip.pngVSkOptsBruteForce
644.77 LinearSrcOvermandrill_512.pngVSkOptsSSE41
682.51 LinearSrcOvermandrill_512.pngVSkOptsDefault
1,169.65 LinearSrcOvermandrill_512.pngVSkOptsNonSimdCore
2,486.45 LinearSrcOvermandrill_512.pngVSkOptsTrivial
11,635.94 LinearSrcOvermandrill_512.pngVSkOptsBruteForce
217.76 LinearSrcOverplane.pngVSkOptsSSE41
437.09 LinearSrcOverplane.pngVSkOptsDefault
275.91 LinearSrcOverplane.pngVSkOptsNonSimdCore
481.70 LinearSrcOverplane.pngVSkOptsTrivial
1,504.66 LinearSrcOverplane.pngVSkOptsBruteForce
323.90 LinearSrcOverbaby_tux.pngVSkOptsSSE41
497.49 LinearSrcOverbaby_tux.pngVSkOptsDefault
456.08 LinearSrcOverbaby_tux.pngVSkOptsNonSimdCore
786.46 LinearSrcOverbaby_tux.pngVSkOptsTrivial
2,554.65 LinearSrcOverbaby_tux.pngVSkOptsBruteForce
484.83 LinearSrcOveryellow_rose.pngVSkOptsSSE41
821.86 LinearSrcOveryellow_rose.pngVSkOptsDefault
655.37 LinearSrcOveryellow_rose.pngVSkOptsNonSimdCore
1,323.80 LinearSrcOveryellow_rose.pngVSkOptsTrivial
5,802.61 LinearSrcOveryellow_rose.pngVSkOptsBruteForce
After changes to sse2 and sse4.1
1,343.12 LinearSrcOvericonstrip.pngVSkOptsSSE41
1,441.17 LinearSrcOvericonstrip.pngVSkOptsDefault
1,679.97 LinearSrcOvericonstrip.pngVSkOptsNonSimdCore
3,481.05 LinearSrcOvericonstrip.pngVSkOptsTrivial
10,979.99 LinearSrcOvericonstrip.pngVSkOptsBruteForce
574.17 LinearSrcOvermandrill_512.pngVSkOptsSSE41
641.40 LinearSrcOvermandrill_512.pngVSkOptsDefault
1,169.44 LinearSrcOvermandrill_512.pngVSkOptsNonSimdCore
2,359.84 LinearSrcOvermandrill_512.pngVSkOptsTrivial
12,106.02 LinearSrcOvermandrill_512.pngVSkOptsBruteForce
209.95 LinearSrcOverplane.pngVSkOptsSSE41
249.12 LinearSrcOverplane.pngVSkOptsDefault
270.36 LinearSrcOverplane.pngVSkOptsNonSimdCore
466.30 LinearSrcOverplane.pngVSkOptsTrivial
1,431.14 LinearSrcOverplane.pngVSkOptsBruteForce
309.70 LinearSrcOverbaby_tux.pngVSkOptsSSE41
354.86 LinearSrcOverbaby_tux.pngVSkOptsDefault
442.69 LinearSrcOverbaby_tux.pngVSkOptsNonSimdCore
764.12 LinearSrcOverbaby_tux.pngVSkOptsTrivial
2,756.16 LinearSrcOverbaby_tux.pngVSkOptsBruteForce
457.70 LinearSrcOveryellow_rose.pngVSkOptsSSE41
500.50 LinearSrcOveryellow_rose.pngVSkOptsDefault
677.84 LinearSrcOveryellow_rose.pngVSkOptsNonSimdCore
1,301.50 LinearSrcOveryellow_rose.pngVSkOptsTrivial
5,786.40 LinearSrcOveryellow_rose.pngVSkOptsBruteForce
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1998373002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review-Url: https://codereview.chromium.org/1998373002
Diffstat (limited to 'src')
-rw-r--r-- | src/opts/SkBlend_opts.h | 79 |
1 files changed, 45 insertions, 34 deletions
diff --git a/src/opts/SkBlend_opts.h b/src/opts/SkBlend_opts.h index 93946438e5..2e6eff6277 100644 --- a/src/opts/SkBlend_opts.h +++ b/src/opts/SkBlend_opts.h @@ -127,27 +127,33 @@ void trivial_srcover_srgb_srgb( int count = SkTMin(ndst, nsrc); ndst -= count; const uint32_t* src = srcStart; - const uint32_t* end = src + (count & ~3); + const uint32_t* end = dst + (count & ~3); + ptrdiff_t delta = src - dst; - while (src < end) { + while (dst < end) { __m128i pixels = load(src); if (_mm_testc_si128(pixels, alphaMask)) { + uint32_t* start = dst; do { store(dst, pixels); dst += 4; - src += 4; - } while (src < end && _mm_testc_si128(pixels = load(src), alphaMask)); + } while (dst < end + && _mm_testc_si128(pixels = load(dst + delta), alphaMask)); + src += dst - start; } else if (_mm_testz_si128(pixels, alphaMask)) { do { dst += 4; src += 4; - } while (src < end && _mm_testz_si128(pixels = load(src), alphaMask)); + } while (dst < end + && _mm_testz_si128(pixels = load(src), alphaMask)); } else { + uint32_t* start = dst; do { - srcover_srgb_srgb_4(dst, src); + srcover_srgb_srgb_4(dst, dst + delta); dst += 4; - src += 4; - } while (src < end && _mm_testnzc_si128(pixels = load(src), alphaMask)); + } while (dst < end + && _mm_testnzc_si128(pixels = load(dst + delta), alphaMask)); + src += dst - start; } } @@ -159,32 +165,34 @@ void trivial_srcover_srgb_srgb( } #else // SSE2 versions + + // Note: In the next three comparisons a group of 4 pixels is converted to a group of + // "signed" pixels because the sse2 does not have an unsigned comparison. + // Make it so that we can use the signed comparison operators by biasing + // 0x00xxxxxx to 0x80xxxxxxx which is the smallest values and biasing 0xffxxxxxx to + // 0x7fxxxxxx which is the largest set of values. 
static inline bool check_opaque_alphas(__m128i pixels) { + __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000)); int mask = _mm_movemask_epi8( - _mm_cmpeq_epi32( - _mm_andnot_si128(pixels, _mm_set1_epi32(0xFF000000)), - _mm_setzero_si128())); - return mask == 0xFFFF; + _mm_cmplt_epi32(signedPixels, _mm_set1_epi32(0x7F000000))); + return mask == 0; } static inline bool check_transparent_alphas(__m128i pixels) { + __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000)); int mask = _mm_movemask_epi8( - _mm_cmpeq_epi32( - _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)), - _mm_setzero_si128())); - return mask == 0xFFFF; + _mm_cmpgt_epi32(signedPixels, _mm_set1_epi32(0x80FFFFFF))); + return mask == 0; } static inline bool check_partial_alphas(__m128i pixels) { - __m128i alphas = _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)); - int mask = - _mm_movemask_epi8( - _mm_cmpeq_epi8( - _mm_srai_epi32(alphas, 8), - alphas)); - return mask == 0xFFFF; + __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000)); + __m128i opaque = _mm_cmplt_epi32(signedPixels, _mm_set1_epi32(0x7F000000)); + __m128i transparent = _mm_cmpgt_epi32(signedPixels, _mm_set1_epi32(0x80FFFFFF)); + int mask = _mm_movemask_epi8(_mm_xor_si128(opaque, transparent)); + return mask == 0; } void srcover_srgb_srgb( @@ -193,30 +201,33 @@ void trivial_srcover_srgb_srgb( int count = SkTMin(ndst, nsrc); ndst -= count; const uint32_t* src = srcStart; - const uint32_t* end = src + (count & ~3); + const uint32_t* end = dst + (count & ~3); + const ptrdiff_t delta = src - dst; __m128i pixels = load(src); do { if (check_opaque_alphas(pixels)) { + uint32_t* start = dst; do { store(dst, pixels); dst += 4; - src += 4; - } while (src < end && check_opaque_alphas(pixels = load(src))); + } while (dst < end && check_opaque_alphas((pixels = load(dst + delta)))); + src += dst - start; } else if (check_transparent_alphas(pixels)) { - const uint32_t* start = src; + const 
uint32_t* start = dst; do { - src += 4; - } while (src < end && check_transparent_alphas(pixels = load(src))); - dst += src - start; + dst += 4; + } while (dst < end && check_transparent_alphas(pixels = load(dst + delta))); + src += dst - start; } else { + const uint32_t* start = dst; do { - srcover_srgb_srgb_4(dst, src); + srcover_srgb_srgb_4(dst, dst + delta); dst += 4; - src += 4; - } while (src < end && check_partial_alphas(pixels = load(src))); + } while (dst < end && check_partial_alphas(pixels = load(dst + delta))); + src += dst - start; } - } while (src < end); + } while (dst < end); count = count & 3; while (count-- > 0) { |