diff options
author | msarett <msarett@google.com> | 2016-01-19 13:17:58 -0800 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2016-01-19 13:17:58 -0800 |
commit | 53b9d29b973f2828624f097bf110f1c7acc4b593 (patch) | |
tree | 1f42436b2827d2b7426ad7f0cef72b3721782539 /src | |
parent | bc161d6fd75a15d9d2e4e626028c99978068fe46 (diff) |
Add SSSE3 Optimizations for premul and swap
Improves decode performance for RGBA PNGs.
Swizzler Time on z620 (clang):
SwapPremul 0.24x
Premul 0.24x
Swap 0.37x
Decode Time on z620 (clang):
Premul ZeroInit Decodes 0.88x
Unpremul ZeroInit Decodes 0.94x
Premul Regular Decodes 0.91x
Unpremul Regular Decodes 0.98x
Swizzler Time on Dell Venue 8 (gcc):
SwapPremul 0.14x
Premul 0.14x
Swap 0.08x
Decode Time on Dell Venue 8 (gcc):
Premul ZeroInit Decodes 0.79x
Premul Regular Decodes 0.77x
Note:
ZeroInit means memory is zero initialized, and we do not write to
memory for large sections of zero pixels (memory use opt for Android).
BUG=skia:4767
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1601883002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review URL: https://codereview.chromium.org/1601883002
Diffstat (limited to 'src')
-rw-r--r-- | src/opts/SkOpts_ssse3.cpp | 5 | ||||
-rw-r--r-- | src/opts/SkSwizzler_opts.h | 96 |
2 files changed, 101 insertions, 0 deletions
diff --git a/src/opts/SkOpts_ssse3.cpp b/src/opts/SkOpts_ssse3.cpp index 675cdaa23e..5378377d1e 100644 --- a/src/opts/SkOpts_ssse3.cpp +++ b/src/opts/SkOpts_ssse3.cpp @@ -9,6 +9,7 @@ #define SK_OPTS_NS sk_ssse3 #include "SkBlitMask_opts.h" #include "SkColorCubeFilter_opts.h" +#include "SkSwizzler_opts.h" #include "SkXfermode_opts.h" namespace SkOpts { @@ -16,5 +17,9 @@ namespace SkOpts { create_xfermode = sk_ssse3::create_xfermode; blit_mask_d32_a8 = sk_ssse3::blit_mask_d32_a8; color_cube_filter_span = sk_ssse3::color_cube_filter_span; + + premul_xxxa = sk_ssse3::premul_xxxa; + premul_swaprb_xxxa = sk_ssse3::premul_swaprb_xxxa; + swaprb_xxxa = sk_ssse3::swaprb_xxxa; } } diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h index 3c0bf5cf2c..b0cf4cad53 100644 --- a/src/opts/SkSwizzler_opts.h +++ b/src/opts/SkSwizzler_opts.h @@ -168,6 +168,102 @@ static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { swaprb_xxxa_portable(dst, src, count); } +#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 + +template <bool kSwapRB> +static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { + + auto premul8 = [](__m128i* lo, __m128i* hi) { + const __m128i zeros = _mm_setzero_si128(); + const __m128i _128 = _mm_set1_epi16(128); + const __m128i _257 = _mm_set1_epi16(257); + __m128i planar; + if (kSwapRB) { + planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); + } else { + planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); + } + + // Swizzle the pixels to 8-bit planar. + *lo = _mm_shuffle_epi8(*lo, planar); // bbbbgggg rrrraaaa + *hi = _mm_shuffle_epi8(*hi, planar); // BBBBGGGG RRRRAAAA + __m128i bg = _mm_unpacklo_epi32(*lo, *hi), // bbbbBBBB ggggGGGG + ra = _mm_unpackhi_epi32(*lo, *hi); // rrrrRRRR aaaaAAAA + + // Unpack to 16-bit planar. 
+ __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ B_B_B_B_ + g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ G_G_G_G_ + r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ R_R_R_R_ + a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ A_A_A_A_ + + // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. + b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); + g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257); + r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257); + + // Repack into interlaced pixels. + bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg BGBGBGBG + ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara RARARARA + *lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bgrabgra + *hi = _mm_unpackhi_epi16(bg, ra); // BGRABGRA BGRABGRA + }; + + while (count >= 8) { + __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), + hi = _mm_loadu_si128((const __m128i*) (src + 4)); + + premul8(&lo, &hi); + + _mm_storeu_si128((__m128i*) (dst + 0), lo); + _mm_storeu_si128((__m128i*) (dst + 4), hi); + + src += 8; + dst += 8; + count -= 8; + } + + if (count >= 4) { + __m128i lo = _mm_loadu_si128((const __m128i*) src), + hi = _mm_setzero_si128(); + + premul8(&lo, &hi); + + _mm_storeu_si128((__m128i*) dst, lo); + + src += 4; + dst += 4; + count -= 4; + } + + // Call portable code to finish up the tail of [0,4) pixels. + auto proc = kSwapRB ? 
+ premul_swaprb_xxxa_portable : premul_xxxa_portable; + proc(dst, src, count); +} + +static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { + premul_xxxa_should_swaprb<false>(dst, src, count); +} + +static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { + premul_xxxa_should_swaprb<true>(dst, src, count); +} + +static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { + const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15); + + while (count >= 4) { + __m128i bgra = _mm_loadu_si128((const __m128i*) src); + __m128i rgba = _mm_shuffle_epi8(bgra, swapRB); + _mm_storeu_si128((__m128i*) dst, rgba); + + src += 4; + dst += 4; + count -= 4; + } + + swaprb_xxxa_portable(dst, src, count); +} + #else static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { |