diff options
author | henrik.smiding <henrik.smiding@intel.com> | 2015-03-20 09:20:46 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2015-03-20 09:20:46 -0700 |
commit | 70840cbd898df67f603987213164c798415d76bf (patch) | |
tree | d6119805a17a04ace94a75748204621f090b4fe2 /src/opts/SkBlitRow_opts_SSE4.cpp | |
parent | 86ad8d643624a55b02e529100bbe4e2940115fa1 (diff) |
Replace SSE optimization of Color32A_D565
Adds an SSE2 version of the Color32A_D565 function, to replace
the existing SSE4 version. Also does some minor cleanup.
Improves performance in the following Skia benchmarks.
Measured on Atom Silvermont:
Xfermode_SrcOver - x3
luma_colorfilter_large - x4.6
luma_colorfilter_small - x2
tablebench - ~15%
chart_bw - ~10%
Measured on Core i7 Haswell:
luma_colorfilter_large running SSE2 - x2
luma_colorfilter_large running SSE4 - x2.3
Also improves performance in WPS Office application and 2D subtest of 0xbenchmark on Android.
Signed-off-by: Henrik Smiding <henrik.smiding@intel.com>
Review URL: https://codereview.chromium.org/923523002
Diffstat (limited to 'src/opts/SkBlitRow_opts_SSE4.cpp')
-rw-r--r-- | src/opts/SkBlitRow_opts_SSE4.cpp | 79 |
1 files changed, 1 insertions, 78 deletions
diff --git a/src/opts/SkBlitRow_opts_SSE4.cpp b/src/opts/SkBlitRow_opts_SSE4.cpp index f4273d27b4..3649d175ef 100644 --- a/src/opts/SkBlitRow_opts_SSE4.cpp +++ b/src/opts/SkBlitRow_opts_SSE4.cpp @@ -7,14 +7,9 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_REST sk_throw(); } -void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) { - sk_throw(); -} - #else -#include <smmintrin.h> // SSE4.1 intrinsics - +#include <smmintrin.h> // SSE4.1 intrinsics #include "SkColorPriv.h" #include "SkColor_opts_SSE2.h" @@ -66,76 +61,4 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, } } -static inline uint16_t Color32A_D565_1x(uint16_t dst, unsigned scale, uint32_t src_expand) { - uint32_t dst_expand = SkExpand_rgb_16(dst) * scale; - return SkCompact_rgb_16((src_expand + dst_expand) >> 5); -} - -void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) { - SkASSERT(count > 0); - - uint32_t src_expand = (SkGetPackedG32(src) << 24) | - (SkGetPackedR32(src) << 13) | - (SkGetPackedB32(src) << 2); - unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; - - // Check if we have enough pixels to run SIMD - if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) { - __m128i* dst_wide; - const __m128i src_expand_wide = _mm_set1_epi32(src_expand); - const __m128i scale_wide = _mm_set1_epi32(scale); - const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE | - SK_B16_MASK_IN_PLACE | - (SK_G16_MASK_IN_PLACE << 16)); - - // Align dst to an even 16 byte address (0-7 pixels) - while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) { - *dst = Color32A_D565_1x(*dst, scale, src_expand); - dst += 1; - count--; - } - - dst_wide = reinterpret_cast<__m128i*>(dst); - do { - // Load 8 RGB565 pixels - __m128i pixels = _mm_load_si128(dst_wide); - - // Duplicate and mask - __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels); - pixels_high = _mm_and_si128(mask_green, pixels_high); - pixels = 
_mm_unpacklo_epi16(pixels, pixels); - pixels = _mm_and_si128(mask_green, pixels); - - // Scale with alpha - pixels_high = _mm_mullo_epi32(pixels_high, scale_wide); - pixels = _mm_mullo_epi32(pixels, scale_wide); - - // Add src_expand_wide and shift down again - pixels_high = _mm_add_epi32(pixels_high, src_expand_wide); - pixels_high = _mm_srli_epi32(pixels_high, 5); - pixels = _mm_add_epi32(pixels, src_expand_wide); - pixels = _mm_srli_epi32(pixels, 5); - - // Mask - pixels_high = _mm_and_si128(mask_green, pixels_high); - pixels = _mm_and_si128(mask_green, pixels); - - // Combine into RGB565 and store - pixels = _mm_hadd_epi16(pixels, pixels_high); - _mm_store_si128(dst_wide, pixels); - count -= 8; - dst_wide++; - } while (count >= 8); - - dst = reinterpret_cast<uint16_t*>(dst_wide); - } - - // Small loop to handle remaining pixels. - while (count > 0) { - *dst = Color32A_D565_1x(*dst, scale, src_expand); - dst += 1; - count--; - } -} - #endif |