diff options
author | henrik.smiding <henrik.smiding@intel.com> | 2015-02-10 09:42:33 -0800 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2015-02-10 09:42:33 -0800 |
commit | 4e65473069b3a9382292577875366bd86c5777c8 (patch) | |
tree | b3615fd7ca83393d84497f38ef0e87bb6efc359b /src/opts | |
parent | 46b80833394d7919cadf2abf2b93802141dd21c5 (diff) |
Add SSE optimization of Color32A_D565
Adds an SSE4.1 version of the Color32A_D565 function.
Performance improvement in the following benchmarks:
Xfermode_SrcOver - ~100%
luma_colorfilter_large - ~150%
luma_colorfilter_small - ~60%
tablebench - ~10%
chart_bw - ~10%
(Measured on an Atom Silvermont core)
Signed-off-by: Henrik Smiding <henrik.smiding@intel.com>
Review URL: https://codereview.chromium.org/892623002
Diffstat (limited to 'src/opts')
-rw-r--r-- | src/opts/SkBlitRow_opts_SSE4.cpp | 79 | ||||
-rw-r--r-- | src/opts/SkBlitRow_opts_SSE4.h | 3 | ||||
-rw-r--r-- | src/opts/opts_check_x86.cpp | 17 |
3 files changed, 93 insertions, 6 deletions
diff --git a/src/opts/SkBlitRow_opts_SSE4.cpp b/src/opts/SkBlitRow_opts_SSE4.cpp index fd837d54fe..f4273d27b4 100644 --- a/src/opts/SkBlitRow_opts_SSE4.cpp +++ b/src/opts/SkBlitRow_opts_SSE4.cpp @@ -7,10 +7,13 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_REST sk_throw(); } +void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) { + sk_throw(); +} + #else -#include <emmintrin.h> // SSE2: Most _mm_foo() in this file. -#include <smmintrin.h> // SSE4.1: _mm_testz_si128 and _mm_testc_si128. +#include <smmintrin.h> // SSE4.1 intrinsics #include "SkColorPriv.h" #include "SkColor_opts_SSE2.h" @@ -63,4 +66,76 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, } } +static inline uint16_t Color32A_D565_1x(uint16_t dst, unsigned scale, uint32_t src_expand) { + uint32_t dst_expand = SkExpand_rgb_16(dst) * scale; + return SkCompact_rgb_16((src_expand + dst_expand) >> 5); +} + +void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) { + SkASSERT(count > 0); + + uint32_t src_expand = (SkGetPackedG32(src) << 24) | + (SkGetPackedR32(src) << 13) | + (SkGetPackedB32(src) << 2); + unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; + + // Check if we have enough pixels to run SIMD + if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) { + __m128i* dst_wide; + const __m128i src_expand_wide = _mm_set1_epi32(src_expand); + const __m128i scale_wide = _mm_set1_epi32(scale); + const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE | + SK_B16_MASK_IN_PLACE | + (SK_G16_MASK_IN_PLACE << 16)); + + // Align dst to an even 16 byte address (0-7 pixels) + while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) { + *dst = Color32A_D565_1x(*dst, scale, src_expand); + dst += 1; + count--; + } + + dst_wide = reinterpret_cast<__m128i*>(dst); + do { + // Load 8 RGB565 pixels + __m128i pixels = _mm_load_si128(dst_wide); + + // Duplicate and mask + __m128i pixels_high = 
_mm_unpackhi_epi16(pixels, pixels); + pixels_high = _mm_and_si128(mask_green, pixels_high); + pixels = _mm_unpacklo_epi16(pixels, pixels); + pixels = _mm_and_si128(mask_green, pixels); + + // Scale with alpha + pixels_high = _mm_mullo_epi32(pixels_high, scale_wide); + pixels = _mm_mullo_epi32(pixels, scale_wide); + + // Add src_expand_wide and shift down again + pixels_high = _mm_add_epi32(pixels_high, src_expand_wide); + pixels_high = _mm_srli_epi32(pixels_high, 5); + pixels = _mm_add_epi32(pixels, src_expand_wide); + pixels = _mm_srli_epi32(pixels, 5); + + // Mask + pixels_high = _mm_and_si128(mask_green, pixels_high); + pixels = _mm_and_si128(mask_green, pixels); + + // Combine into RGB565 and store + pixels = _mm_hadd_epi16(pixels, pixels_high); + _mm_store_si128(dst_wide, pixels); + count -= 8; + dst_wide++; + } while (count >= 8); + + dst = reinterpret_cast<uint16_t*>(dst_wide); + } + + // Small loop to handle remaining pixels. + while (count > 0) { + *dst = Color32A_D565_1x(*dst, scale, src_expand); + dst += 1; + count--; + } +} + #endif diff --git a/src/opts/SkBlitRow_opts_SSE4.h b/src/opts/SkBlitRow_opts_SSE4.h index 577ace6f8f..6a572161e1 100644 --- a/src/opts/SkBlitRow_opts_SSE4.h +++ b/src/opts/SkBlitRow_opts_SSE4.h @@ -14,5 +14,8 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int count, U8CPU alpha); + +void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y); + #endif diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp index 84a4913021..2d3b794283 100644 --- a/src/opts/opts_check_x86.cpp +++ b/src/opts/opts_check_x86.cpp @@ -197,7 +197,7 @@ void SkBitmapProcState::platformProcs() { //////////////////////////////////////////////////////////////////////////////// -static SkBlitRow::Proc16 platform_16_procs[] = { +static const SkBlitRow::Proc16 platform_16_procs[] = { S32_D565_Opaque_SSE2, // S32_D565_Opaque NULL, // S32_D565_Blend S32A_D565_Opaque_SSE2, // 
S32A_D565_Opaque @@ -216,18 +216,27 @@ SkBlitRow::Proc16 SkBlitRow::PlatformFactory565(unsigned flags) { } } +static const SkBlitRow::ColorProc16 platform_565_colorprocs_SSE4[] = { + Color32A_D565_SSE4, // Color32A_D565, + NULL, // Color32A_D565_Dither +}; + SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) { - return NULL; + if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) { + return platform_565_colorprocs_SSE4[flags]; + } else { + return NULL; + } } -static SkBlitRow::Proc32 platform_32_procs_SSE2[] = { +static const SkBlitRow::Proc32 platform_32_procs_SSE2[] = { NULL, // S32_Opaque, S32_Blend_BlitRow32_SSE2, // S32_Blend, S32A_Opaque_BlitRow32_SSE2, // S32A_Opaque S32A_Blend_BlitRow32_SSE2, // S32A_Blend, }; -static SkBlitRow::Proc32 platform_32_procs_SSE4[] = { +static const SkBlitRow::Proc32 platform_32_procs_SSE4[] = { NULL, // S32_Opaque, S32_Blend_BlitRow32_SSE2, // S32_Blend, S32A_Opaque_BlitRow32_SSE4, // S32A_Opaque |