diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/core/SkBlitRow_D16.cpp | 5 | ||||
-rw-r--r-- | src/opts/SkBitmapProcState_opts_SSE2.h | 2 | ||||
-rw-r--r-- | src/opts/SkBlitRow_opts_SSE2.cpp | 69 | ||||
-rw-r--r-- | src/opts/SkBlitRow_opts_SSE2.h | 7 | ||||
-rw-r--r-- | src/opts/SkBlitRow_opts_SSE4.cpp | 79 | ||||
-rw-r--r-- | src/opts/SkBlitRow_opts_SSE4.h | 3 | ||||
-rw-r--r-- | src/opts/opts_check_x86.cpp | 13 |
7 files changed, 87 insertions, 91 deletions
diff --git a/src/core/SkBlitRow_D16.cpp b/src/core/SkBlitRow_D16.cpp index d5082769c5..5aaa7a5805 100644 --- a/src/core/SkBlitRow_D16.cpp +++ b/src/core/SkBlitRow_D16.cpp @@ -216,8 +216,7 @@ static void Color32A_D565(uint16_t dst[], SkPMColor src, int count, int x, int y uint32_t src_expand = pmcolor_to_expand16(src); unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; do { - uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale; - *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5); + *dst = SkBlend32_RGB16(src_expand, *dst, scale); dst += 1; } while (--count != 0); } @@ -269,7 +268,7 @@ SkBlitRow::ColorProc16 SkBlitRow::ColorFactory16(unsigned flags) { // just so we don't crash flags &= kFlags16_Mask; // we ignore both kGlobalAlpha_Flag and kSrcPixelAlpha_Flag, so shift down - // since this factory is only used for transparent source alphas + // no need for the additional code specializing on opaque alpha at this time flags >>= 2; SkASSERT(flags < SK_ARRAY_COUNT(gDefault_565_ColorProcs)); diff --git a/src/opts/SkBitmapProcState_opts_SSE2.h b/src/opts/SkBitmapProcState_opts_SSE2.h index 82c5cc8d6e..82bf2cdae1 100644 --- a/src/opts/SkBitmapProcState_opts_SSE2.h +++ b/src/opts/SkBitmapProcState_opts_SSE2.h @@ -16,8 +16,6 @@ void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s, void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s, const uint32_t* xy, int count, uint32_t* colors); -void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, - SkPMColor color); void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[], int count, int x, int y); void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s, diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp index e830c5fa06..80fdeecbcb 100644 --- a/src/opts/SkBlitRow_opts_SSE2.cpp +++ b/src/opts/SkBlitRow_opts_SSE2.cpp @@ -289,6 +289,75 @@ void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, } } +void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) { + SkASSERT(count > 0); + + uint32_t src_expand = (SkGetPackedG32(src) << 24) | + (SkGetPackedR32(src) << 13) | + (SkGetPackedB32(src) << 2); + unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; + + // Check if we have enough pixels to run SIMD + if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) { + __m128i* dst_wide; + const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2); + const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3); + const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2); + const __m128i scale_wide = _mm_set1_epi16(scale); + const __m128i mask_blue = _mm_set1_epi16(SK_B16_MASK); + const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT); + + // Align dst to an even 16 byte address (0-7 pixels) + while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) { + *dst = SkBlend32_RGB16(src_expand, *dst, scale); + dst += 1; + count--; + } + + dst_wide = reinterpret_cast<__m128i*>(dst); + do { + // Load eight RGB565 pixels + __m128i pixels = _mm_load_si128(dst_wide); + + // Mask out sub-pixels + __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT); + __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS); + pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS); + __m128i pixel_B = _mm_and_si128(pixels, mask_blue); + + // Scale with alpha + pixel_R = _mm_mullo_epi16(pixel_R, scale_wide); + pixel_G = _mm_mullo_epi16(pixel_G, scale_wide); + pixel_B = _mm_mullo_epi16(pixel_B, scale_wide); + + // Add src_X_wide and shift down again + pixel_R = _mm_add_epi16(pixel_R, src_R_wide); + pixel_R = _mm_srli_epi16(pixel_R, 5); + pixel_G = _mm_add_epi16(pixel_G, src_G_wide); + pixel_B = _mm_add_epi16(pixel_B, src_B_wide); + pixel_B = _mm_srli_epi16(pixel_B, 5); + + // Combine into RGB565 and store + pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT); + pixel_G = _mm_and_si128(pixel_G, mask_green); + pixels = _mm_or_si128(pixel_R, pixel_G); + pixels = _mm_or_si128(pixels, pixel_B); + _mm_store_si128(dst_wide, pixels); + count -= 8; + dst_wide++; + } while (count >= 8); + + dst = reinterpret_cast<uint16_t*>(dst_wide); + } + + // Small loop to handle remaining pixels. + while (count > 0) { + *dst = SkBlend32_RGB16(src_expand, *dst, scale); + dst += 1; + count--; + } +} + void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr, size_t maskRB, SkColor origColor, int width, int height) { diff --git a/src/opts/SkBlitRow_opts_SSE2.h b/src/opts/SkBlitRow_opts_SSE2.h index 29fd96e5e9..bb6cece478 100644 --- a/src/opts/SkBlitRow_opts_SSE2.h +++ b/src/opts/SkBlitRow_opts_SSE2.h @@ -21,6 +21,12 @@ void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, int count, U8CPU alpha); + +void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, + SkPMColor color); +void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, + int y); + void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* mask, size_t maskRB, SkColor color, int width, int height); @@ -42,5 +48,4 @@ void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, int count, U8CPU alpha, int x, int y); - #endif diff --git a/src/opts/SkBlitRow_opts_SSE4.cpp b/src/opts/SkBlitRow_opts_SSE4.cpp index f4273d27b4..3649d175ef 100644 --- a/src/opts/SkBlitRow_opts_SSE4.cpp +++ b/src/opts/SkBlitRow_opts_SSE4.cpp @@ -7,14 +7,9 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_REST sk_throw(); } -void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) { - sk_throw(); -} - #else -#include <smmintrin.h> // SSE4.1 intrinsics - +#include <smmintrin.h> // SSE4.1 intrinsics #include "SkColorPriv.h" #include "SkColor_opts_SSE2.h" @@ -66,76 +61,4 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, } } -static inline uint16_t Color32A_D565_1x(uint16_t dst, unsigned scale, uint32_t src_expand) { - uint32_t dst_expand = SkExpand_rgb_16(dst) * scale; - return SkCompact_rgb_16((src_expand + dst_expand) >> 5); -} - -void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) { - SkASSERT(count > 0); - - uint32_t src_expand = (SkGetPackedG32(src) << 24) | - (SkGetPackedR32(src) << 13) | - (SkGetPackedB32(src) << 2); - unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; - - // Check if we have enough pixels to run SIMD - if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) { - __m128i* dst_wide; - const __m128i src_expand_wide = _mm_set1_epi32(src_expand); - const __m128i scale_wide = _mm_set1_epi32(scale); - const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE | - SK_B16_MASK_IN_PLACE | - (SK_G16_MASK_IN_PLACE << 16)); - - // Align dst to an even 16 byte address (0-7 pixels) - while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) { - *dst = Color32A_D565_1x(*dst, scale, src_expand); - dst += 1; - count--; - } - - dst_wide = reinterpret_cast<__m128i*>(dst); - do { - // Load 8 RGB565 pixels - __m128i pixels = _mm_load_si128(dst_wide); - - // Duplicate and mask - __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels); - pixels_high = _mm_and_si128(mask_green, pixels_high); - pixels = _mm_unpacklo_epi16(pixels, pixels); - pixels = _mm_and_si128(mask_green, pixels); - - // Scale with alpha - pixels_high = _mm_mullo_epi32(pixels_high, scale_wide); - pixels = _mm_mullo_epi32(pixels, scale_wide); - - // Add src_expand_wide and shift down again - pixels_high = _mm_add_epi32(pixels_high, src_expand_wide); - pixels_high = _mm_srli_epi32(pixels_high, 5); - pixels = _mm_add_epi32(pixels, src_expand_wide); - pixels = _mm_srli_epi32(pixels, 5); - - // Mask - pixels_high = _mm_and_si128(mask_green, pixels_high); - pixels = _mm_and_si128(mask_green, pixels); - - // Combine into RGB565 and store - pixels = _mm_hadd_epi16(pixels, pixels_high); - _mm_store_si128(dst_wide, pixels); - count -= 8; - dst_wide++; - } while (count >= 8); - - dst = reinterpret_cast<uint16_t*>(dst_wide); - } - - // Small loop to handle remaining pixels. - while (count > 0) { - *dst = Color32A_D565_1x(*dst, scale, src_expand); - dst += 1; - count--; - } -} - #endif diff --git a/src/opts/SkBlitRow_opts_SSE4.h b/src/opts/SkBlitRow_opts_SSE4.h index 6a572161e1..577ace6f8f 100644 --- a/src/opts/SkBlitRow_opts_SSE4.h +++ b/src/opts/SkBlitRow_opts_SSE4.h @@ -14,8 +14,5 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int count, U8CPU alpha); - -void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y); - #endif diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp index 6b9758c123..7314f7dcf8 100644 --- a/src/opts/opts_check_x86.cpp +++ b/src/opts/opts_check_x86.cpp @@ -215,14 +215,19 @@ SkBlitRow::Proc16 SkBlitRow::PlatformFactory565(unsigned flags) { } } -static const SkBlitRow::ColorProc16 platform_565_colorprocs_SSE4[] = { - Color32A_D565_SSE4, // Color32A_D565, +static const SkBlitRow::ColorProc16 platform_565_colorprocs_SSE2[] = { + Color32A_D565_SSE2, // Color32A_D565, NULL, // Color32A_D565_Dither }; SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) { - if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) { - return platform_565_colorprocs_SSE4[flags]; +/* If you're thinking about writing an SSE4 version of this, do check it's + * actually faster on Atom. Our original SSE4 version was slower than this + * SSE2 version on Silvermont, and only marginally faster on a Core i7, + * mainly due to the MULLD timings. + */ + if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { + return platform_565_colorprocs_SSE2[flags]; } else { return NULL; } |