diff options
author | senorblanco@chromium.org <senorblanco@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2010-12-13 15:27:20 +0000 |
---|---|---|
committer | senorblanco@chromium.org <senorblanco@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2010-12-13 15:27:20 +0000 |
commit | c3856384e4ab9a7ad5902696a5c972ab595b8467 (patch) | |
tree | 0025aae7299eb0025598d8a647b4479598941859 /src | |
parent | ec7a30cc8688923e0ccfff4c8f81c5e577c4c9ab (diff) |
SSE2 optimizations for 32bit Color operation.
[Patch from weiwei.li@intel.com]
SSE2 optimizations were added previously by Stephen White, which improved Skia
performance on SSE2-capable platforms (please refer to the issues below):
Issue 171055: More SSE2ification
Issue 157141: More SSE2ification
Issue 150060: minor tweaks to SSE2 code for -fPIC
Issue 144072: SSE2 optimizations for 32bit blending blitters
This CL implements SSE2 optimizations for the 32-bit Color operation. Like the
issues above, it uses CPUID to detect SSE2 and switches the platform procs at
runtime. The 32-bit Color operation is heavily used by Chrome HTML5
canvas operations. Take the Microsoft IE Test Drive demo Pulsating Bubbles as an example
(http://ie.microsoft.com/testdrive/Performance/PulsatingBubbles/Default.xhtml):
when running this case on Chrome, the overhead of the 32-bit Color operation is about
40~50%. So this CL improves Skia performance, and with it Chrome
HTML5 canvas performance.
Additionally, this CL has passed the Skia bench and tests validation, and the results are
good. We also applied this CL to the latest Chromium and re-ran the Microsoft
IE Test Drive Pulsating Bubbles demo; performance improved by roughly 9~10%.
git-svn-id: http://skia.googlecode.com/svn/trunk@633 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src')
-rw-r--r-- | src/core/SkBlitRow_D32.cpp | 10 | ||||
-rw-r--r-- | src/opts/SkBitmapProcState_opts_SSE2.h | 2 | ||||
-rw-r--r-- | src/opts/SkBlitRow_opts_SSE2.cpp | 80 | ||||
-rw-r--r-- | src/opts/SkBlitRow_opts_none.cpp | 4 | ||||
-rw-r--r-- | src/opts/opts_check_SSE2.cpp | 8 |
5 files changed, 104 insertions, 0 deletions
diff --git a/src/core/SkBlitRow_D32.cpp b/src/core/SkBlitRow_D32.cpp
index 003602566e..1f154a486a 100644
--- a/src/core/SkBlitRow_D32.cpp
+++ b/src/core/SkBlitRow_D32.cpp
@@ -88,6 +88,16 @@ SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) {
 
 void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[],
                         int count, SkPMColor color) {
+    SkBlitRow::ColorProc proc = PlatformColorProc();
+    if (NULL == proc) {
+        Color32_BlitRow32(dst, src, count, color);
+        return;
+    }
+    proc(dst, src, count, color);
+}
+
+void SkBlitRow::Color32_BlitRow32(SkPMColor dst[], const SkPMColor src[],
+                                  int count, SkPMColor color) {
     if (count > 0) {
         if (0 == color) {
             if (src != dst) {
diff --git a/src/opts/SkBitmapProcState_opts_SSE2.h b/src/opts/SkBitmapProcState_opts_SSE2.h
index 29df88fbd0..11d305b1cb 100644
--- a/src/opts/SkBitmapProcState_opts_SSE2.h
+++ b/src/opts/SkBitmapProcState_opts_SSE2.h
@@ -23,3 +23,5 @@ void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
 void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
                                   const uint32_t* xy,
                                   int count, uint32_t* colors);
+void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
+                  SkPMColor color);
diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp
index acf8418b40..244dbb4c1e 100644
--- a/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/src/opts/SkBlitRow_opts_SSE2.cpp
@@ -17,6 +17,7 @@
 
 #include "SkBlitRow_opts_SSE2.h"
 #include "SkColorPriv.h"
+#include "SkUtils.h"
 
 #include <emmintrin.h>
 
@@ -310,3 +311,82 @@ void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
         count--;
     }
 }
+
+/* SSE2 version of Color32(): dst[i] = color + src[i] scaled by a factor
+ * derived from color's alpha. Portable version: core/SkBlitRow_D32.cpp.
+ */
+void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
+                  SkPMColor color) {
+    if (count <= 0) {
+        return;
+    }
+
+    if (0 == color) {
+        if (src != dst) {
+            memcpy(dst, src, count * sizeof(SkPMColor));
+        }
+        return;  // Nothing to add: dst must stay an exact copy of src.
+    }
+
+    unsigned colorA = SkGetPackedA32(color);
+    if (255 == colorA) {
+        sk_memset32(dst, color, count);
+    } else {
+        unsigned scale = 256 - SkAlpha255To256(colorA);
+
+        if (count >= 4) {
+            SkASSERT(((size_t)dst & 0x03) == 0);
+            while (((size_t)dst & 0x0F) != 0) {
+                *dst = color + SkAlphaMulQ(*src, scale);
+                src++;
+                dst++;
+                count--;
+            }
+
+            const __m128i *s = reinterpret_cast<const __m128i*>(src);
+            __m128i *d = reinterpret_cast<__m128i*>(dst);
+            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
+            __m128i src_scale_wide = _mm_set1_epi16(scale);
+            __m128i color_wide = _mm_set1_epi32(color);
+            while (count >= 4) {
+                // Load 4 src pixels; src may be unaligned, dst was aligned above.
+                __m128i src_pixel = _mm_loadu_si128(s);
+
+                // Get red and blue pixels into lower byte of each word.
+                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
+
+                // Get alpha and green into lower byte of each word.
+                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
+
+                // Multiply by scale.
+                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
+                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
+
+                // Divide by 256: shift rb down, keep ag's high bytes in place.
+                src_rb = _mm_srli_epi16(src_rb, 8);
+                src_ag = _mm_andnot_si128(rb_mask, src_ag);
+
+                // Combine back into RGBA.
+                src_pixel = _mm_or_si128(src_rb, src_ag);
+
+                // Add color to result.
+                __m128i result = _mm_add_epi8(color_wide, src_pixel);
+
+                // Store result.
+                _mm_store_si128(d, result);
+                s++;
+                d++;
+                count -= 4;
+            }
+            src = reinterpret_cast<const SkPMColor*>(s);
+            dst = reinterpret_cast<SkPMColor*>(d);
+        }
+
+        while (count > 0) {
+            *dst = color + SkAlphaMulQ(*src, scale);
+            src += 1;
+            dst += 1;
+            count--;
+        }
+    }
+}
diff --git a/src/opts/SkBlitRow_opts_none.cpp b/src/opts/SkBlitRow_opts_none.cpp
index 8e0cddcda0..0eb1185347 100644
--- a/src/opts/SkBlitRow_opts_none.cpp
+++ b/src/opts/SkBlitRow_opts_none.cpp
@@ -13,3 +13,7 @@ SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
 SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
     return NULL;
 }
+
+SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
+    return NULL;
+}
diff --git a/src/opts/opts_check_SSE2.cpp b/src/opts/opts_check_SSE2.cpp
index 7fea1c4b84..fa7b17a5b5 100644
--- a/src/opts/opts_check_SSE2.cpp
+++ b/src/opts/opts_check_SSE2.cpp
@@ -89,6 +89,14 @@ SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
     return NULL;
 }
 
+SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
+    if (hasSSE2()) {
+        return Color32_SSE2;
+    } else {
+        return NULL;
+    }
+}
+
 SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
     if (hasSSE2()) {
         return platform_32_procs[flags];