aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
authorGravatar senorblanco@chromium.org <senorblanco@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>2010-12-13 15:27:20 +0000
committerGravatar senorblanco@chromium.org <senorblanco@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>2010-12-13 15:27:20 +0000
commitc3856384e4ab9a7ad5902696a5c972ab595b8467 (patch)
tree0025aae7299eb0025598d8a647b4479598941859 /src
parentec7a30cc8688923e0ccfff4c8f81c5e577c4c9ab (diff)
SSE2 optimizations for 32bit Color operation.
[Patch from weiwei.li@intel.com] SSE2 optimizations were added previously by Stephen White, improving Skia performance on SSE2-capable platforms (please refer to the issues below). Issue 171055: More SSE2ification Issue 157141: More SSE2ification Issue 150060: minor tweaks to SSE2 code for -fPIC Issue 144072: SSE2 optimizations for 32bit blending blitters This CL implements SSE2 optimizations for the 32bit Color operation. Like the issues above, it uses CPUID to detect SSE2 and switches the platform procs at runtime as well. The 32bit Color operation is heavily used by Chrome HTML5 canvas operations. Take Microsoft's IE Test Drive "Pulsating Bubbles" demo as an example (http://ie.microsoft.com/testdrive/Performance/PulsatingBubbles/Default.xhtml): when running this case on Chrome, the overhead of the 32bit Color operation is about 40~50%. This CL therefore improves Skia performance, and with it Chrome HTML5 canvas performance. Additionally, this CL has passed the Skia bench and tests validation, and the results are good. We also applied this CL to the latest Chromium and re-ran the Pulsating Bubbles demo; performance improved by almost 9~10%. git-svn-id: http://skia.googlecode.com/svn/trunk@633 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src')
-rw-r--r--src/core/SkBlitRow_D32.cpp10
-rw-r--r--src/opts/SkBitmapProcState_opts_SSE2.h2
-rw-r--r--src/opts/SkBlitRow_opts_SSE2.cpp80
-rw-r--r--src/opts/SkBlitRow_opts_none.cpp4
-rw-r--r--src/opts/opts_check_SSE2.cpp8
5 files changed, 104 insertions, 0 deletions
diff --git a/src/core/SkBlitRow_D32.cpp b/src/core/SkBlitRow_D32.cpp
index 003602566e..1f154a486a 100644
--- a/src/core/SkBlitRow_D32.cpp
+++ b/src/core/SkBlitRow_D32.cpp
@@ -88,6 +88,16 @@ SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) {
void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], int count,
SkPMColor color) {
+    SkBlitRow::ColorProc platformProc = PlatformColorProc();
+    if (NULL != platformProc) {
+        platformProc(dst, src, count, color);       // platform-specific routine
+    } else {
+        Color32_BlitRow32(dst, src, count, color);  // portable fallback
+    }
+}
+
+void SkBlitRow::Color32_BlitRow32(SkPMColor dst[], const SkPMColor src[],
+ int count, SkPMColor color) {
if (count > 0) {
if (0 == color) {
if (src != dst) {
diff --git a/src/opts/SkBitmapProcState_opts_SSE2.h b/src/opts/SkBitmapProcState_opts_SSE2.h
index 29df88fbd0..11d305b1cb 100644
--- a/src/opts/SkBitmapProcState_opts_SSE2.h
+++ b/src/opts/SkBitmapProcState_opts_SSE2.h
@@ -23,3 +23,5 @@ void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
const uint32_t* xy,
int count, uint32_t* colors);
+void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
+ SkPMColor color);
diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp
index acf8418b40..244dbb4c1e 100644
--- a/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/src/opts/SkBlitRow_opts_SSE2.cpp
@@ -17,6 +17,7 @@
#include "SkBlitRow_opts_SSE2.h"
#include "SkColorPriv.h"
+#include "SkUtils.h"
#include <emmintrin.h>
@@ -310,3 +311,82 @@ void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
count--;
}
}
+
+/* SSE2 version of Color32(): writes color + SkAlphaMulQ(src[i], scale) to
+ * dst[i] for count pixels. Portable version is in core/SkBlitRow_D32.cpp.
+ */
+void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
+                  SkPMColor color) {
+    if (count <= 0) {
+        return;
+    }
+    if (0 == color) {
+        // Nothing to add: copy src and stop, so the blend below is skipped.
+        if (src != dst) {
+            memcpy(dst, src, count * sizeof(SkPMColor));
+        }
+        return;
+    }
+
+    unsigned colorA = SkGetPackedA32(color);
+    if (255 == colorA) {
+        sk_memset32(dst, color, count);
+    } else {
+        unsigned scale = 256 - SkAlpha255To256(colorA);
+        if (count >= 4) {
+            // Scalar steps until dst is 16-byte aligned for _mm_store_si128.
+            SkASSERT(((size_t)dst & 0x03) == 0);
+            while (((size_t)dst & 0x0F) != 0) {
+                *dst = color + SkAlphaMulQ(*src, scale);
+                src++;
+                dst++;
+                count--;
+            }
+
+            const __m128i *s = reinterpret_cast<const __m128i*>(src);
+            __m128i *d = reinterpret_cast<__m128i*>(dst);
+            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
+            __m128i src_scale_wide = _mm_set1_epi16(scale);
+            __m128i color_wide = _mm_set1_epi32(color);
+            while (count >= 4) {
+                // Load 4 src pixels (unaligned load: only dst was aligned).
+                __m128i src_pixel = _mm_loadu_si128(s);
+
+                // Get red and blue channels into lower byte of each word.
+                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
+
+                // Get alpha and green into lower byte of each word.
+                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
+
+                // Multiply by scale.
+                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
+                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
+
+                // Divide by 256 (for alpha/green, keep each word's high byte).
+                src_rb = _mm_srli_epi16(src_rb, 8);
+                src_ag = _mm_andnot_si128(rb_mask, src_ag);
+
+                // Combine back into RGBA.
+                src_pixel = _mm_or_si128(src_rb, src_ag);
+
+                // Add color to result.
+                __m128i result = _mm_add_epi8(color_wide, src_pixel);
+
+                // Store result (aligned store; dst was aligned above).
+                _mm_store_si128(d, result);
+                s++;
+                d++;
+                count -= 4;
+            }
+            src = reinterpret_cast<const SkPMColor*>(s);
+            dst = reinterpret_cast<SkPMColor*>(d);
+        }
+
+        while (count > 0) {
+            *dst = color + SkAlphaMulQ(*src, scale);
+            src += 1;
+            dst += 1;
+            count--;
+        }
+    }
+}
diff --git a/src/opts/SkBlitRow_opts_none.cpp b/src/opts/SkBlitRow_opts_none.cpp
index 8e0cddcda0..0eb1185347 100644
--- a/src/opts/SkBlitRow_opts_none.cpp
+++ b/src/opts/SkBlitRow_opts_none.cpp
@@ -13,3 +13,7 @@ SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
return NULL;
}
+
+SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
+ return NULL;  // no platform routine here; SkBlitRow::Color32 then calls Color32_BlitRow32
+}
diff --git a/src/opts/opts_check_SSE2.cpp b/src/opts/opts_check_SSE2.cpp
index 7fea1c4b84..fa7b17a5b5 100644
--- a/src/opts/opts_check_SSE2.cpp
+++ b/src/opts/opts_check_SSE2.cpp
@@ -89,6 +89,14 @@ SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
return NULL;
}
+SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
+    // Hand out the SSE2 routine only when hasSSE2() reports support.
+    if (!hasSSE2()) {
+        return NULL;
+    }
+    return Color32_SSE2;
+}
+
SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
if (hasSSE2()) {
return platform_32_procs[flags];