Replace SSE optimization of Color32A_D565

Adds an SSE2 version of the Color32A_D565 function to replace the existing SSE4 version, and does some minor cleanup. Performance improves in the following Skia benchmarks. Measured on Atom Silvermont: Xfermode_SrcOver - x3; luma_colorfilter_large - x4.6; luma_colorfilter_small - x2; tablebench - ~15%; chart_bw - ~10%. Measured on Core i7 Haswell: luma_colorfilter_large running SSE2 - x2; luma_colorfilter_large running SSE4 - x2.3. Also improves performance in the WPS Office application and in the 2D subtest of 0xbenchmark on Android. Signed-off-by: Henrik Smiding <henrik.smiding@intel.com> Review URL: https://codereview.chromium.org/923523002
author: henrik.smiding <henrik.smiding@intel.com> 2015-03-20 09:20:46 -0700
committer: Commit bot <commit-bot@chromium.org> 2015-03-20 09:20:46 -0700
commit: 70840cbd898df67f603987213164c798415d76bf (patch)
tree: d6119805a17a04ace94a75748204621f090b4fe2 /src/opts/SkBlitRow_opts_SSE2.cpp
parent: 86ad8d643624a55b02e529100bbe4e2940115fa1 (diff)
1 files changed, 69 insertions, 0 deletions
diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp
index e830c5fa06..80fdeecbcb 100644
--- a/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/src/opts/SkBlitRow_opts_SSE2.cpp
@@ -289,6 +289,75 @@ void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
     }
 }
 
+void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
+    SkASSERT(count > 0);
+    // Blends the single color src into count 16-bit pixels of dst, in place; x and y are unused.
+    uint32_t src_expand = (SkGetPackedG32(src) << 24) |
+                          (SkGetPackedR32(src) << 13) |
+                          (SkGetPackedB32(src) << 2);  // packed src operand for SkBlend32_RGB16
+    unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;  // dst weight, from 0xFF - alpha
+
+    // Use SIMD only if 8 or more pixels remain after the pixels needed to 16-byte align dst
+    if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
+        __m128i* dst_wide;  // assigned once dst has been aligned below
+        const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);  // added before the >> 5
+        const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);  // added before the G mask
+        const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);  // added before the >> 5
+        const __m128i scale_wide = _mm_set1_epi16(scale);  // scale in every 16-bit lane
+        const __m128i mask_blue  = _mm_set1_epi16(SK_B16_MASK);
+        const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);
+
+        // Blend one pixel at a time until dst is 16-byte aligned (at most 7 pixels)
+        while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
+            *dst = SkBlend32_RGB16(src_expand, *dst, scale);
+            dst += 1;
+            count--;
+        }
+
+        dst_wide = reinterpret_cast<__m128i*>(dst);  // aligned here, as _mm_load/_mm_store require
+        do {
+            // Aligned load of eight 16-bit pixels
+            __m128i pixels = _mm_load_si128(dst_wide);
+
+            // Move each channel to the low bits of its own 16-bit lane
+            __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
+            __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);  // shift the R bits out the top
+            pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);  // then drop the B bits
+            __m128i pixel_B = _mm_and_si128(pixels, mask_blue);
+
+            // Multiply each channel by scale (the low 16 bits of each product are kept)
+            pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
+            pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
+            pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);
+
+            // Add the pre-shifted src terms; R and B are shifted down by 5, G is only masked below
+            pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
+            pixel_R = _mm_srli_epi16(pixel_R, 5);
+            pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
+            pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
+            pixel_B = _mm_srli_epi16(pixel_B, 5);
+
+            // Put R back at SK_R16_SHIFT, keep G's mask_green bits, OR the channels, aligned store
+            pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
+            pixel_G = _mm_and_si128(pixel_G, mask_green);
+            pixels = _mm_or_si128(pixel_R, pixel_G);
+            pixels = _mm_or_si128(pixels, pixel_B);
+            _mm_store_si128(dst_wide, pixels);
+            count -= 8;
+            dst_wide++;  // advances by one __m128i, i.e. eight pixels
+        } while (count >= 8);
+
+        dst = reinterpret_cast<uint16_t*>(dst_wide);  // scalar tail resumes after the last vector
+    }
+
+    // Scalar loop: the tail left by the SIMD path, or every pixel when that path was skipped.
+    while (count > 0) {
+        *dst = SkBlend32_RGB16(src_expand, *dst, scale);
+        dst += 1;
+        count--;
+    }
+}
+
 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
                                size_t maskRB, SkColor origColor,
                                int width, int height) {
author	henrik.smiding <henrik.smiding@intel.com>	2015-03-20 09:20:46 -0700
committer	Commit bot <commit-bot@chromium.org>	2015-03-20 09:20:46 -0700
commit	70840cbd898df67f603987213164c798415d76bf (patch)
tree	d6119805a17a04ace94a75748204621f090b4fe2 /src/opts/SkBlitRow_opts_SSE2.cpp
parent	86ad8d643624a55b02e529100bbe4e2940115fa1 (diff)