8 files changed, 97 insertions, 91 deletions
diff --git a/include/core/SkColorPriv.h b/include/core/SkColorPriv.h
index 9db768783f..15c94ac68c 100644
--- a/include/core/SkColorPriv.h
+++ b/include/core/SkColorPriv.h
@@ -281,6 +281,16 @@ static inline U16CPU SkAlphaMulRGB16(U16CPU c, unsigned scale) {
 // this helper explicitly returns a clean 16bit value (but slower)
 #define SkAlphaMulRGB16_ToU16(c, s)  (uint16_t)SkAlphaMulRGB16(c, s)
 
+/** Blend a pre-expanded 32bit source color with a 16bit dst color: dst is expanded,
+    multiplied by the 0..32 scale parameter, added to src_expand, shifted down by 5 and
+    compacted again. The computation yields only 16bits of valid data, but we claim to
+    return 32bits, so that the compiler won't generate extra instructions to "clean" the top 16bits.
+*/
+static inline U16CPU SkBlend32_RGB16(uint32_t src_expand, uint16_t dst, unsigned scale) {
+    uint32_t dst_expand = SkExpand_rgb_16(dst) * scale;
+    return SkCompact_rgb_16((src_expand + dst_expand) >> 5);
+}
+
 /** Blend src and dst 16bit colors by the 0..256 scale parameter.
     The computation yields only 16bits of valid data, but we claim
     to return 32bits, so that the compiler won't generate extra instructions to
diff --git a/src/core/SkBlitRow_D16.cpp b/src/core/SkBlitRow_D16.cpp
index d5082769c5..5aaa7a5805 100644
--- a/src/core/SkBlitRow_D16.cpp
+++ b/src/core/SkBlitRow_D16.cpp
@@ -216,8 +216,7 @@ static void Color32A_D565(uint16_t dst[], SkPMColor src, int count, int x, int y
     uint32_t src_expand = pmcolor_to_expand16(src);
     unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
     do {
-        uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
-        *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
+        *dst = SkBlend32_RGB16(src_expand, *dst, scale);
         dst += 1;
     } while (--count != 0);
 }
@@ -269,7 +268,7 @@ SkBlitRow::ColorProc16 SkBlitRow::ColorFactory16(unsigned flags) {
     // just so we don't crash
     flags &= kFlags16_Mask;
     // we ignore both kGlobalAlpha_Flag and kSrcPixelAlpha_Flag, so shift down
-    // since this factory is only used for transparent source alphas
+    // no need for the additional code specializing on opaque alpha at this time
     flags >>= 2;
 
     SkASSERT(flags < SK_ARRAY_COUNT(gDefault_565_ColorProcs));
diff --git a/src/opts/SkBitmapProcState_opts_SSE2.h b/src/opts/SkBitmapProcState_opts_SSE2.h
index 82c5cc8d6e..82bf2cdae1 100644
--- a/src/opts/SkBitmapProcState_opts_SSE2.h
+++ b/src/opts/SkBitmapProcState_opts_SSE2.h
@@ -16,8 +16,6 @@ void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
 void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
                                   const uint32_t* xy,
                                   int count, uint32_t* colors);
-void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
-                  SkPMColor color);
 void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
                                      int count, int x, int y);
 void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp
index e830c5fa06..80fdeecbcb 100644
--- a/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/src/opts/SkBlitRow_opts_SSE2.cpp
@@ -289,6 +289,75 @@ void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
     }
 }
 
+void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
+    SkASSERT(count > 0);
+    // Blend the single color src into count RGB565 pixels at dst, in place; x and y are unused.
+    uint32_t src_expand = (SkGetPackedG32(src) << 24) |
+                          (SkGetPackedR32(src) << 13) |
+                          (SkGetPackedB32(src) << 2);
+    unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;  // weight applied to dst
+
+    // Run SIMD only if at least 8 pixels remain once dst has been 16-byte aligned
+    if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
+        __m128i* dst_wide;
+        const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);
+        const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);
+        const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);
+        const __m128i scale_wide = _mm_set1_epi16(scale);
+        const __m128i mask_blue  = _mm_set1_epi16(SK_B16_MASK);
+        const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);
+
+        // Scalar blend until dst reaches a 16-byte aligned address (0-7 pixels)
+        while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
+            *dst = SkBlend32_RGB16(src_expand, *dst, scale);
+            dst += 1;
+            count--;
+        }
+
+        dst_wide = reinterpret_cast<__m128i*>(dst);
+        do {
+            // Load eight RGB565 pixels (aligned load: dst was 16-byte aligned above)
+            __m128i pixels = _mm_load_si128(dst_wide);
+
+            // Isolate R, G and B of each pixel in the low bits of their own 16-bit lanes
+            __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
+            __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);
+            pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);
+            __m128i pixel_B = _mm_and_si128(pixels, mask_blue);
+
+            // Scale each dst channel by scale (the low 16 bits of each product are kept)
+            pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
+            pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
+            pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);
+
+            // Add src_X_wide; shift R and B down again (G is left as is and masked below)
+            pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
+            pixel_R = _mm_srli_epi16(pixel_R, 5);
+            pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
+            pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
+            pixel_B = _mm_srli_epi16(pixel_B, 5);
+
+            // Recombine into RGB565 and store all eight pixels (aligned store)
+            pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
+            pixel_G = _mm_and_si128(pixel_G, mask_green);
+            pixels = _mm_or_si128(pixel_R, pixel_G);
+            pixels = _mm_or_si128(pixels, pixel_B);
+            _mm_store_si128(dst_wide, pixels);
+            count -= 8;
+            dst_wide++;
+        } while (count >= 8);  // first pass is safe: the check above left >= 8 pixels
+
+        dst = reinterpret_cast<uint16_t*>(dst_wide);  // continue after the last SIMD store
+    }
+
+    // Scalar loop for whatever is left (the whole row if the SIMD path was skipped).
+    while (count > 0) {
+        *dst = SkBlend32_RGB16(src_expand, *dst, scale);
+        dst += 1;
+        count--;
+    }
+}
+
 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
                                size_t maskRB, SkColor origColor,
                                int width, int height) {
diff --git a/src/opts/SkBlitRow_opts_SSE2.h b/src/opts/SkBlitRow_opts_SSE2.h
index 29fd96e5e9..bb6cece478 100644
--- a/src/opts/SkBlitRow_opts_SSE2.h
+++ b/src/opts/SkBlitRow_opts_SSE2.h
@@ -21,6 +21,12 @@ void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha);
+
+void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
+                  SkPMColor color);
+void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x,
+                        int y);
+
 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* mask,
                                size_t maskRB, SkColor color,
                                int width, int height);
@@ -42,5 +48,4 @@ void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
 void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y);
-
 #endif
diff --git a/src/opts/SkBlitRow_opts_SSE4.cpp b/src/opts/SkBlitRow_opts_SSE4.cpp
index f4273d27b4..3649d175ef 100644
--- a/src/opts/SkBlitRow_opts_SSE4.cpp
+++ b/src/opts/SkBlitRow_opts_SSE4.cpp
@@ -7,14 +7,9 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_REST
     sk_throw();
 }
 
-void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) {
-    sk_throw();
-}
-
 #else
 
-#include <smmintrin.h>  // SSE4.1 intrinsics
-
+#include <smmintrin.h>      // SSE4.1 intrinsics
 #include "SkColorPriv.h"
 #include "SkColor_opts_SSE2.h"
 
@@ -66,76 +61,4 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
     }
 }
 
-static inline uint16_t Color32A_D565_1x(uint16_t dst, unsigned scale, uint32_t src_expand) {
-    uint32_t dst_expand = SkExpand_rgb_16(dst) * scale;
-    return SkCompact_rgb_16((src_expand + dst_expand) >> 5);
-}
-
-void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) {
-    SkASSERT(count > 0);
-
-    uint32_t src_expand = (SkGetPackedG32(src) << 24) |
-                          (SkGetPackedR32(src) << 13) |
-                          (SkGetPackedB32(src) << 2);
-    unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
-
-    // Check if we have enough pixels to run SIMD
-    if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
-        __m128i* dst_wide;
-        const __m128i src_expand_wide = _mm_set1_epi32(src_expand);
-        const __m128i scale_wide = _mm_set1_epi32(scale);
-        const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE |
-                                                  SK_B16_MASK_IN_PLACE |
-                                                 (SK_G16_MASK_IN_PLACE << 16));
-
-        // Align dst to an even 16 byte address (0-7 pixels)
-        while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
-            *dst = Color32A_D565_1x(*dst, scale, src_expand);
-            dst += 1;
-            count--;
-        }
-
-        dst_wide = reinterpret_cast<__m128i*>(dst);
-        do {
-            // Load 8 RGB565 pixels
-            __m128i pixels = _mm_load_si128(dst_wide);
-
-            // Duplicate and mask
-            __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels);
-            pixels_high = _mm_and_si128(mask_green, pixels_high);
-            pixels = _mm_unpacklo_epi16(pixels, pixels);
-            pixels = _mm_and_si128(mask_green, pixels);
-
-            // Scale with alpha
-            pixels_high = _mm_mullo_epi32(pixels_high, scale_wide);
-            pixels = _mm_mullo_epi32(pixels, scale_wide);
-
-            // Add src_expand_wide and shift down again
-            pixels_high = _mm_add_epi32(pixels_high, src_expand_wide);
-            pixels_high = _mm_srli_epi32(pixels_high, 5);
-            pixels = _mm_add_epi32(pixels, src_expand_wide);
-            pixels = _mm_srli_epi32(pixels, 5);
-
-            // Mask
-            pixels_high = _mm_and_si128(mask_green, pixels_high);
-            pixels = _mm_and_si128(mask_green, pixels);
-
-            // Combine into RGB565 and store
-            pixels = _mm_hadd_epi16(pixels, pixels_high);
-            _mm_store_si128(dst_wide, pixels);
-            count -= 8;
-            dst_wide++;
-        } while (count >= 8);
-
-        dst = reinterpret_cast<uint16_t*>(dst_wide);
-    }
-
-    // Small loop to handle remaining pixels.
-    while (count > 0) {
-        *dst = Color32A_D565_1x(*dst, scale, src_expand);
-        dst += 1;
-        count--;
-    }
-}
-
 #endif
diff --git a/src/opts/SkBlitRow_opts_SSE4.h b/src/opts/SkBlitRow_opts_SSE4.h
index 6a572161e1..577ace6f8f 100644
--- a/src/opts/SkBlitRow_opts_SSE4.h
+++ b/src/opts/SkBlitRow_opts_SSE4.h
@@ -14,8 +14,5 @@ void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT,
                                 const SkPMColor* SK_RESTRICT,
                                 int count,
                                 U8CPU alpha);
-
-void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y);
-
 #endif
 
diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp
index 6b9758c123..7314f7dcf8 100644
--- a/src/opts/opts_check_x86.cpp
+++ b/src/opts/opts_check_x86.cpp
@@ -215,14 +215,19 @@ SkBlitRow::Proc16 SkBlitRow::PlatformFactory565(unsigned flags) {
     }
 }
 
-static const SkBlitRow::ColorProc16 platform_565_colorprocs_SSE4[] = {
-    Color32A_D565_SSE4,                 // Color32A_D565,
+static const SkBlitRow::ColorProc16 platform_565_colorprocs_SSE2[] = {
+    Color32A_D565_SSE2,                 // Color32A_D565,
     NULL,                               // Color32A_D565_Dither
 };
 
 SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) {
-    if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) {
-        return platform_565_colorprocs_SSE4[flags];
+/* If you're thinking about writing an SSE4 version of this, do check it's
+ * actually faster on Atom. Our original SSE4 version was slower than this
+ * SSE2 version on Silvermont, and only marginally faster on a Core i7,
+ * mainly due to the MULLD timings.
+ */
+    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+        return platform_565_colorprocs_SSE2[flags];
     } else {
         return NULL;
     }