2 files changed, 166 insertions, 77 deletions
diff --git a/include/core/SkColorPriv.h b/include/core/SkColorPriv.h
index 5d2df62cef..fe59b1ba93 100644
--- a/include/core/SkColorPriv.h
+++ b/include/core/SkColorPriv.h
@@ -838,29 +838,29 @@ static inline SkPMColor SkBlendLCD16Opaque(int srcR, int srcG, int srcB,
                         SkBlend32(srcB, dstB, maskB));
 }
 
-static inline void SkBlitLCD16Row(SkPMColor dst[], const uint16_t src[],
-                                  SkColor color, int width, SkPMColor) {
-    int srcA = SkColorGetA(color);
-    int srcR = SkColorGetR(color);
-    int srcG = SkColorGetG(color);
-    int srcB = SkColorGetB(color);
+static inline void SkBlitLCD16Row(SkPMColor dst[], const uint16_t mask[],
+                                  SkColor src, int width, SkPMColor) {
+    int srcA = SkColorGetA(src);
+    int srcR = SkColorGetR(src);
+    int srcG = SkColorGetG(src);
+    int srcB = SkColorGetB(src);
 
     srcA = SkAlpha255To256(srcA);
 
     for (int i = 0; i < width; i++) {
-        dst[i] = SkBlendLCD16(srcA, srcR, srcG, srcB, dst[i], src[i]);
+        dst[i] = SkBlendLCD16(srcA, srcR, srcG, srcB, dst[i], mask[i]);
     }
 }
 
-static inline void SkBlitLCD16OpaqueRow(SkPMColor dst[], const uint16_t src[],
-                                        SkColor color, int width,
+static inline void SkBlitLCD16OpaqueRow(SkPMColor dst[], const uint16_t mask[],
+                                        SkColor src, int width,
                                         SkPMColor opaqueDst) {
-    int srcR = SkColorGetR(color);
-    int srcG = SkColorGetG(color);
-    int srcB = SkColorGetB(color);
+    int srcR = SkColorGetR(src);
+    int srcG = SkColorGetG(src);
+    int srcB = SkColorGetB(src);
 
     for (int i = 0; i < width; i++) {
-        dst[i] = SkBlendLCD16Opaque(srcR, srcG, srcB, dst[i], src[i],
+        dst[i] = SkBlendLCD16Opaque(srcR, srcG, srcB, dst[i], mask[i],
                                     opaqueDst);
     }
 }
diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp
index 27ce1e5f62..f3d010e3bc 100644
--- a/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/src/opts/SkBlitRow_opts_SSE2.cpp
@@ -544,149 +544,232 @@ void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
 #endif
 
-static __m128i SkBlendLCD16_SSE2(__m128i &srci, __m128i &dst,
-                                 __m128i &mask, __m128i &scale) {
+static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
+                                 __m128i &mask, __m128i &srcA) {
+    // In the following comments, the components of src, dst and mask are
+    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
+    // by an R, G, B, or A suffix. Components of one of the four pixels that
+    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
+    // example is the blue channel of the second destination pixel. Memory
+    // layout is shown for an ARGB byte order in a color value.
+
+    // src and srcA store 8-bit values interleaved with zeros.
+    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
+    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
+    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
+    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
+    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
+    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
+    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
+
     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
+    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
 
+    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
 
+    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
 
     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
+    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
+    // 8-bit position
+    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
+    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
     mask = _mm_or_si128(_mm_or_si128(r, g), b);
 
     // Interleave R,G,B into the lower byte of word.
+    // i.e. split the sixteen 8-bit values from mask into two sets of eight
+    // 16-bit values, padded by zero.
     __m128i maskLo, maskHi;
+    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
+    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
 
-    // Upscale to 0..32
+    // Upscale from 0..31 to 0..32
+    // (allows to replace division by right-shift further down)
+    // Right-shift each component by 4 and add the result back to that component,
+    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
 
-    maskLo = _mm_mullo_epi16(maskLo, scale);
-    maskHi = _mm_mullo_epi16(maskHi, scale);
+    // Multiply each component of maskLo and maskHi by srcA
+    maskLo = _mm_mullo_epi16(maskLo, srcA);
+    maskHi = _mm_mullo_epi16(maskHi, srcA);
 
+    // Right shift mask components by 8 (divide by 256)
     maskLo = _mm_srli_epi16(maskLo, 8);
     maskHi = _mm_srli_epi16(maskHi, 8);
 
-    // Interleave R,G,B into the lower byte of the word.
+    // Interleave R,G,B into the lower byte of the word
+    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
+    // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
 
-    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));
-    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));
+    // mask = (src - dst) * mask
+    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
+    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
 
+    // mask = (src - dst) * mask >> 5
     maskLo = _mm_srai_epi16(maskLo, 5);
     maskHi = _mm_srai_epi16(maskHi, 5);
 
     // Add two pixels into result.
+    // result = dst + ((src - dst) * mask >> 5)
     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
 
-    // Pack into 4 32bit dst pixels
+    // Pack into 4 32bit dst pixels.
+    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
+    // Merge into one SSE register with sixteen 8-bit values (four pixels),
+    // clamping to 255 if necessary.
     return _mm_packus_epi16(resultLo, resultHi);
 }
 
-static __m128i SkBlendLCD16Opaque_SSE2(__m128i &srci, __m128i &dst,
+static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
                                        __m128i &mask) {
+    // In the following comments, the components of src, dst and mask are
+    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
+    // by an R, G, B, or A suffix. Components of one of the four pixels that
+    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
+    // example is the blue channel of the second destination pixel. Memory
+    // layout is shown for an ARGB byte order in a color value.
+
+    // src stores 8-bit values interleaved with zeros.
+    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
+    // mask stores 16-bit values (shown as high and low bytes) interleaved with
+    // zeros
+    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
+    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
+
     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
+    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
 
+    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
 
+    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
 
     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
+    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
+    // 8-bit position
+    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
+    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
     mask = _mm_or_si128(_mm_or_si128(r, g), b);
 
     // Interleave R,G,B into the lower byte of word.
+    // i.e. split the sixteen 8-bit values from mask into two sets of eight
+    // 16-bit values, padded by zero.
     __m128i maskLo, maskHi;
+    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
+    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
 
-    // Upscale to 0..32
+    // Upscale from 0..31 to 0..32
+    // (allows to replace division by right-shift further down)
+    // Right-shift each component by 4 and add the result back to that component,
+    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
 
-    // Interleave R,G,B into the lower byte of the word.
+    // Interleave R,G,B into the lower byte of the word
+    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
+    // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
 
-    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));
-    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));
+    // mask = (src - dst) * mask
+    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
+    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
 
+    // mask = (src - dst) * mask >> 5
     maskLo = _mm_srai_epi16(maskLo, 5);
     maskHi = _mm_srai_epi16(maskHi, 5);
 
     // Add two pixels into result.
+    // result = dst + ((src - dst) * mask >> 5)
     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
 
     // Pack into 4 32bit dst pixels and force opaque.
+    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
+    // Merge into one SSE register with sixteen 8-bit values (four pixels),
+    // clamping to 255 if necessary. Set alpha components to 0xFF.
     return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
                         _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
 }
 
-void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
-                         SkColor color, int width, SkPMColor) {
+void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
+                         SkColor src, int width, SkPMColor) {
     if (width <= 0) {
         return;
     }
 
-    int srcA = SkColorGetA(color);
-    int srcR = SkColorGetR(color);
-    int srcG = SkColorGetG(color);
-    int srcB = SkColorGetB(color);
+    int srcA = SkColorGetA(src);
+    int srcR = SkColorGetR(src);
+    int srcG = SkColorGetG(src);
+    int srcB = SkColorGetB(src);
 
     srcA = SkAlpha255To256(srcA);
 
     if (width >= 4) {
         SkASSERT(((size_t)dst & 0x03) == 0);
         while (((size_t)dst & 0x0F) != 0) {
-            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
-            src++;
+            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
+            mask++;
             dst++;
             width--;
         }
 
         __m128i *d = reinterpret_cast<__m128i*>(dst);
-        __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
-        srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());
-        __m128i scale = _mm_set1_epi16(srcA);
+        // Set alpha to 0xFF and replicate source four times in SSE register.
+        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
+        // Interleave with zeros to get two sets of four 16-bit values.
+        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
+        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
+        // Set srcA_sse to contain eight copies of srcA, padded with zero.
+        __m128i srcA_sse = _mm_set1_epi16(srcA);
         while (width >= 4) {
-            __m128i dst_pixel = _mm_load_si128(d);
-            __m128i mask_pixel = _mm_loadl_epi64(
-                                     reinterpret_cast<const __m128i*>(src));
-
-            // Check whether mask_pixels are equal to 0 and get the highest bit
-            // of each byte of result, if mask pixes are all zero, we will get
+            // Load four destination pixels into dst_sse.
+            __m128i dst_sse = _mm_load_si128(d);
+            // Load four 16-bit masks into lower half of mask_sse.
+            __m128i mask_sse = _mm_loadl_epi64(
+                                   reinterpret_cast<const __m128i*>(mask));
+
+            // Compare each mask with 0 and collect the highest bit of every
+            // byte of the result; if all masks are zero, this sets
             // pack_cmp to 0xFFFF
-            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
+            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
                                              _mm_setzero_si128()));
 
             // if mask pixels are not all zero, we will blend the dst pixels
             if (pack_cmp != 0xFFFF) {
                 // Unpack 4 16bit mask pixels to
-                // (p0, 0, p1, 0, p2, 0, p3, 0)
-                mask_pixel = _mm_unpacklo_epi16(mask_pixel,
-                                                _mm_setzero_si128());
+                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
+                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
+                mask_sse = _mm_unpacklo_epi16(mask_sse,
+                                              _mm_setzero_si128());
 
                 // Process 4 32bit dst pixels
-                __m128i result = SkBlendLCD16_SSE2(srci, dst_pixel,
-                                                   mask_pixel, scale);
+                __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
+                                                   mask_sse, srcA_sse);
                 _mm_store_si128(d, result);
             }
 
             d++;
-            src += 4;
+            mask += 4;
             width -= 4;
         }
 
@@ -694,61 +777,67 @@ void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
     }
 
     while (width > 0) {
-        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
-        src++;
+        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
+        mask++;
         dst++;
         width--;
     }
 }
 
-void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],
-                               SkColor color, int width, SkPMColor opaqueDst) {
+void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
+                               SkColor src, int width, SkPMColor opaqueDst) {
     if (width <= 0) {
         return;
     }
 
-    int srcR = SkColorGetR(color);
-    int srcG = SkColorGetG(color);
-    int srcB = SkColorGetB(color);
+    int srcR = SkColorGetR(src);
+    int srcG = SkColorGetG(src);
+    int srcB = SkColorGetB(src);
 
     if (width >= 4) {
         SkASSERT(((size_t)dst & 0x03) == 0);
         while (((size_t)dst & 0x0F) != 0) {
-            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
-            src++;
+            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
+            mask++;
             dst++;
             width--;
         }
 
         __m128i *d = reinterpret_cast<__m128i*>(dst);
-        __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
-        srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());
+        // Set alpha to 0xFF and replicate source four times in SSE register.
+        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
+        // Interleave with zeros to get two sets of four 16-bit values.
+        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
+        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
         while (width >= 4) {
-            __m128i dst_pixel = _mm_load_si128(d);
-            __m128i mask_pixel = _mm_loadl_epi64(
-                                     reinterpret_cast<const __m128i*>(src));
-
-            // Check whether mask_pixels are equal to 0 and get the highest bit
-            // of each byte of result, if mask pixes are all zero, we will get
+            // Load four destination pixels into dst_sse.
+            __m128i dst_sse = _mm_load_si128(d);
+            // Load four 16-bit masks into lower half of mask_sse.
+            __m128i mask_sse = _mm_loadl_epi64(
+                                   reinterpret_cast<const __m128i*>(mask));
+
+            // Compare each mask with 0 and collect the highest bit of every
+            // byte of the result; if all masks are zero, this sets
             // pack_cmp to 0xFFFF
-            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
+            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
                                              _mm_setzero_si128()));
 
             // if mask pixels are not all zero, we will blend the dst pixels
             if (pack_cmp != 0xFFFF) {
                 // Unpack 4 16bit mask pixels to
-                // (p0, 0, p1, 0, p2, 0, p3, 0)
-                mask_pixel = _mm_unpacklo_epi16(mask_pixel,
-                                                _mm_setzero_si128());
+                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
+                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
+                mask_sse = _mm_unpacklo_epi16(mask_sse,
+                                              _mm_setzero_si128());
 
                 // Process 4 32bit dst pixels
-                __m128i result = SkBlendLCD16Opaque_SSE2(srci, dst_pixel,
-                                                         mask_pixel);
+                __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
+                                                         mask_sse);
                 _mm_store_si128(d, result);
             }
 
             d++;
-            src += 4;
+            mask += 4;
             width -= 4;
         }
 
@@ -756,8 +845,8 @@ void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],
     }
 
     while (width > 0) {
-        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
-        src++;
+        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
+        mask++;
         dst++;
         width--;
     }