From dbd94e2bb265b34c2d9bf82624909fef84a7217e Mon Sep 17 00:00:00 2001
From: mtklein
Date: Fri, 18 Mar 2016 11:07:46 -0700
Subject: custom ssse3 srcover_n_srgb_bw, about 1.8x speedup

This is a little demo of the sorts of speedups we can get from working in
planar format, or even just a mini-planar of 4 pixels at a time like I'm
doing here.

I chose this blit by running
    $ out/Release/nanobench --config srgb --match skp
and looking for the hottest sRGB-related method.

After this CL, src_1 and src_n become hotter than srcover_n.  They can
probably get a similar treatment.

We transpose three times in this function:
  - dst after reading, as part of the zero-extension and conversion to float
  - src after reading, _MM_TRANSPOSE4_PS (which expands to 8 cheap instructions)
  - result before writing, the last _mm_shuffle_epi8

If we changed our buffer format to a mini-planar format like
rrrr gggg bbbb aaaa, we could eliminate the src transpose and get another
small speedup, to right around 2x.

This code leans pretty heavily on SSSE3, so if we want it to speed up
Windows+Linux Chrome, it'll eventually want to go behind a function pointer.

This also appears to fix what looks like overflow in a few GMs, most
noticeably in hairmodes.  This is something we'd better look into...

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1813263002

Review URL: https://codereview.chromium.org/1813263002
---
 src/core/SkXfermode4f.cpp | 108 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 99 insertions(+), 9 deletions(-)

(limited to 'src')

diff --git a/src/core/SkXfermode4f.cpp b/src/core/SkXfermode4f.cpp
index b5981b2014..4e444a791a 100644
--- a/src/core/SkXfermode4f.cpp
+++ b/src/core/SkXfermode4f.cpp
@@ -178,7 +178,7 @@ template <DstType D> void src_1(const SkXfermode*, uint32_t dst[],
                 Sk4f r2 = lerp(s4_255, to_4f(dst[2]), Sk4f(aa4[2])) + Sk4f(0.5f);
                 Sk4f r3 = lerp(s4_255, to_4f(dst[3]), Sk4f(aa4[3])) + Sk4f(0.5f);
                 Sk4f_ToBytes((uint8_t*)dst, r0, r1, r2, r3);
-
+
                 dst += 4;
                 aa += 4;
                 count -= 4;
@@ -200,7 +200,7 @@ template <DstType D> void src_1(const SkXfermode*, uint32_t dst[],
                              linear_unit_to_srgb_255f(r1),
                              linear_unit_to_srgb_255f(r2),
                              linear_unit_to_srgb_255f(r3));
-
+
                 dst += 4;
                 aa += 4;
                 count -= 4;
@@ -233,6 +233,92 @@ const SkXfermode::D32Proc gProcs_Dst[] = {
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
+
+static void srcover_n_srgb_bw(uint32_t dst[], const SkPM4f src[], int count) {
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3  // For _mm_shuffle_epi8
+    while (count >= 4) {
+        // Load 4 sRGB RGBA/BGRA 8888 dst pixels.
+        // We'll write most of this as if they're RGBA, and just swizzle the src pixels to match.
+        __m128i d4 = _mm_loadu_si128((const __m128i*)dst);
+
+        // Transpose into planar and convert each plane to float.
+        auto _ = ~0;  // Shuffles in a zero byte.
+        auto dr = _mm_cvtepi32_ps(
+                _mm_shuffle_epi8(d4, _mm_setr_epi8(0,_,_,_, 4,_,_,_, 8,_,_,_,12,_,_,_)));
+        auto dg = _mm_cvtepi32_ps(
+                _mm_shuffle_epi8(d4, _mm_setr_epi8(1,_,_,_, 5,_,_,_, 9,_,_,_,13,_,_,_)));
+        auto db = _mm_cvtepi32_ps(
+                _mm_shuffle_epi8(d4, _mm_setr_epi8(2,_,_,_, 6,_,_,_,10,_,_,_,14,_,_,_)));
+        auto da = _mm_cvtepi32_ps(
+                _mm_shuffle_epi8(d4, _mm_setr_epi8(3,_,_,_, 7,_,_,_,11,_,_,_,15,_,_,_)));
+
+        // Scale to [0,1].
+        dr = _mm_mul_ps(dr, _mm_set1_ps(1/255.0f));
+        dg = _mm_mul_ps(dg, _mm_set1_ps(1/255.0f));
+        db = _mm_mul_ps(db, _mm_set1_ps(1/255.0f));
+        da = _mm_mul_ps(da, _mm_set1_ps(1/255.0f));
+
+        // Apply approximate sRGB gamma correction to convert to linear (as if gamma were 2).
+        dr = _mm_mul_ps(dr, dr);
+        dg = _mm_mul_ps(dg, dg);
+        db = _mm_mul_ps(db, db);
+
+        // Load 4 linear float src pixels.
+        auto s0 = _mm_loadu_ps(src[0].fVec),
+             s1 = _mm_loadu_ps(src[1].fVec),
+             s2 = _mm_loadu_ps(src[2].fVec),
+             s3 = _mm_loadu_ps(src[3].fVec);
+
+        // Transpose src pixels to planar too, and give the registers better names.
+        _MM_TRANSPOSE4_PS(s0, s1, s2, s3);
+        auto sr = s0,
+             sg = s1,
+             sb = s2,
+             sa = s3;
+
+        // Match color order with destination, if necessary.
+    #if defined(SK_PMCOLOR_IS_BGRA)
+        SkTSwap(sr, sb);
+    #endif
+
+        // Now, the meat of what we wanted to do... perform the srcover blend.
+        auto invSA = _mm_sub_ps(_mm_set1_ps(1), sa);
+        auto r = _mm_add_ps(sr, _mm_mul_ps(dr, invSA)),
+             g = _mm_add_ps(sg, _mm_mul_ps(dg, invSA)),
+             b = _mm_add_ps(sb, _mm_mul_ps(db, invSA)),
+             a = _mm_add_ps(sa, _mm_mul_ps(da, invSA));
+
+        // Convert back to sRGB and [0,255], again approximating sRGB as gamma == 2.
+        r = _mm_mul_ps(_mm_sqrt_ps(r), _mm_set1_ps(255));
+        g = _mm_mul_ps(_mm_sqrt_ps(g), _mm_set1_ps(255));
+        b = _mm_mul_ps(_mm_sqrt_ps(b), _mm_set1_ps(255));
+        a = _mm_mul_ps(            (a), _mm_set1_ps(255));
+
+        // Convert to int (with rounding) and pack back down to planar 8-bit.
+        __m128i x = _mm_packus_epi16(_mm_packus_epi16(_mm_cvtps_epi32(r), _mm_cvtps_epi32(g)),
+                                     _mm_packus_epi16(_mm_cvtps_epi32(b), _mm_cvtps_epi32(a)));
+
+        // Transpose back to interlaced RGBA and write back to dst.
+        x = _mm_shuffle_epi8(x, _mm_setr_epi8(0, 4, 8, 12,
+                                              1, 5, 9, 13,
+                                              2, 6, 10, 14,
+                                              3, 7, 11, 15));
+        _mm_storeu_si128((__m128i*)dst, x);
+
+        count -= 4;
+        dst += 4;
+        src += 4;
+    }
+#endif
+    // This should look just like the non-specialized case in srcover_n.
+    for (int i = 0; i < count; ++i) {
+        Sk4f s4 = src[i].to4f_pmorder();
+        Sk4f d4 = load_dst<kSRGB_Dst>(dst[i]);
+        Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4));
+        dst[i] = store_dst<kSRGB_Dst>(r4);
+    }
+}
+
 template <DstType D> void srcover_n(const SkXfermode*, uint32_t dst[],
                                     const SkPM4f src[], int count, const SkAlpha aa[]) {
     if (aa) {
@@ -250,11 +250,15 @@ template <DstType D> void srcover_n(const SkXfermode*, uint32_t dst[],
             dst[i] = store_dst<D>(r4);
         }
     } else {
-        for (int i = 0; i < count; ++i) {
-            Sk4f s4 = src[i].to4f_pmorder();
-            Sk4f d4 = load_dst<D>(dst[i]);
-            Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4));
-            dst[i] = store_dst<D>(r4);
+        if (D == kSRGB_Dst) {
+            srcover_n_srgb_bw(dst, src, count);
+        } else {
+            for (int i = 0; i < count; ++i) {
+                Sk4f s4 = src[i].to4f_pmorder();
+                Sk4f d4 = load_dst<D>(dst[i]);
+                Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4));
+                dst[i] = store_dst<D>(r4);
+            }
         }
     }
 }
@@ -263,7 +353,7 @@ static void srcover_linear_dst_1(const SkXfermode*, uint32_t dst[],
                                  const SkPM4f* src, int count, const SkAlpha aa[]) {
     const Sk4f s4 = src->to4f_pmorder();
     const Sk4f dst_scale = Sk4f(1 - get_alpha(s4));
-
+
     if (aa) {
         for (int i = 0; i < count; ++i) {
             unsigned a = aa[i];
@@ -396,7 +486,7 @@ static Sk4f lcd16_to_unit_4f(uint16_t rgb) {
 template <DstType D>
 void src_1_lcd(uint32_t dst[], const SkPM4f* src, int count, const uint16_t lcd[]) {
     const Sk4f s4 = Sk4f::Load(src->fVec);
-
+
     if (D == kLinear_Dst) {
         // operate in bias-255 space for src and dst
         const Sk4f s4bias = s4 * Sk4f(255);
--
cgit v1.2.3
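
Note (not part of the patch): the scalar sketch below spells out the same math the SSSE3 loop performs four pixels at a time: srcover in linear space, with dst converted to and from sRGB by approximating sRGB as gamma == 2. The LinearPM4f struct and srcover_srgb_bw_1 helper are hypothetical stand-ins for Skia's SkPM4f / load_dst / store_dst, and the byte unpacking assumes RGBA order with red in the low byte.

    // Illustrative scalar model of the blend above (not Skia API): premultiplied
    // linear src over one 8888 sRGB dst pixel, approximating sRGB as gamma == 2.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    struct LinearPM4f { float r, g, b, a; };   // hypothetical stand-in for SkPM4f

    static uint32_t srcover_srgb_bw_1(uint32_t dst, const LinearPM4f& s) {
        // Unpack dst, scale to [0,1], and square to approximate sRGB -> linear.
        float dr = ((dst >>  0) & 0xff) * (1 / 255.0f);
        float dg = ((dst >>  8) & 0xff) * (1 / 255.0f);
        float db = ((dst >> 16) & 0xff) * (1 / 255.0f);
        float da = ((dst >> 24) & 0xff) * (1 / 255.0f);   // alpha is not gamma-encoded
        dr *= dr;  dg *= dg;  db *= db;

        // srcover: result = src + dst * (1 - src alpha), all in linear space.
        float invSA = 1.0f - s.a;
        float r = s.r + dr * invSA;
        float g = s.g + dg * invSA;
        float b = s.b + db * invSA;
        float a = s.a + da * invSA;

        // sqrt undoes the gamma-2 approximation (not for alpha), then back to bytes
        // with clamping and rounding, like cvtps_epi32 + packus do above.
        auto to_byte = [](float x) {
            return (uint32_t)std::lround(std::min(std::max(x, 0.0f), 1.0f) * 255.0f);
        };
        return (to_byte(std::sqrt(r)) <<  0) |
               (to_byte(std::sqrt(g)) <<  8) |
               (to_byte(std::sqrt(b)) << 16) |
               (to_byte(a)            << 24);
    }

The SSSE3 path in srcover_n_srgb_bw does this same arithmetic on four pixels per register, which is where the roughly 1.8x speedup comes from.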