From f1b8b6ae34e5a1f4b29e423401da39f88f0c117a Mon Sep 17 00:00:00 2001
From: msarett <msarett@google.com>
Date: Fri, 22 Jan 2016 09:54:21 -0800
Subject: Use NEON optimizations for RGB -> RGB(FF) or BGR(FF) in SkSwizzler

Swizzle Bench Runtime Nexus 6P
xxx_xxxa        0.32x
xxx_swaprb_xxxa 0.31x

Swizzle Bench Runtime Nexus 9
xxx_xxxa        1.11x
xxx_swaprb_xxxa 1.14x
(This is a slowdown: ratios are new runtime / old runtime, so values above 1.0x are slower.)

Swizzle Bench Runtime Nexus 5
xxx_xxxa        0.12x
xxx_swaprb_xxxa 0.12x

RGB PNG Decode Runtime
Nexus 6P        0.94x
Nexus 9         0.98x

I don't know how to explain the fact that the Swizzle Bench was
slower on Nexus 9, but the decode times got faster.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1618003002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review URL: https://codereview.chromium.org/1618003002
---
 src/opts/SkSwizzler_opts.h | 142 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 124 insertions(+), 18 deletions(-)

(limited to 'src/opts/SkSwizzler_opts.h')

diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h
index 8d1be84df2..ad121cfafe 100644
--- a/src/opts/SkSwizzler_opts.h
+++ b/src/opts/SkSwizzler_opts.h
@@ -60,6 +60,34 @@ static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {
     }
 }
 
+static void RGB_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
+    const uint8_t* src = (const uint8_t*)vsrc;
+    for (int i = 0; i < count; i++) {
+        uint8_t r = src[0],
+                g = src[1],
+                b = src[2];
+        src += 3;
+        dst[i] = (uint32_t)0xFF << 24
+               | (uint32_t)b    << 16
+               | (uint32_t)g    <<  8
+               | (uint32_t)r    <<  0;
+    }
+}
+
+static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) {
+    const uint8_t* src = (const uint8_t*)vsrc;
+    for (int i = 0; i < count; i++) {
+        uint8_t r = src[0],
+                g = src[1],
+                b = src[2];
+        src += 3;
+        dst[i] = (uint32_t)0xFF << 24
+               | (uint32_t)r    << 16
+               | (uint32_t)g    <<  8
+               | (uint32_t)b    <<  0;
+    }
+}
+
 #if defined(SK_ARM_HAS_NEON)
 
 // Rounded divide by 255, (x + 127) / 255
@@ -96,12 +124,12 @@ static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
     auto src = (const uint32_t*)vsrc;
     while (count >= 8) {
         // Load 8 pixels.
-        uint8x8x4_t bgra = vld4_u8((const uint8_t*) src);
+        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
 
-        uint8x8_t a = bgra.val[3],
-                  b = bgra.val[2],
-                  g = bgra.val[1],
-                  r = bgra.val[0];
+        uint8x8_t a = rgba.val[3],
+                  b = rgba.val[2],
+                  g = rgba.val[1],
+                  r = rgba.val[0];
 
         // Premultiply.
         b = scale(b, a);
@@ -110,15 +138,15 @@ static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
 
         // Store 8 premultiplied pixels.
         if (kSwapRB) {
-            bgra.val[2] = r;
-            bgra.val[1] = g;
-            bgra.val[0] = b;
+            rgba.val[2] = r;
+            rgba.val[1] = g;
+            rgba.val[0] = b;
         } else {
-            bgra.val[2] = b;
-            bgra.val[1] = g;
-            bgra.val[0] = r;
+            rgba.val[2] = b;
+            rgba.val[1] = g;
+            rgba.val[0] = r;
         }
-        vst4_u8((uint8_t*) dst, bgra);
+        vst4_u8((uint8_t*) dst, rgba);
         src += 8;
         dst += 8;
         count -= 8;
@@ -141,13 +169,13 @@ static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
     auto src = (const uint32_t*)vsrc;
     while (count >= 16) {
         // Load 16 pixels.
-        uint8x16x4_t bgra = vld4q_u8((const uint8_t*) src);
+        uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
 
         // Swap r and b.
-        SkTSwap(bgra.val[0], bgra.val[2]);
+        SkTSwap(rgba.val[0], rgba.val[2]);
 
         // Store 16 pixels.
-        vst4q_u8((uint8_t*) dst, bgra);
+        vst4q_u8((uint8_t*) dst, rgba);
         src += 16;
         dst += 16;
         count -= 16;
@@ -155,13 +183,13 @@ static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
 
     if (count >= 8) {
         // Load 8 pixels.
-        uint8x8x4_t bgra = vld4_u8((const uint8_t*) src);
+        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
 
         // Swap r and b.
-        SkTSwap(bgra.val[0], bgra.val[2]);
+        SkTSwap(rgba.val[0], rgba.val[2]);
 
         // Store 8 pixels.
-        vst4_u8((uint8_t*) dst, bgra);
+        vst4_u8((uint8_t*) dst, rgba);
         src += 8;
         dst += 8;
         count -= 8;
@@ -170,6 +198,68 @@ static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
     RGBA_to_BGRA_portable(dst, src, count);
 }
 
+template <bool kSwapRB>
+static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
+    const uint8_t* src = (const uint8_t*) vsrc;
+    while (count >= 16) {
+        // Load 16 pixels.
+        uint8x16x3_t rgb = vld3q_u8(src);
+
+        // Insert an opaque alpha channel and swap if needed.
+        uint8x16x4_t rgba;
+        if (kSwapRB) {
+            rgba.val[0] = rgb.val[2];
+            rgba.val[2] = rgb.val[0];
+        } else {
+            rgba.val[0] = rgb.val[0];
+            rgba.val[2] = rgb.val[2];
+        }
+        rgba.val[1] = rgb.val[1];
+        rgba.val[3] = vdupq_n_u8(0xFF);
+
+        // Store 16 pixels.
+        vst4q_u8((uint8_t*) dst, rgba);
+        src += 16*3;
+        dst += 16;
+        count -= 16;
+    }
+
+    if (count >= 8) {
+        // Load 8 pixels.
+        uint8x8x3_t rgb = vld3_u8(src);
+
+        // Insert an opaque alpha channel and swap if needed.
+        uint8x8x4_t rgba;
+        if (kSwapRB) {
+            rgba.val[0] = rgb.val[2];
+            rgba.val[2] = rgb.val[0];
+        } else {
+            rgba.val[0] = rgb.val[0];
+            rgba.val[2] = rgb.val[2];
+        }
+        rgba.val[1] = rgb.val[1];
+        rgba.val[3] = vdup_n_u8(0xFF);
+
+        // Store 8 pixels.
+        vst4_u8((uint8_t*) dst, rgba);
+        src += 8*3;
+        dst += 8;
+        count -= 8;
+    }
+
+    // Call portable code to finish up the tail of [0,8) pixels.
+    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
+    proc(dst, src, count);
+}
+
+static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
+    insert_alpha_should_swaprb<false>(dst, src, count);
+}
+
+static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
+    insert_alpha_should_swaprb<true>(dst, src, count);
+}
+
 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
 
 template <bool kSwapRB>
@@ -268,6 +358,14 @@ static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
     RGBA_to_BGRA_portable(dst, src, count);
 }
 
+static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
+    RGB_to_RGB1_portable(dst, src, count);
+}
+
+static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
+    RGB_to_BGR1_portable(dst, src, count);
+}
+
 #else
 
 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
@@ -282,6 +380,14 @@ static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
     RGBA_to_BGRA_portable(dst, src, count);
 }
 
+static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
+    RGB_to_RGB1_portable(dst, src, count);
+}
+
+static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
+    RGB_to_BGR1_portable(dst, src, count);
+}
+
 #endif
 
 }
-- 
cgit v1.2.3