author    Matteo Franchin <matteo.franchin@arm.com> 2017-05-26 18:56:51 +0100
committer Skia Commit-Bot <skia-commit-bot@chromium.org> 2017-05-26 19:02:11 +0000
commit    a132c3869fcffb350d7a5ca7256496ab977bdd0c (patch)
tree      bac931af4afd8f08fce96f974a8a6c9b5ba12bc7 /src/opts
parent    0b403f8b6b0d07f115bc15dd65af69a850fa8172 (diff)
Faster and more accurate blit_row_s32a_opaque for ARM
Change the ARM implementation of alpha blending to work on 8 pixels at a
time (using NEON). Also improve the accuracy of alpha blending by using a
formula based on SkMulDiv255Round rather than SkPMSrcOver.

Note that a number of variations of this code were considered. Here are
some notes:

- A 16-pixels-at-a-time version was considered. It performs well in the
  extreme-alpha cases (all pixels fully opaque or fully transparent), but
  performs worse than the 8-pixel version when there are frequent
  transitions of alpha. Also, gcc 6.2.1 seems to have trouble with
  register pressure when using this version.

- If the branch that detects the fully-opaque and fully-transparent cases
  is removed, performance increases significantly for images in which all
  pixels are partially transparent (especially on ARM Cortex-A72), but can
  decrease significantly for images that are almost fully opaque or fully
  transparent.

This implementation is a compromise between the effects described above.

This patch produces a ~10% improvement on the nanobench sub-scores
repeatTile_BGRA_8888_A, constXTile_MM_filter_trans, constXTile_CC_trans
and constXTile_RR_filter_trans when running on ARM Cortex-A72.
Improvements of greater magnitude (20% to 30%) are observed when running
on ARM Cortex-A53.

CQ_INCLUDE_TRYBOTS=skia.primary:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD

Change-Id: I1f0c9f549057613bbffd26e6651f3beeb0019af9
Bug: skia:
Reviewed-on: https://skia-review.googlesource.com/16520
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
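For reference, here is a minimal scalar sketch of the blend this patch vectorizes, assuming Skia's SkMulDiv255Round rounding formula (the helper names below are illustrative, not part of the patch):

    #include <cstdint>

    // Rounded (x * y) / 255, following the same formula as Skia's SkMulDiv255Round.
    static inline uint8_t mul_div_255_round(uint8_t x, uint8_t y) {
        unsigned prod = x * y + 128;
        return static_cast<uint8_t>((prod + (prod >> 8)) >> 8);
    }

    // One color component of rounded SrcOver: src + round(((255 - src_alpha) * dst) / 255).
    static inline uint8_t src_over_component(uint8_t src, uint8_t dst, uint8_t src_alpha) {
        return static_cast<uint8_t>(src + mul_div_255_round(255 - src_alpha, dst));
    }

The NEON helpers added below (SkMulDiv255Round_neon8, SkPMSrcOver_neon8 and SkPMSrcOver_neon2) compute the same rounded result, eight lanes at a time.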
Diffstat (limited to 'src/opts')
-rw-r--r-- src/opts/SkBlitRow_opts.h | 112
1 file changed, 76 insertions, 36 deletions
diff --git a/src/opts/SkBlitRow_opts.h b/src/opts/SkBlitRow_opts.h
index 2a123f2dd7..74dead5da0 100644
--- a/src/opts/SkBlitRow_opts.h
+++ b/src/opts/SkBlitRow_opts.h
@@ -37,6 +37,45 @@ void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor
});
}
+#if defined(SK_ARM_HAS_NEON)
+
+// Return a uint8x8_t value, r, computed as r[i] = SkMulDiv255Round(x[i], y[i]), where r[i], x[i],
+// y[i] are the i-th lanes of the corresponding NEON vectors.
+static inline uint8x8_t SkMulDiv255Round_neon8(uint8x8_t x, uint8x8_t y) {
+ uint16x8_t prod = vmull_u8(x, y);
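+ // vrshrq_n_u16(prod, 8) computes (prod + 128) >> 8; vraddhn_u16 then adds that back to prod
+ // and takes the rounded high half, giving (prod + ((prod + 128) >> 8) + 128) >> 8, which
+ // matches the scalar SkMulDiv255Round formula.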
+ return vraddhn_u16(prod, vrshrq_n_u16(prod, 8));
+}
+
+// The implementations of SkPMSrcOver below perform alpha blending consistently with
+// SkMulDiv255Round. They compute the color components (numbers in the interval [0, 255]) as:
+//
+// result_i = src_i + rint(g(src_alpha, dst_i))
+//
+// where g(x, y) = ((255.0 - x) * y) / 255.0 and rint rounds to the nearest integer.
+
+// In this variant of SkPMSrcOver each NEON register, dst.val[i], src.val[i], contains the value
+// of the same color component for 8 consecutive pixels. The result of this function follows the
+// same convention.
+static inline uint8x8x4_t SkPMSrcOver_neon8(uint8x8x4_t dst, uint8x8x4_t src) {
+ uint8x8_t nalphas = vmvn_u8(src.val[3]);
+ uint8x8x4_t result;
+ result.val[0] = vadd_u8(src.val[0], SkMulDiv255Round_neon8(nalphas, dst.val[0]));
+ result.val[1] = vadd_u8(src.val[1], SkMulDiv255Round_neon8(nalphas, dst.val[1]));
+ result.val[2] = vadd_u8(src.val[2], SkMulDiv255Round_neon8(nalphas, dst.val[2]));
+ result.val[3] = vadd_u8(src.val[3], SkMulDiv255Round_neon8(nalphas, dst.val[3]));
+ return result;
+}
+
+// In this variant of SkPMSrcOver dst and src contain the color components of two consecutive
+// pixels. The return value follows the same convention.
+static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
+ const uint8x8_t alpha_indices = vcreate_u8(0x0707070703030303);
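+ // Lanes 0-3 of alpha_indices select byte 3 (the alpha of the first pixel) and lanes 4-7
+ // select byte 7 (the alpha of the second pixel), so vtbl1_u8 broadcasts each pixel's alpha
+ // across its own four color lanes.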
+ uint8x8_t nalphas = vmvn_u8(vtbl1_u8(src, alpha_indices));
+ return vadd_u8(src, SkMulDiv255Round_neon8(nalphas, dst));
+}
+
+#endif
+
static inline
void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
SkASSERT(alpha == 0xFF);
@@ -142,51 +181,52 @@ void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU a
}
#elif defined(SK_ARM_HAS_NEON)
- while (len >= 4) {
- if ((src[0] | src[1] | src[2] | src[3]) == 0x00000000) {
- // All 16 source pixels are transparent. Nothing to do.
- src += 4;
- dst += 4;
- len -= 4;
+ // Do 8 pixels at a time. A 16-pixels-at-a-time version of this code was also tested, but it
+ // underperformed on some of the platforms under test for inputs with frequent transitions of
+ // alpha (corresponding to changes of the conditions [~]alphas_u64 == 0 below). It may be worth
+ // revisiting the situation in the future.
+ while (len >= 8) {
+ // Load 8 pixels in 4 NEON registers. src_col.val[i] will contain the same color component
+ // for 8 consecutive pixels (e.g. src_col.val[3] will contain all alpha components of 8
+ // pixels).
+ uint8x8x4_t src_col = vld4_u8(reinterpret_cast<const uint8_t*>(src));
+ src += 8;
+ len -= 8;
+
+ // We now detect 2 special cases: the first occurs when all alphas are zero (the 8 pixels
+ // are all transparent), the second when all alphas are fully set (they are all opaque).
+ uint8x8_t alphas = src_col.val[3];
+ uint64_t alphas_u64 = vget_lane_u64(vreinterpret_u64_u8(alphas), 0);
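+ // Viewing the 8 alpha bytes as one 64-bit scalar lets each special case below be detected
+ // with a single compare and branch instead of per-lane tests.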
+ if (alphas_u64 == 0) {
+ // All pixels transparent.
+ dst += 8;
continue;
}
- if ((src[0] & src[1] & src[2] & src[3]) >= 0xFF000000) {
- // All 16 source pixels are opaque. SrcOver becomes Src.
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- src += 4;
- dst += 4;
- len -= 4;
+ if (~alphas_u64 == 0) {
+ // All pixels opaque.
+ vst4_u8(reinterpret_cast<uint8_t*>(dst), src_col);
+ dst += 8;
continue;
}
- // Load 4 source and destination pixels.
- auto src0 = vreinterpret_u8_u32(vld1_u32(src+0)),
- src2 = vreinterpret_u8_u32(vld1_u32(src+2)),
- dst0 = vreinterpret_u8_u32(vld1_u32(dst+0)),
- dst2 = vreinterpret_u8_u32(vld1_u32(dst+2));
-
- // TODO: This math is wrong.
- const uint8x8_t alphas = vcreate_u8(0x0707070703030303);
- auto invSA0_w = vsubw_u8(vdupq_n_u16(256), vtbl1_u8(src0, alphas)),
- invSA2_w = vsubw_u8(vdupq_n_u16(256), vtbl1_u8(src2, alphas));
-
- auto dstInvSA0 = vmulq_u16(invSA0_w, vmovl_u8(dst0)),
- dstInvSA2 = vmulq_u16(invSA2_w, vmovl_u8(dst2));
-
- dst0 = vadd_u8(src0, vshrn_n_u16(dstInvSA0, 8));
- dst2 = vadd_u8(src2, vshrn_n_u16(dstInvSA2, 8));
+ uint8x8x4_t dst_col = vld4_u8(reinterpret_cast<uint8_t*>(dst));
+ vst4_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon8(dst_col, src_col));
+ dst += 8;
+ }
- vst1_u32(dst+0, vreinterpret_u32_u8(dst0));
- vst1_u32(dst+2, vreinterpret_u32_u8(dst2));
+ // Deal with leftover pixels.
+ for (; len >= 2; len -= 2, src += 2, dst += 2) {
+ uint8x8_t src2 = vld1_u8(reinterpret_cast<const uint8_t*>(src));
+ uint8x8_t dst2 = vld1_u8(reinterpret_cast<const uint8_t*>(dst));
+ vst1_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon2(dst2, src2));
+ }
- src += 4;
- dst += 4;
- len -= 4;
+ if (len != 0) {
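+ // A single pixel remains. Only the low 32 bits of each 64-bit register hold real data;
+ // SkPMSrcOver_neon2 blends them and vst1_lane_u32 stores just that one pixel back.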
+ uint8x8_t result = SkPMSrcOver_neon2(vcreate_u8(*dst), vcreate_u8(*src));
+ vst1_lane_u32(dst, vreinterpret_u32_u8(result), 0);
}
+ return;
#endif
while (len-- > 0) {