fast NEON divide-by-255

We can approximate (xy + 127) / 255 with (xy + 255) / 256. On ARM this approximate divide-by-255 is a single instruction, one of the two we use today to do a perfect divide-by-255 (#if 0).

This cuts div-255 in half, or a full mul-div-255 by a third. The U16(255) constant can even be created in a single instruction without hitting memory, which is as good as it gets.

Here's a nice little example:

    0000000000000000 <sk_premul_8bit>:
       0: f8408404   ldr    x4, [x0], #8                    // Load the next stage.
       4: 2e23c000   umull  v0.8h, v0.8b, v3.8b             // r = r * a
       8: 6f02e6b0   movi   v16.2d, #0xff00ff00ff00ff       // create U16(255)
       c: 2e23c021   umull  v1.8h, v1.8b, v3.8b             // g = g * a
      10: 2e23c042   umull  v2.8h, v2.8b, v3.8b             // b = b * a
      14: 0e304000   addhn  v0.8b, v0.8h, v16.8h            // r = div255(r)
      18: 0e304021   addhn  v1.8b, v1.8h, v16.8h            // g = div255(g)
      1c: 0e304042   addhn  v2.8b, v2.8h, v16.8h            // b = div255(b)
      20: d61f0080   br     x4                              // JUMP!

Change-Id: I4224ed3844abf6c67d9e42b67444a60f4aee8f08
Reviewed-on: https://skia-review.googlesource.com/40121
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Florin Malita <fmalita@chromium.org>
author: Mike Klein <mtklein@chromium.org> 2017-08-29 18:10:15 -0400
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> 2017-08-29 23:12:21 +0000
commit: 21befdcf5e2509d9c7111a8d3dc1ba74c902fae2 (patch)
tree: 4b5f962bd811aeb06a7124a8c8152d8ba8a03a45 /src/jumper
parent: 569b74c38cf782281ad14a289f02f13bfcbdbf69 (diff)
1 files changed, 6 insertions, 1 deletions
diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp
index 26432ca744..f76634a5b7 100644
--- a/src/jumper/SkJumper_stages_8bit.cpp
+++ b/src/jumper/SkJumper_stages_8bit.cpp
@@ -476,8 +476,13 @@ SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int x, int y) {
         V(int   v) : vec(v) {}
         V(float v) : vec(v * 255) {}
         V(U16   v) {
-            // (v + 127) / 255 == (v + (v+128)>>8 +128) >> 8
+        #if 0
+            // (v + 127) / 255 = (v + ((v+128)>>8) + 128) >> 8
             vec = vraddhn_u16(v, vrshrq_n_u16(v, 8));
+        #else
+            // (v + 127) / 255 ≈ (v + 255) >> 8
+            vec = vaddhn_u16(v, U16(255));
+        #endif
         }
 
         operator U8() const { return vec; }
author	Mike Klein <mtklein@chromium.org>	2017-08-29 18:10:15 -0400
committer	Skia Commit-Bot <skia-commit-bot@chromium.org>	2017-08-29 23:12:21 +0000
commit	21befdcf5e2509d9c7111a8d3dc1ba74c902fae2 (patch)
tree	4b5f962bd811aeb06a7124a8c8152d8ba8a03a45 /src/jumper
parent	569b74c38cf782281ad14a289f02f13bfcbdbf69 (diff)