diff options
author | Mike Klein <mtklein@chromium.org> | 2017-08-29 18:10:15 -0400 |
---|---|---|
committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2017-08-29 23:12:21 +0000 |
commit | 21befdcf5e2509d9c7111a8d3dc1ba74c902fae2 (patch) | |
tree | 4b5f962bd811aeb06a7124a8c8152d8ba8a03a45 /src/jumper | |
parent | 569b74c38cf782281ad14a289f02f13bfcbdbf69 (diff) |
fast NEON divide-by-255
We can approximate (xy + 127) / 255 with (xy + 255) / 256.
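On ARM this approximate divide-by-255 is a single instruction (addhn, via
vaddhn_u16), versus the two-instruction sequence (vrshrq_n_u16 + vraddhn_u16)
we use today to do a perfect divide-by-255, which is now kept under #if 0.
This cuts div-255 in half (two instructions to one), or a full mul-div-255 by
a third (three instructions to two). The U16(255) constant can even be created
in a single instruction without hitting memory, which is as good as it gets.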
Here's a nice little example:
0000000000000000 <sk_premul_8bit>:
0: f8408404 ldr x4, [x0], #8 // Load the next stage.
4: 2e23c000 umull v0.8h, v0.8b, v3.8b // r = r * a
8: 6f02e6b0 movi v16.2d, #0xff00ff00ff00ff // create U16(255)
c: 2e23c021 umull v1.8h, v1.8b, v3.8b // g = g * a
10: 2e23c042 umull v2.8h, v2.8b, v3.8b // b = b * a
14: 0e304000 addhn v0.8b, v0.8h, v16.8h // r = div255(r)
18: 0e304021 addhn v1.8b, v1.8h, v16.8h // g = div255(g)
1c: 0e304042 addhn v2.8b, v2.8h, v16.8h // b = div255(b)
20: d61f0080 br x4 // JUMP!
Change-Id: I4224ed3844abf6c67d9e42b67444a60f4aee8f08
Reviewed-on: https://skia-review.googlesource.com/40121
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Florin Malita <fmalita@chromium.org>
Diffstat (limited to 'src/jumper')
-rw-r--r-- | src/jumper/SkJumper_stages_8bit.cpp | 7 |
1 files changed, 6 insertions, 1 deletions
diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp
index 26432ca744..f76634a5b7 100644
--- a/src/jumper/SkJumper_stages_8bit.cpp
+++ b/src/jumper/SkJumper_stages_8bit.cpp
@@ -476,8 +476,13 @@ SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int x, int y) {
         V(int v) : vec(v) {}
         V(float v) : vec(v * 255) {}
         V(U16 v) {
-            // (v + 127) / 255 == (v + (v+128)>>8 +128) >> 8
+        #if 0
+            // (v + 127) / 255 = (v + ((v+128)>>8) + 128) >> 8
             vec = vraddhn_u16(v, vrshrq_n_u16(v, 8));
+        #else
+            // (v + 127) / 255 ≈ (v + 255) >> 8
+            vec = vaddhn_u16(v, U16(255));
+        #endif
         }
         operator U8() const { return vec; }