Refactor FourByteInterps. Add 64-bit Fast version. Add tests.

Effect on benches (only _fast_ should be affected, and only on 64-bit): Desktop (64-bit) four_byte_interp_slow_255 NONRENDERING c 7.80 7.84 -0.04 -0.5% four_byte_interp_slow_256 NONRENDERING c 7.38 7.36 +0.02 +0.3% four_byte_interp_fast_256 NONRENDERING c 4.86 4.38 +0.48 +9.9% four_byte_interp_fast_255 NONRENDERING c 5.80 5.16 +0.64 +11.0% N5 (32-bit) four_byte_interp_slow_256 NONRENDERING c 22.22 22.66 -0.44 -2.0% four_byte_interp_fast_255 NONRENDERING c 22.22 22.22 +0.00 +0.0% four_byte_interp_fast_256 NONRENDERING c 18.81 18.81 +0.00 +0.0% four_byte_interp_slow_255 NONRENDERING c 22.42 22.42 +0.00 +0.0% BUG= R=reed@google.com Author: mtklein@google.com Review URL: https://codereview.chromium.org/100923003 git-svn-id: http://skia.googlecode.com/svn/trunk@12468 2bbb7eff-a529-9590-31e7-b0007b416f81
author: commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> 2013-12-03 18:53:30 +0000
committer: commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> 2013-12-03 18:53:30 +0000
commit: 55ca8244cc19c3067defa64f139521264d777eb0 (patch)
tree: 545ee9d8821b38f775bf6a00f448f309988df960 /include/core/SkColorPriv.h
parent: 3361471a3504ecd0351ff70f4c42d8d6fee963d4 (diff)
1 files changed, 69 insertions, 21 deletions
diff --git a/include/core/SkColorPriv.h b/include/core/SkColorPriv.h
index fe4377fda6..98f4e5b698 100644
--- a/include/core/SkColorPriv.h
+++ b/include/core/SkColorPriv.h
@@ -269,34 +269,82 @@ static inline SkPMColor SkFourByteInterp(SkPMColor src, SkPMColor dst,
 }
 
 /**
- * 32b optimized version; currently appears to be 10% faster even on 64b
- * architectures than an equivalent 64b version and 30% faster than
- * SkFourByteInterp(). Third parameter controls blending of the first two:
- *   (src, dst, 0) returns dst
- *   (src, dst, 256) returns src
- * ** Does not match the results of SkFourByteInterp256() because we use
- * a more accurate scale computation!
- * TODO: migrate Skia function to using an accurate 255->266 alpha
- * conversion.
+ * 0xAARRGGBB -> 0x00AA00GG, 0x00RR00BB
+ */
+static inline void SkSplay(SkPMColor color, uint32_t* ag, uint32_t* rb) {
+    const uint32_t mask = 0x00FF00FF;
+    *ag = (color >> 8) & mask;
+    *rb = color & mask;
+}
+
+/**
+ * 0xAARRGGBB -> 0x00AA00GG00RR00BB
+ * (note, ARGB -> AGRB)
+ */
+static inline uint64_t SkSplay(SkPMColor color) {
+    const uint32_t mask = 0x00FF00FF;
+    uint64_t agrb = (color >> 8) & mask;  // 0x0000000000AA00GG
+    agrb <<= 32;                          // 0x00AA00GG00000000
+    agrb |= color & mask;                 // 0x00AA00GG00RR00BB
+    return agrb;
+}
+
+/**
+ * 0xAAxxGGxx, 0xRRxxBBxx-> 0xAARRGGBB
+ */
+static inline SkPMColor SkUnsplay(uint32_t ag, uint32_t rb) {
+    const uint32_t mask = 0xFF00FF00;
+    return (ag & mask) | ((rb & mask) >> 8);
+}
+
+/**
+ * 0xAAxxGGxxRRxxBBxx -> 0xAARRGGBB
+ * (note, AGRB -> ARGB)
  */
-static inline SkPMColor SkFastFourByteInterp256(SkPMColor src,
-                                                SkPMColor dst,
-                                                unsigned scale) {
+static inline SkPMColor SkUnsplay(uint64_t agrb) {
+    const uint32_t mask = 0xFF00FF00;
+    return ((agrb & mask) >> 8) |  // 0x00RR00BB
+           ((agrb >> 32) & mask);  // 0xAARRGGBB
+}
+
+static inline SkPMColor SkFastFourByteInterp256_32(SkPMColor src, SkPMColor dst, unsigned scale) {
     SkASSERT(scale <= 256);
 
-    // Reorders ARGB to AG-RB in order to reduce the number of operations.
-    const uint32_t mask = 0xFF00FF;
-    uint32_t src_rb = src & mask;
-    uint32_t src_ag = (src >> 8) & mask;
-    uint32_t dst_rb = dst & mask;
-    uint32_t dst_ag = (dst >> 8) & mask;
+    // Two 8-bit blends per two 32-bit registers, with space to make sure the math doesn't collide.
+    uint32_t src_ag, src_rb, dst_ag, dst_rb;
+    SkSplay(src, &src_ag, &src_rb);
+    SkSplay(dst, &dst_ag, &dst_rb);
 
-    uint32_t ret_rb = src_rb * scale + (256 - scale) * dst_rb;
-    uint32_t ret_ag = src_ag * scale + (256 - scale) * dst_ag;
+    const uint32_t ret_ag = src_ag * scale + (256 - scale) * dst_ag;
+    const uint32_t ret_rb = src_rb * scale + (256 - scale) * dst_rb;
 
-    return (ret_ag & ~mask) | ((ret_rb & ~mask) >> 8);
+    return SkUnsplay(ret_ag, ret_rb);
 }
 
+static inline SkPMColor SkFastFourByteInterp256_64(SkPMColor src, SkPMColor dst, unsigned scale) {
+    SkASSERT(scale <= 256);
+    // Four 8-bit blends in one 64-bit register, with space to make sure the math doesn't collide.
+    return SkUnsplay(SkSplay(src) * scale + (256-scale) * SkSplay(dst));
+}
+
+// TODO(mtklein): Replace slow versions with fast versions, using scale + (scale>>7) everywhere.
+
+/**
+ * Same as SkFourByteInterp256, but faster.
+ */
+static inline SkPMColor SkFastFourByteInterp256(SkPMColor src, SkPMColor dst, unsigned scale) {
+    // On a 64-bit machine, _64 is about 10% faster than _32, but ~40% slower on a 32-bit machine.
+    if (sizeof(void*) == 4) {
+        return SkFastFourByteInterp256_32(src, dst, scale);
+    } else {
+        return SkFastFourByteInterp256_64(src, dst, scale);
+    }
+}
+
+/**
+ * Nearly the same as SkFourByteInterp, but faster and a touch more accurate, due to better
+ * srcWeight scaling to [0, 256].
+ */
 static inline SkPMColor SkFastFourByteInterp(SkPMColor src,
                                              SkPMColor dst,
                                              U8CPU srcWeight) {
author	commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>	2013-12-03 18:53:30 +0000
committer	commit-bot@chromium.org <commit-bot@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81>	2013-12-03 18:53:30 +0000
commit	55ca8244cc19c3067defa64f139521264d777eb0 (patch)
tree	545ee9d8821b38f775bf6a00f448f309988df960 /include/core/SkColorPriv.h
parent	3361471a3504ecd0351ff70f4c42d8d6fee963d4 (diff)