SkPx: new approach to fixed-point SIMD

SkPx is like Sk4px, except each platform implementation of SkPx can declare a different sweet spot of N pixels, with extra loads and stores to handle the ragged edge of 0<n<N pixels. In this case, _sse's sweet spot remains 4 pixels. _neon jumps up to 8 so we can now use NEON's transposing loads and stores, and _none is just 1. This makes operations involving alpha considerably more efficient on NEON, as alpha is its own distinct 8x8 bit plane that's easy to toss around.

This incorporates a few other improvements I've been wanting:
- no requirement that we're dealing with SkPMColor. SkColor works too.
- no anonymous namespace hack to differentiate implementations.

Codegen and perf look good on Clang/x86-64 and GCC/ARMv7. The NEON code looks very similar to the old NEON code, as intended. No .skp or GM diffs on my laptop. Don't expect any.

I intend this to replace Sk4px. Plan after landing:
- port SkXfermode_opts.h
- port Color32 in SkBlitRow_D32.cpp (and move to SkBlitRow_opts.h like other SkOpts code)
- delete all Sk4px-related code
- clean up evolutionary dead ends in SkNx (Sk16b, Sk16h, Sk4i, Sk4d, etc.) leaving Sk2f, Sk4f (and Sk2s, Sk4s).
- find a machine with AVX2 to work on, write SkPx_avx2.h handling 8 pixels at a time.

In the end we'll have Sk4f for float pixels, SkPx for fixed-point pixels.

BUG=skia:4117
Review URL: https://codereview.chromium.org/1317233005
author: mtklein <mtklein@chromium.org> 2015-09-14 12:43:20 -0700
committer: Commit bot <commit-bot@chromium.org> 2015-09-14 12:43:20 -0700
commit: 82c93b45ed6ac0b628adb8375389c202d1f586f9 (patch)
tree: bdb517b2f2a05fe22dda1f84f5f5eafda87efa87 /src/opts/SkPx_none.h
parent: b5b603241aaa99e07dc4e12ca9f2661aa85e5f74 (diff)
1 files changed, 106 insertions, 0 deletions
diff --git a/src/opts/SkPx_none.h b/src/opts/SkPx_none.h
new file mode 100644
index 0000000000..a4758c1004
--- /dev/null
+++ b/src/opts/SkPx_none.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkPx_none_DEFINED
+#define SkPx_none_DEFINED
+
+// Nothing fancy here.  We're the backup _none case after all.
+// Our declared sweet spot is simply a single pixel at a time.
+
+struct SkPx_none {  // Scalar fallback pixel: four 8-bit channels, handled one pixel at a time.
+    static const int N = 1;  // Number of pixels processed per LoadN()/storeN() call.
+    uint8_t f8[4];  // The pixel's four bytes, in the memory order of the source uint32_t.
+
+    SkPx_none(uint32_t px) { memcpy(f8, &px, 4); }  // Non-explicit: 'return px;' and 'return 0;' below rely on this conversion.
+    SkPx_none(uint8_t x, uint8_t y, uint8_t z, uint8_t a) {  // Wider integer arguments are truncated to 8 bits here.
+        f8[0] = x; f8[1] = y; f8[2] = z; f8[3] = a;
+    }
+
+    static SkPx_none Dup(uint32_t px) { return px; }  // With N == 1 there is only one copy to make.
+    static SkPx_none LoadN(const uint32_t* px) { return *px; }  // Reads exactly one pixel.
+    static SkPx_none Load(int n, const uint32_t* px) {  // Partial load of 0<n<N pixels; never valid when N == 1.
+        SkASSERT(false);  // There are no 0<n<1.
+        return 0;  // Placeholder all-zero pixel so the function still returns a value.
+    }
+
+    void storeN(uint32_t* px) const { memcpy(px, f8, 4); }  // Writes exactly one pixel (4 bytes).
+    void store(int n, uint32_t* px) const {  // Partial store of 0<n<N pixels; never valid when N == 1.
+        SkASSERT(false);  // There are no 0<n<1.
+    }
+
+    struct Alpha {  // A single 8-bit alpha value.
+        uint8_t fA;
+        Alpha(uint8_t a) : fA(a) {}  // Non-explicit: the factories and inv() below return plain integers.
+
+        static Alpha Dup(uint8_t a) { return a; }  // With N == 1 there is only one copy to make.
+        static Alpha LoadN(const uint8_t* a) { return *a; }  // Reads exactly one alpha byte.
+        static Alpha Load(int n, const uint8_t* a) {  // Partial load of 0<n<N alphas; never valid when N == 1.
+            SkASSERT(false);  // There are no 0<n<1.
+            return 0;  // Placeholder zero alpha so the function still returns a value.
+        }
+        Alpha inv() const { return 255 - fA; }  // 255 - fA always lies in [0, 255], so the narrowing is lossless.
+    };
+
+    struct Wide {  // Four 16-bit lanes, one per channel, for intermediate arithmetic.
+        uint16_t f16[4];
+
+        Wide(uint16_t x, uint16_t y, uint16_t z, uint16_t a) {  // int arguments are truncated to 16 bits, so the operators below wrap mod 2^16.
+            f16[0] = x; f16[1] = y; f16[2] = z; f16[3] = a;
+        }
+
+        Wide operator+(const Wide& o) const {  // Lane-wise sum.
+            return Wide(f16[0]+o.f16[0], f16[1]+o.f16[1], f16[2]+o.f16[2], f16[3]+o.f16[3]);
+        }
+        Wide operator-(const Wide& o) const {  // Lane-wise difference.
+            return Wide(f16[0]-o.f16[0], f16[1]-o.f16[1], f16[2]-o.f16[2], f16[3]-o.f16[3]);
+        }
+        Wide operator<<(int bits) const {  // Lane-wise left shift; bits shifted past bit 15 are dropped.
+            return Wide(f16[0]<<bits, f16[1]<<bits, f16[2]<<bits, f16[3]<<bits);
+        }
+        Wide operator>>(int bits) const {  // Lane-wise right shift; lanes are unsigned, so zeros shift in.
+            return Wide(f16[0]>>bits, f16[1]>>bits, f16[2]>>bits, f16[3]>>bits);
+        }
+
+        SkPx_none addNarrowHi(const SkPx_none& o) const {  // Per lane: the high byte of (this + o).
+            Wide sum = (*this + o.widenLo()) >> 8;  // The 16-bit sum wraps before the shift.
+            return SkPx_none(sum.f16[0], sum.f16[1], sum.f16[2], sum.f16[3]);  // Each lane is <= 255 after >> 8, so narrowing is lossless.
+        }
+    };
+
+    Alpha alpha() const { return f8[3]; }  // Byte 3 is the alpha channel.
+
+    Wide widenLo() const { return Wide(f8[0], f8[1], f8[2], f8[3]); }  // Zero-extends each channel to 16 bits.
+    Wide widenHi() const { return this->widenLo() << 8; }  // Places each channel in the high byte: x << 8.
+    Wide widenLoHi() const { return this->widenLo() + this->widenHi(); }  // x + (x << 8), i.e. x * 257 per lane.
+
+    SkPx_none operator+(const SkPx_none& o) const {  // Lane-wise add; wraps mod 256 via the uint8_t constructor parameters.
+        return SkPx_none(f8[0]+o.f8[0], f8[1]+o.f8[1], f8[2]+o.f8[2], f8[3]+o.f8[3]);
+    }
+    SkPx_none operator-(const SkPx_none& o) const {  // Lane-wise subtract; wraps mod 256 via the uint8_t constructor parameters.
+        return SkPx_none(f8[0]-o.f8[0], f8[1]-o.f8[1], f8[2]-o.f8[2], f8[3]-o.f8[3]);
+    }
+    SkPx_none saturatedAdd(const SkPx_none& o) const {  // Lane-wise add clamped to [0, 255]; SkTMax/SkTMin are defined elsewhere (presumably max/min).
+        return SkPx_none(SkTMax(0, SkTMin(255, f8[0]+o.f8[0])),
+                         SkTMax(0, SkTMin(255, f8[1]+o.f8[1])),
+                         SkTMax(0, SkTMin(255, f8[2]+o.f8[2])),
+                         SkTMax(0, SkTMin(255, f8[3]+o.f8[3])));
+    }
+
+    Wide operator*(const Alpha& a) const {  // Per lane x * a as a 16-bit product; the maximum 255*255 = 65025 fits.
+        return Wide(f8[0]*a.fA, f8[1]*a.fA, f8[2]*a.fA, f8[3]*a.fA);
+    }
+    SkPx_none approxMulDiv255(const Alpha& a) const {  // Per lane (x*a + x) >> 8, used as an approximation of x*a/255.
+        return (*this * a).addNarrowHi(*this);
+    }
+
+    SkPx_none addAlpha(const Alpha& a) const {  // Adds a to byte 3 only (wrapping mod 256); other channels are unchanged.
+        return SkPx_none(f8[0], f8[1], f8[2], f8[3]+a.fA);
+    }
+};
+typedef SkPx_none SkPx;  // Expose this implementation under the generic SkPx name.
+
+#endif//SkPx_none_DEFINED
author	mtklein <mtklein@chromium.org>	2015-09-14 12:43:20 -0700
committer	Commit bot <commit-bot@chromium.org>	2015-09-14 12:43:20 -0700
commit	82c93b45ed6ac0b628adb8375389c202d1f586f9 (patch)
tree	bdb517b2f2a05fe22dda1f84f5f5eafda87efa87 /src/opts/SkPx_none.h
parent	b5b603241aaa99e07dc4e12ca9f2661aa85e5f74 (diff)