path: root/src/opts/Sk4px_NEON.h
author    mtklein <mtklein@chromium.org>  2015-05-13 12:19:42 -0700
committer Commit bot <commit-bot@chromium.org>  2015-05-13 12:19:42 -0700
commit    8a90edc2a58a4f8a4b4da73eb08e943be09538c0 (patch)
tree      4257ccbfada3494e3fd4d82e4dec135fa62e5612 /src/opts/Sk4px_NEON.h
parent    5ae1312c9faa25531c07e591b4dff6804020f121 (diff)
Sk4px: alphas() and Load[24]Alphas()
alphas() extracts the 4 alphas from an existing Sk4px as another Sk4px.
LoadNAlphas() constructs an Sk4px from N packed alphas.

In both cases, we end up with 4x repeated alphas aligned with their pixels.

alphas()
   A0 R0 G0 B0  A1 R1 G1 B1  A2 R2 G2 B2  A3 R3 G3 B3
-> A0 A0 A0 A0  A1 A1 A1 A1  A2 A2 A2 A2  A3 A3 A3 A3

Load4Alphas()
   A0 A1 A2 A3
-> A0 A0 A0 A0  A1 A1 A1 A1  A2 A2 A2 A2  A3 A3 A3 A3

Load2Alphas()
   A0 A1
-> A0 A0 A0 A0  A1 A1 A1 A1  0 0 0 0  0 0 0 0

This is a 5-10% speedup for AA on Intel, and a wash on ARM.  AA is still
mostly dominated by the final lerp.

alphas() isn't used yet, but it's similar enough to Load[24]Alphas() that it
was easier to write all at once.

BUG=skia:

Review URL: https://codereview.chromium.org/1138333003
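To make the layouts above concrete, here is a minimal scalar sketch of what these
routines compute (illustration only, not part of this CL; the Sk4px is modeled as
a plain 16-byte array, and the helper names load4AlphasRef/alphasRef are hypothetical):

#include <stdint.h>

// Scalar model of Load4Alphas(): splat each of the 4 packed alphas across
// the 4 byte lanes of its pixel, giving the A0 A0 A0 A0 ... layout above.
static void load4AlphasRef(const uint8_t a[4], uint8_t out[16]) {
    for (int i = 0; i < 4; i++) {
        out[4*i + 0] = out[4*i + 1] = out[4*i + 2] = out[4*i + 3] = a[i];
    }
}

// Scalar model of alphas(): with SK_A32_SHIFT == 24 on little-endian, the
// alpha is the top byte of each 32-bit pixel; extract it and splat it.
static void alphasRef(const uint32_t px[4], uint8_t out[16]) {
    for (int i = 0; i < 4; i++) {
        uint8_t a = (uint8_t)(px[i] >> 24);
        out[4*i + 0] = out[4*i + 1] = out[4*i + 2] = out[4*i + 3] = a;
    }
}

Load2Alphas() behaves like the first half of load4AlphasRef, with the remaining
8 output bytes left zero.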
Diffstat (limited to 'src/opts/Sk4px_NEON.h')
-rw-r--r--  src/opts/Sk4px_NEON.h  30
1 file changed, 30 insertions(+), 0 deletions(-)
diff --git a/src/opts/Sk4px_NEON.h b/src/opts/Sk4px_NEON.h
index ede5f2cd8e..368551949c 100644
--- a/src/opts/Sk4px_NEON.h
+++ b/src/opts/Sk4px_NEON.h
@@ -48,3 +48,33 @@ inline Sk4px Sk4px::Wide::addNarrowHi(const Sk16h& other) const {
     return Sk16b(vcombine_u8(vaddhn_u16(this->fLo.fVec, o.fLo.fVec),
                              vaddhn_u16(this->fHi.fVec, o.fHi.fVec)));
 }
+
+inline Sk4px Sk4px::alphas() const {
+    static_assert(SK_A32_SHIFT == 24, "This method assumes little-endian.");
+    auto as = vshrq_n_u32((uint32x4_t)this->fVec, 24);  // ___3 ___2 ___1 ___0
+    as = vorrq_u32(as, vshlq_n_u32(as,  8));            // __33 __22 __11 __00
+    as = vorrq_u32(as, vshlq_n_u32(as, 16));            // 3333 2222 1111 0000
+    return Sk16b((uint8x16_t)as);
+}
+
+inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
+    uint8x16_t a8 = vdupq_n_u8(0);                      // ____ ____ ____ ____
+    a8 = vld1q_lane_u8(a+0, a8,  0);                    // ____ ____ ____ ___0
+    a8 = vld1q_lane_u8(a+1, a8,  4);                    // ____ ____ ___1 ___0
+    a8 = vld1q_lane_u8(a+2, a8,  8);                    // ____ ___2 ___1 ___0
+    a8 = vld1q_lane_u8(a+3, a8, 12);                    // ___3 ___2 ___1 ___0
+    auto a32 = (uint32x4_t)a8;                          //
+    a32 = vorrq_u32(a32, vshlq_n_u32(a32,  8));         // __33 __22 __11 __00
+    a32 = vorrq_u32(a32, vshlq_n_u32(a32, 16));         // 3333 2222 1111 0000
+    return Sk16b((uint8x16_t)a32);
+}
+
+inline Sk4px Sk4px::Load2Alphas(const SkAlpha a[2]) {
+    uint8x16_t a8 = vdupq_n_u8(0);                      // ____ ____ ____ ____
+    a8 = vld1q_lane_u8(a+0, a8, 0);                     // ____ ____ ____ ___0
+    a8 = vld1q_lane_u8(a+1, a8, 4);                     // ____ ____ ___1 ___0
+    auto a32 = (uint32x4_t)a8;                          //
+    a32 = vorrq_u32(a32, vshlq_n_u32(a32,  8));         // ____ ____ __11 __00
+    a32 = vorrq_u32(a32, vshlq_n_u32(a32, 16));         // ____ ____ 1111 0000
+    return Sk16b((uint8x16_t)a32);
+}
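
The commit message notes that AA is still dominated by the final lerp; for
context, per-pixel-aligned alphas like the ones produced above typically feed a
per-byte multiply-and-divide-by-255. The following is a minimal NEON sketch of
that step, not code from this CL; mul_div255 is a hypothetical helper, and the
narrowing sequence is the usual ((x+128) + ((x+128)>>8)) >> 8 rounding
divide-by-255.

#include <arm_neon.h>

// Multiply 16 source bytes by 16 coverage bytes and divide by 255 with
// rounding: widen with vmull_u8, then narrow so each result byte is
// ((p + 128) + ((p + 128) >> 8)) >> 8, via vrshrq_n_u16 + vraddhn_u16.
static inline uint8x16_t mul_div255(uint8x16_t src, uint8x16_t cov) {
    uint16x8_t lo = vmull_u8(vget_low_u8(src),  vget_low_u8(cov));
    uint16x8_t hi = vmull_u8(vget_high_u8(src), vget_high_u8(cov));
    return vcombine_u8(vraddhn_u16(lo, vrshrq_n_u16(lo, 8)),
                       vraddhn_u16(hi, vrshrq_n_u16(hi, 8)));
}

The splatted alphas from Load4Alphas()/Load2Alphas()/alphas() would serve as the
coverage operand in a step like this.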