Diffstat (limited to 'src/opts')
 src/opts/Sk4px_NEON.h | 30
 src/opts/Sk4px_SSE2.h | 39
 src/opts/Sk4px_none.h | 22
 3 files changed, 91 insertions(+), 0 deletions(-)
diff --git a/src/opts/Sk4px_NEON.h b/src/opts/Sk4px_NEON.h
index ede5f2cd8e..368551949c 100644
--- a/src/opts/Sk4px_NEON.h
+++ b/src/opts/Sk4px_NEON.h
@@ -48,3 +48,33 @@ inline Sk4px Sk4px::Wide::addNarrowHi(const Sk16h& other) const {
return Sk16b(vcombine_u8(vaddhn_u16(this->fLo.fVec, o.fLo.fVec),
vaddhn_u16(this->fHi.fVec, o.fHi.fVec)));
}
+
+inline Sk4px Sk4px::alphas() const {
+ static_assert(SK_A32_SHIFT == 24, "This method assumes little-endian.");
+ auto as = vshrq_n_u32((uint32x4_t)this->fVec, 24); // ___3 ___2 ___1 ___0
+ as = vorrq_u32(as, vshlq_n_u32(as, 8)); // __33 __22 __11 __00
+ as = vorrq_u32(as, vshlq_n_u32(as, 16)); // 3333 2222 1111 0000
+ return Sk16b((uint8x16_t)as);
+}
+
+inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
+ uint8x16_t a8 = vdupq_n_u8(0); // ____ ____ ____ ____
+ a8 = vld1q_lane_u8(a+0, a8, 0); // ____ ____ ____ ___0
+ a8 = vld1q_lane_u8(a+1, a8, 4); // ____ ____ ___1 ___0
+ a8 = vld1q_lane_u8(a+2, a8, 8); // ____ ___2 ___1 ___0
+ a8 = vld1q_lane_u8(a+3, a8, 12); // ___3 ___2 ___1 ___0
+ auto a32 = (uint32x4_t)a8; //
+ a32 = vorrq_u32(a32, vshlq_n_u32(a32, 8)); // __33 __22 __11 __00
+ a32 = vorrq_u32(a32, vshlq_n_u32(a32, 16)); // 3333 2222 1111 0000
+ return Sk16b((uint8x16_t)a32);
+}
+
+inline Sk4px Sk4px::Load2Alphas(const SkAlpha a[2]) {
+ uint8x16_t a8 = vdupq_n_u8(0); // ____ ____ ____ ____
+ a8 = vld1q_lane_u8(a+0, a8, 0); // ____ ____ ____ ___0
+ a8 = vld1q_lane_u8(a+1, a8, 4); // ____ ____ ___1 ___0
+ auto a32 = (uint32x4_t)a8; //
+ a32 = vorrq_u32(a32, vshlq_n_u32(a32, 8)); // ____ ____ __11 __00
+ a32 = vorrq_u32(a32, vshlq_n_u32(a32, 16)); // ____ ____ 1111 0000
+ return Sk16b((uint8x16_t)a32);
+}
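
The NEON code above relies on one trick: each 32-bit lane starts as 000000AA, and two shift-and-OR steps replicate that low byte across the whole lane. Below is a standalone sketch of the same load-and-splat sequence as Load4Alphas, outside the Sk4px class. It is illustrative only and not part of the patch; it assumes an ARM compiler with NEON enabled, and the test values and main() harness are made up for the example.

// Illustrative sketch only (not part of the patch): the same lane loads and
// shift/OR splat as Load4Alphas above, as a free-standing program.
// Assumes an ARM target with NEON.
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main() {
    const uint8_t alphas[4] = { 0x10, 0x20, 0x30, 0x40 };

    uint8x16_t a8 = vdupq_n_u8(0);               // ____ ____ ____ ____
    a8 = vld1q_lane_u8(alphas + 0, a8,  0);      // ____ ____ ____ ___0
    a8 = vld1q_lane_u8(alphas + 1, a8,  4);      // ____ ____ ___1 ___0
    a8 = vld1q_lane_u8(alphas + 2, a8,  8);      // ____ ___2 ___1 ___0
    a8 = vld1q_lane_u8(alphas + 3, a8, 12);      // ___3 ___2 ___1 ___0

    uint32x4_t a32 = vreinterpretq_u32_u8(a8);
    a32 = vorrq_u32(a32, vshlq_n_u32(a32,  8));  // __33 __22 __11 __00
    a32 = vorrq_u32(a32, vshlq_n_u32(a32, 16));  // 3333 2222 1111 0000

    uint8_t out[16];
    vst1q_u8(out, vreinterpretq_u8_u32(a32));
    for (int i = 0; i < 16; i++) {
        printf("%02x ", out[i]);  // prints 10 x4, 20 x4, 30 x4, 40 x4
    }
    printf("\n");
    return 0;
}
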
diff --git a/src/opts/Sk4px_SSE2.h b/src/opts/Sk4px_SSE2.h
index d036328c14..55e1a69135 100644
--- a/src/opts/Sk4px_SSE2.h
+++ b/src/opts/Sk4px_SSE2.h
@@ -37,3 +37,42 @@ inline Sk4px Sk4px::Wide::addNarrowHi(const Sk16h& other) const {
Sk4px::Wide r = (*this + other) >> 8;
return Sk4px(_mm_packus_epi16(r.fLo.fVec, r.fHi.fVec));
}
+
+// Load4Alphas and Load2Alphas use possibly-unaligned loads (SkAlpha[] -> uint16_t or uint32_t).
+// These are safe on x86, often with no speed penalty.
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
+ inline Sk4px Sk4px::alphas() const {
+ static_assert(SK_A32_SHIFT == 24, "Intel's always little-endian.");
+ __m128i splat = _mm_set_epi8(15,15,15,15, 11,11,11,11, 7,7,7,7, 3,3,3,3);
+ return Sk16b(_mm_shuffle_epi8(this->fVec, splat));
+ }
+
+ inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
+ uint32_t as = *(const uint32_t*)a;
+ __m128i splat = _mm_set_epi8(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
+ return Sk16b(_mm_shuffle_epi8(_mm_cvtsi32_si128(as), splat));
+ }
+#else
+ inline Sk4px Sk4px::alphas() const {
+ static_assert(SK_A32_SHIFT == 24, "Intel's always little-endian.");
+ __m128i as = _mm_srli_epi32(this->fVec, 24); // ___3 ___2 ___1 ___0
+ as = _mm_or_si128(as, _mm_slli_si128(as, 1)); // __33 __22 __11 __00
+ as = _mm_or_si128(as, _mm_slli_si128(as, 2)); // 3333 2222 1111 0000
+ return Sk16b(as);
+ }
+
+ inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
+ __m128i as = _mm_cvtsi32_si128(*(const uint32_t*)a); // ____ ____ ____ 3210
+ as = _mm_unpacklo_epi8 (as, _mm_setzero_si128()); // ____ ____ _3_2 _1_0
+ as = _mm_unpacklo_epi16(as, _mm_setzero_si128()); // ___3 ___2 ___1 ___0
+ as = _mm_or_si128(as, _mm_slli_si128(as, 1)); // __33 __22 __11 __00
+ as = _mm_or_si128(as, _mm_slli_si128(as, 2)); // 3333 2222 1111 0000
+ return Sk16b(as);
+ }
+#endif
+
+inline Sk4px Sk4px::Load2Alphas(const SkAlpha a[2]) {
+ uint32_t as = *(const uint16_t*)a; // Aa -> Aa00
+ return Load4Alphas((const SkAlpha*)&as);
+}
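
On the SSSE3 path the whole splat collapses into one _mm_shuffle_epi8: byte i of the mask selects source byte mask[i], so a mask of 3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0 broadcasts each loaded alpha across the four bytes of its pixel. Here is a minimal standalone sketch of that path, illustrative only and not part of the patch; it assumes an x86 compiler with SSSE3 enabled (e.g. -mssse3), and the test values and harness are made up for the example.

// Illustrative sketch only (not part of the patch): the SSSE3 byte-shuffle
// splat from Load4Alphas above, as a free-standing program.
#include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main() {
    const uint8_t alphas[4] = { 0x10, 0x20, 0x30, 0x40 };

    uint32_t packed;
    memcpy(&packed, alphas, sizeof(packed));  // the same 4-byte load as Load4Alphas

    const __m128i splat = _mm_set_epi8(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
    __m128i v = _mm_shuffle_epi8(_mm_cvtsi32_si128((int)packed), splat);

    uint8_t out[16];
    _mm_storeu_si128((__m128i*)out, v);
    for (int i = 0; i < 16; i++) {
        printf("%02x ", out[i]);  // prints 10 x4, 20 x4, 30 x4, 40 x4
    }
    printf("\n");
    return 0;
}
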
diff --git a/src/opts/Sk4px_none.h b/src/opts/Sk4px_none.h
index c8c33a0d16..340a36fd14 100644
--- a/src/opts/Sk4px_none.h
+++ b/src/opts/Sk4px_none.h
@@ -55,3 +55,25 @@ inline Sk4px Sk4px::Wide::addNarrowHi(const Sk16h& other) const {
r.kth< 8>(), r.kth< 9>(), r.kth<10>(), r.kth<11>(),
r.kth<12>(), r.kth<13>(), r.kth<14>(), r.kth<15>());
}
+
+inline Sk4px Sk4px::alphas() const {
+ static_assert(SK_A32_SHIFT == 24, "This method assumes little-endian.");
+ return Sk16b(this->kth< 3>(), this->kth< 3>(), this->kth< 3>(), this->kth< 3>(),
+ this->kth< 7>(), this->kth< 7>(), this->kth< 7>(), this->kth< 7>(),
+ this->kth<11>(), this->kth<11>(), this->kth<11>(), this->kth<11>(),
+ this->kth<15>(), this->kth<15>(), this->kth<15>(), this->kth<15>());
+}
+
+inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
+ return Sk16b(a[0], a[0], a[0], a[0],
+ a[1], a[1], a[1], a[1],
+ a[2], a[2], a[2], a[2],
+ a[3], a[3], a[3], a[3]);
+}
+
+inline Sk4px Sk4px::Load2Alphas(const SkAlpha a[2]) {
+ return Sk16b(a[0], a[0], a[0], a[0],
+ a[1], a[1], a[1], a[1],
+ 0,0,0,0,
+ 0,0,0,0);
+}
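
All three backends produce the same layout: each alpha is replicated into the four byte lanes of its pixel, so the result lines up channel-for-channel with four interleaved 8888 pixels. A plain scalar sketch of that layout, using a hypothetical helper name rather than Skia API:

// Illustrative scalar reference only (hypothetical helper, not Skia API):
// the byte layout every backend above produces for Load4Alphas.
#include <stdint.h>

static void load4AlphasScalar(const uint8_t a[4], uint8_t out[16]) {
    for (int px = 0; px < 4; px++) {
        for (int lane = 0; lane < 4; lane++) {
            out[px * 4 + lane] = a[px];   // pixel px gets a[px] in all four bytes
        }
    }
}
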