path: root/src/opts/Sk4px_SSE2.h
author    mtklein <mtklein@chromium.org>  2015-05-13 12:19:42 -0700
committer Commit bot <commit-bot@chromium.org>  2015-05-13 12:19:42 -0700
commit    8a90edc2a58a4f8a4b4da73eb08e943be09538c0 (patch)
tree      4257ccbfada3494e3fd4d82e4dec135fa62e5612 /src/opts/Sk4px_SSE2.h
parent    5ae1312c9faa25531c07e591b4dff6804020f121 (diff)
Sk4px: alphas() and Load[24]Alphas()
alphas() extracts the 4 alphas from an existing Sk4px as another Sk4px.
LoadNAlphas() constructs an Sk4px from N packed alphas.

In both cases, we end up with 4x repeated alphas aligned with their pixels.

alphas()
    A0 R0 G0 B0  A1 R1 G1 B1  A2 R2 G2 B2  A3 R3 G3 B3
    ->
    A0 A0 A0 A0  A1 A1 A1 A1  A2 A2 A2 A2  A3 A3 A3 A3

Load4Alphas()
    A0 A1 A2 A3
    ->
    A0 A0 A0 A0  A1 A1 A1 A1  A2 A2 A2 A2  A3 A3 A3 A3

Load2Alphas()
    A0 A1
    ->
    A0 A0 A0 A0  A1 A1 A1 A1  0 0 0 0  0 0 0 0

This is a 5-10% speedup for AA on Intel, and a wash on ARM.
AA is still mostly dominated by the final lerp.

alphas() isn't used yet, but it's similar enough to Load[24]Alphas()
that it was easier to write all at once.

BUG=skia:

Review URL: https://codereview.chromium.org/1138333003
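For readers unfamiliar with the byte splat described above, here is a minimal standalone sketch (plain SSSE3 intrinsics, no Skia headers) that reproduces the Load4Alphas() diagram. The function and variable names are illustrative only, not Skia API; compile with e.g. -mssse3.

// Standalone sketch of the Load4Alphas() splat (illustrative names, not Skia API).
// Requires SSSE3 for _mm_shuffle_epi8.
#include <tmmintrin.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static __m128i load4_alphas_sketch(const uint8_t a[4]) {
    uint32_t packed;
    memcpy(&packed, a, sizeof(packed));   // possibly-unaligned 4-byte load, as in the CL
    const __m128i splat = _mm_set_epi8(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
    return _mm_shuffle_epi8(_mm_cvtsi32_si128((int)packed), splat);
}

int main() {
    const uint8_t a[4] = {0x10, 0x20, 0x30, 0x40};   // A0 A1 A2 A3
    uint8_t out[16];
    _mm_storeu_si128((__m128i*)out, load4_alphas_sketch(a));
    for (int i = 0; i < 16; i++) {                   // prints A0 x4, A1 x4, A2 x4, A3 x4
        printf("%02x ", out[i]);
    }
    printf("\n");
    return 0;
}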
Diffstat (limited to 'src/opts/Sk4px_SSE2.h')
-rw-r--r--  src/opts/Sk4px_SSE2.h  39
1 file changed, 39 insertions, 0 deletions
diff --git a/src/opts/Sk4px_SSE2.h b/src/opts/Sk4px_SSE2.h
index d036328c14..55e1a69135 100644
--- a/src/opts/Sk4px_SSE2.h
+++ b/src/opts/Sk4px_SSE2.h
@@ -37,3 +37,42 @@ inline Sk4px Sk4px::Wide::addNarrowHi(const Sk16h& other) const {
Sk4px::Wide r = (*this + other) >> 8;
return Sk4px(_mm_packus_epi16(r.fLo.fVec, r.fHi.fVec));
}
+
+// Load4Alphas and Load2Alphas use possibly-unaligned loads (SkAlpha[] -> uint16_t or uint32_t).
+// These are safe on x86, often with no speed penalty.
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
+ inline Sk4px Sk4px::alphas() const {
+ static_assert(SK_A32_SHIFT == 24, "Intel's always little-endian.");
+ __m128i splat = _mm_set_epi8(15,15,15,15, 11,11,11,11, 7,7,7,7, 3,3,3,3);
+ return Sk16b(_mm_shuffle_epi8(this->fVec, splat));
+ }
+
+ inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
+ uint32_t as = *(const uint32_t*)a;
+ __m128i splat = _mm_set_epi8(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
+ return Sk16b(_mm_shuffle_epi8(_mm_cvtsi32_si128(as), splat));
+ }
+#else
+ inline Sk4px Sk4px::alphas() const {
+ static_assert(SK_A32_SHIFT == 24, "Intel's always little-endian.");
+ __m128i as = _mm_srli_epi32(this->fVec, 24); // ___3 ___2 ___1 ___0
+ as = _mm_or_si128(as, _mm_slli_si128(as, 1)); // __33 __22 __11 __00
+ as = _mm_or_si128(as, _mm_slli_si128(as, 2)); // 3333 2222 1111 0000
+ return Sk16b(as);
+ }
+
+ inline Sk4px Sk4px::Load4Alphas(const SkAlpha a[4]) {
+ __m128i as = _mm_cvtsi32_si128(*(const uint32_t*)a); // ____ ____ ____ 3210
+ as = _mm_unpacklo_epi8 (as, _mm_setzero_si128()); // ____ ____ _3_2 _1_0
+ as = _mm_unpacklo_epi16(as, _mm_setzero_si128()); // ___3 ___2 ___1 ___0
+ as = _mm_or_si128(as, _mm_slli_si128(as, 1)); // __33 __22 __11 __00
+ as = _mm_or_si128(as, _mm_slli_si128(as, 2)); // 3333 2222 1111 0000
+ return Sk16b(as);
+ }
+#endif
+
+inline Sk4px Sk4px::Load2Alphas(const SkAlpha a[2]) {
+ uint32_t as = *(const uint16_t*)a; // Aa -> Aa00
+ return Load4Alphas((const SkAlpha*)&as);
+}