-rw-r--r--    src/opts/SkRasterPipeline_opts.h    91
1 file changed, 66 insertions(+), 25 deletions(-)
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index a8cefcfc4f..11447bcdf9 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -109,7 +109,6 @@ SI void SK_VECTORCALL just_return(Stage*, size_t, SkNf, SkNf, SkNf, SkNf,
template <typename T>
SI SkNx<N,T> load(size_t tail, const T* src) {
-    // TODO: maskload for 32- and 64-bit T
    if (tail) {
        T buf[8] = {0};
        switch (tail & (N-1)) {
@@ -126,8 +125,28 @@ SI SkNx<N,T> load(size_t tail, const T* src) {
    return SkNx<N,T>::Load(src);
}
template <typename T>
+SI SkNx<N,T> gather(size_t tail, const T* src, const SkNi& offset) {
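+    // Portable gather: read one value per lane at the given offsets. The
+    // switch cases fall through on purpose, filling lanes 1..tail-1; lane 0
+    // is always read, and lanes past the tail stay zero.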
+    if (tail) {
+        T buf[8] = {0};
+        switch (tail & (N-1)) {
+            case 7: buf[6] = src[offset[6]];
+            case 6: buf[5] = src[offset[5]];
+            case 5: buf[4] = src[offset[4]];
+            case 4: buf[3] = src[offset[3]];
+            case 3: buf[2] = src[offset[2]];
+            case 2: buf[1] = src[offset[1]];
+        }
+        buf[0] = src[offset[0]];
+        return SkNx<N,T>::Load(buf);
+    }
+    T buf[8];
+    for (size_t i = 0; i < N; i++) {
+        buf[i] = src[offset[i]];
+    }
+    return SkNx<N,T>::Load(buf);
+}
+template <typename T>
SI void store(size_t tail, const SkNx<N,T>& v, T* dst) {
-    // TODO: maskstore for 32- and 64-bit T
    if (tail) {
        switch (tail & (N-1)) {
            case 7: dst[6] = v[6];
@@ -143,6 +162,47 @@ SI void store(size_t tail, const SkNx<N,T>& v, T* dst) {
    v.store(dst);
}
+#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
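+    // AVX2 has 8 32-bit lanes plus masked loads, stores, and gathers, so for
+    // 32-bit T the scalar tail loops above can be replaced with single
+    // instructions.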
+    SI __m256i mask(size_t tail) {
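+        // Each active lane is all-ones; the intrinsics only examine the high
+        // bit of each 32-bit lane to decide whether it participates.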
+        static const int masks[][8] = {
+            {~0,~0,~0,~0, ~0,~0,~0,~0 },  // remember, tail == 0 ~~> load all N
+            {~0, 0, 0, 0,  0, 0, 0, 0 },
+            {~0,~0, 0, 0,  0, 0, 0, 0 },
+            {~0,~0,~0, 0,  0, 0, 0, 0 },
+            {~0,~0,~0,~0,  0, 0, 0, 0 },
+            {~0,~0,~0,~0, ~0, 0, 0, 0 },
+            {~0,~0,~0,~0, ~0,~0, 0, 0 },
+            {~0,~0,~0,~0, ~0,~0,~0, 0 },
+        };
+        return SkNi::Load(masks + tail).fVec;
+    }
+
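+    // _mm256_maskload_epi32 zeroes any lane whose mask bit is clear, so a
+    // tail load comes back zero-padded just like the scalar buf[8] = {0} path.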
+    SI SkNi load(size_t tail, const int32_t* src) {
+        return tail ? _mm256_maskload_epi32((const int*)src, mask(tail))
+                    : SkNi::Load(src);
+    }
+    SI SkNu load(size_t tail, const uint32_t* src) {
+        return tail ? _mm256_maskload_epi32((const int*)src, mask(tail))
+                    : SkNu::Load(src);
+    }
+    SI SkNi gather(size_t tail, const int32_t* src, const SkNi& offset) {
+        return _mm256_mask_i32gather_epi32(SkNi(0).fVec,
+                                           (const int*)src, offset.fVec, mask(tail), 4);
+    }
+    SI SkNu gather(size_t tail, const uint32_t* src, const SkNi& offset) {
+        return _mm256_mask_i32gather_epi32(SkNi(0).fVec,
+                                           (const int*)src, offset.fVec, mask(tail), 4);
+    }
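+    // _mm256_maskstore_epi32 suppresses the writes for inactive lanes
+    // entirely, so memory past the tail is never touched.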
+    SI void store(size_t tail, const SkNi& v, int32_t* dst) {
+        tail ? _mm256_maskstore_epi32((int*)dst, mask(tail), v.fVec)
+             : v.store(dst);
+    }
+    SI void store(size_t tail, const SkNu& v, uint32_t* dst) {
+        tail ? _mm256_maskstore_epi32((int*)dst, mask(tail), v.fVec)
+             : v.store(dst);
+    }
+#endif
+
SI void from_8888(const SkNu& _8888, SkNf* r, SkNf* g, SkNf* b, SkNf* a) {
    auto to_float = [](const SkNu& v) { return SkNx_cast<float>(SkNi::Load(&v)); };
    *r = (1/255.0f)*to_float((_8888 >> 0) & 0xff);
@@ -707,52 +767,33 @@ SI SkNi offset_and_ptr(T** ptr, const void* ctx, const SkNf& x, const SkNf& y) {
    return offset;
}
-template <typename T>
-SI void gather(T (&dst)[N], const T* src, const SkNi& offset, size_t tail) {
-    size_t n = tail ? tail : N;
-    for (size_t i = 0; i < n; i++) { dst[i] = src[offset[i]]; }
-    for (size_t i = n; i < N; i++) { dst[i] = 0; }
-}
-
STAGE(gather_a8) {} // TODO
STAGE(gather_i8) {} // TODO
STAGE(gather_g8) {
    const uint8_t* p;
    SkNi offset = offset_and_ptr(&p, ctx, r, g);
-    uint8_t px[N];
-    gather(px, p, offset, tail);
-
-    r = g = b = SkNx_cast<float>(SkNb::Load(px)) * (1/255.0f);
+    r = g = b = SkNx_cast<float>(gather(tail, p, offset)) * (1/255.0f);
    a = 1.0f;
}
STAGE(gather_565) {
    const uint16_t* p;
    SkNi offset = offset_and_ptr(&p, ctx, r, g);
-    uint16_t px[N];
-    gather(px, p, offset, tail);
-
-    from_565(SkNh::Load(px), &r, &g, &b);
+    from_565(gather(tail, p, offset), &r, &g, &b);
    a = 1.0f;
}
STAGE(gather_4444) {
    const uint16_t* p;
    SkNi offset = offset_and_ptr(&p, ctx, r, g);
-    uint16_t px[N];
-    gather(px, p, offset, tail);
-
-    from_4444(SkNh::Load(px), &r, &g, &b, &a);
+    from_4444(gather(tail, p, offset), &r, &g, &b, &a);
}
STAGE(gather_8888) {
    const uint32_t* p;
    SkNi offset = offset_and_ptr(&p, ctx, r, g);
-    uint32_t px[N];
-    gather(px, p, offset, tail);
-
-    from_8888(SkNu::Load(px), &r, &g, &b, &a);
+    from_8888(gather(tail, p, offset), &r, &g, &b, &a);
}
STAGE(gather_f16) {
    const uint64_t* p;