author     Mike Klein <mtklein@chromium.org>     2017-03-02 14:08:36 -0500
committer  Mike Klein <mtklein@chromium.org>     2017-03-02 20:32:20 +0000
commit  767c7e7a0b8a5462df100c2662f0bf99cbad6f03 (patch)
tree    61648eda5c4db856c897ab94095ab252bc9d9f0c /src/jumper/SkJumper_stages.cpp
parent  8e48c1e1d38bf0f0086971be2b077d1a2cb12131 (diff)
SkJumper: use AVX2 mask loads and stores for U32
SkRasterPipeline_f16:  63 -> 58   (8888+f16 loads, f16 store)
SkRasterPipeline_srgb: 96 -> 84   (2x 8888 loads, 8888 store)

PS3 has a simpler way to build the mask, in a uint64_t.
Timing is still roughly the same.

Change-Id: Ie278611dff02281e5a0f3a57185050bbe852bff0
Reviewed-on: https://skia-review.googlesource.com/9165
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
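The mask-building idea the message describes ("PS3" is patch set 3 of the review) is small enough to show on its own: pack one 0xff byte per active lane into a uint64_t, then sign-extend each byte to a full 32-bit lane so the AVX2 masked load/store touches only the first `tail` elements. Below is a minimal standalone sketch assuming an 8-lane configuration (kStride == 8); the helper names `tail_mask` and `masked_store_u32` are illustrative and not taken from the Skia source.

    // Minimal sketch of the uint64_t mask trick, assuming 8 x 32-bit lanes
    // (kStride == 8). Compile with -mavx2. Helper names are illustrative only.
    #include <immintrin.h>
    #include <stdint.h>

    static const size_t kStride = 8;

    // Build a per-lane mask for the first `tail` lanes (1 <= tail < kStride).
    // Each active lane becomes all-ones, each inactive lane all-zeros.
    static inline __m256i tail_mask(size_t tail) {
        // One 0xff byte per active lane, packed into a uint64_t.
        uint64_t m = ~0ull >> 8*(kStride - tail);
        // Sign-extend each byte to a full 32-bit lane: 0xff -> 0xffffffff.
        return _mm256_cvtepi8_epi32(_mm_cvtsi64_si128((int64_t)m));
    }

    // Store only the first `tail` lanes of v; full unaligned store when tail == 0.
    static inline void masked_store_u32(uint32_t* dst, __m256i v, size_t tail) {
        if (tail) {
            _mm256_maskstore_epi32((int*)dst, tail_mask(tail), v);
        } else {
            _mm256_storeu_si256((__m256i*)dst, v);
        }
    }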
Diffstat (limited to 'src/jumper/SkJumper_stages.cpp')
-rw-r--r--  src/jumper/SkJumper_stages.cpp  61
1 file changed, 45 insertions, 16 deletions
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index ab942230d8..2f9c60a24f 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -258,6 +258,26 @@ static inline V load(const T* src, size_t tail) {
return unaligned_load<V>(src);
}
+template <typename V, typename T>
+static inline void store(T* dst, V v, size_t tail) {
+#if defined(JUMPER)
+ __builtin_assume(tail < kStride);
+ if (__builtin_expect(tail, 0)) {
+ switch (tail-1) {
+ case 6: dst[6] = v[6];
+ case 5: dst[5] = v[5];
+ case 4: dst[4] = v[4];
+ case 3: dst[3] = v[3];
+ case 2: dst[2] = v[2];
+ case 1: dst[1] = v[1];
+ case 0: dst[0] = v[0];
+ }
+ return;
+ }
+#endif
+ memcpy(dst, &v, sizeof(v));
+}
+
#if 1 && defined(JUMPER) && defined(__AVX__)
template <>
inline U8 load(const uint8_t* src, size_t tail) {
@@ -275,25 +295,34 @@ static inline V load(const T* src, size_t tail) {
}
#endif
-template <typename V, typename T>
-static inline void store(T* dst, V v, size_t tail) {
-#if defined(JUMPER)
- __builtin_assume(tail < kStride);
- if (__builtin_expect(tail, 0)) {
- switch (tail-1) {
- case 6: dst[6] = v[6];
- case 5: dst[5] = v[5];
- case 4: dst[4] = v[4];
- case 3: dst[3] = v[3];
- case 2: dst[2] = v[2];
- case 1: dst[1] = v[1];
- case 0: dst[0] = v[0];
+#if 1 && defined(JUMPER) && defined(__AVX2__)
+ static inline U32 mask(size_t tail) {
+ // It's easiest to build the mask as 8 8-bit values, either 0x00 or 0xff.
+ // Start fully on, then shift away lanes from the top until we've got our mask.
+ uint64_t mask = 0xffffffffffffffff >> 8*(kStride-tail);
+
+ // Sign-extend each mask lane to its full width, 0x00000000 or 0xffffffff.
+ return _mm256_cvtepi8_epi32(_mm_cvtsi64_si128((int64_t)mask));
+ }
+
+ template <>
+ inline U32 load(const uint32_t* src, size_t tail) {
+ __builtin_assume(tail < kStride);
+ if (__builtin_expect(tail, 0)) {
+ return _mm256_maskload_epi32((const int*)src, mask(tail));
}
- return;
+ return unaligned_load<U32>(src);
+ }
+
+ template <>
+ inline void store(uint32_t* dst, U32 v, size_t tail) {
+ __builtin_assume(tail < kStride);
+ if (__builtin_expect(tail, 0)) {
+ return _mm256_maskstore_epi32((int*)dst, mask(tail), v);
+ }
+ memcpy(dst, &v, sizeof(v));
}
#endif
- memcpy(dst, &v, sizeof(v));
-}
static F lerp(F from, F to, F t) {
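For readers wanting to try the new specializations in isolation, here is a hypothetical, self-contained demo of the masked load path on a 3-pixel tail; it repeats the mask construction from the diff, and none of the names come from the Skia source. Masked-out lanes come back as zero and nothing past src[2] is read, which is the point of using _mm256_maskload_epi32 instead of unaligned_load for the tail.

    // Hypothetical demo: AVX2 masked load of a 3-pixel tail. Compile with -mavx2.
    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main() {
        const size_t kStride = 8, tail = 3;
        uint32_t src[3] = {0x11111111, 0x22222222, 0x33333333};

        // Same trick as in the diff: one 0xff byte per active lane,
        // sign-extended to a full 32-bit lane.
        uint64_t m = ~0ull >> 8*(kStride - tail);
        __m256i mask = _mm256_cvtepi8_epi32(_mm_cvtsi64_si128((int64_t)m));

        // Only the first three lanes are read from memory; the rest are zeroed.
        __m256i v = _mm256_maskload_epi32((const int*)src, mask);

        uint32_t out[8];
        _mm256_storeu_si256((__m256i*)out, v);
        for (int i = 0; i < 8; i++) {
            printf("lane %d: 0x%08x\n", i, (unsigned)out[i]);   // lanes 3..7 are 0
        }
        return 0;
    }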