diff options
author | Mike Klein <mtklein@chromium.org> | 2017-03-02 14:08:36 -0500 |
---|---|---|
committer | Mike Klein <mtklein@chromium.org> | 2017-03-02 20:32:20 +0000 |
commit | 767c7e7a0b8a5462df100c2662f0bf99cbad6f03 (patch) | |
tree | 61648eda5c4db856c897ab94095ab252bc9d9f0c /src/jumper/SkJumper_stages.cpp | |
parent | 8e48c1e1d38bf0f0086971be2b077d1a2cb12131 (diff) |
SkJumper: use AVX2 mask loads and stores for U32
SkRasterPipeline_f16: 63 -> 58 (8888+f16 loads, f16 store)
SkRasterPipeline_srgb: 96 -> 84 (2x 8888 loads, 8888 store)
Patch set 3 has a simpler way to build the mask, in a uint64_t.
Timing is still roughly the same.
Change-Id: Ie278611dff02281e5a0f3a57185050bbe852bff0
Reviewed-on: https://skia-review.googlesource.com/9165
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
Diffstat (limited to 'src/jumper/SkJumper_stages.cpp')
-rw-r--r-- | src/jumper/SkJumper_stages.cpp | 61 |
1 files changed, 45 insertions, 16 deletions
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp index ab942230d8..2f9c60a24f 100644 --- a/src/jumper/SkJumper_stages.cpp +++ b/src/jumper/SkJumper_stages.cpp @@ -258,6 +258,26 @@ static inline V load(const T* src, size_t tail) { return unaligned_load<V>(src); } +template <typename V, typename T> +static inline void store(T* dst, V v, size_t tail) { +#if defined(JUMPER) + __builtin_assume(tail < kStride); + if (__builtin_expect(tail, 0)) { + switch (tail-1) { + case 6: dst[6] = v[6]; + case 5: dst[5] = v[5]; + case 4: dst[4] = v[4]; + case 3: dst[3] = v[3]; + case 2: dst[2] = v[2]; + case 1: dst[1] = v[1]; + case 0: dst[0] = v[0]; + } + return; + } +#endif + memcpy(dst, &v, sizeof(v)); +} + #if 1 && defined(JUMPER) && defined(__AVX__) template <> inline U8 load(const uint8_t* src, size_t tail) { @@ -275,25 +295,34 @@ static inline V load(const T* src, size_t tail) { } #endif -template <typename V, typename T> -static inline void store(T* dst, V v, size_t tail) { -#if defined(JUMPER) - __builtin_assume(tail < kStride); - if (__builtin_expect(tail, 0)) { - switch (tail-1) { - case 6: dst[6] = v[6]; - case 5: dst[5] = v[5]; - case 4: dst[4] = v[4]; - case 3: dst[3] = v[3]; - case 2: dst[2] = v[2]; - case 1: dst[1] = v[1]; - case 0: dst[0] = v[0]; +#if 1 && defined(JUMPER) && defined(__AVX2__) + static inline U32 mask(size_t tail) { + // It's easiest to build the mask as 8 8-bit values, either 0x00 or 0xff. + // Start fully on, then shift away lanes from the top until we've got our mask. + uint64_t mask = 0xffffffffffffffff >> 8*(kStride-tail); + + // Sign-extend each mask lane to its full width, 0x00000000 or 0xffffffff. 
+ return _mm256_cvtepi8_epi32(_mm_cvtsi64_si128((int64_t)mask)); + } + + template <> + inline U32 load(const uint32_t* src, size_t tail) { + __builtin_assume(tail < kStride); + if (__builtin_expect(tail, 0)) { + return _mm256_maskload_epi32((const int*)src, mask(tail)); } - return; + return unaligned_load<U32>(src); + } + + template <> + inline void store(uint32_t* dst, U32 v, size_t tail) { + __builtin_assume(tail < kStride); + if (__builtin_expect(tail, 0)) { + return _mm256_maskstore_epi32((int*)dst, mask(tail), v); + } + memcpy(dst, &v, sizeof(v)); } #endif - memcpy(dst, &v, sizeof(v)); -} static F lerp(F from, F to, F t) { |