author    Mike Klein <mtklein@chromium.org>  2017-05-22 10:28:27 -0400
committer Mike Klein <mtklein@chromium.org>  2017-05-22 17:08:19 +0000
commit    139e463dc6f965fdaed854efcb20c6cafbb6dbdc (patch)
tree      4a30cac42be605bdbba0d5da49685c8809cb7f74 /src/jumper/SkJumper_stages.cpp
parent    0a76b413eac46ec218b367c5456709059557f5db (diff)
We can mask load and store with just AVX.
Previously we were using AVX2 instructions to generate the masks, and AVX2 instructions for the mask loads and stores themselves. AVX came with float mask loads and stores, which work perfectly fine here. I don't really get what the point of the 32-bit int loads and stores in AVX2 is, beyond maybe syntax sugar.

Change-Id: I81fa55fb09daea4f5546f8c9ebbc886015edce51
Reviewed-on: https://skia-review.googlesource.com/17452
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Ravi Mistry <rmistry@google.com>
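For readers less familiar with these intrinsics, here is a minimal standalone comparison, not part of the patch, showing why the substitution is safe: AVX's _mm256_maskload_ps reads the same 32-bit lanes and zeroes the same masked-off lanes as AVX2's _mm256_maskload_epi32, so a bit-cast is all it takes to use the float form on integer pixel data. The buffer contents and the tail value are made up for illustration; build with -mavx2 so both forms are available to compare.

    // Standalone sketch, not part of the patch: compare AVX's float mask load
    // against AVX2's integer mask load on the same data.
    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main() {
        uint32_t src[8] = {1,2,3,4,5,6,7,8};

        // Mask for tail == 6 of an 8-lane stride: low six lanes on, top two off.
        __m256i mask = _mm256_setr_epi32(-1,-1,-1,-1,-1,-1,0,0);

        // AVX-only path: load as floats, then reinterpret the bits as integers.
        __m256i a = _mm256_castps_si256(_mm256_maskload_ps((const float*)src, mask));

        // AVX2 path that the patch removes.
        __m256i b = _mm256_maskload_epi32((const int*)src, mask);

        // Both loads zero the masked-off lanes and read identical bits elsewhere.
        int identical = _mm256_movemask_epi8(_mm256_cmpeq_epi32(a, b)) == -1;
        printf("identical: %d\n", identical);   // prints "identical: 1"
        return 0;
    }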
Diffstat (limited to 'src/jumper/SkJumper_stages.cpp')
-rw-r--r--  src/jumper/SkJumper_stages.cpp  12
1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index fa11869a6e..d1747018bf 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -217,8 +217,8 @@ SI void store(T* dst, V v, size_t tail) {
     }
 #endif
 
-// AVX2 adds some mask loads and stores that make for shorter, faster code.
-#if defined(JUMPER) && defined(__AVX2__)
+// AVX adds some mask loads and stores that make for shorter, faster code.
+#if defined(JUMPER) && defined(__AVX__)
     SI U32 mask(size_t tail) {
         // We go a little out of our way to avoid needing large constant values here.
 
@@ -227,14 +227,16 @@ SI void store(T* dst, V v, size_t tail) {
         uint64_t mask = 0xffffffffffffffff >> 8*(kStride-tail);
 
         // Sign-extend each mask lane to its full width, 0x00000000 or 0xffffffff.
-        return _mm256_cvtepi8_epi32(_mm_cvtsi64_si128((int64_t)mask));
+        using S8  = int8_t  __attribute__((ext_vector_type(8)));
+        using S32 = int32_t __attribute__((ext_vector_type(8)));
+        return (U32)__builtin_convertvector(unaligned_load<S8>(&mask), S32);
     }
 
     template <>
     inline U32 load(const uint32_t* src, size_t tail) {
         __builtin_assume(tail < kStride);
         if (__builtin_expect(tail, 0)) {
-            return _mm256_maskload_epi32((const int*)src, mask(tail));
+            return (U32)_mm256_maskload_ps((const float*)src, mask(tail));
         }
         return unaligned_load<U32>(src);
     }
@@ -243,7 +245,7 @@ SI void store(T* dst, V v, size_t tail) {
     inline void store(uint32_t* dst, U32 v, size_t tail) {
         __builtin_assume(tail < kStride);
         if (__builtin_expect(tail, 0)) {
-            return _mm256_maskstore_epi32((int*)dst, mask(tail), v);
+            return _mm256_maskstore_ps((float*)dst, mask(tail), (F)v);
         }
         unaligned_store(dst, v);
     }
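For context on the first hunk, here is a clang-only sketch of the same mask-building trick with made-up local names: lane_mask() stands in for mask(), a plain memcpy replaces unaligned_load<S8>, and kStride is hard-coded to 8. Like the patch, it assumes a non-zero tail, since mask() is only reached inside the __builtin_expect(tail, 0) branch.

    // Build one 0xff byte per active lane by shifting a 64-bit constant,
    // then sign-extend each byte to a full 32-bit lane. Requires clang for
    // the ext_vector_type extension; valid for tail in 1..7.
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    using S8  = int8_t   __attribute__((ext_vector_type(8)));
    using S32 = int32_t  __attribute__((ext_vector_type(8)));
    using U32 = uint32_t __attribute__((ext_vector_type(8)));

    static U32 lane_mask(size_t tail) {
        const size_t kStride = 8;

        // Start with all 8 bytes 0xff, then shift away one byte from the top
        // for every inactive lane past 'tail'.
        uint64_t mask = 0xffffffffffffffffULL >> 8*(kStride - tail);

        // Sign-extend each 0x00/0xff byte to a 0x00000000/0xffffffff lane.
        S8 bytes;
        memcpy(&bytes, &mask, sizeof(bytes));
        return (U32)__builtin_convertvector(bytes, S32);
    }

    int main() {
        U32 m = lane_mask(3);
        for (int i = 0; i < 8; i++) {
            printf("lane %d: 0x%08x\n", i, (unsigned)m[i]);  // lanes 0-2 all ones, 3-7 zero
        }
        return 0;
    }

Only the sign bit of each 32-bit lane matters to _mm256_maskload_ps and _mm256_maskstore_ps, so an all-ones/all-zeros lane mask like this drives the AVX float forms exactly as it drove the AVX2 integer forms.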