author    Mike Klein <mtklein@chromium.org>  2017-05-22 10:28:27 -0400
committer Mike Klein <mtklein@chromium.org>  2017-05-22 17:08:19 +0000
commit    139e463dc6f965fdaed854efcb20c6cafbb6dbdc (patch)
tree      4a30cac42be605bdbba0d5da49685c8809cb7f74 /src/jumper/SkJumper_stages.cpp
parent    0a76b413eac46ec218b367c5456709059557f5db (diff)
We can mask load and store with just AVX.
Previously we were using AVX2 instructions to generate the masks, and AVX2 instructions for the mask loads and stores themselves. AVX came with float mask loads and stores, which work perfectly fine here. I don't really get what the point of the 32-bit int loads and stores in AVX2 is, beyond maybe syntax sugar.

Change-Id: I81fa55fb09daea4f5546f8c9ebbc886015edce51
Reviewed-on: https://skia-review.googlesource.com/17452
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Ravi Mistry <rmistry@google.com>
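For readers less familiar with these intrinsics, here is a minimal standalone comparison, not part of the patch, showing why the substitution is safe: AVX's _mm256_maskload_ps reads the same 32-bit lanes and zeroes the same masked-off lanes as AVX2's _mm256_maskload_epi32, so a bit-cast is all it takes to use the float form on integer pixel data. The buffer contents and the tail value are made up for illustration; build with -mavx2 so both forms are available to compare.

    // Standalone sketch, not part of the patch: compare AVX's float mask load
    // against AVX2's integer mask load on the same data.
    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main() {
        uint32_t src[8] = {1,2,3,4,5,6,7,8};

        // Mask for tail == 6 of an 8-lane stride: low six lanes on, top two off.
        __m256i mask = _mm256_setr_epi32(-1,-1,-1,-1,-1,-1,0,0);

        // AVX-only path: load as floats, then reinterpret the bits as integers.
        __m256i a = _mm256_castps_si256(_mm256_maskload_ps((const float*)src, mask));

        // AVX2 path that the patch removes.
        __m256i b = _mm256_maskload_epi32((const int*)src, mask);

        // Both loads zero the masked-off lanes and read identical bits elsewhere.
        int identical = _mm256_movemask_epi8(_mm256_cmpeq_epi32(a, b)) == -1;
        printf("identical: %d\n", identical);   // prints "identical: 1"
        return 0;
    }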
Diffstat (limited to 'src/jumper/SkJumper_stages.cpp')
-rw-r--r--  src/jumper/SkJumper_stages.cpp  12
1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index fa11869a6e..d1747018bf 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -217,8 +217,8 @@ SI void store(T* dst, V v, size_t tail) {
     }
 #endif
 
-// AVX2 adds some mask loads and stores that make for shorter, faster code.
-#if defined(JUMPER) && defined(__AVX2__)
+// AVX adds some mask loads and stores that make for shorter, faster code.
+#if defined(JUMPER) && defined(__AVX__)
     SI U32 mask(size_t tail) {
         // We go a little out of our way to avoid needing large constant values here.
 
@@ -227,14 +227,16 @@ SI void store(T* dst, V v, size_t tail) {
         uint64_t mask = 0xffffffffffffffff >> 8*(kStride-tail);
 
         // Sign-extend each mask lane to its full width, 0x00000000 or 0xffffffff.
-        return _mm256_cvtepi8_epi32(_mm_cvtsi64_si128((int64_t)mask));
+        using S8  = int8_t  __attribute__((ext_vector_type(8)));
+        using S32 = int32_t __attribute__((ext_vector_type(8)));
+        return (U32)__builtin_convertvector(unaligned_load<S8>(&mask), S32);
     }
 
     template <>
     inline U32 load(const uint32_t* src, size_t tail) {
         __builtin_assume(tail < kStride);
         if (__builtin_expect(tail, 0)) {
-            return _mm256_maskload_epi32((const int*)src, mask(tail));
+            return (U32)_mm256_maskload_ps((const float*)src, mask(tail));
         }
         return unaligned_load<U32>(src);
     }
@@ -243,7 +245,7 @@ SI void store(T* dst, V v, size_t tail) {
     inline void store(uint32_t* dst, U32 v, size_t tail) {
         __builtin_assume(tail < kStride);
         if (__builtin_expect(tail, 0)) {
-            return _mm256_maskstore_epi32((int*)dst, mask(tail), v);
+            return _mm256_maskstore_ps((float*)dst, mask(tail), (F)v);
         }
         unaligned_store(dst, v);
     }
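For context on the first hunk, here is a clang-only sketch of the same mask-building trick with made-up local names: lane_mask() stands in for mask(), a plain memcpy replaces unaligned_load<S8>, and kStride is hard-coded to 8. Like the patch, it assumes a non-zero tail, since mask() is only reached inside the __builtin_expect(tail, 0) branch.

    // Build one 0xff byte per active lane by shifting a 64-bit constant,
    // then sign-extend each byte to a full 32-bit lane. Requires clang for
    // the ext_vector_type extension; valid for tail in 1..7.
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    using S8  = int8_t   __attribute__((ext_vector_type(8)));
    using S32 = int32_t  __attribute__((ext_vector_type(8)));
    using U32 = uint32_t __attribute__((ext_vector_type(8)));

    static U32 lane_mask(size_t tail) {
        const size_t kStride = 8;

        // Start with all 8 bytes 0xff, then shift away one byte from the top
        // for every inactive lane past 'tail'.
        uint64_t mask = 0xffffffffffffffffULL >> 8*(kStride - tail);

        // Sign-extend each 0x00/0xff byte to a 0x00000000/0xffffffff lane.
        S8 bytes;
        memcpy(&bytes, &mask, sizeof(bytes));
        return (U32)__builtin_convertvector(bytes, S32);
    }

    int main() {
        U32 m = lane_mask(3);
        for (int i = 0; i < 8; i++) {
            printf("lane %d: 0x%08x\n", i, (unsigned)m[i]);  // lanes 0-2 all ones, 3-7 zero
        }
        return 0;
    }

Only the sign bit of each 32-bit lane matters to _mm256_maskload_ps and _mm256_maskstore_ps, so an all-ones/all-zeros lane mask like this drives the AVX float forms exactly as it drove the AVX2 integer forms.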