about | summary | refs | log | tree | commit | diff | homepage
path: root/src/jumper/SkJumper_stages_lowp.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/jumper/SkJumper_stages_lowp.cpp')
-rw-r--r-- src/jumper/SkJumper_stages_lowp.cpp 40
1 files changed, 30 insertions, 10 deletions
diff --git a/src/jumper/SkJumper_stages_lowp.cpp b/src/jumper/SkJumper_stages_lowp.cpp
index a58328d074..21b257f9c1 100644
--- a/src/jumper/SkJumper_stages_lowp.cpp
+++ b/src/jumper/SkJumper_stages_lowp.cpp
@@ -470,19 +470,39 @@ SI void store(T* ptr, size_t tail, V v) {
}
}
-template <typename V, typename T>
-SI V gather(const T* ptr, U32 ix) {
#if defined(__AVX2__)
- return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
- ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
- ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
- ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
+ template <typename V, typename T>  // Generic gather for the __AVX2__ build: V is the result vector type, T the table element type.
+ SI V gather(const T* ptr, U32 ix) {  // ptr: base of the table; ix: per-lane element indices (used as ptr[ix[i]]).
+ return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],  // Lane i of the result is ptr[ix[i]];
+ ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],  // this branch reads 16 lanes (0..15),
+ ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],  // one indexed load per lane.
+ ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
+ }
+
+ template<>  // Explicit specialization of gather for float tables (only compiled when __AVX2__ is defined).
+ F gather(const float* p, U32 ix) {  // p: base of the float table; ix: per-lane element indices.
+ __m256i lo, hi;  // Two 256-bit integer vectors that receive the indices.
+ split(ix, &lo, &hi);  // split() is defined outside this hunk; presumably puts the low/high 8 indices of ix into lo/hi -- TODO confirm.
+
+ return join<F>(_mm256_i32gather_ps(p, lo, 4),  // Hardware gather of 8 floats; scale 4 = bytes per 32-bit element, so index i addresses p[i].
+ _mm256_i32gather_ps(p, hi, 4));  // Same for the other 8 indices; join<F>() (defined elsewhere) presumably concatenates both halves into one F.
+ }
+
+ template<>  // Explicit specialization of gather for uint32_t tables (only compiled when __AVX2__ is defined).
+ U32 gather(const uint32_t* p, U32 ix) {  // p: base of the 32-bit table; ix: per-lane element indices.
+ __m256i lo, hi;  // Two 256-bit integer vectors that receive the indices.
+ split(ix, &lo, &hi);  // split() is defined outside this hunk; presumably puts the low/high 8 indices of ix into lo/hi -- TODO confirm.
+
+ return join<U32>(_mm256_i32gather_epi32(p, lo, 4),  // Hardware gather of 8 32-bit integers; scale 4 = bytes per element, so index i addresses p[i].
+ _mm256_i32gather_epi32(p, hi, 4));  // Same for the other 8 indices; join<U32>() (defined elsewhere) presumably concatenates both halves into one U32.
+ }
#else
- return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
- ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
+ template <typename V, typename T>  // Generic gather for builds where __AVX2__ is not defined.
+ SI V gather(const T* ptr, U32 ix) {  // ptr: base of the table; ix: per-lane element indices (used as ptr[ix[i]]).
+ return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],  // Lane i of the result is ptr[ix[i]];
+ ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };  // this branch reads 8 lanes (0..7) rather than 16.
+ }
#endif
-}
-// TODO: AVX2 gather instructions where possible
// ~~~~~~ 32-bit memory loads and stores ~~~~~~ //