about | summary | refs | log | tree | commit | diff | homepage
path: root/src/jumper
diff options
context:
space:
mode:
author: Mike Klein <mtklein@chromium.org> 2017-11-10 13:01:38 -0500
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> 2017-11-10 18:46:17 +0000
commit: a3dd5ec3a769fb833ce77878cd4e551c15e5074d (patch)
tree: 11c27d0b7c789833f13450954973d28772b0a4af /src/jumper
parent: 037d1fc45354508e91d5e8238089e72364412427 (diff)
more powerful map()
Change-Id: Icbae002999a295e3a9d1d2e6046e686784d5f608
Reviewed-on: https://skia-review.googlesource.com/69901
Reviewed-by: Florin Malita <fmalita@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/jumper')
-rw-r--r-- src/jumper/SkJumper_stages_lowp.cpp 54
1 files changed, 30 insertions, 24 deletions
diff --git a/src/jumper/SkJumper_stages_lowp.cpp b/src/jumper/SkJumper_stages_lowp.cpp
index 21b257f9c1..8c0274f911 100644
--- a/src/jumper/SkJumper_stages_lowp.cpp
+++ b/src/jumper/SkJumper_stages_lowp.cpp
@@ -183,14 +183,28 @@ SI D join(S lo, S hi) {
memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
return v;
}
-template <typename V, typename H>
-SI V map(V v, H (*fn)(H)) {
- H lo,hi;
- split(v, &lo,&hi);
- lo = fn(lo);
- hi = fn(hi);
- return join<V>(lo,hi);
-}
+
+#if defined(__AVX2__)  // This variant of map() works on halves of 8 lanes (0-7 and 8-15).
+ template <typename V, typename Fn>
+ SI auto map(V v, Fn&& fn) -> decltype(  // Return type is deduced from joining two fn() results, so it need not equal V.
+ __builtin_shufflevector(fn(__builtin_shufflevector(v,v, 0,1, 2, 3, 4, 5, 6, 7)),
+ fn(__builtin_shufflevector(v,v, 8,9,10,11,12,13,14,15)),
+ 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15)) {
+ return __builtin_shufflevector(fn(__builtin_shufflevector(v,v, 0,1, 2, 3, 4, 5, 6, 7)),  // fn applied to lanes 0-7 of v
+ fn(__builtin_shufflevector(v,v, 8,9,10,11,12,13,14,15)),  // fn applied to lanes 8-15 of v
+ 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);  // concatenate both results, low half first
+ }
+#else  // Same split/apply/join pattern, with halves of 4 lanes (0-3 and 4-7).
+ template <typename V, typename Fn>
+ SI auto map(V v, Fn&& fn) -> decltype(  // Return type is deduced from joining two fn() results, so it need not equal V.
+ __builtin_shufflevector(fn(__builtin_shufflevector(v,v, 0,1,2,3)),
+ fn(__builtin_shufflevector(v,v, 4,5,6,7)),
+ 0,1,2,3,4,5,6,7)) {
+ return __builtin_shufflevector(fn(__builtin_shufflevector(v,v, 0,1,2,3)),  // fn applied to lanes 0-3 of v
+ fn(__builtin_shufflevector(v,v, 4,5,6,7)),  // fn applied to lanes 4-7 of v
+ 0,1,2,3,4,5,6,7);  // concatenate both results, low half first
+ }
+#endif
// TODO: do we need platform-specific intrinsics for any of these?
SI F if_then_else(I32 c, F t, F e) {
@@ -208,7 +222,7 @@ SI F rcp(F x) {
#elif defined(__SSE__)
return map(x, _mm_rcp_ps);
#elif defined(__ARM_NEON)
- return map(x, +[](float32x4_t v) {
+ return map(x, [](float32x4_t v) {
auto est = vrecpeq_f32(v);
return vrecpsq_f32(v,est)*est;
});
@@ -224,7 +238,7 @@ SI F sqrt_(F x) {
#elif defined(__aarch64__)
return map(x, vsqrtq_f32);
#elif defined(__ARM_NEON)
- return map(x, +[](float32x4_t v) {
+ return map(x, [](float32x4_t v) {
auto est = vrsqrteq_f32(v); // Estimate and two refinement steps for est = rsqrt(v).
est *= vrsqrtsq_f32(v,est*est);
est *= vrsqrtsq_f32(v,est*est);
@@ -242,9 +256,9 @@ SI F floor_(F x) {
#if defined(__aarch64__)
return map(x, vrndmq_f32);
#elif defined(__AVX2__)
- return map(x, +[](__m256 v){ return _mm256_floor_ps(v); }); // _mm256_floor_ps is a macro...
+ return map(x, [](__m256 v){ return _mm256_floor_ps(v); }); // _mm256_floor_ps is a macro...
#elif defined(__SSE4_1__)
- return map(x, +[](__m128 v){ return _mm_floor_ps(v); }); // _mm_floor_ps() is a macro too.
+ return map(x, [](__m128 v){ return _mm_floor_ps(v); }); // _mm_floor_ps() is a macro too.
#else
F roundtrip = cast<F>(cast<I32>(x));
return roundtrip - if_then_else(roundtrip > x, F(1), F(0));
@@ -480,21 +494,13 @@ SI void store(T* ptr, size_t tail, V v) {
}
template<>
- F gather(const float* p, U32 ix) {
- __m256i lo, hi;
- split(ix, &lo, &hi);
-
- return join<F>(_mm256_i32gather_ps(p, lo, 4),
- _mm256_i32gather_ps(p, hi, 4));
+ F gather(const float* ptr, U32 ix) {  // Float specialization, now expressed through map() instead of manual split/join.
+ return map(ix, [&](__m256i ix) { return _mm256_i32gather_ps(ptr, ix, 4); });  // Lambda captures ptr by reference; its parameter ix shadows the outer ix and holds one half of it.
}
template<>
- U32 gather(const uint32_t* p, U32 ix) {
- __m256i lo, hi;
- split(ix, &lo, &hi);
-
- return join<U32>(_mm256_i32gather_epi32(p, lo, 4),
- _mm256_i32gather_epi32(p, hi, 4));
+ U32 gather(const uint32_t* ptr, U32 ix) {  // uint32_t specialization, now expressed through map() instead of manual split/join.
+ return map(ix, [&](__m256i ix) { return _mm256_i32gather_epi32(ptr, ix, 4); });  // Lambda captures ptr by reference; its parameter ix shadows the outer ix and holds one half of it.
}
#else
template <typename V, typename T>