about | summary | refs | log | tree | commit | diff | homepage
path: root/src/opts
diff options
context:
space:
mode:
author: Herb Derby <herb@google.com> 2017-10-10 17:14:18 -0400
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> 2017-10-11 08:21:09 +0000
commit: 5eb1528234733e548a6417161e0b70ce333dbd1d (patch)
tree: 497d3fba05965fc55d039154b5393d6adb457704 /src/opts
parent: 67cf6896f7f018815521d2ecb5d7c7131a30753e (diff)
Add mulHi to SkNx
Add mulHi to base SkNx, and specialize implementations for Sk4u for neon and sse.
Add casts for converting from uint8_t by 4 to uint32_t by 4.

Cq-Include-Trybots: skia.primary:Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD
Change-Id: I29a32e2ad9812a47fff841ceca334e562362836f
Reviewed-on: https://skia-review.googlesource.com/57960
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Herb Derby <herb@google.com>
Diffstat (limited to 'src/opts')
-rw-r--r--  src/opts/SkNx_neon.h  15
-rw-r--r--  src/opts/SkNx_sse.h   14
2 files changed, 25 insertions, 4 deletions
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index b906a02c82..32be78f66b 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -497,6 +497,13 @@ public:
AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u32(a.fVec, b.fVec); }
// TODO as needed
+ AI SkNx mulHi(const SkNx& m) const {
+ uint64x2_t hi = vmull_u32(vget_high_u32(fVec), vget_high_u32(m.fVec));
+ uint64x2_t lo = vmull_u32( vget_low_u32(fVec), vget_low_u32(m.fVec));
+
+ return { vcombine_u32(vshrn_n_u64(lo,32), vshrn_n_u64(hi,32)) };
+ }
+
AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
return vbslq_u32(fVec, t.fVec, e.fVec);
}
@@ -529,9 +536,13 @@ template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
return vqmovn_u16(vcombine_u16(_16, _16));
}
-template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint8_t>(const Sk4b& src) {
+template<> AI /*static*/ Sk4u SkNx_cast<uint32_t, uint8_t>(const Sk4b& src) {
uint16x8_t _16 = vmovl_u8(src.fVec);
- return vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(_16)));
+ return vmovl_u16(vget_low_u16(_16));
+}
+
+template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint8_t>(const Sk4b& src) {
+ return vreinterpretq_s32_u32(SkNx_cast<uint32_t>(src).fVec);
}
template<> AI /*static*/ Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index 469aefb145..d4d4781e5a 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -287,10 +287,16 @@ public:
#endif
}
+ AI SkNx mulHi(SkNx m) const {
+ SkNx v20{_mm_mul_epu32(m.fVec, fVec)};
+ SkNx v31{_mm_mul_epu32(_mm_srli_si128(m.fVec, 4), _mm_srli_si128(fVec, 4))};
+
+ return SkNx{v20[1], v31[1], v20[3], v31[3]};
+ }
+
__m128i fVec;
};
-
template <>
class SkNx<4, uint16_t> {
public:
@@ -568,7 +574,7 @@ template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
#endif
}
-template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint8_t>(const Sk4b& src) {
+template<> AI /*static*/ Sk4u SkNx_cast<uint32_t, uint8_t>(const Sk4b& src) {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
const int _ = ~0;
return _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_));
@@ -578,6 +584,10 @@ template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint8_t>(const Sk4b& src) {
#endif
}
+template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint8_t>(const Sk4b& src) {
+ return SkNx_cast<uint32_t>(src).fVec;
+}
+
template<> AI /*static*/ Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
return _mm_cvtepi32_ps(SkNx_cast<int32_t>(src).fVec);
}