author    | reed <reed@google.com>                                   | 2016-07-11 13:17:35 -0700
----------|----------------------------------------------------------|--------------------------
committer | Commit bot <commit-bot@chromium.org>                     | 2016-07-11 13:17:35 -0700
commit    | b42b785d1cbc98bd34aceae338060831b974f9c5 (patch)         |
tree      | 91f84ddd2f3cecf85c8c0dad4a5cfc48e2140c10 /include/private |
parent    | 7e602c2c6cb2fbeb70a3978e2148844b673a8a4c (diff)          |
try to speed up mapRect + round2i + contains
We call roundOut in a few places. If we can get SkNx::Ceil, we could implement that efficiently as well.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2133413002
CQ_INCLUDE_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review-Url: https://codereview.chromium.org/2133413002
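The roundOut mentioned above falls out of the new floor(): round(x) is floor(x + 0.5) for the coordinate magnitudes at play, and ceil(x) is -floor(-x). A minimal sketch of how this could be used (the helper names are illustrative, not part of this patch; the specialized Sk4f has no unary minus, so negation is spelled as subtraction from zero):

#include "SkNx.h"

// Hypothetical sketch: round four floats to nearest and store as ints,
// the shape of work round2i/mapRect do per rect.
static inline void round2i_sketch(const Sk4f& v, int dst[4]) {
    SkNx_cast<int>((v + 0.5f).floor()).store(dst);
}

// ceil(x) == -floor(-x): one way SkNx could get Ceil from floor().
static inline Sk4f ceil_sketch(const Sk4f& v) {
    return 0.0f - (0.0f - v).floor();
}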
Diffstat (limited to 'include/private')
-rw-r--r-- | include/private/SkNx.h      | 310
-rw-r--r-- | include/private/SkNx_neon.h | 453
-rw-r--r-- | include/private/SkNx_sse.h  | 374
3 files changed, 1137 insertions, 0 deletions
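The diff below builds SkNx<N,T> recursively: an N-wide vector is a pair of N/2-wide halves, bottoming out at a scalar, so only the hot widths need hand-written SSE/NEON specializations. A small usage sketch against that API (the values and exit-code check are illustrative only):

#include "SkNx.h"

int main() {
    Sk4f v(1.5f, 2.5f, 3.5f, 4.5f);
    Sk4f bgra = SkNx_shuffle<2,1,0,3>(v);   // lanes reordered: {3.5, 2.5, 1.5, 4.5}

    Sk8f w(0,1,2,3,4,5,6,7);                // generic pair-of-Sk4f type
    Sk4f lo, hi;
    SkNx_split(w, &lo, &hi);                // {0,1,2,3} and {4,5,6,7}
    Sk8f joined = SkNx_join(lo, hi);        // back to {0,...,7}

    return ((joined == w).allTrue() && bgra[0] == 3.5f) ? 0 : 1;
}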
diff --git a/include/private/SkNx.h b/include/private/SkNx.h new file mode 100644 index 0000000000..8d1b29dfaa --- /dev/null +++ b/include/private/SkNx.h @@ -0,0 +1,310 @@ +/* + * Copyright 2015 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#ifndef SkNx_DEFINED +#define SkNx_DEFINED + +//#define SKNX_NO_SIMD + +#include "SkScalar.h" +#include "SkTypes.h" +#include <limits> +#include <math.h> +#include <type_traits> + +#define SI static inline + +// The default SkNx<N,T> just proxies down to a pair of SkNx<N/2, T>. +template <int N, typename T> +struct SkNx { + typedef SkNx<N/2, T> Half; + + Half fLo, fHi; + + SkNx() = default; + SkNx(const Half& lo, const Half& hi) : fLo(lo), fHi(hi) {} + + SkNx(T v) : fLo(v), fHi(v) {} + + SkNx(T a, T b) : fLo(a) , fHi(b) { static_assert(N==2, ""); } + SkNx(T a, T b, T c, T d) : fLo(a,b), fHi(c,d) { static_assert(N==4, ""); } + SkNx(T a, T b, T c, T d, T e, T f, T g, T h) : fLo(a,b,c,d), fHi(e,f,g,h) { + static_assert(N==8, ""); + } + SkNx(T a, T b, T c, T d, T e, T f, T g, T h, + T i, T j, T k, T l, T m, T n, T o, T p) : fLo(a,b,c,d, e,f,g,h), fHi(i,j,k,l, m,n,o,p) { + static_assert(N==16, ""); + } + + T operator[](int k) const { + SkASSERT(0 <= k && k < N); + return k < N/2 ? fLo[k] : fHi[k-N/2]; + } + + static SkNx Load(const void* vptr) { + auto ptr = (const char*)vptr; + return { Half::Load(ptr), Half::Load(ptr + N/2*sizeof(T)) }; + } + void store(void* vptr) const { + auto ptr = (char*)vptr; + fLo.store(ptr); + fHi.store(ptr + N/2*sizeof(T)); + } + + bool anyTrue() const { return fLo.anyTrue() || fHi.anyTrue(); } + bool allTrue() const { return fLo.allTrue() && fHi.allTrue(); } + + SkNx abs() const { return { fLo. abs(), fHi. abs() }; } + SkNx sqrt() const { return { fLo. sqrt(), fHi. sqrt() }; } + SkNx rsqrt() const { return { fLo. rsqrt(), fHi. rsqrt() }; } + SkNx floor() const { return { fLo. floor(), fHi. 
floor() }; } + SkNx invert() const { return { fLo.invert(), fHi.invert() }; } + + SkNx operator!() const { return { !fLo, !fHi }; } + SkNx operator-() const { return { -fLo, -fHi }; } + SkNx operator~() const { return { ~fLo, ~fHi }; } + + SkNx operator<<(int bits) const { return { fLo << bits, fHi << bits }; } + SkNx operator>>(int bits) const { return { fLo >> bits, fHi >> bits }; } + + SkNx operator+(const SkNx& y) const { return { fLo + y.fLo, fHi + y.fHi }; } + SkNx operator-(const SkNx& y) const { return { fLo - y.fLo, fHi - y.fHi }; } + SkNx operator*(const SkNx& y) const { return { fLo * y.fLo, fHi * y.fHi }; } + SkNx operator/(const SkNx& y) const { return { fLo / y.fLo, fHi / y.fHi }; } + + SkNx operator&(const SkNx& y) const { return { fLo & y.fLo, fHi & y.fHi }; } + SkNx operator|(const SkNx& y) const { return { fLo | y.fLo, fHi | y.fHi }; } + SkNx operator^(const SkNx& y) const { return { fLo ^ y.fLo, fHi ^ y.fHi }; } + + SkNx operator==(const SkNx& y) const { return { fLo == y.fLo, fHi == y.fHi }; } + SkNx operator!=(const SkNx& y) const { return { fLo != y.fLo, fHi != y.fHi }; } + SkNx operator<=(const SkNx& y) const { return { fLo <= y.fLo, fHi <= y.fHi }; } + SkNx operator>=(const SkNx& y) const { return { fLo >= y.fLo, fHi >= y.fHi }; } + SkNx operator< (const SkNx& y) const { return { fLo < y.fLo, fHi < y.fHi }; } + SkNx operator> (const SkNx& y) const { return { fLo > y.fLo, fHi > y.fHi }; } + + SkNx saturatedAdd(const SkNx& y) const { + return { fLo.saturatedAdd(y.fLo), fHi.saturatedAdd(y.fHi) }; + } + SkNx thenElse(const SkNx& t, const SkNx& e) const { + return { fLo.thenElse(t.fLo, e.fLo), fHi.thenElse(t.fHi, e.fHi) }; + } + + static SkNx Min(const SkNx& x, const SkNx& y) { + return { Half::Min(x.fLo, y.fLo), Half::Min(x.fHi, y.fHi) }; + } + static SkNx Max(const SkNx& x, const SkNx& y) { + return { Half::Max(x.fLo, y.fLo), Half::Max(x.fHi, y.fHi) }; + } +}; + +// The N -> N/2 recursion bottoms out at N == 1, a scalar value. +template <typename T> +struct SkNx<1,T> { + T fVal; + + SkNx() = default; + SkNx(T v) : fVal(v) {} + + T operator[](int k) const { + SkASSERT(k == 0); + return fVal; + } + + static SkNx Load(const void* ptr) { + SkNx v; + memcpy(&v, ptr, sizeof(T)); + return v; + } + void store(void* ptr) const { memcpy(ptr, &fVal, sizeof(T)); } + + bool anyTrue() const { return fVal != 0; } + bool allTrue() const { return fVal != 0; } + + SkNx abs() const { return Abs(fVal); } + SkNx sqrt() const { return Sqrt(fVal); } + SkNx rsqrt() const { return T(1) / this->sqrt(); } + SkNx floor() const { return Floor(fVal); } + SkNx invert() const { return T(1) / *this; } + + SkNx operator!() const { return !fVal; } + SkNx operator-() const { return -fVal; } + SkNx operator~() const { return FromBits(~ToBits(fVal)); } + + SkNx operator<<(int bits) const { return fVal << bits; } + SkNx operator>>(int bits) const { return fVal >> bits; } + + SkNx operator+(const SkNx& y) const { return fVal + y.fVal; } + SkNx operator-(const SkNx& y) const { return fVal - y.fVal; } + SkNx operator*(const SkNx& y) const { return fVal * y.fVal; } + SkNx operator/(const SkNx& y) const { return fVal / y.fVal; } + + SkNx operator&(const SkNx& y) const { return FromBits(ToBits(fVal) & ToBits(y.fVal)); } + SkNx operator|(const SkNx& y) const { return FromBits(ToBits(fVal) | ToBits(y.fVal)); } + SkNx operator^(const SkNx& y) const { return FromBits(ToBits(fVal) ^ ToBits(y.fVal)); } + + SkNx operator==(const SkNx& y) const { return FromBits(fVal == y.fVal ? 
~0 : 0); } + SkNx operator!=(const SkNx& y) const { return FromBits(fVal != y.fVal ? ~0 : 0); } + SkNx operator<=(const SkNx& y) const { return FromBits(fVal <= y.fVal ? ~0 : 0); } + SkNx operator>=(const SkNx& y) const { return FromBits(fVal >= y.fVal ? ~0 : 0); } + SkNx operator< (const SkNx& y) const { return FromBits(fVal < y.fVal ? ~0 : 0); } + SkNx operator> (const SkNx& y) const { return FromBits(fVal > y.fVal ? ~0 : 0); } + + static SkNx Min(const SkNx& x, const SkNx& y) { return x.fVal < y.fVal ? x : y; } + static SkNx Max(const SkNx& x, const SkNx& y) { return x.fVal > y.fVal ? x : y; } + + SkNx saturatedAdd(const SkNx& y) const { + static_assert(std::is_unsigned<T>::value, ""); + T sum = fVal + y.fVal; + return sum < fVal ? std::numeric_limits<T>::max() : sum; + } + + SkNx thenElse(const SkNx& t, const SkNx& e) const { return fVal != 0 ? t : e; } + +private: + // Helper functions to choose the right float/double methods. (In <cmath> madness lies...) + static float Abs(float val) { return ::fabsf(val); } + static float Sqrt(float val) { return ::sqrtf(val); } + static float Floor(float val) { return ::floorf(val); } + + static double Abs(double val) { return ::fabs(val); } + static double Sqrt(double val) { return ::sqrt(val); } + static double Floor(double val) { return ::floor(val); } + + // Helper functions for working with floats/doubles as bit patterns. + template <typename U> static U ToBits(U v) { return v; } + static int32_t ToBits(float v) { int32_t bits; memcpy(&bits, &v, sizeof(v)); return bits; } + static int64_t ToBits(double v) { int64_t bits; memcpy(&bits, &v, sizeof(v)); return bits; } + + template <typename Bits> static T FromBits(Bits bits) { + static_assert(std::is_pod<T >::value && + std::is_pod<Bits>::value && + sizeof(T) <= sizeof(Bits), ""); + T val; + memcpy(&val, &bits, sizeof(T)); + return val; + } +}; + +// Allow scalars on the left or right of binary operators, and things like +=, &=, etc. 
+#define V template <int N, typename T> SI SkNx<N,T> + V operator+ (T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) + y; } + V operator- (T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) - y; } + V operator* (T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) * y; } + V operator/ (T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) / y; } + V operator& (T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) & y; } + V operator| (T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) | y; } + V operator^ (T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) ^ y; } + V operator==(T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) == y; } + V operator!=(T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) != y; } + V operator<=(T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) <= y; } + V operator>=(T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) >= y; } + V operator< (T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) < y; } + V operator> (T x, const SkNx<N,T>& y) { return SkNx<N,T>(x) > y; } + + V operator+ (const SkNx<N,T>& x, T y) { return x + SkNx<N,T>(y); } + V operator- (const SkNx<N,T>& x, T y) { return x - SkNx<N,T>(y); } + V operator* (const SkNx<N,T>& x, T y) { return x * SkNx<N,T>(y); } + V operator/ (const SkNx<N,T>& x, T y) { return x / SkNx<N,T>(y); } + V operator& (const SkNx<N,T>& x, T y) { return x & SkNx<N,T>(y); } + V operator| (const SkNx<N,T>& x, T y) { return x | SkNx<N,T>(y); } + V operator^ (const SkNx<N,T>& x, T y) { return x ^ SkNx<N,T>(y); } + V operator==(const SkNx<N,T>& x, T y) { return x == SkNx<N,T>(y); } + V operator!=(const SkNx<N,T>& x, T y) { return x != SkNx<N,T>(y); } + V operator<=(const SkNx<N,T>& x, T y) { return x <= SkNx<N,T>(y); } + V operator>=(const SkNx<N,T>& x, T y) { return x >= SkNx<N,T>(y); } + V operator< (const SkNx<N,T>& x, T y) { return x < SkNx<N,T>(y); } + V operator> (const SkNx<N,T>& x, T y) { return x > SkNx<N,T>(y); } + + V& operator<<=(SkNx<N,T>& x, int bits) { return (x = x << bits); } + V& operator>>=(SkNx<N,T>& x, int bits) { return (x = x >> bits); } + + V& operator +=(SkNx<N,T>& x, const SkNx<N,T>& y) { return (x = x + y); } + V& operator -=(SkNx<N,T>& x, const SkNx<N,T>& y) { return (x = x - y); } + V& operator *=(SkNx<N,T>& x, const SkNx<N,T>& y) { return (x = x * y); } + V& operator /=(SkNx<N,T>& x, const SkNx<N,T>& y) { return (x = x / y); } + V& operator &=(SkNx<N,T>& x, const SkNx<N,T>& y) { return (x = x & y); } + V& operator |=(SkNx<N,T>& x, const SkNx<N,T>& y) { return (x = x | y); } + V& operator ^=(SkNx<N,T>& x, const SkNx<N,T>& y) { return (x = x ^ y); } + + V& operator +=(SkNx<N,T>& x, T y) { return (x = x + SkNx<N,T>(y)); } + V& operator -=(SkNx<N,T>& x, T y) { return (x = x - SkNx<N,T>(y)); } + V& operator *=(SkNx<N,T>& x, T y) { return (x = x * SkNx<N,T>(y)); } + V& operator /=(SkNx<N,T>& x, T y) { return (x = x / SkNx<N,T>(y)); } + V& operator &=(SkNx<N,T>& x, T y) { return (x = x & SkNx<N,T>(y)); } + V& operator |=(SkNx<N,T>& x, T y) { return (x = x | SkNx<N,T>(y)); } + V& operator ^=(SkNx<N,T>& x, T y) { return (x = x ^ SkNx<N,T>(y)); } +#undef V + +// SkNx<N,T> ~~> SkNx<N/2,T> + SkNx<N/2,T> +template <int N, typename T> +SI void SkNx_split(const SkNx<N,T>& v, SkNx<N/2,T>* lo, SkNx<N/2,T>* hi) { + *lo = v.fLo; + *hi = v.fHi; +} + +// SkNx<N/2,T> + SkNx<N/2,T> ~~> SkNx<N,T> +template <int N, typename T> +SI SkNx<N*2,T> SkNx_join(const SkNx<N,T>& lo, const SkNx<N,T>& hi) { + return { lo, hi }; +} + +// A very generic shuffle. Can reorder, duplicate, contract, expand... 
+// Sk4f v = { R,G,B,A }; +// SkNx_shuffle<2,1,0,3>(v) ~~> {B,G,R,A} +// SkNx_shuffle<2,1>(v) ~~> {B,G} +// SkNx_shuffle<2,1,2,1,2,1,2,1>(v) ~~> {B,G,B,G,B,G,B,G} +// SkNx_shuffle<3,3,3,3>(v) ~~> {A,A,A,A} +template <int... Ix, int N, typename T> +SI SkNx<sizeof...(Ix),T> SkNx_shuffle(const SkNx<N,T>& v) { + return { v[Ix]... }; +} + +// Cast from SkNx<N, Src> to SkNx<N, Dst>, as if you called static_cast<Dst>(Src). +template <typename Dst, typename Src, int N> +SI SkNx<N,Dst> SkNx_cast(const SkNx<N,Src>& v) { + return { SkNx_cast<Dst>(v.fLo), SkNx_cast<Dst>(v.fHi) }; +} +template <typename Dst, typename Src> +SI SkNx<1,Dst> SkNx_cast(const SkNx<1,Src>& v) { + return static_cast<Dst>(v.fVal); +} + +typedef SkNx<2, float> Sk2f; +typedef SkNx<4, float> Sk4f; +typedef SkNx<8, float> Sk8f; +typedef SkNx<16, float> Sk16f; + +typedef SkNx<2, SkScalar> Sk2s; +typedef SkNx<4, SkScalar> Sk4s; +typedef SkNx<8, SkScalar> Sk8s; +typedef SkNx<16, SkScalar> Sk16s; + +typedef SkNx<4, uint8_t> Sk4b; +typedef SkNx<8, uint8_t> Sk8b; +typedef SkNx<16, uint8_t> Sk16b; + +typedef SkNx<4, uint16_t> Sk4h; +typedef SkNx<8, uint16_t> Sk8h; +typedef SkNx<16, uint16_t> Sk16h; + +typedef SkNx<4, int> Sk4i; + +// Include platform specific specializations if available. +#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 + #include "SkNx_sse.h" +#elif !defined(SKNX_NO_SIMD) && defined(SK_ARM_HAS_NEON) + #include "SkNx_neon.h" +#endif + +SI void Sk4f_ToBytes(uint8_t p[16], const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) { + SkNx_cast<uint8_t>(SkNx_join(SkNx_join(a,b), SkNx_join(c,d))).store(p); +} + +#undef SI + +#endif//SkNx_DEFINED diff --git a/include/private/SkNx_neon.h b/include/private/SkNx_neon.h new file mode 100644 index 0000000000..bb81cded18 --- /dev/null +++ b/include/private/SkNx_neon.h @@ -0,0 +1,453 @@ +/* + * Copyright 2015 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#ifndef SkNx_neon_DEFINED +#define SkNx_neon_DEFINED + +#include <arm_neon.h> + +#define SKNX_IS_FAST + +// ARMv8 has vrndmq_f32 to floor 4 floats. Here we emulate it: +// - roundtrip through integers via truncation +// - subtract 1 if that's too big (possible for negative values). +// This restricts the domain of our inputs to a maximum somewhere around 2^31. Seems plenty big. +static inline float32x4_t armv7_vrndmq_f32(float32x4_t v) { + auto roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v)); + auto too_big = vcgtq_f32(roundtrip, v); + return vsubq_f32(roundtrip, (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdupq_n_f32(1))); +} + +// Well, this is absurd. The shifts require compile-time constant arguments.
+ +#define SHIFT8(op, v, bits) switch(bits) { \ + case 1: return op(v, 1); case 2: return op(v, 2); case 3: return op(v, 3); \ + case 4: return op(v, 4); case 5: return op(v, 5); case 6: return op(v, 6); \ + case 7: return op(v, 7); \ + } return fVec + +#define SHIFT16(op, v, bits) if (bits < 8) { SHIFT8(op, v, bits); } switch(bits) { \ + case 8: return op(v, 8); case 9: return op(v, 9); \ + case 10: return op(v, 10); case 11: return op(v, 11); case 12: return op(v, 12); \ + case 13: return op(v, 13); case 14: return op(v, 14); case 15: return op(v, 15); \ + } return fVec + +#define SHIFT32(op, v, bits) if (bits < 16) { SHIFT16(op, v, bits); } switch(bits) { \ + case 16: return op(v, 16); case 17: return op(v, 17); case 18: return op(v, 18); \ + case 19: return op(v, 19); case 20: return op(v, 20); case 21: return op(v, 21); \ + case 22: return op(v, 22); case 23: return op(v, 23); case 24: return op(v, 24); \ + case 25: return op(v, 25); case 26: return op(v, 26); case 27: return op(v, 27); \ + case 28: return op(v, 28); case 29: return op(v, 29); case 30: return op(v, 30); \ + case 31: return op(v, 31); } return fVec + +template <> +class SkNx<2, float> { +public: + SkNx(float32x2_t vec) : fVec(vec) {} + + SkNx() {} + SkNx(float val) : fVec(vdup_n_f32(val)) {} + static SkNx Load(const void* ptr) { return vld1_f32((const float*)ptr); } + SkNx(float a, float b) { fVec = (float32x2_t) { a, b }; } + + void store(void* ptr) const { vst1_f32((float*)ptr, fVec); } + + SkNx invert() const { + float32x2_t est0 = vrecpe_f32(fVec), + est1 = vmul_f32(vrecps_f32(est0, fVec), est0); + return est1; + } + + SkNx operator + (const SkNx& o) const { return vadd_f32(fVec, o.fVec); } + SkNx operator - (const SkNx& o) const { return vsub_f32(fVec, o.fVec); } + SkNx operator * (const SkNx& o) const { return vmul_f32(fVec, o.fVec); } + SkNx operator / (const SkNx& o) const { + #if defined(SK_CPU_ARM64) + return vdiv_f32(fVec, o.fVec); + #else + float32x2_t est0 = vrecpe_f32(o.fVec), + est1 = vmul_f32(vrecps_f32(est0, o.fVec), est0), + est2 = vmul_f32(vrecps_f32(est1, o.fVec), est1); + return vmul_f32(fVec, est2); + #endif + } + + SkNx operator == (const SkNx& o) const { return vreinterpret_f32_u32(vceq_f32(fVec, o.fVec)); } + SkNx operator < (const SkNx& o) const { return vreinterpret_f32_u32(vclt_f32(fVec, o.fVec)); } + SkNx operator > (const SkNx& o) const { return vreinterpret_f32_u32(vcgt_f32(fVec, o.fVec)); } + SkNx operator <= (const SkNx& o) const { return vreinterpret_f32_u32(vcle_f32(fVec, o.fVec)); } + SkNx operator >= (const SkNx& o) const { return vreinterpret_f32_u32(vcge_f32(fVec, o.fVec)); } + SkNx operator != (const SkNx& o) const { + return vreinterpret_f32_u32(vmvn_u32(vceq_f32(fVec, o.fVec))); + } + + static SkNx Min(const SkNx& l, const SkNx& r) { return vmin_f32(l.fVec, r.fVec); } + static SkNx Max(const SkNx& l, const SkNx& r) { return vmax_f32(l.fVec, r.fVec); } + + SkNx rsqrt() const { + float32x2_t est0 = vrsqrte_f32(fVec); + return vmul_f32(vrsqrts_f32(fVec, vmul_f32(est0, est0)), est0); + } + + SkNx sqrt() const { + #if defined(SK_CPU_ARM64) + return vsqrt_f32(fVec); + #else + float32x2_t est0 = vrsqrte_f32(fVec), + est1 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est0, est0)), est0), + est2 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est1, est1)), est1); + return vmul_f32(fVec, est2); + #endif + } + + float operator[](int k) const { + SkASSERT(0 <= k && k < 2); + union { float32x2_t v; float fs[2]; } pun = {fVec}; + return pun.fs[k&1]; + } + + bool allTrue() const { + auto v = 
vreinterpret_u32_f32(fVec); + return vget_lane_u32(v,0) && vget_lane_u32(v,1); + } + bool anyTrue() const { + auto v = vreinterpret_u32_f32(fVec); + return vget_lane_u32(v,0) || vget_lane_u32(v,1); + } + + float32x2_t fVec; +}; + +template <> +class SkNx<4, float> { +public: + SkNx(float32x4_t vec) : fVec(vec) {} + + SkNx() {} + SkNx(float val) : fVec(vdupq_n_f32(val)) {} + static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); } + SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; } + + void store(void* ptr) const { vst1q_f32((float*)ptr, fVec); } + SkNx invert() const { + float32x4_t est0 = vrecpeq_f32(fVec), + est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0); + return est1; + } + + SkNx operator + (const SkNx& o) const { return vaddq_f32(fVec, o.fVec); } + SkNx operator - (const SkNx& o) const { return vsubq_f32(fVec, o.fVec); } + SkNx operator * (const SkNx& o) const { return vmulq_f32(fVec, o.fVec); } + SkNx operator / (const SkNx& o) const { + #if defined(SK_CPU_ARM64) + return vdivq_f32(fVec, o.fVec); + #else + float32x4_t est0 = vrecpeq_f32(o.fVec), + est1 = vmulq_f32(vrecpsq_f32(est0, o.fVec), est0), + est2 = vmulq_f32(vrecpsq_f32(est1, o.fVec), est1); + return vmulq_f32(fVec, est2); + #endif + } + + SkNx operator==(const SkNx& o) const { return vreinterpretq_f32_u32(vceqq_f32(fVec, o.fVec)); } + SkNx operator <(const SkNx& o) const { return vreinterpretq_f32_u32(vcltq_f32(fVec, o.fVec)); } + SkNx operator >(const SkNx& o) const { return vreinterpretq_f32_u32(vcgtq_f32(fVec, o.fVec)); } + SkNx operator<=(const SkNx& o) const { return vreinterpretq_f32_u32(vcleq_f32(fVec, o.fVec)); } + SkNx operator>=(const SkNx& o) const { return vreinterpretq_f32_u32(vcgeq_f32(fVec, o.fVec)); } + SkNx operator!=(const SkNx& o) const { + return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec))); + } + + static SkNx Min(const SkNx& l, const SkNx& r) { return vminq_f32(l.fVec, r.fVec); } + static SkNx Max(const SkNx& l, const SkNx& r) { return vmaxq_f32(l.fVec, r.fVec); } + + SkNx abs() const { return vabsq_f32(fVec); } + SkNx floor() const { + #if defined(SK_CPU_ARM64) + return vrndmq_f32(fVec); + #else + return armv7_vrndmq_f32(fVec); + #endif + } + + + SkNx rsqrt() const { + float32x4_t est0 = vrsqrteq_f32(fVec); + return vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0); + } + + SkNx sqrt() const { + #if defined(SK_CPU_ARM64) + return vsqrtq_f32(fVec); + #else + float32x4_t est0 = vrsqrteq_f32(fVec), + est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0), + est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1); + return vmulq_f32(fVec, est2); + #endif + } + + float operator[](int k) const { + SkASSERT(0 <= k && k < 4); + union { float32x4_t v; float fs[4]; } pun = {fVec}; + return pun.fs[k&3]; + } + + bool allTrue() const { + auto v = vreinterpretq_u32_f32(fVec); + return vgetq_lane_u32(v,0) && vgetq_lane_u32(v,1) + && vgetq_lane_u32(v,2) && vgetq_lane_u32(v,3); + } + bool anyTrue() const { + auto v = vreinterpretq_u32_f32(fVec); + return vgetq_lane_u32(v,0) || vgetq_lane_u32(v,1) + || vgetq_lane_u32(v,2) || vgetq_lane_u32(v,3); + } + + SkNx thenElse(const SkNx& t, const SkNx& e) const { + return vbslq_f32(vreinterpretq_u32_f32(fVec), t.fVec, e.fVec); + } + + float32x4_t fVec; +}; + +// It's possible that for our current use cases, representing this as +// half a uint16x8_t might be better than representing it as a uint16x4_t. +// It'd make conversion to Sk4b one step simpler. 
+template <> +class SkNx<4, uint16_t> { +public: + SkNx(const uint16x4_t& vec) : fVec(vec) {} + + SkNx() {} + SkNx(uint16_t val) : fVec(vdup_n_u16(val)) {} + static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); } + + SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { + fVec = (uint16x4_t) { a,b,c,d }; + } + + void store(void* ptr) const { vst1_u16((uint16_t*)ptr, fVec); } + + SkNx operator + (const SkNx& o) const { return vadd_u16(fVec, o.fVec); } + SkNx operator - (const SkNx& o) const { return vsub_u16(fVec, o.fVec); } + SkNx operator * (const SkNx& o) const { return vmul_u16(fVec, o.fVec); } + + SkNx operator << (int bits) const { SHIFT16(vshl_n_u16, fVec, bits); } + SkNx operator >> (int bits) const { SHIFT16(vshr_n_u16, fVec, bits); } + + static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fVec); } + + uint16_t operator[](int k) const { + SkASSERT(0 <= k && k < 4); + union { uint16x4_t v; uint16_t us[4]; } pun = {fVec}; + return pun.us[k&3]; + } + + SkNx thenElse(const SkNx& t, const SkNx& e) const { + return vbsl_u16(fVec, t.fVec, e.fVec); + } + + uint16x4_t fVec; +}; + +template <> +class SkNx<8, uint16_t> { +public: + SkNx(const uint16x8_t& vec) : fVec(vec) {} + + SkNx() {} + SkNx(uint16_t val) : fVec(vdupq_n_u16(val)) {} + static SkNx Load(const void* ptr) { return vld1q_u16((const uint16_t*)ptr); } + + SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d, + uint16_t e, uint16_t f, uint16_t g, uint16_t h) { + fVec = (uint16x8_t) { a,b,c,d, e,f,g,h }; + } + + void store(void* ptr) const { vst1q_u16((uint16_t*)ptr, fVec); } + + SkNx operator + (const SkNx& o) const { return vaddq_u16(fVec, o.fVec); } + SkNx operator - (const SkNx& o) const { return vsubq_u16(fVec, o.fVec); } + SkNx operator * (const SkNx& o) const { return vmulq_u16(fVec, o.fVec); } + + SkNx operator << (int bits) const { SHIFT16(vshlq_n_u16, fVec, bits); } + SkNx operator >> (int bits) const { SHIFT16(vshrq_n_u16, fVec, bits); } + + static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.fVec); } + + uint16_t operator[](int k) const { + SkASSERT(0 <= k && k < 8); + union { uint16x8_t v; uint16_t us[8]; } pun = {fVec}; + return pun.us[k&7]; + } + + SkNx thenElse(const SkNx& t, const SkNx& e) const { + return vbslq_u16(fVec, t.fVec, e.fVec); + } + + uint16x8_t fVec; +}; + +template <> +class SkNx<4, uint8_t> { +public: + SkNx(const uint8x8_t& vec) : fVec(vec) {} + + SkNx() {} + SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d) { + fVec = (uint8x8_t){a,b,c,d, 0,0,0,0}; + } + static SkNx Load(const void* ptr) { + return (uint8x8_t)vld1_dup_u32((const uint32_t*)ptr); + } + void store(void* ptr) const { + return vst1_lane_u32((uint32_t*)ptr, (uint32x2_t)fVec, 0); + } + uint8_t operator[](int k) const { + SkASSERT(0 <= k && k < 4); + union { uint8x8_t v; uint8_t us[8]; } pun = {fVec}; + return pun.us[k&3]; + } + + // TODO as needed + + uint8x8_t fVec; +}; + +template <> +class SkNx<16, uint8_t> { +public: + SkNx(const uint8x16_t& vec) : fVec(vec) {} + + SkNx() {} + SkNx(uint8_t val) : fVec(vdupq_n_u8(val)) {} + static SkNx Load(const void* ptr) { return vld1q_u8((const uint8_t*)ptr); } + + SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, + uint8_t e, uint8_t f, uint8_t g, uint8_t h, + uint8_t i, uint8_t j, uint8_t k, uint8_t l, + uint8_t m, uint8_t n, uint8_t o, uint8_t p) { + fVec = (uint8x16_t) { a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p }; + } + + void store(void* ptr) const { vst1q_u8((uint8_t*)ptr, fVec); } + + SkNx saturatedAdd(const SkNx& o) const { return 
vqaddq_u8(fVec, o.fVec); } + + SkNx operator + (const SkNx& o) const { return vaddq_u8(fVec, o.fVec); } + SkNx operator - (const SkNx& o) const { return vsubq_u8(fVec, o.fVec); } + + static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u8(a.fVec, b.fVec); } + SkNx operator < (const SkNx& o) const { return vcltq_u8(fVec, o.fVec); } + + uint8_t operator[](int k) const { + SkASSERT(0 <= k && k < 16); + union { uint8x16_t v; uint8_t us[16]; } pun = {fVec}; + return pun.us[k&15]; + } + + SkNx thenElse(const SkNx& t, const SkNx& e) const { + return vbslq_u8(fVec, t.fVec, e.fVec); + } + + uint8x16_t fVec; +}; + +template <> +class SkNx<4, int> { +public: + SkNx(const int32x4_t& vec) : fVec(vec) {} + + SkNx() {} + SkNx(int v) { + fVec = vdupq_n_s32(v); + } + SkNx(int a, int b, int c, int d) { + fVec = (int32x4_t){a,b,c,d}; + } + static SkNx Load(const void* ptr) { + return vld1q_s32((const int32_t*)ptr); + } + void store(void* ptr) const { + return vst1q_s32((int32_t*)ptr, fVec); + } + int operator[](int k) const { + SkASSERT(0 <= k && k < 4); + union { int32x4_t v; int is[4]; } pun = {fVec}; + return pun.is[k&3]; + } + + SkNx operator + (const SkNx& o) const { return vaddq_s32(fVec, o.fVec); } + SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); } + SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); } + + SkNx operator | (const SkNx& o) const { return vorrq_s32(fVec, o.fVec); } + + SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); } + SkNx operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); } + + static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_s32(a.fVec, b.fVec); } + // TODO as needed + + int32x4_t fVec; +}; + +#undef SHIFT32 +#undef SHIFT16 +#undef SHIFT8 + +template<> inline Sk4i SkNx_cast<int, float>(const Sk4f& src) { + return vcvtq_s32_f32(src.fVec); + +} +template<> inline Sk4f SkNx_cast<float, int>(const Sk4i& src) { + return vcvtq_f32_s32(src.fVec); +} + +template<> inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) { + return vqmovn_u32(vcvtq_u32_f32(src.fVec)); +} + +template<> inline Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) { + return vcvtq_f32_u32(vmovl_u16(src.fVec)); +} + +template<> inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) { + uint32x4_t _32 = vcvtq_u32_f32(src.fVec); + uint16x4_t _16 = vqmovn_u32(_32); + return vqmovn_u16(vcombine_u16(_16, _16)); +} + +template<> inline Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) { + uint16x8_t _16 = vmovl_u8 (src.fVec) ; + uint32x4_t _32 = vmovl_u16(vget_low_u16(_16)); + return vcvtq_f32_u32(_32); +} + +template<> inline Sk16b SkNx_cast<uint8_t, float>(const Sk16f& src) { + Sk8f ab, cd; + SkNx_split(src, &ab, &cd); + + Sk4f a,b,c,d; + SkNx_split(ab, &a, &b); + SkNx_split(cd, &c, &d); + return vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec), + (uint8x16_t)vcvtq_u32_f32(b.fVec)).val[0], + vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec), + (uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0]).val[0]; +} + +template<> inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) { + return vget_low_u16(vmovl_u8(src.fVec)); +} + +template<> inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) { + return vmovn_u16(vcombine_u16(src.fVec, src.fVec)); +} + +#endif//SkNx_neon_DEFINED diff --git a/include/private/SkNx_sse.h b/include/private/SkNx_sse.h new file mode 100644 index 0000000000..65d9873c5c --- /dev/null +++ b/include/private/SkNx_sse.h @@ -0,0 +1,374 @@ +/* + * Copyright 2015 Google Inc. 
+ * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#ifndef SkNx_sse_DEFINED +#define SkNx_sse_DEFINED + +#include <immintrin.h> + +// This file may assume <= SSE2, but must check SK_CPU_SSE_LEVEL for anything more recent. +// If you do, make sure this is in a static inline function... anywhere else risks violating ODR. + +#define SKNX_IS_FAST + +template <> +class SkNx<2, float> { +public: + SkNx(const __m128& vec) : fVec(vec) {} + + SkNx() {} + SkNx(float val) : fVec(_mm_set1_ps(val)) {} + static SkNx Load(const void* ptr) { + return _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)ptr)); + } + SkNx(float a, float b) : fVec(_mm_setr_ps(a,b,0,0)) {} + + void store(void* ptr) const { _mm_storel_pi((__m64*)ptr, fVec); } + + SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); } + SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); } + SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); } + SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); } + + SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); } + SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); } + SkNx operator < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); } + SkNx operator > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); } + SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); } + SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); } + + static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); } + static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); } + + SkNx sqrt() const { return _mm_sqrt_ps (fVec); } + SkNx rsqrt() const { return _mm_rsqrt_ps(fVec); } + SkNx invert() const { return _mm_rcp_ps(fVec); } + + float operator[](int k) const { + SkASSERT(0 <= k && k < 2); + union { __m128 v; float fs[4]; } pun = {fVec}; + return pun.fs[k&1]; + } + + bool allTrue() const { return 0xff == (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); } + bool anyTrue() const { return 0x00 != (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); } + + __m128 fVec; +}; + +template <> +class SkNx<4, float> { +public: + SkNx(const __m128& vec) : fVec(vec) {} + + SkNx() {} + SkNx(float val) : fVec( _mm_set1_ps(val) ) {} + static SkNx Load(const void* ptr) { return _mm_loadu_ps((const float*)ptr); } + + SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {} + + void store(void* ptr) const { _mm_storeu_ps((float*)ptr, fVec); } + + SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); } + SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); } + SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); } + SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); } + + SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); } + SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); } + SkNx operator < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); } + SkNx operator > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); } + SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); } + SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); } + + static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); } + static SkNx 
Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); } + + SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); } + SkNx floor() const { + #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 + return _mm_floor_ps(fVec); + #else + // Emulate _mm_floor_ps() with SSE2: + // - roundtrip through integers via truncation + // - subtract 1 if that's too big (possible for negative values). + // This restricts the domain of our inputs to a maximum somewhere around 2^31. + // Seems plenty big. + __m128 roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(fVec)); + __m128 too_big = _mm_cmpgt_ps(roundtrip, fVec); + return _mm_sub_ps(roundtrip, _mm_and_ps(too_big, _mm_set1_ps(1.0f))); + #endif + } + + SkNx sqrt() const { return _mm_sqrt_ps (fVec); } + SkNx rsqrt() const { return _mm_rsqrt_ps(fVec); } + SkNx invert() const { return _mm_rcp_ps(fVec); } + + float operator[](int k) const { + SkASSERT(0 <= k && k < 4); + union { __m128 v; float fs[4]; } pun = {fVec}; + return pun.fs[k&3]; + } + + bool allTrue() const { return 0xffff == _mm_movemask_epi8(_mm_castps_si128(fVec)); } + bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(_mm_castps_si128(fVec)); } + + SkNx thenElse(const SkNx& t, const SkNx& e) const { + #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 + return _mm_blendv_ps(e.fVec, t.fVec, fVec); + #else + return _mm_or_ps(_mm_and_ps (fVec, t.fVec), + _mm_andnot_ps(fVec, e.fVec)); + #endif + } + + __m128 fVec; +}; + +template <> +class SkNx<4, int> { +public: + SkNx(const __m128i& vec) : fVec(vec) {} + + SkNx() {} + SkNx(int val) : fVec(_mm_set1_epi32(val)) {} + static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); } + SkNx(int a, int b, int c, int d) : fVec(_mm_setr_epi32(a,b,c,d)) {} + + void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); } + + SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); } + SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); } + SkNx operator * (const SkNx& o) const { + __m128i mul20 = _mm_mul_epu32(fVec, o.fVec), + mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4)); + return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)), + _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0))); + } + + SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); } + + SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); } + SkNx operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); } + + int operator[](int k) const { + SkASSERT(0 <= k && k < 4); + union { __m128i v; int is[4]; } pun = {fVec}; + return pun.is[k&3]; + } + + __m128i fVec; +}; + +template <> +class SkNx<4, uint16_t> { +public: + SkNx(const __m128i& vec) : fVec(vec) {} + + SkNx() {} + SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {} + static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); } + SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec(_mm_setr_epi16(a,b,c,d,0,0,0,0)) {} + + void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); } + + SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); } + SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); } + SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); } + + SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); } + SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); } + + uint16_t operator[](int k) const { + SkASSERT(0 <= k
&& k < 4); + union { __m128i v; uint16_t us[8]; } pun = {fVec}; + return pun.us[k&3]; + } + + __m128i fVec; +}; + +template <> +class SkNx<8, uint16_t> { +public: + SkNx(const __m128i& vec) : fVec(vec) {} + + SkNx() {} + SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {} + static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); } + SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d, + uint16_t e, uint16_t f, uint16_t g, uint16_t h) : fVec(_mm_setr_epi16(a,b,c,d,e,f,g,h)) {} + + void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); } + + SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); } + SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); } + SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); } + + SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); } + SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); } + + static SkNx Min(const SkNx& a, const SkNx& b) { + // No unsigned _mm_min_epu16, so we'll shift into a space where we can use the + // signed version, _mm_min_epi16, then shift back. + const uint16_t top = 0x8000; // Keep this separate from _mm_set1_epi16 or MSVC will whine. + const __m128i top_8x = _mm_set1_epi16(top); + return _mm_add_epi8(top_8x, _mm_min_epi16(_mm_sub_epi8(a.fVec, top_8x), + _mm_sub_epi8(b.fVec, top_8x))); + } + + SkNx thenElse(const SkNx& t, const SkNx& e) const { + return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), + _mm_andnot_si128(fVec, e.fVec)); + } + + uint16_t operator[](int k) const { + SkASSERT(0 <= k && k < 8); + union { __m128i v; uint16_t us[8]; } pun = {fVec}; + return pun.us[k&7]; + } + + __m128i fVec; +}; + +template <> +class SkNx<4, uint8_t> { +public: + SkNx() {} + SkNx(const __m128i& vec) : fVec(vec) {} + SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d) + : fVec(_mm_setr_epi8(a,b,c,d, 0,0,0,0, 0,0,0,0, 0,0,0,0)) {} + + + static SkNx Load(const void* ptr) { return _mm_cvtsi32_si128(*(const int*)ptr); } + void store(void* ptr) const { *(int*)ptr = _mm_cvtsi128_si32(fVec); } + + uint8_t operator[](int k) const { + SkASSERT(0 <= k && k < 4); + union { __m128i v; uint8_t us[16]; } pun = {fVec}; + return pun.us[k&3]; + } + + // TODO as needed + + __m128i fVec; +}; + +template <> +class SkNx<16, uint8_t> { +public: + SkNx(const __m128i& vec) : fVec(vec) {} + + SkNx() {} + SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {} + static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); } + SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, + uint8_t e, uint8_t f, uint8_t g, uint8_t h, + uint8_t i, uint8_t j, uint8_t k, uint8_t l, + uint8_t m, uint8_t n, uint8_t o, uint8_t p) + : fVec(_mm_setr_epi8(a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p)) {} + + void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); } + + SkNx saturatedAdd(const SkNx& o) const { return _mm_adds_epu8(fVec, o.fVec); } + + SkNx operator + (const SkNx& o) const { return _mm_add_epi8(fVec, o.fVec); } + SkNx operator - (const SkNx& o) const { return _mm_sub_epi8(fVec, o.fVec); } + + static SkNx Min(const SkNx& a, const SkNx& b) { return _mm_min_epu8(a.fVec, b.fVec); } + SkNx operator < (const SkNx& o) const { + // There's no unsigned _mm_cmplt_epu8, so we flip the sign bits then use a signed compare. 
+ auto flip = _mm_set1_epi8(char(0x80)); + return _mm_cmplt_epi8(_mm_xor_si128(flip, fVec), _mm_xor_si128(flip, o.fVec)); + } + + uint8_t operator[](int k) const { + SkASSERT(0 <= k && k < 16); + union { __m128i v; uint8_t us[16]; } pun = {fVec}; + return pun.us[k&15]; + } + + SkNx thenElse(const SkNx& t, const SkNx& e) const { + return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), + _mm_andnot_si128(fVec, e.fVec)); + } + + __m128i fVec; +}; + +template<> /*static*/ inline Sk4f SkNx_cast<float, int>(const Sk4i& src) { + return _mm_cvtepi32_ps(src.fVec); +} + +template <> /*static*/ inline Sk4i SkNx_cast<int, float>(const Sk4f& src) { + return _mm_cvttps_epi32(src.fVec); +} + +template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) { + auto _32 = _mm_cvttps_epi32(src.fVec); + // Ideally we'd use _mm_packus_epi32 here. But that's SSE4.1+. +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 + // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place. + const int _ = ~0; + return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_)); +#else + // With SSE2, we have to emulate _mm_packus_epi32 with _mm_packs_epi32: + _32 = _mm_sub_epi32(_32, _mm_set1_epi32((int)0x00008000)); + return _mm_add_epi16(_mm_packs_epi32(_32, _32), _mm_set1_epi16((short)0x8000)); +#endif +} + +template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) { + auto _32 = _mm_cvttps_epi32(src.fVec); +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 + const int _ = ~0; + return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_)); +#else + auto _16 = _mm_packus_epi16(_32, _32); + return _mm_packus_epi16(_16, _16); +#endif +} + +template<> /*static*/ inline Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) { +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 + const int _ = ~0; + auto _32 = _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_)); +#else + auto _16 = _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()), + _32 = _mm_unpacklo_epi16(_16, _mm_setzero_si128()); +#endif + return _mm_cvtepi32_ps(_32); +} + +template<> /*static*/ inline Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) { + auto _32 = _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128()); + return _mm_cvtepi32_ps(_32); +} + +template<> /*static*/ inline Sk16b SkNx_cast<uint8_t, float>(const Sk16f& src) { + Sk8f ab, cd; + SkNx_split(src, &ab, &cd); + + Sk4f a,b,c,d; + SkNx_split(ab, &a, &b); + SkNx_split(cd, &c, &d); + + return _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec), + _mm_cvttps_epi32(b.fVec)), + _mm_packus_epi16(_mm_cvttps_epi32(c.fVec), + _mm_cvttps_epi32(d.fVec))); +} + +template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) { + return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()); +} + +template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) { + return _mm_packus_epi16(src.fVec, src.fVec); +} + +#endif//SkNx_sse_DEFINED
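As a closing illustration, the cast specializations above let per-channel pixel math round-trip between bytes and floats. A hedged sketch (the function is hypothetical, not part of this patch; keep k in [0,1], since out-of-range narrowing behavior differs between the SSE and NEON paths):

#include <stdint.h>
#include "SkNx.h"

// Scale four 8-bit channels in float precision, then pack back to bytes.
static void scale4_sketch(const uint8_t src[4], uint8_t dst[4], float k) {
    Sk4f f = SkNx_cast<float>(Sk4b::Load(src));   // widen u8 -> f32
    SkNx_cast<uint8_t>(f * k).store(dst);         // narrow f32 -> u8
}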