diff options
-rw-r--r-- | src/core/SkHalf.h | 29 | ||||
-rw-r--r-- | src/opts/SkNx_sse.h | 240 | ||||
-rw-r--r-- | src/opts/SkRasterPipeline_opts.h | 69 |
3 files changed, 0 insertions, 338 deletions
diff --git a/src/core/SkHalf.h b/src/core/SkHalf.h index e71cb8750a..dd978a2347 100644 --- a/src/core/SkHalf.h +++ b/src/core/SkHalf.h @@ -11,10 +11,6 @@ #include "SkNx.h" #include "SkTypes.h" -#if !defined(_MSC_VER) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 - #include <x86intrin.h> -#endif - // 16-bit floating point value // format is 1 bit sign, 5 bits exponent, 10 bits mantissa // only used for storage @@ -89,29 +85,4 @@ static inline Sk4h SkFloatToHalf_finite_ftz(const Sk4f& fs) { #endif } -static inline Sk8f SkHalfToFloat_finite_ftz(const Sk8h& hs) { -#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 - return _mm256_cvtph_ps(hs.fVec); - -#else - uint64_t parts[2]; - hs.store(parts); - return SkNx_join(SkHalfToFloat_finite_ftz(parts[0]), - SkHalfToFloat_finite_ftz(parts[1])); - -#endif -} - -static inline Sk8h SkFloatToHalf_finite_ftz(const Sk8f& fs) { -#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 - return _mm256_cvtps_ph(fs.fVec, _MM_FROUND_CUR_DIRECTION); - -#else - uint64_t parts[2]; - SkFloatToHalf_finite_ftz(fs.fLo).store(parts+0); - SkFloatToHalf_finite_ftz(fs.fHi).store(parts+1); - return Sk8h::Load(parts); -#endif -} - #endif diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h index e81872291e..3e59a9d0e8 100644 --- a/src/opts/SkNx_sse.h +++ b/src/opts/SkNx_sse.h @@ -329,7 +329,6 @@ public: AI void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); } AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) { - // TODO: AVX2 version __m128i _01 = _mm_loadu_si128(((__m128i*)ptr) + 0), _23 = _mm_loadu_si128(((__m128i*)ptr) + 1), _45 = _mm_loadu_si128(((__m128i*)ptr) + 2), @@ -351,7 +350,6 @@ public: *a = _mm_unpackhi_epi64(ba0123, ba4567); } AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) { - // TODO: AVX2 version const uint8_t* ptr8 = (const uint8_t*) ptr; __m128i rgb0 = _mm_loadu_si128((const __m128i*) (ptr8 + 0*2)); __m128i rgb1 = _mm_srli_si128(rgb0, 3*2); @@ -377,7 +375,6 @@ public: *b = _mm_unpacklo_epi64(bx03, bx47); } AI static void Store4(void* ptr, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) { - // TODO: AVX2 version __m128i rg0123 = _mm_unpacklo_epi16(r.fVec, g.fVec), // r0 g0 r1 g1 r2 g2 r3 g3 rg4567 = _mm_unpackhi_epi16(r.fVec, g.fVec), // r4 g4 r5 g5 r6 g6 r7 g7 ba0123 = _mm_unpacklo_epi16(b.fVec, a.fVec), @@ -486,243 +483,6 @@ public: __m128i fVec; }; -#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 - - template <> - class SkNx<8, uint8_t> { - public: - AI SkNx(const __m128i& vec) : fVec(vec) {} - - AI SkNx() {} - AI SkNx(uint8_t v) : fVec(_mm_set1_epi8(v)) {} - AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, - uint8_t e, uint8_t f, uint8_t g, uint8_t h) - : fVec(_mm_setr_epi8(a,b,c,d, e,f,g,h, 0,0,0,0, 0,0,0,0)) {} - - - AI static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); } - AI void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); } - - AI uint8_t operator[](int k) const { - SkASSERT(0 <= k && k < 8); - union { __m128i v; uint8_t us[16]; } pun = {fVec}; - return pun.us[k&7]; - } - - __m128i fVec; - }; - - template <> - class SkNx<8, int32_t> { - public: - AI SkNx(const __m256i& vec) : fVec(vec) {} - - AI SkNx() {} - AI SkNx(int32_t v) : fVec(_mm256_set1_epi32(v)) {} - AI SkNx(int32_t a, int32_t b, int32_t c, int32_t d, - int32_t e, int32_t f, int32_t g, int32_t h) - : fVec(_mm256_setr_epi32(a,b,c,d, e,f,g,h)) {} - - AI static SkNx Load(const void* ptr) { return _mm256_loadu_si256((const __m256i*)ptr); } - AI void store(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, fVec); } - - AI SkNx operator + (const SkNx& o) const { return _mm256_add_epi32(fVec, o.fVec); } - AI SkNx operator - (const SkNx& o) const { return _mm256_sub_epi32(fVec, o.fVec); } - AI SkNx operator * (const SkNx& o) const { return _mm256_mullo_epi32(fVec, o.fVec); } - - AI SkNx operator & (const SkNx& o) const { return _mm256_and_si256(fVec, o.fVec); } - AI SkNx operator | (const SkNx& o) const { return _mm256_or_si256(fVec, o.fVec); } - AI SkNx operator ^ (const SkNx& o) const { return _mm256_xor_si256(fVec, o.fVec); } - - AI SkNx operator << (int bits) const { return _mm256_slli_epi32(fVec, bits); } - AI SkNx operator >> (int bits) const { return _mm256_srai_epi32(fVec, bits); } - - AI int32_t operator[](int k) const { - SkASSERT(0 <= k && k < 8); - union { __m256i v; int32_t is[8]; } pun = {fVec}; - return pun.is[k&7]; - } - - __m256i fVec; - }; - - template <> - class SkNx<8, uint32_t> { - public: - AI SkNx(const __m256i& vec) : fVec(vec) {} - - AI SkNx() {} - AI SkNx(uint32_t v) : fVec(_mm256_set1_epi32(v)) {} - AI SkNx(uint32_t a, uint32_t b, uint32_t c, uint32_t d, - uint32_t e, uint32_t f, uint32_t g, uint32_t h) - : fVec(_mm256_setr_epi32(a,b,c,d, e,f,g,h)) {} - - AI static SkNx Load(const void* ptr) { return _mm256_loadu_si256((const __m256i*)ptr); } - AI void store(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, fVec); } - - AI SkNx operator + (const SkNx& o) const { return _mm256_add_epi32(fVec, o.fVec); } - AI SkNx operator - (const SkNx& o) const { return _mm256_sub_epi32(fVec, o.fVec); } - AI SkNx operator * (const SkNx& o) const { return _mm256_mullo_epi32(fVec, o.fVec); } - - AI SkNx operator & (const SkNx& o) const { return _mm256_and_si256(fVec, o.fVec); } - AI SkNx operator | (const SkNx& o) const { return _mm256_or_si256(fVec, o.fVec); } - AI SkNx operator ^ (const SkNx& o) const { return _mm256_xor_si256(fVec, o.fVec); } - - AI SkNx operator << (int bits) const { return _mm256_slli_epi32(fVec, bits); } - AI SkNx operator >> (int bits) const { return _mm256_srli_epi32(fVec, bits); } - - AI uint32_t operator[](int k) const { - SkASSERT(0 <= k && k < 8); - union { __m256i v; uint32_t us[8]; } pun = {fVec}; - return pun.us[k&7]; - } - - __m256i fVec; - }; - - // _mm256_unpack{lo,hi}_pd() auto-casting to and from __m256d. - AI static __m256 unpacklo_pd(__m256 x, __m256 y) { - return _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(x), _mm256_castps_pd(y))); - } - AI static __m256 unpackhi_pd(__m256 x, __m256 y) { - return _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(x), _mm256_castps_pd(y))); - } - - template <> - class SkNx<8, float> { - public: - AI SkNx(const __m256& vec) : fVec(vec) {} - - AI SkNx() {} - AI SkNx(float val) : fVec(_mm256_set1_ps(val)) {} - AI SkNx(float a, float b, float c, float d, - float e, float f, float g, float h) : fVec(_mm256_setr_ps(a,b,c,d,e,f,g,h)) {} - - AI static SkNx Load(const void* ptr) { return _mm256_loadu_ps((const float*)ptr); } - AI void store(void* ptr) const { _mm256_storeu_ps((float*)ptr, fVec); } - - AI static void Store4(void* ptr, - const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) { - __m256 rg0145 = _mm256_unpacklo_ps(r.fVec, g.fVec), // r0 g0 r1 g1 | r4 g4 r5 g5 - rg2367 = _mm256_unpackhi_ps(r.fVec, g.fVec), // r2 ... | r6 ... - ba0145 = _mm256_unpacklo_ps(b.fVec, a.fVec), // b0 a0 b1 a1 | b4 a4 b5 a5 - ba2367 = _mm256_unpackhi_ps(b.fVec, a.fVec); // b2 ... | b6 ... - - __m256 _04 = unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 | r4 g4 b4 a4 - _15 = unpackhi_pd(rg0145, ba0145), // r1 ... | r5 ... - _26 = unpacklo_pd(rg2367, ba2367), // r2 ... | r6 ... - _37 = unpackhi_pd(rg2367, ba2367); // r3 ... | r7 ... - - __m256 _01 = _mm256_permute2f128_ps(_04, _15, 32), // 32 == 0010 0000 == lo, lo - _23 = _mm256_permute2f128_ps(_26, _37, 32), - _45 = _mm256_permute2f128_ps(_04, _15, 49), // 49 == 0011 0001 == hi, hi - _67 = _mm256_permute2f128_ps(_26, _37, 49); - - _mm256_storeu_ps((float*)ptr + 0*8, _01); - _mm256_storeu_ps((float*)ptr + 1*8, _23); - _mm256_storeu_ps((float*)ptr + 2*8, _45); - _mm256_storeu_ps((float*)ptr + 3*8, _67); - } - AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) { - Sk4f rl, gl, bl, al, - rh, gh, bh, ah; - Sk4f::Load4((const float*)ptr + 0, &rl, &gl, &bl, &al); - Sk4f::Load4((const float*)ptr + 16, &rh, &gh, &bh, &ah); - *r = _mm256_setr_m128(rl.fVec, rh.fVec); - *g = _mm256_setr_m128(gl.fVec, gh.fVec); - *b = _mm256_setr_m128(bl.fVec, bh.fVec); - *a = _mm256_setr_m128(al.fVec, ah.fVec); - } - - AI SkNx operator+(const SkNx& o) const { return _mm256_add_ps(fVec, o.fVec); } - AI SkNx operator-(const SkNx& o) const { return _mm256_sub_ps(fVec, o.fVec); } - AI SkNx operator*(const SkNx& o) const { return _mm256_mul_ps(fVec, o.fVec); } - AI SkNx operator/(const SkNx& o) const { return _mm256_div_ps(fVec, o.fVec); } - - AI SkNx operator==(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_EQ_OQ); } - AI SkNx operator!=(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_NEQ_OQ); } - AI SkNx operator <(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_LT_OQ); } - AI SkNx operator >(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_GT_OQ); } - AI SkNx operator<=(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_LE_OQ); } - AI SkNx operator>=(const SkNx& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_GE_OQ); } - - AI static SkNx Min(const SkNx& l, const SkNx& r) { return _mm256_min_ps(l.fVec, r.fVec); } - AI static SkNx Max(const SkNx& l, const SkNx& r) { return _mm256_max_ps(l.fVec, r.fVec); } - - AI SkNx sqrt() const { return _mm256_sqrt_ps (fVec); } - AI SkNx rsqrt() const { return _mm256_rsqrt_ps(fVec); } - AI SkNx invert() const { return _mm256_rcp_ps (fVec); } - - AI SkNx abs() const { return _mm256_andnot_ps(_mm256_set1_ps(-0.0f), fVec); } - AI SkNx floor() const { return _mm256_floor_ps(fVec); } - - AI float operator[](int k) const { - SkASSERT(0 <= k && k < 8); - union { __m256 v; float fs[8]; } pun = {fVec}; - return pun.fs[k&7]; - } - - AI SkNx thenElse(const SkNx& t, const SkNx& e) const { - return _mm256_blendv_ps(e.fVec, t.fVec, fVec); - } - - __m256 fVec; - }; - - AI static void SkNx_split(const Sk8f& v, Sk4f* lo, Sk4f* hi) { - *lo = _mm256_extractf128_ps(v.fVec, 0); - *hi = _mm256_extractf128_ps(v.fVec, 1); - } - - AI static Sk8f SkNx_join(const Sk4f& lo, const Sk4f& hi) { - return _mm256_insertf128_ps(_mm256_castps128_ps256(lo.fVec), hi.fVec, 1); - } - - AI static Sk8f SkNx_fma(const Sk8f& a, const Sk8f& b, const Sk8f& c) { - return _mm256_fmadd_ps(a.fVec, b.fVec, c.fVec); - } - - template<> AI /*static*/ Sk8i SkNx_cast<int>(const Sk8b& src) { - return _mm256_cvtepu8_epi32(src.fVec); - } - - template<> AI /*static*/ Sk8f SkNx_cast<float>(const Sk8b& src) { - return _mm256_cvtepi32_ps(SkNx_cast<int>(src).fVec); - } - - template<> AI /*static*/ Sk8i SkNx_cast<int>(const Sk8h& src) { - return _mm256_cvtepu16_epi32(src.fVec); - } - - template<> AI /*static*/ Sk8f SkNx_cast<float>(const Sk8h& src) { - return _mm256_cvtepi32_ps(SkNx_cast<int>(src).fVec); - } - - template<> AI /*static*/ Sk8f SkNx_cast<float>(const Sk8i& src) { - return _mm256_cvtepi32_ps(src.fVec); - } - - template<> AI /*static*/ Sk8i SkNx_cast<int>(const Sk8f& src) { - return _mm256_cvttps_epi32(src.fVec); - } - - template<> AI /*static*/ Sk8h SkNx_cast<uint16_t>(const Sk8i& src) { - __m128i lo = _mm256_extractf128_si256(src.fVec, 0), - hi = _mm256_extractf128_si256(src.fVec, 1); - return _mm_packus_epi32(lo, hi); - } - - template<> AI /*static*/ Sk8h SkNx_cast<uint16_t>(const Sk8f& src) { - return SkNx_cast<uint16_t>(SkNx_cast<int>(src)); - } - - template<> AI /*static*/ Sk8b SkNx_cast<uint8_t>(const Sk8i& src) { - auto _16 = SkNx_cast<uint16_t>(src); - return _mm_packus_epi16(_16.fVec, _16.fVec); - } - -#endif - template<> AI /*static*/ Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) { return _mm_cvtepi32_ps(src.fVec); } diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h index 0378f4af1f..44cbe83618 100644 --- a/src/opts/SkRasterPipeline_opts.h +++ b/src/opts/SkRasterPipeline_opts.h @@ -23,11 +23,7 @@ namespace { -#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 - static constexpr int N = 8; -#else static constexpr int N = 4; -#endif using SkNf = SkNx<N, float>; using SkNi = SkNx<N, int32_t>; @@ -202,71 +198,6 @@ SI void store(size_t tail, const SkNx<N,T>& v, T* dst) { v.store(dst); } -#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 - SI __m256i mask(size_t tail) { - static const int masks[][8] = { - {~0,~0,~0,~0, ~0,~0,~0,~0 }, // remember, tail == 0 ~~> load all N - {~0, 0, 0, 0, 0, 0, 0, 0 }, - {~0,~0, 0, 0, 0, 0, 0, 0 }, - {~0,~0,~0, 0, 0, 0, 0, 0 }, - {~0,~0,~0,~0, 0, 0, 0, 0 }, - {~0,~0,~0,~0, ~0, 0, 0, 0 }, - {~0,~0,~0,~0, ~0,~0, 0, 0 }, - {~0,~0,~0,~0, ~0,~0,~0, 0 }, - }; - return SkNi::Load(masks + tail).fVec; - } - - SI SkNi load(size_t tail, const int32_t* src) { - return tail ? _mm256_maskload_epi32((const int*)src, mask(tail)) - : SkNi::Load(src); - } - SI SkNu load(size_t tail, const uint32_t* src) { - return tail ? _mm256_maskload_epi32((const int*)src, mask(tail)) - : SkNu::Load(src); - } - SI SkNf load(size_t tail, const float* src) { - return tail ? _mm256_maskload_ps((const float*)src, mask(tail)) - : SkNf::Load(src); - } - SI SkNi gather(size_t tail, const int32_t* src, const SkNi& offset) { - auto m = mask(tail); - return _mm256_mask_i32gather_epi32(SkNi(0).fVec, (const int*)src, offset.fVec, m, 4); - } - SI SkNu gather(size_t tail, const uint32_t* src, const SkNi& offset) { - auto m = mask(tail); - return _mm256_mask_i32gather_epi32(SkNi(0).fVec, (const int*)src, offset.fVec, m, 4); - } - SI SkNf gather(size_t tail, const float* src, const SkNi& offset) { - auto m = _mm256_castsi256_ps(mask(tail)); - return _mm256_mask_i32gather_ps(SkNf(0).fVec, (const float*)src, offset.fVec, m, 4); - } - - static const char* bug = "I don't think MSAN understands maskstore."; - - SI void store(size_t tail, const SkNi& v, int32_t* dst) { - if (tail) { - _mm256_maskstore_epi32((int*)dst, mask(tail), v.fVec); - return sk_msan_mark_initialized(dst, dst+tail, bug); - } - v.store(dst); - } - SI void store(size_t tail, const SkNu& v, uint32_t* dst) { - if (tail) { - _mm256_maskstore_epi32((int*)dst, mask(tail), v.fVec); - return sk_msan_mark_initialized(dst, dst+tail, bug); - } - v.store(dst); - } - SI void store(size_t tail, const SkNf& v, float* dst) { - if (tail) { - _mm256_maskstore_ps((float*)dst, mask(tail), v.fVec); - return sk_msan_mark_initialized(dst, dst+tail, bug); - } - v.store(dst); - } -#endif - SI SkNf SkNf_fma(const SkNf& f, const SkNf& m, const SkNf& a) { return SkNx_fma(f,m,a); } SI SkNi SkNf_round(const SkNf& x, const SkNf& scale) { |