diff options
-rw-r--r-- | gm/gradients.cpp | 2 | ||||
-rw-r--r-- | src/core/SkNx.h | 14 | ||||
-rw-r--r-- | src/effects/gradients/SkRadialGradient.cpp | 87 | ||||
-rw-r--r-- | src/opts/SkNx_neon.h | 44 | ||||
-rw-r--r-- | src/opts/SkNx_sse.h | 40 | ||||
-rw-r--r-- | tests/SkNxTest.cpp | 10 |
6 files changed, 189 insertions, 8 deletions
diff --git a/gm/gradients.cpp b/gm/gradients.cpp index cedaff3322..7ff9d380d1 100644 --- a/gm/gradients.cpp +++ b/gm/gradients.cpp @@ -448,7 +448,7 @@ protected: const SkScalar kRadius = 3000; const SkColor gColors[] = { 0xFFFFFFFF, 0xFF000000 }; fShader.reset(SkGradientShader::CreateRadial(center, kRadius, gColors, NULL, 2, - SkShader::kClamp_TileMode)); + SkShader::kClamp_TileMode)); } void onDraw(SkCanvas* canvas) override { diff --git a/src/core/SkNx.h b/src/core/SkNx.h index 8244e9026c..5b2110bdac 100644 --- a/src/core/SkNx.h +++ b/src/core/SkNx.h @@ -77,6 +77,10 @@ private: template <int N, typename T> class SkNf { typedef SkNb<N, sizeof(T)> Nb; + + static int32_t MyNi(float); + static int64_t MyNi(double); + typedef SkNi<N, decltype(MyNi(T()))> Ni; public: SkNf() {} explicit SkNf(T val) : fLo(val), fHi(val) {} @@ -93,6 +97,8 @@ public: fHi.store(vals+N/2); } + Ni castTrunc() const { return Ni(fLo.castTrunc(), fHi.castTrunc()); } + SkNf operator + (const SkNf& o) const { return SkNf(fLo + o.fLo, fHi + o.fHi); } SkNf operator - (const SkNf& o) const { return SkNf(fLo - o.fLo, fHi - o.fHi); } SkNf operator * (const SkNf& o) const { return SkNf(fLo * o.fLo, fHi * o.fHi); } @@ -172,6 +178,10 @@ private: template <typename T> class SkNf<1,T> { typedef SkNb<1, sizeof(T)> Nb; + + static int32_t MyNi(float); + static int64_t MyNi(double); + typedef SkNi<1, decltype(MyNi(T()))> Ni; public: SkNf() {} explicit SkNf(T val) : fVal(val) {} @@ -179,6 +189,8 @@ public: void store(T vals[1]) const { vals[0] = fVal; } + Ni castTrunc() const { return Ni(fVal); } + SkNf operator + (const SkNf& o) const { return SkNf(fVal + o.fVal); } SkNf operator - (const SkNf& o) const { return SkNf(fVal - o.fVal); } SkNf operator * (const SkNf& o) const { return SkNf(fVal * o.fVal); } @@ -248,4 +260,6 @@ typedef SkNf<4, SkScalar> Sk4s; typedef SkNi<4, uint16_t> Sk4h; typedef SkNi<8, uint16_t> Sk8h; +typedef SkNi<4, int> Sk4i; + #endif//SkNx_DEFINED diff --git 
a/src/effects/gradients/SkRadialGradient.cpp b/src/effects/gradients/SkRadialGradient.cpp index b25a8750a2..bf3c821f6b 100644 --- a/src/effects/gradients/SkRadialGradient.cpp +++ b/src/effects/gradients/SkRadialGradient.cpp @@ -8,6 +8,7 @@ #include "SkRadialGradient.h" #include "SkRadialGradient_Table.h" +#include "SkNx.h" #define kSQRT_TABLE_BITS 11 #define kSQRT_TABLE_SIZE (1 << kSQRT_TABLE_BITS) @@ -270,13 +271,16 @@ void SkRadialGradient::flatten(SkWriteBuffer& buffer) const { namespace { inline bool radial_completely_pinned(int fx, int dx, int fy, int dy) { - // fast, overly-conservative test: checks unit square instead - // of unit circle - bool xClamped = (fx >= SK_FixedHalf && dx >= 0) || - (fx <= -SK_FixedHalf && dx <= 0); - bool yClamped = (fy >= SK_FixedHalf && dy >= 0) || - (fy <= -SK_FixedHalf && dy <= 0); + // fast, overly-conservative test: checks unit square instead of unit circle + bool xClamped = (fx >= SK_FixedHalf && dx >= 0) || (fx <= -SK_FixedHalf && dx <= 0); + bool yClamped = (fy >= SK_FixedHalf && dy >= 0) || (fy <= -SK_FixedHalf && dy <= 0); + return xClamped || yClamped; +} +inline bool radial_completely_pinned(SkScalar fx, SkScalar dx, SkScalar fy, SkScalar dy) { + // fast, overly-conservative test: checks unit square instead of unit circle + bool xClamped = (fx >= 1 && dx >= 0) || (fx <= -1 && dx <= 0); + bool yClamped = (fy >= 1 && dy >= 0) || (fy <= -1 && dy <= 0); return xClamped || yClamped; } @@ -373,6 +377,70 @@ void shadeSpan_radial_clamp(SkScalar sfx, SkScalar sdx, } } +// TODO: can we get away with 0th approximation of inverse-sqrt (i.e. faster than rsqrt)? 
+// seems like ~10bits is more than enough for our use, since we want a byte-index +static inline Sk4f fast_sqrt(const Sk4f& R) { + return R * R.rsqrt(); +} + +static inline Sk4f sum_squares(const Sk4f& a, const Sk4f& b) { + return a * a + b * b; +} + +void shadeSpan_radial_clamp2(SkScalar sfx, SkScalar sdx, SkScalar sfy, SkScalar sdy, + SkPMColor* SK_RESTRICT dstC, const SkPMColor* SK_RESTRICT cache, + int count, int toggle) { + if (radial_completely_pinned(sfx, sdx, sfy, sdy)) { + unsigned fi = SkGradientShaderBase::kCache32Count - 1; + sk_memset32_dither(dstC, + cache[toggle + fi], + cache[next_dither_toggle(toggle) + fi], + count); + } else { + const Sk4f max(255); + const float scale = 255; + sfx *= scale; + sfy *= scale; + sdx *= scale; + sdy *= scale; + const Sk4f fx4(sfx, sfx + sdx, sfx + 2*sdx, sfx + 3*sdx); + const Sk4f fy4(sfy, sfy + sdy, sfy + 2*sdy, sfy + 3*sdy); + const Sk4f dx4(sdx * 4); + const Sk4f dy4(sdy * 4); + + Sk4f tmpxy = fx4 * dx4 + fy4 * dy4; + Sk4f tmpdxdy = sum_squares(dx4, dy4); + Sk4f R = sum_squares(fx4, fy4); + Sk4f dR = tmpxy + tmpxy + tmpdxdy; + const Sk4f ddR = tmpdxdy + tmpdxdy; + + for (int i = 0; i < (count >> 2); ++i) { + Sk4f dist = Sk4f::Min(fast_sqrt(R), max); + R += dR; + dR += ddR; + + int fi[4]; + dist.castTrunc().store(fi); + + for (int i = 0; i < 4; i++) { + *dstC++ = cache[toggle + fi[i]]; + toggle = next_dither_toggle(toggle); + } + } + count &= 3; + if (count) { + Sk4f dist = Sk4f::Min(fast_sqrt(R), max); + + int fi[4]; + dist.castTrunc().store(fi); + for (int i = 0; i < count; i++) { + *dstC++ = cache[toggle + fi[i]]; + toggle = next_dither_toggle(toggle); + } + } + } +} + // Unrolling this loop doesn't seem to help (when float); we're stalling to // get the results of the sqrt (?), and don't have enough extra registers to // have many in flight. 
@@ -407,6 +475,11 @@ void shadeSpan_radial_repeat(SkScalar fx, SkScalar dx, SkScalar fy, SkScalar dy, void SkRadialGradient::RadialGradientContext::shadeSpan(int x, int y, SkPMColor* SK_RESTRICT dstC, int count) { +#ifdef SK_SUPPORT_LEGACY_RADIAL_GRADIENT_SQRT + const bool use_new_proc = false; +#else + const bool use_new_proc = true; +#endif SkASSERT(count > 0); const SkRadialGradient& radialGradient = static_cast<const SkRadialGradient&>(fShader); @@ -435,7 +508,7 @@ void SkRadialGradient::RadialGradientContext::shadeSpan(int x, int y, RadialShadeProc shadeProc = shadeSpan_radial_repeat; if (SkShader::kClamp_TileMode == radialGradient.fTileMode) { - shadeProc = shadeSpan_radial_clamp; + shadeProc = use_new_proc ? shadeSpan_radial_clamp2 : shadeSpan_radial_clamp; } else if (SkShader::kMirror_TileMode == radialGradient.fTileMode) { shadeProc = shadeSpan_radial_mirror; } else { diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h index 04db8781bc..1da86ab7bf 100644 --- a/src/opts/SkNx_neon.h +++ b/src/opts/SkNx_neon.h @@ -181,6 +181,48 @@ private: #endif//defined(SK_CPU_ARM64) template <> +class SkNi<4, int> { +public: + SkNi(const int32x4_t& vec) : fVec(vec) {} + + SkNi() {} + explicit SkNi(int val) : fVec(vdupq_n_s32(val)) {} + static SkNi Load(const int vals[4]) { return vld1q_s32(vals); } + SkNi(int a, int b, int c, int d) { fVec = (int32x4_t) { a, b, c, d }; } + + void store(int vals[4]) const { vst1q_s32(vals, fVec); } + + SkNi operator + (const SkNi& o) const { return vaddq_s32(fVec, o.fVec); } + SkNi operator - (const SkNi& o) const { return vsubq_s32(fVec, o.fVec); } + SkNi operator * (const SkNi& o) const { return vmulq_s32(fVec, o.fVec); } + + // Well, this is absurd. The shifts require compile-time constant arguments. 
+#define SHIFT(op, v, bits) switch(bits) { \ + case 1: return op(v, 1); case 2: return op(v, 2); case 3: return op(v, 3); \ + case 4: return op(v, 4); case 5: return op(v, 5); case 6: return op(v, 6); \ + case 7: return op(v, 7); case 8: return op(v, 8); case 9: return op(v, 9); \ + case 10: return op(v, 10); case 11: return op(v, 11); case 12: return op(v, 12); \ + case 13: return op(v, 13); case 14: return op(v, 14); case 15: return op(v, 15); \ + case 16: return op(v, 16); case 17: return op(v, 17); case 18: return op(v, 18); \ + case 19: return op(v, 19); case 20: return op(v, 20); case 21: return op(v, 21); \ + case 22: return op(v, 22); case 23: return op(v, 23); case 24: return op(v, 24); \ + case 25: return op(v, 25); case 26: return op(v, 26); case 27: return op(v, 27); \ + case 28: return op(v, 28); case 29: return op(v, 29); case 30: return op(v, 30); \ + case 31: return op(v, 31); } return fVec + + SkNi operator << (int bits) const { SHIFT(vshlq_n_s32, fVec, bits); } + SkNi operator >> (int bits) const { SHIFT(vshrq_n_s32, fVec, bits); } +#undef SHIFT + + template <int k> int kth() const { + SkASSERT(0 <= k && k < 4); + return vgetq_lane_s32(fVec, k); + } +protected: + int32x4_t fVec; +}; + +template <> class SkNf<4, float> { typedef SkNb<4, 4> Nb; public: @@ -193,6 +235,8 @@ public: void store(float vals[4]) const { vst1q_f32(vals, fVec); } + SkNi<4, int> castTrunc() const { return vcvtq_s32_f32(fVec); } + SkNf approxInvert() const { float32x4_t est0 = vrecpeq_f32(fVec), est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0); diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h index 46ddcb2d12..2608525bd0 100644 --- a/src/opts/SkNx_sse.h +++ b/src/opts/SkNx_sse.h @@ -142,6 +142,44 @@ private: }; template <> +class SkNi<4, int> { +public: + SkNi(const __m128i& vec) : fVec(vec) {} + + SkNi() {} + explicit SkNi(int val) : fVec(_mm_set1_epi32(val)) {} + static SkNi Load(const int vals[4]) { return _mm_loadu_si128((const __m128i*)vals); } + SkNi(int a, int b, 
int c, int d) : fVec(_mm_setr_epi32(a,b,c,d)) {} + + void store(int vals[4]) const { _mm_storeu_si128((__m128i*)vals, fVec); } + + SkNi operator + (const SkNi& o) const { return _mm_add_epi32(fVec, o.fVec); } + SkNi operator - (const SkNi& o) const { return _mm_sub_epi32(fVec, o.fVec); } + SkNi operator * (const SkNi& o) const { + __m128i mul20 = _mm_mul_epu32(fVec, o.fVec), + mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4)); + return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)), + _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0))); + } + + SkNi operator << (int bits) const { return _mm_slli_epi32(fVec, bits); } + SkNi operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); } + + template <int k> int kth() const { + SkASSERT(0 <= k && k < 4); + switch (k) { + case 0: return _mm_cvtsi128_si32(fVec); + case 1: return _mm_cvtsi128_si32(_mm_srli_si128(fVec, 4)); + case 2: return _mm_cvtsi128_si32(_mm_srli_si128(fVec, 8)); + case 3: return _mm_cvtsi128_si32(_mm_srli_si128(fVec, 12)); + default: SkASSERT(false); return 0; + } + } +protected: + __m128i fVec; +}; + +template <> class SkNf<4, float> { typedef SkNb<4, 4> Nb; public: @@ -154,6 +192,8 @@ public: void store(float vals[4]) const { _mm_storeu_ps(vals, fVec); } + SkNi<4, int> castTrunc() const { return _mm_cvttps_epi32(fVec); } + SkNf operator + (const SkNf& o) const { return _mm_add_ps(fVec, o.fVec); } SkNf operator - (const SkNf& o) const { return _mm_sub_ps(fVec, o.fVec); } SkNf operator * (const SkNf& o) const { return _mm_mul_ps(fVec, o.fVec); } diff --git a/tests/SkNxTest.cpp b/tests/SkNxTest.cpp index af6918e38c..25ece3867b 100644 --- a/tests/SkNxTest.cpp +++ b/tests/SkNxTest.cpp @@ -89,6 +89,12 @@ void test_Ni(skiatest::Reporter* r) { case 4: REPORTER_ASSERT(r, vals[2] == c && vals[3] == d); case 2: REPORTER_ASSERT(r, vals[0] == a && vals[1] == b); } + switch (N) { + case 8: REPORTER_ASSERT(r, v.template kth<4>() == e && v.template kth<5>() == f && + 
v.template kth<6>() == g && v.template kth<7>() == h); + case 4: REPORTER_ASSERT(r, v.template kth<2>() == c && v.template kth<3>() == d); + case 2: REPORTER_ASSERT(r, v.template kth<0>() == a && v.template kth<1>() == b); + } }; T vals[] = { 1,2,3,4,5,6,7,8 }; @@ -117,4 +123,8 @@ DEF_TEST(SkNi, r) { test_Ni<2, uint16_t>(r); test_Ni<4, uint16_t>(r); test_Ni<8, uint16_t>(r); + + test_Ni<2, int>(r); + test_Ni<4, int>(r); + test_Ni<8, int>(r); } |