diff options
author | mtklein <mtklein@chromium.org> | 2015-07-14 10:54:19 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2015-07-14 10:54:19 -0700 |
commit | 4be181e304d2b280c6801bd13369cfba236d1a66 (patch) | |
tree | ae0510f8a6504c3333582fa004e961a8771a2d99 /src | |
parent | a5517e2b190a8083b38964972b031c13e99f1012 (diff) |
3-15% speedup to HardLight / Overlay xfermodes.
While investigating my bug (skia:4052) I saw this TODO and figured
it'd make me feel better about an otherwise unsuccessful investigation.
This speeds up HardLight and Overlay (same code) by about 15% with SSE, mostly
by rewriting the logic from 1 cheap comparison and 2 expensive div255() calls
to 2 cheap comparisons and 1 expensive div255().
NEON speeds up by a more modest ~3%.
BUG=skia:
Review URL: https://codereview.chromium.org/1230663005
Diffstat (limited to 'src')
-rw-r--r-- | src/core/Sk4px.h | 2 | ||||
-rw-r--r-- | src/core/Sk4pxXfermode.h | 6 | ||||
-rw-r--r-- | src/opts/Sk4px_NEON.h | 6 | ||||
-rw-r--r-- | src/opts/Sk4px_SSE2.h | 5 | ||||
-rw-r--r-- | src/opts/Sk4px_none.h | 2 | ||||
-rw-r--r-- | src/opts/SkNx_neon.h | 5 | ||||
-rw-r--r-- | src/opts/SkNx_sse.h | 5 |
7 files changed, 27 insertions, 4 deletions
diff --git a/src/core/Sk4px.h b/src/core/Sk4px.h index e046e265fe..e1d4dc1244 100644 --- a/src/core/Sk4px.h +++ b/src/core/Sk4px.h @@ -70,6 +70,7 @@ public: Wide operator >> (int bits) const { return INHERITED::operator>>(bits); } Wide operator << (int bits) const { return INHERITED::operator<<(bits); } static Wide Min(const Wide& a, const Wide& b) { return INHERITED::Min(a,b); } + Wide thenElse(const Wide& t, const Wide& e) const { return INHERITED::thenElse(t,e); } private: typedef Sk16h INHERITED; @@ -77,6 +78,7 @@ public: Wide widenLo() const; // ARGB -> 0A 0R 0G 0B Wide widenHi() const; // ARGB -> A0 R0 G0 B0 + Wide widenLoHi() const; // ARGB -> AA RR GG BB Wide mulWiden(const Sk16b&) const; // 8-bit x 8-bit -> 16-bit components. // The only 8-bit multiply we use is 8-bit x 8-bit -> 16-bit. Might as well make it pithy. diff --git a/src/core/Sk4pxXfermode.h b/src/core/Sk4pxXfermode.h index 98b0bd901f..97321b7413 100644 --- a/src/core/Sk4pxXfermode.h +++ b/src/core/Sk4pxXfermode.h @@ -68,15 +68,13 @@ XFERMODE(HardLight) { auto sa = s.alphas(), da = d.alphas(); - auto isLite = (sa-s) < s; + auto isLite = ((sa-s) < s).widenLoHi(); auto dark = s*d << 1, lite = sa*da - ((da-d)*(sa-s) << 1), both = s*da.inv() + d*sa.inv(); - // TODO: do isLite in 16-bit so we only have to div255() once. - auto colors = isLite.thenElse((lite + both).div255(), - (dark + both).div255()); + auto colors = (both + isLite.thenElse(lite, dark)).div255(); return alphas.zeroColors() + colors.zeroAlphas(); } XFERMODE(Overlay) { return HardLight::Xfer(d,s); } diff --git a/src/opts/Sk4px_NEON.h b/src/opts/Sk4px_NEON.h index 9401864697..cd6dea9979 100644 --- a/src/opts/Sk4px_NEON.h +++ b/src/opts/Sk4px_NEON.h @@ -40,6 +40,12 @@ inline Sk4px::Wide Sk4px::widenHi() const { vshll_n_u8(vget_high_u8(this->fVec), 8)); } +inline Sk4px::Wide Sk4px::widenLoHi() const { + auto zipped = vzipq_u8(this->fVec, this->fVec); + return Sk16h((uint16x8_t)zipped.val[0], + (uint16x8_t)zipped.val[1]); +} + inline Sk4px::Wide Sk4px::mulWiden(const Sk16b& other) const { return Sk16h(vmull_u8(vget_low_u8 (this->fVec), vget_low_u8 (other.fVec)), vmull_u8(vget_high_u8(this->fVec), vget_high_u8(other.fVec))); diff --git a/src/opts/Sk4px_SSE2.h b/src/opts/Sk4px_SSE2.h index 74ccffc277..3809c5e47b 100644 --- a/src/opts/Sk4px_SSE2.h +++ b/src/opts/Sk4px_SSE2.h @@ -31,6 +31,11 @@ inline Sk4px::Wide Sk4px::widenHi() const { _mm_unpackhi_epi8(_mm_setzero_si128(), this->fVec)); } +inline Sk4px::Wide Sk4px::widenLoHi() const { + return Sk16h(_mm_unpacklo_epi8(this->fVec, this->fVec), + _mm_unpackhi_epi8(this->fVec, this->fVec)); +} + inline Sk4px::Wide Sk4px::mulWiden(const Sk16b& other) const { return this->widenLo() * Sk4px(other).widenLo(); } diff --git a/src/opts/Sk4px_none.h b/src/opts/Sk4px_none.h index ce2f8452e5..ba13e58fb5 100644 --- a/src/opts/Sk4px_none.h +++ b/src/opts/Sk4px_none.h @@ -48,6 +48,8 @@ inline Sk4px::Wide Sk4px::widenLo() const { inline Sk4px::Wide Sk4px::widenHi() const { return this->widenLo() << 8; } +inline Sk4px::Wide Sk4px::widenLoHi() const { return this->widenLo() + this->widenHi(); } + inline Sk4px::Wide Sk4px::mulWiden(const Sk16b& other) const { return this->widenLo() * Sk4px(other).widenLo(); } diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h index ccba163e56..1cae223b6c 100644 --- a/src/opts/SkNx_neon.h +++ b/src/opts/SkNx_neon.h @@ -337,6 +337,11 @@ public: return vgetq_lane_u16(fVec, k&7); } + SkNi thenElse(const SkNi& t, const SkNi& e) const { + return vorrq_u16(vandq_u16(t.fVec, fVec), + vbicq_u16(e.fVec, fVec)); + } + uint16x8_t fVec; }; diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h index 2d3acbe459..e165f58737 100644 --- a/src/opts/SkNx_sse.h +++ b/src/opts/SkNx_sse.h @@ -257,6 +257,11 @@ public: _mm_sub_epi8(b.fVec, top_8x))); } + SkNi thenElse(const SkNi& t, const SkNi& e) const { + return _mm_or_si128(_mm_and_si128 (fVec, t.fVec), + _mm_andnot_si128(fVec, e.fVec)); + } + template <int k> uint16_t kth() const { SkASSERT(0 <= k && k < 8); return _mm_extract_epi16(fVec, k); |