diff options
author | 2015-05-13 14:08:45 -0700 | |
---|---|---|
committer | 2015-05-13 14:08:45 -0700 | |
commit | 2d8d33e9e825f9919875be64a71b746189b385be (patch) | |
tree | fd07661c53e4e6a796a43bdf4bc3b433880d1caf /src/core | |
parent | 177cb8498662600e9866e260d52d6a2401ad78a8 (diff) |
Sk4px: SrcATop, DstATop, Xor, Multiply
SSE runs 2-3x faster than the 4f code; NEON runs 1.2-1.4x faster than the existing NEON code.
Small diffs on {aarectmodes, imagefilters_xfermodes, hairmodes, mixed_xfermodes}, only on AA edges, due to the drop in precision.
BUG=skia:
Review URL: https://codereview.chromium.org/1132853005
Diffstat (limited to 'src/core')
-rw-r--r-- | src/core/Sk4px.h | 17 | ||||
-rw-r--r-- | src/core/SkXfermode.cpp | 29 |
2 files changed, 33 insertions, 13 deletions
diff --git a/src/core/Sk4px.h b/src/core/Sk4px.h index af078ca92b..5537b90230 100644 --- a/src/core/Sk4px.h +++ b/src/core/Sk4px.h @@ -14,21 +14,22 @@ // 1, 2 or 4 SkPMColors, generally vectorized. class Sk4px : public Sk16b { public: - Sk4px(SkAlpha a) : INHERITED(a) {} // Duplicate 16x. - Sk4px(SkPMColor); // Duplicate 4x. + Sk4px(SkAlpha a) : INHERITED(a) {} // Duplicate 16x: a -> aaaa aaaa aaaa aaaa + Sk4px(SkPMColor); // Duplicate 4x: argb -> argb argb argb argb Sk4px(const Sk16b& v) : INHERITED(v) {} - // ARGB argb XYZW xyzw -> AAAA aaaa XXXX xxxx - Sk4px alphas() const; + Sk4px alphas() const; // ARGB argb XYZW xyzw -> AAAA aaaa XXXX xxxx + + Sk4px inv() const { return Sk16b(255) - *this; } // When loading or storing fewer than 4 SkPMColors, we use the low lanes. - static Sk4px Load4(const SkPMColor[4]); - static Sk4px Load2(const SkPMColor[2]); - static Sk4px Load1(const SkPMColor[1]); + static Sk4px Load4(const SkPMColor[4]); // PMColor[4] -> ARGB argb XYZW xyzw + static Sk4px Load2(const SkPMColor[2]); // PMColor[2] -> ARGB argb ???? ???? + static Sk4px Load1(const SkPMColor[1]); // PMColor[1] -> ARGB ???? ???? ???? // Ditto for Alphas... Load2Alphas fills the low two lanes of Sk4px. static Sk4px Load4Alphas(const SkAlpha[4]); // AaXx -> AAAA aaaa XXXX xxxx - static Sk4px Load2Alphas(const SkAlpha[2]); // Aa -> AAAA aaaa 0000 0000 + static Sk4px Load2Alphas(const SkAlpha[2]); // Aa -> AAAA aaaa ???? ???? 
void store4(SkPMColor[4]) const; void store2(SkPMColor[2]) const; diff --git a/src/core/SkXfermode.cpp b/src/core/SkXfermode.cpp index a2ab65b0a3..2abe55a8ca 100644 --- a/src/core/SkXfermode.cpp +++ b/src/core/SkXfermode.cpp @@ -1238,6 +1238,10 @@ struct SrcATop4f { const Sk4f inv255(gInv255); return check_as_pmfloat(dst + (src * Sk4f(dst.a()) - dst * Sk4f(src.a())) * inv255); } + static Sk4px Xfer(const Sk4px& src, const Sk4px& dst) { + return Sk4px::Wide(src.mulWiden(dst.alphas()) + dst.mulWiden(src.alphas().inv())) + .div255RoundNarrow(); + } static const bool kFoldCoverageIntoSrcAlpha = true; static const SkXfermode::Mode kMode = SkXfermode::kSrcATop_Mode; }; @@ -1245,8 +1249,10 @@ struct SrcATop4f { // kDstATop_Mode, //!< [Sa, Sa * Dc + Sc * (1 - Da)] struct DstATop4f { static SkPMFloat Xfer(const SkPMFloat& src, const SkPMFloat& dst) { - const Sk4f inv255(gInv255); - return check_as_pmfloat(src + (dst * Sk4f(src.a()) - src * Sk4f(dst.a())) * inv255); + return SrcATop4f::Xfer(dst, src); + } + static Sk4px Xfer(const Sk4px& src, const Sk4px& dst) { + return SrcATop4f::Xfer(dst, src); } static const bool kFoldCoverageIntoSrcAlpha = false; static const SkXfermode::Mode kMode = SkXfermode::kDstATop_Mode; @@ -1258,6 +1264,10 @@ struct Xor4f { const Sk4f inv255(gInv255); return check_as_pmfloat(src + dst - (src * Sk4f(dst.a()) + dst * Sk4f(src.a())) * inv255); } + static Sk4px Xfer(const Sk4px& src, const Sk4px& dst) { + return Sk4px::Wide(src.mulWiden(dst.alphas().inv()) + dst.mulWiden(src.alphas().inv())) + .div255RoundNarrow(); + } static const bool kFoldCoverageIntoSrcAlpha = true; static const SkXfermode::Mode kMode = SkXfermode::kXor_Mode; }; @@ -1295,9 +1305,8 @@ struct Screen4f { } static Sk4px Xfer(const Sk4px& src, const Sk4px& dst) { // Doing the math as S + (1-S)*D or S + (D - S*D) means the add and subtract can be done - // in 8-bit space without overflow. S + (1-S)*D is a touch faster because 255-x is an xor. 
- // TODO: do we need to explicitly implement / call Sk16b(255) ^ src ? - return src + Sk4px(Sk16b(255) - src).mulWiden(dst).div255RoundNarrow(); + // in 8-bit space without overflow. S + (1-S)*D is a touch faster because inv() is cheap. + return src + src.inv().mulWiden(dst).div255RoundNarrow(); } static const bool kFoldCoverageIntoSrcAlpha = true; static const SkXfermode::Mode kMode = SkXfermode::kScreen_Mode; @@ -1314,6 +1323,12 @@ struct Multiply4f { // ra = srcover(sa, da), but the calc for rc happens to accomplish this for us return check_as_pmfloat(clamp_0_255(rc)); } + static Sk4px Xfer(const Sk4px& src, const Sk4px& dst) { + return Sk4px::Wide(src.mulWiden(dst.alphas().inv()) + + dst.mulWiden(src.alphas().inv()) + + src.mulWiden(dst)) + .div255RoundNarrow(); + } static const bool kFoldCoverageIntoSrcAlpha = false; static const SkXfermode::Mode kMode = SkXfermode::kMultiply_Mode; }; @@ -1472,9 +1487,13 @@ SkXfermode* create_mode(int iMode) { #if defined(SK_4PX_XFERMODES_ARE_FAST) && !defined(SK_PREFER_LEGACY_FLOAT_XFERMODES) switch (mode) { + case SkXfermode::kSrcATop_Mode: return SkT4pxXfermode<SrcATop4f>::Create(rec); + case SkXfermode::kDstATop_Mode: return SkT4pxXfermode<DstATop4f>::Create(rec); + case SkXfermode::kXor_Mode: return SkT4pxXfermode<Xor4f>::Create(rec); case SkXfermode::kPlus_Mode: return SkT4pxXfermode<Plus4f>::Create(rec); case SkXfermode::kModulate_Mode: return SkT4pxXfermode<Modulate4f>::Create(rec); case SkXfermode::kScreen_Mode: return SkT4pxXfermode<Screen4f>::Create(rec); + case SkXfermode::kMultiply_Mode: return SkT4pxXfermode<Multiply4f>::Create(rec); default: break; } #endif |