aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/core
diff options
context:
space:
mode:
author: mtklein <mtklein@chromium.org> 2015-05-13 14:08:45 -0700
committer: Commit bot <commit-bot@chromium.org> 2015-05-13 14:08:45 -0700
commit: 2d8d33e9e825f9919875be64a71b746189b385be (patch)
tree: fd07661c53e4e6a796a43bdf4bc3b433880d1caf /src/core
parent: 177cb8498662600e9866e260d52d6a2401ad78a8 (diff)
Sk4px: SrcATop, DstATop, Xor, Multiply
SSE runs 2-3x faster (than 4f), NEON runs 1.2-1.4x faster (than existing NEON).
Small diffs on {aarectmodes, imagefilters_xfermodes, hairmodes, mixed_xfermodes} only on AA edges due to precision drop.
BUG=skia:
Review URL: https://codereview.chromium.org/1132853005
Diffstat (limited to 'src/core')
-rw-r--r-- src/core/Sk4px.h 17
-rw-r--r-- src/core/SkXfermode.cpp 29
2 files changed, 33 insertions, 13 deletions
diff --git a/src/core/Sk4px.h b/src/core/Sk4px.h
index af078ca92b..5537b90230 100644
--- a/src/core/Sk4px.h
+++ b/src/core/Sk4px.h
@@ -14,21 +14,22 @@
// 1, 2 or 4 SkPMColors, generally vectorized.
class Sk4px : public Sk16b {
public:
- Sk4px(SkAlpha a) : INHERITED(a) {} // Duplicate 16x.
- Sk4px(SkPMColor); // Duplicate 4x.
+ Sk4px(SkAlpha a) : INHERITED(a) {} // Duplicate 16x: a -> aaaa aaaa aaaa aaaa
+ Sk4px(SkPMColor); // Duplicate 4x: argb -> argb argb argb argb
Sk4px(const Sk16b& v) : INHERITED(v) {}
- // ARGB argb XYZW xyzw -> AAAA aaaa XXXX xxxx
- Sk4px alphas() const;
+ Sk4px alphas() const; // ARGB argb XYZW xyzw -> AAAA aaaa XXXX xxxx
+
+ Sk4px inv() const { return Sk16b(255) - *this; }
// When loading or storing fewer than 4 SkPMColors, we use the low lanes.
- static Sk4px Load4(const SkPMColor[4]);
- static Sk4px Load2(const SkPMColor[2]);
- static Sk4px Load1(const SkPMColor[1]);
+ static Sk4px Load4(const SkPMColor[4]); // PMColor[4] -> ARGB argb XYZW xyzw
+ static Sk4px Load2(const SkPMColor[2]); // PMColor[2] -> ARGB argb ???? ????
+ static Sk4px Load1(const SkPMColor[1]); // PMColor[1] -> ARGB ???? ???? ????
// Ditto for Alphas... Load2Alphas fills the low two lanes of Sk4px.
static Sk4px Load4Alphas(const SkAlpha[4]); // AaXx -> AAAA aaaa XXXX xxxx
- static Sk4px Load2Alphas(const SkAlpha[2]); // Aa -> AAAA aaaa 0000 0000
+ static Sk4px Load2Alphas(const SkAlpha[2]); // Aa -> AAAA aaaa ???? ????
void store4(SkPMColor[4]) const;
void store2(SkPMColor[2]) const;
diff --git a/src/core/SkXfermode.cpp b/src/core/SkXfermode.cpp
index a2ab65b0a3..2abe55a8ca 100644
--- a/src/core/SkXfermode.cpp
+++ b/src/core/SkXfermode.cpp
@@ -1238,6 +1238,10 @@ struct SrcATop4f {
const Sk4f inv255(gInv255);
return check_as_pmfloat(dst + (src * Sk4f(dst.a()) - dst * Sk4f(src.a())) * inv255);
}
+ static Sk4px Xfer(const Sk4px& src, const Sk4px& dst) {
+ return Sk4px::Wide(src.mulWiden(dst.alphas()) + dst.mulWiden(src.alphas().inv()))
+ .div255RoundNarrow();
+ }
static const bool kFoldCoverageIntoSrcAlpha = true;
static const SkXfermode::Mode kMode = SkXfermode::kSrcATop_Mode;
};
@@ -1245,8 +1249,10 @@ struct SrcATop4f {
// kDstATop_Mode, //!< [Sa, Sa * Dc + Sc * (1 - Da)]
struct DstATop4f {
static SkPMFloat Xfer(const SkPMFloat& src, const SkPMFloat& dst) {
- const Sk4f inv255(gInv255);
- return check_as_pmfloat(src + (dst * Sk4f(src.a()) - src * Sk4f(dst.a())) * inv255);
+ return SrcATop4f::Xfer(dst, src);
+ }
+ static Sk4px Xfer(const Sk4px& src, const Sk4px& dst) {
+ return SrcATop4f::Xfer(dst, src);
}
static const bool kFoldCoverageIntoSrcAlpha = false;
static const SkXfermode::Mode kMode = SkXfermode::kDstATop_Mode;
@@ -1258,6 +1264,10 @@ struct Xor4f {
const Sk4f inv255(gInv255);
return check_as_pmfloat(src + dst - (src * Sk4f(dst.a()) + dst * Sk4f(src.a())) * inv255);
}
+ static Sk4px Xfer(const Sk4px& src, const Sk4px& dst) {
+ return Sk4px::Wide(src.mulWiden(dst.alphas().inv()) + dst.mulWiden(src.alphas().inv()))
+ .div255RoundNarrow();
+ }
static const bool kFoldCoverageIntoSrcAlpha = true;
static const SkXfermode::Mode kMode = SkXfermode::kXor_Mode;
};
@@ -1295,9 +1305,8 @@ struct Screen4f {
}
static Sk4px Xfer(const Sk4px& src, const Sk4px& dst) {
// Doing the math as S + (1-S)*D or S + (D - S*D) means the add and subtract can be done
- // in 8-bit space without overflow. S + (1-S)*D is a touch faster because 255-x is an xor.
- // TODO: do we need to explicitly implement / call Sk16b(255) ^ src ?
- return src + Sk4px(Sk16b(255) - src).mulWiden(dst).div255RoundNarrow();
+ // in 8-bit space without overflow. S + (1-S)*D is a touch faster because inv() is cheap.
+ return src + src.inv().mulWiden(dst).div255RoundNarrow();
}
static const bool kFoldCoverageIntoSrcAlpha = true;
static const SkXfermode::Mode kMode = SkXfermode::kScreen_Mode;
@@ -1314,6 +1323,12 @@ struct Multiply4f {
// ra = srcover(sa, da), but the calc for rc happens to accomplish this for us
return check_as_pmfloat(clamp_0_255(rc));
}
+ static Sk4px Xfer(const Sk4px& src, const Sk4px& dst) {
+ return Sk4px::Wide(src.mulWiden(dst.alphas().inv()) +
+ dst.mulWiden(src.alphas().inv()) +
+ src.mulWiden(dst))
+ .div255RoundNarrow();
+ }
static const bool kFoldCoverageIntoSrcAlpha = false;
static const SkXfermode::Mode kMode = SkXfermode::kMultiply_Mode;
};
@@ -1472,9 +1487,13 @@ SkXfermode* create_mode(int iMode) {
#if defined(SK_4PX_XFERMODES_ARE_FAST) && !defined(SK_PREFER_LEGACY_FLOAT_XFERMODES)
switch (mode) {
+ case SkXfermode::kSrcATop_Mode: return SkT4pxXfermode<SrcATop4f>::Create(rec);
+ case SkXfermode::kDstATop_Mode: return SkT4pxXfermode<DstATop4f>::Create(rec);
+ case SkXfermode::kXor_Mode: return SkT4pxXfermode<Xor4f>::Create(rec);
case SkXfermode::kPlus_Mode: return SkT4pxXfermode<Plus4f>::Create(rec);
case SkXfermode::kModulate_Mode: return SkT4pxXfermode<Modulate4f>::Create(rec);
case SkXfermode::kScreen_Mode: return SkT4pxXfermode<Screen4f>::Create(rec);
+ case SkXfermode::kMultiply_Mode: return SkT4pxXfermode<Multiply4f>::Create(rec);
default: break;
}
#endif