aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author mtklein <mtklein@chromium.org> 2015-05-18 07:03:01 -0700
committer Commit bot <commit-bot@chromium.org> 2015-05-18 07:03:01 -0700
commit 9b777967b1e531d0ebdb3349c4bd149fdb86589f (patch)
tree 1dc9b030c103e7df4e60f9ce948eb31585a4da96
parent d8b544cd04668f130e30a6a29eab8ec43b0bbb8b (diff)
sk4px the rest of the easy xfermodes.
Adds and uses fastMulDiv255Round() where possible, which approximates x*y/255 as (x*y+x)/256. Seems like a sizeable speedup, as seen below on Exclusion, Screen, and Modulate.

The existing NEON code uses this approximation for {Src,Dst}x{In,Out,Over}, and without it we'd regress speed there.

This will require rebaselines whether or not we use this approximation: the x86 bots change if we do, the ARM bots change if we don't. None of the diffs are significant.

Desktop:
Xfermode_Screen_aa     5.82ms -> 5.54ms  0.95x
Xfermode_Modulate_aa   5.67ms -> 5.36ms  0.95x
Xfermode_Exclusion_aa  6.18ms -> 5.81ms  0.94x
Xfermode_Exclusion     5.03ms -> 4.24ms  0.84x
Xfermode_Screen        4.51ms -> 3.59ms  0.8x
Xfermode_Modulate      4.2ms  -> 3.19ms  0.76x
Xfermode_DstOver       6.73ms -> 3.88ms  0.58x
Xfermode_SrcOut        6.47ms -> 3.48ms  0.54x
Xfermode_SrcIn         6.46ms -> 3.46ms  0.54x
Xfermode_DstOut        6.49ms -> 3.41ms  0.52x
Xfermode_DstIn         6.5ms  -> 3.32ms  0.51x
Xfermode_Src_aa        9.53ms -> 4.75ms  0.5x
Xfermode_Clear_aa      9.65ms -> 4.8ms   0.5x
Xfermode_DstIn_aa      11.5ms -> 5.57ms  0.49x
Xfermode_DstOver_aa    11.6ms -> 5.63ms  0.49x
Xfermode_SrcOut_aa     11.6ms -> 5.5ms   0.47x
Xfermode_SrcIn_aa      11.7ms -> 5.51ms  0.47x
Xfermode_DstOut_aa     11.7ms -> 5.4ms   0.46x

N7 performance is close enough to 1x that I'm not sure whether this is a net win, net loss, or truly neutral. I figure the bots will show that.

I experimented with another approximation, (x*(255-y))/255 ≈ (x*(256-y))/256. This was inconclusive, so I'm leaving it out for now.

The remaining modes are the complicated conditional ones.

BUG=skia:
Review URL: https://codereview.chromium.org/1141953004
-rw-r--r-- src/core/Sk4px.h 9
-rw-r--r-- src/core/SkXfermode.cpp 35
2 files changed, 41 insertions, 3 deletions
diff --git a/src/core/Sk4px.h b/src/core/Sk4px.h
index 028630d100..48e09e1c92 100644
--- a/src/core/Sk4px.h
+++ b/src/core/Sk4px.h
@@ -65,6 +65,15 @@ public:
return this->mulWiden(Sk16b(255));
}
+ // Generally faster than this->mulWiden(other).div255RoundNarrow().
+ // May be incorrect by +-1, but is always exactly correct when *this or other is 0 or 255.
+ Sk4px fastMulDiv255Round(const Sk16b& other) const {
+ // (x*y + x) / 256 meets these criteria. (As of course does (x*y + y) / 256 by symmetry.)
+ Sk4px::Wide x = this->widenLo(),
+ xy = this->mulWiden(other);
+ return x.addNarrowHi(xy);
+ }
+
// A generic driver that maps fn over a src array into a dst array.
// fn should take an Sk4px (4 src pixels) and return an Sk4px (4 dst pixels).
template <typename Fn>
diff --git a/src/core/SkXfermode.cpp b/src/core/SkXfermode.cpp
index 8824a8875f..4f0f9f3d4c 100644
--- a/src/core/SkXfermode.cpp
+++ b/src/core/SkXfermode.cpp
@@ -1232,6 +1232,25 @@ static inline SkPMFloat check_as_pmfloat(const Sk4f& value) {
return pm;
}
+#define XFERMODE(Name) \
+ struct Name { \
+ static Sk4px Xfer(const Sk4px&, const Sk4px&); \
+ static const SkXfermode::Mode kMode = SkXfermode::k##Name##_Mode; \
+ }; \
+ inline Sk4px Name::Xfer(const Sk4px& s, const Sk4px& d)
+
+XFERMODE(Clear) { return Sk4px((SkPMColor)0); }
+XFERMODE(Src) { return s; }
+XFERMODE(Dst) { return d; }
+XFERMODE(SrcIn) { return s.fastMulDiv255Round(d.alphas() ); }
+XFERMODE(SrcOut) { return s.fastMulDiv255Round(d.alphas().inv()); }
+XFERMODE(SrcOver) { return s + d.fastMulDiv255Round(s.alphas().inv()); }
+XFERMODE(DstIn) { return SrcIn ::Xfer(d,s); }
+XFERMODE(DstOut) { return SrcOut ::Xfer(d,s); }
+XFERMODE(DstOver) { return SrcOver::Xfer(d,s); }
+
+#undef XFERMODE
+
// kSrcATop_Mode, //!< [Da, Sc * Da + (1 - Sa) * Dc]
struct SrcATop4f {
static SkPMFloat Xfer(const SkPMFloat& src, const SkPMFloat& dst) {
@@ -1291,7 +1310,7 @@ struct Modulate4f {
return check_as_pmfloat(src * dst * inv255);
}
static Sk4px Xfer(const Sk4px& src, const Sk4px& dst) {
- return src.mulWiden(dst).div255RoundNarrow();
+ return src.fastMulDiv255Round(dst);
}
static const bool kFoldCoverageIntoSrcAlpha = false;
static const SkXfermode::Mode kMode = SkXfermode::kModulate_Mode;
@@ -1306,7 +1325,7 @@ struct Screen4f {
static Sk4px Xfer(const Sk4px& src, const Sk4px& dst) {
// Doing the math as S + (1-S)*D or S + (D - S*D) means the add and subtract can be done
// in 8-bit space without overflow. S + (1-S)*D is a touch faster because inv() is cheap.
- return src + src.inv().mulWiden(dst).div255RoundNarrow();
+ return src + dst.fastMulDiv255Round(src.inv());
}
static const bool kFoldCoverageIntoSrcAlpha = true;
static const SkXfermode::Mode kMode = SkXfermode::kScreen_Mode;
@@ -1366,7 +1385,7 @@ struct Exclusion4f {
return check_as_pmfloat(ra - prod * SkPMFloat(0, 1, 1, 1));
}
static Sk4px Xfer(const Sk4px& src, const Sk4px& dst) {
- auto p = src.mulWiden(dst).div255RoundNarrow();
+ auto p = src.fastMulDiv255Round(dst);
// There's no chance of underflow, and if we subtract p before adding src+dst, no overflow.
return (src - p) + (dst - p.zeroAlphas());
}
@@ -1500,6 +1519,16 @@ SkXfermode* create_mode(int iMode) {
#if defined(SK_4PX_XFERMODES_ARE_FAST) && !defined(SK_PREFER_LEGACY_FLOAT_XFERMODES)
switch (mode) {
+ case SkXfermode::kClear_Mode: return SkT4pxXfermode<Clear>::Create(rec);
+ case SkXfermode::kSrc_Mode: return SkT4pxXfermode<Src>::Create(rec);
+ case SkXfermode::kDst_Mode: return SkT4pxXfermode<Dst>::Create(rec);
+ case SkXfermode::kSrcOver_Mode: return SkT4pxXfermode<SrcOver>::Create(rec);
+ case SkXfermode::kDstOver_Mode: return SkT4pxXfermode<DstOver>::Create(rec);
+ case SkXfermode::kSrcIn_Mode: return SkT4pxXfermode<SrcIn>::Create(rec);
+ case SkXfermode::kDstIn_Mode: return SkT4pxXfermode<DstIn>::Create(rec);
+ case SkXfermode::kSrcOut_Mode: return SkT4pxXfermode<SrcOut>::Create(rec);
+ case SkXfermode::kDstOut_Mode: return SkT4pxXfermode<DstOut>::Create(rec);
+
case SkXfermode::kSrcATop_Mode: return SkT4pxXfermode<SrcATop4f>::Create(rec);
case SkXfermode::kDstATop_Mode: return SkT4pxXfermode<DstATop4f>::Create(rec);
case SkXfermode::kXor_Mode: return SkT4pxXfermode<Xor4f>::Create(rec);