aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
author: mtklein <mtklein@chromium.org> 2015-06-26 10:46:31 -0700
committer: Commit bot <commit-bot@chromium.org> 2015-06-26 10:46:31 -0700
commit: 2aab22a58a366df4752c1cf0f004092c6e7be335 (patch)
tree: bc4026ca98f28068b99ca6394c05a0129f0dc4d6 /src
parent: cdb42bb55c3bdbbd6682dcd50b5c77322bb6e565 (diff)
Color dodge and burn with SkPMFloat.
Both 25-35% faster with SSE. With NEON, Burn measures as a ~10% regression, Dodge a huge 2.9x improvement. The Burn regression is somewhat artificial: we're drawing random colored rects onto an opaque white dst, so we're heavily biased toward the (d==da) fast path in the serial code. In the vector code there's no short-circuiting and we always pay a fixed cost for ColorBurn regardless of src or dst content. Dodge's fast paths, in contrast, only trigger when (s==sa) or (d==0), neither of which happens any more than randomly in our benchmark. I don't think (d==0) should happen at all. Similarly, the (s==0) Burn fast path is really only going to happen as often as SkRandom allows. In practice, the existing Burn benchmark is hitting its fast path 100% of the time. So I actually feel really great that this only dings the benchmark by 10%. Chrome's still guarded by SK_SUPPORT_LEGACY_XFERMODES, which I'll lift after finishing the last xfermode, SoftLight. BUG=skia: Review URL: https://codereview.chromium.org/1214443002
Diffstat (limited to 'src')
-rw-r--r-- src/core/Sk4pxXfermode.h 75
-rw-r--r-- src/core/SkPMFloat.h 2
-rw-r--r-- src/opts/SkNx_neon.h 7
-rw-r--r-- src/opts/SkNx_sse.h 5
-rw-r--r-- src/opts/SkPMFloat_neon.h 5
-rw-r--r-- src/opts/SkPMFloat_none.h 4
-rw-r--r-- src/opts/SkPMFloat_sse.h 5
-rw-r--r-- src/opts/SkXfermode_opts_SSE2.cpp 6
8 files changed, 105 insertions, 4 deletions
diff --git a/src/core/Sk4pxXfermode.h b/src/core/Sk4pxXfermode.h
index 09490dc990..b587183046 100644
--- a/src/core/Sk4pxXfermode.h
+++ b/src/core/Sk4pxXfermode.h
@@ -9,11 +9,13 @@
#define Sk4pxXfermode_DEFINED
#include "Sk4px.h"
+#include "SkPMFloat.h"
// This file is possibly included into multiple .cpp files.
// Each gets its own independent instantiation by wrapping in an anonymous namespace.
namespace {
+// Most xfermodes can be done most efficiently 4 pixels at a time in 8 or 16-bit fixed point.
#define XFERMODE(Name) \
struct Name { \
static Sk4px Xfer(const Sk4px&, const Sk4px&); \
@@ -97,7 +99,48 @@ XFERMODE(Lighten) {
colors = (sda < dsa).thenElse(dstover, srcover);
return alphas.zeroColors() + colors.zeroAlphas();
}
+#undef XFERMODE
+
+// Some xfermodes use math like divide or sqrt that's best done in floats 1 pixel at a time.  XFERMODE(Name) declares struct Name with a static Xfer(s, d) on SkPMFloat and a kMode constant naming the matching SkXfermode mode, then opens the definition of Name::Xfer; the braces that follow each use of the macro supply its body.
+#define XFERMODE(Name) \
+ struct Name { \
+ static SkPMFloat Xfer(const SkPMFloat&, const SkPMFloat&); \
+ static const SkXfermode::Mode kMode = SkXfermode::k##Name##_Mode; \
+ }; \
+ inline SkPMFloat Name::Xfer(const SkPMFloat& s, const SkPMFloat& d)
+
+XFERMODE(ColorDodge) {  // Color-dodge blend of one source pixel s with one destination pixel d, computed in float.
+ auto sa = s.alphas(),  // source alpha copied into every lane
+ da = d.alphas(),  // destination alpha copied into every lane
+ isa = Sk4f(1)-sa,  // 1 - sa
+ ida = Sk4f(1)-da;  // 1 - da
+ auto srcover = s + d*isa,  // chosen where s == sa; also supplies the result's alpha lane
+ dstover = d + s*ida,  // chosen where d == 0
+ otherwise = sa * Sk4f::Min(da, (d*sa)*(sa-s).approxInvert()) + s*ida + d*isa;  // general case: sa*min(da, d*sa/(sa-s)) + s*ida + d*isa; approxInvert() is presumably an approximate reciprocal -- confirm. Lanes with s == sa have a zero denominator here but are replaced by the select below.
+
+ // Order matters here, preferring d==0 over s==sa.  All three candidates are computed unconditionally above; thenElse() then picks one per lane.
+ auto colors = (d == Sk4f(0)).thenElse(dstover,
+ (s == sa).thenElse(srcover,
+ otherwise));
+ return srcover * SkPMFloat(1,0,0,0) + colors * SkPMFloat(0,1,1,1);  // alpha lane from srcover, color lanes from colors (SkPMFloat arguments are a,r,g,b)
+}
+XFERMODE(ColorBurn) {  // Color-burn blend of one source pixel s with one destination pixel d, computed in float.
+ auto sa = s.alphas(),  // source alpha copied into every lane
+ da = d.alphas(),  // destination alpha copied into every lane
+ isa = Sk4f(1)-sa,  // 1 - sa
+ ida = Sk4f(1)-da;  // 1 - da
+
+ auto srcover = s + d*isa,  // chosen where s == 0; also supplies the result's alpha lane
+ dstover = d + s*ida,  // chosen where d == da
+ otherwise = sa*(da-Sk4f::Min(da, (da-d)*sa*s.approxInvert())) + s*ida + d*isa;  // general case: sa*(da - min(da, (da-d)*sa/s)) + s*ida + d*isa; approxInvert() is presumably an approximate reciprocal -- confirm. Lanes with s == 0 have a zero denominator here but are replaced by the select below.
+
+ // Order matters here, preferring d==da over s==0.  All three candidates are computed unconditionally above; thenElse() then picks one per lane.
+ auto colors = (d == da).thenElse(dstover,
+ (s == Sk4f(0)).thenElse(srcover,
+ otherwise));
+ return srcover * SkPMFloat(1,0,0,0) + colors * SkPMFloat(0,1,1,1);  // alpha lane from srcover, color lanes from colors (SkPMFloat arguments are a,r,g,b)
+}
#undef XFERMODE
// A reasonable fallback mode for doing AA is to simply apply the transfermode first,
@@ -140,7 +183,34 @@ public:
}
private:
- SkT4pxXfermode(const ProcCoeff& rec) : SkProcCoeffXfermode(rec, ProcType::kMode) {}
+ SkT4pxXfermode(const ProcCoeff& rec) : INHERITED(rec, ProcType::kMode) {}
+
+ typedef SkProcCoeffXfermode INHERITED;
+};
+
+template <typename ProcType>
+class SkTPMFloatXfermode : public SkProcCoeffXfermode {  // Xfermode that applies ProcType::Xfer one pixel at a time in float; ProcType also supplies kMode.
+public:
+ static SkProcCoeffXfermode* Create(const ProcCoeff& rec) {  // Factory: builds a new instance through SkNEW_ARGS. NOTE(review): ownership presumably passes to the caller -- confirm.
+ return SkNEW_ARGS(SkTPMFloatXfermode, (rec));
+ }
+
+ void xfer32(SkPMColor dst[], const SkPMColor src[], int n, const SkAlpha aa[]) const override {  // Blends src[0..n) into dst[0..n) in place; aa may be null, in which case no per-pixel aa is applied.
+ for (int i = 0; i < n; i++) {
+ SkPMFloat s(src[i]),
+ d(dst[i]),
+ b(ProcType::Xfer(s,d));  // b is the blended pixel before any aa is applied
+ if (aa) {
+ // We do aa in full float precision before going back down to bytes, because we can!
+ SkPMFloat a = Sk4f(aa[i]) * Sk4f(1.0f/255);  // aa[i] scaled by 1/255, the same value in every lane
+ b = b*a + d*(Sk4f(1)-a);  // interpolate from the original dst toward the blended pixel by a
+ }
+ dst[i] = b.round();  // convert back to a packed SkPMColor and store
+ }
+ }
+
+private:
+ SkTPMFloatXfermode(const ProcCoeff& rec) : INHERITED(rec, ProcType::kMode) {}  // private: instances are made through Create()
typedef SkProcCoeffXfermode INHERITED;
};
@@ -171,6 +241,9 @@ static SkProcCoeffXfermode* SkCreate4pxXfermode(const ProcCoeff& rec, SkXfermode
case SkXfermode::kOverlay_Mode: return SkT4pxXfermode<Overlay>::Create(rec);
case SkXfermode::kDarken_Mode: return SkT4pxXfermode<Darken>::Create(rec);
case SkXfermode::kLighten_Mode: return SkT4pxXfermode<Lighten>::Create(rec);
+
+ case SkXfermode::kColorDodge_Mode: return SkTPMFloatXfermode<ColorDodge>::Create(rec);
+ case SkXfermode::kColorBurn_Mode: return SkTPMFloatXfermode<ColorBurn>::Create(rec);
#endif
default: break;
}
diff --git a/src/core/SkPMFloat.h b/src/core/SkPMFloat.h
index f1d302458d..f97f25c9c6 100644
--- a/src/core/SkPMFloat.h
+++ b/src/core/SkPMFloat.h
@@ -27,6 +27,8 @@ public:
static SkPMFloat FromPMColor(SkPMColor c) { return SkPMFloat(c); }
static SkPMFloat FromARGB(float a, float r, float g, float b) { return SkPMFloat(a,r,g,b); }
+ Sk4f alphas() const; // argb -> aaaa, generally faster than the equivalent Sk4f(this->a()).
+
// Uninitialized.
SkPMFloat() {}
explicit SkPMFloat(SkPMColor);
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index b319807779..ccba163e56 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -297,6 +297,13 @@ public:
|| vgetq_lane_u32(v,2) || vgetq_lane_u32(v,3);
}
+ SkNf thenElse(const SkNf& t, const SkNf& e) const {  // Bitwise select using this vector as the mask: bits of t where the mask is set, bits of e where it is clear.
+ uint32x4_t ci = vreinterpretq_u32_f32(fVec),  // view the float lanes as raw 32-bit integers so they can be and/or'ed
+ ti = vreinterpretq_u32_f32(t.fVec),
+ ei = vreinterpretq_u32_f32(e.fVec);
+ return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(ti, ci), vbicq_u32(ei, ci)));  // (t & c) | (e & ~c); assumes each lane of c is all-ones or all-zeros, e.g. a comparison result -- confirm
+ }
+
float32x4_t fVec;
};
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index 9b4de700ee..2d3acbe459 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -193,6 +193,11 @@ public:
bool allTrue() const { return 0xffff == _mm_movemask_epi8(_mm_castps_si128(fVec)); }
bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(_mm_castps_si128(fVec)); }
+ SkNf thenElse(const SkNf& t, const SkNf& e) const {  // Bitwise select using this vector as the mask: bits of t where the mask is set, bits of e where it is clear.
+ return _mm_or_ps(_mm_and_ps (fVec, t.fVec),  // mask & t
+ _mm_andnot_ps(fVec, e.fVec));  // ~mask & e; assumes each mask lane is all-ones or all-zeros, e.g. a comparison result -- confirm
+ }
+
__m128 fVec;
};
diff --git a/src/opts/SkPMFloat_neon.h b/src/opts/SkPMFloat_neon.h
index c8976860b1..8bee5b551a 100644
--- a/src/opts/SkPMFloat_neon.h
+++ b/src/opts/SkPMFloat_neon.h
@@ -27,4 +27,9 @@ inline SkPMColor SkPMFloat::round() const {
return c;
}
+inline Sk4f SkPMFloat::alphas() const {  // NEON version: returns this pixel's alpha copied into all four lanes.
+ static_assert(SK_A32_SHIFT == 24, "Assuming little-endian.");  // the lane-3 read below relies on this channel layout
+ return vdupq_lane_f32(vget_high_f32(fVec), 1); // Duplicate high lane of high half i.e. lane 3.
+}
+
} // namespace
diff --git a/src/opts/SkPMFloat_none.h b/src/opts/SkPMFloat_none.h
index ba773564d6..518ad159ff 100644
--- a/src/opts/SkPMFloat_none.h
+++ b/src/opts/SkPMFloat_none.h
@@ -30,4 +30,8 @@ inline SkPMColor SkPMFloat::round() const {
return c;
}
+inline Sk4f SkPMFloat::alphas() const {  // Portable version: returns this pixel's alpha copied into all four lanes.
+ return Sk4f(this->a());  // single-value Sk4f constructor puts a() in every lane
+}
+
} // namespace
diff --git a/src/opts/SkPMFloat_sse.h b/src/opts/SkPMFloat_sse.h
index 802b17ba0c..28aa90bf29 100644
--- a/src/opts/SkPMFloat_sse.h
+++ b/src/opts/SkPMFloat_sse.h
@@ -33,4 +33,9 @@ inline SkPMColor SkPMFloat::round() const {
return c;
}
+inline Sk4f SkPMFloat::alphas() const {  // SSE version: returns this pixel's alpha copied into all four lanes.
+ static_assert(SK_A32_SHIFT == 24, "");  // the lane-3 read below relies on this channel layout
+ return _mm_shuffle_ps(fVec, fVec, 0xff); // Read as 11 11 11 11, copying lane 3 to all lanes.
+}
+
} // namespace
diff --git a/src/opts/SkXfermode_opts_SSE2.cpp b/src/opts/SkXfermode_opts_SSE2.cpp
index f8772808a7..ca26263727 100644
--- a/src/opts/SkXfermode_opts_SSE2.cpp
+++ b/src/opts/SkXfermode_opts_SSE2.cpp
@@ -521,11 +521,11 @@ SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec,
case SkProcCoeffXfermode::kDarken_Mode: proc = darken_modeproc_SSE2; break;
case SkProcCoeffXfermode::kLighten_Mode: proc = lighten_modeproc_SSE2; break;
case SkProcCoeffXfermode::kHardLight_Mode: proc = hardlight_modeproc_SSE2; break;
-
- // TODO(mtklein): implement these with SkPMFloat.
- case SkProcCoeffXfermode::kSoftLight_Mode: proc = softlight_modeproc_SSE2; break;
case SkProcCoeffXfermode::kColorDodge_Mode: proc = colordodge_modeproc_SSE2; break;
case SkProcCoeffXfermode::kColorBurn_Mode: proc = colorburn_modeproc_SSE2; break;
+
+ // TODO(mtklein): implement this with SkPMFloat.
+ case SkProcCoeffXfermode::kSoftLight_Mode: proc = softlight_modeproc_SSE2; break;
default: break;
}
return proc ? SkNEW_ARGS(SkSSE2ProcCoeffXfermode, (rec, mode, (void*)proc)) : nullptr;