aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/core
diff options
context:
space:
mode:
author: mtklein <mtklein@chromium.org> 2015-05-12 15:48:09 -0700
committer: Commit bot <commit-bot@chromium.org> 2015-05-12 15:48:09 -0700
commit: 6cbf18c70bf99f58b2bb1c49cdf8d41be561fee4 (patch)
tree: 9e634dfc2fc8a957b50c2bf5d830344ab106943c /src/core
parent: 8c9b6f1d71faeff9ae742425466a4ee58492d6d2 (diff)
Plus xfermode using Sk4px.
Xfermode_Plus runs 4-5x faster.

We expect mixed_xfermodes to have a small diff. This is because kFoldCoverageIntoSrcAlpha was incorrectly set to true.

This implementation handily beats the Sk4f impl, the portable impl, and the existing SSE2 impl. Reading the SkXfermodes_opts_SSE2.cpp file, I'm pretty confident that we'll be able to beat all SSE2 impls.

I believe this impl will beat or match the existing NEON impl too, but that may not be true for more complicated xfermodes. They can take advantage of transposing ARGBARGB... to AAAARRRR.... cheaply and I haven't figured out an abstraction for that yet that doesn't screw SSE.

Adds:
- MapDstSrc() to Sk4px
- saturatedAdd() to SkNi (only implemented as far as it's used).
- div255Narrow()

BUG=skia:

Review URL: https://codereview.chromium.org/1138893002
Diffstat (limited to 'src/core')
-rw-r--r-- src/core/Sk4px.h 72
-rw-r--r-- src/core/SkNx.h 10
-rw-r--r-- src/core/SkXfermode.cpp 56
3 files changed, 137 insertions, 1 deletions
diff --git a/src/core/Sk4px.h b/src/core/Sk4px.h
index 3d2a8e3362..830e60a156 100644
--- a/src/core/Sk4px.h
+++ b/src/core/Sk4px.h
@@ -34,6 +34,12 @@ public:
// Pack the top byte of each component back down into 4 SkPMColors.
Sk4px addNarrowHi(const Sk16h&) const;
+
+ Sk4px div255TruncNarrow() const { return this->addNarrowHi(*this >> 8); }  // Truncating ~x/255; presumably (x + (x>>8)) >> 8 per component -- TODO confirm against addNarrowHi().
+ Sk4px div255RoundNarrow() const {  // Rounding counterpart of div255TruncNarrow().
+ return Sk4px::Wide(*this + Sk16h(128)).div255TruncNarrow();  // Add a bias of 128, then reuse the truncating divide.
+ }
+
private:
typedef Sk16h INHERITED;
};
@@ -73,6 +79,72 @@ public:
}
}
+ // Two-input mapper: walks dst and src in lockstep, computing dst' = fn(dst, src) for each
+ // group of pixels and writing the result back over dst. fn is called with two Sk4px
+ // arguments (dst first, src second) and its result is written out via store4/store2/store1.
+ template <typename Fn>
+ static void MapDstSrc(int count, SkPMColor* dst, const SkPMColor* src, Fn fn) {
+     // Main loop: keep an 8-pixel stride for as long as possible. Both groups of four are
+     // evaluated before either one is stored.
+     for (; count >= 8; dst += 8, src += 8, count -= 8) {
+         Sk4px lo = fn(Load4(dst+0), Load4(src+0));
+         Sk4px hi = fn(Load4(dst+4), Load4(src+4));
+         lo.store4(dst+0);
+         hi.store4(dst+4);
+     }
+     SkASSERT(count <= 7);
+
+     // Tail: whatever is left is consumed as an optional group of 4, then 2, then 1.
+     if (count >= 4) {
+         fn(Load4(dst), Load4(src)).store4(dst);
+         dst += 4; src += 4; count -= 4;
+     }
+     if (count >= 2) {
+         fn(Load2(dst), Load2(src)).store2(dst);
+         dst += 2; src += 2; count -= 2;
+     }
+     if (count >= 1) {
+         fn(Load1(dst), Load1(src)).store1(dst);
+     }
+ }
+
+ // Three-input mapper: dst' = fn(dst, src, alpha). The alpha argument is an Sk16b built from
+ // the bytes of a, with each pixel's alpha repeated four times in a row; lanes that have no
+ // corresponding pixel (in the 2- and 1-pixel tails) are filled with zero.
+ template <typename Fn>
+ static void MapDstSrcAlpha(
+         int count, SkPMColor* dst, const SkPMColor* src, const SkAlpha* a, Fn fn) {
+     // TODO: find a terser / faster way to construct Sk16b alphas.
+
+     // Main loop: keep an 8-pixel stride for as long as possible. Both groups of four are
+     // evaluated before either one is stored.
+     for (; count >= 8; dst += 8, src += 8, a += 8, count -= 8) {
+         Sk16b alphaLo(a[0],a[0],a[0],a[0], a[1],a[1],a[1],a[1],
+                       a[2],a[2],a[2],a[2], a[3],a[3],a[3],a[3]);
+         Sk16b alphaHi(a[4],a[4],a[4],a[4], a[5],a[5],a[5],a[5],
+                       a[6],a[6],a[6],a[6], a[7],a[7],a[7],a[7]);
+         Sk4px lo = fn(Load4(dst+0), Load4(src+0), alphaLo);
+         Sk4px hi = fn(Load4(dst+4), Load4(src+4), alphaHi);
+         lo.store4(dst+0);
+         hi.store4(dst+4);
+     }
+     SkASSERT(count <= 7);
+
+     // Tail: whatever is left is consumed as an optional group of 4, then 2, then 1.
+     if (count >= 4) {
+         Sk16b alpha4(a[0],a[0],a[0],a[0], a[1],a[1],a[1],a[1],
+                      a[2],a[2],a[2],a[2], a[3],a[3],a[3],a[3]);
+         fn(Load4(dst), Load4(src), alpha4).store4(dst);
+         dst += 4; src += 4; a += 4; count -= 4;
+     }
+     if (count >= 2) {
+         Sk16b alpha2(a[0],a[0],a[0],a[0], a[1],a[1],a[1],a[1], 0,0,0,0, 0,0,0,0);
+         fn(Load2(dst), Load2(src), alpha2).store2(dst);
+         dst += 2; src += 2; a += 2; count -= 2;
+     }
+     if (count >= 1) {
+         Sk16b alpha1(a[0],a[0],a[0],a[0], 0,0,0,0, 0,0,0,0, 0,0,0,0);
+         fn(Load1(dst), Load1(src), alpha1).store1(dst);
+     }
+ }
+
private:
typedef Sk16b INHERITED;
};
diff --git a/src/core/SkNx.h b/src/core/SkNx.h
index 9d319623a6..ed939c3e4f 100644
--- a/src/core/SkNx.h
+++ b/src/core/SkNx.h
@@ -57,6 +57,10 @@ public:
fHi.store(vals+N/2);
}
+ SkNi saturatedAdd(const SkNi& o) const {  // Saturating add, delegated to the two halves.
+ return SkNi(fLo.saturatedAdd(o.fLo), fHi.saturatedAdd(o.fHi));  // Low and high halves are added independently, then rejoined.
+ }
+
SkNi operator + (const SkNi& o) const { return SkNi(fLo + o.fLo, fHi + o.fHi); }
SkNi operator - (const SkNi& o) const { return SkNi(fLo - o.fLo, fHi - o.fHi); }
SkNi operator * (const SkNi& o) const { return SkNi(fLo * o.fLo, fHi * o.fHi); }
@@ -166,6 +170,12 @@ public:
void store(T vals[1]) const { vals[0] = fVal; }
+ // Saturating add for the single-value case: returns fVal + o.fVal, pinned to the largest
+ // value T can represent instead of wrapping around. Only unsigned T is supported, which is
+ // what the assert below checks.
+ SkNi saturatedAdd(const SkNi& o) const {
+     SkASSERT((T)(~0) > 0); // TODO: support signed T
+     T sum = fVal + o.fVal;
+     // Unsigned addition wraps modulo 2^bits, so the add overflowed iff the truncated sum is
+     // strictly smaller than an operand. sum == fVal only means o.fVal was 0 and must NOT be
+     // treated as overflow -- otherwise x + 0 (including 0 + 0) would saturate to all-ones.
+     return SkNi(sum >= fVal ? sum : (T)(~0));
+ }
+
SkNi operator + (const SkNi& o) const { return SkNi(fVal + o.fVal); }
SkNi operator - (const SkNi& o) const { return SkNi(fVal - o.fVal); }
SkNi operator * (const SkNi& o) const { return SkNi(fVal * o.fVal); }
diff --git a/src/core/SkXfermode.cpp b/src/core/SkXfermode.cpp
index ba9670a9db..ee000233d5 100644
--- a/src/core/SkXfermode.cpp
+++ b/src/core/SkXfermode.cpp
@@ -9,6 +9,7 @@
#include "SkXfermode.h"
#include "SkXfermode_opts_SSE2.h"
#include "SkXfermode_proccoeff.h"
+#include "Sk4px.h"
#include "SkColorPriv.h"
#include "SkLazyPtr.h"
#include "SkMathPriv.h"
@@ -1269,7 +1270,10 @@ struct Plus4f {
static SkPMFloat Xfer(const SkPMFloat& src, const SkPMFloat& dst) {
return check_as_pmfloat(clamp_255(src + dst));
}
- static const bool kFoldCoverageIntoSrcAlpha = true;
+ static Sk4px Xfer(const Sk4px& src, const Sk4px& dst) {  // Sk4px overload of Plus: src + dst with saturation.
+ return src.saturatedAdd(dst);  // Saturates instead of wrapping; stands in for clamp_255() in the SkPMFloat overload.
+ }
+ static const bool kFoldCoverageIntoSrcAlpha = false;
static const SkXfermode::Mode kMode = SkXfermode::kPlus_Mode;
};
@@ -1279,6 +1283,9 @@ struct Modulate4f {
const Sk4f inv255(gInv255);
return check_as_pmfloat(src * dst * inv255);
}
+ static Sk4px Xfer(const Sk4px& src, const Sk4px& dst) {  // Sk4px overload of Modulate: src * dst / 255.
+ return src.mulWiden(dst).div255RoundNarrow();  // Multiply into the wide form, then a rounding divide by 255 narrows it back.
+ }
static const bool kFoldCoverageIntoSrcAlpha = false;
static const SkXfermode::Mode kMode = SkXfermode::kModulate_Mode;
};
@@ -1289,6 +1296,12 @@ struct Screen4f {
const Sk4f inv255(gInv255);
return check_as_pmfloat(src + dst - src * dst * inv255);
}
+ static Sk4px Xfer(const Sk4px& src, const Sk4px& dst) {  // Sk4px overload of Screen: S + D - S*D/255.
+ // Doing the math as S + (1-S)*D or S + (D - S*D) means the add and subtract can be done
+ // in 8-bit space without overflow. S + (1-S)*D is a touch faster because 255-x is an xor.
+ // TODO: do we need to explicitly implement / call Sk16b(255) ^ src ?
+ return src + Sk4px(Sk16b(255) - src).mulWiden(dst).div255RoundNarrow();  // S + (255-S)*D/255, with a rounding divide.
+ }
static const bool kFoldCoverageIntoSrcAlpha = true;
static const SkXfermode::Mode kMode = SkXfermode::kScreen_Mode;
};
@@ -1370,6 +1383,35 @@ private:
typedef SkProcCoeffXfermode INHERITED;
};
+
+ // SkProcCoeffXfermode whose xfer32() is implemented with Sk4px, using ProcType's
+ // Sk4px overload of Xfer(src, dst). ProcType must also provide kMode, which is forwarded to
+ // the SkProcCoeffXfermode constructor.
+ template <typename ProcType>
+ class SkT4pxXfermode : public SkProcCoeffXfermode {
+ public:
+     // Heap-allocates a new xfermode for rec.
+     static SkXfermode* Create(const ProcCoeff& rec) {
+         return SkNEW_ARGS(SkT4pxXfermode, (rec));
+     }
+
+     // Applies ProcType::Xfer to n pixels of dst/src. When aa is non-NULL it is passed along
+     // as the per-pixel alpha used to blend the xfer result with the original dst.
+     void xfer32(SkPMColor dst[], const SkPMColor src[], int n, const SkAlpha aa[]) const override {
+         if (aa != NULL) {
+             auto xferThenBlend = [&](const Sk4px& d, const Sk4px& s, const Sk16b& alpha) {
+                 // We can't exploit kFoldCoverageIntoSrcAlpha. That requires >=24-bit intermediates.
+                 Sk4px xfered = ProcType::Xfer(s, d);
+                 // (xfered*alpha + d*(255-alpha)) / 255, with a rounding divide.
+                 return Sk4px::Wide(xfered.mulWiden(alpha) + d.mulWiden(Sk16b(255)-alpha))
+                     .div255RoundNarrow();
+             };
+             Sk4px::MapDstSrcAlpha(n, dst, src, aa, xferThenBlend);
+         } else {
+             // The mapper hands us (dst, src); Xfer() wants (src, dst).
+             auto xferOnly = [&](const Sk4px& d, const Sk4px& s) {
+                 return ProcType::Xfer(s, d);
+             };
+             Sk4px::MapDstSrc(n, dst, src, xferOnly);
+         }
+     }
+
+ private:
+     SkT4pxXfermode(const ProcCoeff& rec) : SkProcCoeffXfermode(rec, ProcType::kMode) {}
+
+     typedef SkProcCoeffXfermode INHERITED;
+ };
#endif
///////////////////////////////////////////////////////////////////////////////
@@ -1445,6 +1487,7 @@ SkXfermode* create_mode(int iMode) {
case SkXfermode::kXor_Mode:
xfer = SkT4fXfermode<Xor4f>::Create(rec);
break;
+ #ifdef SK_PREFER_LEGACY_FLOAT_XFERMODES
case SkXfermode::kPlus_Mode:
xfer = SkT4fXfermode<Plus4f>::Create(rec);
break;
@@ -1454,6 +1497,17 @@ SkXfermode* create_mode(int iMode) {
case SkXfermode::kScreen_Mode:
xfer = SkT4fXfermode<Screen4f>::Create(rec);
break;
+ #else
+ case SkXfermode::kPlus_Mode:
+ xfer = SkT4pxXfermode<Plus4f>::Create(rec);
+ break;
+ case SkXfermode::kModulate_Mode:
+ xfer = SkT4pxXfermode<Modulate4f>::Create(rec);
+ break;
+ case SkXfermode::kScreen_Mode:
+ xfer = SkT4pxXfermode<Screen4f>::Create(rec);
+ break;
+ #endif
case SkXfermode::kMultiply_Mode:
xfer = SkT4fXfermode<Multiply4f>::Create(rec);
break;