aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--bench/SkBlend_optsBench.cpp167
-rw-r--r--src/opts/SkBlend_opts.h235
-rw-r--r--src/opts/SkOpts_sse41.cpp9
-rw-r--r--tests/SkBlend_optsTest.cpp134
4 files changed, 514 insertions, 31 deletions
diff --git a/bench/SkBlend_optsBench.cpp b/bench/SkBlend_optsBench.cpp
new file mode 100644
index 0000000000..9c34d49f34
--- /dev/null
+++ b/bench/SkBlend_optsBench.cpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include <tuple>
+
+#include "Benchmark.h"
+#include "Resources.h"
+#include "SkCpu.h"
+#include "SkImage.h"
+#include "SkImage_Base.h"
+#include "SkNx.h"
+#include "SkOpts.h"
+#include "SkString.h"
+
+#define INNER_LOOPS 10
+
+namespace sk_default {
+// Forward declaration of the blender measured by SrcOverVSkOptsBruteForce.
+extern void brute_force_srcover_srgb_srgb(
+    uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc);
+}
+
+// Blender policy for LinearSrcOverBench: blends with brute_force_srcover_srgb_srgb.
+class SrcOverVSkOptsBruteForce {
+public:
+    // Suffix appended to the benchmark name.
+    static SkString Name() {
+        return SkString{"VSkOptsBruteForce"};
+    }
+
+    // No CPU feature is required for this blender.
+    static bool WorksOnCpu() {
+        return true;
+    }
+
+    // Blends count source pixels over count destination pixels.
+    static void BlendN(uint32_t* dst, int count, const uint32_t* src) {
+        const int pixels = count;
+        sk_default::brute_force_srcover_srgb_srgb(dst, src, pixels, pixels);
+    }
+};
+
+namespace sk_default {
+// Forward declaration of the blender measured by SrcOverVSkOptsTrivial.
+extern void trivial_srcover_srgb_srgb(
+    uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc);
+}
+
+// Blender policy for LinearSrcOverBench: blends with trivial_srcover_srgb_srgb.
+class SrcOverVSkOptsTrivial {
+public:
+    // Suffix appended to the benchmark name.
+    static SkString Name() {
+        return SkString{"VSkOptsTrivial"};
+    }
+
+    // No CPU feature is required for this blender.
+    static bool WorksOnCpu() {
+        return true;
+    }
+
+    // Blends count source pixels over count destination pixels.
+    static void BlendN(uint32_t* dst, int count, const uint32_t* src) {
+        const int pixels = count;
+        sk_default::trivial_srcover_srgb_srgb(dst, src, pixels, pixels);
+    }
+};
+
+namespace sk_default {
+// Forward declaration of the blender measured by SrcOverVSkOptsNonSimdCore.
+extern void best_non_simd_srcover_srgb_srgb(
+    uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc);
+}
+
+// Blender policy for LinearSrcOverBench: blends with best_non_simd_srcover_srgb_srgb.
+class SrcOverVSkOptsNonSimdCore {
+public:
+    // Suffix appended to the benchmark name.
+    static SkString Name() {
+        return SkString{"VSkOptsNonSimdCore"};
+    }
+
+    // No CPU feature is required for this blender.
+    static bool WorksOnCpu() {
+        return true;
+    }
+
+    // Blends count source pixels over count destination pixels.
+    static void BlendN(uint32_t* dst, int count, const uint32_t* src) {
+        const int pixels = count;
+        sk_default::best_non_simd_srcover_srgb_srgb(dst, src, pixels, pixels);
+    }
+};
+
+namespace sk_default {
+// Forward declaration of the blender measured by SrcOverVSkOptsDefault.
+extern void srcover_srgb_srgb(
+    uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc);
+}
+
+// Blender policy for LinearSrcOverBench: blends with sk_default::srcover_srgb_srgb.
+class SrcOverVSkOptsDefault {
+public:
+    // Suffix appended to the benchmark name.
+    static SkString Name() {
+        return SkString{"VSkOptsDefault"};
+    }
+
+    // No CPU feature is required for this blender.
+    static bool WorksOnCpu() {
+        return true;
+    }
+
+    // Blends count source pixels over count destination pixels.
+    static void BlendN(uint32_t* dst, int count, const uint32_t* src) {
+        const int pixels = count;
+        sk_default::srcover_srgb_srgb(dst, src, pixels, pixels);
+    }
+};
+
+namespace sk_sse41 {
+    // Forward declaration of the blender measured by SrcOverVSkOptsSSE41.
+    extern void srcover_srgb_srgb(
+        uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc);
+}
+
+// Blender policy for LinearSrcOverBench: blends with sk_sse41::srcover_srgb_srgb.
+class SrcOverVSkOptsSSE41 {
+public:
+    // Suffix appended to the benchmark name.
+    static SkString Name() {
+        return SkString{"VSkOptsSSE41"};
+    }
+
+    // Only usable when the running CPU reports SSE4.1 support.
+    static bool WorksOnCpu() {
+        return SkCpu::Supports(SkCpu::SSE41);
+    }
+
+    // Blends count source pixels over count destination pixels.
+    static void BlendN(uint32_t* dst, int count, const uint32_t* src) {
+        const int pixels = count;
+        sk_sse41::srcover_srgb_srgb(dst, src, pixels, pixels);
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Benchmark that blends every row of a resource image into one destination row.
+//
+// Blender is a policy class providing three static members:
+//   Name()       - suffix for the benchmark name,
+//   WorksOnCpu() - whether the blender may run on this machine,
+//   BlendN()     - the row blend being measured.
+template <typename Blender>
+class LinearSrcOverBench : public Benchmark {
+public:
+    // fileName is the resource image to load; it also becomes part of the benchmark name.
+    explicit LinearSrcOverBench(const char* fileName) {
+        fName = "LinearSrcOver";
+        fName.append(fileName);
+        fName.append(Blender::Name());
+
+        sk_sp<SkImage> image = GetResourceAsImage(fileName);
+        // Fail with a message instead of dereferencing a null image below.
+        if (image == nullptr) {
+            SkFAIL("image is NULL");
+        }
+        // The pixels are read into a member bitmap rather than a constructor local.
+        // NOTE(review): presumably fPixmap only borrows the bitmap's pixels, so the bitmap
+        // has to outlive onDraw() - confirm against SkBitmap::peekPixels.
+        if (!as_IB(image)->getROPixels(&fBitmap)) {
+            SkFAIL("Could not read resource");
+        }
+        fBitmap.peekPixels(&fPixmap);
+        fCount = fPixmap.rowBytesAsPixels();
+        fDst.reset(fCount);
+        // Size the clear by the number of allocated pixels, not by rowBytes(), so it can
+        // never run past the end of fDst.
+        memset(fDst.get(), 0, fCount * sizeof(uint32_t));
+    }
+
+protected:
+    bool isSuitableFor(Backend backend) override {
+        return backend == kNonRendering_Backend && Blender::WorksOnCpu();
+    }
+    const char* onGetName() override { return fName.c_str(); }
+    void onDraw(int loops, SkCanvas*) override {
+        SkASSERT(fPixmap.colorType() == kN32_SkColorType);
+
+        // One row, measured in pixels (rowBytesAsPixels() cached by the constructor).
+        const int width = fCount;
+
+        for (int i = 0; i < loops * INNER_LOOPS; ++i) {
+            const uint32_t* src = fPixmap.addr32();
+            for (int y = 0; y < fPixmap.height(); y++) {
+                // Every source row is blended onto the same destination row.
+                Blender::BlendN(fDst.get(), width, src);
+                src += width;
+            }
+        }
+    }
+
+    void onPostDraw(SkCanvas*) override {
+        // Make sure the compiler does not optimize away the operation.
+        volatile uint32_t v = 0;
+        for (int i = 0; i < fCount; i++) {
+            v ^= fDst[i];
+        }
+    }
+
+private:
+    int fCount;                    // Pixels per row, padding included.
+    SkAutoTArray<uint32_t> fDst;   // Destination row, fCount pixels.
+    SkString fName;                // "LinearSrcOver" + file name + Blender::Name().
+    SkBitmap fBitmap;              // Holds the decoded image for the benchmark's lifetime.
+    SkPixmap fPixmap;              // View of fBitmap's pixels.
+
+    typedef Benchmark INHERITED;
+};
+
+// BENCHES(fileName) registers one LinearSrcOverBench per blender for the given resource image.
+// The SSE4.1 blender is registered only when SK_CPU_X86 is defined and SK_BUILD_FOR_IOS is not.
+#if defined(SK_CPU_X86) && !defined(SK_BUILD_FOR_IOS)
+#define BENCHES(fileName) \
+DEF_BENCH( return new LinearSrcOverBench<SrcOverVSkOptsBruteForce>(fileName); ) \
+DEF_BENCH( return new LinearSrcOverBench<SrcOverVSkOptsTrivial>(fileName); ) \
+DEF_BENCH( return new LinearSrcOverBench<SrcOverVSkOptsNonSimdCore>(fileName); ) \
+DEF_BENCH( return new LinearSrcOverBench<SrcOverVSkOptsDefault>(fileName); ) \
+DEF_BENCH( return new LinearSrcOverBench<SrcOverVSkOptsSSE41>(fileName); )
+#else
+#define BENCHES(fileName) \
+DEF_BENCH( return new LinearSrcOverBench<SrcOverVSkOptsBruteForce>(fileName); ) \
+DEF_BENCH( return new LinearSrcOverBench<SrcOverVSkOptsTrivial>(fileName); ) \
+DEF_BENCH( return new LinearSrcOverBench<SrcOverVSkOptsNonSimdCore>(fileName); ) \
+DEF_BENCH( return new LinearSrcOverBench<SrcOverVSkOptsDefault>(fileName); )
+#endif
+
+// Resource images used as benchmark inputs.
+BENCHES("yellow_rose.png")
+BENCHES("baby_tux.png")
+BENCHES("plane.png")
+BENCHES("mandrill_512.png")
+BENCHES("iconstrip.png")
diff --git a/src/opts/SkBlend_opts.h b/src/opts/SkBlend_opts.h
index a1067407be..93946438e5 100644
--- a/src/opts/SkBlend_opts.h
+++ b/src/opts/SkBlend_opts.h
@@ -5,52 +5,233 @@
* found in the LICENSE file.
*/
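+/*
+ * Build, then run the unit tests and the benchmarks for this file. The command uses fish
+ * shell syntax; in bash replace each "; and" with "&&".
+ *
+ * ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q
+ */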
+
#ifndef SkBlend_opts_DEFINED
#define SkBlend_opts_DEFINED
+#include "SkNx.h"
+#include "SkPM4fPriv.h"
+
namespace SK_OPTS_NS {
-#if 0
+// SrcOver of one source pixel onto *dst, bytes to bytes, with the blend performed on the
+// values returned by srgb_to_linear(). The byte-scaled values can be used directly because
+// the 255's cancel.
+// NOTE(review): the derivation below assumes srgb_to_linear() squares a color channel and
+// linear_to_srgb() takes its square root - confirm in SkPM4fPriv.h.
+//   invA = 1 - (As / 255);
+//
+//   R = 255 * sqrt((Rs/255)^2 + (Rd/255)^2 * invA)
+//   => R = 255 * sqrt((Rs^2 + Rd^2 * invA)/255^2)
+//   => R = sqrt(Rs^2 + Rd^2 * invA)
+static inline void blend_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) {
+    Sk4f srcLinear = srgb_to_linear(to_4f(pixel));
+    Sk4f dstLinear = srgb_to_linear(to_4f(*dst));
+
+    // Fraction of the destination that stays visible: 1 - srcAlpha / 255, in every lane.
+    Sk4f dstScale = 1.0f - Sk4f{srcLinear[SkPM4f::A]} * (1.0f / 255.0f);
+
+    // The 0.5f bias presumably rounds to nearest when to_4b() converts back to bytes.
+    Sk4f blended = linear_to_srgb(srcLinear + dstLinear * dstScale) + 0.5f;
+    *dst = to_4b(blended);
+}
-#else
+// SrcOver of one pixel with shortcuts keyed on the source alpha byte (bits 24-31).
+static inline void srcover_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) {
+    switch (pixel >> 24) {
+        case 0xFF:
+            // Opaque source: it replaces the destination outright.
+            *dst = pixel;
+            break;
+        case 0x00:
+            // Transparent source: the destination is left untouched.
+            break;
+        default:
+            blend_srgb_srgb_1(dst, pixel);
+            break;
+    }
+}
+
+// SrcOver of two consecutive pixels, first src[0] onto dst[0], then src[1] onto dst[1].
+static inline void srcover_srgb_srgb_2(uint32_t* dst, const uint32_t* src) {
+    for (int i = 0; i < 2; ++i) {
+        srcover_srgb_srgb_1(dst + i, src[i]);
+    }
+}
+
+// SrcOver of four consecutive pixels, processed in increasing index order.
+static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) {
+    for (int i = 0; i < 4; ++i) {
+        srcover_srgb_srgb_1(dst + i, src[i]);
+    }
+}
+
+// SrcOver of a source row onto ndst destination pixels without SIMD, examining two pixels
+// (one 64-bit word) at a time.
+//
+//   dst  - destination pixels, ndst of them, blended in place.
+//   src  - source row of nsrc pixels; when ndst > nsrc the row is reused from its start.
+//
+// Runs of fully opaque pairs are copied, runs of fully transparent pairs are skipped, and
+// every other pair is handled by srcover_srgb_srgb_2.
+// NOTE(review): the 64-bit accesses go through reinterpret_cast on uint32_t pointers, so they
+// assume the target tolerates 4-byte aligned 64-bit loads and stores.
+void best_non_simd_srcover_srgb_srgb(
+    uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
+    // Without any source pixels no pass could ever consume destination pixels.
+    if (nsrc <= 0) {
+        return;
+    }
- static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) {
- switch (src >> 24) {
- case 0x00: return;
- case 0xff: *dst = src; return;
+
+    // Selects the alpha bytes of both pixels held in one 64-bit word.
+    const uint64_t kAlphaPair = 0xFF000000FF000000;
+
+    while (ndst > 0) {
+        const int count = SkTMin(ndst, nsrc);
+        ndst -= count;
+
+        uint64_t* ddst = reinterpret_cast<uint64_t*>(dst);
+        const uint64_t* dsrc = reinterpret_cast<const uint64_t*>(src);
+        const uint64_t* const end = dsrc + (count >> 1);
+
+        // A pre-tested loop: with fewer than two pixels there is no pair to look at.
+        while (dsrc < end) {
+            if ((~*dsrc & kAlphaPair) == 0) {
+                // Both pixels opaque: copy pairs for as long as that holds.
+                do {
+                    *ddst++ = *dsrc++;
+                } while (dsrc < end && (~*dsrc & kAlphaPair) == 0);
+            } else if ((*dsrc & kAlphaPair) == 0) {
+                // Both pixels transparent: skip pairs for as long as that holds.
+                do {
+                    dsrc++;
+                    ddst++;
+                } while (dsrc < end && (*dsrc & kAlphaPair) == 0);
+            } else {
+                srcover_srgb_srgb_2(reinterpret_cast<uint32_t*>(ddst++),
+                                    reinterpret_cast<const uint32_t*>(dsrc++));
+            }
+        }
+
+        // An odd count leaves one pixel after the last pair.
+        if ((count & 1) != 0) {
+            srcover_srgb_srgb_1(reinterpret_cast<uint32_t*>(ddst),
+                                *reinterpret_cast<const uint32_t*>(dsrc));
+        }
+
+        // Advance by whole pixels so the next pass starts after the odd pixel as well.
+        dst += count;
+    }
+}
+
+// Reference SrcOver: blends every pixel with blend_srgb_srgb_1, with no alpha shortcuts.
+//
+//   dst  - destination pixels, ndst of them, blended in place.
+//   src  - source row of nsrc pixels; when ndst > nsrc the row is reused from its start.
+void brute_force_srcover_srgb_srgb(
+    uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
+    // Without any source pixels no pass could ever consume destination pixels.
+    if (nsrc <= 0) {
+        return;
+    }
+
+    while (ndst > 0) {
+        const int n = SkTMin(ndst, nsrc);
+
+        for (int i = 0; i < n; i++) {
+            blend_srgb_srgb_1(dst++, src[i]);
+        }
+        ndst -= n;
+    }
+}
- Sk4f d = SkNx_cast<float>(Sk4b::Load( dst)),
- s = SkNx_cast<float>(Sk4b::Load(&src));
+// Straightforward SrcOver: one srcover_srgb_srgb_1 call per pixel.
+//
+//   dst  - destination pixels, ndst of them, blended in place.
+//   src  - source row of nsrc pixels; when ndst > nsrc the row is reused from its start.
+void trivial_srcover_srgb_srgb(
+    uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
+    // Without any source pixels no pass could ever consume destination pixels.
+    if (nsrc <= 0) {
+        return;
+    }
+
+    while (ndst > 0) {
+        const int n = SkTMin(ndst, nsrc);
- // Approximate sRGB gamma as 2.0.
- Sk4f d_sq = d*d,
- s_sq = s*s;
- d = Sk4f{d_sq[0], d_sq[1], d_sq[2], d[3]};
- s = Sk4f{s_sq[0], s_sq[1], s_sq[2], s[3]};
+        for (int i = 0; i < n; i++) {
+            srcover_srgb_srgb_1(dst++, src[i]);
+        }
+        ndst -= n;
+    }
+}
- // SrcOver.
- Sk4f invA = 1.0f - s[3]*(1/255.0f);
- d = s + d * invA;
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
- // Re-apply approximate sRGB gamma.
- Sk4f d_sqrt = d.sqrt();
- d = Sk4f{d_sqrt[0], d_sqrt[1], d_sqrt[2], d[3]};
+    // Reads four consecutive pixels starting at p with an unaligned 128-bit load.
+    static inline __m128i load(const uint32_t* p) {
+        const __m128i* const in = reinterpret_cast<const __m128i*>(p);
+        return _mm_loadu_si128(in);
+    }
- SkNx_cast<uint8_t>(d).store(dst);
+    // Writes the four pixels held in v to p with an unaligned 128-bit store.
+    static inline void store(uint32_t* p, __m128i v) {
+        __m128i* const out = reinterpret_cast<__m128i*>(p);
+        _mm_storeu_si128(out, v);
}
- static inline void srcover_srgb_srgb(uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
- while (ndst > 0) {
- int n = SkTMin(ndst, nsrc);
+ #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
+
+ // SSE4.1 SrcOver of a source row onto ndst destination pixels, four pixels at a time.
+ //
+ //   dst      - destination pixels, ndst of them, blended in place.
+ //   srcStart - source row of nsrc pixels; each pass of the outer loop restarts from it,
+ //              so the row is reused when ndst > nsrc.
+ //
+ // nsrc must be positive: with nsrc <= 0 the outer loop never consumes any of ndst.
+ void srcover_srgb_srgb(
+ uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
+ // Alpha byte (bits 24-31) of each of the four 32-bit pixels in a vector.
+ const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
+ while (ndst > 0) {
+ int count = SkTMin(ndst, nsrc);
+ ndst -= count;
+ const uint32_t* src = srcStart;
+ // Only whole groups of four pixels go through the vector loop.
+ const uint32_t* end = src + (count & ~3);
- for (int i = 0; i < n; i++) {
- srcover_srgb_srgb_1(dst++, src[i]);
+ while (src < end) {
+ __m128i pixels = load(src);
+ // testc: every alpha bit is set, i.e. all four pixels are opaque - copy them.
+ if (_mm_testc_si128(pixels, alphaMask)) {
+ do {
+ store(dst, pixels);
+ dst += 4;
+ src += 4;
+ } while (src < end && _mm_testc_si128(pixels = load(src), alphaMask));
+ // testz: no alpha bit is set, i.e. all four pixels are transparent - skip them.
+ } else if (_mm_testz_si128(pixels, alphaMask)) {
+ do {
+ dst += 4;
+ src += 4;
+ } while (src < end && _mm_testz_si128(pixels = load(src), alphaMask));
+ } else {
+ // testnzc: neither of the two cases above - blend pixel by pixel.
+ do {
+ srcover_srgb_srgb_4(dst, src);
+ dst += 4;
+ src += 4;
+ } while (src < end && _mm_testnzc_si128(pixels = load(src), alphaMask));
+ }
+ }
+
+ // The zero to three pixels left after the last whole group.
+ count = count & 3;
+ while (count-- > 0) {
+ srcover_srgb_srgb_1(dst++, *src++);
+ }
}
- ndst -= n;
}
+ #else
+ // SSE2 versions
+    // Returns true when the alpha byte (bits 24-31) of all four pixels is 0xFF.
+    static inline bool check_opaque_alphas(__m128i pixels) {
+        // ~pixels & 0xFF000000 is zero exactly in the lanes whose alpha is 0xFF.
+        const __m128i missingAlphaBits = _mm_andnot_si128(pixels, _mm_set1_epi32(0xFF000000));
+        const __m128i laneIsOpaque = _mm_cmpeq_epi32(missingAlphaBits, _mm_setzero_si128());
+        const int mask = _mm_movemask_epi8(laneIsOpaque);
+        return mask == 0xFFFF;
+    }
+
+    // Returns true when the alpha byte (bits 24-31) of all four pixels is 0x00.
+    static inline bool check_transparent_alphas(__m128i pixels) {
+        // pixels & 0xFF000000 is zero exactly in the lanes whose alpha is 0x00.
+        const __m128i alphaBits = _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000));
+        const __m128i laneIsTransparent = _mm_cmpeq_epi32(alphaBits, _mm_setzero_si128());
+        const int mask = _mm_movemask_epi8(laneIsTransparent);
+        return mask == 0xFFFF;
+    }
+
+    // Returns true when the four pixels are neither all opaque nor all transparent, i.e. at
+    // least one alpha bit is set and at least one is clear. This is the same condition the
+    // SSE4.1 code tests with _mm_testnzc_si128(pixels, alphaMask).
+    //
+    // The previous shift-and-compare form was true only when every alpha was zero.
+    static inline bool check_partial_alphas(__m128i pixels) {
+        const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
+        const __m128i alphas = _mm_and_si128(pixels, alphaMask);
+
+        // 0xFFFF from movemask means the comparison held in all four 32-bit lanes.
+        const int opaqueLanes = _mm_movemask_epi8(_mm_cmpeq_epi32(alphas, alphaMask));
+        const int transparentLanes =
+                _mm_movemask_epi8(_mm_cmpeq_epi32(alphas, _mm_setzero_si128()));
+
+        return opaqueLanes != 0xFFFF && transparentLanes != 0xFFFF;
+    }
+
+    // SSE2 SrcOver of a source row onto ndst destination pixels, four pixels at a time.
+    //
+    //   dst      - destination pixels, ndst of them, blended in place.
+    //   srcStart - source row of nsrc pixels; each pass of the outer loop restarts from it,
+    //              so the row is reused when ndst > nsrc.
+    void srcover_srgb_srgb(
+        uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
+        // Without any source pixels no pass could ever consume destination pixels.
+        if (nsrc <= 0) {
+            return;
+        }
+
+        while (ndst > 0) {
+            int count = SkTMin(ndst, nsrc);
+            ndst -= count;
+            const uint32_t* src = srcStart;
+            // Only whole groups of four pixels go through the vector loop.
+            const uint32_t* end = src + (count & ~3);
+
+            // With fewer than four pixels there is no whole group: neither load 16 bytes nor
+            // enter the loop, otherwise four pixels would be read and written out of bounds.
+            if (src < end) {
+                __m128i pixels = load(src);
+                do {
+                    if (check_opaque_alphas(pixels)) {
+                        // All opaque: copy groups for as long as that holds.
+                        do {
+                            store(dst, pixels);
+                            dst += 4;
+                            src += 4;
+                        } while (src < end && check_opaque_alphas(pixels = load(src)));
+                    } else if (check_transparent_alphas(pixels)) {
+                        // All transparent: skip the run, then move dst by the same distance.
+                        const uint32_t* start = src;
+                        do {
+                            src += 4;
+                        } while (src < end && check_transparent_alphas(pixels = load(src)));
+                        dst += src - start;
+                    } else {
+                        // Blend pixel by pixel while check_partial_alphas() accepts the next
+                        // group. Whenever the inner loop stops before end, pixels already
+                        // holds the group at src for the next outer iteration.
+                        do {
+                            srcover_srgb_srgb_4(dst, src);
+                            dst += 4;
+                            src += 4;
+                        } while (src < end && check_partial_alphas(pixels = load(src)));
+                    }
+                } while (src < end);
+            }
+
+            // The zero to three pixels left after the last whole group.
+            count = count & 3;
+            while (count-- > 0) {
+                srcover_srgb_srgb_1(dst++, *src++);
+            }
+        }
+    }
+ #endif
+#else
+
+ // Fallback used when SK_CPU_SSE_LEVEL is below SK_CPU_SSE_LEVEL_SSE2: forwards all four
+ // arguments unchanged to trivial_srcover_srgb_srgb.
+ void srcover_srgb_srgb(
+ uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
+ trivial_srcover_srgb_srgb(dst, src, ndst, nsrc);
}
-
+
#endif
} // namespace SK_OPTS_NS
diff --git a/src/opts/SkOpts_sse41.cpp b/src/opts/SkOpts_sse41.cpp
index 34b078c2ca..f0561a69c6 100644
--- a/src/opts/SkOpts_sse41.cpp
+++ b/src/opts/SkOpts_sse41.cpp
@@ -10,6 +10,7 @@
#define SK_OPTS_NS sk_sse41
#include "SkBlurImageFilter_opts.h"
#include "SkBlitRow_opts.h"
+#include "SkBlend_opts.h"
#ifndef SK_SUPPORT_LEGACY_X86_BLITS
@@ -211,16 +212,16 @@ static void blit_mask_d32_a8(SkPMColor* dst, size_t dstRB,
}
}
}
-
} // namespace sk_sse41_new
#endif
namespace SkOpts {
void Init_sse41() {
- box_blur_xx = sk_sse41::box_blur_xx;
- box_blur_xy = sk_sse41::box_blur_xy;
- box_blur_yx = sk_sse41::box_blur_yx;
+ box_blur_xx = sk_sse41::box_blur_xx;
+ box_blur_xy = sk_sse41::box_blur_xy;
+ box_blur_yx = sk_sse41::box_blur_yx;
+ srcover_srgb_srgb = sk_sse41::srcover_srgb_srgb;
#ifndef SK_SUPPORT_LEGACY_X86_BLITS
blit_row_color32 = sk_sse41_new::blit_row_color32;
diff --git a/tests/SkBlend_optsTest.cpp b/tests/SkBlend_optsTest.cpp
new file mode 100644
index 0000000000..ee60d5ebdf
--- /dev/null
+++ b/tests/SkBlend_optsTest.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include <string>
+#include <tuple>
+#include <vector>
+#include "Resources.h"
+#include "SkCpu.h"
+#include "SkImage.h"
+#include "SkImage_Base.h"
+#include "SkOpts.h"
+#include "SkNx.h"
+#include "Test.h"
+#include "../include/core/SkImageInfo.h"
+
+// Signature shared by every SrcOver implementation exercised by this test.
+typedef void (*Blender)(uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc);
+
+// Reference implementation; test_blender() compares every other blender against it.
+namespace sk_default {
+extern void brute_force_srcover_srgb_srgb(
+ uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc);
+}
+
+// Implementations under test from the sk_default namespace.
+namespace sk_default {
+extern void trivial_srcover_srgb_srgb(
+ uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc);
+
+extern void best_non_simd_srcover_srgb_srgb(
+ uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc);
+
+extern void srcover_srgb_srgb(
+ uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc);
+}
+
+// The SSE4.1 implementation is declared only for x86 builds that are not iOS builds.
+#if defined(SK_CPU_X86) && !defined(SK_BUILD_FOR_IOS)
+namespace sk_sse41 {
+extern void srcover_srgb_srgb(
+ uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc);
+}
+#endif
+
+// Builds the failure text for one differing pixel: the resource and blender names, the pixel
+// position, the source pixel, and the expected (good) and actual (bad) results in hex.
+static SkString missmatch_message(std::string resourceName, std::string name, int x, int y,
+                                  uint32_t src, uint32_t good, uint32_t bad) {
+    const char* const resource = resourceName.c_str();
+    const char* const blender = name.c_str();
+    return SkStringPrintf("%s - %s missmatch at %d, %d src: %08x good: %08x bad: %08x",
+                          resource, blender, x, y, src, good, bad);
+}
+
+using Spec = std::tuple<Blender, std::string>;
+
+// Blends every row of the resource image "<resourceName>.png" onto a zeroed row twice: once
+// with the brute-force reference and once with the blender in spec. Reports the first pixel
+// of each row on which the two results differ.
+static void test_blender(
+    Spec spec,
+    std::string resourceName,
+    skiatest::Reporter* reporter)
+{
+    Blender blender = std::get<0>(spec);
+    std::string name = std::get<1>(spec);
+
+    std::string fileName = resourceName + ".png";
+    sk_sp<SkImage> image = GetResourceAsImage(fileName.c_str());
+    SkASSERT(image != nullptr);
+    if (image == nullptr) {
+        SkFAIL("image is NULL");
+    }
+    SkBitmap bm;
+    if (!as_IB(image)->getROPixels(&bm)) {
+        SkFAIL("Could not read resource");
+    }
+
+    SkPixmap pixmap;
+    bm.peekPixels(&pixmap);
+    SkASSERTF(pixmap.colorType() == kN32_SkColorType, "colorType: %d", pixmap.colorType());
+    SkASSERT(pixmap.alphaType() != kUnpremul_SkAlphaType);
+
+    // One row measured in pixels, row padding included.
+    const int width = pixmap.rowBytesAsPixels();
+    SkASSERT(width > 0);
+    SkASSERT(width < 4000);
+    const size_t rowSize = width * sizeof(uint32_t);
+    SkAutoTArray<uint32_t> correctDst(width);
+    SkAutoTArray<uint32_t> testDst(width);
+
+    const uint32_t* src = pixmap.addr32();
+    const int height = pixmap.height();
+    for (int y = 0; y < height; ++y, src += width) {
+        // Both blenders start from the same all-zero destination row.
+        memset(correctDst.get(), 0, rowSize);
+        memset(testDst.get(), 0, rowSize);
+        sk_default::brute_force_srcover_srgb_srgb(correctDst.get(), src, width, width);
+        blender(testDst.get(), src, width, width);
+
+        for (int x = 0; x < width; ++x) {
+            REPORTER_ASSERT_MESSAGE(
+                reporter, correctDst[x] == testDst[x],
+                missmatch_message(resourceName, name, x, y, src[x], correctDst[x], testDst[x]));
+            if (correctDst[x] != testDst[x]) {
+                // One report per row is enough; move on to the next row.
+                break;
+            }
+        }
+    }
+}
+
+// Runs test_blender() for every available implementation on every test image.
+DEF_TEST(SkBlend_optsCheck, reporter) {
+    std::vector<Spec> specs;
+    specs.push_back(Spec{sk_default::trivial_srcover_srgb_srgb, "trivial"});
+    specs.push_back(Spec{sk_default::best_non_simd_srcover_srgb_srgb, "best_non_simd"});
+    specs.push_back(Spec{sk_default::srcover_srgb_srgb, "default"});
+    #if defined(SK_CPU_X86) && !defined(SK_BUILD_FOR_IOS)
+    // The SSE4.1 code may only run on CPUs that report support for it.
+    if (SkCpu::Supports(SkCpu::SSE41)) {
+        specs.push_back(Spec{sk_sse41::srcover_srgb_srgb, "sse41"});
+    }
+    #endif
+
+    const std::vector<std::string> testResources = {
+        "yellow_rose", "baby_tux", "plane", "mandrill_512", "iconstrip"
+    };
+
+    for (const Spec& spec : specs) {
+        for (const std::string& resourceName : testResources) {
+            test_blender(spec, resourceName, reporter);
+        }
+    }
+}
+
+
+
+// Checks, for every byte value, that squaring it, taking the square root, adding 0.5 and
+// flooring gives the original value back.
+DEF_TEST(SkBlend_optsSqrtCheck, reporter) {
+    for (int c = 0; c < 256; c++) {
+        Sk4f i{static_cast<float>(c)};
+        Sk4f sf = ((i * i).sqrt() + 0.5f).floor();
+        REPORTER_ASSERT_MESSAGE(
+            reporter, i[0] == sf[0], SkStringPrintf("i: %f, s: %f", i[0], sf[0]));
+    }
+}