diff options
-rw-r--r-- | bench/SkBlend_optsBench.cpp | 167 | ||||
-rw-r--r-- | src/opts/SkBlend_opts.h | 235 | ||||
-rw-r--r-- | src/opts/SkOpts_sse41.cpp | 9 | ||||
-rw-r--r-- | tests/SkBlend_optsTest.cpp | 134 |
4 files changed, 514 insertions, 31 deletions
diff --git a/bench/SkBlend_optsBench.cpp b/bench/SkBlend_optsBench.cpp new file mode 100644 index 0000000000..9c34d49f34 --- /dev/null +++ b/bench/SkBlend_optsBench.cpp @@ -0,0 +1,167 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include <tuple> + +#include "Benchmark.h" +#include "Resources.h" +#include "SkCpu.h" +#include "SkImage.h" +#include "SkImage_Base.h" +#include "SkNx.h" +#include "SkOpts.h" +#include "SkString.h" + +#define INNER_LOOPS 10 + +namespace sk_default { +extern void brute_force_srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc); +} + +class SrcOverVSkOptsBruteForce { +public: + static SkString Name() { return SkString{"VSkOptsBruteForce"}; } + static bool WorksOnCpu() { return true; } + static void BlendN(uint32_t* dst, int count, const uint32_t* src) { + sk_default::brute_force_srcover_srgb_srgb(dst, src, count, count); + } +}; + +namespace sk_default { +extern void trivial_srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc); +} + +class SrcOverVSkOptsTrivial { +public: + static SkString Name() { return SkString{"VSkOptsTrivial"}; } + static bool WorksOnCpu() { return true; } + static void BlendN(uint32_t* dst, int count, const uint32_t* src) { + sk_default::trivial_srcover_srgb_srgb(dst, src, count, count); + } +}; + +namespace sk_default { +extern void best_non_simd_srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc); +} + +class SrcOverVSkOptsNonSimdCore { +public: + static SkString Name() { return SkString{"VSkOptsNonSimdCore"}; } + static bool WorksOnCpu() { return true; } + static void BlendN(uint32_t* dst, int count, const uint32_t* src) { + sk_default::best_non_simd_srcover_srgb_srgb(dst, src, count, count); + } +}; + +namespace sk_default { +extern void srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc); +} + +class SrcOverVSkOptsDefault { +public: + static SkString Name() { return SkString{"VSkOptsDefault"}; } + static bool WorksOnCpu() { return true; } + static void BlendN(uint32_t* dst, int count, const uint32_t* src) { + sk_default::srcover_srgb_srgb(dst, src, count, count); + } +}; + +namespace sk_sse41 { + extern void srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc); +} + +class SrcOverVSkOptsSSE41 { +public: + static SkString Name() { return SkString{"VSkOptsSSE41"}; } + static bool WorksOnCpu() { return SkCpu::Supports(SkCpu::SSE41); } + static void BlendN(uint32_t* dst, int count, const uint32_t* src) { + sk_sse41::srcover_srgb_srgb(dst, src, count, count); + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template <typename Blender> +class LinearSrcOverBench : public Benchmark { +public: + LinearSrcOverBench(const char* fileName) { + fName = "LinearSrcOver"; + fName.append(fileName); + fName.append(Blender::Name()); + + sk_sp<SkImage> image = GetResourceAsImage(fileName); + SkBitmap bm; + if (!as_IB(image)->getROPixels(&bm)) { + SkFAIL("Could not read resource"); + } + bm.peekPixels(&fPixmap); + fCount = fPixmap.rowBytesAsPixels(); + fDst.reset(fCount); + memset(fDst.get(), 0, fPixmap.rowBytes()); + } + +protected: + bool isSuitableFor(Backend backend) override { + return backend == kNonRendering_Backend && Blender::WorksOnCpu(); + } + const char* onGetName() override { return fName.c_str(); } + void onDraw(int loops, SkCanvas*) override { + SkASSERT(fPixmap.colorType() == kN32_SkColorType); + + const int width = fPixmap.rowBytesAsPixels(); + + for (int i = 0; i < loops * INNER_LOOPS; ++i) { + const uint32_t* src = fPixmap.addr32(); + for (int y = 0; y < fPixmap.height(); y++) { + Blender::BlendN(fDst.get(), width, src); + src += width; + } + } + } + + void onPostDraw(SkCanvas*) override { + // Make sure the compiler does not optimize away the operation. + volatile uint32_t v = 0; + for (int i = 0; i < fCount; i++) { + v ^= fDst[i]; + } + } + +private: + int fCount; + SkAutoTArray<uint32_t> fDst; + SkString fName; + SkPixmap fPixmap; + + typedef Benchmark INHERITED; +}; + +#if defined(SK_CPU_X86) && !defined(SK_BUILD_FOR_IOS) +#define BENCHES(fileName) \ +DEF_BENCH( return new LinearSrcOverBench<SrcOverVSkOptsBruteForce>(fileName); ) \ +DEF_BENCH( return new LinearSrcOverBench<SrcOverVSkOptsTrivial>(fileName); ) \ +DEF_BENCH( return new LinearSrcOverBench<SrcOverVSkOptsNonSimdCore>(fileName); ) \ +DEF_BENCH( return new LinearSrcOverBench<SrcOverVSkOptsDefault>(fileName); ) \ +DEF_BENCH( return new LinearSrcOverBench<SrcOverVSkOptsSSE41>(fileName); ) +#else +#define BENCHES(fileName) \ +DEF_BENCH( return new LinearSrcOverBench<SrcOverVSkOptsBruteForce>(fileName); ) \ +DEF_BENCH( return new LinearSrcOverBench<SrcOverVSkOptsTrivial>(fileName); ) \ +DEF_BENCH( return new LinearSrcOverBench<SrcOverVSkOptsNonSimdCore>(fileName); ) \ +DEF_BENCH( return new LinearSrcOverBench<SrcOverVSkOptsDefault>(fileName); ) +#endif + +BENCHES("yellow_rose.png") +BENCHES("baby_tux.png") +BENCHES("plane.png") +BENCHES("mandrill_512.png") +BENCHES("iconstrip.png") diff --git a/src/opts/SkBlend_opts.h b/src/opts/SkBlend_opts.h index a1067407be..93946438e5 100644 --- a/src/opts/SkBlend_opts.h +++ b/src/opts/SkBlend_opts.h @@ -5,52 +5,233 @@ * found in the LICENSE file. */ +/* +ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q + */ + #ifndef SkBlend_opts_DEFINED #define SkBlend_opts_DEFINED +#include "SkNx.h" +#include "SkPM4fPriv.h" + namespace SK_OPTS_NS { -#if 0 +// An implementation of SrcOver from bytes to bytes in linear space that takes advantage of the +// observation that the 255's cancel. +// invA = 1 - (As / 255); +// +// R = 255 * sqrt((Rs/255)^2 + (Rd/255)^2 * invA) +// => R = 255 * sqrt((Rs^2 + Rd^2 * invA)/255^2) +// => R = sqrt(Rs^2 + Rd^2 * invA) +static inline void blend_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) { + Sk4f s = srgb_to_linear(to_4f(pixel)); + Sk4f d = srgb_to_linear(to_4f(*dst)); + Sk4f invAlpha = 1.0f - Sk4f{s[SkPM4f::A]} * (1.0f / 255.0f); + Sk4f r = linear_to_srgb(s + d * invAlpha) + 0.5f; + *dst = to_4b(r); +} -#else +static inline void srcover_srgb_srgb_1(uint32_t* dst, const uint32_t pixel) { + if ((~pixel & 0xFF000000) == 0) { + *dst = pixel; + } else if ((pixel & 0xFF000000) != 0) { + blend_srgb_srgb_1(dst, pixel); + } +} + +static inline void srcover_srgb_srgb_2(uint32_t* dst, const uint32_t* src) { + srcover_srgb_srgb_1(dst++, *src++); + srcover_srgb_srgb_1(dst, *src); +} + +static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) { + srcover_srgb_srgb_1(dst++, *src++); + srcover_srgb_srgb_1(dst++, *src++); + srcover_srgb_srgb_1(dst++, *src++); + srcover_srgb_srgb_1(dst, *src); +} + +void best_non_simd_srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { + uint64_t* ddst = reinterpret_cast<uint64_t*>(dst); - static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) { - switch (src >> 24) { - case 0x00: return; - case 0xff: *dst = src; return; + while (ndst >0) { + int count = SkTMin(ndst, nsrc); + ndst -= count; + const uint64_t* dsrc = reinterpret_cast<const uint64_t*>(src); + const uint64_t* end = dsrc + (count >> 1); + do { + if ((~*dsrc & 0xFF000000FF000000) == 0) { + do { + *ddst++ = *dsrc++; + } while (dsrc < end && (~*dsrc & 0xFF000000FF000000) == 0); + } else if ((*dsrc & 0xFF000000FF000000) == 0) { + do { + dsrc++; + ddst++; + } while (dsrc < end && (*dsrc & 0xFF000000FF000000) == 0); + } else { + srcover_srgb_srgb_2(reinterpret_cast<uint32_t*>(ddst++), + reinterpret_cast<const uint32_t*>(dsrc++)); + } + } while (dsrc < end); + + if ((count & 1) != 0) { + srcover_srgb_srgb_1(reinterpret_cast<uint32_t*>(ddst), + *reinterpret_cast<const uint32_t*>(dsrc)); + } + } +} + +void brute_force_srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { + while (ndst > 0) { + int n = SkTMin(ndst, nsrc); + + for (int i = 0; i < n; i++) { + blend_srgb_srgb_1(dst++, src[i]); } + ndst -= n; + } +} - Sk4f d = SkNx_cast<float>(Sk4b::Load( dst)), - s = SkNx_cast<float>(Sk4b::Load(&src)); +void trivial_srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { + while (ndst > 0) { + int n = SkTMin(ndst, nsrc); - // Approximate sRGB gamma as 2.0. - Sk4f d_sq = d*d, - s_sq = s*s; - d = Sk4f{d_sq[0], d_sq[1], d_sq[2], d[3]}; - s = Sk4f{s_sq[0], s_sq[1], s_sq[2], s[3]}; + for (int i = 0; i < n; i++) { + srcover_srgb_srgb_1(dst++, src[i]); + } + ndst -= n; + } +} - // SrcOver. - Sk4f invA = 1.0f - s[3]*(1/255.0f); - d = s + d * invA; +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 - // Re-apply approximate sRGB gamma. - Sk4f d_sqrt = d.sqrt(); - d = Sk4f{d_sqrt[0], d_sqrt[1], d_sqrt[2], d[3]}; + static inline __m128i load(const uint32_t* p) { + return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)); + } - SkNx_cast<uint8_t>(d).store(dst); + static inline void store(uint32_t* p, __m128i v) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v); } - static inline void srcover_srgb_srgb(uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { - while (ndst > 0) { - int n = SkTMin(ndst, nsrc); + #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 + + void srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { + const __m128i alphaMask = _mm_set1_epi32(0xFF000000); + while (ndst > 0) { + int count = SkTMin(ndst, nsrc); + ndst -= count; + const uint32_t* src = srcStart; + const uint32_t* end = src + (count & ~3); - for (int i = 0; i < n; i++) { - srcover_srgb_srgb_1(dst++, src[i]); + while (src < end) { + __m128i pixels = load(src); + if (_mm_testc_si128(pixels, alphaMask)) { + do { + store(dst, pixels); + dst += 4; + src += 4; + } while (src < end && _mm_testc_si128(pixels = load(src), alphaMask)); + } else if (_mm_testz_si128(pixels, alphaMask)) { + do { + dst += 4; + src += 4; + } while (src < end && _mm_testz_si128(pixels = load(src), alphaMask)); + } else { + do { + srcover_srgb_srgb_4(dst, src); + dst += 4; + src += 4; + } while (src < end && _mm_testnzc_si128(pixels = load(src), alphaMask)); + } + } + + count = count & 3; + while (count-- > 0) { + srcover_srgb_srgb_1(dst++, *src++); + } } - ndst -= n; } + #else + // SSE2 versions + static inline bool check_opaque_alphas(__m128i pixels) { + int mask = + _mm_movemask_epi8( + _mm_cmpeq_epi32( + _mm_andnot_si128(pixels, _mm_set1_epi32(0xFF000000)), + _mm_setzero_si128())); + return mask == 0xFFFF; + } + + static inline bool check_transparent_alphas(__m128i pixels) { + int mask = + _mm_movemask_epi8( + _mm_cmpeq_epi32( + _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)), + _mm_setzero_si128())); + return mask == 0xFFFF; + } + + static inline bool check_partial_alphas(__m128i pixels) { + __m128i alphas = _mm_and_si128(pixels, _mm_set1_epi32(0xFF000000)); + int mask = + _mm_movemask_epi8( + _mm_cmpeq_epi8( + _mm_srai_epi32(alphas, 8), + alphas)); + return mask == 0xFFFF; + } + + void srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) { + while (ndst > 0) { + int count = SkTMin(ndst, nsrc); + ndst -= count; + const uint32_t* src = srcStart; + const uint32_t* end = src + (count & ~3); + + __m128i pixels = load(src); + do { + if (check_opaque_alphas(pixels)) { + do { + store(dst, pixels); + dst += 4; + src += 4; + } while (src < end && check_opaque_alphas(pixels = load(src))); + } else if (check_transparent_alphas(pixels)) { + const uint32_t* start = src; + do { + src += 4; + } while (src < end && check_transparent_alphas(pixels = load(src))); + dst += src - start; + } else { + do { + srcover_srgb_srgb_4(dst, src); + dst += 4; + src += 4; + } while (src < end && check_partial_alphas(pixels = load(src))); + } + } while (src < end); + + count = count & 3; + while (count-- > 0) { + srcover_srgb_srgb_1(dst++, *src++); + } + } + } + #endif +#else + + void srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) { + trivial_srcover_srgb_srgb(dst, src, ndst, nsrc); } - + #endif } // namespace SK_OPTS_NS diff --git a/src/opts/SkOpts_sse41.cpp b/src/opts/SkOpts_sse41.cpp index 34b078c2ca..f0561a69c6 100644 --- a/src/opts/SkOpts_sse41.cpp +++ b/src/opts/SkOpts_sse41.cpp @@ -10,6 +10,7 @@ #define SK_OPTS_NS sk_sse41 #include "SkBlurImageFilter_opts.h" #include "SkBlitRow_opts.h" +#include "SkBlend_opts.h" #ifndef SK_SUPPORT_LEGACY_X86_BLITS @@ -211,16 +212,16 @@ static void blit_mask_d32_a8(SkPMColor* dst, size_t dstRB, } } } - } // namespace sk_sse41_new #endif namespace SkOpts { void Init_sse41() { - box_blur_xx = sk_sse41::box_blur_xx; - box_blur_xy = sk_sse41::box_blur_xy; - box_blur_yx = sk_sse41::box_blur_yx; + box_blur_xx = sk_sse41::box_blur_xx; + box_blur_xy = sk_sse41::box_blur_xy; + box_blur_yx = sk_sse41::box_blur_yx; + srcover_srgb_srgb = sk_sse41::srcover_srgb_srgb; #ifndef SK_SUPPORT_LEGACY_X86_BLITS blit_row_color32 = sk_sse41_new::blit_row_color32; diff --git a/tests/SkBlend_optsTest.cpp b/tests/SkBlend_optsTest.cpp new file mode 100644 index 0000000000..ee60d5ebdf --- /dev/null +++ b/tests/SkBlend_optsTest.cpp @@ -0,0 +1,134 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include <string> +#include <tuple> +#include <vector> +#include "Resources.h" +#include "SkCpu.h" +#include "SkImage.h" +#include "SkImage_Base.h" +#include "SkOpts.h" +#include "SkNx.h" +#include "Test.h" +#include "../include/core/SkImageInfo.h" + +typedef void (*Blender)(uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc); + +namespace sk_default { +extern void brute_force_srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc); +} + +namespace sk_default { +extern void trivial_srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc); + +extern void best_non_simd_srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc); + +extern void srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc); +} + +#if defined(SK_CPU_X86) && !defined(SK_BUILD_FOR_IOS) +namespace sk_sse41 { +extern void srcover_srgb_srgb( + uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc); +} +#endif + +static SkString missmatch_message(std::string resourceName, std::string name, int x, int y, + uint32_t src, uint32_t good, uint32_t bad) { + return SkStringPrintf( + "%s - %s missmatch at %d, %d src: %08x good: %08x bad: %08x", + resourceName.c_str(), name.c_str(), x, y, src, good, bad); +} + +using Spec = std::tuple<Blender, std::string>; + +static void test_blender( + Spec spec, + std::string resourceName, + skiatest::Reporter* reporter) +{ + Blender blender; + std::string name; + std::tie(blender, name) = spec; + + std::string fileName = resourceName + ".png"; + sk_sp<SkImage> image = GetResourceAsImage(fileName.c_str()); + SkASSERT(image != nullptr); + if (image == nullptr) { + SkFAIL("image is NULL"); + } + SkBitmap bm; + if (!as_IB(image)->getROPixels(&bm)) { + SkFAIL("Could not read resource"); + } + + SkPixmap pixmap; + bm.peekPixels(&pixmap); + SkASSERTF(pixmap.colorType() == kN32_SkColorType, "colorType: %d", pixmap.colorType()); + SkASSERT(pixmap.alphaType() != kUnpremul_SkAlphaType); + const uint32_t* src = pixmap.addr32(); + const int width = pixmap.rowBytesAsPixels(); + SkASSERT(width > 0); + SkASSERT(width < 4000); + SkAutoTArray<uint32_t> correctDst(width); + SkAutoTArray<uint32_t> testDst(width); + + for (int y = 0; y < pixmap.height(); y++) { + memset(correctDst.get(), 0, width * sizeof(uint32_t)); + memset(testDst.get(), 0, width * sizeof(uint32_t)); + sk_default::brute_force_srcover_srgb_srgb(correctDst.get(), src, width, width); + blender(testDst.get(), src, width, width); + for (int x = 0; x < width; x++) { + REPORTER_ASSERT_MESSAGE( + reporter, correctDst[x] == testDst[x], + missmatch_message(resourceName, name, x, y, src[x], correctDst[x], testDst[x])); + if (correctDst[x] != testDst[x]) break; + } + src += width; + } +} + +DEF_TEST(SkBlend_optsCheck, reporter) { + std::vector<Spec> specs = { + Spec{sk_default::trivial_srcover_srgb_srgb, "trivial"}, + Spec{sk_default::best_non_simd_srcover_srgb_srgb, "best_non_simd"}, + Spec{sk_default::srcover_srgb_srgb, "default"}, + }; + #if defined(SK_CPU_X86) && !defined(SK_BUILD_FOR_IOS) + if (SkCpu::Supports(SkCpu::SSE41)) { + specs.push_back(Spec{sk_sse41::srcover_srgb_srgb, "sse41", }); + } + #endif + + std::vector<std::string> testResources = { + "yellow_rose", "baby_tux", "plane", "mandrill_512", "iconstrip" + }; + + for (auto& spec : specs) { + for (auto& resourceName : testResources) { + test_blender(spec, resourceName, reporter); + } + } +} + + + +DEF_TEST(SkBlend_optsSqrtCheck, reporter) { + for (int c = 0; c < 256; c++) { + Sk4f i{(float)c}; + Sk4f ii = i * i; + Sk4f s = ii.sqrt() + 0.5f; + Sk4f sf = s.floor(); + REPORTER_ASSERT_MESSAGE( + reporter, i[0] == sf[0], SkStringPrintf("i: %f, s: %f", i[0], sf[0])); + } +} |