diff options
author | Herb Derby <herb@google.com> | 2017-10-17 10:37:09 -0400 |
---|---|---|
committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2017-10-17 16:06:32 +0000 |
commit | d4a0fc7383546d106db2216515b3753937398ece (patch) | |
tree | e6820bf1486529f2644baacadb1f6fcde347cb42 | |
parent | 9eca72b57bdf2fc008006b43fd8bf37efbfa7517 (diff) |
Use combined three pass code for image blur.
This changes more closely matches the GL output, and the runtimes are similar or
faster for the common cases.
x86_64 times:
benchmark old-Us new-Us old/new
blur_image_filter_large_80.00_80.00 4842.04 2626.10 1.84381
blur_image_filter_small_80.00_80.00 3297.72 854.97 3.85712
blur_image_filter_large_10.00_10.00 930.44 720.50 1.29138
blur_image_filter_small_10.00_10.00 69.96 42.15 1.65979
blur_image_filter_large_1.00_1.00 682.66 521.78 1.30833
blur_image_filter_small_1.00_1.00 19.21 14.43 1.33125
blur_image_filter_large_0.50_0.50 696.17 64.14 10.8539
blur_image_filter_small_0.50_0.50 16.26 5.02 3.23904
arm64 times:
benchmark old-Us new-Us old/new
blur_image_filter_large_80.00_80.00 42144.53 14128.42 2.98296
blur_image_filter_small_80.00_80.00 24840.58 4392.58 5.65512
blur_image_filter_large_10.00_10.00 3556.40 3793.70 0.937449
blur_image_filter_small_10.00_10.00 282.53 220.62 1.28062
blur_image_filter_large_1.00_1.00 2502.20 2937.99 0.851671
blur_image_filter_small_1.00_1.00 83.32 81.93 1.01697
blur_image_filter_large_0.50_0.50 5643.80 272.83 20.6861
blur_image_filter_small_0.50_0.50 141.02 38.29 3.68295
Cq-Include-Trybots: skia.primary:Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Release-All-SKNX_NO_SIMD
Change-Id: Ic53b3186607d5485477b92e4ca7b092bf1366c52
Reviewed-on: https://skia-review.googlesource.com/52771
Commit-Queue: Herb Derby <herb@google.com>
Reviewed-by: Mike Klein <mtklein@google.com>
-rw-r--r-- | src/core/SkBlurImageFilter.cpp | 393 | ||||
-rw-r--r-- | tests/ImageFilterTest.cpp | 10 |
2 files changed, 379 insertions, 24 deletions
diff --git a/src/core/SkBlurImageFilter.cpp b/src/core/SkBlurImageFilter.cpp index 5d9a1cf33a..e9682ff301 100644 --- a/src/core/SkBlurImageFilter.cpp +++ b/src/core/SkBlurImageFilter.cpp @@ -7,11 +7,16 @@ #include "SkBlurImageFilter.h" +#include <algorithm> + +#include "SkArenaAlloc.h" #include "SkAutoPixmapStorage.h" +#include "SkBitmap.h" #include "SkColorData.h" #include "SkColorSpaceXformer.h" #include "SkTFitsIn.h" #include "SkGpuBlurUtils.h" +#include "SkNx.h" #include "SkOpts.h" #include "SkReadBuffer.h" #include "SkSpecialImage.h" @@ -23,13 +28,18 @@ #include "SkGr.h" #endif +// The value where the three pass window calculation results in a zero window. +// N[Solve[sigma*3*Sqrt[2 Pi]/4 == 1/2, sigma], 16] +static constexpr double kZeroWindow = 0.26596152026762; +static constexpr double kPi = 3.14159265358979323846264338327950288; + class SkBlurImageFilterImpl final : public SkImageFilter { public: SkBlurImageFilterImpl(SkScalar sigmaX, - SkScalar sigmaY, - sk_sp<SkImageFilter> input, - const CropRect* cropRect, - SkBlurImageFilter::TileMode tileMode); + SkScalar sigmaY, + sk_sp<SkImageFilter> input, + const CropRect* cropRect, + SkBlurImageFilter::TileMode tileMode); SkRect computeFastBounds(const SkRect&) const override; @@ -162,6 +172,342 @@ static void get_box3_params(SkScalar s, int *kernelSize, int* kernelSize3, int * } } +#if !defined(SK_SUPPORT_LEGACY_BLUR_IMAGE) + +// This is defined by the SVG spec: +// https://drafts.fxtf.org/filter-effects/#feGaussianBlurElement +static int calculate_window(double sigma) { + // NB 136 is the largest sigma that will not cause a buffer full of 255 mask values to overflow + // using the Gauss filter. It also limits the size of buffers used hold intermediate values. + // Explanation of maximums: + // sum0 = window * 255 + // sum1 = window * sum0 -> window * window * 255 + // sum2 = window * sum1 -> window * window * window * 255 -> window^3 * 255 + // + // The value window^3 * 255 must fit in a uint32_t. So, + // window^3 < 2^32. window = 255. + // + // window = floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5) + // For window <= 255, the largest value for sigma is 136. + sigma = SkTPin(sigma, 0.0, 136.0); + auto possibleWindow = static_cast<int>(floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5)); + return std::max(1, possibleWindow); +} + +// Calculating the border is tricky. The border is the distance in pixels between the first dst +// pixel and the first src pixel (or the last src pixel and the last dst pixel). +// I will go through the odd case which is simpler, and then through the even case. Given a +// stack of filters seven wide for the odd case of three passes. +// +// S +// aaaAaaa +// bbbBbbb +// cccCccc +// D +// +// The furthest changed pixel is when the filters are in the following configuration. +// +// S +// aaaAaaa +// bbbBbbb +// cccCccc +// D +// +// The A pixel is calculated using the value S, the B uses A, and the C uses B, and +// finally D is C. So, with a window size of seven the border is nine. In the odd case, the +// border is 3*((window - 1)/2). +// +// For even cases the filter stack is more complicated. The spec specifies two passes +// of even filters and a final pass of odd filters. A stack for a width of six looks like +// this. +// +// S +// aaaAaa +// bbBbbb +// cccCccc +// D +// +// The furthest pixel looks like this. +// +// S +// aaaAaa +// bbBbbb +// cccCccc +// D +// +// For a window of six, the border value is eight. In the even case the border is 3 * +// (window/2) - 1. +static int calculate_border(int window) { + return (window & 1) == 1 ? 3 * ((window - 1) / 2) : 3 * (window / 2) - 1; +} + +static int calculate_buffer(int window) { + int bufferSize = window - 1; + return (window & 1) == 1 ? 3 * bufferSize : 3 * bufferSize + 1; +} + +// blur_one_direction implements the common three pass box filter approximation of Gaussian blur, +// but combines all three passes into a single pass. This approach is facilitated by three circular +// buffers the width of the window which track values for trailing edges of each of the three +// passes. This allows the algorithm to use more precision in the calculation because the values +// are not rounded each pass. And this implementation also avoids a trap that's easy to fall +// into resulting in blending in too many zeroes near the edge. +// +// In general, a window sum has the form: +// sum_n+1 = sum_n + leading_edge - trailing_edge. +// If instead we do the subtraction at the end of the previous iteration, we can just +// calculate the sums instead of having to do the subtractions too. +// +// In previous iteration: +// sum_n+1 = sum_n - trailing_edge. +// +// In this iteration: +// sum_n+1 = sum_n + leading_edge. +// +// Now we can stack all three sums and do them at once. Sum0 gets its leading edge from the +// actual data. Sum1's leading edge is just Sum0, and Sum2's leading edge is Sum1. So, doing the +// three passes at the same time has the form: +// +// sum0_n+1 = sum0_n + leading edge +// sum1_n+1 = sum1_n + sum0_n+1 +// sum2_n+1 = sum2_n + sum1_n+1 +// +// sum2_n+1 / window^3 is the new value of the destination pixel. +// +// Reduce the sums by the trailing edges which were stored in the circular buffers, +// for the next go around. This is the case for odd sized windows, even windows the the third +// circular buffer is one larger then the first two circular buffers. +// +// sum2_n+2 = sum2_n+1 - buffer2[i]; +// buffer2[i] = sum1; +// sum1_n+2 = sum1_n+1 - buffer1[i]; +// buffer1[i] = sum0; +// sum0_n+2 = sum0_n+1 - buffer0[i]; +// buffer0[i] = leading edge +// +// This is all encapsulated in the processValue function below. +// +using Pass0And1 = Sk4u[2]; +// The would be dLeft parameter is assumed to be 0. +static void blur_one_direction(Sk4u* buffer, int window, + int srcLeft, int srcRight, int dstRight, + const uint32_t* src, int srcXStride, int srcYStride, int srcH, + uint32_t* dst, int dstXStride, int dstYStride) { + + // The circular buffers are one less than the window. + auto pass0Count = window - 1, + pass1Count = window - 1, + pass2Count = (window & 1) == 1 ? window - 1 : window; + + Pass0And1* buffer01Start = (Pass0And1*)buffer; + Sk4u* buffer2Start = buffer + pass0Count + pass1Count; + Pass0And1* buffer01End = (Pass0And1*)buffer2Start; + Sk4u* buffer2End = buffer2Start + pass2Count; + + // If the window is odd then the divisor is just window ^ 3 otherwise, + // it is window * window * (window + 1) = window ^ 3 + window ^ 2; + auto window2 = window * window; + auto window3 = window2 * window; + auto divisor = (window & 1) == 1 ? window3 : window3 + window2; + + // NB the sums in the blur code use the following technique to avoid + // adding 1/2 to round the divide. + // + // Sum/d + 1/2 == (Sum + h) / d + // Sum + d(1/2) == Sum + h + // h == (1/2)d + // + // But the d/2 it self should be rounded. + // h == d/2 + 1/2 == (d + 1) / 2 + // + // weight = 1 / d * 2 ^ 32 + auto weight = static_cast<uint32_t>(round(1.0 / divisor * (1ull << 32))); + auto half = static_cast<uint32_t>((divisor + 1) / 2); + + auto border = calculate_border(window); + + // Calculate the start and end of the source pixels with respect to the destination start. + auto srcStart = srcLeft - border, + srcEnd = srcRight - border, + dstEnd = dstRight; + + for (auto y = 0; y < srcH; y++) { + auto buffer01Cursor = buffer01Start; + auto buffer2Cursor = buffer2Start; + + Sk4u sum0{0u}; + Sk4u sum1{0u}; + Sk4u sum2{half}; + + sk_bzero(buffer01Start, (buffer2End - (Sk4u *) (buffer01Start)) * sizeof(*buffer2Start)); + + // Given an expanded input pixel, move the window ahead using the leadingEdge value. + auto processValue = [&](const Sk4u& leadingEdge) -> Sk4u { + sum0 += leadingEdge; + sum1 += sum0; + sum2 += sum1; + + Sk4u value = sum2.mulHi(weight); + + sum2 -= *buffer2Cursor; + *buffer2Cursor = sum1; + buffer2Cursor = (buffer2Cursor + 1) < buffer2End ? buffer2Cursor + 1 : buffer2Start; + + sum1 -= (*buffer01Cursor)[1]; + (*buffer01Cursor)[1] = sum0; + sum0 -= (*buffer01Cursor)[0]; + (*buffer01Cursor)[0] = leadingEdge; + buffer01Cursor = + (buffer01Cursor + 1) < buffer01End ? buffer01Cursor + 1 : buffer01Start; + + return value; + }; + + auto srcIdx = srcStart; + auto dstIdx = 0; + const uint32_t* srcCursor = src; + uint32_t* dstCursor = dst; + + // The destination pixels are not effected by the src pixels, + // change to zero as per the spec. + // https://drafts.fxtf.org/filter-effects/#FilterPrimitivesOverviewIntro + while (dstIdx < srcIdx) { + *dstCursor = 0; + dstCursor += dstXStride; + SK_PREFETCH(dstCursor); + dstIdx++; + } + + // The edge of the source is before the edge of the destination. Calculate the sums for + // the pixels before the start of the destination. + while (dstIdx > srcIdx) { + Sk4u leadingEdge = srcIdx < srcEnd ? SkNx_cast<uint32_t>(Sk4b::Load(srcCursor)) : 0; + (void) processValue(leadingEdge); + srcCursor += srcXStride; + srcIdx++; + } + + // The dstIdx and srcIdx are in sync now; the code just uses the dstIdx for both now. + // Consume the source generating pixels to dst. + auto loopEnd = std::min(dstEnd, srcEnd); + while (dstIdx < loopEnd) { + Sk4u leadingEdge = SkNx_cast<uint32_t>(Sk4b::Load(srcCursor)); + SkNx_cast<uint8_t>(processValue(leadingEdge)).store(dstCursor); + srcCursor += srcXStride; + dstCursor += dstXStride; + SK_PREFETCH(dstCursor); + dstIdx++; + } + + // The leading edge is beyond the end of the source. Assume that the pixels + // are now 0x0000 until the end of the destination. + loopEnd = dstEnd; + while (dstIdx < loopEnd) { + SkNx_cast<uint8_t>(processValue(0u)).store(dstCursor); + dstCursor += dstXStride; + SK_PREFETCH(dstCursor); + dstIdx++; + } + + src += srcYStride; + dst += dstYStride; + } +} + +static sk_sp<SkSpecialImage> combined_pass_blur( + SkVector sigma, + SkSpecialImage* source, const sk_sp<SkSpecialImage>& input, + SkIRect inputBounds, SkIRect dstBounds) { + SkBitmap inputBM; + + if (!input->getROPixels(&inputBM)) { + return nullptr; + } + + if (inputBM.colorType() != kN32_SkColorType) { + return nullptr; + } + + SkBitmap src; + inputBM.extractSubset(&src, inputBounds); + + // Make everything relative to the destination bounds. + inputBounds.offset(-dstBounds.x(), -dstBounds.y()); + dstBounds.offset( -dstBounds.x(), -dstBounds.y()); + + auto windowW = calculate_window(sigma.x()), + windowH = calculate_window(sigma.y()); + + auto srcW = inputBounds.width(), + srcH = inputBounds.height(), + dstW = dstBounds.width(), + dstH = dstBounds.height(); + + SkImageInfo dstInfo = SkImageInfo::Make(dstW, dstH, inputBM.colorType(), inputBM.alphaType()); + + SkBitmap dst; + if (!dst.tryAllocPixels(dstInfo)) { + return nullptr; + } + + auto bufferSizeW = calculate_buffer(windowW), + bufferSizeH = calculate_buffer(windowH); + + // The amount 1024 is enough for buffers up to 10 sigma. The tmp bitmap will be + // allocated on the heap. + SkSTArenaAlloc<1024> alloc; + Sk4u* buffer = alloc.makeArrayDefault<Sk4u>(std::max(bufferSizeW, bufferSizeH)); + + if (windowW > 1 && windowH > 1) { + // Blur both directions. + + auto tmpW = srcH, + tmpH = dstW; + + auto tmp = alloc.makeArrayDefault<uint32_t>(tmpW * tmpH); + + // Blur horizontally, and transpose. + blur_one_direction( + buffer, windowW, + inputBounds.left(), inputBounds.right(), dstBounds.right(), + static_cast<uint32_t*>(src.getPixels()), 1, src.rowBytesAsPixels(), srcH, + tmp, tmpW, 1); + + // Blur vertically (scan in memory order because of the transposition), + // and transpose back to the original orientation. + blur_one_direction( + buffer, windowH, + inputBounds.top(), inputBounds.bottom(), dstBounds.bottom(), + tmp, 1, tmpW, tmpH, + static_cast<uint32_t*>(dst.getPixels()), dst.rowBytesAsPixels(), 1); + } else if (windowW > 1) { + // Blur only horizontally. + + blur_one_direction( + buffer, windowW, + inputBounds.left(), inputBounds.right(), dstBounds.right(), + static_cast<uint32_t*>(src.getPixels()), 1, src.rowBytesAsPixels(), srcH, + static_cast<uint32_t*>(dst.getPixels()), 1, dst.rowBytesAsPixels()); + } else if (windowH > 1) { + // Blur only vertically. + + blur_one_direction( + buffer, windowH, + inputBounds.top(), inputBounds.bottom(), dstBounds.bottom(), + static_cast<uint32_t*>(src.getPixels()), src.rowBytesAsPixels(), 1, srcW, + static_cast<uint32_t*>(dst.getPixels()), dst.rowBytesAsPixels(), 1); + } else { + // Nothing to do. + + return input->makeSubset(inputBounds); + } + + return SkSpecialImage::MakeFromRaster(SkIRect::MakeWH(dstBounds.width(), + dstBounds.height()), + dst, &source->props()); +} +#endif + sk_sp<SkSpecialImage> SkBlurImageFilterImpl::onFilterImage(SkSpecialImage* source, const Context& ctx, SkIPoint* offset) const { @@ -207,12 +553,16 @@ sk_sp<SkSpecialImage> SkBlurImageFilterImpl::onFilterImage(SkSpecialImage* sourc } else #endif { - #if defined(SK_SUPPORT_LEGACY_BLUR_IMAGE) - result = this->cpuFilter(source, sigma, input, inputBounds, dstBounds); - #else - // The new code will go here. - result = this->cpuFilter(source, sigma, input, inputBounds, dstBounds); - #endif + // If both sigmas will result in a zero width window, there is nothing to do. + if (sigma.x() < kZeroWindow && sigma.y() < kZeroWindow) { + result = input->makeSubset(inputBounds); + } else { + #if defined(SK_SUPPORT_LEGACY_BLUR_IMAGE) + result = this->cpuFilter(source, sigma, input, inputBounds, dstBounds); + #else + result = combined_pass_blur(sigma, source, input, inputBounds, dstBounds); + #endif + } } // Return the resultOffset if the blur succeeded. @@ -235,8 +585,8 @@ sk_sp<SkSpecialImage> SkBlurImageFilterImpl::gpuFilter( // The raw cross arm value c = E^-s // The normalized cross arm value = c/n // N[Solve[{c/n == 1/2048, sigma > 0}, sigma], 16] - static constexpr double kCrossTooSmall = 0.2561130112451658; - if (sigma.x() < kCrossTooSmall && sigma.y() < kCrossTooSmall) { + static constexpr double kZeroWindowGPU = 0.2561130112451658; + if (sigma.x() < kZeroWindowGPU && sigma.y() < kZeroWindowGPU) { return input->makeSubset(inputBounds); } @@ -280,13 +630,6 @@ sk_sp<SkSpecialImage> SkBlurImageFilterImpl::cpuFilter( SkVector sigma, const sk_sp<SkSpecialImage> &input, SkIRect inputBounds, SkIRect dstBounds) const { - // If both sigmas will result in a zero width window, there is nothing to do. - // N[Solve[sigma*3*Sqrt[2 Pi]/4 == 1/2, sigma], 16] - static constexpr double kZeroWindow = 0.2659615202676218; - if (sigma.x() < kZeroWindow && sigma.y() < kZeroWindow) { - return input->makeSubset(inputBounds); - } - int kernelSizeX, kernelSizeX3, lowOffsetX, highOffsetX; int kernelSizeY, kernelSizeY3, lowOffsetY, highOffsetY; get_box3_params(sigma.x(), &kernelSizeX, &kernelSizeX3, &lowOffsetX, &highOffsetX); @@ -376,14 +719,26 @@ const { SkRect SkBlurImageFilterImpl::computeFastBounds(const SkRect& src) const { SkRect bounds = this->getInput(0) ? this->getInput(0)->computeFastBounds(src) : src; +#if defined(SK_SUPPORT_LEGACY_BLUR_IMAGE) bounds.outset(fSigma.width() * 3, fSigma.height() * 3); +#else + auto borderW = calculate_border(calculate_window(fSigma.width())), + borderH = calculate_border(calculate_window(fSigma.height())); + bounds.outset(borderW, borderH); +#endif return bounds; } SkIRect SkBlurImageFilterImpl::onFilterNodeBounds(const SkIRect& src, const SkMatrix& ctm, MapDirection) const { SkVector sigma = map_sigma(fSigma, ctm); +#if defined(SK_SUPPORT_LEGACY_BLUR_IMAGE) return src.makeOutset(SkScalarCeilToInt(sigma.x() * 3), SkScalarCeilToInt(sigma.y() * 3)); +#else + auto borderW = calculate_border(calculate_window(sigma.x())), + borderH = calculate_border(calculate_window(sigma.y())); + return src.makeOutset(borderW, borderH); +#endif } #ifndef SK_IGNORE_TO_STRING diff --git a/tests/ImageFilterTest.cpp b/tests/ImageFilterTest.cpp index c39cc2ee83..77175f7cdf 100644 --- a/tests/ImageFilterTest.cpp +++ b/tests/ImageFilterTest.cpp @@ -853,7 +853,7 @@ DEF_TEST(ImageFilterBlurThenShadowBounds, reporter) { sk_sp<SkImageFilter> filter2(make_drop_shadow(std::move(filter1))); SkIRect bounds = SkIRect::MakeXYWH(0, 0, 100, 100); - SkIRect expectedBounds = SkIRect::MakeXYWH(-133, -133, 236, 236); + SkIRect expectedBounds = SkIRect::MakeXYWH(-132, -132, 234, 234); bounds = filter2->filterBounds(bounds, SkMatrix::I()); REPORTER_ASSERT(reporter, bounds == expectedBounds); @@ -864,7 +864,7 @@ DEF_TEST(ImageFilterShadowThenBlurBounds, reporter) { sk_sp<SkImageFilter> filter2(make_blur(std::move(filter1))); SkIRect bounds = SkIRect::MakeXYWH(0, 0, 100, 100); - SkIRect expectedBounds = SkIRect::MakeXYWH(-133, -133, 236, 236); + SkIRect expectedBounds = SkIRect::MakeXYWH(-132, -132, 234, 234); bounds = filter2->filterBounds(bounds, SkMatrix::I()); REPORTER_ASSERT(reporter, bounds == expectedBounds); @@ -893,7 +893,7 @@ DEF_TEST(ImageFilterScaledBlurRadius, reporter) { scaleMatrix.setScale(2, 2); SkIRect bounds = SkIRect::MakeLTRB(0, 0, 200, 200); - SkIRect expectedBlurBounds = SkIRect::MakeLTRB(-6, -6, 206, 206); + SkIRect expectedBlurBounds = SkIRect::MakeLTRB(-5, -5, 205, 205); SkIRect blurBounds = blur->filterBounds( bounds, scaleMatrix, SkImageFilter::kForward_MapDirection); REPORTER_ASSERT(reporter, blurBounds == expectedBlurBounds); @@ -918,7 +918,7 @@ DEF_TEST(ImageFilterScaledBlurRadius, reporter) { scaleMatrix.setScale(1, -1); SkIRect bounds = SkIRect::MakeLTRB(0, -100, 100, 0); - SkIRect expectedBlurBounds = SkIRect::MakeLTRB(-3, -103, 103, 3); + SkIRect expectedBlurBounds = SkIRect::MakeLTRB(-2, -102, 102, 2); SkIRect blurBounds = blur->filterBounds( bounds, scaleMatrix, SkImageFilter::kForward_MapDirection); REPORTER_ASSERT(reporter, blurBounds == expectedBlurBounds); @@ -947,7 +947,7 @@ DEF_TEST(ImageFilterComposedBlurFastBounds, reporter) { SkRect boundsSrc = SkRect::MakeWH(SkIntToScalar(100), SkIntToScalar(100)); SkRect expectedBounds = SkRect::MakeXYWH( - SkIntToScalar(-6), SkIntToScalar(-6), SkIntToScalar(112), SkIntToScalar(112)); + SkIntToScalar(-4), SkIntToScalar(-4), SkIntToScalar(108), SkIntToScalar(108)); SkRect boundsDst = composedFilter->computeFastBounds(boundsSrc); REPORTER_ASSERT(reporter, boundsDst == expectedBounds); |