/* * Copyright 2017 Google Inc. * * Use of this source code is governed by a BSD-style license that can be * found in the LICENSE file. */ #include "SkMaskBlurFilter.h" #include #include #include "SkArenaAlloc.h" #include "SkGaussFilter.h" #include "SkNx.h" #include "SkSafeMath.h" static const double kPi = 3.14159265358979323846264338327950288; static constexpr double kSmallSigma = 2.0; class BlurScanInterface { public: virtual ~BlurScanInterface() = default; virtual void blur(const uint8_t* src, int srcStride, const uint8_t* srcEnd, uint8_t* dst, int dstStride, uint8_t* dstEnd) const = 0; virtual bool canBlur4() { return false; } virtual void blur4Transpose( const uint8_t* src, int srcStride, const uint8_t* srcEnd, uint8_t* dst, int dstStride, uint8_t* dstEnd) const { SK_ABORT("This should not be called."); } }; class PlanningInterface { public: virtual ~PlanningInterface() = default; virtual size_t bufferSize() const = 0; virtual int border() const = 0; virtual bool needsBlur() const = 0; virtual BlurScanInterface* makeBlurScan( SkArenaAlloc* alloc, int width, uint32_t* buffer) const = 0; }; class None final : public PlanningInterface { public: None() = default; size_t bufferSize() const override { return 0; } int border() const override { return 0; } bool needsBlur() const override { return false; } BlurScanInterface* makeBlurScan( SkArenaAlloc* alloc, int width, uint32_t* buffer) const override { SK_ABORT("Should never be called."); return nullptr; } }; class PlanBox final : public PlanningInterface { public: explicit PlanBox(double sigma) { // Calculate the radius from sigma. Taken from the old code until something better is // figured out. auto possibleRadius = 1.5 * sigma - 0.5; auto radius = std::max(std::numeric_limits::epsilon(), possibleRadius); auto outerRadius = std::ceil(radius); auto outerWindow = 2 * outerRadius + 1; auto outerFactor = (1 - (outerRadius - radius)) / outerWindow; fOuterWeight = static_cast(round(outerFactor * (1ull << 24))); auto innerRadius = outerRadius - 1; auto innerWindow = 2 * innerRadius + 1; auto innerFactor = (1 - (radius - innerRadius)) / innerWindow; fInnerWeight = static_cast(round(innerFactor * (1ull << 24))); // Sliding window is defined by the relationship between the outer and inner widows. // In the single window case, you add the element on the right, and subtract the element on // the left. But, because two windows are used, this relationship is more complicated; an // element is added from the right of the outer window, and subtracted from the left of the // inner window. Because innerWindow = outerWindow - 2, the distance between // the left and right in the two window case is outerWindow - 1. fSlidingWindow = static_cast(outerWindow - 1); } size_t bufferSize() const override { return fSlidingWindow * (sizeof(Sk4u) / sizeof(uint32_t)); } // Remember that sliding window = window - 1. Therefore, radius = sliding window / 2. int border() const override { return fSlidingWindow / 2; } bool needsBlur() const override { return true; } BlurScanInterface* makeBlurScan( SkArenaAlloc* alloc, int width, uint32_t* buffer) const override { int noChangeCount, trailingEdgeZeroCount; // The relation between the slidingWindow and the width dictates two operating modes. // * width >= slidingWindow - both sides of the window are contained in the image while // scanning. Therefore, we assume that slidingWindow zeros are consumed on the trailing // edge of the window. After this count, then both edges are traversing the image. // * slidingWindow > width - both sides of the window are off the image while scanning // the middle. The front edge of the window can only travel width until it falls off the // image. At this point, both edges of the window are off the image consuming zeros // and therefore, the destination value does not change. The scan produces unchanged // values until the trailing edge of the window enters the image. This count is // slidingWindow - width. if (width >= fSlidingWindow) { noChangeCount = 0; trailingEdgeZeroCount = fSlidingWindow; } else { noChangeCount = fSlidingWindow - width; trailingEdgeZeroCount = width; } Sk4u* sk4uBuffer = reinterpret_cast(buffer); return alloc->make(fOuterWeight, fInnerWeight, noChangeCount, trailingEdgeZeroCount, sk4uBuffer, sk4uBuffer + fSlidingWindow); } private: class Box final : public BlurScanInterface { public: Box(uint32_t outerWeight, uint32_t innerWeight, int noChangeCount, int trailingEdgeZeroCount, Sk4u* buffer, Sk4u* bufferEnd) : fOuterWeight{outerWeight} , fInnerWeight{innerWeight} , fNoChangeCount{noChangeCount} , fTrailingEdgeZeroCount{trailingEdgeZeroCount} , fBuffer{buffer} , fBufferEnd{bufferEnd} { } void blur(const uint8_t* src, int srcStride, const uint8_t* srcEnd, uint8_t* dst, int dstStride, uint8_t* dstEnd) const override { auto rightOuter = src; auto dstCursor = dst; auto interpolateSums = [this](uint32_t outerSum, uint32_t innerSum) { return SkTo( (fOuterWeight * outerSum + fInnerWeight * innerSum + kHalf) >> 24); }; uint32_t outerSum = 0; uint32_t innerSum = 0; for (int i = 0; i < fTrailingEdgeZeroCount; i++) { innerSum = outerSum; outerSum += *rightOuter; *dstCursor = interpolateSums(outerSum, innerSum); rightOuter += srcStride; dstCursor += dstStride; } // slidingWindow > width for (int i = 0; i < fNoChangeCount; i++) { *dstCursor = interpolateSums(outerSum, innerSum);; dstCursor += dstStride; } // width > slidingWindow auto leftInner = src; while (rightOuter < srcEnd) { innerSum = outerSum - *leftInner; outerSum += *rightOuter; *dstCursor = interpolateSums(outerSum, innerSum); outerSum -= *leftInner; rightOuter += srcStride; leftInner += srcStride; dstCursor += dstStride; } auto leftOuter = srcEnd; dstCursor = dstEnd; outerSum = 0; for (int i = 0; i < fTrailingEdgeZeroCount; i++) { leftOuter -= srcStride; dstCursor -= dstStride; innerSum = outerSum; outerSum += *leftOuter; *dstCursor = interpolateSums(outerSum, innerSum); } } bool canBlur4() override { return true; } // NB this is a transposing scan. The next src is src+1, and the next down is // src+srcStride. void blur4Transpose( const uint8_t* src, int srcStride, const uint8_t* srcEnd, uint8_t* dst, int dstStride, uint8_t* dstEnd) const override { auto rightOuter = src; auto dstCursor = dst; Sk4u* const bufferStart = fBuffer; Sk4u* bufferCursor = bufferStart; Sk4u* const bufferEnd = fBufferEnd; const Sk4u outerWeight(SkTo(fOuterWeight)); const Sk4u innerWeight(SkTo(fInnerWeight)); auto load = [](const uint8_t* cursor, int stride) -> Sk4u { return Sk4u(cursor[0*stride], cursor[1*stride], cursor[2*stride], cursor[3*stride]); }; auto interpolateSums = [&] (const Sk4u& outerSum, const Sk4u& innerSum) { return SkNx_cast( (outerSum * outerWeight + innerSum * innerWeight + kHalf) >> 24); }; Sk4u outerSum = 0; Sk4u innerSum = 0; for (int i = 0; i < fTrailingEdgeZeroCount; i++) { innerSum = outerSum; Sk4u leadingEdge = load(rightOuter, srcStride); outerSum += leadingEdge; Sk4b blurred = interpolateSums(outerSum, innerSum); blurred.store(dstCursor); leadingEdge.store(bufferCursor); bufferCursor = (bufferCursor + 1) < bufferEnd ? bufferCursor + 1 : bufferStart; rightOuter += 1; dstCursor += dstStride; } // slidingWindow > width for (int i = 0; i < fNoChangeCount; i++) { Sk4b blurred = interpolateSums(outerSum, innerSum); blurred.store(dstCursor); dstCursor += dstStride; } // width > slidingWindow auto leftInner = src; while (rightOuter < srcEnd) { Sk4u trailEdge = Sk4u::Load(bufferCursor); Sk4u leadingEdge = load(rightOuter, srcStride); innerSum = outerSum - trailEdge; outerSum += leadingEdge; Sk4b blurred = interpolateSums(outerSum, innerSum); blurred.store(dstCursor); outerSum -= trailEdge; leadingEdge.store(bufferCursor); bufferCursor = (bufferCursor + 1) < bufferEnd ? bufferCursor + 1 : bufferStart; rightOuter += 1; leftInner += 1; dstCursor += dstStride; } auto leftOuter = srcEnd; dstCursor = dstEnd; outerSum = 0; for (int i = 0; i < fTrailingEdgeZeroCount; i++) { leftOuter -= 1; dstCursor -= dstStride; innerSum = outerSum; outerSum += load(leftOuter, srcStride); Sk4b blurred = interpolateSums(outerSum, innerSum); blurred.store(dstCursor); } } private: static constexpr uint32_t kHalf = static_cast(1) << 23; const uint32_t fOuterWeight; const uint32_t fInnerWeight; const int fNoChangeCount; const int fTrailingEdgeZeroCount; Sk4u* const fBuffer; Sk4u* const fBufferEnd; }; private: uint32_t fOuterWeight; uint32_t fInnerWeight; int fSlidingWindow; }; class PlanGauss final : public PlanningInterface { public: explicit PlanGauss(double sigma) { auto possibleWindow = static_cast(floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5)); auto window = std::max(1, possibleWindow); fPass0Size = window - 1; fPass1Size = window - 1; fPass2Size = (window & 1) == 1 ? window - 1 : window; // Calculating the border is tricky. I will go through the odd case which is simpler, and // then through the even case. Given a stack of filters seven wide for the odd case of // three passes. // // S // aaaAaaa // bbbBbbb // cccCccc // D // // The furthest changed pixel is when the filters are in the following configuration. // // S // aaaAaaa // bbbBbbb // cccCccc // D // // The A pixel is calculated using the value S, the B uses A, and the C uses B, and // finally D is C. So, with a window size of seven the border is nine. In general, the // border is 3*((window - 1)/2). // // For even cases the filter stack is more complicated. The spec specifies two passes // of even filters and a final pass of odd filters. A stack for a width of six looks like // this. // // S // aaaAaa // bbBbbb // cccCccc // D // // The furthest pixel looks like this. // // S // aaaAaa // bbBbbb // cccCccc // D // // For a window of size, the border value is seven. In general the border is 3 * // (window/2) -1. fBorder = (window & 1) == 1 ? 3 * ((window - 1) / 2) : 3 * (window / 2) - 1; fSlidingWindow = 2 * fBorder + 1; // If the window is odd then the divisor is just window ^ 3 otherwise, // it is window * window * (window + 1) = window ^ 2 + window ^ 3; auto window2 = window * window; auto window3 = window2 * window; auto divisor = (window & 1) == 1 ? window3 : window3 + window2; fWeight = static_cast(round(1.0 / divisor * (1ull << 32))); } size_t bufferSize() const override { return fPass0Size + fPass1Size + fPass2Size; } int border() const override { return fBorder; } bool needsBlur() const override { return true; } BlurScanInterface* makeBlurScan( SkArenaAlloc* alloc, int width, uint32_t* buffer) const override { uint32_t* buffer0, *buffer0End, *buffer1, *buffer1End, *buffer2, *buffer2End; buffer0 = buffer; buffer0End = buffer1 = buffer0 + fPass0Size; buffer1End = buffer2 = buffer1 + fPass1Size; buffer2End = buffer2 + fPass2Size; int noChangeCount = fSlidingWindow > width ? fSlidingWindow - width : 0; return alloc->make( fWeight, noChangeCount, buffer0, buffer0End, buffer1, buffer1End, buffer2, buffer2End); } public: class Gauss final : public BlurScanInterface { public: Gauss(uint64_t weight, int noChangeCount, uint32_t* buffer0, uint32_t* buffer0End, uint32_t* buffer1, uint32_t* buffer1End, uint32_t* buffer2, uint32_t* buffer2End) : fWeight{weight} , fNoChangeCount{noChangeCount} , fBuffer0{buffer0} , fBuffer0End{buffer0End} , fBuffer1{buffer1} , fBuffer1End{buffer1End} , fBuffer2{buffer2} , fBuffer2End{buffer2End} { } void blur(const uint8_t* src, int srcStride, const uint8_t* srcEnd, uint8_t* dst, int dstStride, uint8_t* dstEnd) const override { auto buffer0Cursor = fBuffer0; auto buffer1Cursor = fBuffer1; auto buffer2Cursor = fBuffer2; std::memset(fBuffer0, 0x00, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0)); uint32_t sum0 = 0; uint32_t sum1 = 0; uint32_t sum2 = 0; // Consume the source generating pixels. for (auto srcCursor = src; srcCursor < srcEnd; dst += dstStride, srcCursor += srcStride) { uint32_t leadingEdge = *srcCursor; sum0 += leadingEdge; sum1 += sum0; sum2 += sum1; *dst = this->finalScale(sum2); sum2 -= *buffer2Cursor; *buffer2Cursor = sum1; buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2; sum1 -= *buffer1Cursor; *buffer1Cursor = sum0; buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1; sum0 -= *buffer0Cursor; *buffer0Cursor = leadingEdge; buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0; } // The leading edge is off the right side of the mask. for (int i = 0; i < fNoChangeCount; i++) { uint32_t leadingEdge = 0; sum0 += leadingEdge; sum1 += sum0; sum2 += sum1; *dst = this->finalScale(sum2); sum2 -= *buffer2Cursor; *buffer2Cursor = sum1; buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2; sum1 -= *buffer1Cursor; *buffer1Cursor = sum0; buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1; sum0 -= *buffer0Cursor; *buffer0Cursor = leadingEdge; buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0; dst += dstStride; } // Starting from the right, fill in the rest of the buffer. std::memset(fBuffer0, 0, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0)); sum0 = sum1 = sum2 = 0; uint8_t* dstCursor = dstEnd; const uint8_t* srcCursor = srcEnd; while (dstCursor > dst) { dstCursor -= dstStride; srcCursor -= srcStride; uint32_t leadingEdge = *srcCursor; sum0 += leadingEdge; sum1 += sum0; sum2 += sum1; *dstCursor = this->finalScale(sum2); sum2 -= *buffer2Cursor; *buffer2Cursor = sum1; buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2; sum1 -= *buffer1Cursor; *buffer1Cursor = sum0; buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1; sum0 -= *buffer0Cursor; *buffer0Cursor = leadingEdge; buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0; } } private: static constexpr uint64_t kHalf = static_cast(1) << 31; uint8_t finalScale(uint32_t sum) const { return SkTo((fWeight * sum + kHalf) >> 32); } uint64_t fWeight; int fNoChangeCount; uint32_t* fBuffer0; uint32_t* fBuffer0End; uint32_t* fBuffer1; uint32_t* fBuffer1End; uint32_t* fBuffer2; uint32_t* fBuffer2End; }; uint64_t fWeight; int fBorder; int fSlidingWindow; int fPass0Size; int fPass1Size; int fPass2Size; }; static PlanningInterface* make_plan(SkArenaAlloc* alloc, double sigma) { PlanningInterface* plan = nullptr; if (3 * sigma <= 1) { plan = alloc->make(); } else if (sigma < kSmallSigma) { plan = alloc->make(sigma); } else { plan = alloc->make(sigma); } return plan; }; // NB 136 is the largest sigma that will not cause a buffer full of 255 mask values to overflow // using the Gauss filter. It also limits the size of buffers used hold intermediate values. // Explanation of maximums: // sum0 = window * 255 // sum1 = window * sum0 -> window * window * 255 // sum2 = window * sum1 -> window * window * window * 255 -> window^3 * 255 // // The value window^3 * 255 must fit in a uint32_t. So, // window^3 < 2^32. window = 255. // // window = floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5) // For window <= 255, the largest value for sigma is 136. SkMaskBlurFilter::SkMaskBlurFilter(double sigmaW, double sigmaH) : fSigmaW{SkTPin(sigmaW, 0.0, 136.0)} , fSigmaH{SkTPin(sigmaH, 0.0, 136.0)} { SkASSERT(sigmaW >= 0); SkASSERT(sigmaH >= 0); } bool SkMaskBlurFilter::hasNoBlur() const { return (3 * fSigmaW <= 1) && (3 * fSigmaH <= 1); } static SkMask prepare_destination(int radiusX, int radiusY, const SkMask& src) { SkSafeMath safe; SkMask dst; // dstW = srcW + 2 * radiusX; size_t dstW = safe.add(src.fBounds.width(), safe.add(radiusX, radiusX)); // dstH = srcH + 2 * radiusY; size_t dstH = safe.add(src.fBounds.height(), safe.add(radiusY, radiusY)); dst.fBounds.set(0, 0, SkTo(dstW), SkTo(dstH)); dst.fBounds.offset(src.fBounds.x(), src.fBounds.y()); dst.fBounds.offset(-radiusX, -radiusY); dst.fImage = nullptr; dst.fRowBytes = SkTo(dstW); dst.fFormat = SkMask::kA8_Format; size_t toAlloc = safe.mul(dstW, dstH); if (safe && src.fImage != nullptr) { dst.fImage = SkMask::AllocImage(toAlloc); } return dst; } #if !defined(SK_USE_LEGACY_INTERP_BLUR) static constexpr uint16_t _____ = 0u; static constexpr uint16_t kHalf = 0x80u; static SK_ALWAYS_INLINE Sk8h load(const uint8_t* from, int width) { uint8_t buffer[8]; if (width < 8) { sk_bzero(buffer, sizeof(buffer)); for (int i = 0; i < width; i++) { buffer[i] = from[i]; } from = buffer; } auto v = SkNx_cast(Sk8b::Load(from)); // Convert from 0-255 to 8.8 encoding. return v << 8; }; static SK_ALWAYS_INLINE void store(uint8_t* to, const Sk8h& v, int width) { Sk8b b = SkNx_cast(v >> 8); if (width == 8) { b.store(to); } else { uint8_t buffer[8]; b.store(buffer); for (int i = 0; i < width; i++) { to[i] = buffer[i]; } } }; // In all the blur_x_radius_N and blur_y_radius_N functions the gaussian values are encoded // in 0.16 format, none of the values is greater than one. The incoming mask values are in 8.8 // format. The resulting multiply has a 8.24 format, by the mulhi truncates the lower 16 bits // resulting in a 8.8 format. // // The blur_x_radius_N function below blur along a row of pixels using a kernel with radius N. This // system is setup to minimize the number of multiplies needed. // // Explanation: // Blurring a specific mask value is given by the following equation where D_n is the resulting // mask value and S_n is the source value. The example below is for a filter with a radius of 1 // and a width of 3 (radius == (width-1)/2). The indexes for the source and destination are // aligned. The filter is given by G_n where n is the symmetric filter value. // // D[n] = S[n-1]*G[1] + S[n]*G[0] + S[n+1]*G[1]. // // We can start the source index at an offset relative to the destination separated by the // radius. This results in a non-traditional restating of the above filter. // // D[n] = S[n]*G[1] + S[n+1]*G[0] + S[n+2]*G[1] // // If we look at three specific consecutive destinations the following equations result: // // D[5] = S[5]*G[1] + S[6]*G[0] + S[7]*G[1] // D[7] = S[6]*G[1] + S[7]*G[0] + S[8]*G[1] // D[8] = S[7]*G[1] + S[8]*G[0] + S[9]*G[1]. // // In the above equations, notice that S[7] is used in all three. In particular, two values are // used: S[7]*G[0] and S[7]*G[1]. So, S[7] is only multiplied twice, but used in D[5], D[6] and // D[7]. // // From the point of view of a source value we end up with the following three equations. // // Given S[7]: // D[5] += S[7]*G[1] // D[6] += S[7]*G[0] // D[7] += S[7]*G[1] // // In General: // D[n] += S[n]*G[1] // D[n+1] += S[n]*G[0] // D[n+2] += S[n]*G[1] // // Now these equations can be ganged using SIMD to form: // D[n..n+7] += S[n..n+7]*G[1] // D[n+1..n+8] += S[n..n+7]*G[0] // D[n+2..n+9] += S[n..n+7]*G[1] // The next set of values becomes. // D[n+8..n+15] += S[n+8..n+15]*G[1] // D[n+9..n+16] += S[n+8..n+15]*G[0] // D[n+10..n+17] += S[n+8..n+15]*G[1] // You can see that the D[n+8] and D[n+9] values overlap the two sets, using parts of both // S[n..7] and S[n+8..n+15]. // // Just one more transformation allows the code to maintain all working values in // registers. I introduce the notation {0, S[n..n+7] * G[k]} to mean that the value where 0 is // prepended to the array of values to form {0, S[n] * G[k], ..., S[n+7]*G[k]}. // // D[n..n+7] += S[n..n+7] * G[1] // D[n..n+8] += {0, S[n..n+7] * G[0]} // D[n..n+9] += {0, 0, S[n..n+7] * G[1]} // // Now we can encode D[n..n+7] in a single Sk8h register called d0, and D[n+8..n+15] in a // register d8. In addition, S[0..n+7] becomes s0. // // The translation of the {0, S[n..n+7] * G[k]} is translated in the following way below. // // Sk8h v0 = s0*G[0] // Sk8h v1 = s0*G[1] // /* D[n..n+7] += S[n..n+7] * G[1] */ // d0 += v1; // /* D[n..n+8] += {0, S[n..n+7] * G[0]} */ // d0 += {_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]} // d1 += {v0[7], _____, _____, _____, _____, _____, _____, _____} // /* D[n..n+9] += {0, 0, S[n..n+7] * G[1]} */ // d0 += {_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]} // d1 += {v1[6], v1[7], _____, _____, _____, _____, _____, _____} // Where we rely on the compiler to generate efficient code for the {____, n, ....} notation. static SK_ALWAYS_INLINE void blur_x_radius_1( const Sk8h& s0, const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&, Sk8h* d0, Sk8h* d8) { auto v1 = s0.mulHi(g1); auto v0 = s0.mulHi(g0); // D[n..n+7] += S[n..n+7] * G[1] *d0 += v1; //D[n..n+8] += {0, S[n..n+7] * G[0]} *d0 += Sk8h{_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]}; *d8 += Sk8h{v0[7], _____, _____, _____, _____, _____, _____, _____}; // D[n..n+9] += {0, 0, S[n..n+7] * G[1]} *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]}; *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____}; } static SK_ALWAYS_INLINE void blur_x_radius_2( const Sk8h& s0, const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&, Sk8h* d0, Sk8h* d8) { auto v0 = s0.mulHi(g0); auto v1 = s0.mulHi(g1); auto v2 = s0.mulHi(g2); // D[n..n+7] += S[n..n+7] * G[2] *d0 += v2; // D[n..n+8] += {0, S[n..n+7] * G[1]} *d0 += Sk8h{_____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5], v1[6]}; *d8 += Sk8h{v1[7], _____, _____, _____, _____, _____, _____, _____}; // D[n..n+9] += {0, 0, S[n..n+7] * G[0]} *d0 += Sk8h{_____, _____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5]}; *d8 += Sk8h{v0[6], v0[7], _____, _____, _____, _____, _____, _____}; // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[1]} *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]}; *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____}; // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[2]} *d0 += Sk8h{_____, _____, _____, _____, v2[0], v2[1], v2[2], v2[3]}; *d8 += Sk8h{v2[4], v2[5], v2[6], v2[7], _____, _____, _____, _____}; } static SK_ALWAYS_INLINE void blur_x_radius_3( const Sk8h& s0, const Sk8h& gauss0, const Sk8h& gauss1, const Sk8h& gauss2, const Sk8h& gauss3, const Sk8h&, Sk8h* d0, Sk8h* d8) { auto v0 = s0.mulHi(gauss0); auto v1 = s0.mulHi(gauss1); auto v2 = s0.mulHi(gauss2); auto v3 = s0.mulHi(gauss3); // D[n..n+7] += S[n..n+7] * G[3] *d0 += v3; // D[n..n+8] += {0, S[n..n+7] * G[2]} *d0 += Sk8h{_____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5], v2[6]}; *d8 += Sk8h{v2[7], _____, _____, _____, _____, _____, _____, _____}; // D[n..n+9] += {0, 0, S[n..n+7] * G[1]} *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]}; *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____}; // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[0]} *d0 += Sk8h{_____, _____, _____, v0[0], v0[1], v0[2], v0[3], v0[4]}; *d8 += Sk8h{v0[5], v0[6], v0[7], _____, _____, _____, _____, _____}; // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[1]} *d0 += Sk8h{_____, _____, _____, _____, v1[0], v1[1], v1[2], v1[3]}; *d8 += Sk8h{v1[4], v1[5], v1[6], v1[7], _____, _____, _____, _____}; // D[n..n+12] += {0, 0, 0, 0, 0, S[n..n+7] * G[2]} *d0 += Sk8h{_____, _____, _____, _____, _____, v2[0], v2[1], v2[2]}; *d8 += Sk8h{v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____, _____}; // D[n..n+13] += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]} *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v3[0], v3[1]}; *d8 += Sk8h{v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____, _____}; } static SK_ALWAYS_INLINE void blur_x_radius_4( const Sk8h& s0, const Sk8h& gauss0, const Sk8h& gauss1, const Sk8h& gauss2, const Sk8h& gauss3, const Sk8h& gauss4, Sk8h* d0, Sk8h* d8) { auto v0 = s0.mulHi(gauss0); auto v1 = s0.mulHi(gauss1); auto v2 = s0.mulHi(gauss2); auto v3 = s0.mulHi(gauss3); auto v4 = s0.mulHi(gauss4); // D[n..n+7] += S[n..n+7] * G[4] *d0 += v4; // D[n..n+8] += {0, S[n..n+7] * G[3]} *d0 += Sk8h{_____, v3[0], v3[1], v3[2], v3[3], v3[4], v3[5], v3[6]}; *d8 += Sk8h{v3[7], _____, _____, _____, _____, _____, _____, _____}; // D[n..n+9] += {0, 0, S[n..n+7] * G[2]} *d0 += Sk8h{_____, _____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5]}; *d8 += Sk8h{v2[6], v2[7], _____, _____, _____, _____, _____, _____}; // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[1]} *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]}; *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____}; // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[0]} *d0 += Sk8h{_____, _____, _____, _____, v0[0], v0[1], v0[2], v0[3]}; *d8 += Sk8h{v0[4], v0[5], v0[6], v0[7], _____, _____, _____, _____}; // D[n..n+12] += {0, 0, 0, 0, 0, S[n..n+7] * G[1]} *d0 += Sk8h{_____, _____, _____, _____, _____, v1[0], v1[1], v1[2]}; *d8 += Sk8h{v1[3], v1[4], v1[5], v1[6], v1[7], _____, _____, _____}; // D[n..n+13] += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[2]} *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v2[0], v2[1]}; *d8 += Sk8h{v2[2], v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____}; // D[n..n+14] += {0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]} *d0 += Sk8h{_____, _____, _____, _____, _____, _____, _____, v3[0]}; *d8 += Sk8h{v3[1], v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____}; // D[n..n+15] += {0, 0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[4]} *d8 += v4; } using BlurX = decltype(blur_x_radius_1); // BlurX will only be one of the functions blur_x_radius_(1|2|3|4). static SK_ALWAYS_INLINE void blur_row( BlurX blur, const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4, const uint8_t* src, int srcW, uint8_t* dst, int dstW) { // Clear the buffer to handle summing wider than source. Sk8h d0{kHalf}, d8{kHalf}; // Go by multiples of 8 in src. int x = 0; for (; x <= srcW - 8; x += 8) { blur(load(src, 8), g0, g1, g2, g3, g4, &d0, &d8); store(dst, d0, 8); d0 = d8; d8 = Sk8h{kHalf}; src += 8; dst += 8; } // There are src values left, but the remainder of src values is not a multiple of 8. int srcTail = srcW - x; if (srcTail > 0) { blur(load(src, srcTail), g0, g1, g2, g3, g4, &d0, &d8); int dstTail = std::min(8, dstW - x); store(dst, d0, dstTail); d0 = d8; dst += dstTail; x += dstTail; } // There are dst mask values to complete. int dstTail = dstW - x; if (dstTail > 0) { store(dst, d0, dstTail); } } // BlurX will only be one of the functions blur_x_radius_(1|2|3|4). static SK_ALWAYS_INLINE void blur_x_rect( BlurX blur, uint16_t* gauss, const uint8_t* src, size_t srcStride, int srcW, uint8_t* dst, size_t dstStride, int dstW, int dstH) { Sk8h g0{gauss[0]}, g1{gauss[1]}, g2{gauss[2]}, g3{gauss[3]}, g4{gauss[4]}; // Blur *ALL* the rows. for (int y = 0; y < dstH; y++) { blur_row(blur, g0, g1, g2, g3, g4, src, srcW, dst, dstW); src += srcStride; dst += dstStride; } } SK_ATTRIBUTE(noinline) static void direct_blur_x( int radius, uint16_t* gauss, const uint8_t* src, size_t srcStride, int srcW, uint8_t* dst, size_t dstStride, int dstW, int dstH) { switch (radius) { case 1: blur_x_rect(blur_x_radius_1, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH); break; case 2: blur_x_rect(blur_x_radius_2, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH); break; case 3: blur_x_rect(blur_x_radius_3, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH); break; case 4: blur_x_rect(blur_x_radius_4, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH); break; default: SkASSERTF(false, "The radius %d is not handled\n", radius); } } // The operations of the blur_y_radius_N functions work on a theme similar to the blur_x_radius_N // functions, but end up being simpler because there is no complicated shift of registers. We // start with the non-traditional form of the gaussian filter. In the following r is the value // when added generates the next value in the column. // // D[n+0r] = S[n+0r]*G[1] // + S[n+1r]*G[0] // + S[n+2r]*G[1] // // Expanding out in a way similar to blur_x_radius_N for specific values of n. // // D[n+0r] = S[n-2r]*G[1] + S[n-1r]*G[0] + S[n+0r]*G[1] // D[n+1r] = S[n-1r]*G[1] + S[n+0r]*G[0] + S[n+1r]*G[1] // D[n+2r] = S[n+0r]*G[1] + S[n+1r]*G[0] + S[n+2r]*G[1] // // We can see that S[n+0r] is in all three D[] equations, but is only multiplied twice. Now we // can look at the calculation form the point of view of a source value. // // Given S[n+0r]: // D[n+0r] += S[n+0r]*G[1]; // /* D[n+0r] is done and can be stored now. */ // D[n+1r] += S[n+0r]*G[0]; // D[n+2r] = S[n+0r]*G[1]; // // Remember, by induction, that D[n+0r] == S[n-2r]*G[1] + S[n-1r]*G[0] before adding in // S[n+0r]*G[1]. So, after the addition D[n+0r] has finished calculation and can be stored. Also, // notice that D[n+2r] is receiving its first value from S[n+0r]*G[1] and is not added in. Notice // how values flow in the following two iterations in source. // // D[n+0r] += S[n+0r]*G[1] // D[n+1r] += S[n+0r]*G[0] // D[n+2r] = S[n+0r]*G[1] // /* ------- */ // D[n+1r] += S[n+1r]*G[1] // D[n+2r] += S[n+1r]*G[0] // D[n+3r] = S[n+1r]*G[1] // // Instead of using memory we can introduce temporaries d01 and d12. The update step changes // to the following. // // answer = d01 + S[n+0r]*G[1] // d01 = d12 + S[n+0r]*G[0] // d12 = S[n+0r]*G[1] // return answer // // Finally, this can be ganged into SIMD style. // answer[0..7] = d01[0..7] + S[n+0r..n+0r+7]*G[1] // d01[0..7] = d12[0..7] + S[n+0r..n+0r+7]*G[0] // d12[0..7] = S[n+0r..n+0r+7]*G[1] // return answer[0..7] static SK_ALWAYS_INLINE Sk8h blur_y_radius_1( const Sk8h& s0, const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&, Sk8h* d01, Sk8h* d12, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*) { auto v0 = s0.mulHi(g0); auto v1 = s0.mulHi(g1); Sk8h answer = *d01 + v1; *d01 = *d12 + v0; *d12 = v1 + kHalf; return answer; } static SK_ALWAYS_INLINE Sk8h blur_y_radius_2( const Sk8h& s0, const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&, Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h*, Sk8h*, Sk8h*, Sk8h*) { auto v0 = s0.mulHi(g0); auto v1 = s0.mulHi(g1); auto v2 = s0.mulHi(g2); Sk8h answer = *d01 + v2; *d01 = *d12 + v1; *d12 = *d23 + v0; *d23 = *d34 + v1; *d34 = v2 + kHalf; return answer; } static SK_ALWAYS_INLINE Sk8h blur_y_radius_3( const Sk8h& s0, const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h&, Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h*, Sk8h*) { auto v0 = s0.mulHi(g0); auto v1 = s0.mulHi(g1); auto v2 = s0.mulHi(g2); auto v3 = s0.mulHi(g3); Sk8h answer = *d01 + v3; *d01 = *d12 + v2; *d12 = *d23 + v1; *d23 = *d34 + v0; *d34 = *d45 + v1; *d45 = *d56 + v2; *d56 = v3 + kHalf; return answer; } static SK_ALWAYS_INLINE Sk8h blur_y_radius_4( const Sk8h& s0, const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4, Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h* d67, Sk8h* d78) { auto v0 = s0.mulHi(g0); auto v1 = s0.mulHi(g1); auto v2 = s0.mulHi(g2); auto v3 = s0.mulHi(g3); auto v4 = s0.mulHi(g4); Sk8h answer = *d01 + v4; *d01 = *d12 + v3; *d12 = *d23 + v2; *d23 = *d34 + v1; *d34 = *d45 + v0; *d45 = *d56 + v1; *d56 = *d67 + v2; *d67 = *d78 + v3; *d78 = v4 + kHalf; return answer; } using BlurY = decltype(blur_y_radius_1); // BlurY will be one of blur_y_radius_(1|2|3|4). static SK_ALWAYS_INLINE void blur_column( BlurY blur, int radius, int width, const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4, const uint8_t* src, size_t srcStride, int srcH, uint8_t* dst, size_t dstStride) { Sk8h d01{kHalf}, d12{kHalf}, d23{kHalf}, d34{kHalf}, d45{kHalf}, d56{kHalf}, d67{kHalf}, d78{kHalf}; auto flush = [&](uint8_t* to, const Sk8h& v0, const Sk8h& v1) { store(to, v0, width); to += dstStride; store(to, v1, width); return to + dstStride; }; for (int y = 0; y < srcH; y += 1) { auto s = load(src, width); auto b = blur(s, g0, g1, g2, g3, g4, &d01, &d12, &d23, &d34, &d45, &d56, &d67, &d78); store(dst, b, width); src += srcStride; dst += dstStride; } if (radius >= 1) { dst = flush(dst, d01, d12); } if (radius >= 2) { dst = flush(dst, d23, d34); } if (radius >= 3) { dst = flush(dst, d45, d56); } if (radius >= 4) { flush(dst, d67, d78); } } // BlurY will be one of blur_y_radius_(1|2|3|4). static SK_ALWAYS_INLINE void blur_y_rect( BlurY blur, int radius, uint16_t *gauss, const uint8_t *src, size_t srcStride, int srcW, int srcH, uint8_t *dst, size_t dstStride) { Sk8h g0{gauss[0]}, g1{gauss[1]}, g2{gauss[2]}, g3{gauss[3]}, g4{gauss[4]}; int x = 0; for (; x <= srcW - 8; x += 8) { blur_column(blur, radius, 8, g0, g1, g2, g3, g4, src, srcStride, srcH, dst, dstStride); src += 8; dst += 8; } int xTail = srcW - x; if (xTail > 0) { blur_column(blur, radius, xTail, g0, g1, g2, g3, g4, src, srcStride, srcH, dst, dstStride); } } SK_ATTRIBUTE(noinline) static void direct_blur_y( int radius, uint16_t* gauss, const uint8_t* src, size_t srcStride, int srcW, int srcH, uint8_t* dst, size_t dstStride) { switch (radius) { case 1: blur_y_rect(blur_y_radius_1, 1, gauss, src, srcStride, srcW, srcH, dst, dstStride); break; case 2: blur_y_rect(blur_y_radius_2, 2, gauss, src, srcStride, srcW, srcH, dst, dstStride); break; case 3: blur_y_rect(blur_y_radius_3, 3, gauss, src, srcStride, srcW, srcH, dst, dstStride); break; case 4: blur_y_rect(blur_y_radius_4, 4, gauss, src, srcStride, srcW, srcH, dst, dstStride); break; default: SkASSERTF(false, "The radius %d is not handled\n", radius); } } static SkIPoint small_blur(double sigmaX, double sigmaY, const SkMask& src, SkMask* dst) { SkASSERT(0 <= sigmaX && sigmaX < 2); SkASSERT(0 <= sigmaY && sigmaY < 2); SkGaussFilter filterX{sigmaX, SkGaussFilter::Type::Bessel}, filterY{sigmaY, SkGaussFilter::Type::Bessel}; int radiusX = filterX.radius(), radiusY = filterY.radius(); SkASSERT(radiusX <= 4 && radiusY <= 4); auto prepareGauss = [](const SkGaussFilter& filter, uint16_t* factors) { int i = 0; for (double d : filter) { factors[i++] = static_cast(round(d * (1 << 16))); } }; uint16_t gaussFactorsX[SkGaussFilter::kGaussArrayMax], gaussFactorsY[SkGaussFilter::kGaussArrayMax]; prepareGauss(filterX, gaussFactorsX); prepareGauss(filterY, gaussFactorsY); *dst = prepare_destination(radiusX, radiusY, src); if (src.fImage == nullptr) { return {SkTo(radiusX), SkTo(radiusY)}; } if (dst->fImage == nullptr) { dst->fBounds.setEmpty(); return {0, 0}; } int srcW = src.fBounds.width(), srcH = src.fBounds.height(); int dstW = dst->fBounds.width(), dstH = dst->fBounds.height(); size_t srcStride = src.fRowBytes, dstStride = dst->fRowBytes; //TODO: handle bluring in only one direction. // Blur vertically and copy to destination. direct_blur_y(radiusY, gaussFactorsY, src.fImage, srcStride, srcW, srcH, dst->fImage + radiusX, dstStride); // Blur horizontally in place. direct_blur_x(radiusX, gaussFactorsX, dst->fImage + radiusX, dstStride, srcW, dst->fImage, dstStride, dstW, dstH); return {radiusX, radiusY}; } #endif // SK_USE_LEGACY_INTERP_BLUR SkIPoint SkMaskBlurFilter::blur(const SkMask& src, SkMask* dst) const { #if !defined(SK_USE_LEGACY_INTERP_BLUR) if (fSigmaW < 2.0 && fSigmaH < 2.0) { return small_blur(fSigmaW, fSigmaH, src, dst); } #endif // 1024 is a place holder guess until more analysis can be done. SkSTArenaAlloc<1024> alloc; PlanningInterface* planW = make_plan(&alloc, fSigmaW); PlanningInterface* planH = make_plan(&alloc, fSigmaH); int borderW = planW->border(), borderH = planH->border(); SkASSERT(borderH >= 0 && borderW >= 0); *dst = prepare_destination(borderW, borderH, src); if (src.fImage == nullptr) { return {SkTo(borderW), SkTo(borderH)}; } if (dst->fImage == nullptr) { dst->fBounds.setEmpty(); return {0, 0}; } int srcW = src.fBounds.width(), srcH = src.fBounds.height(), dstW = dst->fBounds.width(), dstH = dst->fBounds.height(); SkASSERT(srcW >= 0 && srcH >= 0 && dstW >= 0 && dstH >= 0); auto bufferSize = std::max(planW->bufferSize(), planH->bufferSize()); auto buffer = alloc.makeArrayDefault(bufferSize); if (planW->needsBlur() && planH->needsBlur()) { // Blur both directions. int tmpW = srcH, tmpH = dstW; auto tmp = alloc.makeArrayDefault(tmpW * tmpH); // Blur horizontally, and transpose. auto scanW = planW->makeBlurScan(&alloc, srcW, buffer); int y = 0; if (scanW->canBlur4() && srcH > 4) { for (;y + 4 <= srcH; y += 4) { auto srcStart = &src.fImage[y * src.fRowBytes]; auto tmpStart = &tmp[y]; scanW->blur4Transpose(srcStart, src.fRowBytes, srcStart + srcW, tmpStart, tmpW, tmpStart + tmpW * tmpH); } } for (;y < srcH; y++) { auto srcStart = &src.fImage[y * src.fRowBytes]; auto tmpStart = &tmp[y]; scanW->blur(srcStart, 1, srcStart + srcW, tmpStart, tmpW, tmpStart + tmpW * tmpH); } // Blur vertically (scan in memory order because of the transposition), // and transpose back to the original orientation. auto scanH = planH->makeBlurScan(&alloc, tmpW, buffer); y = 0; if (scanH->canBlur4() && tmpH > 4) { for (;y + 4 <= tmpH; y += 4) { auto tmpStart = &tmp[y * tmpW]; auto dstStart = &dst->fImage[y]; scanH->blur4Transpose( tmpStart, tmpW, tmpStart + tmpW, dstStart, dst->fRowBytes, dstStart + dst->fRowBytes * dstH); } } for (;y < tmpH; y++) { auto tmpStart = &tmp[y * tmpW]; auto dstStart = &dst->fImage[y]; scanH->blur(tmpStart, 1, tmpStart + tmpW, dstStart, dst->fRowBytes, dstStart + dst->fRowBytes * dstH); } } else if (planW->needsBlur()) { // Blur only horizontally. auto scanW = planW->makeBlurScan(&alloc, srcW, buffer); for (int y = 0; y < srcH; y++) { auto srcStart = &src.fImage[y * src.fRowBytes]; auto dstStart = &dst->fImage[y * dst->fRowBytes]; scanW->blur(srcStart, 1, srcStart + srcW, dstStart, 1, dstStart + dstW); } } else if (planH->needsBlur()) { // Blur only vertically. auto srcEnd = &src.fImage[src.fRowBytes * srcH]; auto dstEnd = &dst->fImage[dst->fRowBytes * dstH]; auto scanH = planH->makeBlurScan(&alloc, srcH, buffer); for (int x = 0; x < srcW; x++) { auto srcStart = &src.fImage[x]; auto dstStart = &dst->fImage[x]; scanH->blur(srcStart, src.fRowBytes, srcEnd, dstStart, dst->fRowBytes, dstEnd); } } else { // Copy to dst. No Blur. SkASSERT(false); // should not get here for (int y = 0; y < srcH; y++) { std::memcpy(&dst->fImage[y * dst->fRowBytes], &src.fImage[y * src.fRowBytes], dstW); } } return {SkTo(borderW), SkTo(borderH)}; }