aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/core/SkMaskBlurFilter.cpp
diff options
context:
space:
mode:
authorGravatar Herb Derby <herb@google.com>2017-09-13 16:46:05 -0400
committerGravatar Skia Commit-Bot <skia-commit-bot@chromium.org>2017-09-14 03:42:50 +0000
commit0f96bb303adfc712baba7830e036df1d0972d7f3 (patch)
treefb1ed034d32d8c5536e32f6be0eec39c0cbb93a8 /src/core/SkMaskBlurFilter.cpp
parent09dfc4759e99229e7c74891a88596e8b9b3d9026 (diff)
Sk4i version of blur.
For the blur_1.50_normal_low_quality benchmark, this code goes from about 120us to 85us. The original implementation executes at about 95us. This change is controlled by the flag: SK_SUPPORT_LEGACY_SLOW_SMALL_BLUR BUG=chromium:759070 CQ_INCLUDE_TRYBOTS=skia.primary:Test-Debian9-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD Change-Id: If722cb8ffd8c47a94b7a6b4e6dd26fd1474b6209 Reviewed-on: https://skia-review.googlesource.com/45300 Commit-Queue: Herb Derby <herb@google.com> Reviewed-by: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/core/SkMaskBlurFilter.cpp')
-rw-r--r--src/core/SkMaskBlurFilter.cpp369
1 files changed, 261 insertions, 108 deletions
diff --git a/src/core/SkMaskBlurFilter.cpp b/src/core/SkMaskBlurFilter.cpp
index 294b4a6322..0ab85000a4 100644
--- a/src/core/SkMaskBlurFilter.cpp
+++ b/src/core/SkMaskBlurFilter.cpp
@@ -11,6 +11,7 @@
#include <climits>
#include "SkArenaAlloc.h"
+#include "SkNx.h"
#include "SkSafeMath.h"
static const double kPi = 3.14159265358979323846264338327950288;
@@ -25,7 +26,13 @@ class BlurScanInterface {
public:
virtual ~BlurScanInterface() = default;
virtual void blur(const uint8_t* src, size_t srcStride, const uint8_t* srcEnd,
- uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const = 0;
+ uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const = 0;
+ virtual bool canBlur4() { return false; }
+ virtual void blur4Transpose(
+ const uint8_t* src, size_t srcStride, const uint8_t* srcEnd,
+ uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const {
+ SK_ABORT("This should not be called.");
+ }
};
class PlanningInterface {
@@ -49,120 +56,146 @@ public:
SK_ABORT("Should never be called.");
return nullptr;
}
-private:
};
-// This class is deprecated, and will be replaced by Box.
-class PlanBoxInteger final : public PlanningInterface {
+// Old slower version of Box which uses 64 bit multiply instead of 32 bit multiply.
+// Controlled by SK_SUPPORT_LEGACY_SLOW_SMALL_BLUR
+class PlanBox32 final : public PlanningInterface {
public:
- explicit PlanBoxInteger(double sigma) {
-
+ explicit PlanBox32(double sigma) {
// Calculate the radius from sigma. Taken from the old code until something better is
// figured out.
auto possibleRadius = 1.5 * sigma - 0.5;
- auto radius = std::max(1.0, std::ceil(possibleRadius));
- auto window = 2 * radius + 1;
+ auto radius = std::max(std::numeric_limits<double>::epsilon(), possibleRadius);
+ auto outerRadius = std::ceil(radius);
+ auto outerWindow = 2 * outerRadius + 1;
+ auto outerFactor = (1 - (outerRadius - radius)) / outerWindow;
+ fOuterWeight = static_cast<uint64_t>(round(outerFactor * (1ull << 32)));
+
+ auto innerRadius = outerRadius - 1;
+ auto innerWindow = 2 * innerRadius + 1;
+ auto innerFactor = (1 - (radius - innerRadius)) / innerWindow;
+ fInnerWeight = static_cast<uint64_t>(round(innerFactor * (1ull << 32)));
- fWindow = static_cast<size_t>(window);
+ // Sliding window is defined by the relationship between the outer and inner windows.
+ // In the single window case, you add the element on the right, and subtract the element on
+ // the left. But, because two windows are used, this relationship is more complicated; an
+ // element is added from the right of the outer window, and subtracted from the left of the
+ // inner window. Because innerWindow = outerWindow - 2, the distance between
+ // the left and right in the two window case is outerWindow - 1.
+ fSlidingWindow = static_cast<size_t>(outerWindow - 1);
}
- size_t bufferSize() const override { return fWindow - 1; }
+ size_t bufferSize() const override { return 0; }
- size_t border() const override { return (fWindow - 1) / 2; }
+ // Remember that sliding window = window - 1. Therefore, radius = sliding window / 2.
+ size_t border() const override { return fSlidingWindow / 2; }
bool needsBlur() const override { return true; }
BlurScanInterface* makeBlurScan(
SkArenaAlloc* alloc, size_t width, uint32_t* buffer) const override
{
- auto weight = static_cast<uint64_t>(round(1.0 / fWindow * (1ull << 32)));
- auto bufferEnd = buffer + this->bufferSize();
- size_t noChangeCount = fWindow > width ? fWindow - width : 0;
+ size_t noChangeCount;
+ size_t trailingEdgeZeroCount;
+
+ // The relation between the slidingWindow and the width dictates two operating modes.
+ // * width >= slidingWindow - both sides of the window are contained in the image while
+ // scanning. Therefore, we assume that slidingWindow zeros are consumed on the trailing
+ // edge of the window. After this count, then both edges are traversing the image.
+ // * slidingWindow > width - both sides of the window are off the image while scanning
+ // the middle. The front edge of the window can only travel width until it falls off the
+ // image. At this point, both edges of the window are off the image consuming zeros
+ // and therefore, the destination value does not change. The scan produces unchanged
+ // values until the trailing edge of the window enters the image. This count is
+ // slidingWindow - width.
+ if (width >= fSlidingWindow) {
+ noChangeCount = 0;
+ trailingEdgeZeroCount = fSlidingWindow;
+ } else {
+ noChangeCount = fSlidingWindow - width;
+ trailingEdgeZeroCount = width;
+ }
+ return alloc->make<Box>(fOuterWeight, fInnerWeight, noChangeCount, trailingEdgeZeroCount);
- return alloc->make<BoxInteger>(weight, buffer, bufferEnd, noChangeCount);
}
private:
- class BoxInteger final : public BlurScanInterface {
+ class Box final : public BlurScanInterface {
public:
- BoxInteger(uint64_t weight, uint32_t* buffer, uint32_t* bufferEnd, size_t noChangeCount)
- : fWeight{weight}
- , fBuffer0{buffer}
- , fBuffer0End{bufferEnd}
- , fNoChangeCount{noChangeCount} { }
+ Box(uint64_t outerWeight, uint64_t innerWeight,
+ size_t noChangeCount, size_t trailingEdgeZeroCount)
+ : fOuterWeight{outerWeight}
+ , fInnerWeight{innerWeight}
+ , fNoChangeCount{noChangeCount}
+ , fTrailingEdgeZeroCount{trailingEdgeZeroCount} { }
void blur(const uint8_t* src, size_t srcStride, const uint8_t* srcEnd,
uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const override {
- auto buffer0Cursor = fBuffer0;
- std::memset(&fBuffer0[0], 0, (fBuffer0End - fBuffer0) * sizeof(*fBuffer0));
- uint32_t sum0 = 0;
-
- // Consume the source generating pixels.
- for (auto srcCursor = src;
- srcCursor < srcEnd; dst += dstStride, srcCursor += srcStride) {
- uint32_t s = *srcCursor;
- sum0 += s;
+ auto rightOuter = src;
+ auto dstCursor = dst;
- *dst = this->finalScale(sum0);
+ uint32_t outerSum = 0;
+ uint32_t innerSum = 0;
+ for (size_t i = 0; i < fTrailingEdgeZeroCount; i++) {
+ innerSum = outerSum;
+ outerSum += *rightOuter;
+ *dstCursor = this->interpolateSums(outerSum, innerSum);
- sum0 -= *buffer0Cursor;
- *buffer0Cursor = s;
- buffer0Cursor =
- (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : &fBuffer0[0];
+ rightOuter += srcStride;
+ dstCursor += dstStride;
}
- // This handles the case when both ends of the box are not between [src, srcEnd), and
- // both are zero at that point.
+ // slidingWindow > width
for (size_t i = 0; i < fNoChangeCount; i++) {
- uint32_t s = 0;
- sum0 += s;
-
- *dst = this->finalScale(sum0);
-
- sum0 -= *buffer0Cursor;
- *buffer0Cursor = s;
- buffer0Cursor =
- (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : &fBuffer0[0];
- dst += dstStride;
+ *dstCursor = this->interpolateSums(outerSum, innerSum);;
+ dstCursor += dstStride;
}
- // Starting from the right, fill in the rest of the buffer.
- std::memset(&fBuffer0[0], 0, (fBuffer0End - &fBuffer0[0]) * sizeof(fBuffer0[0]));
+ // width > slidingWindow
+ auto leftInner = src;
+ while (rightOuter < srcEnd) {
+ innerSum = outerSum - *leftInner;
+ outerSum += *rightOuter;
+ *dstCursor = this->interpolateSums(outerSum, innerSum);
+ outerSum -= *leftInner;
- sum0 = 0;
+ rightOuter += srcStride;
+ leftInner += srcStride;
+ dstCursor += dstStride;
+ }
- uint8_t* dstCursor = dstEnd;
- const uint8_t* srcCursor = srcEnd;
- do {
+ auto leftOuter = srcEnd;
+ dstCursor = dstEnd;
+ outerSum = 0;
+ for (size_t i = 0; i < fTrailingEdgeZeroCount; i++) {
+ leftOuter -= srcStride;
dstCursor -= dstStride;
- srcCursor -= srcStride;
- uint32_t s = *srcCursor;
- sum0 += s;
- *dstCursor = this->finalScale(sum0);
-
- sum0 -= *buffer0Cursor;
- *buffer0Cursor = s;
- buffer0Cursor =
- (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : &fBuffer0[0];
- } while (dstCursor > dst);
+ innerSum = outerSum;
+ outerSum += *leftOuter;
+ *dstCursor = this->interpolateSums(outerSum, innerSum);
+ }
}
+ private:
static constexpr uint64_t kHalf = static_cast<uint64_t>(1) << 31;
- uint8_t finalScale(uint32_t sum) const {
- return SkTo<uint8_t>((fWeight * sum + kHalf) >> 32);
+ uint8_t interpolateSums(uint32_t outerSum, uint32_t innerSum) const {
+ return SkTo<uint8_t>((fOuterWeight * outerSum + fInnerWeight * innerSum + kHalf) >> 32);
}
-
- const uint64_t fWeight;
- uint32_t* const fBuffer0;
- uint32_t* const fBuffer0End;
- const size_t fNoChangeCount;
+ uint64_t fOuterWeight;
+ uint64_t fInnerWeight;
+ size_t fNoChangeCount;
+ size_t fTrailingEdgeZeroCount;
};
-
- size_t fWindow;
+private:
+ uint64_t fOuterWeight;
+ uint64_t fInnerWeight;
+ size_t fSlidingWindow;
};
+
class PlanBox final : public PlanningInterface {
public:
explicit PlanBox(double sigma) {
@@ -173,12 +206,12 @@ public:
auto outerRadius = std::ceil(radius);
auto outerWindow = 2 * outerRadius + 1;
auto outerFactor = (1 - (outerRadius - radius)) / outerWindow;
- fOuterWeight = static_cast<uint64_t>(round(outerFactor * (1ull << 32)));
+ fOuterWeight = static_cast<uint32_t>(round(outerFactor * (1ull << 24)));
auto innerRadius = outerRadius - 1;
auto innerWindow = 2 * innerRadius + 1;
auto innerFactor = (1 - (radius - innerRadius)) / innerWindow;
- fInnerWeight = static_cast<uint64_t>(round(innerFactor * (1ull << 32)));
+ fInnerWeight = static_cast<uint32_t>(round(innerFactor * (1ull << 24)));
// Sliding window is defined by the relationship between the outer and inner windows.
// In the single window case, you add the element on the right, and subtract the element on
@@ -189,7 +222,9 @@ public:
fSlidingWindow = static_cast<size_t>(outerWindow - 1);
}
- size_t bufferSize() const override { return 0; }
+ size_t bufferSize() const override {
+ return fSlidingWindow * (sizeof(Sk4u) / sizeof(uint32_t));
+ }
// Remember that sliding window = window - 1. Therefore, radius = sliding window / 2.
size_t border() const override { return fSlidingWindow / 2; }
@@ -219,31 +254,41 @@ public:
noChangeCount = fSlidingWindow - width;
trailingEdgeZeroCount = width;
}
- return alloc->make<Box>(fOuterWeight, fInnerWeight, noChangeCount, trailingEdgeZeroCount);
+ Sk4u* sk4uBuffer = reinterpret_cast<Sk4u*>(buffer);
+ return alloc->make<Box>(fOuterWeight, fInnerWeight, noChangeCount, trailingEdgeZeroCount,
+ sk4uBuffer, sk4uBuffer + fSlidingWindow);
}
private:
class Box final : public BlurScanInterface {
public:
- Box(uint64_t outerWeight, uint64_t innerWeight,
- size_t noChangeCount, size_t trailingEdgeZeroCount)
+ Box(uint32_t outerWeight, uint32_t innerWeight,
+ size_t noChangeCount, size_t trailingEdgeZeroCount,
+ Sk4u* buffer, Sk4u* bufferEnd)
: fOuterWeight{outerWeight}
, fInnerWeight{innerWeight}
, fNoChangeCount{noChangeCount}
- , fTrailingEdgeZeroCount{trailingEdgeZeroCount} { }
+ , fTrailingEdgeZeroCount{trailingEdgeZeroCount}
+ , fBuffer{buffer}
+ , fBufferEnd{bufferEnd} { }
void blur(const uint8_t* src, size_t srcStride, const uint8_t* srcEnd,
- uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const override {
+ uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const override {
auto rightOuter = src;
auto dstCursor = dst;
+ auto interpolateSums = [this](uint32_t outerSum, uint32_t innerSum) {
+ return SkTo<uint8_t>(
+ (fOuterWeight * outerSum + fInnerWeight * innerSum + kHalf) >> 24);
+ };
+
uint32_t outerSum = 0;
uint32_t innerSum = 0;
for (size_t i = 0; i < fTrailingEdgeZeroCount; i++) {
innerSum = outerSum;
outerSum += *rightOuter;
- *dstCursor = this->interpolateSums(outerSum, innerSum);
+ *dstCursor = interpolateSums(outerSum, innerSum);
rightOuter += srcStride;
dstCursor += dstStride;
@@ -251,7 +296,7 @@ private:
// slidingWindow > width
for (size_t i = 0; i < fNoChangeCount; i++) {
- *dstCursor = this->interpolateSums(outerSum, innerSum);;
+ *dstCursor = interpolateSums(outerSum, innerSum);;
dstCursor += dstStride;
}
@@ -260,7 +305,7 @@ private:
while (rightOuter < srcEnd) {
innerSum = outerSum - *leftInner;
outerSum += *rightOuter;
- *dstCursor = this->interpolateSums(outerSum, innerSum);
+ *dstCursor = interpolateSums(outerSum, innerSum);
outerSum -= *leftInner;
rightOuter += srcStride;
@@ -277,24 +322,108 @@ private:
innerSum = outerSum;
outerSum += *leftOuter;
- *dstCursor = this->interpolateSums(outerSum, innerSum);
+ *dstCursor = interpolateSums(outerSum, innerSum);
}
}
- private:
- static constexpr uint64_t kHalf = static_cast<uint64_t>(1) << 31;
+ bool canBlur4() override { return true; }
- uint8_t interpolateSums(uint32_t outerSum, uint32_t innerSum) const {
- return SkTo<uint8_t>((fOuterWeight * outerSum + fInnerWeight * innerSum + kHalf) >> 32);
+ // NB this is a transposing scan. The next src is src+1, and the next down is
+ // src+srcStride.
+ void blur4Transpose(
+ const uint8_t* src, size_t srcStride, const uint8_t* srcEnd,
+ uint8_t* dst, size_t dstStride, uint8_t* dstEnd) const override {
+ auto rightOuter = src;
+ auto dstCursor = dst;
+
+ Sk4u* const bufferStart = fBuffer;
+ Sk4u* bufferCursor = bufferStart;
+ Sk4u* const bufferEnd = fBufferEnd;
+
+ const Sk4u outerWeight(SkTo<uint32_t>(fOuterWeight));
+ const Sk4u innerWeight(SkTo<uint32_t>(fInnerWeight));
+
+ auto load = [](const uint8_t* cursor, size_t stride) -> Sk4u {
+ return Sk4u(cursor[0*stride], cursor[1*stride], cursor[2*stride], cursor[3*stride]);
+ };
+
+ auto interpolateSums = [&] (const Sk4u& outerSum, const Sk4u& innerSum) {
+ return
+ SkNx_cast<uint8_t>(
+ (outerSum * outerWeight + innerSum * innerWeight + kHalf) >> 24);
+ };
+
+ Sk4u outerSum = 0;
+ Sk4u innerSum = 0;
+ for (size_t i = 0; i < fTrailingEdgeZeroCount; i++) {
+ innerSum = outerSum;
+
+ Sk4u leadingEdge = load(rightOuter, srcStride);
+ outerSum += leadingEdge;
+ Sk4b blurred = interpolateSums(outerSum, innerSum);
+ blurred.store(dstCursor);
+
+ leadingEdge.store(bufferCursor);
+ bufferCursor = (bufferCursor + 1) < bufferEnd ? bufferCursor + 1 : bufferStart;
+
+ rightOuter += 1;
+ dstCursor += dstStride;
+ }
+
+ // slidingWindow > width
+ for (size_t i = 0; i < fNoChangeCount; i++) {
+ Sk4b blurred = interpolateSums(outerSum, innerSum);
+ blurred.store(dstCursor);
+ dstCursor += dstStride;
+ }
+
+ // width > slidingWindow
+ auto leftInner = src;
+ while (rightOuter < srcEnd) {
+ Sk4u trailEdge = Sk4u::Load(bufferCursor);
+ Sk4u leadingEdge = load(rightOuter, srcStride);
+ innerSum = outerSum - trailEdge;
+ outerSum += leadingEdge;
+
+ Sk4b blurred = interpolateSums(outerSum, innerSum);
+ blurred.store(dstCursor);
+
+ outerSum -= trailEdge;
+ leadingEdge.store(bufferCursor);
+ bufferCursor = (bufferCursor + 1) < bufferEnd ? bufferCursor + 1 : bufferStart;
+
+ rightOuter += 1;
+ leftInner += 1;
+ dstCursor += dstStride;
+ }
+
+ auto leftOuter = srcEnd;
+ dstCursor = dstEnd;
+ outerSum = 0;
+ for (size_t i = 0; i < fTrailingEdgeZeroCount; i++) {
+ leftOuter -= 1;
+ dstCursor -= dstStride;
+
+ innerSum = outerSum;
+ outerSum += load(leftOuter, srcStride);
+ Sk4b blurred = interpolateSums(outerSum, innerSum);
+ blurred.store(dstCursor);
+ }
}
- uint64_t fOuterWeight;
- uint64_t fInnerWeight;
- size_t fNoChangeCount;
- size_t fTrailingEdgeZeroCount;
+
+ private:
+ static constexpr uint32_t kHalf = static_cast<uint32_t>(1) << 23;
+
+ const uint32_t fOuterWeight;
+ const uint32_t fInnerWeight;
+ const size_t fNoChangeCount;
+ const size_t fTrailingEdgeZeroCount;
+ Sk4u* const fBuffer;
+ Sk4u* const fBufferEnd;
};
private:
- uint64_t fOuterWeight;
- uint64_t fInnerWeight;
+ uint32_t fOuterWeight;
+ uint32_t fInnerWeight;
size_t fSlidingWindow;
};
@@ -421,8 +550,8 @@ public:
// Consume the source generating pixels.
for (auto srcCursor = src;
srcCursor < srcEnd; dst += dstStride, srcCursor += srcStride) {
- uint32_t s = *srcCursor;
- sum0 += s;
+ uint32_t leadingEdge = *srcCursor;
+ sum0 += leadingEdge;
sum1 += sum0;
sum2 += sum1;
@@ -437,13 +566,14 @@ public:
buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
sum0 -= *buffer0Cursor;
- *buffer0Cursor = s;
+ *buffer0Cursor = leadingEdge;
buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
}
+ // The leading edge is off the right side of the mask.
for (size_t i = 0; i < fNoChangeCount; i++) {
- uint32_t s = 0;
- sum0 += s;
+ uint32_t leadingEdge = 0;
+ sum0 += leadingEdge;
sum1 += sum0;
sum2 += sum1;
@@ -458,7 +588,7 @@ public:
buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
sum0 -= *buffer0Cursor;
- *buffer0Cursor = s;
+ *buffer0Cursor = leadingEdge;
buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
dst += dstStride;
@@ -474,8 +604,8 @@ public:
while (dstCursor > dst) {
dstCursor -= dstStride;
srcCursor -= srcStride;
- uint32_t s = *srcCursor;
- sum0 += s;
+ uint32_t leadingEdge = *srcCursor;
+ sum0 += leadingEdge;
sum1 += sum0;
sum2 += sum1;
@@ -490,7 +620,7 @@ public:
buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
sum0 -= *buffer0Cursor;
- *buffer0Cursor = s;
+ *buffer0Cursor = leadingEdge;
buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
}
}
@@ -526,8 +656,8 @@ static PlanningInterface* make_plan(SkArenaAlloc* alloc, double sigma) {
if (3 * sigma <= 1) {
plan = alloc->make<None>();
} else if (sigma < kSmallSigma) {
- #if defined(SK_LEGACY_SUPPORT_INTEGER_SMALL_RADII)
- plan = alloc->make<PlanBoxInteger>(sigma);
+ #if defined(SK_SUPPORT_LEGACY_SLOW_SMALL_BLUR)
+ plan = alloc->make<PlanBox32>(sigma);
#else
plan = alloc->make<PlanBox>(sigma);
#endif
@@ -598,21 +728,44 @@ SkIPoint SkMaskBlurFilter::blur(const SkMask& src, SkMask* dst) const {
// Blur both directions.
size_t tmpW = srcH;
size_t tmpH = dstW;
+
auto tmp = alloc.makeArrayDefault<uint8_t>(tmpW * tmpH);
// Blur horizontally, and transpose.
auto scanW = planW->makeBlurScan(&alloc, srcW, buffer);
- for (size_t y = 0; y < srcH; y++) {
+ size_t y = 0;
+ if (scanW->canBlur4() && srcH > 4) {
+ for (;y + 4 <= srcH; y += 4) {
+ auto srcStart = &src.fImage[y * src.fRowBytes];
+ auto tmpStart = &tmp[y];
+ scanW->blur4Transpose(srcStart, src.fRowBytes, srcStart + srcW,
+ tmpStart, tmpW, tmpStart + tmpW * tmpH);
+ }
+ }
+
+ for (;y < srcH; y++) {
auto srcStart = &src.fImage[y * src.fRowBytes];
auto tmpStart = &tmp[y];
- scanW->blur(srcStart, 1, srcStart + srcW,
+ scanW->blur(srcStart, 1, srcStart + srcW,
tmpStart, tmpW, tmpStart + tmpW * tmpH);
}
+
// Blur vertically (scan in memory order because of the transposition),
// and transpose back to the original orientation.
auto scanH = planH->makeBlurScan(&alloc, tmpW, buffer);
- for (size_t y = 0; y < tmpH; y++) {
+ y = 0;
+ if (scanH->canBlur4() && tmpH > 4) {
+ for (;y + 4 <= tmpH; y += 4) {
+ auto tmpStart = &tmp[y * tmpW];
+ auto dstStart = &dst->fImage[y];
+
+ scanH->blur4Transpose(
+ tmpStart, tmpW, tmpStart + tmpW,
+ dstStart, dst->fRowBytes, dstStart + dst->fRowBytes * dstH);
+ }
+ }
+ for (;y < tmpH; y++) {
auto tmpStart = &tmp[y * tmpW];
auto dstStart = &dst->fImage[y];