diff options
author | senorblanco@chromium.org <senorblanco@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2012-11-15 20:27:35 +0000 |
---|---|---|
committer | senorblanco@chromium.org <senorblanco@chromium.org@2bbb7eff-a529-9590-31e7-b0007b416f81> | 2012-11-15 20:27:35 +0000 |
commit | 908276b3969cf8f8eec28026363897134c0e54e0 (patch) | |
tree | 55fb3b7689972532535358a0f5cb53e5bd3809d7 /src | |
parent | 0561a3ccd24e46259d825adf5507c7ee9531b9c5 (diff) |
Reorder the separable blur passes into XXX/YYY, with an image transpose on the last pass of each group. This results in contiguous memory reads in all passes, giving a 22% speedup on theverge.skp over the previous separable implementation, and a 30%-50% improvement over the existing implementation (depending on platform).
Review URL: https://codereview.appspot.com/6851053
git-svn-id: http://skia.googlecode.com/svn/trunk@6445 2bbb7eff-a529-9590-31e7-b0007b416f81
Diffstat (limited to 'src')
-rw-r--r-- | src/effects/SkBlurMask.cpp | 101 |
1 files changed, 38 insertions, 63 deletions
diff --git a/src/effects/SkBlurMask.cpp b/src/effects/SkBlurMask.cpp index 1e76acf27d..54423ef555 100644 --- a/src/effects/SkBlurMask.cpp +++ b/src/effects/SkBlurMask.cpp @@ -12,80 +12,50 @@ #include "SkTemplates.h" #include "SkEndian.h" -static int boxBlurX(const uint8_t* src, int src_row_bytes, - uint8_t* dst, int dst_row_bytes, - int radius, int width, int height) +/** + * This function performs a box blur in X, of the given radius. If the + * "transpose" parameter is true, it will transpose the pixels on write, + * such that X and Y are swapped. Reads are always performed from contiguous + * memory in X, for speed. The destination buffer (dst) must be at least + * (width + radius * 2) * height bytes in size. + */ +static int boxBlur(const uint8_t* src, int src_y_stride, uint8_t* dst, + int radius, int width, int height, bool transpose) { int kernelSize = radius * 2 + 1; int border = SkMin32(width, radius * 2); uint32_t scale = (1 << 24) / kernelSize; + int new_width = width + radius * 2; + int dst_x_stride = transpose ? height : 1; + int dst_y_stride = transpose ? 
1 : new_width; for (int y = 0; y < height; ++y) { int sum = 0; - uint8_t* dptr = dst + y * dst_row_bytes; - const uint8_t* sptr = src + y * src_row_bytes - radius; + uint8_t* dptr = dst + y * dst_y_stride; + const uint8_t* right = src + y * src_y_stride; + const uint8_t* left = right; for (int x = 0; x < border; ++x) { - sum += *(sptr + radius); - *dptr++ = (sum * scale) >> 24; - sptr++; - } - for (int x = width; x < radius * 2; ++x) { - *dptr++ = (sum * scale) >> 24; - sptr++; - } - for (int x = radius * 2; x < width; ++x) { - sum += *(sptr + radius); - *dptr++ = (sum * scale) >> 24; - sum -= *(sptr - radius); - sptr++; - } - for (int x = 0; x < border; ++x) { - *dptr++ = (sum * scale) >> 24; - sum -= *(sptr - radius); - sptr++; - } - SkASSERT(sum == 0); - } - return width + radius * 2; -} - -static int boxBlurY(const uint8_t* src, int src_row_bytes, - uint8_t* dst, int dst_row_bytes, - int radius, int width, int height) -{ - int kernelSize = radius * 2 + 1; - uint32_t scale = (1 << 24) / kernelSize; - int border = SkMin32(height, radius * 2); - for (int x = 0; x < width; ++x) { - int sum = 0; - uint8_t* dptr = dst + x; - const uint8_t* sptr = src + x - radius * src_row_bytes; - for (int y = 0; y < border; ++y) { - sum += *(sptr + radius * src_row_bytes); + sum += *right++; *dptr = (sum * scale) >> 24; - sptr += src_row_bytes; - dptr += dst_row_bytes; + dptr += dst_x_stride; } - for (int y = height; y < radius * 2; ++y) { + for (int x = width; x < radius * 2; ++x) { *dptr = (sum * scale) >> 24; - sptr += src_row_bytes; - dptr += dst_row_bytes; + dptr += dst_x_stride; } - for (int y = radius * 2; y < height; ++y) { - sum += *(sptr + radius * src_row_bytes); + for (int x = radius * 2; x < width; ++x) { + sum += *right++; *dptr = (sum * scale) >> 24; - sum -= *(sptr - radius * src_row_bytes); - sptr += src_row_bytes; - dptr += dst_row_bytes; + sum -= *left++; + dptr += dst_x_stride; } - for (int y = 0; y < border; ++y) { + for (int x = 0; x < border; ++x) { *dptr = 
(sum * scale) >> 24; - sum -= *(sptr - radius * src_row_bytes); - sptr += src_row_bytes; - dptr += dst_row_bytes; + sum -= *left++; + dptr += dst_x_stride; } SkASSERT(sum == 0); } - return height + radius * 2; + return new_width; } // Unrolling the integer blur kernel seems to give us a ~15% speedup on Windows, @@ -683,13 +653,18 @@ bool SkBlurMask::Blur(SkMask* dst, const SkMask& src, uint8_t* tp = tmpBuffer.get(); int w = sw, h = sh; - w = boxBlurX(sp, src.fRowBytes, tp, dst->fRowBytes, rx, w, h); - h = boxBlurY(tp, dst->fRowBytes, dp, dst->fRowBytes, ry, w, h); if (quality == kHigh_Quality) { - w = boxBlurX(dp, dst->fRowBytes, tp, dst->fRowBytes, rx, w, h); - h = boxBlurY(tp, dst->fRowBytes, dp, dst->fRowBytes, ry, w, h); - w = boxBlurX(dp, dst->fRowBytes, tp, dst->fRowBytes, rx, w, h); - h = boxBlurY(tp, dst->fRowBytes, dp, dst->fRowBytes, ry, w, h); + // Do three X blurs, with a transpose on the final one. + w = boxBlur(sp, src.fRowBytes, tp, rx, w, h, false); + w = boxBlur(tp, w, dp, rx, w, h, false); + w = boxBlur(dp, w, tp, rx, w, h, true); + // Do three Y blurs, with a transpose on the final one. + h = boxBlur(tp, h, dp, ry, h, w, false); + h = boxBlur(dp, h, tp, ry, h, w, false); + h = boxBlur(tp, h, dp, ry, h, w, true); + } else { + w = boxBlur(sp, src.fRowBytes, tp, rx, w, h, true); + h = boxBlur(tp, h, dp, ry, h, w, true); } } else { const size_t storageW = sw + 2 * (passCount - 1) * rx + 1; |