From d5b9211d8d072672856019d5608fa21d4397ba54 Mon Sep 17 00:00:00 2001 From: senorblanco Date: Wed, 28 Oct 2015 15:39:33 -0700 Subject: SkBlurImageFilter_opts: optimize NEON box_blur_double in separate loops. Stop leaning so hard on the branch predictor, and pull the conditionals out of the loops for box_blur_double() (NEON). This is conceptually the same change as https://codereview.chromium.org/1426583004/ for the NEON double-pixel loop. R=mtklein@google.com BUG=skia:4526 CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot Review URL: https://codereview.chromium.org/1412793009 --- src/opts/SkBlurImageFilter_opts.h | 74 +++++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 27 deletions(-) (limited to 'src/opts') diff --git a/src/opts/SkBlurImageFilter_opts.h b/src/opts/SkBlurImageFilter_opts.h index 94d734627b..720130b1ec 100644 --- a/src/opts/SkBlurImageFilter_opts.h +++ b/src/opts/SkBlurImageFilter_opts.h @@ -67,6 +67,21 @@ static inline __m128i mullo_epi32(__m128i a, __m128i b) { #elif defined(SK_ARM_HAS_NEON) +// val = (sum * scale * 2 + 0x8000) >> 16 +#define STORE_SUMS_DOUBLE \ + uint16x8_t resultPixels = vreinterpretq_u16_s16(vqrdmulhq_s16( \ + vreinterpretq_s16_u16(sum), vreinterpretq_s16_u16(scale))); \ + if (dstDirection == BlurDirection::kX) { \ + uint32x2_t px2 = vreinterpret_u32_u8(vmovn_u16(resultPixels)); \ + vst1_lane_u32(dptr + 0, px2, 0); \ + vst1_lane_u32(dptr + width, px2, 1); \ + } else { \ + vst1_u8((uint8_t*)dptr, vmovn_u16(resultPixels)); \ + } + +#define INCREMENT_SUMS_DOUBLE(p) sum = vaddw_u8(sum, load_2_pixels(p)) +#define DECREMENT_SUMS_DOUBLE(p) sum = vsubw_u8(sum, load_2_pixels(p)) + // Fast path for kernel sizes between 2 and 127, working on two rows at a time. 
template<BlurDirection srcDirection, BlurDirection dstDirection> void box_blur_double(const SkPMColor** src, int srcStride, SkPMColor** dst, int kernelSize, @@ -84,7 +99,9 @@ void box_blur_double(const SkPMColor** src, int srcStride, SkPMColor** dst, int return vld1_u8((uint8_t*)s); } }; - const int rightBorder = SkMin32(rightOffset + 1, width); + int incrementStart = SkMax32(-rightOffset - 1, -width); + int incrementEnd = SkMax32(width - rightOffset - 1, 0); + int decrementStart = SkMin32(leftOffset, width); const int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride; const int dstStrideX = dstDirection == BlurDirection::kX ? 1 : *height; const int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1; @@ -93,34 +110,37 @@ void box_blur_double(const SkPMColor** src, int srcStride, SkPMColor** dst, int for (; *height >= 2; *height -= 2) { uint16x8_t sum = vdupq_n_u16(0); - const SkPMColor* p = *src; - for (int i = 0; i < rightBorder; i++) { - sum = vaddw_u8(sum, load_2_pixels(p)); - p += srcStrideX; - } - - const SkPMColor* sptr = *src; + const SkPMColor* lptr = *src; + const SkPMColor* rptr = *src; SkPMColor* dptr = *dst; - for (int x = 0; x < width; x++) { - // val = (sum * scale * 2 + 0x8000) >> 16 - uint16x8_t resultPixels = vreinterpretq_u16_s16(vqrdmulhq_s16( - vreinterpretq_s16_u16(sum), vreinterpretq_s16_u16(scale))); - if (dstDirection == BlurDirection::kX) { - uint32x2_t px2 = vreinterpret_u32_u8(vmovn_u16(resultPixels)); - vst1_lane_u32(dptr + 0, px2, 0); - vst1_lane_u32(dptr + width, px2, 1); - } else { - vst1_u8((uint8_t*)dptr, vmovn_u16(resultPixels)); - } - - if (x >= leftOffset) { - sum = vsubw_u8(sum, load_2_pixels(sptr - leftOffset * srcStrideX)); - } - if (x + rightOffset + 1 < width) { - sum = vaddw_u8(sum, load_2_pixels(sptr + (rightOffset + 1) * srcStrideX)); - } - sptr += srcStrideX; + int x; + for (x = incrementStart; x < 0; ++x) { + INCREMENT_SUMS_DOUBLE(rptr); + rptr += srcStrideX; + } + for (; x < decrementStart && x < incrementEnd; ++x) { + STORE_SUMS_DOUBLE dptr += 
dstStrideX; + INCREMENT_SUMS_DOUBLE(rptr); + rptr += srcStrideX; + } + for (x = decrementStart; x < incrementEnd; ++x) { + STORE_SUMS_DOUBLE + dptr += dstStrideX; + INCREMENT_SUMS_DOUBLE(rptr); + rptr += srcStrideX; + DECREMENT_SUMS_DOUBLE(lptr); + lptr += srcStrideX; + } + for (x = incrementEnd; x < decrementStart; ++x) { + STORE_SUMS_DOUBLE + dptr += dstStrideX; + } + for (; x < width; ++x) { + STORE_SUMS_DOUBLE + dptr += dstStrideX; + DECREMENT_SUMS_DOUBLE(lptr); + lptr += srcStrideX; } *src += srcStrideY * 2; *dst += dstStrideY * 2; -- cgit v1.2.3