diff options
author | Mike Klein <mtklein@chromium.org> | 2018-06-27 12:02:21 -0400 |
---|---|---|
committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2018-06-28 15:15:26 +0000 |
commit | b06be0e8b909f7237dda6817bc6301826f7ede3f (patch) | |
tree | 8dad249814081d619d4e95925d5065d866af0b88 /src/core/SkMaskBlurFilter.cpp | |
parent | fcb04d853ec46e2390dfd8b1f20fbbdab4b104c0 (diff) |
cut SkMaskBlurFilter code size by about half
Replace the templated loaders with function pointers instead,
like we do for the BlurX functions already, and favor A8.
And let the compiler/build config decide all the inlining.
A stripped, optimized ARM64 build goes from about 56K to about 26K.
Speed on our mask blur benches (bench/BlurBench.cpp) on a Galaxy S9 is mixed:
some slowdowns, some speedups. This seems to perform between the
all-template version at head and all function-pointer version in PS2.
Change-Id: Ia27d92c08ca68e0b44c89e8a77d7b6e7297239c4
Reviewed-on: https://skia-review.googlesource.com/137889
Reviewed-by: Ben Wagner <bungeman@google.com>
Reviewed-by: Mike Reed <reed@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/core/SkMaskBlurFilter.cpp')
-rw-r--r-- | src/core/SkMaskBlurFilter.cpp | 238 |
1 files changed, 111 insertions, 127 deletions
diff --git a/src/core/SkMaskBlurFilter.cpp b/src/core/SkMaskBlurFilter.cpp index 2c8cda5c97..21bcf53bec 100644 --- a/src/core/SkMaskBlurFilter.cpp +++ b/src/core/SkMaskBlurFilter.cpp @@ -260,74 +260,58 @@ bool SkMaskBlurFilter::hasNoBlur() const { return (3 * fSigmaW <= 1) && (3 * fSigmaH <= 1); } -namespace { -struct LoadBW { - static constexpr int StrideOf8 = 1; - static SK_ALWAYS_INLINE Sk8h load(const uint8_t* from, int width) { - SkASSERT(0 < width && width <= 8); - uint8_t buffer[8]; - sk_bzero(buffer, sizeof(buffer)); - uint8_t value = *from; - for (int i = 0; i < width; ++i) { - buffer[i] = (value >> (7 - i)) & 1 ? 0xFF : 0; - } - auto v = SkNx_cast<uint16_t>(Sk8b::Load(buffer)); - // Convert from 0-255 to 8.8 encoding. - return v << 8; +// We favor A8 masks, and if we need to work with another format, we'll convert to A8 first. +// Each of these converts width (up to 8) mask values to A8. +static void bw_to_a8(uint8_t* a8, const uint8_t* from, int width) { + SkASSERT(0 < width && width <= 8); + + uint8_t masks = *from; + for (int i = 0; i < width; ++i) { + a8[i] = (masks >> (7 - i)) & 1 ? 0xFF + : 0x00; } -}; -struct LoadA8 { - static constexpr int StrideOf8 = 8 * sizeof(uint8_t); - static SK_ALWAYS_INLINE Sk8h load(const uint8_t* from, int width) { - SkASSERT(0 < width && width <= 8); - uint8_t buffer[8]; - if (width < 8) { - sk_bzero(buffer, sizeof(buffer)); - for (int i = 0; i < width; ++i) { - buffer[i] = from[i]; - } - from = buffer; - } - auto v = SkNx_cast<uint16_t>(Sk8b::Load(from)); - // Convert from 0-255 to 8.8 encoding. - return v << 8; +} +static void lcd_to_a8(uint8_t* a8, const uint8_t* from, int width) { + SkASSERT(0 < width && width <= 8); + + for (int i = 0; i < width; ++i) { + unsigned rgb = reinterpret_cast<const uint16_t*>(from)[i], + r = SkPacked16ToR32(rgb), + g = SkPacked16ToG32(rgb), + b = SkPacked16ToB32(rgb); + a8[i] = (r + g + b) / 3; } -}; -struct LoadARGB { - static constexpr int StrideOf8 = 8 * sizeof(uint32_t); - static SK_ALWAYS_INLINE Sk8h load(const uint8_t* from, int width) { - SkASSERT(0 < width && width <= 8); - uint8_t buffer[8]; - sk_bzero(buffer, sizeof(buffer)); - for (int i = 0; i < width; ++i) { - uint32_t packed = reinterpret_cast<const uint32_t*>(from)[i]; - buffer[i] = SkGetPackedA32(packed); - } - auto v = SkNx_cast<uint16_t>(Sk8b::Load(buffer)); - // Convert from 0-255 to 8.8 encoding. - return v << 8; +} +static void argb32_to_a8(uint8_t* a8, const uint8_t* from, int width) { + SkASSERT(0 < width && width <= 8); + for (int i = 0; i < width; ++i) { + uint32_t rgba = reinterpret_cast<const uint32_t*>(from)[i]; + a8[i] = SkGetPackedA32(rgba); } -}; -struct LoadLCD { - static constexpr int StrideOf8 = 8 * sizeof(uint16_t); - static SK_ALWAYS_INLINE Sk8h load(const uint8_t* from, int width) { - SkASSERT(0 < width && width <= 8); - uint8_t buffer[8]; - sk_bzero(buffer, sizeof(buffer)); +} +using ToA8 = decltype(bw_to_a8); + +static Sk8h load(const uint8_t* from, int width, ToA8* toA8) { + // Our fast path is a full 8-byte load of A8. + // So we'll conditionally handle the two slow paths using tmp: + // - if we have a function to convert another mask to A8, use it; + // - if not but we have less than 8 bytes to load, load them one at a time. + uint8_t tmp[8] = {0,0,0,0, 0,0,0,0}; + if (toA8) { + toA8(tmp, from, width); + from = tmp; + } else if (width < 8) { for (int i = 0; i < width; ++i) { - unsigned packed = reinterpret_cast<const uint16_t*>(from)[i]; - unsigned r = SkPacked16ToR32(packed); - unsigned g = SkPacked16ToG32(packed); - unsigned b = SkPacked16ToB32(packed); - buffer[i] = (r + g + b) / 3; + tmp[i] = from[i]; } - auto v = SkNx_cast<uint16_t>(Sk8b::Load(buffer)); - // Convert from 0-255 to 8.8 encoding. - return v << 8; + from = tmp; } -}; -static SK_ALWAYS_INLINE void store(uint8_t* to, const Sk8h& v, int width) { + // Load A8 and convert to 8.8 fixed-point. + return SkNx_cast<uint16_t>(Sk8b::Load(from)) << 8; +} + +static void store(uint8_t* to, const Sk8h& v, int width) { Sk8b b = SkNx_cast<uint8_t>(v >> 8); if (width == 8) { b.store(to); @@ -422,7 +406,7 @@ static constexpr uint16_t kHalf = 0x80u; // d1 += {v1[6], v1[7], _____, _____, _____, _____, _____, _____} // Where we rely on the compiler to generate efficient code for the {____, n, ....} notation. -static SK_ALWAYS_INLINE void blur_x_radius_1( +static void blur_x_radius_1( const Sk8h& s0, const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&, Sk8h* d0, Sk8h* d8) { @@ -443,7 +427,7 @@ static SK_ALWAYS_INLINE void blur_x_radius_1( } -static SK_ALWAYS_INLINE void blur_x_radius_2( +static void blur_x_radius_2( const Sk8h& s0, const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&, Sk8h* d0, Sk8h* d8) { @@ -471,7 +455,7 @@ static SK_ALWAYS_INLINE void blur_x_radius_2( *d8 += Sk8h{v2[4], v2[5], v2[6], v2[7], _____, _____, _____, _____}; } -static SK_ALWAYS_INLINE void blur_x_radius_3( +static void blur_x_radius_3( const Sk8h& s0, const Sk8h& gauss0, const Sk8h& gauss1, const Sk8h& gauss2, const Sk8h& gauss3, const Sk8h&, Sk8h* d0, Sk8h* d8) { @@ -508,7 +492,7 @@ static SK_ALWAYS_INLINE void blur_x_radius_3( *d8 += Sk8h{v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____, _____}; } -static SK_ALWAYS_INLINE void blur_x_radius_4( +static void blur_x_radius_4( const Sk8h& s0, const Sk8h& gauss0, const Sk8h& gauss1, @@ -560,7 +544,7 @@ static SK_ALWAYS_INLINE void blur_x_radius_4( using BlurX = decltype(blur_x_radius_1); // BlurX will only be one of the functions blur_x_radius_(1|2|3|4). -static SK_ALWAYS_INLINE void blur_row( +static void blur_row( BlurX blur, const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4, const uint8_t* src, int srcW, @@ -571,14 +555,14 @@ static SK_ALWAYS_INLINE void blur_row( // Go by multiples of 8 in src. int x = 0; for (; x <= srcW - 8; x += 8) { - blur(LoadA8::load(src, 8), g0, g1, g2, g3, g4, &d0, &d8); + blur(load(src, 8, nullptr), g0, g1, g2, g3, g4, &d0, &d8); store(dst, d0, 8); d0 = d8; d8 = Sk8h{kHalf}; - src += LoadA8::StrideOf8; + src += 8; dst += 8; } @@ -586,7 +570,7 @@ static SK_ALWAYS_INLINE void blur_row( int srcTail = srcW - x; if (srcTail > 0) { - blur(LoadA8::load(src, srcTail), g0, g1, g2, g3, g4, &d0, &d8); + blur(load(src, srcTail, nullptr), g0, g1, g2, g3, g4, &d0, &d8); int dstTail = std::min(8, dstW - x); store(dst, d0, dstTail); @@ -604,11 +588,10 @@ static SK_ALWAYS_INLINE void blur_row( } // BlurX will only be one of the functions blur_x_radius_(1|2|3|4). -static SK_ALWAYS_INLINE void blur_x_rect( - BlurX blur, - uint16_t* gauss, - const uint8_t* src, size_t srcStride, int srcW, - uint8_t* dst, size_t dstStride, int dstW, int dstH) { +static void blur_x_rect(BlurX blur, + uint16_t* gauss, + const uint8_t* src, size_t srcStride, int srcW, + uint8_t* dst, size_t dstStride, int dstW, int dstH) { Sk8h g0{gauss[0]}, g1{gauss[1]}, @@ -624,10 +607,9 @@ static SK_ALWAYS_INLINE void blur_x_rect( } } -SK_ATTRIBUTE(noinline) static void direct_blur_x( - int radius, uint16_t* gauss, - const uint8_t* src, size_t srcStride, int srcW, - uint8_t* dst, size_t dstStride, int dstW, int dstH) { +static void direct_blur_x(int radius, uint16_t* gauss, + const uint8_t* src, size_t srcStride, int srcW, + uint8_t* dst, size_t dstStride, int dstW, int dstH) { switch (radius) { case 1: @@ -701,7 +683,7 @@ SK_ATTRIBUTE(noinline) static void direct_blur_x( // d01[0..7] = d12[0..7] + S[n+0r..n+0r+7]*G[0] // d12[0..7] = S[n+0r..n+0r+7]*G[1] // return answer[0..7] -static SK_ALWAYS_INLINE Sk8h blur_y_radius_1( +static Sk8h blur_y_radius_1( const Sk8h& s0, const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&, Sk8h* d01, Sk8h* d12, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*) { @@ -715,7 +697,7 @@ static SK_ALWAYS_INLINE Sk8h blur_y_radius_1( return answer; } -static SK_ALWAYS_INLINE Sk8h blur_y_radius_2( +static Sk8h blur_y_radius_2( const Sk8h& s0, const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&, Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h*, Sk8h*, Sk8h*, Sk8h*) { @@ -732,7 +714,7 @@ static SK_ALWAYS_INLINE Sk8h blur_y_radius_2( return answer; } -static SK_ALWAYS_INLINE Sk8h blur_y_radius_3( +static Sk8h blur_y_radius_3( const Sk8h& s0, const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h&, Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h*, Sk8h*) { @@ -752,7 +734,7 @@ static SK_ALWAYS_INLINE Sk8h blur_y_radius_3( return answer; } -static SK_ALWAYS_INLINE Sk8h blur_y_radius_4( +static Sk8h blur_y_radius_4( const Sk8h& s0, const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4, Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h* d67, Sk8h* d78) { @@ -778,7 +760,8 @@ static SK_ALWAYS_INLINE Sk8h blur_y_radius_4( using BlurY = decltype(blur_y_radius_1); // BlurY will be one of blur_y_radius_(1|2|3|4). -template <typename Loader> static SK_ALWAYS_INLINE void blur_column( +static void blur_column( + ToA8 toA8, BlurY blur, int radius, int width, const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4, const uint8_t* src, size_t srcRB, int srcH, @@ -794,7 +777,7 @@ template <typename Loader> static SK_ALWAYS_INLINE void blur_column( }; for (int y = 0; y < srcH; y += 1) { - auto s = Loader::load(src, width); + auto s = load(src, width, toA8); auto b = blur(s, g0, g1, g2, g3, g4, &d01, &d12, &d23, &d34, &d45, &d56, &d67, &d78); @@ -818,10 +801,10 @@ template <typename Loader> static SK_ALWAYS_INLINE void blur_column( } // BlurY will be one of blur_y_radius_(1|2|3|4). -template <typename Loader> static SK_ALWAYS_INLINE void blur_y_rect( - BlurY blur, int radius, uint16_t *gauss, - const uint8_t *src, size_t srcRB, int srcW, int srcH, - uint8_t *dst, size_t dstRB) { +static void blur_y_rect(ToA8 toA8, const int strideOf8, + BlurY blur, int radius, uint16_t *gauss, + const uint8_t *src, size_t srcRB, int srcW, int srcH, + uint8_t *dst, size_t dstRB) { Sk8h g0{gauss[0]}, g1{gauss[1]}, @@ -831,51 +814,51 @@ template <typename Loader> static SK_ALWAYS_INLINE void blur_y_rect( int x = 0; for (; x <= srcW - 8; x += 8) { - blur_column<Loader>(blur, radius, 8, - g0, g1, g2, g3, g4, - src, srcRB, srcH, - dst, dstRB); - src += Loader::StrideOf8; + blur_column(toA8, blur, radius, 8, + g0, g1, g2, g3, g4, + src, srcRB, srcH, + dst, dstRB); + src += strideOf8; dst += 8; } int xTail = srcW - x; if (xTail > 0) { - blur_column<Loader>(blur, radius, xTail, - g0, g1, g2, g3, g4, - src, srcRB, srcH, - dst, dstRB); + blur_column(toA8, blur, radius, xTail, + g0, g1, g2, g3, g4, + src, srcRB, srcH, + dst, dstRB); } } -template <typename Loader> SK_ATTRIBUTE(noinline) static void direct_blur_y( - int radius, uint16_t* gauss, - const uint8_t* src, size_t srcRB, int srcW, int srcH, - uint8_t* dst, size_t dstRB) { +static void direct_blur_y(ToA8 toA8, const int strideOf8, + int radius, uint16_t* gauss, + const uint8_t* src, size_t srcRB, int srcW, int srcH, + uint8_t* dst, size_t dstRB) { switch (radius) { case 1: - blur_y_rect<Loader>(blur_y_radius_1, 1, gauss, - src, srcRB, srcW, srcH, - dst, dstRB); + blur_y_rect(toA8, strideOf8, blur_y_radius_1, 1, gauss, + src, srcRB, srcW, srcH, + dst, dstRB); break; case 2: - blur_y_rect<Loader>(blur_y_radius_2, 2, gauss, - src, srcRB, srcW, srcH, - dst, dstRB); + blur_y_rect(toA8, strideOf8, blur_y_radius_2, 2, gauss, + src, srcRB, srcW, srcH, + dst, dstRB); break; case 3: - blur_y_rect<Loader>(blur_y_radius_3, 3, gauss, - src, srcRB, srcW, srcH, - dst, dstRB); + blur_y_rect(toA8, strideOf8, blur_y_radius_3, 3, gauss, + src, srcRB, srcW, srcH, + dst, dstRB); break; case 4: - blur_y_rect<Loader>(blur_y_radius_4, 4, gauss, - src, srcRB, srcW, srcH, - dst, dstRB); + blur_y_rect(toA8, strideOf8, blur_y_radius_4, 4, gauss, + src, srcRB, srcW, srcH, + dst, dstRB); break; default: @@ -932,24 +915,27 @@ static SkIPoint small_blur(double sigmaX, double sigmaY, const SkMask& src, SkMa // Blur vertically and copy to destination. switch (src.fFormat) { case SkMask::kBW_Format: - direct_blur_y<LoadBW>(radiusY, gaussFactorsY, - src.fImage, srcRB, srcW, srcH, - dst->fImage + radiusX, dstRB); + direct_blur_y(bw_to_a8, 1, + radiusY, gaussFactorsY, + src.fImage, srcRB, srcW, srcH, + dst->fImage + radiusX, dstRB); break; case SkMask::kA8_Format: - direct_blur_y<LoadA8>(radiusY, gaussFactorsY, - src.fImage, srcRB, srcW, srcH, - dst->fImage + radiusX, dstRB); + direct_blur_y(nullptr, 8, + radiusY, gaussFactorsY, + src.fImage, srcRB, srcW, srcH, + dst->fImage + radiusX, dstRB); break; case SkMask::kARGB32_Format: - direct_blur_y<LoadARGB>(radiusY, gaussFactorsY, - src.fImage, srcRB, srcW, srcH, - dst->fImage + radiusX, dstRB); + direct_blur_y(argb32_to_a8, 32, + radiusY, gaussFactorsY, + src.fImage, srcRB, srcW, srcH, + dst->fImage + radiusX, dstRB); break; case SkMask::kLCD16_Format: - direct_blur_y<LoadLCD>(radiusY, gaussFactorsY, - src.fImage, srcRB, srcW, srcH, - dst->fImage + radiusX, dstRB); + direct_blur_y(lcd_to_a8, 16, radiusY, gaussFactorsY, + src.fImage, srcRB, srcW, srcH, + dst->fImage + radiusX, dstRB); break; default: SK_ABORT("Unhandled format."); @@ -963,8 +949,6 @@ static SkIPoint small_blur(double sigmaX, double sigmaY, const SkMask& src, SkMa return {radiusX, radiusY}; } -} // namespace - // TODO: assuming sigmaW = sigmaH. Allow different sigmas. Right now the // API forces the sigmas to be the same. SkIPoint SkMaskBlurFilter::blur(const SkMask& src, SkMask* dst) const { |