diff options
author | msarett <msarett@google.com> | 2016-02-03 15:28:35 -0800 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2016-02-03 15:28:35 -0800 |
commit | 095742419d0277a4fb0d499a05ff29b7506f1c5e (patch) | |
tree | 43ac3fb64a521e30ebd735dc5e1f60ca0d1de600 | |
parent | de3a726ad38b99e5393d5d9e6de3c844893d01b6 (diff) |
SSE optimizations for GrayAlpha -> RGBA/BGRA Premul/Unpremul
Swizzle Runtime (Dell Venue 8)
Unpremul 0.17x
Premul 0.20x
PNG Decode Runtime on GrayAlpha Encoded PNGs (Dell Venue 8)
Unpremul Regular 0.91x
Unpremul ZeroInit 0.92x
Premul Regular 0.84x
Premul ZeroInit 0.85x
BUG=skia:4767
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1666853002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review URL: https://codereview.chromium.org/1666853002
-rw-r--r-- | bench/SwizzleBench.cpp | 2 | ||||
-rw-r--r-- | src/opts/SkSwizzler_opts.h | 67 |
2 files changed, 61 insertions, 8 deletions
diff --git a/bench/SwizzleBench.cpp b/bench/SwizzleBench.cpp index 0f85b59481..cf7a407f0e 100644 --- a/bench/SwizzleBench.cpp +++ b/bench/SwizzleBench.cpp @@ -33,3 +33,5 @@ DEF_BENCH(return new SwizzleBench("SkOpts::RGBA_to_BGRA", SkOpts::RGBA_to_BGRA)) DEF_BENCH(return new SwizzleBench("SkOpts::RGB_to_RGB1", SkOpts::RGB_to_RGB1)); DEF_BENCH(return new SwizzleBench("SkOpts::RGB_to_BGR1", SkOpts::RGB_to_BGR1)); DEF_BENCH(return new SwizzleBench("SkOpts::gray_to_RGB1", SkOpts::gray_to_RGB1)); +DEF_BENCH(return new SwizzleBench("SkOpts::grayA_to_RGBA", SkOpts::grayA_to_RGBA)); +DEF_BENCH(return new SwizzleBench("SkOpts::grayA_to_rgbA", SkOpts::grayA_to_rgbA)); diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h index f0492e7d33..1d3cc51c37 100644 --- a/src/opts/SkSwizzler_opts.h +++ b/src/opts/SkSwizzler_opts.h @@ -403,14 +403,22 @@ static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 +// Scale a byte by another. +// Inputs are stored in 16-bit lanes, but are not larger than 8-bits. +static __m128i scale(__m128i x, __m128i y) { + const __m128i _128 = _mm_set1_epi16(128); + const __m128i _257 = _mm_set1_epi16(257); + + // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. + return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257); +} + template <bool kSwapRB> static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { auto src = (const uint32_t*)vsrc; auto premul8 = [](__m128i* lo, __m128i* hi) { const __m128i zeros = _mm_setzero_si128(); - const __m128i _128 = _mm_set1_epi16(128); - const __m128i _257 = _mm_set1_epi16(257); __m128i planar; if (kSwapRB) { planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); @@ -430,10 +438,10 @@ static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_ a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_ - // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. - r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257); - g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257); - b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); + // Premultiply! + r = scale(r, a); + g = scale(g, a); + b = scale(b, a); // Repack into interlaced pixels. rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG @@ -572,11 +580,54 @@ static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) { gray_to_RGB1_portable(dst, src, count); } -static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) { +static void grayA_to_RGBA(uint32_t dst[], const void* vsrc, int count) { + const uint8_t* src = (const uint8_t*) vsrc; + while (count >= 8) { + __m128i ga = _mm_loadu_si128((const __m128i*) src); + + __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)), + _mm_slli_epi16(ga, 8)); + + __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga); + __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga); + + _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo); + _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi); + + src += 8*2; + dst += 8; + count -= 8; + } + grayA_to_RGBA_portable(dst, src, count); } -static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { +static void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) { + const uint8_t* src = (const uint8_t*) vsrc; + while (count >= 8) { + __m128i grayA = _mm_loadu_si128((const __m128i*) src); + + __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF)); + __m128i a0 = _mm_srli_epi16(grayA, 8); + + // Premultiply + g0 = scale(g0, a0); + + __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8)); + __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8)); + + + __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga); + __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga); + + _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo); + _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi); + + src += 8*2; + dst += 8; + count -= 8; + } + grayA_to_rgbA_portable(dst, src, count); } |