diff options
author | qiankun.miao <qiankun.miao@intel.com> | 2014-11-25 06:35:02 -0800 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2014-11-25 06:35:02 -0800 |
commit | 2253aa93930cdc5d0615098ce5473065427bcff6 (patch) | |
tree | 8c8d2ac7466025509e4790ad3398a3f869b8519e /src/opts/SkBlitRow_opts_SSE2.cpp | |
parent | 551051c0492f4789df56547e3fe9ac3bfd25f002 (diff) |
Add SkBlendARGB32_SSE2() to clean up code
Related nanobench results:
before:
maxrss loops min median mean max stddev samples config bench
10M 2 31.9µs 32.4µs 33.3µs 38.7µs 6% █▄▂▂▂▁▂▁▁▁ 8888 bitmap_BGRA_8888_A_scale_bicubic
10M 13 43.8µs 51.8µs 49.6µs 57.9µs 11% ▁▁▁▁▂▆▇▆▅█ 8888 bitmap_BGRA_8888_A_scale_bilerp
10M 13 23.7µs 24.3µs 26µs 32.7µs 13% ▅█▆▁▁▁▁▂▁▁ 8888 bitmap_Index_8_A
10M 4 1.68µs 1.7µs 4.09µs 25.4µs 183% █▁▁▁▁▁▁▁▁▁ 8888 text_16_AA_88
10M 144 1.76µs 1.77µs 1.78µs 1.81µs 1% █▂▇▂▅▁▁▁▁▁ 8888 text_16_AA_FF
10M 10 4.7µs 5.34µs 5.61µs 8.63µs 21% █▂▂▃▂▁▁▁▁▄ 8888 rotated_rects_aa_alternating_transparent_and_opaque_src
10M 50 4.44µs 4.47µs 4.5µs 4.71µs 2% █▅▃▂▂▂▁▁▁▁ 8888 rotated_rects_aa_changing_opaque_src
10M 51 4.39µs 4.78µs 5.21µs 6.62µs 17% ▁▆▆▇▁▁█▁▂▂ 8888 rotated_rects_aa_same_opaque_src
10M 50 4.47µs 5.79µs 5.43µs 6.14µs 11% ▄▂▁▃▇▇▆▇▇█ 8888 rotated_rects_aa_alternating_transparent_and_opaque_srcover
10M 30 4.35µs 6.06µs 5.84µs 7.63µs 16% ▅▅▅▄▅▅▄█▁▁ 8888 rotated_rects_aa_changing_transparent_srcover
10M 44 4.31µs 4.51µs 4.76µs 6.25µs 13% ▄▂▂▁█▃▁▃▁▁ 8888 rotated_rects_aa_changing_opaque_srcover
10M 46 4.36µs 4.42µs 4.75µs 6.19µs 14% ▆█▃▁▁▁▁▁▁▁ 8888 rotated_rects_aa_same_transparent_srcover
10M 47 4.29µs 4.35µs 4.44µs 5.15µs 6% ▃▂▂▁▁█▁▁▁▁ 8888 rotated_rects_aa_same_opaque_srcover
10M 3 39.1µs 39.2µs 50.7µs 153µs 71% █▁▁▁▁▁▁▁▁▁ 8888 rectori
10M 1 2.3ms 2.31ms 2.35ms 2.74ms 6% ▁▁▁▁▁▁▁▁█▂ 8888 maskcolor
10M 1 2.33ms 2.34ms 2.53ms 3.14ms 11% ▁▁▁▁▁▁▅█▄▄ 8888 maskopaque
10M 11 15µs 15.3µs 15.7µs 18.3µs 7% ▅▃▂▂▁▁▁▁█▁ 8888 rrects_3_stroke_4
10M 46 3.99µs 4.07µs 4.14µs 4.54µs 4% █▅▅▃▂▂▁▁▁▁ 8888 rrects_3
10M 16 15.6µs 15.9µs 16.1µs 17.5µs 4% █▄▃▂▂▂▁▂▁▁ 8888 ovals_3_stroke_4
10M 40 5.09µs 5.18µs 5.23µs 5.67µs 3% █▅▃▂▂▁▃▁▁▁ 8888 ovals_3
10M 231 1.92µs 1.93µs 1.94µs 2µs 1% █▃▂▁▃▁▁▁▁▁ 8888 zeroradroundrect
10M 924 3.88µs 3.93µs 4.11µs 4.95µs 9% ▁█▆▃▁▁▁▁▁▁ 8888 arbroundrect
10M 8 8.11µs 8.47µs 8.48µs 8.85µs 3% █▅▇▄▄▂▁▄▄▆ 8888 merge_large
10M 14 6.71µs 6.92µs 6.96µs 7.46µs 3% ▃▆▁█▃▃▃▂▂▁ 8888 merge_small
11M 2 225µs 227µs 229µs 233µs 1% ███▃▇▂▃▁▃▂ 8888 displacement_full_large
16M 1 381µs 401µs 401µs 421µs 3% ▅▅▅█▆▄▄▃▃▁ 8888 displacement_alpha_large
19M 1 507µs 508µs 509µs 512µs 0% █▃▂▆▂▂▃▂▃▁ 8888 displacement_zero_large
19M 19 9µs 9.11µs 9.15µs 9.67µs 2% ▄▂▂▂█▂▁▁▁▂ 8888 displacement_full_small
19M 5 54.2µs 54.5µs 54.9µs 58µs 2% █▃▂▂▁▁▃▁▁▁ 8888 blurroundrect_WH[100x100]_cr[90]
20M 1 229µs 230µs 231µs 240µs 2% █▄▃▂▂▁▁▁▁▂ 8888 GM_varied_text_clipped_no_lcd
20M 1 267µs 269µs 270µs 279µs 1% █▄▃▂▂▂▂▂▁▁ 8888 GM_varied_text_ignorable_clip_no_lcd
22M 1 1.95ms 1.97ms 2.03ms 2.46ms 8% ▁▁▁▁▁▁▁▂█▃ 8888 GM_convex_poly_clip
after:
maxrss loops min median mean max stddev samples config bench
10M 2 31.5µs 32.3µs 32.8µs 37.2µs 5% █▄▃▂▂▂▁▁▁▁ 8888 bitmap_BGRA_8888_A_scale_bicubic
10M 13 43.9µs 44µs 44.1µs 44.9µs 1% █▂▁▁▁▆▁▁▁▂ 8888 bitmap_BGRA_8888_A_scale_bilerp
10M 19 22.7µs 23.3µs 25.6µs 32.4µs 14% ▁▁▁▁▁▅▆▁▅█ 8888 bitmap_Index_8_A
10M 5 1.79µs 1.97µs 3.85µs 21.1µs 158% █▁▁▁▁▁▁▁▁▁ 8888 text_16_AA_88
10M 141 1.83µs 1.83µs 1.85µs 1.93µs 2% ▅▁▁█▁▁▁▁▁▁ 8888 text_16_AA_FF
10M 10 4.65µs 4.92µs 5.06µs 6.56µs 11% █▃▃▂▂▂▁▁▁▁ 8888 rotated_rects_aa_alternating_transparent_and_opaque_src
10M 51 4.35µs 4.48µs 4.83µs 6.68µs 17% ▂▁▁▁▁▁▁▂▆█ 8888 rotated_rects_aa_changing_opaque_src
10M 51 4.38µs 4.79µs 4.85µs 5.84µs 11% ▁█▁▃▃▁▄▁▄▇ 8888 rotated_rects_aa_same_opaque_src
10M 32 5.58µs 6.24µs 6.1µs 6.39µs 5% █▂█▆▁▇▄▅▇▇ 8888 rotated_rects_aa_alternating_transparent_and_opaque_srcover
10M 42 4.28µs 5.59µs 5.11µs 6.01µs 15% ▂▂█▇█▂▁▆▁▇ 8888 rotated_rects_aa_changing_transparent_srcover
10M 48 4.24µs 4.33µs 4.58µs 6.46µs 15% ▁▁▁▁▁█▃▂▁▁ 8888 rotated_rects_aa_changing_opaque_srcover
10M 48 4.28µs 4.3µs 4.4µs 5.12µs 6% ▂▂▁▁▁▁▁▁▁█ 8888 rotated_rects_aa_same_transparent_srcover
10M 46 4.24µs 4.29µs 4.66µs 7.11µs 20% ▁▁▁▁▁▁▁▁▃█ 8888 rotated_rects_aa_same_opaque_srcover
10M 3 39.3µs 39.4µs 51.4µs 154µs 70% █▁▁▁▁▁▁▁▁▁ 8888 rectori
10M 1 2.32ms 2.43ms 2.53ms 3.14ms 11% ▁▁▁▁▂▄█▃▅▁ 8888 maskcolor
10M 1 2.33ms 2.37ms 2.54ms 3.21ms 12% ▁▁▁▁▁▂█▅▆▁ 8888 maskopaque
10M 10 15.3µs 15.6µs 15.8µs 17.2µs 4% █▅▃▂▂▂▁▁▁▁ 8888 rrects_3_stroke_4
10M 46 4.03µs 4.09µs 4.15µs 4.47µs 4% █▄▆▂▂▂▁▁▁▁ 8888 rrects_3
10M 15 15.9µs 16.2µs 16.3µs 17.8µs 4% █▄▃▂▂▂▁▁▁▁ 8888 ovals_3_stroke_4
10M 40 5.14µs 5.26µs 5.29µs 5.72µs 3% █▅▃▂▂▁▂▂▁▁ 8888 ovals_3
10M 222 1.91µs 1.99µs 2.21µs 2.91µs 19% ▂▁▁▁▁▁▂▇▇█ 8888 zeroradroundrect
10M 462 3.9µs 3.96µs 4.23µs 5.22µs 12% ▆▄█▁▂▁▁▁▁▁ 8888 arbroundrect
10M 8 8.2µs 8.59µs 8.62µs 8.97µs 3% ▆▄█▄▅▃▁▆▄█ 8888 merge_large
10M 14 6.73µs 6.88µs 6.86µs 7.08µs 2% ▄█▁▂▄▂▅▄▂▅ 8888 merge_small
11M 2 221µs 234µs 237µs 263µs 5% ▄▃▃▃▄▃▂▁▇█ 8888 displacement_full_large
16M 1 387µs 416µs 427µs 471µs 7% ▇█▁▃▃▁▃▃▇▆ 8888 displacement_alpha_large
19M 1 512µs 521µs 528µs 594µs 5% █▂▂▂▁▁▂▃▁▁ 8888 displacement_zero_large
19M 18 9.06µs 9.12µs 9.13µs 9.23µs 1% █▃▃▃▄▃▆▁▅▅ 8888 displacement_full_small
19M 5 55.6µs 55.9µs 56.5µs 59.5µs 2% █▃▂▁▁▁▁▁▅▁ 8888 blurroundrect_WH[100x100]_cr[90]
20M 1 229µs 233µs 235µs 254µs 3% █▄▃▂▂▁▁▂▁▁ 8888 GM_varied_text_clipped_no_lcd
20M 1 270µs 271µs 272µs 278µs 1% █▄▃▂▂▂▁▂▁▇ 8888 GM_varied_text_ignorable_clip_no_lcd
22M 1 1.96ms 2ms 2.06ms 2.45ms 7% ▂▂▁▁▁▁▁▃█▄ 8888 GM_convex_poly_clip
BUG=skia:
Review URL: https://codereview.chromium.org/754733002
Diffstat (limited to 'src/opts/SkBlitRow_opts_SSE2.cpp')
-rw-r--r-- | src/opts/SkBlitRow_opts_SSE2.cpp | 129 |
1 file changed, 10 insertions, 119 deletions
diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp index 17cd20a1a2..7b9c043835 100644 --- a/src/opts/SkBlitRow_opts_SSE2.cpp +++ b/src/opts/SkBlitRow_opts_SSE2.cpp @@ -207,74 +207,14 @@ void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, count--; } - uint32_t src_scale = SkAlpha255To256(alpha); - const __m128i *s = reinterpret_cast<const __m128i*>(src); __m128i *d = reinterpret_cast<__m128i*>(dst); - __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); - __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); - __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) while (count >= 4) { // Load 4 pixels each of src and dest. __m128i src_pixel = _mm_loadu_si128(s); __m128i dst_pixel = _mm_load_si128(d); - // Get red and blue pixels into lower byte of each word. - __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); - __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); - - // Get alpha and green into lower byte of each word. - __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); - __m128i src_ag = _mm_srli_epi16(src_pixel, 8); - - // Put per-pixel alpha in low byte of each word. - // After the following two statements, the dst_alpha looks like - // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3) - __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); - dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); - - // dst_alpha = dst_alpha * src_scale - // Because src_scales are in the higher byte of each word and - // we use mulhi here, the resulting alpha values are already - // in the right place and don't need to be divided by 256. - // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3) - dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide); - - // Subtract alphas from 256, to get 1..256 - dst_alpha = _mm_sub_epi16(c_256, dst_alpha); - - // Multiply red and blue by dst pixel alpha. - dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); - // Multiply alpha and green by dst pixel alpha. 
- dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); - - // Multiply red and blue by global alpha. - // (4 x (0, rs.h, 0, bs.h)) - // where rs.h stands for the higher byte of r * src_scale, - // and bs.h the higher byte of b * src_scale. - // Again, because we use mulhi, the resuling red and blue - // values are already in the right place and don't need to - // be divided by 256. - src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); - // Multiply alpha and green by global alpha. - // (4 x (0, as.h, 0, gs.h)) - src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); - - // Divide by 256. - dst_rb = _mm_srli_epi16(dst_rb, 8); - - // Mask out low bits (goodies already in the right place; no need to divide) - dst_ag = _mm_andnot_si128(rb_mask, dst_ag); - // Shift alpha and green to higher byte of each word. - // (4 x (as.h, 0, gs.h, 0)) - src_ag = _mm_slli_epi16(src_ag, 8); - - // Combine back into RGBA. - dst_pixel = _mm_or_si128(dst_rb, dst_ag); - src_pixel = _mm_or_si128(src_rb, src_ag); - - // Add two pixels into result. - __m128i result = _mm_add_epi8(src_pixel, dst_pixel); + __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha); _mm_store_si128(d, result); s++; d++; @@ -367,73 +307,24 @@ void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr, count--; } __m128i *d = reinterpret_cast<__m128i*>(dst); - __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); - __m128i c_256 = _mm_set1_epi16(256); - __m128i c_1 = _mm_set1_epi16(1); __m128i src_pixel = _mm_set1_epi32(color); while (count >= 4) { - // Load 4 pixels each of src and dest. 
+ // Load 4 dst pixels __m128i dst_pixel = _mm_load_si128(d); - //set the aphla value - __m128i src_scale_wide = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(mask)); - src_scale_wide = _mm_unpacklo_epi8(src_scale_wide, - _mm_setzero_si128()); - src_scale_wide = _mm_unpacklo_epi16(src_scale_wide, src_scale_wide); - - //call SkAlpha255To256() - src_scale_wide = _mm_add_epi16(src_scale_wide, c_1); - - // Get red and blue pixels into lower byte of each word. - __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); - __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); - - // Get alpha and green into lower byte of each word. - __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); - __m128i src_ag = _mm_srli_epi16(src_pixel, 8); - - // Put per-pixel alpha in low byte of each word. - __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); - dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); - - // dst_alpha = dst_alpha * src_scale - dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); - - // Divide by 256. - dst_alpha = _mm_srli_epi16(dst_alpha, 8); - - // Subtract alphas from 256, to get 1..256 - dst_alpha = _mm_sub_epi16(c_256, dst_alpha); - // Multiply red and blue by dst pixel alpha. - dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); - // Multiply alpha and green by dst pixel alpha. - dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); - - // Multiply red and blue by global alpha. - src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); - // Multiply alpha and green by global alpha. - src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); - // Divide by 256. - dst_rb = _mm_srli_epi16(dst_rb, 8); - src_rb = _mm_srli_epi16(src_rb, 8); - - // Mask out low bits (goodies already in the right place; no need to divide) - dst_ag = _mm_andnot_si128(rb_mask, dst_ag); - src_ag = _mm_andnot_si128(rb_mask, src_ag); - - // Combine back into RGBA. - dst_pixel = _mm_or_si128(dst_rb, dst_ag); - __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag); - - // Add two pixels into result. 
- __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel); + // Set the alpha value + __m128i alpha_wide = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(mask)); + alpha_wide = _mm_unpacklo_epi8(alpha_wide, _mm_setzero_si128()); + alpha_wide = _mm_unpacklo_epi16(alpha_wide, _mm_setzero_si128()); + + __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha_wide); _mm_store_si128(d, result); - // load the next 4 pixel + // Load the next 4 dst pixels and alphas mask = mask + 4; d++; count -= 4; } - dst = reinterpret_cast<SkPMColor *>(d); + dst = reinterpret_cast<SkPMColor*>(d); } while (count > 0) { *dst= SkBlendARGB32(color, *dst, *mask); |