diff options
author | Mike Klein <mtklein@chromium.org> | 2017-01-26 15:46:02 -0500 |
---|---|---|
committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2017-01-26 21:58:09 +0000 |
commit | 690dd5e3747cf36b42d37fcd460b357c8e2c6f92 (patch) | |
tree | b13378568c9dd46693d90f9a3172168573062a9a /src/opts | |
parent | 88235bbbbe51cc58af8c89f74ef153b433934dd6 (diff) |
Really use vpmaddwd in hsw::convolve_vertical().
No pixel diffs.
Performance on 8888 looks like an overall win.
Before:
micros bench
222.41 bitmap_scale_filter_64_256
40.06 bitmap_scale_filter_256_64
8.17 bitmap_scale_filter_90_10
10.32 bitmap_scale_filter_90_30
22.50 bitmap_scale_filter_90_80
1.80 bitmap_scale_filter_90_90
57.51 bitmap_scale_filter_80_90
41.99 bitmap_scale_filter_30_90
31.51 bitmap_scale_filter_10_90
After:
micros bench
193.60 bitmap_scale_filter_64_256
46.26 bitmap_scale_filter_256_64
7.81 bitmap_scale_filter_90_10
9.99 bitmap_scale_filter_90_30
22.05 bitmap_scale_filter_90_80
1.96 bitmap_scale_filter_90_90
52.07 bitmap_scale_filter_80_90
37.73 bitmap_scale_filter_30_90
27.63 bitmap_scale_filter_10_90
Change-Id: I2f29366b0fd503176c5af4d825fa524e632da21b
Reviewed-on: https://skia-review.googlesource.com/7630
Reviewed-by: Matt Sarett <msarett@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/opts')
-rw-r--r-- | src/opts/SkOpts_hsw.cpp | 48 |
1 files changed, 31 insertions, 17 deletions
diff --git a/src/opts/SkOpts_hsw.cpp b/src/opts/SkOpts_hsw.cpp index f14d2a5aea..dded64776a 100644 --- a/src/opts/SkOpts_hsw.cpp +++ b/src/opts/SkOpts_hsw.cpp @@ -32,25 +32,39 @@ namespace hsw { accum37 = _mm256_setzero_si256(); // Convolve with the filter. (This inner loop is where we spend ~all our time.) - for (int i = 0; i < filterLen; i++) { - auto coeffs = _mm256_set1_epi16(filter[i]); - auto pixels = _mm256_loadu_si256((const __m256i*)(srcRows[i] + x*4)); - - auto pixels_0145 = _mm256_unpacklo_epi8(pixels, _mm256_setzero_si256()), - pixels_2367 = _mm256_unpackhi_epi8(pixels, _mm256_setzero_si256()); - - auto lo_0145 = _mm256_mullo_epi16(pixels_0145, coeffs), - hi_0145 = _mm256_mulhi_epi16(pixels_0145, coeffs), - lo_2367 = _mm256_mullo_epi16(pixels_2367, coeffs), - hi_2367 = _mm256_mulhi_epi16(pixels_2367, coeffs); - - accum04 = _mm256_add_epi32(accum04, _mm256_unpacklo_epi16(lo_0145, hi_0145)); - accum15 = _mm256_add_epi32(accum15, _mm256_unpackhi_epi16(lo_0145, hi_0145)); - accum26 = _mm256_add_epi32(accum26, _mm256_unpacklo_epi16(lo_2367, hi_2367)); - accum37 = _mm256_add_epi32(accum37, _mm256_unpackhi_epi16(lo_2367, hi_2367)); + // While we can, we consume 2 filter coefficients and 2 rows of 8 pixels each at a time. + auto convolve_16_pixels = [&](__m256i interlaced_coeffs, + __m256i pixels_01234567, __m256i pixels_89ABCDEF) { + // Interlaced R0R8 G0G8 B0B8 A0A8 R1R9 G1G9... 32 8-bit values each. + auto _08194C5D = _mm256_unpacklo_epi8(pixels_01234567, pixels_89ABCDEF), + _2A3B6E7F = _mm256_unpackhi_epi8(pixels_01234567, pixels_89ABCDEF); + + // Still interlaced R0R8 G0G8... as above, each channel expanded to 16-bit lanes. + auto _084C = _mm256_unpacklo_epi8(_08194C5D, _mm256_setzero_si256()), + _195D = _mm256_unpackhi_epi8(_08194C5D, _mm256_setzero_si256()), + _2A6E = _mm256_unpacklo_epi8(_2A3B6E7F, _mm256_setzero_si256()), + _3B7F = _mm256_unpackhi_epi8(_2A3B6E7F, _mm256_setzero_si256()); + + // accum0_R += R0*coeff0 + R8*coeff1, etc. + accum04 = _mm256_add_epi32(accum04, _mm256_madd_epi16(_084C, interlaced_coeffs)); + accum15 = _mm256_add_epi32(accum15, _mm256_madd_epi16(_195D, interlaced_coeffs)); + accum26 = _mm256_add_epi32(accum26, _mm256_madd_epi16(_2A6E, interlaced_coeffs)); + accum37 = _mm256_add_epi32(accum37, _mm256_madd_epi16(_3B7F, interlaced_coeffs)); + }; + + int i = 0; + for (; i < filterLen/2*2; i += 2) { + convolve_16_pixels(_mm256_set1_epi32(*(const int32_t*)(filter+i)), + _mm256_loadu_si256((const __m256i*)(srcRows[i+0] + x*4)), + _mm256_loadu_si256((const __m256i*)(srcRows[i+1] + x*4))); + } + if (i < filterLen) { + convolve_16_pixels(_mm256_set1_epi32(*(const int16_t*)(filter+i)), + _mm256_loadu_si256((const __m256i*)(srcRows[i] + x*4)), + _mm256_setzero_si256()); } - // Trim the fractional parts. + // Trim the fractional parts off the accumulators. accum04 = _mm256_srai_epi32(accum04, 14); accum15 = _mm256_srai_epi32(accum15, 14); accum26 = _mm256_srai_epi32(accum26, 14); |