src/opts/SkOpts_hsw.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95

/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

// It is not safe to #include any header file here unless it has been vetted for ODR safety:
// all symbols used must be file-scoped static or in an anonymous namespace.  This applies
// to _all_ header files:  C standard library, C++ standard library, Skia... everything.

#include <immintrin.h>   // ODR safe
#include <stdint.h>      // ODR safe

namespace hsw {

    void convolve_vertically(const int16_t* filter, int filterLen,
                             uint8_t* const* srcRows, int width,
                             uint8_t* out, bool hasAlpha) {
        // It's simpler to work with the output array in terms of 4-byte pixels.
        auto dst = (int*)out;

        // Output up to eight pixels per iteration.
        for (int x = 0; x < width; x += 8) {
            // Accumulated result for 4 adjacent pairs of pixels, in signed 17.14 fixed point.
            auto accum01 = _mm256_setzero_si256(),
                 accum23 = _mm256_setzero_si256(),
                 accum45 = _mm256_setzero_si256(),
                 accum67 = _mm256_setzero_si256();

            // Convolve with the filter.  (This inner loop is where we spend ~all our time.)
            for (int i = 0; i < filterLen; i++) {
                auto coeffs = _mm256_set1_epi16(filter[i]);
                auto pixels = _mm256_loadu_si256((const __m256i*)(srcRows[i] + x*4));

                auto pixels_0123 = _mm256_unpacklo_epi8(pixels, _mm256_setzero_si256()),
                     pixels_4567 = _mm256_unpackhi_epi8(pixels, _mm256_setzero_si256());

                auto lo_0123 = _mm256_mullo_epi16(pixels_0123, coeffs),
                     hi_0123 = _mm256_mulhi_epi16(pixels_0123, coeffs),
                     lo_4567 = _mm256_mullo_epi16(pixels_4567, coeffs),
                     hi_4567 = _mm256_mulhi_epi16(pixels_4567, coeffs);

                accum01 = _mm256_add_epi32(accum01, _mm256_unpacklo_epi16(lo_0123, hi_0123));
                accum23 = _mm256_add_epi32(accum23, _mm256_unpackhi_epi16(lo_0123, hi_0123));
                accum45 = _mm256_add_epi32(accum45, _mm256_unpacklo_epi16(lo_4567, hi_4567));
                accum67 = _mm256_add_epi32(accum67, _mm256_unpackhi_epi16(lo_4567, hi_4567));
            }

            // Trim the fractional parts.
            accum01 = _mm256_srai_epi32(accum01, 14);
            accum23 = _mm256_srai_epi32(accum23, 14);
            accum45 = _mm256_srai_epi32(accum45, 14);
            accum67 = _mm256_srai_epi32(accum67, 14);

            // Pack back down to 8-bit channels.
            auto pixels = _mm256_packus_epi16(_mm256_packs_epi32(accum01, accum23),
                                              _mm256_packs_epi32(accum45, accum67));

            if (hasAlpha) {
                // Clamp alpha to the max of r,g,b to make sure we stay premultiplied.
                __m256i max_rg  = _mm256_max_epu8(pixels, _mm256_srli_epi32(pixels,  8)),
                        max_rgb = _mm256_max_epu8(max_rg, _mm256_srli_epi32(pixels, 16));
                pixels = _mm256_max_epu8(pixels, _mm256_slli_epi32(max_rgb, 24));
            } else {
                // Force opaque.
                pixels = _mm256_or_si256(pixels, _mm256_set1_epi32(0xff000000));
            }

            // Normal path to store 8 pixels.
            if (x + 8 <= width) {
                _mm256_storeu_si256((__m256i*)dst, pixels);
                dst += 8;
                continue;
            }

            // Store one pixel at a time on the last iteration.
            for (int i = x; i < width; i++) {
                *dst++ = _mm_cvtsi128_si32(_mm256_castsi256_si128(pixels));
                pixels = _mm256_permutevar8x32_epi32(pixels, _mm256_setr_epi32(1,2,3,4,5,6,7,0));
            }
        }
    }

}

namespace SkOpts {
    // See SkOpts.h, writing SkConvolutionFilter1D::ConvolutionFixed as the underlying type.
    extern void (*convolve_vertically)(const int16_t* filter, int filterLen,
                                       uint8_t* const* srcRows, int width,
                                       uint8_t* out, bool hasAlpha);
    void Init_hsw() {
        convolve_vertically = hsw::convolve_vertically;
    }
}