src/opts_neon/SkBitmapProcState_neon.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199

#include <arm_neon.h>
#include "SkBitmapProcState.h"
#include "SkColorPriv.h"
#include "SkFilterProc.h"

void S16_D16_filter_DX_neon(const SkBitmapProcState& s,
                           const uint32_t* SK_RESTRICT xy,
                           int count, uint16_t* SK_RESTRICT colors) {
    SkASSERT(count > 0 && colors != NULL);
    SkASSERT(s.fDoFilter);

    const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels();
    unsigned rb = s.fBitmap->rowBytes();
    unsigned subY;
    const uint16_t* SK_RESTRICT row0;
    const uint16_t* SK_RESTRICT row1;
    unsigned int rowgap;
    const uint32_t c7ffe = 0x7ffe;

    // setup row ptrs and update proc_table
    {
        uint32_t XY = *xy++;
        unsigned y0 = XY >> 14;
        row0 = (const uint16_t*)(srcAddr + (y0 >> 4) * rb);
        row1 = (const uint16_t*)(srcAddr + (XY & 0x3FFF) * rb);
        rowgap = (unsigned int)row1 - (unsigned int)row0;
        subY = y0 & 0xF;
    }

    unsigned int count4 = ((count >> 2) << 4) | subY;
    count &= 3;

    asm volatile (
        "and            r4, %[count4], #0xF             \n\t"   // mask off subY
        "vmov.u16       d2[0], r4                       \n\t"   // move subY to Neon
        "rsb            r4, r4, #16                     \n\t"   // r4 = 16-subY
        "vmov.u16       d2[1], r4                       \n\t"   // move 16-subY to Neon
        "movs           %[count4], %[count4], lsr #4    \n\t"   // shift count down, lose subY
        "vmov.u16       d3, #16                         \n\t"   // create constant
        "vmov.u16       q2, #31                         \n\t"   // set up blue mask
        "beq            2f                              \n\t"   // if count4 == 0, exit

    "1:                                                 \n\t"
        "ldmia          %[xy]!, {r4, r5, r6, r7}        \n\t"   // load four xy values
                                                                // xy = [ x0:14 | subX:4 | x1:14 ]
        // extract subX for iter 0-3
        "vmov           d0, r4, r5                      \n\t"   // move xy to Neon, iter 0-1
        "vmov           d1, r6, r7                      \n\t"   // move xy to Neon, iter 2-3

        // Load 16 pixels for four filter iterations from memory.
        // Because the source pixels are potentially scattered, each lane
        // of each vector is loaded separately. Also, the X sub pixel
        // offset is extracted.

        // iter 0
        "mov            r8, r4, lsr #18                 \n\t"   // extract x0
        "and            r4, %[c7ffe], r4, lsl #1        \n\t"   // extract x1 and make byte offset
        "add            r8, %[row0], r8, lsl #1         \n\t"   // calculate address of row0[x0]
        "add            r4, %[row0], r4                 \n\t"   // calculate address of row0[x1]
        "vld1.u16       {d16[0]}, [r8], %[rowgap]       \n\t"   // load row0[x0] and move ptr to row1
        "vld1.u16       {d17[0]}, [r4], %[rowgap]       \n\t"   // load row0[x1] and move ptr to row1
        "vld1.u16       {d18[0]}, [r8]                  \n\t"   // load row1[x0]
        "vld1.u16       {d19[0]}, [r4]                  \n\t"   // load row1[x1]

        // iter 1
        "mov            r8, r5, lsr #18                 \n\t"   // extract x0
        "and            r5, %[c7ffe], r5, lsl #1        \n\t"   // extract x1 and make byte offset
        "add            r8, %[row0], r8, lsl #1         \n\t"   // calculate address of row0[x0]
        "add            r5, %[row0], r5                 \n\t"   // calculate address of row0[x1]
        "vld1.u16       {d16[1]}, [r8], %[rowgap]       \n\t"   // load row0[x0] and move ptr to row1
        "vld1.u16       {d17[1]}, [r5], %[rowgap]       \n\t"   // load row0[x1] and move ptr to row1
        "vld1.u16       {d18[1]}, [r8]                  \n\t"   // load row1[x0]
        "vld1.u16       {d19[1]}, [r5]                  \n\t"   // load row1[x1]

        "vshrn.u32      d0, q0, #2                      \n\t"   // shift right subX by 2 and narrow
        // iter 2
        "mov            r8, r6, lsr #18                 \n\t"   // extract x0
        "and            r6, %[c7ffe], r6, lsl #1        \n\t"   // extract x1 and make byte offset
        "add            r8, %[row0], r8, lsl #1         \n\t"   // calculate address of row0[x0]
        "add            r6, %[row0], r6                 \n\t"   // calculate address of row0[x1]
        "vld1.u16       {d16[2]}, [r8], %[rowgap]       \n\t"   // load row0[x0] and move ptr to row1
        "vld1.u16       {d17[2]}, [r6], %[rowgap]       \n\t"   // load row0[x1] and move ptr to row1
        "vld1.u16       {d18[2]}, [r8]                  \n\t"   // load row1[x0]
        "vld1.u16       {d19[2]}, [r6]                  \n\t"   // load row1[x1]

        "vshr.u16       d0, d0, #12                     \n\t"   // shift right subX to bottom 4 bits
        // iter 3
        "mov            r8, r7, lsr #18                 \n\t"   // extract x0
        "and            r7, %[c7ffe], r7, lsl #1        \n\t"   // extract x1 and make byte offset
        "add            r8, %[row0], r8, lsl #1         \n\t"   // calculate address of row0[x0]
        "add            r7, %[row0], r7                 \n\t"   // calculate address of row0[x1]
        "vld1.u16       {d16[3]}, [r8], %[rowgap]       \n\t"   // load row0[x0] and move ptr to row1
        "vld1.u16       {d17[3]}, [r7], %[rowgap]       \n\t"   // load row0[x1] and move ptr to row1
        "vld1.u16       {d18[3]}, [r8]                  \n\t"   // load row1[x0]
        "vld1.u16       {d19[3]}, [r7]                  \n\t"   // load row1[x1]

        // Registers d16-d19 now contain pixels a00-a11 for 4 iterations:
        //   d16 = [ a00_3 | a00_2 | a00_1 | a00_0 ]
        //   d17 = [ a01_3 | a01_2 | a01_1 | a01_0 ]
        //   d18 = [ a10_3 | a10_2 | a10_1 | a10_0 ]
        //   d19 = [ a11_3 | a11_2 | a11_1 | a11_0 ]
        //
        // Extract RGB channels from each 565 pixel.

        "vshl.i16       q11, q8, #5                     \n\t"   // shift greens to top of each lane
        "vand           q12, q8, q2                     \n\t"   // mask blues
        "vshr.u16       q10, q8, #11                    \n\t"   // shift reds to bottom of each lane
        "vshr.u16       q11, q11, #10                   \n\t"   // shift greens to bottom of each lane
        "vshl.i16       q14, q9, #5                     \n\t"   // shift greens to top of each lane
        "vand           q15, q9, q2                     \n\t"   // mask blues
        "vshr.u16       q13, q9, #11                    \n\t"   // shift reds to bottom of each lane
        "vshr.u16       q14, q14, #10                   \n\t"   // shift greens to bottom of each lane

        // There are now six Q regs, containing
        //   q10 = [ a01r3 | a01r2 | a01r1 | a01r0 | a00r3 | a00r2 | a00r1 | a00r0 ]
        //   q11 = [ a01g3 | a01g2 | a01g1 | a01g0 | a00g3 | a00g2 | a00g1 | a00g0 ]
        //   q12 = [ a01b3 | a01b2 | a01b1 | a01b0 | a00b3 | a00b2 | a00b1 | a00b0 ]
        //   q13 = [ a11r3 | a11r2 | a11r1 | a11r0 | a01r3 | a01r2 | a01r1 | a01r0 ]
        //   q14 = [ a11g3 | a11g2 | a11g1 | a11g0 | a01g3 | a01g2 | a01g1 | a01g0 ]
        //   q15 = [ a11b3 | a11b2 | a11b1 | a11b0 | a01b3 | a01b2 | a01b1 | a01b0 ]
        // where aXXyZ: XX = pixel position, y = colour channel, Z = iteration
        //   d0 = subX, d1 = 16-subX
        //   d2[0] = subY, d2[1] = 16-subY
        //   d3 = 16, q2(d4d5) = 31

        // The filter:
        //
        //           |        |
        //     ---- a00 ---- a01 ----> * (16-y)
        //           |        |
        //     -----a10 ---- a11 ----> * y
        //           |        |
        //           V        V
        //        * (16-x)   * x
        //
        // result = (a00.(16-y).(16-x) + a01.(16-y).x + a10.(16-x).y + a11.x.y) >> 8
        //

        "vsub.u16       d1, d3, d0                      \n\t"   // calculate 16-subX
        // multiply top pixel pair by (16-y)
        "vmul.i16       q10, q10, d2[1]                 \n\t"   // top reds multiplied by (16-y)
        "vmul.i16       q11, q11, d2[1]                 \n\t"   // top greens multiplied by (16-y)
        "vmul.i16       q12, q12, d2[1]                 \n\t"   // top blues multiplied by (16-y)
        // multiply bottom pixel pair by y
        "vmul.i16       q13, q13, d2[0]                 \n\t"   // bottom reds multiplied by y
        "vmul.i16       q14, q14, d2[0]                 \n\t"   // bottom greens multiplied by y
        "vmul.i16       q15, q15, d2[0]                 \n\t"   // bottom blues multiplied by y
        // mul/acc left pixels by (16-x)
        "vmul.i16       d16, d20, d1                    \n\t"   // resultr  = a00r * (16-x)
        "vmul.i16       d17, d22, d1                    \n\t"   // resultg  = a00g * (16-x)
        "vmul.i16       d18, d24, d1                    \n\t"   // resultb  = a00b * (16-x)
        "vmla.i16       d16, d26, d1                    \n\t"   // resultr += a00r * (16-x)
        "vmla.i16       d17, d28, d1                    \n\t"   // resultg += a00g * (16-x)
        "vmla.i16       d18, d30, d1                    \n\t"   // resultb += a00b * (16-x)
        // mul/acc right pixels by x
        "vmla.i16       d16, d21, d0                    \n\t"   // resultr += a01r * x
        "vmla.i16       d17, d23, d0                    \n\t"   // resultg += a01g * x
        "vmla.i16       d18, d25, d0                    \n\t"   // resultb += a01b * x
        "vmla.i16       d16, d27, d0                    \n\t"   // resultr += a11r * x
        "vmla.i16       d17, d29, d0                    \n\t"   // resultg += a11g * x
        "vmla.i16       d18, d31, d0                    \n\t"   // resultb += a11b * x
        "subs           %[count4], %[count4], #1        \n\t"   // decrement counter
        // shift results down 8 bits
        "vshr.u16       q8, q8, #8                      \n\t"   // resultr >>= 8, resultg >>=8
        "vshr.u16       d18, d18, #8                    \n\t"   // resultb >>= 8
        // put rgb into 565
        "vsli.i16       d18, d17, #5                    \n\t"   // shift greens into blues
        "vsli.i16       d18, d16, #11                   \n\t"   // shift reds into greens and blues
        "vst1.i16       {d18}, [%[colors]]!             \n\t"   // store result
        "bgt            1b                              \n\t"   // if counter > 0, loop
    "2:                                                 \n\t"   // exit
        : [xy] "+r" (xy), [count4] "+r" (count4), [colors] "+r" (colors)
        : [row0] "r" (row0), [rowgap] "r" (rowgap), [c7ffe] "r" (c7ffe)
        : "cc", "memory", "r4", "r5", "r6", "r7", "r8", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
    );

    while(count != 0) {
        uint32_t XX = *xy++;    // x0:14 | subX:4 | x1:14
        unsigned x0 = XX >> 14;
        unsigned x1 = XX & 0x3FFF;
        unsigned subX = x0 & 0xF;
        x0 >>= 4;

        uint32_t a00 = SkExpand_rgb_16(row0[x0]);
        uint32_t a01 = SkExpand_rgb_16(row0[x1]);
        uint32_t a10 = SkExpand_rgb_16(row1[x0]);
        uint32_t a11 = SkExpand_rgb_16(row1[x1]);

        int xy = subX * subY >> 3;
        uint32_t c = a00 * (32 - 2*subY - 2*subX + xy) +
                     a01 * (2*subX - xy) +
                     a10 * (2*subY - xy) +
                     a11 * xy;

        *colors++ = SkCompact_rgb_16(c>>5);
        count--;
    }
}