aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/opts/SkUtils_opts.h
blob: 44fe643276cb5139fca3ebfc8af14cbf03f369a3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkUtils_opts_DEFINED
#define SkUtils_opts_DEFINED

namespace SK_OPTS_NS {

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

// Fills dst[0..n) with the 16-bit value val using SSE2 stores.
//
//   dst  destination; no alignment beyond that of uint16_t is required,
//        because every vector store used here is an unaligned store.
//   val  value written to each element.
//   n    element count; nothing is written when n <= 0, which matches the
//        portable scalar fallback in this file.
static void memset16(uint16_t* dst, uint16_t val, int n) {
    // A negative n would skip the bulk loop but still satisfy the (n & 4),
    // (n & 2) and (n & 1) tail tests on its low bits, writing up to seven
    // elements the caller never asked for.
    if (n <= 0) {
        return;
    }

    const __m128i val8 = _mm_set1_epi16(val);  // eight copies of val

    // Bulk: eight elements (16 bytes) per unaligned store.
    for ( ; n >= 8; n -= 8) {
        _mm_storeu_si128((__m128i*)dst, val8);
        dst += 8;
    }

    // Tail: n is now in [0, 7]; its low three bits pick the remaining stores.
    if (n & 4) {
        _mm_storel_epi64((__m128i*)dst, val8);  // low 64 bits == four elements
        dst += 4;
    }
    if (n & 2) {
        // Two element-sized stores instead of one store through a uint32_t*:
        // dst points at uint16_t objects and is only guaranteed 2-byte
        // alignment, so a punned 32-bit store is an aliasing/alignment
        // violation.
        dst[0] = val;
        dst[1] = val;
        dst += 2;
    }
    if (n & 1) {
        *dst = val;
    }
}

// Fills dst[0..n) with the 32-bit value val using SSE2 stores.
//
//   dst  destination; no alignment beyond that of uint32_t is required,
//        because every vector store used here is an unaligned store.
//   val  value written to each element.
//   n    element count; nothing is written when n <= 0, which matches the
//        portable scalar fallback in this file.
static void memset32(uint32_t* dst, uint32_t val, int n) {
    // A negative n would skip the bulk loop but still satisfy the (n & 2)
    // and (n & 1) tail tests on its low bits, writing up to three elements
    // the caller never asked for.
    if (n <= 0) {
        return;
    }

    const __m128i val4 = _mm_set1_epi32(val);  // four copies of val

    // Bulk: four elements (16 bytes) per unaligned store.
    for ( ; n >= 4; n -= 4) {
        _mm_storeu_si128((__m128i*)dst, val4);
        dst += 4;
    }

    // Tail: n is now in [0, 3]; its low two bits pick the remaining stores.
    if (n & 2) {
        _mm_storel_epi64((__m128i*)dst, val4);  // low 64 bits == two elements
        dst += 2;
    }
    if (n & 1) {
        *dst = val;
    }
}

#elif defined(SK_ARM_HAS_NEON)

// Fills dst[0..n) with the 16-bit value `value` using NEON stores.
//
//   dst    destination buffer.
//   value  value written to each element.
//   n      element count; nothing is written when n <= 0, which matches the
//          portable scalar fallback in this file.
static void memset16(uint16_t* dst, uint16_t value, int n) {
    // A negative n would skip the bulk loop but still satisfy the (n & 4)
    // test and the (n & 3) switch on its low bits, writing up to seven
    // elements the caller never asked for.
    if (n <= 0) {
        return;
    }

    uint16x8_t   v8  = vdupq_n_u16(value);      // eight copies of value
    uint16x8x4_t v32 = {{ v8, v8, v8, v8 }};    // thirty-two copies of value

    // Bulk: 32 elements per store. vst4q swizzles the four vectors as it
    // stores them, which is harmless because every lane holds the same value.
    while (n >= 32) {
        vst4q_u16(dst, v32);
        dst += 32;
        n   -= 32;
    }

    // n is now in [0, 31]. Store the remaining whole groups of eight; each
    // case deliberately falls through to the next.
    switch (n / 8) {
        case 3: vst1q_u16(dst, v8); dst += 8;
                // fallthrough
        case 2: vst1q_u16(dst, v8); dst += 8;
                // fallthrough
        case 1: vst1q_u16(dst, v8); dst += 8;
                break;
        default: break;
    }

    // One group of four, if bit 2 of n is set.
    if (n & 4) {
        vst1_u16(dst, vget_low_u16(v8));
        dst += 4;
    }

    // Final zero to three elements, again by deliberate fallthrough.
    switch (n & 3) {
        case 3: *dst++ = value;
                // fallthrough
        case 2: *dst++ = value;
                // fallthrough
        case 1: *dst   = value;
                break;
        default: break;
    }
}

// Fills dst[0..n) with the 32-bit value `value` using NEON stores.
//
//   dst    destination buffer.
//   value  value written to each element.
//   n      element count; nothing is written when n <= 0, which matches the
//          portable scalar fallback in this file.
static void memset32(uint32_t* dst, uint32_t value, int n) {
    // A negative n would skip the bulk loop but still satisfy the (n & 2)
    // and (n & 1) tail tests on its low bits, writing up to three elements
    // the caller never asked for.
    if (n <= 0) {
        return;
    }

    uint32x4_t   v4  = vdupq_n_u32(value);      // four copies of value
    uint32x4x4_t v16 = {{ v4, v4, v4, v4 }};    // sixteen copies of value

    // Bulk: 16 elements per store. vst4q swizzles the four vectors as it
    // stores them, which is harmless because every lane holds the same value.
    while (n >= 16) {
        vst4q_u32(dst, v16);
        dst += 16;
        n   -= 16;
    }

    // n is now in [0, 15]. Store the remaining whole groups of four; each
    // case deliberately falls through to the next.
    switch (n / 4) {
        case 3: vst1q_u32(dst, v4); dst += 4;
                // fallthrough
        case 2: vst1q_u32(dst, v4); dst += 4;
                // fallthrough
        case 1: vst1q_u32(dst, v4); dst += 4;
                break;
        default: break;
    }

    // One pair, if bit 1 of n is set.
    if (n & 2) {
        vst1_u32(dst, vget_low_u32(v4));
        dst += 2;
    }
    // One last element, if n is odd.
    if (n & 1) {
        *dst = value;
    }
}

#else // Neither NEON nor SSE2.

// Portable scalar fill: writes val to dst[0..n). Writes nothing when n <= 0.
static void memset16(uint16_t* dst, uint16_t val, int n) {
    for (int i = 0; i < n; i++) {
        dst[i] = val;
    }
}
// Portable scalar fill: writes val to dst[0..n). Writes nothing when n <= 0.
static void memset32(uint32_t* dst, uint32_t val, int n) {
    for (int i = 0; i < n; i++) {
        dst[i] = val;
    }
}

#endif

}  // namespace SK_OPTS_NS

#endif//SkUtils_opts_DEFINED