/*
* Copyright 2016 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#ifndef SkCpu_DEFINED
#define SkCpu_DEFINED
#include "SkTypes.h"
struct SkCpu {
    // Feature bits used on x86 builds (see the SK_CPU_X86 branch of Supports()).
    enum {
        SSE1  = 0x001,
        SSE2  = 0x002,
        SSE3  = 0x004,
        SSSE3 = 0x008,
        SSE41 = 0x010,
        SSE42 = 0x020,
        AVX   = 0x040,
        F16C  = 0x080,
        FMA   = 0x100,
        AVX2  = 0x200,
    };

    // Feature bits used on non-x86 (ARM) builds. These deliberately reuse the
    // low bit positions of the x86 set above.
    enum {
        NEON     = 0x1,
        NEON_FMA = 0x2,
        VFP_FP16 = 0x4,
    };

    // Returns true when every feature bit set in the given mask is available.
    static bool Supports(uint32_t);

private:
    // Why RuntimeCpuFeatures() is declared the way it is:
    //
    // Picture a loop that widens 16-bit floats to 32-bit, does some math, and narrows again:
    //
    //    for (int i = 0; i < N; i++) {
    //        if (SkCpu::Supports(SkCpu::F16C)) {
    //            f32s = SkCpu::F16C_cvtph_ps(f16s);
    //        } else {
    //            f32s = some_slower_f16_to_f32_routine(f16s);
    //        }
    //
    //        ... do some math with f32s ...
    //
    //        if (SkCpu::Supports(SkCpu::F16C)) {
    //            f16s = SkCpu::F16C_cvtps_ph(f32s);
    //        } else {
    //            f16s = some_slower_f32_to_f16_routine(f32s);
    //        }
    //    }
    //
    // SkCpu::Supports() should be eligible for common sub-expression elimination here,
    // so that it is evaluated once instead of N or 2N times.  This matters most when the
    // if-else blocks above are actually inline functions.
    //
    // That only works if RuntimeCpuFeatures() is itself just as friendly to common
    // sub-expression elimination.
    //
    // Where it is available, __attribute__((const)) does exactly what we need.
    //
    // Where it is not (MSVC), a static initializer is used instead.
    // (Static initializers would work fine everywhere, but Chrome really dislikes them.)
#if defined(__clang__) || defined(__GNUC__)  // i.e. GCC, Clang, or clang-cl
    __attribute__((const)) static uint32_t RuntimeCpuFeatures();
#else
    static const uint32_t gCachedCpuFeatures;
    static uint32_t RuntimeCpuFeatures() { return gCachedCpuFeatures; }
#endif
};
// Reports whether all of the feature bits in `mask` are available, combining
// what the build configuration guarantees with what RuntimeCpuFeatures() reports.
// An empty mask (0) is always considered supported.
inline bool SkCpu::Supports(uint32_t mask) {
    // Start from the features implied by the compile-time configuration.
    // Folding these known lower limits into the answer lets the compiler drop
    // many calls to RuntimeCpuFeatures() altogether.
    uint32_t baseline = 0;

#if SK_CPU_X86
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
    baseline |= SSE1;
    #endif
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    baseline |= SSE2;
    #endif
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE3
    baseline |= SSE3;
    #endif
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    baseline |= SSSE3;
    #endif
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
    baseline |= SSE41;
    #endif
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
    baseline |= SSE42;
    #endif
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
    baseline |= AVX;
    #endif
    // F16C would slot in here if an SK_CPU_SSE_LEVEL_F16C is ever added.
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    baseline |= AVX2;
    #endif
    // FMA has no natural place in this total ordering: it shows up on Haswell+
    // alongside AVX2, yet it is technically a separate bit.
    // TODO: circle back on this if we find ourselves limited by lack of compile-time FMA
#else
    #if defined(SK_ARM_HAS_NEON)
    baseline |= NEON;
    #endif
    #if defined(SK_CPU_ARM64)
    baseline |= NEON|NEON_FMA|VFP_FP16;
    #endif
#endif

    const uint32_t available = baseline | RuntimeCpuFeatures();

    // Supported only if no requested bit is missing from the available set.
    return (mask & ~available) == 0;
}
#endif//SkCpu_DEFINED