// Copyright 2022 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
#define ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
#include <cstdint>
#include "absl/base/config.h"
// -------------------------------------------------------------------------
// Many x86 and ARM machines have CRC acceleration hardware.
// We can do a faster version of Extend() on such machines.
// We define a translation layer for both x86 and ARM for ease of use and to
// capture most of the performance gains.
// This implementation requires 64-bit CRC instructions (part of SSE 4.2) and
// PCLMULQDQ instructions. 32-bit builds with SSE 4.2 do exist, so the
// __x86_64__ condition is necessary.
#if defined(__x86_64__) && defined(__SSE4_2__) && defined(__PCLMUL__)
#include <x86intrin.h>
#define ABSL_CRC_INTERNAL_HAVE_X86_SIMD
#elif defined(_MSC_VER) && !defined(__clang__) && defined(__AVX__) && \
defined(_M_AMD64)
// MSVC AVX (/arch:AVX) implies SSE 4.2 and PCLMULQDQ.
#include <intrin.h>
#define ABSL_CRC_INTERNAL_HAVE_X86_SIMD
#elif defined(__aarch64__) && defined(__LITTLE_ENDIAN__) && \
defined(__ARM_FEATURE_CRC32) && defined(ABSL_INTERNAL_HAVE_ARM_NEON) && \
defined(__ARM_FEATURE_CRYPTO)
#include <arm_acle.h>
#include <arm_neon.h>
#define ABSL_CRC_INTERNAL_HAVE_ARM_SIMD
#endif
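// An illustrative sketch, not code from this header: translation units that
// consume this layer are expected to guard their SIMD paths on the macros
// defined above and fall back to a portable implementation otherwise.
//
//   #if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD) || defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
//     // ... accelerated CRC built on CRC32_u64(), V128_PMulLow(), etc.
//   #else
//     // ... portable table-driven fallback.
//   #endif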
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {
#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \
defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
using V128 = uint64x2_t;
#else
// Note: Do not use __m128i_u, it is not portable.
// Use V128_LoadU() to perform an unaligned load from __m128i*.
using V128 = __m128i;
#endif
// Starting with the initial value in |crc|, accumulates a CRC32 value for
// unsigned integers of different sizes.
uint32_t CRC32_u8(uint32_t crc, uint8_t v);
uint32_t CRC32_u16(uint32_t crc, uint16_t v);
uint32_t CRC32_u32(uint32_t crc, uint32_t v);
uint32_t CRC32_u64(uint32_t crc, uint64_t v);
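// An illustrative sketch, not code from this header: callers typically chain
// these accumulators to extend a CRC over a byte range, consuming 8 bytes per
// step and finishing the tail one byte at a time.
//
//   uint32_t crc = ...;        // running CRC state
//   while (len >= 8) {
//     uint64_t word;
//     memcpy(&word, p, 8);     // unaligned-safe 64-bit load
//     crc = CRC32_u64(crc, word);
//     p += 8;
//     len -= 8;
//   }
//   while (len > 0) {
//     crc = CRC32_u8(crc, *p++);
//     --len;
//   }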
// Loads 128 bits of integer data. |src| must be 16-byte aligned.
V128 V128_Load(const V128* src);
// Loads 128 bits of integer data. |src| does not need to be aligned.
V128 V128_LoadU(const V128* src);
// Stores 128 bits of integer data. |dst| must be 16-byte aligned.
void V128_Store(V128* dst, V128 data);
// Polynomially multiplies the high 64 bits of |l| and |r|.
V128 V128_PMulHi(const V128 l, const V128 r);
// Polynomially multiplies the low 64 bits of |l| and |r|.
V128 V128_PMulLow(const V128 l, const V128 r);
// Polynomially multiplies the low 64 bits of |r| and high 64 bits of |l|.
V128 V128_PMul01(const V128 l, const V128 r);
// Polynomially multiplies the low 64 bits of |l| and high 64 bits of |r|.
V128 V128_PMul10(const V128 l, const V128 r);
// Produces the bitwise XOR of |l| and |r|.
V128 V128_Xor(const V128 l, const V128 r);
// Produces the bitwise AND of |l| and |r|.
V128 V128_And(const V128 l, const V128 r);
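// An illustrative sketch, not code from this header: the PMul and Xor
// primitives above compose into the classic carry-less-multiply folding step,
// where |k| holds two precomputed 64-bit folding constants and |data| is the
// next 128-bit block.
//
//   // acc' = (acc_lo clmul k_lo) ^ (acc_hi clmul k_hi) ^ data
//   acc = V128_Xor(V128_Xor(V128_PMulLow(acc, k), V128_PMulHi(acc, k)), data);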
// Sets the lower half of a 128-bit register to the given 64-bit value and
// zeroes the upper half.
// dst[63:0] := |r|
// dst[127:64] := |0|
V128 V128_From64WithZeroFill(const uint64_t r);
// Shifts |l| right by |imm| bytes while shifting in zeros.
template <int imm>
V128 V128_ShiftRight(const V128 l);
// Extracts a 32-bit integer from |l|, selected with |imm|.
template <int imm>
int V128_Extract32(const V128 l);
// Extracts a 64-bit integer from |l|, selected with |imm|.
template <int imm>
uint64_t V128_Extract64(const V128 l);
// Extracts the low 64 bits from V128.
int64_t V128_Low64(const V128 l);
// Left-shifts packed 64-bit integers in |l| by |r|.
V128 V128_ShiftLeft64(const V128 l, const V128 r);
#endif
#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) {
return _mm_crc32_u8(crc, v);
}
inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) {
return _mm_crc32_u16(crc, v);
}
inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
return _mm_crc32_u32(crc, v);
}
inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
return static_cast<uint32_t>(_mm_crc32_u64(crc, v));
}
inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
inline void V128_Store(V128* dst, V128 data) { _mm_store_si128(dst, data); }
inline V128 V128_PMulHi(const V128 l, const V128 r) {
return _mm_clmulepi64_si128(l, r, 0x11);
}
inline V128 V128_PMulLow(const V128 l, const V128 r) {
return _mm_clmulepi64_si128(l, r, 0x00);
}
inline V128 V128_PMul01(const V128 l, const V128 r) {
return _mm_clmulepi64_si128(l, r, 0x01);
}
inline V128 V128_PMul10(const V128 l, const V128 r) {
return _mm_clmulepi64_si128(l, r, 0x10);
}
inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); }
inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); }
inline V128 V128_From64WithZeroFill(const uint64_t r) {
return _mm_set_epi64x(static_cast<int64_t>(0), static_cast<int64_t>(r));
}
template <int imm>
inline V128 V128_ShiftRight(const V128 l) {
return _mm_srli_si128(l, imm);
}
template <int imm>
inline int V128_Extract32(const V128 l) {
return _mm_extract_epi32(l, imm);
}
template <int imm>
inline uint64_t V128_Extract64(const V128 l) {
return static_cast<uint64_t>(_mm_extract_epi64(l, imm));
}
inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); }
inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
return _mm_sll_epi64(l, r);
}
#elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) { return __crc32cb(crc, v); }
inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) {
return __crc32ch(crc, v);
}
inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
return __crc32cw(crc, v);
}
inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
return __crc32cd(crc, v);
}
inline V128 V128_Load(const V128* src) {
return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
}
inline V128 V128_LoadU(const V128* src) {
return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
}
inline void V128_Store(V128* dst, V128 data) {
vst1q_u64(reinterpret_cast<uint64_t*>(dst), data);
}
// We use inline assembly because clang does not generate the pmull2
// instruction, and performance drops by 15-20% without it.
// TODO(b/193678732): Investigate why there is a slight performance hit when
// using intrinsics instead of inline assembly.
inline V128 V128_PMulHi(const V128 l, const V128 r) {
uint64x2_t res;
__asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t"
: "=w"(res)
: "w"(l), "w"(r));
return res;
}
// TODO(b/193678732): Investigate why the compiler decides to move the constant
// loop multiplicands from GPR to Neon registers every loop iteration.
inline V128 V128_PMulLow(const V128 l, const V128 r) {
uint64x2_t res;
__asm__ __volatile__("pmull %0.1q, %1.1d, %2.1d \n\t"
: "=w"(res)
: "w"(l), "w"(r));
return res;
}
inline V128 V128_PMul01(const V128 l, const V128 r) {
return reinterpret_cast<V128>(vmull_p64(
reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(l))),
reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(r)))));
}
inline V128 V128_PMul10(const V128 l, const V128 r) {
return reinterpret_cast<V128>(vmull_p64(
reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(l))),
reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(r)))));
}
inline V128 V128_Xor(const V128 l, const V128 r) { return veorq_u64(l, r); }
inline V128 V128_And(const V128 l, const V128 r) { return vandq_u64(l, r); }
inline V128 V128_From64WithZeroFill(const uint64_t r) {
constexpr uint64x2_t kZero = {0, 0};
return vsetq_lane_u64(r, kZero, 0);
}
template <int imm>
inline V128 V128_ShiftRight(const V128 l) {
return vreinterpretq_u64_s8(
vextq_s8(vreinterpretq_s8_u64(l), vdupq_n_s8(0), imm));
}
template <int imm>
inline int V128_Extract32(const V128 l) {
return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);
}
template <int imm>
inline uint64_t V128_Extract64(const V128 l) {
return vgetq_lane_u64(l, imm);
}
inline int64_t V128_Low64(const V128 l) {
return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0);
}
inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
return vshlq_u64(l, vreinterpretq_s64_u64(r));
}
#endif
} // namespace crc_internal
ABSL_NAMESPACE_END
} // namespace absl
#endif // ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_