path: root/absl/crc/internal/crc_x86_arm_combined.cc
// Copyright 2022 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Hardware accelerated CRC32 computation on Intel and ARM architectures.

#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>

#include "absl/base/attributes.h"
#include "absl/base/config.h"
#include "absl/base/internal/endian.h"
#include "absl/base/prefetch.h"
#include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
#include "absl/crc/internal/crc_internal.h"
#include "absl/memory/memory.h"
#include "absl/numeric/bits.h"

#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \
    defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
#define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
#endif

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

#if defined(ABSL_INTERNAL_CAN_USE_SIMD_CRC32C)

// Implementation details not exported outside of file
namespace {

// Some machines have CRC acceleration hardware.
// We can do a faster version of Extend() on such machines.
class CRC32AcceleratedX86ARMCombined : public CRC32 {
 public:
  CRC32AcceleratedX86ARMCombined() {}
  ~CRC32AcceleratedX86ARMCombined() override {}
  void ExtendByZeroes(uint32_t* crc, size_t length) const override;
  uint32_t ComputeZeroConstant(size_t length) const;

 private:
  CRC32AcceleratedX86ARMCombined(const CRC32AcceleratedX86ARMCombined&) =
      delete;
  CRC32AcceleratedX86ARMCombined& operator=(
      const CRC32AcceleratedX86ARMCombined&) = delete;
};

// Constants for switching between algorithms.
// Chosen by comparing speed at different powers of 2.
constexpr size_t kSmallCutoff = 256;
constexpr size_t kMediumCutoff = 2048;

#define ABSL_INTERNAL_STEP1(crc)                      \
  do {                                                \
    crc = CRC32_u8(static_cast<uint32_t>(crc), *p++); \
  } while (0)
#define ABSL_INTERNAL_STEP2(crc)                                               \
  do {                                                                         \
    crc =                                                                      \
        CRC32_u16(static_cast<uint32_t>(crc), absl::little_endian::Load16(p)); \
    p += 2;                                                                    \
  } while (0)
#define ABSL_INTERNAL_STEP4(crc)                                               \
  do {                                                                         \
    crc =                                                                      \
        CRC32_u32(static_cast<uint32_t>(crc), absl::little_endian::Load32(p)); \
    p += 4;                                                                    \
  } while (0)
#define ABSL_INTERNAL_STEP8(crc, data)                  \
  do {                                                  \
    crc = CRC32_u64(static_cast<uint32_t>(crc),         \
                    absl::little_endian::Load64(data)); \
    data += 8;                                          \
  } while (0)
#define ABSL_INTERNAL_STEP8BY2(crc0, crc1, p0, p1) \
  do {                                             \
    ABSL_INTERNAL_STEP8(crc0, p0);                 \
    ABSL_INTERNAL_STEP8(crc1, p1);                 \
  } while (0)
#define ABSL_INTERNAL_STEP8BY3(crc0, crc1, crc2, p0, p1, p2) \
  do {                                                       \
    ABSL_INTERNAL_STEP8(crc0, p0);                           \
    ABSL_INTERNAL_STEP8(crc1, p1);                           \
    ABSL_INTERNAL_STEP8(crc2, p2);                           \
  } while (0)

namespace {

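// Multiplies two CRC32C values: the carry-less product of a and b is reduced
// back to 32 bits below by XORing its high half with the CRC32 reduction of
// its low half. ComputeZeroConstant uses this to combine the precomputed
// powers in kCRC32CPowers when extending a crc by zeros.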
uint32_t multiply(uint32_t a, uint32_t b) {
  V128 power = V128_From64WithZeroFill(a);
  V128 crc = V128_From64WithZeroFill(b);
  V128 res = V128_PMulLow(power, crc);

  // Combine crc values.
  //
  // Adding res to itself is equivalent to multiplying by 2,
  // or shifting left by 1. Addition is used as not all compilers
  // are able to generate optimal code without this hint.
  // https://godbolt.org/z/rr3fMnf39
  res = V128_Add64(res, res);
  return static_cast<uint32_t>(V128_Extract32<1>(res)) ^
         CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res)));
}

// Powers of the crc32c polynomial, for faster ExtendByZeroes.
// Verified against folly:
// folly/hash/detail/Crc32CombineDetail.cpp
constexpr uint32_t kCRC32CPowers[] = {
    0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955, 0xb8fdb1e7,
    0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62, 0x28461564,
    0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f, 0x538586e3,
    0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe, 0xe94ca9bc,
    0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000, 0x00800000,
    0x00008000, 0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955,
    0xb8fdb1e7, 0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62,
    0x28461564, 0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f,
    0x538586e3, 0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe,
    0xe94ca9bc, 0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000,
    0x00800000, 0x00008000,
};

}  // namespace

// Compute a magic constant, so that multiplying by it is the same as
// extending crc by length zeros.
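// For example, for length == 24 the shifted length below is 6 == 0b110, so
// the returned constant is multiply(kCRC32CPowers[1], kCRC32CPowers[2]):
// one precomputed power per set bit, combined with carry-less multiplication.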
uint32_t CRC32AcceleratedX86ARMCombined::ComputeZeroConstant(
    size_t length) const {
  // Lowest 2 bits are handled separately in ExtendByZeroes
  length >>= 2;

  int index = absl::countr_zero(length);
  uint32_t prev = kCRC32CPowers[index];
  length &= length - 1;

  while (length) {
    // For each bit of length, extend by 2**n zeros.
    index = absl::countr_zero(length);
    prev = multiply(prev, kCRC32CPowers[index]);
    length &= length - 1;
  }
  return prev;
}

void CRC32AcceleratedX86ARMCombined::ExtendByZeroes(uint32_t* crc,
                                                    size_t length) const {
  uint32_t val = *crc;
  // Don't bother with multiplication for small length.
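  // The low two bits of length are folded in with plain crc instructions on
  // zero bytes; ComputeZeroConstant only covers the multiple-of-4 part of
  // length (it shifts it right by 2 first).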
  switch (length & 3) {
    case 0:
      break;
    case 1:
      val = CRC32_u8(val, 0);
      break;
    case 2:
      val = CRC32_u16(val, 0);
      break;
    case 3:
      val = CRC32_u8(val, 0);
      val = CRC32_u16(val, 0);
      break;
  }
  if (length > 3) {
    val = multiply(val, ComputeZeroConstant(length));
  }
  *crc = val;
}

// Taken from Intel paper "Fast CRC Computation for iSCSI Polynomial Using CRC32
// Instruction"
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
// We only need every 4th value, because we unroll the loop by 4.
constexpr uint64_t kClmulConstants[] = {
    0x09e4addf8, 0x0ba4fc28e, 0x00d3b6092, 0x09e4addf8, 0x0ab7aff2a,
    0x102f9b8a2, 0x0b9e02b86, 0x00d3b6092, 0x1bf2e8b8a, 0x18266e456,
    0x0d270f1a2, 0x0ab7aff2a, 0x11eef4f8e, 0x083348832, 0x0dd7e3b0c,
    0x0b9e02b86, 0x0271d9844, 0x1b331e26a, 0x06b749fb2, 0x1bf2e8b8a,
    0x0e6fc4e6a, 0x0ce7f39f4, 0x0d7a4825c, 0x0d270f1a2, 0x026f6a60a,
    0x12ed0daac, 0x068bce87a, 0x11eef4f8e, 0x1329d9f7e, 0x0b3e32c28,
    0x0170076fa, 0x0dd7e3b0c, 0x1fae1cc66, 0x010746f3c, 0x086d8e4d2,
    0x0271d9844, 0x0b3af077a, 0x093a5f730, 0x1d88abd4a, 0x06b749fb2,
    0x0c9c8b782, 0x0cec3662e, 0x1ddffc5d4, 0x0e6fc4e6a, 0x168763fa6,
    0x0b0cd4768, 0x19b1afbc4, 0x0d7a4825c, 0x123888b7a, 0x00167d312,
    0x133d7a042, 0x026f6a60a, 0x000bcf5f6, 0x19d34af3a, 0x1af900c24,
    0x068bce87a, 0x06d390dec, 0x16cba8aca, 0x1f16a3418, 0x1329d9f7e,
    0x19fb2a8b0, 0x02178513a, 0x1a0f717c4, 0x0170076fa,
};

enum class CutoffStrategy {
  // Use 3 CRC streams to fold into 1.
  Fold3,
  // Unroll CRC instructions for 64 bytes.
  Unroll64CRC,
};

// Base class for CRC32AcceleratedX86ARMCombinedMultipleStreams containing the
// methods and data that don't need the template arguments.
class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
    : public CRC32AcceleratedX86ARMCombined {
 protected:
  // Update partialCRC with crc of 64 byte block. Calling FinalizePclmulStream
  // would produce a single crc checksum, but it is expensive. PCLMULQDQ has a
  // high latency, so we run 4 128-bit partial checksums that can be reduced to
  // a single value by FinalizePclmulStream later. Computing crc for arbitrary
  // polynomials with PCLMULQDQ is described in the Intel paper "Fast CRC
  // Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
  // We are applying it to CRC32C polynomial.
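  //
  // Per 128-bit lane this is one fold step of that scheme:
  //   partial = PMulLow(partial, k1k2) ^ PMulHi(partial, k1k2) ^ data
  // i.e. each accumulator is carried forward across the 64 bytes just loaded.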
  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
      const uint8_t* p, V128* partialCRC) const {
    V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2));

    V128 partialCRC1 = partialCRC[0];
    V128 partialCRC2 = partialCRC[1];
    V128 partialCRC3 = partialCRC[2];
    V128 partialCRC4 = partialCRC[3];

    V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
    V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
    V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
    V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
    V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
    V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
    V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
    V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
    partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
    partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
    partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
    partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
    partialCRC1 = V128_Xor(tmp1, partialCRC1);
    partialCRC2 = V128_Xor(tmp2, partialCRC2);
    partialCRC3 = V128_Xor(tmp3, partialCRC3);
    partialCRC4 = V128_Xor(tmp4, partialCRC4);
    partialCRC1 = V128_Xor(partialCRC1, data1);
    partialCRC2 = V128_Xor(partialCRC2, data2);
    partialCRC3 = V128_Xor(partialCRC3, data3);
    partialCRC4 = V128_Xor(partialCRC4, data4);
    partialCRC[0] = partialCRC1;
    partialCRC[1] = partialCRC2;
    partialCRC[2] = partialCRC3;
    partialCRC[3] = partialCRC4;
  }

  // Reduce partialCRC produced by Process64BytesPclmul into a single value
  // that represents the crc checksum of all the processed bytes.
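  //
  // The four accumulators are folded pairwise (k5k6) and then into one
  // 128-bit value (k3k4); that value is reduced further (k5k6, then k7k0)
  // and finished with a Barrett-style reduction using the kPoly constants.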
  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
  FinalizePclmulStream(V128* partialCRC) const {
    V128 partialCRC1 = partialCRC[0];
    V128 partialCRC2 = partialCRC[1];
    V128 partialCRC3 = partialCRC[2];
    V128 partialCRC4 = partialCRC[3];

    // Combine 4 vectors of partial crc into a single vector.
    V128 reductionMultiplicands =
        V128_Load(reinterpret_cast<const V128*>(k5k6));

    V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
    V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);

    partialCRC1 = V128_Xor(low, high);
    partialCRC1 = V128_Xor(partialCRC1, partialCRC2);

    low = V128_PMulLow(reductionMultiplicands, partialCRC3);
    high = V128_PMulHi(reductionMultiplicands, partialCRC3);

    partialCRC3 = V128_Xor(low, high);
    partialCRC3 = V128_Xor(partialCRC3, partialCRC4);

    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4));

    low = V128_PMulLow(reductionMultiplicands, partialCRC1);
    high = V128_PMulHi(reductionMultiplicands, partialCRC1);
    V128 fullCRC = V128_Xor(low, high);
    fullCRC = V128_Xor(fullCRC, partialCRC3);

    // Reduce fullCRC into scalar value.
    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6));

    V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask));

    V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC);
    fullCRC = V128_ShiftRight<8>(fullCRC);
    fullCRC = V128_Xor(fullCRC, tmp);

    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0));

    tmp = V128_ShiftRight<4>(fullCRC);
    fullCRC = V128_And(fullCRC, mask);
    fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC);
    fullCRC = V128_Xor(tmp, fullCRC);

    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly));

    tmp = V128_And(fullCRC, mask);
    tmp = V128_PMul01(reductionMultiplicands, tmp);
    tmp = V128_And(tmp, mask);
    tmp = V128_PMulLow(reductionMultiplicands, tmp);

    fullCRC = V128_Xor(tmp, fullCRC);

    return static_cast<uint64_t>(V128_Extract32<1>(fullCRC));
  }

  // Update crc with 64 bytes of data from p.
  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
                                                          uint64_t crc) const {
    for (int i = 0; i < 8; i++) {
      crc =
          CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p));
      p += 8;
    }
    return crc;
  }

  // Generated by crc32c_x86_test --crc32c_generate_constants=true
  // and verified against constants in linux kernel for S390:
  // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S
  alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8};
  alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e};
  alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6};
  alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000};
  alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1};
  alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u};

  // Medium runs of bytes are broken into groups of kGroupsSmall blocks of the
  // same size. Each group is CRCed in parallel, then combined at the end of
  // the block.
  static constexpr size_t kGroupsSmall = 3;
  // For large runs we use up to kMaxStreams blocks computed with CRC
  // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
  // are combined in the end.
  static constexpr size_t kMaxStreams = 3;
};

#ifdef ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k1k2[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k3k4[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k5k6[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k7k0[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kPoly[2];
alignas(16) constexpr uint32_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMask[4];
constexpr size_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kGroupsSmall;
constexpr size_t CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMaxStreams;
#endif  // ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL

template <size_t num_crc_streams, size_t num_pclmul_streams,
          CutoffStrategy strategy>
class CRC32AcceleratedX86ARMCombinedMultipleStreams
    : public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase {
  ABSL_ATTRIBUTE_HOT
  void Extend(uint32_t* crc, const void* bytes, size_t length) const override {
    static_assert(num_crc_streams >= 1 && num_crc_streams <= kMaxStreams,
                  "Invalid number of crc streams");
    static_assert(num_pclmul_streams >= 0 && num_pclmul_streams <= kMaxStreams,
                  "Invalid number of pclmul streams");
    const uint8_t* p = static_cast<const uint8_t*>(bytes);
    const uint8_t* e = p + length;
    uint32_t l = *crc;
    uint64_t l64;

    // We have dedicated instructions for 1, 2, 4 and 8 bytes.
    if (length & 8) {
      ABSL_INTERNAL_STEP8(l, p);
      length &= ~size_t{8};
    }
    if (length & 4) {
      ABSL_INTERNAL_STEP4(l);
      length &= ~size_t{4};
    }
    if (length & 2) {
      ABSL_INTERNAL_STEP2(l);
      length &= ~size_t{2};
    }
    if (length & 1) {
      ABSL_INTERNAL_STEP1(l);
      length &= ~size_t{1};
    }
    if (length == 0) {
      *crc = l;
      return;
    }
    // length is now a multiple of 16.

    // For small blocks just run a simple loop, because the cost of combining
    // multiple streams is significant.
    if (strategy != CutoffStrategy::Unroll64CRC) {
      if (length < kSmallCutoff) {
        while (length >= 16) {
          ABSL_INTERNAL_STEP8(l, p);
          ABSL_INTERNAL_STEP8(l, p);
          length -= 16;
        }
        *crc = l;
        return;
      }
    }

    // For medium blocks we run 3 crc streams and combine them as described in
    // the Intel paper above. Running a 4th stream doesn't help, because the
    // crc instruction has latency 3 and throughput 1.
    if (length < kMediumCutoff) {
      l64 = l;
      if (strategy == CutoffStrategy::Fold3) {
        uint64_t l641 = 0;
        uint64_t l642 = 0;
        const size_t blockSize = 32;
        size_t bs = static_cast<size_t>(e - p) / kGroupsSmall / blockSize;
        const uint8_t* p1 = p + bs * blockSize;
        const uint8_t* p2 = p1 + bs * blockSize;

        for (size_t i = 0; i + 1 < bs; ++i) {
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(p + kPrefetchHorizonMedium));
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(p1 + kPrefetchHorizonMedium));
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(p2 + kPrefetchHorizonMedium));
        }
        // Don't run crc on last 8 bytes.
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY2(l64, l641, p, p1);

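        // Combine the three streams: l64 and l641 are carried forward over
        // the data that followed them via carry-less multiplication with the
        // precomputed constants for this block count, XORed into the last 8
        // bytes of the third stream, and folded in with one final CRC32 step
        // seeded by l642.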
        V128 magic = *(reinterpret_cast<const V128*>(kClmulConstants) + bs - 1);

        V128 tmp = V128_From64WithZeroFill(l64);

        V128 res1 = V128_PMulLow(tmp, magic);

        tmp = V128_From64WithZeroFill(l641);

        V128 res2 = V128_PMul10(tmp, magic);
        V128 x = V128_Xor(res1, res2);
        l64 = static_cast<uint64_t>(V128_Low64(x)) ^
              absl::little_endian::Load64(p2);
        l64 = CRC32_u64(static_cast<uint32_t>(l642), l64);

        p = p2 + 8;
      } else if (strategy == CutoffStrategy::Unroll64CRC) {
        while ((e - p) >= 64) {
          l64 = Process64BytesCRC(p, l64);
          p += 64;
        }
      }
    } else {
      // There is a lot of data, so we can ignore the combine costs and run
      // all requested streams (num_crc_streams + num_pclmul_streams) with
      // prefetching. CRC and PCLMULQDQ use different cpu execution units, so
      // on some cpus it makes sense to execute both of them on different
      // streams.

      // Point x at first 8-byte aligned byte in string.
      const uint8_t* x = RoundUp<8>(p);
      // Process bytes until p is 8-byte aligned, if that isn't past the end.
      while (p != x) {
        ABSL_INTERNAL_STEP1(l);
      }

      size_t bs = static_cast<size_t>(e - p) /
                  (num_crc_streams + num_pclmul_streams) / 64;
      const uint8_t* crc_streams[kMaxStreams];
      const uint8_t* pclmul_streams[kMaxStreams];
      // We are guaranteed to have at least one crc stream.
      crc_streams[0] = p;
      for (size_t i = 1; i < num_crc_streams; i++) {
        crc_streams[i] = crc_streams[i - 1] + bs * 64;
      }
      pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64;
      for (size_t i = 1; i < num_pclmul_streams; i++) {
        pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64;
      }

      // Per stream crc sums.
      uint64_t l64_crc[kMaxStreams] = {l};
      uint64_t l64_pclmul[kMaxStreams] = {0};

      // Peel the first iteration, because the PCLMULQDQ stream needs setup.
      for (size_t i = 0; i < num_crc_streams; i++) {
        l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
        crc_streams[i] += 16 * 4;
      }

      V128 partialCRC[kMaxStreams][4];
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        partialCRC[i][0] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 0));
        partialCRC[i][1] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 1));
        partialCRC[i][2] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 2));
        partialCRC[i][3] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 3));
        pclmul_streams[i] += 16 * 4;
      }

      for (size_t i = 1; i < bs; i++) {
        // Prefetch data for next iterations.
        for (size_t j = 0; j < num_crc_streams; j++) {
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(crc_streams[j] + kPrefetchHorizon));
        }
        for (size_t j = 0; j < num_pclmul_streams; j++) {
          PrefetchToLocalCache(reinterpret_cast<const char*>(pclmul_streams[j] +
                                                             kPrefetchHorizon));
        }

        // We process each stream in 64 byte blocks. This can be written as
        // for (int i = 0; i < num_pclmul_streams; i++) {
        //   Process64BytesPclmul(pclmul_streams[i], partialCRC[i]);
        //   pclmul_streams[i] += 16 * 4;
        // }
        // for (int i = 0; i < num_crc_streams; i++) {
        //   l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
        //   crc_streams[i] += 16*4;
        // }
        // But unrolling and interleaving PCLMULQDQ and CRC blocks manually
        // gives ~2% performance boost.
        l64_crc[0] = Process64BytesCRC(crc_streams[0], l64_crc[0]);
        crc_streams[0] += 16 * 4;
        if (num_pclmul_streams > 0) {
          Process64BytesPclmul(pclmul_streams[0], partialCRC[0]);
          pclmul_streams[0] += 16 * 4;
        }
        if (num_crc_streams > 1) {
          l64_crc[1] = Process64BytesCRC(crc_streams[1], l64_crc[1]);
          crc_streams[1] += 16 * 4;
        }
        if (num_pclmul_streams > 1) {
          Process64BytesPclmul(pclmul_streams[1], partialCRC[1]);
          pclmul_streams[1] += 16 * 4;
        }
        if (num_crc_streams > 2) {
          l64_crc[2] = Process64BytesCRC(crc_streams[2], l64_crc[2]);
          crc_streams[2] += 16 * 4;
        }
        if (num_pclmul_streams > 2) {
          Process64BytesPclmul(pclmul_streams[2], partialCRC[2]);
          pclmul_streams[2] += 16 * 4;
        }
      }

      // PCLMULQDQ-based streams require a special final step;
      // CRC-based ones don't.
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]);
      }

      // Combine all streams into a single result.
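      // Each stream covered bs * 64 bytes, so before the next stream's
      // checksum is XORed in, the running value is extended by that many
      // zero bytes via multiply() with the matching ComputeZeroConstant
      // value.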
      uint32_t magic = ComputeZeroConstant(bs * 64);
      l64 = l64_crc[0];
      for (size_t i = 1; i < num_crc_streams; i++) {
        l64 = multiply(static_cast<uint32_t>(l64), magic);
        l64 ^= l64_crc[i];
      }
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        l64 = multiply(static_cast<uint32_t>(l64), magic);
        l64 ^= l64_pclmul[i];
      }

      // Update p.
      if (num_pclmul_streams > 0) {
        p = pclmul_streams[num_pclmul_streams - 1];
      } else {
        p = crc_streams[num_crc_streams - 1];
      }
    }
    l = static_cast<uint32_t>(l64);

    while ((e - p) >= 16) {
      ABSL_INTERNAL_STEP8(l, p);
      ABSL_INTERNAL_STEP8(l, p);
    }
    // Process the last few bytes
    while (p != e) {
      ABSL_INTERNAL_STEP1(l);
    }

#undef ABSL_INTERNAL_STEP8BY3
#undef ABSL_INTERNAL_STEP8BY2
#undef ABSL_INTERNAL_STEP8
#undef ABSL_INTERNAL_STEP4
#undef ABSL_INTERNAL_STEP2
#undef ABSL_INTERNAL_STEP1

    *crc = l;
  }
};

}  // namespace

// Intel processors with SSE4.2 have an instruction for one particular
// 32-bit CRC polynomial:  crc32c
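// The stream counts below are the <num_crc_streams, num_pclmul_streams>
// template arguments of CRC32AcceleratedX86ARMCombinedMultipleStreams.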
CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() {
  CpuType type = GetCpuType();
  switch (type) {
    case CpuType::kIntelHaswell:
    case CpuType::kAmdRome:
    case CpuType::kAmdNaples:
    case CpuType::kAmdMilan:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 1, CutoffStrategy::Fold3>();
    // PCLMULQDQ is fast, use combined PCLMULQDQ + CRC implementation.
    case CpuType::kIntelCascadelakeXeon:
    case CpuType::kIntelSkylakeXeon:
    case CpuType::kIntelBroadwell:
    case CpuType::kIntelSkylake:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 2, CutoffStrategy::Fold3>();
    // PCLMULQDQ is slow, don't use it.
    case CpuType::kIntelIvybridge:
    case CpuType::kIntelSandybridge:
    case CpuType::kIntelWestmere:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 0, CutoffStrategy::Fold3>();
    case CpuType::kArmNeoverseN1:
    case CpuType::kArmNeoverseN2:
    case CpuType::kArmNeoverseV1:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          1, 1, CutoffStrategy::Unroll64CRC>();
    case CpuType::kAmpereSiryn:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 2, CutoffStrategy::Fold3>();
    case CpuType::kArmNeoverseV2:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          1, 2, CutoffStrategy::Unroll64CRC>();
#if defined(__aarch64__)
    default:
      // Not all ARM processors support the needed instructions, so check here
      // before trying to use an accelerated implementation.
      if (SupportsArmCRC32PMULL()) {
        return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
            1, 1, CutoffStrategy::Unroll64CRC>();
      } else {
        return nullptr;
      }
#else
    default:
      // Something else, play it safe and assume slow PCLMULQDQ.
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 0, CutoffStrategy::Fold3>();
#endif
  }
}

std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
  auto ret = std::vector<std::unique_ptr<CRCImpl>>();
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 3, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 3, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 3, CutoffStrategy::Unroll64CRC>>());

  return ret;
}

#else  // !ABSL_INTERNAL_CAN_USE_SIMD_CRC32C

std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
  return std::vector<std::unique_ptr<CRCImpl>>();
}

// no hardware acceleration available
CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() { return nullptr; }

#endif

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl