Diffstat (limited to 'absl/crc/internal/crc_x86_arm_combined.cc')
-rw-r--r--  absl/crc/internal/crc_x86_arm_combined.cc | 392
1 file changed, 211 insertions(+), 181 deletions(-)
diff --git a/absl/crc/internal/crc_x86_arm_combined.cc b/absl/crc/internal/crc_x86_arm_combined.cc
index d771a786..2112f609 100644
--- a/absl/crc/internal/crc_x86_arm_combined.cc
+++ b/absl/crc/internal/crc_x86_arm_combined.cc
@@ -14,12 +14,11 @@
 
 // Hardware accelerated CRC32 computation on Intel and ARM architecture.
 
-#include <stddef.h>
-
+#include <cstddef>
 #include <cstdint>
 
 #include "absl/base/attributes.h"
-#include "absl/base/call_once.h"
+#include "absl/base/config.h"
 #include "absl/base/dynamic_annotations.h"
 #include "absl/base/internal/endian.h"
 #include "absl/base/internal/prefetch.h"
@@ -67,27 +66,30 @@ class CRC32AcceleratedX86ARMCombined : public CRC32 {
 
 // Constants for switching between algorithms.
 // Chosen by comparing speed at different powers of 2.
-constexpr int kSmallCutoff = 256;
-constexpr int kMediumCutoff = 2048;
+constexpr size_t kSmallCutoff = 256;
+constexpr size_t kMediumCutoff = 2048;
 
-#define ABSL_INTERNAL_STEP1(crc) \
-  do {                           \
-    crc = CRC32_u8(crc, *p++);   \
+#define ABSL_INTERNAL_STEP1(crc)                      \
+  do {                                                \
+    crc = CRC32_u8(static_cast<uint32_t>(crc), *p++); \
   } while (0)
-#define ABSL_INTERNAL_STEP2(crc)                          \
-  do {                                                    \
-    crc = CRC32_u16(crc, absl::little_endian::Load16(p)); \
-    p += 2;                                               \
+#define ABSL_INTERNAL_STEP2(crc)                                               \
+  do {                                                                         \
+    crc =                                                                      \
+        CRC32_u16(static_cast<uint32_t>(crc), absl::little_endian::Load16(p)); \
+    p += 2;                                                                    \
   } while (0)
-#define ABSL_INTERNAL_STEP4(crc)                          \
-  do {                                                    \
-    crc = CRC32_u32(crc, absl::little_endian::Load32(p)); \
-    p += 4;                                               \
+#define ABSL_INTERNAL_STEP4(crc)                                               \
+  do {                                                                         \
+    crc =                                                                      \
+        CRC32_u32(static_cast<uint32_t>(crc), absl::little_endian::Load32(p)); \
+    p += 4;                                                                    \
   } while (0)
-#define ABSL_INTERNAL_STEP8(crc, data)                       \
-  do {                                                       \
-    crc = CRC32_u64(crc, absl::little_endian::Load64(data)); \
-    data += 8;                                               \
+#define ABSL_INTERNAL_STEP8(crc, data)                  \
+  do {                                                  \
+    crc = CRC32_u64(static_cast<uint32_t>(crc),         \
+                    absl::little_endian::Load64(data)); \
+    data += 8;                                          \
   } while (0)
 #define ABSL_INTERNAL_STEP8BY2(crc0, crc1, p0, p1) \
   do {                                             \
@@ -101,6 +103,8 @@ constexpr int kMediumCutoff = 2048;
     ABSL_INTERNAL_STEP8(crc2, p2); \
   } while (0)
 
+namespace {
+
 uint32_t multiply(uint32_t a, uint32_t b) {
   V128 shifts = V128_From2x64(0, 1);
   V128 power = V128_From2x64(0, a);
@@ -109,11 +113,10 @@ uint32_t multiply(uint32_t a, uint32_t b) {
 
   // Combine crc values
   res = V128_ShiftLeft64(res, shifts);
-  return V128_Extract32<1>(res) ^ CRC32_u32(0, V128_Low64(res));
+  return static_cast<uint32_t>(V128_Extract32<1>(res)) ^
+         CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res)));
 }
 
-namespace {
-
 // Powers of crc32c polynomial, for faster ExtendByZeros.
 // Verified against folly:
 // folly/hash/detail/Crc32CombineDetail.cpp
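The multiply() helper being cast-cleaned above is the core of stream combination: it multiplies two 32-bit CRC states as polynomials over GF(2) and reduces the product modulo the CRC32C polynomial. A minimal scalar model of that identity (an illustrative sketch, not Abseil's code; the bitwise helpers stand in for the PCLMULQDQ and CRC32 instructions):

#include <cstdint>

// Bitwise CRC32C of one 32-bit word (reflected polynomial 0x82F63B78),
// standing in for the hardware CRC32_u32 instruction.
uint32_t SoftCrc32cU32(uint32_t crc, uint32_t data) {
  crc ^= data;
  for (int i = 0; i < 32; ++i)
    crc = (crc & 1) ? (crc >> 1) ^ 0x82F63B78u : crc >> 1;
  return crc;
}

// Carry-less (GF(2)) 32x32->64-bit product, standing in for PCLMULQDQ.
uint64_t Clmul32(uint32_t a, uint32_t b) {
  uint64_t res = 0;
  for (int i = 0; i < 32; ++i)
    if ((b >> i) & 1) res ^= static_cast<uint64_t>(a) << i;
  return res;
}

// Mirrors multiply() above: shift the product left by one (the vector code
// does this with V128_ShiftLeft64), then reduce the low half with one CRC
// step and xor it into the high half.
uint32_t MultiplyModel(uint32_t a, uint32_t b) {
  uint64_t product = Clmul32(a, b) << 1;
  return static_cast<uint32_t>(product >> 32) ^
         SoftCrc32cU32(0, static_cast<uint32_t>(product));
}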
@@ -204,9 +207,167 @@ enum class CutoffStrategy {
   Unroll64CRC,
 };
 
-template <int num_crc_streams, int num_pclmul_streams, CutoffStrategy strategy>
-class CRC32AcceleratedX86ARMCombinedMultipleStreams
+// Base class for CRC32AcceleratedX86ARMCombinedMultipleStreams containing the
+// methods and data that don't need the template arguments.
+class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
     : public CRC32AcceleratedX86ARMCombined {
+ protected:
+  // Update partialCRC with the crc of a 64 byte block. Calling
+  // FinalizePclmulStream would produce a single crc checksum, but it is
+  // expensive. PCLMULQDQ has a high latency, so we run 4 128-bit partial
+  // checksums that can be reduced to a single value by FinalizePclmulStream
+  // later. Computing crc for arbitrary polynomials with PCLMULQDQ is
+  // described in the Intel paper "Fast CRC Computation for Generic
+  // Polynomials Using PCLMULQDQ Instruction"
+  // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+  // We are applying it to the CRC32C polynomial.
+  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
+      const uint8_t* p, V128* partialCRC) const {
+    V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2));
+
+    V128 partialCRC1 = partialCRC[0];
+    V128 partialCRC2 = partialCRC[1];
+    V128 partialCRC3 = partialCRC[2];
+    V128 partialCRC4 = partialCRC[3];
+
+    V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
+    V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
+    V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
+    V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
+    V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
+    V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
+    V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
+    V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
+    partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
+    partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
+    partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
+    partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
+    partialCRC1 = V128_Xor(tmp1, partialCRC1);
+    partialCRC2 = V128_Xor(tmp2, partialCRC2);
+    partialCRC3 = V128_Xor(tmp3, partialCRC3);
+    partialCRC4 = V128_Xor(tmp4, partialCRC4);
+    partialCRC1 = V128_Xor(partialCRC1, data1);
+    partialCRC2 = V128_Xor(partialCRC2, data2);
+    partialCRC3 = V128_Xor(partialCRC3, data3);
+    partialCRC4 = V128_Xor(partialCRC4, data4);
+    partialCRC[0] = partialCRC1;
+    partialCRC[1] = partialCRC2;
+    partialCRC[2] = partialCRC3;
+    partialCRC[3] = partialCRC4;
+  }
+
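Process64BytesPclmul folds four independent 128-bit lanes across each 64-byte block. One lane of that fold, sketched directly against the x86 intrinsics the V128_* wrappers are assumed to map to (PMulLow/PMulHi as carry-less multiplies of the low/high 64-bit halves; requires a PCLMUL-capable x86-64 target, and on ARM the wrappers map to NEON/PMULL instead):

#include <immintrin.h>
#include <cstdint>

// One 16-byte lane of the 64-byte fold: the 128-bit state is split into two
// 64-bit halves, each is carry-less-multiplied by its folding constant
// (k1 in the low half, k2 in the high half), and the next input block is
// xored in.
__m128i FoldOneLane(__m128i state, const uint8_t* block, __m128i k1k2) {
  __m128i lo = _mm_clmulepi64_si128(state, k1k2, 0x00);  // state_lo * k1
  __m128i hi = _mm_clmulepi64_si128(state, k1k2, 0x11);  // state_hi * k2
  __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(block));
  return _mm_xor_si128(_mm_xor_si128(lo, hi), data);
}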
+  // Reduce partialCRC produced by Process64BytesPclmul into a single value
+  // that represents the crc checksum of all the processed bytes.
+  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
+  FinalizePclmulStream(V128* partialCRC) const {
+    V128 partialCRC1 = partialCRC[0];
+    V128 partialCRC2 = partialCRC[1];
+    V128 partialCRC3 = partialCRC[2];
+    V128 partialCRC4 = partialCRC[3];
+
+    // Combine 4 vectors of partial crc into a single vector.
+    V128 reductionMultiplicands =
+        V128_Load(reinterpret_cast<const V128*>(k5k6));
+
+    V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
+    V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);
+
+    partialCRC1 = V128_Xor(low, high);
+    partialCRC1 = V128_Xor(partialCRC1, partialCRC2);
+
+    low = V128_PMulLow(reductionMultiplicands, partialCRC3);
+    high = V128_PMulHi(reductionMultiplicands, partialCRC3);
+
+    partialCRC3 = V128_Xor(low, high);
+    partialCRC3 = V128_Xor(partialCRC3, partialCRC4);
+
+    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4));
+
+    low = V128_PMulLow(reductionMultiplicands, partialCRC1);
+    high = V128_PMulHi(reductionMultiplicands, partialCRC1);
+    V128 fullCRC = V128_Xor(low, high);
+    fullCRC = V128_Xor(fullCRC, partialCRC3);
+
+    // Reduce fullCRC into scalar value.
+    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6));
+
+    V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask));
+
+    V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC);
+    fullCRC = V128_ShiftRight<8>(fullCRC);
+    fullCRC = V128_Xor(fullCRC, tmp);
+
+    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0));
+
+    tmp = V128_ShiftRight<4>(fullCRC);
+    fullCRC = V128_And(fullCRC, mask);
+    fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC);
+    fullCRC = V128_Xor(tmp, fullCRC);
+
+    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly));
+
+    tmp = V128_And(fullCRC, mask);
+    tmp = V128_PMul01(reductionMultiplicands, tmp);
+    tmp = V128_And(tmp, mask);
+    tmp = V128_PMulLow(reductionMultiplicands, tmp);
+
+    fullCRC = V128_Xor(tmp, fullCRC);
+
+    return static_cast<uint64_t>(V128_Extract32<1>(fullCRC));
+  }
+
+  // Update crc with 64 bytes of data from p.
+  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
+                                                          uint64_t crc) const {
+    for (int i = 0; i < 8; i++) {
+      crc =
+          CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p));
+      p += 8;
+    }
+    return crc;
+  }
+
+  // Generated by crc32c_x86_test --crc32c_generate_constants=true
+  // and verified against constants in linux kernel for S390:
+  // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S
+  alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8};
+  alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e};
+  alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6};
+  alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000};
+  alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1};
+  alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u};
+
+  // Medium runs of bytes are broken into groups of kGroupsSmall blocks of the
+  // same size. Each group is CRCed in parallel then combined at the end of
+  // the block.
+  static constexpr size_t kGroupsSmall = 3;
+  // For large runs we use up to kMaxStreams blocks computed with the CRC
+  // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
+  // are combined at the end.
+  static constexpr size_t kMaxStreams = 3;
+};
+
+#ifdef ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL
+alignas(16) constexpr uint64_t
+    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k1k2[2];
+alignas(16) constexpr uint64_t
+    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k3k4[2];
+alignas(16) constexpr uint64_t
+    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k5k6[2];
+alignas(16) constexpr uint64_t
+    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k7k0[2];
+alignas(16) constexpr uint64_t
+    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kPoly[2];
+alignas(16) constexpr uint32_t
+    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMask[4];
+constexpr size_t
+    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kGroupsSmall;
+constexpr size_t CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMaxStreams;
+#endif  // ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL
+
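The k1k2 through kPoly tables are generated by the quoted test flag rather than derived by hand; conceptually each entry is x^n mod P(x) over GF(2), where n is the bit distance a fold has to bridge. A sketch of that computation for the non-reflected CRC32C polynomial 0x11EDC6F41 (the reflected table entries additionally involve a bit-reversal, omitted here, and the exact n per entry depends on the fold layout):

#include <cstdint>

// Returns x^n mod P(x) over GF(2), for a degree-32 polynomial P passed with
// its x^32 term included (CRC32C: 0x11EDC6F41). Bit i of the result is the
// coefficient of x^i. O(n), which is fine for the small n used here.
uint64_t XPowNModP(uint64_t n, uint64_t poly) {
  uint64_t r = 1;  // the polynomial "1"
  for (uint64_t i = 0; i < n; ++i) {
    r <<= 1;                                  // multiply by x
    if (r & (uint64_t{1} << 32)) r ^= poly;   // reduce once degree hits 32
  }
  return r;
}
// Example: XPowNModP(512, 0x11EDC6F41) bridges one 64-byte (512-bit) block.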
+template <size_t num_crc_streams, size_t num_pclmul_streams,
+          CutoffStrategy strategy>
+class CRC32AcceleratedX86ARMCombinedMultipleStreams
+    : public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase {
   ABSL_ATTRIBUTE_HOT
   void Extend(uint32_t* crc, const void* bytes, size_t length) const override {
     static_assert(num_crc_streams >= 1 && num_crc_streams <= kMaxStreams,
@@ -221,19 +382,19 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
     // We have dedicated instruction for 1,2,4 and 8 bytes.
     if (length & 8) {
       ABSL_INTERNAL_STEP8(l, p);
-      length &= ~8LL;
+      length &= ~size_t{8};
     }
     if (length & 4) {
       ABSL_INTERNAL_STEP4(l);
-      length &= ~4LL;
+      length &= ~size_t{4};
     }
     if (length & 2) {
       ABSL_INTERNAL_STEP2(l);
-      length &= ~2LL;
+      length &= ~size_t{2};
     }
     if (length & 1) {
       ABSL_INTERNAL_STEP1(l);
-      length &= ~1LL;
+      length &= ~size_t{1};
     }
     if (length == 0) {
       *crc = l;
@@ -263,12 +424,12 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
     if (strategy == CutoffStrategy::Fold3) {
       uint64_t l641 = 0;
       uint64_t l642 = 0;
-      const int blockSize = 32;
-      int64_t bs = (e - p) / kGroupsSmall / blockSize;
+      const size_t blockSize = 32;
+      size_t bs = static_cast<size_t>(e - p) / kGroupsSmall / blockSize;
       const uint8_t* p1 = p + bs * blockSize;
       const uint8_t* p2 = p1 + bs * blockSize;
 
-      for (int64_t i = 0; i < bs - 1; ++i) {
+      for (size_t i = 0; i + 1 < bs; ++i) {
         ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
         ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
         ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
@@ -290,8 +451,9 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
 
       V128 res2 = V128_PMul10(tmp, magic);
       V128 x = V128_Xor(res1, res2);
-      l64 = V128_Low64(x) ^ absl::little_endian::Load64(p2);
-      l64 = CRC32_u64(l642, l64);
+      l64 = static_cast<uint64_t>(V128_Low64(x)) ^
+            absl::little_endian::Load64(p2);
+      l64 = CRC32_u64(static_cast<uint32_t>(l642), l64);
 
       p = p2 + 8;
     } else if (strategy == CutoffStrategy::Unroll64CRC) {
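The Fold3 path works because the three updates issued by each ABSL_INTERNAL_STEP8BY3 are data-independent, letting the CRC32 instruction's multi-cycle latency overlap across streams. The same loop shape in portable scalar form, with a hypothetical SoftCrc32cU64 standing in for CRC32_u64 (the multiply()/magic merge that follows in the real code is omitted):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Bitwise CRC32C over one 64-bit word, a stand-in for CRC32_u64.
uint32_t SoftCrc32cU64(uint32_t crc, uint64_t data) {
  for (int i = 0; i < 64; ++i) {
    uint32_t bit = (crc ^ static_cast<uint32_t>(data)) & 1u;
    crc = (crc >> 1) ^ (bit ? 0x82F63B78u : 0u);
    data >>= 1;
  }
  return crc;
}

// Three partial CRCs over three consecutive n-byte slices of p (n a multiple
// of 8). The three updates per iteration carry no data dependence on one
// another, which is what lets the hardware overlap their latencies; merging
// the partials still needs the multiply()/magic step from the patch.
void ThreeStreamPartials(const uint8_t* p, size_t n, uint32_t out[3]) {
  uint32_t c0 = 0, c1 = 0, c2 = 0;
  for (size_t i = 0; i < n; i += 8) {
    uint64_t v0, v1, v2;
    std::memcpy(&v0, p + i, 8);
    std::memcpy(&v1, p + n + i, 8);
    std::memcpy(&v2, p + 2 * n + i, 8);
    c0 = SoftCrc32cU64(c0, v0);  // independent of the next two updates
    c1 = SoftCrc32cU64(c1, v1);
    c2 = SoftCrc32cU64(c2, v2);
  }
  out[0] = c0;
  out[1] = c1;
  out[2] = c2;
}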
@@ -314,16 +476,17 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
         ABSL_INTERNAL_STEP1(l);
       }
 
-      int64_t bs = (e - p) / (num_crc_streams + num_pclmul_streams) / 64;
+      size_t bs = static_cast<size_t>(e - p) /
+                  (num_crc_streams + num_pclmul_streams) / 64;
       const uint8_t* crc_streams[kMaxStreams];
       const uint8_t* pclmul_streams[kMaxStreams];
       // We are guaranteed to have at least one crc stream.
       crc_streams[0] = p;
-      for (int i = 1; i < num_crc_streams; i++) {
+      for (size_t i = 1; i < num_crc_streams; i++) {
        crc_streams[i] = crc_streams[i - 1] + bs * 64;
       }
       pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64;
-      for (int i = 1; i < num_pclmul_streams; i++) {
+      for (size_t i = 1; i < num_pclmul_streams; i++) {
         pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64;
       }
 
@@ -332,13 +495,13 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
       uint64_t l64_pclmul[kMaxStreams] = {0};
 
       // Peel first iteration, because the PCLMULQDQ stream needs setup.
-      for (int i = 0; i < num_crc_streams; i++) {
+      for (size_t i = 0; i < num_crc_streams; i++) {
         l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
         crc_streams[i] += 16 * 4;
       }
 
       V128 partialCRC[kMaxStreams][4];
-      for (int i = 0; i < num_pclmul_streams; i++) {
+      for (size_t i = 0; i < num_pclmul_streams; i++) {
         partialCRC[i][0] = V128_LoadU(
             reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 0));
         partialCRC[i][1] = V128_LoadU(
@@ -350,13 +513,13 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
         pclmul_streams[i] += 16 * 4;
       }
 
-      for (int64_t i = 1; i < bs; i++) {
+      for (size_t i = 1; i < bs; i++) {
         // Prefetch data for next iterations.
-        for (int j = 0; j < num_crc_streams; j++) {
+        for (size_t j = 0; j < num_crc_streams; j++) {
           base_internal::PrefetchT0(
               reinterpret_cast<const char*>(crc_streams[j] + kPrefetchHorizon));
         }
-        for (int j = 0; j < num_pclmul_streams; j++) {
+        for (size_t j = 0; j < num_pclmul_streams; j++) {
           base_internal::PrefetchT0(reinterpret_cast<const char*>(
               pclmul_streams[j] + kPrefetchHorizon));
         }
@@ -398,19 +561,19 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
 
       // PCLMULQDQ based streams require special final step;
       // CRC based don't.
-      for (int i = 0; i < num_pclmul_streams; i++) {
+      for (size_t i = 0; i < num_pclmul_streams; i++) {
         l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]);
       }
 
       // Combine all streams into single result.
       uint32_t magic = ComputeZeroConstant(bs * 64);
       l64 = l64_crc[0];
-      for (int i = 1; i < num_crc_streams; i++) {
-        l64 = multiply(l64, magic);
+      for (size_t i = 1; i < num_crc_streams; i++) {
+        l64 = multiply(static_cast<uint32_t>(l64), magic);
         l64 ^= l64_crc[i];
       }
-      for (int i = 0; i < num_pclmul_streams; i++) {
-        l64 = multiply(l64, magic);
+      for (size_t i = 0; i < num_pclmul_streams; i++) {
+        l64 = multiply(static_cast<uint32_t>(l64), magic);
         l64 ^= l64_pclmul[i];
       }
 
@@ -421,7 +584,7 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
         p = crc_streams[num_crc_streams - 1];
       }
     }
-    l = l64;
+    l = static_cast<uint32_t>(l64);
 
     while ((e - p) >= 16) {
       ABSL_INTERNAL_STEP8(l, p);
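The PrefetchT0 calls in the main loop above pull each stream's data kPrefetchHorizon bytes ahead of the current position, so the loads for the next iterations are already in cache. A portable analogue of the pattern using the GCC/Clang builtin; 256 below is an illustrative stand-in, not Abseil's tuned horizon:

#include <cstddef>
#include <cstdint>

// Consumes 64-byte blocks while prefetching a fixed distance ahead,
// mirroring the shape of the loop above.
uint64_t SumWithPrefetch(const uint8_t* p, size_t len) {
  constexpr size_t kHorizon = 256;  // illustrative distance, in bytes
  uint64_t sum = 0;
  for (size_t i = 0; i + 64 <= len; i += 64) {
    if (i + kHorizon < len)
      __builtin_prefetch(p + i + kHorizon, /*rw=*/0, /*locality=*/3);
    for (size_t j = 0; j < 64; ++j) sum += p[i + j];  // the real work
  }
  return sum;
}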
@@ -441,139 +604,6 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
 
     *crc = l;
   }
-
- private:
-  // Update partialCRC with the crc of a 64 byte block. Calling
-  // FinalizePclmulStream would produce a single crc checksum, but it is
-  // expensive. PCLMULQDQ has a high latency, so we run 4 128-bit partial
-  // checksums that can be reduced to a single value by FinalizePclmulStream
-  // later. Computing crc for arbitrary polynomials with PCLMULQDQ is
-  // described in the Intel paper "Fast CRC Computation for Generic
-  // Polynomials Using PCLMULQDQ Instruction"
-  // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-  // We are applying it to the CRC32C polynomial.
-  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
-      const uint8_t* p, V128* partialCRC) const {
-    V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2));
-
-    V128 partialCRC1 = partialCRC[0];
-    V128 partialCRC2 = partialCRC[1];
-    V128 partialCRC3 = partialCRC[2];
-    V128 partialCRC4 = partialCRC[3];
-
-    V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
-    V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
-    V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
-    V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
-    V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
-    V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
-    V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
-    V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
-    partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
-    partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
-    partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
-    partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
-    partialCRC1 = V128_Xor(tmp1, partialCRC1);
-    partialCRC2 = V128_Xor(tmp2, partialCRC2);
-    partialCRC3 = V128_Xor(tmp3, partialCRC3);
-    partialCRC4 = V128_Xor(tmp4, partialCRC4);
-    partialCRC1 = V128_Xor(partialCRC1, data1);
-    partialCRC2 = V128_Xor(partialCRC2, data2);
-    partialCRC3 = V128_Xor(partialCRC3, data3);
-    partialCRC4 = V128_Xor(partialCRC4, data4);
-    partialCRC[0] = partialCRC1;
-    partialCRC[1] = partialCRC2;
-    partialCRC[2] = partialCRC3;
-    partialCRC[3] = partialCRC4;
-  }
-
-  // Reduce partialCRC produced by Process64BytesPclmul into a single value
-  // that represents the crc checksum of all the processed bytes.
-  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
-  FinalizePclmulStream(V128* partialCRC) const {
-    V128 partialCRC1 = partialCRC[0];
-    V128 partialCRC2 = partialCRC[1];
-    V128 partialCRC3 = partialCRC[2];
-    V128 partialCRC4 = partialCRC[3];
-
-    // Combine 4 vectors of partial crc into a single vector.
-    V128 reductionMultiplicands =
-        V128_Load(reinterpret_cast<const V128*>(k5k6));
-
-    V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
-    V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);
-
-    partialCRC1 = V128_Xor(low, high);
-    partialCRC1 = V128_Xor(partialCRC1, partialCRC2);
-
-    low = V128_PMulLow(reductionMultiplicands, partialCRC3);
-    high = V128_PMulHi(reductionMultiplicands, partialCRC3);
-
-    partialCRC3 = V128_Xor(low, high);
-    partialCRC3 = V128_Xor(partialCRC3, partialCRC4);
-
-    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4));
-
-    low = V128_PMulLow(reductionMultiplicands, partialCRC1);
-    high = V128_PMulHi(reductionMultiplicands, partialCRC1);
-    V128 fullCRC = V128_Xor(low, high);
-    fullCRC = V128_Xor(fullCRC, partialCRC3);
-
-    // Reduce fullCRC into scalar value.
-    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6));
-
-    V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask));
-
-    V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC);
-    fullCRC = V128_ShiftRight<8>(fullCRC);
-    fullCRC = V128_Xor(fullCRC, tmp);
-
-    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0));
-
-    tmp = V128_ShiftRight<4>(fullCRC);
-    fullCRC = V128_And(fullCRC, mask);
-    fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC);
-    fullCRC = V128_Xor(tmp, fullCRC);
-
-    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly));
-
-    tmp = V128_And(fullCRC, mask);
-    tmp = V128_PMul01(reductionMultiplicands, tmp);
-    tmp = V128_And(tmp, mask);
-    tmp = V128_PMulLow(reductionMultiplicands, tmp);
-
-    fullCRC = V128_Xor(tmp, fullCRC);
-
-    return V128_Extract32<1>(fullCRC);
-  }
-
-  // Update crc with 64 bytes of data from p.
-  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
-                                                          uint64_t crc) const {
-    for (int i = 0; i < 8; i++) {
-      crc = CRC32_u64(crc, absl::little_endian::Load64(p));
-      p += 8;
-    }
-    return crc;
-  }
-
-  // Generated by crc32c_x86_test --crc32c_generate_constants=true
-  // and verified against constants in linux kernel for S390:
-  // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S
-  alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8};
-  alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e};
-  alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6};
-  alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000};
-  alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1};
-  alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u};
-
-  // Medium runs of bytes are broken into groups of kGroupsSmall blocks of the
-  // same size. Each group is CRCed in parallel then combined at the end of
-  // the block.
-  static constexpr int kGroupsSmall = 3;
-  // For large runs we use up to kMaxStreams blocks computed with the CRC
-  // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
-  // are combined at the end.
-  static constexpr int kMaxStreams = 3;
 };
 
 }  // namespace
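The whole patch leans on one identity for merging per-stream results: for the zero-seeded, un-xored CRC the internal code works with, crc(A || B) = multiply(crc(A), x^(8|B|) mod P) ^ crc(B), where the x^(8|B|) constant is what ComputeZeroConstant supplies. A self-contained scalar check of that algebra, using bitwise stand-ins for the hardware paths (0x80000000 encodes the polynomial 1 in this bit order):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Bitwise CRC32C (zero seed, no final xor), stand-in for the hardware path.
static uint32_t Crc32c(uint32_t crc, const void* data, size_t n) {
  const uint8_t* p = static_cast<const uint8_t*>(data);
  for (size_t i = 0; i < n; ++i) {
    crc ^= p[i];
    for (int b = 0; b < 8; ++b)
      crc = (crc & 1) ? (crc >> 1) ^ 0x82F63B78u : crc >> 1;
  }
  return crc;
}

// Bitwise CRC32C over one 32-bit word, mirroring CRC32_u32 semantics.
static uint32_t Crc32cWord(uint32_t crc, uint32_t word) {
  crc ^= word;
  for (int b = 0; b < 32; ++b)
    crc = (crc & 1) ? (crc >> 1) ^ 0x82F63B78u : crc >> 1;
  return crc;
}

// Carry-less 32x32->64 product, stand-in for PCLMULQDQ.
static uint64_t Clmul32(uint32_t a, uint32_t b) {
  uint64_t r = 0;
  for (int i = 0; i < 32; ++i)
    if ((b >> i) & 1) r ^= static_cast<uint64_t>(a) << i;
  return r;
}

// Reflected-domain multiplication mod P, same shape as multiply() above.
static uint32_t MulModP(uint32_t a, uint32_t b) {
  uint64_t prod = Clmul32(a, b) << 1;
  return static_cast<uint32_t>(prod >> 32) ^
         Crc32cWord(0, static_cast<uint32_t>(prod));
}

int main() {
  const char a[] = "first stream of data";
  const char b[] = "second stream of data";
  uint32_t crc_a = Crc32c(0, a, sizeof(a));
  uint32_t crc_b = Crc32c(0, b, sizeof(b));

  // Reference: crc over the actual concatenation.
  char ab[sizeof(a) + sizeof(b)];
  std::memcpy(ab, a, sizeof(a));
  std::memcpy(ab + sizeof(a), b, sizeof(b));
  uint32_t expected = Crc32c(0, ab, sizeof(ab));

  // Zero-extension form: run crc_a through |b| zero bytes, then xor crc_b.
  const char zeros[sizeof(b)] = {};
  assert((Crc32c(crc_a, zeros, sizeof(b)) ^ crc_b) == expected);

  // multiply()/magic form: extending the unit state 0x80000000 by |b| zero
  // bytes yields x^(8*|b|) mod P -- the "magic" combination constant.
  uint32_t magic = Crc32c(0x80000000u, zeros, sizeof(b));
  assert((MulModP(crc_a, magic) ^ crc_b) == expected);
  return 0;
}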