Diffstat (limited to 'absl/crc/internal')
-rw-r--r--  absl/crc/internal/crc32_x86_arm_combined_simd.h |   4
-rw-r--r--  absl/crc/internal/crc32c_inline.h               |   2
-rw-r--r--  absl/crc/internal/crc_memcpy_x86_64.cc          | 109
-rw-r--r--  absl/crc/internal/crc_x86_arm_combined.cc       | 392
-rw-r--r--  absl/crc/internal/non_temporal_memcpy.h         |   8
5 files changed, 262 insertions, 253 deletions
diff --git a/absl/crc/internal/crc32_x86_arm_combined_simd.h b/absl/crc/internal/crc32_x86_arm_combined_simd.h index 8f15ded3..f6c2a21c 100644 --- a/absl/crc/internal/crc32_x86_arm_combined_simd.h +++ b/absl/crc/internal/crc32_x86_arm_combined_simd.h @@ -129,7 +129,7 @@ inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) { } inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) { - return _mm_crc32_u64(crc, v); + return static_cast<uint32_t>(_mm_crc32_u64(crc, v)); } inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); } @@ -157,7 +157,7 @@ inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); } inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); } inline V128 V128_From2x64(const uint64_t l, const uint64_t r) { - return _mm_set_epi64x(l, r); + return _mm_set_epi64x(static_cast<int64_t>(l), static_cast<int64_t>(r)); } template <int imm> diff --git a/absl/crc/internal/crc32c_inline.h b/absl/crc/internal/crc32c_inline.h index 43ad14f4..6236c10b 100644 --- a/absl/crc/internal/crc32c_inline.h +++ b/absl/crc/internal/crc32c_inline.h @@ -34,7 +34,7 @@ inline bool ExtendCrc32cInline(uint32_t* crc, const char* p, size_t n) { constexpr uint32_t kCrc32Xor = 0xffffffffU; *crc ^= kCrc32Xor; if (n & 1) { - *crc = CRC32_u8(*crc, *p); + *crc = CRC32_u8(*crc, static_cast<uint8_t>(*p)); n--; p++; } diff --git a/absl/crc/internal/crc_memcpy_x86_64.cc b/absl/crc/internal/crc_memcpy_x86_64.cc index 1c3299ee..a148fe17 100644 --- a/absl/crc/internal/crc_memcpy_x86_64.cc +++ b/absl/crc/internal/crc_memcpy_x86_64.cc @@ -82,21 +82,18 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length, return crc32c_t{crc_uint32}; } -constexpr int kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t); +constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t); // Common function for copying the tails of multiple large regions. 
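// A minimal sketch (separate from the patch itself) of the explicit-conversion
// pattern the first two files adopt: _mm_set_epi64x takes signed 64-bit
// arguments, _mm_crc32_u64 returns a 64-bit value whose upper half is zero,
// and plain char may be signed, so each call site now spells out the intended
// conversion instead of relying on implicit ones. Names are illustrative.
#include <cstdint>
#include <emmintrin.h>   // _mm_set_epi64x
#include <nmmintrin.h>   // SSE4.2 CRC32C intrinsics

inline __m128i From2x64(uint64_t hi, uint64_t lo) {
  return _mm_set_epi64x(static_cast<int64_t>(hi), static_cast<int64_t>(lo));
}

inline uint32_t CrcU64(uint32_t crc, uint64_t v) {
  // The intrinsic's upper 32 result bits are always zero, so the narrowing
  // cast loses no information.
  return static_cast<uint32_t>(_mm_crc32_u64(crc, v));
}

inline uint32_t CrcByte(uint32_t crc, char c) {
  // Convert through uint8_t so a signed char is not sign-extended.
  return _mm_crc32_u8(crc, static_cast<uint8_t>(c));
}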
-template <int vec_regions, int int_regions> +template <size_t vec_regions, size_t int_regions> inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, size_t region_size, size_t copy_rounds) { std::array<__m128i, vec_regions> data; std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data; while (copy_rounds > 0) { -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int i = 0; i < vec_regions; i++) { - int region = i; + for (size_t i = 0; i < vec_regions; i++) { + size_t region = i; auto* vsrc = reinterpret_cast<const __m128i*>(*src + region_size * region); @@ -109,27 +106,23 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, _mm_store_si128(vdst, data[i]); // Compute the running CRC - crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( - static_cast<uint32_t>(crcs[region]), _mm_extract_epi64(data[i], 0)))}; - crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( - static_cast<uint32_t>(crcs[region]), _mm_extract_epi64(data[i], 1)))}; + crcs[region] = crc32c_t{static_cast<uint32_t>( + _mm_crc32_u64(static_cast<uint32_t>(crcs[region]), + static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))}; + crcs[region] = crc32c_t{static_cast<uint32_t>( + _mm_crc32_u64(static_cast<uint32_t>(crcs[region]), + static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))}; } -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int i = 0; i < int_regions; i++) { - int region = vec_regions + i; + for (size_t i = 0; i < int_regions; i++) { + size_t region = vec_regions + i; auto* usrc = reinterpret_cast<const uint64_t*>(*src + region_size * region); auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region); -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int j = 0; j < kIntLoadsPerVec; j++) { - int data_index = i * kIntLoadsPerVec + j; + for (size_t j = 0; j < kIntLoadsPerVec; j++) { + size_t data_index = i * kIntLoadsPerVec + j; int_data[data_index] = *(usrc + j); crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( @@ -148,7 +141,7 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, } // namespace -template <int vec_regions, int int_regions> +template <size_t vec_regions, size_t int_regions> class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine { public: AcceleratedCrcMemcpyEngine() = default; @@ -160,12 +153,12 @@ class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine { std::size_t length, crc32c_t initial_crc) const override; }; -template <int vec_regions, int int_regions> +template <size_t vec_regions, size_t int_regions> crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( void* __restrict dst, const void* __restrict src, std::size_t length, crc32c_t initial_crc) const { constexpr std::size_t kRegions = vec_regions + int_regions; - constexpr crc32c_t kCrcDataXor = crc32c_t{0xffffffff}; + constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff}; constexpr std::size_t kBlockSize = sizeof(__m128i); constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize; @@ -201,7 +194,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( // Start work on the CRC: undo the XOR from the previous calculation or set up // the initial value of the CRC. // initial_crc ^= kCrcDataXor; - initial_crc = initial_crc ^ kCrcDataXor; + initial_crc = crc32c_t{static_cast<uint32_t>(initial_crc) ^ kCrcDataXor}; // Do an initial alignment copy, so we can use aligned store instructions to // the destination pointer. 
We align the destination pointer because the @@ -229,13 +222,13 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( // Initialize CRCs for kRegions regions. crc32c_t crcs[kRegions]; crcs[0] = initial_crc; - for (int i = 1; i < kRegions; i++) { - crcs[i] = kCrcDataXor; + for (size_t i = 1; i < kRegions; i++) { + crcs[i] = crc32c_t{kCrcDataXor}; } // Find the number of rounds to copy and the region size. Also compute the // tail size here. - int64_t copy_rounds = length / kCopyRoundSize; + size_t copy_rounds = length / kCopyRoundSize; // Find the size of each region and the size of the tail. const std::size_t region_size = copy_rounds * kBlockSize; @@ -248,10 +241,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( // Main loop. while (copy_rounds > kBlocksPerCacheLine) { // Prefetch kPrefetchAhead bytes ahead of each pointer. -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int i = 0; i < kRegions; i++) { + for (size_t i = 0; i < kRegions; i++) { absl::base_internal::PrefetchT0(src_bytes + kPrefetchAhead + region_size * i); absl::base_internal::PrefetchT0(dst_bytes + kPrefetchAhead + @@ -259,58 +249,46 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( } // Load and store data, computing CRC on the way. -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int i = 0; i < kBlocksPerCacheLine; i++) { + for (size_t i = 0; i < kBlocksPerCacheLine; i++) { // Copy and CRC the data for the CRC regions. -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int j = 0; j < vec_regions; j++) { + for (size_t j = 0; j < vec_regions; j++) { // Cycle which regions get vector load/store and integer load/store, to // engage prefetching logic around vector load/stores and save issue // slots by using the integer registers. - int region = (j + i) % kRegions; + size_t region = (j + i) % kRegions; - auto* src = + auto* vsrc = reinterpret_cast<const __m128i*>(src_bytes + region_size * region); - auto* dst = + auto* vdst = reinterpret_cast<__m128i*>(dst_bytes + region_size * region); // Load and CRC data. - vec_data[j] = _mm_loadu_si128(src + i); - crcs[region] = crc32c_t{static_cast<uint32_t>( - _mm_crc32_u64(static_cast<uint32_t>(crcs[region]), - _mm_extract_epi64(vec_data[j], 0)))}; - crcs[region] = crc32c_t{static_cast<uint32_t>( - _mm_crc32_u64(static_cast<uint32_t>(crcs[region]), - _mm_extract_epi64(vec_data[j], 1)))}; + vec_data[j] = _mm_loadu_si128(vsrc + i); + crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( + static_cast<uint32_t>(crcs[region]), + static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))}; + crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( + static_cast<uint32_t>(crcs[region]), + static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))}; // Store the data. - _mm_store_si128(dst + i, vec_data[j]); + _mm_store_si128(vdst + i, vec_data[j]); } // Preload the partial CRCs for the CLMUL subregions. -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int j = 0; j < int_regions; j++) { + for (size_t j = 0; j < int_regions; j++) { // Cycle which regions get vector load/store and integer load/store, to // engage prefetching logic around vector load/stores and save issue // slots by using the integer registers. 
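// A short sketch of the per-block CRC step the copy loop above performs on
// each vector region: a 16-byte register is folded into the running CRC32C by
// extracting its two 64-bit lanes. _mm_extract_epi64 returns a signed 64-bit
// value, which is why the patch adds the casts to uint64_t. Requires
// SSE4.1/SSE4.2; the function name is illustrative.
#include <cstdint>
#include <nmmintrin.h>
#include <smmintrin.h>

inline uint32_t CrcOf128(uint32_t crc, __m128i block) {
  crc = static_cast<uint32_t>(_mm_crc32_u64(
      crc, static_cast<uint64_t>(_mm_extract_epi64(block, 0))));
  crc = static_cast<uint32_t>(_mm_crc32_u64(
      crc, static_cast<uint64_t>(_mm_extract_epi64(block, 1))));
  return crc;
}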
- int region = (j + vec_regions + i) % kRegions; + size_t region = (j + vec_regions + i) % kRegions; auto* usrc = reinterpret_cast<const uint64_t*>(src_bytes + region_size * region); auto* udst = reinterpret_cast<uint64_t*>(dst_bytes + region_size * region); -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int k = 0; k < kIntLoadsPerVec; k++) { - int data_index = j * kIntLoadsPerVec + k; + for (size_t k = 0; k < kIntLoadsPerVec; k++) { + size_t data_index = j * kIntLoadsPerVec + k; // Load and CRC the data. int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k); @@ -339,13 +317,13 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( // Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the // XOR done before doing block copy + CRCs. - for (int i = 0; i < kRegions - 1; i++) { - crcs[i] = crcs[i] ^ kCrcDataXor; + for (size_t i = 0; i + 1 < kRegions; i++) { + crcs[i] = crc32c_t{static_cast<uint32_t>(crcs[i]) ^ kCrcDataXor}; } // Build a CRC of the first kRegions - 1 regions. crc32c_t full_crc = crcs[0]; - for (int i = 1; i < kRegions - 1; i++) { + for (size_t i = 1; i + 1 < kRegions; i++) { full_crc = ConcatCrc32c(full_crc, crcs[i], region_size); } @@ -360,7 +338,8 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( crcs[kRegions - 1]); // Finalize and concatenate the final CRC, then return. - crcs[kRegions - 1] = crcs[kRegions - 1] ^ kCrcDataXor; + crcs[kRegions - 1] = + crc32c_t{static_cast<uint32_t>(crcs[kRegions - 1]) ^ kCrcDataXor}; return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size); } diff --git a/absl/crc/internal/crc_x86_arm_combined.cc b/absl/crc/internal/crc_x86_arm_combined.cc index d771a786..2112f609 100644 --- a/absl/crc/internal/crc_x86_arm_combined.cc +++ b/absl/crc/internal/crc_x86_arm_combined.cc @@ -14,12 +14,11 @@ // Hardware accelerated CRC32 computation on Intel and ARM architecture. -#include <stddef.h> - +#include <cstddef> #include <cstdint> #include "absl/base/attributes.h" -#include "absl/base/call_once.h" +#include "absl/base/config.h" #include "absl/base/dynamic_annotations.h" #include "absl/base/internal/endian.h" #include "absl/base/internal/prefetch.h" @@ -67,27 +66,30 @@ class CRC32AcceleratedX86ARMCombined : public CRC32 { // Constants for switching between algorithms. // Chosen by comparing speed at different powers of 2. 
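// A small illustration of the loop-bound rewrite above: once the index is
// size_t, `i < n - 1` is dangerous because `n - 1` wraps to SIZE_MAX when
// n == 0, whereas `i + 1 < n` is well-defined for every n. (kRegions itself
// is always at least 1 here, so the two forms behave the same in this file;
// the rewrite keeps the unsigned comparison idiomatic.)
#include <cstddef>

size_t VisitAllButLast(size_t n) {
  size_t visited = 0;
  for (size_t i = 0; i + 1 < n; ++i) {  // safe even when n == 0
    ++visited;
  }
  return visited;
}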
-constexpr int kSmallCutoff = 256; -constexpr int kMediumCutoff = 2048; +constexpr size_t kSmallCutoff = 256; +constexpr size_t kMediumCutoff = 2048; -#define ABSL_INTERNAL_STEP1(crc) \ - do { \ - crc = CRC32_u8(crc, *p++); \ +#define ABSL_INTERNAL_STEP1(crc) \ + do { \ + crc = CRC32_u8(static_cast<uint32_t>(crc), *p++); \ } while (0) -#define ABSL_INTERNAL_STEP2(crc) \ - do { \ - crc = CRC32_u16(crc, absl::little_endian::Load16(p)); \ - p += 2; \ +#define ABSL_INTERNAL_STEP2(crc) \ + do { \ + crc = \ + CRC32_u16(static_cast<uint32_t>(crc), absl::little_endian::Load16(p)); \ + p += 2; \ } while (0) -#define ABSL_INTERNAL_STEP4(crc) \ - do { \ - crc = CRC32_u32(crc, absl::little_endian::Load32(p)); \ - p += 4; \ +#define ABSL_INTERNAL_STEP4(crc) \ + do { \ + crc = \ + CRC32_u32(static_cast<uint32_t>(crc), absl::little_endian::Load32(p)); \ + p += 4; \ } while (0) -#define ABSL_INTERNAL_STEP8(crc, data) \ - do { \ - crc = CRC32_u64(crc, absl::little_endian::Load64(data)); \ - data += 8; \ +#define ABSL_INTERNAL_STEP8(crc, data) \ + do { \ + crc = CRC32_u64(static_cast<uint32_t>(crc), \ + absl::little_endian::Load64(data)); \ + data += 8; \ } while (0) #define ABSL_INTERNAL_STEP8BY2(crc0, crc1, p0, p1) \ do { \ @@ -101,6 +103,8 @@ constexpr int kMediumCutoff = 2048; ABSL_INTERNAL_STEP8(crc2, p2); \ } while (0) +namespace { + uint32_t multiply(uint32_t a, uint32_t b) { V128 shifts = V128_From2x64(0, 1); V128 power = V128_From2x64(0, a); @@ -109,11 +113,10 @@ uint32_t multiply(uint32_t a, uint32_t b) { // Combine crc values res = V128_ShiftLeft64(res, shifts); - return V128_Extract32<1>(res) ^ CRC32_u32(0, V128_Low64(res)); + return static_cast<uint32_t>(V128_Extract32<1>(res)) ^ + CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res))); } -namespace { - // Powers of crc32c polynomial, for faster ExtendByZeros. // Verified against folly: // folly/hash/detail/Crc32CombineDetail.cpp @@ -204,9 +207,167 @@ enum class CutoffStrategy { Unroll64CRC, }; -template <int num_crc_streams, int num_pclmul_streams, CutoffStrategy strategy> -class CRC32AcceleratedX86ARMCombinedMultipleStreams +// Base class for CRC32AcceleratedX86ARMCombinedMultipleStreams containing the +// methods and data that don't need the template arguments. +class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase : public CRC32AcceleratedX86ARMCombined { + protected: + // Update partialCRC with crc of 64 byte block. Calling FinalizePclmulStream + // would produce a single crc checksum, but it is expensive. PCLMULQDQ has a + // high latency, so we run 4 128-bit partial checksums that can be reduced to + // a single value by FinalizePclmulStream later. Computing crc for arbitrary + // polynomialas with PCLMULQDQ is described in Intel paper "Fast CRC + // Computation for Generic Polynomials Using PCLMULQDQ Instruction" + // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf + // We are applying it to CRC32C polynomial. 
+ ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul( + const uint8_t* p, V128* partialCRC) const { + V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2)); + + V128 partialCRC1 = partialCRC[0]; + V128 partialCRC2 = partialCRC[1]; + V128 partialCRC3 = partialCRC[2]; + V128 partialCRC4 = partialCRC[3]; + + V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands); + V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands); + V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands); + V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands); + V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0)); + V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1)); + V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2)); + V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3)); + partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands); + partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands); + partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands); + partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands); + partialCRC1 = V128_Xor(tmp1, partialCRC1); + partialCRC2 = V128_Xor(tmp2, partialCRC2); + partialCRC3 = V128_Xor(tmp3, partialCRC3); + partialCRC4 = V128_Xor(tmp4, partialCRC4); + partialCRC1 = V128_Xor(partialCRC1, data1); + partialCRC2 = V128_Xor(partialCRC2, data2); + partialCRC3 = V128_Xor(partialCRC3, data3); + partialCRC4 = V128_Xor(partialCRC4, data4); + partialCRC[0] = partialCRC1; + partialCRC[1] = partialCRC2; + partialCRC[2] = partialCRC3; + partialCRC[3] = partialCRC4; + } + + // Reduce partialCRC produced by Process64BytesPclmul into a single value, + // that represents crc checksum of all the processed bytes. + ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t + FinalizePclmulStream(V128* partialCRC) const { + V128 partialCRC1 = partialCRC[0]; + V128 partialCRC2 = partialCRC[1]; + V128 partialCRC3 = partialCRC[2]; + V128 partialCRC4 = partialCRC[3]; + + // Combine 4 vectors of partial crc into a single vector. + V128 reductionMultiplicands = + V128_Load(reinterpret_cast<const V128*>(k5k6)); + + V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1); + V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1); + + partialCRC1 = V128_Xor(low, high); + partialCRC1 = V128_Xor(partialCRC1, partialCRC2); + + low = V128_PMulLow(reductionMultiplicands, partialCRC3); + high = V128_PMulHi(reductionMultiplicands, partialCRC3); + + partialCRC3 = V128_Xor(low, high); + partialCRC3 = V128_Xor(partialCRC3, partialCRC4); + + reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4)); + + low = V128_PMulLow(reductionMultiplicands, partialCRC1); + high = V128_PMulHi(reductionMultiplicands, partialCRC1); + V128 fullCRC = V128_Xor(low, high); + fullCRC = V128_Xor(fullCRC, partialCRC3); + + // Reduce fullCRC into scalar value. 
+ reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6)); + + V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask)); + + V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC); + fullCRC = V128_ShiftRight<8>(fullCRC); + fullCRC = V128_Xor(fullCRC, tmp); + + reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0)); + + tmp = V128_ShiftRight<4>(fullCRC); + fullCRC = V128_And(fullCRC, mask); + fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC); + fullCRC = V128_Xor(tmp, fullCRC); + + reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly)); + + tmp = V128_And(fullCRC, mask); + tmp = V128_PMul01(reductionMultiplicands, tmp); + tmp = V128_And(tmp, mask); + tmp = V128_PMulLow(reductionMultiplicands, tmp); + + fullCRC = V128_Xor(tmp, fullCRC); + + return static_cast<uint64_t>(V128_Extract32<1>(fullCRC)); + } + + // Update crc with 64 bytes of data from p. + ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p, + uint64_t crc) const { + for (int i = 0; i < 8; i++) { + crc = + CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p)); + p += 8; + } + return crc; + } + + // Generated by crc32c_x86_test --crc32c_generate_constants=true + // and verified against constants in linux kernel for S390: + // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S + alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8}; + alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e}; + alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6}; + alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000}; + alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1}; + alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u}; + + // Medium runs of bytes are broken into groups of kGroupsSmall blocks of same + // size. Each group is CRCed in parallel then combined at the end of the + // block. + static constexpr size_t kGroupsSmall = 3; + // For large runs we use up to kMaxStreams blocks computed with CRC + // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which + // are combined in the end. 
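// A simplified sketch of the multi-stream idea described above: the buffer is
// CRCed as three independent, equally sized streams so the crc32 instruction's
// latency can be hidden behind independent dependency chains. Combining the
// three partial values (done in this file with carry-less multiplication via
// multiply() and ComputeZeroConstant()) is deliberately left out. Assumes
// little-endian loads, a length that is a multiple of 24, and SSE4.2.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <nmmintrin.h>

void ThreeStreamPartialCrcs(const uint8_t* p, size_t len, uint32_t crc[3]) {
  const size_t block = len / 3;  // bytes handled by each stream
  const uint8_t* p0 = p;
  const uint8_t* p1 = p + block;
  const uint8_t* p2 = p + 2 * block;
  for (size_t i = 0; i < block; i += 8) {
    uint64_t v0, v1, v2;
    std::memcpy(&v0, p0 + i, 8);
    std::memcpy(&v1, p1 + i, 8);
    std::memcpy(&v2, p2 + i, 8);
    // Three independent chains keep the CRC unit busy every cycle.
    crc[0] = static_cast<uint32_t>(_mm_crc32_u64(crc[0], v0));
    crc[1] = static_cast<uint32_t>(_mm_crc32_u64(crc[1], v1));
    crc[2] = static_cast<uint32_t>(_mm_crc32_u64(crc[2], v2));
  }
}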
+ static constexpr size_t kMaxStreams = 3; +}; + +#ifdef ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL +alignas(16) constexpr uint64_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k1k2[2]; +alignas(16) constexpr uint64_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k3k4[2]; +alignas(16) constexpr uint64_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k5k6[2]; +alignas(16) constexpr uint64_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k7k0[2]; +alignas(16) constexpr uint64_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kPoly[2]; +alignas(16) constexpr uint32_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMask[4]; +constexpr size_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kGroupsSmall; +constexpr size_t CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMaxStreams; +#endif // ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL + +template <size_t num_crc_streams, size_t num_pclmul_streams, + CutoffStrategy strategy> +class CRC32AcceleratedX86ARMCombinedMultipleStreams + : public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase { ABSL_ATTRIBUTE_HOT void Extend(uint32_t* crc, const void* bytes, size_t length) const override { static_assert(num_crc_streams >= 1 && num_crc_streams <= kMaxStreams, @@ -221,19 +382,19 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams // We have dedicated instruction for 1,2,4 and 8 bytes. if (length & 8) { ABSL_INTERNAL_STEP8(l, p); - length &= ~8LL; + length &= ~size_t{8}; } if (length & 4) { ABSL_INTERNAL_STEP4(l); - length &= ~4LL; + length &= ~size_t{4}; } if (length & 2) { ABSL_INTERNAL_STEP2(l); - length &= ~2LL; + length &= ~size_t{2}; } if (length & 1) { ABSL_INTERNAL_STEP1(l); - length &= ~1LL; + length &= ~size_t{1}; } if (length == 0) { *crc = l; @@ -263,12 +424,12 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams if (strategy == CutoffStrategy::Fold3) { uint64_t l641 = 0; uint64_t l642 = 0; - const int blockSize = 32; - int64_t bs = (e - p) / kGroupsSmall / blockSize; + const size_t blockSize = 32; + size_t bs = static_cast<size_t>(e - p) / kGroupsSmall / blockSize; const uint8_t* p1 = p + bs * blockSize; const uint8_t* p2 = p1 + bs * blockSize; - for (int64_t i = 0; i < bs - 1; ++i) { + for (size_t i = 0; i + 1 < bs; ++i) { ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2); ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2); ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2); @@ -290,8 +451,9 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams V128 res2 = V128_PMul10(tmp, magic); V128 x = V128_Xor(res1, res2); - l64 = V128_Low64(x) ^ absl::little_endian::Load64(p2); - l64 = CRC32_u64(l642, l64); + l64 = static_cast<uint64_t>(V128_Low64(x)) ^ + absl::little_endian::Load64(p2); + l64 = CRC32_u64(static_cast<uint32_t>(l642), l64); p = p2 + 8; } else if (strategy == CutoffStrategy::Unroll64CRC) { @@ -314,16 +476,17 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams ABSL_INTERNAL_STEP1(l); } - int64_t bs = (e - p) / (num_crc_streams + num_pclmul_streams) / 64; + size_t bs = static_cast<size_t>(e - p) / + (num_crc_streams + num_pclmul_streams) / 64; const uint8_t* crc_streams[kMaxStreams]; const uint8_t* pclmul_streams[kMaxStreams]; // We are guaranteed to have at least one crc stream. 
crc_streams[0] = p; - for (int i = 1; i < num_crc_streams; i++) { + for (size_t i = 1; i < num_crc_streams; i++) { crc_streams[i] = crc_streams[i - 1] + bs * 64; } pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64; - for (int i = 1; i < num_pclmul_streams; i++) { + for (size_t i = 1; i < num_pclmul_streams; i++) { pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64; } @@ -332,13 +495,13 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams uint64_t l64_pclmul[kMaxStreams] = {0}; // Peel first iteration, because PCLMULQDQ stream, needs setup. - for (int i = 0; i < num_crc_streams; i++) { + for (size_t i = 0; i < num_crc_streams; i++) { l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]); crc_streams[i] += 16 * 4; } V128 partialCRC[kMaxStreams][4]; - for (int i = 0; i < num_pclmul_streams; i++) { + for (size_t i = 0; i < num_pclmul_streams; i++) { partialCRC[i][0] = V128_LoadU( reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 0)); partialCRC[i][1] = V128_LoadU( @@ -350,13 +513,13 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams pclmul_streams[i] += 16 * 4; } - for (int64_t i = 1; i < bs; i++) { + for (size_t i = 1; i < bs; i++) { // Prefetch data for next itterations. - for (int j = 0; j < num_crc_streams; j++) { + for (size_t j = 0; j < num_crc_streams; j++) { base_internal::PrefetchT0( reinterpret_cast<const char*>(crc_streams[j] + kPrefetchHorizon)); } - for (int j = 0; j < num_pclmul_streams; j++) { + for (size_t j = 0; j < num_pclmul_streams; j++) { base_internal::PrefetchT0(reinterpret_cast<const char*>( pclmul_streams[j] + kPrefetchHorizon)); } @@ -398,19 +561,19 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams // PCLMULQDQ based streams require special final step; // CRC based don't. - for (int i = 0; i < num_pclmul_streams; i++) { + for (size_t i = 0; i < num_pclmul_streams; i++) { l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]); } // Combine all streams into single result. uint32_t magic = ComputeZeroConstant(bs * 64); l64 = l64_crc[0]; - for (int i = 1; i < num_crc_streams; i++) { - l64 = multiply(l64, magic); + for (size_t i = 1; i < num_crc_streams; i++) { + l64 = multiply(static_cast<uint32_t>(l64), magic); l64 ^= l64_crc[i]; } - for (int i = 0; i < num_pclmul_streams; i++) { - l64 = multiply(l64, magic); + for (size_t i = 0; i < num_pclmul_streams; i++) { + l64 = multiply(static_cast<uint32_t>(l64), magic); l64 ^= l64_pclmul[i]; } @@ -421,7 +584,7 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams p = crc_streams[num_crc_streams - 1]; } } - l = l64; + l = static_cast<uint32_t>(l64); while ((e - p) >= 16) { ABSL_INTERNAL_STEP8(l, p); @@ -441,139 +604,6 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams *crc = l; } - - private: - // Update partialCRC with crc of 64 byte block. Calling FinalizePclmulStream - // would produce a single crc checksum, but it is expensive. PCLMULQDQ has a - // high latency, so we run 4 128-bit partial checksums that can be reduced to - // a single value by FinalizePclmulStream later. Computing crc for arbitrary - // polynomialas with PCLMULQDQ is described in Intel paper "Fast CRC - // Computation for Generic Polynomials Using PCLMULQDQ Instruction" - // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf - // We are applying it to CRC32C polynomial. 
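// A minimal sketch of the prefetching pattern used in the stream loops above:
// before processing each 64-byte block, data some fixed distance ahead of the
// read pointer is prefetched with the T0 hint so it is already in cache when
// the CRC instructions reach it. The distance below is an illustrative
// stand-in, not the kPrefetchHorizon value used in this file.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <nmmintrin.h>
#include <xmmintrin.h>

constexpr size_t kExamplePrefetchDistance = 256;

uint32_t CrcWithPrefetch(uint32_t crc, const uint8_t* p, size_t blocks64) {
  for (size_t i = 0; i < blocks64; ++i) {
    _mm_prefetch(reinterpret_cast<const char*>(p + kExamplePrefetchDistance),
                 _MM_HINT_T0);
    for (int j = 0; j < 8; ++j) {  // 8 x 8 bytes = one 64-byte block
      uint64_t v;
      std::memcpy(&v, p, 8);
      crc = static_cast<uint32_t>(_mm_crc32_u64(crc, v));
      p += 8;
    }
  }
  return crc;
}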
- ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul( - const uint8_t* p, V128* partialCRC) const { - V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2)); - - V128 partialCRC1 = partialCRC[0]; - V128 partialCRC2 = partialCRC[1]; - V128 partialCRC3 = partialCRC[2]; - V128 partialCRC4 = partialCRC[3]; - - V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands); - V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands); - V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands); - V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands); - V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0)); - V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1)); - V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2)); - V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3)); - partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands); - partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands); - partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands); - partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands); - partialCRC1 = V128_Xor(tmp1, partialCRC1); - partialCRC2 = V128_Xor(tmp2, partialCRC2); - partialCRC3 = V128_Xor(tmp3, partialCRC3); - partialCRC4 = V128_Xor(tmp4, partialCRC4); - partialCRC1 = V128_Xor(partialCRC1, data1); - partialCRC2 = V128_Xor(partialCRC2, data2); - partialCRC3 = V128_Xor(partialCRC3, data3); - partialCRC4 = V128_Xor(partialCRC4, data4); - partialCRC[0] = partialCRC1; - partialCRC[1] = partialCRC2; - partialCRC[2] = partialCRC3; - partialCRC[3] = partialCRC4; - } - - // Reduce partialCRC produced by Process64BytesPclmul into a single value, - // that represents crc checksum of all the processed bytes. - ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t - FinalizePclmulStream(V128* partialCRC) const { - V128 partialCRC1 = partialCRC[0]; - V128 partialCRC2 = partialCRC[1]; - V128 partialCRC3 = partialCRC[2]; - V128 partialCRC4 = partialCRC[3]; - - // Combine 4 vectors of partial crc into a single vector. - V128 reductionMultiplicands = - V128_Load(reinterpret_cast<const V128*>(k5k6)); - - V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1); - V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1); - - partialCRC1 = V128_Xor(low, high); - partialCRC1 = V128_Xor(partialCRC1, partialCRC2); - - low = V128_PMulLow(reductionMultiplicands, partialCRC3); - high = V128_PMulHi(reductionMultiplicands, partialCRC3); - - partialCRC3 = V128_Xor(low, high); - partialCRC3 = V128_Xor(partialCRC3, partialCRC4); - - reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4)); - - low = V128_PMulLow(reductionMultiplicands, partialCRC1); - high = V128_PMulHi(reductionMultiplicands, partialCRC1); - V128 fullCRC = V128_Xor(low, high); - fullCRC = V128_Xor(fullCRC, partialCRC3); - - // Reduce fullCRC into scalar value. 
- reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6)); - - V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask)); - - V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC); - fullCRC = V128_ShiftRight<8>(fullCRC); - fullCRC = V128_Xor(fullCRC, tmp); - - reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0)); - - tmp = V128_ShiftRight<4>(fullCRC); - fullCRC = V128_And(fullCRC, mask); - fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC); - fullCRC = V128_Xor(tmp, fullCRC); - - reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly)); - - tmp = V128_And(fullCRC, mask); - tmp = V128_PMul01(reductionMultiplicands, tmp); - tmp = V128_And(tmp, mask); - tmp = V128_PMulLow(reductionMultiplicands, tmp); - - fullCRC = V128_Xor(tmp, fullCRC); - - return V128_Extract32<1>(fullCRC); - } - - // Update crc with 64 bytes of data from p. - ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p, - uint64_t crc) const { - for (int i = 0; i < 8; i++) { - crc = CRC32_u64(crc, absl::little_endian::Load64(p)); - p += 8; - } - return crc; - } - - // Generated by crc32c_x86_test --crc32c_generate_constants=true - // and verified against constants in linux kernel for S390: - // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S - alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8}; - alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e}; - alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6}; - alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000}; - alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1}; - alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u}; - - // Medium runs of bytes are broken into groups of kGroupsSmall blocks of same - // size. Each group is CRCed in parallel then combined at the end of the - // block. - static constexpr int kGroupsSmall = 3; - // For large runs we use up to kMaxStreams blocks computed with CRC - // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which - // are combined in the end. 
- static constexpr int kMaxStreams = 3; }; } // namespace diff --git a/absl/crc/internal/non_temporal_memcpy.h b/absl/crc/internal/non_temporal_memcpy.h index 092c6078..6cdd3197 100644 --- a/absl/crc/internal/non_temporal_memcpy.h +++ b/absl/crc/internal/non_temporal_memcpy.h @@ -74,7 +74,7 @@ inline void *non_temporal_store_memcpy(void *__restrict dst, uintptr_t bytes_before_alignment_boundary = kCacheLineSize - (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1)); - int header_len = (std::min)(bytes_before_alignment_boundary, len); + size_t header_len = (std::min)(bytes_before_alignment_boundary, len); assert(bytes_before_alignment_boundary < kCacheLineSize); memcpy(d, s, header_len); d += header_len; @@ -87,7 +87,7 @@ inline void *non_temporal_store_memcpy(void *__restrict dst, __m128i *dst_cacheline = reinterpret_cast<__m128i *>(d); const __m128i *src_cacheline = reinterpret_cast<const __m128i *>(s); constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m128i); - uint64_t loops = len / kCacheLineSize; + size_t loops = len / kCacheLineSize; while (len >= kCacheLineSize) { __m128i temp1, temp2, temp3, temp4; @@ -132,7 +132,7 @@ inline void *non_temporal_store_memcpy_avx(void *__restrict dst, uintptr_t bytes_before_alignment_boundary = kCacheLineSize - (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1)); - int header_len = (std::min)(bytes_before_alignment_boundary, len); + size_t header_len = (std::min)(bytes_before_alignment_boundary, len); assert(bytes_before_alignment_boundary < kCacheLineSize); memcpy(d, s, header_len); d += header_len; @@ -145,7 +145,7 @@ inline void *non_temporal_store_memcpy_avx(void *__restrict dst, __m256i *dst_cacheline = reinterpret_cast<__m256i *>(d); const __m256i *src_cacheline = reinterpret_cast<const __m256i *>(s); constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m256i); - int loops = len / kCacheLineSize; + size_t loops = len / kCacheLineSize; while (len >= kCacheLineSize) { __m256i temp1, temp2; |
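// A simplified sketch of the loop that non_temporal_store_memcpy builds
// around: once the destination is cache-line aligned, whole cache lines are
// written with streaming stores that bypass the cache, and an sfence orders
// them before the function returns. The real routine also handles the
// unaligned header and the sub-cache-line tail, which are omitted here;
// assumes a 64-byte cache line, a 16-byte-aligned destination, and a length
// that is a multiple of 64.
#include <cstddef>
#include <emmintrin.h>
#include <xmmintrin.h>

void StreamCopyCacheLines(void* __restrict dst, const void* __restrict src,
                          size_t len) {
  auto* d = static_cast<__m128i*>(dst);
  auto* s = static_cast<const __m128i*>(src);
  constexpr size_t kCacheLineSize = 64;
  constexpr size_t kOpsPerCacheLine = kCacheLineSize / sizeof(__m128i);
  for (size_t line = 0; line < len / kCacheLineSize; ++line) {
    for (size_t op = 0; op < kOpsPerCacheLine; ++op) {
      _mm_stream_si128(d, _mm_loadu_si128(s));  // non-temporal store
      ++d;
      ++s;
    }
  }
  _mm_sfence();  // make the streaming stores globally visible
}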