Diffstat (limited to 'absl/crc/internal/crc_x86_arm_combined.cc')
-rw-r--r--  absl/crc/internal/crc_x86_arm_combined.cc | 392
1 file changed, 211 insertions(+), 181 deletions(-)
diff --git a/absl/crc/internal/crc_x86_arm_combined.cc b/absl/crc/internal/crc_x86_arm_combined.cc
index d771a786..2112f609 100644
--- a/absl/crc/internal/crc_x86_arm_combined.cc
+++ b/absl/crc/internal/crc_x86_arm_combined.cc
@@ -14,12 +14,11 @@
 
 // Hardware accelerated CRC32 computation on Intel and ARM architecture.
 
-#include <stddef.h>
-
+#include <cstddef>
 #include <cstdint>
 
 #include "absl/base/attributes.h"
-#include "absl/base/call_once.h"
+#include "absl/base/config.h"
 #include "absl/base/dynamic_annotations.h"
 #include "absl/base/internal/endian.h"
 #include "absl/base/internal/prefetch.h"
@@ -67,27 +66,30 @@ class CRC32AcceleratedX86ARMCombined : public CRC32 {
 
 // Constants for switching between algorithms.
 // Chosen by comparing speed at different powers of 2.
-constexpr int kSmallCutoff = 256;
-constexpr int kMediumCutoff = 2048;
+constexpr size_t kSmallCutoff = 256;
+constexpr size_t kMediumCutoff = 2048;
 
-#define ABSL_INTERNAL_STEP1(crc) \
-  do {                           \
-    crc = CRC32_u8(crc, *p++);   \
+#define ABSL_INTERNAL_STEP1(crc)                      \
+  do {                                                \
+    crc = CRC32_u8(static_cast<uint32_t>(crc), *p++); \
   } while (0)
-#define ABSL_INTERNAL_STEP2(crc)                          \
-  do {                                                    \
-    crc = CRC32_u16(crc, absl::little_endian::Load16(p)); \
-    p += 2;                                               \
+#define ABSL_INTERNAL_STEP2(crc)                                               \
+  do {                                                                         \
+    crc =                                                                      \
+        CRC32_u16(static_cast<uint32_t>(crc), absl::little_endian::Load16(p)); \
+    p += 2;                                                                    \
   } while (0)
-#define ABSL_INTERNAL_STEP4(crc)                          \
-  do {                                                    \
-    crc = CRC32_u32(crc, absl::little_endian::Load32(p)); \
-    p += 4;                                               \
+#define ABSL_INTERNAL_STEP4(crc)                                               \
+  do {                                                                         \
+    crc =                                                                      \
+        CRC32_u32(static_cast<uint32_t>(crc), absl::little_endian::Load32(p)); \
+    p += 4;                                                                    \
   } while (0)
-#define ABSL_INTERNAL_STEP8(crc, data)                       \
-  do {                                                       \
-    crc = CRC32_u64(crc, absl::little_endian::Load64(data)); \
-    data += 8;                                               \
+#define ABSL_INTERNAL_STEP8(crc, data)                  \
+  do {                                                  \
+    crc = CRC32_u64(static_cast<uint32_t>(crc),         \
+                    absl::little_endian::Load64(data)); \
+    data += 8;                                          \
   } while (0)
 #define ABSL_INTERNAL_STEP8BY2(crc0, crc1, p0, p1) \
   do {                                             \
@@ -101,6 +103,8 @@ constexpr int kMediumCutoff = 2048;
     ABSL_INTERNAL_STEP8(crc2, p2); \
   } while (0)
 
+namespace {
+
 uint32_t multiply(uint32_t a, uint32_t b) {
   V128 shifts = V128_From2x64(0, 1);
   V128 power = V128_From2x64(0, a);
@@ -109,11 +113,10 @@ uint32_t multiply(uint32_t a, uint32_t b) {
 
   // Combine crc values
   res = V128_ShiftLeft64(res, shifts);
-  return V128_Extract32<1>(res) ^ CRC32_u32(0, V128_Low64(res));
+  return static_cast<uint32_t>(V128_Extract32<1>(res)) ^
+         CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res)));
 }
 
-namespace {
-
 // Powers of crc32c polynomial, for faster ExtendByZeros.
 // Verified against folly:
 // folly/hash/detail/Crc32CombineDetail.cpp
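The multiply() helper being cast-cleaned above is the core of stream combination: it multiplies two 32-bit CRC states as polynomials over GF(2) and reduces the product modulo the CRC32C polynomial. A minimal scalar model of that identity (an illustrative sketch, not Abseil's code; the bitwise helpers stand in for the PCLMULQDQ and CRC32 instructions):

#include <cstdint>

// Bitwise CRC32C of one 32-bit word (reflected polynomial 0x82F63B78),
// standing in for the hardware CRC32_u32 instruction.
uint32_t SoftCrc32cU32(uint32_t crc, uint32_t data) {
  crc ^= data;
  for (int i = 0; i < 32; ++i)
    crc = (crc & 1) ? (crc >> 1) ^ 0x82F63B78u : crc >> 1;
  return crc;
}

// Carry-less (GF(2)) 32x32->64-bit product, standing in for PCLMULQDQ.
uint64_t Clmul32(uint32_t a, uint32_t b) {
  uint64_t res = 0;
  for (int i = 0; i < 32; ++i)
    if ((b >> i) & 1) res ^= static_cast<uint64_t>(a) << i;
  return res;
}

// Mirrors multiply() above: shift the product left by one (the vector code
// does this with V128_ShiftLeft64), then reduce the low half with one CRC
// step and xor it into the high half.
uint32_t MultiplyModel(uint32_t a, uint32_t b) {
  uint64_t product = Clmul32(a, b) << 1;
  return static_cast<uint32_t>(product >> 32) ^
         SoftCrc32cU32(0, static_cast<uint32_t>(product));
}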
@@ -204,9 +207,167 @@ enum class CutoffStrategy {
   Unroll64CRC,
 };
 
-template <int num_crc_streams, int num_pclmul_streams, CutoffStrategy strategy>
-class CRC32AcceleratedX86ARMCombinedMultipleStreams
+// Base class for CRC32AcceleratedX86ARMCombinedMultipleStreams containing the
+// methods and data that don't need the template arguments.
+class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
     : public CRC32AcceleratedX86ARMCombined {
+ protected:
+  // Update partialCRC with the crc of a 64 byte block. Calling
+  // FinalizePclmulStream would produce a single crc checksum, but it is
+  // expensive. PCLMULQDQ has a high latency, so we run 4 128-bit partial
+  // checksums that can be reduced to a single value by FinalizePclmulStream
+  // later. Computing crc for arbitrary polynomials with PCLMULQDQ is
+  // described in the Intel paper "Fast CRC Computation for Generic
+  // Polynomials Using PCLMULQDQ Instruction"
+  // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+  // We are applying it to the CRC32C polynomial.
+  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
+      const uint8_t* p, V128* partialCRC) const {
+    V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2));
+
+    V128 partialCRC1 = partialCRC[0];
+    V128 partialCRC2 = partialCRC[1];
+    V128 partialCRC3 = partialCRC[2];
+    V128 partialCRC4 = partialCRC[3];
+
+    V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
+    V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
+    V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
+    V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
+    V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
+    V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
+    V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
+    V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
+    partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
+    partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
+    partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
+    partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
+    partialCRC1 = V128_Xor(tmp1, partialCRC1);
+    partialCRC2 = V128_Xor(tmp2, partialCRC2);
+    partialCRC3 = V128_Xor(tmp3, partialCRC3);
+    partialCRC4 = V128_Xor(tmp4, partialCRC4);
+    partialCRC1 = V128_Xor(partialCRC1, data1);
+    partialCRC2 = V128_Xor(partialCRC2, data2);
+    partialCRC3 = V128_Xor(partialCRC3, data3);
+    partialCRC4 = V128_Xor(partialCRC4, data4);
+    partialCRC[0] = partialCRC1;
+    partialCRC[1] = partialCRC2;
+    partialCRC[2] = partialCRC3;
+    partialCRC[3] = partialCRC4;
+  }
+
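Process64BytesPclmul folds four independent 128-bit lanes across each 64-byte block. One lane of that fold, sketched directly against the x86 intrinsics the V128_* wrappers are assumed to map to (PMulLow/PMulHi as carry-less multiplies of the low/high 64-bit halves; requires a PCLMUL-capable x86-64 target, and on ARM the wrappers map to NEON/PMULL instead):

#include <immintrin.h>
#include <cstdint>

// One 16-byte lane of the 64-byte fold: the 128-bit state is split into two
// 64-bit halves, each is carry-less-multiplied by its folding constant
// (k1 in the low half, k2 in the high half), and the next input block is
// xored in.
__m128i FoldOneLane(__m128i state, const uint8_t* block, __m128i k1k2) {
  __m128i lo = _mm_clmulepi64_si128(state, k1k2, 0x00);  // state_lo * k1
  __m128i hi = _mm_clmulepi64_si128(state, k1k2, 0x11);  // state_hi * k2
  __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(block));
  return _mm_xor_si128(_mm_xor_si128(lo, hi), data);
}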
+  // Reduce partialCRC produced by Process64BytesPclmul into a single value
+  // that represents the crc checksum of all the processed bytes.
+  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
+  FinalizePclmulStream(V128* partialCRC) const {
+    V128 partialCRC1 = partialCRC[0];
+    V128 partialCRC2 = partialCRC[1];
+    V128 partialCRC3 = partialCRC[2];
+    V128 partialCRC4 = partialCRC[3];
+
+    // Combine 4 vectors of partial crc into a single vector.
+    V128 reductionMultiplicands =
+        V128_Load(reinterpret_cast<const V128*>(k5k6));
+
+    V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
+    V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);
+
+    partialCRC1 = V128_Xor(low, high);
+    partialCRC1 = V128_Xor(partialCRC1, partialCRC2);
+
+    low = V128_PMulLow(reductionMultiplicands, partialCRC3);
+    high = V128_PMulHi(reductionMultiplicands, partialCRC3);
+
+    partialCRC3 = V128_Xor(low, high);
+    partialCRC3 = V128_Xor(partialCRC3, partialCRC4);
+
+    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4));
+
+    low = V128_PMulLow(reductionMultiplicands, partialCRC1);
+    high = V128_PMulHi(reductionMultiplicands, partialCRC1);
+    V128 fullCRC = V128_Xor(low, high);
+    fullCRC = V128_Xor(fullCRC, partialCRC3);
+
+    // Reduce fullCRC into scalar value.
+    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6));
+
+    V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask));
+
+    V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC);
+    fullCRC = V128_ShiftRight<8>(fullCRC);
+    fullCRC = V128_Xor(fullCRC, tmp);
+
+    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0));
+
+    tmp = V128_ShiftRight<4>(fullCRC);
+    fullCRC = V128_And(fullCRC, mask);
+    fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC);
+    fullCRC = V128_Xor(tmp, fullCRC);
+
+    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly));
+
+    tmp = V128_And(fullCRC, mask);
+    tmp = V128_PMul01(reductionMultiplicands, tmp);
+    tmp = V128_And(tmp, mask);
+    tmp = V128_PMulLow(reductionMultiplicands, tmp);
+
+    fullCRC = V128_Xor(tmp, fullCRC);
+
+    return static_cast<uint64_t>(V128_Extract32<1>(fullCRC));
+  }
+
+  // Update crc with 64 bytes of data from p.
+  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
+                                                          uint64_t crc) const {
+    for (int i = 0; i < 8; i++) {
+      crc =
+          CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p));
+      p += 8;
+    }
+    return crc;
+  }
+
+  // Generated by crc32c_x86_test --crc32c_generate_constants=true
+  // and verified against constants in linux kernel for S390:
+  // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S
+  alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8};
+  alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e};
+  alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6};
+  alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000};
+  alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1};
+  alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u};
+
+  // Medium runs of bytes are broken into groups of kGroupsSmall blocks of the
+  // same size. Each group is CRCed in parallel then combined at the end of
+  // the block.
+  static constexpr size_t kGroupsSmall = 3;
+  // For large runs we use up to kMaxStreams blocks computed with the CRC
+  // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
+  // are combined at the end.
+  static constexpr size_t kMaxStreams = 3;
+};
+
+#ifdef ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL
+alignas(16) constexpr uint64_t
+    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k1k2[2];
+alignas(16) constexpr uint64_t
+    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k3k4[2];
+alignas(16) constexpr uint64_t
+    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k5k6[2];
+alignas(16) constexpr uint64_t
+    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k7k0[2];
+alignas(16) constexpr uint64_t
+    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kPoly[2];
+alignas(16) constexpr uint32_t
+    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMask[4];
+constexpr size_t
+    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kGroupsSmall;
+constexpr size_t CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMaxStreams;
+#endif  // ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL
+
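The k1k2 through kPoly tables are generated by the quoted test flag rather than derived by hand; conceptually each entry is x^n mod P(x) over GF(2), where n is the bit distance a fold has to bridge. A sketch of that computation for the non-reflected CRC32C polynomial 0x11EDC6F41 (the reflected table entries additionally involve a bit-reversal, omitted here, and the exact n per entry depends on the fold layout):

#include <cstdint>

// Returns x^n mod P(x) over GF(2), for a degree-32 polynomial P passed with
// its x^32 term included (CRC32C: 0x11EDC6F41). Bit i of the result is the
// coefficient of x^i. O(n), which is fine for the small n used here.
uint64_t XPowNModP(uint64_t n, uint64_t poly) {
  uint64_t r = 1;  // the polynomial "1"
  for (uint64_t i = 0; i < n; ++i) {
    r <<= 1;                                  // multiply by x
    if (r & (uint64_t{1} << 32)) r ^= poly;   // reduce once degree hits 32
  }
  return r;
}
// Example: XPowNModP(512, 0x11EDC6F41) bridges one 64-byte (512-bit) block.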
+template <size_t num_crc_streams, size_t num_pclmul_streams,
+          CutoffStrategy strategy>
+class CRC32AcceleratedX86ARMCombinedMultipleStreams
+    : public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase {
   ABSL_ATTRIBUTE_HOT
   void Extend(uint32_t* crc, const void* bytes, size_t length) const override {
     static_assert(num_crc_streams >= 1 && num_crc_streams <= kMaxStreams,
@@ -221,19 +382,19 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
     // We have dedicated instruction for 1,2,4 and 8 bytes.
     if (length & 8) {
       ABSL_INTERNAL_STEP8(l, p);
-      length &= ~8LL;
+      length &= ~size_t{8};
     }
     if (length & 4) {
       ABSL_INTERNAL_STEP4(l);
-      length &= ~4LL;
+      length &= ~size_t{4};
     }
     if (length & 2) {
       ABSL_INTERNAL_STEP2(l);
-      length &= ~2LL;
+      length &= ~size_t{2};
     }
     if (length & 1) {
       ABSL_INTERNAL_STEP1(l);
-      length &= ~1LL;
+      length &= ~size_t{1};
     }
     if (length == 0) {
       *crc = l;
@@ -263,12 +424,12 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
     if (strategy == CutoffStrategy::Fold3) {
       uint64_t l641 = 0;
       uint64_t l642 = 0;
-      const int blockSize = 32;
-      int64_t bs = (e - p) / kGroupsSmall / blockSize;
+      const size_t blockSize = 32;
+      size_t bs = static_cast<size_t>(e - p) / kGroupsSmall / blockSize;
       const uint8_t* p1 = p + bs * blockSize;
       const uint8_t* p2 = p1 + bs * blockSize;
 
-      for (int64_t i = 0; i < bs - 1; ++i) {
+      for (size_t i = 0; i + 1 < bs; ++i) {
         ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
         ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
         ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
@@ -290,8 +451,9 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
 
       V128 res2 = V128_PMul10(tmp, magic);
       V128 x = V128_Xor(res1, res2);
-      l64 = V128_Low64(x) ^ absl::little_endian::Load64(p2);
-      l64 = CRC32_u64(l642, l64);
+      l64 = static_cast<uint64_t>(V128_Low64(x)) ^
+            absl::little_endian::Load64(p2);
+      l64 = CRC32_u64(static_cast<uint32_t>(l642), l64);
 
       p = p2 + 8;
     } else if (strategy == CutoffStrategy::Unroll64CRC) {
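The Fold3 path works because the three updates issued by each ABSL_INTERNAL_STEP8BY3 are data-independent, letting the CRC32 instruction's multi-cycle latency overlap across streams. The same loop shape in portable scalar form, with a hypothetical SoftCrc32cU64 standing in for CRC32_u64 (the multiply()/magic merge that follows in the real code is omitted):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Bitwise CRC32C over one 64-bit word, a stand-in for CRC32_u64.
uint32_t SoftCrc32cU64(uint32_t crc, uint64_t data) {
  for (int i = 0; i < 64; ++i) {
    uint32_t bit = (crc ^ static_cast<uint32_t>(data)) & 1u;
    crc = (crc >> 1) ^ (bit ? 0x82F63B78u : 0u);
    data >>= 1;
  }
  return crc;
}

// Three partial CRCs over three consecutive n-byte slices of p (n a multiple
// of 8). The three updates per iteration carry no data dependence on one
// another, which is what lets the hardware overlap their latencies; merging
// the partials still needs the multiply()/magic step from the patch.
void ThreeStreamPartials(const uint8_t* p, size_t n, uint32_t out[3]) {
  uint32_t c0 = 0, c1 = 0, c2 = 0;
  for (size_t i = 0; i < n; i += 8) {
    uint64_t v0, v1, v2;
    std::memcpy(&v0, p + i, 8);
    std::memcpy(&v1, p + n + i, 8);
    std::memcpy(&v2, p + 2 * n + i, 8);
    c0 = SoftCrc32cU64(c0, v0);  // independent of the next two updates
    c1 = SoftCrc32cU64(c1, v1);
    c2 = SoftCrc32cU64(c2, v2);
  }
  out[0] = c0;
  out[1] = c1;
  out[2] = c2;
}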
@@ -314,16 +476,17 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
         ABSL_INTERNAL_STEP1(l);
       }
 
-      int64_t bs = (e - p) / (num_crc_streams + num_pclmul_streams) / 64;
+      size_t bs = static_cast<size_t>(e - p) /
+                  (num_crc_streams + num_pclmul_streams) / 64;
       const uint8_t* crc_streams[kMaxStreams];
       const uint8_t* pclmul_streams[kMaxStreams];
       // We are guaranteed to have at least one crc stream.
       crc_streams[0] = p;
-      for (int i = 1; i < num_crc_streams; i++) {
+      for (size_t i = 1; i < num_crc_streams; i++) {
        crc_streams[i] = crc_streams[i - 1] + bs * 64;
       }
       pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64;
-      for (int i = 1; i < num_pclmul_streams; i++) {
+      for (size_t i = 1; i < num_pclmul_streams; i++) {
         pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64;
       }
 
@@ -332,13 +495,13 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
       uint64_t l64_pclmul[kMaxStreams] = {0};
 
       // Peel first iteration, because the PCLMULQDQ stream needs setup.
-      for (int i = 0; i < num_crc_streams; i++) {
+      for (size_t i = 0; i < num_crc_streams; i++) {
         l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
         crc_streams[i] += 16 * 4;
       }
 
       V128 partialCRC[kMaxStreams][4];
-      for (int i = 0; i < num_pclmul_streams; i++) {
+      for (size_t i = 0; i < num_pclmul_streams; i++) {
         partialCRC[i][0] = V128_LoadU(
             reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 0));
         partialCRC[i][1] = V128_LoadU(
@@ -350,13 +513,13 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
         pclmul_streams[i] += 16 * 4;
       }
 
-      for (int64_t i = 1; i < bs; i++) {
+      for (size_t i = 1; i < bs; i++) {
         // Prefetch data for next iterations.
-        for (int j = 0; j < num_crc_streams; j++) {
+        for (size_t j = 0; j < num_crc_streams; j++) {
           base_internal::PrefetchT0(
               reinterpret_cast<const char*>(crc_streams[j] + kPrefetchHorizon));
         }
-        for (int j = 0; j < num_pclmul_streams; j++) {
+        for (size_t j = 0; j < num_pclmul_streams; j++) {
           base_internal::PrefetchT0(reinterpret_cast<const char*>(
               pclmul_streams[j] + kPrefetchHorizon));
         }
@@ -398,19 +561,19 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
 
       // PCLMULQDQ based streams require special final step;
       // CRC based don't.
-      for (int i = 0; i < num_pclmul_streams; i++) {
+      for (size_t i = 0; i < num_pclmul_streams; i++) {
         l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]);
       }
 
       // Combine all streams into single result.
       uint32_t magic = ComputeZeroConstant(bs * 64);
       l64 = l64_crc[0];
-      for (int i = 1; i < num_crc_streams; i++) {
-        l64 = multiply(l64, magic);
+      for (size_t i = 1; i < num_crc_streams; i++) {
+        l64 = multiply(static_cast<uint32_t>(l64), magic);
         l64 ^= l64_crc[i];
       }
-      for (int i = 0; i < num_pclmul_streams; i++) {
-        l64 = multiply(l64, magic);
+      for (size_t i = 0; i < num_pclmul_streams; i++) {
+        l64 = multiply(static_cast<uint32_t>(l64), magic);
         l64 ^= l64_pclmul[i];
       }
 
@@ -421,7 +584,7 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
         p = crc_streams[num_crc_streams - 1];
       }
     }
-    l = l64;
+    l = static_cast<uint32_t>(l64);
 
     while ((e - p) >= 16) {
       ABSL_INTERNAL_STEP8(l, p);
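The PrefetchT0 calls in the main loop above pull each stream's data kPrefetchHorizon bytes ahead of the current position, so the loads for the next iterations are already in cache. A portable analogue of the pattern using the GCC/Clang builtin; 256 below is an illustrative stand-in, not Abseil's tuned horizon:

#include <cstddef>
#include <cstdint>

// Consumes 64-byte blocks while prefetching a fixed distance ahead,
// mirroring the shape of the loop above.
uint64_t SumWithPrefetch(const uint8_t* p, size_t len) {
  constexpr size_t kHorizon = 256;  // illustrative distance, in bytes
  uint64_t sum = 0;
  for (size_t i = 0; i + 64 <= len; i += 64) {
    if (i + kHorizon < len)
      __builtin_prefetch(p + i + kHorizon, /*rw=*/0, /*locality=*/3);
    for (size_t j = 0; j < 64; ++j) sum += p[i + j];  // the real work
  }
  return sum;
}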
@@ -441,139 +604,6 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
 
     *crc = l;
   }
-
- private:
-  // Update partialCRC with the crc of a 64 byte block. Calling
-  // FinalizePclmulStream would produce a single crc checksum, but it is
-  // expensive. PCLMULQDQ has a high latency, so we run 4 128-bit partial
-  // checksums that can be reduced to a single value by FinalizePclmulStream
-  // later. Computing crc for arbitrary polynomials with PCLMULQDQ is
-  // described in the Intel paper "Fast CRC Computation for Generic
-  // Polynomials Using PCLMULQDQ Instruction"
-  // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-  // We are applying it to the CRC32C polynomial.
-  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
-      const uint8_t* p, V128* partialCRC) const {
-    V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2));
-
-    V128 partialCRC1 = partialCRC[0];
-    V128 partialCRC2 = partialCRC[1];
-    V128 partialCRC3 = partialCRC[2];
-    V128 partialCRC4 = partialCRC[3];
-
-    V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
-    V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
-    V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
-    V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
-    V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
-    V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
-    V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
-    V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
-    partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
-    partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
-    partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
-    partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
-    partialCRC1 = V128_Xor(tmp1, partialCRC1);
-    partialCRC2 = V128_Xor(tmp2, partialCRC2);
-    partialCRC3 = V128_Xor(tmp3, partialCRC3);
-    partialCRC4 = V128_Xor(tmp4, partialCRC4);
-    partialCRC1 = V128_Xor(partialCRC1, data1);
-    partialCRC2 = V128_Xor(partialCRC2, data2);
-    partialCRC3 = V128_Xor(partialCRC3, data3);
-    partialCRC4 = V128_Xor(partialCRC4, data4);
-    partialCRC[0] = partialCRC1;
-    partialCRC[1] = partialCRC2;
-    partialCRC[2] = partialCRC3;
-    partialCRC[3] = partialCRC4;
-  }
-
-  // Reduce partialCRC produced by Process64BytesPclmul into a single value
-  // that represents the crc checksum of all the processed bytes.
-  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
-  FinalizePclmulStream(V128* partialCRC) const {
-    V128 partialCRC1 = partialCRC[0];
-    V128 partialCRC2 = partialCRC[1];
-    V128 partialCRC3 = partialCRC[2];
-    V128 partialCRC4 = partialCRC[3];
-
-    // Combine 4 vectors of partial crc into a single vector.
-    V128 reductionMultiplicands =
-        V128_Load(reinterpret_cast<const V128*>(k5k6));
-
-    V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
-    V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);
-
-    partialCRC1 = V128_Xor(low, high);
-    partialCRC1 = V128_Xor(partialCRC1, partialCRC2);
-
-    low = V128_PMulLow(reductionMultiplicands, partialCRC3);
-    high = V128_PMulHi(reductionMultiplicands, partialCRC3);
-
-    partialCRC3 = V128_Xor(low, high);
-    partialCRC3 = V128_Xor(partialCRC3, partialCRC4);
-
-    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4));
-
-    low = V128_PMulLow(reductionMultiplicands, partialCRC1);
-    high = V128_PMulHi(reductionMultiplicands, partialCRC1);
-    V128 fullCRC = V128_Xor(low, high);
-    fullCRC = V128_Xor(fullCRC, partialCRC3);
-
-    // Reduce fullCRC into scalar value.
-    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6));
-
-    V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask));
-
-    V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC);
-    fullCRC = V128_ShiftRight<8>(fullCRC);
-    fullCRC = V128_Xor(fullCRC, tmp);
-
-    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0));
-
-    tmp = V128_ShiftRight<4>(fullCRC);
-    fullCRC = V128_And(fullCRC, mask);
-    fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC);
-    fullCRC = V128_Xor(tmp, fullCRC);
-
-    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly));
-
-    tmp = V128_And(fullCRC, mask);
-    tmp = V128_PMul01(reductionMultiplicands, tmp);
-    tmp = V128_And(tmp, mask);
-    tmp = V128_PMulLow(reductionMultiplicands, tmp);
-
-    fullCRC = V128_Xor(tmp, fullCRC);
-
-    return V128_Extract32<1>(fullCRC);
-  }
-
-  // Update crc with 64 bytes of data from p.
-  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
-                                                          uint64_t crc) const {
-    for (int i = 0; i < 8; i++) {
-      crc = CRC32_u64(crc, absl::little_endian::Load64(p));
-      p += 8;
-    }
-    return crc;
-  }
-
-  // Generated by crc32c_x86_test --crc32c_generate_constants=true
-  // and verified against constants in linux kernel for S390:
-  // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S
-  alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8};
-  alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e};
-  alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6};
-  alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000};
-  alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1};
-  alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u};
-
-  // Medium runs of bytes are broken into groups of kGroupsSmall blocks of the
-  // same size. Each group is CRCed in parallel then combined at the end of
-  // the block.
-  static constexpr int kGroupsSmall = 3;
-  // For large runs we use up to kMaxStreams blocks computed with the CRC
-  // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
-  // are combined at the end.
-  static constexpr int kMaxStreams = 3;
 };
 
 }  // namespace
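The whole patch leans on one identity for merging per-stream results: for the zero-seeded, un-xored CRC the internal code works with, crc(A || B) = multiply(crc(A), x^(8|B|) mod P) ^ crc(B), where the x^(8|B|) constant is what ComputeZeroConstant supplies. A self-contained scalar check of that algebra, using bitwise stand-ins for the hardware paths (0x80000000 encodes the polynomial 1 in this bit order):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Bitwise CRC32C (zero seed, no final xor), stand-in for the hardware path.
static uint32_t Crc32c(uint32_t crc, const void* data, size_t n) {
  const uint8_t* p = static_cast<const uint8_t*>(data);
  for (size_t i = 0; i < n; ++i) {
    crc ^= p[i];
    for (int b = 0; b < 8; ++b)
      crc = (crc & 1) ? (crc >> 1) ^ 0x82F63B78u : crc >> 1;
  }
  return crc;
}

// Bitwise CRC32C over one 32-bit word, mirroring CRC32_u32 semantics.
static uint32_t Crc32cWord(uint32_t crc, uint32_t word) {
  crc ^= word;
  for (int b = 0; b < 32; ++b)
    crc = (crc & 1) ? (crc >> 1) ^ 0x82F63B78u : crc >> 1;
  return crc;
}

// Carry-less 32x32->64 product, stand-in for PCLMULQDQ.
static uint64_t Clmul32(uint32_t a, uint32_t b) {
  uint64_t r = 0;
  for (int i = 0; i < 32; ++i)
    if ((b >> i) & 1) r ^= static_cast<uint64_t>(a) << i;
  return r;
}

// Reflected-domain multiplication mod P, same shape as multiply() above.
static uint32_t MulModP(uint32_t a, uint32_t b) {
  uint64_t prod = Clmul32(a, b) << 1;
  return static_cast<uint32_t>(prod >> 32) ^
         Crc32cWord(0, static_cast<uint32_t>(prod));
}

int main() {
  const char a[] = "first stream of data";
  const char b[] = "second stream of data";
  uint32_t crc_a = Crc32c(0, a, sizeof(a));
  uint32_t crc_b = Crc32c(0, b, sizeof(b));

  // Reference: crc over the actual concatenation.
  char ab[sizeof(a) + sizeof(b)];
  std::memcpy(ab, a, sizeof(a));
  std::memcpy(ab + sizeof(a), b, sizeof(b));
  uint32_t expected = Crc32c(0, ab, sizeof(ab));

  // Zero-extension form: run crc_a through |b| zero bytes, then xor crc_b.
  const char zeros[sizeof(b)] = {};
  assert((Crc32c(crc_a, zeros, sizeof(b)) ^ crc_b) == expected);

  // multiply()/magic form: extending the unit state 0x80000000 by |b| zero
  // bytes yields x^(8*|b|) mod P -- the "magic" combination constant.
  uint32_t magic = Crc32c(0x80000000u, zeros, sizeof(b));
  assert((MulModP(crc_a, magic) ^ crc_b) == expected);
  return 0;
}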