Diffstat (limited to 'absl/crc/internal/crc_x86_arm_combined.cc')
-rw-r--r--  absl/crc/internal/crc_x86_arm_combined.cc  |  392
1 file changed, 211 insertions, 181 deletions
diff --git a/absl/crc/internal/crc_x86_arm_combined.cc b/absl/crc/internal/crc_x86_arm_combined.cc
index d771a786..2112f609 100644
--- a/absl/crc/internal/crc_x86_arm_combined.cc
+++ b/absl/crc/internal/crc_x86_arm_combined.cc
@@ -14,12 +14,11 @@
// Hardware accelerated CRC32 computation on Intel and ARM architecture.
-#include <stddef.h>
-
+#include <cstddef>
#include <cstdint>
#include "absl/base/attributes.h"
-#include "absl/base/call_once.h"
+#include "absl/base/config.h"
#include "absl/base/dynamic_annotations.h"
#include "absl/base/internal/endian.h"
#include "absl/base/internal/prefetch.h"
@@ -67,27 +66,30 @@ class CRC32AcceleratedX86ARMCombined : public CRC32 {
// Constants for switching between algorithms.
// Chosen by comparing speed at different powers of 2.
-constexpr int kSmallCutoff = 256;
-constexpr int kMediumCutoff = 2048;
+constexpr size_t kSmallCutoff = 256;
+constexpr size_t kMediumCutoff = 2048;
-#define ABSL_INTERNAL_STEP1(crc) \
- do { \
- crc = CRC32_u8(crc, *p++); \
+#define ABSL_INTERNAL_STEP1(crc) \
+ do { \
+ crc = CRC32_u8(static_cast<uint32_t>(crc), *p++); \
} while (0)
-#define ABSL_INTERNAL_STEP2(crc) \
- do { \
- crc = CRC32_u16(crc, absl::little_endian::Load16(p)); \
- p += 2; \
+#define ABSL_INTERNAL_STEP2(crc) \
+ do { \
+ crc = \
+ CRC32_u16(static_cast<uint32_t>(crc), absl::little_endian::Load16(p)); \
+ p += 2; \
} while (0)
-#define ABSL_INTERNAL_STEP4(crc) \
- do { \
- crc = CRC32_u32(crc, absl::little_endian::Load32(p)); \
- p += 4; \
+#define ABSL_INTERNAL_STEP4(crc) \
+ do { \
+ crc = \
+ CRC32_u32(static_cast<uint32_t>(crc), absl::little_endian::Load32(p)); \
+ p += 4; \
} while (0)
-#define ABSL_INTERNAL_STEP8(crc, data) \
- do { \
- crc = CRC32_u64(crc, absl::little_endian::Load64(data)); \
- data += 8; \
+#define ABSL_INTERNAL_STEP8(crc, data) \
+ do { \
+ crc = CRC32_u64(static_cast<uint32_t>(crc), \
+ absl::little_endian::Load64(data)); \
+ data += 8; \
} while (0)
#define ABSL_INTERNAL_STEP8BY2(crc0, crc1, p0, p1) \
do { \
@@ -101,6 +103,8 @@ constexpr int kMediumCutoff = 2048;
ABSL_INTERNAL_STEP8(crc2, p2); \
} while (0)
+namespace {
+
uint32_t multiply(uint32_t a, uint32_t b) {
V128 shifts = V128_From2x64(0, 1);
V128 power = V128_From2x64(0, a);
@@ -109,11 +113,10 @@ uint32_t multiply(uint32_t a, uint32_t b) {
// Combine crc values
res = V128_ShiftLeft64(res, shifts);
- return V128_Extract32<1>(res) ^ CRC32_u32(0, V128_Low64(res));
+ return static_cast<uint32_t>(V128_Extract32<1>(res)) ^
+ CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res)));
}
-namespace {
-
// Powers of crc32c polynomial, for faster ExtendByZeros.
// Verified against folly:
// folly/hash/detail/Crc32CombineDetail.cpp
@@ -204,9 +207,167 @@ enum class CutoffStrategy {
Unroll64CRC,
};
-template <int num_crc_streams, int num_pclmul_streams, CutoffStrategy strategy>
-class CRC32AcceleratedX86ARMCombinedMultipleStreams
+// Base class for CRC32AcceleratedX86ARMCombinedMultipleStreams containing the
+// methods and data that don't need the template arguments.
+class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
: public CRC32AcceleratedX86ARMCombined {
+ protected:
+ // Update partialCRC with crc of 64 byte block. Calling FinalizePclmulStream
+ // would produce a single crc checksum, but it is expensive. PCLMULQDQ has a
+ // high latency, so we run 4 128-bit partial checksums that can be reduced to
+ // a single value by FinalizePclmulStream later. Computing crc for arbitrary
+  // polynomials with PCLMULQDQ is described in Intel paper "Fast CRC
+ // Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+ // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+ // We are applying it to CRC32C polynomial.
+ ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
+ const uint8_t* p, V128* partialCRC) const {
+ V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2));
+
+ V128 partialCRC1 = partialCRC[0];
+ V128 partialCRC2 = partialCRC[1];
+ V128 partialCRC3 = partialCRC[2];
+ V128 partialCRC4 = partialCRC[3];
+
+ V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
+ V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
+ V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
+ V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
+ V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
+ V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
+ V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
+ V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
+ partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
+ partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
+ partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
+ partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
+ partialCRC1 = V128_Xor(tmp1, partialCRC1);
+ partialCRC2 = V128_Xor(tmp2, partialCRC2);
+ partialCRC3 = V128_Xor(tmp3, partialCRC3);
+ partialCRC4 = V128_Xor(tmp4, partialCRC4);
+ partialCRC1 = V128_Xor(partialCRC1, data1);
+ partialCRC2 = V128_Xor(partialCRC2, data2);
+ partialCRC3 = V128_Xor(partialCRC3, data3);
+ partialCRC4 = V128_Xor(partialCRC4, data4);
+ partialCRC[0] = partialCRC1;
+ partialCRC[1] = partialCRC2;
+ partialCRC[2] = partialCRC3;
+ partialCRC[3] = partialCRC4;
+ }
+
+ // Reduce partialCRC produced by Process64BytesPclmul into a single value,
+ // that represents crc checksum of all the processed bytes.
+ ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
+ FinalizePclmulStream(V128* partialCRC) const {
+ V128 partialCRC1 = partialCRC[0];
+ V128 partialCRC2 = partialCRC[1];
+ V128 partialCRC3 = partialCRC[2];
+ V128 partialCRC4 = partialCRC[3];
+
+ // Combine 4 vectors of partial crc into a single vector.
+ V128 reductionMultiplicands =
+ V128_Load(reinterpret_cast<const V128*>(k5k6));
+
+ V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
+ V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);
+
+ partialCRC1 = V128_Xor(low, high);
+ partialCRC1 = V128_Xor(partialCRC1, partialCRC2);
+
+ low = V128_PMulLow(reductionMultiplicands, partialCRC3);
+ high = V128_PMulHi(reductionMultiplicands, partialCRC3);
+
+ partialCRC3 = V128_Xor(low, high);
+ partialCRC3 = V128_Xor(partialCRC3, partialCRC4);
+
+ reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4));
+
+ low = V128_PMulLow(reductionMultiplicands, partialCRC1);
+ high = V128_PMulHi(reductionMultiplicands, partialCRC1);
+ V128 fullCRC = V128_Xor(low, high);
+ fullCRC = V128_Xor(fullCRC, partialCRC3);
+
+ // Reduce fullCRC into scalar value.
+ reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6));
+
+ V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask));
+
+ V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC);
+ fullCRC = V128_ShiftRight<8>(fullCRC);
+ fullCRC = V128_Xor(fullCRC, tmp);
+
+ reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0));
+
+ tmp = V128_ShiftRight<4>(fullCRC);
+ fullCRC = V128_And(fullCRC, mask);
+ fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC);
+ fullCRC = V128_Xor(tmp, fullCRC);
+
+ reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly));
+
+ tmp = V128_And(fullCRC, mask);
+ tmp = V128_PMul01(reductionMultiplicands, tmp);
+ tmp = V128_And(tmp, mask);
+ tmp = V128_PMulLow(reductionMultiplicands, tmp);
+
+ fullCRC = V128_Xor(tmp, fullCRC);
+
+ return static_cast<uint64_t>(V128_Extract32<1>(fullCRC));
+ }
+
+ // Update crc with 64 bytes of data from p.
+ ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
+ uint64_t crc) const {
+ for (int i = 0; i < 8; i++) {
+ crc =
+ CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p));
+ p += 8;
+ }
+ return crc;
+ }
+
+ // Generated by crc32c_x86_test --crc32c_generate_constants=true
+ // and verified against constants in linux kernel for S390:
+ // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S
+ alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8};
+ alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e};
+ alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6};
+ alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000};
+ alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1};
+ alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u};
+
+ // Medium runs of bytes are broken into groups of kGroupsSmall blocks of same
+ // size. Each group is CRCed in parallel then combined at the end of the
+ // block.
+ static constexpr size_t kGroupsSmall = 3;
+ // For large runs we use up to kMaxStreams blocks computed with CRC
+ // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
+ // are combined in the end.
+ static constexpr size_t kMaxStreams = 3;
+};
+
+#ifdef ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k1k2[2];
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k3k4[2];
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k5k6[2];
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k7k0[2];
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kPoly[2];
+alignas(16) constexpr uint32_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMask[4];
+constexpr size_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kGroupsSmall;
+constexpr size_t CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMaxStreams;
+#endif // ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL
+
+template <size_t num_crc_streams, size_t num_pclmul_streams,
+ CutoffStrategy strategy>
+class CRC32AcceleratedX86ARMCombinedMultipleStreams
+ : public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase {
ABSL_ATTRIBUTE_HOT
void Extend(uint32_t* crc, const void* bytes, size_t length) const override {
static_assert(num_crc_streams >= 1 && num_crc_streams <= kMaxStreams,
@@ -221,19 +382,19 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
// We have dedicated instruction for 1,2,4 and 8 bytes.
if (length & 8) {
ABSL_INTERNAL_STEP8(l, p);
- length &= ~8LL;
+ length &= ~size_t{8};
}
if (length & 4) {
ABSL_INTERNAL_STEP4(l);
- length &= ~4LL;
+ length &= ~size_t{4};
}
if (length & 2) {
ABSL_INTERNAL_STEP2(l);
- length &= ~2LL;
+ length &= ~size_t{2};
}
if (length & 1) {
ABSL_INTERNAL_STEP1(l);
- length &= ~1LL;
+ length &= ~size_t{1};
}
if (length == 0) {
*crc = l;
@@ -263,12 +424,12 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
if (strategy == CutoffStrategy::Fold3) {
uint64_t l641 = 0;
uint64_t l642 = 0;
- const int blockSize = 32;
- int64_t bs = (e - p) / kGroupsSmall / blockSize;
+ const size_t blockSize = 32;
+ size_t bs = static_cast<size_t>(e - p) / kGroupsSmall / blockSize;
const uint8_t* p1 = p + bs * blockSize;
const uint8_t* p2 = p1 + bs * blockSize;
- for (int64_t i = 0; i < bs - 1; ++i) {
+ for (size_t i = 0; i + 1 < bs; ++i) {
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
@@ -290,8 +451,9 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
V128 res2 = V128_PMul10(tmp, magic);
V128 x = V128_Xor(res1, res2);
- l64 = V128_Low64(x) ^ absl::little_endian::Load64(p2);
- l64 = CRC32_u64(l642, l64);
+ l64 = static_cast<uint64_t>(V128_Low64(x)) ^
+ absl::little_endian::Load64(p2);
+ l64 = CRC32_u64(static_cast<uint32_t>(l642), l64);
p = p2 + 8;
} else if (strategy == CutoffStrategy::Unroll64CRC) {
@@ -314,16 +476,17 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
ABSL_INTERNAL_STEP1(l);
}
- int64_t bs = (e - p) / (num_crc_streams + num_pclmul_streams) / 64;
+ size_t bs = static_cast<size_t>(e - p) /
+ (num_crc_streams + num_pclmul_streams) / 64;
const uint8_t* crc_streams[kMaxStreams];
const uint8_t* pclmul_streams[kMaxStreams];
// We are guaranteed to have at least one crc stream.
crc_streams[0] = p;
- for (int i = 1; i < num_crc_streams; i++) {
+ for (size_t i = 1; i < num_crc_streams; i++) {
crc_streams[i] = crc_streams[i - 1] + bs * 64;
}
pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64;
- for (int i = 1; i < num_pclmul_streams; i++) {
+ for (size_t i = 1; i < num_pclmul_streams; i++) {
pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64;
}
@@ -332,13 +495,13 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
uint64_t l64_pclmul[kMaxStreams] = {0};
    // Peel first iteration, because the PCLMULQDQ stream needs setup.
- for (int i = 0; i < num_crc_streams; i++) {
+ for (size_t i = 0; i < num_crc_streams; i++) {
l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
crc_streams[i] += 16 * 4;
}
V128 partialCRC[kMaxStreams][4];
- for (int i = 0; i < num_pclmul_streams; i++) {
+ for (size_t i = 0; i < num_pclmul_streams; i++) {
partialCRC[i][0] = V128_LoadU(
reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 0));
partialCRC[i][1] = V128_LoadU(
@@ -350,13 +513,13 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
pclmul_streams[i] += 16 * 4;
}
- for (int64_t i = 1; i < bs; i++) {
+ for (size_t i = 1; i < bs; i++) {
      // Prefetch data for next iterations.
- for (int j = 0; j < num_crc_streams; j++) {
+ for (size_t j = 0; j < num_crc_streams; j++) {
base_internal::PrefetchT0(
reinterpret_cast<const char*>(crc_streams[j] + kPrefetchHorizon));
}
- for (int j = 0; j < num_pclmul_streams; j++) {
+ for (size_t j = 0; j < num_pclmul_streams; j++) {
base_internal::PrefetchT0(reinterpret_cast<const char*>(
pclmul_streams[j] + kPrefetchHorizon));
}
@@ -398,19 +561,19 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
// PCLMULQDQ based streams require special final step;
// CRC based don't.
- for (int i = 0; i < num_pclmul_streams; i++) {
+ for (size_t i = 0; i < num_pclmul_streams; i++) {
l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]);
}
// Combine all streams into single result.
uint32_t magic = ComputeZeroConstant(bs * 64);
l64 = l64_crc[0];
- for (int i = 1; i < num_crc_streams; i++) {
- l64 = multiply(l64, magic);
+ for (size_t i = 1; i < num_crc_streams; i++) {
+ l64 = multiply(static_cast<uint32_t>(l64), magic);
l64 ^= l64_crc[i];
}
- for (int i = 0; i < num_pclmul_streams; i++) {
- l64 = multiply(l64, magic);
+ for (size_t i = 0; i < num_pclmul_streams; i++) {
+ l64 = multiply(static_cast<uint32_t>(l64), magic);
l64 ^= l64_pclmul[i];
}
@@ -421,7 +584,7 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
p = crc_streams[num_crc_streams - 1];
}
}
- l = l64;
+ l = static_cast<uint32_t>(l64);
while ((e - p) >= 16) {
ABSL_INTERNAL_STEP8(l, p);
@@ -441,139 +604,6 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
*crc = l;
}
-
- private:
- // Update partialCRC with crc of 64 byte block. Calling FinalizePclmulStream
- // would produce a single crc checksum, but it is expensive. PCLMULQDQ has a
- // high latency, so we run 4 128-bit partial checksums that can be reduced to
- // a single value by FinalizePclmulStream later. Computing crc for arbitrary
-  // polynomials with PCLMULQDQ is described in Intel paper "Fast CRC
- // Computation for Generic Polynomials Using PCLMULQDQ Instruction"
- // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
- // We are applying it to CRC32C polynomial.
- ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
- const uint8_t* p, V128* partialCRC) const {
- V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2));
-
- V128 partialCRC1 = partialCRC[0];
- V128 partialCRC2 = partialCRC[1];
- V128 partialCRC3 = partialCRC[2];
- V128 partialCRC4 = partialCRC[3];
-
- V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
- V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
- V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
- V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
- V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
- V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
- V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
- V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
- partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
- partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
- partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
- partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
- partialCRC1 = V128_Xor(tmp1, partialCRC1);
- partialCRC2 = V128_Xor(tmp2, partialCRC2);
- partialCRC3 = V128_Xor(tmp3, partialCRC3);
- partialCRC4 = V128_Xor(tmp4, partialCRC4);
- partialCRC1 = V128_Xor(partialCRC1, data1);
- partialCRC2 = V128_Xor(partialCRC2, data2);
- partialCRC3 = V128_Xor(partialCRC3, data3);
- partialCRC4 = V128_Xor(partialCRC4, data4);
- partialCRC[0] = partialCRC1;
- partialCRC[1] = partialCRC2;
- partialCRC[2] = partialCRC3;
- partialCRC[3] = partialCRC4;
- }
-
- // Reduce partialCRC produced by Process64BytesPclmul into a single value,
- // that represents crc checksum of all the processed bytes.
- ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
- FinalizePclmulStream(V128* partialCRC) const {
- V128 partialCRC1 = partialCRC[0];
- V128 partialCRC2 = partialCRC[1];
- V128 partialCRC3 = partialCRC[2];
- V128 partialCRC4 = partialCRC[3];
-
- // Combine 4 vectors of partial crc into a single vector.
- V128 reductionMultiplicands =
- V128_Load(reinterpret_cast<const V128*>(k5k6));
-
- V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
- V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);
-
- partialCRC1 = V128_Xor(low, high);
- partialCRC1 = V128_Xor(partialCRC1, partialCRC2);
-
- low = V128_PMulLow(reductionMultiplicands, partialCRC3);
- high = V128_PMulHi(reductionMultiplicands, partialCRC3);
-
- partialCRC3 = V128_Xor(low, high);
- partialCRC3 = V128_Xor(partialCRC3, partialCRC4);
-
- reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4));
-
- low = V128_PMulLow(reductionMultiplicands, partialCRC1);
- high = V128_PMulHi(reductionMultiplicands, partialCRC1);
- V128 fullCRC = V128_Xor(low, high);
- fullCRC = V128_Xor(fullCRC, partialCRC3);
-
- // Reduce fullCRC into scalar value.
- reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6));
-
- V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask));
-
- V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC);
- fullCRC = V128_ShiftRight<8>(fullCRC);
- fullCRC = V128_Xor(fullCRC, tmp);
-
- reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0));
-
- tmp = V128_ShiftRight<4>(fullCRC);
- fullCRC = V128_And(fullCRC, mask);
- fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC);
- fullCRC = V128_Xor(tmp, fullCRC);
-
- reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly));
-
- tmp = V128_And(fullCRC, mask);
- tmp = V128_PMul01(reductionMultiplicands, tmp);
- tmp = V128_And(tmp, mask);
- tmp = V128_PMulLow(reductionMultiplicands, tmp);
-
- fullCRC = V128_Xor(tmp, fullCRC);
-
- return V128_Extract32<1>(fullCRC);
- }
-
- // Update crc with 64 bytes of data from p.
- ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
- uint64_t crc) const {
- for (int i = 0; i < 8; i++) {
- crc = CRC32_u64(crc, absl::little_endian::Load64(p));
- p += 8;
- }
- return crc;
- }
-
- // Generated by crc32c_x86_test --crc32c_generate_constants=true
- // and verified against constants in linux kernel for S390:
- // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S
- alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8};
- alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e};
- alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6};
- alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000};
- alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1};
- alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u};
-
- // Medium runs of bytes are broken into groups of kGroupsSmall blocks of same
- // size. Each group is CRCed in parallel then combined at the end of the
- // block.
- static constexpr int kGroupsSmall = 3;
- // For large runs we use up to kMaxStreams blocks computed with CRC
- // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
- // are combined in the end.
- static constexpr int kMaxStreams = 3;
};
} // namespace
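
Note on the multi-stream combination used above: the Extend() loops compute several CRC and PCLMULQDQ streams independently and then merge them via ComputeZeroConstant() and multiply(), relying on the identity that the CRC of concatenated blocks can be rebuilt from per-block CRCs. The following is a minimal, self-contained C++ sketch of that identity using a deliberately slow bitwise CRC32C. Crc32cUpdate and ExtendByZeros are illustrative names for this note only (they are not Abseil APIs), and the sketch assumes the raw-register convention (no initial or final inversion) that the hardware CRC32 intrinsics operate on.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <string>

namespace {

// Bitwise CRC32C over the reflected polynomial 0x82F63B78, updating a raw
// register with no initial or final inversion. Deliberately slow and simple.
uint32_t Crc32cUpdate(uint32_t crc, const void* data, size_t n) {
  const uint8_t* p = static_cast<const uint8_t*>(data);
  for (size_t i = 0; i < n; ++i) {
    crc ^= p[i];
    for (int bit = 0; bit < 8; ++bit) {
      crc = (crc >> 1) ^ ((crc & 1u) ? 0x82F63B78u : 0u);
    }
  }
  return crc;
}

// Advance a CRC state over n zero bytes. The accelerated code gets the same
// effect from one multiply by a precomputed constant (ComputeZeroConstant);
// feeding literal zero bytes keeps the sketch self-evident.
uint32_t ExtendByZeros(uint32_t crc, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    const uint8_t zero = 0;
    crc = Crc32cUpdate(crc, &zero, 1);
  }
  return crc;
}

}  // namespace

int main() {
  const std::string a(100, 'a');  // first stream
  const std::string b(37, 'b');   // second stream

  // CRC of the concatenation, processed sequentially.
  const uint32_t sequential = Crc32cUpdate(
      Crc32cUpdate(0, a.data(), a.size()), b.data(), b.size());

  // CRCs of each stream computed independently, then combined:
  //   crc(A || B) == ExtendByZeros(crc(A), len(B)) ^ crc(B)
  const uint32_t crc_a = Crc32cUpdate(0, a.data(), a.size());
  const uint32_t crc_b = Crc32cUpdate(0, b.data(), b.size());
  const uint32_t combined = ExtendByZeros(crc_a, b.size()) ^ crc_b;

  assert(sequential == combined);
  (void)sequential;
  (void)combined;
  return 0;
}

The production code gets the ExtendByZeros step from a single carry-less multiplication by a precomputed power of x modulo the CRC32C polynomial (the magic value returned by ComputeZeroConstant), so merging streams costs a few instructions rather than a pass over zero bytes.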