path: root/absl/crc/internal/crc_x86_arm_combined.cc
author    Derek Mauro <dmauro@google.com>    2022-11-30 10:58:38 -0800
committer Copybara-Service <copybara-worker@google.com>    2022-11-30 10:59:21 -0800
commit    66665d8d2e3fedff340b83f9841ca427145a7b26 (patch)
tree      747602a57a578dc3220d8e61298184c43bbf8deb /absl/crc/internal/crc_x86_arm_combined.cc
parent    94e9ee3f75a617403049a29e6c932b8b3bb13177 (diff)
Fixes many compilation issues that come from having no external CI
coverage of the accelerated CRC implementation and some differences
between the internal and external implementations. This change adds CI
coverage to the linux_clang-latest_libstdcxx_bazel.sh script, assuming
this script always runs on machines of at least the Intel Haswell
generation.

Fixes include:
* Remove the use of the deprecated xor operator on crc32c_t
* Remove #pragma unroll_completely, which isn't known by GCC or Clang:
  https://godbolt.org/z/97j4vbacs
* Fixes for -Wsign-compare, -Wsign-conversion and -Wshorten-64-to-32

PiperOrigin-RevId: 491965029
Change-Id: Ic5e1f3a20f69fcd35fe81ebef63443ad26bf7931
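As a quick illustration of the -Wsign-conversion / -Wshorten-64-to-32 fixes in the
diff below, here is a minimal, self-contained sketch of the narrowing pattern the
change applies before every hardware CRC call. HwCrc32U64 and Step8 are hypothetical
stand-ins for the CRC32_u64 wrapper and the ABSL_INTERNAL_STEP8 macro; this is a
sketch, not Abseil code.

#include <cstdint>
#include <cstring>

// Hypothetical stand-in for the CRC32_u64 wrapper used in the diff; the real
// wrapper maps to a hardware instruction (_mm_crc32_u64 on x86, __crc32cd on
// ARM). The body here is only a placeholder so the sketch compiles on its own.
inline uint32_t HwCrc32U64(uint32_t crc, uint64_t data) {
  return crc ^ static_cast<uint32_t>(data) ^ static_cast<uint32_t>(data >> 32);
}

// Mirrors the reworked ABSL_INTERNAL_STEP8: the running crc is held in a
// uint64_t, but the instruction takes a 32-bit seed, so the value is narrowed
// with an explicit static_cast instead of the implicit conversion that
// -Wshorten-64-to-32 and -Wsign-conversion warn about.
inline uint64_t Step8(uint64_t crc, const uint8_t*& p) {
  uint64_t word;
  std::memcpy(&word, p, sizeof(word));  // unaligned 8-byte load
  crc = HwCrc32U64(static_cast<uint32_t>(crc), word);
  p += 8;
  return crc;
}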
Diffstat (limited to 'absl/crc/internal/crc_x86_arm_combined.cc')
-rw-r--r--  absl/crc/internal/crc_x86_arm_combined.cc  392
1 file changed, 211 insertions, 181 deletions
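On the #pragma unroll_completely point in the commit message: that pragma is not
recognized by GCC or Clang (see the godbolt link above), so it is simply dropped.
A hypothetical loop, not taken from this file, showing the portable alternatives:

#include <cstdint>

// A small fixed-trip-count loop of the kind such a pragma would annotate.
// Clang accepts "#pragma unroll" (or "#pragma clang loop unroll(full)") and
// GCC 8+ accepts "#pragma GCC unroll 8"; with no pragma at all, optimizers
// are generally able to unroll a constant 8-iteration loop on their own.
uint64_t SumEightWords(const uint64_t* p) {
  uint64_t acc = 0;
  // #pragma unroll_completely  // would trigger an unknown-pragma warning
  for (int i = 0; i < 8; ++i) {
    acc += p[i];
  }
  return acc;
}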
diff --git a/absl/crc/internal/crc_x86_arm_combined.cc b/absl/crc/internal/crc_x86_arm_combined.cc
index d771a786..2112f609 100644
--- a/absl/crc/internal/crc_x86_arm_combined.cc
+++ b/absl/crc/internal/crc_x86_arm_combined.cc
@@ -14,12 +14,11 @@
// Hardware accelerated CRC32 computation on Intel and ARM architecture.
-#include <stddef.h>
-
+#include <cstddef>
#include <cstdint>
#include "absl/base/attributes.h"
-#include "absl/base/call_once.h"
+#include "absl/base/config.h"
#include "absl/base/dynamic_annotations.h"
#include "absl/base/internal/endian.h"
#include "absl/base/internal/prefetch.h"
@@ -67,27 +66,30 @@ class CRC32AcceleratedX86ARMCombined : public CRC32 {
// Constants for switching between algorithms.
// Chosen by comparing speed at different powers of 2.
-constexpr int kSmallCutoff = 256;
-constexpr int kMediumCutoff = 2048;
+constexpr size_t kSmallCutoff = 256;
+constexpr size_t kMediumCutoff = 2048;
-#define ABSL_INTERNAL_STEP1(crc) \
- do { \
- crc = CRC32_u8(crc, *p++); \
+#define ABSL_INTERNAL_STEP1(crc) \
+ do { \
+ crc = CRC32_u8(static_cast<uint32_t>(crc), *p++); \
} while (0)
-#define ABSL_INTERNAL_STEP2(crc) \
- do { \
- crc = CRC32_u16(crc, absl::little_endian::Load16(p)); \
- p += 2; \
+#define ABSL_INTERNAL_STEP2(crc) \
+ do { \
+ crc = \
+ CRC32_u16(static_cast<uint32_t>(crc), absl::little_endian::Load16(p)); \
+ p += 2; \
} while (0)
-#define ABSL_INTERNAL_STEP4(crc) \
- do { \
- crc = CRC32_u32(crc, absl::little_endian::Load32(p)); \
- p += 4; \
+#define ABSL_INTERNAL_STEP4(crc) \
+ do { \
+ crc = \
+ CRC32_u32(static_cast<uint32_t>(crc), absl::little_endian::Load32(p)); \
+ p += 4; \
} while (0)
-#define ABSL_INTERNAL_STEP8(crc, data) \
- do { \
- crc = CRC32_u64(crc, absl::little_endian::Load64(data)); \
- data += 8; \
+#define ABSL_INTERNAL_STEP8(crc, data) \
+ do { \
+ crc = CRC32_u64(static_cast<uint32_t>(crc), \
+ absl::little_endian::Load64(data)); \
+ data += 8; \
} while (0)
#define ABSL_INTERNAL_STEP8BY2(crc0, crc1, p0, p1) \
do { \
@@ -101,6 +103,8 @@ constexpr int kMediumCutoff = 2048;
ABSL_INTERNAL_STEP8(crc2, p2); \
} while (0)
+namespace {
+
uint32_t multiply(uint32_t a, uint32_t b) {
V128 shifts = V128_From2x64(0, 1);
V128 power = V128_From2x64(0, a);
@@ -109,11 +113,10 @@ uint32_t multiply(uint32_t a, uint32_t b) {
// Combine crc values
res = V128_ShiftLeft64(res, shifts);
- return V128_Extract32<1>(res) ^ CRC32_u32(0, V128_Low64(res));
+ return static_cast<uint32_t>(V128_Extract32<1>(res)) ^
+ CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res)));
}
-namespace {
-
// Powers of crc32c polynomial, for faster ExtendByZeros.
// Verified against folly:
// folly/hash/detail/Crc32CombineDetail.cpp
@@ -204,9 +207,167 @@ enum class CutoffStrategy {
Unroll64CRC,
};
-template <int num_crc_streams, int num_pclmul_streams, CutoffStrategy strategy>
-class CRC32AcceleratedX86ARMCombinedMultipleStreams
+// Base class for CRC32AcceleratedX86ARMCombinedMultipleStreams containing the
+// methods and data that don't need the template arguments.
+class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
: public CRC32AcceleratedX86ARMCombined {
+ protected:
+ // Update partialCRC with crc of 64 byte block. Calling FinalizePclmulStream
+ // would produce a single crc checksum, but it is expensive. PCLMULQDQ has a
+ // high latency, so we run 4 128-bit partial checksums that can be reduced to
+ // a single value by FinalizePclmulStream later. Computing crc for arbitrary
+ // polynomials with PCLMULQDQ is described in Intel paper "Fast CRC
+ // Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+ // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+ // We are applying it to CRC32C polynomial.
+ ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
+ const uint8_t* p, V128* partialCRC) const {
+ V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2));
+
+ V128 partialCRC1 = partialCRC[0];
+ V128 partialCRC2 = partialCRC[1];
+ V128 partialCRC3 = partialCRC[2];
+ V128 partialCRC4 = partialCRC[3];
+
+ V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
+ V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
+ V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
+ V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
+ V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
+ V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
+ V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
+ V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
+ partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
+ partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
+ partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
+ partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
+ partialCRC1 = V128_Xor(tmp1, partialCRC1);
+ partialCRC2 = V128_Xor(tmp2, partialCRC2);
+ partialCRC3 = V128_Xor(tmp3, partialCRC3);
+ partialCRC4 = V128_Xor(tmp4, partialCRC4);
+ partialCRC1 = V128_Xor(partialCRC1, data1);
+ partialCRC2 = V128_Xor(partialCRC2, data2);
+ partialCRC3 = V128_Xor(partialCRC3, data3);
+ partialCRC4 = V128_Xor(partialCRC4, data4);
+ partialCRC[0] = partialCRC1;
+ partialCRC[1] = partialCRC2;
+ partialCRC[2] = partialCRC3;
+ partialCRC[3] = partialCRC4;
+ }
+
+ // Reduce partialCRC produced by Process64BytesPclmul into a single value,
+ // that represents crc checksum of all the processed bytes.
+ ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
+ FinalizePclmulStream(V128* partialCRC) const {
+ V128 partialCRC1 = partialCRC[0];
+ V128 partialCRC2 = partialCRC[1];
+ V128 partialCRC3 = partialCRC[2];
+ V128 partialCRC4 = partialCRC[3];
+
+ // Combine 4 vectors of partial crc into a single vector.
+ V128 reductionMultiplicands =
+ V128_Load(reinterpret_cast<const V128*>(k5k6));
+
+ V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
+ V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);
+
+ partialCRC1 = V128_Xor(low, high);
+ partialCRC1 = V128_Xor(partialCRC1, partialCRC2);
+
+ low = V128_PMulLow(reductionMultiplicands, partialCRC3);
+ high = V128_PMulHi(reductionMultiplicands, partialCRC3);
+
+ partialCRC3 = V128_Xor(low, high);
+ partialCRC3 = V128_Xor(partialCRC3, partialCRC4);
+
+ reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4));
+
+ low = V128_PMulLow(reductionMultiplicands, partialCRC1);
+ high = V128_PMulHi(reductionMultiplicands, partialCRC1);
+ V128 fullCRC = V128_Xor(low, high);
+ fullCRC = V128_Xor(fullCRC, partialCRC3);
+
+ // Reduce fullCRC into scalar value.
+ reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6));
+
+ V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask));
+
+ V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC);
+ fullCRC = V128_ShiftRight<8>(fullCRC);
+ fullCRC = V128_Xor(fullCRC, tmp);
+
+ reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0));
+
+ tmp = V128_ShiftRight<4>(fullCRC);
+ fullCRC = V128_And(fullCRC, mask);
+ fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC);
+ fullCRC = V128_Xor(tmp, fullCRC);
+
+ reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly));
+
+ tmp = V128_And(fullCRC, mask);
+ tmp = V128_PMul01(reductionMultiplicands, tmp);
+ tmp = V128_And(tmp, mask);
+ tmp = V128_PMulLow(reductionMultiplicands, tmp);
+
+ fullCRC = V128_Xor(tmp, fullCRC);
+
+ return static_cast<uint64_t>(V128_Extract32<1>(fullCRC));
+ }
+
+ // Update crc with 64 bytes of data from p.
+ ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
+ uint64_t crc) const {
+ for (int i = 0; i < 8; i++) {
+ crc =
+ CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p));
+ p += 8;
+ }
+ return crc;
+ }
+
+ // Generated by crc32c_x86_test --crc32c_generate_constants=true
+ // and verified against constants in linux kernel for S390:
+ // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S
+ alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8};
+ alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e};
+ alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6};
+ alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000};
+ alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1};
+ alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u};
+
+ // Medium runs of bytes are broken into groups of kGroupsSmall blocks of same
+ // size. Each group is CRCed in parallel then combined at the end of the
+ // block.
+ static constexpr size_t kGroupsSmall = 3;
+ // For large runs we use up to kMaxStreams blocks computed with CRC
+ // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
+ // are combined in the end.
+ static constexpr size_t kMaxStreams = 3;
+};
+
+#ifdef ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k1k2[2];
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k3k4[2];
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k5k6[2];
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k7k0[2];
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kPoly[2];
+alignas(16) constexpr uint32_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMask[4];
+constexpr size_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kGroupsSmall;
+constexpr size_t CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMaxStreams;
+#endif // ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL
+
+template <size_t num_crc_streams, size_t num_pclmul_streams,
+ CutoffStrategy strategy>
+class CRC32AcceleratedX86ARMCombinedMultipleStreams
+ : public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase {
ABSL_ATTRIBUTE_HOT
void Extend(uint32_t* crc, const void* bytes, size_t length) const override {
static_assert(num_crc_streams >= 1 && num_crc_streams <= kMaxStreams,
@@ -221,19 +382,19 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
// We have dedicated instruction for 1,2,4 and 8 bytes.
if (length & 8) {
ABSL_INTERNAL_STEP8(l, p);
- length &= ~8LL;
+ length &= ~size_t{8};
}
if (length & 4) {
ABSL_INTERNAL_STEP4(l);
- length &= ~4LL;
+ length &= ~size_t{4};
}
if (length & 2) {
ABSL_INTERNAL_STEP2(l);
- length &= ~2LL;
+ length &= ~size_t{2};
}
if (length & 1) {
ABSL_INTERNAL_STEP1(l);
- length &= ~1LL;
+ length &= ~size_t{1};
}
if (length == 0) {
*crc = l;
@@ -263,12 +424,12 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
if (strategy == CutoffStrategy::Fold3) {
uint64_t l641 = 0;
uint64_t l642 = 0;
- const int blockSize = 32;
- int64_t bs = (e - p) / kGroupsSmall / blockSize;
+ const size_t blockSize = 32;
+ size_t bs = static_cast<size_t>(e - p) / kGroupsSmall / blockSize;
const uint8_t* p1 = p + bs * blockSize;
const uint8_t* p2 = p1 + bs * blockSize;
- for (int64_t i = 0; i < bs - 1; ++i) {
+ for (size_t i = 0; i + 1 < bs; ++i) {
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
@@ -290,8 +451,9 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
V128 res2 = V128_PMul10(tmp, magic);
V128 x = V128_Xor(res1, res2);
- l64 = V128_Low64(x) ^ absl::little_endian::Load64(p2);
- l64 = CRC32_u64(l642, l64);
+ l64 = static_cast<uint64_t>(V128_Low64(x)) ^
+ absl::little_endian::Load64(p2);
+ l64 = CRC32_u64(static_cast<uint32_t>(l642), l64);
p = p2 + 8;
} else if (strategy == CutoffStrategy::Unroll64CRC) {
@@ -314,16 +476,17 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
ABSL_INTERNAL_STEP1(l);
}
- int64_t bs = (e - p) / (num_crc_streams + num_pclmul_streams) / 64;
+ size_t bs = static_cast<size_t>(e - p) /
+ (num_crc_streams + num_pclmul_streams) / 64;
const uint8_t* crc_streams[kMaxStreams];
const uint8_t* pclmul_streams[kMaxStreams];
// We are guaranteed to have at least one crc stream.
crc_streams[0] = p;
- for (int i = 1; i < num_crc_streams; i++) {
+ for (size_t i = 1; i < num_crc_streams; i++) {
crc_streams[i] = crc_streams[i - 1] + bs * 64;
}
pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64;
- for (int i = 1; i < num_pclmul_streams; i++) {
+ for (size_t i = 1; i < num_pclmul_streams; i++) {
pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64;
}
@@ -332,13 +495,13 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
uint64_t l64_pclmul[kMaxStreams] = {0};
// Peel first iteration, because the PCLMULQDQ stream needs setup.
- for (int i = 0; i < num_crc_streams; i++) {
+ for (size_t i = 0; i < num_crc_streams; i++) {
l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
crc_streams[i] += 16 * 4;
}
V128 partialCRC[kMaxStreams][4];
- for (int i = 0; i < num_pclmul_streams; i++) {
+ for (size_t i = 0; i < num_pclmul_streams; i++) {
partialCRC[i][0] = V128_LoadU(
reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 0));
partialCRC[i][1] = V128_LoadU(
@@ -350,13 +513,13 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
pclmul_streams[i] += 16 * 4;
}
- for (int64_t i = 1; i < bs; i++) {
+ for (size_t i = 1; i < bs; i++) {
// Prefetch data for next iterations.
- for (int j = 0; j < num_crc_streams; j++) {
+ for (size_t j = 0; j < num_crc_streams; j++) {
base_internal::PrefetchT0(
reinterpret_cast<const char*>(crc_streams[j] + kPrefetchHorizon));
}
- for (int j = 0; j < num_pclmul_streams; j++) {
+ for (size_t j = 0; j < num_pclmul_streams; j++) {
base_internal::PrefetchT0(reinterpret_cast<const char*>(
pclmul_streams[j] + kPrefetchHorizon));
}
@@ -398,19 +561,19 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
// PCLMULQDQ based streams require special final step;
// CRC based don't.
- for (int i = 0; i < num_pclmul_streams; i++) {
+ for (size_t i = 0; i < num_pclmul_streams; i++) {
l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]);
}
// Combine all streams into single result.
uint32_t magic = ComputeZeroConstant(bs * 64);
l64 = l64_crc[0];
- for (int i = 1; i < num_crc_streams; i++) {
- l64 = multiply(l64, magic);
+ for (size_t i = 1; i < num_crc_streams; i++) {
+ l64 = multiply(static_cast<uint32_t>(l64), magic);
l64 ^= l64_crc[i];
}
- for (int i = 0; i < num_pclmul_streams; i++) {
- l64 = multiply(l64, magic);
+ for (size_t i = 0; i < num_pclmul_streams; i++) {
+ l64 = multiply(static_cast<uint32_t>(l64), magic);
l64 ^= l64_pclmul[i];
}
@@ -421,7 +584,7 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
p = crc_streams[num_crc_streams - 1];
}
}
- l = l64;
+ l = static_cast<uint32_t>(l64);
while ((e - p) >= 16) {
ABSL_INTERNAL_STEP8(l, p);
@@ -441,139 +604,6 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
*crc = l;
}
-
- private:
- // Update partialCRC with crc of 64 byte block. Calling FinalizePclmulStream
- // would produce a single crc checksum, but it is expensive. PCLMULQDQ has a
- // high latency, so we run 4 128-bit partial checksums that can be reduced to
- // a single value by FinalizePclmulStream later. Computing crc for arbitrary
- // polynomials with PCLMULQDQ is described in Intel paper "Fast CRC
- // Computation for Generic Polynomials Using PCLMULQDQ Instruction"
- // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
- // We are applying it to CRC32C polynomial.
- ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
- const uint8_t* p, V128* partialCRC) const {
- V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2));
-
- V128 partialCRC1 = partialCRC[0];
- V128 partialCRC2 = partialCRC[1];
- V128 partialCRC3 = partialCRC[2];
- V128 partialCRC4 = partialCRC[3];
-
- V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
- V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
- V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
- V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
- V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
- V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
- V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
- V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
- partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
- partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
- partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
- partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
- partialCRC1 = V128_Xor(tmp1, partialCRC1);
- partialCRC2 = V128_Xor(tmp2, partialCRC2);
- partialCRC3 = V128_Xor(tmp3, partialCRC3);
- partialCRC4 = V128_Xor(tmp4, partialCRC4);
- partialCRC1 = V128_Xor(partialCRC1, data1);
- partialCRC2 = V128_Xor(partialCRC2, data2);
- partialCRC3 = V128_Xor(partialCRC3, data3);
- partialCRC4 = V128_Xor(partialCRC4, data4);
- partialCRC[0] = partialCRC1;
- partialCRC[1] = partialCRC2;
- partialCRC[2] = partialCRC3;
- partialCRC[3] = partialCRC4;
- }
-
- // Reduce partialCRC produced by Process64BytesPclmul into a single value,
- // that represents crc checksum of all the processed bytes.
- ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
- FinalizePclmulStream(V128* partialCRC) const {
- V128 partialCRC1 = partialCRC[0];
- V128 partialCRC2 = partialCRC[1];
- V128 partialCRC3 = partialCRC[2];
- V128 partialCRC4 = partialCRC[3];
-
- // Combine 4 vectors of partial crc into a single vector.
- V128 reductionMultiplicands =
- V128_Load(reinterpret_cast<const V128*>(k5k6));
-
- V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
- V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);
-
- partialCRC1 = V128_Xor(low, high);
- partialCRC1 = V128_Xor(partialCRC1, partialCRC2);
-
- low = V128_PMulLow(reductionMultiplicands, partialCRC3);
- high = V128_PMulHi(reductionMultiplicands, partialCRC3);
-
- partialCRC3 = V128_Xor(low, high);
- partialCRC3 = V128_Xor(partialCRC3, partialCRC4);
-
- reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4));
-
- low = V128_PMulLow(reductionMultiplicands, partialCRC1);
- high = V128_PMulHi(reductionMultiplicands, partialCRC1);
- V128 fullCRC = V128_Xor(low, high);
- fullCRC = V128_Xor(fullCRC, partialCRC3);
-
- // Reduce fullCRC into scalar value.
- reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6));
-
- V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask));
-
- V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC);
- fullCRC = V128_ShiftRight<8>(fullCRC);
- fullCRC = V128_Xor(fullCRC, tmp);
-
- reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0));
-
- tmp = V128_ShiftRight<4>(fullCRC);
- fullCRC = V128_And(fullCRC, mask);
- fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC);
- fullCRC = V128_Xor(tmp, fullCRC);
-
- reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly));
-
- tmp = V128_And(fullCRC, mask);
- tmp = V128_PMul01(reductionMultiplicands, tmp);
- tmp = V128_And(tmp, mask);
- tmp = V128_PMulLow(reductionMultiplicands, tmp);
-
- fullCRC = V128_Xor(tmp, fullCRC);
-
- return V128_Extract32<1>(fullCRC);
- }
-
- // Update crc with 64 bytes of data from p.
- ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
- uint64_t crc) const {
- for (int i = 0; i < 8; i++) {
- crc = CRC32_u64(crc, absl::little_endian::Load64(p));
- p += 8;
- }
- return crc;
- }
-
- // Generated by crc32c_x86_test --crc32c_generate_constants=true
- // and verified against constants in linux kernel for S390:
- // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S
- alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8};
- alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e};
- alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6};
- alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000};
- alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1};
- alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u};
-
- // Medium runs of bytes are broken into groups of kGroupsSmall blocks of same
- // size. Each group is CRCed in parallel then combined at the end of the
- // block.
- static constexpr int kGroupsSmall = 3;
- // For large runs we use up to kMaxStreams blocks computed with CRC
- // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
- // are combined in the end.
- static constexpr int kMaxStreams = 3;
};
} // namespace
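The largest structural change in the hunks above is that the helpers and constant
tables which do not depend on the template parameters move out of
CRC32AcceleratedX86ARMCombinedMultipleStreams and into the new non-template base
CRC32AcceleratedX86ARMCombinedMultipleStreamsBase. This is the usual "hoist into a
non-template base" pattern: the static tables then exist once rather than once per
<num_crc_streams, num_pclmul_streams, strategy> instantiation, and the
ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL out-of-line definitions visible above
can be written once, without templated boilerplate. A toy sketch of the pattern,
with invented names and values:

#include <cstddef>
#include <cstdint>

// Template-independent pieces live in a plain base class, so there is exactly
// one copy of the static tables shared by every instantiation below. (Before
// C++17, only one set of out-of-line constexpr definitions would be needed,
// which is what the ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL block in the
// diff handles.)
class CrcStreamsBase {
 protected:
  alignas(16) static constexpr uint64_t kFoldConstants[2] = {0x123, 0x456};  // toy values
  static constexpr size_t kMaxStreams = 3;
};

template <size_t kNumStreams>
class CrcStreams : public CrcStreamsBase {
  static_assert(kNumStreams >= 1 && kNumStreams <= kMaxStreams,
                "invalid number of streams");

 public:
  size_t num_streams() const { return kNumStreams; }
};

// Both instantiations share the base's tables rather than stamping out copies.
CrcStreams<1> one_stream;
CrcStreams<3> three_streams;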