Diffstat (limited to 'absl/crc/internal')
-rw-r--r--  absl/crc/internal/crc32_x86_arm_combined_simd.h |   4
-rw-r--r--  absl/crc/internal/crc32c_inline.h               |   2
-rw-r--r--  absl/crc/internal/crc_memcpy_x86_64.cc          | 109
-rw-r--r--  absl/crc/internal/crc_x86_arm_combined.cc       | 392
-rw-r--r--  absl/crc/internal/non_temporal_memcpy.h         |   8
5 files changed, 262 insertions(+), 253 deletions(-)
diff --git a/absl/crc/internal/crc32_x86_arm_combined_simd.h b/absl/crc/internal/crc32_x86_arm_combined_simd.h
index 8f15ded3..f6c2a21c 100644
--- a/absl/crc/internal/crc32_x86_arm_combined_simd.h
+++ b/absl/crc/internal/crc32_x86_arm_combined_simd.h
@@ -129,7 +129,7 @@ inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
}
inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
- return _mm_crc32_u64(crc, v);
+ return static_cast<uint32_t>(_mm_crc32_u64(crc, v));
}
inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
@@ -157,7 +157,7 @@ inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); }
inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); }
inline V128 V128_From2x64(const uint64_t l, const uint64_t r) {
- return _mm_set_epi64x(l, r);
+ return _mm_set_epi64x(static_cast<int64_t>(l), static_cast<int64_t>(r));
}
template <int imm>
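
The two changes in this header keep the SSE intrinsic calls clean under -Wconversion: _mm_crc32_u64 returns a 64-bit value even though the CRC32C state is only 32 bits wide, and _mm_set_epi64x takes signed int64_t lane arguments. A minimal standalone sketch of the same pattern (illustrative wrapper names, not part of the patch; build with SSE4.2 enabled, e.g. -msse4.2):

#include <cstdint>
#include <emmintrin.h>   // SSE2: _mm_set_epi64x, __m128i
#include <nmmintrin.h>   // SSE4.2: _mm_crc32_u64

// The intrinsic returns 64-bit data whose upper half is zero, so narrowing
// back to the 32-bit CRC state is made explicit.
inline uint32_t Crc32cExtendU64(uint32_t crc, uint64_t v) {
  return static_cast<uint32_t>(_mm_crc32_u64(crc, v));
}

// _mm_set_epi64x takes signed lanes; the casts document the intended
// bit-pattern reinterpretation of the unsigned inputs.
inline __m128i VectorFromTwoU64(uint64_t hi, uint64_t lo) {
  return _mm_set_epi64x(static_cast<int64_t>(hi), static_cast<int64_t>(lo));
}
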
diff --git a/absl/crc/internal/crc32c_inline.h b/absl/crc/internal/crc32c_inline.h
index 43ad14f4..6236c10b 100644
--- a/absl/crc/internal/crc32c_inline.h
+++ b/absl/crc/internal/crc32c_inline.h
@@ -34,7 +34,7 @@ inline bool ExtendCrc32cInline(uint32_t* crc, const char* p, size_t n) {
constexpr uint32_t kCrc32Xor = 0xffffffffU;
*crc ^= kCrc32Xor;
if (n & 1) {
- *crc = CRC32_u8(*crc, *p);
+ *crc = CRC32_u8(*crc, static_cast<uint8_t>(*p));
n--;
p++;
}
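
The cast on *p addresses the same warning class: char may be signed, so the byte is reinterpreted as uint8_t before being fed to CRC32_u8. A tiny illustration (not from the patch):

#include <cstdint>

// With a signed char, '\xff' would convert to -1 as an integer; casting via
// uint8_t preserves the byte value 255 that the CRC instruction should see.
inline uint8_t ByteForCrc(const char* p) {
  return static_cast<uint8_t>(*p);
}
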
diff --git a/absl/crc/internal/crc_memcpy_x86_64.cc b/absl/crc/internal/crc_memcpy_x86_64.cc
index 1c3299ee..a148fe17 100644
--- a/absl/crc/internal/crc_memcpy_x86_64.cc
+++ b/absl/crc/internal/crc_memcpy_x86_64.cc
@@ -82,21 +82,18 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
return crc32c_t{crc_uint32};
}
-constexpr int kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);
+constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);
// Common function for copying the tails of multiple large regions.
-template <int vec_regions, int int_regions>
+template <size_t vec_regions, size_t int_regions>
inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
size_t region_size, size_t copy_rounds) {
std::array<__m128i, vec_regions> data;
std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;
while (copy_rounds > 0) {
-#ifdef __GNUC__
-#pragma unroll_completely
-#endif
- for (int i = 0; i < vec_regions; i++) {
- int region = i;
+ for (size_t i = 0; i < vec_regions; i++) {
+ size_t region = i;
auto* vsrc =
reinterpret_cast<const __m128i*>(*src + region_size * region);
@@ -109,27 +106,23 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
_mm_store_si128(vdst, data[i]);
// Compute the running CRC
- crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
- static_cast<uint32_t>(crcs[region]), _mm_extract_epi64(data[i], 0)))};
- crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
- static_cast<uint32_t>(crcs[region]), _mm_extract_epi64(data[i], 1)))};
+ crcs[region] = crc32c_t{static_cast<uint32_t>(
+ _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))};
+ crcs[region] = crc32c_t{static_cast<uint32_t>(
+ _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))};
}
-#ifdef __GNUC__
-#pragma unroll_completely
-#endif
- for (int i = 0; i < int_regions; i++) {
- int region = vec_regions + i;
+ for (size_t i = 0; i < int_regions; i++) {
+ size_t region = vec_regions + i;
auto* usrc =
reinterpret_cast<const uint64_t*>(*src + region_size * region);
auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region);
-#ifdef __GNUC__
-#pragma unroll_completely
-#endif
- for (int j = 0; j < kIntLoadsPerVec; j++) {
- int data_index = i * kIntLoadsPerVec + j;
+ for (size_t j = 0; j < kIntLoadsPerVec; j++) {
+ size_t data_index = i * kIntLoadsPerVec + j;
int_data[data_index] = *(usrc + j);
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
@@ -148,7 +141,7 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
} // namespace
-template <int vec_regions, int int_regions>
+template <size_t vec_regions, size_t int_regions>
class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine {
public:
AcceleratedCrcMemcpyEngine() = default;
@@ -160,12 +153,12 @@ class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine {
std::size_t length, crc32c_t initial_crc) const override;
};
-template <int vec_regions, int int_regions>
+template <size_t vec_regions, size_t int_regions>
crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
void* __restrict dst, const void* __restrict src, std::size_t length,
crc32c_t initial_crc) const {
constexpr std::size_t kRegions = vec_regions + int_regions;
- constexpr crc32c_t kCrcDataXor = crc32c_t{0xffffffff};
+ constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
constexpr std::size_t kBlockSize = sizeof(__m128i);
constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;
@@ -201,7 +194,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Start work on the CRC: undo the XOR from the previous calculation or set up
// the initial value of the CRC.
// initial_crc ^= kCrcDataXor;
- initial_crc = initial_crc ^ kCrcDataXor;
+ initial_crc = crc32c_t{static_cast<uint32_t>(initial_crc) ^ kCrcDataXor};
// Do an initial alignment copy, so we can use aligned store instructions to
// the destination pointer. We align the destination pointer because the
@@ -229,13 +222,13 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Initialize CRCs for kRegions regions.
crc32c_t crcs[kRegions];
crcs[0] = initial_crc;
- for (int i = 1; i < kRegions; i++) {
- crcs[i] = kCrcDataXor;
+ for (size_t i = 1; i < kRegions; i++) {
+ crcs[i] = crc32c_t{kCrcDataXor};
}
// Find the number of rounds to copy and the region size. Also compute the
// tail size here.
- int64_t copy_rounds = length / kCopyRoundSize;
+ size_t copy_rounds = length / kCopyRoundSize;
// Find the size of each region and the size of the tail.
const std::size_t region_size = copy_rounds * kBlockSize;
@@ -248,10 +241,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Main loop.
while (copy_rounds > kBlocksPerCacheLine) {
// Prefetch kPrefetchAhead bytes ahead of each pointer.
-#ifdef __GNUC__
-#pragma unroll_completely
-#endif
- for (int i = 0; i < kRegions; i++) {
+ for (size_t i = 0; i < kRegions; i++) {
absl::base_internal::PrefetchT0(src_bytes + kPrefetchAhead +
region_size * i);
absl::base_internal::PrefetchT0(dst_bytes + kPrefetchAhead +
@@ -259,58 +249,46 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
}
// Load and store data, computing CRC on the way.
-#ifdef __GNUC__
-#pragma unroll_completely
-#endif
- for (int i = 0; i < kBlocksPerCacheLine; i++) {
+ for (size_t i = 0; i < kBlocksPerCacheLine; i++) {
// Copy and CRC the data for the CRC regions.
-#ifdef __GNUC__
-#pragma unroll_completely
-#endif
- for (int j = 0; j < vec_regions; j++) {
+ for (size_t j = 0; j < vec_regions; j++) {
// Cycle which regions get vector load/store and integer load/store, to
// engage prefetching logic around vector load/stores and save issue
// slots by using the integer registers.
- int region = (j + i) % kRegions;
+ size_t region = (j + i) % kRegions;
- auto* src =
+ auto* vsrc =
reinterpret_cast<const __m128i*>(src_bytes + region_size * region);
- auto* dst =
+ auto* vdst =
reinterpret_cast<__m128i*>(dst_bytes + region_size * region);
// Load and CRC data.
- vec_data[j] = _mm_loadu_si128(src + i);
- crcs[region] = crc32c_t{static_cast<uint32_t>(
- _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
- _mm_extract_epi64(vec_data[j], 0)))};
- crcs[region] = crc32c_t{static_cast<uint32_t>(
- _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
- _mm_extract_epi64(vec_data[j], 1)))};
+ vec_data[j] = _mm_loadu_si128(vsrc + i);
+ crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+ static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))};
+ crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+ static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))};
// Store the data.
- _mm_store_si128(dst + i, vec_data[j]);
+ _mm_store_si128(vdst + i, vec_data[j]);
}
// Preload the partial CRCs for the CLMUL subregions.
-#ifdef __GNUC__
-#pragma unroll_completely
-#endif
- for (int j = 0; j < int_regions; j++) {
+ for (size_t j = 0; j < int_regions; j++) {
// Cycle which regions get vector load/store and integer load/store, to
// engage prefetching logic around vector load/stores and save issue
// slots by using the integer registers.
- int region = (j + vec_regions + i) % kRegions;
+ size_t region = (j + vec_regions + i) % kRegions;
auto* usrc =
reinterpret_cast<const uint64_t*>(src_bytes + region_size * region);
auto* udst =
reinterpret_cast<uint64_t*>(dst_bytes + region_size * region);
-#ifdef __GNUC__
-#pragma unroll_completely
-#endif
- for (int k = 0; k < kIntLoadsPerVec; k++) {
- int data_index = j * kIntLoadsPerVec + k;
+ for (size_t k = 0; k < kIntLoadsPerVec; k++) {
+ size_t data_index = j * kIntLoadsPerVec + k;
// Load and CRC the data.
int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
@@ -339,13 +317,13 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the
// XOR done before doing block copy + CRCs.
- for (int i = 0; i < kRegions - 1; i++) {
- crcs[i] = crcs[i] ^ kCrcDataXor;
+ for (size_t i = 0; i + 1 < kRegions; i++) {
+ crcs[i] = crc32c_t{static_cast<uint32_t>(crcs[i]) ^ kCrcDataXor};
}
// Build a CRC of the first kRegions - 1 regions.
crc32c_t full_crc = crcs[0];
- for (int i = 1; i < kRegions - 1; i++) {
+ for (size_t i = 1; i + 1 < kRegions; i++) {
full_crc = ConcatCrc32c(full_crc, crcs[i], region_size);
}
@@ -360,7 +338,8 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
crcs[kRegions - 1]);
// Finalize and concatenate the final CRC, then return.
- crcs[kRegions - 1] = crcs[kRegions - 1] ^ kCrcDataXor;
+ crcs[kRegions - 1] =
+ crc32c_t{static_cast<uint32_t>(crcs[kRegions - 1]) ^ kCrcDataXor};
return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size);
}
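
The engine above folds CRC32C computation into the copy itself and splits the buffer into regions so several CRC dependency chains run in parallel; this revision switches the loop indices and template parameters to size_t, drops the unroll pragmas, and adds casts around _mm_crc32_u64 / _mm_extract_epi64 for -Wconversion. A single-region sketch of the copy-and-CRC fold (region splitting, prefetching, and CRC concatenation omitted; the function name is illustrative, SSE4.2 assumed):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <nmmintrin.h>  // _mm_crc32_u64, _mm_crc32_u8

inline uint32_t CrcMemcpyWord(void* dst, const void* src, size_t n,
                              uint32_t crc) {
  auto* d = static_cast<char*>(dst);
  auto* s = static_cast<const char*>(src);
  while (n >= sizeof(uint64_t)) {
    uint64_t word;
    std::memcpy(&word, s, sizeof(word));                    // load
    std::memcpy(d, &word, sizeof(word));                    // store
    crc = static_cast<uint32_t>(_mm_crc32_u64(crc, word));  // fold into CRC
    s += sizeof(word);
    d += sizeof(word);
    n -= sizeof(word);
  }
  while (n > 0) {                                           // tail bytes
    crc = _mm_crc32_u8(crc, static_cast<uint8_t>(*s));
    *d++ = *s++;
    --n;
  }
  return crc;
}
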
diff --git a/absl/crc/internal/crc_x86_arm_combined.cc b/absl/crc/internal/crc_x86_arm_combined.cc
index d771a786..2112f609 100644
--- a/absl/crc/internal/crc_x86_arm_combined.cc
+++ b/absl/crc/internal/crc_x86_arm_combined.cc
@@ -14,12 +14,11 @@
// Hardware accelerated CRC32 computation on Intel and ARM architecture.
-#include <stddef.h>
-
+#include <cstddef>
#include <cstdint>
#include "absl/base/attributes.h"
-#include "absl/base/call_once.h"
+#include "absl/base/config.h"
#include "absl/base/dynamic_annotations.h"
#include "absl/base/internal/endian.h"
#include "absl/base/internal/prefetch.h"
@@ -67,27 +66,30 @@ class CRC32AcceleratedX86ARMCombined : public CRC32 {
// Constants for switching between algorithms.
// Chosen by comparing speed at different powers of 2.
-constexpr int kSmallCutoff = 256;
-constexpr int kMediumCutoff = 2048;
+constexpr size_t kSmallCutoff = 256;
+constexpr size_t kMediumCutoff = 2048;
-#define ABSL_INTERNAL_STEP1(crc) \
- do { \
- crc = CRC32_u8(crc, *p++); \
+#define ABSL_INTERNAL_STEP1(crc) \
+ do { \
+ crc = CRC32_u8(static_cast<uint32_t>(crc), *p++); \
} while (0)
-#define ABSL_INTERNAL_STEP2(crc) \
- do { \
- crc = CRC32_u16(crc, absl::little_endian::Load16(p)); \
- p += 2; \
+#define ABSL_INTERNAL_STEP2(crc) \
+ do { \
+ crc = \
+ CRC32_u16(static_cast<uint32_t>(crc), absl::little_endian::Load16(p)); \
+ p += 2; \
} while (0)
-#define ABSL_INTERNAL_STEP4(crc) \
- do { \
- crc = CRC32_u32(crc, absl::little_endian::Load32(p)); \
- p += 4; \
+#define ABSL_INTERNAL_STEP4(crc) \
+ do { \
+ crc = \
+ CRC32_u32(static_cast<uint32_t>(crc), absl::little_endian::Load32(p)); \
+ p += 4; \
} while (0)
-#define ABSL_INTERNAL_STEP8(crc, data) \
- do { \
- crc = CRC32_u64(crc, absl::little_endian::Load64(data)); \
- data += 8; \
+#define ABSL_INTERNAL_STEP8(crc, data) \
+ do { \
+ crc = CRC32_u64(static_cast<uint32_t>(crc), \
+ absl::little_endian::Load64(data)); \
+ data += 8; \
} while (0)
#define ABSL_INTERNAL_STEP8BY2(crc0, crc1, p0, p1) \
do { \
@@ -101,6 +103,8 @@ constexpr int kMediumCutoff = 2048;
ABSL_INTERNAL_STEP8(crc2, p2); \
} while (0)
+namespace {
+
uint32_t multiply(uint32_t a, uint32_t b) {
V128 shifts = V128_From2x64(0, 1);
V128 power = V128_From2x64(0, a);
@@ -109,11 +113,10 @@ uint32_t multiply(uint32_t a, uint32_t b) {
// Combine crc values
res = V128_ShiftLeft64(res, shifts);
- return V128_Extract32<1>(res) ^ CRC32_u32(0, V128_Low64(res));
+ return static_cast<uint32_t>(V128_Extract32<1>(res)) ^
+ CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res)));
}
-namespace {
-
// Powers of crc32c polynomial, for faster ExtendByZeros.
// Verified against folly:
// folly/hash/detail/Crc32CombineDetail.cpp
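
multiply() and the polynomial-power constants exist to combine CRCs computed over separate byte ranges: because CRC32C is linear over GF(2), the raw checksum (no init/final XOR) of a concatenation can be rebuilt from per-range checksums by extending the first one with zero bytes and XORing in the second. A slow bitwise model of that identity, not part of the patch, just to show what ExtendByZeros/ConcatCrc32c rely on:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Bit-at-a-time CRC32C (reflected polynomial 0x82F63B78) with a caller-
// supplied starting state and no init/final XOR.
uint32_t RawCrc32c(uint32_t state, const uint8_t* p, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    state ^= p[i];
    for (int b = 0; b < 8; ++b) {
      state = (state >> 1) ^ (0x82F63B78u & (0u - (state & 1u)));
    }
  }
  return state;
}

int main() {
  const uint8_t msg[] = "hello, crc world";
  const size_t n = sizeof(msg) - 1;
  const size_t split = 5;  // A = msg[0, split), B = msg[split, n)
  uint32_t whole = RawCrc32c(0, msg, n);
  uint32_t crc_a = RawCrc32c(0, msg, split);
  uint32_t crc_b = RawCrc32c(0, msg + split, n - split);
  // Extending crc_a by |B| zero bytes plays the role of multiply() with a
  // precomputed power of x; XORing crc_b then reproduces the whole-buffer CRC.
  std::vector<uint8_t> zeros(n - split, 0);
  uint32_t combined = RawCrc32c(crc_a, zeros.data(), zeros.size()) ^ crc_b;
  std::printf("%08x %08x\n", whole, combined);  // prints the same value twice
  return 0;
}
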
@@ -204,9 +207,167 @@ enum class CutoffStrategy {
Unroll64CRC,
};
-template <int num_crc_streams, int num_pclmul_streams, CutoffStrategy strategy>
-class CRC32AcceleratedX86ARMCombinedMultipleStreams
+// Base class for CRC32AcceleratedX86ARMCombinedMultipleStreams containing the
+// methods and data that don't need the template arguments.
+class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
: public CRC32AcceleratedX86ARMCombined {
+ protected:
+ // Update partialCRC with crc of 64 byte block. Calling FinalizePclmulStream
+ // would produce a single crc checksum, but it is expensive. PCLMULQDQ has a
+ // high latency, so we run 4 128-bit partial checksums that can be reduced to
+ // a single value by FinalizePclmulStream later. Computing crc for arbitrary
+ // polynomialas with PCLMULQDQ is described in Intel paper "Fast CRC
+ // Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+ // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+ // We are applying it to CRC32C polynomial.
+ ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
+ const uint8_t* p, V128* partialCRC) const {
+ V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2));
+
+ V128 partialCRC1 = partialCRC[0];
+ V128 partialCRC2 = partialCRC[1];
+ V128 partialCRC3 = partialCRC[2];
+ V128 partialCRC4 = partialCRC[3];
+
+ V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
+ V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
+ V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
+ V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
+ V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
+ V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
+ V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
+ V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
+ partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
+ partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
+ partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
+ partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
+ partialCRC1 = V128_Xor(tmp1, partialCRC1);
+ partialCRC2 = V128_Xor(tmp2, partialCRC2);
+ partialCRC3 = V128_Xor(tmp3, partialCRC3);
+ partialCRC4 = V128_Xor(tmp4, partialCRC4);
+ partialCRC1 = V128_Xor(partialCRC1, data1);
+ partialCRC2 = V128_Xor(partialCRC2, data2);
+ partialCRC3 = V128_Xor(partialCRC3, data3);
+ partialCRC4 = V128_Xor(partialCRC4, data4);
+ partialCRC[0] = partialCRC1;
+ partialCRC[1] = partialCRC2;
+ partialCRC[2] = partialCRC3;
+ partialCRC[3] = partialCRC4;
+ }
+
+ // Reduce partialCRC produced by Process64BytesPclmul into a single value,
+ // that represents crc checksum of all the processed bytes.
+ ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
+ FinalizePclmulStream(V128* partialCRC) const {
+ V128 partialCRC1 = partialCRC[0];
+ V128 partialCRC2 = partialCRC[1];
+ V128 partialCRC3 = partialCRC[2];
+ V128 partialCRC4 = partialCRC[3];
+
+ // Combine 4 vectors of partial crc into a single vector.
+ V128 reductionMultiplicands =
+ V128_Load(reinterpret_cast<const V128*>(k5k6));
+
+ V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
+ V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);
+
+ partialCRC1 = V128_Xor(low, high);
+ partialCRC1 = V128_Xor(partialCRC1, partialCRC2);
+
+ low = V128_PMulLow(reductionMultiplicands, partialCRC3);
+ high = V128_PMulHi(reductionMultiplicands, partialCRC3);
+
+ partialCRC3 = V128_Xor(low, high);
+ partialCRC3 = V128_Xor(partialCRC3, partialCRC4);
+
+ reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4));
+
+ low = V128_PMulLow(reductionMultiplicands, partialCRC1);
+ high = V128_PMulHi(reductionMultiplicands, partialCRC1);
+ V128 fullCRC = V128_Xor(low, high);
+ fullCRC = V128_Xor(fullCRC, partialCRC3);
+
+ // Reduce fullCRC into scalar value.
+ reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6));
+
+ V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask));
+
+ V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC);
+ fullCRC = V128_ShiftRight<8>(fullCRC);
+ fullCRC = V128_Xor(fullCRC, tmp);
+
+ reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0));
+
+ tmp = V128_ShiftRight<4>(fullCRC);
+ fullCRC = V128_And(fullCRC, mask);
+ fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC);
+ fullCRC = V128_Xor(tmp, fullCRC);
+
+ reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly));
+
+ tmp = V128_And(fullCRC, mask);
+ tmp = V128_PMul01(reductionMultiplicands, tmp);
+ tmp = V128_And(tmp, mask);
+ tmp = V128_PMulLow(reductionMultiplicands, tmp);
+
+ fullCRC = V128_Xor(tmp, fullCRC);
+
+ return static_cast<uint64_t>(V128_Extract32<1>(fullCRC));
+ }
+
+ // Update crc with 64 bytes of data from p.
+ ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
+ uint64_t crc) const {
+ for (int i = 0; i < 8; i++) {
+ crc =
+ CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p));
+ p += 8;
+ }
+ return crc;
+ }
+
+ // Generated by crc32c_x86_test --crc32c_generate_constants=true
+ // and verified against constants in linux kernel for S390:
+ // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S
+ alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8};
+ alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e};
+ alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6};
+ alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000};
+ alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1};
+ alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u};
+
+ // Medium runs of bytes are broken into groups of kGroupsSmall blocks of same
+ // size. Each group is CRCed in parallel then combined at the end of the
+ // block.
+ static constexpr size_t kGroupsSmall = 3;
+ // For large runs we use up to kMaxStreams blocks computed with CRC
+ // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
+ // are combined in the end.
+ static constexpr size_t kMaxStreams = 3;
+};
+
+#ifdef ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k1k2[2];
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k3k4[2];
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k5k6[2];
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k7k0[2];
+alignas(16) constexpr uint64_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kPoly[2];
+alignas(16) constexpr uint32_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMask[4];
+constexpr size_t
+ CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kGroupsSmall;
+constexpr size_t CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMaxStreams;
+#endif // ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL
+
+template <size_t num_crc_streams, size_t num_pclmul_streams,
+ CutoffStrategy strategy>
+class CRC32AcceleratedX86ARMCombinedMultipleStreams
+ : public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase {
ABSL_ATTRIBUTE_HOT
void Extend(uint32_t* crc, const void* bytes, size_t length) const override {
static_assert(num_crc_streams >= 1 && num_crc_streams <= kMaxStreams,
@@ -221,19 +382,19 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
// We have dedicated instruction for 1,2,4 and 8 bytes.
if (length & 8) {
ABSL_INTERNAL_STEP8(l, p);
- length &= ~8LL;
+ length &= ~size_t{8};
}
if (length & 4) {
ABSL_INTERNAL_STEP4(l);
- length &= ~4LL;
+ length &= ~size_t{4};
}
if (length & 2) {
ABSL_INTERNAL_STEP2(l);
- length &= ~2LL;
+ length &= ~size_t{2};
}
if (length & 1) {
ABSL_INTERNAL_STEP1(l);
- length &= ~1LL;
+ length &= ~size_t{1};
}
if (length == 0) {
*crc = l;
@@ -263,12 +424,12 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
if (strategy == CutoffStrategy::Fold3) {
uint64_t l641 = 0;
uint64_t l642 = 0;
- const int blockSize = 32;
- int64_t bs = (e - p) / kGroupsSmall / blockSize;
+ const size_t blockSize = 32;
+ size_t bs = static_cast<size_t>(e - p) / kGroupsSmall / blockSize;
const uint8_t* p1 = p + bs * blockSize;
const uint8_t* p2 = p1 + bs * blockSize;
- for (int64_t i = 0; i < bs - 1; ++i) {
+ for (size_t i = 0; i + 1 < bs; ++i) {
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
@@ -290,8 +451,9 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
V128 res2 = V128_PMul10(tmp, magic);
V128 x = V128_Xor(res1, res2);
- l64 = V128_Low64(x) ^ absl::little_endian::Load64(p2);
- l64 = CRC32_u64(l642, l64);
+ l64 = static_cast<uint64_t>(V128_Low64(x)) ^
+ absl::little_endian::Load64(p2);
+ l64 = CRC32_u64(static_cast<uint32_t>(l642), l64);
p = p2 + 8;
} else if (strategy == CutoffStrategy::Unroll64CRC) {
@@ -314,16 +476,17 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
ABSL_INTERNAL_STEP1(l);
}
- int64_t bs = (e - p) / (num_crc_streams + num_pclmul_streams) / 64;
+ size_t bs = static_cast<size_t>(e - p) /
+ (num_crc_streams + num_pclmul_streams) / 64;
const uint8_t* crc_streams[kMaxStreams];
const uint8_t* pclmul_streams[kMaxStreams];
// We are guaranteed to have at least one crc stream.
crc_streams[0] = p;
- for (int i = 1; i < num_crc_streams; i++) {
+ for (size_t i = 1; i < num_crc_streams; i++) {
crc_streams[i] = crc_streams[i - 1] + bs * 64;
}
pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64;
- for (int i = 1; i < num_pclmul_streams; i++) {
+ for (size_t i = 1; i < num_pclmul_streams; i++) {
pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64;
}
@@ -332,13 +495,13 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
uint64_t l64_pclmul[kMaxStreams] = {0};
// Peel first iteration, because PCLMULQDQ stream, needs setup.
- for (int i = 0; i < num_crc_streams; i++) {
+ for (size_t i = 0; i < num_crc_streams; i++) {
l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
crc_streams[i] += 16 * 4;
}
V128 partialCRC[kMaxStreams][4];
- for (int i = 0; i < num_pclmul_streams; i++) {
+ for (size_t i = 0; i < num_pclmul_streams; i++) {
partialCRC[i][0] = V128_LoadU(
reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 0));
partialCRC[i][1] = V128_LoadU(
@@ -350,13 +513,13 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
pclmul_streams[i] += 16 * 4;
}
- for (int64_t i = 1; i < bs; i++) {
+ for (size_t i = 1; i < bs; i++) {
// Prefetch data for next itterations.
- for (int j = 0; j < num_crc_streams; j++) {
+ for (size_t j = 0; j < num_crc_streams; j++) {
base_internal::PrefetchT0(
reinterpret_cast<const char*>(crc_streams[j] + kPrefetchHorizon));
}
- for (int j = 0; j < num_pclmul_streams; j++) {
+ for (size_t j = 0; j < num_pclmul_streams; j++) {
base_internal::PrefetchT0(reinterpret_cast<const char*>(
pclmul_streams[j] + kPrefetchHorizon));
}
@@ -398,19 +561,19 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
// PCLMULQDQ based streams require special final step;
// CRC based don't.
- for (int i = 0; i < num_pclmul_streams; i++) {
+ for (size_t i = 0; i < num_pclmul_streams; i++) {
l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]);
}
// Combine all streams into single result.
uint32_t magic = ComputeZeroConstant(bs * 64);
l64 = l64_crc[0];
- for (int i = 1; i < num_crc_streams; i++) {
- l64 = multiply(l64, magic);
+ for (size_t i = 1; i < num_crc_streams; i++) {
+ l64 = multiply(static_cast<uint32_t>(l64), magic);
l64 ^= l64_crc[i];
}
- for (int i = 0; i < num_pclmul_streams; i++) {
- l64 = multiply(l64, magic);
+ for (size_t i = 0; i < num_pclmul_streams; i++) {
+ l64 = multiply(static_cast<uint32_t>(l64), magic);
l64 ^= l64_pclmul[i];
}
@@ -421,7 +584,7 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
p = crc_streams[num_crc_streams - 1];
}
}
- l = l64;
+ l = static_cast<uint32_t>(l64);
while ((e - p) >= 16) {
ABSL_INTERNAL_STEP8(l, p);
@@ -441,139 +604,6 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
*crc = l;
}
-
- private:
- // Update partialCRC with crc of 64 byte block. Calling FinalizePclmulStream
- // would produce a single crc checksum, but it is expensive. PCLMULQDQ has a
- // high latency, so we run 4 128-bit partial checksums that can be reduced to
- // a single value by FinalizePclmulStream later. Computing crc for arbitrary
- // polynomialas with PCLMULQDQ is described in Intel paper "Fast CRC
- // Computation for Generic Polynomials Using PCLMULQDQ Instruction"
- // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
- // We are applying it to CRC32C polynomial.
- ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
- const uint8_t* p, V128* partialCRC) const {
- V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2));
-
- V128 partialCRC1 = partialCRC[0];
- V128 partialCRC2 = partialCRC[1];
- V128 partialCRC3 = partialCRC[2];
- V128 partialCRC4 = partialCRC[3];
-
- V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
- V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
- V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
- V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
- V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
- V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
- V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
- V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
- partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
- partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
- partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
- partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
- partialCRC1 = V128_Xor(tmp1, partialCRC1);
- partialCRC2 = V128_Xor(tmp2, partialCRC2);
- partialCRC3 = V128_Xor(tmp3, partialCRC3);
- partialCRC4 = V128_Xor(tmp4, partialCRC4);
- partialCRC1 = V128_Xor(partialCRC1, data1);
- partialCRC2 = V128_Xor(partialCRC2, data2);
- partialCRC3 = V128_Xor(partialCRC3, data3);
- partialCRC4 = V128_Xor(partialCRC4, data4);
- partialCRC[0] = partialCRC1;
- partialCRC[1] = partialCRC2;
- partialCRC[2] = partialCRC3;
- partialCRC[3] = partialCRC4;
- }
-
- // Reduce partialCRC produced by Process64BytesPclmul into a single value,
- // that represents crc checksum of all the processed bytes.
- ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
- FinalizePclmulStream(V128* partialCRC) const {
- V128 partialCRC1 = partialCRC[0];
- V128 partialCRC2 = partialCRC[1];
- V128 partialCRC3 = partialCRC[2];
- V128 partialCRC4 = partialCRC[3];
-
- // Combine 4 vectors of partial crc into a single vector.
- V128 reductionMultiplicands =
- V128_Load(reinterpret_cast<const V128*>(k5k6));
-
- V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
- V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);
-
- partialCRC1 = V128_Xor(low, high);
- partialCRC1 = V128_Xor(partialCRC1, partialCRC2);
-
- low = V128_PMulLow(reductionMultiplicands, partialCRC3);
- high = V128_PMulHi(reductionMultiplicands, partialCRC3);
-
- partialCRC3 = V128_Xor(low, high);
- partialCRC3 = V128_Xor(partialCRC3, partialCRC4);
-
- reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4));
-
- low = V128_PMulLow(reductionMultiplicands, partialCRC1);
- high = V128_PMulHi(reductionMultiplicands, partialCRC1);
- V128 fullCRC = V128_Xor(low, high);
- fullCRC = V128_Xor(fullCRC, partialCRC3);
-
- // Reduce fullCRC into scalar value.
- reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6));
-
- V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask));
-
- V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC);
- fullCRC = V128_ShiftRight<8>(fullCRC);
- fullCRC = V128_Xor(fullCRC, tmp);
-
- reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0));
-
- tmp = V128_ShiftRight<4>(fullCRC);
- fullCRC = V128_And(fullCRC, mask);
- fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC);
- fullCRC = V128_Xor(tmp, fullCRC);
-
- reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly));
-
- tmp = V128_And(fullCRC, mask);
- tmp = V128_PMul01(reductionMultiplicands, tmp);
- tmp = V128_And(tmp, mask);
- tmp = V128_PMulLow(reductionMultiplicands, tmp);
-
- fullCRC = V128_Xor(tmp, fullCRC);
-
- return V128_Extract32<1>(fullCRC);
- }
-
- // Update crc with 64 bytes of data from p.
- ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
- uint64_t crc) const {
- for (int i = 0; i < 8; i++) {
- crc = CRC32_u64(crc, absl::little_endian::Load64(p));
- p += 8;
- }
- return crc;
- }
-
- // Generated by crc32c_x86_test --crc32c_generate_constants=true
- // and verified against constants in linux kernel for S390:
- // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S
- alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8};
- alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e};
- alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6};
- alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000};
- alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1};
- alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u};
-
- // Medium runs of bytes are broken into groups of kGroupsSmall blocks of same
- // size. Each group is CRCed in parallel then combined at the end of the
- // block.
- static constexpr int kGroupsSmall = 3;
- // For large runs we use up to kMaxStreams blocks computed with CRC
- // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
- // are combined in the end.
- static constexpr int kMaxStreams = 3;
};
} // namespace
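
The multi-stream class above advances several independent CRC chains (hardware CRC32 streams plus PCLMUL streams) inside one loop so the latency of each crc32 instruction overlaps with the others, then merges the partial results with multiply()/ConcatCrc32c. A sketch of just the interleaved CRC streams, with the merge step omitted (illustrative names, SSE4.2 assumed):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <nmmintrin.h>

// Advances three independent CRC32C states over three disjoint byte ranges.
// Because the three crc32 instructions have no data dependency on each other,
// they pipeline instead of serializing on a single chain.
inline void Crc32cThreeStreams(const uint8_t* p0, const uint8_t* p1,
                               const uint8_t* p2, size_t words_per_stream,
                               uint32_t crc[3]) {
  for (size_t i = 0; i < words_per_stream; ++i) {
    uint64_t w0, w1, w2;
    std::memcpy(&w0, p0 + 8 * i, 8);
    std::memcpy(&w1, p1 + 8 * i, 8);
    std::memcpy(&w2, p2 + 8 * i, 8);
    crc[0] = static_cast<uint32_t>(_mm_crc32_u64(crc[0], w0));
    crc[1] = static_cast<uint32_t>(_mm_crc32_u64(crc[1], w1));
    crc[2] = static_cast<uint32_t>(_mm_crc32_u64(crc[2], w2));
  }
}
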
diff --git a/absl/crc/internal/non_temporal_memcpy.h b/absl/crc/internal/non_temporal_memcpy.h
index 092c6078..6cdd3197 100644
--- a/absl/crc/internal/non_temporal_memcpy.h
+++ b/absl/crc/internal/non_temporal_memcpy.h
@@ -74,7 +74,7 @@ inline void *non_temporal_store_memcpy(void *__restrict dst,
uintptr_t bytes_before_alignment_boundary =
kCacheLineSize -
(reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
- int header_len = (std::min)(bytes_before_alignment_boundary, len);
+ size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
assert(bytes_before_alignment_boundary < kCacheLineSize);
memcpy(d, s, header_len);
d += header_len;
@@ -87,7 +87,7 @@ inline void *non_temporal_store_memcpy(void *__restrict dst,
__m128i *dst_cacheline = reinterpret_cast<__m128i *>(d);
const __m128i *src_cacheline = reinterpret_cast<const __m128i *>(s);
constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m128i);
- uint64_t loops = len / kCacheLineSize;
+ size_t loops = len / kCacheLineSize;
while (len >= kCacheLineSize) {
__m128i temp1, temp2, temp3, temp4;
@@ -132,7 +132,7 @@ inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
uintptr_t bytes_before_alignment_boundary =
kCacheLineSize -
(reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
- int header_len = (std::min)(bytes_before_alignment_boundary, len);
+ size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
assert(bytes_before_alignment_boundary < kCacheLineSize);
memcpy(d, s, header_len);
d += header_len;
@@ -145,7 +145,7 @@ inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
__m256i *dst_cacheline = reinterpret_cast<__m256i *>(d);
const __m256i *src_cacheline = reinterpret_cast<const __m256i *>(s);
constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m256i);
- int loops = len / kCacheLineSize;
+ size_t loops = len / kCacheLineSize;
while (len >= kCacheLineSize) {
__m256i temp1, temp2;
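
For completeness, the non_temporal_memcpy.h changes only widen the header/loop counters to size_t; the copy itself streams whole cache lines past the cache. A minimal sketch of that store pattern (SSE2 assumed; the destination is assumed 16-byte aligned, which the real code guarantees with its header copy):

#include <cstddef>
#include <emmintrin.h>  // _mm_loadu_si128, _mm_stream_si128, _mm_sfence

// Copies `lines` 64-byte cache lines using non-temporal stores so the
// destination does not displace useful data from the cache.
inline void StreamCopyCacheLines(void* dst, const void* src, size_t lines) {
  auto* d = static_cast<__m128i*>(dst);        // must be 16-byte aligned
  auto* s = static_cast<const __m128i*>(src);
  for (size_t i = 0; i < lines; ++i) {
    for (size_t j = 0; j < 4; ++j) {           // 4 x 16 bytes = one cache line
      __m128i v = _mm_loadu_si128(s + 4 * i + j);
      _mm_stream_si128(d + 4 * i + j, v);
    }
  }
  _mm_sfence();  // make the streaming stores visible before later accesses
}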