Diffstat (limited to 'absl/crc/internal')
-rw-r--r--  absl/crc/internal/crc32_x86_arm_combined_simd.h |   4
-rw-r--r--  absl/crc/internal/crc32c_inline.h               |   2
-rw-r--r--  absl/crc/internal/crc_memcpy_x86_64.cc          | 109
-rw-r--r--  absl/crc/internal/crc_x86_arm_combined.cc       | 392
-rw-r--r--  absl/crc/internal/non_temporal_memcpy.h         |   8
5 files changed, 262 insertions, 253 deletions
diff --git a/absl/crc/internal/crc32_x86_arm_combined_simd.h b/absl/crc/internal/crc32_x86_arm_combined_simd.h index 8f15ded3..f6c2a21c 100644 --- a/absl/crc/internal/crc32_x86_arm_combined_simd.h +++ b/absl/crc/internal/crc32_x86_arm_combined_simd.h @@ -129,7 +129,7 @@ inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) { } inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) { - return _mm_crc32_u64(crc, v); + return static_cast<uint32_t>(_mm_crc32_u64(crc, v)); } inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); } @@ -157,7 +157,7 @@ inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); } inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); } inline V128 V128_From2x64(const uint64_t l, const uint64_t r) { - return _mm_set_epi64x(l, r); + return _mm_set_epi64x(static_cast<int64_t>(l), static_cast<int64_t>(r)); } template <int imm> diff --git a/absl/crc/internal/crc32c_inline.h b/absl/crc/internal/crc32c_inline.h index 43ad14f4..6236c10b 100644 --- a/absl/crc/internal/crc32c_inline.h +++ b/absl/crc/internal/crc32c_inline.h @@ -34,7 +34,7 @@ inline bool ExtendCrc32cInline(uint32_t* crc, const char* p, size_t n) { constexpr uint32_t kCrc32Xor = 0xffffffffU; *crc ^= kCrc32Xor; if (n & 1) { - *crc = CRC32_u8(*crc, *p); + *crc = CRC32_u8(*crc, static_cast<uint8_t>(*p)); n--; p++; } diff --git a/absl/crc/internal/crc_memcpy_x86_64.cc b/absl/crc/internal/crc_memcpy_x86_64.cc index 1c3299ee..a148fe17 100644 --- a/absl/crc/internal/crc_memcpy_x86_64.cc +++ b/absl/crc/internal/crc_memcpy_x86_64.cc @@ -82,21 +82,18 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length, return crc32c_t{crc_uint32}; } -constexpr int kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t); +constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t); // Common function for copying the tails of multiple large regions. 
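// A minimal sketch (separate from the patch itself) of the explicit-conversion
// pattern the first two files adopt: _mm_set_epi64x takes signed 64-bit
// arguments, _mm_crc32_u64 returns a 64-bit value whose upper half is zero,
// and plain char may be signed, so each call site now spells out the intended
// conversion instead of relying on implicit ones. Names are illustrative.
#include <cstdint>
#include <emmintrin.h>   // _mm_set_epi64x
#include <nmmintrin.h>   // SSE4.2 CRC32C intrinsics

inline __m128i From2x64(uint64_t hi, uint64_t lo) {
  return _mm_set_epi64x(static_cast<int64_t>(hi), static_cast<int64_t>(lo));
}

inline uint32_t CrcU64(uint32_t crc, uint64_t v) {
  // The intrinsic's upper 32 result bits are always zero, so the narrowing
  // cast loses no information.
  return static_cast<uint32_t>(_mm_crc32_u64(crc, v));
}

inline uint32_t CrcByte(uint32_t crc, char c) {
  // Convert through uint8_t so a signed char is not sign-extended.
  return _mm_crc32_u8(crc, static_cast<uint8_t>(c));
}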
-template <int vec_regions, int int_regions> +template <size_t vec_regions, size_t int_regions> inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, size_t region_size, size_t copy_rounds) { std::array<__m128i, vec_regions> data; std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data; while (copy_rounds > 0) { -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int i = 0; i < vec_regions; i++) { - int region = i; + for (size_t i = 0; i < vec_regions; i++) { + size_t region = i; auto* vsrc = reinterpret_cast<const __m128i*>(*src + region_size * region); @@ -109,27 +106,23 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, _mm_store_si128(vdst, data[i]); // Compute the running CRC - crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( - static_cast<uint32_t>(crcs[region]), _mm_extract_epi64(data[i], 0)))}; - crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( - static_cast<uint32_t>(crcs[region]), _mm_extract_epi64(data[i], 1)))}; + crcs[region] = crc32c_t{static_cast<uint32_t>( + _mm_crc32_u64(static_cast<uint32_t>(crcs[region]), + static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))}; + crcs[region] = crc32c_t{static_cast<uint32_t>( + _mm_crc32_u64(static_cast<uint32_t>(crcs[region]), + static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))}; } -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int i = 0; i < int_regions; i++) { - int region = vec_regions + i; + for (size_t i = 0; i < int_regions; i++) { + size_t region = vec_regions + i; auto* usrc = reinterpret_cast<const uint64_t*>(*src + region_size * region); auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region); -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int j = 0; j < kIntLoadsPerVec; j++) { - int data_index = i * kIntLoadsPerVec + j; + for (size_t j = 0; j < kIntLoadsPerVec; j++) { + size_t data_index = i * kIntLoadsPerVec + j; int_data[data_index] = *(usrc + j); crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( @@ -148,7 +141,7 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, } // namespace -template <int vec_regions, int int_regions> +template <size_t vec_regions, size_t int_regions> class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine { public: AcceleratedCrcMemcpyEngine() = default; @@ -160,12 +153,12 @@ class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine { std::size_t length, crc32c_t initial_crc) const override; }; -template <int vec_regions, int int_regions> +template <size_t vec_regions, size_t int_regions> crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( void* __restrict dst, const void* __restrict src, std::size_t length, crc32c_t initial_crc) const { constexpr std::size_t kRegions = vec_regions + int_regions; - constexpr crc32c_t kCrcDataXor = crc32c_t{0xffffffff}; + constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff}; constexpr std::size_t kBlockSize = sizeof(__m128i); constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize; @@ -201,7 +194,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( // Start work on the CRC: undo the XOR from the previous calculation or set up // the initial value of the CRC. // initial_crc ^= kCrcDataXor; - initial_crc = initial_crc ^ kCrcDataXor; + initial_crc = crc32c_t{static_cast<uint32_t>(initial_crc) ^ kCrcDataXor}; // Do an initial alignment copy, so we can use aligned store instructions to // the destination pointer. 
We align the destination pointer because the @@ -229,13 +222,13 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( // Initialize CRCs for kRegions regions. crc32c_t crcs[kRegions]; crcs[0] = initial_crc; - for (int i = 1; i < kRegions; i++) { - crcs[i] = kCrcDataXor; + for (size_t i = 1; i < kRegions; i++) { + crcs[i] = crc32c_t{kCrcDataXor}; } // Find the number of rounds to copy and the region size. Also compute the // tail size here. - int64_t copy_rounds = length / kCopyRoundSize; + size_t copy_rounds = length / kCopyRoundSize; // Find the size of each region and the size of the tail. const std::size_t region_size = copy_rounds * kBlockSize; @@ -248,10 +241,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( // Main loop. while (copy_rounds > kBlocksPerCacheLine) { // Prefetch kPrefetchAhead bytes ahead of each pointer. -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int i = 0; i < kRegions; i++) { + for (size_t i = 0; i < kRegions; i++) { absl::base_internal::PrefetchT0(src_bytes + kPrefetchAhead + region_size * i); absl::base_internal::PrefetchT0(dst_bytes + kPrefetchAhead + @@ -259,58 +249,46 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( } // Load and store data, computing CRC on the way. -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int i = 0; i < kBlocksPerCacheLine; i++) { + for (size_t i = 0; i < kBlocksPerCacheLine; i++) { // Copy and CRC the data for the CRC regions. -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int j = 0; j < vec_regions; j++) { + for (size_t j = 0; j < vec_regions; j++) { // Cycle which regions get vector load/store and integer load/store, to // engage prefetching logic around vector load/stores and save issue // slots by using the integer registers. - int region = (j + i) % kRegions; + size_t region = (j + i) % kRegions; - auto* src = + auto* vsrc = reinterpret_cast<const __m128i*>(src_bytes + region_size * region); - auto* dst = + auto* vdst = reinterpret_cast<__m128i*>(dst_bytes + region_size * region); // Load and CRC data. - vec_data[j] = _mm_loadu_si128(src + i); - crcs[region] = crc32c_t{static_cast<uint32_t>( - _mm_crc32_u64(static_cast<uint32_t>(crcs[region]), - _mm_extract_epi64(vec_data[j], 0)))}; - crcs[region] = crc32c_t{static_cast<uint32_t>( - _mm_crc32_u64(static_cast<uint32_t>(crcs[region]), - _mm_extract_epi64(vec_data[j], 1)))}; + vec_data[j] = _mm_loadu_si128(vsrc + i); + crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( + static_cast<uint32_t>(crcs[region]), + static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))}; + crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( + static_cast<uint32_t>(crcs[region]), + static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))}; // Store the data. - _mm_store_si128(dst + i, vec_data[j]); + _mm_store_si128(vdst + i, vec_data[j]); } // Preload the partial CRCs for the CLMUL subregions. -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int j = 0; j < int_regions; j++) { + for (size_t j = 0; j < int_regions; j++) { // Cycle which regions get vector load/store and integer load/store, to // engage prefetching logic around vector load/stores and save issue // slots by using the integer registers. 
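// A short sketch of the per-block CRC step the copy loop above performs on
// each vector region: a 16-byte register is folded into the running CRC32C by
// extracting its two 64-bit lanes. _mm_extract_epi64 returns a signed 64-bit
// value, which is why the patch adds the casts to uint64_t. Requires
// SSE4.1/SSE4.2; the function name is illustrative.
#include <cstdint>
#include <nmmintrin.h>
#include <smmintrin.h>

inline uint32_t CrcOf128(uint32_t crc, __m128i block) {
  crc = static_cast<uint32_t>(_mm_crc32_u64(
      crc, static_cast<uint64_t>(_mm_extract_epi64(block, 0))));
  crc = static_cast<uint32_t>(_mm_crc32_u64(
      crc, static_cast<uint64_t>(_mm_extract_epi64(block, 1))));
  return crc;
}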
- int region = (j + vec_regions + i) % kRegions; + size_t region = (j + vec_regions + i) % kRegions; auto* usrc = reinterpret_cast<const uint64_t*>(src_bytes + region_size * region); auto* udst = reinterpret_cast<uint64_t*>(dst_bytes + region_size * region); -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int k = 0; k < kIntLoadsPerVec; k++) { - int data_index = j * kIntLoadsPerVec + k; + for (size_t k = 0; k < kIntLoadsPerVec; k++) { + size_t data_index = j * kIntLoadsPerVec + k; // Load and CRC the data. int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k); @@ -339,13 +317,13 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( // Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the // XOR done before doing block copy + CRCs. - for (int i = 0; i < kRegions - 1; i++) { - crcs[i] = crcs[i] ^ kCrcDataXor; + for (size_t i = 0; i + 1 < kRegions; i++) { + crcs[i] = crc32c_t{static_cast<uint32_t>(crcs[i]) ^ kCrcDataXor}; } // Build a CRC of the first kRegions - 1 regions. crc32c_t full_crc = crcs[0]; - for (int i = 1; i < kRegions - 1; i++) { + for (size_t i = 1; i + 1 < kRegions; i++) { full_crc = ConcatCrc32c(full_crc, crcs[i], region_size); } @@ -360,7 +338,8 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( crcs[kRegions - 1]); // Finalize and concatenate the final CRC, then return. - crcs[kRegions - 1] = crcs[kRegions - 1] ^ kCrcDataXor; + crcs[kRegions - 1] = + crc32c_t{static_cast<uint32_t>(crcs[kRegions - 1]) ^ kCrcDataXor}; return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size); } diff --git a/absl/crc/internal/crc_x86_arm_combined.cc b/absl/crc/internal/crc_x86_arm_combined.cc index d771a786..2112f609 100644 --- a/absl/crc/internal/crc_x86_arm_combined.cc +++ b/absl/crc/internal/crc_x86_arm_combined.cc @@ -14,12 +14,11 @@ // Hardware accelerated CRC32 computation on Intel and ARM architecture. -#include <stddef.h> - +#include <cstddef> #include <cstdint> #include "absl/base/attributes.h" -#include "absl/base/call_once.h" +#include "absl/base/config.h" #include "absl/base/dynamic_annotations.h" #include "absl/base/internal/endian.h" #include "absl/base/internal/prefetch.h" @@ -67,27 +66,30 @@ class CRC32AcceleratedX86ARMCombined : public CRC32 { // Constants for switching between algorithms. // Chosen by comparing speed at different powers of 2. 
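// A small illustration of the loop-bound rewrite above: once the index is
// size_t, `i < n - 1` is dangerous because `n - 1` wraps to SIZE_MAX when
// n == 0, whereas `i + 1 < n` is well-defined for every n. (kRegions itself
// is always at least 1 here, so the two forms behave the same in this file;
// the rewrite keeps the unsigned comparison idiomatic.)
#include <cstddef>

size_t VisitAllButLast(size_t n) {
  size_t visited = 0;
  for (size_t i = 0; i + 1 < n; ++i) {  // safe even when n == 0
    ++visited;
  }
  return visited;
}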
-constexpr int kSmallCutoff = 256; -constexpr int kMediumCutoff = 2048; +constexpr size_t kSmallCutoff = 256; +constexpr size_t kMediumCutoff = 2048; -#define ABSL_INTERNAL_STEP1(crc) \ - do { \ - crc = CRC32_u8(crc, *p++); \ +#define ABSL_INTERNAL_STEP1(crc) \ + do { \ + crc = CRC32_u8(static_cast<uint32_t>(crc), *p++); \ } while (0) -#define ABSL_INTERNAL_STEP2(crc) \ - do { \ - crc = CRC32_u16(crc, absl::little_endian::Load16(p)); \ - p += 2; \ +#define ABSL_INTERNAL_STEP2(crc) \ + do { \ + crc = \ + CRC32_u16(static_cast<uint32_t>(crc), absl::little_endian::Load16(p)); \ + p += 2; \ } while (0) -#define ABSL_INTERNAL_STEP4(crc) \ - do { \ - crc = CRC32_u32(crc, absl::little_endian::Load32(p)); \ - p += 4; \ +#define ABSL_INTERNAL_STEP4(crc) \ + do { \ + crc = \ + CRC32_u32(static_cast<uint32_t>(crc), absl::little_endian::Load32(p)); \ + p += 4; \ } while (0) -#define ABSL_INTERNAL_STEP8(crc, data) \ - do { \ - crc = CRC32_u64(crc, absl::little_endian::Load64(data)); \ - data += 8; \ +#define ABSL_INTERNAL_STEP8(crc, data) \ + do { \ + crc = CRC32_u64(static_cast<uint32_t>(crc), \ + absl::little_endian::Load64(data)); \ + data += 8; \ } while (0) #define ABSL_INTERNAL_STEP8BY2(crc0, crc1, p0, p1) \ do { \ @@ -101,6 +103,8 @@ constexpr int kMediumCutoff = 2048; ABSL_INTERNAL_STEP8(crc2, p2); \ } while (0) +namespace { + uint32_t multiply(uint32_t a, uint32_t b) { V128 shifts = V128_From2x64(0, 1); V128 power = V128_From2x64(0, a); @@ -109,11 +113,10 @@ uint32_t multiply(uint32_t a, uint32_t b) { // Combine crc values res = V128_ShiftLeft64(res, shifts); - return V128_Extract32<1>(res) ^ CRC32_u32(0, V128_Low64(res)); + return static_cast<uint32_t>(V128_Extract32<1>(res)) ^ + CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res))); } -namespace { - // Powers of crc32c polynomial, for faster ExtendByZeros. // Verified against folly: // folly/hash/detail/Crc32CombineDetail.cpp @@ -204,9 +207,167 @@ enum class CutoffStrategy { Unroll64CRC, }; -template <int num_crc_streams, int num_pclmul_streams, CutoffStrategy strategy> -class CRC32AcceleratedX86ARMCombinedMultipleStreams +// Base class for CRC32AcceleratedX86ARMCombinedMultipleStreams containing the +// methods and data that don't need the template arguments. +class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase : public CRC32AcceleratedX86ARMCombined { + protected: + // Update partialCRC with crc of 64 byte block. Calling FinalizePclmulStream + // would produce a single crc checksum, but it is expensive. PCLMULQDQ has a + // high latency, so we run 4 128-bit partial checksums that can be reduced to + // a single value by FinalizePclmulStream later. Computing crc for arbitrary + // polynomialas with PCLMULQDQ is described in Intel paper "Fast CRC + // Computation for Generic Polynomials Using PCLMULQDQ Instruction" + // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf + // We are applying it to CRC32C polynomial. 
+ ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul( + const uint8_t* p, V128* partialCRC) const { + V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2)); + + V128 partialCRC1 = partialCRC[0]; + V128 partialCRC2 = partialCRC[1]; + V128 partialCRC3 = partialCRC[2]; + V128 partialCRC4 = partialCRC[3]; + + V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands); + V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands); + V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands); + V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands); + V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0)); + V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1)); + V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2)); + V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3)); + partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands); + partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands); + partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands); + partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands); + partialCRC1 = V128_Xor(tmp1, partialCRC1); + partialCRC2 = V128_Xor(tmp2, partialCRC2); + partialCRC3 = V128_Xor(tmp3, partialCRC3); + partialCRC4 = V128_Xor(tmp4, partialCRC4); + partialCRC1 = V128_Xor(partialCRC1, data1); + partialCRC2 = V128_Xor(partialCRC2, data2); + partialCRC3 = V128_Xor(partialCRC3, data3); + partialCRC4 = V128_Xor(partialCRC4, data4); + partialCRC[0] = partialCRC1; + partialCRC[1] = partialCRC2; + partialCRC[2] = partialCRC3; + partialCRC[3] = partialCRC4; + } + + // Reduce partialCRC produced by Process64BytesPclmul into a single value, + // that represents crc checksum of all the processed bytes. + ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t + FinalizePclmulStream(V128* partialCRC) const { + V128 partialCRC1 = partialCRC[0]; + V128 partialCRC2 = partialCRC[1]; + V128 partialCRC3 = partialCRC[2]; + V128 partialCRC4 = partialCRC[3]; + + // Combine 4 vectors of partial crc into a single vector. + V128 reductionMultiplicands = + V128_Load(reinterpret_cast<const V128*>(k5k6)); + + V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1); + V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1); + + partialCRC1 = V128_Xor(low, high); + partialCRC1 = V128_Xor(partialCRC1, partialCRC2); + + low = V128_PMulLow(reductionMultiplicands, partialCRC3); + high = V128_PMulHi(reductionMultiplicands, partialCRC3); + + partialCRC3 = V128_Xor(low, high); + partialCRC3 = V128_Xor(partialCRC3, partialCRC4); + + reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4)); + + low = V128_PMulLow(reductionMultiplicands, partialCRC1); + high = V128_PMulHi(reductionMultiplicands, partialCRC1); + V128 fullCRC = V128_Xor(low, high); + fullCRC = V128_Xor(fullCRC, partialCRC3); + + // Reduce fullCRC into scalar value. 
+ reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6)); + + V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask)); + + V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC); + fullCRC = V128_ShiftRight<8>(fullCRC); + fullCRC = V128_Xor(fullCRC, tmp); + + reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0)); + + tmp = V128_ShiftRight<4>(fullCRC); + fullCRC = V128_And(fullCRC, mask); + fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC); + fullCRC = V128_Xor(tmp, fullCRC); + + reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly)); + + tmp = V128_And(fullCRC, mask); + tmp = V128_PMul01(reductionMultiplicands, tmp); + tmp = V128_And(tmp, mask); + tmp = V128_PMulLow(reductionMultiplicands, tmp); + + fullCRC = V128_Xor(tmp, fullCRC); + + return static_cast<uint64_t>(V128_Extract32<1>(fullCRC)); + } + + // Update crc with 64 bytes of data from p. + ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p, + uint64_t crc) const { + for (int i = 0; i < 8; i++) { + crc = + CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p)); + p += 8; + } + return crc; + } + + // Generated by crc32c_x86_test --crc32c_generate_constants=true + // and verified against constants in linux kernel for S390: + // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S + alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8}; + alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e}; + alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6}; + alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000}; + alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1}; + alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u}; + + // Medium runs of bytes are broken into groups of kGroupsSmall blocks of same + // size. Each group is CRCed in parallel then combined at the end of the + // block. + static constexpr size_t kGroupsSmall = 3; + // For large runs we use up to kMaxStreams blocks computed with CRC + // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which + // are combined in the end. 
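// A simplified sketch of the multi-stream idea described above: the buffer is
// CRCed as three independent, equally sized streams so the crc32 instruction's
// latency can be hidden behind independent dependency chains. Combining the
// three partial values (done in this file with carry-less multiplication via
// multiply() and ComputeZeroConstant()) is deliberately left out. Assumes
// little-endian loads, a length that is a multiple of 24, and SSE4.2.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <nmmintrin.h>

void ThreeStreamPartialCrcs(const uint8_t* p, size_t len, uint32_t crc[3]) {
  const size_t block = len / 3;  // bytes handled by each stream
  const uint8_t* p0 = p;
  const uint8_t* p1 = p + block;
  const uint8_t* p2 = p + 2 * block;
  for (size_t i = 0; i < block; i += 8) {
    uint64_t v0, v1, v2;
    std::memcpy(&v0, p0 + i, 8);
    std::memcpy(&v1, p1 + i, 8);
    std::memcpy(&v2, p2 + i, 8);
    // Three independent chains keep the CRC unit busy every cycle.
    crc[0] = static_cast<uint32_t>(_mm_crc32_u64(crc[0], v0));
    crc[1] = static_cast<uint32_t>(_mm_crc32_u64(crc[1], v1));
    crc[2] = static_cast<uint32_t>(_mm_crc32_u64(crc[2], v2));
  }
}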
+ static constexpr size_t kMaxStreams = 3; +}; + +#ifdef ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL +alignas(16) constexpr uint64_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k1k2[2]; +alignas(16) constexpr uint64_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k3k4[2]; +alignas(16) constexpr uint64_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k5k6[2]; +alignas(16) constexpr uint64_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k7k0[2]; +alignas(16) constexpr uint64_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kPoly[2]; +alignas(16) constexpr uint32_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMask[4]; +constexpr size_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kGroupsSmall; +constexpr size_t CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMaxStreams; +#endif // ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL + +template <size_t num_crc_streams, size_t num_pclmul_streams, + CutoffStrategy strategy> +class CRC32AcceleratedX86ARMCombinedMultipleStreams + : public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase { ABSL_ATTRIBUTE_HOT void Extend(uint32_t* crc, const void* bytes, size_t length) const override { static_assert(num_crc_streams >= 1 && num_crc_streams <= kMaxStreams, @@ -221,19 +382,19 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams // We have dedicated instruction for 1,2,4 and 8 bytes. if (length & 8) { ABSL_INTERNAL_STEP8(l, p); - length &= ~8LL; + length &= ~size_t{8}; } if (length & 4) { ABSL_INTERNAL_STEP4(l); - length &= ~4LL; + length &= ~size_t{4}; } if (length & 2) { ABSL_INTERNAL_STEP2(l); - length &= ~2LL; + length &= ~size_t{2}; } if (length & 1) { ABSL_INTERNAL_STEP1(l); - length &= ~1LL; + length &= ~size_t{1}; } if (length == 0) { *crc = l; @@ -263,12 +424,12 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams if (strategy == CutoffStrategy::Fold3) { uint64_t l641 = 0; uint64_t l642 = 0; - const int blockSize = 32; - int64_t bs = (e - p) / kGroupsSmall / blockSize; + const size_t blockSize = 32; + size_t bs = static_cast<size_t>(e - p) / kGroupsSmall / blockSize; const uint8_t* p1 = p + bs * blockSize; const uint8_t* p2 = p1 + bs * blockSize; - for (int64_t i = 0; i < bs - 1; ++i) { + for (size_t i = 0; i + 1 < bs; ++i) { ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2); ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2); ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2); @@ -290,8 +451,9 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams V128 res2 = V128_PMul10(tmp, magic); V128 x = V128_Xor(res1, res2); - l64 = V128_Low64(x) ^ absl::little_endian::Load64(p2); - l64 = CRC32_u64(l642, l64); + l64 = static_cast<uint64_t>(V128_Low64(x)) ^ + absl::little_endian::Load64(p2); + l64 = CRC32_u64(static_cast<uint32_t>(l642), l64); p = p2 + 8; } else if (strategy == CutoffStrategy::Unroll64CRC) { @@ -314,16 +476,17 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams ABSL_INTERNAL_STEP1(l); } - int64_t bs = (e - p) / (num_crc_streams + num_pclmul_streams) / 64; + size_t bs = static_cast<size_t>(e - p) / + (num_crc_streams + num_pclmul_streams) / 64; const uint8_t* crc_streams[kMaxStreams]; const uint8_t* pclmul_streams[kMaxStreams]; // We are guaranteed to have at least one crc stream. 
crc_streams[0] = p; - for (int i = 1; i < num_crc_streams; i++) { + for (size_t i = 1; i < num_crc_streams; i++) { crc_streams[i] = crc_streams[i - 1] + bs * 64; } pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64; - for (int i = 1; i < num_pclmul_streams; i++) { + for (size_t i = 1; i < num_pclmul_streams; i++) { pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64; } @@ -332,13 +495,13 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams uint64_t l64_pclmul[kMaxStreams] = {0}; // Peel first iteration, because PCLMULQDQ stream, needs setup. - for (int i = 0; i < num_crc_streams; i++) { + for (size_t i = 0; i < num_crc_streams; i++) { l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]); crc_streams[i] += 16 * 4; } V128 partialCRC[kMaxStreams][4]; - for (int i = 0; i < num_pclmul_streams; i++) { + for (size_t i = 0; i < num_pclmul_streams; i++) { partialCRC[i][0] = V128_LoadU( reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 0)); partialCRC[i][1] = V128_LoadU( @@ -350,13 +513,13 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams pclmul_streams[i] += 16 * 4; } - for (int64_t i = 1; i < bs; i++) { + for (size_t i = 1; i < bs; i++) { // Prefetch data for next itterations. - for (int j = 0; j < num_crc_streams; j++) { + for (size_t j = 0; j < num_crc_streams; j++) { base_internal::PrefetchT0( reinterpret_cast<const char*>(crc_streams[j] + kPrefetchHorizon)); } - for (int j = 0; j < num_pclmul_streams; j++) { + for (size_t j = 0; j < num_pclmul_streams; j++) { base_internal::PrefetchT0(reinterpret_cast<const char*>( pclmul_streams[j] + kPrefetchHorizon)); } @@ -398,19 +561,19 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams // PCLMULQDQ based streams require special final step; // CRC based don't. - for (int i = 0; i < num_pclmul_streams; i++) { + for (size_t i = 0; i < num_pclmul_streams; i++) { l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]); } // Combine all streams into single result. uint32_t magic = ComputeZeroConstant(bs * 64); l64 = l64_crc[0]; - for (int i = 1; i < num_crc_streams; i++) { - l64 = multiply(l64, magic); + for (size_t i = 1; i < num_crc_streams; i++) { + l64 = multiply(static_cast<uint32_t>(l64), magic); l64 ^= l64_crc[i]; } - for (int i = 0; i < num_pclmul_streams; i++) { - l64 = multiply(l64, magic); + for (size_t i = 0; i < num_pclmul_streams; i++) { + l64 = multiply(static_cast<uint32_t>(l64), magic); l64 ^= l64_pclmul[i]; } @@ -421,7 +584,7 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams p = crc_streams[num_crc_streams - 1]; } } - l = l64; + l = static_cast<uint32_t>(l64); while ((e - p) >= 16) { ABSL_INTERNAL_STEP8(l, p); @@ -441,139 +604,6 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams *crc = l; } - - private: - // Update partialCRC with crc of 64 byte block. Calling FinalizePclmulStream - // would produce a single crc checksum, but it is expensive. PCLMULQDQ has a - // high latency, so we run 4 128-bit partial checksums that can be reduced to - // a single value by FinalizePclmulStream later. Computing crc for arbitrary - // polynomialas with PCLMULQDQ is described in Intel paper "Fast CRC - // Computation for Generic Polynomials Using PCLMULQDQ Instruction" - // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf - // We are applying it to CRC32C polynomial. 
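// A minimal sketch of the prefetching pattern used in the stream loops above:
// before processing each 64-byte block, data some fixed distance ahead of the
// read pointer is prefetched with the T0 hint so it is already in cache when
// the CRC instructions reach it. The distance below is an illustrative
// stand-in, not the kPrefetchHorizon value used in this file.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <nmmintrin.h>
#include <xmmintrin.h>

constexpr size_t kExamplePrefetchDistance = 256;

uint32_t CrcWithPrefetch(uint32_t crc, const uint8_t* p, size_t blocks64) {
  for (size_t i = 0; i < blocks64; ++i) {
    _mm_prefetch(reinterpret_cast<const char*>(p + kExamplePrefetchDistance),
                 _MM_HINT_T0);
    for (int j = 0; j < 8; ++j) {  // 8 x 8 bytes = one 64-byte block
      uint64_t v;
      std::memcpy(&v, p, 8);
      crc = static_cast<uint32_t>(_mm_crc32_u64(crc, v));
      p += 8;
    }
  }
  return crc;
}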
- ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul( - const uint8_t* p, V128* partialCRC) const { - V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2)); - - V128 partialCRC1 = partialCRC[0]; - V128 partialCRC2 = partialCRC[1]; - V128 partialCRC3 = partialCRC[2]; - V128 partialCRC4 = partialCRC[3]; - - V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands); - V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands); - V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands); - V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands); - V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0)); - V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1)); - V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2)); - V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3)); - partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands); - partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands); - partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands); - partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands); - partialCRC1 = V128_Xor(tmp1, partialCRC1); - partialCRC2 = V128_Xor(tmp2, partialCRC2); - partialCRC3 = V128_Xor(tmp3, partialCRC3); - partialCRC4 = V128_Xor(tmp4, partialCRC4); - partialCRC1 = V128_Xor(partialCRC1, data1); - partialCRC2 = V128_Xor(partialCRC2, data2); - partialCRC3 = V128_Xor(partialCRC3, data3); - partialCRC4 = V128_Xor(partialCRC4, data4); - partialCRC[0] = partialCRC1; - partialCRC[1] = partialCRC2; - partialCRC[2] = partialCRC3; - partialCRC[3] = partialCRC4; - } - - // Reduce partialCRC produced by Process64BytesPclmul into a single value, - // that represents crc checksum of all the processed bytes. - ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t - FinalizePclmulStream(V128* partialCRC) const { - V128 partialCRC1 = partialCRC[0]; - V128 partialCRC2 = partialCRC[1]; - V128 partialCRC3 = partialCRC[2]; - V128 partialCRC4 = partialCRC[3]; - - // Combine 4 vectors of partial crc into a single vector. - V128 reductionMultiplicands = - V128_Load(reinterpret_cast<const V128*>(k5k6)); - - V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1); - V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1); - - partialCRC1 = V128_Xor(low, high); - partialCRC1 = V128_Xor(partialCRC1, partialCRC2); - - low = V128_PMulLow(reductionMultiplicands, partialCRC3); - high = V128_PMulHi(reductionMultiplicands, partialCRC3); - - partialCRC3 = V128_Xor(low, high); - partialCRC3 = V128_Xor(partialCRC3, partialCRC4); - - reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4)); - - low = V128_PMulLow(reductionMultiplicands, partialCRC1); - high = V128_PMulHi(reductionMultiplicands, partialCRC1); - V128 fullCRC = V128_Xor(low, high); - fullCRC = V128_Xor(fullCRC, partialCRC3); - - // Reduce fullCRC into scalar value. 
- reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6)); - - V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask)); - - V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC); - fullCRC = V128_ShiftRight<8>(fullCRC); - fullCRC = V128_Xor(fullCRC, tmp); - - reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0)); - - tmp = V128_ShiftRight<4>(fullCRC); - fullCRC = V128_And(fullCRC, mask); - fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC); - fullCRC = V128_Xor(tmp, fullCRC); - - reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly)); - - tmp = V128_And(fullCRC, mask); - tmp = V128_PMul01(reductionMultiplicands, tmp); - tmp = V128_And(tmp, mask); - tmp = V128_PMulLow(reductionMultiplicands, tmp); - - fullCRC = V128_Xor(tmp, fullCRC); - - return V128_Extract32<1>(fullCRC); - } - - // Update crc with 64 bytes of data from p. - ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p, - uint64_t crc) const { - for (int i = 0; i < 8; i++) { - crc = CRC32_u64(crc, absl::little_endian::Load64(p)); - p += 8; - } - return crc; - } - - // Generated by crc32c_x86_test --crc32c_generate_constants=true - // and verified against constants in linux kernel for S390: - // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S - alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8}; - alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e}; - alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6}; - alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000}; - alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1}; - alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u}; - - // Medium runs of bytes are broken into groups of kGroupsSmall blocks of same - // size. Each group is CRCed in parallel then combined at the end of the - // block. - static constexpr int kGroupsSmall = 3; - // For large runs we use up to kMaxStreams blocks computed with CRC - // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which - // are combined in the end. 
- static constexpr int kMaxStreams = 3; }; } // namespace diff --git a/absl/crc/internal/non_temporal_memcpy.h b/absl/crc/internal/non_temporal_memcpy.h index 092c6078..6cdd3197 100644 --- a/absl/crc/internal/non_temporal_memcpy.h +++ b/absl/crc/internal/non_temporal_memcpy.h @@ -74,7 +74,7 @@ inline void *non_temporal_store_memcpy(void *__restrict dst, uintptr_t bytes_before_alignment_boundary = kCacheLineSize - (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1)); - int header_len = (std::min)(bytes_before_alignment_boundary, len); + size_t header_len = (std::min)(bytes_before_alignment_boundary, len); assert(bytes_before_alignment_boundary < kCacheLineSize); memcpy(d, s, header_len); d += header_len; @@ -87,7 +87,7 @@ inline void *non_temporal_store_memcpy(void *__restrict dst, __m128i *dst_cacheline = reinterpret_cast<__m128i *>(d); const __m128i *src_cacheline = reinterpret_cast<const __m128i *>(s); constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m128i); - uint64_t loops = len / kCacheLineSize; + size_t loops = len / kCacheLineSize; while (len >= kCacheLineSize) { __m128i temp1, temp2, temp3, temp4; @@ -132,7 +132,7 @@ inline void *non_temporal_store_memcpy_avx(void *__restrict dst, uintptr_t bytes_before_alignment_boundary = kCacheLineSize - (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1)); - int header_len = (std::min)(bytes_before_alignment_boundary, len); + size_t header_len = (std::min)(bytes_before_alignment_boundary, len); assert(bytes_before_alignment_boundary < kCacheLineSize); memcpy(d, s, header_len); d += header_len; @@ -145,7 +145,7 @@ inline void *non_temporal_store_memcpy_avx(void *__restrict dst, __m256i *dst_cacheline = reinterpret_cast<__m256i *>(d); const __m256i *src_cacheline = reinterpret_cast<const __m256i *>(s); constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m256i); - int loops = len / kCacheLineSize; + size_t loops = len / kCacheLineSize; while (len >= kCacheLineSize) { __m256i temp1, temp2; |
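// A simplified sketch of the loop that non_temporal_store_memcpy builds
// around: once the destination is cache-line aligned, whole cache lines are
// written with streaming stores that bypass the cache, and an sfence orders
// them before the function returns. The real routine also handles the
// unaligned header and the sub-cache-line tail, which are omitted here;
// assumes a 64-byte cache line, a 16-byte-aligned destination, and a length
// that is a multiple of 64.
#include <cstddef>
#include <emmintrin.h>
#include <xmmintrin.h>

void StreamCopyCacheLines(void* __restrict dst, const void* __restrict src,
                          size_t len) {
  auto* d = static_cast<__m128i*>(dst);
  auto* s = static_cast<const __m128i*>(src);
  constexpr size_t kCacheLineSize = 64;
  constexpr size_t kOpsPerCacheLine = kCacheLineSize / sizeof(__m128i);
  for (size_t line = 0; line < len / kCacheLineSize; ++line) {
    for (size_t op = 0; op < kOpsPerCacheLine; ++op) {
      _mm_stream_si128(d, _mm_loadu_si128(s));  // non-temporal store
      ++d;
      ++s;
    }
  }
  _mm_sfence();  // make the streaming stores globally visible
}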