author      Derek Mauro <dmauro@google.com>                 2022-11-30 10:58:38 -0800
committer   Copybara-Service <copybara-worker@google.com>   2022-11-30 10:59:21 -0800
commit      66665d8d2e3fedff340b83f9841ca427145a7b26 (patch)
tree        747602a57a578dc3220d8e61298184c43bbf8deb /absl
parent      94e9ee3f75a617403049a29e6c932b8b3bb13177 (diff)
Fixes many compilation issues that come from having no external CI
coverage of the accelerated CRC implementation and some differences
between the internal and external implementations.
This change adds CI coverage to the
linux_clang-latest_libstdcxx_bazel.sh script, assuming this script
always runs on machines of at least the Intel Haswell generation.
Fixes include:
* Remove the use of the deprecated xor operator on crc32c_t
* Remove #pragma unroll_completely, which isn't known by GCC or Clang:
https://godbolt.org/z/97j4vbacs
* Fixes for -Wsign-compare, -Wsign-conversion and -Wshorten-64-to-32
PiperOrigin-RevId: 491965029
Change-Id: Ic5e1f3a20f69fcd35fe81ebef63443ad26bf7931
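
The first and third fixes listed above reduce to two mechanical patterns that
recur throughout the diff: XOR-finalizing a crc32c_t via an explicit uint32_t
round trip instead of the deprecated operator^, and spelling out the narrowing
of the 64-bit CRC intrinsic result. A minimal sketch of both, assuming Abseil
is available and the target supports SSE4.2; kCrcDataXor mirrors the constant
in crc_memcpy_x86_64.cc, everything else here is illustrative:

    // Sketch only: the two cast patterns applied throughout this change.
    #include <cstdint>
    #include <nmmintrin.h>  // _mm_crc32_u64; build with -msse4.2 or newer

    #include "absl/crc/crc32c.h"

    int main() {
      constexpr uint32_t kCrcDataXor = 0xffffffffu;
      absl::crc32c_t crc = absl::crc32c_t{0x12345678};

      // Before (with kCrcDataXor declared as crc32c_t):
      //   crc = crc ^ kCrcDataXor;   // deprecated operator^ on crc32c_t
      // After: convert explicitly, XOR as uint32_t, and re-wrap.
      crc = absl::crc32c_t{static_cast<uint32_t>(crc) ^ kCrcDataXor};

      // _mm_crc32_u64 returns a 64-bit value even though the CRC fits in
      // 32 bits; the explicit narrowing keeps -Wshorten-64-to-32 and
      // -Wsign-conversion quiet.
      uint64_t data = 0x0123456789abcdefULL;
      uint32_t next = static_cast<uint32_t>(
          _mm_crc32_u64(static_cast<uint32_t>(crc), data));
      return static_cast<int>(next & 1u);
    }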
Diffstat (limited to 'absl')
-rw-r--r--  absl/crc/crc32c_test.cc                          |  18
-rw-r--r--  absl/crc/internal/crc32_x86_arm_combined_simd.h  |   4
-rw-r--r--  absl/crc/internal/crc32c_inline.h                |   2
-rw-r--r--  absl/crc/internal/crc_memcpy_x86_64.cc           | 109
-rw-r--r--  absl/crc/internal/crc_x86_arm_combined.cc        | 392
-rw-r--r--  absl/crc/internal/non_temporal_memcpy.h          |   8
6 files changed, 275 insertions, 258 deletions
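
Most of the crc32c_test.cc hunks in the diff below hoist loop values out of
braced initializer lists into constexpr size_t arrays. The likely motivation is
that a braced list in a range-for deduces std::initializer_list<int>, so
binding each element to a size_t is a signed-to-unsigned conversion that
-Wsign-conversion can flag. A minimal sketch of the pattern, with illustrative
names (kExtendByValues mirrors the test; Consume and the wrapper functions are
made up):

    // Sketch only: why the tests switched from braced lists to constexpr arrays.
    #include <cstddef>

    size_t Consume(size_t n) { return n; }

    size_t Before() {
      size_t sum = 0;
      // Elements are ints; each conversion to size_t can trip -Wsign-conversion.
      for (const size_t extend_by : {100, 10000, 100000}) {
        sum += Consume(extend_by);
      }
      return sum;
    }

    size_t After() {
      size_t sum = 0;
      // Giving the values an unsigned type up front removes the conversion.
      constexpr size_t kExtendByValues[] = {100, 10000, 100000};
      for (const size_t extend_by : kExtendByValues) {
        sum += Consume(extend_by);
      }
      return sum;
    }

    int main() { return Before() == After() ? 0 : 1; }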
diff --git a/absl/crc/crc32c_test.cc b/absl/crc/crc32c_test.cc index 0b9dc683..72d422a1 100644 --- a/absl/crc/crc32c_test.cc +++ b/absl/crc/crc32c_test.cc @@ -15,6 +15,7 @@ #include "absl/crc/crc32c.h" #include <algorithm> +#include <cstddef> #include <cstdint> #include <cstring> #include <string> @@ -90,7 +91,8 @@ TEST(CRC32C, ExtendByZeroes) { std::string base = "hello world"; absl::crc32c_t base_crc = absl::crc32c_t{0xc99465aa}; - for (const size_t extend_by : {100, 10000, 100000}) { + constexpr size_t kExtendByValues[] = {100, 10000, 100000}; + for (const size_t extend_by : kExtendByValues) { SCOPED_TRACE(extend_by); absl::crc32c_t crc2 = absl::ExtendCrc32cByZeroes(base_crc, extend_by); EXPECT_EQ(crc2, absl::ComputeCrc32c(base + std::string(extend_by, '\0'))); @@ -98,10 +100,13 @@ TEST(CRC32C, ExtendByZeroes) { } TEST(CRC32C, UnextendByZeroes) { + constexpr size_t kExtendByValues[] = {2, 200, 20000, 200000, 20000000}; + constexpr size_t kUnextendByValues[] = {0, 100, 10000, 100000, 10000000}; + for (auto seed_crc : {absl::crc32c_t{0}, absl::crc32c_t{0xc99465aa}}) { SCOPED_TRACE(seed_crc); - for (const size_t size_1 : {2, 200, 20000, 200000, 20000000}) { - for (const size_t size_2 : {0, 100, 10000, 100000, 10000000}) { + for (const size_t size_1 : kExtendByValues) { + for (const size_t size_2 : kUnextendByValues) { size_t extend_size = std::max(size_1, size_2); size_t unextend_size = std::min(size_1, size_2); SCOPED_TRACE(extend_size); @@ -120,7 +125,9 @@ TEST(CRC32C, UnextendByZeroes) { } } } - for (const size_t size : {0, 1, 100, 10000}) { + + constexpr size_t kSizes[] = {0, 1, 100, 10000}; + for (const size_t size : kSizes) { SCOPED_TRACE(size); std::string string_before = TestString(size); std::string string_after = string_before + std::string(size, '\0'); @@ -146,7 +153,8 @@ TEST(CRC32C, Concat) { } TEST(CRC32C, Memcpy) { - for (size_t bytes : {0, 1, 20, 500, 100000}) { + constexpr size_t kBytesSize[] = {0, 1, 20, 500, 100000}; + for (size_t bytes : kBytesSize) { SCOPED_TRACE(bytes); std::string sample_string = TestString(bytes); std::string target_buffer = std::string(bytes, '\0'); diff --git a/absl/crc/internal/crc32_x86_arm_combined_simd.h b/absl/crc/internal/crc32_x86_arm_combined_simd.h index 8f15ded3..f6c2a21c 100644 --- a/absl/crc/internal/crc32_x86_arm_combined_simd.h +++ b/absl/crc/internal/crc32_x86_arm_combined_simd.h @@ -129,7 +129,7 @@ inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) { } inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) { - return _mm_crc32_u64(crc, v); + return static_cast<uint32_t>(_mm_crc32_u64(crc, v)); } inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); } @@ -157,7 +157,7 @@ inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); } inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); } inline V128 V128_From2x64(const uint64_t l, const uint64_t r) { - return _mm_set_epi64x(l, r); + return _mm_set_epi64x(static_cast<int64_t>(l), static_cast<int64_t>(r)); } template <int imm> diff --git a/absl/crc/internal/crc32c_inline.h b/absl/crc/internal/crc32c_inline.h index 43ad14f4..6236c10b 100644 --- a/absl/crc/internal/crc32c_inline.h +++ b/absl/crc/internal/crc32c_inline.h @@ -34,7 +34,7 @@ inline bool ExtendCrc32cInline(uint32_t* crc, const char* p, size_t n) { constexpr uint32_t kCrc32Xor = 0xffffffffU; *crc ^= kCrc32Xor; if (n & 1) { - *crc = CRC32_u8(*crc, *p); + *crc = CRC32_u8(*crc, static_cast<uint8_t>(*p)); n--; p++; } diff --git a/absl/crc/internal/crc_memcpy_x86_64.cc 
b/absl/crc/internal/crc_memcpy_x86_64.cc index 1c3299ee..a148fe17 100644 --- a/absl/crc/internal/crc_memcpy_x86_64.cc +++ b/absl/crc/internal/crc_memcpy_x86_64.cc @@ -82,21 +82,18 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length, return crc32c_t{crc_uint32}; } -constexpr int kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t); +constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t); // Common function for copying the tails of multiple large regions. -template <int vec_regions, int int_regions> +template <size_t vec_regions, size_t int_regions> inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, size_t region_size, size_t copy_rounds) { std::array<__m128i, vec_regions> data; std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data; while (copy_rounds > 0) { -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int i = 0; i < vec_regions; i++) { - int region = i; + for (size_t i = 0; i < vec_regions; i++) { + size_t region = i; auto* vsrc = reinterpret_cast<const __m128i*>(*src + region_size * region); @@ -109,27 +106,23 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, _mm_store_si128(vdst, data[i]); // Compute the running CRC - crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( - static_cast<uint32_t>(crcs[region]), _mm_extract_epi64(data[i], 0)))}; - crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( - static_cast<uint32_t>(crcs[region]), _mm_extract_epi64(data[i], 1)))}; + crcs[region] = crc32c_t{static_cast<uint32_t>( + _mm_crc32_u64(static_cast<uint32_t>(crcs[region]), + static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))}; + crcs[region] = crc32c_t{static_cast<uint32_t>( + _mm_crc32_u64(static_cast<uint32_t>(crcs[region]), + static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))}; } -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int i = 0; i < int_regions; i++) { - int region = vec_regions + i; + for (size_t i = 0; i < int_regions; i++) { + size_t region = vec_regions + i; auto* usrc = reinterpret_cast<const uint64_t*>(*src + region_size * region); auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region); -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int j = 0; j < kIntLoadsPerVec; j++) { - int data_index = i * kIntLoadsPerVec + j; + for (size_t j = 0; j < kIntLoadsPerVec; j++) { + size_t data_index = i * kIntLoadsPerVec + j; int_data[data_index] = *(usrc + j); crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( @@ -148,7 +141,7 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, } // namespace -template <int vec_regions, int int_regions> +template <size_t vec_regions, size_t int_regions> class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine { public: AcceleratedCrcMemcpyEngine() = default; @@ -160,12 +153,12 @@ class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine { std::size_t length, crc32c_t initial_crc) const override; }; -template <int vec_regions, int int_regions> +template <size_t vec_regions, size_t int_regions> crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( void* __restrict dst, const void* __restrict src, std::size_t length, crc32c_t initial_crc) const { constexpr std::size_t kRegions = vec_regions + int_regions; - constexpr crc32c_t kCrcDataXor = crc32c_t{0xffffffff}; + constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff}; constexpr std::size_t kBlockSize = sizeof(__m128i); constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize; @@ -201,7 
+194,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( // Start work on the CRC: undo the XOR from the previous calculation or set up // the initial value of the CRC. // initial_crc ^= kCrcDataXor; - initial_crc = initial_crc ^ kCrcDataXor; + initial_crc = crc32c_t{static_cast<uint32_t>(initial_crc) ^ kCrcDataXor}; // Do an initial alignment copy, so we can use aligned store instructions to // the destination pointer. We align the destination pointer because the @@ -229,13 +222,13 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( // Initialize CRCs for kRegions regions. crc32c_t crcs[kRegions]; crcs[0] = initial_crc; - for (int i = 1; i < kRegions; i++) { - crcs[i] = kCrcDataXor; + for (size_t i = 1; i < kRegions; i++) { + crcs[i] = crc32c_t{kCrcDataXor}; } // Find the number of rounds to copy and the region size. Also compute the // tail size here. - int64_t copy_rounds = length / kCopyRoundSize; + size_t copy_rounds = length / kCopyRoundSize; // Find the size of each region and the size of the tail. const std::size_t region_size = copy_rounds * kBlockSize; @@ -248,10 +241,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( // Main loop. while (copy_rounds > kBlocksPerCacheLine) { // Prefetch kPrefetchAhead bytes ahead of each pointer. -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int i = 0; i < kRegions; i++) { + for (size_t i = 0; i < kRegions; i++) { absl::base_internal::PrefetchT0(src_bytes + kPrefetchAhead + region_size * i); absl::base_internal::PrefetchT0(dst_bytes + kPrefetchAhead + @@ -259,58 +249,46 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( } // Load and store data, computing CRC on the way. -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int i = 0; i < kBlocksPerCacheLine; i++) { + for (size_t i = 0; i < kBlocksPerCacheLine; i++) { // Copy and CRC the data for the CRC regions. -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int j = 0; j < vec_regions; j++) { + for (size_t j = 0; j < vec_regions; j++) { // Cycle which regions get vector load/store and integer load/store, to // engage prefetching logic around vector load/stores and save issue // slots by using the integer registers. - int region = (j + i) % kRegions; + size_t region = (j + i) % kRegions; - auto* src = + auto* vsrc = reinterpret_cast<const __m128i*>(src_bytes + region_size * region); - auto* dst = + auto* vdst = reinterpret_cast<__m128i*>(dst_bytes + region_size * region); // Load and CRC data. - vec_data[j] = _mm_loadu_si128(src + i); - crcs[region] = crc32c_t{static_cast<uint32_t>( - _mm_crc32_u64(static_cast<uint32_t>(crcs[region]), - _mm_extract_epi64(vec_data[j], 0)))}; - crcs[region] = crc32c_t{static_cast<uint32_t>( - _mm_crc32_u64(static_cast<uint32_t>(crcs[region]), - _mm_extract_epi64(vec_data[j], 1)))}; + vec_data[j] = _mm_loadu_si128(vsrc + i); + crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( + static_cast<uint32_t>(crcs[region]), + static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))}; + crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( + static_cast<uint32_t>(crcs[region]), + static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))}; // Store the data. - _mm_store_si128(dst + i, vec_data[j]); + _mm_store_si128(vdst + i, vec_data[j]); } // Preload the partial CRCs for the CLMUL subregions. 
-#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int j = 0; j < int_regions; j++) { + for (size_t j = 0; j < int_regions; j++) { // Cycle which regions get vector load/store and integer load/store, to // engage prefetching logic around vector load/stores and save issue // slots by using the integer registers. - int region = (j + vec_regions + i) % kRegions; + size_t region = (j + vec_regions + i) % kRegions; auto* usrc = reinterpret_cast<const uint64_t*>(src_bytes + region_size * region); auto* udst = reinterpret_cast<uint64_t*>(dst_bytes + region_size * region); -#ifdef __GNUC__ -#pragma unroll_completely -#endif - for (int k = 0; k < kIntLoadsPerVec; k++) { - int data_index = j * kIntLoadsPerVec + k; + for (size_t k = 0; k < kIntLoadsPerVec; k++) { + size_t data_index = j * kIntLoadsPerVec + k; // Load and CRC the data. int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k); @@ -339,13 +317,13 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( // Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the // XOR done before doing block copy + CRCs. - for (int i = 0; i < kRegions - 1; i++) { - crcs[i] = crcs[i] ^ kCrcDataXor; + for (size_t i = 0; i + 1 < kRegions; i++) { + crcs[i] = crc32c_t{static_cast<uint32_t>(crcs[i]) ^ kCrcDataXor}; } // Build a CRC of the first kRegions - 1 regions. crc32c_t full_crc = crcs[0]; - for (int i = 1; i < kRegions - 1; i++) { + for (size_t i = 1; i + 1 < kRegions; i++) { full_crc = ConcatCrc32c(full_crc, crcs[i], region_size); } @@ -360,7 +338,8 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( crcs[kRegions - 1]); // Finalize and concatenate the final CRC, then return. - crcs[kRegions - 1] = crcs[kRegions - 1] ^ kCrcDataXor; + crcs[kRegions - 1] = + crc32c_t{static_cast<uint32_t>(crcs[kRegions - 1]) ^ kCrcDataXor}; return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size); } diff --git a/absl/crc/internal/crc_x86_arm_combined.cc b/absl/crc/internal/crc_x86_arm_combined.cc index d771a786..2112f609 100644 --- a/absl/crc/internal/crc_x86_arm_combined.cc +++ b/absl/crc/internal/crc_x86_arm_combined.cc @@ -14,12 +14,11 @@ // Hardware accelerated CRC32 computation on Intel and ARM architecture. -#include <stddef.h> - +#include <cstddef> #include <cstdint> #include "absl/base/attributes.h" -#include "absl/base/call_once.h" +#include "absl/base/config.h" #include "absl/base/dynamic_annotations.h" #include "absl/base/internal/endian.h" #include "absl/base/internal/prefetch.h" @@ -67,27 +66,30 @@ class CRC32AcceleratedX86ARMCombined : public CRC32 { // Constants for switching between algorithms. // Chosen by comparing speed at different powers of 2. 
-constexpr int kSmallCutoff = 256; -constexpr int kMediumCutoff = 2048; +constexpr size_t kSmallCutoff = 256; +constexpr size_t kMediumCutoff = 2048; -#define ABSL_INTERNAL_STEP1(crc) \ - do { \ - crc = CRC32_u8(crc, *p++); \ +#define ABSL_INTERNAL_STEP1(crc) \ + do { \ + crc = CRC32_u8(static_cast<uint32_t>(crc), *p++); \ } while (0) -#define ABSL_INTERNAL_STEP2(crc) \ - do { \ - crc = CRC32_u16(crc, absl::little_endian::Load16(p)); \ - p += 2; \ +#define ABSL_INTERNAL_STEP2(crc) \ + do { \ + crc = \ + CRC32_u16(static_cast<uint32_t>(crc), absl::little_endian::Load16(p)); \ + p += 2; \ } while (0) -#define ABSL_INTERNAL_STEP4(crc) \ - do { \ - crc = CRC32_u32(crc, absl::little_endian::Load32(p)); \ - p += 4; \ +#define ABSL_INTERNAL_STEP4(crc) \ + do { \ + crc = \ + CRC32_u32(static_cast<uint32_t>(crc), absl::little_endian::Load32(p)); \ + p += 4; \ } while (0) -#define ABSL_INTERNAL_STEP8(crc, data) \ - do { \ - crc = CRC32_u64(crc, absl::little_endian::Load64(data)); \ - data += 8; \ +#define ABSL_INTERNAL_STEP8(crc, data) \ + do { \ + crc = CRC32_u64(static_cast<uint32_t>(crc), \ + absl::little_endian::Load64(data)); \ + data += 8; \ } while (0) #define ABSL_INTERNAL_STEP8BY2(crc0, crc1, p0, p1) \ do { \ @@ -101,6 +103,8 @@ constexpr int kMediumCutoff = 2048; ABSL_INTERNAL_STEP8(crc2, p2); \ } while (0) +namespace { + uint32_t multiply(uint32_t a, uint32_t b) { V128 shifts = V128_From2x64(0, 1); V128 power = V128_From2x64(0, a); @@ -109,11 +113,10 @@ uint32_t multiply(uint32_t a, uint32_t b) { // Combine crc values res = V128_ShiftLeft64(res, shifts); - return V128_Extract32<1>(res) ^ CRC32_u32(0, V128_Low64(res)); + return static_cast<uint32_t>(V128_Extract32<1>(res)) ^ + CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res))); } -namespace { - // Powers of crc32c polynomial, for faster ExtendByZeros. // Verified against folly: // folly/hash/detail/Crc32CombineDetail.cpp @@ -204,9 +207,167 @@ enum class CutoffStrategy { Unroll64CRC, }; -template <int num_crc_streams, int num_pclmul_streams, CutoffStrategy strategy> -class CRC32AcceleratedX86ARMCombinedMultipleStreams +// Base class for CRC32AcceleratedX86ARMCombinedMultipleStreams containing the +// methods and data that don't need the template arguments. +class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase : public CRC32AcceleratedX86ARMCombined { + protected: + // Update partialCRC with crc of 64 byte block. Calling FinalizePclmulStream + // would produce a single crc checksum, but it is expensive. PCLMULQDQ has a + // high latency, so we run 4 128-bit partial checksums that can be reduced to + // a single value by FinalizePclmulStream later. Computing crc for arbitrary + // polynomialas with PCLMULQDQ is described in Intel paper "Fast CRC + // Computation for Generic Polynomials Using PCLMULQDQ Instruction" + // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf + // We are applying it to CRC32C polynomial. 
+ ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul( + const uint8_t* p, V128* partialCRC) const { + V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2)); + + V128 partialCRC1 = partialCRC[0]; + V128 partialCRC2 = partialCRC[1]; + V128 partialCRC3 = partialCRC[2]; + V128 partialCRC4 = partialCRC[3]; + + V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands); + V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands); + V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands); + V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands); + V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0)); + V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1)); + V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2)); + V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3)); + partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands); + partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands); + partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands); + partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands); + partialCRC1 = V128_Xor(tmp1, partialCRC1); + partialCRC2 = V128_Xor(tmp2, partialCRC2); + partialCRC3 = V128_Xor(tmp3, partialCRC3); + partialCRC4 = V128_Xor(tmp4, partialCRC4); + partialCRC1 = V128_Xor(partialCRC1, data1); + partialCRC2 = V128_Xor(partialCRC2, data2); + partialCRC3 = V128_Xor(partialCRC3, data3); + partialCRC4 = V128_Xor(partialCRC4, data4); + partialCRC[0] = partialCRC1; + partialCRC[1] = partialCRC2; + partialCRC[2] = partialCRC3; + partialCRC[3] = partialCRC4; + } + + // Reduce partialCRC produced by Process64BytesPclmul into a single value, + // that represents crc checksum of all the processed bytes. + ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t + FinalizePclmulStream(V128* partialCRC) const { + V128 partialCRC1 = partialCRC[0]; + V128 partialCRC2 = partialCRC[1]; + V128 partialCRC3 = partialCRC[2]; + V128 partialCRC4 = partialCRC[3]; + + // Combine 4 vectors of partial crc into a single vector. + V128 reductionMultiplicands = + V128_Load(reinterpret_cast<const V128*>(k5k6)); + + V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1); + V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1); + + partialCRC1 = V128_Xor(low, high); + partialCRC1 = V128_Xor(partialCRC1, partialCRC2); + + low = V128_PMulLow(reductionMultiplicands, partialCRC3); + high = V128_PMulHi(reductionMultiplicands, partialCRC3); + + partialCRC3 = V128_Xor(low, high); + partialCRC3 = V128_Xor(partialCRC3, partialCRC4); + + reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4)); + + low = V128_PMulLow(reductionMultiplicands, partialCRC1); + high = V128_PMulHi(reductionMultiplicands, partialCRC1); + V128 fullCRC = V128_Xor(low, high); + fullCRC = V128_Xor(fullCRC, partialCRC3); + + // Reduce fullCRC into scalar value. 
+ reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6)); + + V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask)); + + V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC); + fullCRC = V128_ShiftRight<8>(fullCRC); + fullCRC = V128_Xor(fullCRC, tmp); + + reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0)); + + tmp = V128_ShiftRight<4>(fullCRC); + fullCRC = V128_And(fullCRC, mask); + fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC); + fullCRC = V128_Xor(tmp, fullCRC); + + reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly)); + + tmp = V128_And(fullCRC, mask); + tmp = V128_PMul01(reductionMultiplicands, tmp); + tmp = V128_And(tmp, mask); + tmp = V128_PMulLow(reductionMultiplicands, tmp); + + fullCRC = V128_Xor(tmp, fullCRC); + + return static_cast<uint64_t>(V128_Extract32<1>(fullCRC)); + } + + // Update crc with 64 bytes of data from p. + ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p, + uint64_t crc) const { + for (int i = 0; i < 8; i++) { + crc = + CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p)); + p += 8; + } + return crc; + } + + // Generated by crc32c_x86_test --crc32c_generate_constants=true + // and verified against constants in linux kernel for S390: + // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S + alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8}; + alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e}; + alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6}; + alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000}; + alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1}; + alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u}; + + // Medium runs of bytes are broken into groups of kGroupsSmall blocks of same + // size. Each group is CRCed in parallel then combined at the end of the + // block. + static constexpr size_t kGroupsSmall = 3; + // For large runs we use up to kMaxStreams blocks computed with CRC + // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which + // are combined in the end. 
+ static constexpr size_t kMaxStreams = 3; +}; + +#ifdef ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL +alignas(16) constexpr uint64_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k1k2[2]; +alignas(16) constexpr uint64_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k3k4[2]; +alignas(16) constexpr uint64_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k5k6[2]; +alignas(16) constexpr uint64_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k7k0[2]; +alignas(16) constexpr uint64_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kPoly[2]; +alignas(16) constexpr uint32_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMask[4]; +constexpr size_t + CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kGroupsSmall; +constexpr size_t CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMaxStreams; +#endif // ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL + +template <size_t num_crc_streams, size_t num_pclmul_streams, + CutoffStrategy strategy> +class CRC32AcceleratedX86ARMCombinedMultipleStreams + : public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase { ABSL_ATTRIBUTE_HOT void Extend(uint32_t* crc, const void* bytes, size_t length) const override { static_assert(num_crc_streams >= 1 && num_crc_streams <= kMaxStreams, @@ -221,19 +382,19 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams // We have dedicated instruction for 1,2,4 and 8 bytes. if (length & 8) { ABSL_INTERNAL_STEP8(l, p); - length &= ~8LL; + length &= ~size_t{8}; } if (length & 4) { ABSL_INTERNAL_STEP4(l); - length &= ~4LL; + length &= ~size_t{4}; } if (length & 2) { ABSL_INTERNAL_STEP2(l); - length &= ~2LL; + length &= ~size_t{2}; } if (length & 1) { ABSL_INTERNAL_STEP1(l); - length &= ~1LL; + length &= ~size_t{1}; } if (length == 0) { *crc = l; @@ -263,12 +424,12 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams if (strategy == CutoffStrategy::Fold3) { uint64_t l641 = 0; uint64_t l642 = 0; - const int blockSize = 32; - int64_t bs = (e - p) / kGroupsSmall / blockSize; + const size_t blockSize = 32; + size_t bs = static_cast<size_t>(e - p) / kGroupsSmall / blockSize; const uint8_t* p1 = p + bs * blockSize; const uint8_t* p2 = p1 + bs * blockSize; - for (int64_t i = 0; i < bs - 1; ++i) { + for (size_t i = 0; i + 1 < bs; ++i) { ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2); ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2); ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2); @@ -290,8 +451,9 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams V128 res2 = V128_PMul10(tmp, magic); V128 x = V128_Xor(res1, res2); - l64 = V128_Low64(x) ^ absl::little_endian::Load64(p2); - l64 = CRC32_u64(l642, l64); + l64 = static_cast<uint64_t>(V128_Low64(x)) ^ + absl::little_endian::Load64(p2); + l64 = CRC32_u64(static_cast<uint32_t>(l642), l64); p = p2 + 8; } else if (strategy == CutoffStrategy::Unroll64CRC) { @@ -314,16 +476,17 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams ABSL_INTERNAL_STEP1(l); } - int64_t bs = (e - p) / (num_crc_streams + num_pclmul_streams) / 64; + size_t bs = static_cast<size_t>(e - p) / + (num_crc_streams + num_pclmul_streams) / 64; const uint8_t* crc_streams[kMaxStreams]; const uint8_t* pclmul_streams[kMaxStreams]; // We are guaranteed to have at least one crc stream. 
crc_streams[0] = p; - for (int i = 1; i < num_crc_streams; i++) { + for (size_t i = 1; i < num_crc_streams; i++) { crc_streams[i] = crc_streams[i - 1] + bs * 64; } pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64; - for (int i = 1; i < num_pclmul_streams; i++) { + for (size_t i = 1; i < num_pclmul_streams; i++) { pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64; } @@ -332,13 +495,13 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams uint64_t l64_pclmul[kMaxStreams] = {0}; // Peel first iteration, because PCLMULQDQ stream, needs setup. - for (int i = 0; i < num_crc_streams; i++) { + for (size_t i = 0; i < num_crc_streams; i++) { l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]); crc_streams[i] += 16 * 4; } V128 partialCRC[kMaxStreams][4]; - for (int i = 0; i < num_pclmul_streams; i++) { + for (size_t i = 0; i < num_pclmul_streams; i++) { partialCRC[i][0] = V128_LoadU( reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 0)); partialCRC[i][1] = V128_LoadU( @@ -350,13 +513,13 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams pclmul_streams[i] += 16 * 4; } - for (int64_t i = 1; i < bs; i++) { + for (size_t i = 1; i < bs; i++) { // Prefetch data for next itterations. - for (int j = 0; j < num_crc_streams; j++) { + for (size_t j = 0; j < num_crc_streams; j++) { base_internal::PrefetchT0( reinterpret_cast<const char*>(crc_streams[j] + kPrefetchHorizon)); } - for (int j = 0; j < num_pclmul_streams; j++) { + for (size_t j = 0; j < num_pclmul_streams; j++) { base_internal::PrefetchT0(reinterpret_cast<const char*>( pclmul_streams[j] + kPrefetchHorizon)); } @@ -398,19 +561,19 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams // PCLMULQDQ based streams require special final step; // CRC based don't. - for (int i = 0; i < num_pclmul_streams; i++) { + for (size_t i = 0; i < num_pclmul_streams; i++) { l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]); } // Combine all streams into single result. uint32_t magic = ComputeZeroConstant(bs * 64); l64 = l64_crc[0]; - for (int i = 1; i < num_crc_streams; i++) { - l64 = multiply(l64, magic); + for (size_t i = 1; i < num_crc_streams; i++) { + l64 = multiply(static_cast<uint32_t>(l64), magic); l64 ^= l64_crc[i]; } - for (int i = 0; i < num_pclmul_streams; i++) { - l64 = multiply(l64, magic); + for (size_t i = 0; i < num_pclmul_streams; i++) { + l64 = multiply(static_cast<uint32_t>(l64), magic); l64 ^= l64_pclmul[i]; } @@ -421,7 +584,7 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams p = crc_streams[num_crc_streams - 1]; } } - l = l64; + l = static_cast<uint32_t>(l64); while ((e - p) >= 16) { ABSL_INTERNAL_STEP8(l, p); @@ -441,139 +604,6 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams *crc = l; } - - private: - // Update partialCRC with crc of 64 byte block. Calling FinalizePclmulStream - // would produce a single crc checksum, but it is expensive. PCLMULQDQ has a - // high latency, so we run 4 128-bit partial checksums that can be reduced to - // a single value by FinalizePclmulStream later. Computing crc for arbitrary - // polynomialas with PCLMULQDQ is described in Intel paper "Fast CRC - // Computation for Generic Polynomials Using PCLMULQDQ Instruction" - // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf - // We are applying it to CRC32C polynomial. 
- ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul( - const uint8_t* p, V128* partialCRC) const { - V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2)); - - V128 partialCRC1 = partialCRC[0]; - V128 partialCRC2 = partialCRC[1]; - V128 partialCRC3 = partialCRC[2]; - V128 partialCRC4 = partialCRC[3]; - - V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands); - V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands); - V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands); - V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands); - V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0)); - V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1)); - V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2)); - V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3)); - partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands); - partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands); - partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands); - partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands); - partialCRC1 = V128_Xor(tmp1, partialCRC1); - partialCRC2 = V128_Xor(tmp2, partialCRC2); - partialCRC3 = V128_Xor(tmp3, partialCRC3); - partialCRC4 = V128_Xor(tmp4, partialCRC4); - partialCRC1 = V128_Xor(partialCRC1, data1); - partialCRC2 = V128_Xor(partialCRC2, data2); - partialCRC3 = V128_Xor(partialCRC3, data3); - partialCRC4 = V128_Xor(partialCRC4, data4); - partialCRC[0] = partialCRC1; - partialCRC[1] = partialCRC2; - partialCRC[2] = partialCRC3; - partialCRC[3] = partialCRC4; - } - - // Reduce partialCRC produced by Process64BytesPclmul into a single value, - // that represents crc checksum of all the processed bytes. - ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t - FinalizePclmulStream(V128* partialCRC) const { - V128 partialCRC1 = partialCRC[0]; - V128 partialCRC2 = partialCRC[1]; - V128 partialCRC3 = partialCRC[2]; - V128 partialCRC4 = partialCRC[3]; - - // Combine 4 vectors of partial crc into a single vector. - V128 reductionMultiplicands = - V128_Load(reinterpret_cast<const V128*>(k5k6)); - - V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1); - V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1); - - partialCRC1 = V128_Xor(low, high); - partialCRC1 = V128_Xor(partialCRC1, partialCRC2); - - low = V128_PMulLow(reductionMultiplicands, partialCRC3); - high = V128_PMulHi(reductionMultiplicands, partialCRC3); - - partialCRC3 = V128_Xor(low, high); - partialCRC3 = V128_Xor(partialCRC3, partialCRC4); - - reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4)); - - low = V128_PMulLow(reductionMultiplicands, partialCRC1); - high = V128_PMulHi(reductionMultiplicands, partialCRC1); - V128 fullCRC = V128_Xor(low, high); - fullCRC = V128_Xor(fullCRC, partialCRC3); - - // Reduce fullCRC into scalar value. 
- reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6)); - - V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask)); - - V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC); - fullCRC = V128_ShiftRight<8>(fullCRC); - fullCRC = V128_Xor(fullCRC, tmp); - - reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0)); - - tmp = V128_ShiftRight<4>(fullCRC); - fullCRC = V128_And(fullCRC, mask); - fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC); - fullCRC = V128_Xor(tmp, fullCRC); - - reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly)); - - tmp = V128_And(fullCRC, mask); - tmp = V128_PMul01(reductionMultiplicands, tmp); - tmp = V128_And(tmp, mask); - tmp = V128_PMulLow(reductionMultiplicands, tmp); - - fullCRC = V128_Xor(tmp, fullCRC); - - return V128_Extract32<1>(fullCRC); - } - - // Update crc with 64 bytes of data from p. - ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p, - uint64_t crc) const { - for (int i = 0; i < 8; i++) { - crc = CRC32_u64(crc, absl::little_endian::Load64(p)); - p += 8; - } - return crc; - } - - // Generated by crc32c_x86_test --crc32c_generate_constants=true - // and verified against constants in linux kernel for S390: - // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S - alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8}; - alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e}; - alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6}; - alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000}; - alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1}; - alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u}; - - // Medium runs of bytes are broken into groups of kGroupsSmall blocks of same - // size. Each group is CRCed in parallel then combined at the end of the - // block. - static constexpr int kGroupsSmall = 3; - // For large runs we use up to kMaxStreams blocks computed with CRC - // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which - // are combined in the end. 
- static constexpr int kMaxStreams = 3; }; } // namespace diff --git a/absl/crc/internal/non_temporal_memcpy.h b/absl/crc/internal/non_temporal_memcpy.h index 092c6078..6cdd3197 100644 --- a/absl/crc/internal/non_temporal_memcpy.h +++ b/absl/crc/internal/non_temporal_memcpy.h @@ -74,7 +74,7 @@ inline void *non_temporal_store_memcpy(void *__restrict dst, uintptr_t bytes_before_alignment_boundary = kCacheLineSize - (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1)); - int header_len = (std::min)(bytes_before_alignment_boundary, len); + size_t header_len = (std::min)(bytes_before_alignment_boundary, len); assert(bytes_before_alignment_boundary < kCacheLineSize); memcpy(d, s, header_len); d += header_len; @@ -87,7 +87,7 @@ inline void *non_temporal_store_memcpy(void *__restrict dst, __m128i *dst_cacheline = reinterpret_cast<__m128i *>(d); const __m128i *src_cacheline = reinterpret_cast<const __m128i *>(s); constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m128i); - uint64_t loops = len / kCacheLineSize; + size_t loops = len / kCacheLineSize; while (len >= kCacheLineSize) { __m128i temp1, temp2, temp3, temp4; @@ -132,7 +132,7 @@ inline void *non_temporal_store_memcpy_avx(void *__restrict dst, uintptr_t bytes_before_alignment_boundary = kCacheLineSize - (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1)); - int header_len = (std::min)(bytes_before_alignment_boundary, len); + size_t header_len = (std::min)(bytes_before_alignment_boundary, len); assert(bytes_before_alignment_boundary < kCacheLineSize); memcpy(d, s, header_len); d += header_len; @@ -145,7 +145,7 @@ inline void *non_temporal_store_memcpy_avx(void *__restrict dst, __m256i *dst_cacheline = reinterpret_cast<__m256i *>(d); const __m256i *src_cacheline = reinterpret_cast<const __m256i *>(s); constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m256i); - int loops = len / kCacheLineSize; + size_t loops = len / kCacheLineSize; while (len >= kCacheLineSize) { __m256i temp1, temp2; |