// Copyright 2022 The Abseil Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Simultaneous memcopy and CRC-32C for x86-64. Uses integer registers because // XMM registers do not support the CRC instruction (yet). While copying, // compute the running CRC of the data being copied. // // It is assumed that any CPU running this code has SSE4.2 instructions // available (for CRC32C). This file will do nothing if that is not true. // // The CRC instruction has a 3-byte latency, and we are stressing the ALU ports // here (unlike a traditional memcopy, which has almost no ALU use), so we will // need to copy in such a way that the CRC unit is used efficiently. We have two // regimes in this code: // 1. For operations of size < kCrcSmallSize, do the CRC then the memcpy // 2. For operations of size > kCrcSmallSize: // a) compute an initial CRC + copy on a small amount of data to align the // destination pointer on a 16-byte boundary. // b) Split the data into 3 main regions and a tail (smaller than 48 bytes) // c) Do the copy and CRC of the 3 main regions, interleaving (start with // full cache line copies for each region, then move to single 16 byte // pieces per region). // d) Combine the CRCs with CRC32C::Concat. // e) Copy the tail and extend the CRC with the CRC of the tail. // This method is not ideal for op sizes between ~1k and ~8k because CRC::Concat // takes a significant amount of time. A medium-sized approach could be added // using 3 CRCs over fixed-size blocks where the zero-extensions required for // CRC32C::Concat can be precomputed. #ifdef __SSE4_2__ #include #endif #ifdef _MSC_VER #include #endif #include #include #include #include #include #include "absl/base/config.h" #include "absl/base/optimization.h" #include "absl/base/prefetch.h" #include "absl/crc/crc32c.h" #include "absl/crc/internal/cpu_detect.h" #include "absl/crc/internal/crc_memcpy.h" #include "absl/strings/string_view.h" #ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE namespace absl { ABSL_NAMESPACE_BEGIN namespace crc_internal { namespace { inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length, crc32c_t crc) { // Small copy: just go 1 byte at a time: being nice to the branch predictor // is more important here than anything else uint32_t crc_uint32 = static_cast(crc); for (std::size_t i = 0; i < length; i++) { uint8_t data = *reinterpret_cast(src); crc_uint32 = _mm_crc32_u8(crc_uint32, data); *reinterpret_cast(dst) = data; ++src; ++dst; } return crc32c_t{crc_uint32}; } constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t); // Common function for copying the tails of multiple large regions. template inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, size_t region_size, size_t copy_rounds) { std::array<__m128i, vec_regions> data; std::array int_data; while (copy_rounds > 0) { for (size_t i = 0; i < vec_regions; i++) { size_t region = i; auto* vsrc = reinterpret_cast(*src + region_size * region); auto* vdst = reinterpret_cast<__m128i*>(*dst + region_size * region); // Load the blocks, unaligned data[i] = _mm_loadu_si128(vsrc); // Store the blocks, aligned _mm_store_si128(vdst, data[i]); // Compute the running CRC crcs[region] = crc32c_t{static_cast( _mm_crc32_u64(static_cast(crcs[region]), static_cast(_mm_extract_epi64(data[i], 0))))}; crcs[region] = crc32c_t{static_cast( _mm_crc32_u64(static_cast(crcs[region]), static_cast(_mm_extract_epi64(data[i], 1))))}; } for (size_t i = 0; i < int_regions; i++) { size_t region = vec_regions + i; auto* usrc = reinterpret_cast(*src + region_size * region); auto* udst = reinterpret_cast(*dst + region_size * region); for (size_t j = 0; j < kIntLoadsPerVec; j++) { size_t data_index = i * kIntLoadsPerVec + j; int_data[data_index] = *(usrc + j); crcs[region] = crc32c_t{static_cast(_mm_crc32_u64( static_cast(crcs[region]), int_data[data_index]))}; *(udst + j) = int_data[data_index]; } } // Increment pointers *src += sizeof(__m128i); *dst += sizeof(__m128i); --copy_rounds; } } } // namespace template class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine { public: AcceleratedCrcMemcpyEngine() = default; AcceleratedCrcMemcpyEngine(const AcceleratedCrcMemcpyEngine&) = delete; AcceleratedCrcMemcpyEngine operator=(const AcceleratedCrcMemcpyEngine&) = delete; crc32c_t Compute(void* __restrict dst, const void* __restrict src, std::size_t length, crc32c_t initial_crc) const override; }; template crc32c_t AcceleratedCrcMemcpyEngine::Compute( void* __restrict dst, const void* __restrict src, std::size_t length, crc32c_t initial_crc) const { constexpr std::size_t kRegions = vec_regions + int_regions; static_assert(kRegions > 0, "Must specify at least one region."); constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff}; constexpr std::size_t kBlockSize = sizeof(__m128i); constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize; // Number of blocks per cacheline. constexpr std::size_t kBlocksPerCacheLine = ABSL_CACHELINE_SIZE / kBlockSize; char* dst_bytes = static_cast(dst); const char* src_bytes = static_cast(src); // Make sure that one prefetch per big block is enough to cover the whole // dataset, and we don't prefetch too much. static_assert(ABSL_CACHELINE_SIZE % kBlockSize == 0, "Cache lines are not divided evenly into blocks, may have " "unintended behavior!"); // Experimentally-determined boundary between a small and large copy. // Below this number, spin-up and concatenation of CRCs takes enough time that // it kills the throughput gains of using 3 regions and wide vectors. constexpr size_t kCrcSmallSize = 256; // Experimentally-determined prefetch distance. Main loop copies will // prefeth data 2 cache lines ahead. constexpr std::size_t kPrefetchAhead = 2 * ABSL_CACHELINE_SIZE; // Small-size CRC-memcpy : just do CRC + memcpy if (length < kCrcSmallSize) { crc32c_t crc = ExtendCrc32c(initial_crc, absl::string_view(src_bytes, length)); memcpy(dst, src, length); return crc; } // Start work on the CRC: undo the XOR from the previous calculation or set up // the initial value of the CRC. // initial_crc ^= kCrcDataXor; initial_crc = crc32c_t{static_cast(initial_crc) ^ kCrcDataXor}; // Do an initial alignment copy, so we can use aligned store instructions to // the destination pointer. We align the destination pointer because the // penalty for an unaligned load is small compared to the penalty of an // unaligned store on modern CPUs. std::size_t bytes_from_last_aligned = reinterpret_cast(dst) & (kBlockSize - 1); if (bytes_from_last_aligned != 0) { std::size_t bytes_for_alignment = kBlockSize - bytes_from_last_aligned; // Do the short-sized copy and CRC. initial_crc = ShortCrcCopy(dst_bytes, src_bytes, bytes_for_alignment, initial_crc); src_bytes += bytes_for_alignment; dst_bytes += bytes_for_alignment; length -= bytes_for_alignment; } // We are going to do the copy and CRC in kRegions regions to make sure that // we can saturate the CRC unit. The CRCs will be combined at the end of the // run. Copying will use the SSE registers, and we will extract words from // the SSE registers to add to the CRC. Initially, we run the loop one full // cache line per region at a time, in order to insert prefetches. // Initialize CRCs for kRegions regions. crc32c_t crcs[kRegions]; crcs[0] = initial_crc; for (size_t i = 1; i < kRegions; i++) { crcs[i] = crc32c_t{kCrcDataXor}; } // Find the number of rounds to copy and the region size. Also compute the // tail size here. size_t copy_rounds = length / kCopyRoundSize; // Find the size of each region and the size of the tail. const std::size_t region_size = copy_rounds * kBlockSize; const std::size_t tail_size = length - (kRegions * region_size); // Holding registers for data in each region. std::array<__m128i, vec_regions> vec_data; std::array int_data; // Main loop. while (copy_rounds > kBlocksPerCacheLine) { // Prefetch kPrefetchAhead bytes ahead of each pointer. for (size_t i = 0; i < kRegions; i++) { absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i); absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i); } // Load and store data, computing CRC on the way. for (size_t i = 0; i < kBlocksPerCacheLine; i++) { // Copy and CRC the data for the CRC regions. for (size_t j = 0; j < vec_regions; j++) { // Cycle which regions get vector load/store and integer load/store, to // engage prefetching logic around vector load/stores and save issue // slots by using the integer registers. size_t region = (j + i) % kRegions; auto* vsrc = reinterpret_cast(src_bytes + region_size * region); auto* vdst = reinterpret_cast<__m128i*>(dst_bytes + region_size * region); // Load and CRC data. vec_data[j] = _mm_loadu_si128(vsrc + i); crcs[region] = crc32c_t{static_cast(_mm_crc32_u64( static_cast(crcs[region]), static_cast(_mm_extract_epi64(vec_data[j], 0))))}; crcs[region] = crc32c_t{static_cast(_mm_crc32_u64( static_cast(crcs[region]), static_cast(_mm_extract_epi64(vec_data[j], 1))))}; // Store the data. _mm_store_si128(vdst + i, vec_data[j]); } // Preload the partial CRCs for the CLMUL subregions. for (size_t j = 0; j < int_regions; j++) { // Cycle which regions get vector load/store and integer load/store, to // engage prefetching logic around vector load/stores and save issue // slots by using the integer registers. size_t region = (j + vec_regions + i) % kRegions; auto* usrc = reinterpret_cast(src_bytes + region_size * region); auto* udst = reinterpret_cast(dst_bytes + region_size * region); for (size_t k = 0; k < kIntLoadsPerVec; k++) { size_t data_index = j * kIntLoadsPerVec + k; // Load and CRC the data. int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k); crcs[region] = crc32c_t{static_cast(_mm_crc32_u64( static_cast(crcs[region]), int_data[data_index]))}; // Store the data. *(udst + i * kIntLoadsPerVec + k) = int_data[data_index]; } } } // Increment pointers src_bytes += kBlockSize * kBlocksPerCacheLine; dst_bytes += kBlockSize * kBlocksPerCacheLine; copy_rounds -= kBlocksPerCacheLine; } // Copy and CRC the tails of each region. LargeTailCopy(crcs, &dst_bytes, &src_bytes, region_size, copy_rounds); // Move the source and destination pointers to the end of the region src_bytes += region_size * (kRegions - 1); dst_bytes += region_size * (kRegions - 1); // Copy and CRC the tail through the XMM registers. std::size_t tail_blocks = tail_size / kBlockSize; LargeTailCopy<0, 1>(&crcs[kRegions - 1], &dst_bytes, &src_bytes, 0, tail_blocks); // Final tail copy for under 16 bytes. crcs[kRegions - 1] = ShortCrcCopy(dst_bytes, src_bytes, tail_size - tail_blocks * kBlockSize, crcs[kRegions - 1]); if (kRegions == 1) { // If there is only one region, finalize and return its CRC. return crc32c_t{static_cast(crcs[0]) ^ kCrcDataXor}; } // Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the // XOR done before doing block copy + CRCs. for (size_t i = 0; i + 1 < kRegions; i++) { crcs[i] = crc32c_t{static_cast(crcs[i]) ^ kCrcDataXor}; } // Build a CRC of the first kRegions - 1 regions. crc32c_t full_crc = crcs[0]; for (size_t i = 1; i + 1 < kRegions; i++) { full_crc = ConcatCrc32c(full_crc, crcs[i], region_size); } // Finalize and concatenate the final CRC, then return. crcs[kRegions - 1] = crc32c_t{static_cast(crcs[kRegions - 1]) ^ kCrcDataXor}; return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size); } CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() { #ifdef UNDEFINED_BEHAVIOR_SANITIZER // UBSAN does not play nicely with unaligned loads (which we use a lot). // Get the underlying architecture. CpuType cpu_type = GetCpuType(); switch (cpu_type) { case CpuType::kAmdRome: case CpuType::kAmdNaples: case CpuType::kAmdMilan: case CpuType::kAmdGenoa: case CpuType::kAmdRyzenV3000: case CpuType::kIntelCascadelakeXeon: case CpuType::kIntelSkylakeXeon: case CpuType::kIntelSkylake: case CpuType::kIntelBroadwell: case CpuType::kIntelHaswell: case CpuType::kIntelIvybridge: return { /*.temporal=*/new FallbackCrcMemcpyEngine(), /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(), }; // INTEL_SANDYBRIDGE performs better with SSE than AVX. case CpuType::kIntelSandybridge: return { /*.temporal=*/new FallbackCrcMemcpyEngine(), /*.non_temporal=*/new CrcNonTemporalMemcpyEngine(), }; default: return {/*.temporal=*/new FallbackCrcMemcpyEngine(), /*.non_temporal=*/new FallbackCrcMemcpyEngine()}; } #else // Get the underlying architecture. CpuType cpu_type = GetCpuType(); switch (cpu_type) { // On Zen 2, PEXTRQ uses 2 micro-ops, including one on the vector store port // which data movement from the vector registers to the integer registers // (where CRC32C happens) to crowd the same units as vector stores. As a // result, using that path exclusively causes bottlenecking on this port. // We can avoid this bottleneck by using the integer side of the CPU for // most operations rather than the vector side. We keep a vector region to // engage some of the prefetching logic in the cache hierarchy which seems // to give vector instructions special treatment. These prefetch units see // strided access to each region, and do the right thing. case CpuType::kAmdRome: case CpuType::kAmdNaples: case CpuType::kAmdMilan: case CpuType::kAmdGenoa: case CpuType::kAmdRyzenV3000: return { /*.temporal=*/new AcceleratedCrcMemcpyEngine<1, 2>(), /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(), }; // PCLMULQDQ is slow and we don't have wide enough issue width to take // advantage of it. For an unknown architecture, don't risk using CLMULs. case CpuType::kIntelCascadelakeXeon: case CpuType::kIntelSkylakeXeon: case CpuType::kIntelSkylake: case CpuType::kIntelBroadwell: case CpuType::kIntelHaswell: case CpuType::kIntelIvybridge: return { /*.temporal=*/new AcceleratedCrcMemcpyEngine<3, 0>(), /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(), }; // INTEL_SANDYBRIDGE performs better with SSE than AVX. case CpuType::kIntelSandybridge: return { /*.temporal=*/new AcceleratedCrcMemcpyEngine<3, 0>(), /*.non_temporal=*/new CrcNonTemporalMemcpyEngine(), }; default: return {/*.temporal=*/new FallbackCrcMemcpyEngine(), /*.non_temporal=*/new FallbackCrcMemcpyEngine()}; } #endif // UNDEFINED_BEHAVIOR_SANITIZER } // For testing, allow the user to specify which engine they want. std::unique_ptr CrcMemcpy::GetTestEngine(int vector, int integer) { if (vector == 3 && integer == 0) { return std::make_unique>(); } else if (vector == 1 && integer == 2) { return std::make_unique>(); } else if (vector == 1 && integer == 0) { return std::make_unique>(); } return nullptr; } } // namespace crc_internal ABSL_NAMESPACE_END } // namespace absl #endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE