diff options
Diffstat (limited to 'absl/crc/internal/crc_memcpy_x86_64.cc')
-rw-r--r-- | absl/crc/internal/crc_memcpy_x86_64.cc | 435 |
1 file changed, 435 insertions, 0 deletions
diff --git a/absl/crc/internal/crc_memcpy_x86_64.cc b/absl/crc/internal/crc_memcpy_x86_64.cc
new file mode 100644
index 00000000..4680fbce
--- /dev/null
+++ b/absl/crc/internal/crc_memcpy_x86_64.cc
@@ -0,0 +1,435 @@
+// Copyright 2022 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Simultaneous memcopy and CRC-32C for x86-64. Uses integer registers because
+// XMM registers do not support the CRC instruction (yet). While copying,
+// compute the running CRC of the data being copied.
+//
+// It is assumed that any CPU running this code has SSE4.2 instructions
+// available (for CRC32C). This file will do nothing if that is not true.
+//
+// The CRC instruction has a 3-cycle latency, and we are stressing the ALU ports
+// here (unlike a traditional memcopy, which has almost no ALU use), so we will
+// need to copy in such a way that the CRC unit is used efficiently. We have two
+// regimes in this code:
+//  1. For operations of size < kCrcSmallSize, do the CRC then the memcpy
+//  2. For operations of size > kCrcSmallSize:
+//     a) compute an initial CRC + copy on a small amount of data to align the
+//        destination pointer on a 16-byte boundary.
+//     b) Split the data into 3 main regions and a tail (smaller than 48 bytes)
+//     c) Do the copy and CRC of the 3 main regions, interleaving (start with
+//        full cache line copies for each region, then move to single 16 byte
+//        pieces per region).
+//     d) Combine the CRCs with CRC32C::Concat.
+//     e) Copy the tail and extend the CRC with the CRC of the tail.
+// This method is not ideal for op sizes between ~1k and ~8k because CRC::Concat
+// takes a significant amount of time. A medium-sized approach could be added
+// using 3 CRCs over fixed-size blocks where the zero-extensions required for
+// CRC32C::Concat can be precomputed.
+
+#include <cstddef>
+#include <cstdint>
+
+#include "absl/crc/crc32c.h"
+#include "absl/strings/string_view.h"
+
+#ifdef __SSE4_2__
+
+#include <emmintrin.h>
+#include <x86intrin.h>
+
+#include <type_traits>
+
+#include "absl/base/dynamic_annotations.h"
+#include "absl/base/internal/prefetch.h"
+#include "absl/base/optimization.h"
+#include "absl/crc/internal/cpu_detect.h"
+#include "absl/crc/internal/crc_memcpy.h"
+
+namespace absl {
+ABSL_NAMESPACE_BEGIN
+namespace crc_internal {
+
+namespace {
+
+inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
+                             crc32c_t crc) {
+  // Small copy: just go 1 byte at a time: being nice to the branch predictor
+  // is more important here than anything else
+  uint32_t crc_uint32 = static_cast<uint32_t>(crc);
+  for (std::size_t i = 0; i < length; i++) {
+    uint8_t data = *reinterpret_cast<const uint8_t*>(src);
+    crc_uint32 = _mm_crc32_u8(crc_uint32, data);
+    *reinterpret_cast<uint8_t*>(dst) = data;
+    ++src;
+    ++dst;
+  }
+  return ToCrc32c(crc_uint32);
+}
+
+constexpr int kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);
+
+// Common function for copying the tails of multiple large regions.
+template <int vec_regions, int int_regions>
+inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
+                          size_t region_size, size_t copy_rounds) {
+  __m128i data[vec_regions];
+  uint64_t int_data[kIntLoadsPerVec * int_regions];
+
+  while (copy_rounds > 0) {
+#pragma unroll_completely
+    for (int i = 0; i < vec_regions; i++) {
+      int region = i;
+
+      auto* vsrc =
+          reinterpret_cast<const __m128i_u*>(*src + region_size * region);
+      auto* vdst = reinterpret_cast<__m128i*>(*dst + region_size * region);
+
+      // Load the blocks, unaligned
+      data[i] = _mm_loadu_si128(vsrc);
+
+      // Store the blocks, aligned
+      _mm_store_si128(vdst, data[i]);
+
+      // Compute the running CRC
+      crcs[region] = ToCrc32c(_mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
+                                            _mm_extract_epi64(data[i], 0)));
+      crcs[region] = ToCrc32c(_mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
+                                            _mm_extract_epi64(data[i], 1)));
+    }
+
+#pragma unroll_completely
+    for (int i = 0; i < int_regions; i++) {
+      int region = vec_regions + i;
+
+      auto* usrc =
+          reinterpret_cast<const uint64_t*>(*src + region_size * region);
+      auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region);
+
+#pragma unroll_completely
+      for (int j = 0; j < kIntLoadsPerVec; j++) {
+        int data_index = i * kIntLoadsPerVec + j;
+
+        int_data[data_index] = *(usrc + j);
+        crcs[region] = ToCrc32c(_mm_crc32_u64(
+            static_cast<uint32_t>(crcs[region]), int_data[data_index]));
+
+        *(udst + j) = int_data[data_index];
+      }
+    }
+
+    // Increment pointers
+    *src += sizeof(__m128i);
+    *dst += sizeof(__m128i);
+    --copy_rounds;
+  }
+}
+
+}  // namespace
+
+template <int vec_regions, int int_regions>
+class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine {
+ public:
+  AcceleratedCrcMemcpyEngine() = default;
+  AcceleratedCrcMemcpyEngine(const AcceleratedCrcMemcpyEngine&) = delete;
+  AcceleratedCrcMemcpyEngine operator=(const AcceleratedCrcMemcpyEngine&) =
+      delete;
+
+  crc32c_t Compute(void* __restrict dst, const void* __restrict src,
+                   std::size_t length, crc32c_t initial_crc) const override;
+};
+
+template <int vec_regions, int int_regions>
+crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
+    void* __restrict dst, const void* __restrict src, std::size_t length,
+    crc32c_t initial_crc) const {
+  constexpr std::size_t kRegions = vec_regions + int_regions;
+  constexpr crc32c_t kCrcDataXor = crc32c_t{0xffffffff};
+  constexpr std::size_t kBlockSize = sizeof(__m128i);
+  constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;
+
+  // Number of blocks per cacheline.
+  constexpr std::size_t kBlocksPerCacheLine = ABSL_CACHELINE_SIZE / kBlockSize;
+
+  char* dst_bytes = static_cast<char*>(dst);
+  const char* src_bytes = static_cast<const char*>(src);
+
+  // Make sure that one prefetch per big block is enough to cover the whole
+  // dataset, and we don't prefetch too much.
+  static_assert(ABSL_CACHELINE_SIZE % kBlockSize == 0,
+                "Cache lines are not divided evenly into blocks, may have "
+                "unintended behavior!");
+
+  // Experimentally-determined boundary between a small and large copy.
+  // Below this number, spin-up and concatenation of CRCs takes enough time
+  // that it kills the throughput gains of using 3 regions and wide vectors.
+  constexpr size_t kCrcSmallSize = 256;
+
+  // Experimentally-determined prefetch distance. Main loop copies will
+  // prefetch data 2 cache lines ahead.
+  constexpr std::size_t kPrefetchAhead = 2 * ABSL_CACHELINE_SIZE;
+
+  // Small-size CRC-memcpy : just do CRC + memcpy
+  if (length < kCrcSmallSize) {
+    crc32c_t crc =
+        ExtendCrc32c(initial_crc, absl::string_view(src_bytes, length));
+    memcpy(dst, src, length);
+    return crc;
+  }
+
+  // Start work on the CRC: undo the XOR from the previous calculation or set
+  // up the initial value of the CRC.
+  // initial_crc ^= kCrcDataXor;
+  initial_crc = initial_crc ^ kCrcDataXor;
+
+  // Do an initial alignment copy, so we can use aligned store instructions to
+  // the destination pointer. We align the destination pointer because the
+  // penalty for an unaligned load is small compared to the penalty of an
+  // unaligned store on modern CPUs.
+  std::size_t bytes_from_last_aligned =
+      reinterpret_cast<uintptr_t>(dst) & (kBlockSize - 1);
+  if (bytes_from_last_aligned != 0) {
+    std::size_t bytes_for_alignment = kBlockSize - bytes_from_last_aligned;
+
+    // Do the short-sized copy and CRC.
+    initial_crc =
+        ShortCrcCopy(dst_bytes, src_bytes, bytes_for_alignment, initial_crc);
+    src_bytes += bytes_for_alignment;
+    dst_bytes += bytes_for_alignment;
+    length -= bytes_for_alignment;
+  }
+
+  // We are going to do the copy and CRC in kRegions regions to make sure that
+  // we can saturate the CRC unit. The CRCs will be combined at the end of the
+  // run. Copying will use the SSE registers, and we will extract words from
+  // the SSE registers to add to the CRC. Initially, we run the loop one full
+  // cache line per region at a time, in order to insert prefetches.
+
+  // Initialize CRCs for kRegions regions.
+  crc32c_t crcs[kRegions];
+  crcs[0] = initial_crc;
+  for (int i = 1; i < kRegions; i++) {
+    crcs[i] = kCrcDataXor;
+  }
+
+  // Find the number of rounds to copy and the region size. Also compute the
+  // tail size here.
+  int64_t copy_rounds = length / kCopyRoundSize;
+
+  // Find the size of each region and the size of the tail.
+  const std::size_t region_size = copy_rounds * kBlockSize;
+  const std::size_t tail_size = length - (kRegions * region_size);
+
+  // Holding registers for data in each region.
+  __m128i vec_data[vec_regions];
+  uint64_t int_data[int_regions * kIntLoadsPerVec];
+
+  // Main loop.
+  while (copy_rounds > kBlocksPerCacheLine) {
+    // Prefetch kPrefetchAhead bytes ahead of each pointer.
+#pragma unroll_completely
+    for (int i = 0; i < kRegions; i++) {
+      absl::base_internal::PrefetchT0(src_bytes + kPrefetchAhead +
+                                      region_size * i);
+      absl::base_internal::PrefetchT0(dst_bytes + kPrefetchAhead +
+                                      region_size * i);
+    }
+
+    // Load and store data, computing CRC on the way.
+#pragma unroll_completely
+    for (int i = 0; i < kBlocksPerCacheLine; i++) {
+      // Copy and CRC the data for the CRC regions.
+#pragma unroll_completely
+      for (int j = 0; j < vec_regions; j++) {
+        // Cycle which regions get vector load/store and integer load/store, to
+        // engage prefetching logic around vector load/stores and save issue
+        // slots by using the integer registers.
+        int region = (j + i) % kRegions;
+
+        auto* src = reinterpret_cast<const __m128i_u*>(src_bytes +
+                                                       region_size * region);
+        auto* dst =
+            reinterpret_cast<__m128i*>(dst_bytes + region_size * region);
+
+        // Load and CRC data.
+        vec_data[j] = _mm_loadu_si128(src + i);
+        crcs[region] =
+            ToCrc32c(_mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
+                                   _mm_extract_epi64(vec_data[j], 0)));
+        crcs[region] =
+            ToCrc32c(_mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
+                                   _mm_extract_epi64(vec_data[j], 1)));
+
+        // Store the data.
+        _mm_store_si128(dst + i, vec_data[j]);
+      }
+
+      // Preload the partial CRCs for the CLMUL subregions.
+#pragma unroll_completely
+      for (int j = 0; j < int_regions; j++) {
+        // Cycle which regions get vector load/store and integer load/store, to
+        // engage prefetching logic around vector load/stores and save issue
+        // slots by using the integer registers.
+        int region = (j + vec_regions + i) % kRegions;
+
+        auto* usrc =
+            reinterpret_cast<const uint64_t*>(src_bytes + region_size * region);
+        auto* udst =
+            reinterpret_cast<uint64_t*>(dst_bytes + region_size * region);
+
+#pragma unroll_completely
+        for (int k = 0; k < kIntLoadsPerVec; k++) {
+          int data_index = j * kIntLoadsPerVec + k;
+
+          // Load and CRC the data.
+          int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
+          crcs[region] = ToCrc32c(_mm_crc32_u64(
+              static_cast<uint32_t>(crcs[region]), int_data[data_index]));
+
+          // Store the data.
+          *(udst + i * kIntLoadsPerVec + k) = int_data[data_index];
+        }
+      }
+    }
+
+    // Increment pointers
+    src_bytes += kBlockSize * kBlocksPerCacheLine;
+    dst_bytes += kBlockSize * kBlocksPerCacheLine;
+    copy_rounds -= kBlocksPerCacheLine;
+  }
+
+  // Copy and CRC the tails of each region.
+  LargeTailCopy<vec_regions, int_regions>(crcs, &dst_bytes, &src_bytes,
+                                          region_size, copy_rounds);
+
+  // Move the source and destination pointers to the end of the region
+  src_bytes += region_size * (kRegions - 1);
+  dst_bytes += region_size * (kRegions - 1);
+
+  // Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the
+  // XOR done before doing block copy + CRCs.
+  for (int i = 0; i < kRegions - 1; i++) {
+    crcs[i] = crcs[i] ^ kCrcDataXor;
+  }
+
+  // Build a CRC of the first kRegions - 1 regions.
+  crc32c_t full_crc = crcs[0];
+  for (int i = 1; i < kRegions - 1; i++) {
+    full_crc = ConcatCrc32c(full_crc, crcs[i], region_size);
+  }
+
+  // Copy and CRC the tail through the XMM registers.
+  std::size_t tail_blocks = tail_size / kBlockSize;
+  LargeTailCopy<0, 1>(&crcs[kRegions - 1], &dst_bytes, &src_bytes, 0,
+                      tail_blocks);
+
+  // Final tail copy for under 16 bytes.
+  crcs[kRegions - 1] =
+      ShortCrcCopy(dst_bytes, src_bytes, tail_size - tail_blocks * kBlockSize,
+                   crcs[kRegions - 1]);
+
+  // Finalize and concatenate the final CRC, then return.
+  crcs[kRegions - 1] = crcs[kRegions - 1] ^ kCrcDataXor;
+  return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size);
+}
+
+CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
+#ifdef UNDEFINED_BEHAVIOR_SANITIZER
+  // UBSAN does not play nicely with unaligned loads (which we use a lot).
+  // Get the underlying architecture.
+  CpuType cpu_type = GetCpuType();
+  switch (cpu_type) {
+    case CpuType::kUnknown:
+    case CpuType::kAmdRome:
+    case CpuType::kAmdNaples:
+    case CpuType::kIntelCascadelakeXeon:
+    case CpuType::kIntelSkylakeXeon:
+    case CpuType::kIntelSkylake:
+    case CpuType::kIntelBroadwell:
+    case CpuType::kIntelHaswell:
+    case CpuType::kIntelIvybridge:
+      return {
+          .temporal = new FallbackCrcMemcpyEngine(),
+          .non_temporal = new CrcNonTemporalMemcpyAVXEngine(),
+      };
+    // INTEL_SANDYBRIDGE performs better with SSE than AVX.
+    case CpuType::kIntelSandybridge:
+      return {
+          .temporal = new FallbackCrcMemcpyEngine(),
+          .non_temporal = new CrcNonTemporalMemcpyEngine(),
+      };
+    default:
+      return {.temporal = new FallbackCrcMemcpyEngine(),
+              .non_temporal = new FallbackCrcMemcpyEngine()};
+  }
+#else
+  // Get the underlying architecture.
+  CpuType cpu_type = GetCpuType();
+  switch (cpu_type) {
+    // On Zen 2, PEXTRQ uses 2 micro-ops, including one on the vector store
+    // port, which causes data movement from the vector registers to the
+    // integer registers (where CRC32C happens) to crowd the same units as
+    // vector stores. As a result, using that path exclusively causes
+    // bottlenecking on this port. We can avoid this bottleneck by using the
+    // integer side of the CPU for most operations rather than the vector
+    // side. We keep a vector region to engage some of the prefetching logic
+    // in the cache hierarchy which seems to give vector instructions special
+    // treatment. These prefetch units see strided access to each region, and
+    // do the right thing.
+    case CpuType::kAmdRome:
+    case CpuType::kAmdNaples:
+      return {
+          .temporal = new AcceleratedCrcMemcpyEngine<1, 2>(),
+          .non_temporal = new CrcNonTemporalMemcpyAVXEngine(),
+      };
+    // PCLMULQDQ is slow and we don't have wide enough issue width to take
+    // advantage of it. For an unknown architecture, don't risk using CLMULs.
+    case CpuType::kIntelCascadelakeXeon:
+    case CpuType::kIntelSkylakeXeon:
+    case CpuType::kIntelSkylake:
+    case CpuType::kIntelBroadwell:
+    case CpuType::kIntelHaswell:
+    case CpuType::kIntelIvybridge:
+      return {
+          .temporal = new AcceleratedCrcMemcpyEngine<3, 0>(),
+          .non_temporal = new CrcNonTemporalMemcpyAVXEngine(),
+      };
+    // INTEL_SANDYBRIDGE performs better with SSE than AVX.
+    case CpuType::kIntelSandybridge:
+      return {
+          .temporal = new AcceleratedCrcMemcpyEngine<3, 0>(),
+          .non_temporal = new CrcNonTemporalMemcpyEngine(),
+      };
+    default:
+      return {.temporal = new FallbackCrcMemcpyEngine(),
+              .non_temporal = new FallbackCrcMemcpyEngine()};
+  }
+#endif  // UNDEFINED_BEHAVIOR_SANITIZER
+}
+
+// For testing, allow the user to specify which engine they want.
+std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
+                                                          int integer) {
+  if (vector == 3 && integer == 0) {
+    return std::make_unique<AcceleratedCrcMemcpyEngine<3, 0>>();
+  } else if (vector == 1 && integer == 2) {
+    return std::make_unique<AcceleratedCrcMemcpyEngine<1, 2>>();
+  }
+  return nullptr;
+}
+
+}  // namespace crc_internal
+ABSL_NAMESPACE_END
+}  // namespace absl
+
+#endif  // __SSE4_2__
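
Note on step (d) of the header comment: the per-region CRCs are merged with CRC32C concatenation. The short standalone program below is a minimal sketch of that property written against the public absl/crc/crc32c.h API (ComputeCrc32c, ConcatCrc32c) rather than the internal ConcatCrc32c/ToCrc32c helpers used in this file; it is illustrative only and not part of this change.

#include <iostream>
#include <string>

#include "absl/crc/crc32c.h"

int main() {
  // Two buffers standing in for two adjacent copy regions.
  const std::string a(1024, 'a');
  const std::string b(4096, 'b');

  // CRC of the whole concatenated buffer, computed in one pass.
  const absl::crc32c_t whole = absl::ComputeCrc32c(a + b);

  // Per-region CRCs, merged afterwards.  This mirrors how the per-region
  // CRCs in the engine above are combined once the interleaved loop is done.
  const absl::crc32c_t crc_a = absl::ComputeCrc32c(a);
  const absl::crc32c_t crc_b = absl::ComputeCrc32c(b);
  const absl::crc32c_t merged = absl::ConcatCrc32c(crc_a, crc_b, b.size());

  std::cout << (whole == merged ? "match" : "mismatch") << "\n";
  return 0;
}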
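
The multi-region interleaving in regime 2 exists because each crc32 instruction depends on the previous CRC value, so a single chain is bound by the instruction's 3-cycle latency. The sketch below strips that idea down to three independent CRC32C chains over three slices of a buffer (no copy, no concatenation, SSE4.2 assumed); the function name and layout are illustrative, not part of the Abseil API.

#include <cstddef>
#include <cstdint>
#include <cstring>

#include <nmmintrin.h>  // SSE4.2 CRC32 intrinsics

// Runs three independent CRC32C chains over three equal 8-byte-aligned slices
// of `src`.  The chains have no data dependency on each other, so their crc32
// instructions can overlap in the pipeline instead of serializing.  A real
// implementation would still need a Concat step (as in the file above) to
// form the CRC of the whole buffer, and would handle the tail bytes.
inline void ThreeStreamCrc32c(const char* src, std::size_t len,
                              uint32_t crcs[3]) {
  const std::size_t slice = (len / 3) & ~std::size_t{7};  // multiple of 8
  uint32_t c0 = crcs[0], c1 = crcs[1], c2 = crcs[2];
  for (std::size_t off = 0; off + 8 <= slice; off += 8) {
    uint64_t w0, w1, w2;
    std::memcpy(&w0, src + off, 8);
    std::memcpy(&w1, src + slice + off, 8);
    std::memcpy(&w2, src + 2 * slice + off, 8);
    c0 = static_cast<uint32_t>(_mm_crc32_u64(c0, w0));
    c1 = static_cast<uint32_t>(_mm_crc32_u64(c1, w1));
    c2 = static_cast<uint32_t>(_mm_crc32_u64(c2, w2));
  }
  crcs[0] = c0;
  crcs[1] = c1;
  crcs[2] = c2;
}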