From 433289a2586aa959241cf73675eadefaf21211b1 Mon Sep 17 00:00:00 2001 From: Abseil Team Date: Thu, 7 Sep 2023 06:52:28 -0700 Subject: Roll forward support for ARM intrinsics in crc_memcpy This CL rolls forward a previous change which we rolled back temporarily due to compilation errors on x86 when PCLMUL intrinsics were unavailable. *** Original change description *** This change replaces inline x86 intrinsics with generic versions that compile for both x86 and ARM depending on the target arch. This change does not enable the accelerated crc memcpy engine on ARM. That will be done in a subsequent change after the optimal number of vector and integer regions for different CPUs is determined. *** PiperOrigin-RevId: 563416413 Change-Id: Iee630a15ed83c26659adb0e8a03d3f3d3a46d688 --- absl/crc/BUILD.bazel | 5 +- absl/crc/CMakeLists.txt | 5 +- absl/crc/internal/crc32_x86_arm_combined_simd.h | 31 +- absl/crc/internal/crc_memcpy.h | 9 +- absl/crc/internal/crc_memcpy_fallback.cc | 6 +- absl/crc/internal/crc_memcpy_x86_64.cc | 446 ---------------------- absl/crc/internal/crc_memcpy_x86_arm_combined.cc | 450 +++++++++++++++++++++++ 7 files changed, 490 insertions(+), 462 deletions(-) delete mode 100644 absl/crc/internal/crc_memcpy_x86_64.cc create mode 100644 absl/crc/internal/crc_memcpy_x86_arm_combined.cc (limited to 'absl/crc') diff --git a/absl/crc/BUILD.bazel b/absl/crc/BUILD.bazel index cdbaa9b2..5580420a 100644 --- a/absl/crc/BUILD.bazel +++ b/absl/crc/BUILD.bazel @@ -54,10 +54,8 @@ cc_library( visibility = ["//visibility:private"], deps = [ ":cpu_detect", - "//absl/base", "//absl/base:config", "//absl/base:core_headers", - "//absl/base:dynamic_annotations", "//absl/base:endian", "//absl/base:prefetch", "//absl/base:raw_logging_internal", @@ -72,7 +70,7 @@ cc_library( "crc32c.cc", "internal/crc32c_inline.h", "internal/crc_memcpy_fallback.cc", - "internal/crc_memcpy_x86_64.cc", + "internal/crc_memcpy_x86_arm_combined.cc", "internal/crc_non_temporal_memcpy.cc", ], hdrs = [ @@ -89,7 +87,6 @@ cc_library( ":non_temporal_memcpy", "//absl/base:config", "//absl/base:core_headers", - "//absl/base:dynamic_annotations", "//absl/base:endian", "//absl/base:prefetch", "//absl/strings", diff --git a/absl/crc/CMakeLists.txt b/absl/crc/CMakeLists.txt index 21247160..ec7b4512 100644 --- a/absl/crc/CMakeLists.txt +++ b/absl/crc/CMakeLists.txt @@ -42,10 +42,8 @@ absl_cc_library( ${ABSL_DEFAULT_COPTS} DEPS absl::crc_cpu_detect - absl::base absl::config absl::core_headers - absl::dynamic_annotations absl::endian absl::prefetch absl::raw_logging_internal @@ -64,7 +62,7 @@ absl_cc_library( "crc32c.cc" "internal/crc32c_inline.h" "internal/crc_memcpy_fallback.cc" - "internal/crc_memcpy_x86_64.cc" + "internal/crc_memcpy_x86_arm_combined.cc" "internal/crc_non_temporal_memcpy.cc" COPTS ${ABSL_DEFAULT_COPTS} @@ -74,7 +72,6 @@ absl_cc_library( absl::non_temporal_memcpy absl::config absl::core_headers - absl::dynamic_annotations absl::endian absl::prefetch absl::str_format diff --git a/absl/crc/internal/crc32_x86_arm_combined_simd.h b/absl/crc/internal/crc32_x86_arm_combined_simd.h index 39e53dd0..514e8fd9 100644 --- a/absl/crc/internal/crc32_x86_arm_combined_simd.h +++ b/absl/crc/internal/crc32_x86_arm_combined_simd.h @@ -58,8 +58,10 @@ namespace crc_internal { #if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) using V128 = uint64x2_t; +using V128u = uint64x2_t; #else using V128 = __m128i; +using V128u = __m128i_u; #endif // Starting with the initial value in |crc|, accumulates a CRC32 value for @@ -76,7 +78,10 @@ uint32_t 
CRC32_u64(uint32_t crc, uint64_t v); V128 V128_Load(const V128* src); // Load 128 bits of integer data. |src| does not need to be aligned. -V128 V128_LoadU(const V128* src); +V128 V128_LoadU(const V128u* src); + +// Store 128 bits of integer data. |src| must be 16-byte aligned. +void V128_Store(V128* dst, V128 data); // Polynomially multiplies the high 64 bits of |l| and |r|. V128 V128_PMulHi(const V128 l, const V128 r); @@ -109,6 +114,10 @@ V128 V128_ShiftRight(const V128 l); template int V128_Extract32(const V128 l); +// Extracts a 64-bit integer from |l|, selected with |imm|. +template +uint64_t V128_Extract64(const V128 l); + // Extracts the low 64 bits from V128. int64_t V128_Low64(const V128 l); @@ -137,7 +146,9 @@ inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) { inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); } -inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); } +inline V128 V128_LoadU(const V128u* src) { return _mm_loadu_si128(src); } + +inline void V128_Store(V128* dst, V128 data) { _mm_store_si128(dst, data); } inline V128 V128_PMulHi(const V128 l, const V128 r) { return _mm_clmulepi64_si128(l, r, 0x11); @@ -173,6 +184,11 @@ inline int V128_Extract32(const V128 l) { return _mm_extract_epi32(l, imm); } +template +inline uint64_t V128_Extract64(const V128 l) { + return static_cast(_mm_extract_epi64(l, imm)); +} + inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); } inline V128 V128_ShiftLeft64(const V128 l, const V128 r) { @@ -199,10 +215,14 @@ inline V128 V128_Load(const V128* src) { return vld1q_u64(reinterpret_cast(src)); } -inline V128 V128_LoadU(const V128* src) { +inline V128 V128_LoadU(const V128u* src) { return vld1q_u64(reinterpret_cast(src)); } +inline void V128_Store(V128* dst, V128 data) { + vst1q_u64(reinterpret_cast(dst), data); +} + // Using inline assembly as clang does not generate the pmull2 instruction and // performance drops by 15-20%. // TODO(b/193678732): Investigate why the compiler decides not to generate @@ -252,6 +272,11 @@ inline int V128_Extract32(const V128 l) { return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm); } +template +inline uint64_t V128_Extract64(const V128 l) { + return vgetq_lane_s64(vreinterpretq_s64_u64(l), imm); +} + inline int64_t V128_Low64(const V128 l) { return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0); } diff --git a/absl/crc/internal/crc_memcpy.h b/absl/crc/internal/crc_memcpy.h index 4909d433..a0fed65a 100644 --- a/absl/crc/internal/crc_memcpy.h +++ b/absl/crc/internal/crc_memcpy.h @@ -20,12 +20,15 @@ #include "absl/base/config.h" #include "absl/crc/crc32c.h" +#include "absl/crc/internal/crc32_x86_arm_combined_simd.h" // Defined if the class AcceleratedCrcMemcpyEngine exists. -#if defined(__x86_64__) && defined(__SSE4_2__) -#define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1 -#elif defined(_MSC_VER) && defined(__AVX__) +// TODO(b/299127771): Consider relaxing the pclmul requirement once the other +// intrinsics are conditionally compiled without it. 
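
As a rough illustration of the abstraction this change introduces, the sketch below CRCs and copies a single 16-byte block through the generic V128 wrappers (V128_LoadU, V128_Store, V128_Extract64, CRC32_u64) rather than raw _mm_* or NEON intrinsics. The helper name CrcCopyBlock is invented for the example, and it only compiles on targets where the header defines one of the ABSL_CRC_INTERNAL_HAVE_*_SIMD macros.

#include <cstdint>

#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"

#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD) || \
    defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
namespace crc = absl::crc_internal;

// Hypothetical helper: copy one unaligned 16-byte block to an aligned
// destination while folding both 64-bit halves into the running CRC32C state.
inline uint32_t CrcCopyBlock(uint32_t crc, void* aligned_dst,
                             const void* src) {
  crc::V128 data = crc::V128_LoadU(reinterpret_cast<const crc::V128u*>(src));
  crc::V128_Store(reinterpret_cast<crc::V128*>(aligned_dst), data);
  crc = crc::CRC32_u64(crc, crc::V128_Extract64<0>(data));
  crc = crc::CRC32_u64(crc, crc::V128_Extract64<1>(data));
  return crc;
}
#endif
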
+#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD) #define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1 +#elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) +#define ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE 1 #endif namespace absl { diff --git a/absl/crc/internal/crc_memcpy_fallback.cc b/absl/crc/internal/crc_memcpy_fallback.cc index 15b4b055..07795504 100644 --- a/absl/crc/internal/crc_memcpy_fallback.cc +++ b/absl/crc/internal/crc_memcpy_fallback.cc @@ -54,7 +54,8 @@ absl::crc32c_t FallbackCrcMemcpyEngine::Compute(void* __restrict dst, } // Compile the following only if we don't have -#ifndef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE +#if !defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) && \ + !defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE) CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() { CrcMemcpy::ArchSpecificEngines engines; @@ -68,7 +69,8 @@ std::unique_ptr CrcMemcpy::GetTestEngine(int /*vector*/, return std::make_unique(); } -#endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE +#endif // !ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE && + // !ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE } // namespace crc_internal ABSL_NAMESPACE_END diff --git a/absl/crc/internal/crc_memcpy_x86_64.cc b/absl/crc/internal/crc_memcpy_x86_64.cc deleted file mode 100644 index c4ccd472..00000000 --- a/absl/crc/internal/crc_memcpy_x86_64.cc +++ /dev/null @@ -1,446 +0,0 @@ -// Copyright 2022 The Abseil Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Simultaneous memcopy and CRC-32C for x86-64. Uses integer registers because -// XMM registers do not support the CRC instruction (yet). While copying, -// compute the running CRC of the data being copied. -// -// It is assumed that any CPU running this code has SSE4.2 instructions -// available (for CRC32C). This file will do nothing if that is not true. -// -// The CRC instruction has a 3-byte latency, and we are stressing the ALU ports -// here (unlike a traditional memcopy, which has almost no ALU use), so we will -// need to copy in such a way that the CRC unit is used efficiently. We have two -// regimes in this code: -// 1. For operations of size < kCrcSmallSize, do the CRC then the memcpy -// 2. For operations of size > kCrcSmallSize: -// a) compute an initial CRC + copy on a small amount of data to align the -// destination pointer on a 16-byte boundary. -// b) Split the data into 3 main regions and a tail (smaller than 48 bytes) -// c) Do the copy and CRC of the 3 main regions, interleaving (start with -// full cache line copies for each region, then move to single 16 byte -// pieces per region). -// d) Combine the CRCs with CRC32C::Concat. -// e) Copy the tail and extend the CRC with the CRC of the tail. -// This method is not ideal for op sizes between ~1k and ~8k because CRC::Concat -// takes a significant amount of time. 
A medium-sized approach could be added -// using 3 CRCs over fixed-size blocks where the zero-extensions required for -// CRC32C::Concat can be precomputed. - -#ifdef __SSE4_2__ -#include -#endif - -#ifdef _MSC_VER -#include -#endif - -#include -#include -#include -#include -#include - -#include "absl/base/config.h" -#include "absl/base/optimization.h" -#include "absl/base/prefetch.h" -#include "absl/crc/crc32c.h" -#include "absl/crc/internal/cpu_detect.h" -#include "absl/crc/internal/crc_memcpy.h" -#include "absl/strings/string_view.h" - -#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE - -namespace absl { -ABSL_NAMESPACE_BEGIN -namespace crc_internal { - -namespace { - -inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length, - crc32c_t crc) { - // Small copy: just go 1 byte at a time: being nice to the branch predictor - // is more important here than anything else - uint32_t crc_uint32 = static_cast(crc); - for (std::size_t i = 0; i < length; i++) { - uint8_t data = *reinterpret_cast(src); - crc_uint32 = _mm_crc32_u8(crc_uint32, data); - *reinterpret_cast(dst) = data; - ++src; - ++dst; - } - return crc32c_t{crc_uint32}; -} - -constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t); - -// Common function for copying the tails of multiple large regions. -template -inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, - size_t region_size, size_t copy_rounds) { - std::array<__m128i, vec_regions> data; - std::array int_data; - - while (copy_rounds > 0) { - for (size_t i = 0; i < vec_regions; i++) { - size_t region = i; - - auto* vsrc = - reinterpret_cast(*src + region_size * region); - auto* vdst = reinterpret_cast<__m128i*>(*dst + region_size * region); - - // Load the blocks, unaligned - data[i] = _mm_loadu_si128(vsrc); - - // Store the blocks, aligned - _mm_store_si128(vdst, data[i]); - - // Compute the running CRC - crcs[region] = crc32c_t{static_cast( - _mm_crc32_u64(static_cast(crcs[region]), - static_cast(_mm_extract_epi64(data[i], 0))))}; - crcs[region] = crc32c_t{static_cast( - _mm_crc32_u64(static_cast(crcs[region]), - static_cast(_mm_extract_epi64(data[i], 1))))}; - } - - for (size_t i = 0; i < int_regions; i++) { - size_t region = vec_regions + i; - - auto* usrc = - reinterpret_cast(*src + region_size * region); - auto* udst = reinterpret_cast(*dst + region_size * region); - - for (size_t j = 0; j < kIntLoadsPerVec; j++) { - size_t data_index = i * kIntLoadsPerVec + j; - - int_data[data_index] = *(usrc + j); - crcs[region] = crc32c_t{static_cast(_mm_crc32_u64( - static_cast(crcs[region]), int_data[data_index]))}; - - *(udst + j) = int_data[data_index]; - } - } - - // Increment pointers - *src += sizeof(__m128i); - *dst += sizeof(__m128i); - --copy_rounds; - } -} - -} // namespace - -template -class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine { - public: - AcceleratedCrcMemcpyEngine() = default; - AcceleratedCrcMemcpyEngine(const AcceleratedCrcMemcpyEngine&) = delete; - AcceleratedCrcMemcpyEngine operator=(const AcceleratedCrcMemcpyEngine&) = - delete; - - crc32c_t Compute(void* __restrict dst, const void* __restrict src, - std::size_t length, crc32c_t initial_crc) const override; -}; - -template -crc32c_t AcceleratedCrcMemcpyEngine::Compute( - void* __restrict dst, const void* __restrict src, std::size_t length, - crc32c_t initial_crc) const { - constexpr std::size_t kRegions = vec_regions + int_regions; - static_assert(kRegions > 0, "Must specify at least one region."); - constexpr uint32_t kCrcDataXor = 
uint32_t{0xffffffff}; - constexpr std::size_t kBlockSize = sizeof(__m128i); - constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize; - - // Number of blocks per cacheline. - constexpr std::size_t kBlocksPerCacheLine = ABSL_CACHELINE_SIZE / kBlockSize; - - char* dst_bytes = static_cast(dst); - const char* src_bytes = static_cast(src); - - // Make sure that one prefetch per big block is enough to cover the whole - // dataset, and we don't prefetch too much. - static_assert(ABSL_CACHELINE_SIZE % kBlockSize == 0, - "Cache lines are not divided evenly into blocks, may have " - "unintended behavior!"); - - // Experimentally-determined boundary between a small and large copy. - // Below this number, spin-up and concatenation of CRCs takes enough time that - // it kills the throughput gains of using 3 regions and wide vectors. - constexpr size_t kCrcSmallSize = 256; - - // Experimentally-determined prefetch distance. Main loop copies will - // prefeth data 2 cache lines ahead. - constexpr std::size_t kPrefetchAhead = 2 * ABSL_CACHELINE_SIZE; - - // Small-size CRC-memcpy : just do CRC + memcpy - if (length < kCrcSmallSize) { - crc32c_t crc = - ExtendCrc32c(initial_crc, absl::string_view(src_bytes, length)); - memcpy(dst, src, length); - return crc; - } - - // Start work on the CRC: undo the XOR from the previous calculation or set up - // the initial value of the CRC. - // initial_crc ^= kCrcDataXor; - initial_crc = crc32c_t{static_cast(initial_crc) ^ kCrcDataXor}; - - // Do an initial alignment copy, so we can use aligned store instructions to - // the destination pointer. We align the destination pointer because the - // penalty for an unaligned load is small compared to the penalty of an - // unaligned store on modern CPUs. - std::size_t bytes_from_last_aligned = - reinterpret_cast(dst) & (kBlockSize - 1); - if (bytes_from_last_aligned != 0) { - std::size_t bytes_for_alignment = kBlockSize - bytes_from_last_aligned; - - // Do the short-sized copy and CRC. - initial_crc = - ShortCrcCopy(dst_bytes, src_bytes, bytes_for_alignment, initial_crc); - src_bytes += bytes_for_alignment; - dst_bytes += bytes_for_alignment; - length -= bytes_for_alignment; - } - - // We are going to do the copy and CRC in kRegions regions to make sure that - // we can saturate the CRC unit. The CRCs will be combined at the end of the - // run. Copying will use the SSE registers, and we will extract words from - // the SSE registers to add to the CRC. Initially, we run the loop one full - // cache line per region at a time, in order to insert prefetches. - - // Initialize CRCs for kRegions regions. - crc32c_t crcs[kRegions]; - crcs[0] = initial_crc; - for (size_t i = 1; i < kRegions; i++) { - crcs[i] = crc32c_t{kCrcDataXor}; - } - - // Find the number of rounds to copy and the region size. Also compute the - // tail size here. - size_t copy_rounds = length / kCopyRoundSize; - - // Find the size of each region and the size of the tail. - const std::size_t region_size = copy_rounds * kBlockSize; - const std::size_t tail_size = length - (kRegions * region_size); - - // Holding registers for data in each region. - std::array<__m128i, vec_regions> vec_data; - std::array int_data; - - // Main loop. - while (copy_rounds > kBlocksPerCacheLine) { - // Prefetch kPrefetchAhead bytes ahead of each pointer. 
- for (size_t i = 0; i < kRegions; i++) { - absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i); - absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i); - } - - // Load and store data, computing CRC on the way. - for (size_t i = 0; i < kBlocksPerCacheLine; i++) { - // Copy and CRC the data for the CRC regions. - for (size_t j = 0; j < vec_regions; j++) { - // Cycle which regions get vector load/store and integer load/store, to - // engage prefetching logic around vector load/stores and save issue - // slots by using the integer registers. - size_t region = (j + i) % kRegions; - - auto* vsrc = - reinterpret_cast(src_bytes + region_size * region); - auto* vdst = - reinterpret_cast<__m128i*>(dst_bytes + region_size * region); - - // Load and CRC data. - vec_data[j] = _mm_loadu_si128(vsrc + i); - crcs[region] = crc32c_t{static_cast(_mm_crc32_u64( - static_cast(crcs[region]), - static_cast(_mm_extract_epi64(vec_data[j], 0))))}; - crcs[region] = crc32c_t{static_cast(_mm_crc32_u64( - static_cast(crcs[region]), - static_cast(_mm_extract_epi64(vec_data[j], 1))))}; - - // Store the data. - _mm_store_si128(vdst + i, vec_data[j]); - } - - // Preload the partial CRCs for the CLMUL subregions. - for (size_t j = 0; j < int_regions; j++) { - // Cycle which regions get vector load/store and integer load/store, to - // engage prefetching logic around vector load/stores and save issue - // slots by using the integer registers. - size_t region = (j + vec_regions + i) % kRegions; - - auto* usrc = - reinterpret_cast(src_bytes + region_size * region); - auto* udst = - reinterpret_cast(dst_bytes + region_size * region); - - for (size_t k = 0; k < kIntLoadsPerVec; k++) { - size_t data_index = j * kIntLoadsPerVec + k; - - // Load and CRC the data. - int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k); - crcs[region] = crc32c_t{static_cast(_mm_crc32_u64( - static_cast(crcs[region]), int_data[data_index]))}; - - // Store the data. - *(udst + i * kIntLoadsPerVec + k) = int_data[data_index]; - } - } - } - - // Increment pointers - src_bytes += kBlockSize * kBlocksPerCacheLine; - dst_bytes += kBlockSize * kBlocksPerCacheLine; - copy_rounds -= kBlocksPerCacheLine; - } - - // Copy and CRC the tails of each region. - LargeTailCopy(crcs, &dst_bytes, &src_bytes, - region_size, copy_rounds); - - // Move the source and destination pointers to the end of the region - src_bytes += region_size * (kRegions - 1); - dst_bytes += region_size * (kRegions - 1); - - // Copy and CRC the tail through the XMM registers. - std::size_t tail_blocks = tail_size / kBlockSize; - LargeTailCopy<0, 1>(&crcs[kRegions - 1], &dst_bytes, &src_bytes, 0, - tail_blocks); - - // Final tail copy for under 16 bytes. - crcs[kRegions - 1] = - ShortCrcCopy(dst_bytes, src_bytes, tail_size - tail_blocks * kBlockSize, - crcs[kRegions - 1]); - - if (kRegions == 1) { - // If there is only one region, finalize and return its CRC. - return crc32c_t{static_cast(crcs[0]) ^ kCrcDataXor}; - } - - // Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the - // XOR done before doing block copy + CRCs. - for (size_t i = 0; i + 1 < kRegions; i++) { - crcs[i] = crc32c_t{static_cast(crcs[i]) ^ kCrcDataXor}; - } - - // Build a CRC of the first kRegions - 1 regions. - crc32c_t full_crc = crcs[0]; - for (size_t i = 1; i + 1 < kRegions; i++) { - full_crc = ConcatCrc32c(full_crc, crcs[i], region_size); - } - - // Finalize and concatenate the final CRC, then return. 
- crcs[kRegions - 1] = - crc32c_t{static_cast(crcs[kRegions - 1]) ^ kCrcDataXor}; - return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size); -} - -CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() { -#ifdef UNDEFINED_BEHAVIOR_SANITIZER - // UBSAN does not play nicely with unaligned loads (which we use a lot). - // Get the underlying architecture. - CpuType cpu_type = GetCpuType(); - switch (cpu_type) { - case CpuType::kAmdRome: - case CpuType::kAmdNaples: - case CpuType::kAmdMilan: - case CpuType::kAmdGenoa: - case CpuType::kAmdRyzenV3000: - case CpuType::kIntelCascadelakeXeon: - case CpuType::kIntelSkylakeXeon: - case CpuType::kIntelSkylake: - case CpuType::kIntelBroadwell: - case CpuType::kIntelHaswell: - case CpuType::kIntelIvybridge: - return { - /*.temporal=*/new FallbackCrcMemcpyEngine(), - /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(), - }; - // INTEL_SANDYBRIDGE performs better with SSE than AVX. - case CpuType::kIntelSandybridge: - return { - /*.temporal=*/new FallbackCrcMemcpyEngine(), - /*.non_temporal=*/new CrcNonTemporalMemcpyEngine(), - }; - default: - return {/*.temporal=*/new FallbackCrcMemcpyEngine(), - /*.non_temporal=*/new FallbackCrcMemcpyEngine()}; - } -#else - // Get the underlying architecture. - CpuType cpu_type = GetCpuType(); - switch (cpu_type) { - // On Zen 2, PEXTRQ uses 2 micro-ops, including one on the vector store port - // which data movement from the vector registers to the integer registers - // (where CRC32C happens) to crowd the same units as vector stores. As a - // result, using that path exclusively causes bottlenecking on this port. - // We can avoid this bottleneck by using the integer side of the CPU for - // most operations rather than the vector side. We keep a vector region to - // engage some of the prefetching logic in the cache hierarchy which seems - // to give vector instructions special treatment. These prefetch units see - // strided access to each region, and do the right thing. - case CpuType::kAmdRome: - case CpuType::kAmdNaples: - case CpuType::kAmdMilan: - case CpuType::kAmdGenoa: - case CpuType::kAmdRyzenV3000: - return { - /*.temporal=*/new AcceleratedCrcMemcpyEngine<1, 2>(), - /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(), - }; - // PCLMULQDQ is slow and we don't have wide enough issue width to take - // advantage of it. For an unknown architecture, don't risk using CLMULs. - case CpuType::kIntelCascadelakeXeon: - case CpuType::kIntelSkylakeXeon: - case CpuType::kIntelSkylake: - case CpuType::kIntelBroadwell: - case CpuType::kIntelHaswell: - case CpuType::kIntelIvybridge: - return { - /*.temporal=*/new AcceleratedCrcMemcpyEngine<3, 0>(), - /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(), - }; - // INTEL_SANDYBRIDGE performs better with SSE than AVX. - case CpuType::kIntelSandybridge: - return { - /*.temporal=*/new AcceleratedCrcMemcpyEngine<3, 0>(), - /*.non_temporal=*/new CrcNonTemporalMemcpyEngine(), - }; - default: - return {/*.temporal=*/new FallbackCrcMemcpyEngine(), - /*.non_temporal=*/new FallbackCrcMemcpyEngine()}; - } -#endif // UNDEFINED_BEHAVIOR_SANITIZER -} - -// For testing, allow the user to specify which engine they want. 
-std::unique_ptr CrcMemcpy::GetTestEngine(int vector, - int integer) { - if (vector == 3 && integer == 0) { - return std::make_unique>(); - } else if (vector == 1 && integer == 2) { - return std::make_unique>(); - } else if (vector == 1 && integer == 0) { - return std::make_unique>(); - } - return nullptr; -} - -} // namespace crc_internal -ABSL_NAMESPACE_END -} // namespace absl - -#endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE diff --git a/absl/crc/internal/crc_memcpy_x86_arm_combined.cc b/absl/crc/internal/crc_memcpy_x86_arm_combined.cc new file mode 100644 index 00000000..a06485a6 --- /dev/null +++ b/absl/crc/internal/crc_memcpy_x86_arm_combined.cc @@ -0,0 +1,450 @@ +// Copyright 2022 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Simultaneous memcopy and CRC-32C for x86-64 and ARM 64. Uses integer +// registers because XMM registers do not support the CRC instruction (yet). +// While copying, compute the running CRC of the data being copied. +// +// It is assumed that any CPU running this code has SSE4.2 instructions +// available (for CRC32C). This file will do nothing if that is not true. +// +// The CRC instruction has a 3-byte latency, and we are stressing the ALU ports +// here (unlike a traditional memcopy, which has almost no ALU use), so we will +// need to copy in such a way that the CRC unit is used efficiently. We have two +// regimes in this code: +// 1. For operations of size < kCrcSmallSize, do the CRC then the memcpy +// 2. For operations of size > kCrcSmallSize: +// a) compute an initial CRC + copy on a small amount of data to align the +// destination pointer on a 16-byte boundary. +// b) Split the data into 3 main regions and a tail (smaller than 48 bytes) +// c) Do the copy and CRC of the 3 main regions, interleaving (start with +// full cache line copies for each region, then move to single 16 byte +// pieces per region). +// d) Combine the CRCs with CRC32C::Concat. +// e) Copy the tail and extend the CRC with the CRC of the tail. +// This method is not ideal for op sizes between ~1k and ~8k because CRC::Concat +// takes a significant amount of time. A medium-sized approach could be added +// using 3 CRCs over fixed-size blocks where the zero-extensions required for +// CRC32C::Concat can be precomputed. 
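
To make the size bookkeeping in the header comment above concrete, here is a standalone sketch of how a large copy is carved into kRegions interleaved regions plus a tail of fewer than 48 bytes. The constants mirror the ones used in Compute() below; the 4096-byte length is just an arbitrary example.

#include <cstddef>
#include <cstdio>

int main() {
  constexpr std::size_t kRegions = 3;     // e.g. vec_regions + int_regions
  constexpr std::size_t kBlockSize = 16;  // sizeof(V128)
  constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;  // 48 bytes

  std::size_t length = 4096;  // hypothetical size after destination alignment

  // Each copy round moves one kBlockSize block per region.
  std::size_t copy_rounds = length / kCopyRoundSize;        // 85
  std::size_t region_size = copy_rounds * kBlockSize;       // 1360 bytes each
  std::size_t tail_size = length - kRegions * region_size;  // 16 bytes (< 48)

  std::printf("rounds=%zu region_size=%zu tail=%zu\n", copy_rounds,
              region_size, tail_size);
  return 0;
}
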
+ +#ifdef __SSE4_2__ +#include +#endif + +#ifdef _MSC_VER +#include +#endif + +#include +#include +#include +#include +#include + +#include "absl/base/config.h" +#include "absl/base/optimization.h" +#include "absl/base/prefetch.h" +#include "absl/crc/crc32c.h" +#include "absl/crc/internal/cpu_detect.h" +#include "absl/crc/internal/crc32_x86_arm_combined_simd.h" +#include "absl/crc/internal/crc_memcpy.h" +#include "absl/strings/string_view.h" + +#if defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) || \ + defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE) + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace crc_internal { + +namespace { + +inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length, + crc32c_t crc) { + // Small copy: just go 1 byte at a time: being nice to the branch predictor + // is more important here than anything else + uint32_t crc_uint32 = static_cast(crc); + for (std::size_t i = 0; i < length; i++) { + uint8_t data = *reinterpret_cast(src); + crc_uint32 = CRC32_u8(crc_uint32, data); + *reinterpret_cast(dst) = data; + ++src; + ++dst; + } + return crc32c_t{crc_uint32}; +} + +constexpr size_t kIntLoadsPerVec = sizeof(V128) / sizeof(uint64_t); + +// Common function for copying the tails of multiple large regions. +template +inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, + size_t region_size, size_t copy_rounds) { + std::array data; + std::array int_data; + + while (copy_rounds > 0) { + for (size_t i = 0; i < vec_regions; i++) { + size_t region = i; + + auto* vsrc = reinterpret_cast(*src + region_size * region); + auto* vdst = reinterpret_cast(*dst + region_size * region); + + // Load the blocks, unaligned + data[i] = V128_LoadU(vsrc); + + // Store the blocks, aligned + V128_Store(vdst, data[i]); + + // Compute the running CRC + crcs[region] = crc32c_t{static_cast( + CRC32_u64(static_cast(crcs[region]), + static_cast(V128_Extract64<0>(data[i]))))}; + crcs[region] = crc32c_t{static_cast( + CRC32_u64(static_cast(crcs[region]), + static_cast(V128_Extract64<1>(data[i]))))}; + } + + for (size_t i = 0; i < int_regions; i++) { + size_t region = vec_regions + i; + + auto* usrc = + reinterpret_cast(*src + region_size * region); + auto* udst = reinterpret_cast(*dst + region_size * region); + + for (size_t j = 0; j < kIntLoadsPerVec; j++) { + size_t data_index = i * kIntLoadsPerVec + j; + + int_data[data_index] = *(usrc + j); + crcs[region] = crc32c_t{static_cast(CRC32_u64( + static_cast(crcs[region]), int_data[data_index]))}; + + *(udst + j) = int_data[data_index]; + } + } + + // Increment pointers + *src += sizeof(V128); + *dst += sizeof(V128); + --copy_rounds; + } +} + +} // namespace + +template +class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine { + public: + AcceleratedCrcMemcpyEngine() = default; + AcceleratedCrcMemcpyEngine(const AcceleratedCrcMemcpyEngine&) = delete; + AcceleratedCrcMemcpyEngine operator=(const AcceleratedCrcMemcpyEngine&) = + delete; + + crc32c_t Compute(void* __restrict dst, const void* __restrict src, + std::size_t length, crc32c_t initial_crc) const override; +}; + +template +crc32c_t AcceleratedCrcMemcpyEngine::Compute( + void* __restrict dst, const void* __restrict src, std::size_t length, + crc32c_t initial_crc) const { + constexpr std::size_t kRegions = vec_regions + int_regions; + static_assert(kRegions > 0, "Must specify at least one region."); + constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff}; + constexpr std::size_t kBlockSize = sizeof(V128); + constexpr std::size_t 
kCopyRoundSize = kRegions * kBlockSize; + + // Number of blocks per cacheline. + constexpr std::size_t kBlocksPerCacheLine = ABSL_CACHELINE_SIZE / kBlockSize; + + char* dst_bytes = static_cast(dst); + const char* src_bytes = static_cast(src); + + // Make sure that one prefetch per big block is enough to cover the whole + // dataset, and we don't prefetch too much. + static_assert(ABSL_CACHELINE_SIZE % kBlockSize == 0, + "Cache lines are not divided evenly into blocks, may have " + "unintended behavior!"); + + // Experimentally-determined boundary between a small and large copy. + // Below this number, spin-up and concatenation of CRCs takes enough time that + // it kills the throughput gains of using 3 regions and wide vectors. + constexpr size_t kCrcSmallSize = 256; + + // Experimentally-determined prefetch distance. Main loop copies will + // prefeth data 2 cache lines ahead. + constexpr std::size_t kPrefetchAhead = 2 * ABSL_CACHELINE_SIZE; + + // Small-size CRC-memcpy : just do CRC + memcpy + if (length < kCrcSmallSize) { + crc32c_t crc = + ExtendCrc32c(initial_crc, absl::string_view(src_bytes, length)); + memcpy(dst, src, length); + return crc; + } + + // Start work on the CRC: undo the XOR from the previous calculation or set up + // the initial value of the CRC. + // initial_crc ^= kCrcDataXor; + initial_crc = crc32c_t{static_cast(initial_crc) ^ kCrcDataXor}; + + // Do an initial alignment copy, so we can use aligned store instructions to + // the destination pointer. We align the destination pointer because the + // penalty for an unaligned load is small compared to the penalty of an + // unaligned store on modern CPUs. + std::size_t bytes_from_last_aligned = + reinterpret_cast(dst) & (kBlockSize - 1); + if (bytes_from_last_aligned != 0) { + std::size_t bytes_for_alignment = kBlockSize - bytes_from_last_aligned; + + // Do the short-sized copy and CRC. + initial_crc = + ShortCrcCopy(dst_bytes, src_bytes, bytes_for_alignment, initial_crc); + src_bytes += bytes_for_alignment; + dst_bytes += bytes_for_alignment; + length -= bytes_for_alignment; + } + + // We are going to do the copy and CRC in kRegions regions to make sure that + // we can saturate the CRC unit. The CRCs will be combined at the end of the + // run. Copying will use the SSE registers, and we will extract words from + // the SSE registers to add to the CRC. Initially, we run the loop one full + // cache line per region at a time, in order to insert prefetches. + + // Initialize CRCs for kRegions regions. + crc32c_t crcs[kRegions]; + crcs[0] = initial_crc; + for (size_t i = 1; i < kRegions; i++) { + crcs[i] = crc32c_t{kCrcDataXor}; + } + + // Find the number of rounds to copy and the region size. Also compute the + // tail size here. + size_t copy_rounds = length / kCopyRoundSize; + + // Find the size of each region and the size of the tail. + const std::size_t region_size = copy_rounds * kBlockSize; + const std::size_t tail_size = length - (kRegions * region_size); + + // Holding registers for data in each region. + std::array vec_data; + std::array int_data; + + // Main loop. + while (copy_rounds > kBlocksPerCacheLine) { + // Prefetch kPrefetchAhead bytes ahead of each pointer. + for (size_t i = 0; i < kRegions; i++) { + absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i); +#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE + // TODO(b/297082454): investigate dropping prefetch on x86. 
+ absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i); +#endif + } + + // Load and store data, computing CRC on the way. + for (size_t i = 0; i < kBlocksPerCacheLine; i++) { + // Copy and CRC the data for the CRC regions. + for (size_t j = 0; j < vec_regions; j++) { + // Cycle which regions get vector load/store and integer load/store, to + // engage prefetching logic around vector load/stores and save issue + // slots by using the integer registers. + size_t region = (j + i) % kRegions; + + auto* vsrc = + reinterpret_cast(src_bytes + region_size * region); + auto* vdst = reinterpret_cast(dst_bytes + region_size * region); + + // Load and CRC data. + vec_data[j] = V128_LoadU(vsrc + i); + crcs[region] = crc32c_t{static_cast( + CRC32_u64(static_cast(crcs[region]), + static_cast(V128_Extract64<0>(vec_data[j]))))}; + crcs[region] = crc32c_t{static_cast( + CRC32_u64(static_cast(crcs[region]), + static_cast(V128_Extract64<1>(vec_data[j]))))}; + + // Store the data. + V128_Store(vdst + i, vec_data[j]); + } + + // Preload the partial CRCs for the CLMUL subregions. + for (size_t j = 0; j < int_regions; j++) { + // Cycle which regions get vector load/store and integer load/store, to + // engage prefetching logic around vector load/stores and save issue + // slots by using the integer registers. + size_t region = (j + vec_regions + i) % kRegions; + + auto* usrc = + reinterpret_cast(src_bytes + region_size * region); + auto* udst = + reinterpret_cast(dst_bytes + region_size * region); + + for (size_t k = 0; k < kIntLoadsPerVec; k++) { + size_t data_index = j * kIntLoadsPerVec + k; + + // Load and CRC the data. + int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k); + crcs[region] = crc32c_t{static_cast(CRC32_u64( + static_cast(crcs[region]), int_data[data_index]))}; + + // Store the data. + *(udst + i * kIntLoadsPerVec + k) = int_data[data_index]; + } + } + } + + // Increment pointers + src_bytes += kBlockSize * kBlocksPerCacheLine; + dst_bytes += kBlockSize * kBlocksPerCacheLine; + copy_rounds -= kBlocksPerCacheLine; + } + + // Copy and CRC the tails of each region. + LargeTailCopy(crcs, &dst_bytes, &src_bytes, + region_size, copy_rounds); + + // Move the source and destination pointers to the end of the region + src_bytes += region_size * (kRegions - 1); + dst_bytes += region_size * (kRegions - 1); + + // Copy and CRC the tail through the XMM registers. + std::size_t tail_blocks = tail_size / kBlockSize; + LargeTailCopy<0, 1>(&crcs[kRegions - 1], &dst_bytes, &src_bytes, 0, + tail_blocks); + + // Final tail copy for under 16 bytes. + crcs[kRegions - 1] = + ShortCrcCopy(dst_bytes, src_bytes, tail_size - tail_blocks * kBlockSize, + crcs[kRegions - 1]); + + if (kRegions == 1) { + // If there is only one region, finalize and return its CRC. + return crc32c_t{static_cast(crcs[0]) ^ kCrcDataXor}; + } + + // Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the + // XOR done before doing block copy + CRCs. + for (size_t i = 0; i + 1 < kRegions; i++) { + crcs[i] = crc32c_t{static_cast(crcs[i]) ^ kCrcDataXor}; + } + + // Build a CRC of the first kRegions - 1 regions. + crc32c_t full_crc = crcs[0]; + for (size_t i = 1; i + 1 < kRegions; i++) { + full_crc = ConcatCrc32c(full_crc, crcs[i], region_size); + } + + // Finalize and concatenate the final CRC, then return. 
+ crcs[kRegions - 1] = + crc32c_t{static_cast(crcs[kRegions - 1]) ^ kCrcDataXor}; + return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size); +} + +CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() { +#ifdef UNDEFINED_BEHAVIOR_SANITIZER + // UBSAN does not play nicely with unaligned loads (which we use a lot). + // Get the underlying architecture. + CpuType cpu_type = GetCpuType(); + switch (cpu_type) { + case CpuType::kAmdRome: + case CpuType::kAmdNaples: + case CpuType::kAmdMilan: + case CpuType::kAmdGenoa: + case CpuType::kAmdRyzenV3000: + case CpuType::kIntelCascadelakeXeon: + case CpuType::kIntelSkylakeXeon: + case CpuType::kIntelSkylake: + case CpuType::kIntelBroadwell: + case CpuType::kIntelHaswell: + case CpuType::kIntelIvybridge: + return { + /*.temporal=*/new FallbackCrcMemcpyEngine(), + /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(), + }; + // INTEL_SANDYBRIDGE performs better with SSE than AVX. + case CpuType::kIntelSandybridge: + return { + /*.temporal=*/new FallbackCrcMemcpyEngine(), + /*.non_temporal=*/new CrcNonTemporalMemcpyEngine(), + }; + default: + return {/*.temporal=*/new FallbackCrcMemcpyEngine(), + /*.non_temporal=*/new FallbackCrcMemcpyEngine()}; + } +#else + // Get the underlying architecture. + CpuType cpu_type = GetCpuType(); + switch (cpu_type) { + // On Zen 2, PEXTRQ uses 2 micro-ops, including one on the vector store port + // which data movement from the vector registers to the integer registers + // (where CRC32C happens) to crowd the same units as vector stores. As a + // result, using that path exclusively causes bottlenecking on this port. + // We can avoid this bottleneck by using the integer side of the CPU for + // most operations rather than the vector side. We keep a vector region to + // engage some of the prefetching logic in the cache hierarchy which seems + // to give vector instructions special treatment. These prefetch units see + // strided access to each region, and do the right thing. + case CpuType::kAmdRome: + case CpuType::kAmdNaples: + case CpuType::kAmdMilan: + case CpuType::kAmdGenoa: + case CpuType::kAmdRyzenV3000: + return { + /*.temporal=*/new AcceleratedCrcMemcpyEngine<1, 2>(), + /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(), + }; + // PCLMULQDQ is slow and we don't have wide enough issue width to take + // advantage of it. For an unknown architecture, don't risk using CLMULs. + case CpuType::kIntelCascadelakeXeon: + case CpuType::kIntelSkylakeXeon: + case CpuType::kIntelSkylake: + case CpuType::kIntelBroadwell: + case CpuType::kIntelHaswell: + case CpuType::kIntelIvybridge: + return { + /*.temporal=*/new AcceleratedCrcMemcpyEngine<3, 0>(), + /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(), + }; + // INTEL_SANDYBRIDGE performs better with SSE than AVX. + case CpuType::kIntelSandybridge: + return { + /*.temporal=*/new AcceleratedCrcMemcpyEngine<3, 0>(), + /*.non_temporal=*/new CrcNonTemporalMemcpyEngine(), + }; + default: + return {/*.temporal=*/new FallbackCrcMemcpyEngine(), + /*.non_temporal=*/new FallbackCrcMemcpyEngine()}; + } +#endif // UNDEFINED_BEHAVIOR_SANITIZER +} + +// For testing, allow the user to specify which engine they want. 
+std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
+                                                          int integer) {
+  if (vector == 3 && integer == 0) {
+    return std::make_unique<AcceleratedCrcMemcpyEngine<3, 0>>();
+  } else if (vector == 1 && integer == 2) {
+    return std::make_unique<AcceleratedCrcMemcpyEngine<1, 2>>();
+  } else if (vector == 1 && integer == 0) {
+    return std::make_unique<AcceleratedCrcMemcpyEngine<1, 0>>();
+  }
+  return nullptr;
+}
+
+}  // namespace crc_internal
+ABSL_NAMESPACE_END
+}  // namespace absl
+
+#endif  // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE ||
+        // ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
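
For context, the configurations exposed by GetTestEngine() above are typically exercised against the plain scalar CRC path. A hedged sketch of such a cross-check (arbitrary buffer size, test scaffolding omitted, internal headers from this patch assumed available) might look like this:

#include <cassert>
#include <memory>
#include <string>

#include "absl/crc/crc32c.h"
#include "absl/crc/internal/crc_memcpy.h"

void CheckEngine() {
  using absl::crc_internal::CrcMemcpy;

  // 1 vector region + 2 integer regions, as selected for AMD Zen above.
  std::unique_ptr<absl::crc_internal::CrcMemcpyEngine> engine =
      CrcMemcpy::GetTestEngine(/*vector=*/1, /*integer=*/2);
  if (engine == nullptr) return;  // configuration not compiled in

  std::string src(4096, 'x');
  std::string dst(4096, '\0');

  // Copy src into dst while computing the CRC32C of the copied bytes.
  absl::crc32c_t crc =
      engine->Compute(&dst[0], src.data(), src.size(), absl::crc32c_t{0});

  // The copied bytes and the CRC must match the plain ExtendCrc32c result.
  assert(dst == src);
  assert(crc == absl::ExtendCrc32c(absl::crc32c_t{0}, src));
}
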