From c2e9ce1d07116432c64fbc2f4e0328d387ad6a1c Mon Sep 17 00:00:00 2001
From: Derek Mauro <dmauro@google.com>
Date: Wed, 23 Nov 2022 11:38:14 -0800
Subject: CRC: Get CPU detection and hardware acceleration working on MSVC
 x86(_64)

Using /arch:AVX on MSVC now uses the accelerated implementation

PiperOrigin-RevId: 490550573
Change-Id: I924259845f38ee41d15f23f95ad085ad664642b5
---
 absl/crc/internal/cpu_detect.cc                 | 79 +++++++++++++------------
 absl/crc/internal/crc32_x86_arm_combined_simd.h |  8 ++-
 absl/crc/internal/crc_memcpy_x86_64.cc          | 37 +++++++++---
 absl/crc/internal/crc_x86_arm_combined.cc       |  3 +
 absl/crc/internal/non_temporal_memcpy.h         | 58 ++++++++++--------
 5 files changed, 111 insertions(+), 74 deletions(-)

diff --git a/absl/crc/internal/cpu_detect.cc b/absl/crc/internal/cpu_detect.cc
index 339b7cc7..253c71fd 100644
--- a/absl/crc/internal/cpu_detect.cc
+++ b/absl/crc/internal/cpu_detect.cc
@@ -24,30 +24,29 @@
 #include <sys/auxv.h>
 #endif
 
+#if defined(_WIN32) || defined(_WIN64)
+#include <intrin.h>
+#endif
+
 namespace absl {
 ABSL_NAMESPACE_BEGIN
 namespace crc_internal {
 
-#if defined(__x86_64__)
-
-// Inline cpuid instruction.  %rbx is occasionally used to address stack
-// variables in presence of dynamic allocas.  Preserve the %rbx register via
-// %rdi to work around a clang bug https://bugs.llvm.org/show_bug.cgi?id=17907
-// (%rbx in an output constraint is not considered a clobbered register).
-//
-// a_inp and c_inp are the input parameters eax and ecx of the CPUID
-// instruction.
-// a, b, c, and d contain the contents of eax, ebx, ecx, and edx as returned by
-// the CPUID instruction
-#define ABSL_INTERNAL_GETCPUID(a, b, c, d, a_inp, c_inp) \
-  asm("mov %%rbx, %%rdi\n"                               \
-      "cpuid\n"                                          \
-      "xchg %%rdi, %%rbx\n"                              \
-      : "=a"(a), "=D"(b), "=c"(c), "=d"(d)               \
-      : "a"(a_inp), "2"(c_inp))
+#if defined(__x86_64__) || defined(_M_X64)
 
 namespace {
 
+#if !defined(_WIN32) && !defined(_WIN64)
+// MSVC defines this function for us.
+// https://learn.microsoft.com/en-us/cpp/intrinsics/cpuid-cpuidex
+static void __cpuid(int cpu_info[4], int info_type) {
+  __asm__ volatile("cpuid \n\t"
+                   : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]),
+                     "=d"(cpu_info[3])
+                   : "a"(info_type), "c"(0));
+}
+#endif  // !defined(_WIN32) && !defined(_WIN64)
+
 enum class Vendor {
   kUnknown,
   kIntel,
@@ -55,14 +54,14 @@ enum class Vendor {
 };
 
 Vendor GetVendor() {
-  uint32_t eax, ebx, ecx, edx;
+  // Get the vendor string (issue CPUID with eax = 0).
+  int cpu_info[4];
+  __cpuid(cpu_info, 0);
 
-  // Get vendor string (issue CPUID with eax = 0)
-  ABSL_INTERNAL_GETCPUID(eax, ebx, ecx, edx, 0, 0);
   std::string vendor;
-  vendor.append(reinterpret_cast<char*>(&ebx), 4);
-  vendor.append(reinterpret_cast<char*>(&edx), 4);
-  vendor.append(reinterpret_cast<char*>(&ecx), 4);
+  vendor.append(reinterpret_cast<char*>(&cpu_info[1]), 4);
+  vendor.append(reinterpret_cast<char*>(&cpu_info[3]), 4);
+  vendor.append(reinterpret_cast<char*>(&cpu_info[2]), 4);
   if (vendor == "GenuineIntel") {
     return Vendor::kIntel;
   } else if (vendor == "AuthenticAmd") {
@@ -73,13 +72,14 @@ Vendor GetVendor() {
 }
 
 CpuType GetIntelCpuType() {
-  uint32_t eax, ebx, ecx, edx;
-  // to get general information and extended features we send eax = 1 and
+  // To get general information and extended features we send eax = 1 and
   // ecx = 0 to cpuid. The response is returned in eax, ebx, ecx and edx.
   // (See Intel 64 and IA-32 Architectures Software Developer's Manual
   // Volume 2A: Instruction Set Reference, A-M CPUID).
   // https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-2a-manual.html
-  ABSL_INTERNAL_GETCPUID(eax, ebx, ecx, edx, 1, 0);
+  // https://learn.microsoft.com/en-us/cpp/intrinsics/cpuid-cpuidex
+  int cpu_info[4];
+  __cpuid(cpu_info, 1);
 
   // Response in eax bits as follows:
   //  0-3 (stepping id)
@@ -89,12 +89,12 @@ CpuType GetIntelCpuType() {
   //  16-19 (extended model)
   //  20-27 (extended family)
 
-  int family = (eax >> 8) & 0x0f;
-  int model_num = (eax >> 4) & 0x0f;
-  int ext_family = (eax >> 20) & 0xff;
-  int ext_model_num = (eax >> 16) & 0x0f;
+  int family = (cpu_info[0] >> 8) & 0x0f;
+  int model_num = (cpu_info[0] >> 4) & 0x0f;
+  int ext_family = (cpu_info[0] >> 20) & 0xff;
+  int ext_model_num = (cpu_info[0] >> 16) & 0x0f;
 
-  int brand_id = ebx & 0xff;
+  int brand_id = cpu_info[1] & 0xff;
 
   // Process the extended family and model info if necessary
   if (family == 0x0f) {
@@ -123,7 +123,7 @@ CpuType GetIntelCpuType() {
     case 0x56:  // BroadwellDE
       return CpuType::kIntelBroadwell;
     case 0x55:  // Skylake Xeon
-      if ((eax & 0x0f) < 5) {  // stepping < 5 is skylake
+      if ((cpu_info[0] & 0x0f) < 5) {  // stepping < 5 is skylake
         return CpuType::kIntelSkylakeXeon;
       } else {  // stepping >= 5 is cascadelake
         return CpuType::kIntelCascadelakeXeon;
@@ -142,12 +142,13 @@ CpuType GetIntelCpuType() {
 }
 
 CpuType GetAmdCpuType() {
-  uint32_t eax, ebx, ecx, edx;
-  // to get general information and extended features we send eax = 1 and
+  // To get general information and extended features we send eax = 1 and
   // ecx = 0 to cpuid. The response is returned in eax, ebx, ecx and edx.
   // (See Intel 64 and IA-32 Architectures Software Developer's Manual
   // Volume 2A: Instruction Set Reference, A-M CPUID).
-  ABSL_INTERNAL_GETCPUID(eax, ebx, ecx, edx, 1, 0);
+  // https://learn.microsoft.com/en-us/cpp/intrinsics/cpuid-cpuidex
+  int cpu_info[4];
+  __cpuid(cpu_info, 1);
 
   // Response in eax bits as follows:
   //  0-3 (stepping id)
@@ -157,10 +158,10 @@
   //  16-19 (extended model)
   //  20-27 (extended family)
 
-  int family = (eax >> 8) & 0x0f;
-  int model_num = (eax >> 4) & 0x0f;
-  int ext_family = (eax >> 20) & 0xff;
-  int ext_model_num = (eax >> 16) & 0x0f;
+  int family = (cpu_info[0] >> 8) & 0x0f;
+  int model_num = (cpu_info[0] >> 4) & 0x0f;
+  int ext_family = (cpu_info[0] >> 20) & 0xff;
+  int ext_model_num = (cpu_info[0] >> 16) & 0x0f;
 
   if (family == 0x0f) {
     family += ext_family;
diff --git a/absl/crc/internal/crc32_x86_arm_combined_simd.h b/absl/crc/internal/crc32_x86_arm_combined_simd.h
index 59d71fd4..8f15ded3 100644
--- a/absl/crc/internal/crc32_x86_arm_combined_simd.h
+++ b/absl/crc/internal/crc32_x86_arm_combined_simd.h
@@ -25,12 +25,18 @@
 // We define a translation layer for both x86 and ARM for the ease of use and
 // most performance gains.
 
-// We need CRC (part of sse4.2) and PCLMULQDQ instructions.
+// We need CRC (part of SSE 4.2) and PCLMULQDQ instructions.
 #if defined(__SSE4_2__) && defined(__PCLMUL__)
 
 #include <x86intrin.h>
 #define ABSL_CRC_INTERNAL_HAVE_X86_SIMD
 
+#elif defined(_MSC_VER) && defined(__AVX__)
+
+// MSVC AVX (/arch:AVX) implies SSE 4.2 and PCLMULQDQ.
+#include <intrin.h>
+#define ABSL_CRC_INTERNAL_HAVE_X86_SIMD
+
 #elif defined(__aarch64__) && defined(__LITTLE_ENDIAN__) && \
     defined(__ARM_FEATURE_CRC32) && defined(__ARM_NEON)
diff --git a/absl/crc/internal/crc_memcpy_x86_64.cc b/absl/crc/internal/crc_memcpy_x86_64.cc
index 4680fbce..b2e9cfe6 100644
--- a/absl/crc/internal/crc_memcpy_x86_64.cc
+++ b/absl/crc/internal/crc_memcpy_x86_64.cc
@@ -38,24 +38,27 @@
 // using 3 CRCs over fixed-size blocks where the zero-extensions required for
 // CRC32C::Concat can be precomputed.
 
-#include <cstddef>
-#include <cstdint>
-
-#include "absl/crc/crc32c.h"
-#include "absl/strings/string_view.h"
-
 #ifdef __SSE4_2__
+#include <x86intrin.h>
+#endif
 
-#include <emmintrin.h>
-#include <x86intrin.h>
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
 
+#include <cstddef>
+#include <cstdint>
 #include <memory>
 
 #include "absl/base/dynamic_annotations.h"
 #include "absl/base/internal/prefetch.h"
 #include "absl/base/optimization.h"
+#include "absl/crc/crc32c.h"
 #include "absl/crc/internal/cpu_detect.h"
 #include "absl/crc/internal/crc_memcpy.h"
+#include "absl/strings/string_view.h"
+
+#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(__AVX__))
 
 namespace absl {
 ABSL_NAMESPACE_BEGIN
@@ -88,7 +91,9 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
   uint64_t int_data[kIntLoadsPerVec * int_regions];
 
   while (copy_rounds > 0) {
+#ifdef __GNUC__
 #pragma unroll_completely
+#endif
     for (int i = 0; i < vec_regions; i++) {
       int region = i;
 
@@ -109,7 +114,9 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
                                           _mm_extract_epi64(data[i], 1)));
     }
 
+#ifdef __GNUC__
 #pragma unroll_completely
+#endif
     for (int i = 0; i < int_regions; i++) {
       int region = vec_regions + i;
 
@@ -117,7 +124,9 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
           reinterpret_cast<const uint64_t*>(*src + region_size * region);
       auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region);
 
+#ifdef __GNUC__
 #pragma unroll_completely
+#endif
       for (int j = 0; j < kIntLoadsPerVec; j++) {
         int data_index = i * kIntLoadsPerVec + j;
 
@@ -238,7 +247,9 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
   // Main loop.
   while (copy_rounds > kBlocksPerCacheLine) {
     // Prefetch kPrefetchAhead bytes ahead of each pointer.
+#ifdef __GNUC__
 #pragma unroll_completely
+#endif
     for (int i = 0; i < kRegions; i++) {
       absl::base_internal::PrefetchT0(src_bytes + kPrefetchAhead +
                                       region_size * i);
@@ -247,10 +258,14 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
     }
 
     // Load and store data, computing CRC on the way.
+#ifdef __GNUC__
 #pragma unroll_completely
+#endif
     for (int i = 0; i < kBlocksPerCacheLine; i++) {
       // Copy and CRC the data for the CRC regions.
+#ifdef __GNUC__
 #pragma unroll_completely
+#endif
       for (int j = 0; j < vec_regions; j++) {
         // Cycle which regions get vector load/store and integer load/store, to
         // engage prefetching logic around vector load/stores and save issue
@@ -276,7 +291,9 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
       }
 
       // Preload the partial CRCs for the CLMUL subregions.
+#ifdef __GNUC__
 #pragma unroll_completely
+#endif
       for (int j = 0; j < int_regions; j++) {
         // Cycle which regions get vector load/store and integer load/store, to
         // engage prefetching logic around vector load/stores and save issue
@@ -288,7 +305,9 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
         auto* udst =
             reinterpret_cast<uint64_t*>(dst_bytes + region_size * region);
 
+#ifdef __GNUC__
 #pragma unroll_completely
+#endif
         for (int k = 0; k < kIntLoadsPerVec; k++) {
           int data_index = j * kIntLoadsPerVec + k;
 
@@ -432,4 +451,4 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
 ABSL_NAMESPACE_END
 }  // namespace absl
 
-#endif  // __SSE4_2__
+#endif  // defined(__SSE4_2__) || (defined(_MSC_VER) && defined(__AVX__))
diff --git a/absl/crc/internal/crc_x86_arm_combined.cc b/absl/crc/internal/crc_x86_arm_combined.cc
index 06f9c69c..d771a786 100644
--- a/absl/crc/internal/crc_x86_arm_combined.cc
+++ b/absl/crc/internal/crc_x86_arm_combined.cc
@@ -35,6 +35,9 @@
 #define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
 #elif defined(__SSE4_2__) && defined(__PCLMUL__)
 #define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
+#elif defined(_MSC_VER) && defined(__AVX__)
+// MSVC AVX support (/arch:AVX) implies SSE 4.2 and PCLMUL support.
+#define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
 #endif
 
 namespace absl {
diff --git a/absl/crc/internal/non_temporal_memcpy.h b/absl/crc/internal/non_temporal_memcpy.h
index 0c6d7655..092c6078 100644
--- a/absl/crc/internal/non_temporal_memcpy.h
+++ b/absl/crc/internal/non_temporal_memcpy.h
@@ -15,46 +15,56 @@
 #ifndef ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_
 #define ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_
 
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-
-#include "absl/base/config.h"
-#include "absl/base/optimization.h"
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
 
 #ifdef __SSE__
-// Only include if we're running on a CPU that supports SSE ISA, needed for
-// sfence
-#include <xmmintrin.h>  // IWYU pragma: keep
+#include <xmmintrin.h>
 #endif
+
 #ifdef __SSE2__
-// Only include if we're running on a CPU that supports SSE2 ISA, needed for
-// movdqa, movdqu, movntdq
-#include <emmintrin.h>  // IWYU pragma: keep
+#include <emmintrin.h>
+#endif
+
+#ifdef __SSE3__
+#include <pmmintrin.h>
+#endif
+
+#ifdef __AVX__
+#include <immintrin.h>
 #endif
+
 #ifdef __aarch64__
-// Only include if we're running on a CPU that supports ARM NEON ISA, needed for
-// sfence, movdqa, movdqu, movntdq
 #include "absl/crc/internal/non_temporal_arm_intrinsics.h"
 #endif
 
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "absl/base/config.h"
+#include "absl/base/optimization.h"
+
 namespace absl {
 ABSL_NAMESPACE_BEGIN
 namespace crc_internal {
+
 // This non-temporal memcpy does regular load and non-temporal store memory
 // copy. It is compatible to both 16-byte aligned and unaligned addresses. If
 // data at the destination is not immediately accessed, using non-temporal
 // memcpy can save 1 DRAM load of the destination cacheline.
-
-constexpr int kCacheLineSize = ABSL_CACHELINE_SIZE;
+constexpr size_t kCacheLineSize = ABSL_CACHELINE_SIZE;
 
 // If the objects overlap, the behavior is undefined.
-// MSVC does not have proper header support for some of these intrinsics,
-// so it should go to fallback
 inline void *non_temporal_store_memcpy(void *__restrict dst,
                                        const void *__restrict src,
                                        size_t len) {
-#if (defined(__SSE3__) || defined(__aarch64__)) && !defined(_MSC_VER)
+#if defined(__SSE3__) || defined(__aarch64__) || \
+    (defined(_MSC_VER) && defined(__AVX__))
+  // This implementation requires SSE3.
+  // MSVC cannot target SSE3 directly, but when MSVC targets AVX,
+  // SSE3 support is implied.
   uint8_t *d = reinterpret_cast<uint8_t *>(dst);
   const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
 
@@ -104,17 +114,15 @@ inline void *non_temporal_store_memcpy(void *__restrict dst,
   }
   return dst;
 #else
-  // Fallback to regular memcpy when SSE2/3 & aarch64 is not available.
+  // Fallback to regular memcpy.
   return memcpy(dst, src, len);
-#endif  // __SSE3__ || __aarch64__
+#endif  // __SSE3__ || __aarch64__ || (_MSC_VER && __AVX__)
 }
 
-// MSVC does not have proper header support for some of these intrinsics,
-// so it should go to fallback
 inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
                                            const void *__restrict src,
                                            size_t len) {
-#if defined(__AVX__) && !defined(_MSC_VER)
+#ifdef __AVX__
   uint8_t *d = reinterpret_cast<uint8_t *>(dst);
   const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
--
cgit v1.2.3
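
For reference, the portable __cpuid pattern this commit adopts in cpu_detect.cc can be exercised with a minimal standalone sketch. This is illustrative only, not part of the patch: it assumes an x86-64 target, and the main() harness and the "cpuid_demo" name are hypothetical. On Windows it uses MSVC's __cpuid intrinsic from <intrin.h>; elsewhere it defines the same inline-assembly wrapper the patch adds, then prints the CPUID leaf-0 vendor string.

// cpuid_demo.cc -- illustrative sketch; build for x86-64 only
// (the inline assembly below is x86-specific).
#include <iostream>
#include <string>

#if defined(_WIN32) || defined(_WIN64)
#include <intrin.h>  // MSVC declares __cpuid(int[4], int) here.
#else
// GCC/Clang: emulate MSVC's __cpuid with inline assembly, as the patch does.
static void __cpuid(int cpu_info[4], int info_type) {
  __asm__ volatile("cpuid \n\t"
                   : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]),
                     "=d"(cpu_info[3])
                   : "a"(info_type), "c"(0));
}
#endif

int main() {
  int cpu_info[4];
  __cpuid(cpu_info, 0);  // Leaf 0: vendor string returned in ebx, edx, ecx.
  std::string vendor;
  vendor.append(reinterpret_cast<char*>(&cpu_info[1]), 4);  // ebx
  vendor.append(reinterpret_cast<char*>(&cpu_info[3]), 4);  // edx
  vendor.append(reinterpret_cast<char*>(&cpu_info[2]), 4);  // ecx
  std::cout << "CPU vendor: " << vendor << "\n";  // e.g. "GenuineIntel"
  return 0;
}

The same source compiles with "cl /arch:AVX cpuid_demo.cc" under MSVC and "g++ cpuid_demo.cc" or "clang++ cpuid_demo.cc" elsewhere, which is exactly the portability property the commit relies on.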