From c2e9ce1d07116432c64fbc2f4e0328d387ad6a1c Mon Sep 17 00:00:00 2001
From: Derek Mauro <dmauro@google.com>
Date: Wed, 23 Nov 2022 11:38:14 -0800
Subject: CRC: Get CPU detection and hardware acceleration working on MSVC
 x86(_64)

Using /arch:AVX on MSVC now uses the accelerated implementation

PiperOrigin-RevId: 490550573
Change-Id: I924259845f38ee41d15f23f95ad085ad664642b5
---
 absl/crc/internal/cpu_detect.cc                 | 79 +++++++++++++------------
 absl/crc/internal/crc32_x86_arm_combined_simd.h |  8 ++-
 absl/crc/internal/crc_memcpy_x86_64.cc          | 37 +++++++++---
 absl/crc/internal/crc_x86_arm_combined.cc       |  3 +
 absl/crc/internal/non_temporal_memcpy.h         | 58 ++++++++++--------
 5 files changed, 111 insertions(+), 74 deletions(-)

diff --git a/absl/crc/internal/cpu_detect.cc b/absl/crc/internal/cpu_detect.cc
index 339b7cc7..253c71fd 100644
--- a/absl/crc/internal/cpu_detect.cc
+++ b/absl/crc/internal/cpu_detect.cc
@@ -24,30 +24,29 @@
 #include <sys/auxv.h>
 #endif
 
+#if defined(_WIN32) || defined(_WIN64)
+#include <intrin.h>
+#endif
+
 namespace absl {
 ABSL_NAMESPACE_BEGIN
 namespace crc_internal {
 
-#if defined(__x86_64__)
-
-// Inline cpuid instruction.  %rbx is occasionally used to address stack
-// variables in presence of dynamic allocas.  Preserve the %rbx register via
-// %rdi to work around a clang bug https://bugs.llvm.org/show_bug.cgi?id=17907
-// (%rbx in an output constraint is not considered a clobbered register).
-//
-// a_inp and c_inp are the input parameters eax and ecx of the CPUID
-// instruction.
-// a, b, c, and d contain the contents of eax, ebx, ecx, and edx as returned by
-// the CPUID instruction
-#define ABSL_INTERNAL_GETCPUID(a, b, c, d, a_inp, c_inp) \
-  asm("mov %%rbx, %%rdi\n"                               \
-      "cpuid\n"                                          \
-      "xchg %%rdi, %%rbx\n"                              \
-      : "=a"(a), "=D"(b), "=c"(c), "=d"(d)               \
-      : "a"(a_inp), "2"(c_inp))
+#if defined(__x86_64__) || defined(_M_X64)
 
 namespace {
 
+#if !defined(_WIN32) && !defined(_WIN64)
+// MSVC defines this function for us.
+// https://learn.microsoft.com/en-us/cpp/intrinsics/cpuid-cpuidex
+static void __cpuid(int cpu_info[4], int info_type) {
+  __asm__ volatile("cpuid \n\t"
+                   : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]),
+                     "=d"(cpu_info[3])
+                   : "a"(info_type), "c"(0));
+}
+#endif  // !defined(_WIN32) && !defined(_WIN64)
+
 enum class Vendor {
   kUnknown,
   kIntel,
@@ -55,14 +54,14 @@ enum class Vendor {
 };
 
 Vendor GetVendor() {
-  uint32_t eax, ebx, ecx, edx;
+  // Get the vendor string (issue CPUID with eax = 0).
+  int cpu_info[4];
+  __cpuid(cpu_info, 0);
 
-  // Get vendor string (issue CPUID with eax = 0)
-  ABSL_INTERNAL_GETCPUID(eax, ebx, ecx, edx, 0, 0);
   std::string vendor;
-  vendor.append(reinterpret_cast<char*>(&ebx), 4);
-  vendor.append(reinterpret_cast<char*>(&edx), 4);
-  vendor.append(reinterpret_cast<char*>(&ecx), 4);
+  vendor.append(reinterpret_cast<char*>(&cpu_info[1]), 4);
+  vendor.append(reinterpret_cast<char*>(&cpu_info[3]), 4);
+  vendor.append(reinterpret_cast<char*>(&cpu_info[2]), 4);
   if (vendor == "GenuineIntel") {
     return Vendor::kIntel;
   } else if (vendor == "AuthenticAmd") {
@@ -73,13 +72,14 @@ Vendor GetVendor() {
 }
 
 CpuType GetIntelCpuType() {
-  uint32_t eax, ebx, ecx, edx;
-  // to get general information and extended features we send eax = 1 and
+  // To get general information and extended features we send eax = 1 and
   // ecx = 0 to cpuid. The response is returned in eax, ebx, ecx and edx.
   // (See Intel 64 and IA-32 Architectures Software Developer's Manual
   // Volume 2A: Instruction Set Reference, A-M CPUID).
   // https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-2a-manual.html
-  ABSL_INTERNAL_GETCPUID(eax, ebx, ecx, edx, 1, 0);
+  // https://learn.microsoft.com/en-us/cpp/intrinsics/cpuid-cpuidex
+  int cpu_info[4];
+  __cpuid(cpu_info, 1);
 
   // Response in eax bits as follows:
   //  0-3 (stepping id)
@@ -89,12 +89,12 @@ CpuType GetIntelCpuType() {
   //  16-19 (extended model)
   //  20-27 (extended family)
 
-  int family = (eax >> 8) & 0x0f;
-  int model_num = (eax >> 4) & 0x0f;
-  int ext_family = (eax >> 20) & 0xff;
-  int ext_model_num = (eax >> 16) & 0x0f;
+  int family = (cpu_info[0] >> 8) & 0x0f;
+  int model_num = (cpu_info[0] >> 4) & 0x0f;
+  int ext_family = (cpu_info[0] >> 20) & 0xff;
+  int ext_model_num = (cpu_info[0] >> 16) & 0x0f;
 
-  int brand_id = ebx & 0xff;
+  int brand_id = cpu_info[1] & 0xff;
 
   // Process the extended family and model info if necessary
   if (family == 0x0f) {
@@ -123,7 +123,7 @@ CpuType GetIntelCpuType() {
     case 0x56:  // BroadwellDE
       return CpuType::kIntelBroadwell;
     case 0x55:  // Skylake Xeon
-      if ((eax & 0x0f) < 5) {  // stepping < 5 is skylake
+      if ((cpu_info[0] & 0x0f) < 5) {  // stepping < 5 is skylake
         return CpuType::kIntelSkylakeXeon;
       } else {  // stepping >= 5 is cascadelake
         return CpuType::kIntelCascadelakeXeon;
@@ -142,12 +142,13 @@ CpuType GetIntelCpuType() {
 }
 
 CpuType GetAmdCpuType() {
-  uint32_t eax, ebx, ecx, edx;
-  // to get general information and extended features we send eax = 1 and
+  // To get general information and extended features we send eax = 1 and
   // ecx = 0 to cpuid. The response is returned in eax, ebx, ecx and edx.
   // (See Intel 64 and IA-32 Architectures Software Developer's Manual
   // Volume 2A: Instruction Set Reference, A-M CPUID).
-  ABSL_INTERNAL_GETCPUID(eax, ebx, ecx, edx, 1, 0);
+  // https://learn.microsoft.com/en-us/cpp/intrinsics/cpuid-cpuidex
+  int cpu_info[4];
+  __cpuid(cpu_info, 1);
 
   // Response in eax bits as follows:
   //  0-3 (stepping id)
@@ -157,10 +158,10 @@
   //  16-19 (extended model)
   //  20-27 (extended family)
 
-  int family = (eax >> 8) & 0x0f;
-  int model_num = (eax >> 4) & 0x0f;
-  int ext_family = (eax >> 20) & 0xff;
-  int ext_model_num = (eax >> 16) & 0x0f;
+  int family = (cpu_info[0] >> 8) & 0x0f;
+  int model_num = (cpu_info[0] >> 4) & 0x0f;
+  int ext_family = (cpu_info[0] >> 20) & 0xff;
+  int ext_model_num = (cpu_info[0] >> 16) & 0x0f;
 
   if (family == 0x0f) {
     family += ext_family;
diff --git a/absl/crc/internal/crc32_x86_arm_combined_simd.h b/absl/crc/internal/crc32_x86_arm_combined_simd.h
index 59d71fd4..8f15ded3 100644
--- a/absl/crc/internal/crc32_x86_arm_combined_simd.h
+++ b/absl/crc/internal/crc32_x86_arm_combined_simd.h
@@ -25,12 +25,18 @@
 // We define a translation layer for both x86 and ARM for the ease of use and
 // most performance gains.
 
-// We need CRC (part of sse4.2) and PCLMULQDQ instructions.
+// We need CRC (part of SSE 4.2) and PCLMULQDQ instructions.
 #if defined(__SSE4_2__) && defined(__PCLMUL__)
 
 #include <x86intrin.h>
 #define ABSL_CRC_INTERNAL_HAVE_X86_SIMD
 
+#elif defined(_MSC_VER) && defined(__AVX__)
+
+// MSVC AVX (/arch:AVX) implies SSE 4.2 and PCLMULQDQ.
+#include <intrin.h>
+#define ABSL_CRC_INTERNAL_HAVE_X86_SIMD
+
 #elif defined(__aarch64__) && defined(__LITTLE_ENDIAN__) && \
     defined(__ARM_FEATURE_CRC32) && defined(__ARM_NEON)
diff --git a/absl/crc/internal/crc_memcpy_x86_64.cc b/absl/crc/internal/crc_memcpy_x86_64.cc
index 4680fbce..b2e9cfe6 100644
--- a/absl/crc/internal/crc_memcpy_x86_64.cc
+++ b/absl/crc/internal/crc_memcpy_x86_64.cc
@@ -38,24 +38,27 @@
 // using 3 CRCs over fixed-size blocks where the zero-extensions required for
 // CRC32C::Concat can be precomputed.
 
-#include <cstddef>
-#include <cstdint>
-
-#include "absl/crc/crc32c.h"
-#include "absl/strings/string_view.h"
-
 #ifdef __SSE4_2__
+#include <x86intrin.h>
+#endif
 
-#include <emmintrin.h>
-#include <x86intrin.h>
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
 
+#include <cstddef>
+#include <cstdint>
 #include <memory>
 
 #include "absl/base/dynamic_annotations.h"
 #include "absl/base/internal/prefetch.h"
 #include "absl/base/optimization.h"
+#include "absl/crc/crc32c.h"
 #include "absl/crc/internal/cpu_detect.h"
 #include "absl/crc/internal/crc_memcpy.h"
+#include "absl/strings/string_view.h"
+
+#if defined(__SSE4_2__) || (defined(_MSC_VER) && defined(__AVX__))
 
 namespace absl {
 ABSL_NAMESPACE_BEGIN
@@ -88,7 +91,9 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
   uint64_t int_data[kIntLoadsPerVec * int_regions];
 
   while (copy_rounds > 0) {
+#ifdef __GNUC__
 #pragma unroll_completely
+#endif
     for (int i = 0; i < vec_regions; i++) {
       int region = i;
 
@@ -109,7 +114,9 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
                                           _mm_extract_epi64(data[i], 1)));
     }
 
+#ifdef __GNUC__
 #pragma unroll_completely
+#endif
     for (int i = 0; i < int_regions; i++) {
       int region = vec_regions + i;
 
@@ -117,7 +124,9 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
           reinterpret_cast<const uint64_t*>(*src + region_size * region);
       auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region);
 
+#ifdef __GNUC__
 #pragma unroll_completely
+#endif
       for (int j = 0; j < kIntLoadsPerVec; j++) {
         int data_index = i * kIntLoadsPerVec + j;
 
@@ -238,7 +247,9 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
   // Main loop.
   while (copy_rounds > kBlocksPerCacheLine) {
     // Prefetch kPrefetchAhead bytes ahead of each pointer.
+#ifdef __GNUC__
 #pragma unroll_completely
+#endif
     for (int i = 0; i < kRegions; i++) {
       absl::base_internal::PrefetchT0(src_bytes + kPrefetchAhead +
                                       region_size * i);
@@ -247,10 +258,14 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
     }
 
     // Load and store data, computing CRC on the way.
+#ifdef __GNUC__
 #pragma unroll_completely
+#endif
     for (int i = 0; i < kBlocksPerCacheLine; i++) {
       // Copy and CRC the data for the CRC regions.
+#ifdef __GNUC__
 #pragma unroll_completely
+#endif
       for (int j = 0; j < vec_regions; j++) {
         // Cycle which regions get vector load/store and integer load/store, to
         // engage prefetching logic around vector load/stores and save issue
@@ -276,7 +291,9 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
       }
 
       // Preload the partial CRCs for the CLMUL subregions.
+#ifdef __GNUC__
 #pragma unroll_completely
+#endif
       for (int j = 0; j < int_regions; j++) {
         // Cycle which regions get vector load/store and integer load/store, to
         // engage prefetching logic around vector load/stores and save issue
@@ -288,7 +305,9 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
         auto* udst =
             reinterpret_cast<uint64_t*>(dst_bytes + region_size * region);
 
+#ifdef __GNUC__
 #pragma unroll_completely
+#endif
         for (int k = 0; k < kIntLoadsPerVec; k++) {
           int data_index = j * kIntLoadsPerVec + k;
 
@@ -432,4 +451,4 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
 ABSL_NAMESPACE_END
 }  // namespace absl
 
-#endif  // __SSE4_2__
+#endif  // defined(__SSE4_2__) || (defined(_MSC_VER) && defined(__AVX__))
diff --git a/absl/crc/internal/crc_x86_arm_combined.cc b/absl/crc/internal/crc_x86_arm_combined.cc
index 06f9c69c..d771a786 100644
--- a/absl/crc/internal/crc_x86_arm_combined.cc
+++ b/absl/crc/internal/crc_x86_arm_combined.cc
@@ -35,6 +35,9 @@
 #define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
 #elif defined(__SSE4_2__) && defined(__PCLMUL__)
 #define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
+#elif defined(_MSC_VER) && defined(__AVX__)
+// MSVC AVX support (/arch:AVX) implies SSE 4.2 and PCLMUL support.
+#define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
 #endif
 
 namespace absl {
diff --git a/absl/crc/internal/non_temporal_memcpy.h b/absl/crc/internal/non_temporal_memcpy.h
index 0c6d7655..092c6078 100644
--- a/absl/crc/internal/non_temporal_memcpy.h
+++ b/absl/crc/internal/non_temporal_memcpy.h
@@ -15,46 +15,56 @@
 #ifndef ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_
 #define ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_
 
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-
-#include "absl/base/config.h"
-#include "absl/base/optimization.h"
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
 
 #ifdef __SSE__
-// Only include if we're running on a CPU that supports SSE ISA, needed for
-// sfence
-#include <xmmintrin.h>  // IWYU pragma: keep
+#include <xmmintrin.h>
 #endif
+
 #ifdef __SSE2__
-// Only include if we're running on a CPU that supports SSE2 ISA, needed for
-// movdqa, movdqu, movntdq
-#include <emmintrin.h>  // IWYU pragma: keep
+#include <emmintrin.h>
+#endif
+
+#ifdef __SSE3__
+#include <pmmintrin.h>
+#endif
+
+#ifdef __AVX__
+#include <immintrin.h>
 #endif
+
 #ifdef __aarch64__
-// Only include if we're running on a CPU that supports ARM NEON ISA, needed for
-// sfence, movdqa, movdqu, movntdq
 #include "absl/crc/internal/non_temporal_arm_intrinsics.h"
 #endif
 
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "absl/base/config.h"
+#include "absl/base/optimization.h"
+
 namespace absl {
 ABSL_NAMESPACE_BEGIN
 namespace crc_internal {
+
 // This non-temporal memcpy does regular load and non-temporal store memory
 // copy. It is compatible to both 16-byte aligned and unaligned addresses. If
 // data at the destination is not immediately accessed, using non-temporal
 // memcpy can save 1 DRAM load of the destination cacheline.
-
-constexpr int kCacheLineSize = ABSL_CACHELINE_SIZE;
+constexpr size_t kCacheLineSize = ABSL_CACHELINE_SIZE;
 
 // If the objects overlap, the behavior is undefined.
-// MSVC does not have proper header support for some of these intrinsics,
-// so it should go to fallback
 inline void *non_temporal_store_memcpy(void *__restrict dst,
                                        const void *__restrict src,
                                        size_t len) {
-#if (defined(__SSE3__) || defined(__aarch64__)) && !defined(_MSC_VER)
+#if defined(__SSE3__) || defined(__aarch64__) || \
+    (defined(_MSC_VER) && defined(__AVX__))
+  // This implementation requires SSE3.
+  // MSVC cannot target SSE3 directly, but when MSVC targets AVX,
+  // SSE3 support is implied.
   uint8_t *d = reinterpret_cast<uint8_t *>(dst);
   const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
 
@@ -104,17 +114,15 @@ inline void *non_temporal_store_memcpy(void *__restrict dst,
   }
   return dst;
 #else
-  // Fallback to regular memcpy when SSE2/3 & aarch64 is not available.
+  // Fallback to regular memcpy.
   return memcpy(dst, src, len);
-#endif  // __SSE3__ || __aarch64__
+#endif  // __SSE3__ || __aarch64__ || (_MSC_VER && __AVX__)
 }
 
-// MSVC does not have proper header support for some of these intrinsics,
-// so it should go to fallback
 inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
                                            const void *__restrict src,
                                            size_t len) {
-#if defined(__AVX__) && !defined(_MSC_VER)
+#ifdef __AVX__
   uint8_t *d = reinterpret_cast<uint8_t *>(dst);
   const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
--
cgit v1.2.3
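
For reference, the portable __cpuid pattern this commit adopts in cpu_detect.cc can be exercised with a minimal standalone sketch. This is illustrative only, not part of the patch: it assumes an x86-64 target, and the main() harness and the "cpuid_demo" name are hypothetical. On Windows it uses MSVC's __cpuid intrinsic from <intrin.h>; elsewhere it defines the same inline-assembly wrapper the patch adds, then prints the CPUID leaf-0 vendor string.

// cpuid_demo.cc -- illustrative sketch; build for x86-64 only
// (the inline assembly below is x86-specific).
#include <iostream>
#include <string>

#if defined(_WIN32) || defined(_WIN64)
#include <intrin.h>  // MSVC declares __cpuid(int[4], int) here.
#else
// GCC/Clang: emulate MSVC's __cpuid with inline assembly, as the patch does.
static void __cpuid(int cpu_info[4], int info_type) {
  __asm__ volatile("cpuid \n\t"
                   : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]),
                     "=d"(cpu_info[3])
                   : "a"(info_type), "c"(0));
}
#endif

int main() {
  int cpu_info[4];
  __cpuid(cpu_info, 0);  // Leaf 0: vendor string returned in ebx, edx, ecx.
  std::string vendor;
  vendor.append(reinterpret_cast<char*>(&cpu_info[1]), 4);  // ebx
  vendor.append(reinterpret_cast<char*>(&cpu_info[3]), 4);  // edx
  vendor.append(reinterpret_cast<char*>(&cpu_info[2]), 4);  // ecx
  std::cout << "CPU vendor: " << vendor << "\n";  // e.g. "GenuineIntel"
  return 0;
}

The same source compiles with "cl /arch:AVX cpuid_demo.cc" under MSVC and "g++ cpuid_demo.cc" or "clang++ cpuid_demo.cc" elsewhere, which is exactly the portability property the commit relies on.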