author     Abseil Team <absl-team@google.com>             2023-09-07 06:52:28 -0700
committer  Copybara-Service <copybara-worker@google.com>  2023-09-07 06:53:24 -0700
commit     433289a2586aa959241cf73675eadefaf21211b1 (patch)
tree       9bf28efde37cb93685f87172dca0ef4fdad02f76 /absl/crc
parent     7c05348692d70df64110d5d3e10ba90545870aeb (diff)
Roll forward support for ARM intrinsics in crc_memcpy
This CL rolls forward a previous change that was temporarily rolled back due to
compilation errors on x86 when PCLMUL intrinsics were unavailable.
*** Original change description ***
This change replaces inline x86 intrinsics with generic versions that compile
for both x86 and ARM depending on the target arch.
This change does not enable the accelerated crc memcpy engine on ARM. That will
be done in a subsequent change after the optimal number of vector and integer
regions for different CPUs is determined.
***
PiperOrigin-RevId: 563416413
Change-Id: Iee630a15ed83c26659adb0e8a03d3f3d3a46d688
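
For readers unfamiliar with the pattern, the "generic versions" referenced
above are thin wrappers that resolve to the native instruction at compile
time. A minimal sketch of the idea, assuming hardware CRC32-C support on the
target (the name MyCrc32U64 is illustrative, not the actual Abseil API):

#include <cstdint>

#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
#include <arm_acle.h>   // __crc32cd
#elif defined(__x86_64__) && defined(__SSE4_2__)
#include <nmmintrin.h>  // _mm_crc32_u64
#else
#error "This sketch assumes hardware CRC32-C support."
#endif

// Folds 8 bytes into a running CRC32-C value. The single call site compiles
// to the native CRC instruction on either architecture.
inline uint32_t MyCrc32U64(uint32_t crc, uint64_t v) {
#if defined(__aarch64__)
  return __crc32cd(crc, v);
#else
  return static_cast<uint32_t>(_mm_crc32_u64(crc, v));
#endif
}

Abseil's CRC32_u8/CRC32_u64 and the V128_* vector helpers in
crc32_x86_arm_combined_simd.h apply this same dispatch to every intrinsic the
copy engine needs, as the diff below shows.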
Diffstat (limited to 'absl/crc')
-rw-r--r--  absl/crc/BUILD.bazel                             |  5
-rw-r--r--  absl/crc/CMakeLists.txt                          |  5
-rw-r--r--  absl/crc/internal/crc32_x86_arm_combined_simd.h  | 31
-rw-r--r--  absl/crc/internal/crc_memcpy.h                   |  9
-rw-r--r--  absl/crc/internal/crc_memcpy_fallback.cc         |  6
-rw-r--r--  absl/crc/internal/crc_memcpy_x86_arm_combined.cc (renamed from absl/crc/internal/crc_memcpy_x86_64.cc)  | 72
6 files changed, 78 insertions, 50 deletions
diff --git a/absl/crc/BUILD.bazel b/absl/crc/BUILD.bazel
index cdbaa9b2..5580420a 100644
--- a/absl/crc/BUILD.bazel
+++ b/absl/crc/BUILD.bazel
@@ -54,10 +54,8 @@ cc_library(
     visibility = ["//visibility:private"],
     deps = [
         ":cpu_detect",
-        "//absl/base",
         "//absl/base:config",
         "//absl/base:core_headers",
-        "//absl/base:dynamic_annotations",
         "//absl/base:endian",
         "//absl/base:prefetch",
         "//absl/base:raw_logging_internal",
@@ -72,7 +70,7 @@ cc_library(
         "crc32c.cc",
         "internal/crc32c_inline.h",
         "internal/crc_memcpy_fallback.cc",
-        "internal/crc_memcpy_x86_64.cc",
+        "internal/crc_memcpy_x86_arm_combined.cc",
         "internal/crc_non_temporal_memcpy.cc",
     ],
     hdrs = [
@@ -89,7 +87,6 @@ cc_library(
         ":non_temporal_memcpy",
         "//absl/base:config",
         "//absl/base:core_headers",
-        "//absl/base:dynamic_annotations",
        "//absl/base:endian",
         "//absl/base:prefetch",
         "//absl/strings",
diff --git a/absl/crc/CMakeLists.txt b/absl/crc/CMakeLists.txt
index 21247160..ec7b4512 100644
--- a/absl/crc/CMakeLists.txt
+++ b/absl/crc/CMakeLists.txt
@@ -42,10 +42,8 @@ absl_cc_library(
     ${ABSL_DEFAULT_COPTS}
   DEPS
     absl::crc_cpu_detect
-    absl::base
     absl::config
     absl::core_headers
-    absl::dynamic_annotations
     absl::endian
     absl::prefetch
     absl::raw_logging_internal
@@ -64,7 +62,7 @@ absl_cc_library(
     "crc32c.cc"
     "internal/crc32c_inline.h"
     "internal/crc_memcpy_fallback.cc"
-    "internal/crc_memcpy_x86_64.cc"
+    "internal/crc_memcpy_x86_arm_combined.cc"
     "internal/crc_non_temporal_memcpy.cc"
   COPTS
     ${ABSL_DEFAULT_COPTS}
@@ -74,7 +72,6 @@ absl_cc_library(
     absl::non_temporal_memcpy
     absl::config
     absl::core_headers
-    absl::dynamic_annotations
     absl::endian
     absl::prefetch
     absl::str_format
diff --git a/absl/crc/internal/crc32_x86_arm_combined_simd.h b/absl/crc/internal/crc32_x86_arm_combined_simd.h
index 39e53dd0..514e8fd9 100644
--- a/absl/crc/internal/crc32_x86_arm_combined_simd.h
+++ b/absl/crc/internal/crc32_x86_arm_combined_simd.h
@@ -58,8 +58,10 @@ namespace crc_internal {
 #if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
 using V128 = uint64x2_t;
+using V128u = uint64x2_t;
 #else
 using V128 = __m128i;
+using V128u = __m128i_u;
 #endif
 
 // Starting with the initial value in |crc|, accumulates a CRC32 value for
@@ -76,7 +78,10 @@ uint32_t CRC32_u64(uint32_t crc, uint64_t v);
 V128 V128_Load(const V128* src);
 
 // Load 128 bits of integer data. |src| does not need to be aligned.
-V128 V128_LoadU(const V128* src);
+V128 V128_LoadU(const V128u* src);
+
+// Store 128 bits of integer data. |src| must be 16-byte aligned.
+void V128_Store(V128* dst, V128 data);
 
 // Polynomially multiplies the high 64 bits of |l| and |r|.
 V128 V128_PMulHi(const V128 l, const V128 r);
@@ -109,6 +114,10 @@ V128 V128_ShiftRight(const V128 l);
 template <int imm>
 int V128_Extract32(const V128 l);
 
+// Extracts a 64-bit integer from |l|, selected with |imm|.
+template <int imm>
+uint64_t V128_Extract64(const V128 l);
+
 // Extracts the low 64 bits from V128.
 int64_t V128_Low64(const V128 l);
@@ -137,7 +146,9 @@ inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
 
 inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
 
-inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
+inline V128 V128_LoadU(const V128u* src) { return _mm_loadu_si128(src); }
+
+inline void V128_Store(V128* dst, V128 data) { _mm_store_si128(dst, data); }
 
 inline V128 V128_PMulHi(const V128 l, const V128 r) {
   return _mm_clmulepi64_si128(l, r, 0x11);
@@ -173,6 +184,11 @@ inline int V128_Extract32(const V128 l) {
   return _mm_extract_epi32(l, imm);
 }
 
+template <int imm>
+inline uint64_t V128_Extract64(const V128 l) {
+  return static_cast<uint64_t>(_mm_extract_epi64(l, imm));
+}
+
 inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); }
 
 inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
@@ -199,10 +215,14 @@ inline V128 V128_Load(const V128* src) {
   return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
 }
 
-inline V128 V128_LoadU(const V128* src) {
+inline V128 V128_LoadU(const V128u* src) {
   return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
 }
 
+inline void V128_Store(V128* dst, V128 data) {
+  vst1q_u64(reinterpret_cast<uint64_t*>(dst), data);
+}
+
 // Using inline assembly as clang does not generate the pmull2 instruction and
 // performance drops by 15-20%.
 // TODO(b/193678732): Investigate why the compiler decides not to generate
@@ -252,6 +272,11 @@ inline int V128_Extract32(const V128 l) {
   return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);
 }
 
+template <int imm>
+inline uint64_t V128_Extract64(const V128 l) {
+  return vgetq_lane_s64(vreinterpretq_s64_u64(l), imm);
+}
+
 inline int64_t V128_Low64(const V128 l) {
   return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0);
 }
diff --git a/absl/crc/internal/crc_memcpy.h b/absl/crc/internal/crc_memcpy.h
index 4909d433..a0fed65a 100644
--- a/absl/crc/internal/crc_memcpy.h
+++ b/absl/crc/internal/crc_memcpy.h
@@ -20,12 +20,15 @@
 
 #include "absl/base/config.h"
 #include "absl/crc/crc32c.h"
+#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
 
 // Defined if the class AcceleratedCrcMemcpyEngine exists.
-#if defined(__x86_64__) && defined(__SSE4_2__)
-#define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
-#elif defined(_MSC_VER) && defined(__AVX__)
+// TODO(b/299127771): Consider relaxing the pclmul requirement once the other
+// intrinsics are conditionally compiled without it.
+#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
 #define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
+#elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
+#define ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE 1
 #endif
 
 namespace absl {
diff --git a/absl/crc/internal/crc_memcpy_fallback.cc b/absl/crc/internal/crc_memcpy_fallback.cc
index 15b4b055..07795504 100644
--- a/absl/crc/internal/crc_memcpy_fallback.cc
+++ b/absl/crc/internal/crc_memcpy_fallback.cc
@@ -54,7 +54,8 @@ absl::crc32c_t FallbackCrcMemcpyEngine::Compute(void* __restrict dst,
 }
 
 // Compile the following only if we don't have
-#ifndef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#if !defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) && \
+    !defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
 
 CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
   CrcMemcpy::ArchSpecificEngines engines;
@@ -68,7 +69,8 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int /*vector*/,
   return std::make_unique<FallbackCrcMemcpyEngine>();
 }
 
-#endif  // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#endif  // !ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE &&
+        // !ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
 
 }  // namespace crc_internal
 ABSL_NAMESPACE_END
diff --git a/absl/crc/internal/crc_memcpy_x86_64.cc b/absl/crc/internal/crc_memcpy_x86_arm_combined.cc
index c4ccd472..a06485a6 100644
--- a/absl/crc/internal/crc_memcpy_x86_64.cc
+++ b/absl/crc/internal/crc_memcpy_x86_arm_combined.cc
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Simultaneous memcopy and CRC-32C for x86-64. Uses integer registers because
-// XMM registers do not support the CRC instruction (yet). While copying,
-// compute the running CRC of the data being copied.
+// Simultaneous memcopy and CRC-32C for x86-64 and ARM 64. Uses integer
+// registers because XMM registers do not support the CRC instruction (yet).
+// While copying, compute the running CRC of the data being copied.
 //
 // It is assumed that any CPU running this code has SSE4.2 instructions
 // available (for CRC32C). This file will do nothing if that is not true.
@@ -57,10 +57,12 @@
 #include "absl/base/prefetch.h"
 #include "absl/crc/crc32c.h"
 #include "absl/crc/internal/cpu_detect.h"
+#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
 #include "absl/crc/internal/crc_memcpy.h"
 #include "absl/strings/string_view.h"
 
-#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#if defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) || \
+    defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
 
 namespace absl {
 ABSL_NAMESPACE_BEGIN
@@ -75,7 +77,7 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
   uint32_t crc_uint32 = static_cast<uint32_t>(crc);
   for (std::size_t i = 0; i < length; i++) {
     uint8_t data = *reinterpret_cast<const uint8_t*>(src);
-    crc_uint32 = _mm_crc32_u8(crc_uint32, data);
+    crc_uint32 = CRC32_u8(crc_uint32, data);
     *reinterpret_cast<uint8_t*>(dst) = data;
     ++src;
     ++dst;
@@ -83,36 +85,35 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
   return crc32c_t{crc_uint32};
 }
 
-constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);
+constexpr size_t kIntLoadsPerVec = sizeof(V128) / sizeof(uint64_t);
 
 // Common function for copying the tails of multiple large regions.
 template <size_t vec_regions, size_t int_regions>
 inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
                           size_t region_size, size_t copy_rounds) {
-  std::array<__m128i, vec_regions> data;
+  std::array<V128, vec_regions> data;
   std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;
 
   while (copy_rounds > 0) {
     for (size_t i = 0; i < vec_regions; i++) {
       size_t region = i;
 
-      auto* vsrc =
-          reinterpret_cast<const __m128i*>(*src + region_size * region);
-      auto* vdst = reinterpret_cast<__m128i*>(*dst + region_size * region);
+      auto* vsrc = reinterpret_cast<const V128u*>(*src + region_size * region);
+      auto* vdst = reinterpret_cast<V128*>(*dst + region_size * region);
 
       // Load the blocks, unaligned
-      data[i] = _mm_loadu_si128(vsrc);
+      data[i] = V128_LoadU(vsrc);
 
       // Store the blocks, aligned
-      _mm_store_si128(vdst, data[i]);
+      V128_Store(vdst, data[i]);
 
       // Compute the running CRC
       crcs[region] = crc32c_t{static_cast<uint32_t>(
-          _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
-                        static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))};
+          CRC32_u64(static_cast<uint32_t>(crcs[region]),
+                    static_cast<uint64_t>(V128_Extract64<0>(data[i]))))};
       crcs[region] = crc32c_t{static_cast<uint32_t>(
-          _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
-                        static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))};
+          CRC32_u64(static_cast<uint32_t>(crcs[region]),
+                    static_cast<uint64_t>(V128_Extract64<1>(data[i]))))};
     }
 
     for (size_t i = 0; i < int_regions; i++) {
@@ -126,7 +127,7 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
         size_t data_index = i * kIntLoadsPerVec + j;
 
         int_data[data_index] = *(usrc + j);
-        crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+        crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64(
             static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
 
         *(udst + j) = int_data[data_index];
@@ -134,8 +135,8 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
     }
 
     // Increment pointers
-    *src += sizeof(__m128i);
-    *dst += sizeof(__m128i);
+    *src += sizeof(V128);
+    *dst += sizeof(V128);
     --copy_rounds;
   }
 }
@@ -161,7 +162,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
   constexpr std::size_t kRegions = vec_regions + int_regions;
   static_assert(kRegions > 0, "Must specify at least one region.");
   constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
-  constexpr std::size_t kBlockSize = sizeof(__m128i);
+  constexpr std::size_t kBlockSize = sizeof(V128);
   constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;
 
   // Number of blocks per cacheline.
@@ -237,7 +238,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
   const std::size_t tail_size = length - (kRegions * region_size);
 
   // Holding registers for data in each region.
-  std::array<__m128i, vec_regions> vec_data;
+  std::array<V128, vec_regions> vec_data;
   std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data;
 
   // Main loop.
@@ -245,7 +246,10 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
     // Prefetch kPrefetchAhead bytes ahead of each pointer.
     for (size_t i = 0; i < kRegions; i++) {
       absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i);
+#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+      // TODO(b/297082454): investigate dropping prefetch on x86.
       absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i);
+#endif
     }
 
     // Load and store data, computing CRC on the way.
@@ -258,21 +262,20 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
       size_t region = (j + i) % kRegions;
 
       auto* vsrc =
-          reinterpret_cast<const __m128i*>(src_bytes + region_size * region);
-      auto* vdst =
-          reinterpret_cast<__m128i*>(dst_bytes + region_size * region);
+          reinterpret_cast<const V128u*>(src_bytes + region_size * region);
+      auto* vdst = reinterpret_cast<V128*>(dst_bytes + region_size * region);
 
       // Load and CRC data.
-      vec_data[j] = _mm_loadu_si128(vsrc + i);
-      crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
-          static_cast<uint32_t>(crcs[region]),
-          static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))};
-      crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
-          static_cast<uint32_t>(crcs[region]),
-          static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))};
+      vec_data[j] = V128_LoadU(vsrc + i);
+      crcs[region] = crc32c_t{static_cast<uint32_t>(
+          CRC32_u64(static_cast<uint32_t>(crcs[region]),
+                    static_cast<uint64_t>(V128_Extract64<0>(vec_data[j]))))};
+      crcs[region] = crc32c_t{static_cast<uint32_t>(
+          CRC32_u64(static_cast<uint32_t>(crcs[region]),
+                    static_cast<uint64_t>(V128_Extract64<1>(vec_data[j]))))};
 
       // Store the data.
-      _mm_store_si128(vdst + i, vec_data[j]);
+      V128_Store(vdst + i, vec_data[j]);
     }
 
     // Preload the partial CRCs for the CLMUL subregions.
@@ -292,7 +295,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
 
         // Load and CRC the data.
         int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
-        crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+        crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64(
            static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
 
         // Store the data.
@@ -443,4 +446,5 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
 ABSL_NAMESPACE_END
 }  // namespace absl
 
-#endif  // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#endif  // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE ||
+        // ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
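
To see the technique the renamed file implements in miniature: the engine
fuses the checksum into the copy so each byte is read from memory only once,
rather than making one pass for memcpy and another for the CRC. A simplified
scalar analogue of that fused loop, assuming for brevity a length that is a
multiple of 8 and reusing the hypothetical MyCrc32U64 wrapper sketched above
(the real engine instead interleaves several vector and integer regions per
round to keep multiple CRC dependency chains in flight):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Copies n bytes (n % 8 == 0 assumed) from src to dst while accumulating a
// CRC32-C over the copied data in a single pass over memory.
uint32_t CrcCopyScalar(void* dst, const void* src, size_t n, uint32_t crc) {
  auto* d = static_cast<char*>(dst);
  auto* s = static_cast<const char*>(src);
  for (; n >= sizeof(uint64_t); n -= sizeof(uint64_t)) {
    uint64_t word;
    std::memcpy(&word, s, sizeof(word));  // unaligned-safe load
    crc = MyCrc32U64(crc, word);          // fold 8 bytes into the CRC
    std::memcpy(d, &word, sizeof(word));  // store to the destination
    s += sizeof(word);
    d += sizeof(word);
  }
  return crc;
}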