author     Abseil Team <absl-team@google.com>             2023-09-05 09:56:46 -0700
committer  Copybara-Service <copybara-worker@google.com>  2023-09-05 09:57:30 -0700
commit     461f1e49b395700ff4d7b0bb820df49e0f8ba5cb (patch)
tree       22fce051a60e5e47dd173b9110e69847a6843906 /absl/crc
parent     1a882833c0e81309d0d72d46c768820744d053df (diff)
Rollback adding support for ARM intrinsics
In some configurations this change causes compilation errors. We will roll this
forward again after those issues are addressed.
PiperOrigin-RevId: 562810916
Change-Id: I45b2a8d456273e9eff188f36da8f11323c4dfe66
Diffstat (limited to 'absl/crc')
-rw-r--r--  absl/crc/BUILD.bazel                              5
-rw-r--r--  absl/crc/CMakeLists.txt                           5
-rw-r--r--  absl/crc/internal/crc32_x86_arm_combined_simd.h  31
-rw-r--r--  absl/crc/internal/crc_memcpy.h                    3
-rw-r--r--  absl/crc/internal/crc_memcpy_fallback.cc          6
-rw-r--r--  absl/crc/internal/crc_memcpy_x86_64.cc           72  (renamed from absl/crc/internal/crc_memcpy_x86_arm_combined.cc)
6 files changed, 47 insertions, 75 deletions
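
For context: the change being rolled back had routed the CRC-memcpy engine through the portable V128 wrapper layer declared in crc32_x86_arm_combined_simd.h, so one source file could compile to NEON on AArch64 and SSE on x86-64. The rollback below re-inlines the SSE intrinsics and drops the ARM engine. A minimal sketch of the wrapper dispatch being removed, reduced from the definitions visible in the diff — note the x86 CRC32_u64 body and the ARM __crc32cd mapping are assumptions based on the standard intrinsics, as this commit does not show them:

    #include <cstdint>

    #if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
    #include <arm_acle.h>  // __crc32cd; needs a CRC-capable ARMv8 target
    #include <arm_neon.h>
    using V128 = uint64x2_t;
    // CRC-32C of one 64-bit word, accumulated into |crc|.
    inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
      return __crc32cd(crc, v);
    }
    // vld1q_u64 imposes no alignment requirement, so it doubles as the
    // unaligned load.
    inline V128 V128_LoadU(const V128* src) {
      return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
    }
    #else
    #include <emmintrin.h>  // SSE2: _mm_loadu_si128
    #include <nmmintrin.h>  // SSE4.2: _mm_crc32_u64
    using V128 = __m128i;
    inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
      return static_cast<uint32_t>(_mm_crc32_u64(crc, v));
    }
    inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
    #endif

With wrappers like these, the copy loops could be written once against V128 and CRC32_u64; the rollback instead replaces each call site with the direct _mm_* intrinsic.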
diff --git a/absl/crc/BUILD.bazel b/absl/crc/BUILD.bazel
index 5580420a..cdbaa9b2 100644
--- a/absl/crc/BUILD.bazel
+++ b/absl/crc/BUILD.bazel
@@ -54,8 +54,10 @@ cc_library(
     visibility = ["//visibility:private"],
     deps = [
         ":cpu_detect",
+        "//absl/base",
         "//absl/base:config",
         "//absl/base:core_headers",
+        "//absl/base:dynamic_annotations",
         "//absl/base:endian",
         "//absl/base:prefetch",
         "//absl/base:raw_logging_internal",
@@ -70,7 +72,7 @@ cc_library(
         "crc32c.cc",
         "internal/crc32c_inline.h",
         "internal/crc_memcpy_fallback.cc",
-        "internal/crc_memcpy_x86_arm_combined.cc",
+        "internal/crc_memcpy_x86_64.cc",
         "internal/crc_non_temporal_memcpy.cc",
     ],
     hdrs = [
@@ -87,6 +89,7 @@ cc_library(
         ":non_temporal_memcpy",
         "//absl/base:config",
         "//absl/base:core_headers",
+        "//absl/base:dynamic_annotations",
        "//absl/base:endian",
         "//absl/base:prefetch",
         "//absl/strings",
diff --git a/absl/crc/CMakeLists.txt b/absl/crc/CMakeLists.txt
index ec7b4512..21247160 100644
--- a/absl/crc/CMakeLists.txt
+++ b/absl/crc/CMakeLists.txt
@@ -42,8 +42,10 @@ absl_cc_library(
     ${ABSL_DEFAULT_COPTS}
   DEPS
     absl::crc_cpu_detect
+    absl::base
     absl::config
     absl::core_headers
+    absl::dynamic_annotations
     absl::endian
     absl::prefetch
     absl::raw_logging_internal
@@ -62,7 +64,7 @@ absl_cc_library(
     "crc32c.cc"
     "internal/crc32c_inline.h"
     "internal/crc_memcpy_fallback.cc"
-    "internal/crc_memcpy_x86_arm_combined.cc"
+    "internal/crc_memcpy_x86_64.cc"
     "internal/crc_non_temporal_memcpy.cc"
   COPTS
     ${ABSL_DEFAULT_COPTS}
@@ -72,6 +74,7 @@ absl_cc_library(
     absl::non_temporal_memcpy
     absl::config
     absl::core_headers
+    absl::dynamic_annotations
     absl::endian
     absl::prefetch
     absl::str_format
diff --git a/absl/crc/internal/crc32_x86_arm_combined_simd.h b/absl/crc/internal/crc32_x86_arm_combined_simd.h
index 514e8fd9..39e53dd0 100644
--- a/absl/crc/internal/crc32_x86_arm_combined_simd.h
+++ b/absl/crc/internal/crc32_x86_arm_combined_simd.h
@@ -58,10 +58,8 @@ namespace crc_internal {
 
 #if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
 using V128 = uint64x2_t;
-using V128u = uint64x2_t;
 #else
 using V128 = __m128i;
-using V128u = __m128i_u;
 #endif
 
 // Starting with the initial value in |crc|, accumulates a CRC32 value for
@@ -78,10 +76,7 @@ uint32_t CRC32_u64(uint32_t crc, uint64_t v);
 V128 V128_Load(const V128* src);
 
 // Load 128 bits of integer data. |src| does not need to be aligned.
-V128 V128_LoadU(const V128u* src);
-
-// Store 128 bits of integer data. |src| must be 16-byte aligned.
-void V128_Store(V128* dst, V128 data);
+V128 V128_LoadU(const V128* src);
 
 // Polynomially multiplies the high 64 bits of |l| and |r|.
 V128 V128_PMulHi(const V128 l, const V128 r);
@@ -114,10 +109,6 @@ V128 V128_ShiftRight(const V128 l);
 template <int imm>
 int V128_Extract32(const V128 l);
 
-// Extracts a 64-bit integer from |l|, selected with |imm|.
-template <int imm>
-uint64_t V128_Extract64(const V128 l);
-
 // Extracts the low 64 bits from V128.
 int64_t V128_Low64(const V128 l);
 
@@ -146,9 +137,7 @@ inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
 
 inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
 
-inline V128 V128_LoadU(const V128u* src) { return _mm_loadu_si128(src); }
-
-inline void V128_Store(V128* dst, V128 data) { _mm_store_si128(dst, data); }
+inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
 
 inline V128 V128_PMulHi(const V128 l, const V128 r) {
   return _mm_clmulepi64_si128(l, r, 0x11);
@@ -184,11 +173,6 @@ inline int V128_Extract32(const V128 l) {
   return _mm_extract_epi32(l, imm);
 }
 
-template <int imm>
-inline uint64_t V128_Extract64(const V128 l) {
-  return static_cast<uint64_t>(_mm_extract_epi64(l, imm));
-}
-
 inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); }
 
 inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
@@ -215,14 +199,10 @@ inline V128 V128_Load(const V128* src) {
   return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
 }
 
-inline V128 V128_LoadU(const V128u* src) {
+inline V128 V128_LoadU(const V128* src) {
   return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
 }
 
-inline void V128_Store(V128* dst, V128 data) {
-  vst1q_u64(reinterpret_cast<uint64_t*>(dst), data);
-}
-
 // Using inline assembly as clang does not generate the pmull2 instruction and
 // performance drops by 15-20%.
 // TODO(b/193678732): Investigate why the compiler decides not to generate
@@ -272,11 +252,6 @@ inline int V128_Extract32(const V128 l) {
   return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);
 }
 
-template <int imm>
-inline uint64_t V128_Extract64(const V128 l) {
-  return vgetq_lane_s64(vreinterpretq_s64_u64(l), imm);
-}
-
 inline int64_t V128_Low64(const V128 l) {
   return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0);
 }
diff --git a/absl/crc/internal/crc_memcpy.h b/absl/crc/internal/crc_memcpy.h
index 1147f4ef..4909d433 100644
--- a/absl/crc/internal/crc_memcpy.h
+++ b/absl/crc/internal/crc_memcpy.h
@@ -20,15 +20,12 @@
 
 #include "absl/base/config.h"
 #include "absl/crc/crc32c.h"
-#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
 
 // Defined if the class AcceleratedCrcMemcpyEngine exists.
 #if defined(__x86_64__) && defined(__SSE4_2__)
 #define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
 #elif defined(_MSC_VER) && defined(__AVX__)
 #define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
-#elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
-#define ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE 1
 #endif
 
 namespace absl {
diff --git a/absl/crc/internal/crc_memcpy_fallback.cc b/absl/crc/internal/crc_memcpy_fallback.cc
index 07795504..15b4b055 100644
--- a/absl/crc/internal/crc_memcpy_fallback.cc
+++ b/absl/crc/internal/crc_memcpy_fallback.cc
@@ -54,8 +54,7 @@ absl::crc32c_t FallbackCrcMemcpyEngine::Compute(void* __restrict dst,
 }
 
 // Compile the following only if we don't have
-#if !defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) && \
-    !defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
+#ifndef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
 
 CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
   CrcMemcpy::ArchSpecificEngines engines;
@@ -69,8 +68,7 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int /*vector*/,
   return std::make_unique<FallbackCrcMemcpyEngine>();
 }
 
-#endif  // !ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE &&
-        // !ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
+#endif  // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
 
 }  // namespace crc_internal
 ABSL_NAMESPACE_END
diff --git a/absl/crc/internal/crc_memcpy_x86_arm_combined.cc b/absl/crc/internal/crc_memcpy_x86_64.cc
index a06485a6..c4ccd472 100644
--- a/absl/crc/internal/crc_memcpy_x86_arm_combined.cc
+++ b/absl/crc/internal/crc_memcpy_x86_64.cc
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Simultaneous memcopy and CRC-32C for x86-64 and ARM 64. Uses integer
-// registers because XMM registers do not support the CRC instruction (yet).
-// While copying, compute the running CRC of the data being copied.
+// Simultaneous memcopy and CRC-32C for x86-64. Uses integer registers because
+// XMM registers do not support the CRC instruction (yet). While copying,
+// compute the running CRC of the data being copied.
 //
 // It is assumed that any CPU running this code has SSE4.2 instructions
 // available (for CRC32C). This file will do nothing if that is not true.
@@ -57,12 +57,10 @@
 #include "absl/base/prefetch.h"
 #include "absl/crc/crc32c.h"
 #include "absl/crc/internal/cpu_detect.h"
-#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
 #include "absl/crc/internal/crc_memcpy.h"
 #include "absl/strings/string_view.h"
 
-#if defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) || \
-    defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
+#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
 
 namespace absl {
 ABSL_NAMESPACE_BEGIN
@@ -77,7 +75,7 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
   uint32_t crc_uint32 = static_cast<uint32_t>(crc);
   for (std::size_t i = 0; i < length; i++) {
     uint8_t data = *reinterpret_cast<const uint8_t*>(src);
-    crc_uint32 = CRC32_u8(crc_uint32, data);
+    crc_uint32 = _mm_crc32_u8(crc_uint32, data);
     *reinterpret_cast<uint8_t*>(dst) = data;
     ++src;
     ++dst;
@@ -85,35 +83,36 @@
   return crc32c_t{crc_uint32};
 }
 
-constexpr size_t kIntLoadsPerVec = sizeof(V128) / sizeof(uint64_t);
+constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);
 
 // Common function for copying the tails of multiple large regions.
 template <size_t vec_regions, size_t int_regions>
 inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
                           size_t region_size, size_t copy_rounds) {
-  std::array<V128, vec_regions> data;
+  std::array<__m128i, vec_regions> data;
   std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;
 
   while (copy_rounds > 0) {
     for (size_t i = 0; i < vec_regions; i++) {
       size_t region = i;
 
-      auto* vsrc = reinterpret_cast<const V128u*>(*src + region_size * region);
-      auto* vdst = reinterpret_cast<V128*>(*dst + region_size * region);
+      auto* vsrc =
+          reinterpret_cast<const __m128i*>(*src + region_size * region);
+      auto* vdst = reinterpret_cast<__m128i*>(*dst + region_size * region);
 
       // Load the blocks, unaligned
-      data[i] = V128_LoadU(vsrc);
+      data[i] = _mm_loadu_si128(vsrc);
 
       // Store the blocks, aligned
-      V128_Store(vdst, data[i]);
+      _mm_store_si128(vdst, data[i]);
 
       // Compute the running CRC
       crcs[region] = crc32c_t{static_cast<uint32_t>(
-          CRC32_u64(static_cast<uint32_t>(crcs[region]),
-                    static_cast<uint64_t>(V128_Extract64<0>(data[i]))))};
+          _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
+                        static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))};
       crcs[region] = crc32c_t{static_cast<uint32_t>(
-          CRC32_u64(static_cast<uint32_t>(crcs[region]),
-                    static_cast<uint64_t>(V128_Extract64<1>(data[i]))))};
+          _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
+                        static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))};
     }
 
     for (size_t i = 0; i < int_regions; i++) {
@@ -127,7 +126,7 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
         size_t data_index = i * kIntLoadsPerVec + j;
 
         int_data[data_index] = *(usrc + j);
-        crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64(
+        crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
            static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
 
         *(udst + j) = int_data[data_index];
@@ -135,8 +134,8 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
     }
 
     // Increment pointers
-    *src += sizeof(V128);
-    *dst += sizeof(V128);
+    *src += sizeof(__m128i);
+    *dst += sizeof(__m128i);
     --copy_rounds;
   }
 }
@@ -162,7 +161,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
   constexpr std::size_t kRegions = vec_regions + int_regions;
   static_assert(kRegions > 0, "Must specify at least one region.");
   constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
-  constexpr std::size_t kBlockSize = sizeof(V128);
+  constexpr std::size_t kBlockSize = sizeof(__m128i);
   constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;
 
   // Number of blocks per cacheline.
@@ -238,7 +237,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
   const std::size_t tail_size = length - (kRegions * region_size);
 
   // Holding registers for data in each region.
-  std::array<V128, vec_regions> vec_data;
+  std::array<__m128i, vec_regions> vec_data;
   std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data;
 
   // Main loop.
@@ -246,10 +245,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
     // Prefetch kPrefetchAhead bytes ahead of each pointer.
     for (size_t i = 0; i < kRegions; i++) {
       absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i);
-#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
-      // TODO(b/297082454): investigate dropping prefetch on x86.
       absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i);
-#endif
     }
 
     // Load and store data, computing CRC on the way.
@@ -262,20 +258,21 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
         size_t region = (j + i) % kRegions;
 
         auto* vsrc =
-            reinterpret_cast<const V128u*>(src_bytes + region_size * region);
-        auto* vdst = reinterpret_cast<V128*>(dst_bytes + region_size * region);
+            reinterpret_cast<const __m128i*>(src_bytes + region_size * region);
+        auto* vdst =
+            reinterpret_cast<__m128i*>(dst_bytes + region_size * region);
 
         // Load and CRC data.
-        vec_data[j] = V128_LoadU(vsrc + i);
-        crcs[region] = crc32c_t{static_cast<uint32_t>(
-            CRC32_u64(static_cast<uint32_t>(crcs[region]),
-                      static_cast<uint64_t>(V128_Extract64<0>(vec_data[j]))))};
-        crcs[region] = crc32c_t{static_cast<uint32_t>(
-            CRC32_u64(static_cast<uint32_t>(crcs[region]),
-                      static_cast<uint64_t>(V128_Extract64<1>(vec_data[j]))))};
+        vec_data[j] = _mm_loadu_si128(vsrc + i);
+        crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+            static_cast<uint32_t>(crcs[region]),
+            static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))};
+        crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+            static_cast<uint32_t>(crcs[region]),
+            static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))};
 
         // Store the data.
-        V128_Store(vdst + i, vec_data[j]);
+        _mm_store_si128(vdst + i, vec_data[j]);
       }
 
       // Preload the partial CRCs for the CLMUL subregions.
@@ -295,7 +292,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
 
           // Load and CRC the data.
           int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
-          crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64(
+          crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
              static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
 
           // Store the data.
@@ -446,5 +443,4 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
 ABSL_NAMESPACE_END
 }  // namespace absl
 
-#endif  // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE ||
-        // ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
+#endif  // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
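
The technique itself survives the rollback unchanged: copy in blocks and fold each block into the running CRC-32C with the hardware CRC instruction, keeping the CRC in integer registers since XMM registers have no CRC support. A minimal, self-contained sketch of that pattern, assuming SSE4.2 (compile with -msse4.2). The name CrcMemcpySketch and the single-region loop are illustrative only; the real engine above splits the buffer into several regions to keep the CRC unit's pipeline busy and applies the kCrcDataXor inversions required by the absl::crc32c_t conventions:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    #include <nmmintrin.h>  // SSE4.2: _mm_crc32_u64, _mm_crc32_u8

    // Copies |length| bytes from |src| to |dst| and returns the raw CRC-32C
    // accumulation over the copied bytes (no pre/post inversion).
    uint32_t CrcMemcpySketch(void* dst, const void* src, std::size_t length) {
      auto* d = static_cast<char*>(dst);
      auto* s = static_cast<const char*>(src);
      uint32_t crc = 0;
      // Bulk path: 8 bytes per step through the 64-bit CRC32 instruction.
      while (length >= sizeof(uint64_t)) {
        uint64_t word;
        std::memcpy(&word, s, sizeof(word));  // unaligned-safe load
        crc = static_cast<uint32_t>(_mm_crc32_u64(crc, word));
        std::memcpy(d, &word, sizeof(word));
        s += sizeof(word);
        d += sizeof(word);
        length -= sizeof(word);
      }
      // Tail: one byte at a time, mirroring ShortCrcCopy in the diff above.
      while (length > 0) {
        uint8_t byte = static_cast<uint8_t>(*s++);
        crc = _mm_crc32_u8(crc, byte);
        *d++ = static_cast<char>(byte);
        --length;
      }
      return crc;
    }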