diff options
-rw-r--r-- | CMake/AbseilDll.cmake | 2 | ||||
-rw-r--r-- | absl/crc/BUILD.bazel | 5 | ||||
-rw-r--r-- | absl/crc/CMakeLists.txt | 5 | ||||
-rw-r--r-- | absl/crc/internal/crc32_x86_arm_combined_simd.h | 31 | ||||
-rw-r--r-- | absl/crc/internal/crc_memcpy.h | 9 | ||||
-rw-r--r-- | absl/crc/internal/crc_memcpy_fallback.cc | 6 | ||||
-rw-r--r-- | absl/crc/internal/crc_memcpy_x86_arm_combined.cc (renamed from absl/crc/internal/crc_memcpy_x86_64.cc) | 72 |
7 files changed, 79 insertions, 51 deletions
diff --git a/CMake/AbseilDll.cmake b/CMake/AbseilDll.cmake index 25031be9..0b2d5c76 100644 --- a/CMake/AbseilDll.cmake +++ b/CMake/AbseilDll.cmake @@ -107,7 +107,7 @@ set(ABSL_INTERNAL_DLL_FILES "crc/internal/crc_x86_arm_combined.cc" "crc/internal/crc_memcpy_fallback.cc" "crc/internal/crc_memcpy.h" - "crc/internal/crc_memcpy_x86_64.cc" + "crc/internal/crc_memcpy_x86_arm_combined.cc" "crc/internal/crc_non_temporal_memcpy.cc" "crc/internal/crc_x86_arm_combined.cc" "crc/internal/non_temporal_arm_intrinsics.h" diff --git a/absl/crc/BUILD.bazel b/absl/crc/BUILD.bazel index cdbaa9b2..5580420a 100644 --- a/absl/crc/BUILD.bazel +++ b/absl/crc/BUILD.bazel @@ -54,10 +54,8 @@ cc_library( visibility = ["//visibility:private"], deps = [ ":cpu_detect", - "//absl/base", "//absl/base:config", "//absl/base:core_headers", - "//absl/base:dynamic_annotations", "//absl/base:endian", "//absl/base:prefetch", "//absl/base:raw_logging_internal", @@ -72,7 +70,7 @@ cc_library( "crc32c.cc", "internal/crc32c_inline.h", "internal/crc_memcpy_fallback.cc", - "internal/crc_memcpy_x86_64.cc", + "internal/crc_memcpy_x86_arm_combined.cc", "internal/crc_non_temporal_memcpy.cc", ], hdrs = [ @@ -89,7 +87,6 @@ cc_library( ":non_temporal_memcpy", "//absl/base:config", "//absl/base:core_headers", - "//absl/base:dynamic_annotations", "//absl/base:endian", "//absl/base:prefetch", "//absl/strings", diff --git a/absl/crc/CMakeLists.txt b/absl/crc/CMakeLists.txt index 21247160..ec7b4512 100644 --- a/absl/crc/CMakeLists.txt +++ b/absl/crc/CMakeLists.txt @@ -42,10 +42,8 @@ absl_cc_library( ${ABSL_DEFAULT_COPTS} DEPS absl::crc_cpu_detect - absl::base absl::config absl::core_headers - absl::dynamic_annotations absl::endian absl::prefetch absl::raw_logging_internal @@ -64,7 +62,7 @@ absl_cc_library( "crc32c.cc" "internal/crc32c_inline.h" "internal/crc_memcpy_fallback.cc" - "internal/crc_memcpy_x86_64.cc" + "internal/crc_memcpy_x86_arm_combined.cc" "internal/crc_non_temporal_memcpy.cc" COPTS ${ABSL_DEFAULT_COPTS} @@ -74,7 +72,6 @@ absl_cc_library( absl::non_temporal_memcpy absl::config absl::core_headers - absl::dynamic_annotations absl::endian absl::prefetch absl::str_format diff --git a/absl/crc/internal/crc32_x86_arm_combined_simd.h b/absl/crc/internal/crc32_x86_arm_combined_simd.h index 39e53dd0..514e8fd9 100644 --- a/absl/crc/internal/crc32_x86_arm_combined_simd.h +++ b/absl/crc/internal/crc32_x86_arm_combined_simd.h @@ -58,8 +58,10 @@ namespace crc_internal { #if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) using V128 = uint64x2_t; +using V128u = uint64x2_t; #else using V128 = __m128i; +using V128u = __m128i_u; #endif // Starting with the initial value in |crc|, accumulates a CRC32 value for @@ -76,7 +78,10 @@ uint32_t CRC32_u64(uint32_t crc, uint64_t v); V128 V128_Load(const V128* src); // Load 128 bits of integer data. |src| does not need to be aligned. -V128 V128_LoadU(const V128* src); +V128 V128_LoadU(const V128u* src); + +// Store 128 bits of integer data. |src| must be 16-byte aligned. +void V128_Store(V128* dst, V128 data); // Polynomially multiplies the high 64 bits of |l| and |r|. V128 V128_PMulHi(const V128 l, const V128 r); @@ -109,6 +114,10 @@ V128 V128_ShiftRight(const V128 l); template <int imm> int V128_Extract32(const V128 l); +// Extracts a 64-bit integer from |l|, selected with |imm|. +template <int imm> +uint64_t V128_Extract64(const V128 l); + // Extracts the low 64 bits from V128. int64_t V128_Low64(const V128 l); @@ -137,7 +146,9 @@ inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) { inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); } -inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); } +inline V128 V128_LoadU(const V128u* src) { return _mm_loadu_si128(src); } + +inline void V128_Store(V128* dst, V128 data) { _mm_store_si128(dst, data); } inline V128 V128_PMulHi(const V128 l, const V128 r) { return _mm_clmulepi64_si128(l, r, 0x11); @@ -173,6 +184,11 @@ inline int V128_Extract32(const V128 l) { return _mm_extract_epi32(l, imm); } +template <int imm> +inline uint64_t V128_Extract64(const V128 l) { + return static_cast<uint64_t>(_mm_extract_epi64(l, imm)); +} + inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); } inline V128 V128_ShiftLeft64(const V128 l, const V128 r) { @@ -199,10 +215,14 @@ inline V128 V128_Load(const V128* src) { return vld1q_u64(reinterpret_cast<const uint64_t*>(src)); } -inline V128 V128_LoadU(const V128* src) { +inline V128 V128_LoadU(const V128u* src) { return vld1q_u64(reinterpret_cast<const uint64_t*>(src)); } +inline void V128_Store(V128* dst, V128 data) { + vst1q_u64(reinterpret_cast<uint64_t*>(dst), data); +} + // Using inline assembly as clang does not generate the pmull2 instruction and // performance drops by 15-20%. // TODO(b/193678732): Investigate why the compiler decides not to generate @@ -252,6 +272,11 @@ inline int V128_Extract32(const V128 l) { return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm); } +template <int imm> +inline uint64_t V128_Extract64(const V128 l) { + return vgetq_lane_s64(vreinterpretq_s64_u64(l), imm); +} + inline int64_t V128_Low64(const V128 l) { return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0); } diff --git a/absl/crc/internal/crc_memcpy.h b/absl/crc/internal/crc_memcpy.h index 4909d433..a0fed65a 100644 --- a/absl/crc/internal/crc_memcpy.h +++ b/absl/crc/internal/crc_memcpy.h @@ -20,12 +20,15 @@ #include "absl/base/config.h" #include "absl/crc/crc32c.h" +#include "absl/crc/internal/crc32_x86_arm_combined_simd.h" // Defined if the class AcceleratedCrcMemcpyEngine exists. -#if defined(__x86_64__) && defined(__SSE4_2__) -#define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1 -#elif defined(_MSC_VER) && defined(__AVX__) +// TODO(b/299127771): Consider relaxing the pclmul requirement once the other +// intrinsics are conditionally compiled without it. +#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD) #define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1 +#elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) +#define ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE 1 #endif namespace absl { diff --git a/absl/crc/internal/crc_memcpy_fallback.cc b/absl/crc/internal/crc_memcpy_fallback.cc index 15b4b055..07795504 100644 --- a/absl/crc/internal/crc_memcpy_fallback.cc +++ b/absl/crc/internal/crc_memcpy_fallback.cc @@ -54,7 +54,8 @@ absl::crc32c_t FallbackCrcMemcpyEngine::Compute(void* __restrict dst, } // Compile the following only if we don't have -#ifndef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE +#if !defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) && \ + !defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE) CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() { CrcMemcpy::ArchSpecificEngines engines; @@ -68,7 +69,8 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int /*vector*/, return std::make_unique<FallbackCrcMemcpyEngine>(); } -#endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE +#endif // !ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE && + // !ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE } // namespace crc_internal ABSL_NAMESPACE_END diff --git a/absl/crc/internal/crc_memcpy_x86_64.cc b/absl/crc/internal/crc_memcpy_x86_arm_combined.cc index c4ccd472..a06485a6 100644 --- a/absl/crc/internal/crc_memcpy_x86_64.cc +++ b/absl/crc/internal/crc_memcpy_x86_arm_combined.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Simultaneous memcopy and CRC-32C for x86-64. Uses integer registers because -// XMM registers do not support the CRC instruction (yet). While copying, -// compute the running CRC of the data being copied. +// Simultaneous memcopy and CRC-32C for x86-64 and ARM 64. Uses integer +// registers because XMM registers do not support the CRC instruction (yet). +// While copying, compute the running CRC of the data being copied. // // It is assumed that any CPU running this code has SSE4.2 instructions // available (for CRC32C). This file will do nothing if that is not true. @@ -57,10 +57,12 @@ #include "absl/base/prefetch.h" #include "absl/crc/crc32c.h" #include "absl/crc/internal/cpu_detect.h" +#include "absl/crc/internal/crc32_x86_arm_combined_simd.h" #include "absl/crc/internal/crc_memcpy.h" #include "absl/strings/string_view.h" -#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE +#if defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) || \ + defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE) namespace absl { ABSL_NAMESPACE_BEGIN @@ -75,7 +77,7 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length, uint32_t crc_uint32 = static_cast<uint32_t>(crc); for (std::size_t i = 0; i < length; i++) { uint8_t data = *reinterpret_cast<const uint8_t*>(src); - crc_uint32 = _mm_crc32_u8(crc_uint32, data); + crc_uint32 = CRC32_u8(crc_uint32, data); *reinterpret_cast<uint8_t*>(dst) = data; ++src; ++dst; @@ -83,36 +85,35 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length, return crc32c_t{crc_uint32}; } -constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t); +constexpr size_t kIntLoadsPerVec = sizeof(V128) / sizeof(uint64_t); // Common function for copying the tails of multiple large regions. template <size_t vec_regions, size_t int_regions> inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, size_t region_size, size_t copy_rounds) { - std::array<__m128i, vec_regions> data; + std::array<V128, vec_regions> data; std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data; while (copy_rounds > 0) { for (size_t i = 0; i < vec_regions; i++) { size_t region = i; - auto* vsrc = - reinterpret_cast<const __m128i*>(*src + region_size * region); - auto* vdst = reinterpret_cast<__m128i*>(*dst + region_size * region); + auto* vsrc = reinterpret_cast<const V128u*>(*src + region_size * region); + auto* vdst = reinterpret_cast<V128*>(*dst + region_size * region); // Load the blocks, unaligned - data[i] = _mm_loadu_si128(vsrc); + data[i] = V128_LoadU(vsrc); // Store the blocks, aligned - _mm_store_si128(vdst, data[i]); + V128_Store(vdst, data[i]); // Compute the running CRC crcs[region] = crc32c_t{static_cast<uint32_t>( - _mm_crc32_u64(static_cast<uint32_t>(crcs[region]), - static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))}; + CRC32_u64(static_cast<uint32_t>(crcs[region]), + static_cast<uint64_t>(V128_Extract64<0>(data[i]))))}; crcs[region] = crc32c_t{static_cast<uint32_t>( - _mm_crc32_u64(static_cast<uint32_t>(crcs[region]), - static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))}; + CRC32_u64(static_cast<uint32_t>(crcs[region]), + static_cast<uint64_t>(V128_Extract64<1>(data[i]))))}; } for (size_t i = 0; i < int_regions; i++) { @@ -126,7 +127,7 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, size_t data_index = i * kIntLoadsPerVec + j; int_data[data_index] = *(usrc + j); - crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( + crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64( static_cast<uint32_t>(crcs[region]), int_data[data_index]))}; *(udst + j) = int_data[data_index]; @@ -134,8 +135,8 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src, } // Increment pointers - *src += sizeof(__m128i); - *dst += sizeof(__m128i); + *src += sizeof(V128); + *dst += sizeof(V128); --copy_rounds; } } @@ -161,7 +162,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( constexpr std::size_t kRegions = vec_regions + int_regions; static_assert(kRegions > 0, "Must specify at least one region."); constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff}; - constexpr std::size_t kBlockSize = sizeof(__m128i); + constexpr std::size_t kBlockSize = sizeof(V128); constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize; // Number of blocks per cacheline. @@ -237,7 +238,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( const std::size_t tail_size = length - (kRegions * region_size); // Holding registers for data in each region. - std::array<__m128i, vec_regions> vec_data; + std::array<V128, vec_regions> vec_data; std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data; // Main loop. @@ -245,7 +246,10 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( // Prefetch kPrefetchAhead bytes ahead of each pointer. for (size_t i = 0; i < kRegions; i++) { absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i); +#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE + // TODO(b/297082454): investigate dropping prefetch on x86. absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i); +#endif } // Load and store data, computing CRC on the way. @@ -258,21 +262,20 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( size_t region = (j + i) % kRegions; auto* vsrc = - reinterpret_cast<const __m128i*>(src_bytes + region_size * region); - auto* vdst = - reinterpret_cast<__m128i*>(dst_bytes + region_size * region); + reinterpret_cast<const V128u*>(src_bytes + region_size * region); + auto* vdst = reinterpret_cast<V128*>(dst_bytes + region_size * region); // Load and CRC data. - vec_data[j] = _mm_loadu_si128(vsrc + i); - crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( - static_cast<uint32_t>(crcs[region]), - static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))}; - crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( - static_cast<uint32_t>(crcs[region]), - static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))}; + vec_data[j] = V128_LoadU(vsrc + i); + crcs[region] = crc32c_t{static_cast<uint32_t>( + CRC32_u64(static_cast<uint32_t>(crcs[region]), + static_cast<uint64_t>(V128_Extract64<0>(vec_data[j]))))}; + crcs[region] = crc32c_t{static_cast<uint32_t>( + CRC32_u64(static_cast<uint32_t>(crcs[region]), + static_cast<uint64_t>(V128_Extract64<1>(vec_data[j]))))}; // Store the data. - _mm_store_si128(vdst + i, vec_data[j]); + V128_Store(vdst + i, vec_data[j]); } // Preload the partial CRCs for the CLMUL subregions. @@ -292,7 +295,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute( // Load and CRC the data. int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k); - crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64( + crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64( static_cast<uint32_t>(crcs[region]), int_data[data_index]))}; // Store the data. @@ -443,4 +446,5 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector, ABSL_NAMESPACE_END } // namespace absl -#endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE +#endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE || + // ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE |