summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CMake/AbseilDll.cmake2
-rw-r--r--absl/crc/BUILD.bazel5
-rw-r--r--absl/crc/CMakeLists.txt5
-rw-r--r--absl/crc/internal/crc32_x86_arm_combined_simd.h31
-rw-r--r--absl/crc/internal/crc_memcpy.h9
-rw-r--r--absl/crc/internal/crc_memcpy_fallback.cc6
-rw-r--r--absl/crc/internal/crc_memcpy_x86_arm_combined.cc (renamed from absl/crc/internal/crc_memcpy_x86_64.cc)72
7 files changed, 79 insertions, 51 deletions
diff --git a/CMake/AbseilDll.cmake b/CMake/AbseilDll.cmake
index 25031be9..0b2d5c76 100644
--- a/CMake/AbseilDll.cmake
+++ b/CMake/AbseilDll.cmake
@@ -107,7 +107,7 @@ set(ABSL_INTERNAL_DLL_FILES
"crc/internal/crc_x86_arm_combined.cc"
"crc/internal/crc_memcpy_fallback.cc"
"crc/internal/crc_memcpy.h"
- "crc/internal/crc_memcpy_x86_64.cc"
+ "crc/internal/crc_memcpy_x86_arm_combined.cc"
"crc/internal/crc_non_temporal_memcpy.cc"
"crc/internal/crc_x86_arm_combined.cc"
"crc/internal/non_temporal_arm_intrinsics.h"
diff --git a/absl/crc/BUILD.bazel b/absl/crc/BUILD.bazel
index cdbaa9b2..5580420a 100644
--- a/absl/crc/BUILD.bazel
+++ b/absl/crc/BUILD.bazel
@@ -54,10 +54,8 @@ cc_library(
visibility = ["//visibility:private"],
deps = [
":cpu_detect",
- "//absl/base",
"//absl/base:config",
"//absl/base:core_headers",
- "//absl/base:dynamic_annotations",
"//absl/base:endian",
"//absl/base:prefetch",
"//absl/base:raw_logging_internal",
@@ -72,7 +70,7 @@ cc_library(
"crc32c.cc",
"internal/crc32c_inline.h",
"internal/crc_memcpy_fallback.cc",
- "internal/crc_memcpy_x86_64.cc",
+ "internal/crc_memcpy_x86_arm_combined.cc",
"internal/crc_non_temporal_memcpy.cc",
],
hdrs = [
@@ -89,7 +87,6 @@ cc_library(
":non_temporal_memcpy",
"//absl/base:config",
"//absl/base:core_headers",
- "//absl/base:dynamic_annotations",
"//absl/base:endian",
"//absl/base:prefetch",
"//absl/strings",
diff --git a/absl/crc/CMakeLists.txt b/absl/crc/CMakeLists.txt
index 21247160..ec7b4512 100644
--- a/absl/crc/CMakeLists.txt
+++ b/absl/crc/CMakeLists.txt
@@ -42,10 +42,8 @@ absl_cc_library(
${ABSL_DEFAULT_COPTS}
DEPS
absl::crc_cpu_detect
- absl::base
absl::config
absl::core_headers
- absl::dynamic_annotations
absl::endian
absl::prefetch
absl::raw_logging_internal
@@ -64,7 +62,7 @@ absl_cc_library(
"crc32c.cc"
"internal/crc32c_inline.h"
"internal/crc_memcpy_fallback.cc"
- "internal/crc_memcpy_x86_64.cc"
+ "internal/crc_memcpy_x86_arm_combined.cc"
"internal/crc_non_temporal_memcpy.cc"
COPTS
${ABSL_DEFAULT_COPTS}
@@ -74,7 +72,6 @@ absl_cc_library(
absl::non_temporal_memcpy
absl::config
absl::core_headers
- absl::dynamic_annotations
absl::endian
absl::prefetch
absl::str_format
diff --git a/absl/crc/internal/crc32_x86_arm_combined_simd.h b/absl/crc/internal/crc32_x86_arm_combined_simd.h
index 39e53dd0..514e8fd9 100644
--- a/absl/crc/internal/crc32_x86_arm_combined_simd.h
+++ b/absl/crc/internal/crc32_x86_arm_combined_simd.h
@@ -58,8 +58,10 @@ namespace crc_internal {
#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
using V128 = uint64x2_t;
+using V128u = uint64x2_t;
#else
using V128 = __m128i;
+using V128u = __m128i_u;
#endif
// Starting with the initial value in |crc|, accumulates a CRC32 value for
@@ -76,7 +78,10 @@ uint32_t CRC32_u64(uint32_t crc, uint64_t v);
V128 V128_Load(const V128* src);
// Load 128 bits of integer data. |src| does not need to be aligned.
-V128 V128_LoadU(const V128* src);
+V128 V128_LoadU(const V128u* src);
+
+// Store 128 bits of integer data. |src| must be 16-byte aligned.
+void V128_Store(V128* dst, V128 data);
// Polynomially multiplies the high 64 bits of |l| and |r|.
V128 V128_PMulHi(const V128 l, const V128 r);
@@ -109,6 +114,10 @@ V128 V128_ShiftRight(const V128 l);
template <int imm>
int V128_Extract32(const V128 l);
+// Extracts a 64-bit integer from |l|, selected with |imm|.
+template <int imm>
+uint64_t V128_Extract64(const V128 l);
+
// Extracts the low 64 bits from V128.
int64_t V128_Low64(const V128 l);
@@ -137,7 +146,9 @@ inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
-inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
+inline V128 V128_LoadU(const V128u* src) { return _mm_loadu_si128(src); }
+
+inline void V128_Store(V128* dst, V128 data) { _mm_store_si128(dst, data); }
inline V128 V128_PMulHi(const V128 l, const V128 r) {
return _mm_clmulepi64_si128(l, r, 0x11);
@@ -173,6 +184,11 @@ inline int V128_Extract32(const V128 l) {
return _mm_extract_epi32(l, imm);
}
+template <int imm>
+inline uint64_t V128_Extract64(const V128 l) {
+ return static_cast<uint64_t>(_mm_extract_epi64(l, imm));
+}
+
inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); }
inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
@@ -199,10 +215,14 @@ inline V128 V128_Load(const V128* src) {
return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
}
-inline V128 V128_LoadU(const V128* src) {
+inline V128 V128_LoadU(const V128u* src) {
return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
}
+inline void V128_Store(V128* dst, V128 data) {
+ vst1q_u64(reinterpret_cast<uint64_t*>(dst), data);
+}
+
// Using inline assembly as clang does not generate the pmull2 instruction and
// performance drops by 15-20%.
// TODO(b/193678732): Investigate why the compiler decides not to generate
@@ -252,6 +272,11 @@ inline int V128_Extract32(const V128 l) {
return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);
}
+template <int imm>
+inline uint64_t V128_Extract64(const V128 l) {
+ return vgetq_lane_s64(vreinterpretq_s64_u64(l), imm);
+}
+
inline int64_t V128_Low64(const V128 l) {
return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0);
}
diff --git a/absl/crc/internal/crc_memcpy.h b/absl/crc/internal/crc_memcpy.h
index 4909d433..a0fed65a 100644
--- a/absl/crc/internal/crc_memcpy.h
+++ b/absl/crc/internal/crc_memcpy.h
@@ -20,12 +20,15 @@
#include "absl/base/config.h"
#include "absl/crc/crc32c.h"
+#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
// Defined if the class AcceleratedCrcMemcpyEngine exists.
-#if defined(__x86_64__) && defined(__SSE4_2__)
-#define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
-#elif defined(_MSC_VER) && defined(__AVX__)
+// TODO(b/299127771): Consider relaxing the pclmul requirement once the other
+// intrinsics are conditionally compiled without it.
+#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
#define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
+#elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
+#define ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE 1
#endif
namespace absl {
diff --git a/absl/crc/internal/crc_memcpy_fallback.cc b/absl/crc/internal/crc_memcpy_fallback.cc
index 15b4b055..07795504 100644
--- a/absl/crc/internal/crc_memcpy_fallback.cc
+++ b/absl/crc/internal/crc_memcpy_fallback.cc
@@ -54,7 +54,8 @@ absl::crc32c_t FallbackCrcMemcpyEngine::Compute(void* __restrict dst,
}
// Compile the following only if we don't have
-#ifndef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#if !defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) && \
+ !defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
CrcMemcpy::ArchSpecificEngines engines;
@@ -68,7 +69,8 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int /*vector*/,
return std::make_unique<FallbackCrcMemcpyEngine>();
}
-#endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#endif // !ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE &&
+ // !ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
} // namespace crc_internal
ABSL_NAMESPACE_END
diff --git a/absl/crc/internal/crc_memcpy_x86_64.cc b/absl/crc/internal/crc_memcpy_x86_arm_combined.cc
index c4ccd472..a06485a6 100644
--- a/absl/crc/internal/crc_memcpy_x86_64.cc
+++ b/absl/crc/internal/crc_memcpy_x86_arm_combined.cc
@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// Simultaneous memcopy and CRC-32C for x86-64. Uses integer registers because
-// XMM registers do not support the CRC instruction (yet). While copying,
-// compute the running CRC of the data being copied.
+// Simultaneous memcopy and CRC-32C for x86-64 and ARM 64. Uses integer
+// registers because XMM registers do not support the CRC instruction (yet).
+// While copying, compute the running CRC of the data being copied.
//
// It is assumed that any CPU running this code has SSE4.2 instructions
// available (for CRC32C). This file will do nothing if that is not true.
@@ -57,10 +57,12 @@
#include "absl/base/prefetch.h"
#include "absl/crc/crc32c.h"
#include "absl/crc/internal/cpu_detect.h"
+#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
#include "absl/crc/internal/crc_memcpy.h"
#include "absl/strings/string_view.h"
-#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#if defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) || \
+ defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
namespace absl {
ABSL_NAMESPACE_BEGIN
@@ -75,7 +77,7 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
uint32_t crc_uint32 = static_cast<uint32_t>(crc);
for (std::size_t i = 0; i < length; i++) {
uint8_t data = *reinterpret_cast<const uint8_t*>(src);
- crc_uint32 = _mm_crc32_u8(crc_uint32, data);
+ crc_uint32 = CRC32_u8(crc_uint32, data);
*reinterpret_cast<uint8_t*>(dst) = data;
++src;
++dst;
@@ -83,36 +85,35 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
return crc32c_t{crc_uint32};
}
-constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);
+constexpr size_t kIntLoadsPerVec = sizeof(V128) / sizeof(uint64_t);
// Common function for copying the tails of multiple large regions.
template <size_t vec_regions, size_t int_regions>
inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
size_t region_size, size_t copy_rounds) {
- std::array<__m128i, vec_regions> data;
+ std::array<V128, vec_regions> data;
std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;
while (copy_rounds > 0) {
for (size_t i = 0; i < vec_regions; i++) {
size_t region = i;
- auto* vsrc =
- reinterpret_cast<const __m128i*>(*src + region_size * region);
- auto* vdst = reinterpret_cast<__m128i*>(*dst + region_size * region);
+ auto* vsrc = reinterpret_cast<const V128u*>(*src + region_size * region);
+ auto* vdst = reinterpret_cast<V128*>(*dst + region_size * region);
// Load the blocks, unaligned
- data[i] = _mm_loadu_si128(vsrc);
+ data[i] = V128_LoadU(vsrc);
// Store the blocks, aligned
- _mm_store_si128(vdst, data[i]);
+ V128_Store(vdst, data[i]);
// Compute the running CRC
crcs[region] = crc32c_t{static_cast<uint32_t>(
- _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
- static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))};
+ CRC32_u64(static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(V128_Extract64<0>(data[i]))))};
crcs[region] = crc32c_t{static_cast<uint32_t>(
- _mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
- static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))};
+ CRC32_u64(static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(V128_Extract64<1>(data[i]))))};
}
for (size_t i = 0; i < int_regions; i++) {
@@ -126,7 +127,7 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
size_t data_index = i * kIntLoadsPerVec + j;
int_data[data_index] = *(usrc + j);
- crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+ crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64(
static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
*(udst + j) = int_data[data_index];
@@ -134,8 +135,8 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
}
// Increment pointers
- *src += sizeof(__m128i);
- *dst += sizeof(__m128i);
+ *src += sizeof(V128);
+ *dst += sizeof(V128);
--copy_rounds;
}
}
@@ -161,7 +162,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
constexpr std::size_t kRegions = vec_regions + int_regions;
static_assert(kRegions > 0, "Must specify at least one region.");
constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
- constexpr std::size_t kBlockSize = sizeof(__m128i);
+ constexpr std::size_t kBlockSize = sizeof(V128);
constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;
// Number of blocks per cacheline.
@@ -237,7 +238,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
const std::size_t tail_size = length - (kRegions * region_size);
// Holding registers for data in each region.
- std::array<__m128i, vec_regions> vec_data;
+ std::array<V128, vec_regions> vec_data;
std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data;
// Main loop.
@@ -245,7 +246,10 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Prefetch kPrefetchAhead bytes ahead of each pointer.
for (size_t i = 0; i < kRegions; i++) {
absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i);
+#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+ // TODO(b/297082454): investigate dropping prefetch on x86.
absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i);
+#endif
}
// Load and store data, computing CRC on the way.
@@ -258,21 +262,20 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
size_t region = (j + i) % kRegions;
auto* vsrc =
- reinterpret_cast<const __m128i*>(src_bytes + region_size * region);
- auto* vdst =
- reinterpret_cast<__m128i*>(dst_bytes + region_size * region);
+ reinterpret_cast<const V128u*>(src_bytes + region_size * region);
+ auto* vdst = reinterpret_cast<V128*>(dst_bytes + region_size * region);
// Load and CRC data.
- vec_data[j] = _mm_loadu_si128(vsrc + i);
- crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
- static_cast<uint32_t>(crcs[region]),
- static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))};
- crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
- static_cast<uint32_t>(crcs[region]),
- static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))};
+ vec_data[j] = V128_LoadU(vsrc + i);
+ crcs[region] = crc32c_t{static_cast<uint32_t>(
+ CRC32_u64(static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(V128_Extract64<0>(vec_data[j]))))};
+ crcs[region] = crc32c_t{static_cast<uint32_t>(
+ CRC32_u64(static_cast<uint32_t>(crcs[region]),
+ static_cast<uint64_t>(V128_Extract64<1>(vec_data[j]))))};
// Store the data.
- _mm_store_si128(vdst + i, vec_data[j]);
+ V128_Store(vdst + i, vec_data[j]);
}
// Preload the partial CRCs for the CLMUL subregions.
@@ -292,7 +295,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
// Load and CRC the data.
int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
- crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
+ crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64(
static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
// Store the data.
@@ -443,4 +446,5 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
ABSL_NAMESPACE_END
} // namespace absl
-#endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
+#endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE ||
+ // ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE