diff options
author | Connal de Souza <connaldesouza@google.com> | 2024-04-04 13:08:45 -0700 |
---|---|---|
committer | Copybara-Service <copybara-worker@google.com> | 2024-04-04 13:09:48 -0700 |
commit | 61e47a454c81eb07147b0315485f476513cc1230 (patch) | |
tree | 3c52cb6c62108ae8cfafc7eeddf80fdb0b11bf26 /absl/crc/internal | |
parent | 1ec4a27e39944462a574abbfa040498ed2831cc8 (diff) |
Optimize crc32 V128_From2x64 on Arm
This removes redundant vector-vector moves and results in Extend being up to 3% faster.
PiperOrigin-RevId: 621948170
Change-Id: Id82816aa6e294d34140ff591103cb20feac79d9a
Diffstat (limited to 'absl/crc/internal')
-rw-r--r-- | absl/crc/internal/crc32_x86_arm_combined_simd.h | 17 | ||||
-rw-r--r-- | absl/crc/internal/crc_x86_arm_combined.cc | 10 |
2 files changed, 15 insertions, 12 deletions
diff --git a/absl/crc/internal/crc32_x86_arm_combined_simd.h b/absl/crc/internal/crc32_x86_arm_combined_simd.h index 1938a9b2..aa6a6595 100644 --- a/absl/crc/internal/crc32_x86_arm_combined_simd.h +++ b/absl/crc/internal/crc32_x86_arm_combined_simd.h @@ -102,10 +102,11 @@ V128 V128_Xor(const V128 l, const V128 r); // Produces an AND operation of |l| and |r|. V128 V128_And(const V128 l, const V128 r); -// Sets two 64 bit integers to one 128 bit vector. The order is reverse. +// Sets the lower half of a 128 bit register to the given 64-bit value and +// zeroes the upper half. // dst[63:0] := |r| -// dst[127:64] := |l| -V128 V128_From2x64(const uint64_t l, const uint64_t r); +// dst[127:64] := |0| +V128 V128_From64WithZeroFill(const uint64_t r); // Shift |l| right by |imm| bytes while shifting in zeros. template <int imm> @@ -171,8 +172,8 @@ inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); } inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); } -inline V128 V128_From2x64(const uint64_t l, const uint64_t r) { - return _mm_set_epi64x(static_cast<int64_t>(l), static_cast<int64_t>(r)); +inline V128 V128_From64WithZeroFill(const uint64_t r) { + return _mm_set_epi64x(static_cast<int64_t>(0), static_cast<int64_t>(r)); } template <int imm> @@ -262,10 +263,12 @@ inline V128 V128_Xor(const V128 l, const V128 r) { return veorq_u64(l, r); } inline V128 V128_And(const V128 l, const V128 r) { return vandq_u64(l, r); } -inline V128 V128_From2x64(const uint64_t l, const uint64_t r) { - return vcombine_u64(vcreate_u64(r), vcreate_u64(l)); +inline V128 V128_From64WithZeroFill(const uint64_t r){ + constexpr uint64x2_t kZero = {0, 0}; + return vsetq_lane_u64(r, kZero, 0); } + template <int imm> inline V128 V128_ShiftRight(const V128 l) { return vreinterpretq_u64_s8( diff --git a/absl/crc/internal/crc_x86_arm_combined.cc b/absl/crc/internal/crc_x86_arm_combined.cc index 51eff4ed..20dd3e01 100644 --- a/absl/crc/internal/crc_x86_arm_combined.cc +++ b/absl/crc/internal/crc_x86_arm_combined.cc @@ -101,9 +101,9 @@ constexpr size_t kMediumCutoff = 2048; namespace { uint32_t multiply(uint32_t a, uint32_t b) { - V128 shifts = V128_From2x64(0, 1); - V128 power = V128_From2x64(0, a); - V128 crc = V128_From2x64(0, b); + V128 shifts = V128_From64WithZeroFill(1); + V128 power = V128_From64WithZeroFill(a); + V128 crc = V128_From64WithZeroFill(b); V128 res = V128_PMulLow(power, crc); // Combine crc values @@ -444,11 +444,11 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams V128 magic = *(reinterpret_cast<const V128*>(kClmulConstants) + bs - 1); - V128 tmp = V128_From2x64(0, l64); + V128 tmp = V128_From64WithZeroFill(l64); V128 res1 = V128_PMulLow(tmp, magic); - tmp = V128_From2x64(0, l641); + tmp = V128_From64WithZeroFill(l641); V128 res2 = V128_PMul10(tmp, magic); V128 x = V128_Xor(res1, res2); |