summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Connal de Souza <connaldesouza@google.com> 2023-09-21 12:51:54 -0700
committer Copybara-Service <copybara-worker@google.com> 2023-09-21 12:52:45 -0700
commit aa3c949a7f0b4dc6fb4569117b2e3aa50e2cd27a (patch)
tree 4e2459680e6faf54b5f1b4da39e8526499910959
parent 821756c32ee197556905a94910e631721113dbb3 (diff)
Optimize CRC32 Extend for large inputs on Arm
This is a temporary workaround for an apparent compiler bug with pmull(2) instructions. The current hot loop looks like this:

    mov w14, #0xef02
    lsl x15, x15, #6
    mov x13, xzr
    movk w14, #0x740e, lsl #16
    sub x15, x15, #0x40
    ldr q4, [x16, #0x4e0]
  _LOOP_START:
    add x16, x9, x13
    add x17, x12, x13
    fmov d19, x14    <--------- This is loop invariant and expensive
    add x13, x13, #0x40
    cmp x15, x13
    prfm pldl1keep, [x16, #0x140]
    prfm pldl1keep, [x17, #0x140]
    ldp x18, x0, [x16, #0x40]
    crc32cx w10, w10, x18
    ldp x2, x18, [x16, #0x50]
    crc32cx w10, w10, x0
    crc32cx w10, w10, x2
    ldp x0, x2, [x16, #0x60]
    crc32cx w10, w10, x18
    ldp x18, x16, [x16, #0x70]
    pmull2 v5.1q, v1.2d, v4.2d
    pmull2 v6.1q, v0.2d, v4.2d
    pmull2 v7.1q, v2.2d, v4.2d
    pmull2 v16.1q, v3.2d, v4.2d
    ldp q17, q18, [x17, #0x40]
    crc32cx w10, w10, x0
    pmull v1.1q, v1.1d, v19.1d
    crc32cx w10, w10, x2
    pmull v0.1q, v0.1d, v19.1d
    crc32cx w10, w10, x18
    pmull v2.1q, v2.1d, v19.1d
    crc32cx w10, w10, x16
    pmull v3.1q, v3.1d, v19.1d
    ldp q20, q21, [x17, #0x60]
    eor v1.16b, v17.16b, v1.16b
    eor v0.16b, v18.16b, v0.16b
    eor v1.16b, v1.16b, v5.16b
    eor v2.16b, v20.16b, v2.16b
    eor v0.16b, v0.16b, v6.16b
    eor v3.16b, v21.16b, v3.16b
    eor v2.16b, v2.16b, v7.16b
    eor v3.16b, v3.16b, v16.16b
    b.ne _LOOP_START

There is a redundant fmov that moves the same constant into a Neon register every loop iteration to be used in the PMULL instructions. The PMULL2 instructions already have this constant loaded into Neon registers. After this change, both the PMULL and PMULL2 instructions use the values in q4, and they are not reloaded every iteration.

This fmov was expensive because it contends for execution units with crc32cx instructions. This is up to 20% faster for large inputs.

PiperOrigin-RevId: 567391972
Change-Id: I4c8e49750cfa5cc5730c3bb713bd9fd67657804a
-rw-r--r-- absl/crc/internal/crc32_x86_arm_combined_simd.h | 14
1 files changed, 9 insertions, 5 deletions
diff --git a/absl/crc/internal/crc32_x86_arm_combined_simd.h b/absl/crc/internal/crc32_x86_arm_combined_simd.h
index d2fc6211..d3eedd58 100644
--- a/absl/crc/internal/crc32_x86_arm_combined_simd.h
+++ b/absl/crc/internal/crc32_x86_arm_combined_simd.h
@@ -225,8 +225,8 @@ inline void V128_Store(V128* dst, V128 data) {
// Using inline assembly as clang does not generate the pmull2 instruction and
// performance drops by 15-20%.
-// TODO(b/193678732): Investigate why the compiler decides not to generate
-// such instructions and why it becomes so much worse.
+// TODO(b/193678732): Investigate why there is a slight performance hit when
+// using intrinsics instead of inline assembly.
inline V128 V128_PMulHi(const V128 l, const V128 r) {
uint64x2_t res;
__asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t"
@@ -235,10 +235,14 @@ inline V128 V128_PMulHi(const V128 l, const V128 r) {
return res;
}
+// TODO(b/193678732): Investigate why the compiler decides to move the constant
+// loop multiplicands from GPR to Neon registers every loop iteration.
inline V128 V128_PMulLow(const V128 l, const V128 r) {
- return reinterpret_cast<V128>(vmull_p64(
- reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(l))),
- reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(r)))));
+ uint64x2_t res;
+ __asm__ __volatile__("pmull %0.1q, %1.1d, %2.1d \n\t"
+ : "=w"(res)
+ : "w"(l), "w"(r));
+ return res;
}
inline V128 V128_PMul01(const V128 l, const V128 r) {