summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Connal de Souza <connaldesouza@google.com> 2023-09-26 14:13:18 -0700
committer: Copybara-Service <copybara-worker@google.com> 2023-09-26 14:13:55 -0700
commit: ac364eb9d07ff4d8c7fb8d848b66398559220d04 (patch)
tree: 1e001cba5a64ac395b77fa561333487729683baa
parent: 2fa24cc42f9103f2af71b3df858e8b6814ff1fd4 (diff)
Optimize CRC32 for Ampere Siryn
Siryn's crc32 instruction seems to have latency 3 and throughput 1, which makes the optimal ratio of pmull and crc streams close to that of tested x86 machines. Up to +120% faster for large inputs.

PiperOrigin-RevId: 568645559
Change-Id: I86b85b1b2a5d4fb3680c516c4c9044238b20fe61
-rw-r--r-- absl/crc/internal/crc_x86_arm_combined.cc 3
1 files changed, 3 insertions, 0 deletions
diff --git a/absl/crc/internal/crc_x86_arm_combined.cc b/absl/crc/internal/crc_x86_arm_combined.cc
index 4847bc83..dfb3b325 100644
--- a/absl/crc/internal/crc_x86_arm_combined.cc
+++ b/absl/crc/internal/crc_x86_arm_combined.cc
@@ -636,6 +636,9 @@ CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() {
case CpuType::kArmNeoverseN1:
return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
1, 1, CutoffStrategy::Unroll64CRC>();
+ case CpuType::kAmpereSiryn:
+ return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
+ 3, 2, CutoffStrategy::Fold3>();
#if defined(__aarch64__)
default:
// Not all ARM processors support the needed instructions, so check here