Use ARMv8 CRC32 instructions for SkOpts::hash().

For large inputs, this runs ~11x faster than Murmur3. My bench drops from 1µs to 88ns. Like x86-64, this runs fastest if we work in 24 byte chunks. 16 byte chunks run at about 0.75x this speed, 8 byte chunks at about 0.4x (which would still be about 5x faster than Murmur3). This'll require plumbing support for opts_crc32 into Chrome first before it can roll. perf.skia.org charts we want to watch: https://perf.skia.org/#5490 Search for compute_hash in these logs to see the difference: baseline: https://luci-milo.appspot.com/swarming/task/30ba22f3dfe30e10/steps/nanobench/0/stdout trybot: https://luci-milo.appspot.com/swarming/task/30bbc406cbf62d10/steps/nanobench/0/stdout BUG=skia: GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2260823002 CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot Review-Url: https://codereview.chromium.org/2260823002
author: mtklein <mtklein@chromium.org> 2016-08-22 08:53:45 -0700
committer: Commit bot <commit-bot@chromium.org> 2016-08-22 08:53:45 -0700
commit: 78559a78f9d3e6444f8c0c9443696699703d6531 (patch)
tree: 22ddd055715ef77a469f66df999320cc7a5edcbb /src/opts/SkChecksum_opts.h
parent: dd3259eb95c3b47e11eefa3b176365a112a32b48 (diff)
1 files changed, 46 insertions, 2 deletions
diff --git a/src/opts/SkChecksum_opts.h b/src/opts/SkChecksum_opts.h
index 07fdfaab65..4bcd9b1c35 100644
--- a/src/opts/SkChecksum_opts.h
+++ b/src/opts/SkChecksum_opts.h
@@ -13,10 +13,10 @@
 
 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
     #include <immintrin.h>
+#elif defined(SK_CPU_ARM64) && defined(__ARM_FEATURE_CRC32)
+    #include <arm_acle.h>
 #endif
 
-// TODO: ARMv8 has optional CRC instructions similar to SSE 4.2
-
 namespace SK_OPTS_NS {
 
 template <typename T>
@@ -127,6 +127,50 @@ static inline T unaligned_load(const uint8_t* src) {
         return hash;
     }
 
+#elif defined(SK_CPU_ARM64) && defined(__ARM_FEATURE_CRC32)
+    static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {  // Hashes `bytes` bytes at vdata with the ARM CRC32 intrinsics; `hash` is the seed.
+        auto data = (const uint8_t*)vdata;  // Byte cursor; advanced as input is consumed.
+        if (bytes >= 24) {  // Bulk path: consume the input 24 bytes per iteration.
+            uint32_t a = hash,  // Three separate CRC accumulators, each seeded with `hash`.
+                     b = hash,
+                     c = hash;
+            size_t steps = bytes/24;  // Number of whole 24-byte chunks.
+            while (steps --> 0) {  // Parses as `steps-- > 0`: the body runs exactly `steps` times.
+                a = __crc32d(a, unaligned_load<uint64_t>(data+ 0));  // Each accumulator takes its own 8-byte word of the chunk.
+                b = __crc32d(b, unaligned_load<uint64_t>(data+ 8));
+                c = __crc32d(c, unaligned_load<uint64_t>(data+16));
+                data += 24;
+            }
+            bytes %= 24;  // Only the tail shorter than 24 bytes remains.
+            hash = a^b^c;  // Fold the three accumulators into one value with XOR.
+        }
+
+        SkASSERT(bytes < 24);
+        if (bytes >= 16) {  // 16..23 bytes left: consume one 8-byte word here.
+            hash = __crc32d(hash, unaligned_load<uint64_t>(data));
+            bytes -= 8;  // Leaves 8..15, so the `bytes & 8` test below consumes the next word.
+            data  += 8;
+        }
+
+        SkASSERT(bytes < 16);
+        if (bytes & 8) {  // From here each set bit of `bytes` selects one 8/4/2/1-byte step.
+            hash = __crc32d(hash, unaligned_load<uint64_t>(data));
+            data += 8;
+        }
+        if (bytes & 4) {
+            hash = __crc32w(hash, unaligned_load<uint32_t>(data));  // 4-byte step.
+            data += 4;
+        }
+        if (bytes & 2) {
+            hash = __crc32h(hash, unaligned_load<uint16_t>(data));  // 2-byte step.
+            data += 2;
+        }
+        if (bytes & 1) {
+            hash = __crc32b(hash, unaligned_load<uint8_t>(data));  // Final single byte; the cursor is not needed afterwards.
+        }
+        return hash;
+    }
+
 #else
     // This is Murmur3.
     static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
author	mtklein <mtklein@chromium.org>	2016-08-22 08:53:45 -0700
committer	Commit bot <commit-bot@chromium.org>	2016-08-22 08:53:45 -0700
commit	78559a78f9d3e6444f8c0c9443696699703d6531 (patch)
tree	22ddd055715ef77a469f66df999320cc7a5edcbb /src/opts/SkChecksum_opts.h
parent	dd3259eb95c3b47e11eefa3b176365a112a32b48 (diff)