diff options
author | mtklein <mtklein@chromium.org> | 2016-08-22 08:53:45 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2016-08-22 08:53:45 -0700 |
commit | 78559a78f9d3e6444f8c0c9443696699703d6531 (patch) | |
tree | 22ddd055715ef77a469f66df999320cc7a5edcbb /src/opts/SkChecksum_opts.h | |
parent | dd3259eb95c3b47e11eefa3b176365a112a32b48 (diff) |
Use ARMv8 CRC32 instructions for SkOpts::hash().
For large inputs, this runs ~11x faster than Murmur3.
My bench drops from 1µs to 88ns.
Like x86-64, this runs fastest if we work in 24 byte chunks. 16 byte chunks
run at about 0.75x this speed, 8 byte chunks at about 0.4x (which would still
be about 5x faster than Murmur3).
This requires plumbing support for opts_crc32 into Chrome before it can roll.
perf.skia.org charts we want to watch: https://perf.skia.org/#5490
Search for compute_hash in these logs to see the difference:
baseline: https://luci-milo.appspot.com/swarming/task/30ba22f3dfe30e10/steps/nanobench/0/stdout
trybot: https://luci-milo.appspot.com/swarming/task/30bbc406cbf62d10/steps/nanobench/0/stdout
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2260823002
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review-Url: https://codereview.chromium.org/2260823002
Diffstat (limited to 'src/opts/SkChecksum_opts.h')
-rw-r--r-- | src/opts/SkChecksum_opts.h | 48 |
1 files changed, 46 insertions, 2 deletions
diff --git a/src/opts/SkChecksum_opts.h b/src/opts/SkChecksum_opts.h index 07fdfaab65..4bcd9b1c35 100644 --- a/src/opts/SkChecksum_opts.h +++ b/src/opts/SkChecksum_opts.h @@ -13,10 +13,10 @@ #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 #include <immintrin.h> +#elif defined(SK_CPU_ARM64) && defined(__ARM_FEATURE_CRC32) + #include <arm_acle.h> #endif -// TODO: ARMv8 has optional CRC instructions similar to SSE 4.2 - namespace SK_OPTS_NS { template <typename T> @@ -127,6 +127,50 @@ static inline T unaligned_load(const uint8_t* src) { return hash; } +#elif defined(SK_CPU_ARM64) && defined(__ARM_FEATURE_CRC32) + static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) { + auto data = (const uint8_t*)vdata; + if (bytes >= 24) { + uint32_t a = hash, + b = hash, + c = hash; + size_t steps = bytes/24; + while (steps --> 0) { + a = __crc32d(a, unaligned_load<uint64_t>(data+ 0)); + b = __crc32d(b, unaligned_load<uint64_t>(data+ 8)); + c = __crc32d(c, unaligned_load<uint64_t>(data+16)); + data += 24; + } + bytes %= 24; + hash = a^b^c; + } + + SkASSERT(bytes < 24); + if (bytes >= 16) { + hash = __crc32d(hash, unaligned_load<uint64_t>(data)); + bytes -= 8; + data += 8; + } + + SkASSERT(bytes < 16); + if (bytes & 8) { + hash = __crc32d(hash, unaligned_load<uint64_t>(data)); + data += 8; + } + if (bytes & 4) { + hash = __crc32w(hash, unaligned_load<uint32_t>(data)); + data += 4; + } + if (bytes & 2) { + hash = __crc32h(hash, unaligned_load<uint16_t>(data)); + data += 2; + } + if (bytes & 1) { + hash = __crc32b(hash, unaligned_load<uint8_t>(data)); + } + return hash; + } + #else // This is Murmur3. static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) { |