aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/opts
diff options
context:
space:
mode:
authorGravatar mtklein <mtklein@chromium.org>2016-08-08 09:06:27 -0700
committerGravatar Commit bot <commit-bot@chromium.org>2016-08-08 09:06:28 -0700
commit4e97607d9a1cef66fac16f347c5ca813ec4f9515 (patch)
treee523cac97fae89a2abb1ec0d4c1fc87f86e3169f /src/opts
parent9a5a201472c3fb9f02b954ba09ae1ec13ebd50f4 (diff)
Use sse4.2 CRC32 instructions to hash when available.
About 9x faster than Murmur3 for long inputs. Most of this is a mechanical change from SkChecksum::Murmur3(...) to SkOpts::hash(...). BUG=skia: GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2208903002 CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot;master.client.skia.compile:Build-Ubuntu-GCC-x86_64-Release-CMake-Trybot,Build-Mac-Clang-x86_64-Release-CMake-Trybot Review-Url: https://codereview.chromium.org/2208903002
Diffstat (limited to 'src/opts')
-rw-r--r--src/opts/SkChecksum_opts.h130
-rw-r--r--src/opts/SkOpts_sse42.cpp18
2 files changed, 148 insertions, 0 deletions
diff --git a/src/opts/SkChecksum_opts.h b/src/opts/SkChecksum_opts.h
new file mode 100644
index 0000000000..346b16b3f5
--- /dev/null
+++ b/src/opts/SkChecksum_opts.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkChecksum_opts_DEFINED
+#define SkChecksum_opts_DEFINED
+
+#include "SkChecksum.h"
+#include "SkTypes.h"
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
+ #include <immintrin.h>
+#endif
+
+// TODO: ARMv8 has optional CRC instructions similar to SSE 4.2
+// TODO: 32-bit x86 version: same sort of idea, using only _mm_crc32_u32() and the narrower _mm_crc32_u16()/_mm_crc32_u8().
+
+namespace SK_OPTS_NS {
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
+ // Reads a T out of the bytes starting at src.  The copy goes through memcpy()
+ // rather than a pointer cast, so src does not need to be aligned for T.
+ template <typename T>
+ static inline T unaligned_load(const uint8_t* src) {
+     T loaded;
+     memcpy(&loaded, src, sizeof(T));
+     return loaded;
+ }
+
+ // Hashes `bytes` bytes starting at `vdata` using the SSE4.2 _mm_crc32_*() intrinsics.
+ //   vdata: start of the input; every read goes through unaligned_load().
+ //   bytes: number of bytes to hash.
+ //   seed:  initial value of the running CRC.
+ // Returns the 32-bit hash.
+ static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) {
+     const uint8_t* cursor = static_cast<const uint8_t*>(vdata);
+
+     // _mm_crc32_u64() works on 64-bit registers, so carry the CRC as a uint64_t for now.
+     uint64_t crc = seed;
+
+     // Bulk phase: three independent CRC chains, each eating 8 bytes per iteration.
+     // Keeping the chains independent is the point: a single core can execute
+     // three of these instructions in parallel.
+     const size_t triples = bytes / 24;
+     if (triples > 0) {
+         uint64_t lane0 = crc,
+                  lane1 = crc,
+                  lane2 = crc;
+         for (size_t i = 0; i < triples; i++) {
+             lane0 = _mm_crc32_u64(lane0, unaligned_load<uint64_t>(cursor +  0));
+             lane1 = _mm_crc32_u64(lane1, unaligned_load<uint64_t>(cursor +  8));
+             lane2 = _mm_crc32_u64(lane2, unaligned_load<uint64_t>(cursor + 16));
+             cursor += 24;
+         }
+         bytes -= 24 * triples;
+         crc = lane0 ^ lane1 ^ lane2;
+     }
+     SkASSERT(bytes < 24);
+
+     // 16-23 bytes left: eat one 8-byte word here.  That leaves 8-15 bytes, so the
+     // `bytes & 8` test just below consumes the second word.
+     if (bytes >= 16) {
+         crc = _mm_crc32_u64(crc, unaligned_load<uint64_t>(cursor));
+         cursor += 8;
+         bytes  -= 8;
+     }
+     SkASSERT(bytes < 16);
+
+     // From here on, each set bit of `bytes` selects exactly one load of that width.
+     if (bytes & 8) {
+         crc = _mm_crc32_u64(crc, unaligned_load<uint64_t>(cursor));
+         cursor += 8;
+     }
+
+     // The narrower _mm_crc32_u*() forms take a 32-bit register.  Truncating loses
+     // nothing: only the bottom 32 bits were ever populated.
+     uint32_t crc32 = static_cast<uint32_t>(crc);
+
+     if (bytes & 4) {
+         crc32 = _mm_crc32_u32(crc32, unaligned_load<uint32_t>(cursor));
+         cursor += 4;
+     }
+     if (bytes & 2) {
+         crc32 = _mm_crc32_u16(crc32, unaligned_load<uint16_t>(cursor));
+         cursor += 2;
+     }
+     if (bytes & 1) {
+         crc32 = _mm_crc32_u8(crc32, unaligned_load<uint8_t>(cursor));
+     }
+     return crc32;
+ }
+
+#else
+ // Portable fallback hash: this is Murmur3.
+ //   data:  start of the input; no particular alignment is assumed.
+ //   bytes: number of bytes to hash.
+ //   seed:  initial hash state.
+ // Returns the 32-bit hash after a final SkChecksum::Mix() (presumably Murmur3's
+ // finalizer; it is defined in SkChecksum.h, not here).
+ static uint32_t hash_fn(const void* data, size_t bytes, uint32_t seed) {
+     // `data` arrives as a const void* with no alignment guarantee, so each word is
+     // copied out with memcpy() instead of being read through a uint32_t*.  A
+     // may_alias typedef only relaxes strict aliasing; a misaligned uint32_t read
+     // would still be undefined behavior.  memcpy() keeps the native byte order, so
+     // hash values are unchanged.
+     const uint8_t* safe_data = (const uint8_t*)data;
+
+     // Handle 4 bytes at a time while possible.
+     const size_t words = bytes/4;
+     uint32_t hash = seed;
+     for (size_t i = 0; i < words; i++) {
+         uint32_t k;
+         memcpy(&k, safe_data + 4*i, sizeof(k));
+         k *= 0xcc9e2d51;
+         k = (k << 15) | (k >> 17);
+         k *= 0x1b873593;
+
+         hash ^= k;
+         hash = (hash << 13) | (hash >> 19);
+         hash *= 5;
+         hash += 0xe6546b64;
+     }
+
+     // Handle last 0-3 bytes.  The cases fall through on purpose so that a 3-byte
+     // tail also folds in bytes 1 and 0 before the shared mixing step.
+     const uint8_t* safe_tail = safe_data + 4*words;
+     uint32_t k = 0;
+     switch (bytes & 3) {
+         case 3: k ^= safe_tail[2] << 16;  // fallthrough
+         case 2: k ^= safe_tail[1] <<  8;  // fallthrough
+         case 1: k ^= safe_tail[0] <<  0;
+                 k *= 0xcc9e2d51;
+                 k = (k << 15) | (k >> 17);
+                 k *= 0x1b873593;
+                 hash ^= k;
+     }
+
+     // Only the low 32 bits of the length take part, exactly as before; the cast
+     // just makes the narrowing explicit.
+     hash ^= (uint32_t)bytes;
+     return SkChecksum::Mix(hash);
+ }
+#endif
+
+} // namespace SK_OPTS_NS
+
+#endif//SkChecksum_opts_DEFINED
diff --git a/src/opts/SkOpts_sse42.cpp b/src/opts/SkOpts_sse42.cpp
new file mode 100644
index 0000000000..1883182192
--- /dev/null
+++ b/src/opts/SkOpts_sse42.cpp
@@ -0,0 +1,18 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkOpts.h"
+
+#define SK_OPTS_NS sse42
+#include "SkChecksum_opts.h"
+
+namespace SkOpts {
+    // Points SkOpts::hash_fn at the implementation compiled in this file under
+    // the sse42 namespace (SK_OPTS_NS is defined to sse42 above).
+    // NOTE(review): presumably invoked only after runtime CPU detection reports
+    // SSE4.2 support -- confirm against the caller in SkOpts.cpp.
+    void Init_sse42() {
+        hash_fn = &sse42::hash_fn;
+    }
+}
+