author    mtklein <mtklein@chromium.org>    2016-08-16 09:29:57 -0700
committer Commit bot <commit-bot@chromium.org>    2016-08-16 09:29:57 -0700
commit    2f4114a246e4753ef5dab63a4b40caf0cf7950ac
tree      c087a71690e0be1caed9403dcb785f9949db6dbb /src/opts/SkChecksum_opts.h
parent    af68fa11ed61d8a14d9ca15996e2da6ca06795eb
32-bit fast hash, tidy up murmur3 a bit
Nothing too surprising in the new 32-bit x86 hash: it's about half the speed of the 64-bit variant, just as you'd expect. Using unaligned_load like the others makes the may_alias parts of murmur3 moot.

I've updated some of the terms in the murmur hash to read consistently with the others. The existing hash functions run at the same speed and produce the same hashes as before.

In case this is not obvious: all three hash functions produce different hashes from one another.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2251773002
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review-Url: https://codereview.chromium.org/2251773002
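For readers skimming the diff below: the heart of the new 32-bit path is running three independent CRC32 streams, so the CPU can execute the three _mm_crc32_u32 instructions in parallel. Here is a minimal standalone sketch of that idea; crc32_3lane and main are made-up names for this example and are not Skia's API (the real entry point is hash_fn in the diff). It assumes a compiler with SSE 4.2 enabled, e.g. -msse4.2.

#include <nmmintrin.h>   // SSE 4.2: _mm_crc32_u32
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

template <typename T>
static T unaligned_load(const uint8_t* src) {
    T val;
    memcpy(&val, src, sizeof(val));   // defined for any alignment, unlike a raw pointer cast
    return val;
}

// Hash a buffer whose size is a multiple of 12 (to keep the sketch short)
// using three independent CRC32 streams. a, b, and c have no data dependence
// on each other, so the three crc32 instructions can execute in parallel.
static uint32_t crc32_3lane(const uint8_t* data, size_t bytes, uint32_t seed) {
    uint32_t a = seed, b = seed, c = seed;
    for (size_t i = 0; i < bytes; i += 12) {
        a = _mm_crc32_u32(a, unaligned_load<uint32_t>(data + i + 0));
        b = _mm_crc32_u32(b, unaligned_load<uint32_t>(data + i + 4));
        c = _mm_crc32_u32(c, unaligned_load<uint32_t>(data + i + 8));
    }
    return a ^ b ^ c;   // fold the three lanes into one 32-bit hash
}

int main() {
    uint8_t buf[120];
    for (size_t i = 0; i < sizeof(buf); i++) { buf[i] = (uint8_t)i; }
    printf("%08x\n", crc32_3lane(buf, sizeof(buf), 0));
    return 0;
}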
Diffstat (limited to 'src/opts/SkChecksum_opts.h')
-rw-r--r--  src/opts/SkChecksum_opts.h | 90
1 file changed, 66 insertions(+), 24 deletions(-)
diff --git a/src/opts/SkChecksum_opts.h b/src/opts/SkChecksum_opts.h
index 346b16b3f5..07fdfaab65 100644
--- a/src/opts/SkChecksum_opts.h
+++ b/src/opts/SkChecksum_opts.h
@@ -16,18 +16,18 @@
#endif
// TODO: ARMv8 has optional CRC instructions similar to SSE 4.2
-// TODO: 32-bit x86 version: same sort of idea using only _mm_crc32_u32() and smaller
namespace SK_OPTS_NS {
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
- template <typename T>
- static inline T unaligned_load(const uint8_t* src) {
- T val;
- memcpy(&val, src, sizeof(val));
- return val;
- }
+template <typename T>
+static inline T unaligned_load(const uint8_t* src) {
+ T val;
+ memcpy(&val, src, sizeof(val));
+ return val;
+}
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
+ // This is not a CRC32. It's Just A Hash that uses those instructions because they're fast.
static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) {
auto data = (const uint8_t*)vdata;
@@ -82,21 +82,61 @@ namespace SK_OPTS_NS {
return hash32;
}
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
+ // 32-bit version of above, using _mm_crc32_u32() but not _mm_crc32_u64().
+ static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
+ auto data = (const uint8_t*)vdata;
+
+ if (bytes >= 12) {
+ // We'll create 3 independent hashes, each using _mm_crc32_u32()
+ // to hash 4 bytes per step. Both 3 and independent are important:
+ // we can execute 3 of these instructions in parallel on a single core.
+ uint32_t a = hash,
+ b = hash,
+ c = hash;
+ size_t steps = bytes/12;
+            while (steps --> 0) {  // i.e. while (steps-- > 0): count down to zero
+ a = _mm_crc32_u32(a, unaligned_load<uint32_t>(data+0));
+ b = _mm_crc32_u32(b, unaligned_load<uint32_t>(data+4));
+ c = _mm_crc32_u32(c, unaligned_load<uint32_t>(data+8));
+ data += 12;
+ }
+ bytes %= 12;
+ hash = a^b^c;
+ }
+
+ SkASSERT(bytes < 12);
+ if (bytes >= 8) {
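+        // bytes is in [8,12) here: consume 4 now; the 4/2/1 tail below finishes the rest.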
+ hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
+ bytes -= 4;
+ data += 4;
+ }
+
+ SkASSERT(bytes < 8);
+ if (bytes & 4) {
+ hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
+ data += 4;
+ }
+ if (bytes & 2) {
+ hash = _mm_crc32_u16(hash, unaligned_load<uint16_t>(data));
+ data += 2;
+ }
+ if (bytes & 1) {
+ hash = _mm_crc32_u8(hash, unaligned_load<uint8_t>(data));
+ }
+ return hash;
+ }
+
#else
- static uint32_t hash_fn(const void* data, size_t bytes, uint32_t seed) {
- // This is Murmur3.
+ // This is Murmur3.
+ static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
+ auto data = (const uint8_t*)vdata;
- // Use may_alias to remind the compiler we're intentionally violating strict aliasing,
- // and so not to apply strict-aliasing-based optimizations.
- typedef uint32_t SK_ATTRIBUTE(may_alias) aliased_uint32_t;
- typedef uint8_t SK_ATTRIBUTE(may_alias) aliased_uint8_t;
+ size_t original_bytes = bytes;
// Handle 4 bytes at a time while possible.
- const aliased_uint32_t* safe_data = (const aliased_uint32_t*)data;
- const size_t words = bytes/4;
- uint32_t hash = seed;
- for (size_t i = 0; i < words; i++) {
- uint32_t k = safe_data[i];
+ while (bytes >= 4) {
+ uint32_t k = unaligned_load<uint32_t>(data);
k *= 0xcc9e2d51;
k = (k << 15) | (k >> 17);
k *= 0x1b873593;
@@ -105,22 +145,24 @@ namespace SK_OPTS_NS {
hash = (hash << 13) | (hash >> 19);
hash *= 5;
hash += 0xe6546b64;
+
+ bytes -= 4;
+ data += 4;
}
// Handle last 0-3 bytes.
- const aliased_uint8_t* safe_tail = (const uint8_t*)(safe_data + words);
uint32_t k = 0;
switch (bytes & 3) {
- case 3: k ^= safe_tail[2] << 16;
- case 2: k ^= safe_tail[1] << 8;
- case 1: k ^= safe_tail[0] << 0;
+ case 3: k ^= data[2] << 16;
+ case 2: k ^= data[1] << 8;
+ case 1: k ^= data[0] << 0;
k *= 0xcc9e2d51;
k = (k << 15) | (k >> 17);
k *= 0x1b873593;
hash ^= k;
}
- hash ^= bytes;
+ hash ^= original_bytes;
return SkChecksum::Mix(hash);
}
#endif
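For reference, here is a self-contained sketch of the Murmur3 fallback path as it reads after this patch. fmix32 below is the standard Murmur3 finalizer, standing in for SkChecksum::Mix on the assumption that Mix behaves the same way (check SkChecksum.h for the real definition); murmur3_sketch and main are illustrative names, not Skia's API.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

template <typename T>
static T unaligned_load(const uint8_t* src) {
    T val;
    memcpy(&val, src, sizeof(val));
    return val;
}

// Murmur3's standard fmix32 finalizer, standing in for SkChecksum::Mix.
static uint32_t fmix32(uint32_t h) {
    h ^= h >> 16;  h *= 0x85ebca6b;
    h ^= h >> 13;  h *= 0xc2b2ae35;
    h ^= h >> 16;
    return h;
}

static uint32_t murmur3_sketch(const void* vdata, size_t bytes, uint32_t hash) {
    auto data = (const uint8_t*)vdata;
    size_t original_bytes = bytes;

    while (bytes >= 4) {                        // 4-byte body blocks
        uint32_t k = unaligned_load<uint32_t>(data);
        k *= 0xcc9e2d51;
        k  = (k << 15) | (k >> 17);
        k *= 0x1b873593;
        hash ^= k;
        hash  = (hash << 13) | (hash >> 19);
        hash  = hash*5 + 0xe6546b64;
        bytes -= 4;
        data  += 4;
    }

    uint32_t k = 0;                             // 0-3 byte tail
    switch (bytes & 3) {                        // fall-through is intentional
        case 3: k ^= data[2] << 16;  /* fallthrough */
        case 2: k ^= data[1] <<  8;  /* fallthrough */
        case 1: k ^= data[0] <<  0;
                k *= 0xcc9e2d51;
                k  = (k << 15) | (k >> 17);
                k *= 0x1b873593;
                hash ^= k;
    }

    hash ^= (uint32_t)original_bytes;           // mix in the total length
    return fmix32(hash);
}

int main() {
    const char msg[] = "hello, skia";
    printf("%08x\n", murmur3_sketch(msg, sizeof(msg) - 1, 0));
    return 0;
}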