Use SSE4.2 crc32c instruction if available.

Also added a benchmark. New code is ~6x the throughput for lengths >= 16; Run on ... (12 X 3501 MHz CPUs); 2017-01-23T13:53:34.121700508-08:00 CPU: Intel Ivybridge with HyperThreading (6 cores) dL1:32KB dL2:256KB dL3:12MB Benchmark Time(ns) CPU(ns) Iterations ----------------------------------------------------------- Old: BM_CRC/1_mean 4.53 4.53 1529955670 210.559MB/s BM_CRC/8_mean 7.67 7.67 923461420 995.407MB/s BM_CRC/64_mean 49.3 49.2 142465300 1.211GB/s BM_CRC/512_mean 380 380 18472980 1.255GB/s BM_CRC/4k_mean 3070 3067 2306140 1.244GB/s BM_CRC/32k_mean 24398 24373 283310 1.252GB/s BM_CRC/256k_mean 197142 196936 36060 1.240GB/s New: BM_CRC/1_mean 4.64 4.63 1441767130 205.946MB/s BM_CRC/8_mean 8.36 8.35 837511140 913.375MB/s BM_CRC/64_mean 7.01 7.00 972688290 8.512GB/s BM_CRC/512_mean 51.8 51.7 100000000 9.223GB/s BM_CRC/4k_mean 416 416 16838270 9.179GB/s BM_CRC/32k_mean 3275 3272 2093370 9.327GB/s BM_CRC/256k_mean 26205 26179 267630 9.326GB/s name old time/op new time/op delta BM_CRC/1 4.53ns +- 1% 4.63ns +- 3% +2.30% (p=0.000 n=10+10) BM_CRC/8 7.64ns +- 1% 8.35ns +- 1% +9.32% (p=0.000 n=9+10) BM_CRC/64 49.2ns +- 0% 7.0ns +- 0% -85.77% (p=0.000 n=6+10) BM_CRC/512 380ns +- 0% 52ns +- 1% -86.39% (p=0.000 n=9+10) BM_CRC/4k 3.07us +- 1% 0.42us +- 1% -86.45% (p=0.000 n=10+10) BM_CRC/32k 24.4us +- 2% 3.3us +- 1% -86.58% (p=0.000 n=10+10) BM_CRC/256k 197us +- 3% 26us +- 0% -86.71% (p=0.000 n=10+10) Change: 146051723
author: A. Unique TensorFlower <gardener@tensorflow.org> 2017-01-30 16:02:42 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-01-30 16:27:17 -0800
commit: 51e5197bb73653609f4100439664123b80126447 (patch)
tree: 4cf52017ac4cca9fe05510c827e199be7907a364 /tensorflow/core/lib/hash
parent: 0c43e2b358f9de6921998a3e30538f2f1d6dfea9 (diff)
3 files changed, 120 insertions, 0 deletions
diff --git a/tensorflow/core/lib/hash/crc32c.cc b/tensorflow/core/lib/hash/crc32c.cc
index 322dcffea0..bd3b41e748 100644
--- a/tensorflow/core/lib/hash/crc32c.cc
+++ b/tensorflow/core/lib/hash/crc32c.cc
@@ -24,6 +24,9 @@ limitations under the License.
 namespace tensorflow {
 namespace crc32c {
 
+extern bool CanAccelerate();
+extern uint32_t AcceleratedExtend(uint32_t crc, const char *buf, size_t size);
+
 static const uint32 table0_[256] = {
     0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c,
     0x26a1e7e8, 0xd4ca64eb, 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b,
@@ -207,6 +210,11 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) {
 }
 
 uint32 Extend(uint32 crc, const char *buf, size_t size) {
+  static bool can_accelerate = CanAccelerate();
+  if (can_accelerate) {
+    return AcceleratedExtend(crc, buf, size);
+  }
+
   const uint8 *p = reinterpret_cast<const uint8 *>(buf);
   const uint8 *e = p + size;
   uint32 l = crc ^ 0xffffffffu;
diff --git a/tensorflow/core/lib/hash/crc32c_accelerate.cc b/tensorflow/core/lib/hash/crc32c_accelerate.cc
new file mode 100644
index 0000000000..07fa2faedd
--- /dev/null
+++ b/tensorflow/core/lib/hash/crc32c_accelerate.cc
@@ -0,0 +1,93 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stddef.h>
+#include <stdint.h>
+
+// SSE4.2 accelerated CRC32c.
+
+// See if the SSE4.2 crc32c instruction is available.
+#undef USE_SSE_CRC32C
+#ifdef __SSE4_2__
+#if defined(__x86_64__) && defined(__GNUC__) && \
+    (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
+#define USE_SSE_CRC32C 1
+#elif defined(__x86_64__) && defined(__clang__)
+#if __has_builtin(__builtin_cpu_supports)
+#define USE_SSE_CRC32C 1
+#endif
+#endif
+#endif /* __SSE4_2__ */
+
+#ifdef USE_SSE_CRC32C
+#include <nmmintrin.h>
+#endif
+
+namespace tensorflow {
+namespace crc32c {
+
+#ifndef USE_SSE_CRC32C
+
+bool CanAccelerate() { return false; }
+uint32_t AcceleratedExtend(uint32_t crc, const char *buf, size_t size) {
+  // Should not be called.
+  return 0;
+}
+
+#else
+
+// SSE4.2 optimized crc32c computation.
+bool CanAccelerate() { return __builtin_cpu_supports("sse4.2"); }
+
+uint32_t AcceleratedExtend(uint32_t crc, const char *buf, size_t size) {
+  const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
+  const uint8_t *e = p + size;
+  uint32_t l = crc ^ 0xffffffffu;
+
+  // Advance p until aligned to 8-bytes..
+  // Point x at first 7-byte aligned byte in string.  This might be
+  // just past the end of the string.
+  const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
+  const uint8_t *x = reinterpret_cast<const uint8_t *>(((pval + 7) >> 3) << 3);
+  if (x <= e) {
+    // Process bytes until finished or p is 8-byte aligned
+    while (p != x) {
+      l = _mm_crc32_u8(l, *p);
+      p++;
+    }
+  }
+
+  // Process bytes 16 at a time
+  uint64_t l64 = l;
+  while ((e - p) >= 16) {
+    l64 = _mm_crc32_u64(l64, *reinterpret_cast<const uint64_t *>(p));
+    l64 = _mm_crc32_u64(l64, *reinterpret_cast<const uint64_t *>(p + 8));
+    p += 16;
+  }
+
+  // Process remaining bytes one at a time.
+  l = l64;
+  while (p < e) {
+    l = _mm_crc32_u8(l, *p);
+    p++;
+  }
+
+  return l ^ 0xffffffffu;
+}
+
+#endif
+
+}  // namespace crc32c
+}  // namespace tensorflow
diff --git a/tensorflow/core/lib/hash/crc32c_test.cc b/tensorflow/core/lib/hash/crc32c_test.cc
index ee8ef98872..5213e4c532 100644
--- a/tensorflow/core/lib/hash/crc32c_test.cc
+++ b/tensorflow/core/lib/hash/crc32c_test.cc
@@ -14,7 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/lib/hash/crc32c.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
 namespace crc32c {
@@ -46,6 +48,12 @@ TEST(CRC, StandardResults) {
       0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   };
   ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data)));
+
+  // Try unaligned sizes and offsets.
+  // Accelerated and unaccelerated code both produce these results.
+  ASSERT_EQ(0xdd1b19be, Value(reinterpret_cast<char*>(data), sizeof(data) - 7));
+  ASSERT_EQ(0x4930c4b1,
+            Value(reinterpret_cast<char*>(data) + 1, sizeof(data) - 4));
 }
 
 TEST(CRC, Values) { ASSERT_NE(Value("a", 1), Value("foo", 3)); }
@@ -62,5 +70,16 @@ TEST(CRC, Mask) {
   ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc)))));
 }
 
+static void BM_CRC(int iters, int len) {
+  std::string input(len, 'x');
+  uint32 h = 0;
+  for (int i = 0; i < iters; i++) {
+    h = Extend(h, input.data() + 1, len - 1);
+  }
+  testing::BytesProcessed(static_cast<int64>(iters) * len);
+  VLOG(1) << h;
+}
+BENCHMARK(BM_CRC)->Range(1, 256 * 1024);
+
 }  // namespace crc32c
 }  // namespace tensorflow
author	A. Unique TensorFlower <gardener@tensorflow.org>	2017-01-30 16:02:42 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	2017-01-30 16:27:17 -0800
commit	51e5197bb73653609f4100439664123b80126447 (patch)
tree	4cf52017ac4cca9fe05510c827e199be7907a364 /tensorflow/core/lib/hash
parent	0c43e2b358f9de6921998a3e30538f2f1d6dfea9 (diff)