aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/core/lib/hash
diff options
context:
space:
mode:
authorGravatar A. Unique TensorFlower <gardener@tensorflow.org>2017-01-30 16:02:42 -0800
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2017-01-30 16:27:17 -0800
commit51e5197bb73653609f4100439664123b80126447 (patch)
tree4cf52017ac4cca9fe05510c827e199be7907a364 /tensorflow/core/lib/hash
parent0c43e2b358f9de6921998a3e30538f2f1d6dfea9 (diff)
Use SSE4.2 crc32c instruction if available.
Also added a benchmark. New code is ~6x the throughput for lengths >= 16; Run on ... (12 X 3501 MHz CPUs); 2017-01-23T13:53:34.121700508-08:00 CPU: Intel Ivybridge with HyperThreading (6 cores) dL1:32KB dL2:256KB dL3:12MB Benchmark Time(ns) CPU(ns) Iterations ----------------------------------------------------------- Old: BM_CRC/1_mean 4.53 4.53 1529955670 210.559MB/s BM_CRC/8_mean 7.67 7.67 923461420 995.407MB/s BM_CRC/64_mean 49.3 49.2 142465300 1.211GB/s BM_CRC/512_mean 380 380 18472980 1.255GB/s BM_CRC/4k_mean 3070 3067 2306140 1.244GB/s BM_CRC/32k_mean 24398 24373 283310 1.252GB/s BM_CRC/256k_mean 197142 196936 36060 1.240GB/s New: BM_CRC/1_mean 4.64 4.63 1441767130 205.946MB/s BM_CRC/8_mean 8.36 8.35 837511140 913.375MB/s BM_CRC/64_mean 7.01 7.00 972688290 8.512GB/s BM_CRC/512_mean 51.8 51.7 100000000 9.223GB/s BM_CRC/4k_mean 416 416 16838270 9.179GB/s BM_CRC/32k_mean 3275 3272 2093370 9.327GB/s BM_CRC/256k_mean 26205 26179 267630 9.326GB/s name old time/op new time/op delta BM_CRC/1 4.53ns +- 1% 4.63ns +- 3% +2.30% (p=0.000 n=10+10) BM_CRC/8 7.64ns +- 1% 8.35ns +- 1% +9.32% (p=0.000 n=9+10) BM_CRC/64 49.2ns +- 0% 7.0ns +- 0% -85.77% (p=0.000 n=6+10) BM_CRC/512 380ns +- 0% 52ns +- 1% -86.39% (p=0.000 n=9+10) BM_CRC/4k 3.07us +- 1% 0.42us +- 1% -86.45% (p=0.000 n=10+10) BM_CRC/32k 24.4us +- 2% 3.3us +- 1% -86.58% (p=0.000 n=10+10) BM_CRC/256k 197us +- 3% 26us +- 0% -86.71% (p=0.000 n=10+10) Change: 146051723
Diffstat (limited to 'tensorflow/core/lib/hash')
-rw-r--r--tensorflow/core/lib/hash/crc32c.cc8
-rw-r--r--tensorflow/core/lib/hash/crc32c_accelerate.cc93
-rw-r--r--tensorflow/core/lib/hash/crc32c_test.cc19
3 files changed, 120 insertions, 0 deletions
diff --git a/tensorflow/core/lib/hash/crc32c.cc b/tensorflow/core/lib/hash/crc32c.cc
index 322dcffea0..bd3b41e748 100644
--- a/tensorflow/core/lib/hash/crc32c.cc
+++ b/tensorflow/core/lib/hash/crc32c.cc
@@ -24,6 +24,9 @@ limitations under the License.
namespace tensorflow {
namespace crc32c {
+extern bool CanAccelerate();  // Defined in crc32c_accelerate.cc; true when the accelerated path may be used.
+extern uint32_t AcceleratedExtend(uint32_t crc, const char *buf, size_t size);  // Defined in crc32c_accelerate.cc; Extend() calls it only when CanAccelerate() returned true.
+
static const uint32 table0_[256] = {
0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c,
0x26a1e7e8, 0xd4ca64eb, 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b,
@@ -207,6 +210,11 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) {
}
uint32 Extend(uint32 crc, const char *buf, size_t size) {
+ static bool can_accelerate = CanAccelerate();
+ if (can_accelerate) {
+ return AcceleratedExtend(crc, buf, size);
+ }
+
const uint8 *p = reinterpret_cast<const uint8 *>(buf);
const uint8 *e = p + size;
uint32 l = crc ^ 0xffffffffu;
diff --git a/tensorflow/core/lib/hash/crc32c_accelerate.cc b/tensorflow/core/lib/hash/crc32c_accelerate.cc
new file mode 100644
index 0000000000..07fa2faedd
--- /dev/null
+++ b/tensorflow/core/lib/hash/crc32c_accelerate.cc
@@ -0,0 +1,93 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stddef.h>
+#include <stdint.h>
+
+// SSE4.2 accelerated CRC32c.
+
+// Compile-time check: define USE_SSE_CRC32C only when the build targets SSE4.2 (__SSE4_2__) on x86-64 and the compiler is GCC >= 4.8 or a clang that reports __builtin_cpu_supports.
+#undef USE_SSE_CRC32C
+#ifdef __SSE4_2__
+#if defined(__x86_64__) && defined(__GNUC__) && \
+ (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
+#define USE_SSE_CRC32C 1  // x86-64 with __GNUC__ version >= 4.8
+#elif defined(__x86_64__) && defined(__clang__)
+#if __has_builtin(__builtin_cpu_supports)
+#define USE_SSE_CRC32C 1  // x86-64 clang where __builtin_cpu_supports exists
+#endif
+#endif
+#endif /* __SSE4_2__ */
+
+#ifdef USE_SSE_CRC32C
+#include <nmmintrin.h>  // SSE4.2 intrinsics header; the accelerated path uses _mm_crc32_u8 / _mm_crc32_u64
+#endif
+
+namespace tensorflow {
+namespace crc32c {
+
+#ifndef USE_SSE_CRC32C
+
+bool CanAccelerate() { return false; }  // Fallback build (USE_SSE_CRC32C undefined): no accelerated path is compiled in.
+uint32_t AcceleratedExtend(uint32_t crc, const char *buf, size_t size) {
+ // Should not be called: CanAccelerate() returns false in this configuration, and Extend() in crc32c.cc only dispatches here when it returned true. The 0 returned below is a placeholder, not a CRC.
+ return 0;
+}
+
+#else
+
+// SSE4.2 optimized crc32c computation. CanAccelerate() is the runtime gate: it returns the result of __builtin_cpu_supports("sse4.2") for the executing CPU.
+bool CanAccelerate() { return __builtin_cpu_supports("sse4.2"); }
+
+uint32_t AcceleratedExtend(uint32_t crc, const char *buf, size_t size) {  // Extends crc over buf[0, size) using the SSE4.2 crc32 intrinsics and returns the updated value.
+ const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
+ const uint8_t *e = p + size;  // one past the last input byte
+ uint32_t l = crc ^ 0xffffffffu;  // invert on entry; inverted again on return
+
+ // Advance p until it is aligned to 8 bytes.
+ // Point x at the first 8-byte aligned address at or after p. This might be
+ // past the end of the string, in which case the alignment step is skipped.
+ const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
+ const uint8_t *x = reinterpret_cast<const uint8_t *>(((pval + 7) >> 3) << 3);  // round pval up to a multiple of 8
+ if (x <= e) {
+ // Process bytes one at a time until p reaches x, i.e. p is 8-byte aligned.
+ while (p != x) {
+ l = _mm_crc32_u8(l, *p);
+ p++;
+ }
+ }
+
+ // Process bytes 16 at a time, as two 8-byte words per iteration. If the alignment step was skipped (x > e), fewer than 8 bytes remain, so this loop is not entered with an unaligned p.
+ uint64_t l64 = l;
+ while ((e - p) >= 16) {
+ l64 = _mm_crc32_u64(l64, *reinterpret_cast<const uint64_t *>(p));
+ l64 = _mm_crc32_u64(l64, *reinterpret_cast<const uint64_t *>(p + 8));
+ p += 16;
+ }
+
+ // Process remaining bytes (at most 15) one at a time.
+ l = l64;  // convert the 64-bit accumulator back to 32 bits
+ while (p < e) {
+ l = _mm_crc32_u8(l, *p);
+ p++;
+ }
+
+ return l ^ 0xffffffffu;
+}
+
+#endif
+
+} // namespace crc32c
+} // namespace tensorflow
diff --git a/tensorflow/core/lib/hash/crc32c_test.cc b/tensorflow/core/lib/hash/crc32c_test.cc
index ee8ef98872..5213e4c532 100644
--- a/tensorflow/core/lib/hash/crc32c_test.cc
+++ b/tensorflow/core/lib/hash/crc32c_test.cc
@@ -14,7 +14,9 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/core/lib/hash/crc32c.h"
+#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
namespace tensorflow {
namespace crc32c {
@@ -46,6 +48,12 @@ TEST(CRC, StandardResults) {
0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
};
ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data)));
+
+ // Try unaligned sizes and offsets.
+ // Accelerated and unaccelerated code both produce these results.
+ ASSERT_EQ(0xdd1b19be, Value(reinterpret_cast<char*>(data), sizeof(data) - 7));
+ ASSERT_EQ(0x4930c4b1,
+ Value(reinterpret_cast<char*>(data) + 1, sizeof(data) - 4));
}
TEST(CRC, Values) { ASSERT_NE(Value("a", 1), Value("foo", 3)); }
@@ -62,5 +70,16 @@ TEST(CRC, Mask) {
ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc)))));
}
+static void BM_CRC(int iters, int len) {  // Benchmark: calls Extend() iters times over a buffer of len 'x' characters, chaining the result.
+ std::string input(len, 'x');
+ uint32 h = 0;
+ for (int i = 0; i < iters; i++) {
+ h = Extend(h, input.data() + 1, len - 1);  // starts at offset 1 and hashes len - 1 bytes (0 bytes when len == 1)
+ }
+ testing::BytesProcessed(static_cast<int64>(iters) * len);  // NOTE(review): reports len bytes per iteration although len - 1 are hashed; confirm this is intended
+ VLOG(1) << h;  // consumes h; presumably keeps the loop from being optimized away (confirm)
+}
+BENCHMARK(BM_CRC)->Range(1, 256 * 1024);  // registers BM_CRC with a len range of 1 to 256 * 1024
+
} // namespace crc32c
} // namespace tensorflow