diff options
author | A. Unique TensorFlower <gardener@tensorflow.org> | 2017-01-30 16:02:42 -0800 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2017-01-30 16:27:17 -0800 |
commit | 51e5197bb73653609f4100439664123b80126447 (patch) | |
tree | 4cf52017ac4cca9fe05510c827e199be7907a364 /tensorflow/core/lib/hash | |
parent | 0c43e2b358f9de6921998a3e30538f2f1d6dfea9 (diff) |
Use SSE4.2 crc32c instruction if available.
Also add a benchmark. The new code has ~6x the throughput for lengths >= 16; benchmark results follow.
Run on ... (12 X 3501 MHz CPUs); 2017-01-23T13:53:34.121700508-08:00
CPU: Intel Ivybridge with HyperThreading (6 cores) dL1:32KB dL2:256KB dL3:12MB
Benchmark Time(ns) CPU(ns) Iterations
-----------------------------------------------------------
Old:
BM_CRC/1_mean 4.53 4.53 1529955670 210.559MB/s
BM_CRC/8_mean 7.67 7.67 923461420 995.407MB/s
BM_CRC/64_mean 49.3 49.2 142465300 1.211GB/s
BM_CRC/512_mean 380 380 18472980 1.255GB/s
BM_CRC/4k_mean 3070 3067 2306140 1.244GB/s
BM_CRC/32k_mean 24398 24373 283310 1.252GB/s
BM_CRC/256k_mean 197142 196936 36060 1.240GB/s
New:
BM_CRC/1_mean 4.64 4.63 1441767130 205.946MB/s
BM_CRC/8_mean 8.36 8.35 837511140 913.375MB/s
BM_CRC/64_mean 7.01 7.00 972688290 8.512GB/s
BM_CRC/512_mean 51.8 51.7 100000000 9.223GB/s
BM_CRC/4k_mean 416 416 16838270 9.179GB/s
BM_CRC/32k_mean 3275 3272 2093370 9.327GB/s
BM_CRC/256k_mean 26205 26179 267630 9.326GB/s
name old time/op new time/op delta
BM_CRC/1 4.53ns +- 1% 4.63ns +- 3% +2.30% (p=0.000 n=10+10)
BM_CRC/8 7.64ns +- 1% 8.35ns +- 1% +9.32% (p=0.000 n=9+10)
BM_CRC/64 49.2ns +- 0% 7.0ns +- 0% -85.77% (p=0.000 n=6+10)
BM_CRC/512 380ns +- 0% 52ns +- 1% -86.39% (p=0.000 n=9+10)
BM_CRC/4k 3.07us +- 1% 0.42us +- 1% -86.45% (p=0.000 n=10+10)
BM_CRC/32k 24.4us +- 2% 3.3us +- 1% -86.58% (p=0.000 n=10+10)
BM_CRC/256k 197us +- 3% 26us +- 0% -86.71% (p=0.000 n=10+10)
Change: 146051723
Diffstat (limited to 'tensorflow/core/lib/hash')
-rw-r--r-- | tensorflow/core/lib/hash/crc32c.cc | 8 | ||||
-rw-r--r-- | tensorflow/core/lib/hash/crc32c_accelerate.cc | 93 | ||||
-rw-r--r-- | tensorflow/core/lib/hash/crc32c_test.cc | 19 |
3 files changed, 120 insertions, 0 deletions
diff --git a/tensorflow/core/lib/hash/crc32c.cc b/tensorflow/core/lib/hash/crc32c.cc
index 322dcffea0..bd3b41e748 100644
--- a/tensorflow/core/lib/hash/crc32c.cc
+++ b/tensorflow/core/lib/hash/crc32c.cc
@@ -24,6 +24,9 @@ limitations under the License.
 namespace tensorflow {
 namespace crc32c {
 
+extern bool CanAccelerate();
+extern uint32_t AcceleratedExtend(uint32_t crc, const char *buf, size_t size);
+
 static const uint32 table0_[256] = {
     0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c,
     0x26a1e7e8, 0xd4ca64eb, 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b,
@@ -207,6 +210,11 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) {
 }
 
 uint32 Extend(uint32 crc, const char *buf, size_t size) {
+  static bool can_accelerate = CanAccelerate();
+  if (can_accelerate) {
+    return AcceleratedExtend(crc, buf, size);
+  }
+
   const uint8 *p = reinterpret_cast<const uint8 *>(buf);
   const uint8 *e = p + size;
   uint32 l = crc ^ 0xffffffffu;
diff --git a/tensorflow/core/lib/hash/crc32c_accelerate.cc b/tensorflow/core/lib/hash/crc32c_accelerate.cc
new file mode 100644
index 0000000000..07fa2faedd
--- /dev/null
+++ b/tensorflow/core/lib/hash/crc32c_accelerate.cc
@@ -0,0 +1,93 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stddef.h>
+#include <stdint.h>
+
+// SSE4.2 accelerated CRC32c.
+
+// See if the SSE4.2 crc32c instruction is available.
+#undef USE_SSE_CRC32C
+#ifdef __SSE4_2__
+#if defined(__x86_64__) && defined(__GNUC__) && \
+    (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
+#define USE_SSE_CRC32C 1
+#elif defined(__x86_64__) && defined(__clang__)
+#if __has_builtin(__builtin_cpu_supports)
+#define USE_SSE_CRC32C 1
+#endif
+#endif
+#endif /* __SSE4_2__ */
+
+#ifdef USE_SSE_CRC32C
+#include <nmmintrin.h>
+#endif
+
+namespace tensorflow {
+namespace crc32c {
+
+#ifndef USE_SSE_CRC32C
+
+bool CanAccelerate() { return false; }
+uint32_t AcceleratedExtend(uint32_t crc, const char *buf, size_t size) {
+  // Should not be called.
+  return 0;
+}
+
+#else
+
+// SSE4.2 optimized crc32c computation.
+bool CanAccelerate() { return __builtin_cpu_supports("sse4.2"); }
+
+uint32_t AcceleratedExtend(uint32_t crc, const char *buf, size_t size) {
+  const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
+  const uint8_t *e = p + size;
+  uint32_t l = crc ^ 0xffffffffu;
+
+  // Advance p until aligned to 8-bytes..
+  // Point x at first 7-byte aligned byte in string. This might be
+  // just past the end of the string.
+  const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
+  const uint8_t *x = reinterpret_cast<const uint8_t *>(((pval + 7) >> 3) << 3);
+  if (x <= e) {
+    // Process bytes until finished or p is 8-byte aligned
+    while (p != x) {
+      l = _mm_crc32_u8(l, *p);
+      p++;
+    }
+  }
+
+  // Process bytes 16 at a time
+  uint64_t l64 = l;
+  while ((e - p) >= 16) {
+    l64 = _mm_crc32_u64(l64, *reinterpret_cast<const uint64_t *>(p));
+    l64 = _mm_crc32_u64(l64, *reinterpret_cast<const uint64_t *>(p + 8));
+    p += 16;
+  }
+
+  // Process remaining bytes one at a time.
+
+  l = l64;
+  while (p < e) {
+    l = _mm_crc32_u8(l, *p);
+    p++;
+  }
+
+  return l ^ 0xffffffffu;
+}
+
+#endif
+
+}  // namespace crc32c
+}  // namespace tensorflow
diff --git a/tensorflow/core/lib/hash/crc32c_test.cc b/tensorflow/core/lib/hash/crc32c_test.cc
index ee8ef98872..5213e4c532 100644
--- a/tensorflow/core/lib/hash/crc32c_test.cc
+++ b/tensorflow/core/lib/hash/crc32c_test.cc
@@ -14,7 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/lib/hash/crc32c.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
 namespace crc32c {
@@ -46,6 +48,12 @@ TEST(CRC, StandardResults) {
       0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   };
   ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data)));
+
+  // Try unaligned sizes and offsets.
+  // Accelerated and unaccelerated code both produce these results.
+  ASSERT_EQ(0xdd1b19be, Value(reinterpret_cast<char*>(data), sizeof(data) - 7));
+  ASSERT_EQ(0x4930c4b1,
+            Value(reinterpret_cast<char*>(data) + 1, sizeof(data) - 4));
 }
 
 TEST(CRC, Values) { ASSERT_NE(Value("a", 1), Value("foo", 3)); }
@@ -62,5 +70,16 @@ TEST(CRC, Mask) {
   ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc)))));
 }
 
+static void BM_CRC(int iters, int len) {
+  std::string input(len, 'x');
+  uint32 h = 0;
+  for (int i = 0; i < iters; i++) {
+    h = Extend(h, input.data() + 1, len - 1);
+  }
+  testing::BytesProcessed(static_cast<int64>(iters) * len);
+  VLOG(1) << h;
+}
+BENCHMARK(BM_CRC)->Range(1, 256 * 1024);
+
 }  // namespace crc32c
 }  // namespace tensorflow