diff options
author | Ilya Tokar <tokarip@google.com> | 2022-12-13 13:57:36 -0800 |
---|---|---|
committer | Copybara-Service <copybara-worker@google.com> | 2022-12-13 13:58:32 -0800 |
commit | 4cb6c3893638348ef261716ac47ede4f5f88b8e9 (patch) | |
tree | 76901eef07a06e499c99e943fd5d35167154c639 /absl/crc | |
parent | 1887dece5e9ed5362b9e87ec05d343c750886e99 (diff) |
Add prefetch to crc32
We already prefetch for large inputs; do the same
for medium-sized inputs as well. This is performance-neutral
in most cases, so this change also adds a new
benchmark with working set size >> cache size to ensure that we
are seeing the performance benefits of prefetching. The main benefits
are on AMD with hardware prefetchers turned off:
AMD prefetchers on:
name old time/op new time/op delta
BM_Calculate/0 2.43ns ± 1% 2.43ns ± 1% ~ (p=0.814 n=40+40)
BM_Calculate/1 2.50ns ± 2% 2.50ns ± 2% ~ (p=0.745 n=39+39)
BM_Calculate/100 9.17ns ± 1% 9.17ns ± 2% ~ (p=0.747 n=40+40)
BM_Calculate/10000 474ns ± 1% 474ns ± 2% ~ (p=0.749 n=40+40)
BM_Calculate/500000 22.8µs ± 1% 22.9µs ± 2% ~ (p=0.298 n=39+40)
BM_Extend/0 1.38ns ± 1% 1.38ns ± 1% ~ (p=0.651 n=40+40)
BM_Extend/1 1.53ns ± 2% 1.53ns ± 1% ~ (p=0.957 n=40+39)
BM_Extend/100 9.48ns ± 1% 9.48ns ± 2% ~ (p=1.000 n=40+40)
BM_Extend/10000 474ns ± 2% 474ns ± 1% ~ (p=0.928 n=40+40)
BM_Extend/500000 22.8µs ± 1% 22.9µs ± 2% ~ (p=0.331 n=40+40)
BM_Extend/100000000 4.79ms ± 1% 4.79ms ± 1% ~ (p=0.753 n=38+38)
BM_ExtendCacheMiss/10 25.5ms ± 2% 25.5ms ± 2% ~ (p=0.988 n=38+40)
BM_ExtendCacheMiss/100 23.1ms ± 2% 23.1ms ± 2% ~ (p=0.792 n=40+40)
BM_ExtendCacheMiss/1000 37.2ms ± 1% 28.6ms ± 2% -23.00% (p=0.000 n=38+40)
BM_ExtendCacheMiss/100000 7.77ms ± 2% 7.74ms ± 2% -0.45% (p=0.006 n=40+40)
AMD prefetchers off:
name old time/op new time/op delta
BM_Calculate/0 2.43ns ± 2% 2.43ns ± 2% ~ (p=0.351 n=40+39)
BM_Calculate/1 2.51ns ± 2% 2.51ns ± 1% ~ (p=0.535 n=40+40)
BM_Calculate/100 9.18ns ± 2% 9.15ns ± 2% ~ (p=0.120 n=38+39)
BM_Calculate/10000 475ns ± 2% 475ns ± 2% ~ (p=0.852 n=40+40)
BM_Calculate/500000 22.9µs ± 2% 22.8µs ± 2% ~ (p=0.396 n=40+40)
BM_Extend/0 1.38ns ± 2% 1.38ns ± 2% ~ (p=0.466 n=40+40)
BM_Extend/1 1.53ns ± 2% 1.53ns ± 2% ~ (p=0.914 n=40+39)
BM_Extend/100 9.49ns ± 2% 9.49ns ± 2% ~ (p=0.802 n=40+40)
BM_Extend/10000 475ns ± 2% 474ns ± 1% ~ (p=0.589 n=40+40)
BM_Extend/500000 22.8µs ± 2% 22.8µs ± 2% ~ (p=0.872 n=39+40)
BM_Extend/100000000 10.0ms ± 3% 10.0ms ± 4% ~ (p=0.355 n=40+40)
BM_ExtendCacheMiss/10 196ms ± 2% 196ms ± 2% ~ (p=0.698 n=40+40)
BM_ExtendCacheMiss/100 129ms ± 1% 129ms ± 1% ~ (p=0.602 n=36+37)
BM_ExtendCacheMiss/1000 88.6ms ± 1% 57.2ms ± 1% -35.49% (p=0.000 n=36+38)
BM_ExtendCacheMiss/100000 14.9ms ± 1% 14.9ms ± 1% ~ (p=0.888 n=39+40)
Intel Skylake:
name old time/op new time/op delta
BM_Calculate/0 2.49ns ± 2% 2.44ns ± 4% -2.15% (p=0.001 n=31+34)
BM_Calculate/1 3.04ns ± 2% 2.98ns ± 9% -1.95% (p=0.003 n=31+35)
BM_Calculate/100 8.64ns ± 3% 8.53ns ± 5% ~ (p=0.065 n=31+35)
BM_Calculate/10000 290ns ± 3% 285ns ± 7% -1.80% (p=0.004 n=28+34)
BM_Calculate/500000 11.8µs ± 2% 11.6µs ± 8% -1.59% (p=0.003 n=26+34)
BM_Extend/0 1.56ns ± 1% 1.52ns ± 3% -2.44% (p=0.000 n=26+35)
BM_Extend/1 1.88ns ± 3% 1.83ns ± 6% -2.17% (p=0.001 n=27+35)
BM_Extend/100 9.31ns ± 3% 9.13ns ± 7% -1.92% (p=0.000 n=33+38)
BM_Extend/10000 290ns ± 3% 283ns ± 3% -2.45% (p=0.000 n=32+38)
BM_Extend/500000 11.8µs ± 2% 11.5µs ± 8% -1.80% (p=0.001 n=35+37)
BM_Extend/100000000 6.39ms ±10% 6.11ms ± 8% -4.34% (p=0.000 n=40+40)
BM_ExtendCacheMiss/10 36.2ms ± 7% 35.8ms ±14% ~ (p=0.281 n=33+37)
BM_ExtendCacheMiss/100 26.9ms ±15% 25.9ms ±12% -3.93% (p=0.000 n=40+40)
BM_ExtendCacheMiss/1000 23.8ms ± 5% 23.4ms ± 5% -1.68% (p=0.001 n=39+40)
BM_ExtendCacheMiss/100000 10.1ms ± 5% 10.0ms ± 4% ~ (p=0.051 n=39+39)
PiperOrigin-RevId: 495119444
Change-Id: I67bcf3b0282b5e1c43122de2837a24c16b8aded7
Diffstat (limited to 'absl/crc')
-rw-r--r-- | absl/crc/BUILD.bazel | 1 | ||||
-rw-r--r-- | absl/crc/crc32c_benchmark.cc | 23 | ||||
-rw-r--r-- | absl/crc/internal/crc_internal.h | 2 | ||||
-rw-r--r-- | absl/crc/internal/crc_x86_arm_combined.cc | 6 |
4 files changed, 31 insertions, 1 deletions
diff --git a/absl/crc/BUILD.bazel b/absl/crc/BUILD.bazel index 29374560..1c58f46c 100644 --- a/absl/crc/BUILD.bazel +++ b/absl/crc/BUILD.bazel @@ -204,6 +204,7 @@ cc_binary( deps = [ ":crc32c", "//absl/memory", + "//absl/strings", "@com_github_google_benchmark//:benchmark_main", ], ) diff --git a/absl/crc/crc32c_benchmark.cc b/absl/crc/crc32c_benchmark.cc index df99d5cf..3b46ef36 100644 --- a/absl/crc/crc32c_benchmark.cc +++ b/absl/crc/crc32c_benchmark.cc @@ -17,6 +17,7 @@ #include "absl/crc/crc32c.h" #include "absl/crc/internal/crc32c.h" #include "absl/memory/memory.h" +#include "absl/strings/string_view.h" #include "benchmark/benchmark.h" namespace { @@ -52,7 +53,27 @@ void BM_Extend(benchmark::State& state) { benchmark::DoNotOptimize(crc); } } -BENCHMARK(BM_Extend)->Arg(0)->Arg(1)->Arg(100)->Arg(10000)->Arg(500000); +BENCHMARK(BM_Extend)->Arg(0)->Arg(1)->Arg(100)->Arg(10000)->Arg(500000)->Arg( + 100 * 1000 * 1000); + +// Make working set >> CPU cache size to benchmark prefetches better +void BM_ExtendCacheMiss(benchmark::State& state) { + int len = state.range(0); + constexpr int total = 300 * 1000 * 1000; + std::string extension = TestString(total); + absl::crc32c_t base = absl::crc32c_t{0xC99465AA}; // CRC32C of "Hello World" + for (auto s : state) { + for (int i = 0; i < total; i += len * 2) { + benchmark::DoNotOptimize(base); + benchmark::DoNotOptimize(extension); + absl::crc32c_t crc = + absl::ExtendCrc32c(base, absl::string_view(&extension[i], len)); + benchmark::DoNotOptimize(crc); + } + } + state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * total / 2); +} +BENCHMARK(BM_ExtendCacheMiss)->Arg(10)->Arg(100)->Arg(1000)->Arg(100000); void BM_ExtendByZeroes(benchmark::State& state) { absl::crc32c_t base = absl::crc32c_t{0xC99465AA}; // CRC32C of "Hello World" diff --git a/absl/crc/internal/crc_internal.h b/absl/crc/internal/crc_internal.h index 7a503433..0611b383 100644 --- a/absl/crc/internal/crc_internal.h +++ 
b/absl/crc/internal/crc_internal.h @@ -29,6 +29,8 @@ namespace crc_internal { // Prefetch constants used in some Extend() implementations constexpr int kPrefetchHorizon = ABSL_CACHELINE_SIZE * 4; // Prefetch this far +// Shorter prefetch distance for smaller buffers +constexpr int kPrefetchHorizonMedium = ABSL_CACHELINE_SIZE * 1; static_assert(kPrefetchHorizon >= 64, "CRCPrefetchHorizon less than loop len"); // We require the Scramble() function: diff --git a/absl/crc/internal/crc_x86_arm_combined.cc b/absl/crc/internal/crc_x86_arm_combined.cc index f6e6aacb..d71191e3 100644 --- a/absl/crc/internal/crc_x86_arm_combined.cc +++ b/absl/crc/internal/crc_x86_arm_combined.cc @@ -429,6 +429,12 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2); ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2); ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2); + base_internal::PrefetchT0( + reinterpret_cast<const char*>(p + kPrefetchHorizonMedium)); + base_internal::PrefetchT0( + reinterpret_cast<const char*>(p1 + kPrefetchHorizonMedium)); + base_internal::PrefetchT0( + reinterpret_cast<const char*>(p2 + kPrefetchHorizonMedium)); } // Don't run crc on last 8 bytes. ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2); |