Add prefetch to crc32

We already prefetch in case of large inputs, do the same for medium sized inputs as well. This is mostly neutral for performance in most cases, so this also adds a new bench with working size >> cache size to ensure that we are seeing performance benefits of prefetch. Main benefits are on AMD with hardware prefetchers turned off: AMD prefetchers on: name old time/op new time/op delta BM_Calculate/0 2.43ns ± 1% 2.43ns ± 1% ~ (p=0.814 n=40+40) BM_Calculate/1 2.50ns ± 2% 2.50ns ± 2% ~ (p=0.745 n=39+39) BM_Calculate/100 9.17ns ± 1% 9.17ns ± 2% ~ (p=0.747 n=40+40) BM_Calculate/10000 474ns ± 1% 474ns ± 2% ~ (p=0.749 n=40+40) BM_Calculate/500000 22.8µs ± 1% 22.9µs ± 2% ~ (p=0.298 n=39+40) BM_Extend/0 1.38ns ± 1% 1.38ns ± 1% ~ (p=0.651 n=40+40) BM_Extend/1 1.53ns ± 2% 1.53ns ± 1% ~ (p=0.957 n=40+39) BM_Extend/100 9.48ns ± 1% 9.48ns ± 2% ~ (p=1.000 n=40+40) BM_Extend/10000 474ns ± 2% 474ns ± 1% ~ (p=0.928 n=40+40) BM_Extend/500000 22.8µs ± 1% 22.9µs ± 2% ~ (p=0.331 n=40+40) BM_Extend/100000000 4.79ms ± 1% 4.79ms ± 1% ~ (p=0.753 n=38+38) BM_ExtendCacheMiss/10 25.5ms ± 2% 25.5ms ± 2% ~ (p=0.988 n=38+40) BM_ExtendCacheMiss/100 23.1ms ± 2% 23.1ms ± 2% ~ (p=0.792 n=40+40) BM_ExtendCacheMiss/1000 37.2ms ± 1% 28.6ms ± 2% -23.00% (p=0.000 n=38+40) BM_ExtendCacheMiss/100000 7.77ms ± 2% 7.74ms ± 2% -0.45% (p=0.006 n=40+40) AMD prefetchers off: name old time/op new time/op delta BM_Calculate/0 2.43ns ± 2% 2.43ns ± 2% ~ (p=0.351 n=40+39) BM_Calculate/1 2.51ns ± 2% 2.51ns ± 1% ~ (p=0.535 n=40+40) BM_Calculate/100 9.18ns ± 2% 9.15ns ± 2% ~ (p=0.120 n=38+39) BM_Calculate/10000 475ns ± 2% 475ns ± 2% ~ (p=0.852 n=40+40) BM_Calculate/500000 22.9µs ± 2% 22.8µs ± 2% ~ (p=0.396 n=40+40) BM_Extend/0 1.38ns ± 2% 1.38ns ± 2% ~ (p=0.466 n=40+40) BM_Extend/1 1.53ns ± 2% 1.53ns ± 2% ~ (p=0.914 n=40+39) BM_Extend/100 9.49ns ± 2% 9.49ns ± 2% ~ (p=0.802 n=40+40) BM_Extend/10000 475ns ± 2% 474ns ± 1% ~ (p=0.589 n=40+40) BM_Extend/500000 22.8µs ± 2% 22.8µs ± 2% ~ (p=0.872 n=39+40) BM_Extend/100000000 10.0ms ± 3% 10.0ms ± 4% ~ (p=0.355 n=40+40) BM_ExtendCacheMiss/10 196ms ± 2% 196ms ± 2% ~ (p=0.698 n=40+40) BM_ExtendCacheMiss/100 129ms ± 1% 129ms ± 1% ~ (p=0.602 n=36+37) BM_ExtendCacheMiss/1000 88.6ms ± 1% 57.2ms ± 1% -35.49% (p=0.000 n=36+38) BM_ExtendCacheMiss/100000 14.9ms ± 1% 14.9ms ± 1% ~ (p=0.888 n=39+40) Intel skylake: BM_Calculate/0 2.49ns ± 2% 2.44ns ± 4% -2.15% (p=0.001 n=31+34) BM_Calculate/1 3.04ns ± 2% 2.98ns ± 9% -1.95% (p=0.003 n=31+35) BM_Calculate/100 8.64ns ± 3% 8.53ns ± 5% ~ (p=0.065 n=31+35) BM_Calculate/10000 290ns ± 3% 285ns ± 7% -1.80% (p=0.004 n=28+34) BM_Calculate/500000 11.8µs ± 2% 11.6µs ± 8% -1.59% (p=0.003 n=26+34) BM_Extend/0 1.56ns ± 1% 1.52ns ± 3% -2.44% (p=0.000 n=26+35) BM_Extend/1 1.88ns ± 3% 1.83ns ± 6% -2.17% (p=0.001 n=27+35) BM_Extend/100 9.31ns ± 3% 9.13ns ± 7% -1.92% (p=0.000 n=33+38) BM_Extend/10000 290ns ± 3% 283ns ± 3% -2.45% (p=0.000 n=32+38) BM_Extend/500000 11.8µs ± 2% 11.5µs ± 8% -1.80% (p=0.001 n=35+37) BM_Extend/100000000 6.39ms ±10% 6.11ms ± 8% -4.34% (p=0.000 n=40+40) BM_ExtendCacheMiss/10 36.2ms ± 7% 35.8ms ±14% ~ (p=0.281 n=33+37) BM_ExtendCacheMiss/100 26.9ms ±15% 25.9ms ±12% -3.93% (p=0.000 n=40+40) BM_ExtendCacheMiss/1000 23.8ms ± 5% 23.4ms ± 5% -1.68% (p=0.001 n=39+40) BM_ExtendCacheMiss/100000 10.1ms ± 5% 10.0ms ± 4% ~ (p=0.051 n=39+39) PiperOrigin-RevId: 495119444 Change-Id: I67bcf3b0282b5e1c43122de2837a24c16b8aded7
author: Ilya Tokar <tokarip@google.com> 2022-12-13 13:57:36 -0800
committer: Copybara-Service <copybara-worker@google.com> 2022-12-13 13:58:32 -0800
commit: 4cb6c3893638348ef261716ac47ede4f5f88b8e9 (patch)
tree: 76901eef07a06e499c99e943fd5d35167154c639
parent: 1887dece5e9ed5362b9e87ec05d343c750886e99 (diff)
4 files changed, 31 insertions, 1 deletions
diff --git a/absl/crc/BUILD.bazel b/absl/crc/BUILD.bazel
index 29374560..1c58f46c 100644
--- a/absl/crc/BUILD.bazel
+++ b/absl/crc/BUILD.bazel
@@ -204,6 +204,7 @@ cc_binary(
     deps = [
         ":crc32c",
         "//absl/memory",
+        "//absl/strings",
         "@com_github_google_benchmark//:benchmark_main",
     ],
 )
diff --git a/absl/crc/crc32c_benchmark.cc b/absl/crc/crc32c_benchmark.cc
index df99d5cf..3b46ef36 100644
--- a/absl/crc/crc32c_benchmark.cc
+++ b/absl/crc/crc32c_benchmark.cc
@@ -17,6 +17,7 @@
 #include "absl/crc/crc32c.h"
 #include "absl/crc/internal/crc32c.h"
 #include "absl/memory/memory.h"
+#include "absl/strings/string_view.h"
 #include "benchmark/benchmark.h"
 
 namespace {
@@ -52,7 +53,27 @@ void BM_Extend(benchmark::State& state) {
     benchmark::DoNotOptimize(crc);
   }
 }
-BENCHMARK(BM_Extend)->Arg(0)->Arg(1)->Arg(100)->Arg(10000)->Arg(500000);
+BENCHMARK(BM_Extend)->Arg(0)->Arg(1)->Arg(100)->Arg(10000)->Arg(500000)->Arg(
+    100 * 1000 * 1000);
+
+// Make working set >> CPU cache size to benchmark prefetches better
+void BM_ExtendCacheMiss(benchmark::State& state) {
+  int len = state.range(0);
+  constexpr int total = 300 * 1000 * 1000;
+  std::string extension = TestString(total);
+  absl::crc32c_t base = absl::crc32c_t{0xC99465AA};  // CRC32C of "Hello World"
+  for (auto s : state) {
+    for (int i = 0; i < total; i += len * 2) {
+      benchmark::DoNotOptimize(base);
+      benchmark::DoNotOptimize(extension);
+      absl::crc32c_t crc =
+          absl::ExtendCrc32c(base, absl::string_view(&extension[i], len));
+      benchmark::DoNotOptimize(crc);
+    }
+  }
+  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * total / 2);
+}
+BENCHMARK(BM_ExtendCacheMiss)->Arg(10)->Arg(100)->Arg(1000)->Arg(100000);
 
 void BM_ExtendByZeroes(benchmark::State& state) {
   absl::crc32c_t base = absl::crc32c_t{0xC99465AA};  // CRC32C of "Hello World"
diff --git a/absl/crc/internal/crc_internal.h b/absl/crc/internal/crc_internal.h
index 7a503433..0611b383 100644
--- a/absl/crc/internal/crc_internal.h
+++ b/absl/crc/internal/crc_internal.h
@@ -29,6 +29,8 @@ namespace crc_internal {
 
 // Prefetch constants used in some Extend() implementations
 constexpr int kPrefetchHorizon = ABSL_CACHELINE_SIZE * 4;  // Prefetch this far
+// Shorter prefetch distance for smaller buffers
+constexpr int kPrefetchHorizonMedium = ABSL_CACHELINE_SIZE * 1;
 static_assert(kPrefetchHorizon >= 64, "CRCPrefetchHorizon less than loop len");
 
 // We require the Scramble() function:
diff --git a/absl/crc/internal/crc_x86_arm_combined.cc b/absl/crc/internal/crc_x86_arm_combined.cc
index f6e6aacb..d71191e3 100644
--- a/absl/crc/internal/crc_x86_arm_combined.cc
+++ b/absl/crc/internal/crc_x86_arm_combined.cc
@@ -429,6 +429,12 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
           ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
           ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
           ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
+          base_internal::PrefetchT0(
+              reinterpret_cast<const char*>(p + kPrefetchHorizonMedium));
+          base_internal::PrefetchT0(
+              reinterpret_cast<const char*>(p1 + kPrefetchHorizonMedium));
+          base_internal::PrefetchT0(
+              reinterpret_cast<const char*>(p2 + kPrefetchHorizonMedium));
         }
         // Don't run crc on last 8 bytes.
         ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
author	Ilya Tokar <tokarip@google.com>	2022-12-13 13:57:36 -0800
committer	Copybara-Service <copybara-worker@google.com>	2022-12-13 13:58:32 -0800
commit	4cb6c3893638348ef261716ac47ede4f5f88b8e9 (patch)
tree	76901eef07a06e499c99e943fd5d35167154c639
parent	1887dece5e9ed5362b9e87ec05d343c750886e99 (diff)