summaryrefslogtreecommitdiff
path: root/absl/synchronization/mutex_benchmark.cc
diff options
context:
space:
mode:
Diffstat (limited to 'absl/synchronization/mutex_benchmark.cc')
-rw-r--r--absl/synchronization/mutex_benchmark.cc224
1 files changed, 155 insertions, 69 deletions
diff --git a/absl/synchronization/mutex_benchmark.cc b/absl/synchronization/mutex_benchmark.cc
index 933ea14f..e35aed8b 100644
--- a/absl/synchronization/mutex_benchmark.cc
+++ b/absl/synchronization/mutex_benchmark.cc
@@ -61,8 +61,124 @@ class RaiiLocker<std::mutex> {
std::mutex* mu_;
};
+// RAII object to change the Mutex priority of the running thread.
+class ScopedThreadMutexPriority {
+ public:
+ explicit ScopedThreadMutexPriority(int priority) {
+ absl::base_internal::ThreadIdentity* identity =
+ absl::synchronization_internal::GetOrCreateCurrentThreadIdentity();
+ identity->per_thread_synch.priority = priority;
+ // Bump next_priority_read_cycles to the infinite future so that the
+ // implementation doesn't re-read the thread's actual scheduler priority
+ // and replace our temporary scoped priority.
+ identity->per_thread_synch.next_priority_read_cycles =
+ std::numeric_limits<int64_t>::max();
+ }
+ ~ScopedThreadMutexPriority() {
+ // Reset the "next priority read time" back to the infinite past so that
+ // the next time the Mutex implementation wants to know this thread's
+ // priority, it re-reads it from the OS instead of using our overridden
+ // priority.
+ absl::synchronization_internal::GetOrCreateCurrentThreadIdentity()
+ ->per_thread_synch.next_priority_read_cycles =
+ std::numeric_limits<int64_t>::min();
+ }
+};
+
+void BM_MutexEnqueue(benchmark::State& state) {
+ // In the "multiple priorities" variant of the benchmark, one of the
+ // threads runs with Mutex priority 0 while the rest run at elevated priority.
+ // This benchmarks the performance impact of the presence of a low priority
+ // waiter when a higher priority waiter adds itself of the queue
+ // (b/175224064).
+ //
+ // NOTE: The actual scheduler priority is not modified in this benchmark:
+ // all of the threads get CPU slices with the same priority. Only the
+ // Mutex queueing behavior is modified.
+ const bool multiple_priorities = state.range(0);
+ ScopedThreadMutexPriority priority_setter(
+ (multiple_priorities && state.thread_index != 0) ? 1 : 0);
+
+ struct Shared {
+ absl::Mutex mu;
+ std::atomic<int> looping_threads{0};
+ std::atomic<int> blocked_threads{0};
+ std::atomic<bool> thread_has_mutex{false};
+ };
+ static Shared* shared = new Shared;
+
+ // Set up 'blocked_threads' to count how many threads are currently blocked
+ // in Abseil synchronization code.
+ //
+ // NOTE: Blocking done within the Google Benchmark library itself (e.g.
+ // the barrier which synchronizes threads entering and exiting the benchmark
+ // loop) does _not_ get registered in this counter. This is because Google
+ // Benchmark uses its own synchronization primitives based on std::mutex, not
+ // Abseil synchronization primitives. If at some point the benchmark library
+ // merges into Abseil, this code may break.
+ absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
+ &shared->blocked_threads);
+
+ // The benchmark framework may run several iterations in the same process,
+ // reusing the same static-initialized 'shared' object. Given the semantics
+ // of the members, here, we expect everything to be reset to zero by the
+ // end of any iteration. Assert that's the case, just to be sure.
+ ABSL_RAW_CHECK(
+ shared->looping_threads.load(std::memory_order_relaxed) == 0 &&
+ shared->blocked_threads.load(std::memory_order_relaxed) == 0 &&
+ !shared->thread_has_mutex.load(std::memory_order_relaxed),
+ "Shared state isn't zeroed at start of benchmark iteration");
+
+ static constexpr int kBatchSize = 1000;
+ while (state.KeepRunningBatch(kBatchSize)) {
+ shared->looping_threads.fetch_add(1);
+ for (int i = 0; i < kBatchSize; i++) {
+ {
+ absl::MutexLock l(&shared->mu);
+ shared->thread_has_mutex.store(true, std::memory_order_relaxed);
+ // Spin until all other threads are either out of the benchmark loop
+ // or blocked on the mutex. This ensures that the mutex queue is kept
+ // at its maximal length to benchmark the performance of queueing on
+ // a highly contended mutex.
+ while (shared->looping_threads.load(std::memory_order_relaxed) -
+ shared->blocked_threads.load(std::memory_order_relaxed) !=
+ 1) {
+ }
+ shared->thread_has_mutex.store(false);
+ }
+ // Spin until some other thread has acquired the mutex before we block
+ // again. This ensures that we always go through the slow (queueing)
+ // acquisition path rather than reacquiring the mutex we just released.
+ while (!shared->thread_has_mutex.load(std::memory_order_relaxed) &&
+ shared->looping_threads.load(std::memory_order_relaxed) > 1) {
+ }
+ }
+ // The benchmark framework uses a barrier to ensure that all of the threads
+ // complete their benchmark loop together before any of the threads exit
+ // the loop. So, we need to remove ourselves from the "looping threads"
+ // counter here before potentially blocking on that barrier. Otherwise,
+ // another thread spinning above might wait forever for this thread to
+ // block on the mutex while we in fact are waiting to exit.
+ shared->looping_threads.fetch_add(-1);
+ }
+ absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
+ nullptr);
+}
+
+BENCHMARK(BM_MutexEnqueue)
+ ->Threads(4)
+ ->Threads(64)
+ ->Threads(128)
+ ->Threads(512)
+ ->ArgName("multiple_priorities")
+ ->Arg(false)
+ ->Arg(true);
+
template <typename MutexType>
void BM_Contended(benchmark::State& state) {
+ int priority = state.thread_index % state.range(1);
+ ScopedThreadMutexPriority priority_setter(priority);
+
struct Shared {
MutexType mu;
int data = 0;
@@ -85,81 +201,51 @@ void BM_Contended(benchmark::State& state) {
DelayNs(state.range(0), &shared->data);
}
}
+void SetupBenchmarkArgs(benchmark::internal::Benchmark* bm,
+ bool do_test_priorities) {
+ const int max_num_priorities = do_test_priorities ? 2 : 1;
+ bm->UseRealTime()
+ // ThreadPerCpu poorly handles non-power-of-two CPU counts.
+ ->Threads(1)
+ ->Threads(2)
+ ->Threads(4)
+ ->Threads(6)
+ ->Threads(8)
+ ->Threads(12)
+ ->Threads(16)
+ ->Threads(24)
+ ->Threads(32)
+ ->Threads(48)
+ ->Threads(64)
+ ->Threads(96)
+ ->Threads(128)
+ ->Threads(192)
+ ->Threads(256)
+ ->ArgNames({"cs_ns", "num_prios"});
+ // Some empirically chosen amounts of work in critical section.
+ // 1 is low contention, 2000 is high contention and few values in between.
+ for (int critical_section_ns : {1, 20, 50, 200, 2000}) {
+ for (int num_priorities = 1; num_priorities <= max_num_priorities;
+ num_priorities++) {
+ bm->ArgPair(critical_section_ns, num_priorities);
+ }
+ }
+}
BENCHMARK_TEMPLATE(BM_Contended, absl::Mutex)
- ->UseRealTime()
- // ThreadPerCpu poorly handles non-power-of-two CPU counts.
- ->Threads(1)
- ->Threads(2)
- ->Threads(4)
- ->Threads(6)
- ->Threads(8)
- ->Threads(12)
- ->Threads(16)
- ->Threads(24)
- ->Threads(32)
- ->Threads(48)
- ->Threads(64)
- ->Threads(96)
- ->Threads(128)
- ->Threads(192)
- ->Threads(256)
- // Some empirically chosen amounts of work in critical section.
- // 1 is low contention, 200 is high contention and few values in between.
- ->Arg(1)
- ->Arg(20)
- ->Arg(50)
- ->Arg(200);
+ ->Apply([](benchmark::internal::Benchmark* bm) {
+ SetupBenchmarkArgs(bm, /*do_test_priorities=*/true);
+ });
BENCHMARK_TEMPLATE(BM_Contended, absl::base_internal::SpinLock)
- ->UseRealTime()
- // ThreadPerCpu poorly handles non-power-of-two CPU counts.
- ->Threads(1)
- ->Threads(2)
- ->Threads(4)
- ->Threads(6)
- ->Threads(8)
- ->Threads(12)
- ->Threads(16)
- ->Threads(24)
- ->Threads(32)
- ->Threads(48)
- ->Threads(64)
- ->Threads(96)
- ->Threads(128)
- ->Threads(192)
- ->Threads(256)
- // Some empirically chosen amounts of work in critical section.
- // 1 is low contention, 200 is high contention and few values in between.
- ->Arg(1)
- ->Arg(20)
- ->Arg(50)
- ->Arg(200);
+ ->Apply([](benchmark::internal::Benchmark* bm) {
+ SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
+ });
BENCHMARK_TEMPLATE(BM_Contended, std::mutex)
- ->UseRealTime()
- // ThreadPerCpu poorly handles non-power-of-two CPU counts.
- ->Threads(1)
- ->Threads(2)
- ->Threads(4)
- ->Threads(6)
- ->Threads(8)
- ->Threads(12)
- ->Threads(16)
- ->Threads(24)
- ->Threads(32)
- ->Threads(48)
- ->Threads(64)
- ->Threads(96)
- ->Threads(128)
- ->Threads(192)
- ->Threads(256)
- // Some empirically chosen amounts of work in critical section.
- // 1 is low contention, 200 is high contention and few values in between.
- ->Arg(1)
- ->Arg(20)
- ->Arg(50)
- ->Arg(200);
+ ->Apply([](benchmark::internal::Benchmark* bm) {
+ SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
+ });
// Measure the overhead of conditions on mutex release (when they must be
// evaluated). Mutex has (some) support for equivalence classes allowing