 absl/flags/internal/sequence_lock_test.cc |  23
 absl/synchronization/mutex.cc             |  68
 absl/synchronization/mutex_benchmark.cc   | 224
 3 files changed, 204 insertions(+), 111 deletions(-)
diff --git a/absl/flags/internal/sequence_lock_test.cc b/absl/flags/internal/sequence_lock_test.cc
index 9aff1edc..ff8b476b 100644
--- a/absl/flags/internal/sequence_lock_test.cc
+++ b/absl/flags/internal/sequence_lock_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "absl/flags/internal/sequence_lock.h"
 
+#include <algorithm>
 #include <atomic>
 #include <thread>  // NOLINT(build/c++11)
 #include <tuple>
@@ -112,13 +113,21 @@ std::vector<int> MultiplicativeRange(int low, int high, int scale) {
   return result;
 }
 
-INSTANTIATE_TEST_SUITE_P(TestManyByteSizes, ConcurrentSequenceLockTest,
-                         testing::Combine(
-                             // Buffer size (bytes).
-                             testing::Range(1, 128),
-                             // Number of reader threads.
-                             testing::ValuesIn(MultiplicativeRange(
-                                 1, absl::base_internal::NumCPUs(), 2))));
+#ifndef ABSL_HAVE_THREAD_SANITIZER
+const int kMaxThreads = absl::base_internal::NumCPUs();
+#else
+// With TSAN, a lot of threads contending for atomic access on the sequence
+// lock make this test run too slowly.
+const int kMaxThreads = std::min(absl::base_internal::NumCPUs(), 4);
+#endif
+
+INSTANTIATE_TEST_SUITE_P(
+    TestManyByteSizes, ConcurrentSequenceLockTest,
+    testing::Combine(
+        // Buffer size (bytes).
+        testing::Range(1, 128),
+        // Number of reader threads.
+        testing::ValuesIn(MultiplicativeRange(1, kMaxThreads, 2))));
 
 // Simple single-threaded test, parameterized by the size of the buffer to be
 // protected.
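The test change above caps the number of reader threads at 4 under ThreadSanitizer. For illustration only (not part of this commit), here is a minimal standalone C++ sketch of how the reader-thread counts could be derived; MultiplicativeRange's body is not shown in this hunk, so GeometricThreadCounts below only mirrors its apparent contract (geometric steps from low up to and including high), and the CPU count and TSAN macro are stand-ins:

#include <algorithm>
#include <iostream>
#include <vector>

// Hypothetical stand-in for MultiplicativeRange() in sequence_lock_test.cc:
// geometric steps from `low` toward `high`, with `high` included at the end.
std::vector<int> GeometricThreadCounts(int low, int high, int scale) {
  std::vector<int> result;
  for (int current = low; current < high; current *= scale) {
    result.push_back(current);
  }
  result.push_back(high);
  return result;
}

int main() {
  const int num_cpus = 16;  // stand-in for absl::base_internal::NumCPUs()
#ifdef ASSUME_TSAN_BUILD    // illustrative stand-in for ABSL_HAVE_THREAD_SANITIZER
  const int max_threads = std::min(num_cpus, 4);  // cap added by this commit
#else
  const int max_threads = num_cpus;
#endif
  // Prints "1 2 4 8 16" normally, and "1 2 4" when the TSAN cap applies.
  for (int n : GeometricThreadCounts(1, max_threads, 2)) std::cout << n << ' ';
  std::cout << '\n';
}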
diff --git a/absl/synchronization/mutex.cc b/absl/synchronization/mutex.cc
index 82b631af..d2468ce5 100644
--- a/absl/synchronization/mutex.cc
+++ b/absl/synchronization/mutex.cc
@@ -761,11 +761,13 @@ void SetMutexDeadlockDetectionMode(OnDeadlockCycle mode) {
   synch_deadlock_detection.store(mode, std::memory_order_release);
 }
 
-// Return true iff threads x and y are waiting on the same condition for the
-// same type of lock.  Requires that x and y be waiting on the same Mutex
-// queue.
-static bool MuSameCondition(PerThreadSynch *x, PerThreadSynch *y) {
-  return x->waitp->how == y->waitp->how &&
+// Return true iff threads x and y are part of the same equivalence
+// class of waiters. An equivalence class is defined as the set of
+// waiters with the same condition, type of lock, and thread priority.
+//
+// Requires that x and y be waiting on the same Mutex queue.
+static bool MuEquivalentWaiter(PerThreadSynch *x, PerThreadSynch *y) {
+  return x->waitp->how == y->waitp->how && x->priority == y->priority &&
          Condition::GuaranteedEqual(x->waitp->cond, y->waitp->cond);
 }
 
@@ -784,18 +786,19 @@ static inline PerThreadSynch *GetPerThreadSynch(intptr_t v) {
 //   - invalid (iff x is not in a Mutex wait queue),
 //   - null, or
 //   - a pointer to a distinct thread waiting later in the same Mutex queue
-//     such that all threads in [x, x->skip] have the same condition and
-//     lock type (MuSameCondition() is true for all pairs in [x, x->skip]).
+//     such that all threads in [x, x->skip] have the same condition, priority
+//     and lock type (MuEquivalentWaiter() is true for all pairs in [x,
+//     x->skip]).
 // In addition, if x->skip is valid, (x->may_skip || x->skip == null)
 //
-// By the spec of MuSameCondition(), it is not necessary when removing the
+// By the spec of MuEquivalentWaiter(), it is not necessary when removing the
 // first runnable thread y from the front of a Mutex queue to adjust the skip
 // field of another thread x because if x->skip==y, x->skip must (have) become
 // invalid before y is removed.  The function TryRemove can remove a specified
 // thread from an arbitrary position in the queue whether runnable or not, so
 // it fixes up skip fields that would otherwise be left dangling.
 // The statement
-//   if (x->may_skip && MuSameCondition(x, x->next)) { x->skip = x->next; }
+//   if (x->may_skip && MuEquivalentWaiter(x, x->next)) { x->skip = x->next; }
 // maintains the invariant provided x is not the last waiter in a Mutex queue
 // The statement
 //   if (x->skip != null) { x->skip = x->skip->skip; }
@@ -929,24 +932,17 @@ static PerThreadSynch *Enqueue(PerThreadSynch *head,
     if (s->priority > head->priority) {  // s's priority is above head's
       // try to put s in priority-fifo order, or failing that at the front.
       if (!head->maybe_unlocking) {
-        // No unlocker can be scanning the queue, so we can insert between
-        // skip-chains, and within a skip-chain if it has the same condition as
-        // s.  We insert in priority-fifo order, examining the end of every
-        // skip-chain, plus every element with the same condition as s.
+        // No unlocker can be scanning the queue, so we can insert into the
+        // middle of the queue.
+        //
+        // Within a skip chain, all waiters have the same priority, so we can
+        // skip forward through the chains until we find one with a lower
+        // priority than the waiter to be enqueued.
         PerThreadSynch *advance_to = head;    // next value of enqueue_after
-        PerThreadSynch *cur;                  // successor of enqueue_after
         do {
           enqueue_after = advance_to;
-          cur = enqueue_after->next;  // this advance ensures progress
-          advance_to = Skip(cur);   // normally, advance to end of skip chain
-                                    // (side-effect: optimizes skip chain)
-          if (advance_to != cur && s->priority > advance_to->priority &&
-              MuSameCondition(s, cur)) {
-            // but this skip chain is not a singleton, s has higher priority
-            // than its tail and has the same condition as the chain,
-            // so we can insert within the skip-chain
-            advance_to = cur;         // advance by just one
-          }
+          // (side-effect: optimizes skip chain)
+          advance_to = Skip(enqueue_after->next);
         } while (s->priority <= advance_to->priority);
               // termination guaranteed because s->priority > head->priority
               // and head is the end of a skip chain
@@ -965,21 +961,21 @@ static PerThreadSynch *Enqueue(PerThreadSynch *head,
 
       // enqueue_after can be: head, Skip(...), or cur.
       // The first two imply enqueue_after->skip == nullptr, and
-      // the last is used only if MuSameCondition(s, cur).
+      // the last is used only if MuEquivalentWaiter(s, cur).
       // We require this because clearing enqueue_after->skip
       // is impossible; enqueue_after's predecessors might also
       // incorrectly skip over s if we were to allow other
       // insertion points.
-      ABSL_RAW_CHECK(
-          enqueue_after->skip == nullptr || MuSameCondition(enqueue_after, s),
-          "Mutex Enqueue failure");
+      ABSL_RAW_CHECK(enqueue_after->skip == nullptr ||
+                         MuEquivalentWaiter(enqueue_after, s),
+                     "Mutex Enqueue failure");
 
       if (enqueue_after != head && enqueue_after->may_skip &&
-          MuSameCondition(enqueue_after, enqueue_after->next)) {
+          MuEquivalentWaiter(enqueue_after, enqueue_after->next)) {
         // enqueue_after can skip to its new successor, s
         enqueue_after->skip = enqueue_after->next;
       }
-      if (MuSameCondition(s, s->next)) {  // s->may_skip is known to be true
+      if (MuEquivalentWaiter(s, s->next)) {  // s->may_skip is known to be true
         s->skip = s->next;                // s may skip to its successor
       }
     } else {   // enqueue not done any other way, so
@@ -989,7 +985,7 @@ static PerThreadSynch *Enqueue(PerThreadSynch *head,
       head->next = s;
       s->readers = head->readers;                  // reader count is from previous head
       s->maybe_unlocking = head->maybe_unlocking;  // same for unlock hint
-      if (head->may_skip && MuSameCondition(head, s)) {
+      if (head->may_skip && MuEquivalentWaiter(head, s)) {
         // head now has successor; may skip
         head->skip = s;
       }
@@ -1009,7 +1005,7 @@ static PerThreadSynch *Dequeue(PerThreadSynch *head, PerThreadSynch *pw) {
   pw->next = w->next;         // snip w out of list
   if (head == w) {            // we removed the head
     head = (pw == w) ? nullptr : pw;  // either emptied list, or pw is new head
-  } else if (pw != head && MuSameCondition(pw, pw->next)) {
+  } else if (pw != head && MuEquivalentWaiter(pw, pw->next)) {
     // pw can skip to its new successor
     if (pw->next->skip !=
         nullptr) {  // either skip to its successors skip target
@@ -1079,11 +1075,13 @@ void Mutex::TryRemove(PerThreadSynch *s) {
     PerThreadSynch *w;
     if ((w = pw->next) != s) {  // search for thread,
       do {                      // processing at least one element
-        if (!MuSameCondition(s, w)) {  // seeking different condition
+        // If the current element isn't equivalent to the waiter to be
+        // removed, we can skip the entire chain.
+        if (!MuEquivalentWaiter(s, w)) {
           pw = Skip(w);                // so skip all that won't match
           // we don't have to worry about dangling skip fields
           // in the threads we skipped; none can point to s
-          // because their condition differs from s
+          // because they are in a different equivalence class.
         } else {          // seeking same condition
           FixSkip(w, s);  // fix up any skip pointer from w to s
           pw = w;
@@ -2148,7 +2146,7 @@ ABSL_ATTRIBUTE_NOINLINE void Mutex::UnlockSlow(SynchWaitParams *waitp) {
           !old_h->may_skip) {                  // we used old_h as a terminator
         old_h->may_skip = true;                // allow old_h to skip once more
         ABSL_RAW_CHECK(old_h->skip == nullptr, "illegal skip from head");
-        if (h != old_h && MuSameCondition(old_h, old_h->next)) {
+        if (h != old_h && MuEquivalentWaiter(old_h, old_h->next)) {
           old_h->skip = old_h->next;  // old_h not head & can skip to successor
         }
       }
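The core of the mutex.cc change is widening the waiter-grouping predicate from "same condition and lock type" to "same condition, lock type, and priority", which is what lets the new Enqueue hop from skip-chain end to skip-chain end without ever inspecting waiters inside a chain. For illustration only (not part of this commit), a minimal standalone sketch of that equivalence idea, using toy types rather than the real PerThreadSynch and Condition::GuaranteedEqual():

#include <cassert>

// Simplified model of the waiter-equivalence idea in this change; the real
// code operates on PerThreadSynch and Condition, not on this toy struct.
struct Waiter {
  bool exclusive;         // stands in for waitp->how (reader vs. writer lock)
  const void* condition;  // stands in for waitp->cond (nullptr == "no condition")
  int priority;           // stands in for PerThreadSynch::priority
};

// Analogue of MuEquivalentWaiter(): waiters may share a skip chain only if
// they want the same kind of lock, wait on a provably equal condition, and
// (after this change) also have the same priority.
bool EquivalentWaiter(const Waiter& x, const Waiter& y) {
  return x.exclusive == y.exclusive && x.priority == y.priority &&
         x.condition == y.condition;  // toy stand-in for GuaranteedEqual()
}

int main() {
  Waiter low = {true, nullptr, 0};
  Waiter high = {true, nullptr, 1};
  // Same lock type and condition but different priorities: after this change
  // such waiters land in different skip chains, so a priority-aware Enqueue
  // can compare priorities at chain boundaries only.
  assert(!EquivalentWaiter(low, high));
  assert(EquivalentWaiter(low, Waiter{true, nullptr, 0}));
}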
diff --git a/absl/synchronization/mutex_benchmark.cc b/absl/synchronization/mutex_benchmark.cc
index 933ea14f..e35aed8b 100644
--- a/absl/synchronization/mutex_benchmark.cc
+++ b/absl/synchronization/mutex_benchmark.cc
@@ -61,8 +61,124 @@ class RaiiLocker<std::mutex> {
   std::mutex* mu_;
 };
 
+// RAII object to change the Mutex priority of the running thread.
+class ScopedThreadMutexPriority {
+ public:
+  explicit ScopedThreadMutexPriority(int priority) {
+    absl::base_internal::ThreadIdentity* identity =
+        absl::synchronization_internal::GetOrCreateCurrentThreadIdentity();
+    identity->per_thread_synch.priority = priority;
+    // Bump next_priority_read_cycles to the infinite future so that the
+    // implementation doesn't re-read the thread's actual scheduler priority
+    // and replace our temporary scoped priority.
+    identity->per_thread_synch.next_priority_read_cycles =
+        std::numeric_limits<int64_t>::max();
+  }
+  ~ScopedThreadMutexPriority() {
+    // Reset the "next priority read time" back to the infinite past so that
+    // the next time the Mutex implementation wants to know this thread's
+    // priority, it re-reads it from the OS instead of using our overridden
+    // priority.
+    absl::synchronization_internal::GetOrCreateCurrentThreadIdentity()
+        ->per_thread_synch.next_priority_read_cycles =
+        std::numeric_limits<int64_t>::min();
+  }
+};
+
+void BM_MutexEnqueue(benchmark::State& state) {
+  // In the "multiple priorities" variant of the benchmark, one of the
+  // threads runs with Mutex priority 0 while the rest run at elevated priority.
+  // This benchmarks the performance impact of the presence of a low priority
+  // waiter when a higher priority waiter adds itself to the queue
+  // (b/175224064).
+  //
+  // NOTE: The actual scheduler priority is not modified in this benchmark:
+  // all of the threads get CPU slices with the same priority.  Only the
+  // Mutex queueing behavior is modified.
+  const bool multiple_priorities = state.range(0);
+  ScopedThreadMutexPriority priority_setter(
+      (multiple_priorities && state.thread_index != 0) ? 1 : 0);
+
+  struct Shared {
+    absl::Mutex mu;
+    std::atomic<int> looping_threads{0};
+    std::atomic<int> blocked_threads{0};
+    std::atomic<bool> thread_has_mutex{false};
+  };
+  static Shared* shared = new Shared;
+
+  // Set up 'blocked_threads' to count how many threads are currently blocked
+  // in Abseil synchronization code.
+  //
+  // NOTE: Blocking done within the Google Benchmark library itself (e.g.
+  // the barrier which synchronizes threads entering and exiting the benchmark
+  // loop) does _not_ get registered in this counter.  This is because Google
+  // Benchmark uses its own synchronization primitives based on std::mutex, not
+  // Abseil synchronization primitives.  If at some point the benchmark library
+  // merges into Abseil, this code may break.
+  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
+      &shared->blocked_threads);
+
+  // The benchmark framework may run several iterations in the same process,
+  // reusing the same static-initialized 'shared' object.  Given the semantics
+  // of the members here, we expect everything to be reset to zero by the
+  // end of any iteration.  Assert that's the case, just to be sure.
+  ABSL_RAW_CHECK(
+      shared->looping_threads.load(std::memory_order_relaxed) == 0 &&
+          shared->blocked_threads.load(std::memory_order_relaxed) == 0 &&
+          !shared->thread_has_mutex.load(std::memory_order_relaxed),
+      "Shared state isn't zeroed at start of benchmark iteration");
+
+  static constexpr int kBatchSize = 1000;
+  while (state.KeepRunningBatch(kBatchSize)) {
+    shared->looping_threads.fetch_add(1);
+    for (int i = 0; i < kBatchSize; i++) {
+      {
+        absl::MutexLock l(&shared->mu);
+        shared->thread_has_mutex.store(true, std::memory_order_relaxed);
+        // Spin until all other threads are either out of the benchmark loop
+        // or blocked on the mutex.  This ensures that the mutex queue is kept
+        // at its maximal length to benchmark the performance of queueing on
+        // a highly contended mutex.
+        while (shared->looping_threads.load(std::memory_order_relaxed) -
+                   shared->blocked_threads.load(std::memory_order_relaxed) !=
+               1) {
+        }
+        shared->thread_has_mutex.store(false);
+      }
+      // Spin until some other thread has acquired the mutex before we block
+      // again.  This ensures that we always go through the slow (queueing)
+      // acquisition path rather than reacquiring the mutex we just released.
+      while (!shared->thread_has_mutex.load(std::memory_order_relaxed) &&
+             shared->looping_threads.load(std::memory_order_relaxed) > 1) {
+      }
+    }
+    // The benchmark framework uses a barrier to ensure that all of the threads
+    // complete their benchmark loop together before any of the threads exit
+    // the loop.  So, we need to remove ourselves from the "looping threads"
+    // counter here before potentially blocking on that barrier.  Otherwise,
+    // another thread spinning above might wait forever for this thread to
+    // block on the mutex while we in fact are waiting to exit.
+    shared->looping_threads.fetch_add(-1);
+  }
+  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
+      nullptr);
+}
+
+BENCHMARK(BM_MutexEnqueue)
+    ->Threads(4)
+    ->Threads(64)
+    ->Threads(128)
+    ->Threads(512)
+    ->ArgName("multiple_priorities")
+    ->Arg(false)
+    ->Arg(true);
+
 template <typename MutexType>
 void BM_Contended(benchmark::State& state) {
+  int priority = state.thread_index % state.range(1);
+  ScopedThreadMutexPriority priority_setter(priority);
+
   struct Shared {
     MutexType mu;
     int data = 0;
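BM_Contended now assigns each benchmark thread a Mutex priority derived from its thread index, so that when the second benchmark argument (num_prios, registered in SetupBenchmarkArgs in the next hunk) is 2, the wait queue mixes low- and high-priority waiters. For illustration only (not part of this commit), a small sketch of that mapping:

#include <initializer_list>
#include <iostream>

// Illustration of the priority assignment added to BM_Contended:
// priority = thread_index % num_prios.  With num_prios == 1 every thread
// gets priority 0 (the pre-existing behavior); with num_prios == 2 the
// threads alternate between priorities 0 and 1.
int main() {
  for (int num_prios : {1, 2}) {
    std::cout << "num_prios=" << num_prios << ":";
    for (int thread_index = 0; thread_index < 6; ++thread_index) {
      std::cout << " t" << thread_index << "->p" << thread_index % num_prios;
    }
    std::cout << "\n";
  }
}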
@@ -85,81 +201,51 @@ void BM_Contended(benchmark::State& state) {
     DelayNs(state.range(0), &shared->data);
   }
 }
+void SetupBenchmarkArgs(benchmark::internal::Benchmark* bm,
+                        bool do_test_priorities) {
+  const int max_num_priorities = do_test_priorities ? 2 : 1;
+  bm->UseRealTime()
+      // ThreadPerCpu poorly handles non-power-of-two CPU counts.
+      ->Threads(1)
+      ->Threads(2)
+      ->Threads(4)
+      ->Threads(6)
+      ->Threads(8)
+      ->Threads(12)
+      ->Threads(16)
+      ->Threads(24)
+      ->Threads(32)
+      ->Threads(48)
+      ->Threads(64)
+      ->Threads(96)
+      ->Threads(128)
+      ->Threads(192)
+      ->Threads(256)
+      ->ArgNames({"cs_ns", "num_prios"});
+  // Some empirically chosen amounts of work in critical section.
+  // 1 is low contention, 2000 is high contention and few values in between.
+  for (int critical_section_ns : {1, 20, 50, 200, 2000}) {
+    for (int num_priorities = 1; num_priorities <= max_num_priorities;
+         num_priorities++) {
+      bm->ArgPair(critical_section_ns, num_priorities);
+    }
+  }
+}
 
 BENCHMARK_TEMPLATE(BM_Contended, absl::Mutex)
-    ->UseRealTime()
-    // ThreadPerCpu poorly handles non-power-of-two CPU counts.
-    ->Threads(1)
-    ->Threads(2)
-    ->Threads(4)
-    ->Threads(6)
-    ->Threads(8)
-    ->Threads(12)
-    ->Threads(16)
-    ->Threads(24)
-    ->Threads(32)
-    ->Threads(48)
-    ->Threads(64)
-    ->Threads(96)
-    ->Threads(128)
-    ->Threads(192)
-    ->Threads(256)
-    // Some empirically chosen amounts of work in critical section.
-    // 1 is low contention, 200 is high contention and few values in between.
-    ->Arg(1)
-    ->Arg(20)
-    ->Arg(50)
-    ->Arg(200);
+    ->Apply([](benchmark::internal::Benchmark* bm) {
+      SetupBenchmarkArgs(bm, /*do_test_priorities=*/true);
+    });
 
 BENCHMARK_TEMPLATE(BM_Contended, absl::base_internal::SpinLock)
-    ->UseRealTime()
-    // ThreadPerCpu poorly handles non-power-of-two CPU counts.
-    ->Threads(1)
-    ->Threads(2)
-    ->Threads(4)
-    ->Threads(6)
-    ->Threads(8)
-    ->Threads(12)
-    ->Threads(16)
-    ->Threads(24)
-    ->Threads(32)
-    ->Threads(48)
-    ->Threads(64)
-    ->Threads(96)
-    ->Threads(128)
-    ->Threads(192)
-    ->Threads(256)
-    // Some empirically chosen amounts of work in critical section.
-    // 1 is low contention, 200 is high contention and few values in between.
-    ->Arg(1)
-    ->Arg(20)
-    ->Arg(50)
-    ->Arg(200);
+    ->Apply([](benchmark::internal::Benchmark* bm) {
+      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
+    });
 
 BENCHMARK_TEMPLATE(BM_Contended, std::mutex)
-    ->UseRealTime()
-    // ThreadPerCpu poorly handles non-power-of-two CPU counts.
-    ->Threads(1)
-    ->Threads(2)
-    ->Threads(4)
-    ->Threads(6)
-    ->Threads(8)
-    ->Threads(12)
-    ->Threads(16)
-    ->Threads(24)
-    ->Threads(32)
-    ->Threads(48)
-    ->Threads(64)
-    ->Threads(96)
-    ->Threads(128)
-    ->Threads(192)
-    ->Threads(256)
-    // Some empirically chosen amounts of work in critical section.
-    // 1 is low contention, 200 is high contention and few values in between.
-    ->Arg(1)
-    ->Arg(20)
-    ->Arg(50)
-    ->Arg(200);
+    ->Apply([](benchmark::internal::Benchmark* bm) {
+      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
+    });
 
 // Measure the overhead of conditions on mutex release (when they must be
 // evaluated).  Mutex has (some) support for equivalence classes allowing
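For illustration only (not part of this commit), a standalone sketch of the argument matrix that the new SetupBenchmarkArgs() helper registers: each critical-section duration is paired with every allowed priority count (2 for absl::Mutex, 1 for SpinLock and std::mutex, which have no notion of waiter priority):

#include <initializer_list>
#include <iostream>
#include <utility>
#include <vector>

// Mirrors the nested loops in SetupBenchmarkArgs(): returns the
// (cs_ns, num_prios) pairs that would be registered for one benchmark.
std::vector<std::pair<int, int>> ContendedArgs(bool do_test_priorities) {
  const int max_num_priorities = do_test_priorities ? 2 : 1;
  std::vector<std::pair<int, int>> args;
  for (int critical_section_ns : {1, 20, 50, 200, 2000}) {
    for (int num_priorities = 1; num_priorities <= max_num_priorities;
         num_priorities++) {
      args.emplace_back(critical_section_ns, num_priorities);
    }
  }
  return args;
}

int main() {
  // For absl::Mutex this yields ten cases: {1,1}, {1,2}, {20,1}, ..., {2000,2}.
  for (const auto& arg : ContendedArgs(/*do_test_priorities=*/true)) {
    std::cout << "cs_ns=" << arg.first << " num_prios=" << arg.second << "\n";
  }
}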