author    | Dmitry Vyukov <dvyukov@google.com>            | 2023-09-18 20:09:03 -0700
committer | Copybara-Service <copybara-worker@google.com> | 2023-09-18 20:09:50 -0700
commit    | cffc9ef2b1174dea2b78cbf5efeeaea1606ad25b
tree      | 2fe706d08a05e6a0e2514952671d091af150ec4c /absl/synchronization
parent    | a5dc018f1016ffc92024338c300d64acfb1475f5
absl: speed up Mutex::Lock
Currently Mutex::Lock contains a non-inlined, non-tail call chain:
TryAcquireWithSpinning -> GetMutexGlobals -> LowLevelCallOnce -> init closure
This turns the function into a non-leaf function with stack frame allocation
and additional register use. Remove this non-tail call to make the function a leaf.
Move the spin iteration initialization to LockSlow.
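In outline, the spin count becomes a relaxed atomic with a -1 "uninitialized" sentinel:
the spin path only performs a relaxed load, and the one-time setup moves into the slow
path. Below is a minimal self-contained sketch of that technique using hypothetical
names (demo::SpinLock, DetermineSpinIterations); it is not Abseil code, the real change
is in the diff further down.

// Standalone sketch of the technique with hypothetical names; the real
// change is in the Abseil diff below.
#include <atomic>
#include <cstdint>
#include <thread>

namespace demo {

// -1 means "not yet initialized"; mirrors spinloop_iterations{-1} in the diff.
std::atomic<int> spin_iterations{-1};

int DetermineSpinIterations() {
  // Stand-in for the absl::base_internal::NumCPUs() > 1 check.
  return std::thread::hardware_concurrency() > 1 ? 1500 : 0;
}

class SpinLock {
 public:
  // Fast path: a single CAS and no non-tail calls that would force a frame.
  void Lock() {
    uintptr_t expected = 0;
    if (!state_.compare_exchange_strong(expected, 1, std::memory_order_acquire,
                                        std::memory_order_relaxed)) {
      LockSlow();
    }
  }

  void Unlock() { state_.store(0, std::memory_order_release); }

 private:
  // Slow path: only contended acquisitions pay for the one-time setup.
  void LockSlow() {
    int spins = spin_iterations.load(std::memory_order_relaxed);
    if (spins < 0) {
      // A racy duplicate initialization is harmless: concurrent callers
      // compute and store the same value, so no once-flag is needed here.
      spins = DetermineSpinIterations();
      spin_iterations.store(spins, std::memory_order_relaxed);
    }
    for (;;) {
      for (int i = 0; i < spins; ++i) {
        uintptr_t expected = 0;
        if (state_.compare_exchange_weak(expected, 1,
                                         std::memory_order_acquire,
                                         std::memory_order_relaxed)) {
          return;
        }
      }
      std::this_thread::yield();  // Stand-in for the real sleep/yield logic.
    }
  }

  std::atomic<uintptr_t> state_{0};
};

}  // namespace demo

As in the commit, the initialization race is benign, which is what allows dropping the
once-initialized global from the fast path entirely.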
Current Lock happy path:
00000000001edc20 <absl::Mutex::Lock()>:
1edc20: 55 push %rbp
1edc21: 48 89 e5 mov %rsp,%rbp
1edc24: 53 push %rbx
1edc25: 50 push %rax
1edc26: 48 89 fb mov %rdi,%rbx
1edc29: 48 8b 07 mov (%rdi),%rax
1edc2c: a8 19 test $0x19,%al
1edc2e: 75 0e jne 1edc3e <absl::Mutex::Lock()+0x1e>
1edc30: 48 89 c1 mov %rax,%rcx
1edc33: 48 83 c9 08 or $0x8,%rcx
1edc37: f0 48 0f b1 0b lock cmpxchg %rcx,(%rbx)
1edc3c: 74 42 je 1edc80 <absl::Mutex::Lock()+0x60>
... unhappy path ...
1edc80: 48 83 c4 08 add $0x8,%rsp
1edc84: 5b pop %rbx
1edc85: 5d pop %rbp
1edc86: c3 ret
New Lock happy path:
00000000001eea80 <absl::Mutex::Lock()>:
1eea80: 48 8b 07 mov (%rdi),%rax
1eea83: a8 19 test $0x19,%al
1eea85: 75 0f jne 1eea96 <absl::Mutex::Lock()+0x16>
1eea87: 48 89 c1 mov %rax,%rcx
1eea8a: 48 83 c9 08 or $0x8,%rcx
1eea8e: f0 48 0f b1 0f lock cmpxchg %rcx,(%rdi)
1eea93: 75 01 jne 1eea96 <absl::Mutex::Lock()+0x16>
1eea95: c3 ret
... unhappy path ...
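For reference, the new happy path is the fast-acquire branch of Mutex::Lock() shown in
the diff below. The following is that branch excerpted and annotated with the
corresponding instructions (slow-path body and debug-only bookkeeping elided); the bit
values are the ones implied by the listing, i.e. kMuWriter == 0x8 and
kMuWriter | kMuReader | kMuEvent == 0x19:

void Mutex::Lock() {
  // (Debug-only deadlock-check bookkeeping compiles away here.)
  intptr_t v = mu_.load(std::memory_order_relaxed);     // 1eea80: mov (%rdi),%rax
  // 1eea83/85: test $0x19,%al; jne -> unhappy path if any of
  // kMuWriter | kMuReader | kMuEvent is already set.
  if (ABSL_PREDICT_FALSE((v & (kMuWriter | kMuReader | kMuEvent)) != 0) ||
      ABSL_PREDICT_FALSE(!mu_.compare_exchange_strong(  // 1eea8e: lock cmpxchg %rcx,(%rdi)
          v, kMuWriter | v,                             // 1eea87/8a: mov %rax,%rcx; or $0x8,%rcx
          std::memory_order_acquire, std::memory_order_relaxed))) {
    // 1eea93: jne -> spin, then LockSlow (the unhappy path).
  }
  // 1eea95: ret -- lock acquired with no frame setup.
}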
PiperOrigin-RevId: 566488042
Change-Id: I62f854b82a322cfb1d42c34f8ed01b4677693fca
Diffstat (limited to 'absl/synchronization')
-rw-r--r-- | absl/synchronization/mutex.cc | 36
1 file changed, 23 insertions, 13 deletions
diff --git a/absl/synchronization/mutex.cc b/absl/synchronization/mutex.cc
index 268327de..5d1a516d 100644
--- a/absl/synchronization/mutex.cc
+++ b/absl/synchronization/mutex.cc
@@ -129,11 +129,12 @@ enum DelayMode { AGGRESSIVE, GENTLE };
 
 struct ABSL_CACHELINE_ALIGNED MutexGlobals {
   absl::once_flag once;
-  int spinloop_iterations = 0;
   int32_t mutex_sleep_spins[2] = {};
   absl::Duration mutex_sleep_time;
 };
 
+std::atomic<int> spinloop_iterations{-1};
+
 absl::Duration MeasureTimeToYield() {
   absl::Time before = absl::Now();
   ABSL_INTERNAL_C_SYMBOL(AbslInternalMutexYield)();
@@ -144,12 +145,11 @@ const MutexGlobals& GetMutexGlobals() {
   ABSL_CONST_INIT static MutexGlobals data;
   absl::base_internal::LowLevelCallOnce(&data.once, [&]() {
     if (absl::base_internal::NumCPUs() > 1) {
-      // If this is multiprocessor, allow spinning. If the mode is
-      // aggressive then spin many times before yielding. If the mode is
-      // gentle then spin only a few times before yielding. Aggressive spinning
-      // is used to ensure that an Unlock() call, which must get the spin lock
-      // for any thread to make progress gets it without undue delay.
-      data.spinloop_iterations = 1500;
+      // If the mode is aggressive then spin many times before yielding.
+      // If the mode is gentle then spin only a few times before yielding.
+      // Aggressive spinning is used to ensure that an Unlock() call,
+      // which must get the spin lock for any thread to make progress gets it
+      // without undue delay.
       data.mutex_sleep_spins[AGGRESSIVE] = 5000;
       data.mutex_sleep_spins[GENTLE] = 250;
       data.mutex_sleep_time = absl::Microseconds(10);
@@ -157,7 +157,6 @@ const MutexGlobals& GetMutexGlobals() {
       // If this a uniprocessor, only yield/sleep. Real-time threads are often
       // unable to yield, so the sleep time needs to be long enough to keep
       // the calling thread asleep until scheduling happens.
-      data.spinloop_iterations = 0;
       data.mutex_sleep_spins[AGGRESSIVE] = 0;
       data.mutex_sleep_spins[GENTLE] = 0;
       data.mutex_sleep_time = MeasureTimeToYield() * 5;
@@ -1487,7 +1486,7 @@ void Mutex::AssertNotHeld() const {
 // Attempt to acquire *mu, and return whether successful. The implementation
 // may spin for a short while if the lock cannot be acquired immediately.
 static bool TryAcquireWithSpinning(std::atomic<intptr_t>* mu) {
-  int c = GetMutexGlobals().spinloop_iterations;
+  int c = spinloop_iterations.load(std::memory_order_relaxed);
   do {  // do/while somewhat faster on AMD
     intptr_t v = mu->load(std::memory_order_relaxed);
     if ((v & (kMuReader | kMuEvent)) != 0) {
@@ -1507,11 +1506,12 @@ void Mutex::Lock() {
   GraphId id = DebugOnlyDeadlockCheck(this);
   intptr_t v = mu_.load(std::memory_order_relaxed);
   // try fast acquire, then spin loop
-  if ((v & (kMuWriter | kMuReader | kMuEvent)) != 0 ||
-      !mu_.compare_exchange_strong(v, kMuWriter | v, std::memory_order_acquire,
-                                   std::memory_order_relaxed)) {
+  if (ABSL_PREDICT_FALSE((v & (kMuWriter | kMuReader | kMuEvent)) != 0) ||
+      ABSL_PREDICT_FALSE(!mu_.compare_exchange_strong(
+          v, kMuWriter | v, std::memory_order_acquire,
+          std::memory_order_relaxed))) {
     // try spin acquire, then slow loop
-    if (!TryAcquireWithSpinning(&this->mu_)) {
+    if (ABSL_PREDICT_FALSE(!TryAcquireWithSpinning(&this->mu_))) {
       this->LockSlow(kExclusive, nullptr, 0);
     }
   }
@@ -1746,6 +1746,16 @@ static intptr_t IgnoreWaitingWritersMask(int flag) {
 // Internal version of LockWhen().  See LockSlowWithDeadline()
 ABSL_ATTRIBUTE_NOINLINE void Mutex::LockSlow(MuHow how, const Condition* cond,
                                              int flags) {
+  if (ABSL_PREDICT_FALSE(spinloop_iterations.load(std::memory_order_relaxed) <
+                         0)) {
+    if (absl::base_internal::NumCPUs() > 1) {
+      // If this is multiprocessor, allow spinning.
+      spinloop_iterations.store(1500, std::memory_order_relaxed);
+    } else {
+      // If this a uniprocessor, only yield/sleep.
+      spinloop_iterations.store(0, std::memory_order_relaxed);
+    }
+  }
   ABSL_RAW_CHECK(
       this->LockSlowWithDeadline(how, cond, KernelTimeout::Never(), flags),
       "condition untrue on return from LockSlow");