author     Dmitry Vyukov <dvyukov@google.com>             2023-09-20 10:26:25 -0700
committer  Copybara-Service <copybara-worker@google.com>  2023-09-20 10:27:09 -0700
commit     c45a4393c075fb0e50e408d9b69035a42ed2617c
tree       2b0e228ce7a1a564b5d5d9724257706307c606e5 /absl/synchronization
parent     adcaae433fe10da72bc4f8b61eaf559604b81d03
absl: speed up Mutex::[Reader]TryLock
Tidy up Mutex::[Reader]TryLock codegen by outlining the slow path
and the non-tail function call, and by un-unrolling the loop
(the pattern is sketched below).
Current codegen:
https://gist.githubusercontent.com/dvyukov/a4d353fd71ac873af9332c1340675b60/raw/226537ffa305b25a79ef3a85277fa870fee5191d/gistfile1.txt
New codegen:
https://gist.githubusercontent.com/dvyukov/686a094c5aa357025689764f155e5a29/raw/e3125c1cdb5669fac60faf336e2f60395e29d888/gistfile1.txt
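The change applies the standard fast-path/slow-path split. A minimal sketch of the idea, using a hypothetical SpinLock rather than the actual Mutex code (Abseil spells the attribute ABSL_ATTRIBUTE_NOINLINE; plain GCC/Clang syntax is used here):

#include <atomic>

class SpinLock {
 public:
  bool TryLock() {
    int expected = 0;
    // Fast path: a single CAS and no out-of-line calls, so the function
    // compiles to a handful of instructions and inlines into callers.
    if (state_.compare_exchange_strong(expected, 1,
                                       std::memory_order_acquire,
                                       std::memory_order_relaxed)) {
      return true;
    }
    return TryLockSlow();  // Rare: contention or event recording.
  }

 private:
  // Outlined so its register spills and non-tail calls don't bloat the
  // caller's codegen.
  __attribute__((noinline)) bool TryLockSlow() { return false; }
  std::atomic<int> state_{0};
};

The same reasoning motivates the un-unrolling: a retried CAS loop is dominated by the CAS itself, so the cost of the backward jump is negligible and unrolling only adds code size.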
name                                    old cpu/op   new cpu/op   delta
BM_TryLock                             18.0ns ± 0%  17.7ns ± 0%   -1.64%  (p=0.016 n=4+5)
BM_ReaderTryLock/real_time/threads:1   17.9ns ± 0%  17.9ns ± 0%   -0.10%  (p=0.016 n=5+5)
BM_ReaderTryLock/real_time/threads:72  9.61µs ± 8%  8.42µs ± 7%  -12.37%  (p=0.008 n=5+5)
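A microbenchmark in the style of BM_TryLock / BM_ReaderTryLock can be written with Google Benchmark roughly as follows (a sketch only; Abseil's own benchmark source may differ):

#include "absl/synchronization/mutex.h"
#include "benchmark/benchmark.h"

void BM_TryLock(benchmark::State& state) {
  absl::Mutex mu;  // Uncontended: exercises the fast path.
  for (auto _ : state) {
    if (mu.TryLock()) mu.Unlock();
  }
}
BENCHMARK(BM_TryLock);

void BM_ReaderTryLock(benchmark::State& state) {
  static absl::Mutex* mu = new absl::Mutex;  // Shared by all threads.
  for (auto _ : state) {
    if (mu->ReaderTryLock()) mu->ReaderUnlock();
  }
}
BENCHMARK(BM_ReaderTryLock)->UseRealTime()->Threads(1)->Threads(72);

BENCHMARK_MAIN();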
PiperOrigin-RevId: 567006472
Change-Id: Iea0747e71bbf2dc1f00c70a4235203071d795b99
Diffstat (limited to 'absl/synchronization')
-rw-r--r--  absl/synchronization/mutex.cc  108
-rw-r--r--  absl/synchronization/mutex.h     4
2 files changed, 71 insertions, 41 deletions
diff --git a/absl/synchronization/mutex.cc b/absl/synchronization/mutex.cc
index eb4b6e54..ac5f1696 100644
--- a/absl/synchronization/mutex.cc
+++ b/absl/synchronization/mutex.cc
@@ -1582,26 +1582,36 @@ bool Mutex::AwaitCommon(const Condition& cond, KernelTimeout t) {
 bool Mutex::TryLock() {
   ABSL_TSAN_MUTEX_PRE_LOCK(this, __tsan_mutex_try_lock);
   intptr_t v = mu_.load(std::memory_order_relaxed);
-  if ((v & (kMuWriter | kMuReader | kMuEvent)) == 0 &&  // try fast acquire
-      mu_.compare_exchange_strong(v, kMuWriter | v, std::memory_order_acquire,
-                                  std::memory_order_relaxed)) {
-    DebugOnlyLockEnter(this);
-    ABSL_TSAN_MUTEX_POST_LOCK(this, __tsan_mutex_try_lock, 0);
-    return true;
-  }
-  if ((v & kMuEvent) != 0) {  // we're recording events
-    if ((v & kExclusive->slow_need_zero) == 0 &&  // try fast acquire
-        mu_.compare_exchange_strong(
-            v, (kExclusive->fast_or | v) + kExclusive->fast_add,
-            std::memory_order_acquire, std::memory_order_relaxed)) {
+  // Try fast acquire.
+  if (ABSL_PREDICT_TRUE((v & (kMuWriter | kMuReader | kMuEvent)) == 0)) {
+    if (ABSL_PREDICT_TRUE(mu_.compare_exchange_strong(
+            v, kMuWriter | v, std::memory_order_acquire,
+            std::memory_order_relaxed))) {
       DebugOnlyLockEnter(this);
-      PostSynchEvent(this, SYNCH_EV_TRYLOCK_SUCCESS);
       ABSL_TSAN_MUTEX_POST_LOCK(this, __tsan_mutex_try_lock, 0);
       return true;
-    } else {
-      PostSynchEvent(this, SYNCH_EV_TRYLOCK_FAILED);
     }
+  } else if (ABSL_PREDICT_FALSE((v & kMuEvent) != 0)) {
+    // We're recording events.
+    return TryLockSlow();
+  }
+  ABSL_TSAN_MUTEX_POST_LOCK(
+      this, __tsan_mutex_try_lock | __tsan_mutex_try_lock_failed, 0);
+  return false;
+}
+
+ABSL_ATTRIBUTE_NOINLINE bool Mutex::TryLockSlow() {
+  intptr_t v = mu_.load(std::memory_order_relaxed);
+  if ((v & kExclusive->slow_need_zero) == 0 &&  // try fast acquire
+      mu_.compare_exchange_strong(
+          v, (kExclusive->fast_or | v) + kExclusive->fast_add,
+          std::memory_order_acquire, std::memory_order_relaxed)) {
+    DebugOnlyLockEnter(this);
+    PostSynchEvent(this, SYNCH_EV_TRYLOCK_SUCCESS);
+    ABSL_TSAN_MUTEX_POST_LOCK(this, __tsan_mutex_try_lock, 0);
+    return true;
   }
+  PostSynchEvent(this, SYNCH_EV_TRYLOCK_FAILED);
   ABSL_TSAN_MUTEX_POST_LOCK(
       this, __tsan_mutex_try_lock | __tsan_mutex_try_lock_failed, 0);
   return false;
@@ -1611,41 +1621,57 @@ bool Mutex::ReaderTryLock() {
   ABSL_TSAN_MUTEX_PRE_LOCK(this,
                            __tsan_mutex_read_lock | __tsan_mutex_try_lock);
   intptr_t v = mu_.load(std::memory_order_relaxed);
+  // Clang tends to unroll the loop when compiling with optimization.
+  // But in this case it just unnecessary increases code size.
+  // If CAS is failing due to contention, the jump cost is negligible.
+#if defined(__clang__)
+#pragma nounroll
+#endif
   // The while-loops (here and below) iterate only if the mutex word keeps
-  // changing (typically because the reader count changes) under the CAS. We
-  // limit the number of attempts to avoid having to think about livelock.
-  int loop_limit = 5;
-  while ((v & (kMuWriter | kMuWait | kMuEvent)) == 0 && loop_limit != 0) {
-    if (mu_.compare_exchange_strong(v, (kMuReader | v) + kMuOne,
-                                    std::memory_order_acquire,
-                                    std::memory_order_relaxed)) {
+  // changing (typically because the reader count changes) under the CAS.
+  // We limit the number of attempts to avoid having to think about livelock.
+  for (int loop_limit = 5; loop_limit != 0; loop_limit--) {
+    if (ABSL_PREDICT_FALSE((v & (kMuWriter | kMuWait | kMuEvent)) != 0)) {
+      break;
+    }
+    if (ABSL_PREDICT_TRUE(mu_.compare_exchange_strong(
+            v, (kMuReader | v) + kMuOne, std::memory_order_acquire,
+            std::memory_order_relaxed))) {
       DebugOnlyLockEnter(this);
       ABSL_TSAN_MUTEX_POST_LOCK(
           this, __tsan_mutex_read_lock | __tsan_mutex_try_lock, 0);
       return true;
     }
-    loop_limit--;
-    v = mu_.load(std::memory_order_relaxed);
   }
-  if ((v & kMuEvent) != 0) {  // we're recording events
-    loop_limit = 5;
-    while ((v & kShared->slow_need_zero) == 0 && loop_limit != 0) {
-      if (mu_.compare_exchange_strong(v, (kMuReader | v) + kMuOne,
-                                      std::memory_order_acquire,
-                                      std::memory_order_relaxed)) {
-        DebugOnlyLockEnter(this);
-        PostSynchEvent(this, SYNCH_EV_READERTRYLOCK_SUCCESS);
-        ABSL_TSAN_MUTEX_POST_LOCK(
-            this, __tsan_mutex_read_lock | __tsan_mutex_try_lock, 0);
-        return true;
-      }
-      loop_limit--;
-      v = mu_.load(std::memory_order_relaxed);
-    }
-    if ((v & kMuEvent) != 0) {
-      PostSynchEvent(this, SYNCH_EV_READERTRYLOCK_FAILED);
+  if (ABSL_PREDICT_TRUE((v & kMuEvent) == 0)) {
+    ABSL_TSAN_MUTEX_POST_LOCK(this,
+                              __tsan_mutex_read_lock | __tsan_mutex_try_lock |
+                                  __tsan_mutex_try_lock_failed,
+                              0);
+    return false;
+  }
+  // we're recording events
+  return ReaderTryLockSlow();
+}
+
+ABSL_ATTRIBUTE_NOINLINE bool Mutex::ReaderTryLockSlow() {
+  intptr_t v = mu_.load(std::memory_order_relaxed);
+#if defined(__clang__)
+#pragma nounroll
+#endif
+  for (int loop_limit = 5; loop_limit != 0; loop_limit--) {
+    if ((v & kShared->slow_need_zero) == 0 &&
+        mu_.compare_exchange_strong(v, (kMuReader | v) + kMuOne,
+                                    std::memory_order_acquire,
+                                    std::memory_order_relaxed)) {
+      DebugOnlyLockEnter(this);
+      PostSynchEvent(this, SYNCH_EV_READERTRYLOCK_SUCCESS);
+      ABSL_TSAN_MUTEX_POST_LOCK(
+          this, __tsan_mutex_read_lock | __tsan_mutex_try_lock, 0);
+      return true;
    }
  }
+  PostSynchEvent(this, SYNCH_EV_READERTRYLOCK_FAILED);
   ABSL_TSAN_MUTEX_POST_LOCK(this,
                             __tsan_mutex_read_lock | __tsan_mutex_try_lock |
                                 __tsan_mutex_try_lock_failed,
diff --git a/absl/synchronization/mutex.h b/absl/synchronization/mutex.h
index d146b066..95726f6b 100644
--- a/absl/synchronization/mutex.h
+++ b/absl/synchronization/mutex.h
@@ -521,6 +521,10 @@ class ABSL_LOCKABLE Mutex {
                 int flags) ABSL_ATTRIBUTE_COLD;
   // slow path release
   void UnlockSlow(SynchWaitParams* waitp) ABSL_ATTRIBUTE_COLD;
+  // TryLock slow path.
+  bool TryLockSlow();
+  // ReaderTryLock slow path.
+  bool ReaderTryLockSlow();
   // Common code between Await() and AwaitWithTimeout/Deadline()
   bool AwaitCommon(const Condition& cond,
                    synchronization_internal::KernelTimeout t);
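The public API is unchanged; callers keep the usual non-blocking pattern. An illustrative snippet (example code, not from the Abseil sources):

#include "absl/synchronization/mutex.h"

absl::Mutex mu;
int counter = 0;

void MaybeIncrement() {
  if (mu.TryLock()) {  // Non-blocking; this is the newly inlined fast path.
    counter++;
    mu.Unlock();
  }
}

bool MaybeRead(int* out) {
  if (mu.ReaderTryLock()) {  // Shared acquire; fails if a writer or waiter is present.
    *out = counter;
    mu.ReaderUnlock();
    return true;
  }
  return false;
}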