QLatch: optimize to avoid a syscall when no one is waiting [HEAD, dev]

We do it by using the sign bit of the atomic to indicate that there are no waiters waiting. That way, the counter only becomes a zero when all expected counters have counted down *and* there is at least one waiter. That means the countDown() code remains unchanged. On x86-64 and AArch64, there is no change in the number of instructions in the inline portion of wait() either. The non-inline portion uses a __atomic_and_fetch() instead of atomic_fetch_and() so compilers will generate LOCK AND for x86-64, LDCLR for AArch64 ARMv8.1, and AMOAND.W for RISC-V. This is more efficient than the Standard Libraries' current implementations, which use a separate, global atomic out of a pool, meaning that there could be a collision between two or more latches (or any other users of std::atomic waiting, such as std::semaphore and std::barrier). Coupled with the fact that we futexWait() on the actual latch's address (something libc++ can't / won't do), this implementation should be overall much more efficient. Change-Id: Ib5ce7a497e034ebabb2cfffd1761b02a44d548d3 Reviewed-by: Mårten Nordheim <[email protected]>
author: Thiago Macieira <[email protected]> 2023-05-22 22:57:11 -0700
committer: Mårten Nordheim <[email protected]> 2025-06-27 14:50:31 +0000
commit: b8c09d2e3b1eabf3f908ca9e0633a4430a976f89 (patch)
tree: c6ce987d72f95cab2c5f645dc0c376754e2ef1a3
parent: 4409a7c21399e3c602edeea62c409a18ef78148f (diff)
2 files changed, 37 insertions, 3 deletions
diff --git a/src/corelib/thread/qlatch.cpp b/src/corelib/thread/qlatch.cpp
index f91dfb1400a..0b4863267c0 100644
--- a/src/corelib/thread/qlatch.cpp
+++ b/src/corelib/thread/qlatch.cpp
@@ -59,6 +59,7 @@ namespace atomicwait = q20;
     \endcode
 
     In fact, the above is exactly what Qt::BlockingQueued connection does.
+
     \section3 Synchronizing execution
 
     For this use-case, multiple threads must reach a particular state before
@@ -84,6 +85,16 @@ namespace atomicwait = q20;
       \li count_down() is not \c{const} (libstdc++ implementation is).
     \endlist
 
+    \omit
+    \section2 Implementation details
+
+    countDown() must call wakeUp() if the latch counter reaches zero and there
+    are threads waiting to be woken up. Or, conversely, countDown() needs to do
+    nothing after decrementing if the latch counter is still non-zero or there
+    are no waiters. Therefore, we choose the bits so that a non-zero
+    \c{counter} member implies no action required.
+
+    \endomit
 */
 
 /*!
@@ -172,6 +183,26 @@ namespace atomicwait = q20;
 
 void QLatch::waitInternal(int current) noexcept
 {
+    // mark that there is a waiter -> clear the bit that there are no waiters
+    if (current & NoWaiters) {
+#if __has_builtin(__atomic_and_fetch)
+        // Modern GCC and Clang are able to generate loop-free code for this
+        // operation on x86-64, ARMv8.1 and RISC-V.
+        if (__atomic_and_fetch(reinterpret_cast<int *>(&counter._q_value), ~NoWaiters,
+                               int(std::memory_order_relaxed)) == 0)
+            return;
+#else
+        // Do it in two steps, which is usually better than a compare_exchange
+        // loop. This is not exactly the same as above (it's not atomic!) but
+        // is correct for our purposes because the counter never changes from 0
+        // once it reaches that.
+        counter.fetchAndAndRelaxed(~NoWaiters);
+        if (counter.loadRelaxed() == 0)
+            return;     // no need to wait!
+#endif
+    }
+    current &= ~NoWaiters;
+
     auto waitLoop = [&](auto waiter) {
         do {
             waiter(current);
diff --git a/src/corelib/thread/qlatch_p.h b/src/corelib/thread/qlatch_p.h
index b407e3b6c7d..95890f7519d 100644
--- a/src/corelib/thread/qlatch_p.h
+++ b/src/corelib/thread/qlatch_p.h
@@ -28,12 +28,12 @@ class QLatch
 {
 public:
     constexpr explicit QLatch(int expected) noexcept
-        : counter(expected)
+        : counter(expected | NoWaiters)
     {}
 
     int pending() const noexcept
     {
-        return counter.loadAcquire();
+        return (counter.loadAcquire() & CounterMask);
     }
 
     void countDown(int n = 1) noexcept
@@ -53,7 +53,7 @@ public:
 
     void wait() noexcept // not const
     {
-        if (int current = counter.loadAcquire(); current != 0) {
+        if (int current = counter.loadAcquire(); (current & CounterMask) != 0) {
             waitInternal(current);
             QtTsan::latchWait(&counter);
         }
@@ -72,6 +72,9 @@ public:
     void arrive_and_wait(int n = 1) noexcept { arriveAndWait(n); }
 
 private:
+    static constexpr int NoWaitersBit = 31;
+    static constexpr int NoWaiters = 1 << NoWaitersBit;
+    static constexpr int CounterMask = ~NoWaiters;
     QBasicAtomicInt counter;
 
     Q_DISABLE_COPY_MOVE(QLatch)
author	Thiago Macieira <[email protected]>	2023-05-22 22:57:11 -0700
committer	Mårten Nordheim <[email protected]>	2025-06-27 14:50:31 +0000
commit	b8c09d2e3b1eabf3f908ca9e0633a4430a976f89 (patch)
tree	c6ce987d72f95cab2c5f645dc0c376754e2ef1a3
parent	4409a7c21399e3c602edeea62c409a18ef78148f (diff)