Optimize shared LWLock acquisition for high-core-count systems
This patch reduces lock acquisition overhead for LW_SHARED LWLocks by
merging the read and update of the lock state into a single atomic
operation. Eliminating the separate atomic instructions is critical for
performance on high-core-count systems.
Key changes:
- Extended LW_SHARED_MASK by 1 bit and shifted LW_VAL_EXCLUSIVE up by 1 bit
  so the shared reference count can accommodate its upper bound of
  MAX_BACKENDS * 2 (see the sketch after this list).
- Added a `willwait` parameter to `LWLockAttemptLock` to disable the
optimization when the caller is unwilling to wait, avoiding conflicts
between the reference count and the LW_VAL_EXCLUSIVE flag.
- Updated `LWLockReleaseInternal` to use `pg_atomic_fetch_and_u32` for
clearing lock state flags atomically.
- Adjusted related functions (`LWLockAcquire`, `LWLockConditionalAcquire`,
`LWLockAcquireOrWait`) to pass the `willwait` parameter appropriately.
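Taken together, a minimal sketch of the widened bit layout and the merged
fast path, assuming master's definitions of MAX_BACKENDS and
LW_VAL_SHARED == 1; the helper name LWLockAttemptLockCAS and the
return-value convention (true = acquired) are illustrative, not the
actual patch:

    /* Widened layout: one extra refcount bit, exclusive flag moves up. */
    #define LW_SHARED_MASK     ((MAX_BACKENDS << 1) | 1)
    #define LW_VAL_EXCLUSIVE   (LW_SHARED_MASK + 1)

    static bool
    LWLockAttemptLock(LWLock *lock, LWLockMode mode, bool willwait)
    {
        if (mode == LW_SHARED && willwait)
        {
            /*
             * Merged read+update: a single atomic fetch-add replaces the
             * old read-then-compare-and-swap sequence.  The increment
             * happens even if an exclusive holder is present (scenario 3
             * below).
             */
            uint32  old_state = pg_atomic_fetch_add_u32(&lock->state,
                                                        LW_VAL_SHARED);

            return (old_state & LW_VAL_EXCLUSIVE) == 0;
        }

        /*
         * Caller can't wait, or wants the lock exclusively: fall back to
         * the existing compare-and-swap loop, which never leaves a stale
         * increment behind (hypothetical helper).
         */
        return LWLockAttemptLockCAS(lock, mode);
    }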
Key optimization idea:
The merged read-and-update fast path is only activated when willwait=true,
which ensures that the reference count cannot grow unchecked and overflow
into the LW_VAL_EXCLUSIVE bit.
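For example, the call sites might gate it along these lines (sketch only;
the real call sites in lwlock.c carry more context):

    /* LWLockAcquire: prepared to wait, so the merged fast path is safe --
     * any overcount is short-lived and bounded. */
    mustwait = !LWLockAttemptLock(lock, mode, true);

    /* LWLockConditionalAcquire: will not wait or retry, so a stale
     * increment could linger; take the CAS path instead. */
    mustwait = !LWLockAttemptLock(lock, mode, false);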
Three scenarios can occur when acquiring a shared lock:
1) Lock is free: atomically increment reference count and acquire
2) Lock held in shared mode: atomically increment reference count and acquire
3) Lock held exclusively: atomically increment reference count but fail to acquire
Scenarios 1 and 2 work as expected: we successfully increment the count
and acquire the lock.
Scenario 3 is counterintuitive: we increment the reference count even though
we cannot acquire the lock due to the exclusive holder. This creates a
temporarily invalid reference count, but it's acceptable because:
- The LW_VAL_EXCLUSIVE flag takes precedence in determining lock state
- Each process retries at most twice before blocking on a semaphore
- This bounds the "overcounted" references to MAX_BACKENDS * 2
- The bound fits within LW_SHARED_MASK capacity
- The lock->state, including any "overcounted" references, is reset when
  the exclusive lock is released.
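A sketch of that release path, again assuming the widened constants
above (with MAX_BACKENDS = 2^18 - 1 on current master, MAX_BACKENDS * 2
fits comfortably in the 19-bit widened mask):

    static void
    LWLockReleaseInternal(LWLock *lock, LWLockMode mode)
    {
        uint32  oldstate;

        if (mode == LW_EXCLUSIVE)
        {
            /*
             * One atomic AND clears the exclusive flag together with any
             * "overcounted" shared references left behind by failed
             * shared attempts; no genuine shared holders can exist while
             * the lock is held exclusively.
             */
            oldstate = pg_atomic_fetch_and_u32(&lock->state,
                                               ~(LW_VAL_EXCLUSIVE | LW_SHARED_MASK));
        }
        else
        {
            /* Shared release: drop our own reference. */
            oldstate = pg_atomic_fetch_sub_u32(&lock->state, LW_VAL_SHARED);
        }

        /* ... waiter wakeup based on oldstate, as in the existing code ... */
    }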
These changes improve scalability and reduce contention in workloads with
frequent LWLock operations on servers with many cores.