Commit 2f1c97c

Author: Commitfest Bot (committed)
[CF 5784] v1 - Optimize shared LWLock acquisition for high-core-count systems
This branch was automatically generated by a robot using patches from an email thread registered at: https://commitfest.postgresql.org/patch/5784

The branch will be overwritten each time a new patch version is posted to the thread, and also periodically to check for bitrot caused by changes on the master branch.

Patch(es): https://www.postgresql.org/message-id/[email protected]
Author(s): Zhiguo Zhou
2 parents 5c8eda1 + 026b65c commit 2f1c97c

File tree

1 file changed (+88, -16 lines)


src/backend/storage/lmgr/lwlock.c

Lines changed: 88 additions & 16 deletions
@@ -97,20 +97,41 @@
 #define LW_FLAG_BITS 3
 #define LW_FLAG_MASK (((1<<LW_FLAG_BITS)-1)<<(32-LW_FLAG_BITS))
 
-/* assumes MAX_BACKENDS is a (power of 2) - 1, checked below */
-#define LW_VAL_EXCLUSIVE (MAX_BACKENDS + 1)
+/*
+ * already (power of 2)-1, i.e. suitable for a mask
+ *
+ * Originally, the LW_SHARED lock reference count was maintained in bits
+ * [MAX_BACKEND_BITS-1:0] of LWLock.state, with a theoretical maximum of
+ * MAX_BACKENDS (when all MAX_BACKENDS processes hold the lock concurrently).
+ *
+ * To reduce lock acquisition overhead, we optimized LWLockAttemptLock by
+ * merging the read and update operations for the LW_SHARED lock's state.
+ * This eliminates the need for separate atomic instructions - a critical
+ * improvement given the high cost of atomic operations on high-core-count
+ * systems.
+ *
+ * This optimization introduces a scenario where the reference count may
+ * temporarily increment even when a reader fails to acquire a lock that is
+ * held exclusively.  However, since each process retries at most twice before
+ * waiting on a semaphore, the reference count is bounded by MAX_BACKENDS * 2.
+ *
+ * To ensure compatibility with this upper bound:
+ * 1. LW_SHARED_MASK has been extended by 1 bit
+ * 2. LW_VAL_EXCLUSIVE is left-shifted by 1 bit
+ */
+#define LW_SHARED_MASK ((MAX_BACKENDS << 1) + 1)
+#define LW_VAL_EXCLUSIVE (LW_SHARED_MASK + 1)
+#define LW_LOCK_MASK (LW_SHARED_MASK | LW_VAL_EXCLUSIVE)
 #define LW_VAL_SHARED 1
 
-/* already (power of 2)-1, i.e. suitable for a mask */
-#define LW_SHARED_MASK MAX_BACKENDS
-#define LW_LOCK_MASK (MAX_BACKENDS | LW_VAL_EXCLUSIVE)
+/* assumes MAX_BACKENDS is a (power of 2) - 1, checked below */
 
 
 StaticAssertDecl(((MAX_BACKENDS + 1) & MAX_BACKENDS) == 0,
 				 "MAX_BACKENDS + 1 needs to be a power of 2");
 
-StaticAssertDecl((MAX_BACKENDS & LW_FLAG_MASK) == 0,
-				 "MAX_BACKENDS and LW_FLAG_MASK overlap");
+StaticAssertDecl((LW_SHARED_MASK & LW_FLAG_MASK) == 0,
+				 "LW_SHARED_MASK and LW_FLAG_MASK overlap");
 
 StaticAssertDecl((LW_VAL_EXCLUSIVE & LW_FLAG_MASK) == 0,
 				 "LW_VAL_EXCLUSIVE and LW_FLAG_MASK overlap");
@@ -237,15 +258,17 @@ PRINT_LWDEBUG(const char *where, LWLock *lock, LWLockMode mode)
 	if (Trace_lwlocks)
 	{
 		uint32		state = pg_atomic_read_u32(&lock->state);
+		uint32		excl = (state & LW_VAL_EXCLUSIVE) != 0;
+		uint32		shared = excl ? 0 : state & LW_SHARED_MASK;
 
 		ereport(LOG,
 				(errhidestmt(true),
 				 errhidecontext(true),
 				 errmsg_internal("%d: %s(%s %p): excl %u shared %u haswaiters %u waiters %u rOK %d",
 								 MyProcPid,
 								 where, T_NAME(lock), lock,
-								 (state & LW_VAL_EXCLUSIVE) != 0,
-								 state & LW_SHARED_MASK,
+								 excl,
+								 shared,
 								 (state & LW_FLAG_HAS_WAITERS) != 0,
 								 pg_atomic_read_u32(&lock->nwaiters),
 								 (state & LW_FLAG_RELEASE_OK) != 0)));
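Because a blocked reader can bump the shared count while an exclusive holder owns the lock, the raw LW_SHARED_MASK bits are not meaningful whenever LW_VAL_EXCLUSIVE is set, which is why the trace message above now reports shared as 0 in that case. A minimal standalone decoder mirroring that rule (illustrative only; the mask values assume MAX_BACKENDS = 0x3FFFF and the LW_FLAG_HAS_WAITERS position is a guess):

/* Standalone sketch: decode a raw LWLock state word the way the updated
 * PRINT_LWDEBUG does; not PostgreSQL code. */
#include <stdio.h>

#define LW_SHARED_MASK		0x0007FFFFU
#define LW_VAL_EXCLUSIVE	0x00080000U
#define LW_FLAG_HAS_WAITERS	0x40000000U	/* assumed flag position */

static void
decode_state(unsigned state)
{
	unsigned	excl = (state & LW_VAL_EXCLUSIVE) != 0;
	/* With an exclusive holder, the shared bits may hold "overcounted"
	 * increments from blocked readers, so report 0 instead. */
	unsigned	shared = excl ? 0 : (state & LW_SHARED_MASK);

	printf("state 0x%08X: excl %u shared %u haswaiters %u\n",
		   state, excl, shared, (state & LW_FLAG_HAS_WAITERS) ? 1U : 0U);
}

int
main(void)
{
	decode_state(3);						/* three shared holders */
	decode_state(LW_VAL_EXCLUSIVE | 2);		/* exclusive + 2 stray reader increments */
	decode_state(LW_FLAG_HAS_WAITERS | LW_VAL_EXCLUSIVE);
	return 0;
}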
@@ -750,14 +773,53 @@ GetLWLockIdentifier(uint32 classId, uint16 eventId)
  * This function will not block waiting for a lock to become free - that's the
  * caller's job.
  *
+ * willwait: true if the caller is willing to wait for the lock to become free,
+ *			 false if the caller is not willing to wait.
+ *
  * Returns true if the lock isn't free and we need to wait.
  */
 static bool
-LWLockAttemptLock(LWLock *lock, LWLockMode mode)
+LWLockAttemptLock(LWLock *lock, LWLockMode mode, bool willwait)
 {
 	uint32		old_state;
 
 	Assert(mode == LW_EXCLUSIVE || mode == LW_SHARED);
+	/*
+	 * Optimized shared lock acquisition using atomic fetch-and-add.
+	 *
+	 * This optimization aims to lower the cost of acquiring shared locks
+	 * by reducing the number of atomic operations, which can be expensive
+	 * on systems with many CPU cores.
+	 *
+	 * It is only activated when willwait=true, ensuring that the reference
+	 * count does not grow unchecked and overflow into the LW_VAL_EXCLUSIVE bit.
+	 *
+	 * Three scenarios can occur when acquiring a shared lock:
+	 * 1) Lock is free: atomically increment reference count and acquire
+	 * 2) Lock held in shared mode: atomically increment reference count and acquire
+	 * 3) Lock held exclusively: atomically increment reference count but fail to acquire
+	 *
+	 * Scenarios 1 and 2 work as expected - we successfully increment the count
+	 * and acquire the lock.
+	 *
+	 * Scenario 3 is counterintuitive: we increment the reference count even though
+	 * we cannot acquire the lock due to the exclusive holder.  This creates a
+	 * temporarily invalid reference count, but it's acceptable because:
+	 * - The LW_VAL_EXCLUSIVE flag takes precedence in determining lock state
+	 * - Each process retries at most twice before blocking on a semaphore
+	 * - This bounds the "overcounted" references to MAX_BACKENDS * 2
+	 * - The bound fits within LW_SHARED_MASK capacity
+	 * - The lock->state, including "overcounted" references, is reset when the
+	 *   exclusive lock is released.
+	 *
+	 * See the LW_SHARED_MASK definition comments for additional details.
+	 */
+	if (willwait && mode == LW_SHARED)
+	{
+		old_state = pg_atomic_fetch_add_u32(&lock->state, LW_VAL_SHARED);
+		Assert((old_state & LW_LOCK_MASK) != LW_LOCK_MASK);
+		return (old_state & LW_VAL_EXCLUSIVE) != 0;
+	}
 
 	/*
 	 * Read once outside the loop, later iterations will get the newer value
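The fast path above relies on pg_atomic_fetch_add_u32 returning the pre-add value, so one atomic read-modify-write both publishes the reader's reference and tells it whether an exclusive holder was already present. The following standalone model of just that branch uses C11 atomics in place of PostgreSQL's pg_atomic API; the type and helper names are invented for illustration:

/* Standalone model of the willwait shared fast path using C11 atomics.
 * lw_try_shared() mirrors the fetch-add branch added to LWLockAttemptLock:
 * it always bumps the shared count and reports "must wait" if an exclusive
 * holder was already present. Names and mask values are illustrative. */
#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define LW_VAL_SHARED		1U
#define LW_SHARED_MASK		0x0007FFFFU
#define LW_VAL_EXCLUSIVE	0x00080000U
#define LW_LOCK_MASK		(LW_SHARED_MASK | LW_VAL_EXCLUSIVE)

typedef struct
{
	_Atomic unsigned state;
} ModelLWLock;

/* Returns true if the caller must wait (exclusive holder present). */
static bool
lw_try_shared(ModelLWLock *lock)
{
	unsigned	old_state;

	/* One atomic RMW: increment the reference count and read the old state. */
	old_state = atomic_fetch_add(&lock->state, LW_VAL_SHARED);
	assert((old_state & LW_LOCK_MASK) != LW_LOCK_MASK);
	return (old_state & LW_VAL_EXCLUSIVE) != 0;
}

int
main(void)
{
	ModelLWLock lock = {0};

	/* Free lock: the single fetch-add both acquires and records the reference. */
	assert(!lw_try_shared(&lock));
	atomic_fetch_sub(&lock.state, LW_VAL_SHARED);	/* reader releases */

	/* Simulate an exclusive holder: readers still increment, but are told
	 * to wait; their increments are temporary "overcounts". */
	atomic_fetch_or(&lock.state, LW_VAL_EXCLUSIVE);
	assert(lw_try_shared(&lock));
	assert(lw_try_shared(&lock));

	printf("state = 0x%08X (exclusive bit + 2 stray reader increments)\n",
		   (unsigned) atomic_load(&lock.state));
	return 0;
}

Every further failed attempt adds one more increment, which is exactly the overcount the widened LW_SHARED_MASK has to absorb until the exclusive holder releases.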
@@ -1202,7 +1264,7 @@ LWLockAcquire(LWLock *lock, LWLockMode mode)
 		 * Try to grab the lock the first time, we're not in the waitqueue
 		 * yet/anymore.
 		 */
-		mustwait = LWLockAttemptLock(lock, mode);
+		mustwait = LWLockAttemptLock(lock, mode, true);
 
 		if (!mustwait)
 		{
@@ -1225,7 +1287,7 @@ LWLockAcquire(LWLock *lock, LWLockMode mode)
 		LWLockQueueSelf(lock, mode);
 
 		/* we're now guaranteed to be woken up if necessary */
-		mustwait = LWLockAttemptLock(lock, mode);
+		mustwait = LWLockAttemptLock(lock, mode, true);
 
 		/* ok, grabbed the lock the second time round, need to undo queueing */
 		if (!mustwait)
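The two call sites in LWLockAcquire (one before LWLockQueueSelf and this one after it) are what cap the damage: a reader that fails both attempts sleeps on its semaphore and issues no further fetch-adds until woken, so at any moment each backend contributes at most two stray increments. A quick standalone arithmetic check of that bound, assuming the MAX_BACKENDS value of 0x3FFFF used above:

/* Standalone arithmetic check of the worst-case overcount; not PostgreSQL
 * code. The MAX_BACKENDS value is assumed for illustration. */
#include <assert.h>
#include <stdio.h>

#define MAX_BACKENDS		0x3FFFFUL	/* assumed: (2^18) - 1 */
#define LW_SHARED_MASK		((MAX_BACKENDS << 1) + 1)
#define LW_VAL_EXCLUSIVE	(LW_SHARED_MASK + 1)
#define ATTEMPTS_PER_WAITER	2			/* one before queueing, one after */

int
main(void)
{
	unsigned long worst = MAX_BACKENDS * ATTEMPTS_PER_WAITER;

	/* Even if every backend makes both failed attempts against the same
	 * exclusively held lock, the count stays inside the shared field and
	 * never spills into the LW_VAL_EXCLUSIVE bit. */
	assert(worst <= LW_SHARED_MASK);
	assert(worst < LW_VAL_EXCLUSIVE);

	printf("worst-case shared count: %lu of %lu\n",
		   worst, (unsigned long) LW_SHARED_MASK);
	return 0;
}

This bound only holds for callers that eventually sleep, which is presumably why LWLockConditionalAcquire (further below) passes willwait=false and keeps using the conventional path.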
@@ -1256,6 +1318,7 @@ LWLockAcquire(LWLock *lock, LWLockMode mode)
 
 		for (;;)
 		{
+			/* When signaled, the previous holder has already cleared the lock bits (LW_LOCK_MASK) of lock->state */
 			PGSemaphoreLock(proc->sem);
 			if (proc->lwWaiting == LW_WS_NOT_WAITING)
 				break;
@@ -1328,7 +1391,7 @@ LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
 	HOLD_INTERRUPTS();
 
 	/* Check for the lock */
-	mustwait = LWLockAttemptLock(lock, mode);
+	mustwait = LWLockAttemptLock(lock, mode, false);
 
 	if (mustwait)
 	{
@@ -1395,13 +1458,13 @@ LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
 	 * NB: We're using nearly the same twice-in-a-row lock acquisition
 	 * protocol as LWLockAcquire(). Check its comments for details.
 	 */
-	mustwait = LWLockAttemptLock(lock, mode);
+	mustwait = LWLockAttemptLock(lock, mode, true);
 
 	if (mustwait)
 	{
 		LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE);
 
-		mustwait = LWLockAttemptLock(lock, mode);
+		mustwait = LWLockAttemptLock(lock, mode, true);
 
 		if (mustwait)
 		{
@@ -1421,6 +1484,7 @@ LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
 
 		for (;;)
 		{
+			/* When signaled, the previous holder has already cleared the lock bits (LW_LOCK_MASK) of lock->state */
 			PGSemaphoreLock(proc->sem);
 			if (proc->lwWaiting == LW_WS_NOT_WAITING)
 				break;
@@ -1803,7 +1867,15 @@ LWLockReleaseInternal(LWLock *lock, LWLockMode mode)
 	 * others, even if we still have to wakeup other waiters.
 	 */
 	if (mode == LW_EXCLUSIVE)
-		oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_EXCLUSIVE);
+	{
+		/*
+		 * To release the exclusive lock, all bits of LW_LOCK_MASK,
+		 * including any "overcounted" increments from blocked readers,
+		 * are cleared.
+		 */
+		oldstate = pg_atomic_fetch_and_u32(&lock->state, ~LW_LOCK_MASK);
+		oldstate &= ~LW_LOCK_MASK;
+	}
 	else
 		oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_SHARED);
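Releasing with an atomic AND rather than a subtraction is what makes the scheme self-correcting: clearing every LW_LOCK_MASK bit drops the exclusive bit and any stray reader increments in one step while leaving the flag bits untouched, whereas the old pg_atomic_sub_fetch_u32 of LW_VAL_EXCLUSIVE would have left the overcounts behind. A standalone model of that release step, again with C11 atomics standing in for pg_atomic_fetch_and_u32 (mask and flag values assumed):

/* Standalone model of the new exclusive-release step: clear all LW_LOCK_MASK
 * bits at once, wiping any "overcounted" reader increments, but preserve the
 * flag bits above them. Not PostgreSQL code. */
#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

#define LW_SHARED_MASK		0x0007FFFFU
#define LW_VAL_EXCLUSIVE	0x00080000U
#define LW_LOCK_MASK		(LW_SHARED_MASK | LW_VAL_EXCLUSIVE)
#define LW_FLAG_HAS_WAITERS	0x40000000U	/* assumed flag position */

int
main(void)
{
	/* Exclusive holder, three stray reader increments, waiters flagged. */
	_Atomic unsigned state = LW_FLAG_HAS_WAITERS | LW_VAL_EXCLUSIVE | 3;
	unsigned	oldstate;

	/* One atomic AND releases the lock and discards the overcounts. */
	oldstate = atomic_fetch_and(&state, ~LW_LOCK_MASK);
	oldstate &= ~LW_LOCK_MASK;	/* mirror the post-masking in the patch */

	assert(atomic_load(&state) == LW_FLAG_HAS_WAITERS);
	assert(oldstate == LW_FLAG_HAS_WAITERS);

	printf("state after release = 0x%08X (flags preserved, lock bits cleared)\n",
		   (unsigned) atomic_load(&state));
	return 0;
}

The extra masking of oldstate mirrors the patch: the pre-AND value still contains the stray increments, and the wakeup logic that follows should see the lock bits as already cleared.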
