Skip to content

Commit 2ab2d6f

Browse files
author
Amit Kapila
committed
Fix a deadlock during ALTER SUBSCRIPTION ... DROP PUBLICATION.
A deadlock can occur when the DDL command and the apply worker acquire catalog locks in different orders while dropping replication origins. The issue is rare in PG16 and higher branches because, in most cases, the tablesync worker performs the origin drop in those branches, and its locking sequence does not conflict with DDL operations. This patch ensures consistent lock acquisition to prevent such deadlocks. As per buildfarm. Reported-by: Alexander Lakhin <[email protected]> Author: Ajin Cherian <[email protected]> Reviewed-by: Hayato Kuroda <[email protected]> Reviewed-by: vignesh C <[email protected]> Reviewed-by: Amit Kapila <[email protected]> Backpatch-through: 14, where it was introduced Discussion: https://postgr.es/m/[email protected]
1 parent ca09ef3 commit 2ab2d6f

File tree

3 files changed

+49
-8
lines changed

3 files changed

+49
-8
lines changed

src/backend/catalog/pg_subscription.c

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -320,17 +320,32 @@ AddSubscriptionRelState(Oid subid, Oid relid, char state,
320320
*/
321321
void
322322
UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
323-
XLogRecPtr sublsn)
323+
XLogRecPtr sublsn, bool already_locked)
324324
{
325325
Relation rel;
326326
HeapTuple tup;
327327
bool nulls[Natts_pg_subscription_rel];
328328
Datum values[Natts_pg_subscription_rel];
329329
bool replaces[Natts_pg_subscription_rel];
330330

331-
LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock);
331+
if (already_locked)
332+
{
333+
#ifdef USE_ASSERT_CHECKING
334+
LOCKTAG tag;
332335

333-
rel = table_open(SubscriptionRelRelationId, RowExclusiveLock);
336+
Assert(CheckRelationOidLockedByMe(SubscriptionRelRelationId,
337+
RowExclusiveLock, true));
338+
SET_LOCKTAG_OBJECT(tag, InvalidOid, SubscriptionRelationId, subid, 0);
339+
Assert(LockHeldByMe(&tag, AccessShareLock, true));
340+
#endif
341+
342+
rel = table_open(SubscriptionRelRelationId, NoLock);
343+
}
344+
else
345+
{
346+
LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock);
347+
rel = table_open(SubscriptionRelRelationId, RowExclusiveLock);
348+
}
334349

335350
/* Try finding existing mapping. */
336351
tup = SearchSysCacheCopy2(SUBSCRIPTIONRELMAP,

src/backend/replication/logical/tablesync.c

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,8 @@ process_syncing_tables_for_sync(XLogRecPtr current_lsn)
316316
UpdateSubscriptionRelState(MyLogicalRepWorker->subid,
317317
MyLogicalRepWorker->relid,
318318
MyLogicalRepWorker->relstate,
319-
MyLogicalRepWorker->relstate_lsn);
319+
MyLogicalRepWorker->relstate_lsn,
320+
false);
320321

321322
/*
322323
* End streaming so that LogRepWorkerWalRcvConn can be used to drop
@@ -425,6 +426,7 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
425426
ListCell *lc;
426427
bool started_tx = false;
427428
bool should_exit = false;
429+
Relation rel = NULL;
428430

429431
Assert(!IsTransactionState());
430432

@@ -492,7 +494,17 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
492494
* worker to remove the origin tracking as if there is any
493495
* error while dropping we won't restart it to drop the
494496
* origin. So passing missing_ok = true.
497+
*
498+
* Lock the subscription and origin in the same order as we
499+
* are doing during DDL commands to avoid deadlocks. See
500+
* AlterSubscription_refresh.
495501
*/
502+
LockSharedObject(SubscriptionRelationId, MyLogicalRepWorker->subid,
503+
0, AccessShareLock);
504+
505+
if (!rel)
506+
rel = table_open(SubscriptionRelRelationId, RowExclusiveLock);
507+
496508
ReplicationOriginNameForLogicalRep(MyLogicalRepWorker->subid,
497509
rstate->relid,
498510
originname,
@@ -504,7 +516,7 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
504516
*/
505517
UpdateSubscriptionRelState(MyLogicalRepWorker->subid,
506518
rstate->relid, rstate->state,
507-
rstate->lsn);
519+
rstate->lsn, true);
508520
}
509521
}
510522
else
@@ -555,7 +567,14 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
555567
* This is required to avoid any undetected deadlocks
556568
* due to any existing lock as deadlock detector won't
557569
* be able to detect the waits on the latch.
570+
*
571+
* Also close any tables prior to the commit.
558572
*/
573+
if (rel)
574+
{
575+
table_close(rel, NoLock);
576+
rel = NULL;
577+
}
559578
CommitTransactionCommand();
560579
pgstat_report_stat(false);
561580
}
@@ -623,6 +642,11 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
623642
}
624643
}
625644

645+
/* Close table if opened */
646+
if (rel)
647+
table_close(rel, NoLock);
648+
649+
626650
if (started_tx)
627651
{
628652
/*
@@ -1414,7 +1438,8 @@ LogicalRepSyncTableStart(XLogRecPtr *origin_startpos)
14141438
UpdateSubscriptionRelState(MyLogicalRepWorker->subid,
14151439
MyLogicalRepWorker->relid,
14161440
MyLogicalRepWorker->relstate,
1417-
MyLogicalRepWorker->relstate_lsn);
1441+
MyLogicalRepWorker->relstate_lsn,
1442+
false);
14181443
CommitTransactionCommand();
14191444
pgstat_report_stat(true);
14201445

@@ -1547,7 +1572,8 @@ LogicalRepSyncTableStart(XLogRecPtr *origin_startpos)
15471572
UpdateSubscriptionRelState(MyLogicalRepWorker->subid,
15481573
MyLogicalRepWorker->relid,
15491574
SUBREL_STATE_FINISHEDCOPY,
1550-
MyLogicalRepWorker->relstate_lsn);
1575+
MyLogicalRepWorker->relstate_lsn,
1576+
false);
15511577

15521578
CommitTransactionCommand();
15531579

src/include/catalog/pg_subscription_rel.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ typedef struct SubscriptionRelState
8585
extern void AddSubscriptionRelState(Oid subid, Oid relid, char state,
8686
XLogRecPtr sublsn, bool retain_lock);
8787
extern void UpdateSubscriptionRelState(Oid subid, Oid relid, char state,
88-
XLogRecPtr sublsn);
88+
XLogRecPtr sublsn, bool already_locked);
8989
extern char GetSubscriptionRelState(Oid subid, Oid relid, XLogRecPtr *sublsn);
9090
extern void RemoveSubscriptionRel(Oid subid, Oid relid);
9191

0 commit comments

Comments
 (0)