Skip to content

Commit 50fd187

Browse files
author
Commitfest Bot
committed
[CF 5627] Fix oldest LSN calculation for WAL segments removal when slots are advancing during checkpoint
This branch was automatically generated by a robot using patches from an email thread registered at: https://commitfest.postgresql.org/patch/5627 The branch will be overwritten each time a new patch version is posted to the thread, and also periodically to check for bitrot caused by changes on the master branch. Patch(es): https://www.postgresql.org/message-id/CALDaNm1u=QV1a7w48BWgkg6GnK20eE_y+raL43D46_R9Wnt8wg@mail.gmail.com Author(s): Vitaly Davydov
2 parents 144ad72 + c1f1501 commit 50fd187

File tree

5 files changed

+43
-3
lines changed

5 files changed

+43
-3
lines changed

src/backend/access/transam/xlog.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6836,6 +6836,7 @@ ShutdownXLOG(int code, Datum arg)
68366836
static void
68376837
LogCheckpointStart(int flags, bool restartpoint)
68386838
{
6839+
#if 0
68396840
if (restartpoint)
68406841
ereport(LOG,
68416842
/* translator: the placeholders show checkpoint options */
@@ -6860,6 +6861,7 @@ LogCheckpointStart(int flags, bool restartpoint)
68606861
(flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
68616862
(flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
68626863
(flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
6864+
#endif
68636865
}
68646866

68656867
/*
@@ -6909,6 +6911,7 @@ LogCheckpointEnd(bool restartpoint)
69096911
CheckpointStats.ckpt_sync_rels;
69106912
average_msecs = (long) ((average_sync_time + 999) / 1000);
69116913

6914+
#if 0
69126915
/*
69136916
* ControlFileLock is not required to see ControlFile->checkPoint and
69146917
* ->checkPointCopy here as we are the only updator of those variables at
@@ -6962,6 +6965,7 @@ LogCheckpointEnd(bool restartpoint)
69626965
(int) (CheckPointDistanceEstimate / 1024.0),
69636966
LSN_FORMAT_ARGS(ControlFile->checkPoint),
69646967
LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
6968+
#endif
69656969
}
69666970

69676971
/*
@@ -7160,13 +7164,15 @@ CreateCheckPoint(int flags)
71607164
if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
71617165
CHECKPOINT_FORCE)) == 0)
71627166
{
7167+
#if 0
71637168
if (last_important_lsn == ControlFile->checkPoint)
71647169
{
71657170
END_CRIT_SECTION();
71667171
ereport(DEBUG1,
71677172
(errmsg_internal("checkpoint skipped because system is idle")));
71687173
return false;
71697174
}
7175+
#endif
71707176
}
71717177

71727178
/*

src/backend/access/transam/xlogrecovery.c

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1918,6 +1918,8 @@ ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *repl
19181918
{
19191919
ErrorContextCallback errcallback;
19201920
bool switchedTLI = false;
1921+
static int count = 0;
1922+
count++;
19211923

19221924
/* Setup error traceback support for ereport() */
19231925
errcallback.callback = rm_redo_error_callback;
@@ -2009,7 +2011,13 @@ ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *repl
20092011

20102012
/* Pop the error context stack */
20112013
error_context_stack = errcallback.previous;
2012-
2014+
pg_usleep(1000L);
2015+
if (count > 1000)
2016+
pg_usleep(1000L);
2017+
2018+
elog(LOG, "Applied wal record end rec %X/%X, read rec %X/%X",
2019+
LSN_FORMAT_ARGS(xlogreader->EndRecPtr),
2020+
LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));
20132021
/*
20142022
* Update lastReplayedEndRecPtr after this record has been successfully
20152023
* replayed.
@@ -3640,6 +3648,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
36403648
if (StandbyMode && CheckForStandbyTrigger())
36413649
{
36423650
XLogShutdownWalRcv();
3651+
elog(LOG, "returning fail vignesh1");
36433652
return XLREAD_FAIL;
36443653
}
36453654

@@ -3648,7 +3657,10 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
36483657
* and pg_wal.
36493658
*/
36503659
if (!StandbyMode)
3660+
{
3661+
elog(LOG, "returning fail vignesh2");
36513662
return XLREAD_FAIL;
3663+
}
36523664

36533665
/*
36543666
* Move to XLOG_FROM_STREAM state, and set to start a
@@ -3870,6 +3882,8 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
38703882
}
38713883
curFileTLI = tli;
38723884
SetInstallXLogFileSegmentActive();
3885+
elog(LOG, "vignesh request to start streaming from primary at %X/%X on timeline %u",
3886+
LSN_FORMAT_ARGS(ptr), curFileTLI);
38733887
RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
38743888
PrimarySlotName,
38753889
wal_receiver_create_temp_slot);

src/backend/postmaster/checkpointer.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ CheckpointerMain(const void *startup_data, size_t startup_data_len)
345345
*/
346346
for (;;)
347347
{
348-
bool do_checkpoint = false;
348+
bool do_checkpoint = true;
349349
int flags = 0;
350350
pg_time_t now;
351351
int elapsed_secs;
@@ -573,11 +573,14 @@ CheckpointerMain(const void *startup_data, size_t startup_data_len)
573573
continue; /* no sleep for us ... */
574574
cur_timeout = Min(cur_timeout, XLogArchiveTimeout - elapsed_secs);
575575
}
576+
pg_usleep(1000L);
576577

578+
#if 0
577579
(void) WaitLatch(MyLatch,
578580
WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
579581
cur_timeout * 1000L /* convert to ms */ ,
580582
WAIT_EVENT_CHECKPOINTER_MAIN);
583+
#endif
581584
}
582585

583586
/*

src/backend/replication/walreceiver.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,9 @@ WalReceiverMain(const void *startup_data, size_t startup_data_len)
396396

397397
/* Initialize LogstreamResult and buffers for processing messages */
398398
LogstreamResult.Write = LogstreamResult.Flush = GetXLogReplayRecPtr(NULL);
399+
elog(LOG, "WalReceiverMain LogstreamResult.Flush initialized to %X/%X",
400+
LSN_FORMAT_ARGS(LogstreamResult.Flush));
401+
399402
initStringInfo(&reply_message);
400403

401404
/* Initialize nap wakeup times. */
@@ -848,6 +851,8 @@ XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len, TimeLineID tli)
848851
buf += hdrlen;
849852
len -= hdrlen;
850853
XLogWalRcvWrite(buf, len, dataStart, tli);
854+
elog(LOG, "XLogWalRcvProcessMsg: wrote %zu bytes of WAL at %X/%X",
855+
len, LSN_FORMAT_ARGS(dataStart));
851856
break;
852857
}
853858
case 'k': /* Keepalive */
@@ -960,6 +965,8 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr, TimeLineID tli)
960965
buf += byteswritten;
961966

962967
LogstreamResult.Write = recptr;
968+
elog(LOG, "XLogWalRcvFlush LogstreamResult.Write set to %X/%X",
969+
LSN_FORMAT_ARGS(LogstreamResult.Write));
963970
}
964971

965972
/* Update shared-memory status */
@@ -994,6 +1001,9 @@ XLogWalRcvFlush(bool dying, TimeLineID tli)
9941001

9951002
LogstreamResult.Flush = LogstreamResult.Write;
9961003

1004+
elog(LOG, "XLogWalRcvFlush LogstreamResult.Flush initialized to %X/%X",
1005+
LSN_FORMAT_ARGS(LogstreamResult.Flush));
1006+
9971007
/* Update shared-memory status */
9981008
SpinLockAcquire(&walrcv->mutex);
9991009
if (walrcv->flushedUpto < LogstreamResult.Flush)
@@ -1138,7 +1148,7 @@ XLogWalRcvSendReply(bool force, bool requestReply)
11381148
pq_sendbyte(&reply_message, requestReply ? 1 : 0);
11391149

11401150
/* Send it */
1141-
elog(DEBUG2, "sending write %X/%X flush %X/%X apply %X/%X%s",
1151+
elog(LOG, "sending write %X/%X flush %X/%X apply %X/%X%s",
11421152
LSN_FORMAT_ARGS(writePtr),
11431153
LSN_FORMAT_ARGS(flushPtr),
11441154
LSN_FORMAT_ARGS(applyPtr),

src/backend/replication/walsender.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2377,7 +2377,14 @@ PhysicalConfirmReceivedLocation(XLogRecPtr lsn)
23772377
if (slot->data.restart_lsn != lsn)
23782378
{
23792379
changed = true;
2380+
2381+
if (lsn < slot->data.restart_lsn)
2382+
elog(LOG, "crash scenario - slot %s, cannot confirm a restart LSN (%X/%X) that is older than the current one (%X/%X)",
2383+
NameStr(slot->data.name), LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(slot->data.restart_lsn));
2384+
23802385
slot->data.restart_lsn = lsn;
2386+
elog(LOG, "PhysicalConfirmReceivedLocation replication slot \"%s\" set restart_lsn to %X/%X",
2387+
NameStr(slot->data.name), LSN_FORMAT_ARGS(slot->data.restart_lsn));
23812388
}
23822389
SpinLockRelease(&slot->mutex);
23832390

0 commit comments

Comments
 (0)