Skip to content

Commit 47da0cf

Browse files
macdiceCommitfest Bot
authored andcommitted
InstallXLogFileSegment() vs concurrent WAL flush
If the checkpointer is recycling a WAL segment file or another backend is installing a new file, we mustn't allow other processes to write data into the file before its name is durable. Otherwise the data could become unreachable in recovery after a power loss.
1 parent b498af4 commit 47da0cf

File tree

2 files changed

+45
-0
lines changed

2 files changed

+45
-0
lines changed

src/backend/access/transam/xlog.c

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,11 @@ typedef struct XLogCtlData
463463

464464
/* Fake LSN counter, for unlogged relations. */
465465
pg_atomic_uint64 unloggedLSN;
466+
/*
467+
* Approximation of the last WAL segment number that is known to have been
468+
* installed by InstallXLogFileSegment().
469+
*/
470+
pg_atomic_uint64 last_known_installed_segno;
466471

467472
/* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
468473
pg_time_t lastSegSwitchTime;
@@ -3237,7 +3242,28 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
32373242
errmsg("could not open file \"%s\": %m", path)));
32383243
}
32393244
else
3245+
{
3246+
/*
3247+
* The file is there, but it is possible that InstallXLogFileSegment()
3248+
* has recently renamed it and not yet made the new name durable. We
3249+
* don't want to be able to flush data into a file whose name might
3250+
* not survive power loss, since it would become unreachable in
3251+
* recovery. Since InstallXlogFileSegment() holds ControlFileLock,
3252+
* acquiring it here is enough to wait for any durable_rename() call
3253+
* that might have started before we opened the file.
3254+
*
3255+
* We can skip that if we can already see that the WAL space we need
3256+
* is fully synchronized. We may see a slightly out of date value
3257+
* since we haven't acquired the lock yet, but that's OK, it just
3258+
* means we might take the lock when we don't need to.
3259+
*/
3260+
if (pg_atomic_read_u64(&XLogCtl->last_known_installed_segno) < logsegno)
3261+
{
3262+
LWLockAcquire(ControlFileLock, LW_SHARED);
3263+
LWLockRelease(ControlFileLock);
3264+
}
32403265
return fd;
3266+
}
32413267

32423268
/*
32433269
* Initialize an empty (all zeroes) segment. NOTE: it is possible that
@@ -3589,6 +3615,11 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
35893615

35903616
XLogFilePath(path, tli, *segno, wal_segment_size);
35913617

3618+
/*
3619+
* Acquire and keep the ControlFileLock held *until* we have renamed the
3620+
* target segment durably. See XLogFileInitInternal() for details as to why
3621+
* it is dangerous otherwise.
3622+
*/
35923623
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
35933624
if (!XLogCtl->InstallXLogFileSegmentActive)
35943625
{
@@ -3625,6 +3656,8 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
36253656
return false;
36263657
}
36273658

3659+
pg_atomic_write_u64(&XLogCtl->last_known_installed_segno, *segno);
3660+
36283661
LWLockRelease(ControlFileLock);
36293662

36303663
return true;
@@ -4967,6 +5000,7 @@ XLOGShmemInit(void)
49675000
char *allocptr;
49685001
int i;
49695002
ControlFileData *localControlFile;
5003+
XLogSegNo lastKnownInstalledSegno = 0;
49705004

49715005
#ifdef WAL_DEBUG
49725006

@@ -5014,6 +5048,12 @@ XLOGShmemInit(void)
50145048
{
50155049
memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
50165050
pfree(localControlFile);
5051+
/*
5052+
* A decent approximation for the last known installed WAL segment
5053+
* number can be the segment in which the checkpoint record resides,
5054+
* specially in cases where we have had a clean shutdown.
5055+
*/
5056+
XLByteToSeg(ControlFile->checkPoint, lastKnownInstalledSegno, wal_segment_size);
50175057
}
50185058

50195059
/*
@@ -5068,6 +5108,7 @@ XLOGShmemInit(void)
50685108
pg_atomic_init_u64(&XLogCtl->logWriteResult, InvalidXLogRecPtr);
50695109
pg_atomic_init_u64(&XLogCtl->logFlushResult, InvalidXLogRecPtr);
50705110
pg_atomic_init_u64(&XLogCtl->unloggedLSN, InvalidXLogRecPtr);
5111+
pg_atomic_init_u64(&XLogCtl->last_known_installed_segno, lastKnownInstalledSegno);
50715112
}
50725113

50735114
/*

src/backend/storage/file/fd.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -770,6 +770,10 @@ fsync_fname(const char *fname, bool isdir)
770770
* might not be on the same filesystem. Therefore this routine does not
771771
* support renaming across directories.
772772
*
773+
* Note that there is a window between the rename and the fsync(s). If "newfile"
774+
* is opened, written to and then fdatasynced, and if there is a crash before
775+
* the fsync(s) hits disk, the written data could be .
776+
*
773777
* Log errors with the caller specified severity.
774778
*
775779
* Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not

0 commit comments

Comments
 (0)