@@ -303,6 +303,11 @@ static bool doPageWrites;
  * so it's a plain spinlock. The other locks are held longer (potentially
  * over I/O operations), so we use LWLocks for them. These locks are:
  *
+ * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
+ * It is only held while initializing and changing the mapping. If the
+ * contents of the buffer being replaced haven't been written yet, the mapping
+ * lock is released while the write is done, and reacquired afterwards.
+ *
  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
  * XLogFlush).
  *
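Taken together, the restored comment describes a retry protocol over two locks: initialize under WALBufMappingLock, but drop it around any write of a still-dirty page. The sketch below condenses that protocol; it is illustrative only (the function name is hypothetical), it assumes xlog.c's internal declarations (XLogCtl, LogwrtResult, and friends) are in scope, and it omits the opportunistic mode, statistics, and error handling that the real AdvanceXLInsertBuffer() in the hunks below carries.

/*
 * Illustrative sketch only -- not xlog.c code.  Replace pages up to 'upto'
 * following the WALBufMappingLock protocol described above.
 */
static void
advance_buffers_sketch(XLogRecPtr upto)
{
	LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
	while (upto >= XLogCtl->InitializedUpTo)
	{
		int			nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
		XLogRecPtr	endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]);

		if (LogwrtResult.Write < endptr)
		{
			/* Old page is still dirty: drop the mapping lock while writing. */
			LWLockRelease(WALBufMappingLock);
			WaitXLogInsertionsToFinish(endptr);
			LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
			/* ... write the old page out (XLogWrite) ... */
			LWLockRelease(WALWriteLock);
			LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
			continue;			/* re-check everything under the lock */
		}
		/* ... initialize the page header, then bump InitializedUpTo ... */
	}
	LWLockRelease(WALBufMappingLock);
}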
@@ -468,37 +473,21 @@ typedef struct XLogCtlData
 	pg_atomic_uint64 logFlushResult;	/* last byte + 1 flushed */
 
 	/*
-	 * First initialized page in the cache (first byte position).
-	 */
-	XLogRecPtr	InitializedFrom;
-
-	/*
-	 * Latest reserved for initialization page in the cache (last byte
-	 * position + 1).
+	 * Latest initialized page in the cache (last byte position + 1).
 	 *
-	 * To change the identity of a buffer, you need to advance
-	 * InitializeReserved first. To change the identity of a buffer that's
+	 * To change the identity of a buffer (and InitializedUpTo), you need to
+	 * hold WALBufMappingLock. To change the identity of a buffer that's
 	 * still dirty, the old page needs to be written out first, and for that
 	 * you need WALWriteLock, and you need to ensure that there are no
 	 * in-progress insertions to the page by calling
 	 * WaitXLogInsertionsToFinish().
 	 */
-	pg_atomic_uint64 InitializeReserved;
-
-	/*
-	 * Latest initialized page in the cache (last byte position + 1).
-	 *
-	 * InitializedUpTo is updated after the buffer initialization. After
-	 * update, waiters got notification using InitializedUpToCondVar.
-	 */
-	pg_atomic_uint64 InitializedUpTo;
-	ConditionVariable InitializedUpToCondVar;
+	XLogRecPtr	InitializedUpTo;
 
 	/*
 	 * These values do not change after startup, although the pointed-to pages
-	 * and xlblocks values certainly do. xlblocks values are changed
-	 * lock-free according to the check for the xlog write position and are
-	 * accompanied by changes of InitializeReserved and InitializedUpTo.
+	 * and xlblocks values certainly do. xlblocks values are protected by
+	 * WALBufMappingLock.
 	 */
 	char	   *pages;			/* buffers for unwritten XLOG pages */
 	pg_atomic_uint64 *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */
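The pages/xlblocks pair implements a circular cache keyed by WAL page number. A self-contained sketch of that mapping (the real XLogRecPtrToBufIdx macro in xlog.c takes the page number modulo the buffer count it reads from XLogCtl; the standalone typedef and the explicit wal_buffers parameter here are illustrative):

#include <stdint.h>

typedef uint64_t XLogRecPtr;	/* stand-in for the real typedef */

#define XLOG_BLCKSZ 8192		/* assumed WAL block size */

/*
 * A WAL page lives in the slot given by its page number modulo the number
 * of buffers.  The invariant documented above: slot idx holds the page
 * whose last byte position + 1 is stored in xlblocks[idx], so a reader can
 * verify the mapping by comparing xlblocks[idx] with the end position it
 * expects for its own insertion point.
 */
static inline int
wal_buf_idx(XLogRecPtr recptr, int wal_buffers)
{
	return (int) ((recptr / XLOG_BLCKSZ) % wal_buffers);
}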
@@ -821,9 +810,9 @@ XLogInsertRecord(XLogRecData *rdata,
 	 * fullPageWrites from changing until the insertion is finished.
 	 *
 	 * Step 2 can usually be done completely in parallel. If the required WAL
-	 * page is not initialized yet, you have to go through AdvanceXLInsertBuffer,
-	 * which will ensure it is initialized. But the WAL writer tries to do that
-	 * ahead of insertions to avoid that from happening in the critical path.
+	 * page is not initialized yet, you have to grab WALBufMappingLock to
+	 * initialize it, but the WAL writer tries to do that ahead of insertions
+	 * to avoid that from happening in the critical path.
 	 *
 	 *----------
 	 */
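In code, the parallel fast path of Step 2 is a lookup with a slow-path fallback, roughly what xlog.c's GetXLogBuffer() does. A simplified sketch (assumes xlog.c's internals are in scope; the real function additionally keeps a one-entry per-backend cache of the last page looked up):

/*
 * Simplified sketch of the per-page lookup an inserter performs
 * (cf. GetXLogBuffer() in xlog.c); caching and error paths omitted.
 */
static char *
get_xlog_buffer_sketch(XLogRecPtr ptr, TimeLineID tli)
{
	int			idx = XLogRecPtrToBufIdx(ptr);
	/* last byte position + 1 of the page containing 'ptr' */
	XLogRecPtr	expectedEndPtr = ptr - ptr % XLOG_BLCKSZ + XLOG_BLCKSZ;
	XLogRecPtr	endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);

	if (expectedEndPtr != endptr)
	{
		/* Slow path: have the page initialized under WALBufMappingLock. */
		AdvanceXLInsertBuffer(ptr, tli, false);
		endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
		Assert(endptr == expectedEndPtr);
	}
	/* Return the start of the page in the cache. */
	return XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
}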
@@ -2005,79 +1994,32 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
 	XLogRecPtr	NewPageEndPtr = InvalidXLogRecPtr;
 	XLogRecPtr	NewPageBeginPtr;
 	XLogPageHeader NewPage;
-	XLogRecPtr	ReservedPtr;
 	int			npages pg_attribute_unused() = 0;
 
-	/*
-	 * We must run the loop below inside the critical section as we expect
-	 * XLogCtl->InitializedUpTo to eventually keep up. The most of callers
-	 * already run inside the critical section. Except for WAL writer, which
-	 * passed 'opportunistic == true', and therefore we don't perform
-	 * operations that could error out.
-	 *
-	 * Start an explicit critical section anyway though.
-	 */
-	Assert(CritSectionCount > 0 || opportunistic);
-	START_CRIT_SECTION();
+	LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
 
-	/*--
-	 * Loop till we get all the pages in WAL buffer before 'upto' reserved for
-	 * initialization. Multiple process can initialize different buffers with
-	 * this loop in parallel as following.
-	 *
-	 * 1. Reserve page for initialization using XLogCtl->InitializeReserved.
-	 * 2. Initialize the reserved page.
-	 * 3. Attempt to advance XLogCtl->InitializedUpTo,
+	/*
+	 * Now that we have the lock, check if someone initialized the page
+	 * already.
 	 */
-	ReservedPtr = pg_atomic_read_u64(&XLogCtl->InitializeReserved);
-	while (upto >= ReservedPtr || opportunistic)
+	while (upto >= XLogCtl->InitializedUpTo || opportunistic)
 	{
-		Assert(ReservedPtr % XLOG_BLCKSZ == 0);
+		nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
 
 		/*
-		 * Get ending-offset of the buffer page we need to replace.
-		 *
-		 * We don't lookup into xlblocks, but rather calculate position we
-		 * must wait to be written. If it was written, xlblocks will have this
-		 * position (or uninitialized)
+		 * Get ending-offset of the buffer page we need to replace (this may
+		 * be zero if the buffer hasn't been used yet). Fall through if it's
+		 * already written out.
 		 */
-		if (ReservedPtr + XLOG_BLCKSZ > XLogCtl->InitializedFrom + XLOG_BLCKSZ * XLOGbuffers)
-			OldPageRqstPtr = ReservedPtr + XLOG_BLCKSZ - (XLogRecPtr) XLOG_BLCKSZ * XLOGbuffers;
-		else
-			OldPageRqstPtr = InvalidXLogRecPtr;
-
-		if (LogwrtResult.Write < OldPageRqstPtr && opportunistic)
+		OldPageRqstPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]);
+		if (LogwrtResult.Write < OldPageRqstPtr)
 		{
 			/*
-			 * If we just want to pre-initialize as much as we can without
-			 * flushing, give up now.
+			 * Nope, got work to do. If we just want to pre-initialize as much
+			 * as we can without flushing, give up now.
 			 */
-			upto = ReservedPtr - 1;
-			break;
-		}
-
-		/*
-		 * Attempt to reserve the page for initialization. Failure means that
-		 * this page got reserved by another process.
-		 */
-		if (!pg_atomic_compare_exchange_u64(&XLogCtl->InitializeReserved,
-											&ReservedPtr,
-											ReservedPtr + XLOG_BLCKSZ))
-			continue;
-
-		/*
-		 * Wait till page gets correctly initialized up to OldPageRqstPtr.
-		 */
-		nextidx = XLogRecPtrToBufIdx(ReservedPtr);
-		while (pg_atomic_read_u64(&XLogCtl->InitializedUpTo) < OldPageRqstPtr)
-			ConditionVariableSleep(&XLogCtl->InitializedUpToCondVar, WAIT_EVENT_WAL_BUFFER_INIT);
-		ConditionVariableCancelSleep();
-		Assert(pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]) == OldPageRqstPtr);
-
-		/* Fall through if it's already written out. */
-		if (LogwrtResult.Write < OldPageRqstPtr)
-		{
-			/* Nope, got work to do. */
+			if (opportunistic)
+				break;
 
 			/* Advance shared memory write request position */
 			SpinLockAcquire(&XLogCtl->info_lck);
@@ -2092,6 +2034,14 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
 			RefreshXLogWriteResult(LogwrtResult);
 			if (LogwrtResult.Write < OldPageRqstPtr)
 			{
+				/*
+				 * Must acquire write lock. Release WALBufMappingLock first,
+				 * to make sure that all insertions that we need to wait for
+				 * can finish (up to this same position). Otherwise we risk
+				 * deadlock.
+				 */
+				LWLockRelease(WALBufMappingLock);
+
 				WaitXLogInsertionsToFinish(OldPageRqstPtr);
 
 				LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
@@ -2119,16 +2069,21 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
 					 */
 					pgstat_report_fixed = true;
 				}
+				/* Re-acquire WALBufMappingLock and retry */
+				LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
+				continue;
 			}
 		}
 
 		/*
 		 * Now the next buffer slot is free and we can set it up to be the
 		 * next output page.
 		 */
-		NewPageBeginPtr = ReservedPtr;
+		NewPageBeginPtr = XLogCtl->InitializedUpTo;
 		NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
 
+		Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
+
 		NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
 
 		/*
@@ -2192,100 +2147,12 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
 		 */
 		pg_write_barrier();
 
-		/*-----
-		 * Update the value of XLogCtl->xlblocks[nextidx] and try to advance
-		 * XLogCtl->InitializedUpTo in a lock-less manner.
-		 *
-		 * First, let's provide a formal proof of the algorithm. Let it be 'n'
-		 * process with the following variables in shared memory:
-		 *	f - an array of 'n' boolean flags,
-		 *	v - atomic integer variable.
-		 *
-		 * Also, let
-		 *	i - a number of a process,
-		 *	j - local integer variable,
-		 *	CAS(var, oldval, newval) - compare-and-swap atomic operation
-		 *							   returning true on success,
-		 *	write_barrier()/read_barrier() - memory barriers.
-		 *
-		 * The pseudocode for each process is the following.
-		 *
-		 *	j := i
-		 *	f[i] := true
-		 *	write_barrier()
-		 *	while CAS(v, j, j + 1):
-		 *		j := j + 1
-		 *		read_barrier()
-		 *		if not f[j]:
-		 *			break
-		 *
-		 * Let's prove that v eventually reaches the value of n.
-		 * 1. Prove by contradiction. Assume v doesn't reach n and stucks
-		 *	  on k, where k < n.
-		 * 2. Process k attempts CAS(v, k, k + 1). 1). If, as we assumed, v
-		 *	  gets stuck at k, then this CAS operation must fail. Therefore,
-		 *	  v < k when process k attempts CAS(v, k, k + 1).
-		 * 3. If, as we assumed, v gets stuck at k, then the value k of v
-		 *	  must be achieved by some process m, where m < k. The process
-		 *	  m must observe f[k] == false. Otherwise, it will later attempt
-		 *	  CAS(v, k, k + 1) with success.
-		 * 4. Therefore, corresponding read_barrier() (while j == k) on
-		 *	  process m reached before write_barrier() of process k. But then
-		 *	  process k attempts CAS(v, k, k + 1) after process m successfully
-		 *	  incremented v to k, and that CAS operation must succeed.
-		 * That leads to a contradiction. So, there is no such k (k < n)
-		 * where v gets stuck. Q.E.D.
-		 *
-		 * To apply this proof to the code below, we assume
-		 * XLogCtl->InitializedUpTo will play the role of v with XLOG_BLCKSZ
-		 * granularity. We also assume setting XLogCtl->xlblocks[nextidx] to
-		 * NewPageEndPtr to play the role of setting f[i] to true. Also, note
-		 * that processes can't concurrently map different xlog locations to
-		 * the same nextidx because we previously requested that
-		 * XLogCtl->InitializedUpTo >= OldPageRqstPtr. So, a xlog buffer can
-		 * be taken for initialization only once the previous initialization
-		 * takes effect on XLogCtl->InitializedUpTo.
-		 */
-
 		pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], NewPageEndPtr);
-
-		pg_write_barrier();
-
-		while (pg_atomic_compare_exchange_u64(&XLogCtl->InitializedUpTo, &NewPageBeginPtr, NewPageEndPtr))
-		{
-			NewPageBeginPtr = NewPageEndPtr;
-			NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
-			nextidx = XLogRecPtrToBufIdx(NewPageBeginPtr);
-
-			pg_read_barrier();
-
-			if (pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]) != NewPageEndPtr)
-			{
-				/*
-				 * Page at nextidx wasn't initialized yet, so we can't move
-				 * InitializedUpto further. It will be moved by backend which
-				 * will initialize nextidx.
-				 */
-				ConditionVariableBroadcast(&XLogCtl->InitializedUpToCondVar);
-				break;
-			}
-		}
+		XLogCtl->InitializedUpTo = NewPageEndPtr;
 
 		npages++;
 	}
-
-	END_CRIT_SECTION();
-
-	/*
-	 * All the pages in WAL buffer before 'upto' were reserved for
-	 * initialization. However, some pages might be reserved by concurrent
-	 * processes. Wait till they finish initialization.
-	 */
-	while (upto >= pg_atomic_read_u64(&XLogCtl->InitializedUpTo))
-		ConditionVariableSleep(&XLogCtl->InitializedUpToCondVar, WAIT_EVENT_WAL_BUFFER_INIT);
-	ConditionVariableCancelSleep();
-
-	pg_read_barrier();
+	LWLockRelease(WALBufMappingLock);
 
 #ifdef WAL_DEBUG
 	if (XLOG_DEBUG && npages > 0)
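For reference, the lock-free scheme being reverted (the f[i]/CAS pseudocode in the removed comment above) can be exercised in isolation. Below is a hypothetical standalone translation to C11: <stdatomic.h> stands in for pg_atomic_*, release/acquire ordering stands in for write_barrier()/read_barrier(), and threads stand in for processes. Per the removed proof, v cannot get stuck below N.

/*
 * Hypothetical standalone demo of the reverted lock-free advancement
 * algorithm.  Not PostgreSQL code.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define N 8						/* number of participating threads */

static atomic_bool f[N];		/* f[i]: slot i has been initialized */
static atomic_int v;			/* collectively advanced to N */

static void *
worker(void *arg)
{
	int			i = (int) (intptr_t) arg;
	int			j = i;

	/* f[i] := true; write_barrier() */
	atomic_store_explicit(&f[i], true, memory_order_release);

	/* while CAS(v, j, j + 1): j := j + 1; read_barrier(); if not f[j]: break */
	while (atomic_compare_exchange_strong(&v, &j, j + 1))
	{
		j = j + 1;
		if (j >= N || !atomic_load_explicit(&f[j], memory_order_acquire))
			break;				/* slot j unpublished; its owner advances v */
	}
	return NULL;
}

int
main(void)
{
	pthread_t	th[N];

	for (intptr_t i = 0; i < N; i++)
		pthread_create(&th[i], NULL, worker, (void *) i);
	for (int i = 0; i < N; i++)
		pthread_join(th[i], NULL);

	printf("v = %d (expected %d)\n", atomic_load(&v), N);
	return 0;
}

Compiled with cc -pthread, every run should end with v = 8 regardless of interleaving; the revert trades this machinery (plus the condition variable for waiters) for the plain WALBufMappingLock critical section restored above.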
@@ -5178,10 +5045,6 @@ XLOGShmemInit(void)
 	pg_atomic_init_u64(&XLogCtl->logWriteResult, InvalidXLogRecPtr);
 	pg_atomic_init_u64(&XLogCtl->logFlushResult, InvalidXLogRecPtr);
 	pg_atomic_init_u64(&XLogCtl->unloggedLSN, InvalidXLogRecPtr);
-
-	pg_atomic_init_u64(&XLogCtl->InitializeReserved, InvalidXLogRecPtr);
-	pg_atomic_init_u64(&XLogCtl->InitializedUpTo, InvalidXLogRecPtr);
-	ConditionVariableInit(&XLogCtl->InitializedUpToCondVar);
 }
 
 /*
@@ -6205,8 +6068,7 @@ StartupXLOG(void)
 		memset(page + len, 0, XLOG_BLCKSZ - len);
 
 		pg_atomic_write_u64(&XLogCtl->xlblocks[firstIdx], endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
-		pg_atomic_write_u64(&XLogCtl->InitializedUpTo, endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
-		XLogCtl->InitializedFrom = endOfRecoveryInfo->lastPageBeginPtr;
+		XLogCtl->InitializedUpTo = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
 	}
 	else
 	{
@@ -6215,10 +6077,8 @@ StartupXLOG(void)
 		 * let the first attempt to insert a log record to initialize the next
 		 * buffer.
 		 */
-		pg_atomic_write_u64(&XLogCtl->InitializedUpTo, EndOfLog);
-		XLogCtl->InitializedFrom = EndOfLog;
+		XLogCtl->InitializedUpTo = EndOfLog;
 	}
-	pg_atomic_write_u64(&XLogCtl->InitializeReserved, pg_atomic_read_u64(&XLogCtl->InitializedUpTo));
 
 	/*
 	 * Update local and shared status. This is OK to do without any locks
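The two StartupXLOG branches above seed InitializedUpTo differently, and the loop condition restored in AdvanceXLInsertBuffer ("upto >= InitializedUpTo") is what gives that difference effect. A toy check with assumed numbers (the block size and LSNs are made up for illustration):

#include <assert.h>
#include <stdint.h>

#define XLOG_BLCKSZ 8192		/* assumed block size */

int
main(void)
{
	/* Branch 1: recovery ended mid-page, last page copied into the cache. */
	uint64_t	lastPageBeginPtr = 40 * XLOG_BLCKSZ;
	uint64_t	EndOfLog = lastPageBeginPtr + 100;
	uint64_t	InitializedUpTo = lastPageBeginPtr + XLOG_BLCKSZ;

	/* The first insertion lands on the pre-loaded page: no init needed. */
	assert(!(EndOfLog >= InitializedUpTo));

	/* Branch 2: EndOfLog falls exactly on a page boundary. */
	EndOfLog = 41 * XLOG_BLCKSZ;
	InitializedUpTo = EndOfLog;

	/* The first insertion initializes the next buffer, as the comment says. */
	assert(EndOfLog >= InitializedUpTo);
	return 0;
}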