diff --git a/src/backend/access/common/bufmask.c b/src/backend/access/common/bufmask.c index bb260cffa682..5f07f1794151 100644 --- a/src/backend/access/common/bufmask.c +++ b/src/backend/access/common/bufmask.c @@ -56,8 +56,8 @@ mask_page_hint_bits(Page page) /* * During replay, if the page LSN has advanced past our XLOG record's LSN, - * we don't mark the page all-visible. See heap_xlog_visible() for - * details. + * we don't mark the page all-visible. See heap_xlog_prune_and_freeze() + * for more details. */ PageClearAllVisible(page); } diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 36fee9c994e4..eea3a3d2ddce 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -555,6 +555,7 @@ heap_prepare_pagescan(TableScanDesc sscan) Buffer buffer = scan->rs_cbuf; BlockNumber block = scan->rs_cblock; Snapshot snapshot; + Buffer *vmbuffer = NULL; Page page; int lines; bool all_visible; @@ -569,7 +570,9 @@ heap_prepare_pagescan(TableScanDesc sscan) /* * Prune and repair fragmentation for the whole page, if possible. */ - heap_page_prune_opt(scan->rs_base.rs_rd, buffer); + if (sscan->rs_flags & SO_ALLOW_VM_SET) + vmbuffer = &scan->rs_vmbuffer; + heap_page_prune_opt(scan->rs_base.rs_rd, buffer, vmbuffer); /* * We must hold share lock on the buffer content while examining tuple @@ -1246,6 +1249,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, sizeof(TBMIterateResult)); } + scan->rs_vmbuffer = InvalidBuffer; return (TableScanDesc) scan; } @@ -1284,6 +1288,12 @@ heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, scan->rs_cbuf = InvalidBuffer; } + if (BufferIsValid(scan->rs_vmbuffer)) + { + ReleaseBuffer(scan->rs_vmbuffer); + scan->rs_vmbuffer = InvalidBuffer; + } + /* * SO_TYPE_BITMAPSCAN would be cleaned up here, but it does not hold any * additional data vs a normal HeapScan @@ -1316,6 +1326,9 @@ heap_endscan(TableScanDesc sscan) if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); + if (BufferIsValid(scan->rs_vmbuffer)) + ReleaseBuffer(scan->rs_vmbuffer); + /* * Must free the read stream before freeing the BufferAccessStrategy. */ @@ -2091,6 +2104,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, TransactionId xid = GetCurrentTransactionId(); HeapTuple heaptup; Buffer buffer; + Page page; Buffer vmbuffer = InvalidBuffer; bool all_visible_cleared = false; @@ -2150,15 +2164,19 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, } /* - * XXX Should we set PageSetPrunable on this page ? + * Set pd_prune_xid to trigger heap_page_prune_and_freeze() once the page + * is full so that we can set the page all-visible in the VM. * - * The inserting transaction may eventually abort thus making this tuple - * DEAD and hence available for pruning. Though we don't want to optimize - * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the - * aborted tuple will never be pruned until next vacuum is triggered. + * Setting pd_prune_xid is also handy if the inserting transaction + * eventually aborts making this tuple DEAD and hence available for + * pruning. If no other tuple in this page is UPDATEd/DELETEd, the aborted + * tuple would never otherwise be pruned until next vacuum is triggered. * - * If you do add PageSetPrunable here, add it in heap_xlog_insert too. + * Don't set it if we are in bootstrap mode, though. 
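+	 * (In bootstrap mode the current transaction runs with
+	 * BootstrapTransactionId, which is not a normal XID, so the
+	 * TransactionIdIsNormal() test below covers that case.)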
*/ + page = BufferGetPage(buffer); + if (TransactionIdIsNormal(xid)) + PageSetPrunable(page, xid); MarkBufferDirty(buffer); @@ -2168,7 +2186,6 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, xl_heap_insert xlrec; xl_heap_header xlhdr; XLogRecPtr recptr; - Page page = BufferGetPage(buffer); uint8 info = XLOG_HEAP_INSERT; int bufflags = 0; @@ -2524,16 +2541,21 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, else if (all_frozen_set) { PageSetAllVisible(page); - visibilitymap_set_vmbits(BufferGetBlockNumber(buffer), - vmbuffer, - VISIBILITYMAP_ALL_VISIBLE | - VISIBILITYMAP_ALL_FROZEN, - relation->rd_locator); + visibilitymap_set(BufferGetBlockNumber(buffer), + vmbuffer, + VISIBILITYMAP_ALL_VISIBLE | + VISIBILITYMAP_ALL_FROZEN, + relation->rd_locator); } /* - * XXX Should we set PageSetPrunable on this page ? See heap_insert() + * Set pd_prune_xid. See heap_insert() for more on why we do this when + * inserting tuples. This only makes sense if we aren't already + * setting the page frozen in the VM. We also don't set it in + * bootstrap mode. */ + if (!all_frozen_set && TransactionIdIsNormal(xid)) + PageSetPrunable(page, xid); MarkBufferDirty(buffer); @@ -8797,50 +8819,6 @@ bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate) return nblocksfavorable; } -/* - * Perform XLogInsert for a heap-visible operation. 'block' is the block - * being marked all-visible, and vm_buffer is the buffer containing the - * corresponding visibility map block. Both should have already been modified - * and dirtied. - * - * snapshotConflictHorizon comes from the largest xmin on the page being - * marked all-visible. REDO routine uses it to generate recovery conflicts. - * - * If checksums or wal_log_hints are enabled, we may also generate a full-page - * image of heap_buffer. Otherwise, we optimize away the FPI (by specifying - * REGBUF_NO_IMAGE for the heap buffer), in which case the caller should *not* - * update the heap page's LSN. - */ -XLogRecPtr -log_heap_visible(Relation rel, Buffer heap_buffer, Buffer vm_buffer, - TransactionId snapshotConflictHorizon, uint8 vmflags) -{ - xl_heap_visible xlrec; - XLogRecPtr recptr; - uint8 flags; - - Assert(BufferIsValid(heap_buffer)); - Assert(BufferIsValid(vm_buffer)); - - xlrec.snapshotConflictHorizon = snapshotConflictHorizon; - xlrec.flags = vmflags; - if (RelationIsAccessibleInLogicalDecoding(rel)) - xlrec.flags |= VISIBILITYMAP_XLOG_CATALOG_REL; - XLogBeginInsert(); - XLogRegisterData(&xlrec, SizeOfHeapVisible); - - XLogRegisterBuffer(0, vm_buffer, 0); - - flags = REGBUF_STANDARD; - if (!XLogHintBitIsNeeded()) - flags |= REGBUF_NO_IMAGE; - XLogRegisterBuffer(1, heap_buffer, flags); - - recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE); - - return recptr; -} - /* * Perform XLogInsert for a heap-update operation. Caller must already * have modified the buffer(s) and marked them dirty. 
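As a minimal sketch (not part of the patch itself), this is roughly how a heap
scan opts in to on-access VM setting with the new heap_page_prune_opt()
signature. It assumes the SO_ALLOW_VM_SET scan flag and the per-scan
rs_vmbuffer field introduced by this patch series; the helper name is purely
illustrative.

	static void
	example_prepare_page(HeapScanDesc scan)
	{
		Buffer	   *vmbuffer = NULL;

		/* Only thread a VM buffer through when the scan allows VM updates. */
		if (scan->rs_base.rs_flags & SO_ALLOW_VM_SET)
			vmbuffer = &scan->rs_vmbuffer;

		/* pruneheap.c pins (and, if needed, reads) the VM page itself. */
		heap_page_prune_opt(scan->rs_base.rs_rd, scan->rs_cbuf, vmbuffer);
	}

Callers that must not set the VM pass NULL instead, as
heapam_index_fetch_tuple() does below when scan->modifies_base_rel is true.
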
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index bcbac844bb66..f05b9e4968de 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -85,6 +85,7 @@ heapam_index_fetch_begin(Relation rel) hscan->xs_base.rel = rel; hscan->xs_cbuf = InvalidBuffer; + hscan->xs_vmbuffer = InvalidBuffer; return &hscan->xs_base; } @@ -99,6 +100,12 @@ heapam_index_fetch_reset(IndexFetchTableData *scan) ReleaseBuffer(hscan->xs_cbuf); hscan->xs_cbuf = InvalidBuffer; } + + if (BufferIsValid(hscan->xs_vmbuffer)) + { + ReleaseBuffer(hscan->xs_vmbuffer); + hscan->xs_vmbuffer = InvalidBuffer; + } } static void @@ -138,7 +145,8 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, * Prune page, but only if we weren't already on this page */ if (prev_buf != hscan->xs_cbuf) - heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf); + heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf, + scan->modifies_base_rel ? NULL : &hscan->xs_vmbuffer); } /* Obtain share-lock on the buffer so we can examine visibility */ @@ -2471,6 +2479,7 @@ BitmapHeapScanNextBlock(TableScanDesc scan, TBMIterateResult *tbmres; OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE]; int noffsets = -1; + Buffer *vmbuffer = NULL; Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN); Assert(hscan->rs_read_stream); @@ -2517,7 +2526,9 @@ BitmapHeapScanNextBlock(TableScanDesc scan, /* * Prune and repair fragmentation for the whole page, if possible. */ - heap_page_prune_opt(scan->rs_rd, buffer); + if (scan->rs_flags & SO_ALLOW_VM_SET) + vmbuffer = &hscan->rs_vmbuffer; + heap_page_prune_opt(scan->rs_rd, buffer, vmbuffer); /* * We must hold share lock on the buffer content while examining tuple diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c index 05f6946fe60d..edd529dc3c07 100644 --- a/src/backend/access/heap/heapam_visibility.c +++ b/src/backend/access/heap/heapam_visibility.c @@ -1189,6 +1189,34 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, return res; } +/* + * Nearly the same as HeapTupleSatisfiesVacuum, but uses a GlobalVisState to + * determine whether or not a tuple is HEAPTUPLE_DEAD Or + * HEAPTUPLE_RECENTLY_DEAD. It serves the same purpose but can be used by + * callers that have not calculated a single OldestXmin value. + */ +HTSV_Result +HeapTupleSatisfiesVacuumGlobalVis(HeapTuple htup, GlobalVisState *vistest, + Buffer buffer) +{ + TransactionId dead_after = InvalidTransactionId; + HTSV_Result res; + + res = HeapTupleSatisfiesVacuumHorizon(htup, buffer, &dead_after); + + if (res == HEAPTUPLE_RECENTLY_DEAD) + { + Assert(TransactionIdIsValid(dead_after)); + + if (GlobalVisXidVisibleToAll(vistest, dead_after)) + res = HEAPTUPLE_DEAD; + } + else + Assert(!TransactionIdIsValid(dead_after)); + + return res; +} + /* * Work horse for HeapTupleSatisfiesVacuum and similar routines. * @@ -1447,7 +1475,7 @@ HeapTupleSatisfiesNonVacuumable(HeapTuple htup, Snapshot snapshot, { Assert(TransactionIdIsValid(dead_after)); - if (GlobalVisTestIsRemovableXid(snapshot->vistest, dead_after)) + if (GlobalVisXidVisibleToAll(snapshot->vistest, dead_after)) res = HEAPTUPLE_DEAD; } else @@ -1512,8 +1540,8 @@ HeapTupleIsSurelyDead(HeapTuple htup, GlobalVisState *vistest) return false; /* Deleter committed, so tuple is dead if the XID is old enough. 
*/ - return GlobalVisTestIsRemovableXid(vistest, - HeapTupleHeaderGetRawXmax(tuple)); + return GlobalVisXidVisibleToAll(vistest, + HeapTupleHeaderGetRawXmax(tuple)); } /* diff --git a/src/backend/access/heap/heapam_xlog.c b/src/backend/access/heap/heapam_xlog.c index 11cb3f74da5b..dac640f5c9d7 100644 --- a/src/backend/access/heap/heapam_xlog.c +++ b/src/backend/access/heap/heapam_xlog.c @@ -104,6 +104,8 @@ heap_xlog_prune_freeze(XLogReaderState *record) OffsetNumber *frz_offsets; char *dataptr = XLogRecGetBlockData(record, 0, &datalen); bool do_prune; + bool set_lsn = false; + bool mark_buffer_dirty = false; heap_xlog_deserialize_prune_and_freeze(dataptr, xlrec.flags, &nplans, &plans, &frz_offsets, @@ -157,17 +159,36 @@ heap_xlog_prune_freeze(XLogReaderState *record) /* There should be no more data */ Assert((char *) frz_offsets == dataptr + datalen); - if (vmflags & VISIBILITYMAP_VALID_BITS) - PageSetAllVisible(page); - - MarkBufferDirty(buffer); + if (do_prune || nplans > 0) + mark_buffer_dirty = set_lsn = true; /* - * See log_heap_prune_and_freeze() for commentary on when we set the - * heap page LSN. + * The critical integrity requirement here is that we must never end + * up with with the visibility map bit set and the page-level + * PD_ALL_VISIBLE bit clear. If that were to occur, a subsequent page + * modification would fail to clear the visibility map bit. + * + * vmflags may be nonzero with PD_ALL_VISIBLE already set (e.g. when + * marking an all-visible page all-frozen). If only the VM is updated, + * the heap page need not be dirtied. */ - if (do_prune || nplans > 0 || - ((vmflags & VISIBILITYMAP_VALID_BITS) && XLogHintBitIsNeeded())) + if ((vmflags & VISIBILITYMAP_VALID_BITS) && !PageIsAllVisible(page)) + { + PageSetAllVisible(page); + mark_buffer_dirty = true; + + /* + * See log_heap_prune_and_freeze() for commentary on when we set + * the heap page LSN. + */ + if (XLogHintBitIsNeeded()) + set_lsn = true; + } + + if (mark_buffer_dirty) + MarkBufferDirty(buffer); + + if (set_lsn) PageSetLSN(page, lsn); /* @@ -230,7 +251,7 @@ heap_xlog_prune_freeze(XLogReaderState *record) if (PageIsNew(vmpage)) PageInit(vmpage, BLCKSZ, 0); - visibilitymap_set_vmbits(blkno, vmbuffer, vmflags, rlocator); + visibilitymap_set(blkno, vmbuffer, vmflags, rlocator); Assert(BufferIsDirty(vmbuffer)); PageSetLSN(vmpage, lsn); @@ -243,142 +264,6 @@ heap_xlog_prune_freeze(XLogReaderState *record) XLogRecordPageWithFreeSpace(rlocator, blkno, freespace); } -/* - * Replay XLOG_HEAP2_VISIBLE records. - * - * The critical integrity requirement here is that we must never end up with - * a situation where the visibility map bit is set, and the page-level - * PD_ALL_VISIBLE bit is clear. If that were to occur, then a subsequent - * page modification would fail to clear the visibility map bit. - */ -static void -heap_xlog_visible(XLogReaderState *record) -{ - XLogRecPtr lsn = record->EndRecPtr; - xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record); - Buffer vmbuffer = InvalidBuffer; - Buffer buffer; - Page page; - RelFileLocator rlocator; - BlockNumber blkno; - XLogRedoAction action; - - Assert((xlrec->flags & VISIBILITYMAP_XLOG_VALID_BITS) == xlrec->flags); - - XLogRecGetBlockTag(record, 1, &rlocator, NULL, &blkno); - - /* - * If there are any Hot Standby transactions running that have an xmin - * horizon old enough that this page isn't all-visible for them, they - * might incorrectly decide that an index-only scan can skip a heap fetch. 
- * - * NB: It might be better to throw some kind of "soft" conflict here that - * forces any index-only scan that is in flight to perform heap fetches, - * rather than killing the transaction outright. - */ - if (InHotStandby) - ResolveRecoveryConflictWithSnapshot(xlrec->snapshotConflictHorizon, - xlrec->flags & VISIBILITYMAP_XLOG_CATALOG_REL, - rlocator); - - /* - * Read the heap page, if it still exists. If the heap file has dropped or - * truncated later in recovery, we don't need to update the page, but we'd - * better still update the visibility map. - */ - action = XLogReadBufferForRedo(record, 1, &buffer); - if (action == BLK_NEEDS_REDO) - { - /* - * We don't bump the LSN of the heap page when setting the visibility - * map bit (unless checksums or wal_hint_bits is enabled, in which - * case we must). This exposes us to torn page hazards, but since - * we're not inspecting the existing page contents in any way, we - * don't care. - */ - page = BufferGetPage(buffer); - - PageSetAllVisible(page); - - if (XLogHintBitIsNeeded()) - PageSetLSN(page, lsn); - - MarkBufferDirty(buffer); - } - else if (action == BLK_RESTORED) - { - /* - * If heap block was backed up, we already restored it and there's - * nothing more to do. (This can only happen with checksums or - * wal_log_hints enabled.) - */ - } - - if (BufferIsValid(buffer)) - { - Size space = PageGetFreeSpace(BufferGetPage(buffer)); - - UnlockReleaseBuffer(buffer); - - /* - * Since FSM is not WAL-logged and only updated heuristically, it - * easily becomes stale in standbys. If the standby is later promoted - * and runs VACUUM, it will skip updating individual free space - * figures for pages that became all-visible (or all-frozen, depending - * on the vacuum mode,) which is troublesome when FreeSpaceMapVacuum - * propagates too optimistic free space values to upper FSM layers; - * later inserters try to use such pages only to find out that they - * are unusable. This can cause long stalls when there are many such - * pages. - * - * Forestall those problems by updating FSM's idea about a page that - * is becoming all-visible or all-frozen. - * - * Do this regardless of a full-page image being applied, since the - * FSM data is not in the page anyway. - */ - if (xlrec->flags & VISIBILITYMAP_VALID_BITS) - XLogRecordPageWithFreeSpace(rlocator, blkno, space); - } - - /* - * Even if we skipped the heap page update due to the LSN interlock, it's - * still safe to update the visibility map. Any WAL record that clears - * the visibility map bit does so before checking the page LSN, so any - * bits that need to be cleared will still be cleared. - */ - if (XLogReadBufferForRedoExtended(record, 0, RBM_ZERO_ON_ERROR, false, - &vmbuffer) == BLK_NEEDS_REDO) - { - Page vmpage = BufferGetPage(vmbuffer); - Relation reln; - uint8 vmbits; - - /* initialize the page if it was read as zeros */ - if (PageIsNew(vmpage)) - PageInit(vmpage, BLCKSZ, 0); - - /* remove VISIBILITYMAP_XLOG_* */ - vmbits = xlrec->flags & VISIBILITYMAP_VALID_BITS; - - /* - * XLogReadBufferForRedoExtended locked the buffer. But - * visibilitymap_set will handle locking itself. 
- */ - LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK); - - reln = CreateFakeRelcacheEntry(rlocator); - - visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer, - xlrec->snapshotConflictHorizon, vmbits); - - ReleaseBuffer(vmbuffer); - FreeFakeRelcacheEntry(reln); - } - else if (BufferIsValid(vmbuffer)) - UnlockReleaseBuffer(vmbuffer); -} - /* * Given an "infobits" field from an XLog record, set the correct bits in the * given infomask and infomask2 for the tuple touched by the record. @@ -577,6 +462,12 @@ heap_xlog_insert(XLogReaderState *record) freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ + /* + * Set the page prunable to trigger on-access pruning later which may + * set the page all-visible in the VM. + */ + PageSetPrunable(page, XLogRecGetXid(record)); + PageSetLSN(page, lsn); if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) @@ -726,9 +617,16 @@ heap_xlog_multi_insert(XLogReaderState *record) if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); - /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */ + /* + * XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible. If + * we are not setting the page frozen, then set the page's prunable + * hint so that we trigger on-access pruning later which may set the + * page all-visible in the VM. + */ if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET) PageSetAllVisible(page); + else + PageSetPrunable(page, XLogRecGetXid(record)); MarkBufferDirty(buffer); } @@ -756,8 +654,8 @@ heap_xlog_multi_insert(XLogReaderState *record) * * During recovery, however, no concurrent writers exist. Therefore, * updating the VM without holding the heap page lock is safe enough. This - * same approach is taken when replaying xl_heap_visible records (see - * heap_xlog_visible()). + * same approach is taken when replaying XLOG_HEAP2_PRUNE* records (see + * heap_xlog_prune_and_freeze()). 
*/ if ((xlrec->flags & XLH_INSERT_ALL_FROZEN_SET) && XLogReadBufferForRedoExtended(record, 1, RBM_ZERO_ON_ERROR, false, @@ -769,11 +667,11 @@ heap_xlog_multi_insert(XLogReaderState *record) if (PageIsNew(vmpage)) PageInit(vmpage, BLCKSZ, 0); - visibilitymap_set_vmbits(blkno, - vmbuffer, - VISIBILITYMAP_ALL_VISIBLE | - VISIBILITYMAP_ALL_FROZEN, - rlocator); + visibilitymap_set(blkno, + vmbuffer, + VISIBILITYMAP_ALL_VISIBLE | + VISIBILITYMAP_ALL_FROZEN, + rlocator); Assert(BufferIsDirty(vmbuffer)); PageSetLSN(vmpage, lsn); @@ -1354,9 +1252,6 @@ heap2_redo(XLogReaderState *record) case XLOG_HEAP2_PRUNE_VACUUM_CLEANUP: heap_xlog_prune_freeze(record); break; - case XLOG_HEAP2_VISIBLE: - heap_xlog_visible(record); - break; case XLOG_HEAP2_MULTI_INSERT: heap_xlog_multi_insert(record); break; diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 231bea679c68..e18ec37fdf5d 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -19,7 +19,7 @@ #include "access/htup_details.h" #include "access/multixact.h" #include "access/transam.h" -#include "access/visibilitymapdefs.h" +#include "access/visibilitymap.h" #include "access/xlog.h" #include "access/xloginsert.h" #include "commands/vacuum.h" @@ -44,6 +44,8 @@ typedef struct bool mark_unused_now; /* whether to attempt freezing tuples */ bool attempt_freeze; + /* whether or not to attempt updating the VM */ + bool attempt_update_vm; struct VacuumCutoffs *cutoffs; /*------------------------------------------------------- @@ -133,20 +135,17 @@ typedef struct * all_visible and all_frozen indicate if the all-visible and all-frozen * bits in the visibility map can be set for this page after pruning. * - * visibility_cutoff_xid is the newest xmin of live tuples on the page. - * The caller can use it as the conflict horizon, when setting the VM - * bits. It is only valid if we froze some tuples, and all_frozen is - * true. + * visibility_cutoff_xid is the newest xmin of live tuples on the page. It + * can be used as the conflict horizon when setting the VM or when + * freezing all the tuples on the page. It is only valid when all the live + * tuples on the page are all-visible. * - * NOTE: all_visible and all_frozen don't include LP_DEAD items. That's - * convenient for heap_page_prune_and_freeze(), to use them to decide - * whether to freeze the page or not. The all_visible and all_frozen - * values returned to the caller are adjusted to include LP_DEAD items at - * the end. - * - * all_frozen should only be considered valid if all_visible is also set; - * we don't bother to clear the all_frozen flag every time we clear the - * all_visible flag. + * NOTE: all_visible and all_frozen initially don't include LP_DEAD items. + * That's convenient for heap_page_prune_and_freeze() to use them to + * decide whether to opportunistically freeze the page or not. The + * all_visible and all_frozen values ultimately used to set the VM are + * adjusted to include LP_DEAD items after we determine whether or not to + * opportunistically freeze. 
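+	 *
+	 * all_frozen is never set while all_visible is clear: wherever
+	 * all_visible is cleared during pruning, all_frozen is now cleared
+	 * along with it, and heap_page_prune_and_freeze() asserts that
+	 * invariant before updating the VM.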
*/ bool all_visible; bool all_frozen; @@ -177,10 +176,23 @@ static void heap_prune_record_unchanged_lp_redirect(PruneState *prstate, OffsetN static void page_verify_redirects(Page page); +static TransactionId get_conflict_xid(bool do_prune, bool do_freeze, bool do_set_vm, + TransactionId latest_xid_removed, TransactionId frz_conflict_horizon, + TransactionId visibility_cutoff_xid, bool blk_already_av, + bool set_blk_all_frozen); static bool heap_page_will_freeze(Relation relation, Buffer buffer, bool did_tuple_hint_fpi, bool do_prune, bool do_hint_prune, - PruneState *prstate); - + PruneState *prstate, TransactionId *frz_conflict_horizon); +static bool heap_page_will_set_vis(Relation relation, + BlockNumber heap_blk, + Buffer heap_buf, + Buffer vmbuffer, + bool blk_known_av, + PruneReason reason, + bool do_prune, bool do_freeze, + PruneState *prstate, + uint8 *vmflags, + bool *do_set_pd_vis); /* * Optionally prune and repair fragmentation in the specified page. @@ -193,9 +205,13 @@ static bool heap_page_will_freeze(Relation relation, Buffer buffer, * if there's not any use in pruning. * * Caller must have pin on the buffer, and must *not* have a lock on it. + * + * If vmbuffer is not NULL, it is okay for pruning to set the visibility map if + * the page is all-visible. We will take care of pinning and, if needed, + * reading in the page of the visibility map. */ void -heap_page_prune_opt(Relation relation, Buffer buffer) +heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer) { Page page = BufferGetPage(buffer); TransactionId prune_xid; @@ -225,7 +241,7 @@ heap_page_prune_opt(Relation relation, Buffer buffer) */ vistest = GlobalVisTestFor(relation); - if (!GlobalVisTestIsRemovableXid(vistest, prune_xid)) + if (!GlobalVisXidVisibleToAll(vistest, prune_xid)) return; /* @@ -258,15 +274,34 @@ heap_page_prune_opt(Relation relation, Buffer buffer) if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) { OffsetNumber dummy_off_loc; + PruneFreezeParams params; PruneFreezeResult presult; + params.options = 0; + params.vmbuffer = InvalidBuffer; + + if (vmbuffer) + { + visibilitymap_pin(relation, BufferGetBlockNumber(buffer), vmbuffer); + params.options = HEAP_PAGE_PRUNE_UPDATE_VIS; + params.vmbuffer = *vmbuffer; + } + + params.relation = relation; + params.buffer = buffer; + params.reason = PRUNE_ON_ACCESS; + params.vistest = vistest; + params.cutoffs = NULL; + params.blk_known_av = false; + /* * For now, pass mark_unused_now as false regardless of whether or * not the relation has indexes, since we cannot safely determine * that during on-access pruning with the current implementation. */ - heap_page_prune_and_freeze(relation, buffer, vistest, 0, - NULL, &presult, PRUNE_ON_ACCESS, &dummy_off_loc, NULL, NULL); + params.options = 0; + + heap_page_prune_and_freeze(¶ms, &presult, &dummy_off_loc, NULL, NULL); /* * Report the number of tuples reclaimed to pgstats. This is @@ -304,7 +339,9 @@ heap_page_prune_opt(Relation relation, Buffer buffer) * performs several pre-freeze checks. * * The values of do_prune, do_hint_prune, and did_tuple_hint_fpi must be - * determined before calling this function. + * determined before calling this function. *frz_conflict_horizon is set to + * the snapshot conflict horizon we for the WAL record should we decide to + * freeze tuples. * * prstate is both an input and output parameter. 
* @@ -316,7 +353,8 @@ heap_page_will_freeze(Relation relation, Buffer buffer, bool did_tuple_hint_fpi, bool do_prune, bool do_hint_prune, - PruneState *prstate) + PruneState *prstate, + TransactionId *frz_conflict_horizon) { bool do_freeze = false; @@ -353,8 +391,10 @@ heap_page_will_freeze(Relation relation, Buffer buffer, * anymore. The opportunistic freeze heuristic must be improved; * however, for now, try to approximate the old logic. */ - if (prstate->all_visible && prstate->all_frozen && prstate->nfrozen > 0) + if (prstate->all_frozen && prstate->nfrozen > 0) { + Assert(prstate->all_visible); + /* * Freezing would make the page all-frozen. Have already emitted * an FPI or will do so anyway? @@ -384,6 +424,22 @@ heap_page_will_freeze(Relation relation, Buffer buffer, * critical section. */ heap_pre_freeze_checks(buffer, prstate->frozen, prstate->nfrozen); + + /* + * Calculate what the snapshot conflict horizon should be for a record + * freezing tuples. We can use the visibility_cutoff_xid as our cutoff + * for conflicts when the whole page is eligible to become all-frozen + * in the VM once we're done with it. Otherwise, we generate a + * conservative cutoff by stepping back from OldestXmin. + */ + if (prstate->all_frozen) + *frz_conflict_horizon = prstate->visibility_cutoff_xid; + else + { + /* Avoids false conflicts when hot_standby_feedback in use */ + *frz_conflict_horizon = prstate->cutoffs->OldestXmin; + TransactionIdRetreat(*frz_conflict_horizon); + } } else if (prstate->nfrozen > 0) { @@ -409,87 +465,145 @@ heap_page_will_freeze(Relation relation, Buffer buffer, return do_freeze; } - /* - * Prune and repair fragmentation and potentially freeze tuples on the - * specified page. - * - * Caller must have pin and buffer cleanup lock on the page. Note that we - * don't update the FSM information for page on caller's behalf. Caller might - * also need to account for a reduction in the length of the line pointer - * array following array truncation by us. - * - * If the HEAP_PRUNE_FREEZE option is set, we will also freeze tuples if it's - * required in order to advance relfrozenxid / relminmxid, or if it's - * considered advantageous for overall system performance to do so now. The - * 'cutoffs', 'presult', 'new_relfrozen_xid' and 'new_relmin_mxid' arguments - * are required when freezing. When HEAP_PRUNE_FREEZE option is set, we also - * set presult->all_visible and presult->all_frozen on exit, to indicate if - * the VM bits can be set. They are always set to false when the - * HEAP_PRUNE_FREEZE option is not set, because at the moment only callers - * that also freeze need that information. - * - * vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD - * (see heap_prune_satisfies_vacuum). - * - * options: - * MARK_UNUSED_NOW indicates that dead items can be set LP_UNUSED during - * pruning. - * - * FREEZE indicates that we will also freeze tuples, and will return - * 'all_visible', 'all_frozen' flags to the caller. - * - * cutoffs contains the freeze cutoffs, established by VACUUM at the beginning - * of vacuuming the relation. Required if HEAP_PRUNE_FREEZE option is set. - * cutoffs->OldestXmin is also used to determine if dead tuples are - * HEAPTUPLE_RECENTLY_DEAD or HEAPTUPLE_DEAD. - * - * presult contains output parameters needed by callers, such as the number of - * tuples removed and the offsets of dead items on the page after pruning. - * heap_page_prune_and_freeze() is responsible for initializing it. Required - * by all callers. 
+ * Decide whether to set the visibility map bits for heap_blk, using + * information from PruneState and blk_known_av. Some callers may already + * have examined this page’s VM bits (e.g., VACUUM in the previous + * heap_vac_scan_next_block() call) and can pass that along. * - * reason indicates why the pruning is performed. It is included in the WAL - * record for debugging and analysis purposes, but otherwise has no effect. + * This should be called only after do_freeze has been decided (and do_prune + * has been set), as these factor into our heuristic-based decision. * - * off_loc is the offset location required by the caller to use in error - * callback. - * - * new_relfrozen_xid and new_relmin_mxid must provided by the caller if the - * HEAP_PRUNE_FREEZE option is set. On entry, they contain the oldest XID and - * multi-XID seen on the relation so far. They will be updated with oldest - * values present on the page after pruning. After processing the whole - * relation, VACUUM can use these values as the new relfrozenxid/relminmxid - * for the relation. + * Returns true if one or both VM bits should be set, along with the desired + * flags in *vmflags. Also indicates via do_set_pd_vis whether PD_ALL_VISIBLE + * should be set on the heap page. */ -void -heap_page_prune_and_freeze(Relation relation, Buffer buffer, - GlobalVisState *vistest, - int options, - struct VacuumCutoffs *cutoffs, - PruneFreezeResult *presult, - PruneReason reason, - OffsetNumber *off_loc, - TransactionId *new_relfrozen_xid, - MultiXactId *new_relmin_mxid) +static bool +heap_page_will_set_vis(Relation relation, + BlockNumber heap_blk, + Buffer heap_buf, + Buffer vmbuffer, + bool blk_known_av, + PruneReason reason, + bool do_prune, bool do_freeze, + PruneState *prstate, + uint8 *vmflags, + bool *do_set_pd_vis) { - Page page = BufferGetPage(buffer); - BlockNumber blockno = BufferGetBlockNumber(buffer); - OffsetNumber offnum, - maxoff; - PruneState prstate; - HeapTupleData tup; - bool do_freeze; - bool do_prune; - bool do_hint_prune; - bool did_tuple_hint_fpi; - int64 fpi_before = pgWalUsage.wal_fpi; + Page heap_page = BufferGetPage(heap_buf); + bool do_set_vm = false; + + *do_set_pd_vis = false; + + if (!prstate->attempt_update_vm) + { + Assert(!prstate->all_visible && !prstate->all_frozen); + Assert(*vmflags == 0); + return false; + } + + /* + * If this is an on-access call and we're not actually pruning, avoid + * setting the visibility map if it would newly dirty the heap page or, if + * the page is already dirty, if doing so would require including a + * full-page image (FPI) of the heap page in the WAL. This situation + * should be rare, as on-access pruning is only attempted when + * pd_prune_xid is valid. + */ + if (reason == PRUNE_ON_ACCESS && + prstate->all_visible && + !do_prune && !do_freeze && + (!BufferIsDirty(heap_buf) || XLogCheckBufferNeedsBackup(heap_buf))) + { + prstate->all_visible = prstate->all_frozen = false; + return false; + } + + if (prstate->all_visible && !PageIsAllVisible(heap_page)) + *do_set_pd_vis = true; + if ((prstate->all_visible && !blk_known_av) || + (prstate->all_frozen && !VM_ALL_FROZEN(relation, heap_blk, &vmbuffer))) + { + *vmflags = VISIBILITYMAP_ALL_VISIBLE; + if (prstate->all_frozen) + *vmflags |= VISIBILITYMAP_ALL_FROZEN; + + do_set_vm = true; + } + + /* + * Now handle two potential corruption cases: + * + * These do not need to happen in a critical section and are not + * WAL-logged. 
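+	 * Both repairs below simply clear bits (the VM bits, plus
+	 * PD_ALL_VISIBLE in the second case), which is safe at the cost of an
+	 * extra visibility recheck later.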
+ * + * As of PostgreSQL 9.2, the visibility map bit should never be set if the + * page-level bit is clear. However, it's possible that in vacuum the bit + * got cleared after heap_vac_scan_next_block() was called, so we must + * recheck with buffer lock before concluding that the VM is corrupt. + * + * This will never trigger for on-access pruning because it couldn't have + * done a previous visibility map lookup and thus will always pass + * blk_known_av as false. A future vacuum will have to take care of fixing + * the corruption. + */ + else if (blk_known_av && !PageIsAllVisible(heap_page) && + visibilitymap_get_status(relation, heap_blk, &vmbuffer) != 0) + { + ereport(WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", + RelationGetRelationName(relation), heap_blk))); + + visibilitymap_clear(relation, heap_blk, vmbuffer, + VISIBILITYMAP_VALID_BITS); + } + + /* + * It's possible for the value returned by + * GetOldestNonRemovableTransactionId() to move backwards, so it's not + * wrong for us to see tuples that appear to not be visible to everyone + * yet, while PD_ALL_VISIBLE is already set. The real safe xmin value + * never moves backwards, but GetOldestNonRemovableTransactionId() is + * conservative and sometimes returns a value that's unnecessarily small, + * so if we see that contradiction it just means that the tuples that we + * think are not visible to everyone yet actually are, and the + * PD_ALL_VISIBLE flag is correct. + * + * There should never be LP_DEAD items on a page with PD_ALL_VISIBLE set, + * however. + */ + else if (prstate->lpdead_items > 0 && PageIsAllVisible(heap_page)) + { + ereport(WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u", + RelationGetRelationName(relation), heap_blk))); + + PageClearAllVisible(heap_page); + MarkBufferDirty(heap_buf); + visibilitymap_clear(relation, heap_blk, vmbuffer, + VISIBILITYMAP_VALID_BITS); + } + + return do_set_vm; +} + +static void +prune_freeze_setup(PruneFreezeParams *params, PruneState *prstate, + TransactionId *new_relfrozen_xid, + MultiXactId *new_relmin_mxid, + PruneFreezeResult *presult) +{ /* Copy parameters to prstate */ - prstate.vistest = vistest; - prstate.mark_unused_now = (options & HEAP_PAGE_PRUNE_MARK_UNUSED_NOW) != 0; - prstate.attempt_freeze = (options & HEAP_PAGE_PRUNE_FREEZE) != 0; - prstate.cutoffs = cutoffs; + prstate->vistest = params->vistest; + prstate->mark_unused_now = + (params->options & HEAP_PAGE_PRUNE_MARK_UNUSED_NOW) != 0; + prstate->attempt_freeze = (params->options & HEAP_PAGE_PRUNE_FREEZE) != 0; + prstate->attempt_update_vm = + (params->options & HEAP_PAGE_PRUNE_UPDATE_VIS) != 0; + prstate->cutoffs = params->cutoffs; /* * Our strategy is to scan the page and make lists of items to change, @@ -502,88 +616,97 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * prunable, we will save the lowest relevant XID in new_prune_xid. Also * initialize the rest of our working state. 
*/ - prstate.new_prune_xid = InvalidTransactionId; - prstate.latest_xid_removed = InvalidTransactionId; - prstate.nredirected = prstate.ndead = prstate.nunused = prstate.nfrozen = 0; - prstate.nroot_items = 0; - prstate.nheaponly_items = 0; + prstate->new_prune_xid = InvalidTransactionId; + prstate->latest_xid_removed = InvalidTransactionId; + prstate->nredirected = prstate->ndead = prstate->nunused = prstate->nfrozen = 0; + prstate->nroot_items = 0; + prstate->nheaponly_items = 0; /* initialize page freezing working state */ - prstate.pagefrz.freeze_required = false; - if (prstate.attempt_freeze) + prstate->pagefrz.freeze_required = false; + if (prstate->attempt_freeze) { Assert(new_relfrozen_xid && new_relmin_mxid); - prstate.pagefrz.FreezePageRelfrozenXid = *new_relfrozen_xid; - prstate.pagefrz.NoFreezePageRelfrozenXid = *new_relfrozen_xid; - prstate.pagefrz.FreezePageRelminMxid = *new_relmin_mxid; - prstate.pagefrz.NoFreezePageRelminMxid = *new_relmin_mxid; + prstate->pagefrz.FreezePageRelfrozenXid = *new_relfrozen_xid; + prstate->pagefrz.NoFreezePageRelfrozenXid = *new_relfrozen_xid; + prstate->pagefrz.FreezePageRelminMxid = *new_relmin_mxid; + prstate->pagefrz.NoFreezePageRelminMxid = *new_relmin_mxid; } else { Assert(new_relfrozen_xid == NULL && new_relmin_mxid == NULL); - prstate.pagefrz.FreezePageRelminMxid = InvalidMultiXactId; - prstate.pagefrz.NoFreezePageRelminMxid = InvalidMultiXactId; - prstate.pagefrz.FreezePageRelfrozenXid = InvalidTransactionId; - prstate.pagefrz.NoFreezePageRelfrozenXid = InvalidTransactionId; + prstate->pagefrz.FreezePageRelminMxid = InvalidMultiXactId; + prstate->pagefrz.NoFreezePageRelminMxid = InvalidMultiXactId; + prstate->pagefrz.FreezePageRelfrozenXid = InvalidTransactionId; + prstate->pagefrz.NoFreezePageRelfrozenXid = InvalidTransactionId; } - prstate.ndeleted = 0; - prstate.live_tuples = 0; - prstate.recently_dead_tuples = 0; - prstate.hastup = false; - prstate.lpdead_items = 0; - prstate.deadoffsets = presult->deadoffsets; + prstate->ndeleted = 0; + prstate->live_tuples = 0; + prstate->recently_dead_tuples = 0; + prstate->hastup = false; + prstate->lpdead_items = 0; + prstate->deadoffsets = presult->deadoffsets; /* - * Caller may update the VM after we're done. We can keep track of - * whether the page will be all-visible and all-frozen after pruning and - * freezing to help the caller to do that. + * Track whether the page could be marked all-visible and/or all-frozen. + * This information is used for opportunistic freezing and for updating + * the visibility map (VM) if requested by the caller. * - * Currently, only VACUUM sets the VM bits. To save the effort, only do - * the bookkeeping if the caller needs it. Currently, that's tied to - * HEAP_PAGE_PRUNE_FREEZE, but it could be a separate flag if you wanted - * to update the VM bits without also freezing or freeze without also - * setting the VM bits. + * Currently, only VACUUM performs freezing, but other callers may in the + * future. Visibility bookkeeping is required not just for setting the VM + * bits, but also for opportunistic freezing: we only consider freezing if + * the page would become all-frozen, or if it would be all-frozen except + * for dead tuples that VACUUM will remove. If attempt_update_vm is false, + * we will not set the VM bit even if the page is found to be all-visible. * - * In addition to telling the caller whether it can set the VM bit, we - * also use 'all_visible' and 'all_frozen' for our own decision-making. 
If - * the whole page would become frozen, we consider opportunistically - * freezing tuples. We will not be able to freeze the whole page if there - * are tuples present that are not visible to everyone or if there are - * dead tuples which are not yet removable. However, dead tuples which - * will be removed by the end of vacuuming should not preclude us from - * opportunistically freezing. Because of that, we do not clear - * all_visible when we see LP_DEAD items. We fix that at the end of the - * function, when we return the value to the caller, so that the caller - * doesn't set the VM bit incorrectly. + * If HEAP_PAGE_PRUNE_UPDATE_VIS is passed without HEAP_PAGE_PRUNE_FREEZE, + * prstate.all_frozen must be initialized to false, since we will not call + * heap_prepare_freeze_tuple() for each tuple. + * + * Dead tuples that will be removed by the end of vacuum should not + * prevent opportunistic freezing. Therefore, we do not clear all_visible + * when we encounter LP_DEAD items. Instead, we correct all_visible after + * deciding whether to freeze, but before updating the VM, to avoid + * setting the VM bit incorrectly. + * + * If neither freezing nor VM updates are requested, we skip the extra + * bookkeeping. In this case, initializing all_visible to false allows + * heap_prune_record_unchanged_lp_normal() to bypass unnecessary work. */ - if (prstate.attempt_freeze) + if (prstate->attempt_freeze) { - prstate.all_visible = true; - prstate.all_frozen = true; + prstate->all_visible = true; + prstate->all_frozen = true; + } + else if (prstate->attempt_update_vm) + { + prstate->all_visible = true; + prstate->all_frozen = false; } else { - /* - * Initializing to false allows skipping the work to update them in - * heap_prune_record_unchanged_lp_normal(). - */ - prstate.all_visible = false; - prstate.all_frozen = false; + prstate->all_visible = false; + prstate->all_frozen = false; } /* - * The visibility cutoff xid is the newest xmin of live tuples on the - * page. In the common case, this will be set as the conflict horizon the - * caller can use for updating the VM. If, at the end of freezing and - * pruning, the page is all-frozen, there is no possibility that any - * running transaction on the standby does not see tuples on the page as - * all-visible, so the conflict horizon remains InvalidTransactionId. + * The visibility cutoff xid is the newest xmin of live, committed tuples + * on the page older than the visibility horizon represented in the + * GlobalVisState. This field is only kept up-to-date if the page is + * all-visible. As soon as a tuple is encountered that is not visible to + * all, this field is unmaintained. As long as it is maintained, it can be + * used to calculate the snapshot conflict horizon when updating the VM + * and/or freezing all the tuples on the page. */ - prstate.visibility_cutoff_xid = InvalidTransactionId; + prstate->visibility_cutoff_xid = InvalidTransactionId; +} - maxoff = PageGetMaxOffsetNumber(page); - tup.t_tableOid = RelationGetRelid(relation); +static void +prune_freeze_plan(PruneState *prstate, BlockNumber blockno, Buffer buffer, Page page, + OffsetNumber maxoff, OffsetNumber *off_loc, HeapTuple tup) +{ + OffsetNumber offnum; /* * Determine HTSV for all tuples, and queue them up for processing as HOT @@ -592,9 +715,9 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * Determining HTSV only once for each tuple is required for correctness, * to deal with cases where running HTSV twice could result in different * results. 
For example, RECENTLY_DEAD can turn to DEAD if another - * checked item causes GlobalVisTestIsRemovableFullXid() to update the - * horizon, or INSERT_IN_PROGRESS can change to DEAD if the inserting - * transaction aborts. + * checked item causes GlobalVisXidVisibleToAll() to update the horizon, + * or INSERT_IN_PROGRESS can change to DEAD if the inserting transaction + * aborts. * * It's also good for performance. Most commonly tuples within a page are * stored at decreasing offsets (while the items are stored at increasing @@ -618,13 +741,13 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, */ *off_loc = offnum; - prstate.processed[offnum] = false; - prstate.htsv[offnum] = -1; + prstate->processed[offnum] = false; + prstate->htsv[offnum] = -1; /* Nothing to do if slot doesn't contain a tuple */ if (!ItemIdIsUsed(itemid)) { - heap_prune_record_unchanged_lp_unused(page, &prstate, offnum); + heap_prune_record_unchanged_lp_unused(page, prstate, offnum); continue; } @@ -634,17 +757,17 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * If the caller set mark_unused_now true, we can set dead line * pointers LP_UNUSED now. */ - if (unlikely(prstate.mark_unused_now)) - heap_prune_record_unused(&prstate, offnum, false); + if (unlikely(prstate->mark_unused_now)) + heap_prune_record_unused(prstate, offnum, false); else - heap_prune_record_unchanged_lp_dead(page, &prstate, offnum); + heap_prune_record_unchanged_lp_dead(page, prstate, offnum); continue; } if (ItemIdIsRedirected(itemid)) { /* This is the start of a HOT chain */ - prstate.root_items[prstate.nroot_items++] = offnum; + prstate->root_items[prstate->nroot_items++] = offnum; continue; } @@ -654,25 +777,19 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * Get the tuple's visibility status and queue it up for processing. */ htup = (HeapTupleHeader) PageGetItem(page, itemid); - tup.t_data = htup; - tup.t_len = ItemIdGetLength(itemid); - ItemPointerSet(&tup.t_self, blockno, offnum); + tup->t_data = htup; + tup->t_len = ItemIdGetLength(itemid); + ItemPointerSet(&tup->t_self, blockno, offnum); - prstate.htsv[offnum] = heap_prune_satisfies_vacuum(&prstate, &tup, - buffer); + prstate->htsv[offnum] = heap_prune_satisfies_vacuum(prstate, tup, + buffer); if (!HeapTupleHeaderIsHeapOnly(htup)) - prstate.root_items[prstate.nroot_items++] = offnum; + prstate->root_items[prstate->nroot_items++] = offnum; else - prstate.heaponly_items[prstate.nheaponly_items++] = offnum; + prstate->heaponly_items[prstate->nheaponly_items++] = offnum; } - /* - * If checksums are enabled, heap_prune_satisfies_vacuum() may have caused - * an FPI to be emitted. - */ - did_tuple_hint_fpi = fpi_before != pgWalUsage.wal_fpi; - /* * Process HOT chains. * @@ -684,30 +801,30 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * the page instead of using the root_items array, also did it in * ascending offset number order.) 
*/ - for (int i = prstate.nroot_items - 1; i >= 0; i--) + for (int i = prstate->nroot_items - 1; i >= 0; i--) { - offnum = prstate.root_items[i]; + offnum = prstate->root_items[i]; /* Ignore items already processed as part of an earlier chain */ - if (prstate.processed[offnum]) + if (prstate->processed[offnum]) continue; /* see preceding loop */ *off_loc = offnum; /* Process this item or chain of items */ - heap_prune_chain(page, blockno, maxoff, offnum, &prstate); + heap_prune_chain(page, blockno, maxoff, offnum, prstate); } /* * Process any heap-only tuples that were not already processed as part of * a HOT chain. */ - for (int i = prstate.nheaponly_items - 1; i >= 0; i--) + for (int i = prstate->nheaponly_items - 1; i >= 0; i--) { - offnum = prstate.heaponly_items[i]; + offnum = prstate->heaponly_items[i]; - if (prstate.processed[offnum]) + if (prstate->processed[offnum]) continue; /* see preceding loop */ @@ -726,7 +843,7 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * return true for an XMIN_INVALID tuple, so this code will work even * when there were sequential updates within the aborted transaction.) */ - if (prstate.htsv[offnum] == HEAPTUPLE_DEAD) + if (prstate->htsv[offnum] == HEAPTUPLE_DEAD) { ItemId itemid = PageGetItemId(page, offnum); HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, itemid); @@ -734,8 +851,8 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, if (likely(!HeapTupleHeaderIsHotUpdated(htup))) { HeapTupleHeaderAdvanceConflictHorizon(htup, - &prstate.latest_xid_removed); - heap_prune_record_unused(&prstate, offnum, true); + &prstate->latest_xid_removed); + heap_prune_record_unused(prstate, offnum, true); } else { @@ -752,7 +869,7 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, } } else - heap_prune_record_unchanged_lp_normal(page, &prstate, offnum); + heap_prune_record_unchanged_lp_normal(page, prstate, offnum); } /* We should now have processed every tuple exactly once */ @@ -763,12 +880,110 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, { *off_loc = offnum; - Assert(prstate.processed[offnum]); + Assert(prstate->processed[offnum]); } #endif + /* + * After processing all the live tuples on the page, if the newest xmin + * amongst them is not visible to everyone, the page cannot be + * all-visible. + */ + if (prstate->all_visible && + TransactionIdIsNormal(prstate->visibility_cutoff_xid) && + !GlobalVisXidVisibleToAll(prstate->vistest, prstate->visibility_cutoff_xid)) + prstate->all_visible = prstate->all_frozen = false; + /* Clear the offset information once we have processed the given page. */ *off_loc = InvalidOffsetNumber; +} + +/* + * Prune and repair fragmentation and potentially freeze tuples on the + * specified page. If the page's visibility status has changed, update it in + * the VM. + * + * Caller must have pin and buffer cleanup lock on the page. Note that we + * don't update the FSM information for page on caller's behalf. Caller might + * also need to account for a reduction in the length of the line pointer + * array following array truncation by us. + * + * params contains the input parameters used to control freezing and pruning + * behavior. See the definition of PruneFreezeParams for more on what each + * parameter does. + * + * If the HEAP_PRUNE_FREEZE option is set in params, we will freeze tuples if + * it's required in order to advance relfrozenxid / relminmxid, or if it's + * considered advantageous for overall system performance to do so now. 
The + * 'params.cutoffs', 'presult', 'new_relfrozen_xid' and 'new_relmin_mxid' + * arguments are required when freezing. + * + * If HEAP_PAGE_PRUNE_UPDATE_VIS is set in params and the visibility status of + * the page has changed, we will update the VM at the same time as pruning and + * freezing the heap page. We will also update presult->old_vmbits and + * presult->new_vmbits with the state of the VM before and after updating it + * for the caller to use in bookkeeping. + * + * presult contains output parameters needed by callers, such as the number of + * tuples removed and the offsets of dead items on the page after pruning. + * heap_page_prune_and_freeze() is responsible for initializing it. Required + * by all callers. + * + * off_loc is the offset location required by the caller to use in error + * callback. + * + * new_relfrozen_xid and new_relmin_mxid must provided by the caller if the + * HEAP_PRUNE_FREEZE option is set in params. On entry, they contain the + * oldest XID and multi-XID seen on the relation so far. They will be updated + * with oldest values present on the page after pruning. After processing the + * whole relation, VACUUM can use these values as the new + * relfrozenxid/relminmxid for the relation. + */ +void +heap_page_prune_and_freeze(PruneFreezeParams *params, + PruneFreezeResult *presult, + OffsetNumber *off_loc, + TransactionId *new_relfrozen_xid, + MultiXactId *new_relmin_mxid) +{ + Buffer buffer = params->buffer; + Buffer vmbuffer = params->vmbuffer; + Page page = BufferGetPage(buffer); + BlockNumber blockno = BufferGetBlockNumber(buffer); + OffsetNumber maxoff; + PruneState prstate; + HeapTupleData tup; + bool do_freeze; + bool do_prune; + bool do_hint_prune; + bool do_set_vm; + bool do_set_pd_vis; + bool did_tuple_hint_fpi; + int64 fpi_before = pgWalUsage.wal_fpi; + TransactionId frz_conflict_horizon = InvalidTransactionId; + TransactionId conflict_xid = InvalidTransactionId; + uint8 new_vmbits = 0; + uint8 old_vmbits = 0; + + maxoff = PageGetMaxOffsetNumber(page); + tup.t_tableOid = RelationGetRelid(params->relation); + + /* Initialize needed state in prstate */ + prune_freeze_setup(params, &prstate, new_relfrozen_xid, new_relmin_mxid, presult); + + /* + * Examine all line pointers and tuple visibility information to determine + * which line pointers should change state and which tuples may be frozen. + * Prepare queue of state changes to later be executed in a critical + * section. + */ + prune_freeze_plan(&prstate, blockno, buffer, page, maxoff, off_loc, &tup); + + /* + * If checksums are enabled, heap_prune_satisfies_vacuum() may have caused + * an FPI to be emitted. + */ + did_tuple_hint_fpi = fpi_before != pgWalUsage.wal_fpi; do_prune = prstate.nredirected > 0 || prstate.ndead > 0 || @@ -786,11 +1001,59 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * Decide if we want to go ahead with freezing according to the freeze * plans we prepared, or not. */ - do_freeze = heap_page_will_freeze(relation, buffer, + do_freeze = heap_page_will_freeze(params->relation, buffer, did_tuple_hint_fpi, do_prune, do_hint_prune, - &prstate); + &prstate, + &frz_conflict_horizon); + + /* + * While scanning the line pointers, we did not clear + * all_visible/all_frozen when encountering LP_DEAD items because we + * wanted the decision whether or not to freeze the page to be unaffected + * by the short-term presence of LP_DEAD items. These LP_DEAD items are + * effectively assumed to be LP_UNUSED items in the making. 
It doesn't + * matter which vacuum heap pass (initial pass or final pass) ends up + * setting the page all-frozen, as long as the ongoing VACUUM does it. + * + * Now that we finished determining whether or not to freeze the page, + * update all_visible and all_frozen so that they reflect the true state + * of the page for setting PD_ALL_VISIBLE and VM bits. + */ + if (prstate.lpdead_items > 0) + prstate.all_visible = prstate.all_frozen = false; + + Assert(!prstate.all_frozen || prstate.all_visible); + Assert(!prstate.all_visible || (prstate.lpdead_items == 0)); + + /* + * Decide whether to set the page-level PD_ALL_VISIBLE bit and the VM bits + * based on information from the VM and the all_visible/all_frozen flags. + * + * While it is valid for PD_ALL_VISIBLE to be set when the corresponding + * VM bit is clear, we strongly prefer to keep them in sync. + * + * Accordingly, we also allow updating only the VM when PD_ALL_VISIBLE has + * already been set. Setting only the VM is most common when setting an + * already all-visible page all-frozen. + */ + do_set_vm = heap_page_will_set_vis(params->relation, + blockno, buffer, vmbuffer, params->blk_known_av, + params->reason, do_prune, do_freeze, + &prstate, &new_vmbits, &do_set_pd_vis); + + /* We should only set the VM if PD_ALL_VISIBLE is set or will be */ + Assert(!do_set_vm || do_set_pd_vis || PageIsAllVisible(page)); + + conflict_xid = get_conflict_xid(do_prune, do_freeze, do_set_vm, + prstate.latest_xid_removed, frz_conflict_horizon, + prstate.visibility_cutoff_xid, params->blk_known_av, + (do_set_vm && (new_vmbits & VISIBILITYMAP_ALL_FROZEN))); + + /* Lock vmbuffer before entering a critical section */ + if (do_set_vm) + LockBuffer(vmbuffer, BUFFER_LOCK_EXCLUSIVE); /* Any error while applying the changes is critical */ START_CRIT_SECTION(); @@ -812,14 +1075,17 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, /* * If that's all we had to do to the page, this is a non-WAL-logged - * hint. If we are going to freeze or prune the page, we will mark - * the buffer dirty below. + * hint. If we are going to freeze or prune the page or set + * PD_ALL_VISIBLE, we will mark the buffer dirty below. + * + * Setting PD_ALL_VISIBLE is fully WAL-logged because it is forbidden + * for the VM to be set and PD_ALL_VISIBLE to be clear. */ - if (!do_freeze && !do_prune) + if (!do_freeze && !do_prune && !do_set_pd_vis) MarkBufferDirtyHint(buffer, true); } - if (do_prune || do_freeze) + if (do_prune || do_freeze || do_set_vm) { /* Apply the planned item changes and repair page fragmentation. */ if (do_prune) @@ -833,54 +1099,43 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, if (do_freeze) heap_freeze_prepared_tuples(buffer, prstate.frozen, prstate.nfrozen); - MarkBufferDirty(buffer); + if (do_set_pd_vis) + PageSetAllVisible(page); - /* - * Emit a WAL XLOG_HEAP2_PRUNE* record showing what we did - */ - if (RelationNeedsWAL(relation)) + if (do_prune || do_freeze || do_set_pd_vis) + MarkBufferDirty(buffer); + + if (do_set_vm) { - /* - * The snapshotConflictHorizon for the whole record should be the - * most conservative of all the horizons calculated for any of the - * possible modifications. If this record will prune tuples, any - * transactions on the standby older than the youngest xmax of the - * most recently removed tuple this record will prune will - * conflict. If this record will freeze tuples, any transactions - * on the standby with xids older than the youngest tuple this - * record will freeze will conflict. 
- */ - TransactionId frz_conflict_horizon = InvalidTransactionId; - TransactionId conflict_xid; + Assert(PageIsAllVisible(page)); - /* - * We can use the visibility_cutoff_xid as our cutoff for - * conflicts when the whole page is eligible to become all-frozen - * in the VM once we're done with it. Otherwise we generate a - * conservative cutoff by stepping back from OldestXmin. - */ - if (do_freeze) + old_vmbits = visibilitymap_set(blockno, + vmbuffer, new_vmbits, + params->relation->rd_locator); + if (old_vmbits == new_vmbits) { - if (prstate.all_visible && prstate.all_frozen) - frz_conflict_horizon = prstate.visibility_cutoff_xid; - else - { - /* Avoids false conflicts when hot_standby_feedback in use */ - frz_conflict_horizon = prstate.cutoffs->OldestXmin; - TransactionIdRetreat(frz_conflict_horizon); - } + LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK); + /* Unset so we don't emit WAL since no change occurred */ + do_set_vm = false; } + } - if (TransactionIdFollows(frz_conflict_horizon, prstate.latest_xid_removed)) - conflict_xid = frz_conflict_horizon; - else - conflict_xid = prstate.latest_xid_removed; - - log_heap_prune_and_freeze(relation, buffer, - InvalidBuffer, /* vmbuffer */ - 0, /* vmflags */ + /* + * Emit a WAL XLOG_HEAP2_PRUNE* record showing what we did. If we were + * only updating the VM and it turns out it was already set, we will + * have unset do_set_vm earlier. As such, check it again before + * emitting the record. + */ + if (RelationNeedsWAL(params->relation) && + (do_prune || do_freeze || do_set_vm)) + { + log_heap_prune_and_freeze(params->relation, buffer, + do_set_vm ? vmbuffer : InvalidBuffer, + do_set_vm ? new_vmbits : 0, conflict_xid, - true, reason, + true, /* cleanup lock */ + do_set_pd_vis, + params->reason, prstate.frozen, prstate.nfrozen, prstate.redirected, prstate.nredirected, prstate.nowdead, prstate.ndead, @@ -890,50 +1145,46 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, END_CRIT_SECTION(); - /* Copy information back for caller */ - presult->ndeleted = prstate.ndeleted; - presult->nnewlpdead = prstate.ndead; - presult->nfrozen = prstate.nfrozen; - presult->live_tuples = prstate.live_tuples; - presult->recently_dead_tuples = prstate.recently_dead_tuples; + if (do_set_vm) + LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK); /* - * It was convenient to ignore LP_DEAD items in all_visible earlier on to - * make the choice of whether or not to freeze the page unaffected by the - * short-term presence of LP_DEAD items. These LP_DEAD items were - * effectively assumed to be LP_UNUSED items in the making. It doesn't - * matter which vacuum heap pass (initial pass or final pass) ends up - * setting the page all-frozen, as long as the ongoing VACUUM does it. - * - * Now that freezing has been finalized, unset all_visible if there are - * any LP_DEAD items on the page. It needs to reflect the present state - * of the page, as expected by our caller. + * During its second pass over the heap, VACUUM calls + * heap_page_would_be_all_visible() to determine whether a page is + * all-visible and all-frozen. The logic here is similar. After completing + * pruning and freezing, use an assertion to verify that our results + * remain consistent with heap_page_would_be_all_visible(). 
*/ - if (prstate.all_visible && prstate.lpdead_items == 0) - { - presult->all_visible = prstate.all_visible; - presult->all_frozen = prstate.all_frozen; - } - else +#ifdef USE_ASSERT_CHECKING + if (prstate.all_visible) { - presult->all_visible = false; - presult->all_frozen = false; + TransactionId debug_cutoff; + bool debug_all_frozen; + + Assert(prstate.lpdead_items == 0); + + if (!heap_page_is_all_visible(params->relation, buffer, + prstate.vistest, + &debug_all_frozen, + &debug_cutoff, off_loc)) + Assert(false); + + Assert(prstate.all_frozen == debug_all_frozen); + + Assert(!TransactionIdIsValid(debug_cutoff) || + debug_cutoff == prstate.visibility_cutoff_xid); } +#endif + /* Copy information back for caller */ + presult->ndeleted = prstate.ndeleted; + presult->nnewlpdead = prstate.ndead; + presult->nfrozen = prstate.nfrozen; + presult->live_tuples = prstate.live_tuples; + presult->recently_dead_tuples = prstate.recently_dead_tuples; presult->hastup = prstate.hastup; - - /* - * For callers planning to update the visibility map, the conflict horizon - * for that record must be the newest xmin on the page. However, if the - * page is completely frozen, there can be no conflict and the - * vm_conflict_horizon should remain InvalidTransactionId. This includes - * the case that we just froze all the tuples; the prune-freeze record - * included the conflict XID already so the caller doesn't need it. - */ - if (presult->all_frozen) - presult->vm_conflict_horizon = InvalidTransactionId; - else - presult->vm_conflict_horizon = prstate.visibility_cutoff_xid; + presult->new_vmbits = new_vmbits; + presult->old_vmbits = old_vmbits; presult->lpdead_items = prstate.lpdead_items; /* the presult->deadoffsets array was already filled in */ @@ -983,11 +1234,11 @@ heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer) * Determine whether or not the tuple is considered dead when compared * with the provided GlobalVisState. On-access pruning does not provide * VacuumCutoffs. And for vacuum, even if the tuple's xmax is not older - * than OldestXmin, GlobalVisTestIsRemovableXid() could find the row dead - * if the GlobalVisState has been updated since the beginning of vacuuming + * than OldestXmin, GlobalVisXidVisibleToAll() could find the row dead if + * the GlobalVisState has been updated since the beginning of vacuuming * the relation. */ - if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after)) + if (GlobalVisXidVisibleToAll(prstate->vistest, dead_after)) return HEAPTUPLE_DEAD; return res; @@ -1299,8 +1550,11 @@ heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum, /* * Deliberately delay unsetting all_visible until later during pruning. - * Removable dead tuples shouldn't preclude freezing the page. + * Removable dead tuples shouldn't preclude freezing the page. If we won't + * attempt freezing, just unset all-visible now, though. 
*/ + if (!prstate->attempt_freeze) + prstate->all_visible = prstate->all_frozen = false; /* Record the dead offset for vacuum */ prstate->deadoffsets[prstate->lpdead_items++] = offnum; @@ -1424,9 +1678,11 @@ heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumb { TransactionId xmin; + Assert(prstate->attempt_update_vm); + if (!HeapTupleHeaderXminCommitted(htup)) { - prstate->all_visible = false; + prstate->all_visible = prstate->all_frozen = false; break; } @@ -1439,19 +1695,6 @@ heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumb */ xmin = HeapTupleHeaderGetXmin(htup); - /* - * For now always use prstate->cutoffs for this test, because - * we only update 'all_visible' when freezing is requested. We - * could use GlobalVisTestIsRemovableXid instead, if a - * non-freezing caller wanted to set the VM bit. - */ - Assert(prstate->cutoffs); - if (!TransactionIdPrecedes(xmin, prstate->cutoffs->OldestXmin)) - { - prstate->all_visible = false; - break; - } - /* Track newest xmin on page. */ if (TransactionIdFollows(xmin, prstate->visibility_cutoff_xid) && TransactionIdIsNormal(xmin)) @@ -1461,7 +1704,7 @@ heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumb case HEAPTUPLE_RECENTLY_DEAD: prstate->recently_dead_tuples++; - prstate->all_visible = false; + prstate->all_visible = prstate->all_frozen = false; /* * This tuple will soon become DEAD. Update the hint field so @@ -1480,7 +1723,7 @@ heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumb * assumption is a bit shaky, but it is what acquire_sample_rows() * does, so be consistent. */ - prstate->all_visible = false; + prstate->all_visible = prstate->all_frozen = false; /* * If we wanted to optimize for aborts, we might consider marking @@ -1498,7 +1741,7 @@ heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumb * will commit and update the counters after we report. */ prstate->live_tuples++; - prstate->all_visible = false; + prstate->all_visible = prstate->all_frozen = false; /* * This tuple may soon become DEAD. Update the hint field so that @@ -1566,8 +1809,11 @@ heap_prune_record_unchanged_lp_dead(Page page, PruneState *prstate, OffsetNumber * Similarly, don't unset all_visible until later, at the end of * heap_page_prune_and_freeze(). This will allow us to attempt to freeze * the page after pruning. As long as we unset it before updating the - * visibility map, this will be correct. + * visibility map, this will be correct. If we won't attempt freezing, + * though, just unset all-visible now. */ + if (!prstate->attempt_freeze) + prstate->all_visible = prstate->all_frozen = false; /* Record the dead offset for vacuum */ prstate->deadoffsets[prstate->lpdead_items++] = offnum; @@ -2069,19 +2315,82 @@ heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, return nplans; } +/* + * Calculate the conflict horizon for the whole XLOG_HEAP2_PRUNE_VACUUM_SCAN + * or XLOG_HEAP2_PRUNE_ON_ACCESS record. + */ +static TransactionId +get_conflict_xid(bool do_prune, bool do_freeze, bool do_set_vm, + TransactionId latest_xid_removed, TransactionId frz_conflict_horizon, + TransactionId visibility_cutoff_xid, bool blk_already_av, + bool set_blk_all_frozen) +{ + TransactionId conflict_xid; + + /* + * We can omit the snapshot conflict horizon if we are not pruning or + * freezing any tuples and are setting an already all-visible page + * all-frozen in the VM. 
In this case, all of the tuples on the page must + * already be visible to all MVCC snapshots on the standby. + */ + if (!do_prune && !do_freeze && + do_set_vm && blk_already_av && set_blk_all_frozen) + return InvalidTransactionId; + + /* + * The snapshotConflictHorizon for the whole record should be the most + * conservative of all the horizons calculated for any of the possible + * modifications. If this record will prune tuples, any transactions on + * the standby older than the youngest xmax of the most recently removed + * tuple this record will prune will conflict. If this record will freeze + * tuples, any transactions on the standby with xids older than the + * youngest tuple this record will freeze will conflict. + */ + conflict_xid = InvalidTransactionId; + + /* + * If we are updating the VM, the conflict horizon is almost always the + * visibility cutoff XID. + * + * Separately, if we are freezing any tuples, as an optimization, we can + * use the visibility_cutoff_xid as the conflict horizon if the page will + * be all-frozen. This is true even if there are LP_DEAD line pointers + * because we ignored those when maintaining the visibility_cutoff_xid. + * This will have been calculated earlier as the frz_conflict_horizon when + * we determined we would freeze. + */ + if (do_set_vm) + conflict_xid = visibility_cutoff_xid; + else if (do_freeze) + conflict_xid = frz_conflict_horizon; + + /* + * If we are removing tuples with a younger xmax than our so far + * calculated conflict_xid, we must use this as our horizon. + */ + if (TransactionIdFollows(latest_xid_removed, conflict_xid)) + conflict_xid = latest_xid_removed; + + return conflict_xid; +} + /* * Write an XLOG_HEAP2_PRUNE* WAL record * * This is used for several different page maintenance operations: * - * - Page pruning, in VACUUM's 1st pass or on access: Some items are + * - Page pruning, in vacuum phase I or on-access: Some items are * redirected, some marked dead, and some removed altogether. * - * - Freezing: Items are marked as 'frozen'. + * - Freezing: During vacuum phase I, items are marked as 'frozen' + * + * - Reaping: During vacuum phase III, items that are already LP_DEAD are + * marked as unused. * - * - Vacuum, 2nd pass: Items that are already LP_DEAD are marked as unused. + * - VM updates: After vacuum phases I and III and on-access, the heap page + * may be marked all-visible and all-frozen. * - * They have enough commonalities that we use a single WAL record for them + * These changes all happen together, so we use a single WAL record for them * all. * * If replaying the record requires a cleanup lock, pass cleanup_lock = true. @@ -2093,6 +2402,15 @@ heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, * case, vmbuffer should already have been updated and marked dirty and should * still be pinned and locked. * + * set_pd_all_vis indicates that we set PD_ALL_VISIBLE and thus should update + * the page LSN when checksums/wal_log_hints are enabled even if we did not + * prune or freeze tuples on the page. + * + * In some cases, such as when heap_page_prune_and_freeze() is setting an + * already marked all-visible page all-frozen, PD_ALL_VISIBLE may already be + * set. So, it is possible for vmflags to be non-zero and set_pd_all_vis to be + * false. + * * Note: This function scribbles on the 'frozen' array. * * Note: This is called in a critical section, so careful what you do here. 
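To make the horizon-selection rule above concrete, here is a minimal, self-contained sketch. It is illustrative only: plain uint32_t values stand in for TransactionId, XID wraparound is deliberately ignored, and demo_conflict_xid() is a hypothetical stand-in for the static get_conflict_xid(), not PostgreSQL code.

/*
 * Illustrative sketch of the "most conservative horizon" rule used by
 * get_conflict_xid().  Not PostgreSQL code: uint32_t stands in for
 * TransactionId and wraparound handling is omitted for brevity.
 */
#include <assert.h>
#include <stdint.h>

#define DEMO_INVALID_XID 0

static uint32_t
demo_conflict_xid(uint32_t latest_xid_removed, uint32_t frz_conflict_horizon,
                  uint32_t visibility_cutoff_xid, int do_set_vm, int do_freeze)
{
    uint32_t horizon = DEMO_INVALID_XID;

    /* setting the VM uses the visibility cutoff; otherwise freezing decides */
    if (do_set_vm)
        horizon = visibility_cutoff_xid;
    else if (do_freeze)
        horizon = frz_conflict_horizon;

    /* a removed tuple with a newer xmax always pushes the horizon forward */
    if (latest_xid_removed > horizon)
        horizon = latest_xid_removed;

    return horizon;
}

int
main(void)
{
    /* pruned a tuple with xmax 750 while the newest surviving xmin is 740 */
    assert(demo_conflict_xid(750, DEMO_INVALID_XID, 740, 1, 0) == 750);

    /* nothing pruned or frozen; only the VM is set, so the cutoff 740 wins */
    assert(demo_conflict_xid(DEMO_INVALID_XID, DEMO_INVALID_XID, 740, 1, 0) == 740);

    return 0;
}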
@@ -2102,6 +2420,7 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer, Buffer vmbuffer, uint8 vmflags, TransactionId conflict_xid, bool cleanup_lock, + bool set_pd_all_vis, PruneReason reason, HeapTupleFreeze *frozen, int nfrozen, OffsetNumber *redirected, int nredirected, @@ -2138,7 +2457,7 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer, */ if (!do_prune && nfrozen == 0 && - (!do_set_vm || !XLogHintBitIsNeeded())) + (!set_pd_all_vis || !XLogHintBitIsNeeded())) regbuf_flags_heap |= REGBUF_NO_IMAGE; /* @@ -2256,7 +2575,8 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer, * See comment at the top of the function about regbuf_flags_heap for * details on when we can advance the page LSN. */ - if (do_prune || nfrozen > 0 || (do_set_vm && XLogHintBitIsNeeded())) + if (do_prune || nfrozen > 0 || + (set_pd_all_vis && XLogHintBitIsNeeded())) { Assert(BufferIsDirty(buffer)); PageSetLSN(BufferGetPage(buffer), recptr); diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 61fe623cc608..5e3c1d503783 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -464,15 +464,8 @@ static void dead_items_add(LVRelState *vacrel, BlockNumber blkno, OffsetNumber * static void dead_items_reset(LVRelState *vacrel); static void dead_items_cleanup(LVRelState *vacrel); -#ifdef USE_ASSERT_CHECKING -static bool heap_page_is_all_visible(Relation rel, Buffer buf, - TransactionId OldestXmin, - bool *all_frozen, - TransactionId *visibility_cutoff_xid, - OffsetNumber *logging_offnum); -#endif static bool heap_page_would_be_all_visible(Relation rel, Buffer buf, - TransactionId OldestXmin, + GlobalVisState *vistest, OffsetNumber *deadoffsets, int ndeadoffsets, bool *all_frozen, @@ -1886,9 +1879,12 @@ lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, */ if (!PageIsAllVisible(page)) { + /* Lock vmbuffer before entering critical section */ + LockBuffer(vmbuffer, BUFFER_LOCK_EXCLUSIVE); + START_CRIT_SECTION(); - /* mark buffer dirty before writing a WAL record */ + /* Mark buffer dirty before writing any WAL records */ MarkBufferDirty(buf); /* @@ -1905,13 +1901,34 @@ lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, log_newpage_buffer(buf, true); PageSetAllVisible(page); - visibilitymap_set(vacrel->rel, blkno, buf, - InvalidXLogRecPtr, - vmbuffer, InvalidTransactionId, + visibilitymap_set(blkno, + vmbuffer, VISIBILITYMAP_ALL_VISIBLE | - VISIBILITYMAP_ALL_FROZEN); + VISIBILITYMAP_ALL_FROZEN, + vacrel->rel->rd_locator); + + /* + * Emit WAL for setting PD_ALL_VISIBLE on the heap page and + * setting the VM. 
+ */ + if (RelationNeedsWAL(vacrel->rel)) + log_heap_prune_and_freeze(vacrel->rel, buf, + vmbuffer, + VISIBILITYMAP_ALL_VISIBLE | + VISIBILITYMAP_ALL_FROZEN, + InvalidTransactionId, /* conflict xid */ + false, /* cleanup lock */ + true, /* set_pd_all_vis */ + PRUNE_VACUUM_SCAN, /* reason */ + NULL, 0, + NULL, 0, + NULL, 0, + NULL, 0); + END_CRIT_SECTION(); + LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK); + /* Count the newly all-frozen pages for logging */ vacrel->vm_new_visible_pages++; vacrel->vm_new_visible_frozen_pages++; @@ -1965,10 +1982,18 @@ lazy_scan_prune(LVRelState *vacrel, { Relation rel = vacrel->rel; PruneFreezeResult presult; - int prune_options = 0; + PruneFreezeParams params; Assert(BufferGetBlockNumber(buf) == blkno); + params.relation = rel; + params.buffer = buf; + params.reason = PRUNE_VACUUM_SCAN; + params.cutoffs = &vacrel->cutoffs; + params.vistest = vacrel->vistest; + params.vmbuffer = vmbuffer; + params.blk_known_av = all_visible_according_to_vm; + /* * Prune all HOT-update chains and potentially freeze tuples on this page. * @@ -1984,12 +2009,12 @@ lazy_scan_prune(LVRelState *vacrel, * tuples. Pruning will have determined whether or not the page is * all-visible. */ - prune_options = HEAP_PAGE_PRUNE_FREEZE; + params.options = HEAP_PAGE_PRUNE_FREEZE | HEAP_PAGE_PRUNE_UPDATE_VIS; if (vacrel->nindexes == 0) - prune_options |= HEAP_PAGE_PRUNE_MARK_UNUSED_NOW; + params.options |= HEAP_PAGE_PRUNE_MARK_UNUSED_NOW; - heap_page_prune_and_freeze(rel, buf, vacrel->vistest, prune_options, - &vacrel->cutoffs, &presult, PRUNE_VACUUM_SCAN, + heap_page_prune_and_freeze(¶ms, + &presult, &vacrel->offnum, &vacrel->NewRelfrozenXid, &vacrel->NewRelminMxid); @@ -2007,34 +2032,6 @@ lazy_scan_prune(LVRelState *vacrel, vacrel->new_frozen_tuple_pages++; } - /* - * VACUUM will call heap_page_is_all_visible() during the second pass over - * the heap to determine all_visible and all_frozen for the page -- this - * is a specialized version of the logic from this function. Now that - * we've finished pruning and freezing, make sure that we're in total - * agreement with heap_page_is_all_visible() using an assertion. - */ -#ifdef USE_ASSERT_CHECKING - /* Note that all_frozen value does not matter when !all_visible */ - if (presult.all_visible) - { - TransactionId debug_cutoff; - bool debug_all_frozen; - - Assert(presult.lpdead_items == 0); - - if (!heap_page_is_all_visible(vacrel->rel, buf, - vacrel->cutoffs.OldestXmin, &debug_all_frozen, - &debug_cutoff, &vacrel->offnum)) - Assert(false); - - Assert(presult.all_frozen == debug_all_frozen); - - Assert(!TransactionIdIsValid(debug_cutoff) || - debug_cutoff == presult.vm_conflict_horizon); - } -#endif - /* * Now save details of the LP_DEAD items from the page in vacrel */ @@ -2068,168 +2065,26 @@ lazy_scan_prune(LVRelState *vacrel, /* Did we find LP_DEAD items? */ *has_lpdead_items = (presult.lpdead_items > 0); - Assert(!presult.all_visible || !(*has_lpdead_items)); - /* - * Handle setting visibility map bit based on information from the VM (as - * of last heap_vac_scan_next_block() call), and from all_visible and - * all_frozen variables + * For the purposes of logging, count whether or not the page was newly + * set all-visible and, potentially, all-frozen. 
*/ - if (!all_visible_according_to_vm && presult.all_visible) + if ((presult.old_vmbits & VISIBILITYMAP_ALL_VISIBLE) == 0 && + (presult.new_vmbits & VISIBILITYMAP_ALL_VISIBLE) != 0) { - uint8 old_vmbits; - uint8 flags = VISIBILITYMAP_ALL_VISIBLE; - - if (presult.all_frozen) - { - Assert(!TransactionIdIsValid(presult.vm_conflict_horizon)); - flags |= VISIBILITYMAP_ALL_FROZEN; - } - - /* - * It should never be the case that the visibility map page is set - * while the page-level bit is clear, but the reverse is allowed (if - * checksums are not enabled). Regardless, set both bits so that we - * get back in sync. - * - * NB: If the heap page is all-visible but the VM bit is not set, we - * don't need to dirty the heap page. However, if checksums are - * enabled, we do need to make sure that the heap page is dirtied - * before passing it to visibilitymap_set(), because it may be logged. - * Given that this situation should only happen in rare cases after a - * crash, it is not worth optimizing. - */ - PageSetAllVisible(page); - MarkBufferDirty(buf); - old_vmbits = visibilitymap_set(vacrel->rel, blkno, buf, - InvalidXLogRecPtr, - vmbuffer, presult.vm_conflict_horizon, - flags); - - /* - * If the page wasn't already set all-visible and/or all-frozen in the - * VM, count it as newly set for logging. - */ - if ((old_vmbits & VISIBILITYMAP_ALL_VISIBLE) == 0) - { - vacrel->vm_new_visible_pages++; - if (presult.all_frozen) - { - vacrel->vm_new_visible_frozen_pages++; - *vm_page_frozen = true; - } - } - else if ((old_vmbits & VISIBILITYMAP_ALL_FROZEN) == 0 && - presult.all_frozen) + vacrel->vm_new_visible_pages++; + if ((presult.new_vmbits & VISIBILITYMAP_ALL_FROZEN) != 0) { - vacrel->vm_new_frozen_pages++; + vacrel->vm_new_visible_frozen_pages++; *vm_page_frozen = true; } } - - /* - * As of PostgreSQL 9.2, the visibility map bit should never be set if the - * page-level bit is clear. However, it's possible that the bit got - * cleared after heap_vac_scan_next_block() was called, so we must recheck - * with buffer lock before concluding that the VM is corrupt. - */ - else if (all_visible_according_to_vm && !PageIsAllVisible(page) && - visibilitymap_get_status(vacrel->rel, blkno, &vmbuffer) != 0) + else if ((presult.old_vmbits & VISIBILITYMAP_ALL_FROZEN) == 0 && + (presult.new_vmbits & VISIBILITYMAP_ALL_FROZEN) != 0) { - ereport(WARNING, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", - vacrel->relname, blkno))); - - visibilitymap_clear(vacrel->rel, blkno, vmbuffer, - VISIBILITYMAP_VALID_BITS); - } - - /* - * It's possible for the value returned by - * GetOldestNonRemovableTransactionId() to move backwards, so it's not - * wrong for us to see tuples that appear to not be visible to everyone - * yet, while PD_ALL_VISIBLE is already set. The real safe xmin value - * never moves backwards, but GetOldestNonRemovableTransactionId() is - * conservative and sometimes returns a value that's unnecessarily small, - * so if we see that contradiction it just means that the tuples that we - * think are not visible to everyone yet actually are, and the - * PD_ALL_VISIBLE flag is correct. - * - * There should never be LP_DEAD items on a page with PD_ALL_VISIBLE set, - * however. 
- */ - else if (presult.lpdead_items > 0 && PageIsAllVisible(page)) - { - ereport(WARNING, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u", - vacrel->relname, blkno))); - - PageClearAllVisible(page); - MarkBufferDirty(buf); - visibilitymap_clear(vacrel->rel, blkno, vmbuffer, - VISIBILITYMAP_VALID_BITS); - } - - /* - * If the all-visible page is all-frozen but not marked as such yet, mark - * it as all-frozen. Note that all_frozen is only valid if all_visible is - * true, so we must check both all_visible and all_frozen. - */ - else if (all_visible_according_to_vm && presult.all_visible && - presult.all_frozen && !VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer)) - { - uint8 old_vmbits; - - /* - * Avoid relying on all_visible_according_to_vm as a proxy for the - * page-level PD_ALL_VISIBLE bit being set, since it might have become - * stale -- even when all_visible is set - */ - if (!PageIsAllVisible(page)) - { - PageSetAllVisible(page); - MarkBufferDirty(buf); - } - - /* - * Set the page all-frozen (and all-visible) in the VM. - * - * We can pass InvalidTransactionId as our cutoff_xid, since a - * snapshotConflictHorizon sufficient to make everything safe for REDO - * was logged when the page's tuples were frozen. - */ - Assert(!TransactionIdIsValid(presult.vm_conflict_horizon)); - old_vmbits = visibilitymap_set(vacrel->rel, blkno, buf, - InvalidXLogRecPtr, - vmbuffer, InvalidTransactionId, - VISIBILITYMAP_ALL_VISIBLE | - VISIBILITYMAP_ALL_FROZEN); - - /* - * The page was likely already set all-visible in the VM. However, - * there is a small chance that it was modified sometime between - * setting all_visible_according_to_vm and checking the visibility - * during pruning. Check the return value of old_vmbits anyway to - * ensure the visibility map counters used for logging are accurate. - */ - if ((old_vmbits & VISIBILITYMAP_ALL_VISIBLE) == 0) - { - vacrel->vm_new_visible_pages++; - vacrel->vm_new_visible_frozen_pages++; - *vm_page_frozen = true; - } - - /* - * We already checked that the page was not set all-frozen in the VM - * above, so we don't need to test the value of old_vmbits. - */ - else - { - vacrel->vm_new_frozen_pages++; - *vm_page_frozen = true; - } + Assert((presult.new_vmbits & VISIBILITYMAP_ALL_VISIBLE) != 0); + vacrel->vm_new_frozen_pages++; + *vm_page_frozen = true; } return presult.ndeleted; @@ -2886,7 +2741,7 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, * done outside the critical section. */ if (heap_page_would_be_all_visible(vacrel->rel, buffer, - vacrel->cutoffs.OldestXmin, + vacrel->vistest, deadoffsets, num_offsets, &all_frozen, &visibility_cutoff_xid, &vacrel->offnum)) @@ -2932,9 +2787,9 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, * set PD_ALL_VISIBLE. */ PageSetAllVisible(page); - visibilitymap_set_vmbits(blkno, - vmbuffer, vmflags, - vacrel->rel->rd_locator); + visibilitymap_set(blkno, + vmbuffer, vmflags, + vacrel->rel->rd_locator); conflict_xid = visibility_cutoff_xid; } @@ -2951,6 +2806,7 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, vmflags, conflict_xid, false, /* no cleanup lock required */ + (vmflags & VISIBILITYMAP_VALID_BITS) != 0, PRUNE_VACUUM_CLEANUP, NULL, 0, /* frozen */ NULL, 0, /* redirected */ @@ -3638,16 +3494,15 @@ dead_items_cleanup(LVRelState *vacrel) * that expect no LP_DEAD on the page. 
Currently assert-only, but there is no * reason not to use it outside of asserts. */ -static bool +bool heap_page_is_all_visible(Relation rel, Buffer buf, - TransactionId OldestXmin, + GlobalVisState *vistest, bool *all_frozen, TransactionId *visibility_cutoff_xid, OffsetNumber *logging_offnum) { - return heap_page_would_be_all_visible(rel, buf, - OldestXmin, + return heap_page_would_be_all_visible(rel, buf, vistest, NULL, 0, all_frozen, visibility_cutoff_xid, @@ -3668,7 +3523,7 @@ heap_page_is_all_visible(Relation rel, Buffer buf, * Returns true if the page is all-visible other than the provided * deadoffsets and false otherwise. * - * OldestXmin is used to determine visibility. + * vistest is used to determine visibility. * * Output parameters: * @@ -3687,7 +3542,7 @@ heap_page_is_all_visible(Relation rel, Buffer buf, */ static bool heap_page_would_be_all_visible(Relation rel, Buffer buf, - TransactionId OldestXmin, + GlobalVisState *vistest, OffsetNumber *deadoffsets, int ndeadoffsets, bool *all_frozen, @@ -3761,7 +3616,7 @@ heap_page_would_be_all_visible(Relation rel, Buffer buf, /* Visibility checks may do IO or allocate memory */ Assert(CritSectionCount == 0); - switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) + switch (HeapTupleSatisfiesVacuumGlobalVis(&tuple, vistest, buf)) { case HEAPTUPLE_LIVE: { @@ -3780,7 +3635,7 @@ heap_page_would_be_all_visible(Relation rel, Buffer buf, * that everyone sees it as committed? */ xmin = HeapTupleHeaderGetXmin(tuple.t_data); - if (!TransactionIdPrecedes(xmin, OldestXmin)) + if (!GlobalVisXidVisibleToAll(vistest, xmin)) { all_visible = false; *all_frozen = false; diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 2f5e61e23922..a75b5bb6b130 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -14,8 +14,7 @@ * visibilitymap_clear - clear bits for one page in the visibility map * visibilitymap_pin - pin a map page for setting a bit * visibilitymap_pin_ok - check whether correct map page is already pinned - * visibilitymap_set - set bit(s) in a previously pinned page and log - * visibilitymap_set_vmbits - set bit(s) in a pinned page + * visibilitymap_set - set bit(s) in a previously pinned page * visibilitymap_get_status - get status of bits * visibilitymap_count - count number of bits set in visibility map * visibilitymap_prepare_truncate - @@ -220,108 +219,6 @@ visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf) return BufferIsValid(vmbuf) && BufferGetBlockNumber(vmbuf) == mapBlock; } -/* - * visibilitymap_set - set bit(s) on a previously pinned page - * - * recptr is the LSN of the XLOG record we're replaying, if we're in recovery, - * or InvalidXLogRecPtr in normal running. The VM page LSN is advanced to the - * one provided; in normal running, we generate a new XLOG record and set the - * page LSN to that value (though the heap page's LSN may *not* be updated; - * see below). cutoff_xid is the largest xmin on the page being marked - * all-visible; it is needed for Hot Standby, and can be InvalidTransactionId - * if the page contains no tuples. It can also be set to InvalidTransactionId - * when a page that is already all-visible is being marked all-frozen. - * - * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling - * this function. Except in recovery, caller should also pass the heap - * buffer. 
When checksums are enabled and we're not in recovery, we must add - * the heap buffer to the WAL chain to protect it from being torn. - * - * You must pass a buffer containing the correct map page to this function. - * Call visibilitymap_pin first to pin the right one. This function doesn't do - * any I/O. - * - * Returns the state of the page's VM bits before setting flags. - */ -uint8 -visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, - XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid, - uint8 flags) -{ - BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); - uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); - uint8 mapOffset = HEAPBLK_TO_OFFSET(heapBlk); - Page page; - uint8 *map; - uint8 status; - -#ifdef TRACE_VISIBILITYMAP - elog(DEBUG1, "vm_set flags 0x%02X for %s %d", - flags, RelationGetRelationName(rel), heapBlk); -#endif - - Assert(InRecovery || XLogRecPtrIsInvalid(recptr)); - Assert(InRecovery || PageIsAllVisible(BufferGetPage(heapBuf))); - Assert((flags & VISIBILITYMAP_VALID_BITS) == flags); - - /* Must never set all_frozen bit without also setting all_visible bit */ - Assert(flags != VISIBILITYMAP_ALL_FROZEN); - - /* Check that we have the right heap page pinned, if present */ - if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk) - elog(ERROR, "wrong heap buffer passed to visibilitymap_set"); - - Assert(!BufferIsValid(heapBuf) || - BufferIsLockedByMeInMode(heapBuf, BUFFER_LOCK_EXCLUSIVE)); - - /* Check that we have the right VM page pinned */ - if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock) - elog(ERROR, "wrong VM buffer passed to visibilitymap_set"); - - page = BufferGetPage(vmBuf); - map = (uint8 *) PageGetContents(page); - LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE); - - status = (map[mapByte] >> mapOffset) & VISIBILITYMAP_VALID_BITS; - if (flags != status) - { - START_CRIT_SECTION(); - - map[mapByte] |= (flags << mapOffset); - MarkBufferDirty(vmBuf); - - if (RelationNeedsWAL(rel)) - { - if (XLogRecPtrIsInvalid(recptr)) - { - Assert(!InRecovery); - recptr = log_heap_visible(rel, heapBuf, vmBuf, cutoff_xid, flags); - - /* - * If data checksums are enabled (or wal_log_hints=on), we - * need to protect the heap page from being torn. - * - * If not, then we must *not* update the heap page's LSN. In - * this case, the FPI for the heap page was omitted from the - * WAL record inserted above, so it would be incorrect to - * update the heap page's LSN. - */ - if (XLogHintBitIsNeeded()) - { - Page heapPage = BufferGetPage(heapBuf); - - PageSetLSN(heapPage, recptr); - } - } - PageSetLSN(page, recptr); - } - - END_CRIT_SECTION(); - } - - LockBuffer(vmBuf, BUFFER_LOCK_UNLOCK); - return status; -} /* * Set VM (visibility map) flags in the VM block in vmBuf. @@ -344,9 +241,9 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, * rlocator is used only for debugging messages. 
*/ uint8 -visibilitymap_set_vmbits(BlockNumber heapBlk, - Buffer vmBuf, uint8 flags, - const RelFileLocator rlocator) +visibilitymap_set(BlockNumber heapBlk, + Buffer vmBuf, uint8 flags, + const RelFileLocator rlocator) { BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 0492d92d23b1..8d582a8eafd4 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -289,6 +289,32 @@ index_beginscan(Relation heapRelation, return scan; } +/* + * Similar to index_beginscan(), but allows the caller to indicate whether the + * query modifies the underlying base relation. This is used when the caller + * wants to attempt marking pages in the base relation as all-visible in the + * visibility map during on-access pruning. + */ +IndexScanDesc +index_beginscan_vmset(Relation heapRelation, + Relation indexRelation, + Snapshot snapshot, + IndexScanInstrumentation *instrument, + int nkeys, int norderbys, bool modifies_base_rel) +{ + IndexScanDesc scan; + + scan = index_beginscan(heapRelation, + indexRelation, + snapshot, + instrument, + nkeys, norderbys); + + scan->xs_heapfetch->modifies_base_rel = modifies_base_rel; + + return scan; +} + /* * index_beginscan_bitmap - start a scan of an index with amgetbitmap * @@ -620,6 +646,26 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, return scan; } +/* + * Parallel version of index_beginscan_vmset() + */ +IndexScanDesc +index_beginscan_parallel_vmset(Relation heaprel, Relation indexrel, + IndexScanInstrumentation *instrument, + int nkeys, int norderbys, + ParallelIndexScanDesc pscan, + bool modifies_base_rel) +{ + IndexScanDesc scan; + + scan = index_beginscan_parallel(heaprel, indexrel, + instrument, + nkeys, norderbys, + pscan); + scan->xs_heapfetch->modifies_base_rel = modifies_base_rel; + return scan; +} + /* ---------------- * index_getnext_tid - get the next TID from a scan * diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c index ca26d1f0ed15..08461fdf593a 100644 --- a/src/backend/access/rmgrdesc/heapdesc.c +++ b/src/backend/access/rmgrdesc/heapdesc.c @@ -349,13 +349,6 @@ heap2_desc(StringInfo buf, XLogReaderState *record) } } } - else if (info == XLOG_HEAP2_VISIBLE) - { - xl_heap_visible *xlrec = (xl_heap_visible *) rec; - - appendStringInfo(buf, "snapshotConflictHorizon: %u, flags: 0x%02X", - xlrec->snapshotConflictHorizon, xlrec->flags); - } else if (info == XLOG_HEAP2_MULTI_INSERT) { xl_heap_multi_insert *xlrec = (xl_heap_multi_insert *) rec; @@ -461,9 +454,6 @@ heap2_identify(uint8 info) case XLOG_HEAP2_PRUNE_VACUUM_CLEANUP: id = "PRUNE_VACUUM_CLEANUP"; break; - case XLOG_HEAP2_VISIBLE: - id = "VISIBLE"; - break; case XLOG_HEAP2_MULTI_INSERT: id = "MULTI_INSERT"; break; diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index 71ef2e5036f2..1c0eb425ee96 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -536,7 +536,7 @@ vacuumRedirectAndPlaceholder(Relation index, Relation heaprel, Buffer buffer) */ if (dt->tupstate == SPGIST_REDIRECT && (!TransactionIdIsValid(dt->xid) || - GlobalVisTestIsRemovableXid(vistest, dt->xid))) + GlobalVisXidVisibleToAll(vistest, dt->xid))) { dt->tupstate = SPGIST_PLACEHOLDER; Assert(opaque->nRedirection > 0); diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 
5e41404937eb..3e3a0f72a71e 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -49,6 +49,10 @@ char *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD; bool synchronize_seqscans = true; +/* Helper for table_beginscan_parallel() and table_beginscan_parallel_vmset() */ +static TableScanDesc table_beginscan_parallel_common(Relation relation, ParallelTableScanDesc pscan, + uint32 flags); + /* ---------------------------------------------------------------------------- * Slot functions. @@ -162,12 +166,14 @@ table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, } } -TableScanDesc -table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) +/* + * Common helper for table_beginscan_parallel() and table_beginscan_parallel_vmset() + */ +static TableScanDesc +table_beginscan_parallel_common(Relation relation, ParallelTableScanDesc pscan, + uint32 flags) { Snapshot snapshot; - uint32 flags = SO_TYPE_SEQSCAN | - SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator)); @@ -188,6 +194,31 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) pscan, flags); } +TableScanDesc +table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) +{ + uint32 flags = SO_TYPE_SEQSCAN | + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; + + return table_beginscan_parallel_common(relation, pscan, flags); +} + +/* + * Parallel version of table_beginscan_vmset() + */ +TableScanDesc +table_beginscan_parallel_vmset(Relation relation, ParallelTableScanDesc pscan, + bool modifies_rel) +{ + uint32 flags = SO_TYPE_SEQSCAN | + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; + + if (!modifies_rel) + flags |= SO_ALLOW_VM_SET; + + return table_beginscan_parallel_common(relation, pscan, flags); +} + /* ---------------------------------------------------------------------------- * Index scan related functions. 
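As a usage illustration for the parallel entry point added above, a leader that knows its scan is read-only can allow VM setting when it attaches to the shared scan state. This is only a hedged sketch: pcxt, pscan_len, rel, and snapshot are assumed to come from ordinary parallel-query setup, and error handling is omitted.

    ParallelTableScanDesc pscan;
    TableScanDesc scan;

    /* shared scan state allocated and published by the usual parallel setup */
    pscan = shm_toc_allocate(pcxt->toc, pscan_len);
    table_parallelscan_initialize(rel, pscan, snapshot);

    /*
     * A plain SELECT does not modify the relation, so pass modifies_rel =
     * false; the helper then adds SO_ALLOW_VM_SET and on-access pruning may
     * mark all-visible pages in the visibility map.
     */
    scan = table_beginscan_parallel_vmset(rel, pscan, false);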
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 27c9eec697b1..0630a5af79e6 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -916,6 +916,10 @@ InitPlan(QueryDesc *queryDesc, int eflags) break; } + /* If it has a rowmark, the relation is modified */ + estate->es_modified_relids = bms_add_member(estate->es_modified_relids, + rc->rti); + /* Check that relation is a legal target for marking */ if (relation) CheckValidRowMarkRel(relation, rc->markType); diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index fdc65c2b42b3..28a06dcd2444 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -893,6 +893,8 @@ ExecInitResultRelation(EState *estate, ResultRelInfo *resultRelInfo, estate->es_result_relations = (ResultRelInfo **) palloc0(estate->es_range_table_size * sizeof(ResultRelInfo *)); estate->es_result_relations[rti - 1] = resultRelInfo; + estate->es_modified_relids = bms_add_member(estate->es_modified_relids, + rti); /* * Saving in the list allows to avoid needlessly traversing the whole diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index bf24f3d7fe0a..af6db9f79191 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -105,11 +105,16 @@ BitmapTableScanSetup(BitmapHeapScanState *node) */ if (!node->ss.ss_currentScanDesc) { + bool modifies_rel = + bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + node->ss.ps.state->es_modified_relids); + node->ss.ss_currentScanDesc = table_beginscan_bm(node->ss.ss_currentRelation, node->ss.ps.state->es_snapshot, 0, - NULL); + NULL, + modifies_rel); } node->ss.ss_currentScanDesc->st.rs_tbmiterator = tbmiterator; diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index f36929deec33..cbd1ecaa15f5 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -102,16 +102,22 @@ IndexNext(IndexScanState *node) if (scandesc == NULL) { + + bool modifies_base_rel = + bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + estate->es_modified_relids); + /* * We reach here if the index scan is not parallel, or if we're * serially executing an index scan that was planned to be parallel. */ - scandesc = index_beginscan(node->ss.ss_currentRelation, - node->iss_RelationDesc, - estate->es_snapshot, - &node->iss_Instrument, - node->iss_NumScanKeys, - node->iss_NumOrderByKeys); + scandesc = index_beginscan_vmset(node->ss.ss_currentRelation, + node->iss_RelationDesc, + estate->es_snapshot, + &node->iss_Instrument, + node->iss_NumScanKeys, + node->iss_NumOrderByKeys, + modifies_base_rel); node->iss_ScanDesc = scandesc; diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index 94047d29430d..fd69275c181f 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -65,13 +65,18 @@ SeqNext(SeqScanState *node) if (scandesc == NULL) { + bool modifies_rel = + bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + estate->es_modified_relids); + /* * We reach here if the scan is not parallel, or if we're serially * executing a scan that was planned to be parallel. 
*/ - scandesc = table_beginscan(node->ss.ss_currentRelation, - estate->es_snapshot, - 0, NULL); + scandesc = table_beginscan_vmset(node->ss.ss_currentRelation, + estate->es_snapshot, + 0, NULL, modifies_rel); + node->ss.ss_currentScanDesc = scandesc; } @@ -366,6 +371,7 @@ ExecSeqScanInitializeDSM(SeqScanState *node, ParallelContext *pcxt) { EState *estate = node->ss.ps.state; + bool modifies_rel; ParallelTableScanDesc pscan; pscan = shm_toc_allocate(pcxt->toc, node->pscan_len); @@ -373,8 +379,11 @@ ExecSeqScanInitializeDSM(SeqScanState *node, pscan, estate->es_snapshot); shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); + modifies_rel = bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + estate->es_modified_relids); node->ss.ss_currentScanDesc = - table_beginscan_parallel(node->ss.ss_currentRelation, pscan); + table_beginscan_parallel_vmset(node->ss.ss_currentRelation, pscan, + modifies_rel); } /* ---------------------------------------------------------------- @@ -404,8 +413,13 @@ ExecSeqScanInitializeWorker(SeqScanState *node, ParallelWorkerContext *pwcxt) { ParallelTableScanDesc pscan; + bool modifies_rel = + bms_is_member(((Scan *) node->ss.ps.plan)->scanrelid, + node->ss.ps.state->es_modified_relids); pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->ss.ss_currentScanDesc = - table_beginscan_parallel(node->ss.ss_currentRelation, pscan); + table_beginscan_parallel_vmset(node->ss.ss_currentRelation, + pscan, + modifies_rel); } diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index cc03f0706e9c..2fdd4af90a83 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -454,7 +454,6 @@ heap2_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) case XLOG_HEAP2_PRUNE_ON_ACCESS: case XLOG_HEAP2_PRUNE_VACUUM_SCAN: case XLOG_HEAP2_PRUNE_VACUUM_CLEANUP: - case XLOG_HEAP2_VISIBLE: case XLOG_HEAP2_LOCK_UPDATED: break; default: diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 200f72c6e256..235c3b584f67 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -4181,8 +4181,7 @@ GlobalVisUpdate(void) * See comment for GlobalVisState for details. */ bool -GlobalVisTestIsRemovableFullXid(GlobalVisState *state, - FullTransactionId fxid) +GlobalVisFullXidVisibleToAll(GlobalVisState *state, FullTransactionId fxid) { /* * If fxid is older than maybe_needed bound, it definitely is visible to @@ -4216,14 +4215,14 @@ GlobalVisTestIsRemovableFullXid(GlobalVisState *state, } /* - * Wrapper around GlobalVisTestIsRemovableFullXid() for 32bit xids. + * Wrapper around GlobalVisFullXidVisibleToAll() for 32bit xids. * * It is crucial that this only gets called for xids from a source that * protects against xid wraparounds (e.g. from a table and thus protected by * relfrozenxid). */ bool -GlobalVisTestIsRemovableXid(GlobalVisState *state, TransactionId xid) +GlobalVisXidVisibleToAll(GlobalVisState *state, TransactionId xid) { FullTransactionId fxid; @@ -4237,12 +4236,12 @@ GlobalVisTestIsRemovableXid(GlobalVisState *state, TransactionId xid) */ fxid = FullXidRelativeTo(state->definitely_needed, xid); - return GlobalVisTestIsRemovableFullXid(state, fxid); + return GlobalVisFullXidVisibleToAll(state, fxid); } /* * Convenience wrapper around GlobalVisTestFor() and - * GlobalVisTestIsRemovableFullXid(), see their comments. + * GlobalVisFullXidVisibleToAll(), see their comments. 
*/ bool GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid) @@ -4251,12 +4250,12 @@ GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid) state = GlobalVisTestFor(rel); - return GlobalVisTestIsRemovableFullXid(state, fxid); + return GlobalVisFullXidVisibleToAll(state, fxid); } /* * Convenience wrapper around GlobalVisTestFor() and - * GlobalVisTestIsRemovableXid(), see their comments. + * GlobalVisXidVisibleToAll(), see their comments. */ bool GlobalVisCheckRemovableXid(Relation rel, TransactionId xid) @@ -4265,7 +4264,7 @@ GlobalVisCheckRemovableXid(Relation rel, TransactionId xid) state = GlobalVisTestFor(rel); - return GlobalVisTestIsRemovableXid(state, xid); + return GlobalVisXidVisibleToAll(state, xid); } /* diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 4222bdab0780..c619643e1211 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -475,12 +475,12 @@ ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon, * If we get passed InvalidTransactionId then we do nothing (no conflict). * * This can happen when replaying already-applied WAL records after a - * standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE - * record that marks as frozen a page which was already all-visible. It's - * also quite common with records generated during index deletion - * (original execution of the deletion can reason that a recovery conflict - * which is sufficient for the deletion operation must take place before - * replay of the deletion record itself). + * standby crash or restart, or when replaying a record that marks as + * frozen a page which was already marked all-visible in the visibility + * map. It's also quite common with records generated during index + * deletion (original execution of the deletion can reason that a recovery + * conflict which is sufficient for the deletion operation must take place + * before replay of the deletion record itself). 
*/ if (!TransactionIdIsValid(snapshotConflictHorizon)) return; diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 9200a22bd9f9..aa2112c8e045 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -178,6 +178,11 @@ extern IndexScanDesc index_beginscan(Relation heapRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, int nkeys, int norderbys); +extern IndexScanDesc index_beginscan_vmset(Relation heapRelation, + Relation indexRelation, + Snapshot snapshot, + IndexScanInstrumentation *instrument, + int nkeys, int norderbys, bool modifies_heap_rel); extern IndexScanDesc index_beginscan_bitmap(Relation indexRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, @@ -204,6 +209,12 @@ extern IndexScanDesc index_beginscan_parallel(Relation heaprel, IndexScanInstrumentation *instrument, int nkeys, int norderbys, ParallelIndexScanDesc pscan); + +extern IndexScanDesc index_beginscan_parallel_vmset(Relation heaprel, Relation indexrel, + IndexScanInstrumentation *instrument, + int nkeys, int norderbys, + ParallelIndexScanDesc pscan, + bool modifies_rel); extern ItemPointer index_getnext_tid(IndexScanDesc scan, ScanDirection direction); extern bool index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot); diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 909db73b7bbb..1d2cab64e9c3 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -42,6 +42,7 @@ /* "options" flag bits for heap_page_prune_and_freeze */ #define HEAP_PAGE_PRUNE_MARK_UNUSED_NOW (1 << 0) #define HEAP_PAGE_PRUNE_FREEZE (1 << 1) +#define HEAP_PAGE_PRUNE_UPDATE_VIS (1 << 2) typedef struct BulkInsertStateData *BulkInsertState; typedef struct GlobalVisState GlobalVisState; @@ -94,6 +95,13 @@ typedef struct HeapScanDescData */ ParallelBlockTableScanWorkerData *rs_parallelworkerdata; + /* + * For sequential scans and bitmap heap scans. If the relation is not + * being modified, on-access pruning may read in the current heap page's + * corresponding VM block to this buffer. + */ + Buffer rs_vmbuffer; + /* these fields only used in page-at-a-time mode and for bitmap scans */ uint32 rs_cindex; /* current tuple's index in vistuples */ uint32 rs_ntuples; /* number of visible tuples on page */ @@ -116,8 +124,18 @@ typedef struct IndexFetchHeapData { IndexFetchTableData xs_base; /* AM independent part of the descriptor */ - Buffer xs_cbuf; /* current heap buffer in scan, if any */ - /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ + /* + * Current heap buffer in scan, if any. NB: if xs_cbuf is not + * InvalidBuffer, we hold a pin on that buffer. + */ + Buffer xs_cbuf; + + /* + * For index scans that do not modify the underlying heap table, on-access + * pruning may read in the current heap page's corresponding VM block to + * this buffer. 
+ */ + Buffer xs_vmbuffer; } IndexFetchHeapData; /* Result codes for HeapTupleSatisfiesVacuum */ @@ -221,6 +239,65 @@ typedef struct HeapPageFreeze } HeapPageFreeze; + +/* 'reason' codes for heap_page_prune_and_freeze() */ +typedef enum +{ + PRUNE_ON_ACCESS, /* on-access pruning */ + PRUNE_VACUUM_SCAN, /* VACUUM 1st heap pass */ + PRUNE_VACUUM_CLEANUP, /* VACUUM 2nd heap pass */ +} PruneReason; + +/* + * Input parameters to heap_page_prune_and_freeze() + */ +typedef struct PruneFreezeParams +{ + Relation relation; /* relation containing buffer to be pruned */ + Buffer buffer; /* buffer to be pruned */ + + /* + * + * vmbuffer is the buffer that must already contain the required + * block of the visibility map if we are to update it. blk_known_av is the + * visibility status of the heap block as of the last call to + * find_next_unskippable_block(). + */ + Buffer vmbuffer; + bool blk_known_av; + + /* + * The reason pruning was performed. It is used to set the WAL record + * opcode which is used for debugging and analysis purposes. + */ + PruneReason reason; + + /* + * Contains flag bits: + * + * MARK_UNUSED_NOW indicates that dead items can be set LP_UNUSED during + * pruning. + * + * FREEZE indicates that we will also freeze tuples + * + * UPDATE_VIS indicates that we will set the page's status in the VM. + */ + int options; + + /* + * vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD + * (see heap_prune_satisfies_vacuum). + */ + GlobalVisState *vistest; + + /* + * cutoffs contains the freeze cutoffs, established by VACUUM at the + * beginning of vacuuming the relation. Required if HEAP_PAGE_PRUNE_FREEZE + * option is set. + */ + struct VacuumCutoffs *cutoffs; +} PruneFreezeParams; + /* * Per-page state returned by heap_page_prune_and_freeze() */ @@ -235,19 +312,15 @@ typedef struct PruneFreezeResult int recently_dead_tuples; /* - * all_visible and all_frozen indicate if the all-visible and all-frozen - * bits in the visibility map can be set for this page, after pruning. + * old_vmbits are the state of the all-visible and all-frozen bits in the + * visibility map before updating it during phase I of vacuuming. + * new_vmbits are the state of those bits after phase I of vacuuming. * - * vm_conflict_horizon is the newest xmin of live tuples on the page. The - * caller can use it as the conflict horizon when setting the VM bits. It - * is only valid if we froze some tuples (nfrozen > 0), and all_frozen is - * true. - * - * These are only set if the HEAP_PRUNE_FREEZE option is set. + * These are only set if the HEAP_PAGE_PRUNE_UPDATE_VIS option is set and + * we have attempted to update the VM. */ - bool all_visible; - bool all_frozen; - TransactionId vm_conflict_horizon; + uint8 new_vmbits; + uint8 old_vmbits; /* * Whether or not the page makes rel truncation unsafe. 
This is set to @@ -264,13 +337,6 @@ typedef struct PruneFreezeResult OffsetNumber deadoffsets[MaxHeapTuplesPerPage]; } PruneFreezeResult; -/* 'reason' codes for heap_page_prune_and_freeze() */ -typedef enum -{ - PRUNE_ON_ACCESS, /* on-access pruning */ - PRUNE_VACUUM_SCAN, /* VACUUM 1st heap pass */ - PRUNE_VACUUM_CLEANUP, /* VACUUM 2nd heap pass */ -} PruneReason; /* ---------------- * function prototypes for heap access method @@ -366,13 +432,10 @@ extern TransactionId heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate); /* in heap/pruneheap.c */ -extern void heap_page_prune_opt(Relation relation, Buffer buffer); -extern void heap_page_prune_and_freeze(Relation relation, Buffer buffer, - GlobalVisState *vistest, - int options, - struct VacuumCutoffs *cutoffs, +extern void heap_page_prune_opt(Relation relation, Buffer buffer, + Buffer *vmbuffer); +extern void heap_page_prune_and_freeze(PruneFreezeParams *params, PruneFreezeResult *presult, - PruneReason reason, OffsetNumber *off_loc, TransactionId *new_relfrozen_xid, MultiXactId *new_relmin_mxid); @@ -385,6 +448,7 @@ extern void log_heap_prune_and_freeze(Relation relation, Buffer buffer, Buffer vmbuffer, uint8 vmflags, TransactionId conflict_xid, bool cleanup_lock, + bool set_pd_all_vis, PruneReason reason, HeapTupleFreeze *frozen, int nfrozen, OffsetNumber *redirected, int nredirected, @@ -395,6 +459,14 @@ extern void log_heap_prune_and_freeze(Relation relation, Buffer buffer, extern void heap_vacuum_rel(Relation rel, const VacuumParams params, BufferAccessStrategy bstrategy); +#ifdef USE_ASSERT_CHECKING +extern bool heap_page_is_all_visible(Relation rel, Buffer buf, + GlobalVisState *vistest, + bool *all_frozen, + TransactionId *visibility_cutoff_xid, + OffsetNumber *logging_offnum); +#endif + /* in heap/heapam_visibility.c */ extern bool HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer); @@ -402,6 +474,8 @@ extern TM_Result HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, Buffer buffer); extern HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, Buffer buffer); +extern HTSV_Result HeapTupleSatisfiesVacuumGlobalVis(HeapTuple htup, + GlobalVisState *vistest, Buffer buffer); extern HTSV_Result HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *dead_after); extern void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 16c2b2e3c9c7..e9e77bd678b2 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -60,7 +60,6 @@ #define XLOG_HEAP2_PRUNE_ON_ACCESS 0x10 #define XLOG_HEAP2_PRUNE_VACUUM_SCAN 0x20 #define XLOG_HEAP2_PRUNE_VACUUM_CLEANUP 0x30 -#define XLOG_HEAP2_VISIBLE 0x40 #define XLOG_HEAP2_MULTI_INSERT 0x50 #define XLOG_HEAP2_LOCK_UPDATED 0x60 #define XLOG_HEAP2_NEW_CID 0x70 @@ -294,7 +293,13 @@ typedef struct xl_heap_prune #define SizeOfHeapPrune (offsetof(xl_heap_prune, flags) + sizeof(uint16)) -/* to handle recovery conflict during logical decoding on standby */ +/* + * To handle recovery conflict during logical decoding on standby, we must know + * if the table is a catalog table. Note that in visibilitymapdefs.h + * VISIBILITYMAP_XLOG_CATALOG_REL is also defined as (1 << 2). xl_heap_prune + * records should use XLHP_IS_CATALOG_REL, not VISIBILITYMAP_XLOG_CATALOG_REL -- + * even if they only contain updates to the VM. 
+ */ #define XLHP_IS_CATALOG_REL (1 << 1) /* @@ -443,20 +448,6 @@ typedef struct xl_heap_inplace #define MinSizeOfHeapInplace (offsetof(xl_heap_inplace, nmsgs) + sizeof(int)) -/* - * This is what we need to know about setting a visibility map bit - * - * Backup blk 0: visibility map buffer - * Backup blk 1: heap buffer - */ -typedef struct xl_heap_visible -{ - TransactionId snapshotConflictHorizon; - uint8 flags; -} xl_heap_visible; - -#define SizeOfHeapVisible (offsetof(xl_heap_visible, flags) + sizeof(uint8)) - typedef struct xl_heap_new_cid { /* @@ -500,11 +491,6 @@ extern void heap2_desc(StringInfo buf, XLogReaderState *record); extern const char *heap2_identify(uint8 info); extern void heap_xlog_logical_rewrite(XLogReaderState *r); -extern XLogRecPtr log_heap_visible(Relation rel, Buffer heap_buffer, - Buffer vm_buffer, - TransactionId snapshotConflictHorizon, - uint8 vmflags); - /* in heapdesc.c, so it can be shared between frontend/backend code */ extern void heap_xlog_deserialize_prune_and_freeze(char *cursor, uint16 flags, int *nplans, xlhp_freeze_plan **plans, diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index b5e0fb386c0a..f496e0b49397 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -121,6 +121,12 @@ typedef struct ParallelBlockTableScanWorkerData *ParallelBlockTableScanWorker; typedef struct IndexFetchTableData { Relation rel; + + /* + * Some optimizations can only be performed if the query does not modify + * the underlying relation. Track that here. + */ + bool modifies_base_rel; } IndexFetchTableData; struct IndexScanInstrumentation; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index e16bf0256928..f250d4e7aec1 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -63,6 +63,8 @@ typedef enum ScanOptions /* unregister snapshot at scan end? */ SO_TEMP_SNAPSHOT = 1 << 9, + /* whether or not scan should attempt to set the VM */ + SO_ALLOW_VM_SET = 1 << 10, } ScanOptions; /* @@ -882,6 +884,25 @@ table_beginscan(Relation rel, Snapshot snapshot, return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); } +/* + * Similar to table_beginscan(), but allows the caller to indicate whether the + * query modifies the relation. This is used when the caller wants to attempt + * marking pages in the relation as all-visible in the visibility map during + * on-access pruning. + */ +static inline TableScanDesc +table_beginscan_vmset(Relation rel, Snapshot snapshot, + int nkeys, struct ScanKeyData *key, bool modifies_rel) +{ + uint32 flags = SO_TYPE_SEQSCAN | + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; + + if (!modifies_rel) + flags |= SO_ALLOW_VM_SET; + + return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); +} + /* * Like table_beginscan(), but for scanning catalog. It'll automatically use a * snapshot appropriate for scanning catalog relations. 
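To show how the inline helper above is intended to be called, here is a hedged sketch of a read-only scan loop; apart from table_beginscan_vmset() everything is ordinary table AM API usage, and rel and snapshot are assumed to be supplied by the caller.

    TableScanDesc scan;
    TupleTableSlot *slot;

    /* modifies_rel = false requests SO_ALLOW_VM_SET for this scan */
    scan = table_beginscan_vmset(rel, snapshot, 0, NULL, false);
    slot = table_slot_create(rel, NULL);

    while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
    {
        /* process the tuple in 'slot' */
    }

    ExecDropSingleTupleTableSlot(slot);
    table_endscan(scan);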
@@ -919,10 +940,13 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, */ static inline TableScanDesc table_beginscan_bm(Relation rel, Snapshot snapshot, - int nkeys, ScanKeyData *key) + int nkeys, ScanKeyData *key, bool modifies_rel) { uint32 flags = SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE; + if (!modifies_rel) + flags |= SO_ALLOW_VM_SET; + return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); } @@ -1130,6 +1154,10 @@ extern void table_parallelscan_initialize(Relation rel, extern TableScanDesc table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan); +extern TableScanDesc table_beginscan_parallel_vmset(Relation relation, + ParallelTableScanDesc pscan, + bool modifies_rel); + /* * Restart a parallel scan. Call this in the leader process. Caller is * responsible for making sure that all workers have finished the scan diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h index c6fa37be9682..05ba6786b471 100644 --- a/src/include/access/visibilitymap.h +++ b/src/include/access/visibilitymap.h @@ -15,7 +15,6 @@ #define VISIBILITYMAP_H #include "access/visibilitymapdefs.h" -#include "access/xlogdefs.h" #include "storage/block.h" #include "storage/buf.h" #include "storage/relfilelocator.h" @@ -32,15 +31,9 @@ extern bool visibilitymap_clear(Relation rel, BlockNumber heapBlk, extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf); extern bool visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf); -extern uint8 visibilitymap_set(Relation rel, - BlockNumber heapBlk, Buffer heapBuf, - XLogRecPtr recptr, - Buffer vmBuf, - TransactionId cutoff_xid, - uint8 flags); -extern uint8 visibilitymap_set_vmbits(BlockNumber heapBlk, - Buffer vmBuf, uint8 flags, - const RelFileLocator rlocator); +extern uint8 visibilitymap_set(BlockNumber heapBlk, + Buffer vmBuf, uint8 flags, + const RelFileLocator rlocator); extern uint8 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf); extern void visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen); extern BlockNumber visibilitymap_prepare_truncate(Relation rel, diff --git a/src/include/access/visibilitymapdefs.h b/src/include/access/visibilitymapdefs.h index 5ad5c0208779..e01bce4c99fa 100644 --- a/src/include/access/visibilitymapdefs.h +++ b/src/include/access/visibilitymapdefs.h @@ -21,14 +21,5 @@ #define VISIBILITYMAP_ALL_FROZEN 0x02 #define VISIBILITYMAP_VALID_BITS 0x03 /* OR of all valid visibilitymap * flags bits */ -/* - * To detect recovery conflicts during logical decoding on a standby, we need - * to know if a table is a user catalog table. For that we add an additional - * bit into xl_heap_visible.flags, in addition to the above. - * - * NB: VISIBILITYMAP_XLOG_* may not be passed to visibilitymap_set(). 
- */ -#define VISIBILITYMAP_XLOG_CATALOG_REL 0x04 -#define VISIBILITYMAP_XLOG_VALID_BITS (VISIBILITYMAP_VALID_BITS | VISIBILITYMAP_XLOG_CATALOG_REL) #endif /* VISIBILITYMAPDEFS_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 18ae8f0d4bb8..0c3b0d601685 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -676,6 +676,12 @@ typedef struct EState * ExecDoInitialPruning() */ const char *es_sourceText; /* Source text from QueryDesc */ + /* + * RT indexes of relations modified by the query either through + * UPDATE/DELETE/INSERT/MERGE or SELECT FOR UPDATE + */ + Bitmapset *es_modified_relids; + JunkFilter *es_junkFilter; /* top-level junk filter, if any */ /* If query can insert/delete tuples, the command ID to mark them with */ diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index 604c1f902169..a0ea2cfcea29 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -100,8 +100,8 @@ extern char *ExportSnapshot(Snapshot snapshot); */ typedef struct GlobalVisState GlobalVisState; extern GlobalVisState *GlobalVisTestFor(Relation rel); -extern bool GlobalVisTestIsRemovableXid(GlobalVisState *state, TransactionId xid); -extern bool GlobalVisTestIsRemovableFullXid(GlobalVisState *state, FullTransactionId fxid); +extern bool GlobalVisXidVisibleToAll(GlobalVisState *state, TransactionId xid); +extern bool GlobalVisFullXidVisibleToAll(GlobalVisState *state, FullTransactionId fxid); extern bool GlobalVisCheckRemovableXid(Relation rel, TransactionId xid); extern bool GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid); diff --git a/src/test/isolation/expected/index-killtuples.out b/src/test/isolation/expected/index-killtuples.out index be7ddd756ef0..b29f2434b000 100644 --- a/src/test/isolation/expected/index-killtuples.out +++ b/src/test/isolation/expected/index-killtuples.out @@ -54,7 +54,7 @@ step flush: SELECT FROM pg_stat_force_next_flush(); step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; new_heap_accesses ----------------- - 1 + 2 (1 row) step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); @@ -130,7 +130,7 @@ step flush: SELECT FROM pg_stat_force_next_flush(); step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; new_heap_accesses ----------------- - 1 + 2 (1 row) step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); @@ -283,7 +283,7 @@ step flush: SELECT FROM pg_stat_force_next_flush(); step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; new_heap_accesses ----------------- - 1 + 2 (1 row) step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl index ebe2fae17898..bdd9f0a62cd7 100644 --- a/src/test/recovery/t/035_standby_logical_decoding.pl +++ b/src/test/recovery/t/035_standby_logical_decoding.pl @@ -296,6 +296,7 @@ sub 
wait_until_vacuum_can_remove max_replication_slots = 4 max_wal_senders = 4 autovacuum = off +hot_standby_feedback = on }); $node_primary->dump_info; $node_primary->start; @@ -748,7 +749,7 @@ sub wait_until_vacuum_can_remove $logstart = -s $node_standby->logfile; reactive_slots_change_hfs_and_wait_for_xmins('shared_row_removal_', - 'no_conflict_', 0, 1); + 'no_conflict_', 1, 0); # This should not trigger a conflict wait_until_vacuum_can_remove( diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 018b5919cf66..6b4a40f616cb 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2343,6 +2343,7 @@ ProjectionPath PromptInterruptContext ProtocolVersion PrsStorage +PruneFreezeParams PruneFreezeResult PruneReason PruneState @@ -4280,7 +4281,6 @@ xl_heap_prune xl_heap_rewrite_mapping xl_heap_truncate xl_heap_update -xl_heap_visible xl_invalid_page xl_invalid_page_key xl_invalidations
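
Note for reviewers (not part of the patch): the sketch below illustrates how a read-only caller could opt in to on-access visibility-map setting with the new table_beginscan_vmset() helper declared above. The hard-coded modifies_rel value stands in for the executor's es_modified_relids bookkeeping introduced in this patch, and "rel" is assumed to be an already-opened, suitably locked heap relation.

    /* minimal usage sketch, assuming rel is open and a snapshot is active */
    TableScanDesc scan;
    TupleTableSlot *slot = table_slot_create(rel, NULL);

    /* plain SELECT: the relation is not modified, so allow VM setting */
    scan = table_beginscan_vmset(rel, GetActiveSnapshot(), 0, NULL,
                                 false /* modifies_rel */ );

    while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
    {
        /*
         * Process the tuple as usual; with SO_ALLOW_VM_SET, on-access
         * pruning may additionally mark pages all-visible in the VM.
         */
    }

    table_endscan(scan);
    ExecDropSingleTupleTableSlot(slot);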