Commit da189bc

MMeent authored and Commitfest Bot committed
IOS/TableAM: Support AM-specific fast visibility tests
Previously, we assumed VM_ALL_VISIBLE is universal across all AMs. This is probably not the case, so we introduce a new table AM method, "table_index_vischeck_tuples", which lets a caller ask the AM whether a tuple is definitely visible to everyone or might be invisible to someone.

The API is intended to replace direct calls to VM_ALL_VISIBLE, and as such it doesn't include a "definitely dead to everyone" result: the heap AM's visibility map can't report *definitely dead* from its lookups, so it would be too expensive for the heap AM to produce such results.

A future commit will use this inside GIST and SP-GIST to fix a race condition between index-only scans and VACUUM that causes a tuple-visibility bug; a further patch will add support for this to nbtree.
1 parent e5a3c9d commit da189bc
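
The commit message above describes the shape of the new API; as a reader aid, here is a minimal, hedged sketch (not code from this commit) of how an index AM could batch visibility checks for index tuples it holds in memory. The TM_VisCheck / TM_IndexVisibilityCheckOp field names and the TMVC_* results are taken from the diff below; the helper name and the AM-owned vmbuf are illustrative assumptions.

/*
 * Hedged sketch: batch VM-based visibility checks through the new
 * table AM callback.  Not part of this commit's diff.
 */
static void
example_vischeck_batch(Relation heaprel, IndexTuple *itups, int nitups,
					   TM_VisCheck *checks, Buffer *vmbuf)
{
	TM_IndexVisibilityCheckOp checkop;

	for (int i = 0; i < nitups; i++)
	{
		/* remember each TID's heap block, and its position in index order */
		checks[i].tidblkno = ItemPointerGetBlockNumber(&itups[i]->t_tid);
		checks[i].idxoffnum = i;
		checks[i].vischeckresult = TMVC_Unchecked;
	}

	checkop.checktids = checks;
	checkop.checkntids = nitups;
	checkop.vmbuf = vmbuf;		/* AM-held VM pin, reused across calls */

	/* for the heap AM this ends up in heap_index_vischeck_tuples() */
	table_index_vischeck_tuples(heaprel, &checkop);

	/* each checks[i].vischeckresult is now TMVC_Visible or TMVC_MaybeVisible */
}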

File tree

11 files changed: +430 -82 lines


src/backend/access/heap/heapam.c

Lines changed: 177 additions & 0 deletions

@@ -101,11 +101,37 @@ static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status
 							uint16 infomask, Relation rel, int *remaining,
 							bool logLockFailure);
 static void index_delete_sort(TM_IndexDeleteOp *delstate);
+static inline int heap_ivc_process_block(Relation rel, Buffer *vmbuf,
+										 TM_VisCheck *checks, int nchecks);
+static void heap_ivc_process_all(Relation rel, Buffer *vmbuf,
+								 TM_VisCheck *checks, int nchecks);
 static int	bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate);
 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
 static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required,
 										bool *copy);
 
+
+/* sort template definitions for index */
+#define ST_SORT heap_ivc_sortby_tidheapblk
+#define ST_ELEMENT_TYPE TM_VisCheck
+#define ST_DECLARE
+#define ST_DEFINE
+#define ST_SCOPE static inline
+#define ST_COMPARE(a, b) ( \
+	a->tidblkno < b->tidblkno ? -1 : ( \
+		a->tidblkno > b->tidblkno ? 1 : 0 \
+	) \
+)
+
+#include "lib/sort_template.h"
+
+#define ST_SORT heap_ivc_sortby_idx
+#define ST_ELEMENT_TYPE TM_VisCheck
+#define ST_DECLARE
+#define ST_DEFINE
+#define ST_SCOPE static inline
+#define ST_COMPARE(a, b) (((int) a->idxoffnum) - ((int) b->idxoffnum))
+#include "lib/sort_template.h"
+
 
 /*
  * Each tuple lock mode has a corresponding heavyweight lock, and one or two
@@ -8779,6 +8805,157 @@ bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate)
 	return nblocksfavorable;
 }
 
+/*
+ * heapam implementation of tableam's index_vischeck_tuples interface.
+ *
+ * This helper function is called by index AMs during index-only scans,
+ * to do VM-based visibility checks on individual tuples, so that the AM
+ * can hold the tuple in memory for e.g. reordering for extended periods of
+ * time without holding thousands of pins that would conflict with VACUUM.
+ *
+ * It's possible for this to generate a fair amount of I/O, since we may be
+ * checking hundreds of tuples from a single index block, but that is
+ * preferred over holding thousands of pins.
+ *
+ * We use heuristics to balance the costs of sorting TIDs with VM page
+ * lookups.
+ */
+void
+heap_index_vischeck_tuples(Relation rel, TM_IndexVisibilityCheckOp *checkop)
+{
+	Buffer		vmbuf = *checkop->vmbuf;
+	Buffer		storvmbuf = vmbuf;
+	TM_VisCheck *checks = checkop->checktids;
+	int			checkntids = checkop->checkntids;
+	int			upcomingvmbufchanges = 0;
+
+	/*
+	 * The first index scan will have to pin the VM buffer, and that first
+	 * change in the vm buffer shouldn't put us into the expensive VM page &
+	 * sort path; so we special-case this operation.
+	 */
+	if (!BufferIsValid(vmbuf))
+	{
+		int			processed;
+
+		processed = heap_ivc_process_block(rel, &vmbuf, checks, checkntids);
+		checkntids -= processed;
+		checks += processed;
+		storvmbuf = vmbuf;
+		Assert(processed > 0);
+	}
+
+	while (vmbuf == storvmbuf && checkntids > 0)
+	{
+		int			processed;
+
+		processed = heap_ivc_process_block(rel, &vmbuf, checks, checkntids);
+
+		Assert(processed <= checkntids);
+
+		checkntids -= processed;
+		checks += processed;
+	}
+
+	*checkop->vmbuf = vmbuf;
+
+	if (checkntids == 0)
+	{
+		return;
+	}
+
+	upcomingvmbufchanges = 0;
+
+	for (int i = 1; i < checkntids; i++)
+	{
+		/*
+		 * Instead of storing the previous iteration's result, we only match
+		 * the block numbers
+		 */
+		BlockNumber lastblkno = checks[i - 1].tidblkno;
+		BlockNumber newblkno = checks[i].tidblkno;
+
+		/*
+		 * divide-by-constant can be faster than BufferGetBlockNumber()
+		 */
+		BlockNumber lastvmblkno = HEAPBLK_TO_VMBLOCK(lastblkno);
+		BlockNumber newvmblkno = HEAPBLK_TO_VMBLOCK(newblkno);
+
+		if (lastvmblkno != newvmblkno)
+			upcomingvmbufchanges++;
+	}
+
+	if (upcomingvmbufchanges <= pg_ceil_log2_32(checkntids))
+	{
+		/*
+		 * No big amount of VM buf changes, so do all visibility checks
+		 * without sorting.
+		 */
+		heap_ivc_process_all(rel, checkop->vmbuf, checks, checkntids);
+
+		return;
+	}
+
+	/*
+	 * Order the TIDs to heap order, so that we will only need to visit every
+	 * VM page at most once.
+	 */
+	heap_ivc_sortby_tidheapblk(checks, checkntids);
+
+	/* do all visibility checks */
+	heap_ivc_process_all(rel, checkop->vmbuf, checks, checkntids);
+
+	/* put the checks back in index order */
+	heap_ivc_sortby_idx(checks, checkntids);
+}
+
+static inline int
+heap_ivc_process_block(Relation rel, Buffer *vmbuf, TM_VisCheck *checks,
+					   int nchecks)
+{
+	BlockNumber blkno;
+	BlockNumber prevblkno = blkno = checks->tidblkno;
+	TMVC_Result result;
+	int			processed = 0;
+
+	if (VM_ALL_VISIBLE(rel, blkno, vmbuf))
+		result = TMVC_Visible;
+	else
+		result = TMVC_MaybeVisible;
+
+	do
+	{
+		checks->vischeckresult = result;
+
+		nchecks--;
+		processed++;
+		checks++;
+
+		if (nchecks <= 0)
+			return processed;
+
+		blkno = checks->tidblkno;
+	} while (blkno == prevblkno);
+
+	return processed;
+}
+
+static void
+heap_ivc_process_all(Relation rel, Buffer *vmbuf,
+					 TM_VisCheck *checks, int nchecks)
+{
+	while (nchecks > 0)
+	{
+		int			processed;
+
+		processed = heap_ivc_process_block(rel, vmbuf, checks, nchecks);
+
+		Assert(processed <= nchecks);
+
+		nchecks -= processed;
+		checks += processed;
+	}
+}
+
 /*
  * Perform XLogInsert for a heap-visible operation. 'block' is the block
  * being marked all-visible, and vm_buffer is the buffer containing the
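
For context on the sort-vs-no-sort heuristic in heap_index_vischeck_tuples() above: the TIDs arrive in index order, and the function counts how often consecutive TIDs would land on a different VM page. Only when that count exceeds pg_ceil_log2_32(checkntids) is it considered cheaper to sort the checks into heap-block order, walk the VM once, and then restore index order. A minimal stand-alone illustration (plain C, not PostgreSQL code; the numbers are made up):

#include <stdio.h>
#include <stdint.h>

/* stand-in for pg_ceil_log2_32(): smallest n such that 2^n >= x */
static int
ceil_log2_32(uint32_t x)
{
	int			n = 0;

	while (((uint32_t) 1 << n) < x)
		n++;
	return n;
}

int
main(void)
{
	int			checkntids = 1024;			/* hypothetical batch of TIDs to check */
	int			upcomingvmbufchanges = 37;	/* VM-page switches seen in index order */

	if (upcomingvmbufchanges <= ceil_log2_32(checkntids))	/* here: 37 <= 10 is false */
		printf("few VM page switches: check in index order, no sort\n");
	else
		printf("many switches: sort by heap block, check, then restore index order\n");
	return 0;
}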

src/backend/access/heap/heapam_handler.c

Lines changed: 1 addition & 0 deletions

@@ -2648,6 +2648,7 @@ static const TableAmRoutine heapam_methods = {
 	.tuple_tid_valid = heapam_tuple_tid_valid,
 	.tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot,
 	.index_delete_tuples = heap_index_delete_tuples,
+	.index_vischeck_tuples = heap_index_vischeck_tuples,
 
 	.relation_set_new_filelocator = heapam_relation_set_new_filelocator,
 	.relation_nontransactional_truncate = heapam_relation_nontransactional_truncate,

src/backend/access/heap/visibilitymap.c

Lines changed: 14 additions & 25 deletions

@@ -107,17 +107,6 @@
  */
 #define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
 
-/* Number of heap blocks we can represent in one byte */
-#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
-
-/* Number of heap blocks we can represent in one visibility map page. */
-#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
-
-/* Mapping from heap block number to the right bit in the visibility map */
-#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
-#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
-#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
-
 /* Masks for counting subsets of bits in the visibility map. */
 #define VISIBLE_MASK8	(0x55)	/* The lower bit of each bit pair */
 #define FROZEN_MASK8	(0xaa)	/* The upper bit of each bit pair */
@@ -137,9 +126,9 @@ static Buffer vm_extend(Relation rel, BlockNumber vm_nblocks);
 bool
 visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags)
 {
-	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
-	int			mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
-	int			mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
+	BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk);
+	int			mapByte = HEAPBLK_TO_VMBYTE(heapBlk);
+	int			mapOffset = HEAPBLK_TO_VMOFFSET(heapBlk);
 	uint8		mask = flags << mapOffset;
 	char	   *map;
 	bool		cleared = false;
@@ -190,7 +179,7 @@ visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags
 void
 visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
 {
-	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
+	BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk);
 
 	/* Reuse the old pinned buffer if possible */
 	if (BufferIsValid(*vmbuf))
@@ -214,7 +203,7 @@ visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
 bool
 visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf)
 {
-	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
+	BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk);
 
 	return BufferIsValid(vmbuf) && BufferGetBlockNumber(vmbuf) == mapBlock;
 }
@@ -247,9 +236,9 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 				  XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid,
 				  uint8 flags)
 {
-	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
-	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
-	uint8		mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
+	BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk);
+	uint32		mapByte = HEAPBLK_TO_VMBYTE(heapBlk);
+	uint8		mapOffset = HEAPBLK_TO_VMOFFSET(heapBlk);
 	Page		page;
 	uint8	   *map;
 	uint8		status;
@@ -340,9 +329,9 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 uint8
 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
 {
-	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
-	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
-	uint8		mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
+	BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk);
+	uint32		mapByte = HEAPBLK_TO_VMBYTE(heapBlk);
+	uint8		mapOffset = HEAPBLK_TO_VMOFFSET(heapBlk);
 	char	   *map;
 	uint8		result;
 
@@ -445,9 +434,9 @@ visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks)
 	BlockNumber newnblocks;
 
 	/* last remaining block, byte, and bit */
-	BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks);
-	uint32		truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks);
-	uint8		truncOffset = HEAPBLK_TO_OFFSET(nheapblocks);
+	BlockNumber truncBlock = HEAPBLK_TO_VMBLOCK(nheapblocks);
+	uint32		truncByte = HEAPBLK_TO_VMBYTE(nheapblocks);
+	uint8		truncOffset = HEAPBLK_TO_VMOFFSET(nheapblocks);
 
 #ifdef TRACE_VISIBILITYMAP
 	elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks);
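
The HEAPBLK_TO_* macros removed above were renamed with a VM prefix and are now also used from heapam.c (see HEAPBLK_TO_VMBLOCK in the first file). The definitions below simply restate the removed formulas under the new names; their new location, and how MAPSIZE is made visible there, is not shown in this excerpt and is therefore an assumption.

/* Presumed shared definitions after the rename (exact header not shown in this diff). */

/* Number of heap blocks we can represent in one byte */
#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)

/* Number of heap blocks we can represent in one visibility map page. */
#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)

/* Mapping from heap block number to the right bit in the visibility map */
#define HEAPBLK_TO_VMBLOCK(x)	((x) / HEAPBLOCKS_PER_PAGE)
#define HEAPBLK_TO_VMBYTE(x)	(((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
#define HEAPBLK_TO_VMOFFSET(x)	(((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)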

src/backend/access/index/indexam.c

Lines changed: 6 additions & 0 deletions

@@ -628,6 +628,12 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 	/* XXX: we should assert that a snapshot is pushed or registered */
 	Assert(TransactionIdIsValid(RecentXmin));
 
+	/*
+	 * Reset xs_visrecheck, so we don't confuse the next tuple's visibility
+	 * state with that of the previous.
+	 */
+	scan->xs_visrecheck = TMVC_Unchecked;
+
 	/*
 	 * The AM's amgettuple proc finds the next index entry matching the scan
 	 * keys, and puts the TID into scan->xs_heaptid. It should also set
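
The new scan->xs_visrecheck field carries the AM's pre-computed answer back to the caller of index_getnext_tid(). The executor-side consumer is not part of this excerpt; the following is a hedged sketch of how an index-only scan might act on the three TMVC_* states, falling back to the visibility map exactly as before when the AM did not pre-check.

/*
 * Hedged sketch, not part of this diff: interpret xs_visrecheck in an
 * index-only scan.  Returns true if the heap fetch can be skipped.
 */
static bool
example_tuple_all_visible(IndexScanDesc scan, Buffer *vmbuf)
{
	switch (scan->xs_visrecheck)
	{
		case TMVC_Visible:
			return true;		/* AM already proved the heap page all-visible */
		case TMVC_MaybeVisible:
			return false;		/* caller must fetch the heap tuple to be sure */
		case TMVC_Unchecked:
		default:
			/* AM did not pre-check: consult the VM directly, as before */
			return VM_ALL_VISIBLE(scan->heapRelation,
								  ItemPointerGetBlockNumber(&scan->xs_heaptid),
								  vmbuf);
	}
}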

src/backend/access/table/tableamapi.c

Lines changed: 1 addition & 0 deletions

@@ -61,6 +61,7 @@ GetTableAmRoutine(Oid amhandler)
 	Assert(routine->tuple_get_latest_tid != NULL);
 	Assert(routine->tuple_satisfies_snapshot != NULL);
 	Assert(routine->index_delete_tuples != NULL);
+	Assert(routine->index_vischeck_tuples != NULL);
 
 	Assert(routine->tuple_insert != NULL);
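
Because GetTableAmRoutine() now asserts that the callback is provided, an out-of-tree table AM has to supply its own implementation when it adopts this API. A hedged sketch (the AM and callback names here are purely illustrative):

/* Hedged sketch for a hypothetical out-of-tree table AM. */
static const TableAmRoutine myam_methods = {
	.type = T_TableAmRoutine,
	/* ... all other required callbacks ... */
	.index_delete_tuples = myam_index_delete_tuples,
	.index_vischeck_tuples = myam_index_vischeck_tuples,
	/* ... */
};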
