diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c
index 2c0e71eedc65..3e00149304f5 100644
--- a/contrib/bloom/blutils.c
+++ b/contrib/bloom/blutils.c
@@ -148,8 +148,7 @@ blhandler(PG_FUNCTION_ARGS)
amroutine->amgettuple = NULL;
amroutine->amgetbitmap = blgetbitmap;
amroutine->amendscan = blendscan;
- amroutine->ammarkpos = NULL;
- amroutine->amrestrpos = NULL;
+ amroutine->amposreset = NULL;
amroutine->amestimateparallelscan = NULL;
amroutine->aminitparallelscan = NULL;
amroutine->amparallelrescan = NULL;
diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml
index 63d7e376f195..e36519ab776b 100644
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -163,8 +163,7 @@ typedef struct IndexAmRoutine
amgettuple_function amgettuple; /* can be NULL */
amgetbitmap_function amgetbitmap; /* can be NULL */
amendscan_function amendscan;
- ammarkpos_function ammarkpos; /* can be NULL */
- amrestrpos_function amrestrpos; /* can be NULL */
+ amposreset_function amposreset; /* can be NULL */
/* interface functions to support parallel index scans */
amestimateparallelscan_function amestimateparallelscan; /* can be NULL */
@@ -789,32 +788,25 @@ amendscan (IndexScanDesc scan);
void
-ammarkpos (IndexScanDesc scan);
+amposreset (IndexScanDesc scan);
- Mark current scan position. The access method need only support one
- remembered scan position per scan.
-
-
-
- The ammarkpos function need only be provided if the access
- method supports ordered scans. If it doesn't,
- the ammarkpos field in its IndexAmRoutine
- struct may be set to NULL.
-
-
-
-
-void
-amrestrpos (IndexScanDesc scan);
-
- Restore the scan to the most recently marked position.
-
-
-
- The amrestrpos function need only be provided if the access
- method supports ordered scans. If it doesn't,
- the amrestrpos field in its IndexAmRoutine
- struct may be set to NULL.
+ Notify the index AM that the core code will change the scan's position to
+ an item returned as part of an earlier batch. The index AM must therefore
+ invalidate any state that independently tracks the scan's progress
+ (e.g., array keys used with a ScalarArrayOpExpr qual). Called by the core
+ system when it is about to restore a mark.
+
+
+
+ The amposreset function need only be provided if the access
+ method supports ordered scans through the amgetbatch
+ interface. If it doesn't, the amposreset field
+ in its IndexAmRoutine struct should be set to
+ NULL. Index AMs that don't have any private state that might need to be
+ invalidated might still find it useful to provide an empty
+ amposreset function; if amposreset
+ is set to NULL, the core system will assume that it is unsafe to restore a
+ marked position.
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 2f7d1437919d..6472013ae53f 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -296,8 +296,7 @@ brinhandler(PG_FUNCTION_ARGS)
amroutine->amgettuple = NULL;
amroutine->amgetbitmap = bringetbitmap;
amroutine->amendscan = brinendscan;
- amroutine->ammarkpos = NULL;
- amroutine->amrestrpos = NULL;
+ amroutine->amposreset = NULL;
amroutine->amestimateparallelscan = NULL;
amroutine->aminitparallelscan = NULL;
amroutine->amparallelrescan = NULL;
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index 78f7b7a2495c..c9de3d120634 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -84,8 +84,7 @@ ginhandler(PG_FUNCTION_ARGS)
amroutine->amgettuple = NULL;
amroutine->amgetbitmap = gingetbitmap;
amroutine->amendscan = ginendscan;
- amroutine->ammarkpos = NULL;
- amroutine->amrestrpos = NULL;
+ amroutine->amposreset = NULL;
amroutine->amestimateparallelscan = NULL;
amroutine->aminitparallelscan = NULL;
amroutine->amparallelrescan = NULL;
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 5213cd71e977..d8065442249c 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -105,8 +105,7 @@ gisthandler(PG_FUNCTION_ARGS)
amroutine->amgettuple = gistgettuple;
amroutine->amgetbitmap = gistgetbitmap;
amroutine->amendscan = gistendscan;
- amroutine->ammarkpos = NULL;
- amroutine->amrestrpos = NULL;
+ amroutine->amposreset = NULL;
amroutine->amestimateparallelscan = NULL;
amroutine->aminitparallelscan = NULL;
amroutine->amparallelrescan = NULL;
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 53061c819fbf..b3d7f825cc47 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -104,8 +104,7 @@ hashhandler(PG_FUNCTION_ARGS)
amroutine->amgettuple = hashgettuple;
amroutine->amgetbitmap = hashgetbitmap;
amroutine->amendscan = hashendscan;
- amroutine->ammarkpos = NULL;
- amroutine->amrestrpos = NULL;
+ amroutine->amposreset = NULL;
amroutine->amestimateparallelscan = NULL;
amroutine->aminitparallelscan = NULL;
amroutine->amparallelrescan = NULL;
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index bcbac844bb66..6c41b3119ea4 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -84,7 +84,9 @@ heapam_index_fetch_begin(Relation rel)
IndexFetchHeapData *hscan = palloc0(sizeof(IndexFetchHeapData));
hscan->xs_base.rel = rel;
+ hscan->xs_base.rs = NULL;
hscan->xs_cbuf = InvalidBuffer;
+ hscan->xs_blk = InvalidBlockNumber;
return &hscan->xs_base;
}
@@ -94,10 +96,14 @@ heapam_index_fetch_reset(IndexFetchTableData *scan)
{
IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
+ if (scan->rs)
+ read_stream_reset(scan->rs);
+
if (BufferIsValid(hscan->xs_cbuf))
{
ReleaseBuffer(hscan->xs_cbuf);
hscan->xs_cbuf = InvalidBuffer;
+ hscan->xs_blk = InvalidBlockNumber;
}
}
@@ -108,6 +114,9 @@ heapam_index_fetch_end(IndexFetchTableData *scan)
heapam_index_fetch_reset(scan);
+ if (scan->rs)
+ read_stream_end(scan->rs);
+
pfree(hscan);
}
@@ -124,23 +133,37 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
Assert(TTS_IS_BUFFERTUPLE(slot));
- /* We can skip the buffer-switching logic if we're in mid-HOT chain. */
- if (!*call_again)
+ /*
+ * Switch to correct buffer if we don't have it already (we can skip this
+ * if we're in mid-HOT chain)
+ */
+ if (!*call_again && hscan->xs_blk != ItemPointerGetBlockNumber(tid))
{
- /* Switch to correct buffer if we don't have it already */
- Buffer prev_buf = hscan->xs_cbuf;
+ /* Remember this buffer's block number for next time */
+ hscan->xs_blk = ItemPointerGetBlockNumber(tid);
- hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf,
- hscan->xs_base.rel,
- ItemPointerGetBlockNumber(tid));
+ if (BufferIsValid(hscan->xs_cbuf))
+ ReleaseBuffer(hscan->xs_cbuf);
/*
- * Prune page, but only if we weren't already on this page
+ * When using a read stream, the stream will already know which block
+ * number comes next (though an assertion will verify a match below)
*/
- if (prev_buf != hscan->xs_cbuf)
- heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf);
+ if (scan->rs)
+ hscan->xs_cbuf = read_stream_next_buffer(scan->rs, NULL);
+ else
+ hscan->xs_cbuf = ReadBuffer(hscan->xs_base.rel, hscan->xs_blk);
+
+ /*
+ * Prune page when it is pinned for the first time
+ */
+ heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf);
}
+ /* Assert that the TID's block number's buffer is now pinned */
+ Assert(BufferIsValid(hscan->xs_cbuf));
+ Assert(BufferGetBlockNumber(hscan->xs_cbuf) == hscan->xs_blk);
+
/* Obtain share-lock on the buffer so we can examine visibility */
LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_SHARE);
got_heap_tuple = heap_hot_search_buffer(tid,
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index 0cb27af13109..55e60c9ffde2 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -89,6 +89,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
scan->xs_snapshot = InvalidSnapshot; /* caller must initialize this */
scan->numberOfKeys = nkeys;
scan->numberOfOrderBys = norderbys;
+ scan->batchState = NULL; /* used by amgetbatch index AMs */
/*
* We allocate key workspace here, but it won't get filled until amrescan.
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 0492d92d23b1..0e089001cc12 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -44,6 +44,7 @@
#include "postgres.h"
#include "access/amapi.h"
+#include "access/nbtree.h" /* XXX for MaxTIDsPerBTreePage (should remove) */
#include "access/relation.h"
#include "access/reloptions.h"
#include "access/relscan.h"
@@ -51,9 +52,11 @@
#include "catalog/index.h"
#include "catalog/pg_type.h"
#include "nodes/execnodes.h"
+#include "optimizer/cost.h"
#include "pgstat.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
+#include "utils/memdebug.h"
#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
@@ -107,8 +110,146 @@ do { \
static IndexScanDesc index_beginscan_internal(Relation indexRelation,
int nkeys, int norderbys, Snapshot snapshot,
ParallelIndexScanDesc pscan, bool temp_snap);
+static ItemPointer index_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction);
+static ItemPointer index_retail_getnext_tid(IndexScanDesc scan, ScanDirection direction);
static inline void validate_relation_kind(Relation r);
+/* index batching */
+static void index_batch_init(IndexScanDesc scan);
+static void index_batch_reset(IndexScanDesc scan, bool complete);
+static void index_batch_end(IndexScanDesc scan);
+static bool index_batch_getnext(IndexScanDesc scan, ScanDirection direction);
+static void index_batch_free(IndexScanDesc scan, IndexScanBatch batch);
+
+static BlockNumber index_scan_stream_read_next(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data);
+
+static pg_attribute_always_inline bool index_batch_pos_advance(IndexScanDesc scan,
+ IndexScanBatchPos *pos,
+ ScanDirection direction);
+static void index_batch_pos_reset(IndexScanDesc scan, IndexScanBatchPos *pos);
+static void index_batch_kill_item(IndexScanDesc scan);
+
+static void AssertCheckBatchPosValid(IndexScanDesc scan, IndexScanBatchPos *pos);
+static void AssertCheckBatch(IndexScanDesc scan, IndexScanBatch batch);
+static void AssertCheckBatches(IndexScanDesc scan);
+
+
+/*
+ * Maximum number of batches (leaf pages) we can keep in memory.
+ *
+ * The value 64 is arbitrary; it's about 1MB of data with 8KB pages (512kB
+ * for the pages themselves, plus a bit of overhead). We should not really
+ * need this many batches in most cases, though. The read stream looks ahead
+ * just far enough to queue enough IOs, adjusting the distance (in TIDs, and
+ * ultimately in the number of future batches) to meet that.
+ *
+ * In most cases an index leaf page has many (hundreds) index tuples, and
+ * it's enough to read one or maybe two leaf pages ahead to satisfy the
+ * distance.
+ *
+ * But there are cases where this may not quite work, for example:
+ *
+ * a) bloated index - many pages only have a single index item, so that
+ * achieving the distance requires too many leaf pages
+ *
+ * b) correlated index - duplicate blocks are skipped (the callback does not
+ * even return those, thanks to the currentPrefetchBlock optimization), and are
+ * mostly ignored in the distance heuristics (read stream does not even see
+ * those TIDs, and there's no I/O either)
+ *
+ * c) index-only scan - the callback skips TIDs from all-visible blocks (not
+ * reading those is the whole point of index-only scans), and so it's
+ * invisible to the distance / IO heuristics (similarly to duplicates)
+ *
+ * In these cases we might need to read a significant number of batches to
+ * find the first block to return to the read stream. It's not clear if
+ * looking this far ahead is worth it - it's a lot of work / synchronous
+ * I/O, and the query may terminate before reaching those TIDs (e.g. due to
+ * a LIMIT clause).
+ *
+ * Currently, there's no way to "pause" a read stream - stop looking ahead
+ * for a while, but then resume the work when a batch gets freed. To simulate
+ * this, the read stream is terminated (as if there were no more data), and
+ * then reset after draining all the queued blocks in order to resume work.
+ * This works, but it "stalls" the I/O queue. If it happens very often, it
+ * can be a serious performance bottleneck.
+ *
+ * XXX Maybe 64 is too high? It also defines the maximum amount of overhead
+ * allowed. In the worst case, reading a single row might trigger reading this
+ * many leaf pages (e.g. with IOS, if most pages are all-visible), which might
+ * be an issue with LIMIT queries, when we never actually get that far.
+ */
+#define INDEX_SCAN_MAX_BATCHES 64
+
+/*
+ * Thresholds controlling when we cancel use of a read stream to do
+ * prefetching: we only consider cancelling once the stream has progressed
+ * past INDEX_SCAN_MIN_DISTANCE_NBATCHES batches, and we cancel if the stream
+ * position is then fewer than INDEX_SCAN_MIN_TUPLE_DISTANCE index tuples
+ * ahead of the read position.
+ */
+#define INDEX_SCAN_MIN_DISTANCE_NBATCHES 20
+#define INDEX_SCAN_MIN_TUPLE_DISTANCE 7
+
+#define INDEX_SCAN_BATCH_COUNT(scan) \
+ ((scan)->batchState->nextBatch - (scan)->batchState->headBatch)
+
+/* Did we already load batch with the requested index? */
+/* XXX shouldn't this also compare headBatch? maybe the batch was freed? */
+#define INDEX_SCAN_BATCH_LOADED(scan, idx) \
+ ((idx) < (scan)->batchState->nextBatch)
+
+/* Have we loaded the maximum number of batches? */
+#define INDEX_SCAN_BATCH_FULL(scan) \
+ (INDEX_SCAN_BATCH_COUNT(scan) == scan->batchState->maxBatches)
+
+/*
+ * Return batch for the provided index.
+ *
+ * XXX Should this have an assert to enforce the batch is loaded? Maybe the
+ * index is too far back, but there happens to be a batch in the right slot?
+ * Could easily happen if we have to keep many batches around.
+ */
+#define INDEX_SCAN_BATCH(scan, idx) \
+ ((scan)->batchState->batches[(idx) % INDEX_SCAN_MAX_BATCHES])
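+
+/*
+ * For example (illustration only): with INDEX_SCAN_MAX_BATCHES = 64, batch
+ * 64 wraps around to slot 0 and batch 65 to slot 1, so a slot's contents
+ * are only meaningful while its batch number is still within the current
+ * [headBatch, nextBatch) range.
+ */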
+
+/* Is the position invalid/undefined? */
+#define INDEX_SCAN_POS_INVALID(pos) \
+ (((pos)->batch == -1) && ((pos)->index == -1))
+
+#ifdef INDEXAM_DEBUG
+#define DEBUG_LOG(...) elog(AmRegularBackendProcess() ? NOTICE : DEBUG2, __VA_ARGS__)
+#else
+#define DEBUG_LOG(...)
+#endif
+
+/* debug: print info about current batches */
+static void
+index_batch_print(const char *label, IndexScanDesc scan)
+{
+#ifdef INDEXAM_DEBUG
+ IndexScanBatchState *batches = scan->batchState;
+
+ if (!scan->batchState)
+ return;
+
+ if (!AmRegularBackendProcess())
+ return;
+ if (IsCatalogRelation(scan->indexRelation))
+ return;
+
+ DEBUG_LOG("%s: batches headBatch %d nextBatch %d maxBatches %d",
+ label,
+ batches->headBatch, batches->nextBatch, batches->maxBatches);
+
+ for (int i = batches->headBatch; i < batches->nextBatch; i++)
+ {
+ IndexScanBatchData *batch = INDEX_SCAN_BATCH(scan, i);
+ BTScanPos pos = (BTScanPos) batch->pos;
+
+ DEBUG_LOG(" batch %d currPage %u %p firstItem %d lastItem %d killed %d",
+ i, pos->currPage, batch, batch->firstItem, batch->lastItem,
+ batch->numKilled);
+ }
+#endif
+}
/* ----------------------------------------------------------------
* index_ interface functions
@@ -283,6 +424,9 @@ index_beginscan(Relation heapRelation,
scan->xs_snapshot = snapshot;
scan->instrument = instrument;
+ if (indexRelation->rd_indam->amgetbatch != NULL)
+ index_batch_init(scan);
+
/* prepare to fetch index matches from table */
scan->xs_heapfetch = table_index_fetch_begin(heapRelation);
@@ -380,6 +524,12 @@ index_rescan(IndexScanDesc scan,
scan->kill_prior_tuple = false; /* for safety */
scan->xs_heap_continue = false;
+ /*
+ * Reset the batching. This makes it look like there are no batches,
+ * discards reads already scheduled within the read stream, etc.
+ */
+ index_batch_reset(scan, true);
+
scan->indexRelation->rd_indam->amrescan(scan, keys, nkeys,
orderbys, norderbys);
}
@@ -394,6 +544,9 @@ index_endscan(IndexScanDesc scan)
SCAN_CHECKS;
CHECK_SCAN_PROCEDURE(amendscan);
+ /* Clean up batching, so that the AM can release pins and so on. */
+ index_batch_end(scan);
+
/* Release resources (like buffer pins) from table accesses */
if (scan->xs_heapfetch)
{
@@ -421,10 +574,42 @@ index_endscan(IndexScanDesc scan)
void
index_markpos(IndexScanDesc scan)
{
+ IndexScanBatchState *batchState = scan->batchState;
+ IndexScanBatchPos *markPos = &batchState->markPos;
+ IndexScanBatchData *markBatch = batchState->markBatch;
+
SCAN_CHECKS;
- CHECK_SCAN_PROCEDURE(ammarkpos);
- scan->indexRelation->rd_indam->ammarkpos(scan);
+ /*
+ * FIXME this should probably check there actually is a batch state. For
+ * now it works because the only AM with mark/restore support is btree, and
+ * that has batching. But we should not rely on that, right?
+ */
+
+ /*
+ * Free the previous mark batch (if any), but only if the batch is no
+ * longer valid, i.e. no longer in the current head/next range. This means
+ * that if we're marking the same batch again (just a different item), we
+ * don't really do anything.
+ *
+ * XXX Should have some macro for this check, I guess.
+ */
+ if (markBatch != NULL && (markPos->batch < batchState->headBatch ||
+ markPos->batch >= batchState->nextBatch))
+ {
+ batchState->markBatch = NULL;
+ index_batch_free(scan, markBatch);
+ }
+
+ /* just copy the read position (which has to be valid) */
+ batchState->markPos = batchState->readPos;
+ batchState->markBatch = INDEX_SCAN_BATCH(scan, batchState->markPos.batch);
+
+ /*
+ * FIXME we need to make sure the batch does not get freed during the
+ * regular advances.
+ */
+
+ AssertCheckBatchPosValid(scan, &batchState->markPos);
}
/* ----------------
@@ -445,19 +630,60 @@ index_markpos(IndexScanDesc scan)
void
index_restrpos(IndexScanDesc scan)
{
+ IndexScanBatchState *batchState;
+ IndexScanBatchPos *markPos;
+ IndexScanBatchData *markBatch;
+
Assert(IsMVCCSnapshot(scan->xs_snapshot));
SCAN_CHECKS;
- CHECK_SCAN_PROCEDURE(amrestrpos);
+ CHECK_SCAN_PROCEDURE(amgetbatch);
+ CHECK_SCAN_PROCEDURE(amposreset);
- /* release resources (like buffer pins) from table accesses */
+ /*
+ * release resources (like buffer pins) from table accesses
+ *
+ * XXX: Currently, the distance is always remembered across any
+ * read_stream_reset calls (to work around the scan->batchState->reset
+ * behavior of resetting the stream to deal with running out of batches).
+ * We probably _should_ be forgetting the distance when we reset the
+ * stream here (through our table_index_fetch_reset call), though.
+ */
if (scan->xs_heapfetch)
table_index_fetch_reset(scan->xs_heapfetch);
scan->kill_prior_tuple = false; /* for safety */
scan->xs_heap_continue = false;
- scan->indexRelation->rd_indam->amrestrpos(scan);
+ /*
+ * FIXME this should probably check there actually is a batch state. For
+ * now it works because the only AM with mark/restore support is btree, and
+ * that has batching. But we should not rely on that, right?
+ */
+
+ batchState = scan->batchState;
+ markPos = &batchState->markPos;
+ markBatch = scan->batchState->markBatch;
+
+ /*
+ * Call amposreset to let index AM know to invalidate any private state
+ * that independently tracks the scan's progress
+ */
+ scan->indexRelation->rd_indam->amposreset(scan, markBatch);
+
+ /*
+ * Reset the batching state, except for the marked batch, and make it look
+ * like we have a single batch -- the marked one.
+ */
+ index_batch_reset(scan, false);
+
+ batchState->markPos = *markPos;
+ batchState->readPos = *markPos;
+ batchState->headBatch = markPos->batch;
+ batchState->nextBatch = (batchState->headBatch + 1);
+
+ INDEX_SCAN_BATCH(scan, batchState->markPos.batch) = markBatch;
+ batchState->markBatch = markBatch; /* also remember this */
}
/*
@@ -579,6 +805,17 @@ index_parallelrescan(IndexScanDesc scan)
if (scan->xs_heapfetch)
table_index_fetch_reset(scan->xs_heapfetch);
+ /*
+ * Reset the batching. This makes it look like there are no batches,
+ * discards reads already scheduled to the read stream, etc.
+ *
+ * XXX We do this before calling amparallelrescan, so that it could
+ * reinitialize everything. (This probably does not matter very much now
+ * that we've moved all the batching logic to indexam.c; it was more
+ * important when the index AM was responsible for more of it.)
+ */
+ index_batch_reset(scan, true);
+
/* amparallelrescan is optional; assume no-op if not provided by AM */
if (scan->indexRelation->rd_indam->amparallelrescan != NULL)
scan->indexRelation->rd_indam->amparallelrescan(scan);
@@ -614,6 +851,9 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel,
scan->xs_snapshot = snapshot;
scan->instrument = instrument;
+ if (indexrel->rd_indam->amgetbatch != NULL)
+ index_batch_init(scan);
+
/* prepare to fetch index matches from table */
scan->xs_heapfetch = table_index_fetch_begin(heaprel);
@@ -630,14 +870,286 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel,
ItemPointer
index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
{
- bool found;
-
SCAN_CHECKS;
- CHECK_SCAN_PROCEDURE(amgettuple);
/* XXX: we should assert that a snapshot is pushed or registered */
Assert(TransactionIdIsValid(RecentXmin));
+ /*
+ * Index AMs that support plain index scans must provide exactly one of
+ * either the amgetbatch or amgettuple callbacks
+ */
+ Assert(!(scan->indexRelation->rd_indam->amgettuple != NULL &&
+ scan->indexRelation->rd_indam->amgetbatch != NULL));
+
+ if (scan->batchState != NULL)
+ return index_batch_getnext_tid(scan, direction);
+ else
+ return index_retail_getnext_tid(scan, direction);
+}
+
+/* ----------------
+ * index_batch_getnext_tid - amgetbatch index_getnext_tid implementation
+ *
+ * If we advance to the next batch, we release the previous one (unless it's
+ * tracked for mark/restore).
+ *
+ * If the scan direction changes, we release all batches except the current
+ * one (per readPos), to make it look like it's the only batch we loaded.
+ *
+ * Returns the first/next TID, or NULL if no more items.
+ *
+ * FIXME This only sets xs_heaptid and xs_itup (if requested). Not sure if
+ * we need to do something with xs_hitup. Should this set xs_hitup?
+ *
+ * XXX Maybe if we advance the position to the next batch, we could keep the
+ * batch for a bit more, in case the scan direction changes (as long as it
+ * fits into maxBatches)? But maybe that's unnecessary complexity for too
+ * little gain, we'd need to be careful about releasing the batches lazily.
+ * ----------------
+ */
+static ItemPointer
+index_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction)
+{
+ IndexScanBatchState *batchState = scan->batchState;
+ IndexScanBatchPos *readPos;
+
+ CHECK_SCAN_PROCEDURE(amgetbatch);
+
+ /* shouldn't get here without batching */
+ AssertCheckBatches(scan);
+
+ /* Initialize direction on first call */
+ if (batchState->direction == NoMovementScanDirection)
+ batchState->direction = direction;
+
+ /*
+ * Handle cancelling the use of the read stream for prefetching
+ */
+ else if (unlikely(batchState->disabled && scan->xs_heapfetch->rs))
+ {
+ index_batch_pos_reset(scan, &batchState->streamPos);
+
+ read_stream_reset(scan->xs_heapfetch->rs);
+ scan->xs_heapfetch->rs = NULL;
+ }
+
+ /*
+ * Handle change of scan direction (reset stream, ...).
+ *
+ * Release future batches properly, to make it look like the current batch
+ * is the only one we loaded. Also reset the stream position, as if we are
+ * just starting the scan.
+ */
+ else if (unlikely(batchState->direction != direction))
+ {
+ /* release "future" batches in the wrong direction */
+ while (batchState->nextBatch > batchState->headBatch + 1)
+ {
+ IndexScanBatch fbatch;
+
+ batchState->nextBatch--;
+ fbatch = INDEX_SCAN_BATCH(scan, batchState->nextBatch);
+ index_batch_free(scan, fbatch);
+ }
+
+ /*
+ * Remember the new direction, and make sure the scan is not marked as
+ * "finished" (we might have already read the last batch, but now we
+ * need to start over). Do this before resetting the stream - it
+ * should not invoke the callback until the first read, but it may
+ * seem a bit confusing otherwise.
+ */
+ batchState->direction = direction;
+ batchState->finished = false;
+ batchState->currentPrefetchBlock = InvalidBlockNumber;
+
+ index_batch_pos_reset(scan, &batchState->streamPos);
+ if (scan->xs_heapfetch->rs)
+ read_stream_reset(scan->xs_heapfetch->rs);
+ }
+
+ /* shortcut for the read position, for convenience */
+ readPos = &batchState->readPos;
+
+ DEBUG_LOG("index_batch_getnext_tid readPos %d %d direction %d",
+ readPos->batch, readPos->index, direction);
+
+ /*
+ * Try advancing the batch position. If that doesn't succeed, it means we
+ * don't have more items in the current batch, and there's no future batch
+ * loaded. So try loading another batch, and maybe retry.
+ *
+ * FIXME This loop shouldn't iterate more than twice. Maybe we should have
+ * some protection against infinite loops, to detect cases where the
+ * advance/getnext functions disagree?
+ */
+ while (true)
+ {
+ /*
+ * If we manage to advance to the next item, return it and we're
+ * done. Otherwise try loading another batch.
+ */
+ if (index_batch_pos_advance(scan, readPos, direction))
+ {
+ IndexScanBatchData *readBatch = INDEX_SCAN_BATCH(scan, readPos->batch);
+
+ /* set the TID / itup for the scan */
+ scan->xs_heaptid = readBatch->items[readPos->index].heapTid;
+ if (scan->xs_want_itup)
+ scan->xs_itup =
+ (IndexTuple) (readBatch->currTuples +
+ readBatch->items[readPos->index].tupleOffset);
+
+ DEBUG_LOG("readBatch %p firstItem %d lastItem %d readPos %d/%d TID (%u,%u)",
+ readBatch, readBatch->firstItem, readBatch->lastItem,
+ readPos->batch, readPos->index,
+ ItemPointerGetBlockNumber(&scan->xs_heaptid),
+ ItemPointerGetOffsetNumber(&scan->xs_heaptid));
+
+ /*
+ * If we advanced to the next batch, release the batch we no
+ * longer need. The position is the "read" position, and we can
+ * compare it to headBatch.
+ */
+ if (unlikely(readPos->batch != batchState->headBatch))
+ {
+ IndexScanBatchData *headBatch = INDEX_SCAN_BATCH(scan,
+ batchState->headBatch);
+
+ /*
+ * XXX When advancing readPos, the streamPos may get behind as
+ * we're only advancing it when actually requesting heap
+ * blocks. But we may not do that often enough - e.g. IOS may
+ * not need to access all-visible heap blocks, so the
+ * read_next callback does not get invoked for a long time.
+ * It's possible the stream gets so much behind that the position
+ * becomes invalid, as we already removed the batch. But that
+ * means we don't need any heap blocks until the current read
+ * position - if we did, we would not be in this situation (or
+ * it's a sign of a bug, as those two places are expected to
+ * be in sync). So if the streamPos still points at the batch
+ * we're about to free, just reset the position - we'll set it
+ * to readPos in the read_next callback later.
+ *
+ * XXX This can happen after the queue gets full, we "pause"
+ * the stream, and then reset it to continue. But I think that
+ * just increases the probability of hitting the issue, it's
+ * just more chances to not advance the streamPos, which
+ * depends on when we try to fetch the first heap block after
+ * calling read_stream_reset().
+ *
+ * FIXME Simplify/clarify/shorten this comment. Can it
+ * actually happen, if we never pull from the stream in IOS?
+ * We probably don't look ahead for the first call.
+ */
+ if (unlikely(batchState->streamPos.batch == batchState->headBatch))
+ {
+ DEBUG_LOG("index_batch_pos_reset called early (streamPos.batch == headBatch)");
+ index_batch_pos_reset(scan, &batchState->streamPos);
+ }
+
+ DEBUG_LOG("index_batch_getnext_tid free headBatch %p headBatch %d nextBatch %d",
+ headBatch, batchState->headBatch, batchState->nextBatch);
+
+ /* Free the head batch (except when it's markBatch) */
+ index_batch_free(scan, headBatch);
+
+ /*
+ * In any case, remove the batch from the regular queue, even
+ * if we kept it for mark/restore.
+ */
+ batchState->headBatch++;
+
+ DEBUG_LOG("index_batch_getnext_tid batch freed headBatch %d nextBatch %d",
+ batchState->headBatch, batchState->nextBatch);
+
+ index_batch_print("index_batch_getnext_tid / free old batch", scan);
+
+ /* we can't skip any batches */
+ Assert(batchState->headBatch == readPos->batch);
+ }
+
+ pgstat_count_index_tuples(scan->indexRelation, 1);
+ return &scan->xs_heaptid;
+ }
+
+ /*
+ * We failed to advance, i.e. we ran out of currently loaded batches.
+ * So if we filled the queue, this is a good time to reset the stream
+ * (before we try loading the next batch).
+ */
+ if (unlikely(batchState->reset))
+ {
+ DEBUG_LOG("resetting read stream readPos %d,%d",
+ readPos->batch, readPos->index);
+
+ batchState->reset = false;
+ batchState->currentPrefetchBlock = InvalidBlockNumber;
+
+ /*
+ * Need to reset the stream position, it might be too far behind.
+ * Ultimately we want to set it to readPos, but we can't do that
+ * yet - readPos still points at the old batch, so just reset it
+ * and we'll init it to readPos later in the callback.
+ */
+ index_batch_pos_reset(scan, &batchState->streamPos);
+
+ if (scan->xs_heapfetch->rs)
+ read_stream_reset(scan->xs_heapfetch->rs);
+ }
+
+ /*
+ * Failed to advance the read position, so try reading the next batch.
+ * If this fails, we're done - there's nothing more to load.
+ *
+ * Most of the batches should be loaded from read_stream_next_buffer,
+ * but we need to call index_batch_getnext here too, for two reasons.
+ * First, the read_stream only gets working after we try fetching the
+ * first heap tuple, so we need to load the initial batch (the head).
+ * Second, while most batches will be preloaded by the stream thanks
+ * to prefetching, it's possible to set effective_io_concurrency=0,
+ * and in that case all the batches get loaded from here.
+ */
+ if (!index_batch_getnext(scan, direction))
+ break;
+
+ DEBUG_LOG("loaded next batch, retry to advance position");
+ }
+
+ /*
+ * If we get here, we failed to advance the position and there are no more
+ * batches, so we're done.
+ */
+ DEBUG_LOG("no more batches to process");
+
+ /*
+ * Reset the position - we must not keep the last valid position, in case
+ * we change direction of the scan and start scanning again. If we kept
+ * the position, we'd skip the first item.
+ *
+ * XXX This is a bit strange. Do we really need to reset the position
+ * after returning the last item? I wonder if it means the API is not
+ * quite right.
+ */
+ index_batch_pos_reset(scan, readPos);
+
+ return NULL;
+}
+
+/* ----------------
+ * index_retail_getnext_tid - amgettuple index_getnext_tid implementation
+ *
+ * Returns the first/next TID, or NULL if no more items.
+ * ----------------
+ */
+static ItemPointer
+index_retail_getnext_tid(IndexScanDesc scan, ScanDirection direction)
+{
+ bool found;
+
+ CHECK_SCAN_PROCEDURE(amgettuple);
+
/*
* The AM's amgettuple proc finds the next index entry matching the scan
* keys, and puts the TID into scan->xs_heaptid. It should also set
@@ -704,9 +1216,18 @@ index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot)
* amgettuple call, in index_getnext_tid). We do not do this when in
* recovery because it may violate MVCC to do so. See comments in
* RelationGetIndexScan().
+ *
+ * XXX For scans using batching, record the flag in the batch (we will
+ * pass it to the AM later, when freeing it). Otherwise just pass it to
+ * the AM using the kill_prior_tuple field.
*/
if (!scan->xactStartedInRecovery)
- scan->kill_prior_tuple = all_dead;
+ {
+ if (scan->batchState == NULL)
+ scan->kill_prior_tuple = all_dead;
+ else if (all_dead)
+ index_batch_kill_item(scan);
+ }
return found;
}
@@ -1089,3 +1610,934 @@ index_opclass_options(Relation indrel, AttrNumber attnum, Datum attoptions,
return build_local_reloptions(&relopts, attoptions, validate);
}
+
+/*
+ * Check that a position (batch,item) is valid with respect to the batches we
+ * have currently loaded.
+ *
+ * XXX The "marked" batch is an exception. The marked batch may get outside
+ * the range of current batches, so make sure to never check the position
+ * for that.
+ */
+static void
+AssertCheckBatchPosValid(IndexScanDesc scan, IndexScanBatchPos *pos)
+{
+#ifdef USE_ASSERT_CHECKING
+ IndexScanBatchState *batchState = scan->batchState;
+
+ /* make sure the position is valid for currently loaded batches */
+ Assert(pos->batch >= batchState->headBatch);
+ Assert(pos->batch < batchState->nextBatch);
+#endif
+}
+
+/*
+ * Check a single batch is valid.
+ */
+static void
+AssertCheckBatch(IndexScanDesc scan, IndexScanBatch batch)
+{
+#ifdef USE_ASSERT_CHECKING
+ /* there must be a valid range of items */
+ Assert(batch->firstItem <= batch->lastItem);
+ Assert(batch->firstItem >= 0);
+
+ /* we should have items (buffer and pointers) */
+ Assert(batch->items != NULL);
+
+ /*
+ * The number of killed items must be valid, and there must be an array of
+ * indexes if there are items.
+ */
+ Assert(batch->numKilled >= 0);
+ Assert(!(batch->numKilled > 0 && batch->killedItems == NULL));
+
+ /* XXX can we check some of the other batch fields? */
+#endif
+}
+
+/*
+ * Check invariants on current batches
+ *
+ * Makes sure the indexes are set as expected, the buffer size is within
+ * limits, and so on.
+ */
+static void
+AssertCheckBatches(IndexScanDesc scan)
+{
+#ifdef USE_ASSERT_CHECKING
+ IndexScanBatchState *batchState = scan->batchState;
+
+ /* we should have batches initialized */
+ Assert(batchState != NULL);
+
+ /* We should not have too many batches. */
+ Assert(batchState->maxBatches > 0 &&
+ batchState->maxBatches <= INDEX_SCAN_MAX_BATCHES);
+
+ /*
+ * The head/next indexes should define a valid range in the cyclic
+ * buffer, and should not overflow maxBatches.
+ */
+ Assert(batchState->headBatch >= 0 &&
+ batchState->headBatch <= batchState->nextBatch);
+ Assert(batchState->nextBatch - batchState->headBatch <=
+ batchState->maxBatches);
+
+ /* Check all current batches */
+ for (int i = batchState->headBatch; i < batchState->nextBatch; i++)
+ {
+ IndexScanBatch batch = INDEX_SCAN_BATCH(scan, i);
+
+ AssertCheckBatch(scan, batch);
+ }
+#endif
+}
+
+/*
+ * index_batch_pos_advance
+ * Advance the position to the next item, depending on scan direction.
+ *
+ * Advance the position to the next item, either in the same batch or the
+ * following one (if already available).
+ *
+ * We can advance only if we already have some batches loaded, and there's
+ * either enough items in the current batch, or some more items in the
+ * subsequent batches.
+ *
+ * If this is the first advance (right after loading the initial/head batch),
+ * position is still undefined. Otherwise we expect the position to be valid.
+ *
+ * Returns true if the position was advanced, false otherwise.
+ *
+ * The position is guaranteed to be valid only after a successful advance.
+ * If an advance fails (false returned), the position can be invalid.
+ *
+ * XXX This seems like a good place to enforce some "invariants", e.g.
+ * that the positions are always valid. We should never get here with an
+ * invalid position (so it probably should be initialized as part of loading
+ * the initial/head batch), and then invalidated if the advance fails. Could
+ * be tricky for the stream position, though, because it can lag behind for
+ * IOS etc.
+ */
+static pg_attribute_always_inline bool
+index_batch_pos_advance(IndexScanDesc scan, IndexScanBatchPos *pos,
+ ScanDirection direction)
+{
+ IndexScanBatchData *batch;
+
+ /* make sure we have batching initialized and consistent */
+ AssertCheckBatches(scan);
+
+ /* should know direction by now */
+ Assert(direction == scan->batchState->direction);
+ Assert(direction != NoMovementScanDirection);
+
+ /* We can't advance if there are no batches available. */
+ if (INDEX_SCAN_BATCH_COUNT(scan) == 0)
+ return false;
+
+ /*
+ * If the position has not been advanced yet, it has to be right after we
+ * loaded the initial batch (must be the head batch). In that case just
+ * initialize it to the batch's first item (or its last item, when
+ * scanning backwards).
+ *
+ * XXX Maybe we should just explicitly initialize the position after
+ * loading the initial batch, without having to go through the advance.
+ */
+ if (INDEX_SCAN_POS_INVALID(pos))
+ {
+ /*
+ * We should have loaded the scan's initial batch, or maybe we have
+ * changed the direction of the scan after scanning all the way to the
+ * end (in which case the position is invalid, and we make it look
+ * like there is just one batch). Either way, we expect to have exactly
+ * one batch at this point.
+ *
+ * XXX Actually, could there be more batches? Maybe we prefetched more
+ * batches right away? It doesn't seem to be an essential invariant, though.
+ */
+ Assert(INDEX_SCAN_BATCH_COUNT(scan) == 1);
+
+ /*
+ * Get the initial batch (which must be the head), and initialize the
+ * position to the appropriate item for the current scan direction
+ */
+ batch = INDEX_SCAN_BATCH(scan, scan->batchState->headBatch);
+
+ pos->batch = scan->batchState->headBatch;
+
+ if (ScanDirectionIsForward(direction))
+ pos->index = batch->firstItem;
+ else
+ pos->index = batch->lastItem;
+
+ AssertCheckBatchPosValid(scan, pos);
+
+ return true;
+ }
+
+ /*
+ * The position is already defined, so we should have some batches loaded
+ * and the position has to be valid with respect to those.
+ */
+ AssertCheckBatchPosValid(scan, pos);
+
+ /*
+ * Advance to the next item in the same batch, if there are more items. If
+ * we're at the last item, we'll try advancing to the next batch later.
+ */
+ batch = INDEX_SCAN_BATCH(scan, pos->batch);
+
+ if (ScanDirectionIsForward(direction))
+ {
+ if (++pos->index <= batch->lastItem)
+ {
+ AssertCheckBatchPosValid(scan, pos);
+
+ return true;
+ }
+ }
+ else /* ScanDirectionIsBackward */
+ {
+ if (--pos->index >= batch->firstItem)
+ {
+ AssertCheckBatchPosValid(scan, pos);
+
+ return true;
+ }
+ }
+
+ /*
+ * We couldn't advance within the same batch, try advancing to the next
+ * batch, if it's already loaded.
+ */
+ if (INDEX_SCAN_BATCH_LOADED(scan, pos->batch + 1))
+ {
+ /* advance to the next batch */
+ pos->batch++;
+
+ batch = INDEX_SCAN_BATCH(scan, pos->batch);
+ Assert(batch != NULL);
+
+ if (ScanDirectionIsForward(direction))
+ pos->index = batch->firstItem;
+ else
+ pos->index = batch->lastItem;
+
+ AssertCheckBatchPosValid(scan, pos);
+
+ return true;
+ }
+
+ /* can't advance */
+ return false;
+}
+
+/*
+ * index_batch_pos_reset
+ * Reset the position, so that it looks as if never advanced.
+ */
+static void
+index_batch_pos_reset(IndexScanDesc scan, IndexScanBatchPos *pos)
+{
+ pos->batch = -1;
+ pos->index = -1;
+}
+
+/*
+ * index_scan_stream_read_next
+ * return the next block to pass to the read stream
+ *
+ * This assumes the "current" scan direction, requested by the caller.
+ *
+ * If the direction changes before consuming all blocks, we'll reset the stream
+ * and start from scratch. The scan direction change is handled elsewhere. Here
+ * we rely on having the correct value in batchState->direction.
+ *
+ * The position of the read_stream is stored in streamPos, which may be ahead of
+ * the current readPos (which is what got consumed by the scan).
+ *
+ * The streamPos can however also get behind readPos too, when some blocks are
+ * skipped and not returned to the read_stream. An example is an index scan on
+ * a correlated index, where many duplicate blocks are skipped, or an IOS where
+ * all-visible blocks are skipped.
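+ *
+ * (Illustration, not an invariant: readPos may be in batch 5 while streamPos
+ * has already prefetched ahead into batch 7; conversely, streamPos may still
+ * lag behind in batch 4 if no heap blocks had to be requested for a while,
+ * e.g. during an IOS reading only all-visible pages, so this callback was
+ * not invoked.)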
+ *
+ * The initial batch is always loaded from index_batch_getnext_tid(). We don't
+ * get here until the first read_stream_next_buffer() call, when pulling the
+ * first heap tuple from the stream. After that, most batches should be loaded
+ * by this callback, driven by the read_stream look-ahead distance. However,
+ * with disabled prefetching (that is, with effective_io_concurrency=0), all
+ * batches will be loaded in index_batch_getnext_tid.
+ *
+ * It's possible we got here only fairly late in the scan, e.g. if many tuples
+ * got skipped in the index-only scan, etc. In this case just use the read
+ * position as a streamPos starting point.
+ *
+ * XXX It seems the readPos/streamPos comments should be placed elsewhere. The
+ * read_stream callback does not seem like the right place.
+ */
+static BlockNumber
+index_scan_stream_read_next(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ IndexScanDesc scan = (IndexScanDesc) callback_private_data;
+ IndexScanBatchState *batchState = scan->batchState;
+ IndexScanBatchPos *streamPos = &batchState->streamPos;
+ ScanDirection direction = batchState->direction;
+
+ /* By now we should know the direction of the scan. */
+ Assert(direction != NoMovementScanDirection);
+
+ /*
+ * The read position (readPos) has to be valid.
+ *
+ * We initialize/advance it before even attempting to read the heap tuple,
+ * and it gets invalidated when we reach the end of the scan (but then we
+ * don't invoke the callback again).
+ *
+ * XXX This applies to the readPos. We'll use streamPos to determine which
+ * blocks to pass to the stream, and readPos may be used to initialize it.
+ */
+ AssertCheckBatchPosValid(scan, &batchState->readPos);
+
+ /*
+ * Try to advance the streamPos to the next item, and if that doesn't
+ * succeed (if there are no more items in loaded batches), try loading the
+ * next one.
+ *
+ * FIXME Unlike index_batch_getnext_tid, this can loop more than twice. If
+ * many blocks get skipped due to currentPrefetchBlock or all-visibility
+ * (per the "prefetch" callback), we get to load additional batches. In
+ * the worst case we hit the INDEX_SCAN_MAX_BATCHES limit and have to
+ * "pause" the stream.
+ */
+ while (true)
+ {
+ bool advanced = false;
+
+ /*
+ * If the stream position has not been initialized yet, set it to the
+ * current read position. This is the item the caller is trying to
+ * read, so it's what we should return to the stream.
+ */
+ if (INDEX_SCAN_POS_INVALID(streamPos))
+ {
+ *streamPos = batchState->readPos;
+ advanced = true;
+ }
+ else if (index_batch_pos_advance(scan, streamPos, direction))
+ {
+ advanced = true;
+ }
+
+ /*
+ * FIXME Maybe check the streamPos is not behind readPos?
+ *
+ * FIXME Actually, could streamPos get stale / lag behind readPos,
+ * and if so, by how much? Could it get so far behind that it's not valid,
+ * pointing at a freed batch? In that case we can't even advance it,
+ * and we should just initialize it to readPos. We might do that
+ * anyway, I guess, just to save on "pointless" advances (it must
+ * agree with readPos, we can't allow "retroactively" changing the
+ * block sequence).
+ */
+
+ /*
+ * If we advanced the position, either return the block for the TID,
+ * or skip it (and then try advancing again).
+ *
+ * The block may be "skipped" for two reasons. First, the caller may
+ * define a "prefetch" callback that tells us to skip items (IOS does
+ * this to skip all-visible pages). Second, currentPrefetchBlock is
+ * used to skip duplicate block numbers (a sequence of TIDs for the
+ * same block).
+ */
+ if (advanced)
+ {
+ IndexScanBatch streamBatch = INDEX_SCAN_BATCH(scan, streamPos->batch);
+ ItemPointer tid = &streamBatch->items[streamPos->index].heapTid;
+
+ DEBUG_LOG("index_scan_stream_read_next: index %d TID (%u,%u)",
+ streamPos->index,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+
+ /*
+ * If there's a prefetch callback, use it to decide if we need to
+ * read the next block.
+ *
+ * We need to do this before checking currentPrefetchBlock; it's
+ * essential that the VM cache used by index-only scans is
+ * initialized here.
+ */
+ if (batchState->prefetch &&
+ !batchState->prefetch(scan, batchState->prefetchArg, streamPos))
+ {
+ DEBUG_LOG("index_scan_stream_read_next: skip block (callback)");
+ continue;
+ }
+
+ /* same block as before, don't need to read it */
+ if (batchState->currentPrefetchBlock == ItemPointerGetBlockNumber(tid))
+ {
+ DEBUG_LOG("index_scan_stream_read_next: skip block (currentPrefetchBlock)");
+ continue;
+ }
+
+ batchState->currentPrefetchBlock = ItemPointerGetBlockNumber(tid);
+
+ return batchState->currentPrefetchBlock;
+ }
+
+ /*
+ * Couldn't advance the position, no more items in the loaded batches.
+ * Try loading the next batch - if that succeeds, try advancing again
+ * (this time the advance should work, but we may skip all the items).
+ *
+ * If we fail to load the next batch, we're done.
+ */
+ if (!index_batch_getnext(scan, direction))
+ break;
+
+ /*
+ * Consider disabling prefetching when we can't keep a sufficiently
+ * large "index tuple distance" between readPos and streamPos.
+ *
+ * Only consider doing this once we're sufficiently far past the scan's
+ * initial batch, and only when readPos and streamPos share the same
+ * batch.
+ */
+ if (!batchState->finished && !batchState->prefetchingLockedIn)
+ {
+ int indexdiff;
+
+ if (streamPos->batch <= INDEX_SCAN_MIN_DISTANCE_NBATCHES)
+ {
+ /* Too early to check if prefetching should be disabled */
+ }
+ else if (batchState->readPos.batch == streamPos->batch)
+ {
+ IndexScanBatchPos *readPos = &batchState->readPos;
+
+ if (ScanDirectionIsForward(direction))
+ indexdiff = streamPos->index - readPos->index;
+ else
+ {
+ IndexScanBatch readBatch =
+ INDEX_SCAN_BATCH(scan, readPos->batch);
+
+ indexdiff = (readPos->index - readBatch->firstItem) -
+ (streamPos->index - readBatch->firstItem);
+ }
+
+ if (indexdiff < INDEX_SCAN_MIN_TUPLE_DISTANCE)
+ {
+ batchState->disabled = true;
+ return InvalidBlockNumber;
+ }
+ else
+ {
+ batchState->prefetchingLockedIn = true;
+ }
+ }
+ else
+ batchState->prefetchingLockedIn = true;
+ }
+ }
+
+ /* no more items in this scan */
+ return InvalidBlockNumber;
+}
+
+/* ----------------
+ * index_batch_getnext - get the next batch of TIDs from a scan
+ *
+ * Returns true if we managed to read a batch of TIDs, or false if there are
+ * no more TIDs in the scan. This may also return false if we've already used
+ * the maximum number of batches (INDEX_SCAN_MAX_BATCHES), in which case we'll
+ * reset the stream and continue the scan later.
+ *
+ * This only loads the TIDs and resets the various batch fields to fresh
+ * state. It does not set xs_heaptid/xs_itup/xs_hitup, that's the
+ * responsibility of the following index_batch_getnext_tid() calls.
+ * ----------------
+ */
+static bool
+index_batch_getnext(IndexScanDesc scan, ScanDirection direction)
+{
+ IndexScanBatchState *batchState = scan->batchState;
+ IndexScanBatch priorbatch = NULL,
+ batch = NULL;
+
+ SCAN_CHECKS;
+ CHECK_SCAN_PROCEDURE(amgetbatch);
+
+ /* XXX: we should assert that a snapshot is pushed or registered */
+ Assert(TransactionIdIsValid(RecentXmin));
+
+ /* Did we already read the last batch for this scan? */
+ if (batchState->finished)
+ return false;
+
+ /*
+ * If we already used the maximum number of batch slots available, it's
+ * pointless to try loading another one. This can happen for various
+ * reasons, e.g. for index-only scans on an all-visible table, or skipping
+ * duplicate blocks on perfectly correlated indexes, etc.
+ *
+ * We could enlarge the array to allow more batches, but that's futile; we
+ * can always construct a case using more memory. Not only would it risk
+ * OOM, it'd also be inefficient because this happens early in the scan
+ * (so it'd interfere with LIMIT queries).
+ */
+ if (INDEX_SCAN_BATCH_FULL(scan))
+ {
+ DEBUG_LOG("index_batch_getnext: ran out of space for batches");
+ scan->batchState->reset = true;
+ return false;
+ }
+
+ index_batch_print("index_batch_getnext / start", scan);
+
+ /*
+ * Check if there's an existing batch that amgetbatch has to pick things
+ * up from
+ */
+ if (batchState->headBatch < batchState->nextBatch)
+ priorbatch = INDEX_SCAN_BATCH(scan, batchState->nextBatch - 1);
+
+ batch = scan->indexRelation->rd_indam->amgetbatch(scan, priorbatch,
+ direction);
+ if (batch != NULL)
+ {
+ /*
+ * We got the batch from the AM, but we need to add it to the queue.
+ * Maybe that should be part of the "batch allocation" that happens in
+ * the AM?
+ */
+ int batchIndex = batchState->nextBatch;
+
+ INDEX_SCAN_BATCH(scan, batchIndex) = batch;
+
+ batchState->nextBatch++;
+
+ DEBUG_LOG("index_batch_getnext headBatch %d nextBatch %d batch %p",
+ batchState->headBatch, batchState->nextBatch, batch);
+
+ /* Delay initializing stream until reading from scan's second batch */
+ if (priorbatch && !scan->xs_heapfetch->rs && !batchState->disabled &&
+ enable_indexscan_prefetch)
+ scan->xs_heapfetch->rs =
+ read_stream_begin_relation(READ_STREAM_DEFAULT, NULL,
+ scan->heapRelation, MAIN_FORKNUM,
+ index_scan_stream_read_next, scan, 0);
+ }
+ else
+ batchState->finished = true;
+
+ AssertCheckBatches(scan);
+
+ index_batch_print("index_batch_getnext / end", scan);
+
+ return (batch != NULL);
+}
+
+/*
+ * index_batch_init
+ * Initialize various fields / arrays needed by batching.
+ *
+ * FIXME This is a bit of an ad-hoc hodgepodge, due to how I was adding more
+ * and more pieces. Some of the fields may not be quite necessary; needs
+ * cleanup.
+ */
+static void
+index_batch_init(IndexScanDesc scan)
+{
+ /* init batching info */
+ Assert(scan->indexRelation->rd_indam->amgetbatch != NULL);
+ Assert(scan->indexRelation->rd_indam->amfreebatch != NULL);
+
+ scan->batchState = palloc(sizeof(IndexScanBatchState));
+
+ /*
+ * Initialize the batch.
+ *
+ * We prefer to eagerly drop leaf page pins before amgetbatch returns.
+ * This avoids making VACUUM wait to acquire a cleanup lock on the page.
+ *
+ * We cannot safely drop leaf page pins during index-only scans due to a
+ * race condition involving VACUUM setting pages all-visible in the VM.
+ * It's also unsafe for plain index scans that use a non-MVCC snapshot.
+ *
+ * When we drop pins eagerly, the mechanism that marks index tuples as
+ * LP_DEAD has to deal with concurrent TID recycling races. The scheme
+ * used to detect unsafe TID recycling won't work when scanning unlogged
+ * relations (since it involves saving an affected page's LSN). Opt out
+ * of eager pin dropping during unlogged relation scans for now.
+ */
+ scan->batchState->dropPin =
+ (!scan->xs_want_itup && IsMVCCSnapshot(scan->xs_snapshot) &&
+ RelationNeedsWAL(scan->indexRelation));
+ scan->batchState->finished = false;
+ scan->batchState->reset = false;
+ scan->batchState->prefetchingLockedIn = false;
+ scan->batchState->disabled = false;
+ scan->batchState->currentPrefetchBlock = InvalidBlockNumber;
+ scan->batchState->direction = NoMovementScanDirection;
+ /* positions in the queue of batches (read, stream/prefetch, mark) */
+ index_batch_pos_reset(scan, &scan->batchState->readPos);
+ index_batch_pos_reset(scan, &scan->batchState->streamPos);
+ index_batch_pos_reset(scan, &scan->batchState->markPos);
+
+ scan->batchState->markBatch = NULL;
+ scan->batchState->maxBatches = INDEX_SCAN_MAX_BATCHES;
+ scan->batchState->headBatch = 0; /* initial head batch */
+ scan->batchState->nextBatch = 0; /* initial batch starts empty */
+
+ /* XXX init the cache of batches, capacity 16 is arbitrary */
+ scan->batchState->batchesCacheSize = 16;
+ scan->batchState->batchesCache = NULL;
+
+ scan->batchState->batches =
+ palloc(sizeof(IndexScanBatchData *) * scan->batchState->maxBatches);
+
+ scan->batchState->prefetch = NULL;
+ scan->batchState->prefetchArg = NULL;
+}
+
+/*
+ * index_batch_reset
+ * Reset the batch before reading the next chunk of data.
+ *
+ * complete - true means we also reset the marked batch
+ *
+ * XXX Should this reset the batch memory context, xs_itup, xs_hitup, etc?
+ */
+static void
+index_batch_reset(IndexScanDesc scan, bool complete)
+{
+ IndexScanBatchState *batchState = scan->batchState;
+
+ /* bail out if batching not enabled */
+ if (!batchState)
+ return;
+
+ AssertCheckBatches(scan);
+
+ index_batch_print("index_batch_reset", scan);
+
+ /* Reset the read stream (if any); with batching we expect xs_heapfetch to exist */
+ Assert(scan->xs_heapfetch);
+ if (scan->xs_heapfetch->rs)
+ read_stream_reset(scan->xs_heapfetch->rs);
+
+ /* reset the positions */
+ index_batch_pos_reset(scan, &batchState->readPos);
+ index_batch_pos_reset(scan, &batchState->streamPos);
+
+ /*
+ * With "complete" reset, make sure to also free the marked batch, either
+ * by just forgetting it (if it's still in the queue), or by explicitly
+ * freeing it.
+ *
+ * XXX Do this before the loop, so that it calls the amfreebatch().
+ */
+ if (complete && unlikely(batchState->markBatch != NULL))
+ {
+ IndexScanBatchPos *markPos = &batchState->markPos;
+ IndexScanBatch markBatch = batchState->markBatch;
+
+ /* always reset the position, forget the marked batch */
+ batchState->markBatch = NULL;
+
+ /*
+ * If we've already moved past the marked batch (it's not in the
+ * current queue), free it explicitly. Otherwise it'll be freed later,
+ * along with the rest of the queue.
+ */
+ if (markPos->batch < batchState->headBatch ||
+ markPos->batch >= batchState->nextBatch)
+ index_batch_free(scan, markBatch);
+
+ /* reset position only after the queue range check */
+ index_batch_pos_reset(scan, &batchState->markPos);
+ }
+
+ /* release all currently loaded batches */
+ while (batchState->headBatch < batchState->nextBatch)
+ {
+ IndexScanBatch batch = INDEX_SCAN_BATCH(scan, batchState->headBatch);
+
+ DEBUG_LOG("freeing batch %d %p", batchState->headBatch, batch);
+
+ index_batch_free(scan, batch);
+
+ /* update the valid range, so that asserts / debugging works */
+ batchState->headBatch++;
+ }
+
+ /* reset relevant batch state fields */
+ Assert(batchState->maxBatches == INDEX_SCAN_MAX_BATCHES);
+ batchState->headBatch = 0; /* initial batch */
+ batchState->nextBatch = 0; /* initial batch is empty */
+
+ batchState->finished = false;
+ batchState->reset = false;
+ batchState->currentPrefetchBlock = InvalidBlockNumber;
+
+ AssertCheckBatches(scan);
+}
+
+static void
+index_batch_kill_item(IndexScanDesc scan)
+{
+ IndexScanBatchPos *readPos = &scan->batchState->readPos;
+ IndexScanBatchData *readBatch = INDEX_SCAN_BATCH(scan, readPos->batch);
+
+ AssertCheckBatchPosValid(scan, readPos);
+
+ /*
+ * XXX Maybe we can move the state that indicates if an item has been
+ * killed into IndexScanBatchData.items[] array.
+ *
+ * See:
+ * https://postgr.es/m/CAH2-WznLN7P0i2-YEnv3QGmeA5AMjdcjkraO_nz3H2Va1V1WOA@mail.gmail.com
+ */
+ if (readBatch->killedItems == NULL)
+ readBatch->killedItems = (int *)
+ palloc(MaxTIDsPerBTreePage * sizeof(int));
+ if (readBatch->numKilled < MaxTIDsPerBTreePage)
+ readBatch->killedItems[readBatch->numKilled++] = readPos->index;
+}
+
+static void
+index_batch_free(IndexScanDesc scan, IndexScanBatch batch)
+{
+ SCAN_CHECKS;
+ CHECK_SCAN_PROCEDURE(amfreebatch);
+
+ AssertCheckBatch(scan, batch);
+
+ /* don't free the batch that is marked */
+ if (batch == scan->batchState->markBatch)
+ return;
+
+ scan->indexRelation->rd_indam->amfreebatch(scan, batch);
+}
+
+/*
+ * index_batch_end
+ * Release all batching state at the end of a scan.
+ */
+static void
+index_batch_end(IndexScanDesc scan)
+{
+ index_batch_reset(scan, true);
+
+ if (scan->batchState)
+ {
+ if (scan->batchState->batches)
+ pfree(scan->batchState->batches);
+
+ if (scan->batchState->batchesCache)
+ {
+ for (int i = 0; i < scan->batchState->batchesCacheSize; i++)
+ {
+ if (scan->batchState->batchesCache[i] == NULL)
+ continue;
+
+ pfree(scan->batchState->batchesCache[i]);
+ }
+
+ pfree(scan->batchState->batchesCache);
+ }
+ pfree(scan->batchState);
+ }
+}
+
+/*
+ * XXX Both index_batch_alloc() calls in btree use MaxTIDsPerBTreePage,
+ * which seems unfortunate - it increases the allocation sizes, even if
+ * the index would be fine with smaller arrays. This means all batches
+ * exceed ALLOC_CHUNK_LIMIT, forcing a separate malloc (expensive). The
+ * cache helps for longer queries, not for queries that only create a
+ * single batch, etc.
+ */
+IndexScanBatch
+index_batch_alloc(IndexScanDesc scan, int maxitems, bool want_itup)
+{
+ IndexScanBatch batch = NULL;
+
+ /*
+ * try to find a batch in the cache
+ *
+ * XXX We can get here with batchState==NULL for bitmapscans. Could that
+ * mean bitmapscans have issues with malloc/free on batches too? But the
+ * cache can't help with that, when it's in batchState.
+ */
+ if ((scan->batchState != NULL) &&
+ (scan->batchState->batchesCache != NULL))
+ {
+ /*
+ * try to find a batch in the cache, with maxitems high enough
+ *
+ * XXX Maybe we should look for the batch with the lowest sufficient
+ * maxitems? That should increase the probability of cache hits in the
+ * future.
+ */
+ for (int i = 0; i < scan->batchState->batchesCacheSize; i++)
+ {
+ if ((scan->batchState->batchesCache[i] != NULL) &&
+ (scan->batchState->batchesCache[i]->maxitems >= maxitems))
+ {
+ batch = scan->batchState->batchesCache[i];
+ scan->batchState->batchesCache[i] = NULL;
+ break;
+ }
+ }
+ }
+
+ /* found a batch in the cache? */
+ if (batch)
+ {
+ /* for IOS, we expect to already have the currTuples */
+ Assert(!(want_itup && (batch->currTuples == NULL)));
+
+ /* XXX maybe we could keep these allocations too */
+ Assert(batch->pos == NULL);
+ Assert(batch->itemsvisibility == NULL);
+ }
+ else
+ {
+ batch = palloc(offsetof(IndexScanBatchData, items) +
+ sizeof(IndexScanBatchPosItem) * maxitems);
+
+ batch->maxitems = maxitems;
+
+ /*
+ * If we are doing an index-only scan, we need a tuple storage
+ * workspace. We allocate BLCKSZ for this, which should always give
+ * the index AM enough space to fit a full page's worth of tuples.
+ */
+ batch->currTuples = NULL;
+ if (want_itup)
+ batch->currTuples = palloc(BLCKSZ);
+ }
+
+ /* shared initialization */
+ batch->firstItem = -1;
+ batch->lastItem = -1;
+ batch->killedItems = NULL;
+ batch->numKilled = 0;
+
+ batch->buf = InvalidBuffer;
+ batch->pos = NULL;
+ batch->itemsvisibility = NULL; /* per-batch IOS visibility */
+
+ return batch;
+}
+
+/*
+ * Unlock batch->buf. If the scan is a dropPin scan, drop the pin, too. Dropping
+ * the pin prevents VACUUM from blocking on acquiring a cleanup lock.
+ */
+void
+index_batch_unlock(Relation rel, bool dropPin, IndexScanBatch batch)
+{
+ if (!dropPin)
+ {
+ if (!RelationUsesLocalBuffers(rel))
+ VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(batch->buf), BLCKSZ);
+
+ /* Just drop the lock (not the pin) */
+ LockBuffer(batch->buf, BUFFER_LOCK_UNLOCK);
+ return;
+ }
+
+ /*
+ * Drop both the lock and the pin.
+ *
+ * Have to set batch->lsn so that amfreebatch has a way to detect when
+ * concurrent heap TID recycling by VACUUM might have taken place. It'll
+ * only be safe to set any index tuple LP_DEAD bits when the page LSN
+ * hasn't advanced.
+ */
+ Assert(RelationNeedsWAL(rel));
+ batch->lsn = BufferGetLSNAtomic(batch->buf);
+ LockBuffer(batch->buf, BUFFER_LOCK_UNLOCK);
+ if (!RelationUsesLocalBuffers(rel))
+ VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(batch->buf), BLCKSZ);
+ ReleaseBuffer(batch->buf);
+ batch->buf = InvalidBuffer; /* defensive */
+}
+
+/*
+ * Add the batch to the cache for later reuse, possibly evicting a smaller
+ * cached batch.
+ */
+void
+index_batch_release(IndexScanDesc scan, IndexScanBatch batch)
+{
+ /*
+ * first free some allocations
+ *
+ * XXX We could keep/reuse some of those.
+ */
+
+ if (batch->killedItems != NULL)
+ {
+ pfree(batch->killedItems);
+ batch->killedItems = NULL;
+ }
+
+ if (batch->itemsvisibility != NULL)
+ {
+ pfree(batch->itemsvisibility);
+ batch->itemsvisibility = NULL;
+ }
+
+ /* XXX a bit unclear what's released by the AM vs. by indexam */
+ Assert(batch->pos == NULL);
+
+ /*
+ * try adding it to the cache - finds a slot that's either empty or has a
+ * lower maxitems value (and replace that batch)
+ *
+ * XXX maybe we should track the number of empty slots, and minimum value
+ * of maxitems, so that we can skip pointless searches?
+ *
+ * XXX ignores cases with batchState=NULL (can we get here with bitmap
+ * scans?)
+ */
+ if (scan->batchState != NULL)
+ {
+ /* lowest maxitems we found in the cache (to replace with batch) */
+ int maxitems = batch->maxitems;
+ int slot = scan->batchState->batchesCacheSize;
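+ /* slot is only used once the loop below finds a smaller cached batch to evict */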
+
+ /* first time through, initialize the cache */
+ if (scan->batchState->batchesCache == NULL)
+ scan->batchState->batchesCache
+ = palloc0_array(IndexScanBatch,
+ scan->batchState->batchesCacheSize);
+
+ for (int i = 0; i < scan->batchState->batchesCacheSize; i++)
+ {
+ /* found empty slot, we're done */
+ if (scan->batchState->batchesCache[i] == NULL)
+ {
+ scan->batchState->batchesCache[i] = batch;
+ return;
+ }
+
+ /* update lowest maxitems? */
+ if (scan->batchState->batchesCache[i]->maxitems < maxitems)
+ {
+ maxitems = scan->batchState->batchesCache[i]->maxitems;
+ slot = i;
+ }
+ }
+
+ /* found a batch to replace? */
+ if (maxitems < batch->maxitems)
+ {
+ pfree(scan->batchState->batchesCache[slot]);
+ scan->batchState->batchesCache[slot] = batch;
+ }
+ }
+}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index fdff960c1302..18c734c4a695 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -158,11 +158,12 @@ bthandler(PG_FUNCTION_ARGS)
amroutine->amadjustmembers = btadjustmembers;
amroutine->ambeginscan = btbeginscan;
amroutine->amrescan = btrescan;
- amroutine->amgettuple = btgettuple;
+ amroutine->amgettuple = NULL;
+ amroutine->amgetbatch = btgetbatch;
+ amroutine->amfreebatch = btfreebatch;
amroutine->amgetbitmap = btgetbitmap;
amroutine->amendscan = btendscan;
- amroutine->ammarkpos = btmarkpos;
- amroutine->amrestrpos = btrestrpos;
+ amroutine->amposreset = btposreset;
amroutine->amestimateparallelscan = btestimateparallelscan;
amroutine->aminitparallelscan = btinitparallelscan;
amroutine->amparallelrescan = btparallelrescan;
@@ -220,13 +221,12 @@ btinsert(Relation rel, Datum *values, bool *isnull,
}
/*
- * btgettuple() -- Get the next tuple in the scan.
+ * btgetbatch() -- Get the next batch of tuples in the scan.
*/
-bool
-btgettuple(IndexScanDesc scan, ScanDirection dir)
+IndexScanBatch
+btgetbatch(IndexScanDesc scan, IndexScanBatch batch, ScanDirection dir)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- bool res;
Assert(scan->heapRelation != NULL);
@@ -241,44 +241,18 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
* the appropriate direction. If we haven't done so yet, we call
* _bt_first() to get the first item in the scan.
*/
- if (!BTScanPosIsValid(so->currPos))
- res = _bt_first(scan, dir);
+ if (batch == NULL)
+ batch = _bt_first(scan, dir);
else
- {
- /*
- * Check to see if we should kill the previously-fetched tuple.
- */
- if (scan->kill_prior_tuple)
- {
- /*
- * Yes, remember it for later. (We'll deal with all such
- * tuples at once right before leaving the index page.) The
- * test for numKilled overrun is not just paranoia: if the
- * caller reverses direction in the indexscan then the same
- * item might get entered multiple times. It's not worth
- * trying to optimize that, so we don't detect it, but instead
- * just forget any excess entries.
- */
- if (so->killedItems == NULL)
- so->killedItems = (int *)
- palloc(MaxTIDsPerBTreePage * sizeof(int));
- if (so->numKilled < MaxTIDsPerBTreePage)
- so->killedItems[so->numKilled++] = so->currPos.itemIndex;
- }
-
- /*
- * Now continue the scan.
- */
- res = _bt_next(scan, dir);
- }
+ batch = _bt_next(scan, dir, batch);
- /* If we have a tuple, return it ... */
- if (res)
+ /* If we have a batch, return it ... */
+ if (batch)
break;
/* ... otherwise see if we need another primitive index scan */
} while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
- return res;
+ return batch;
}
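+
+/*
+ * Expected call sequence (a sketch; the core indexam batch machinery is what
+ * actually drives this, and the exact order may differ):
+ *
+ *		batch = btgetbatch(scan, NULL, dir);	first batch
+ *		next = btgetbatch(scan, batch, dir);	step using batch's position
+ *		btfreebatch(scan, batch);				kill items, drop pin, recycle
+ */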
/*
@@ -288,6 +262,7 @@ int64
btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ IndexScanBatch batch;
int64 ntids = 0;
ItemPointer heapTid;
@@ -296,29 +271,33 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
/* Each loop iteration performs another primitive index scan */
do
{
- /* Fetch the first page & tuple */
- if (_bt_first(scan, ForwardScanDirection))
+ /* Fetch the first batch */
+ if ((batch = _bt_first(scan, ForwardScanDirection)))
{
- /* Save tuple ID, and continue scanning */
- heapTid = &scan->xs_heaptid;
+ int itemIndex = 0;
+
+ /* Save first tuple's TID */
+ heapTid = &batch->items[itemIndex].heapTid;
tbm_add_tuples(tbm, heapTid, 1, false);
ntids++;
for (;;)
{
- /*
- * Advance to next tuple within page. This is the same as the
- * easy case in _bt_next().
- */
- if (++so->currPos.itemIndex > so->currPos.lastItem)
+ /* Advance to next TID within page-sized batch */
+ if (++itemIndex > batch->lastItem)
{
+ /* btfreebatch won't be called for bitmap scans, so drop the pin ourselves */
+ ReleaseBuffer(batch->buf);
+
/* let _bt_next do the heavy lifting */
- if (!_bt_next(scan, ForwardScanDirection))
+ itemIndex = 0;
+ batch = _bt_next(scan, ForwardScanDirection, batch);
+ if (!batch)
break;
}
/* Save tuple ID, and continue scanning */
- heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
+ heapTid = &batch->items[itemIndex].heapTid;
tbm_add_tuples(tbm, heapTid, 1, false);
ntids++;
}
@@ -346,8 +325,6 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
/* allocate private workspace */
so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
- BTScanPosInvalidate(so->currPos);
- BTScanPosInvalidate(so->markPos);
if (scan->numberOfKeys > 0)
so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
else
@@ -361,16 +338,6 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
so->orderProcs = NULL;
so->arrayContext = NULL;
- so->killedItems = NULL; /* until needed */
- so->numKilled = 0;
-
- /*
- * We don't know yet whether the scan will be index-only, so we do not
- * allocate the tuple workspace arrays until btrescan. However, we set up
- * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
- */
- so->currTuples = so->markTuples = NULL;
-
scan->xs_itupdesc = RelationGetDescr(rel);
scan->opaque = so;
@@ -387,80 +354,50 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- /* we aren't holding any read locks, but gotta drop the pins */
- if (BTScanPosIsValid(so->currPos))
- {
- /* Before leaving current page, deal with any killed items */
- if (so->numKilled > 0)
- _bt_killitems(scan);
- BTScanPosUnpinIfPinned(so->currPos);
- BTScanPosInvalidate(so->currPos);
- }
-
/*
- * We prefer to eagerly drop leaf page pins before btgettuple returns.
- * This avoids making VACUUM wait to acquire a cleanup lock on the page.
- *
- * We cannot safely drop leaf page pins during index-only scans due to a
- * race condition involving VACUUM setting pages all-visible in the VM.
- * It's also unsafe for plain index scans that use a non-MVCC snapshot.
- *
- * When we drop pins eagerly, the mechanism that marks so->killedItems[]
- * index tuples LP_DEAD has to deal with concurrent TID recycling races.
- * The scheme used to detect unsafe TID recycling won't work when scanning
- * unlogged relations (since it involves saving an affected page's LSN).
- * Opt out of eager pin dropping during unlogged relation scans for now
- * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting).
- *
- * Also opt out of dropping leaf page pins eagerly during bitmap scans.
- * Pins cannot be held for more than an instant during bitmap scans either
- * way, so we might as well avoid wasting cycles on acquiring page LSNs.
- *
- * See nbtree/README section on making concurrent TID recycling safe.
- *
- * Note: so->dropPin should never change across rescans.
+ * Reset the scan keys
*/
- so->dropPin = (!scan->xs_want_itup &&
- IsMVCCSnapshot(scan->xs_snapshot) &&
- RelationNeedsWAL(scan->indexRelation) &&
- scan->heapRelation != NULL);
-
- so->markItemIndex = -1;
+ if (scankey && scan->numberOfKeys > 0)
+ memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
so->needPrimScan = false;
so->scanBehind = false;
so->oppositeDirCheck = false;
- BTScanPosUnpinIfPinned(so->markPos);
- BTScanPosInvalidate(so->markPos);
+ so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
+ so->numArrayKeys = 0; /* ditto */
+}
+/*
+ * btfreebatch() -- Free batch, releasing its buffer pin (if one is still held)
+ *
+ * XXX Should we really be freeing memory like this? What if we were to just
+ * reuse most memory across distinct pages, avoiding pfree/palloc cycles?
+ */
+void
+btfreebatch(IndexScanDesc scan, IndexScanBatch batch)
+{
/*
- * Allocate tuple workspace arrays, if needed for an index-only scan and
- * not already done in a previous rescan call. To save on palloc
- * overhead, both workspaces are allocated as one palloc block; only this
- * function and btendscan know that.
- *
- * NOTE: this data structure also makes it safe to return data from a
- * "name" column, even though btree name_ops uses an underlying storage
- * datatype of cstring. The risk there is that "name" is supposed to be
- * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
- * However, since we only return data out of tuples sitting in the
- * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
- * data out of the markTuples array --- running off the end of memory for
- * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
- * adding special-case treatment for name_ops elsewhere.
+ * Check if there are tuples to kill from this batch (that weren't already
+ * killed earlier on)
*/
- if (scan->xs_want_itup && so->currTuples == NULL)
+ if (batch->numKilled > 0)
+ _bt_killitems(scan, batch);
+
+ if (batch->pos)
{
- so->currTuples = (char *) palloc(BLCKSZ * 2);
- so->markTuples = so->currTuples + BLCKSZ;
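+ /*
+ * For dropPin scans, index_batch_unlock already dropped the pin when the
+ * batch was first read; otherwise we still hold it and release it here.
+ */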
+ if (!scan->batchState || !scan->batchState->dropPin)
+ ReleaseBuffer(batch->buf);
+
+ pfree(batch->pos);
+
+ /* XXX maybe should be done in index_batch_free? */
+ batch->buf = InvalidBuffer;
+ batch->pos = NULL;
}
- /*
- * Reset the scan keys
- */
- if (scankey && scan->numberOfKeys > 0)
- memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
- so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
- so->numArrayKeys = 0; /* ditto */
+ /* XXX leave itemsvisibility, killedItems and currTuples to index_batch_release */
+
+ /* free the batch (or cache it for reuse) */
+ index_batch_release(scan, batch);
}
/*
@@ -471,116 +408,50 @@ btendscan(IndexScanDesc scan)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- /* we aren't holding any read locks, but gotta drop the pins */
- if (BTScanPosIsValid(so->currPos))
- {
- /* Before leaving current page, deal with any killed items */
- if (so->numKilled > 0)
- _bt_killitems(scan);
- BTScanPosUnpinIfPinned(so->currPos);
- }
-
- so->markItemIndex = -1;
- BTScanPosUnpinIfPinned(so->markPos);
-
- /* No need to invalidate positions, the RAM is about to be freed. */
-
/* Release storage */
if (so->keyData != NULL)
pfree(so->keyData);
/* so->arrayKeys and so->orderProcs are in arrayContext */
if (so->arrayContext != NULL)
MemoryContextDelete(so->arrayContext);
- if (so->killedItems != NULL)
- pfree(so->killedItems);
- if (so->currTuples != NULL)
- pfree(so->currTuples);
- /* so->markTuples should not be pfree'd, see btrescan */
pfree(so);
}
/*
- * btmarkpos() -- save current scan position
+ * btposreset() -- invalidate scan's array keys
*/
void
-btmarkpos(IndexScanDesc scan)
+btposreset(IndexScanDesc scan, IndexScanBatch markbatch)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BTScanPos pos;
- /* There may be an old mark with a pin (but no lock). */
- BTScanPosUnpinIfPinned(so->markPos);
+ if (!so->numArrayKeys)
+ return;
/*
- * Just record the current itemIndex. If we later step to next page
- * before releasing the marked position, _bt_steppage makes a full copy of
- * the currPos struct in markPos. If (as often happens) the mark is moved
- * before we leave the page, we don't have to do that work.
+ * Core system is about to restore a mark associated with a previously
+ * returned batch. Reset the scan's arrays to make all this safe.
*/
- if (BTScanPosIsValid(so->currPos))
- so->markItemIndex = so->currPos.itemIndex;
- else
- {
- BTScanPosInvalidate(so->markPos);
- so->markItemIndex = -1;
- }
-}
-
-/*
- * btrestrpos() -- restore scan to last saved position
- */
-void
-btrestrpos(IndexScanDesc scan)
-{
- BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ pos = (BTScanPos) markbatch->pos;
+ _bt_start_array_keys(scan, pos->dir);
- if (so->markItemIndex >= 0)
- {
- /*
- * The scan has never moved to a new page since the last mark. Just
- * restore the itemIndex.
- *
- * NB: In this case we can't count on anything in so->markPos to be
- * accurate.
- */
- so->currPos.itemIndex = so->markItemIndex;
- }
+ /*
+ * Core system will invalidate all other batches.
+ *
+ * Deal with this by unsetting needPrimScan, and by setting moreRight (or
+ * moreLeft, when scanning backwards) in the marked batch's position. That
+ * way, the next time _bt_next is called it will step to the right (or to
+ * the left). At that point _bt_readpage will restore the scan's arrays to
+ * elements that correctly track the next page's position in the index's
+ * key space.
+ */
+ if (ScanDirectionIsForward(pos->dir))
+ pos->moreRight = true;
else
- {
- /*
- * The scan moved to a new page after last mark or restore, and we are
- * now restoring to the marked page. We aren't holding any read
- * locks, but if we're still holding the pin for the current position,
- * we must drop it.
- */
- if (BTScanPosIsValid(so->currPos))
- {
- /* Before leaving current page, deal with any killed items */
- if (so->numKilled > 0)
- _bt_killitems(scan);
- BTScanPosUnpinIfPinned(so->currPos);
- }
-
- if (BTScanPosIsValid(so->markPos))
- {
- /* bump pin on mark buffer for assignment to current buffer */
- if (BTScanPosIsPinned(so->markPos))
- IncrBufferRefCount(so->markPos.buf);
- memcpy(&so->currPos, &so->markPos,
- offsetof(BTScanPosData, items[1]) +
- so->markPos.lastItem * sizeof(BTScanPosItem));
- if (so->currTuples)
- memcpy(so->currTuples, so->markTuples,
- so->markPos.nextTupleOffset);
- /* Reset the scan's array keys (see _bt_steppage for why) */
- if (so->numArrayKeys)
- {
- _bt_start_array_keys(scan, so->currPos.dir);
- so->needPrimScan = false;
- }
- }
- else
- BTScanPosInvalidate(so->currPos);
- }
+ pos->moreLeft = true;
+ so->needPrimScan = false;
+ so->scanBehind = false;
+ so->oppositeDirCheck = false;
}
/*
@@ -827,15 +698,6 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
*next_scan_page = InvalidBlockNumber;
*last_curr_page = InvalidBlockNumber;
- /*
- * Reset so->currPos, and initialize moreLeft/moreRight such that the next
- * call to _bt_readnextpage treats this backend similarly to a serial
- * backend that steps from *last_curr_page to *next_scan_page (unless this
- * backend's so->currPos is initialized by _bt_readfirstpage before then).
- */
- BTScanPosInvalidate(so->currPos);
- so->currPos.moreLeft = so->currPos.moreRight = true;
-
if (first)
{
/*
@@ -985,8 +847,6 @@ _bt_parallel_done(IndexScanDesc scan)
BTParallelScanDesc btscan;
bool status_changed = false;
- Assert(!BTScanPosIsValid(so->currPos));
-
/* Do nothing, for non-parallel scans */
if (parallel_scan == NULL)
return;
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 0605356ec9f0..b019c19f806d 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -25,62 +25,33 @@
#include "utils/rel.h"
-static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so);
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
Buffer buf, bool forupdate, BTStack stack,
int access);
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
static int _bt_binsrch_posting(BTScanInsert key, Page page,
OffsetNumber offnum);
-static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
- OffsetNumber offnum, bool firstpage);
-static void _bt_saveitem(BTScanOpaque so, int itemIndex,
- OffsetNumber offnum, IndexTuple itup);
-static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
+static bool _bt_readpage(IndexScanDesc scan, IndexScanBatch newbatch,
+ ScanDirection dir, OffsetNumber offnum,
+ bool firstpage);
+static void _bt_saveitem(IndexScanBatch newbatch, int itemIndex,
+ OffsetNumber offnum, IndexTuple itup,
+ int *tupleOffset);
+static int _bt_setuppostingitems(IndexScanBatch newbatch, int itemIndex,
OffsetNumber offnum, const ItemPointerData *heapTid,
- IndexTuple itup);
-static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
+ IndexTuple itup, int *tupleOffset);
+static inline void _bt_savepostingitem(IndexScanBatch newbatch, int itemIndex,
OffsetNumber offnum,
- ItemPointer heapTid, int tupleOffset);
-static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so);
-static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
-static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
- ScanDirection dir);
-static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
- BlockNumber lastcurrblkno, ScanDirection dir,
- bool seized);
+ ItemPointer heapTid, int baseOffset);
+static IndexScanBatch _bt_readfirstpage(IndexScanDesc scan, IndexScanBatch firstbatch,
+ OffsetNumber offnum, ScanDirection dir);
+static IndexScanBatch _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
+ BlockNumber lastcurrblkno,
+ ScanDirection dir, bool firstpage);
static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
BlockNumber lastcurrblkno);
-static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
-
-
-/*
- * _bt_drop_lock_and_maybe_pin()
- *
- * Unlock so->currPos.buf. If scan is so->dropPin, drop the pin, too.
- * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
- */
-static inline void
-_bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
-{
- if (!so->dropPin)
- {
- /* Just drop the lock (not the pin) */
- _bt_unlockbuf(rel, so->currPos.buf);
- return;
- }
-
- /*
- * Drop both the lock and the pin.
- *
- * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
- * when concurrent heap TID recycling by VACUUM might have taken place.
- */
- Assert(RelationNeedsWAL(rel));
- so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
- _bt_relbuf(rel, so->currPos.buf);
- so->currPos.buf = InvalidBuffer;
-}
+static IndexScanBatch _bt_endpoint(IndexScanDesc scan, ScanDirection dir,
+ IndexScanBatch firstbatch);
/*
* _bt_search() -- Search the tree for a particular scankey,
@@ -870,20 +841,16 @@ _bt_compare(Relation rel,
* conditions, and the tree ordering. We find the first item (or,
* if backwards scan, the last item) in the tree that satisfies the
* qualifications in the scan key. On success exit, data about the
- * matching tuple(s) on the page has been loaded into so->currPos. We'll
- * drop all locks and hold onto a pin on page's buffer, except during
- * so->dropPin scans, when we drop both the lock and the pin.
- * _bt_returnitem sets the next item to return to scan on success exit.
+ * matching tuple(s) on the page has been loaded into the returned batch.
*
- * If there are no matching items in the index, we return false, with no
- * pins or locks held. so->currPos will remain invalid.
+ * If there are no matching items in the index, we just return NULL.
*
* Note that scan->keyData[], and the so->keyData[] scankey built from it,
* are both search-type scankeys (see nbtree/README for more about this).
* Within this routine, we build a temporary insertion-type scankey to use
* in locating the scan start position.
*/
-bool
+IndexScanBatch
_bt_first(IndexScanDesc scan, ScanDirection dir)
{
Relation rel = scan->indexRelation;
@@ -897,8 +864,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
StrategyNumber strat_total = InvalidStrategy;
BlockNumber blkno = InvalidBlockNumber,
lastcurrblkno;
-
- Assert(!BTScanPosIsValid(so->currPos));
+ IndexScanBatch firstbatch;
/*
* Examine the scan keys and eliminate any redundant keys; also mark the
@@ -923,7 +889,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
*/
if (scan->parallel_scan != NULL &&
!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
- return false;
+ return NULL; /* definitely done (so->needPrimScan is unset) */
/*
* Initialize the scan's arrays (if any) for the current scan direction
@@ -940,14 +906,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* _bt_readnextpage releases the scan for us (not _bt_readfirstpage).
*/
Assert(scan->parallel_scan != NULL);
- Assert(!so->needPrimScan);
- Assert(blkno != P_NONE);
-
- if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
- return false;
- _bt_returnitem(scan, so);
- return true;
+ return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, true);
}
/*
@@ -1239,6 +1199,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
}
}
+ /* Allocate space for first batch */
+ firstbatch = index_batch_alloc(scan, MaxTIDsPerBTreePage, scan->xs_want_itup);
+ firstbatch->pos = palloc(sizeof(BTScanPosData));
+
/*
* If we found no usable boundary keys, we have to start from one end of
* the tree. Walk down that edge to the first or last key, and scan from
@@ -1247,7 +1211,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* Note: calls _bt_readfirstpage for us, which releases the parallel scan.
*/
if (keysz == 0)
- return _bt_endpoint(scan, dir);
+ return _bt_endpoint(scan, dir, firstbatch);
/*
* We want to start the scan somewhere within the index. Set up an
@@ -1515,12 +1479,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* position ourselves on the target leaf page.
*/
Assert(ScanDirectionIsBackward(dir) == inskey.backward);
- stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
+ stack = _bt_search(rel, NULL, &inskey, &firstbatch->buf, BT_READ);
/* don't need to keep the stack around... */
_bt_freestack(stack);
- if (!BufferIsValid(so->currPos.buf))
+ if (!BufferIsValid(firstbatch->buf))
{
Assert(!so->needPrimScan);
@@ -1536,11 +1500,11 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
if (IsolationIsSerializable())
{
PredicateLockRelation(rel, scan->xs_snapshot);
- stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
+ stack = _bt_search(rel, NULL, &inskey, &firstbatch->buf, BT_READ);
_bt_freestack(stack);
}
- if (!BufferIsValid(so->currPos.buf))
+ if (!BufferIsValid(firstbatch->buf))
{
_bt_parallel_done(scan);
return false;
@@ -1548,11 +1512,11 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
}
/* position to the precise item on the page */
- offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
+ offnum = _bt_binsrch(rel, &inskey, firstbatch->buf);
/*
* Now load data from the first page of the scan (usually the page
- * currently in so->currPos.buf).
+ * currently in firstbatch->buf).
*
* If inskey.nextkey = false and inskey.backward = false, offnum is
* positioned at the first non-pivot tuple >= inskey.scankeys.
@@ -1570,69 +1534,79 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* for the page. For example, when inskey is both < the leaf page's high
* key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
*/
- if (!_bt_readfirstpage(scan, offnum, dir))
- return false;
-
- _bt_returnitem(scan, so);
- return true;
+ return _bt_readfirstpage(scan, firstbatch, offnum, dir);
}
/*
* _bt_next() -- Get the next item in a scan.
*
- * On entry, so->currPos describes the current page, which may be pinned
- * but is not locked, and so->currPos.itemIndex identifies which item was
- * previously returned.
+ * On entry, priorbatch describes the batch that was last returned by
+ * btgetbatch. We'll use the prior batch's positioning information to
+ * decide which page to read next.
*
- * On success exit, so->currPos is updated as needed, and _bt_returnitem
- * sets the next item to return to the scan. so->currPos remains valid.
+ * On success exit, returns the next batch. There must be at least one
+ * matching tuple on any returned batch (else we'd just return NULL).
*
- * On failure exit (no more tuples), we invalidate so->currPos. It'll
- * still be possible for the scan to return tuples by changing direction,
- * though we'll need to call _bt_first anew in that other direction.
+ * On failure exit (no more tuples), we return NULL. It'll still be
+ * possible for the scan to return tuples by changing direction, though
+ * we'll need to call _bt_first anew in that other direction.
*/
-bool
-_bt_next(IndexScanDesc scan, ScanDirection dir)
+IndexScanBatch
+_bt_next(IndexScanDesc scan, ScanDirection dir, IndexScanBatch priorbatch)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BlockNumber blkno,
+ lastcurrblkno;
+ BTScanPos priorpos = (BTScanPos) priorbatch->pos;
- Assert(BTScanPosIsValid(so->currPos));
+ Assert(BTScanPosIsValid(*priorpos));
- /*
- * Advance to next tuple on current page; or if there's no more, try to
- * step to the next page with data.
- */
+ /* Walk to the next page with data */
if (ScanDirectionIsForward(dir))
- {
- if (++so->currPos.itemIndex > so->currPos.lastItem)
- {
- if (!_bt_steppage(scan, dir))
- return false;
- }
- }
+ blkno = priorpos->nextPage;
else
+ blkno = priorpos->prevPage;
+ lastcurrblkno = priorpos->currPage;
+
+ /*
+ * Cancel primitive index scans that were scheduled when the call to
+ * _bt_readpage for pos happened to use the opposite direction to the one
+ * that we're stepping in now. (It's okay to leave the scan's array keys
+ * as-is, since the next _bt_readpage will advance them.)
+ */
+ if (priorpos->dir != dir)
+ so->needPrimScan = false;
+
+ if (blkno == P_NONE ||
+ (ScanDirectionIsForward(dir) ?
+ !priorpos->moreRight : !priorpos->moreLeft))
{
- if (--so->currPos.itemIndex < so->currPos.firstItem)
- {
- if (!_bt_steppage(scan, dir))
- return false;
- }
+ /*
+ * The _bt_readpage call for priorpos ended the scan in this direction
+ * (though if so->needPrimScan was set the scan will continue in
+ * _bt_first)
+ */
+ _bt_parallel_done(scan);
+ return NULL;
}
- _bt_returnitem(scan, so);
- return true;
+ /* parallel scan must seize the scan to get next blkno */
+ if (scan->parallel_scan != NULL &&
+ !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
+ return NULL; /* done iff so->needPrimScan wasn't set */
+
+ return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
}
/*
- * _bt_readpage() -- Load data from current index page into so->currPos
+ * _bt_readpage() -- Load data from current index page into newbatch.
*
- * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
- * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
- * they are updated as appropriate. All other fields of so->currPos are
- * initialized from scratch here.
+ * Caller must have pinned and read-locked newbatch->buf; the buffer's state is
+ * not changed here. Also, pos->moreLeft and moreRight must be valid; they are
+ * updated as appropriate. All other fields of newbatch are initialized from
+ * scratch here.
*
* We scan the current page starting at offnum and moving in the indicated
- * direction. All items matching the scan keys are loaded into currPos.items.
+ * direction. All items matching the scan keys are saved in newbatch->items[].
* moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
* that there can be no more matching tuples in the current scan direction
* (could just be for the current primitive index scan when scan has arrays).
@@ -1644,8 +1618,8 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
* Returns true if any matching items found on the page, false if none.
*/
static bool
-_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
- bool firstpage)
+_bt_readpage(IndexScanDesc scan, IndexScanBatch newbatch, ScanDirection dir,
+ OffsetNumber offnum, bool firstpage)
{
Relation rel = scan->indexRelation;
BTScanOpaque so = (BTScanOpaque) scan->opaque;
@@ -1656,37 +1630,35 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
BTReadPageState pstate;
bool arrayKeys;
int itemIndex,
+ tupleOffset = 0,
indnatts;
+ BTScanPos pos = newbatch->pos;
/* save the page/buffer block number, along with its sibling links */
- page = BufferGetPage(so->currPos.buf);
+ page = BufferGetPage(newbatch->buf);
opaque = BTPageGetOpaque(page);
- so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
- so->currPos.prevPage = opaque->btpo_prev;
- so->currPos.nextPage = opaque->btpo_next;
- /* delay setting so->currPos.lsn until _bt_drop_lock_and_maybe_pin */
- so->currPos.dir = dir;
- so->currPos.nextTupleOffset = 0;
+ pos->currPage = BufferGetBlockNumber(newbatch->buf);
+ pos->prevPage = opaque->btpo_prev;
+ pos->nextPage = opaque->btpo_next;
+ pos->dir = dir;
+
+ so->pos = pos; /* _bt_checkkeys needs this */
/* either moreRight or moreLeft should be set now (may be unset later) */
- Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
- so->currPos.moreLeft);
+ Assert(ScanDirectionIsForward(dir) ? pos->moreRight : pos->moreLeft);
Assert(!P_IGNORE(opaque));
- Assert(BTScanPosIsPinned(so->currPos));
Assert(!so->needPrimScan);
if (scan->parallel_scan)
{
/* allow next/prev page to be read by other worker without delay */
if (ScanDirectionIsForward(dir))
- _bt_parallel_release(scan, so->currPos.nextPage,
- so->currPos.currPage);
+ _bt_parallel_release(scan, pos->nextPage, pos->currPage);
else
- _bt_parallel_release(scan, so->currPos.prevPage,
- so->currPos.currPage);
+ _bt_parallel_release(scan, pos->prevPage, pos->currPage);
}
- PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);
+ PredicateLockPage(rel, pos->currPage, scan->xs_snapshot);
/* initialize local variables */
indnatts = IndexRelationGetNumberOfAttributes(rel);
@@ -1724,11 +1696,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
!_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
{
/* Schedule another primitive index scan after all */
- so->currPos.moreRight = false;
+ pos->moreRight = false;
so->needPrimScan = true;
if (scan->parallel_scan)
- _bt_parallel_primscan_schedule(scan,
- so->currPos.currPage);
+ _bt_parallel_primscan_schedule(scan, pos->currPage);
return false;
}
}
@@ -1792,28 +1763,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
if (!BTreeTupleIsPosting(itup))
{
/* Remember it */
- _bt_saveitem(so, itemIndex, offnum, itup);
+ _bt_saveitem(newbatch, itemIndex, offnum, itup, &tupleOffset);
itemIndex++;
}
else
{
- int tupleOffset;
+ int baseOffset;
/*
* Set up state to return posting list, and remember first
* TID
*/
- tupleOffset =
- _bt_setuppostingitems(so, itemIndex, offnum,
+ baseOffset =
+ _bt_setuppostingitems(newbatch, itemIndex, offnum,
BTreeTupleGetPostingN(itup, 0),
- itup);
+ itup, &tupleOffset);
itemIndex++;
/* Remember additional TIDs */
for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
{
- _bt_savepostingitem(so, itemIndex, offnum,
+ _bt_savepostingitem(newbatch, itemIndex, offnum,
BTreeTupleGetPostingN(itup, i),
- tupleOffset);
+ baseOffset);
itemIndex++;
}
}
@@ -1853,12 +1824,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
}
if (!pstate.continuescan)
- so->currPos.moreRight = false;
+ pos->moreRight = false;
Assert(itemIndex <= MaxTIDsPerBTreePage);
- so->currPos.firstItem = 0;
- so->currPos.lastItem = itemIndex - 1;
- so->currPos.itemIndex = 0;
+ newbatch->firstItem = 0;
+ newbatch->lastItem = itemIndex - 1;
}
else
{
@@ -1875,11 +1845,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
!_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
{
/* Schedule another primitive index scan after all */
- so->currPos.moreLeft = false;
+ pos->moreLeft = false;
so->needPrimScan = true;
if (scan->parallel_scan)
- _bt_parallel_primscan_schedule(scan,
- so->currPos.currPage);
+ _bt_parallel_primscan_schedule(scan, pos->currPage);
return false;
}
}
@@ -1980,11 +1949,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
{
/* Remember it */
itemIndex--;
- _bt_saveitem(so, itemIndex, offnum, itup);
+ _bt_saveitem(newbatch, itemIndex, offnum, itup, &tupleOffset);
}
else
{
- int tupleOffset;
+ int baseOffset;
/*
* Set up state to return posting list, and remember first
@@ -1997,17 +1966,17 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
* associated with the same posting list tuple.
*/
itemIndex--;
- tupleOffset =
- _bt_setuppostingitems(so, itemIndex, offnum,
+ baseOffset =
+ _bt_setuppostingitems(newbatch, itemIndex, offnum,
BTreeTupleGetPostingN(itup, 0),
- itup);
+ itup, &tupleOffset);
/* Remember additional TIDs */
for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
{
itemIndex--;
- _bt_savepostingitem(so, itemIndex, offnum,
+ _bt_savepostingitem(newbatch, itemIndex, offnum,
BTreeTupleGetPostingN(itup, i),
- tupleOffset);
+ baseOffset);
}
}
}
@@ -2023,12 +1992,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
* be found there
*/
if (!pstate.continuescan)
- so->currPos.moreLeft = false;
+ pos->moreLeft = false;
Assert(itemIndex >= 0);
- so->currPos.firstItem = itemIndex;
- so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
- so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
+ newbatch->firstItem = itemIndex;
+ newbatch->lastItem = MaxTIDsPerBTreePage - 1;
}
/*
@@ -2045,202 +2013,96 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
*/
Assert(!pstate.forcenonrequired);
- return (so->currPos.firstItem <= so->currPos.lastItem);
+ return (newbatch->firstItem <= newbatch->lastItem);
}
-/* Save an index item into so->currPos.items[itemIndex] */
+/* Save an index item into newbatch->items[itemIndex] */
static void
-_bt_saveitem(BTScanOpaque so, int itemIndex,
- OffsetNumber offnum, IndexTuple itup)
+_bt_saveitem(IndexScanBatch newbatch, int itemIndex, OffsetNumber offnum,
+ IndexTuple itup, int *tupleOffset)
{
- BTScanPosItem *currItem = &so->currPos.items[itemIndex];
-
Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
- currItem->heapTid = itup->t_tid;
- currItem->indexOffset = offnum;
- if (so->currTuples)
+ /* remember the heap TID and its originating index offset */
+ newbatch->items[itemIndex].heapTid = itup->t_tid;
+ newbatch->items[itemIndex].indexOffset = offnum;
+
+ if (newbatch->currTuples)
{
Size itupsz = IndexTupleSize(itup);
- currItem->tupleOffset = so->currPos.nextTupleOffset;
- memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
- so->currPos.nextTupleOffset += MAXALIGN(itupsz);
+ newbatch->items[itemIndex].tupleOffset = *tupleOffset;
+ memcpy(newbatch->currTuples + *tupleOffset, itup, itupsz);
+ *tupleOffset += MAXALIGN(itupsz);
}
}
/*
* Setup state to save TIDs/items from a single posting list tuple.
*
- * Saves an index item into so->currPos.items[itemIndex] for TID that is
- * returned to scan first. Second or subsequent TIDs for posting list should
- * be saved by calling _bt_savepostingitem().
+ * Saves an index item into newbatch->items[itemIndex] for the TID that is
+ * returned to the scan first. Second and subsequent TIDs from the posting
+ * list should be saved by calling _bt_savepostingitem().
*
- * Returns an offset into tuple storage space that main tuple is stored at if
- * needed.
+ * Returns baseOffset, an offset into tuple storage space that main tuple is
+ * stored at if needed.
*/
static int
-_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
- const ItemPointerData *heapTid, IndexTuple itup)
+_bt_setuppostingitems(IndexScanBatch newbatch, int itemIndex,
+ OffsetNumber offnum, const ItemPointerData *heapTid,
+ IndexTuple itup, int *tupleOffset)
{
- BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+ IndexScanBatchPosItem *item = &newbatch->items[itemIndex];
Assert(BTreeTupleIsPosting(itup));
- currItem->heapTid = *heapTid;
- currItem->indexOffset = offnum;
- if (so->currTuples)
+ /* remember the heap TID and its originating index offset */
+ item->heapTid = *heapTid;
+ item->indexOffset = offnum;
+
+ if (newbatch->currTuples)
{
/* Save base IndexTuple (truncate posting list) */
IndexTuple base;
Size itupsz = BTreeTupleGetPostingOffset(itup);
itupsz = MAXALIGN(itupsz);
- currItem->tupleOffset = so->currPos.nextTupleOffset;
- base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
+ item->tupleOffset = *tupleOffset;
+ base = (IndexTuple) (newbatch->currTuples + *tupleOffset);
memcpy(base, itup, itupsz);
/* Defensively reduce work area index tuple header size */
base->t_info &= ~INDEX_SIZE_MASK;
base->t_info |= itupsz;
- so->currPos.nextTupleOffset += itupsz;
+ *tupleOffset += itupsz;
- return currItem->tupleOffset;
+ return item->tupleOffset;
}
return 0;
}
/*
- * Save an index item into so->currPos.items[itemIndex] for current posting
+ * Save an index item into newbatch->items[itemIndex] for the current posting
* tuple.
*
* Assumes that _bt_setuppostingitems() has already been called for current
- * posting list tuple. Caller passes its return value as tupleOffset.
+ * posting list tuple. Caller passes its return value as baseOffset.
*/
static inline void
-_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
- ItemPointer heapTid, int tupleOffset)
+_bt_savepostingitem(IndexScanBatch newbatch, int itemIndex, OffsetNumber offnum,
+ ItemPointer heapTid, int baseOffset)
{
- BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+ IndexScanBatchPosItem *item = &newbatch->items[itemIndex];
- currItem->heapTid = *heapTid;
- currItem->indexOffset = offnum;
+ item->heapTid = *heapTid;
+ item->indexOffset = offnum;
/*
* Have index-only scans return the same base IndexTuple for every TID
* that originates from the same posting list
*/
- if (so->currTuples)
- currItem->tupleOffset = tupleOffset;
-}
-
-/*
- * Return the index item from so->currPos.items[so->currPos.itemIndex] to the
- * index scan by setting the relevant fields in caller's index scan descriptor
- */
-static inline void
-_bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
-{
- BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];
-
- /* Most recent _bt_readpage must have succeeded */
- Assert(BTScanPosIsValid(so->currPos));
- Assert(so->currPos.itemIndex >= so->currPos.firstItem);
- Assert(so->currPos.itemIndex <= so->currPos.lastItem);
-
- /* Return next item, per amgettuple contract */
- scan->xs_heaptid = currItem->heapTid;
- if (so->currTuples)
- scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
-}
-
-/*
- * _bt_steppage() -- Step to next page containing valid data for scan
- *
- * Wrapper on _bt_readnextpage that performs final steps for the current page.
- *
- * On entry, so->currPos must be valid. Its buffer will be pinned, though
- * never locked. (Actually, when so->dropPin there won't even be a pin held,
- * though so->currPos.currPage must still be set to a valid block number.)
- */
-static bool
-_bt_steppage(IndexScanDesc scan, ScanDirection dir)
-{
- BTScanOpaque so = (BTScanOpaque) scan->opaque;
- BlockNumber blkno,
- lastcurrblkno;
-
- Assert(BTScanPosIsValid(so->currPos));
-
- /* Before leaving current page, deal with any killed items */
- if (so->numKilled > 0)
- _bt_killitems(scan);
-
- /*
- * Before we modify currPos, make a copy of the page data if there was a
- * mark position that needs it.
- */
- if (so->markItemIndex >= 0)
- {
- /* bump pin on current buffer for assignment to mark buffer */
- if (BTScanPosIsPinned(so->currPos))
- IncrBufferRefCount(so->currPos.buf);
- memcpy(&so->markPos, &so->currPos,
- offsetof(BTScanPosData, items[1]) +
- so->currPos.lastItem * sizeof(BTScanPosItem));
- if (so->markTuples)
- memcpy(so->markTuples, so->currTuples,
- so->currPos.nextTupleOffset);
- so->markPos.itemIndex = so->markItemIndex;
- so->markItemIndex = -1;
-
- /*
- * If we're just about to start the next primitive index scan
- * (possible with a scan that has arrays keys, and needs to skip to
- * continue in the current scan direction), moreLeft/moreRight only
- * indicate the end of the current primitive index scan. They must
- * never be taken to indicate that the top-level index scan has ended
- * (that would be wrong).
- *
- * We could handle this case by treating the current array keys as
- * markPos state. But depending on the current array state like this
- * would add complexity. Instead, we just unset markPos's copy of
- * moreRight or moreLeft (whichever might be affected), while making
- * btrestrpos reset the scan's arrays to their initial scan positions.
- * In effect, btrestrpos leaves advancing the arrays up to the first
- * _bt_readpage call (that takes place after it has restored markPos).
- */
- if (so->needPrimScan)
- {
- if (ScanDirectionIsForward(so->currPos.dir))
- so->markPos.moreRight = true;
- else
- so->markPos.moreLeft = true;
- }
-
- /* mark/restore not supported by parallel scans */
- Assert(!scan->parallel_scan);
- }
-
- BTScanPosUnpinIfPinned(so->currPos);
-
- /* Walk to the next page with data */
- if (ScanDirectionIsForward(dir))
- blkno = so->currPos.nextPage;
- else
- blkno = so->currPos.prevPage;
- lastcurrblkno = so->currPos.currPage;
-
- /*
- * Cancel primitive index scans that were scheduled when the call to
- * _bt_readpage for currPos happened to use the opposite direction to the
- * one that we're stepping in now. (It's okay to leave the scan's array
- * keys as-is, since the next _bt_readpage will advance them.)
- */
- if (so->currPos.dir != dir)
- so->needPrimScan = false;
-
- return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
+ if (newbatch->currTuples)
+ item->tupleOffset = baseOffset;
}
/*
@@ -2252,73 +2114,96 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
* to stop the scan on this page by calling _bt_checkkeys against the high
* key. See _bt_readpage for full details.
*
- * On entry, so->currPos must be pinned and locked (so offnum stays valid).
+ * On entry, firstbatch->buf must be pinned and locked (so offnum stays valid).
* Parallel scan callers must have seized the scan before calling here.
*
- * On exit, we'll have updated so->currPos and retained locks and pins
+ * On exit, we'll have updated firstbatch and retained locks and pins
* according to the same rules as those laid out for _bt_readnextpage exit.
- * Like _bt_readnextpage, our return value indicates if there are any matching
- * records in the given direction.
*
* We always release the scan for a parallel scan caller, regardless of
* success or failure; we'll call _bt_parallel_release as soon as possible.
*/
-static bool
-_bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
+static IndexScanBatch
+_bt_readfirstpage(IndexScanDesc scan, IndexScanBatch firstbatch,
+ OffsetNumber offnum, ScanDirection dir)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Relation rel = scan->indexRelation;
+ BlockNumber blkno,
+ lastcurrblkno;
+ BTScanPos firstpos = firstbatch->pos;
- so->numKilled = 0; /* just paranoia */
- so->markItemIndex = -1; /* ditto */
-
- /* Initialize so->currPos for the first page (page in so->currPos.buf) */
+ /* Initialize firstbatch's position for the first page */
if (so->needPrimScan)
{
Assert(so->numArrayKeys);
- so->currPos.moreLeft = true;
- so->currPos.moreRight = true;
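+ /*
+ * Starting another primitive index scan (for array keys) can land us
+ * anywhere in the key space, so don't rule out further matches in
+ * either direction just yet.
+ */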
+ firstpos->moreLeft = true;
+ firstpos->moreRight = true;
so->needPrimScan = false;
}
else if (ScanDirectionIsForward(dir))
{
- so->currPos.moreLeft = false;
- so->currPos.moreRight = true;
+ firstpos->moreLeft = false;
+ firstpos->moreRight = true;
}
else
{
- so->currPos.moreLeft = true;
- so->currPos.moreRight = false;
+ firstpos->moreLeft = true;
+ firstpos->moreRight = false;
}
/*
* Attempt to load matching tuples from the first page.
*
- * Note that _bt_readpage will finish initializing the so->currPos fields.
+ * Note that _bt_readpage will finish initializing the firstbatch fields.
* _bt_readpage also releases parallel scan (even when it returns false).
*/
- if (_bt_readpage(scan, dir, offnum, true))
+ if (_bt_readpage(scan, firstbatch, dir, offnum, true))
{
- Relation rel = scan->indexRelation;
+ /* _bt_readpage succeeded */
+ index_batch_unlock(rel, scan->batchState && scan->batchState->dropPin,
+ firstbatch);
+ return firstbatch;
+ }
+
+ /* There's no actually-matching data on the page in firstbatch->buf */
+ _bt_relbuf(rel, firstbatch->buf);
+ firstbatch->buf = InvalidBuffer;
+ /* Walk to the next page with data */
+ if (ScanDirectionIsForward(dir))
+ blkno = firstpos->nextPage;
+ else
+ blkno = firstpos->prevPage;
+ lastcurrblkno = firstpos->currPage;
+
+ Assert(firstpos->dir == dir);
+
+ /* firstbatch will never be returned to the scan, so free it ourselves */
+ pfree(firstbatch);
+
+ if (blkno == P_NONE ||
+ (ScanDirectionIsForward(dir) ?
+ !firstpos->moreRight : !firstpos->moreLeft))
+ {
/*
- * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
- * so->currPos.buf in preparation for btgettuple returning tuples.
+ * The _bt_readpage call for firstbatch ended the scan in this
+ * direction (though if so->needPrimScan was set the scan will
+ * continue in _bt_first)
*/
- Assert(BTScanPosIsPinned(so->currPos));
- _bt_drop_lock_and_maybe_pin(rel, so);
- return true;
+ pfree(firstpos);
+ _bt_parallel_done(scan);
+ return NULL;
}
- /* There's no actually-matching data on the page in so->currPos.buf */
- _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+ pfree(firstpos);
- /* Call _bt_readnextpage using its _bt_steppage wrapper function */
- if (!_bt_steppage(scan, dir))
- return false;
+ /* parallel scan must seize the scan to get next blkno */
+ if (scan->parallel_scan != NULL &&
+ !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
+ return NULL; /* done iff so->needPrimScan wasn't set */
- /* _bt_readpage for a later page (now in so->currPos) succeeded */
- return true;
+ return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
}
/*
@@ -2328,102 +2213,70 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
* previously-saved right link or left link. lastcurrblkno is the page that
* was current at the point where the blkno link was saved, which we use to
* reason about concurrent page splits/page deletions during backwards scans.
- * In the common case where seized=false, blkno is either so->currPos.nextPage
- * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage.
+ * blkno is the prior scan position's nextPage or prevPage (depending on scan
+ * direction), and lastcurrblkno is the prior position's currPage.
*
- * On entry, so->currPos shouldn't be locked by caller. so->currPos.buf must
- * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno
- * won't need to be read again in almost all cases). Parallel scan callers
- * that seized the scan before calling here should pass seized=true; such a
- * caller's blkno and lastcurrblkno arguments come from the seized scan.
- * seized=false callers just pass us the blkno/lastcurrblkno taken from their
- * so->currPos, which (along with so->currPos itself) can be used to end the
- * scan. A seized=false caller's blkno can never be assumed to be the page
- * that must be read next during a parallel scan, though. We must figure that
- * part out for ourselves by seizing the scan (the correct page to read might
- * already be beyond the seized=false caller's blkno during a parallel scan,
- * unless blkno/so->currPos.nextPage/so->currPos.prevPage is already P_NONE,
- * or unless so->currPos.moreRight/so->currPos.moreLeft is already unset).
+ * On entry, no page should be locked by caller.
*
- * On success exit, so->currPos is updated to contain data from the next
- * interesting page, and we return true. We hold a pin on the buffer on
- * success exit (except during so->dropPin index scans, when we drop the pin
- * eagerly to avoid blocking VACUUM).
+ * On success exit, returns a batch containing data from the next
+ * interesting page. We hold a pin on the buffer on success exit (except
+ * during dropPin plain index scans, when we drop the pin eagerly to avoid
+ * blocking VACUUM). If there are no more matching records in the given
+ * direction, we just return NULL.
*
- * If there are no more matching records in the given direction, we invalidate
- * so->currPos (while ensuring it retains no locks or pins), and return false.
- *
- * We always release the scan for a parallel scan caller, regardless of
- * success or failure; we'll call _bt_parallel_release as soon as possible.
+ * Parallel scan callers must seize the scan before calling here. blkno and
+ * lastcurrblkno should come from the seized scan. We'll release the scan as
+ * soon as possible.
*/
-static bool
+static IndexScanBatch
_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
- BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
+ BlockNumber lastcurrblkno, ScanDirection dir, bool firstpage)
{
Relation rel = scan->indexRelation;
- BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ IndexScanBatch newbatch;
+ BTScanPos newpos;
- Assert(so->currPos.currPage == lastcurrblkno || seized);
- Assert(!(blkno == P_NONE && seized));
- Assert(!BTScanPosIsPinned(so->currPos));
+ /* Allocate space for next batch */
+ newbatch = index_batch_alloc(scan, MaxTIDsPerBTreePage, scan->xs_want_itup);
+ newbatch->pos = palloc(sizeof(BTScanPosData));
+ newpos = newbatch->pos;
/*
- * Remember that the scan already read lastcurrblkno, a page to the left
- * of blkno (or remember reading a page to the right, for backwards scans)
+ * blkno is the first page to the right (or to the left) of lastcurrblkno
+ * that we'll attempt to read. Also provisionally assume that there'll be
+ * another page we'll need to the right (or to the left) ahead of the
+ * _bt_readpage call.
*/
- if (ScanDirectionIsForward(dir))
- so->currPos.moreLeft = true;
- else
- so->currPos.moreRight = true;
+ newpos->moreLeft = true;
+ newpos->moreRight = true;
for (;;)
{
Page page;
BTPageOpaque opaque;
- if (blkno == P_NONE ||
- (ScanDirectionIsForward(dir) ?
- !so->currPos.moreRight : !so->currPos.moreLeft))
- {
- /* most recent _bt_readpage call (for lastcurrblkno) ended scan */
- Assert(so->currPos.currPage == lastcurrblkno && !seized);
- BTScanPosInvalidate(so->currPos);
- _bt_parallel_done(scan); /* iff !so->needPrimScan */
- return false;
- }
-
- Assert(!so->needPrimScan);
-
- /* parallel scan must never actually visit so->currPos blkno */
- if (!seized && scan->parallel_scan != NULL &&
- !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
- {
- /* whole scan is now done (or another primitive scan required) */
- BTScanPosInvalidate(so->currPos);
- return false;
- }
+ Assert(!((BTScanOpaque) scan->opaque)->needPrimScan);
+ Assert(blkno != P_NONE && lastcurrblkno != P_NONE);
if (ScanDirectionIsForward(dir))
{
/* read blkno, but check for interrupts first */
CHECK_FOR_INTERRUPTS();
- so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
+ newbatch->buf = _bt_getbuf(rel, blkno, BT_READ);
}
else
{
/* read blkno, avoiding race (also checks for interrupts) */
- so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
- lastcurrblkno);
- if (so->currPos.buf == InvalidBuffer)
+ newbatch->buf = _bt_lock_and_validate_left(rel, &blkno,
+ lastcurrblkno);
+ if (newbatch->buf == InvalidBuffer)
{
/* must have been a concurrent deletion of leftmost page */
- BTScanPosInvalidate(so->currPos);
_bt_parallel_done(scan);
- return false;
+ return NULL;
}
}
- page = BufferGetPage(so->currPos.buf);
+ page = BufferGetPage(newbatch->buf);
opaque = BTPageGetOpaque(page);
lastcurrblkno = blkno;
if (likely(!P_IGNORE(opaque)))
@@ -2431,17 +2284,17 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
/* see if there are any matches on this page */
if (ScanDirectionIsForward(dir))
{
- /* note that this will clear moreRight if we can stop */
- if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized))
+ if (_bt_readpage(scan, newbatch, dir,
+ P_FIRSTDATAKEY(opaque), firstpage))
break;
- blkno = so->currPos.nextPage;
+ blkno = newpos->nextPage;
}
else
{
- /* note that this will clear moreLeft if we can stop */
- if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized))
+ if (_bt_readpage(scan, newbatch, dir,
+ PageGetMaxOffsetNumber(page), firstpage))
break;
- blkno = so->currPos.prevPage;
+ blkno = newpos->prevPage;
}
}
else
@@ -2456,19 +2309,36 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
}
/* no matching tuples on this page */
- _bt_relbuf(rel, so->currPos.buf);
- seized = false; /* released by _bt_readpage (or by us) */
+ _bt_relbuf(rel, newbatch->buf);
+ newbatch->buf = InvalidBuffer;
+
+ /* Continue the scan in this direction? */
+ if (blkno == P_NONE ||
+ (ScanDirectionIsForward(dir) ?
+ !newpos->moreRight : !newpos->moreLeft))
+ {
+ /*
+ * The _bt_readpage call for blkno ended the scan in this direction
+ * (though if so->needPrimScan was set the scan will continue in
+ * _bt_first)
+ */
+ _bt_parallel_done(scan);
+ return NULL;
+ }
+
+ /* parallel scan must seize the scan to get next blkno */
+ if (scan->parallel_scan != NULL &&
+ !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
+ return NULL; /* done iff so->needPrimScan wasn't set */
+
+ firstpage = false; /* next page cannot be first */
}
- /*
- * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
- * so->currPos.buf in preparation for btgettuple returning tuples.
- */
- Assert(so->currPos.currPage == blkno);
- Assert(BTScanPosIsPinned(so->currPos));
- _bt_drop_lock_and_maybe_pin(rel, so);
+ /* _bt_readpage succeeded */
+ Assert(newpos->currPage == blkno);
+ index_batch_unlock(rel, scan->batchState && scan->batchState->dropPin,
+ newbatch);
- return true;
+ return newbatch;
}
/*
@@ -2691,25 +2561,23 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
* Parallel scan callers must have seized the scan before calling here.
* Exit conditions are the same as for _bt_first().
*/
-static bool
-_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
+static IndexScanBatch
+_bt_endpoint(IndexScanDesc scan, ScanDirection dir, IndexScanBatch firstbatch)
{
Relation rel = scan->indexRelation;
- BTScanOpaque so = (BTScanOpaque) scan->opaque;
Page page;
BTPageOpaque opaque;
OffsetNumber start;
- Assert(!BTScanPosIsValid(so->currPos));
- Assert(!so->needPrimScan);
+ Assert(!((BTScanOpaque) scan->opaque)->needPrimScan);
/*
* Scan down to the leftmost or rightmost leaf page. This is a simplified
* version of _bt_search().
*/
- so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
+ firstbatch->buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
- if (!BufferIsValid(so->currPos.buf))
+ if (!BufferIsValid(firstbatch->buf))
{
/*
* Empty index. Lock the whole relation, as nothing finer to lock
@@ -2720,7 +2588,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
-		return false;
+		return NULL;
}
- page = BufferGetPage(so->currPos.buf);
+ page = BufferGetPage(firstbatch->buf);
opaque = BTPageGetOpaque(page);
Assert(P_ISLEAF(opaque));
@@ -2746,9 +2614,5 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
/*
* Now load data from the first page of the scan.
*/
- if (!_bt_readfirstpage(scan, start, dir))
- return false;
-
- _bt_returnitem(scan, so);
- return true;
+ return _bt_readfirstpage(scan, firstbatch, start, dir);
}
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index ab0f98b0287c..9872de87a2d9 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1022,14 +1022,6 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir,
* Restore the array keys to the state they were in immediately before we
* were called. This ensures that the arrays only ever ratchet in the
* current scan direction.
- *
- * Without this, scans could overlook matching tuples when the scan
- * direction gets reversed just before btgettuple runs out of items to
- * return, but just after _bt_readpage prepares all the items from the
- * scan's final page in so->currPos. When we're on the final page it is
- * typical for so->currPos to get invalidated once btgettuple finally
- * returns false, which'll effectively invalidate the scan's array keys.
- * That hasn't happened yet, though -- and in general it may never happen.
*/
_bt_start_array_keys(scan, -dir);
@@ -1396,7 +1388,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
Relation rel = scan->indexRelation;
- ScanDirection dir = so->currPos.dir;
+ ScanDirection dir = so->pos->dir;
int arrayidx = 0;
bool beyond_end_advance = false,
skip_array_advanced = false,
@@ -2033,13 +2025,13 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
* Note: We make a soft assumption that the current scan direction will
* also be used within _bt_next, when it is asked to step off this page.
* It is up to _bt_next to cancel this scheduled primitive index scan
- * whenever it steps to a page in the direction opposite currPos.dir.
+ * whenever it steps to a page in the direction opposite pos->dir.
*/
pstate->continuescan = false; /* Tell _bt_readpage we're done... */
so->needPrimScan = true; /* ...but call _bt_first again */
if (scan->parallel_scan)
- _bt_parallel_primscan_schedule(scan, so->currPos.currPage);
+ _bt_parallel_primscan_schedule(scan, so->pos->currPage);
/* Caller's tuple doesn't match the new qual */
return false;
@@ -2152,7 +2144,7 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
{
TupleDesc tupdesc = RelationGetDescr(scan->indexRelation);
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- ScanDirection dir = so->currPos.dir;
+ ScanDirection dir = so->pos->dir;
int ikey = pstate->startikey;
bool res;
@@ -3302,7 +3294,7 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
int tupnatts, TupleDesc tupdesc)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- ScanDirection dir = so->currPos.dir;
+ ScanDirection dir = so->pos->dir;
OffsetNumber aheadoffnum;
IndexTuple ahead;
@@ -3376,69 +3368,67 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
* current page and killed tuples thereon (generally, this should only be
* called if so->numKilled > 0).
*
- * Caller should not have a lock on the so->currPos page, but must hold a
- * buffer pin when !so->dropPin. When we return, it still won't be locked.
- * It'll continue to hold whatever pins were held before calling here.
+ * Caller should not have a lock on the batch position's page, but must hold a
+ * buffer pin when !dropPin. When we return, it still won't be locked. It'll
+ * continue to hold whatever pins were held before calling here.
*
* We match items by heap TID before assuming they are the right ones to set
* LP_DEAD. If the scan is one that holds a buffer pin on the target page
* continuously from initially reading the items until applying this function
- * (if it is a !so->dropPin scan), VACUUM cannot have deleted any items on the
+ * (if it is a !dropPin scan), VACUUM cannot have deleted any items on the
* page, so the page's TIDs can't have been recycled by now. There's no risk
* that we'll confuse a new index tuple that happens to use a recycled TID
* with a now-removed tuple with the same TID (that used to be on this same
* page). We can't rely on that during scans that drop buffer pins eagerly
- * (so->dropPin scans), though, so we must condition setting LP_DEAD bits on
+ * (i.e. dropPin scans), though, so we must condition setting LP_DEAD bits on
* the page LSN having not changed since back when _bt_readpage saw the page.
* We totally give up on setting LP_DEAD bits when the page LSN changed.
*
- * We give up much less often during !so->dropPin scans, but it still happens.
+ * We tend to give up less often during !dropPin scans, but it still happens.
* We cope with cases where items have moved right due to insertions. If an
* item has moved off the current page due to a split, we'll fail to find it
* and just give up on it.
*/
void
-_bt_killitems(IndexScanDesc scan)
+_bt_killitems(IndexScanDesc scan, IndexScanBatch batch)
{
Relation rel = scan->indexRelation;
- BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BTScanPos pos = (BTScanPos) batch->pos;
Page page;
BTPageOpaque opaque;
OffsetNumber minoff;
OffsetNumber maxoff;
- int numKilled = so->numKilled;
+ int numKilled = batch->numKilled;
bool killedsomething = false;
Buffer buf;
Assert(numKilled > 0);
- Assert(BTScanPosIsValid(so->currPos));
+ Assert(BTScanPosIsValid(*pos));
Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */
- /* Always invalidate so->killedItems[] before leaving so->currPos */
- so->numKilled = 0;
+ /* Always invalidate batch->killedItems[] before freeing batch */
+ batch->numKilled = 0;
- if (!so->dropPin)
+ if (!scan->batchState->dropPin)
{
/*
* We have held the pin on this page since we read the index tuples,
* so all we need to do is lock it. The pin will have prevented
* concurrent VACUUMs from recycling any of the TIDs on the page.
*/
- Assert(BTScanPosIsPinned(so->currPos));
- buf = so->currPos.buf;
+ buf = batch->buf;
_bt_lockbuf(rel, buf, BT_READ);
}
else
{
XLogRecPtr latestlsn;
- Assert(!BTScanPosIsPinned(so->currPos));
Assert(RelationNeedsWAL(rel));
- buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
+ buf = _bt_getbuf(rel, pos->currPage, BT_READ);
latestlsn = BufferGetLSNAtomic(buf);
- Assert(so->currPos.lsn <= latestlsn);
- if (so->currPos.lsn != latestlsn)
+ Assert(batch->lsn <= latestlsn);
+ if (batch->lsn != latestlsn)
{
/* Modified, give up on hinting */
_bt_relbuf(rel, buf);
@@ -3455,12 +3445,11 @@ _bt_killitems(IndexScanDesc scan)
for (int i = 0; i < numKilled; i++)
{
- int itemIndex = so->killedItems[i];
- BTScanPosItem *kitem = &so->currPos.items[itemIndex];
+ int itemIndex = batch->killedItems[i];
+ IndexScanBatchPosItem *kitem = &batch->items[itemIndex];
OffsetNumber offnum = kitem->indexOffset;
- Assert(itemIndex >= so->currPos.firstItem &&
- itemIndex <= so->currPos.lastItem);
+ Assert(itemIndex >= batch->firstItem && itemIndex <= batch->lastItem);
if (offnum < minoff)
continue; /* pure paranoia */
while (offnum <= maxoff)
@@ -3485,7 +3474,7 @@ _bt_killitems(IndexScanDesc scan)
* correctness.
*
* Note that the page may have been modified in almost any way
- * since we first read it (in the !so->dropPin case), so it's
+ * since we first read it (in the !dropPin case), so it's
* possible that this posting list tuple wasn't a posting list
* tuple when we first encountered its heap TIDs.
*/
@@ -3501,7 +3490,8 @@ _bt_killitems(IndexScanDesc scan)
* though only in the common case where the page can't
* have been concurrently modified
*/
- Assert(kitem->indexOffset == offnum || !so->dropPin);
+ Assert(kitem->indexOffset == offnum ||
+ !scan->batchState->dropPin);
/*
* Read-ahead to later kitems here.
@@ -3518,7 +3508,7 @@ _bt_killitems(IndexScanDesc scan)
* correctly -- posting tuple still gets killed).
*/
if (pi < numKilled)
- kitem = &so->currPos.items[so->killedItems[pi++]];
+ kitem = &batch->items[batch->killedItems[pi++]];
}
/*
@@ -3568,7 +3558,7 @@ _bt_killitems(IndexScanDesc scan)
MarkBufferDirtyHint(buf, true);
}
- if (!so->dropPin)
+ if (!scan->batchState->dropPin)
_bt_unlockbuf(rel, buf);
else
_bt_relbuf(rel, buf);
diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c
index 87c31da71a52..9cbb77438a81 100644
--- a/src/backend/access/spgist/spgutils.c
+++ b/src/backend/access/spgist/spgutils.c
@@ -90,8 +90,7 @@ spghandler(PG_FUNCTION_ARGS)
amroutine->amgettuple = spggettuple;
amroutine->amgetbitmap = spggetbitmap;
amroutine->amendscan = spgendscan;
- amroutine->ammarkpos = NULL;
- amroutine->amrestrpos = NULL;
+ amroutine->amposreset = NULL;
amroutine->amestimateparallelscan = NULL;
amroutine->aminitparallelscan = NULL;
amroutine->amparallelrescan = NULL;
diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c
index 5712fac36971..6d895e4ffce7 100644
--- a/src/backend/commands/indexcmds.c
+++ b/src/backend/commands/indexcmds.c
@@ -880,7 +880,7 @@ DefineIndex(Oid tableId,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("access method \"%s\" does not support multicolumn indexes",
accessMethodName)));
- if (exclusion && amRoutine->amgettuple == NULL)
+ if (exclusion && amRoutine->amgettuple == NULL && amRoutine->amgetbatch == NULL)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("access method \"%s\" does not support exclusion constraints",
diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c
index 1d0e8ad57b4a..ac337d9000f1 100644
--- a/src/backend/executor/execAmi.c
+++ b/src/backend/executor/execAmi.c
@@ -428,7 +428,7 @@ ExecSupportsMarkRestore(Path *pathnode)
case T_IndexOnlyScan:
/*
- * Not all index types support mark/restore.
+			 * Not all index types support restoring a mark.
*/
return castNode(IndexPath, pathnode)->indexinfo->amcanmarkpos;
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index f464cca9507a..5e7bafe07a34 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -49,7 +49,13 @@
static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node);
static void StoreIndexTuple(IndexOnlyScanState *node, TupleTableSlot *slot,
IndexTuple itup, TupleDesc itupdesc);
+static bool ios_prefetch_block(IndexScanDesc scan, void *arg,
+ IndexScanBatchPos *pos);
+/* values stored by ios_prefetch_block in the batch's visibility cache */
+#define IOS_UNKNOWN_VISIBILITY 0 /* default value */
+#define IOS_ALL_VISIBLE 1
+#define IOS_NOT_ALL_VISIBLE 2
/* ----------------------------------------------------------------
* IndexOnlyNext
@@ -103,6 +109,17 @@ IndexOnlyNext(IndexOnlyScanState *node)
node->ioss_ScanDesc->xs_want_itup = true;
node->ioss_VMBuffer = InvalidBuffer;
+ /*
+		 * Set the prefetch callback info, if the scan has batching enabled
+		 * (we only know that after index_beginscan, which also checks which
+		 * callbacks are defined for the AM).
+ */
+ if (scandesc->batchState != NULL)
+ {
+ scandesc->batchState->prefetch = ios_prefetch_block;
+ scandesc->batchState->prefetchArg = (void *) node;
+ }
+
/*
* If no run-time keys to calculate or they are ready, go ahead and
* pass the scankeys to the index AM.
@@ -120,10 +137,42 @@ IndexOnlyNext(IndexOnlyScanState *node)
*/
while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
{
+ bool all_visible;
bool tuple_from_heap = false;
CHECK_FOR_INTERRUPTS();
+ /*
+ * Without batching, inspect the VM directly. With batching, we need
+ * to retrieve the visibility information seen by the read_stream
+ * callback (or rather by ios_prefetch_block), otherwise the
+ * read_stream might get out of sync (if the VM got updated since
+ * then).
+ */
+ if (scandesc->batchState == NULL)
+ {
+ all_visible = VM_ALL_VISIBLE(scandesc->heapRelation,
+ ItemPointerGetBlockNumber(tid),
+ &node->ioss_VMBuffer);
+ }
+ else
+ {
+ /*
+ * Reuse the previously determined page visibility info, or
+ * calculate it now. If we decided not to prefetch the block, the
+ * page had to be all-visible at that point. The VM bit might have
+ * changed since then, but the tuple visibility could not have.
+ *
+			 * XXX It's a bit awkward that we use the visibility to decide
+			 * whether to skip prefetching the block, and then deduce the
+			 * visibility back from that decision (even though the mapping
+			 * is clear enough). Maybe we could/should have a more direct
+			 * way to read the private state?
+ */
+ all_visible = !ios_prefetch_block(scandesc, node,
+ &scandesc->batchState->readPos);
+ }
+
/*
* We can skip the heap fetch if the TID references a heap page on
* which all tuples are known visible to everybody. In any case,
@@ -158,9 +207,7 @@ IndexOnlyNext(IndexOnlyScanState *node)
* It's worth going through this complexity to avoid needing to lock
* the VM buffer, which could cause significant contention.
*/
- if (!VM_ALL_VISIBLE(scandesc->heapRelation,
- ItemPointerGetBlockNumber(tid),
- &node->ioss_VMBuffer))
+ if (!all_visible)
{
/*
* Rats, we have to visit the heap to check visibility.
@@ -889,3 +936,51 @@ ExecIndexOnlyScanRetrieveInstrumentation(IndexOnlyScanState *node)
node->ioss_SharedInfo = palloc(size);
memcpy(node->ioss_SharedInfo, SharedInfo, size);
}
+
+/* FIXME duplicate from indexam.c */
+#define INDEX_SCAN_BATCH(scan, idx) \
+ ((scan)->batchState->batches[(idx) % (scan)->batchState->maxBatches])
+
+/*
+ * ios_prefetch_block
+ * Callback to only prefetch blocks that are not all-visible.
+ *
+ * We don't want to inspect the visibility map repeatedly, so the result of
+ * VM_ALL_VISIBLE is stored in the batch private data. The values default to
+ * 0, so we use two nonzero constants to remember whether a block was found
+ * to be all-visible or not all-visible.
+ *
+ * However, this is not merely a question of performance. The VM may get
+ * modified during the scan, and we need to make sure the two places (the
+ * read_next callback and the index_fetch_heap call here) make the same
+ * decision, otherwise we might get out of sync with the stream. For example,
+ * the callback might find a page all-visible (and skip reading the block),
+ * and then someone might update the page, resetting the VM bit. If this code
+ * then attempted to read the page from the stream, it would fail, because it
+ * would probably receive an entirely different page.
+ */
+static bool
+ios_prefetch_block(IndexScanDesc scan, void *arg, IndexScanBatchPos *pos)
+{
+ IndexOnlyScanState *node = (IndexOnlyScanState *) arg;
+ IndexScanBatch batch = INDEX_SCAN_BATCH(scan, pos->batch);
+
+ if (batch->itemsvisibility == NULL)
+ batch->itemsvisibility = palloc0(sizeof(char) * (batch->lastItem + 1));
+
+ if (batch->itemsvisibility[pos->index] == IOS_UNKNOWN_VISIBILITY)
+ {
+ bool all_visible;
+ ItemPointer tid = &batch->items[pos->index].heapTid;
+
+ all_visible = VM_ALL_VISIBLE(scan->heapRelation,
+ ItemPointerGetBlockNumber(tid),
+ &node->ioss_VMBuffer);
+
+ batch->itemsvisibility[pos->index] =
+ all_visible ? IOS_ALL_VISIBLE : IOS_NOT_ALL_VISIBLE;
+ }
+
+ /* prefetch only blocks that are not all-visible */
+ return (batch->itemsvisibility[pos->index] == IOS_NOT_ALL_VISIBLE);
+}
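
The visibility cache maintained by ios_prefetch_block is what keeps the read stream and the heap-fetch path in agreement even if the visibility map changes mid-scan: the answer is computed once per item and reused thereafter. As a rough illustration of that technique outside PostgreSQL, here is a standalone sketch (hypothetical names, a plain array, and a stubbed vm_all_visible check stand in for the batch's itemsvisibility array and VM_ALL_VISIBLE):

/*
 * Standalone illustration (hypothetical, not PostgreSQL code) of the
 * three-state visibility cache used above: the first caller computes the
 * answer, every later caller reuses it, so the prefetch callback and the
 * fetch path can never disagree even if the source of truth changes
 * in between.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define VIS_UNKNOWN		0		/* default value, not checked yet */
#define VIS_ALL_VISIBLE	1
#define VIS_NOT_VISIBLE	2

/* stand-in for VM_ALL_VISIBLE(); pretend even-numbered blocks are all-visible */
static bool
vm_all_visible(int blockno)
{
	return (blockno % 2) == 0;
}

/* cached variant: computes the answer at most once per item */
static bool
item_needs_prefetch(char *cache, int item, int blockno)
{
	if (cache[item] == VIS_UNKNOWN)
		cache[item] = vm_all_visible(blockno) ? VIS_ALL_VISIBLE : VIS_NOT_VISIBLE;

	/* prefetch only blocks that are not known to be all-visible */
	return (cache[item] == VIS_NOT_VISIBLE);
}

int
main(void)
{
	int			nitems = 4;
	int			blocks[] = {10, 11, 12, 13};
	char	   *cache = calloc(nitems, 1);	/* all VIS_UNKNOWN */

	for (int i = 0; i < nitems; i++)
		printf("item %d (block %d): prefetch=%d\n",
			   i, blocks[i], item_needs_prefetch(cache, i, blocks[i]));

	free(cache);
	return 0;
}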
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 94077e6a006d..ab2756d47aaa 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -144,6 +144,7 @@ int max_parallel_workers_per_gather = 2;
bool enable_seqscan = true;
bool enable_indexscan = true;
+bool enable_indexscan_prefetch = true;
bool enable_indexonlyscan = true;
bool enable_bitmapscan = true;
bool enable_tidscan = true;
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index f4b7343daced..a315439dcdd4 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -313,11 +313,11 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
info->amsearcharray = amroutine->amsearcharray;
info->amsearchnulls = amroutine->amsearchnulls;
info->amcanparallel = amroutine->amcanparallel;
- info->amhasgettuple = (amroutine->amgettuple != NULL);
+ info->amhasgettuple = (amroutine->amgettuple != NULL ||
+ amroutine->amgetbatch != NULL);
info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
relation->rd_tableam->scan_bitmap_next_tuple != NULL;
- info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
- amroutine->amrestrpos != NULL);
+ info->amcanmarkpos = amroutine->amposreset != NULL;
info->amcostestimate = amroutine->amcostestimate;
Assert(info->amcostestimate != NULL);
diff --git a/src/backend/replication/logical/relation.c b/src/backend/replication/logical/relation.c
index 745fd3bab640..de6c66a15241 100644
--- a/src/backend/replication/logical/relation.c
+++ b/src/backend/replication/logical/relation.c
@@ -888,7 +888,8 @@ IsIndexUsableForReplicaIdentityFull(Relation idxrel, AttrMap *attrmap)
- * The given index access method must implement "amgettuple", which will
- * be used later to fetch the tuples. See RelationFindReplTupleByIndex().
+ * The given index access method must implement "amgettuple" or
+ * "amgetbatch", which will be used later to fetch the tuples. See
+ * RelationFindReplTupleByIndex().
*/
- if (GetIndexAmRoutineByAmId(idxrel->rd_rel->relam, false)->amgettuple == NULL)
+ if (GetIndexAmRoutineByAmId(idxrel->rd_rel->relam, false)->amgettuple == NULL &&
+ GetIndexAmRoutineByAmId(idxrel->rd_rel->relam, false)->amgetbatch == NULL)
return false;
return true;
diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index 031fde9f4cbe..e34e60060604 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -99,6 +99,7 @@ struct ReadStream
int16 forwarded_buffers;
int16 pinned_buffers;
int16 distance;
+ int16 distance_old;
int16 initialized_buffers;
int read_buffers_flags;
bool sync_mode; /* using io_method=sync */
@@ -464,6 +465,7 @@ read_stream_look_ahead(ReadStream *stream)
if (blocknum == InvalidBlockNumber)
{
/* End of stream. */
+ stream->distance_old = stream->distance;
stream->distance = 0;
break;
}
@@ -862,6 +864,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
else
{
/* No more blocks, end of stream. */
+ stream->distance_old = stream->distance;
stream->distance = 0;
stream->oldest_buffer_index = stream->next_buffer_index;
stream->pinned_buffers = 0;
@@ -1046,6 +1049,9 @@ read_stream_reset(ReadStream *stream)
int16 index;
Buffer buffer;
+ /* remember the old distance (if we reset before end of the stream) */
+ stream->distance_old = Max(stream->distance, stream->distance_old);
+
/* Stop looking ahead. */
stream->distance = 0;
@@ -1078,8 +1084,12 @@ read_stream_reset(ReadStream *stream)
Assert(stream->pinned_buffers == 0);
Assert(stream->ios_in_progress == 0);
- /* Start off assuming data is cached. */
- stream->distance = 1;
+ /*
+ * Restore the old distance, if we have one. Otherwise start assuming data
+ * is cached.
+ */
+ stream->distance = Max(1, stream->distance_old);
+ stream->distance_old = 0;
}
/*
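
The read_stream.c hunks above only add bookkeeping: the look-ahead distance the stream has built up is stashed in distance_old whenever the stream ends or is reset, and restored on the next restart instead of ramping up from 1 again. A standalone sketch of just that save/restore logic, using a toy struct holding only the two fields this patch touches:

/*
 * Standalone sketch (not PostgreSQL code) of the distance_old bookkeeping
 * added above: remember the look-ahead distance when the stream ends or is
 * reset, and restore it instead of ramping up from 1 again.
 */
#include <stdio.h>

#define Max(a, b)	((a) > (b) ? (a) : (b))

typedef struct ToyStream
{
	short		distance;		/* current look-ahead distance */
	short		distance_old;	/* distance remembered across end/reset */
} ToyStream;

static void
stream_end_or_reset(ToyStream *s)
{
	/* remember the old distance (if we reset before end of the stream) */
	s->distance_old = Max(s->distance, s->distance_old);
	s->distance = 0;			/* stop looking ahead */
}

static void
stream_restart(ToyStream *s)
{
	/* restore the old distance if we have one, else assume cached data */
	s->distance = Max(1, s->distance_old);
	s->distance_old = 0;
}

int
main(void)
{
	ToyStream	s = {.distance = 64, .distance_old = 0};

	stream_end_or_reset(&s);
	stream_restart(&s);
	printf("distance after restart: %d\n", s.distance);	/* 64, not 1 */
	return 0;
}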
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e8544acb7841..ab044b05e055 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1538,6 +1538,46 @@ ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
}
+/*
+ * Check if the buffer is already undergoing read AIO. If it is, assign the
+ * IO's wait reference to operation->io_wref, thereby allowing the caller to
+ * wait for that IO.
+ */
+static inline bool
+ReadBuffersIOAlreadyInProgress(ReadBuffersOperation *operation, Buffer buffer)
+{
+ BufferDesc *desc;
+ uint32 buf_state;
+ PgAioWaitRef iow;
+
+ pgaio_wref_clear(&iow);
+
+ if (BufferIsLocal(buffer))
+ {
+ desc = GetLocalBufferDescriptor(-buffer - 1);
+ buf_state = pg_atomic_read_u32(&desc->state);
+ if ((buf_state & BM_IO_IN_PROGRESS) && !(buf_state & BM_VALID))
+ iow = desc->io_wref;
+ }
+ else
+ {
+ desc = GetBufferDescriptor(buffer - 1);
+ buf_state = LockBufHdr(desc);
+
+ if ((buf_state & BM_IO_IN_PROGRESS) && !(buf_state & BM_VALID))
+ iow = desc->io_wref;
+ UnlockBufHdr(desc, buf_state);
+ }
+
+ if (pgaio_wref_valid(&iow))
+ {
+ operation->io_wref = iow;
+ return true;
+ }
+
+ return false;
+}
+
/*
* Helper for AsyncReadBuffers that tries to get the buffer ready for IO.
*/
@@ -1670,7 +1710,7 @@ WaitReadBuffers(ReadBuffersOperation *operation)
*
* we first check if we already know the IO is complete.
*/
- if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
+ if ((operation->foreign_io || aio_ret->result.status == PGAIO_RS_UNKNOWN) &&
!pgaio_wref_check_done(&operation->io_wref))
{
instr_time io_start = pgstat_prepare_io_time(track_io_timing);
@@ -1689,11 +1729,66 @@ WaitReadBuffers(ReadBuffersOperation *operation)
Assert(pgaio_wref_check_done(&operation->io_wref));
}
- /*
- * We now are sure the IO completed. Check the results. This
- * includes reporting on errors if there were any.
- */
- ProcessReadBuffersResult(operation);
+ if (unlikely(operation->foreign_io))
+ {
+ Buffer buffer = operation->buffers[operation->nblocks_done];
+ BufferDesc *desc;
+ uint32 buf_state;
+
+ if (BufferIsLocal(buffer))
+ {
+ desc = GetLocalBufferDescriptor(-buffer - 1);
+ buf_state = pg_atomic_read_u32(&desc->state);
+ }
+ else
+ {
+ desc = GetBufferDescriptor(buffer - 1);
+ buf_state = LockBufHdr(desc);
+ UnlockBufHdr(desc, buf_state);
+ }
+
+ if (buf_state & BM_VALID)
+ {
+ operation->nblocks_done += 1;
+ Assert(operation->nblocks_done <= operation->nblocks);
+
+ /*
+ * Report and track this as a 'hit' for this backend, even
+ * though it must have started out as a miss in
+ * PinBufferForBlock(). The other backend (or ourselves,
+ * as part of a read started earlier) will track this as a
+ * 'read'.
+ */
+ TRACE_POSTGRESQL_BUFFER_READ_DONE(operation->forknum,
+ operation->blocknum + operation->nblocks_done,
+ operation->smgr->smgr_rlocator.locator.spcOid,
+ operation->smgr->smgr_rlocator.locator.dbOid,
+ operation->smgr->smgr_rlocator.locator.relNumber,
+ operation->smgr->smgr_rlocator.backend,
+ true);
+
+ if (BufferIsLocal(buffer))
+ pgBufferUsage.local_blks_hit += 1;
+ else
+ pgBufferUsage.shared_blks_hit += 1;
+
+ if (operation->rel)
+ pgstat_count_buffer_hit(operation->rel);
+
+ pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
+
+ if (VacuumCostActive)
+ VacuumCostBalance += VacuumCostPageHit;
+ }
+ }
+ else
+ {
+ /*
+ * We now are sure the IO completed. Check the results. This
+ * includes reporting on errors if there were any.
+ */
+ ProcessReadBuffersResult(operation);
+ }
}
/*
@@ -1779,6 +1874,43 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
io_object = IOOBJECT_RELATION;
}
+ /*
+ * If AIO is in progress, be it in this backend or another backend, we
+ * just associate the wait reference with the operation and wait in
+ * WaitReadBuffers(). This turns out to be important for performance in
+ * two workloads:
+ *
+ * 1) A read stream that has to read the same block multiple times within
+ * the readahead distance. This can happen e.g. for the table accesses of
+ * an index scan.
+ *
+ * 2) Concurrent scans by multiple backends on the same relation.
+ *
+ * If we were to synchronously wait for the in-progress IO, we'd not be
+ * able to keep enough I/O in flight.
+ *
+	 * If we do find that there is ongoing I/O for the buffer, we set up a
+	 * 1-block ReadBuffersOperation that WaitReadBuffers can then wait on.
+	 *
+	 * It's possible that another backend starts IO on the buffer between this
+	 * check and the ReadBuffersCanStartIO(nowait = false) call below. In that
+	 * case we will synchronously wait for the IO, but that window is small
+	 * enough that it should not happen often enough to have a significant
+	 * performance impact.
+ */
+ if (ReadBuffersIOAlreadyInProgress(operation, buffers[nblocks_done]))
+ {
+ *nblocks_progress = 1;
+ operation->foreign_io = true;
+
+ CheckReadBuffersOperation(operation, false);
+
+ return true;
+ }
+
+ operation->foreign_io = false;
+
/*
* If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
* flag. The reason for that is that, hopefully, zero_damaged_pages isn't
@@ -1836,9 +1968,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
/*
* Check if we can start IO on the first to-be-read buffer.
*
- * If an I/O is already in progress in another backend, we want to wait
- * for the outcome: either done, or something went wrong and we will
- * retry.
+ * If a synchronous I/O is in progress in another backend (it can't be
+ * this backend), we want to wait for the outcome: either done, or
+ * something went wrong and we will retry.
*/
if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
{
diff --git a/src/backend/utils/adt/amutils.c b/src/backend/utils/adt/amutils.c
index 0af26d6acfab..1ebe0a76af43 100644
--- a/src/backend/utils/adt/amutils.c
+++ b/src/backend/utils/adt/amutils.c
@@ -363,7 +363,7 @@ indexam_property(FunctionCallInfo fcinfo,
PG_RETURN_BOOL(routine->amclusterable);
case AMPROP_INDEX_SCAN:
- PG_RETURN_BOOL(routine->amgettuple ? true : false);
+ PG_RETURN_BOOL(routine->amgettuple || routine->amgetbatch ? true : false);
case AMPROP_BITMAP_SCAN:
PG_RETURN_BOOL(routine->amgetbitmap ? true : false);
@@ -392,7 +392,7 @@ indexam_property(FunctionCallInfo fcinfo,
PG_RETURN_BOOL(routine->amcanmulticol);
case AMPROP_CAN_EXCLUDE:
- PG_RETURN_BOOL(routine->amgettuple ? true : false);
+ PG_RETURN_BOOL(routine->amgettuple || routine->amgetbatch ? true : false);
case AMPROP_CAN_INCLUDE:
PG_RETURN_BOOL(routine->amcaninclude);
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index cb23ad527826..652bb4c537d5 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -6782,6 +6782,10 @@ get_actual_variable_endpoint(Relation heapRel,
* a huge amount of time here, so we give up once we've read too many heap
* pages. When we fail for that reason, the caller will end up using
* whatever extremal value is recorded in pg_statistic.
+ *
+ * XXX We're not using ios_prefetch_block here. That creates a window
+ * where the scan's read stream can get out of sync. At a minimum we'll
+ * need to close this window by explicitly disabling heap I/O prefetching.
*/
InitNonVacuumableSnapshot(SnapshotNonVacuumable,
GlobalVisTestFor(heapRel));
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index d6fc83338505..fc0caf840827 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -54,6 +54,13 @@
boot_val => 'true',
},
+{ name => 'enable_indexscan_prefetch', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD',
+  short_desc => 'Enables prefetching for index scans and index-only scans.',
+ flags => 'GUC_EXPLAIN',
+ variable => 'enable_indexscan_prefetch',
+ boot_val => 'true',
+},
+
{ name => 'enable_bitmapscan', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD',
short_desc => 'Enables the planner\'s use of bitmap-scan plans.',
flags => 'GUC_EXPLAIN',
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index f62b61967ef6..20a0ffaa5385 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -412,6 +412,7 @@
#enable_incremental_sort = on
#enable_indexscan = on
#enable_indexonlyscan = on
+#enable_indexscan_prefetch = on
#enable_material = on
#enable_memoize = on
#enable_mergejoin = on
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index 63dd41c1f21b..3a651744ef9a 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -198,6 +198,15 @@ typedef void (*amrescan_function) (IndexScanDesc scan,
typedef bool (*amgettuple_function) (IndexScanDesc scan,
ScanDirection direction);
+/* next batch of valid tuples */
+typedef IndexScanBatch(*amgetbatch_function) (IndexScanDesc scan,
+ IndexScanBatch batch,
+ ScanDirection direction);
+
+/* release batch of valid tuples */
+typedef void (*amfreebatch_function) (IndexScanDesc scan,
+ IndexScanBatch batch);
+
/* fetch all valid tuples */
typedef int64 (*amgetbitmap_function) (IndexScanDesc scan,
TIDBitmap *tbm);
@@ -205,11 +214,9 @@ typedef int64 (*amgetbitmap_function) (IndexScanDesc scan,
/* end index scan */
typedef void (*amendscan_function) (IndexScanDesc scan);
-/* mark current scan position */
-typedef void (*ammarkpos_function) (IndexScanDesc scan);
-
-/* restore marked scan position */
-typedef void (*amrestrpos_function) (IndexScanDesc scan);
+/* invalidate index AM state that independently tracks scan's position */
+typedef void (*amposreset_function) (IndexScanDesc scan,
+ IndexScanBatch batch);
/*
* Callback function signatures - for parallel index scans.
@@ -309,10 +316,11 @@ typedef struct IndexAmRoutine
ambeginscan_function ambeginscan;
amrescan_function amrescan;
amgettuple_function amgettuple; /* can be NULL */
+ amgetbatch_function amgetbatch; /* can be NULL */
+ amfreebatch_function amfreebatch; /* can be NULL */
amgetbitmap_function amgetbitmap; /* can be NULL */
amendscan_function amendscan;
- ammarkpos_function ammarkpos; /* can be NULL */
- amrestrpos_function amrestrpos; /* can be NULL */
+ amposreset_function amposreset; /* can be NULL */
/* interface functions to support parallel index scans */
amestimateparallelscan_function amestimateparallelscan; /* can be NULL */
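
From an index AM author's perspective, the upshot of the amapi.h changes is that a handler can register amgetbatch/amfreebatch instead of (or alongside) amgettuple, and provide amposreset if restoring a marked position should remain possible (per the plancat.c change above, a NULL amposreset means the planner will not consider mark/restore for the AM). The sketch below is hypothetical: the foo* callbacks are placeholders and most handler boilerplate is elided; only the IndexAmRoutine members correspond to fields added or kept by this patch.

/*
 * Hypothetical handler sketch for an amgetbatch-capable AM; the foo*
 * callbacks are assumed to be declared earlier in the same file, and
 * PG_MODULE_MAGIC plus the remaining IndexAmRoutine fields are elided.
 */
#include "postgres.h"

#include "access/amapi.h"
#include "fmgr.h"

PG_FUNCTION_INFO_V1(foohandler);

Datum
foohandler(PG_FUNCTION_ARGS)
{
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	/* ... capability flags, build/insert/vacuum callbacks elided ... */

	amroutine->ambeginscan = foobeginscan;
	amroutine->amrescan = foorescan;
	amroutine->amgettuple = NULL;			/* batching AMs may omit this */
	amroutine->amgetbatch = foogetbatch;	/* returns an IndexScanBatch */
	amroutine->amfreebatch = foofreebatch;	/* releases a batch */
	amroutine->amgetbitmap = foogetbitmap;
	amroutine->amendscan = fooendscan;
	amroutine->amposreset = fooposreset;	/* NULL disables mark/restore */

	PG_RETURN_POINTER(amroutine);
}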
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 9200a22bd9f9..913945c4b08a 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -15,6 +15,7 @@
#define GENAM_H
#include "access/htup.h"
+#include "access/itup.h"
#include "access/sdir.h"
#include "access/skey.h"
#include "nodes/tidbitmap.h"
@@ -115,6 +116,7 @@ typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr, void *state);
/* struct definitions appear in relscan.h */
typedef struct IndexScanDescData *IndexScanDesc;
+typedef struct IndexScanBatchData *IndexScanBatch;
typedef struct SysScanDescData *SysScanDesc;
typedef struct ParallelIndexScanDescData *ParallelIndexScanDesc;
@@ -228,6 +230,9 @@ extern void index_store_float8_orderby_distances(IndexScanDesc scan,
bool recheckOrderBy);
extern bytea *index_opclass_options(Relation indrel, AttrNumber attnum,
Datum attoptions, bool validate);
+extern IndexScanBatch index_batch_alloc(IndexScanDesc scan, int maxitems, bool want_itup);
+extern void index_batch_release(IndexScanDesc scan, IndexScanBatch batch);
+extern void index_batch_unlock(Relation rel, bool dropPin, IndexScanBatch batch);
/*
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 909db73b7bbb..744ad2fac145 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -117,6 +117,7 @@ typedef struct IndexFetchHeapData
IndexFetchTableData xs_base; /* AM independent part of the descriptor */
Buffer xs_cbuf; /* current heap buffer in scan, if any */
+ BlockNumber xs_blk; /* xs_cbuf's block number, if any */
/* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
} IndexFetchHeapData;
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 16be5c7a9c15..119705f64b81 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -939,10 +939,10 @@ typedef BTVacuumPostingData *BTVacuumPosting;
* processing. This approach minimizes lock/unlock traffic. We must always
* drop the lock to make it okay for caller to process the returned items.
* Whether or not we can also release the pin during this window will vary.
- * We drop the pin (when so->dropPin) to avoid blocking progress by VACUUM
- * (see nbtree/README section about making concurrent TID recycling safe).
- * We'll always release both the lock and the pin on the current page before
- * moving on to its sibling page.
+ * We drop the pin (when dropPin is set in batch state) to avoid blocking
+ * progress by VACUUM (see nbtree/README section about making concurrent TID
+ * recycling safe). We'll always release both the lock and the pin on the
+ * current page before moving on to its sibling page.
*
* If we are doing an index-only scan, we save the entire IndexTuple for each
* matched item, otherwise only its heap TID and offset. The IndexTuples go
@@ -961,74 +961,25 @@ typedef struct BTScanPosItem /* what we remember about each match */
typedef struct BTScanPosData
{
- Buffer buf; /* currPage buf (invalid means unpinned) */
-
/* page details as of the saved position's call to _bt_readpage */
BlockNumber currPage; /* page referenced by items array */
BlockNumber prevPage; /* currPage's left link */
BlockNumber nextPage; /* currPage's right link */
- XLogRecPtr lsn; /* currPage's LSN (when so->dropPin) */
/* scan direction for the saved position's call to _bt_readpage */
ScanDirection dir;
- /*
- * If we are doing an index-only scan, nextTupleOffset is the first free
- * location in the associated tuple storage workspace.
- */
- int nextTupleOffset;
-
/*
* moreLeft and moreRight track whether we think there may be matching
* index entries to the left and right of the current page, respectively.
*/
bool moreLeft;
bool moreRight;
-
- /*
- * The items array is always ordered in index order (ie, increasing
- * indexoffset). When scanning backwards it is convenient to fill the
- * array back-to-front, so we start at the last slot and fill downwards.
- * Hence we need both a first-valid-entry and a last-valid-entry counter.
- * itemIndex is a cursor showing which entry was last returned to caller.
- */
- int firstItem; /* first valid index in items[] */
- int lastItem; /* last valid index in items[] */
- int itemIndex; /* current index in items[] */
-
- BTScanPosItem items[MaxTIDsPerBTreePage]; /* MUST BE LAST */
} BTScanPosData;
typedef BTScanPosData *BTScanPos;
-#define BTScanPosIsPinned(scanpos) \
-( \
- AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
- !BufferIsValid((scanpos).buf)), \
- BufferIsValid((scanpos).buf) \
-)
-#define BTScanPosUnpin(scanpos) \
- do { \
- ReleaseBuffer((scanpos).buf); \
- (scanpos).buf = InvalidBuffer; \
- } while (0)
-#define BTScanPosUnpinIfPinned(scanpos) \
- do { \
- if (BTScanPosIsPinned(scanpos)) \
- BTScanPosUnpin(scanpos); \
- } while (0)
-
-#define BTScanPosIsValid(scanpos) \
-( \
- AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
- !BufferIsValid((scanpos).buf)), \
- BlockNumberIsValid((scanpos).currPage) \
-)
-#define BTScanPosInvalidate(scanpos) \
- do { \
- (scanpos).buf = InvalidBuffer; \
- (scanpos).currPage = InvalidBlockNumber; \
- } while (0)
+#define BTScanPosIsValid(scanpos) BlockNumberIsValid((scanpos).currPage)
/* We need one of these for each equality-type SK_SEARCHARRAY scan key */
typedef struct BTArrayKeyInfo
@@ -1066,32 +1017,7 @@ typedef struct BTScanOpaqueData
BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */
FmgrInfo *orderProcs; /* ORDER procs for required equality keys */
MemoryContext arrayContext; /* scan-lifespan context for array data */
-
- /* info about killed items if any (killedItems is NULL if never used) */
- int *killedItems; /* currPos.items indexes of killed items */
- int numKilled; /* number of currently stored items */
- bool dropPin; /* drop leaf pin before btgettuple returns? */
-
- /*
- * If we are doing an index-only scan, these are the tuple storage
- * workspaces for the currPos and markPos respectively. Each is of size
- * BLCKSZ, so it can hold as much as a full page's worth of tuples.
- */
- char *currTuples; /* tuple storage for currPos */
- char *markTuples; /* tuple storage for markPos */
-
- /*
- * If the marked position is on the same page as current position, we
- * don't use markPos, but just keep the marked itemIndex in markItemIndex
- * (all the rest of currPos is valid for the mark position). Hence, to
- * determine if there is a mark, first look at markItemIndex, then at
- * markPos.
- */
- int markItemIndex; /* itemIndex, or -1 if not valid */
-
- /* keep these last in struct for efficiency */
- BTScanPosData currPos; /* current position data */
- BTScanPosData markPos; /* marked position, if any */
+ BTScanPos pos;
} BTScanOpaqueData;
typedef BTScanOpaqueData *BTScanOpaque;
@@ -1191,14 +1117,15 @@ extern bool btinsert(Relation rel, Datum *values, bool *isnull,
extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys);
extern Size btestimateparallelscan(Relation rel, int nkeys, int norderbys);
extern void btinitparallelscan(void *target);
-extern bool btgettuple(IndexScanDesc scan, ScanDirection dir);
+extern IndexScanBatch btgetbatch(IndexScanDesc scan, IndexScanBatch batch,
+ ScanDirection dir);
extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
ScanKey orderbys, int norderbys);
+extern void btfreebatch(IndexScanDesc scan, IndexScanBatch batch);
extern void btparallelrescan(IndexScanDesc scan);
extern void btendscan(IndexScanDesc scan);
-extern void btmarkpos(IndexScanDesc scan);
-extern void btrestrpos(IndexScanDesc scan);
+extern void btposreset(IndexScanDesc scan, IndexScanBatch markbatch);
extern IndexBulkDeleteResult *btbulkdelete(IndexVacuumInfo *info,
IndexBulkDeleteResult *stats,
IndexBulkDeleteCallback callback,
@@ -1306,8 +1233,9 @@ extern BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key,
Buffer *bufP, int access);
extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum);
-extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
-extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
+extern IndexScanBatch _bt_first(IndexScanDesc scan, ScanDirection dir);
+extern IndexScanBatch _bt_next(IndexScanDesc scan, ScanDirection dir,
+ IndexScanBatch priorbatch);
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
/*
@@ -1327,7 +1255,7 @@ extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arra
extern bool _bt_scanbehind_checkkeys(IndexScanDesc scan, ScanDirection dir,
IndexTuple finaltup);
extern void _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate);
-extern void _bt_killitems(IndexScanDesc scan);
+extern void _bt_killitems(IndexScanDesc scan, IndexScanBatch batch);
extern BTCycleId _bt_vacuum_cycleid(Relation rel);
extern BTCycleId _bt_start_vacuum(Relation rel);
extern void _bt_end_vacuum(Relation rel);
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index b5e0fb386c0a..eb306a9dfb0f 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -16,9 +16,11 @@
#include "access/htup_details.h"
#include "access/itup.h"
+#include "access/sdir.h"
#include "nodes/tidbitmap.h"
#include "port/atomics.h"
#include "storage/buf.h"
+#include "storage/read_stream.h"
#include "storage/relfilelocator.h"
#include "storage/spin.h"
#include "utils/relcache.h"
@@ -121,10 +123,161 @@ typedef struct ParallelBlockTableScanWorkerData *ParallelBlockTableScanWorker;
typedef struct IndexFetchTableData
{
Relation rel;
+ ReadStream *rs;
} IndexFetchTableData;
struct IndexScanInstrumentation;
+/* Forward declaration, the prefetch callback needs IndexScanDescData. */
+typedef struct IndexScanBatchData IndexScanBatchData;
+
+typedef struct IndexScanBatchPosItem /* what we remember about each match */
+{
+ ItemPointerData heapTid; /* TID of referenced heap item */
+ OffsetNumber indexOffset; /* index item's location within page */
+ LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */
+} IndexScanBatchPosItem;
+
+/*
+ * Data about one batch of items returned by the index AM
+ */
+typedef struct IndexScanBatchData
+{
+ Buffer buf; /* currPage buf (invalid means unpinned) */
+ XLogRecPtr lsn; /* currPage's LSN (when dropPin) */
+
+ /*
+ * AM-specific state representing the current position of the scan within
+ * the index
+ */
+ void *pos;
+
+ /*
+ * The items array is always ordered in index order (ie, increasing
+ * indexoffset). When scanning backwards it is convenient to fill the
+ * array back-to-front, so we start at the last slot and fill downwards.
+ * Hence we need both a first-valid-entry and a last-valid-entry counter.
+ */
+ int firstItem; /* first valid index in items[] */
+ int lastItem; /* last valid index in items[] */
+
+ /* info about killed items if any (killedItems is NULL if never used) */
+ int *killedItems; /* indexes of killed items */
+ int numKilled; /* number of currently stored items */
+
+ /*
+ * If we are doing an index-only scan, these are the tuple storage
+ * workspaces for the matching tuples (tuples referenced by items[]). Each
+ * is of size BLCKSZ, so it can hold as much as a full page's worth of
+ * tuples.
+ *
+	 * XXX maybe currTuples should be part of the AM-specific per-batch state
+	 * stored in the "pos" field?
+ */
+ char *currTuples; /* tuple storage for items[] */
+
+ /*
+ * batch contents (TIDs, index tuples, kill bitmap, ...)
+ *
+	 * XXX Shouldn't this be part of the "IndexScanBatchPosItem" struct, to
+	 * keep everything in one place? One advantage of separate arrays is that
+	 * we don't need to allocate memory for arrays we don't need (e.g. if we
+	 * don't need heap tuples, we don't allocate that), which wouldn't be
+	 * possible with everything in one struct.
+ */
+ char *itemsvisibility; /* Index-only scan visibility cache */
+
+ int maxitems;
+ IndexScanBatchPosItem items[FLEXIBLE_ARRAY_MEMBER];
+} IndexScanBatchData;
+
+/*
+ * Position in the queue of batches - index of a batch, index of item in a batch.
+ */
+typedef struct IndexScanBatchPos
+{
+ int batch;
+ int index;
+} IndexScanBatchPos;
+
+typedef struct IndexScanDescData IndexScanDescData;
+typedef bool (*IndexPrefetchCallback) (IndexScanDescData * scan,
+ void *arg,
+ IndexScanBatchPos *pos);
+
+/*
+ * State used by amgetbatch index AMs, which manage per-page batches of items
+ * with matching index tuples using a circular buffer
+ */
+typedef struct IndexScanBatchState
+{
+ /* Index AM drops leaf pin before amgetbatch returns? */
+ bool dropPin;
+
+ /*
+	 * Did we read the final batch in this scan direction? Batches may be
+	 * loaded from multiple places, so we need to remember when we fail to
+	 * load the next batch in a given scan direction (which means "no more
+	 * batches"). A later amgetbatch call might otherwise restart the scan,
+	 * so we need to remember that it's over.
+ */
+ bool finished;
+ bool reset;
+
+ /*
+ * Did we disable prefetching/use of a read stream because it didn't pay
+ * for itself?
+ */
+ bool prefetchingLockedIn;
+ bool disabled;
+
+ /*
+ * During prefetching, currentPrefetchBlock is the table AM block number
+ * that was returned by our read stream callback most recently. Used to
+ * suppress duplicate successive read stream block requests.
+ *
+ * Prefetching can still perform non-successive requests for the same
+ * block number (in general we're prefetching in exactly the same order
+ * that the scan will return table AM TIDs in). We need to avoid
+ * duplicate successive requests because table AMs expect to be able to
+ * hang on to buffer pins across table_index_fetch_tuple calls.
+ */
+ BlockNumber currentPrefetchBlock;
+
+ /*
+ * Current scan direction, for the currently loaded batches. This is used
+ * to load data in the read stream API callback, etc.
+ */
+ ScanDirection direction;
+
+ /* positions in the queue of batches (batch + item) */
+ IndexScanBatchPos readPos; /* read position */
+ IndexScanBatchPos streamPos; /* prefetch position (for read stream API) */
+ IndexScanBatchPos markPos; /* mark/restore position */
+
+ IndexScanBatchData *markBatch;
+
+ /*
+	 * Array of batches returned by the AM. The array has a fixed capacity,
+	 * but can be resized if needed. headBatch is the index of the batch we're
+	 * currently reading from (it must be translated modulo maxBatches into an
+	 * index into the batches array).
+ */
+ int maxBatches; /* size of the batches array */
+ int headBatch; /* head batch slot */
+ int nextBatch; /* next empty batch slot */
+
+ /* small cache of unused batches, to reduce malloc/free traffic */
+ int batchesCacheSize;
+ IndexScanBatchData **batchesCache;
+
+ IndexScanBatchData **batches;
+
+ /* callback to skip prefetching in IOS etc. */
+ IndexPrefetchCallback prefetch;
+ void *prefetchArg;
+} IndexScanBatchState;
+
/*
* We use the same IndexScanDescData structure for both amgettuple-based
* and amgetbitmap-based index scans. Some fields are only relevant in
@@ -138,6 +291,8 @@ typedef struct IndexScanDescData
struct SnapshotData *xs_snapshot; /* snapshot to see */
int numberOfKeys; /* number of index qualifier conditions */
int numberOfOrderBys; /* number of ordering operators */
+ IndexScanBatchState *batchState; /* amgetbatch related state */
+
struct ScanKeyData *keyData; /* array of index qualifier descriptors */
struct ScanKeyData *orderByData; /* array of ordering op descriptors */
bool xs_want_itup; /* caller requests index tuples */
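
The batch queue described above is a plain circular buffer: headBatch and nextBatch are monotonically increasing counters, and a slot is located by taking the counter modulo maxBatches, exactly as the INDEX_SCAN_BATCH() macro does in nodeIndexonlyscan.c. A small standalone toy (not PostgreSQL code, integer payloads instead of IndexScanBatchData pointers) of that indexing scheme:

/*
 * Standalone toy showing the circular-buffer indexing used by the batch
 * queue: headBatch/nextBatch grow monotonically and are mapped into a
 * fixed-size slot array with modulo arithmetic.
 */
#include <assert.h>
#include <stdio.h>

#define MAX_BATCHES 4

typedef struct BatchQueue
{
	int			headBatch;		/* oldest batch still in use */
	int			nextBatch;		/* next empty slot (monotonic counter) */
	int			slots[MAX_BATCHES]; /* stand-in for the batches[] array */
} BatchQueue;

#define QUEUE_SLOT(q, idx)	((q)->slots[(idx) % MAX_BATCHES])

static void
enqueue(BatchQueue *q, int value)
{
	assert(q->nextBatch - q->headBatch < MAX_BATCHES);	/* must not overflow */
	QUEUE_SLOT(q, q->nextBatch) = value;
	q->nextBatch++;
}

static int
dequeue(BatchQueue *q)
{
	assert(q->headBatch < q->nextBatch);	/* must not be empty */
	return QUEUE_SLOT(q, q->headBatch++);
}

int
main(void)
{
	BatchQueue	q = {0};

	/* fill and drain repeatedly; slot reuse wraps around transparently */
	for (int i = 0; i < 10; i++)
	{
		enqueue(&q, 100 + i);
		printf("batch %d read from slot %d: %d\n",
			   i, i % MAX_BATCHES, dequeue(&q));
	}
	return 0;
}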
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 30d889b54c53..6879fe99b4bf 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -1343,7 +1343,7 @@ typedef struct IndexOptInfo
/* does AM have amgetbitmap interface? */
bool amhasgetbitmap;
bool amcanparallel;
- /* does AM have ammarkpos interface? */
+ /* is AM prepared for us to restore a mark? */
bool amcanmarkpos;
/* AM's cost estimator */
/* Rather than include amapi.h here, we declare amcostestimate like this */
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index b523bcda8f3d..00f4c3d00118 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -51,6 +51,7 @@ extern PGDLLIMPORT Cost disable_cost;
extern PGDLLIMPORT int max_parallel_workers_per_gather;
extern PGDLLIMPORT bool enable_seqscan;
extern PGDLLIMPORT bool enable_indexscan;
+extern PGDLLIMPORT bool enable_indexscan_prefetch;
extern PGDLLIMPORT bool enable_indexonlyscan;
extern PGDLLIMPORT bool enable_bitmapscan;
extern PGDLLIMPORT bool enable_tidscan;
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index b5f8f3c5d42f..139499111878 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -147,6 +147,7 @@ struct ReadBuffersOperation
int flags;
int16 nblocks;
int16 nblocks_done;
+ bool foreign_io;
PgAioWaitRef io_wref;
PgAioReturn io_return;
};
diff --git a/src/test/modules/dummy_index_am/dummy_index_am.c b/src/test/modules/dummy_index_am/dummy_index_am.c
index 94ef639b6fcd..be1b0f55ca7e 100644
--- a/src/test/modules/dummy_index_am/dummy_index_am.c
+++ b/src/test/modules/dummy_index_am/dummy_index_am.c
@@ -319,8 +319,7 @@ dihandler(PG_FUNCTION_ARGS)
amroutine->amgettuple = NULL;
amroutine->amgetbitmap = NULL;
amroutine->amendscan = diendscan;
- amroutine->ammarkpos = NULL;
- amroutine->amrestrpos = NULL;
+ amroutine->amposreset = NULL;
amroutine->amestimateparallelscan = NULL;
amroutine->aminitparallelscan = NULL;
amroutine->amparallelrescan = NULL;
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index 3b37fafa65b9..9702e3103955 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -159,6 +159,7 @@ select name, setting from pg_settings where name like 'enable%';
enable_incremental_sort | on
enable_indexonlyscan | on
enable_indexscan | on
+ enable_indexscan_prefetch | on
enable_material | on
enable_memoize | on
enable_mergejoin | on
@@ -173,7 +174,7 @@ select name, setting from pg_settings where name like 'enable%';
enable_seqscan | on
enable_sort | on
enable_tidscan | on
-(25 rows)
+(26 rows)
-- There are always wait event descriptions for various types. InjectionPoint
-- may be present or absent, depending on history since last postmaster start.
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 018b5919cf66..d018bb067c46 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -193,6 +193,8 @@ BOOL
BOOLEAN
BOX
BTArrayKeyInfo
+BTBatchInfo
+BTBatchScanPosData
BTBuildState
BTCallbackState
BTCycleId
@@ -1268,6 +1270,10 @@ IndexOrderByDistance
IndexPath
IndexRuntimeKeyInfo
IndexScan
+IndexScanBatchData
+IndexScanBatchPos
+IndexScanBatchPosItem
+IndexScanBatches
IndexScanDesc
IndexScanInstrumentation
IndexScanState
@@ -3425,10 +3431,10 @@ amestimateparallelscan_function
+amgetbatch_function
 amgetbitmap_function
 amgettreeheight_function
 amgettuple_function
aminitparallelscan_function
aminsert_function
aminsertcleanup_function
-ammarkpos_function
amoptions_function
amparallelrescan_function
amproperty_function