diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index 2c0e71eedc65..3e00149304f5 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -148,8 +148,7 @@ blhandler(PG_FUNCTION_ARGS) amroutine->amgettuple = NULL; amroutine->amgetbitmap = blgetbitmap; amroutine->amendscan = blendscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; + amroutine->amposreset = NULL; amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index 63d7e376f195..e36519ab776b 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -163,8 +163,7 @@ typedef struct IndexAmRoutine amgettuple_function amgettuple; /* can be NULL */ amgetbitmap_function amgetbitmap; /* can be NULL */ amendscan_function amendscan; - ammarkpos_function ammarkpos; /* can be NULL */ - amrestrpos_function amrestrpos; /* can be NULL */ + amposreset_function amposreset; /* can be NULL */ /* interface functions to support parallel index scans */ amestimateparallelscan_function amestimateparallelscan; /* can be NULL */ @@ -789,32 +788,25 @@ amendscan (IndexScanDesc scan); void -ammarkpos (IndexScanDesc scan); +amposreset (IndexScanDesc scan); - Mark current scan position. The access method need only support one - remembered scan position per scan. - - - - The ammarkpos function need only be provided if the access - method supports ordered scans. If it doesn't, - the ammarkpos field in its IndexAmRoutine - struct may be set to NULL. - - - - -void -amrestrpos (IndexScanDesc scan); - - Restore the scan to the most recently marked position. - - - - The amrestrpos function need only be provided if the access - method supports ordered scans. If it doesn't, - the amrestrpos field in its IndexAmRoutine - struct may be set to NULL. + Notify index AM that core code will change the scan's position to an item + returned as part of an earlier batch. The index AM must therefore + invalidate any state that independently tracks the scan's progress + (e.g., array keys used with a ScalarArrayOpExpr qual). Called by the core + system when it is about to restore a mark. + + + + The amposreset function need only be provided if the access + method supports ordered scans through the amgetbatch + interface. If it doesn't, the amposreset field + in its IndexAmRoutine struct should be set to + NULL. Index AMs that don't have any private state that might need to be + invalidated might still find it useful to provide an empty + amposreset function; if amposreset + is set to NULL, the core system will assume that it is unsafe to restore a + marked position. 
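For reference, a minimal sketch of how a hypothetical amgetbatch-based index AM might satisfy the new contract described above. The names exampleposreset/examplehandler are invented here, and the signature follows the synopsis in this documentation hunk (note that the indexam.c call site later in this patch also passes the marked batch, so the final signature may end up taking a second argument):

/*
 * Illustrative only -- not part of this patch.  An AM with no private state
 * that independently tracks the scan's progress (no array keys, etc.) can
 * supply an empty amposreset; leaving the field NULL would instead tell the
 * core system that restoring a marked position is unsafe.
 */
static void
exampleposreset(IndexScanDesc scan)
{
	/* nothing to invalidate */
}

Datum
examplehandler(PG_FUNCTION_ARGS)
{
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	/* ... ambeginscan, amgetbatch, amfreebatch, amendscan, etc. set as usual ... */
	amroutine->amposreset = exampleposreset;	/* replaces ammarkpos/amrestrpos */

	PG_RETURN_POINTER(amroutine);
}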
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 2f7d1437919d..6472013ae53f 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -296,8 +296,7 @@ brinhandler(PG_FUNCTION_ARGS) amroutine->amgettuple = NULL; amroutine->amgetbitmap = bringetbitmap; amroutine->amendscan = brinendscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; + amroutine->amposreset = NULL; amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 78f7b7a2495c..c9de3d120634 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -84,8 +84,7 @@ ginhandler(PG_FUNCTION_ARGS) amroutine->amgettuple = NULL; amroutine->amgetbitmap = gingetbitmap; amroutine->amendscan = ginendscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; + amroutine->amposreset = NULL; amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 5213cd71e977..d8065442249c 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -105,8 +105,7 @@ gisthandler(PG_FUNCTION_ARGS) amroutine->amgettuple = gistgettuple; amroutine->amgetbitmap = gistgetbitmap; amroutine->amendscan = gistendscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; + amroutine->amposreset = NULL; amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 53061c819fbf..b3d7f825cc47 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -104,8 +104,7 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->amgettuple = hashgettuple; amroutine->amgetbitmap = hashgetbitmap; amroutine->amendscan = hashendscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; + amroutine->amposreset = NULL; amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index bcbac844bb66..6c41b3119ea4 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -84,7 +84,9 @@ heapam_index_fetch_begin(Relation rel) IndexFetchHeapData *hscan = palloc0(sizeof(IndexFetchHeapData)); hscan->xs_base.rel = rel; + hscan->xs_base.rs = NULL; hscan->xs_cbuf = InvalidBuffer; + hscan->xs_blk = InvalidBlockNumber; return &hscan->xs_base; } @@ -94,10 +96,14 @@ heapam_index_fetch_reset(IndexFetchTableData *scan) { IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; + if (scan->rs) + read_stream_reset(scan->rs); + if (BufferIsValid(hscan->xs_cbuf)) { ReleaseBuffer(hscan->xs_cbuf); hscan->xs_cbuf = InvalidBuffer; + hscan->xs_blk = InvalidBlockNumber; } } @@ -108,6 +114,9 @@ heapam_index_fetch_end(IndexFetchTableData *scan) heapam_index_fetch_reset(scan); + if (scan->rs) + read_stream_end(scan->rs); + pfree(hscan); } @@ -124,23 +133,37 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, Assert(TTS_IS_BUFFERTUPLE(slot)); - /* We can skip the buffer-switching logic if we're in mid-HOT chain. 
*/ - if (!*call_again) + /* + * Switch to correct buffer if we don't have it already (we can skip this + * if we're in mid-HOT chain) + */ + if (!*call_again && hscan->xs_blk != ItemPointerGetBlockNumber(tid)) { - /* Switch to correct buffer if we don't have it already */ - Buffer prev_buf = hscan->xs_cbuf; + /* Remember this buffer's block number for next time */ + hscan->xs_blk = ItemPointerGetBlockNumber(tid); - hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf, - hscan->xs_base.rel, - ItemPointerGetBlockNumber(tid)); + if (BufferIsValid(hscan->xs_cbuf)) + ReleaseBuffer(hscan->xs_cbuf); /* - * Prune page, but only if we weren't already on this page + * When using a read stream, the stream will already know which block + * number comes next (though an assertion will verify a match below) */ - if (prev_buf != hscan->xs_cbuf) - heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf); + if (scan->rs) + hscan->xs_cbuf = read_stream_next_buffer(scan->rs, NULL); + else + hscan->xs_cbuf = ReadBuffer(hscan->xs_base.rel, hscan->xs_blk); + + /* + * Prune page when it is pinned for the first time + */ + heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf); } + /* Assert that the TID's block number's buffer is now pinned */ + Assert(BufferIsValid(hscan->xs_cbuf)); + Assert(BufferGetBlockNumber(hscan->xs_cbuf) == hscan->xs_blk); + /* Obtain share-lock on the buffer so we can examine visibility */ LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_SHARE); got_heap_tuple = heap_hot_search_buffer(tid, diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 0cb27af13109..55e60c9ffde2 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -89,6 +89,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_snapshot = InvalidSnapshot; /* caller must initialize this */ scan->numberOfKeys = nkeys; scan->numberOfOrderBys = norderbys; + scan->batchState = NULL; /* used by amgetbatch index AMs */ /* * We allocate key workspace here, but it won't get filled until amrescan. 
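The heapam_index_fetch_* changes above teach the heap fetch path to pull pinned buffers from a ReadStream (scan->rs) when one is attached, instead of calling ReadBuffer directly. As a point of reference, here is a self-contained sketch of the ReadStream contract those changes rely on (DemoStreamState and the demo_* functions are invented for illustration): a callback supplies block numbers ahead of the consumer, buffers come back pinned in exactly that order, and InvalidBlockNumber / InvalidBuffer signal the end of the stream. index_scan_stream_read_next, added to indexam.c later in this patch, plays the callback role, feeding heap block numbers taken from loaded index batches.

#include "postgres.h"

#include "storage/bufmgr.h"
#include "storage/read_stream.h"
#include "utils/rel.h"

typedef struct DemoStreamState
{
	BlockNumber next;			/* next block number to hand to the stream */
	BlockNumber nblocks;		/* one past the last block to read */
} DemoStreamState;

/* Block-number callback: invoked by the stream as it looks ahead */
static BlockNumber
demo_read_next(ReadStream *stream, void *callback_private_data,
			   void *per_buffer_data)
{
	DemoStreamState *state = (DemoStreamState *) callback_private_data;

	if (state->next >= state->nblocks)
		return InvalidBlockNumber;	/* stream is exhausted */

	return state->next++;
}

/* Consumer: receives pinned buffers in the same order the callback chose */
static void
demo_consume(Relation rel)
{
	DemoStreamState state = {0, RelationGetNumberOfBlocks(rel)};
	ReadStream *rs = read_stream_begin_relation(READ_STREAM_DEFAULT, NULL,
												rel, MAIN_FORKNUM,
												demo_read_next, &state, 0);
	Buffer		buf;

	while ((buf = read_stream_next_buffer(rs, NULL)) != InvalidBuffer)
	{
		/* examine BufferGetPage(buf) here, then drop our pin */
		ReleaseBuffer(buf);
	}

	read_stream_end(rs);
}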
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 0492d92d23b1..0e089001cc12 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -44,6 +44,7 @@ #include "postgres.h" #include "access/amapi.h" +#include "access/nbtree.h" /* XXX for MaxTIDsPerBTreePage (should remove) */ #include "access/relation.h" #include "access/reloptions.h" #include "access/relscan.h" @@ -51,9 +52,11 @@ #include "catalog/index.h" #include "catalog/pg_type.h" #include "nodes/execnodes.h" +#include "optimizer/cost.h" #include "pgstat.h" #include "storage/lmgr.h" #include "storage/predicate.h" +#include "utils/memdebug.h" #include "utils/ruleutils.h" #include "utils/snapmgr.h" #include "utils/syscache.h" @@ -107,8 +110,146 @@ do { \ static IndexScanDesc index_beginscan_internal(Relation indexRelation, int nkeys, int norderbys, Snapshot snapshot, ParallelIndexScanDesc pscan, bool temp_snap); +static ItemPointer index_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction); +static ItemPointer index_retail_getnext_tid(IndexScanDesc scan, ScanDirection direction); static inline void validate_relation_kind(Relation r); +/* index batching */ +static void index_batch_init(IndexScanDesc scan); +static void index_batch_reset(IndexScanDesc scan, bool complete); +static void index_batch_end(IndexScanDesc scan); +static bool index_batch_getnext(IndexScanDesc scan, ScanDirection direction); +static void index_batch_free(IndexScanDesc scan, IndexScanBatch batch); + +static BlockNumber index_scan_stream_read_next(ReadStream *stream, + void *callback_private_data, + void *per_buffer_data); + +static pg_attribute_always_inline bool index_batch_pos_advance(IndexScanDesc scan, + IndexScanBatchPos *pos, + ScanDirection direction); +static void index_batch_pos_reset(IndexScanDesc scan, IndexScanBatchPos *pos); +static void index_batch_kill_item(IndexScanDesc scan); + +static void AssertCheckBatchPosValid(IndexScanDesc scan, IndexScanBatchPos *pos); +static void AssertCheckBatch(IndexScanDesc scan, IndexScanBatch batch); +static void AssertCheckBatches(IndexScanDesc scan); + + +/* + * Maximum number of batches (leaf pages) we can keep in memory. + * + * The value 64 value is arbitrary, it's about 1MB of data with 8KB pages + * (512kB for pages, and then a bit of overhead). We should not really need + * this many batches in most cases, though. The read stream looks ahead just + * enough to queue enough IOs, adjusting the distance (TIDs, but ultimately + * the number of future batches) to meet that. + * + * In most cases an index leaf page has many (hundreds) index tuples, and + * it's enough to read one or maybe two leaf pages ahead to satisfy the + * distance. 
+ * + * But there are cases where this may not quite work, for example: + * + * a) bloated index - many pages only have a single index item, so that + * achieving the distance requires too many leaf pages + * + * b) correlated index - duplicate blocks are skipped (the callback does not + * even return those, thanks to currentPrefetchBlock optimization), and are + * mostly ignored in the distance heuristics (read stream does not even see + * those TIDs, and there's no I/O either) + * + * c) index-only scan - the callback skips TIDs from all-visible blocks (not + * reading those is the whole point of index-only scans), and so it's + * invisible to the distance / IO heuristics (similarly to duplicates) + * + * In these cases we might need to read a significant number of batches to + * find the first block to return to the read stream. It's not clear if + * looking this far ahead is worth it - it's a lot of work / synchronous + * I/O, and the query may terminate before reaching those TIDs (e.g. due to + * a LIMIT clause). + * + * Currently, there's no way to "pause" a read stream - stop looking ahead + * for a while, but then resume the work when a batch gets freed. To simulate + * this, the read stream is terminated (as if there were no more data), and + * then reset after draining all the queued blocks in order to resume work. + * This works, but it "stalls" the I/O queue. If it happens very often, it + * can be a serious performance bottleneck. + + * XXX Maybe 64 is too high? It also defines the maximum amount of overhead + * allowed. In the worst case, reading a single row might trigger reading this + * many leaf pages (e.g. with IOS, if most pages are all-visible). Which might + * be an issue with LIMIT queries, when we actually won't get that far. + */ +#define INDEX_SCAN_MAX_BATCHES 64 + +/* + * Thresholds controlling when we cancel use of a read stream to do + * prefetching + */ +#define INDEX_SCAN_MIN_DISTANCE_NBATCHES 20 +#define INDEX_SCAN_MIN_TUPLE_DISTANCE 7 + +#define INDEX_SCAN_BATCH_COUNT(scan) \ + ((scan)->batchState->nextBatch - (scan)->batchState->headBatch) + +/* Did we already load batch with the requested index? */ +/* XXX shouldn't this also compare headBatch? maybe the batch was freed? */ +#define INDEX_SCAN_BATCH_LOADED(scan, idx) \ + ((idx) < (scan)->batchState->nextBatch) + +/* Have we loaded the maximum number of batches? */ +#define INDEX_SCAN_BATCH_FULL(scan) \ + (INDEX_SCAN_BATCH_COUNT(scan) == scan->batchState->maxBatches) + +/* Return batch for the provided index. */ +/* XXX Should this have an assert to enforce the batch is loaded? Maybe the + * index is too far back, but there happens to be a batch in the right slot? + * Could easily happen if we have to keep many batches around. + */ +#define INDEX_SCAN_BATCH(scan, idx) \ + ((scan)->batchState->batches[(idx) % INDEX_SCAN_MAX_BATCHES]) + +/* Is the position invalid/undefined? */ +#define INDEX_SCAN_POS_INVALID(pos) \ + (((pos)->batch == -1) && ((pos)->index == -1)) + +#ifdef INDEXAM_DEBUG +#define DEBUG_LOG(...) elog(AmRegularBackendProcess() ? NOTICE : DEBUG2, __VA_ARGS__) +#else +#define DEBUG_LOG(...) 
+#endif + +/* debug: print info about current batches */ +static void +index_batch_print(const char *label, IndexScanDesc scan) +{ +#ifdef INDEXAM_DEBUG + IndexScanBatchState *batches = scan->batchState; + + if (!scan->batchState) + return; + + if (!AmRegularBackendProcess()) + return; + if (IsCatalogRelation(scan->indexRelation)) + return; + + DEBUG_LOG("%s: batches headBatch %d nextBatch %d maxBatches %d", + label, + batches->headBatch, batches->nextBatch, batches->maxBatches); + + for (int i = batches->headBatch; i < batches->nextBatch; i++) + { + IndexScanBatchData *batch = INDEX_SCAN_BATCH(scan, i); + BTScanPos pos = (BTScanPos) batch->pos; + + DEBUG_LOG(" batch %d currPage %u %p firstItem %d lastItem %d killed %d", + i, pos->currPage, batch, batch->firstItem, batch->lastItem, + batch->numKilled); + } +#endif +} /* ---------------------------------------------------------------- * index_ interface functions @@ -283,6 +424,9 @@ index_beginscan(Relation heapRelation, scan->xs_snapshot = snapshot; scan->instrument = instrument; + if (indexRelation->rd_indam->amgetbatch != NULL) + index_batch_init(scan); + /* prepare to fetch index matches from table */ scan->xs_heapfetch = table_index_fetch_begin(heapRelation); @@ -380,6 +524,12 @@ index_rescan(IndexScanDesc scan, scan->kill_prior_tuple = false; /* for safety */ scan->xs_heap_continue = false; + /* + * Reset the batching. This makes it look like there are no batches, + * discards reads already scheduled within the read stream, etc. + */ + index_batch_reset(scan, true); + scan->indexRelation->rd_indam->amrescan(scan, keys, nkeys, orderbys, norderbys); } @@ -394,6 +544,9 @@ index_endscan(IndexScanDesc scan) SCAN_CHECKS; CHECK_SCAN_PROCEDURE(amendscan); + /* Cleanup batching, so that the AM can release pins and so on. */ + index_batch_end(scan); + /* Release resources (like buffer pins) from table accesses */ if (scan->xs_heapfetch) { @@ -421,10 +574,42 @@ index_endscan(IndexScanDesc scan) void index_markpos(IndexScanDesc scan) { + IndexScanBatchState *batchState = scan->batchState; + IndexScanBatchPos *markPos = &batchState->markPos; + IndexScanBatchData *markBatch = batchState->markBatch; + SCAN_CHECKS; - CHECK_SCAN_PROCEDURE(ammarkpos); - scan->indexRelation->rd_indam->ammarkpos(scan); + /* + * FIXME this should probably check there actually is a batch state. For + * now it works the only AM with mark/restore support is btree, and that + * has batching. But we should not rely on that, right? + */ + + /* + * Free the previous mark batch (if any), but only if the batch is no + * longer valid (in the current head/next range). This means that if we're + * marking the same batch (different item), we don't really do anything. + * + * XXX Should have some macro for this check, I guess. + */ + if (markBatch != NULL && (markPos->batch < batchState->headBatch || + markPos->batch >= batchState->nextBatch)) + { + batchState->markBatch = NULL; + index_batch_free(scan, markBatch); + } + + /* just copy the read position (which has to be valid) */ + batchState->markPos = batchState->readPos; + batchState->markBatch = INDEX_SCAN_BATCH(scan, batchState->markPos.batch); + + /* + * FIXME we need to make sure the batch does not get freed during the + * regular advances. 
+ */ + + AssertCheckBatchPosValid(scan, &batchState->markPos); } /* ---------------- @@ -445,19 +630,60 @@ index_markpos(IndexScanDesc scan) void index_restrpos(IndexScanDesc scan) { + IndexScanBatchState *batchState; + IndexScanBatchPos *markPos; + IndexScanBatchData *markBatch; + Assert(IsMVCCSnapshot(scan->xs_snapshot)); SCAN_CHECKS; - CHECK_SCAN_PROCEDURE(amrestrpos); + CHECK_SCAN_PROCEDURE(amgetbatch); + CHECK_SCAN_PROCEDURE(amposreset); - /* release resources (like buffer pins) from table accesses */ + /* + * release resources (like buffer pins) from table accesses + * + * XXX: Currently, the distance is always remembered across any + * read_stream_reset calls (to work around the scan->batchState->reset + * behavior of resetting the stream to deal with running out of batches). + * We probably _should_ be forgetting the distance when we reset the + * stream here (through our table_index_fetch_reset call), though. + */ if (scan->xs_heapfetch) table_index_fetch_reset(scan->xs_heapfetch); scan->kill_prior_tuple = false; /* for safety */ scan->xs_heap_continue = false; - scan->indexRelation->rd_indam->amrestrpos(scan); + /* + * FIXME this should probably check there actually is a batch state. For + * now it works the only AM with mark/restore support is btree, and that + * has batching. But we should not rely on that, right? + */ + + batchState = scan->batchState; + markPos = &batchState->markPos; + markBatch = scan->batchState->markBatch; + + /* + * Call amposreset to let index AM know to invalidate any private state + * that independently tracks the scan's progress + */ + scan->indexRelation->rd_indam->amposreset(scan, markBatch); + + /* + * Reset the batching state, except for the marked batch, and make it look + * like we have a single batch -- the marked one. + */ + index_batch_reset(scan, false); + + batchState->markPos = *markPos; + batchState->readPos = *markPos; + batchState->headBatch = markPos->batch; + batchState->nextBatch = (batchState->headBatch + 1); + + INDEX_SCAN_BATCH(scan, batchState->markPos.batch) = markBatch; + batchState->markBatch = markBatch; /* also remember this */ } /* @@ -579,6 +805,17 @@ index_parallelrescan(IndexScanDesc scan) if (scan->xs_heapfetch) table_index_fetch_reset(scan->xs_heapfetch); + /* + * Reset the batching. This makes it look like there are no batches, + * discards reads already scheduled to the read stream, etc. + * + * XXX We do this before calling amparallelrescan, so that it could + * reinitialize everything (this probably does not matter very much, now + * that we've moved all the batching logic to indexam.c, it was more + * important when the index AM was responsible for more of it). 
+ */ + index_batch_reset(scan, true); + /* amparallelrescan is optional; assume no-op if not provided by AM */ if (scan->indexRelation->rd_indam->amparallelrescan != NULL) scan->indexRelation->rd_indam->amparallelrescan(scan); @@ -614,6 +851,9 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, scan->xs_snapshot = snapshot; scan->instrument = instrument; + if (indexrel->rd_indam->amgetbatch != NULL) + index_batch_init(scan); + /* prepare to fetch index matches from table */ scan->xs_heapfetch = table_index_fetch_begin(heaprel); @@ -630,14 +870,286 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, ItemPointer index_getnext_tid(IndexScanDesc scan, ScanDirection direction) { - bool found; - SCAN_CHECKS; - CHECK_SCAN_PROCEDURE(amgettuple); /* XXX: we should assert that a snapshot is pushed or registered */ Assert(TransactionIdIsValid(RecentXmin)); + /* + * Index AMs that support plain index scans must provide exactly one of + * either the amgetbatch or amgettuple callbacks + */ + Assert(!(scan->indexRelation->rd_indam->amgettuple != NULL && + scan->indexRelation->rd_indam->amgetbatch != NULL)); + + if (scan->batchState != NULL) + return index_batch_getnext_tid(scan, direction); + else + return index_retail_getnext_tid(scan, direction); +} + +/* ---------------- + * index_getnext_batch_tid - ambatch index_getnext_tid implementation + * + * If we advance to the next batch, we release the previous one (unless it's + * tracked for mark/restore). + * + * If the scan direction changes, we release all batches except the current + * one (per readPos), to make it look it's the only batch we loaded. + * + * Returns the first/next TID, or NULL if no more items. + * + * FIXME This only sets xs_heaptid and xs_itup (if requested). Not sure if + * we need to do something with xs_hitup. Should this set xs_hitup? + * + * XXX Maybe if we advance the position to the next batch, we could keep the + * batch for a bit more, in case the scan direction changes (as long as it + * fits into maxBatches)? But maybe that's unnecessary complexity for too + * little gain, we'd need to be careful about releasing the batches lazily. + * ---------------- + */ +static ItemPointer +index_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction) +{ + IndexScanBatchState *batchState = scan->batchState; + IndexScanBatchPos *readPos; + + CHECK_SCAN_PROCEDURE(amgetbatch); + + /* shouldn't get here without batching */ + AssertCheckBatches(scan); + + /* Initialize direction on first call */ + if (batchState->direction == NoMovementScanDirection) + batchState->direction = direction; + + /* + * Handle cancelling the use of the read stream for prefetching + */ + else if (unlikely(batchState->disabled && scan->xs_heapfetch->rs)) + { + index_batch_pos_reset(scan, &batchState->streamPos); + + read_stream_reset(scan->xs_heapfetch->rs); + scan->xs_heapfetch->rs = NULL; + } + + /* + * Handle change of scan direction (reset stream, ...). + * + * Release future batches properly, to make it look like the current batch + * is the only one we loaded. Also reset the stream position, as if we are + * just starting the scan. 
+ */ + else if (unlikely(batchState->direction != direction)) + { + /* release "future" batches in the wrong direction */ + while (batchState->nextBatch > batchState->headBatch + 1) + { + IndexScanBatch fbatch; + + batchState->nextBatch--; + fbatch = INDEX_SCAN_BATCH(scan, batchState->nextBatch); + index_batch_free(scan, fbatch); + } + + /* + * Remember the new direction, and make sure the scan is not marked as + * "finished" (we might have already read the last batch, but now we + * need to start over). Do this before resetting the stream - it + * should not invoke the callback until the first read, but it may + * seem a bit confusing otherwise. + */ + batchState->direction = direction; + batchState->finished = false; + batchState->currentPrefetchBlock = InvalidBlockNumber; + + index_batch_pos_reset(scan, &batchState->streamPos); + if (scan->xs_heapfetch->rs) + read_stream_reset(scan->xs_heapfetch->rs); + } + + /* shortcut for the read position, for convenience */ + readPos = &batchState->readPos; + + DEBUG_LOG("index_batch_getnext_tid readPos %d %d direction %d", + readPos->batch, readPos->index, direction); + + /* + * Try advancing the batch position. If that doesn't succeed, it means we + * don't have more items in the current batch, and there's no future batch + * loaded. So try loading another batch, and maybe retry. + * + * FIXME This loop shouldn't happen more than twice. Maybe we should have + * some protection against infinite loops? To detect cases when the + * advance/getnext functions get to disagree? + */ + while (true) + { + /* + * If we manage to advance to the next items, return it and we're + * done. Otherwise try loading another batch. + */ + if (index_batch_pos_advance(scan, readPos, direction)) + { + IndexScanBatchData *readBatch = INDEX_SCAN_BATCH(scan, readPos->batch); + + /* set the TID / itup for the scan */ + scan->xs_heaptid = readBatch->items[readPos->index].heapTid; + if (scan->xs_want_itup) + scan->xs_itup = + (IndexTuple) (readBatch->currTuples + + readBatch->items[readPos->index].tupleOffset); + + DEBUG_LOG("readBatch %p firstItem %d lastItem %d readPos %d/%d TID (%u,%u)", + readBatch, readBatch->firstItem, readBatch->lastItem, + readPos->batch, readPos->index, + ItemPointerGetBlockNumber(&scan->xs_heaptid), + ItemPointerGetOffsetNumber(&scan->xs_heaptid)); + + /* + * If we advanced to the next batch, release the batch we no + * longer need. The positions is the "read" position, and we can + * compare it to headBatch. + */ + if (unlikely(readPos->batch != batchState->headBatch)) + { + IndexScanBatchData *headBatch = INDEX_SCAN_BATCH(scan, + batchState->headBatch); + + /* + * XXX When advancing readPos, the streamPos may get behind as + * we're only advancing it when actually requesting heap + * blocks. But we may not do that often enough - e.g. IOS may + * not need to access all-visible heap blocks, so the + * read_next callback does not get invoked for a long time. + * It's possible the stream gets so mucu behind the position + * gets invalid, as we already removed the batch. But that + * means we don't need any heap blocks until the current read + * position - if we did, we would not be in this situation (or + * it's a sign of a bug, as those two places are expected to + * be in sync). So if the streamPos still points at the batch + * we're about to free, just reset the position - we'll set it + * to readPos in the read_next callback later. + * + * XXX This can happen after the queue gets full, we "pause" + * the stream, and then reset it to continue. 
But I think that + * just increases the probability of hitting the issue, it's + * just more chance to to not advance the streamPos, which + * depends on when we try to fetch the first heap block after + * calling read_stream_reset(). + * + * FIXME Simplify/clarify/shorten this comment. Can it + * actually happen, if we never pull from the stream in IOS? + * We probably don't look ahead for the first call. + */ + if (unlikely(batchState->streamPos.batch == batchState->headBatch)) + { + DEBUG_LOG("index_batch_pos_reset called early (streamPos.batch == headBatch)"); + index_batch_pos_reset(scan, &batchState->streamPos); + } + + DEBUG_LOG("index_batch_getnext_tid free headBatch %p headBatch %d nextBatch %d", + headBatch, batchState->headBatch, batchState->nextBatch); + + /* Free the head batch (except when it's markBatch) */ + index_batch_free(scan, headBatch); + + /* + * In any case, remove the batch from the regular queue, even + * if we kept it for mark/restore. + */ + batchState->headBatch++; + + DEBUG_LOG("index_batch_getnext_tid batch freed headBatch %d nextBatch %d", + batchState->headBatch, batchState->nextBatch); + + index_batch_print("index_batch_getnext_tid / free old batch", scan); + + /* we can't skip any batches */ + Assert(batchState->headBatch == readPos->batch); + } + + pgstat_count_index_tuples(scan->indexRelation, 1); + return &scan->xs_heaptid; + } + + /* + * We failed to advance, i.e. we ran out of currently loaded batches. + * So if we filled the queue, this is a good time to reset the stream + * (before we try loading the next batch). + */ + if (unlikely(batchState->reset)) + { + DEBUG_LOG("resetting read stream readPos %d,%d", + readPos->batch, readPos->index); + + batchState->reset = false; + batchState->currentPrefetchBlock = InvalidBlockNumber; + + /* + * Need to reset the stream position, it might be too far behind. + * Ultimately we want to set it to readPos, but we can't do that + * yet - readPos still point sat the old batch, so just reset it + * and we'll init it to readPos later in the callback. + */ + index_batch_pos_reset(scan, &batchState->streamPos); + + if (scan->xs_heapfetch->rs) + read_stream_reset(scan->xs_heapfetch->rs); + } + + /* + * Failed to advance the read position, so try reading the next batch. + * If this fails, we're done - there's nothing more to load. + * + * Most of the batches should be loaded from read_stream_next_buffer, + * but we need to call index_batch_getnext here too, for two reasons. + * First, the read_stream only gets working after we try fetching the + * first heap tuple, so we need to load the initial batch (the head). + * Second, while most batches will be preloaded by the stream thanks + * to prefetching, it's possible to set effective_io_concurrency=0, + * and in that case all the batches get loaded from here. + */ + if (!index_batch_getnext(scan, direction)) + break; + + DEBUG_LOG("loaded next batch, retry to advance position"); + } + + /* + * If we get here, we failed to advance the position and there are no more + * batches, so we're done. + */ + DEBUG_LOG("no more batches to process"); + + /* + * Reset the position - we must not keep the last valid position, in case + * we change direction of the scan and start scanning again. If we kept + * the position, we'd skip the first item. + * + * XXX This is a bit strange. Do we really need to reset the position + * after returning the last item? I wonder if it means the API is not + * quite right. 
+ */ + index_batch_pos_reset(scan, readPos); + + return NULL; +} + +/* ---------------- + * index_retail_getnext_tid - amgettuple index_getnext_tid implementation + * + * Returns the first/next TID, or NULL if no more items. + * ---------------- + */ +static ItemPointer +index_retail_getnext_tid(IndexScanDesc scan, ScanDirection direction) +{ + bool found; + + CHECK_SCAN_PROCEDURE(amgettuple); + /* * The AM's amgettuple proc finds the next index entry matching the scan * keys, and puts the TID into scan->xs_heaptid. It should also set @@ -704,9 +1216,18 @@ index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot) * amgettuple call, in index_getnext_tid). We do not do this when in * recovery because it may violate MVCC to do so. See comments in * RelationGetIndexScan(). + * + * XXX For scans using batching, record the flag in the batch (we will + * pass it to the AM later, when freeing it). Otherwise just pass it to + * the AM using the kill_prior_tuple field. */ if (!scan->xactStartedInRecovery) - scan->kill_prior_tuple = all_dead; + { + if (scan->batchState == NULL) + scan->kill_prior_tuple = all_dead; + else if (all_dead) + index_batch_kill_item(scan); + } return found; } @@ -1089,3 +1610,934 @@ index_opclass_options(Relation indrel, AttrNumber attnum, Datum attoptions, return build_local_reloptions(&relopts, attoptions, validate); } + +/* + * Check that a position (batch,item) is valid with respect to the batches we + * have currently loaded. + * + * XXX The "marked" batch is an exception. The marked batch may get outside + * the range of current batches, so make sure to never check the position + * for that. + */ +static void +AssertCheckBatchPosValid(IndexScanDesc scan, IndexScanBatchPos *pos) +{ +#ifdef USE_ASSERT_CHECKING + IndexScanBatchState *batchState = scan->batchState; + + /* make sure the position is valid for currently loaded batches */ + Assert(pos->batch >= batchState->headBatch); + Assert(pos->batch < batchState->nextBatch); +#endif +} + +/* + * Check a single batch is valid. + */ +static void +AssertCheckBatch(IndexScanDesc scan, IndexScanBatch batch) +{ +#ifdef USE_ASSERT_CHECKING + /* there must be valid range of items */ + Assert(batch->firstItem <= batch->lastItem); + Assert(batch->firstItem >= 0); + + /* we should have items (buffer and pointers) */ + Assert(batch->items != NULL); + + /* + * The number of killed items must be valid, and there must be an array of + * indexes if there are items. + */ + Assert(batch->numKilled >= 0); + Assert(!(batch->numKilled > 0 && batch->killedItems == NULL)); + + /* XXX can we check some of the other batch fields? */ +#endif +} + +/* + * Check invariants on current batches + * + * Makes sure the indexes are set as expected, the buffer size is within + * limits, and so on. + */ +static void +AssertCheckBatches(IndexScanDesc scan) +{ +#ifdef USE_ASSERT_CHECKING + IndexScanBatchState *batchState = scan->batchState; + + /* we should have batches initialized */ + Assert(batchState != NULL); + + /* We should not have too many batches. */ + Assert(batchState->maxBatches > 0 && + batchState->maxBatches <= INDEX_SCAN_MAX_BATCHES); + + /* + * The head/next indexes should define a valid range (in the cyclic + * buffer, and should not overflow maxBatches. 
+ */ + Assert(batchState->headBatch >= 0 && + batchState->headBatch <= batchState->nextBatch); + Assert(batchState->nextBatch - batchState->headBatch <= + batchState->maxBatches); + + /* Check all current batches */ + for (int i = batchState->headBatch; i < batchState->nextBatch; i++) + { + IndexScanBatch batch = INDEX_SCAN_BATCH(scan, i); + + AssertCheckBatch(scan, batch); + } +#endif +} + +/* + * index_batch_pos_advance + * Advance the position to the next item, depending on scan direction. + * + * Advance the position to the next item, either in the same batch or the + * following one (if already available). + * + * We can advance only if we already have some batches loaded, and there's + * either enough items in the current batch, or some more items in the + * subsequent batches. + * + * If this is the first advance (right after loading the initial/head batch), + * position is still undefined. Otherwise we expect the position to be valid. + * + * Returns true if the position was advanced, false otherwise. + * + * The position is guaranteed to be valid only after a successful advance. + * If an advance fails (false returned), the position can be invalid. + * + * XXX This seems like a good place to enforce some "invariants", e.g. + * that the positions are always valid. We should never get here with + * invalid position (so probably should be initialized as part of loading the + * initial/head batch), and then invalidated if advance fails. Could be tricky + * for the stream position, though, because it can get "lag" for IOS etc. + */ +static pg_attribute_always_inline bool +index_batch_pos_advance(IndexScanDesc scan, IndexScanBatchPos *pos, + ScanDirection direction) +{ + IndexScanBatchData *batch; + + /* make sure we have batching initialized and consistent */ + AssertCheckBatches(scan); + + /* should know direction by now */ + Assert(direction == scan->batchState->direction); + Assert(direction != NoMovementScanDirection); + + /* We can't advance if there are no batches available. */ + if (INDEX_SCAN_BATCH_COUNT(scan) == 0) + return false; + + /* + * If the position has not been advanced yet, it has to be right after we + * loaded the initial batch (must be the head batch). In that case just + * initialize it to the batch's first item (or its last item, when + * scanning backwards). + * + * XXX Maybe we should just explicitly initialize the postition after + * loading the initial batch, without having to go through the advance. + */ + if (INDEX_SCAN_POS_INVALID(pos)) + { + /* + * We should have loaded the scan's initial batch, or maybe we have + * changed the direction of the scan after scanning all the way to the + * end (in which case the position is invalid, and we make it look + * like there is just one batch). We should have just one batch, + * though. + * + * XXX Actually, could there be more batches? Maybe we prefetched more + * batches right away? It doesn't seem to be a substantial invariant. 
+ */ + Assert(INDEX_SCAN_BATCH_COUNT(scan) == 1); + + /* + * Get the initial batch (which must be the head), and initialize the + * position to the appropriate item for the current scan direction + */ + batch = INDEX_SCAN_BATCH(scan, scan->batchState->headBatch); + + pos->batch = scan->batchState->headBatch; + + if (ScanDirectionIsForward(direction)) + pos->index = batch->firstItem; + else + pos->index = batch->lastItem; + + AssertCheckBatchPosValid(scan, pos); + + return true; + } + + /* + * The position is already defined, so we should have some batches loaded + * and the position has to be valid with respect to those. + */ + AssertCheckBatchPosValid(scan, pos); + + /* + * Advance to the next item in the same batch, if there are more items. If + * we're at the last item, we'll try advancing to the next batch later. + */ + batch = INDEX_SCAN_BATCH(scan, pos->batch); + + if (ScanDirectionIsForward(direction)) + { + if (++pos->index <= batch->lastItem) + { + AssertCheckBatchPosValid(scan, pos); + + return true; + } + } + else /* ScanDirectionIsBackward */ + { + if (--pos->index >= batch->firstItem) + { + AssertCheckBatchPosValid(scan, pos); + + return true; + } + } + + /* + * We couldn't advance within the same batch, try advancing to the next + * batch, if it's already loaded. + */ + if (INDEX_SCAN_BATCH_LOADED(scan, pos->batch + 1)) + { + /* advance to the next batch */ + pos->batch++; + + batch = INDEX_SCAN_BATCH(scan, pos->batch); + Assert(batch != NULL); + + if (ScanDirectionIsForward(direction)) + pos->index = batch->firstItem; + else + pos->index = batch->lastItem; + + AssertCheckBatchPosValid(scan, pos); + + return true; + } + + /* can't advance */ + return false; +} + +/* + * index_batch_pos_reset + * Reset the position, so that it looks as if never advanced. + */ +static void +index_batch_pos_reset(IndexScanDesc scan, IndexScanBatchPos *pos) +{ + pos->batch = -1; + pos->index = -1; +} + +/* + * index_scan_stream_read_next + * return the next block to pass to the read stream + * + * This assumes the "current" scan direction, requested by the caller. + * + * If the direction changes before consuming all blocks, we'll reset the stream + * and start from scratch. The scan direction change is handled elsewhere. Here + * we rely on having the correct value in batchState->direction. + * + * The position of the read_stream is stored in streamPos, which may be ahead of + * the current readPos (which is what got consumed by the scan). + * + * The streamPos can however also get behind readPos too, when some blocks are + * skipped and not returned to the read_stream. An example is an index scan on + * a correlated index, with many duplicate blocks are skipped, or an IOS where + * all-visible blocks are skipped. + * + * The initial batch is always loaded from index_batch_getnext_tid(). We don't + * get here until the first read_stream_next_buffer() call, when pulling the + * first heap tuple from the stream. After that, most batches should be loaded + * by this callback, driven by the read_stream look-ahead distance. However, + * with disabled prefetching (that is, with effective_io_concurrency=0), all + * batches will be loaded in index_batch_getnext_tid. + * + * It's possible we got here only fairly late in the scan, e.g. if many tuples + * got skipped in the index-only scan, etc. In this case just use the read + * position as a streamPos starting point. + * + * XXX It seems the readPos/streamPos comments should be placed elsewhere. 
The + * read_stream callback does not seem like the right place. + */ +static BlockNumber +index_scan_stream_read_next(ReadStream *stream, + void *callback_private_data, + void *per_buffer_data) +{ + IndexScanDesc scan = (IndexScanDesc) callback_private_data; + IndexScanBatchState *batchState = scan->batchState; + IndexScanBatchPos *streamPos = &batchState->streamPos; + ScanDirection direction = batchState->direction; + + /* By now we should know the direction of the scan. */ + Assert(direction != NoMovementScanDirection); + + /* + * The read position (readPos) has to be valid. + * + * We initialize/advance it before even attempting to read the heap tuple, + * and it gets invalidated when we reach the end of the scan (but then we + * don't invoke the callback again). + * + * XXX This applies to the readPos. We'll use streamPos to determine which + * blocks to pass to the stream, and readPos may be used to initialize it. + */ + AssertCheckBatchPosValid(scan, &batchState->readPos); + + /* + * Try to advance the streamPos to the next item, and if that doesn't + * succeed (if there are no more items in loaded batches), try loading the + * next one. + * + * FIXME Unlike index_batch_getnext_tid, this can loop more than twice. If + * many blocks get skipped due to currentPrefetchBlock or all-visibility + * (per the "prefetch" callback), we get to load additional batches. In + * the worst case we hit the INDEX_SCAN_MAX_BATCHES limit and have to + * "pause" the stream. + */ + while (true) + { + bool advanced = false; + + /* + * If the stream position has not been initialized yet, set it to the + * current read position. This is the item the caller is tring to + * read, so it's what we should return to the stream. + */ + if (INDEX_SCAN_POS_INVALID(streamPos)) + { + *streamPos = batchState->readPos; + advanced = true; + } + else if (index_batch_pos_advance(scan, streamPos, direction)) + { + advanced = true; + } + + /* + * FIXME Maybe check the streamPos is not behind readPos? + * + * FIXME Actually, could streamPos get stale/lagging behind readPos, + * and if yes how much. Could it get so far behind to not be valid, + * pointing at a freed batch? In that case we can't even advance it, + * and we should just initialize it to readPos. We might do that + * anyway, I guess, just to save on "pointless" advances (it must + * agree with readPos, we can't allow "retroactively" changing the + * block sequence). + */ + + /* + * If we advanced the position, either return the block for the TID, + * or skip it (and then try advancing again). + * + * The block may be "skipped" for two reasons. First, the caller may + * define a "prefetch" callback that tells us to skip items (IOS does + * this to skip all-visible pages). Second, currentPrefetchBlock is + * used to skip duplicate block numbers (a sequence of TIDS for the + * same block). + */ + if (advanced) + { + IndexScanBatch streamBatch = INDEX_SCAN_BATCH(scan, streamPos->batch); + ItemPointer tid = &streamBatch->items[streamPos->index].heapTid; + + DEBUG_LOG("index_scan_stream_read_next: index %d TID (%u,%u)", + streamPos->index, + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + + /* + * If there's a prefetch callback, use it to decide if we need to + * read the next block. + * + * We need to do this before checking currentPrefetchBlock; it's + * essential that the VM cache used by index-only scans is + * intialized here. 
+ */ + if (batchState->prefetch && + !batchState->prefetch(scan, batchState->prefetchArg, streamPos)) + { + DEBUG_LOG("index_scan_stream_read_next: skip block (callback)"); + continue; + } + + /* same block as before, don't need to read it */ + if (batchState->currentPrefetchBlock == ItemPointerGetBlockNumber(tid)) + { + DEBUG_LOG("index_scan_stream_read_next: skip block (currentPrefetchBlock)"); + continue; + } + + batchState->currentPrefetchBlock = ItemPointerGetBlockNumber(tid); + + return batchState->currentPrefetchBlock; + } + + /* + * Couldn't advance the position, no more items in the loaded batches. + * Try loading the next batch - if that succeeds, try advancing again + * (this time the advance should work, but we may skip all the items). + * + * If we fail to load the next batch, we're done. + */ + if (!index_batch_getnext(scan, direction)) + break; + + /* + * Consider disabling prefetching when we can't keep a sufficiently + * large "index tuple distance" between readPos and streamPos. + * + * Only consider doing this when we're not on the scan's initial + * batch, when readPos and streamPos share the same batch. + */ + if (!batchState->finished && !batchState->prefetchingLockedIn) + { + int indexdiff; + + if (streamPos->batch <= INDEX_SCAN_MIN_DISTANCE_NBATCHES) + { + /* Too early to check if prefetching should be disabled */ + } + else if (batchState->readPos.batch == streamPos->batch) + { + IndexScanBatchPos *readPos = &batchState->readPos; + + if (ScanDirectionIsForward(direction)) + indexdiff = streamPos->index - readPos->index; + else + { + IndexScanBatch readBatch = + INDEX_SCAN_BATCH(scan, readPos->batch); + + indexdiff = (readPos->index - readBatch->firstItem) - + (streamPos->index - readBatch->firstItem); + } + + if (indexdiff < INDEX_SCAN_MIN_TUPLE_DISTANCE) + { + batchState->disabled = true; + return InvalidBlockNumber; + } + else + { + batchState->prefetchingLockedIn = true; + } + } + else + batchState->prefetchingLockedIn = true; + } + } + + /* no more items in this scan */ + return InvalidBlockNumber; +} + +/* ---------------- + * index_batch_getnext - get the next batch of TIDs from a scan + * + * Returns true if we managed to read a batch of TIDs, or false if there are no + * more TIDs in the scan. The load may also return false if we used the maximum + * number of batches (INDEX_SCAN_MAX_BATCHES), in which case we'll reset the + * stream and continue the scan later. + * + * Returns true if the batch was loaded successfully, false otherwise. + * + * This only loads the TIDs and resets the various batch fields to fresh + * state. It does not set xs_heaptid/xs_itup/xs_hitup, that's the + * responsibility of the following index_batch_getnext_tid() calls. + * ---------------- + */ +static bool +index_batch_getnext(IndexScanDesc scan, ScanDirection direction) +{ + IndexScanBatchState *batchState = scan->batchState; + IndexScanBatch priorbatch = NULL, + batch = NULL; + + SCAN_CHECKS; + CHECK_SCAN_PROCEDURE(amgetbatch); + + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(TransactionIdIsValid(RecentXmin)); + + /* Did we already read the last batch for this scan? */ + if (batchState->finished) + return false; + + /* + * If we already used the maximum number of batch slots available, it's + * pointless to try loading another one. This can happen for various + * reasons, e.g. for index-only scans on all-visible table, or skipping + * duplicate blocks on perfectly correlated indexes, etc. 
+ * + * We could enlarge the array to allow more batches, but that's futile, we + * can always construct a case using more memory. Not only it would risk + * OOM, it'd also be inefficient because this happens early in the scan + * (so it'd interfere with LIMIT queries). + */ + if (INDEX_SCAN_BATCH_FULL(scan)) + { + DEBUG_LOG("index_batch_getnext: ran out of space for batches"); + scan->batchState->reset = true; + return false; + } + + index_batch_print("index_batch_getnext / start", scan); + + /* + * Check if there's an existing batch that amgetbatch has to pick things + * up from + */ + if (batchState->headBatch < batchState->nextBatch) + priorbatch = INDEX_SCAN_BATCH(scan, batchState->nextBatch - 1); + + batch = scan->indexRelation->rd_indam->amgetbatch(scan, priorbatch, + direction); + if (batch != NULL) + { + /* + * We got the batch from the AM, but we need to add it to the queue. + * Maybe that should be part of the "batch allocation" that happens in + * the AM? + */ + int batchIndex = batchState->nextBatch; + + INDEX_SCAN_BATCH(scan, batchIndex) = batch; + + batchState->nextBatch++; + + DEBUG_LOG("index_batch_getnext headBatch %d nextBatch %d batch %p", + batchState->headBatch, batchState->nextBatch, batch); + + /* Delay initializing stream until reading from scan's second batch */ + if (priorbatch && !scan->xs_heapfetch->rs && !batchState->disabled && + enable_indexscan_prefetch) + scan->xs_heapfetch->rs = + read_stream_begin_relation(READ_STREAM_DEFAULT, NULL, + scan->heapRelation, MAIN_FORKNUM, + index_scan_stream_read_next, scan, 0); + } + else + batchState->finished = true; + + AssertCheckBatches(scan); + + index_batch_print("index_batch_getnext / end", scan); + + return (batch != NULL); +} + +/* + * index_batch_init + * Initialize various fields / arrays needed by batching. + * + * FIXME This is a bit ad-hoc hodge podge, due to how I was adding more and + * more pieces. Some of the fields may be not quite necessary, needs cleanup. + */ +static void +index_batch_init(IndexScanDesc scan) +{ + /* init batching info */ + Assert(scan->indexRelation->rd_indam->amgetbatch != NULL); + Assert(scan->indexRelation->rd_indam->amfreebatch != NULL); + + scan->batchState = palloc(sizeof(IndexScanBatchState)); + + /* + * Initialize the batch. + * + * We prefer to eagerly drop leaf page pins before amgetbatch returns. + * This avoids making VACUUM wait to acquire a cleanup lock on the page. + * + * We cannot safely drop leaf page pins during index-only scans due to a + * race condition involving VACUUM setting pages all-visible in the VM. + * It's also unsafe for plain index scans that use a non-MVCC snapshot. + * + * When we drop pins eagerly, the mechanism that marks index tuples as + * LP_DEAD has to deal with concurrent TID recycling races. The scheme + * used to detect unsafe TID recycling won't work when scanning unlogged + * relations (since it involves saving an affected page's LSN). Opt out + * of eager pin dropping during unlogged relation scans for now. 
+ */ + scan->batchState->dropPin = + (!scan->xs_want_itup && IsMVCCSnapshot(scan->xs_snapshot) && + RelationNeedsWAL(scan->indexRelation)); + scan->batchState->finished = false; + scan->batchState->reset = false; + scan->batchState->prefetchingLockedIn = false; + scan->batchState->disabled = false; + scan->batchState->currentPrefetchBlock = InvalidBlockNumber; + scan->batchState->direction = NoMovementScanDirection; + /* positions in the queue of batches */ + index_batch_pos_reset(scan, &scan->batchState->readPos); + index_batch_pos_reset(scan, &scan->batchState->streamPos); + index_batch_pos_reset(scan, &scan->batchState->markPos); + + scan->batchState->markBatch = NULL; + scan->batchState->maxBatches = INDEX_SCAN_MAX_BATCHES; + scan->batchState->headBatch = 0; /* initial head batch */ + scan->batchState->nextBatch = 0; /* initial batch starts empty */ + + /* XXX init the cache of batches, capacity 16 is arbitrary */ + scan->batchState->batchesCacheSize = 16; + scan->batchState->batchesCache = NULL; + + scan->batchState->batches = + palloc(sizeof(IndexScanBatchData *) * scan->batchState->maxBatches); + + scan->batchState->prefetch = NULL; + scan->batchState->prefetchArg = NULL; +} + +/* + * index_batch_reset + * Reset the batch before reading the next chunk of data. + * + * complete - true means we reset even marked batch + * + * XXX Should this reset the batch memory context, xs_itup, xs_hitup, etc? + */ +static void +index_batch_reset(IndexScanDesc scan, bool complete) +{ + IndexScanBatchState *batchState = scan->batchState; + + /* bail out if batching not enabled */ + if (!batchState) + return; + + AssertCheckBatches(scan); + + index_batch_print("index_batch_reset", scan); + + /* With batching enabled, we should have a read stream. Reset it. */ + Assert(scan->xs_heapfetch); + if (scan->xs_heapfetch->rs) + read_stream_reset(scan->xs_heapfetch->rs); + + /* reset the positions */ + index_batch_pos_reset(scan, &batchState->readPos); + index_batch_pos_reset(scan, &batchState->streamPos); + + /* + * With "complete" reset, make sure to also free the marked batch, either + * by just forgetting it (if it's still in the queue), or by explicitly + * freeing it. + * + * XXX Do this before the loop, so that it calls the amfreebatch(). + */ + if (complete && unlikely(batchState->markBatch != NULL)) + { + IndexScanBatchPos *markPos = &batchState->markPos; + IndexScanBatch markBatch = batchState->markBatch; + + /* always reset the position, forget the marked batch */ + batchState->markBatch = NULL; + + /* + * If we've already moved past the marked batch (it's not in the + * current queue), free it explicitly. Otherwise it'll be in the freed + * later. 
+ */ + if (markPos->batch < batchState->headBatch || + markPos->batch >= batchState->nextBatch) + index_batch_free(scan, markBatch); + + /* reset position only after the queue range check */ + index_batch_pos_reset(scan, &batchState->markPos); + } + + /* release all currently loaded batches */ + while (batchState->headBatch < batchState->nextBatch) + { + IndexScanBatch batch = INDEX_SCAN_BATCH(scan, batchState->headBatch); + + DEBUG_LOG("freeing batch %d %p", batchState->headBatch, batch); + + index_batch_free(scan, batch); + + /* update the valid range, so that asserts / debugging works */ + batchState->headBatch++; + } + + /* reset relevant batch state fields */ + Assert(batchState->maxBatches == INDEX_SCAN_MAX_BATCHES); + batchState->headBatch = 0; /* initial batch */ + batchState->nextBatch = 0; /* initial batch is empty */ + + batchState->finished = false; + batchState->reset = false; + batchState->currentPrefetchBlock = InvalidBlockNumber; + + AssertCheckBatches(scan); +} + +static void +index_batch_kill_item(IndexScanDesc scan) +{ + IndexScanBatchPos *readPos = &scan->batchState->readPos; + IndexScanBatchData *readBatch = INDEX_SCAN_BATCH(scan, readPos->batch); + + AssertCheckBatchPosValid(scan, readPos); + + /* + * XXX Maybe we can move the state that indicates if an item has been + * killed into IndexScanBatchData.items[] array. + * + * See: + * https://postgr.es/m/CAH2-WznLN7P0i2-YEnv3QGmeA5AMjdcjkraO_nz3H2Va1V1WOA@mail.gmail.com + */ + if (readBatch->killedItems == NULL) + readBatch->killedItems = (int *) + palloc(MaxTIDsPerBTreePage * sizeof(int)); + if (readBatch->numKilled < MaxTIDsPerBTreePage) + readBatch->killedItems[readBatch->numKilled++] = readPos->index; +} + +static void +index_batch_free(IndexScanDesc scan, IndexScanBatch batch) +{ + SCAN_CHECKS; + CHECK_SCAN_PROCEDURE(amfreebatch); + + AssertCheckBatch(scan, batch); + + /* don't free the batch that is marked */ + if (batch == scan->batchState->markBatch) + return; + + scan->indexRelation->rd_indam->amfreebatch(scan, batch); +} + +/* */ +static void +index_batch_end(IndexScanDesc scan) +{ + index_batch_reset(scan, true); + + if (scan->batchState) + { + if (scan->batchState->batches) + pfree(scan->batchState->batches); + + if (scan->batchState->batchesCache) + { + for (int i = 0; i < scan->batchState->batchesCacheSize; i++) + { + if (scan->batchState->batchesCache[i] == NULL) + continue; + + pfree(scan->batchState->batchesCache[i]); + } + + pfree(scan->batchState->batchesCache); + } + pfree(scan->batchState); + } +} + +/* + * XXX Both index_batch_alloc() calls in btree use MaxTIDsPerBTreePage, + * which seems unfortunate - it increases the allocation sizes, even if + * the index would be fine with smaller arrays. This means all batches + * exceed ALLOC_CHUNK_LIMIT, forcing a separate malloc (expensive). The + * cache helps for longer queries, not for queries that only create a + * single batch, etc. + */ +IndexScanBatch +index_batch_alloc(IndexScanDesc scan, int maxitems, bool want_itup) +{ + IndexScanBatch batch = NULL; + + /* + * try to find a batch in the cache + * + * XXX We can get here with batchState==NULL for bitmapscans. Could that + * mean bitmapscans have issues with malloc/free on batches too? But the + * cache can't help with that, when it's in batchState. + */ + if ((scan->batchState != NULL) && + (scan->batchState->batchesCache != NULL)) + { + /* + * try to find a batch in the cache, with maxitems high enough + * + * XXX Maybe should look for a batch with lowest maxitems? 
That should + * increase probability of cache hits in the future? + */ + for (int i = 0; i < scan->batchState->batchesCacheSize; i++) + { + if ((scan->batchState->batchesCache[i] != NULL) && + (scan->batchState->batchesCache[i]->maxitems >= maxitems)) + { + batch = scan->batchState->batchesCache[i]; + scan->batchState->batchesCache[i] = NULL; + break; + } + } + } + + /* found a batch in the cache? */ + if (batch) + { + /* for IOS, we expect to already have the currTuples */ + Assert(!(want_itup && (batch->currTuples == NULL))); + + /* XXX maybe we could keep these allocations too */ + Assert(batch->pos == NULL); + Assert(batch->itemsvisibility == NULL); + } + else + { + batch = palloc(offsetof(IndexScanBatchData, items) + + sizeof(IndexScanBatchPosItem) * maxitems); + + batch->maxitems = maxitems; + + /* + * If we are doing an index-only scan, we need a tuple storage + * workspace. We allocate BLCKSZ for this, which should always give + * the index AM enough space to fit a full page's worth of tuples. + */ + batch->currTuples = NULL; + if (want_itup) + batch->currTuples = palloc(BLCKSZ); + } + + /* shared initialization */ + batch->firstItem = -1; + batch->lastItem = -1; + batch->killedItems = NULL; + batch->numKilled = 0; + + batch->buf = InvalidBuffer; + batch->pos = NULL; + batch->itemsvisibility = NULL; /* per-batch IOS visibility */ + + return batch; +} + +/* + * Unlock batch->buf. If batch scan is dropPin, drop the pin, too. Dropping + * the pin prevents VACUUM from blocking on acquiring a cleanup lock. + */ +void +index_batch_unlock(Relation rel, bool dropPin, IndexScanBatch batch) +{ + if (!dropPin) + { + if (!RelationUsesLocalBuffers(rel)) + VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(batch->buf), BLCKSZ); + + /* Just drop the lock (not the pin) */ + LockBuffer(batch->buf, BUFFER_LOCK_UNLOCK); + return; + } + + /* + * Drop both the lock and the pin. + * + * Have to set batch->lsn so that amfreebatch has a way to detect when + * concurrent heap TID recycling by VACUUM might have taken place. It'll + * only be safe to set any index tuple LP_DEAD bits when the page LSN + * hasn't advanced. + */ + Assert(RelationNeedsWAL(rel)); + batch->lsn = BufferGetLSNAtomic(batch->buf); + LockBuffer(batch->buf, BUFFER_LOCK_UNLOCK); + if (!RelationUsesLocalBuffers(rel)) + VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(batch->buf), BLCKSZ); + ReleaseBuffer(batch->buf); + batch->buf = InvalidBuffer; /* defensive */ +} + +/* add the buffer to the cache, or free it */ +void +index_batch_release(IndexScanDesc scan, IndexScanBatch batch) +{ + /* + * first free some allocations + * + * XXX We could keep/reuse some of those. + */ + + if (batch->killedItems != NULL) + { + pfree(batch->killedItems); + batch->killedItems = NULL; + } + + if (batch->itemsvisibility != NULL) + { + pfree(batch->itemsvisibility); + batch->itemsvisibility = NULL; + } + + /* XXX a bit unclear what's release by AM vs. indexam */ + Assert(batch->pos == NULL); + + /* + * try adding it to the cache - finds a slot that's either empty or has a + * lower maxitems value (and replace that batch) + * + * XXX maybe we should track the number of empty slots, and minimum value + * of maxitems, so that we can skip pointless searches? + * + * XXX ignores cases with batchState=NULL (can we get here with bitmap + * scans?) 
+ */ + if (scan->batchState != NULL) + { + /* lowest maxitems we found in the cache (to replace with batch) */ + int maxitems = batch->maxitems; + int slot = scan->batchState->batchesCacheSize; + + /* first time through, initialize the cache */ + if (scan->batchState->batchesCache == NULL) + scan->batchState->batchesCache + = palloc0_array(IndexScanBatch, + scan->batchState->batchesCacheSize); + + for (int i = 0; i < scan->batchState->batchesCacheSize; i++) + { + /* found empty slot, we're done */ + if (scan->batchState->batchesCache[i] == NULL) + { + scan->batchState->batchesCache[i] = batch; + return; + } + + /* update lowest maxitems? */ + if (scan->batchState->batchesCache[i]->maxitems < maxitems) + { + maxitems = scan->batchState->batchesCache[i]->maxitems; + slot = i; + } + } + + /* found a batch to replace? */ + if (maxitems < batch->maxitems) + { + pfree(scan->batchState->batchesCache[slot]); + scan->batchState->batchesCache[slot] = batch; + } + } +} diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index fdff960c1302..18c734c4a695 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -158,11 +158,12 @@ bthandler(PG_FUNCTION_ARGS) amroutine->amadjustmembers = btadjustmembers; amroutine->ambeginscan = btbeginscan; amroutine->amrescan = btrescan; - amroutine->amgettuple = btgettuple; + amroutine->amgettuple = NULL; + amroutine->amgetbatch = btgetbatch; + amroutine->amfreebatch = btfreebatch; amroutine->amgetbitmap = btgetbitmap; amroutine->amendscan = btendscan; - amroutine->ammarkpos = btmarkpos; - amroutine->amrestrpos = btrestrpos; + amroutine->amposreset = btposreset; amroutine->amestimateparallelscan = btestimateparallelscan; amroutine->aminitparallelscan = btinitparallelscan; amroutine->amparallelrescan = btparallelrescan; @@ -220,13 +221,12 @@ btinsert(Relation rel, Datum *values, bool *isnull, } /* - * btgettuple() -- Get the next tuple in the scan. + * btgetbatch() -- Get the next batch of tuples in the scan. */ -bool -btgettuple(IndexScanDesc scan, ScanDirection dir) +IndexScanBatch +btgetbatch(IndexScanDesc scan, IndexScanBatch batch, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - bool res; Assert(scan->heapRelation != NULL); @@ -241,44 +241,18 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) * the appropriate direction. If we haven't done so yet, we call * _bt_first() to get the first item in the scan. */ - if (!BTScanPosIsValid(so->currPos)) - res = _bt_first(scan, dir); + if (batch == NULL) + batch = _bt_first(scan, dir); else - { - /* - * Check to see if we should kill the previously-fetched tuple. - */ - if (scan->kill_prior_tuple) - { - /* - * Yes, remember it for later. (We'll deal with all such - * tuples at once right before leaving the index page.) The - * test for numKilled overrun is not just paranoia: if the - * caller reverses direction in the indexscan then the same - * item might get entered multiple times. It's not worth - * trying to optimize that, so we don't detect it, but instead - * just forget any excess entries. - */ - if (so->killedItems == NULL) - so->killedItems = (int *) - palloc(MaxTIDsPerBTreePage * sizeof(int)); - if (so->numKilled < MaxTIDsPerBTreePage) - so->killedItems[so->numKilled++] = so->currPos.itemIndex; - } - - /* - * Now continue the scan. - */ - res = _bt_next(scan, dir); - } + batch = _bt_next(scan, dir, batch); - /* If we have a tuple, return it ... */ - if (res) + /* If we have a batch, return it ... */ + if (batch) break; /* ... 
otherwise see if we need another primitive index scan */ } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir)); - return res; + return batch; } /* @@ -288,6 +262,7 @@ int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + IndexScanBatch batch; int64 ntids = 0; ItemPointer heapTid; @@ -296,29 +271,33 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) /* Each loop iteration performs another primitive index scan */ do { - /* Fetch the first page & tuple */ - if (_bt_first(scan, ForwardScanDirection)) + /* Fetch the first batch */ + if ((batch = _bt_first(scan, ForwardScanDirection))) { - /* Save tuple ID, and continue scanning */ - heapTid = &scan->xs_heaptid; + int itemIndex = 0; + + /* Save first tuple's TID */ + heapTid = &batch->items[itemIndex].heapTid; tbm_add_tuples(tbm, heapTid, 1, false); ntids++; for (;;) { - /* - * Advance to next tuple within page. This is the same as the - * easy case in _bt_next(). - */ - if (++so->currPos.itemIndex > so->currPos.lastItem) + /* Advance to next TID within page-sized batch */ + if (++itemIndex > batch->lastItem) { + /* btfreebatch won't be called */ + ReleaseBuffer(batch->buf); + /* let _bt_next do the heavy lifting */ - if (!_bt_next(scan, ForwardScanDirection)) + itemIndex = 0; + batch = _bt_next(scan, ForwardScanDirection, batch); + if (!batch) break; } /* Save tuple ID, and continue scanning */ - heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid; + heapTid = &batch->items[itemIndex].heapTid; tbm_add_tuples(tbm, heapTid, 1, false); ntids++; } @@ -346,8 +325,6 @@ btbeginscan(Relation rel, int nkeys, int norderbys) /* allocate private workspace */ so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); - BTScanPosInvalidate(so->currPos); - BTScanPosInvalidate(so->markPos); if (scan->numberOfKeys > 0) so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); else @@ -361,16 +338,6 @@ btbeginscan(Relation rel, int nkeys, int norderbys) so->orderProcs = NULL; so->arrayContext = NULL; - so->killedItems = NULL; /* until needed */ - so->numKilled = 0; - - /* - * We don't know yet whether the scan will be index-only, so we do not - * allocate the tuple workspace arrays until btrescan. However, we set up - * scan->xs_itupdesc whether we'll need it or not, since that's so cheap. - */ - so->currTuples = so->markTuples = NULL; - scan->xs_itupdesc = RelationGetDescr(rel); scan->opaque = so; @@ -387,80 +354,50 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, { BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* we aren't holding any read locks, but gotta drop the pins */ - if (BTScanPosIsValid(so->currPos)) - { - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _bt_killitems(scan); - BTScanPosUnpinIfPinned(so->currPos); - BTScanPosInvalidate(so->currPos); - } - /* - * We prefer to eagerly drop leaf page pins before btgettuple returns. - * This avoids making VACUUM wait to acquire a cleanup lock on the page. - * - * We cannot safely drop leaf page pins during index-only scans due to a - * race condition involving VACUUM setting pages all-visible in the VM. - * It's also unsafe for plain index scans that use a non-MVCC snapshot. - * - * When we drop pins eagerly, the mechanism that marks so->killedItems[] - * index tuples LP_DEAD has to deal with concurrent TID recycling races. - * The scheme used to detect unsafe TID recycling won't work when scanning - * unlogged relations (since it involves saving an affected page's LSN). 
- * Opt out of eager pin dropping during unlogged relation scans for now - * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting). - * - * Also opt out of dropping leaf page pins eagerly during bitmap scans. - * Pins cannot be held for more than an instant during bitmap scans either - * way, so we might as well avoid wasting cycles on acquiring page LSNs. - * - * See nbtree/README section on making concurrent TID recycling safe. - * - * Note: so->dropPin should never change across rescans. + * Reset the scan keys */ - so->dropPin = (!scan->xs_want_itup && - IsMVCCSnapshot(scan->xs_snapshot) && - RelationNeedsWAL(scan->indexRelation) && - scan->heapRelation != NULL); - - so->markItemIndex = -1; + if (scankey && scan->numberOfKeys > 0) + memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData)); so->needPrimScan = false; so->scanBehind = false; so->oppositeDirCheck = false; - BTScanPosUnpinIfPinned(so->markPos); - BTScanPosInvalidate(so->markPos); + so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */ + so->numArrayKeys = 0; /* ditto */ +} +/* + * btfreebatch() -- Free batch, releasing its buffer pin + * + * XXX Should we really be freeing memory like this? What if we were to just + * reuse most memory across distinct pages, avoiding pfree/palloc cycles? + */ +void +btfreebatch(IndexScanDesc scan, IndexScanBatch batch) +{ /* - * Allocate tuple workspace arrays, if needed for an index-only scan and - * not already done in a previous rescan call. To save on palloc - * overhead, both workspaces are allocated as one palloc block; only this - * function and btendscan know that. - * - * NOTE: this data structure also makes it safe to return data from a - * "name" column, even though btree name_ops uses an underlying storage - * datatype of cstring. The risk there is that "name" is supposed to be - * padded to NAMEDATALEN, but the actual index tuple is probably shorter. - * However, since we only return data out of tuples sitting in the - * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some - * data out of the markTuples array --- running off the end of memory for - * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats - * adding special-case treatment for name_ops elsewhere. + * Check if there are tuples to kill from this batch (that weren't already + * killed earlier on) */ - if (scan->xs_want_itup && so->currTuples == NULL) + if (batch->numKilled > 0) + _bt_killitems(scan, batch); + + if (batch->pos) { - so->currTuples = (char *) palloc(BLCKSZ * 2); - so->markTuples = so->currTuples + BLCKSZ; + if (!scan->batchState || !scan->batchState->dropPin) + ReleaseBuffer(batch->buf); + + pfree(batch->pos); + + /* XXX maybe should be done in index_batch_free? 
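
[Editor's note, outside the patch] The conditions that btrescan used to fold into so->dropPin are removed above, and the hunks in this patch only ever consult scan->batchState->dropPin; presumably the core code now derives that flag from the same four tests, in code that is not part of these hunks. A minimal restatement of those tests, under an illustrative helper name:

    /*
     * Illustrative helper only: the four conditions the removed btrescan
     * code tested when setting so->dropPin.  Where batchState->dropPin is
     * actually computed lies outside these hunks.
     */
    static inline bool
    scan_allows_eager_pin_drop(IndexScanDesc scan)
    {
        return !scan->xs_want_itup &&                  /* not an index-only scan */
            IsMVCCSnapshot(scan->xs_snapshot) &&       /* plain MVCC snapshot */
            RelationNeedsWAL(scan->indexRelation) &&   /* logged rel: page LSNs usable */
            scan->heapRelation != NULL;                /* not a bitmap index scan */
    }
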
*/ + batch->buf = InvalidBuffer; + batch->pos = NULL; } - /* - * Reset the scan keys - */ - if (scankey && scan->numberOfKeys > 0) - memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData)); - so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */ - so->numArrayKeys = 0; /* ditto */ + /* XXX keep itemsvisibility, killItems and currTuples */ + + /* free the batch (or cache it for reuse) */ + index_batch_release(scan, batch); } /* @@ -471,116 +408,50 @@ btendscan(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* we aren't holding any read locks, but gotta drop the pins */ - if (BTScanPosIsValid(so->currPos)) - { - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _bt_killitems(scan); - BTScanPosUnpinIfPinned(so->currPos); - } - - so->markItemIndex = -1; - BTScanPosUnpinIfPinned(so->markPos); - - /* No need to invalidate positions, the RAM is about to be freed. */ - /* Release storage */ if (so->keyData != NULL) pfree(so->keyData); /* so->arrayKeys and so->orderProcs are in arrayContext */ if (so->arrayContext != NULL) MemoryContextDelete(so->arrayContext); - if (so->killedItems != NULL) - pfree(so->killedItems); - if (so->currTuples != NULL) - pfree(so->currTuples); - /* so->markTuples should not be pfree'd, see btrescan */ pfree(so); } /* - * btmarkpos() -- save current scan position + * btposreset() -- invalidate scan's array keys */ void -btmarkpos(IndexScanDesc scan) +btposreset(IndexScanDesc scan, IndexScanBatch markbatch) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPos pos; - /* There may be an old mark with a pin (but no lock). */ - BTScanPosUnpinIfPinned(so->markPos); + if (!so->numArrayKeys) + return; /* - * Just record the current itemIndex. If we later step to next page - * before releasing the marked position, _bt_steppage makes a full copy of - * the currPos struct in markPos. If (as often happens) the mark is moved - * before we leave the page, we don't have to do that work. + * Core system is about to restore a mark associated with a previously + * returned batch. Reset the scan's arrays to make all this safe. */ - if (BTScanPosIsValid(so->currPos)) - so->markItemIndex = so->currPos.itemIndex; - else - { - BTScanPosInvalidate(so->markPos); - so->markItemIndex = -1; - } -} - -/* - * btrestrpos() -- restore scan to last saved position - */ -void -btrestrpos(IndexScanDesc scan) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; + pos = (BTScanPos) markbatch->pos; + _bt_start_array_keys(scan, pos->dir); - if (so->markItemIndex >= 0) - { - /* - * The scan has never moved to a new page since the last mark. Just - * restore the itemIndex. - * - * NB: In this case we can't count on anything in so->markPos to be - * accurate. - */ - so->currPos.itemIndex = so->markItemIndex; - } + /* + * Core system will invalidate all other batches. + * + * Deal with this by unsetting needPrimScan as well as moreRight (or as + * well as moreLeft, when scanning backwards). That way, the next time + * _bt_next is called it will step to the right (or to the left). At that + * point _bt_readpage will restore the scan's arrays to elements that + * correctly track the next page's position in the index's key space. + */ + if (ScanDirectionIsForward(pos->dir)) + pos->moreRight = true; else - { - /* - * The scan moved to a new page after last mark or restore, and we are - * now restoring to the marked page. 
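
[Editor's sketch, outside the patch] With btgetbatch, btfreebatch, and btposreset in place, the callback contract amounts to "ask for the next batch by handing back the previous one, and free batches once you are done with them". The loop below is only a schematic of that contract: it is not the core-side consumer (the real caller sits outside these hunks and keeps a queue of batches rather than a single one in flight; see index_batch_reset/index_batch_free earlier), and process_item() is a hypothetical placeholder.

    /* Schematic driver for an AM's amgetbatch/amfreebatch callbacks. */
    static void
    consume_batches(IndexScanDesc scan, ScanDirection dir)
    {
        IndexAmRoutine *am = scan->indexRelation->rd_indam;
        IndexScanBatch  batch,
                        prior = NULL;

        /* passing NULL asks for the first batch; otherwise continue from prior */
        while ((batch = am->amgetbatch(scan, prior, dir)) != NULL)
        {
            /* prior only had to stay alive long enough to position amgetbatch */
            if (prior)
                am->amfreebatch(scan, prior);

            for (int i = batch->firstItem; i <= batch->lastItem; i++)
                process_item(scan, &batch->items[i]);   /* hypothetical */

            prior = batch;
        }

        /* NULL return means no more matches in this direction */
        if (prior)
            am->amfreebatch(scan, prior);
    }
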
We aren't holding any read - * locks, but if we're still holding the pin for the current position, - * we must drop it. - */ - if (BTScanPosIsValid(so->currPos)) - { - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _bt_killitems(scan); - BTScanPosUnpinIfPinned(so->currPos); - } - - if (BTScanPosIsValid(so->markPos)) - { - /* bump pin on mark buffer for assignment to current buffer */ - if (BTScanPosIsPinned(so->markPos)) - IncrBufferRefCount(so->markPos.buf); - memcpy(&so->currPos, &so->markPos, - offsetof(BTScanPosData, items[1]) + - so->markPos.lastItem * sizeof(BTScanPosItem)); - if (so->currTuples) - memcpy(so->currTuples, so->markTuples, - so->markPos.nextTupleOffset); - /* Reset the scan's array keys (see _bt_steppage for why) */ - if (so->numArrayKeys) - { - _bt_start_array_keys(scan, so->currPos.dir); - so->needPrimScan = false; - } - } - else - BTScanPosInvalidate(so->currPos); - } + pos->moreLeft = true; + so->needPrimScan = false; + so->scanBehind = false; + so->oppositeDirCheck = false; } /* @@ -827,15 +698,6 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, *next_scan_page = InvalidBlockNumber; *last_curr_page = InvalidBlockNumber; - /* - * Reset so->currPos, and initialize moreLeft/moreRight such that the next - * call to _bt_readnextpage treats this backend similarly to a serial - * backend that steps from *last_curr_page to *next_scan_page (unless this - * backend's so->currPos is initialized by _bt_readfirstpage before then). - */ - BTScanPosInvalidate(so->currPos); - so->currPos.moreLeft = so->currPos.moreRight = true; - if (first) { /* @@ -985,8 +847,6 @@ _bt_parallel_done(IndexScanDesc scan) BTParallelScanDesc btscan; bool status_changed = false; - Assert(!BTScanPosIsValid(so->currPos)); - /* Do nothing, for non-parallel scans */ if (parallel_scan == NULL) return; diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 0605356ec9f0..b019c19f806d 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -25,62 +25,33 @@ #include "utils/rel.h" -static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so); static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access); static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum); -static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, - OffsetNumber offnum, bool firstpage); -static void _bt_saveitem(BTScanOpaque so, int itemIndex, - OffsetNumber offnum, IndexTuple itup); -static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, +static bool _bt_readpage(IndexScanDesc scan, IndexScanBatch newbatch, + ScanDirection dir, OffsetNumber offnum, + bool firstpage); +static void _bt_saveitem(IndexScanBatch newbatch, int itemIndex, + OffsetNumber offnum, IndexTuple itup, + int *tupleOffset); +static int _bt_setuppostingitems(IndexScanBatch newbatch, int itemIndex, OffsetNumber offnum, const ItemPointerData *heapTid, - IndexTuple itup); -static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, + IndexTuple itup, int *tupleOffset); +static inline void _bt_savepostingitem(IndexScanBatch newbatch, int itemIndex, OffsetNumber offnum, - ItemPointer heapTid, int tupleOffset); -static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so); -static bool _bt_steppage(IndexScanDesc 
scan, ScanDirection dir); -static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, - ScanDirection dir); -static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, - BlockNumber lastcurrblkno, ScanDirection dir, - bool seized); + ItemPointer heapTid, int baseOffset); +static IndexScanBatch _bt_readfirstpage(IndexScanDesc scan, IndexScanBatch firstbatch, + OffsetNumber offnum, ScanDirection dir); +static IndexScanBatch _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, + BlockNumber lastcurrblkno, + ScanDirection dir, bool firstpage); static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, BlockNumber lastcurrblkno); -static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); - - -/* - * _bt_drop_lock_and_maybe_pin() - * - * Unlock so->currPos.buf. If scan is so->dropPin, drop the pin, too. - * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock. - */ -static inline void -_bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so) -{ - if (!so->dropPin) - { - /* Just drop the lock (not the pin) */ - _bt_unlockbuf(rel, so->currPos.buf); - return; - } - - /* - * Drop both the lock and the pin. - * - * Have to set so->currPos.lsn so that _bt_killitems has a way to detect - * when concurrent heap TID recycling by VACUUM might have taken place. - */ - Assert(RelationNeedsWAL(rel)); - so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); - _bt_relbuf(rel, so->currPos.buf); - so->currPos.buf = InvalidBuffer; -} +static IndexScanBatch _bt_endpoint(IndexScanDesc scan, ScanDirection dir, + IndexScanBatch firstbatch); /* * _bt_search() -- Search the tree for a particular scankey, @@ -870,20 +841,16 @@ _bt_compare(Relation rel, * conditions, and the tree ordering. We find the first item (or, * if backwards scan, the last item) in the tree that satisfies the * qualifications in the scan key. On success exit, data about the - * matching tuple(s) on the page has been loaded into so->currPos. We'll - * drop all locks and hold onto a pin on page's buffer, except during - * so->dropPin scans, when we drop both the lock and the pin. - * _bt_returnitem sets the next item to return to scan on success exit. + * matching tuple(s) on the page has been loaded into the returned batch. * - * If there are no matching items in the index, we return false, with no - * pins or locks held. so->currPos will remain invalid. + * If there are no matching items in the index, we just return NULL. * * Note that scan->keyData[], and the so->keyData[] scankey built from it, * are both search-type scankeys (see nbtree/README for more about this). * Within this routine, we build a temporary insertion-type scankey to use * in locating the scan start position. 
*/ -bool +IndexScanBatch _bt_first(IndexScanDesc scan, ScanDirection dir) { Relation rel = scan->indexRelation; @@ -897,8 +864,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) StrategyNumber strat_total = InvalidStrategy; BlockNumber blkno = InvalidBlockNumber, lastcurrblkno; - - Assert(!BTScanPosIsValid(so->currPos)); + IndexScanBatch firstbatch; /* * Examine the scan keys and eliminate any redundant keys; also mark the @@ -923,7 +889,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) */ if (scan->parallel_scan != NULL && !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true)) - return false; + return false; /* definitely done (so->needPrimscan is unset) */ /* * Initialize the scan's arrays (if any) for the current scan direction @@ -940,14 +906,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * _bt_readnextpage releases the scan for us (not _bt_readfirstpage). */ Assert(scan->parallel_scan != NULL); - Assert(!so->needPrimScan); - Assert(blkno != P_NONE); - - if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true)) - return false; - _bt_returnitem(scan, so); - return true; + return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, true); } /* @@ -1239,6 +1199,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } } + /* Allocate space for first batch */ + firstbatch = index_batch_alloc(scan, MaxTIDsPerBTreePage, scan->xs_want_itup); + firstbatch->pos = palloc(sizeof(BTScanPosData)); + /* * If we found no usable boundary keys, we have to start from one end of * the tree. Walk down that edge to the first or last key, and scan from @@ -1247,7 +1211,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * Note: calls _bt_readfirstpage for us, which releases the parallel scan. */ if (keysz == 0) - return _bt_endpoint(scan, dir); + return _bt_endpoint(scan, dir, firstbatch); /* * We want to start the scan somewhere within the index. Set up an @@ -1515,12 +1479,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * position ourselves on the target leaf page. */ Assert(ScanDirectionIsBackward(dir) == inskey.backward); - stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ); + stack = _bt_search(rel, NULL, &inskey, &firstbatch->buf, BT_READ); /* don't need to keep the stack around... */ _bt_freestack(stack); - if (!BufferIsValid(so->currPos.buf)) + if (!BufferIsValid(firstbatch->buf)) { Assert(!so->needPrimScan); @@ -1536,11 +1500,11 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (IsolationIsSerializable()) { PredicateLockRelation(rel, scan->xs_snapshot); - stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ); + stack = _bt_search(rel, NULL, &inskey, &firstbatch->buf, BT_READ); _bt_freestack(stack); } - if (!BufferIsValid(so->currPos.buf)) + if (!BufferIsValid(firstbatch->buf)) { _bt_parallel_done(scan); return false; @@ -1548,11 +1512,11 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } /* position to the precise item on the page */ - offnum = _bt_binsrch(rel, &inskey, so->currPos.buf); + offnum = _bt_binsrch(rel, &inskey, firstbatch->buf); /* * Now load data from the first page of the scan (usually the page - * currently in so->currPos.buf). + * currently in firstbatch.buf). * * If inskey.nextkey = false and inskey.backward = false, offnum is * positioned at the first non-pivot tuple >= inskey.scankeys. @@ -1570,69 +1534,79 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * for the page. For example, when inskey is both < the leaf page's high * key and > all of its non-pivot tuples, offnum will be "maxoff + 1". 
*/ - if (!_bt_readfirstpage(scan, offnum, dir)) - return false; - - _bt_returnitem(scan, so); - return true; + return _bt_readfirstpage(scan, firstbatch, offnum, dir); } /* * _bt_next() -- Get the next item in a scan. * - * On entry, so->currPos describes the current page, which may be pinned - * but is not locked, and so->currPos.itemIndex identifies which item was - * previously returned. + * On entry, priorbatch describes the batch that was last returned by + * btgetbatch. We'll use the prior batch's positioning information to + * decide which page to read next. * - * On success exit, so->currPos is updated as needed, and _bt_returnitem - * sets the next item to return to the scan. so->currPos remains valid. + * On success exit, returns the next batch. There must be at least one + * matching tuple on any returned batch (else we'd just return NULL). * - * On failure exit (no more tuples), we invalidate so->currPos. It'll - * still be possible for the scan to return tuples by changing direction, - * though we'll need to call _bt_first anew in that other direction. + * On failure exit (no more tuples), we return NULL. It'll still be + * possible for the scan to return tuples by changing direction, though + * we'll need to call _bt_first anew in that other direction. */ -bool -_bt_next(IndexScanDesc scan, ScanDirection dir) +IndexScanBatch +_bt_next(IndexScanDesc scan, ScanDirection dir, IndexScanBatch priorbatch) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + BlockNumber blkno, + lastcurrblkno; + BTScanPos priorpos = (BTScanPos) priorbatch->pos; - Assert(BTScanPosIsValid(so->currPos)); + Assert(BTScanPosIsValid(*priorpos)); - /* - * Advance to next tuple on current page; or if there's no more, try to - * step to the next page with data. - */ + /* Walk to the next page with data */ if (ScanDirectionIsForward(dir)) - { - if (++so->currPos.itemIndex > so->currPos.lastItem) - { - if (!_bt_steppage(scan, dir)) - return false; - } - } + blkno = priorpos->nextPage; else + blkno = priorpos->prevPage; + lastcurrblkno = priorpos->currPage; + + /* + * Cancel primitive index scans that were scheduled when the call to + * _bt_readpage for pos happened to use the opposite direction to the one + * that we're stepping in now. (It's okay to leave the scan's array keys + * as-is, since the next _bt_readpage will advance them.) + */ + if (priorpos->dir != dir) + so->needPrimScan = false; + + if (blkno == P_NONE || + (ScanDirectionIsForward(dir) ? + !priorpos->moreRight : !priorpos->moreLeft)) { - if (--so->currPos.itemIndex < so->currPos.firstItem) - { - if (!_bt_steppage(scan, dir)) - return false; - } + /* + * priorpos _bt_readpage call ended scan in this direction (though if + * so->needPrimScan was set the scan will continue in _bt_first) + */ + _bt_parallel_done(scan); + return NULL; } - _bt_returnitem(scan, so); - return true; + /* parallel scan must seize the scan to get next blkno */ + if (scan->parallel_scan != NULL && + !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false)) + return NULL; /* done iff so->needPrimScan wasn't set */ + + return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false); } /* - * _bt_readpage() -- Load data from current index page into so->currPos + * _bt_readpage() -- Load data from current index page into newbatch. * - * Caller must have pinned and read-locked so->currPos.buf; the buffer's state - * is not changed here. Also, currPos.moreLeft and moreRight must be valid; - * they are updated as appropriate. 
All other fields of so->currPos are - * initialized from scratch here. + * Caller must have pinned and read-locked newbatch.buf; the buffer's state is + * not changed here. Also, pos.moreLeft and moreRight must be valid; they are + * updated as appropriate. All other fields of newbatch are initialized from + * scratch here. * * We scan the current page starting at offnum and moving in the indicated - * direction. All items matching the scan keys are loaded into currPos.items. + * direction. All items matching the scan keys are saved in newbatch.items. * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports * that there can be no more matching tuples in the current scan direction * (could just be for the current primitive index scan when scan has arrays). @@ -1644,8 +1618,8 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) * Returns true if any matching items found on the page, false if none. */ static bool -_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, - bool firstpage) +_bt_readpage(IndexScanDesc scan, IndexScanBatch newbatch, ScanDirection dir, + OffsetNumber offnum, bool firstpage) { Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; @@ -1656,37 +1630,35 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, BTReadPageState pstate; bool arrayKeys; int itemIndex, + tupleOffset = 0, indnatts; + BTScanPos pos = newbatch->pos; /* save the page/buffer block number, along with its sibling links */ - page = BufferGetPage(so->currPos.buf); + page = BufferGetPage(newbatch->buf); opaque = BTPageGetOpaque(page); - so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); - so->currPos.prevPage = opaque->btpo_prev; - so->currPos.nextPage = opaque->btpo_next; - /* delay setting so->currPos.lsn until _bt_drop_lock_and_maybe_pin */ - so->currPos.dir = dir; - so->currPos.nextTupleOffset = 0; + pos->currPage = BufferGetBlockNumber(newbatch->buf); + pos->prevPage = opaque->btpo_prev; + pos->nextPage = opaque->btpo_next; + pos->dir = dir; + + so->pos = pos; /* _bt_checkkeys needs this */ /* either moreRight or moreLeft should be set now (may be unset later) */ - Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight : - so->currPos.moreLeft); + Assert(ScanDirectionIsForward(dir) ? 
pos->moreRight : pos->moreLeft); Assert(!P_IGNORE(opaque)); - Assert(BTScanPosIsPinned(so->currPos)); Assert(!so->needPrimScan); if (scan->parallel_scan) { /* allow next/prev page to be read by other worker without delay */ if (ScanDirectionIsForward(dir)) - _bt_parallel_release(scan, so->currPos.nextPage, - so->currPos.currPage); + _bt_parallel_release(scan, pos->nextPage, pos->currPage); else - _bt_parallel_release(scan, so->currPos.prevPage, - so->currPos.currPage); + _bt_parallel_release(scan, pos->prevPage, pos->currPage); } - PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot); + PredicateLockPage(rel, pos->currPage, scan->xs_snapshot); /* initialize local variables */ indnatts = IndexRelationGetNumberOfAttributes(rel); @@ -1724,11 +1696,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup)) { /* Schedule another primitive index scan after all */ - so->currPos.moreRight = false; + pos->moreRight = false; so->needPrimScan = true; if (scan->parallel_scan) - _bt_parallel_primscan_schedule(scan, - so->currPos.currPage); + _bt_parallel_primscan_schedule(scan, pos->currPage); return false; } } @@ -1792,28 +1763,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, if (!BTreeTupleIsPosting(itup)) { /* Remember it */ - _bt_saveitem(so, itemIndex, offnum, itup); + _bt_saveitem(newbatch, itemIndex, offnum, itup, &tupleOffset); itemIndex++; } else { - int tupleOffset; + int baseOffset; /* * Set up state to return posting list, and remember first * TID */ - tupleOffset = - _bt_setuppostingitems(so, itemIndex, offnum, + baseOffset = + _bt_setuppostingitems(newbatch, itemIndex, offnum, BTreeTupleGetPostingN(itup, 0), - itup); + itup, &tupleOffset); itemIndex++; /* Remember additional TIDs */ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) { - _bt_savepostingitem(so, itemIndex, offnum, + _bt_savepostingitem(newbatch, itemIndex, offnum, BTreeTupleGetPostingN(itup, i), - tupleOffset); + baseOffset); itemIndex++; } } @@ -1853,12 +1824,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, } if (!pstate.continuescan) - so->currPos.moreRight = false; + pos->moreRight = false; Assert(itemIndex <= MaxTIDsPerBTreePage); - so->currPos.firstItem = 0; - so->currPos.lastItem = itemIndex - 1; - so->currPos.itemIndex = 0; + newbatch->firstItem = 0; + newbatch->lastItem = itemIndex - 1; } else { @@ -1875,11 +1845,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup)) { /* Schedule another primitive index scan after all */ - so->currPos.moreLeft = false; + pos->moreLeft = false; so->needPrimScan = true; if (scan->parallel_scan) - _bt_parallel_primscan_schedule(scan, - so->currPos.currPage); + _bt_parallel_primscan_schedule(scan, pos->currPage); return false; } } @@ -1980,11 +1949,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, { /* Remember it */ itemIndex--; - _bt_saveitem(so, itemIndex, offnum, itup); + _bt_saveitem(newbatch, itemIndex, offnum, itup, &tupleOffset); } else { - int tupleOffset; + int baseOffset; /* * Set up state to return posting list, and remember first @@ -1997,17 +1966,17 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * associated with the same posting list tuple. 
*/ itemIndex--; - tupleOffset = - _bt_setuppostingitems(so, itemIndex, offnum, + baseOffset = + _bt_setuppostingitems(newbatch, itemIndex, offnum, BTreeTupleGetPostingN(itup, 0), - itup); + itup, &tupleOffset); /* Remember additional TIDs */ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) { itemIndex--; - _bt_savepostingitem(so, itemIndex, offnum, + _bt_savepostingitem(newbatch, itemIndex, offnum, BTreeTupleGetPostingN(itup, i), - tupleOffset); + baseOffset); } } } @@ -2023,12 +1992,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * be found there */ if (!pstate.continuescan) - so->currPos.moreLeft = false; + pos->moreLeft = false; Assert(itemIndex >= 0); - so->currPos.firstItem = itemIndex; - so->currPos.lastItem = MaxTIDsPerBTreePage - 1; - so->currPos.itemIndex = MaxTIDsPerBTreePage - 1; + newbatch->firstItem = itemIndex; + newbatch->lastItem = MaxTIDsPerBTreePage - 1; } /* @@ -2045,202 +2013,96 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, */ Assert(!pstate.forcenonrequired); - return (so->currPos.firstItem <= so->currPos.lastItem); + return (newbatch->firstItem <= newbatch->lastItem); } -/* Save an index item into so->currPos.items[itemIndex] */ +/* Save an index item into newbatch.items[itemIndex] */ static void -_bt_saveitem(BTScanOpaque so, int itemIndex, - OffsetNumber offnum, IndexTuple itup) +_bt_saveitem(IndexScanBatch newbatch, int itemIndex, OffsetNumber offnum, + IndexTuple itup, int *tupleOffset) { - BTScanPosItem *currItem = &so->currPos.items[itemIndex]; - Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup)); - currItem->heapTid = itup->t_tid; - currItem->indexOffset = offnum; - if (so->currTuples) + /* copy the populated part of the items array */ + newbatch->items[itemIndex].heapTid = itup->t_tid; + newbatch->items[itemIndex].indexOffset = offnum; + + if (newbatch->currTuples) { Size itupsz = IndexTupleSize(itup); - currItem->tupleOffset = so->currPos.nextTupleOffset; - memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz); - so->currPos.nextTupleOffset += MAXALIGN(itupsz); + newbatch->items[itemIndex].tupleOffset = *tupleOffset; + memcpy(newbatch->currTuples + *tupleOffset, itup, itupsz); + *tupleOffset += MAXALIGN(itupsz); } } /* * Setup state to save TIDs/items from a single posting list tuple. * - * Saves an index item into so->currPos.items[itemIndex] for TID that is - * returned to scan first. Second or subsequent TIDs for posting list should - * be saved by calling _bt_savepostingitem(). + * Saves an index item into newbatch.items[itemIndex] for TID that is returned + * to scan first. Second or subsequent TIDs for posting list should be saved + * by calling _bt_savepostingitem(). * - * Returns an offset into tuple storage space that main tuple is stored at if - * needed. + * Returns baseOffset, an offset into tuple storage space that main tuple is + * stored at if needed. 
*/ static int -_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, - const ItemPointerData *heapTid, IndexTuple itup) +_bt_setuppostingitems(IndexScanBatch newbatch, int itemIndex, + OffsetNumber offnum, const ItemPointerData *heapTid, + IndexTuple itup, int *tupleOffset) { - BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + IndexScanBatchPosItem *item = &newbatch->items[itemIndex]; Assert(BTreeTupleIsPosting(itup)); - currItem->heapTid = *heapTid; - currItem->indexOffset = offnum; - if (so->currTuples) + /* copy the populated part of the items array */ + item->heapTid = *heapTid; + item->indexOffset = offnum; + + if (newbatch->currTuples) { /* Save base IndexTuple (truncate posting list) */ IndexTuple base; Size itupsz = BTreeTupleGetPostingOffset(itup); itupsz = MAXALIGN(itupsz); - currItem->tupleOffset = so->currPos.nextTupleOffset; - base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset); + item->tupleOffset = *tupleOffset; + base = (IndexTuple) (newbatch->currTuples + *tupleOffset); memcpy(base, itup, itupsz); /* Defensively reduce work area index tuple header size */ base->t_info &= ~INDEX_SIZE_MASK; base->t_info |= itupsz; - so->currPos.nextTupleOffset += itupsz; + *tupleOffset += itupsz; - return currItem->tupleOffset; + return item->tupleOffset; } return 0; } /* - * Save an index item into so->currPos.items[itemIndex] for current posting + * Save an index item into newbatch.items[itemIndex] for current posting * tuple. * * Assumes that _bt_setuppostingitems() has already been called for current - * posting list tuple. Caller passes its return value as tupleOffset. + * posting list tuple. Caller passes its return value as baseOffset. */ static inline void -_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, - ItemPointer heapTid, int tupleOffset) +_bt_savepostingitem(IndexScanBatch newbatch, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, int baseOffset) { - BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + IndexScanBatchPosItem *item = &newbatch->items[itemIndex]; - currItem->heapTid = *heapTid; - currItem->indexOffset = offnum; + item->heapTid = *heapTid; + item->indexOffset = offnum; /* * Have index-only scans return the same base IndexTuple for every TID * that originates from the same posting list */ - if (so->currTuples) - currItem->tupleOffset = tupleOffset; -} - -/* - * Return the index item from so->currPos.items[so->currPos.itemIndex] to the - * index scan by setting the relevant fields in caller's index scan descriptor - */ -static inline void -_bt_returnitem(IndexScanDesc scan, BTScanOpaque so) -{ - BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex]; - - /* Most recent _bt_readpage must have succeeded */ - Assert(BTScanPosIsValid(so->currPos)); - Assert(so->currPos.itemIndex >= so->currPos.firstItem); - Assert(so->currPos.itemIndex <= so->currPos.lastItem); - - /* Return next item, per amgettuple contract */ - scan->xs_heaptid = currItem->heapTid; - if (so->currTuples) - scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); -} - -/* - * _bt_steppage() -- Step to next page containing valid data for scan - * - * Wrapper on _bt_readnextpage that performs final steps for the current page. - * - * On entry, so->currPos must be valid. Its buffer will be pinned, though - * never locked. (Actually, when so->dropPin there won't even be a pin held, - * though so->currPos.currPage must still be set to a valid block number.) 
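
[Editor's sketch, outside the patch] The removed _bt_returnitem shows how the per-item tupleOffset is meant to be consumed: for index-only scans each items[] entry points into the batch's currTuples workspace, and every TID expanded from one posting list tuple shares the same base offset. A schematic accessor, under an assumed name that is not part of the patch:

    /*
     * Illustrative only: map item i of a batch back to its IndexTuple,
     * mirroring what the removed _bt_returnitem did with so->currTuples.
     */
    static IndexTuple
    batch_get_itup(IndexScanBatch batch, int i)
    {
        IndexScanBatchPosItem *item = &batch->items[i];

        Assert(i >= batch->firstItem && i <= batch->lastItem);
        Assert(batch->currTuples != NULL);      /* index-only scans only */

        /*
         * Plain tuples each get their own MAXALIGN'd copy; all TIDs saved
         * from a single posting list tuple carry the same tupleOffset, so
         * they all resolve to one shared, truncated base tuple.
         */
        return (IndexTuple) (batch->currTuples + item->tupleOffset);
    }
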
- */ -static bool -_bt_steppage(IndexScanDesc scan, ScanDirection dir) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; - BlockNumber blkno, - lastcurrblkno; - - Assert(BTScanPosIsValid(so->currPos)); - - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _bt_killitems(scan); - - /* - * Before we modify currPos, make a copy of the page data if there was a - * mark position that needs it. - */ - if (so->markItemIndex >= 0) - { - /* bump pin on current buffer for assignment to mark buffer */ - if (BTScanPosIsPinned(so->currPos)) - IncrBufferRefCount(so->currPos.buf); - memcpy(&so->markPos, &so->currPos, - offsetof(BTScanPosData, items[1]) + - so->currPos.lastItem * sizeof(BTScanPosItem)); - if (so->markTuples) - memcpy(so->markTuples, so->currTuples, - so->currPos.nextTupleOffset); - so->markPos.itemIndex = so->markItemIndex; - so->markItemIndex = -1; - - /* - * If we're just about to start the next primitive index scan - * (possible with a scan that has arrays keys, and needs to skip to - * continue in the current scan direction), moreLeft/moreRight only - * indicate the end of the current primitive index scan. They must - * never be taken to indicate that the top-level index scan has ended - * (that would be wrong). - * - * We could handle this case by treating the current array keys as - * markPos state. But depending on the current array state like this - * would add complexity. Instead, we just unset markPos's copy of - * moreRight or moreLeft (whichever might be affected), while making - * btrestrpos reset the scan's arrays to their initial scan positions. - * In effect, btrestrpos leaves advancing the arrays up to the first - * _bt_readpage call (that takes place after it has restored markPos). - */ - if (so->needPrimScan) - { - if (ScanDirectionIsForward(so->currPos.dir)) - so->markPos.moreRight = true; - else - so->markPos.moreLeft = true; - } - - /* mark/restore not supported by parallel scans */ - Assert(!scan->parallel_scan); - } - - BTScanPosUnpinIfPinned(so->currPos); - - /* Walk to the next page with data */ - if (ScanDirectionIsForward(dir)) - blkno = so->currPos.nextPage; - else - blkno = so->currPos.prevPage; - lastcurrblkno = so->currPos.currPage; - - /* - * Cancel primitive index scans that were scheduled when the call to - * _bt_readpage for currPos happened to use the opposite direction to the - * one that we're stepping in now. (It's okay to leave the scan's array - * keys as-is, since the next _bt_readpage will advance them.) - */ - if (so->currPos.dir != dir) - so->needPrimScan = false; - - return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false); + if (newbatch->currTuples) + item->tupleOffset = baseOffset; } /* @@ -2252,73 +2114,96 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) * to stop the scan on this page by calling _bt_checkkeys against the high * key. See _bt_readpage for full details. * - * On entry, so->currPos must be pinned and locked (so offnum stays valid). + * On entry, firstbatch must be pinned and locked (so offnum stays valid). * Parallel scan callers must have seized the scan before calling here. * - * On exit, we'll have updated so->currPos and retained locks and pins + * On exit, we'll have updated firstbatch and retained locks and pins * according to the same rules as those laid out for _bt_readnextpage exit. - * Like _bt_readnextpage, our return value indicates if there are any matching - * records in the given direction. 
* * We always release the scan for a parallel scan caller, regardless of * success or failure; we'll call _bt_parallel_release as soon as possible. */ -static bool -_bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) +static IndexScanBatch +_bt_readfirstpage(IndexScanDesc scan, IndexScanBatch firstbatch, + OffsetNumber offnum, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; + BlockNumber blkno, + lastcurrblkno; + BTScanPos firstpos = firstbatch->pos; - so->numKilled = 0; /* just paranoia */ - so->markItemIndex = -1; /* ditto */ - - /* Initialize so->currPos for the first page (page in so->currPos.buf) */ + /* Initialize firstbatch's position for the first page */ if (so->needPrimScan) { Assert(so->numArrayKeys); - so->currPos.moreLeft = true; - so->currPos.moreRight = true; + firstpos->moreLeft = true; + firstpos->moreRight = true; so->needPrimScan = false; } else if (ScanDirectionIsForward(dir)) { - so->currPos.moreLeft = false; - so->currPos.moreRight = true; + firstpos->moreLeft = false; + firstpos->moreRight = true; } else { - so->currPos.moreLeft = true; - so->currPos.moreRight = false; + firstpos->moreLeft = true; + firstpos->moreRight = false; } /* * Attempt to load matching tuples from the first page. * - * Note that _bt_readpage will finish initializing the so->currPos fields. + * Note that _bt_readpage will finish initializing the firstbatch fields. * _bt_readpage also releases parallel scan (even when it returns false). */ - if (_bt_readpage(scan, dir, offnum, true)) + if (_bt_readpage(scan, firstbatch, dir, offnum, true)) { - Relation rel = scan->indexRelation; + /* _bt_readpage succeeded */ + index_batch_unlock(rel, scan->batchState && scan->batchState->dropPin, + firstbatch); + return firstbatch; + } + + /* There's no actually-matching data on the page in firstbatch->buf */ + _bt_relbuf(rel, firstbatch->buf); + firstbatch->buf = InvalidBuffer; + /* Walk to the next page with data */ + if (ScanDirectionIsForward(dir)) + blkno = firstpos->nextPage; + else + blkno = firstpos->prevPage; + lastcurrblkno = firstpos->currPage; + + Assert(firstpos->dir == dir); + + /* firstbatch will never be returned to scan, so free it outselves */ + pfree(firstbatch); + + if (blkno == P_NONE || + (ScanDirectionIsForward(dir) ? + !firstpos->moreRight : !firstpos->moreLeft)) + { /* - * _bt_readpage succeeded. Drop the lock (and maybe the pin) on - * so->currPos.buf in preparation for btgettuple returning tuples. 
+ * firstbatch _bt_readpage call ended scan in this direction (though + * if so->needPrimScan was set the scan will continue in _bt_first) */ - Assert(BTScanPosIsPinned(so->currPos)); - _bt_drop_lock_and_maybe_pin(rel, so); - return true; + pfree(firstpos); + _bt_parallel_done(scan); + return NULL; } - /* There's no actually-matching data on the page in so->currPos.buf */ - _bt_unlockbuf(scan->indexRelation, so->currPos.buf); + pfree(firstpos); - /* Call _bt_readnextpage using its _bt_steppage wrapper function */ - if (!_bt_steppage(scan, dir)) - return false; + /* parallel scan must seize the scan to get next blkno */ + if (scan->parallel_scan != NULL && + !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false)) + return NULL; /* done iff so->needPrimScan wasn't set */ - /* _bt_readpage for a later page (now in so->currPos) succeeded */ - return true; + return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false); } /* @@ -2328,102 +2213,70 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) * previously-saved right link or left link. lastcurrblkno is the page that * was current at the point where the blkno link was saved, which we use to * reason about concurrent page splits/page deletions during backwards scans. - * In the common case where seized=false, blkno is either so->currPos.nextPage - * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage. + * blkno is the prior scan position's nextPage or prevPage (depending on scan + * direction), and lastcurrblkno is the prior position's currPage. * - * On entry, so->currPos shouldn't be locked by caller. so->currPos.buf must - * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno - * won't need to be read again in almost all cases). Parallel scan callers - * that seized the scan before calling here should pass seized=true; such a - * caller's blkno and lastcurrblkno arguments come from the seized scan. - * seized=false callers just pass us the blkno/lastcurrblkno taken from their - * so->currPos, which (along with so->currPos itself) can be used to end the - * scan. A seized=false caller's blkno can never be assumed to be the page - * that must be read next during a parallel scan, though. We must figure that - * part out for ourselves by seizing the scan (the correct page to read might - * already be beyond the seized=false caller's blkno during a parallel scan, - * unless blkno/so->currPos.nextPage/so->currPos.prevPage is already P_NONE, - * or unless so->currPos.moreRight/so->currPos.moreLeft is already unset). + * On entry, no page should be locked by caller. * - * On success exit, so->currPos is updated to contain data from the next - * interesting page, and we return true. We hold a pin on the buffer on - * success exit (except during so->dropPin index scans, when we drop the pin - * eagerly to avoid blocking VACUUM). + * On success exit, returns scan batch containing data from the next + * interesting page. We hold a pin on the buffer on success exit (except + * during dropPin plain index scans, when we drop the pin eagerly to avoid + * blocking VACUUM). If there are no more matching records in the given + * direction, we just return NULL. * - * If there are no more matching records in the given direction, we invalidate - * so->currPos (while ensuring it retains no locks or pins), and return false. - * - * We always release the scan for a parallel scan caller, regardless of - * success or failure; we'll call _bt_parallel_release as soon as possible. 
+ * Parallel scan callers must seize the scan before calling here. blkno and + * lastcurrblkno should come from the seized scan. We'll release the scan as + * soon as possible. */ -static bool +static IndexScanBatch _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, - BlockNumber lastcurrblkno, ScanDirection dir, bool seized) + BlockNumber lastcurrblkno, ScanDirection dir, bool firstpage) { Relation rel = scan->indexRelation; - BTScanOpaque so = (BTScanOpaque) scan->opaque; + IndexScanBatch newbatch; + BTScanPos newpos; - Assert(so->currPos.currPage == lastcurrblkno || seized); - Assert(!(blkno == P_NONE && seized)); - Assert(!BTScanPosIsPinned(so->currPos)); + /* Allocate space for next batch */ + newbatch = index_batch_alloc(scan, MaxTIDsPerBTreePage, scan->xs_want_itup); + newbatch->pos = palloc(sizeof(BTScanPosData)); + newpos = newbatch->pos; /* - * Remember that the scan already read lastcurrblkno, a page to the left - * of blkno (or remember reading a page to the right, for backwards scans) + * pos is the first valid page to the right (or to the left) of + * lastcurrblkno. Also provisionally assume that there'll be another page + * we'll need to the right (or to the left) ahead of _bt_readpage call. */ - if (ScanDirectionIsForward(dir)) - so->currPos.moreLeft = true; - else - so->currPos.moreRight = true; + newpos->moreLeft = true; + newpos->moreRight = true; for (;;) { Page page; BTPageOpaque opaque; - if (blkno == P_NONE || - (ScanDirectionIsForward(dir) ? - !so->currPos.moreRight : !so->currPos.moreLeft)) - { - /* most recent _bt_readpage call (for lastcurrblkno) ended scan */ - Assert(so->currPos.currPage == lastcurrblkno && !seized); - BTScanPosInvalidate(so->currPos); - _bt_parallel_done(scan); /* iff !so->needPrimScan */ - return false; - } - - Assert(!so->needPrimScan); - - /* parallel scan must never actually visit so->currPos blkno */ - if (!seized && scan->parallel_scan != NULL && - !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false)) - { - /* whole scan is now done (or another primitive scan required) */ - BTScanPosInvalidate(so->currPos); - return false; - } + Assert(!((BTScanOpaque) scan->opaque)->needPrimScan); + Assert(blkno != P_NONE && lastcurrblkno != P_NONE); if (ScanDirectionIsForward(dir)) { /* read blkno, but check for interrupts first */ CHECK_FOR_INTERRUPTS(); - so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); + newbatch->buf = _bt_getbuf(rel, blkno, BT_READ); } else { /* read blkno, avoiding race (also checks for interrupts) */ - so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno, - lastcurrblkno); - if (so->currPos.buf == InvalidBuffer) + newbatch->buf = _bt_lock_and_validate_left(rel, &blkno, + lastcurrblkno); + if (newbatch->buf == InvalidBuffer) { /* must have been a concurrent deletion of leftmost page */ - BTScanPosInvalidate(so->currPos); _bt_parallel_done(scan); - return false; + return NULL; } } - page = BufferGetPage(so->currPos.buf); + page = BufferGetPage(newbatch->buf); opaque = BTPageGetOpaque(page); lastcurrblkno = blkno; if (likely(!P_IGNORE(opaque))) @@ -2431,17 +2284,17 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, /* see if there are any matches on this page */ if (ScanDirectionIsForward(dir)) { - /* note that this will clear moreRight if we can stop */ - if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized)) + if (_bt_readpage(scan, newbatch, dir, + P_FIRSTDATAKEY(opaque), firstpage)) break; - blkno = so->currPos.nextPage; + blkno = newpos->nextPage; } else { - /* note that this will clear moreLeft if 
we can stop */ - if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized)) + if (_bt_readpage(scan, newbatch, dir, + PageGetMaxOffsetNumber(page), firstpage)) break; - blkno = so->currPos.prevPage; + blkno = newpos->prevPage; } } else @@ -2456,19 +2309,36 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, } /* no matching tuples on this page */ - _bt_relbuf(rel, so->currPos.buf); - seized = false; /* released by _bt_readpage (or by us) */ + _bt_relbuf(rel, newbatch->buf); + newbatch->buf = InvalidBuffer; + + /* Continue the scan in this direction? */ + if (blkno == P_NONE || + (ScanDirectionIsForward(dir) ? + !newpos->moreRight : !newpos->moreLeft)) + { + /* + * blkno _bt_readpage call ended scan in this direction (though if + * so->needPrimScan was set the scan will continue in _bt_first) + */ + _bt_parallel_done(scan); + return NULL; + } + + /* parallel scan must seize the scan to get next blkno */ + if (scan->parallel_scan != NULL && + !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false)) + return NULL; /* done iff so->needPrimScan wasn't set */ + + firstpage = false; /* next page cannot be first */ } - /* - * _bt_readpage succeeded. Drop the lock (and maybe the pin) on - * so->currPos.buf in preparation for btgettuple returning tuples. - */ - Assert(so->currPos.currPage == blkno); - Assert(BTScanPosIsPinned(so->currPos)); - _bt_drop_lock_and_maybe_pin(rel, so); + /* _bt_readpage succeeded */ + Assert(newpos->currPage == blkno); + index_batch_unlock(rel, scan->batchState && scan->batchState->dropPin, + newbatch); - return true; + return newbatch; } /* @@ -2691,25 +2561,23 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) * Parallel scan callers must have seized the scan before calling here. * Exit conditions are the same as for _bt_first(). */ -static bool -_bt_endpoint(IndexScanDesc scan, ScanDirection dir) +static IndexScanBatch +_bt_endpoint(IndexScanDesc scan, ScanDirection dir, IndexScanBatch firstbatch) { Relation rel = scan->indexRelation; - BTScanOpaque so = (BTScanOpaque) scan->opaque; Page page; BTPageOpaque opaque; OffsetNumber start; - Assert(!BTScanPosIsValid(so->currPos)); - Assert(!so->needPrimScan); + Assert(!((BTScanOpaque) scan->opaque)->needPrimScan); /* * Scan down to the leftmost or rightmost leaf page. This is a simplified * version of _bt_search(). */ - so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir)); + firstbatch->buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir)); - if (!BufferIsValid(so->currPos.buf)) + if (!BufferIsValid(firstbatch->buf)) { /* * Empty index. Lock the whole relation, as nothing finer to lock @@ -2720,7 +2588,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) return false; } - page = BufferGetPage(so->currPos.buf); + page = BufferGetPage(firstbatch->buf); opaque = BTPageGetOpaque(page); Assert(P_ISLEAF(opaque)); @@ -2746,9 +2614,5 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) /* * Now load data from the first page of the scan. 
*/ - if (!_bt_readfirstpage(scan, start, dir)) - return false; - - _bt_returnitem(scan, so); - return true; + return _bt_readfirstpage(scan, firstbatch, start, dir); } diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index ab0f98b0287c..9872de87a2d9 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -1022,14 +1022,6 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir, * Restore the array keys to the state they were in immediately before we * were called. This ensures that the arrays only ever ratchet in the * current scan direction. - * - * Without this, scans could overlook matching tuples when the scan - * direction gets reversed just before btgettuple runs out of items to - * return, but just after _bt_readpage prepares all the items from the - * scan's final page in so->currPos. When we're on the final page it is - * typical for so->currPos to get invalidated once btgettuple finally - * returns false, which'll effectively invalidate the scan's array keys. - * That hasn't happened yet, though -- and in general it may never happen. */ _bt_start_array_keys(scan, -dir); @@ -1396,7 +1388,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, { BTScanOpaque so = (BTScanOpaque) scan->opaque; Relation rel = scan->indexRelation; - ScanDirection dir = so->currPos.dir; + ScanDirection dir = so->pos->dir; int arrayidx = 0; bool beyond_end_advance = false, skip_array_advanced = false, @@ -2033,13 +2025,13 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, * Note: We make a soft assumption that the current scan direction will * also be used within _bt_next, when it is asked to step off this page. * It is up to _bt_next to cancel this scheduled primitive index scan - * whenever it steps to a page in the direction opposite currPos.dir. + * whenever it steps to a page in the direction opposite pos->dir. */ pstate->continuescan = false; /* Tell _bt_readpage we're done... */ so->needPrimScan = true; /* ...but call _bt_first again */ if (scan->parallel_scan) - _bt_parallel_primscan_schedule(scan, so->currPos.currPage); + _bt_parallel_primscan_schedule(scan, so->pos->currPage); /* Caller's tuple doesn't match the new qual */ return false; @@ -2152,7 +2144,7 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, { TupleDesc tupdesc = RelationGetDescr(scan->indexRelation); BTScanOpaque so = (BTScanOpaque) scan->opaque; - ScanDirection dir = so->currPos.dir; + ScanDirection dir = so->pos->dir; int ikey = pstate->startikey; bool res; @@ -3302,7 +3294,7 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, int tupnatts, TupleDesc tupdesc) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - ScanDirection dir = so->currPos.dir; + ScanDirection dir = so->pos->dir; OffsetNumber aheadoffnum; IndexTuple ahead; @@ -3376,69 +3368,67 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, * current page and killed tuples thereon (generally, this should only be * called if so->numKilled > 0). * - * Caller should not have a lock on the so->currPos page, but must hold a - * buffer pin when !so->dropPin. When we return, it still won't be locked. - * It'll continue to hold whatever pins were held before calling here. + * Caller should not have a lock on the batch position's page, but must hold a + * buffer pin when !dropPin. When we return, it still won't be locked. 
It'll + * continue to hold whatever pins were held before calling here. * * We match items by heap TID before assuming they are the right ones to set * LP_DEAD. If the scan is one that holds a buffer pin on the target page * continuously from initially reading the items until applying this function - * (if it is a !so->dropPin scan), VACUUM cannot have deleted any items on the + * (if it is a !dropPin scan), VACUUM cannot have deleted any items on the * page, so the page's TIDs can't have been recycled by now. There's no risk * that we'll confuse a new index tuple that happens to use a recycled TID * with a now-removed tuple with the same TID (that used to be on this same * page). We can't rely on that during scans that drop buffer pins eagerly - * (so->dropPin scans), though, so we must condition setting LP_DEAD bits on + * (i.e. dropPin scans), though, so we must condition setting LP_DEAD bits on * the page LSN having not changed since back when _bt_readpage saw the page. * We totally give up on setting LP_DEAD bits when the page LSN changed. * - * We give up much less often during !so->dropPin scans, but it still happens. + * We tend to give up less often during !dropPin scans, but it still happens. * We cope with cases where items have moved right due to insertions. If an * item has moved off the current page due to a split, we'll fail to find it * and just give up on it. */ void -_bt_killitems(IndexScanDesc scan) +_bt_killitems(IndexScanDesc scan, IndexScanBatch batch) { Relation rel = scan->indexRelation; - BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPos pos = (BTScanPos) batch->pos; Page page; BTPageOpaque opaque; OffsetNumber minoff; OffsetNumber maxoff; - int numKilled = so->numKilled; + int numKilled = batch->numKilled; bool killedsomething = false; Buffer buf; Assert(numKilled > 0); - Assert(BTScanPosIsValid(so->currPos)); + Assert(BTScanPosIsValid(*pos)); Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */ - /* Always invalidate so->killedItems[] before leaving so->currPos */ - so->numKilled = 0; + /* Always invalidate batch->killedItems[] before freeing batch */ + batch->numKilled = 0; - if (!so->dropPin) + if (!scan->batchState->dropPin) { /* * We have held the pin on this page since we read the index tuples, * so all we need to do is lock it. The pin will have prevented * concurrent VACUUMs from recycling any of the TIDs on the page. 
*/ - Assert(BTScanPosIsPinned(so->currPos)); - buf = so->currPos.buf; + buf = batch->buf; _bt_lockbuf(rel, buf, BT_READ); } else { XLogRecPtr latestlsn; - Assert(!BTScanPosIsPinned(so->currPos)); Assert(RelationNeedsWAL(rel)); - buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ); + buf = _bt_getbuf(rel, pos->currPage, BT_READ); latestlsn = BufferGetLSNAtomic(buf); - Assert(so->currPos.lsn <= latestlsn); - if (so->currPos.lsn != latestlsn) + Assert(batch->lsn <= latestlsn); + if (batch->lsn != latestlsn) { /* Modified, give up on hinting */ _bt_relbuf(rel, buf); @@ -3455,12 +3445,11 @@ _bt_killitems(IndexScanDesc scan) for (int i = 0; i < numKilled; i++) { - int itemIndex = so->killedItems[i]; - BTScanPosItem *kitem = &so->currPos.items[itemIndex]; + int itemIndex = batch->killedItems[i]; + IndexScanBatchPosItem *kitem = &batch->items[itemIndex]; OffsetNumber offnum = kitem->indexOffset; - Assert(itemIndex >= so->currPos.firstItem && - itemIndex <= so->currPos.lastItem); + Assert(itemIndex >= batch->firstItem && itemIndex <= batch->lastItem); if (offnum < minoff) continue; /* pure paranoia */ while (offnum <= maxoff) @@ -3485,7 +3474,7 @@ _bt_killitems(IndexScanDesc scan) * correctness. * * Note that the page may have been modified in almost any way - * since we first read it (in the !so->dropPin case), so it's + * since we first read it (in the !dropPin case), so it's * possible that this posting list tuple wasn't a posting list * tuple when we first encountered its heap TIDs. */ @@ -3501,7 +3490,8 @@ _bt_killitems(IndexScanDesc scan) * though only in the common case where the page can't * have been concurrently modified */ - Assert(kitem->indexOffset == offnum || !so->dropPin); + Assert(kitem->indexOffset == offnum || + !scan->batchState->dropPin); /* * Read-ahead to later kitems here. @@ -3518,7 +3508,7 @@ _bt_killitems(IndexScanDesc scan) * correctly -- posting tuple still gets killed). 
*/ if (pi < numKilled) - kitem = &so->currPos.items[so->killedItems[pi++]]; + kitem = &batch->items[batch->killedItems[pi++]]; } /* @@ -3568,7 +3558,7 @@ _bt_killitems(IndexScanDesc scan) MarkBufferDirtyHint(buf, true); } - if (!so->dropPin) + if (!scan->batchState->dropPin) _bt_unlockbuf(rel, buf); else _bt_relbuf(rel, buf); diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index 87c31da71a52..9cbb77438a81 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -90,8 +90,7 @@ spghandler(PG_FUNCTION_ARGS) amroutine->amgettuple = spggettuple; amroutine->amgetbitmap = spggetbitmap; amroutine->amendscan = spgendscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; + amroutine->amposreset = NULL; amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 5712fac36971..6d895e4ffce7 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -880,7 +880,7 @@ DefineIndex(Oid tableId, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("access method \"%s\" does not support multicolumn indexes", accessMethodName))); - if (exclusion && amRoutine->amgettuple == NULL) + if (exclusion && amRoutine->amgettuple == NULL && amRoutine->amgetbatch == NULL) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("access method \"%s\" does not support exclusion constraints", diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index 1d0e8ad57b4a..ac337d9000f1 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -428,7 +428,7 @@ ExecSupportsMarkRestore(Path *pathnode) case T_IndexOnlyScan: /* - * Not all index types support mark/restore. + * Not all index types support restoring a mark */ return castNode(IndexPath, pathnode)->indexinfo->amcanmarkpos; diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index f464cca9507a..5e7bafe07a34 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -49,7 +49,13 @@ static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node); static void StoreIndexTuple(IndexOnlyScanState *node, TupleTableSlot *slot, IndexTuple itup, TupleDesc itupdesc); +static bool ios_prefetch_block(IndexScanDesc scan, void *arg, + IndexScanBatchPos *pos); +/* values stored in ios_prefetch_block in the batch cache */ +#define IOS_UNKNOWN_VISIBILITY 0 /* default value */ +#define IOS_ALL_VISIBLE 1 +#define IOS_NOT_ALL_VISIBLE 2 /* ---------------------------------------------------------------- * IndexOnlyNext @@ -103,6 +109,17 @@ IndexOnlyNext(IndexOnlyScanState *node) node->ioss_ScanDesc->xs_want_itup = true; node->ioss_VMBuffer = InvalidBuffer; + /* + * Set the prefetch callback info, if the scan has batching enabled + * (we only know what after index_beginscan, which also checks which + * callbacks are defined for the AM. + */ + if (scandesc->batchState != NULL) + { + scandesc->batchState->prefetch = ios_prefetch_block; + scandesc->batchState->prefetchArg = (void *) node; + } + /* * If no run-time keys to calculate or they are ready, go ahead and * pass the scankeys to the index AM. 
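The hunk above installs ios_prefetch_block as the scan's prefetch callback. For reference, the contract implied by the IndexPrefetchCallback typedef that this patch adds to relscan.h is small: the callback receives the scan, the prefetchArg pointer, and a batch position, and returning true appears to request prefetching of that item's heap block while false skips it (ios_prefetch_block returns false for all-visible pages). A minimal sketch of such a callback follows; the always_prefetch_block name is a hypothetical placeholder, and a scan that wants every block prefetched presumably just leaves batchState->prefetch set to NULL rather than installing anything like this.

static bool
always_prefetch_block(IndexScanDesc scan, void *arg, IndexScanBatchPos *pos)
{
	/* request prefetching of the heap block for every batch item */
	return true;
}
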
@@ -120,10 +137,42 @@ IndexOnlyNext(IndexOnlyScanState *node) */ while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { + bool all_visible; bool tuple_from_heap = false; CHECK_FOR_INTERRUPTS(); + /* + * Without batching, inspect the VM directly. With batching, we need + * to retrieve the visibility information seen by the read_stream + * callback (or rather by ios_prefetch_block), otherwise the + * read_stream might get out of sync (if the VM got updated since + * then). + */ + if (scandesc->batchState == NULL) + { + all_visible = VM_ALL_VISIBLE(scandesc->heapRelation, + ItemPointerGetBlockNumber(tid), + &node->ioss_VMBuffer); + } + else + { + /* + * Reuse the previously determined page visibility info, or + * calculate it now. If we decided not to prefetch the block, the + * page had to be all-visible at that point. The VM bit might have + * changed since then, but the tuple visibility could not have. + * + * XXX It's a bit weird we use the visibility to decide if we + * should skip prefetching the block, and then deduce the + * visibility from that (even if it matches pretty clearly). But + * maybe we could/should have a more direct way to read the + * private state? + */ + all_visible = !ios_prefetch_block(scandesc, node, + &scandesc->batchState->readPos); + } + /* * We can skip the heap fetch if the TID references a heap page on * which all tuples are known visible to everybody. In any case, @@ -158,9 +207,7 @@ IndexOnlyNext(IndexOnlyScanState *node) * It's worth going through this complexity to avoid needing to lock * the VM buffer, which could cause significant contention. */ - if (!VM_ALL_VISIBLE(scandesc->heapRelation, - ItemPointerGetBlockNumber(tid), - &node->ioss_VMBuffer)) + if (!all_visible) { /* * Rats, we have to visit the heap to check visibility. @@ -889,3 +936,51 @@ ExecIndexOnlyScanRetrieveInstrumentation(IndexOnlyScanState *node) node->ioss_SharedInfo = palloc(size); memcpy(node->ioss_SharedInfo, SharedInfo, size); } + +/* FIXME duplicate from indexam.c */ +#define INDEX_SCAN_BATCH(scan, idx) \ + ((scan)->batchState->batches[(idx) % (scan)->batchState->maxBatches]) + +/* + * ios_prefetch_block + * Callback to only prefetch blocks that are not all-visible. + * + * We don't want to inspect the visibility map repeatedly, so the result of + * VM_ALL_VISIBLE is stored in the batch private data. The values are set + * to 0 by default, so we use two constants to remember if all-visible or + * not all-visible. + * + * However, this is not merely a question of performance. The VM may get + * modified during the scan, and we need to make sure the two places (the + * read_next callback and the index_fetch_heap here) make the same decision, + * otherwise we might get out of sync with the stream. For example, the + * callback might find a page is all-visible (and skips reading the block), + * and then someone might update the page, resetting the VM bit. If this + * place attempts to read the page from the stream, it'll fail because it + * will probably receive an entirely different page. 
+ */ +static bool +ios_prefetch_block(IndexScanDesc scan, void *arg, IndexScanBatchPos *pos) +{ + IndexOnlyScanState *node = (IndexOnlyScanState *) arg; + IndexScanBatch batch = INDEX_SCAN_BATCH(scan, pos->batch); + + if (batch->itemsvisibility == NULL) + batch->itemsvisibility = palloc0(sizeof(char) * (batch->lastItem + 1)); + + if (batch->itemsvisibility[pos->index] == IOS_UNKNOWN_VISIBILITY) + { + bool all_visible; + ItemPointer tid = &batch->items[pos->index].heapTid; + + all_visible = VM_ALL_VISIBLE(scan->heapRelation, + ItemPointerGetBlockNumber(tid), + &node->ioss_VMBuffer); + + batch->itemsvisibility[pos->index] = + all_visible ? IOS_ALL_VISIBLE : IOS_NOT_ALL_VISIBLE; + } + + /* prefetch only blocks that are not all-visible */ + return (batch->itemsvisibility[pos->index] == IOS_NOT_ALL_VISIBLE); +} diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 94077e6a006d..ab2756d47aaa 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -144,6 +144,7 @@ int max_parallel_workers_per_gather = 2; bool enable_seqscan = true; bool enable_indexscan = true; +bool enable_indexscan_prefetch = true; bool enable_indexonlyscan = true; bool enable_bitmapscan = true; bool enable_tidscan = true; diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index f4b7343daced..a315439dcdd4 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -313,11 +313,11 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->amsearcharray = amroutine->amsearcharray; info->amsearchnulls = amroutine->amsearchnulls; info->amcanparallel = amroutine->amcanparallel; - info->amhasgettuple = (amroutine->amgettuple != NULL); + info->amhasgettuple = (amroutine->amgettuple != NULL || + amroutine->amgetbatch != NULL); info->amhasgetbitmap = amroutine->amgetbitmap != NULL && relation->rd_tableam->scan_bitmap_next_tuple != NULL; - info->amcanmarkpos = (amroutine->ammarkpos != NULL && - amroutine->amrestrpos != NULL); + info->amcanmarkpos = amroutine->amposreset != NULL; info->amcostestimate = amroutine->amcostestimate; Assert(info->amcostestimate != NULL); diff --git a/src/backend/replication/logical/relation.c b/src/backend/replication/logical/relation.c index 745fd3bab640..de6c66a15241 100644 --- a/src/backend/replication/logical/relation.c +++ b/src/backend/replication/logical/relation.c @@ -888,7 +888,8 @@ IsIndexUsableForReplicaIdentityFull(Relation idxrel, AttrMap *attrmap) * The given index access method must implement "amgettuple", which will * be used later to fetch the tuples. See RelationFindReplTupleByIndex(). */ - if (GetIndexAmRoutineByAmId(idxrel->rd_rel->relam, false)->amgettuple == NULL) + if (GetIndexAmRoutineByAmId(idxrel->rd_rel->relam, false)->amgettuple == NULL && + GetIndexAmRoutineByAmId(idxrel->rd_rel->relam, false)->amgetbatch == NULL) return false; return true; diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index 031fde9f4cbe..e34e60060604 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -99,6 +99,7 @@ struct ReadStream int16 forwarded_buffers; int16 pinned_buffers; int16 distance; + int16 distance_old; int16 initialized_buffers; int read_buffers_flags; bool sync_mode; /* using io_method=sync */ @@ -464,6 +465,7 @@ read_stream_look_ahead(ReadStream *stream) if (blocknum == InvalidBlockNumber) { /* End of stream. 
*/ + stream->distance_old = stream->distance; stream->distance = 0; break; } @@ -862,6 +864,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) else { /* No more blocks, end of stream. */ + stream->distance_old = stream->distance; stream->distance = 0; stream->oldest_buffer_index = stream->next_buffer_index; stream->pinned_buffers = 0; @@ -1046,6 +1049,9 @@ read_stream_reset(ReadStream *stream) int16 index; Buffer buffer; + /* remember the old distance (if we reset before end of the stream) */ + stream->distance_old = Max(stream->distance, stream->distance_old); + /* Stop looking ahead. */ stream->distance = 0; @@ -1078,8 +1084,12 @@ read_stream_reset(ReadStream *stream) Assert(stream->pinned_buffers == 0); Assert(stream->ios_in_progress == 0); - /* Start off assuming data is cached. */ - stream->distance = 1; + /* + * Restore the old distance, if we have one. Otherwise start assuming data + * is cached. + */ + stream->distance = Max(1, stream->distance_old); + stream->distance_old = 0; } /* diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index e8544acb7841..ab044b05e055 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1538,6 +1538,46 @@ ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait) return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait); } +/* + * Check if the buffer is already undergoing read AIO. If it is, assign the + * IO's wait reference to operation->io_wref, thereby allowing the caller to + * wait for that IO. + */ +static inline bool +ReadBuffersIOAlreadyInProgress(ReadBuffersOperation *operation, Buffer buffer) +{ + BufferDesc *desc; + uint32 buf_state; + PgAioWaitRef iow; + + pgaio_wref_clear(&iow); + + if (BufferIsLocal(buffer)) + { + desc = GetLocalBufferDescriptor(-buffer - 1); + buf_state = pg_atomic_read_u32(&desc->state); + if ((buf_state & BM_IO_IN_PROGRESS) && !(buf_state & BM_VALID)) + iow = desc->io_wref; + } + else + { + desc = GetBufferDescriptor(buffer - 1); + buf_state = LockBufHdr(desc); + + if ((buf_state & BM_IO_IN_PROGRESS) && !(buf_state & BM_VALID)) + iow = desc->io_wref; + UnlockBufHdr(desc, buf_state); + } + + if (pgaio_wref_valid(&iow)) + { + operation->io_wref = iow; + return true; + } + + return false; +} + /* * Helper for AsyncReadBuffers that tries to get the buffer ready for IO. */ @@ -1670,7 +1710,7 @@ WaitReadBuffers(ReadBuffersOperation *operation) * * we first check if we already know the IO is complete. */ - if (aio_ret->result.status == PGAIO_RS_UNKNOWN && + if ((operation->foreign_io || aio_ret->result.status == PGAIO_RS_UNKNOWN) && !pgaio_wref_check_done(&operation->io_wref)) { instr_time io_start = pgstat_prepare_io_time(track_io_timing); @@ -1689,11 +1729,66 @@ WaitReadBuffers(ReadBuffersOperation *operation) Assert(pgaio_wref_check_done(&operation->io_wref)); } - /* - * We now are sure the IO completed. Check the results. This - * includes reporting on errors if there were any. 
- */ - ProcessReadBuffersResult(operation); + if (unlikely(operation->foreign_io)) + { + Buffer buffer = operation->buffers[operation->nblocks_done]; + BufferDesc *desc; + uint32 buf_state; + + if (BufferIsLocal(buffer)) + { + desc = GetLocalBufferDescriptor(-buffer - 1); + buf_state = pg_atomic_read_u32(&desc->state); + } + else + { + desc = GetBufferDescriptor(buffer - 1); + buf_state = LockBufHdr(desc); + UnlockBufHdr(desc, buf_state); + } + + if (buf_state & BM_VALID) + { + operation->nblocks_done += 1; + Assert(operation->nblocks_done <= operation->nblocks); + + /* + * Report and track this as a 'hit' for this backend, even + * though it must have started out as a miss in + * PinBufferForBlock(). The other backend (or ourselves, + * as part of a read started earlier) will track this as a + * 'read'. + */ + TRACE_POSTGRESQL_BUFFER_READ_DONE(operation->forknum, + operation->blocknum + operation->nblocks_done, + operation->smgr->smgr_rlocator.locator.spcOid, + operation->smgr->smgr_rlocator.locator.dbOid, + operation->smgr->smgr_rlocator.locator.relNumber, + operation->smgr->smgr_rlocator.backend, + true); + + if (BufferIsLocal(buffer)) + pgBufferUsage.local_blks_hit += 1; + else + pgBufferUsage.shared_blks_hit += 1; + + if (operation->rel) + pgstat_count_buffer_hit(operation->rel); + + pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0); + + if (VacuumCostActive) + VacuumCostBalance += VacuumCostPageHit; + } + } + else + { + /* + * We now are sure the IO completed. Check the results. This + * includes reporting on errors if there were any. + */ + ProcessReadBuffersResult(operation); + } } /* @@ -1779,6 +1874,43 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) io_object = IOOBJECT_RELATION; } + /* + * If AIO is in progress, be it in this backend or another backend, we + * just associate the wait reference with the operation and wait in + * WaitReadBuffers(). This turns out to be important for performance in + * two workloads: + * + * 1) A read stream that has to read the same block multiple times within + * the readahead distance. This can happen e.g. for the table accesses of + * an index scan. + * + * 2) Concurrent scans by multiple backends on the same relation. + * + * If we were to synchronously wait for the in-progress IO, we'd not be + * able to keep enough I/O in flight. + * + * If we do find there is ongoing I/O for the buffer, we set up a 1-block + * ReadBuffersOperation that WaitReadBuffers then can wait on. + * + * It's possible that another backend starts IO on the buffer between this + * check and the ReadBuffersCanStartIO(nowait = false) below. In that case + * we will synchronously wait for the IO below, but the window for that is + * small enough that it won't happen often enough to have a significant + * performance impact. + */ + if (ReadBuffersIOAlreadyInProgress(operation, buffers[nblocks_done])) + { + *nblocks_progress = 1; + operation->foreign_io = true; + + CheckReadBuffersOperation(operation, false); + + + return true; + } + + operation->foreign_io = false; + /* * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR * flag. The reason for that is that, hopefully, zero_damaged_pages isn't @@ -1836,9 +1968,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) /* * Check if we can start IO on the first to-be-read buffer. * - * If an I/O is already in progress in another backend, we want to wait - * for the outcome: either done, or something went wrong and we will - * retry. 
+ * If a synchronous I/O is in progress in another backend (it can't be + * this backend), we want to wait for the outcome: either done, or + * something went wrong and we will retry. */ if (!ReadBuffersCanStartIO(buffers[nblocks_done], false)) { diff --git a/src/backend/utils/adt/amutils.c b/src/backend/utils/adt/amutils.c index 0af26d6acfab..1ebe0a76af43 100644 --- a/src/backend/utils/adt/amutils.c +++ b/src/backend/utils/adt/amutils.c @@ -363,7 +363,7 @@ indexam_property(FunctionCallInfo fcinfo, PG_RETURN_BOOL(routine->amclusterable); case AMPROP_INDEX_SCAN: - PG_RETURN_BOOL(routine->amgettuple ? true : false); + PG_RETURN_BOOL(routine->amgettuple || routine->amgetbatch ? true : false); case AMPROP_BITMAP_SCAN: PG_RETURN_BOOL(routine->amgetbitmap ? true : false); @@ -392,7 +392,7 @@ indexam_property(FunctionCallInfo fcinfo, PG_RETURN_BOOL(routine->amcanmulticol); case AMPROP_CAN_EXCLUDE: - PG_RETURN_BOOL(routine->amgettuple ? true : false); + PG_RETURN_BOOL(routine->amgettuple || routine->amgetbatch ? true : false); case AMPROP_CAN_INCLUDE: PG_RETURN_BOOL(routine->amcaninclude); diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index cb23ad527826..652bb4c537d5 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6782,6 +6782,10 @@ get_actual_variable_endpoint(Relation heapRel, * a huge amount of time here, so we give up once we've read too many heap * pages. When we fail for that reason, the caller will end up using * whatever extremal value is recorded in pg_statistic. + * + * XXX We're not using ios_prefetch_block here. That creates a window + * where the scan's read stream can get out of sync. At a minimum we'll + * need to close this window by explicitly disabling heap I/O prefetching. 
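+	 * One conceivable way to close the window (a sketch only, not verified)
+	 * might be to force plain heap fetches for this scan by marking its
+	 * batch state as having prefetching disabled before the first fetch:
+	 *
+	 *		if (index_scan->batchState != NULL)
+	 *			index_scan->batchState->disabled = true;
+	 *
+	 * This assumes the "disabled" flag can simply be set from outside the
+	 * batching machinery; a dedicated index_batch_* helper may be a better
+	 * fit.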
*/ InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(heapRel)); diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index d6fc83338505..fc0caf840827 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -54,6 +54,13 @@ boot_val => 'true', }, +{ name => 'enable_indexscan_prefetch', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables prefetching for index scans and index-only-scans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_indexscan_prefetch', + boot_val => 'true', +}, + { name => 'enable_bitmapscan', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', short_desc => 'Enables the planner\'s use of bitmap-scan plans.', flags => 'GUC_EXPLAIN', diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index f62b61967ef6..20a0ffaa5385 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -412,6 +412,7 @@ #enable_incremental_sort = on #enable_indexscan = on #enable_indexonlyscan = on +#enable_indexscan_prefetch = on #enable_material = on #enable_memoize = on #enable_mergejoin = on diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 63dd41c1f21b..3a651744ef9a 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -198,6 +198,15 @@ typedef void (*amrescan_function) (IndexScanDesc scan, typedef bool (*amgettuple_function) (IndexScanDesc scan, ScanDirection direction); +/* next batch of valid tuples */ +typedef IndexScanBatch(*amgetbatch_function) (IndexScanDesc scan, + IndexScanBatch batch, + ScanDirection direction); + +/* release batch of valid tuples */ +typedef void (*amfreebatch_function) (IndexScanDesc scan, + IndexScanBatch batch); + /* fetch all valid tuples */ typedef int64 (*amgetbitmap_function) (IndexScanDesc scan, TIDBitmap *tbm); @@ -205,11 +214,9 @@ typedef int64 (*amgetbitmap_function) (IndexScanDesc scan, /* end index scan */ typedef void (*amendscan_function) (IndexScanDesc scan); -/* mark current scan position */ -typedef void (*ammarkpos_function) (IndexScanDesc scan); - -/* restore marked scan position */ -typedef void (*amrestrpos_function) (IndexScanDesc scan); +/* invalidate index AM state that independently tracks scan's position */ +typedef void (*amposreset_function) (IndexScanDesc scan, + IndexScanBatch batch); /* * Callback function signatures - for parallel index scans. 
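To make the shape of the new callbacks concrete: a batch-capable AM's handler would wire them up roughly as sketched below, mirroring the nbtree declarations (btgetbatch, btfreebatch, btposreset) elsewhere in this patch. The myam* names are hypothetical placeholders, not part of the patch; an AM that keeps using amgettuple simply leaves amgetbatch, amfreebatch and amposreset set to NULL, as the bloom, BRIN, GIN, GiST, hash, SP-GiST and dummy_index_am handlers do here. Providing amposreset is what lets the planner treat the index as capable of restoring a mark (see the plancat.c change above).

Datum
myamhandler(PG_FUNCTION_ARGS)
{
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	/* ... fill in the usual am* fields and callbacks here ... */

	amroutine->amgettuple = NULL;			/* batch interface used instead */
	amroutine->amgetbatch = myamgetbatch;	/* return next batch of matches */
	amroutine->amfreebatch = myamfreebatch; /* release a finished batch */
	amroutine->amgetbitmap = myamgetbitmap;
	amroutine->amendscan = myamendscan;
	amroutine->amposreset = myamposreset;	/* restoring a mark is safe */

	PG_RETURN_POINTER(amroutine);
}
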
@@ -309,10 +316,11 @@ typedef struct IndexAmRoutine ambeginscan_function ambeginscan; amrescan_function amrescan; amgettuple_function amgettuple; /* can be NULL */ + amgetbatch_function amgetbatch; /* can be NULL */ + amfreebatch_function amfreebatch; /* can be NULL */ amgetbitmap_function amgetbitmap; /* can be NULL */ amendscan_function amendscan; - ammarkpos_function ammarkpos; /* can be NULL */ - amrestrpos_function amrestrpos; /* can be NULL */ + amposreset_function amposreset; /* can be NULL */ /* interface functions to support parallel index scans */ amestimateparallelscan_function amestimateparallelscan; /* can be NULL */ diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 9200a22bd9f9..913945c4b08a 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -15,6 +15,7 @@ #define GENAM_H #include "access/htup.h" +#include "access/itup.h" #include "access/sdir.h" #include "access/skey.h" #include "nodes/tidbitmap.h" @@ -115,6 +116,7 @@ typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr, void *state); /* struct definitions appear in relscan.h */ typedef struct IndexScanDescData *IndexScanDesc; +typedef struct IndexScanBatchData *IndexScanBatch; typedef struct SysScanDescData *SysScanDesc; typedef struct ParallelIndexScanDescData *ParallelIndexScanDesc; @@ -228,6 +230,9 @@ extern void index_store_float8_orderby_distances(IndexScanDesc scan, bool recheckOrderBy); extern bytea *index_opclass_options(Relation indrel, AttrNumber attnum, Datum attoptions, bool validate); +extern IndexScanBatch index_batch_alloc(IndexScanDesc scan, int maxitems, bool want_itup); +extern void index_batch_release(IndexScanDesc scan, IndexScanBatch batch); +extern void index_batch_unlock(Relation rel, bool dropPin, IndexScanBatch batch); /* diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 909db73b7bbb..744ad2fac145 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -117,6 +117,7 @@ typedef struct IndexFetchHeapData IndexFetchTableData xs_base; /* AM independent part of the descriptor */ Buffer xs_cbuf; /* current heap buffer in scan, if any */ + BlockNumber xs_blk; /* xs_cbuf's block number, if any */ /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ } IndexFetchHeapData; diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 16be5c7a9c15..119705f64b81 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -939,10 +939,10 @@ typedef BTVacuumPostingData *BTVacuumPosting; * processing. This approach minimizes lock/unlock traffic. We must always * drop the lock to make it okay for caller to process the returned items. * Whether or not we can also release the pin during this window will vary. - * We drop the pin (when so->dropPin) to avoid blocking progress by VACUUM - * (see nbtree/README section about making concurrent TID recycling safe). - * We'll always release both the lock and the pin on the current page before - * moving on to its sibling page. + * We drop the pin (when dropPin is set in batch state) to avoid blocking + * progress by VACUUM (see nbtree/README section about making concurrent TID + * recycling safe). We'll always release both the lock and the pin on the + * current page before moving on to its sibling page. * * If we are doing an index-only scan, we save the entire IndexTuple for each * matched item, otherwise only its heap TID and offset. 
The IndexTuples go @@ -961,74 +961,25 @@ typedef struct BTScanPosItem /* what we remember about each match */ typedef struct BTScanPosData { - Buffer buf; /* currPage buf (invalid means unpinned) */ - /* page details as of the saved position's call to _bt_readpage */ BlockNumber currPage; /* page referenced by items array */ BlockNumber prevPage; /* currPage's left link */ BlockNumber nextPage; /* currPage's right link */ - XLogRecPtr lsn; /* currPage's LSN (when so->dropPin) */ /* scan direction for the saved position's call to _bt_readpage */ ScanDirection dir; - /* - * If we are doing an index-only scan, nextTupleOffset is the first free - * location in the associated tuple storage workspace. - */ - int nextTupleOffset; - /* * moreLeft and moreRight track whether we think there may be matching * index entries to the left and right of the current page, respectively. */ bool moreLeft; bool moreRight; - - /* - * The items array is always ordered in index order (ie, increasing - * indexoffset). When scanning backwards it is convenient to fill the - * array back-to-front, so we start at the last slot and fill downwards. - * Hence we need both a first-valid-entry and a last-valid-entry counter. - * itemIndex is a cursor showing which entry was last returned to caller. - */ - int firstItem; /* first valid index in items[] */ - int lastItem; /* last valid index in items[] */ - int itemIndex; /* current index in items[] */ - - BTScanPosItem items[MaxTIDsPerBTreePage]; /* MUST BE LAST */ } BTScanPosData; typedef BTScanPosData *BTScanPos; -#define BTScanPosIsPinned(scanpos) \ -( \ - AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ - !BufferIsValid((scanpos).buf)), \ - BufferIsValid((scanpos).buf) \ -) -#define BTScanPosUnpin(scanpos) \ - do { \ - ReleaseBuffer((scanpos).buf); \ - (scanpos).buf = InvalidBuffer; \ - } while (0) -#define BTScanPosUnpinIfPinned(scanpos) \ - do { \ - if (BTScanPosIsPinned(scanpos)) \ - BTScanPosUnpin(scanpos); \ - } while (0) - -#define BTScanPosIsValid(scanpos) \ -( \ - AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ - !BufferIsValid((scanpos).buf)), \ - BlockNumberIsValid((scanpos).currPage) \ -) -#define BTScanPosInvalidate(scanpos) \ - do { \ - (scanpos).buf = InvalidBuffer; \ - (scanpos).currPage = InvalidBlockNumber; \ - } while (0) +#define BTScanPosIsValid(scanpos) BlockNumberIsValid((scanpos).currPage) /* We need one of these for each equality-type SK_SEARCHARRAY scan key */ typedef struct BTArrayKeyInfo @@ -1066,32 +1017,7 @@ typedef struct BTScanOpaqueData BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */ FmgrInfo *orderProcs; /* ORDER procs for required equality keys */ MemoryContext arrayContext; /* scan-lifespan context for array data */ - - /* info about killed items if any (killedItems is NULL if never used) */ - int *killedItems; /* currPos.items indexes of killed items */ - int numKilled; /* number of currently stored items */ - bool dropPin; /* drop leaf pin before btgettuple returns? */ - - /* - * If we are doing an index-only scan, these are the tuple storage - * workspaces for the currPos and markPos respectively. Each is of size - * BLCKSZ, so it can hold as much as a full page's worth of tuples. 
- */ - char *currTuples; /* tuple storage for currPos */ - char *markTuples; /* tuple storage for markPos */ - - /* - * If the marked position is on the same page as current position, we - * don't use markPos, but just keep the marked itemIndex in markItemIndex - * (all the rest of currPos is valid for the mark position). Hence, to - * determine if there is a mark, first look at markItemIndex, then at - * markPos. - */ - int markItemIndex; /* itemIndex, or -1 if not valid */ - - /* keep these last in struct for efficiency */ - BTScanPosData currPos; /* current position data */ - BTScanPosData markPos; /* marked position, if any */ + BTScanPos pos; } BTScanOpaqueData; typedef BTScanOpaqueData *BTScanOpaque; @@ -1191,14 +1117,15 @@ extern bool btinsert(Relation rel, Datum *values, bool *isnull, extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys); extern Size btestimateparallelscan(Relation rel, int nkeys, int norderbys); extern void btinitparallelscan(void *target); -extern bool btgettuple(IndexScanDesc scan, ScanDirection dir); +extern IndexScanBatch btgetbatch(IndexScanDesc scan, IndexScanBatch batch, + ScanDirection dir); extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys); +extern void btfreebatch(IndexScanDesc scan, IndexScanBatch batch); extern void btparallelrescan(IndexScanDesc scan); extern void btendscan(IndexScanDesc scan); -extern void btmarkpos(IndexScanDesc scan); -extern void btrestrpos(IndexScanDesc scan); +extern void btposreset(IndexScanDesc scan, IndexScanBatch markbatch); extern IndexBulkDeleteResult *btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, @@ -1306,8 +1233,9 @@ extern BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access); extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate); extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum); -extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); -extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); +extern IndexScanBatch _bt_first(IndexScanDesc scan, ScanDirection dir); +extern IndexScanBatch _bt_next(IndexScanDesc scan, ScanDirection dir, + IndexScanBatch priorbatch); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost); /* @@ -1327,7 +1255,7 @@ extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arra extern bool _bt_scanbehind_checkkeys(IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup); extern void _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate); -extern void _bt_killitems(IndexScanDesc scan); +extern void _bt_killitems(IndexScanDesc scan, IndexScanBatch batch); extern BTCycleId _bt_vacuum_cycleid(Relation rel); extern BTCycleId _bt_start_vacuum(Relation rel); extern void _bt_end_vacuum(Relation rel); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index b5e0fb386c0a..eb306a9dfb0f 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -16,9 +16,11 @@ #include "access/htup_details.h" #include "access/itup.h" +#include "access/sdir.h" #include "nodes/tidbitmap.h" #include "port/atomics.h" #include "storage/buf.h" +#include "storage/read_stream.h" #include "storage/relfilelocator.h" #include "storage/spin.h" #include "utils/relcache.h" @@ -121,10 +123,161 @@ typedef struct ParallelBlockTableScanWorkerData 
*ParallelBlockTableScanWorker; typedef struct IndexFetchTableData { Relation rel; + ReadStream *rs; } IndexFetchTableData; struct IndexScanInstrumentation; +/* Forward declaration, the prefetch callback needs IndexScanDescData. */ +typedef struct IndexScanBatchData IndexScanBatchData; + +typedef struct IndexScanBatchPosItem /* what we remember about each match */ +{ + ItemPointerData heapTid; /* TID of referenced heap item */ + OffsetNumber indexOffset; /* index item's location within page */ + LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */ +} IndexScanBatchPosItem; + +/* + * Data about one batch of items returned by the index AM + */ +typedef struct IndexScanBatchData +{ + Buffer buf; /* currPage buf (invalid means unpinned) */ + XLogRecPtr lsn; /* currPage's LSN (when dropPin) */ + + /* + * AM-specific state representing the current position of the scan within + * the index + */ + void *pos; + + /* + * The items array is always ordered in index order (ie, increasing + * indexoffset). When scanning backwards it is convenient to fill the + * array back-to-front, so we start at the last slot and fill downwards. + * Hence we need both a first-valid-entry and a last-valid-entry counter. + */ + int firstItem; /* first valid index in items[] */ + int lastItem; /* last valid index in items[] */ + + /* info about killed items if any (killedItems is NULL if never used) */ + int *killedItems; /* indexes of killed items */ + int numKilled; /* number of currently stored items */ + + /* + * If we are doing an index-only scan, these are the tuple storage + * workspaces for the matching tuples (tuples referenced by items[]). Each + * is of size BLCKSZ, so it can hold as much as a full page's worth of + * tuples. + * + * XXX maybe currTuples should be part of the am-specific per-batch state + * stored in "position" field? + */ + char *currTuples; /* tuple storage for items[] */ + + /* + * batch contents (TIDs, index tuples, kill bitmap, ...) + * + * XXX Shouldn't this be part of the "IndexScanBatchPosItem" struct? To + * keep everything in one place? Or why should we have separate arrays? + * One advantage is that we don't need to allocate memory for arrays that + * we don't need ... e.g. if we don't need heap tuples, we don't allocate + * that. We couldn't do that with everything in one struct. + */ + char *itemsvisibility; /* Index-only scan visibility cache */ + + int maxitems; + IndexScanBatchPosItem items[FLEXIBLE_ARRAY_MEMBER]; +} IndexScanBatchData; + +/* + * Position in the queue of batches - index of a batch, index of item in a batch. + */ +typedef struct IndexScanBatchPos +{ + int batch; + int index; +} IndexScanBatchPos; + +typedef struct IndexScanDescData IndexScanDescData; +typedef bool (*IndexPrefetchCallback) (IndexScanDescData * scan, + void *arg, + IndexScanBatchPos *pos); + +/* + * State used by amgetbatch index AMs, which manage per-page batches of items + * with matching index tuples using a circular buffer + */ +typedef struct IndexScanBatchState +{ + /* Index AM drops leaf pin before amgetbatch returns? */ + bool dropPin; + + /* + * Did we read the final batch in this scan direction? The batches may be + * loaded from multiple places, and we need to remember when we fail to + * load the next batch in a given scan (which means "no more batches"). + * amgetbatch may restart the scan on the get call, so we need to remember + * it's over. 
+ */ + bool finished; + bool reset; + + /* + * Did we disable prefetching/use of a read stream because it didn't pay + * for itself? + */ + bool prefetchingLockedIn; + bool disabled; + + /* + * During prefetching, currentPrefetchBlock is the table AM block number + * that was returned by our read stream callback most recently. Used to + * suppress duplicate successive read stream block requests. + * + * Prefetching can still perform non-successive requests for the same + * block number (in general we're prefetching in exactly the same order + * that the scan will return table AM TIDs in). We need to avoid + * duplicate successive requests because table AMs expect to be able to + * hang on to buffer pins across table_index_fetch_tuple calls. + */ + BlockNumber currentPrefetchBlock; + + /* + * Current scan direction, for the currently loaded batches. This is used + * to load data in the read stream API callback, etc. + */ + ScanDirection direction; + + /* positions in the queue of batches (batch + item) */ + IndexScanBatchPos readPos; /* read position */ + IndexScanBatchPos streamPos; /* prefetch position (for read stream API) */ + IndexScanBatchPos markPos; /* mark/restore position */ + + IndexScanBatchData *markBatch; + + /* + * Array of batches returned by the AM. The array has a capacity (but can + * be resized if needed). The headBatch is an index of the batch we're + * currently reading from (this needs to be translated by modulo + * maxBatches into index in the batches array). + */ + int maxBatches; /* size of the batches array */ + int headBatch; /* head batch slot */ + int nextBatch; /* next empty batch slot */ + + /* small cache of unused batches, to reduce malloc/free traffic */ + int batchesCacheSize; + IndexScanBatchData **batchesCache; + + IndexScanBatchData **batches; + + /* callback to skip prefetching in IOS etc. */ + IndexPrefetchCallback prefetch; + void *prefetchArg; +} IndexScanBatchState; + /* * We use the same IndexScanDescData structure for both amgettuple-based * and amgetbitmap-based index scans. Some fields are only relevant in @@ -138,6 +291,8 @@ typedef struct IndexScanDescData struct SnapshotData *xs_snapshot; /* snapshot to see */ int numberOfKeys; /* number of index qualifier conditions */ int numberOfOrderBys; /* number of ordering operators */ + IndexScanBatchState *batchState; /* amgetbatch related state */ + struct ScanKeyData *keyData; /* array of index qualifier descriptors */ struct ScanKeyData *orderByData; /* array of ordering op descriptors */ bool xs_want_itup; /* caller requests index tuples */ diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 30d889b54c53..6879fe99b4bf 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -1343,7 +1343,7 @@ typedef struct IndexOptInfo /* does AM have amgetbitmap interface? */ bool amhasgetbitmap; bool amcanparallel; - /* does AM have ammarkpos interface? */ + /* is AM prepared for us to restore a mark? 
*/ bool amcanmarkpos; /* AM's cost estimator */ /* Rather than include amapi.h here, we declare amcostestimate like this */ diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index b523bcda8f3d..00f4c3d00118 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -51,6 +51,7 @@ extern PGDLLIMPORT Cost disable_cost; extern PGDLLIMPORT int max_parallel_workers_per_gather; extern PGDLLIMPORT bool enable_seqscan; extern PGDLLIMPORT bool enable_indexscan; +extern PGDLLIMPORT bool enable_indexscan_prefetch; extern PGDLLIMPORT bool enable_indexonlyscan; extern PGDLLIMPORT bool enable_bitmapscan; extern PGDLLIMPORT bool enable_tidscan; diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index b5f8f3c5d42f..139499111878 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -147,6 +147,7 @@ struct ReadBuffersOperation int flags; int16 nblocks; int16 nblocks_done; + bool foreign_io; PgAioWaitRef io_wref; PgAioReturn io_return; }; diff --git a/src/test/modules/dummy_index_am/dummy_index_am.c b/src/test/modules/dummy_index_am/dummy_index_am.c index 94ef639b6fcd..be1b0f55ca7e 100644 --- a/src/test/modules/dummy_index_am/dummy_index_am.c +++ b/src/test/modules/dummy_index_am/dummy_index_am.c @@ -319,8 +319,7 @@ dihandler(PG_FUNCTION_ARGS) amroutine->amgettuple = NULL; amroutine->amgetbitmap = NULL; amroutine->amendscan = diendscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; + amroutine->amposreset = NULL; amroutine->amestimateparallelscan = NULL; amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 3b37fafa65b9..9702e3103955 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -159,6 +159,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_incremental_sort | on enable_indexonlyscan | on enable_indexscan | on + enable_indexscan_prefetch | on enable_material | on enable_memoize | on enable_mergejoin | on @@ -173,7 +174,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan | on enable_sort | on enable_tidscan | on -(25 rows) +(26 rows) -- There are always wait event descriptions for various types. InjectionPoint -- may be present or absent, depending on history since last postmaster start. diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 018b5919cf66..d018bb067c46 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -193,6 +193,8 @@ BOOL BOOLEAN BOX BTArrayKeyInfo +BTBatchInfo +BTBatchScanPosData BTBuildState BTCallbackState BTCycleId @@ -1268,6 +1270,10 @@ IndexOrderByDistance IndexPath IndexRuntimeKeyInfo IndexScan +IndexScanBatchData +IndexScanBatchPos +IndexScanBatchPosItem +IndexScanBatches IndexScanDesc IndexScanInstrumentation IndexScanState @@ -3425,10 +3431,10 @@ amestimateparallelscan_function amgetbitmap_function amgettreeheight_function amgettuple_function +amgetbatch_function aminitparallelscan_function aminsert_function aminsertcleanup_function -ammarkpos_function amoptions_function amparallelrescan_function amproperty_function
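
As a closing illustration of how the relscan.h structures introduced above fit together: under the batching interface a scan position is a (batch, index) pair, and batches live in a circular array addressed modulo maxBatches, exactly as the INDEX_SCAN_BATCH macro in nodeIndexonlyscan.c does it. A hypothetical helper (not part of the patch) that resolves the current read position to its heap TID might therefore look like this, assuming the read position is valid:

static ItemPointer
index_batch_current_tid(IndexScanDesc scan)
{
	IndexScanBatchState *bs = scan->batchState;
	IndexScanBatchPos *pos = &bs->readPos;
	IndexScanBatchData *batch = bs->batches[pos->batch % bs->maxBatches];

	Assert(pos->index >= batch->firstItem && pos->index <= batch->lastItem);

	/* heap TID of the item the scan is currently positioned on */
	return &batch->items[pos->index].heapTid;
}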