From 1eadf153f6b60c404e60b21bc6e3db691a5e64ec Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Wed, 28 Aug 2024 18:21:27 +0200
Subject: [PATCH v20240906 1/5] WIP: index batching / prefetching

---
 src/backend/access/heap/heapam_handler.c      |   7 +-
 src/backend/access/index/genam.c              |  23 +-
 src/backend/access/index/indexam.c            | 829 +++++++++++++++++-
 src/backend/access/nbtree/nbtree.c            |  65 ++
 src/backend/access/nbtree/nbtsearch.c         | 280 ++++++
 src/backend/executor/execIndexing.c           |   7 +-
 src/backend/executor/execReplication.c        |   9 +-
 src/backend/executor/nodeIndexonlyscan.c      | 503 ++++++++---
 src/backend/executor/nodeIndexscan.c          | 117 ++-
 src/backend/utils/adt/selfuncs.c              |   7 +-
 src/backend/utils/misc/guc_tables.c           |  10 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/access/amapi.h                    |   5 +
 src/include/access/genam.h                    |  29 +-
 src/include/access/nbtree.h                   |  15 +
 src/include/access/relscan.h                  |  51 ++
 src/include/nodes/execnodes.h                 |   7 +
 src/test/regress/expected/sysviews.out        |   3 +-
 src/tools/pgindent/typedefs.list              |   3 +
 19 files changed, 1830 insertions(+), 141 deletions(-)

diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 1c6da286d43..86da23e41f4 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -750,7 +750,12 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 
 		tableScan = NULL;
 		heapScan = NULL;
-		indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
+
+		/*
+		 * XXX Maybe enable batching/prefetch for clustering. Seems like it
+		 * might be a pretty substantial win.
+		 */
+		indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0, false);
 		index_rescan(indexScan, NULL, 0, NULL, 0);
 	}
 	else
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index 43c95d6109b..68fc490c992 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -438,8 +438,18 @@ systable_beginscan(Relation heapRelation,
 				elog(ERROR, "column is not in index");
 		}
 
+		/*
+		 * No batching/prefetch for catalogs. We don't expect that to help
+		 * very much, because we usually need just one row, and even if we
+		 * need multiple rows, they tend to be colocated in heap.
+		 *
+		 * XXX Maybe we could do that, the prefetching only ramps up over
+		 * time. But then we need to be careful about infinite recursion when
+		 * looking up effective_io_concurrency for a tablespace in the
+		 * catalog.
+		 */
 		sysscan->iscan = index_beginscan(heapRelation, irel,
-										 snapshot, nkeys, 0);
+										 snapshot, nkeys, 0, false);
 		index_rescan(sysscan->iscan, key, nkeys, NULL, 0);
 		sysscan->scan = NULL;
 	}
@@ -696,8 +706,17 @@ systable_beginscan_ordered(Relation heapRelation,
 			elog(ERROR, "column is not in index");
 	}
 
+	/*
+	 * No batching/prefetch for catalogs. We don't expect that to help very
+	 * much, because we usually need just one row, and even if we need
+	 * multiple rows, they tend to be colocated in heap.
+	 *
+	 * XXX Maybe we could do that, the prefetching only ramps up over time.
+	 * But then we need to be careful about infinite recursion when looking up
+	 * effective_io_concurrency for a tablespace in the catalog.
+	 */
 	sysscan->iscan = index_beginscan(heapRelation, indexRelation,
-									 snapshot, nkeys, 0);
+									 snapshot, nkeys, 0, false);
 	index_rescan(sysscan->iscan, key, nkeys, NULL, 0);
 	sysscan->scan = NULL;
 
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index dcd04b813d8..d170c1d439d 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -33,6 +33,12 @@
  *		index_can_return	- does index support index-only scans?
  *		index_getprocid - get a support procedure OID
  *		index_getprocinfo - get a support procedure's lookup info
+ *		index_batch_getnext	- get the next batch of TIDs from a scan
+ *		index_batch_getnext_tid	- get the next TIDs from a batch
+ *		index_batch_getnext_slot	- get the next tuple from a batch
+ *		index_batch_prefetch	- prefetch heap pages for a batch
+ *		index_batch_supported	- does the AM support/allow batching?
+ *		index_batch_add		- add an item (TID, itup) to the batch
  *
  * NOTES
  *		This file contains the index_ routines which used
@@ -54,10 +60,14 @@
 #include "pgstat.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "utils/memutils.h"
 #include "utils/ruleutils.h"
 #include "utils/snapmgr.h"
+#include "utils/spccache.h"
 #include "utils/syscache.h"
 
+/* enable reading batches / prefetching of TIDs from the index */
+bool		enable_indexscan_batching = false;
 
 /* ----------------------------------------------------------------
  *					macros used in index_ routines
@@ -109,6 +119,10 @@ static IndexScanDesc index_beginscan_internal(Relation indexRelation,
 											  ParallelIndexScanDesc pscan, bool temp_snap);
 static inline void validate_relation_kind(Relation r);
 
+/* index batching */
+static void index_batch_init(IndexScanDesc scan);
+static void index_batch_reset(IndexScanDesc scan);
+
 
 /* ----------------------------------------------------------------
  *				   index_ interface functions
@@ -256,7 +270,8 @@ IndexScanDesc
 index_beginscan(Relation heapRelation,
 				Relation indexRelation,
 				Snapshot snapshot,
-				int nkeys, int norderbys)
+				int nkeys, int norderbys,
+				bool enable_batching)
 {
 	IndexScanDesc scan;
 
@@ -274,6 +289,24 @@ index_beginscan(Relation heapRelation,
 	/* prepare to fetch index matches from table */
 	scan->xs_heapfetch = table_index_fetch_begin(heapRelation);
 
+	/*
+	 * If explicitly requested and supported by both the index AM and the
+	 * plan, initialize batching info.
+	 *
+	 * XXX We do this after ambeginscan(), which means the AM can't init the
+	 * private data in there (it doesn't even know if batching will be used at
+	 * that point).
+	 *
+	 * XXX Maybe we should have a separate "amcanbatch" call, to let the AM
+	 * decide if batching is supported depending on the scan details.
+	 */
+	if ((indexRelation->rd_indam->amgetbatch != NULL) &&
+		enable_batching &&
+		enable_indexscan_batching)
+	{
+		index_batch_init(scan);
+	}
+
 	return scan;
 }
 
@@ -333,6 +366,12 @@ index_beginscan_internal(Relation indexRelation,
 	scan->parallel_scan = pscan;
 	scan->xs_temp_snap = temp_snap;
 
+	/*
+	 * No batching by default, so set it to NULL. Will be initialized later if
+	 * batching is requested and AM supports it.
+	 */
+	scan->xs_batch = NULL;
+
 	return scan;
 }
 
@@ -368,6 +407,18 @@ index_rescan(IndexScanDesc scan,
 
 	scan->indexRelation->rd_indam->amrescan(scan, keys, nkeys,
 											orderbys, norderbys);
+
+	/*
+	 * Reset the batch, to make it look empty.
+	 *
+	 * Done after the amrescan() call, in case the AM needs some of the batch
+	 * info (e.g. to properly transfer the killed tuples).
+	 *
+	 * XXX This is a bit misleading, because index_batch_reset does not reset
+	 * the killed tuples. So if that's the only justification, we could have
+	 * done it before the call.
+	 */
+	index_batch_reset(scan);
 }
 
 /* ----------------
@@ -444,6 +495,18 @@ index_restrpos(IndexScanDesc scan)
 	scan->xs_heap_continue = false;
 
 	scan->indexRelation->rd_indam->amrestrpos(scan);
+
+	/*
+	 * Reset the batch, to make it look empty.
+	 *
+	 * Done after the amrescan() call, in case the AM needs some of the batch
+	 * info (e.g. to properly transfer the killed tuples).
+	 *
+	 * XXX This is a bit misleading, because index_batch_reset does not reset
+	 * the killed tuples. So if that's the only justification, we could have
+	 * done it before the call.
+	 */
+	index_batch_reset(scan);
 }
 
 /*
@@ -539,7 +602,8 @@ index_parallelrescan(IndexScanDesc scan)
  */
 IndexScanDesc
 index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys,
-						 int norderbys, ParallelIndexScanDesc pscan)
+						 int norderbys, ParallelIndexScanDesc pscan,
+						 bool enable_batching)
 {
 	Snapshot	snapshot;
 	IndexScanDesc scan;
@@ -560,6 +624,24 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys,
 	/* prepare to fetch index matches from table */
 	scan->xs_heapfetch = table_index_fetch_begin(heaprel);
 
+	/*
+	 * If explicitly requested and supported by both the index AM and the
+	 * plan, initialize batching info.
+	 *
+	 * XXX We do this after ambeginscan(), which means the AM can't init the
+	 * private data in there (it doesn't even know if batching will be used at
+	 * that point).
+	 *
+	 * XXX Maybe we should have a separate "amcanbatch" call, to let the AM
+	 * decide if batching is supported depending on the scan details.
+	 */
+	if ((indexrel->rd_indam->amgetbatch != NULL) &&
+		enable_batching &&
+		enable_indexscan_batching)
+	{
+		index_batch_init(scan);
+	}
+
 	return scan;
 }
 
@@ -1037,3 +1119,746 @@ index_opclass_options(Relation indrel, AttrNumber attnum, Datum attoptions,
 
 	return build_local_reloptions(&relopts, attoptions, validate);
 }
+
+/*
+ * INDEX BATCHING AND PREFETCHING
+ *
+ * Allows reading chunks of items from an index, instead of reading them
+ * one by one. This reduces the overhead of accessing index pages, and
+ * also allows acting on "future" TIDs - e.g. we can prefetch heap pages
+ * that will be needed, etc.
+ *
+ *
+ * index AM contract
+ * -----------------
+ *
+ * To support batching, the index AM needs to implement an optional callback
+ * amgetbatch() which loads data into the batch (in the scan descriptor).
+ *
+ * The index AM also needs to ensure it can perform all optimizations for
+ * all TIDs in the current batch. A good example of is the kill_prior_tuple
+ * optimization - with batching, the index AM may receive the information
+ * which tuples are to be killed with a delay - when loading the next
+ * batch, when ending/restarting the scan, etc. The AM needs to ensure it
+ * can still process such information, to keep the optimization effective.
+ *
+ * The AM may also need to keep pins required by the whole batch (not just
+ * the last tuple), etc.
+ *
+ * What this means/requires is very dependent on the index AM, of course.
+ * For B-Tree (and most other index AMs), batches spanning multiple leaf
+ * pages would be problematic. Such batches would work for basic index
+ * scans, but the kill_prior_tuple would be an issue - the AMs keep only
+ * a single leaf pinned. We'd either need to keep multiple pins, or allow
+ * reading older leaf pages pages (which might have been modified). Index
+ * only-scans is challenging too - we keep IndexTuple pointers into the
+ * leaf pages, which requires keeping those pins too.
+ *
+ * To solve this, we give the AM some control over batch boundaries. It is
+ * up to the index AM to pick which range of index items to load into the
+ * batch, and how to ensure all the optimizations are possible, keep pins,
+ * and so on. The index AM may use information about the batch (in the
+ * scan descriptor, maintained by indexam.c code), and may also keep some
+ * private information (in the existing "opaque" scan field).
+ *
+ * For most index AMs the easiest way is to not load batches spanning
+ * multiple leaf pages. This may impact the efficiency, especially for
+ * indexes with wide index tuples, as it means batches close to the end
+ * of the leaf page may be smaller.
+ *
+ * Note: There already is a pipeline break for prefetching - as we are
+ * getting closer to the end of a batch, we can't prefetch further than
+ * that, and the effective prefetch distance drops to 0.
+ *
+ * The alternative would be to make the index AMs more complex, to keep
+ * more leaf pages pinned, etc. The current model does not prohibit the
+ * index AMs from implementing that - it's entirely possible to keep the
+ * additional information in the "opaque" structure (say, list of pinned
+ * pages, and other necessary details).
+ *
+ * But that does not seem like a good trade off, as it's subject to
+ * "diminishing returns" - we see significant gains initially (with even
+ * small batches / prefetch distance), and as the batch grows the gains
+ * get smaller and smaller. It does not seem worth the complexity of
+ * pinning more pages etc. at least for the first version.
+ *
+ * To deal with the "prefetch pipeline break", that could be addressed by
+ * allowing multiple in-fligt batches - e.g. with prefetch distance 64
+ * we might have three batches of 32 items each, to prefetch far ahead.
+ * But that's not what this patch does yet.
+ *
+ *
+ * batch = sliding window
+ * ----------------------
+ *
+ * A good way to visualize a batch is a sliding window over the array
+ * of items on a leaf page. In the simplest example (forward scan with no
+ * changes of direction), we slice the array into smaller chunks, and
+ * then process each of those chunks.
+ *
+ * The batch size is adaptive - it starts small (only 8 elements) and
+ * increases as we read more batches (up to 64 elements). We don't want
+ * to regress cases that only need a single item (e.g. LIMIT 1 queries),
+ * and loading/copying a lot of data might cause that. So we start small
+ * and increase the size - that still improves cases reading a lot of
+ * data from the index, without hurting small queries.
+ *
+ * Note: This gradual ramp up is for batch size, independent of what we
+ * do for prefetch. The prefetch distance is gradually increased too, but
+ * it's independent / orthogonal to the batch size. The batch size limits
+ * how far ahead we can prefetch, of course.
+ *
+ * Note: The current limits on batch size (initial 8, maximum 64) are
+ * quite arbitrary, it just seemed those values are sane. We could adjust
+ * the initial size, but I don't think it'd make a fundamental difference.
+ * Growing the batches faster/slower has bigger impact.
+ *
+ * The maximum batch size does not matter much - it's true a btree index can
+ * have up to ~1300 items per 8K leaf page, but in most cases the actual
+ * number is lower, perhaps ~300. That's not that far from 64.
+ *
+ * Each batch has a firstIndex/lastIndex to track which part of the leaf
+ * page it currently represents.
+ *
+ *
+ * kill_prior_tuples
+ * -----------------
+ *
+ * If we decide a tuple can be killed, the batch item is marked accordingly,
+ * and the flag is reset to false (so that the index AM does not do something
+ * silly to a random tuple it thinks is "current").
+ *
+ * Then the next time the AM decides it's time to kill tuples, the AM needs
+ * to look at the batch and consider the tuples marked to be killed. B-Tree
+ * simply adds those TIDs to the regular "killItems" array.
+ *
+ *
+ * mark/restore
+ * ------------
+ *
+ * With batching, the index AM does not know the the "current" position on
+ * the leaf page - we don't propagate this to the index AM while walking
+ * items in the batch. To make ammarkpos() work, the index AM has to check
+ * the current position in the batch, and translate it to the proper page
+ * position, using the private information (about items in the batch).
+ *
+ * XXX This needs more work, I don't quite like how the two layers interact,
+ * it seems quite wrong to look at the batch info directly.
+ */
+
+/*
+ * Comprehensive check of various invariants on the index batch. Makes sure
+ * the indexes are set as expected, the buffer size is within limits, and
+ * so on.
+ */
+static void
+AssertCheckBatchInfo(IndexScanDesc scan)
+{
+#ifdef USE_ASSERT_CHECKING
+	/* all the arrays need to be allocated */
+	Assert((scan->xs_batch->heaptids != NULL) &&
+		   (scan->xs_batch->killedItems != NULL) &&
+		   (scan->xs_batch->privateData != NULL));
+
+	/* if IndexTuples expected, should be allocated too */
+	Assert(!(scan->xs_want_itup && (scan->xs_batch->itups == NULL)));
+
+	/* Various check on batch sizes */
+	Assert((scan->xs_batch->initSize >= 0) &&
+		   (scan->xs_batch->initSize <= scan->xs_batch->currSize) &&
+		   (scan->xs_batch->currSize <= scan->xs_batch->maxSize) &&
+		   (scan->xs_batch->maxSize <= 1024));	/* arbitrary limit */
+
+	/* Is the number of in the batch TIDs in a valid range? */
+	Assert((scan->xs_batch->nheaptids >= 0) &&
+		   (scan->xs_batch->nheaptids <= scan->xs_batch->maxSize));
+
+	/*
+	 * The current item must be between -1 and nheaptids. Those two extreme
+	 * values are starting points for forward/backward scans.
+	 */
+	Assert((scan->xs_batch->currIndex >= -1) &&
+		   (scan->xs_batch->currIndex <= scan->xs_batch->nheaptids));
+
+	/* check prefetch data */
+	Assert((scan->xs_batch->prefetchTarget >= 0) &&
+		   (scan->xs_batch->prefetchTarget <= scan->xs_batch->prefetchMaximum));
+
+	Assert((scan->xs_batch->prefetchIndex >= -1) &&
+		   (scan->xs_batch->prefetchIndex <= scan->xs_batch->nheaptids));
+
+	for (int i = 0; i < scan->xs_batch->nheaptids; i++)
+		Assert(ItemPointerIsValid(&scan->xs_batch->heaptids[i]));
+#endif
+}
+
+/* Is the batch full (TIDs up to capacity)? */
+#define	INDEX_BATCH_IS_FULL(scan)	\
+	((scan)->xs_batch->nheaptids == (scan)->xs_batch->currSize)
+
+/* Is the batch empty (no TIDs)? */
+#define	INDEX_BATCH_IS_EMPTY(scan)	\
+	((scan)->xs_batch->nheaptids == 0)
+
+/*
+ * Did we process all items? For forward scan it means the index points to the
+ * last item, for backward scans it has to point to the first one.
+ *
+ * This does not cover empty batches properly, because of backward scans.
+ */
+#define	INDEX_BATCH_IS_PROCESSED(scan, direction)	\
+	(ScanDirectionIsForward(direction) ? \
+		((scan)->xs_batch->nheaptids == ((scan)->xs_batch->currIndex + 1)) : \
+		((scan)->xs_batch->currIndex == 0))
+
+/* Does the batch items in the requested direction? */
+#define INDEX_BATCH_HAS_ITEMS(scan, direction) \
+	(!INDEX_BATCH_IS_EMPTY(scan) && !INDEX_BATCH_IS_PROCESSED(scan, direction))
+
+
+/* ----------------
+ *		index_batch_getnext - get the next batch of TIDs from a scan
+ *
+ * Returns true if we managed to read at least some TIDs into the batch,
+ * or false if there are no more TIDs in the scan. The xs_heaptids and
+ * xs_nheaptids fields contain the TIDS and the number of elements.
+ *
+ * XXX This only loads the TIDs and resets the various batch fields to
+ * fresh state. It does not set xs_heaptid/xs_itup/xs_hitup, that's the
+ * responsibility of the following index_batch_getnext_tid() calls.
+ * ----------------
+ */
+bool
+index_batch_getnext(IndexScanDesc scan, ScanDirection direction)
+{
+	bool		found;
+
+	SCAN_CHECKS;
+	CHECK_SCAN_PROCEDURE(amgetbatch);
+
+	/* XXX: we should assert that a snapshot is pushed or registered */
+	Assert(TransactionIdIsValid(RecentXmin));
+
+	/* comprehensive checks of batching info */
+	AssertCheckBatchInfo(scan);
+
+	/*
+	 * We never read a new batch before we run out of items in the current
+	 * one. The current batch has to be either empty or we ran out of items
+	 * (in the given direction).
+	 */
+	Assert(!INDEX_BATCH_HAS_ITEMS(scan, direction));
+
+	/*
+	 * Reset the current/prefetch positions in the batch.
+	 *
+	 * XXX Done before calling amgetbatch(), so that it sees the index as
+	 * invalid, batch as empty, and can add items.
+	 *
+	 * XXX Intentionally does not reset the nheaptids, because the AM does
+	 * rely on that when processing killed tuples. Maybe store the killed
+	 * tuples differently?
+	 */
+	scan->xs_batch->currIndex = -1;
+	scan->xs_batch->prefetchIndex = 0;
+	scan->xs_batch->nheaptids = 0;
+
+	/*
+	 * Reset the memory context with all per-batch data, allocated by the AM.
+	 * This might be tuples, or anything else the AM needs.
+	 *
+	 * XXX Make sure to reset the tuples, because the AM may do something with
+	 * them later (e.g. release them, as getNextNearest in gist), but we may
+	 * release them by the MemoryContextReset() call.
+	 *
+	 * This might break the AM if it relies on them pointing to the last
+	 * tuple, but at least it has the chance to do the right thing by checking
+	 * if the pointer is NULL.
+	 */
+	scan->xs_itup = NULL;
+	scan->xs_hitup = NULL;
+
+	MemoryContextReset(scan->xs_batch->ctx);
+
+	/*
+	 * The AM's amgetbatch proc loads a chunk of TIDs matching the scan keys,
+	 * and puts the TIDs into scan->xs_batch->heaptids.  It should also set
+	 * scan->xs_recheck and possibly
+	 * scan->xs_batch->itups/scan->xs_batch->hitups, though we pay no
+	 * attention to those fields here.
+	 *
+	 * FIXME At the moment this does nothing with hitup. Needs to be fixed?
+	 */
+	found = scan->indexRelation->rd_indam->amgetbatch(scan, direction);
+
+	/* Reset kill flag immediately for safety */
+	scan->kill_prior_tuple = false;
+	scan->xs_heap_continue = false;
+
+	/* If we're out of index entries, we're done */
+	if (!found)
+	{
+		/* release resources (like buffer pins) from table accesses */
+		if (scan->xs_heapfetch)
+			table_index_fetch_reset(scan->xs_heapfetch);
+
+		return false;
+	}
+
+	/* We should have a non-empty batch with items. */
+	Assert(INDEX_BATCH_HAS_ITEMS(scan, direction));
+
+	pgstat_count_index_tuples(scan->indexRelation, scan->xs_batch->nheaptids);
+
+	/*
+	 * Set the prefetch index to the first item in the loaded batch (we expect
+	 * the index AM to set that).
+	 *
+	 * FIXME Maybe set the currIndex here, not in the index AM. It seems much
+	 * more like indexam.c responsibility rather than something every index AM
+	 * should be doing (in _bt_first_batch etc.).
+	 *
+	 * FIXME It's a bit unclear who (indexam.c or the index AM) is responsible
+	 * for setting which fields. This needs clarification.
+	 */
+	scan->xs_batch->prefetchIndex = scan->xs_batch->currIndex;
+
+	/*
+	 * Try to increase the size of the batch. Intentionally done after the AM
+	 * call, so that the new value applies to the next batch. Otherwise we
+	 * would always skip the initial batch size.
+	 */
+	scan->xs_batch->currSize = Min(scan->xs_batch->currSize + 1,
+								   scan->xs_batch->maxSize);
+
+	/* comprehensive checks of batching info */
+	AssertCheckBatchInfo(scan);
+
+	/* Return the batch of TIDs we found. */
+	return true;
+}
+
+/* ----------------
+ *		index_getnext_batch_tid - get the next TID from the current batch
+ *
+ * Same calling convention as index_getnext_tid(), except that NULL means
+ * no more items in the current batch, there may be more batches.
+ *
+ * XXX This only sets xs_heaptid and xs_itup (if requested). Not sure if
+ * we need to do something with xs_hitup.
+ *
+ * FIXME Should this set xs_hitup?
+ * ----------------
+ */
+ItemPointer
+index_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction)
+{
+	/* comprehensive checks of batching info */
+	AssertCheckBatchInfo(scan);
+
+	/*
+	 * Bail out if he batch does not have more items in the requested directio
+	 * (either empty or everthing processed).
+	 */
+	if (!INDEX_BATCH_HAS_ITEMS(scan, direction))
+		return NULL;
+
+	/*
+	 * Advance to the next batch item - we know it's not empty and there are
+	 * items to process, so this is valid.
+	 */
+	if (ScanDirectionIsForward(direction))
+		scan->xs_batch->currIndex++;
+	else
+		scan->xs_batch->currIndex--;
+
+	/*
+	 * Next TID from the batch, optionally also the IndexTuple/HeapTuple.
+	 *
+	 * XXX Not sure how to decide which of the tuples to set, seems easier to
+	 * just set both, one of them will be NULL.
+	 *
+	 * XXX Do we need to reset the itups/htups array between batches? Doesn't
+	 * seem necessary, but maybe we could get bogus data?
+	 */
+	scan->xs_heaptid = scan->xs_batch->heaptids[scan->xs_batch->currIndex];
+	if (scan->xs_want_itup)
+	{
+		scan->xs_itup = scan->xs_batch->itups[scan->xs_batch->currIndex];
+		scan->xs_hitup = scan->xs_batch->htups[scan->xs_batch->currIndex];
+	}
+
+	scan->xs_recheck = scan->xs_batch->recheck[scan->xs_batch->currIndex];
+
+	/*
+	 * If there are order-by clauses, point to the appropriate chunk in the
+	 * arrays.
+	 */
+	if (scan->numberOfOrderBys > 0)
+	{
+		int			idx = scan->numberOfOrderBys * scan->xs_batch->currIndex;
+
+		scan->xs_orderbyvals = &scan->xs_batch->orderbyvals[idx];
+		scan->xs_orderbynulls = &scan->xs_batch->orderbynulls[idx];
+	}
+
+	/* comprehensive checks of batching info */
+	AssertCheckBatchInfo(scan);
+
+	return &scan->xs_heaptid;
+}
+
+/* ----------------
+ * index_getnext_batch_slot - get the next tuple from a scan batch
+ *
+ * Same calling convention as index_getnext_slot(), except that NULL means
+ * no more items only in the current batch, there may be more batches.
+ *
+ * XXX See index_getnext_slot comments.
+ * ----------------
+ */
+bool
+index_batch_getnext_slot(IndexScanDesc scan, ScanDirection direction,
+						 TupleTableSlot *slot)
+{
+	/* comprehensive checks of batching info */
+	AssertCheckBatchInfo(scan);
+
+	for (;;)
+	{
+		if (!scan->xs_heap_continue)
+		{
+			ItemPointer tid;
+
+			/* Time to fetch the next TID from the index */
+			tid = index_batch_getnext_tid(scan, direction);
+
+			/* If we're out of index entries, we're done */
+			if (tid == NULL)
+				break;
+
+			Assert(ItemPointerEquals(tid, &scan->xs_heaptid));
+		}
+
+		/*
+		 * Fetch the next (or only) visible heap tuple for this index entry.
+		 * If we don't find anything, loop around and grab the next TID from
+		 * the index.
+		 */
+		Assert(ItemPointerIsValid(&scan->xs_heaptid));
+		if (index_fetch_heap(scan, slot))
+		{
+			/* If we found a visible tuple, we shouldn't kill it. */
+			Assert(!scan->kill_prior_tuple);
+
+			/* comprehensive checks of batching info */
+			AssertCheckBatchInfo(scan);
+
+			return true;
+		}
+
+		/*
+		 * If we haven't found any visible tuple for the TID, chances are all
+		 * versions are dead and may kill it from the index. If so, flag it in
+		 * the kill bitmap - we'll translate it to indexes later.
+		 *
+		 * XXX With the firstIndex/lastIndex it would not be too hard to do
+		 * the translation here. But do we want to? How much is that
+		 * considered an internal detail of the AM?
+		 *
+		 * XXX Maybe we should integrate this into index_fetch_heap(), so that
+		 * we don't need to do it after each call? Seems easy to ferget/miss.
+		 */
+		if (scan->kill_prior_tuple)
+		{
+			scan->kill_prior_tuple = false;
+
+			/*
+			 * FIXME This is not great, because we'll have to walk through the
+			 * whole bitmap later, to maybe add killed tuples to the regular
+			 * array. Might be costly for large batches. Maybe it'd be better
+			 * to do what btree does and stash the indexes (just some limited
+			 * number).
+			 */
+			if (scan->xs_batch->nKilledItems < scan->xs_batch->maxSize)
+				scan->xs_batch->killedItems[scan->xs_batch->nKilledItems++]
+					= scan->xs_batch->currIndex;
+		}
+	}
+
+	/* comprehensive checks of batching info */
+	AssertCheckBatchInfo(scan);
+
+	return false;
+}
+
+/* ----------------
+ *		index_batch_prefetch - prefetch pages for TIDs in current batch
+ *
+ * The prefetch distance is increased gradually, similar to what we do for
+ * bitmap heap scans. We start from distance 0 (no prefetch), and then in each
+ * iteration increment the distance up to prefetchMaximum.
+ *
+ * The prefetch distance is reset (to 0) only on rescans, not between batches.
+ *
+ * It's possible to provide an index_prefetch_callback callback, to affect
+ * which items need to be prefetched. With prefetch_callback=NULL, all
+ * items are prefetched. With the callback provided, the item is prefetched
+ * iff the callback and returns true.
+ *
+ * The "arg" argument is used to pass a state for the plan node invoking the
+ * function, and is then passed to the callback. This means the callback is
+ * specific to the plan state.
+ *
+ * XXX the prefetchMaximum depends on effective_io_concurrency, and also on
+ * tablespace options.
+ *
+ * XXX For accesses that change scan direction, we may do a lot of unnecessary
+ * prefetching (because we will re-issue prefetches for what we recently read).
+ * I'm not sure if there's a simple way to track what was already prefetched.
+ * Maybe we could count how far we got (in the forward direction), keep that
+ * as a watermark, and never prefetch again below it.
+ *
+ * XXX Maybe wrap this in ifdef USE_PREFETCH?
+ * ----------------
+ */
+void
+index_batch_prefetch(IndexScanDesc scan, ScanDirection direction,
+					 index_prefetch_callback prefetch_callback, void *arg)
+{
+	int			prefetchStart,
+				prefetchEnd;
+
+	if (ScanDirectionIsForward(direction))
+	{
+		/* Where should we start to prefetch? */
+		prefetchStart = Max(scan->xs_batch->currIndex,
+							scan->xs_batch->prefetchIndex);
+
+		/*
+		 * Where should we stop prefetching? this is the first item that we do
+		 * NOT prefetch, i.e. it can be the first item after the batch.
+		 */
+		prefetchEnd = Min((scan->xs_batch->currIndex + 1) + scan->xs_batch->prefetchTarget,
+						  scan->xs_batch->nheaptids);
+
+		/* FIXME should calculate in a way to make this unnecessary */
+		prefetchStart = Max(Min(prefetchStart, scan->xs_batch->nheaptids - 1), 0);
+		prefetchEnd = Max(Min(prefetchEnd, scan->xs_batch->nheaptids - 1), 0);
+
+		/* remember how far we prefetched / where to start the next prefetch */
+		scan->xs_batch->prefetchIndex = prefetchEnd;
+	}
+	else
+	{
+		/* Where should we start to prefetch? */
+		prefetchEnd = Min(scan->xs_batch->currIndex,
+						  scan->xs_batch->prefetchIndex);
+
+		/*
+		 * Where should we stop prefetching? this is the first item that we do
+		 * NOT prefetch, i.e. it can be the first item after the batch.
+		 */
+		prefetchStart = Max((scan->xs_batch->currIndex - 1) - scan->xs_batch->prefetchTarget,
+							-1);
+
+		/* FIXME should calculate in a way to make this unnecessary */
+		prefetchStart = Max(Min(prefetchStart, scan->xs_batch->nheaptids - 1), 0);
+		prefetchEnd = Max(Min(prefetchEnd, scan->xs_batch->nheaptids - 1), 0);
+
+		/* remember how far we prefetched / where to start the next prefetch */
+		scan->xs_batch->prefetchIndex = prefetchStart;
+	}
+
+	/*
+	 * It's possible we get inverted prefetch range after a restrpos() call,
+	 * because we intentionally don't reset the prefetchIndex - we don't want
+	 * to prefetch pages over and over in this case. We'll do nothing in that
+	 * case, except for the AssertCheckBatchInfo().
+	 *
+	 * FIXME I suspect this actually does not work correctly if we change the
+	 * direction, because the prefetchIndex will flip between two extremes
+	 * thanks to the Min/Max.
+	 */
+
+	/*
+	 * Increase the prefetch distance, but not beyond prefetchMaximum. We
+	 * intentionally do this after calculating start/end, so that we start
+	 * actually prefetching only after the first item.
+	 */
+	scan->xs_batch->prefetchTarget = Min(scan->xs_batch->prefetchTarget + 1,
+										 scan->xs_batch->prefetchMaximum);
+
+	/* comprehensive checks of batching info */
+	AssertCheckBatchInfo(scan);
+
+	/* finally, do the actual prefetching */
+	for (int i = prefetchStart; i < prefetchEnd; i++)
+	{
+		/* skip block if the provided callback says so */
+		if (prefetch_callback && !prefetch_callback(scan, direction, arg, i))
+			continue;
+
+		PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM,
+					   ItemPointerGetBlockNumber(&scan->xs_batch->heaptids[i]));
+	}
+}
+
+/*
+ * index_batch_init
+ *		Initialize various fields / arrays needed by batching.
+ *
+ * FIXME This is a bit ad-hoc hodge podge, due to how I was adding more and
+ * more pieces. Some of the fields may be not quite necessary, needs cleanup.
+ */
+static void
+index_batch_init(IndexScanDesc scan)
+{
+	/* init batching info, but only if batch supported */
+	Assert(scan->indexRelation->rd_indam->amgetbatch != NULL);
+
+	scan->xs_batch = palloc0(sizeof(IndexScanBatchData));
+
+	/*
+	 * Set some reasonable batch size defaults.
+	 *
+	 * XXX Maybe should depend on prefetch distance, or something like that?
+	 * The initSize will affect how far ahead we can prefetch.
+	 */
+	scan->xs_batch->maxSize = 64;
+	scan->xs_batch->initSize = 8;
+	scan->xs_batch->currSize = scan->xs_batch->initSize;
+
+	/* initialize prefetching info */
+	scan->xs_batch->prefetchMaximum =
+		get_tablespace_io_concurrency(scan->heapRelation->rd_rel->reltablespace);
+	scan->xs_batch->prefetchTarget = 0;
+	scan->xs_batch->prefetchIndex = 0;
+
+	/* */
+	scan->xs_batch->currIndex = -1;
+
+	/* Preallocate the largest allowed array of TIDs. */
+	scan->xs_batch->nheaptids = 0;
+	scan->xs_batch->heaptids = palloc0(sizeof(ItemPointerData) * scan->xs_batch->maxSize);
+
+	/*
+	 * XXX We can't check scan->xs_want_itup, because that's set only after
+	 * the scan is initialized (and we initialize in beginscan). Maybe we
+	 * could (or should) allocate lazily.
+	 */
+	scan->xs_batch->itups = palloc(sizeof(IndexTuple) * scan->xs_batch->maxSize);
+	scan->xs_batch->htups = palloc(sizeof(HeapTuple) * scan->xs_batch->maxSize);
+
+	scan->xs_batch->recheck = palloc(sizeof(bool) * scan->xs_batch->maxSize);
+
+	/*
+	 * XXX Maybe use a more compact bitmap? We need just one bit per element,
+	 * not a bool. This is easier / more convenient to manipulate, though.
+	 *
+	 * XXX Maybe should allow more items thant the max batch size?
+	 */
+	scan->xs_batch->nKilledItems = 0;
+	scan->xs_batch->killedItems = (int *) palloc0(sizeof(int) * scan->xs_batch->maxSize);
+
+	/*
+	 * XXX Maybe allocate only when actually needed? Also, shouldn't we have a
+	 * memory context for the private data?
+	 */
+	scan->xs_batch->privateData = (Datum *) palloc0(sizeof(Datum) * scan->xs_batch->maxSize);
+
+	if (scan->numberOfOrderBys > 0)
+	{
+		int			cnt = (scan->xs_batch->maxSize * scan->numberOfOrderBys);
+
+		scan->xs_batch->orderbyvals = (Datum *) palloc0(sizeof(Datum) * cnt);
+		scan->xs_batch->orderbynulls = (bool *) palloc0(sizeof(Datum) * cnt);
+	}
+	else
+	{
+		scan->xs_batch->orderbyvals = NULL;
+		scan->xs_batch->orderbynulls = NULL;
+	}
+
+	scan->xs_batch->ctx = AllocSetContextCreate(CurrentMemoryContext,
+												"indexscan batch context",
+												ALLOCSET_DEFAULT_SIZES);
+
+	/* comprehensive checks */
+	AssertCheckBatchInfo(scan);
+}
+
+/*
+ * index_batch_reset
+ *		Reset the batch before reading the next chunk of data.
+ *
+ * FIXME Another bit in need of cleanup. The currIndex default (-1) is not quite
+ * correct, because for backwards scans is wrong.
+ */
+static void
+index_batch_reset(IndexScanDesc scan)
+{
+	/* bail out if batching not enabled */
+	if (!scan->xs_batch)
+		return;
+
+	scan->xs_batch->nheaptids = 0;
+	scan->xs_batch->prefetchIndex = 0;
+	scan->xs_batch->currIndex = -1;
+}
+
+/*
+ * index_batch_add
+ *		Add an item to the batch.
+ *
+ * The item is always a TID, and then also IndexTuple if requested (for IOS).
+ * Items are always added from the beginning (index 0).
+ *
+ * Returns true when adding the item was successful, or false when the batch
+ * is full (and the item should be added to the next batch).
+ */
+bool
+index_batch_add(IndexScanDesc scan, ItemPointerData tid, bool recheck,
+				IndexTuple itup, HeapTuple htup)
+{
+	/* comprehensive checks on the batch info */
+	AssertCheckBatchInfo(scan);
+
+	/* don't add TIDs beyond the current batch size */
+	if (INDEX_BATCH_IS_FULL(scan))
+		return false;
+
+	/*
+	 * There must be space for at least one entry.
+	 *
+	 * XXX Seems redundant with the earlier INDEX_BATCH_IS_FULL check.
+	 */
+	Assert(scan->xs_batch->nheaptids < scan->xs_batch->currSize);
+	Assert(scan->xs_batch->nheaptids >= 0);
+
+	scan->xs_batch->heaptids[scan->xs_batch->nheaptids] = tid;
+	scan->xs_batch->privateData[scan->xs_batch->nheaptids] = (Datum) 0;
+
+	if (scan->xs_want_itup)
+	{
+		scan->xs_batch->itups[scan->xs_batch->nheaptids] = itup;
+		scan->xs_batch->htups[scan->xs_batch->nheaptids] = htup;
+	}
+
+	scan->xs_batch->recheck[scan->xs_batch->nheaptids] = recheck;
+
+	if (scan->numberOfOrderBys > 0)
+	{
+		int			idx = scan->xs_batch->nheaptids * scan->numberOfOrderBys;
+
+		memcpy(&scan->xs_batch->orderbyvals[idx], scan->xs_orderbyvals, sizeof(Datum) * scan->numberOfOrderBys);
+		memcpy(&scan->xs_batch->orderbynulls[idx], scan->xs_orderbynulls, sizeof(bool) * scan->numberOfOrderBys);
+	}
+
+	scan->xs_batch->nheaptids++;
+
+	/* comprehensive checks on the batch info */
+	AssertCheckBatchInfo(scan);
+
+	return true;
+}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 686a3206f72..a32cc6e7a3a 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -141,6 +141,7 @@ bthandler(PG_FUNCTION_ARGS)
 	amroutine->ambeginscan = btbeginscan;
 	amroutine->amrescan = btrescan;
 	amroutine->amgettuple = btgettuple;
+	amroutine->amgetbatch = btgetbatch;
 	amroutine->amgetbitmap = btgetbitmap;
 	amroutine->amendscan = btendscan;
 	amroutine->ammarkpos = btmarkpos;
@@ -259,6 +260,53 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
 	return res;
 }
 
+/*
+ *	btgetbatch() -- Get the next batch of tuples in the scan.
+ *
+ * XXX Pretty much like btgettuple(), but for batches of tuples.
+ */
+bool
+btgetbatch(IndexScanDesc scan, ScanDirection dir)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	bool		res;
+
+	/* btree indexes are never lossy */
+	scan->xs_recheck = false;
+
+	/* Each loop iteration performs another primitive index scan */
+	do
+	{
+		/*
+		 * If we've already initialized this scan, we can just advance it in
+		 * the appropriate direction.  If we haven't done so yet, we call
+		 * _bt_first() to get the first item in the scan.
+		 */
+		if (!BTScanPosIsValid(so->currPos))
+			res = _bt_first_batch(scan, dir);
+		else
+		{
+			/*
+			 * Check to see if we should kill tuples from the previous batch.
+			 */
+			_bt_kill_batch(scan);
+
+			/*
+			 * Now continue the scan.
+			 */
+			res = _bt_next_batch(scan, dir);
+		}
+
+		/* If we have a tuple, return it ... */
+		if (res)
+			break;
+
+		/* ... otherwise see if we need another primitive index scan */
+	} while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
+
+	return res;
+}
+
 /*
  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
  */
@@ -338,6 +386,10 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 	so->killedItems = NULL;		/* until needed */
 	so->numKilled = 0;
 
+	/* batch range, initially empty */
+	so->batch.firstIndex = -1;
+	so->batch.lastIndex = -1;
+
 	/*
 	 * We don't know yet whether the scan will be index-only, so we do not
 	 * allocate the tuple workspace arrays until btrescan.  However, we set up
@@ -364,6 +416,9 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	/* we aren't holding any read locks, but gotta drop the pins */
 	if (BTScanPosIsValid(so->currPos))
 	{
+		/* Transfer killed items from the batch to the regular array. */
+		_bt_kill_batch(scan);
+
 		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
@@ -408,6 +463,10 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 				scan->numberOfKeys * sizeof(ScanKeyData));
 	so->numberOfKeys = 0;		/* until _bt_preprocess_keys sets it */
 	so->numArrayKeys = 0;		/* ditto */
+
+	/* Reset the batch (even if not batched scan) */
+	so->batch.firstIndex = -1;
+	so->batch.lastIndex = -1;
 }
 
 /*
@@ -421,6 +480,9 @@ btendscan(IndexScanDesc scan)
 	/* we aren't holding any read locks, but gotta drop the pins */
 	if (BTScanPosIsValid(so->currPos))
 	{
+		/* Transfer killed items from the batch to the regular array. */
+		_bt_kill_batch(scan);
+
 		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
@@ -501,6 +563,9 @@ btrestrpos(IndexScanDesc scan)
 		 */
 		if (BTScanPosIsValid(so->currPos))
 		{
+			/* Transfer killed items from the batch to the regular array. */
+			_bt_kill_batch(scan);
+
 			/* Before leaving current page, deal with any killed items */
 			if (so->numKilled > 0)
 				_bt_killitems(scan);
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 2551df8a671..3196ba640d5 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1527,6 +1527,283 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
 	return true;
 }
 
+
+static void
+AssertCheckBTBatchInfo(BTScanOpaque so)
+{
+#ifdef USE_ASSERT_CHECKING
+	/* should be valid items (with respect to the current leaf page) */
+	Assert(so->currPos.firstItem <= so->batch.firstIndex);
+	Assert(so->batch.firstIndex <= so->batch.lastIndex);
+	Assert(so->batch.lastIndex <= so->currPos.lastItem);
+#endif
+}
+
+/*
+ * _bt_copy_batch
+ *		Copy a section of the leaf page into the batch.
+ */
+static void
+_bt_copy_batch(IndexScanDesc scan, ScanDirection dir, BTScanOpaque so,
+			   int start, int end)
+{
+	/*
+	 * We're reading the first batch, and there should always be at least one
+	 * item (otherwise _bt_first would return false). So we should never get
+	 * into situation with empty start/end range. In the worst case, there is
+	 * just a single item, in which case (start == end).
+	 */
+	Assert(start <= end);
+
+	/* The range of items should fit into the current batch size. */
+	Assert((end - start + 1) <= scan->xs_batch->currSize);
+
+	so->batch.firstIndex = start;
+	so->batch.lastIndex = end;
+
+	AssertCheckBTBatchInfo(so);
+
+	/*
+	 * Walk through the range of index tuples, copy them into the batch. If
+	 * requested, set the index tuple too.
+	 *
+	 * We don't know if the batch is full already - we just try to add it, and
+	 * bail out if it fails.
+	 *
+	 * FIXME This seems wrong, actually. We use currSize when calculating the
+	 * start/end range, so the add should always succeed.
+	 */
+	while (start <= end)
+	{
+		BTScanPosItem *currItem = &so->currPos.items[start];
+		IndexTuple	itup = NULL;
+
+		if (scan->xs_want_itup)
+			itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+		/* try to add it to batch, if there's space */
+		if (!index_batch_add(scan, currItem->heapTid, scan->xs_recheck, itup, NULL))
+			break;
+
+		start++;
+	}
+
+	/*
+	 * set the starting point
+	 *
+	 * XXX might be better done in indexam.c
+	 */
+	if (ScanDirectionIsForward(dir))
+		scan->xs_batch->currIndex = -1;
+	else
+		scan->xs_batch->currIndex = scan->xs_batch->nheaptids;
+
+	/* shouldn't be possible to end here with an empty batch */
+	Assert(scan->xs_batch->nheaptids > 0);
+}
+
+/*
+ *	_bt_first_batch() -- Find the first batch in a scan.
+ *
+ * A batch variant of _bt_first(). Most of the comments for that function
+ * apply here too.
+ *
+ * XXX This only populates the batch, it does not set any other fields like
+ * scan->xs_heaptid or scan->xs_itup. That happens in getnext_tid() calls.
+ *
+ * XXX I'm not sure it works to mix batched and non-batches calls, e.g. get
+ * a TID and then a batch of TIDs. It probably should work as long as we
+ * update itemIndex correctly, but we need to be careful about killed items
+ * (right now the two places use different ways to communicate which items
+ * should be killed).
+ */
+bool
+_bt_first_batch(IndexScanDesc scan, ScanDirection dir)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+	Assert(scan->xs_batch->nheaptids == 0);
+
+	/*
+	 * Mark the batch as empty.
+	 *
+	 * This might seems a bit strange, because surely the batch should be
+	 * empty before reading the first batch (we reset those indexes both in
+	 * btbeginscan and btrescan). So why not an assert? We can get here in
+	 * different ways too - not just after beginscan/rescan, but also when
+	 * iterating over ScalarArrayOps - in which case we'll see the last batch
+	 * of the preceding scan.
+	 */
+	so->batch.firstIndex = -1;
+	so->batch.lastIndex = -1;
+
+	/* we haven't visited any leaf pages yet, so proceed to reading one */
+	if (_bt_first(scan, dir))
+	{
+		/* range of the leaf to copy into the batch */
+		int			start,
+					end;
+
+		/* determine which part of the leaf page to extract */
+		if (ScanDirectionIsForward(dir))
+		{
+			start = so->currPos.firstItem;
+			end = Min(start + (scan->xs_batch->currSize - 1), so->currPos.lastItem);
+			so->currPos.itemIndex = (end + 1);
+		}
+		else
+		{
+			end = so->currPos.lastItem;
+			start = Max(end - (scan->xs_batch->currSize - 1),
+						so->currPos.firstItem);
+			so->currPos.itemIndex = (start - 1);
+		}
+
+		/*
+		 * Copy the selected range of items into the batch, set the batch
+		 * current index properly (before first / after last item, depending
+		 * on scan direction.
+		 */
+		_bt_copy_batch(scan, dir, so, start, end);
+
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ *	_bt_next_batch() -- Get the next batch of items in a scan.
+ *
+ * A batch variant of _bt_next(). Most of the comments for that function
+ * apply here too.
+ *
+ * We should only get here only when the current batch has no more items
+ * in the given direction. We don't get here with empty batches, that's
+ * handled by _bt_fist_batch().
+ *
+ * XXX See also the comments at _bt_first_batch() about returning a single
+ * batch for the page, etc.
+ */
+bool
+_bt_next_batch(IndexScanDesc scan, ScanDirection dir)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+	int			start,
+				end;
+
+	Assert(scan->xs_batch->nheaptids == 0);
+
+	AssertCheckBTBatchInfo(so);
+
+	/*
+	 * Check if we still have some items on the current leaf page. If yes,
+	 * load them into a batch and return.
+	 *
+	 * XXX try combining that with the next block, the inner while loop is
+	 * exactly the same.
+	 */
+	if (ScanDirectionIsForward(dir))
+	{
+		start = so->batch.lastIndex + 1;
+		end = Min(start + (scan->xs_batch->currSize - 1), so->currPos.lastItem);
+		so->currPos.itemIndex = (end + 1);
+	}
+	else
+	{
+		end = so->batch.firstIndex - 1;
+		start = Max(end - (scan->xs_batch->currSize - 1),
+					so->currPos.firstItem);
+		so->currPos.itemIndex = (start - 1);
+	}
+
+	/*
+	 * We have more items on the current leaf page.
+	 */
+	if (start <= end)
+	{
+		_bt_copy_batch(scan, dir, so, start, end);
+		return true;
+	}
+
+	/*
+	 * We've consumed all items from the current leaf page, so try reading the
+	 * next one, and process it.
+	 */
+	if (_bt_next(scan, dir))
+	{
+		/*
+		 * Check if we still have some items on the current leaf page. If yes,
+		 * load them into a batch and return.
+		 *
+		 * XXX try combining that with the next block, the inner while loop is
+		 * exactly the same.
+		 */
+		if (ScanDirectionIsForward(dir))
+		{
+			start = so->currPos.firstItem;
+			end = Min(start + (scan->xs_batch->currSize - 1), so->currPos.lastItem);
+			so->currPos.itemIndex = (end + 1);
+		}
+		else
+		{
+			end = so->currPos.lastItem;
+			start = Max(end - (scan->xs_batch->currSize - 1),
+						so->currPos.firstItem);
+			so->currPos.itemIndex = (start - 1);
+		}
+
+		_bt_copy_batch(scan, dir, so, start, end);
+
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ *	_bt_kill_batch() -- remember the items-to-be-killed from the current batch
+ *
+ * We simply translate the bitmap into the "regular" killedItems array, and let
+ * that to drive which items are killed.
+ */
+void
+_bt_kill_batch(IndexScanDesc scan)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+	/* bail out if batching not enabled */
+	if (!scan->xs_batch)
+		return;
+
+	for (int i = 0; i < scan->xs_batch->nKilledItems; i++)
+	{
+		int			index = (so->batch.firstIndex + scan->xs_batch->killedItems[i]);
+
+		/* make sure we have a valid index (in the current leaf page) */
+		Assert((so->batch.firstIndex <= index) &&
+			   (index <= so->batch.lastIndex));
+
+		/*
+		 * Yes, remember it for later. (We'll deal with all such tuples at
+		 * once right before leaving the index page.)  The test for numKilled
+		 * overrun is not just paranoia: if the caller reverses direction in
+		 * the indexscan then the same item might get entered multiple times.
+		 * It's not worth trying to optimize that, so we don't detect it, but
+		 * instead just forget any excess entries.
+		 */
+		if (so->killedItems == NULL)
+			so->killedItems = (int *)
+				palloc(MaxTIDsPerBTreePage * sizeof(int));
+		if (so->numKilled < MaxTIDsPerBTreePage)
+			so->killedItems[so->numKilled++] = index;
+	}
+
+	/* now reset the number of killed items */
+	scan->xs_batch->nKilledItems = 0;
+}
+
 /*
  *	_bt_readpage() -- Load data from current index page into so->currPos
  *
@@ -2048,6 +2325,9 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 
 	Assert(BTScanPosIsValid(so->currPos));
 
+	/* Transfer killed items from the batch to the regular array. */
+	_bt_kill_batch(scan);
+
 	/* Before leaving current page, deal with any killed items */
 	if (so->numKilled > 0)
 		_bt_killitems(scan);
diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c
index 403a3f40551..eb0e4578d36 100644
--- a/src/backend/executor/execIndexing.c
+++ b/src/backend/executor/execIndexing.c
@@ -779,7 +779,12 @@ check_exclusion_or_unique_constraint(Relation heap, Relation index,
 retry:
 	conflict = false;
 	found_self = false;
-	index_scan = index_beginscan(heap, index, &DirtySnapshot, indnkeyatts, 0);
+
+	/*
+	 * XXX Does not seem useful to do prefetching for checks of constraints.
+	 * We would probably need just the first item anyway.
+	 */
+	index_scan = index_beginscan(heap, index, &DirtySnapshot, indnkeyatts, 0, false);
 	index_rescan(index_scan, scankeys, indnkeyatts, NULL, 0);
 
 	while (index_getnext_slot(index_scan, ForwardScanDirection, existing_slot))
diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c
index 1086cbc9624..df69327ea79 100644
--- a/src/backend/executor/execReplication.c
+++ b/src/backend/executor/execReplication.c
@@ -244,8 +244,13 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid,
 	/* Build scan key. */
 	skey_attoff = build_replindex_scan_key(skey, rel, idxrel, searchslot);
 
-	/* Start an index scan. */
-	scan = index_beginscan(rel, idxrel, &snap, skey_attoff, 0);
+	/*
+	 * Start an index scan.
+	 *
+	 * XXX No prefetching for replication identity. We expect to find just one
+	 * row, so prefetching is pointless.
+	 */
+	scan = index_beginscan(rel, idxrel, &snap, skey_attoff, 0, false);
 
 retry:
 	found = false;
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 612c6738950..dcf5fc5fffc 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -49,7 +49,13 @@
 static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node);
 static void StoreIndexTuple(IndexOnlyScanState *node, TupleTableSlot *slot,
 							IndexTuple itup, TupleDesc itupdesc);
+static bool ios_prefetch_block(IndexScanDesc scan, ScanDirection direction,
+							   void *data, int index);
 
+/* values stored in ios_prefetch_block in the batch cache */
+#define		IOS_UNKNOWN_VISIBILITY		0	/* XXX default value */
+#define		IOS_ALL_VISIBLE				1
+#define		IOS_NOT_ALL_VISIBLE			2
 
 /* ----------------------------------------------------------------
  *		IndexOnlyNext
@@ -93,11 +99,11 @@ IndexOnlyNext(IndexOnlyScanState *node)
 								   node->ioss_RelationDesc,
 								   estate->es_snapshot,
 								   node->ioss_NumScanKeys,
-								   node->ioss_NumOrderByKeys);
+								   node->ioss_NumOrderByKeys,
+								   node->ioss_CanBatch);
 
 		node->ioss_ScanDesc = scandesc;
 
-
 		/* Set it up for index-only scan */
 		node->ioss_ScanDesc->xs_want_itup = true;
 		node->ioss_VMBuffer = InvalidBuffer;
@@ -115,139 +121,359 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	}
 
 	/*
-	 * OK, now that we have what we need, fetch the next tuple.
+	 * If the batching is disabled by a GUC, or if it's not supported by the
+	 * index AM, do the original approach.
+	 *
+	 * XXX Maybe we should enable batching based on the plan too, so that we
+	 * don't do batching when it's probably useless (e.g. semijoins or queries
+	 * with LIMIT 1 etc.). But maybe the approach with slow ramp-up (starting
+	 * with small batches) will handle that well enough.
+	 *
+	 * XXX Perhaps it'd be possible to do both in index_getnext_slot(), i.e.
+	 * call either the original code without batching, or the new batching
+	 * code if supported/enabled. It's not great to have duplicated code.
 	 */
-	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
+	if (scandesc->xs_batch == NULL)
 	{
-		bool		tuple_from_heap = false;
-
-		CHECK_FOR_INTERRUPTS();
-
 		/*
-		 * We can skip the heap fetch if the TID references a heap page on
-		 * which all tuples are known visible to everybody.  In any case,
-		 * we'll use the index tuple not the heap tuple as the data source.
-		 *
-		 * Note on Memory Ordering Effects: visibilitymap_get_status does not
-		 * lock the visibility map buffer, and therefore the result we read
-		 * here could be slightly stale.  However, it can't be stale enough to
-		 * matter.
-		 *
-		 * We need to detect clearing a VM bit due to an insert right away,
-		 * because the tuple is present in the index page but not visible. The
-		 * reading of the TID by this scan (using a shared lock on the index
-		 * buffer) is serialized with the insert of the TID into the index
-		 * (using an exclusive lock on the index buffer). Because the VM bit
-		 * is cleared before updating the index, and locking/unlocking of the
-		 * index page acts as a full memory barrier, we are sure to see the
-		 * cleared bit if we see a recently-inserted TID.
-		 *
-		 * Deletes do not update the index page (only VACUUM will clear out
-		 * the TID), so the clearing of the VM bit by a delete is not
-		 * serialized with this test below, and we may see a value that is
-		 * significantly stale. However, we don't care about the delete right
-		 * away, because the tuple is still visible until the deleting
-		 * transaction commits or the statement ends (if it's our
-		 * transaction). In either case, the lock on the VM buffer will have
-		 * been released (acting as a write barrier) after clearing the bit.
-		 * And for us to have a snapshot that includes the deleting
-		 * transaction (making the tuple invisible), we must have acquired
-		 * ProcArrayLock after that time, acting as a read barrier.
-		 *
-		 * It's worth going through this complexity to avoid needing to lock
-		 * the VM buffer, which could cause significant contention.
+		 * OK, now that we have what we need, fetch the next tuple.
 		 */
-		if (!VM_ALL_VISIBLE(scandesc->heapRelation,
-							ItemPointerGetBlockNumber(tid),
-							&node->ioss_VMBuffer))
+		while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
 		{
+			bool		tuple_from_heap = false;
+
+			CHECK_FOR_INTERRUPTS();
+
 			/*
-			 * Rats, we have to visit the heap to check visibility.
+			 * We can skip the heap fetch if the TID references a heap page on
+			 * which all tuples are known visible to everybody.  In any case,
+			 * we'll use the index tuple not the heap tuple as the data
+			 * source.
+			 *
+			 * Note on Memory Ordering Effects: visibilitymap_get_status does
+			 * not lock the visibility map buffer, and therefore the result we
+			 * read here could be slightly stale.  However, it can't be stale
+			 * enough to matter.
+			 *
+			 * We need to detect clearing a VM bit due to an insert right
+			 * away, because the tuple is present in the index page but not
+			 * visible. The reading of the TID by this scan (using a shared
+			 * lock on the index buffer) is serialized with the insert of the
+			 * TID into the index (using an exclusive lock on the index
+			 * buffer). Because the VM bit is cleared before updating the
+			 * index, and locking/unlocking of the index page acts as a full
+			 * memory barrier, we are sure to see the cleared bit if we see a
+			 * recently-inserted TID.
+			 *
+			 * Deletes do not update the index page (only VACUUM will clear
+			 * out the TID), so the clearing of the VM bit by a delete is not
+			 * serialized with this test below, and we may see a value that is
+			 * significantly stale. However, we don't care about the delete
+			 * right away, because the tuple is still visible until the
+			 * deleting transaction commits or the statement ends (if it's our
+			 * transaction). In either case, the lock on the VM buffer will
+			 * have been released (acting as a write barrier) after clearing
+			 * the bit. And for us to have a snapshot that includes the
+			 * deleting transaction (making the tuple invisible), we must have
+			 * acquired ProcArrayLock after that time, acting as a read
+			 * barrier.
+			 *
+			 * It's worth going through this complexity to avoid needing to
+			 * lock the VM buffer, which could cause significant contention.
 			 */
-			InstrCountTuples2(node, 1);
-			if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
-				continue;		/* no visible tuple, try next index entry */
+			if (!VM_ALL_VISIBLE(scandesc->heapRelation,
+								ItemPointerGetBlockNumber(tid),
+								&node->ioss_VMBuffer))
+			{
+				/*
+				 * Rats, we have to visit the heap to check visibility.
+				 */
+				InstrCountTuples2(node, 1);
+				if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
+					continue;	/* no visible tuple, try next index entry */
+
+				ExecClearTuple(node->ioss_TableSlot);
+
+				/*
+				 * Only MVCC snapshots are supported here, so there should be
+				 * no need to keep following the HOT chain once a visible
+				 * entry has been found.  If we did want to allow that, we'd
+				 * need to keep more state to remember not to call
+				 * index_getnext_tid next time.
+				 */
+				if (scandesc->xs_heap_continue)
+					elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+
+				/*
+				 * Note: at this point we are holding a pin on the heap page,
+				 * as recorded in scandesc->xs_cbuf.  We could release that
+				 * pin now, but it's not clear whether it's a win to do so.
+				 * The next index entry might require a visit to the same heap
+				 * page.
+				 */
+
+				tuple_from_heap = true;
+			}
 
-			ExecClearTuple(node->ioss_TableSlot);
+			/*
+			 * Fill the scan tuple slot with data from the index.  This might
+			 * be provided in either HeapTuple or IndexTuple format.
+			 * Conceivably an index AM might fill both fields, in which case
+			 * we prefer the heap format, since it's probably a bit cheaper to
+			 * fill a slot from.
+			 */
+			if (scandesc->xs_hitup)
+			{
+				/*
+				 * We don't take the trouble to verify that the provided tuple
+				 * has exactly the slot's format, but it seems worth doing a
+				 * quick check on the number of fields.
+				 */
+				Assert(slot->tts_tupleDescriptor->natts ==
+					   scandesc->xs_hitupdesc->natts);
+				ExecForceStoreHeapTuple(scandesc->xs_hitup, slot, false);
+			}
+			else if (scandesc->xs_itup)
+				StoreIndexTuple(node, slot, scandesc->xs_itup, scandesc->xs_itupdesc);
+			else
+				elog(ERROR, "no data returned for index-only scan");
 
 			/*
-			 * Only MVCC snapshots are supported here, so there should be no
-			 * need to keep following the HOT chain once a visible entry has
-			 * been found.  If we did want to allow that, we'd need to keep
-			 * more state to remember not to call index_getnext_tid next time.
+			 * If the index was lossy, we have to recheck the index quals.
 			 */
-			if (scandesc->xs_heap_continue)
-				elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+			if (scandesc->xs_recheck)
+			{
+				econtext->ecxt_scantuple = slot;
+				if (!ExecQualAndReset(node->recheckqual, econtext))
+				{
+					/* Fails recheck, so drop it and loop back for another */
+					InstrCountFiltered2(node, 1);
+					continue;
+				}
+			}
 
 			/*
-			 * Note: at this point we are holding a pin on the heap page, as
-			 * recorded in scandesc->xs_cbuf.  We could release that pin now,
-			 * but it's not clear whether it's a win to do so.  The next index
-			 * entry might require a visit to the same heap page.
+			 * We don't currently support rechecking ORDER BY distances.  (In
+			 * principle, if the index can support retrieval of the originally
+			 * indexed value, it should be able to produce an exact distance
+			 * calculation too.  So it's not clear that adding code here for
+			 * recheck/re-sort would be worth the trouble.  But we should at
+			 * least throw an error if someone tries it.)
 			 */
+			if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby)
+				ereport(ERROR,
+						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+						 errmsg("lossy distance functions are not supported in index-only scans")));
 
-			tuple_from_heap = true;
-		}
+			/*
+			 * If we didn't access the heap, then we'll need to take a
+			 * predicate lock explicitly, as if we had.  For now we do that at
+			 * page level.
+			 */
+			if (!tuple_from_heap)
+				PredicateLockPage(scandesc->heapRelation,
+								  ItemPointerGetBlockNumber(tid),
+								  estate->es_snapshot);
 
-		/*
-		 * Fill the scan tuple slot with data from the index.  This might be
-		 * provided in either HeapTuple or IndexTuple format.  Conceivably an
-		 * index AM might fill both fields, in which case we prefer the heap
-		 * format, since it's probably a bit cheaper to fill a slot from.
-		 */
-		if (scandesc->xs_hitup)
+			return slot;
+		}
+	}
+	else
+	{
+new_batch:
+		/* do we have TIDs in the current batch */
+		while ((tid = index_batch_getnext_tid(scandesc, direction)) != NULL)
 		{
+			bool		all_visible;
+			bool		tuple_from_heap = false;
+
+			CHECK_FOR_INTERRUPTS();
+
+			/* Is the index of the current item valid for the batch? */
+			Assert((scandesc->xs_batch->currIndex >= 0) &&
+				   (scandesc->xs_batch->currIndex < scandesc->xs_batch->nheaptids));
+
+			/* Prefetch some of the following items in the batch. */
+			index_batch_prefetch(scandesc, direction, ios_prefetch_block, node);
+
 			/*
-			 * We don't take the trouble to verify that the provided tuple has
-			 * exactly the slot's format, but it seems worth doing a quick
-			 * check on the number of fields.
+			 * Reuse the previously determined page visibility info, or
+			 * calculate it now. If we decided not to prefetch the block, the
+			 * page has to be all-visible.
+			 *
+			 * XXX It's a bir weird we use the visibility to decide if we
+			 * should skip prefetching the block, and then deduce the
+			 * visibility from that. Maybe we could/should have a more direct
+			 * way?
 			 */
-			Assert(slot->tts_tupleDescriptor->natts ==
-				   scandesc->xs_hitupdesc->natts);
-			ExecForceStoreHeapTuple(scandesc->xs_hitup, slot, false);
-		}
-		else if (scandesc->xs_itup)
-			StoreIndexTuple(node, slot, scandesc->xs_itup, scandesc->xs_itupdesc);
-		else
-			elog(ERROR, "no data returned for index-only scan");
+			all_visible = !ios_prefetch_block(scandesc, direction, node,
+											  scandesc->xs_batch->currIndex);
 
-		/*
-		 * If the index was lossy, we have to recheck the index quals.
-		 */
-		if (scandesc->xs_recheck)
-		{
-			econtext->ecxt_scantuple = slot;
-			if (!ExecQualAndReset(node->recheckqual, econtext))
+			/*
+			 * We can skip the heap fetch if the TID references a heap page on
+			 * which all tuples are known visible to everybody.  In any case,
+			 * we'll use the index tuple not the heap tuple as the data
+			 * source.
+			 *
+			 * Note on Memory Ordering Effects: visibilitymap_get_status does
+			 * not lock the visibility map buffer, and therefore the result we
+			 * read here could be slightly stale.  However, it can't be stale
+			 * enough to matter.
+			 *
+			 * We need to detect clearing a VM bit due to an insert right
+			 * away, because the tuple is present in the index page but not
+			 * visible. The reading of the TID by this scan (using a shared
+			 * lock on the index buffer) is serialized with the insert of the
+			 * TID into the index (using an exclusive lock on the index
+			 * buffer). Because the VM bit is cleared before updating the
+			 * index, and locking/unlocking of the index page acts as a full
+			 * memory barrier, we are sure to see the cleared bit if we see a
+			 * recently-inserted TID.
+			 *
+			 * Deletes do not update the index page (only VACUUM will clear
+			 * out the TID), so the clearing of the VM bit by a delete is not
+			 * serialized with this test below, and we may see a value that is
+			 * significantly stale. However, we don't care about the delete
+			 * right away, because the tuple is still visible until the
+			 * deleting transaction commits or the statement ends (if it's our
+			 * transaction). In either case, the lock on the VM buffer will
+			 * have been released (acting as a write barrier) after clearing
+			 * the bit. And for us to have a snapshot that includes the
+			 * deleting transaction (making the tuple invisible), we must have
+			 * acquired ProcArrayLock after that time, acting as a read
+			 * barrier.
+			 *
+			 * It's worth going through this complexity to avoid needing to
+			 * lock the VM buffer, which could cause significant contention.
+			 */
+			if (!all_visible)
 			{
-				/* Fails recheck, so drop it and loop back for another */
-				InstrCountFiltered2(node, 1);
-				continue;
+				/*
+				 * Rats, we have to visit the heap to check visibility.
+				 */
+				InstrCountTuples2(node, 1);
+				if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
+				{
+					/*
+					 * If we haven't found any visible tuple for the TID,
+					 * chances are all versions are dead and may kill it from
+					 * the index. If so, flag it in the kill bitmap - we'll
+					 * translate it to indexes later.
+					 *
+					 * XXX With the firstIndex/lastIndex it would not be too
+					 * hard to do the translation here. But do we want to? How
+					 * much is that considered an internal detail of the AM?
+					 *
+					 * FIXME Maybe we could/should make this part of
+					 * index_fetch_heap, so that we don't have to do this
+					 * after each call?
+					 */
+					if (scandesc->kill_prior_tuple)
+					{
+						scandesc->kill_prior_tuple = false;
+
+						if (scandesc->xs_batch->nKilledItems < scandesc->xs_batch->maxSize)
+							scandesc->xs_batch->killedItems[scandesc->xs_batch->nKilledItems++]
+								= scandesc->xs_batch->currIndex;
+					}
+
+					continue;	/* no visible tuple, try next index entry */
+				}
+
+				ExecClearTuple(node->ioss_TableSlot);
+
+				/*
+				 * Only MVCC snapshots are supported here, so there should be
+				 * no need to keep following the HOT chain once a visible
+				 * entry has been found.  If we did want to allow that, we'd
+				 * need to keep more state to remember not to call
+				 * index_getnext_tid next time.
+				 */
+				if (scandesc->xs_heap_continue)
+					elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+
+				/*
+				 * Note: at this point we are holding a pin on the heap page,
+				 * as recorded in scandesc->xs_cbuf.  We could release that
+				 * pin now, but it's not clear whether it's a win to do so.
+				 * The next index entry might require a visit to the same heap
+				 * page.
+				 */
+
+				tuple_from_heap = true;
 			}
-		}
 
-		/*
-		 * We don't currently support rechecking ORDER BY distances.  (In
-		 * principle, if the index can support retrieval of the originally
-		 * indexed value, it should be able to produce an exact distance
-		 * calculation too.  So it's not clear that adding code here for
-		 * recheck/re-sort would be worth the trouble.  But we should at least
-		 * throw an error if someone tries it.)
-		 */
-		if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby)
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("lossy distance functions are not supported in index-only scans")));
+			/*
+			 * Fill the scan tuple slot with data from the index.  This might
+			 * be provided in either HeapTuple or IndexTuple format.
+			 * Conceivably an index AM might fill both fields, in which case
+			 * we prefer the heap format, since it's probably a bit cheaper to
+			 * fill a slot from.
+			 */
+			if (scandesc->xs_hitup)
+			{
+				/*
+				 * We don't take the trouble to verify that the provided tuple
+				 * has exactly the slot's format, but it seems worth doing a
+				 * quick check on the number of fields.
+				 */
+				Assert(slot->tts_tupleDescriptor->natts ==
+					   scandesc->xs_hitupdesc->natts);
+				ExecForceStoreHeapTuple(scandesc->xs_hitup, slot, false);
+			}
+			else if (scandesc->xs_itup)
+				StoreIndexTuple(node, slot, scandesc->xs_itup, scandesc->xs_itupdesc);
+			else
+				elog(ERROR, "no data returned for index-only scan");
 
-		/*
-		 * If we didn't access the heap, then we'll need to take a predicate
-		 * lock explicitly, as if we had.  For now we do that at page level.
-		 */
-		if (!tuple_from_heap)
-			PredicateLockPage(scandesc->heapRelation,
-							  ItemPointerGetBlockNumber(tid),
-							  estate->es_snapshot);
+			/*
+			 * If the index was lossy, we have to recheck the index quals.
+			 */
+			if (scandesc->xs_recheck)
+			{
+				econtext->ecxt_scantuple = slot;
+				if (!ExecQualAndReset(node->recheckqual, econtext))
+				{
+					/* Fails recheck, so drop it and loop back for another */
+					InstrCountFiltered2(node, 1);
+					continue;
+				}
+			}
+
+			/*
+			 * We don't currently support rechecking ORDER BY distances.  (In
+			 * principle, if the index can support retrieval of the originally
+			 * indexed value, it should be able to produce an exact distance
+			 * calculation too.  So it's not clear that adding code here for
+			 * recheck/re-sort would be worth the trouble.  But we should at
+			 * least throw an error if someone tries it.)
+			 */
+			if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby)
+				ereport(ERROR,
+						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+						 errmsg("lossy distance functions are not supported in index-only scans")));
+
+			/*
+			 * If we didn't access the heap, then we'll need to take a
+			 * predicate lock explicitly, as if we had.  For now we do that at
+			 * page level.
+			 */
+			if (!tuple_from_heap)
+				PredicateLockPage(scandesc->heapRelation,
+								  ItemPointerGetBlockNumber(tid),
+								  estate->es_snapshot);
+
+			return slot;
+		}
 
-		return slot;
+		/* batch is empty, try reading the next batch of tuples */
+		if (index_batch_getnext(scandesc, direction))
+		{
+			index_batch_prefetch(scandesc, direction, ios_prefetch_block, node);
+			goto new_batch;
+		}
+
+		return NULL;
 	}
 
 	/*
@@ -574,6 +800,16 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
 	indexstate->recheckqual =
 		ExecInitQual(node->recheckqual, (PlanState *) indexstate);
 
+	/*
+	 * Can't do batching (and thus prefetching) when the plan requires mark
+	 * and restore. There's an issue with translating the mark/restore
+	 * positions between the batch in scan descriptor and the original
+	 * position recognized in the index AM.
+	 *
+	 * XXX Hopefully just a temporary limitation?
+	 */
+	indexstate->ioss_CanBatch = !(eflags & EXEC_FLAG_MARK);
+
 	/*
 	 * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
 	 * here.  This allows an index-advisor plugin to EXPLAIN a plan containing
@@ -735,12 +971,20 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node,
 								  estate->es_snapshot,
 								  piscan);
 	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, piscan);
+
+	/*
+	 * XXX do we actually want prefetching for parallel index scans? Maybe
+	 * not, but then we need to be careful not to call index_batch_getnext_tid
+	 * (which now can happen, because we'll call IndexOnlyNext even for
+	 * parallel plans).
+	 */
 	node->ioss_ScanDesc =
 		index_beginscan_parallel(node->ss.ss_currentRelation,
 								 node->ioss_RelationDesc,
 								 node->ioss_NumScanKeys,
 								 node->ioss_NumOrderByKeys,
-								 piscan);
+								 piscan,
+								 node->ioss_CanBatch);
 	node->ioss_ScanDesc->xs_want_itup = true;
 	node->ioss_VMBuffer = InvalidBuffer;
 
@@ -780,12 +1024,15 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node,
 	ParallelIndexScanDesc piscan;
 
 	piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
+
+	/* XXX do we actually want prefetching for parallel index scans? */
 	node->ioss_ScanDesc =
 		index_beginscan_parallel(node->ss.ss_currentRelation,
 								 node->ioss_RelationDesc,
 								 node->ioss_NumScanKeys,
 								 node->ioss_NumOrderByKeys,
-								 piscan);
+								 piscan,
+								 node->ioss_CanBatch);
 	node->ioss_ScanDesc->xs_want_itup = true;
 
 	/*
@@ -797,3 +1044,35 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node,
 					 node->ioss_ScanKeys, node->ioss_NumScanKeys,
 					 node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);
 }
+
+/*
+ * ios_prefetch_block
+ *		Callback to only prefetch blocks that are not all-visible.
+ *
+ * We don't want to inspect the visibility map repeatedly, so the result of
+ * VM_ALL_VISIBLE is stored in the batch private data. The values are set
+ * to 0 by default, so we use two constants to remember if all-visible or
+ * not all-visible.
+ */
+static bool
+ios_prefetch_block(IndexScanDesc scan, ScanDirection direction,
+				   void *arg, int index)
+{
+	IndexOnlyScanState *node = (IndexOnlyScanState *) arg;
+
+	if (scan->xs_batch->privateData[index] == IOS_UNKNOWN_VISIBILITY)
+	{
+		bool		all_visible;
+		ItemPointer tid = &scan->xs_batch->heaptids[index];
+
+		all_visible = VM_ALL_VISIBLE(scan->heapRelation,
+									 ItemPointerGetBlockNumber(tid),
+									 &node->ioss_VMBuffer);
+
+		scan->xs_batch->privateData[index]
+			= all_visible ? IOS_ALL_VISIBLE : IOS_NOT_ALL_VISIBLE;
+	}
+
+	/* prefetch only blocks that are not all-visible */
+	return (scan->xs_batch->privateData[index] == IOS_NOT_ALL_VISIBLE);
+}
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index 8000feff4c9..6d5de6e2ce8 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -38,6 +38,7 @@
 #include "lib/pairingheap.h"
 #include "miscadmin.h"
 #include "nodes/nodeFuncs.h"
+#include "optimizer/cost.h"
 #include "utils/array.h"
 #include "utils/datum.h"
 #include "utils/lsyscache.h"
@@ -110,7 +111,8 @@ IndexNext(IndexScanState *node)
 								   node->iss_RelationDesc,
 								   estate->es_snapshot,
 								   node->iss_NumScanKeys,
-								   node->iss_NumOrderByKeys);
+								   node->iss_NumOrderByKeys,
+								   node->iss_CanBatch);
 
 		node->iss_ScanDesc = scandesc;
 
@@ -125,28 +127,82 @@ IndexNext(IndexScanState *node)
 	}
 
 	/*
-	 * ok, now that we have what we need, fetch the next tuple.
+	 * If the batching is disabled by a GUC, or if it's not supported by the
+	 * index AM, do the original approach.
+	 *
+	 * XXX Maybe we should enable batching based on the plan too, so that we
+	 * don't do batching when it's probably useless (e.g. semijoins or queries
+	 * with LIMIT 1 etc.). But maybe the approach with slow ramp-up (starting
+	 * with small batches) will handle that well enough.
+	 *
+	 * XXX Perhaps it'd be possible to do both in index_getnext_slot(), i.e.
+	 * call either the original code without batching, or the new batching
+	 * code if supported/enabled. It's not great to have duplicated code.
 	 */
-	while (index_getnext_slot(scandesc, direction, slot))
+	if (scandesc->xs_batch == NULL)
 	{
-		CHECK_FOR_INTERRUPTS();
-
 		/*
-		 * If the index was lossy, we have to recheck the index quals using
-		 * the fetched tuple.
+		 * ok, now that we have what we need, fetch the next tuple.
 		 */
-		if (scandesc->xs_recheck)
+		while (index_getnext_slot(scandesc, direction, slot))
 		{
-			econtext->ecxt_scantuple = slot;
-			if (!ExecQualAndReset(node->indexqualorig, econtext))
+			CHECK_FOR_INTERRUPTS();
+
+			/*
+			 * If the index was lossy, we have to recheck the index quals
+			 * using the fetched tuple.
+			 */
+			if (scandesc->xs_recheck)
 			{
-				/* Fails recheck, so drop it and loop back for another */
-				InstrCountFiltered2(node, 1);
-				continue;
+				econtext->ecxt_scantuple = slot;
+				if (!ExecQualAndReset(node->indexqualorig, econtext))
+				{
+					/* Fails recheck, so drop it and loop back for another */
+					InstrCountFiltered2(node, 1);
+					continue;
+				}
+			}
+
+			return slot;
+		}
+	}
+	else
+	{
+new_batch:
+		/* do we have TIDs in the current batch */
+		while (index_batch_getnext_slot(scandesc, direction, slot))
+		{
+			CHECK_FOR_INTERRUPTS();
+
+			/* first, take care of prefetching further items */
+			index_batch_prefetch(scandesc, direction, NULL, NULL);
+
+			/*
+			 * If the index was lossy, we have to recheck the index quals
+			 * using the fetched tuple.
+			 */
+			if (scandesc->xs_recheck)
+			{
+				econtext->ecxt_scantuple = slot;
+				if (!ExecQualAndReset(node->indexqualorig, econtext))
+				{
+					/* Fails recheck, so drop it and loop back for another */
+					InstrCountFiltered2(node, 1);
+					continue;
+				}
 			}
+
+			return slot;
+		}
+
+		/* batch is empty, try reading the next batch of tuples */
+		if (index_batch_getnext(scandesc, direction))
+		{
+			index_batch_prefetch(scandesc, direction, NULL, NULL);
+			goto new_batch;
 		}
 
-		return slot;
+		return NULL;
 	}
 
 	/*
@@ -205,7 +261,8 @@ IndexNextWithReorder(IndexScanState *node)
 								   node->iss_RelationDesc,
 								   estate->es_snapshot,
 								   node->iss_NumScanKeys,
-								   node->iss_NumOrderByKeys);
+								   node->iss_NumOrderByKeys,
+								   false);	/* XXX should use batching? */
 
 		node->iss_ScanDesc = scandesc;
 
@@ -942,6 +999,16 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags)
 	indexstate->indexorderbyorig =
 		ExecInitExprList(node->indexorderbyorig, (PlanState *) indexstate);
 
+	/*
+	 * Can't do batching (and thus prefetching) when the plan requires mark
+	 * and restore. There's an issue with translating the mark/restore
+	 * positions between the batch in scan descriptor and the original
+	 * position recognized in the index AM.
+	 *
+	 * XXX Hopefully just a temporary limitation?
+	 */
+	indexstate->iss_CanBatch = !(eflags & EXEC_FLAG_MARK);
+
 	/*
 	 * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
 	 * here.  This allows an index-advisor plugin to EXPLAIN a plan containing
@@ -1670,12 +1737,20 @@ ExecIndexScanInitializeDSM(IndexScanState *node,
 								  estate->es_snapshot,
 								  piscan);
 	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, piscan);
+
+	/*
+	 * XXX do we actually want prefetching for parallel index scans? Maybe
+	 * not, but then we need to be careful not to call index_batch_getnext_tid
+	 * (which now can happen, because we'll call IndexOnlyNext even for
+	 * parallel plans).
+	 */
 	node->iss_ScanDesc =
 		index_beginscan_parallel(node->ss.ss_currentRelation,
 								 node->iss_RelationDesc,
 								 node->iss_NumScanKeys,
 								 node->iss_NumOrderByKeys,
-								 piscan);
+								 piscan,
+								 node->iss_CanBatch);
 
 	/*
 	 * If no run-time keys to calculate or they are ready, go ahead and pass
@@ -1713,12 +1788,20 @@ ExecIndexScanInitializeWorker(IndexScanState *node,
 	ParallelIndexScanDesc piscan;
 
 	piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
+
+	/*
+	 * XXX do we actually want prefetching for parallel index scans? Maybe
+	 * not, but then we need to be careful not to call index_batch_getnext_tid
+	 * (which now can happen, because we'll call IndexOnlyNext even for
+	 * parallel plans).
+	 */
 	node->iss_ScanDesc =
 		index_beginscan_parallel(node->ss.ss_currentRelation,
 								 node->iss_RelationDesc,
 								 node->iss_NumScanKeys,
 								 node->iss_NumOrderByKeys,
-								 piscan);
+								 piscan,
+								 node->iss_CanBatch);
 
 	/*
 	 * If no run-time keys to calculate or they are ready, go ahead and pass
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 03d7fb5f482..2c9c829c288 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -6340,9 +6340,14 @@ get_actual_variable_endpoint(Relation heapRel,
 	InitNonVacuumableSnapshot(SnapshotNonVacuumable,
 							  GlobalVisTestFor(heapRel));
 
+	/*
+	 * XXX I'm not sure about batching/prefetching here. In most cases we
+	 * expect to find the endpoints immediately, but sometimes we have a lot
+	 * of dead tuples - and then prefetching might help.
+	 */
 	index_scan = index_beginscan(heapRel, indexRel,
 								 &SnapshotNonVacuumable,
-								 1, 0);
+								 1, 0, false);
 	/* Set it up for index-only scan */
 	index_scan->xs_want_itup = true;
 	index_rescan(index_scan, scankeys, 1, NULL, 0);
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 686309db58b..62a7f63a613 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -789,6 +789,16 @@ struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
+	{
+		{"enable_indexscan_batching", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Enables the planner's use of index-scan batching."),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&enable_indexscan_batching,
+		true,
+		NULL, NULL, NULL
+	},
 	{
 		{"enable_indexonlyscan", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enables the planner's use of index-only-scan plans."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 667e0dc40a2..b2509337755 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -398,6 +398,7 @@
 #enable_hashjoin = on
 #enable_incremental_sort = on
 #enable_indexscan = on
+#enable_indexscan_batching = on
 #enable_indexonlyscan = on
 #enable_material = on
 #enable_memoize = on
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index f25c9d58a7d..d9106501b69 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -177,6 +177,10 @@ typedef void (*amrescan_function) (IndexScanDesc scan,
 typedef bool (*amgettuple_function) (IndexScanDesc scan,
 									 ScanDirection direction);
 
+/* next batch of valid tuples */
+typedef bool (*amgetbatch_function) (IndexScanDesc scan,
+									 ScanDirection direction);
+
 /* fetch all valid tuples */
 typedef int64 (*amgetbitmap_function) (IndexScanDesc scan,
 									   TIDBitmap *tbm);
@@ -280,6 +284,7 @@ typedef struct IndexAmRoutine
 	ambeginscan_function ambeginscan;
 	amrescan_function amrescan;
 	amgettuple_function amgettuple; /* can be NULL */
+	amgetbatch_function amgetbatch; /* can be NULL */
 	amgetbitmap_function amgetbitmap;	/* can be NULL */
 	amendscan_function amendscan;
 	ammarkpos_function ammarkpos;	/* can be NULL */
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index fdcfbe8db74..5a919f7d37a 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -14,6 +14,7 @@
 #ifndef GENAM_H
 #define GENAM_H
 
+#include "access/itup.h"
 #include "access/sdir.h"
 #include "access/skey.h"
 #include "nodes/tidbitmap.h"
@@ -88,6 +89,7 @@ typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr, void *state);
 
 /* struct definitions appear in relscan.h */
 typedef struct IndexScanDescData *IndexScanDesc;
+typedef struct IndexScanBatchData *IndexScanBatch;
 typedef struct SysScanDescData *SysScanDesc;
 
 typedef struct ParallelIndexScanDescData *ParallelIndexScanDesc;
@@ -132,6 +134,8 @@ typedef struct IndexOrderByDistance
  * generalized index_ interface routines (in indexam.c)
  */
 
+extern PGDLLIMPORT bool enable_indexscan_batching;
+
 /*
  * IndexScanIsValid
  *		True iff the index scan is valid.
@@ -155,7 +159,8 @@ extern void index_insert_cleanup(Relation indexRelation,
 extern IndexScanDesc index_beginscan(Relation heapRelation,
 									 Relation indexRelation,
 									 Snapshot snapshot,
-									 int nkeys, int norderbys);
+									 int nkeys, int norderbys,
+									 bool enable_batching);
 extern IndexScanDesc index_beginscan_bitmap(Relation indexRelation,
 											Snapshot snapshot,
 											int nkeys);
@@ -173,7 +178,8 @@ extern void index_parallelscan_initialize(Relation heapRelation,
 extern void index_parallelrescan(IndexScanDesc scan);
 extern IndexScanDesc index_beginscan_parallel(Relation heaprel,
 											  Relation indexrel, int nkeys, int norderbys,
-											  ParallelIndexScanDesc pscan);
+											  ParallelIndexScanDesc pscan,
+											  bool enable_batching);
 extern ItemPointer index_getnext_tid(IndexScanDesc scan,
 									 ScanDirection direction);
 struct TupleTableSlot;
@@ -182,6 +188,25 @@ extern bool index_getnext_slot(IndexScanDesc scan, ScanDirection direction,
 							   struct TupleTableSlot *slot);
 extern int64 index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap);
 
+extern bool index_batch_getnext(IndexScanDesc scan,
+								ScanDirection direction);
+extern ItemPointer index_batch_getnext_tid(IndexScanDesc scan,
+										   ScanDirection direction);
+extern bool index_batch_getnext_slot(IndexScanDesc scan,
+									 ScanDirection direction,
+									 struct TupleTableSlot *slot);
+extern bool index_batch_add(IndexScanDesc scan, ItemPointerData tid, bool recheck,
+							IndexTuple itup, HeapTuple htup);
+
+/*
+ * Typedef for callback function to determine if an item in index scan should
+ * be prefetched.
+ */
+typedef bool (*index_prefetch_callback) (IndexScanDesc scan, ScanDirection direction,
+										 void *arg, int index);
+extern void index_batch_prefetch(IndexScanDesc scan, ScanDirection direction,
+								 index_prefetch_callback callback, void *arg);
+
 extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info,
 												IndexBulkDeleteResult *istat,
 												IndexBulkDeleteCallback callback,
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 9af9b3ecdcc..3326d2d8958 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1037,6 +1037,14 @@ typedef struct BTArrayKeyInfo
 	Datum	   *elem_values;	/* array of num_elems Datums */
 } BTArrayKeyInfo;
 
+/* Information about the current batch (in batched index scans) */
+typedef struct BTBatchInfo
+{
+	/* Current range of items in a batch (if used). */
+	int			firstIndex;
+	int			lastIndex;
+} BTBatchInfo;
+
 typedef struct BTScanOpaqueData
 {
 	/* these fields are set by _bt_preprocess_keys(): */
@@ -1056,6 +1064,9 @@ typedef struct BTScanOpaqueData
 	int		   *killedItems;	/* currPos.items indexes of killed items */
 	int			numKilled;		/* number of currently stored items */
 
+	/* info about current batch */
+	BTBatchInfo batch;
+
 	/*
 	 * If we are doing an index-only scan, these are the tuple storage
 	 * workspaces for the currPos and markPos respectively.  Each is of size
@@ -1172,6 +1183,7 @@ extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys);
 extern Size btestimateparallelscan(int nkeys, int norderbys);
 extern void btinitparallelscan(void *target);
 extern bool btgettuple(IndexScanDesc scan, ScanDirection dir);
+extern bool btgetbatch(IndexScanDesc scan, ScanDirection dir);
 extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
 extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 					 ScanKey orderbys, int norderbys);
@@ -1276,6 +1288,9 @@ extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
 extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum);
 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
+extern bool _bt_first_batch(IndexScanDesc scan, ScanDirection dir);
+extern bool _bt_next_batch(IndexScanDesc scan, ScanDirection dir);
+extern void _bt_kill_batch(IndexScanDesc scan);
 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
 
 /*
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 521043304ab..9dfd933e22e 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -106,6 +106,54 @@ typedef struct IndexFetchTableData
 	Relation	rel;
 } IndexFetchTableData;
 
+/*
+ * Data about the current TID batch returned by the index AM.
+ *
+ * XXX Maybe this should be a separate struct instead, and the scan
+ * descriptor would have only a pointer, initialized only when the
+ * batching is actually used?
+ *
+ * XXX It's not quite clear which part of this is managed by indexam and
+ * what's up to the actual index AM implementation. Needs some clearer
+ * boundaries.
+ *
+ * XXX Should we have a pointer for optional state managed by the AM? Some
+ * custom AMs may need more per-batch information, not just the fields we
+ * have here.
+ */
+typedef struct IndexScanBatchData
+{
+	/* batch size - maximum, initial, current (with ramp up) */
+	int			maxSize;
+	int			initSize;
+	int			currSize;
+
+	/* memory context for per-batch data */
+	MemoryContext ctx;
+
+	/* batch prefetching */
+	int			prefetchTarget; /* current prefetch distance */
+	int			prefetchMaximum;	/* maximum prefetch distance */
+	int			prefetchIndex;	/* next item to prefetch */
+
+	/* batch contents (TIDs, index tuples, kill bitmap, ...) */
+	int			currIndex;		/* index of the current item */
+	int			nheaptids;		/* number of TIDs in the batch */
+	ItemPointerData *heaptids;	/* TIDs in the batch */
+	IndexTuple *itups;			/* IndexTuples, if requested */
+	HeapTuple  *htups;			/* HeapTuples, if requested */
+	bool	   *recheck;		/* recheck flags */
+	Datum	   *privateData;	/* private data for batch */
+
+	/* xs_orderbyvals / xs_orderbynulls */
+	Datum	   *orderbyvals;
+	bool	   *orderbynulls;
+
+	/* list of killed items */
+	int			nKilledItems;	/* number of killedItems elements */
+	int		   *killedItems;	/* list of indexes to kill */
+} IndexScanBatchData;
+
 /*
  * We use the same IndexScanDescData structure for both amgettuple-based
  * and amgetbitmap-based index scans.  Some fields are only relevant in
@@ -151,6 +199,9 @@ typedef struct IndexScanDescData
 
 	bool		xs_recheck;		/* T means scan keys must be rechecked */
 
+	/* Information used by batched index scans. */
+	IndexScanBatchData *xs_batch;
+
 	/*
 	 * When fetching with an ordering operator, the values of the ORDER BY
 	 * expressions of the last returned tuple, according to the index.  If
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 516b9487435..4589ad5ed78 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1646,6 +1646,7 @@ typedef struct
  *		OrderByTypByVals   is the datatype of order by expression pass-by-value?
  *		OrderByTypLens	   typlens of the datatypes of order by expressions
  *		PscanLen		   size of parallel index scan descriptor
+ *		CanBatch		   batching (and prefetching) enabled
  * ----------------
  */
 typedef struct IndexScanState
@@ -1673,6 +1674,10 @@ typedef struct IndexScanState
 	bool	   *iss_OrderByTypByVals;
 	int16	   *iss_OrderByTypLens;
 	Size		iss_PscanLen;
+
+	/* batching/prefetching enabled? */
+	bool		iss_CanBatch;
+
 } IndexScanState;
 
 /* ----------------
@@ -1694,6 +1699,7 @@ typedef struct IndexScanState
  *		PscanLen		   size of parallel index-only scan descriptor
  *		NameCStringAttNums attnums of name typed columns to pad to NAMEDATALEN
  *		NameCStringCount   number of elements in the NameCStringAttNums array
+ *		CanBatch		   batching (and prefetching) enabled
  * ----------------
  */
 typedef struct IndexOnlyScanState
@@ -1715,6 +1721,7 @@ typedef struct IndexOnlyScanState
 	Size		ioss_PscanLen;
 	AttrNumber *ioss_NameCStringAttNums;
 	int			ioss_NameCStringCount;
+	bool		ioss_CanBatch;
 } IndexOnlyScanState;
 
 /* ----------------
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index fad7fc3a7e0..14b38ed4d46 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -157,6 +157,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_incremental_sort        | on
  enable_indexonlyscan           | on
  enable_indexscan               | on
+ enable_indexscan_batching      | on
  enable_material                | on
  enable_memoize                 | on
  enable_mergejoin               | on
@@ -170,7 +171,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(22 rows)
+(23 rows)
 
 -- There are always wait event descriptions for various types.  InjectionPoint
 -- may be present or absent, depending on history since last postmaster start.
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index df3f336bec0..9cb66d6e25c 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -188,6 +188,7 @@ BOOL
 BOOLEAN
 BOX
 BTArrayKeyInfo
+BTBatchInfo
 BTBuildState
 BTCycleId
 BTDedupInterval
@@ -1219,6 +1220,7 @@ IndexOrderByDistance
 IndexPath
 IndexRuntimeKeyInfo
 IndexScan
+IndexScanBatchData
 IndexScanDesc
 IndexScanState
 IndexStateFlagsAction
@@ -3278,6 +3280,7 @@ amendscan_function
 amestimateparallelscan_function
 amgetbitmap_function
 amgettuple_function
+amgetbatch_function
 aminitparallelscan_function
 aminsert_function
 aminsertcleanup_function
-- 
2.46.0

