From 0860de6872b7e39f5a940c7d008e0c113f083bb1 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Mon, 30 Sep 2024 22:48:12 +0200
Subject: [PATCH v20241106 1/7] WIP: index batching / prefetching

Allows the index AM to provide items (TIDs and tuples) in batches, which
is then used to implement prefetching of heap tuples in index scans
(including index-only scans). This is similar to prefetching already
done in bitmap scans, and can result in significant speedups.

The index AM may implement an optional "amgetbatch" callback, returning
a batch of items. The indexam.c code then handles this transparently
through the existing "getnext" interface.

It is up to the index AM to return only batches that it can handle
internally. For example, most of the later patches adding support for
batching to relevant index AMs (btree, hash, gist, sp-gist) restrict the
batches to a single leaf page. This makes implementation of batching
much simpler, with only minimal changes to the index AMs, but it's not a
hard requirement. The index AM can produce batches spanning arbitrary
number of leaf pages. This is left as a possible future improvement.

Most of the batching/prefetching logic happens in indexam.c. This means
the executor code can continue to call the interface just like before.

The only "violation" happens in index-only scans, which need to check
the visibility map both when prefetching pages (we don't want to
prefetch pages that are unnecessary) and later when reading the data.
For cached data the visibility map checks can be fairly expensive, so
it's desirable to keep and reuse the result of the first check.

At the moment, the prefetching does not handle mark/restore plans. This
is doable, but requires additional synchronization between the batching
and index AM code in the "opposite direction".

This patch does not actually add batching to any of the index AMs, it's
just the common infrastructure.

TODO Add the new index AM callback to sgml docs.
---
 src/backend/access/heap/heapam_handler.c      |   7 +-
 src/backend/access/index/genam.c              |  23 +-
 src/backend/access/index/indexam.c            | 808 +++++++++++++++++-
 src/backend/executor/execIndexing.c           |   7 +-
 src/backend/executor/execReplication.c        |   9 +-
 src/backend/executor/nodeIndexonlyscan.c      | 106 ++-
 src/backend/executor/nodeIndexscan.c          |  36 +-
 src/backend/utils/adt/selfuncs.c              |   7 +-
 src/backend/utils/misc/guc_tables.c           |  10 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/access/amapi.h                    |   5 +
 src/include/access/genam.h                    |  14 +-
 src/include/access/relscan.h                  |  64 ++
 src/include/nodes/execnodes.h                 |   7 +
 src/test/regress/expected/sysviews.out        |   3 +-
 src/tools/pgindent/typedefs.list              |   2 +
 16 files changed, 1084 insertions(+), 25 deletions(-)

diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index a8d95e0f1c1..3fceae759d2 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -749,7 +749,12 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 
 		tableScan = NULL;
 		heapScan = NULL;
-		indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
+
+		/*
+		 * XXX Maybe enable batching/prefetch for clustering. Seems like it
+		 * might be a pretty substantial win.
+		 */
+		indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0, false);
 		index_rescan(indexScan, NULL, 0, NULL, 0);
 	}
 	else
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index 60c61039d66..3a9d2d483d7 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -445,8 +445,18 @@ systable_beginscan(Relation heapRelation,
 				elog(ERROR, "column is not in index");
 		}
 
+		/*
+		 * No batching/prefetch for catalogs. We don't expect that to help
+		 * very much, because we usually need just one row, and even if we
+		 * need multiple rows, they tend to be colocated in heap.
+		 *
+		 * XXX Maybe we could do that, the prefetching only ramps up over
+		 * time. But then we need to be careful about infinite recursion when
+		 * looking up effective_io_concurrency for a tablespace in the
+		 * catalog.
+		 */
 		sysscan->iscan = index_beginscan(heapRelation, irel,
-										 snapshot, nkeys, 0);
+										 snapshot, nkeys, 0, false);
 		index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0);
 		sysscan->scan = NULL;
 	}
@@ -708,8 +718,17 @@ systable_beginscan_ordered(Relation heapRelation,
 			elog(ERROR, "column is not in index");
 	}
 
+	/*
+	 * No batching/prefetch for catalogs. We don't expect that to help very
+	 * much, because we usually need just one row, and even if we need
+	 * multiple rows, they tend to be colocated in heap.
+	 *
+	 * XXX Maybe we could do that, the prefetching only ramps up over time.
+	 * But then we need to be careful about infinite recursion when looking up
+	 * effective_io_concurrency for a tablespace in the catalog.
+	 */
 	sysscan->iscan = index_beginscan(heapRelation, indexRelation,
-									 snapshot, nkeys, 0);
+									 snapshot, nkeys, 0, false);
 	index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0);
 	sysscan->scan = NULL;
 
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 1859be614c0..2849ab97cdf 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -33,6 +33,7 @@
  *		index_can_return	- does index support index-only scans?
  *		index_getprocid - get a support procedure OID
  *		index_getprocinfo - get a support procedure's lookup info
+ *		index_batch_add		- add an item (TID, itup) to the batch
  *
  * NOTES
  *		This file contains the index_ routines which used
@@ -54,10 +55,14 @@
 #include "pgstat.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "utils/memutils.h"
 #include "utils/ruleutils.h"
 #include "utils/snapmgr.h"
+#include "utils/spccache.h"
 #include "utils/syscache.h"
 
+/* enable reading batches / prefetching of TIDs from the index */
+bool		enable_indexscan_batching = false;
 
 /* ----------------------------------------------------------------
  *					macros used in index_ routines
@@ -109,6 +114,15 @@ static IndexScanDesc index_beginscan_internal(Relation indexRelation,
 											  ParallelIndexScanDesc pscan, bool temp_snap);
 static inline void validate_relation_kind(Relation r);
 
+/* index batching */
+static void index_batch_init(IndexScanDesc scan);
+static void index_batch_reset(IndexScanDesc scan);
+static bool index_batch_getnext(IndexScanDesc scan,
+								ScanDirection direction);
+static ItemPointer index_batch_getnext_tid(IndexScanDesc scan,
+										   ScanDirection direction);
+static void index_batch_prefetch(IndexScanDesc scan,
+								 ScanDirection direction);
 
 /* ----------------------------------------------------------------
  *				   index_ interface functions
@@ -256,7 +270,8 @@ IndexScanDesc
 index_beginscan(Relation heapRelation,
 				Relation indexRelation,
 				Snapshot snapshot,
-				int nkeys, int norderbys)
+				int nkeys, int norderbys,
+				bool enable_batching)
 {
 	IndexScanDesc scan;
 
@@ -274,6 +289,24 @@ index_beginscan(Relation heapRelation,
 	/* prepare to fetch index matches from table */
 	scan->xs_heapfetch = table_index_fetch_begin(heapRelation);
 
+	/*
+	 * If explicitly requested and supported by both the index AM and the
+	 * plan, initialize batching info.
+	 *
+	 * XXX We do this after ambeginscan(), which means the AM can't init the
+	 * private data in there (it doesn't even know if batching will be used at
+	 * that point).
+	 *
+	 * XXX Maybe we should have a separate "amcanbatch" call, to let the AM
+	 * decide if batching is supported depending on the scan details.
+	 */
+	if ((indexRelation->rd_indam->amgetbatch != NULL) &&
+		enable_batching &&
+		enable_indexscan_batching)
+	{
+		index_batch_init(scan);
+	}
+
 	return scan;
 }
 
@@ -333,6 +366,12 @@ index_beginscan_internal(Relation indexRelation,
 	scan->parallel_scan = pscan;
 	scan->xs_temp_snap = temp_snap;
 
+	/*
+	 * No batching by default, so set it to NULL. Will be initialized later if
+	 * batching is requested and AM supports it.
+	 */
+	scan->xs_batch = NULL;
+
 	return scan;
 }
 
@@ -368,6 +407,18 @@ index_rescan(IndexScanDesc scan,
 
 	scan->indexRelation->rd_indam->amrescan(scan, keys, nkeys,
 											orderbys, norderbys);
+
+	/*
+	 * Reset the batch, to make it look empty.
+	 *
+	 * Done after the amrescan() call, in case the AM needs some of the batch
+	 * info (e.g. to properly transfer the killed tuples).
+	 *
+	 * XXX This is a bit misleading, because index_batch_reset does not reset
+	 * the killed tuples. So if that's the only justification, we could have
+	 * done it before the call.
+	 */
+	index_batch_reset(scan);
 }
 
 /* ----------------
@@ -444,6 +495,18 @@ index_restrpos(IndexScanDesc scan)
 	scan->xs_heap_continue = false;
 
 	scan->indexRelation->rd_indam->amrestrpos(scan);
+
+	/*
+	 * Reset the batch, to make it look empty.
+	 *
+	 * Done after the amrestrpos() call, in case the AM needs some of the batch
+	 * info (e.g. to properly transfer the killed tuples).
+	 *
+	 * XXX This is a bit misleading, because index_batch_reset does not reset
+	 * the killed tuples. So if that's the only justification, we could have
+	 * done it before the call.
+	 */
+	index_batch_reset(scan);
 }
 
 /*
@@ -539,7 +602,8 @@ index_parallelrescan(IndexScanDesc scan)
  */
 IndexScanDesc
 index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys,
-						 int norderbys, ParallelIndexScanDesc pscan)
+						 int norderbys, ParallelIndexScanDesc pscan,
+						 bool enable_batching)
 {
 	Snapshot	snapshot;
 	IndexScanDesc scan;
@@ -562,6 +626,24 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys,
 	/* prepare to fetch index matches from table */
 	scan->xs_heapfetch = table_index_fetch_begin(heaprel);
 
+	/*
+	 * If explicitly requested and supported by both the index AM and the
+	 * plan, initialize batching info.
+	 *
+	 * XXX We do this after ambeginscan(), which means the AM can't init the
+	 * private data in there (it doesn't even know if batching will be used at
+	 * that point).
+	 *
+	 * XXX Maybe we should have a separate "amcanbatch" call, to let the AM
+	 * decide if batching is supported depending on the scan details.
+	 */
+	if ((indexrel->rd_indam->amgetbatch != NULL) &&
+		enable_batching &&
+		enable_indexscan_batching)
+	{
+		index_batch_init(scan);
+	}
+
 	return scan;
 }
 
@@ -583,6 +665,53 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 	/* XXX: we should assert that a snapshot is pushed or registered */
 	Assert(TransactionIdIsValid(RecentXmin));
 
+	/*
+	 * With batching, return TIDs from the batch. If batching is disabled (by
+	 * a GUC, or not supported by the index AM), do the old approach below.
+	 *
+	 * XXX Maybe we should enable batching based on the plan too, so that we
+	 * don't do batching when it's probably useless (e.g. semijoins or queries
+	 * with LIMIT 1 etc.). But maybe the approach with slow ramp-up (starting
+	 * with small batches) will handle that well enough.
+	 *
+	 * XXX Perhaps it'd be possible to do both in index_getnext_slot(), i.e.
+	 * call either the original code without batching, or the new batching
+	 * code if supported/enabled. It's not great to have duplicated code.
+	 */
+	if (scan->xs_batch != NULL)
+	{
+batch_loaded:
+		/* Try getting a TID from the current batch (if we have one). */
+		while (index_batch_getnext_tid(scan, direction) != NULL)
+		{
+			/*
+			 * We've successfully loaded a TID from the batch, so issue
+			 * prefetches for future TIDs if needed.
+			 */
+			index_batch_prefetch(scan, direction);
+
+			return &scan->xs_heaptid;
+		}
+
+		/*
+		 * We either don't have any batch yet, or we've already processed
+		 * all items from the current batch. Try loading the next one.
+		 *
+		 * If we succeed, issue prefetches (using the current prefetch
+		 * distance without ramp up), and then go back to returning the
+		 * TIDs from the batch.
+		 *
+		 * XXX Maybe do this as a simple while/for loop without the goto.
+		 */
+		if (index_batch_getnext(scan, direction))
+		{
+			index_batch_prefetch(scan, direction);
+			goto batch_loaded;
+		}
+
+		return NULL;
+	}
+
 	/*
 	 * The AM's amgettuple proc finds the next index entry matching the scan
 	 * keys, and puts the TID into scan->xs_heaptid.  It should also set
@@ -651,7 +780,19 @@ index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot)
 	 * RelationGetIndexScan().
 	 */
 	if (!scan->xactStartedInRecovery)
-		scan->kill_prior_tuple = all_dead;
+	{
+		if (scan->xs_batch == NULL)
+		{
+			scan->kill_prior_tuple = all_dead;
+		}
+		else if (all_dead)
+		{
+			/* batch case - record the killed tuple in the batch */
+			if (scan->xs_batch->nKilledItems < scan->xs_batch->maxSize)
+				scan->xs_batch->killedItems[scan->xs_batch->nKilledItems++]
+					= scan->xs_batch->currIndex;
+		}
+	}
 
 	return found;
 }
@@ -1039,3 +1180,664 @@ index_opclass_options(Relation indrel, AttrNumber attnum, Datum attoptions,
 
 	return build_local_reloptions(&relopts, attoptions, validate);
 }
+
+/*
+ * INDEX BATCHING AND PREFETCHING
+ *
+ * Allows reading chunks of items from an index, instead of reading them
+ * one by one. This reduces the overhead of accessing index pages, and
+ * also allows acting on "future" TIDs - e.g. we can prefetch heap pages
+ * that will be needed, etc.
+ *
+ *
+ * index AM contract
+ * -----------------
+ *
+ * To support batching, the index AM needs to implement an optional callback
+ * amgetbatch() which loads data into the batch (in the scan descriptor).
+ *
+ * The index AM also needs to ensure it can perform all optimizations for
+ * all TIDs in the current batch. A good example is the kill_prior_tuple
+ * optimization - with batching, the index AM may receive the information
+ * which tuples are to be killed with a delay - when loading the next
+ * batch, when ending/restarting the scan, etc. The AM needs to ensure it
+ * can still process such information, to keep the optimization effective.
+ *
+ * The AM may also need to keep pins required by the whole batch (not just
+ * the last tuple), etc.
+ *
+ * What this means/requires is very dependent on the index AM, of course.
+ * For B-Tree (and most other index AMs), batches spanning multiple leaf
+ * pages would be problematic. Such batches would work for basic index
+ * scans, but the kill_prior_tuple would be an issue - the AMs keep only
+ * a single leaf pinned. We'd either need to keep multiple pins, or allow
+ * reading older leaf pages (which might have been modified). Index-only
+ * scans are challenging too - we keep IndexTuple pointers into the
+ * leaf pages, which requires keeping those pins too.
+ *
+ * To solve this, we give the AM some control over batch boundaries. It is
+ * up to the index AM to pick which range of index items to load into the
+ * batch, and how to ensure all the optimizations are possible, keep pins,
+ * and so on. The index AM may use information about the batch (in the
+ * scan descriptor, maintained by indexam.c code), and may also keep some
+ * private information (in the existing "opaque" scan field).
+ *
+ * For most index AMs the easiest way is to not load batches spanning
+ * multiple leaf pages. This may impact the efficiency, especially for
+ * indexes with wide index tuples, as it means batches close to the end
+ * of the leaf page may be smaller.
+ *
+ * Note: There already is a pipeline break for prefetching - as we are
+ * getting closer to the end of a batch, we can't prefetch further than
+ * that, and the effective prefetch distance drops to 0.
+ *
+ * The alternative would be to make the index AMs more complex, to keep
+ * more leaf pages pinned, etc. The current model does not prohibit the
+ * index AMs from implementing that - it's entirely possible to keep the
+ * additional information in the "opaque" structure (say, list of pinned
+ * pages, and other necessary details).
+ *
+ * But that does not seem like a good trade off, as it's subject to
+ * "diminishing returns" - we see significant gains initially (with even
+ * small batches / prefetch distance), and as the batch grows the gains
+ * get smaller and smaller. It does not seem worth the complexity of
+ * pinning more pages etc. at least for the first version.
+ *
+ * To deal with the "prefetch pipeline break", that could be addressed by
+ * allowing multiple in-flight batches - e.g. with prefetch distance 64
+ * we might have three batches of 32 items each, to prefetch far ahead.
+ * But that's not what this patch does yet.
+ *
+ *
+ * batch = sliding window
+ * ----------------------
+ *
+ * A good way to visualize a batch is a sliding window over the array
+ * of items on a leaf page. In the simplest example (forward scan with no
+ * changes of direction), we slice the array into smaller chunks, and
+ * then process each of those chunks.
+ *
+ * The batch size is adaptive - it starts small (only 8 elements) and
+ * increases as we read more batches (up to 64 elements). We don't want
+ * to regress cases that only need a single item (e.g. LIMIT 1 queries),
+ * and loading/copying a lot of data might cause that. So we start small
+ * and increase the size - that still improves cases reading a lot of
+ * data from the index, without hurting small queries.
+ *
+ * Note: This gradual ramp up is for batch size, independent of what we
+ * do for prefetch. The prefetch distance is gradually increased too, but
+ * it's independent / orthogonal to the batch size. The batch size limits
+ * how far ahead we can prefetch, of course.
+ *
+ * Note: The current limits on batch size (initial 8, maximum 64) are
+ * quite arbitrary, it just seemed those values are sane. We could adjust
+ * the initial size, but I don't think it'd make a fundamental difference.
+ * Growing the batches faster/slower has bigger impact.
+ *
+ * The maximum batch size does not matter much - it's true a btree index can
+ * have up to ~1300 items per 8K leaf page, but in most cases the actual
+ * number is lower, perhaps ~300. That's not that far from 64.
+ *
+ * Each batch has a firstIndex/lastIndex to track which part of the leaf
+ * page it currently represents.
+ *
+ *
+ * kill_prior_tuples
+ * -----------------
+ *
+ * If we decide a tuple can be killed, the batch item is marked accordingly,
+ * and the flag is reset to false (so that the index AM does not do something
+ * silly to a random tuple it thinks is "current").
+ *
+ * Then the next time the AM decides it's time to kill tuples, the AM needs
+ * to look at the batch and consider the tuples marked to be killed. B-Tree
+ * simply adds those TIDs to the regular "killItems" array.
+ *
+ *
+ * mark/restore
+ * ------------
+ *
+ * With batching, the index AM does not know the "current" position on
+ * the leaf page - we don't propagate this to the index AM while walking
+ * items in the batch. To make ammarkpos() work, the index AM has to check
+ * the current position in the batch, and translate it to the proper page
+ * position, using the private information (about items in the batch).
+ *
+ * XXX This needs more work, I don't quite like how the two layers interact,
+ * it seems quite wrong to look at the batch info directly.
+ */
+
+/*
+ * Check invariants of the index batch (scan->xs_batch): arrays allocated,
+ * sizes and current/prefetch indexes within limits, all TIDs valid. The
+ * body is compiled only with USE_ASSERT_CHECKING, otherwise it's a no-op.
+ */
+static void
+AssertCheckBatchInfo(IndexScanDesc scan)
+{
+#ifdef USE_ASSERT_CHECKING
+	/* all the arrays need to be allocated */
+	Assert((scan->xs_batch->heaptids != NULL) &&
+		   (scan->xs_batch->killedItems != NULL) &&
+		   (scan->xs_batch->privateData != NULL));
+
+	/* if IndexTuples expected, should be allocated too */
+	Assert(!(scan->xs_want_itup && (scan->xs_batch->itups == NULL)));
+
+	/* Various checks on batch sizes: 0 <= initSize <= currSize <= maxSize */
+	Assert((scan->xs_batch->initSize >= 0) &&
+		   (scan->xs_batch->initSize <= scan->xs_batch->currSize) &&
+		   (scan->xs_batch->currSize <= scan->xs_batch->maxSize) &&
+		   (scan->xs_batch->maxSize <= 1024));	/* arbitrary limit */
+
+	/* Is the number of TIDs in the batch in a valid range? */
+	Assert((scan->xs_batch->nheaptids >= 0) &&
+		   (scan->xs_batch->nheaptids <= scan->xs_batch->maxSize));
+
+	/*
+	 * The current item must be between -1 and nheaptids. Those two extreme
+	 * values are starting points for forward/backward scans.
+	 */
+	Assert((scan->xs_batch->currIndex >= -1) &&
+		   (scan->xs_batch->currIndex <= scan->xs_batch->nheaptids));
+
+	/* prefetch distance must be within [0, prefetchMaximum] */
+	Assert((scan->xs_batch->prefetchTarget >= 0) &&
+		   (scan->xs_batch->prefetchTarget <= scan->xs_batch->prefetchMaximum));
+
+	Assert((scan->xs_batch->prefetchIndex >= -1) &&
+		   (scan->xs_batch->prefetchIndex <= scan->xs_batch->nheaptids));
+
+	for (int i = 0; i < scan->xs_batch->nheaptids; i++)
+		Assert(ItemPointerIsValid(&scan->xs_batch->heaptids[i]));
+#endif
+}
+
+/* Is the batch full (number of TIDs equal to the current size currSize)? */
+#define	INDEX_BATCH_IS_FULL(scan)	\
+	((scan)->xs_batch->nheaptids == (scan)->xs_batch->currSize)
+
+/* Is the batch empty (no TIDs)? */
+#define	INDEX_BATCH_IS_EMPTY(scan)	\
+	((scan)->xs_batch->nheaptids == 0)
+
+/*
+ * Did we process all items? For forward scan it means the index points to the
+ * last item, for backward scans it has to point to the first one.
+ *
+ * This does not cover empty batches properly, because of backward scans.
+ */
+#define	INDEX_BATCH_IS_PROCESSED(scan, direction)	\
+	(ScanDirectionIsForward(direction) ? \
+		((scan)->xs_batch->nheaptids == ((scan)->xs_batch->currIndex + 1)) : \
+		((scan)->xs_batch->currIndex == 0))
+
+/* Does the batch have unprocessed items in the requested direction? */
+#define INDEX_BATCH_HAS_ITEMS(scan, direction) \
+	(!INDEX_BATCH_IS_EMPTY(scan) && !INDEX_BATCH_IS_PROCESSED(scan, direction))
+
+
+/* ----------------
+ *		index_batch_getnext - get the next batch of TIDs from a scan
+ *
+ * Returns true if we managed to read at least some TIDs into the batch,
+ * or false if there are no more TIDs in the scan. The xs_batch->heaptids
+ * and xs_batch->nheaptids fields hold the TIDs and the number of elements.
+ *
+ * XXX This only loads the TIDs and resets the various batch fields to
+ * fresh state. It does not set xs_heaptid/xs_itup/xs_hitup, that's the
+ * responsibility of the following index_batch_getnext_tid() calls.
+ * ----------------
+ */
+static bool
+index_batch_getnext(IndexScanDesc scan, ScanDirection direction)
+{
+	bool		found;
+
+	SCAN_CHECKS;
+	CHECK_SCAN_PROCEDURE(amgetbatch);
+
+	/* XXX: we should assert that a snapshot is pushed or registered */
+	Assert(TransactionIdIsValid(RecentXmin));
+
+	/* comprehensive checks of batching info */
+	AssertCheckBatchInfo(scan);
+
+	/*
+	 * We never read a new batch before we run out of items in the current
+	 * one. The current batch has to be either empty or we ran out of items
+	 * (in the given direction).
+	 */
+	Assert(!INDEX_BATCH_HAS_ITEMS(scan, direction));
+
+	/*
+	 * Reset the current/prefetch positions in the batch.
+	 *
+	 * XXX Done before calling amgetbatch(), so that it sees the index as
+	 * invalid, batch as empty, and can add items.
+	 *
+	 * XXX This also resets nheaptids to 0, even though the AM might rely on
+	 * the old value when processing killed tuples (TODO confirm). Maybe
+	 * store the killed tuples differently?
+	 */
+	scan->xs_batch->currIndex = -1;
+	scan->xs_batch->prefetchIndex = 0;
+	scan->xs_batch->nheaptids = 0;
+
+	/*
+	 * Reset the memory context with all per-batch data, allocated by the AM.
+	 * This might be tuples, or anything else the AM needs.
+	 *
+	 * XXX Make sure to reset the tuples, because the AM may do something with
+	 * them later (e.g. release them, as getNextNearest in gist), but we may
+	 * release them by the MemoryContextReset() call.
+	 *
+	 * This might break the AM if it relies on them pointing to the last
+	 * tuple, but at least it has the chance to do the right thing by checking
+	 * if the pointer is NULL.
+	 */
+	scan->xs_itup = NULL;
+	scan->xs_hitup = NULL;
+
+	MemoryContextReset(scan->xs_batch->ctx);
+
+	/*
+	 * The AM's amgetbatch proc loads a chunk of TIDs matching the scan keys,
+	 * and puts the TIDs into scan->xs_batch->heaptids.  It should also set
+	 * per-item scan->xs_batch->recheck flags and possibly
+	 * scan->xs_batch->itups/scan->xs_batch->htups, though we pay no
+	 * attention to those fields here.
+	 *
+	 * FIXME At the moment this does nothing with hitup. Needs to be fixed?
+	 */
+	found = scan->indexRelation->rd_indam->amgetbatch(scan, direction);
+
+	/* Reset kill flag immediately for safety */
+	scan->kill_prior_tuple = false;
+	scan->xs_heap_continue = false;
+
+	/* If we're out of index entries, we're done */
+	if (!found)
+	{
+		/* release resources (like buffer pins) from table accesses */
+		if (scan->xs_heapfetch)
+			table_index_fetch_reset(scan->xs_heapfetch);
+
+		return false;
+	}
+
+	/* We should have a non-empty batch with items. */
+	Assert(INDEX_BATCH_HAS_ITEMS(scan, direction));
+
+	pgstat_count_index_tuples(scan->indexRelation, scan->xs_batch->nheaptids);
+
+	/*
+	 * Set the prefetch index to the current index of the loaded batch (we
+	 * expect the index AM to have set currIndex by now).
+	 *
+	 * FIXME Maybe set the currIndex here, not in the index AM. It seems much
+	 * more like indexam.c responsibility rather than something every index AM
+	 * should be doing (in _bt_first_batch etc.).
+	 *
+	 * FIXME It's a bit unclear who (indexam.c or the index AM) is responsible
+	 * for setting which fields. This needs clarification.
+	 */
+	scan->xs_batch->prefetchIndex = scan->xs_batch->currIndex;
+
+	/*
+	 * Try to increase the size of the batch. Intentionally done after the AM
+	 * call, so that the new value applies to the next batch. Otherwise we
+	 * would always skip the initial batch size.
+	 */
+	scan->xs_batch->currSize = Min(scan->xs_batch->currSize + 1,
+								   scan->xs_batch->maxSize);
+
+	/* comprehensive checks of batching info */
+	AssertCheckBatchInfo(scan);
+
+	/* Return the batch of TIDs we found. */
+	return true;
+}
+
+/* ----------------
+ *		index_batch_getnext_tid - get the next TID from the current batch
+ *
+ * Same calling convention as index_getnext_tid(), except that NULL means
+ * no more items in the current batch, there may be more batches.
+ *
+ * XXX This only sets xs_heaptid and xs_itup (if requested). Not sure if
+ * we need to do something with xs_hitup.
+ *
+ * FIXME Should this set xs_hitup?
+ * ----------------
+ */
+static ItemPointer
+index_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction)
+{
+	/* comprehensive checks of batching info */
+	AssertCheckBatchInfo(scan);
+
+	/*
+	 * Bail out if the batch does not have more items in the requested
+	 * direction (either empty or everything processed).
+	 */
+	if (!INDEX_BATCH_HAS_ITEMS(scan, direction))
+		return NULL;
+
+	/*
+	 * Advance to the next batch item - we know it's not empty and there are
+	 * items to process, so this is valid.
+	 */
+	if (ScanDirectionIsForward(direction))
+		scan->xs_batch->currIndex++;
+	else
+		scan->xs_batch->currIndex--;
+
+	/*
+	 * Next TID from the batch, optionally also the IndexTuple/HeapTuple.
+	 *
+	 * XXX Not sure how to decide which of the tuples to set, seems easier to
+	 * just set both, one of them will be NULL.
+	 *
+	 * XXX Do we need to reset the itups/htups array between batches? Doesn't
+	 * seem necessary, but maybe we could get bogus data?
+	 */
+	scan->xs_heaptid = scan->xs_batch->heaptids[scan->xs_batch->currIndex];
+	if (scan->xs_want_itup)
+	{
+		scan->xs_itup = scan->xs_batch->itups[scan->xs_batch->currIndex];
+		scan->xs_hitup = scan->xs_batch->htups[scan->xs_batch->currIndex];
+	}
+
+	scan->xs_recheck = scan->xs_batch->recheck[scan->xs_batch->currIndex];
+
+	/*
+	 * If there are order-by clauses, point to the appropriate chunk in the
+	 * arrays (numberOfOrderBys entries per batch item).
+	 */
+	if (scan->numberOfOrderBys > 0)
+	{
+		int			idx = scan->numberOfOrderBys * scan->xs_batch->currIndex;
+
+		scan->xs_orderbyvals = &scan->xs_batch->orderbyvals[idx];
+		scan->xs_orderbynulls = &scan->xs_batch->orderbynulls[idx];
+	}
+
+	/* comprehensive checks of batching info */
+	AssertCheckBatchInfo(scan);
+
+	return &scan->xs_heaptid;
+}
+
+/* ----------------
+ *		index_batch_prefetch - prefetch pages for TIDs in current batch
+ *
+ * The prefetch distance is increased gradually, similar to what we do for
+ * bitmap heap scans. We start from distance 0 (no prefetch), and then in each
+ * iteration increment the distance up to prefetchMaximum.
+ *
+ * The prefetch distance is reset (to 0) only on rescans, not between batches.
+ *
+ * It's possible to provide an IndexPrefetchCallback callback, to affect
+ * which items need to be prefetched. With prefetch_callback=NULL, all
+ * items are prefetched. With the callback provided, the item is prefetched
+ * iff the callback returns true.
+ *
+ * The "arg" argument is used to pass a state for the plan node invoking the
+ * function, and is then passed to the callback. This means the callback is
+ * specific to the plan state.
+ *
+ * XXX the prefetchMaximum depends on effective_io_concurrency, and also on
+ * tablespace options.
+ *
+ * XXX For accesses that change scan direction, we may do a lot of unnecessary
+ * prefetching (because we will re-issue prefetches for what we recently read).
+ * I'm not sure if there's a simple way to track what was already prefetched.
+ * Maybe we could count how far we got (in the forward direction), keep that
+ * as a watermark, and never prefetch again below it.
+ *
+ * XXX Maybe wrap this in ifdef USE_PREFETCH?
+ * ----------------
+ */
+static void
+index_batch_prefetch(IndexScanDesc scan, ScanDirection direction)
+{
+	int			prefetchStart,
+				prefetchEnd;
+
+	IndexPrefetchCallback	prefetch_callback = scan->xs_batch->prefetchCallback;
+	void *arg = scan->xs_batch->prefetchArgument;
+
+	if (ScanDirectionIsForward(direction))
+	{
+		/* Where should we start to prefetch? */
+		prefetchStart = Max(scan->xs_batch->currIndex,
+							scan->xs_batch->prefetchIndex);
+
+		/*
+		 * Where should we stop prefetching? This is the first item we do NOT
+		 * prefetch. The clamp below caps it at nheaptids - 1 (floor is 0).
+		 */
+		prefetchEnd = Min((scan->xs_batch->currIndex + 1) + scan->xs_batch->prefetchTarget,
+						  scan->xs_batch->nheaptids);
+
+		/* FIXME should calculate in a way to make this unnecessary */
+		prefetchStart = Max(Min(prefetchStart, scan->xs_batch->nheaptids - 1), 0);
+		prefetchEnd = Max(Min(prefetchEnd, scan->xs_batch->nheaptids - 1), 0);
+
+		/* remember how far we prefetched / where to start the next prefetch */
+		scan->xs_batch->prefetchIndex = prefetchEnd;
+	}
+	else
+	{
+		/* Upper end of the prefetch range (exclusive in the loop below). */
+		prefetchEnd = Min(scan->xs_batch->currIndex,
+						  scan->xs_batch->prefetchIndex);
+
+		/*
+		 * Lower end of the prefetch range. The loop below treats it as
+		 * inclusive, and the clamping that follows forces it to be >= 0.
+		 */
+		prefetchStart = Max((scan->xs_batch->currIndex - 1) - scan->xs_batch->prefetchTarget,
+							-1);
+
+		/* FIXME should calculate in a way to make this unnecessary */
+		prefetchStart = Max(Min(prefetchStart, scan->xs_batch->nheaptids - 1), 0);
+		prefetchEnd = Max(Min(prefetchEnd, scan->xs_batch->nheaptids - 1), 0);
+
+		/* remember how far we prefetched / where to start the next prefetch */
+		scan->xs_batch->prefetchIndex = prefetchStart;
+	}
+
+	/*
+	 * It's possible we get inverted prefetch range after a restrpos() call,
+	 * because we intentionally don't reset the prefetchIndex - we don't want
+	 * to prefetch pages over and over in this case. We'll do nothing in that
+	 * case, except for the AssertCheckBatchInfo().
+	 *
+	 * FIXME I suspect this actually does not work correctly if we change the
+	 * direction, because the prefetchIndex will flip between two extremes
+	 * thanks to the Min/Max.
+	 */
+
+	/*
+	 * Increase the prefetch distance, but not beyond prefetchMaximum. We
+	 * intentionally do this after calculating start/end, so that we start
+	 * actually prefetching only after the first item.
+	 */
+	scan->xs_batch->prefetchTarget = Min(scan->xs_batch->prefetchTarget + 1,
+										 scan->xs_batch->prefetchMaximum);
+
+	/* comprehensive checks of batching info */
+	AssertCheckBatchInfo(scan);
+
+	/* finally, do the actual prefetching of [prefetchStart, prefetchEnd) */
+	for (int i = prefetchStart; i < prefetchEnd; i++)
+	{
+		/* skip block if the provided callback says so */
+		if (prefetch_callback && !prefetch_callback(scan, arg, i))
+			continue;
+
+		PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM,
+					   ItemPointerGetBlockNumber(&scan->xs_batch->heaptids[i]));
+	}
+}
+
+/*
+ * index_batch_init
+ *		Allocate and initialize the batching state (scan->xs_batch) of a scan.
+ *
+ * FIXME This is a bit ad-hoc hodge podge, due to how I was adding more and
+ * more pieces. Some of the fields may be not quite necessary, needs cleanup.
+ */
+static void
+index_batch_init(IndexScanDesc scan)
+{
+	/* batching requires an index AM that implements amgetbatch */
+	Assert(scan->indexRelation->rd_indam->amgetbatch != NULL);
+
+	scan->xs_batch = palloc0(sizeof(IndexScanBatchData));
+
+	/*
+	 * Set some reasonable batch size defaults.
+	 *
+	 * XXX Maybe should depend on prefetch distance, or something like that?
+	 * The initSize will affect how far ahead we can prefetch.
+	 */
+	scan->xs_batch->maxSize = 64;
+	scan->xs_batch->initSize = 8;
+	scan->xs_batch->currSize = scan->xs_batch->initSize;
+
+	/* prefetch distance starts at 0, capped by the tablespace setting */
+	scan->xs_batch->prefetchMaximum =
+		get_tablespace_io_concurrency(scan->heapRelation->rd_rel->reltablespace);
+	scan->xs_batch->prefetchTarget = 0;
+	scan->xs_batch->prefetchIndex = 0;
+
+	/* the batch is empty, so there is no current item yet */
+	scan->xs_batch->currIndex = -1;
+
+	/* Preallocate the largest allowed array of TIDs. */
+	scan->xs_batch->nheaptids = 0;
+	scan->xs_batch->heaptids = palloc0(sizeof(ItemPointerData) * scan->xs_batch->maxSize);
+
+	/*
+	 * XXX We can't check scan->xs_want_itup, because that's set only after
+	 * the scan is initialized (and we initialize in beginscan). Maybe we
+	 * could (or should) allocate lazily.
+	 */
+	scan->xs_batch->itups = palloc(sizeof(IndexTuple) * scan->xs_batch->maxSize);
+	scan->xs_batch->htups = palloc(sizeof(HeapTuple) * scan->xs_batch->maxSize);
+
+	scan->xs_batch->recheck = palloc(sizeof(bool) * scan->xs_batch->maxSize);
+
+	/*
+	 * XXX Maybe use a more compact bitmap? We need just one bit per element,
+	 * not a bool. This is easier / more convenient to manipulate, though.
+	 *
+	 * XXX Maybe should allow more items than the max batch size?
+	 */
+	scan->xs_batch->nKilledItems = 0;
+	scan->xs_batch->killedItems = (int *) palloc0(sizeof(int) * scan->xs_batch->maxSize);
+
+	/*
+	 * XXX Maybe allocate only when actually needed? Also, shouldn't we have a
+	 * memory context for the private data?
+	 */
+	scan->xs_batch->privateData = (Datum *) palloc0(sizeof(Datum) * scan->xs_batch->maxSize);
+
+	if (scan->numberOfOrderBys > 0)
+	{
+		int			cnt = (scan->xs_batch->maxSize * scan->numberOfOrderBys);
+
+		scan->xs_batch->orderbyvals = (Datum *) palloc0(sizeof(Datum) * cnt);
+		scan->xs_batch->orderbynulls = (bool *) palloc0(sizeof(bool) * cnt);
+	}
+	else
+	{
+		scan->xs_batch->orderbyvals = NULL;
+		scan->xs_batch->orderbynulls = NULL;
+	}
+
+	scan->xs_batch->ctx = AllocSetContextCreate(CurrentMemoryContext,
+												"indexscan batch context",
+												ALLOCSET_DEFAULT_SIZES);
+
+	/* comprehensive checks */
+	AssertCheckBatchInfo(scan);
+}
+
+/*
+ * index_batch_reset
+ *		Empty the batch so that the next chunk of items can be loaded into it.
+ *
+ * FIXME Needs cleanup: the -1 currIndex default is wrong for backward scans.
+ */
+static void
+index_batch_reset(IndexScanDesc scan)
+{
+	IndexScanBatchData *batch = scan->xs_batch;
+
+	if (batch == NULL)
+		return;					/* batching not enabled for this scan */
+
+	batch->currIndex = -1;
+	batch->prefetchIndex = 0;
+	batch->nheaptids = 0;
+}
+
+/*
+ * index_batch_add
+ *		Append one item to the current batch.
+ *
+ * The TID and recheck flag are always stored, the index/heap tuple pointers
+ * only when the scan wants index tuples (IOS). Slots fill up from index 0.
+ *
+ * Returns true if the item was stored, or false (nothing stored) when the
+ * batch is full and the item has to go into the next batch.
+ */
+bool
+index_batch_add(IndexScanDesc scan, ItemPointerData tid, bool recheck,
+				IndexTuple itup, HeapTuple htup)
+{
+	IndexScanBatchData *batch = scan->xs_batch;
+	int			pos;
+
+	/* comprehensive checks on the batch info */
+	AssertCheckBatchInfo(scan);
+
+	/* refuse the item once the batch has reached its current size */
+	if (INDEX_BATCH_IS_FULL(scan))
+		return false;
+
+	/* XXX The first assert seems redundant with INDEX_BATCH_IS_FULL. */
+	pos = batch->nheaptids;
+	Assert(pos < batch->currSize);
+	Assert(pos >= 0);
+
+	batch->heaptids[pos] = tid;
+	batch->recheck[pos] = recheck;
+	batch->privateData[pos] = (Datum) 0;
+
+	/* tuple pointers are kept only when index tuples were requested */
+	if (scan->xs_want_itup)
+	{
+		batch->itups[pos] = itup;
+		batch->htups[pos] = htup;
+	}
+
+	if (scan->numberOfOrderBys > 0)
+	{
+		int			first = pos * scan->numberOfOrderBys;
+
+		memcpy(&batch->orderbyvals[first], scan->xs_orderbyvals, sizeof(Datum) * scan->numberOfOrderBys);
+		memcpy(&batch->orderbynulls[first], scan->xs_orderbynulls, sizeof(bool) * scan->numberOfOrderBys);
+	}
+
+	batch->nheaptids = pos + 1;
+
+	/* comprehensive checks on the batch info */
+	AssertCheckBatchInfo(scan);
+
+	return true;
+}
diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c
index f9a2fac79e4..742a963bc29 100644
--- a/src/backend/executor/execIndexing.c
+++ b/src/backend/executor/execIndexing.c
@@ -809,7 +809,12 @@ check_exclusion_or_unique_constraint(Relation heap, Relation index,
 retry:
 	conflict = false;
 	found_self = false;
-	index_scan = index_beginscan(heap, index, &DirtySnapshot, indnkeyatts, 0);
+
+	/*
+	 * XXX Does not seem useful to do prefetching for checks of constraints.
+	 * We would probably need just the first item anyway.
+	 */
+	index_scan = index_beginscan(heap, index, &DirtySnapshot, indnkeyatts, 0, false);
 	index_rescan(index_scan, scankeys, indnkeyatts, NULL, 0);
 
 	while (index_getnext_slot(index_scan, ForwardScanDirection, existing_slot))
diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c
index 54025c9f150..6be3744361d 100644
--- a/src/backend/executor/execReplication.c
+++ b/src/backend/executor/execReplication.c
@@ -244,8 +244,13 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid,
 	/* Build scan key. */
 	skey_attoff = build_replindex_scan_key(skey, rel, idxrel, searchslot);
 
-	/* Start an index scan. */
-	scan = index_beginscan(rel, idxrel, &snap, skey_attoff, 0);
+	/*
+	 * Start an index scan.
+	 *
+	 * XXX No prefetching for replication identity. We expect to find just one
+	 * row, so prefetching is pointless.
+	 */
+	scan = index_beginscan(rel, idxrel, &snap, skey_attoff, 0, false);
 
 retry:
 	found = false;
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 612c6738950..c030c0df6fe 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -49,7 +49,12 @@
 static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node);
 static void StoreIndexTuple(IndexOnlyScanState *node, TupleTableSlot *slot,
 							IndexTuple itup, TupleDesc itupdesc);
+static bool ios_prefetch_block(IndexScanDesc scan, void *data, int index);
 
+/* values stored by ios_prefetch_block in the batch private data */
+#define		IOS_UNKNOWN_VISIBILITY		0	/* XXX default value */
+#define		IOS_ALL_VISIBLE				1
+#define		IOS_NOT_ALL_VISIBLE			2
 
 /* ----------------------------------------------------------------
  *		IndexOnlyNext
@@ -93,15 +98,22 @@ IndexOnlyNext(IndexOnlyScanState *node)
 								   node->ioss_RelationDesc,
 								   estate->es_snapshot,
 								   node->ioss_NumScanKeys,
-								   node->ioss_NumOrderByKeys);
+								   node->ioss_NumOrderByKeys,
+								   node->ioss_CanBatch);
 
 		node->ioss_ScanDesc = scandesc;
 
-
 		/* Set it up for index-only scan */
 		node->ioss_ScanDesc->xs_want_itup = true;
 		node->ioss_VMBuffer = InvalidBuffer;
 
+		/* Also set the prefetch callback info, if batching enabled. */
+		if (scandesc->xs_batch != NULL)
+		{
+			scandesc->xs_batch->prefetchCallback = ios_prefetch_block;
+			scandesc->xs_batch->prefetchArgument = (void *) node;
+		}
+
 		/*
 		 * If no run-time keys to calculate or they are ready, go ahead and
 		 * pass the scankeys to the index AM.
@@ -119,10 +131,38 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	 */
 	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
 	{
+		bool		all_visible;
 		bool		tuple_from_heap = false;
 
 		CHECK_FOR_INTERRUPTS();
 
+		/* Determine whether the heap page of this TID is all-visible. */
+		if (scandesc->xs_batch == NULL)
+		{
+			all_visible = VM_ALL_VISIBLE(scandesc->heapRelation,
+						  ItemPointerGetBlockNumber(tid),
+						  &node->ioss_VMBuffer);
+		}
+		else
+		{
+			/* Is the index of the current item valid for the batch? */
+			Assert((scandesc->xs_batch->currIndex >= 0) &&
+				   (scandesc->xs_batch->currIndex < scandesc->xs_batch->nheaptids));
+
+			/*
+			 * Reuse the previously determined page visibility info, or
+			 * calculate it now. If we decided not to prefetch the block, the
+			 * page has to be all-visible.
+			 *
+			 * XXX It's a bit weird we use the visibility to decide if we
+			 * should skip prefetching the block, and then deduce the
+			 * visibility from that. Maybe we could/should have a more direct
+			 * way?
+			 */
+			all_visible = !ios_prefetch_block(scandesc, node,
+											  scandesc->xs_batch->currIndex);
+		}
+
 		/*
 		 * We can skip the heap fetch if the TID references a heap page on
 		 * which all tuples are known visible to everybody.  In any case,
@@ -157,16 +197,14 @@ IndexOnlyNext(IndexOnlyScanState *node)
 		 * It's worth going through this complexity to avoid needing to lock
 		 * the VM buffer, which could cause significant contention.
 		 */
-		if (!VM_ALL_VISIBLE(scandesc->heapRelation,
-							ItemPointerGetBlockNumber(tid),
-							&node->ioss_VMBuffer))
+		if (!all_visible)
 		{
 			/*
 			 * Rats, we have to visit the heap to check visibility.
 			 */
 			InstrCountTuples2(node, 1);
 			if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
-				continue;		/* no visible tuple, try next index entry */
+				continue;	/* no visible tuple, try next index entry */
 
 			ExecClearTuple(node->ioss_TableSlot);
 
@@ -574,6 +612,16 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
 	indexstate->recheckqual =
 		ExecInitQual(node->recheckqual, (PlanState *) indexstate);
 
+	/*
+	 * Can't do batching (and thus prefetching) when the plan requires mark
+	 * and restore. There's an issue with translating the mark/restore
+	 * positions between the batch in scan descriptor and the original
+	 * position recognized in the index AM.
+	 *
+	 * XXX Hopefully just a temporary limitation?
+	 */
+	indexstate->ioss_CanBatch = !(eflags & EXEC_FLAG_MARK);
+
 	/*
 	 * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
 	 * here.  This allows an index-advisor plugin to EXPLAIN a plan containing
@@ -735,12 +783,20 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node,
 								  estate->es_snapshot,
 								  piscan);
 	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, piscan);
+
+	/*
+	 * XXX do we actually want prefetching for parallel index scans? Maybe
+	 * not, but then we need to be careful not to call index_batch_getnext_tid
+	 * (which now can happen, because we'll call IndexOnlyNext even for
+	 * parallel plans).
+	 */
 	node->ioss_ScanDesc =
 		index_beginscan_parallel(node->ss.ss_currentRelation,
 								 node->ioss_RelationDesc,
 								 node->ioss_NumScanKeys,
 								 node->ioss_NumOrderByKeys,
-								 piscan);
+								 piscan,
+								 node->ioss_CanBatch);
 	node->ioss_ScanDesc->xs_want_itup = true;
 	node->ioss_VMBuffer = InvalidBuffer;
 
@@ -780,12 +836,15 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node,
 	ParallelIndexScanDesc piscan;
 
 	piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
+
+	/* XXX do we actually want prefetching for parallel index scans? */
 	node->ioss_ScanDesc =
 		index_beginscan_parallel(node->ss.ss_currentRelation,
 								 node->ioss_RelationDesc,
 								 node->ioss_NumScanKeys,
 								 node->ioss_NumOrderByKeys,
-								 piscan);
+								 piscan,
+								 node->ioss_CanBatch);
 	node->ioss_ScanDesc->xs_want_itup = true;
 
 	/*
@@ -797,3 +856,34 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node,
 					 node->ioss_ScanKeys, node->ioss_NumScanKeys,
 					 node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);
 }
+
+/*
+ * ios_prefetch_block
+ *		Prefetch callback: report whether the heap block of a batch item
+ *		should be prefetched, which is the case when it is not all-visible.
+ *
+ * The VM_ALL_VISIBLE result is cached in the batch private data, so that the
+ * visibility map is not inspected repeatedly (0 means "not determined yet").
+ */
+static bool
+ios_prefetch_block(IndexScanDesc scan, void *arg, int index)
+{
+	IndexOnlyScanState *node = (IndexOnlyScanState *) arg;
+	Datum	   *cached = &scan->xs_batch->privateData[index];
+
+	/* consult the visibility map only if this item has no cached answer */
+	if (*cached == IOS_UNKNOWN_VISIBILITY)
+	{
+		BlockNumber blkno;
+
+		blkno = ItemPointerGetBlockNumber(&scan->xs_batch->heaptids[index]);
+
+		if (VM_ALL_VISIBLE(scan->heapRelation, blkno, &node->ioss_VMBuffer))
+			*cached = IOS_ALL_VISIBLE;
+		else
+			*cached = IOS_NOT_ALL_VISIBLE;
+	}
+
+	/* prefetch only blocks that are not all-visible */
+	return (*cached == IOS_NOT_ALL_VISIBLE);
+}
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index 8000feff4c9..8bbd3606566 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -110,7 +110,8 @@ IndexNext(IndexScanState *node)
 								   node->iss_RelationDesc,
 								   estate->es_snapshot,
 								   node->iss_NumScanKeys,
-								   node->iss_NumOrderByKeys);
+								   node->iss_NumOrderByKeys,
+								   node->iss_CanBatch);
 
 		node->iss_ScanDesc = scandesc;
 
@@ -200,12 +201,15 @@ IndexNextWithReorder(IndexScanState *node)
 		/*
 		 * We reach here if the index scan is not parallel, or if we're
 		 * serially executing an index scan that was planned to be parallel.
+		 *
+		 * XXX Should we use batching here? And can we with reordering?
 		 */
 		scandesc = index_beginscan(node->ss.ss_currentRelation,
 								   node->iss_RelationDesc,
 								   estate->es_snapshot,
 								   node->iss_NumScanKeys,
-								   node->iss_NumOrderByKeys);
+								   node->iss_NumOrderByKeys,
+								   false);
 
 		node->iss_ScanDesc = scandesc;
 
@@ -942,6 +946,20 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags)
 	indexstate->indexorderbyorig =
 		ExecInitExprList(node->indexorderbyorig, (PlanState *) indexstate);
 
+	/*
+	 * Can't do batching (and thus prefetching) when the plan requires mark
+	 * and restore. There's an issue with translating the mark/restore
+	 * positions between the batch in scan descriptor and the original
+	 * position recognized in the index AM.
+	 *
+	 * XXX Hopefully just a temporary limitation?
+	 *
+	 * XXX Maybe this should check if the index AM supports batching, or even
+	 * call something like "amcanbatch" (does not exist yet). Or check the
+	 * enable_indexscan_batching GUC? Now we check the GUC in index_beginscan.
+	 */
+	indexstate->iss_CanBatch = !(eflags & EXEC_FLAG_MARK);
+
 	/*
 	 * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
 	 * here.  This allows an index-advisor plugin to EXPLAIN a plan containing
@@ -1670,12 +1688,17 @@ ExecIndexScanInitializeDSM(IndexScanState *node,
 								  estate->es_snapshot,
 								  piscan);
 	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, piscan);
+
+	/*
+	 * XXX do we actually want prefetching for parallel index scans?
+	 */
 	node->iss_ScanDesc =
 		index_beginscan_parallel(node->ss.ss_currentRelation,
 								 node->iss_RelationDesc,
 								 node->iss_NumScanKeys,
 								 node->iss_NumOrderByKeys,
-								 piscan);
+								 piscan,
+								 node->iss_CanBatch);
 
 	/*
 	 * If no run-time keys to calculate or they are ready, go ahead and pass
@@ -1713,12 +1736,17 @@ ExecIndexScanInitializeWorker(IndexScanState *node,
 	ParallelIndexScanDesc piscan;
 
 	piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
+
+	/*
+	 * XXX do we actually want prefetching for parallel index scans?
+	 */
 	node->iss_ScanDesc =
 		index_beginscan_parallel(node->ss.ss_currentRelation,
 								 node->iss_RelationDesc,
 								 node->iss_NumScanKeys,
 								 node->iss_NumOrderByKeys,
-								 piscan);
+								 piscan,
+								 node->iss_CanBatch);
 
 	/*
 	 * If no run-time keys to calculate or they are ready, go ahead and pass
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 08fa6774d9c..02ea0eea149 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -6343,9 +6343,14 @@ get_actual_variable_endpoint(Relation heapRel,
 	InitNonVacuumableSnapshot(SnapshotNonVacuumable,
 							  GlobalVisTestFor(heapRel));
 
+	/*
+	 * XXX I'm not sure about batching/prefetching here. In most cases we
+	 * expect to find the endpoints immediately, but sometimes we have a lot
+	 * of dead tuples - and then prefetching might help.
+	 */
 	index_scan = index_beginscan(heapRel, indexRel,
 								 &SnapshotNonVacuumable,
-								 1, 0);
+								 1, 0, false);
 	/* Set it up for index-only scan */
 	index_scan->xs_want_itup = true;
 	index_rescan(index_scan, scankeys, 1, NULL, 0);
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 8a67f01200c..96806872613 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -789,6 +789,16 @@ struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
+	{
+		{"enable_indexscan_batching", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Enables the planner's use of index-scan batching."),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&enable_indexscan_batching,
+		true,
+		NULL, NULL, NULL
+	},
 	{
 		{"enable_indexonlyscan", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enables the planner's use of index-only-scan plans."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 39a3ac23127..20a1af47db2 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -399,6 +399,7 @@
 #enable_hashjoin = on
 #enable_incremental_sort = on
 #enable_indexscan = on
+#enable_indexscan_batching = on
 #enable_indexonlyscan = on
 #enable_material = on
 #enable_memoize = on
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index c51de742ea0..966a25d9ba3 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -184,6 +184,10 @@ typedef void (*amrescan_function) (IndexScanDesc scan,
 typedef bool (*amgettuple_function) (IndexScanDesc scan,
 									 ScanDirection direction);
 
+/* next batch of valid tuples */
+typedef bool (*amgetbatch_function) (IndexScanDesc scan,
+									 ScanDirection direction);
+
 /* fetch all valid tuples */
 typedef int64 (*amgetbitmap_function) (IndexScanDesc scan,
 									   TIDBitmap *tbm);
@@ -288,6 +292,7 @@ typedef struct IndexAmRoutine
 	ambeginscan_function ambeginscan;
 	amrescan_function amrescan;
 	amgettuple_function amgettuple; /* can be NULL */
+	amgetbatch_function amgetbatch; /* can be NULL */
 	amgetbitmap_function amgetbitmap;	/* can be NULL */
 	amendscan_function amendscan;
 	ammarkpos_function ammarkpos;	/* can be NULL */
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index c25f5d11b53..1d9a0868a9b 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -14,6 +14,7 @@
 #ifndef GENAM_H
 #define GENAM_H
 
+#include "access/itup.h"
 #include "access/sdir.h"
 #include "access/skey.h"
 #include "nodes/tidbitmap.h"
@@ -88,6 +89,7 @@ typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr, void *state);
 
 /* struct definitions appear in relscan.h */
 typedef struct IndexScanDescData *IndexScanDesc;
+typedef struct IndexScanBatchData *IndexScanBatch;
 typedef struct SysScanDescData *SysScanDesc;
 
 typedef struct ParallelIndexScanDescData *ParallelIndexScanDesc;
@@ -132,6 +134,8 @@ typedef struct IndexOrderByDistance
  * generalized index_ interface routines (in indexam.c)
  */
 
+extern PGDLLIMPORT bool enable_indexscan_batching;
+
 /*
  * IndexScanIsValid
  *		True iff the index scan is valid.
@@ -155,7 +159,8 @@ extern void index_insert_cleanup(Relation indexRelation,
 extern IndexScanDesc index_beginscan(Relation heapRelation,
 									 Relation indexRelation,
 									 Snapshot snapshot,
-									 int nkeys, int norderbys);
+									 int nkeys, int norderbys,
+									 bool enable_batching);
 extern IndexScanDesc index_beginscan_bitmap(Relation indexRelation,
 											Snapshot snapshot,
 											int nkeys);
@@ -173,7 +178,8 @@ extern void index_parallelscan_initialize(Relation heapRelation,
 extern void index_parallelrescan(IndexScanDesc scan);
 extern IndexScanDesc index_beginscan_parallel(Relation heaprel,
 											  Relation indexrel, int nkeys, int norderbys,
-											  ParallelIndexScanDesc pscan);
+											  ParallelIndexScanDesc pscan,
+											  bool enable_batching);
 extern ItemPointer index_getnext_tid(IndexScanDesc scan,
 									 ScanDirection direction);
 struct TupleTableSlot;
@@ -182,6 +188,10 @@ extern bool index_getnext_slot(IndexScanDesc scan, ScanDirection direction,
 							   struct TupleTableSlot *slot);
 extern int64 index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap);
 
+/* index batching/prefetching */
+extern bool index_batch_add(IndexScanDesc scan, ItemPointerData tid, bool recheck,
+							IndexTuple itup, HeapTuple htup);
+
 extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info,
 												IndexBulkDeleteResult *istat,
 												IndexBulkDeleteCallback callback,
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index e1884acf493..8fd2da8514d 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -129,6 +129,9 @@ typedef struct IndexFetchTableData
 	Relation	rel;
 } IndexFetchTableData;
 
+/* Forward declaration, the prefetch callback needs IndexScanDescData. */
+typedef struct IndexScanBatchData IndexScanBatchData;
+
 /*
  * We use the same IndexScanDescData structure for both amgettuple-based
  * and amgetbitmap-based index scans.  Some fields are only relevant in
@@ -174,6 +177,9 @@ typedef struct IndexScanDescData
 
 	bool		xs_recheck;		/* T means scan keys must be rechecked */
 
+	/* Information used by batched index scans. */
+	IndexScanBatchData *xs_batch;
+
 	/*
 	 * When fetching with an ordering operator, the values of the ORDER BY
 	 * expressions of the last returned tuple, according to the index.  If
@@ -189,6 +195,64 @@ typedef struct IndexScanDescData
 	struct ParallelIndexScanDescData *parallel_scan;
 }			IndexScanDescData;
 
+/*
+ * Typedef for callback function to determine if an item in index scan should
+ * be prefetched.
+ */
+typedef bool (*IndexPrefetchCallback) (IndexScanDescData *scan,
+									   void *arg, int index);
+
+/*
+ * Data about the current TID batch returned by the index AM.
+ *
+ * XXX Maybe this should be a separate struct instead, and the scan
+ * descriptor would have only a pointer, initialized only when the
+ * batching is actually used?
+ *
+ * XXX It's not quite clear which part of this is managed by indexam and
+ * what's up to the actual index AM implementation. Needs some clearer
+ * boundaries.
+ *
+ * XXX Should we have a pointer for optional state managed by the AM? Some
+ * custom AMs may need more per-batch information, not just the fields we
+ * have here.
+ */
+typedef struct IndexScanBatchData
+{
+	/* batch size - maximum, initial, current (with ramp up) */
+	int			maxSize;		/* capacity of the per-item arrays below */
+	int			initSize;		/* starting value of currSize */
+	int			currSize;		/* items accepted into a batch right now */
+
+	/* memory context for per-batch data */
+	MemoryContext ctx;
+
+	/* batch prefetching */
+	int			prefetchTarget; /* current prefetch distance */
+	int			prefetchMaximum;	/* maximum prefetch distance */
+	int			prefetchIndex;	/* next item to prefetch */
+
+	IndexPrefetchCallback	prefetchCallback;	/* optional filter, may be NULL */
+	void				   *prefetchArgument;	/* passed to the callback */
+
+	/* batch contents (TIDs, tuples, recheck flags, private data) */
+	int			currIndex;		/* index of the current item (-1 = none yet) */
+	int			nheaptids;		/* number of TIDs in the batch */
+	ItemPointerData *heaptids;	/* TIDs in the batch */
+	IndexTuple *itups;			/* IndexTuples, if requested */
+	HeapTuple  *htups;			/* HeapTuples, if requested */
+	bool	   *recheck;		/* recheck flags */
+	Datum	   *privateData;	/* per-item private data, zeroed on add */
+
+	/* per-item copies of xs_orderbyvals / xs_orderbynulls */
+	Datum	   *orderbyvals;
+	bool	   *orderbynulls;
+
+	/* list of killed items */
+	int			nKilledItems;	/* number of killedItems elements */
+	int		   *killedItems;	/* list of indexes to kill */
+} IndexScanBatchData;
+
 /* Generic structure for parallel scans */
 typedef struct ParallelIndexScanDescData
 {
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 182a6956bb0..e85f03cd0c1 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1654,6 +1654,7 @@ typedef struct
  *		OrderByTypByVals   is the datatype of order by expression pass-by-value?
  *		OrderByTypLens	   typlens of the datatypes of order by expressions
  *		PscanLen		   size of parallel index scan descriptor
+ *		CanBatch		   batching (and prefetching) enabled
  * ----------------
  */
 typedef struct IndexScanState
@@ -1681,6 +1682,10 @@ typedef struct IndexScanState
 	bool	   *iss_OrderByTypByVals;
 	int16	   *iss_OrderByTypLens;
 	Size		iss_PscanLen;
+
+	/* batching/prefetching enabled? */
+	bool		iss_CanBatch;
+
 } IndexScanState;
 
 /* ----------------
@@ -1702,6 +1707,7 @@ typedef struct IndexScanState
  *		PscanLen		   size of parallel index-only scan descriptor
  *		NameCStringAttNums attnums of name typed columns to pad to NAMEDATALEN
  *		NameCStringCount   number of elements in the NameCStringAttNums array
+ *		CanBatch		   batching (and prefetching) enabled
  * ----------------
  */
 typedef struct IndexOnlyScanState
@@ -1723,6 +1729,7 @@ typedef struct IndexOnlyScanState
 	Size		ioss_PscanLen;
 	AttrNumber *ioss_NameCStringAttNums;
 	int			ioss_NameCStringCount;
+	bool		ioss_CanBatch;
 } IndexOnlyScanState;
 
 /* ----------------
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index fad7fc3a7e0..14b38ed4d46 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -157,6 +157,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_incremental_sort        | on
  enable_indexonlyscan           | on
  enable_indexscan               | on
+ enable_indexscan_batching      | on
  enable_material                | on
  enable_memoize                 | on
  enable_mergejoin               | on
@@ -170,7 +171,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(22 rows)
+(23 rows)
 
 -- There are always wait event descriptions for various types.  InjectionPoint
 -- may be present or absent, depending on history since last postmaster start.
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 1847bbfa95c..6378e182238 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1220,6 +1220,7 @@ IndexOrderByDistance
 IndexPath
 IndexRuntimeKeyInfo
 IndexScan
+IndexScanBatchData
 IndexScanDesc
 IndexScanState
 IndexStateFlagsAction
@@ -3286,6 +3287,7 @@ amendscan_function
 amestimateparallelscan_function
 amgetbitmap_function
 amgettuple_function
+amgetbatch_function
 aminitparallelscan_function
 aminsert_function
 aminsertcleanup_function
-- 
2.47.0

