From f34b81e33b173a112bc60ad38b39ce3d5672c255 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Wed, 28 Aug 2024 18:21:27 +0200
Subject: [PATCH v20240831] WIP: index batching / prefetching

---
 src/backend/access/index/indexam.c            | 664 ++++++++++++++++++
 src/backend/access/nbtree/nbtree.c            |  56 ++
 src/backend/access/nbtree/nbtsearch.c         | 363 ++++++++++
 src/backend/executor/nodeIndexonlyscan.c      | 496 ++++++++++---
 src/backend/executor/nodeIndexscan.c          | 111 ++-
 src/backend/utils/misc/guc_tables.c           |  10 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/access/amapi.h                    |   5 +
 src/include/access/genam.h                    |  24 +
 src/include/access/nbtree.h                   |   4 +
 src/include/access/relscan.h                  |  40 ++
 src/include/nodes/execnodes.h                 |   7 +
 src/test/regress/expected/sysviews.out        |   3 +-
 13 files changed, 1662 insertions(+), 122 deletions(-)

diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index dcd04b813d8..7c2d7d24b13 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -33,6 +33,14 @@
  *		index_can_return	- does index support index-only scans?
  *		index_getprocid - get a support procedure OID
  *		index_getprocinfo - get a support procedure's lookup info
+ *		index_batch_getnext	- get the next batch of TIDs from a scan
+ *		index_batch_getnext_tid	- get the next TIDs from a batch
+ *		index_batch_getnext_slot	- get the next tuple from a batch
+ *		index_batch_prefetch	- prefetch heap pages for a batch
+ *		index_batch_supported	- does the AM support/allow batching?
+ *		index_batch_init	- initialize the TID batch arrays
+ *		index_batch_reset	- reset the TID batch (before next batch)
+ *		index_batch_add		- add an item (TID, itup) to the batch
  *
  * NOTES
  *		This file contains the index_ routines which used
@@ -44,6 +52,7 @@
 #include "postgres.h"
 
 #include "access/amapi.h"
+#include "access/nbtree.h"		/* XXX MaxTIDsPerBTreePage */
 #include "access/relation.h"
 #include "access/reloptions.h"
 #include "access/relscan.h"
@@ -56,8 +65,11 @@
 #include "storage/predicate.h"
 #include "utils/ruleutils.h"
 #include "utils/snapmgr.h"
+#include "utils/spccache.h"
 #include "utils/syscache.h"
 
+/* enable reading batches of TIDs from the index */
+bool		enable_indexscan_batching = false;
 
 /* ----------------------------------------------------------------
  *					macros used in index_ routines
@@ -333,6 +345,12 @@ index_beginscan_internal(Relation indexRelation,
 	scan->parallel_scan = pscan;
 	scan->xs_temp_snap = temp_snap;
 
+	/* No batching unless explicitly enabled, set everything to NULL. */
+	scan->xs_batch.heaptids = NULL;
+	scan->xs_batch.itups = NULL;
+	scan->xs_batch.privateData = NULL;
+	scan->xs_batch.killedItems = NULL;
+
 	return scan;
 }
 
@@ -368,6 +386,13 @@ index_rescan(IndexScanDesc scan,
 
 	scan->indexRelation->rd_indam->amrescan(scan, keys, nkeys,
 											orderbys, norderbys);
+
+	/*
+	 * Reset the batch, to make it look empty. This needs to happen after the
+	 * amrestrpos() call, in case the AM needs some of the batch info (e.g. to
+	 * properly transfer the killed tuples).
+	 */
+	index_batch_reset(scan, ForwardScanDirection);
 }
 
 /* ----------------
@@ -444,6 +469,13 @@ index_restrpos(IndexScanDesc scan)
 	scan->xs_heap_continue = false;
 
 	scan->indexRelation->rd_indam->amrestrpos(scan);
+
+	/*
+	 * Reset the batch, to make it look empty. This needs to happen after the
+	 * amrestrpos() call, in case the AM needs some of the batch info (e.g. to
+	 * properly transfer the killed tuples).
+	 */
+	index_batch_reset(scan, ForwardScanDirection);
 }
 
 /*
@@ -1037,3 +1069,635 @@ index_opclass_options(Relation indrel, AttrNumber attnum, Datum attoptions,
 
 	return build_local_reloptions(&relopts, attoptions, validate);
 }
+
+/*
+ * INDEX BATCHING AND PREFETCHING
+ *
+ * Allows reading chunks of items from an index, instead of reading them
+ * one by one. This reduces the overhead of accessing index pages, and
+ * also allows acting on "future" TIDs - e.g. we can prefetch heap pages
+ * that will be needed, etc.
+ *
+ *
+ * index AM contract
+ * -----------------
+ *
+ * This requires some level of cooperation from the index AM - the index
+ * needs to implement an optional callbacl amgettuplebatch() which fills
+ * data into the batch (in the scan descriptor).
+ *
+ * The index AM also needs to ensure it can perform all optimizations for
+ * all TIDs in the current batch. A good example of is the kill_prior_tuple
+ * optimization - with batching, the index AM may receive the information
+ * which tuples are to be killed with a delay - when loading the next
+ * batch, when ending/restarting the scan, etc. The AM needs to ensure it
+ * can still process such information.
+ *
+ * What this means/requires is very dependent on the index AM, of course.
+ * For B-Tree (and most other index AMs), batches spanning multiple leaf
+ * pages would be problematic. Such batches would work for basic index
+ * scans, but the kill_prior_tuple would be an issue - the AMs keep only
+ * a single leaf pinned. We'd either need to keep multiple pins, or allow
+ * reading older leaf pages pages (which might have been modified). Index
+ * only-scans is challenging too - we keep IndexTuple pointers into the
+ * leaf pages, which requires keeping those pins too.
+ *
+ * To solve this, we give the AM the control over batch boundaries. It is
+ * up to the index AM to pick which range of index items to load into the
+ * batch, and how to ensure all the optimizations are possible.
+ *
+ * For most index AMs the easiest way is to not load batches spanning
+ * multiple leaf pages. This may impact the efficiency, especially for
+ * indexes with wide index tuples, but those cases are rare.
+ *
+ * The alternative would be to make the index AMs more complex, to keep
+ * more leaf pages pinned, etc. But that does not seem like a good trade
+ * off due to "diminishing returns" behavior - we see significant gains
+ * initially (with even small batches), but as the batch grows the gains
+ * get smaller and smaller. It does not seem worth the complexity of
+ * pinning more pages etc.
+ *
+ *
+ * batch = sliding window
+ * ----------------------
+ *
+ * A good way to visualize the batch is a sliding window over the array
+ * of items on a leaf page. In the simplest example (forward scan with no
+ * changes of direction), we slice the array into smaller chunks, and
+ * then process each of those chunks.
+ *
+ * The batch size is adaptive - it starts small (only 8 elements) and
+ * increases as we read more batches (up to 64 elements). We don't want
+ * to regress cases that only need a single item (e.g. LIMIT 1 queries),
+ * and loading/copying a lot of data might cause that. So we start small
+ * and increase the size - that still improves cases reading a lot of
+ * data from the index, without hurting small queries.
+ *
+ * Note: This gradual ramp up is for batch size, independent of what we
+ * do for prefetch. The prefetch distance is gradually increased too, but
+ * it's independent / orthogonal to the batch size. The batch size limits
+ * how far ahead we can prefetch, of course.
+ *
+ * Note: The current limits on batch size (initial 8, maximum 64) are
+ * quite arbitrary, it just seemed those values are sane. We could adjust
+ * the initial size, but I don't think it'd make a fundamental difference.
+ * Growing the batches faster/slower has bigger impact.
+ *
+ * The maximum batch size does not matter much - it's true a btree index can
+ * have up to ~1300 items per 8K leaf page, but in most cases the actual
+ * number is lower, perhaps ~300. That's not that far from 64.
+ *
+ * Each batch has a firstIndex/lastIndex to track which part of the leaf
+ * page it currently represents.
+ *
+ *
+ * kill_prior_tuples
+ * -----------------
+ *
+ * If we decide a tuple can be killed, the batch item is marked accordingly,
+ * and the flag is reset to false (so that the index AM does not do something
+ * silly to a random tuple it thinks is "current").
+ *
+ * Then the next time the AM decides it's time to kill tuples, the AM needs
+ * to look at the batch and consider the tuples marked to be killed. B-Tree
+ * simply adds those TIDs to the regular "killItems" array.
+ *
+ *
+ * mark/restore limitation
+ * -----------------------
+ *
+ * At the moment, batching is incompatible with mark/restore - the index AM
+ * does not know what's the "current" item as seen by the user, because
+ * that's in the batch only. So the ammarkpos() has a bit misguided idea
+ * of what the current position is.
+ *
+ * For now, batching is simply disabled fro plans requiring mark/restore,
+ * which also disabes prefetching. In particular, this means mergejoin
+ * can't see this benefit.
+ *
+ * Note: It might be possible to fix this, if the AM could ask for current
+ * position in the batch. Now that we have the fistIndex/lastIndex, that
+ * should not be very difficult. Then markpos() would check if batching
+ * is used, and would use that current position instead of whatever it
+ * thinks is current (which is always to before/after the batch).
+ */
+
+/*
+ * Comprehensive check of various invariants on the index batch. Makes sure
+ * the indexes are set as expected, the buffer size is within limits, and
+ * so on.
+ */
+static void
+AssertCheckBatchInfo(IndexScanDesc scan)
+{
+#ifdef USE_ASSERT_CHECKING
+	/* all the arrays need to be allocated */
+	Assert((scan->xs_batch.heaptids != NULL) &&
+		   (scan->xs_batch.killedItems != NULL) &&
+		   (scan->xs_batch.privateData != NULL));
+
+	/* if IndexTuples expected, should be allocated too */
+	Assert(!(scan->xs_want_itup && (scan->xs_batch.itups == NULL)));
+
+	/* Various check on batch sizes */
+	Assert((scan->xs_batch.initSize >= 0) &&
+		   (scan->xs_batch.initSize <= scan->xs_batch.currSize) &&
+		   (scan->xs_batch.currSize <= scan->xs_batch.maxSize) &&
+		   (scan->xs_batch.maxSize <= 1024));	/* arbitrary limit */
+
+	/* Is the number of in the batch TIDs in a valid range? */
+	Assert((scan->xs_batch.nheaptids >= 0) &&
+		   (scan->xs_batch.nheaptids <= scan->xs_batch.maxSize));
+
+	/*
+	 * The current item must be between -1 and nheaptids. Those two extreme
+	 * values are starting points for forward/backward scans.
+	 */
+	Assert((scan->xs_batch.currIndex >= -1) &&
+		   (scan->xs_batch.currIndex <= scan->xs_batch.nheaptids));
+
+	/* check prefetch data */
+	Assert((scan->xs_batch.prefetchTarget >= 0) &&
+		   (scan->xs_batch.prefetchTarget <= scan->xs_batch.prefetchMaximum));
+
+	Assert((scan->xs_batch.prefetchIndex >= -1) &&
+		   (scan->xs_batch.prefetchIndex <= scan->xs_batch.nheaptids));
+
+	/*
+	 * XXX Not quite correct to use MaxTIDsPerBTreePage, which is btree
+	 * specific. Also we probably don't want to depend on AMs like this.
+	 */
+	Assert((scan->xs_batch.firstIndex >= -1) &&
+		   (scan->xs_batch.firstIndex <= MaxTIDsPerBTreePage));
+	Assert((scan->xs_batch.lastIndex >= -1) &&
+		   (scan->xs_batch.lastIndex <= MaxTIDsPerBTreePage));
+
+	for (int i = 0; i < scan->xs_batch.nheaptids; i++)
+		Assert(ItemPointerIsValid(&scan->xs_batch.heaptids[i]));
+#endif
+}
+
+/* Is the batch full (TIDs up to capacity)? */
+#define	INDEX_BATCH_IS_FULL(scan)	\
+	((scan)->xs_batch.nheaptids == (scan)->xs_batch.currSize)
+
+/* Is the batch empty (no TIDs)? */
+#define	INDEX_BATCH_IS_EMPTY(scan)	\
+	((scan)->xs_batch.nheaptids == 0)
+
+/*
+ * Did we process all items? For forward scan it means the index points to the
+ * last item, for backward scans it has to point to the first one.
+ *
+ * This does not cover empty batches properly, because of backward scans.
+ */
+#define	INDEX_BATCH_IS_PROCESSED(scan, direction)	\
+	(ScanDirectionIsForward(direction) ? \
+		((scan)->xs_batch.nheaptids == ((scan)->xs_batch.currIndex + 1)) : \
+		((scan)->xs_batch.currIndex == 0))
+
+/* Does the batch items in the requested direction? */
+#define INDEX_BATCH_HAS_ITEMS(scan, direction) \
+	(!INDEX_BATCH_IS_EMPTY(scan) && !INDEX_BATCH_IS_PROCESSED(scan, direction))
+
+
+/* ----------------
+ *		index_batch_getnext - get the next batch of TIDs from a scan
+ *
+ * Returns true if we managed to read at least some TIDs into the batch,
+ * or false if there are no more TIDs in the scan. The xs_heaptids and
+ * xs_nheaptids fields contain the TIDS and the number of elements.
+ *
+ * XXX This only loads the TIDs and resets the various batch fields to
+ * fresh state. It does not set xs_heaptid/xs_itup/xs_hitup, that's the
+ * responsibility of the following index_batch_getnext_tid() calls.
+ * ----------------
+ */
+bool
+index_batch_getnext(IndexScanDesc scan, ScanDirection direction)
+{
+	bool		found;
+
+	SCAN_CHECKS;
+	CHECK_SCAN_PROCEDURE(amgettuplebatch);
+
+	/* XXX: we should assert that a snapshot is pushed or registered */
+	Assert(TransactionIdIsValid(RecentXmin));
+
+	/* comprehensive checks of batching info */
+	AssertCheckBatchInfo(scan);
+
+	/*
+	 * We never read a new batch before we run out of items in the current
+	 * one. The current batch has to be either empty or we ran out of items
+	 * (in the given direction).
+	 */
+	Assert(!INDEX_BATCH_HAS_ITEMS(scan, direction));
+
+	/*
+	 * The AM's amgettuplebatch proc loads a chunk of TIDs matching the scan
+	 * keys, and puts the TIDs into scan->xs_batch.heaptids.  It should also
+	 * set scan->xs_recheck and possibly
+	 * scan->xs_batch.itups/scan->xs_batch.hitups, though we pay no attention
+	 * to those fields here.
+	 *
+	 * FIXME At the moment this does nothing with hitup. Needs to be fixed?
+	 */
+	found = scan->indexRelation->rd_indam->amgettuplebatch(scan, direction);
+
+	/* Reset kill flag immediately for safety */
+	scan->kill_prior_tuple = false;
+	scan->xs_heap_continue = false;
+
+	/* If we're out of index entries, we're done */
+	if (!found)
+	{
+		/* release resources (like buffer pins) from table accesses */
+		if (scan->xs_heapfetch)
+			table_index_fetch_reset(scan->xs_heapfetch);
+
+		return false;
+	}
+
+	/* We should have a non-empty batch with items. */
+	Assert(INDEX_BATCH_HAS_ITEMS(scan, direction));
+
+	pgstat_count_index_tuples(scan->indexRelation, scan->xs_batch.nheaptids);
+
+	/*
+	 * Set the prefetch index to the first item in the loaded batch (we expect
+	 * the index AM to set that).
+	 *
+	 * FIXME Maybe set the currIndex here, not in the index AM. It seems much
+	 * more like indexam.c responsibility rather than something every index AM
+	 * should be doing (in _bt_first_batch etc.).
+	 *
+	 * FIXME It's a bit unclear who (indexam.c or the index AM) is responsible
+	 * for setting which fields. This needs clarification.
+	 */
+	scan->xs_batch.prefetchIndex = scan->xs_batch.currIndex;
+
+	/* comprehensive checks of batching info */
+	AssertCheckBatchInfo(scan);
+
+	/* Return the batch of TIDs we found. */
+	return true;
+}
+
+/* ----------------
+ *		index_getnext_batch_tid - get the next TID from the current batch
+ *
+ * Same calling convention as index_getnext_tid(), except that NULL means
+ * no more items in the current batch, there may be more batches.
+ *
+ * XXX This only sets xs_heaptid and xs_itup (if requested). Not sure if
+ * we need to do something with xs_hitup.
+ *
+ * FIXME Should this set xs_hitup?
+ * ----------------
+ */
+ItemPointer
+index_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction)
+{
+	/* comprehensive checks of batching info */
+	AssertCheckBatchInfo(scan);
+
+	/*
+	 * Bail out if he batch does not have more items in the requested directio
+	 * (either empty or everthing processed).
+	 */
+	if (!INDEX_BATCH_HAS_ITEMS(scan, direction))
+		return NULL;
+
+	/*
+	 * Advance to the next batch item - we know it's not empty and there are
+	 * items to process, so this is valid.
+	 */
+	if (ScanDirectionIsForward(direction))
+		scan->xs_batch.currIndex++;
+	else
+		scan->xs_batch.currIndex--;
+
+	/* next TID from the batch, optionally also the IndexTuple */
+	scan->xs_heaptid = scan->xs_batch.heaptids[scan->xs_batch.currIndex];
+	if (scan->xs_want_itup)
+		scan->xs_itup = scan->xs_batch.itups[scan->xs_batch.currIndex];
+
+	/* comprehensive checks of batching info */
+	AssertCheckBatchInfo(scan);
+
+	return &scan->xs_heaptid;
+}
+
+/* ----------------
+ * index_getnext_batch_slot - get the next tuple from a scan batch
+ *
+ * Same calling convention as index_getnext_slot(), except that NULL means
+ * no more items only in the current batch, there may be more batches.
+ *
+ * XXX See index_getnext_slot comments.
+ * ----------------
+ */
+bool
+index_batch_getnext_slot(IndexScanDesc scan, ScanDirection direction,
+						 TupleTableSlot *slot)
+{
+	/* comprehensive checks of batching info */
+	AssertCheckBatchInfo(scan);
+
+	for (;;)
+	{
+		if (!scan->xs_heap_continue)
+		{
+			ItemPointer tid;
+
+			/* Time to fetch the next TID from the index */
+			tid = index_batch_getnext_tid(scan, direction);
+
+			/* If we're out of index entries, we're done */
+			if (tid == NULL)
+				break;
+
+			Assert(ItemPointerEquals(tid, &scan->xs_heaptid));
+		}
+
+		/*
+		 * Fetch the next (or only) visible heap tuple for this index entry.
+		 * If we don't find anything, loop around and grab the next TID from
+		 * the index.
+		 */
+		Assert(ItemPointerIsValid(&scan->xs_heaptid));
+		if (index_fetch_heap(scan, slot))
+		{
+			/* If we found a visible tuple, we shouldn't kill it. */
+			Assert(!scan->kill_prior_tuple);
+
+			/* comprehensive checks of batching info */
+			AssertCheckBatchInfo(scan);
+
+			return true;
+		}
+
+		/*
+		 * If we haven't found any visible tuple for the TID, chances are all
+		 * versions are dead and may kill it from the index. If so, flag it in
+		 * the kill bitmap - we'll translate it to indexes later.
+		 *
+		 * XXX With the firstIndex/lastIndex it would not be too hard to do
+		 * the translation here. But do we want to? How much is that
+		 * considered an internal detail of the AM?
+		 *
+		 * XXX Maybe we should integrate this into index_fetch_heap(), so that
+		 * we don't need to do it after each call? Seems easy to ferget/miss.
+		 */
+		if (scan->kill_prior_tuple)
+		{
+			/*
+			 * FIXME This is not great, because we'll have to walk through the
+			 * whole bitmap later, to maybe add killed tuples to the regular
+			 * array. Might be costly for large batches. Maybe it'd be better
+			 * to do what btree does and stash the indexes (just some limited
+			 * number).
+			 */
+			scan->xs_batch.killedItems[scan->xs_batch.currIndex] = true;
+			scan->kill_prior_tuple = false;
+		}
+	}
+
+	/* comprehensive checks of batching info */
+	AssertCheckBatchInfo(scan);
+
+	return false;
+}
+
+/* ----------------
+ *		index_getnext_batch_prefetch - prefetch pages for TIDs in current batch
+ *
+ * The prefetch distance is increased gradually, similar to what we do for
+ * bitmap heap scans. We start from distance 0 (no prefetch), and then in each
+ * iteration increment the distance up to prefetchMaximum.
+ *
+ * The prefetch distance is reset (to 0) only on rescans, not between batches.
+ *
+ * It's possible to provide an index_prefetch_callback callback, to affect
+ * which items need to be prefetched. With prefetch_callback=NULL, all
+ * items are prefetched. With the callback provided, the item is prefetched
+ * iff the callback and returns true.
+ *
+ * The "arg" argument is used to pass a state for the plan node invoking the
+ * function, and is then passed to the callback. This means the callback is
+ * specific to the plan state.
+ *
+ * XXX the prefetchMaximum depends on effective_io_concurrency, and also on
+ * tablespace options.
+ *
+ * XXX For accesses that change scan direction, we may do a lot of unnecessary
+ * prefetching (because we will re-issue prefetches for what we recently read).
+ * I'm not sure if there's a simple way to track what was already prefetched.
+ * Maybe we could count how far we got (in the forward direction), keep that
+ * as a watermark, and never prefetch again below it.
+ *
+ * XXX Maybe wrap this in ifdef USE_PREFETCH?
+ * ----------------
+ */
+void
+index_batch_prefetch(IndexScanDesc scan, ScanDirection direction,
+					 index_prefetch_callback prefetch_callback, void *arg)
+{
+	int			prefetchStart,
+				prefetchEnd;
+
+	if (ScanDirectionIsForward(direction))
+	{
+		/* Where should we start to prefetch? */
+		prefetchStart = Max(scan->xs_batch.currIndex,
+							scan->xs_batch.prefetchIndex);
+
+		/*
+		 * Where should we stop prefetching? this is the first item that we do
+		 * NOT prefetch, i.e. it can be the first item after the batch.
+		 */
+		prefetchEnd = Min((scan->xs_batch.currIndex + 1) + scan->xs_batch.prefetchTarget,
+						  scan->xs_batch.nheaptids);
+
+		/* FIXME should calculate in a way to make this unnecessary */
+		prefetchStart = Max(Min(prefetchStart, scan->xs_batch.nheaptids - 1), 0);
+		prefetchEnd = Max(Min(prefetchEnd, scan->xs_batch.nheaptids - 1), 0);
+
+		/* remember how far we prefetched / where to start the next prefetch */
+		scan->xs_batch.prefetchIndex = prefetchEnd;
+	}
+	else
+	{
+		/* Where should we start to prefetch? */
+		prefetchEnd = Min(scan->xs_batch.currIndex,
+						  scan->xs_batch.prefetchIndex);
+
+		/*
+		 * Where should we stop prefetching? this is the first item that we do
+		 * NOT prefetch, i.e. it can be the first item after the batch.
+		 */
+		prefetchStart = Max((scan->xs_batch.currIndex - 1) - scan->xs_batch.prefetchTarget,
+							-1);
+
+		/* FIXME should calculate in a way to make this unnecessary */
+		prefetchStart = Max(Min(prefetchStart, scan->xs_batch.nheaptids - 1), 0);
+		prefetchEnd = Max(Min(prefetchEnd, scan->xs_batch.nheaptids - 1), 0);
+
+		/* remember how far we prefetched / where to start the next prefetch */
+		scan->xs_batch.prefetchIndex = prefetchStart;
+	}
+
+	/* we shouldn't get inverted prefetch range */
+	Assert(prefetchStart <= prefetchEnd);
+
+	/*
+	 * Increase the prefetch distance, but not beyond prefetchMaximum. We
+	 * intentionally do this after calculating start/end, so that we start
+	 * actually prefetching only after the first item.
+	 */
+	scan->xs_batch.prefetchTarget = Min(scan->xs_batch.prefetchTarget + 1,
+										scan->xs_batch.prefetchMaximum);
+
+	/* comprehensive checks of batching info */
+	AssertCheckBatchInfo(scan);
+
+	/* finally, do the actual prefetching */
+	for (int i = prefetchStart; i < prefetchEnd; i++)
+	{
+		/* skip block if the provided callback says so */
+		if (prefetch_callback && !prefetch_callback(scan, direction, arg, i))
+			continue;
+
+		PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM,
+					   ItemPointerGetBlockNumber(&scan->xs_batch.heaptids[i]));
+	}
+}
+
+/*
+ * Does the access method support batching? We just check the index AM has
+ * amgettuplebatch() method.
+ */
+bool
+index_batch_supported(IndexScanDesc scan, ScanDirection direction)
+{
+	return (scan->indexRelation->rd_indam->amgettuplebatch != NULL);
+}
+
+/*
+ * index_batch_init
+ *		Initialize various fields / arrays needed by batching.
+ *
+ * FIXME This is a bit ad-hoc hodge podge, due to how I was adding more and
+ * more pieces. Some of the fields may be not quite necessary, needs cleanup.
+ */
+void
+index_batch_init(IndexScanDesc scan, ScanDirection direction)
+{
+	/* init batching info, but only if batch supported */
+	if (!index_batch_supported(scan, direction))
+		return;
+
+	scan->xs_batch.firstIndex = -1;
+	scan->xs_batch.lastIndex = -1;
+
+	/*
+	 * Set some reasonable batch size defaults.
+	 *
+	 * XXX Maybe should depend on prefetch distance, or something like that?
+	 * The initSize will affect how far ahead we can prefetch.
+	 */
+	scan->xs_batch.maxSize = 64;
+	scan->xs_batch.initSize = 8;
+	scan->xs_batch.currSize = scan->xs_batch.initSize;
+
+	/* initialize prefetching info */
+	scan->xs_batch.prefetchMaximum =
+		get_tablespace_io_concurrency(scan->heapRelation->rd_rel->reltablespace);
+	scan->xs_batch.prefetchTarget = 0;
+	scan->xs_batch.prefetchIndex = 0;
+
+	/* */
+	scan->xs_batch.currIndex = -1;
+
+	/* Preallocate the largest allowed array of TIDs. */
+	scan->xs_batch.nheaptids = 0;
+	scan->xs_batch.heaptids = palloc0(sizeof(ItemPointerData) * scan->xs_batch.maxSize);
+
+	if (scan->xs_want_itup)
+		scan->xs_batch.itups = palloc(sizeof(IndexTuple) * scan->xs_batch.maxSize);
+
+	/*
+	 * XXX Maybe use a more compact bitmap? We need just one bit per element,
+	 * not a bool. This is easier / more convenient to manipulate, though.
+	 */
+	scan->xs_batch.killedItems = (bool *) palloc0(sizeof(bool) * scan->xs_batch.maxSize);
+
+	/*
+	 * XXX Maybe allocate only when actually needed? Also, shouldn't we have a
+	 * memory context for the private data?
+	 */
+	scan->xs_batch.privateData = (Datum *) palloc0(sizeof(Datum) * scan->xs_batch.maxSize);
+
+	/* comprehensive checks */
+	AssertCheckBatchInfo(scan);
+}
+
+/*
+ * index_batch_reset
+ *		Reset the batch before reading the next chunk of data.
+ *
+ * FIXME Another bit in need of cleanup. The currIndex default (-1) is not quite
+ * correct, because for backwards scans is wrong.
+ */
+void
+index_batch_reset(IndexScanDesc scan, ScanDirection direction)
+{
+	/* maybe initialize */
+	scan->xs_batch.nheaptids = 0;
+	scan->xs_batch.prefetchIndex = 0;
+	scan->xs_batch.currIndex = -1;
+}
+
+/*
+ * index_batch_add
+ *		Add an item to the batch.
+ *
+ * The item is always a TID, and then also IndexTuple if requested (for IOS).
+ * Items are always added from the beginning (index 0).
+ *
+ * Returns true when adding the item was successful, or false when the batch
+ * is full (and the item should be added to the next batch).
+ */
+bool
+index_batch_add(IndexScanDesc scan, ItemPointerData tid, IndexTuple itup)
+{
+	/* comprehensive checks on the batch info */
+	AssertCheckBatchInfo(scan);
+
+	/* don't add TIDs beyond the current batch size */
+	if (INDEX_BATCH_IS_FULL(scan))
+		return false;
+
+	/*
+	 * There must be space for at least one entry.
+	 *
+	 * XXX Seems redundant with the earlier INDEX_BATCH_IS_FULL check.
+	 */
+	Assert(scan->xs_batch.nheaptids < scan->xs_batch.currSize);
+	Assert(scan->xs_batch.nheaptids >= 0);
+
+	scan->xs_batch.heaptids[scan->xs_batch.nheaptids] = tid;
+	scan->xs_batch.killedItems[scan->xs_batch.nheaptids] = false;
+	scan->xs_batch.privateData[scan->xs_batch.nheaptids] = (Datum) 0;
+
+	if (scan->xs_want_itup)
+		scan->xs_batch.itups[scan->xs_batch.nheaptids] = itup;
+
+	scan->xs_batch.nheaptids++;
+
+	/* comprehensive checks on the batch info */
+	AssertCheckBatchInfo(scan);
+
+	return true;
+}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 686a3206f72..53375de9c66 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -141,6 +141,7 @@ bthandler(PG_FUNCTION_ARGS)
 	amroutine->ambeginscan = btbeginscan;
 	amroutine->amrescan = btrescan;
 	amroutine->amgettuple = btgettuple;
+	amroutine->amgettuplebatch = btgettuplebatch;
 	amroutine->amgetbitmap = btgetbitmap;
 	amroutine->amendscan = btendscan;
 	amroutine->ammarkpos = btmarkpos;
@@ -259,6 +260,52 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
 	return res;
 }
 
+/*
+ *	btgettuplebatch() -- Get the next batch of tuples in the scan.
+ *
+ * XXX Pretty much like btgettuple(), but for batches of tuples.
+ */
+bool
+btgettuplebatch(IndexScanDesc scan, ScanDirection dir)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	bool		res;
+
+	/* btree indexes are never lossy */
+	scan->xs_recheck = false;
+
+	/* Each loop iteration performs another primitive index scan */
+	do
+	{
+		/*
+		 * If we've already initialized this scan, we can just advance it in
+		 * the appropriate direction.  If we haven't done so yet, we call
+		 * _bt_first() to get the first item in the scan.
+		 */
+		if (!BTScanPosIsValid(so->currPos))
+			res = _bt_first_batch(scan, dir);
+		else
+		{
+			/*
+			 * Check to see if we should kill tuples from the previous batch.
+			 */
+			_bt_kill_batch(scan);
+
+			/*
+			 * Now continue the scan.
+			 */
+			res = _bt_next_batch(scan, dir);
+		}
+
+		/* If we have a tuple, return it ... */
+		if (res)
+			break;
+		/* ... otherwise see if we need another primitive index scan */
+	} while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
+
+	return res;
+}
+
 /*
  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
  */
@@ -364,6 +411,9 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	/* we aren't holding any read locks, but gotta drop the pins */
 	if (BTScanPosIsValid(so->currPos))
 	{
+		/* Transfer killed items from the batch to the regular array. */
+		_bt_kill_batch(scan);
+
 		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
@@ -421,6 +471,9 @@ btendscan(IndexScanDesc scan)
 	/* we aren't holding any read locks, but gotta drop the pins */
 	if (BTScanPosIsValid(so->currPos))
 	{
+		/* Transfer killed items from the batch to the regular array. */
+		_bt_kill_batch(scan);
+
 		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
@@ -501,6 +554,9 @@ btrestrpos(IndexScanDesc scan)
 		 */
 		if (BTScanPosIsValid(so->currPos))
 		{
+			/* Transfer killed items from the batch to the regular array. */
+			_bt_kill_batch(scan);
+
 			/* Before leaving current page, deal with any killed items */
 			if (so->numKilled > 0)
 				_bt_killitems(scan);
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 2551df8a671..e7c3313696d 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1527,6 +1527,366 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
 	return true;
 }
 
+/*
+ *	_bt_first_batch() -- Find the first batch in a scan.
+ *
+ * A batch variant of _bt_first(). Most of the comments for that function
+ * apply here too.
+ *
+ * XXX This only populates the batch, it does not set any other fields like
+ * scan->xs_heaptid or scan->xs_itup. That happens in getnext_tid() calls.
+ *
+ * XXX I'm not sure it works to mix batched and non-batches calls, e.g. get
+ * a TID and then a batch of TIDs. It probably should work as long as we
+ * update itemIndex correctly, but we need to be careful about killed items
+ * (right now the two places use different ways to communicate which items
+ * should be killed).
+ */
+bool
+_bt_first_batch(IndexScanDesc scan, ScanDirection dir)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+	/* start a new batch */
+	index_batch_reset(scan, dir);
+
+	/*
+	 * Reset the batch size to the initial size.
+	 *
+	 * FIXME should be done in indexam.c probably, at the beginning of each
+	 * index rescan?
+	 */
+	scan->xs_batch.currSize = scan->xs_batch.initSize;
+
+	/* we haven't visited any leaf pages yet, so proceed to reading one */
+	if (_bt_first(scan, dir))
+	{
+		/* range of the leaf to copy into the batch */
+		int			start,
+					end;
+
+		/* determine which part of the leaf page to extract */
+		if (ScanDirectionIsForward(dir))
+		{
+			start = so->currPos.firstItem;
+			end = Min(start + (scan->xs_batch.currSize - 1), so->currPos.lastItem);
+			so->currPos.itemIndex = (end + 1);
+		}
+		else
+		{
+			end = so->currPos.lastItem;
+			start = Max(end - (scan->xs_batch.currSize - 1),
+						so->currPos.firstItem);
+			so->currPos.itemIndex = (start - 1);
+		}
+
+		/*
+		 * We're reading the first batch, and there should always be at least
+		 * one item (otherwise _bt_first would return false). So we should
+		 * never get into situation with empty start/end range. In the worst
+		 * case, there is just a single item, in which case (start == end).
+		 */
+		Assert(start <= end);
+
+		scan->xs_batch.firstIndex = start;
+		scan->xs_batch.lastIndex = end;
+
+		/* The range of items should fit into the current batch size. */
+		Assert((end - start + 1) <= scan->xs_batch.currSize);
+
+		/* should be valid items (with respect to the leaf page) */
+		Assert(so->currPos.firstItem <= scan->xs_batch.firstIndex);
+		Assert(scan->xs_batch.firstIndex <= scan->xs_batch.lastIndex);
+		Assert(scan->xs_batch.lastIndex <= so->currPos.lastItem);
+
+		/*
+		 * Walk through the range of index tuples, copy them into the batch.
+		 * If requested, set the index tuple too.
+		 *
+		 * We don't know if the batch is full already - we just try to add it,
+		 * and bail out if it fails.
+		 *
+		 * FIXME This seems wrong, actually. We use currSize when calculating
+		 * the start/end range, so the add should always succeed.
+		 */
+		while (start <= end)
+		{
+			BTScanPosItem *currItem = &so->currPos.items[start];
+			IndexTuple	itup = NULL;
+
+			if (scan->xs_want_itup)
+				itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+			/* try to add it to batch, if there's space */
+			if (!index_batch_add(scan, currItem->heapTid, itup))
+				break;
+
+			start++;
+		}
+
+		/*
+		 * set the starting point
+		 *
+		 * XXX might be better done in indexam.c
+		 */
+		if (ScanDirectionIsForward(dir))
+			scan->xs_batch.currIndex = -1;
+		else
+			scan->xs_batch.currIndex = scan->xs_batch.nheaptids;
+
+		/* shouldn't be possible to end here with an empty batch */
+		Assert(scan->xs_batch.nheaptids > 0);
+
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ *	_bt_next_batch() -- Get the next batch of items in a scan.
+ *
+ * A batch variant of _bt_next(). Most of the comments for that function
+ * apply here too.
+ *
+ * We should only get here only when the current batch has no more items
+ * in the given direction. We don't get here with empty batches, that's
+ * handled by _bt_fist_batch().
+ *
+ * XXX See also the comments at _bt_first_batch() about returning a single
+ * batch for the page, etc.
+ *
+ * FIXME There's a lot of redundant (almost the same) code here - handling
+ * the current and new leaf page is very similar, and it's also similar to
+ * _bt_first_batch(). We should try to reduce this a bit.
+ */
+bool
+_bt_next_batch(IndexScanDesc scan, ScanDirection dir)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+	int			start,
+				end;
+
+	/* should be valid items (with respect to the leaf page) */
+	Assert(so->currPos.firstItem <= scan->xs_batch.firstIndex);
+	Assert(scan->xs_batch.firstIndex <= scan->xs_batch.lastIndex);
+	Assert(scan->xs_batch.lastIndex <= so->currPos.lastItem);
+
+	/*
+	 * Try to increase the size of the batch. Intentionally done before trying
+	 * to read items from the current page, so that the increased batch
+	 * applies to that too.
+	 *
+	 * FIXME Should be done in indexam.c probably? FIXME Maybe it should grow
+	 * faster? This is what bitmap scans do.
+	 */
+	scan->xs_batch.currSize = Min(scan->xs_batch.currSize + 1,
+								  scan->xs_batch.maxSize);
+
+	/*
+	 * Check if we still have some items on the current leaf page. If yes,
+	 * load them into a batch and return.
+	 *
+	 * XXX try combining that with the next block, the inner while loop is
+	 * exactly the same.
+	 */
+	if (ScanDirectionIsForward(dir))
+	{
+		start = scan->xs_batch.lastIndex + 1;
+		end = Min(start + (scan->xs_batch.currSize - 1), so->currPos.lastItem);
+		so->currPos.itemIndex = (end + 1);
+	}
+	else
+	{
+		end = scan->xs_batch.firstIndex - 1;
+		start = Max(end - (scan->xs_batch.currSize - 1),
+					so->currPos.firstItem);
+		so->currPos.itemIndex = (start - 1);
+	}
+
+	/*
+	 * reset the batch before loading new data
+	 *
+	 * XXX needs to happen after we calculate the start/end above, as it
+	 * resets some of the fields needed by the calculation.
+	 */
+	index_batch_reset(scan, dir);
+
+	/*
+	 * We have more items on the current leaf page.
+	 */
+	if (start <= end)
+	{
+		/* update the "window" the batch represents */
+		scan->xs_batch.firstIndex = start;
+		scan->xs_batch.lastIndex = end;
+
+		/* should fit into the current batch */
+		Assert((end - start + 1) <= scan->xs_batch.currSize);
+
+		/* should be valid items (with respect to the leaf page) */
+		Assert(so->currPos.firstItem <= scan->xs_batch.firstIndex);
+		Assert(scan->xs_batch.firstIndex <= scan->xs_batch.lastIndex);
+		Assert(scan->xs_batch.lastIndex <= so->currPos.lastItem);
+
+		/*
+		 * Walk through the range of index tuples, copy them into the batch.
+		 * If requested, set the index tuple too.
+		 *
+		 * We don't know if the batch is full already - we just try to add it,
+		 * and bail out if it fails.
+		 *
+		 * FIXME This seems wrong, actually. We use currSize when calculating
+		 * the start/end range, so the add should always succeed.
+		 */
+		while (start <= end)
+		{
+			BTScanPosItem *currItem = &so->currPos.items[start];
+			IndexTuple	itup = NULL;
+
+			if (scan->xs_want_itup)
+				itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+			/* try to add it to batch, if there's space */
+			if (!index_batch_add(scan, currItem->heapTid, itup))
+				break;
+
+			start++;
+		}
+
+		/*
+		 * set the starting point
+		 *
+		 * XXX might be better done in indexam.c
+		 */
+		if (ScanDirectionIsForward(dir))
+			scan->xs_batch.currIndex = -1;
+		else
+			scan->xs_batch.currIndex = scan->xs_batch.nheaptids;
+
+		/* shouldn't be possible to end here with an empty batch */
+		Assert(scan->xs_batch.nheaptids > 0);
+
+		return true;
+	}
+
+	/*
+	 * We've consumed all items from the current leaf page, so try reading the
+	 * next one, and process it.
+	 */
+	if (_bt_next(scan, dir))
+	{
+		/*
+		 * Check if we still have some items on the current leaf page. If yes,
+		 * load them into a batch and return.
+		 *
+		 * XXX try combining that with the next block, the inner while loop is
+		 * exactly the same.
+		 */
+		if (ScanDirectionIsForward(dir))
+		{
+			start = so->currPos.firstItem;
+			end = Min(start + (scan->xs_batch.currSize - 1), so->currPos.lastItem);
+			so->currPos.itemIndex = (end + 1);
+		}
+		else
+		{
+			end = so->currPos.lastItem;
+			start = Max(end - (scan->xs_batch.currSize - 1),
+						so->currPos.firstItem);
+			so->currPos.itemIndex = (start - 1);
+		}
+
+		/* update the "window" the batch represents */
+		scan->xs_batch.firstIndex = start;
+		scan->xs_batch.lastIndex = end;
+
+		/* should fit into the current batch */
+		Assert((end - start + 1) <= scan->xs_batch.currSize);
+
+		/* should be valid items (with respect to the leaf page) */
+		Assert(so->currPos.firstItem <= scan->xs_batch.firstIndex);
+		Assert(scan->xs_batch.firstIndex <= scan->xs_batch.lastIndex);
+		Assert(scan->xs_batch.lastIndex <= so->currPos.lastItem);
+
+		/*
+		 * Walk through the range of index tuples, copy them into the batch.
+		 * If requested, set the index tuple too.
+		 *
+		 * We don't know if the batch is full already - we just try to add it,
+		 * and bail out if it fails.
+		 *
+		 * FIXME This seems wrong, actually. We use currSize when calculating
+		 * the start/end range, so the add should always succeed.
+		 */
+		while (start <= end)
+		{
+			BTScanPosItem *currItem = &so->currPos.items[start];
+			IndexTuple	itup = NULL;
+
+			if (scan->xs_want_itup)
+				itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+			/* try to add it to batch, if there's space */
+			if (!index_batch_add(scan, currItem->heapTid, itup))
+				break;
+
+			start++;
+		}
+
+		/*
+		 * set the starting point
+		 *
+		 * XXX might be better done in indexam.c
+		 */
+		if (ScanDirectionIsForward(dir))
+			scan->xs_batch.currIndex = -1;
+		else
+			scan->xs_batch.currIndex = scan->xs_batch.nheaptids;
+
+		/* shouldn't be possible to end here with an empty batch */
+		Assert(scan->xs_batch.nheaptids > 0);
+
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ *	_bt_kill_batch() -- remember the items-to-be-killed from the current batch
+ *
+ * We simply translate the bitmap into the "regular" killedItems array, and let
+ * that to drive which items are killed.
+ */
+void
+_bt_kill_batch(IndexScanDesc scan)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+	for (int i = 0; i < scan->xs_batch.nheaptids; i++)
+	{
+		/* Skip batch items not marked as killed. */
+		if (!scan->xs_batch.killedItems[i])
+			continue;
+
+		/*
+		 * Yes, remember it for later. (We'll deal with all such tuples at
+		 * once right before leaving the index page.)  The test for numKilled
+		 * overrun is not just paranoia: if the caller reverses direction in
+		 * the indexscan then the same item might get entered multiple times.
+		 * It's not worth trying to optimize that, so we don't detect it, but
+		 * instead just forget any excess entries.
+		 */
+		if (so->killedItems == NULL)
+			so->killedItems = (int *)
+				palloc(MaxTIDsPerBTreePage * sizeof(int));
+		if (so->numKilled < MaxTIDsPerBTreePage)
+			so->killedItems[so->numKilled++] = (scan->xs_batch.firstIndex + i);
+	}
+}
+
 /*
  *	_bt_readpage() -- Load data from current index page into so->currPos
  *
@@ -2048,6 +2408,9 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 
 	Assert(BTScanPosIsValid(so->currPos));
 
+	/* Transfer killed items from the batch to the regular array. */
+	_bt_kill_batch(scan);
+
 	/* Before leaving current page, deal with any killed items */
 	if (so->numKilled > 0)
 		_bt_killitems(scan);
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 612c6738950..b883efc7fa0 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -49,7 +49,13 @@
 static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node);
 static void StoreIndexTuple(IndexOnlyScanState *node, TupleTableSlot *slot,
 							IndexTuple itup, TupleDesc itupdesc);
+static bool ios_prefetch_block(IndexScanDesc scan, ScanDirection direction,
+							   void *data, int index);
 
+/* values stored in ios_prefetch_block in the batch cache */
+#define		IOS_UNKNOWN_VISIBILITY		0	/* XXX default value */
+#define		IOS_ALL_VISIBLE				1
+#define		IOS_NOT_ALL_VISIBLE			2
 
 /* ----------------------------------------------------------------
  *		IndexOnlyNext
@@ -112,142 +118,363 @@ IndexOnlyNext(IndexOnlyScanState *node)
 						 node->ioss_NumScanKeys,
 						 node->ioss_OrderByKeys,
 						 node->ioss_NumOrderByKeys);
+
+		index_batch_init(scandesc, ForwardScanDirection);
 	}
 
 	/*
-	 * OK, now that we have what we need, fetch the next tuple.
+	 * If the batching is disabled by a GUC, or if it's not supported by the
+	 * index AM, do the original approach.
+	 *
+	 * XXX Maybe we should enable batching based on the plan too, so that we
+	 * don't do batching when it's probably useless (e.g. semijoins or queries
+	 * with LIMIT 1 etc.). But maybe the approach with slow ramp-up (starting
+	 * with small batches) will handle that well enough.
+	 *
+	 * XXX Perhaps it'd be possible to do both in index_getnext_slot(), i.e.
+	 * call either the original code without batching, or the new batching
+	 * code if supported/enabled. It's not great to have duplicated code.
 	 */
-	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
+	if (!(enable_indexscan_batching &&
+		  index_batch_supported(scandesc, direction) &&
+		  node->ioss_CanBatch))
 	{
-		bool		tuple_from_heap = false;
-
-		CHECK_FOR_INTERRUPTS();
-
 		/*
-		 * We can skip the heap fetch if the TID references a heap page on
-		 * which all tuples are known visible to everybody.  In any case,
-		 * we'll use the index tuple not the heap tuple as the data source.
-		 *
-		 * Note on Memory Ordering Effects: visibilitymap_get_status does not
-		 * lock the visibility map buffer, and therefore the result we read
-		 * here could be slightly stale.  However, it can't be stale enough to
-		 * matter.
-		 *
-		 * We need to detect clearing a VM bit due to an insert right away,
-		 * because the tuple is present in the index page but not visible. The
-		 * reading of the TID by this scan (using a shared lock on the index
-		 * buffer) is serialized with the insert of the TID into the index
-		 * (using an exclusive lock on the index buffer). Because the VM bit
-		 * is cleared before updating the index, and locking/unlocking of the
-		 * index page acts as a full memory barrier, we are sure to see the
-		 * cleared bit if we see a recently-inserted TID.
-		 *
-		 * Deletes do not update the index page (only VACUUM will clear out
-		 * the TID), so the clearing of the VM bit by a delete is not
-		 * serialized with this test below, and we may see a value that is
-		 * significantly stale. However, we don't care about the delete right
-		 * away, because the tuple is still visible until the deleting
-		 * transaction commits or the statement ends (if it's our
-		 * transaction). In either case, the lock on the VM buffer will have
-		 * been released (acting as a write barrier) after clearing the bit.
-		 * And for us to have a snapshot that includes the deleting
-		 * transaction (making the tuple invisible), we must have acquired
-		 * ProcArrayLock after that time, acting as a read barrier.
-		 *
-		 * It's worth going through this complexity to avoid needing to lock
-		 * the VM buffer, which could cause significant contention.
+		 * OK, now that we have what we need, fetch the next tuple.
 		 */
-		if (!VM_ALL_VISIBLE(scandesc->heapRelation,
-							ItemPointerGetBlockNumber(tid),
-							&node->ioss_VMBuffer))
+		while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
 		{
+			bool		tuple_from_heap = false;
+
+			CHECK_FOR_INTERRUPTS();
+
 			/*
-			 * Rats, we have to visit the heap to check visibility.
+			 * We can skip the heap fetch if the TID references a heap page on
+			 * which all tuples are known visible to everybody.  In any case,
+			 * we'll use the index tuple not the heap tuple as the data
+			 * source.
+			 *
+			 * Note on Memory Ordering Effects: visibilitymap_get_status does
+			 * not lock the visibility map buffer, and therefore the result we
+			 * read here could be slightly stale.  However, it can't be stale
+			 * enough to matter.
+			 *
+			 * We need to detect clearing a VM bit due to an insert right
+			 * away, because the tuple is present in the index page but not
+			 * visible. The reading of the TID by this scan (using a shared
+			 * lock on the index buffer) is serialized with the insert of the
+			 * TID into the index (using an exclusive lock on the index
+			 * buffer). Because the VM bit is cleared before updating the
+			 * index, and locking/unlocking of the index page acts as a full
+			 * memory barrier, we are sure to see the cleared bit if we see a
+			 * recently-inserted TID.
+			 *
+			 * Deletes do not update the index page (only VACUUM will clear
+			 * out the TID), so the clearing of the VM bit by a delete is not
+			 * serialized with this test below, and we may see a value that is
+			 * significantly stale. However, we don't care about the delete
+			 * right away, because the tuple is still visible until the
+			 * deleting transaction commits or the statement ends (if it's our
+			 * transaction). In either case, the lock on the VM buffer will
+			 * have been released (acting as a write barrier) after clearing
+			 * the bit. And for us to have a snapshot that includes the
+			 * deleting transaction (making the tuple invisible), we must have
+			 * acquired ProcArrayLock after that time, acting as a read
+			 * barrier.
+			 *
+			 * It's worth going through this complexity to avoid needing to
+			 * lock the VM buffer, which could cause significant contention.
 			 */
-			InstrCountTuples2(node, 1);
-			if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
-				continue;		/* no visible tuple, try next index entry */
+			if (!VM_ALL_VISIBLE(scandesc->heapRelation,
+								ItemPointerGetBlockNumber(tid),
+								&node->ioss_VMBuffer))
+			{
+				/*
+				 * Rats, we have to visit the heap to check visibility.
+				 */
+				InstrCountTuples2(node, 1);
+				if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
+					continue;	/* no visible tuple, try next index entry */
+
+				ExecClearTuple(node->ioss_TableSlot);
+
+				/*
+				 * Only MVCC snapshots are supported here, so there should be
+				 * no need to keep following the HOT chain once a visible
+				 * entry has been found.  If we did want to allow that, we'd
+				 * need to keep more state to remember not to call
+				 * index_getnext_tid next time.
+				 */
+				if (scandesc->xs_heap_continue)
+					elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+
+				/*
+				 * Note: at this point we are holding a pin on the heap page,
+				 * as recorded in scandesc->xs_cbuf.  We could release that
+				 * pin now, but it's not clear whether it's a win to do so.
+				 * The next index entry might require a visit to the same heap
+				 * page.
+				 */
+
+				tuple_from_heap = true;
+			}
 
-			ExecClearTuple(node->ioss_TableSlot);
+			/*
+			 * Fill the scan tuple slot with data from the index.  This might
+			 * be provided in either HeapTuple or IndexTuple format.
+			 * Conceivably an index AM might fill both fields, in which case
+			 * we prefer the heap format, since it's probably a bit cheaper to
+			 * fill a slot from.
+			 */
+			if (scandesc->xs_hitup)
+			{
+				/*
+				 * We don't take the trouble to verify that the provided tuple
+				 * has exactly the slot's format, but it seems worth doing a
+				 * quick check on the number of fields.
+				 */
+				Assert(slot->tts_tupleDescriptor->natts ==
+					   scandesc->xs_hitupdesc->natts);
+				ExecForceStoreHeapTuple(scandesc->xs_hitup, slot, false);
+			}
+			else if (scandesc->xs_itup)
+				StoreIndexTuple(node, slot, scandesc->xs_itup, scandesc->xs_itupdesc);
+			else
+				elog(ERROR, "no data returned for index-only scan");
 
 			/*
-			 * Only MVCC snapshots are supported here, so there should be no
-			 * need to keep following the HOT chain once a visible entry has
-			 * been found.  If we did want to allow that, we'd need to keep
-			 * more state to remember not to call index_getnext_tid next time.
+			 * If the index was lossy, we have to recheck the index quals.
 			 */
-			if (scandesc->xs_heap_continue)
-				elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+			if (scandesc->xs_recheck)
+			{
+				econtext->ecxt_scantuple = slot;
+				if (!ExecQualAndReset(node->recheckqual, econtext))
+				{
+					/* Fails recheck, so drop it and loop back for another */
+					InstrCountFiltered2(node, 1);
+					continue;
+				}
+			}
 
 			/*
-			 * Note: at this point we are holding a pin on the heap page, as
-			 * recorded in scandesc->xs_cbuf.  We could release that pin now,
-			 * but it's not clear whether it's a win to do so.  The next index
-			 * entry might require a visit to the same heap page.
+			 * We don't currently support rechecking ORDER BY distances.  (In
+			 * principle, if the index can support retrieval of the originally
+			 * indexed value, it should be able to produce an exact distance
+			 * calculation too.  So it's not clear that adding code here for
+			 * recheck/re-sort would be worth the trouble.  But we should at
+			 * least throw an error if someone tries it.)
 			 */
+			if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby)
+				ereport(ERROR,
+						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+						 errmsg("lossy distance functions are not supported in index-only scans")));
 
-			tuple_from_heap = true;
-		}
+			/*
+			 * If we didn't access the heap, then we'll need to take a
+			 * predicate lock explicitly, as if we had.  For now we do that at
+			 * page level.
+			 */
+			if (!tuple_from_heap)
+				PredicateLockPage(scandesc->heapRelation,
+								  ItemPointerGetBlockNumber(tid),
+								  estate->es_snapshot);
 
-		/*
-		 * Fill the scan tuple slot with data from the index.  This might be
-		 * provided in either HeapTuple or IndexTuple format.  Conceivably an
-		 * index AM might fill both fields, in which case we prefer the heap
-		 * format, since it's probably a bit cheaper to fill a slot from.
-		 */
-		if (scandesc->xs_hitup)
+			return slot;
+		}
+	}
+	else
+	{
+new_batch:
+		/* do we have TIDs in the current batch */
+		while ((tid = index_batch_getnext_tid(scandesc, direction)) != NULL)
 		{
+			bool		all_visible;
+			bool		tuple_from_heap = false;
+
+			CHECK_FOR_INTERRUPTS();
+
+			/* Is the index of the current item valid for the batch? */
+			Assert((scandesc->xs_batch.currIndex >= 0) &&
+				   (scandesc->xs_batch.currIndex < scandesc->xs_batch.nheaptids));
+
+			/* Prefetch some of the following items in the batch. */
+			index_batch_prefetch(scandesc, direction, ios_prefetch_block, node);
+
 			/*
-			 * We don't take the trouble to verify that the provided tuple has
-			 * exactly the slot's format, but it seems worth doing a quick
-			 * check on the number of fields.
+			 * Reuse the previously determined page visibility info, or
+			 * calculate it now. If we decided not to prefetch the block, the
+			 * page has to be all-visible.
+			 *
+			 * XXX It's a bir weird we use the visibility to decide if we
+			 * should skip prefetching the block, and then deduce the
+			 * visibility from that. Maybe we could/should have a more direct
+			 * way?
 			 */
-			Assert(slot->tts_tupleDescriptor->natts ==
-				   scandesc->xs_hitupdesc->natts);
-			ExecForceStoreHeapTuple(scandesc->xs_hitup, slot, false);
-		}
-		else if (scandesc->xs_itup)
-			StoreIndexTuple(node, slot, scandesc->xs_itup, scandesc->xs_itupdesc);
-		else
-			elog(ERROR, "no data returned for index-only scan");
+			all_visible = !ios_prefetch_block(scandesc, direction, node,
+											  scandesc->xs_batch.currIndex);
 
-		/*
-		 * If the index was lossy, we have to recheck the index quals.
-		 */
-		if (scandesc->xs_recheck)
-		{
-			econtext->ecxt_scantuple = slot;
-			if (!ExecQualAndReset(node->recheckqual, econtext))
+			/*
+			 * We can skip the heap fetch if the TID references a heap page on
+			 * which all tuples are known visible to everybody.  In any case,
+			 * we'll use the index tuple not the heap tuple as the data
+			 * source.
+			 *
+			 * Note on Memory Ordering Effects: visibilitymap_get_status does
+			 * not lock the visibility map buffer, and therefore the result we
+			 * read here could be slightly stale.  However, it can't be stale
+			 * enough to matter.
+			 *
+			 * We need to detect clearing a VM bit due to an insert right
+			 * away, because the tuple is present in the index page but not
+			 * visible. The reading of the TID by this scan (using a shared
+			 * lock on the index buffer) is serialized with the insert of the
+			 * TID into the index (using an exclusive lock on the index
+			 * buffer). Because the VM bit is cleared before updating the
+			 * index, and locking/unlocking of the index page acts as a full
+			 * memory barrier, we are sure to see the cleared bit if we see a
+			 * recently-inserted TID.
+			 *
+			 * Deletes do not update the index page (only VACUUM will clear
+			 * out the TID), so the clearing of the VM bit by a delete is not
+			 * serialized with this test below, and we may see a value that is
+			 * significantly stale. However, we don't care about the delete
+			 * right away, because the tuple is still visible until the
+			 * deleting transaction commits or the statement ends (if it's our
+			 * transaction). In either case, the lock on the VM buffer will
+			 * have been released (acting as a write barrier) after clearing
+			 * the bit. And for us to have a snapshot that includes the
+			 * deleting transaction (making the tuple invisible), we must have
+			 * acquired ProcArrayLock after that time, acting as a read
+			 * barrier.
+			 *
+			 * It's worth going through this complexity to avoid needing to
+			 * lock the VM buffer, which could cause significant contention.
+			 */
+			if (!all_visible)
 			{
-				/* Fails recheck, so drop it and loop back for another */
-				InstrCountFiltered2(node, 1);
-				continue;
+				/*
+				 * Rats, we have to visit the heap to check visibility.
+				 */
+				InstrCountTuples2(node, 1);
+				if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
+				{
+					/*
+					 * If we haven't found any visible tuple for the TID,
+					 * chances are all versions are dead and may kill it from
+					 * the index. If so, flag it in the kill bitmap - we'll
+					 * translate it to indexes later.
+					 *
+					 * XXX With the firstIndex/lastIndex it would not be too
+					 * hard to do the translation here. But do we want to? How
+					 * much is that considered an internal detail of the AM?
+					 *
+					 * FIXME Maybe we could/should make this part of
+					 * index_fetch_heap, so that we don't have to do this
+					 * after each call?
+					 */
+					if (scandesc->kill_prior_tuple)
+					{
+						scandesc->xs_batch.killedItems[scandesc->xs_batch.currIndex] = true;
+						scandesc->kill_prior_tuple = false;
+					}
+
+					continue;	/* no visible tuple, try next index entry */
+				}
+
+				ExecClearTuple(node->ioss_TableSlot);
+
+				/*
+				 * Only MVCC snapshots are supported here, so there should be
+				 * no need to keep following the HOT chain once a visible
+				 * entry has been found.  If we did want to allow that, we'd
+				 * need to keep more state to remember not to call
+				 * index_getnext_tid next time.
+				 */
+				if (scandesc->xs_heap_continue)
+					elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+
+				/*
+				 * Note: at this point we are holding a pin on the heap page,
+				 * as recorded in scandesc->xs_cbuf.  We could release that
+				 * pin now, but it's not clear whether it's a win to do so.
+				 * The next index entry might require a visit to the same heap
+				 * page.
+				 */
+
+				tuple_from_heap = true;
 			}
-		}
 
-		/*
-		 * We don't currently support rechecking ORDER BY distances.  (In
-		 * principle, if the index can support retrieval of the originally
-		 * indexed value, it should be able to produce an exact distance
-		 * calculation too.  So it's not clear that adding code here for
-		 * recheck/re-sort would be worth the trouble.  But we should at least
-		 * throw an error if someone tries it.)
-		 */
-		if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby)
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("lossy distance functions are not supported in index-only scans")));
+			/*
+			 * Fill the scan tuple slot with data from the index.  This might
+			 * be provided in either HeapTuple or IndexTuple format.
+			 * Conceivably an index AM might fill both fields, in which case
+			 * we prefer the heap format, since it's probably a bit cheaper to
+			 * fill a slot from.
+			 */
+			if (scandesc->xs_hitup)
+			{
+				/*
+				 * We don't take the trouble to verify that the provided tuple
+				 * has exactly the slot's format, but it seems worth doing a
+				 * quick check on the number of fields.
+				 */
+				Assert(slot->tts_tupleDescriptor->natts ==
+					   scandesc->xs_hitupdesc->natts);
+				ExecForceStoreHeapTuple(scandesc->xs_hitup, slot, false);
+			}
+			else if (scandesc->xs_itup)
+				StoreIndexTuple(node, slot, scandesc->xs_itup, scandesc->xs_itupdesc);
+			else
+				elog(ERROR, "no data returned for index-only scan");
 
-		/*
-		 * If we didn't access the heap, then we'll need to take a predicate
-		 * lock explicitly, as if we had.  For now we do that at page level.
-		 */
-		if (!tuple_from_heap)
-			PredicateLockPage(scandesc->heapRelation,
-							  ItemPointerGetBlockNumber(tid),
-							  estate->es_snapshot);
+			/*
+			 * If the index was lossy, we have to recheck the index quals.
+			 */
+			if (scandesc->xs_recheck)
+			{
+				econtext->ecxt_scantuple = slot;
+				if (!ExecQualAndReset(node->recheckqual, econtext))
+				{
+					/* Fails recheck, so drop it and loop back for another */
+					InstrCountFiltered2(node, 1);
+					continue;
+				}
+			}
+
+			/*
+			 * We don't currently support rechecking ORDER BY distances.  (In
+			 * principle, if the index can support retrieval of the originally
+			 * indexed value, it should be able to produce an exact distance
+			 * calculation too.  So it's not clear that adding code here for
+			 * recheck/re-sort would be worth the trouble.  But we should at
+			 * least throw an error if someone tries it.)
+			 */
+			if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby)
+				ereport(ERROR,
+						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+						 errmsg("lossy distance functions are not supported in index-only scans")));
+
+			/*
+			 * If we didn't access the heap, then we'll need to take a
+			 * predicate lock explicitly, as if we had.  For now we do that at
+			 * page level.
+			 */
+			if (!tuple_from_heap)
+				PredicateLockPage(scandesc->heapRelation,
+								  ItemPointerGetBlockNumber(tid),
+								  estate->es_snapshot);
+
+			return slot;
+		}
 
-		return slot;
+		/* batch is empty, try reading the next batch of tuples */
+		if (index_batch_getnext(scandesc, direction))
+		{
+			index_batch_prefetch(scandesc, direction, ios_prefetch_block, node);
+			goto new_batch;
+		}
+
+		return NULL;
 	}
 
 	/*
@@ -574,6 +801,16 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
 	indexstate->recheckqual =
 		ExecInitQual(node->recheckqual, (PlanState *) indexstate);
 
+	/*
+	 * Can't do batching (and thus prefetching) when the plan requires mark
+	 * and restore. There's an issue with translating the mark/restore
+	 * positions between the batch in scan descriptor and the original
+	 * position recognized in the index AM.
+	 *
+	 * XXX Hopefully just a temporary limitation?
+	 */
+	indexstate->ioss_CanBatch = !(eflags & EXEC_FLAG_MARK);
+
 	/*
 	 * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
 	 * here.  This allows an index-advisor plugin to EXPLAIN a plan containing
@@ -744,6 +981,14 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node,
 	node->ioss_ScanDesc->xs_want_itup = true;
 	node->ioss_VMBuffer = InvalidBuffer;
 
+	/*
+	 * XXX do we actually want prefetching for parallel index scans? Maybe
+	 * not, but then we need to be careful not to call index_batch_getnext_tid
+	 * (which now can happen, because we'll call IndexOnlyNext even for
+	 * parallel plans).
+	 */
+	index_batch_init(node->ioss_ScanDesc, ForwardScanDirection);
+
 	/*
 	 * If no run-time keys to calculate or they are ready, go ahead and pass
 	 * the scankeys to the index AM.
@@ -788,6 +1033,9 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node,
 								 piscan);
 	node->ioss_ScanDesc->xs_want_itup = true;
 
+	/* XXX do we actually want prefetching for parallel index scans? */
+	index_batch_init(node->ioss_ScanDesc, ForwardScanDirection);
+
 	/*
 	 * If no run-time keys to calculate or they are ready, go ahead and pass
 	 * the scankeys to the index AM.
@@ -797,3 +1045,35 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node,
 					 node->ioss_ScanKeys, node->ioss_NumScanKeys,
 					 node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);
 }
+
+/*
+ * ios_prefetch_block
+ *		Callback to only prefetch blocks that are not all-visible.
+ *
+ * We don't want to inspect the visibility map repeatedly, so the result of
+ * VM_ALL_VISIBLE is stored in the batch private data. The values are set
+ * to 0 by default, so we use two constants to remember if all-visible or
+ * not all-visible.
+ */
+static bool
+ios_prefetch_block(IndexScanDesc scan, ScanDirection direction,
+				   void *arg, int index)
+{
+	IndexOnlyScanState *node = (IndexOnlyScanState *) arg;
+
+	if (scan->xs_batch.privateData[index] == IOS_UNKNOWN_VISIBILITY)
+	{
+		bool		all_visible;
+		ItemPointer tid = &scan->xs_batch.heaptids[index];
+
+		all_visible = VM_ALL_VISIBLE(scan->heapRelation,
+									 ItemPointerGetBlockNumber(tid),
+									 &node->ioss_VMBuffer);
+
+		scan->xs_batch.privateData[index]
+			= all_visible ? IOS_ALL_VISIBLE : IOS_NOT_ALL_VISIBLE;
+	}
+
+	/* prefetch only blocks that are not all-visible */
+	return (scan->xs_batch.privateData[index] == IOS_NOT_ALL_VISIBLE);
+}
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index 8000feff4c9..d9f926536ad 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -38,6 +38,7 @@
 #include "lib/pairingheap.h"
 #include "miscadmin.h"
 #include "nodes/nodeFuncs.h"
+#include "optimizer/cost.h"
 #include "utils/array.h"
 #include "utils/datum.h"
 #include "utils/lsyscache.h"
@@ -122,31 +123,89 @@ IndexNext(IndexScanState *node)
 			index_rescan(scandesc,
 						 node->iss_ScanKeys, node->iss_NumScanKeys,
 						 node->iss_OrderByKeys, node->iss_NumOrderByKeys);
+
+		index_batch_init(scandesc, ForwardScanDirection);
 	}
 
 	/*
-	 * ok, now that we have what we need, fetch the next tuple.
+	 * If the batching is disabled by a GUC, or if it's not supported by the
+	 * index AM, do the original approach.
+	 *
+	 * XXX Maybe we should enable batching based on the plan too, so that we
+	 * don't do batching when it's probably useless (e.g. semijoins or queries
+	 * with LIMIT 1 etc.). But maybe the approach with slow ramp-up (starting
+	 * with small batches) will handle that well enough.
+	 *
+	 * XXX Perhaps it'd be possible to do both in index_getnext_slot(), i.e.
+	 * call either the original code without batching, or the new batching
+	 * code if supported/enabled. It's not great to have duplicated code.
 	 */
-	while (index_getnext_slot(scandesc, direction, slot))
+	if (!(enable_indexscan_batching &&
+		  index_batch_supported(scandesc, direction) &&
+		  node->iss_CanBatch))
 	{
-		CHECK_FOR_INTERRUPTS();
-
 		/*
-		 * If the index was lossy, we have to recheck the index quals using
-		 * the fetched tuple.
+		 * ok, now that we have what we need, fetch the next tuple.
 		 */
-		if (scandesc->xs_recheck)
+		while (index_getnext_slot(scandesc, direction, slot))
 		{
-			econtext->ecxt_scantuple = slot;
-			if (!ExecQualAndReset(node->indexqualorig, econtext))
+			CHECK_FOR_INTERRUPTS();
+
+			/*
+			 * If the index was lossy, we have to recheck the index quals
+			 * using the fetched tuple.
+			 */
+			if (scandesc->xs_recheck)
 			{
-				/* Fails recheck, so drop it and loop back for another */
-				InstrCountFiltered2(node, 1);
-				continue;
+				econtext->ecxt_scantuple = slot;
+				if (!ExecQualAndReset(node->indexqualorig, econtext))
+				{
+					/* Fails recheck, so drop it and loop back for another */
+					InstrCountFiltered2(node, 1);
+					continue;
+				}
 			}
+
+			return slot;
 		}
+	}
+	else
+	{
+new_batch:
+		/* do we have TIDs in the current batch */
+		while (index_batch_getnext_slot(scandesc, direction, slot))
+		{
+			CHECK_FOR_INTERRUPTS();
 
-		return slot;
+			/* first, take care of prefetching further items */
+			index_batch_prefetch(scandesc, direction, NULL, NULL);
+
+			/*
+			 * If the index was lossy, we have to recheck the index quals
+			 * using the fetched tuple.
+			 */
+			if (scandesc->xs_recheck)
+			{
+				econtext->ecxt_scantuple = slot;
+				if (!ExecQualAndReset(node->indexqualorig, econtext))
+				{
+					/* Fails recheck, so drop it and loop back for another */
+					InstrCountFiltered2(node, 1);
+					continue;
+				}
+			}
+
+			return slot;
+		}
+
+		/* batch is empty, try reading the next batch of tuples */
+		if (index_batch_getnext(scandesc, direction))
+		{
+			index_batch_prefetch(scandesc, direction, NULL, NULL);
+			goto new_batch;
+		}
+
+		return NULL;
 	}
 
 	/*
@@ -942,6 +1001,16 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags)
 	indexstate->indexorderbyorig =
 		ExecInitExprList(node->indexorderbyorig, (PlanState *) indexstate);
 
+	/*
+	 * Can't do batching (and thus prefetching) when the plan requires mark
+	 * and restore. There's an issue with translating the mark/restore
+	 * positions between the batch in scan descriptor and the original
+	 * position recognized in the index AM.
+	 *
+	 * XXX Hopefully just a temporary limitation?
+	 */
+	indexstate->iss_CanBatch = !(eflags & EXEC_FLAG_MARK);
+
 	/*
 	 * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
 	 * here.  This allows an index-advisor plugin to EXPLAIN a plan containing
@@ -1677,6 +1746,14 @@ ExecIndexScanInitializeDSM(IndexScanState *node,
 								 node->iss_NumOrderByKeys,
 								 piscan);
 
+	/*
+	 * XXX do we actually want prefetching for parallel index scans? Maybe
+	 * not, but then we need to be careful not to call index_batch_getnext_tid
+	 * (which now can happen, because we'll call IndexOnlyNext even for
+	 * parallel plans).
+	 */
+	index_batch_init(node->iss_ScanDesc, ForwardScanDirection);
+
 	/*
 	 * If no run-time keys to calculate or they are ready, go ahead and pass
 	 * the scankeys to the index AM.
@@ -1720,6 +1797,14 @@ ExecIndexScanInitializeWorker(IndexScanState *node,
 								 node->iss_NumOrderByKeys,
 								 piscan);
 
+	/*
+	 * XXX do we actually want prefetching for parallel index scans? Maybe
+	 * not, but then we need to be careful not to call index_batch_getnext_tid
+	 * (which now can happen, because we'll call IndexOnlyNext even for
+	 * parallel plans).
+	 */
+	index_batch_init(node->iss_ScanDesc, ForwardScanDirection);
+
 	/*
 	 * If no run-time keys to calculate or they are ready, go ahead and pass
 	 * the scankeys to the index AM.
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 521ec5591c8..47a9ed9d527 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -789,6 +789,16 @@ struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
+	{
+		{"enable_indexscan_batching", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Enables the planner's use of index-scan batching."),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&enable_indexscan_batching,
+		true,
+		NULL, NULL, NULL
+	},
 	{
 		{"enable_indexonlyscan", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enables the planner's use of index-only-scan plans."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 667e0dc40a2..b2509337755 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -398,6 +398,7 @@
 #enable_hashjoin = on
 #enable_incremental_sort = on
 #enable_indexscan = on
+#enable_indexscan_batching = on
 #enable_indexonlyscan = on
 #enable_material = on
 #enable_memoize = on
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index f25c9d58a7d..8870f7eb676 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -177,6 +177,10 @@ typedef void (*amrescan_function) (IndexScanDesc scan,
 typedef bool (*amgettuple_function) (IndexScanDesc scan,
 									 ScanDirection direction);
 
+/* next batch of valid tuples */
+typedef bool (*amgettuplebatch_function) (IndexScanDesc scan,
+										  ScanDirection direction);
+
 /* fetch all valid tuples */
 typedef int64 (*amgetbitmap_function) (IndexScanDesc scan,
 									   TIDBitmap *tbm);
@@ -280,6 +284,7 @@ typedef struct IndexAmRoutine
 	ambeginscan_function ambeginscan;
 	amrescan_function amrescan;
 	amgettuple_function amgettuple; /* can be NULL */
+	amgettuplebatch_function amgettuplebatch;	/* can be NULL */
 	amgetbitmap_function amgetbitmap;	/* can be NULL */
 	amendscan_function amendscan;
 	ammarkpos_function ammarkpos;	/* can be NULL */
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index fdcfbe8db74..01b1cee3d30 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -14,6 +14,7 @@
 #ifndef GENAM_H
 #define GENAM_H
 
+#include "access/itup.h"
 #include "access/sdir.h"
 #include "access/skey.h"
 #include "nodes/tidbitmap.h"
@@ -132,6 +133,8 @@ typedef struct IndexOrderByDistance
  * generalized index_ interface routines (in indexam.c)
  */
 
+extern PGDLLIMPORT bool enable_indexscan_batching;
+
 /*
  * IndexScanIsValid
  *		True iff the index scan is valid.
@@ -182,6 +185,27 @@ extern bool index_getnext_slot(IndexScanDesc scan, ScanDirection direction,
 							   struct TupleTableSlot *slot);
 extern int64 index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap);
 
+extern bool index_batch_getnext(IndexScanDesc scan,
+								ScanDirection direction);
+extern ItemPointer index_batch_getnext_tid(IndexScanDesc scan,
+										   ScanDirection direction);
+extern bool index_batch_getnext_slot(IndexScanDesc scan,
+									 ScanDirection direction,
+									 struct TupleTableSlot *slot);
+extern bool index_batch_supported(IndexScanDesc scan, ScanDirection direction);
+extern void index_batch_init(IndexScanDesc scan, ScanDirection direction);
+extern void index_batch_reset(IndexScanDesc scan, ScanDirection direction);
+extern bool index_batch_add(IndexScanDesc scan, ItemPointerData tid, IndexTuple itup);
+
+/*
+ * Typedef for callback function to determine if an item in index scan should
+ * be prefetched.
+ */
+typedef bool (*index_prefetch_callback) (IndexScanDesc scan, ScanDirection direction,
+										 void *arg, int index);
+extern void index_batch_prefetch(IndexScanDesc scan, ScanDirection direction,
+								 index_prefetch_callback callback, void *arg);
+
 extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info,
 												IndexBulkDeleteResult *istat,
 												IndexBulkDeleteCallback callback,
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 9af9b3ecdcc..db1ba7ed647 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1172,6 +1172,7 @@ extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys);
 extern Size btestimateparallelscan(int nkeys, int norderbys);
 extern void btinitparallelscan(void *target);
 extern bool btgettuple(IndexScanDesc scan, ScanDirection dir);
+extern bool btgettuplebatch(IndexScanDesc scan, ScanDirection dir);
 extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
 extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 					 ScanKey orderbys, int norderbys);
@@ -1276,6 +1277,9 @@ extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
 extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum);
 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
+extern bool _bt_first_batch(IndexScanDesc scan, ScanDirection dir);
+extern bool _bt_next_batch(IndexScanDesc scan, ScanDirection dir);
+extern void _bt_kill_batch(IndexScanDesc scan);
 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
 
 /*
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 521043304ab..a58c294a8e5 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -151,6 +151,46 @@ typedef struct IndexScanDescData
 
 	bool		xs_recheck;		/* T means scan keys must be rechecked */
 
+	/*
+	 * Data about the current TID batch returned by the index AM.
+	 *
+	 * XXX Maybe this should be a separate struct instead, and the scan
+	 * descriptor would have only a pointer, initialized only when the
+	 * batching is actually used?
+	 *
+	 * XXX It's not quite clear which part of this is managed by indexam and
+	 * what's up to the actual index AM implementation. Needs some clearer
+	 * boundaries.
+	 *
+	 * XXX Should we have a pointer for optional state managed by the AM? Some
+	 * custom AMs may need more per-batch information, not just the fields we
+	 * have here.
+	 */
+	struct
+	{
+		/* batch size - maximum, initial, current (with ramp up) */
+		int			maxSize;
+		int			initSize;
+		int			currSize;
+
+		/* batch prefetching */
+		int			prefetchTarget; /* current prefetch distance */
+		int			prefetchMaximum;	/* maximum prefetch distance */
+		int			prefetchIndex;	/* next item to prefetch */
+
+		/* range of leaf page items copied into the current batch */
+		int			firstIndex;
+		int			lastIndex;
+
+		/* batch contents (TIDs, index tuples, kill bitmap, ...) */
+		int			nheaptids;	/* number of TIDs in the batch */
+		int			currIndex;	/* index of the current item */
+		ItemPointerData *heaptids;	/* TIDs in the batch */
+		IndexTuple *itups;		/* IndexTuples, if requested */
+		bool	   *killedItems;	/* bitmap of tuples to kill */
+		Datum	   *privateData;	/* private data for batch */
+	}			xs_batch;
+
 	/*
 	 * When fetching with an ordering operator, the values of the ORDER BY
 	 * expressions of the last returned tuple, according to the index.  If
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index af7d8fd1e72..ffa0923ed3c 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1644,6 +1644,7 @@ typedef struct
  *		OrderByTypByVals   is the datatype of order by expression pass-by-value?
  *		OrderByTypLens	   typlens of the datatypes of order by expressions
  *		PscanLen		   size of parallel index scan descriptor
+ *		CanBatch		   batching (and prefetching) enabled
  * ----------------
  */
 typedef struct IndexScanState
@@ -1671,6 +1672,10 @@ typedef struct IndexScanState
 	bool	   *iss_OrderByTypByVals;
 	int16	   *iss_OrderByTypLens;
 	Size		iss_PscanLen;
+
+	/* batching/prefetching enabled? */
+	bool		iss_CanBatch;
+
 } IndexScanState;
 
 /* ----------------
@@ -1692,6 +1697,7 @@ typedef struct IndexScanState
  *		PscanLen		   size of parallel index-only scan descriptor
  *		NameCStringAttNums attnums of name typed columns to pad to NAMEDATALEN
  *		NameCStringCount   number of elements in the NameCStringAttNums array
+ *		CanBatch		   batching (and prefetching) enabled
  * ----------------
  */
 typedef struct IndexOnlyScanState
@@ -1713,6 +1719,7 @@ typedef struct IndexOnlyScanState
 	Size		ioss_PscanLen;
 	AttrNumber *ioss_NameCStringAttNums;
 	int			ioss_NameCStringCount;
+	bool		ioss_CanBatch;
 } IndexOnlyScanState;
 
 /* ----------------
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index fad7fc3a7e0..14b38ed4d46 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -157,6 +157,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_incremental_sort        | on
  enable_indexonlyscan           | on
  enable_indexscan               | on
+ enable_indexscan_batching      | on
  enable_material                | on
  enable_memoize                 | on
  enable_mergejoin               | on
@@ -170,7 +171,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(22 rows)
+(23 rows)
 
 -- There are always wait event descriptions for various types.  InjectionPoint
 -- may be present or absent, depending on history since last postmaster start.
-- 
2.46.0

