From 4474a058c7a1d65b26c9d450065678823e73ce8e Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Mon, 30 Sep 2024 22:48:39 +0200
Subject: [PATCH v20241106 2/7] WIP: batching for nbtree indexes

Adds batching/prefetching for btree indexes. Returns only batches from a
single leaf page. Does not support mark/restore yet.
---
 src/backend/access/nbtree/nbtree.c    |  65 ++++++
 src/backend/access/nbtree/nbtsearch.c | 280 ++++++++++++++++++++++++++
 src/include/access/nbtree.h           |  15 ++
 src/tools/pgindent/typedefs.list      |   1 +
 4 files changed, 361 insertions(+)

diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 2919b12639d..bbaa3df2a0c 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -141,6 +141,7 @@ bthandler(PG_FUNCTION_ARGS)
 	amroutine->ambeginscan = btbeginscan;
 	amroutine->amrescan = btrescan;
 	amroutine->amgettuple = btgettuple;
+	amroutine->amgetbatch = btgetbatch;
 	amroutine->amgetbitmap = btgetbitmap;
 	amroutine->amendscan = btendscan;
 	amroutine->ammarkpos = btmarkpos;
@@ -259,6 +260,53 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
 	return res;
 }
 
+/*
+ *	btgetbatch() -- Get the next batch of tuples in the scan.
+ *
+ * XXX Pretty much like btgettuple(), but for batches of tuples.
+ */
+bool
+btgetbatch(IndexScanDesc scan, ScanDirection dir)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	bool		res;
+
+	/* btree indexes are never lossy */
+	scan->xs_recheck = false;
+
+	/* Each loop iteration performs another primitive index scan */
+	do
+	{
+		/*
+		 * If we've already initialized this scan, we can just advance it in
+		 * the appropriate direction.  If we haven't done so yet, we call
+		 * _bt_first() to get the first item in the scan.
+		 */
+		if (!BTScanPosIsValid(so->currPos))
+			res = _bt_first_batch(scan, dir);
+		else
+		{
+			/*
+			 * Check to see if we should kill tuples from the previous batch.
+			 */
+			_bt_kill_batch(scan);
+
+			/*
+			 * Now continue the scan.
+			 */
+			res = _bt_next_batch(scan, dir);
+		}
+
+		/* If we have a tuple, return it ... */
+		if (res)
+			break;
+
+		/* ... otherwise see if we need another primitive index scan */
+	} while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
+
+	return res;
+}
+
 /*
  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
  */
@@ -339,6 +387,10 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 	so->killedItems = NULL;		/* until needed */
 	so->numKilled = 0;
 
+	/* batch range, initially empty */
+	so->batch.firstIndex = -1;
+	so->batch.lastIndex = -1;
+
 	/*
 	 * We don't know yet whether the scan will be index-only, so we do not
 	 * allocate the tuple workspace arrays until btrescan.  However, we set up
@@ -365,6 +417,9 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	/* we aren't holding any read locks, but gotta drop the pins */
 	if (BTScanPosIsValid(so->currPos))
 	{
+		/* Transfer killed items from the batch to the regular array. */
+		_bt_kill_batch(scan);
+
 		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
@@ -408,6 +463,10 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 		memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
 	so->numberOfKeys = 0;		/* until _bt_preprocess_keys sets it */
 	so->numArrayKeys = 0;		/* ditto */
+
+	/* Reset the batch (even if not batched scan) */
+	so->batch.firstIndex = -1;
+	so->batch.lastIndex = -1;
 }
 
 /*
@@ -421,6 +480,9 @@ btendscan(IndexScanDesc scan)
 	/* we aren't holding any read locks, but gotta drop the pins */
 	if (BTScanPosIsValid(so->currPos))
 	{
+		/* Transfer killed items from the batch to the regular array. */
+		_bt_kill_batch(scan);
+
 		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
@@ -501,6 +563,9 @@ btrestrpos(IndexScanDesc scan)
 		 */
 		if (BTScanPosIsValid(so->currPos))
 		{
+			/* Transfer killed items from the batch to the regular array. */
+			_bt_kill_batch(scan);
+
 			/* Before leaving current page, deal with any killed items */
 			if (so->numKilled > 0)
 				_bt_killitems(scan);
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 1608dd49d57..931edab0550 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1504,6 +1504,283 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
 	return true;
 }
 
+
+static void
+AssertCheckBTBatchInfo(BTScanOpaque so)
+{
+#ifdef USE_ASSERT_CHECKING
+	/* should be valid items (with respect to the current leaf page) */
+	Assert(so->currPos.firstItem <= so->batch.firstIndex);
+	Assert(so->batch.firstIndex <= so->batch.lastIndex);
+	Assert(so->batch.lastIndex <= so->currPos.lastItem);
+#endif
+}
+
+/*
+ * _bt_copy_batch
+ *		Copy a section of the leaf page into the batch.
+ */
+static void
+_bt_copy_batch(IndexScanDesc scan, ScanDirection dir, BTScanOpaque so,
+			   int start, int end)
+{
+	/*
+	 * We're reading the first batch, and there should always be at least one
+	 * item (otherwise _bt_first would return false). So we should never get
+	 * into situation with empty start/end range. In the worst case, there is
+	 * just a single item, in which case (start == end).
+	 */
+	Assert(start <= end);
+
+	/* The range of items should fit into the current batch size. */
+	Assert((end - start + 1) <= scan->xs_batch->currSize);
+
+	so->batch.firstIndex = start;
+	so->batch.lastIndex = end;
+
+	AssertCheckBTBatchInfo(so);
+
+	/*
+	 * Walk through the range of index tuples, copy them into the batch. If
+	 * requested, set the index tuple too.
+	 *
+	 * We don't know if the batch is full already - we just try to add it, and
+	 * bail out if it fails.
+	 *
+	 * FIXME This seems wrong, actually. We use currSize when calculating the
+	 * start/end range, so the add should always succeed.
+	 */
+	while (start <= end)
+	{
+		BTScanPosItem *currItem = &so->currPos.items[start];
+		IndexTuple	itup = NULL;
+
+		if (scan->xs_want_itup)
+			itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+		/* try to add it to batch, if there's space */
+		if (!index_batch_add(scan, currItem->heapTid, scan->xs_recheck, itup, NULL))
+			break;
+
+		start++;
+	}
+
+	/*
+	 * set the starting point
+	 *
+	 * XXX might be better done in indexam.c
+	 */
+	if (ScanDirectionIsForward(dir))
+		scan->xs_batch->currIndex = -1;
+	else
+		scan->xs_batch->currIndex = scan->xs_batch->nheaptids;
+
+	/* shouldn't be possible to end here with an empty batch */
+	Assert(scan->xs_batch->nheaptids > 0);
+}
+
+/*
+ *	_bt_first_batch() -- Find the first batch in a scan.
+ *
+ * A batch variant of _bt_first(). Most of the comments for that function
+ * apply here too.
+ *
+ * XXX This only populates the batch, it does not set any other fields like
+ * scan->xs_heaptid or scan->xs_itup. That happens in getnext_tid() calls.
+ *
+ * XXX I'm not sure it works to mix batched and non-batches calls, e.g. get
+ * a TID and then a batch of TIDs. It probably should work as long as we
+ * update itemIndex correctly, but we need to be careful about killed items
+ * (right now the two places use different ways to communicate which items
+ * should be killed).
+ */
+bool
+_bt_first_batch(IndexScanDesc scan, ScanDirection dir)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+	Assert(scan->xs_batch->nheaptids == 0);
+
+	/*
+	 * Mark the batch as empty.
+	 *
+	 * This might seems a bit strange, because surely the batch should be
+	 * empty before reading the first batch (we reset those indexes both in
+	 * btbeginscan and btrescan). So why not an assert? We can get here in
+	 * different ways too - not just after beginscan/rescan, but also when
+	 * iterating over ScalarArrayOps - in which case we'll see the last batch
+	 * of the preceding scan.
+	 */
+	so->batch.firstIndex = -1;
+	so->batch.lastIndex = -1;
+
+	/* we haven't visited any leaf pages yet, so proceed to reading one */
+	if (_bt_first(scan, dir))
+	{
+		/* range of the leaf to copy into the batch */
+		int			start,
+					end;
+
+		/* determine which part of the leaf page to extract */
+		if (ScanDirectionIsForward(dir))
+		{
+			start = so->currPos.firstItem;
+			end = Min(start + (scan->xs_batch->currSize - 1), so->currPos.lastItem);
+			so->currPos.itemIndex = (end + 1);
+		}
+		else
+		{
+			end = so->currPos.lastItem;
+			start = Max(end - (scan->xs_batch->currSize - 1),
+						so->currPos.firstItem);
+			so->currPos.itemIndex = (start - 1);
+		}
+
+		/*
+		 * Copy the selected range of items into the batch, set the batch
+		 * current index properly (before first / after last item, depending
+		 * on scan direction.
+		 */
+		_bt_copy_batch(scan, dir, so, start, end);
+
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ *	_bt_next_batch() -- Get the next batch of items in a scan.
+ *
+ * A batch variant of _bt_next(). Most of the comments for that function
+ * apply here too.
+ *
+ * We should only get here only when the current batch has no more items
+ * in the given direction. We don't get here with empty batches, that's
+ * handled by _bt_fist_batch().
+ *
+ * XXX See also the comments at _bt_first_batch() about returning a single
+ * batch for the page, etc.
+ */
+bool
+_bt_next_batch(IndexScanDesc scan, ScanDirection dir)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+	int			start,
+				end;
+
+	Assert(scan->xs_batch->nheaptids == 0);
+
+	AssertCheckBTBatchInfo(so);
+
+	/*
+	 * Check if we still have some items on the current leaf page. If yes,
+	 * load them into a batch and return.
+	 *
+	 * XXX try combining that with the next block, the inner while loop is
+	 * exactly the same.
+	 */
+	if (ScanDirectionIsForward(dir))
+	{
+		start = so->batch.lastIndex + 1;
+		end = Min(start + (scan->xs_batch->currSize - 1), so->currPos.lastItem);
+		so->currPos.itemIndex = (end + 1);
+	}
+	else
+	{
+		end = so->batch.firstIndex - 1;
+		start = Max(end - (scan->xs_batch->currSize - 1),
+					so->currPos.firstItem);
+		so->currPos.itemIndex = (start - 1);
+	}
+
+	/*
+	 * We have more items on the current leaf page.
+	 */
+	if (start <= end)
+	{
+		_bt_copy_batch(scan, dir, so, start, end);
+		return true;
+	}
+
+	/*
+	 * We've consumed all items from the current leaf page, so try reading the
+	 * next one, and process it.
+	 */
+	if (_bt_next(scan, dir))
+	{
+		/*
+		 * Check if we still have some items on the current leaf page. If yes,
+		 * load them into a batch and return.
+		 *
+		 * XXX try combining that with the next block, the inner while loop is
+		 * exactly the same.
+		 */
+		if (ScanDirectionIsForward(dir))
+		{
+			start = so->currPos.firstItem;
+			end = Min(start + (scan->xs_batch->currSize - 1), so->currPos.lastItem);
+			so->currPos.itemIndex = (end + 1);
+		}
+		else
+		{
+			end = so->currPos.lastItem;
+			start = Max(end - (scan->xs_batch->currSize - 1),
+						so->currPos.firstItem);
+			so->currPos.itemIndex = (start - 1);
+		}
+
+		_bt_copy_batch(scan, dir, so, start, end);
+
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ *	_bt_kill_batch() -- remember the items-to-be-killed from the current batch
+ *
+ * We simply translate the bitmap into the "regular" killedItems array, and let
+ * that to drive which items are killed.
+ */
+void
+_bt_kill_batch(IndexScanDesc scan)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+	/* bail out if batching not enabled */
+	if (!scan->xs_batch)
+		return;
+
+	for (int i = 0; i < scan->xs_batch->nKilledItems; i++)
+	{
+		int			index = (so->batch.firstIndex + scan->xs_batch->killedItems[i]);
+
+		/* make sure we have a valid index (in the current leaf page) */
+		Assert((so->batch.firstIndex <= index) &&
+			   (index <= so->batch.lastIndex));
+
+		/*
+		 * Yes, remember it for later. (We'll deal with all such tuples at
+		 * once right before leaving the index page.)  The test for numKilled
+		 * overrun is not just paranoia: if the caller reverses direction in
+		 * the indexscan then the same item might get entered multiple times.
+		 * It's not worth trying to optimize that, so we don't detect it, but
+		 * instead just forget any excess entries.
+		 */
+		if (so->killedItems == NULL)
+			so->killedItems = (int *)
+				palloc(MaxTIDsPerBTreePage * sizeof(int));
+		if (so->numKilled < MaxTIDsPerBTreePage)
+			so->killedItems[so->numKilled++] = index;
+	}
+
+	/* now reset the number of killed items */
+	scan->xs_batch->nKilledItems = 0;
+}
+
 /*
  *	_bt_readpage() -- Load data from current index page into so->currPos
  *
@@ -2028,6 +2305,9 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 
 	Assert(BTScanPosIsValid(so->currPos));
 
+	/* Transfer killed items from the batch to the regular array. */
+	_bt_kill_batch(scan);
+
 	/* Before leaving current page, deal with any killed items */
 	if (so->numKilled > 0)
 		_bt_killitems(scan);
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 123fba624db..01644709cb8 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1028,6 +1028,14 @@ typedef struct BTArrayKeyInfo
 	Datum	   *elem_values;	/* array of num_elems Datums */
 } BTArrayKeyInfo;
 
+/* Information about the current batch (in batched index scans) */
+typedef struct BTBatchInfo
+{
+	/* Current range of items in a batch (if used). */
+	int			firstIndex;
+	int			lastIndex;
+} BTBatchInfo;
+
 typedef struct BTScanOpaqueData
 {
 	/* these fields are set by _bt_preprocess_keys(): */
@@ -1048,6 +1056,9 @@ typedef struct BTScanOpaqueData
 	int		   *killedItems;	/* currPos.items indexes of killed items */
 	int			numKilled;		/* number of currently stored items */
 
+	/* info about current batch */
+	BTBatchInfo batch;
+
 	/*
 	 * If we are doing an index-only scan, these are the tuple storage
 	 * workspaces for the currPos and markPos respectively.  Each is of size
@@ -1162,6 +1173,7 @@ extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys);
 extern Size btestimateparallelscan(int nkeys, int norderbys);
 extern void btinitparallelscan(void *target);
 extern bool btgettuple(IndexScanDesc scan, ScanDirection dir);
+extern bool btgetbatch(IndexScanDesc scan, ScanDirection dir);
 extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
 extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 					 ScanKey orderbys, int norderbys);
@@ -1269,6 +1281,9 @@ extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
 extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum);
 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
+extern bool _bt_first_batch(IndexScanDesc scan, ScanDirection dir);
+extern bool _bt_next_batch(IndexScanDesc scan, ScanDirection dir);
+extern void _bt_kill_batch(IndexScanDesc scan);
 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
 
 /*
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 6378e182238..d86582db454 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -188,6 +188,7 @@ BOOL
 BOOLEAN
 BOX
 BTArrayKeyInfo
+BTBatchInfo
 BTBuildState
 BTCycleId
 BTDedupInterval
-- 
2.47.0

