From e2a94f1f38fb73289a9d7701ab3a2e5bb5370374 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Wed, 1 Jan 2025 22:10:37 +0100
Subject: [PATCH v20250501 3/7] WIP: Don't read the same block repeatedly

---
 src/backend/access/heap/heapam_handler.c | 57 ++++++++++++++++++++++--
 src/backend/access/index/indexam.c       | 13 ++++++
 src/backend/storage/buffer/bufmgr.c      | 40 +++++++++++++++++
 src/include/access/relscan.h             |  2 +
 src/include/storage/bufmgr.h             |  2 +
 5 files changed, 111 insertions(+), 3 deletions(-)

diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index f79d97a8c64..326d5fed681 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -136,6 +136,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
 	{
 		/* Switch to correct buffer if we don't have it already */
 		Buffer		prev_buf = hscan->xs_cbuf;
+		bool		release_prev = true;
 
 		/*
 		 * Read the block for the requested TID. With a read stream, simply
@@ -157,7 +158,56 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
 		 * API.
 		 */
 		if (scan->rs)
-			hscan->xs_cbuf = read_stream_next_buffer(scan->rs, NULL);
+		{
+			/*
+			 * If we're trying to read the same block as the last time, don't
+			 * try reading it from the stream again, but just return the last
+			 * buffer. We need to check if the previous buffer is still pinned
+			 * and contains the correct block (it might have been unpinned,
+			 * used for a different block, so we need to be careful).
+			 *
+			 * The place scheduling the blocks (index_scan_stream_read_next)
+			 * needs to do the same thing and not schedule the blocks if it
+			 * matches the previous one. Otherwise the stream will get out of
+			 * sync, causing confusion.
+			 *
+			 * This is what ReleaseAndReadBuffer does too, but it does not
+			 * have a queue of requests scheduled from somewhere else, so it
+			 * does not need to worry about that.
+			 *
+			 * XXX Maybe we should remember the block in IndexFetchTableData,
+			 * so that we can make the check even cheaper, without looking at
+			 * the buffer descriptor? But that assumes the buffer was not
+			 * unpinned (or repinned) elsewhere, before we got back here. But
+			 * can that even happen? If yes, I guess we shouldn't be releasing
+			 * the prev buffer anyway.
+			 *
+			 * XXX This has undesired impact on prefetch distance. The read
+			 * stream schedules reads for a certain number of future blocks,
+			 * but if we skip duplicate blocks, the prefetch distance may get
+			 * unexpectedly large (e.g. for correlated indexes, with long runs
+			 * of TIDs from the same heap page). This may spend a lot of CPU
+			 * time in the index_scan_stream_read_next callback, but more
+			 * importantly it may require reading (and keeping) a lot of leaf
+			 * pages from the index.
+			 *
+			 * XXX What if we pinned the buffer twice (increase the refcount),
+			 * so that if the caller unpins the buffer, we still keep the
+			 * second pin. Wouldn't that mean we don't need to worry about the
+			 * possibility someone loaded another page into the buffer?
+			 *
+			 * XXX We might also keep a longer history of recent blocks, not
+			 * just the immediately preceding one. But that makes it harder,
+			 * because the two places (read_next callback and here) need to
+			 * have a slightly different view.
+			 */
+			if (BufferMatches(hscan->xs_cbuf,
+							  hscan->xs_base.rel,
+							  ItemPointerGetBlockNumber(tid)))
+				release_prev = false;
+			else
+				hscan->xs_cbuf = read_stream_next_buffer(scan->rs, NULL);
+		}
 		else
 			hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf,
 												  hscan->xs_base.rel,
@@ -181,7 +231,8 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
 			heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf);
 
 		/*
-		 * When using the read stream, release the old buffer.
+		 * When using the read stream, release the old buffer - but only if
+		 * we're reading a different block.
 		 *
 		 * XXX Not sure this is really needed, or maybe this is not the right
 		 * place to do this, and buffers should be released elsewhere. The
@@ -199,7 +250,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
 		 * XXX Does this do the right thing when reading the same page? That
 		 * should return the same buffer, so won't we release it prematurely?
 		 */
-		if (scan->rs && (prev_buf != InvalidBuffer))
+		if (scan->rs && (prev_buf != InvalidBuffer) && release_prev)
 		{
 			ReleaseBuffer(prev_buf);
 		}
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index c5ba89499f1..f0fda6d761c 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -1915,6 +1915,15 @@ index_scan_stream_read_next(ReadStream *stream,
 				continue;
 			}
 
+			/* same block as before, don't need to read it */
+			if (scan->xs_batches->lastBlock == ItemPointerGetBlockNumber(tid))
+			{
+				DEBUG_LOG("index_scan_stream_read_next: skip block (lastBlock)");
+				continue;
+			}
+
+			scan->xs_batches->lastBlock = ItemPointerGetBlockNumber(tid);
+
 			return ItemPointerGetBlockNumber(tid);
 		}
 
@@ -2118,6 +2127,7 @@ index_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 		 */
 		scan->xs_batches->direction = direction;
 		scan->xs_batches->finished = false;
+		scan->xs_batches->lastBlock = InvalidBlockNumber;
 
 		index_batch_pos_reset(scan, &scan->xs_batches->streamPos);
 		read_stream_reset(scan->xs_heapfetch->rs);
@@ -2232,6 +2242,7 @@ index_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 					  scan->xs_batches->readPos.batch, scan->xs_batches->readPos.index);
 
 			scan->xs_batches->reset = false;
+			scan->xs_batches->lastBlock = InvalidBlockNumber;
 
 			/*
 			 * Need to reset the stream position, it might be too far behind.
@@ -2311,6 +2322,7 @@ index_batch_init(IndexScanDesc scan)
 	index_batch_pos_reset(scan, &scan->xs_batches->markPos);
 
 	// scan->xs_batches->currentBatch = NULL;
+	scan->xs_batches->lastBlock = InvalidBlockNumber;
 }
 
 /*
@@ -2393,6 +2405,7 @@ index_batch_reset(IndexScanDesc scan, bool complete)
 	batches->finished = false;
 	batches->reset = false;
 	// batches->currentBatch = NULL;
+	batches->lastBlock = InvalidBlockNumber;
 
 	AssertCheckBatches(scan);
 }
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 0b317d2d809..35c3526e250 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3045,6 +3045,46 @@ ReleaseAndReadBuffer(Buffer buffer,
 	return ReadBuffer(relation, blockNum);
 }
 
+/*
+ * BufferMatches
+ *		Check if the buffer (still) contains the expected page.
+ *
+ * Check if the buffer contains the expected page. The buffer may be invalid,
+ * or valid and pinned.
+ */
+bool
+BufferMatches(Buffer buffer,
+			  Relation relation,
+			  BlockNumber blockNum)
+{
+	ForkNumber	forkNum = MAIN_FORKNUM;
+	BufferDesc *bufHdr;
+
+	if (BufferIsValid(buffer))
+	{
+		Assert(BufferIsPinned(buffer));
+		if (BufferIsLocal(buffer))
+		{
+			bufHdr = GetLocalBufferDescriptor(-buffer - 1);
+			if (bufHdr->tag.blockNum == blockNum &&
+				BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
+				BufTagGetForkNum(&bufHdr->tag) == forkNum)
+				return true;
+		}
+		else
+		{
+			bufHdr = GetBufferDescriptor(buffer - 1);
+			/* we have pin, so it's ok to examine tag without spinlock */
+			if (bufHdr->tag.blockNum == blockNum &&
+				BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
+				BufTagGetForkNum(&bufHdr->tag) == forkNum)
+				return true;
+		}
+	}
+
+	return false;
+}
+
 /*
  * PinBuffer -- make buffer unavailable for replacement.
  *
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index b63af845ca6..2bbd0db0223 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -242,6 +242,8 @@ typedef struct IndexScanBatches
 	bool		finished;
 	bool		reset;
 
+	BlockNumber lastBlock;
+
 	/*
 	 * Current scan direction, for the currently loaded batches. This is used
 	 * to load data in the read stream API callback, etc.
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 41fdc1e7693..3b7d4e6a6a2 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -237,6 +237,8 @@ extern void IncrBufferRefCount(Buffer buffer);
 extern void CheckBufferIsPinnedOnce(Buffer buffer);
 extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
 								   BlockNumber blockNum);
+extern bool BufferMatches(Buffer buffer, Relation relation,
+						  BlockNumber blockNum);
 
 extern Buffer ExtendBufferedRel(BufferManagerRelation bmr,
 								ForkNumber forkNum,
-- 
2.49.0

