public inbox for [email protected]  
help / color / mirror / Atom feed
Streamify more code paths
35+ messages / 4 participants
[nested] [flat]

* Streamify more code paths
@ 2025-12-25 05:51 Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Xuneng Zhou @ 2025-12-25 05:51 UTC (permalink / raw)
  To: pgsql-hackers <[email protected]>

Hi Hackers,

I noticed several additional paths in contrib modules, beyond [1],
that are potentially suitable for streamification:

1) pgstattuple — pgstatapprox.c and parts of pgstattuple_approx_internal
2) Bloom — scan paths in blgetbitmap() and maintenance paths in blbulkdelete()

The following patches streamify those code paths. No benchmarks have
been run yet.

[1] https://www.postgresql.org/message-id/flat/CABPTF7UeN2o-trr9r7K76rZExnO2M4SLfvTfbUY2CwQjCekgnQ%40mai...

Feedback is welcome.

--
Best,
Xuneng


Attachments:

  [application/x-patch] v1-0003-Streamify-heap-bloat-estimation-scan.patch (6.0K, 2-v1-0003-Streamify-heap-bloat-estimation-scan.patch)
  download | inline diff:
From 8444107113a3b9a237520b41d322f7202e8c1502 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Thu, 25 Dec 2025 13:40:13 +0800
Subject: [PATCH v1 3/3] Streamify heap bloat estimation scan.

Introduce a read-stream callback to skip all-visible pages via VM/FSM lookup and stream-read the rest, reducing page reads and improving pgstattuple_approx execution time on large relations.
---
 contrib/pgstattuple/pgstatapprox.c | 125 ++++++++++++++++++++++-------
 1 file changed, 94 insertions(+), 31 deletions(-)

diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c
index a59ff4e9d4f..eb5c26ffc10 100644
--- a/contrib/pgstattuple/pgstatapprox.c
+++ b/contrib/pgstattuple/pgstatapprox.c
@@ -23,6 +23,7 @@
 #include "storage/bufmgr.h"
 #include "storage/freespace.h"
 #include "storage/procarray.h"
+#include "storage/read_stream.h"
 
 PG_FUNCTION_INFO_V1(pgstattuple_approx);
 PG_FUNCTION_INFO_V1(pgstattuple_approx_v1_5);
@@ -45,6 +46,61 @@ typedef struct output_type
 
 #define NUM_OUTPUT_COLUMNS 10
 
+/*
+ * Struct for statapprox_heap read stream callback.
+ */
+typedef struct StatApproxReadStreamPrivate
+{
+	Relation	rel;
+	output_type *stat;
+	BlockNumber current_blocknum;
+	BlockNumber nblocks;
+	BlockNumber scanned;		/* count of pages actually read */
+	Buffer		vmbuffer;		/* for VM lookups */
+}			StatApproxReadStreamPrivate;
+
+/*
+ * Read stream callback for statapprox_heap.
+ *
+ * This callback checks the visibility map for each block. If the block is
+ * all-visible, we can get the free space from the FSM without reading the
+ * actual page, and skip to the next block. Only blocks that are not
+ * all-visible are returned for actual reading.
+ */
+static BlockNumber
+statapprox_heap_read_stream_next(ReadStream *stream,
+								 void *callback_private_data,
+								 void *per_buffer_data)
+{
+	StatApproxReadStreamPrivate *p = callback_private_data;
+
+	while (p->current_blocknum < p->nblocks)
+	{
+		BlockNumber blkno = p->current_blocknum++;
+		Size		freespace;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * If the page has only visible tuples, then we can find out the free
+		 * space from the FSM and move on without reading the page.
+		 */
+		if (VM_ALL_VISIBLE(p->rel, blkno, &p->vmbuffer))
+		{
+			freespace = GetRecordedFreeSpace(p->rel, blkno);
+			p->stat->tuple_len += BLCKSZ - freespace;
+			p->stat->free_space += freespace;
+			continue;
+		}
+
+		/* This block needs to be read */
+		p->scanned++;
+		return blkno;
+	}
+
+	return InvalidBlockNumber;
+}
+
 /*
  * This function takes an already open relation and scans its pages,
  * skipping those that have the corresponding visibility map bit set.
@@ -58,53 +114,58 @@ typedef struct output_type
 static void
 statapprox_heap(Relation rel, output_type *stat)
 {
-	BlockNumber scanned,
-				nblocks,
-				blkno;
-	Buffer		vmbuffer = InvalidBuffer;
+	BlockNumber nblocks;
 	BufferAccessStrategy bstrategy;
 	TransactionId OldestXmin;
+	StatApproxReadStreamPrivate p;
+	ReadStream *stream;
 
 	OldestXmin = GetOldestNonRemovableTransactionId(rel);
 	bstrategy = GetAccessStrategy(BAS_BULKREAD);
 
 	nblocks = RelationGetNumberOfBlocks(rel);
-	scanned = 0;
 
-	for (blkno = 0; blkno < nblocks; blkno++)
+	/* Initialize read stream private data */
+	p.rel = rel;
+	p.stat = stat;
+	p.current_blocknum = 0;
+	p.nblocks = nblocks;
+	p.scanned = 0;
+	p.vmbuffer = InvalidBuffer;
+
+	/*
+	 * Create the read stream. We don't use READ_STREAM_USE_BATCHING because
+	 * the callback accesses the visibility map which may need to read VM
+	 * pages. While this shouldn't cause deadlocks, we err on the side of
+	 * caution.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL,
+										bstrategy,
+										rel,
+										MAIN_FORKNUM,
+										statapprox_heap_read_stream_next,
+										&p,
+										0);
+
+	for (;;)
 	{
 		Buffer		buf;
 		Page		page;
 		OffsetNumber offnum,
 					maxoff;
-		Size		freespace;
-
-		CHECK_FOR_INTERRUPTS();
-
-		/*
-		 * If the page has only visible tuples, then we can find out the free
-		 * space from the FSM and move on.
-		 */
-		if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
-		{
-			freespace = GetRecordedFreeSpace(rel, blkno);
-			stat->tuple_len += BLCKSZ - freespace;
-			stat->free_space += freespace;
-			continue;
-		}
+		BlockNumber blkno;
 
-		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
-								 RBM_NORMAL, bstrategy);
+		buf = read_stream_next_buffer(stream, NULL);
+		if (buf == InvalidBuffer)
+			break;
 
 		LockBuffer(buf, BUFFER_LOCK_SHARE);
 
 		page = BufferGetPage(buf);
+		blkno = BufferGetBlockNumber(buf);
 
 		stat->free_space += PageGetExactFreeSpace(page);
 
-		/* We may count the page as scanned even if it's new/empty */
-		scanned++;
-
 		if (PageIsNew(page) || PageIsEmpty(page))
 		{
 			UnlockReleaseBuffer(buf);
@@ -169,6 +230,8 @@ statapprox_heap(Relation rel, output_type *stat)
 		UnlockReleaseBuffer(buf);
 	}
 
+	read_stream_end(stream);
+
 	stat->table_len = (uint64) nblocks * BLCKSZ;
 
 	/*
@@ -179,7 +242,7 @@ statapprox_heap(Relation rel, output_type *stat)
 	 * tuples in all-visible pages, so no correction is needed for that, and
 	 * we already accounted for the space in those pages, too.
 	 */
-	stat->tuple_count = vac_estimate_reltuples(rel, nblocks, scanned,
+	stat->tuple_count = vac_estimate_reltuples(rel, nblocks, p.scanned,
 											   stat->tuple_count);
 
 	/* It's not clear if we could get -1 here, but be safe. */
@@ -190,16 +253,16 @@ statapprox_heap(Relation rel, output_type *stat)
 	 */
 	if (nblocks != 0)
 	{
-		stat->scanned_percent = 100.0 * scanned / nblocks;
+		stat->scanned_percent = 100.0 * p.scanned / nblocks;
 		stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len;
 		stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len;
 		stat->free_percent = 100.0 * stat->free_space / stat->table_len;
 	}
 
-	if (BufferIsValid(vmbuffer))
+	if (BufferIsValid(p.vmbuffer))
 	{
-		ReleaseBuffer(vmbuffer);
-		vmbuffer = InvalidBuffer;
+		ReleaseBuffer(p.vmbuffer);
+		p.vmbuffer = InvalidBuffer;
 	}
 }
 
-- 
2.51.0



  [application/x-patch] v1-0002-Streamify-Bloom-VACUUM-paths.patch (4.2K, 3-v1-0002-Streamify-Bloom-VACUUM-paths.patch)
  download | inline diff:
From d3b15792ee09d7aba4df76273d8883a739d215ce Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Thu, 25 Dec 2025 13:40:02 +0800
Subject: [PATCH v1 2/3] Streamify Bloom VACUUM paths.

Use streaming reads in blbulkdelete() and blvacuumcleanup() to iterate index pages without repeated ReadBuffer calls, improving VACUUM performance and reducing buffer manager overhead during maintenance operations.
---
 contrib/bloom/blvacuum.c | 55 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/contrib/bloom/blvacuum.c b/contrib/bloom/blvacuum.c
index e68a9008f56..7452302f022 100644
--- a/contrib/bloom/blvacuum.c
+++ b/contrib/bloom/blvacuum.c
@@ -17,6 +17,7 @@
 #include "commands/vacuum.h"
 #include "storage/bufmgr.h"
 #include "storage/indexfsm.h"
+#include "storage/read_stream.h"
 
 
 /*
@@ -40,6 +41,8 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	Page		page;
 	BloomMetaPageData *metaData;
 	GenericXLogState *gxlogState;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (stats == NULL)
 		stats = palloc0_object(IndexBulkDeleteResult);
@@ -51,6 +54,25 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	 * they can't contain tuples to delete.
 	 */
 	npages = RelationGetNumberOfBlocks(index);
+
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		BloomTuple *itup,
@@ -59,8 +81,7 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
 
 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 		gxlogState = GenericXLogStart(index);
@@ -133,6 +154,9 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/*
 	 * Update the metapage's notFullPage list with whatever we found.  Our
 	 * info could already be out of date at this point, but blinsert() will
@@ -166,6 +190,8 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	Relation	index = info->index;
 	BlockNumber npages,
 				blkno;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (info->analyze_only)
 		return stats;
@@ -181,6 +207,25 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	stats->num_pages = npages;
 	stats->pages_free = 0;
 	stats->num_index_tuples = 0;
+
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
@@ -188,8 +233,7 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -206,6 +250,9 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	IndexFreeSpaceMapVacuum(info->index);
 
 	return stats;
-- 
2.51.0



  [application/x-patch] v1-0001-Switch-Bloom-scan-paths-to-streaming-read.patch (2.3K, 4-v1-0001-Switch-Bloom-scan-paths-to-streaming-read.patch)
  download | inline diff:
From 0a211e788d964aebd876dc4472440e8f234ce38a Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Thu, 25 Dec 2025 13:39:42 +0800
Subject: [PATCH v1 1/3] Switch Bloom scan paths to streaming read.

Replace per-page ReadBuffer loops in blgetbitmap() with read_stream_begin_relation() and sequential buffer iteration, reducing buffer churn and improving scan efficiency on large Bloom indexes.
---
 contrib/bloom/blscan.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/contrib/bloom/blscan.c b/contrib/bloom/blscan.c
index 0d71edbe91c..b1fdabaab74 100644
--- a/contrib/bloom/blscan.c
+++ b/contrib/bloom/blscan.c
@@ -17,6 +17,7 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
+#include "storage/read_stream.h"
 
 /*
  * Begin scan of bloom index.
@@ -75,11 +76,13 @@ int64
 blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 {
 	int64		ntids = 0;
-	BlockNumber blkno = BLOOM_HEAD_BLKNO,
+	BlockNumber blkno,
 				npages;
 	int			i;
 	BufferAccessStrategy bas;
 	BloomScanOpaque so = (BloomScanOpaque) scan->opaque;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (so->sign == NULL)
 	{
@@ -119,14 +122,29 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 	if (scan->instrument)
 		scan->instrument->nsearches++;
 
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										bas,
+										scan->indexRelation,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
 		Page		page;
 
-		buffer = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM,
-									blkno, RBM_NORMAL, bas);
-
+		buffer = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -162,6 +180,9 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 		UnlockReleaseBuffer(buffer);
 		CHECK_FOR_INTERRUPTS();
 	}
+
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
 	FreeAccessStrategy(bas);
 
 	return ntids;
-- 
2.51.0



^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
@ 2025-12-25 06:33 ` Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Xuneng Zhou @ 2025-12-25 06:33 UTC (permalink / raw)
  To: pgsql-hackers <[email protected]>

Hi,

On Thu, Dec 25, 2025 at 1:51 PM Xuneng Zhou <[email protected]> wrote:
>
> Hi Hackers,
>
> I noticed several additional paths in contrib modules, beyond [1],
> that are potentially suitable for streamification:
>
> 1) pgstattuple — pgstatapprox.c and parts of pgstattuple_approx_internal
> 2) Bloom — scan paths in blgetbitmap() and maintenance paths in blbulkdelete()
>
> The following patches streamify those code paths. No benchmarks have
> been run yet.
>
> [1] https://www.postgresql.org/message-id/flat/CABPTF7UeN2o-trr9r7K76rZExnO2M4SLfvTfbUY2CwQjCekgnQ%40mai...
>
> Feedback is welcome.
>

One more in ginvacuumcleanup().

-- 
Best,
Xuneng


Attachments:

  [application/octet-stream] v1-0004-Replace-synchronous-ReadBufferExtended-loop-with-.patch (2.5K, 2-v1-0004-Replace-synchronous-ReadBufferExtended-loop-with-.patch)
  download | inline diff:
From bd3f1b32528bff3897aab09c72c1143c93997b7d Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Thu, 25 Dec 2025 14:12:08 +0800
Subject: [PATCH v1 4/4] Replace synchronous ReadBufferExtended() loop with the
 streaming read in ginvacuumcleanup() to improve I/O efficiency during GIN
 index vacuum cleanup operations

---
 src/backend/access/gin/ginvacuum.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index d7baf7c847c..58e05c71256 100644
--- a/src/backend/access/gin/ginvacuum.c
+++ b/src/backend/access/gin/ginvacuum.c
@@ -22,6 +22,7 @@
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "storage/read_stream.h"
 #include "utils/memutils.h"
 
 struct GinVacuumState
@@ -693,6 +694,8 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	BlockNumber totFreePages;
 	GinState	ginstate;
 	GinStatsData idxStat;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	/*
 	 * In an autovacuum analyze, we want to clean up pending insertions.
@@ -743,6 +746,24 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 	totFreePages = 0;
 
+	/* Scan all blocks starting from the root using streaming reads */
+	p.current_blocknum = GIN_ROOT_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = GIN_ROOT_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
@@ -750,8 +771,8 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
+
 		LockBuffer(buffer, GIN_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -776,6 +797,9 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/* Update the metapage with accurate page and entry counts */
 	idxStat.nTotalPages = npages;
 	ginUpdateStats(info->index, &idxStat, false);
-- 
2.51.0



  [application/octet-stream] v1-0001-Switch-Bloom-scan-paths-to-the-streaming-read.patch (2.4K, 3-v1-0001-Switch-Bloom-scan-paths-to-the-streaming-read.patch)
  download | inline diff:
From 0a211e788d964aebd876dc4472440e8f234ce38a Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Thu, 25 Dec 2025 13:39:42 +0800
Subject: [PATCH v1 1/4] Switch Bloom scan paths to the streaming read.

Replace per-page ReadBuffer loops in blgetbitmap() with read_stream_begin_relation() and sequential buffer iteration, reducing buffer churn and improving scan efficiency on large Bloom indexes.
---
 contrib/bloom/blscan.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/contrib/bloom/blscan.c b/contrib/bloom/blscan.c
index 0d71edbe91c..b1fdabaab74 100644
--- a/contrib/bloom/blscan.c
+++ b/contrib/bloom/blscan.c
@@ -17,6 +17,7 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
+#include "storage/read_stream.h"
 
 /*
  * Begin scan of bloom index.
@@ -75,11 +76,13 @@ int64
 blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 {
 	int64		ntids = 0;
-	BlockNumber blkno = BLOOM_HEAD_BLKNO,
+	BlockNumber blkno,
 				npages;
 	int			i;
 	BufferAccessStrategy bas;
 	BloomScanOpaque so = (BloomScanOpaque) scan->opaque;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (so->sign == NULL)
 	{
@@ -119,14 +122,29 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 	if (scan->instrument)
 		scan->instrument->nsearches++;
 
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										bas,
+										scan->indexRelation,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
 		Page		page;
 
-		buffer = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM,
-									blkno, RBM_NORMAL, bas);
-
+		buffer = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -162,6 +180,9 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 		UnlockReleaseBuffer(buffer);
 		CHECK_FOR_INTERRUPTS();
 	}
+
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
 	FreeAccessStrategy(bas);
 
 	return ntids;
-- 
2.51.0



  [application/octet-stream] v1-0002-Streamify-Bloom-VACUUM-paths.patch (4.2K, 4-v1-0002-Streamify-Bloom-VACUUM-paths.patch)
  download | inline diff:
From d3b15792ee09d7aba4df76273d8883a739d215ce Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Thu, 25 Dec 2025 13:40:02 +0800
Subject: [PATCH v1 2/4] Streamify Bloom VACUUM paths.

Use streaming reads in blbulkdelete() and blvacuumcleanup() to iterate index pages without repeated ReadBuffer calls, improving VACUUM performance and reducing buffer manager overhead during maintenance operations.
---
 contrib/bloom/blvacuum.c | 55 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/contrib/bloom/blvacuum.c b/contrib/bloom/blvacuum.c
index e68a9008f56..7452302f022 100644
--- a/contrib/bloom/blvacuum.c
+++ b/contrib/bloom/blvacuum.c
@@ -17,6 +17,7 @@
 #include "commands/vacuum.h"
 #include "storage/bufmgr.h"
 #include "storage/indexfsm.h"
+#include "storage/read_stream.h"
 
 
 /*
@@ -40,6 +41,8 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	Page		page;
 	BloomMetaPageData *metaData;
 	GenericXLogState *gxlogState;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (stats == NULL)
 		stats = palloc0_object(IndexBulkDeleteResult);
@@ -51,6 +54,25 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	 * they can't contain tuples to delete.
 	 */
 	npages = RelationGetNumberOfBlocks(index);
+
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		BloomTuple *itup,
@@ -59,8 +81,7 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
 
 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 		gxlogState = GenericXLogStart(index);
@@ -133,6 +154,9 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/*
 	 * Update the metapage's notFullPage list with whatever we found.  Our
 	 * info could already be out of date at this point, but blinsert() will
@@ -166,6 +190,8 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	Relation	index = info->index;
 	BlockNumber npages,
 				blkno;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (info->analyze_only)
 		return stats;
@@ -181,6 +207,25 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	stats->num_pages = npages;
 	stats->pages_free = 0;
 	stats->num_index_tuples = 0;
+
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
@@ -188,8 +233,7 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -206,6 +250,9 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	IndexFreeSpaceMapVacuum(info->index);
 
 	return stats;
-- 
2.51.0



  [application/octet-stream] v1-0003-Streamify-heap-bloat-estimation-scan.patch (6.0K, 5-v1-0003-Streamify-heap-bloat-estimation-scan.patch)
  download | inline diff:
From 8444107113a3b9a237520b41d322f7202e8c1502 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Thu, 25 Dec 2025 13:40:13 +0800
Subject: [PATCH v1 3/4] Streamify heap bloat estimation scan.

Introduce a read-stream callback to skip all-visible pages via VM/FSM lookup and stream-read the rest, reducing page reads and improving pgstattuple_approx execution time on large relations.
---
 contrib/pgstattuple/pgstatapprox.c | 125 ++++++++++++++++++++++-------
 1 file changed, 94 insertions(+), 31 deletions(-)

diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c
index a59ff4e9d4f..eb5c26ffc10 100644
--- a/contrib/pgstattuple/pgstatapprox.c
+++ b/contrib/pgstattuple/pgstatapprox.c
@@ -23,6 +23,7 @@
 #include "storage/bufmgr.h"
 #include "storage/freespace.h"
 #include "storage/procarray.h"
+#include "storage/read_stream.h"
 
 PG_FUNCTION_INFO_V1(pgstattuple_approx);
 PG_FUNCTION_INFO_V1(pgstattuple_approx_v1_5);
@@ -45,6 +46,61 @@ typedef struct output_type
 
 #define NUM_OUTPUT_COLUMNS 10
 
+/*
+ * Struct for statapprox_heap read stream callback.
+ */
+typedef struct StatApproxReadStreamPrivate
+{
+	Relation	rel;
+	output_type *stat;
+	BlockNumber current_blocknum;
+	BlockNumber nblocks;
+	BlockNumber scanned;		/* count of pages actually read */
+	Buffer		vmbuffer;		/* for VM lookups */
+}			StatApproxReadStreamPrivate;
+
+/*
+ * Read stream callback for statapprox_heap.
+ *
+ * This callback checks the visibility map for each block. If the block is
+ * all-visible, we can get the free space from the FSM without reading the
+ * actual page, and skip to the next block. Only blocks that are not
+ * all-visible are returned for actual reading.
+ */
+static BlockNumber
+statapprox_heap_read_stream_next(ReadStream *stream,
+								 void *callback_private_data,
+								 void *per_buffer_data)
+{
+	StatApproxReadStreamPrivate *p = callback_private_data;
+
+	while (p->current_blocknum < p->nblocks)
+	{
+		BlockNumber blkno = p->current_blocknum++;
+		Size		freespace;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * If the page has only visible tuples, then we can find out the free
+		 * space from the FSM and move on without reading the page.
+		 */
+		if (VM_ALL_VISIBLE(p->rel, blkno, &p->vmbuffer))
+		{
+			freespace = GetRecordedFreeSpace(p->rel, blkno);
+			p->stat->tuple_len += BLCKSZ - freespace;
+			p->stat->free_space += freespace;
+			continue;
+		}
+
+		/* This block needs to be read */
+		p->scanned++;
+		return blkno;
+	}
+
+	return InvalidBlockNumber;
+}
+
 /*
  * This function takes an already open relation and scans its pages,
  * skipping those that have the corresponding visibility map bit set.
@@ -58,53 +114,58 @@ typedef struct output_type
 static void
 statapprox_heap(Relation rel, output_type *stat)
 {
-	BlockNumber scanned,
-				nblocks,
-				blkno;
-	Buffer		vmbuffer = InvalidBuffer;
+	BlockNumber nblocks;
 	BufferAccessStrategy bstrategy;
 	TransactionId OldestXmin;
+	StatApproxReadStreamPrivate p;
+	ReadStream *stream;
 
 	OldestXmin = GetOldestNonRemovableTransactionId(rel);
 	bstrategy = GetAccessStrategy(BAS_BULKREAD);
 
 	nblocks = RelationGetNumberOfBlocks(rel);
-	scanned = 0;
 
-	for (blkno = 0; blkno < nblocks; blkno++)
+	/* Initialize read stream private data */
+	p.rel = rel;
+	p.stat = stat;
+	p.current_blocknum = 0;
+	p.nblocks = nblocks;
+	p.scanned = 0;
+	p.vmbuffer = InvalidBuffer;
+
+	/*
+	 * Create the read stream. We don't use READ_STREAM_USE_BATCHING because
+	 * the callback accesses the visibility map which may need to read VM
+	 * pages. While this shouldn't cause deadlocks, we err on the side of
+	 * caution.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL,
+										bstrategy,
+										rel,
+										MAIN_FORKNUM,
+										statapprox_heap_read_stream_next,
+										&p,
+										0);
+
+	for (;;)
 	{
 		Buffer		buf;
 		Page		page;
 		OffsetNumber offnum,
 					maxoff;
-		Size		freespace;
-
-		CHECK_FOR_INTERRUPTS();
-
-		/*
-		 * If the page has only visible tuples, then we can find out the free
-		 * space from the FSM and move on.
-		 */
-		if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
-		{
-			freespace = GetRecordedFreeSpace(rel, blkno);
-			stat->tuple_len += BLCKSZ - freespace;
-			stat->free_space += freespace;
-			continue;
-		}
+		BlockNumber blkno;
 
-		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
-								 RBM_NORMAL, bstrategy);
+		buf = read_stream_next_buffer(stream, NULL);
+		if (buf == InvalidBuffer)
+			break;
 
 		LockBuffer(buf, BUFFER_LOCK_SHARE);
 
 		page = BufferGetPage(buf);
+		blkno = BufferGetBlockNumber(buf);
 
 		stat->free_space += PageGetExactFreeSpace(page);
 
-		/* We may count the page as scanned even if it's new/empty */
-		scanned++;
-
 		if (PageIsNew(page) || PageIsEmpty(page))
 		{
 			UnlockReleaseBuffer(buf);
@@ -169,6 +230,8 @@ statapprox_heap(Relation rel, output_type *stat)
 		UnlockReleaseBuffer(buf);
 	}
 
+	read_stream_end(stream);
+
 	stat->table_len = (uint64) nblocks * BLCKSZ;
 
 	/*
@@ -179,7 +242,7 @@ statapprox_heap(Relation rel, output_type *stat)
 	 * tuples in all-visible pages, so no correction is needed for that, and
 	 * we already accounted for the space in those pages, too.
 	 */
-	stat->tuple_count = vac_estimate_reltuples(rel, nblocks, scanned,
+	stat->tuple_count = vac_estimate_reltuples(rel, nblocks, p.scanned,
 											   stat->tuple_count);
 
 	/* It's not clear if we could get -1 here, but be safe. */
@@ -190,16 +253,16 @@ statapprox_heap(Relation rel, output_type *stat)
 	 */
 	if (nblocks != 0)
 	{
-		stat->scanned_percent = 100.0 * scanned / nblocks;
+		stat->scanned_percent = 100.0 * p.scanned / nblocks;
 		stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len;
 		stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len;
 		stat->free_percent = 100.0 * stat->free_space / stat->table_len;
 	}
 
-	if (BufferIsValid(vmbuffer))
+	if (BufferIsValid(p.vmbuffer))
 	{
-		ReleaseBuffer(vmbuffer);
-		vmbuffer = InvalidBuffer;
+		ReleaseBuffer(p.vmbuffer);
+		p.vmbuffer = InvalidBuffer;
 	}
 }
 
-- 
2.51.0



^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
@ 2025-12-26 10:59   ` Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Nazir Bilal Yavuz @ 2025-12-26 10:59 UTC (permalink / raw)
  To: Xuneng Zhou <[email protected]>; +Cc: pgsql-hackers <[email protected]>

Hi,

Thank you for working on this!

On Thu, 25 Dec 2025 at 09:34, Xuneng Zhou <[email protected]> wrote:
>
> Hi,
>
> On Thu, Dec 25, 2025 at 1:51 PM Xuneng Zhou <[email protected]> wrote:
> >
> > Hi Hackers,
> >
> > I noticed several additional paths in contrib modules, beyond [1],
> > that are potentially suitable for streamification:
> >
> > 1) pgstattuple — pgstatapprox.c and parts of pgstattuple_approx_internal
> > 2) Bloom — scan paths in blgetbitmap() and maintenance paths in blbulkdelete()
> >
> > The following patches streamify those code paths. No benchmarks have
> > been run yet.
> >
> > [1] https://www.postgresql.org/message-id/flat/CABPTF7UeN2o-trr9r7K76rZExnO2M4SLfvTfbUY2CwQjCekgnQ%40mai...
> >
> > Feedbacks welcome.
> >
>
> One more in ginvacuumcleanup().

0001, 0002 and 0004 LGTM.

0003:

+        buf = read_stream_next_buffer(stream, NULL);
+        if (buf == InvalidBuffer)
+            break;

I think we are loosening the check here. Previously, we were sure that
there were no InvalidBuffers before reaching nblocks. The streamified
version does not have this check; it exits the loop the first time it
sees an InvalidBuffer, which may be wrong. You might want to add
'Assert(p.current_blocknum == nblocks);' before read_stream_end() to
have a similar check.

--
Regards,
Nazir Bilal Yavuz
Microsoft





^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
@ 2025-12-26 16:41     ` Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Xuneng Zhou @ 2025-12-26 16:41 UTC (permalink / raw)
  To: Nazir Bilal Yavuz <[email protected]>; +Cc: pgsql-hackers <[email protected]>

Hi Bilal,

Thanks for your review!

On Fri, Dec 26, 2025 at 6:59 PM Nazir Bilal Yavuz <[email protected]> wrote:
>
> Hi,
>
> Thank you for working on this!
>
> On Thu, 25 Dec 2025 at 09:34, Xuneng Zhou <[email protected]> wrote:
> >
> > Hi,
> >
> > On Thu, Dec 25, 2025 at 1:51 PM Xuneng Zhou <[email protected]> wrote:
> > >
> > > Hi Hackers,
> > >
> > > I noticed several additional paths in contrib modules, beyond [1],
> > > that are potentially suitable for streamification:
> > >
> > > 1) pgstattuple — pgstatapprox.c and parts of pgstattuple_approx_internal
> > > 2) Bloom — scan paths in blgetbitmap() and maintenance paths in blbulkdelete()
> > >
> > > The following patches streamify those code paths. No benchmarks have
> > > been run yet.
> > >
> > > [1] https://www.postgresql.org/message-id/flat/CABPTF7UeN2o-trr9r7K76rZExnO2M4SLfvTfbUY2CwQjCekgnQ%40mai...
> > >
> > > Feedbacks welcome.
> > >
> >
> > One more in ginvacuumcleanup().
>
> 0001, 0002 and 0004 LGTM.
>
> 0003:
>
> +        buf = read_stream_next_buffer(stream, NULL);
> +        if (buf == InvalidBuffer)
> +            break;
>
> I think we are loosening the check here. We were sure that there were
> no InvalidBuffers until the nblocks. Streamified version does not have
> this check, it exits from the loop the first time it sees an
> InvalidBuffer, which may be wrong. You might want to add
> 'Assert(p.current_blocknum == nblocks);' before read_stream_end() to
> have a similar check.
>

Agreed. The check has been added in v2 per your suggestion.

--
Best,
Xuneng


Attachments:

  [application/octet-stream] v2-0001-Switch-Bloom-scan-paths-to-streaming-read.patch (2.4K, 2-v2-0001-Switch-Bloom-scan-paths-to-streaming-read.patch)
  download | inline diff:
From 314f9cdf6d8a62cc8523b377b8cf19df646b9913 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:28:10 +0800
Subject: [PATCH v2 1/4] Switch Bloom scan paths to streaming read. Replace
 per-page ReadBuffer loops in blgetbitmap() with read_stream_begin_relation()
 and sequential buffer iteration, reducing buffer churn and improving scan
 efficiency on large Bloom indexes.

---
 contrib/bloom/blscan.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/contrib/bloom/blscan.c b/contrib/bloom/blscan.c
index 0d71edbe91c..b1fdabaab74 100644
--- a/contrib/bloom/blscan.c
+++ b/contrib/bloom/blscan.c
@@ -17,6 +17,7 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
+#include "storage/read_stream.h"
 
 /*
  * Begin scan of bloom index.
@@ -75,11 +76,13 @@ int64
 blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 {
 	int64		ntids = 0;
-	BlockNumber blkno = BLOOM_HEAD_BLKNO,
+	BlockNumber blkno,
 				npages;
 	int			i;
 	BufferAccessStrategy bas;
 	BloomScanOpaque so = (BloomScanOpaque) scan->opaque;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (so->sign == NULL)
 	{
@@ -119,14 +122,29 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 	if (scan->instrument)
 		scan->instrument->nsearches++;
 
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										bas,
+										scan->indexRelation,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
 		Page		page;
 
-		buffer = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM,
-									blkno, RBM_NORMAL, bas);
-
+		buffer = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -162,6 +180,9 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 		UnlockReleaseBuffer(buffer);
 		CHECK_FOR_INTERRUPTS();
 	}
+
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
 	FreeAccessStrategy(bas);
 
 	return ntids;
-- 
2.51.0



  [application/octet-stream] v2-0003-Streamify-heap-bloat-estimation-scan-Introduc.patch (6.0K, 3-v2-0003-Streamify-heap-bloat-estimation-scan-Introduc.patch)
  download | inline diff:
From 7a3fcec0cf44cd6d9848c930e0d7f620cd193698 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:29:02 +0800
Subject: [PATCH v2 3/4] Streamify heap bloat estimation scan. Introduce a
 read-stream callback to skip all-visible pages via VM/FSM lookup and
 stream-read the rest, reducing page reads and improving pgstattuple_approx
 execution time on large relations.

---
 contrib/pgstattuple/pgstatapprox.c | 126 ++++++++++++++++++++++-------
 1 file changed, 95 insertions(+), 31 deletions(-)

diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c
index a59ff4e9d4f..cb05b530ca8 100644
--- a/contrib/pgstattuple/pgstatapprox.c
+++ b/contrib/pgstattuple/pgstatapprox.c
@@ -23,6 +23,7 @@
 #include "storage/bufmgr.h"
 #include "storage/freespace.h"
 #include "storage/procarray.h"
+#include "storage/read_stream.h"
 
 PG_FUNCTION_INFO_V1(pgstattuple_approx);
 PG_FUNCTION_INFO_V1(pgstattuple_approx_v1_5);
@@ -45,6 +46,61 @@ typedef struct output_type
 
 #define NUM_OUTPUT_COLUMNS 10
 
+/*
+ * Struct for statapprox_heap read stream callback.
+ */
+typedef struct StatApproxReadStreamPrivate
+{
+	Relation	rel;
+	output_type *stat;
+	BlockNumber current_blocknum;
+	BlockNumber nblocks;
+	BlockNumber scanned;		/* count of pages actually read */
+	Buffer		vmbuffer;		/* for VM lookups */
+}			StatApproxReadStreamPrivate;
+
+/*
+ * Read stream callback for statapprox_heap.
+ *
+ * This callback checks the visibility map for each block. If the block is
+ * all-visible, we can get the free space from the FSM without reading the
+ * actual page, and skip to the next block. Only blocks that are not
+ * all-visible are returned for actual reading.
+ */
+static BlockNumber
+statapprox_heap_read_stream_next(ReadStream *stream,
+								 void *callback_private_data,
+								 void *per_buffer_data)
+{
+	StatApproxReadStreamPrivate *p = callback_private_data;
+
+	while (p->current_blocknum < p->nblocks)
+	{
+		BlockNumber blkno = p->current_blocknum++;
+		Size		freespace;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * If the page has only visible tuples, then we can find out the free
+		 * space from the FSM and move on without reading the page.
+		 */
+		if (VM_ALL_VISIBLE(p->rel, blkno, &p->vmbuffer))
+		{
+			freespace = GetRecordedFreeSpace(p->rel, blkno);
+			p->stat->tuple_len += BLCKSZ - freespace;
+			p->stat->free_space += freespace;
+			continue;
+		}
+
+		/* This block needs to be read */
+		p->scanned++;
+		return blkno;
+	}
+
+	return InvalidBlockNumber;
+}
+
 /*
  * This function takes an already open relation and scans its pages,
  * skipping those that have the corresponding visibility map bit set.
@@ -58,53 +114,58 @@ typedef struct output_type
 static void
 statapprox_heap(Relation rel, output_type *stat)
 {
-	BlockNumber scanned,
-				nblocks,
-				blkno;
-	Buffer		vmbuffer = InvalidBuffer;
+	BlockNumber nblocks;
 	BufferAccessStrategy bstrategy;
 	TransactionId OldestXmin;
+	StatApproxReadStreamPrivate p;
+	ReadStream *stream;
 
 	OldestXmin = GetOldestNonRemovableTransactionId(rel);
 	bstrategy = GetAccessStrategy(BAS_BULKREAD);
 
 	nblocks = RelationGetNumberOfBlocks(rel);
-	scanned = 0;
 
-	for (blkno = 0; blkno < nblocks; blkno++)
+	/* Initialize read stream private data */
+	p.rel = rel;
+	p.stat = stat;
+	p.current_blocknum = 0;
+	p.nblocks = nblocks;
+	p.scanned = 0;
+	p.vmbuffer = InvalidBuffer;
+
+	/*
+	 * Create the read stream. We don't use READ_STREAM_USE_BATCHING because
+	 * the callback accesses the visibility map which may need to read VM
+	 * pages. While this shouldn't cause deadlocks, we err on the side of
+	 * caution.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL,
+										bstrategy,
+										rel,
+										MAIN_FORKNUM,
+										statapprox_heap_read_stream_next,
+										&p,
+										0);
+
+	for (;;)
 	{
 		Buffer		buf;
 		Page		page;
 		OffsetNumber offnum,
 					maxoff;
-		Size		freespace;
-
-		CHECK_FOR_INTERRUPTS();
-
-		/*
-		 * If the page has only visible tuples, then we can find out the free
-		 * space from the FSM and move on.
-		 */
-		if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
-		{
-			freespace = GetRecordedFreeSpace(rel, blkno);
-			stat->tuple_len += BLCKSZ - freespace;
-			stat->free_space += freespace;
-			continue;
-		}
+		BlockNumber blkno;
 
-		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
-								 RBM_NORMAL, bstrategy);
+		buf = read_stream_next_buffer(stream, NULL);
+		if (buf == InvalidBuffer)
+			break;
 
 		LockBuffer(buf, BUFFER_LOCK_SHARE);
 
 		page = BufferGetPage(buf);
+		blkno = BufferGetBlockNumber(buf);
 
 		stat->free_space += PageGetExactFreeSpace(page);
 
-		/* We may count the page as scanned even if it's new/empty */
-		scanned++;
-
 		if (PageIsNew(page) || PageIsEmpty(page))
 		{
 			UnlockReleaseBuffer(buf);
@@ -169,6 +230,9 @@ statapprox_heap(Relation rel, output_type *stat)
 		UnlockReleaseBuffer(buf);
 	}
 
+	Assert(p.current_blocknum == nblocks);
+	read_stream_end(stream);
+
 	stat->table_len = (uint64) nblocks * BLCKSZ;
 
 	/*
@@ -179,7 +243,7 @@ statapprox_heap(Relation rel, output_type *stat)
 	 * tuples in all-visible pages, so no correction is needed for that, and
 	 * we already accounted for the space in those pages, too.
 	 */
-	stat->tuple_count = vac_estimate_reltuples(rel, nblocks, scanned,
+	stat->tuple_count = vac_estimate_reltuples(rel, nblocks, p.scanned,
 											   stat->tuple_count);
 
 	/* It's not clear if we could get -1 here, but be safe. */
@@ -190,16 +254,16 @@ statapprox_heap(Relation rel, output_type *stat)
 	 */
 	if (nblocks != 0)
 	{
-		stat->scanned_percent = 100.0 * scanned / nblocks;
+		stat->scanned_percent = 100.0 * p.scanned / nblocks;
 		stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len;
 		stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len;
 		stat->free_percent = 100.0 * stat->free_space / stat->table_len;
 	}
 
-	if (BufferIsValid(vmbuffer))
+	if (BufferIsValid(p.vmbuffer))
 	{
-		ReleaseBuffer(vmbuffer);
-		vmbuffer = InvalidBuffer;
+		ReleaseBuffer(p.vmbuffer);
+		p.vmbuffer = InvalidBuffer;
 	}
 }
 
-- 
2.51.0



  [application/octet-stream] v2-0002-Streamify-Bloom-VACUUM-paths-Use-streaming-re.patch (4.2K, 4-v2-0002-Streamify-Bloom-VACUUM-paths-Use-streaming-re.patch)
  download | inline diff:
From aafdfa0851ab61777e8877cfb50e3b34bf54b9b1 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:28:49 +0800
Subject: [PATCH v2 2/4] Streamify Bloom VACUUM paths. Use streaming reads
 in blbulkdelete() and blvacuumcleanup() to iterate index pages without
 repeated ReadBuffer calls, improving VACUUM performance and reducing buffer
 manager overhead during maintenance operations.

---
 contrib/bloom/blvacuum.c | 55 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/contrib/bloom/blvacuum.c b/contrib/bloom/blvacuum.c
index e68a9008f56..7452302f022 100644
--- a/contrib/bloom/blvacuum.c
+++ b/contrib/bloom/blvacuum.c
@@ -17,6 +17,7 @@
 #include "commands/vacuum.h"
 #include "storage/bufmgr.h"
 #include "storage/indexfsm.h"
+#include "storage/read_stream.h"
 
 
 /*
@@ -40,6 +41,8 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	Page		page;
 	BloomMetaPageData *metaData;
 	GenericXLogState *gxlogState;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (stats == NULL)
 		stats = palloc0_object(IndexBulkDeleteResult);
@@ -51,6 +54,25 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	 * they can't contain tuples to delete.
 	 */
 	npages = RelationGetNumberOfBlocks(index);
+
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		BloomTuple *itup,
@@ -59,8 +81,7 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
 
 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 		gxlogState = GenericXLogStart(index);
@@ -133,6 +154,9 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/*
 	 * Update the metapage's notFullPage list with whatever we found.  Our
 	 * info could already be out of date at this point, but blinsert() will
@@ -166,6 +190,8 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	Relation	index = info->index;
 	BlockNumber npages,
 				blkno;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (info->analyze_only)
 		return stats;
@@ -181,6 +207,25 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	stats->num_pages = npages;
 	stats->pages_free = 0;
 	stats->num_index_tuples = 0;
+
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
@@ -188,8 +233,7 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -206,6 +250,9 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	IndexFreeSpaceMapVacuum(info->index);
 
 	return stats;
-- 
2.51.0



  [application/octet-stream] v2-0004-Replace-synchronous-ReadBufferExtended-loop-with.patch (2.5K, 5-v2-0004-Replace-synchronous-ReadBufferExtended-loop-with.patch)
  download | inline diff:
From d628f3a5cab752d57332865c323479795629fb28 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:29:09 +0800
Subject: [PATCH v2 4/4] Replace synchronous ReadBufferExtended() loop with the
 streaming read in ginvacuumcleanup() to improve I/O efficiency during GIN
 index vacuum cleanup operations

---
 src/backend/access/gin/ginvacuum.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index d7baf7c847c..58e05c71256 100644
--- a/src/backend/access/gin/ginvacuum.c
+++ b/src/backend/access/gin/ginvacuum.c
@@ -22,6 +22,7 @@
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "storage/read_stream.h"
 #include "utils/memutils.h"
 
 struct GinVacuumState
@@ -693,6 +694,8 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	BlockNumber totFreePages;
 	GinState	ginstate;
 	GinStatsData idxStat;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	/*
 	 * In an autovacuum analyze, we want to clean up pending insertions.
@@ -743,6 +746,24 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 	totFreePages = 0;
 
+	/* Scan all blocks starting from the root using streaming reads */
+	p.current_blocknum = GIN_ROOT_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = GIN_ROOT_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
@@ -750,8 +771,8 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
+
 		LockBuffer(buffer, GIN_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -776,6 +797,9 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/* Update the metapage with accurate page and entry counts */
 	idxStat.nTotalPages = npages;
 	ginUpdateStats(info->index, &idxStat, false);
-- 
2.51.0



^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
@ 2025-12-28 11:41       ` Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Xuneng Zhou @ 2025-12-28 11:41 UTC (permalink / raw)
  To: Nazir Bilal Yavuz <[email protected]>; +Cc: pgsql-hackers <[email protected]>

Hi,

On Sat, Dec 27, 2025 at 12:41 AM Xuneng Zhou <[email protected]> wrote:
>
> Hi Bilal,
>
> Thanks for your review!
>
> On Fri, Dec 26, 2025 at 6:59 PM Nazir Bilal Yavuz <[email protected]> wrote:
> >
> > Hi,
> >
> > Thank you for working on this!
> >
> > On Thu, 25 Dec 2025 at 09:34, Xuneng Zhou <[email protected]> wrote:
> > >
> > > Hi,
> > >
> > > On Thu, Dec 25, 2025 at 1:51 PM Xuneng Zhou <[email protected]> wrote:
> > > >
> > > > Hi Hackers,
> > > >
> > > > I noticed several additional paths in contrib modules, beyond [1],
> > > > that are potentially suitable for streamification:
> > > >
> > > > 1) pgstattuple — pgstatapprox.c and parts of pgstattuple_approx_internal
> > > > 2) Bloom — scan paths in blgetbitmap() and maintenance paths in blbulkdelete()
> > > >
> > > > The following patches streamify those code paths. No benchmarks have
> > > > been run yet.
> > > >
> > > > [1] https://www.postgresql.org/message-id/flat/CABPTF7UeN2o-trr9r7K76rZExnO2M4SLfvTfbUY2CwQjCekgnQ%40mai...
> > > >
> > > > Feedbacks welcome.
> > > >
> > >
> > > One more in ginvacuumcleanup().
> >
> > 0001, 0002 and 0004 LGTM.
> >
> > 0003:
> >
> > +        buf = read_stream_next_buffer(stream, NULL);
> > +        if (buf == InvalidBuffer)
> > +            break;
> >
> > I think we are loosening the check here. We were sure that there were
> > no InvalidBuffers until the nblocks. Streamified version does not have
> > this check, it exits from the loop the first time it sees an
> > InvalidBuffer, which may be wrong. You might want to add
> > 'Assert(p.current_blocknum == nblocks);' before read_stream_end() to
> > have a similar check.
> >
>
> Agree. The check has been added in v2 per your suggestion.
>

Two more to go:
patch 5: Streamify log_newpage_range() WAL logging path
patch 6: Streamify hash index VACUUM primary bucket page reads

Benchmarks will be conducted soon.


--
Best,
Xuneng


Attachments:

  [application/x-patch] v2-0002-Streamify-Bloom-VACUUM-paths-Use-streaming-re.patch (4.2K, 2-v2-0002-Streamify-Bloom-VACUUM-paths-Use-streaming-re.patch)
  download | inline diff:
From aafdfa0851ab61777e8877cfb50e3b34bf54b9b1 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:28:49 +0800
Subject: [PATCH v2 2/4] Streamify Bloom VACUUM paths. Use streaming reads
 in blbulkdelete() and blvacuumcleanup() to iterate index pages without
 repeated ReadBuffer calls, improving VACUUM performance and reducing buffer
 manager overhead during maintenance operations.

---
 contrib/bloom/blvacuum.c | 55 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/contrib/bloom/blvacuum.c b/contrib/bloom/blvacuum.c
index e68a9008f56..7452302f022 100644
--- a/contrib/bloom/blvacuum.c
+++ b/contrib/bloom/blvacuum.c
@@ -17,6 +17,7 @@
 #include "commands/vacuum.h"
 #include "storage/bufmgr.h"
 #include "storage/indexfsm.h"
+#include "storage/read_stream.h"
 
 
 /*
@@ -40,6 +41,8 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	Page		page;
 	BloomMetaPageData *metaData;
 	GenericXLogState *gxlogState;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (stats == NULL)
 		stats = palloc0_object(IndexBulkDeleteResult);
@@ -51,6 +54,25 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	 * they can't contain tuples to delete.
 	 */
 	npages = RelationGetNumberOfBlocks(index);
+
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		BloomTuple *itup,
@@ -59,8 +81,7 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
 
 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 		gxlogState = GenericXLogStart(index);
@@ -133,6 +154,9 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/*
 	 * Update the metapage's notFullPage list with whatever we found.  Our
 	 * info could already be out of date at this point, but blinsert() will
@@ -166,6 +190,8 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	Relation	index = info->index;
 	BlockNumber npages,
 				blkno;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (info->analyze_only)
 		return stats;
@@ -181,6 +207,25 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	stats->num_pages = npages;
 	stats->pages_free = 0;
 	stats->num_index_tuples = 0;
+
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
@@ -188,8 +233,7 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -206,6 +250,9 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	IndexFreeSpaceMapVacuum(info->index);
 
 	return stats;
-- 
2.51.0



  [application/x-patch] v2-0004-Replace-synchronous-ReadBufferExtended-loop-with.patch (2.5K, 3-v2-0004-Replace-synchronous-ReadBufferExtended-loop-with.patch)
  download | inline diff:
From d628f3a5cab752d57332865c323479795629fb28 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:29:09 +0800
Subject: [PATCH v2 4/4] Replace synchronous ReadBufferExtended() loop with the
 streaming read in ginvacuumcleanup() to improve I/O efficiency during GIN
 index vacuum cleanup operations

---
 src/backend/access/gin/ginvacuum.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index d7baf7c847c..58e05c71256 100644
--- a/src/backend/access/gin/ginvacuum.c
+++ b/src/backend/access/gin/ginvacuum.c
@@ -22,6 +22,7 @@
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "storage/read_stream.h"
 #include "utils/memutils.h"
 
 struct GinVacuumState
@@ -693,6 +694,8 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	BlockNumber totFreePages;
 	GinState	ginstate;
 	GinStatsData idxStat;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	/*
 	 * In an autovacuum analyze, we want to clean up pending insertions.
@@ -743,6 +746,24 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 	totFreePages = 0;
 
+	/* Scan all blocks starting from the root using streaming reads */
+	p.current_blocknum = GIN_ROOT_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = GIN_ROOT_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
@@ -750,8 +771,8 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
+
 		LockBuffer(buffer, GIN_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -776,6 +797,9 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/* Update the metapage with accurate page and entry counts */
 	idxStat.nTotalPages = npages;
 	ginUpdateStats(info->index, &idxStat, false);
-- 
2.51.0



  [application/x-patch] v2-0001-Switch-Bloom-scan-paths-to-streaming-read.patch (2.4K, 4-v2-0001-Switch-Bloom-scan-paths-to-streaming-read.patch)
  download | inline diff:
From 314f9cdf6d8a62cc8523b377b8cf19df646b9913 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:28:10 +0800
Subject: [PATCH v2 1/4] Switch Bloom scan paths to streaming read. Replace
 per-page ReadBuffer loops in blgetbitmap() with read_stream_begin_relation()
 and sequential buffer iteration, reducing buffer churn and improving scan
 efficiency on large Bloom indexes.

---
 contrib/bloom/blscan.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/contrib/bloom/blscan.c b/contrib/bloom/blscan.c
index 0d71edbe91c..b1fdabaab74 100644
--- a/contrib/bloom/blscan.c
+++ b/contrib/bloom/blscan.c
@@ -17,6 +17,7 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
+#include "storage/read_stream.h"
 
 /*
  * Begin scan of bloom index.
@@ -75,11 +76,13 @@ int64
 blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 {
 	int64		ntids = 0;
-	BlockNumber blkno = BLOOM_HEAD_BLKNO,
+	BlockNumber blkno,
 				npages;
 	int			i;
 	BufferAccessStrategy bas;
 	BloomScanOpaque so = (BloomScanOpaque) scan->opaque;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (so->sign == NULL)
 	{
@@ -119,14 +122,29 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 	if (scan->instrument)
 		scan->instrument->nsearches++;
 
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										bas,
+										scan->indexRelation,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
 		Page		page;
 
-		buffer = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM,
-									blkno, RBM_NORMAL, bas);
-
+		buffer = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -162,6 +180,9 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 		UnlockReleaseBuffer(buffer);
 		CHECK_FOR_INTERRUPTS();
 	}
+
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
 	FreeAccessStrategy(bas);
 
 	return ntids;
-- 
2.51.0



  [application/x-patch] v2-0003-Streamify-heap-bloat-estimation-scan-Introduc.patch (6.0K, 5-v2-0003-Streamify-heap-bloat-estimation-scan-Introduc.patch)
  download | inline diff:
From 7a3fcec0cf44cd6d9848c930e0d7f620cd193698 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:29:02 +0800
Subject: [PATCH v2 3/4] Streamify heap bloat estimation scan. Introduce a
 read-stream callback to skip all-visible pages via VM/FSM lookup and
 stream-read the rest, reducing page reads and improving pgstattuple_approx
 execution time on large relations.

---
 contrib/pgstattuple/pgstatapprox.c | 126 ++++++++++++++++++++++-------
 1 file changed, 95 insertions(+), 31 deletions(-)

diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c
index a59ff4e9d4f..cb05b530ca8 100644
--- a/contrib/pgstattuple/pgstatapprox.c
+++ b/contrib/pgstattuple/pgstatapprox.c
@@ -23,6 +23,7 @@
 #include "storage/bufmgr.h"
 #include "storage/freespace.h"
 #include "storage/procarray.h"
+#include "storage/read_stream.h"
 
 PG_FUNCTION_INFO_V1(pgstattuple_approx);
 PG_FUNCTION_INFO_V1(pgstattuple_approx_v1_5);
@@ -45,6 +46,61 @@ typedef struct output_type
 
 #define NUM_OUTPUT_COLUMNS 10
 
+/*
+ * Struct for statapprox_heap read stream callback.
+ */
+typedef struct StatApproxReadStreamPrivate
+{
+	Relation	rel;
+	output_type *stat;
+	BlockNumber current_blocknum;
+	BlockNumber nblocks;
+	BlockNumber scanned;		/* count of pages actually read */
+	Buffer		vmbuffer;		/* for VM lookups */
+}			StatApproxReadStreamPrivate;
+
+/*
+ * Read stream callback for statapprox_heap.
+ *
+ * This callback checks the visibility map for each block. If the block is
+ * all-visible, we can get the free space from the FSM without reading the
+ * actual page, and skip to the next block. Only blocks that are not
+ * all-visible are returned for actual reading.
+ */
+static BlockNumber
+statapprox_heap_read_stream_next(ReadStream *stream,
+								 void *callback_private_data,
+								 void *per_buffer_data)
+{
+	StatApproxReadStreamPrivate *p = callback_private_data;
+
+	while (p->current_blocknum < p->nblocks)
+	{
+		BlockNumber blkno = p->current_blocknum++;
+		Size		freespace;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * If the page has only visible tuples, then we can find out the free
+		 * space from the FSM and move on without reading the page.
+		 */
+		if (VM_ALL_VISIBLE(p->rel, blkno, &p->vmbuffer))
+		{
+			freespace = GetRecordedFreeSpace(p->rel, blkno);
+			p->stat->tuple_len += BLCKSZ - freespace;
+			p->stat->free_space += freespace;
+			continue;
+		}
+
+		/* This block needs to be read */
+		p->scanned++;
+		return blkno;
+	}
+
+	return InvalidBlockNumber;
+}
+
 /*
  * This function takes an already open relation and scans its pages,
  * skipping those that have the corresponding visibility map bit set.
@@ -58,53 +114,58 @@ typedef struct output_type
 static void
 statapprox_heap(Relation rel, output_type *stat)
 {
-	BlockNumber scanned,
-				nblocks,
-				blkno;
-	Buffer		vmbuffer = InvalidBuffer;
+	BlockNumber nblocks;
 	BufferAccessStrategy bstrategy;
 	TransactionId OldestXmin;
+	StatApproxReadStreamPrivate p;
+	ReadStream *stream;
 
 	OldestXmin = GetOldestNonRemovableTransactionId(rel);
 	bstrategy = GetAccessStrategy(BAS_BULKREAD);
 
 	nblocks = RelationGetNumberOfBlocks(rel);
-	scanned = 0;
 
-	for (blkno = 0; blkno < nblocks; blkno++)
+	/* Initialize read stream private data */
+	p.rel = rel;
+	p.stat = stat;
+	p.current_blocknum = 0;
+	p.nblocks = nblocks;
+	p.scanned = 0;
+	p.vmbuffer = InvalidBuffer;
+
+	/*
+	 * Create the read stream. We don't use READ_STREAM_USE_BATCHING because
+	 * the callback accesses the visibility map which may need to read VM
+	 * pages. While this shouldn't cause deadlocks, we err on the side of
+	 * caution.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL,
+										bstrategy,
+										rel,
+										MAIN_FORKNUM,
+										statapprox_heap_read_stream_next,
+										&p,
+										0);
+
+	for (;;)
 	{
 		Buffer		buf;
 		Page		page;
 		OffsetNumber offnum,
 					maxoff;
-		Size		freespace;
-
-		CHECK_FOR_INTERRUPTS();
-
-		/*
-		 * If the page has only visible tuples, then we can find out the free
-		 * space from the FSM and move on.
-		 */
-		if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
-		{
-			freespace = GetRecordedFreeSpace(rel, blkno);
-			stat->tuple_len += BLCKSZ - freespace;
-			stat->free_space += freespace;
-			continue;
-		}
+		BlockNumber blkno;
 
-		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
-								 RBM_NORMAL, bstrategy);
+		buf = read_stream_next_buffer(stream, NULL);
+		if (buf == InvalidBuffer)
+			break;
 
 		LockBuffer(buf, BUFFER_LOCK_SHARE);
 
 		page = BufferGetPage(buf);
+		blkno = BufferGetBlockNumber(buf);
 
 		stat->free_space += PageGetExactFreeSpace(page);
 
-		/* We may count the page as scanned even if it's new/empty */
-		scanned++;
-
 		if (PageIsNew(page) || PageIsEmpty(page))
 		{
 			UnlockReleaseBuffer(buf);
@@ -169,6 +230,9 @@ statapprox_heap(Relation rel, output_type *stat)
 		UnlockReleaseBuffer(buf);
 	}
 
+	Assert(p.current_blocknum == nblocks);
+	read_stream_end(stream);
+
 	stat->table_len = (uint64) nblocks * BLCKSZ;
 
 	/*
@@ -179,7 +243,7 @@ statapprox_heap(Relation rel, output_type *stat)
 	 * tuples in all-visible pages, so no correction is needed for that, and
 	 * we already accounted for the space in those pages, too.
 	 */
-	stat->tuple_count = vac_estimate_reltuples(rel, nblocks, scanned,
+	stat->tuple_count = vac_estimate_reltuples(rel, nblocks, p.scanned,
 											   stat->tuple_count);
 
 	/* It's not clear if we could get -1 here, but be safe. */
@@ -190,16 +254,16 @@ statapprox_heap(Relation rel, output_type *stat)
 	 */
 	if (nblocks != 0)
 	{
-		stat->scanned_percent = 100.0 * scanned / nblocks;
+		stat->scanned_percent = 100.0 * p.scanned / nblocks;
 		stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len;
 		stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len;
 		stat->free_percent = 100.0 * stat->free_space / stat->table_len;
 	}
 
-	if (BufferIsValid(vmbuffer))
+	if (BufferIsValid(p.vmbuffer))
 	{
-		ReleaseBuffer(vmbuffer);
-		vmbuffer = InvalidBuffer;
+		ReleaseBuffer(p.vmbuffer);
+		p.vmbuffer = InvalidBuffer;
 	}
 }
 
-- 
2.51.0



  [application/x-patch] v2-0005-Streamify-log_newpage_range-WAL-logging-path.patch (2.3K, 6-v2-0005-Streamify-log_newpage_range-WAL-logging-path.patch)
  download | inline diff:
From d719cbc20c28634f07106e89b5de0c89cf098a8e Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sun, 28 Dec 2025 18:29:07 +0800
Subject: [PATCH v2 5/6] Streamify log_newpage_range() WAL logging path

Refactor log_newpage_range() to use the Read Stream. This allows
prefetching of upcoming relation blocks during bulk WAL logging
operations, overlapping I/O with CPU-intensive XLogInsert and
WAL-writing work.
---
 src/backend/access/transam/xloginsert.c | 28 +++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index a56d5a55282..04c1a46143a 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -39,6 +39,7 @@
 #include "replication/origin.h"
 #include "storage/bufmgr.h"
 #include "storage/proc.h"
+#include "storage/read_stream.h"
 #include "utils/memutils.h"
 #include "utils/pgstat_internal.h"
 
@@ -1295,6 +1296,8 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 {
 	int			flags;
 	BlockNumber blkno;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	flags = REGBUF_FORCE_IMAGE;
 	if (page_std)
@@ -1307,6 +1310,23 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 	 */
 	XLogEnsureRecordSpace(XLR_MAX_BLOCK_ID - 1, 0);
 
+	/* Set up a streaming read for the range of blocks */
+	p.current_blocknum = startblk;
+	p.last_exclusive = endblk;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_USE_BATCHING,
+										NULL,
+										rel,
+										forknum,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	blkno = startblk;
 	while (blkno < endblk)
 	{
@@ -1321,8 +1341,10 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 		nbufs = 0;
 		while (nbufs < XLR_MAX_BLOCK_ID && blkno < endblk)
 		{
-			Buffer		buf = ReadBufferExtended(rel, forknum, blkno,
-												 RBM_NORMAL, NULL);
+			Buffer		buf = read_stream_next_buffer(stream, NULL);
+
+			if (!BufferIsValid(buf))
+				break;
 
 			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 
@@ -1361,6 +1383,8 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 		}
 		END_CRIT_SECTION();
 	}
+
+	read_stream_end(stream);
 }
 
 /*
-- 
2.51.0



  [application/x-patch] v2-0006-Streamify-hash-index-VACUUM-primary-bucket-page-r.patch (5.1K, 7-v2-0006-Streamify-hash-index-VACUUM-primary-bucket-page-r.patch)
  download | inline diff:
From 7b3018a09bfb26f8c6d1a413fd80c631231073f3 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sun, 28 Dec 2025 18:29:28 +0800
Subject: [PATCH v2 6/6] Streamify hash index VACUUM primary bucket page reads

Refactor hashbulkdelete() to use the Read Stream for primary bucket
pages. This enables prefetching of upcoming buckets while the current
one is being processed, improving I/O efficiency during hash index
vacuum operations.
---
 src/backend/access/hash/hash.c | 77 +++++++++++++++++++++++++++++++++-
 1 file changed, 76 insertions(+), 1 deletion(-)

diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index e388252afdc..4000d5d8e99 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -30,6 +30,7 @@
 #include "nodes/execnodes.h"
 #include "optimizer/plancat.h"
 #include "pgstat.h"
+#include "storage/read_stream.h"
 #include "utils/fmgrprotos.h"
 #include "utils/index_selfuncs.h"
 #include "utils/rel.h"
@@ -42,12 +43,23 @@ typedef struct
 	Relation	heapRel;		/* heap relation descriptor */
 } HashBuildState;
 
+/* Working state for streaming reads in hashbulkdelete */
+typedef struct
+{
+	HashMetaPage metap;			/* cached metapage for BUCKET_TO_BLKNO */
+	Bucket		next_bucket;	/* next bucket to prefetch */
+	Bucket		max_bucket;		/* stop when next_bucket > max_bucket */
+}			HashBulkDeleteStreamPrivate;
+
 static void hashbuildCallback(Relation index,
 							  ItemPointer tid,
 							  Datum *values,
 							  bool *isnull,
 							  bool tupleIsAlive,
 							  void *state);
+static BlockNumber hash_bulkdelete_read_stream_cb(ReadStream *stream,
+												  void *callback_private_data,
+												  void *per_buffer_data);
 
 
 /*
@@ -450,6 +462,25 @@ hashendscan(IndexScanDesc scan)
 	scan->opaque = NULL;
 }
 
+/*
+ * Read stream callback for hashbulkdelete.
+ *
+ * Returns the block number of the primary page for the next bucket to
+ * vacuum, using the BUCKET_TO_BLKNO mapping from the cached metapage.
+ */
+static BlockNumber
+hash_bulkdelete_read_stream_cb(ReadStream *stream,
+							   void *callback_private_data,
+							   void *per_buffer_data)
+{
+	HashBulkDeleteStreamPrivate *p = callback_private_data;
+
+	if (p->next_bucket > p->max_bucket)
+		return InvalidBlockNumber;
+
+	return BUCKET_TO_BLKNO(p->metap, p->next_bucket++);
+}
+
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples.
  * The set of target tuples is specified via a callback routine that tells
@@ -474,6 +505,8 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	Buffer		metabuf = InvalidBuffer;
 	HashMetaPage metap;
 	HashMetaPage cachedmetap;
+	HashBulkDeleteStreamPrivate stream_private;
+	ReadStream *stream = NULL;
 
 	tuples_removed = 0;
 	num_index_tuples = 0;
@@ -495,6 +528,24 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	cur_maxbucket = orig_maxbucket;
 
 loop_top:
+	/* Set up streaming read for primary bucket pages */
+	stream_private.metap = cachedmetap;
+	stream_private.next_bucket = cur_bucket;
+	stream_private.max_bucket = cur_maxbucket;
+
+	/*
+	 * It is safe to use batchmode as hash_bulkdelete_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										rel,
+										MAIN_FORKNUM,
+										hash_bulkdelete_read_stream_cb,
+										&stream_private,
+										0);
+
 	while (cur_bucket <= cur_maxbucket)
 	{
 		BlockNumber bucket_blkno;
@@ -514,7 +565,8 @@ loop_top:
 		 * We need to acquire a cleanup lock on the primary bucket page to out
 		 * wait concurrent scans before deleting the dead tuples.
 		 */
-		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
+		buf = read_stream_next_buffer(stream, NULL);
+		Assert(BufferIsValid(buf));
 		LockBufferForCleanup(buf);
 		_hash_checkpage(rel, buf, LH_BUCKET_PAGE);
 
@@ -545,6 +597,24 @@ loop_top:
 			{
 				cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
 				Assert(cachedmetap != NULL);
+
+				/*
+				 * Reset stream with updated metadata for remaining buckets.
+				 * The BUCKET_TO_BLKNO mapping depends on hashm_spares[],
+				 * which may have changed.
+				 */
+				read_stream_end(stream);
+				stream_private.metap = cachedmetap;
+				stream_private.next_bucket = cur_bucket + 1;
+				stream_private.max_bucket = cur_maxbucket;
+				stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+													READ_STREAM_USE_BATCHING,
+													info->strategy,
+													rel,
+													MAIN_FORKNUM,
+													hash_bulkdelete_read_stream_cb,
+													&stream_private,
+													0);
 			}
 		}
 
@@ -577,9 +647,14 @@ loop_top:
 		cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
 		Assert(cachedmetap != NULL);
 		cur_maxbucket = cachedmetap->hashm_maxbucket;
+		read_stream_end(stream);
 		goto loop_top;
 	}
 
+	/* Stream should be exhausted since we processed all buckets */
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/* Okay, we're really done.  Update tuple count in metapage. */
 	START_CRIT_SECTION();
 
-- 
2.51.0



^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
@ 2025-12-28 11:45         ` Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Xuneng Zhou @ 2025-12-28 11:45 UTC (permalink / raw)
  To: Nazir Bilal Yavuz <[email protected]>; +Cc: pgsql-hackers <[email protected]>

Hi,

On Sun, Dec 28, 2025 at 7:41 PM Xuneng Zhou <[email protected]> wrote:
>
> Hi,
>
> On Sat, Dec 27, 2025 at 12:41 AM Xuneng Zhou <[email protected]> wrote:
> >
> > Hi Bilal,
> >
> > Thanks for your review!
> >
> > On Fri, Dec 26, 2025 at 6:59 PM Nazir Bilal Yavuz <[email protected]> wrote:
> > >
> > > Hi,
> > >
> > > Thank you for working on this!
> > >
> > > On Thu, 25 Dec 2025 at 09:34, Xuneng Zhou <[email protected]> wrote:
> > > >
> > > > Hi,
> > > >
> > > > On Thu, Dec 25, 2025 at 1:51 PM Xuneng Zhou <[email protected]> wrote:
> > > > >
> > > > > Hi Hackers,
> > > > >
> > > > > I noticed several additional paths in contrib modules, beyond [1],
> > > > > that are potentially suitable for streamification:
> > > > >
> > > > > 1) pgstattuple — pgstatapprox.c and parts of pgstattuple_approx_internal
> > > > > 2) Bloom — scan paths in blgetbitmap() and maintenance paths in blbulkdelete()
> > > > >
> > > > > The following patches streamify those code paths. No benchmarks have
> > > > > been run yet.
> > > > >
> > > > > [1] https://www.postgresql.org/message-id/flat/CABPTF7UeN2o-trr9r7K76rZExnO2M4SLfvTfbUY2CwQjCekgnQ%40mai...
> > > > >
> > > > > Feedbacks welcome.
> > > > >
> > > >
> > > > One more in ginvacuumcleanup().
> > >
> > > 0001, 0002 and 0004 LGTM.
> > >
> > > 0003:
> > >
> > > +        buf = read_stream_next_buffer(stream, NULL);
> > > +        if (buf == InvalidBuffer)
> > > +            break;
> > >
> > > I think we are loosening the check here. We were sure that there were
> > > no InvalidBuffers until the nblocks. Streamified version does not have
> > > this check, it exits from the loop the first time it sees an
> > > InvalidBuffer, which may be wrong. You might want to add
> > > 'Assert(p.current_blocknum == nblocks);' before read_stream_end() to
> > > have a similar check.
> > >
> >
> > Agree. The check has been added in v2 per your suggestion.
> >
>
> Two more to go:
> patch 5: Streamify log_newpage_range() WAL logging path
> patch 6: Streamify hash index VACUUM primary bucket page reads
>
> Benchmarks will be conducted soon.
>

v6 in the last message has a problem and had not been updated. Attaching
the right one again. Sorry for the noise.

-- 
Best,
Xuneng


Attachments:

  [application/octet-stream] v2-0002-Streamify-Bloom-VACUUM-paths-Use-streaming-re.patch (4.2K, 2-v2-0002-Streamify-Bloom-VACUUM-paths-Use-streaming-re.patch)
  download | inline diff:
From aafdfa0851ab61777e8877cfb50e3b34bf54b9b1 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:28:49 +0800
Subject: [PATCH v2 2/4] Streamify Bloom VACUUM paths. Use streaming reads
 in blbulkdelete() and blvacuumcleanup() to iterate index pages without
 repeated ReadBuffer calls, improving VACUUM performance and reducing buffer
 manager overhead during maintenance operations.

---
 contrib/bloom/blvacuum.c | 55 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/contrib/bloom/blvacuum.c b/contrib/bloom/blvacuum.c
index e68a9008f56..7452302f022 100644
--- a/contrib/bloom/blvacuum.c
+++ b/contrib/bloom/blvacuum.c
@@ -17,6 +17,7 @@
 #include "commands/vacuum.h"
 #include "storage/bufmgr.h"
 #include "storage/indexfsm.h"
+#include "storage/read_stream.h"
 
 
 /*
@@ -40,6 +41,8 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	Page		page;
 	BloomMetaPageData *metaData;
 	GenericXLogState *gxlogState;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (stats == NULL)
 		stats = palloc0_object(IndexBulkDeleteResult);
@@ -51,6 +54,25 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	 * they can't contain tuples to delete.
 	 */
 	npages = RelationGetNumberOfBlocks(index);
+
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		BloomTuple *itup,
@@ -59,8 +81,7 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
 
 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 		gxlogState = GenericXLogStart(index);
@@ -133,6 +154,9 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/*
 	 * Update the metapage's notFullPage list with whatever we found.  Our
 	 * info could already be out of date at this point, but blinsert() will
@@ -166,6 +190,8 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	Relation	index = info->index;
 	BlockNumber npages,
 				blkno;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (info->analyze_only)
 		return stats;
@@ -181,6 +207,25 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	stats->num_pages = npages;
 	stats->pages_free = 0;
 	stats->num_index_tuples = 0;
+
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
@@ -188,8 +233,7 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -206,6 +250,9 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	IndexFreeSpaceMapVacuum(info->index);
 
 	return stats;
-- 
2.51.0



  [application/octet-stream] v2-0001-Switch-Bloom-scan-paths-to-streaming-read.patch (2.4K, 3-v2-0001-Switch-Bloom-scan-paths-to-streaming-read.patch)
  download | inline diff:
From 314f9cdf6d8a62cc8523b377b8cf19df646b9913 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:28:10 +0800
Subject: [PATCH v2 1/4] Switch Bloom scan paths to streaming read. Replace
 per-page ReadBuffer loops in blgetbitmap() with read_stream_begin_relation()
 and sequential buffer iteration, reducing buffer churn and improving scan
 efficiency on large Bloom indexes.

---
 contrib/bloom/blscan.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/contrib/bloom/blscan.c b/contrib/bloom/blscan.c
index 0d71edbe91c..b1fdabaab74 100644
--- a/contrib/bloom/blscan.c
+++ b/contrib/bloom/blscan.c
@@ -17,6 +17,7 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
+#include "storage/read_stream.h"
 
 /*
  * Begin scan of bloom index.
@@ -75,11 +76,13 @@ int64
 blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 {
 	int64		ntids = 0;
-	BlockNumber blkno = BLOOM_HEAD_BLKNO,
+	BlockNumber blkno,
 				npages;
 	int			i;
 	BufferAccessStrategy bas;
 	BloomScanOpaque so = (BloomScanOpaque) scan->opaque;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (so->sign == NULL)
 	{
@@ -119,14 +122,29 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 	if (scan->instrument)
 		scan->instrument->nsearches++;
 
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										bas,
+										scan->indexRelation,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
 		Page		page;
 
-		buffer = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM,
-									blkno, RBM_NORMAL, bas);
-
+		buffer = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -162,6 +180,9 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 		UnlockReleaseBuffer(buffer);
 		CHECK_FOR_INTERRUPTS();
 	}
+
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
 	FreeAccessStrategy(bas);
 
 	return ntids;
-- 
2.51.0



  [application/octet-stream] v2-0004-Replace-synchronous-ReadBufferExtended-loop-with.patch (2.5K, 4-v2-0004-Replace-synchronous-ReadBufferExtended-loop-with.patch)
  download | inline diff:
From d628f3a5cab752d57332865c323479795629fb28 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:29:09 +0800
Subject: [PATCH v2 4/4] Replace synchronous ReadBufferExtended() loop with the
 streaming read in ginvacuumcleanup() to improve I/O efficiency during GIN
 index vacuum cleanup operations

---
 src/backend/access/gin/ginvacuum.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index d7baf7c847c..58e05c71256 100644
--- a/src/backend/access/gin/ginvacuum.c
+++ b/src/backend/access/gin/ginvacuum.c
@@ -22,6 +22,7 @@
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "storage/read_stream.h"
 #include "utils/memutils.h"
 
 struct GinVacuumState
@@ -693,6 +694,8 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	BlockNumber totFreePages;
 	GinState	ginstate;
 	GinStatsData idxStat;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	/*
 	 * In an autovacuum analyze, we want to clean up pending insertions.
@@ -743,6 +746,24 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 	totFreePages = 0;
 
+	/* Scan all blocks starting from the root using streaming reads */
+	p.current_blocknum = GIN_ROOT_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = GIN_ROOT_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
@@ -750,8 +771,8 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
+
 		LockBuffer(buffer, GIN_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -776,6 +797,9 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/* Update the metapage with accurate page and entry counts */
 	idxStat.nTotalPages = npages;
 	ginUpdateStats(info->index, &idxStat, false);
-- 
2.51.0



  [application/octet-stream] v2-0003-Streamify-heap-bloat-estimation-scan-Introduc.patch (6.0K, 5-v2-0003-Streamify-heap-bloat-estimation-scan-Introduc.patch)
  download | inline diff:
From 7a3fcec0cf44cd6d9848c930e0d7f620cd193698 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:29:02 +0800
Subject: [PATCH v2 3/4] Streamify heap bloat estimation scan. Introduce a
 read-stream callback to skip all-visible pages via VM/FSM lookup and
 stream-read the rest, reducing page reads and improving pgstattuple_approx
 execution time on large relations.

---
 contrib/pgstattuple/pgstatapprox.c | 126 ++++++++++++++++++++++-------
 1 file changed, 95 insertions(+), 31 deletions(-)

diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c
index a59ff4e9d4f..cb05b530ca8 100644
--- a/contrib/pgstattuple/pgstatapprox.c
+++ b/contrib/pgstattuple/pgstatapprox.c
@@ -23,6 +23,7 @@
 #include "storage/bufmgr.h"
 #include "storage/freespace.h"
 #include "storage/procarray.h"
+#include "storage/read_stream.h"
 
 PG_FUNCTION_INFO_V1(pgstattuple_approx);
 PG_FUNCTION_INFO_V1(pgstattuple_approx_v1_5);
@@ -45,6 +46,61 @@ typedef struct output_type
 
 #define NUM_OUTPUT_COLUMNS 10
 
+/*
+ * Struct for statapprox_heap read stream callback.
+ */
+typedef struct StatApproxReadStreamPrivate
+{
+	Relation	rel;
+	output_type *stat;
+	BlockNumber current_blocknum;
+	BlockNumber nblocks;
+	BlockNumber scanned;		/* count of pages actually read */
+	Buffer		vmbuffer;		/* for VM lookups */
+}			StatApproxReadStreamPrivate;
+
+/*
+ * Read stream callback for statapprox_heap.
+ *
+ * This callback checks the visibility map for each block. If the block is
+ * all-visible, we can get the free space from the FSM without reading the
+ * actual page, and skip to the next block. Only blocks that are not
+ * all-visible are returned for actual reading.
+ */
+static BlockNumber
+statapprox_heap_read_stream_next(ReadStream *stream,
+								 void *callback_private_data,
+								 void *per_buffer_data)
+{
+	StatApproxReadStreamPrivate *p = callback_private_data;
+
+	while (p->current_blocknum < p->nblocks)
+	{
+		BlockNumber blkno = p->current_blocknum++;
+		Size		freespace;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * If the page has only visible tuples, then we can find out the free
+		 * space from the FSM and move on without reading the page.
+		 */
+		if (VM_ALL_VISIBLE(p->rel, blkno, &p->vmbuffer))
+		{
+			freespace = GetRecordedFreeSpace(p->rel, blkno);
+			p->stat->tuple_len += BLCKSZ - freespace;
+			p->stat->free_space += freespace;
+			continue;
+		}
+
+		/* This block needs to be read */
+		p->scanned++;
+		return blkno;
+	}
+
+	return InvalidBlockNumber;
+}
+
 /*
  * This function takes an already open relation and scans its pages,
  * skipping those that have the corresponding visibility map bit set.
@@ -58,53 +114,58 @@ typedef struct output_type
 static void
 statapprox_heap(Relation rel, output_type *stat)
 {
-	BlockNumber scanned,
-				nblocks,
-				blkno;
-	Buffer		vmbuffer = InvalidBuffer;
+	BlockNumber nblocks;
 	BufferAccessStrategy bstrategy;
 	TransactionId OldestXmin;
+	StatApproxReadStreamPrivate p;
+	ReadStream *stream;
 
 	OldestXmin = GetOldestNonRemovableTransactionId(rel);
 	bstrategy = GetAccessStrategy(BAS_BULKREAD);
 
 	nblocks = RelationGetNumberOfBlocks(rel);
-	scanned = 0;
 
-	for (blkno = 0; blkno < nblocks; blkno++)
+	/* Initialize read stream private data */
+	p.rel = rel;
+	p.stat = stat;
+	p.current_blocknum = 0;
+	p.nblocks = nblocks;
+	p.scanned = 0;
+	p.vmbuffer = InvalidBuffer;
+
+	/*
+	 * Create the read stream. We don't use READ_STREAM_USE_BATCHING because
+	 * the callback accesses the visibility map which may need to read VM
+	 * pages. While this shouldn't cause deadlocks, we err on the side of
+	 * caution.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL,
+										bstrategy,
+										rel,
+										MAIN_FORKNUM,
+										statapprox_heap_read_stream_next,
+										&p,
+										0);
+
+	for (;;)
 	{
 		Buffer		buf;
 		Page		page;
 		OffsetNumber offnum,
 					maxoff;
-		Size		freespace;
-
-		CHECK_FOR_INTERRUPTS();
-
-		/*
-		 * If the page has only visible tuples, then we can find out the free
-		 * space from the FSM and move on.
-		 */
-		if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
-		{
-			freespace = GetRecordedFreeSpace(rel, blkno);
-			stat->tuple_len += BLCKSZ - freespace;
-			stat->free_space += freespace;
-			continue;
-		}
+		BlockNumber blkno;
 
-		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
-								 RBM_NORMAL, bstrategy);
+		buf = read_stream_next_buffer(stream, NULL);
+		if (buf == InvalidBuffer)
+			break;
 
 		LockBuffer(buf, BUFFER_LOCK_SHARE);
 
 		page = BufferGetPage(buf);
+		blkno = BufferGetBlockNumber(buf);
 
 		stat->free_space += PageGetExactFreeSpace(page);
 
-		/* We may count the page as scanned even if it's new/empty */
-		scanned++;
-
 		if (PageIsNew(page) || PageIsEmpty(page))
 		{
 			UnlockReleaseBuffer(buf);
@@ -169,6 +230,9 @@ statapprox_heap(Relation rel, output_type *stat)
 		UnlockReleaseBuffer(buf);
 	}
 
+	Assert(p.current_blocknum == nblocks);
+	read_stream_end(stream);
+
 	stat->table_len = (uint64) nblocks * BLCKSZ;
 
 	/*
@@ -179,7 +243,7 @@ statapprox_heap(Relation rel, output_type *stat)
 	 * tuples in all-visible pages, so no correction is needed for that, and
 	 * we already accounted for the space in those pages, too.
 	 */
-	stat->tuple_count = vac_estimate_reltuples(rel, nblocks, scanned,
+	stat->tuple_count = vac_estimate_reltuples(rel, nblocks, p.scanned,
 											   stat->tuple_count);
 
 	/* It's not clear if we could get -1 here, but be safe. */
@@ -190,16 +254,16 @@ statapprox_heap(Relation rel, output_type *stat)
 	 */
 	if (nblocks != 0)
 	{
-		stat->scanned_percent = 100.0 * scanned / nblocks;
+		stat->scanned_percent = 100.0 * p.scanned / nblocks;
 		stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len;
 		stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len;
 		stat->free_percent = 100.0 * stat->free_space / stat->table_len;
 	}
 
-	if (BufferIsValid(vmbuffer))
+	if (BufferIsValid(p.vmbuffer))
 	{
-		ReleaseBuffer(vmbuffer);
-		vmbuffer = InvalidBuffer;
+		ReleaseBuffer(p.vmbuffer);
+		p.vmbuffer = InvalidBuffer;
 	}
 }
 
-- 
2.51.0



  [application/octet-stream] v2-0005-Streamify-log_newpage_range-WAL-logging-path.patch (2.3K, 6-v2-0005-Streamify-log_newpage_range-WAL-logging-path.patch)
  download | inline diff:
From d719cbc20c28634f07106e89b5de0c89cf098a8e Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sun, 28 Dec 2025 18:29:07 +0800
Subject: [PATCH v2 5/6] Streamify log_newpage_range() WAL logging path

Refactor log_newpage_range() to use the Read Stream. This allows
prefetching of upcoming relation blocks during bulk WAL logging
operations, overlapping I/O with CPU-intensive XLogInsert and
WAL-writing work.
---
 src/backend/access/transam/xloginsert.c | 28 +++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index a56d5a55282..04c1a46143a 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -39,6 +39,7 @@
 #include "replication/origin.h"
 #include "storage/bufmgr.h"
 #include "storage/proc.h"
+#include "storage/read_stream.h"
 #include "utils/memutils.h"
 #include "utils/pgstat_internal.h"
 
@@ -1295,6 +1296,8 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 {
 	int			flags;
 	BlockNumber blkno;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	flags = REGBUF_FORCE_IMAGE;
 	if (page_std)
@@ -1307,6 +1310,23 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 	 */
 	XLogEnsureRecordSpace(XLR_MAX_BLOCK_ID - 1, 0);
 
+	/* Set up a streaming read for the range of blocks */
+	p.current_blocknum = startblk;
+	p.last_exclusive = endblk;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_USE_BATCHING,
+										NULL,
+										rel,
+										forknum,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	blkno = startblk;
 	while (blkno < endblk)
 	{
@@ -1321,8 +1341,10 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 		nbufs = 0;
 		while (nbufs < XLR_MAX_BLOCK_ID && blkno < endblk)
 		{
-			Buffer		buf = ReadBufferExtended(rel, forknum, blkno,
-												 RBM_NORMAL, NULL);
+			Buffer		buf = read_stream_next_buffer(stream, NULL);
+
+			if (!BufferIsValid(buf))
+				break;
 
 			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 
@@ -1361,6 +1383,8 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 		}
 		END_CRIT_SECTION();
 	}
+
+	read_stream_end(stream);
 }
 
 /*
-- 
2.51.0



  [application/octet-stream] v2-0006-Streamify-hash-index-VACUUM-primary-bucket-page-r.patch (5.1K, 7-v2-0006-Streamify-hash-index-VACUUM-primary-bucket-page-r.patch)
  download | inline diff:
From 6461bc680f3b2af78a013503b03121b0b02b2cc4 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sun, 28 Dec 2025 18:29:28 +0800
Subject: [PATCH v2 6/6] Streamify hash index VACUUM primary bucket page reads

Refactor hashbulkdelete() to use the Read Stream for primary bucket
pages. This enables prefetching of upcoming buckets while the current
one is being processed, improving I/O efficiency during hash index
vacuum operations.
---
 src/backend/access/hash/hash.c | 79 +++++++++++++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 1 deletion(-)

diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index e388252afdc..9ebdc2d4931 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -30,6 +30,7 @@
 #include "nodes/execnodes.h"
 #include "optimizer/plancat.h"
 #include "pgstat.h"
+#include "storage/read_stream.h"
 #include "utils/fmgrprotos.h"
 #include "utils/index_selfuncs.h"
 #include "utils/rel.h"
@@ -42,12 +43,23 @@ typedef struct
 	Relation	heapRel;		/* heap relation descriptor */
 } HashBuildState;
 
+/* Working state for streaming reads in hashbulkdelete */
+typedef struct
+{
+	HashMetaPage metap;			/* cached metapage for BUCKET_TO_BLKNO */
+	Bucket		next_bucket;	/* next bucket to prefetch */
+	Bucket		max_bucket;		/* stop when next_bucket > max_bucket */
+}			HashBulkDeleteStreamPrivate;
+
 static void hashbuildCallback(Relation index,
 							  ItemPointer tid,
 							  Datum *values,
 							  bool *isnull,
 							  bool tupleIsAlive,
 							  void *state);
+static BlockNumber hash_bulkdelete_read_stream_cb(ReadStream *stream,
+												  void *callback_private_data,
+												  void *per_buffer_data);
 
 
 /*
@@ -450,6 +462,27 @@ hashendscan(IndexScanDesc scan)
 	scan->opaque = NULL;
 }
 
+/*
+ * Read stream callback for hashbulkdelete.
+ *
+ * Returns the block number of the primary page for the next bucket to
+ * vacuum, using the BUCKET_TO_BLKNO mapping from the cached metapage.
+ */
+static BlockNumber
+hash_bulkdelete_read_stream_cb(ReadStream *stream,
+							   void *callback_private_data,
+							   void *per_buffer_data)
+{
+	HashBulkDeleteStreamPrivate *p = callback_private_data;
+	Bucket		bucket;
+
+	if (p->next_bucket > p->max_bucket)
+		return InvalidBlockNumber;
+
+	bucket = p->next_bucket++;
+	return BUCKET_TO_BLKNO(p->metap, bucket);
+}
+
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples.
  * The set of target tuples is specified via a callback routine that tells
@@ -474,6 +507,8 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	Buffer		metabuf = InvalidBuffer;
 	HashMetaPage metap;
 	HashMetaPage cachedmetap;
+	HashBulkDeleteStreamPrivate stream_private;
+	ReadStream *stream = NULL;
 
 	tuples_removed = 0;
 	num_index_tuples = 0;
@@ -495,6 +530,24 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	cur_maxbucket = orig_maxbucket;
 
 loop_top:
+	/* Set up streaming read for primary bucket pages */
+	stream_private.metap = cachedmetap;
+	stream_private.next_bucket = cur_bucket;
+	stream_private.max_bucket = cur_maxbucket;
+
+	/*
+	 * It is safe to use batchmode as hash_bulkdelete_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										rel,
+										MAIN_FORKNUM,
+										hash_bulkdelete_read_stream_cb,
+										&stream_private,
+										0);
+
 	while (cur_bucket <= cur_maxbucket)
 	{
 		BlockNumber bucket_blkno;
@@ -514,7 +567,8 @@ loop_top:
 		 * We need to acquire a cleanup lock on the primary bucket page to out
 		 * wait concurrent scans before deleting the dead tuples.
 		 */
-		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
+		buf = read_stream_next_buffer(stream, NULL);
+		Assert(BufferIsValid(buf));
 		LockBufferForCleanup(buf);
 		_hash_checkpage(rel, buf, LH_BUCKET_PAGE);
 
@@ -545,6 +599,24 @@ loop_top:
 			{
 				cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
 				Assert(cachedmetap != NULL);
+
+				/*
+				 * Reset stream with updated metadata for remaining buckets.
+				 * The BUCKET_TO_BLKNO mapping depends on hashm_spares[],
+				 * which may have changed.
+				 */
+				read_stream_end(stream);
+				stream_private.metap = cachedmetap;
+				stream_private.next_bucket = cur_bucket + 1;
+				stream_private.max_bucket = cur_maxbucket;
+				stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+													READ_STREAM_USE_BATCHING,
+													info->strategy,
+													rel,
+													MAIN_FORKNUM,
+													hash_bulkdelete_read_stream_cb,
+													&stream_private,
+													0);
 			}
 		}
 
@@ -577,9 +649,14 @@ loop_top:
 		cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
 		Assert(cachedmetap != NULL);
 		cur_maxbucket = cachedmetap->hashm_maxbucket;
+		read_stream_end(stream);
 		goto loop_top;
 	}
 
+	/* Stream should be exhausted since we processed all buckets */
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/* Okay, we're really done.  Update tuple count in metapage. */
 	START_CRIT_SECTION();
 
-- 
2.51.0



^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
@ 2025-12-29 10:58           ` Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Nazir Bilal Yavuz @ 2025-12-29 10:58 UTC (permalink / raw)
  To: Xuneng Zhou <[email protected]>; +Cc: pgsql-hackers <[email protected]>

Hi,

On Sun, 28 Dec 2025 at 14:46, Xuneng Zhou <[email protected]> wrote:
>
> Hi,
> >
> > Two more to go:
> > patch 5: Streamify log_newpage_range() WAL logging path
> > patch 6: Streamify hash index VACUUM primary bucket page reads
> >
> > Benchmarks will be conducted soon.
> >
>
> v6 in the last message has a problem and has not been updated. Attach
> the right one again. Sorry for the noise.

0003 and 0006:

You need to add 'StatApproxReadStreamPrivate' and
'HashBulkDeleteStreamPrivate' to the typedefs.list.

0005:

@@ -1321,8 +1341,10 @@ log_newpage_range(Relation rel, ForkNumber forknum,
         nbufs = 0;
         while (nbufs < XLR_MAX_BLOCK_ID && blkno < endblk)
         {
-            Buffer        buf = ReadBufferExtended(rel, forknum, blkno,
-                                                 RBM_NORMAL, NULL);
+            Buffer        buf = read_stream_next_buffer(stream, NULL);
+
+            if (!BufferIsValid(buf))
+                break;

We are loosening a check here; there should not be an invalid buffer in
the stream until endblk. I think you can remove this BufferIsValid()
check, so that we can learn if something goes wrong.

0006:

You can use read_stream_reset() instead of read_stream_end(), then you
can use the same stream with different variables, I believe this is
the preferred way.

Rest LGTM!

-- 
Regards,
Nazir Bilal Yavuz
Microsoft





^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
@ 2025-12-30 01:51             ` Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Xuneng Zhou @ 2025-12-30 01:51 UTC (permalink / raw)
  To: Nazir Bilal Yavuz <[email protected]>; +Cc: pgsql-hackers <[email protected]>

Hi,

Thanks for looking into this.

On Mon, Dec 29, 2025 at 6:58 PM Nazir Bilal Yavuz <[email protected]> wrote:
>
> Hi,
>
> On Sun, 28 Dec 2025 at 14:46, Xuneng Zhou <[email protected]> wrote:
> >
> > Hi,
> > >
> > > Two more to go:
> > > patch 5: Streamify log_newpage_range() WAL logging path
> > > patch 6: Streamify hash index VACUUM primary bucket page reads
> > >
> > > Benchmarks will be conducted soon.
> > >
> >
> > v6 in the last message has a problem and has not been updated. Attach
> > the right one again. Sorry for the noise.
>
> 0003 and 0006:
>
> You need to add 'StatApproxReadStreamPrivate' and
> 'HashBulkDeleteStreamPrivate' to the typedefs.list.

Done.

> 0005:
>
> @@ -1321,8 +1341,10 @@ log_newpage_range(Relation rel, ForkNumber forknum,
>          nbufs = 0;
>          while (nbufs < XLR_MAX_BLOCK_ID && blkno < endblk)
>          {
> -            Buffer        buf = ReadBufferExtended(rel, forknum, blkno,
> -                                                 RBM_NORMAL, NULL);
> +            Buffer        buf = read_stream_next_buffer(stream, NULL);
> +
> +            if (!BufferIsValid(buf))
> +                break;
>
> We are loosening a check here; there should not be an invalid buffer in
> the stream until endblk. I think you can remove this BufferIsValid()
> check, so that we can learn if something goes wrong.

My earlier reason for not adding an assert at the end of the stream was
the potential early break here:

/* Nothing more to do if all remaining blocks were empty. */
if (nbufs == 0)
    break;

After looking more closely, it turns out to be a misunderstanding of the logic.

> 0006:
>
> You can use read_stream_reset() instead of read_stream_end(), then you
> can use the same stream with different variables, I believe this is
> the preferred way.
>
> Rest LGTM!
>

Yeah, reset seems to be the more appropriate way here.

-- 
Best,
Xuneng


Attachments:

  [application/octet-stream] v3-0003-Streamify-heap-bloat-estimation-scan.-Introduce-a.patch (6.4K, 2-v3-0003-Streamify-heap-bloat-estimation-scan.-Introduce-a.patch)
  download | inline diff:
From 1b4c74f07fdd40049dc40206c6c1321b66de0ecd Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:29:02 +0800
Subject: [PATCH v3 3/6] Streamify heap bloat estimation scan. Introduce a 
 read-stream callback to skip all-visible pages via VM/FSM lookup and 
 stream-read the rest, reducing page reads and improving pgstattuple_approx 
 execution time on large relations.

---
 contrib/pgstattuple/pgstatapprox.c | 126 ++++++++++++++++++++++-------
 src/tools/pgindent/typedefs.list   |   1 +
 2 files changed, 96 insertions(+), 31 deletions(-)

diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c
index a59ff4e9d4f..cb05b530ca8 100644
--- a/contrib/pgstattuple/pgstatapprox.c
+++ b/contrib/pgstattuple/pgstatapprox.c
@@ -23,6 +23,7 @@
 #include "storage/bufmgr.h"
 #include "storage/freespace.h"
 #include "storage/procarray.h"
+#include "storage/read_stream.h"
 
 PG_FUNCTION_INFO_V1(pgstattuple_approx);
 PG_FUNCTION_INFO_V1(pgstattuple_approx_v1_5);
@@ -45,6 +46,61 @@ typedef struct output_type
 
 #define NUM_OUTPUT_COLUMNS 10
 
+/*
+ * Struct for statapprox_heap read stream callback.
+ */
+typedef struct StatApproxReadStreamPrivate
+{
+	Relation	rel;
+	output_type *stat;
+	BlockNumber current_blocknum;
+	BlockNumber nblocks;
+	BlockNumber scanned;		/* count of pages actually read */
+	Buffer		vmbuffer;		/* for VM lookups */
+}			StatApproxReadStreamPrivate;
+
+/*
+ * Read stream callback for statapprox_heap.
+ *
+ * This callback checks the visibility map for each block. If the block is
+ * all-visible, we can get the free space from the FSM without reading the
+ * actual page, and skip to the next block. Only blocks that are not
+ * all-visible are returned for actual reading.
+ */
+static BlockNumber
+statapprox_heap_read_stream_next(ReadStream *stream,
+								 void *callback_private_data,
+								 void *per_buffer_data)
+{
+	StatApproxReadStreamPrivate *p = callback_private_data;
+
+	while (p->current_blocknum < p->nblocks)
+	{
+		BlockNumber blkno = p->current_blocknum++;
+		Size		freespace;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * If the page has only visible tuples, then we can find out the free
+		 * space from the FSM and move on without reading the page.
+		 */
+		if (VM_ALL_VISIBLE(p->rel, blkno, &p->vmbuffer))
+		{
+			freespace = GetRecordedFreeSpace(p->rel, blkno);
+			p->stat->tuple_len += BLCKSZ - freespace;
+			p->stat->free_space += freespace;
+			continue;
+		}
+
+		/* This block needs to be read */
+		p->scanned++;
+		return blkno;
+	}
+
+	return InvalidBlockNumber;
+}
+
 /*
  * This function takes an already open relation and scans its pages,
  * skipping those that have the corresponding visibility map bit set.
@@ -58,53 +114,58 @@ typedef struct output_type
 static void
 statapprox_heap(Relation rel, output_type *stat)
 {
-	BlockNumber scanned,
-				nblocks,
-				blkno;
-	Buffer		vmbuffer = InvalidBuffer;
+	BlockNumber nblocks;
 	BufferAccessStrategy bstrategy;
 	TransactionId OldestXmin;
+	StatApproxReadStreamPrivate p;
+	ReadStream *stream;
 
 	OldestXmin = GetOldestNonRemovableTransactionId(rel);
 	bstrategy = GetAccessStrategy(BAS_BULKREAD);
 
 	nblocks = RelationGetNumberOfBlocks(rel);
-	scanned = 0;
 
-	for (blkno = 0; blkno < nblocks; blkno++)
+	/* Initialize read stream private data */
+	p.rel = rel;
+	p.stat = stat;
+	p.current_blocknum = 0;
+	p.nblocks = nblocks;
+	p.scanned = 0;
+	p.vmbuffer = InvalidBuffer;
+
+	/*
+	 * Create the read stream. We don't use READ_STREAM_USE_BATCHING because
+	 * the callback accesses the visibility map which may need to read VM
+	 * pages. While this shouldn't cause deadlocks, we err on the side of
+	 * caution.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL,
+										bstrategy,
+										rel,
+										MAIN_FORKNUM,
+										statapprox_heap_read_stream_next,
+										&p,
+										0);
+
+	for (;;)
 	{
 		Buffer		buf;
 		Page		page;
 		OffsetNumber offnum,
 					maxoff;
-		Size		freespace;
-
-		CHECK_FOR_INTERRUPTS();
-
-		/*
-		 * If the page has only visible tuples, then we can find out the free
-		 * space from the FSM and move on.
-		 */
-		if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
-		{
-			freespace = GetRecordedFreeSpace(rel, blkno);
-			stat->tuple_len += BLCKSZ - freespace;
-			stat->free_space += freespace;
-			continue;
-		}
+		BlockNumber blkno;
 
-		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
-								 RBM_NORMAL, bstrategy);
+		buf = read_stream_next_buffer(stream, NULL);
+		if (buf == InvalidBuffer)
+			break;
 
 		LockBuffer(buf, BUFFER_LOCK_SHARE);
 
 		page = BufferGetPage(buf);
+		blkno = BufferGetBlockNumber(buf);
 
 		stat->free_space += PageGetExactFreeSpace(page);
 
-		/* We may count the page as scanned even if it's new/empty */
-		scanned++;
-
 		if (PageIsNew(page) || PageIsEmpty(page))
 		{
 			UnlockReleaseBuffer(buf);
@@ -169,6 +230,9 @@ statapprox_heap(Relation rel, output_type *stat)
 		UnlockReleaseBuffer(buf);
 	}
 
+	Assert(p.current_blocknum == nblocks);
+	read_stream_end(stream);
+
 	stat->table_len = (uint64) nblocks * BLCKSZ;
 
 	/*
@@ -179,7 +243,7 @@ statapprox_heap(Relation rel, output_type *stat)
 	 * tuples in all-visible pages, so no correction is needed for that, and
 	 * we already accounted for the space in those pages, too.
 	 */
-	stat->tuple_count = vac_estimate_reltuples(rel, nblocks, scanned,
+	stat->tuple_count = vac_estimate_reltuples(rel, nblocks, p.scanned,
 											   stat->tuple_count);
 
 	/* It's not clear if we could get -1 here, but be safe. */
@@ -190,16 +254,16 @@ statapprox_heap(Relation rel, output_type *stat)
 	 */
 	if (nblocks != 0)
 	{
-		stat->scanned_percent = 100.0 * scanned / nblocks;
+		stat->scanned_percent = 100.0 * p.scanned / nblocks;
 		stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len;
 		stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len;
 		stat->free_percent = 100.0 * stat->free_space / stat->table_len;
 	}
 
-	if (BufferIsValid(vmbuffer))
+	if (BufferIsValid(p.vmbuffer))
 	{
-		ReleaseBuffer(vmbuffer);
-		vmbuffer = InvalidBuffer;
+		ReleaseBuffer(p.vmbuffer);
+		p.vmbuffer = InvalidBuffer;
 	}
 }
 
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 5c88fa92f4e..dc6fc28fcab 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2898,6 +2898,7 @@ StartReplicationCmd
 StartupStatusEnum
 StatEntry
 StatExtEntry
+StatApproxReadStreamPrivate
 StateFileChunk
 StatisticExtInfo
 StatsBuildData
-- 
2.51.0



  [application/octet-stream] v3-0005-Streamify-log_newpage_range-WAL-logging-path.patch (2.3K, 3-v3-0005-Streamify-log_newpage_range-WAL-logging-path.patch)
  download | inline diff:
From 18f751fea55dcb62c8dcb2d47151a1f87f9ed4ce Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sun, 28 Dec 2025 18:29:07 +0800
Subject: [PATCH v3 5/6] Streamify log_newpage_range() WAL logging path

Refactor log_newpage_range() to use the Read Stream. This allows
prefetching of upcoming relation blocks during bulk WAL logging
operations, overlapping I/O with CPU-intensive XLogInsert and
WAL-writing work.
---
 src/backend/access/transam/xloginsert.c | 26 +++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index a56d5a55282..2075aea7037 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -39,6 +39,7 @@
 #include "replication/origin.h"
 #include "storage/bufmgr.h"
 #include "storage/proc.h"
+#include "storage/read_stream.h"
 #include "utils/memutils.h"
 #include "utils/pgstat_internal.h"
 
@@ -1295,6 +1296,8 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 {
 	int			flags;
 	BlockNumber blkno;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	flags = REGBUF_FORCE_IMAGE;
 	if (page_std)
@@ -1307,6 +1310,23 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 	 */
 	XLogEnsureRecordSpace(XLR_MAX_BLOCK_ID - 1, 0);
 
+	/* Set up a streaming read for the range of blocks */
+	p.current_blocknum = startblk;
+	p.last_exclusive = endblk;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_USE_BATCHING,
+										NULL,
+										rel,
+										forknum,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	blkno = startblk;
 	while (blkno < endblk)
 	{
@@ -1321,8 +1341,7 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 		nbufs = 0;
 		while (nbufs < XLR_MAX_BLOCK_ID && blkno < endblk)
 		{
-			Buffer		buf = ReadBufferExtended(rel, forknum, blkno,
-												 RBM_NORMAL, NULL);
+			Buffer		buf = read_stream_next_buffer(stream, NULL);
 
 			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 
@@ -1361,6 +1380,9 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 		}
 		END_CRIT_SECTION();
 	}
+
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
 }
 
 /*
-- 
2.51.0



  [application/octet-stream] v3-0002-Streamify-Bloom-VACUUM-paths.-n-nUse-streaming-re.patch (4.2K, 4-v3-0002-Streamify-Bloom-VACUUM-paths.-n-nUse-streaming-re.patch)
  download | inline diff:
From aafdfa0851ab61777e8877cfb50e3b34bf54b9b1 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:28:49 +0800
Subject: [PATCH v3 2/6] Streamify Bloom VACUUM paths. Use streaming reads
 in blbulkdelete() and blvacuumcleanup() to iterate index pages without
 repeated ReadBuffer calls, improving VACUUM performance and reducing buffer
 manager overhead during maintenance operations.

---
 contrib/bloom/blvacuum.c | 55 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/contrib/bloom/blvacuum.c b/contrib/bloom/blvacuum.c
index e68a9008f56..7452302f022 100644
--- a/contrib/bloom/blvacuum.c
+++ b/contrib/bloom/blvacuum.c
@@ -17,6 +17,7 @@
 #include "commands/vacuum.h"
 #include "storage/bufmgr.h"
 #include "storage/indexfsm.h"
+#include "storage/read_stream.h"
 
 
 /*
@@ -40,6 +41,8 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	Page		page;
 	BloomMetaPageData *metaData;
 	GenericXLogState *gxlogState;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (stats == NULL)
 		stats = palloc0_object(IndexBulkDeleteResult);
@@ -51,6 +54,25 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	 * they can't contain tuples to delete.
 	 */
 	npages = RelationGetNumberOfBlocks(index);
+
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		BloomTuple *itup,
@@ -59,8 +81,7 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
 
 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 		gxlogState = GenericXLogStart(index);
@@ -133,6 +154,9 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/*
 	 * Update the metapage's notFullPage list with whatever we found.  Our
 	 * info could already be out of date at this point, but blinsert() will
@@ -166,6 +190,8 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	Relation	index = info->index;
 	BlockNumber npages,
 				blkno;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (info->analyze_only)
 		return stats;
@@ -181,6 +207,25 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	stats->num_pages = npages;
 	stats->pages_free = 0;
 	stats->num_index_tuples = 0;
+
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
@@ -188,8 +233,7 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -206,6 +250,9 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	IndexFreeSpaceMapVacuum(info->index);
 
 	return stats;
-- 
2.51.0



  [application/octet-stream] v3-0001-Switch-Bloom-scan-paths-to-streaming-read.-n-nRep.patch (2.4K, 5-v3-0001-Switch-Bloom-scan-paths-to-streaming-read.-n-nRep.patch)
  download | inline diff:
From 314f9cdf6d8a62cc8523b377b8cf19df646b9913 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:28:10 +0800
Subject: [PATCH v3 1/6] Switch Bloom scan paths to streaming read. Replace
 per-page ReadBuffer loops in blgetbitmap() with read_stream_begin_relation()
 and sequential buffer iteration, reducing buffer churn and improving scan
 efficiency on large Bloom indexes.

---
 contrib/bloom/blscan.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/contrib/bloom/blscan.c b/contrib/bloom/blscan.c
index 0d71edbe91c..b1fdabaab74 100644
--- a/contrib/bloom/blscan.c
+++ b/contrib/bloom/blscan.c
@@ -17,6 +17,7 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
+#include "storage/read_stream.h"
 
 /*
  * Begin scan of bloom index.
@@ -75,11 +76,13 @@ int64
 blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 {
 	int64		ntids = 0;
-	BlockNumber blkno = BLOOM_HEAD_BLKNO,
+	BlockNumber blkno,
 				npages;
 	int			i;
 	BufferAccessStrategy bas;
 	BloomScanOpaque so = (BloomScanOpaque) scan->opaque;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (so->sign == NULL)
 	{
@@ -119,14 +122,29 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 	if (scan->instrument)
 		scan->instrument->nsearches++;
 
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										bas,
+										scan->indexRelation,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
 		Page		page;
 
-		buffer = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM,
-									blkno, RBM_NORMAL, bas);
-
+		buffer = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -162,6 +180,9 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 		UnlockReleaseBuffer(buffer);
 		CHECK_FOR_INTERRUPTS();
 	}
+
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
 	FreeAccessStrategy(bas);
 
 	return ntids;
-- 
2.51.0



  [application/octet-stream] v3-0004-Replace-synchronous-ReadBufferExtended-loop-with-.patch (2.5K, 6-v3-0004-Replace-synchronous-ReadBufferExtended-loop-with-.patch)
  download | inline diff:
From db1d1a1143f5a80c9a158dd9ed9d8f05d8e2e1d8 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:29:09 +0800
Subject: [PATCH v3 4/6] Replace synchronous ReadBufferExtended() loop with the
 streaming read in ginvacuumcleanup() to improve I/O efficiency during GIN
 index vacuum cleanup operations

---
 src/backend/access/gin/ginvacuum.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index d7baf7c847c..58e05c71256 100644
--- a/src/backend/access/gin/ginvacuum.c
+++ b/src/backend/access/gin/ginvacuum.c
@@ -22,6 +22,7 @@
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "storage/read_stream.h"
 #include "utils/memutils.h"
 
 struct GinVacuumState
@@ -693,6 +694,8 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	BlockNumber totFreePages;
 	GinState	ginstate;
 	GinStatsData idxStat;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	/*
 	 * In an autovacuum analyze, we want to clean up pending insertions.
@@ -743,6 +746,24 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 	totFreePages = 0;
 
+	/* Scan all blocks starting from the root using streaming reads */
+	p.current_blocknum = GIN_ROOT_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = GIN_ROOT_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
@@ -750,8 +771,8 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
+
 		LockBuffer(buffer, GIN_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -776,6 +797,9 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/* Update the metapage with accurate page and entry counts */
 	idxStat.nTotalPages = npages;
 	ginUpdateStats(info->index, &idxStat, false);
-- 
2.51.0



  [application/octet-stream] v3-0006-Streamify-hash-index-VACUUM-primary-bucket-page-r.patch (5.5K, 7-v3-0006-Streamify-hash-index-VACUUM-primary-bucket-page-r.patch)
  download | inline diff:
From 34004b05915d58aff505d928ab9199b676f06246 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sun, 28 Dec 2025 18:29:28 +0800
Subject: [PATCH v3 6/6] Streamify hash index VACUUM primary bucket page reads

Refactor hashbulkdelete() to use the Read Stream for primary bucket
pages. This enables prefetching of upcoming buckets while the current
one is being processed, improving I/O efficiency during hash index
vacuum operations.
---
 src/backend/access/hash/hash.c   | 80 ++++++++++++++++++++++++++++++--
 src/tools/pgindent/typedefs.list |  1 +
 2 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index e388252afdc..8dc71b926a6 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -30,6 +30,7 @@
 #include "nodes/execnodes.h"
 #include "optimizer/plancat.h"
 #include "pgstat.h"
+#include "storage/read_stream.h"
 #include "utils/fmgrprotos.h"
 #include "utils/index_selfuncs.h"
 #include "utils/rel.h"
@@ -42,12 +43,23 @@ typedef struct
 	Relation	heapRel;		/* heap relation descriptor */
 } HashBuildState;
 
+/* Working state for streaming reads in hashbulkdelete */
+typedef struct
+{
+	HashMetaPage metap;			/* cached metapage for BUCKET_TO_BLKNO */
+	Bucket		next_bucket;	/* next bucket to prefetch */
+	Bucket		max_bucket;		/* stop when next_bucket > max_bucket */
+}			HashBulkDeleteStreamPrivate;
+
 static void hashbuildCallback(Relation index,
 							  ItemPointer tid,
 							  Datum *values,
 							  bool *isnull,
 							  bool tupleIsAlive,
 							  void *state);
+static BlockNumber hash_bulkdelete_read_stream_cb(ReadStream *stream,
+												  void *callback_private_data,
+												  void *per_buffer_data);
 
 
 /*
@@ -450,6 +462,27 @@ hashendscan(IndexScanDesc scan)
 	scan->opaque = NULL;
 }
 
+/*
+ * Read stream callback for hashbulkdelete.
+ *
+ * Returns the block number of the primary page for the next bucket to
+ * vacuum, using the BUCKET_TO_BLKNO mapping from the cached metapage.
+ */
+static BlockNumber
+hash_bulkdelete_read_stream_cb(ReadStream *stream,
+							   void *callback_private_data,
+							   void *per_buffer_data)
+{
+	HashBulkDeleteStreamPrivate *p = callback_private_data;
+	Bucket		bucket;
+
+	if (p->next_bucket > p->max_bucket)
+		return InvalidBlockNumber;
+
+	bucket = p->next_bucket++;
+	return BUCKET_TO_BLKNO(p->metap, bucket);
+}
+
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples.
  * The set of target tuples is specified via a callback routine that tells
@@ -474,6 +507,8 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	Buffer		metabuf = InvalidBuffer;
 	HashMetaPage metap;
 	HashMetaPage cachedmetap;
+	HashBulkDeleteStreamPrivate stream_private;
+	ReadStream *stream = NULL;
 
 	tuples_removed = 0;
 	num_index_tuples = 0;
@@ -494,7 +529,25 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	cur_bucket = 0;
 	cur_maxbucket = orig_maxbucket;
 
-loop_top:
+	/* Set up streaming read for primary bucket pages */
+	stream_private.metap = cachedmetap;
+	stream_private.next_bucket = cur_bucket;
+	stream_private.max_bucket = cur_maxbucket;
+
+	/*
+	 * It is safe to use batchmode as hash_bulkdelete_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										rel,
+										MAIN_FORKNUM,
+										hash_bulkdelete_read_stream_cb,
+										&stream_private,
+										0);
+
+bucket_loop:
 	while (cur_bucket <= cur_maxbucket)
 	{
 		BlockNumber bucket_blkno;
@@ -514,7 +567,8 @@ loop_top:
 		 * We need to acquire a cleanup lock on the primary bucket page to out
 		 * wait concurrent scans before deleting the dead tuples.
 		 */
-		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
+		buf = read_stream_next_buffer(stream, NULL);
+		Assert(BufferIsValid(buf));
 		LockBufferForCleanup(buf);
 		_hash_checkpage(rel, buf, LH_BUCKET_PAGE);
 
@@ -545,6 +599,16 @@ loop_top:
 			{
 				cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
 				Assert(cachedmetap != NULL);
+
+				/*
+				 * Reset stream with updated metadata for remaining buckets.
+				 * The BUCKET_TO_BLKNO mapping depends on hashm_spares[],
+				 * which may have changed.
+				 */
+				stream_private.metap = cachedmetap;
+				stream_private.next_bucket = cur_bucket + 1;
+				stream_private.max_bucket = cur_maxbucket;
+				read_stream_reset(stream);
 			}
 		}
 
@@ -577,9 +641,19 @@ loop_top:
 		cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
 		Assert(cachedmetap != NULL);
 		cur_maxbucket = cachedmetap->hashm_maxbucket;
-		goto loop_top;
+
+		/* Reset stream to process additional buckets from split */
+		stream_private.metap = cachedmetap;
+		stream_private.next_bucket = cur_bucket;
+		stream_private.max_bucket = cur_maxbucket;
+		read_stream_reset(stream);
+		goto bucket_loop;
 	}
 
+	/* Stream should be exhausted since we processed all buckets */
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/* Okay, we're really done.  Update tuple count in metapage. */
 	START_CRIT_SECTION();
 
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index dc6fc28fcab..572be5598f2 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1176,6 +1176,7 @@ HashAggBatch
 HashAggSpill
 HashAllocFunc
 HashBuildState
+HashBulkDeleteStreamPrivate
 HashCompareFunc
 HashCopyFunc
 HashIndexStat
-- 
2.51.0



^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
@ 2025-12-30 02:43               ` Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Xuneng Zhou @ 2025-12-30 02:43 UTC (permalink / raw)
  To: Nazir Bilal Yavuz <[email protected]>; +Cc: pgsql-hackers <[email protected]>

Hi,

On Tue, Dec 30, 2025 at 9:51 AM Xuneng Zhou <[email protected]> wrote:
>
> Hi,
>
> Thanks for looking into this.
>
> On Mon, Dec 29, 2025 at 6:58 PM Nazir Bilal Yavuz <[email protected]> wrote:
> >
> > Hi,
> >
> > On Sun, 28 Dec 2025 at 14:46, Xuneng Zhou <[email protected]> wrote:
> > >
> > > Hi,
> > > >
> > > > Two more to go:
> > > > patch 5: Streamify log_newpage_range() WAL logging path
> > > > patch 6: Streamify hash index VACUUM primary bucket page reads
> > > >
> > > > Benchmarks will be conducted soon.
> > > >
> > >
> > > v6 in the last message has a problem and has not been updated. Attaching
> > > the right one again. Sorry for the noise.
> >
> > 0003 and 0006:
> >
> > You need to add 'StatApproxReadStreamPrivate' and
> > 'HashBulkDeleteStreamPrivate' to the typedefs.list.
>
> Done.
>
> > 0005:
> >
> > @@ -1321,8 +1341,10 @@ log_newpage_range(Relation rel, ForkNumber forknum,
> >          nbufs = 0;
> >          while (nbufs < XLR_MAX_BLOCK_ID && blkno < endblk)
> >          {
> > -            Buffer        buf = ReadBufferExtended(rel, forknum, blkno,
> > -                                                 RBM_NORMAL, NULL);
> > +            Buffer        buf = read_stream_next_buffer(stream, NULL);
> > +
> > +            if (!BufferIsValid(buf))
> > +                break;
> >
> > We are loosening a check here; there should not be an invalid buffer in
> > the stream until the endblk. I think you can remove this
> > BufferIsValid() check, then we can learn if something goes wrong.
>
> My earlier reason for not adding an assert at the end of streaming was the
> potential early break here:
>
> /* Nothing more to do if all remaining blocks were empty. */
> if (nbufs == 0)
>     break;
>
> After looking more closely, it turns out to be a misunderstanding of the logic.
>
> > 0006:
> >
> > You can use read_stream_reset() instead of read_stream_end(), then you
> > can use the same stream with different variables, I believe this is
> > the preferred way.
> >
> > Rest LGTM!
> >
>
> Yeah, reset seems like the more appropriate approach here.
>

I ran pgindent using the updated typedefs.list.

-- 
Best,
Xuneng


Attachments:

  [application/octet-stream] v4-0001-Switch-Bloom-scan-paths-to-streaming-read.-n-nRep.patch (2.4K, 2-v4-0001-Switch-Bloom-scan-paths-to-streaming-read.-n-nRep.patch)
  download | inline diff:
From 314f9cdf6d8a62cc8523b377b8cf19df646b9913 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:28:10 +0800
Subject: [PATCH v4 1/6] Switch Bloom scan paths to streaming read.

Replace per-page ReadBuffer loops in blgetbitmap() with
read_stream_begin_relation() and sequential buffer iteration, reducing buffer
churn and improving scan efficiency on large Bloom indexes.

---
 contrib/bloom/blscan.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/contrib/bloom/blscan.c b/contrib/bloom/blscan.c
index 0d71edbe91c..b1fdabaab74 100644
--- a/contrib/bloom/blscan.c
+++ b/contrib/bloom/blscan.c
@@ -17,6 +17,7 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
+#include "storage/read_stream.h"
 
 /*
  * Begin scan of bloom index.
@@ -75,11 +76,13 @@ int64
 blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 {
 	int64		ntids = 0;
-	BlockNumber blkno = BLOOM_HEAD_BLKNO,
+	BlockNumber blkno,
 				npages;
 	int			i;
 	BufferAccessStrategy bas;
 	BloomScanOpaque so = (BloomScanOpaque) scan->opaque;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (so->sign == NULL)
 	{
@@ -119,14 +122,29 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 	if (scan->instrument)
 		scan->instrument->nsearches++;
 
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										bas,
+										scan->indexRelation,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
 		Page		page;
 
-		buffer = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM,
-									blkno, RBM_NORMAL, bas);
-
+		buffer = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -162,6 +180,9 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 		UnlockReleaseBuffer(buffer);
 		CHECK_FOR_INTERRUPTS();
 	}
+
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
 	FreeAccessStrategy(bas);
 
 	return ntids;
-- 
2.51.0



  [application/octet-stream] v4-0005-Streamify-log_newpage_range-WAL-logging-path.patch (2.3K, 3-v4-0005-Streamify-log_newpage_range-WAL-logging-path.patch)
  download | inline diff:
From 5d286e1a8ecce90921e44a655751f8cc5875458a Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sun, 28 Dec 2025 18:29:07 +0800
Subject: [PATCH v4 5/6] Streamify log_newpage_range() WAL logging path

Refactor log_newpage_range() to use the Read Stream. This allows
prefetching of upcoming relation blocks during bulk WAL logging
operations, overlapping I/O with CPU-intensive XLogInsert and
WAL-writing work.
---
 src/backend/access/transam/xloginsert.c | 26 +++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index a56d5a55282..2075aea7037 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -39,6 +39,7 @@
 #include "replication/origin.h"
 #include "storage/bufmgr.h"
 #include "storage/proc.h"
+#include "storage/read_stream.h"
 #include "utils/memutils.h"
 #include "utils/pgstat_internal.h"
 
@@ -1295,6 +1296,8 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 {
 	int			flags;
 	BlockNumber blkno;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	flags = REGBUF_FORCE_IMAGE;
 	if (page_std)
@@ -1307,6 +1310,23 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 	 */
 	XLogEnsureRecordSpace(XLR_MAX_BLOCK_ID - 1, 0);
 
+	/* Set up a streaming read for the range of blocks */
+	p.current_blocknum = startblk;
+	p.last_exclusive = endblk;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_USE_BATCHING,
+										NULL,
+										rel,
+										forknum,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	blkno = startblk;
 	while (blkno < endblk)
 	{
@@ -1321,8 +1341,7 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 		nbufs = 0;
 		while (nbufs < XLR_MAX_BLOCK_ID && blkno < endblk)
 		{
-			Buffer		buf = ReadBufferExtended(rel, forknum, blkno,
-												 RBM_NORMAL, NULL);
+			Buffer		buf = read_stream_next_buffer(stream, NULL);
 
 			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 
@@ -1361,6 +1380,9 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 		}
 		END_CRIT_SECTION();
 	}
+
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
 }
 
 /*
-- 
2.51.0



  [application/octet-stream] v4-0004-Replace-synchronous-ReadBufferExtended-loop-with-.patch (2.5K, 4-v4-0004-Replace-synchronous-ReadBufferExtended-loop-with-.patch)
  download | inline diff:
From f39c8ffa3ba727f0c1656c7106130981bcfe59d0 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:29:09 +0800
Subject: [PATCH v4 4/6] Replace synchronous ReadBufferExtended() loop with the
 streaming read in ginvacuumcleanup() to improve I/O efficiency during GIN
 index vacuum cleanup operations

---
 src/backend/access/gin/ginvacuum.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index d7baf7c847c..58e05c71256 100644
--- a/src/backend/access/gin/ginvacuum.c
+++ b/src/backend/access/gin/ginvacuum.c
@@ -22,6 +22,7 @@
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "storage/read_stream.h"
 #include "utils/memutils.h"
 
 struct GinVacuumState
@@ -693,6 +694,8 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	BlockNumber totFreePages;
 	GinState	ginstate;
 	GinStatsData idxStat;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	/*
 	 * In an autovacuum analyze, we want to clean up pending insertions.
@@ -743,6 +746,24 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 	totFreePages = 0;
 
+	/* Scan all blocks starting from the root using streaming reads */
+	p.current_blocknum = GIN_ROOT_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = GIN_ROOT_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
@@ -750,8 +771,8 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
+
 		LockBuffer(buffer, GIN_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -776,6 +797,9 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/* Update the metapage with accurate page and entry counts */
 	idxStat.nTotalPages = npages;
 	ginUpdateStats(info->index, &idxStat, false);
-- 
2.51.0



  [application/octet-stream] v4-0002-Streamify-Bloom-VACUUM-paths.-n-nUse-streaming-re.patch (4.2K, 5-v4-0002-Streamify-Bloom-VACUUM-paths.-n-nUse-streaming-re.patch)
  download | inline diff:
From aafdfa0851ab61777e8877cfb50e3b34bf54b9b1 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:28:49 +0800
Subject: [PATCH v4 2/6] Streamify Bloom VACUUM paths.

Use streaming reads in blbulkdelete() and blvacuumcleanup() to iterate index
pages without repeated ReadBuffer calls, improving VACUUM performance and
reducing buffer manager overhead during maintenance operations.

---
 contrib/bloom/blvacuum.c | 55 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/contrib/bloom/blvacuum.c b/contrib/bloom/blvacuum.c
index e68a9008f56..7452302f022 100644
--- a/contrib/bloom/blvacuum.c
+++ b/contrib/bloom/blvacuum.c
@@ -17,6 +17,7 @@
 #include "commands/vacuum.h"
 #include "storage/bufmgr.h"
 #include "storage/indexfsm.h"
+#include "storage/read_stream.h"
 
 
 /*
@@ -40,6 +41,8 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	Page		page;
 	BloomMetaPageData *metaData;
 	GenericXLogState *gxlogState;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (stats == NULL)
 		stats = palloc0_object(IndexBulkDeleteResult);
@@ -51,6 +54,25 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	 * they can't contain tuples to delete.
 	 */
 	npages = RelationGetNumberOfBlocks(index);
+
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		BloomTuple *itup,
@@ -59,8 +81,7 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
 
 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 		gxlogState = GenericXLogStart(index);
@@ -133,6 +154,9 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/*
 	 * Update the metapage's notFullPage list with whatever we found.  Our
 	 * info could already be out of date at this point, but blinsert() will
@@ -166,6 +190,8 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	Relation	index = info->index;
 	BlockNumber npages,
 				blkno;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (info->analyze_only)
 		return stats;
@@ -181,6 +207,25 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	stats->num_pages = npages;
 	stats->pages_free = 0;
 	stats->num_index_tuples = 0;
+
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
@@ -188,8 +233,7 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -206,6 +250,9 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	IndexFreeSpaceMapVacuum(info->index);
 
 	return stats;
-- 
2.51.0



  [application/octet-stream] v4-0006-Streamify-hash-index-VACUUM-primary-bucket-page-r.patch (5.5K, 6-v4-0006-Streamify-hash-index-VACUUM-primary-bucket-page-r.patch)
  download | inline diff:
From 586b8007cff40671f7039d67b77f8e0154e8e782 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sun, 28 Dec 2025 18:29:28 +0800
Subject: [PATCH v4 6/6] Streamify hash index VACUUM primary bucket page reads

Refactor hashbulkdelete() to use the Read Stream for primary bucket
pages. This enables prefetching of upcoming buckets while the current
one is being processed, improving I/O efficiency during hash index
vacuum operations.
---
 src/backend/access/hash/hash.c   | 80 ++++++++++++++++++++++++++++++--
 src/tools/pgindent/typedefs.list |  1 +
 2 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index e388252afdc..01219c0015e 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -30,6 +30,7 @@
 #include "nodes/execnodes.h"
 #include "optimizer/plancat.h"
 #include "pgstat.h"
+#include "storage/read_stream.h"
 #include "utils/fmgrprotos.h"
 #include "utils/index_selfuncs.h"
 #include "utils/rel.h"
@@ -42,12 +43,23 @@ typedef struct
 	Relation	heapRel;		/* heap relation descriptor */
 } HashBuildState;
 
+/* Working state for streaming reads in hashbulkdelete */
+typedef struct
+{
+	HashMetaPage metap;			/* cached metapage for BUCKET_TO_BLKNO */
+	Bucket		next_bucket;	/* next bucket to prefetch */
+	Bucket		max_bucket;		/* stop when next_bucket > max_bucket */
+} HashBulkDeleteStreamPrivate;
+
 static void hashbuildCallback(Relation index,
 							  ItemPointer tid,
 							  Datum *values,
 							  bool *isnull,
 							  bool tupleIsAlive,
 							  void *state);
+static BlockNumber hash_bulkdelete_read_stream_cb(ReadStream *stream,
+												  void *callback_private_data,
+												  void *per_buffer_data);
 
 
 /*
@@ -450,6 +462,27 @@ hashendscan(IndexScanDesc scan)
 	scan->opaque = NULL;
 }
 
+/*
+ * Read stream callback for hashbulkdelete.
+ *
+ * Returns the block number of the primary page for the next bucket to
+ * vacuum, using the BUCKET_TO_BLKNO mapping from the cached metapage.
+ */
+static BlockNumber
+hash_bulkdelete_read_stream_cb(ReadStream *stream,
+							   void *callback_private_data,
+							   void *per_buffer_data)
+{
+	HashBulkDeleteStreamPrivate *p = callback_private_data;
+	Bucket		bucket;
+
+	if (p->next_bucket > p->max_bucket)
+		return InvalidBlockNumber;
+
+	bucket = p->next_bucket++;
+	return BUCKET_TO_BLKNO(p->metap, bucket);
+}
+
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples.
  * The set of target tuples is specified via a callback routine that tells
@@ -474,6 +507,8 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	Buffer		metabuf = InvalidBuffer;
 	HashMetaPage metap;
 	HashMetaPage cachedmetap;
+	HashBulkDeleteStreamPrivate stream_private;
+	ReadStream *stream = NULL;
 
 	tuples_removed = 0;
 	num_index_tuples = 0;
@@ -494,7 +529,25 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	cur_bucket = 0;
 	cur_maxbucket = orig_maxbucket;
 
-loop_top:
+	/* Set up streaming read for primary bucket pages */
+	stream_private.metap = cachedmetap;
+	stream_private.next_bucket = cur_bucket;
+	stream_private.max_bucket = cur_maxbucket;
+
+	/*
+	 * It is safe to use batchmode as hash_bulkdelete_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										rel,
+										MAIN_FORKNUM,
+										hash_bulkdelete_read_stream_cb,
+										&stream_private,
+										0);
+
+bucket_loop:
 	while (cur_bucket <= cur_maxbucket)
 	{
 		BlockNumber bucket_blkno;
@@ -514,7 +567,8 @@ loop_top:
 		 * We need to acquire a cleanup lock on the primary bucket page to out
 		 * wait concurrent scans before deleting the dead tuples.
 		 */
-		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
+		buf = read_stream_next_buffer(stream, NULL);
+		Assert(BufferIsValid(buf));
 		LockBufferForCleanup(buf);
 		_hash_checkpage(rel, buf, LH_BUCKET_PAGE);
 
@@ -545,6 +599,16 @@ loop_top:
 			{
 				cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
 				Assert(cachedmetap != NULL);
+
+				/*
+				 * Reset stream with updated metadata for remaining buckets.
+				 * The BUCKET_TO_BLKNO mapping depends on hashm_spares[],
+				 * which may have changed.
+				 */
+				stream_private.metap = cachedmetap;
+				stream_private.next_bucket = cur_bucket + 1;
+				stream_private.max_bucket = cur_maxbucket;
+				read_stream_reset(stream);
 			}
 		}
 
@@ -577,9 +641,19 @@ loop_top:
 		cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
 		Assert(cachedmetap != NULL);
 		cur_maxbucket = cachedmetap->hashm_maxbucket;
-		goto loop_top;
+
+		/* Reset stream to process additional buckets from split */
+		stream_private.metap = cachedmetap;
+		stream_private.next_bucket = cur_bucket;
+		stream_private.max_bucket = cur_maxbucket;
+		read_stream_reset(stream);
+		goto bucket_loop;
 	}
 
+	/* Stream should be exhausted since we processed all buckets */
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/* Okay, we're really done.  Update tuple count in metapage. */
 	START_CRIT_SECTION();
 
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index dc6fc28fcab..572be5598f2 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1176,6 +1176,7 @@ HashAggBatch
 HashAggSpill
 HashAllocFunc
 HashBuildState
+HashBulkDeleteStreamPrivate
 HashCompareFunc
 HashCopyFunc
 HashIndexStat
-- 
2.51.0



  [application/octet-stream] v4-0003-Streamify-heap-bloat-estimation-scan.-Introduce-a.patch (6.4K, 7-v4-0003-Streamify-heap-bloat-estimation-scan.-Introduce-a.patch)
  download | inline diff:
From 815eddd3f1d65f5485bd68f30e9ee52349ecb4f3 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:29:02 +0800
Subject: [PATCH v4 3/6] Streamify heap bloat estimation scan. Introduce a 
 read-stream callback to skip all-visible pages via VM/FSM lookup and 
 stream-read the rest, reducing page reads and improving pgstattuple_approx 
 execution time on large relations.

---
 contrib/pgstattuple/pgstatapprox.c | 126 ++++++++++++++++++++++-------
 src/tools/pgindent/typedefs.list   |   1 +
 2 files changed, 96 insertions(+), 31 deletions(-)

diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c
index a59ff4e9d4f..9904094d767 100644
--- a/contrib/pgstattuple/pgstatapprox.c
+++ b/contrib/pgstattuple/pgstatapprox.c
@@ -23,6 +23,7 @@
 #include "storage/bufmgr.h"
 #include "storage/freespace.h"
 #include "storage/procarray.h"
+#include "storage/read_stream.h"
 
 PG_FUNCTION_INFO_V1(pgstattuple_approx);
 PG_FUNCTION_INFO_V1(pgstattuple_approx_v1_5);
@@ -45,6 +46,61 @@ typedef struct output_type
 
 #define NUM_OUTPUT_COLUMNS 10
 
+/*
+ * Struct for statapprox_heap read stream callback.
+ */
+typedef struct StatApproxReadStreamPrivate
+{
+	Relation	rel;
+	output_type *stat;
+	BlockNumber current_blocknum;
+	BlockNumber nblocks;
+	BlockNumber scanned;		/* count of pages actually read */
+	Buffer		vmbuffer;		/* for VM lookups */
+} StatApproxReadStreamPrivate;
+
+/*
+ * Read stream callback for statapprox_heap.
+ *
+ * This callback checks the visibility map for each block. If the block is
+ * all-visible, we can get the free space from the FSM without reading the
+ * actual page, and skip to the next block. Only blocks that are not
+ * all-visible are returned for actual reading.
+ */
+static BlockNumber
+statapprox_heap_read_stream_next(ReadStream *stream,
+								 void *callback_private_data,
+								 void *per_buffer_data)
+{
+	StatApproxReadStreamPrivate *p = callback_private_data;
+
+	while (p->current_blocknum < p->nblocks)
+	{
+		BlockNumber blkno = p->current_blocknum++;
+		Size		freespace;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * If the page has only visible tuples, then we can find out the free
+		 * space from the FSM and move on without reading the page.
+		 */
+		if (VM_ALL_VISIBLE(p->rel, blkno, &p->vmbuffer))
+		{
+			freespace = GetRecordedFreeSpace(p->rel, blkno);
+			p->stat->tuple_len += BLCKSZ - freespace;
+			p->stat->free_space += freespace;
+			continue;
+		}
+
+		/* This block needs to be read */
+		p->scanned++;
+		return blkno;
+	}
+
+	return InvalidBlockNumber;
+}
+
 /*
  * This function takes an already open relation and scans its pages,
  * skipping those that have the corresponding visibility map bit set.
@@ -58,53 +114,58 @@ typedef struct output_type
 static void
 statapprox_heap(Relation rel, output_type *stat)
 {
-	BlockNumber scanned,
-				nblocks,
-				blkno;
-	Buffer		vmbuffer = InvalidBuffer;
+	BlockNumber nblocks;
 	BufferAccessStrategy bstrategy;
 	TransactionId OldestXmin;
+	StatApproxReadStreamPrivate p;
+	ReadStream *stream;
 
 	OldestXmin = GetOldestNonRemovableTransactionId(rel);
 	bstrategy = GetAccessStrategy(BAS_BULKREAD);
 
 	nblocks = RelationGetNumberOfBlocks(rel);
-	scanned = 0;
 
-	for (blkno = 0; blkno < nblocks; blkno++)
+	/* Initialize read stream private data */
+	p.rel = rel;
+	p.stat = stat;
+	p.current_blocknum = 0;
+	p.nblocks = nblocks;
+	p.scanned = 0;
+	p.vmbuffer = InvalidBuffer;
+
+	/*
+	 * Create the read stream. We don't use READ_STREAM_USE_BATCHING because
+	 * the callback accesses the visibility map which may need to read VM
+	 * pages. While this shouldn't cause deadlocks, we err on the side of
+	 * caution.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL,
+										bstrategy,
+										rel,
+										MAIN_FORKNUM,
+										statapprox_heap_read_stream_next,
+										&p,
+										0);
+
+	for (;;)
 	{
 		Buffer		buf;
 		Page		page;
 		OffsetNumber offnum,
 					maxoff;
-		Size		freespace;
-
-		CHECK_FOR_INTERRUPTS();
-
-		/*
-		 * If the page has only visible tuples, then we can find out the free
-		 * space from the FSM and move on.
-		 */
-		if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
-		{
-			freespace = GetRecordedFreeSpace(rel, blkno);
-			stat->tuple_len += BLCKSZ - freespace;
-			stat->free_space += freespace;
-			continue;
-		}
+		BlockNumber blkno;
 
-		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
-								 RBM_NORMAL, bstrategy);
+		buf = read_stream_next_buffer(stream, NULL);
+		if (buf == InvalidBuffer)
+			break;
 
 		LockBuffer(buf, BUFFER_LOCK_SHARE);
 
 		page = BufferGetPage(buf);
+		blkno = BufferGetBlockNumber(buf);
 
 		stat->free_space += PageGetExactFreeSpace(page);
 
-		/* We may count the page as scanned even if it's new/empty */
-		scanned++;
-
 		if (PageIsNew(page) || PageIsEmpty(page))
 		{
 			UnlockReleaseBuffer(buf);
@@ -169,6 +230,9 @@ statapprox_heap(Relation rel, output_type *stat)
 		UnlockReleaseBuffer(buf);
 	}
 
+	Assert(p.current_blocknum == nblocks);
+	read_stream_end(stream);
+
 	stat->table_len = (uint64) nblocks * BLCKSZ;
 
 	/*
@@ -179,7 +243,7 @@ statapprox_heap(Relation rel, output_type *stat)
 	 * tuples in all-visible pages, so no correction is needed for that, and
 	 * we already accounted for the space in those pages, too.
 	 */
-	stat->tuple_count = vac_estimate_reltuples(rel, nblocks, scanned,
+	stat->tuple_count = vac_estimate_reltuples(rel, nblocks, p.scanned,
 											   stat->tuple_count);
 
 	/* It's not clear if we could get -1 here, but be safe. */
@@ -190,16 +254,16 @@ statapprox_heap(Relation rel, output_type *stat)
 	 */
 	if (nblocks != 0)
 	{
-		stat->scanned_percent = 100.0 * scanned / nblocks;
+		stat->scanned_percent = 100.0 * p.scanned / nblocks;
 		stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len;
 		stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len;
 		stat->free_percent = 100.0 * stat->free_space / stat->table_len;
 	}
 
-	if (BufferIsValid(vmbuffer))
+	if (BufferIsValid(p.vmbuffer))
 	{
-		ReleaseBuffer(vmbuffer);
-		vmbuffer = InvalidBuffer;
+		ReleaseBuffer(p.vmbuffer);
+		p.vmbuffer = InvalidBuffer;
 	}
 }
 
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 5c88fa92f4e..dc6fc28fcab 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2898,6 +2898,7 @@ StartReplicationCmd
 StartupStatusEnum
 StatEntry
 StatExtEntry
+StatApproxReadStreamPrivate
 StateFileChunk
 StatisticExtInfo
 StatsBuildData
-- 
2.51.0



^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
@ 2026-03-10 06:06                 ` Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Xuneng Zhou @ 2026-03-10 06:06 UTC (permalink / raw)
  To: pgsql-hackers <[email protected]>; +Cc: Nazir Bilal Yavuz <[email protected]>

Hi,

On Mon, Feb 9, 2026 at 6:40 PM Xuneng Zhou <[email protected]> wrote:
>
> Hi,
>
> On Thu, Feb 5, 2026 at 12:01 PM Xuneng Zhou <[email protected]> wrote:
> >
> > Hi,
> >
> > On Tue, Dec 30, 2025 at 10:43 AM Xuneng Zhou <[email protected]> wrote:
> > >
> > > Hi,
> > >
> > > On Tue, Dec 30, 2025 at 9:51 AM Xuneng Zhou <[email protected]> wrote:
> > > >
> > > > Hi,
> > > >
> > > > Thanks for looking into this.
> > > >
> > > > On Mon, Dec 29, 2025 at 6:58 PM Nazir Bilal Yavuz <[email protected]> wrote:
> > > > >
> > > > > Hi,
> > > > >
> > > > > On Sun, 28 Dec 2025 at 14:46, Xuneng Zhou <[email protected]> wrote:
> > > > > >
> > > > > > Hi,
> > > > > > >
> > > > > > > Two more to go:
> > > > > > > patch 5: Streamify log_newpage_range() WAL logging path
> > > > > > > patch 6: Streamify hash index VACUUM primary bucket page reads
> > > > > > >
> > > > > > > Benchmarks will be conducted soon.
> > > > > > >
> > > > > >
> > > > > > > The v6 in the last message had a problem and was not updated. Attaching
> > > > > > > the right one again. Sorry for the noise.
> > > > >
> > > > > 0003 and 0006:
> > > > >
> > > > > You need to add 'StatApproxReadStreamPrivate' and
> > > > > 'HashBulkDeleteStreamPrivate' to the typedefs.list.
> > > >
> > > > Done.
> > > >
> > > > > 0005:
> > > > >
> > > > > @@ -1321,8 +1341,10 @@ log_newpage_range(Relation rel, ForkNumber forknum,
> > > > >          nbufs = 0;
> > > > >          while (nbufs < XLR_MAX_BLOCK_ID && blkno < endblk)
> > > > >          {
> > > > > -            Buffer        buf = ReadBufferExtended(rel, forknum, blkno,
> > > > > -                                                 RBM_NORMAL, NULL);
> > > > > +            Buffer        buf = read_stream_next_buffer(stream, NULL);
> > > > > +
> > > > > +            if (!BufferIsValid(buf))
> > > > > +                break;
> > > > >
> > > > > > We are loosening a check here; there should not be an invalid buffer in
> > > > > > the stream until the endblk. I think you can remove this
> > > > > > BufferIsValid() check, so that we can learn if something goes wrong.
> > > >
> > > > > My earlier reason for not adding an assert at the end of streaming was the
> > > > > potential early break here:
> > > >
> > > > /* Nothing more to do if all remaining blocks were empty. */
> > > > if (nbufs == 0)
> > > >     break;
> > > >
> > > > After looking more closely, it turns out to be a misunderstanding of the logic.
> > > >
> > > > > 0006:
> > > > >
> > > > > > You can use read_stream_reset() instead of read_stream_end(); then you
> > > > > > can reuse the same stream with different variables. I believe this is
> > > > > > the preferred way.
> > > > >
> > > > > Rest LGTM!
> > > > >
> > > >
> > > > Yeah, reset seems like the more appropriate way here.
> > > >
> > >
> > > Run pgindent using the updated typedefs.list.
> > >
> >
> > I've completed benchmarking of the v4 streaming read patches across
> > three I/O methods (io_uring, sync, worker). Tests were run with cold
> > cache on large datasets.
> >
> > --- Settings ---
> >
> > shared_buffers = '8GB'
> > effective_io_concurrency = 200
> > io_method = $IO_METHOD
> > io_workers = $IO_WORKERS
> > io_max_concurrency = $IO_MAX_CONCURRENCY
> > track_io_timing = on
> > autovacuum = off
> > checkpoint_timeout = 1h
> > max_wal_size = 10GB
> > max_parallel_workers_per_gather = 0
> >
> > --- Machine ---
> > CPU: 48-core
> > RAM: 256 GB DDR5
> > Disk: 2 x 1.92 TB NVMe SSD
> >
> > --- Executive Summary ---
> >
> > The patches provide significant benefits for I/O-bound sequential
> > operations, with the greatest improvements seen when using
> > asynchronous I/O methods (io_uring and worker). The synchronous I/O
> > mode shows reduced but still meaningful gains.
> >
> > --- Results by I/O Method ---
> >
> > Best Results: io_method=worker
> >
> > bloom_scan: 4.14x (75.9% faster); 93% fewer reads
> > pgstattuple: 1.59x (37.1% faster); 94% fewer reads
> > hash_vacuum: 1.05x (4.4% faster); 80% fewer reads
> > gin_vacuum: 1.06x (5.6% faster); 15% fewer reads
> > bloom_vacuum: 1.04x (3.9% faster); 76% fewer reads
> > wal_logging: 0.98x (-2.5%, neutral/slightly slower); no change in reads
> >
> > io_method=io_uring
> >
> > bloom_scan: 3.12x (68.0% faster); 93% fewer reads
> > pgstattuple: 1.50x (33.2% faster); 94% fewer reads
> > hash_vacuum: 1.03x (3.3% faster); 80% fewer reads
> > gin_vacuum: 1.02x (2.1% faster); 15% fewer reads
> > bloom_vacuum: 1.03x (3.4% faster); 76% fewer reads
> > wal_logging: 1.00x (-0.5%, neutral); no change in reads
> >
> > io_method=sync (baseline comparison)
> >
> > bloom_scan: 1.20x (16.4% faster); 93% fewer reads
> > pgstattuple: 1.10x (9.0% faster); 94% fewer reads
> > hash_vacuum: 1.01x (0.8% faster); 80% fewer reads
> > gin_vacuum: 1.02x (1.7% faster); 15% fewer reads
> > bloom_vacuum: 1.03x (2.8% faster); 76% fewer reads
> > wal_logging: 0.99x (-0.7%, neutral); no change in reads
> >
> > --- Observations ---
> >
> > Async I/O amplifies streaming benefits: The same patches show 3-4x
> > improvement with worker/io_uring vs 1.2x with sync.
> >
> > I/O operation reduction is consistent: All modes show the same ~93-94%
> > reduction in I/O operations for bloom_scan and pgstattuple.
> >
> > VACUUM operations show modest gains: Despite large I/O reductions
> > (76-80%), wall-clock improvements are smaller (3-15%) since VACUUM has
> > larger CPU overhead (tuple processing, index maintenance, WAL
> > logging).
> >
> > log_newpage_range shows no benefit: The patch provides no improvement (~0.97x).
> >
> > --
> > Best,
> > Xuneng
>
> There was an issue in the wal_log test of the original script.
>
> --- The original benchmark used:
> ALTER TABLE ... SET LOGGED
>
> This path performs a full table rewrite via ATRewriteTable()
> (tablecmds.c). It creates a new relfilenode and copies tuples into it.
> It does not call log_newpage_range() on rewritten pages.
>
> log_newpage_range() may only appear indirectly through the
> pending-sync logic in storage.c, and only when:
>
> wal_level = minimal, and
> relation size < wal_skip_threshold (default 2MB).
>
> Our test tables (1M–20M rows) are far larger than 2MB. In that case,
> PostgreSQL fsyncs the file instead of WAL-logging it. Therefore, the
> previous benchmark measured table rewrite I/O, not the
> log_newpage_range() path.
>
> --- Current design: GIN index build
>
> The benchmark now uses:
> CREATE INDEX ... USING gin (doc_tsv)
>
> This reliably exercises log_newpage_range() because:
> - ginbuild() constructs the index and WAL-logs all new index pages
> using log_newpage_range().
> - This is part of the normal GIN build path, independent of wal_skip_threshold.
> - The streaming-read patch modifies the WAL logging path inside
> log_newpage_range(), which this test directly targets.
>
> --- Results (wal_logging_large)
> worker: 1.00x (+0.5%); no meaningful change in reads
> io_uring: 1.01x (+1.3%); no meaningful change in reads
> sync: 1.01x (+1.1%); no meaningful change in reads
>
> --
> Best,
> Xuneng

Here’s v5 of the patchset. The log_newpage_range() patch (benchmarked as
wal_logging_large) has been removed, as no performance gains were observed
in the benchmark runs.

-- 
Best,
Xuneng


Attachments:

  [application/octet-stream] v5-0001-Switch-Bloom-scan-paths-to-streaming-read.patch (2.3K, 2-v5-0001-Switch-Bloom-scan-paths-to-streaming-read.patch)
  download | inline diff:
From 7e93338b52a4bdde5ce96f59c8f252bca219d7a3 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:28:10 +0800
Subject: [PATCH v5 1/5] Switch Bloom scan paths to streaming read.

Replace per-page ReadBuffer loops in blgetbitmap() with read_stream_begin_relation() and sequential buffer iteration, reducing buffer churn and improving scan efficiency on large Bloom indexes.
---
 contrib/bloom/blscan.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/contrib/bloom/blscan.c b/contrib/bloom/blscan.c
index 0d71edbe91c..b1fdabaab74 100644
--- a/contrib/bloom/blscan.c
+++ b/contrib/bloom/blscan.c
@@ -17,6 +17,7 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
+#include "storage/read_stream.h"
 
 /*
  * Begin scan of bloom index.
@@ -75,11 +76,13 @@ int64
 blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 {
 	int64		ntids = 0;
-	BlockNumber blkno = BLOOM_HEAD_BLKNO,
+	BlockNumber blkno,
 				npages;
 	int			i;
 	BufferAccessStrategy bas;
 	BloomScanOpaque so = (BloomScanOpaque) scan->opaque;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (so->sign == NULL)
 	{
@@ -119,14 +122,29 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 	if (scan->instrument)
 		scan->instrument->nsearches++;
 
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										bas,
+										scan->indexRelation,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
 		Page		page;
 
-		buffer = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM,
-									blkno, RBM_NORMAL, bas);
-
+		buffer = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -162,6 +180,9 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 		UnlockReleaseBuffer(buffer);
 		CHECK_FOR_INTERRUPTS();
 	}
+
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
 	FreeAccessStrategy(bas);
 
 	return ntids;
-- 
2.51.0



  [application/octet-stream] v5-0003-Streamify-heap-bloat-estimation-scan.-Introduce-a.patch (6.4K, 3-v5-0003-Streamify-heap-bloat-estimation-scan.-Introduce-a.patch)
  download | inline diff:
From c79b79497be288b41c1953970a50272044008ef3 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:29:02 +0800
Subject: [PATCH v5 3/5] Streamify heap bloat estimation scan. Introduce a 
 read-stream callback to skip all-visible pages via VM/FSM lookup and 
 stream-read the rest, reducing page reads and improving pgstattuple_approx 
 execution time on large relations.

---
 contrib/pgstattuple/pgstatapprox.c | 126 ++++++++++++++++++++++-------
 src/tools/pgindent/typedefs.list   |   1 +
 2 files changed, 96 insertions(+), 31 deletions(-)

diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c
index a59ff4e9d4f..9904094d767 100644
--- a/contrib/pgstattuple/pgstatapprox.c
+++ b/contrib/pgstattuple/pgstatapprox.c
@@ -23,6 +23,7 @@
 #include "storage/bufmgr.h"
 #include "storage/freespace.h"
 #include "storage/procarray.h"
+#include "storage/read_stream.h"
 
 PG_FUNCTION_INFO_V1(pgstattuple_approx);
 PG_FUNCTION_INFO_V1(pgstattuple_approx_v1_5);
@@ -45,6 +46,61 @@ typedef struct output_type
 
 #define NUM_OUTPUT_COLUMNS 10
 
+/*
+ * Struct for statapprox_heap read stream callback.
+ */
+typedef struct StatApproxReadStreamPrivate
+{
+	Relation	rel;
+	output_type *stat;
+	BlockNumber current_blocknum;
+	BlockNumber nblocks;
+	BlockNumber scanned;		/* count of pages actually read */
+	Buffer		vmbuffer;		/* for VM lookups */
+} StatApproxReadStreamPrivate;
+
+/*
+ * Read stream callback for statapprox_heap.
+ *
+ * This callback checks the visibility map for each block. If the block is
+ * all-visible, we can get the free space from the FSM without reading the
+ * actual page, and skip to the next block. Only blocks that are not
+ * all-visible are returned for actual reading.
+ */
+static BlockNumber
+statapprox_heap_read_stream_next(ReadStream *stream,
+								 void *callback_private_data,
+								 void *per_buffer_data)
+{
+	StatApproxReadStreamPrivate *p = callback_private_data;
+
+	while (p->current_blocknum < p->nblocks)
+	{
+		BlockNumber blkno = p->current_blocknum++;
+		Size		freespace;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * If the page has only visible tuples, then we can find out the free
+		 * space from the FSM and move on without reading the page.
+		 */
+		if (VM_ALL_VISIBLE(p->rel, blkno, &p->vmbuffer))
+		{
+			freespace = GetRecordedFreeSpace(p->rel, blkno);
+			p->stat->tuple_len += BLCKSZ - freespace;
+			p->stat->free_space += freespace;
+			continue;
+		}
+
+		/* This block needs to be read */
+		p->scanned++;
+		return blkno;
+	}
+
+	return InvalidBlockNumber;
+}
+
 /*
  * This function takes an already open relation and scans its pages,
  * skipping those that have the corresponding visibility map bit set.
@@ -58,53 +114,58 @@ typedef struct output_type
 static void
 statapprox_heap(Relation rel, output_type *stat)
 {
-	BlockNumber scanned,
-				nblocks,
-				blkno;
-	Buffer		vmbuffer = InvalidBuffer;
+	BlockNumber nblocks;
 	BufferAccessStrategy bstrategy;
 	TransactionId OldestXmin;
+	StatApproxReadStreamPrivate p;
+	ReadStream *stream;
 
 	OldestXmin = GetOldestNonRemovableTransactionId(rel);
 	bstrategy = GetAccessStrategy(BAS_BULKREAD);
 
 	nblocks = RelationGetNumberOfBlocks(rel);
-	scanned = 0;
 
-	for (blkno = 0; blkno < nblocks; blkno++)
+	/* Initialize read stream private data */
+	p.rel = rel;
+	p.stat = stat;
+	p.current_blocknum = 0;
+	p.nblocks = nblocks;
+	p.scanned = 0;
+	p.vmbuffer = InvalidBuffer;
+
+	/*
+	 * Create the read stream. We don't use READ_STREAM_USE_BATCHING because
+	 * the callback accesses the visibility map which may need to read VM
+	 * pages. While this shouldn't cause deadlocks, we err on the side of
+	 * caution.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL,
+										bstrategy,
+										rel,
+										MAIN_FORKNUM,
+										statapprox_heap_read_stream_next,
+										&p,
+										0);
+
+	for (;;)
 	{
 		Buffer		buf;
 		Page		page;
 		OffsetNumber offnum,
 					maxoff;
-		Size		freespace;
-
-		CHECK_FOR_INTERRUPTS();
-
-		/*
-		 * If the page has only visible tuples, then we can find out the free
-		 * space from the FSM and move on.
-		 */
-		if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
-		{
-			freespace = GetRecordedFreeSpace(rel, blkno);
-			stat->tuple_len += BLCKSZ - freespace;
-			stat->free_space += freespace;
-			continue;
-		}
+		BlockNumber blkno;
 
-		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
-								 RBM_NORMAL, bstrategy);
+		buf = read_stream_next_buffer(stream, NULL);
+		if (buf == InvalidBuffer)
+			break;
 
 		LockBuffer(buf, BUFFER_LOCK_SHARE);
 
 		page = BufferGetPage(buf);
+		blkno = BufferGetBlockNumber(buf);
 
 		stat->free_space += PageGetExactFreeSpace(page);
 
-		/* We may count the page as scanned even if it's new/empty */
-		scanned++;
-
 		if (PageIsNew(page) || PageIsEmpty(page))
 		{
 			UnlockReleaseBuffer(buf);
@@ -169,6 +230,9 @@ statapprox_heap(Relation rel, output_type *stat)
 		UnlockReleaseBuffer(buf);
 	}
 
+	Assert(p.current_blocknum == nblocks);
+	read_stream_end(stream);
+
 	stat->table_len = (uint64) nblocks * BLCKSZ;
 
 	/*
@@ -179,7 +243,7 @@ statapprox_heap(Relation rel, output_type *stat)
 	 * tuples in all-visible pages, so no correction is needed for that, and
 	 * we already accounted for the space in those pages, too.
 	 */
-	stat->tuple_count = vac_estimate_reltuples(rel, nblocks, scanned,
+	stat->tuple_count = vac_estimate_reltuples(rel, nblocks, p.scanned,
 											   stat->tuple_count);
 
 	/* It's not clear if we could get -1 here, but be safe. */
@@ -190,16 +254,16 @@ statapprox_heap(Relation rel, output_type *stat)
 	 */
 	if (nblocks != 0)
 	{
-		stat->scanned_percent = 100.0 * scanned / nblocks;
+		stat->scanned_percent = 100.0 * p.scanned / nblocks;
 		stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len;
 		stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len;
 		stat->free_percent = 100.0 * stat->free_space / stat->table_len;
 	}
 
-	if (BufferIsValid(vmbuffer))
+	if (BufferIsValid(p.vmbuffer))
 	{
-		ReleaseBuffer(vmbuffer);
-		vmbuffer = InvalidBuffer;
+		ReleaseBuffer(p.vmbuffer);
+		p.vmbuffer = InvalidBuffer;
 	}
 }
 
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 5c88fa92f4e..dc6fc28fcab 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2898,6 +2898,7 @@ StartReplicationCmd
 StartupStatusEnum
 StatEntry
 StatExtEntry
+StatApproxReadStreamPrivate
 StateFileChunk
 StatisticExtInfo
 StatsBuildData
-- 
2.51.0



  [application/octet-stream] v5-0004-Replace-synchronous-ReadBufferExtended-loop-with-.patch (2.5K, 4-v5-0004-Replace-synchronous-ReadBufferExtended-loop-with-.patch)
  download | inline diff:
From 94bda668e03149640264971f39236263690e09fb Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:29:09 +0800
Subject: [PATCH v5 4/5] Replace synchronous ReadBufferExtended() loop with the
 streaming read in ginvacuumcleanup() to improve I/O efficiency during GIN
 index vacuum cleanup operations

---
 src/backend/access/gin/ginvacuum.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index d7baf7c847c..58e05c71256 100644
--- a/src/backend/access/gin/ginvacuum.c
+++ b/src/backend/access/gin/ginvacuum.c
@@ -22,6 +22,7 @@
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "storage/read_stream.h"
 #include "utils/memutils.h"
 
 struct GinVacuumState
@@ -693,6 +694,8 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	BlockNumber totFreePages;
 	GinState	ginstate;
 	GinStatsData idxStat;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	/*
 	 * In an autovacuum analyze, we want to clean up pending insertions.
@@ -743,6 +746,24 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 	totFreePages = 0;
 
+	/* Scan all blocks starting from the root using streaming reads */
+	p.current_blocknum = GIN_ROOT_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = GIN_ROOT_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
@@ -750,8 +771,8 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
+
 		LockBuffer(buffer, GIN_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -776,6 +797,9 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/* Update the metapage with accurate page and entry counts */
 	idxStat.nTotalPages = npages;
 	ginUpdateStats(info->index, &idxStat, false);
-- 
2.51.0



  [application/octet-stream] v5-0002-Streamify-Bloom-VACUUM-paths.patch (4.2K, 5-v5-0002-Streamify-Bloom-VACUUM-paths.patch)
  download | inline diff:
From 4307f7dc0735a499d51826402d30d2c420dcd0d4 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:28:49 +0800
Subject: [PATCH v5 2/5] Streamify Bloom VACUUM paths.

Use streaming reads in blbulkdelete() and blvacuumcleanup() to iterate index pages without repeated ReadBuffer calls, improving VACUUM performance and reducing buffer manager overhead during maintenance operations.
---
 contrib/bloom/blvacuum.c | 55 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/contrib/bloom/blvacuum.c b/contrib/bloom/blvacuum.c
index e68a9008f56..7452302f022 100644
--- a/contrib/bloom/blvacuum.c
+++ b/contrib/bloom/blvacuum.c
@@ -17,6 +17,7 @@
 #include "commands/vacuum.h"
 #include "storage/bufmgr.h"
 #include "storage/indexfsm.h"
+#include "storage/read_stream.h"
 
 
 /*
@@ -40,6 +41,8 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	Page		page;
 	BloomMetaPageData *metaData;
 	GenericXLogState *gxlogState;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (stats == NULL)
 		stats = palloc0_object(IndexBulkDeleteResult);
@@ -51,6 +54,25 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	 * they can't contain tuples to delete.
 	 */
 	npages = RelationGetNumberOfBlocks(index);
+
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		BloomTuple *itup,
@@ -59,8 +81,7 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
 
 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 		gxlogState = GenericXLogStart(index);
@@ -133,6 +154,9 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/*
 	 * Update the metapage's notFullPage list with whatever we found.  Our
 	 * info could already be out of date at this point, but blinsert() will
@@ -166,6 +190,8 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	Relation	index = info->index;
 	BlockNumber npages,
 				blkno;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (info->analyze_only)
 		return stats;
@@ -181,6 +207,25 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	stats->num_pages = npages;
 	stats->pages_free = 0;
 	stats->num_index_tuples = 0;
+
+	/* Scan all blocks except the metapage using streaming reads */
+	p.current_blocknum = BLOOM_HEAD_BLKNO;
+	p.last_exclusive = npages;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										index,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
 	{
 		Buffer		buffer;
@@ -188,8 +233,7 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 		vacuum_delay_point(false);
 
-		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
-									RBM_NORMAL, info->strategy);
+		buffer = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 		page = BufferGetPage(buffer);
 
@@ -206,6 +250,9 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	IndexFreeSpaceMapVacuum(info->index);
 
 	return stats;
-- 
2.51.0



  [application/octet-stream] v5-0005-Streamify-hash-index-VACUUM-primary-bucket-page-r.patch (5.5K, 6-v5-0005-Streamify-hash-index-VACUUM-primary-bucket-page-r.patch)
  download | inline diff:
From 11cc2d6bba8f14016ef47fa94c08bf60d987264e Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sun, 28 Dec 2025 18:29:28 +0800
Subject: [PATCH v5 5/5] Streamify hash index VACUUM primary bucket page reads

Refactor hashbulkdelete() to use the read stream API for primary bucket
pages. This enables prefetching of upcoming buckets while the current
one is being processed, improving I/O efficiency during hash index
vacuum operations.
---
 src/backend/access/hash/hash.c   | 80 ++++++++++++++++++++++++++++++--
 src/tools/pgindent/typedefs.list |  1 +
 2 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index e388252afdc..01219c0015e 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -30,6 +30,7 @@
 #include "nodes/execnodes.h"
 #include "optimizer/plancat.h"
 #include "pgstat.h"
+#include "storage/read_stream.h"
 #include "utils/fmgrprotos.h"
 #include "utils/index_selfuncs.h"
 #include "utils/rel.h"
@@ -42,12 +43,23 @@ typedef struct
 	Relation	heapRel;		/* heap relation descriptor */
 } HashBuildState;
 
+/* Working state for streaming reads in hashbulkdelete */
+typedef struct
+{
+	HashMetaPage metap;			/* cached metapage for BUCKET_TO_BLKNO */
+	Bucket		next_bucket;	/* next bucket to prefetch */
+	Bucket		max_bucket;		/* stop when next_bucket > max_bucket */
+} HashBulkDeleteStreamPrivate;
+
 static void hashbuildCallback(Relation index,
 							  ItemPointer tid,
 							  Datum *values,
 							  bool *isnull,
 							  bool tupleIsAlive,
 							  void *state);
+static BlockNumber hash_bulkdelete_read_stream_cb(ReadStream *stream,
+												  void *callback_private_data,
+												  void *per_buffer_data);
 
 
 /*
@@ -450,6 +462,27 @@ hashendscan(IndexScanDesc scan)
 	scan->opaque = NULL;
 }
 
+/*
+ * Read stream callback for hashbulkdelete.
+ *
+ * Returns the block number of the primary page for the next bucket to
+ * vacuum, using the BUCKET_TO_BLKNO mapping from the cached metapage.
+ */
+static BlockNumber
+hash_bulkdelete_read_stream_cb(ReadStream *stream,
+							   void *callback_private_data,
+							   void *per_buffer_data)
+{
+	HashBulkDeleteStreamPrivate *p = callback_private_data;
+	Bucket		bucket;
+
+	if (p->next_bucket > p->max_bucket)
+		return InvalidBlockNumber;
+
+	bucket = p->next_bucket++;
+	return BUCKET_TO_BLKNO(p->metap, bucket);
+}
+
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples.
  * The set of target tuples is specified via a callback routine that tells
@@ -474,6 +507,8 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	Buffer		metabuf = InvalidBuffer;
 	HashMetaPage metap;
 	HashMetaPage cachedmetap;
+	HashBulkDeleteStreamPrivate stream_private;
+	ReadStream *stream = NULL;
 
 	tuples_removed = 0;
 	num_index_tuples = 0;
@@ -494,7 +529,25 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	cur_bucket = 0;
 	cur_maxbucket = orig_maxbucket;
 
-loop_top:
+	/* Set up streaming read for primary bucket pages */
+	stream_private.metap = cachedmetap;
+	stream_private.next_bucket = cur_bucket;
+	stream_private.max_bucket = cur_maxbucket;
+
+	/*
+	 * It is safe to use batchmode as hash_bulkdelete_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										rel,
+										MAIN_FORKNUM,
+										hash_bulkdelete_read_stream_cb,
+										&stream_private,
+										0);
+
+bucket_loop:
 	while (cur_bucket <= cur_maxbucket)
 	{
 		BlockNumber bucket_blkno;
@@ -514,7 +567,8 @@ loop_top:
 		 * We need to acquire a cleanup lock on the primary bucket page to out
 		 * wait concurrent scans before deleting the dead tuples.
 		 */
-		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
+		buf = read_stream_next_buffer(stream, NULL);
+		Assert(BufferIsValid(buf));
 		LockBufferForCleanup(buf);
 		_hash_checkpage(rel, buf, LH_BUCKET_PAGE);
 
@@ -545,6 +599,16 @@ loop_top:
 			{
 				cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
 				Assert(cachedmetap != NULL);
+
+				/*
+				 * Reset stream with updated metadata for remaining buckets.
+				 * The BUCKET_TO_BLKNO mapping depends on hashm_spares[],
+				 * which may have changed.
+				 */
+				stream_private.metap = cachedmetap;
+				stream_private.next_bucket = cur_bucket + 1;
+				stream_private.max_bucket = cur_maxbucket;
+				read_stream_reset(stream);
 			}
 		}
 
@@ -577,9 +641,19 @@ loop_top:
 		cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
 		Assert(cachedmetap != NULL);
 		cur_maxbucket = cachedmetap->hashm_maxbucket;
-		goto loop_top;
+
+		/* Reset stream to process additional buckets from split */
+		stream_private.metap = cachedmetap;
+		stream_private.next_bucket = cur_bucket;
+		stream_private.max_bucket = cur_maxbucket;
+		read_stream_reset(stream);
+		goto bucket_loop;
 	}
 
+	/* Stream should be exhausted since we processed all buckets */
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/* Okay, we're really done.  Update tuple count in metapage. */
 	START_CRIT_SECTION();
 
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index dc6fc28fcab..572be5598f2 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1176,6 +1176,7 @@ HashAggBatch
 HashAggSpill
 HashAllocFunc
 HashBuildState
+HashBulkDeleteStreamPrivate
 HashCompareFunc
 HashCopyFunc
 HashIndexStat
-- 
2.51.0



^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
@ 2026-03-10 10:28                   ` Michael Paquier <[email protected]>
  2026-03-10 13:23                     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 23:04                     ` Re: Streamify more code paths Andres Freund <[email protected]>
  0 siblings, 2 replies; 35+ messages in thread

From: Michael Paquier @ 2026-03-10 10:28 UTC (permalink / raw)
  To: Xuneng Zhou <[email protected]>; +Cc: pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

On Tue, Mar 10, 2026 at 02:06:12PM +0800, Xuneng Zhou wrote:
> Here’s v5 of the patchset. The wal_logging_large patch has been
> removed, as no performance gains were observed in the benchmark runs.

Looking at the numbers you are posting, it is harder to get excited
about the hash, gin, bloom_vacuum and wal_logging cases.  The worker method
seems more efficient, which may show that we are out of the noise level.

The results associated to pgstattuple and the bloom scans are on a
different level for the three methods.

Saying that, it is really nice that you have sent the benchmark.  The
measurement method looks in line with the goal here after review (IO
stats, calculations), and I have taken some time to run it to get an
idea of the difference for these five code paths, as follows (I slightly
edited the script for my own environment; the result is the same):
./run_streaming_benchmark --baseline --io-method=io_uring/worker

I am not much interested in the sync case, so I have tested the two
other methods:

1) method=IO-uring
bloom_scan_large           base=   725.3ms  patch=    99.9ms   7.26x
( 86.2%)  (reads=19676->1294, io_time=688.36->33.69ms)
bloom_vacuum_large         base=  7414.9ms  patch=  7455.2ms   0.99x
( -0.5%)  (reads=48361->11597, io_time=459.02->257.51ms)
pgstattuple_large          base= 12642.9ms  patch= 11873.5ms   1.06x
(  6.1%)  (reads=206945->12983, io_time=6516.70->143.46ms)
gin_vacuum_large           base=  3546.8ms  patch=  2317.9ms   1.53x
( 34.6%)  (reads=20734->17735, io_time=3244.40->2021.53ms)
hash_vacuum_large          base= 12268.5ms  patch= 11751.1ms   1.04x
(  4.2%)  (reads=76677->15606, io_time=1483.10->315.03ms)
wal_logging_large          base= 33713.0ms  patch= 32773.9ms   1.03x
(  2.8%)  (reads=21641->21641, io_time=81.18->77.25ms)

2) method=worker io-workers=3
bloom_scan_large           base=   725.0ms  patch=   465.7ms   1.56x
( 35.8%)  (reads=19676->1294, io_time=688.70->52.20ms)
bloom_vacuum_large         base=  7138.3ms  patch=  7156.0ms   1.00x
( -0.2%)  (reads=48361->11597, io_time=284.56->64.37ms)
pgstattuple_large          base= 12429.3ms  patch= 11916.8ms   1.04x
(  4.1%)  (reads=206945->12983, io_time=6501.91->32.24ms)
gin_vacuum_large           base=  3769.4ms  patch=  3716.7ms   1.01x
(  1.4%)  (reads=20775->17684, io_time=3562.21->3528.14ms)
hash_vacuum_large          base= 11750.1ms  patch= 11289.0ms   1.04x
(  3.9%)  (reads=76677->15606, io_time=1296.03->98.72ms)
wal_logging_large          base= 32862.3ms  patch= 33179.7ms   0.99x
( -1.0%)  (reads=21641->21641, io_time=91.42->90.59ms) 

The bloom scan case is a winner in runtime for both cases, and in
terms of stats we get much better numbers for all of them.  These feel
rather in line with what you have, except for pgstattuple's runtime;
still, its IO numbers feel good.  That's just to say that I'll review
them and try to do something about at least some of the pieces for
this release.
--
Michael


Attachments:

  [application/pgp-signature] signature.asc (833B, 2-signature.asc)
  download

^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
@ 2026-03-10 13:23                     ` Xuneng Zhou <[email protected]>
  2026-03-11 00:16                       ` Re: Streamify more code paths Andres Freund <[email protected]>
  2026-03-11 02:13                       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 07:53                       ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  1 sibling, 3 replies; 35+ messages in thread

From: Xuneng Zhou @ 2026-03-10 13:23 UTC (permalink / raw)
  To: Michael Paquier <[email protected]>; +Cc: pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

Hi Michael,

On Tue, Mar 10, 2026 at 6:28 PM Michael Paquier <[email protected]> wrote:
>
> On Tue, Mar 10, 2026 at 02:06:12PM +0800, Xuneng Zhou wrote:
> > Here’s v5 of the patchset. The wal_logging_large patch has been
> > removed, as no performance gains were observed in the benchmark runs.
>
> Looking at the numbers you are posting, it is harder to get excited
> about the hash, gin, bloom_vacuum and wal_logging.  The worker method
> seems more efficient, may show that we are out of noise level.
> The results associated to pgstattuple and the bloom scans are on a
> different level for the three methods.
>
> Saying that, it is really nice that you have sent the benchmark.  The
> measurement method looks in line with the goal here after review (IO
> stats, calculations), and I have taken some time to run it to get an
> idea of the difference for these five code paths, as of (slightly
> edited the script for my own environment, result is the same):
> ./run_streaming_benchmark --baseline --io-method=io_uring/worker
>
> I am not much interested in the sync case, so I have tested the two
> other methods:
>
> 1) method=IO-uring
> bloom_scan_large           base=   725.3ms  patch=    99.9ms   7.26x
> ( 86.2%)  (reads=19676->1294, io_time=688.36->33.69ms)
> bloom_vacuum_large         base=  7414.9ms  patch=  7455.2ms   0.99x
> ( -0.5%)  (reads=48361->11597, io_time=459.02->257.51ms)
> pgstattuple_large          base= 12642.9ms  patch= 11873.5ms   1.06x
> (  6.1%)  (reads=206945->12983, io_time=6516.70->143.46ms)
> gin_vacuum_large           base=  3546.8ms  patch=  2317.9ms   1.53x
> ( 34.6%)  (reads=20734->17735, io_time=3244.40->2021.53ms)
> hash_vacuum_large          base= 12268.5ms  patch= 11751.1ms   1.04x
> (  4.2%)  (reads=76677->15606, io_time=1483.10->315.03ms)
> wal_logging_large          base= 33713.0ms  patch= 32773.9ms   1.03x
> (  2.8%)  (reads=21641->21641, io_time=81.18->77.25ms)
>
> 2) method=worker io-workers=3
> bloom_scan_large           base=   725.0ms  patch=   465.7ms   1.56x
> ( 35.8%)  (reads=19676->1294, io_time=688.70->52.20ms)
> bloom_vacuum_large         base=  7138.3ms  patch=  7156.0ms   1.00x
> ( -0.2%)  (reads=48361->11597, io_time=284.56->64.37ms)
> pgstattuple_large          base= 12429.3ms  patch= 11916.8ms   1.04x
> (  4.1%)  (reads=206945->12983, io_time=6501.91->32.24ms)
> gin_vacuum_large           base=  3769.4ms  patch=  3716.7ms   1.01x
> (  1.4%)  (reads=20775->17684, io_time=3562.21->3528.14ms)
> hash_vacuum_large          base= 11750.1ms  patch= 11289.0ms   1.04x
> (  3.9%)  (reads=76677->15606, io_time=1296.03->98.72ms)
> wal_logging_large          base= 32862.3ms  patch= 33179.7ms   0.99x
> ( -1.0%)  (reads=21641->21641, io_time=91.42->90.59ms)
>
> The bloom scan case is a winner in runtime for both cases, and in
> terms of stats we get much better numbers for all of them.  These feel
> rather in line with what you have, except for pgstattuple's runtime,
> still its IO numbers feel good.

Thanks for running the benchmarks! The performance gains for hash,
gin, bloom_vacuum, and wal_logging are insignificant, likely because
these workloads are not I/O-bound. The default number of I/O workers
is three, which is fairly conservative. When I ran the benchmark
script with a higher number of I/O workers, some runs showed improved
performance.

> pgstattuple_large          base= 12429.3ms  patch= 11916.8ms   1.04x
> (  4.1%)  (reads=206945->12983, io_time=6501.91->32.24ms)

> pgstattuple_large          base= 12642.9ms  patch= 11873.5ms   1.06x
> (  6.1%)  (reads=206945->12983, io_time=6516.70->143.46ms)

Yeah, this looks somewhat strange. The io_time has been reduced
significantly, which should also lead to a substantial reduction in
runtime.

method=io_uring
pgstattuple_large          base=  5551.5ms  patch=  3498.2ms   1.59x
( 37.0%)  (reads=206945→12983, io_time=2323.49→207.14ms)

I ran the benchmark for this test again with io_uring, and the result
is consistent with previous runs. I’m not sure what might be
contributing to this behavior.

Another code path that showed significant performance improvement is
pgstatindex [1]. I've incorporated the test into the script too. Here
are the results from my testing:

method=worker io-workers=12
pgstatindex_large          base=   233.8ms  patch=    54.1ms   4.32x
( 76.8%)  (reads=27460→1757, io_time=213.94→6.31ms)

method=io_uring
pgstatindex_large          base=   224.2ms  patch=    56.4ms   3.98x
( 74.9%)  (reads=27460→1757, io_time=204.41→4.88ms)

> That's just to say that I'll review
> them and try to do something about at least some of the pieces for
> this release.

Thanks for that.

[1] https://www.postgresql.org/message-id/flat/CABPTF7UeN2o-trr9r7K76rZExnO2M4SLfvTfbUY2CwQjCekgnQ%40mai...

--
Best,
Xuneng


Attachments:

  [application/x-patch] v6-0001-Use-streaming-read-in-pgstatindex-functions.patch (4.4K, 2-v6-0001-Use-streaming-read-in-pgstatindex-functions.patch)
  download | inline diff:
From 2e925f32aada5b5aad4b7a82fe6d76c8db9fb075 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Tue, 10 Mar 2026 20:28:16 +0800
Subject: [PATCH v6] Use streaming read API in pgstatindex functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace synchronous ReadBufferExtended() loops with the streaming read
API in pgstatindex_impl() and pgstathashindex().

Author: Xuneng Zhou <[email protected]>
Reviewed-by: Nazir Bilal Yavuz <[email protected]>
Reviewed-by: wenhui qiu <[email protected]>
Reviewed-by: Shinya Kato <[email protected]>
---
 contrib/pgstattuple/pgstatindex.c | 57 ++++++++++++++++++++++++++-----
 1 file changed, 48 insertions(+), 9 deletions(-)

diff --git a/contrib/pgstattuple/pgstatindex.c b/contrib/pgstattuple/pgstatindex.c
index ef723af1f19..41cafe8559a 100644
--- a/contrib/pgstattuple/pgstatindex.c
+++ b/contrib/pgstattuple/pgstatindex.c
@@ -37,6 +37,7 @@
 #include "funcapi.h"
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
+#include "storage/read_stream.h"
 #include "utils/rel.h"
 #include "utils/varlena.h"
 
@@ -217,6 +218,8 @@ pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo)
 	BlockNumber blkno;
 	BTIndexStat indexStat;
 	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (!IS_INDEX(rel) || !IS_BTREE(rel))
 		ereport(ERROR,
@@ -273,10 +276,26 @@ pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo)
 	indexStat.fragments = 0;
 
 	/*
-	 * Scan all blocks except the metapage
+	 * Scan all blocks except the metapage (0th page) using streaming reads
 	 */
 	nblocks = RelationGetNumberOfBlocks(rel);
 
+	p.current_blocknum = BTREE_METAPAGE + 1;
+	p.last_exclusive = nblocks;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										bstrategy,
+										rel,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = 1; blkno < nblocks; blkno++)
 	{
 		Buffer		buffer;
@@ -285,8 +304,7 @@ pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo)
 
 		CHECK_FOR_INTERRUPTS();
 
-		/* Read and lock buffer */
-		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
+		buffer = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 
 		page = BufferGetPage(buffer);
@@ -322,11 +340,12 @@ pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo)
 		else
 			indexStat.internal_pages++;
 
-		/* Unlock and release buffer */
-		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
-		ReleaseBuffer(buffer);
+		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	relation_close(rel, AccessShareLock);
 
 	/*----------------------------
@@ -600,6 +619,8 @@ pgstathashindex(PG_FUNCTION_ARGS)
 	HashMetaPage metap;
 	float8		free_percent;
 	uint64		total_space;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	/*
 	 * This uses relation_open() and not index_open().  The latter allows
@@ -644,7 +665,23 @@ pgstathashindex(PG_FUNCTION_ARGS)
 	/* prepare access strategy for this index */
 	bstrategy = GetAccessStrategy(BAS_BULKREAD);
 
-	/* Start from blkno 1 as 0th block is metapage */
+	/* Scan all blocks except the metapage (0th page) using streaming reads */
+	p.current_blocknum = HASH_METAPAGE + 1;
+	p.last_exclusive = nblocks;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										bstrategy,
+										rel,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	for (blkno = 1; blkno < nblocks; blkno++)
 	{
 		Buffer		buf;
@@ -652,8 +689,7 @@ pgstathashindex(PG_FUNCTION_ARGS)
 
 		CHECK_FOR_INTERRUPTS();
 
-		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
-								 bstrategy);
+		buf = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buf, BUFFER_LOCK_SHARE);
 		page = BufferGetPage(buf);
 
@@ -698,6 +734,9 @@ pgstathashindex(PG_FUNCTION_ARGS)
 		UnlockReleaseBuffer(buf);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/* Done accessing the index */
 	relation_close(rel, AccessShareLock);
 
-- 
2.51.0



  [application/x-sh] run_streaming_benchmark.sh (28.2K, 3-run_streaming_benchmark.sh)
  download

^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 13:23                     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
@ 2026-03-11 00:16                       ` Andres Freund <[email protected]>
  2026-03-11 02:23                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2 siblings, 1 reply; 35+ messages in thread

From: Andres Freund @ 2026-03-11 00:16 UTC (permalink / raw)
  To: Xuneng Zhou <[email protected]>; +Cc: Michael Paquier <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

Hi,

On 2026-03-10 19:27:59 -0400, Andres Freund wrote:
> > > pgstattuple_large          base= 12429.3ms  patch= 11916.8ms   1.04x
> > > (  4.1%)  (reads=206945->12983, io_time=6501.91->32.24ms)
> > 
> > > pgstattuple_large          base= 12642.9ms  patch= 11873.5ms   1.06x
> > > (  6.1%)  (reads=206945->12983, io_time=6516.70->143.46ms)
> > 
> > Yeah, this looks somewhat strange. The io_time has been reduced
> > significantly, which should also lead to a substantial reduction in
> > runtime.
> 
> It's possible that the bottleneck just moved, e.g to the checksum computation,
> if you have data checksums enabled.
> 
> It's also worth noting that likely each of the test reps measures
> something different, as likely
>   psql_run "$ROOT" "$PORT" -c "UPDATE heap_test SET data = data || '!' WHERE id % 5 = 0;"
> 
> leads to some out-of-page updates.
> 
> You're probably better off deleting some of the data in a transaction that is
> then rolled back. That will also unset all-visible, but won't otherwise change
> the layout, no matter how many test iterations you run.
> 
> 
> I'd also guess that you're seeing a relatively small win because you're
> updating every page. When reading every page from disk, the OS can do
> efficient readahead.  If there are only occasional misses, that does not work.

I think that last one is a big part - if I use
  BEGIN; DELETE FROM heap_test WHERE id % 500 = 0; ROLLBACK;
(which leaves a lot of pages all-visible, with only occasional misses),
I see much bigger wins due to the pgstattuple changes.

                       time buffered          time DIO
w/o read stream        2222.078 ms            2090.239 ms
w   read stream         299.455 ms             155.124 ms

That's with local storage. io_uring, but numbers with worker are similar.


Greetings,

Andres Freund





^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 13:23                     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 00:16                       ` Re: Streamify more code paths Andres Freund <[email protected]>
@ 2026-03-11 02:23                         ` Xuneng Zhou <[email protected]>
  2026-03-11 15:00                           ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Xuneng Zhou @ 2026-03-11 02:23 UTC (permalink / raw)
  To: Andres Freund <[email protected]>; +Cc: Michael Paquier <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

Hi,

On Wed, Mar 11, 2026 at 8:16 AM Andres Freund <[email protected]> wrote:
>
> Hi,
>
> On 2026-03-10 19:27:59 -0400, Andres Freund wrote:
> > > > pgstattuple_large          base= 12429.3ms  patch= 11916.8ms   1.04x
> > > > (  4.1%)  (reads=206945->12983, io_time=6501.91->32.24ms)
> > >
> > > > pgstattuple_large          base= 12642.9ms  patch= 11873.5ms   1.06x
> > > > (  6.1%)  (reads=206945->12983, io_time=6516.70->143.46ms)
> > >
> > > Yeah, this looks somewhat strange. The io_time has been reduced
> > > significantly, which should also lead to a substantial reduction in
> > > runtime.
> >
> > It's possible that the bottleneck just moved, e.g to the checksum computation,
> > if you have data checksums enabled.
> >
> > It's also worth noting that likely each of the test reps measures
> > something different, as likely
> >   psql_run "$ROOT" "$PORT" -c "UPDATE heap_test SET data = data || '!' WHERE id % 5 = 0;"
> >
> > leads to some out-of-page updates.
> >
> > You're probably better off deleting some of the data in a transaction that is
> > then rolled back. That will also unset all-visible, but won't otherwise change
> > the layout, no matter how many test iterations you run.
> >
> >
> > I'd also guess that you're seeing a relatively small win because you're
> > updating every page. When reading every page from disk, the OS can do
> > efficient readahead.  If there are only occasional misses, that does not work.
>
> I think that last one is a big part - if I use
>   BEGIN; DELETE FROM heap_test WHERE id % 500 = 0; ROLLBACK;
> (which leaves a lot of
>
> I see much bigger wins due to the pgstattuple changes.
>
>                        time buffered          time DIO
> w/o read stream        2222.078 ms            2090.239 ms
> w   read stream         299.455 ms             155.124 ms
>
> That's with local storage. io_uring, but numbers with worker are similar.
>

The results look great and interesting. This looks far better than
what I observed in my earlier tests. I’ll run perf for pgstattuple
without the switching to see what is keeping the CPU busy.

-- 
Best,
Xuneng





^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 13:23                     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 00:16                       ` Re: Streamify more code paths Andres Freund <[email protected]>
  2026-03-11 02:23                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
@ 2026-03-11 15:00                           ` Xuneng Zhou <[email protected]>
  2026-03-16 03:45                             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Xuneng Zhou @ 2026-03-11 15:00 UTC (permalink / raw)
  To: Andres Freund <[email protected]>; +Cc: Michael Paquier <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

Hi,

On Wed, Mar 11, 2026 at 10:23 AM Xuneng Zhou <[email protected]> wrote:
>
> Hi,
>
> On Wed, Mar 11, 2026 at 8:16 AM Andres Freund <[email protected]> wrote:
> >
> > Hi,
> >
> > On 2026-03-10 19:27:59 -0400, Andres Freund wrote:
> > > > > pgstattuple_large          base= 12429.3ms  patch= 11916.8ms   1.04x
> > > > > (  4.1%)  (reads=206945->12983, io_time=6501.91->32.24ms)
> > > >
> > > > > pgstattuple_large          base= 12642.9ms  patch= 11873.5ms   1.06x
> > > > > (  6.1%)  (reads=206945->12983, io_time=6516.70->143.46ms)
> > > >
> > > > Yeah, this looks somewhat strange. The io_time has been reduced
> > > > significantly, which should also lead to a substantial reduction in
> > > > runtime.
> > >
> > > It's possible that the bottleneck just moved, e.g to the checksum computation,
> > > if you have data checksums enabled.
> > >
> > > It's also worth noting that likely each of the test reps measures
> > > something different, as likely
> > >   psql_run "$ROOT" "$PORT" -c "UPDATE heap_test SET data = data || '!' WHERE id % 5 = 0;"
> > >
> > > leads to some out-of-page updates.
> > >
> > > You're probably better off deleting some of the data in a transaction that is
> > > then rolled back. That will also unset all-visible, but won't otherwise change
> > > the layout, no matter how many test iterations you run.
> > >
> > >
> > > I'd also guess that you're seeing a relatively small win because you're
> > > updating every page. When reading every page from disk, the OS can do
> > > efficient readahead.  If there are only occasional misses, that does not work.
> >
> > I think that last one is a big part - if I use
> >   BEGIN; DELETE FROM heap_test WHERE id % 500 = 0; ROLLBACK;
> > (which leaves a lot of
> >
> > I see much bigger wins due to the pgstattuple changes.
> >
> >                        time buffered          time DIO
> > w/o read stream        2222.078 ms            2090.239 ms
> > w   read stream         299.455 ms             155.124 ms
> >
> > That's with local storage. io_uring, but numbers with worker are similar.
> >
>
> The results look great and interesting. This looks far better than
> what I observed in my earlier tests. I’ll run perf for pgstattuple
> without the switching to see what is keeping the CPU busy.
>
> --
> Best,
> Xuneng

io_uring
pgstattuple_large          base=  1090.6ms  patch=   143.3ms   7.61x
( 86.9%)  (reads=20049→20049, io_time=1040.80→46.91ms)

I observed a similar magnitude of runtime reduction after switching to
pg_buffercache_evict_relation() and using BEGIN; DELETE FROM heap_test
WHERE id % 500 = 0; ROLLBACK. However, I lost the original flame
graphs after running many performance tests. I will regenerate them
and post them later.

-- 
Best,
Xuneng


Attachments:

  [image/svg+xml] patched_pgstattuple_large_improved.svg (509.5K, 2-patched_pgstattuple_large_improved.svg)
  download | view image

  [image/svg+xml] base_pgstattuple_large_improved.svg (426.7K, 3-base_pgstattuple_large_improved.svg)
  download | view image

^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 13:23                     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 00:16                       ` Re: Streamify more code paths Andres Freund <[email protected]>
  2026-03-11 02:23                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 15:00                           ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
@ 2026-03-16 03:45                             ` Xuneng Zhou <[email protected]>
  0 siblings, 0 replies; 35+ messages in thread

From: Xuneng Zhou @ 2026-03-16 03:45 UTC (permalink / raw)
  To: Andres Freund <[email protected]>; +Cc: Michael Paquier <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

On Wed, Mar 11, 2026 at 11:00 PM Xuneng Zhou <[email protected]> wrote:
>
> Hi,
>
> On Wed, Mar 11, 2026 at 10:23 AM Xuneng Zhou <[email protected]> wrote:
> >
> > Hi,
> >
> > On Wed, Mar 11, 2026 at 8:16 AM Andres Freund <[email protected]> wrote:
> > >
> > > Hi,
> > >
> > > On 2026-03-10 19:27:59 -0400, Andres Freund wrote:
> > > > > > pgstattuple_large          base= 12429.3ms  patch= 11916.8ms   1.04x
> > > > > > (  4.1%)  (reads=206945->12983, io_time=6501.91->32.24ms)
> > > > >
> > > > > > pgstattuple_large          base= 12642.9ms  patch= 11873.5ms   1.06x
> > > > > > (  6.1%)  (reads=206945->12983, io_time=6516.70->143.46ms)
> > > > >
> > > > > Yeah, this looks somewhat strange. The io_time has been reduced
> > > > > significantly, which should also lead to a substantial reduction in
> > > > > runtime.
> > > >
> > > > It's possible that the bottleneck just moved, e.g to the checksum computation,
> > > > if you have data checksums enabled.
> > > >
> > > > It's also worth noting that likely each of the test reps measures
> > > > something different, as likely
> > > >   psql_run "$ROOT" "$PORT" -c "UPDATE heap_test SET data = data || '!' WHERE id % 5 = 0;"
> > > >
> > > > leads to some out-of-page updates.
> > > >
> > > > You're probably better off deleting some of the data in a transaction that is
> > > > then rolled back. That will also unset all-visible, but won't otherwise change
> > > > the layout, no matter how many test iterations you run.
> > > >
> > > >
> > > > I'd also guess that you're seeing a relatively small win because you're
> > > > updating every page. When reading every page from disk, the OS can do
> > > > efficient readahead.  If there are only occasional misses, that does not work.
> > >
> > > I think that last one is a big part - if I use
> > >   BEGIN; DELETE FROM heap_test WHERE id % 500 = 0; ROLLBACK;
> > > (which leaves a lot of
> > >
> > > I see much bigger wins due to the pgstattuple changes.
> > >
> > >                        time buffered          time DIO
> > > w/o read stream        2222.078 ms            2090.239 ms
> > > w   read stream         299.455 ms             155.124 ms
> > >
> > > That's with local storage. io_uring, but numbers with worker are similar.
> > >
> >
> > The results look great and interesting. This looks far better than
> > what I observed in my earlier tests. I’ll run perf for pgstattuple
> > without the switching to see what is keeping the CPU busy.
> >
> > --
> > Best,
> > Xuneng
>
> io_uring
> pgstattuple_large          base=  1090.6ms  patch=   143.3ms   7.61x
> ( 86.9%)  (reads=20049→20049, io_time=1040.80→46.91ms)
>
> I observed a similar magnitude of runtime reduction after switching to
> pg_buffercache_evict_relation() and using BEGIN; DELETE FROM heap_test
> WHERE id % 500 = 0; ROLLBACK. However, I lost the original flame
> graphs after running many performance tests. I will regenerate them
> and post them later.

In the original setup, UPDATE ... WHERE id % 5 = 0 touches a large
fraction of heap pages. On touched pages, we clear PD_ALL_VISIBLE and
the corresponding visibility-map bit, and UPDATE also creates new
tuple versions, adding tuple-chain/page churn. Since
pgstattuple_approx skips only pages still marked all-visible, it ends
up reading most heap pages. With shared-buffer eviction and OS
page-cache drop before timing, many of those reads are cold misses, so
much of the runtime is spent in the buffer-miss path, as seen in the
original flamegraph.

The rollback-delete setup changes this in two ways. First, BEGIN;
DELETE ... WHERE id % 500 = 0; ROLLBACK; still clears all-visible
state on touched pages, but does not leave persistent successor tuple
versions the way UPDATE does. Pages are still modified (tuple
headers/page flags), but physical churn is lower. Second, the
predicate is much sparser, so fewer pages lose all-visible status, and
pgstattuple_approx reads fewer heap pages.

A warmup pass further stabilizes results by setting hint bits for
aborted xmax once up front, so later repetitions avoid repeating part
of transaction-status resolution during visibility checks.

With fewer non-all-visible pages, fewer cold misses, and less
eviction/victim churn, we can observe a more pronounced speed-up in the
improved version of the pgstattuple test.

-- 
Best,
Xuneng


Attachments:

  [image/svg+xml] patch_pgstattuple_original_medium.svg (215.2K, 2-patch_pgstattuple_original_medium.svg)
  download | view image

  [image/svg+xml] patch_pgstattuple_medium.svg (145.2K, 3-patch_pgstattuple_medium.svg)
  download | view image

^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 13:23                     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
@ 2026-03-11 02:13                       ` Xuneng Zhou <[email protected]>
  2 siblings, 0 replies; 35+ messages in thread

From: Xuneng Zhou @ 2026-03-11 02:13 UTC (permalink / raw)
  To: Andres Freund <[email protected]>; +Cc: Michael Paquier <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

Hi,

On Wed, Mar 11, 2026 at 7:28 AM Andres Freund <[email protected]> wrote:
>
> Hi,
>
> On 2026-03-10 21:23:26 +0800, Xuneng Zhou wrote:
> > On Tue, Mar 10, 2026 at 6:28 PM Michael Paquier <[email protected]> wrote:
> > Thanks for running the benchmarks! The performance gains for hash,
> > gin, bloom_vacuum, and wal_logging is insignificant, likely because
> > these workloads are not I/O-bound. The default number of I/O workers
> > is three, which is fairly conservative. When I ran the benchmark
> > script with a higher number of I/O workers, some runs showed improved
> > performance.
>
> FWIW, another thing that may be an issue is that you're restarting postgres
> all the time, as part of drop_caches().  That means we'll spend time reloading
> catalog metadata and initializing shared buffers (the first write to a shared
> buffers page is considerably more expensive than later ones, as the backing
> memory needs to be initialized first).
>
> I found it useful to use the pg_buffercache extension (specifically
> pg_buffercache_evict_relation()) to just drop the relation that is going to be
> tested from shared_buffers.

Good point. I'll switch to using pg_buffercache_evict_relation() to
evict only the target relation, keeping the cluster running. That
should reduce measurement noise to some extent.

>
> > > pgstattuple_large          base= 12429.3ms  patch= 11916.8ms   1.04x
> > > (  4.1%)  (reads=206945->12983, io_time=6501.91->32.24ms)
> >
> > > pgstattuple_large          base= 12642.9ms  patch= 11873.5ms   1.06x
> > > (  6.1%)  (reads=206945->12983, io_time=6516.70->143.46ms)
> >
> > Yeah, this looks somewhat strange. The io_time has been reduced
> > significantly, which should also lead to a substantial reduction in
> > runtime.
>
> It's possible that the bottleneck just moved, e.g to the checksum computation,
> if you have data checksums enabled.
>
> It's also worth noting that likely each of the test reps measures
> something different, as likely
>   psql_run "$ROOT" "$PORT" -c "UPDATE heap_test SET data = data || '!' WHERE id % 5 = 0;"
>
> leads to some out-of-page updates.
>
> You're probably better off deleting some of the data in a transaction that is
> then rolled back. That will also unset all-visible, but won't otherwise change
> the layout, no matter how many test iterations you run.
>
>
> I'd also guess that you're seeing a relatively small win because you're
> updating every page. When reading every page from disk, the OS can do
> efficient readahead.  If there are only occasional misses, that does not work.
>

Yeah, the repeated UPDATE changes the table layout across reps. I'll switch to:

BEGIN;
DELETE FROM heap_test WHERE id % N = 0;
ROLLBACK;

This clears the visibility map bits without altering the physical
layout, so every rep measures the same table state.

>
> > method=io_uring
> > pgstattuple_large          base=  5551.5ms  patch=  3498.2ms   1.59x
> > ( 37.0%)  (reads=206945→12983, io_time=2323.49→207.14ms)
> >
> > I ran the benchmark for this test again with io_uring, and the result
> > is consistent with previous runs. I’m not sure what might be
> > contributing to this behavior.
>
> What does a perf profile show?  Is the query CPU bound?

The runtime in my run of pgstattuple was reduced significantly due to
the reduction in I/O time. I don’t think running perf on my setup
would reveal anything particularly meaningful. The script has an
option to run with perf, so perhaps Michael could try it to see
whether the query becomes CPU-bound, if he’s interested and has time.

> > Another code path that showed significant performance improvement is
> > pgstatindex [1]. I've incorporated the test into the script too. Here
> > are the results from my testing:
> >
> > method=worker io-workers=12
> > pgstatindex_large          base=   233.8ms  patch=    54.1ms   4.32x
> > ( 76.8%)  (reads=27460→1757, io_time=213.94→6.31ms)
> >
> > method=io_uring
> > pgstatindex_large          base=   224.2ms  patch=    56.4ms   3.98x
> > ( 74.9%)  (reads=27460→1757, io_time=204.41→4.88ms)
>
> Nice!
>
>
> Greetings,
>
> Andres Freund



--
Best,
Xuneng





^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 13:23                     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
@ 2026-03-11 07:53                       ` Nazir Bilal Yavuz <[email protected]>
  2026-03-12 03:27                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2 siblings, 1 reply; 35+ messages in thread

From: Nazir Bilal Yavuz @ 2026-03-11 07:53 UTC (permalink / raw)
  To: Xuneng Zhou <[email protected]>; +Cc: Michael Paquier <[email protected]>; pgsql-hackers <[email protected]>

Hi,

On Tue, 10 Mar 2026 at 16:23, Xuneng Zhou <[email protected]> wrote:
>
> Another code path that showed significant performance improvement is
> pgstatindex [1]. I've incorporated the test into the script too. Here
> are the results from my testing:
>
> method=worker io-workers=12
> pgstatindex_large          base=   233.8ms  patch=    54.1ms   4.32x
> ( 76.8%)  (reads=27460→1757, io_time=213.94→6.31ms)
>
> method=io_uring
> pgstatindex_large          base=   224.2ms  patch=    56.4ms   3.98x
> ( 74.9%)  (reads=27460→1757, io_time=204.41→4.88ms)

I didn't run the benchmark yet but here is a small suggestion for the
pgstatindex patch:

+    p.current_blocknum = BTREE_METAPAGE + 1;
+    p.last_exclusive = nblocks;

     for (blkno = 1; blkno < nblocks; blkno++)

...

+    p.current_blocknum = HASH_METAPAGE + 1;
+    p.last_exclusive = nblocks;

     for (blkno = 1; blkno < nblocks; blkno++)

Could you move 'BTREE_METAPAGE + 1' and 'HASH_METAPAGE + 1' into
variables and then set p.current_blocknum and blkno using those
variables? p.current_blocknum and blkno should have the same initial
values; this change makes the code less error-prone and easier to read,
in my opinion.

Other than the comment above, LGTM.

-- 
Regards,
Nazir Bilal Yavuz
Microsoft





^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 13:23                     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 07:53                       ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
@ 2026-03-12 03:27                         ` Xuneng Zhou <[email protected]>
  0 siblings, 0 replies; 35+ messages in thread

From: Xuneng Zhou @ 2026-03-12 03:27 UTC (permalink / raw)
  To: Nazir Bilal Yavuz <[email protected]>; +Cc: Michael Paquier <[email protected]>; pgsql-hackers <[email protected]>

Hi,

On Wed, Mar 11, 2026 at 3:53 PM Nazir Bilal Yavuz <[email protected]> wrote:
>
> Hi,
>
> On Tue, 10 Mar 2026 at 16:23, Xuneng Zhou <[email protected]> wrote:
> >
> > Another code path that showed significant performance improvement is
> > pgstatindex [1]. I've incorporated the test into the script too. Here
> > are the results from my testing:
> >
> > method=worker io-workers=12
> > pgstatindex_large          base=   233.8ms  patch=    54.1ms   4.32x
> > ( 76.8%)  (reads=27460→1757, io_time=213.94→6.31ms)
> >
> > method=io_uring
> > pgstatindex_large          base=   224.2ms  patch=    56.4ms   3.98x
> > ( 74.9%)  (reads=27460→1757, io_time=204.41→4.88ms)
>
> I didn't run the benchmark yet but here is a small suggestion for the
> pgstatindex patch:
>
> +    p.current_blocknum = BTREE_METAPAGE + 1;
> +    p.last_exclusive = nblocks;
>
>      for (blkno = 1; blkno < nblocks; blkno++)
>
> ...
>
> +    p.current_blocknum = HASH_METAPAGE + 1;
> +    p.last_exclusive = nblocks;
>
>      for (blkno = 1; blkno < nblocks; blkno++)
>
> Could you move 'BTREE_METAPAGE + 1' and 'HASH_METAPAGE + 1' into
> variables and then set p.current_blocknum and blkno using those
> variables? p.current_blocknum and blkno should have the same initial
> values, this change makes code less error prone and easier to read in
> my opinion.
>
> Other than the comment above, LGTM.
>

Thanks! That makes sense to me. Please see the patch I’ll post later.

-- 
Best,
Xuneng





^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
@ 2026-03-10 23:04                     ` Andres Freund <[email protected]>
  2026-03-10 23:29                       ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-11 01:37                       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  1 sibling, 2 replies; 35+ messages in thread

From: Andres Freund @ 2026-03-10 23:04 UTC (permalink / raw)
  To: Michael Paquier <[email protected]>; +Cc: Xuneng Zhou <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

Hi,

On 2026-03-10 19:28:29 +0900, Michael Paquier wrote:
> On Tue, Mar 10, 2026 at 02:06:12PM +0800, Xuneng Zhou wrote:
> > Here’s v5 of the patchset. The wal_logging_large patch has been
> > removed, as no performance gains were observed in the benchmark runs.
>
> Looking at the numbers you are posting, it is harder to get excited
> about the hash, gin, bloom_vacuum and wal_logging.

It's perhaps worth emphasizing that, to allow real world usage of direct IO,
we'll need streaming implementation for most of these. Also, on Windows the OS
provided readahead is ... not aggressive, so you'll hit IO stalls much more
frequently than you would on Linux (and some of the BSDs).

It might be a good idea to run the benchmarks with debug_io_direct=data.
That'll make them very slow, since the write side doesn't yet use AIO and thus
will do a lot of synchronous writes, but it should still allow evaluating the
gains from using read stream.


The other thing that's kinda important to evaluate read streams is to test on
higher latency storage, even without direct IO.  Many workloads are not at all
benefiting from AIO when run on a local NVMe SSD with < 10us latency, but are
severely IO bound when run on a cloud storage disk with 0.5ms - 4ms latency.


To be able to test such higher latencies locally, I've found it quite useful
to use dm_delay above a fast disk. See [1].


> The worker method seems more efficient, may show that we are out of noise
> level.

I think that's more likely to show that memory bandwidth, probably due to
checksum computations, is a factor. The memory copy (from the kernel page
cache, with buffered IO) and the checksum computations (when checksums are
enabled) are parallelized by worker, but not by io_uring.


Greetings,

Andres Freund


[1]

  https://docs.kernel.org/admin-guide/device-mapper/delay.html

  Assuming /dev/md0 is mounted to /srv, and a delay of 1ms should be
  introduced for it:

  umount /srv && dmsetup create delayed --table "0 $(blockdev --getsz /dev/md0) delay /dev/md0 0 1" /dev/md0  && mount /dev/mapper/delayed /srv/

  To update the amount of delay to 3ms the following can be used:
  dmsetup suspend delayed && dmsetup reload delayed --table "0 $(blockdev --getsz /dev/md0) delay /dev/md0 0 3" /dev/md0 && dmsetup resume delayed

  (I will often just update the delay to 0 for comparison runs, as that
  doesn't require remounting)





^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 23:04                     ` Re: Streamify more code paths Andres Freund <[email protected]>
@ 2026-03-10 23:29                       ` Michael Paquier <[email protected]>
  2026-03-11 02:22                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  1 sibling, 1 reply; 35+ messages in thread

From: Michael Paquier @ 2026-03-10 23:29 UTC (permalink / raw)
  To: Andres Freund <[email protected]>; +Cc: Xuneng Zhou <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

On Tue, Mar 10, 2026 at 07:04:37PM -0400, Andres Freund wrote:
> It might be a good idea to run the benchmarks with debug_io_direct=data.
> That'll make them very slow, since the write side doesn't yet use AIO and thus
> will do a lot of synchronous writes, but it should still allow to evaluate the
> gains from using read stream.

Ah, thanks for the tip.  I'll go try that.

> The other thing that's kinda important to evaluate read streams is to test on
> higher latency storage, even without direct IO.  Many workloads are not at all
> benefiting from AIO when run on a local NVMe SSD with < 10us latency, but are
> severely IO bound when run on a cloud storage disk with 0.5ms - 4ms latency.

My previous run was on a cloud instance; I don't have access to an SSD
with this amount of latency locally.

One thing that was standing out is the bloom bitmap case that was
looking really nice for a large number of rows, so I have applied
this part.  The rest is going to need a bit more testing to build more
confidence, as far as I can see.
--
Michael


Attachments:

  [application/pgp-signature] signature.asc (833B, 2-signature.asc)
  download

^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 23:04                     ` Re: Streamify more code paths Andres Freund <[email protected]>
  2026-03-10 23:29                       ` Re: Streamify more code paths Michael Paquier <[email protected]>
@ 2026-03-11 02:22                         ` Xuneng Zhou <[email protected]>
  0 siblings, 0 replies; 35+ messages in thread

From: Xuneng Zhou @ 2026-03-11 02:22 UTC (permalink / raw)
  To: Michael Paquier <[email protected]>; +Cc: Andres Freund <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

Hi,

On Wed, Mar 11, 2026 at 7:29 AM Michael Paquier <[email protected]> wrote:
>
> On Tue, Mar 10, 2026 at 07:04:37PM -0400, Andres Freund wrote:
> > It might be a good idea to run the benchmarks with debug_io_direct=data.
> > That'll make them very slow, since the write side doesn't yet use AIO and thus
> > will do a lot of synchronous writes, but it should still allow to evaluate the
> > gains from using read stream.
>
> Ah, thanks for the tip.  I'll go try that.
>
> > The other thing that's kinda important to evaluate read streams is to test on
> > higher latency storage, even without direct IO.  Many workloads are not at all
> > benefiting from AIO when run on a local NVMe SSD with < 10us latency, but are
> > severely IO bound when run on a cloud storage disk with 0.5ms - 4ms latency.
>
> My previous run was on a cloud instance, I don't have access to a SSD
> with this amount of latency locally.
>
> One thing that was standing on is the bloom bitmap case that was
> looking really nice for a large number of rows, so I have applied
> this part.  The rest is going to need a bit more testing to build more
> confidence, as far as I can see.
> --
> Michael

Thanks for pushing that. I’ll update the script with Andres’
suggestions and share it shortly.

-- 
Best,
Xuneng





^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 23:04                     ` Re: Streamify more code paths Andres Freund <[email protected]>
@ 2026-03-11 01:37                       ` Xuneng Zhou <[email protected]>
  2026-03-11 15:11                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  1 sibling, 1 reply; 35+ messages in thread

From: Xuneng Zhou @ 2026-03-11 01:37 UTC (permalink / raw)
  To: Andres Freund <[email protected]>; +Cc: Michael Paquier <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

Hi Andres,

On Wed, Mar 11, 2026 at 7:04 AM Andres Freund <[email protected]> wrote:
>
> Hi,
>
> On 2026-03-10 19:28:29 +0900, Michael Paquier wrote:
> > On Tue, Mar 10, 2026 at 02:06:12PM +0800, Xuneng Zhou wrote:
> > > Here’s v5 of the patchset. The wal_logging_large patch has been
> > > removed, as no performance gains were observed in the benchmark runs.
> >
> > Looking at the numbers you are posting, it is harder to get excited
> > about the hash, gin, bloom_vacuum and wal_logging.
>
> It's perhaps worth emphasizing that, to allow real world usage of direct IO,
> we'll need streaming implementation for most of these. Also, on windows the OS
> provided readahead is ... not aggressive, so you'll hit IO stalls much more
> frequently than you'd on linux (and some of the BSDs).
>
> It might be a good idea to run the benchmarks with debug_io_direct=data.
> That'll make them very slow, since the write side doesn't yet use AIO and thus
> will do a lot of synchronous writes, but it should still allow to evaluate the
> gains from using read stream.
>
>
> The other thing that's kinda important to evaluate read streams is to test on
> higher latency storage, even without direct IO.  Many workloads are not at all
> benefiting from AIO when run on a local NVMe SSD with < 10us latency, but are
> severely IO bound when run on a cloud storage disk with 0.5ms - 4ms latency.
>
>
> To be able to test such higher latencies locally, I've found it quite useful
> to use dm_delay above a fast disk. See [1].

Thanks for the tips! I currently don’t have access to a machine or
cloud instance with slower SSDs or HDDs that have higher latency. I’ll
try running the benchmark with debug_io_direct=data and dm_delay, as
you suggested, to see if the results vary.

>
> > The worker method seems more efficient, may show that we are out of noise
> > level.
>
> I think that's more likely to show that memory bandwidth, probably due to
> checksum computations, is a factor. The memory copy (from the kernel page
> cache, with buffered IO) and the checksum computations (when checksums are
> enabled) are parallelized by worker, but not by io_uring.
>
>
> Greetings,
>
> Andres Freund
>
>
> [1]
>
>   https://docs.kernel.org/admin-guide/device-mapper/delay.html
>
>   Assuming /dev/md0 is mounted to /srv, and a delay of 1ms should be
>   introduced for it:
>
>   umount /srv && dmsetup create delayed --table "0 $(blockdev --getsz /dev/md0) delay /dev/md0 0 1" /dev/md0  && mount /dev/mapper/delayed /srv/
>
>   To update the amount of delay to 3ms the following can be used:
>   dmsetup suspend delayed && dmsetup reload delayed --table "0 $(blockdev --getsz /dev/md0) delay /dev/md0 0 3" /dev/md0 && dmsetup resume delayed
>
>   (I will often just update the delay to 0 for comparison runs, as that
>   doesn't require remounting)



-- 
Best,
Xuneng





^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 23:04                     ` Re: Streamify more code paths Andres Freund <[email protected]>
  2026-03-11 01:37                       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
@ 2026-03-11 15:11                         ` Xuneng Zhou <[email protected]>
  2026-03-11 21:33                           ` Re: Streamify more code paths Michael Paquier <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Xuneng Zhou @ 2026-03-11 15:11 UTC (permalink / raw)
  To: Andres Freund <[email protected]>; +Cc: Michael Paquier <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

On Wed, Mar 11, 2026 at 9:37 AM Xuneng Zhou <[email protected]> wrote:
>
> Hi Andres,
>
> On Wed, Mar 11, 2026 at 7:04 AM Andres Freund <[email protected]> wrote:
> >
> > Hi,
> >
> > On 2026-03-10 19:28:29 +0900, Michael Paquier wrote:
> > > On Tue, Mar 10, 2026 at 02:06:12PM +0800, Xuneng Zhou wrote:
> > > > Here’s v5 of the patchset. The wal_logging_large patch has been
> > > > removed, as no performance gains were observed in the benchmark runs.
> > >
> > > Looking at the numbers you are posting, it is harder to get excited
> > > about the hash, gin, bloom_vacuum and wal_logging.
> >
> > It's perhaps worth emphasizing that, to allow real world usage of direct IO,
> > we'll need streaming implementation for most of these. Also, on windows the OS
> > provided readahead is ... not aggressive, so you'll hit IO stalls much more
> > frequently than you'd on linux (and some of the BSDs).
> >
> > It might be a good idea to run the benchmarks with debug_io_direct=data.
> > That'll make them very slow, since the write side doesn't yet use AIO and thus
> > will do a lot of synchronous writes, but it should still allow to evaluate the
> > gains from using read stream.
> >
> >
> > The other thing that's kinda important to evaluate read streams is to test on
> > higher latency storage, even without direct IO.  Many workloads are not at all
> > benefiting from AIO when run on a local NVMe SSD with < 10us latency, but are
> > severely IO bound when run on a cloud storage disk with 0.5ms - 4ms latency.
> >
> >
> > To be able to test such higher latencies locally, I've found it quite useful
> > to use dm_delay above a fast disk. See [1].
>
> Thanks for the tips! I currently don’t have access to a machine or
> cloud instance with slower SSDs or HDDs that have higher latency. I’ll
> try running the benchmark with debug_io_direct=data and dm_delay, as
> you suggested, to see if the results vary.
>
> >
> > > The worker method seems more efficient, may show that we are out of noise
> > > level.
> >
> > I think that's more likely to show that memory bandwidth, probably due to
> > checksum computations, is a factor. The memory copy (from the kernel page
> > cache, with buffered IO) and the checksum computations (when checksums are
> > enabled) are parallelized by worker, but not by io_uring.
> >
> >
> > Greetings,
> >
> > Andres Freund
> >
> >
> > [1]
> >
> >   https://docs.kernel.org/admin-guide/device-mapper/delay.html
> >
> >   Assuming /dev/md0 is mounted to /srv, and a delay of 1ms should be
> >   introduced for it:
> >
> >   umount /srv && dmsetup create delayed --table "0 $(blockdev --getsz /dev/md0) delay /dev/md0 0 1" /dev/md0  && mount /dev/mapper/delayed /srv/
> >
> >   To update the amount of delay to 3ms the following can be used:
> >   dmsetup suspend delayed && dmsetup reload delayed --table "0 $(blockdev --getsz /dev/md0) delay /dev/md0 0 3" /dev/md0 && dmsetup resume delayed
> >
> >   (I will often just update the delay to 0 for comparison runs, as that
> >   doesn't require remounting)
>

With debug_io_direct=data and dm_delay, the results look quite promising!

medium size / io_uring
gin_vacuum_medium          base=  1619.9ms  patch=   301.8ms   5.37x
( 81.4%)  (reads=1571→947, io_time=1524.86→207.48ms)

The average runtime increases significantly after adding the manual
device delay, so it will take some time to complete all the test runs.
I was also busy with something else today... Once the runs are
finished, I’ll share the results and the script to reproduce them.

-- 
Best,
Xuneng





^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 23:04                     ` Re: Streamify more code paths Andres Freund <[email protected]>
  2026-03-11 01:37                       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 15:11                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
@ 2026-03-11 21:33                           ` Michael Paquier <[email protected]>
  2026-03-12 03:42                             ` Re: Streamify more code paths Michael Paquier <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Michael Paquier @ 2026-03-11 21:33 UTC (permalink / raw)
  To: Xuneng Zhou <[email protected]>; +Cc: Andres Freund <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

On Wed, Mar 11, 2026 at 11:11:23PM +0800, Xuneng Zhou wrote:
> The average runtime increases significantly after adding the manual
> device delay, so it will take some time to complete all the test runs.
> I was also busy with something else today... Once the runs are
> finished, I’ll share the results and the script to reproduce them.

Thanks for doing that.  On my side, I am going to look at the gin and
hash vacuum paths first with more testing as these don't use a custom
callback.  I don't think that I am going to need a lot of convincing,
but I'd rather produce some numbers myself before doing something.
I'll tweak a mounting point with the delay trick, as well.
--
Michael


Attachments:

  [application/pgp-signature] signature.asc (833B, 2-signature.asc)
  download

^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 23:04                     ` Re: Streamify more code paths Andres Freund <[email protected]>
  2026-03-11 01:37                       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 15:11                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 21:33                           ` Re: Streamify more code paths Michael Paquier <[email protected]>
@ 2026-03-12 03:42                             ` Michael Paquier <[email protected]>
  2026-03-12 04:39                               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Michael Paquier @ 2026-03-12 03:42 UTC (permalink / raw)
  To: Xuneng Zhou <[email protected]>; +Cc: Andres Freund <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

On Thu, Mar 12, 2026 at 06:33:08AM +0900, Michael Paquier wrote:
> Thanks for doing that.  On my side, I am going to look at the gin and
> hash vacuum paths first with more testing as these don't use a custom
> callback.  I don't think that I am going to need a lot of convincing,
> but I'd rather produce some numbers myself because doing something.
> I'll tweak a mounting point with the delay trick, as well.

While debug_io_direct has been helping a bit, the trick for the delay
to throttle the IO activity has helped much more with my runtime
numbers.  I have mounted a separate partition with a delay of 5ms,
disabled checksums (this part did not make a real difference), and
evicted shared buffers for relation and indexes before the VACUUM.

Then I got better numbers.  Here is an extract:
- worker=3:
gin_vacuum (100k tuples)   base=  1448.2ms  patch=   572.5ms   2.53x
( 60.5%)  (reads=175→104, io_time=1382.70→506.64ms)
gin_vacuum (300k tuples)   base=  3728.0ms  patch=  1332.0ms   2.80x
( 64.3%)  (reads=486→293, io_time=3669.89→1266.27ms)
bloom_vacuum (100k tuples) base= 21826.8ms  patch= 17220.3ms   1.27x
( 21.1%)  (reads=485→117, io_time=4773.33→270.56ms)
bloom_vacuum (300k tuples) base= 67054.0ms  patch= 53164.7ms   1.26x
( 20.7%)  (reads=1431.5→327.5, io_time=13880.2→381.395ms)
- io_uring:
gin_vacuum (100k tuples)   base=  1240.3ms  patch=   360.5ms   3.44x
( 70.9%)  (reads=175→104, io_time=1175.35→299.75ms) 
gin_vacuum (300k tuples)   base=  2829.9ms  patch=   642.0ms   4.41x
( 77.3%)  (reads=465.5→293, io_time=2768.46→579.04ms)
bloom_vacuum (100k tuples) base= 22121.7ms  patch= 17532.3ms   1.26x
( 20.7%)  (reads=485→117, io_time=4850.46→285.28ms)
bloom_vacuum (300k tuples) base= 67058.0ms  patch= 53118.0ms   1.26x
( 20.8%)  (reads=1431.5→327.5, io_time=13870.9→305.44ms)

The higher the number of tuples, the better the performance for each
individual operation, but the tests take a much longer time (tens of
seconds vs tens of minutes).  For GIN, the numbers can be quite good
once these reads are pushed.  For bloom, the runtime is improved, and
the IO numbers are much better.

At the end, I have applied these two parts.  Remains now the hash
vacuum and the two parts for pgstattuple.
--
Michael


Attachments:

  [application/pgp-signature] signature.asc (833B, 2-signature.asc)
  download

^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 23:04                     ` Re: Streamify more code paths Andres Freund <[email protected]>
  2026-03-11 01:37                       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 15:11                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 21:33                           ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-12 03:42                             ` Re: Streamify more code paths Michael Paquier <[email protected]>
@ 2026-03-12 04:39                               ` Xuneng Zhou <[email protected]>
  2026-03-12 15:35                                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Xuneng Zhou @ 2026-03-12 04:39 UTC (permalink / raw)
  To: Michael Paquier <[email protected]>; +Cc: Andres Freund <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

On Thu, Mar 12, 2026 at 11:42 AM Michael Paquier <[email protected]> wrote:
>
> On Thu, Mar 12, 2026 at 06:33:08AM +0900, Michael Paquier wrote:
> > Thanks for doing that.  On my side, I am going to look at the gin and
> > hash vacuum paths first with more testing as these don't use a custom
> > callback.  I don't think that I am going to need a lot of convincing,
> > but I'd rather produce some numbers myself because doing something.
> > I'll tweak a mounting point with the delay trick, as well.
>
> While debug_io_direct has been helping a bit, the trick for the delay
> to throttle the IO activity has helped much more with my runtime
> numbers.  I have mounted a separate partition with a delay of 5ms,
> disabled checksums (this part did not make a real difference), and
> evicted shared buffers for relation and indexes before the VACUUM.
>
> Then I got better numbers.  Here is an extract:
> - worker=3:
> gin_vacuum (100k tuples)   base=  1448.2ms  patch=   572.5ms   2.53x
> ( 60.5%)  (reads=175→104, io_time=1382.70→506.64ms)
> gin_vacuum (300k tuples)   base=  3728.0ms  patch=  1332.0ms   2.80x
> ( 64.3%)  (reads=486→293, io_time=3669.89→1266.27ms)
> bloom_vacuum (100k tuples) base= 21826.8ms  patch= 17220.3ms   1.27x
> ( 21.1%)  (reads=485→117, io_time=4773.33→270.56ms)
> bloom_vacuum (300k tuples) base= 67054.0ms  patch= 53164.7ms   1.26x
> ( 20.7%)  (reads=1431.5→327.5, io_time=13880.2→381.395ms)
> - io_uring:
> gin_vacuum (100k tuples)   base=  1240.3ms  patch=   360.5ms   3.44x
> ( 70.9%)  (reads=175→104, io_time=1175.35→299.75ms)
> gin_vacuum (300k tuples)   base=  2829.9ms  patch=   642.0ms   4.41x
> ( 77.3%)  (reads=465.5→293, io_time=2768.46→579.04ms)
> bloom_vacuum (100k tuples) base= 22121.7ms  patch= 17532.3ms   1.26x
> ( 20.7%)  (reads=485→117, io_time=4850.46→285.28ms)
> bloom_vacuum (300k tuples) base= 67058.0ms  patch= 53118.0ms   1.26x
> ( 20.8%)  (reads=1431.5→327.5, io_time=13870.9→305.44ms)
>
> The higher the number of tuples, the better the performance for each
> individual operation, but the tests take a much longer time (tens of
> seconds vs tens of minutes).  For GIN, the numbers can be quite good
> once these reads are pushed.  For bloom, the runtime is improved, and
> the IO numbers are much better.
>
> At the end, I have applied these two parts.  Remains now the hash
> vacuum and the two parts for pgstattuple.
> --
> Michael

Thanks for running the benchmarks and pushing!

Here are the results of my test with debug_io_direct and dm_delay:

-- io_uring, medium size

bloom_vacuum_medium        base=  8355.2ms  patch=   715.0ms  11.68x
( 91.4%)  (reads=4732→1056, io_time=7699.47→86.52ms)
pgstattuple_medium         base=  4012.8ms  patch=   213.7ms  18.78x
( 94.7%)  (reads=2006→2006, io_time=4001.66→200.24ms)
pgstatindex_medium         base=  5490.6ms  patch=    37.9ms  144.88x
( 99.3%)  (reads=2745→173, io_time=5481.54→7.82ms)
hash_vacuum_medium         base= 34483.4ms  patch=  2703.5ms  12.75x
( 92.2%)  (reads=19166→3901, io_time=31948.33→308.05ms)
wal_logging_medium         base=  7778.6ms  patch=  7814.5ms   1.00x
( -0.5%)  (reads=2857→2845, io_time=11.84→11.45ms)

-- worker, medium size
bloom_vacuum_medium        base=  8376.2ms  patch=   747.7ms  11.20x
( 91.1%)  (reads=4732→1056, io_time=7688.91→65.49ms)
pgstattuple_medium         base=  4012.7ms  patch=   339.0ms  11.84x
( 91.6%)  (reads=2006→2006, io_time=4002.23→49.99ms)
pgstatindex_medium         base=  5490.3ms  patch=    38.3ms  143.23x
( 99.3%)  (reads=2745→173, io_time=5480.60→16.24ms)
hash_vacuum_medium         base= 34638.4ms  patch=  2940.2ms  11.78x
( 91.5%)  (reads=19166→3901, io_time=31881.61→242.01ms)
wal_logging_medium         base=  7440.1ms  patch=  7434.0ms   1.00x
(  0.1%)  (reads=2861→2825, io_time=10.62→10.71ms)

-- Setting read delay only
sudo dmsetup reload "$DM_DELAY_DEV" --table "0 $size delay $dev 0 $ms $dev 0 0"
Setting dm_delay on delayed to 2ms read / 0ms write

After setting the write delay to 0ms, I can observe more pronounced
speedups overall, since vacuum operation is write-intensive — delaying
writes might dominate the runtime and mask the read-path improvement
we're measuring. It also speeds up the runtime of the test.

-- wal_logging
The wal_logging patch does not seem to benefit from streamification in
this configuration either.

-- Delay setup
For anyone wanting to reproduce the results with a simulated-latency
device, here is the setup I used.

1. Create a 50GB file-backed block device (enough for PG data + indexes)

sudo dd if=/dev/zero of=/srv/delay_disk.img bs=1M count=50000 status=progress
sudo losetup /dev/loop0 /srv/delay_disk.img

2. Create the dm_delay device with 2ms delay
sudo dmsetup create delayed --table "0 $(sudo blockdev --getsz /dev/loop0) delay /dev/loop0 0 2"

3. Format and mount it

sudo mkfs.ext4 /dev/mapper/delayed
sudo mkdir -p /srv/pg_delayed
sudo mount /dev/mapper/delayed /srv/pg_delayed
sudo chown $(whoami) /srv/pg_delayed

4. Run benchmark with WORKROOT pointing to the delayed device

WORKROOT=/srv/pg_delayed SIZES=medium REPS=3 \
  ./run_streaming_benchmark.sh --baseline --io-method io_uring \
    --test gin_vacuum --direct-io --io-delay 2 \
     the targeted patch


--
Best,
Xuneng


Attachments:

  [application/x-patch] v6-0003-Streamify-hash-index-VACUUM-primary-bucket-page-r.patch (5.5K, 2-v6-0003-Streamify-hash-index-VACUUM-primary-bucket-page-r.patch)
  download | inline diff:
From 2adfcd6c16f94e7dadb38ffc6cfed3457b363bf5 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sun, 28 Dec 2025 18:29:28 +0800
Subject: [PATCH v6 3/5] Streamify hash index VACUUM primary bucket page reads

Refactor hashbulkdelete() to use the read stream API for primary bucket
pages. This enables prefetching of upcoming buckets while the current
one is being processed, improving I/O efficiency during hash index
vacuum operations.
---
 src/backend/access/hash/hash.c   | 80 ++++++++++++++++++++++++++++++--
 src/tools/pgindent/typedefs.list |  1 +
 2 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index e88ddb32a05..6df5e7ccbd1 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -30,6 +30,7 @@
 #include "nodes/execnodes.h"
 #include "optimizer/plancat.h"
 #include "pgstat.h"
+#include "storage/read_stream.h"
 #include "utils/fmgrprotos.h"
 #include "utils/index_selfuncs.h"
 #include "utils/rel.h"
@@ -42,12 +43,23 @@ typedef struct
 	Relation	heapRel;		/* heap relation descriptor */
 } HashBuildState;
 
+/* Working state for streaming reads in hashbulkdelete */
+typedef struct
+{
+	HashMetaPage metap;			/* cached metapage for BUCKET_TO_BLKNO */
+	Bucket		next_bucket;	/* next bucket to prefetch */
+	Bucket		max_bucket;		/* stop when next_bucket > max_bucket */
+} HashBulkDeleteStreamPrivate;
+
 static void hashbuildCallback(Relation index,
 							  ItemPointer tid,
 							  Datum *values,
 							  bool *isnull,
 							  bool tupleIsAlive,
 							  void *state);
+static BlockNumber hash_bulkdelete_read_stream_cb(ReadStream *stream,
+												  void *callback_private_data,
+												  void *per_buffer_data);
 
 
 /*
@@ -451,6 +463,27 @@ hashendscan(IndexScanDesc scan)
 	scan->opaque = NULL;
 }
 
+/*
+ * Read stream callback for hashbulkdelete.
+ *
+ * Returns the block number of the primary page for the next bucket to
+ * vacuum, using the BUCKET_TO_BLKNO mapping from the cached metapage.
+ */
+static BlockNumber
+hash_bulkdelete_read_stream_cb(ReadStream *stream,
+							   void *callback_private_data,
+							   void *per_buffer_data)
+{
+	HashBulkDeleteStreamPrivate *p = callback_private_data;
+	Bucket		bucket;
+
+	if (p->next_bucket > p->max_bucket)
+		return InvalidBlockNumber;
+
+	bucket = p->next_bucket++;
+	return BUCKET_TO_BLKNO(p->metap, bucket);
+}
+
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples.
  * The set of target tuples is specified via a callback routine that tells
@@ -475,6 +508,8 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	Buffer		metabuf = InvalidBuffer;
 	HashMetaPage metap;
 	HashMetaPage cachedmetap;
+	HashBulkDeleteStreamPrivate stream_private;
+	ReadStream *stream = NULL;
 
 	tuples_removed = 0;
 	num_index_tuples = 0;
@@ -495,7 +530,25 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	cur_bucket = 0;
 	cur_maxbucket = orig_maxbucket;
 
-loop_top:
+	/* Set up streaming read for primary bucket pages */
+	stream_private.metap = cachedmetap;
+	stream_private.next_bucket = cur_bucket;
+	stream_private.max_bucket = cur_maxbucket;
+
+	/*
+	 * It is safe to use batchmode as hash_bulkdelete_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_USE_BATCHING,
+										info->strategy,
+										rel,
+										MAIN_FORKNUM,
+										hash_bulkdelete_read_stream_cb,
+										&stream_private,
+										0);
+
+bucket_loop:
 	while (cur_bucket <= cur_maxbucket)
 	{
 		BlockNumber bucket_blkno;
@@ -515,7 +568,8 @@ loop_top:
 		 * We need to acquire a cleanup lock on the primary bucket page to out
 		 * wait concurrent scans before deleting the dead tuples.
 		 */
-		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
+		buf = read_stream_next_buffer(stream, NULL);
+		Assert(BufferIsValid(buf));
 		LockBufferForCleanup(buf);
 		_hash_checkpage(rel, buf, LH_BUCKET_PAGE);
 
@@ -546,6 +600,16 @@ loop_top:
 			{
 				cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
 				Assert(cachedmetap != NULL);
+
+				/*
+				 * Reset stream with updated metadata for remaining buckets.
+				 * The BUCKET_TO_BLKNO mapping depends on hashm_spares[],
+				 * which may have changed.
+				 */
+				stream_private.metap = cachedmetap;
+				stream_private.next_bucket = cur_bucket + 1;
+				stream_private.max_bucket = cur_maxbucket;
+				read_stream_reset(stream);
 			}
 		}
 
@@ -578,9 +642,19 @@ loop_top:
 		cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
 		Assert(cachedmetap != NULL);
 		cur_maxbucket = cachedmetap->hashm_maxbucket;
-		goto loop_top;
+
+		/* Reset stream to process additional buckets from split */
+		stream_private.metap = cachedmetap;
+		stream_private.next_bucket = cur_bucket;
+		stream_private.max_bucket = cur_maxbucket;
+		read_stream_reset(stream);
+		goto bucket_loop;
 	}
 
+	/* Stream should be exhausted since we processed all buckets */
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/* Okay, we're really done.  Update tuple count in metapage. */
 	START_CRIT_SECTION();
 
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index a67246138eb..0d60a17bc2c 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1185,6 +1185,7 @@ HashAggBatch
 HashAggSpill
 HashAllocFunc
 HashBuildState
+HashBulkDeleteStreamPrivate
 HashCompareFunc
 HashCopyFunc
 HashIndexStat
-- 
2.51.0



  [application/x-patch] v6-0002-Streamify-heap-bloat-estimation-scan.-Introduce-a.patch (6.4K, 3-v6-0002-Streamify-heap-bloat-estimation-scan.-Introduce-a.patch)
  download | inline diff:
From 4350511d40f5efed2be26c518cbb15c4c8435eb4 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Sat, 27 Dec 2025 00:29:02 +0800
Subject: [PATCH v6 2/5] Streamify heap bloat estimation scan. Introduce a 
 read-stream callback to skip all-visible pages via VM/FSM lookup and 
 stream-read the rest, reducing page reads and improving pgstattuple_approx 
 execution time on large relations.

---
 contrib/pgstattuple/pgstatapprox.c | 126 ++++++++++++++++++++++-------
 src/tools/pgindent/typedefs.list   |   1 +
 2 files changed, 96 insertions(+), 31 deletions(-)

diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c
index 3fad24cf248..68ae7720b31 100644
--- a/contrib/pgstattuple/pgstatapprox.c
+++ b/contrib/pgstattuple/pgstatapprox.c
@@ -23,6 +23,7 @@
 #include "storage/bufmgr.h"
 #include "storage/freespace.h"
 #include "storage/procarray.h"
+#include "storage/read_stream.h"
 
 PG_FUNCTION_INFO_V1(pgstattuple_approx);
 PG_FUNCTION_INFO_V1(pgstattuple_approx_v1_5);
@@ -45,6 +46,61 @@ typedef struct output_type
 
 #define NUM_OUTPUT_COLUMNS 10
 
+/*
+ * Struct for statapprox_heap read stream callback.
+ */
+typedef struct StatApproxReadStreamPrivate
+{
+	Relation	rel;
+	output_type *stat;
+	BlockNumber current_blocknum;
+	BlockNumber nblocks;
+	BlockNumber scanned;		/* count of pages actually read */
+	Buffer		vmbuffer;		/* for VM lookups */
+} StatApproxReadStreamPrivate;
+
+/*
+ * Read stream callback for statapprox_heap.
+ *
+ * This callback checks the visibility map for each block. If the block is
+ * all-visible, we can get the free space from the FSM without reading the
+ * actual page, and skip to the next block. Only blocks that are not
+ * all-visible are returned for actual reading.
+ */
+static BlockNumber
+statapprox_heap_read_stream_next(ReadStream *stream,
+								 void *callback_private_data,
+								 void *per_buffer_data)
+{
+	StatApproxReadStreamPrivate *p = callback_private_data;
+
+	while (p->current_blocknum < p->nblocks)
+	{
+		BlockNumber blkno = p->current_blocknum++;
+		Size		freespace;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * If the page has only visible tuples, then we can find out the free
+		 * space from the FSM and move on without reading the page.
+		 */
+		if (VM_ALL_VISIBLE(p->rel, blkno, &p->vmbuffer))
+		{
+			freespace = GetRecordedFreeSpace(p->rel, blkno);
+			p->stat->tuple_len += BLCKSZ - freespace;
+			p->stat->free_space += freespace;
+			continue;
+		}
+
+		/* This block needs to be read */
+		p->scanned++;
+		return blkno;
+	}
+
+	return InvalidBlockNumber;
+}
+
 /*
  * This function takes an already open relation and scans its pages,
  * skipping those that have the corresponding visibility map bit set.
@@ -58,53 +114,58 @@ typedef struct output_type
 static void
 statapprox_heap(Relation rel, output_type *stat)
 {
-	BlockNumber scanned,
-				nblocks,
-				blkno;
-	Buffer		vmbuffer = InvalidBuffer;
+	BlockNumber nblocks;
 	BufferAccessStrategy bstrategy;
 	TransactionId OldestXmin;
+	StatApproxReadStreamPrivate p;
+	ReadStream *stream;
 
 	OldestXmin = GetOldestNonRemovableTransactionId(rel);
 	bstrategy = GetAccessStrategy(BAS_BULKREAD);
 
 	nblocks = RelationGetNumberOfBlocks(rel);
-	scanned = 0;
 
-	for (blkno = 0; blkno < nblocks; blkno++)
+	/* Initialize read stream private data */
+	p.rel = rel;
+	p.stat = stat;
+	p.current_blocknum = 0;
+	p.nblocks = nblocks;
+	p.scanned = 0;
+	p.vmbuffer = InvalidBuffer;
+
+	/*
+	 * Create the read stream. We don't use READ_STREAM_USE_BATCHING because
+	 * the callback accesses the visibility map which may need to read VM
+	 * pages. While this shouldn't cause deadlocks, we err on the side of
+	 * caution.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL,
+										bstrategy,
+										rel,
+										MAIN_FORKNUM,
+										statapprox_heap_read_stream_next,
+										&p,
+										0);
+
+	for (;;)
 	{
 		Buffer		buf;
 		Page		page;
 		OffsetNumber offnum,
 					maxoff;
-		Size		freespace;
-
-		CHECK_FOR_INTERRUPTS();
-
-		/*
-		 * If the page has only visible tuples, then we can find out the free
-		 * space from the FSM and move on.
-		 */
-		if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
-		{
-			freespace = GetRecordedFreeSpace(rel, blkno);
-			stat->tuple_len += BLCKSZ - freespace;
-			stat->free_space += freespace;
-			continue;
-		}
+		BlockNumber blkno;
 
-		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
-								 RBM_NORMAL, bstrategy);
+		buf = read_stream_next_buffer(stream, NULL);
+		if (buf == InvalidBuffer)
+			break;
 
 		LockBuffer(buf, BUFFER_LOCK_SHARE);
 
 		page = BufferGetPage(buf);
+		blkno = BufferGetBlockNumber(buf);
 
 		stat->free_space += PageGetExactFreeSpace(page);
 
-		/* We may count the page as scanned even if it's new/empty */
-		scanned++;
-
 		if (PageIsNew(page) || PageIsEmpty(page))
 		{
 			UnlockReleaseBuffer(buf);
@@ -169,6 +230,9 @@ statapprox_heap(Relation rel, output_type *stat)
 		UnlockReleaseBuffer(buf);
 	}
 
+	Assert(p.current_blocknum == nblocks);
+	read_stream_end(stream);
+
 	stat->table_len = (uint64) nblocks * BLCKSZ;
 
 	/*
@@ -179,7 +243,7 @@ statapprox_heap(Relation rel, output_type *stat)
 	 * tuples in all-visible pages, so no correction is needed for that, and
 	 * we already accounted for the space in those pages, too.
 	 */
-	stat->tuple_count = vac_estimate_reltuples(rel, nblocks, scanned,
+	stat->tuple_count = vac_estimate_reltuples(rel, nblocks, p.scanned,
 											   stat->tuple_count);
 
 	/* It's not clear if we could get -1 here, but be safe. */
@@ -190,16 +254,16 @@ statapprox_heap(Relation rel, output_type *stat)
 	 */
 	if (nblocks != 0)
 	{
-		stat->scanned_percent = 100.0 * scanned / nblocks;
+		stat->scanned_percent = 100.0 * p.scanned / nblocks;
 		stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len;
 		stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len;
 		stat->free_percent = 100.0 * stat->free_space / stat->table_len;
 	}
 
-	if (BufferIsValid(vmbuffer))
+	if (BufferIsValid(p.vmbuffer))
 	{
-		ReleaseBuffer(vmbuffer);
-		vmbuffer = InvalidBuffer;
+		ReleaseBuffer(p.vmbuffer);
+		p.vmbuffer = InvalidBuffer;
 	}
 }
 
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 141b9d6e077..a67246138eb 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2917,6 +2917,7 @@ StartReplicationCmd
 StartupStatusEnum
 StatEntry
 StatExtEntry
+StatApproxReadStreamPrivate
 StateFileChunk
 StatisticExtInfo
 StatsBuildData
-- 
2.51.0



  [application/x-patch] v6-0005-Use-streaming-read-API-in-pgstatindex-functions.patch (4.5K, 4-v6-0005-Use-streaming-read-API-in-pgstatindex-functions.patch)
  download | inline diff:
From 085abea7e4c6998acdaf0ef96aa759ed30fd1d25 Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Thu, 12 Mar 2026 10:48:38 +0800
Subject: [PATCH v6 5/5] Use streaming read API in pgstatindex functions

Replace synchronous ReadBufferExtended() loops with the streaming read
API in pgstatindex_impl() and pgstathashindex().

Author: Xuneng Zhou <[email protected]>
Reviewed-by: Nazir Bilal Yavuz <[email protected]>
Reviewed-by: wenhui qiu <[email protected]>
Reviewed-by: Shinya Kato <[email protected]>
---
 contrib/pgstattuple/pgstatindex.c | 65 +++++++++++++++++++++++++------
 1 file changed, 54 insertions(+), 11 deletions(-)

diff --git a/contrib/pgstattuple/pgstatindex.c b/contrib/pgstattuple/pgstatindex.c
index ef723af1f19..2b3c351ecff 100644
--- a/contrib/pgstattuple/pgstatindex.c
+++ b/contrib/pgstattuple/pgstatindex.c
@@ -37,6 +37,7 @@
 #include "funcapi.h"
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
+#include "storage/read_stream.h"
 #include "utils/rel.h"
 #include "utils/varlena.h"
 
@@ -217,6 +218,8 @@ pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo)
 	BlockNumber blkno;
 	BTIndexStat indexStat;
 	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	if (!IS_INDEX(rel) || !IS_BTREE(rel))
 		ereport(ERROR,
@@ -273,11 +276,29 @@ pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo)
 	indexStat.fragments = 0;
 
 	/*
-	 * Scan all blocks except the metapage
+	 * Scan all blocks except the metapage (0th page) using streaming reads
 	 */
 	nblocks = RelationGetNumberOfBlocks(rel);
 
-	for (blkno = 1; blkno < nblocks; blkno++)
+	BlockNumber startblk = BTREE_METAPAGE + 1;
+
+	p.current_blocknum = startblk;
+	p.last_exclusive = nblocks;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										bstrategy,
+										rel,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
+	for (blkno = startblk; blkno < nblocks; blkno++)
 	{
 		Buffer		buffer;
 		Page		page;
@@ -285,8 +306,7 @@ pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo)
 
 		CHECK_FOR_INTERRUPTS();
 
-		/* Read and lock buffer */
-		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
+		buffer = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 
 		page = BufferGetPage(buffer);
@@ -322,11 +342,12 @@ pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo)
 		else
 			indexStat.internal_pages++;
 
-		/* Unlock and release buffer */
-		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
-		ReleaseBuffer(buffer);
+		UnlockReleaseBuffer(buffer);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	relation_close(rel, AccessShareLock);
 
 	/*----------------------------
@@ -600,6 +621,8 @@ pgstathashindex(PG_FUNCTION_ARGS)
 	HashMetaPage metap;
 	float8		free_percent;
 	uint64		total_space;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	/*
 	 * This uses relation_open() and not index_open().  The latter allows
@@ -644,16 +667,33 @@ pgstathashindex(PG_FUNCTION_ARGS)
 	/* prepare access strategy for this index */
 	bstrategy = GetAccessStrategy(BAS_BULKREAD);
 
-	/* Start from blkno 1 as 0th block is metapage */
-	for (blkno = 1; blkno < nblocks; blkno++)
+	/* Scan all blocks except the metapage (0th page) using streaming reads */
+	BlockNumber startblk = HASH_METAPAGE + 1;
+
+	p.current_blocknum = startblk;
+	p.last_exclusive = nblocks;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_FULL |
+										READ_STREAM_USE_BATCHING,
+										bstrategy,
+										rel,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
+	for (blkno = startblk; blkno < nblocks; blkno++)
 	{
 		Buffer		buf;
 		Page		page;
 
 		CHECK_FOR_INTERRUPTS();
 
-		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
-								 bstrategy);
+		buf = read_stream_next_buffer(stream, NULL);
 		LockBuffer(buf, BUFFER_LOCK_SHARE);
 		page = BufferGetPage(buf);
 
@@ -698,6 +738,9 @@ pgstathashindex(PG_FUNCTION_ARGS)
 		UnlockReleaseBuffer(buf);
 	}
 
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
+
 	/* Done accessing the index */
 	relation_close(rel, AccessShareLock);
 
-- 
2.51.0



  [application/x-patch] v6-0004-Streamify-log_newpage_range-WAL-logging-path.patch (2.4K, 5-v6-0004-Streamify-log_newpage_range-WAL-logging-path.patch)
  download | inline diff:
From 14f1a6bed27acf0f517140b9b4f0afa4db64777b Mon Sep 17 00:00:00 2001
From: alterego655 <[email protected]>
Date: Thu, 12 Mar 2026 10:41:41 +0800
Subject: [PATCH v6 4/5] Streamify log_newpage_range() WAL logging path

Refactor log_newpage_range() to use the Read Stream API. This allows
prefetching of upcoming relation blocks during bulk WAL logging
operations, overlapping I/O with CPU-intensive XLogInsert and
WAL-writing work.
---
 src/backend/access/transam/xloginsert.c | 26 +++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index ac3c1a78396..71ef1ea2052 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -38,6 +38,7 @@
 #include "pg_trace.h"
 #include "replication/origin.h"
 #include "storage/bufmgr.h"
+#include "storage/read_stream.h"
 #include "storage/proc.h"
 #include "utils/memutils.h"
 #include "utils/pgstat_internal.h"
@@ -1296,6 +1297,8 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 {
 	int			flags;
 	BlockNumber blkno;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream;
 
 	flags = REGBUF_FORCE_IMAGE;
 	if (page_std)
@@ -1308,6 +1311,23 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 	 */
 	XLogEnsureRecordSpace(XLR_MAX_BLOCK_ID - 1, 0);
 
+	/* Set up a streaming read for the range of blocks */
+	p.current_blocknum = startblk;
+	p.last_exclusive = endblk;
+
+	/*
+	 * It is safe to use batchmode as block_range_read_stream_cb takes no
+	 * locks.
+	 */
+	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
+										READ_STREAM_USE_BATCHING,
+										NULL,
+										rel,
+										forknum,
+										block_range_read_stream_cb,
+										&p,
+										0);
+
 	blkno = startblk;
 	while (blkno < endblk)
 	{
@@ -1322,8 +1342,7 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 		nbufs = 0;
 		while (nbufs < XLR_MAX_BLOCK_ID && blkno < endblk)
 		{
-			Buffer		buf = ReadBufferExtended(rel, forknum, blkno,
-												 RBM_NORMAL, NULL);
+			Buffer		buf = read_stream_next_buffer(stream, NULL);
 
 			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 
@@ -1363,6 +1382,9 @@ log_newpage_range(Relation rel, ForkNumber forknum,
 		for (i = 0; i < nbufs; i++)
 			UnlockReleaseBuffer(bufpack[i]);
 	}
+
+	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+	read_stream_end(stream);
 }
 
 /*
-- 
2.51.0



  [application/x-sh] run_streaming_benchmark.sh (32.6K, 6-run_streaming_benchmark.sh)
  download

^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 23:04                     ` Re: Streamify more code paths Andres Freund <[email protected]>
  2026-03-11 01:37                       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 15:11                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 21:33                           ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-12 03:42                             ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-12 04:39                               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
@ 2026-03-12 15:35                                 ` Xuneng Zhou <[email protected]>
  2026-03-13 01:49                                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Xuneng Zhou @ 2026-03-12 15:35 UTC (permalink / raw)
  To: Michael Paquier <[email protected]>; +Cc: Andres Freund <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

On Thu, Mar 12, 2026 at 12:39 PM Xuneng Zhou <[email protected]> wrote:
>
> On Thu, Mar 12, 2026 at 11:42 AM Michael Paquier <[email protected]> wrote:
> >
> > On Thu, Mar 12, 2026 at 06:33:08AM +0900, Michael Paquier wrote:
> > > Thanks for doing that.  On my side, I am going to look at the gin and
> > > hash vacuum paths first with more testing as these don't use a custom
> > > callback.  I don't think that I am going to need a lot of convincing,
> > > but I'd rather produce some numbers myself because doing something.
> > > I'll tweak a mounting point with the delay trick, as well.
> >
> > While debug_io_direct has been helping a bit, the trick for the delay
> > to throttle the IO activity has helped much more with my runtime
> > numbers.  I have mounted a separate partition with a delay of 5ms,
> > disabled checksums (this part did not make a real difference), and
> > evicted shared buffers for relation and indexes before the VACUUM.
> >
> > Then I got better numbers.  Here is an extract:
> > - worker=3:
> > gin_vacuum (100k tuples)   base=  1448.2ms  patch=   572.5ms   2.53x
> > ( 60.5%)  (reads=175→104, io_time=1382.70→506.64ms)
> > gin_vacuum (300k tuples)   base=  3728.0ms  patch=  1332.0ms   2.80x
> > ( 64.3%)  (reads=486→293, io_time=3669.89→1266.27ms)
> > bloom_vacuum (100k tuples) base= 21826.8ms  patch= 17220.3ms   1.27x
> > ( 21.1%)  (reads=485→117, io_time=4773.33→270.56ms)
> > bloom_vacuum (300k tuples) base= 67054.0ms  patch= 53164.7ms   1.26x
> > ( 20.7%)  (reads=1431.5→327.5, io_time=13880.2→381.395ms)
> > - io_uring:
> > gin_vacuum (100k tuples)   base=  1240.3ms  patch=   360.5ms   3.44x
> > ( 70.9%)  (reads=175→104, io_time=1175.35→299.75ms)
> > gin_vacuum (300k tuples)   base=  2829.9ms  patch=   642.0ms   4.41x
> > ( 77.3%)  (reads=465.5→293, io_time=2768.46→579.04ms)
> > bloom_vacuum (100k tuples) base= 22121.7ms  patch= 17532.3ms   1.26x
> > ( 20.7%)  (reads=485→117, io_time=4850.46→285.28ms)
> > bloom_vacuum (300k tuples) base= 67058.0ms  patch= 53118.0ms   1.26x
> > ( 20.8%)  (reads=1431.5→327.5, io_time=13870.9→305.44ms)
> >
> > The higher the number of tuples, the better the performance for each
> > individual operation, but the tests take a much longer time (tens of
> > seconds vs tens of minutes).  For GIN, the numbers can be quite good
> > once these reads are pushed.  For bloom, the runtime is improved, and
> > the IO numbers are much better.
> >
>
> -- io_uring, medium size
>
> bloom_vacuum_medium        base=  8355.2ms  patch=   715.0ms  11.68x
> ( 91.4%)  (reads=4732→1056, io_time=7699.47→86.52ms)
> pgstattuple_medium         base=  4012.8ms  patch=   213.7ms  18.78x
> ( 94.7%)  (reads=2006→2006, io_time=4001.66→200.24ms)
> pgstatindex_medium         base=  5490.6ms  patch=    37.9ms  144.88x
> ( 99.3%)  (reads=2745→173, io_time=5481.54→7.82ms)
> hash_vacuum_medium         base= 34483.4ms  patch=  2703.5ms  12.75x
> ( 92.2%)  (reads=19166→3901, io_time=31948.33→308.05ms)
> wal_logging_medium         base=  7778.6ms  patch=  7814.5ms   1.00x
> ( -0.5%)  (reads=2857→2845, io_time=11.84→11.45ms)
>
> -- worker, medium size
> bloom_vacuum_medium        base=  8376.2ms  patch=   747.7ms  11.20x
> ( 91.1%)  (reads=4732→1056, io_time=7688.91→65.49ms)
> pgstattuple_medium         base=  4012.7ms  patch=   339.0ms  11.84x
> ( 91.6%)  (reads=2006→2006, io_time=4002.23→49.99ms)
> pgstatindex_medium         base=  5490.3ms  patch=    38.3ms  143.23x
> ( 99.3%)  (reads=2745→173, io_time=5480.60→16.24ms)
> hash_vacuum_medium         base= 34638.4ms  patch=  2940.2ms  11.78x
> ( 91.5%)  (reads=19166→3901, io_time=31881.61→242.01ms)
> wal_logging_medium         base=  7440.1ms  patch=  7434.0ms   1.00x
> (  0.1%)  (reads=2861→2825, io_time=10.62→10.71ms)
>

Our io_time metric currently measures only read time and ignores write
I/O, which can be misleading. We now separate it into read_time and
write_time.

-- write-delay 2 ms
WORKROOT=/srv/pg_delayed SIZES=small REPS=3
./run_streaming_benchmark.sh --baseline --io-method worker
--io-workers 12 --test hash_vacuum --direct-io --read-delay 2
--write-delay 2
v6-0004-Streamify-hash-index-VACUUM-primary-bucket-page-r.patch

hash_vacuum_small          base= 16652.8ms  patch= 13493.2ms   1.23x
( 19.0%)  (reads=2338→815, read_time=4136.19→884.79ms,
writes=6218→6206, write_time=12313.81→12289.58ms)

-- write-delay 0 ms
WORKROOT=/srv/pg_delayed SIZES=small REPS=3
./run_streaming_benchmark.sh --baseline --io-method worker
--io-workers 12 --test hash_vacuum --direct-io --read-delay 2
--write-delay 0
v6-0004-Streamify-hash-index-VACUUM-primary-bucket-page-r.patch

hash_vacuum_small          base=  4310.2ms  patch=  1146.7ms   3.76x
( 73.4%)  (reads=2338→815, read_time=4002.24→833.47ms,
writes=6218→6206, write_time=186.69→140.96ms)

-- 
Best,
Xuneng


Attachments:

  [text/x-sh] run_streaming_benchmark.sh (34.5K, 2-run_streaming_benchmark.sh)
  download | inline:
#!/usr/bin/env bash
set -euo pipefail

###############################################################################
# Streaming Read Patches Benchmark
#
# Usage: ./run_streaming_benchmark.sh [OPTIONS] <patch>
#
# Options:
#   --clean           Remove existing builds and start fresh
#   --baseline        Also build and test vanilla PostgreSQL for comparison
#   --test TEST       Run specific test (bloom_scan, bloom_vacuum, pgstattuple,
#                     pgstatindex, gin_vacuum, wal_logging, hash_vacuum, or "all")
#   --io-method MODE  I/O method: io_uring, worker, or sync (default: io_uring)
#   --io-workers N    Number of I/O workers for worker mode (default: 3)
#   --io-concurrency N  Max concurrent I/Os per process (default: 64)
#   --direct-io         Enable direct IO (debug_io_direct=data), bypasses OS page cache
#   --read-delay MS     Simulate read latency via dm_delay (requires pre-created device)
#   --write-delay MS    Simulate write latency via dm_delay (default: 0, requires --read-delay)
#   --profile           Enable perf profiling and flamegraph generation
#
# Environment:
#   WORKROOT       Base directory (default: $HOME/pg_bench)
#   REPS           Repetitions per test (default: 5)
#   SIZES          Table sizes to test (default: "large")
#   FLAMEGRAPH_DIR Path to FlameGraph tools (default: $HOME/FlameGraph)
#   DM_DELAY_DEV   dm_delay device name for --read-delay (default: "delayed")
###############################################################################

# Print a progress message to stdout, prefixed with a bold blue "==>" marker.
# All arguments are joined with spaces into a single line.
log() {
  printf '\033[1;34m==>\033[0m %s\n' "$*"
}
# Print a bold red "ERROR:" message to stderr and terminate the script with
# exit status 1. All arguments are joined with spaces into a single line.
die() {
  printf '\033[1;31mERROR:\033[0m %s\n' "$*" >&2
  exit 1
}

# --- CLI parsing ---
# Defaults. IO_METHOD, IO_WORKERS and IO_MAX_CONCURRENCY may be preset through
# the environment and are overridden by the matching command-line option;
# DM_DELAY_DEV is taken from the environment only.
CLEAN=0
BASELINE=0
DO_PROFILE=0
DIRECT_IO=0
IO_DELAY_MS=""
WRITE_DELAY_MS="0"
TEST="all"
IO_METHOD="${IO_METHOD:-io_uring}"
IO_WORKERS="${IO_WORKERS:-3}"
IO_MAX_CONCURRENCY="${IO_MAX_CONCURRENCY:-64}"
DM_DELAY_DEV="${DM_DELAY_DEV:-delayed}"
PATCH=""

while [[ $# -gt 0 ]]; do
  # Options that consume a value must be followed by one. Without this check
  # a trailing "--test" would abort on "$2" with an opaque "unbound variable"
  # error because the script runs under set -u.
  case "$1" in
    --read-delay|--write-delay|--test|--io-method|--io-workers|--io-concurrency)
      [[ $# -ge 2 ]] || die "Option $1 requires a value"
      ;;
  esac

  case "$1" in
    --clean)          CLEAN=1 ;;
    --baseline)       BASELINE=1 ;;
    --profile)        DO_PROFILE=1 ;;
    --direct-io)      DIRECT_IO=1 ;;
    --read-delay)     IO_DELAY_MS="$2"; shift ;;
    --write-delay)    WRITE_DELAY_MS="$2"; shift ;;
    --test)           TEST="$2"; shift ;;
    --io-method)      IO_METHOD="$2"; shift ;;
    --io-workers)     IO_WORKERS="$2"; shift ;;
    --io-concurrency) IO_MAX_CONCURRENCY="$2"; shift ;;
    # Print the header comment of this script as the help text.
    -h|--help)        sed -n '3,27p' "$0" | sed 's/^# \?//'; exit 0 ;;
    -*)               die "Unknown option: $1" ;;
    # The last non-option argument wins as the patch file.
    *)                PATCH="$1" ;;
  esac
  shift
done

# Validate io_method: only the three documented modes are accepted.
if [[ "$IO_METHOD" != "io_uring" && "$IO_METHOD" != "worker" && "$IO_METHOD" != "sync" ]]; then
  die "Invalid --io-method: $IO_METHOD (must be io_uring, worker, or sync)"
fi

# Validate dm_delay device if --read-delay is used: dmsetup must be installed
# and the device named by DM_DELAY_DEV must already exist.
if [[ -n "$IO_DELAY_MS" ]]; then
  command -v dmsetup >/dev/null 2>&1 || die "--read-delay requires dmsetup (sudo apt install dmsetup)"
  if ! sudo dmsetup status "$DM_DELAY_DEV" >/dev/null 2>&1; then
    # die() prints its argument through printf '%s', so a "\n" escape would be
    # shown literally; embed a real newline in the message instead.
    die "dm_delay device '$DM_DELAY_DEV' not found. Create it first, e.g.:
  umount /srv && dmsetup create $DM_DELAY_DEV --table \"0 \$(blockdev --getsz /dev/DEVICE) delay /dev/DEVICE 0 $IO_DELAY_MS\" && mount /dev/mapper/$DM_DELAY_DEV /srv/"
  fi
fi

# A patch file is mandatory and must exist.
if [[ -z "$PATCH" ]]; then
  die "Usage: $0 [--clean] [--baseline] [--test TEST] <patch>"
fi
if [[ ! -f "$PATCH" ]]; then
  die "Patch not found: $PATCH"
fi
# Make the path absolute: build_pg changes directory before applying it.
if [[ "$PATCH" != /* ]]; then
  PATCH="$PWD/$PATCH"
fi

# --- Profiling validation ---
# Tool locations and perf settings, all overridable from the environment.
FLAMEGRAPH_DIR="${FLAMEGRAPH_DIR:-$HOME/FlameGraph}"
PERF_SUDO="${PERF_SUDO:-sudo}"
PERF_EVENT="${PERF_EVENT:-cycles}"  # cycles = user+kernel; cycles:u = user-only

# With --profile, fail early when perf or the FlameGraph scripts are missing.
if [[ $DO_PROFILE -eq 1 ]]; then
  if ! command -v perf >/dev/null 2>&1; then
    die "Need perf (sudo apt install linux-tools-$(uname -r))"
  fi
  if [[ ! -x "$FLAMEGRAPH_DIR/stackcollapse-perf.pl" ]]; then
    die "Missing $FLAMEGRAPH_DIR/stackcollapse-perf.pl (git clone https://github.com/brendangregg/FlameGraph)"
  fi
  if [[ ! -x "$FLAMEGRAPH_DIR/flamegraph.pl" ]]; then
    die "Missing $FLAMEGRAPH_DIR/flamegraph.pl"
  fi
fi

# --- Configuration ---
# Environment-overridable settings: base working directory, repetitions per
# test, and the table size(s) to test.
WORKROOT="${WORKROOT:-$HOME/pg_bench}"
REPS="${REPS:-5}"
SIZES="${SIZES:-large}"

# Directory for the unpatched (vanilla) tree.
ROOT_BASE="$WORKROOT/vanilla"
# Directory name for the patched tree, derived from the patch file name:
# strip the directory and a ".patch" suffix, keep only alphanumerics, "_" and
# "-", and truncate to 40 characters.
PATCH_TAG=$(basename "$PATCH" .patch | tr -dc '[:alnum:]_-' | cut -c1-40)
ROOT_PATCH="$WORKROOT/$PATCH_TAG"

# --- Helpers ---
# Echo the path of the binary named $2 inside the installation rooted at $1
# (i.e. "$1/pg/bin/$2").
pg() {
  printf '%s\n' "$1/pg/bin/$2"
}

# Echo the first TCP port, counting up from $1 (default 5432) through 60000,
# for which lsof reports no listening socket. Aborts via die when every port
# in the range appears to be taken.
pick_port() {
  for ((p = ${1:-5432}; p <= 60000; p++)); do
    if ! lsof -iTCP:"$p" -sTCP:LISTEN >/dev/null 2>&1; then
      echo "$p"
      return
    fi
  done
  die "No free port found"
}

# Reprogram the dm_delay device DM_DELAY_DEV so that reads are delayed by $1
# milliseconds and writes by WRITE_DELAY_MS milliseconds. Does nothing when
# --read-delay was not given (IO_DELAY_MS is empty).
set_io_delay() {
  local read_ms="$1"
  if [[ -z "$IO_DELAY_MS" ]]; then
    return
  fi

  # Reuse the size (field 2) and backing device (field 4) of the current table.
  local dm_table dm_size dm_backing
  dm_table=$(sudo dmsetup table "$DM_DELAY_DEV")
  dm_size=$(awk '{print $2}' <<<"$dm_table")
  dm_backing=$(awk '{print $4}' <<<"$dm_table")

  log "Setting dm_delay on $DM_DELAY_DEV to ${read_ms}ms read / ${WRITE_DELAY_MS}ms write"

  # Swap the table in while the device is suspended.
  sudo dmsetup suspend "$DM_DELAY_DEV"
  sudo dmsetup reload "$DM_DELAY_DEV" --table "0 $dm_size delay $dm_backing 0 $read_ms $dm_backing 0 $WRITE_DELAY_MS"
  sudo dmsetup resume "$DM_DELAY_DEV"
}

# --- Build PostgreSQL ---
# Build and install PostgreSQL under $1 (sources in $1/src, install prefix
# $1/pg), optionally applying the patch file $2 first. An existing install
# (detected by an executable initdb) is reused unless --clean was given.
# Leaves the current directory at $1/src.
build_pg() {
  local prefix="$1" patch_file="${2:-}"
  local contrib

  if [[ $CLEAN -eq 1 ]]; then
    rm -rf "$prefix"
  fi

  if [[ -x "$(pg "$prefix" initdb)" ]]; then
    log "Reusing build: $prefix"
    cd "$prefix/src"
  else
    log "Building PostgreSQL: $prefix"
    mkdir -p "$prefix"

    git clone --depth 1 https://github.com/postgres/postgres "$prefix/src" 2>/dev/null
    cd "$prefix/src"

    if [[ -n "$patch_file" ]]; then
      log "Applying patch"
      git apply "$patch_file"
    fi

    ./configure --prefix="$prefix/pg" --with-liburing \
      CFLAGS='-O2 -ggdb3 -fno-omit-frame-pointer' >/dev/null 2>&1

    make -j"$(nproc)" install >/dev/null 2>&1
  fi

  # Always install contribs (idempotent, catches reused builds missing new extensions)
  for contrib in bloom pgstattuple pg_buffercache pg_prewarm; do
    make -C "contrib/$contrib" install >/dev/null 2>&1
  done
}

# --- Cluster management ---
# Create a fresh cluster in $1/data listening on port $2, append the benchmark
# settings to postgresql.conf, start the server (log in $1/server.log) and
# create the pg_buffercache and pg_prewarm extensions.
init_cluster() {
  local root="$1" port="$2"
  local datadir="$root/data"
  local conf="$datadir/postgresql.conf"

  rm -rf "$datadir"
  "$(pg "$root" initdb)" -D "$datadir" --no-locale >/dev/null 2>&1

  cat >> "$conf" <<EOF
port = $port
listen_addresses = '127.0.0.1'
shared_buffers = '32GB'
effective_io_concurrency = 200
io_method = $IO_METHOD
io_workers = $IO_WORKERS
io_max_concurrency = $IO_MAX_CONCURRENCY
track_io_timing = on
track_wal_io_timing = on
synchronous_commit = on
autovacuum = off
checkpoint_timeout = 1h
max_wal_size = 10GB
max_parallel_workers_per_gather = 0
EOF

  # --direct-io: enable direct IO for relation data.
  if [[ $DIRECT_IO -eq 1 ]]; then
    echo "debug_io_direct = data" >> "$conf"
  fi

  "$(pg "$root" pg_ctl)" -D "$datadir" -l "$root/server.log" start -w >/dev/null

  psql_run "$root" "$port" -c "CREATE EXTENSION IF NOT EXISTS pg_buffercache;"
  psql_run "$root" "$port" -c "CREATE EXTENSION IF NOT EXISTS pg_prewarm;"
}

# Fast-stop the server whose data directory is $1/data. Failures (for example
# when no server is running) are deliberately ignored.
stop_cluster() {
  local root="$1"
  local pg_ctl_bin
  pg_ctl_bin="$(pg "$root" pg_ctl)"
  "$pg_ctl_bin" -D "$root/data" stop -m fast 2>/dev/null || :
}

# Make the next access to the given relations cold.
#   drop_caches ROOT PORT RELATION [RELATION...]
# Evicts each relation from shared buffers and, unless direct IO is enabled,
# also drops the OS page cache.
drop_caches() {
  local root="$1" port="$2"
  shift 2
  local targets=("$@")

  # Evict target relations from shared buffers (no PG restart needed)
  for rel in "${targets[@]}"; do
    psql_run "$root" "$port" -c "SELECT pg_buffercache_evict_relation('${rel}'::regclass);" >/dev/null
  done

  # With direct IO there is no page cache involved, so nothing more to do.
  if [[ $DIRECT_IO -ne 0 ]]; then
    return 0
  fi

  # Drop OS page cache; failing to write drop_caches is tolerated.
  sync
  echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 2>&1 || true
  sleep 2
}

# Run psql from the installation rooted at $1 against database "postgres" on
# 127.0.0.1:$2, quiet, unaligned, tuples-only, and stopping on the first
# error. Remaining arguments (and stdin) are passed through to psql.
psql_run() {
  local root="$1" port="$2"
  shift 2
  local psql_bin
  psql_bin="$(pg "$root" psql)"
  "$psql_bin" -h 127.0.0.1 -p "$port" -d postgres -v ON_ERROR_STOP=1 -Atq "$@"
}

# --- Timing ---
# Run the SQL in $3 with psql \timing enabled and echo the elapsed time of the
# last reported "Time:" line in milliseconds (three decimals).
# Returns 1 with a message on stderr if no numeric timing could be parsed.
run_timed() {
  local root="$1" port="$2" query="$3"
  local elapsed
  # -X: ignore .psqlrc, -v ON_ERROR_STOP=1: fail on SQL errors
  # The awk filter keeps the last Time: line and handles both "ms" and "s".
  elapsed=$("$(pg "$root" psql)" -h 127.0.0.1 -p "$port" -d postgres -X -v ON_ERROR_STOP=1 -At \
    -c '\timing on' -c "$query" 2>&1 | \
    awk '
      /Time:/ {
        if ($3 == "ms")     last = $2
        else if ($3 == "s") last = $2 * 1000
      }
      END {
        if (last == "") exit 1
        printf "%.3f\n", last
      }
    ')
  # Validate numeric output
  if ! [[ "$elapsed" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
    echo "ERROR: Non-numeric timing: $elapsed" >&2
    return 1
  fi
  echo "$elapsed"
}

# --- I/O Stats ---
# Run SQL and capture timing + I/O stats from pg_stat_io
#   run_timed_with_io ROOT PORT SQL
# Resets the shared 'io' stats before the query, waits for the stats flush,
# then reads absolute values.
# Note: pg_stat_io has PGSTAT_MIN_INTERVAL=1000ms flush delay, so we wait 1.5s
#       after the query to ensure stats are flushed to shared memory.
# Note: pg_stat_io counts I/O operations, not pages (with io_combine_limit=128kB,
#       up to 16 pages per operation). This is expected behavior.
# Echoes: ms,reads,read_time,writes,write_time
# Returns 1 (after printing the psql output to stderr) when psql fails or no
# numeric timing can be parsed. Callers invoke this through $(...), where bash
# clears errexit, so failures have to be reported explicitly here instead of
# being recorded as an empty timing.
run_timed_with_io() {
  local ROOT="$1" PORT="$2" SQL="$3"
  local result rc=0

  # Reset stats, run query, wait for flush, read absolute values
  # - Filter by client backend and io worker (excludes bgwriter/checkpointer)
  # - 1.5s delay allows stats to flush (PGSTAT_MIN_INTERVAL=1000ms)
  # - 2>&1 must be part of the psql command itself; placed on a line of its
  #   own after the heredoc it would be a separate no-op command.
  result=$("$(pg "$ROOT" psql)" -h 127.0.0.1 -p "$PORT" -d postgres -X -v ON_ERROR_STOP=1 2>&1 <<EOSQL
SELECT pg_stat_reset_shared('io');
\\timing on
$SQL
\\timing off
SELECT pg_sleep(1.5);
\\t on
SELECT 
  COALESCE(SUM(reads),0)::bigint,
  COALESCE(SUM(read_time),0)::numeric(12,2),
  COALESCE(SUM(writes),0)::bigint,
  COALESCE(SUM(write_time),0)::numeric(12,2)
FROM pg_stat_io 
WHERE object = 'relation' AND backend_type IN ('client backend', 'io worker');
EOSQL
  ) || rc=$?

  if [[ $rc -ne 0 ]]; then
    printf 'ERROR: psql exited with status %d:\n%s\n' "$rc" "$result" >&2
    return 1
  fi

  # Parse timing (last Time: line)
  local ms
  ms=$(awk '
    /Time:/ {
      val=$2; unit=$3;
      if (unit=="ms") ms=val;
      else if (unit=="s") ms=val*1000;
    }
    END { if (ms=="") exit 1; printf "%.3f\n", ms; }
  ' <<<"$result") || ms=""
  if ! [[ "$ms" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
    printf 'ERROR: Non-numeric timing: %s\npsql output was:\n%s\n' "$ms" "$result" >&2
    return 1
  fi

  # Parse I/O stats (last line with pipe separator: reads|read_time|writes|write_time)
  local reads read_time writes write_time
  local io_line
  io_line=$(grep '|' <<<"$result" | tail -1) || io_line=""
  reads=$(echo "$io_line"      | cut -d'|' -f1 | tr -d ' ')
  read_time=$(echo "$io_line"  | cut -d'|' -f2 | tr -d ' ')
  writes=$(echo "$io_line"     | cut -d'|' -f3 | tr -d ' ')
  write_time=$(echo "$io_line" | cut -d'|' -f4 | tr -d ' ')

  # Default to 0 if not found
  [[ "$reads"      =~ ^-?[0-9]+$             ]] || reads=0
  [[ "$read_time"  =~ ^-?[0-9]+(\.[0-9]+)?$ ]] || read_time=0
  [[ "$writes"     =~ ^-?[0-9]+$             ]] || writes=0
  [[ "$write_time" =~ ^-?[0-9]+(\.[0-9]+)?$ ]] || write_time=0

  echo "$ms,$reads,$read_time,$writes,$write_time"
}

# --- Statistics ---
# Print the median of column 2 of the CSV file $1, skipping the header row.
# Prints 0 when the file has no data rows.
calc_median() {
  awk -F, '
    NR > 1 { vals[++cnt] = $2 }
    END {
      if (cnt == 0) { print 0; exit }
      # Insertion sort, ascending.
      for (i = 2; i <= cnt; i++) {
        cur = vals[i]
        for (j = i - 1; j >= 1 && vals[j] > cur; j--)
          vals[j + 1] = vals[j]
        vals[j + 1] = cur
      }
      print (cnt%2)?vals[int(cnt/2)+1]:(vals[cnt/2]+vals[cnt/2+1])/2
    }
  ' "$1"
}

# Print the median of column $2 (1-based) of the CSV file $1, skipping the
# header row. Prints 0 when the file has no data rows.
calc_median_col() {
  local csv_file="$1" column="$2"
  awk -F, -v col="$column" '
    NR > 1 { vals[++cnt] = $col }
    END {
      if (cnt == 0) { print 0; exit }
      # Insertion sort, ascending.
      for (i = 2; i <= cnt; i++) {
        cur = vals[i]
        for (j = i - 1; j >= 1 && vals[j] > cur; j--)
          vals[j + 1] = vals[j]
        vals[j + 1] = cur
      }
      print (cnt%2)?vals[int(cnt/2)+1]:(vals[cnt/2]+vals[cnt/2+1])/2
    }
  ' "$csv_file"
}

# Print "median=…ms mean=…±…ms n=…" (no trailing newline) for column 2 of the
# CSV file $1, skipping the header row. The spread is the population standard
# deviation. Prints nothing when the file has no data rows.
calc_stats() {
  local csv="$1"
  awk -F, '
    NR > 1 { vals[++cnt] = $2; total += $2 }
    END {
      if (cnt == 0) exit
      # Insertion sort, ascending.
      for (i = 2; i <= cnt; i++) {
        cur = vals[i]
        for (j = i - 1; j >= 1 && vals[j] > cur; j--)
          vals[j + 1] = vals[j]
        vals[j + 1] = cur
      }
      med = (cnt%2)?vals[int(cnt/2)+1]:(vals[cnt/2]+vals[cnt/2+1])/2
      avg = total / cnt
      for (i = 1; i <= cnt; i++)
        sq += (vals[i] - avg)^2
      sd = sqrt(sq / cnt)
      printf "median=%.1fms mean=%.1f±%.1fms n=%d", med, avg, sd, cnt
    }
  ' "$csv"
}

# --- Profiling ---
# Run a SQL command under perf, attaching to the backend PID.
# Generates perf.data and flamegraph SVG.
#   profile_sql ROOT PORT LABEL SQL
# Outputs are written to ROOT/profile/LABEL.perf.data and ROOT/profile/LABEL.svg.
# Does nothing unless --profile was given (DO_PROFILE=1). Failures to locate
# the backend or to start perf only log a warning and skip the flamegraph.
profile_sql() {
  [[ $DO_PROFILE -ne 1 ]] && return
  
  local ROOT="$1" PORT="$2" LABEL="$3" SQL="$4"
  local PROF_DIR="$ROOT/profile"
  mkdir -p "$PROF_DIR"
  
  local PERF_DATA="$PROF_DIR/${LABEL}.perf.data"
  local SVG="$PROF_DIR/${LABEL}.svg"
  local psql_bin
  psql_bin="$(pg "$ROOT" psql)"
  
  # Use a unique application_name to find the backend PID
  # ($$ is this script's PID, so concurrent runs do not collide)
  local APP="prof_${LABEL}_$$"
  
  # Launch a psql session that will first identify itself, then run the SQL
  # The pg_sleep() gives us time to find the backend PID and attach perf
  # The session runs in the background; its output is discarded.
  PGAPPNAME="$APP" "$psql_bin" -h 127.0.0.1 -p "$PORT" -d postgres \
    -X -v ON_ERROR_STOP=1 <<EOSQL >/dev/null 2>&1 &
SELECT pg_sleep(2);
$SQL
EOSQL
  local QUERY_SHELL_PID=$!
  
  # Find the backend PID via pg_stat_activity
  # Poll up to 100 times, 0.05s apart, until the PID exists under /proc.
  # NOTE: the loop counter n is not declared local.
  local BACKEND_PID=""
  for ((n=0; n<100; n++)); do
    BACKEND_PID=$("$psql_bin" -h 127.0.0.1 -p "$PORT" -d postgres -Atq \
      -c "SELECT pid FROM pg_stat_activity WHERE application_name='${APP}' ORDER BY backend_start DESC LIMIT 1;" 2>/dev/null)
    [[ -n "$BACKEND_PID" && -d "/proc/$BACKEND_PID" ]] && break
    sleep 0.05
  done
  
  if [[ -z "$BACKEND_PID" || ! -d "/proc/$BACKEND_PID" ]]; then
    log "WARNING: Could not find backend PID for profiling, skipping"
    # Still let the background query finish before returning.
    wait "$QUERY_SHELL_PID" 2>/dev/null || true
    return
  fi
  
  log "Profiling backend PID $BACKEND_PID → $PERF_DATA"
  
  # Attach perf to the backend; we explicitly kill -INT it after the query finishes
  $PERF_SUDO perf record -g --call-graph dwarf \
    -p "$BACKEND_PID" -o "$PERF_DATA" \
    --event="$PERF_EVENT" 2>/dev/null &
  local PERF_PID=$!
  sleep 0.1
  
  # Verify perf actually started (permissions, valid PID, etc.)
  if ! kill -0 "$PERF_PID" 2>/dev/null; then
    log "WARNING: perf record failed to start (permissions/config?), skipping flamegraph"
    wait "$QUERY_SHELL_PID" 2>/dev/null || true
    return
  fi
  
  # Wait for the query to finish
  wait "$QUERY_SHELL_PID" 2>/dev/null || true
  
  # Give perf a moment to flush, then stop it
  sleep 0.5
  $PERF_SUDO kill -INT "$PERF_PID" 2>/dev/null || true; wait "$PERF_PID" 2>/dev/null || true
  
  # Generate flamegraph
  generate_flamegraph "$PERF_DATA" "$SVG" "$LABEL"
}

# Convert perf.data → flamegraph SVG
#   generate_flamegraph PERF_DATA SVG_PATH TITLE
# Best-effort: every failure (missing perf data, empty or failed stack
# collapse, flamegraph.pl error) is logged as a warning and the function
# still returns 0, so a profiling hiccup never aborts a benchmark run under
# set -e. The intermediate .folded file is always removed.
generate_flamegraph() {
  local PERF_DATA="$1" SVG="$2" TITLE="$3"

  # A bare "[[ -f ... ]] || return" would return status 1 here, which the
  # caller's set -e turns into a silent script exit; return 0 explicitly.
  if [[ ! -f "$PERF_DATA" ]]; then
    log "WARNING: No perf data at $PERF_DATA, skipping flamegraph for $TITLE"
    return 0
  fi

  local FOLDED="${PERF_DATA%.perf.data}.folded"
  if $PERF_SUDO perf script -i "$PERF_DATA" 2>/dev/null \
      | "$FLAMEGRAPH_DIR/stackcollapse-perf.pl" > "$FOLDED" 2>/dev/null \
      && [[ -s "$FOLDED" ]] \
      && "$FLAMEGRAPH_DIR/flamegraph.pl" --title "$TITLE" --countname samples \
           "$FOLDED" > "$SVG" 2>/dev/null; then
    log "Flamegraph: $SVG"
  else
    log "WARNING: Failed to generate flamegraph for $TITLE"
  fi
  rm -f "$FOLDED"
}

# --- Benchmark runner ---
# benchmark ROOT PORT NAME SQL RELATION [RELATION...]
# Run SQL REPS times, evicting the listed relations (and caches) before every
# repetition, and record one CSV row per run in ROOT/results/NAME.csv.
benchmark() {
  local root="$1" port="$2" name="$3" sql="$4"
  shift 4
  local targets=("$@")
  local results_dir="$root/results"
  local csv="$results_dir/${name}.csv"

  mkdir -p "$results_dir"
  printf '%s\n' "run,ms,reads,read_time_ms,writes,write_time_ms" > "$csv"

  for ((i=1; i<=REPS; i++)); do
    drop_caches "$root" "$port" "${targets[@]}"

    local result ms reads read_time writes write_time
    result=$(run_timed_with_io "$root" "$port" "$sql")
    # result is "ms,reads,read_time,writes,write_time"
    IFS=',' read -r ms reads read_time writes write_time <<<"$result"

    printf '%s\n' "$i,$ms,$reads,$read_time,$writes,$write_time" >> "$csv"
    log "$name [$i/$REPS]: ${ms}ms (reads=$reads, read_time=${read_time}ms, writes=$writes, write_time=${write_time}ms)"
  done
}

# --- Data setup functions ---
# Create table bloom_test with a bloom index (bloom_idx) on (val1, val2).
#   setup_bloom ROOT PORT SIZE
# SIZE selects the row count: small=100000, medium=1000000, large=10000000.
# Any other SIZE aborts with a clear message instead of tripping set -u on an
# unset NROWS.
setup_bloom() {
  local ROOT="$1" PORT="$2" SIZE="$3"
  local NROWS
  case "$SIZE" in
    small)  NROWS=100000 ;;
    medium) NROWS=1000000 ;;
    large)  NROWS=10000000 ;;
    *)      die "Unknown size: $SIZE (expected small, medium, or large)" ;;
  esac

  log "Creating Bloom test data ($SIZE: $NROWS rows)"
  psql_run "$ROOT" "$PORT" <<SQL
CREATE EXTENSION IF NOT EXISTS bloom;
DROP TABLE IF EXISTS bloom_test;
CREATE TABLE bloom_test (id INT, data TEXT, val1 INT, val2 INT);
INSERT INTO bloom_test SELECT i, 'data_'||i, i%1000, i%100 FROM generate_series(1,$NROWS) i;
CREATE INDEX bloom_idx ON bloom_test USING bloom (val1, val2);
VACUUM ANALYZE bloom_test;
CHECKPOINT;
SQL
}

# Create table heap_test (serial primary key plus a 100-character text column)
# for the pgstattuple tests.
#   setup_pgstattuple ROOT PORT SIZE
# SIZE selects the row count: small=100000, medium=1000000, large=10000000.
# Any other SIZE aborts with a clear message instead of tripping set -u on an
# unset NROWS.
setup_pgstattuple() {
  local ROOT="$1" PORT="$2" SIZE="$3"
  local NROWS
  case "$SIZE" in
    small)  NROWS=100000 ;;
    medium) NROWS=1000000 ;;
    large)  NROWS=10000000 ;;
    *)      die "Unknown size: $SIZE (expected small, medium, or large)" ;;
  esac

  log "Creating pgstattuple test data ($SIZE: $NROWS rows)"
  psql_run "$ROOT" "$PORT" <<SQL
CREATE EXTENSION IF NOT EXISTS pgstattuple;
DROP TABLE IF EXISTS heap_test;
CREATE TABLE heap_test (id SERIAL PRIMARY KEY, data TEXT);
INSERT INTO heap_test (data) SELECT repeat('x',100) FROM generate_series(1,$NROWS);
VACUUM ANALYZE heap_test;
CHECKPOINT;
SQL
}

# Create table idx_test, whose primary key provides the index for the
# pgstatindex tests.
#   setup_pgstatindex ROOT PORT SIZE
# SIZE selects the row count: small=100000, medium=1000000, large=10000000.
# Any other SIZE aborts with a clear message instead of tripping set -u on an
# unset NROWS.
setup_pgstatindex() {
  local ROOT="$1" PORT="$2" SIZE="$3"
  local NROWS
  case "$SIZE" in
    small)  NROWS=100000 ;;
    medium) NROWS=1000000 ;;
    large)  NROWS=10000000 ;;
    *)      die "Unknown size: $SIZE (expected small, medium, or large)" ;;
  esac

  log "Creating pgstatindex test data ($SIZE: $NROWS rows)"
  psql_run "$ROOT" "$PORT" <<SQL
CREATE EXTENSION IF NOT EXISTS pgstattuple;
DROP TABLE IF EXISTS idx_test;
CREATE TABLE idx_test (id SERIAL PRIMARY KEY, data TEXT);
INSERT INTO idx_test (data) SELECT 'data_row_' || i || '_' || repeat('x',50) FROM generate_series(1,$NROWS) i;
VACUUM ANALYZE idx_test;
CHECKPOINT;
SQL
}

# Create table gin_test with a GIN index (gin_idx) on a text-array column of
# five random tags per row.
#   setup_gin ROOT PORT SIZE
# SIZE selects the row count: small=100000, medium=1000000, large=5000000.
# Any other SIZE aborts with a clear message instead of tripping set -u on an
# unset NROWS.
setup_gin() {
  local ROOT="$1" PORT="$2" SIZE="$3"
  local NROWS
  case "$SIZE" in
    small)  NROWS=100000 ;;
    medium) NROWS=1000000 ;;
    large)  NROWS=5000000 ;;
    *)      die "Unknown size: $SIZE (expected small, medium, or large)" ;;
  esac

  log "Creating GIN test data ($SIZE: $NROWS rows)"
  psql_run "$ROOT" "$PORT" <<SQL
DROP TABLE IF EXISTS gin_test;
-- No PRIMARY KEY: isolate GIN index vacuum from btree overhead
CREATE TABLE gin_test (id INT, tags TEXT[]);
INSERT INTO gin_test (id, tags)
SELECT i, ARRAY(SELECT 'tag_'||(random()*100)::int FROM generate_series(1,5))
FROM generate_series(1,$NROWS) i;
CREATE INDEX gin_idx ON gin_test USING gin (tags);
VACUUM ANALYZE gin_test;
CHECKPOINT;
SQL
}

# Create table hash_test with a hash index (hash_idx) on a column of unique
# integers.
#   setup_hash ROOT PORT SIZE
# SIZE selects the row count: small=500000, medium=5000000, large=20000000.
# Any other SIZE aborts with a clear message instead of tripping set -u on an
# unset NROWS.
setup_hash() {
  local ROOT="$1" PORT="$2" SIZE="$3"
  local NROWS
  case "$SIZE" in
    small)  NROWS=500000 ;;
    medium) NROWS=5000000 ;;
    large)  NROWS=20000000 ;;
    *)      die "Unknown size: $SIZE (expected small, medium, or large)" ;;
  esac

  log "Creating Hash test data ($SIZE: $NROWS unique values)"
  psql_run "$ROOT" "$PORT" <<SQL
DROP TABLE IF EXISTS hash_test;
-- No PRIMARY KEY: isolate hash index vacuum from btree overhead
CREATE TABLE hash_test (id INT, data TEXT);
INSERT INTO hash_test SELECT i, 'x' FROM generate_series(1,$NROWS) i;
CREATE INDEX hash_idx ON hash_test USING hash (id);
VACUUM ANALYZE hash_test;
CHECKPOINT;
SQL
}

# Create table wal_test with a tsvector column, used as the base table for the
# GIN index build / log_newpage_range test. No index is created here.
#   setup_wal ROOT PORT SIZE
# SIZE selects the row count: small=1000000, medium=5000000, large=20000000.
# Any other SIZE aborts with a clear message instead of tripping set -u on an
# unset NROWS.
setup_wal() {
  local ROOT="$1" PORT="$2" SIZE="$3"
  local NROWS
  case "$SIZE" in
    small)  NROWS=1000000 ;;
    medium) NROWS=5000000 ;;
    large)  NROWS=20000000 ;;
    *)      die "Unknown size: $SIZE (expected small, medium, or large)" ;;
  esac

  log "Creating table for GIN index build / log_newpage_range test ($SIZE: $NROWS rows)"
  psql_run "$ROOT" "$PORT" <<SQL
DROP TABLE IF EXISTS wal_test;
-- Table with tsvector column for GIN indexing (full-text search)
-- GIN index builds always call log_newpage_range() at the end of
-- ginbuild() (gininsert.c) to WAL-log all index pages. 
CREATE TABLE wal_test (id INT, doc TEXT, doc_tsv TSVECTOR);
INSERT INTO wal_test
  SELECT i,
         'word' || (random()*10000)::int || ' term' || (random()*10000)::int
           || ' token' || (random()*5000)::int || ' phrase' || (random()*8000)::int,
         to_tsvector('simple',
           'word' || (random()*10000)::int || ' term' || (random()*10000)::int
           || ' token' || (random()*5000)::int || ' phrase' || (random()*8000)::int)
  FROM generate_series(1,$NROWS) i;
VACUUM ANALYZE wal_test;
CHECKPOINT;
SQL
}

# --- Test functions ---
# Benchmark a bloom index scan (seqscan disabled) on a freshly built
# bloom_test table, then optionally profile the same query.
#   $1 ROOT  cluster root    $2 PORT  server port
#   $3 LABEL result prefix   $4 SIZE  data-set size name
test_bloom_scan() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local scan_label="${LABEL}_bloom_scan_${SIZE}"
  local scan_sql="SET enable_seqscan=off; SELECT COUNT(*) FROM bloom_test WHERE val1=42 AND val2=7;"

  setup_bloom "$ROOT" "$PORT" "$SIZE"
  benchmark "$ROOT" "$PORT" "$scan_label" "$scan_sql" bloom_test bloom_idx

  # Nothing more to do unless profiling was requested.
  [[ $DO_PROFILE -eq 1 ]] || return 0

  # Profiling runs after the benchmark reps: shared_buffers memory has already
  # been faulted in, so page-fault noise is gone, and drop_caches makes the
  # profiled run start from cold IO.
  drop_caches "$ROOT" "$PORT" bloom_test bloom_idx
  profile_sql "$ROOT" "$PORT" "$scan_label" "$scan_sql"
}

# Time VACUUM on bloom_test with 10% dead tuples, REPS times, writing one CSV
# row per run to <ROOT>/results/<LABEL>_bloom_vacuum_<SIZE>.csv; optionally
# profile one extra run.
#   $1 ROOT  cluster root    $2 PORT  server port
#   $3 LABEL result prefix   $4 SIZE  data-set size name
test_bloom_vacuum() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local vac_label="${LABEL}_bloom_vacuum_${SIZE}"
  local OUT="$ROOT/results/${vac_label}.csv"
  local kill_sql="DELETE FROM bloom_test WHERE id % 10 = 0;"
  local vac_sql="VACUUM bloom_test;"

  mkdir -p "$ROOT/results"
  echo "run,ms,reads,read_time_ms,writes,write_time_ms" > "$OUT"

  i=1
  while (( i <= REPS )); do
    # Rebuild the table for every run so each VACUUM sees the same state,
    # then turn every tenth row into a dead tuple.
    setup_bloom "$ROOT" "$PORT" "$SIZE"
    psql_run "$ROOT" "$PORT" -c "$kill_sql"

    drop_caches "$ROOT" "$PORT" bloom_test bloom_idx
    local result ms reads read_time writes write_time
    result=$(run_timed_with_io "$ROOT" "$PORT" "$vac_sql")
    # run_timed_with_io output is split on commas into the five metrics.
    IFS=',' read -r ms reads read_time writes write_time <<<"$result"
    printf '%s\n' "$i,$ms,$reads,$read_time,$writes,$write_time" >> "$OUT"
    log "${vac_label} [$i/$REPS]: ${ms}ms (reads=$reads, read_time=${read_time}ms, writes=$writes, write_time=${write_time}ms)"
    i=$(( i + 1 ))
  done

  [[ $DO_PROFILE -eq 1 ]] || return 0

  # Profile one additional run prepared exactly like a timed run.
  setup_bloom "$ROOT" "$PORT" "$SIZE"
  psql_run "$ROOT" "$PORT" -c "$kill_sql"
  drop_caches "$ROOT" "$PORT" bloom_test bloom_idx
  profile_sql "$ROOT" "$PORT" "$vac_label" "$vac_sql"
}

# Time pgstattuple_approx('heap_test') REPS times, writing one CSV row per run
# to <ROOT>/results/<LABEL>_pgstattuple_<SIZE>.csv; optionally profile one
# extra run.
#   $1 ROOT  cluster root    $2 PORT  server port
#   $3 LABEL result prefix   $4 SIZE  data-set size name
test_pgstattuple() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local OUT="$ROOT/results/${LABEL}_pgstattuple_${SIZE}.csv"
  mkdir -p "$ROOT/results"
  echo "run,ms,reads,read_time_ms,writes,write_time_ms" > "$OUT"
  
  # Setup once — rolled-back DELETE keeps layout identical across all reps
  setup_pgstattuple "$ROOT" "$PORT" "$SIZE"
  # Rolled-back DELETE clears the all-visible bit in the Visibility Map so
  # pgstattuple_approx must actually read those pages (it skips all-visible pages).
  # Using ROLLBACK keeps the physical layout identical across all reps (no TOAST
  # out-of-page updates, no dirty pages to flush from shared_buffers).
  psql_run "$ROOT" "$PORT" -c "BEGIN; DELETE FROM heap_test WHERE id % 500 = 0; ROLLBACK;"
  # Warmup pass: The rolled-back DELETE left every touched tuple with an xmax
  # pointing to the aborted transaction but no hint bits set. On the first
  # pgstattuple_approx call, HeapTupleSatisfiesVacuum → HeapTupleSatisfiesVacuumHorizon
  # must resolve each such xmax: TransactionIdIsInProgress (ProcArray scan) then
  # TransactionIdDidCommit (CLOG lookup) — only then can it call SetHintBits to
  # stamp HEAP_XMAX_INVALID and MarkBufferDirtyHint. Without this warmup, rep 1
  # pays ~1100ms extra CPU for those CLOG/ProcArray lookups. Subsequent reps hit
  # the early-exit at "if (t_infomask & HEAP_XMAX_INVALID) return HEAPTUPLE_LIVE"
  # and skip the expensive path entirely.
  # After this pass, the dirtied hint-bit pages are flushed to disk via
  # drop_caches, so all reps start from the same on-disk state.
  psql_run "$ROOT" "$PORT" -c "SELECT * FROM pgstattuple_approx('heap_test');" >/dev/null

  for ((i=1; i<=REPS; i++)); do
    drop_caches "$ROOT" "$PORT" heap_test heap_test_pkey
    local result ms reads read_time writes write_time
    result=$(run_timed_with_io "$ROOT" "$PORT" "SELECT * FROM pgstattuple_approx('heap_test');")
    IFS=',' read -r ms reads read_time writes write_time <<<"$result"
    echo "$i,$ms,$reads,$read_time,$writes,$write_time" >> "$OUT"
    log "${LABEL}_pgstattuple_${SIZE} [$i/$REPS]: ${ms}ms (reads=$reads, read_time=${read_time}ms, writes=$writes, write_time=${write_time}ms)"
  done
  
  if [[ $DO_PROFILE -eq 1 ]]; then
    psql_run "$ROOT" "$PORT" -c "BEGIN; DELETE FROM heap_test WHERE id % 500 = 0; ROLLBACK;"
    # The DELETE above stamps a fresh aborted xmax on the touched tuples, so
    # repeat the warmup pass here as well. Otherwise the profile would be
    # dominated by the one-off hint-bit resolution that the timed reps
    # deliberately exclude, and would not describe the same workload.
    psql_run "$ROOT" "$PORT" -c "SELECT * FROM pgstattuple_approx('heap_test');" >/dev/null
    drop_caches "$ROOT" "$PORT" heap_test heap_test_pkey
    profile_sql "$ROOT" "$PORT" "${LABEL}_pgstattuple_${SIZE}" \
      "SELECT * FROM pgstattuple_approx('heap_test');"
  fi
}

# Benchmark pgstatindex() on the primary-key index of a freshly built
# idx_test table, then optionally profile the same query from cold caches.
#   $1 ROOT  cluster root    $2 PORT  server port
#   $3 LABEL result prefix   $4 SIZE  data-set size name
test_pgstatindex() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local idx_label="${LABEL}_pgstatindex_${SIZE}"
  local idx_sql="SELECT * FROM pgstatindex('idx_test_pkey');"

  setup_pgstatindex "$ROOT" "$PORT" "$SIZE"
  benchmark "$ROOT" "$PORT" "$idx_label" "$idx_sql" idx_test idx_test_pkey

  # Nothing more to do unless profiling was requested.
  [[ $DO_PROFILE -eq 1 ]] || return 0

  drop_caches "$ROOT" "$PORT" idx_test idx_test_pkey
  profile_sql "$ROOT" "$PORT" "$idx_label" "$idx_sql"
}

# Time VACUUM ANALYZE on gin_test REPS times, writing one CSV row per run to
# <ROOT>/results/<LABEL>_gin_vacuum_<SIZE>.csv; optionally profile one extra
# run.
#   $1 ROOT  cluster root    $2 PORT  server port
#   $3 LABEL result prefix   $4 SIZE  data-set size name
test_gin_vacuum() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local gin_label="${LABEL}_gin_vacuum_${SIZE}"
  local OUT="$ROOT/results/${gin_label}.csv"
  # VACUUM ANALYZE forces ginvacuumcleanup() to run and scan all pages
  local gin_sql="VACUUM ANALYZE gin_test;"

  mkdir -p "$ROOT/results"
  echo "run,ms,reads,read_time_ms,writes,write_time_ms" > "$OUT"

  i=1
  while (( i <= REPS )); do
    # Rebuild the table for every run so each VACUUM sees the same state.
    setup_gin "$ROOT" "$PORT" "$SIZE"

    drop_caches "$ROOT" "$PORT" gin_test gin_idx
    local result ms reads read_time writes write_time
    result=$(run_timed_with_io "$ROOT" "$PORT" "$gin_sql")
    # run_timed_with_io output is split on commas into the five metrics.
    IFS=',' read -r ms reads read_time writes write_time <<<"$result"
    printf '%s\n' "$i,$ms,$reads,$read_time,$writes,$write_time" >> "$OUT"
    log "${gin_label} [$i/$REPS]: ${ms}ms (reads=$reads, read_time=${read_time}ms, writes=$writes, write_time=${write_time}ms)"
    i=$(( i + 1 ))
  done

  [[ $DO_PROFILE -eq 1 ]] || return 0

  # Profile one additional run prepared exactly like a timed run.
  setup_gin "$ROOT" "$PORT" "$SIZE"
  drop_caches "$ROOT" "$PORT" gin_test gin_idx
  profile_sql "$ROOT" "$PORT" "$gin_label" "$gin_sql"
}

# Time VACUUM on hash_test with 10% dead tuples, REPS times, writing one CSV
# row per run to <ROOT>/results/<LABEL>_hash_vacuum_<SIZE>.csv; optionally
# profile one extra run.
#   $1 ROOT  cluster root    $2 PORT  server port
#   $3 LABEL result prefix   $4 SIZE  data-set size name
test_hash_vacuum() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local hash_label="${LABEL}_hash_vacuum_${SIZE}"
  local OUT="$ROOT/results/${hash_label}.csv"
  local kill_sql="DELETE FROM hash_test WHERE id % 10 = 0;"
  local vac_sql="VACUUM hash_test;"

  mkdir -p "$ROOT/results"
  echo "run,ms,reads,read_time_ms,writes,write_time_ms" > "$OUT"

  i=1
  while (( i <= REPS )); do
    # Rebuild the table for every run so each VACUUM sees the same state,
    # then turn every tenth row into a dead tuple.
    setup_hash "$ROOT" "$PORT" "$SIZE"
    psql_run "$ROOT" "$PORT" -c "$kill_sql"

    drop_caches "$ROOT" "$PORT" hash_test hash_idx
    local result ms reads read_time writes write_time
    result=$(run_timed_with_io "$ROOT" "$PORT" "$vac_sql")
    # run_timed_with_io output is split on commas into the five metrics.
    IFS=',' read -r ms reads read_time writes write_time <<<"$result"
    printf '%s\n' "$i,$ms,$reads,$read_time,$writes,$write_time" >> "$OUT"
    log "${hash_label} [$i/$REPS]: ${ms}ms (reads=$reads, read_time=${read_time}ms, writes=$writes, write_time=${write_time}ms)"
    i=$(( i + 1 ))
  done

  [[ $DO_PROFILE -eq 1 ]] || return 0

  # Profile one additional run prepared exactly like a timed run.
  setup_hash "$ROOT" "$PORT" "$SIZE"
  psql_run "$ROOT" "$PORT" -c "$kill_sql"
  drop_caches "$ROOT" "$PORT" hash_test hash_idx
  profile_sql "$ROOT" "$PORT" "$hash_label" "$vac_sql"
}

# Time a GIN index build on wal_test REPS times, writing one CSV row per run
# to <ROOT>/results/<LABEL>_wal_logging_<SIZE>.csv; optionally profile one
# extra build.  The table is created once; only the index is rebuilt.
#   $1 ROOT  cluster root    $2 PORT  server port
#   $3 LABEL result prefix   $4 SIZE  data-set size name
test_wal_logging() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local wal_label="${LABEL}_wal_logging_${SIZE}"
  local OUT="$ROOT/results/${wal_label}.csv"
  local WAL_SQL="CREATE INDEX wal_test_gin_idx ON wal_test USING gin (doc_tsv);"
  local drop_sql="DROP INDEX IF EXISTS wal_test_gin_idx;"

  mkdir -p "$ROOT/results"
  echo "run,ms,reads,read_time_ms,writes,write_time_ms" > "$OUT"

  setup_wal "$ROOT" "$PORT" "$SIZE"

  i=1
  while (( i <= REPS )); do
    # Remove the index left behind by the previous iteration.
    psql_run "$ROOT" "$PORT" -c "$drop_sql"

    # Evict the source table so its pages are cold on disk.
    drop_caches "$ROOT" "$PORT" wal_test

    # CREATE INDEX on GIN (tsvector_ops): GIN always uses the same build
    # path. ginbuild() populates the index in memory, flushes to disk, then
    # calls log_newpage_range() to read ALL index pages and write them to
    # WAL (gininsert.c:785-790).
    local result ms reads read_time writes write_time
    result=$(run_timed_with_io "$ROOT" "$PORT" "$WAL_SQL")
    # run_timed_with_io output is split on commas into the five metrics.
    IFS=',' read -r ms reads read_time writes write_time <<<"$result"
    printf '%s\n' "$i,$ms,$reads,$read_time,$writes,$write_time" >> "$OUT"
    log "${wal_label} [$i/$REPS]: ${ms}ms (reads=$reads, read_time=${read_time}ms, writes=$writes, write_time=${write_time}ms)"
    i=$(( i + 1 ))
  done

  [[ $DO_PROFILE -eq 1 ]] || return 0

  # Profile one additional build prepared exactly like a timed run.
  psql_run "$ROOT" "$PORT" -c "$drop_sql"
  drop_caches "$ROOT" "$PORT" wal_test
  profile_sql "$ROOT" "$PORT" "$wal_label" "$WAL_SQL"
}

# --- Run tests for a build ---
# Load a handful of system catalogs into shared_buffers with pg_prewarm so the
# first benchmark rep does not pay disk reads for catalog pages.  Per the
# original note, pg_buffercache_evict_relation only evicts the test relation,
# not catalogs, so these pages stay warm across all reps.
#   $1 ROOT  cluster root, passed through to psql_run
#   $2 PORT  server port, passed through to psql_run
warmup_catalog() {
  local ROOT="$1" PORT="$2"
  local prewarm_sql="SELECT pg_prewarm('pg_class',     'buffer');
SELECT pg_prewarm('pg_attribute', 'buffer');
SELECT pg_prewarm('pg_namespace', 'buffer');
SELECT pg_prewarm('pg_proc',      'buffer');
SELECT pg_prewarm('pg_type',      'buffer');"

  # Feed the statements on stdin and discard the per-relation block counts.
  psql_run "$ROOT" "$PORT" >/dev/null <<<"$prewarm_sql"
}

# Start a cluster for one build, run the selected test(s) for every size in
# $SIZES, then stop the cluster again.
#   $1 ROOT   cluster/build root for this run
#   $2 LABEL  result-file prefix (the caller passes "base" or "patched")
# Reads the globals TEST, SIZES and IO_DELAY_MS.
run_tests() {
  local ROOT="$1" LABEL="$2"
  local PORT
  PORT=$(pick_port)
  
  log "[$LABEL] Starting cluster on port $PORT"
  init_cluster "$ROOT" "$PORT"

  # Install the cleanup handler as soon as the cluster exists, so that a
  # failure in the warmup or I/O-delay setup below cannot leave it running.
  # %q quotes ROOT for re-parsing by the trap, which keeps paths containing
  # quotes or spaces intact.
  local stop_cmd
  printf -v stop_cmd 'stop_cluster %q' "$ROOT"
  trap "$stop_cmd" EXIT

  warmup_catalog "$ROOT" "$PORT"
  set_io_delay "$IO_DELAY_MS"
  
  for SIZE in $SIZES; do
    case "$TEST" in
      bloom_scan)   test_bloom_scan "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      bloom_vacuum) test_bloom_vacuum "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      pgstattuple)  test_pgstattuple "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      pgstatindex)  test_pgstatindex "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      gin_vacuum)   test_gin_vacuum "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      hash_vacuum)  test_hash_vacuum "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      wal_logging)  test_wal_logging "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      all)
        # NOTE(review): "all" runs neither bloom_scan nor gin_vacuum; this
        # looks deliberate but is not explained here — confirm.
        test_bloom_vacuum "$ROOT" "$PORT" "$LABEL" "$SIZE"
        test_pgstattuple "$ROOT" "$PORT" "$LABEL" "$SIZE"
        test_pgstatindex "$ROOT" "$PORT" "$LABEL" "$SIZE"
        test_hash_vacuum "$ROOT" "$PORT" "$LABEL" "$SIZE"
        test_wal_logging "$ROOT" "$PORT" "$LABEL" "$SIZE"
        ;;
      *) die "Unknown test: $TEST" ;;
    esac
  done
  
  # Normal completion: stop explicitly and disarm the handler.
  stop_cluster "$ROOT"
  trap - EXIT
}

# --- Compare results ---
# Print one summary line comparing the median runtime of a baseline CSV with
# that of a patched CSV: both medians, speedup factor, percentage change and,
# when the CSV header has a "reads" column, the median I/O counters.
#   $1 base_csv   baseline results file
#   $2 patch_csv  patched results file
#   $3 label      name printed in the first column
# Prints nothing if either file is missing.
compare_results() {
  local base_csv="$1" patch_csv="$2" label="$3"
  
  if [[ ! -f "$base_csv" || ! -f "$patch_csv" ]]; then
    return 0
  fi
  
  local base_med patch_med
  base_med=$(calc_median "$base_csv")
  patch_med=$(calc_median "$patch_csv")
  
  # Guard against division by zero. The check is numeric, so "0.0", "0.00",
  # an empty string or any non-numeric value is caught as well as a plain "0".
  if ! awk -v v="$base_med" 'BEGIN { exit !(v + 0 > 0) }'; then
    base_med="0.001"
  fi
  if ! awk -v v="$patch_med" 'BEGIN { exit !(v + 0 > 0) }'; then
    patch_med="0.001"
  fi
  
  # Values are handed to awk with -v rather than spliced into the program
  # text, so an odd value cannot turn into an awk syntax error.
  local speedup pct
  speedup=$(awk -v b="$base_med" -v p="$patch_med" 'BEGIN { printf "%.2f", b / p }')
  pct=$(awk -v b="$base_med" -v p="$patch_med" 'BEGIN { printf "%.1f", (b - p) / b * 100 }')
  
  local io_info=""
  if head -1 "$base_csv" | grep -q "reads"; then
    # Standard test: columns are run,ms,reads,read_time_ms,writes,write_time_ms
    local base_reads patch_reads base_rtime patch_rtime base_writes patch_writes base_wtime patch_wtime
    base_reads=$(calc_median_col "$base_csv" 3)
    patch_reads=$(calc_median_col "$patch_csv" 3)
    base_rtime=$(calc_median_col "$base_csv" 4)
    patch_rtime=$(calc_median_col "$patch_csv" 4)
    base_writes=$(calc_median_col "$base_csv" 5)
    patch_writes=$(calc_median_col "$patch_csv" 5)
    base_wtime=$(calc_median_col "$base_csv" 6)
    patch_wtime=$(calc_median_col "$patch_csv" 6)
    # Default to 0 if empty
    base_reads=${base_reads:-0}
    patch_reads=${patch_reads:-0}
    base_rtime=${base_rtime:-0}
    patch_rtime=${patch_rtime:-0}
    base_writes=${base_writes:-0}
    patch_writes=${patch_writes:-0}
    base_wtime=${base_wtime:-0}
    patch_wtime=${patch_wtime:-0}
    io_info="  (reads=${base_reads}→${patch_reads}, read_time=${base_rtime}→${patch_rtime}ms, writes=${base_writes}→${patch_writes}, write_time=${base_wtime}→${patch_wtime}ms)"
  fi
  
  printf "%-26s base=%8.1fms  patch=%8.1fms  %5.2fx  (%5.1f%%)%s\n" \
    "$label" "$base_med" "$patch_med" "$speedup" "$pct" "$io_info"
}

# Print the final report: a baseline-vs-patched comparison table when
# BASELINE is 1, otherwise per-file statistics for the patched run only,
# followed by the result directories and any generated flamegraphs.
# Reads the globals BASELINE, SIZES, TEST, ROOT_BASE, ROOT_PATCH, DO_PROFILE.
print_summary() {
  local banner_rule="═══════════════════════════════════════════════════════════════════════"

  echo ""
  echo "$banner_rule"
  echo "                     STREAMING READ BENCHMARK RESULTS                   "
  echo "$banner_rule"
  echo ""

  if [[ $BASELINE -ne 1 ]]; then
    echo "Results (patched only):"
    echo ""
    for f in "$ROOT_PATCH/results/"*.csv; do
      # An unmatched glob stays literal; skip anything that is not a file.
      if [[ ! -f "$f" ]]; then
        continue
      fi
      printf "%-40s %s\n" "$(basename "$f" .csv):" "$(calc_stats "$f")"
    done
  else
    printf "%-26s %-17s %-17s %-7s %-7s %s\n" "TEST" "BASELINE" "PATCHED" "SPEEDUP" "CHANGE" "I/O TIME"
    echo "─────────────────────────────────────────────────────────────────────────────────────────────────"

    for SIZE in $SIZES; do
      for test_name in bloom_scan bloom_vacuum pgstattuple pgstatindex gin_vacuum hash_vacuum wal_logging; do
        # Report a test only if it was selected, directly or through "all".
        if [[ "$TEST" == "all" || "$TEST" == "$test_name" ]]; then
          compare_results \
            "$ROOT_BASE/results/base_${test_name}_${SIZE}.csv" \
            "$ROOT_PATCH/results/patched_${test_name}_${SIZE}.csv" \
            "${test_name}_${SIZE}"
        fi
      done
    done
  fi

  echo ""
  echo "$banner_rule"
  echo "CSV files: $ROOT_PATCH/results/"
  if [[ $BASELINE -eq 1 ]]; then
    echo "Baseline:  $ROOT_BASE/results/"
  fi

  # List generated flamegraphs
  if [[ $DO_PROFILE -eq 1 ]]; then
    local svgs=()
    for dir in "$ROOT_BASE/profile" "$ROOT_PATCH/profile"; do
      if [[ ! -d "$dir" ]]; then
        continue
      fi
      for svg in "$dir"/*.svg; do
        if [[ -f "$svg" ]]; then
          svgs+=("$svg")
        fi
      done
    done
    if (( ${#svgs[@]} > 0 )); then
      echo ""
      echo "Flamegraphs:"
      for svg in "${svgs[@]}"; do
        echo "  $svg"
      done
    fi
  fi

  echo "$banner_rule"
}

# --- Main ---
# Entry point: log the effective configuration, build the tree(s), run the
# baseline (when BASELINE is 1) and patched test passes, and print the summary.
main() {
  log "Streaming Read Benchmark"
  log "Patch: $PATCH ($PATCH_TAG)"
  log "Tests: $TEST"
  log "Sizes: $SIZES"
  log "Reps:  $REPS"
  log "I/O:   $IO_METHOD (workers=$IO_WORKERS, concurrency=$IO_MAX_CONCURRENCY)"
  if [[ $DIRECT_IO -eq 1 ]]; then
    log "Direct IO: enabled (debug_io_direct=data)"
  fi
  if [[ -n "$IO_DELAY_MS" ]]; then
    log "I/O delay: ${IO_DELAY_MS}ms read / ${WRITE_DELAY_MS}ms write via dm_delay ($DM_DELAY_DEV)"
  fi
  if [[ $DO_PROFILE -eq 1 ]]; then
    log "Profile: enabled (flamegraphs → <root>/profile/)"
  fi

  # Build the baseline tree first (no patch), then the patched tree.
  if [[ $BASELINE -eq 1 ]]; then
    build_pg "$ROOT_BASE" ""
  fi
  build_pg "$ROOT_PATCH" "$PATCH"

  # Run the tests in the same order: baseline, then patched.
  if [[ $BASELINE -eq 1 ]]; then
    log "Running baseline tests"
    run_tests "$ROOT_BASE" "base"
  fi

  log "Running patched tests"
  run_tests "$ROOT_PATCH" "patched"

  print_summary
}

main

^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 23:04                     ` Re: Streamify more code paths Andres Freund <[email protected]>
  2026-03-11 01:37                       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 15:11                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 21:33                           ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-12 03:42                             ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-12 04:39                               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-12 15:35                                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
@ 2026-03-13 01:49                                   ` Michael Paquier <[email protected]>
  2026-03-13 02:39                                     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Michael Paquier @ 2026-03-13 01:49 UTC (permalink / raw)
  To: Xuneng Zhou <[email protected]>; +Cc: Andres Freund <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

On Thu, Mar 12, 2026 at 11:35:48PM +0800, Xuneng Zhou wrote:
> Our io_time metric currently measures only read time and ignores write
> I/O, which can be misleading. We now separate it into read_time and
> write_time.

I had a look at the pgstatindex part this morning, running my own test
under conditions similar to 6c228755add8, and here's one extract with
io_uring:
pgstatindex (100k tuples) base=32938.2ms patch=83.3ms 395.60x ( 99.7%)
(reads=2745->173, io_time=32932.09->59.75ms)

There was one issue with a declaration put in the middle of the code,
which I have fixed.  This one is now done; three pieces remain to
evaluate.
--
Michael


Attachments:

  [application/pgp-signature] signature.asc (833B, 2-signature.asc)
  download

^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 23:04                     ` Re: Streamify more code paths Andres Freund <[email protected]>
  2026-03-11 01:37                       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 15:11                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 21:33                           ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-12 03:42                             ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-12 04:39                               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-12 15:35                                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-13 01:49                                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
@ 2026-03-13 02:39                                     ` Xuneng Zhou <[email protected]>
  2026-03-14 09:56                                       ` Re: Streamify more code paths Michael Paquier <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Xuneng Zhou @ 2026-03-13 02:39 UTC (permalink / raw)
  To: Michael Paquier <[email protected]>; +Cc: Andres Freund <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

On Fri, Mar 13, 2026 at 9:50 AM Michael Paquier <[email protected]> wrote:
>
> On Thu, Mar 12, 2026 at 11:35:48PM +0800, Xuneng Zhou wrote:
> > Our io_time metric currently measures only read time and ignores write
> > I/O, which can be misleading. We now separate it into read_time and
> > write_time.
>
> I had a look at the pgstatindex part this morning, running my own test
> under conditions similar to 6c228755add8, and here's one extract with
> io_uring:
> pgstatindex (100k tuples) base=32938.2ms patch=83.3ms 395.60x ( 99.7%)
> (reads=2745->173, io_time=32932.09->59.75ms)

This result looks great!

> There was one issue with a declaration put in the middle of the code,
> that I have fixed.  This one is now done, remains 3 pieces to
> evaluate.

Thanks for fixing this and for taking the time to review and test the patches.

--
Best,
Xuneng





^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 23:04                     ` Re: Streamify more code paths Andres Freund <[email protected]>
  2026-03-11 01:37                       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 15:11                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 21:33                           ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-12 03:42                             ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-12 04:39                               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-12 15:35                                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-13 01:49                                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-13 02:39                                     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
@ 2026-03-14 09:56                                       ` Michael Paquier <[email protected]>
  2026-03-15 02:51                                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Michael Paquier @ 2026-03-14 09:56 UTC (permalink / raw)
  To: Xuneng Zhou <[email protected]>; +Cc: Andres Freund <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

On Fri, Mar 13, 2026 at 10:39:52AM +0800, Xuneng Zhou wrote:
> Thanks for fixing this and for taking the time to review and test
> the patches.

Looking at the rest, I have produced some numbers:
pgstattuple_small (20k tuples, io_uring) base= 60839.9ms
patch=10949.9ms 5.56x ( 82.0%) (reads=4139->260,
io_time=49616.97->55.25ms)
pgstattuple_small (20k tuples, worker=3) base= 60577.5ms
patch=11470.0ms 5.28x ( 81.1%) (reads=4139->260,
io_time=49359.79->69.60ms)
hash_vacuum (1M tuples, io_uring)  base=199929.0ms patch=161747.0ms
1.24x ( 19.1%) (reads=4665->1615, io_time=47084.8->9925.77ms) 
hash_vacuum (1M tuples, worker=12) base=203417.0ms patch=161687.0ms
1.26x ( 20.5%) (reads=4665->1615, io_time=48356.3->9917.24ms) 

The hash vacuum numbers are less amazing here than yours.  Trying out
various configurations does not change the results much (I was puzzled
for a couple of hours that I did not see any performance impact, but I
had forgotten to evict the index pages from shared buffers, which
brings the numbers to what I have here), but I'll take it anyway.

One thing that I was wondering for the pgstattuple patch is if we
should have "scanned" put outside the private data of the callback as
we get back to the main loop once we know that the page is not
all-visible, so we could increment the counter in the main loop
instead of the callback.  Now I get that you have done that as it
feels cleaner for the "default" return path of the callback, while the
logic remains the same, so I have kept it as-is at the end, tweaked a
few things, and applied this one.

I have not yet been able to review the patch for the hash VACUUM
proposal, which would be the last one.
--
Michael


Attachments:

  [application/pgp-signature] signature.asc (833B, 2-signature.asc)
  download

^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 23:04                     ` Re: Streamify more code paths Andres Freund <[email protected]>
  2026-03-11 01:37                       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 15:11                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 21:33                           ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-12 03:42                             ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-12 04:39                               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-12 15:35                                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-13 01:49                                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-13 02:39                                     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-14 09:56                                       ` Re: Streamify more code paths Michael Paquier <[email protected]>
@ 2026-03-15 02:51                                         ` Xuneng Zhou <[email protected]>
  2026-03-15 03:47                                           ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Xuneng Zhou @ 2026-03-15 02:51 UTC (permalink / raw)
  To: Michael Paquier <[email protected]>; +Cc: Andres Freund <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

On Sat, Mar 14, 2026 at 5:56 PM Michael Paquier <[email protected]> wrote:
>
> On Fri, Mar 13, 2026 at 10:39:52AM +0800, Xuneng Zhou wrote:
> > Thanks for fixing this and for taking the time to review and test
> > the patches.
>
> Looking at the rest, I have produced some numbers:
> pgstattuple_small (20k tuples, io_uring) base= 60839.9ms
> patch=10949.9ms 5.56x ( 82.0%) (reads=4139->260,
> io_time=49616.97->55.25ms)
> pgstattuple_small (20k tuples, worker=3) base= 60577.5ms
> patch=11470.0ms 5.28x ( 81.1%) (reads=4139->260,
> io_time=49359.79->69.60ms)
> hash_vacuum (1M tuples, io_uring)  base=199929.0ms patch=161747.0ms
> 1.24x ( 19.1%) (reads=4665->1615, io_time=47084.8->9925.77ms)
> hash_vacuum (1M tuples, worker=12) base=203417.0ms patch=161687.0ms
> 1.26x ( 20.5%) (reads=4665->1615, io_time=48356.3->9917.24ms)
>
> The hash vacuum numbers are less amazing here than yours.  Trying out
> various configurations does not change the results much (I was puzzled
> for a couple of hours that I did not see any performance impact but
> forgot the eviction of the index pages from the shared buffers, that
> influences the numbers to what I have here), but I'll take it anyway.

My guess is that the results are influenced by the write delay. Vacuum
operations can be write-intensive, so when both the read and write
delays are set to 2–5 ms, a large portion of the runtime may be spent
on writes. According to Amdahl’s Law, the overall performance
improvement from optimizing a single component is limited by the
fraction of the total execution time that component accounts for. In
this case, the potential speedup from streaming the read path could be
masked by the time spent performing writes.

To investigate this, I added a new option, write-delay. When it is set
to zero, the benchmark simulates a system with a fast write device and
a slow read device, reducing the proportion of time spent on writes.
Admittedly, this setup is somewhat artificial—we would not normally
expect such a large discrepancy between read and write performance in
real systems.

-- worker 12, write-delay 2 ms
hash_vacuum_medium         base= 33743.2ms  patch= 27371.3ms   1.23x
( 18.9%)  (reads=4662→1612, read_time=8242.51→1725.03ms,
writes=12689→12651, write_time=25144.87→25041.75ms)

-- worker 12, write-delay 0 ms
hash_vacuum_medium         base=  8601.1ms  patch=  2234.0ms   3.85x
( 74.0%)  (reads=4662→1612, read_time=8021.65→1637.87ms,
writes=12689→12651, write_time=337.38→288.15ms)

To better understand the behavior, the latest version of the script
separates the I/O time into read time and write time. This allows us
to directly observe their respective contributions and how they change
across runs. A further improvement would be to report the speedup for
the read and write components separately, making it easier to
understand where and how much the performance gains actually occur.

> One thing that I was wondering for the pgstattuple patch is if we
> should have "scanned" put outside the private data of the callback as
> we get back to the main loop once we know that the page is not
> all-visible, so we could increment the counter in the main loop
> instead of the callback.  Now I get that you have done that as it
> feels cleaner for the "default" return path of the callback, while the
> logic remains the same, so I have kept it as-is at the end, tweaked a
> few things, and applied this one.

Thanks for the review and for applying it. My reasoning for putting
scanned inside the callback was to keep all per-block accounting in
one place — the callback is already the point where the skip-vs-read
decision is made, so it seemed natural to count reads there as well.
But I agree the main loop would also be a clean spot for it.

> I have not been able to review yet the patch for the hash VACUUM
> proposal, which would be the last one.
> --
> Michael



-- 
Best,
Xuneng


Attachments:

  [text/x-sh] run_streaming_benchmark.sh (34.9K, 2-run_streaming_benchmark.sh)
  download | inline:
#!/usr/bin/env bash
set -euo pipefail

###############################################################################
# Streaming Read Patches Benchmark
#
# Usage: ./run_streaming_benchmark.sh [OPTIONS] <patch>
#
# Options:
#   --clean           Remove existing builds and start fresh
#   --baseline        Also build and test vanilla PostgreSQL for comparison
#   --test TEST       Run specific test (bloom_scan, bloom_vacuum, pgstattuple,
#                     pgstatindex, gin_vacuum, wal_logging, hash_vacuum, or "all")
#   --io-method MODE  I/O method: io_uring, worker, or sync (default: io_uring)
#   --io-workers N    Number of I/O workers for worker mode (default: 3)
#   --io-concurrency N  Max concurrent I/Os per process (default: 64)
#   --direct-io         Enable direct IO (debug_io_direct=data), bypasses OS page cache
#   --read-delay MS     Simulate read latency via dm_delay (requires pre-created device)
#   --write-delay MS    Simulate write latency via dm_delay (default: 0, requires --read-delay)
#   --profile           Enable perf profiling and flamegraph generation
#
# Environment:
#   WORKROOT       Base directory (default: $HOME/pg_bench)
#   REPS           Repetitions per test (default: 5)
#   SIZES          Table sizes to test (default: "large")
#   FLAMEGRAPH_DIR Path to FlameGraph tools (default: $HOME/FlameGraph)
#   DM_DELAY_DEV   dm_delay device name for --read-delay (default: "delayed")
###############################################################################

# Print an informational message on stdout, prefixed with a bold blue "==>".
log() {
  printf '\033[1;34m==>\033[0m %s\n' "$*"
}

# Print an error message on stderr, prefixed with a bold red "ERROR:",
# then terminate the script with exit status 1.
die() {
  printf '\033[1;31mERROR:\033[0m %s\n' "$*" >&2
  exit 1
}

# --- CLI parsing ---
# Defaults; IO_METHOD, IO_WORKERS, IO_MAX_CONCURRENCY and DM_DELAY_DEV can also
# be preset through the environment and are overridden by the matching option.
CLEAN=0
BASELINE=0
DO_PROFILE=0
DIRECT_IO=0
IO_DELAY_MS=""
WRITE_DELAY_MS="0"
TEST="all"
IO_METHOD="${IO_METHOD:-io_uring}"
IO_WORKERS="${IO_WORKERS:-3}"
IO_MAX_CONCURRENCY="${IO_MAX_CONCURRENCY:-64}"
DM_DELAY_DEV="${DM_DELAY_DEV:-delayed}"
PATCH=""

# Abort with a readable message when a value-taking option is the last word on
# the command line. Without this, "set -u" kills the script with a bare
# "unbound variable" error on $2.
#   require_value "$@"   (call with the remaining argument list)
require_value() {
  [[ $# -ge 2 ]] || die "Option $1 requires a value"
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --clean)          CLEAN=1 ;;
    --baseline)       BASELINE=1 ;;
    --profile)        DO_PROFILE=1 ;;
    --direct-io)      DIRECT_IO=1 ;;
    --read-delay)     require_value "$@"; IO_DELAY_MS="$2"; shift ;;
    --write-delay)    require_value "$@"; WRITE_DELAY_MS="$2"; shift ;;
    --test)           require_value "$@"; TEST="$2"; shift ;;
    --io-method)      require_value "$@"; IO_METHOD="$2"; shift ;;
    --io-workers)     require_value "$@"; IO_WORKERS="$2"; shift ;;
    --io-concurrency) require_value "$@"; IO_MAX_CONCURRENCY="$2"; shift ;;
    # Help text is the header comment of this script with the "# " prefix removed
    -h|--help)        sed -n '3,27p' "$0" | sed 's/^# \?//'; exit 0 ;;
    -*)               die "Unknown option: $1" ;;
    # Any non-option word is taken as the patch file (the last one wins)
    *)                PATCH="$1" ;;
  esac
  shift
done

# Validate io_method
case "$IO_METHOD" in
  io_uring|worker|sync) ;;
  *) die "Invalid --io-method: $IO_METHOD (must be io_uring, worker, or sync)" ;;
esac

# The write delay is only ever applied together with the read delay
# (set_io_delay does nothing when --read-delay is unset), so reject a
# write delay that would otherwise be silently ignored.
if [[ -z "$IO_DELAY_MS" && "$WRITE_DELAY_MS" != "0" ]]; then
  die "--write-delay requires --read-delay"
fi

# Validate dm_delay device if --read-delay is used
if [[ -n "$IO_DELAY_MS" ]]; then
  command -v dmsetup >/dev/null 2>&1 || die "--read-delay requires dmsetup (sudo apt install dmsetup)"
  # die prints its argument with %s, so a "\n" escape would show up literally;
  # splice in a real newline instead.
  sudo dmsetup status "$DM_DELAY_DEV" >/dev/null 2>&1 \
    || die "dm_delay device '$DM_DELAY_DEV' not found. Create it first, e.g.:"$'\n'"  umount /srv && dmsetup create $DM_DELAY_DEV --table \"0 \$(blockdev --getsz /dev/DEVICE) delay /dev/DEVICE 0 $IO_DELAY_MS\" && mount /dev/mapper/$DM_DELAY_DEV /srv/"
fi

# A patch file is mandatory; normalise it to an absolute path because
# build_pg changes directory before applying it.
[[ -z "$PATCH" ]] && die "Usage: $0 [--clean] [--baseline] [--test TEST] <patch>"
[[ ! -f "$PATCH" ]] && die "Patch not found: $PATCH"
[[ "$PATCH" != /* ]] && PATCH="$PWD/$PATCH"

# --- Profiling validation ---
# Profiling knobs, each overridable from the environment.
: "${FLAMEGRAPH_DIR:=$HOME/FlameGraph}"
: "${PERF_SUDO:=sudo}"
: "${PERF_EVENT:=cycles}"  # cycles = user+kernel; cycles:u = user-only

# With --profile, make sure perf and both FlameGraph scripts are usable
# before any build or benchmark work starts.
if (( DO_PROFILE == 1 )); then
  if ! command -v perf >/dev/null 2>&1; then
    die "Need perf (sudo apt install linux-tools-$(uname -r))"
  fi
  if [[ ! -x "$FLAMEGRAPH_DIR/stackcollapse-perf.pl" ]]; then
    die "Missing $FLAMEGRAPH_DIR/stackcollapse-perf.pl (git clone https://github.com/brendangregg/FlameGraph)"
  fi
  if [[ ! -x "$FLAMEGRAPH_DIR/flamegraph.pl" ]]; then
    die "Missing $FLAMEGRAPH_DIR/flamegraph.pl"
  fi
fi

# --- Configuration ---
# Environment-overridable settings (see the header comment for their meaning).
: "${WORKROOT:=$HOME/pg_bench}"
: "${REPS:=5}"
: "${SIZES:=large}"

# One build tree for vanilla PostgreSQL and one per patch. The patch tree is
# named after the patch file: ".patch" suffix dropped, reduced to
# alphanumerics, "_" and "-", and cut to 40 characters.
ROOT_BASE="$WORKROOT/vanilla"
PATCH_TAG=$(
  basename "$PATCH" .patch \
    | tr -dc '[:alnum:]_-' \
    | cut -c1-40
)
ROOT_PATCH="$WORKROOT/$PATCH_TAG"

# --- Helpers ---
# Print the path of an installed PostgreSQL binary.
#   pg ROOT BINARY   ->   ROOT/pg/bin/BINARY
pg() {
  local root="$1" binary="$2"
  echo "$root/pg/bin/$binary"
}

# Print the first TCP port, counting up from $1 (default 5432) to 60000,
# that lsof does not report as being in LISTEN state. Dies if none is found.
pick_port() {
  p="${1:-5432}"
  while (( p <= 60000 )); do
    if ! lsof -iTCP:"$p" -sTCP:LISTEN >/dev/null 2>&1; then
      echo "$p"
      return
    fi
    p=$((p + 1))
  done
  die "No free port found"
}

# Reprogram the dm_delay device $DM_DELAY_DEV with new latencies.
#   set_io_delay READ_MS
# The write latency comes from the global WRITE_DELAY_MS. Does nothing when
# --read-delay was not given (IO_DELAY_MS is empty).
set_io_delay() {
  local ms="$1"
  [[ -z "$IO_DELAY_MS" ]] && return
  local table size target dev offset
  table=$(sudo dmsetup table "$DM_DELAY_DEV")
  # Table line layout: <start> <size> delay <device> <offset> <delay> [...]
  read -r _ size target dev offset _ <<<"$table"
  # Refuse to rewrite a mapping that is not a delay target
  [[ "$target" == "delay" ]] \
    || die "Device '$DM_DELAY_DEV' is not a dm-delay target (found: ${target:-none})"
  log "Setting dm_delay on $DM_DELAY_DEV to ${ms}ms read / ${WRITE_DELAY_MS}ms write"
  sudo dmsetup suspend "$DM_DELAY_DEV"
  # Keep the existing device offset rather than assuming 0, and use the same
  # backing device for the read and the write leg.
  if ! sudo dmsetup reload "$DM_DELAY_DEV" \
      --table "0 $size delay $dev $offset $ms $dev $offset $WRITE_DELAY_MS"; then
    # Never leave the device suspended: I/O to it would block.
    sudo dmsetup resume "$DM_DELAY_DEV" || true
    die "Failed to reload dm_delay table on $DM_DELAY_DEV"
  fi
  sudo dmsetup resume "$DM_DELAY_DEV"
}

# --- Build PostgreSQL ---
# Build and install PostgreSQL plus the contrib modules the benchmarks need.
#   build_pg ROOT [PATCH_FILE]
# Sources are cloned into ROOT/src and installed into ROOT/pg. An existing
# install (ROOT/pg/bin/initdb is executable) is reused unless --clean was
# given. Leaves the current directory at ROOT/src.
build_pg() {
  local ROOT="$1" PATCH_FILE="${2:-}"

  [[ $CLEAN -eq 1 ]] && rm -rf "$ROOT"

  if [[ ! -x "$(pg "$ROOT" initdb)" ]]; then
    log "Building PostgreSQL: $ROOT"
    mkdir -p "$ROOT"

    # A source tree without an installed initdb is a leftover from an
    # interrupted or failed build; git clone would refuse to reuse it.
    rm -rf "$ROOT/src"
    git clone --depth 1 https://github.com/postgres/postgres "$ROOT/src" 2>/dev/null \
      || die "git clone failed for $ROOT/src"
    cd "$ROOT/src"

    if [[ -n "$PATCH_FILE" ]]; then
      log "Applying patch"
      git apply "$PATCH_FILE" || die "Patch does not apply: $PATCH_FILE"
    fi

    # Build output is discarded, so name the failing step explicitly
    ./configure --prefix="$ROOT/pg" --with-liburing \
      CFLAGS='-O2 -ggdb3 -fno-omit-frame-pointer' >/dev/null 2>&1 \
      || die "configure failed in $ROOT/src"

    make -j"$(nproc)" install >/dev/null 2>&1 \
      || die "make install failed in $ROOT/src"
  else
    log "Reusing build: $ROOT"
    cd "$ROOT/src"
  fi

  # Always install contribs (idempotent, catches reused builds missing new extensions)
  local ext
  for ext in bloom pgstattuple pg_buffercache pg_prewarm; do
    make -C "contrib/$ext" install >/dev/null 2>&1 \
      || die "Failed to install contrib/$ext in $ROOT/src"
  done
}

# --- Cluster management ---
# Create a fresh cluster in ROOT/data, append the benchmark settings to its
# postgresql.conf, start the server and create the helper extensions.
#   init_cluster ROOT PORT
init_cluster() {
  local ROOT="$1" PORT="$2"
  local datadir="$ROOT/data"
  local conf="$datadir/postgresql.conf"

  # Any previous data directory is discarded
  rm -rf "$datadir"
  "$(pg "$ROOT" initdb)" -D "$datadir" --no-locale >/dev/null 2>&1

  cat >> "$conf" <<EOF
port = $PORT
listen_addresses = '127.0.0.1'
shared_buffers = '32GB'
effective_io_concurrency = 200
io_method = $IO_METHOD
io_workers = $IO_WORKERS
io_max_concurrency = $IO_MAX_CONCURRENCY
track_io_timing = on
track_wal_io_timing = on
synchronous_commit = on
autovacuum = off
checkpoint_timeout = 1h
max_wal_size = 10GB
max_parallel_workers_per_gather = 0
EOF

  # --direct-io: have the server bypass the OS page cache for data files
  if [[ $DIRECT_IO -eq 1 ]]; then
    echo "debug_io_direct = data" >> "$conf"
  fi

  # -w: wait until the server accepts connections
  "$(pg "$ROOT" pg_ctl)" -D "$datadir" -l "$ROOT/server.log" start -w >/dev/null

  # pg_buffercache is used by drop_caches, pg_prewarm by warmup_catalog
  psql_run "$ROOT" "$PORT" -c "CREATE EXTENSION IF NOT EXISTS pg_buffercache;"
  psql_run "$ROOT" "$PORT" -c "CREATE EXTENSION IF NOT EXISTS pg_prewarm;"
}

# Stop the server running on ROOT/data (fast shutdown). Best effort: errors
# are discarded, so it is safe to call when no server is running.
#   stop_cluster ROOT
stop_cluster() {
  local ROOT="$1"
  local pg_ctl_bin
  pg_ctl_bin="$(pg "$ROOT" pg_ctl)"
  "$pg_ctl_bin" -D "$ROOT/data" stop -m fast 2>/dev/null || true
}

# Make the next access to the given relations cold.
#   drop_caches ROOT PORT RELATION [RELATION...]
drop_caches() {
  local ROOT="$1" PORT="$2"
  local rels=("${@:3}")

  # Step 1: push the relations out of shared buffers; the server keeps running
  for rel in "${rels[@]}"; do
    psql_run "$ROOT" "$PORT" -c "SELECT pg_buffercache_evict_relation('${rel}'::regclass);" >/dev/null
  done

  # Step 2: empty the OS page cache. Pointless with direct IO, which does not
  # go through the page cache.
  [[ $DIRECT_IO -eq 0 ]] || return 0
  sync
  echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 2>&1 || true
  sleep 2
}

# Run psql against the benchmark cluster, stopping at the first SQL error.
#   psql_run ROOT PORT [PSQL_ARGS...]
# Extra arguments (e.g. -c "...") are passed through; SQL may also be fed on
# stdin. Output is unaligned, tuples-only and quiet (-Atq).
psql_run() {
  local ROOT="$1" PORT="$2"
  shift 2
  # -X: ignore ~/.psqlrc, as the timing invocations do, so a user's personal
  # psql settings cannot change what the setup sessions do or print.
  "$(pg "$ROOT" psql)" -h 127.0.0.1 -p "$PORT" -d postgres -X -v ON_ERROR_STOP=1 -Atq "$@"
}

# --- Timing ---
# Run one SQL command with psql's \timing enabled and print the elapsed time
# in milliseconds (three decimals) on stdout.
#   run_timed ROOT PORT SQL
# psql's stderr is merged into the parsed stream. The awk step exits 1 when no
# "Time:" line was seen; a value that is not a plain non-negative number is
# reported on stderr and the function returns 1.
run_timed() {
  local ROOT="$1" PORT="$2" SQL="$3"
  local ms
  # -X: ignore .psqlrc, -v ON_ERROR_STOP=1: fail on SQL errors
  # Parse last Time: line, handle both "ms" and "s" units
  ms=$("$(pg "$ROOT" psql)" -h 127.0.0.1 -p "$PORT" -d postgres -X -v ON_ERROR_STOP=1 -At \
    -c '\timing on' -c "$SQL" 2>&1 | \
    awk '
      /Time:/ {
        val=$2; unit=$3;
        if (unit=="ms") ms=val;
        else if (unit=="s") ms=val*1000;
      }
      END { if (ms=="") exit 1; printf "%.3f\n", ms; }
    ')
  # Validate numeric output
  [[ "$ms" =~ ^[0-9]+(\.[0-9]+)?$ ]] || { echo "ERROR: Non-numeric timing: $ms" >&2; return 1; }
  echo "$ms"
}

# --- I/O Stats ---
# Run SQL and capture timing + I/O stats from pg_stat_io
#   run_timed_with_io ROOT PORT SQL
# Resets stats before query, waits for flush, then reads absolute values
# Note: pg_stat_io has PGSTAT_MIN_INTERVAL=1000ms flush delay, so we wait 1.5s
#       after the query to ensure stats are flushed to shared memory.
# Note: pg_stat_io counts I/O operations, not pages (with io_combine_limit=128kB,
#       up to 16 pages per operation). This is expected behavior.
# Prints on stdout: ms,reads,read_time,writes,write_time
# Returns 1, with a message on stderr, when psql fails or no usable timing is
# found, so a failed query is never recorded as a benchmark sample.
run_timed_with_io() {
  local ROOT="$1" PORT="$2" SQL="$3"
  local result

  # Reset stats, run query, wait for flush, read absolute values
  # - Filter by client backend and io worker (excludes bgwriter/checkpointer)
  # - 1.5s delay allows stats to flush (PGSTAT_MIN_INTERVAL=1000ms)
  # The 2>&1 has to sit on the psql command line itself: on a line of its own
  # after the heredoc it is a separate empty command that captures nothing and
  # replaces psql's exit status with 0.
  if ! result=$("$(pg "$ROOT" psql)" -h 127.0.0.1 -p "$PORT" -d postgres -X -v ON_ERROR_STOP=1 2>&1 <<EOSQL
SELECT pg_stat_reset_shared('io');
\\timing on
$SQL
\\timing off
SELECT pg_sleep(1.5);
\\t on
SELECT
  COALESCE(SUM(reads),0)::bigint,
  COALESCE(SUM(read_time),0)::numeric(12,2),
  COALESCE(SUM(writes),0)::bigint,
  COALESCE(SUM(write_time),0)::numeric(12,2)
FROM pg_stat_io
WHERE object = 'relation' AND backend_type IN ('client backend', 'io worker');
EOSQL
  ); then
    echo "ERROR: psql failed while running: $SQL" >&2
    echo "$result" >&2
    return 1
  fi

  # Parse timing (last Time: line); timing is switched off again right after
  # \$SQL, so that line belongs to the last statement of \$SQL
  local ms
  ms=$(echo "$result" | awk '
    /Time:/ {
      val=$2; unit=$3;
      if (unit=="ms") ms=val;
      else if (unit=="s") ms=val*1000;
    }
    END { if (ms=="") exit 1; printf "%.3f\n", ms; }
  ') || ms=""
  # Same validation as run_timed: refuse to emit a sample without a timing
  [[ "$ms" =~ ^[0-9]+(\.[0-9]+)?$ ]] || { echo "ERROR: Non-numeric timing: $ms" >&2; return 1; }

  # Parse I/O stats (last line with pipe separator: reads|read_time|writes|write_time)
  local reads read_time writes write_time
  local io_line
  # No matching line is tolerated here; the counters then default to 0 below
  io_line=$(echo "$result" | grep '|' | tail -1) || true
  reads=$(echo "$io_line"     | cut -d'|' -f1 | tr -d ' ')
  read_time=$(echo "$io_line"  | cut -d'|' -f2 | tr -d ' ')
  writes=$(echo "$io_line"    | cut -d'|' -f3 | tr -d ' ')
  write_time=$(echo "$io_line" | cut -d'|' -f4 | tr -d ' ')

  # Default to 0 if not found
  [[ "$reads"      =~ ^-?[0-9]+$             ]] || reads=0
  [[ "$read_time"  =~ ^-?[0-9]+(\.[0-9]+)?$ ]] || read_time=0
  [[ "$writes"     =~ ^-?[0-9]+$             ]] || writes=0
  [[ "$write_time" =~ ^-?[0-9]+(\.[0-9]+)?$ ]] || write_time=0

  echo "$ms,$reads,$read_time,$writes,$write_time"
}

# --- Statistics ---
# Print the median of column 2 (ms) of a results CSV, skipping the header row.
# Prints 0 when the file has no data rows.
#   calc_median CSV_FILE
calc_median() {
  awk -F, '
    NR > 1 { v[++cnt] = $2 }
    END {
      if (cnt == 0) { print 0; exit }
      # Sort ascending by pairwise exchange (only a handful of rows)
      for (x = 1; x <= cnt; x++)
        for (y = x + 1; y <= cnt; y++)
          if (v[x] > v[y]) { tmp = v[x]; v[x] = v[y]; v[y] = tmp }
      # Odd count: middle value; even count: mean of the two middle values
      if (cnt % 2)
        print v[int(cnt / 2) + 1]
      else
        print (v[cnt / 2] + v[cnt / 2 + 1]) / 2
    }
  ' "$1"
}

# Print the median of one column of a results CSV, skipping the header row.
# Prints 0 when the file has no data rows.
#   calc_median_col CSV_FILE COLUMN_NUMBER   (columns are 1-based)
calc_median_col() {
  local file="$1" col="$2"
  awk -F, -v col="$col" '
    NR > 1 { v[++cnt] = $col }
    END {
      if (cnt == 0) { print 0; exit }
      # Sort ascending by pairwise exchange (only a handful of rows)
      for (x = 1; x <= cnt; x++)
        for (y = x + 1; y <= cnt; y++)
          if (v[x] > v[y]) { tmp = v[x]; v[x] = v[y]; v[y] = tmp }
      # Odd count: middle value; even count: mean of the two middle values
      if (cnt % 2)
        print v[int(cnt / 2) + 1]
      else
        print (v[cnt / 2] + v[cnt / 2 + 1]) / 2
    }
  ' "$file"
}

# Print a one-line summary of column 2 (ms) of a results CSV, without a
# trailing newline: median, mean ± standard deviation (divided by n, not
# n-1) and the number of rows. Prints nothing when there are no data rows.
#   calc_stats CSV_FILE
calc_stats() {
  local csv="$1"
  awk -F, '
    NR > 1 { v[++cnt] = $2; total += $2 }
    END {
      if (cnt == 0) exit
      # Sort ascending by pairwise exchange (only a handful of rows)
      for (x = 1; x <= cnt; x++)
        for (y = x + 1; y <= cnt; y++)
          if (v[x] > v[y]) { tmp = v[x]; v[x] = v[y]; v[y] = tmp }
      if (cnt % 2)
        med = v[int(cnt / 2) + 1]
      else
        med = (v[cnt / 2] + v[cnt / 2 + 1]) / 2
      avg = total / cnt
      for (x = 1; x <= cnt; x++)
        sq += (v[x] - avg)^2
      sd = sqrt(sq / cnt)
      printf "median=%.1fms mean=%.1f±%.1fms n=%d", med, avg, sd, cnt
    }
  ' "$csv"
}

# --- Profiling ---
# Run a SQL command under perf, attaching to the backend PID.
# Generates perf.data and flamegraph SVG under ROOT/profile.
#   profile_sql ROOT PORT LABEL SQL
# Does nothing unless --profile was given. Profiling is best effort: if the
# backend cannot be found or perf does not start, a warning is logged and the
# query is simply left to finish.
profile_sql() {
  [[ $DO_PROFILE -ne 1 ]] && return

  local ROOT="$1" PORT="$2" LABEL="$3" SQL="$4"
  local PROF_DIR="$ROOT/profile"
  mkdir -p "$PROF_DIR"

  local PERF_DATA="$PROF_DIR/${LABEL}.perf.data"
  local SVG="$PROF_DIR/${LABEL}.svg"
  local psql_bin
  psql_bin="$(pg "$ROOT" psql)"

  # Use a unique application_name to find the backend PID
  local APP="prof_${LABEL}_$$"

  # Launch a psql session that will first identify itself, then run the SQL
  # The pg_sleep() gives us time to find the backend PID and attach perf
  PGAPPNAME="$APP" "$psql_bin" -h 127.0.0.1 -p "$PORT" -d postgres \
    -X -v ON_ERROR_STOP=1 <<EOSQL >/dev/null 2>&1 &
SELECT pg_sleep(2);
$SQL
EOSQL
  local QUERY_SHELL_PID=$!

  # Find the backend PID via pg_stat_activity (up to 100 tries, 50ms apart)
  local BACKEND_PID=""
  local n
  for ((n=0; n<100; n++)); do
    # "|| true": a failed lookup must lead to a retry, not abort the whole
    # script under "set -e"
    BACKEND_PID=$("$psql_bin" -h 127.0.0.1 -p "$PORT" -d postgres -Atq \
      -c "SELECT pid FROM pg_stat_activity WHERE application_name='${APP}' ORDER BY backend_start DESC LIMIT 1;" 2>/dev/null) || true
    [[ -n "$BACKEND_PID" && -d "/proc/$BACKEND_PID" ]] && break
    sleep 0.05
  done

  if [[ -z "$BACKEND_PID" || ! -d "/proc/$BACKEND_PID" ]]; then
    log "WARNING: Could not find backend PID for profiling, skipping"
    wait "$QUERY_SHELL_PID" 2>/dev/null || true
    return
  fi

  log "Profiling backend PID $BACKEND_PID → $PERF_DATA"

  # Attach perf to the backend; we explicitly kill -INT it after the query finishes
  $PERF_SUDO perf record -g --call-graph dwarf \
    -p "$BACKEND_PID" -o "$PERF_DATA" \
    --event="$PERF_EVENT" 2>/dev/null &
  local PERF_PID=$!
  sleep 0.1

  # Verify perf actually started (permissions, valid PID, etc.)
  if ! kill -0 "$PERF_PID" 2>/dev/null; then
    log "WARNING: perf record failed to start (permissions/config?), skipping flamegraph"
    wait "$QUERY_SHELL_PID" 2>/dev/null || true
    return
  fi

  # Wait for the query to finish
  wait "$QUERY_SHELL_PID" 2>/dev/null || true

  # Give perf a moment to flush, then stop it
  sleep 0.5
  $PERF_SUDO kill -INT "$PERF_PID" 2>/dev/null || true; wait "$PERF_PID" 2>/dev/null || true

  # Generate flamegraph
  generate_flamegraph "$PERF_DATA" "$SVG" "$LABEL"
}

# Convert perf.data → flamegraph SVG
#   generate_flamegraph PERF_DATA SVG_PATH TITLE
# Best effort: every failure is logged as a warning and the function still
# returns 0, so a profiling problem never aborts the benchmark run (the
# script runs under "set -e").
generate_flamegraph() {
  local PERF_DATA="$1" SVG="$2" TITLE="$3"

  if [[ ! -f "$PERF_DATA" ]]; then
    # A bare "return" after a failed test would hand status 1 to the caller
    log "WARNING: No perf data at $PERF_DATA, skipping flamegraph"
    return 0
  fi

  # Intermediate folded-stack file, removed again on every path
  local FOLDED="${PERF_DATA%.perf.data}.folded"
  if $PERF_SUDO perf script -i "$PERF_DATA" 2>/dev/null \
      | "$FLAMEGRAPH_DIR/stackcollapse-perf.pl" > "$FOLDED" 2>/dev/null \
      && [[ -s "$FOLDED" ]] \
      && "$FLAMEGRAPH_DIR/flamegraph.pl" --title "$TITLE" --countname samples \
           "$FOLDED" > "$SVG" 2>/dev/null; then
    log "Flamegraph: $SVG"
  else
    log "WARNING: Failed to generate flamegraph for $TITLE"
  fi
  rm -f "$FOLDED"
}

# --- Benchmark runner ---
# Generic cold-cache benchmark loop.
#   benchmark ROOT PORT NAME SQL RELATION [RELATION...]
# Runs SQL $REPS times, calling drop_caches for the listed relations before
# every run, and writes one row per run to ROOT/results/NAME.csv.
benchmark() {
  local ROOT="$1" PORT="$2" NAME="$3" SQL="$4"
  local rels=("${@:5}")
  local OUT="$ROOT/results/${NAME}.csv"

  mkdir -p "$ROOT/results"
  printf '%s\n' "run,ms,reads,read_time_ms,writes,write_time_ms" > "$OUT"

  local result ms reads read_time writes write_time
  for ((i=1; i<=REPS; i++)); do
    drop_caches "$ROOT" "$PORT" "${rels[@]}"
    result=$(run_timed_with_io "$ROOT" "$PORT" "$SQL")
    # result is "ms,reads,read_time,writes,write_time"
    IFS=',' read -r ms reads read_time writes write_time <<<"$result"
    printf '%s\n' "$i,$ms,$reads,$read_time,$writes,$write_time" >> "$OUT"
    log "$NAME [$i/$REPS]: ${ms}ms (reads=$reads, read_time=${read_time}ms, writes=$writes, write_time=${write_time}ms)"
  done
}

# --- Data setup functions ---
# (Re)create bloom_test with a bloom index on (val1, val2), then VACUUM
# ANALYZE and CHECKPOINT.
#   setup_bloom ROOT PORT SIZE   (SIZE: small | medium | large)
setup_bloom() {
  local ROOT="$1" PORT="$2" SIZE="$3"
  local NROWS=""

  # Row count for each size class
  if   [[ "$SIZE" == "small"  ]]; then NROWS=100000
  elif [[ "$SIZE" == "medium" ]]; then NROWS=1000000
  elif [[ "$SIZE" == "large"  ]]; then NROWS=10000000
  else die "Invalid size '$SIZE' (must be small, medium, or large)"
  fi

  log "Creating Bloom test data ($SIZE: $NROWS rows)"
  psql_run "$ROOT" "$PORT" <<SQL
CREATE EXTENSION IF NOT EXISTS bloom;
DROP TABLE IF EXISTS bloom_test;
CREATE TABLE bloom_test (id INT, data TEXT, val1 INT, val2 INT);
INSERT INTO bloom_test SELECT i, 'data_'||i, i%1000, i%100 FROM generate_series(1,$NROWS) i;
CREATE INDEX bloom_idx ON bloom_test USING bloom (val1, val2);
VACUUM ANALYZE bloom_test;
CHECKPOINT;
SQL
}

# (Re)create heap_test (serial primary key plus a 100-character text column),
# then VACUUM ANALYZE and CHECKPOINT.
#   setup_pgstattuple ROOT PORT SIZE   (SIZE: small | medium | large)
setup_pgstattuple() {
  local ROOT="$1" PORT="$2" SIZE="$3"
  local NROWS=""

  # Row count for each size class
  if   [[ "$SIZE" == "small"  ]]; then NROWS=100000
  elif [[ "$SIZE" == "medium" ]]; then NROWS=1000000
  elif [[ "$SIZE" == "large"  ]]; then NROWS=10000000
  else die "Invalid size '$SIZE' (must be small, medium, or large)"
  fi

  log "Creating pgstattuple test data ($SIZE: $NROWS rows)"
  psql_run "$ROOT" "$PORT" <<SQL
CREATE EXTENSION IF NOT EXISTS pgstattuple;
DROP TABLE IF EXISTS heap_test;
CREATE TABLE heap_test (id SERIAL PRIMARY KEY, data TEXT);
INSERT INTO heap_test (data) SELECT repeat('x',100) FROM generate_series(1,$NROWS);
VACUUM ANALYZE heap_test;
CHECKPOINT;
SQL
}

# (Re)create idx_test, whose primary-key index idx_test_pkey is the target of
# the pgstatindex benchmark, then VACUUM ANALYZE and CHECKPOINT.
#   setup_pgstatindex ROOT PORT SIZE   (SIZE: small | medium | large)
setup_pgstatindex() {
  local ROOT="$1" PORT="$2" SIZE="$3"
  local NROWS=""

  # Row count for each size class
  if   [[ "$SIZE" == "small"  ]]; then NROWS=100000
  elif [[ "$SIZE" == "medium" ]]; then NROWS=1000000
  elif [[ "$SIZE" == "large"  ]]; then NROWS=10000000
  else die "Invalid size '$SIZE' (must be small, medium, or large)"
  fi

  log "Creating pgstatindex test data ($SIZE: $NROWS rows)"
  psql_run "$ROOT" "$PORT" <<SQL
CREATE EXTENSION IF NOT EXISTS pgstattuple;
DROP TABLE IF EXISTS idx_test;
CREATE TABLE idx_test (id SERIAL PRIMARY KEY, data TEXT);
INSERT INTO idx_test (data) SELECT 'data_row_' || i || '_' || repeat('x',50) FROM generate_series(1,$NROWS) i;
VACUUM ANALYZE idx_test;
CHECKPOINT;
SQL
}

# (Re)create gin_test with a GIN index on a text[] column holding five random
# tags per row, then VACUUM ANALYZE and CHECKPOINT.
#   setup_gin ROOT PORT SIZE   (SIZE: small | medium | large)
setup_gin() {
  local ROOT="$1" PORT="$2" SIZE="$3"
  local NROWS
  case "$SIZE" in
    small)  NROWS=100000 ;;
    medium) NROWS=1000000 ;;
    large)  NROWS=5000000 ;;
    *) die "Invalid size '$SIZE' (must be small, medium, or large)" ;;
  esac

  log "Creating GIN test data ($SIZE: $NROWS rows)"
  psql_run "$ROOT" "$PORT" <<SQL
DROP TABLE IF EXISTS gin_test;
-- No PRIMARY KEY: isolate GIN index vacuum from btree overhead
CREATE TABLE gin_test (id INT, tags TEXT[]);
-- The "WHERE i IS NOT NULL" is always true; it only makes the subquery refer
-- to the outer row. Without such a reference the subquery is evaluated once
-- and every row receives the same five tags.
INSERT INTO gin_test (id, tags)
SELECT i, ARRAY(SELECT 'tag_'||(random()*100)::int FROM generate_series(1,5) WHERE i IS NOT NULL)
FROM generate_series(1,$NROWS) i;
CREATE INDEX gin_idx ON gin_test USING gin (tags);
VACUUM ANALYZE gin_test;
CHECKPOINT;
SQL
}

# (Re)create hash_test with a hash index on id (values 1..NROWS), then VACUUM
# ANALYZE and CHECKPOINT.
#   setup_hash ROOT PORT SIZE   (SIZE: small | medium | large)
setup_hash() {
  local ROOT="$1" PORT="$2" SIZE="$3"
  local NROWS=""

  # Row count for each size class
  if   [[ "$SIZE" == "small"  ]]; then NROWS=500000
  elif [[ "$SIZE" == "medium" ]]; then NROWS=1000000
  elif [[ "$SIZE" == "large"  ]]; then NROWS=20000000
  else die "Invalid size '$SIZE' (must be small, medium, or large)"
  fi

  log "Creating Hash test data ($SIZE: $NROWS unique values)"
  psql_run "$ROOT" "$PORT" <<SQL
DROP TABLE IF EXISTS hash_test;
-- No PRIMARY KEY: isolate hash index vacuum from btree overhead
CREATE TABLE hash_test (id INT, data TEXT);
INSERT INTO hash_test SELECT i, 'x' FROM generate_series(1,$NROWS) i;
CREATE INDEX hash_idx ON hash_test USING hash (id);
VACUUM ANALYZE hash_test;
CHECKPOINT;
SQL
}

# (Re)create wal_test, the source table for the GIN index build timed by
# test_wal_logging, then VACUUM ANALYZE and CHECKPOINT.
#   setup_wal ROOT PORT SIZE   (SIZE: small | medium | large)
setup_wal() {
  local ROOT="$1" PORT="$2" SIZE="$3"
  local NROWS=""

  # Row count for each size class
  if   [[ "$SIZE" == "small"  ]]; then NROWS=1000000
  elif [[ "$SIZE" == "medium" ]]; then NROWS=5000000
  elif [[ "$SIZE" == "large"  ]]; then NROWS=20000000
  else die "Invalid size '$SIZE' (must be small, medium, or large)"
  fi

  log "Creating table for GIN index build / log_newpage_range test ($SIZE: $NROWS rows)"
  psql_run "$ROOT" "$PORT" <<SQL
DROP TABLE IF EXISTS wal_test;
-- Table with tsvector column for GIN indexing (full-text search)
-- GIN index builds always call log_newpage_range() at the end of
-- ginbuild() (gininsert.c) to WAL-log all index pages. 
CREATE TABLE wal_test (id INT, doc TEXT, doc_tsv TSVECTOR);
INSERT INTO wal_test
  SELECT i,
         'word' || (random()*10000)::int || ' term' || (random()*10000)::int
           || ' token' || (random()*5000)::int || ' phrase' || (random()*8000)::int,
         to_tsvector('simple',
           'word' || (random()*10000)::int || ' term' || (random()*10000)::int
           || ' token' || (random()*5000)::int || ' phrase' || (random()*8000)::int)
  FROM generate_series(1,$NROWS) i;
VACUUM ANALYZE wal_test;
CHECKPOINT;
SQL
}

# --- Test functions ---
# Benchmark a bloom index scan (sequential scans disabled) on bloom_test.
#   test_bloom_scan ROOT PORT LABEL SIZE
test_bloom_scan() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local RUN_NAME="${LABEL}_bloom_scan_${SIZE}"
  local SCAN_SQL="SET enable_seqscan=off; SELECT COUNT(*) FROM bloom_test WHERE val1=42 AND val2=7;"

  setup_bloom "$ROOT" "$PORT" "$SIZE"
  benchmark "$ROOT" "$PORT" "$RUN_NAME" "$SCAN_SQL" bloom_test bloom_idx

  [[ $DO_PROFILE -eq 1 ]] || return 0
  # Profiling comes after the benchmark reps: shared_buffers memory has
  # already been faulted in by then, so page-fault noise is gone, and
  # drop_caches still gives the profiled run cold IO.
  drop_caches "$ROOT" "$PORT" bloom_test bloom_idx
  profile_sql "$ROOT" "$PORT" "$RUN_NAME" "$SCAN_SQL"
}

# Benchmark VACUUM of bloom_test after deleting every 10th row.
#   test_bloom_vacuum ROOT PORT LABEL SIZE
test_bloom_vacuum() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local RUN_NAME="${LABEL}_bloom_vacuum_${SIZE}"
  local OUT="$ROOT/results/${RUN_NAME}.csv"
  local DELETE_SQL="DELETE FROM bloom_test WHERE id % 10 = 0;"
  local result ms reads read_time writes write_time

  mkdir -p "$ROOT/results"
  printf '%s\n' "run,ms,reads,read_time_ms,writes,write_time_ms" > "$OUT"

  for ((i=1; i<=REPS; i++)); do
    # Every rep starts from a freshly built table so the state is consistent,
    # then 10% of the tuples are made dead
    setup_bloom "$ROOT" "$PORT" "$SIZE"
    psql_run "$ROOT" "$PORT" -c "$DELETE_SQL"

    drop_caches "$ROOT" "$PORT" bloom_test bloom_idx
    result=$(run_timed_with_io "$ROOT" "$PORT" "VACUUM bloom_test;")
    IFS=',' read -r ms reads read_time writes write_time <<<"$result"
    printf '%s\n' "$i,$ms,$reads,$read_time,$writes,$write_time" >> "$OUT"
    log "$RUN_NAME [$i/$REPS]: ${ms}ms (reads=$reads, read_time=${read_time}ms, writes=$writes, write_time=${write_time}ms)"
  done

  [[ $DO_PROFILE -eq 1 ]] || return 0
  # One extra run, prepared the same way, under perf
  setup_bloom "$ROOT" "$PORT" "$SIZE"
  psql_run "$ROOT" "$PORT" -c "$DELETE_SQL"
  drop_caches "$ROOT" "$PORT" bloom_test bloom_idx
  profile_sql "$ROOT" "$PORT" "$RUN_NAME" "VACUUM bloom_test;"
}

# Benchmark pgstattuple_approx() on heap_test.
#   test_pgstattuple ROOT PORT LABEL SIZE
test_pgstattuple() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local RUN_NAME="${LABEL}_pgstattuple_${SIZE}"
  local OUT="$ROOT/results/${RUN_NAME}.csv"
  local APPROX_SQL="SELECT * FROM pgstattuple_approx('heap_test');"
  local UNSET_VM_SQL="BEGIN; DELETE FROM heap_test WHERE id % 500 = 0; ROLLBACK;"
  local result ms reads read_time writes write_time

  mkdir -p "$ROOT/results"
  printf '%s\n' "run,ms,reads,read_time_ms,writes,write_time_ms" > "$OUT"

  # The table is built once and shared by all reps.
  setup_pgstattuple "$ROOT" "$PORT" "$SIZE"

  # pgstattuple_approx skips all-visible pages. A DELETE that is rolled back
  # clears the all-visible bit in the Visibility Map, forcing those pages to
  # be read, while leaving the physical layout identical across reps (no TOAST
  # out-of-page updates, no dirty pages to flush from shared_buffers).
  psql_run "$ROOT" "$PORT" -c "$UNSET_VM_SQL"

  # Warmup pass. The rolled-back DELETE leaves each touched tuple with an xmax
  # of the aborted transaction and no hint bits. The first pgstattuple_approx
  # call (HeapTupleSatisfiesVacuum → HeapTupleSatisfiesVacuumHorizon) has to
  # resolve every such xmax via TransactionIdIsInProgress (ProcArray scan) and
  # TransactionIdDidCommit (CLOG lookup) before SetHintBits can stamp
  # HEAP_XMAX_INVALID and MarkBufferDirtyHint. Skipping the warmup would cost
  # rep 1 about 1100ms of extra CPU; later reps take the early exit at
  # "if (t_infomask & HEAP_XMAX_INVALID) return HEAPTUPLE_LIVE".
  # The hint-bit pages dirtied here are flushed to disk by drop_caches, so
  # every rep starts from the same on-disk state.
  psql_run "$ROOT" "$PORT" -c "$APPROX_SQL" >/dev/null

  for ((i=1; i<=REPS; i++)); do
    drop_caches "$ROOT" "$PORT" heap_test heap_test_pkey
    result=$(run_timed_with_io "$ROOT" "$PORT" "$APPROX_SQL")
    IFS=',' read -r ms reads read_time writes write_time <<<"$result"
    printf '%s\n' "$i,$ms,$reads,$read_time,$writes,$write_time" >> "$OUT"
    log "$RUN_NAME [$i/$REPS]: ${ms}ms (reads=$reads, read_time=${read_time}ms, writes=$writes, write_time=${write_time}ms)"
  done

  [[ $DO_PROFILE -eq 1 ]] || return 0
  # The profiled run repeats the rolled-back DELETE but not the warmup pass
  psql_run "$ROOT" "$PORT" -c "$UNSET_VM_SQL"
  drop_caches "$ROOT" "$PORT" heap_test heap_test_pkey
  profile_sql "$ROOT" "$PORT" "$RUN_NAME" "$APPROX_SQL"
}

# Benchmark pgstatindex() on the primary-key index of idx_test.
#   test_pgstatindex ROOT PORT LABEL SIZE
test_pgstatindex() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local RUN_NAME="${LABEL}_pgstatindex_${SIZE}"
  local INDEX_SQL="SELECT * FROM pgstatindex('idx_test_pkey');"

  setup_pgstatindex "$ROOT" "$PORT" "$SIZE"
  benchmark "$ROOT" "$PORT" "$RUN_NAME" "$INDEX_SQL" idx_test idx_test_pkey

  [[ $DO_PROFILE -eq 1 ]] || return 0
  # One extra cold run under perf
  drop_caches "$ROOT" "$PORT" idx_test idx_test_pkey
  profile_sql "$ROOT" "$PORT" "$RUN_NAME" "$INDEX_SQL"
}

# Benchmark VACUUM ANALYZE of gin_test.
#   test_gin_vacuum ROOT PORT LABEL SIZE
test_gin_vacuum() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local RUN_NAME="${LABEL}_gin_vacuum_${SIZE}"
  local OUT="$ROOT/results/${RUN_NAME}.csv"
  # VACUUM ANALYZE forces ginvacuumcleanup() to run and scan all pages
  local VACUUM_SQL="VACUUM ANALYZE gin_test;"
  local result ms reads read_time writes write_time

  mkdir -p "$ROOT/results"
  printf '%s\n' "run,ms,reads,read_time_ms,writes,write_time_ms" > "$OUT"

  for ((i=1; i<=REPS; i++)); do
    # Every rep starts from a freshly built table so the state is consistent
    setup_gin "$ROOT" "$PORT" "$SIZE"

    drop_caches "$ROOT" "$PORT" gin_test gin_idx
    result=$(run_timed_with_io "$ROOT" "$PORT" "$VACUUM_SQL")
    IFS=',' read -r ms reads read_time writes write_time <<<"$result"
    printf '%s\n' "$i,$ms,$reads,$read_time,$writes,$write_time" >> "$OUT"
    log "$RUN_NAME [$i/$REPS]: ${ms}ms (reads=$reads, read_time=${read_time}ms, writes=$writes, write_time=${write_time}ms)"
  done

  [[ $DO_PROFILE -eq 1 ]] || return 0
  # One extra run, prepared the same way, under perf
  setup_gin "$ROOT" "$PORT" "$SIZE"
  drop_caches "$ROOT" "$PORT" gin_test gin_idx
  profile_sql "$ROOT" "$PORT" "$RUN_NAME" "$VACUUM_SQL"
}

# Benchmark VACUUM of hash_test after deleting every 10th row.
#   test_hash_vacuum ROOT PORT LABEL SIZE
test_hash_vacuum() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local RUN_NAME="${LABEL}_hash_vacuum_${SIZE}"
  local OUT="$ROOT/results/${RUN_NAME}.csv"
  local DELETE_SQL="DELETE FROM hash_test WHERE id % 10 = 0;"
  local result ms reads read_time writes write_time

  mkdir -p "$ROOT/results"
  printf '%s\n' "run,ms,reads,read_time_ms,writes,write_time_ms" > "$OUT"

  for ((i=1; i<=REPS; i++)); do
    # Every rep starts from a freshly built table so the state is consistent,
    # then 10% of the tuples are made dead
    setup_hash "$ROOT" "$PORT" "$SIZE"
    psql_run "$ROOT" "$PORT" -c "$DELETE_SQL"

    drop_caches "$ROOT" "$PORT" hash_test hash_idx
    result=$(run_timed_with_io "$ROOT" "$PORT" "VACUUM hash_test;")
    IFS=',' read -r ms reads read_time writes write_time <<<"$result"
    printf '%s\n' "$i,$ms,$reads,$read_time,$writes,$write_time" >> "$OUT"
    log "$RUN_NAME [$i/$REPS]: ${ms}ms (reads=$reads, read_time=${read_time}ms, writes=$writes, write_time=${write_time}ms)"
  done

  [[ $DO_PROFILE -eq 1 ]] || return 0
  # One extra run, prepared the same way, under perf
  setup_hash "$ROOT" "$PORT" "$SIZE"
  psql_run "$ROOT" "$PORT" -c "$DELETE_SQL"
  drop_caches "$ROOT" "$PORT" hash_test hash_idx
  profile_sql "$ROOT" "$PORT" "$RUN_NAME" "VACUUM hash_test;"
}

# Benchmark CREATE INDEX ... USING gin on wal_test, which exercises
# log_newpage_range() at the end of the build.
#   test_wal_logging ROOT PORT LABEL SIZE
test_wal_logging() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local RUN_NAME="${LABEL}_wal_logging_${SIZE}"
  local OUT="$ROOT/results/${RUN_NAME}.csv"
  local WAL_SQL="CREATE INDEX wal_test_gin_idx ON wal_test USING gin (doc_tsv);"
  local DROP_SQL="DROP INDEX IF EXISTS wal_test_gin_idx;"
  local result ms reads read_time writes write_time

  mkdir -p "$ROOT/results"
  printf '%s\n' "run,ms,reads,read_time_ms,writes,write_time_ms" > "$OUT"

  # The table is built once; only the index is rebuilt in each rep
  setup_wal "$ROOT" "$PORT" "$SIZE"

  for ((i=1; i<=REPS; i++)); do
    # Remove the index left by the previous rep
    psql_run "$ROOT" "$PORT" -c "$DROP_SQL"

    # Source table pages are COLD on disk after this
    drop_caches "$ROOT" "$PORT" wal_test

    # CREATE INDEX on GIN (tsvector_ops) always takes the same build path:
    # ginbuild() populates the index in memory, flushes it to disk, then
    # calls log_newpage_range() to read ALL index pages and write them to
    # WAL (gininsert.c:785-790)
    result=$(run_timed_with_io "$ROOT" "$PORT" "$WAL_SQL")
    IFS=',' read -r ms reads read_time writes write_time <<<"$result"
    printf '%s\n' "$i,$ms,$reads,$read_time,$writes,$write_time" >> "$OUT"
    log "$RUN_NAME [$i/$REPS]: ${ms}ms (reads=$reads, read_time=${read_time}ms, writes=$writes, write_time=${write_time}ms)"
  done

  [[ $DO_PROFILE -eq 1 ]] || return 0
  # One extra index build under perf
  psql_run "$ROOT" "$PORT" -c "$DROP_SQL"
  drop_caches "$ROOT" "$PORT" wal_test
  profile_sql "$ROOT" "$PORT" "$RUN_NAME" "$WAL_SQL"
}

# --- Run tests for a build ---
# Load a fixed set of system catalogs into shared_buffers with pg_prewarm.
#   warmup_catalog ROOT PORT
warmup_catalog() {
  local ROOT="$1"
  local PORT="$2"
  # Purpose: keep rep 1 from paying disk reads for catalog pages.
  # drop_caches evicts only the test relations through
  # pg_buffercache_evict_relation, never the catalogs, so whatever is
  # prewarmed here stays warm for every rep.
  psql_run "$ROOT" "$PORT" >/dev/null <<SQL
SELECT pg_prewarm('pg_class',     'buffer');
SELECT pg_prewarm('pg_attribute', 'buffer');
SELECT pg_prewarm('pg_namespace', 'buffer');
SELECT pg_prewarm('pg_proc',      'buffer');
SELECT pg_prewarm('pg_type',      'buffer');
SQL
}

# Start a fresh cluster for one build and run the selected test(s) for every
# size in $SIZES, then stop the cluster.
#   run_tests ROOT LABEL
# LABEL is used as the prefix of the result and profile file names.
run_tests() {
  local ROOT="$1" LABEL="$2"
  local PORT
  PORT=$(pick_port)

  log "[$LABEL] Starting cluster on port $PORT"
  # Install the cleanup trap before the server is started: a failure in
  # init_cluster (after pg_ctl start), warmup_catalog or set_io_delay would
  # otherwise exit the script with the server still running. stop_cluster
  # ignores errors, so it is harmless when no server is up yet.
  trap "stop_cluster '$ROOT'" EXIT
  init_cluster "$ROOT" "$PORT"
  warmup_catalog "$ROOT" "$PORT"
  set_io_delay "$IO_DELAY_MS"

  for SIZE in $SIZES; do
    case "$TEST" in
      bloom_scan)   test_bloom_scan "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      bloom_vacuum) test_bloom_vacuum "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      pgstattuple)  test_pgstattuple "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      pgstatindex)  test_pgstatindex "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      gin_vacuum)   test_gin_vacuum "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      hash_vacuum)  test_hash_vacuum "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      wal_logging)  test_wal_logging "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      # NOTE(review): "all" runs neither bloom_scan nor gin_vacuum; confirm
      # that this is intended.
      all)
        test_bloom_vacuum "$ROOT" "$PORT" "$LABEL" "$SIZE"
        test_pgstattuple "$ROOT" "$PORT" "$LABEL" "$SIZE"
        test_pgstatindex "$ROOT" "$PORT" "$LABEL" "$SIZE"
        test_hash_vacuum "$ROOT" "$PORT" "$LABEL" "$SIZE"
        test_wal_logging "$ROOT" "$PORT" "$LABEL" "$SIZE"
        ;;
      *) die "Unknown test: $TEST" ;;
    esac
  done

  # Normal completion: stop the server and drop the trap again
  stop_cluster "$ROOT"
  trap - EXIT
}

# --- Compare results ---
# Print one summary row comparing a baseline CSV with a patched CSV.
#   compare_results BASE_CSV PATCH_CSV LABEL
# Column 2 of each CSV is the elapsed time in ms; when the header mentions
# "reads" the row also shows the median of columns 3-6
# (reads, read_time_ms, writes, write_time_ms) as base→patch pairs.
# Returns silently (status 0) when either file is missing.
compare_results() {
  local base_csv="$1" patch_csv="$2" label="$3"

  if [[ ! -f "$base_csv" || ! -f "$patch_csv" ]]; then
    return 0
  fi

  local base_med patch_med
  base_med=$(calc_median "$base_csv")
  patch_med=$(calc_median "$patch_csv")

  # Guard against division by zero. The comparison is numeric on purpose:
  # a median such as "0.000" is zero too, and a plain string test against
  # "0" would let it through and make awk abort on the division below.
  base_med=$(awk -v v="$base_med" 'BEGIN { print ((v + 0 == 0) ? "0.001" : v) }')
  patch_med=$(awk -v v="$patch_med" 'BEGIN { print ((v + 0 == 0) ? "0.001" : v) }')

  # Values are handed to awk with -v instead of being pasted into the
  # program text, so an unexpected value cannot change the awk program.
  local speedup pct
  speedup=$(awk -v b="$base_med" -v p="$patch_med" 'BEGIN { printf "%.2f", b / p }')
  pct=$(awk -v b="$base_med" -v p="$patch_med" 'BEGIN { printf "%.1f", (b - p) / b * 100 }')

  local io_info=""
  if head -1 "$base_csv" | grep -q "reads"; then
    # Standard test: columns are run,ms,reads,read_time_ms,writes,write_time_ms
    local base_reads patch_reads base_rtime patch_rtime base_writes patch_writes base_wtime patch_wtime
    base_reads=$(calc_median_col "$base_csv" 3)
    patch_reads=$(calc_median_col "$patch_csv" 3)
    base_rtime=$(calc_median_col "$base_csv" 4)
    patch_rtime=$(calc_median_col "$patch_csv" 4)
    base_writes=$(calc_median_col "$base_csv" 5)
    patch_writes=$(calc_median_col "$patch_csv" 5)
    base_wtime=$(calc_median_col "$base_csv" 6)
    patch_wtime=$(calc_median_col "$patch_csv" 6)
    # Default to 0 if empty
    base_reads=${base_reads:-0}
    patch_reads=${patch_reads:-0}
    base_rtime=${base_rtime:-0}
    patch_rtime=${patch_rtime:-0}
    base_writes=${base_writes:-0}
    patch_writes=${patch_writes:-0}
    base_wtime=${base_wtime:-0}
    patch_wtime=${patch_wtime:-0}
    io_info="  (reads=${base_reads}→${patch_reads}, read_time=${base_rtime}→${patch_rtime}ms, writes=${base_writes}→${patch_writes}, write_time=${base_wtime}→${patch_wtime}ms)"
  fi

  printf "%-26s base=%8.1fms  patch=%8.1fms  %5.2fx  (%5.1f%%)%s\n" \
    "$label" "$base_med" "$patch_med" "$speedup" "$pct" "$io_info"
}

# Print the final report: a base-vs-patched table when a baseline was run,
# otherwise per-file statistics for the patched build, followed by the
# locations of the CSV files and (when profiling) the generated flamegraphs.
print_summary() {
  local rule="═══════════════════════════════════════════════════════════════════════"
  local size which csv

  echo ""
  echo "$rule"
  echo "                     STREAMING READ BENCHMARK RESULTS                   "
  echo "$rule"
  echo ""

  if [[ $BASELINE -eq 1 ]]; then
    printf "%-26s %-17s %-17s %-7s %-7s %s\n" "TEST" "BASELINE" "PATCHED" "SPEEDUP" "CHANGE" "I/O TIME"
    echo "─────────────────────────────────────────────────────────────────────────────────────────────────"

    for size in $SIZES; do
      for which in bloom_scan bloom_vacuum pgstattuple pgstatindex gin_vacuum hash_vacuum wal_logging; do
        # Only rows for the selected test (or every test for "all").
        if [[ "$TEST" == "all" || "$TEST" == "$which" ]]; then
          compare_results \
            "$ROOT_BASE/results/base_${which}_${size}.csv" \
            "$ROOT_PATCH/results/patched_${which}_${size}.csv" \
            "${which}_${size}"
        fi
      done
    done
  else
    echo "Results (patched only):"
    echo ""
    for csv in "$ROOT_PATCH/results/"*.csv; do
      # An unmatched glob stays literal; skip it.
      if [[ -f "$csv" ]]; then
        printf "%-40s %s\n" "$(basename "$csv" .csv):" "$(calc_stats "$csv")"
      fi
    done
  fi

  echo ""
  echo "$rule"
  echo "CSV files: $ROOT_PATCH/results/"
  if [[ $BASELINE -eq 1 ]]; then
    echo "Baseline:  $ROOT_BASE/results/"
  fi

  # List generated flamegraphs
  if [[ $DO_PROFILE -eq 1 ]]; then
    local found=()
    local prof_dir candidate
    for prof_dir in "$ROOT_BASE/profile" "$ROOT_PATCH/profile"; do
      if [[ -d "$prof_dir" ]]; then
        for candidate in "$prof_dir"/*.svg; do
          if [[ -f "$candidate" ]]; then
            found+=("$candidate")
          fi
        done
      fi
    done
    if [[ ${#found[@]} -gt 0 ]]; then
      echo ""
      echo "Flamegraphs:"
      for candidate in "${found[@]}"; do
        echo "  $candidate"
      done
    fi
  fi

  echo "$rule"
}

# --- Main ---
# Entry point: log the effective configuration, build the requested trees,
# run the benchmarks (baseline first when requested) and print the summary.
main() {
  log "Streaming Read Benchmark"
  log "Patch: $PATCH ($PATCH_TAG)"
  log "Tests: $TEST"
  log "Sizes: $SIZES"
  log "Reps:  $REPS"
  log "I/O:   $IO_METHOD (workers=$IO_WORKERS, concurrency=$IO_MAX_CONCURRENCY)"
  if [[ $DIRECT_IO -eq 1 ]]; then
    log "Direct IO: enabled (debug_io_direct=data)"
  fi
  if [[ -n "$IO_DELAY_MS" ]]; then
    log "I/O delay: ${IO_DELAY_MS}ms read / ${WRITE_DELAY_MS}ms write via dm_delay ($DM_DELAY_DEV)"
  fi
  if [[ $DO_PROFILE -eq 1 ]]; then
    log "Profile: enabled (flamegraphs → <root>/profile/)"
  fi

  # Build: vanilla tree only when a baseline was requested, patched tree always.
  if [[ $BASELINE -eq 1 ]]; then
    build_pg "$ROOT_BASE" ""
  fi
  build_pg "$ROOT_PATCH" "$PATCH"

  # Run: baseline (optional) before patched.
  if [[ $BASELINE -eq 1 ]]; then
    log "Running baseline tests"
    run_tests "$ROOT_BASE" "base"
  fi

  log "Running patched tests"
  run_tests "$ROOT_PATCH" "patched"

  # Report
  print_summary
}

main

^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 23:04                     ` Re: Streamify more code paths Andres Freund <[email protected]>
  2026-03-11 01:37                       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 15:11                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 21:33                           ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-12 03:42                             ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-12 04:39                               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-12 15:35                                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-13 01:49                                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-13 02:39                                     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-14 09:56                                       ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-15 02:51                                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
@ 2026-03-15 03:47                                           ` Xuneng Zhou <[email protected]>
  2026-03-16 01:00                                             ` Re: Streamify more code paths Michael Paquier <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Xuneng Zhou @ 2026-03-15 03:47 UTC (permalink / raw)
  To: Michael Paquier <[email protected]>; +Cc: Andres Freund <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

On Sun, Mar 15, 2026 at 10:51 AM Xuneng Zhou <[email protected]> wrote:
>
> On Sat, Mar 14, 2026 at 5:56 PM Michael Paquier <[email protected]> wrote:
> >
> > On Fri, Mar 13, 2026 at 10:39:52AM +0800, Xuneng Zhou wrote:
> > > Thanks for fixing this and for taking the time to review and test
> > > the patches.
> >
> > Looking at the rest, I have produced some numbers:
> > pgstattuple_small (20k tuples, io_uring) base= 60839.9ms
> > patch=10949.9ms 5.56x ( 82.0%) (reads=4139->260,
> > io_time=49616.97->55.25ms)
> > pgstattuple_small (20k tuples, worker=3) base= 60577.5ms
> > patch=11470.0ms 5.28x ( 81.1%) (reads=4139->260,
> > io_time=49359.79->69.60ms)
> > hash_vacuum (1M tuples, io_uring)  base=199929.0ms patch=161747.0ms
> > 1.24x ( 19.1%) (reads=4665->1615, io_time=47084.8->9925.77ms)
> > hash_vacuum (1M tuples, worker=12) base=203417.0ms patch=161687.0ms
> > 1.26x ( 20.5%) (reads=4665->1615, io_time=48356.3->9917.24ms)
> >
> > The hash vacuum numbers are less amazing here than yours.  Trying out
> > various configurations does not change the results much (I was puzzled
> > for a couple of hours that I did not see any performance impact but
> > forgot the eviction of the index pages from the shared buffers, that
> > influences the numbers to what I have here), but I'll take it anyway.
>
> My guess is that the results are influenced by the write delay. Vacuum
> operations can be write-intensive, so when both read and write delays
> are set to 2 ~ 5 ms, a large portion of the runtime may be spent on
> writes. According to Amdahl’s Law, the overall performance improvement
> from optimizing a single component is limited by the fraction of time
> that component actually contributes to the total execution time. In
> this case, the potential rate of speedup from streaming the read path
> could be masked by the time spent performing writes.
>
> To investigate this, I added a new option, write-delay. When it is set
> to zero, the benchmark simulates a system with a fast write device and
> a slow read device, reducing the proportion of time spent on writes.
> Admittedly, this setup is somewhat artificial—we would not normally
> expect such a large discrepancy between read and write performance in
> real systems.
>
> -- worker 12, write-delay 2 ms
> hash_vacuum_medium         base= 33743.2ms  patch= 27371.3ms   1.23x
> ( 18.9%)  (reads=4662→1612, read_time=8242.51→1725.03ms,
> writes=12689→12651, write_time=25144.87→25041.75ms)
>
> -- worker 12, write-delay 0 ms
> hash_vacuum_medium         base=  8601.1ms  patch=  2234.0ms   3.85x
> ( 74.0%)  (reads=4662→1612, read_time=8021.65→1637.87ms,
> writes=12689→12651, write_time=337.38→288.15ms)
>
> To better understand the behavior, the latest version of the script
> separates the I/O time into read time and write time. This allows us
> to directly observe their respective contributions and how they change
> across runs. A further improvement would be to report the speedup for
> the read and write components separately, making it easier to
> understand where and how much the performance gains actually occur.

The updated script now reports speedup separately for the read and
write paths like this:

hash_vacuum_medium         base= 33747.2ms  patch= 27379.7ms   1.23x  ( 18.9%)
                             read:  4662→1612 ops  8238.72→1725.86ms  (4.77x)    write:  12689→12651 ops  25146.51→25053.57ms  (1.00x)

I think it is useful to keep the write-delay option even with this
reporting. Separating the read and write delays also helps reduce the
overall runtime of the tests, especially for large data sizes: we only
slow down the read path while keeping the write path fast.

-- 
Best,
Xuneng


Attachments:

  [text/x-sh] run_streaming_benchmark.sh (35.4K, 2-run_streaming_benchmark.sh)
  download | inline:
#!/usr/bin/env bash
set -euo pipefail

###############################################################################
# Streaming Read Patches Benchmark
#
# Usage: ./run_streaming_bench.sh [OPTIONS] <patch>
#
# Options:
#   --clean           Remove existing builds and start fresh
#   --baseline        Also build and test vanilla PostgreSQL for comparison
#   --test TEST       Run specific test (bloom_scan, bloom_vacuum, pgstattuple,
#                     pgstatindex, gin_vacuum, wal_logging, hash_vacuum, or "all")
#   --io-method MODE  I/O method: io_uring, worker, or sync (default: io_uring)
#   --io-workers N    Number of I/O workers for worker mode (default: 3)
#   --io-concurrency N  Max concurrent I/Os per process (default: 64)
#   --direct-io         Enable direct IO (debug_io_direct=data), bypasses OS page cache
#   --read-delay MS     Simulate read latency via dm_delay (requires pre-created device)
#   --write-delay MS    Simulate write latency via dm_delay (default: 0, requires --read-delay)
#   --profile           Enable perf profiling and flamegraph generation
#
# Environment:
#   WORKROOT       Base directory (default: $HOME/pg_bench)
#   REPS           Repetitions per test (default: 5)
#   SIZES          Table sizes to test (default: "large")
#   FLAMEGRAPH_DIR Path to FlameGraph tools (default: $HOME/FlameGraph)
#   DM_DELAY_DEV   dm_delay device name for --read-delay (default: "delayed")
###############################################################################

# Print a progress line on stdout: a bold blue "==>" marker followed by all
# arguments joined by spaces.
log() {
  printf '\033[1;34m==>\033[0m %s\n' "$*"
}
# Print a bold red "ERROR:" line with all arguments on stderr, then exit
# the shell with status 1.
die() {
  printf '\033[1;31mERROR:\033[0m %s\n' "$*" >&2
  exit 1
}

# --- CLI parsing ---
# Defaults. IO_METHOD, IO_WORKERS, IO_MAX_CONCURRENCY and DM_DELAY_DEV can be
# preset through the environment; command-line options override them.
CLEAN=0
BASELINE=0
DO_PROFILE=0
DIRECT_IO=0
IO_DELAY_MS=""
WRITE_DELAY_MS="0"
TEST="all"
IO_METHOD="${IO_METHOD:-io_uring}"
IO_WORKERS="${IO_WORKERS:-3}"
IO_MAX_CONCURRENCY="${IO_MAX_CONCURRENCY:-64}"
DM_DELAY_DEV="${DM_DELAY_DEV:-delayed}"
PATCH=""

while [[ $# -gt 0 ]]; do
  case "$1" in
    --clean)          CLEAN=1 ;;
    --baseline)       BASELINE=1 ;;
    --profile)        DO_PROFILE=1 ;;
    --direct-io)      DIRECT_IO=1 ;;
    --read-delay|--write-delay|--test|--io-method|--io-workers|--io-concurrency)
      # Value-taking options. Without this check a trailing option with no
      # value aborts under `set -u` with a bare "unbound variable" error.
      [[ $# -ge 2 ]] || die "Option $1 requires an argument"
      case "$1" in
        --read-delay)     IO_DELAY_MS="$2" ;;
        --write-delay)    WRITE_DELAY_MS="$2" ;;
        --test)           TEST="$2" ;;
        --io-method)      IO_METHOD="$2" ;;
        --io-workers)     IO_WORKERS="$2" ;;
        --io-concurrency) IO_MAX_CONCURRENCY="$2" ;;
      esac
      shift   # consume the value; the shift below consumes the option
      ;;
    # Help = the header comment of this file with the "# " prefix removed.
    # Lines 5-27 are the text between the two "####" border lines; starting
    # at line 3 would also print a blank line and the top border.
    -h|--help)        sed -n '5,27p' "$0" | sed 's/^# \?//'; exit 0 ;;
    -*)               die "Unknown option: $1" ;;
    # Positional argument: the patch file (the last one given wins).
    *)                PATCH="$1" ;;
  esac
  shift
done

# Validate io_method
case "$IO_METHOD" in
  io_uring|worker|sync) ;;
  *) die "Invalid --io-method: $IO_METHOD (must be io_uring, worker, or sync)" ;;
esac

# Validate dm_delay device if --read-delay is used
if [[ -n "$IO_DELAY_MS" ]]; then
  command -v dmsetup >/dev/null 2>&1 || die "--read-delay requires dmsetup (sudo apt install dmsetup)"
  if ! sudo dmsetup status "$DM_DELAY_DEV" >/dev/null 2>&1; then
    # die prints its text through printf '%s', so a "\n" inside the message
    # would be shown literally; splice in a real newline with $'\n' instead.
    die "dm_delay device '$DM_DELAY_DEV' not found. Create it first, e.g.:"$'\n'"  umount /srv && dmsetup create $DM_DELAY_DEV --table \"0 \$(blockdev --getsz /dev/DEVICE) delay /dev/DEVICE 0 $IO_DELAY_MS\" && mount /dev/mapper/$DM_DELAY_DEV /srv/"
  fi
fi

# The patch file is mandatory, must exist, and is made absolute because
# later steps change the working directory.
[[ -z "$PATCH" ]] && die "Usage: $0 [--clean] [--baseline] [--test TEST] <patch>"
[[ ! -f "$PATCH" ]] && die "Patch not found: $PATCH"
[[ "$PATCH" != /* ]] && PATCH="$PWD/$PATCH"

# --- Profiling validation ---
# Environment overrides with defaults; the tools are only required when
# --profile was given.
: "${FLAMEGRAPH_DIR:=$HOME/FlameGraph}"
: "${PERF_SUDO:=sudo}"
: "${PERF_EVENT:=cycles}"  # cycles = user+kernel; cycles:u = user-only
if [[ $DO_PROFILE -eq 1 ]]; then
  if ! command -v perf >/dev/null 2>&1; then
    die "Need perf (sudo apt install linux-tools-$(uname -r))"
  fi
  if [[ ! -x "$FLAMEGRAPH_DIR/stackcollapse-perf.pl" ]]; then
    die "Missing $FLAMEGRAPH_DIR/stackcollapse-perf.pl (git clone https://github.com/brendangregg/FlameGraph)"
  fi
  if [[ ! -x "$FLAMEGRAPH_DIR/flamegraph.pl" ]]; then
    die "Missing $FLAMEGRAPH_DIR/flamegraph.pl"
  fi
fi

# --- Configuration ---
# Environment overrides with defaults.
: "${WORKROOT:=$HOME/pg_bench}"
: "${REPS:=5}"
: "${SIZES:=large}"

# One directory per build: "vanilla" for the baseline, and a directory named
# after the patch file (basename without .patch, reduced to alphanumerics,
# "_" and "-", at most 40 characters) for the patched tree.
ROOT_BASE="$WORKROOT/vanilla"
PATCH_TAG=$(basename "$PATCH" .patch | tr -dc '[:alnum:]_-' | cut -c1-40)
ROOT_PATCH="$WORKROOT/$PATCH_TAG"

# --- Helpers ---
# Print the path of a PostgreSQL program inside a build root.
#   pg ROOT PROGRAM   ->   ROOT/pg/bin/PROGRAM
pg() {
  local root="$1" prog="$2"
  echo "$root/pg/bin/$prog"
}

# Print the first TCP port, from START (default 5432) up to 60000, for which
# lsof reports no listener; dies when the whole range is exhausted.
#   pick_port [START]
pick_port() {
  local first="${1:-5432}"
  local candidate
  for candidate in $(seq "$first" 60000); do
    if ! lsof -iTCP:"$candidate" -sTCP:LISTEN >/dev/null 2>&1; then
      echo "$candidate"
      return
    fi
  done
  die "No free port found"
}

# Reprogram the dm_delay device with a new read delay and the configured
# write delay.
#   set_io_delay MS
# Does nothing when no --read-delay was requested (IO_DELAY_MS is empty).
set_io_delay() {
  local ms="$1"
  if [[ -z "$IO_DELAY_MS" ]]; then
    return
  fi

  # Reuse the size (field 2) and backing device (field 4) of the current
  # table; only the delay values are replaced.
  local current sectors backing
  current=$(sudo dmsetup table "$DM_DELAY_DEV")
  sectors=$(awk '{print $2}' <<<"$current")
  backing=$(awk '{print $4}' <<<"$current")

  log "Setting dm_delay on $DM_DELAY_DEV to ${ms}ms read / ${WRITE_DELAY_MS}ms write"
  sudo dmsetup suspend "$DM_DELAY_DEV"
  sudo dmsetup reload "$DM_DELAY_DEV" --table "0 $sectors delay $backing 0 $ms $backing 0 $WRITE_DELAY_MS"
  sudo dmsetup resume "$DM_DELAY_DEV"
}

# --- Build PostgreSQL ---
# Build (or reuse) a PostgreSQL tree under ROOT and install the contrib
# modules the benchmarks use.
#   build_pg ROOT [PATCH_FILE]
# Leaves the working directory at ROOT/src.
build_pg() {
  local ROOT="$1" PATCH_FILE="${2:-}"
  local ext

  if [[ $CLEAN -eq 1 ]]; then
    rm -rf "$ROOT"
  fi

  if [[ -x "$(pg "$ROOT" initdb)" ]]; then
    # An installed initdb means this tree was built before.
    log "Reusing build: $ROOT"
    cd "$ROOT/src"
  else
    log "Building PostgreSQL: $ROOT"
    mkdir -p "$ROOT"

    git clone --depth 1 https://github.com/postgres/postgres "$ROOT/src" 2>/dev/null
    cd "$ROOT/src"

    if [[ -n "$PATCH_FILE" ]]; then
      log "Applying patch"
      git apply "$PATCH_FILE"
    fi

    ./configure --prefix="$ROOT/pg" --with-liburing \
      CFLAGS='-O2 -ggdb3 -fno-omit-frame-pointer' >/dev/null 2>&1

    make -j"$(nproc)" install >/dev/null 2>&1
  fi

  # Always install contribs (idempotent, catches reused builds missing new extensions)
  for ext in bloom pgstattuple pg_buffercache pg_prewarm; do
    make -C "contrib/$ext" install >/dev/null 2>&1
  done
}

# --- Cluster management ---
init_cluster() {
  # Usage: init_cluster ROOT PORT
  # Recreates ROOT/data from scratch with initdb, appends the benchmark
  # settings to postgresql.conf, starts the server (log: ROOT/server.log)
  # and creates the pg_buffercache and pg_prewarm extensions.
  local ROOT="$1" PORT="$2"
  
  # Any previous data directory is discarded.
  rm -rf "$ROOT/data"
  "$(pg "$ROOT" initdb)" -D "$ROOT/data" --no-locale >/dev/null 2>&1
  
  # Settings are appended, so they take precedence over the initdb defaults.
  # The I/O method, worker count and concurrency come from the CLI/environment;
  # autovacuum is off and I/O timing is on.
  cat >> "$ROOT/data/postgresql.conf" <<EOF
port = $PORT
listen_addresses = '127.0.0.1'
shared_buffers = '32GB'
effective_io_concurrency = 200
io_method = $IO_METHOD
io_workers = $IO_WORKERS
io_max_concurrency = $IO_MAX_CONCURRENCY
track_io_timing = on
track_wal_io_timing = on
synchronous_commit = on
autovacuum = off
checkpoint_timeout = 1h
max_wal_size = 10GB
max_parallel_workers_per_gather = 0
EOF
  
  # Optional: direct I/O for relation data (--direct-io).
  [[ $DIRECT_IO -eq 1 ]] && echo "debug_io_direct = data" >> "$ROOT/data/postgresql.conf"
  
  # -w: wait until the server accepts connections before continuing.
  "$(pg "$ROOT" pg_ctl)" -D "$ROOT/data" -l "$ROOT/server.log" start -w >/dev/null
  
  # Extensions used by drop_caches (buffer eviction) and catalog prewarming.
  psql_run "$ROOT" "$PORT" -c "CREATE EXTENSION IF NOT EXISTS pg_buffercache;"
  psql_run "$ROOT" "$PORT" -c "CREATE EXTENSION IF NOT EXISTS pg_prewarm;"
}

# Stop the cluster under ROOT with a fast shutdown. Errors (for example a
# server that is not running) are ignored, so this is safe in cleanup paths.
#   stop_cluster ROOT
stop_cluster() {
  local ROOT="$1"
  local pg_ctl_bin
  pg_ctl_bin=$(pg "$ROOT" pg_ctl)
  "$pg_ctl_bin" -D "$ROOT/data" stop -m fast 2>/dev/null || true
}

# Make the next access to the given relations cold.
#   drop_caches ROOT PORT RELATION [RELATION...]
drop_caches() {
  local ROOT="$1" PORT="$2"
  shift 2
  local target

  # Evict target relations from shared buffers (no PG restart needed)
  for target in "$@"; do
    psql_run "$ROOT" "$PORT" -c "SELECT pg_buffercache_evict_relation('${target}'::regclass);" >/dev/null
  done

  # Drop OS page cache (skip with direct IO — no page cache involved)
  [[ $DIRECT_IO -eq 0 ]] || return 0
  sync
  echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 2>&1 || true
  sleep 2
}

# Run psql from the build under ROOT against database "postgres" on
# 127.0.0.1:PORT, quiet, unaligned, tuples-only, stopping on the first SQL
# error. Remaining arguments (and stdin) are passed through to psql.
#   psql_run ROOT PORT [PSQL_ARGS...]
psql_run() {
  local ROOT="$1" PORT="$2"
  shift 2
  local psql_bin
  psql_bin=$(pg "$ROOT" psql)
  local -a fixed_opts=(-h 127.0.0.1 -p "$PORT" -d postgres -v ON_ERROR_STOP=1 -Atq)
  "$psql_bin" "${fixed_opts[@]}" "$@"
}

# --- Timing ---
# Run SQL with \timing enabled and print the elapsed time of the last
# reported "Time:" line in milliseconds (three decimals).
#   run_timed ROOT PORT SQL
# Returns 1 with a message on stderr when no numeric timing was found.
run_timed() {
  local ROOT="$1" PORT="$2" SQL="$3"
  local psql_bin elapsed
  psql_bin=$(pg "$ROOT" psql)

  # -X: ignore .psqlrc, -v ON_ERROR_STOP=1: fail on SQL errors
  local -a psql_opts=(-h 127.0.0.1 -p "$PORT" -d postgres -X -v ON_ERROR_STOP=1 -At)

  # Keep the last Time: line; accept both "ms" and "s" units.
  local extract_ms='
      /Time:/ {
        val=$2; unit=$3;
        if (unit=="ms") ms=val;
        else if (unit=="s") ms=val*1000;
      }
      END { if (ms=="") exit 1; printf "%.3f\n", ms; }
    '

  elapsed=$("$psql_bin" "${psql_opts[@]}" -c '\timing on' -c "$SQL" 2>&1 | awk "$extract_ms")

  # Validate numeric output
  if [[ ! "$elapsed" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
    echo "ERROR: Non-numeric timing: $elapsed" >&2
    return 1
  fi
  echo "$elapsed"
}

# --- I/O Stats ---
# Run SQL and capture timing + I/O stats from pg_stat_io
# Resets stats before query, waits for flush, then reads absolute values
# Note: pg_stat_io has PGSTAT_MIN_INTERVAL=1000ms flush delay, so we wait 1.5s
#       after the query to ensure stats are flushed to shared memory.
# Note: pg_stat_io counts I/O operations, not pages (with io_combine_limit=128kB,
#       up to 16 pages per operation). This is expected behavior.
# Returns: ms,reads,read_time,writes,write_time
# Run SQL once and report elapsed time plus I/O counters.
#   run_timed_with_io ROOT PORT SQL
# Prints "ms,reads,read_time,writes,write_time" on stdout:
#   ms          elapsed time of the last "Time:" line psql printed
#   reads..     sums from pg_stat_io for object 'relation', restricted to
#               the 'client backend' and 'io worker' backend types
# Returns 1 (message on stderr) when psql fails or no timing can be parsed,
# so a broken run is not recorded as an empty CSV field.
run_timed_with_io() {
  local ROOT="$1" PORT="$2" SQL="$3"
  local result

  # Reset stats, run query, wait for flush, read absolute values
  # - Filter by client backend and io worker (excludes bgwriter/checkpointer)
  # - 1.5s delay allows stats to flush (PGSTAT_MIN_INTERVAL=1000ms)
  #
  # The 2>&1 has to be part of the psql command line. Placed on a line of
  # its own after the here-document it is a separate, empty command: stderr
  # is not captured and psql's exit status is replaced by 0.
  if ! result=$("$(pg "$ROOT" psql)" -h 127.0.0.1 -p "$PORT" -d postgres -X -v ON_ERROR_STOP=1 2>&1 <<EOSQL
SELECT pg_stat_reset_shared('io');
\\timing on
$SQL
\\timing off
SELECT pg_sleep(1.5);
\\t on
SELECT 
  COALESCE(SUM(reads),0)::bigint,
  COALESCE(SUM(read_time),0)::numeric(12,2),
  COALESCE(SUM(writes),0)::bigint,
  COALESCE(SUM(write_time),0)::numeric(12,2)
FROM pg_stat_io 
WHERE object = 'relation' AND backend_type IN ('client backend', 'io worker');
EOSQL
  ); then
    echo "ERROR: psql failed while running: $SQL" >&2
    echo "$result" >&2
    return 1
  fi

  # Parse timing (last Time: line)
  local ms
  ms=$(echo "$result" | awk '
    /Time:/ {
      val=$2; unit=$3;
      if (unit=="ms") ms=val;
      else if (unit=="s") ms=val*1000;
    }
    END { if (ms=="") exit 1; printf "%.3f\n", ms; }
  ') || true
  # Same validation as run_timed.
  if [[ ! "$ms" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
    echo "ERROR: Non-numeric timing: $ms" >&2
    return 1
  fi

  # Parse I/O stats (last non-empty line with pipe separator: reads|read_time|writes|write_time)
  local reads read_time writes write_time
  local io_line
  io_line=$(echo "$result" | grep '|' | tail -1) || true
  reads=$(echo "$io_line"     | cut -d'|' -f1 | tr -d ' ')
  read_time=$(echo "$io_line"  | cut -d'|' -f2 | tr -d ' ')
  writes=$(echo "$io_line"    | cut -d'|' -f3 | tr -d ' ')
  write_time=$(echo "$io_line" | cut -d'|' -f4 | tr -d ' ')

  # Default to 0 if not found
  [[ "$reads"      =~ ^-?[0-9]+$             ]] || reads=0
  [[ "$read_time"  =~ ^-?[0-9]+(\.[0-9]+)?$ ]] || read_time=0
  [[ "$writes"     =~ ^-?[0-9]+$             ]] || writes=0
  [[ "$write_time" =~ ^-?[0-9]+(\.[0-9]+)?$ ]] || write_time=0

  echo "$ms,$reads,$read_time,$writes,$write_time"
}

# --- Statistics ---
# Print the median of column 2 of a CSV file, skipping the header row.
#   calc_median CSV
# Prints 0 when the file has no data rows. With an even number of rows the
# mean of the two middle values is printed.
calc_median() {
  awk -F, '
    NR > 1 { vals[++count] = $2 }
    END {
      if (count == 0) { print 0; exit }
      # Exchange sort; the number of repetitions is small.
      for (x = 1; x <= count; x++)
        for (y = x + 1; y <= count; y++)
          if (vals[x] > vals[y]) { tmp = vals[x]; vals[x] = vals[y]; vals[y] = tmp }
      if (count % 2)
        print vals[int(count / 2) + 1]
      else
        print (vals[count / 2] + vals[count / 2 + 1]) / 2
    }
  ' "$1"
}

# Print the median of one column of a CSV file, skipping the header row.
#   calc_median_col CSV COLUMN      (COLUMN is 1-based)
# Prints 0 when the file has no data rows. With an even number of rows the
# mean of the two middle values is printed.
calc_median_col() {
  local file="$1" col="$2"
  awk -F, -v col="$col" '
    NR > 1 { vals[++count] = $col }
    END {
      if (count == 0) { print 0; exit }
      # Exchange sort; the number of repetitions is small.
      for (x = 1; x <= count; x++)
        for (y = x + 1; y <= count; y++)
          if (vals[x] > vals[y]) { tmp = vals[x]; vals[x] = vals[y]; vals[y] = tmp }
      if (count % 2)
        print vals[int(count / 2) + 1]
      else
        print (vals[count / 2] + vals[count / 2 + 1]) / 2
    }
  ' "$file"
}

# Print "median=…ms mean=…±…ms n=…" for column 2 of a CSV file (header row
# skipped). The spread is the population standard deviation. Prints nothing
# when there are no data rows.
#   calc_stats CSV
calc_stats() {
  local csv="$1"
  awk -F, '
    NR > 1 { vals[++count] = $2; total += $2 }
    END {
      if (count == 0) exit
      # Exchange sort; the number of repetitions is small.
      for (x = 1; x <= count; x++)
        for (y = x + 1; y <= count; y++)
          if (vals[x] > vals[y]) { tmp = vals[x]; vals[x] = vals[y]; vals[y] = tmp }
      if (count % 2)
        med = vals[int(count / 2) + 1]
      else
        med = (vals[count / 2] + vals[count / 2 + 1]) / 2
      avg = total / count
      for (x = 1; x <= count; x++) sq += (vals[x] - avg)^2
      sd = sqrt(sq / count)
      printf "median=%.1fms mean=%.1f±%.1fms n=%d", med, avg, sd, count
    }
  ' "$csv"
}

# --- Profiling ---
# Run a SQL command under perf, attaching to the backend PID.
# Generates perf.data and flamegraph SVG.
#   profile_sql ROOT PORT LABEL SQL
profile_sql() {
  # No-op unless profiling was requested.
  [[ $DO_PROFILE -ne 1 ]] && return
  
  local ROOT="$1" PORT="$2" LABEL="$3" SQL="$4"
  # All profiling artifacts go to ROOT/profile, named after LABEL.
  local PROF_DIR="$ROOT/profile"
  mkdir -p "$PROF_DIR"
  
  local PERF_DATA="$PROF_DIR/${LABEL}.perf.data"
  local SVG="$PROF_DIR/${LABEL}.svg"
  local psql_bin
  psql_bin="$(pg "$ROOT" psql)"
  
  # Use a unique application_name to find the backend PID
  # ($$ is the PID of this script).
  local APP="prof_${LABEL}_$$"
  
  # Launch a psql session that will first identify itself, then run the SQL
  # The pg_sleep() gives us time to find the backend PID and attach perf
  # The session runs in the background with its output discarded.
  PGAPPNAME="$APP" "$psql_bin" -h 127.0.0.1 -p "$PORT" -d postgres \
    -X -v ON_ERROR_STOP=1 <<EOSQL >/dev/null 2>&1 &
SELECT pg_sleep(2);
$SQL
EOSQL
  local QUERY_SHELL_PID=$!
  
  # Find the backend PID via pg_stat_activity
  # Polls up to 100 times with a 50 ms pause between attempts.
  # NOTE(review): the loop counter n is not declared local.
  local BACKEND_PID=""
  for ((n=0; n<100; n++)); do
    BACKEND_PID=$("$psql_bin" -h 127.0.0.1 -p "$PORT" -d postgres -Atq \
      -c "SELECT pid FROM pg_stat_activity WHERE application_name='${APP}' ORDER BY backend_start DESC LIMIT 1;" 2>/dev/null)
    [[ -n "$BACKEND_PID" && -d "/proc/$BACKEND_PID" ]] && break
    sleep 0.05
  done
  
  # Without a backend PID there is nothing to attach to; let the query
  # finish and give up on profiling this run.
  if [[ -z "$BACKEND_PID" || ! -d "/proc/$BACKEND_PID" ]]; then
    log "WARNING: Could not find backend PID for profiling, skipping"
    wait "$QUERY_SHELL_PID" 2>/dev/null || true
    return
  fi
  
  log "Profiling backend PID $BACKEND_PID → $PERF_DATA"
  
  # Attach perf to the backend; we explicitly kill -INT it after the query finishes
  # PERF_SUDO is left unquoted so its value is word-split into a command prefix.
  $PERF_SUDO perf record -g --call-graph dwarf \
    -p "$BACKEND_PID" -o "$PERF_DATA" \
    --event="$PERF_EVENT" 2>/dev/null &
  local PERF_PID=$!
  sleep 0.1
  
  # Verify perf actually started (permissions, valid PID, etc.)
  if ! kill -0 "$PERF_PID" 2>/dev/null; then
    log "WARNING: perf record failed to start (permissions/config?), skipping flamegraph"
    wait "$QUERY_SHELL_PID" 2>/dev/null || true
    return
  fi
  
  # Wait for the query to finish
  wait "$QUERY_SHELL_PID" 2>/dev/null || true
  
  # Give perf a moment to flush, then stop it
  sleep 0.5
  $PERF_SUDO kill -INT "$PERF_PID" 2>/dev/null || true; wait "$PERF_PID" 2>/dev/null || true
  
  # Generate flamegraph
  generate_flamegraph "$PERF_DATA" "$SVG" "$LABEL"
}

# Convert perf.data → flamegraph SVG
#   generate_flamegraph PERF_DATA SVG_PATH TITLE
# Render a flamegraph SVG from a perf.data file.
#   generate_flamegraph PERF_DATA SVG_PATH TITLE
# Profiling is best-effort: every failure is reported as a warning (or
# skipped silently when PERF_DATA does not exist) and the function always
# returns 0, so a profiling problem cannot abort the benchmark via `set -e`.
generate_flamegraph() {
  local PERF_DATA="$1" SVG="$2" TITLE="$3"

  # Explicit status: a bare `return` would hand back the status of the
  # failed file test (1).
  [[ -f "$PERF_DATA" ]] || return 0

  # Intermediate folded-stack file; removed again in every case.
  local FOLDED="${PERF_DATA%.perf.data}.folded"
  if $PERF_SUDO perf script -i "$PERF_DATA" 2>/dev/null \
      | "$FLAMEGRAPH_DIR/stackcollapse-perf.pl" > "$FOLDED" 2>/dev/null \
      && [[ -s "$FOLDED" ]]; then
    # Only announce the SVG when flamegraph.pl actually succeeded.
    if "$FLAMEGRAPH_DIR/flamegraph.pl" --title "$TITLE" --countname samples \
        "$FOLDED" > "$SVG" 2>/dev/null; then
      log "Flamegraph: $SVG"
    else
      log "WARNING: Failed to generate flamegraph for $TITLE"
    fi
  else
    log "WARNING: Failed to generate flamegraph for $TITLE"
  fi
  rm -f "$FOLDED"
  return 0
}

# --- Benchmark runner ---
# benchmark ROOT PORT NAME SQL RELATION [RELATION...]
# Run one SQL statement REPS times from a cold cache and record each run.
#   benchmark ROOT PORT NAME SQL RELATION [RELATION...]
# Writes ROOT/results/NAME.csv with one row per repetition; the listed
# relations are evicted before every repetition.
benchmark() {
  local ROOT="$1" PORT="$2" NAME="$3" SQL="$4"
  shift 4
  local -a targets=("$@")
  local csv="$ROOT/results/${NAME}.csv"
  local rep result ms reads read_time writes write_time

  mkdir -p "$ROOT/results"
  echo "run,ms,reads,read_time_ms,writes,write_time_ms" > "$csv"

  rep=1
  while (( rep <= REPS )); do
    drop_caches "$ROOT" "$PORT" "${targets[@]}"
    result=$(run_timed_with_io "$ROOT" "$PORT" "$SQL")
    # Split "ms,reads,read_time,writes,write_time" into its fields.
    IFS=',' read -r ms reads read_time writes write_time <<<"$result"
    echo "$rep,$ms,$reads,$read_time,$writes,$write_time" >> "$csv"
    log "$NAME [$rep/$REPS]: ${ms}ms (reads=$reads, read_time=${read_time}ms, writes=$writes, write_time=${write_time}ms)"
    rep=$((rep + 1))
  done
}

# --- Data setup functions ---
# (Re)create bloom_test and its bloom index bloom_idx, then VACUUM ANALYZE
# and CHECKPOINT.
#   setup_bloom ROOT PORT SIZE      SIZE: small | medium | large
setup_bloom() {
  local ROOT="$1" PORT="$2" SIZE="$3"
  local nrows

  if [[ "$SIZE" == "small" ]]; then
    nrows=100000
  elif [[ "$SIZE" == "medium" ]]; then
    nrows=1000000
  elif [[ "$SIZE" == "large" ]]; then
    nrows=10000000
  else
    die "Invalid size '$SIZE' (must be small, medium, or large)"
  fi

  log "Creating Bloom test data ($SIZE: $nrows rows)"
  psql_run "$ROOT" "$PORT" <<SQL
CREATE EXTENSION IF NOT EXISTS bloom;
DROP TABLE IF EXISTS bloom_test;
CREATE TABLE bloom_test (id INT, data TEXT, val1 INT, val2 INT);
INSERT INTO bloom_test SELECT i, 'data_'||i, i%1000, i%100 FROM generate_series(1,$nrows) i;
CREATE INDEX bloom_idx ON bloom_test USING bloom (val1, val2);
VACUUM ANALYZE bloom_test;
CHECKPOINT;
SQL
}

# (Re)create heap_test filled with 100-character rows, then VACUUM ANALYZE
# and CHECKPOINT.
#   setup_pgstattuple ROOT PORT SIZE      SIZE: small | medium | large
setup_pgstattuple() {
  local ROOT="$1" PORT="$2" SIZE="$3"
  local nrows

  if [[ "$SIZE" == "small" ]]; then
    nrows=100000
  elif [[ "$SIZE" == "medium" ]]; then
    nrows=1000000
  elif [[ "$SIZE" == "large" ]]; then
    nrows=10000000
  else
    die "Invalid size '$SIZE' (must be small, medium, or large)"
  fi

  log "Creating pgstattuple test data ($SIZE: $nrows rows)"
  psql_run "$ROOT" "$PORT" <<SQL
CREATE EXTENSION IF NOT EXISTS pgstattuple;
DROP TABLE IF EXISTS heap_test;
CREATE TABLE heap_test (id SERIAL PRIMARY KEY, data TEXT);
INSERT INTO heap_test (data) SELECT repeat('x',100) FROM generate_series(1,$nrows);
VACUUM ANALYZE heap_test;
CHECKPOINT;
SQL
}

# (Re)create idx_test (with its primary-key index), then VACUUM ANALYZE and
# CHECKPOINT.
#   setup_pgstatindex ROOT PORT SIZE      SIZE: small | medium | large
setup_pgstatindex() {
  local ROOT="$1" PORT="$2" SIZE="$3"
  local nrows

  if [[ "$SIZE" == "small" ]]; then
    nrows=100000
  elif [[ "$SIZE" == "medium" ]]; then
    nrows=1000000
  elif [[ "$SIZE" == "large" ]]; then
    nrows=10000000
  else
    die "Invalid size '$SIZE' (must be small, medium, or large)"
  fi

  log "Creating pgstatindex test data ($SIZE: $nrows rows)"
  psql_run "$ROOT" "$PORT" <<SQL
CREATE EXTENSION IF NOT EXISTS pgstattuple;
DROP TABLE IF EXISTS idx_test;
CREATE TABLE idx_test (id SERIAL PRIMARY KEY, data TEXT);
INSERT INTO idx_test (data) SELECT 'data_row_' || i || '_' || repeat('x',50) FROM generate_series(1,$nrows) i;
VACUUM ANALYZE idx_test;
CHECKPOINT;
SQL
}

# (Re)create gin_test (five random tags per row) and its GIN index gin_idx,
# then VACUUM ANALYZE and CHECKPOINT.
#   setup_gin ROOT PORT SIZE      SIZE: small | medium | large
setup_gin() {
  local ROOT="$1" PORT="$2" SIZE="$3"
  local nrows

  if [[ "$SIZE" == "small" ]]; then
    nrows=100000
  elif [[ "$SIZE" == "medium" ]]; then
    nrows=1000000
  elif [[ "$SIZE" == "large" ]]; then
    nrows=5000000
  else
    die "Invalid size '$SIZE' (must be small, medium, or large)"
  fi

  log "Creating GIN test data ($SIZE: $nrows rows)"
  psql_run "$ROOT" "$PORT" <<SQL
DROP TABLE IF EXISTS gin_test;
-- No PRIMARY KEY: isolate GIN index vacuum from btree overhead
CREATE TABLE gin_test (id INT, tags TEXT[]);
INSERT INTO gin_test (id, tags)
SELECT i, ARRAY(SELECT 'tag_'||(random()*100)::int FROM generate_series(1,5))
FROM generate_series(1,$nrows) i;
CREATE INDEX gin_idx ON gin_test USING gin (tags);
VACUUM ANALYZE gin_test;
CHECKPOINT;
SQL
}

# (Re)create hash_test with unique ids and its hash index hash_idx, then
# VACUUM ANALYZE and CHECKPOINT.
#   setup_hash ROOT PORT SIZE      SIZE: small | medium | large
setup_hash() {
  local ROOT="$1" PORT="$2" SIZE="$3"
  local nrows

  if [[ "$SIZE" == "small" ]]; then
    nrows=500000
  elif [[ "$SIZE" == "medium" ]]; then
    nrows=1000000
  elif [[ "$SIZE" == "large" ]]; then
    nrows=20000000
  else
    die "Invalid size '$SIZE' (must be small, medium, or large)"
  fi

  log "Creating Hash test data ($SIZE: $nrows unique values)"
  psql_run "$ROOT" "$PORT" <<SQL
DROP TABLE IF EXISTS hash_test;
-- No PRIMARY KEY: isolate hash index vacuum from btree overhead
CREATE TABLE hash_test (id INT, data TEXT);
INSERT INTO hash_test SELECT i, 'x' FROM generate_series(1,$nrows) i;
CREATE INDEX hash_idx ON hash_test USING hash (id);
VACUUM ANALYZE hash_test;
CHECKPOINT;
SQL
}

# (Re)create wal_test with a tsvector column (no index yet), then
# VACUUM ANALYZE and CHECKPOINT.
#   setup_wal ROOT PORT SIZE      SIZE: small | medium | large
setup_wal() {
  local ROOT="$1" PORT="$2" SIZE="$3"
  local nrows

  if [[ "$SIZE" == "small" ]]; then
    nrows=1000000
  elif [[ "$SIZE" == "medium" ]]; then
    nrows=5000000
  elif [[ "$SIZE" == "large" ]]; then
    nrows=20000000
  else
    die "Invalid size '$SIZE' (must be small, medium, or large)"
  fi

  log "Creating table for GIN index build / log_newpage_range test ($SIZE: $nrows rows)"
  psql_run "$ROOT" "$PORT" <<SQL
DROP TABLE IF EXISTS wal_test;
-- Table with tsvector column for GIN indexing (full-text search)
-- GIN index builds always call log_newpage_range() at the end of
-- ginbuild() (gininsert.c) to WAL-log all index pages. 
CREATE TABLE wal_test (id INT, doc TEXT, doc_tsv TSVECTOR);
INSERT INTO wal_test
  SELECT i,
         'word' || (random()*10000)::int || ' term' || (random()*10000)::int
           || ' token' || (random()*5000)::int || ' phrase' || (random()*8000)::int,
         to_tsvector('simple',
           'word' || (random()*10000)::int || ' term' || (random()*10000)::int
           || ' token' || (random()*5000)::int || ' phrase' || (random()*8000)::int)
  FROM generate_series(1,$nrows) i;
VACUUM ANALYZE wal_test;
CHECKPOINT;
SQL
}

# --- Test functions ---
# Benchmark a bloom index scan on freshly created data.
#   test_bloom_scan ROOT PORT LABEL SIZE
test_bloom_scan() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local run_name="${LABEL}_bloom_scan_${SIZE}"
  local query="SET enable_seqscan=off; SELECT COUNT(*) FROM bloom_test WHERE val1=42 AND val2=7;"

  setup_bloom "$ROOT" "$PORT" "$SIZE"
  benchmark "$ROOT" "$PORT" "$run_name" "$query" bloom_test bloom_idx

  [[ $DO_PROFILE -eq 1 ]] || return 0

  # Profile after benchmark reps: shared_buffers memory already faulted in,
  # so page-fault noise is gone; drop_caches ensures cold IO for the profile.
  drop_caches "$ROOT" "$PORT" bloom_test bloom_idx
  profile_sql "$ROOT" "$PORT" "$run_name" "$query"
}

# test_bloom_vacuum ROOT PORT LABEL SIZE
#
# Times "VACUUM bloom_test" REPS times. Every rep rebuilds the table, deletes
# every 10th row, drops caches, and appends one CSV row
# (run,ms,reads,read_time_ms,writes,write_time_ms) to
# $ROOT/results/${LABEL}_bloom_vacuum_${SIZE}.csv. When DO_PROFILE is 1 one
# more identically prepared VACUUM is run under profile_sql.
test_bloom_vacuum() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local tag="${LABEL}_bloom_vacuum_${SIZE}"
  local csv="$ROOT/results/${tag}.csv"
  local delete_sql="DELETE FROM bloom_test WHERE id % 10 = 0;"
  local vacuum_sql="VACUUM bloom_test;"
  local timing ms reads read_time writes write_time

  mkdir -p "$ROOT/results"
  printf '%s\n' "run,ms,reads,read_time_ms,writes,write_time_ms" > "$csv"

  for ((i=1; i<=REPS; i++)); do
    # Fresh table each run for consistent state, then create 10% dead tuples.
    setup_bloom "$ROOT" "$PORT" "$SIZE"
    psql_run "$ROOT" "$PORT" -c "$delete_sql"

    drop_caches "$ROOT" "$PORT" bloom_test bloom_idx

    # run_timed_with_io prints one comma-separated line of five fields.
    timing=$(run_timed_with_io "$ROOT" "$PORT" "$vacuum_sql")
    IFS=',' read -r ms reads read_time writes write_time <<<"$timing"

    printf '%s,%s,%s,%s,%s,%s\n' "$i" "$ms" "$reads" "$read_time" "$writes" "$write_time" >> "$csv"
    log "${tag} [$i/$REPS]: ${ms}ms (reads=$reads, read_time=${read_time}ms, writes=$writes, write_time=${write_time}ms)"
  done

  [[ $DO_PROFILE -eq 1 ]] || return 0

  # Profile run: same preparation as a timed rep.
  setup_bloom "$ROOT" "$PORT" "$SIZE"
  psql_run "$ROOT" "$PORT" -c "$delete_sql"
  drop_caches "$ROOT" "$PORT" bloom_test bloom_idx
  profile_sql "$ROOT" "$PORT" "$tag" "$vacuum_sql"
}

# test_pgstattuple ROOT PORT LABEL SIZE
#
# Times pgstattuple_approx('heap_test') REPS times from cold caches and
# appends one CSV row per rep (run,ms,reads,read_time_ms,writes,write_time_ms)
# to $ROOT/results/${LABEL}_pgstattuple_${SIZE}.csv. When DO_PROFILE is 1 the
# query is additionally run once under profile_sql, prepared the same way as
# the timed reps (including the hint-bit warmup).
test_pgstattuple() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local OUT="$ROOT/results/${LABEL}_pgstattuple_${SIZE}.csv"
  local APPROX_SQL="SELECT * FROM pgstattuple_approx('heap_test');"
  local DIRTY_VM_SQL="BEGIN; DELETE FROM heap_test WHERE id % 500 = 0; ROLLBACK;"
  mkdir -p "$ROOT/results"
  echo "run,ms,reads,read_time_ms,writes,write_time_ms" > "$OUT"
  
  # Setup once — rolled-back DELETE keeps layout identical across all reps
  setup_pgstattuple "$ROOT" "$PORT" "$SIZE"
  # Rolled-back DELETE clears the all-visible bit in the Visibility Map so
  # pgstattuple_approx must actually read those pages (it skips all-visible pages).
  # Using ROLLBACK keeps the physical layout identical across all reps (no TOAST
  # out-of-page updates, no dirty pages to flush from shared_buffers).
  psql_run "$ROOT" "$PORT" -c "$DIRTY_VM_SQL"
  # Warmup pass: The rolled-back DELETE left every touched tuple with an xmax
  # pointing to the aborted transaction but no hint bits set. On the first
  # pgstattuple_approx call, HeapTupleSatisfiesVacuum → HeapTupleSatisfiesVacuumHorizon
  # must resolve each such xmax: TransactionIdIsInProgress (ProcArray scan) then
  # TransactionIdDidCommit (CLOG lookup) — only then can it call SetHintBits to
  # stamp HEAP_XMAX_INVALID and MarkBufferDirtyHint. Without this warmup, rep 1
  # pays ~1100ms extra CPU for those CLOG/ProcArray lookups. Subsequent reps hit
  # the early-exit at "if (t_infomask & HEAP_XMAX_INVALID) return HEAPTUPLE_LIVE"
  # and skip the expensive path entirely.
  # After this pass, the dirtied hint-bit pages are flushed to disk via
  # drop_caches, so all reps start from the same on-disk state.
  psql_run "$ROOT" "$PORT" -c "$APPROX_SQL" >/dev/null

  for ((i=1; i<=REPS; i++)); do
    drop_caches "$ROOT" "$PORT" heap_test heap_test_pkey
    local result ms reads read_time writes write_time
    # run_timed_with_io prints one comma-separated line of five fields.
    result=$(run_timed_with_io "$ROOT" "$PORT" "$APPROX_SQL")
    IFS=',' read -r ms reads read_time writes write_time <<<"$result"
    echo "$i,$ms,$reads,$read_time,$writes,$write_time" >> "$OUT"
    log "${LABEL}_pgstattuple_${SIZE} [$i/$REPS]: ${ms}ms (reads=$reads, read_time=${read_time}ms, writes=$writes, write_time=${write_time}ms)"
  done
  
  if [[ $DO_PROFILE -eq 1 ]]; then
    psql_run "$ROOT" "$PORT" -c "$DIRTY_VM_SQL"
    # The DELETE/ROLLBACK above leaves fresh un-hinted xmax values behind, just
    # like the one before the timed reps. Repeat the warmup pass so the
    # profiled run starts from the same hint-bit state as the timed reps
    # instead of capturing the one-off xmax resolution cost in the flamegraph.
    psql_run "$ROOT" "$PORT" -c "$APPROX_SQL" >/dev/null
    drop_caches "$ROOT" "$PORT" heap_test heap_test_pkey
    profile_sql "$ROOT" "$PORT" "${LABEL}_pgstattuple_${SIZE}" \
      "$APPROX_SQL"
  fi
}

# test_pgstatindex ROOT PORT LABEL SIZE
#
# Builds the pgstatindex test data and hands the pgstatindex('idx_test_pkey')
# query to benchmark under the name ${LABEL}_pgstatindex_${SIZE}. When
# DO_PROFILE is 1 the query is run once more under profile_sql after the
# caches for idx_test / idx_test_pkey have been dropped.
test_pgstatindex() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local tag="${LABEL}_pgstatindex_${SIZE}"
  local query="SELECT * FROM pgstatindex('idx_test_pkey');"

  setup_pgstatindex "$ROOT" "$PORT" "$SIZE"
  benchmark "$ROOT" "$PORT" "$tag" "$query" idx_test idx_test_pkey

  # Nothing more to do unless profiling was requested.
  [[ $DO_PROFILE -eq 1 ]] || return 0

  drop_caches "$ROOT" "$PORT" idx_test idx_test_pkey
  profile_sql "$ROOT" "$PORT" "$tag" "$query"
}

# test_gin_vacuum ROOT PORT LABEL SIZE
#
# Times "VACUUM ANALYZE gin_test" REPS times. Every rep rebuilds the table,
# drops caches, and appends one CSV row
# (run,ms,reads,read_time_ms,writes,write_time_ms) to
# $ROOT/results/${LABEL}_gin_vacuum_${SIZE}.csv. When DO_PROFILE is 1 one more
# identically prepared run is executed under profile_sql.
test_gin_vacuum() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local tag="${LABEL}_gin_vacuum_${SIZE}"
  local csv="$ROOT/results/${tag}.csv"
  # VACUUM ANALYZE forces ginvacuumcleanup() to run and scan all pages
  local vacuum_sql="VACUUM ANALYZE gin_test;"
  local timing ms reads read_time writes write_time

  mkdir -p "$ROOT/results"
  printf '%s\n' "run,ms,reads,read_time_ms,writes,write_time_ms" > "$csv"

  for ((i=1; i<=REPS; i++)); do
    # Fresh table each run for consistent state
    setup_gin "$ROOT" "$PORT" "$SIZE"
    drop_caches "$ROOT" "$PORT" gin_test gin_idx

    # run_timed_with_io prints one comma-separated line of five fields.
    timing=$(run_timed_with_io "$ROOT" "$PORT" "$vacuum_sql")
    IFS=',' read -r ms reads read_time writes write_time <<<"$timing"

    printf '%s,%s,%s,%s,%s,%s\n' "$i" "$ms" "$reads" "$read_time" "$writes" "$write_time" >> "$csv"
    log "${tag} [$i/$REPS]: ${ms}ms (reads=$reads, read_time=${read_time}ms, writes=$writes, write_time=${write_time}ms)"
  done

  [[ $DO_PROFILE -eq 1 ]] || return 0

  # Profile run: same preparation as a timed rep.
  setup_gin "$ROOT" "$PORT" "$SIZE"
  drop_caches "$ROOT" "$PORT" gin_test gin_idx
  profile_sql "$ROOT" "$PORT" "$tag" "$vacuum_sql"
}

# test_hash_vacuum ROOT PORT LABEL SIZE
#
# Times "VACUUM hash_test" REPS times. Every rep rebuilds the table, deletes
# every 10th row, drops caches, and appends one CSV row
# (run,ms,reads,read_time_ms,writes,write_time_ms) to
# $ROOT/results/${LABEL}_hash_vacuum_${SIZE}.csv. When DO_PROFILE is 1 one
# more identically prepared VACUUM is run under profile_sql.
test_hash_vacuum() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local tag="${LABEL}_hash_vacuum_${SIZE}"
  local csv="$ROOT/results/${tag}.csv"
  local delete_sql="DELETE FROM hash_test WHERE id % 10 = 0;"
  local vacuum_sql="VACUUM hash_test;"
  local timing ms reads read_time writes write_time

  mkdir -p "$ROOT/results"
  printf '%s\n' "run,ms,reads,read_time_ms,writes,write_time_ms" > "$csv"

  for ((i=1; i<=REPS; i++)); do
    # Fresh table each run for consistent state, then create 10% dead tuples.
    setup_hash "$ROOT" "$PORT" "$SIZE"
    psql_run "$ROOT" "$PORT" -c "$delete_sql"

    drop_caches "$ROOT" "$PORT" hash_test hash_idx

    # run_timed_with_io prints one comma-separated line of five fields.
    timing=$(run_timed_with_io "$ROOT" "$PORT" "$vacuum_sql")
    IFS=',' read -r ms reads read_time writes write_time <<<"$timing"

    printf '%s,%s,%s,%s,%s,%s\n' "$i" "$ms" "$reads" "$read_time" "$writes" "$write_time" >> "$csv"
    log "${tag} [$i/$REPS]: ${ms}ms (reads=$reads, read_time=${read_time}ms, writes=$writes, write_time=${write_time}ms)"
  done

  [[ $DO_PROFILE -eq 1 ]] || return 0

  # Profile run: same preparation as a timed rep.
  setup_hash "$ROOT" "$PORT" "$SIZE"
  psql_run "$ROOT" "$PORT" -c "$delete_sql"
  drop_caches "$ROOT" "$PORT" hash_test hash_idx
  profile_sql "$ROOT" "$PORT" "$tag" "$vacuum_sql"
}

# test_wal_logging ROOT PORT LABEL SIZE
#
# Times a GIN CREATE INDEX on wal_test REPS times. The table is built once;
# each rep drops the index left by the previous rep, drops caches for the
# table, rebuilds the index, and appends one CSV row
# (run,ms,reads,read_time_ms,writes,write_time_ms) to
# $ROOT/results/${LABEL}_wal_logging_${SIZE}.csv. When DO_PROFILE is 1 one
# more index build is run under profile_sql.
test_wal_logging() {
  local ROOT="$1" PORT="$2" LABEL="$3" SIZE="$4"
  local tag="${LABEL}_wal_logging_${SIZE}"
  local csv="$ROOT/results/${tag}.csv"
  local drop_idx_sql="DROP INDEX IF EXISTS wal_test_gin_idx;"
  # CREATE INDEX on GIN (tsvector_ops):
  # - GIN always uses the same build path: ginbuild() populates the
  #   index in memory, flushes to disk, then calls log_newpage_range()
  #   to read ALL index pages and write them to WAL (gininsert.c:785-790)
  local build_idx_sql="CREATE INDEX wal_test_gin_idx ON wal_test USING gin (doc_tsv);"
  local timing ms reads read_time writes write_time

  mkdir -p "$ROOT/results"
  printf '%s\n' "run,ms,reads,read_time_ms,writes,write_time_ms" > "$csv"

  # Build table once - only rebuild index each rep
  setup_wal "$ROOT" "$PORT" "$SIZE"

  for ((i=1; i<=REPS; i++)); do
    # Drop index from previous iteration
    psql_run "$ROOT" "$PORT" -c "$drop_idx_sql"

    # Drop OS caches - source table pages are COLD on disk
    drop_caches "$ROOT" "$PORT" wal_test

    # run_timed_with_io prints one comma-separated line of five fields.
    timing=$(run_timed_with_io "$ROOT" "$PORT" "$build_idx_sql")
    IFS=',' read -r ms reads read_time writes write_time <<<"$timing"

    printf '%s,%s,%s,%s,%s,%s\n' "$i" "$ms" "$reads" "$read_time" "$writes" "$write_time" >> "$csv"
    log "${tag} [$i/$REPS]: ${ms}ms (reads=$reads, read_time=${read_time}ms, writes=$writes, write_time=${write_time}ms)"
  done

  [[ $DO_PROFILE -eq 1 ]] || return 0

  # Profile run: same preparation as a timed rep.
  psql_run "$ROOT" "$PORT" -c "$drop_idx_sql"
  drop_caches "$ROOT" "$PORT" wal_test
  profile_sql "$ROOT" "$PORT" "$tag" "$build_idx_sql"
}

# --- Run tests for a build ---
# warmup_catalog ROOT PORT
#
# Loads a fixed set of system catalogs into shared_buffers via pg_prewarm so
# the first benchmark rep does not pay disk-read cost for catalog pages.
# pg_buffercache_evict_relation only evicts the test relation, not catalogs,
# so these stay warm across all reps. psql output is discarded.
warmup_catalog() {
  local ROOT="$1" PORT="$2"

  # Quoted delimiter: the SQL contains nothing for the shell to expand.
  psql_run "$ROOT" "$PORT" >/dev/null <<'SQL'
SELECT pg_prewarm('pg_class',     'buffer');
SELECT pg_prewarm('pg_attribute', 'buffer');
SELECT pg_prewarm('pg_namespace', 'buffer');
SELECT pg_prewarm('pg_proc',      'buffer');
SELECT pg_prewarm('pg_type',      'buffer');
SQL
}

# run_tests ROOT LABEL
#
# Starts a cluster for the build rooted at ROOT on a freshly picked port,
# runs the test(s) selected by the global TEST for every size in the global
# SIZES, and stops the cluster again. LABEL ("base" / "patched") is passed to
# the individual tests, which use it as a prefix for result names.
#
# An EXIT trap stops the cluster if the script terminates while it is up; the
# trap is removed once the cluster has been stopped normally.
run_tests() {
  local ROOT="$1" LABEL="$2"
  local PORT
  PORT=$(pick_port)
  
  log "[$LABEL] Starting cluster on port $PORT"
  init_cluster "$ROOT" "$PORT"

  # Arm the cleanup trap as soon as the cluster is up, so a failure in the
  # warmup or I/O-delay steps below does not leave it running. %q quotes ROOT
  # safely for re-evaluation even if the path contains quotes or spaces.
  # NOTE(review): assumes set_io_delay does not install its own EXIT trap.
  local stop_cmd
  printf -v stop_cmd 'stop_cluster %q' "$ROOT"
  trap "$stop_cmd" EXIT

  warmup_catalog "$ROOT" "$PORT"
  set_io_delay "$IO_DELAY_MS"
  
  for SIZE in $SIZES; do
    case "$TEST" in
      bloom_scan)   test_bloom_scan "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      bloom_vacuum) test_bloom_vacuum "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      pgstattuple)  test_pgstattuple "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      pgstatindex)  test_pgstatindex "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      gin_vacuum)   test_gin_vacuum "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      hash_vacuum)  test_hash_vacuum "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      wal_logging)  test_wal_logging "$ROOT" "$PORT" "$LABEL" "$SIZE" ;;
      all)
        # NOTE(review): "all" does not run bloom_scan or gin_vacuum, although
        # both can be selected individually — confirm this is intentional.
        test_bloom_vacuum "$ROOT" "$PORT" "$LABEL" "$SIZE"
        test_pgstattuple "$ROOT" "$PORT" "$LABEL" "$SIZE"
        test_pgstatindex "$ROOT" "$PORT" "$LABEL" "$SIZE"
        test_hash_vacuum "$ROOT" "$PORT" "$LABEL" "$SIZE"
        test_wal_logging "$ROOT" "$PORT" "$LABEL" "$SIZE"
        ;;
      *) die "Unknown test: $TEST" ;;
    esac
  done
  
  stop_cluster "$ROOT"
  trap - EXIT
}

# --- Compare results ---
# compare_results BASE_CSV PATCH_CSV LABEL
#
# Prints one summary line comparing the median runtime of a baseline and a
# patched result file: both medians, the speedup factor (base / patch) and
# the percentage change relative to base. If the baseline CSV header mentions
# "reads", a second line with the median read/write operation counts, I/O
# times and per-component speedups is appended.
#
# Does nothing (status 0) when either CSV file is missing. Empty or zero
# medians are replaced by 0.001 so the ratios stay defined.
compare_results() {
  local base_csv="$1" patch_csv="$2" label="$3"
  
  if [[ ! -f "$base_csv" || ! -f "$patch_csv" ]]; then
    return 0
  fi
  
  local base_med patch_med
  base_med=$(calc_median "$base_csv")
  patch_med=$(calc_median "$patch_csv")
  
  # Guard against empty or zero values to prevent division by zero. The zero
  # test is numeric so that spellings such as "0.0" or "0.00" are caught too.
  if [[ -z "$base_med" ]] || awk -v v="$base_med" 'BEGIN { exit !(v + 0 == 0) }'; then
    base_med="0.001"
  fi
  if [[ -z "$patch_med" ]] || awk -v v="$patch_med" 'BEGIN { exit !(v + 0 == 0) }'; then
    patch_med="0.001"
  fi
  
  # Values are handed to awk with -v rather than spliced into the program text.
  local speedup pct
  speedup=$(awk -v b="$base_med" -v p="$patch_med" 'BEGIN { printf "%.2f", b / p }')
  pct=$(awk -v b="$base_med" -v p="$patch_med" 'BEGIN { printf "%.1f", (b - p) / b * 100 }')
  
  local io_info=""
  if head -1 "$base_csv" | grep -q "reads"; then
    # Standard test: columns are run,ms,reads,read_time_ms,writes,write_time_ms
    local base_reads patch_reads base_rtime patch_rtime base_writes patch_writes base_wtime patch_wtime
    base_reads=$(calc_median_col "$base_csv" 3)
    patch_reads=$(calc_median_col "$patch_csv" 3)
    base_rtime=$(calc_median_col "$base_csv" 4)
    patch_rtime=$(calc_median_col "$patch_csv" 4)
    base_writes=$(calc_median_col "$base_csv" 5)
    patch_writes=$(calc_median_col "$patch_csv" 5)
    base_wtime=$(calc_median_col "$base_csv" 6)
    patch_wtime=$(calc_median_col "$patch_csv" 6)
    # Default to 0 if empty
    base_reads=${base_reads:-0}
    patch_reads=${patch_reads:-0}
    base_rtime=${base_rtime:-0}
    patch_rtime=${patch_rtime:-0}
    base_writes=${base_writes:-0}
    patch_writes=${patch_writes:-0}
    base_wtime=${base_wtime:-0}
    patch_wtime=${patch_wtime:-0}

    # Per-component speedup (guard against division by zero)
    local r_speedup w_speedup
    if awk -v t="$patch_rtime" 'BEGIN { exit (t + 0 > 0) ? 0 : 1 }'; then
      r_speedup=$(awk -v b="$base_rtime" -v p="$patch_rtime" 'BEGIN { printf "%.2fx", b / p }')
    else
      r_speedup="n/a"
    fi
    if awk -v t="$patch_wtime" 'BEGIN { exit (t + 0 > 0) ? 0 : 1 }'; then
      w_speedup=$(awk -v b="$base_wtime" -v p="$patch_wtime" 'BEGIN { printf "%.2fx", b / p }')
    else
      w_speedup="n/a"
    fi

    io_info=$(printf "\n%-26s   read:  %s→%s ops  %s→%sms  (%s)    write:  %s→%s ops  %s→%sms  (%s)" \
      "" "$base_reads" "$patch_reads" "$base_rtime" "$patch_rtime" "$r_speedup" \
      "$base_writes" "$patch_writes" "$base_wtime" "$patch_wtime" "$w_speedup")
  fi
  
  printf "%-26s base=%8.1fms  patch=%8.1fms  %5.2fx  (%5.1f%%)%s\n" \
    "$label" "$base_med" "$patch_med" "$speedup" "$pct" "$io_info"
}

# print_summary
#
# Prints the final report. With BASELINE=1 it emits one compare_results entry
# per selected test and size; otherwise it lists calc_stats output for every
# CSV in the patched results directory. It then prints the result directory
# locations and, when DO_PROFILE=1, any *.svg files found in the profile
# directories of both builds.
print_summary() {
  echo ""
  echo "═══════════════════════════════════════════════════════════════════════"
  echo "                     STREAMING READ BENCHMARK RESULTS                   "
  echo "═══════════════════════════════════════════════════════════════════════"
  echo ""

  if [[ $BASELINE -ne 1 ]]; then
    # Patched-only run: no comparison possible, just show per-file statistics.
    echo "Results (patched only):"
    echo ""
    for f in "$ROOT_PATCH/results/"*.csv; do
      if [[ ! -f "$f" ]]; then
        continue
      fi
      printf "%-40s %s\n" "$(basename "$f" .csv):" "$(calc_stats "$f")"
    done
  else
    printf "%-26s %-17s %-17s %-7s %-7s %s\n" "TEST" "BASELINE" "PATCHED" "SPEEDUP" "CHANGE" "I/O TIME"
    echo "─────────────────────────────────────────────────────────────────────────────────────────────────"

    for SIZE in $SIZES; do
      for test_name in bloom_scan bloom_vacuum pgstattuple pgstatindex gin_vacuum hash_vacuum wal_logging; do
        # Skip tests that were not selected for this run.
        if [[ "$TEST" != "all" && "$TEST" != "$test_name" ]]; then
          continue
        fi
        compare_results \
          "$ROOT_BASE/results/base_${test_name}_${SIZE}.csv" \
          "$ROOT_PATCH/results/patched_${test_name}_${SIZE}.csv" \
          "${test_name}_${SIZE}"
      done
    done
  fi

  echo ""
  echo "═══════════════════════════════════════════════════════════════════════"
  echo "CSV files: $ROOT_PATCH/results/"
  if [[ $BASELINE -eq 1 ]]; then
    echo "Baseline:  $ROOT_BASE/results/"
  fi

  # List generated flamegraphs
  if [[ $DO_PROFILE -eq 1 ]]; then
    local -a svgs=()
    for dir in "$ROOT_BASE/profile" "$ROOT_PATCH/profile"; do
      if [[ ! -d "$dir" ]]; then
        continue
      fi
      for svg in "$dir"/*.svg; do
        # An unmatched glob stays literal; the -f test filters it out.
        if [[ -f "$svg" ]]; then
          svgs+=("$svg")
        fi
      done
    done
    if (( ${#svgs[@]} > 0 )); then
      echo ""
      echo "Flamegraphs:"
      printf '  %s\n' "${svgs[@]}"
    fi
  fi

  echo "═══════════════════════════════════════════════════════════════════════"
}

# --- Main ---
# main
#
# Benchmark driver: logs the effective configuration, builds the patched tree
# (and the baseline tree first when BASELINE=1), runs the selected tests
# against each build, and prints the summary. All settings are read from
# global variables.
main() {
  log "Streaming Read Benchmark"
  log "Patch: $PATCH ($PATCH_TAG)"
  log "Tests: $TEST"
  log "Sizes: $SIZES"
  log "Reps:  $REPS"
  log "I/O:   $IO_METHOD (workers=$IO_WORKERS, concurrency=$IO_MAX_CONCURRENCY)"
  if [[ $DIRECT_IO -eq 1 ]]; then
    log "Direct IO: enabled (debug_io_direct=data)"
  fi
  if [[ -n "$IO_DELAY_MS" ]]; then
    log "I/O delay: ${IO_DELAY_MS}ms read / ${WRITE_DELAY_MS}ms write via dm_delay ($DM_DELAY_DEV)"
  fi
  if [[ $DO_PROFILE -eq 1 ]]; then
    log "Profile: enabled (flamegraphs → <root>/profile/)"
  fi

  # Build: both trees are built before any test is run.
  if [[ $BASELINE -eq 1 ]]; then
    build_pg "$ROOT_BASE" ""
  fi
  build_pg "$ROOT_PATCH" "$PATCH"

  # Run tests: baseline first (when requested), then the patched build.
  if [[ $BASELINE -eq 1 ]]; then
    log "Running baseline tests"
    run_tests "$ROOT_BASE" "base"
  fi

  log "Running patched tests"
  run_tests "$ROOT_PATCH" "patched"

  # Summary
  print_summary
}

# Script entry point.
main

^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 23:04                     ` Re: Streamify more code paths Andres Freund <[email protected]>
  2026-03-11 01:37                       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 15:11                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 21:33                           ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-12 03:42                             ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-12 04:39                               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-12 15:35                                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-13 01:49                                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-13 02:39                                     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-14 09:56                                       ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-15 02:51                                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-15 03:47                                           ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
@ 2026-03-16 01:00                                             ` Michael Paquier <[email protected]>
  2026-03-16 03:05                                               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  0 siblings, 1 reply; 35+ messages in thread

From: Michael Paquier @ 2026-03-16 01:00 UTC (permalink / raw)
  To: Xuneng Zhou <[email protected]>; +Cc: Andres Freund <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

On Sun, Mar 15, 2026 at 11:47:06AM +0800, Xuneng Zhou wrote:
> The updated script now reports speedup separately for the read and
> write paths like this:
> 
> hash_vacuum_medium         base= 33747.2ms  patch= 27379.7ms   1.23x  ( 18.9%)
>                              read:  4662→1612 ops  8238.72→1725.86ms
> (4.77x)    write:  12689→12651 ops  25146.51→25053.57ms  (1.00x)
> 
> I think it is useful to keep the write-delay option even with this
> reporting. Separating the read and write delays also helps reduce the
> overall runtime of the tests, especially for large data sizes: we only
> slow down the read path while keeping the write path fast.

These write numbers are more in line with what I was seeing in my last
tests, and I am not going to ignore a 20% runtime reduction.  The
stats numbers are looking nice, as well.

At the end, I have checked the logic of what you are introducing,
where things come down to the point of making sure that the data
pushed to the callback remains consistent with the paths where
_hash_getcachedmetap() is called.  I have also cross-checked the data
reports by pgstattuple for the relation and the index, to see that
these are consistent between HEAD and the patch, just in case.  And
applied it.  That was the last piece.
--
Michael


Attachments:

  [application/pgp-signature] signature.asc (833B, 2-signature.asc)
  download

^ permalink  raw  reply  [nested|flat] 35+ messages in thread

* Re: Streamify more code paths
  2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-25 06:33 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-26 10:59   ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-26 16:41     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:41       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-28 11:45         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-29 10:58           ` Re: Streamify more code paths Nazir Bilal Yavuz <[email protected]>
  2025-12-30 01:51             ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2025-12-30 02:43               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 06:06                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-10 10:28                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-10 23:04                     ` Re: Streamify more code paths Andres Freund <[email protected]>
  2026-03-11 01:37                       ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 15:11                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-11 21:33                           ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-12 03:42                             ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-12 04:39                               ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-12 15:35                                 ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-13 01:49                                   ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-13 02:39                                     ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-14 09:56                                       ` Re: Streamify more code paths Michael Paquier <[email protected]>
  2026-03-15 02:51                                         ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-15 03:47                                           ` Re: Streamify more code paths Xuneng Zhou <[email protected]>
  2026-03-16 01:00                                             ` Re: Streamify more code paths Michael Paquier <[email protected]>
@ 2026-03-16 03:05                                               ` Xuneng Zhou <[email protected]>
  0 siblings, 0 replies; 35+ messages in thread

From: Xuneng Zhou @ 2026-03-16 03:05 UTC (permalink / raw)
  To: Michael Paquier <[email protected]>; +Cc: Andres Freund <[email protected]>; pgsql-hackers <[email protected]>; Nazir Bilal Yavuz <[email protected]>

On Mon, Mar 16, 2026 at 9:00 AM Michael Paquier <[email protected]> wrote:
>
> On Sun, Mar 15, 2026 at 11:47:06AM +0800, Xuneng Zhou wrote:
> > The updated script now reports speedup separately for the read and
> > write paths like this:
> >
> > hash_vacuum_medium         base= 33747.2ms  patch= 27379.7ms   1.23x  ( 18.9%)
> >                              read:  4662→1612 ops  8238.72→1725.86ms
> > (4.77x)    write:  12689→12651 ops  25146.51→25053.57ms  (1.00x)
> >
> > I think it is useful to keep the write-delay option even with this
> > reporting. Separating the read and write delays also helps reduce the
> > overall runtime of the tests, especially for large data sizes: we only
> > slow down the read path while keeping the write path fast.
>
> These write numbers are more in line with what I was seeing in my last
> tests, and I am not going to ignore a 20% runtime reduction.  The
> stats numbers are looking nice, as well.
>
> At the end, I have checked the logic of what you are introducing,
> where things come down to the point of making sure that the data
> pushed to the callback remains consistent with the paths where
> _hash_getcachedmetap() is called.  I have also cross-checked the data
> reports by pgstattuple for the relation and the index, to see that
> these are consistent between HEAD and the patch, just in case.  And
> applied it.  That was the last piece.
> --
> Michael

Thanks for double-checking and pushing it.

-- 
Best,
Xuneng





^ permalink  raw  reply  [nested|flat] 35+ messages in thread


end of thread, other threads:[~2026-03-16 03:45 UTC | newest]

Thread overview: 35+ messages (download: mbox mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2025-12-25 05:51 Streamify more code paths Xuneng Zhou <[email protected]>
2025-12-25 06:33 ` Xuneng Zhou <[email protected]>
2025-12-26 10:59   ` Nazir Bilal Yavuz <[email protected]>
2025-12-26 16:41     ` Xuneng Zhou <[email protected]>
2025-12-28 11:41       ` Xuneng Zhou <[email protected]>
2025-12-28 11:45         ` Xuneng Zhou <[email protected]>
2025-12-29 10:58           ` Nazir Bilal Yavuz <[email protected]>
2025-12-30 01:51             ` Xuneng Zhou <[email protected]>
2025-12-30 02:43               ` Xuneng Zhou <[email protected]>
2026-03-10 06:06                 ` Xuneng Zhou <[email protected]>
2026-03-10 10:28                   ` Michael Paquier <[email protected]>
2026-03-10 13:23                     ` Xuneng Zhou <[email protected]>
2026-03-11 00:16                       ` Andres Freund <[email protected]>
2026-03-11 02:23                         ` Xuneng Zhou <[email protected]>
2026-03-11 15:00                           ` Xuneng Zhou <[email protected]>
2026-03-16 03:45                             ` Xuneng Zhou <[email protected]>
2026-03-11 02:13                       ` Xuneng Zhou <[email protected]>
2026-03-11 07:53                       ` Nazir Bilal Yavuz <[email protected]>
2026-03-12 03:27                         ` Xuneng Zhou <[email protected]>
2026-03-10 23:04                     ` Andres Freund <[email protected]>
2026-03-10 23:29                       ` Michael Paquier <[email protected]>
2026-03-11 02:22                         ` Xuneng Zhou <[email protected]>
2026-03-11 01:37                       ` Xuneng Zhou <[email protected]>
2026-03-11 15:11                         ` Xuneng Zhou <[email protected]>
2026-03-11 21:33                           ` Michael Paquier <[email protected]>
2026-03-12 03:42                             ` Michael Paquier <[email protected]>
2026-03-12 04:39                               ` Xuneng Zhou <[email protected]>
2026-03-12 15:35                                 ` Xuneng Zhou <[email protected]>
2026-03-13 01:49                                   ` Michael Paquier <[email protected]>
2026-03-13 02:39                                     ` Xuneng Zhou <[email protected]>
2026-03-14 09:56                                       ` Michael Paquier <[email protected]>
2026-03-15 02:51                                         ` Xuneng Zhou <[email protected]>
2026-03-15 03:47                                           ` Xuneng Zhou <[email protected]>
2026-03-16 01:00                                             ` Michael Paquier <[email protected]>
2026-03-16 03:05                                               ` Xuneng Zhou <[email protected]>

This inbox is served by agora; see mirroring instructions
for how to clone and mirror all data and code used for this inbox