From dd36f2194986e5366e1f0800eff6b8a61611dd0f Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Thu, 24 Jul 2025 13:00:24 +0200
Subject: [PATCH v1] pgstattuple: analyze TIDs on btree leaf pages

---
 contrib/pgstattuple/Makefile                  |   3 +-
 contrib/pgstattuple/pgstatindex.c             | 447 ++++++++++++++++++
 contrib/pgstattuple/pgstattuple--1.5--1.6.sql |  30 ++
 contrib/pgstattuple/pgstattuple.control       |   2 +-
 4 files changed, 480 insertions(+), 2 deletions(-)
 create mode 100644 contrib/pgstattuple/pgstattuple--1.5--1.6.sql

diff --git a/contrib/pgstattuple/Makefile b/contrib/pgstattuple/Makefile
index c5b17fc703e..d5c62ba36f9 100644
--- a/contrib/pgstattuple/Makefile
+++ b/contrib/pgstattuple/Makefile
@@ -10,7 +10,8 @@ OBJS = \
 EXTENSION = pgstattuple
 DATA = pgstattuple--1.4.sql pgstattuple--1.4--1.5.sql \
 	pgstattuple--1.3--1.4.sql pgstattuple--1.2--1.3.sql \
-	pgstattuple--1.1--1.2.sql pgstattuple--1.0--1.1.sql
+	pgstattuple--1.1--1.2.sql pgstattuple--1.0--1.1.sql \
+	pgstattuple--1.5--1.6.sql
 PGFILEDESC = "pgstattuple - tuple-level statistics"
 
 REGRESS = pgstattuple
diff --git a/contrib/pgstattuple/pgstatindex.c b/contrib/pgstattuple/pgstatindex.c
index 4b9d76ec4e4..aa4431075b0 100644
--- a/contrib/pgstattuple/pgstatindex.c
+++ b/contrib/pgstattuple/pgstatindex.c
@@ -62,6 +62,9 @@ PG_FUNCTION_INFO_V1(pg_relpages_v1_5);
 PG_FUNCTION_INFO_V1(pg_relpagesbyid_v1_5);
 PG_FUNCTION_INFO_V1(pgstatginindex_v1_5);
 
+PG_FUNCTION_INFO_V1(pgstatindex_nheap_v1_6);
+PG_FUNCTION_INFO_V1(pgstatindex_runs_v1_6);
+
 Datum		pgstatginindex_internal(Oid relid, FunctionCallInfo fcinfo);
 
 #define IS_INDEX(r) ((r)->rd_rel->relkind == RELKIND_INDEX)
@@ -128,6 +131,9 @@ static Datum pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo);
 static int64 pg_relpages_impl(Relation rel);
 static void GetHashPageStats(Page page, HashIndexStat *stats);
 
+static Datum pgstatindex_nheap_impl(Relation rel, int64 nblocks, FunctionCallInfo fcinfo);
+static Datum pgstatindex_runs_impl(Relation rel, int64 nblocks, FunctionCallInfo fcinfo);
+
 /* ------------------------------------------------------
  * pgstatindex()
  *
@@ -756,3 +762,444 @@ GetHashPageStats(Page page, HashIndexStat *stats)
 	}
 	stats->free_space += PageGetExactFreeSpace(page);
 }
+
+/*
+ */
+Datum
+pgstatindex_nheap_v1_6(PG_FUNCTION_ARGS)
+{
+	text	   *relname = PG_GETARG_TEXT_PP(0);
+	int64		nblocks = PG_GETARG_INT64(1);
+	Relation	rel;
+	RangeVar   *relrv;
+
+	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
+	rel = relation_openrv(relrv, AccessShareLock);
+
+	PG_RETURN_DATUM(pgstatindex_nheap_impl(rel, nblocks, fcinfo));
+}
+
+/*
+ */
+Datum
+pgstatindex_runs_v1_6(PG_FUNCTION_ARGS)
+{
+	text	   *relname = PG_GETARG_TEXT_PP(0);
+	int64		nblocks = PG_GETARG_INT64(1);
+	Relation	rel;
+	RangeVar   *relrv;
+
+	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
+	rel = relation_openrv(relrv, AccessShareLock);
+
+	PG_RETURN_DATUM(pgstatindex_runs_impl(rel, nblocks, fcinfo));
+}
+
+static int
+count_block_runs(int range_nblocks, BlockNumber *range_blocks)
+{
+	int			nruns = 1;
+
+	for (int i = 1; i < range_nblocks; i++)
+	{
+		if (range_blocks[i] != range_blocks[i-1])
+			nruns++;
+	}
+
+	return nruns;
+}
+
+static int
+blocknum_cmp(const void *a, const void *b)
+{
+	return memcmp(a, b, sizeof(BlockNumber));
+}
+
+static int
+count_blocks_distinct(int range_nblocks, BlockNumber *range_blocks)
+{
+	pg_qsort(range_blocks, range_nblocks, sizeof(BlockNumber), blocknum_cmp);
+
+	return count_block_runs(range_nblocks, range_blocks);
+}
+
+static Datum
+pgstatindex_nheap_impl(Relation rel, int64 range_len, FunctionCallInfo fcinfo)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	BlockNumber nblocks;
+	BlockNumber blkno;
+	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
+
+	BlockNumber range_start = 0;
+	BlockNumber	*range_blocks = NULL;
+	int			range_nblocks;
+	int			range_leafs;
+	TupleDesc	tupdesc;
+	Tuplestorestate *tupstore;
+
+	Datum		values[5];
+	bool		nulls[5];
+
+	/* no NULLs */
+	memset(nulls, 0, sizeof(nulls));
+
+	if (!IS_INDEX(rel) || !IS_BTREE(rel))
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("relation \"%s\" is not a btree index",
+						RelationGetRelationName(rel))));
+
+	/*
+	 * Reject attempts to read non-local temporary relations; we would be
+	 * likely to get wrong data since we have no visibility into the owning
+	 * session's local buffers.
+	 */
+	if (RELATION_IS_OTHER_TEMP(rel))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("cannot access temporary tables of other sessions")));
+
+	/*
+	 * A !indisready index could lead to ERRCODE_DATA_CORRUPTED later, so exit
+	 * early.  We're capable of assessing an indisready&&!indisvalid index,
+	 * but the results could be confusing.  For example, the index's size
+	 * could be too low for a valid index of the table.
+	 */
+	if (!rel->rd_index->indisvalid)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("index \"%s\" is not valid",
+						RelationGetRelationName(rel))));
+
+	InitMaterializedSRF(fcinfo, 0);
+	tupdesc = rsinfo->setDesc;
+	tupstore = rsinfo->setResult;
+
+	/* pre-allocate space for the maximum number of TIDs we might see
+	 * on range_len pages */
+	range_nblocks = 0;
+	range_blocks = palloc_array(BlockNumber, range_len * MaxTIDsPerBTreePage);
+
+	/*
+	 * Scan all blocks except the metapage
+	 */
+	nblocks = RelationGetNumberOfBlocks(rel);
+
+	for (blkno = 1; blkno < nblocks; blkno++)
+	{
+		Buffer		buffer;
+		Page		page;
+		BTPageOpaque opaque;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * If we started a new range, count the distinct blocks and add
+		 * a tuple into the result set (if there are any TIDs).
+		 */
+		if (range_start + range_len <= blkno)
+		{
+			if (range_nblocks > 0)
+			{
+				HeapTuple	tuple;
+				int			ndistinct;
+				int			nruns;
+
+				nruns = count_block_runs(range_nblocks, range_blocks);
+
+				/* this modifies the array, so do once */
+				ndistinct = count_blocks_distinct(range_nblocks, range_blocks);
+
+				values[0] = Int64GetDatum(range_start);
+				values[1] = Int64GetDatum(range_leafs);
+				values[2] = Int64GetDatum(range_nblocks);
+				values[3] = Int64GetDatum(ndistinct);
+				values[4] = Int64GetDatum(nruns);
+
+				/* Build and return the result tuple */
+				tuple = heap_form_tuple(tupdesc, values, nulls);
+
+				tuplestore_puttuple(tupstore, tuple);
+
+				range_nblocks = 0;
+				range_leafs = 0;
+			}
+
+			while (range_start + range_len <= blkno)
+				range_start += range_len;
+		}
+
+		/* Read and lock buffer */
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
+		LockBuffer(buffer, BUFFER_LOCK_SHARE);
+
+		page = BufferGetPage(buffer);
+		opaque = BTPageGetOpaque(page);
+
+		/*
+		 * Ignore deleted/dead pages, and internal (non-leaf) pages.
+		 */
+		if (!P_ISDELETED(opaque) && !P_IGNORE(opaque) && P_ISLEAF(opaque))
+		{
+			OffsetNumber	offset = FirstOffsetNumber;
+			OffsetNumber	maxoffset = PageGetMaxOffsetNumber(page);
+			bool			rightmost = P_RIGHTMOST(opaque);
+
+			while (offset < maxoffset)
+			{
+				ItemId		id = PageGetItemId(page, offset);
+				IndexTuple	itup = (IndexTuple) PageGetItem(page, id);
+				BlockNumber	block = ItemPointerGetBlockNumber(&itup->t_tid);
+				bool		ispivot = (!rightmost && offset == P_HIKEY);
+
+				Assert(range_nblocks >= 0);
+				Assert(range_nblocks < range_len * MaxTIDsPerBTreePage);
+
+				/* ignore pivot tuples */
+				if (!ispivot)
+					range_blocks[range_nblocks++] = block;
+
+				offset++;
+			}
+
+			range_leafs++;
+		}
+
+		/* Unlock and release buffer */
+		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+		ReleaseBuffer(buffer);
+	}
+
+	/* output the last range */
+	if (range_nblocks > 0)
+	{
+		HeapTuple	tuple;
+		int			ndistinct;
+		int			nruns;
+	
+		nruns = count_block_runs(range_nblocks, range_blocks);
+	
+		/* this modifies the array, so do once */
+		ndistinct = count_blocks_distinct(range_nblocks, range_blocks);
+	
+		values[0] = Int64GetDatum(range_start);
+		values[1] = Int64GetDatum(range_leafs);
+		values[2] = Int64GetDatum(range_nblocks);
+		values[3] = Int64GetDatum(ndistinct);
+		values[4] = Int64GetDatum(nruns);
+	
+		/* Build and return the result tuple */
+		tuple = heap_form_tuple(tupdesc, values, nulls);
+	
+		tuplestore_puttuple(tupstore, tuple);
+	
+		range_nblocks = 0;
+		range_leafs = 0;
+	}
+
+	relation_close(rel, AccessShareLock);
+
+	PG_RETURN_NULL();
+}
+
+static void
+count_run_lengths(int range_nblocks, BlockNumber *range_blocks, int *lengths)
+{
+	int			len = 1;
+	BlockNumber	curr = range_blocks[0];
+
+	for (int i = 0; i < range_nblocks; i++)
+	{
+		if (range_blocks[i] != curr)
+		{
+			lengths[len]++;
+			len = 1;
+			curr = range_blocks[i];
+			continue;
+		}
+
+		len++;
+	}
+
+	lengths[len]++;
+}
+
+static Datum
+pgstatindex_runs_impl(Relation rel, int64 range_len, FunctionCallInfo fcinfo)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	BlockNumber nblocks;
+	BlockNumber blkno;
+	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
+
+	BlockNumber range_start = 0;
+	BlockNumber	*range_blocks = NULL;
+	int			*run_lengths = NULL;
+	int			range_nblocks;
+	TupleDesc	tupdesc;
+	Tuplestorestate *tupstore;
+
+	Datum		values[3];
+	bool		nulls[3];
+
+	/* no NULLs */
+	memset(nulls, 0, sizeof(nulls));
+
+	if (!IS_INDEX(rel) || !IS_BTREE(rel))
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("relation \"%s\" is not a btree index",
+						RelationGetRelationName(rel))));
+
+	/*
+	 * Reject attempts to read non-local temporary relations; we would be
+	 * likely to get wrong data since we have no visibility into the owning
+	 * session's local buffers.
+	 */
+	if (RELATION_IS_OTHER_TEMP(rel))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("cannot access temporary tables of other sessions")));
+
+	/*
+	 * A !indisready index could lead to ERRCODE_DATA_CORRUPTED later, so exit
+	 * early.  We're capable of assessing an indisready&&!indisvalid index,
+	 * but the results could be confusing.  For example, the index's size
+	 * could be too low for a valid index of the table.
+	 */
+	if (!rel->rd_index->indisvalid)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("index \"%s\" is not valid",
+						RelationGetRelationName(rel))));
+
+	InitMaterializedSRF(fcinfo, 0);
+	tupdesc = rsinfo->setDesc;
+	tupstore = rsinfo->setResult;
+
+	/* pre-allocate space for the maximum number of TIDs we might see
+	 * on range_len pages */
+	range_nblocks = 0;
+	range_blocks = palloc_array(BlockNumber, range_len * MaxTIDsPerBTreePage);
+
+	/* lengths of runs (the range could be one long run) */
+	run_lengths = palloc_array(int, (range_len * MaxTIDsPerBTreePage + 1));
+
+	/*
+	 * Scan all blocks except the metapage
+	 */
+	nblocks = RelationGetNumberOfBlocks(rel);
+
+	for (blkno = 1; blkno < nblocks; blkno++)
+	{
+		Buffer		buffer;
+		Page		page;
+		BTPageOpaque opaque;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * If we started a new range, count the distinct blocks and add
+		 * a tuple into the result set (if there are any TIDs).
+		 */
+		if (range_start + range_len <= blkno)
+		{
+			if (range_nblocks > 0)
+			{
+				HeapTuple	tuple;
+
+				memset(run_lengths, 0, sizeof(int) * (range_len * MaxTIDsPerBTreePage + 1));
+				count_run_lengths(range_nblocks, range_blocks, run_lengths);
+
+				for (int i = 1; i <= (range_len * MaxTIDsPerBTreePage); i++)
+				{
+					if (run_lengths[i] > 0)
+					{
+						values[0] = Int64GetDatum(range_start);
+						values[1] = Int32GetDatum(i);
+						values[2] = Int32GetDatum(run_lengths[i]);
+
+						/* Build and return the result tuple */
+						tuple = heap_form_tuple(tupdesc, values, nulls);
+
+						tuplestore_puttuple(tupstore, tuple);
+					}
+				}
+
+				range_nblocks = 0;
+			}
+
+			while (range_start + range_len <= blkno)
+				range_start += range_len;
+		}
+
+		/* Read and lock buffer */
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
+		LockBuffer(buffer, BUFFER_LOCK_SHARE);
+
+		page = BufferGetPage(buffer);
+		opaque = BTPageGetOpaque(page);
+
+		/*
+		 * Ignore deleted/dead pages, and internal (non-leaf) pages.
+		 */
+		if (!P_ISDELETED(opaque) && !P_IGNORE(opaque) && P_ISLEAF(opaque))
+		{
+			OffsetNumber	offset = FirstOffsetNumber;
+			OffsetNumber	maxoffset = PageGetMaxOffsetNumber(page);
+			bool			rightmost = P_RIGHTMOST(opaque);
+
+			while (offset < maxoffset)
+			{
+				ItemId		id = PageGetItemId(page, offset);
+				IndexTuple	itup = (IndexTuple) PageGetItem(page, id);
+				BlockNumber	block = ItemPointerGetBlockNumber(&itup->t_tid);
+				bool		ispivot = (!rightmost && offset == P_HIKEY);
+
+				Assert(range_nblocks >= 0);
+				Assert(range_nblocks < range_len * MaxTIDsPerBTreePage);
+
+				/* ignore pivot tuples */
+				if (!ispivot)
+					range_blocks[range_nblocks++] = block;
+
+				offset++;
+			}
+		}
+
+		/* Unlock and release buffer */
+		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+		ReleaseBuffer(buffer);
+	}
+
+	/* output the last range */
+	if (range_nblocks > 0)
+	{
+		HeapTuple	tuple;
+
+		memset(run_lengths, 0, sizeof(int) * (range_len * MaxTIDsPerBTreePage + 1));
+		count_run_lengths(range_nblocks, range_blocks, run_lengths);
+
+		for (int i = 1; i <= (range_len * MaxTIDsPerBTreePage); i++)
+		{
+			if (run_lengths[i] > 0)
+			{
+				values[0] = Int64GetDatum(range_start);
+				values[1] = Int32GetDatum(i);
+				values[2] = Int32GetDatum(run_lengths[i]);
+
+				/* Build and return the result tuple */
+				tuple = heap_form_tuple(tupdesc, values, nulls);
+
+				tuplestore_puttuple(tupstore, tuple);
+			}
+		}
+
+		range_nblocks = 0;
+	}
+
+	relation_close(rel, AccessShareLock);
+
+	PG_RETURN_NULL();
+}
diff --git a/contrib/pgstattuple/pgstattuple--1.5--1.6.sql b/contrib/pgstattuple/pgstattuple--1.5--1.6.sql
new file mode 100644
index 00000000000..c6a450a301b
--- /dev/null
+++ b/contrib/pgstattuple/pgstattuple--1.5--1.6.sql
@@ -0,0 +1,30 @@
+/* contrib/pgstattuple/pgstattuple--1.5--1.6.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pgstattuple UPDATE TO '1.6'" to load this file. \quit
+
+CREATE OR REPLACE FUNCTION pgstatindex_nheap(IN relname text,
+    IN blocks BIGINT,			-- length of ranges to analyze
+    OUT block BIGINT,			-- first block of a range
+    OUT num_leafs BIGINT,		-- number of leaf pages
+    OUT num_items BIGINT,		-- number of heap TIDs
+    OUT num_blocks BIGINT,		-- number of distinct blocks
+    OUT num_runs BIGINT)		-- number of continuous runs
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'pgstatindex_nheap_v1_6'
+LANGUAGE C STRICT PARALLEL SAFE;
+
+REVOKE EXECUTE ON FUNCTION pgstatindex_nheap(text, bigint) FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pgstatindex_nheap(text, bigint) TO pg_stat_scan_tables;
+
+CREATE OR REPLACE FUNCTION pgstatindex_runs(IN relname text,
+    IN blocks BIGINT,			-- length of ranges to analyze
+    OUT block BIGINT,			-- first block of a range
+    OUT run_length INT,			-- number of leaf pages
+    OUT run_count INT)		-- number of heap TIDs
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'pgstatindex_runs_v1_6'
+LANGUAGE C STRICT PARALLEL SAFE;
+
+REVOKE EXECUTE ON FUNCTION pgstatindex_runs(text, bigint) FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pgstatindex_runs(text, bigint) TO pg_stat_scan_tables;
diff --git a/contrib/pgstattuple/pgstattuple.control b/contrib/pgstattuple/pgstattuple.control
index 6af40757b27..80d06958e90 100644
--- a/contrib/pgstattuple/pgstattuple.control
+++ b/contrib/pgstattuple/pgstattuple.control
@@ -1,5 +1,5 @@
 # pgstattuple extension
 comment = 'show tuple-level statistics'
-default_version = '1.5'
+default_version = '1.6'
 module_pathname = '$libdir/pgstattuple'
 relocatable = true
-- 
2.50.1

