From 352e4756fb7641a0142a7e2a1a0826d81427b935 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 3 Dec 2025 15:24:08 -0500
Subject: [PATCH v24 15/16] Allow on-access pruning to set pages all-visible

Many queries do not modify the underlying relation. For such queries, if
on-access pruning occurs during the scan, we can check whether the page
has become all-visible and update the visibility map accordingly.
Previously, only vacuum and COPY FREEZE marked pages as all-visible or
all-frozen.

This commit implements on-access VM setting for sequential scans as well
as for the underlying heap relation in index scans and bitmap heap
scans.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Discussion: https://postgr.es/m/flat/CAAKRu_ZMw6Npd_qm2KM%2BFwQ3cMOMx1Dh3VMhp8-V7SOLxdK9-g%40mail.gmail.com
---
 src/backend/access/heap/heapam.c              | 15 +++-
 src/backend/access/heap/heapam_handler.c      | 15 +++-
 src/backend/access/heap/pruneheap.c           | 74 ++++++++++++++++---
 src/include/access/heapam.h                   | 24 +++++-
 .../t/035_standby_logical_decoding.pl         |  3 +-
 5 files changed, 114 insertions(+), 17 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 3ad78ba4694..ecc04390ac7 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -570,6 +570,7 @@ heap_prepare_pagescan(TableScanDesc sscan)
 	Buffer		buffer = scan->rs_cbuf;
 	BlockNumber block = scan->rs_cblock;
 	Snapshot	snapshot;
+	Buffer	   *vmbuffer = NULL;
 	Page		page;
 	int			lines;
 	bool		all_visible;
@@ -584,7 +585,9 @@ heap_prepare_pagescan(TableScanDesc sscan)
 	/*
 	 * Prune and repair fragmentation for the whole page, if possible.
 	 */
-	heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
+	if (sscan->rs_flags & SO_HINT_REL_READ_ONLY)
+		vmbuffer = &scan->rs_vmbuffer;
+	heap_page_prune_opt(scan->rs_base.rs_rd, buffer, vmbuffer);
 
 	/*
 	 * We must hold share lock on the buffer content while examining tuple
@@ -1261,6 +1264,7 @@ heap_beginscan(Relation relation, Snapshot snapshot,
 														  sizeof(TBMIterateResult));
 	}
 
+	scan->rs_vmbuffer = InvalidBuffer;
 
 	return (TableScanDesc) scan;
 }
@@ -1299,6 +1303,12 @@ heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
 		scan->rs_cbuf = InvalidBuffer;
 	}
 
+	if (BufferIsValid(scan->rs_vmbuffer))
+	{
+		ReleaseBuffer(scan->rs_vmbuffer);
+		scan->rs_vmbuffer = InvalidBuffer;
+	}
+
 	/*
 	 * SO_TYPE_BITMAPSCAN would be cleaned up here, but it does not hold any
 	 * additional data vs a normal HeapScan
@@ -1331,6 +1341,9 @@ heap_endscan(TableScanDesc sscan)
 	if (BufferIsValid(scan->rs_cbuf))
 		ReleaseBuffer(scan->rs_cbuf);
 
+	if (BufferIsValid(scan->rs_vmbuffer))
+		ReleaseBuffer(scan->rs_vmbuffer);
+
 	/*
 	 * Must free the read stream before freeing the BufferAccessStrategy.
 	 */
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index d7fac94826d..27e3498f5f4 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -85,6 +85,7 @@ heapam_index_fetch_begin(Relation rel, uint32 flags)
 
 	hscan->xs_base.rel = rel;
 	hscan->xs_cbuf = InvalidBuffer;
+	hscan->xs_vmbuffer = InvalidBuffer;
 	hscan->modifies_base_rel = !(flags & SO_HINT_REL_READ_ONLY);
 
 	return &hscan->xs_base;
@@ -100,6 +101,12 @@ heapam_index_fetch_reset(IndexFetchTableData *scan)
 		ReleaseBuffer(hscan->xs_cbuf);
 		hscan->xs_cbuf = InvalidBuffer;
 	}
+
+	if (BufferIsValid(hscan->xs_vmbuffer))
+	{
+		ReleaseBuffer(hscan->xs_vmbuffer);
+		hscan->xs_vmbuffer = InvalidBuffer;
+	}
 }
 
 static void
@@ -139,7 +146,8 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
 		 * Prune page, but only if we weren't already on this page
 		 */
 		if (prev_buf != hscan->xs_cbuf)
-			heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf);
+			heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf,
+								hscan->modifies_base_rel ? NULL : &hscan->xs_vmbuffer);
 	}
 
 	/* Obtain share-lock on the buffer so we can examine visibility */
@@ -2472,6 +2480,7 @@ BitmapHeapScanNextBlock(TableScanDesc scan,
 	TBMIterateResult *tbmres;
 	OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE];
 	int			noffsets = -1;
+	Buffer	   *vmbuffer = NULL;
 
 	Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN);
 	Assert(hscan->rs_read_stream);
@@ -2518,7 +2527,9 @@ BitmapHeapScanNextBlock(TableScanDesc scan,
 	/*
 	 * Prune and repair fragmentation for the whole page, if possible.
 	 */
-	heap_page_prune_opt(scan->rs_rd, buffer);
+	if (scan->rs_flags & SO_HINT_REL_READ_ONLY)
+		vmbuffer = &hscan->rs_vmbuffer;
+	heap_page_prune_opt(scan->rs_rd, buffer, vmbuffer);
 
 	/*
 	 * We must hold share lock on the buffer content while examining tuple
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 4ed2eff5e05..15239e0cbbd 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -205,7 +205,9 @@ static bool heap_page_will_set_vm(Relation relation,
 								  Buffer heap_buf,
 								  Buffer vmbuffer,
 								  bool blk_known_av,
-								  const PruneState *prstate,
+								  PruneReason reason,
+								  bool do_prune, bool do_freeze,
+								  PruneState *prstate,
 								  uint8 *new_vmbits);
 
 
@@ -220,9 +222,13 @@ static bool heap_page_will_set_vm(Relation relation,
  * if there's not any use in pruning.
  *
  * Caller must have pin on the buffer, and must *not* have a lock on it.
+ *
+ * If vmbuffer is not NULL, it is okay for pruning to set the visibility map if
+ * the page is all-visible. We will take care of pinning and, if needed,
+ * reading in the page of the visibility map.
  */
 void
-heap_page_prune_opt(Relation relation, Buffer buffer)
+heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer)
 {
 	Page		page = BufferGetPage(buffer);
 	TransactionId prune_xid;
@@ -304,6 +310,13 @@ heap_page_prune_opt(Relation relation, Buffer buffer)
 				.cutoffs = NULL,
 			};
 
+			if (vmbuffer)
+			{
+				visibilitymap_pin(relation, BufferGetBlockNumber(buffer), vmbuffer);
+				params.options |= HEAP_PAGE_PRUNE_UPDATE_VM;
+				params.vmbuffer = *vmbuffer;
+			}
+
 			heap_page_prune_and_freeze(&params, &presult, &dummy_off_loc,
 									   NULL, NULL);
 
@@ -862,6 +875,9 @@ get_conflict_xid(bool do_prune, bool do_freeze, bool do_set_vm, uint8 new_vmbits
  * corrupted, it will fix them by clearing the VM bits and page visibility
  * hint. This does not need to be done in a critical section.
  *
+ * This should be called only after do_freeze has been decided (and do_prune
+ * has been set), as these factor into our heuristic-based decision.
+ *
  * Returns true if one or both VM bits should be set, along with returning the
  * desired what bits should be set in the VM in *new_vmbits.
  */
@@ -871,7 +887,9 @@ heap_page_will_set_vm(Relation relation,
 					  Buffer heap_buf,
 					  Buffer vmbuffer,
 					  bool blk_known_av,
-					  const PruneState *prstate,
+					  PruneReason reason,
+					  bool do_prune, bool do_freeze,
+					  PruneState *prstate,
 					  uint8 *new_vmbits)
 {
 	Page		heap_page = BufferGetPage(heap_buf);
@@ -884,6 +902,24 @@ heap_page_will_set_vm(Relation relation,
 		return false;
 	}
 
+	/*
+	 * If this is an on-access call and we're not actually pruning, avoid
+	 * setting the visibility map if it would newly dirty the heap page or, if
+	 * the page is already dirty, if doing so would require including a
+	 * full-page image (FPI) of the heap page in the WAL. This situation
+	 * should be rare, as on-access pruning is only attempted when
+	 * pd_prune_xid is valid.
+	 */
+	if (reason == PRUNE_ON_ACCESS &&
+		prstate->all_visible &&
+		!do_prune && !do_freeze &&
+		(!BufferIsDirty(heap_buf) || XLogCheckBufferNeedsBackup(heap_buf)))
+	{
+		prstate->all_visible = false;
+		prstate->all_frozen = false;
+		return false;
+	}
+
 	/*
 	 * Determine what the visibility map bits should be set to using the
 	 * values of all_visible and all_frozen determined during
@@ -906,14 +942,15 @@ heap_page_will_set_vm(Relation relation,
 	 * WAL-logged.
 	 *
 	 * As of PostgreSQL 9.2, the visibility map bit should never be set if the
-	 * page-level bit is clear.  However, it's possible that the bit got
-	 * cleared after heap_vac_scan_next_block() was called, so we must recheck
-	 * with buffer lock before concluding that the VM is corrupt.
+	 * page-level bit is clear.  However, it's possible that in vacuum the bit
+	 * got cleared after heap_vac_scan_next_block() was called, so we must
+	 * recheck with buffer lock before concluding that the VM is corrupt.
 	 *
 	 * Callers which did not check the visibility map and determine
 	 * blk_known_av will not be eligible for this, however the cost of
 	 * potentially needing to read the visibility map for pages that are not
-	 * all-visible is too high to justify generalizing the check.
+	 * all-visible is too high to justify generalizing the check. A future
+	 * vacuum will have to take care of fixing the corruption.
 	 */
 	else if (blk_known_av && !PageIsAllVisible(heap_page) &&
 			 visibilitymap_get_status(relation, heap_blk, &vmbuffer) != 0)
@@ -1129,6 +1166,7 @@ heap_page_prune_and_freeze(PruneFreezeParams *params,
 	 */
 	do_set_vm = heap_page_will_set_vm(params->relation,
 									  blockno, buffer, vmbuffer, params->blk_known_av,
+									  params->reason, do_prune, do_freeze,
 									  &prstate, &new_vmbits);
 
 	/*
@@ -1195,15 +1233,31 @@ heap_page_prune_and_freeze(PruneFreezeParams *params,
 			old_vmbits = visibilitymap_set(blockno,
 										   vmbuffer, new_vmbits,
 										   params->relation->rd_locator);
-			Assert(old_vmbits != new_vmbits);
+
+			/*
+			 * If on-access pruning set the VM in between when vacuum first
+			 * checked the visibility map and determined blk_known_av and when
+			 * we actually prune the page, we could end up trying to set the
+			 * VM only to find it is already set.
+			 */
+			if (old_vmbits == new_vmbits)
+			{
+				LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
+				/* Unset so we don't emit WAL since no change occured */
+				do_set_vm = false;
+			}
 		}
 
 		MarkBufferDirty(buffer);
 
 		/*
-		 * Emit a WAL XLOG_HEAP2_PRUNE* record showing what we did
+		 * Emit a WAL XLOG_HEAP2_PRUNE* record showing what we did. If we were
+		 * only planning to update the VM, and it turns out that it was
+		 * already set, there is no need to emit WAL. As such, we must check
+		 * that some change is required again.
 		 */
-		if (RelationNeedsWAL(params->relation))
+		if (RelationNeedsWAL(params->relation) &&
+			(do_prune || do_freeze || do_set_vm))
 		{
 			log_heap_prune_and_freeze(params->relation, buffer,
 									  do_set_vm ? vmbuffer : InvalidBuffer,
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 480a1bd654f..89538652566 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -95,6 +95,13 @@ typedef struct HeapScanDescData
 	 */
 	ParallelBlockTableScanWorkerData *rs_parallelworkerdata;
 
+	/*
+	 * For sequential scans and bitmap heap scans. If the relation is not
+	 * being modified, on-access pruning may read in the current heap page's
+	 * corresponding VM block to this buffer.
+	 */
+	Buffer		rs_vmbuffer;
+
 	/* these fields only used in page-at-a-time mode and for bitmap scans */
 	uint32		rs_cindex;		/* current tuple's index in vistuples */
 	uint32		rs_ntuples;		/* number of visible tuples on page */
@@ -117,8 +124,18 @@ typedef struct IndexFetchHeapData
 {
 	IndexFetchTableData xs_base;	/* AM independent part of the descriptor */
 
-	Buffer		xs_cbuf;		/* current heap buffer in scan, if any */
-	/* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
+	/*
+	 * Current heap buffer in scan, if any. NB: if xs_cbuf is not
+	 * InvalidBuffer, we hold a pin on that buffer.
+	 */
+	Buffer		xs_cbuf;
+
+	/*
+	 * For index scans that do not modify the underlying heap table, on-access
+	 * pruning may read in the current heap page's corresponding VM block to
+	 * this buffer.
+	 */
+	Buffer		xs_vmbuffer;
 
 	/*
 	 * Some optimizations can only be performed if the query does not modify
@@ -425,7 +442,8 @@ extern TransactionId heap_index_delete_tuples(Relation rel,
 											  TM_IndexDeleteOp *delstate);
 
 /* in heap/pruneheap.c */
-extern void heap_page_prune_opt(Relation relation, Buffer buffer);
+extern void heap_page_prune_opt(Relation relation, Buffer buffer,
+								Buffer *vmbuffer);
 extern void heap_page_prune_and_freeze(PruneFreezeParams *params,
 									   PruneFreezeResult *presult,
 									   OffsetNumber *off_loc,
diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl
index ebe2fae1789..bdd9f0a62cd 100644
--- a/src/test/recovery/t/035_standby_logical_decoding.pl
+++ b/src/test/recovery/t/035_standby_logical_decoding.pl
@@ -296,6 +296,7 @@ wal_level = 'logical'
 max_replication_slots = 4
 max_wal_senders = 4
 autovacuum = off
+hot_standby_feedback = on
 });
 $node_primary->dump_info;
 $node_primary->start;
@@ -748,7 +749,7 @@ check_pg_recvlogical_stderr($handle,
 $logstart = -s $node_standby->logfile;
 
 reactive_slots_change_hfs_and_wait_for_xmins('shared_row_removal_',
-	'no_conflict_', 0, 1);
+	'no_conflict_', 1, 0);
 
 # This should not trigger a conflict
 wait_until_vacuum_can_remove(
-- 
2.43.0

