From 0ca92d2ccee0e589a35a79f9046c3a7900ecacf4 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 25 Feb 2026 16:23:09 -0500
Subject: [PATCH v38 01/12] Fix visibility map corruption in more cases

Move VM corruption detection and repair into pruning. This allows VM
repair during on-access pruning, not only during vacuum.

Also, expand corruption detection to cover pages marked all-visible that
contain dead tuples and tuples inserted or updated by in-progress
transactions, rather than only all-visible pages with LP_DEAD items.

Pinning the correct VM page before on-access pruning is cheap when
compared to the cost of actually pruning. The vmbuffer is saved in the
scan descriptor, so a query should only need to pin each VM page once
and a single VM page covers a large number of heap pages.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
---
 src/backend/access/heap/pruneheap.c  | 176 ++++++++++++++++++++++++---
 src/backend/access/heap/vacuumlazy.c |  89 +-------------
 src/include/access/heapam.h          |  12 ++
 3 files changed, 175 insertions(+), 102 deletions(-)

diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 8d9f0694206..52cafb23c6b 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -19,7 +19,7 @@
 #include "access/htup_details.h"
 #include "access/multixact.h"
 #include "access/transam.h"
-#include "access/visibilitymapdefs.h"
+#include "access/visibilitymap.h"
 #include "access/xlog.h"
 #include "access/xloginsert.h"
 #include "commands/vacuum.h"
@@ -114,6 +114,21 @@ typedef struct
 	 */
 	HeapPageFreeze pagefrz;
 
+	/*-------------------------------------------------------
+	 * Working state for visibility map processing
+	 *-------------------------------------------------------
+	 */
+
+	/*
+	 * Caller must provide a pinned vmbuffer corresponding to the heap block
+	 * passed to heap_page_prune_and_freeze(). We will fix any corruption
+	 * found in the VM.
+	 */
+	Buffer		vmbuffer;
+
+	/* Bits in the vmbuffer for this heap page */
+	uint8		vmbits;
+
 	/*-------------------------------------------------------
 	 * Information about what was done
 	 *
@@ -168,6 +183,7 @@ static void prune_freeze_setup(PruneFreezeParams *params,
 							   MultiXactId *new_relmin_mxid,
 							   PruneFreezeResult *presult,
 							   PruneState *prstate);
+static void heap_fix_vm_corruption(PruneState *prstate, OffsetNumber offnum);
 static void prune_freeze_plan(PruneState *prstate,
 							  OffsetNumber *off_loc);
 static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate,
@@ -175,7 +191,8 @@ static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate,
 static inline HTSV_Result htsv_get_valid_status(int status);
 static void heap_prune_chain(OffsetNumber maxoff,
 							 OffsetNumber rootoffnum, PruneState *prstate);
-static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
+static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid,
+									   OffsetNumber offnum);
 static void heap_prune_record_redirect(PruneState *prstate,
 									   OffsetNumber offnum, OffsetNumber rdoffnum,
 									   bool was_normal);
@@ -209,8 +226,9 @@ static bool heap_page_will_freeze(bool did_tuple_hint_fpi, bool do_prune, bool d
  * Caller must have pin on the buffer, and must *not* have a lock on it.
  *
  * This function may pin *vmbuffer. It's passed by reference so the caller can
- * reuse the pin across calls, avoiding repeated pin/unpin cycles. Caller is
- * responsible for unpinning it.
+ * reuse the pin across calls, avoiding repeated pin/unpin cycles. If we find
+ * VM corruption during pruning, we will fix it. Caller is responsible for
+ * unpinning *vmbuffer.
  */
 void
 heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer)
@@ -277,6 +295,16 @@ heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer)
 		{
 			OffsetNumber dummy_off_loc;
 			PruneFreezeResult presult;
+			PruneFreezeParams params;
+
+			visibilitymap_pin(relation, BufferGetBlockNumber(buffer), vmbuffer);
+
+			params.relation = relation;
+			params.buffer = buffer;
+			params.vmbuffer = *vmbuffer;
+			params.reason = PRUNE_ON_ACCESS;
+			params.vistest = vistest;
+			params.cutoffs = NULL;
 
 			/*
 			 * We don't pass the HEAP_PAGE_PRUNE_MARK_UNUSED_NOW option
@@ -284,14 +312,7 @@ heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer)
 			 * cannot safely determine that during on-access pruning with the
 			 * current implementation.
 			 */
-			PruneFreezeParams params = {
-				.relation = relation,
-				.buffer = buffer,
-				.reason = PRUNE_ON_ACCESS,
-				.options = 0,
-				.vistest = vistest,
-				.cutoffs = NULL,
-			};
+			params.options = 0;
 
 			heap_page_prune_and_freeze(&params, &presult, &dummy_off_loc,
 									   NULL, NULL);
@@ -354,6 +375,12 @@ prune_freeze_setup(PruneFreezeParams *params,
 	prstate->buffer = params->buffer;
 	prstate->page = BufferGetPage(params->buffer);
 
+	Assert(BufferIsValid(params->vmbuffer));
+	prstate->vmbuffer = params->vmbuffer;
+	prstate->vmbits = visibilitymap_get_status(prstate->relation,
+											   prstate->block,
+											   &prstate->vmbuffer);
+
 	/*
 	 * Our strategy is to scan the page and make lists of items to change,
 	 * then apply the changes within a critical section.  This keeps as much
@@ -770,6 +797,90 @@ heap_page_will_freeze(bool did_tuple_hint_fpi,
 	return do_freeze;
 }
 
+/*
+ * Helper to fix visibility-related corruption on a heap page and its
+ * corresponding VM page. An all-visible page cannot have dead items nor can
+ * it have tuples that are not visible to all running transactions. It clears
+ * the VM corruption as well as resetting the vmbits used during pruning.
+ *
+ * This function must be called while holding an exclusive lock on the heap
+ * buffer, and any dead items must have been discovered under that same lock.
+ * Although we do not hold a lock on the VM buffer, it is pinned, and the heap
+ * buffer is exclusively locked, ensuring that no other backend can update the
+ * VM bits corresponding to this heap page.
+ *
+ * This function makes changes to the VM and, potentially, the heap page, but
+ * it does not need to be done in a critical section.
+ */
+static void
+heap_fix_vm_corruption(PruneState *prstate, OffsetNumber offnum)
+{
+	const char *relname = RelationGetRelationName(prstate->relation);
+
+	Assert(BufferIsLockedByMeInMode(prstate->buffer, BUFFER_LOCK_EXCLUSIVE));
+
+	if (PageIsAllVisible(prstate->page))
+	{
+		/*
+		 * It's possible for the value returned by
+		 * GetOldestNonRemovableTransactionId() to move backwards, so it's not
+		 * wrong for us to see tuples that appear to not be visible to
+		 * everyone yet, while PD_ALL_VISIBLE is already set. The real safe
+		 * xmin value never moves backwards, but
+		 * GetOldestNonRemovableTransactionId() is conservative and sometimes
+		 * returns a value that's unnecessarily small, so if we see that
+		 * contradiction it just means that the tuples that we think are not
+		 * visible to everyone yet actually are, and the PD_ALL_VISIBLE flag
+		 * is correct.
+		 *
+		 * However, there should never be LP_DEAD items, dead tuple versions,
+		 * or tuples inserted by an in-progress transaction on a page with
+		 * PD_ALL_VISIBLE set.
+		 */
+		if (prstate->lpdead_items > 0)
+		{
+			ereport(WARNING,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg("dead line pointer found on page marked all-visible"),
+					 errcontext("relation \"%s\", page %u, tuple %u",
+								relname, prstate->block, offnum)));
+		}
+		else
+		{
+			ereport(WARNING,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg("tuple not visible to all transactions found on page marked all-visible"),
+					 errcontext("relation \"%s\", page %u, tuple %u",
+								relname, prstate->block, offnum)));
+		}
+
+		/*
+		 * Mark the buffer dirty now in case we make no further changes and
+		 * therefore would not mark it dirty later.
+		 */
+		PageClearAllVisible(prstate->page);
+		MarkBufferDirtyHint(prstate->buffer, true);
+	}
+	else if (prstate->vmbits & VISIBILITYMAP_VALID_BITS)
+	{
+		/*
+		 * As of PostgreSQL 9.2, the visibility map bit should never be set if
+		 * the page-level bit is clear. However, for vacuum, it's possible
+		 * that the bit got cleared after heap_vac_scan_next_block() was
+		 * called, so we must recheck now that we have the buffer lock before
+		 * concluding that the VM is corrupt.
+		 */
+		ereport(WARNING,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("page is not marked all-visible but visibility map bit is set"),
+				 errcontext("relation \"%s\", page %u",
+							relname, prstate->block)));
+	}
+
+	visibilitymap_clear(prstate->relation, prstate->block, prstate->vmbuffer,
+						VISIBILITYMAP_VALID_BITS);
+	prstate->vmbits = 0;
+}
 
 /*
  * Prune and repair fragmentation and potentially freeze tuples on the
@@ -830,6 +941,10 @@ heap_page_prune_and_freeze(PruneFreezeParams *params,
 					   new_relfrozen_xid, new_relmin_mxid,
 					   presult, &prstate);
 
+	if ((prstate.vmbits & VISIBILITYMAP_VALID_BITS) &&
+		!PageIsAllVisible(prstate.page))
+		heap_fix_vm_corruption(&prstate, InvalidOffsetNumber);
+
 	/*
 	 * Examine all line pointers and tuple visibility information to determine
 	 * which line pointers should change state and which tuples may be frozen.
@@ -973,6 +1088,7 @@ heap_page_prune_and_freeze(PruneFreezeParams *params,
 	presult->set_all_visible = prstate.set_all_visible;
 	presult->set_all_frozen = prstate.set_all_frozen;
 	presult->hastup = prstate.hastup;
+	presult->vmbits = prstate.vmbits;
 
 	/*
 	 * For callers planning to update the visibility map, the conflict horizon
@@ -1295,7 +1411,8 @@ process_chain:
 
 /* Record lowest soon-prunable XID */
 static void
-heap_prune_record_prunable(PruneState *prstate, TransactionId xid)
+heap_prune_record_prunable(PruneState *prstate, TransactionId xid,
+						   OffsetNumber offnum)
 {
 	/*
 	 * This should exactly match the PageSetPrunable macro.  We can't store
@@ -1305,6 +1422,13 @@ heap_prune_record_prunable(PruneState *prstate, TransactionId xid)
 	if (!TransactionIdIsValid(prstate->new_prune_xid) ||
 		TransactionIdPrecedes(xid, prstate->new_prune_xid))
 		prstate->new_prune_xid = xid;
+
+	/*
+	 * It's incorrect for a page to be marked all-visible if it contains
+	 * prunable items.
+	 */
+	if (PageIsAllVisible(prstate->page))
+		heap_fix_vm_corruption(prstate, offnum);
 }
 
 /* Record line pointer to be redirected */
@@ -1388,6 +1512,15 @@ heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum,
 		heap_prune_record_unused(prstate, offnum, was_normal);
 	else
 		heap_prune_record_dead(prstate, offnum, was_normal);
+
+	/*
+	 * It's incorrect for the page to be set all-visible if it contains dead
+	 * items. Fix that on the heap page and check the VM for corruption as
+	 * well. Do that here rather than in heap_prune_record_dead() so we also
+	 * cover tuples that are directly marked LP_UNUSED via mark_unused_now.
+	 */
+	if (PageIsAllVisible(prstate->page))
+		heap_fix_vm_corruption(prstate, offnum);
 }
 
 /* Record line pointer to be marked unused */
@@ -1527,7 +1660,8 @@ heap_prune_record_unchanged_lp_normal(PruneState *prstate, OffsetNumber offnum)
 			 * that the page is reconsidered for pruning in future.
 			 */
 			heap_prune_record_prunable(prstate,
-									   HeapTupleHeaderGetUpdateXid(htup));
+									   HeapTupleHeaderGetUpdateXid(htup),
+									   offnum);
 			break;
 
 		case HEAPTUPLE_INSERT_IN_PROGRESS:
@@ -1542,6 +1676,10 @@ heap_prune_record_unchanged_lp_normal(PruneState *prstate, OffsetNumber offnum)
 			prstate->set_all_visible = false;
 			prstate->set_all_frozen = false;
 
+			/* The page should not be marked all-visible */
+			if (PageIsAllVisible(page))
+				heap_fix_vm_corruption(prstate, offnum);
+
 			/*
 			 * If we wanted to optimize for aborts, we might consider marking
 			 * the page prunable when we see INSERT_IN_PROGRESS.  But we
@@ -1566,7 +1704,8 @@ heap_prune_record_unchanged_lp_normal(PruneState *prstate, OffsetNumber offnum)
 			 * the page is reconsidered for pruning in future.
 			 */
 			heap_prune_record_prunable(prstate,
-									   HeapTupleHeaderGetUpdateXid(htup));
+									   HeapTupleHeaderGetUpdateXid(htup),
+									   offnum);
 			break;
 
 		default:
@@ -1632,6 +1771,13 @@ heap_prune_record_unchanged_lp_dead(PruneState *prstate, OffsetNumber offnum)
 
 	/* Record the dead offset for vacuum */
 	prstate->deadoffsets[prstate->lpdead_items++] = offnum;
+
+	/*
+	 * It's incorrect for a page to be marked all-visible if it contains dead
+	 * items.
+	 */
+	if (PageIsAllVisible(prstate->page))
+		heap_fix_vm_corruption(prstate, offnum);
 }
 
 /*
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 82c5b28e0ad..957322648ca 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -425,11 +425,6 @@ static void find_next_unskippable_block(LVRelState *vacrel, bool *skipsallvis);
 static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf,
 								   BlockNumber blkno, Page page,
 								   bool sharelock, Buffer vmbuffer);
-static void identify_and_fix_vm_corruption(Relation rel, Buffer heap_buffer,
-										   BlockNumber heap_blk, Page heap_page,
-										   int nlpdead_items,
-										   Buffer vmbuffer,
-										   uint8 *vmbits);
 static int	lazy_scan_prune(LVRelState *vacrel, Buffer buf,
 							BlockNumber blkno, Page page,
 							Buffer vmbuffer,
@@ -1964,81 +1959,6 @@ cmpOffsetNumbers(const void *a, const void *b)
 	return pg_cmp_u16(*(const OffsetNumber *) a, *(const OffsetNumber *) b);
 }
 
-/*
- * Helper to correct any corruption detected on a heap page and its
- * corresponding visibility map page after pruning but before setting the
- * visibility map. It examines the heap page, the associated VM page, and the
- * number of dead items previously identified.
- *
- * This function must be called while holding an exclusive lock on the heap
- * buffer, and the dead items must have been discovered under that same lock.
-
- * The provided vmbits must reflect the current state of the VM block
- * referenced by vmbuffer. Although we do not hold a lock on the VM buffer, it
- * is pinned, and the heap buffer is exclusively locked, ensuring that no
- * other backend can update the VM bits corresponding to this heap page.
- *
- * If it clears corruption, it will zero out vmbits.
- */
-static void
-identify_and_fix_vm_corruption(Relation rel, Buffer heap_buffer,
-							   BlockNumber heap_blk, Page heap_page,
-							   int nlpdead_items,
-							   Buffer vmbuffer,
-							   uint8 *vmbits)
-{
-	Assert(visibilitymap_get_status(rel, heap_blk, &vmbuffer) == *vmbits);
-
-	Assert(BufferIsLockedByMeInMode(heap_buffer, BUFFER_LOCK_EXCLUSIVE));
-
-	/*
-	 * As of PostgreSQL 9.2, the visibility map bit should never be set if the
-	 * page-level bit is clear.  However, it's possible that the bit got
-	 * cleared after heap_vac_scan_next_block() was called, so we must recheck
-	 * with buffer lock before concluding that the VM is corrupt.
-	 */
-	if (!PageIsAllVisible(heap_page) &&
-		((*vmbits & VISIBILITYMAP_VALID_BITS) != 0))
-	{
-		ereport(WARNING,
-				(errcode(ERRCODE_DATA_CORRUPTED),
-				 errmsg("page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
-						RelationGetRelationName(rel), heap_blk)));
-
-		visibilitymap_clear(rel, heap_blk, vmbuffer,
-							VISIBILITYMAP_VALID_BITS);
-		*vmbits = 0;
-	}
-
-	/*
-	 * It's possible for the value returned by
-	 * GetOldestNonRemovableTransactionId() to move backwards, so it's not
-	 * wrong for us to see tuples that appear to not be visible to everyone
-	 * yet, while PD_ALL_VISIBLE is already set. The real safe xmin value
-	 * never moves backwards, but GetOldestNonRemovableTransactionId() is
-	 * conservative and sometimes returns a value that's unnecessarily small,
-	 * so if we see that contradiction it just means that the tuples that we
-	 * think are not visible to everyone yet actually are, and the
-	 * PD_ALL_VISIBLE flag is correct.
-	 *
-	 * There should never be LP_DEAD items on a page with PD_ALL_VISIBLE set,
-	 * however.
-	 */
-	else if (PageIsAllVisible(heap_page) && nlpdead_items > 0)
-	{
-		ereport(WARNING,
-				(errcode(ERRCODE_DATA_CORRUPTED),
-				 errmsg("page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u",
-						RelationGetRelationName(rel), heap_blk)));
-
-		PageClearAllVisible(heap_page);
-		MarkBufferDirty(heap_buffer);
-		visibilitymap_clear(rel, heap_blk, vmbuffer,
-							VISIBILITYMAP_VALID_BITS);
-		*vmbits = 0;
-	}
-}
-
 /*
  *	lazy_scan_prune() -- lazy_scan_heap() pruning and freezing.
  *
@@ -2070,6 +1990,7 @@ lazy_scan_prune(LVRelState *vacrel,
 	PruneFreezeParams params = {
 		.relation = rel,
 		.buffer = buf,
+		.vmbuffer = vmbuffer,
 		.reason = PRUNE_VACUUM_SCAN,
 		.options = HEAP_PAGE_PRUNE_FREEZE,
 		.vistest = vacrel->vistest,
@@ -2179,18 +2100,12 @@ lazy_scan_prune(LVRelState *vacrel,
 	Assert(!presult.set_all_visible || !(*has_lpdead_items));
 	Assert(!presult.set_all_frozen || presult.set_all_visible);
 
-	old_vmbits = visibilitymap_get_status(vacrel->rel, blkno, &vmbuffer);
-
-	identify_and_fix_vm_corruption(vacrel->rel, buf, blkno, page,
-								   presult.lpdead_items, vmbuffer,
-								   &old_vmbits);
-
 	if (!presult.set_all_visible)
 		return presult.ndeleted;
 
 	/* Set the visibility map and page visibility hint */
+	old_vmbits = presult.vmbits;
 	new_vmbits = VISIBILITYMAP_ALL_VISIBLE;
-
 	if (presult.set_all_frozen)
 		new_vmbits |= VISIBILITYMAP_ALL_FROZEN;
 
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 2fdc50b865b..c649e5f1980 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -262,6 +262,12 @@ typedef struct PruneFreezeParams
 	Relation	relation;		/* relation containing buffer to be pruned */
 	Buffer		buffer;			/* buffer to be pruned */
 
+	/*
+	 * Callers should provide a pinned vmbuffer corresponding to the heap
+	 * block in buffer. We will check for and repair any corruption in the VM.
+	 */
+	Buffer		vmbuffer;
+
 	/*
 	 * The reason pruning was performed.  It is used to set the WAL record
 	 * opcode which is used for debugging and analysis purposes.
@@ -324,6 +330,12 @@ typedef struct PruneFreezeResult
 	bool		set_all_frozen;
 	TransactionId vm_conflict_horizon;
 
+	/*
+	 * vmbits is the value of the vmbuffer's vmbits at the beginning of
+	 * pruning. It is cleared if VM corruption is found and corrected.
+	 */
+	uint8		vmbits;
+
 	/*
 	 * Whether or not the page makes rel truncation unsafe.  This is set to
 	 * 'true', even if the page contains LP_DEAD items.  VACUUM will remove
-- 
2.43.0

