From 7ae7f9d9f1c05cf66d7fee964db801cbcf52a324 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Mon, 15 Sep 2025 16:32:35 -0400
Subject: [PATCH v14 08/24] Set PD_ALL_VISIBLE in heap_page_prune_and_freeze

After phase I of vacuum, if the heap page was rendered all-visible, we
can set it as such in the VM. We also must set the page-level
PD_ALL_VISIBLE bit. By setting PD_ALL_VISIBLE while making the other
changes to the heap page instead of while updating the VM, we can omit
the heap page from the WAL chain during the VM update. The result is
that xl_heap_prune records include updates to PD_ALL_VISIBLE.

This commit doesn't yet remove the heap page from the WAL chain because
it does not change other users of visibilitymap_set().

Note that this is carefully coded such that if the only modification to
the page during heap_page_prune_and_freeze() is setting PD_ALL_VISIBLE
and checksums/wal_log_hints are disabled we will never emit a full page
image of the heap page.

This also fixes a longstanding issue where, when checksums/wal_log_hints
are enabled, an all-visible page being set all-frozen may not mark the
buffer dirty before visibilitymap_set() stamps it with the
xl_heap_visible LSN.

It is noteworthy that the checks for page corruption and an inconsistent
state between the heap page and the VM in lazy_scan_prune() now happen
after having set PD_ALL_VISIBLE. That is not a functional change because
the corruption cases are mutually exclusive with cases where we would
set PD_ALL_VISIBLE.
---
 src/backend/access/heap/heapam_xlog.c | 63 +++++++++++++++++++----
 src/backend/access/heap/pruneheap.c   | 72 ++++++++++++++++++++++++---
 src/backend/access/heap/vacuumlazy.c  | 29 +----------
 src/include/access/heapam.h           |  2 +
 src/include/access/heapam_xlog.h      |  2 +
 5 files changed, 125 insertions(+), 43 deletions(-)

diff --git a/src/backend/access/heap/heapam_xlog.c b/src/backend/access/heap/heapam_xlog.c
index faa7c561a8a..a54238f2b59 100644
--- a/src/backend/access/heap/heapam_xlog.c
+++ b/src/backend/access/heap/heapam_xlog.c
@@ -90,6 +90,7 @@ heap_xlog_prune_freeze(XLogReaderState *record)
 		xlhp_freeze_plan *plans;
 		OffsetNumber *frz_offsets;
 		char	   *dataptr = XLogRecGetBlockData(record, 0, &datalen);
+		bool		do_prune;
 
 		heap_xlog_deserialize_prune_and_freeze(dataptr, xlrec.flags,
 											   &nplans, &plans, &frz_offsets,
@@ -97,11 +98,13 @@ heap_xlog_prune_freeze(XLogReaderState *record)
 											   &ndead, &nowdead,
 											   &nunused, &nowunused);
 
+		do_prune = nredirected > 0 || ndead > 0 || nunused > 0;
+
 		/*
 		 * Update all line pointers per the record, and repair fragmentation
 		 * if needed.
 		 */
-		if (nredirected > 0 || ndead > 0 || nunused > 0)
+		if (do_prune)
 			heap_page_prune_execute(buffer,
 									(xlrec.flags & XLHP_CLEANUP_LOCK) == 0,
 									redirected, nredirected,
@@ -138,17 +141,52 @@ heap_xlog_prune_freeze(XLogReaderState *record)
 		/* There should be no more data */
 		Assert((char *) frz_offsets == dataptr + datalen);
 
+		/*
+		 * The critical integrity requirement here is that we must never end
+		 * up with a situation where the visibility map bit is set, and the
+		 * page-level PD_ALL_VISIBLE bit is clear.  If that were to occur,
+		 * then a subsequent page modification would fail to clear the
+		 * visibility map bit.
+		 */
+		if (xlrec.flags & XLHP_SET_PD_ALL_VIS)
+			PageSetAllVisible(page);
+
 		/*
 		 * Note: we don't worry about updating the page's prunability hints.
 		 * At worst this will cause an extra prune cycle to occur soon.
 		 */
-
-		PageSetLSN(page, lsn);
 		MarkBufferDirty(buffer);
+
+		/*
+		 * We always emit a WAL record when setting PD_ALL_VISIBLE, but we are
+		 * careful not to emit a full page image unless
+		 * checksums/wal_log_hints are enabled. We only set the heap page LSN
+		 * if full page images were an option when emitting WAL. Otherwise,
+		 * subsequent modifications of the page may incorrectly skip emitting
+		 * a full page image.
+		 */
+		if (do_prune || nplans > 0 ||
+			(xlrec.flags & XLHP_SET_PD_ALL_VIS && XLogHintBitIsNeeded()))
+			PageSetLSN(page, lsn);
 	}
 
 	/*
-	 * If we released any space or line pointers, update the free space map.
+	 * If we released any space or line pointers or set PD_ALL_VISIBLE update
+	 * the freespace map.
+	 *
+	 * Even if we are just setting PD_ALL_VISIBLE (and thus not freeing up any
+	 * space), we'll still update the FSM for this page. Since the FSM is not
+	 * WAL-logged and only updated heuristically, it easily becomes stale in
+	 * standbys.  If the standby is later promoted and runs VACUUM, it will
+	 * skip updating individual free space figures for pages that became
+	 * all-visible (or all-frozen, depending on the vacuum mode,) which is
+	 * troublesome when FreeSpaceMapVacuum propagates too optimistic free
+	 * space values to upper FSM layers; later inserters try to use such pages
+	 * only to find out that they are unusable.  This can cause long stalls
+	 * when there are many such pages.
+	 *
+	 * Forestall those problems by updating FSM's idea about a page that is
+	 * becoming all-visible or all-frozen.
 	 *
 	 * Do this regardless of a full-page image being applied, since the FSM
 	 * data is not in the page anyway.
@@ -157,10 +195,16 @@ heap_xlog_prune_freeze(XLogReaderState *record)
 	{
 		if (xlrec.flags & (XLHP_HAS_REDIRECTIONS |
 						   XLHP_HAS_DEAD_ITEMS |
-						   XLHP_HAS_NOW_UNUSED_ITEMS))
+						   XLHP_HAS_NOW_UNUSED_ITEMS |
+						   XLHP_SET_PD_ALL_VIS))
 		{
 			Size		freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
 
+			/*
+			 * We want to avoid holding an exclusive lock on the heap buffer
+			 * while doing IO, so we'll release the lock on the heap buffer
+			 * first.
+			 */
 			UnlockReleaseBuffer(buffer);
 
 			XLogRecordPageWithFreeSpace(rlocator, blkno, freespace);
@@ -173,10 +217,11 @@ heap_xlog_prune_freeze(XLogReaderState *record)
 /*
  * Replay XLOG_HEAP2_VISIBLE records.
  *
- * The critical integrity requirement here is that we must never end up with
- * a situation where the visibility map bit is set, and the page-level
- * PD_ALL_VISIBLE bit is clear.  If that were to occur, then a subsequent
- * page modification would fail to clear the visibility map bit.
+ * It is imperative that the previously emitted record set PD_ALL_VISIBLE on
+ * the heap page. We must never end up with a situation where the visibility
+ * map bit is set, and the page-level PD_ALL_VISIBLE bit is clear.  If that
+ * were to occur, then a subsequent page modification would fail to clear the
+ * visibility map bit.
  */
 static void
 heap_xlog_visible(XLogReaderState *record)
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 5e536bd0d4d..9b25131543b 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -495,6 +495,7 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer,
 	bool		do_freeze;
 	bool		do_prune;
 	bool		do_hint_prune;
+	bool		do_set_pd_vis;
 	bool		did_tuple_hint_fpi;
 	int64		fpi_before = pgWalUsage.wal_fpi;
 	TransactionId frz_conflict_horizon = InvalidTransactionId;
@@ -824,6 +825,22 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer,
 		prstate.all_visible = prstate.all_frozen = false;
 
 	Assert(!prstate.all_frozen || prstate.all_visible);
+
+	/*
+	 * Though callers should set the VM if PD_ALL_VISIBLE is set here, it is
+	 * allowed for the page-level bit to be set and the VM to be clear.
+	 * Setting PD_ALL_VISIBLE when we are making the changes to the page that
+	 * render it all-visible allows us to omit the heap page from the WAL
+	 * chain when later updating the VM -- even when checksums/wal_log_hints
+	 * are enabled.
+	 */
+	do_set_pd_vis = false;
+	if ((options & HEAP_PAGE_PRUNE_UPDATE_VIS) != 0)
+	{
+		if (prstate.all_visible && !PageIsAllVisible(page))
+			do_set_pd_vis = true;
+	}
+
 	/* Any error while applying the changes is critical */
 	START_CRIT_SECTION();
 
@@ -844,14 +861,17 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer,
 
 		/*
 		 * If that's all we had to do to the page, this is a non-WAL-logged
-		 * hint.  If we are going to freeze or prune the page, we will mark
-		 * the buffer dirty below.
+		 * hint.  If we are going to freeze or prune the page or set
+		 * PD_ALL_VISIBLE, we will mark the buffer dirty below.
+		 *
+		 * Setting PD_ALL_VISIBLE is fully WAL-logged because it is forbidden
+		 * for the VM to be set and PD_ALL_VISIBLE to be clear.
 		 */
-		if (!do_freeze && !do_prune)
+		if (!do_freeze && !do_prune && !do_set_pd_vis)
 			MarkBufferDirtyHint(buffer, true);
 	}
 
-	if (do_prune || do_freeze)
+	if (do_prune || do_freeze || do_set_pd_vis)
 	{
 		/* Apply the planned item changes and repair page fragmentation. */
 		if (do_prune)
@@ -865,6 +885,9 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer,
 		if (do_freeze)
 			heap_freeze_prepared_tuples(buffer, prstate.frozen, prstate.nfrozen);
 
+		if (do_set_pd_vis)
+			PageSetAllVisible(page);
+
 		MarkBufferDirty(buffer);
 
 		/*
@@ -891,7 +914,9 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer,
 
 			log_heap_prune_and_freeze(relation, buffer,
 									  conflict_xid,
-									  true, reason,
+									  true,
+									  do_set_pd_vis,
+									  reason,
 									  prstate.frozen, prstate.nfrozen,
 									  prstate.redirected, prstate.nredirected,
 									  prstate.nowdead, prstate.ndead,
@@ -2078,6 +2103,10 @@ heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples,
  * replaying 'unused' items depends on whether they were all previously marked
  * as dead.
  *
+ * set_pd_all_vis indicates that we set PD_ALL_VISIBLE and thus should update
+ * the page LSN when checksums/wal_log_hints are enabled even if we did not
+ * prune or freeze tuples on the page.
+ *
  * Note: This function scribbles on the 'frozen' array.
  *
  * Note: This is called in a critical section, so careful what you do here.
@@ -2086,6 +2115,7 @@ void
 log_heap_prune_and_freeze(Relation relation, Buffer buffer,
 						  TransactionId conflict_xid,
 						  bool cleanup_lock,
+						  bool set_pd_all_vis,
 						  PruneReason reason,
 						  HeapTupleFreeze *frozen, int nfrozen,
 						  OffsetNumber *redirected, int nredirected,
@@ -2095,6 +2125,7 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer,
 	xl_heap_prune xlrec;
 	XLogRecPtr	recptr;
 	uint8		info;
+	uint8		regbuf_flags;
 
 	/* The following local variables hold data registered in the WAL record: */
 	xlhp_freeze_plan plans[MaxHeapTuplesPerPage];
@@ -2103,8 +2134,21 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer,
 	xlhp_prune_items dead_items;
 	xlhp_prune_items unused_items;
 	OffsetNumber frz_offsets[MaxHeapTuplesPerPage];
+	bool		do_prune = nredirected > 0 || ndead > 0 || nunused > 0;
 
 	xlrec.flags = 0;
+	regbuf_flags = REGBUF_STANDARD;
+
+	/*
+	 * We can avoid an FPI if the only modification we are making to the heap
+	 * page is to set PD_ALL_VISIBLE and checksums/wal_log_hints are disabled.
+	 * Note that if we explicitly skip an FPI, we must not set the heap page
+	 * LSN later.
+	 */
+	if (!do_prune &&
+		nfrozen == 0 &&
+		(!set_pd_all_vis || !XLogHintBitIsNeeded()))
+		regbuf_flags |= REGBUF_NO_IMAGE;
 
 	/*
 	 * Prepare data for the buffer.  The arrays are not actually in the
@@ -2112,7 +2156,7 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer,
 	 * page image, the arrays can be omitted.
 	 */
 	XLogBeginInsert();
-	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+	XLogRegisterBuffer(0, buffer, regbuf_flags);
 	if (nfrozen > 0)
 	{
 		int			nplans;
@@ -2169,6 +2213,8 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer,
 	 * Prepare the main xl_heap_prune record.  We already set the XLHP_HAS_*
 	 * flag above.
 	 */
+	if (set_pd_all_vis)
+		xlrec.flags |= XLHP_SET_PD_ALL_VIS;
 	if (RelationIsAccessibleInLogicalDecoding(relation))
 		xlrec.flags |= XLHP_IS_CATALOG_REL;
 	if (TransactionIdIsValid(conflict_xid))
@@ -2201,5 +2247,17 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer,
 	}
 	recptr = XLogInsert(RM_HEAP2_ID, info);
 
-	PageSetLSN(BufferGetPage(buffer), recptr);
+	/*
+	 * We must bump the page LSN if pruning or freezing. If we are only
+	 * updating PD_ALL_VISIBLE, though, we can skip doing this unless
+	 * wal_log_hints/checksums are enabled. Torn pages are possible if we
+	 * update PD_ALL_VISIBLE without bumping the LSN, but this is deemed okay
+	 * for page hint updates.
+	 */
+	if (do_prune || nfrozen > 0 ||
+		(set_pd_all_vis && XLogHintBitIsNeeded()))
+	{
+		Assert(BufferIsDirty(buffer));
+		PageSetLSN(BufferGetPage(buffer), recptr);
+	}
 }
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 50cc898087f..308abff16ca 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -1970,7 +1970,7 @@ lazy_scan_prune(LVRelState *vacrel,
 	 * tuples. Pruning will have determined whether or not the page is
 	 * all-visible.
 	 */
-	prune_options = HEAP_PAGE_PRUNE_FREEZE;
+	prune_options = HEAP_PAGE_PRUNE_FREEZE | HEAP_PAGE_PRUNE_UPDATE_VIS;
 	if (vacrel->nindexes == 0)
 		prune_options |= HEAP_PAGE_PRUNE_MARK_UNUSED_NOW;
 
@@ -2073,21 +2073,6 @@ lazy_scan_prune(LVRelState *vacrel,
 			flags |= VISIBILITYMAP_ALL_FROZEN;
 		}
 
-		/*
-		 * It should never be the case that the visibility map page is set
-		 * while the page-level bit is clear, but the reverse is allowed (if
-		 * checksums are not enabled).  Regardless, set both bits so that we
-		 * get back in sync.
-		 *
-		 * NB: If the heap page is all-visible but the VM bit is not set, we
-		 * don't need to dirty the heap page.  However, if checksums are
-		 * enabled, we do need to make sure that the heap page is dirtied
-		 * before passing it to visibilitymap_set(), because it may be logged.
-		 * Given that this situation should only happen in rare cases after a
-		 * crash, it is not worth optimizing.
-		 */
-		PageSetAllVisible(page);
-		MarkBufferDirty(buf);
 		old_vmbits = visibilitymap_set(vacrel->rel, blkno, buf,
 									   InvalidXLogRecPtr,
 									   vmbuffer, presult.vm_conflict_horizon,
@@ -2168,17 +2153,6 @@ lazy_scan_prune(LVRelState *vacrel,
 	{
 		uint8		old_vmbits;
 
-		/*
-		 * Avoid relying on all_visible_according_to_vm as a proxy for the
-		 * page-level PD_ALL_VISIBLE bit being set, since it might have become
-		 * stale -- even when all_visible is set
-		 */
-		if (!PageIsAllVisible(page))
-		{
-			PageSetAllVisible(page);
-			MarkBufferDirty(buf);
-		}
-
 		/*
 		 * Set the page all-frozen (and all-visible) in the VM.
 		 *
@@ -2891,6 +2865,7 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
 		log_heap_prune_and_freeze(vacrel->rel, buffer,
 								  InvalidTransactionId,
 								  false,	/* no cleanup lock required */
+								  false,
 								  PRUNE_VACUUM_CLEANUP,
 								  NULL, 0,	/* frozen */
 								  NULL, 0,	/* redirected */
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 34206a6a7d5..2f77d8dbcd6 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -42,6 +42,7 @@
 /* "options" flag bits for heap_page_prune_and_freeze */
 #define HEAP_PAGE_PRUNE_MARK_UNUSED_NOW		(1 << 0)
 #define HEAP_PAGE_PRUNE_FREEZE				(1 << 1)
+#define HEAP_PAGE_PRUNE_UPDATE_VIS			(1 << 2)
 
 typedef struct BulkInsertStateData *BulkInsertState;
 struct TupleTableSlot;
@@ -390,6 +391,7 @@ extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
 extern void log_heap_prune_and_freeze(Relation relation, Buffer buffer,
 									  TransactionId conflict_xid,
 									  bool cleanup_lock,
+									  bool set_pd_all_vis,
 									  PruneReason reason,
 									  HeapTupleFreeze *frozen, int nfrozen,
 									  OffsetNumber *redirected, int nredirected,
diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h
index d4c0625b632..7d3fb75dda7 100644
--- a/src/include/access/heapam_xlog.h
+++ b/src/include/access/heapam_xlog.h
@@ -294,6 +294,8 @@ typedef struct xl_heap_prune
 
 #define SizeOfHeapPrune (offsetof(xl_heap_prune, flags) + sizeof(uint8))
 
+#define		XLHP_SET_PD_ALL_VIS			(1 << 0)
+
 /* to handle recovery conflict during logical decoding on standby */
 #define		XLHP_IS_CATALOG_REL			(1 << 1)
 
-- 
2.43.0

