From 8483ddbb7f3226f73262be80031630638e413f37 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 29 Jul 2025 16:12:56 -0400
Subject: [PATCH v39 12/12] Set pd_prune_xid on insert
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that visibility map (VM) updates can occur during read-only queries,
it makes sense to also set the page’s pd_prune_xid hint during inserts
and on the new page during updates.

This enables heap_page_prune_and_freeze() to run and set the VM
all-visible after a page is filled with newly inserted tuples the first
time it is read.

This change also addresses a long-standing note in heap_insert() and
heap_multi_insert(), which observed that setting pd_prune_xid would
help clean up aborted insertions sooner. Without it, such tuples might
linger until VACUUM, whereas now they can be pruned earlier.

The index killtuples test had to be updated to reflect a larger number
of hits by some accesses. Since the prune_xid is set by the fill/insert
step, on-access pruning can happen during the first access step (before
the DELETE). This is when the VM is extended. After the DELETE, the next
access hits the VM block instead of extending it. Thus, an additional
buffer hit is counted for the table.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Discussion: https://postgr.es/m/flat/CAAKRu_ZMw6Npd_qm2KM%2BFwQ3cMOMx1Dh3VMhp8-V7SOLxdK9-g%40mail.gmail.com
---
 src/backend/access/heap/heapam.c              | 40 ++++++++++++-------
 src/backend/access/heap/heapam_xlog.c         | 19 ++++++++-
 src/backend/access/heap/pruneheap.c           | 17 ++++----
 .../modules/index/expected/killtuples.out     |  8 ++--
 4 files changed, 56 insertions(+), 28 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index dbdf6521c42..c199646b25d 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -2156,6 +2156,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 	TransactionId xid = GetCurrentTransactionId();
 	HeapTuple	heaptup;
 	Buffer		buffer;
+	Page		page;
 	Buffer		vmbuffer = InvalidBuffer;
 	bool		all_visible_cleared = false;
 
@@ -2182,6 +2183,8 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 									   &vmbuffer, NULL,
 									   0);
 
+	page = BufferGetPage(buffer);
+
 	/*
 	 * We're about to do the actual insert -- but check for conflict first, to
 	 * avoid possibly having to roll back work we've just done.
@@ -2205,25 +2208,30 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 	RelationPutHeapTuple(relation, buffer, heaptup,
 						 (options & HEAP_INSERT_SPECULATIVE) != 0);
 
-	if (PageIsAllVisible(BufferGetPage(buffer)))
+	if (PageIsAllVisible(page))
 	{
 		all_visible_cleared = true;
-		PageClearAllVisible(BufferGetPage(buffer));
+		PageClearAllVisible(page);
 		visibilitymap_clear(relation,
 							ItemPointerGetBlockNumber(&(heaptup->t_self)),
 							vmbuffer, VISIBILITYMAP_VALID_BITS);
 	}
 
 	/*
-	 * XXX Should we set PageSetPrunable on this page ?
+	 * Set pd_prune_xid to trigger heap_page_prune_and_freeze() once the page
+	 * is full so that we can set the page all-visible in the VM on the next
+	 * page access.
 	 *
-	 * The inserting transaction may eventually abort thus making this tuple
-	 * DEAD and hence available for pruning. Though we don't want to optimize
-	 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
-	 * aborted tuple will never be pruned until next vacuum is triggered.
+	 * Setting pd_prune_xid is also handy if the inserting transaction
+	 * eventually aborts making this tuple DEAD and hence available for
+	 * pruning. If no other tuple in this page is UPDATEd/DELETEd, the aborted
+	 * tuple would never otherwise be pruned until next vacuum is triggered.
 	 *
-	 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
+	 * Don't set it if we are in bootstrap mode or we are inserting a frozen
+	 * tuple.
 	 */
+	if (TransactionIdIsNormal(xid) && !(options & HEAP_INSERT_FROZEN))
+		PageSetPrunable(page, xid);
 
 	MarkBufferDirty(buffer);
 
@@ -2233,7 +2241,6 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 		xl_heap_insert xlrec;
 		xl_heap_header xlhdr;
 		XLogRecPtr	recptr;
-		Page		page = BufferGetPage(buffer);
 		uint8		info = XLOG_HEAP_INSERT;
 		int			bufflags = 0;
 
@@ -2598,8 +2605,13 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
 		}
 
 		/*
-		 * XXX Should we set PageSetPrunable on this page ? See heap_insert()
+		 * Set pd_prune_xid. See heap_insert() for more on why we do this when
+		 * inserting tuples. This only makes sense if we aren't already
+		 * setting the page frozen in the VM. We also don't set it in
+		 * bootstrap mode.
 		 */
+		if (!all_frozen_set && TransactionIdIsNormal(xid))
+			PageSetPrunable(page, xid);
 
 		MarkBufferDirty(buffer);
 
@@ -4141,12 +4153,12 @@ l2:
 	 * the subsequent page pruning will be a no-op and the hint will be
 	 * cleared.
 	 *
-	 * XXX Should we set hint on newbuf as well?  If the transaction aborts,
-	 * there would be a prunable tuple in the newbuf; but for now we choose
-	 * not to optimize for aborts.  Note that heap_xlog_update must be kept in
-	 * sync if this decision changes.
+	 * We set the new page prunable as well. See heap_insert() for more on why
+	 * we do this when inserting tuples.
 	 */
 	PageSetPrunable(page, xid);
+	if (newbuf != buffer)
+		PageSetPrunable(newpage, xid);
 
 	if (use_hot_update)
 	{
diff --git a/src/backend/access/heap/heapam_xlog.c b/src/backend/access/heap/heapam_xlog.c
index 1302bb13e18..f3f419d3dc1 100644
--- a/src/backend/access/heap/heapam_xlog.c
+++ b/src/backend/access/heap/heapam_xlog.c
@@ -450,6 +450,14 @@ heap_xlog_insert(XLogReaderState *record)
 
 		freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
 
+		/*
+		 * Set the page prunable to trigger on-access pruning later, which may
+		 * set the page all-visible in the VM. See comments in heap_insert().
+		 */
+		if (TransactionIdIsNormal(XLogRecGetXid(record)) &&
+			!HeapTupleHeaderXminFrozen(htup))
+			PageSetPrunable(page, XLogRecGetXid(record));
+
 		PageSetLSN(page, lsn);
 
 		if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
@@ -599,12 +607,19 @@ heap_xlog_multi_insert(XLogReaderState *record)
 		if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
 			PageClearAllVisible(page);
 
-		/* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */
+		/*
+		 * XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible. If
+		 * we are not setting the page frozen, then set the page's prunable
+		 * hint so that we trigger on-access pruning later which may set the
+		 * page all-visible in the VM.
+		 */
 		if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)
 		{
 			PageSetAllVisible(page);
 			PageClearPrunable(page);
 		}
+		else
+			PageSetPrunable(page, XLogRecGetXid(record));
 
 		MarkBufferDirty(buffer);
 	}
@@ -921,6 +936,8 @@ heap_xlog_update(XLogReaderState *record, bool hot_update)
 		freespace = PageGetHeapFreeSpace(npage);
 
 		PageSetLSN(npage, lsn);
+		/* See heap_insert() for why we set pd_prune_xid on insert */
+		PageSetPrunable(npage, XLogRecGetXid(record));
 		MarkBufferDirty(nbuffer);
 	}
 
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 3cdc1a36441..7cb9e1e2aac 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -255,7 +255,8 @@ heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer,
 	/*
 	 * First check whether there's any chance there's something to prune,
 	 * determining the appropriate horizon is a waste if there's no prune_xid
-	 * (i.e. no updates/deletes left potentially dead tuples around).
+	 * (i.e. no updates/deletes left potentially dead tuples around and no
+	 * inserts inserted new tuples that may be visible to all).
 	 */
 	prune_xid = PageGetPruneXid(page);
 	if (!TransactionIdIsValid(prune_xid))
@@ -1848,16 +1849,14 @@ heap_prune_record_unchanged_lp_normal(PruneState *prstate, OffsetNumber offnum)
 			prstate->set_all_visible = false;
 			prstate->set_all_frozen = false;
 
-			/* The page should not be marked all-visible */
-			if (PageIsAllVisible(page))
-				heap_fix_vm_corruption(prstate, offnum);
-
 			/*
-			 * If we wanted to optimize for aborts, we might consider marking
-			 * the page prunable when we see INSERT_IN_PROGRESS.  But we
-			 * don't.  See related decisions about when to mark the page
-			 * prunable in heapam.c.
+			 * Though there is nothing "prunable" on the page, we maintain
+			 * pd_prune_xid for inserts so that we have the opportunity to
+			 * mark them all-visible during the next round of pruning.
 			 */
+			heap_prune_record_prunable(prstate,
+									   HeapTupleHeaderGetXmin(htup),
+									   offnum);
 			break;
 
 		case HEAPTUPLE_DELETE_IN_PROGRESS:
diff --git a/src/test/modules/index/expected/killtuples.out b/src/test/modules/index/expected/killtuples.out
index be7ddd756ef..700144d6783 100644
--- a/src/test/modules/index/expected/killtuples.out
+++ b/src/test/modules/index/expected/killtuples.out
@@ -54,7 +54,7 @@ step flush: SELECT FROM pg_stat_force_next_flush();
 step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple';
 new_heap_accesses
 -----------------
-                1
+                2
 (1 row)
 
 step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple');
@@ -130,7 +130,7 @@ step flush: SELECT FROM pg_stat_force_next_flush();
 step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple';
 new_heap_accesses
 -----------------
-                1
+                2
 (1 row)
 
 step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple');
@@ -283,7 +283,7 @@ step flush: SELECT FROM pg_stat_force_next_flush();
 step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple';
 new_heap_accesses
 -----------------
-                1
+                2
 (1 row)
 
 step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple');
@@ -329,7 +329,7 @@ step flush: SELECT FROM pg_stat_force_next_flush();
 step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple';
 new_heap_accesses
 -----------------
-                1
+                2
 (1 row)
 
 step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple');
-- 
2.43.0

