From 815a2d10ebc6f672be5508a0c4a98ff866d0d71b Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 29 Jul 2025 16:12:56 -0400
Subject: [PATCH v35 18/18] Set pd_prune_xid on insert
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that visibility map (VM) updates can occur during read-only queries,
it makes sense to also set the page’s pd_prune_xid hint during inserts.
This enables heap_page_prune_and_freeze() to run and set the VM
all-visible after a page is filled with newly inserted tuples the first
time it is read.

This change also addresses a long-standing note in heap_insert() and
heap_multi_insert(), which observed that setting pd_prune_xid would
help clean up aborted insertions sooner. Without it, such tuples might
linger until VACUUM, whereas now they can be pruned earlier.

The index killtuples test had to be updated to reflect a larger number
of hits by some accesses. Since the prune_xid is set by the fill/insert
step, on-access pruning can happen during the first access step (before
the DELETE). This is when the VM is extended. After the DELETE, the next
access hits the VM block instead of extending it. Thus, an additional
buffer hit is counted for the table.

Reviewed-by: Chao Li <li.evan.chao@gmail.com>
---
 src/backend/access/heap/heapam.c              | 31 +++++++++++++------
 src/backend/access/heap/heapam_xlog.c         | 17 +++++++++-
 src/backend/access/heap/pruneheap.c           | 14 ++++-----
 .../modules/index/expected/killtuples.out     |  8 ++---
 4 files changed, 47 insertions(+), 23 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 5539bb8c10b..bb124bc767b 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -2156,6 +2156,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 	TransactionId xid = GetCurrentTransactionId();
 	HeapTuple	heaptup;
 	Buffer		buffer;
+	Page		page;
 	Buffer		vmbuffer = InvalidBuffer;
 	bool		all_visible_cleared = false;
 
@@ -2182,6 +2183,8 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 									   &vmbuffer, NULL,
 									   0);
 
+	page = BufferGetPage(buffer);
+
 	/*
 	 * We're about to do the actual insert -- but check for conflict first, to
 	 * avoid possibly having to roll back work we've just done.
@@ -2205,25 +2208,29 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 	RelationPutHeapTuple(relation, buffer, heaptup,
 						 (options & HEAP_INSERT_SPECULATIVE) != 0);
 
-	if (PageIsAllVisible(BufferGetPage(buffer)))
+	if (PageIsAllVisible(page))
 	{
 		all_visible_cleared = true;
-		PageClearAllVisible(BufferGetPage(buffer));
+		PageClearAllVisible(page);
 		visibilitymap_clear(relation,
 							ItemPointerGetBlockNumber(&(heaptup->t_self)),
 							vmbuffer, VISIBILITYMAP_VALID_BITS);
 	}
 
 	/*
-	 * XXX Should we set PageSetPrunable on this page ?
+	 * Set pd_prune_xid to trigger heap_page_prune_and_freeze() once the page
+	 * is full so that we can set the page all-visible in the VM.
 	 *
-	 * The inserting transaction may eventually abort thus making this tuple
-	 * DEAD and hence available for pruning. Though we don't want to optimize
-	 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
-	 * aborted tuple will never be pruned until next vacuum is triggered.
+	 * Setting pd_prune_xid is also handy if the inserting transaction
+	 * eventually aborts making this tuple DEAD and hence available for
+	 * pruning. If no other tuple in this page is UPDATEd/DELETEd, the aborted
+	 * tuple would never otherwise be pruned until next vacuum is triggered.
 	 *
-	 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
+	 * Don't set it if we are in bootstrap mode or we are inserting a frozen
+	 * tuple.
 	 */
+	if (TransactionIdIsNormal(xid) && !(options & HEAP_INSERT_FROZEN))
+		PageSetPrunable(page, xid);
 
 	MarkBufferDirty(buffer);
 
@@ -2233,7 +2240,6 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 		xl_heap_insert xlrec;
 		xl_heap_header xlhdr;
 		XLogRecPtr	recptr;
-		Page		page = BufferGetPage(buffer);
 		uint8		info = XLOG_HEAP_INSERT;
 		int			bufflags = 0;
 
@@ -2598,8 +2604,13 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
 		}
 
 		/*
-		 * XXX Should we set PageSetPrunable on this page ? See heap_insert()
+		 * Set pd_prune_xid. See heap_insert() for more on why we do this when
+		 * inserting tuples. This only makes sense if we aren't already
+		 * setting the page frozen in the VM. We also don't set it in
+		 * bootstrap mode.
 		 */
+		if (!all_frozen_set && TransactionIdIsNormal(xid))
+			PageSetPrunable(page, xid);
 
 		MarkBufferDirty(buffer);
 
diff --git a/src/backend/access/heap/heapam_xlog.c b/src/backend/access/heap/heapam_xlog.c
index df89f93edb4..edd5c946c6a 100644
--- a/src/backend/access/heap/heapam_xlog.c
+++ b/src/backend/access/heap/heapam_xlog.c
@@ -450,6 +450,14 @@ heap_xlog_insert(XLogReaderState *record)
 
 		freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
 
+		/*
+		 * Set the page prunable to trigger on-access pruning later, which may
+		 * set the page all-visible in the VM. See comments in heap_insert().
+		 */
+		if (TransactionIdIsNormal(XLogRecGetXid(record)) &&
+			!HeapTupleHeaderXminFrozen(htup))
+			PageSetPrunable(page, XLogRecGetXid(record));
+
 		PageSetLSN(page, lsn);
 
 		if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
@@ -599,12 +607,19 @@ heap_xlog_multi_insert(XLogReaderState *record)
 		if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
 			PageClearAllVisible(page);
 
-		/* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */
+		/*
+		 * XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible. If
+		 * we are not setting the page frozen, then set the page's prunable
+		 * hint so that we trigger on-access pruning later which may set the
+		 * page all-visible in the VM.
+		 */
 		if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)
 		{
 			PageSetAllVisible(page);
 			PageClearPrunable(page);
 		}
+		else
+			PageSetPrunable(page, XLogRecGetXid(record));
 
 		MarkBufferDirty(buffer);
 	}
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index fc2ddcb5ab4..72a1c311bd0 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -1904,16 +1904,14 @@ heap_prune_record_unchanged_lp_normal(PruneState *prstate, OffsetNumber offnum)
 			prstate->set_all_visible = false;
 			prstate->set_all_frozen = false;
 
-			/* The page should not be marked all-visible */
-			if (PageIsAllVisible(page))
-				heap_fix_vm_corruption(prstate, offnum);
-
 			/*
-			 * If we wanted to optimize for aborts, we might consider marking
-			 * the page prunable when we see INSERT_IN_PROGRESS.  But we
-			 * don't.  See related decisions about when to mark the page
-			 * prunable in heapam.c.
+			 * Though there is nothing "prunable" on the page, we maintain
+			 * pd_prune_xid for inserts so that we have the opportunity to
+			 * mark them all-visible during the next round of pruning.
 			 */
+			heap_prune_record_prunable(prstate,
+									   HeapTupleHeaderGetXmin(htup),
+									   offnum);
 			break;
 
 		case HEAPTUPLE_DELETE_IN_PROGRESS:
diff --git a/src/test/modules/index/expected/killtuples.out b/src/test/modules/index/expected/killtuples.out
index be7ddd756ef..700144d6783 100644
--- a/src/test/modules/index/expected/killtuples.out
+++ b/src/test/modules/index/expected/killtuples.out
@@ -54,7 +54,7 @@ step flush: SELECT FROM pg_stat_force_next_flush();
 step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple';
 new_heap_accesses
 -----------------
-                1
+                2
 (1 row)
 
 step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple');
@@ -130,7 +130,7 @@ step flush: SELECT FROM pg_stat_force_next_flush();
 step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple';
 new_heap_accesses
 -----------------
-                1
+                2
 (1 row)
 
 step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple');
@@ -283,7 +283,7 @@ step flush: SELECT FROM pg_stat_force_next_flush();
 step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple';
 new_heap_accesses
 -----------------
-                1
+                2
 (1 row)
 
 step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple');
@@ -329,7 +329,7 @@ step flush: SELECT FROM pg_stat_force_next_flush();
 step result: SELECT heap_blks_read + heap_blks_hit - counter.heap_accesses AS new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple';
 new_heap_accesses
 -----------------
-                1
+                2
 (1 row)
 
 step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple');
-- 
2.43.0

