From 8ea0d82a1c72f1fcbf834cfa5a7913fce0778ac8 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Fri, 16 Jan 2026 21:55:02 +0100
Subject: [PATCH] Handle ENOENT status when querying NUMA node

We've assumed that touching the memory is sufficient for a page to be
located on one of the NUMA nodes. But that's not quite true, because
a page may be moved to swap after we touch it.

It's not hard to make that happen with commands like CREATE INDEX (which
uses only a small circular buffer in shared buffers, while loading large
amounts of data into page cache). This memory pressure may force a
significant fraction of shared buffers to swap.

We touch the memory before querying the status, but there is no
guarantee it won't be moved to swap in between. We do the touching only
during the first call, so later calls are more likely to be affected.

This only happens with regular memory pages (e.g. 4K). Hugepages cannot
be swapped out under memory pressure.

We can't prevent this - it's up to the kernel to move pages to swap.
Therefore, we have to accept ENOENT (-2) status as a valid result, and
handle it without failing. This patch simply treats -2 as unknown node,
and returns NULL in the two affected views (pg_shmem_allocations_numa
and pg_buffercache_numa).

Reported by Christoph Berg, investigation and fix by me. Backpatch to
18, where the two views were introduced.

Reported-by: Christoph Berg <myon@debian.org>
Discussion: https://postgr.es/m/aTq5Gt_n-oS_QSpL@msg.df7cb.de
Backpatch-through: 18
---
 contrib/pg_buffercache/pg_buffercache_pages.c | 12 +++++--
 src/backend/storage/ipc/shmem.c               | 32 +++++++++++++++----
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index dcba3fb5473..9ff0eb4b0a0 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -551,8 +551,16 @@ pg_buffercache_os_pages_internal(FunctionCallInfo fcinfo, bool include_numa)
 
 		if (fctx->include_numa)
 		{
-			values[2] = Int32GetDatum(fctx->record[i].numa_node);
-			nulls[2] = false;
+			/* status is valid node number */
+			if (fctx->record[i].numa_node >= 0)
+			{
+				values[2] = Int32GetDatum(fctx->record[i].numa_node);
+				nulls[2] = false;
+			} else {
+				/* some kind of error (e.g. pages moved to swap) */
+				values[2] = (Datum) 0;
+				nulls[2] = true;
+			}
 		}
 		else
 		{
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index d2f4710f141..1b536363152 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -599,7 +599,7 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
 	InitMaterializedSRF(fcinfo, 0);
 
 	max_nodes = pg_numa_get_max_node();
-	nodes = palloc_array(Size, max_nodes + 1);
+	nodes = palloc_array(Size, max_nodes + 2);
 
 	/*
 	 * Shared memory allocations can vary in size and may not align with OS
@@ -635,7 +635,6 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
 	hash_seq_init(&hstat, ShmemIndex);
 
 	/* output all allocated entries */
-	memset(nulls, 0, sizeof(nulls));
 	while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
 	{
 		int			i;
@@ -684,22 +683,33 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
 			elog(ERROR, "failed NUMA pages inquiry status: %m");
 
 		/* Count number of NUMA nodes used for this shared memory entry */
-		memset(nodes, 0, sizeof(Size) * (max_nodes + 1));
+		memset(nodes, 0, sizeof(Size) * (max_nodes + 2));
 
 		for (i = 0; i < shm_ent_page_count; i++)
 		{
 			int			s = pages_status[i];
 
 			/* Ensure we are adding only valid index to the array */
-			if (s < 0 || s > max_nodes)
+			if (s >= 0 && s <= max_nodes)
+			{
+				/* valid NUMA node */
+				nodes[s]++;
+				continue;
+			}
+			else if (s == -2)
 			{
-				elog(ERROR, "invalid NUMA node id outside of allowed range "
-					 "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
+				/* -2 means ENOENT (e.g. page was moved to swap) */
+				nodes[max_nodes + 1]++;
+				continue;
 			}
 
-			nodes[s]++;
+			elog(ERROR, "invalid NUMA node id outside of allowed range "
+				 "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
 		}
 
+		/* no NULLs for regular nodes */
+		memset(nulls, 0, sizeof(nulls));
+
 		/*
 		 * Add one entry for each NUMA node, including those without allocated
 		 * memory for this segment.
@@ -713,6 +723,14 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
 			tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
 								 values, nulls);
 		}
+
+		/* The last entry is used for pages without a NUMA node. */
+		nulls[1] = true;
+		values[0] = CStringGetTextDatum(ent->key);
+		values[2] = Int64GetDatum(nodes[max_nodes + 1] * os_page_size);
+
+		tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+							 values, nulls);
 	}
 
 	LWLockRelease(ShmemIndexLock);
-- 
2.52.0

