From 3d935f62665a18d96e6bec59cb1f3f7cd7daa068 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Fri, 27 Jun 2025 12:43:20 +0200
Subject: [PATCH 1/3] Add batching when calling numa_move_pages

There's a kernel bug in do_pages_stat() that makes numa_move_pages()
return bogus status values when querying the location of memory pages.
The bug only affects systems combining a 64-bit kernel with 32-bit user
space. This may seem uncommon, but we use such systems for building
32-bit Debian packages (which happens in a 32-bit chroot).

This is a long-standing kernel bug (since 2010), affecting pretty much
all kernels, so it'll take time until all systems get a fixed kernel.
Luckily, we can work around that on our end, by batching the requests
the same way as in do_pages_stat(). On 32-bit systems we use batches of
16 pointers, same as do_pages_stat(). 64-bit systems are not affected,
so we use a much larger batch of 1024.

Reported-by: Christoph Berg <myon@debian.org>
Author: Christoph Berg <myon@debian.org>
Author: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Discussion: https://postgr.es/m/aEtDozLmtZddARdB@msg.df7cb.de
---
 src/include/port/pg_numa.h |  2 +-
 src/port/pg_numa.c         | 45 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/src/include/port/pg_numa.h b/src/include/port/pg_numa.h
index 40f1d324dcf..d707d149a43 100644
--- a/src/include/port/pg_numa.h
+++ b/src/include/port/pg_numa.h
@@ -29,7 +29,7 @@ extern PGDLLIMPORT int pg_numa_get_max_node(void);
 
 #else
 
-#define pg_numa_touch_mem_if_required(ro_volatile_var, ptr) \
+#define pg_numa_touch_mem_if_required(ptr) \
 	do {} while(0)
 
 #endif
diff --git a/src/port/pg_numa.c b/src/port/pg_numa.c
index 4b487a2a4e8..54ab9c70d56 100644
--- a/src/port/pg_numa.c
+++ b/src/port/pg_numa.c
@@ -29,6 +29,19 @@
 #include <numa.h>
 #include <numaif.h>
 
+/*
+ * numa_move_pages() batch size, has to be <= 16 to work around a kernel bug
+ * in do_pages_stat() (chunked by DO_PAGES_STAT_CHUNK_NR). By using the same
+ * batch size, we make it work even on unfixed kernels.
+ *
+ * 64-bit systems are not affected by the bug, and so use much larger batches.
+ */
+#if SIZEOF_SIZE_T == 4
+#define NUMA_QUERY_BATCH_SIZE 16
+#else
+#define NUMA_QUERY_BATCH_SIZE 1024
+#endif
+
 /* libnuma requires initialization as per numa(3) on Linux */
 int
 pg_numa_init(void)
@@ -46,7 +59,37 @@ pg_numa_init(void)
 int
 pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status)
 {
-	return numa_move_pages(pid, count, pages, NULL, status, 0);
+	unsigned long	next = 0;
+	int				ret = 0;
+
+	/*
+	 * Call numa_move_pages() in batches of at most NUMA_QUERY_BATCH_SIZE
+	 * pointers, to work around a kernel bug in do_pages_stat().
+	 */
+	while (next < count)
+	{
+		unsigned long count_batch = Min(count - next,
+										NUMA_QUERY_BATCH_SIZE);
+
+		/*
+		 * Bail out if any of the batches fails (ret < 0). A positive result
+		 * (ret > 0) is the number of nonmigrated pages, which we ignore
+		 * because we're not migrating any pages here.
+		 */
+		ret = numa_move_pages(pid, count_batch, &pages[next], NULL, &status[next], 0);
+		if (ret < 0)
+		{
+			/* plain error, return as is */
+			return ret;
+		}
+
+		next += count_batch;
+	}
+
+	/* should have consumed the input array exactly */
+	Assert(next == count);
+
+	return 0;
 }
 
 int
-- 
2.49.0

