Received: from malur.postgresql.org ([217.196.149.56]) by arkaria.postgresql.org with esmtps (TLS1.3) tls TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 (Exim 4.94.2) (envelope-from ) id 1uU8Ju-00GA7n-1p for pgsql-hackers@arkaria.postgresql.org; Tue, 24 Jun 2025 18:24:34 +0000 Received: from localhost ([127.0.0.1] helo=malur.postgresql.org) by malur.postgresql.org with esmtp (Exim 4.94.2) (envelope-from ) id 1uU8Js-00E95c-5M for pgsql-hackers@arkaria.postgresql.org; Tue, 24 Jun 2025 18:24:32 +0000 Received: from magus.postgresql.org ([2a02:c0:301:0:ffff::29]) by malur.postgresql.org with esmtps (TLS1.3) tls TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 (Exim 4.94.2) (envelope-from ) id 1uU8Jr-00E95T-S2 for pgsql-hackers@lists.postgresql.org; Tue, 24 Jun 2025 18:24:32 +0000 Received: from mout-p-101.mailbox.org ([80.241.56.151]) by magus.postgresql.org with esmtps (TLS1.3) tls TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 (Exim 4.96) (envelope-from ) id 1uU8Jq-003s8w-0H for pgsql-hackers@lists.postgresql.org; Tue, 24 Jun 2025 18:24:32 +0000 Received: from smtp1.mailbox.org (smtp1.mailbox.org [IPv6:2001:67c:2050:b231:465::1]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256) (No client certificate requested) by mout-p-101.mailbox.org (Postfix) with ESMTPS id 4bRYJ8650nz9tCZ; Tue, 24 Jun 2025 20:24:24 +0200 (CEST) Date: Tue, 24 Jun 2025 20:24:22 +0200 From: Christoph Berg To: Bertrand Drouvot Cc: Tomas Vondra , Andres Freund , Tomas Vondra , pgsql-hackers@lists.postgresql.org Subject: Re: pgsql: Introduce pg_shmem_allocations_numa view Message-ID: References: <0643ae61-cf9d-482c-9b2c-fb861b24fd22@vondra.me> <6342f601-77de-4ee0-8c2a-3deb50ceac5b@vondra.me> <8649a4e3-c60d-4f37-aa6f-e7e7c14c581e@vondra.me> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="LFOCbQrkx6X8iSve" Content-Disposition: inline In-Reply-To: X-Rspamd-Queue-Id: 4bRYJ8650nz9tCZ List-Id: List-Help: List-Subscribe: List-Post: 
List-Owner: List-Archive: Archived-At: Precedence: bulk --LFOCbQrkx6X8iSve Content-Type: text/plain; charset=us-ascii Content-Disposition: inline Re: Bertrand Drouvot > Yes, something like: > > diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c > index c9ae3b45b76..070ad2f13e7 100644 > --- a/src/backend/storage/ipc/shmem.c > +++ b/src/backend/storage/ipc/shmem.c > @@ -689,8 +689,17 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) > CHECK_FOR_INTERRUPTS(); > } > > - if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1) > - elog(ERROR, "failed NUMA pages inquiry status: %m"); > + #define NUMA_QUERY_CHUNK_SIZE 16 /* has to be <= DO_PAGES_STAT_CHUNK_NR (do_pages_stat())*/ > + > + for (uint64 chunk_start = 0; chunk_start < shm_ent_page_count; chunk_start += NUMA_QUERY_CHUNK_SIZE) { > + uint64 chunk_size = Min(NUMA_QUERY_CHUNK_SIZE, shm_ent_page_count - chunk_start); > + > + if (pg_numa_query_pages(0, chunk_size, &page_ptrs[chunk_start], > + &pages_status[chunk_start]) == -1) > + elog(ERROR, "failed NUMA pages inquiry status: %m"); > + } > + > + #undef NUMA_QUERY_CHUNK_SIZE I uploaded a variant of this patch to Debian and it seems to have fixed the issue: https://buildd.debian.org/status/package.php?p=postgresql-18&suite=experimental (No reply from linux-mm yet.) Christoph --LFOCbQrkx6X8iSve Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename=move-pages32 Work around a Linux bug in move_pages In 32-bit mode on 64-bit kernels, move_pages() does not correctly advance to the next chunk. Work around this by not asking for more than 16 pages at once, so that the internal loop in move_pages() is not executed more than once. 
https://www.postgresql.org/message-id/flat/a3a4fe3d-1a80-4e03-aa8e-150ee15f6c35%40vondra.me#6abe7eaa802b5b07bb70cc3229e63a9f https://marc.info/?l=linux-mm&m=175077821909222&w=2 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -390,8 +390,15 @@ pg_buffercache_numa_pages(PG_FUNCTION_AR memset(os_page_status, 0xff, sizeof(int) * os_page_count); /* Query NUMA status for all the pointers */ - if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1) - elog(ERROR, "failed NUMA pages inquiry: %m"); +#define NUMA_QUERY_CHUNK_SIZE 16 /* has to be <= DO_PAGES_STAT_CHUNK_NR (do_pages_stat())*/ + for (uint64 chunk_start = 0; chunk_start < os_page_count; chunk_start += NUMA_QUERY_CHUNK_SIZE) { + uint64 chunk_size = Min(NUMA_QUERY_CHUNK_SIZE, os_page_count - chunk_start); + + if (pg_numa_query_pages(0, chunk_size, &os_page_ptrs[chunk_start], + &os_page_status[chunk_start]) == -1) + elog(ERROR, "failed NUMA pages inquiry status: %m"); + } +#undef NUMA_QUERY_CHUNK_SIZE /* Initialize the multi-call context, load entries about buffers */ --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -689,8 +689,15 @@ pg_get_shmem_allocations_numa(PG_FUNCTIO CHECK_FOR_INTERRUPTS(); } - if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1) - elog(ERROR, "failed NUMA pages inquiry status: %m"); +#define NUMA_QUERY_CHUNK_SIZE 16 /* has to be <= DO_PAGES_STAT_CHUNK_NR (do_pages_stat())*/ + for (uint64 chunk_start = 0; chunk_start < shm_ent_page_count; chunk_start += NUMA_QUERY_CHUNK_SIZE) { + uint64 chunk_size = Min(NUMA_QUERY_CHUNK_SIZE, shm_ent_page_count - chunk_start); + + if (pg_numa_query_pages(0, chunk_size, &page_ptrs[chunk_start], + &pages_status[chunk_start]) == -1) + elog(ERROR, "failed NUMA pages inquiry status: %m"); + } +#undef NUMA_QUERY_CHUNK_SIZE /* Count number of NUMA nodes used for this shared memory entry */ memset(nodes, 0, sizeof(Size) * 
(max_nodes + 1)); --LFOCbQrkx6X8iSve--