public inbox for [email protected]  
help / color / mirror / Atom feed
From: Ilia Evdokimov <[email protected]>
To: David Geier <[email protected]>
To: Zsolt Parragi <[email protected]>
Cc: PostgreSQL Developers <[email protected]>
Subject: Re: Reduce planning time for large NOT IN lists containing NULL
Date: Wed, 25 Feb 2026 00:14:01 +0300
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
References: <[email protected]>
	<[email protected]>
	<[email protected]>
	<[email protected]>
	<[email protected]>
	<[email protected]>
	<[email protected]>
	<CAN4CZFNJHro_pEVTHggBX43f+AebwKLZdp39+V4pV0EB4BZuZw@mail.gmail.com>
	<[email protected]>
	<[email protected]>

On 2/24/26 11:29, David Geier wrote:

> Using array_contains_nulls() seems fine. In case the IN list doesn't
> contain NULL, the function can immediately bail thanks to the
> !ARR_HASNULL() check in the beginning.
>
> It only needs to iterate over the NULL-bitmap, if it exists. This is the
> case if there's actually a NULL element in the array, or if the array
> initially contained NULL and all NULLs got removed subsequently.
>
> If we ever find the latter case to matter we could remove the
> NULL-bitmap in array_set_element() / array_set_element_expanded(), when
> the last NULL element got removed.
>
>> Could you clarify what exactly this additional test meant to verify?
> Zsolt's test case creates an array that initially contains NULL. The
> NULL element is subsequently replaced by a non-NULL value but
> array_set_element_expanded() keeps the NULL-bitmap around. With that,
> your ARR_ISNULL() check bails and causes the selectivity estimation to
> incorrectly return 0.

Ah, right - thanks for the clarification. I agree.

Regarding the regression test: the suggestion test case is good, but 
there is not a straightforward way to expose the estimated row count 
without also showing the costs, and costs are unstable. To avoid that, I 
reused the parsing approach already present in stats_ext.sql to extract 
only the estimated row count from EXPLAIN.

Since the test table contains exactly 1000 rows and we run ANALYZE, all 
rows are included in the statistics sample. Therefore the estimate for x 
<> ALL(array[1, 99, 2]) is deterministically 997 rows, and the test 
stable and ensures we detect the incorrect early-zero estimate.

Let me know if you'd prefer a different approach. I've attached v4 patch.

-- 
Best regards,
Ilia Evdokimov,
Tantor Labs LLC,
https://tantorlabs.com/



Attachments:

  [text/x-patch] v4-0001-Reduce-planning-time-for-large-NOT-IN-lists-conta.patch (4.9K, 3-v4-0001-Reduce-planning-time-for-large-NOT-IN-lists-conta.patch)
  download | inline diff:
From a9dd1d1c9990a30173c5210570efccb034e8b5f4 Mon Sep 17 00:00:00 2001
From: Evdokimov Ilia <[email protected]>
Date: Wed, 25 Feb 2026 00:08:26 +0300
Subject: [PATCH v4] Reduce planning time for large NOT IN lists containing 
 NULL

For x <> ALL (...) / x NOT IN (...), the presence of a NULL element
makes the selectivity 0.0.

The planner currently still iterates over all elements and computes
per-element selectivity, even though the final result is known.

Add an early NULL check for constant arrays and immediately return
0.0 under ALL semantics.

This reduces planning time for large NOT IN / <> ALL lists without
changing semantics.
---
 src/backend/utils/adt/selfuncs.c          |  9 +++++
 src/test/regress/expected/expressions.out | 44 +++++++++++++++++++++++
 src/test/regress/sql/expressions.sql      | 41 +++++++++++++++++++++
 3 files changed, 94 insertions(+)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 29fec655593..eef3f0375a5 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -2018,6 +2018,11 @@ scalararraysel(PlannerInfo *root,
 		if (arrayisnull)		/* qual can't succeed if null array */
 			return (Selectivity) 0.0;
 		arrayval = DatumGetArrayTypeP(arraydatum);
+
+		/* Selectivity of "WHERE x NOT IN (NULL, ... )" is always 0 */
+		if (!useOr && array_contains_nulls(arrayval))
+			return (Selectivity) 0.0;
+
 		get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
 							 &elmlen, &elmbyval, &elmalign);
 		deconstruct_array(arrayval,
@@ -2115,6 +2120,10 @@ scalararraysel(PlannerInfo *root,
 			List	   *args;
 			Selectivity s2;
 
+			/* Selectivity of "WHERE x NOT IN (NULL, ... )" is always 0 */
+			if (!useOr && IsA(elem, Const) && ((Const *) elem)->constisnull)
+				return (Selectivity) 0.0;
+
 			/*
 			 * Theoretically, if elem isn't of nominal_element_type we should
 			 * insert a RelabelType, but it seems unlikely that any operator
diff --git a/src/test/regress/expected/expressions.out b/src/test/regress/expected/expressions.out
index 9a3c97b15a3..34f14a5775a 100644
--- a/src/test/regress/expected/expressions.out
+++ b/src/test/regress/expected/expressions.out
@@ -426,3 +426,47 @@ select * from inttest where a not in (0::myint,2::myint,3::myint,4::myint,5::myi
 (0 rows)
 
 rollback;
+-- Test <> ALL when array initially contained NULL but no longer does
+begin;
+create function check_estimated_rows(text) returns table (estimated int)
+language plpgsql as
+$$
+declare
+    ln text;
+    tmp text[];
+    first_row bool := true;
+begin
+    for ln in
+        execute format('explain %s', $1)
+    loop
+        if first_row then
+            first_row := false;
+            tmp := regexp_match(ln, 'rows=(\d*)');
+            return query select tmp[1]::int;
+        end if;
+    end loop;
+end;
+$$;
+create function replace_elem(arr int[], idx int, val int)
+returns int[] AS $$
+begin
+      arr[idx] := val;
+      return arr;
+end;
+$$ language plpgsql immutable;
+create table notin_test as select generate_series(1, 1000) as x;
+analyze notin_test;
+select * from check_estimated_rows('select * from notin_test where x <> all(array[1,99,3])');
+ estimated 
+-----------
+       997
+(1 row)
+
+-- same array, constructed from an array with a NULL
+select * from check_estimated_rows('select * from notin_test where x <> all(replace_elem(array[1,null,3], 2, 99))');
+ estimated 
+-----------
+       997
+(1 row)
+
+rollback;
diff --git a/src/test/regress/sql/expressions.sql b/src/test/regress/sql/expressions.sql
index e02c21f3368..ca94859bbf8 100644
--- a/src/test/regress/sql/expressions.sql
+++ b/src/test/regress/sql/expressions.sql
@@ -209,3 +209,44 @@ select * from inttest where a not in (1::myint,2::myint,3::myint,4::myint,5::myi
 select * from inttest where a not in (0::myint,2::myint,3::myint,4::myint,5::myint, null);
 
 rollback;
+
+-- Test <> ALL when array initially contained NULL but no longer does
+
+begin;
+
+create function check_estimated_rows(text) returns table (estimated int)
+language plpgsql as
+$$
+declare
+    ln text;
+    tmp text[];
+    first_row bool := true;
+begin
+    for ln in
+        execute format('explain %s', $1)
+    loop
+        if first_row then
+            first_row := false;
+            tmp := regexp_match(ln, 'rows=(\d*)');
+            return query select tmp[1]::int;
+        end if;
+    end loop;
+end;
+$$;
+
+create function replace_elem(arr int[], idx int, val int)
+returns int[] AS $$
+begin
+      arr[idx] := val;
+      return arr;
+end;
+$$ language plpgsql immutable;
+
+create table notin_test as select generate_series(1, 1000) as x;
+analyze notin_test;
+
+select * from check_estimated_rows('select * from notin_test where x <> all(array[1,99,3])');
+-- same array, constructed from an array with a NULL
+select * from check_estimated_rows('select * from notin_test where x <> all(replace_elem(array[1,null,3], 2, 99))');
+
+rollback;
\ No newline at end of file
-- 
2.34.1



view thread (9+ messages)  latest in thread

reply

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Reply to all the recipients using the --to and --cc options:
  reply via email

  To: [email protected]
  Cc: [email protected], [email protected], [email protected], [email protected]
  Subject: Re: Reduce planning time for large NOT IN lists containing NULL
  In-Reply-To: <[email protected]>

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

This inbox is served by agora; see mirroring instructions
for how to clone and mirror all data and code used for this inbox