From 7d668752ceb49b901571a96d156e0219da4e7c1f Mon Sep 17 00:00:00 2001
From: Evdokimov Ilia <ilya.evdokimov@tantorlabs.com>
Date: Wed, 25 Feb 2026 23:00:32 +0300
Subject: [PATCH v6 2/3] Use O(1) selectivity formula for eqsel/neqsel IN/ALL

Replace per-element iteration in ScalarArrayOpExpr selectivity
estimation with a closed-form probability formula in the cases where
every element is known to have the same eqsel()/neqsel() selectivity.

This preserves the existing independence/disjoint models while reducing
planning cost for large IN/ANY/ALL lists from O(N) to O(1) in those
cases; all other cases fall through to the existing code path.

Special handling is added for unique columns, using 1/reltuples as the
per-element equality selectivity.
---
 src/backend/utils/adt/selfuncs.c | 157 +++++++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index eef3f0375a5..f6091a576d8 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -184,6 +184,9 @@ get_relation_stats_hook_type get_relation_stats_hook = NULL;
 get_index_stats_hook_type get_index_stats_hook = NULL;
 
 static double eqsel_internal(PG_FUNCTION_ARGS, bool negate);
+static Selectivity calculate_combined_selectivity(Selectivity s2, int num_elems,
+							  bool useOr,
+							  bool isEquality, bool isInequality);
 static double eqjoinsel_inner(FmgrInfo *eqproc, Oid collation,
 							  Oid hashLeft, Oid hashRight,
 							  VariableStatData *vardata1, VariableStatData *vardata2,
@@ -1893,6 +1896,61 @@ strip_array_coercion(Node *node)
 	return node;
 }
 
+/*
+ * calculate_combined_selectivity
+ *
+ * Fold the selectivities of num_elems ScalarArrayOpExpr elements into a
+ * single estimate, given that each element has the same selectivity s2.
+ * The answer comes from a closed form, so no per-element loop is needed.
+ *
+ * isEquality and isInequality only decide whether the disjoint model may
+ * be tried: isEquality gates it when useOr is true, isInequality when
+ * useOr is false.
+ *
+ * useOr is true (x = ANY (...)):
+ *   independence model : 1 - (1 - s2)^N
+ *   disjoint model     : N * s2
+ *
+ * useOr is false (x <> ALL (...)):
+ *   independence model : s2^N
+ *   disjoint model     : 1 - N * (1 - s2)
+ *
+ * The disjoint estimate is returned whenever it is tried and falls within
+ * [0,1].  In every other case the independence estimate is used, clamped
+ * to a valid probability.
+ */
+static Selectivity
+calculate_combined_selectivity(Selectivity s2, int num_elems, bool useOr,
+							   bool isEquality, bool isInequality)
+{
+	Selectivity combined;
+
+	/* The disjoint model is considered first, when the caller allows it. */
+	if (useOr ? isEquality : isInequality)
+	{
+		Selectivity disjoint;
+
+		if (useOr)
+			disjoint = s2 * num_elems;
+		else
+			disjoint = 1.0 + num_elems * (s2 - 1.0);
+
+		/* Already a valid probability, so no clamping is required. */
+		if (disjoint >= 0.0 && disjoint <= 1.0)
+			return disjoint;
+	}
+
+	/* Otherwise treat the elements as independent. */
+	if (useOr)
+		combined = 1.0 - pow(1.0 - s2, num_elems);
+	else
+		combined = pow(s2, num_elems);
+
+	CLAMP_PROBABILITY(combined);
+
+	return combined;
+}
+
 /*
  *		scalararraysel		- Selectivity of ScalarArrayOpExpr Node.
  */
@@ -2030,6 +2088,72 @@ scalararraysel(PlannerInfo *root,
 						  elmlen, elmbyval, elmalign,
 						  &elem_values, &elem_nulls, &num_elems);
 
+		/*
+		 * Try to avoid the per-element selectivity loop for ScalarArrayOpExpr.
+		 *
+		 * For equality/inequality operators in restriction clauses, attempt
+		 * to derive a single per-element selectivity (s2) and combine it in
+		 * O(1) time with a closed-form formula instead of iterating.
+		 *
+		 * NOTE(review): num_elems counts NULL array elements too (elem_nulls
+		 * is not consulted here); confirm this matches the per-element path.
+		 */
+		if ((isEquality || isInequality) && !is_join_clause)
+		{
+			VariableStatData vardata;
+			Selectivity s2 = -1.0;	/* negative means "no estimate yet" */
+			Node       *other_op = NULL;
+			bool        var_on_left;
+
+			/*
+			 * If the clause is of the form "var OP something" or
+			 * "something OP var", extract statistics for the variable.
+			 * Otherwise, fall back to a default per-element estimate.
+			 */
+			if (get_restriction_variable(root, clause->args, varRelid, &vardata, &other_op, &var_on_left))
+			{
+				/*
+				 * Fast path for unique columns.
+				 *
+				 * If the variable is known to be unique and the relation
+				 * has at least one tuple, equality selectivity is taken as
+				 * 1 / reltuples.  Inequality is the complement of that minus
+				 * the null fraction, which is zero when there are no stats.
+				 */
+				if (vardata.isunique && vardata.rel && vardata.rel->tuples >= 1.0)
+				{
+					double		nullfrac = 0.0;
+
+					if (HeapTupleIsValid(vardata.statsTuple))
+						nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
+
+					s2 = 1.0 / vardata.rel->tuples;
+					if (isInequality)
+						s2 = 1.0 - s2 - nullfrac;
+
+					/* Clamp now so a negative value is not read as "unset". */
+					CLAMP_PROBABILITY(s2);
+				}
+				else if (isInequality)
+				{
+					/* No negator: use the default inequality estimate. */
+					if (!OidIsValid(get_negator(operator)))
+						s2 = 1.0 - DEFAULT_EQ_SEL;
+				}
+
+				ReleaseVariableStats(vardata);
+
+				/* s2 is still negative if no shared estimate was found. */
+				if (s2 >= 0.0)
+					return calculate_combined_selectivity(s2, num_elems, useOr, isEquality, isInequality);
+			}
+			else
+			{
+				s2 = (isInequality) ? (1.0 - DEFAULT_EQ_SEL) : DEFAULT_EQ_SEL;
+				return calculate_combined_selectivity(s2, num_elems, useOr, isEquality, isInequality);
+			}
+		}
+
 		/*
 		 * For generic operators, we assume the probability of success is
 		 * independent for each array element.  But for "= ANY" or "<> ALL",
@@ -2105,6 +2229,39 @@ scalararraysel(PlannerInfo *root,
 		get_typlenbyval(arrayexpr->element_typeid,
 						&elmlen, &elmbyval);
 
+		/*
+		 * Try to avoid the per-element selectivity loop for ScalarArrayOpExpr.
+		 *
+		 * For equality/inequality operators in restriction clauses, when no
+		 * variable can be identified, every element gets the same default
+		 * selectivity (s2), so combine it in O(1) time with a closed-form
+		 * formula instead of iterating over all elements.
+		 */
+		if ((isEquality || isInequality) && !is_join_clause)
+		{
+			VariableStatData vardata;
+			Selectivity s2 = -1.0;
+			Node	*other_op = NULL;
+			bool	var_on_left;
+			int	num_elems = list_length(arrayexpr->elements);
+
+			/*
+			 * If the clause is not "variable OP something" or "something
+			 * OP variable", combine the default per-element selectivity
+			 * and return; otherwise release the stats and fall through.
+			 */
+			if (!get_restriction_variable(root, clause->args, varRelid,
+										 &vardata, &other_op, &var_on_left))
+			{
+				s2 = (isInequality) ? (1.0 - DEFAULT_EQ_SEL) : DEFAULT_EQ_SEL;
+				s1 = calculate_combined_selectivity(s2, num_elems, useOr, isEquality, isInequality);
+
+				return s1;
+			}
+			else
+				ReleaseVariableStats(vardata);
+		}
+
 		/*
 		 * We use the assumption of disjoint probabilities here too, although
 		 * the odds of equal array elements are rather higher if the elements
-- 
2.34.1

