From 57695ce330b758fc1ac165c9c96c0a4dd397b5d0 Mon Sep 17 00:00:00 2001
From: Evdokimov Ilia <ilya.evdokimov@tantorlabs.com>
Date: Mon, 29 Dec 2025 23:28:51 +0300
Subject: [PATCH v1] Use hash-based matching for MCVs in ScalarArrayOpExpr
 selectivity

When estimating selectivity for ScalarArrayOpExpr (IN / ANY / ALL) with
available MCV statistics, the planner currently matches IN-list elements
against the MCV array using a nested loop, resulting in O(N*M) behavior
for large IN-lists and MCV lists.

Introduce a hash-based matching path, analogous to the one used for MCV
matching in join selectivity estimation.  Build a hash table over the MCV
values and probe it for each IN-list element, avoiding repeated linear
scans of the MCV array.

The hash table is built over the MCV list rather than the IN-list, since
MCVs are guaranteed to be distinct and non-NULL, while IN-lists may
contain duplicates, NULLs, or non-Const expressions.

The hash-based path is enabled only when both a sufficiently large
IN-list and an MCV list are present, and suitable hash functions exist
for the equality operator.
---
 src/backend/utils/adt/selfuncs.c | 390 ++++++++++++++++++++++++++++++-
 src/tools/pgindent/typedefs.list |   3 +
 2 files changed, 392 insertions(+), 1 deletion(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index a996f0c4939..fcdf3b7ac1e 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -146,7 +146,7 @@
 /*
  * In production builds, switch to hash-based MCV matching when the lists are
  * large enough to amortize hash setup cost.  (This threshold is compared to
- * the sum of the lengths of the two MCV lists.  This is simplistic but seems
+ * the sum of the lengths of the two lists.  This is simplistic but seems
  * to work well enough.)  In debug builds, we use a smaller threshold so that
  * the regression tests cover both paths well.
  */
@@ -156,6 +156,12 @@
 #define EQJOINSEL_MCV_HASH_THRESHOLD 20
 #endif
 
+#ifndef USE_ASSERT_CHECKING
+#define SCALARARRAY_MCV_HASH_THRESHOLD 200
+#else
+#define SCALARARRAY_MCV_HASH_THRESHOLD 20
+#endif
+
 /* Entries in the simplehash hash table used by eqjoinsel_find_matches */
 typedef struct MCVHashEntry
 {
@@ -176,14 +182,40 @@ typedef struct MCVHashContext
 	int16		hash_typlen;	/* typlen of hashed data type */
 } MCVHashContext;
 
+/* Entries in the simplehash hash table used by scalararray_mcv_hash_match */
+typedef struct MCVInHashEntry
+{
+	Datum		value;			/* the value represented by this entry */
+	int			index;			/* its index in the relevant AttStatsSlot */
+	uint32		hash;			/* hash code for the Datum */
+	char		status;			/* status code used by simplehash.h */
+} MCVInHashEntry;
+
+/* private_data for the simplehash hash table */
+typedef struct MCVInHashContext
+{
+	FunctionCallInfo equal_fcinfo;	/* the equality join operator */
+	FunctionCallInfo hash_fcinfo;	/* the hash function to use */
+	bool		insert_mode;	/* doing inserts or lookups? */
+	bool		hash_typbyval;	/* typbyval of hashed data type */
+	int16		hash_typlen;	/* typlen of hashed data type */
+} MCVInHashContext;
+
 /* forward reference */
 typedef struct MCVHashTable_hash MCVHashTable_hash;
+typedef struct MCVInHashTable_hash MCVInHashTable_hash;
 
 /* Hooks for plugins to get control when we ask for stats */
 get_relation_stats_hook_type get_relation_stats_hook = NULL;
 get_index_stats_hook_type get_index_stats_hook = NULL;
 
 static double eqsel_internal(PG_FUNCTION_ARGS, bool negate);
+static double scalararray_mcv_hash_match(VariableStatData *vardata, Oid operator, Oid collation,
+										 Node *other_op, bool var_on_left, Datum *elem_values,
+										 bool *elem_nulls, int num_elems,
+										 bool useOr, bool isEquality, bool isInequality);
+static uint32 hash_mcv_in(MCVInHashTable_hash *tab, Datum key);
+static bool mcvs_in_equal(MCVInHashTable_hash *tab, Datum key0, Datum key1);
 static double eqjoinsel_inner(FmgrInfo *eqproc, Oid collation,
 							  Oid hashLeft, Oid hashRight,
 							  VariableStatData *vardata1, VariableStatData *vardata2,
@@ -287,6 +319,19 @@ static double btcost_correlation(IndexOptInfo *index,
 #define SH_DECLARE
 #include "lib/simplehash.h"
 
+#define SH_PREFIX				MCVInHashTable
+#define SH_ELEMENT_TYPE			MCVInHashEntry
+#define SH_KEY_TYPE				Datum
+#define SH_KEY					value
+#define SH_HASH_KEY(tab,key)	hash_mcv_in(tab, key)
+#define SH_EQUAL(tab,key0,key1)	mcvs_in_equal(tab, key0, key1)
+#define SH_SCOPE				static inline
+#define SH_STORE_HASH
+#define SH_GET_HASH(tab,ent)	(ent)->hash
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
 
 /*
  *		eqsel			- Selectivity of "=" for any data types.
@@ -2025,6 +2070,35 @@ scalararraysel(PlannerInfo *root,
 						  elmlen, elmbyval, elmalign,
 						  &elem_values, &elem_nulls, &num_elems);
 
+		/*
+		 * Try to calculate selectivity by hash-search O(N) instead of O(N^2)
+		 * in case of MCV matching.
+		 */
+		if ((isEquality || isInequality) && !is_join_clause)
+		{
+			VariableStatData vardata;
+			Node	   *other_op = NULL;
+			bool		var_on_left;
+
+			/*
+			 * If expression is not variable = something or something =
+			 * variable, then punt and return a default estimate.
+			 */
+			if (get_restriction_variable(root, clause->args, varRelid,
+										 &vardata, &other_op, &var_on_left))
+			{
+				s1 = scalararray_mcv_hash_match(&vardata, operator, clause->inputcollid,
+												other_op, var_on_left, elem_values,
+												elem_nulls, num_elems,
+												useOr, isEquality, isInequality);
+
+				ReleaseVariableStats(vardata);
+
+				if (s1 >= 0.0)
+					return s1;
+			}
+		}
+
 		/*
 		 * For generic operators, we assume the probability of success is
 		 * independent for each array element.  But for "= ANY" or "<> ALL",
@@ -2210,6 +2284,320 @@ scalararraysel(PlannerInfo *root,
 	return s1;
 }
 
+/*
+ * Estimate selectivity of a ScalarArrayOpExpr using MCV statistics and,
+ * when beneficial, a hash table for efficient matching.
+ *
+ * The function evaluates selectivity by comparing each element of the
+ * IN-list against the column's most-common values (MCVs).
+ *
+ * Inputs:
+ *	vardata: statistics and metadata for the variable being estimated
+ *	operator: equality or inequality operator to apply
+ *	collation: OID of collation to use
+ *	other_op: expression for the non-variable side of the comparison
+ *	var_on_left: true if the variable is on the left side of the operator
+ *	elem_values: array of IN-list element values
+ *	elem_nulls: array indicating which IN-list elements are NULL
+ *	num_elems: number of IN-list elements
+ *	useOr: true if elements are combined using OR semantics, false for AND
+ *	isEquality: true if the operator is equality
+ *	isInequality: true if the operator is inequality
+ *
+ * Result:
+ *	Returns a selectivity estimate in the range [0.0, 1.0], or -1.0 if the
+ *	selectivity cannot be reliably estimated by this function.
+ */
+static double
+scalararray_mcv_hash_match(VariableStatData *vardata, Oid operator, Oid collation,
+						   Node *other_op, bool var_on_left, Datum *elem_values,
+						   bool *elem_nulls, int num_elems,
+						   bool useOr, bool isEquality, bool isInequality)
+{
+	Form_pg_statistic stats;
+	AttStatsSlot sslot;
+	FmgrInfo	eqproc;
+	double		selec = -1.0,
+				s1disjoint,
+				nullfrac = 0.0;
+	Oid			hashLeft = InvalidOid,
+				hashRight = InvalidOid,
+				opfuncoid;
+	bool		have_mcvs = false;
+
+	Assert(isEquality || isInequality);
+
+	/*
+	 * If we matched the var to a unique index, DISTINCT or GROUP-BY clause,
+	 * assume there is exactly one match regardless of anything else.  (This
+	 * is slightly bogus, since the index or clause's equality operator might
+	 * be different from ours, but it's much more likely to be right than
+	 * ignoring the information.)
+	 */
+	if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0)
+		return -1.0;
+
+	/*
+	 * When asked about <>, we do the estimation using the corresponding =
+	 * operator.
+	 */
+	if (isInequality)
+	{
+		operator = get_negator(operator);
+		if (!OidIsValid(operator))
+			return -1.0;
+	}
+
+	opfuncoid = get_opcode(operator);
+
+	memset(&sslot, 0, sizeof(sslot));
+
+	if (HeapTupleIsValid(vardata->statsTuple))
+	{
+		if (statistic_proc_security_check(vardata, opfuncoid))
+			have_mcvs = get_attstatsslot(&sslot, vardata->statsTuple,
+										 STATISTIC_KIND_MCV, InvalidOid,
+										 ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
+	}
+
+	if (have_mcvs)
+	{
+		fmgr_info(opfuncoid, &eqproc);
+
+		/*
+		 * If the combined size of the MCV list and the IN-list is large
+		 * enough to justify hashing, attempt to look up hash functions for
+		 * the operator.
+		 */
+		if (sslot.nvalues + num_elems >= SCALARARRAY_MCV_HASH_THRESHOLD)
+			(void) get_op_hash_functions(operator, &hashLeft, &hashRight);
+	}
+
+	if (have_mcvs && OidIsValid(hashLeft) && OidIsValid(hashRight))
+	{
+		/* Use a hash table to speed up the matching */
+		LOCAL_FCINFO(fcinfo, 2);
+		LOCAL_FCINFO(hash_fcinfo, 1);
+		MCVInHashTable_hash *hashTable;
+		FmgrInfo	hash_proc;
+		MCVInHashContext hashContext;
+		double		sumallcommon = 0.0,
+					remaining_selec = 0.0;
+		bool		isdefault;
+		double		otherdistinct;
+
+		/* Grab the nullfrac for use below. */
+		stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+		nullfrac = stats->stanullfrac;
+
+		selec = s1disjoint = (useOr ? 0.0 : 1.0);
+
+		InitFunctionCallInfoData(*fcinfo, &eqproc, 2, collation,
+								 NULL, NULL);
+		fcinfo->args[0].isnull = false;
+		fcinfo->args[1].isnull = false;
+
+		/*
+		 * Build the hash table using the MCV array as keys, rather than the
+		 * IN-list elements.
+		 *
+		 * The MCV list is guaranteed to contain distinct values. Hashing the
+		 * MCVs allows us to efficiently identify whether a given IN-list
+		 * element matches a most-common value, which is exactly what is
+		 * needed for accurate selectivity estimation.
+		 */
+		fmgr_info(hashLeft, &hash_proc);
+		InitFunctionCallInfoData(*hash_fcinfo, &hash_proc, 1, collation,
+								 NULL, NULL);
+		hash_fcinfo->args[0].isnull = false;
+
+		hashContext.equal_fcinfo = fcinfo;
+		hashContext.hash_fcinfo = hash_fcinfo;
+		hashContext.insert_mode = true;
+
+		get_typlenbyval(sslot.valuetype,
+						&hashContext.hash_typlen,
+						&hashContext.hash_typbyval);
+
+		hashTable = MCVInHashTable_create(CurrentMemoryContext,
+										  sslot.nvalues,
+										  &hashContext);
+
+		for (int i = 0; i < sslot.nvalues; i++)
+		{
+			bool		found = false;
+			MCVInHashEntry *entry;
+
+			entry = MCVInHashTable_insert(hashTable, sslot.values[i], &found);
+
+			/*
+			 * MCVHashTable_insert will only report "found" if the new value
+			 * is equal to some previous one per datum_image_eq().  That
+			 * probably shouldn't happen, since we're not expecting duplicates
+			 * in the MCV list.  If we do find a dup, just ignore it, leaving
+			 * the hash entry's index pointing at the first occurrence.  That
+			 * matches the behavior that the non-hashed code path would have.
+			 */
+			if (likely(!found))
+				entry->index = i;
+
+			sumallcommon += sslot.numbers[i];
+		}
+
+		/*
+		 * Prepare to probe the hash table.  If the probe values are of a
+		 * different data type, then we need to change hash functions.  (This
+		 * code relies on the assumption that since we defined SH_STORE_HASH,
+		 * simplehash.h will never need to compute hash values for existing
+		 * hash table entries.)
+		 */
+		hashContext.insert_mode = false;
+		if (hashLeft != hashRight)
+		{
+
+			fmgr_info(hashRight, &hash_proc);
+			/* Resetting hash_fcinfo is probably unnecessary, but be safe */
+			InitFunctionCallInfoData(*hash_fcinfo, &hash_proc, 1, collation,
+									 NULL, NULL);
+			hash_fcinfo->args[0].isnull = false;
+		}
+
+		/*
+		 * Comparison is against a constant that is neither NULL nor any of
+		 * the common values.  Its selectivity cannot be more than this:
+		 */
+		remaining_selec = 1.0 - sumallcommon - nullfrac;
+		CLAMP_PROBABILITY(remaining_selec);
+
+		/*
+		 * and in fact it's probably a good deal less. We approximate that all
+		 * the not-common values share this remaining fraction equally, so we
+		 * divide by the number of other distinct values.
+		 */
+		otherdistinct = get_variable_numdistinct(vardata, &isdefault) - sslot.nnumbers;
+		if (otherdistinct > 1)
+			remaining_selec /= otherdistinct;
+
+		/*
+		 * Another cross-check: selectivity shouldn't be estimated as more
+		 * than the least common "most common value".
+		 */
+		if (sslot.nnumbers > 0 && remaining_selec > sslot.numbers[sslot.nnumbers - 1])
+			remaining_selec = sslot.numbers[sslot.nnumbers - 1];
+
+		/* Evaluate selectivity contribution of each IN-list element. */
+		for (int i = 0; i < num_elems; i++)
+		{
+			MCVInHashEntry *entry;
+			Selectivity s1;
+
+			/*
+			 * If the constant is NULL, assume operator is strict and return
+			 * s1 zero. (It's zero even for a negator op.)
+			 */
+			if (elem_nulls[i])
+				continue;
+
+			if (IsA(other_op, Const))
+			{
+				entry = MCVInHashTable_lookup(hashTable, elem_values[i]);
+
+				if (entry != NULL)
+				{
+					/*
+					 * As in the other code path, skip already-matched hash
+					 * entries
+					 */
+					s1 = sslot.numbers[entry->index];
+				}
+				else
+					s1 = remaining_selec;
+
+				if (isInequality)
+					s1 = 1.0 - s1 - nullfrac;
+			}
+			else
+				s1 = var_eq_non_const(vardata, operator, collation, other_op,
+									  var_on_left, isInequality);
+
+			CLAMP_PROBABILITY(s1);
+
+			if (useOr)
+			{
+				selec = selec + s1 - selec * s1;
+				if (isEquality)
+					s1disjoint += s1;
+			}
+			else
+			{
+				selec = selec * s1;
+				if (isInequality)
+					s1disjoint += s1 - 1.0;
+			}
+		}
+
+		MCVInHashTable_destroy(hashTable);
+
+		/* accept disjoint-probability estimate if in range */
+		if ((useOr ? isEquality : isInequality) &&
+			s1disjoint >= 0.0 && s1disjoint <= 1.0)
+			selec = s1disjoint;
+
+		CLAMP_PROBABILITY(selec);
+	}
+
+	free_attstatsslot(&sslot);
+
+	return selec;
+}
+
+/*
+ * Support functions for the hash tables used by eqjoinsel_find_matches
+ */
+static uint32
+hash_mcv_in(MCVInHashTable_hash *tab, Datum key)
+{
+	MCVInHashContext *context = (MCVInHashContext *) tab->private_data;
+	FunctionCallInfo fcinfo = context->hash_fcinfo;
+	Datum		fresult;
+
+	fcinfo->args[0].value = key;
+	fcinfo->isnull = false;
+	fresult = FunctionCallInvoke(fcinfo);
+	Assert(!fcinfo->isnull);
+	return DatumGetUInt32(fresult);
+}
+
+static bool
+mcvs_in_equal(MCVInHashTable_hash *tab, Datum key0, Datum key1)
+{
+	MCVInHashContext *context = (MCVInHashContext *) tab->private_data;
+
+	if (context->insert_mode)
+	{
+		/*
+		 * During the insertion step, any comparisons will be between two
+		 * Datums of the hash table's data type, so if the given operator is
+		 * cross-type it will be the wrong thing to use.  Fortunately, we can
+		 * use datum_image_eq instead.  The MCV values should all be distinct
+		 * anyway, so it's mostly pro-forma to compare them at all.
+		 */
+		return datum_image_eq(key0, key1,
+							  context->hash_typbyval, context->hash_typlen);
+	}
+	else
+	{
+		FunctionCallInfo fcinfo = context->equal_fcinfo;
+		Datum		fresult;
+
+		fcinfo->args[0].value = key0;
+		fcinfo->args[1].value = key1;
+		fcinfo->isnull = false;
+		fresult = FunctionCallInvoke(fcinfo);
+		return (!fcinfo->isnull && DatumGetBool(fresult));
+	}
+}
+
 /*
  * Estimate number of elements in the array yielded by an expression.
  *
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index ceb3fc5d980..17f5cc9476b 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1666,6 +1666,9 @@ MBuf
 MCVHashContext
 MCVHashEntry
 MCVHashTable_hash
+MCVInHashContext
+MCVInHashEntry
+MCVInHashTable_hash
 MCVItem
 MCVList
 MEMORY_BASIC_INFORMATION
-- 
2.34.1

