From 12665ca19f802a2ace50525cf5df8a6f95e860df Mon Sep 17 00:00:00 2001 From: Maxime Schoemans Date: Thu, 16 Apr 2026 16:28:17 +0200 Subject: [PATCH v7 3/3] Remove duplicate selectivity functions between range and multirange The multirange selectivity code duplicated 10 helper functions from rangetypes_selfuncs.c. Since both range and multirange types store the same statistics (STATISTIC_KIND_BOUNDS_HISTOGRAM and STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM) and use the same RangeBound representation, the functions are identical. Make the 10 shared functions non-static in rangetypes_selfuncs.c, export them via a new rangetypes_selfuncs.h header, and remove the copies from multirangetypes_selfuncs.c. --- .../utils/adt/multirangetypes_selfuncs.c | 772 +----------------- src/backend/utils/adt/rangetypes_selfuncs.c | 46 +- src/include/utils/rangetypes_selfuncs.h | 54 ++ 3 files changed, 67 insertions(+), 805 deletions(-) create mode 100644 src/include/utils/rangetypes_selfuncs.h diff --git a/src/backend/utils/adt/multirangetypes_selfuncs.c b/src/backend/utils/adt/multirangetypes_selfuncs.c index 241f8c6dbe0..fa5f23d09a9 100644 --- a/src/backend/utils/adt/multirangetypes_selfuncs.c +++ b/src/backend/utils/adt/multirangetypes_selfuncs.c @@ -27,6 +27,7 @@ #include "utils/lsyscache.h" #include "utils/multirangetypes.h" #include "utils/rangetypes.h" +#include "utils/rangetypes_selfuncs.h" #include "utils/selfuncs.h" #include "utils/typcache.h" @@ -38,37 +39,6 @@ static double calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata, const MultirangeType *constval, Oid operator); -static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache, - const RangeBound *constbound, - const RangeBound *hist, - int hist_nvalues, bool equal); -static int rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value, - const RangeBound *hist, int hist_length, bool equal); -static float8 get_position(TypeCacheEntry *typcache, const RangeBound *value, - const RangeBound *hist1, const RangeBound *hist2); -static 
float8 get_len_position(double value, double hist1, double hist2); -static float8 get_distance(TypeCacheEntry *typcache, const RangeBound *bound1, - const RangeBound *bound2); -static int length_hist_bsearch(const Datum *length_hist_values, - int length_hist_nvalues, double value, - bool equal); -static double calc_length_hist_frac(const Datum *length_hist_values, - int length_hist_nvalues, double length1, - double length2, bool equal); -static double calc_hist_selectivity_contained(TypeCacheEntry *typcache, - const RangeBound *lower, - RangeBound *upper, - const RangeBound *hist_lower, - int hist_nvalues, - const Datum *length_hist_values, - int length_hist_nvalues); -static double calc_hist_selectivity_contains(TypeCacheEntry *typcache, - const RangeBound *lower, - const RangeBound *upper, - const RangeBound *hist_lower, - int hist_nvalues, - const Datum *length_hist_values, - int length_hist_nvalues); /* * Returns a default selectivity estimate for given operator, when we don't @@ -698,746 +668,6 @@ calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata, return hist_selec; } - -/* - * Look up the fraction of values less than (or equal, if 'equal' argument - * is true) a given const in a histogram of range bounds. - */ -static double -calc_hist_selectivity_scalar(TypeCacheEntry *typcache, const RangeBound *constbound, - const RangeBound *hist, int hist_nvalues, bool equal) -{ - Selectivity selec; - int index; - - /* - * Find the histogram bin the given constant falls into. Estimate - * selectivity as the number of preceding whole bins. 
- */ - index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal); - selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1); - - /* Adjust using linear interpolation within the bin */ - if (index >= 0 && index < hist_nvalues - 1) - selec += get_position(typcache, constbound, &hist[index], - &hist[index + 1]) / (Selectivity) (hist_nvalues - 1); - - return selec; -} - -/* - * Binary search on an array of range bounds. Returns greatest index of range - * bound in array which is less(less or equal) than given range bound. If all - * range bounds in array are greater or equal(greater) than given range bound, - * return -1. When "equal" flag is set conditions in brackets are used. - * - * This function is used in scalar operator selectivity estimation. Another - * goal of this function is to find a histogram bin where to stop - * interpolation of portion of bounds which are less than or equal to given bound. - */ -static int -rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value, const RangeBound *hist, - int hist_length, bool equal) -{ - int lower = -1, - upper = hist_length - 1, - cmp, - middle; - - while (lower < upper) - { - middle = (lower + upper + 1) / 2; - cmp = range_cmp_bounds(typcache, &hist[middle], value); - - if (cmp < 0 || (equal && cmp == 0)) - lower = middle; - else - upper = middle - 1; - } - return lower; -} - - -/* - * Binary search on length histogram. Returns greatest index of range length in - * histogram which is less than (less than or equal) the given length value. If - * all lengths in the histogram are greater than (greater than or equal) the - * given length, returns -1. 
- */ -static int -length_hist_bsearch(const Datum *length_hist_values, int length_hist_nvalues, - double value, bool equal) -{ - int lower = -1, - upper = length_hist_nvalues - 1, - middle; - - while (lower < upper) - { - double middleval; - - middle = (lower + upper + 1) / 2; - - middleval = DatumGetFloat8(length_hist_values[middle]); - if (middleval < value || (equal && middleval <= value)) - lower = middle; - else - upper = middle - 1; - } - return lower; -} - -/* - * Get relative position of value in histogram bin in [0,1] range. - */ -static float8 -get_position(TypeCacheEntry *typcache, const RangeBound *value, const RangeBound *hist1, - const RangeBound *hist2) -{ - bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid); - float8 position; - - if (!hist1->infinite && !hist2->infinite) - { - float8 bin_width; - - /* - * Both bounds are finite. Assuming the subtype's comparison function - * works sanely, the value must be finite, too, because it lies - * somewhere between the bounds. If it doesn't, arbitrarily return - * 0.5. - */ - if (value->infinite) - return 0.5; - - /* Can't interpolate without subdiff function */ - if (!has_subdiff) - return 0.5; - - /* Calculate relative position using subdiff function. */ - bin_width = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo, - typcache->rng_collation, - hist2->val, - hist1->val)); - if (isnan(bin_width) || bin_width <= 0.0) - return 0.5; /* punt for NaN or zero-width bin */ - - position = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo, - typcache->rng_collation, - value->val, - hist1->val)) - / bin_width; - - if (isnan(position)) - return 0.5; /* punt for NaN from subdiff, Inf/Inf, etc */ - - /* Relative position must be in [0,1] range */ - position = Max(position, 0.0); - position = Min(position, 1.0); - return position; - } - else if (hist1->infinite && !hist2->infinite) - { - /* - * Lower bin boundary is -infinite, upper is finite. 
If the value is - * -infinite, return 0.0 to indicate it's equal to the lower bound. - * Otherwise return 1.0 to indicate it's infinitely far from the lower - * bound. - */ - return ((value->infinite && value->lower) ? 0.0 : 1.0); - } - else if (!hist1->infinite && hist2->infinite) - { - /* same as above, but in reverse */ - return ((value->infinite && !value->lower) ? 1.0 : 0.0); - } - else - { - /* - * If both bin boundaries are infinite, they should be equal to each - * other, and the value should also be infinite and equal to both - * bounds. (But don't Assert that, to avoid crashing if a user creates - * a datatype with a broken comparison function). - * - * Assume the value to lie in the middle of the infinite bounds. - */ - return 0.5; - } -} - - -/* - * Get relative position of value in a length histogram bin in [0,1] range. - */ -static double -get_len_position(double value, double hist1, double hist2) -{ - if (!isinf(hist1) && !isinf(hist2)) - { - /* - * Both bounds are finite. The value should be finite too, because it - * lies somewhere between the bounds. If it doesn't, just return - * something. - */ - if (isinf(value)) - return 0.5; - - return 1.0 - (hist2 - value) / (hist2 - hist1); - } - else if (isinf(hist1) && !isinf(hist2)) - { - /* - * Lower bin boundary is -infinite, upper is finite. Return 1.0 to - * indicate the value is infinitely far from the lower bound. - */ - return 1.0; - } - else if (isinf(hist1) && isinf(hist2)) - { - /* same as above, but in reverse */ - return 0.0; - } - else - { - /* - * If both bin boundaries are infinite, they should be equal to each - * other, and the value should also be infinite and equal to both - * bounds. (But don't Assert that, to avoid crashing unnecessarily if - * the caller messes up) - * - * Assume the value to lie in the middle of the infinite bounds. - */ - return 0.5; - } -} - -/* - * Measure distance between two range bounds. 
- */ -static float8 -get_distance(TypeCacheEntry *typcache, const RangeBound *bound1, const RangeBound *bound2) -{ - bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid); - - if (!bound1->infinite && !bound2->infinite) - { - /* - * Neither bound is infinite, use subdiff function or return default - * value of 1.0 if no subdiff is available. - */ - if (has_subdiff) - { - float8 res; - - res = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo, - typcache->rng_collation, - bound2->val, - bound1->val)); - /* Reject possible NaN result, also negative result */ - if (isnan(res) || res < 0.0) - return 1.0; - else - return res; - } - else - return 1.0; - } - else if (bound1->infinite && bound2->infinite) - { - /* Both bounds are infinite */ - if (bound1->lower == bound2->lower) - return 0.0; - else - return get_float8_infinity(); - } - else - { - /* One bound is infinite, the other is not */ - return get_float8_infinity(); - } -} - -/* - * Calculate the average of function P(x), in the interval [length1, length2], - * where P(x) is the fraction of tuples with length < x (or length <= x if - * 'equal' is true). - */ -static double -calc_length_hist_frac(const Datum *length_hist_values, int length_hist_nvalues, - double length1, double length2, bool equal) -{ - double frac; - double A, - B, - PA, - PB; - double pos; - int i; - double area; - - Assert(length2 >= length1); - - if (length2 < 0.0) - return 0.0; /* shouldn't happen, but doesn't hurt to check */ - - /* All lengths in the table are <= infinite. */ - if (isinf(length2) && equal) - return 1.0; - - /*---------- - * The average of a function between A and B can be calculated by the - * formula: - * - * B - * 1 / - * ------- | P(x)dx - * B - A / - * A - * - * The geometrical interpretation of the integral is the area under the - * graph of P(x). P(x) is defined by the length histogram. We calculate - * the area in a piecewise fashion, iterating through the length histogram - * bins. 
Each bin is a trapezoid: - * - * P(x2) - * /| - * / | - * P(x1)/ | - * | | - * | | - * ---+---+-- - * x1 x2 - * - * where x1 and x2 are the boundaries of the current histogram, and P(x1) - * and P(x1) are the cumulative fraction of tuples at the boundaries. - * - * The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1) - * - * The first bin contains the lower bound passed by the caller, so we - * use linear interpolation between the previous and next histogram bin - * boundary to calculate P(x1). Likewise for the last bin: we use linear - * interpolation to calculate P(x2). For the bins in between, x1 and x2 - * lie on histogram bin boundaries, so P(x1) and P(x2) are simply: - * P(x1) = (bin index) / (number of bins) - * P(x2) = (bin index + 1 / (number of bins) - */ - - /* First bin, the one that contains lower bound */ - i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal); - if (i >= length_hist_nvalues - 1) - return 1.0; - - if (i < 0) - { - i = 0; - pos = 0.0; - } - else - { - /* interpolate length1's position in the bin */ - pos = get_len_position(length1, - DatumGetFloat8(length_hist_values[i]), - DatumGetFloat8(length_hist_values[i + 1])); - } - PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1); - B = length1; - - /* - * In the degenerate case that length1 == length2, simply return - * P(length1). This is not merely an optimization: if length1 == length2, - * we'd divide by zero later on. - */ - if (length2 == length1) - return PB; - - /* - * Loop through all the bins, until we hit the last bin, the one that - * contains the upper bound. 
(if lower and upper bounds are in the same - * bin, this falls out immediately) - */ - area = 0.0; - for (; i < length_hist_nvalues - 1; i++) - { - double bin_upper = DatumGetFloat8(length_hist_values[i + 1]); - - /* check if we've reached the last bin */ - if (!(bin_upper < length2 || (equal && bin_upper <= length2))) - break; - - /* the upper bound of previous bin is the lower bound of this bin */ - A = B; - PA = PB; - - B = bin_upper; - PB = (double) i / (double) (length_hist_nvalues - 1); - - /* - * Add the area of this trapezoid to the total. The point of the - * if-check is to avoid NaN, in the corner case that PA == PB == 0, - * and B - A == Inf. The area of a zero-height trapezoid (PA == PB == - * 0) is zero, regardless of the width (B - A). - */ - if (PA > 0 || PB > 0) - area += 0.5 * (PB + PA) * (B - A); - } - - /* Last bin */ - A = B; - PA = PB; - - B = length2; /* last bin ends at the query upper bound */ - if (i >= length_hist_nvalues - 1) - pos = 0.0; - else - { - if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + 1])) - pos = 0.0; - else - pos = get_len_position(length2, - DatumGetFloat8(length_hist_values[i]), - DatumGetFloat8(length_hist_values[i + 1])); - } - PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1); - - if (PA > 0 || PB > 0) - area += 0.5 * (PB + PA) * (B - A); - - /* - * Ok, we have calculated the area, ie. the integral. Divide by width to - * get the requested average. - * - * Avoid NaN arising from infinite / infinite. This happens at least if - * length2 is infinite. It's not clear what the correct value would be in - * that case, so 0.5 seems as good as any value. - */ - if (isinf(area) && isinf(length2)) - frac = 0.5; - else - frac = area / (length2 - length1); - - return frac; -} - -/* - * Calculate selectivity of "var <@ const" operator, ie. estimate the fraction - * of multiranges that fall within the constant lower and upper bounds. 
This uses - * the histograms of range lower bounds and range lengths, on the assumption - * that the range lengths are independent of the lower bounds. - * - * The caller has already checked that constant lower and upper bounds are - * finite. - */ -static double -calc_hist_selectivity_contained(TypeCacheEntry *typcache, - const RangeBound *lower, RangeBound *upper, - const RangeBound *hist_lower, int hist_nvalues, - const Datum *length_hist_values, int length_hist_nvalues) -{ - int i, - upper_index; - float8 prev_dist; - double bin_width; - double upper_bin_width; - double sum_frac; - - /* - * Begin by finding the bin containing the upper bound, in the lower bound - * histogram. Any range with a lower bound > constant upper bound can't - * match, ie. there are no matches in bins greater than upper_index. - */ - upper->inclusive = !upper->inclusive; - upper->lower = true; - upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues, - false); - - /* - * If the upper bound value is below the histogram's lower limit, there - * are no matches. - */ - if (upper_index < 0) - return 0.0; - - /* - * If the upper bound value is at or beyond the histogram's upper limit, - * start our loop at the last actual bin, as though the upper bound were - * within that bin; get_position will clamp its result to 1.0 anyway. - * (This corresponds to assuming that the data population above the - * histogram's upper limit is empty, exactly like what we just assumed for - * the lower limit.) - */ - upper_index = Min(upper_index, hist_nvalues - 2); - - /* - * Calculate upper_bin_width, ie. the fraction of the (upper_index, - * upper_index + 1) bin which is greater than upper bound of query range - * using linear interpolation of subdiff function. 
- */ - upper_bin_width = get_position(typcache, upper, - &hist_lower[upper_index], - &hist_lower[upper_index + 1]); - - /* - * In the loop, dist and prev_dist are the distance of the "current" bin's - * lower and upper bounds from the constant upper bound. - * - * bin_width represents the width of the current bin. Normally it is 1.0, - * meaning a full width bin, but can be less in the corner cases: start - * and end of the loop. We start with bin_width = upper_bin_width, because - * we begin at the bin containing the upper bound. - */ - prev_dist = 0.0; - bin_width = upper_bin_width; - - sum_frac = 0.0; - for (i = upper_index; i >= 0; i--) - { - double dist; - double length_hist_frac; - bool final_bin = false; - - /* - * dist -- distance from upper bound of query range to lower bound of - * the current bin in the lower bound histogram. Or to the lower bound - * of the constant range, if this is the final bin, containing the - * constant lower bound. - */ - if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0) - { - dist = get_distance(typcache, lower, upper); - - /* - * Subtract from bin_width the portion of this bin that we want to - * ignore. - */ - bin_width -= get_position(typcache, lower, &hist_lower[i], - &hist_lower[i + 1]); - if (bin_width < 0.0) - bin_width = 0.0; - final_bin = true; - } - else - dist = get_distance(typcache, &hist_lower[i], upper); - - /* - * Estimate the fraction of tuples in this bin that are narrow enough - * to not exceed the distance to the upper bound of the query range. - */ - length_hist_frac = calc_length_hist_frac(length_hist_values, - length_hist_nvalues, - prev_dist, dist, true); - - /* - * Add the fraction of tuples in this bin, with a suitable length, to - * the total. - */ - sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1); - - if (final_bin) - break; - - bin_width = 1.0; - prev_dist = dist; - } - - return sum_frac; -} - -/* - * Calculate selectivity of "var @> const" operator, ie. 
estimate the fraction - * of multiranges that contain the constant lower and upper bounds. This uses - * the histograms of range lower bounds and range lengths, on the assumption - * that the range lengths are independent of the lower bounds. - */ -static double -calc_hist_selectivity_contains(TypeCacheEntry *typcache, - const RangeBound *lower, const RangeBound *upper, - const RangeBound *hist_lower, int hist_nvalues, - const Datum *length_hist_values, int length_hist_nvalues) -{ - int i, - lower_index; - double bin_width, - lower_bin_width; - double sum_frac; - float8 prev_dist; - - /* Find the bin containing the lower bound of query range. */ - lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues, - true); - - /* - * If the lower bound value is below the histogram's lower limit, there - * are no matches. - */ - if (lower_index < 0) - return 0.0; - - /* - * If the lower bound value is at or beyond the histogram's upper limit, - * start our loop at the last actual bin, as though the upper bound were - * within that bin; get_position will clamp its result to 1.0 anyway. - * (This corresponds to assuming that the data population above the - * histogram's upper limit is empty, exactly like what we just assumed for - * the lower limit.) - */ - lower_index = Min(lower_index, hist_nvalues - 2); - - /* - * Calculate lower_bin_width, ie. the fraction of the of (lower_index, - * lower_index + 1) bin which is greater than lower bound of query range - * using linear interpolation of subdiff function. - */ - lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index], - &hist_lower[lower_index + 1]); - - /* - * Loop through all the lower bound bins, smaller than the query lower - * bound. In the loop, dist and prev_dist are the distance of the - * "current" bin's lower and upper bounds from the constant upper bound. 
- * We begin from query lower bound, and walk backwards, so the first bin's - * upper bound is the query lower bound, and its distance to the query - * upper bound is the length of the query range. - * - * bin_width represents the width of the current bin. Normally it is 1.0, - * meaning a full width bin, except for the first bin, which is only - * counted up to the constant lower bound. - */ - prev_dist = get_distance(typcache, lower, upper); - sum_frac = 0.0; - bin_width = lower_bin_width; - for (i = lower_index; i >= 0; i--) - { - float8 dist; - double length_hist_frac; - - /* - * dist -- distance from upper bound of query range to current value - * of lower bound histogram or lower bound of query range (if we've - * reach it). - */ - dist = get_distance(typcache, &hist_lower[i], upper); - - /* - * Get average fraction of length histogram which covers intervals - * longer than (or equal to) distance to upper bound of query range. - */ - length_hist_frac = - 1.0 - calc_length_hist_frac(length_hist_values, - length_hist_nvalues, - prev_dist, dist, false); - - sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1); - - bin_width = 1.0; - prev_dist = dist; - } - - return sum_frac; -} - -/* - * Estimate join selectivity P(X < Y) using rangebound histograms. - * - * Based on: Diogo Repas, Zhicheng Luo, Maxime Schoemans, Mahmoud Sakr, 2022 - * "Selectivity Estimation of Inequality Joins In Databases" - * https://doi.org/10.48550/arXiv.2206.07396 - * - * hist1 and hist2 are arrays of RangeBound entries from the bounds histograms - * of two range-typed or multirange-typed attributes X and Y, respectively. - * Each array has at least 2 entries (one histogram bin). The entries carry - * full bound metadata (lower/upper flag, inclusive/exclusive), and all - * comparisons use range_cmp_bounds() so that bound semantics are preserved. 
- * - * The algorithm models each attribute's distribution as a piecewise function - * derived from its histogram, then computes: - * P(X < Y) = 0.5 * sum( (F_X(prev) + F_X(cur)) * (F_Y(cur) - F_Y(prev)) ) - * by parallel-scanning both histograms. - * - * The initial fast-forward loops skip histogram entries that fall entirely - * before the other histogram's range, so the main loop only processes the - * overlapping region. Bounds checks are required because the histograms may - * be completely disjoint (e.g., all of X is below all of Y). - */ -static double -calc_hist_join_selectivity(TypeCacheEntry *typcache, - const RangeBound *hist1, int nhist1, - const RangeBound *hist2, int nhist2) -{ - int i, - j; - double selectivity = 0.0; - double prev_sel1 = -1.0; /* negative sentinel skips first iter */ - double prev_sel2 = 0.0; - - Assert(nhist1 > 1); - Assert(nhist2 > 1); - - /* - * Fast-forward past hist1 entries that are entirely below hist2[0], and - * vice versa. Bounds checks prevent out-of-bounds access when the - * histograms are fully disjoint. - */ - for (i = 0; i < nhist1 && - range_cmp_bounds(typcache, &hist1[i], &hist2[0]) < 0; i++) - ; - for (j = 0; j < nhist2 && - range_cmp_bounds(typcache, &hist2[j], &hist1[0]) < 0; j++) - ; - - /* - * Handle fully-separated histograms. When all bounds in hist1 are below - * all bounds in hist2, P(X < Y) is ~1.0. When all of hist2 is below - * hist1, P(X < Y) is ~0.0. We return immediately rather than falling - * into the overlap walk with invalid indices. 
- */ - if (i >= nhist1) - return 1.0; - if (j >= nhist2) - return 0.0; - - /* Walk the overlapping region of both histograms */ - while (i < nhist1 && j < nhist2) - { - double cur_sel1, - cur_sel2; - RangeBound cur_sync; - int cmp; - - cmp = range_cmp_bounds(typcache, &hist1[i], &hist2[j]); - if (cmp < 0) - cur_sync = hist1[i++]; - else if (cmp > 0) - cur_sync = hist2[j++]; - else - { - /* Equal bounds: advance both */ - cur_sync = hist1[i]; - i++; - j++; - } - cur_sel1 = calc_hist_selectivity_scalar(typcache, &cur_sync, - hist1, nhist1, false); - cur_sel2 = calc_hist_selectivity_scalar(typcache, &cur_sync, - hist2, nhist2, false); - - /* Skip the first iteration (no previous point yet) */ - if (prev_sel1 >= 0) - selectivity += (prev_sel1 + cur_sel1) * (cur_sel2 - prev_sel2); - - prev_sel1 = cur_sel1; - prev_sel2 = cur_sel2; - } - - /* P(X < Y) = 0.5 * Sum(...) */ - selectivity /= 2; - - /* Include remainder of hist2 if hist1 was exhausted first */ - if (j < nhist2) - selectivity += 1 - prev_sel2; - - return selectivity; -} - /* * multirangejoinsel -- join selectivity for multirange operators * diff --git a/src/backend/utils/adt/rangetypes_selfuncs.c b/src/backend/utils/adt/rangetypes_selfuncs.c index cc702f28610..4f4baa7dc1a 100644 --- a/src/backend/utils/adt/rangetypes_selfuncs.c +++ b/src/backend/utils/adt/rangetypes_selfuncs.c @@ -26,6 +26,7 @@ #include "utils/fmgrprotos.h" #include "utils/lsyscache.h" #include "utils/rangetypes.h" +#include "utils/rangetypes_selfuncs.h" #include "utils/selfuncs.h" #include "utils/typcache.h" @@ -35,29 +36,6 @@ static double default_range_selectivity(Oid operator); static double calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata, const RangeType *constval, Oid operator); -static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache, - const RangeBound *constbound, - const RangeBound *hist, int hist_nvalues, - bool equal); -static int rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value, 
- const RangeBound *hist, int hist_length, bool equal); -static float8 get_position(TypeCacheEntry *typcache, const RangeBound *value, - const RangeBound *hist1, const RangeBound *hist2); -static float8 get_len_position(double value, double hist1, double hist2); -static float8 get_distance(TypeCacheEntry *typcache, const RangeBound *bound1, - const RangeBound *bound2); -static int length_hist_bsearch(const Datum *length_hist_values, - int length_hist_nvalues, double value, bool equal); -static double calc_length_hist_frac(const Datum *length_hist_values, - int length_hist_nvalues, double length1, double length2, bool equal); -static double calc_hist_selectivity_contained(TypeCacheEntry *typcache, - const RangeBound *lower, RangeBound *upper, - const RangeBound *hist_lower, int hist_nvalues, - const Datum *length_hist_values, int length_hist_nvalues); -static double calc_hist_selectivity_contains(TypeCacheEntry *typcache, - const RangeBound *lower, const RangeBound *upper, - const RangeBound *hist_lower, int hist_nvalues, - const Datum *length_hist_values, int length_hist_nvalues); /* * Returns a default selectivity estimate for given operator, when we don't @@ -592,7 +570,7 @@ calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata, * Look up the fraction of values less than (or equal, if 'equal' argument * is true) a given const in a histogram of range bounds. */ -static double +double calc_hist_selectivity_scalar(TypeCacheEntry *typcache, const RangeBound *constbound, const RangeBound *hist, int hist_nvalues, bool equal) { @@ -624,7 +602,7 @@ calc_hist_selectivity_scalar(TypeCacheEntry *typcache, const RangeBound *constbo * goal of this function is to find a histogram bin where to stop * interpolation of portion of bounds which are less than or equal to given bound. 
*/ -static int +int rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value, const RangeBound *hist, int hist_length, bool equal) { @@ -653,7 +631,7 @@ rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value, const RangeBou * all lengths in the histogram are greater than (greater than or equal) the * given length, returns -1. */ -static int +int length_hist_bsearch(const Datum *length_hist_values, int length_hist_nvalues, double value, bool equal) { @@ -679,7 +657,7 @@ length_hist_bsearch(const Datum *length_hist_values, int length_hist_nvalues, /* * Get relative position of value in histogram bin in [0,1] range. */ -static float8 +float8 get_position(TypeCacheEntry *typcache, const RangeBound *value, const RangeBound *hist1, const RangeBound *hist2) { @@ -758,7 +736,7 @@ get_position(TypeCacheEntry *typcache, const RangeBound *value, const RangeBound /* * Get relative position of value in a length histogram bin in [0,1] range. */ -static double +double get_len_position(double value, double hist1, double hist2) { if (!isinf(hist1) && !isinf(hist2)) @@ -803,7 +781,7 @@ get_len_position(double value, double hist1, double hist2) /* * Measure distance between two range bounds. */ -static float8 +float8 get_distance(TypeCacheEntry *typcache, const RangeBound *bound1, const RangeBound *bound2) { bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid); @@ -851,7 +829,7 @@ get_distance(TypeCacheEntry *typcache, const RangeBound *bound1, const RangeBoun * where P(x) is the fraction of tuples with length < x (or length <= x if * 'equal' is true). */ -static double +double calc_length_hist_frac(const Datum *length_hist_values, int length_hist_nvalues, double length1, double length2, bool equal) { @@ -1014,7 +992,7 @@ calc_length_hist_frac(const Datum *length_hist_values, int length_hist_nvalues, * The caller has already checked that constant lower and upper bounds are * finite. 
*/ -static double +double calc_hist_selectivity_contained(TypeCacheEntry *typcache, const RangeBound *lower, RangeBound *upper, const RangeBound *hist_lower, int hist_nvalues, @@ -1135,7 +1113,7 @@ calc_hist_selectivity_contained(TypeCacheEntry *typcache, * the histograms of range lower bounds and range lengths, on the assumption * that the range lengths are independent of the lower bounds. */ -static double +double calc_hist_selectivity_contains(TypeCacheEntry *typcache, const RangeBound *lower, const RangeBound *upper, const RangeBound *hist_lower, int hist_nvalues, @@ -1230,7 +1208,7 @@ calc_hist_selectivity_contains(TypeCacheEntry *typcache, * https://doi.org/10.48550/arXiv.2206.07396 * * hist1 and hist2 are arrays of RangeBound entries from the bounds histograms - * of two range-typed attributes X and Y, respectively. Each array has at + * of two range- or multirange-typed attributes X and Y, respectively. Each array has at * least 2 entries (one histogram bin). The entries carry full bound metadata * (lower/upper flag, inclusive/exclusive), and all comparisons use * range_cmp_bounds() so that bound semantics are preserved. @@ -1245,7 +1223,7 @@ calc_hist_selectivity_contains(TypeCacheEntry *typcache, * overlapping region. Bounds checks are required because the histograms may * be completely disjoint (e.g., all of X is below all of Y). */ -static double +double calc_hist_join_selectivity(TypeCacheEntry *typcache, const RangeBound *hist1, int nhist1, const RangeBound *hist2, int nhist2) diff --git a/src/include/utils/rangetypes_selfuncs.h b/src/include/utils/rangetypes_selfuncs.h new file mode 100644 index 00000000000..be6bda9ab11 --- /dev/null +++ b/src/include/utils/rangetypes_selfuncs.h @@ -0,0 +1,54 @@ +/*------------------------------------------------------------------------- + * + * rangetypes_selfuncs.h + * Shared helper functions for range and multirange selectivity estimation. 
+ * + * These functions are defined in rangetypes_selfuncs.c and used by both + * rangetypes_selfuncs.c and multirangetypes_selfuncs.c. + * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/utils/rangetypes_selfuncs.h + * + *------------------------------------------------------------------------- + */ +#ifndef RANGETYPES_SELFUNCS_H +#define RANGETYPES_SELFUNCS_H + +#include "utils/rangetypes.h" + +extern double calc_hist_selectivity_scalar(TypeCacheEntry *typcache, + const RangeBound *constbound, + const RangeBound *hist, int hist_nvalues, + bool equal); +extern int rbound_bsearch(TypeCacheEntry *typcache, + const RangeBound *value, const RangeBound *hist, + int hist_length, bool equal); +extern int length_hist_bsearch(const Datum *length_hist_values, + int length_hist_nvalues, + double value, bool equal); +extern float8 get_position(TypeCacheEntry *typcache, + const RangeBound *value, + const RangeBound *hist1, const RangeBound *hist2); +extern double get_len_position(double value, double hist1, double hist2); +extern float8 get_distance(TypeCacheEntry *typcache, + const RangeBound *bound1, const RangeBound *bound2); +extern double calc_length_hist_frac(const Datum *length_hist_values, + int length_hist_nvalues, + double length1, double length2, bool equal); +extern double calc_hist_selectivity_contained(TypeCacheEntry *typcache, + const RangeBound *lower, RangeBound *upper, + const RangeBound *hist_lower, int hist_nvalues, + const Datum *length_hist_values, + int length_hist_nvalues); +extern double calc_hist_selectivity_contains(TypeCacheEntry *typcache, + const RangeBound *lower, const RangeBound *upper, + const RangeBound *hist_lower, int hist_nvalues, + const Datum *length_hist_values, + int length_hist_nvalues); +extern double calc_hist_join_selectivity(TypeCacheEntry *typcache, + const RangeBound *hist1, int nhist1, + const RangeBound 
*hist2, int nhist2); + +#endif /* RANGETYPES_SELFUNCS_H */ -- 2.50.1 (Apple Git-155)