From e8377401efd1af0d6489fc12eaba5bfd0d396b37 Mon Sep 17 00:00:00 2001 From: Alexandre Felipe Date: Thu, 5 Feb 2026 10:36:51 +0000 Subject: [PATCH 4/4] [MERGE-SCAN] Multi column Handle equality or SAOP constraints on multiple leading columns. Imposes the correct order on the entire prefix (ASC|DESC) NULLS (FIRST|LAST) Supports backward scans. Adds enable_indexmergescan parameter --- src/backend/access/nbtree/nbtmergescan.c | 341 ++++++++++++---------- src/backend/access/nbtree/nbtree.c | 215 ++++++++++---- src/backend/commands/explain.c | 94 ++++-- src/backend/optimizer/path/costsize.c | 1 + src/backend/optimizer/path/indxpath.c | 183 +++++++----- src/backend/optimizer/plan/createplan.c | 84 +++++- src/backend/optimizer/util/pathnode.c | 1 + src/backend/utils/misc/guc_parameters.dat | 7 + src/include/access/nbtree.h | 36 +-- src/include/nodes/pathnodes.h | 1 + src/include/nodes/plannodes.h | 4 + src/include/optimizer/cost.h | 1 + src/test/regress/expected/btree_merge.out | 278 +++++++++++++++++- src/test/regress/sql/btree_merge.sql | 159 +++++++++- 14 files changed, 1058 insertions(+), 347 deletions(-) diff --git a/src/backend/access/nbtree/nbtmergescan.c b/src/backend/access/nbtree/nbtmergescan.c index eda1e683525..0f1444b49b6 100644 --- a/src/backend/access/nbtree/nbtmergescan.c +++ b/src/backend/access/nbtree/nbtmergescan.c @@ -23,6 +23,7 @@ */ #include "postgres.h" +#include "access/genam.h" #include "access/nbtree.h" #include "access/relscan.h" #include "lib/pairingheap.h" @@ -40,27 +41,43 @@ static int bt_merge_heap_cmp(const pairingheap_node *a, void *arg); static bool bt_merge_cursor_init(BTMergeScanState *state, IndexScanDesc scan, - BTMergeCursor *cursor, - Datum prefix_value, - bool prefix_isnull); + BTMergeCursor *cursor); static bool bt_merge_cursor_advance(BTMergeScanState *state, IndexScanDesc scan, BTMergeCursor *cursor); -static Datum bt_merge_extract_sortkey(BTMergeScanState *state, - IndexScanDesc scan, - BTMergeCursor *cursor, - bool *isnull); 
+static IndexTuple bt_merge_get_index_tuple(BTMergeCursor *cursor); +/* + * bt_merge_get_index_tuple + * Get the current index tuple from a cursor. + * + * Returns the IndexTuple pointer from cursor->tuples, or NULL if exhausted. + */ +static IndexTuple +bt_merge_get_index_tuple(BTMergeCursor *cursor) +{ + BTScanPosItem *currItem; + + if (cursor->exhausted || cursor->tuples == NULL) + return NULL; + + currItem = &cursor->pos.items[cursor->pos.itemIndex]; + return (IndexTuple) (cursor->tuples + currItem->tupleOffset); +} + /* * bt_merge_heap_cmp - * Compare two cursors by their current sort key (suffix value). + * Compare two cursors by their current sort key (all suffix columns). * - * When sort keys are equal, uses prefix value as tiebreaker for - * deterministic ordering (ORDER BY suffix, prefix). + * Compares all suffix columns in order. When all suffix columns are equal, + * uses cursor_id as tiebreaker for deterministic ordering (preserves + * original prefix array order). * - * Returns positive if a > b (pairingheap is a max-heap, we want min-heap - * behavior so we invert the comparison). 
+ * returns + * -1 if a comes before b + * 1 if b comes before a + * 0 if a and b are equal */ static int bt_merge_heap_cmp(const pairingheap_node *a, @@ -72,41 +89,65 @@ bt_merge_heap_cmp(const pairingheap_node *a, (pairingheap_node *) a); BTMergeCursor *cursor_b = pairingheap_container(BTMergeCursor, ph_node, (pairingheap_node *) b); - Datum key_a = cursor_a->sort_key; - Datum key_b = cursor_b->sort_key; - bool null_a = cursor_a->sort_key_isnull; - bool null_b = cursor_b->sort_key_isnull; - int32 cmp; - - /* Handle NULLs - NULLs sort last (NULLS LAST default for ASC) */ - if (null_a && null_b) - return 0; - if (null_a) - return -1; /* a is NULL, comes after b */ - if (null_b) - return 1; /* b is NULL, comes after a */ - - /* Compare using the suffix column's comparison function */ - cmp = DatumGetInt32(FunctionCall2Coll(&state->suffix_cmp, - state->suffix_collation, - key_a, key_b)); - - /* - * Use prefix value as tiebreaker for deterministic ordering. - * This ensures ORDER BY suffix, prefix behavior. 
- */ - if (cmp == 0) + IndexTuple itup_a; + IndexTuple itup_b; + int32 cmp = 0; + int col; + + /* Get the index tuples from each cursor */ + itup_a = bt_merge_get_index_tuple(cursor_a); + itup_b = bt_merge_get_index_tuple(cursor_b); + + /* Handle exhausted cursors */ + if (itup_a == NULL && itup_b == NULL) + return cursor_b->cursor_id - cursor_a->cursor_id; + if (itup_a == NULL) + return -1; /* a is exhausted, comes after b */ + if (itup_b == NULL) + return 1; /* b is exhausted, comes after a */ + + /* Compare all suffix columns in order */ + for (col = 0; col < state->index_rel->rd_index->indnkeyatts - state->num_prefix_cols && cmp == 0; col++) { - /* Compare prefix values (assumes pass-by-value int4 for now) */ - int32 prefix_a = DatumGetInt32(cursor_a->prefix_value); - int32 prefix_b = DatumGetInt32(cursor_b->prefix_value); - - if (prefix_a < prefix_b) - cmp = -1; - else if (prefix_a > prefix_b) - cmp = 1; + int attno = state->num_prefix_cols + col + 1; + int16 indoption = state->index_rel->rd_indoption[attno - 1]; + bool null_a, + null_b; + Datum key_a, + key_b; + + key_a = index_getattr(itup_a, attno, state->index_tupdesc, &null_a); + key_b = index_getattr(itup_b, attno, state->index_tupdesc, &null_b); + + /* Handle NULLs - return directly with all factors multiplied */ + if (null_a || null_b) + { + if (null_a && null_b) + continue; /* Both NULL, try next column */ + + return (null_a ? -1 : 1) + * ((indoption & INDOPTION_NULLS_FIRST) ? -1 : 1) + * (state->direction == BackwardScanDirection ? 
-1 : 1); + } + + /* Compare using index's comparison function and collation */ + cmp = DatumGetInt32(FunctionCall2Coll(index_getprocinfo(state->index_rel, attno, BTORDER_PROC), + TupleDescAttr(state->index_tupdesc, attno - 1)->attcollation, + key_a, key_b)); + + /* For DESC columns, invert to match physical index order */ + if ((indoption & INDOPTION_DESC)) + cmp = -cmp; } + /* For backward scan, invert the suffix comparison */ + if (state->direction == BackwardScanDirection) + cmp = -cmp; + + /* Use cursor_id as tiebreaker (always ascending for determinism) */ + if (cmp == 0) + cmp = cursor_a->cursor_id - cursor_b->cursor_id; + /* Negate for min-heap behavior */ return -cmp; } @@ -116,24 +157,32 @@ bt_merge_heap_cmp(const pairingheap_node *a, * bt_merge_init * Initialize a merge scan state. * - * Creates the merge state with one cursor per prefix value. + * Creates the merge state with one cursor per prefix combination. * The cursors will be positioned at their first matching tuples * when bt_merge_getnext is first called. + * + * Prefix columns are assumed to be 1..num_prefix_cols. + * Suffix columns are (num_prefix_cols+1)..indnkeyatts. + * Comparison functions are looked up from the index relation. 
*/ BTMergeScanState * bt_merge_init(IndexScanDesc scan, - Datum *prefix_values, - bool *prefix_nulls, - int num_prefixes, - int prefix_attno, - int suffix_attno, - Oid suffix_cmp_oid, - Oid suffix_collation) + Datum **prefix_tuples, + bool **prefix_nulls, + int num_cursors, + int num_prefix_cols) { BTMergeScanState *state; + Relation rel = scan->indexRelation; + TupleDesc tupdesc = RelationGetDescr(rel); MemoryContext merge_context; MemoryContext old_context; int i; + int j; + + /* Check there are suffix columns to order by */ + if (rel->rd_index->indnkeyatts <= num_prefix_cols) + return NULL; /* Create memory context for merge scan allocations */ merge_context = AllocSetContextCreate(CurrentMemoryContext, @@ -144,33 +193,57 @@ bt_merge_init(IndexScanDesc scan, /* Allocate main state structure */ state = palloc0(sizeof(BTMergeScanState)); state->merge_context = merge_context; - state->num_cursors = num_prefixes; + state->num_cursors = num_cursors; state->active_cursors = 0; - state->prefix_attno = prefix_attno; - state->suffix_attno = suffix_attno; - state->suffix_collation = suffix_collation; + state->num_prefix_cols = num_prefix_cols; state->direction = ForwardScanDirection; state->initialized = false; state->tuples_accessed = 0; + state->index_tupdesc = tupdesc; - /* Set up suffix comparison function */ - fmgr_info(suffix_cmp_oid, &state->suffix_cmp); + /* Store reference to index relation (for cmp funcs, collations, indoption) */ + state->index_rel = rel; /* Allocate cursor array */ - state->cursors = palloc0(num_prefixes * sizeof(BTMergeCursor)); + state->cursors = palloc0(num_cursors * sizeof(BTMergeCursor)); /* Initialize cursor metadata (not positioned yet) */ - for (i = 0; i < num_prefixes; i++) + for (i = 0; i < num_cursors; i++) { BTMergeCursor *cursor = &state->cursors[i]; + bool has_null = false; cursor->cursor_id = i; - cursor->prefix_value = datumCopy(prefix_values[i], true, sizeof(Datum)); - cursor->prefix_isnull = prefix_nulls[i]; - 
cursor->exhausted = prefix_nulls[i]; /* NULL prefix = exhausted */ - cursor->sort_key_isnull = true; + + /* Check if any prefix value is NULL */ + for (j = 0; j < num_prefix_cols; j++) + { + if (prefix_nulls[i][j]) + { + has_null = true; + break; + } + } + + /* Skip cursors with NULL prefixes - they would match nothing */ + if (has_null) + { + cursor->prefix_values = NULL; + cursor->exhausted = true; + cursor->tuples = NULL; + BTScanPosInvalidate(cursor->pos); + continue; + } + + /* Copy prefix values for this cursor */ + cursor->prefix_values = palloc(num_prefix_cols * sizeof(Datum)); + for (j = 0; j < num_prefix_cols; j++) + { + cursor->prefix_values[j] = datumCopy(prefix_tuples[i][j], true, sizeof(Datum)); + } + cursor->exhausted = false; BTScanPosInvalidate(cursor->pos); - /* Allocate tuple workspace for index-only scans */ + /* Allocate tuple workspace for suffix key extraction */ cursor->tuples = palloc(BLCKSZ); } @@ -212,9 +285,7 @@ bt_merge_getnext(IndexScanDesc scan, ScanDirection dir) { BTMergeCursor *c = &state->cursors[i]; - if (!c->exhausted && - bt_merge_cursor_init(state, scan, c, - c->prefix_value, c->prefix_isnull)) + if (!c->exhausted && bt_merge_cursor_init(state, scan, c)) { /* Cursor has at least one tuple, add to heap */ pairingheap_add(state->merge_heap, &c->ph_node); @@ -303,33 +374,38 @@ bt_merge_end(BTMergeScanState *state) static bool bt_merge_cursor_init(BTMergeScanState *state, IndexScanDesc scan, - BTMergeCursor *cursor, - Datum prefix_value, - bool prefix_isnull) + BTMergeCursor *cursor) { BTScanOpaque so = (BTScanOpaque) scan->opaque; bool found; - - if (prefix_isnull) - { - cursor->exhausted = true; - return false; - } + bool save_want_itup; + int col; /* - * Modify the scan key to use this cursor's prefix value. - * We reuse the scan's existing key infrastructure. + * Modify the scan keys to use this cursor's prefix values. 
+ * We modify scan->keyData (original keys) because _bt_first calls + * _bt_preprocess_keys which re-processes scan->keyData into so->keyData. + * Prefix columns are 1..num_prefix_cols. */ - for (int i = 0; i < so->numberOfKeys; i++) + for (col = 0; col < state->num_prefix_cols; col++) { - if (so->keyData[i].sk_attno == state->prefix_attno) + int attno = col + 1; /* 1-based attribute number */ + + for (int i = 0; i < scan->numberOfKeys; i++) { - so->keyData[i].sk_argument = prefix_value; - so->keyData[i].sk_flags &= ~(SK_SEARCHARRAY); - break; + if (scan->keyData[i].sk_attno == attno && + scan->keyData[i].sk_strategy == BTEqualStrategyNumber) + { + scan->keyData[i].sk_argument = cursor->prefix_values[col]; + scan->keyData[i].sk_flags &= ~(SK_SEARCHARRAY); + break; + } } } + /* Force key re-preprocessing for this cursor's prefix values */ + so->numberOfKeys = 0; + /* Invalidate current position to force _bt_first */ BTScanPosInvalidate(so->currPos); @@ -342,6 +418,14 @@ bt_merge_cursor_init(BTMergeScanState *state, so->numArrayKeys = 0; so->needPrimScan = false; + /* + * Force tuple data to be copied for suffix key extraction. + * This is needed even for regular (non-index-only) scans because + * the merge comparison function needs access to the suffix column. + */ + save_want_itup = scan->xs_want_itup; + scan->xs_want_itup = true; + /* Position at first matching tuple */ found = _bt_first(scan, state->direction); @@ -351,7 +435,7 @@ bt_merge_cursor_init(BTMergeScanState *state, memcpy(&cursor->pos, &so->currPos, sizeof(BTScanPosData)); /* - * Copy the tuple data for index-only scans. + * Copy the tuple data for suffix key extraction during heap comparison. * The tuple workspace contains copies of index tuples referenced * by items in currPos. 
*/ @@ -360,12 +444,7 @@ bt_merge_cursor_init(BTMergeScanState *state, memcpy(cursor->tuples, so->currTuples, so->currPos.nextTupleOffset); } - /* Extract the sort key for heap ordering */ - cursor->sort_key = bt_merge_extract_sortkey(state, scan, cursor, - &cursor->sort_key_isnull); cursor->exhausted = false; - - /* Count this as a tuple access */ state->tuples_accessed++; /* Invalidate main scan position */ @@ -376,6 +455,9 @@ bt_merge_cursor_init(BTMergeScanState *state, cursor->exhausted = true; } + /* Restore original setting */ + scan->xs_want_itup = save_want_itup; + return found; } @@ -423,28 +505,38 @@ bt_merge_cursor_advance(BTMergeScanState *state, * call _bt_next, then swap back. */ BTScanPosData save_pos; + bool save_want_itup; memcpy(&save_pos, &so->currPos, sizeof(BTScanPosData)); memcpy(&so->currPos, &cursor->pos, sizeof(BTScanPosData)); + /* Force tuple data to be copied for suffix key extraction */ + save_want_itup = scan->xs_want_itup; + scan->xs_want_itup = true; + found = _bt_next(scan, state->direction); if (found) + { memcpy(&cursor->pos, &so->currPos, sizeof(BTScanPosData)); + /* + * Copy the new page's tuple data for suffix key extraction. + */ + if (so->currTuples && so->currPos.nextTupleOffset > 0) + { + memcpy(cursor->tuples, so->currTuples, so->currPos.nextTupleOffset); + } + } + + /* Restore original setting */ + scan->xs_want_itup = save_want_itup; + memcpy(&so->currPos, &save_pos, sizeof(BTScanPosData)); } if (found) { - /* - * Don't count here - the advanced-to tuple will be returned later - * and counted by index_getnext_tid at that time. - */ - - /* Extract new sort key */ - cursor->sort_key = bt_merge_extract_sortkey(state, scan, cursor, - &cursor->sort_key_isnull); state->tuples_accessed++; } else @@ -454,56 +546,3 @@ bt_merge_cursor_advance(BTMergeScanState *state, return found; } - - -/* - * bt_merge_extract_sortkey - * Extract the sort key (suffix column value) from the current tuple. 
- */ -static Datum -bt_merge_extract_sortkey(BTMergeScanState *state, - IndexScanDesc scan, - BTMergeCursor *cursor, - bool *isnull) -{ - Relation rel = scan->indexRelation; - Buffer buf; - Page page; - OffsetNumber offnum; - ItemId itemid; - IndexTuple itup; - TupleDesc tupdesc; - Datum result; - - if (cursor->pos.currPage == InvalidBlockNumber) - { - *isnull = true; - return (Datum) 0; - } - - /* Read the page */ - buf = ReadBuffer(rel, cursor->pos.currPage); - LockBuffer(buf, BT_READ); - page = BufferGetPage(buf); - - offnum = cursor->pos.items[cursor->pos.itemIndex].indexOffset; - itemid = PageGetItemId(page, offnum); - itup = (IndexTuple) PageGetItem(page, itemid); - tupdesc = RelationGetDescr(rel); - - /* Extract the suffix column value */ - result = index_getattr(itup, state->suffix_attno, tupdesc, isnull); - - /* Copy pass-by-reference values before releasing buffer */ - if (!*isnull) - { - Form_pg_attribute attr = TupleDescAttr(tupdesc, state->suffix_attno - 1); - - if (!attr->attbyval) - result = datumCopy(result, attr->attbyval, attr->attlen); - } - - UnlockReleaseBuffer(buf); - - return result; -} diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 0e55c4874b4..ee6b6c6783b 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -226,102 +226,197 @@ btinsert(Relation rel, Datum *values, bool *isnull, return result; } +/* + * PrefixColConstraint - holds constraint info for one prefix column + */ +typedef struct PrefixColConstraint +{ + int attno; /* attribute number (1-based) */ + int num_values; /* number of values (1 for equality, N for IN) */ + Datum *values; /* array of values */ + bool *nulls; /* array of null flags */ +} PrefixColConstraint; + /* * bt_init_merge_scan_from_keys - * Initialize merge scan state from the preprocessed scan keys. + * Initialize merge scan state from scan keys with multi-column support. 
+ * + * Handles multiple prefix columns with equality or IN constraints. + * Expands Cartesian product of all prefix combinations. * * Returns true if merge scan was successfully initialized. - * Returns false if merge scan cannot be used (e.g., no suitable array key). + * Returns false if merge scan cannot be used. */ static bool bt_init_merge_scan_from_keys(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; Relation rel = scan->indexRelation; - TupleDesc itupdesc = RelationGetDescr(rel); - ScanKey arrayKey = NULL; - ArrayType *arr; - Datum *prefix_values; - bool *prefix_nulls; - int num_prefixes; - int prefix_attno; - int suffix_attno; - Oid suffix_cmp_oid; - Oid suffix_collation; - Oid opfamily; - Oid elemtype; - int16 elemlen; - bool elembyval; - char elemalign; + PrefixColConstraint *constraints; + int num_prefix_cols; + int total_cursors; + Datum **prefix_tuples; + bool **prefix_nulls; int i; + int j; + int col; - /* Look for SK_SEARCHARRAY on first column in the raw scan keys */ - for (i = 0; i < scan->numberOfKeys; i++) + /* + * Find prefix columns: all columns with equality/IN constraints before + * the suffix column. For now, assume columns 1..N are prefixes if they + * have equality constraints, and column N+1 is the suffix. 
+ */ + num_prefix_cols = 0; + for (col = 1; col <= rel->rd_index->indnkeyatts; col++) { - ScanKey sk = &scan->keyData[i]; + bool has_equality = false; - if ((sk->sk_flags & SK_SEARCHARRAY) && - sk->sk_attno == 1 && - sk->sk_strategy == BTEqualStrategyNumber) + for (i = 0; i < scan->numberOfKeys; i++) { - arrayKey = sk; - break; + ScanKey sk = &scan->keyData[i]; + + if (sk->sk_attno == col && + sk->sk_strategy == BTEqualStrategyNumber) + { + has_equality = true; + break; + } } + + if (has_equality) + num_prefix_cols++; + else + break; /* First column without equality is suffix */ } - if (arrayKey == NULL) + if (num_prefix_cols == 0) return false; - /* Extract array values from the scan key */ - arr = DatumGetArrayTypeP(arrayKey->sk_argument); - num_prefixes = ArrayGetNItems(ARR_NDIM(arr), ARR_DIMS(arr)); - - if (num_prefixes < 2) - return false; + /* Allocate constraint array */ + constraints = palloc0(num_prefix_cols * sizeof(PrefixColConstraint)); - /* Get array element type info */ - elemtype = ARR_ELEMTYPE(arr); - get_typlenbyvalalign(elemtype, &elemlen, &elembyval, &elemalign); + /* Collect constraints for each prefix column */ + total_cursors = 1; + for (col = 0; col < num_prefix_cols; col++) + { + int attno = col + 1; + PrefixColConstraint *c = &constraints[col]; - /* Deconstruct the array into individual elements */ - deconstruct_array(arr, elemtype, elemlen, elembyval, elemalign, - &prefix_values, &prefix_nulls, &num_prefixes); + c->attno = attno; - /* Attribute numbers (1-based) */ - prefix_attno = 1; - suffix_attno = 2; + /* Look for array or scalar equality on this column */ + for (i = 0; i < scan->numberOfKeys; i++) + { + ScanKey sk = &scan->keyData[i]; - /* Get the opfamily from the index */ - opfamily = rel->rd_opfamily[suffix_attno - 1]; + if (sk->sk_attno == attno && + sk->sk_strategy == BTEqualStrategyNumber) + { + if (sk->sk_flags & SK_SEARCHARRAY) + { + /* IN clause - extract array elements */ + ArrayType *arr = 
DatumGetArrayTypeP(sk->sk_argument); + Oid elemtype = ARR_ELEMTYPE(arr); + int16 elemlen; + bool elembyval; + char elemalign; + + get_typlenbyvalalign(elemtype, &elemlen, &elembyval, &elemalign); + deconstruct_array(arr, elemtype, elemlen, elembyval, elemalign, + &c->values, &c->nulls, &c->num_values); + } + else + { + /* Simple equality - single value */ + c->num_values = 1; + c->values = palloc(sizeof(Datum)); + c->nulls = palloc(sizeof(bool)); + c->values[0] = sk->sk_argument; + c->nulls[0] = (sk->sk_flags & SK_ISNULL) != 0; + } + break; + } + } - /* Get collation from the suffix column */ - suffix_collation = TupleDescAttr(itupdesc, suffix_attno - 1)->attcollation; + if (c->num_values == 0) + { + /* No constraint found - shouldn't happen */ + pfree(constraints); + return false; + } - /* Get the comparison function OID for the suffix column */ - suffix_cmp_oid = get_opfamily_proc(opfamily, - TupleDescAttr(itupdesc, suffix_attno - 1)->atttypid, - TupleDescAttr(itupdesc, suffix_attno - 1)->atttypid, - BTORDER_PROC); + total_cursors *= c->num_values; + } - if (!OidIsValid(suffix_cmp_oid)) + if (total_cursors < 2) { - pfree(prefix_values); - pfree(prefix_nulls); + /* Not enough combinations for merge scan */ + for (col = 0; col < num_prefix_cols; col++) + { + if (constraints[col].values) + pfree(constraints[col].values); + if (constraints[col].nulls) + pfree(constraints[col].nulls); + } + pfree(constraints); return false; } + /* + * Expand Cartesian product of all prefix column values. + * Each cursor gets one combination of prefix values. 
+ */ + prefix_tuples = palloc(total_cursors * sizeof(Datum *)); + prefix_nulls = palloc(total_cursors * sizeof(bool *)); + + for (i = 0; i < total_cursors; i++) + { + int idx = i; + + prefix_tuples[i] = palloc(num_prefix_cols * sizeof(Datum)); + prefix_nulls[i] = palloc(num_prefix_cols * sizeof(bool)); + + /* Compute which value from each column for cursor i */ + for (j = num_prefix_cols - 1; j >= 0; j--) + { + int val_idx = idx % constraints[j].num_values; + + prefix_tuples[i][j] = constraints[j].values[val_idx]; + prefix_nulls[i][j] = constraints[j].nulls[val_idx]; + idx /= constraints[j].num_values; + } + } + + /* + * Prefix tuples are passed to bt_merge_init in their current order. + * The cursor_id assignment preserves this order, which serves as + * tiebreaker when suffix values are equal. Future enhancement: + * allow executor to sort prefixes by arbitrary expressions. + */ + /* Initialize the merge scan state */ so->mergeState = bt_merge_init(scan, - prefix_values, + prefix_tuples, prefix_nulls, - num_prefixes, - prefix_attno, - suffix_attno, - suffix_cmp_oid, - suffix_collation); + total_cursors, + num_prefix_cols); - pfree(prefix_values); + /* Cleanup temporary allocations (bt_merge_init copies what it needs) */ + for (i = 0; i < total_cursors; i++) + { + pfree(prefix_tuples[i]); + pfree(prefix_nulls[i]); + } + pfree(prefix_tuples); pfree(prefix_nulls); + for (col = 0; col < num_prefix_cols; col++) + { + if (constraints[col].values) + pfree(constraints[col].values); + if (constraints[col].nulls) + pfree(constraints[col].nulls); + } + pfree(constraints); return (so->mergeState != NULL); } diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index b7bb111688c..1e2c3d5f9fb 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -87,6 +87,10 @@ static void show_qual(List *qual, const char *qlabel, static void show_scan_qual(List *qual, const char *qlabel, PlanState *planstate, List *ancestors, ExplainState 
*es); +static void show_index_qual_with_prefix(List *suffix_qual, List *prefix_qual, + List *default_qual, + PlanState *planstate, List *ancestors, + ExplainState *es); static void show_upper_qual(List *qual, const char *qlabel, PlanState *planstate, List *ancestors, ExplainState *es); @@ -1961,35 +1965,47 @@ ExplainNode(PlanState *planstate, List *ancestors, switch (nodeTag(plan)) { case T_IndexScan: - show_scan_qual(((IndexScan *) plan)->indexqualorig, - "Index Cond", planstate, ancestors, es); - if (((IndexScan *) plan)->indexqualorig) - show_instrumentation_count("Rows Removed by Index Recheck", 2, - planstate, es); - show_scan_qual(((IndexScan *) plan)->indexorderbyorig, - "Order By", planstate, ancestors, es); - show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); - if (plan->qual) - show_instrumentation_count("Rows Removed by Filter", 1, - planstate, es); - show_indexsearches_info(planstate, es); + { + IndexScan *iscan = (IndexScan *) plan; + + show_index_qual_with_prefix(iscan->indexqualorig, + iscan->indexprefixqual, + iscan->indexqualorig, + planstate, ancestors, es); + if (iscan->indexqualorig) + show_instrumentation_count("Rows Removed by Index Recheck", 2, + planstate, es); + show_scan_qual(iscan->indexorderbyorig, + "Order By", planstate, ancestors, es); + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + show_indexsearches_info(planstate, es); + } break; case T_IndexOnlyScan: - show_scan_qual(((IndexOnlyScan *) plan)->indexqual, - "Index Cond", planstate, ancestors, es); - if (((IndexOnlyScan *) plan)->recheckqual) - show_instrumentation_count("Rows Removed by Index Recheck", 2, - planstate, es); - show_scan_qual(((IndexOnlyScan *) plan)->indexorderby, - "Order By", planstate, ancestors, es); - show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); - if (plan->qual) - show_instrumentation_count("Rows Removed by Filter", 1, 
- planstate, es); - if (es->analyze) - ExplainPropertyFloat("Heap Fetches", NULL, - planstate->instrument->ntuples2, 0, es); - show_indexsearches_info(planstate, es); + { + IndexOnlyScan *ioscan = (IndexOnlyScan *) plan; + + show_index_qual_with_prefix(ioscan->recheckqual, + ioscan->indexprefixqual, + ioscan->indexqual, + planstate, ancestors, es); + if (ioscan->recheckqual) + show_instrumentation_count("Rows Removed by Index Recheck", 2, + planstate, es); + show_scan_qual(ioscan->indexorderby, + "Order By", planstate, ancestors, es); + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + if (es->analyze) + ExplainPropertyFloat("Heap Fetches", NULL, + planstate->instrument->ntuples2, 0, es); + show_indexsearches_info(planstate, es); + } break; case T_BitmapIndexScan: show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig, @@ -2555,6 +2571,30 @@ show_scan_qual(List *qual, const char *qlabel, show_qual(qual, qlabel, planstate, ancestors, useprefix, es); } +/* + * Show index quals with optional prefix separation for merge scans. + * + * For merge scans, shows "Index Cond" (suffix_qual) and "Index Prefixes" + * (prefix_qual) separately. For regular scans, shows default_qual as + * "Index Cond". 
+ */ +static void +show_index_qual_with_prefix(List *suffix_qual, List *prefix_qual, + List *default_qual, + PlanState *planstate, List *ancestors, + ExplainState *es) +{ + if (prefix_qual) + { + show_scan_qual(suffix_qual, "Index Cond", planstate, ancestors, es); + show_scan_qual(prefix_qual, "Index Prefixes", planstate, ancestors, es); + } + else + { + show_scan_qual(default_qual, "Index Cond", planstate, ancestors, es); + } +} + /* * Show a qualifier expression for an upper-level plan node */ diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index c30d6e84672..1567551f9dd 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -145,6 +145,7 @@ int max_parallel_workers_per_gather = 2; bool enable_seqscan = true; bool enable_indexscan = true; bool enable_indexonlyscan = true; +bool enable_indexmergescan = true; bool enable_bitmapscan = true; bool enable_tidscan = true; bool enable_sort = true; diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 44b79f91335..55d635e9524 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -784,44 +784,17 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, } /* - * consider_merge_scan_path - * Check if this index can provide a merge scan path for queries of the form: - * WHERE prefix IN (...) AND suffix >= b ORDER BY suffix, prefix LIMIT N + * count_equality_values + * Count the number of equality values for index clauses on a column. * - * Merge scan allows lazily producing output sorted by (suffix, prefix) from - * an index on (prefix, suffix) by doing a K-way merge of K separate scans. + * Returns 1 for simple equality, N for IN-list with N elements, 0 if none. 
*/ -static void -consider_merge_scan_path(PlannerInfo *root, RelOptInfo *rel, - IndexOptInfo *index, IndexClauseSet *clauses) +static int +count_equality_values(List *indexclauses) { - IndexPath *ipath; - List *index_clauses; - List *index_pathkeys; - List *merge_pathkeys; ListCell *lc; - int num_prefixes = 0; - int indexcol; - bool has_saop_on_first = false; - bool has_clause_on_second = false; - /* Need at least 2 index columns for merge scan */ - if (index->nkeycolumns < 2) - return; - - /* Index must be ordered and support gettuple */ - if (index->sortopfamily == NULL || !index->amhasgettuple) - return; - - /* Must have query pathkeys with at least 2 elements */ - if (root->query_pathkeys == NIL || list_length(root->query_pathkeys) < 2) - return; - - /* - * Check for ScalarArrayOpExpr on first column. - * Count the number of array elements (prefix values). - */ - foreach(lc, clauses->indexclauses[0]) + foreach(lc, indexclauses) { IndexClause *iclause = (IndexClause *) lfirst(lc); RestrictInfo *rinfo = iclause->rinfo; @@ -831,9 +804,6 @@ consider_merge_scan_path(PlannerInfo *root, RelOptInfo *rel, ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) rinfo->clause; Node *arrayarg = (Node *) lsecond(saop->args); - has_saop_on_first = true; - - /* Try to determine the number of array elements */ if (IsA(arrayarg, Const)) { Const *con = (Const *) arrayarg; @@ -841,61 +811,135 @@ consider_merge_scan_path(PlannerInfo *root, RelOptInfo *rel, if (!con->constisnull) { ArrayType *arr = DatumGetArrayTypeP(con->constvalue); - num_prefixes = ArrayGetNItems(ARR_NDIM(arr), ARR_DIMS(arr)); + + return ArrayGetNItems(ARR_NDIM(arr), ARR_DIMS(arr)); } } else { /* Can't determine size, estimate conservatively */ - num_prefixes = 10; + return 10; } - break; + } + else if (IsA(rinfo->clause, OpExpr)) + { + /* Simple equality constraint = 1 value */ + return 1; } } - if (!has_saop_on_first || num_prefixes < 2) + return 0; +} + +/* + * consider_merge_scan_path + * Check if this index can 
provide a merge scan path for queries with + * equality/IN constraints on prefix columns and ORDER BY on suffix. + * + * Supports multiple prefix columns: + * - a = const AND b IN B -> len(B) cursors + * - a IN A AND b IN B -> len(A) * len(B) cursors + * - a IN A AND b = const -> len(A) cursors + * + * Merge scan allows lazily producing output sorted by suffix from an + * index on (prefixes..., suffix) by doing K-way merge of K separate scans. + */ +static void +consider_merge_scan_path(PlannerInfo *root, RelOptInfo *rel, + IndexOptInfo *index, IndexClauseSet *clauses) +{ + IndexPath *ipath; + List *index_clauses; + List *merge_pathkeys; + ListCell *lc; + int num_prefixes; + int suffix_indexcol; + int indexcol; + PathKey *query_first_pk; + ScanDirection scandirection; + + if (!enable_indexmergescan) return; - /* Check if there's any clause on second column */ - if (clauses->indexclauses[1] != NIL) - has_clause_on_second = true; + /* Need at least 2 index columns for merge scan */ + if (index->nkeycolumns < 2) + return; - if (!has_clause_on_second) + /* Index must be ordered and support gettuple */ + if (index->sortopfamily == NULL || !index->amhasgettuple) return; - /* - * Get the natural index pathkeys (prefix, suffix order). - * We need at least 2 pathkeys for merge scan to make sense. - */ - index_pathkeys = build_index_pathkeys(root, index, ForwardScanDirection); - if (list_length(index_pathkeys) < 2) + /* Must have query pathkeys */ + if (root->query_pathkeys == NIL) return; /* - * Check if query pathkeys are (suffix, prefix) - the REVERSED order. - * query_pathkeys[0] should match index_pathkeys[1] (suffix) - * query_pathkeys[1] should match index_pathkeys[0] (prefix) + * Find the suffix column: the index column (not the first) that matches + * the query's first ORDER BY column. We don't use build_index_pathkeys() + * because equality-constrained prefix columns don't produce pathkeys. 
+ * + * Instead, we directly check each index column's expression against the + * query's first pathkey equivalence class. */ + query_first_pk = (PathKey *) linitial(root->query_pathkeys); + suffix_indexcol = -1; + + for (indexcol = 1; indexcol < index->nkeycolumns; indexcol++) { - PathKey *qpk0 = (PathKey *) linitial(root->query_pathkeys); - PathKey *qpk1 = (PathKey *) lsecond(root->query_pathkeys); - PathKey *ipk0 = (PathKey *) linitial(index_pathkeys); - PathKey *ipk1 = (PathKey *) lsecond(index_pathkeys); + TargetEntry *indextle = (TargetEntry *) list_nth(index->indextlist, indexcol); + EquivalenceMember *em; + + /* Check if this index column is in the query's first pathkey EC */ + em = find_ec_member_matching_expr(query_first_pk->pk_eclass, + indextle->expr, + index->rel->relids); + if (em != NULL) + { + suffix_indexcol = indexcol; + break; + } + } - /* Query's first pathkey must match index's SECOND pathkey (suffix) */ - if (qpk0->pk_eclass != ipk1->pk_eclass) - return; + if (suffix_indexcol < 1) + return; /* No suitable suffix column found */ - /* Query's second pathkey must match index's FIRST pathkey (prefix) */ - if (qpk1->pk_eclass != ipk0->pk_eclass) - return; + /* + * Determine scan direction based on query's sort direction and index's + * natural order. If both match, use forward; if opposite, use backward. + */ + { + bool query_is_desc = (query_first_pk->pk_cmptype == COMPARE_GT); + bool index_is_desc = index->reverse_sort[suffix_indexcol]; + + if (query_is_desc == index_is_desc) + scandirection = ForwardScanDirection; + else + scandirection = BackwardScanDirection; } /* - * The merge scan can satisfy the query's ORDER BY (suffix, prefix). - * Use the query's pathkeys directly since we've verified they match. - * This is critical: PostgreSQL compares pathkeys by pointer equality. + * Count prefix combinations: product of equality values for all columns + * before the suffix column. Each column must have equality constraint. 
*/ + num_prefixes = 1; + for (indexcol = 0; indexcol < suffix_indexcol; indexcol++) + { + int col_count = count_equality_values(clauses->indexclauses[indexcol]); + + if (col_count == 0) + return; /* Gap in prefix - can't use merge scan */ + + num_prefixes *= col_count; + } + + if (num_prefixes < 2) + return; /* Need at least 2 cursors for merge scan */ + + /* Must have a clause on the suffix column */ + if (clauses->indexclauses[suffix_indexcol] == NIL) + return; + + /* Use query pathkeys for pointer equality */ merge_pathkeys = root->query_pathkeys; /* @@ -907,19 +951,20 @@ consider_merge_scan_path(PlannerInfo *root, RelOptInfo *rel, foreach(lc, clauses->indexclauses[indexcol]) { IndexClause *iclause = (IndexClause *) lfirst(lc); + index_clauses = lappend(index_clauses, iclause); } } /* - * Create the merge scan path with (suffix, prefix) pathkeys. + * Create the merge scan path with query's pathkeys. */ ipath = create_index_path(root, index, index_clauses, NIL, /* no ORDER BY expressions */ NIL, /* no ORDER BY columns */ merge_pathkeys, - ForwardScanDirection, + scandirection, check_index_only(rel, index), NULL, /* no outer relids */ 1.0, /* loop_count */ @@ -927,11 +972,11 @@ consider_merge_scan_path(PlannerInfo *root, RelOptInfo *rel, /* Enable merge scan with K-way merge */ ipath->num_merge_prefixes = num_prefixes; + ipath->suffix_indexcol = suffix_indexcol; /* * Adjust costs and row estimate for merge scan. * Merge scan reads exactly (limit + K - 1) tuples instead of all matching. - * The row estimate reflects actual tuple accesses, not total matches. 
*/ if (root->limit_tuples > 0 && root->limit_tuples < ipath->path.rows) { diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 485b4b3e54e..7f7d9c26045 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -181,11 +181,11 @@ static SeqScan *make_seqscan(List *qptlist, List *qpqual, Index scanrelid); static SampleScan *make_samplescan(List *qptlist, List *qpqual, Index scanrelid, TableSampleClause *tsc); static IndexScan *make_indexscan(List *qptlist, List *qpqual, Index scanrelid, - Oid indexid, List *indexqual, List *indexqualorig, - List *indexorderby, List *indexorderbyorig, - List *indexorderbyops, - int num_merge_prefixes, - ScanDirection indexscandir); + Oid indexid, List *indexqual, List *indexqualorig, + List *indexorderby, List *indexorderbyorig, + List *indexorderbyops, + int num_merge_prefixes, + ScanDirection indexscandir); static IndexOnlyScan *make_indexonlyscan(List *qptlist, List *qpqual, Index scanrelid, Oid indexid, List *indexqual, List *recheckqual, @@ -193,6 +193,8 @@ static IndexOnlyScan *make_indexonlyscan(List *qptlist, List *qpqual, List *indextlist, int num_merge_prefixes, ScanDirection indexscandir); +static void set_merge_scan_qual_info(Scan *scan_plan, IndexPath *best_path, + List *stripped_indexquals, bool indexonly); static BitmapIndexScan *make_bitmap_indexscan(Index scanrelid, Oid indexid, List *indexqual, List *indexqualorig); @@ -3026,6 +3028,9 @@ create_indexscan_plan(PlannerInfo *root, best_path->num_merge_prefixes, best_path->indexscandir); + /* For merge scan, separate prefix and suffix quals for EXPLAIN */ + set_merge_scan_qual_info(scan_plan, best_path, stripped_indexquals, indexonly); + copy_generic_path_info(&scan_plan->plan, &best_path->path); return scan_plan; @@ -5585,6 +5590,75 @@ make_indexonlyscan(List *qptlist, return node; } +/* + * set_merge_scan_qual_info + * For merge scan, extract prefix quals for EXPLAIN output. 
+ * + * Prefix quals are those on index columns before suffix_indexcol. + * This separates the equality/IN constraints (prefixes) from the + * range constraint (suffix) to make EXPLAIN output clearer. + */ +static void +set_merge_scan_qual_info(Scan *scan_plan, IndexPath *best_path, + List *stripped_indexquals, bool indexonly) +{ + List *prefix_quals = NIL; + List *suffix_quals = NIL; + ListCell *lc; + + /* Only process if this is a merge scan */ + if (best_path->num_merge_prefixes <= 0 || best_path->suffix_indexcol < 0) + return; + + /* + * Partition quals into prefix (columns before suffix) and suffix. + * We match each qual against the IndexClauses to determine which + * index column it references. + */ + foreach(lc, stripped_indexquals) + { + Node *clause = (Node *) lfirst(lc); + bool is_prefix = false; + ListCell *ic; + + foreach(ic, best_path->indexclauses) + { + IndexClause *iclause = (IndexClause *) lfirst(ic); + + if (iclause->indexcol < best_path->suffix_indexcol && + equal(clause, iclause->rinfo->clause)) + { + is_prefix = true; + break; + } + } + + if (is_prefix) + prefix_quals = lappend(prefix_quals, clause); + else + suffix_quals = lappend(suffix_quals, clause); + } + + /* Store the separated quals in the plan node. + * Prefix quals (equality/IN) don't need rechecking since they're exact + * matches, so we only store suffix quals in recheckqual/indexqualorig. 
+ */ + if (indexonly) + { + IndexOnlyScan *ios = (IndexOnlyScan *) scan_plan; + + ios->indexprefixqual = prefix_quals; + ios->recheckqual = suffix_quals; + } + else + { + IndexScan *iscan = (IndexScan *) scan_plan; + + iscan->indexprefixqual = prefix_quals; + iscan->indexqualorig = suffix_quals; + } +} + static BitmapIndexScan * make_bitmap_indexscan(Index scanrelid, Oid indexid, diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 21746cd684c..ed5993cb49d 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1076,6 +1076,7 @@ create_index_path(PlannerInfo *root, pathnode->indexscandir = indexscandir; pathnode->num_merge_prefixes = 0; + pathnode->suffix_indexcol = -1; cost_index(pathnode, root, loop_count, partial_path); diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index f0260e6e412..0678fe5741b 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -877,6 +877,13 @@ boot_val => 'true', }, +{ name => 'enable_indexmergescan', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', + short_desc => 'Enables the planner\'s use of index merge-scan plans.', + flags => 'GUC_EXPLAIN', + variable => 'enable_indexmergescan', + boot_val => 'true', +}, + { name => 'enable_indexonlyscan', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD', short_desc => 'Enables the planner\'s use of index-only-scan plans.', flags => 'GUC_EXPLAIN', diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 0d4e7440760..0dff24ac151 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1052,20 +1052,20 @@ typedef struct BTArrayKeyInfo } BTArrayKeyInfo; /* - * BTMergeCursor - tracks scan state for one prefix value in merge scan + * BTMergeCursor - tracks scan state for one prefix in merge scan * * Each cursor maintains its 
own position within the index for a specific - prefix value. Cursors are organized in a min-heap ordered by their - current suffix key value for efficient K-way merge. + combination of prefix values. Cursors are organized in a min-heap ordered + by their current suffix key value for efficient K-way merge. + * + * Note: cursors with any NULL prefix are marked exhausted (they would match nothing). + * The suffix key is extracted on-demand from the tuple data during comparison. */ typedef struct BTMergeCursor { pairingheap_node ph_node; /* pairing heap node for merge */ int cursor_id; /* index in merge state's cursors array */ - Datum prefix_value; /* the prefix value for this sub-scan */ - bool prefix_isnull; /* is prefix value NULL? */ - Datum sort_key; /* current tuple's sort key (suffix) */ - bool sort_key_isnull;/* is sort key NULL? */ + Datum *prefix_values; /* array of prefix values for this sub-scan */ bool exhausted; /* no more tuples for this prefix */ BTScanPosData pos; /* current position in index */ char *tuples; /* tuple storage workspace (BLCKSZ) */ @@ -1080,18 +1080,17 @@ typedef struct BTMergeCursor */ typedef struct BTMergeScanState { - int num_cursors; /* number of prefix values (K) */ + int num_cursors; /* number of prefix combinations (K) */ int active_cursors; /* cursors not yet exhausted */ BTMergeCursor *cursors; /* array of cursors */ - pairingheap *merge_heap; /* min-heap ordered by sort_key */ - int prefix_attno; /* attribute number of prefix column (1-based) */ - int suffix_attno; /* attribute number of suffix column (1-based) */ - FmgrInfo suffix_cmp; /* comparison function for suffix */ - Oid suffix_collation; /* collation for suffix comparison */ + pairingheap *merge_heap; /* min-heap ordered by suffix key */ + int num_prefix_cols;/* number of prefix columns (attno 1..N) */ ScanDirection direction; /* scan direction */ bool initialized; /* have cursors been initialized? 
*/ MemoryContext merge_context;/* memory context for allocations */ int64 tuples_accessed;/* count of index tuples accessed */ + Relation index_rel; /* index relation (for cmp funcs, indoption) */ + TupleDesc index_tupdesc; /* index tuple descriptor for suffix extraction */ } BTMergeScanState; typedef struct BTScanOpaqueData @@ -1388,13 +1387,10 @@ extern void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc); * prototypes for functions in nbtmergescan.c */ extern BTMergeScanState *bt_merge_init(IndexScanDesc scan, - Datum *prefix_values, - bool *prefix_nulls, - int num_prefixes, - int prefix_attno, - int suffix_attno, - Oid suffix_cmp_oid, - Oid suffix_collation); + Datum **prefix_tuples, + bool **prefix_nulls, + int num_cursors, + int num_prefix_cols); extern bool bt_merge_getnext(IndexScanDesc scan, ScanDirection dir); extern void bt_merge_end(BTMergeScanState *state); diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index ced7e224a87..d7a40995213 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -2041,6 +2041,7 @@ typedef struct IndexPath Cost indextotalcost; Selectivity indexselectivity; int num_merge_prefixes; + int suffix_indexcol; } IndexPath; /* diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 86d8c92e01f..1725542744f 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -599,6 +599,8 @@ typedef struct IndexScan ScanDirection indexorderdir; /* Merge scan: K-way merge */ int num_merge_prefixes; + /* Merge scan: constraints on prefix columns for EXPLAIN */ + List *indexprefixqual; } IndexScan; /* ---------------- @@ -649,6 +651,8 @@ typedef struct IndexOnlyScan ScanDirection indexorderdir; /* Merge scan: K-way merge */ int num_merge_prefixes; + /* Merge scan: prefix quals (equality/IN on prefix columns) for EXPLAIN */ + List *indexprefixqual; } IndexOnlyScan; /* ---------------- diff --git a/src/include/optimizer/cost.h 
b/src/include/optimizer/cost.h index f2fd5d31507..a32cac4d0c7 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -52,6 +52,7 @@ extern PGDLLIMPORT int max_parallel_workers_per_gather; extern PGDLLIMPORT bool enable_seqscan; extern PGDLLIMPORT bool enable_indexscan; extern PGDLLIMPORT bool enable_indexonlyscan; +extern PGDLLIMPORT bool enable_indexmergescan; extern PGDLLIMPORT bool enable_bitmapscan; extern PGDLLIMPORT bool enable_tidscan; extern PGDLLIMPORT bool enable_sort; diff --git a/src/test/regress/expected/btree_merge.out b/src/test/regress/expected/btree_merge.out index 28509b331d7..a1e69e894ab 100644 --- a/src/test/regress/expected/btree_merge.out +++ b/src/test/regress/expected/btree_merge.out @@ -82,26 +82,27 @@ SHOW track_counts; -- should be 'on' on (1 row) --- Verify merge scan is used: no Sort node, rows=10 (N + K - 1 = 3 + 8 - 1) +-- Verify merge scan is used: no Sort node when ORDER BY suffix only +-- K = 8 prefixes, LIMIT 3 -> reads at most 3 + 8 - 1 = 10 tuples EXPLAIN (COSTS OFF) SELECT x, y FROM btree_merge_test WHERE x IN (1,2,5,8,13,21,34,55) AND y >= 19 -ORDER BY y, x +ORDER BY y LIMIT 3; - QUERY PLAN ------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------ Limit -> Index Only Scan using btree_merge_test_idx on btree_merge_test - Index Cond: ((x = ANY ('{1,2,5,8,13,21,34,55}'::integer[])) AND (y >= 19)) -(3 rows) + Index Cond: (y >= 19) + Index Prefixes: (x = ANY ('{1,2,5,8,13,21,34,55}'::integer[])) +(4 rows) --- From the limited query proposition this can be computed with 10 --- tupple accesses. 
+-- Verify the query produces correct results (sorted by y) SELECT x, y FROM btree_merge_test WHERE x IN (1,2,5,8,13,21,34,55) AND y >= 19 -ORDER BY y, x -- sort x to make result unique +ORDER BY y LIMIT 3; x | y ---+---- @@ -125,3 +126,262 @@ WHERE indexrelname = 'btree_merge_test_idx'; (1 row) DROP TABLE btree_merge_test; +-- ============================================ +-- Multi-column prefix tests +-- ============================================ +-- Create a 3-column table for multi-prefix testing +CREATE TABLE btree_merge_multi AS ( + SELECT a, b, c FROM + generate_series(1, 10) AS a, + generate_series(1, 10) AS b, + generate_series(1, 20) AS c + ORDER BY random() +); +CREATE INDEX btree_merge_multi_idx ON btree_merge_multi USING btree (a, b, c); +ANALYSE btree_merge_multi; +-- Test 1: a = const AND b IN B -> 3 cursors (just the IN list) +-- Merge scan triggered, no Sort node when ORDER BY suffix only +EXPLAIN (COSTS OFF) +SELECT a, b, c +FROM btree_merge_multi +WHERE a = 1 AND b IN (1, 2, 3) AND c >= 5 +ORDER BY c +LIMIT 3; + QUERY PLAN +------------------------------------------------------------------------ + Limit + -> Index Only Scan using btree_merge_multi_idx on btree_merge_multi + Index Cond: (c >= 5) + Index Prefixes: ((a = 1) AND (b = ANY ('{1,2,3}'::integer[]))) +(4 rows) + +SELECT a, b, c +FROM btree_merge_multi +WHERE a = 1 AND b IN (1, 2, 3) AND c >= 5 +ORDER BY c +LIMIT 3; + a | b | c +---+---+--- + 1 | 1 | 5 + 1 | 2 | 5 + 1 | 3 | 5 +(3 rows) + +-- Test 2: a IN A AND b IN B -> len(A) * len(B) cursors (Cartesian product) +-- With a IN (1,2), b IN (1,2,3), ORDER BY c LIMIT 4 +-- Should use 6 cursors (2*3), no Sort node needed +EXPLAIN (COSTS OFF) +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2) AND b IN (1, 2, 3) AND c >= 10 +ORDER BY c +LIMIT 4; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Limit + -> Index Only Scan using btree_merge_multi_idx on btree_merge_multi + 
Index Cond: (c >= 10) + Index Prefixes: ((a = ANY ('{1,2}'::integer[])) AND (b = ANY ('{1,2,3}'::integer[]))) +(4 rows) + +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2) AND b IN (1, 2, 3) AND c >= 10 +ORDER BY c +LIMIT 4; + a | b | c +---+---+---- + 1 | 1 | 10 + 1 | 2 | 10 + 1 | 3 | 10 + 2 | 1 | 10 +(4 rows) + +-- Test 3: a IN A AND b = const -> len(A) cursors +-- With a IN (1,2,3,4), b=5, ORDER BY c LIMIT 2 +-- Should use 4 cursors, no Sort node needed +EXPLAIN (COSTS OFF) +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3, 4) AND b = 5 AND c >= 8 +ORDER BY c +LIMIT 2; + QUERY PLAN +-------------------------------------------------------------------------- + Limit + -> Index Only Scan using btree_merge_multi_idx on btree_merge_multi + Index Cond: (c >= 8) + Index Prefixes: ((a = ANY ('{1,2,3,4}'::integer[])) AND (b = 5)) +(4 rows) + +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3, 4) AND b = 5 AND c >= 8 +ORDER BY c +LIMIT 2; + a | b | c +---+---+--- + 1 | 5 | 8 + 2 | 5 | 8 +(2 rows) + +-- Test 4: Backward scan direction (ORDER BY DESC) +-- With a IN (1,2,3), b IN (1,2), ORDER BY c DESC LIMIT 3 +-- Should use 6 cursors (3*2), no Sort node needed +EXPLAIN (COSTS OFF) +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b IN (1, 2) AND c <= 15 +ORDER BY c DESC +LIMIT 3; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Limit + -> Index Only Scan Backward using btree_merge_multi_idx on btree_merge_multi + Index Cond: (c <= 15) + Index Prefixes: ((a = ANY ('{1,2,3}'::integer[])) AND (b = ANY ('{1,2}'::integer[]))) +(4 rows) + +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b IN (1, 2) AND c <= 15 +ORDER BY c DESC +LIMIT 3; + a | b | c +---+---+---- + 1 | 1 | 15 + 1 | 2 | 15 + 2 | 1 | 15 +(3 rows) + +-- ================================================================= +-- Multi-column suffix tests +-- Index is on (a, b, c), testing with prefix 
on 'a' only +-- ================================================================= +-- Test 5: ORDER BY b (single column suffix) +-- With a IN (1,2,3), ORDER BY b LIMIT 6 +-- Prefix: a, Suffix: b +EXPLAIN (COSTS OFF) +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b >= 1 +ORDER BY b +LIMIT 6; + QUERY PLAN +------------------------------------------------------------------------ + Limit + -> Index Only Scan using btree_merge_multi_idx on btree_merge_multi + Index Cond: (b >= 1) + Index Prefixes: (a = ANY ('{1,2,3}'::integer[])) +(4 rows) + +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b >= 1 +ORDER BY b +LIMIT 6; + a | b | c +---+---+--- + 1 | 1 | 1 + 2 | 1 | 1 + 3 | 1 | 1 + 1 | 1 | 2 + 2 | 1 | 2 + 3 | 1 | 2 +(6 rows) + +-- Test 6: ORDER BY b DESC (single column suffix, backward) +-- With a IN (1,2,3), ORDER BY b DESC LIMIT 6 +EXPLAIN (COSTS OFF) +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b <= 10 +ORDER BY b DESC +LIMIT 6; + QUERY PLAN +--------------------------------------------------------------------------------- + Limit + -> Index Only Scan Backward using btree_merge_multi_idx on btree_merge_multi + Index Cond: (b <= 10) + Index Prefixes: (a = ANY ('{1,2,3}'::integer[])) +(4 rows) + +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b <= 10 +ORDER BY b DESC +LIMIT 6; + a | b | c +---+----+---- + 1 | 10 | 20 + 2 | 10 | 20 + 3 | 10 | 20 + 1 | 10 | 19 + 2 | 10 | 19 + 3 | 10 | 19 +(6 rows) + +-- Test 7: ORDER BY b, c (multi-column suffix) +-- With a IN (1,2,3), ORDER BY b, c LIMIT 6 +-- Prefix: a, Suffix: (b, c) +EXPLAIN (COSTS OFF) +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b >= 1 +ORDER BY b, c +LIMIT 6; + QUERY PLAN +------------------------------------------------------------------------ + Limit + -> Index Only Scan using btree_merge_multi_idx on btree_merge_multi + Index Cond: (b >= 1) + Index Prefixes: (a = ANY ('{1,2,3}'::integer[])) +(4 rows) + +SELECT a, b, c 
+FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b >= 1 +ORDER BY b, c +LIMIT 6; + a | b | c +---+---+--- + 1 | 1 | 1 + 2 | 1 | 1 + 3 | 1 | 1 + 1 | 1 | 2 + 2 | 1 | 2 + 3 | 1 | 2 +(6 rows) + +-- Test 8: ORDER BY b DESC, c DESC (multi-column suffix, backward) +-- With a IN (1,2,3), ORDER BY b DESC, c DESC LIMIT 6 +EXPLAIN (COSTS OFF) +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b <= 10 +ORDER BY b DESC, c DESC +LIMIT 6; + QUERY PLAN +--------------------------------------------------------------------------------- + Limit + -> Index Only Scan Backward using btree_merge_multi_idx on btree_merge_multi + Index Cond: (b <= 10) + Index Prefixes: (a = ANY ('{1,2,3}'::integer[])) +(4 rows) + +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b <= 10 +ORDER BY b DESC, c DESC +LIMIT 6; + a | b | c +---+----+---- + 1 | 10 | 20 + 2 | 10 | 20 + 3 | 10 | 20 + 1 | 10 | 19 + 2 | 10 | 19 + 3 | 10 | 19 +(6 rows) + +DROP TABLE btree_merge_multi; diff --git a/src/test/regress/sql/btree_merge.sql b/src/test/regress/sql/btree_merge.sql index ad9cf03f869..792159b0c17 100644 --- a/src/test/regress/sql/btree_merge.sql +++ b/src/test/regress/sql/btree_merge.sql @@ -82,20 +82,20 @@ SET enable_seqscan = OFF; SET enable_bitmapscan = OFF; SHOW track_counts; -- should be 'on' --- Verify merge scan is used: no Sort node, rows=10 (N + K - 1 = 3 + 8 - 1) +-- Verify merge scan is used: no Sort node when ORDER BY suffix only +-- K = 8 prefixes, LIMIT 3 -> reads at most 3 + 8 - 1 = 10 tuples EXPLAIN (COSTS OFF) SELECT x, y FROM btree_merge_test WHERE x IN (1,2,5,8,13,21,34,55) AND y >= 19 -ORDER BY y, x +ORDER BY y LIMIT 3; --- From the limited query proposition this can be computed with 10 --- tupple accesses. 
+-- Verify the query produces correct results (sorted by y) SELECT x, y FROM btree_merge_test WHERE x IN (1,2,5,8,13,21,34,55) AND y >= 19 -ORDER BY y, x -- sort x to make result unique +ORDER BY y LIMIT 3; @@ -106,4 +106,151 @@ SELECT idx_scan, idx_tup_read, idx_tup_fetch FROM pg_stat_user_indexes WHERE indexrelname = 'btree_merge_test_idx'; -DROP TABLE btree_merge_test; \ No newline at end of file +DROP TABLE btree_merge_test; + +-- ============================================ +-- Multi-column prefix tests +-- ============================================ + +-- Create a 3-column table for multi-prefix testing +CREATE TABLE btree_merge_multi AS ( + SELECT a, b, c FROM + generate_series(1, 10) AS a, + generate_series(1, 10) AS b, + generate_series(1, 20) AS c + ORDER BY random() +); +CREATE INDEX btree_merge_multi_idx ON btree_merge_multi USING btree (a, b, c); +ANALYSE btree_merge_multi; + +-- Test 1: a = const AND b IN B -> 3 cursors (just the IN list) +-- Merge scan triggered, no Sort node when ORDER BY suffix only +EXPLAIN (COSTS OFF) +SELECT a, b, c +FROM btree_merge_multi +WHERE a = 1 AND b IN (1, 2, 3) AND c >= 5 +ORDER BY c +LIMIT 3; + +SELECT a, b, c +FROM btree_merge_multi +WHERE a = 1 AND b IN (1, 2, 3) AND c >= 5 +ORDER BY c +LIMIT 3; + +-- Test 2: a IN A AND b IN B -> len(A) * len(B) cursors (Cartesian product) +-- With a IN (1,2), b IN (1,2,3), ORDER BY c LIMIT 4 +-- Should use 6 cursors (2*3), no Sort node needed +EXPLAIN (COSTS OFF) +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2) AND b IN (1, 2, 3) AND c >= 10 +ORDER BY c +LIMIT 4; + +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2) AND b IN (1, 2, 3) AND c >= 10 +ORDER BY c +LIMIT 4; + +-- Test 3: a IN A AND b = const -> len(A) cursors +-- With a IN (1,2,3,4), b=5, ORDER BY c LIMIT 2 +-- Should use 4 cursors, no Sort node needed +EXPLAIN (COSTS OFF) +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3, 4) AND b = 5 AND c >= 8 +ORDER BY c +LIMIT 2; + +SELECT a, b, c +FROM 
btree_merge_multi +WHERE a IN (1, 2, 3, 4) AND b = 5 AND c >= 8 +ORDER BY c +LIMIT 2; + +-- Test 4: Backward scan direction (ORDER BY DESC) +-- With a IN (1,2,3), b IN (1,2), ORDER BY c DESC LIMIT 3 +-- Should use 6 cursors (3*2), no Sort node needed +EXPLAIN (COSTS OFF) +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b IN (1, 2) AND c <= 15 +ORDER BY c DESC +LIMIT 3; + +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b IN (1, 2) AND c <= 15 +ORDER BY c DESC +LIMIT 3; + +-- ================================================================= +-- Multi-column suffix tests +-- Index is on (a, b, c), testing with prefix on 'a' only +-- ================================================================= + +-- Test 5: ORDER BY b (single column suffix) +-- With a IN (1,2,3), ORDER BY b LIMIT 6 +-- Prefix: a, Suffix: b +EXPLAIN (COSTS OFF) +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b >= 1 +ORDER BY b +LIMIT 6; + +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b >= 1 +ORDER BY b +LIMIT 6; + +-- Test 6: ORDER BY b DESC (single column suffix, backward) +-- With a IN (1,2,3), ORDER BY b DESC LIMIT 6 +EXPLAIN (COSTS OFF) +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b <= 10 +ORDER BY b DESC +LIMIT 6; + +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b <= 10 +ORDER BY b DESC +LIMIT 6; + +-- Test 7: ORDER BY b, c (multi-column suffix) +-- With a IN (1,2,3), ORDER BY b, c LIMIT 6 +-- Prefix: a, Suffix: (b, c) +EXPLAIN (COSTS OFF) +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b >= 1 +ORDER BY b, c +LIMIT 6; + +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b >= 1 +ORDER BY b, c +LIMIT 6; + +-- Test 8: ORDER BY b DESC, c DESC (multi-column suffix, backward) +-- With a IN (1,2,3), ORDER BY b DESC, c DESC LIMIT 6 +EXPLAIN (COSTS OFF) +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b <= 10 +ORDER BY b DESC, c DESC +LIMIT 
6; + +SELECT a, b, c +FROM btree_merge_multi +WHERE a IN (1, 2, 3) AND b <= 10 +ORDER BY b DESC, c DESC +LIMIT 6; + +DROP TABLE btree_merge_multi; \ No newline at end of file -- 2.40.0