From d7eb969bd2da3bfd513391f4595871fb5a39c803 Mon Sep 17 00:00:00 2001 From: Tatsuo Ishii Date: Sun, 15 Feb 2026 17:47:48 +0900 Subject: [PATCH v43 4/8] Row pattern recognition patch (planner). --- src/backend/optimizer/plan/Makefile | 1 + src/backend/optimizer/plan/createplan.c | 76 +- src/backend/optimizer/plan/meson.build | 1 + src/backend/optimizer/plan/planner.c | 3 + src/backend/optimizer/plan/rpr.c | 1806 +++++++++++++++++++++ src/backend/optimizer/plan/setrefs.c | 27 +- src/backend/optimizer/prep/prepjointree.c | 9 + src/include/nodes/plannodes.h | 74 + src/include/optimizer/rpr.h | 57 + 9 files changed, 2036 insertions(+), 18 deletions(-) create mode 100644 src/backend/optimizer/plan/rpr.c create mode 100644 src/include/optimizer/rpr.h diff --git a/src/backend/optimizer/plan/Makefile b/src/backend/optimizer/plan/Makefile index 80ef162e484..7e0167789d8 100644 --- a/src/backend/optimizer/plan/Makefile +++ b/src/backend/optimizer/plan/Makefile @@ -19,6 +19,7 @@ OBJS = \ planagg.o \ planmain.o \ planner.o \ + rpr.o \ setrefs.o \ subselect.o diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 21f1988cf22..bbc2c7e71f4 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -35,6 +35,7 @@ #include "optimizer/prep.h" #include "optimizer/restrictinfo.h" #include "optimizer/subselect.h" +#include "optimizer/rpr.h" #include "optimizer/tlist.h" #include "parser/parse_clause.h" #include "parser/parsetree.h" @@ -287,7 +288,10 @@ static Memoize *make_memoize(Plan *lefttree, Oid *hashoperators, static WindowAgg *make_windowagg(List *tlist, WindowClause *wc, int partNumCols, AttrNumber *partColIdx, Oid *partOperators, Oid *partCollations, int ordNumCols, AttrNumber *ordColIdx, Oid *ordOperators, Oid *ordCollations, - List *runCondition, List *qual, bool topWindow, + List *runCondition, RPSkipTo rpSkipTo, + RPRPattern *compiledPattern, + List *defineClause, + List *qual, bool topWindow, Plan *lefttree); static Group *make_group(List *tlist, List *qual, int numGroupCols, AttrNumber *grpColIdx, Oid *grpOperators, Oid *grpCollations, @@ -2530,21 +2534,50 @@ create_windowagg_plan(PlannerInfo *root, WindowAggPath *best_path) ordNumCols++; } - /* And finally we can make the WindowAgg node */ - plan = make_windowagg(tlist, - wc, - partNumCols, - partColIdx, - partOperators, - partCollations, - ordNumCols, - ordColIdx, - ordOperators, - ordCollations, - best_path->runCondition, - best_path->qual, - best_path->topwindow, - subplan); + /* Build RPR pattern and filter defineClause */ + { + List *defineVariableList = NIL; + List *filteredDefineClause = NIL; + RPRPattern *compiledPattern = NULL; + + if (wc->rpPattern) + { + List *patternVars; + + /* + * Filter defineClause to include only variables used in PATTERN. + * This eliminates unnecessary DEFINE evaluations at runtime. + */ + patternVars = collectPatternVariables(wc->rpPattern); + filteredDefineClause = filterDefineClause(wc->defineClause, + patternVars, + &defineVariableList); + + compiledPattern = buildRPRPattern(wc->rpPattern, + defineVariableList, + wc->rpSkipTo, + wc->frameOptions); + } + + /* And finally we can make the WindowAgg node */ + plan = make_windowagg(tlist, + wc, + partNumCols, + partColIdx, + partOperators, + partCollations, + ordNumCols, + ordColIdx, + ordOperators, + ordCollations, + best_path->runCondition, + wc->rpSkipTo, + compiledPattern, + filteredDefineClause, + best_path->qual, + best_path->topwindow, + subplan); + } copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -6611,7 +6644,10 @@ static WindowAgg * make_windowagg(List *tlist, WindowClause *wc, int partNumCols, AttrNumber *partColIdx, Oid *partOperators, Oid *partCollations, int ordNumCols, AttrNumber *ordColIdx, Oid *ordOperators, Oid *ordCollations, - List *runCondition, List *qual, bool topWindow, Plan *lefttree) + List *runCondition, RPSkipTo rpSkipTo, + RPRPattern *compiledPattern, + List *defineClause, + List *qual, bool topWindow, Plan *lefttree) { WindowAgg *node = makeNode(WindowAgg); Plan *plan = &node->plan; @@ -6638,6 +6674,12 @@ make_windowagg(List *tlist, WindowClause *wc, node->inRangeAsc = wc->inRangeAsc; node->inRangeNullsFirst = wc->inRangeNullsFirst; node->topWindow = topWindow; + node->rpSkipTo = rpSkipTo; + + /* Store compiled pattern for NFA execution */ + node->rpPattern = compiledPattern; + + node->defineClause = defineClause; plan->targetlist = tlist; plan->lefttree = lefttree; diff --git a/src/backend/optimizer/plan/meson.build b/src/backend/optimizer/plan/meson.build index c565b2adbcc..5b2381cd774 100644 --- a/src/backend/optimizer/plan/meson.build +++ b/src/backend/optimizer/plan/meson.build @@ -7,6 +7,7 @@ backend_sources += files( 'planagg.c', 'planmain.c', 'planner.c', + 'rpr.c', 'setrefs.c', 'subselect.c', ) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 006b3281969..3950e0dd1af 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -1014,6 +1014,9 @@ subquery_planner(PlannerGlobal *glob, Query *parse, char *plan_name, EXPRKIND_LIMIT); wc->endOffset = preprocess_expression(root, wc->endOffset, EXPRKIND_LIMIT); + wc->defineClause = (List *) preprocess_expression(root, + (Node *) wc->defineClause, + EXPRKIND_TARGET); } parse->limitOffset = preprocess_expression(root, parse->limitOffset, diff --git a/src/backend/optimizer/plan/rpr.c b/src/backend/optimizer/plan/rpr.c new file mode 100644 index 00000000000..112ed034fe2 --- /dev/null +++ b/src/backend/optimizer/plan/rpr.c @@ -0,0 +1,1806 @@ +/*------------------------------------------------------------------------- + * + * rpr.c + * Row Pattern Recognition pattern compilation for planner + * + * This file contains functions for optimizing RPR pattern AST and + * compiling it to bytecode for execution by WindowAgg. + * + * Key components: + * 1. Pattern Optimization: Simplifies patterns before compilation + * (e.g., flatten nested SEQ/ALT, merge consecutive vars) + * 2. Pattern Compilation: Converts AST to flat element array for NFA + * 3. Absorption Analysis: Computes flags for O(n^2)->O(n) optimization + * + * Context Absorption Optimization: + * When a pattern starts with a greedy unbounded element (e.g., A+ or (A B)+), + * newer contexts cannot produce longer matches than older contexts. + * By absorbing (eliminating) redundant newer contexts, we reduce + * complexity from O(n^2) to O(n) for patterns like A+ B. + * + * The absorption analysis uses two element flags: + * - RPR_ELEM_ABSORBABLE: marks WHERE to compare (judgment point) + * - RPR_ELEM_ABSORBABLE_BRANCH: marks the absorbable region + * + * See computeAbsorbability() and the detailed comments before + * isUnboundedStart() for the full design explanation. + * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/optimizer/plan/rpr.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "nodes/makefuncs.h" +#include "optimizer/rpr.h" + +/* Forward declarations - pattern comparison */ +static bool rprPatternEqual(RPRPatternNode *a, RPRPatternNode *b); +static bool rprPatternChildrenEqual(List *a, List *b); + +/* Forward declarations - pattern optimization (shared) */ +static RPRPatternNode *tryUnwrapSingleChild(RPRPatternNode *pattern); + +/* Forward declarations - SEQ optimization */ +static List *flattenSeqChildren(List *children); +static List *mergeConsecutiveVars(List *children); +static List *mergeConsecutiveGroups(List *children); +static List *mergeConsecutiveAlts(List *children); +static List *mergeGroupPrefixSuffix(List *children); +static RPRPatternNode *optimizeSeqPattern(RPRPatternNode *pattern); + +/* Forward declarations - ALT optimization */ +static List *flattenAltChildren(List *children); +static List *removeDuplicateAlternatives(List *children); +static RPRPatternNode *optimizeAltPattern(RPRPatternNode *pattern); + +/* Forward declarations - GROUP optimization */ +static RPRPatternNode *tryMultiplyQuantifiers(RPRPatternNode *pattern); +static RPRPatternNode *tryUnwrapGroup(RPRPatternNode *pattern); +static RPRPatternNode *optimizeGroupPattern(RPRPatternNode *pattern); + +/* Forward declarations - optimization dispatcher */ +static RPRPatternNode *optimizeRPRPattern(RPRPatternNode *pattern); + +/* Forward declarations - pattern compilation */ +static int collectDefineVariables(List *defineVariableList, char **varNames); +static void scanRPRPatternRecursive(RPRPatternNode *node, char **varNames, + int *numVars, int *numElements, + RPRDepth depth, RPRDepth *maxDepth); +static void scanRPRPattern(RPRPatternNode *node, char **varNames, int *numVars, + int *numElements, RPRDepth *maxDepth); +static RPRPattern *allocateRPRPattern(int numVars, int numElements, + RPRDepth maxDepth, char **varNamesStack); +static RPRVarId getVarIdFromPattern(RPRPattern *pat, const char *varName); +static void fillRPRPatternVar(RPRPatternNode *node, RPRPattern *pat, + int *idx, RPRDepth depth); +static void fillRPRPatternGroup(RPRPatternNode *node, RPRPattern *pat, + int *idx, RPRDepth depth); +static void fillRPRPatternAlt(RPRPatternNode *node, RPRPattern *pat, + int *idx, RPRDepth depth); +static void fillRPRPattern(RPRPatternNode *node, RPRPattern *pat, + int *idx, RPRDepth depth); +static void finalizeRPRPattern(RPRPattern *result); + +/* Forward declarations - context absorption */ +static bool isUnboundedStart(RPRPattern *pattern, RPRElemIdx idx); +static void computeAbsorbabilityRecursive(RPRPattern *pattern, + RPRElemIdx startIdx, + bool *hasAbsorbable); +static void computeAbsorbability(RPRPattern *pattern); + +/* + * rprPatternEqual + * Compare two RPRPatternNode trees for equality. + * + * Returns true if the trees are structurally identical. + */ +static bool +rprPatternEqual(RPRPatternNode *a, RPRPatternNode *b) +{ + /* Pattern nodes in children lists must never be NULL */ + Assert(a != NULL && b != NULL); + + /* Must have same node type and quantifiers */ + if (a->nodeType != b->nodeType) + return false; + if (a->min != b->min || a->max != b->max) + return false; + if (a->reluctant != b->reluctant) + return false; + + switch (a->nodeType) + { + case RPR_PATTERN_VAR: + return strcmp(a->varName, b->varName) == 0; + + case RPR_PATTERN_SEQ: + case RPR_PATTERN_ALT: + case RPR_PATTERN_GROUP: + return rprPatternChildrenEqual(a->children, b->children); + } + + return false; /* keep compiler quiet */ +} + +/* + * rprPatternChildrenEqual + * Compare children lists of two pattern nodes for equality. + * + * Returns true if the children lists are structurally identical. + */ +static bool +rprPatternChildrenEqual(List *a, List *b) +{ + ListCell *lca, + *lcb; + + if (list_length(a) != list_length(b)) + return false; + + forboth(lca, a, lcb, b) + { + if (!rprPatternEqual((RPRPatternNode *) lfirst(lca), + (RPRPatternNode *) lfirst(lcb))) + return false; + } + + return true; +} + +/* + * tryUnwrapSingleChild + * Try to unwrap pattern node with single child. + * + * Examples (internal node representation): + * SEQ[A] -> A (single-element sequence becomes the element) + * ALT[A] -> A (single-alternative becomes the alternative) + * + * If pattern has exactly one child, return the child directly. + * Otherwise returns the pattern unchanged. + * Used by both SEQ and ALT optimization. + */ +static RPRPatternNode * +tryUnwrapSingleChild(RPRPatternNode *pattern) +{ + if (list_length(pattern->children) == 1) + return (RPRPatternNode *) linitial(pattern->children); + + return pattern; +} + +/* + * flattenSeqChildren + * Recursively optimize children and flatten nested SEQ. + * + * Example: + * SEQ(A, SEQ(B, C)) -> SEQ(A, B, C) + * + * Returns a new list with optimized children, with nested SEQ children + * flattened into the parent list. + */ +static List * +flattenSeqChildren(List *children) +{ + ListCell *lc; + List *newChildren = NIL; + + foreach(lc, children) + { + RPRPatternNode *child = (RPRPatternNode *) lfirst(lc); + RPRPatternNode *opt = optimizeRPRPattern(child); + + /* GROUP{1,1} should have been unwrapped by optimizeGroupPattern */ + Assert(!(opt->nodeType == RPR_PATTERN_GROUP && + opt->min == 1 && opt->max == 1 && opt->reluctant < 0)); + + if (opt->nodeType == RPR_PATTERN_SEQ) + { + newChildren = list_concat(newChildren, + list_copy(opt->children)); + } + else + { + newChildren = lappend(newChildren, opt); + } + } + + return newChildren; +} + +/* + * mergeConsecutiveVars + * Merge consecutive identical VAR nodes. + * + * Examples: + * A{m1,M1} A{m2,M2} -> A{m1+m2, M1+M2} where INF + x = INF. + * + * Only merges non-reluctant VAR nodes with the same variable name. + */ +static List * +mergeConsecutiveVars(List *children) +{ + ListCell *lc; + List *mergedChildren = NIL; + RPRPatternNode *prev = NULL; + + foreach(lc, children) + { + RPRPatternNode *child = (RPRPatternNode *) lfirst(lc); + + if (child->nodeType == RPR_PATTERN_VAR && child->reluctant < 0) + { + /* ---------------------- + * Can merge consecutive VAR nodes if: + * 1. Same variable name + * 2. No min overflow: prev->min + child->min <= INF + * 3. No max overflow: prev->max + child->max <= INF (or either is INF) + */ + if (prev != NULL && + strcmp(prev->varName, child->varName) == 0 && + prev->min <= RPR_QUANTITY_INF - child->min && + (prev->max <= RPR_QUANTITY_INF - child->max || + prev->max == RPR_QUANTITY_INF || + child->max == RPR_QUANTITY_INF)) + { + /* + * Merge: accumulate min/max into prev. prev is guaranteed to + * be a non-reluctant VAR by the outer condition. + */ + Assert(prev->nodeType == RPR_PATTERN_VAR && prev->reluctant < 0); + + prev->min += child->min; + + if (prev->max == RPR_QUANTITY_INF || + child->max == RPR_QUANTITY_INF) + prev->max = RPR_QUANTITY_INF; + else + prev->max += child->max; + } + else + { + /* Flush previous and start new */ + if (prev != NULL) + mergedChildren = lappend(mergedChildren, prev); + prev = child; + } + } + else + { + /* Non-mergeable - flush previous */ + if (prev != NULL) + mergedChildren = lappend(mergedChildren, prev); + mergedChildren = lappend(mergedChildren, child); + prev = NULL; + } + } + + /* Flush remaining */ + if (prev != NULL) + mergedChildren = lappend(mergedChildren, prev); + + return mergedChildren; +} + +/* + * mergeConsecutiveGroups + * Merge consecutive identical GROUP nodes. + * + * Example: + * (A B)+ (A B)+ -> (A B){2,} + * + * Only merges non-reluctant GROUP nodes with identical children. + */ +static List * +mergeConsecutiveGroups(List *children) +{ + ListCell *lc; + List *mergedChildren = NIL; + RPRPatternNode *prev = NULL; + + foreach(lc, children) + { + RPRPatternNode *child = (RPRPatternNode *) lfirst(lc); + + if (child->nodeType == RPR_PATTERN_GROUP && child->reluctant < 0) + { + /* ---------------------- + * Can merge consecutive GROUP nodes if: + * 1. Identical children + * 2. No min overflow: prev->min + child->min <= INF + * 3. No max overflow: prev->max + child->max <= INF (or either is INF) + */ + if (prev != NULL && + rprPatternChildrenEqual(prev->children, child->children) && + prev->min <= RPR_QUANTITY_INF - child->min && + (prev->max <= RPR_QUANTITY_INF - child->max || + prev->max == RPR_QUANTITY_INF || + child->max == RPR_QUANTITY_INF)) + { + /* + * Merge: accumulate min/max into prev. prev is guaranteed to + * be a non-reluctant GROUP by the outer condition. + */ + Assert(prev->nodeType == RPR_PATTERN_GROUP && prev->reluctant < 0); + + prev->min += child->min; + + if (prev->max == RPR_QUANTITY_INF || + child->max == RPR_QUANTITY_INF) + prev->max = RPR_QUANTITY_INF; + else + prev->max += child->max; + } + else + { + /* Flush previous and start new */ + if (prev != NULL) + mergedChildren = lappend(mergedChildren, prev); + prev = child; + } + } + else + { + /* Non-mergeable - flush previous */ + if (prev != NULL) + mergedChildren = lappend(mergedChildren, prev); + mergedChildren = lappend(mergedChildren, child); + prev = NULL; + } + } + + /* Flush remaining */ + if (prev != NULL) + mergedChildren = lappend(mergedChildren, prev); + + return mergedChildren; +} + +/* + * mergeConsecutiveAlts + * Merge consecutive identical ALT nodes into a GROUP. + * + * Example: + * (A | B) (A | B) (A | B) -> (A | B){3} + * + * After GROUP{1,1} unwrap, bare alternations like (A | B) become ALT nodes + * in the SEQ. This step detects consecutive identical ALT nodes and wraps + * them in a GROUP with the appropriate quantifier. + */ +static List * +mergeConsecutiveAlts(List *children) +{ + ListCell *lc; + List *mergedChildren = NIL; + RPRPatternNode *prev = NULL; + int count = 0; + + foreach(lc, children) + { + RPRPatternNode *child = (RPRPatternNode *) lfirst(lc); + + if (child->nodeType == RPR_PATTERN_ALT && child->reluctant < 0) + { + if (prev != NULL && + rprPatternChildrenEqual(prev->children, child->children)) + { + /* Same ALT as prev - accumulate */ + count++; + } + else + { + /* Different ALT or first ALT - flush previous */ + if (prev != NULL) + { + if (count > 1) + { + /* Wrap in GROUP{count,count}(ALT) */ + RPRPatternNode *group = makeNode(RPRPatternNode); + + group->nodeType = RPR_PATTERN_GROUP; + group->min = count; + group->max = count; + group->reluctant = -1; + group->location = -1; + group->children = list_make1(prev); + mergedChildren = lappend(mergedChildren, group); + } + else + mergedChildren = lappend(mergedChildren, prev); + } + prev = child; + count = 1; + } + } + else + { + /* Non-ALT - flush previous */ + if (prev != NULL) + { + if (count > 1) + { + RPRPatternNode *group = makeNode(RPRPatternNode); + + group->nodeType = RPR_PATTERN_GROUP; + group->min = count; + group->max = count; + group->reluctant = -1; + group->location = -1; + group->children = list_make1(prev); + mergedChildren = lappend(mergedChildren, group); + } + else + mergedChildren = lappend(mergedChildren, prev); + } + mergedChildren = lappend(mergedChildren, child); + prev = NULL; + count = 0; + } + } + + /* Flush remaining */ + if (prev != NULL) + { + if (count > 1) + { + RPRPatternNode *group = makeNode(RPRPatternNode); + + group->nodeType = RPR_PATTERN_GROUP; + group->min = count; + group->max = count; + group->reluctant = -1; + group->location = -1; + group->children = list_make1(prev); + mergedChildren = lappend(mergedChildren, group); + } + else + mergedChildren = lappend(mergedChildren, prev); + } + + return mergedChildren; +} + +/* + * mergeGroupPrefixSuffix + * Merge sequence prefix/suffix into GROUP with matching children. + * + * Examples: + * A B (A B)+ -> (A B){2,} + * (A B)+ A B -> (A B){2,} + * A B (A B)+ A B -> (A B){3,} + */ +static List * +mergeGroupPrefixSuffix(List *children) +{ + List *result = NIL; + int numChildren = list_length(children); + int i; + int skipUntil = -1; /* skip suffix elements already absorbed */ + + for (i = 0; i < numChildren; i++) + { + RPRPatternNode *child = (RPRPatternNode *) list_nth(children, i); + + /* + * The suffix absorption logic below adjusts i to skip absorbed + * elements, ensuring we never revisit them. Verify this invariant. + */ + Assert(i >= skipUntil); + + /* + * If this is a GROUP, see if preceding/following elements match its + * children. GROUP's content may be wrapped in a SEQ - unwrap for + * comparison. + */ + if (child->nodeType == RPR_PATTERN_GROUP && child->reluctant < 0) + { + List *groupContent = child->children; + int groupChildCount; + int prefixLen = list_length(result); + + /* + * If GROUP has single SEQ child, compare with SEQ's children. + * e.g., (A B)+ internally contains sequence A B; compare against + * that. + */ + if (list_length(groupContent) == 1) + { + RPRPatternNode *inner = (RPRPatternNode *) linitial(groupContent); + + if (inner->nodeType == RPR_PATTERN_SEQ) + groupContent = inner->children; + } + + groupChildCount = list_length(groupContent); + + /* + * PREFIX MERGE: Check if preceding elements match. Keep absorbing + * as long as we have matching prefixes. + */ + while (prefixLen >= groupChildCount && groupChildCount > 0) + { + List *prefixElements = NIL; + int j; + + /* Extract last groupChildCount elements from prefix */ + for (j = prefixLen - groupChildCount; j < prefixLen; j++) + { + prefixElements = lappend(prefixElements, + list_nth(result, j)); + } + + /* Compare with GROUP's (possibly unwrapped) children */ + if (rprPatternChildrenEqual(prefixElements, groupContent) && + child->min < RPR_QUANTITY_INF) + { + /* + * Match! Merge by incrementing GROUP's min. Remove the + * prefix elements from output. + */ + child->min += 1; + + /* Rebuild result without matched prefix */ + { + List *trimmed = NIL; + + for (j = 0; j < prefixLen - groupChildCount; j++) + { + trimmed = lappend(trimmed, + list_nth(result, j)); + } + result = trimmed; + prefixLen = list_length(result); + } + } + else + { + list_free(prefixElements); + break; + } + + list_free(prefixElements); + } + + /* + * SUFFIX MERGE: Check if following elements match. Keep absorbing + * as long as we have matching suffixes. + */ + while (i + groupChildCount < numChildren && groupChildCount > 0) + { + List *suffixElements = NIL; + int j; + int suffixStart = i + 1; + + /* suffixStart always >= skipUntil after i adjustment */ + Assert(skipUntil <= suffixStart); + + /* Extract next groupChildCount elements as suffix */ + for (j = 0; j < groupChildCount; j++) + { + int idx = suffixStart + j; + + /* while condition guarantees idx < numChildren */ + Assert(idx < numChildren); + suffixElements = lappend(suffixElements, + list_nth(children, idx)); + } + + /* Compare with GROUP's children */ + if (list_length(suffixElements) == groupChildCount && + rprPatternChildrenEqual(suffixElements, groupContent) && + child->min < RPR_QUANTITY_INF) + { + /* + * Match! Absorb suffix by incrementing min and skipping. + */ + child->min += 1; + skipUntil = suffixStart + groupChildCount; + + /* + * Update i to continue suffix check after absorbed + * elements + */ + i = skipUntil - 1; + } + else + { + list_free(suffixElements); + break; + } + + list_free(suffixElements); + } + } + + result = lappend(result, child); + } + + return result; +} + +/* + * optimizeSeqPattern + * Optimize SEQ pattern node. + * + * Optimizations: + * 1. Flatten nested SEQ and GROUP{1,1} + * 2. Merge consecutive identical VAR nodes + * 3. Merge consecutive identical GROUP nodes + * 4. Merge consecutive identical ALT nodes into GROUP + * 5. Merge prefix/suffix into GROUP with matching children + * 6. Unwrap single-item SEQ + */ +static RPRPatternNode * +optimizeSeqPattern(RPRPatternNode *pattern) +{ + /* Recursively optimize children and flatten nested SEQ/GROUP{1,1} */ + pattern->children = flattenSeqChildren(pattern->children); + + /* Merge consecutive identical VAR nodes */ + pattern->children = mergeConsecutiveVars(pattern->children); + + /* Merge consecutive identical GROUP nodes */ + pattern->children = mergeConsecutiveGroups(pattern->children); + + /* Merge consecutive identical ALT nodes into GROUP */ + pattern->children = mergeConsecutiveAlts(pattern->children); + + /* Merge prefix/suffix into GROUP with matching children */ + pattern->children = mergeGroupPrefixSuffix(pattern->children); + + /* Unwrap single-item SEQ */ + return tryUnwrapSingleChild(pattern); +} + +/* + * flattenAltChildren + * Recursively optimize children and flatten nested ALT nodes. + * + * Example: + * (A | (B | C)) -> (A | B | C) + * + * Returns a new list with optimized children, with nested ALT children + * flattened into the parent list. + */ +static List * +flattenAltChildren(List *children) +{ + ListCell *lc; + List *newChildren = NIL; + + foreach(lc, children) + { + RPRPatternNode *child = (RPRPatternNode *) lfirst(lc); + RPRPatternNode *opt = optimizeRPRPattern(child); + + if (opt->nodeType == RPR_PATTERN_ALT) + newChildren = list_concat(newChildren, list_copy(opt->children)); + else + newChildren = lappend(newChildren, opt); + } + + return newChildren; +} + +/* + * removeDuplicateAlternatives + * Remove duplicate alternatives from a list. + * + * Examples: + * (A | B | A) -> (A | B) + * (X | Y | X | Z | Y) -> (X | Y | Z) + * + * Returns a new list with only unique children (first occurrence kept). + */ +static List * +removeDuplicateAlternatives(List *children) +{ + ListCell *lc; + ListCell *lc2; + List *uniqueChildren = NIL; + + foreach(lc, children) + { + RPRPatternNode *child = (RPRPatternNode *) lfirst(lc); + bool isDuplicate = false; + + foreach(lc2, uniqueChildren) + { + if (rprPatternEqual((RPRPatternNode *) lfirst(lc2), child)) + { + isDuplicate = true; + break; + } + } + + if (!isDuplicate) + uniqueChildren = lappend(uniqueChildren, child); + } + + return uniqueChildren; +} + +/* + * optimizeAltPattern + * Optimize ALT pattern node. + * + * Optimizations: + * 1. Flatten nested ALT + * 2. Remove duplicate alternatives + * 3. Unwrap single-item ALT + */ +static RPRPatternNode * +optimizeAltPattern(RPRPatternNode *pattern) +{ + /* Recursively optimize children and flatten nested ALT */ + pattern->children = flattenAltChildren(pattern->children); + + /* Remove duplicate alternatives */ + pattern->children = removeDuplicateAlternatives(pattern->children); + + /* Unwrap single-item ALT */ + return tryUnwrapSingleChild(pattern); +} + +/* + * tryMultiplyQuantifiers + * Try to multiply quantifiers. + * + * Multiplication is SAFE when: + * 1. Both unbounded: (A*)* -> A*, (A+)+ -> A+ + * 2. Outer exact: (A{m,n}){k} -> A{m*k, n*k} + * 3. Outer range + child {1,1}: (A){2,} -> A{2,} + * + * Multiplication is NOT safe when: + * - Only child unbounded: (A+){3} has different semantics + * - Outer range + child not {1,1}: gaps possible + * e.g., (A{2}){2,3} yields 4,6 only (not 4,5,6) + * + * Returns the child node with multiplied quantifiers if successful, + * otherwise returns the original pattern unchanged. + */ +static RPRPatternNode * +tryMultiplyQuantifiers(RPRPatternNode *pattern) +{ + RPRPatternNode *child; + int64 new_min_64; + int64 new_max_64; + + /* Parser always creates GROUP with exactly one child */ + Assert(list_length(pattern->children) == 1); + + if (pattern->reluctant >= 0) + return pattern; + + child = (RPRPatternNode *) linitial(pattern->children); + + if ((child->nodeType != RPR_PATTERN_VAR && + child->nodeType != RPR_PATTERN_GROUP) || + child->reluctant >= 0) + return pattern; + + /* Case 1: Both unbounded - (A*)* -> A*, (A+)+ -> A+ */ + if (child->max == RPR_QUANTITY_INF && pattern->max == RPR_QUANTITY_INF) + { + new_min_64 = (int64) child->min * pattern->min; + if (new_min_64 >= RPR_QUANTITY_INF) + return pattern; /* overflow, skip optimization */ + + child->min = (int) new_min_64; + child->max = RPR_QUANTITY_INF; + return child; + } + + /* + * Case 2/3: Safe when child is finite AND (outer is exact OR child is + * {1,1}) + */ + if (child->max != RPR_QUANTITY_INF && + (pattern->min == pattern->max || + (child->min == 1 && child->max == 1))) + { + new_min_64 = (int64) pattern->min * child->min; + if (new_min_64 >= RPR_QUANTITY_INF) + return pattern; + + if (pattern->max == RPR_QUANTITY_INF) + new_max_64 = RPR_QUANTITY_INF; + else + { + new_max_64 = (int64) pattern->max * child->max; + + if (new_max_64 >= RPR_QUANTITY_INF) + return pattern; + } + + child->min = (int) new_min_64; + child->max = (int) new_max_64; + return child; + } + + /* Not safe to multiply */ + return pattern; +} + +/* + * tryUnwrapGroup + * Try to unwrap GROUP{1,1} node. + * + * Examples: + * (A){1,1} -> A + * (A B){1,1} -> SEQ(A, B) (unwraps the inner SEQ) + * + * If GROUP has min=1, max=1, and is not reluctant, return the child directly. + * Otherwise returns the pattern unchanged. + * + * Note: Parser always creates GROUP with exactly one child via list_make1(). + */ +static RPRPatternNode * +tryUnwrapGroup(RPRPatternNode *pattern) +{ + if (pattern->min != 1 || pattern->max != 1 || pattern->reluctant >= 0) + return pattern; + + /* Parser always creates GROUP with single child */ + Assert(list_length(pattern->children) == 1); + return (RPRPatternNode *) linitial(pattern->children); +} + +/* + * optimizeGroupPattern + * Optimize GROUP pattern node. + * + * Optimizations: + * 1. Quantifier multiplication: (A{m}){n} -> A{m*n} + * 2. Unwrap GROUP{1,1} + */ +static RPRPatternNode * +optimizeGroupPattern(RPRPatternNode *pattern) +{ + ListCell *lc; + List *newChildren; + RPRPatternNode *result; + + /* Recursively optimize children */ + newChildren = NIL; + foreach(lc, pattern->children) + { + RPRPatternNode *child = (RPRPatternNode *) lfirst(lc); + + newChildren = lappend(newChildren, optimizeRPRPattern(child)); + } + pattern->children = newChildren; + + /* Try quantifier multiplication */ + result = tryMultiplyQuantifiers(pattern); + if (result != pattern) + return result; + + /* Try unwrapping GROUP{1,1} */ + return tryUnwrapGroup(pattern); +} + +/* + * optimizeRPRPattern + * Optimize RPRPatternNode tree (dispatcher). + * + * Dispatches to type-specific optimization functions. + * Returns the optimized pattern (may be a different node). + */ +static RPRPatternNode * +optimizeRPRPattern(RPRPatternNode *pattern) +{ + /* Pattern nodes from parser are never NULL */ + Assert(pattern != NULL); + + switch (pattern->nodeType) + { + case RPR_PATTERN_VAR: + return pattern; + case RPR_PATTERN_SEQ: + return optimizeSeqPattern(pattern); + case RPR_PATTERN_ALT: + return optimizeAltPattern(pattern); + case RPR_PATTERN_GROUP: + return optimizeGroupPattern(pattern); + } + + return pattern; /* keep compiler quiet */ +} + +/* + * collectDefineVariables + * Collect variable names from DEFINE clause. + * + * Populates varNames array with variable names in DEFINE order. + * This ensures varId == defineIdx, eliminating runtime mapping. + * Returns the number of variables collected. + */ +static int +collectDefineVariables(List *defineVariableList, char **varNames) +{ + ListCell *lc; + int numVars = 0; + + foreach(lc, defineVariableList) + { + /* Parser already checked this limit in transformDefineClause */ + Assert(numVars < RPR_VARID_MAX); + + varNames[numVars++] = strVal(lfirst(lc)); + } + + return numVars; +} + +/* + * scanRPRPatternRecursive + * Recursively scan pattern AST (pass 1 internal). + * + * Collects unique variable names and counts elements while tracking depth. + * Variables from DEFINE clause are already in varNames; this adds any + * additional variables found in the pattern. + */ +static void +scanRPRPatternRecursive(RPRPatternNode *node, char **varNames, int *numVars, + int *numElements, RPRDepth depth, RPRDepth *maxDepth) +{ + ListCell *lc; + int i; + + /* Pattern nodes from parser are never NULL */ + Assert(node != NULL); + + /* Check recursion depth limit before overflow occurs */ + if (depth >= RPR_DEPTH_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("pattern nesting too deep"), + errdetail("Pattern nesting depth %d exceeds maximum %d.", + depth, RPR_DEPTH_MAX - 1))); + + /* Track maximum depth */ + if (depth > *maxDepth) + *maxDepth = depth; + + switch (node->nodeType) + { + case RPR_PATTERN_VAR: + /* Count element */ + (*numElements)++; + + /* Collect variable name if not already present */ + for (i = 0; i < *numVars; i++) + { + if (strcmp(varNames[i], node->varName) == 0) + return; /* Already have this variable */ + } + + /* + * Variable not in DEFINE clause - this is valid per SQL standard. + * Such variables are implicitly TRUE. Add to varNames so they get + * a varId >= defineVariableList length, which executor treats as + * TRUE. + */ + Assert(*numVars < RPR_VARID_MAX); + varNames[(*numVars)++] = node->varName; + break; + + case RPR_PATTERN_SEQ: + /* Sequence: just recurse into children */ + foreach(lc, node->children) + { + scanRPRPatternRecursive((RPRPatternNode *) lfirst(lc), varNames, + numVars, numElements, depth, maxDepth); + } + break; + + case RPR_PATTERN_GROUP: + /* Add BEGIN element if group has non-trivial quantifier */ + if (node->min != 1 || node->max != 1) + (*numElements)++; + + /* Recurse into children at increased depth */ + foreach(lc, node->children) + { + scanRPRPatternRecursive((RPRPatternNode *) lfirst(lc), varNames, + numVars, numElements, depth + 1, maxDepth); + } + + /* Add END element if group has non-trivial quantifier */ + if (node->min != 1 || node->max != 1) + (*numElements)++; + break; + + case RPR_PATTERN_ALT: + /* Count ALT start element */ + (*numElements)++; + + /* Recurse into children at increased depth */ + foreach(lc, node->children) + { + scanRPRPatternRecursive((RPRPatternNode *) lfirst(lc), varNames, + numVars, numElements, depth + 1, maxDepth); + } + break; + } +} + +/* + * scanRPRPattern + * Scan pattern AST (pass 1 entry point). + * + * Collects unique variable names (appending to those from DEFINE clause), + * counts total elements (including FIN marker), and tracks maximum depth. + * Reports error if element count exceeds RPR_ELEMIDX_MAX. + */ +static void +scanRPRPattern(RPRPatternNode *node, char **varNames, int *numVars, + int *numElements, RPRDepth *maxDepth) +{ + *numElements = 0; + *maxDepth = 0; + + scanRPRPatternRecursive(node, varNames, numVars, numElements, 0, maxDepth); + + (*numElements)++; /* +1 for FIN marker */ + + if (*numElements > RPR_ELEMIDX_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("pattern too complex"), + errdetail("Pattern has %d elements, maximum is %d.", + *numElements, RPR_ELEMIDX_MAX))); +} + +/* + * allocateRPRPattern + * Allocate and initialize RPRPattern structure. + * + * Creates the pattern structure, copies variable names, and allocates + * the elements array. The elements array is zero-initialized. + */ +static RPRPattern * +allocateRPRPattern(int numVars, int numElements, RPRDepth maxDepth, + char **varNamesStack) +{ + RPRPattern *result; + int i; + + result = makeNode(RPRPattern); + result->numVars = numVars; + result->maxDepth = maxDepth + 1; /* +1: depth is 0-based */ + result->numElements = numElements; + + /* Copy varNames (pattern must have at least one variable) */ + Assert(numVars > 0); + result->varNames = palloc(numVars * sizeof(char *)); + for (i = 0; i < numVars; i++) + result->varNames[i] = pstrdup(varNamesStack[i]); + + /* Allocate elements array (zero-init for reserved fields) */ + Assert(numElements >= 2); + result->elements = palloc0(numElements * sizeof(RPRPatternElement)); + + return result; +} + +/* + * getVarIdFromPattern + * Get variable ID for a variable name from RPRPattern. + * + * Returns the index of the variable in the varNames array. + */ +static RPRVarId +getVarIdFromPattern(RPRPattern *pat, const char *varName) +{ + for (int i = 0; i < pat->numVars; i++) + { + if (strcmp(pat->varNames[i], varName) == 0) + return (RPRVarId) i; + } + + /* Should not happen - variable should already be collected */ + elog(ERROR, "pattern variable \"%s\" not found", varName); + pg_unreachable(); +} + +/* + * fillRPRPatternVar + * Fill a VAR pattern element. + */ +static void +fillRPRPatternVar(RPRPatternNode *node, RPRPattern *pat, int *idx, RPRDepth depth) +{ + RPRPatternElement *elem = &pat->elements[*idx]; + + memset(elem, 0, sizeof(RPRPatternElement)); + elem->varId = getVarIdFromPattern(pat, node->varName); + elem->depth = depth; + elem->min = node->min; + elem->max = (node->max == INT_MAX) ? RPR_QUANTITY_INF : node->max; + elem->next = RPR_ELEMIDX_INVALID; + elem->jump = RPR_ELEMIDX_INVALID; + if (node->reluctant >= 0) + elem->flags |= RPR_ELEM_RELUCTANT; + (*idx)++; +} + +/* + * fillRPRPatternGroup + * Fill a GROUP pattern and its children. + * + * Creates elements for group content at increased depth, plus an END marker + * if the group has a non-trivial quantifier. + */ +static void +fillRPRPatternGroup(RPRPatternNode *node, RPRPattern *pat, int *idx, RPRDepth depth) +{ + ListCell *lc; + int groupStartIdx = *idx; + int beginIdx = -1; + + /* Add BEGIN marker if group has non-trivial quantifier */ + if (node->min != 1 || node->max != 1) + { + RPRPatternElement *elem = &pat->elements[*idx]; + + beginIdx = *idx; + memset(elem, 0, sizeof(RPRPatternElement)); + elem->varId = RPR_VARID_BEGIN; + elem->depth = depth; + elem->min = node->min; + elem->max = (node->max == INT_MAX) ? RPR_QUANTITY_INF : node->max; + elem->next = RPR_ELEMIDX_INVALID; /* set by finalize */ + elem->jump = RPR_ELEMIDX_INVALID; /* set after END */ + if (node->reluctant >= 0) + elem->flags |= RPR_ELEM_RELUCTANT; + (*idx)++; + groupStartIdx = *idx; /* children start after BEGIN */ + } + + foreach(lc, node->children) + { + fillRPRPattern((RPRPatternNode *) lfirst(lc), pat, idx, depth + 1); + } + + /* Add group end marker if group has non-trivial quantifier */ + if (node->min != 1 || node->max != 1) + { + RPRPatternElement *beginElem = &pat->elements[beginIdx]; + RPRPatternElement *endElem = &pat->elements[*idx]; + + memset(endElem, 0, sizeof(RPRPatternElement)); + endElem->varId = RPR_VARID_END; + endElem->depth = depth; + endElem->min = node->min; + endElem->max = (node->max == INT_MAX) ? RPR_QUANTITY_INF : node->max; + endElem->next = RPR_ELEMIDX_INVALID; + endElem->jump = groupStartIdx; /* loop to first child */ + if (node->reluctant >= 0) + endElem->flags |= RPR_ELEM_RELUCTANT; + (*idx)++; + + /* Set BEGIN skip pointer (next is set by finalize) */ + beginElem->jump = *idx; /* skip: go to after END */ + } +} + +/* + * fillRPRPatternAlt + * Fill an ALT pattern and its alternatives. + * + * Creates ALT_START marker, fills each alternative at increased depth, + * sets jump pointers for backtracking, and next pointers for successful paths. + */ +static void +fillRPRPatternAlt(RPRPatternNode *node, RPRPattern *pat, int *idx, RPRDepth depth) +{ + ListCell *lc; + RPRPatternElement *elem; + List *altBranchStarts = NIL; + List *altEndPositions = NIL; + + /* Add alternation start marker */ + elem = &pat->elements[*idx]; + memset(elem, 0, sizeof(RPRPatternElement)); + elem->varId = RPR_VARID_ALT; + elem->depth = depth; + elem->min = 1; + elem->max = 1; + elem->next = RPR_ELEMIDX_INVALID; + elem->jump = RPR_ELEMIDX_INVALID; + (*idx)++; + + /* Fill each alternative */ + foreach(lc, node->children) + { + RPRPatternNode *alt = (RPRPatternNode *) lfirst(lc); + int branchStart = *idx; + + altBranchStarts = lappend_int(altBranchStarts, branchStart); + fillRPRPattern(alt, pat, idx, depth + 1); + altEndPositions = lappend_int(altEndPositions, *idx - 1); + } + + /* Set jump on first element of each alternative to next alternative */ + foreach(lc, altBranchStarts) + { + int firstElemIdx = lfirst_int(lc); + + if (lnext(altBranchStarts, lc) != NULL) + pat->elements[firstElemIdx].jump = lfirst_int(lnext(altBranchStarts, lc)); + } + + /* Set next on last element of each alternative to after the alternation */ + { + int afterAltIdx = *idx; + ListCell *lc2 = list_head(altBranchStarts); + + foreach(lc, altEndPositions) + { + int endPos = lfirst_int(lc); + int branchStart = lfirst_int(lc2); + + if (pat->elements[endPos].next != RPR_ELEMIDX_INVALID) + { + /* + * An inner ALT already set next on this element. Redirect + * all elements in this branch that share the same target to + * point to after this ALT instead. + */ + int oldTarget = pat->elements[endPos].next; + int j; + + for (j = branchStart; j <= endPos; j++) + { + if (pat->elements[j].next == oldTarget) + pat->elements[j].next = afterAltIdx; + } + } + else + { + pat->elements[endPos].next = afterAltIdx; + } + + lc2 = lnext(altBranchStarts, lc2); + } + } + + list_free(altBranchStarts); + list_free(altEndPositions); +} + +/* + * fillRPRPattern + * Fill pattern elements array from AST (pass 2). + * + * Recursively traverses AST and populates pre-allocated elements array. + * Dispatches to type-specific fill functions. + */ +static void +fillRPRPattern(RPRPatternNode *node, RPRPattern *pat, int *idx, RPRDepth depth) +{ + ListCell *lc; + + /* Pattern nodes from parser are never NULL */ + Assert(node != NULL); + + switch (node->nodeType) + { + case RPR_PATTERN_SEQ: + foreach(lc, node->children) + { + fillRPRPattern((RPRPatternNode *) lfirst(lc), pat, idx, depth); + } + break; + + case RPR_PATTERN_VAR: + fillRPRPatternVar(node, pat, idx, depth); + break; + + case RPR_PATTERN_GROUP: + fillRPRPatternGroup(node, pat, idx, depth); + break; + + case RPR_PATTERN_ALT: + fillRPRPatternAlt(node, pat, idx, depth); + break; + } +} + +/* + * finalizeRPRPattern + * Finalize pattern structure after filling elements. + * + * This performs: + * 1. Initialize absorption flag to false + * 2. Set up next pointers for sequential flow + * 3. Add FIN marker at the end + */ +static void +finalizeRPRPattern(RPRPattern *result) +{ + int finIdx = result->numElements - 1; + int i; + RPRPatternElement *finElem; + + /* Initialize absorption flag */ + result->isAbsorbable = false; + + /* Set up next pointers for elements that don't have one */ + for (i = 0; i < finIdx; i++) + { + RPRPatternElement *elem = &result->elements[i]; + + if (elem->next == RPR_ELEMIDX_INVALID) + elem->next = (i < finIdx - 1) ? i + 1 : finIdx; + } + + /* Add FIN marker at the end */ + finElem = &result->elements[finIdx]; + memset(finElem, 0, sizeof(RPRPatternElement)); + finElem->varId = RPR_VARID_FIN; + finElem->depth = 0; + finElem->min = 1; + finElem->max = 1; + finElem->next = RPR_ELEMIDX_INVALID; + finElem->jump = RPR_ELEMIDX_INVALID; +} + +/*------------------------------------------------------------------------- + * CONTEXT ABSORPTION: TWO-FLAG DESIGN + *------------------------------------------------------------------------- + * + * Context absorption eliminates redundant match searches by absorbing newer + * contexts that cannot produce longer matches than older contexts. This + * achieves O(n^2) -> O(n) performance improvement for patterns like A+ B. + * + * Core Insight: + * For pattern A+ B, if Ctx1 starts at row 0 and Ctx2 starts at row 1, + * both matching A continuously, Ctx1 will always have more A matches. + * When B finally appears, Ctx1's match (0 to current) is always longer + * than Ctx2's match (1 to current). So Ctx2 can be safely eliminated. + * + * Two Flags: + * 1. RPR_ELEM_ABSORBABLE - "Absorption judgment point" + * WHERE contexts can be compared for absorption. + * - Simple unbounded VAR (A+): the VAR element itself + * - Unbounded GROUP ((A B)+): the END element only + * + * 2. RPR_ELEM_ABSORBABLE_BRANCH - "Absorbable region marker" + * ALL elements within the absorbable region. + * - Used for tracking state.isAbsorbable at runtime + * - States leaving this region become non-absorbable permanently + * + * Why Two Flags? + * For pattern "(A B)+", contexts at different positions (one at A, + * another at B) cannot be compared - they must synchronize at END. + * + * Example: "(A B)+" with input A B A B A B... + * Row 0 (A): Ctx1 starts, matches A + * Row 1 (B): Ctx1 matches B -> END (count=1) + * Row 2 (A): Ctx1 loops to A, Ctx2 starts at A + * Row 3 (B): Ctx1 at END (count=2), Ctx2 at END (count=1) + * -> Both at END, comparable! Ctx1 absorbs Ctx2. + * + * Contexts synchronize at END every group-length rows. Therefore: + * - ABSORBABLE marks END as judgment point (where to compare) + * - ABSORBABLE_BRANCH keeps state.isAbsorbable=true through A->B->END + * + * Pattern Examples: + * + * Pattern: A+ B + * Element 0 (A): ABSORBABLE | ABSORBABLE_BRANCH <- judgment point + * Element 1 (B): (none) + * -> Compare at A every row. When contexts move to B, absorption stops. + * + * Pattern: (A B)+ C + * Element 0 (A): ABSORBABLE_BRANCH + * Element 1 (B): ABSORBABLE_BRANCH + * Element 2 (END): ABSORBABLE | ABSORBABLE_BRANCH <- judgment point + * Element 3 (C): (none) + * -> Compare at END every 2 rows. When contexts move to C, absorption stops. + * + * Pattern: (A+ B+)+ C + * Element 0 (A): ABSORBABLE | ABSORBABLE_BRANCH <- only first A+ flagged + * Element 1 (B): (none) + * Element 2 (END): (none) + * Element 3 (C): (none) + * -> Only first unbounded portion (A+) gets flags. Absorption happens + * at A during first iteration. After moving to B+, absorption stops. + * + * First Unbounded Portion Strategy: + * The algorithm only flags the FIRST unbounded portion starting from + * element 0. This is sufficient because: + * - Absorption in first portion already achieves O(n) complexity + * - Later portions have different synchronization characteristics + * - Nested unbounded patterns are too complex for simple absorption + * - Complex patterns (nested groups, etc.) naturally die from mismatch + * + * Runtime Usage (in nodeWindowAgg.c): + * - state.isAbsorbable = (previous && elem.ABSORBABLE_BRANCH) + * - Monotonic: once false, stays false (cannot re-enter region) + * - context.hasAbsorbableState: can absorb others (>=1 absorbable state) + * - context.allStatesAbsorbable: can be absorbed (ALL states absorbable) + * - Absorption check: if Ctx1.hasAbsorbable && Ctx2.allAbsorbable, + * compare counts at same elemIdx, absorb if Ctx1.count >= Ctx2.count + * + *------------------------------------------------------------------------- + */ + +/* + * isUnboundedStart + * Check if the element at idx starts an unbounded greedy sequence. + * + * For context absorption to work, the sequence starting at idx must be: + * - Unbounded (max = infinity) + * - Greedy (not reluctant) + * - At the start of current scope + * + * Algorithm: + * - Traverse elements within current scope (parentDepth to startDepth) + * - For GROUP: must be unbounded greedy AND contain only simple {1,1} VARs + * - Sets ABSORBABLE and ABSORBABLE_BRANCH flags on matching elements + * + * Two cases are handled: + * 1. Simple VAR: A+ B C - A has max=INF, gets both flags + * 2. Group: (A B)+ C - END has max=INF, all children are {1,1} VARs + * A,B,END get ABSORBABLE_BRANCH, only END gets ABSORBABLE + * + * Returns false for patterns where absorption cannot work: + * - A B+ (unbounded not at start) + * - A+? B (reluctant quantifier) + * - (A | B)+ (ALT inside group) + * - (A B+)+ (unbounded element inside group) + * - ((A B)+ C)+ (nested unbounded groups) + */ +static bool +isUnboundedStart(RPRPattern *pattern, RPRElemIdx idx) +{ + RPRPatternElement *elem = &pattern->elements[idx]; + RPRDepth startDepth = elem->depth; + RPRPatternElement *nextElem; + RPRPatternElement *e; + + /* Case 1: Simple unbounded VAR at start (greedy only) */ + if (RPRElemIsVar(elem) && elem->max == RPR_QUANTITY_INF && + !RPRElemIsReluctant(elem)) + { + /* Set both flags on first element */ + elem->flags |= RPR_ELEM_ABSORBABLE_BRANCH | RPR_ELEM_ABSORBABLE; + return true; + } + + /* + * Case 2: Unbounded GROUP - traverse siblings at startDepth and check if + * they're all simple {1,1} VARs, then check if END at startDepth - 1 is + * unbounded greedy. + */ + for (e = elem; e->depth == startDepth; e = nextElem) + { + /* Must be simple {1,1} VAR */ + if (!RPRElemIsVar(e) || e->min != 1 || e->max != 1) + return false; + + Assert(e->next != RPR_ELEMIDX_INVALID); + nextElem = &pattern->elements[e->next]; + } + + /* Now e should be END at startDepth - 1 */ + if (e->depth == startDepth - 1 && + RPRElemIsEnd(e) && e->max == RPR_QUANTITY_INF && + !RPRElemIsReluctant(e)) + { + Assert(e->jump == idx); /* END points back to first child */ + + /* Set ABSORBABLE_BRANCH on all children, ABSORBABLE on END only */ + for (e = elem; !RPRElemIsEnd(e); e = &pattern->elements[e->next]) + e->flags |= RPR_ELEM_ABSORBABLE_BRANCH; + e->flags |= RPR_ELEM_ABSORBABLE_BRANCH | RPR_ELEM_ABSORBABLE; + return true; + } + + return false; +} + +/* + * computeAbsorbabilityRecursive + * Recursively check absorbability starting from given index. + * + * If the element at startIdx is ALT, recursively checks each branch independently. + * Each branch gets its own absorbability status, and if any branch is absorbable, + * the ALT element itself is marked with RPR_ELEM_ABSORBABLE_BRANCH. + * + * If BEGIN, skips to first child. + * + * Otherwise (VAR), checks if the element starts an unbounded sequence via + * isUnboundedStart. + */ +static void +computeAbsorbabilityRecursive(RPRPattern *pattern, RPRElemIdx startIdx, + bool *hasAbsorbable) +{ + RPRPatternElement *elem = &pattern->elements[startIdx]; + + if (RPRElemIsAlt(elem)) + { + /* ALT: recursively check each branch */ + RPRElemIdx branchIdx = elem->next; + RPRPatternElement *branchFirst; + bool branchAbsorbable; + + while (branchIdx != RPR_ELEMIDX_INVALID) + { + branchAbsorbable = false; + + Assert(branchIdx < pattern->numElements); + branchFirst = &pattern->elements[branchIdx]; + + /* Stop if element is outside ALT scope (not a branch) */ + if (branchFirst->depth <= elem->depth) + break; + + /* Recursively check this branch */ + computeAbsorbabilityRecursive(pattern, branchIdx, &branchAbsorbable); + if (branchAbsorbable) + { + *hasAbsorbable = true; + } + + branchIdx = branchFirst->jump; + } + + /* Mark ALT element if any branch is absorbable */ + if (*hasAbsorbable) + elem->flags |= RPR_ELEM_ABSORBABLE_BRANCH; + } + else if (RPRElemIsBegin(elem)) + { + /* BEGIN: skip to first child and check that */ + computeAbsorbabilityRecursive(pattern, elem->next, hasAbsorbable); + + /* Mark BEGIN element if contents are absorbable */ + if (*hasAbsorbable) + elem->flags |= RPR_ELEM_ABSORBABLE_BRANCH; + } + else + { + /* Should never reach END - structural invariant of pattern AST */ + Assert(!RPRElemIsEnd(elem)); + + /* Non-ALT, non-BEGIN: check if unbounded start */ + if (isUnboundedStart(pattern, startIdx)) + { + *hasAbsorbable = true; + } + } +} + +/* + * computeAbsorbability + * Determine if pattern supports context absorption optimization. + * + * Context absorption eliminates redundant match searches by absorbing + * newer contexts that cannot produce longer matches than older contexts. + * This achieves O(n^2) -> O(n) performance improvement. + * + * Only greedy unbounded quantifiers at pattern start can be absorbable. + * Reluctant quantifiers are excluded because they don't maintain monotonic + * decrease property required for safe absorption. + * + * This function sets two flags: + * RPR_ELEM_ABSORBABLE: Absorption judgment point + * - Simple unbounded VAR: the VAR itself (e.g., A in A+) + * - Unbounded GROUP: the END element (e.g., END in (A B)+) + * RPR_ELEM_ABSORBABLE_BRANCH: All elements in absorbable region + * - All elements within the same scope as unbounded start + * + * Examples: + * A+ B C - absorbable (A gets both flags) + * (A B)+ C - absorbable (A,B,END get BRANCH, END gets ABSORBABLE) + * A B+ - NOT absorbable (unbounded not at start) + * A+? B C - NOT absorbable (reluctant quantifier) + * (A+ B+)+ - only first A+ on first iteration (nested unbounded not supported) + * A+ | B+ - both branches absorbable independently + * A+ | C D - only A+ branch absorbable (C D branch not absorbable) + * ((A+ B) | C) D - nested ALT: A+ branch is absorbable + */ +static void +computeAbsorbability(RPRPattern *pattern) +{ + bool hasAbsorbable = false; + + /* Parser always produces at least one element + FIN */ + Assert(pattern->numElements >= 2); + + /* Start recursion from first element */ + computeAbsorbabilityRecursive(pattern, 0, &hasAbsorbable); + pattern->isAbsorbable = hasAbsorbable; +} + +/* + * collectPatternVariablesRecursive + * Recursively collect variable names from pattern AST. + */ +static void +collectPatternVariablesRecursive(RPRPatternNode *node, List **varNames) +{ + ListCell *lc; + + Assert(node != NULL); + + switch (node->nodeType) + { + case RPR_PATTERN_VAR: + /* Add variable if not already in list */ + foreach(lc, *varNames) + { + if (strcmp(strVal(lfirst(lc)), node->varName) == 0) + return; /* Already collected */ + } + *varNames = lappend(*varNames, makeString(pstrdup(node->varName))); + break; + + case RPR_PATTERN_SEQ: + case RPR_PATTERN_ALT: + case RPR_PATTERN_GROUP: + foreach(lc, node->children) + { + collectPatternVariablesRecursive((RPRPatternNode *) lfirst(lc), + varNames); + } + break; + } +} + +/* + * collectPatternVariables + * Collect unique variable names used in PATTERN clause. + * + * Returns a list of String nodes containing variable names. + * Used to filter defineClause before varId assignment. + */ +List * +collectPatternVariables(RPRPatternNode *pattern) +{ + List *varNames = NIL; + + /* Caller ensures pattern is not NULL */ + Assert(pattern != NULL); + + collectPatternVariablesRecursive(pattern, &varNames); + return varNames; +} + +/* + * filterDefineClause + * Filter defineClause to include only variables used in PATTERN. + * + * This eliminates unnecessary DEFINE evaluations at runtime. + * Also builds defineVariableList from the filtered result. + * + * Returns filtered defineClause (list of TargetEntry). + * Sets *defineVariableList to list of variable names (String nodes). + */ +List * +filterDefineClause(List *defineClause, List *patternVars, + List **defineVariableList) +{ + List *filteredDefineClause = NIL; + ListCell *lc; + ListCell *lc2; + + *defineVariableList = NIL; + + /* Filter defineClause: keep only variables used in PATTERN */ + foreach(lc, defineClause) + { + TargetEntry *te = (TargetEntry *) lfirst(lc); + + foreach(lc2, patternVars) + { + if (strcmp(strVal(lfirst(lc2)), te->resname) == 0) + { + filteredDefineClause = lappend(filteredDefineClause, te); + break; + } + } + } + + /* Build defineVariableList from filtered defineClause */ + foreach(lc, filteredDefineClause) + { + TargetEntry *te = (TargetEntry *) lfirst(lc); + + *defineVariableList = lappend(*defineVariableList, + makeString(pstrdup(te->resname))); + } + + return filteredDefineClause; +} + +/* + * buildRPRPattern + * Compile pattern AST to flat bytecode array. + * + * Compilation phases: + * 1. Optimize AST (flatten, merge, deduplicate) + * 2. Scan: collect variables, count elements (pass 1) + * 3. Allocate result structure + * 4. Fill elements from AST (pass 2) + * 5. Finalize pattern structure + * 6. Compute context absorption eligibility + * + * Called from createplan.c during plan creation. + */ +RPRPattern * +buildRPRPattern(RPRPatternNode *pattern, List *defineVariableList, + RPSkipTo rpSkipTo, int frameOptions) +{ + RPRPattern *result; + RPRPatternNode *optimized; + char *varNamesStack[RPR_VARID_MAX]; + int numVars; + int numElements; + RPRDepth maxDepth; + int idx; + bool hasLimitedFrame; + + /* Caller must check for NULL pattern before calling */ + Assert(pattern != NULL); + + /* Optimize the pattern tree */ + optimized = optimizeRPRPattern(copyObject(pattern)); + + /* Collect variable names from DEFINE clause */ + numVars = collectDefineVariables(defineVariableList, varNamesStack); + + /* Scan pattern: collect variables, count elements, validate limits */ + scanRPRPattern(optimized, varNamesStack, &numVars, &numElements, &maxDepth); + + /* Allocate result structure */ + result = allocateRPRPattern(numVars, numElements, maxDepth, varNamesStack); + + /* Fill elements (pass 2) */ + idx = 0; + fillRPRPattern(optimized, result, &idx, 0); + + /* Finalize: set up next pointers, flags, and add FIN marker */ + finalizeRPRPattern(result); + + /* + * Compute context absorption eligibility. Absorption requires both + * structural absorbability and runtime conditions. Check runtime + * conditions first to avoid unnecessary pattern analysis. + * + * Runtime conditions for absorption: + * + * 1. SKIP TO PAST LAST ROW required (not SKIP TO NEXT ROW): With NEXT + * ROW, after each match the search resumes from the next row, so contexts + * are immediately discarded. No redundant contexts accumulate, making + * absorption unnecessary. + * + * 2. Unbounded frame end required (not ROWS with bounded end): With a + * bounded frame (e.g., ROWS BETWEEN CURRENT ROW AND 10 FOLLOWING), + * matches may be truncated at frame boundaries. This changes the + * absorption semantics - older contexts don't necessarily produce longer + * matches when frame limits apply differently to each context. + */ + hasLimitedFrame = (frameOptions & FRAMEOPTION_ROWS) && + !(frameOptions & FRAMEOPTION_END_UNBOUNDED_FOLLOWING); + + if (rpSkipTo == ST_PAST_LAST_ROW && !hasLimitedFrame) + { + /* Runtime conditions met - check structural absorbability */ + computeAbsorbability(result); + } + + return result; +} diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 1b5b9b5ed9c..14367afcc7c 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -214,7 +214,6 @@ static List *set_windowagg_runcondition_references(PlannerInfo *root, static void record_elided_node(PlannerGlobal *glob, int plan_node_id, NodeTag elided_type, Bitmapset *relids); - /***************************************************************************** * * SUBPLAN REFERENCES @@ -2627,6 +2626,32 @@ set_upper_references(PlannerInfo *root, Plan *plan, int rtoffset) NRM_EQUAL, NUM_EXEC_QUAL(plan)); + /* + * Modifies an expression tree in each DEFINE clause so that all Var + * nodes's varno refers to OUTER_VAR. + */ + if (IsA(plan, WindowAgg)) + { + WindowAgg *wplan = (WindowAgg *) plan; + + if (wplan->defineClause != NIL) + { + foreach(l, wplan->defineClause) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + + tle->expr = (Expr *) + fix_upper_expr(root, + (Node *) tle->expr, + subplan_itlist, + OUTER_VAR, + rtoffset, + NRM_EQUAL, + NUM_EXEC_QUAL(plan)); + } + } + } + pfree(subplan_itlist); } diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index c90f4b32733..bcedc5b849e 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -2514,6 +2514,15 @@ perform_pullup_replace_vars(PlannerInfo *root, parse->returningList = (List *) pullup_replace_vars((Node *) parse->returningList, rvcontext); + foreach(lc, parse->windowClause) + { + WindowClause *wc = lfirst_node(WindowClause, lc); + + if (wc->defineClause != NIL) + wc->defineClause = (List *) + pullup_replace_vars((Node *) wc->defineClause, rvcontext); + } + if (parse->onConflict) { parse->onConflict->onConflictSet = (List *) diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 8c9321aab8c..9c09848d4ae 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -20,6 +20,7 @@ #include "lib/stringinfo.h" #include "nodes/bitmapset.h" #include "nodes/lockoptions.h" +#include "nodes/parsenodes.h" #include "nodes/primnodes.h" @@ -1242,6 +1243,70 @@ typedef struct Agg List *chain; } Agg; +/* ---------------- + * Row Pattern Recognition compiled pattern types + * ---------------- + */ + +/* Type definitions for RPR pattern elements */ +typedef uint8 RPRVarId; /* pattern variable ID */ +typedef uint8 RPRElemFlags; /* element flags */ +typedef uint8 RPRDepth; /* group nesting depth */ +typedef int32 RPRQuantity; /* quantifier min/max */ +typedef int16 RPRElemIdx; /* element array index */ + +/* + * RPRPatternElement - flat element for NFA pattern matching (16 bytes) + * + * Layout optimized for alignment (no padding holes): + * varId(1) + depth(1) + flags(1) + reserved(1) + min(4) + max(4) + next(2) + jump(2) + */ +typedef struct RPRPatternElement +{ + RPRVarId varId; /* variable ID, or special value for control */ + RPRDepth depth; /* group nesting depth */ + RPRElemFlags flags; /* flags (reluctant, etc.) */ + uint8 reserved; /* reserved padding byte */ + RPRQuantity min; /* quantifier minimum */ + RPRQuantity max; /* quantifier maximum */ + RPRElemIdx next; /* next element index */ + RPRElemIdx jump; /* jump target (for ALT/GROUP) */ +} RPRPatternElement; + +/* + * RPRPattern - compiled pattern for NFA execution + * + * Requires custom copy/out/read functions due to elements array. + */ +typedef struct RPRPattern +{ + pg_node_attr(custom_copy_equal, custom_read_write) + + NodeTag type; /* T_RPRPattern */ + int numVars; /* number of pattern variables */ + char **varNames; /* array of variable names (DEFINE order + * first) */ + RPRDepth maxDepth; /* maximum group nesting depth */ + int numElements; /* number of elements */ + RPRPatternElement *elements; /* array of pattern elements */ + + /*---------------- + * Context absorption optimization. + * + * Absorption is only safe when later matches are guaranteed to be + * suffixes of earlier matches. This requires simple pattern structure: + * + * Case 1: No ALT, single unbounded element (A+, (A B)+) + * Case 2: Top-level ALT with each branch being single unbounded (A+ | B+) + * + * Complex patterns like A B (A B)+ could theoretically be transformed to + * (A B){2,} for absorption, but this changes lexical order and is not + * implemented. Similarly, (A|B)+ cannot be absorbed because different + * start positions produce different match contents (not suffix relation). + */ + bool isAbsorbable; /* true if pattern supports context absorption */ +} RPRPattern; + /* ---------------- * window aggregate node * ---------------- @@ -1312,6 +1377,15 @@ typedef struct WindowAgg /* nulls sort first for in_range tests? */ bool inRangeNullsFirst; + /* Row Pattern Recognition AFTER MATCH SKIP clause */ + RPSkipTo rpSkipTo; /* Row Pattern Skip To type */ + + /* Compiled Row Pattern for NFA execution */ + struct RPRPattern *rpPattern; + + /* Row Pattern DEFINE clause (list of TargetEntry) */ + List *defineClause; + /* * false for all apart from the WindowAgg that's closest to the root of * the plan diff --git a/src/include/optimizer/rpr.h b/src/include/optimizer/rpr.h new file mode 100644 index 00000000000..7a8938dbebd --- /dev/null +++ b/src/include/optimizer/rpr.h @@ -0,0 +1,57 @@ +/*------------------------------------------------------------------------- + * + * rpr.h + * Row Pattern Recognition pattern compilation for planner + * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/optimizer/rpr.h + * + *------------------------------------------------------------------------- + */ +#ifndef OPTIMIZER_RPR_H +#define OPTIMIZER_RPR_H + +#include "nodes/parsenodes.h" +#include "nodes/plannodes.h" + +/* Limits and special values */ +#define RPR_VARID_MAX 251 /* max pattern variables: 251 */ +#define RPR_QUANTITY_INF INT32_MAX /* unbounded quantifier */ +#define RPR_COUNT_MAX INT32_MAX /* max runtime count (NFA state) */ +#define RPR_ELEMIDX_MAX INT16_MAX /* max pattern elements */ +#define RPR_ELEMIDX_INVALID ((RPRElemIdx) -1) /* invalid index */ +#define RPR_DEPTH_MAX (UINT8_MAX - 1) /* max pattern nesting depth: 254 */ +#define RPR_DEPTH_NONE UINT8_MAX /* no enclosing group (top-level) */ + +/* Special varId values for control elements (252-255) */ +#define RPR_VARID_BEGIN ((RPRVarId) 252) /* group begin */ +#define RPR_VARID_END ((RPRVarId) 253) /* group end */ +#define RPR_VARID_ALT ((RPRVarId) 254) /* alternation start */ +#define RPR_VARID_FIN ((RPRVarId) 255) /* pattern finish */ + +/* Element flags */ +#define RPR_ELEM_RELUCTANT 0x01 /* reluctant (non-greedy) + * quantifier */ +#define RPR_ELEM_ABSORBABLE_BRANCH 0x02 /* element in absorbable region */ +#define RPR_ELEM_ABSORBABLE 0x04 /* absorption judgment point */ + +/* Accessor macros for RPRPatternElement */ +#define RPRElemIsReluctant(e) ((e)->flags & RPR_ELEM_RELUCTANT) +#define RPRElemIsAbsorbableBranch(e) ((e)->flags & RPR_ELEM_ABSORBABLE_BRANCH) +#define RPRElemIsAbsorbable(e) ((e)->flags & RPR_ELEM_ABSORBABLE) +#define RPRElemIsVar(e) ((e)->varId <= RPR_VARID_MAX) +#define RPRElemIsBegin(e) ((e)->varId == RPR_VARID_BEGIN) +#define RPRElemIsEnd(e) ((e)->varId == RPR_VARID_END) +#define RPRElemIsAlt(e) ((e)->varId == RPR_VARID_ALT) +#define RPRElemIsFin(e) ((e)->varId == RPR_VARID_FIN) +#define RPRElemCanSkip(e) ((e)->min == 0) + +extern List *collectPatternVariables(RPRPatternNode *pattern); +extern List *filterDefineClause(List *defineClause, List *patternVars, + List **defineVariableList); +extern RPRPattern *buildRPRPattern(RPRPatternNode *pattern, List *defineVariableList, + RPSkipTo rpSkipTo, int frameOptions); + +#endif /* OPTIMIZER_RPR_H */ -- 2.43.0