From 7e2c79631e33b9f4a2d5a189f13a72f8ec1ef73f Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 2 Feb 2026 23:34:36 -0500 Subject: [PATCH v14 04/16] Limit get_actual_variable_range to scan three index pages. get_actual_variable_range scans an index to find actual min/max values for planner selectivity estimation. Since this happens during planning, we can't afford to spend too much time on it. Commit 9c6ad5eaa9 added VISITED_PAGES_LIMIT (a limit of 100 heap page visits) to bound the amount of work performed, giving up and falling back to the pg_statistic extremal value when the limit is exceeded. But that isn't effective in cases with more extreme concentrations of dead index tuples. Recent benchmark results from Mark Callaghan show that VISITED_PAGES_LIMIT isn't effective once the dead index tuple problem gets out of hand (which is expected with queue-like tables that continually delete older records and insert newer ones). The root cause is that VISITED_PAGES_LIMIT counts heap page visits, but when many index tuples are marked LP_DEAD, _bt_readpage traverses arbitrarily many index pages without returning any tuples -- the heap page counter in selfuncs.c never gets a chance to increment, so VISITED_PAGES_LIMIT never triggers. Furthermore, the design of setting LP_DEAD bits to help future calls is ultimately counterproductive: each LP_DEAD tuple is one fewer that counts against VISITED_PAGES_LIMIT, so the more LP_DEAD bits we set, the less effective the limit becomes at bailing out early. Replace VISITED_PAGES_LIMIT with a mechanism that limits get_actual_variable_range to scanning only the extremal index leaf page, and two additional index pages, rather than counting heap page visits. This provides a hard guarantee on the maximum work per call. Unlike VISITED_PAGES_LIMIT, this limit cannot be eroded by LP_DEAD bits. This approach also has the merit of being compatible with the index prefetching commit's new table_index_getnext_slot() interface. 
That interface hides heap access details from callers like selfuncs.c, making VISITED_PAGES_LIMIT impractical to implement without pushing ad-hoc logic into the table AM layer. Author: Peter Geoghegan Discussion: https://postgr.es/m/CAH2-Wzkt1WkKp4VRJu3qHfmKXc8W+XYv1RXg5d2d3fSvAeO=rg@mail.gmail.com --- src/include/access/relscan.h | 8 ++++++ src/backend/access/heap/heapam.c | 3 --- src/backend/access/heap/heapam_handler.c | 11 ++++++++ src/backend/access/index/genam.c | 1 + src/backend/access/nbtree/nbtsearch.c | 32 +++++++++++++++++++++--- src/backend/utils/adt/selfuncs.c | 31 ++++++++++++++--------- 6 files changed, 68 insertions(+), 18 deletions(-) diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index d3ab5f91c..b125d4f7c 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -381,6 +381,14 @@ typedef struct IndexScanDescData /* parallel index scan information, in shared memory */ struct ParallelIndexScanDescData *parallel_scan; + + /* + * Counter to request early abort during get_actual_variable_range scans. + * When nonzero, the scan will read at most this many leaf pages before + * giving up (regardless of whether those pages had matching items). Zero + * means disabled (normal scan behavior). + */ + int xs_read_extremal_only; } IndexScanDescData; /* Generic structure for parallel scans */ diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 8f1c11a93..3cb536d6a 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1884,9 +1884,6 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, * If we can't see it, maybe no one else can either. At caller * request, check whether all chain members are dead to all * transactions. - * - * Note: if you change the criterion here for what is "dead", fix the - * planner's get_actual_variable_range() function to match. 
*/ if (all_dead && *all_dead) { diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 247619ea6..cda3e74cb 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -557,6 +557,17 @@ heapam_batch_getnext(IndexScanDesc scan, ScanDirection direction, /* Append batch to the end of ring buffer/write it to buffer index */ index_scan_batch_append(scan, batch); + + /* + * xs_read_extremal_only scans are used by get_actual_variable_range + * to find min/max values. They only need a value from one of the + * extremal leaf pages, so once we have one batch, we give up. + */ + if (unlikely(scan->xs_read_extremal_only) && priorBatch) + { + Assert(scan->xs_want_itup); + return NULL; + } } else { diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 6e87169c2..d50e3fa71 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -126,6 +126,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_itupdesc = NULL; scan->xs_hitup = NULL; scan->xs_hitupdesc = NULL; + scan->xs_read_extremal_only = 0; scan->batch_index_opaque_size = 0; scan->batch_tuples_workspace = 0; diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 4d1b111da..d0102ed37 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -1656,6 +1656,7 @@ _bt_readfirstpage(IndexScanDesc scan, IndexScanBatch firstbatch, { BTScanOpaque so = (BTScanOpaque) scan->opaque; BTBatchData *btfirstbatch = BTBatchGetData(firstbatch); + bool extscandone = false; BlockNumber blkno, lastcurrblkno; @@ -1704,14 +1705,28 @@ _bt_readfirstpage(IndexScanDesc scan, IndexScanBatch firstbatch, Assert(firstbatch->dir == dir); - if (blkno == P_NONE || + /* + * Maintain xs_read_extremal_only, a limit on the number of leaf pages + * we'll read before giving up and ending the scan + */ + 
if (unlikely(scan->xs_read_extremal_only)) + { + if (--scan->xs_read_extremal_only == 0) + extscandone = true; + } + + if (blkno == P_NONE || extscandone || (ScanDirectionIsForward(dir) ? !btfirstbatch->moreRight : !btfirstbatch->moreLeft)) { /* * firstbatch _bt_readpage call ended scan in this direction (though - * if so->needPrimScan was set the scan will continue in _bt_first) + * if so->needPrimScan was set the scan will continue in _bt_first). + * + * Also cut our losses during xs_read_extremal_only scans, which are + * limited to scanning only a few leaf pages in the index. */ + Assert(!scan->xs_read_extremal_only || !so->needPrimScan); indexam_util_batch_release(scan, firstbatch); _bt_parallel_done(scan); return NULL; @@ -1752,6 +1767,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, BlockNumber lastcurrblkno, ScanDirection dir, bool firstpage) { Relation rel = scan->indexRelation; + bool extscandone = false; IndexScanBatch newbatch; BTBatchData *btnewbatch; @@ -1829,8 +1845,18 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, /* no matching tuples on this page */ _bt_relbuf(rel, btnewbatch->buf); + /* + * Maintain xs_read_extremal_only, a limit on the number of leaf pages + * we'll read before giving up and ending the scan + */ + if (unlikely(scan->xs_read_extremal_only)) + { + if (--scan->xs_read_extremal_only == 0) + extscandone = true; + } + /* Continue the scan in this direction? */ - if (blkno == P_NONE || + if (blkno == P_NONE || extscandone || (ScanDirectionIsForward(dir) ? !btnewbatch->moreRight : !btnewbatch->moreLeft)) { diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 6d80ae003..09f2b9652 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -7081,13 +7081,12 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata, * * scankeys is a 1-element scankey array set up to reject nulls. 
* typLen/typByVal describe the datatype of the index's first column. - * tableslot is a slot suitable to hold table tuples, in case we need - * to probe the heap. + * tableslot is a slot suitable to hold table tuples. * (We could compute these values locally, but that would mean computing them * twice when get_actual_variable_range needs both the min and the max.) * - * Failure occurs either when the index is empty, or we decide that it's - * taking too long to find a suitable tuple. + * Failure occurs either when the index is empty, or when it takes too long to + * find a suitable tuple. */ static bool get_actual_variable_endpoint(Relation heapRel, @@ -7147,22 +7146,30 @@ get_actual_variable_endpoint(Relation heapRel, * * Despite all this care, there are situations where we might find many * non-visible tuples near the end of the index. We don't want to expend - * a huge amount of time here, so we give up once we've read too many heap - * pages. When we fail for that reason, the caller will end up using - * whatever extremal value is recorded in pg_statistic. - * - * XXX This can't work with the new table_index_getnext_slot interface, - * which simply won't return a tuple that isn't visible to our snapshot. - * table_index_getnext_slot will need some kind of callback that provides - * a way for the scan to give up when the costs start to get out of hand. + * a huge amount of time here, so we give up after reading a few extremal + * index leaf pages without finding matching items (generally only seen + * when pages have many index tuples with set LP_DEAD bits). When we give + * up the caller will end up using whatever extremal value is recorded in + * pg_statistic. 
*/ InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(heapRel)); + /* Set up an index-only scan */ index_scan = index_beginscan(heapRel, indexRel, true, &SnapshotNonVacuumable, NULL, 1, 0); Assert(index_scan->xs_want_itup); + + /* + * Make our scan read at most 3 index leaf pages before it just gives up. + * This is on the conservative side; giving up after the first leaf page + * would work just as well in most cases. But it's possible that the + * index's leftmost/rightmost leaf page is one with very few index tuples + * (with or without their LP_DEAD bits set). + */ + index_scan->xs_read_extremal_only = 3; + index_rescan(index_scan, scankeys, 1, NULL, 0); /* Fetch first/next tuple in specified direction */ -- 2.53.0